summaryrefslogtreecommitdiffstats
path: root/tools/testing/selftests
diff options
context:
space:
mode:
Diffstat (limited to 'tools/testing/selftests')
-rw-r--r--tools/testing/selftests/.gitignore10
-rw-r--r--tools/testing/selftests/Makefile265
-rw-r--r--tools/testing/selftests/android/Makefile39
-rw-r--r--tools/testing/selftests/android/config5
-rw-r--r--tools/testing/selftests/android/ion/.gitignore4
-rw-r--r--tools/testing/selftests/android/ion/Makefile20
-rw-r--r--tools/testing/selftests/android/ion/README101
-rw-r--r--tools/testing/selftests/android/ion/ion.h134
-rwxr-xr-xtools/testing/selftests/android/ion/ion_test.sh58
-rw-r--r--tools/testing/selftests/android/ion/ionapp_export.c127
-rw-r--r--tools/testing/selftests/android/ion/ionapp_import.c79
-rw-r--r--tools/testing/selftests/android/ion/ionmap_test.c136
-rw-r--r--tools/testing/selftests/android/ion/ionutils.c253
-rw-r--r--tools/testing/selftests/android/ion/ionutils.h55
-rw-r--r--tools/testing/selftests/android/ion/ipcsocket.c227
-rw-r--r--tools/testing/selftests/android/ion/ipcsocket.h35
-rwxr-xr-xtools/testing/selftests/android/run.sh3
-rw-r--r--tools/testing/selftests/arm64/Makefile66
-rw-r--r--tools/testing/selftests/arm64/README25
-rw-r--r--tools/testing/selftests/arm64/fp/.gitignore5
-rw-r--r--tools/testing/selftests/arm64/fp/Makefile17
-rw-r--r--tools/testing/selftests/arm64/fp/README100
-rw-r--r--tools/testing/selftests/arm64/fp/asm-offsets.h11
-rw-r--r--tools/testing/selftests/arm64/fp/assembler.h57
-rwxr-xr-xtools/testing/selftests/arm64/fp/fpsimd-stress60
-rw-r--r--tools/testing/selftests/arm64/fp/fpsimd-test.S482
-rw-r--r--tools/testing/selftests/arm64/fp/sve-probe-vls.c58
-rw-r--r--tools/testing/selftests/arm64/fp/sve-ptrace-asm.S33
-rw-r--r--tools/testing/selftests/arm64/fp/sve-ptrace.c336
-rwxr-xr-xtools/testing/selftests/arm64/fp/sve-stress59
-rw-r--r--tools/testing/selftests/arm64/fp/sve-test.S684
-rw-r--r--tools/testing/selftests/arm64/fp/vlset.c155
-rw-r--r--tools/testing/selftests/arm64/mte/.gitignore6
-rw-r--r--tools/testing/selftests/arm64/mte/Makefile27
-rw-r--r--tools/testing/selftests/arm64/mte/check_buffer_fill.c478
-rw-r--r--tools/testing/selftests/arm64/mte/check_child_memory.c198
-rw-r--r--tools/testing/selftests/arm64/mte/check_ksm_options.c163
-rw-r--r--tools/testing/selftests/arm64/mte/check_mmap_options.c266
-rw-r--r--tools/testing/selftests/arm64/mte/check_tags_inclusion.c188
-rw-r--r--tools/testing/selftests/arm64/mte/check_user_mem.c115
-rw-r--r--tools/testing/selftests/arm64/mte/mte_common_util.c332
-rw-r--r--tools/testing/selftests/arm64/mte/mte_common_util.h118
-rw-r--r--tools/testing/selftests/arm64/mte/mte_def.h60
-rw-r--r--tools/testing/selftests/arm64/mte/mte_helper.S128
-rw-r--r--tools/testing/selftests/arm64/pauth/.gitignore2
-rw-r--r--tools/testing/selftests/arm64/pauth/Makefile39
-rw-r--r--tools/testing/selftests/arm64/pauth/exec_target.c34
-rw-r--r--tools/testing/selftests/arm64/pauth/helper.c39
-rw-r--r--tools/testing/selftests/arm64/pauth/helper.h28
-rw-r--r--tools/testing/selftests/arm64/pauth/pac.c370
-rw-r--r--tools/testing/selftests/arm64/pauth/pac_corruptor.S19
-rw-r--r--tools/testing/selftests/arm64/signal/.gitignore4
-rw-r--r--tools/testing/selftests/arm64/signal/Makefile28
-rw-r--r--tools/testing/selftests/arm64/signal/README59
-rw-r--r--tools/testing/selftests/arm64/signal/signals.S64
-rw-r--r--tools/testing/selftests/arm64/signal/test_signals.c29
-rw-r--r--tools/testing/selftests/arm64/signal/test_signals.h102
-rw-r--r--tools/testing/selftests/arm64/signal/test_signals_utils.c334
-rw-r--r--tools/testing/selftests/arm64/signal/test_signals_utils.h120
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_magic.c52
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size.c77
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size_for_magic0.c46
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_duplicated_fpsimd.c50
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_misaligned_sp.c37
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_missing_fpsimd.c50
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_compat_toggle.c31
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_daif_bits.c35
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1h.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1t.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2h.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2t.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3h.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3t.c15
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_template.h28
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/testcases.c196
-rw-r--r--tools/testing/selftests/arm64/signal/testcases/testcases.h104
-rw-r--r--tools/testing/selftests/arm64/tags/.gitignore2
-rw-r--r--tools/testing/selftests/arm64/tags/Makefile7
-rwxr-xr-xtools/testing/selftests/arm64/tags/run_tags_test.sh12
-rw-r--r--tools/testing/selftests/arm64/tags/tags_test.c31
-rw-r--r--tools/testing/selftests/bpf/.gitignore39
-rw-r--r--tools/testing/selftests/bpf/Makefile466
-rw-r--r--tools/testing/selftests/bpf/README.rst104
-rw-r--r--tools/testing/selftests/bpf/bench.c464
-rw-r--r--tools/testing/selftests/bpf/bench.h81
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_count.c91
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_rename.c178
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_ringbufs.c566
-rw-r--r--tools/testing/selftests/bpf/benchs/bench_trigger.c184
-rwxr-xr-xtools/testing/selftests/bpf/benchs/run_bench_rename.sh9
-rwxr-xr-xtools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh75
-rwxr-xr-xtools/testing/selftests/bpf/benchs/run_bench_trigger.sh9
-rw-r--r--tools/testing/selftests/bpf/bpf_legacy.h25
-rw-r--r--tools/testing/selftests/bpf/bpf_rand.h80
-rw-r--r--tools/testing/selftests/bpf/bpf_rlimit.h28
-rw-r--r--tools/testing/selftests/bpf/bpf_tcp_helpers.h232
-rw-r--r--tools/testing/selftests/bpf/bpf_util.h43
-rw-r--r--tools/testing/selftests/bpf/cgroup_helpers.c315
-rw-r--r--tools/testing/selftests/bpf/cgroup_helpers.h19
-rw-r--r--tools/testing/selftests/bpf/config41
-rw-r--r--tools/testing/selftests/bpf/flow_dissector_load.c109
-rw-r--r--tools/testing/selftests/bpf/flow_dissector_load.h73
-rw-r--r--tools/testing/selftests/bpf/get_cgroup_id_user.c139
-rw-r--r--tools/testing/selftests/bpf/gnu/stubs.h1
-rw-r--r--tools/testing/selftests/bpf/map_tests/.gitignore2
-rw-r--r--tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c129
-rw-r--r--tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c283
-rw-r--r--tools/testing/selftests/bpf/map_tests/sk_storage_map.c629
-rw-r--r--tools/testing/selftests/bpf/netcnt_common.h24
-rw-r--r--tools/testing/selftests/bpf/network_helpers.c245
-rw-r--r--tools/testing/selftests/bpf/network_helpers.h45
-rw-r--r--tools/testing/selftests/bpf/prog_tests/.gitignore2
-rw-r--r--tools/testing/selftests/bpf/prog_tests/align.c676
-rw-r--r--tools/testing/selftests/bpf/prog_tests/attach_probe.c105
-rw-r--r--tools/testing/selftests/bpf/prog_tests/autoload.c41
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_iter.c1074
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c345
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c236
-rw-r--r--tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c119
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf.c6839
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_dump.c248
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_endian.c101
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c197
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c246
-rw-r--r--tools/testing/selftests/bpf/prog_tests/btf_write.c244
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c421
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c111
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c292
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c148
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_link.c262
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c91
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cls_redirect.c499
-rw-r--r--tools/testing/selftests/bpf/prog_tests/connect_force_port.c166
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_autosize.c225
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_extern.c169
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_reloc.c891
-rw-r--r--tools/testing/selftests/bpf/prog_tests/core_retro.c37
-rw-r--r--tools/testing/selftests/bpf/prog_tests/cpu_mask.c78
-rw-r--r--tools/testing/selftests/bpf/prog_tests/d_path.c157
-rw-r--r--tools/testing/selftests/bpf/prog_tests/enable_stats.c45
-rw-r--r--tools/testing/selftests/bpf/prog_tests/endian.c53
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fentry_fexit.c49
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fentry_test.c37
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c373
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fexit_stress.c76
-rw-r--r--tools/testing/selftests/bpf/prog_tests/fexit_test.c37
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector.c619
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c49
-rw-r--r--tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c678
-rw-r--r--tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c149
-rw-r--r--tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c91
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_data.c148
-rw-r--r--tools/testing/selftests/bpf/prog_tests/global_data_init.c62
-rw-r--r--tools/testing/selftests/bpf/prog_tests/hashmap.c380
-rw-r--r--tools/testing/selftests/bpf/prog_tests/kfree_skb.c155
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ksyms.c61
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ksyms_btf.c109
-rw-r--r--tools/testing/selftests/bpf/prog_tests/l4lb_all.c87
-rw-r--r--tools/testing/selftests/bpf/prog_tests/link_pinning.c105
-rw-r--r--tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c71
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_init.c214
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_lock.c89
-rw-r--r--tools/testing/selftests/bpf/prog_tests/map_ptr.c32
-rw-r--r--tools/testing/selftests/bpf/prog_tests/metadata.c141
-rw-r--r--tools/testing/selftests/bpf/prog_tests/mmap.c290
-rw-r--r--tools/testing/selftests/bpf/prog_tests/modify_return.c65
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c85
-rw-r--r--tools/testing/selftests/bpf/prog_tests/obj_name.c71
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pe_preserve_elems.c66
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_branches.c170
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_buffer.c143
-rw-r--r--tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c116
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pinning.c271
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_access.c28
-rw-r--r--tools/testing/selftests/bpf/prog_tests/pkt_md_access.c23
-rw-r--r--tools/testing/selftests/bpf/prog_tests/probe_read_user_str.c71
-rw-r--r--tools/testing/selftests/bpf/prog_tests/probe_user.c76
-rw-r--r--tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c50
-rw-r--r--tools/testing/selftests/bpf/prog_tests/queue_stack_map.c100
-rw-r--r--tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c96
-rw-r--r--tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c42
-rw-r--r--tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c80
-rw-r--r--tools/testing/selftests/bpf/prog_tests/rdonly_maps.c92
-rw-r--r--tools/testing/selftests/bpf/prog_tests/reference_tracking.c52
-rw-r--r--tools/testing/selftests/bpf/prog_tests/resolve_btfids.c172
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ringbuf.c245
-rw-r--r--tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c102
-rw-r--r--tools/testing/selftests/bpf/prog_tests/section_names.c215
-rw-r--r--tools/testing/selftests/bpf/prog_tests/select_reuseport.c879
-rw-r--r--tools/testing/selftests/bpf/prog_tests/send_signal.c209
-rw-r--r--tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c60
-rw-r--r--tools/testing/selftests/bpf/prog_tests/signal_pending.c49
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sk_assign.c342
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sk_lookup.c1383
-rw-r--r--tools/testing/selftests/bpf/prog_tests/skb_ctx.c110
-rw-r--r--tools/testing/selftests/bpf/prog_tests/skb_helpers.c30
-rw-r--r--tools/testing/selftests/bpf/prog_tests/skeleton.c98
-rw-r--r--tools/testing/selftests/bpf/prog_tests/snprintf_btf.c62
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sock_fields.c404
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockmap_basic.c304
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c124
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockmap_listen.c1635
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt.c985
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c236
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_multi.c333
-rw-r--r--tools/testing/selftests/bpf/prog_tests/sockopt_sk.c260
-rw-r--r--tools/testing/selftests/bpf/prog_tests/spinlock.c43
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c108
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c152
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map.c75
-rw-r--r--tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c65
-rw-r--r--tools/testing/selftests/bpf/prog_tests/subprogs.c37
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tailcalls.c819
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c75
-rw-r--r--tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c77
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tcp_estats.c17
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c610
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tcp_rtt.c174
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_bpffs.c94
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_global_funcs.c83
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_local_storage.c60
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_lsm.c95
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_overhead.c148
-rw-r--r--tools/testing/selftests/bpf/prog_tests/test_profiler.c72
-rw-r--r--tools/testing/selftests/bpf/prog_tests/tp_attach_query.c135
-rw-r--r--tools/testing/selftests/bpf/prog_tests/trace_ext.c111
-rw-r--r--tools/testing/selftests/bpf/prog_tests/trace_printk.c75
-rw-r--r--tools/testing/selftests/bpf/prog_tests/trampoline_count.c123
-rw-r--r--tools/testing/selftests/bpf/prog_tests/udp_limit.c75
-rw-r--r--tools/testing/selftests/bpf/prog_tests/varlen.c68
-rw-r--r--tools/testing/selftests/bpf/prog_tests/vmlinux.c43
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp.c45
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c141
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_attach.c90
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c129
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c70
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c89
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_info.c68
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_link.c151
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_noinline.c67
-rw-r--r--tools/testing/selftests/bpf/prog_tests/xdp_perf.c25
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_cubic.c545
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_dctcp.c234
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_flow.c421
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter.h130
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c40
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c115
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c28
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c46
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c50
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c34
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c57
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_netlink.c64
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c59
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task.c26
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c50
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task_file.c34
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c37
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c234
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c250
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c4
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c4
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c18
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c52
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c35
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c21
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h22
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_udp4.c71
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_iter_udp6.c79
-rw-r--r--tools/testing/selftests/bpf/progs/bpf_tracing_net.h51
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_dim.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_val_sz.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_non_array.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_shallow.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_small.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_wrong_val_type.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bit_sz_change.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bitfield_vs_int.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___err_too_big_bitfield.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___just_big_enough.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_enumval.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___diff.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___err_missing.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___val3_missing.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_existence.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_existence___minimal.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_flavors.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_flavors__err_wrong_name.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_ints.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_ints___bool.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_ints___reverse_sign.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_misc.c5
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_mods.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_mods___mod_swap.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_mods___typedefs.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___anon_embed.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___dup_compat_types.c5
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_container.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_field.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_dup_incompat_types.c4
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_container.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_field.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_nonstruct_container.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_partial_match_dups.c4
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_too_deep.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___extra_nesting.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___struct_union_mixup.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr___diff_sz.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_size.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_size___diff_sz.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_size___err_ambiguous.c4
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_based.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___all_missing.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff_sz.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___fn_wrong_args.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___incompat.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_id.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf__core_reloc_type_id___missing_targets.c3
-rw-r--r--tools/testing/selftests/bpf/progs/btf_data.c50
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c92
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c35
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_namespacing.c73
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_ordering.c63
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c153
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c249
-rw-r--r--tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c229
-rw-r--r--tools/testing/selftests/bpf/progs/btf_ptr.h27
-rw-r--r--tools/testing/selftests/bpf/progs/cg_storage_multi.h13
-rw-r--r--tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c33
-rw-r--r--tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c57
-rw-r--r--tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c57
-rw-r--r--tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c97
-rw-r--r--tools/testing/selftests/bpf/progs/connect4_prog.c201
-rw-r--r--tools/testing/selftests/bpf/progs/connect6_prog.c95
-rw-r--r--tools/testing/selftests/bpf/progs/connect_force_port4.c83
-rw-r--r--tools/testing/selftests/bpf/progs/connect_force_port6.c94
-rw-r--r--tools/testing/selftests/bpf/progs/core_reloc_types.h1145
-rw-r--r--tools/testing/selftests/bpf/progs/dev_cgroup.c60
-rw-r--r--tools/testing/selftests/bpf/progs/fentry_test.c79
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c181
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c27
-rw-r--r--tools/testing/selftests/bpf/progs/fexit_test.c80
-rw-r--r--tools/testing/selftests/bpf/progs/fmod_ret_freplace.c14
-rw-r--r--tools/testing/selftests/bpf/progs/freplace_attach_probe.c40
-rw-r--r--tools/testing/selftests/bpf/progs/freplace_cls_redirect.c34
-rw-r--r--tools/testing/selftests/bpf/progs/freplace_connect4.c18
-rw-r--r--tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c19
-rw-r--r--tools/testing/selftests/bpf/progs/freplace_get_constant.c15
-rw-r--r--tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c40
-rw-r--r--tools/testing/selftests/bpf/progs/kfree_skb.c153
-rw-r--r--tools/testing/selftests/bpf/progs/load_bytes_relative.c48
-rw-r--r--tools/testing/selftests/bpf/progs/local_storage.c146
-rw-r--r--tools/testing/selftests/bpf/progs/loop1.c29
-rw-r--r--tools/testing/selftests/bpf/progs/loop2.c29
-rw-r--r--tools/testing/selftests/bpf/progs/loop3.c23
-rw-r--r--tools/testing/selftests/bpf/progs/loop4.c18
-rw-r--r--tools/testing/selftests/bpf/progs/loop5.c32
-rw-r--r--tools/testing/selftests/bpf/progs/lsm.c110
-rw-r--r--tools/testing/selftests/bpf/progs/map_ptr_kern.c694
-rw-r--r--tools/testing/selftests/bpf/progs/metadata_unused.c15
-rw-r--r--tools/testing/selftests/bpf/progs/metadata_used.c15
-rw-r--r--tools/testing/selftests/bpf/progs/modify_return.c49
-rw-r--r--tools/testing/selftests/bpf/progs/netcnt_prog.c71
-rw-r--r--tools/testing/selftests/bpf/progs/netif_receive_skb.c256
-rw-r--r--tools/testing/selftests/bpf/progs/perf_event_stackmap.c59
-rw-r--r--tools/testing/selftests/bpf/progs/perfbuf_bench.c33
-rw-r--r--tools/testing/selftests/bpf/progs/profiler.h177
-rw-r--r--tools/testing/selftests/bpf/progs/profiler.inc.h976
-rw-r--r--tools/testing/selftests/bpf/progs/profiler1.c6
-rw-r--r--tools/testing/selftests/bpf/progs/profiler2.c6
-rw-r--r--tools/testing/selftests/bpf/progs/profiler3.c6
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf.h280
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf100.c4
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf180.c4
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf50.c4
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf600.c9
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf600_nounroll.c8
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf_global.c5
-rw-r--r--tools/testing/selftests/bpf/progs/pyperf_subprogs.c5
-rw-r--r--tools/testing/selftests/bpf/progs/ringbuf_bench.c60
-rw-r--r--tools/testing/selftests/bpf/progs/sample_map_ret0.c34
-rw-r--r--tools/testing/selftests/bpf/progs/sample_ret0.c7
-rw-r--r--tools/testing/selftests/bpf/progs/sendmsg4_prog.c49
-rw-r--r--tools/testing/selftests/bpf/progs/sendmsg6_prog.c59
-rw-r--r--tools/testing/selftests/bpf/progs/socket_cookie_prog.c70
-rw-r--r--tools/testing/selftests/bpf/progs/sockmap_parse_prog.c37
-rw-r--r--tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c26
-rw-r--r--tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c65
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_inherit.c97
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_multi.c71
-rw-r--r--tools/testing/selftests/bpf/progs/sockopt_sk.c201
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta.c10
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta.h547
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta_nounroll1.c9
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta_nounroll2.c9
-rw-r--r--tools/testing/selftests/bpf/progs/strobemeta_subprogs.c10
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall1.c48
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall2.c59
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall3.c31
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall4.c33
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall5.c40
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c38
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c41
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c61
-rw-r--r--tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c61
-rw-r--r--tools/testing/selftests/bpf/progs/tcp_rtt.c60
-rw-r--r--tools/testing/selftests/bpf/progs/test_attach_probe.c42
-rw-r--r--tools/testing/selftests/bpf/progs/test_autoload.c40
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_haskv.c50
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_map_in_map.c150
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_newkv.c63
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_nokv.c47
-rw-r--r--tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c174
-rw-r--r--tools/testing/selftests/bpf/progs/test_cgroup_link.c24
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect.c1068
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect.h63
-rw-r--r--tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c2
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_autosize.c182
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_extern.c62
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c58
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c63
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c57
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_enumval.c72
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_existence.c79
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c65
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_ints.c47
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c98
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_misc.c60
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_mods.c65
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c49
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c46
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c33
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_size.c51
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c110
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_reloc_type_id.c115
-rw-r--r--tools/testing/selftests/bpf/progs/test_core_retro.c43
-rw-r--r--tools/testing/selftests/bpf/progs/test_d_path.c65
-rw-r--r--tools/testing/selftests/bpf/progs/test_enable_stats.c18
-rw-r--r--tools/testing/selftests/bpf/progs/test_endian.c37
-rw-r--r--tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c102
-rw-r--r--tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c26
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_data.c106
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func1.c45
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func2.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func3.c65
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func4.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func5.c31
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func6.c31
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func7.c18
-rw-r--r--tools/testing/selftests/bpf/progs/test_global_func8.c19
-rw-r--r--tools/testing/selftests/bpf/progs/test_jhash.h71
-rw-r--r--tools/testing/selftests/bpf/progs/test_ksyms.c32
-rw-r--r--tools/testing/selftests/bpf/progs/test_ksyms_btf.c55
-rw-r--r--tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c31
-rw-r--r--tools/testing/selftests/bpf/progs/test_l4lb.c473
-rw-r--r--tools/testing/selftests/bpf/progs/test_l4lb_noinline.c470
-rw-r--r--tools/testing/selftests/bpf/progs/test_link_pinning.c25
-rw-r--r--tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c26
-rw-r--r--tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c85
-rw-r--r--tools/testing/selftests/bpf/progs/test_lwt_seg6local.c426
-rw-r--r--tools/testing/selftests/bpf/progs/test_map_in_map.c53
-rw-r--r--tools/testing/selftests/bpf/progs/test_map_init.c33
-rw-r--r--tools/testing/selftests/bpf/progs/test_map_lock.c62
-rw-r--r--tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c325
-rw-r--r--tools/testing/selftests/bpf/progs/test_mmap.c53
-rw-r--r--tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c37
-rw-r--r--tools/testing/selftests/bpf/progs/test_obj_id.c24
-rw-r--r--tools/testing/selftests/bpf/progs/test_overhead.c42
-rw-r--r--tools/testing/selftests/bpf/progs/test_pe_preserve_elems.c38
-rw-r--r--tools/testing/selftests/bpf/progs/test_perf_branches.c50
-rw-r--r--tools/testing/selftests/bpf/progs/test_perf_buffer.c25
-rw-r--r--tools/testing/selftests/bpf/progs/test_pinning.c31
-rw-r--r--tools/testing/selftests/bpf/progs/test_pinning_invalid.c16
-rw-r--r--tools/testing/selftests/bpf/progs/test_pkt_access.c150
-rw-r--r--tools/testing/selftests/bpf/progs/test_pkt_md_access.c43
-rw-r--r--tools/testing/selftests/bpf/progs/test_probe_read_user_str.c25
-rw-r--r--tools/testing/selftests/bpf/progs/test_probe_user.c26
-rw-r--r--tools/testing/selftests/bpf/progs/test_queue_map.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_queue_stack_map.h59
-rw-r--r--tools/testing/selftests/bpf/progs/test_raw_tp_test_run.c24
-rw-r--r--tools/testing/selftests/bpf/progs/test_rdonly_maps.c83
-rw-r--r--tools/testing/selftests/bpf/progs/test_ringbuf.c78
-rw-r--r--tools/testing/selftests/bpf/progs/test_ringbuf_multi.c77
-rw-r--r--tools/testing/selftests/bpf/progs/test_seg6_loop.c260
-rw-r--r--tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c186
-rw-r--r--tools/testing/selftests/bpf/progs/test_send_signal_kern.c46
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_assign.c197
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c3
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_lookup.c647
-rw-r--r--tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c181
-rw-r--r--tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c47
-rw-r--r--tools/testing/selftests/bpf/progs/test_skb_ctx.c30
-rw-r--r--tools/testing/selftests/bpf/progs/test_skb_helpers.c28
-rw-r--r--tools/testing/selftests/bpf/progs/test_skeleton.c59
-rw-r--r--tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c47
-rw-r--r--tools/testing/selftests/bpf/progs/test_sock_fields.c298
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockhash_kern.c5
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_invalid_update.c23
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_kern.c5
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_kern.h375
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_listen.c98
-rw-r--r--tools/testing/selftests/bpf/progs/test_sockmap_update.c48
-rw-r--r--tools/testing/selftests/bpf/progs/test_spin_lock.c101
-rw-r--r--tools/testing/selftests/bpf/progs/test_stack_map.c4
-rw-r--r--tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c76
-rw-r--r--tools/testing/selftests/bpf/progs/test_stacktrace_map.c76
-rw-r--r--tools/testing/selftests/bpf/progs/test_subprogs.c103
-rw-r--r--tools/testing/selftests/bpf/progs/test_subprogs_unused.c21
-rw-r--r--tools/testing/selftests/bpf/progs/test_sysctl_loop1.c74
-rw-r--r--tools/testing/selftests/bpf/progs/test_sysctl_loop2.c72
-rw-r--r--tools/testing/selftests/bpf/progs/test_sysctl_prog.c73
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_edt.c110
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_neigh.c149
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c155
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_peer.c45
-rw-r--r--tools/testing/selftests/bpf/progs/test_tc_tunnel.c536
-rw-r--r--tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c167
-rw-r--r--tools/testing/selftests/bpf/progs/test_tcp_estats.c258
-rw-r--r--tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c626
-rw-r--r--tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c195
-rw-r--r--tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c95
-rw-r--r--tools/testing/selftests/bpf/progs/test_trace_ext.c18
-rw-r--r--tools/testing/selftests/bpf/progs/test_trace_ext_tracing.c25
-rw-r--r--tools/testing/selftests/bpf/progs/test_tracepoint.c26
-rw-r--r--tools/testing/selftests/bpf/progs/test_trampoline_count.c22
-rw-r--r--tools/testing/selftests/bpf/progs/test_tunnel_kern.c681
-rw-r--r--tools/testing/selftests/bpf/progs/test_varlen.c158
-rw-r--r--tools/testing/selftests/bpf/progs/test_verif_scale1.c30
-rw-r--r--tools/testing/selftests/bpf/progs/test_verif_scale2.c30
-rw-r--r--tools/testing/selftests/bpf/progs/test_verif_scale3.c30
-rw-r--r--tools/testing/selftests/bpf/progs/test_vmlinux.c90
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp.c235
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c33
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c30
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c68
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c22
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_link.c12
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_loop.c231
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_meta.c53
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_noinline.c838
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_redirect.c28
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_vlan.c292
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c36
-rw-r--r--tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c43
-rw-r--r--tools/testing/selftests/bpf/progs/trace_printk.c21
-rw-r--r--tools/testing/selftests/bpf/progs/trigger_bench.c54
-rw-r--r--tools/testing/selftests/bpf/progs/udp_limit.c61
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_dummy.c13
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_redirect_map.c31
-rw-r--r--tools/testing/selftests/bpf/progs/xdp_tx.c12
-rw-r--r--tools/testing/selftests/bpf/progs/xdping_kern.c184
-rw-r--r--tools/testing/selftests/bpf/settings1
-rwxr-xr-xtools/testing/selftests/bpf/tcp_client.py50
-rwxr-xr-xtools/testing/selftests/bpf/tcp_server.py80
-rw-r--r--tools/testing/selftests/bpf/test_bpftool.py178
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool.sh5
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool_build.sh168
-rwxr-xr-xtools/testing/selftests/bpf/test_bpftool_metadata.sh82
-rw-r--r--tools/testing/selftests/bpf/test_btf.h69
-rw-r--r--tools/testing/selftests/bpf/test_cgroup_storage.c170
-rw-r--r--tools/testing/selftests/bpf/test_cpp.cpp30
-rw-r--r--tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c160
-rw-r--r--tools/testing/selftests/bpf/test_dev_cgroup.c82
-rw-r--r--tools/testing/selftests/bpf/test_flow_dissector.c780
-rwxr-xr-xtools/testing/selftests/bpf/test_flow_dissector.sh168
-rwxr-xr-xtools/testing/selftests/bpf/test_ftrace.sh39
-rw-r--r--tools/testing/selftests/bpf/test_iptunnel_common.h34
-rwxr-xr-xtools/testing/selftests/bpf/test_kmod.sh67
-rwxr-xr-xtools/testing/selftests/bpf/test_lirc_mode2.sh41
-rw-r--r--tools/testing/selftests/bpf/test_lirc_mode2_user.c176
-rw-r--r--tools/testing/selftests/bpf/test_lpm_map.c804
-rw-r--r--tools/testing/selftests/bpf/test_lru_map.c903
-rwxr-xr-xtools/testing/selftests/bpf/test_lwt_ip_encap.sh475
-rwxr-xr-xtools/testing/selftests/bpf/test_lwt_seg6local.sh149
-rw-r--r--tools/testing/selftests/bpf/test_maps.c1776
-rw-r--r--tools/testing/selftests/bpf/test_maps.h17
-rw-r--r--tools/testing/selftests/bpf/test_netcnt.c148
-rwxr-xr-xtools/testing/selftests/bpf/test_offload.py1406
-rw-r--r--tools/testing/selftests/bpf/test_progs.c751
-rw-r--r--tools/testing/selftests/bpf/test_progs.h216
-rw-r--r--tools/testing/selftests/bpf/test_select_reuseport_common.h36
-rwxr-xr-xtools/testing/selftests/bpf/test_skb_cgroup_id.sh63
-rw-r--r--tools/testing/selftests/bpf/test_skb_cgroup_id_user.c181
-rw-r--r--tools/testing/selftests/bpf/test_sock.c481
-rw-r--r--tools/testing/selftests/bpf/test_sock_addr.c1655
-rwxr-xr-xtools/testing/selftests/bpf/test_sock_addr.sh58
-rw-r--r--tools/testing/selftests/bpf/test_socket_cookie.c208
-rw-r--r--tools/testing/selftests/bpf/test_sockmap.c2026
-rw-r--r--tools/testing/selftests/bpf/test_stub.c44
-rw-r--r--tools/testing/selftests/bpf/test_sysctl.c1636
-rw-r--r--tools/testing/selftests/bpf/test_tag.c202
-rwxr-xr-xtools/testing/selftests/bpf/test_tc_edt.sh99
-rwxr-xr-xtools/testing/selftests/bpf/test_tc_redirect.sh216
-rwxr-xr-xtools/testing/selftests/bpf/test_tc_tunnel.sh295
-rwxr-xr-xtools/testing/selftests/bpf/test_tcp_check_syncookie.sh84
-rw-r--r--tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c257
-rw-r--r--tools/testing/selftests/bpf/test_tcp_hdr_options.h152
-rw-r--r--tools/testing/selftests/bpf/test_tcpbpf.h18
-rw-r--r--tools/testing/selftests/bpf/test_tcpbpf_user.c165
-rw-r--r--tools/testing/selftests/bpf/test_tcpnotify.h19
-rw-r--r--tools/testing/selftests/bpf/test_tcpnotify_user.c169
-rwxr-xr-xtools/testing/selftests/bpf/test_tunnel.sh798
-rw-r--r--tools/testing/selftests/bpf/test_verifier.c1244
-rw-r--r--tools/testing/selftests/bpf/test_verifier_log.c174
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_meta.sh52
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_redirect.sh77
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_veth.sh118
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_vlan.sh228
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh9
-rwxr-xr-xtools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh9
-rwxr-xr-xtools/testing/selftests/bpf/test_xdping.sh99
-rw-r--r--tools/testing/selftests/bpf/testing_helpers.c80
-rw-r--r--tools/testing/selftests/bpf/testing_helpers.h8
-rw-r--r--tools/testing/selftests/bpf/trace_helpers.c138
-rw-r--r--tools/testing/selftests/bpf/trace_helpers.h21
-rw-r--r--tools/testing/selftests/bpf/urandom_read.c35
-rw-r--r--tools/testing/selftests/bpf/verifier/.gitignore2
-rw-r--r--tools/testing/selftests/bpf/verifier/and.c68
-rw-r--r--tools/testing/selftests/bpf/verifier/array_access.c379
-rw-r--r--tools/testing/selftests/bpf/verifier/basic.c23
-rw-r--r--tools/testing/selftests/bpf/verifier/basic_call.c50
-rw-r--r--tools/testing/selftests/bpf/verifier/basic_instr.c219
-rw-r--r--tools/testing/selftests/bpf/verifier/basic_stack.c64
-rw-r--r--tools/testing/selftests/bpf/verifier/basic_stx_ldx.c45
-rw-r--r--tools/testing/selftests/bpf/verifier/bounds.c755
-rw-r--r--tools/testing/selftests/bpf/verifier/bounds_deduction.c136
-rw-r--r--tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c411
-rw-r--r--tools/testing/selftests/bpf/verifier/bpf_get_stack.c44
-rw-r--r--tools/testing/selftests/bpf/verifier/calls.c2034
-rw-r--r--tools/testing/selftests/bpf/verifier/cfg.c73
-rw-r--r--tools/testing/selftests/bpf/verifier/cgroup_inv_retcode.c72
-rw-r--r--tools/testing/selftests/bpf/verifier/cgroup_skb.c197
-rw-r--r--tools/testing/selftests/bpf/verifier/cgroup_storage.c220
-rw-r--r--tools/testing/selftests/bpf/verifier/const_or.c60
-rw-r--r--tools/testing/selftests/bpf/verifier/ctx.c198
-rw-r--r--tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c493
-rw-r--r--tools/testing/selftests/bpf/verifier/ctx_sk_msg.c181
-rw-r--r--tools/testing/selftests/bpf/verifier/ctx_skb.c1091
-rw-r--r--tools/testing/selftests/bpf/verifier/d_path.c37
-rw-r--r--tools/testing/selftests/bpf/verifier/dead_code.c161
-rw-r--r--tools/testing/selftests/bpf/verifier/direct_packet_access.c656
-rw-r--r--tools/testing/selftests/bpf/verifier/direct_stack_access_wraparound.c40
-rw-r--r--tools/testing/selftests/bpf/verifier/direct_value_access.c347
-rw-r--r--tools/testing/selftests/bpf/verifier/div0.c184
-rw-r--r--tools/testing/selftests/bpf/verifier/div_overflow.c110
-rw-r--r--tools/testing/selftests/bpf/verifier/event_output.c119
-rw-r--r--tools/testing/selftests/bpf/verifier/helper_access_var_len.c616
-rw-r--r--tools/testing/selftests/bpf/verifier/helper_packet_access.c460
-rw-r--r--tools/testing/selftests/bpf/verifier/helper_value_access.c953
-rw-r--r--tools/testing/selftests/bpf/verifier/int_ptr.c160
-rw-r--r--tools/testing/selftests/bpf/verifier/jit.c107
-rw-r--r--tools/testing/selftests/bpf/verifier/jmp32.c866
-rw-r--r--tools/testing/selftests/bpf/verifier/jset.c169
-rw-r--r--tools/testing/selftests/bpf/verifier/jump.c375
-rw-r--r--tools/testing/selftests/bpf/verifier/junk_insn.c45
-rw-r--r--tools/testing/selftests/bpf/verifier/ld_abs.c286
-rw-r--r--tools/testing/selftests/bpf/verifier/ld_dw.c45
-rw-r--r--tools/testing/selftests/bpf/verifier/ld_imm64.c146
-rw-r--r--tools/testing/selftests/bpf/verifier/ld_ind.c72
-rw-r--r--tools/testing/selftests/bpf/verifier/leak_ptr.c67
-rw-r--r--tools/testing/selftests/bpf/verifier/loops1.c206
-rw-r--r--tools/testing/selftests/bpf/verifier/lwt.c189
-rw-r--r--tools/testing/selftests/bpf/verifier/map_in_map.c62
-rw-r--r--tools/testing/selftests/bpf/verifier/map_ptr.c98
-rw-r--r--tools/testing/selftests/bpf/verifier/map_ptr_mixing.c100
-rw-r--r--tools/testing/selftests/bpf/verifier/map_ret_val.c65
-rw-r--r--tools/testing/selftests/bpf/verifier/masking.c322
-rw-r--r--tools/testing/selftests/bpf/verifier/meta_access.c235
-rw-r--r--tools/testing/selftests/bpf/verifier/perf_event_sample_period.c59
-rw-r--r--tools/testing/selftests/bpf/verifier/precise.c194
-rw-r--r--tools/testing/selftests/bpf/verifier/prevent_map_lookup.c29
-rw-r--r--tools/testing/selftests/bpf/verifier/raw_stack.c305
-rw-r--r--tools/testing/selftests/bpf/verifier/raw_tp_writable.c34
-rw-r--r--tools/testing/selftests/bpf/verifier/ref_tracking.c939
-rw-r--r--tools/testing/selftests/bpf/verifier/regalloc.c269
-rw-r--r--tools/testing/selftests/bpf/verifier/runtime_jit.c231
-rw-r--r--tools/testing/selftests/bpf/verifier/scale.c18
-rw-r--r--tools/testing/selftests/bpf/verifier/search_pruning.c192
-rw-r--r--tools/testing/selftests/bpf/verifier/sock.c733
-rw-r--r--tools/testing/selftests/bpf/verifier/spill_fill.c106
-rw-r--r--tools/testing/selftests/bpf/verifier/spin_lock.c333
-rw-r--r--tools/testing/selftests/bpf/verifier/stack_ptr.c359
-rw-r--r--tools/testing/selftests/bpf/verifier/subreg.c533
-rw-r--r--tools/testing/selftests/bpf/verifier/uninit.c39
-rw-r--r--tools/testing/selftests/bpf/verifier/unpriv.c538
-rw-r--r--tools/testing/selftests/bpf/verifier/value.c104
-rw-r--r--tools/testing/selftests/bpf/verifier/value_adj_spill.c43
-rw-r--r--tools/testing/selftests/bpf/verifier/value_illegal_alu.c95
-rw-r--r--tools/testing/selftests/bpf/verifier/value_or_null.c171
-rw-r--r--tools/testing/selftests/bpf/verifier/value_ptr_arith.c911
-rw-r--r--tools/testing/selftests/bpf/verifier/var_off.c343
-rw-r--r--tools/testing/selftests/bpf/verifier/wide_access.c73
-rw-r--r--tools/testing/selftests/bpf/verifier/xadd.c97
-rw-r--r--tools/testing/selftests/bpf/verifier/xdp.c14
-rw-r--r--tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c1468
-rwxr-xr-xtools/testing/selftests/bpf/with_addr.sh54
-rwxr-xr-xtools/testing/selftests/bpf/with_tunnels.sh36
-rw-r--r--tools/testing/selftests/bpf/xdping.c258
-rw-r--r--tools/testing/selftests/bpf/xdping.h13
-rw-r--r--tools/testing/selftests/breakpoints/.gitignore3
-rw-r--r--tools/testing/selftests/breakpoints/Makefile16
-rw-r--r--tools/testing/selftests/breakpoints/breakpoint_test.c409
-rw-r--r--tools/testing/selftests/breakpoints/breakpoint_test_arm64.c250
-rw-r--r--tools/testing/selftests/breakpoints/step_after_suspend_test.c226
-rw-r--r--tools/testing/selftests/capabilities/.gitignore3
-rw-r--r--tools/testing/selftests/capabilities/Makefile9
-rw-r--r--tools/testing/selftests/capabilities/test_execve.c462
-rw-r--r--tools/testing/selftests/capabilities/validate_cap.c80
-rw-r--r--tools/testing/selftests/cgroup/.gitignore5
-rw-r--r--tools/testing/selftests/cgroup/Makefile18
-rw-r--r--tools/testing/selftests/cgroup/cgroup_util.c578
-rw-r--r--tools/testing/selftests/cgroup/cgroup_util.h56
-rw-r--r--tools/testing/selftests/cgroup/test_core.c888
-rw-r--r--tools/testing/selftests/cgroup/test_freezer.c905
-rw-r--r--tools/testing/selftests/cgroup/test_kmem.c450
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c1228
-rwxr-xr-xtools/testing/selftests/cgroup/test_stress.sh4
-rwxr-xr-xtools/testing/selftests/cgroup/with_stress.sh101
-rw-r--r--tools/testing/selftests/clone3/.gitignore5
-rw-r--r--tools/testing/selftests/clone3/Makefile8
-rw-r--r--tools/testing/selftests/clone3/clone3.c199
-rw-r--r--tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c182
-rw-r--r--tools/testing/selftests/clone3/clone3_clear_sighand.c128
-rw-r--r--tools/testing/selftests/clone3/clone3_selftests.h82
-rw-r--r--tools/testing/selftests/clone3/clone3_set_tid.c397
-rw-r--r--tools/testing/selftests/core/.gitignore1
-rw-r--r--tools/testing/selftests/core/Makefile7
-rw-r--r--tools/testing/selftests/core/close_range_test.c227
-rw-r--r--tools/testing/selftests/cpu-hotplug/Makefile11
-rw-r--r--tools/testing/selftests/cpu-hotplug/config1
-rwxr-xr-xtools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh293
-rw-r--r--tools/testing/selftests/cpufreq/Makefile9
-rw-r--r--tools/testing/selftests/cpufreq/config15
-rwxr-xr-xtools/testing/selftests/cpufreq/cpu.sh85
-rwxr-xr-xtools/testing/selftests/cpufreq/cpufreq.sh242
-rwxr-xr-xtools/testing/selftests/cpufreq/governor.sh154
-rwxr-xr-xtools/testing/selftests/cpufreq/main.sh198
-rwxr-xr-xtools/testing/selftests/cpufreq/module.sh244
-rwxr-xr-xtools/testing/selftests/cpufreq/special-tests.sh116
-rw-r--r--tools/testing/selftests/dmabuf-heaps/Makefile6
-rw-r--r--tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c397
-rw-r--r--tools/testing/selftests/drivers/.gitignore2
-rw-r--r--tools/testing/selftests/drivers/dma-buf/Makefile8
-rw-r--r--tools/testing/selftests/drivers/dma-buf/config1
-rw-r--r--tools/testing/selftests/drivers/dma-buf/udmabuf.c103
-rwxr-xr-xtools/testing/selftests/drivers/gpu/drm_mm.sh16
-rwxr-xr-xtools/testing/selftests/drivers/gpu/i915.sh16
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh201
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh129
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh151
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh688
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh430
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh660
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh552
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh361
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh263
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh327
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/extack.sh170
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/fib.sh256
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/fib_offload.sh349
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh217
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh198
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh13
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh259
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh166
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh194
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh284
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh320
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh379
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh98
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh341
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh419
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/router_scale.sh142
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh698
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_ets.sh81
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh657
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh116
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh5
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh76
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh9
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh9
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh9
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh222
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py416
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum-2/mirror_gre_scale.sh16
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh55
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum-2/router_scale.sh18
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh1129
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh20
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_police_scale.sh16
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh119
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh120
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh16
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh57
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh18
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh19
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_police_scale.sh16
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh130
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh123
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/tc_police_occ.sh108
-rw-r--r--tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh101
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh394
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/vxlan.sh1156
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh126
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh326
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/devlink.sh548
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/devlink_in_netns.sh72
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh489
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh108
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/fib.sh341
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh953
-rwxr-xr-xtools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh316
-rwxr-xr-xtools/testing/selftests/drivers/usb/usbip/usbip_test.sh200
-rw-r--r--tools/testing/selftests/efivarfs/.gitignore3
-rw-r--r--tools/testing/selftests/efivarfs/Makefile8
-rw-r--r--tools/testing/selftests/efivarfs/config1
-rw-r--r--tools/testing/selftests/efivarfs/create-read.c41
-rwxr-xr-xtools/testing/selftests/efivarfs/efivarfs.sh218
-rw-r--r--tools/testing/selftests/efivarfs/open-unlink.c134
-rw-r--r--tools/testing/selftests/exec/.gitignore14
-rw-r--r--tools/testing/selftests/exec/Makefile35
-rwxr-xr-xtools/testing/selftests/exec/binfmt_script171
-rw-r--r--tools/testing/selftests/exec/execveat.c432
-rw-r--r--tools/testing/selftests/exec/load_address.c68
-rw-r--r--tools/testing/selftests/exec/non-regular.c196
-rw-r--r--tools/testing/selftests/exec/recursion-depth.c67
-rw-r--r--tools/testing/selftests/filesystems/.gitignore3
-rw-r--r--tools/testing/selftests/filesystems/Makefile7
-rw-r--r--tools/testing/selftests/filesystems/binderfs/.gitignore2
-rw-r--r--tools/testing/selftests/filesystems/binderfs/Makefile8
-rw-r--r--tools/testing/selftests/filesystems/binderfs/binderfs_test.c521
-rw-r--r--tools/testing/selftests/filesystems/binderfs/config3
-rw-r--r--tools/testing/selftests/filesystems/devpts_pts.c316
-rw-r--r--tools/testing/selftests/filesystems/dnotify_test.c35
-rw-r--r--tools/testing/selftests/filesystems/epoll/.gitignore2
-rw-r--r--tools/testing/selftests/filesystems/epoll/Makefile7
-rw-r--r--tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c3380
-rw-r--r--tools/testing/selftests/firmware/.gitignore2
-rw-r--r--tools/testing/selftests/firmware/Makefile10
-rw-r--r--tools/testing/selftests/firmware/config5
-rwxr-xr-xtools/testing/selftests/firmware/fw_fallback.sh283
-rwxr-xr-xtools/testing/selftests/firmware/fw_filesystem.sh552
-rwxr-xr-xtools/testing/selftests/firmware/fw_lib.sh223
-rw-r--r--tools/testing/selftests/firmware/fw_namespace.c152
-rwxr-xr-xtools/testing/selftests/firmware/fw_run_tests.sh75
-rw-r--r--tools/testing/selftests/firmware/settings8
-rw-r--r--tools/testing/selftests/fpu/.gitignore2
-rw-r--r--tools/testing/selftests/fpu/Makefile9
-rwxr-xr-xtools/testing/selftests/fpu/run_test_fpu.sh46
-rw-r--r--tools/testing/selftests/fpu/test_fpu.c61
-rw-r--r--tools/testing/selftests/ftrace/.gitignore2
-rw-r--r--tools/testing/selftests/ftrace/Makefile8
-rw-r--r--tools/testing/selftests/ftrace/README82
-rw-r--r--tools/testing/selftests/ftrace/config16
-rwxr-xr-xtools/testing/selftests/ftrace/ftracetest453
-rw-r--r--tools/testing/selftests/ftrace/samples/fail.tc4
-rw-r--r--tools/testing/selftests/ftrace/samples/pass.tc3
-rw-r--r--tools/testing/selftests/ftrace/samples/unresolved.tc4
-rw-r--r--tools/testing/selftests/ftrace/samples/unsupported.tc3
-rw-r--r--tools/testing/selftests/ftrace/samples/untested.tc3
-rw-r--r--tools/testing/selftests/ftrace/samples/xfail.tc3
-rw-r--r--tools/testing/selftests/ftrace/settings1
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/basic1.tc3
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/basic2.tc9
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/basic3.tc10
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/basic4.tc5
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_size.tc22
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/snapshot.tc27
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/trace_pipe.tc15
-rw-r--r--tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc69
-rw-r--r--tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc80
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc26
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/add_remove_synth.tc24
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc41
-rw-r--r--tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc43
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/event-enable.tc48
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc123
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/event-pid.tc61
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc48
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc51
-rw-r--r--tools/testing/selftests/ftrace/test.d/event/trace_printk.tc27
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc73
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc40
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc58
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc94
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc88
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc13
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_cpumask.tc43
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc123
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc23
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_profile_stat.tc21
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc64
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc154
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc35
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc172
-rw-r--r--tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc15
-rw-r--r--tools/testing/selftests/ftrace/test.d/functions155
-rw-r--r--tools/testing/selftests/ftrace/test.d/instances/instance-event.tc142
-rw-r--r--tools/testing/selftests/ftrace/test.d/instances/instance.tc82
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc9
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc11
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc19
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc16
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc42
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc38
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc99
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc47
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc34
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc45
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc45
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_module.tc52
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc32
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_non_uniq_symbol.tc13
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc105
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc19
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc35
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_return_suffix.tc21
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc33
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc38
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/profile.tc14
-rw-r--r--tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc26
-rw-r--r--tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc78
-rw-r--r--tools/testing/selftests/ftrace/test.d/selftest/bashisms.tc21
-rw-r--r--tools/testing/selftests/ftrace/test.d/template15
-rw-r--r--tools/testing/selftests/ftrace/test.d/tracer/wakeup.tc21
-rw-r--r--tools/testing/selftests/ftrace/test.d/tracer/wakeup_rt.tc21
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-action-hist-xfail.tc19
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc33
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc37
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc22
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onchange-action-hist.tc22
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc30
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc30
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc28
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc30
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc34
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc31
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc71
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc19
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc31
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc45
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc38
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc50
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-syntax-errors.tc16
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc58
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc44
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc36
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-stacktrace.tc33
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-hist.tc19
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-snapshot.tc43
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic-kernel.tc27
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic.tc30
-rw-r--r--tools/testing/selftests/ftrace/test.d/trigger/trigger-traceonoff.tc38
-rw-r--r--tools/testing/selftests/futex/Makefile37
-rw-r--r--tools/testing/selftests/futex/README62
-rw-r--r--tools/testing/selftests/futex/functional/.gitignore8
-rw-r--r--tools/testing/selftests/futex/functional/Makefile23
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi.c409
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c135
-rw-r--r--tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c222
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c125
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_timeout.c86
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c123
-rw-r--r--tools/testing/selftests/futex/functional/futex_wait_wouldblock.c78
-rwxr-xr-xtools/testing/selftests/futex/functional/run.sh75
-rw-r--r--tools/testing/selftests/futex/include/atomic.h79
-rw-r--r--tools/testing/selftests/futex/include/futextest.h262
-rw-r--r--tools/testing/selftests/futex/include/logging.h148
-rwxr-xr-xtools/testing/selftests/futex/run.sh29
-rwxr-xr-xtools/testing/selftests/gen_kselftest_tar.sh63
-rw-r--r--tools/testing/selftests/gpio/.gitignore2
-rw-r--r--tools/testing/selftests/gpio/Makefile34
-rw-r--r--tools/testing/selftests/gpio/config2
-rw-r--r--tools/testing/selftests/gpio/gpio-mockup-chardev.c323
-rwxr-xr-xtools/testing/selftests/gpio/gpio-mockup-sysfs.sh135
-rwxr-xr-xtools/testing/selftests/gpio/gpio-mockup.sh206
-rw-r--r--tools/testing/selftests/ia64/.gitignore2
-rw-r--r--tools/testing/selftests/ia64/Makefile9
-rw-r--r--tools/testing/selftests/ia64/aliasing-test.c260
-rw-r--r--tools/testing/selftests/intel_pstate/.gitignore3
-rw-r--r--tools/testing/selftests/intel_pstate/Makefile16
-rw-r--r--tools/testing/selftests/intel_pstate/aperf.c93
-rw-r--r--tools/testing/selftests/intel_pstate/msr.c40
-rwxr-xr-xtools/testing/selftests/intel_pstate/run.sh128
-rw-r--r--tools/testing/selftests/ipc/.gitignore3
-rw-r--r--tools/testing/selftests/ipc/Makefile18
-rw-r--r--tools/testing/selftests/ipc/config2
-rw-r--r--tools/testing/selftests/ipc/msgque.c255
-rw-r--r--tools/testing/selftests/ir/.gitignore2
-rw-r--r--tools/testing/selftests/ir/Makefile7
-rw-r--r--tools/testing/selftests/ir/ir_loopback.c210
-rwxr-xr-xtools/testing/selftests/ir/ir_loopback.sh25
-rw-r--r--tools/testing/selftests/kcmp/.gitignore3
-rw-r--r--tools/testing/selftests/kcmp/Makefile9
-rw-r--r--tools/testing/selftests/kcmp/kcmp_test.c166
-rw-r--r--tools/testing/selftests/kexec/Makefile13
-rw-r--r--tools/testing/selftests/kexec/config3
-rwxr-xr-xtools/testing/selftests/kexec/kexec_common_lib.sh220
-rwxr-xr-xtools/testing/selftests/kexec/test_kexec_file_load.sh238
-rwxr-xr-xtools/testing/selftests/kexec/test_kexec_load.sh47
-rw-r--r--tools/testing/selftests/kmod/Makefile12
-rw-r--r--tools/testing/selftests/kmod/config7
-rwxr-xr-xtools/testing/selftests/kmod/kmod.sh689
-rw-r--r--tools/testing/selftests/kselftest.h290
-rwxr-xr-xtools/testing/selftests/kselftest/module.sh84
-rwxr-xr-xtools/testing/selftests/kselftest/prefix.pl24
-rw-r--r--tools/testing/selftests/kselftest/runner.sh120
-rwxr-xr-xtools/testing/selftests/kselftest_deps.sh325
-rw-r--r--tools/testing/selftests/kselftest_harness.h1065
-rwxr-xr-xtools/testing/selftests/kselftest_install.sh35
-rw-r--r--tools/testing/selftests/kselftest_module.h48
-rw-r--r--tools/testing/selftests/kvm/.gitignore33
-rw-r--r--tools/testing/selftests/kvm/Makefile144
-rw-r--r--tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c3
-rw-r--r--tools/testing/selftests/kvm/aarch64/get-reg-list.c841
-rw-r--r--tools/testing/selftests/kvm/config3
-rw-r--r--tools/testing/selftests/kvm/demand_paging_test.c498
-rw-r--r--tools/testing/selftests/kvm/dirty_log_perf_test.c376
-rw-r--r--tools/testing/selftests/kvm/dirty_log_test.c639
-rw-r--r--tools/testing/selftests/kvm/include/aarch64/processor.h59
-rw-r--r--tools/testing/selftests/kvm/include/evmcs.h1102
-rw-r--r--tools/testing/selftests/kvm/include/kvm_util.h348
-rw-r--r--tools/testing/selftests/kvm/include/perf_test_util.h198
-rw-r--r--tools/testing/selftests/kvm/include/s390x/processor.h22
-rw-r--r--tools/testing/selftests/kvm/include/sparsebit.h73
-rw-r--r--tools/testing/selftests/kvm/include/test_util.h70
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/processor.h422
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm.h297
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/svm_util.h49
-rw-r--r--tools/testing/selftests/kvm/include/x86_64/vmx.h625
-rw-r--r--tools/testing/selftests/kvm/kvm_create_max_vcpus.c96
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/processor.c356
-rw-r--r--tools/testing/selftests/kvm/lib/aarch64/ucall.c114
-rw-r--r--tools/testing/selftests/kvm/lib/assert.c93
-rw-r--r--tools/testing/selftests/kvm/lib/elf.c196
-rw-r--r--tools/testing/selftests/kvm/lib/io.c157
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util.c1865
-rw-r--r--tools/testing/selftests/kvm/lib/kvm_util_internal.h113
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/processor.c247
-rw-r--r--tools/testing/selftests/kvm/lib/s390x/ucall.c59
-rw-r--r--tools/testing/selftests/kvm/lib/sparsebit.c2086
-rw-r--r--tools/testing/selftests/kvm/lib/test_util.c111
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/handlers.S81
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/processor.c1258
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/svm.c177
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/ucall.c59
-rw-r--r--tools/testing/selftests/kvm/lib/x86_64/vmx.c553
-rw-r--r--tools/testing/selftests/kvm/s390x/memop.c166
-rw-r--r--tools/testing/selftests/kvm/s390x/resets.c279
-rw-r--r--tools/testing/selftests/kvm/s390x/sync_regs_test.c193
-rw-r--r--tools/testing/selftests/kvm/set_memory_region_test.c417
-rw-r--r--tools/testing/selftests/kvm/steal_time.c352
-rw-r--r--tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c117
-rw-r--r--tools/testing/selftests/kvm/x86_64/debug_regs.c202
-rw-r--r--tools/testing/selftests/kvm/x86_64/evmcs_test.c166
-rw-r--r--tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c190
-rw-r--r--tools/testing/selftests/kvm/x86_64/kvm_pv_test.c234
-rw-r--r--tools/testing/selftests/kvm/x86_64/mmio_warning_test.c127
-rw-r--r--tools/testing/selftests/kvm/x86_64/platform_info_test.c107
-rw-r--r--tools/testing/selftests/kvm/x86_64/set_sregs_test.c52
-rw-r--r--tools/testing/selftests/kvm/x86_64/smm_test.c164
-rw-r--r--tools/testing/selftests/kvm/x86_64/state_test.c233
-rw-r--r--tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c77
-rw-r--r--tools/testing/selftests/kvm/x86_64/sync_regs_test.c243
-rw-r--r--tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c168
-rw-r--r--tools/testing/selftests/kvm/x86_64/user_msr_test.c248
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c142
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c87
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c157
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c259
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c298
-rw-r--r--tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c168
-rw-r--r--tools/testing/selftests/kvm/x86_64/xss_msr_test.c76
-rw-r--r--tools/testing/selftests/lib.mk159
-rw-r--r--tools/testing/selftests/lib/Makefile9
-rwxr-xr-xtools/testing/selftests/lib/bitmap.sh3
-rw-r--r--tools/testing/selftests/lib/config5
-rwxr-xr-xtools/testing/selftests/lib/prime_numbers.sh4
-rwxr-xr-xtools/testing/selftests/lib/printf.sh4
-rwxr-xr-xtools/testing/selftests/lib/strscpy.sh3
-rw-r--r--tools/testing/selftests/livepatch/Makefile13
-rw-r--r--tools/testing/selftests/livepatch/README43
-rw-r--r--tools/testing/selftests/livepatch/config3
-rw-r--r--tools/testing/selftests/livepatch/functions.sh294
-rw-r--r--tools/testing/selftests/livepatch/settings1
-rwxr-xr-xtools/testing/selftests/livepatch/test-callbacks.sh553
-rwxr-xr-xtools/testing/selftests/livepatch/test-ftrace.sh63
-rwxr-xr-xtools/testing/selftests/livepatch/test-livepatch.sh162
-rwxr-xr-xtools/testing/selftests/livepatch/test-shadow-vars.sh79
-rwxr-xr-xtools/testing/selftests/livepatch/test-state.sh176
-rw-r--r--tools/testing/selftests/lkdtm/.gitignore2
-rw-r--r--tools/testing/selftests/lkdtm/Makefile12
-rw-r--r--tools/testing/selftests/lkdtm/config1
-rwxr-xr-xtools/testing/selftests/lkdtm/run.sh104
-rw-r--r--tools/testing/selftests/lkdtm/tests.txt70
-rw-r--r--tools/testing/selftests/locking/Makefile10
-rwxr-xr-xtools/testing/selftests/locking/ww_mutex.sh19
-rw-r--r--tools/testing/selftests/media_tests/.gitignore4
-rw-r--r--tools/testing/selftests/media_tests/Makefile6
-rwxr-xr-xtools/testing/selftests/media_tests/bind_unbind_sample.sh13
-rwxr-xr-xtools/testing/selftests/media_tests/media_dev_allocator.sh85
-rw-r--r--tools/testing/selftests/media_tests/media_device_open.c82
-rw-r--r--tools/testing/selftests/media_tests/media_device_test.c103
-rwxr-xr-xtools/testing/selftests/media_tests/open_loop_test.sh11
-rw-r--r--tools/testing/selftests/media_tests/regression_test.txt43
-rw-r--r--tools/testing/selftests/media_tests/video_device_test.c101
-rw-r--r--tools/testing/selftests/membarrier/.gitignore3
-rw-r--r--tools/testing/selftests/membarrier/Makefile8
-rw-r--r--tools/testing/selftests/membarrier/membarrier_test_impl.h317
-rw-r--r--tools/testing/selftests/membarrier/membarrier_test_multi_thread.c73
-rw-r--r--tools/testing/selftests/membarrier/membarrier_test_single_thread.c24
-rw-r--r--tools/testing/selftests/memfd/.gitignore5
-rw-r--r--tools/testing/selftests/memfd/Makefile30
-rw-r--r--tools/testing/selftests/memfd/common.c46
-rw-r--r--tools/testing/selftests/memfd/common.h9
-rw-r--r--tools/testing/selftests/memfd/config1
-rw-r--r--tools/testing/selftests/memfd/fuse_mnt.c111
-rw-r--r--tools/testing/selftests/memfd/fuse_test.c331
-rw-r--r--tools/testing/selftests/memfd/memfd_test.c1080
-rwxr-xr-xtools/testing/selftests/memfd/run_fuse_test.sh15
-rwxr-xr-xtools/testing/selftests/memfd/run_hugetlbfs_test.sh68
-rw-r--r--tools/testing/selftests/memory-hotplug/Makefile11
-rw-r--r--tools/testing/selftests/memory-hotplug/config5
-rwxr-xr-xtools/testing/selftests/memory-hotplug/mem-on-off-test.sh291
-rw-r--r--tools/testing/selftests/mincore/.gitignore2
-rw-r--r--tools/testing/selftests/mincore/Makefile6
-rw-r--r--tools/testing/selftests/mincore/mincore_selftest.c369
-rw-r--r--tools/testing/selftests/mount/.gitignore3
-rw-r--r--tools/testing/selftests/mount/Makefile9
-rw-r--r--tools/testing/selftests/mount/config1
-rw-r--r--tools/testing/selftests/mount/nosymfollow-test.c218
-rwxr-xr-xtools/testing/selftests/mount/run_nosymfollow.sh4
-rwxr-xr-xtools/testing/selftests/mount/run_unprivileged_remount.sh12
-rw-r--r--tools/testing/selftests/mount/unprivileged-remount-test.c371
-rw-r--r--tools/testing/selftests/mqueue/.gitignore3
-rw-r--r--tools/testing/selftests/mqueue/Makefile7
-rw-r--r--tools/testing/selftests/mqueue/mq_open_tests.c502
-rw-r--r--tools/testing/selftests/mqueue/mq_perf_tests.c752
-rw-r--r--tools/testing/selftests/net/.gitignore32
-rw-r--r--tools/testing/selftests/net/Makefile43
-rwxr-xr-xtools/testing/selftests/net/altnames.sh75
-rw-r--r--tools/testing/selftests/net/config36
-rwxr-xr-xtools/testing/selftests/net/devlink_port_split.py307
-rwxr-xr-xtools/testing/selftests/net/drop_monitor_tests.sh215
-rwxr-xr-xtools/testing/selftests/net/fcnal-test.sh4034
-rwxr-xr-xtools/testing/selftests/net/fib-onlink-tests.sh505
-rwxr-xr-xtools/testing/selftests/net/fib_nexthop_multiprefix.sh292
-rwxr-xr-xtools/testing/selftests/net/fib_nexthops.sh1696
-rwxr-xr-xtools/testing/selftests/net/fib_rule_tests.sh260
-rwxr-xr-xtools/testing/selftests/net/fib_tests.sh1841
-rw-r--r--tools/testing/selftests/net/fin_ack_lat.c151
-rwxr-xr-xtools/testing/selftests/net/fin_ack_lat.sh35
-rw-r--r--tools/testing/selftests/net/forwarding/.gitignore2
-rw-r--r--tools/testing/selftests/net/forwarding/Makefile77
-rw-r--r--tools/testing/selftests/net/forwarding/README58
-rwxr-xr-xtools/testing/selftests/net/forwarding/bridge_igmp.sh152
-rwxr-xr-xtools/testing/selftests/net/forwarding/bridge_port_isolation.sh151
-rwxr-xr-xtools/testing/selftests/net/forwarding/bridge_sticky_fdb.sh69
-rwxr-xr-xtools/testing/selftests/net/forwarding/bridge_vlan_aware.sh151
-rwxr-xr-xtools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh104
-rw-r--r--tools/testing/selftests/net/forwarding/config14
-rw-r--r--tools/testing/selftests/net/forwarding/devlink_lib.sh557
-rwxr-xr-xtools/testing/selftests/net/forwarding/ethtool.sh301
-rwxr-xr-xtools/testing/selftests/net/forwarding/ethtool_extended_state.sh104
-rw-r--r--tools/testing/selftests/net/forwarding/ethtool_lib.sh86
-rw-r--r--tools/testing/selftests/net/forwarding/fib_offload_lib.sh873
-rw-r--r--tools/testing/selftests/net/forwarding/forwarding.config.sample45
-rwxr-xr-xtools/testing/selftests/net/forwarding/gre_inner_v4_multipath.sh305
-rwxr-xr-xtools/testing/selftests/net/forwarding/gre_inner_v6_multipath.sh306
-rwxr-xr-xtools/testing/selftests/net/forwarding/gre_multipath.sh257
-rwxr-xr-xtools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh172
-rwxr-xr-xtools/testing/selftests/net/forwarding/ip6gre_inner_v4_multipath.sh304
-rwxr-xr-xtools/testing/selftests/net/forwarding/ip6gre_inner_v6_multipath.sh305
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_flat_gre.sh63
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_flat_gre_key.sh63
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_flat_gre_keys.sh63
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_hier_gre.sh63
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_hier_gre_key.sh63
-rwxr-xr-xtools/testing/selftests/net/forwarding/ipip_hier_gre_keys.sh63
-rw-r--r--tools/testing/selftests/net/forwarding/ipip_lib.sh349
-rw-r--r--tools/testing/selftests/net/forwarding/lib.sh1300
-rwxr-xr-xtools/testing/selftests/net/forwarding/loopback.sh102
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre.sh160
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_bound.sh226
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh132
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh132
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh129
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh292
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_changes.sh273
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_flower.sh137
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh285
-rw-r--r--tools/testing/selftests/net/forwarding/mirror_gre_lib.sh130
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_neigh.sh115
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_nh.sh131
-rw-r--r--tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh94
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_vlan.sh92
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh347
-rw-r--r--tools/testing/selftests/net/forwarding/mirror_lib.sh148
-rw-r--r--tools/testing/selftests/net/forwarding/mirror_topo_lib.sh101
-rwxr-xr-xtools/testing/selftests/net/forwarding/mirror_vlan.sh131
-rwxr-xr-xtools/testing/selftests/net/forwarding/pedit_dsfield.sh311
-rwxr-xr-xtools/testing/selftests/net/forwarding/pedit_l4port.sh200
-rwxr-xr-xtools/testing/selftests/net/forwarding/router.sh322
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_bridge.sh113
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_bridge_vlan.sh132
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_broadcast.sh237
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_mpath_nh.sh359
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_multicast.sh416
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_multipath.sh342
-rwxr-xr-xtools/testing/selftests/net/forwarding/router_vid_1.sh135
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_ets.sh47
-rw-r--r--tools/testing/selftests/net/forwarding/sch_ets_core.sh300
-rw-r--r--tools/testing/selftests/net/forwarding/sch_ets_tests.sh223
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_red.sh493
-rw-r--r--tools/testing/selftests/net/forwarding/sch_tbf_core.sh233
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_tbf_ets.sh6
-rw-r--r--tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh39
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_tbf_prio.sh6
-rwxr-xr-xtools/testing/selftests/net/forwarding/sch_tbf_root.sh33
-rw-r--r--tools/testing/selftests/net/forwarding/settings1
-rwxr-xr-xtools/testing/selftests/net/forwarding/skbedit_priority.sh170
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_actions.sh269
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_chains.sh205
-rw-r--r--tools/testing/selftests/net/forwarding/tc_common.sh26
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_flower.sh411
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_flower_router.sh172
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_police.sh385
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_shblocks.sh152
-rwxr-xr-xtools/testing/selftests/net/forwarding/tc_vlan_modify.sh164
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_asymmetric.sh577
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh786
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472.sh10
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh860
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472.sh10
-rwxr-xr-xtools/testing/selftests/net/forwarding/vxlan_symmetric.sh561
-rw-r--r--tools/testing/selftests/net/hwtstamp_config.c135
-rwxr-xr-xtools/testing/selftests/net/icmp_redirect.sh537
-rwxr-xr-xtools/testing/selftests/net/in_netns.sh23
-rwxr-xr-xtools/testing/selftests/net/ip6_gre_headroom.sh65
-rw-r--r--tools/testing/selftests/net/ip_defrag.c472
-rwxr-xr-xtools/testing/selftests/net/ip_defrag.sh64
-rw-r--r--tools/testing/selftests/net/ipsec.c2195
-rw-r--r--tools/testing/selftests/net/ipv6_flowlabel.c229
-rwxr-xr-xtools/testing/selftests/net/ipv6_flowlabel.sh21
-rw-r--r--tools/testing/selftests/net/ipv6_flowlabel_mgr.c199
-rwxr-xr-xtools/testing/selftests/net/l2tp.sh382
-rw-r--r--tools/testing/selftests/net/mptcp/.gitignore4
-rw-r--r--tools/testing/selftests/net/mptcp/Makefile17
-rw-r--r--tools/testing/selftests/net/mptcp/config9
-rwxr-xr-xtools/testing/selftests/net/mptcp/diag.sh125
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_connect.c912
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_connect.sh697
-rwxr-xr-xtools/testing/selftests/net/mptcp/mptcp_join.sh629
-rw-r--r--tools/testing/selftests/net/mptcp/mptcp_lib.sh104
-rwxr-xr-xtools/testing/selftests/net/mptcp/pm_netlink.sh136
-rw-r--r--tools/testing/selftests/net/mptcp/pm_nl_ctl.c616
-rw-r--r--tools/testing/selftests/net/mptcp/settings1
-rwxr-xr-xtools/testing/selftests/net/mptcp/simult_flows.sh297
-rw-r--r--tools/testing/selftests/net/msg_zerocopy.c811
-rwxr-xr-xtools/testing/selftests/net/msg_zerocopy.sh122
-rwxr-xr-xtools/testing/selftests/net/netdevice.sh205
-rw-r--r--tools/testing/selftests/net/nettest.c1815
-rwxr-xr-xtools/testing/selftests/net/pmtu.sh1924
-rw-r--r--tools/testing/selftests/net/psock_fanout.c472
-rw-r--r--tools/testing/selftests/net/psock_lib.h144
-rw-r--r--tools/testing/selftests/net/psock_snd.c397
-rwxr-xr-xtools/testing/selftests/net/psock_snd.sh98
-rw-r--r--tools/testing/selftests/net/psock_tpacket.c850
-rw-r--r--tools/testing/selftests/net/reuseaddr_conflict.c114
-rw-r--r--tools/testing/selftests/net/reuseaddr_ports_exhausted.c162
-rwxr-xr-xtools/testing/selftests/net/reuseaddr_ports_exhausted.sh35
-rw-r--r--tools/testing/selftests/net/reuseport_addr_any.c278
-rwxr-xr-xtools/testing/selftests/net/reuseport_addr_any.sh4
-rw-r--r--tools/testing/selftests/net/reuseport_bpf.c641
-rw-r--r--tools/testing/selftests/net/reuseport_bpf_cpu.c259
-rw-r--r--tools/testing/selftests/net/reuseport_bpf_numa.c258
-rw-r--r--tools/testing/selftests/net/reuseport_dualstack.c210
-rwxr-xr-xtools/testing/selftests/net/route_localnet.sh74
-rwxr-xr-xtools/testing/selftests/net/rtnetlink.sh1301
-rwxr-xr-xtools/testing/selftests/net/run_afpackettests46
-rwxr-xr-xtools/testing/selftests/net/run_netsocktests13
-rw-r--r--tools/testing/selftests/net/rxtimestamp.c430
-rwxr-xr-xtools/testing/selftests/net/rxtimestamp.sh4
-rw-r--r--tools/testing/selftests/net/so_txtime.c393
-rwxr-xr-xtools/testing/selftests/net/so_txtime.sh36
-rw-r--r--tools/testing/selftests/net/socket.c93
-rw-r--r--tools/testing/selftests/net/tcp_fastopen_backup_key.c335
-rwxr-xr-xtools/testing/selftests/net/tcp_fastopen_backup_key.sh55
-rw-r--r--tools/testing/selftests/net/tcp_inq.c179
-rw-r--r--tools/testing/selftests/net/tcp_mmap.c517
-rwxr-xr-xtools/testing/selftests/net/test_blackhole_dev.sh11
-rwxr-xr-xtools/testing/selftests/net/test_bpf.sh11
-rwxr-xr-xtools/testing/selftests/net/test_vxlan_fdb_changelink.sh29
-rwxr-xr-xtools/testing/selftests/net/test_vxlan_under_vrf.sh129
-rw-r--r--tools/testing/selftests/net/timestamping.c515
-rw-r--r--tools/testing/selftests/net/tls.c1335
-rwxr-xr-xtools/testing/selftests/net/traceroute.sh322
-rw-r--r--tools/testing/selftests/net/txring_overwrite.c179
-rw-r--r--tools/testing/selftests/net/txtimestamp.c922
-rwxr-xr-xtools/testing/selftests/net/txtimestamp.sh82
-rwxr-xr-xtools/testing/selftests/net/udpgro.sh216
-rwxr-xr-xtools/testing/selftests/net/udpgro_bench.sh95
-rw-r--r--tools/testing/selftests/net/udpgso.c685
-rwxr-xr-xtools/testing/selftests/net/udpgso.sh29
-rwxr-xr-xtools/testing/selftests/net/udpgso_bench.sh151
-rw-r--r--tools/testing/selftests/net/udpgso_bench_rx.c409
-rw-r--r--tools/testing/selftests/net/udpgso_bench_tx.c734
-rwxr-xr-xtools/testing/selftests/net/vrf-xfrm-tests.sh436
-rwxr-xr-xtools/testing/selftests/net/vrf_route_leaking.sh626
-rwxr-xr-xtools/testing/selftests/net/vrf_strict_mode_test.sh396
-rwxr-xr-xtools/testing/selftests/net/xfrm_policy.sh486
-rw-r--r--tools/testing/selftests/netfilter/.gitignore2
-rw-r--r--tools/testing/selftests/netfilter/Makefile13
-rwxr-xr-xtools/testing/selftests/netfilter/bridge_brouter.sh146
-rw-r--r--tools/testing/selftests/netfilter/config8
-rwxr-xr-xtools/testing/selftests/netfilter/conntrack_icmp_related.sh315
-rwxr-xr-xtools/testing/selftests/netfilter/conntrack_vrf.sh241
-rwxr-xr-xtools/testing/selftests/netfilter/ipvs.sh228
-rw-r--r--tools/testing/selftests/netfilter/nf-queue.c395
-rwxr-xr-xtools/testing/selftests/netfilter/nft_concat_range.sh1586
-rwxr-xr-xtools/testing/selftests/netfilter/nft_conntrack_helper.sh181
-rwxr-xr-xtools/testing/selftests/netfilter/nft_flowtable.sh420
-rwxr-xr-xtools/testing/selftests/netfilter/nft_meta.sh142
-rwxr-xr-xtools/testing/selftests/netfilter/nft_nat.sh914
-rwxr-xr-xtools/testing/selftests/netfilter/nft_queue.sh376
-rwxr-xr-xtools/testing/selftests/netfilter/nft_trans_stress.sh78
-rw-r--r--tools/testing/selftests/nsfs/.gitignore3
-rw-r--r--tools/testing/selftests/nsfs/Makefile6
-rw-r--r--tools/testing/selftests/nsfs/config3
-rw-r--r--tools/testing/selftests/nsfs/owner.c92
-rw-r--r--tools/testing/selftests/nsfs/pidns.c79
-rwxr-xr-xtools/testing/selftests/ntb/ntb_test.sh631
-rw-r--r--tools/testing/selftests/openat2/.gitignore2
-rw-r--r--tools/testing/selftests/openat2/Makefile8
-rw-r--r--tools/testing/selftests/openat2/helpers.c109
-rw-r--r--tools/testing/selftests/openat2/helpers.h108
-rw-r--r--tools/testing/selftests/openat2/openat2_test.c322
-rw-r--r--tools/testing/selftests/openat2/rename_attack_test.c160
-rw-r--r--tools/testing/selftests/openat2/resolve_test.c523
-rw-r--r--tools/testing/selftests/pid_namespace/.gitignore1
-rw-r--r--tools/testing/selftests/pid_namespace/Makefile8
-rw-r--r--tools/testing/selftests/pid_namespace/config2
-rw-r--r--tools/testing/selftests/pid_namespace/regression_enomem.c44
-rw-r--r--tools/testing/selftests/pidfd/.gitignore8
-rw-r--r--tools/testing/selftests/pidfd/Makefile8
-rw-r--r--tools/testing/selftests/pidfd/config7
-rw-r--r--tools/testing/selftests/pidfd/pidfd.h118
-rw-r--r--tools/testing/selftests/pidfd/pidfd_fdinfo_test.c310
-rw-r--r--tools/testing/selftests/pidfd/pidfd_getfd_test.c246
-rw-r--r--tools/testing/selftests/pidfd/pidfd_open_test.c163
-rw-r--r--tools/testing/selftests/pidfd/pidfd_poll_test.c116
-rw-r--r--tools/testing/selftests/pidfd/pidfd_setns_test.c559
-rw-r--r--tools/testing/selftests/pidfd/pidfd_test.c573
-rw-r--r--tools/testing/selftests/pidfd/pidfd_wait.c224
-rw-r--r--tools/testing/selftests/powerpc/Makefile77
-rw-r--r--tools/testing/selftests/powerpc/alignment/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/alignment/Makefile7
-rw-r--r--tools/testing/selftests/powerpc/alignment/alignment_handler.c689
-rw-r--r--tools/testing/selftests/powerpc/alignment/copy_first_unaligned.c67
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/.gitignore8
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/Makefile20
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/context_switch.c508
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/exec_target.c16
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/fork.c325
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/futex_bench.c43
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/gettimeofday.c31
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/mmap_bench.c90
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/null_syscall.c153
-rw-r--r--tools/testing/selftests/powerpc/benchmarks/settings1
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/.gitignore2
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/Makefile7
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/cache_shape.c121
-rw-r--r--tools/testing/selftests/powerpc/copyloops/.gitignore15
-rw-r--r--tools/testing/selftests/powerpc/copyloops/Makefile58
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h0
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/export.h4
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h0
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/kasan.h0
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h48
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/processor.h0
l---------tools/testing/selftests/powerpc/copyloops/copy_mc_64.S1
-rw-r--r--tools/testing/selftests/powerpc/copyloops/copy_tofrom_user_reference.S24
l---------tools/testing/selftests/powerpc/copyloops/copyuser_64.S1
l---------tools/testing/selftests/powerpc/copyloops/copyuser_power7.S1
-rw-r--r--tools/testing/selftests/powerpc/copyloops/exc_validate.c124
l---------tools/testing/selftests/powerpc/copyloops/memcpy_64.S1
l---------tools/testing/selftests/powerpc/copyloops/memcpy_power7.S1
-rw-r--r--tools/testing/selftests/powerpc/copyloops/stubs.S19
-rw-r--r--tools/testing/selftests/powerpc/copyloops/validate.c100
-rw-r--r--tools/testing/selftests/powerpc/dscr/.gitignore8
-rw-r--r--tools/testing/selftests/powerpc/dscr/Makefile13
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr.h122
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_default_test.c126
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c70
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c108
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c86
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c103
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c79
-rw-r--r--tools/testing/selftests/powerpc/dscr/dscr_user_test.c60
-rw-r--r--tools/testing/selftests/powerpc/dscr/settings1
-rw-r--r--tools/testing/selftests/powerpc/eeh/Makefile9
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-basic.sh90
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-functions.sh85
-rw-r--r--tools/testing/selftests/powerpc/harness.c133
-rw-r--r--tools/testing/selftests/powerpc/include/basic_asm.h74
-rw-r--r--tools/testing/selftests/powerpc/include/fpu_asm.h76
-rw-r--r--tools/testing/selftests/powerpc/include/gpr_asm.h92
-rw-r--r--tools/testing/selftests/powerpc/include/instructions.h146
-rw-r--r--tools/testing/selftests/powerpc/include/pkeys.h136
-rw-r--r--tools/testing/selftests/powerpc/include/reg.h163
-rw-r--r--tools/testing/selftests/powerpc/include/subunit.h52
-rw-r--r--tools/testing/selftests/powerpc/include/utils.h148
-rw-r--r--tools/testing/selftests/powerpc/include/vmx_asm.h92
-rw-r--r--tools/testing/selftests/powerpc/include/vsx_asm.h67
-rw-r--r--tools/testing/selftests/powerpc/lib/reg.S393
-rw-r--r--tools/testing/selftests/powerpc/math/.gitignore9
-rw-r--r--tools/testing/selftests/powerpc/math/Makefile19
-rw-r--r--tools/testing/selftests/powerpc/math/fpu_asm.S131
-rw-r--r--tools/testing/selftests/powerpc/math/fpu_denormal.c38
-rw-r--r--tools/testing/selftests/powerpc/math/fpu_preempt.c110
-rw-r--r--tools/testing/selftests/powerpc/math/fpu_signal.c131
-rw-r--r--tools/testing/selftests/powerpc/math/fpu_syscall.c86
-rw-r--r--tools/testing/selftests/powerpc/math/vmx_asm.S148
-rw-r--r--tools/testing/selftests/powerpc/math/vmx_preempt.c113
-rw-r--r--tools/testing/selftests/powerpc/math/vmx_signal.c155
-rw-r--r--tools/testing/selftests/powerpc/math/vmx_syscall.c92
-rw-r--r--tools/testing/selftests/powerpc/math/vsx_asm.S57
-rw-r--r--tools/testing/selftests/powerpc/math/vsx_preempt.c145
-rw-r--r--tools/testing/selftests/powerpc/mm/.gitignore14
-rw-r--r--tools/testing/selftests/powerpc/mm/Makefile34
-rw-r--r--tools/testing/selftests/powerpc/mm/bad_accesses.c144
-rw-r--r--tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c77
-rw-r--r--tools/testing/selftests/powerpc/mm/large_vm_fork_separation.c87
-rw-r--r--tools/testing/selftests/powerpc/mm/pkey_exec_prot.c294
-rw-r--r--tools/testing/selftests/powerpc/mm/pkey_siginfo.c333
-rw-r--r--tools/testing/selftests/powerpc/mm/prot_sao.c48
-rw-r--r--tools/testing/selftests/powerpc/mm/segv_errors.c78
-rw-r--r--tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c202
-rw-r--r--tools/testing/selftests/powerpc/mm/stack_expansion_signal.c118
-rw-r--r--tools/testing/selftests/powerpc/mm/subpage_prot.c236
-rw-r--r--tools/testing/selftests/powerpc/mm/tlbie_test.c734
-rw-r--r--tools/testing/selftests/powerpc/mm/wild_bctr.c170
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules1
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/Makefile8
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/README45
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/gunz_test.c1028
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c433
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c316
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h56
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/include/crb.h155
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/include/nx.h38
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h95
-rw-r--r--tools/testing/selftests/powerpc/nx-gzip/include/nxu.h650
l---------tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h1
-rwxr-xr-xtools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh46
-rw-r--r--tools/testing/selftests/powerpc/pmu/.gitignore5
-rw-r--r--tools/testing/selftests/powerpc/pmu/Makefile50
-rw-r--r--tools/testing/selftests/powerpc/pmu/count_instructions.c147
-rw-r--r--tools/testing/selftests/powerpc/pmu/count_stcx_fail.c164
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/.gitignore23
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/Makefile37
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c106
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/busy_loop.S271
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/close_clears_pmcc_test.c61
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cpu_event_pinned_vs_ebb_test.c95
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cpu_event_vs_ebb_test.c91
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c58
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c117
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c91
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb.c485
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb.h78
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_handler.S365
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_on_child_test.c88
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c92
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_vs_cpu_event_test.c88
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/event_attributes_test.c133
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/fixed_instruction_loop.S43
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c80
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/instruction_count_test.c167
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c102
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c86
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c109
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c61
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c106
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c93
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c40
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/task_event_pinned_vs_ebb_test.c93
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/task_event_vs_ebb_test.c85
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/trace.c300
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/trace.h41
-rw-r--r--tools/testing/selftests/powerpc/pmu/event.c131
-rw-r--r--tools/testing/selftests/powerpc/pmu/event.h43
-rw-r--r--tools/testing/selftests/powerpc/pmu/l3_bank_test.c51
-rw-r--r--tools/testing/selftests/powerpc/pmu/lib.c227
-rw-r--r--tools/testing/selftests/powerpc/pmu/lib.h41
-rw-r--r--tools/testing/selftests/powerpc/pmu/loop.S78
-rw-r--r--tools/testing/selftests/powerpc/pmu/per_event_excludes.c111
-rw-r--r--tools/testing/selftests/powerpc/primitives/.gitignore2
-rw-r--r--tools/testing/selftests/powerpc/primitives/Makefile9
l---------tools/testing/selftests/powerpc/primitives/asm/asm-compat.h1
l---------tools/testing/selftests/powerpc/primitives/asm/asm-const.h1
l---------tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h1
-rw-r--r--tools/testing/selftests/powerpc/primitives/asm/firmware.h0
-rw-r--r--tools/testing/selftests/powerpc/primitives/asm/ppc-opcode.h0
l---------tools/testing/selftests/powerpc/primitives/asm/ppc_asm.h1
-rw-r--r--tools/testing/selftests/powerpc/primitives/asm/processor.h0
-rw-r--r--tools/testing/selftests/powerpc/primitives/linux/stringify.h0
-rw-r--r--tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c147
l---------tools/testing/selftests/powerpc/primitives/word-at-a-time.h1
-rw-r--r--tools/testing/selftests/powerpc/ptrace/.gitignore16
-rw-r--r--tools/testing/selftests/powerpc/ptrace/Makefile15
-rw-r--r--tools/testing/selftests/powerpc/ptrace/child.h139
-rw-r--r--tools/testing/selftests/powerpc/ptrace/core-pkey.c462
-rw-r--r--tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c308
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-gpr.c119
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-gpr.h70
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c550
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c330
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c228
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tar.c134
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tar.h46
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c154
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c165
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c170
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c180
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c163
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c156
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c163
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-vsx.c115
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace-vsx.h123
-rw-r--r--tools/testing/selftests/powerpc/ptrace/ptrace.h745
-rwxr-xr-xtools/testing/selftests/powerpc/scripts/hmi.sh82
-rw-r--r--tools/testing/selftests/powerpc/security/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/security/Makefile15
-rw-r--r--tools/testing/selftests/powerpc/security/branch_loops.S82
-rw-r--r--tools/testing/selftests/powerpc/security/entry_flush.c139
-rw-r--r--tools/testing/selftests/powerpc/security/flush_utils.c70
-rw-r--r--tools/testing/selftests/powerpc/security/flush_utils.h21
-rw-r--r--tools/testing/selftests/powerpc/security/rfi_flush.c142
-rw-r--r--tools/testing/selftests/powerpc/security/spectre_v2.c231
-rw-r--r--tools/testing/selftests/powerpc/signal/.gitignore5
-rw-r--r--tools/testing/selftests/powerpc/signal/Makefile13
-rw-r--r--tools/testing/selftests/powerpc/signal/settings1
-rw-r--r--tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c174
-rw-r--r--tools/testing/selftests/powerpc/signal/sigfuz.c325
-rw-r--r--tools/testing/selftests/powerpc/signal/signal.S46
-rw-r--r--tools/testing/selftests/powerpc/signal/signal.c107
-rw-r--r--tools/testing/selftests/powerpc/signal/signal_tm.c106
-rw-r--r--tools/testing/selftests/powerpc/signal/sigreturn_vdso.c127
-rw-r--r--tools/testing/selftests/powerpc/stringloops/.gitignore5
-rw-r--r--tools/testing/selftests/powerpc/stringloops/Makefile35
-rw-r--r--tools/testing/selftests/powerpc/stringloops/asm/cache.h1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/asm/export.h1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h35
-rw-r--r--tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h33
-rw-r--r--tools/testing/selftests/powerpc/stringloops/memcmp.c169
l---------tools/testing/selftests/powerpc/stringloops/memcmp_32.S1
l---------tools/testing/selftests/powerpc/stringloops/memcmp_64.S1
-rw-r--r--tools/testing/selftests/powerpc/stringloops/string.c21
-rw-r--r--tools/testing/selftests/powerpc/stringloops/strlen.c127
l---------tools/testing/selftests/powerpc/stringloops/strlen_32.S1
-rw-r--r--tools/testing/selftests/powerpc/switch_endian/.gitignore3
-rw-r--r--tools/testing/selftests/powerpc/switch_endian/Makefile18
-rw-r--r--tools/testing/selftests/powerpc/switch_endian/check.S101
-rw-r--r--tools/testing/selftests/powerpc/switch_endian/common.h7
-rw-r--r--tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S97
-rw-r--r--tools/testing/selftests/powerpc/syscalls/.gitignore2
-rw-r--r--tools/testing/selftests/powerpc/syscalls/Makefile9
-rw-r--r--tools/testing/selftests/powerpc/syscalls/ipc.h48
-rw-r--r--tools/testing/selftests/powerpc/syscalls/ipc_unmuxed.c57
-rw-r--r--tools/testing/selftests/powerpc/syscalls/rtas_filter.c285
-rw-r--r--tools/testing/selftests/powerpc/tm/.gitignore22
-rw-r--r--tools/testing/selftests/powerpc/tm/Makefile32
-rw-r--r--tools/testing/selftests/powerpc/tm/settings1
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-exec.c66
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-fork.c42
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-poison.c182
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-resched-dscr.c100
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c110
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c112
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c135
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c184
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c180
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c74
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c284
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c50
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal-stack.c76
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-signal.S110
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-sigreturn.c93
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-syscall-asm.S28
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-syscall.c106
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tar.c91
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-tmspr.c143
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-trap.c333
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-unavailable.c410
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c118
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-vmxcopy.c104
-rw-r--r--tools/testing/selftests/powerpc/tm/tm.h96
-rw-r--r--tools/testing/selftests/powerpc/utils.c303
-rw-r--r--tools/testing/selftests/powerpc/vphn/.gitignore2
-rw-r--r--tools/testing/selftests/powerpc/vphn/Makefile10
l---------tools/testing/selftests/powerpc/vphn/asm/lppaca.h1
-rw-r--r--tools/testing/selftests/powerpc/vphn/test-vphn.c411
l---------tools/testing/selftests/powerpc/vphn/vphn.c1
-rw-r--r--tools/testing/selftests/prctl/.gitignore4
-rw-r--r--tools/testing/selftests/prctl/Makefile16
-rw-r--r--tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c98
-rw-r--r--tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c97
-rw-r--r--tools/testing/selftests/prctl/disable-tsc-test.c96
-rw-r--r--tools/testing/selftests/proc/.gitignore19
-rw-r--r--tools/testing/selftests/proc/Makefile25
-rw-r--r--tools/testing/selftests/proc/config1
-rw-r--r--tools/testing/selftests/proc/fd-001-lookup.c168
-rw-r--r--tools/testing/selftests/proc/fd-002-posix-eq.c57
-rw-r--r--tools/testing/selftests/proc/fd-003-kthread.c178
-rw-r--r--tools/testing/selftests/proc/proc-fsconfig-hidepid.c50
-rw-r--r--tools/testing/selftests/proc/proc-loadavg-001.c82
-rw-r--r--tools/testing/selftests/proc/proc-multiple-procfs.c48
-rw-r--r--tools/testing/selftests/proc/proc-pid-vm.c462
-rw-r--r--tools/testing/selftests/proc/proc-self-map-files-001.c82
-rw-r--r--tools/testing/selftests/proc/proc-self-map-files-002.c94
-rw-r--r--tools/testing/selftests/proc/proc-self-syscall.c58
-rw-r--r--tools/testing/selftests/proc/proc-self-wchan.c40
-rw-r--r--tools/testing/selftests/proc/proc-uptime-001.c45
-rw-r--r--tools/testing/selftests/proc/proc-uptime-002.c79
-rw-r--r--tools/testing/selftests/proc/proc-uptime.h60
-rw-r--r--tools/testing/selftests/proc/proc.h51
-rw-r--r--tools/testing/selftests/proc/read.c146
-rw-r--r--tools/testing/selftests/proc/self.c39
-rw-r--r--tools/testing/selftests/proc/setns-dcache.c129
-rw-r--r--tools/testing/selftests/proc/setns-sysvipc.c133
-rw-r--r--tools/testing/selftests/proc/thread-self.c64
-rw-r--r--tools/testing/selftests/pstore/.gitignore3
-rw-r--r--tools/testing/selftests/pstore/Makefile14
-rwxr-xr-xtools/testing/selftests/pstore/common_tests83
-rw-r--r--tools/testing/selftests/pstore/config5
-rwxr-xr-xtools/testing/selftests/pstore/pstore_crash_test30
-rwxr-xr-xtools/testing/selftests/pstore/pstore_post_reboot_tests80
-rwxr-xr-xtools/testing/selftests/pstore/pstore_tests30
-rw-r--r--tools/testing/selftests/ptp/.gitignore2
-rw-r--r--tools/testing/selftests/ptp/Makefile10
-rwxr-xr-xtools/testing/selftests/ptp/phc.sh166
-rw-r--r--tools/testing/selftests/ptp/testptp.c511
-rw-r--r--tools/testing/selftests/ptp/testptp.mk33
-rw-r--r--tools/testing/selftests/ptrace/.gitignore4
-rw-r--r--tools/testing/selftests/ptrace/Makefile6
-rw-r--r--tools/testing/selftests/ptrace/get_syscall_info.c271
-rw-r--r--tools/testing/selftests/ptrace/peeksiginfo.c219
-rw-r--r--tools/testing/selftests/ptrace/vmaccess.c86
-rw-r--r--tools/testing/selftests/rcutorture/.gitignore5
-rw-r--r--tools/testing/selftests/rcutorture/Makefile3
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configNR_CPUS.sh32
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/config_override.sh48
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configcheck.sh43
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configinit.sh44
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/console-badness.sh17
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/cpus2use.sh40
-rw-r--r--tools/testing/selftests/rcutorture/bin/functions.sh292
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/jitter.sh102
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kcsan-collapse.sh22
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-build.sh52
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-check-branches.sh108
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-find-errors.sh66
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh38
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh76
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh109
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh83
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh71
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh38
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh101
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh287
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-transform.sh51
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm.sh536
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/mkinitrd.sh81
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-build.sh49
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-console.sh174
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/BUSTED6
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/BUSTED.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/CFLIST7
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK016
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK026
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK02.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK036
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK03.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK046
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK056
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK066
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK076
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh29
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED7
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFLIST19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFcommon7
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/RUDE0110
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-N8
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-P12
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-t10
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-u10
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS0110
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS024
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS039
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY0113
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY0214
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRACE0111
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRACE0211
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0118
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot6
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0220
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0318
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot6
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0417
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0521
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot4
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0620
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot5
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0717
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0820
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot3
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0915
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE1018
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL11
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot3
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh43
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/TINY16
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/TREE19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/TREE5422
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh16
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/CFLIST2
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT18
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/PREEMPT18
-rw-r--r--tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh16
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/CFLIST2
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT9
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/PREEMPT9
-rw-r--r--tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh30
-rw-r--r--tools/testing/selftests/rcutorture/doc/TINY_RCU.txt38
-rw-r--r--tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt80
-rw-r--r--tools/testing/selftests/rcutorture/doc/initrd.txt16
-rw-r--r--tools/testing/selftests/rcutorture/doc/rcu-test-image.txt67
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore2
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile17
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore2
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h1
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h152
-rwxr-xr-xtools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk376
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h17
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h41
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h14
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c14
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h28
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c32
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h34
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h221
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c12
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h58
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h93
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c79
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h59
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c51
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h103
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore2
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile12
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail1
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail1
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail1
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail1
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass0
-rw-r--r--tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c73
-rwxr-xr-xtools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh103
-rw-r--r--tools/testing/selftests/resctrl/Makefile17
-rw-r--r--tools/testing/selftests/resctrl/README53
-rw-r--r--tools/testing/selftests/resctrl/cache.c276
-rw-r--r--tools/testing/selftests/resctrl/cat_test.c250
-rw-r--r--tools/testing/selftests/resctrl/cqm_test.c176
-rw-r--r--tools/testing/selftests/resctrl/fill_buf.c218
-rw-r--r--tools/testing/selftests/resctrl/mba_test.c176
-rw-r--r--tools/testing/selftests/resctrl/mbm_test.c145
-rw-r--r--tools/testing/selftests/resctrl/resctrl.h113
-rw-r--r--tools/testing/selftests/resctrl/resctrl_tests.c207
-rw-r--r--tools/testing/selftests/resctrl/resctrl_val.c767
-rw-r--r--tools/testing/selftests/resctrl/resctrlfs.c723
-rw-r--r--tools/testing/selftests/rseq/.gitignore7
-rw-r--r--tools/testing/selftests/rseq/Makefile40
-rw-r--r--tools/testing/selftests/rseq/basic_percpu_ops_test.c311
-rw-r--r--tools/testing/selftests/rseq/basic_test.c56
-rw-r--r--tools/testing/selftests/rseq/compiler.h30
-rw-r--r--tools/testing/selftests/rseq/param_test.c1550
-rw-r--r--tools/testing/selftests/rseq/rseq-abi.h151
-rw-r--r--tools/testing/selftests/rseq/rseq-arm.h827
-rw-r--r--tools/testing/selftests/rseq/rseq-arm64.h695
-rw-r--r--tools/testing/selftests/rseq/rseq-generic-thread-pointer.h25
-rw-r--r--tools/testing/selftests/rseq/rseq-mips.h777
-rw-r--r--tools/testing/selftests/rseq/rseq-ppc-thread-pointer.h30
-rw-r--r--tools/testing/selftests/rseq/rseq-ppc.h791
-rw-r--r--tools/testing/selftests/rseq/rseq-s390.h610
-rw-r--r--tools/testing/selftests/rseq/rseq-skip.h65
-rw-r--r--tools/testing/selftests/rseq/rseq-thread-pointer.h19
-rw-r--r--tools/testing/selftests/rseq/rseq-x86-thread-pointer.h40
-rw-r--r--tools/testing/selftests/rseq/rseq-x86.h1365
-rw-r--r--tools/testing/selftests/rseq/rseq.c168
-rw-r--r--tools/testing/selftests/rseq/rseq.h175
-rwxr-xr-xtools/testing/selftests/rseq/run_param_test.sh126
-rw-r--r--tools/testing/selftests/rseq/settings1
-rw-r--r--tools/testing/selftests/rtc/.gitignore3
-rw-r--r--tools/testing/selftests/rtc/Makefile11
-rw-r--r--tools/testing/selftests/rtc/rtctest.c337
-rw-r--r--tools/testing/selftests/rtc/setdate.c77
-rw-r--r--tools/testing/selftests/rtc/settings1
-rwxr-xr-xtools/testing/selftests/run_kselftest.sh93
-rw-r--r--tools/testing/selftests/safesetid/.gitignore2
-rw-r--r--tools/testing/selftests/safesetid/Makefile9
-rw-r--r--tools/testing/selftests/safesetid/config2
-rw-r--r--tools/testing/selftests/safesetid/safesetid-test.c335
-rwxr-xr-xtools/testing/selftests/safesetid/safesetid-test.sh26
-rw-r--r--tools/testing/selftests/seccomp/.gitignore3
-rw-r--r--tools/testing/selftests/seccomp/Makefile6
-rw-r--r--tools/testing/selftests/seccomp/config4
-rw-r--r--tools/testing/selftests/seccomp/seccomp_benchmark.c135
-rw-r--r--tools/testing/selftests/seccomp/seccomp_bpf.c4162
-rw-r--r--tools/testing/selftests/seccomp/settings1
-rw-r--r--tools/testing/selftests/sigaltstack/.gitignore2
-rw-r--r--tools/testing/selftests/sigaltstack/Makefile6
-rw-r--r--tools/testing/selftests/sigaltstack/current_stack_pointer.h23
-rw-r--r--tools/testing/selftests/sigaltstack/sas.c187
-rw-r--r--tools/testing/selftests/size/.gitignore2
-rw-r--r--tools/testing/selftests/size/Makefile6
-rw-r--r--tools/testing/selftests/size/get_size.c116
-rw-r--r--tools/testing/selftests/sparc64/Makefile50
-rw-r--r--tools/testing/selftests/sparc64/drivers/.gitignore2
-rw-r--r--tools/testing/selftests/sparc64/drivers/Makefile15
-rw-r--r--tools/testing/selftests/sparc64/drivers/adi-test.c721
-rwxr-xr-xtools/testing/selftests/sparc64/drivers/drivers_test.sh30
-rwxr-xr-xtools/testing/selftests/sparc64/run.sh3
-rw-r--r--tools/testing/selftests/splice/.gitignore3
-rw-r--r--tools/testing/selftests/splice/Makefile5
-rw-r--r--tools/testing/selftests/splice/config1
-rw-r--r--tools/testing/selftests/splice/default_file_splice_read.c9
-rwxr-xr-xtools/testing/selftests/splice/default_file_splice_read.sh8
-rw-r--r--tools/testing/selftests/splice/settings1
-rwxr-xr-xtools/testing/selftests/splice/short_splice_read.sh133
-rw-r--r--tools/testing/selftests/splice/splice_read.c57
-rw-r--r--tools/testing/selftests/static_keys/Makefile9
-rw-r--r--tools/testing/selftests/static_keys/config1
-rwxr-xr-xtools/testing/selftests/static_keys/test_static_keys.sh30
-rw-r--r--tools/testing/selftests/sync/.gitignore2
-rw-r--r--tools/testing/selftests/sync/Makefile38
-rw-r--r--tools/testing/selftests/sync/config4
-rw-r--r--tools/testing/selftests/sync/sw_sync.h46
-rw-r--r--tools/testing/selftests/sync/sync.c221
-rw-r--r--tools/testing/selftests/sync/sync.h40
-rw-r--r--tools/testing/selftests/sync/sync_alloc.c74
-rw-r--r--tools/testing/selftests/sync/sync_fence.c132
-rw-r--r--tools/testing/selftests/sync/sync_merge.c60
-rw-r--r--tools/testing/selftests/sync/sync_stress_consumer.c185
-rw-r--r--tools/testing/selftests/sync/sync_stress_merge.c115
-rw-r--r--tools/testing/selftests/sync/sync_stress_parallelism.c111
-rw-r--r--tools/testing/selftests/sync/sync_test.c114
-rw-r--r--tools/testing/selftests/sync/sync_wait.c91
-rw-r--r--tools/testing/selftests/sync/synctest.h67
-rw-r--r--tools/testing/selftests/sysctl/Makefile13
-rw-r--r--tools/testing/selftests/sysctl/config1
-rwxr-xr-xtools/testing/selftests/sysctl/sysctl.sh971
-rw-r--r--tools/testing/selftests/tc-testing/.gitignore7
-rw-r--r--tools/testing/selftests/tc-testing/Makefile33
-rw-r--r--tools/testing/selftests/tc-testing/README257
-rw-r--r--tools/testing/selftests/tc-testing/TODO.txt31
-rw-r--r--tools/testing/selftests/tc-testing/TdcPlugin.py74
-rw-r--r--tools/testing/selftests/tc-testing/TdcResults.py132
-rw-r--r--tools/testing/selftests/tc-testing/action.c23
-rw-r--r--tools/testing/selftests/tc-testing/config68
-rw-r--r--tools/testing/selftests/tc-testing/creating-plugins/AddingPlugins.txt104
-rw-r--r--tools/testing/selftests/tc-testing/creating-testcases/AddingTestCases.txt105
-rw-r--r--tools/testing/selftests/tc-testing/creating-testcases/example.json55
-rw-r--r--tools/testing/selftests/tc-testing/creating-testcases/scapy-example.json98
-rw-r--r--tools/testing/selftests/tc-testing/creating-testcases/template.json51
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/README-PLUGINS27
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/buildebpfPlugin.py67
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py155
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/rootPlugin.py19
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py50
-rw-r--r--tools/testing/selftests/tc-testing/plugin-lib/valgrindPlugin.py160
-rw-r--r--tools/testing/selftests/tc-testing/plugins/__init__.py0
-rw-r--r--tools/testing/selftests/tc-testing/settings1
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json321
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json316
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/csum.json553
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/ct.json410
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/gact.json613
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/ife.json1089
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json581
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/mpls.json1233
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/nat.json618
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json1726
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/police.json768
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/sample.json637
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/simple.json155
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json721
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json421
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json937
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json835
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/basic.json1278
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/concurrency.json177
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/fw.json1355
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json391
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/tests.json129
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/filters/u32.json205
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/ets.json940
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json304
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json21
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json102
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/prio.json276
-rw-r--r--tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json185
-rwxr-xr-xtools/testing/selftests/tc-testing/tdc.py774
-rwxr-xr-xtools/testing/selftests/tc-testing/tdc.sh5
-rwxr-xr-xtools/testing/selftests/tc-testing/tdc_batch.py112
-rw-r--r--tools/testing/selftests/tc-testing/tdc_config.py42
-rw-r--r--tools/testing/selftests/tc-testing/tdc_config_local_template.py23
-rw-r--r--tools/testing/selftests/tc-testing/tdc_helper.py70
-rwxr-xr-xtools/testing/selftests/tc-testing/tdc_multibatch.py65
-rw-r--r--tools/testing/selftests/timens/.gitignore9
-rw-r--r--tools/testing/selftests/timens/Makefile7
-rw-r--r--tools/testing/selftests/timens/clock_nanosleep.c149
-rw-r--r--tools/testing/selftests/timens/config1
-rw-r--r--tools/testing/selftests/timens/exec.c93
-rw-r--r--tools/testing/selftests/timens/futex.c110
-rw-r--r--tools/testing/selftests/timens/gettime_perf.c95
-rw-r--r--tools/testing/selftests/timens/log.h26
-rw-r--r--tools/testing/selftests/timens/procfs.c143
-rw-r--r--tools/testing/selftests/timens/timens.c189
-rw-r--r--tools/testing/selftests/timens/timens.h111
-rw-r--r--tools/testing/selftests/timens/timer.c126
-rw-r--r--tools/testing/selftests/timens/timerfd.c133
-rw-r--r--tools/testing/selftests/timers/.gitignore22
-rw-r--r--tools/testing/selftests/timers/Makefile24
-rw-r--r--tools/testing/selftests/timers/adjtick.c211
-rw-r--r--tools/testing/selftests/timers/alarmtimer-suspend.c178
-rw-r--r--tools/testing/selftests/timers/change_skew.c96
-rw-r--r--tools/testing/selftests/timers/clocksource-switch.c168
-rw-r--r--tools/testing/selftests/timers/freq-step.c263
-rw-r--r--tools/testing/selftests/timers/inconsistency-check.c193
-rw-r--r--tools/testing/selftests/timers/leap-a-day.c378
-rw-r--r--tools/testing/selftests/timers/leapcrash.c108
-rw-r--r--tools/testing/selftests/timers/mqueue-lat.c114
-rw-r--r--tools/testing/selftests/timers/nanosleep.c165
-rw-r--r--tools/testing/selftests/timers/nsleep-lat.c180
-rw-r--r--tools/testing/selftests/timers/posix_timers.c221
-rw-r--r--tools/testing/selftests/timers/raw_skew.c148
-rw-r--r--tools/testing/selftests/timers/rtcpie.c142
-rw-r--r--tools/testing/selftests/timers/set-2038.c133
-rw-r--r--tools/testing/selftests/timers/set-tai.c69
-rw-r--r--tools/testing/selftests/timers/set-timer-lat.c283
-rw-r--r--tools/testing/selftests/timers/set-tz.c110
-rw-r--r--tools/testing/selftests/timers/settings1
-rw-r--r--tools/testing/selftests/timers/skew_consistency.c77
-rw-r--r--tools/testing/selftests/timers/threadtest.c193
-rw-r--r--tools/testing/selftests/timers/valid-adjtimex.c330
-rw-r--r--tools/testing/selftests/tmpfs/.gitignore2
-rw-r--r--tools/testing/selftests/tmpfs/Makefile8
-rw-r--r--tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c67
-rw-r--r--tools/testing/selftests/tpm2/Makefile5
-rwxr-xr-xtools/testing/selftests/tpm2/test_smoke.sh10
-rwxr-xr-xtools/testing/selftests/tpm2/test_space.sh9
-rw-r--r--tools/testing/selftests/tpm2/tpm2.py718
-rw-r--r--tools/testing/selftests/tpm2/tpm2_tests.py304
-rw-r--r--tools/testing/selftests/uevent/Makefile17
-rw-r--r--tools/testing/selftests/uevent/config2
-rw-r--r--tools/testing/selftests/uevent/uevent_filtering.c485
-rw-r--r--tools/testing/selftests/user/Makefile9
-rw-r--r--tools/testing/selftests/user/config1
-rwxr-xr-xtools/testing/selftests/user/test_user_copy.sh18
-rw-r--r--tools/testing/selftests/vDSO/.gitignore5
-rw-r--r--tools/testing/selftests/vDSO/Makefile27
-rw-r--r--tools/testing/selftests/vDSO/parse_vdso.c247
-rw-r--r--tools/testing/selftests/vDSO/parse_vdso.h31
-rw-r--r--tools/testing/selftests/vDSO/vdso_standalone_test_x86.c126
-rw-r--r--tools/testing/selftests/vDSO/vdso_test_getcpu.c54
-rw-r--r--tools/testing/selftests/vDSO/vdso_test_gettimeofday.c66
-rw-r--r--tools/testing/selftests/vm/.gitignore22
-rw-r--r--tools/testing/selftests/vm/Makefile136
-rw-r--r--tools/testing/selftests/vm/charge_reserved_hugetlb.sh581
-rw-r--r--tools/testing/selftests/vm/compaction_test.c231
-rw-r--r--tools/testing/selftests/vm/config6
-rw-r--r--tools/testing/selftests/vm/gup_benchmark.c143
-rw-r--r--tools/testing/selftests/vm/hmm-tests.c1522
-rw-r--r--tools/testing/selftests/vm/hugepage-mmap.c93
-rw-r--r--tools/testing/selftests/vm/hugepage-shm.c101
-rw-r--r--tools/testing/selftests/vm/hugetlb_reparenting_test.sh249
-rw-r--r--tools/testing/selftests/vm/khugepaged.c1035
-rw-r--r--tools/testing/selftests/vm/map_fixed_noreplace.c231
-rw-r--r--tools/testing/selftests/vm/map_hugetlb.c109
-rw-r--r--tools/testing/selftests/vm/map_populate.c113
-rw-r--r--tools/testing/selftests/vm/mlock-random-test.c294
-rw-r--r--tools/testing/selftests/vm/mlock2-tests.c520
-rw-r--r--tools/testing/selftests/vm/mlock2.h63
-rw-r--r--tools/testing/selftests/vm/mremap_dontunmap.c312
-rw-r--r--tools/testing/selftests/vm/on-fault-limit.c48
-rw-r--r--tools/testing/selftests/vm/pkey-helpers.h225
-rw-r--r--tools/testing/selftests/vm/pkey-powerpc.h133
-rw-r--r--tools/testing/selftests/vm/pkey-x86.h181
-rw-r--r--tools/testing/selftests/vm/protection_keys.c1588
-rwxr-xr-xtools/testing/selftests/vm/run_vmtests326
-rwxr-xr-xtools/testing/selftests/vm/test_hmm.sh97
-rwxr-xr-xtools/testing/selftests/vm/test_vmalloc.sh176
-rw-r--r--tools/testing/selftests/vm/thuge-gen.c257
-rw-r--r--tools/testing/selftests/vm/transhuge-stress.c144
-rw-r--r--tools/testing/selftests/vm/userfaultfd.c1559
-rw-r--r--tools/testing/selftests/vm/va_128TBswitch.c289
-rw-r--r--tools/testing/selftests/vm/virtual_address_range.c139
-rw-r--r--tools/testing/selftests/vm/write_hugetlb_memory.sh23
-rw-r--r--tools/testing/selftests/vm/write_to_hugetlbfs.c240
-rw-r--r--tools/testing/selftests/watchdog/.gitignore2
-rw-r--r--tools/testing/selftests/watchdog/Makefile4
-rw-r--r--tools/testing/selftests/watchdog/watchdog-test.c257
-rwxr-xr-xtools/testing/selftests/wireguard/netns.sh684
-rw-r--r--tools/testing/selftests/wireguard/qemu/.gitignore3
-rw-r--r--tools/testing/selftests/wireguard/qemu/Makefile377
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/aarch64.config5
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config6
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/arm.config9
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/armeb.config10
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/i686.config5
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/m68k.config9
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/mips.config11
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/mips64.config14
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/mips64el.config15
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/mipsel.config12
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/powerpc.config10
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config13
-rw-r--r--tools/testing/selftests/wireguard/qemu/arch/x86_64.config5
-rw-r--r--tools/testing/selftests/wireguard/qemu/debug.config64
-rw-r--r--tools/testing/selftests/wireguard/qemu/init.c284
-rw-r--r--tools/testing/selftests/wireguard/qemu/kernel.config89
-rw-r--r--tools/testing/selftests/x86/.gitignore15
-rw-r--r--tools/testing/selftests/x86/Makefile106
-rwxr-xr-xtools/testing/selftests/x86/check_cc.sh16
-rw-r--r--tools/testing/selftests/x86/check_initial_reg_state.c101
-rw-r--r--tools/testing/selftests/x86/entry_from_vm86.c348
-rw-r--r--tools/testing/selftests/x86/fsgsbase.c678
-rw-r--r--tools/testing/selftests/x86/fsgsbase_restore.c245
-rw-r--r--tools/testing/selftests/x86/helpers.h41
-rw-r--r--tools/testing/selftests/x86/ioperm.c185
-rw-r--r--tools/testing/selftests/x86/iopl.c281
-rw-r--r--tools/testing/selftests/x86/ldt_gdt.c927
-rw-r--r--tools/testing/selftests/x86/mov_ss_trap.c286
-rw-r--r--tools/testing/selftests/x86/ptrace_syscall.c430
-rw-r--r--tools/testing/selftests/x86/raw_syscall_helper_32.S47
-rw-r--r--tools/testing/selftests/x86/sigreturn.c876
-rw-r--r--tools/testing/selftests/x86/single_step_syscall.c242
-rw-r--r--tools/testing/selftests/x86/syscall_arg_fault.c237
-rw-r--r--tools/testing/selftests/x86/syscall_nt.c96
-rw-r--r--tools/testing/selftests/x86/syscall_numbering.c89
-rw-r--r--tools/testing/selftests/x86/sysret_rip.c187
-rw-r--r--tools/testing/selftests/x86/sysret_ss_attrs.c104
-rw-r--r--tools/testing/selftests/x86/test_FCMOV.c94
-rw-r--r--tools/testing/selftests/x86/test_FCOMI.c332
-rw-r--r--tools/testing/selftests/x86/test_FISTTP.c138
-rw-r--r--tools/testing/selftests/x86/test_mremap_vdso.c107
-rw-r--r--tools/testing/selftests/x86/test_syscall_vdso.c400
-rw-r--r--tools/testing/selftests/x86/test_vdso.c342
-rw-r--r--tools/testing/selftests/x86/test_vsyscall.c583
-rw-r--r--tools/testing/selftests/x86/thunks.S59
-rw-r--r--tools/testing/selftests/x86/thunks_32.S47
-rw-r--r--tools/testing/selftests/x86/trivial_32bit_program.c18
-rw-r--r--tools/testing/selftests/x86/trivial_64bit_program.c18
-rw-r--r--tools/testing/selftests/x86/trivial_program.c10
-rw-r--r--tools/testing/selftests/x86/unwind_vdso.c183
-rw-r--r--tools/testing/selftests/x86/vdso_restorer.c95
-rw-r--r--tools/testing/selftests/zram/Makefile9
-rw-r--r--tools/testing/selftests/zram/README40
-rw-r--r--tools/testing/selftests/zram/config2
-rwxr-xr-xtools/testing/selftests/zram/zram.sh18
-rwxr-xr-xtools/testing/selftests/zram/zram01.sh75
-rwxr-xr-xtools/testing/selftests/zram/zram02.sh44
-rwxr-xr-xtools/testing/selftests/zram/zram_lib.sh269
2122 files changed, 355819 insertions, 0 deletions
diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore
new file mode 100644
index 000000000..055a5019b
--- /dev/null
+++ b/tools/testing/selftests/.gitignore
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+gpiogpio-event-mon
+gpiogpio-hammer
+gpioinclude/
+gpiolsgpio
+tpm2/SpaceTest.log
+
+# Python bytecode and cache
+__pycache__/
+*.py[cod]
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
new file mode 100644
index 000000000..db1e24d71
--- /dev/null
+++ b/tools/testing/selftests/Makefile
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: GPL-2.0
+TARGETS = android
+TARGETS += arm64
+TARGETS += bpf
+TARGETS += breakpoints
+TARGETS += capabilities
+TARGETS += cgroup
+TARGETS += clone3
+TARGETS += core
+TARGETS += cpufreq
+TARGETS += cpu-hotplug
+TARGETS += drivers/dma-buf
+TARGETS += efivarfs
+TARGETS += exec
+TARGETS += filesystems
+TARGETS += filesystems/binderfs
+TARGETS += filesystems/epoll
+TARGETS += firmware
+TARGETS += fpu
+TARGETS += ftrace
+TARGETS += futex
+TARGETS += gpio
+TARGETS += intel_pstate
+TARGETS += ipc
+TARGETS += ir
+TARGETS += kcmp
+TARGETS += kexec
+TARGETS += kvm
+TARGETS += lib
+TARGETS += livepatch
+TARGETS += lkdtm
+TARGETS += membarrier
+TARGETS += memfd
+TARGETS += memory-hotplug
+TARGETS += mincore
+TARGETS += mount
+TARGETS += mqueue
+TARGETS += net
+TARGETS += net/forwarding
+TARGETS += net/mptcp
+TARGETS += netfilter
+TARGETS += nsfs
+TARGETS += pidfd
+TARGETS += pid_namespace
+TARGETS += powerpc
+TARGETS += proc
+TARGETS += pstore
+TARGETS += ptrace
+TARGETS += openat2
+TARGETS += rseq
+TARGETS += rtc
+TARGETS += seccomp
+TARGETS += sigaltstack
+TARGETS += size
+TARGETS += sparc64
+TARGETS += splice
+TARGETS += static_keys
+TARGETS += sync
+TARGETS += sysctl
+TARGETS += tc-testing
+TARGETS += timens
+ifneq (1, $(quicktest))
+TARGETS += timers
+endif
+TARGETS += tmpfs
+TARGETS += tpm2
+TARGETS += user
+TARGETS += vm
+TARGETS += x86
+TARGETS += zram
+#Please keep the TARGETS list alphabetically sorted
+# Run "make quicktest=1 run_tests" or
+# "make quicktest=1 kselftest" from top level Makefile
+
+TARGETS_HOTPLUG = cpu-hotplug
+TARGETS_HOTPLUG += memory-hotplug
+
+# User can optionally provide a TARGETS skiplist.
+SKIP_TARGETS ?=
+ifneq ($(SKIP_TARGETS),)
+ TMP := $(filter-out $(SKIP_TARGETS), $(TARGETS))
+ override TARGETS := $(TMP)
+endif
+
+# User can set FORCE_TARGETS to 1 to require all targets to be successfully
+# built; make will fail if any of the targets cannot be built. If
+# FORCE_TARGETS is not set (the default), make will succeed if at least one
+# of the targets gets built.
+FORCE_TARGETS ?=
+
+# Clear LDFLAGS and MAKEFLAGS when implicit rules are missing. This provides
+# implicit rules to sub-test Makefiles which avoids build failures in test
+# Makefiles that don't have explicit build rules.
+ifeq (,$(LINK.c))
+override LDFLAGS =
+override MAKEFLAGS =
+endif
+
+# Append kselftest to KBUILD_OUTPUT and O to avoid cluttering
+# KBUILD_OUTPUT with selftest objects and headers installed
+# by selftests Makefile or lib.mk.
+ifdef building_out_of_srctree
+override LDFLAGS =
+endif
+
+top_srcdir ?= ../../..
+
+ifeq ("$(origin O)", "command line")
+ KBUILD_OUTPUT := $(O)
+endif
+
+ifneq ($(KBUILD_OUTPUT),)
+ # Make's built-in functions such as $(abspath ...), $(realpath ...) cannot
+ # expand a shell special character '~'. We use a somewhat tedious way here.
+ abs_objtree := $(shell cd $(top_srcdir) && mkdir -p $(KBUILD_OUTPUT) && cd $(KBUILD_OUTPUT) && pwd)
+ $(if $(abs_objtree),, \
+ $(error failed to create output directory "$(KBUILD_OUTPUT)"))
+ # $(realpath ...) resolves symlinks
+ abs_objtree := $(realpath $(abs_objtree))
+ BUILD := $(abs_objtree)/kselftest
+else
+ BUILD := $(CURDIR)
+ DEFAULT_INSTALL_HDR_PATH := 1
+endif
+
+# Prepare for headers install
+include $(top_srcdir)/scripts/subarch.include
+ARCH ?= $(SUBARCH)
+export KSFT_KHDR_INSTALL_DONE := 1
+export BUILD
+
+# build and run gpio when output directory is the src dir.
+# gpio has dependency on tools/gpio and builds tools/gpio
+# objects in the src directory in all cases making the src
+# repo dirty even when objects are relocated.
+ifneq (1,$(DEFAULT_INSTALL_HDR_PATH))
+ TMP := $(filter-out gpio, $(TARGETS))
+ TARGETS := $(TMP)
+endif
+
+# set default goal to all, so make without a target runs all, even when
+# all isn't the first target in the file.
+.DEFAULT_GOAL := all
+
+# Install headers here once for all tests. KSFT_KHDR_INSTALL_DONE
+# is used to avoid running headers_install from lib.mk.
+# Invoke headers install with --no-builtin-rules to avoid circular
+# dependency in "make kselftest" case. In this case, second level
+# make inherits builtin-rules which will use the rule to generate
+# Makefile.o and runs into
+# "Circular Makefile.o <- prepare dependency dropped."
+# and headers_install fails and test compile fails.
+#
+# O= KBUILD_OUTPUT cases don't run into this error, since main Makefile
+# invokes them as sub-makes and --no-builtin-rules is not necessary,
+# but doesn't cause any failures. Keep it simple and use the same
+# flags in both cases.
+# Local build cases: "make kselftest", "make -C" - headers are installed
+# in the default INSTALL_HDR_PATH usr/include.
+khdr:
+ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
+ $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
+else
+ $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$BUILD/usr \
+ ARCH=$(ARCH) -C $(top_srcdir) headers_install
+endif
+
+all: khdr
+ @ret=1; \
+ for TARGET in $(TARGETS); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET \
+ $(if $(FORCE_TARGETS),|| exit); \
+ ret=$$((ret * $$?)); \
+ done; exit $$ret;
+
+run_tests: all
+ @for TARGET in $(TARGETS); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\
+ done;
+
+hotplug:
+ @for TARGET in $(TARGETS_HOTPLUG); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET;\
+ done;
+
+run_hotplug: hotplug
+ @for TARGET in $(TARGETS_HOTPLUG); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_full_test;\
+ done;
+
+clean_hotplug:
+ @for TARGET in $(TARGETS_HOTPLUG); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
+ done;
+
+run_pstore_crash:
+ $(MAKE) -C pstore run_crash
+
+# Use $BUILD as the default install root. $BUILD points to the
+# right output location for the following cases:
+# 1. output_dir=kernel_src
+# 2. a separate output directory is specified using O= KBUILD_OUTPUT
+# 3. a separate output directory is specified using KBUILD_OUTPUT
+# Avoid conflict with INSTALL_PATH set by the main Makefile
+#
+KSFT_INSTALL_PATH ?= $(BUILD)/kselftest_install
+KSFT_INSTALL_PATH := $(abspath $(KSFT_INSTALL_PATH))
+# Avoid changing the rest of the logic here and lib.mk.
+INSTALL_PATH := $(KSFT_INSTALL_PATH)
+ALL_SCRIPT := $(INSTALL_PATH)/run_kselftest.sh
+TEST_LIST := $(INSTALL_PATH)/kselftest-list.txt
+
+install: all
+ifdef INSTALL_PATH
+ @# Ask all targets to install their files
+ mkdir -p $(INSTALL_PATH)/kselftest
+ install -m 744 kselftest/module.sh $(INSTALL_PATH)/kselftest/
+ install -m 744 kselftest/runner.sh $(INSTALL_PATH)/kselftest/
+ install -m 744 kselftest/prefix.pl $(INSTALL_PATH)/kselftest/
+ install -m 744 run_kselftest.sh $(INSTALL_PATH)/
+ rm -f $(TEST_LIST)
+ @ret=1; \
+ for TARGET in $(TARGETS); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET INSTALL_PATH=$(INSTALL_PATH)/$$TARGET install \
+ $(if $(FORCE_TARGETS),|| exit); \
+ ret=$$((ret * $$?)); \
+ done; exit $$ret;
+
+
+ @# Ask all targets to emit their test scripts.
+ @# While building kselftest-list.txt also skip non-existent TARGET dirs:
+ @# they could be the result of a build failure and should NOT be
+ @# included in the generated runlist. printf keeps the newline portable.
+ for TARGET in $(TARGETS); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ [ ! -d $(INSTALL_PATH)/$$TARGET ] && echo "Skipping non-existent dir: $$TARGET" && continue; \
+ printf "Emit Tests for %s\n" "$$TARGET"; \
+ $(MAKE) -s --no-print-directory OUTPUT=$$BUILD_TARGET COLLECTION=$$TARGET \
+ -C $$TARGET emit_tests >> $(TEST_LIST); \
+ done;
+else
+ $(error Error: set INSTALL_PATH to use install)
+endif
+
+FORMAT ?= .gz
+TAR_PATH = $(abspath ${INSTALL_PATH}/kselftest-packages/kselftest.tar${FORMAT})
+gen_tar: install
+ @mkdir -p ${INSTALL_PATH}/kselftest-packages/
+ @tar caf ${TAR_PATH} --exclude=kselftest-packages -C ${INSTALL_PATH} .
+ @echo "Created ${TAR_PATH}"
+
+clean:
+ @for TARGET in $(TARGETS); do \
+ BUILD_TARGET=$$BUILD/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean;\
+ done;
+
+.PHONY: khdr all run_tests hotplug run_hotplug clean_hotplug run_pstore_crash install clean gen_tar
diff --git a/tools/testing/selftests/android/Makefile b/tools/testing/selftests/android/Makefile
new file mode 100644
index 000000000..9258306ca
--- /dev/null
+++ b/tools/testing/selftests/android/Makefile
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0-only
+SUBDIRS := ion
+
+TEST_PROGS := run.sh
+
+.PHONY: all clean
+
+include ../lib.mk
+
+all:
+ @for DIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$DIR; \
+ mkdir $$BUILD_TARGET -p; \
+ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+ #SUBDIR test prog name should be in the form: SUBDIR_test.sh \
+ TEST=$$DIR"_test.sh"; \
+ if [ -e $$DIR/$$TEST ]; then \
+ rsync -a $$DIR/$$TEST $$BUILD_TARGET/; \
+ fi \
+ done
+
+override define INSTALL_RULE
+ mkdir -p $(INSTALL_PATH)
+install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
+
+ @for SUBDIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \
+ done;
+endef
+
+override define CLEAN
+ @for DIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$DIR; \
+ mkdir $$BUILD_TARGET -p; \
+ make OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+ done
+endef
diff --git a/tools/testing/selftests/android/config b/tools/testing/selftests/android/config
new file mode 100644
index 000000000..b4ad748a9
--- /dev/null
+++ b/tools/testing/selftests/android/config
@@ -0,0 +1,5 @@
+CONFIG_ANDROID=y
+CONFIG_STAGING=y
+CONFIG_ION=y
+CONFIG_ION_SYSTEM_HEAP=y
+CONFIG_DRM_VGEM=y
diff --git a/tools/testing/selftests/android/ion/.gitignore b/tools/testing/selftests/android/ion/.gitignore
new file mode 100644
index 000000000..78eae9972
--- /dev/null
+++ b/tools/testing/selftests/android/ion/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ionapp_export
+ionapp_import
+ionmap_test
diff --git a/tools/testing/selftests/android/ion/Makefile b/tools/testing/selftests/android/ion/Makefile
new file mode 100644
index 000000000..42b71f005
--- /dev/null
+++ b/tools/testing/selftests/android/ion/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+INCLUDEDIR := -I. -I../../../../../drivers/staging/android/uapi/ -I../../../../../usr/include/
+CFLAGS := $(CFLAGS) $(INCLUDEDIR) -Wall -O2 -g
+
+TEST_GEN_FILES := ionapp_export ionapp_import ionmap_test
+
+all: $(TEST_GEN_FILES)
+
+$(TEST_GEN_FILES): ipcsocket.c ionutils.c
+
+TEST_PROGS := ion_test.sh
+
+KSFT_KHDR_INSTALL := 1
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(OUTPUT)/ionapp_export: ionapp_export.c ipcsocket.c ionutils.c
+$(OUTPUT)/ionapp_import: ionapp_import.c ipcsocket.c ionutils.c
+$(OUTPUT)/ionmap_test: ionmap_test.c ionutils.c ipcsocket.c
diff --git a/tools/testing/selftests/android/ion/README b/tools/testing/selftests/android/ion/README
new file mode 100644
index 000000000..21783e9c4
--- /dev/null
+++ b/tools/testing/selftests/android/ion/README
@@ -0,0 +1,101 @@
+ION BUFFER SHARING UTILITY
+==========================
+File: ion_test.sh : Utility to test ION driver buffer sharing mechanism.
+Author: Pintu Kumar <pintu.ping@gmail.com>
+
+Introduction:
+-------------
+This is a test utility to verify ION buffer sharing in user space
+between 2 independent processes.
+It uses unix domain socket (with SCM_RIGHTS) as IPC to transfer an FD to
+another process to share the same buffer.
+This utility demonstrates how ION buffer sharing can be implemented between
+two user space processes, using various heap types.
+The following heap types are supported by the ION driver.
+ION_HEAP_TYPE_SYSTEM (0)
+ION_HEAP_TYPE_SYSTEM_CONTIG (1)
+ION_HEAP_TYPE_CARVEOUT (2)
+ION_HEAP_TYPE_CHUNK (3)
+ION_HEAP_TYPE_DMA (4)
+
+By default only the SYSTEM and SYSTEM_CONTIG heaps are supported.
+Each heap is associated with the respective heap id.
+This utility is designed in the form of client/server program.
+The server part (ionapp_export) is the exporter of the buffer.
+It is responsible for creating an ION client, allocating the buffer based on
+the heap id, writing some data to this buffer and then exporting the FD
+(associated with this buffer) to another process using socket IPC.
+This FD is called the buffer FD (which is different from the ION client FD).
+
+The client part (ionapp_import) is the importer of the buffer.
+It retrieves the FD from the socket data and installs it into its address space.
+This new FD internally points to the same kernel buffer.
+So first it reads the data that is stored in this buffer and prints it.
+Then it writes data of a different size (it could be different data) to the
+same buffer.
+Finally the buffer FD must be closed by both the exporter and importer.
+Thus the same kernel buffer is shared among two user space processes using
+ION driver and only one time allocation.
+
+Prerequisite:
+-------------
+This utility works only if /dev/ion interface is present.
+The following configs need to be enabled in the kernel to include the ion driver.
+CONFIG_ANDROID=y
+CONFIG_STAGING=y
+CONFIG_ION=y
+CONFIG_ION_SYSTEM_HEAP=y
+
+This utility must be run as the root user.
+
+
+Compile and test:
+-----------------
+This utility is made to be run as part of kselftest framework in kernel.
+To compile and run using kselftest you can simply do the following from the
+kernel top directory.
+linux$ make TARGETS=android kselftest
+Or you can also use:
+linux$ make -C tools/testing/selftests TARGETS=android run_tests
+The selftest directly executes the ion_test.sh script to test
+buffer sharing using the ion system heap.
+Currently the heap size is hard coded as 4096 bytes inside this script.
+You need to be a root user to run under selftest.
+
+You can also compile and test manually using the following steps:
+ion$ make
+This will generate the executables ionapp_export, ionapp_import and ionmap_test.
+Now you can run the export and import manually by specifying the heap type
+and the heap size.
+You can also directly execute the shell script to run the test automatically.
+Simply use the following command to run the test.
+ion$ sudo ./ion_test.sh
+
+Test Results:
+-------------
+The utility is verified on Ubuntu-32 bit system with Linux Kernel 4.14.
+Here is the snapshot of the test result using kselftest.
+
+linux# make TARGETS=android kselftest
+heap_type: 0, heap_size: 10
+--------------------------------------
+heap type: 0
+ heap id: 1
+heap name: ion_system_heap
+--------------------------------------
+Fill buffer content:
+0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd
+Sharing fd: 6, Client fd: 5
+<ion_close_buffer_fd>: buffer release successfully....
+Received buffer fd: 4
+Read buffer content:
+0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0x0 0x0 0x0 0x0 0x0 0x0
+0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0 0x0
+Fill buffer content:
+0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd
+0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd 0xfd
+0xfd 0xfd
+<ion_close_buffer_fd>: buffer release successfully....
+ion_test.sh: heap_type: 0 - [PASS]
+
+ion_test.sh: done
diff --git a/tools/testing/selftests/android/ion/ion.h b/tools/testing/selftests/android/ion/ion.h
new file mode 100644
index 000000000..33db23018
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ion.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * ion.h
+ *
+ * Copyright (C) 2011 Google, Inc.
+ */
+
+/* This file is copied from drivers/staging/android/uapi/ion.h
+ * This local copy is required for the selftest to pass, when built
+ * outside the kernel source tree.
+ * Please keep this file in sync with its original file until the
+ * ion driver is moved outside the staging tree.
+ */
+
+#ifndef _UAPI_LINUX_ION_H
+#define _UAPI_LINUX_ION_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * enum ion_heap_type - list of all possible types of heaps
+ * @ION_HEAP_TYPE_SYSTEM: memory allocated via vmalloc
+ * @ION_HEAP_TYPE_SYSTEM_CONTIG: memory allocated via kmalloc
+ * @ION_HEAP_TYPE_CARVEOUT: memory allocated from a prereserved
+ * carveout heap, allocations are physically
+ * contiguous
+ * @ION_HEAP_TYPE_DMA: memory allocated via DMA API
+ * @ION_NUM_HEAPS: helper for iterating over heaps, a bit mask
+ * is used to identify the heaps, so only 32
+ * total heap types are supported
+ */
+enum ion_heap_type {
+ ION_HEAP_TYPE_SYSTEM,
+ ION_HEAP_TYPE_SYSTEM_CONTIG,
+ ION_HEAP_TYPE_CARVEOUT,
+ ION_HEAP_TYPE_CHUNK,
+ ION_HEAP_TYPE_DMA,
+ ION_HEAP_TYPE_CUSTOM, /*
+ * must be last so device specific heaps always
+ * are at the end of this enum
+ */
+};
+
+#define ION_NUM_HEAP_IDS (sizeof(unsigned int) * 8)
+
+/**
+ * allocation flags - the lower 16 bits are used by core ion, the upper 16
+ * bits are reserved for use by the heaps themselves.
+ */
+
+/*
+ * mappings of this buffer should be cached, ion will do cache maintenance
+ * when the buffer is mapped for dma
+ */
+#define ION_FLAG_CACHED 1
+
+/**
+ * DOC: Ion Userspace API
+ *
+ * create a client by opening /dev/ion
+ * most operations handled via following ioctls
+ *
+ */
+
+/**
+ * struct ion_allocation_data - metadata passed from userspace for allocations
+ * @len: size of the allocation
+ * @heap_id_mask: mask of heap ids to allocate from
+ * @flags: flags passed to heap
+ * @handle: pointer that will be populated with a cookie to use to
+ * refer to this allocation
+ *
+ * Provided by userspace as an argument to the ioctl
+ */
+struct ion_allocation_data {
+ __u64 len;
+ __u32 heap_id_mask;
+ __u32 flags;
+ __u32 fd;
+ __u32 unused;
+};
+
+#define MAX_HEAP_NAME 32
+
+/**
+ * struct ion_heap_data - data about a heap
+ * @name - first 32 characters of the heap name
+ * @type - heap type
+ * @heap_id - heap id for the heap
+ */
+struct ion_heap_data {
+ char name[MAX_HEAP_NAME];
+ __u32 type;
+ __u32 heap_id;
+ __u32 reserved0;
+ __u32 reserved1;
+ __u32 reserved2;
+};
+
+/**
+ * struct ion_heap_query - collection of data about all heaps
+ * @cnt - total number of heaps to be copied
+ * @heaps - buffer to copy heap data
+ */
+struct ion_heap_query {
+ __u32 cnt; /* Total number of heaps to be copied */
+ __u32 reserved0; /* align to 64bits */
+ __u64 heaps; /* buffer to be populated */
+ __u32 reserved1;
+ __u32 reserved2;
+};
+
+#define ION_IOC_MAGIC 'I'
+
+/**
+ * DOC: ION_IOC_ALLOC - allocate memory
+ *
+ * Takes an ion_allocation_data struct and returns it with the handle field
+ * populated with the opaque handle for the allocation.
+ */
+#define ION_IOC_ALLOC _IOWR(ION_IOC_MAGIC, 0, \
+ struct ion_allocation_data)
+
+/**
+ * DOC: ION_IOC_HEAP_QUERY - information about available heaps
+ *
+ * Takes an ion_heap_query structure and populates information about
+ * available Ion heaps.
+ */
+#define ION_IOC_HEAP_QUERY _IOWR(ION_IOC_MAGIC, 8, \
+ struct ion_heap_query)
+
+#endif /* _UAPI_LINUX_ION_H */
diff --git a/tools/testing/selftests/android/ion/ion_test.sh b/tools/testing/selftests/android/ion/ion_test.sh
new file mode 100755
index 000000000..69e676cfc
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ion_test.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+heapsize=4096
+TCID="ion_test.sh"
+errcode=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+run_test()
+{
+ heaptype=$1
+ ./ionapp_export -i $heaptype -s $heapsize &
+ sleep 1
+ ./ionapp_import
+ if [ $? -ne 0 ]; then
+ echo "$TCID: heap_type: $heaptype - [FAIL]"
+ errcode=1
+ else
+ echo "$TCID: heap_type: $heaptype - [PASS]"
+ fi
+ sleep 1
+ echo ""
+}
+
+check_root()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo $TCID: must be run as root >&2
+ exit $ksft_skip
+ fi
+}
+
+check_device()
+{
+ DEVICE=/dev/ion
+ if [ ! -e $DEVICE ]; then
+ echo $TCID: No $DEVICE device found >&2
+ echo $TCID: May be CONFIG_ION is not set >&2
+ exit $ksft_skip
+ fi
+}
+
+main_function()
+{
+ check_device
+ check_root
+
+ # ION_SYSTEM_HEAP TEST
+ run_test 0
+ # ION_SYSTEM_CONTIG_HEAP TEST
+ run_test 1
+}
+
+main_function
+echo "$TCID: done"
+exit $errcode
diff --git a/tools/testing/selftests/android/ion/ionapp_export.c b/tools/testing/selftests/android/ion/ionapp_export.c
new file mode 100644
index 000000000..063b7830d
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ionapp_export.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ionapp_export.c
+ *
+ * It is a user space utility to create and export android
+ * ion memory buffer fd to another process using unix domain socket as IPC.
+ * This acts like a server for ionapp_import(client).
+ * So, this server has to be started first before the client.
+ *
+ * Copyright (C) 2017 Pintu Kumar <pintu.ping@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include "ionutils.h"
+#include "ipcsocket.h"
+
+
+/* Print the command-line synopsis to stdout. argc is not used. */
+void print_usage(int argc, char *argv[])
+{
+	const char *prog = argv[0];
+
+	(void)argc;
+	printf("Usage: %s [-h <help>] [-i <heap id>] [-s <size in bytes>]\n",
+	       prog);
+}
+
+/*
+ * Exporter entry point.
+ *
+ * Parses -i <heap id> and -s <size>, opens the server side of the IPC
+ * socket (which waits for the importer to connect), allocates and fills an
+ * ION buffer, then passes the buffer fd to the importer.
+ *
+ * Returns 0 on success and a non-zero status on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	int opt, ret, status, heapid, size_arg;
+	int sockfd = -1, client_fd, shared_fd;
+	unsigned char *map_buf;
+	unsigned long map_len, heap_type, heap_size, flags;
+	struct ion_buffer_info info;
+	struct socket_info skinfo;
+
+	if (argc < 2) {
+		print_usage(argc, argv);
+		return -1;
+	}
+
+	heap_size = 0;
+	flags = 0;
+	heap_type = ION_HEAP_TYPE_SYSTEM;
+
+	while ((opt = getopt(argc, argv, "hi:s:")) != -1) {
+		switch (opt) {
+		case 'h':
+			print_usage(argc, argv);
+			exit(0);
+			break;
+		case 'i':
+			heapid = atoi(optarg);
+			switch (heapid) {
+			case 0:
+				heap_type = ION_HEAP_TYPE_SYSTEM;
+				break;
+			case 1:
+				heap_type = ION_HEAP_TYPE_SYSTEM_CONTIG;
+				break;
+			default:
+				printf("ERROR: heap type not supported\n");
+				exit(1);
+			}
+			break;
+		case 's':
+			/* A negative value must not wrap into a huge size */
+			size_arg = atoi(optarg);
+			heap_size = size_arg > 0 ? (unsigned long)size_arg : 0;
+			break;
+		default:
+			print_usage(argc, argv);
+			exit(1);
+			break;
+		}
+	}
+
+	if (heap_size == 0) {
+		printf("heap_size cannot be 0\n");
+		print_usage(argc, argv);
+		exit(1);
+	}
+
+	printf("heap_type: %lu, heap_size: %lu\n", heap_type, heap_size);
+	/*
+	 * Start from zeroed structures so that the error paths never hand
+	 * uninitialised fds or pointers to ion_close_buffer_fd().
+	 */
+	memset(&info, 0, sizeof(info));
+	memset(&skinfo, 0, sizeof(skinfo));
+	info.heap_type = heap_type;
+	info.heap_size = heap_size;
+	info.flag_type = flags;
+
+	/* This is server: open the socket connection first */
+	/* Here; 1 indicates server or exporter */
+	status = opensocket(&sockfd, SOCKET_NAME, 1);
+	if (status < 0) {
+		fprintf(stderr, "<%s>: Failed opensocket.\n", __func__);
+		/* opensocket() already closed whatever it had opened */
+		sockfd = -1;
+		ret = status;
+		goto err_socket;
+	}
+	skinfo.sockfd = sockfd;
+
+	ret = ion_export_buffer_fd(&info);
+	if (ret < 0) {
+		fprintf(stderr, "FAILED: ion_get_buffer_fd\n");
+		goto err_export;
+	}
+	client_fd = info.ionfd;
+	shared_fd = info.buffd;
+	map_buf = info.buffer;
+	map_len = info.buflen;
+	write_buffer(map_buf, map_len);
+
+	/* share ion buf fd with other user process */
+	printf("Sharing fd: %d, Client fd: %d\n", shared_fd, client_fd);
+	skinfo.datafd = shared_fd;
+	skinfo.buflen = map_len;
+
+	ret = socket_send_fd(&skinfo);
+	if (ret < 0)
+		fprintf(stderr, "FAILED: socket_send_fd\n");
+
+err_export:
+	ion_close_buffer_fd(&info);
+
+err_socket:
+	closesocket(sockfd, SOCKET_NAME);
+
+	/* Report failures to the caller instead of always claiming success */
+	return ret < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/android/ion/ionapp_import.c b/tools/testing/selftests/android/ion/ionapp_import.c
new file mode 100644
index 000000000..54b580cb0
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ionapp_import.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ionapp_import.c
+ *
+ * It is a user space utility to receive android ion memory buffer fd
+ * over unix domain socket IPC that can be exported by ionapp_export.
+ * This acts like a client for ionapp_export.
+ *
+ * Copyright (C) 2017 Pintu Kumar <pintu.ping@gmail.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include "ionutils.h"
+#include "ipcsocket.h"
+
+
+/*
+ * Importer entry point.
+ *
+ * Connects to the exporter's socket, receives the ION buffer fd, maps it,
+ * dumps its content and rewrites it.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int main(void)
+{
+	int ret, status;
+	int sockfd = -1, shared_fd;
+	unsigned char *map_buf;
+	unsigned long map_len;
+	struct ion_buffer_info info;
+	struct socket_info skinfo;
+
+	memset(&skinfo, 0, sizeof(skinfo));
+
+	/* This is the client part. Here 0 means client or importer */
+	status = opensocket(&sockfd, SOCKET_NAME, 0);
+	if (status < 0) {
+		fprintf(stderr, "No exporter exists...\n");
+		ret = status;
+		/* opensocket() already closed whatever it had opened */
+		sockfd = -1;
+		goto err_socket;
+	}
+
+	skinfo.sockfd = sockfd;
+
+	ret = socket_receive_fd(&skinfo);
+	if (ret < 0) {
+		fprintf(stderr, "Failed: socket_receive_fd\n");
+		goto err_recv;
+	}
+
+	shared_fd = skinfo.datafd;
+	printf("Received buffer fd: %d\n", shared_fd);
+	if (shared_fd <= 0) {
+		fprintf(stderr, "ERROR: improper buf fd\n");
+		ret = -1;
+		goto err_fd;
+	}
+
+	memset(&info, 0, sizeof(info));
+	info.buffd = shared_fd;
+	info.buflen = ION_BUFFER_LEN;
+
+	ret = ion_import_buffer_fd(&info);
+	if (ret < 0) {
+		fprintf(stderr, "Failed: ion_use_buffer_fd\n");
+		/*
+		 * Nothing was mapped and ion_import_buffer_fd() closes the
+		 * fd itself when mmap fails, so skip ion_close_buffer_fd()
+		 * to avoid closing it a second time.
+		 */
+		goto err_import;
+	}
+
+	map_buf = info.buffer;
+	map_len = info.buflen;
+	read_buffer(map_buf, map_len);
+
+	/* Write probably new data to the same buffer again */
+	map_len = ION_BUFFER_LEN;
+	write_buffer(map_buf, map_len);
+
+	ion_close_buffer_fd(&info);
+err_import:
+err_fd:
+err_recv:
+err_socket:
+	closesocket(sockfd, SOCKET_NAME);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/android/ion/ionmap_test.c b/tools/testing/selftests/android/ion/ionmap_test.c
new file mode 100644
index 000000000..dab36b06b
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ionmap_test.c
@@ -0,0 +1,136 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/dma-buf.h>
+
+#include <drm/drm.h>
+
+#include "ion.h"
+#include "ionutils.h"
+
+/*
+ * Check whether the DRM device behind fd is the vgem driver.
+ *
+ * Returns 0 when the driver name is "vgem" and non-zero otherwise
+ * (including when the version ioctl fails).
+ */
+int check_vgem(int fd)
+{
+	drm_version_t version = { 0 };
+	/*
+	 * Only name_len (4) bytes are requested from the ioctl, so byte 4
+	 * must already be NUL for the strcmp() below to be well defined.
+	 */
+	char name[5] = { 0 };
+	int ret;
+
+	version.name_len = 4;
+	version.name = name;
+
+	ret = ioctl(fd, DRM_IOCTL_VERSION, &version);
+	if (ret)
+		return 1;
+
+	return strcmp(name, "vgem");
+}
+
+/*
+ * Probe /dev/dri/card0 .. /dev/dri/card15 for the vgem device.
+ *
+ * Returns an open fd for the first card that check_vgem() accepts, or -1
+ * when none matches. Cards that open but are not vgem are closed again and
+ * never returned to the caller.
+ */
+int open_vgem(void)
+{
+	int i, fd;
+	const char *drmstr = "/dev/dri/card";
+
+	for (i = 0; i < 16; i++) {
+		char name[80];
+
+		snprintf(name, sizeof(name), "%s%d", drmstr, i);
+
+		fd = open(name, O_RDWR);
+		if (fd < 0)
+			continue;
+
+		if (!check_vgem(fd))
+			return fd;
+
+		close(fd);
+	}
+
+	return -1;
+}
+
+/*
+ * Import dma_buf_fd into the vgem device as a GEM handle.
+ *
+ * On success stores the handle in *handle and returns 0; otherwise returns
+ * the ioctl's result and leaves *handle untouched.
+ */
+int import_vgem_fd(int vgem_fd, int dma_buf_fd, uint32_t *handle)
+{
+	struct drm_prime_handle args = {
+		.fd = dma_buf_fd,
+		.flags = 0,
+		.handle = 0,
+	};
+	int rc = ioctl(vgem_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &args);
+
+	if (rc != 0)
+		return rc;
+
+	*handle = args.handle;
+	return rc;
+}
+
+/* Release a GEM handle on the vgem device; the ioctl result is ignored. */
+void close_handle(int vgem_fd, uint32_t handle)
+{
+	struct drm_gem_close args = { .handle = handle };
+
+	ioctl(vgem_fd, DRM_IOCTL_GEM_CLOSE, &args);
+}
+
+/*
+ * Allocate a cached ION buffer, import it into vgem and write to it from
+ * the CPU inside a DMA_BUF_IOCTL_SYNC start/end bracket.
+ *
+ * Returns 0 on success and non-zero when any step, including either sync
+ * ioctl, fails.
+ */
+int main(void)
+{
+	int ret, vgem_fd;
+	struct ion_buffer_info info = { 0 };
+	uint32_t handle = 0;
+	struct dma_buf_sync sync = { 0 };
+
+	info.heap_type = ION_HEAP_TYPE_SYSTEM;
+	info.heap_size = 4096;
+	info.flag_type = ION_FLAG_CACHED;
+
+	ret = ion_export_buffer_fd(&info);
+	if (ret < 0) {
+		printf("ion buffer alloc failed\n");
+		return -1;
+	}
+
+	vgem_fd = open_vgem();
+	if (vgem_fd < 0) {
+		ret = vgem_fd;
+		printf("Failed to open vgem\n");
+		goto out_ion;
+	}
+
+	ret = import_vgem_fd(vgem_fd, info.buffd, &handle);
+
+	if (ret < 0) {
+		printf("Failed to import buffer\n");
+		goto out_vgem;
+	}
+
+	sync.flags = DMA_BUF_SYNC_START | DMA_BUF_SYNC_RW;
+	ret = ioctl(info.buffd, DMA_BUF_IOCTL_SYNC, &sync);
+	if (ret) {
+		printf("sync start failed %d\n", errno);
+		/* Keep ret: a failed sync must fail the test */
+		goto out_handle;
+	}
+
+	memset(info.buffer, 0xff, 4096);
+
+	sync.flags = DMA_BUF_SYNC_END | DMA_BUF_SYNC_RW;
+	ret = ioctl(info.buffd, DMA_BUF_IOCTL_SYNC, &sync);
+	if (ret)
+		printf("sync end failed %d\n", errno);
+
+out_handle:
+	close_handle(vgem_fd, handle);
+
+out_vgem:
+	close(vgem_fd);
+out_ion:
+	ion_close_buffer_fd(&info);
+	printf("done.\n");
+	return ret;
+}
diff --git a/tools/testing/selftests/android/ion/ionutils.c b/tools/testing/selftests/android/ion/ionutils.c
new file mode 100644
index 000000000..7d1d37c4e
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ionutils.c
@@ -0,0 +1,253 @@
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+//#include <stdint.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include "ionutils.h"
+#include "ipcsocket.h"
+
+
+/*
+ * Fill the first len bytes of buffer with 0xfd and print every byte to
+ * stdout. A NULL buffer is reported on stderr and otherwise ignored.
+ */
+void write_buffer(void *buffer, unsigned long len)
+{
+	unsigned long i;	/* same type as len: no signed/unsigned mix */
+	unsigned char *ptr = (unsigned char *)buffer;
+
+	if (!ptr) {
+		fprintf(stderr, "<%s>: Invalid buffer...\n", __func__);
+		return;
+	}
+
+	printf("Fill buffer content:\n");
+	memset(ptr, 0xfd, len);
+	for (i = 0; i < len; i++)
+		printf("0x%x ", ptr[i]);
+	printf("\n");
+}
+
+/*
+ * Print the first len bytes of buffer to stdout. A NULL buffer is reported
+ * on stderr and otherwise ignored.
+ */
+void read_buffer(void *buffer, unsigned long len)
+{
+	unsigned long i;	/* same type as len: no signed/unsigned mix */
+	unsigned char *ptr = (unsigned char *)buffer;
+
+	if (!ptr) {
+		fprintf(stderr, "<%s>: Invalid buffer...\n", __func__);
+		return;
+	}
+
+	printf("Read buffer content:\n");
+	for (i = 0; i < len; i++)
+		printf("0x%x ", ptr[i]);
+	printf("\n");
+}
+
+/*
+ * Allocate an ION buffer of ion_info->heap_size bytes from the heap whose
+ * type equals ion_info->heap_type, and map it into this process.
+ *
+ * On success fills in ionfd, buffd, buffer and buflen of *ion_info and
+ * returns 0. On failure every fd opened here is closed again and -1 is
+ * returned; *ion_info is left untouched.
+ */
+int ion_export_buffer_fd(struct ion_buffer_info *ion_info)
+{
+	int ret, ionfd, buffer_fd = -1;
+	unsigned int i, heap_id;
+	unsigned long maplen;
+	unsigned char *map_buffer;
+	struct ion_allocation_data alloc_data;
+	struct ion_heap_query query;
+	struct ion_heap_data heap_data[MAX_HEAP_COUNT];
+
+	if (!ion_info) {
+		fprintf(stderr, "<%s>: Invalid ion info\n", __func__);
+		return -1;
+	}
+
+	/* Create an ION client */
+	ionfd = open(ION_DEVICE, O_RDWR);
+	if (ionfd < 0) {
+		fprintf(stderr, "<%s>: Failed to open ion client: %s\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	memset(&query, 0, sizeof(query));
+	memset(heap_data, 0, sizeof(heap_data));
+	query.cnt = MAX_HEAP_COUNT;
+	query.heaps = (unsigned long int)&heap_data[0];
+	/* Query ION heap_id_mask from ION heap */
+	ret = ioctl(ionfd, ION_IOC_HEAP_QUERY, &query);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed: ION_IOC_HEAP_QUERY: %s\n",
+			__func__, strerror(errno));
+		goto err_close_ion;
+	}
+
+	/* MAX_HEAP_COUNT + 1 doubles as the "not found" marker */
+	heap_id = MAX_HEAP_COUNT + 1;
+	for (i = 0; i < query.cnt && i < MAX_HEAP_COUNT; i++) {
+		if (heap_data[i].type == ion_info->heap_type) {
+			heap_id = heap_data[i].heap_id;
+			break;
+		}
+	}
+
+	if (heap_id > MAX_HEAP_COUNT) {
+		fprintf(stderr, "<%s>: ERROR: heap type does not exists\n",
+			__func__);
+		goto err_close_ion;
+	}
+
+	/* Do not pass stack garbage in the fields we do not set */
+	memset(&alloc_data, 0, sizeof(alloc_data));
+	alloc_data.len = ion_info->heap_size;
+	alloc_data.heap_id_mask = 1U << heap_id;
+	alloc_data.flags = ion_info->flag_type;
+
+	/* Allocate memory for this ION client as per heap_type */
+	ret = ioctl(ionfd, ION_IOC_ALLOC, &alloc_data);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed: ION_IOC_ALLOC: %s\n",
+			__func__, strerror(errno));
+		goto err_close_ion;
+	}
+
+	/* This will return a valid buffer fd */
+	buffer_fd = alloc_data.fd;
+	maplen = alloc_data.len;
+
+	if (buffer_fd < 0 || maplen == 0) {
+		fprintf(stderr, "<%s>: Invalid map data, fd: %d, len: %ld\n",
+			__func__, buffer_fd, maplen);
+		goto err_close_buf;
+	}
+
+	/* Create memory mapped buffer for the buffer fd */
+	map_buffer = (unsigned char *)mmap(NULL, maplen, PROT_READ|PROT_WRITE,
+			MAP_SHARED, buffer_fd, 0);
+	if (map_buffer == MAP_FAILED) {
+		fprintf(stderr, "<%s>: Failed: mmap: %s\n",
+			__func__, strerror(errno));
+		goto err_close_buf;
+	}
+
+	ion_info->ionfd = ionfd;
+	ion_info->buffd = buffer_fd;
+	ion_info->buffer = map_buffer;
+	ion_info->buflen = maplen;
+
+	return 0;
+
+err_close_buf:
+	/* in case of error: close the buffer fd (0 is a valid fd too) */
+	if (buffer_fd >= 0)
+		close(buffer_fd);
+
+err_close_ion:
+	/* In case of error: close the ion client fd */
+	close(ionfd);
+
+	return -1;
+}
+
+/*
+ * Map an already exported buffer fd (ion_info->buffd) for
+ * ion_info->buflen bytes.
+ *
+ * On success stores the mapping in ion_info->buffer and returns 0.
+ * Returns -1 on failure. When mmap fails the fd is closed here and
+ * ion_info->buffd is reset to -1 so that a later ion_close_buffer_fd()
+ * does not close it a second time.
+ */
+int ion_import_buffer_fd(struct ion_buffer_info *ion_info)
+{
+	int buffd;
+	unsigned char *map_buf;
+	unsigned long map_len;
+
+	if (!ion_info) {
+		fprintf(stderr, "<%s>: Invalid ion info\n", __func__);
+		return -1;
+	}
+
+	map_len = ion_info->buflen;
+	buffd = ion_info->buffd;
+
+	if (buffd < 0 || map_len == 0) {
+		fprintf(stderr, "<%s>: Invalid map data, fd: %d, len: %ld\n",
+			__func__, buffd, map_len);
+		return -1;
+	}
+
+	map_buf = (unsigned char *)mmap(NULL, map_len, PROT_READ|PROT_WRITE,
+			MAP_SHARED, buffd, 0);
+	if (map_buf == MAP_FAILED) {
+		/* Errors go to stderr like everywhere else in this file */
+		fprintf(stderr, "<%s>: Failed - mmap: %s\n",
+			__func__, strerror(errno));
+		close(buffd);
+		ion_info->buffd = -1;
+		return -1;
+	}
+
+	ion_info->buffer = map_buf;
+	ion_info->buflen = map_len;
+
+	return 0;
+}
+
+/*
+ * Tear down what ion_export_buffer_fd()/ion_import_buffer_fd() set up:
+ * unmap the buffer, then close the buffer fd and the client fd when they
+ * are greater than zero. A NULL ion_info is a no-op.
+ */
+void ion_close_buffer_fd(struct ion_buffer_info *ion_info)
+{
+	if (!ion_info)
+		return;
+
+	munmap(ion_info->buffer, ion_info->buflen);
+
+	if (ion_info->buffd > 0)
+		close(ion_info->buffd);
+
+	if (ion_info->ionfd > 0)
+		close(ion_info->ionfd);
+}
+
+/*
+ * Send info->datafd to the peer connected on info->sockfd.
+ * Returns 0 on success, -1 on a NULL argument or when sendtosocket() fails.
+ */
+int socket_send_fd(struct socket_info *info)
+{
+	struct socketdata payload;
+
+	if (info == NULL) {
+		fprintf(stderr, "<%s>: Invalid socket info\n", __func__);
+		return -1;
+	}
+
+	memset(&payload, 0, sizeof(payload));
+	payload.data = info->datafd;
+	payload.len = sizeof(payload.data);
+
+	if (sendtosocket(info->sockfd, &payload) < 0) {
+		fprintf(stderr, "<%s>: Failed: sendtosocket\n", __func__);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Receive a file descriptor on info->sockfd and store it in info->datafd.
+ * Returns receivefromsocket()'s result on success, -1 on a NULL argument
+ * or when receivefromsocket() fails.
+ */
+int socket_receive_fd(struct socket_info *info)
+{
+	struct socketdata payload;
+	int rc;
+
+	if (info == NULL) {
+		fprintf(stderr, "<%s>: Invalid socket info\n", __func__);
+		return -1;
+	}
+
+	memset(&payload, 0, sizeof(payload));
+
+	rc = receivefromsocket(info->sockfd, &payload);
+	if (rc < 0) {
+		fprintf(stderr, "<%s>: Failed: receivefromsocket\n", __func__);
+		return -1;
+	}
+
+	info->datafd = (int)payload.data;
+
+	return rc;
+}
diff --git a/tools/testing/selftests/android/ion/ionutils.h b/tools/testing/selftests/android/ion/ionutils.h
new file mode 100644
index 000000000..9941eb858
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ionutils.h
@@ -0,0 +1,55 @@
+#ifndef __ION_UTILS_H
+#define __ION_UTILS_H
+
+#include "ion.h"
+
+#define SOCKET_NAME "ion_socket"
+#define ION_DEVICE "/dev/ion"
+
+#define ION_BUFFER_LEN 4096
+#define MAX_HEAP_COUNT ION_HEAP_TYPE_CUSTOM
+
+struct socket_info {
+ int sockfd;
+ int datafd;
+ unsigned long buflen;
+};
+
+struct ion_buffer_info {
+ int ionfd;
+ int buffd;
+ unsigned int heap_type;
+ unsigned int flag_type;
+ unsigned long heap_size;
+ unsigned long buflen;
+ unsigned char *buffer;
+};
+
+
+/* This is used to fill the data into the mapped buffer */
+void write_buffer(void *buffer, unsigned long len);
+
+/* This is used to read the data from the exported buffer */
+void read_buffer(void *buffer, unsigned long len);
+
+/* This is used to create an ION buffer FD for the kernel buffer
+ * So you can export this same buffer to others in the form of FD
+ */
+int ion_export_buffer_fd(struct ion_buffer_info *ion_info);
+
+/* This is used to import or map an exported FD.
+ * So we point to same buffer without making a copy. Hence zero-copy.
+ */
+int ion_import_buffer_fd(struct ion_buffer_info *ion_info);
+
+/* This is used to close all references for the ION client */
+void ion_close_buffer_fd(struct ion_buffer_info *ion_info);
+
+/* This is used to send FD to another process using socket IPC */
+int socket_send_fd(struct socket_info *skinfo);
+
+/* This is used to receive FD from another process using socket IPC */
+int socket_receive_fd(struct socket_info *skinfo);
+
+
+#endif
diff --git a/tools/testing/selftests/android/ion/ipcsocket.c b/tools/testing/selftests/android/ion/ipcsocket.c
new file mode 100644
index 000000000..7dc521002
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ipcsocket.c
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/un.h>
+#include <errno.h>
+
+#include "ipcsocket.h"
+
+
+/*
+ * Open a unix-domain stream connection on the path "/tmp/<name>".
+ *
+ * connecttype == 1: server. Binds and listens on the path, blocks in
+ *                   accept() for one client and returns the connected fd
+ *                   in *sockfd (the listening socket is closed again).
+ * otherwise:        client. Connects to the path and returns the connected
+ *                   fd in *sockfd.
+ *
+ * Returns 0 on success. On failure returns a negative value, closes every
+ * fd opened here and sets *sockfd to -1.
+ */
+int opensocket(int *sockfd, const char *name, int connecttype)
+{
+	int ret, fd, temp = 1;
+	struct sockaddr_un skaddr;
+
+	if (!sockfd || !name) {
+		fprintf(stderr, "<%s>: Invalid socket name.\n", __func__);
+		return -1;
+	}
+	*sockfd = -1;
+
+	/* The full path, not just the name, has to fit into sock_name */
+	ret = snprintf(sock_name, sizeof(sock_name), "/tmp/%s", name);
+	if (ret < 0 || (size_t)ret >= sizeof(sock_name)) {
+		fprintf(stderr, "<%s>: Invalid socket name.\n", __func__);
+		return -1;
+	}
+
+	fd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (fd < 0) {
+		fprintf(stderr, "<%s>: Failed socket: <%s>\n",
+			__func__, strerror(errno));
+		return fd;
+	}
+
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+			(char *)&temp, sizeof(int)) < 0) {
+		fprintf(stderr, "<%s>: Failed setsockopt: <%s>\n",
+			__func__, strerror(errno));
+		/* Must not report the (non-negative) fd as success */
+		ret = -1;
+		goto err;
+	}
+
+	memset(&skaddr, 0, sizeof(skaddr));
+	skaddr.sun_family = AF_LOCAL;
+	snprintf(skaddr.sun_path, sizeof(skaddr.sun_path), "%s", sock_name);
+
+	if (connecttype == 1) {
+		/* This is for Server connection */
+		socklen_t sklen;
+		int clientfd;
+
+		unlink(sock_name);
+
+		ret = bind(fd, (struct sockaddr *)&skaddr,
+			   SUN_LEN(&skaddr));
+		if (ret < 0) {
+			fprintf(stderr, "<%s>: Failed bind: <%s>\n",
+				__func__, strerror(errno));
+			goto err;
+		}
+
+		ret = listen(fd, 5);
+		if (ret < 0) {
+			fprintf(stderr, "<%s>: Failed listen: <%s>\n",
+				__func__, strerror(errno));
+			goto err;
+		}
+
+		memset(&skaddr, 0, sizeof(skaddr));
+		sklen = sizeof(skaddr);
+
+		ret = accept(fd, (struct sockaddr *)&skaddr, &sklen);
+		if (ret < 0) {
+			fprintf(stderr, "<%s>: Failed accept: <%s>\n",
+				__func__, strerror(errno));
+			goto err;
+		}
+
+		/* Only one client is served: drop the listening socket */
+		clientfd = ret;
+		close(fd);
+		fd = clientfd;
+	} else {
+		/* This is for client connection */
+		ret = connect(fd, (struct sockaddr *)&skaddr,
+			      SUN_LEN(&skaddr));
+		if (ret < 0) {
+			fprintf(stderr, "<%s>: Failed connect: <%s>\n",
+				__func__, strerror(errno));
+			goto err;
+		}
+	}
+
+	*sockfd = fd;
+	return 0;
+
+err:
+	close(fd);
+
+	return ret;
+}
+
+/*
+ * Send the file descriptor skdata->data over sockfd as an SCM_RIGHTS
+ * control message, together with the two payload bytes "OK".
+ *
+ * Waits up to 20 seconds for the socket to become writable.
+ * Returns 0 once the message has been handed to sendmsg(), -1 on a bad
+ * argument, select()/sendmsg() failure or timeout.
+ */
+int sendtosocket(int sockfd, struct socketdata *skdata)
+{
+	int ret, buffd;
+	unsigned int len;
+	char cmsg_b[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+	struct msghdr msgh;
+	struct iovec iov;
+	struct timeval timeout;
+	fd_set selFDs;
+
+	if (!skdata) {
+		fprintf(stderr, "<%s>: socketdata is NULL\n", __func__);
+		return -1;
+	}
+
+	buffd = skdata->data;
+	len = skdata->len;
+	/* cmsg_b only has room for a single int-sized descriptor */
+	if (len != sizeof(buffd)) {
+		fprintf(stderr, "<%s>: Invalid data length\n", __func__);
+		return -1;
+	}
+
+	/* Wait on the socket only: stdin is no indication it is writable */
+	FD_ZERO(&selFDs);
+	FD_SET(sockfd, &selFDs);
+	timeout.tv_sec = 20;
+	timeout.tv_usec = 0;
+
+	ret = select(sockfd+1, NULL, &selFDs, NULL, &timeout);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed select: <%s>\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	/* A timeout must not be reported as a successful send */
+	if (ret == 0 || !FD_ISSET(sockfd, &selFDs)) {
+		fprintf(stderr, "<%s>: Timed out waiting for socket\n",
+			__func__);
+		return -1;
+	}
+
+	memset(cmsg_b, 0, sizeof(cmsg_b));
+	memset(&msgh, 0, sizeof(msgh));
+	msgh.msg_control = &cmsg_b;
+	msgh.msg_controllen = CMSG_LEN(len);
+	iov.iov_base = "OK";
+	iov.iov_len = 2;
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(len);
+	memcpy(CMSG_DATA(cmsg), &buffd, len);
+
+	ret = sendmsg(sockfd, &msgh, MSG_DONTWAIT);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed sendmsg: <%s>\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Receive one file descriptor sent as an SCM_RIGHTS control message on
+ * sockfd and store it in skdata->data (skdata->len is set to sizeof(int)).
+ *
+ * Blocks until the socket is readable.
+ * Returns 0 on success, -1 on a bad argument, select()/recvmsg() failure
+ * or when the message carried no descriptor.
+ */
+int receivefromsocket(int sockfd, struct socketdata *skdata)
+{
+	int ret, buffd;
+	unsigned int len = sizeof(buffd);
+	char cmsg_b[CMSG_SPACE(sizeof(int))];
+	struct cmsghdr *cmsg;
+	struct msghdr msgh;
+	struct iovec iov;
+	fd_set recvFDs;
+	char data[32];
+
+	if (!skdata) {
+		fprintf(stderr, "<%s>: socketdata is NULL\n", __func__);
+		return -1;
+	}
+
+	/* Wait on the socket only: input on stdin is not socket data */
+	FD_ZERO(&recvFDs);
+	FD_SET(sockfd, &recvFDs);
+
+	ret = select(sockfd+1, &recvFDs, NULL, NULL, NULL);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed select: <%s>\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	if (!FD_ISSET(sockfd, &recvFDs)) {
+		fprintf(stderr, "<%s>: Socket not readable\n", __func__);
+		return -1;
+	}
+
+	memset(cmsg_b, 0, sizeof(cmsg_b));
+	memset(&msgh, 0, sizeof(msgh));
+	msgh.msg_control = &cmsg_b;
+	msgh.msg_controllen = sizeof(cmsg_b);
+	iov.iov_base = data;
+	iov.iov_len = sizeof(data)-1;
+	msgh.msg_iov = &iov;
+	msgh.msg_iovlen = 1;
+
+	ret = recvmsg(sockfd, &msgh, MSG_DONTWAIT);
+	if (ret < 0) {
+		fprintf(stderr, "<%s>: Failed recvmsg: <%s>\n",
+			__func__, strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * Only trust the control buffer if the kernel actually delivered a
+	 * descriptor, e.g. not when the peer closed without sending one.
+	 */
+	cmsg = CMSG_FIRSTHDR(&msgh);
+	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
+	    cmsg->cmsg_type != SCM_RIGHTS ||
+	    cmsg->cmsg_len != CMSG_LEN(len)) {
+		fprintf(stderr, "<%s>: No fd received\n", __func__);
+		return -1;
+	}
+
+	memcpy(&buffd, CMSG_DATA(cmsg), len);
+	skdata->data = buffd;
+	skdata->len = len;
+
+	return 0;
+}
+
+/*
+ * Shut down and close sockfd (when it is greater than zero) and remove the
+ * socket file "/tmp/<name>". Always returns 0.
+ */
+int closesocket(int sockfd, char *name)
+{
+	char sockname[MAX_SOCK_NAME_LEN];
+	int n;
+
+	if (sockfd > 0) {
+		/* shutdown() has to come first: the fd is gone after close() */
+		shutdown(sockfd, SHUT_RDWR);
+		close(sockfd);
+	}
+
+	if (name) {
+		n = snprintf(sockname, sizeof(sockname), "/tmp/%s", name);
+		/* Never unlink a truncated, i.e. different, path */
+		if (n >= 0 && (size_t)n < sizeof(sockname))
+			unlink(sockname);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/android/ion/ipcsocket.h b/tools/testing/selftests/android/ion/ipcsocket.h
new file mode 100644
index 000000000..b3e84498a
--- /dev/null
+++ b/tools/testing/selftests/android/ion/ipcsocket.h
@@ -0,0 +1,35 @@
+
+#ifndef _IPCSOCKET_H
+#define _IPCSOCKET_H
+
+
+#define MAX_SOCK_NAME_LEN 64
+
+char sock_name[MAX_SOCK_NAME_LEN];
+
+/* This structure is responsible for holding the IPC data
+ * data: hold the buffer fd
+ * len: just the length of 32-bit integer fd
+ */
+struct socketdata {
+ int data;
+ unsigned int len;
+};
+
+/* This API is used to open the IPC socket connection
+ * name: implies a unique socket name in the system
+ * connecttype: implies server(1) or client(0)
+ */
+int opensocket(int *sockfd, const char *name, int connecttype);
+
+/* This is the API to send socket data over IPC socket */
+int sendtosocket(int sockfd, struct socketdata *data);
+
+/* This is the API to receive socket data over IPC socket */
+int receivefromsocket(int sockfd, struct socketdata *data);
+
+/* This is the API to close the socket connection */
+int closesocket(int sockfd, char *name);
+
+
+#endif
diff --git a/tools/testing/selftests/android/run.sh b/tools/testing/selftests/android/run.sh
new file mode 100755
index 000000000..dd8edf291
--- /dev/null
+++ b/tools/testing/selftests/android/run.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+# Run the ION test from its own directory, but only if that directory can
+# actually be entered.
+(cd ion && ./ion_test.sh)
diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
new file mode 100644
index 000000000..2c9d01279
--- /dev/null
+++ b/tools/testing/selftests/arm64/Makefile
@@ -0,0 +1,66 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# When ARCH not overridden for crosscompiling, lookup machine
+ARCH ?= $(shell uname -m 2>/dev/null || echo not)
+
+ifneq (,$(filter $(ARCH),aarch64 arm64))
+ARM64_SUBTARGETS ?= tags signal pauth fp mte
+else
+ARM64_SUBTARGETS :=
+endif
+
+CFLAGS := -Wall -O2 -g
+
+# A proper top_srcdir is needed by KSFT(lib.mk)
+top_srcdir = $(realpath ../../../../)
+
+# Additional include paths needed by kselftest.h and local headers
+CFLAGS += -I$(top_srcdir)/tools/testing/selftests/
+
+# Guessing where the Kernel headers could have been installed
+# depending on ENV config
+ifeq ($(KBUILD_OUTPUT),)
+khdr_dir = $(top_srcdir)/usr/include
+else
+# the KSFT preferred location when KBUILD_OUTPUT is set
+khdr_dir = $(KBUILD_OUTPUT)/kselftest/usr/include
+endif
+
+CFLAGS += -I$(khdr_dir)
+
+export CFLAGS
+export top_srcdir
+
+# Every target below recurses into each directory listed in
+# ARM64_SUBTARGETS. $(MAKE) is used instead of a literal "make" so that
+# command-line flags and the jobserver are passed on to the sub-makes.
+all:
+	@for DIR in $(ARM64_SUBTARGETS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		mkdir -p $$BUILD_TARGET; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+	done
+
+install: all
+	@for DIR in $(ARM64_SUBTARGETS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+	done
+
+run_tests: all
+	@for DIR in $(ARM64_SUBTARGETS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+	done
+
+# Avoid any output on non arm64 on emit_tests
+emit_tests: all
+	@for DIR in $(ARM64_SUBTARGETS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+	done
+
+clean:
+	@for DIR in $(ARM64_SUBTARGETS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \
+	done
+
+.PHONY: all clean install run_tests emit_tests
diff --git a/tools/testing/selftests/arm64/README b/tools/testing/selftests/arm64/README
new file mode 100644
index 000000000..a1badd882
--- /dev/null
+++ b/tools/testing/selftests/arm64/README
@@ -0,0 +1,25 @@
+KSelfTest ARM64
+===============
+
+- These tests are arm64 specific and so not built or run but just skipped
+ completely when env-variable ARCH is found to be different than 'arm64'
+ and `uname -m` reports other than 'aarch64'.
+
+- Holding true the above, ARM64 KSFT tests can be run within the KSelfTest
+ framework using standard Linux top-level-makefile targets:
+
+ $ make TARGETS=arm64 kselftest-clean
+ $ make TARGETS=arm64 kselftest
+
+ or
+
+ $ make -C tools/testing/selftests TARGETS=arm64 \
+ INSTALL_PATH=<your-installation-path> install
+
+ or, alternatively, only specific arm64/ subtargets can be picked:
+
+ $ make -C tools/testing/selftests TARGETS=arm64 ARM64_SUBTARGETS="tags signal" \
+ INSTALL_PATH=<your-installation-path> install
+
+ Further details on building and running KSFT can be found in:
+ Documentation/dev-tools/kselftest.rst
diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
new file mode 100644
index 000000000..d66f76d2a
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -0,0 +1,5 @@
+fpsimd-test
+sve-probe-vls
+sve-ptrace
+sve-test
+vlset
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
new file mode 100644
index 000000000..a57009d3a
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Extra include directory, given relative to this directory.
+CFLAGS += -I../../../../../usr/include/
+# sve-ptrace and sve-probe-vls are the tests integrated with kselftest;
+# the remaining programs are the standalone stress tests and their helpers.
+TEST_GEN_PROGS := sve-ptrace sve-probe-vls
+TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress sve-test sve-stress vlset
+
+all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED)
+
+# fpsimd-test and sve-test are linked with -nostdlib, so they get explicit
+# link recipes; the other programs have no recipe in this file.
+fpsimd-test: fpsimd-test.o
+ $(CC) -nostdlib $^ -o $@
+sve-ptrace: sve-ptrace.o sve-ptrace-asm.o
+sve-probe-vls: sve-probe-vls.o
+sve-test: sve-test.o
+ $(CC) -nostdlib $^ -o $@
+vlset: vlset.o
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/arm64/fp/README b/tools/testing/selftests/arm64/fp/README
new file mode 100644
index 000000000..03e3dad86
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/README
@@ -0,0 +1,100 @@
+This directory contains a mix of tests integrated with kselftest and
+standalone stress tests.
+
+kselftest tests
+===============
+
+sve-probe-vls - Checks the SVE vector length enumeration interface
+sve-ptrace - Checks the SVE ptrace interface
+
+Running the non-kselftest tests
+===============================
+
+sve-stress performs an SVE context switch stress test, as described
+below.
+
+(The fpsimd-stress test works the same way; just substitute "fpsimd" for
+"sve" in the following commands.)
+
+
+The test runs until killed by the user.
+
+If no context switch error was detected, you will see output such as
+the following:
+
+$ ./sve-stress
+(wait for some time)
+^C
+Vector length: 512 bits
+PID: 1573
+Terminated by signal 15, no error, iterations=9467, signals=1014
+Vector length: 512 bits
+PID: 1575
+Terminated by signal 15, no error, iterations=9448, signals=1028
+Vector length: 512 bits
+PID: 1577
+Terminated by signal 15, no error, iterations=9436, signals=1039
+Vector length: 512 bits
+PID: 1579
+Terminated by signal 15, no error, iterations=9421, signals=1039
+Vector length: 512 bits
+PID: 1581
+Terminated by signal 15, no error, iterations=9403, signals=1039
+Vector length: 512 bits
+PID: 1583
+Terminated by signal 15, no error, iterations=9385, signals=1036
+Vector length: 512 bits
+PID: 1585
+Terminated by signal 15, no error, iterations=9376, signals=1039
+Vector length: 512 bits
+PID: 1587
+Terminated by signal 15, no error, iterations=9361, signals=1039
+Vector length: 512 bits
+PID: 1589
+Terminated by signal 15, no error, iterations=9350, signals=1039
+
+
+If an error was detected, details of the mismatch will be printed
+instead of "no error".
+
+Ideally, the test should be allowed to run for many minutes or hours
+to maximise test coverage.
+
+
+KVM stress testing
+==================
+
+To try to reproduce the bugs that we have been observing, sve-stress
+should be run in parallel in two KVM guests, while simultaneously
+running on the host.
+
+1) Start 2 guests, using the following command for each:
+
+$ lkvm run --console=virtio -pconsole=hvc0 --sve Image
+
+(Depending on the hardware GIC implementation, you may also need
+--irqchip=gicv3. New kvmtool defaults to that if appropriate, but I
+can't remember whether my branch is new enough for that. Try without
+the option first.)
+
+Kvmtool occupies the terminal until you kill it (Ctrl+A x),
+or until the guest terminates. It is therefore recommended to run
+each instance in a separate terminal (use screen or ssh etc.). This
+allows multiple guests to be run in parallel while running other
+commands on the host.
+
+Within the guest, the host filesystem is accessible, mounted on /host.
+
+2) Run the sve-stress on *each* guest with the Vector-Length set to 32:
+guest$ ./vlset --inherit 32 ./sve-stress
+
+3) Run the sve-stress on the host with the maximum Vector-Length:
+host$ ./vlset --inherit --max ./sve-stress
+
+
+Again, the test should be allowed to run for many minutes or hours to
+maximise test coverage.
+
+If no error is detected, you will see output from each sve-stress
+instance similar to that illustrated above; otherwise details of the
+observed mismatches will be printed.
diff --git a/tools/testing/selftests/arm64/fp/asm-offsets.h b/tools/testing/selftests/arm64/fp/asm-offsets.h
new file mode 100644
index 000000000..a18085149
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/asm-offsets.h
@@ -0,0 +1,11 @@
+#define sa_sz 32
+#define sa_flags 8
+#define sa_handler 0
+#define sa_mask_sz 8
+#define SIGUSR1 10
+#define SIGTERM 15
+#define SIGINT 2
+#define SIGABRT 6
+#define SA_NODEFER 1073741824
+#define SA_SIGINFO 4
+#define ucontext_regs 184
diff --git a/tools/testing/selftests/arm64/fp/assembler.h b/tools/testing/selftests/arm64/fp/assembler.h
new file mode 100644
index 000000000..8944f2189
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/assembler.h
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015-2019 ARM Limited.
+// Original author: Dave Martin <Dave.Martin@arm.com>
+
+#ifndef ASSEMBLER_H
+#define ASSEMBLER_H
+
+// Recursive helper for _for: expands _for__body once for every integer in
+// the inclusive range [from, to], in ascending order, by splitting the
+// range in half until from == to.
+// The %expr operands are evaluated to numbers by the assembler; _for
+// switches on .altmacro before invoking this macro.
+.macro __for from:req, to:req
+ .if (\from) == (\to)
+ _for__body %\from
+ .else
+ __for \from, %(\from) + ((\to) - (\from)) / 2
+ __for %(\from) + ((\to) - (\from)) / 2 + 1, \to
+ .endif
+.endm
+
+// Expand \insn once for each integer value of \var from \from to \to
+// (inclusive).
+// A temporary macro _for__body, taking \var as its parameter, carries the
+// instruction text; it is purged at the end so that _for can be used again.
+.macro _for var:req, from:req, to:req, insn:vararg
+ .macro _for__body \var:req
+ // \insn itself is assembled with altmacro mode switched off
+ .noaltmacro
+ \insn
+ .altmacro
+ .endm
+
+ .altmacro
+ __for \from, \to
+ .noaltmacro
+
+ .purgem _for__body
+.endm
+
+// Begin a function: emits the label \name and defines a one-shot
+// "endfunction" macro which, when invoked, marks \name as a function
+// symbol (.type) and then removes itself again.
+.macro function name
+ .macro endfunction
+ .type \name, @function
+ .purgem endfunction
+ .endm
+\name:
+.endm
+
+// Define a function \name that dispatches on x0 into a table of \num
+// entries. Entry n consists of "\insn n, 1" followed by ret.
+// The dispatch code computes table + (x0 << 3) and branches there, i.e. it
+// assumes every entry occupies exactly 8 bytes.
+// x2 is used as scratch by the dispatch code.
+.macro define_accessor name, num, insn
+ // Per-entry template; removed again at the end of this macro
+ .macro \name\()_entry n
+ \insn \n, 1
+ ret
+ .endm
+
+function \name
+ adr x2, .L__accessor_tbl\@
+ add x2, x2, x0, lsl #3
+ br x2
+
+.L__accessor_tbl\@:
+ // One entry for each n in 0 .. \num - 1
+ _for x, 0, (\num) - 1, \name\()_entry \x
+endfunction
+
+ .purgem \name\()_entry
+.endm
+
+#endif /* ! ASSEMBLER_H */
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-stress b/tools/testing/selftests/arm64/fp/fpsimd-stress
new file mode 100755
index 000000000..781b5b022
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fpsimd-stress
@@ -0,0 +1,60 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2015-2019 ARM Limited.
+# Original author: Dave Martin <Dave.Martin@arm.com>
+
+set -ue
+
+NR_CPUS=$(nproc)
+
+# Space-separated lists of child PIDs and of their log files
+pids=
+logs=
+
+# Stop all children, then print and delete their logs.
+# Safe to call more than once: both lists are emptied after use.
+cleanup () {
+	trap - INT TERM CHLD
+	set +e
+
+	if [ -n "$pids" ]; then
+		# Word splitting of the lists is intended here and below
+		kill $pids
+		wait $pids
+		pids=
+	fi
+
+	if [ -n "$logs" ]; then
+		cat $logs
+		rm $logs
+		logs=
+	fi
+}
+
+# The user stopped the test: this is the normal way to end it.
+interrupt () {
+	cleanup
+	exit 0
+}
+
+# A child exited on its own: treat that as a failure.
+child_died () {
+	cleanup
+	exit 1
+}
+
+trap interrupt INT TERM
+# On EXIT only clean up. Running "interrupt" here would end in "exit 0" and
+# so replace every failure status (child_died, set -e, the final exit 1)
+# with success.
+trap cleanup EXIT
+trap child_died CHLD
+
+for ((x = 0; x <= NR_CPUS * 4; x++)); do
+	log=$(mktemp)
+	logs="$logs $log"
+	./fpsimd-test >"$log" &
+	pids="$pids $!"
+done
+
+# Wait for all child processes to be created:
+sleep 10
+
+# Keep signalling the children with SIGUSR1 until we are stopped
+while :; do
+	kill -USR1 $pids
+done &
+pids="$pids $!"
+
+wait
+
+exit 1
diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S
new file mode 100644
index 000000000..1c5556bdd
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S
@@ -0,0 +1,482 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015-2019 ARM Limited.
+// Original author: Dave Martin <Dave.Martin@arm.com>
+//
+// Simple FPSIMD context switch test
+// Repeatedly writes unique test patterns into each FPSIMD register
+// and reads them back to verify integrity.
+//
+// for x in `seq 1 NR_CPUS`; do fpsimd-test & pids=$pids\ $! ; done
+// (leave it running for as long as you want...)
+// kill $pids
+
+#include <asm/unistd.h>
+#include "assembler.h"
+#include "asm-offsets.h"
+
+#define NVR 32
+#define MAXVL_B (128 / 8)
+
+.macro _vldr Vn:req, Xt:req
+ ld1 {v\Vn\().2d}, [x\Xt]
+.endm
+
+.macro _vstr Vn:req, Xt:req
+ st1 {v\Vn\().2d}, [x\Xt]
+.endm
+
+// Generate accessor functions to read/write programmatically selected
+// FPSIMD registers.
+// x0 is the register index to access
+// x1 is the memory address to read from (setv) or store to (getv)
+// All clobber x0-x2
+define_accessor setv, NVR, _vldr
+define_accessor getv, NVR, _vstr
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+ str x0, [sp, #-16]!
+
+ mov x0, #1 // STDOUT_FILENO
+ mov x1, sp
+ mov x2, #1
+ mov x8, #__NR_write
+ svc #0
+
+ add sp, sp, #16
+ ret
+endfunction
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+ mov x1, x0
+
+ mov x2, #0
+0: ldrb w3, [x0], #1
+ cbz w3, 1f
+ add x2, x2, #1
+ b 0b
+
+1: mov w0, #1 // STDOUT_FILENO
+ mov x8, #__NR_write
+ svc #0
+
+ ret
+endfunction
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+ .pushsection .rodata.str1.1, "aMS", 1
+.L__puts_literal\@: .string "\string"
+ .popsection
+
+ ldr x0, =.L__puts_literal\@
+ bl puts
+.endm
+
+// Print an unsigned decimal number x0 to stdout
+// Clobbers x0-x4,x8
+function putdec
+ // The string is built backwards from the top of a 32-byte stack
+ // frame: x1 is the write pointer and starts at the caller's sp.
+ // x30 is saved in the lowest 8 bytes of the frame, which leaves
+ // 24 bytes for the digits and the NUL terminator.
+ mov x1, sp
+ str x30, [sp, #-32]! // Result can't be > 20 digits
+
+ mov x2, #0
+ strb w2, [x1, #-1]! // Write the NUL terminator
+
+ mov x2, #10
+0: udiv x3, x0, x2 // div-mod loop to generate the digits
+ msub x0, x3, x2, x0 // x0 = x0 - x3 * 10, i.e. the remainder
+ add w0, w0, #'0'
+ strb w0, [x1, #-1]!
+ mov x0, x3
+ cbnz x3, 0b
+
+ // NOTE(review): the loop above always stores at least one digit, so
+ // the byte at [x1] is never NUL here; the branch below looks like it
+ // is always taken, leaving the next two instructions unreachable.
+ ldrb w0, [x1]
+ cbnz w0, 1f
+ mov w0, #'0' // Print "0" for 0, not ""
+ strb w0, [x1, #-1]!
+
+1: mov x0, x1
+ bl puts
+
+ ldr x30, [sp], #32
+ ret
+endfunction
+
+// Print an unsigned decimal number x0 to stdout, followed by a newline
+// Clobbers x0-x5,x8
+function putdecn
+ mov x5, x30
+
+ bl putdec
+ mov x0, #'\n'
+ bl putc
+
+ ret x5
+endfunction
+
+
+// Clobbers x0-x3,x8
+function puthexb
+ str x30, [sp, #-0x10]!
+
+ mov w3, w0
+ lsr w0, w0, #4
+ bl puthexnibble
+ mov w0, w3
+
+ ldr x30, [sp], #0x10
+ // fall through to puthexnibble
+endfunction
+// Clobbers x0-x2,x8
+function puthexnibble
+ and w0, w0, #0xf
+ cmp w0, #10
+ blo 1f
+ add w0, w0, #'a' - ('9' + 1)
+1: add w0, w0, #'0'
+ b putc
+endfunction
+
+// x0=data in, x1=size in, clobbers x0-x5,x8
+function dumphex
+ str x30, [sp, #-0x10]!
+
+ mov x4, x0
+ mov x5, x1
+
+0: subs x5, x5, #1
+ b.lo 1f
+ ldrb w0, [x4], #1
+ bl puthexb
+ b 0b
+
+1: ldr x30, [sp], #0x10
+ ret
+endfunction
+
+// Declare some storage space to shadow the FPSIMD register contents:
+.pushsection .text
+.data
+.align 4
+vref:
+ .space MAXVL_B * NVR
+scratch:
+ .space MAXVL_B
+.popsection
+
+// Trivial memory copy: copy x2 bytes, starting at address x1, to address x0.
+// Clobbers x0-x3
+function memcpy
+ cmp x2, #0
+ b.eq 1f
+0: ldrb w3, [x1], #1
+ strb w3, [x0], #1
+ subs x2, x2, #1
+ b.ne 0b
+1: ret
+endfunction
+
+// Generate a test pattern for storage in SVE registers
+// x0: pid (16 bits)
+// x1: register number (6 bits)
+// x2: generation (4 bits)
+function pattern
+ orr w1, w0, w1, lsl #16
+ orr w2, w1, w2, lsl #28
+
+ ldr x0, =scratch
+ mov w1, #MAXVL_B / 4
+
+0: str w2, [x0], #4
+ add w2, w2, #(1 << 22)
+ subs w1, w1, #1
+ bne 0b
+
+ ret
+endfunction
+
+// Get the address of shadow data for FPSIMD V-register V<xn>
+.macro _adrv xd, xn, nrtmp
+ ldr \xd, =vref
+ mov x\nrtmp, #16
+ madd \xd, x\nrtmp, \xn, \xd
+.endm
+
+// Set up test pattern in a FPSIMD V-register
+// x0: pid
+// x1: register number
+// x2: generation
+function setup_vreg
+ mov x4, x30
+
+ mov x6, x1
+ bl pattern
+ _adrv x0, x6, 2
+ mov x5, x0
+ ldr x1, =scratch
+ bl memcpy
+
+ mov x0, x6
+ mov x1, x5
+ bl setv
+
+ ret x4
+endfunction
+
+// Fill x1 bytes starting at x0 with 0xae (for canary purposes)
+// Clobbers x1, x2.
+function memfill_ae
+ mov w2, #0xae
+ b memfill
+endfunction
+
+// Fill x1 bytes starting at x0 with 0.
+// Clobbers x1, x2.
+function memclr
+ mov w2, #0
+endfunction
+ // fall through to memfill
+
+// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2
+// Clobbers x1
+function memfill
+ cmp x1, #0
+ b.eq 1f
+
+0: strb w2, [x0], #1
+ subs x1, x1, #1
+ b.ne 0b
+
+1: ret
+endfunction
+
+// Trivial memory compare: compare x2 bytes starting at address x0 with
+// bytes starting at address x1.
+// Returns only if all bytes match; otherwise, the program is aborted via
+// barf, which is entered with x0, x1 and x2 still holding the values passed
+// in here so that it can dump both buffers in full.
+// Clobbers x3-x5 and the condition flags.
+function memcmp
+ cbz x2, 1f
+
+ mov x5, #0 // x5: index of the next byte to compare
+0: ldrb w3, [x0, x5]
+ ldrb w4, [x1, x5]
+ add x5, x5, #1
+ cmp w3, w4
+ b.ne barf
+ // Count x5 up to x2 instead of counting x2 down: barf uses x2 as the
+ // number of bytes to dump, so it must not be modified here.
+ cmp x5, x2
+ b.ne 0b
+
+1: ret
+endfunction
+
+// Verify that a FPSIMD V-register matches its shadow in memory, else abort
+// x0: reg number
+// Clobbers x0-x7.
+function check_vreg
+ mov x3, x30 // keep the return address across the calls below
+
+ // x5 = address of the shadow copy for this register.
+ // As a side effect _adrv leaves the register size (16 bytes) in x6,
+ // which is used as the length for the fill and compare below.
+ _adrv x5, x0, 6
+ mov x4, x0
+ ldr x7, =scratch
+
+ // Pre-fill scratch with the 0xae canary
+ mov x0, x7
+ mov x1, x6
+ bl memfill_ae
+
+ // Store the live register contents into scratch
+ mov x0, x4
+ mov x1, x7
+ bl getv
+
+ // Tail-call memcmp(shadow, scratch, size).  x30 is restored first:
+ // memcmp overwrites w3, and its ret then returns to our caller.
+ mov x0, x5
+ mov x1, x7
+ mov x2, x6
+ mov x30, x3
+ b memcmp
+endfunction
+
+// Any SVE register modified here can cause corruption in the main
+// thread -- but *only* the registers modified here.
+function irritator_handler
+ // Increment the irritation signal count (x23):
+ ldr x0, [x2, #ucontext_regs + 8 * 23]
+ add x0, x0, #1
+ str x0, [x2, #ucontext_regs + 8 * 23]
+
+ // Corrupt some random V-regs
+ adr x0, .text + (irritator_handler - .text) / 16 * 16
+ movi v0.8b, #7
+ movi v9.16b, #9
+ movi v31.8b, #31
+
+ ret
+endfunction
+
+function terminate_handler
+ mov w21, w0
+ mov x20, x2
+
+ puts "Terminated by signal "
+ mov w0, w21
+ bl putdec
+ puts ", no error, iterations="
+ ldr x0, [x20, #ucontext_regs + 8 * 22]
+ bl putdec
+ puts ", signals="
+ ldr x0, [x20, #ucontext_regs + 8 * 23]
+ bl putdecn
+
+ mov x0, #0
+ mov x8, #__NR_exit
+ svc #0
+endfunction
+
+// w0: signal number
+// x1: sa_action
+// w2: sa_flags
+// Clobbers x0-x6,x8
+function setsignal
+ str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
+
+ mov w4, w0
+ mov x5, x1
+ mov w6, w2
+
+ add x0, sp, #16
+ mov x1, #sa_sz
+ bl memclr
+
+ mov w0, w4
+ add x1, sp, #16
+ str w6, [x1, #sa_flags]
+ str x5, [x1, #sa_handler]
+ mov x2, #0
+ mov x3, #sa_mask_sz
+ mov x8, #__NR_rt_sigaction
+ svc #0
+
+ cbz w0, 1f
+
+ puts "sigaction failure\n"
+ b .Labort
+
+1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
+ ret
+endfunction
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+ // Sanity-check and report the vector length
+
+ mov x19, #128
+ cmp x19, #128
+ b.lo 1f
+ cmp x19, #2048
+ b.hi 1f
+ tst x19, #(8 - 1)
+ b.eq 2f
+
+1: puts "Bad vector length: "
+ mov x0, x19
+ bl putdecn
+ b .Labort
+
+2: puts "Vector length:\t"
+ mov x0, x19
+ bl putdec
+ puts " bits\n"
+
+ // Obtain our PID, to ensure test pattern uniqueness between processes
+
+ mov x8, #__NR_getpid
+ svc #0
+ mov x20, x0
+
+ puts "PID:\t"
+ mov x0, x20
+ bl putdecn
+
+ mov x23, #0 // Irritation signal count
+
+ mov w0, #SIGINT
+ adr x1, terminate_handler
+ mov w2, #SA_SIGINFO
+ bl setsignal
+
+ mov w0, #SIGTERM
+ adr x1, terminate_handler
+ mov w2, #SA_SIGINFO
+ bl setsignal
+
+ mov w0, #SIGUSR1
+ adr x1, irritator_handler
+ mov w2, #SA_SIGINFO
+ orr w2, w2, #SA_NODEFER
+ bl setsignal
+
+ mov x22, #0 // generation number, increments per iteration
+.Ltest_loop:
+
+ mov x21, #0 // Set up V-regs & shadow with test pattern
+0: mov x0, x20
+ mov x1, x21
+ and x2, x22, #0xf
+ bl setup_vreg
+ add x21, x21, #1
+ cmp x21, #NVR
+ b.lo 0b
+
+// Unlike sve-test (where SVE state is volatile across SVC), we can do this here:
+ mov x8, #__NR_sched_yield // Encourage preemption
+ svc #0
+
+ mov x21, #0
+0: mov x0, x21
+ bl check_vreg
+ add x21, x21, #1
+ cmp x21, #NVR
+ b.lo 0b
+
+ add x22, x22, #1
+ b .Ltest_loop
+
+.Labort:
+ mov x0, #0
+ mov x1, #SIGABRT
+ mov x8, #__NR_kill
+ svc #0
+endfunction
+
+// Report a mismatch between a register and its shadow, then terminate the
+// process with exit status 1.
+// x0: expected data
+// x1: actual data
+// x2: data size in bytes
+// Also reads the test loop's state: x20 (PID), x21 (register number being
+// checked) and x22 (iteration count).
+// Does not return.
+function barf
+ // Keep the arguments in registers that the print helpers leave alone
+ mov x10, x0 // expected data
+ mov x11, x1 // actual data
+ mov x12, x2 // data size
+
+ puts "Mistatch: PID="
+ mov x0, x20
+ bl putdec
+ puts ", iteration="
+ mov x0, x22
+ bl putdec
+ puts ", reg="
+ mov x0, x21
+ bl putdecn
+ puts "\tExpected ["
+ mov x0, x10
+ mov x1, x12
+ bl dumphex
+ puts "]\n\tGot ["
+ mov x0, x11
+ mov x1, x12
+ bl dumphex
+ puts "]\n"
+
+ mov x8, #__NR_exit
+ // The exit status is the first syscall argument, so it goes in x0.
+ // Setting x1 instead would exit with whatever the last write returned.
+ mov x0, #1
+ svc #0
+endfunction
diff --git a/tools/testing/selftests/arm64/fp/sve-probe-vls.c b/tools/testing/selftests/arm64/fp/sve-probe-vls.c
new file mode 100644
index 000000000..b29cbc642
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/sve-probe-vls.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015-2020 ARM Limited.
+ * Original author: Dave Martin <Dave.Martin@arm.com>
+ */
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <asm/sigcontext.h>
+
+#include "../../kselftest.h"
+
+/*
+ * Enumerate the SVE vector lengths available to this process and print them
+ * in ascending order, one per line, in bytes.
+ *
+ * Reports two kselftest results; skips if the CPU has no SVE and exits with
+ * a failure message if PR_SVE_SET_VL fails or reports an invalid length.
+ */
+int main(int argc, char **argv)
+{
+ unsigned int vq;
+ int vl;
+ static unsigned int vqs[SVE_VQ_MAX];
+ unsigned int nvqs = 0;
+
+ ksft_print_header();
+ ksft_set_plan(2);
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+ ksft_exit_skip("SVE not available\n");
+
+ /*
+ * Enumerate up to SVE_VQ_MAX vector lengths
+ */
+ for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+ vl = prctl(PR_SVE_SET_VL, vq * 16);
+ if (vl == -1)
+ ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+ strerror(errno), errno);
+
+ vl &= PR_SVE_VL_LEN_MASK;
+
+ if (!sve_vl_valid(vl))
+ ksft_exit_fail_msg("VL %d invalid\n", vl);
+ /*
+ * Continue from the length that was actually set, so that the
+ * next request (after --vq) is for something shorter.
+ */
+ vq = sve_vq_from_vl(vl);
+
+ if (!(nvqs < SVE_VQ_MAX))
+ ksft_exit_fail_msg("Too many VLs %u >= SVE_VQ_MAX\n",
+ nvqs);
+ vqs[nvqs++] = vq;
+ }
+ ksft_test_result_pass("Enumerated %u vector lengths\n", nvqs);
+ ksft_test_result_pass("All vector lengths valid\n");
+
+ /* Print out the vector lengths in ascending order: */
+ while (nvqs--)
+ ksft_print_msg("%u\n", 16 * vqs[nvqs]);
+
+ ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S b/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S
new file mode 100644
index 000000000..3e81f9fab
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace-asm.S
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015-2019 ARM Limited.
+// Original author: Dave Martin <Dave.Martin@arm.com>
+#include <asm/unistd.h>
+
+.arch_extension sve
+
+.globl sve_store_patterns
+
+// Store five snapshots of q0 (the low 128 bits of z0) to memory, taken
+// before and after getpid syscalls and SVE register moves.
+// x0: address of the output buffer (5 x 16 bytes)
+// Clobbers x0, x1, w8, z0, z1.
+sve_store_patterns:
+ mov x1, x0 // x0 is overwritten by the syscall return value
+
+ index z0.b, #0, #1 // z0 bytes = 0, 1, 2, ...
+ str q0, [x1]
+
+ mov w8, #__NR_getpid
+ svc #0
+ str q0, [x1, #0x10]
+
+ mov z1.d, z0.d
+ str q0, [x1, #0x20]
+
+ mov w8, #__NR_getpid
+ svc #0
+ str q0, [x1, #0x30]
+
+ mov z1.d, z0.d
+ str q0, [x1, #0x40]
+
+ ret
+
+.size sve_store_patterns, . - sve_store_patterns
+.type sve_store_patterns, @function
diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
new file mode 100644
index 000000000..612d38996
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015-2020 ARM Limited.
+ * Original author: Dave Martin <Dave.Martin@arm.com>
+ */
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/wait.h>
+#include <asm/sigcontext.h>
+#include <asm/ptrace.h>
+
+#include "../../kselftest.h"
+
+/* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
+#ifndef NT_ARM_SVE
+#define NT_ARM_SVE 0x405
+#endif
+
+/* Number of registers filled in by sve_store_patterns */
+#define NR_VREGS 5
+
+void sve_store_patterns(__uint128_t v[NR_VREGS]);
+
+static void dump(const void *buf, size_t size)
+{
+ size_t i;
+ const unsigned char *p = buf;
+
+ for (i = 0; i < size; ++i)
+ printf(" %.2x", *p++);
+}
+
+static int check_vregs(const __uint128_t vregs[NR_VREGS])
+{
+ int i;
+ int ok = 1;
+
+ for (i = 0; i < NR_VREGS; ++i) {
+ printf("# v[%d]:", i);
+ dump(&vregs[i], sizeof vregs[i]);
+ putchar('\n');
+
+ if (vregs[i] != vregs[0])
+ ok = 0;
+ }
+
+ return ok;
+}
+
+/*
+ * Child side of the test: ask to be traced by the parent, then stop so that
+ * the parent can inspect and modify our registers.
+ *
+ * Returns EXIT_SUCCESS if the child is ever resumed; exits through
+ * ksft_exit_fail_msg() if either step fails.
+ */
+static int do_child(void)
+{
+ if (ptrace(PTRACE_TRACEME, -1, NULL, NULL))
+ ksft_exit_fail_msg("PTRACE_TRACEME: %s\n", strerror(errno));
+
+ if (raise(SIGSTOP))
+ ksft_exit_fail_msg("raise(SIGSTOP): %s\n", strerror(errno));
+
+ return EXIT_SUCCESS;
+}
+
+/*
+ * Read the NT_ARM_SVE regset of a traced process.
+ *
+ * pid:  the tracee
+ * buf:  in/out: heap buffer (or NULL) that receives the regset; it is grown
+ *       with realloc() when too small and *buf is updated to match
+ * size: in/out: allocated size of *buf in bytes
+ *
+ * Returns a pointer to the header at the start of *buf, or NULL with errno
+ * set if the allocation or the ptrace() call fails.  *buf remains owned by
+ * the caller in both cases.
+ */
+static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
+{
+ struct user_sve_header *sve;
+ void *p;
+ size_t sz = sizeof *sve; /* first attempt: just the header */
+ struct iovec iov;
+
+ while (1) {
+ /* Grow the buffer if it cannot hold sz bytes yet */
+ if (*size < sz) {
+ p = realloc(*buf, sz);
+ if (!p) {
+ errno = ENOMEM;
+ goto error;
+ }
+
+ *buf = p;
+ *size = sz;
+ }
+
+ iov.iov_base = *buf;
+ iov.iov_len = sz;
+ if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov))
+ goto error;
+
+ /* Done once the size reported in the header fits in sz */
+ sve = *buf;
+ if (sve->size <= sz)
+ break;
+
+ /* Otherwise retry, asking for the reported size */
+ sz = sve->size;
+ }
+
+ return sve;
+
+error:
+ return NULL;
+}
+
+static int set_sve(pid_t pid, const struct user_sve_header *sve)
+{
+ struct iovec iov;
+
+ iov.iov_base = (void *)sve;
+ iov.iov_len = sve->size;
+ return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
+}
+
+static void dump_sve_regs(const struct user_sve_header *sve, unsigned int num,
+ unsigned int vlmax)
+{
+ unsigned int vq;
+ unsigned int i;
+
+ if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_SVE)
+ ksft_exit_fail_msg("Dumping non-SVE register\n");
+
+ if (vlmax > sve->vl)
+ vlmax = sve->vl;
+
+ vq = sve_vq_from_vl(sve->vl);
+ for (i = 0; i < num; ++i) {
+ printf("# z%u:", i);
+ dump((const char *)sve + SVE_PT_SVE_ZREG_OFFSET(vq, i),
+ vlmax);
+ printf("%s\n", vlmax == sve->vl ? "" : " ...");
+ }
+}
+
+static int do_parent(pid_t child)
+{
+ int ret = EXIT_FAILURE;
+ pid_t pid;
+ int status;
+ siginfo_t si;
+ void *svebuf = NULL, *newsvebuf;
+ size_t svebufsz = 0, newsvebufsz;
+ struct user_sve_header *sve, *new_sve;
+ struct user_fpsimd_state *fpsimd;
+ unsigned int i, j;
+ unsigned char *p;
+ unsigned int vq;
+
+ /* Attach to the child */
+ while (1) {
+ int sig;
+
+ pid = wait(&status);
+ if (pid == -1) {
+ perror("wait");
+ goto error;
+ }
+
+ /*
+ * This should never happen but it's hard to flag in
+ * the framework.
+ */
+ if (pid != child)
+ continue;
+
+ if (WIFEXITED(status) || WIFSIGNALED(status))
+ ksft_exit_fail_msg("Child died unexpectedly\n");
+
+ ksft_test_result(WIFSTOPPED(status), "WIFSTOPPED(%d)\n",
+ status);
+ if (!WIFSTOPPED(status))
+ goto error;
+
+ sig = WSTOPSIG(status);
+
+ if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &si)) {
+ if (errno == ESRCH)
+ goto disappeared;
+
+ if (errno == EINVAL) {
+ sig = 0; /* bust group-stop */
+ goto cont;
+ }
+
+ ksft_test_result_fail("PTRACE_GETSIGINFO: %s\n",
+ strerror(errno));
+ goto error;
+ }
+
+ if (sig == SIGSTOP && si.si_code == SI_TKILL &&
+ si.si_pid == pid)
+ break;
+
+ cont:
+ if (ptrace(PTRACE_CONT, pid, NULL, sig)) {
+ if (errno == ESRCH)
+ goto disappeared;
+
+ ksft_test_result_fail("PTRACE_CONT: %s\n",
+ strerror(errno));
+ goto error;
+ }
+ }
+
+ sve = get_sve(pid, &svebuf, &svebufsz);
+ if (!sve) {
+ int e = errno;
+
+ ksft_test_result_fail("get_sve: %s\n", strerror(errno));
+ if (e == ESRCH)
+ goto disappeared;
+
+ goto error;
+ } else {
+ ksft_test_result_pass("get_sve\n");
+ }
+
+ ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD,
+ "FPSIMD registers\n");
+ if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD)
+ goto error;
+
+ fpsimd = (struct user_fpsimd_state *)((char *)sve +
+ SVE_PT_FPSIMD_OFFSET);
+ for (i = 0; i < 32; ++i) {
+ p = (unsigned char *)&fpsimd->vregs[i];
+
+ for (j = 0; j < sizeof fpsimd->vregs[i]; ++j)
+ p[j] = j;
+ }
+
+ if (set_sve(pid, sve)) {
+ int e = errno;
+
+ ksft_test_result_fail("set_sve(FPSIMD): %s\n",
+ strerror(errno));
+ if (e == ESRCH)
+ goto disappeared;
+
+ goto error;
+ }
+
+ vq = sve_vq_from_vl(sve->vl);
+
+ newsvebufsz = SVE_PT_SVE_ZREG_OFFSET(vq, 1);
+ new_sve = newsvebuf = malloc(newsvebufsz);
+ if (!new_sve) {
+ errno = ENOMEM;
+ perror(NULL);
+ goto error;
+ }
+
+ *new_sve = *sve;
+ new_sve->flags &= ~SVE_PT_REGS_MASK;
+ new_sve->flags |= SVE_PT_REGS_SVE;
+ memset((char *)new_sve + SVE_PT_SVE_ZREG_OFFSET(vq, 0),
+ 0, SVE_PT_SVE_ZREG_SIZE(vq));
+ new_sve->size = SVE_PT_SVE_ZREG_OFFSET(vq, 1);
+ if (set_sve(pid, new_sve)) {
+ int e = errno;
+
+ ksft_test_result_fail("set_sve(ZREG): %s\n", strerror(errno));
+ if (e == ESRCH)
+ goto disappeared;
+
+ goto error;
+ }
+
+ new_sve = get_sve(pid, &newsvebuf, &newsvebufsz);
+ if (!new_sve) {
+ int e = errno;
+
+ ksft_test_result_fail("get_sve(ZREG): %s\n", strerror(errno));
+ if (e == ESRCH)
+ goto disappeared;
+
+ goto error;
+ }
+
+ ksft_test_result((new_sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE,
+ "SVE registers\n");
+ if ((new_sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_SVE)
+ goto error;
+
+ dump_sve_regs(new_sve, 3, sizeof fpsimd->vregs[0]);
+
+ p = (unsigned char *)new_sve + SVE_PT_SVE_ZREG_OFFSET(vq, 1);
+ for (i = 0; i < sizeof fpsimd->vregs[0]; ++i) {
+ unsigned char expected = i;
+
+ if (__BYTE_ORDER == __BIG_ENDIAN)
+ expected = sizeof fpsimd->vregs[0] - 1 - expected;
+
+ ksft_test_result(p[i] == expected, "p[%d] == expected\n", i);
+ if (p[i] != expected)
+ goto error;
+ }
+
+ ret = EXIT_SUCCESS;
+
+error:
+ kill(child, SIGKILL);
+
+disappeared:
+ return ret;
+}
+
+/*
+ * Test entry point: sanity-check the local SVE state, then fork a child to
+ * be traced and run the ptrace checks against it from the parent.
+ *
+ * Returns EXIT_SUCCESS if do_parent() succeeds and EXIT_FAILURE otherwise;
+ * skips if the CPU has no SVE.
+ */
+int main(void)
+{
+ int ret = EXIT_SUCCESS;
+ __uint128_t v[NR_VREGS];
+ pid_t child;
+
+ ksft_print_header();
+ ksft_set_plan(20);
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+ ksft_exit_skip("SVE not available\n");
+
+ sve_store_patterns(v);
+
+ if (!check_vregs(v))
+ ksft_exit_fail_msg("Initial check_vregs() failed\n");
+
+ child = fork();
+ /*
+ * Stop here if fork() failed: do_parent() would otherwise run with
+ * child == -1 and its error path would call kill(-1, SIGKILL).
+ */
+ if (child < 0)
+ ksft_exit_fail_msg("fork() failed: %s\n", strerror(errno));
+ if (!child)
+ return do_child();
+
+ if (do_parent(child))
+ ret = EXIT_FAILURE;
+
+ ksft_print_cnts();
+
+ return ret;
+}
diff --git a/tools/testing/selftests/arm64/fp/sve-stress b/tools/testing/selftests/arm64/fp/sve-stress
new file mode 100755
index 000000000..24dd0922c
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/sve-stress
@@ -0,0 +1,59 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2015-2019 ARM Limited.
+# Original author: Dave Martin <Dave.Martin@arm.com>
+
+set -ue
+
+NR_CPUS=`nproc`
+
+pids=
+logs=
+
+cleanup () {
+ trap - INT TERM CHLD
+ set +e
+
+ if [ -n "$pids" ]; then
+ kill $pids
+ wait $pids
+ pids=
+ fi
+
+ if [ -n "$logs" ]; then
+ cat $logs
+ rm $logs
+ logs=
+ fi
+}
+
+interrupt () {
+ cleanup
+ exit 0
+}
+
+child_died () {
+ cleanup
+ exit 1
+}
+
+trap interrupt INT TERM EXIT
+
+for x in `seq 0 $((NR_CPUS * 4))`; do
+ log=`mktemp`
+ logs=$logs\ $log
+ ./sve-test >$log &
+ pids=$pids\ $!
+done
+
+# Wait for all child processes to be created:
+sleep 10
+
+while :; do
+ kill -USR1 $pids
+done &
+pids=$pids\ $!
+
+wait
+
+exit 1
diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S
new file mode 100644
index 000000000..07f14e279
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/sve-test.S
@@ -0,0 +1,684 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2015-2019 ARM Limited.
+// Original author: Dave Martin <Dave.Martin@arm.com>
+//
+// Simple Scalable Vector Extension context switch test
+// Repeatedly writes unique test patterns into each SVE register
+// and reads them back to verify integrity.
+//
+// for x in `seq 1 NR_CPUS`; do sve-test & pids=$pids\ $! ; done
+// (leave it running for as long as you want...)
+// kill $pids
+
+#include <asm/unistd.h>
+#include "assembler.h"
+#include "asm-offsets.h"
+
+#define NZR 32
+#define NPR 16
+#define MAXVL_B (2048 / 8)
+
+.arch_extension sve
+
+.macro _sve_ldr_v zt, xn
+ ldr z\zt, [x\xn]
+.endm
+
+.macro _sve_str_v zt, xn
+ str z\zt, [x\xn]
+.endm
+
+.macro _sve_ldr_p pt, xn
+ ldr p\pt, [x\xn]
+.endm
+
+.macro _sve_str_p pt, xn
+ str p\pt, [x\xn]
+.endm
+
+// Generate accessor functions to read/write programmatically selected
+// SVE registers.
+// x0 is the register index to access
+// x1 is the memory address to read from (setz,setp) or store to (getz,getp)
+// All clobber x0-x2
+define_accessor setz, NZR, _sve_ldr_v
+define_accessor getz, NZR, _sve_str_v
+define_accessor setp, NPR, _sve_ldr_p
+define_accessor getp, NPR, _sve_str_p
+
+// Print a single character x0 to stdout
+// Clobbers x0-x2,x8
+function putc
+ str x0, [sp, #-16]!
+
+ mov x0, #1 // STDOUT_FILENO
+ mov x1, sp
+ mov x2, #1
+ mov x8, #__NR_write
+ svc #0
+
+ add sp, sp, #16
+ ret
+endfunction
+
+// Print a NUL-terminated string starting at address x0 to stdout
+// Clobbers x0-x3,x8
+function puts
+ mov x1, x0
+
+ mov x2, #0
+0: ldrb w3, [x0], #1
+ cbz w3, 1f
+ add x2, x2, #1
+ b 0b
+
+1: mov w0, #1 // STDOUT_FILENO
+ mov x8, #__NR_write
+ svc #0
+
+ ret
+endfunction
+
+// Utility macro to print a literal string
+// Clobbers x0-x4,x8
+.macro puts string
+ .pushsection .rodata.str1.1, "aMS", 1
+.L__puts_literal\@: .string "\string"
+ .popsection
+
+ ldr x0, =.L__puts_literal\@
+ bl puts
+.endm
+
+// Print an unsigned decimal number x0 to stdout
+// Clobbers x0-x4,x8
+function putdec
+ mov x1, sp
+ str x30, [sp, #-32]! // Result can't be > 20 digits
+
+ mov x2, #0
+ strb w2, [x1, #-1]! // Write the NUL terminator
+
+ mov x2, #10
+0: udiv x3, x0, x2 // div-mod loop to generate the digits
+ msub x0, x3, x2, x0
+ add w0, w0, #'0'
+ strb w0, [x1, #-1]!
+ mov x0, x3
+ cbnz x3, 0b
+
+ ldrb w0, [x1]
+ cbnz w0, 1f
+ mov w0, #'0' // Print "0" for 0, not ""
+ strb w0, [x1, #-1]!
+
+1: mov x0, x1
+ bl puts
+
+ ldr x30, [sp], #32
+ ret
+endfunction
+
+// Print an unsigned decimal number x0 to stdout, followed by a newline
+// Clobbers x0-x5,x8
+function putdecn
+ mov x5, x30
+
+ bl putdec
+ mov x0, #'\n'
+ bl putc
+
+ ret x5
+endfunction
+
+// Clobbers x0-x3,x8
+function puthexb
+ str x30, [sp, #-0x10]!
+
+ mov w3, w0
+ lsr w0, w0, #4
+ bl puthexnibble
+ mov w0, w3
+
+ ldr x30, [sp], #0x10
+ // fall through to puthexnibble
+endfunction
+// Clobbers x0-x2,x8
+function puthexnibble
+ and w0, w0, #0xf
+ cmp w0, #10
+ blo 1f
+ add w0, w0, #'a' - ('9' + 1)
+1: add w0, w0, #'0'
+ b putc
+endfunction
+
+// x0=data in, x1=size in, clobbers x0-x5,x8
+function dumphex
+ str x30, [sp, #-0x10]!
+
+ mov x4, x0
+ mov x5, x1
+
+0: subs x5, x5, #1
+ b.lo 1f
+ ldrb w0, [x4], #1
+ bl puthexb
+ b 0b
+
+1: ldr x30, [sp], #0x10
+ ret
+endfunction
+
+// Declare some storage space to shadow the SVE register contents:
+.pushsection .text
+.data
+.align 4
+zref:
+ .space MAXVL_B * NZR
+pref:
+ .space MAXVL_B / 8 * NPR
+ffrref:
+ .space MAXVL_B / 8
+scratch:
+ .space MAXVL_B
+.popsection
+
+// Trivial memory copy: copy x2 bytes, starting at address x1, to address x0.
+// Clobbers x0-x3
+function memcpy
+ cmp x2, #0
+ b.eq 1f
+0: ldrb w3, [x1], #1
+ strb w3, [x0], #1
+ subs x2, x2, #1
+ b.ne 0b
+1: ret
+endfunction
+
+// Generate a test pattern for storage in SVE registers
+// x0: pid (16 bits)
+// x1: register number (6 bits)
+// x2: generation (4 bits)
+
+// These values are used to construct a 32-bit pattern that is repeated in the
+// scratch buffer as many times as will fit:
+// bits 31:28 generation number (increments once per test_loop)
+// bits 27:22 32-bit lane index
+// bits 21:16 register number
+// bits 15: 0 pid
+
+function pattern
+ orr w1, w0, w1, lsl #16
+ orr w2, w1, w2, lsl #28
+
+ ldr x0, =scratch
+ mov w1, #MAXVL_B / 4
+
+0: str w2, [x0], #4
+ add w2, w2, #(1 << 22)
+ subs w1, w1, #1
+ bne 0b
+
+ ret
+endfunction
+
+// Get the address of shadow data for SVE Z-register Z<xn>
+.macro _adrz xd, xn, nrtmp
+ ldr \xd, =zref
+ rdvl x\nrtmp, #1
+ madd \xd, x\nrtmp, \xn, \xd
+.endm
+
+// Get the address of shadow data for SVE P-register P<xn - NZR>
+//   xd:    destination register for the address
+//   xn:    register holding the combined register number (NZR + P index);
+//          it is modified in place and holds the P index afterwards
+//   nrtmp: number of a scratch X register; afterwards it holds the size of
+//          one P-register in bytes (vector length in bytes / 8)
+.macro _adrp xd, xn, nrtmp
+ ldr \xd, =pref
+ rdvl x\nrtmp, #1
+ lsr x\nrtmp, x\nrtmp, #3
+ sub \xn, \xn, #NZR
+ madd \xd, x\nrtmp, \xn, \xd
+.endm
+
+// Set up test pattern in a SVE Z-register
+// x0: pid
+// x1: register number
+// x2: generation
+function setup_zreg
+ mov x4, x30
+
+ mov x6, x1
+ bl pattern
+ _adrz x0, x6, 2
+ mov x5, x0
+ ldr x1, =scratch
+ bl memcpy
+
+ mov x0, x6
+ mov x1, x5
+ bl setz
+
+ ret x4
+endfunction
+
+// Set up test pattern in a SVE P-register
+// x0: pid
+// x1: register number
+// x2: generation
+function setup_preg
+ mov x4, x30
+
+ mov x6, x1
+ bl pattern
+ _adrp x0, x6, 2
+ mov x5, x0
+ ldr x1, =scratch
+ bl memcpy
+
+ mov x0, x6
+ mov x1, x5
+ bl setp
+
+ ret x4
+endfunction
+
+// Set up test pattern in the FFR
+// x0: pid
+// x2: generation
+//
+// We need to generate a canonical FFR value, which consists of a number of
+// low "1" bits, followed by a number of zeros. This gives us 17 unique values
+// per 16 bits of FFR, so we create a 4 bit signature out of the PID and
+// generation, and use that as the initial number of ones in the pattern.
+// We fill the upper lanes of FFR with zeros.
+// Beware: corrupts P0.
+function setup_ffr
+ mov x4, x30 // keep the return address across the calls below
+
+ // w0 = 4-bit signature: pid[1:0] in bits 1:0, generation[1:0] in
+ // bits 3:2
+ and w0, w0, #0x3
+ bfi w0, w2, #2, #2
+ // w1 = (1 << signature) - 1, i.e. that many low one bits
+ mov w1, #1
+ lsl w1, w1, w0
+ sub w1, w1, #1
+
+ // Shadow copy: the 16-bit pattern first, then zeros for the rest of
+ // the area (vector length in bytes / 8, minus the 2 bytes written)
+ ldr x0, =ffrref
+ strh w1, [x0], 2
+ rdvl x1, #1
+ lsr x1, x1, #3
+ sub x1, x1, #2
+ bl memclr
+
+ // Load the shadow into P0, then copy P0 into the FFR
+ mov x0, #0
+ ldr x1, =ffrref
+ bl setp
+
+ wrffr p0.b
+
+ ret x4
+endfunction
+
+// Fill x1 bytes starting at x0 with 0xae (for canary purposes)
+// Clobbers x1, x2.
+function memfill_ae
+ mov w2, #0xae
+ b memfill
+endfunction
+
+// Fill x1 bytes starting at x0 with 0.
+// Clobbers x1, x2.
+function memclr
+ mov w2, #0
+endfunction
+ // fall through to memfill
+
+// Trivial memory fill: fill x1 bytes starting at address x0 with byte w2
+// Clobbers x1
+function memfill
+ cmp x1, #0
+ b.eq 1f
+
+0: strb w2, [x0], #1
+ subs x1, x1, #1
+ b.ne 0b
+
+1: ret
+endfunction
+
+// Trivial memory compare: compare x2 bytes starting at address x0 with
+// bytes starting at address x1.
+// Returns only if all bytes match; otherwise, the program is aborted.
+// Clobbers x0-x5.
+function memcmp
+ cbz x2, 2f
+
+ // Save the arguments: x2 is counted down below, and barf needs all
+ // three original values to dump the buffers.
+ stp x0, x1, [sp, #-0x20]!
+ str x2, [sp, #0x10]
+
+ mov x5, #0 // x5: index of the next byte to compare
+0: ldrb w3, [x0, x5]
+ ldrb w4, [x1, x5]
+ add x5, x5, #1
+ cmp w3, w4
+ b.ne 1f
+ subs x2, x2, #1
+ b.ne 0b
+
+ // The loads below do not change the flags: NE still means that the
+ // last byte compare failed, EQ that the count reached zero.
+1: ldr x2, [sp, #0x10]
+ ldp x0, x1, [sp], #0x20
+ b.ne barf
+
+2: ret
+endfunction
+
+// Verify that a SVE Z-register matches its shadow in memory, else abort
+// x0: reg number
+// Clobbers x0-x7.
+function check_zreg
+ mov x3, x30
+
+ _adrz x5, x0, 6
+ mov x4, x0
+ ldr x7, =scratch
+
+ mov x0, x7
+ mov x1, x6
+ bl memfill_ae
+
+ mov x0, x4
+ mov x1, x7
+ bl getz
+
+ mov x0, x5
+ mov x1, x7
+ mov x2, x6
+ mov x30, x3
+ b memcmp
+endfunction
+
+// Verify that a SVE P-register matches its shadow in memory, else abort
+// x0: reg number
+// Clobbers x0-x7.
+function check_preg
+ mov x3, x30
+
+ _adrp x5, x0, 6
+ mov x4, x0
+ ldr x7, =scratch
+
+ mov x0, x7
+ mov x1, x6
+ bl memfill_ae
+
+ mov x0, x4
+ mov x1, x7
+ bl getp
+
+ mov x0, x5
+ mov x1, x7
+ mov x2, x6
+ mov x30, x3
+ b memcmp
+endfunction
+
+// Verify that the FFR matches its shadow in memory, else abort
+// Beware -- corrupts P0.
+// Clobbers x0-x5.
+function check_ffr
+ mov x3, x30
+
+ ldr x4, =scratch
+ rdvl x5, #1
+ lsr x5, x5, #3
+
+ mov x0, x4
+ mov x1, x5
+ bl memfill_ae
+
+ rdffr p0.b
+ mov x0, #0
+ mov x1, x4
+ bl getp
+
+ ldr x0, =ffrref
+ mov x1, x4
+ mov x2, x5
+ mov x30, x3
+ b memcmp
+endfunction
+
+// Any SVE register modified here can cause corruption in the main
+// thread -- but *only* the registers modified here.
+function irritator_handler
+ // Increment the irritation signal count (x23):
+ ldr x0, [x2, #ucontext_regs + 8 * 23]
+ add x0, x0, #1
+ str x0, [x2, #ucontext_regs + 8 * 23]
+
+ // Corrupt some random Z-regs
+ adr x0, .text + (irritator_handler - .text) / 16 * 16
+ movi v0.8b, #1
+ movi v9.16b, #2
+ movi v31.8b, #3
+ // And P0
+ rdffr p0.b
+ // And FFR
+ wrffr p15.b
+
+ ret
+endfunction
+
+function terminate_handler
+ mov w21, w0
+ mov x20, x2
+
+ puts "Terminated by signal "
+ mov w0, w21
+ bl putdec
+ puts ", no error, iterations="
+ ldr x0, [x20, #ucontext_regs + 8 * 22]
+ bl putdec
+ puts ", signals="
+ ldr x0, [x20, #ucontext_regs + 8 * 23]
+ bl putdecn
+
+ mov x0, #0
+ mov x8, #__NR_exit
+ svc #0
+endfunction
+
+// w0: signal number
+// x1: sa_action
+// w2: sa_flags
+// Clobbers x0-x6,x8
+function setsignal
+ str x30, [sp, #-((sa_sz + 15) / 16 * 16 + 16)]!
+
+ mov w4, w0
+ mov x5, x1
+ mov w6, w2
+
+ add x0, sp, #16
+ mov x1, #sa_sz
+ bl memclr
+
+ mov w0, w4
+ add x1, sp, #16
+ str w6, [x1, #sa_flags]
+ str x5, [x1, #sa_handler]
+ mov x2, #0
+ mov x3, #sa_mask_sz
+ mov x8, #__NR_rt_sigaction
+ svc #0
+
+ cbz w0, 1f
+
+ puts "sigaction failure\n"
+ b .Labort
+
+1: ldr x30, [sp], #((sa_sz + 15) / 16 * 16 + 16)
+ ret
+endfunction
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+ // Sanity-check and report the vector length
+
+ rdvl x19, #8
+ cmp x19, #128
+ b.lo 1f
+ cmp x19, #2048
+ b.hi 1f
+ tst x19, #(8 - 1)
+ b.eq 2f
+
+1: puts "Bad vector length: "
+ mov x0, x19
+ bl putdecn
+ b .Labort
+
+2: puts "Vector length:\t"
+ mov x0, x19
+ bl putdec
+ puts " bits\n"
+
+ // Obtain our PID, to ensure test pattern uniqueness between processes
+
+ mov x8, #__NR_getpid
+ svc #0
+ mov x20, x0
+
+ puts "PID:\t"
+ mov x0, x20
+ bl putdecn
+
+ mov x23, #0 // Irritation signal count
+
+ mov w0, #SIGINT
+ adr x1, terminate_handler
+ mov w2, #SA_SIGINFO
+ bl setsignal
+
+ mov w0, #SIGTERM
+ adr x1, terminate_handler
+ mov w2, #SA_SIGINFO
+ bl setsignal
+
+ mov w0, #SIGUSR1
+ adr x1, irritator_handler
+ mov w2, #SA_SIGINFO
+ orr w2, w2, #SA_NODEFER
+ bl setsignal
+
+ mov x22, #0 // generation number, increments per iteration
+.Ltest_loop:
+ rdvl x0, #8
+ cmp x0, x19
+ b.ne vl_barf
+
+ mov x21, #0 // Set up Z-regs & shadow with test pattern
+0: mov x0, x20
+ mov x1, x21
+ and x2, x22, #0xf
+ bl setup_zreg
+ add x21, x21, #1
+ cmp x21, #NZR
+ b.lo 0b
+
+ mov x0, x20 // Set up FFR & shadow with test pattern
+ mov x1, #NZR + NPR
+ and x2, x22, #0xf
+ bl setup_ffr
+
+0: mov x0, x20 // Set up P-regs & shadow with test pattern
+ mov x1, x21
+ and x2, x22, #0xf
+ bl setup_preg
+ add x21, x21, #1
+ cmp x21, #NZR + NPR
+ b.lo 0b
+
+// Can't do this when SVE state is volatile across SVC:
+// mov x8, #__NR_sched_yield // Encourage preemption
+// svc #0
+
+ mov x21, #0
+0: mov x0, x21
+ bl check_zreg
+ add x21, x21, #1
+ cmp x21, #NZR
+ b.lo 0b
+
+0: mov x0, x21
+ bl check_preg
+ add x21, x21, #1
+ cmp x21, #NZR + NPR
+ b.lo 0b
+
+ bl check_ffr
+
+ add x22, x22, #1
+ b .Ltest_loop
+
+.Labort:
+ mov x0, #0
+ mov x1, #SIGABRT
+ mov x8, #__NR_kill
+ svc #0
+endfunction
+
+// Report a mismatch between a register and its shadow, then send SIGABRT
+// to this process.
+// x0: expected data
+// x1: actual data
+// x2: data size in bytes
+// Also reads the test loop's state: x20 (PID), x21 (register number being
+// checked) and x22 (iteration count).
+function barf
+// fpsimd.c activity log dump hack
+// ldr w0, =0xdeadc0de
+// mov w8, #__NR_exit
+// svc #0
+// end hack
+ // Keep the arguments in registers that the print helpers leave alone
+ mov x10, x0 // expected data
+ mov x11, x1 // actual data
+ mov x12, x2 // data size
+
+ puts "Mistatch: PID="
+ mov x0, x20
+ bl putdec
+ puts ", iteration="
+ mov x0, x22
+ bl putdec
+ puts ", reg="
+ mov x0, x21
+ bl putdecn
+ puts "\tExpected ["
+ mov x0, x10
+ mov x1, x12
+ bl dumphex
+ puts "]\n\tGot ["
+ mov x0, x11
+ mov x1, x12
+ bl dumphex
+ puts "]\n"
+
+ // kill(getpid(), SIGABRT): the getpid result is left in x0
+ mov x8, #__NR_getpid
+ svc #0
+// fpsimd.c activity log dump hack
+// ldr w0, =0xdeadc0de
+// mov w8, #__NR_exit
+// svc #0
+// ^ end of hack
+ mov x1, #SIGABRT
+ mov x8, #__NR_kill
+ svc #0
+// mov x8, #__NR_exit
+// mov x1, #1
+// svc #0
+endfunction
+
+// Report that the active vector length is not the one seen at startup, then
+// terminate the process with exit status 1.
+// x0: the vector length that was read back, in bits
+// Does not return.
+function vl_barf
+ mov x10, x0 // the puts macro below overwrites x0
+
+ puts "Bad active VL: "
+ mov x0, x10
+ bl putdecn
+
+ mov x8, #__NR_exit
+ // The exit status is the first syscall argument, so it goes in x0.
+ // Setting x1 instead would exit with whatever the last write returned.
+ mov x0, #1
+ svc #0
+endfunction
diff --git a/tools/testing/selftests/arm64/fp/vlset.c b/tools/testing/selftests/arm64/fp/vlset.c
new file mode 100644
index 000000000..308d27a68
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/vlset.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2015-2019 ARM Limited.
+ * Original author: Dave Martin <Dave.Martin@arm.com>
+ */
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <asm/hwcap.h>
+#include <asm/sigcontext.h>
+
+static int inherit = 0;
+static int no_inherit = 0;
+static int force = 0;
+static unsigned long vl;
+
+static const struct option options[] = {
+ { "force", no_argument, NULL, 'f' },
+ { "inherit", no_argument, NULL, 'i' },
+ { "max", no_argument, NULL, 'M' },
+ { "no-inherit", no_argument, &no_inherit, 1 },
+ { "help", no_argument, NULL, '?' },
+ {}
+};
+
+static char const *program_name;
+
+/*
+ * Parse the command line into the file-scope option state (inherit,
+ * no_inherit, force, vl, program_name) and leave optind pointing at the
+ * command to execute.
+ *
+ * The vector length comes either from -M/--max (SVE_VL_MAX) or from the
+ * first non-option argument, parsed by strtoul() with base auto-detection.
+ *
+ * Returns 0 on success.  On any usage error the usage line is printed to
+ * stderr and -1 is returned.
+ */
+static int parse_options(int argc, char **argv)
+{
+	int c;
+	char *rest;
+
+	/* Use the basename of argv[0] in diagnostics */
+	program_name = strrchr(argv[0], '/');
+	if (program_name)
+		++program_name;
+	else
+		program_name = argv[0];
+
+	/* -h, --help and unknown options all fall through to the usage text */
+	while ((c = getopt_long(argc, argv, "Mfhi", options, NULL)) != -1)
+		switch (c) {
+		case 'M':	vl = SVE_VL_MAX; break;
+		case 'f':	force = 1; break;
+		case 'i':	inherit = 1; break;
+		case 0:		break;	/* --no-inherit: flag set by getopt_long */
+		default:	goto error;
+		}
+
+	if (inherit && no_inherit)
+		goto error;
+
+	if (!vl) {
+		/* vector length */
+		if (optind >= argc)
+			goto error;
+
+		errno = 0;
+		vl = strtoul(argv[optind], &rest, 0);
+		/*
+		 * Reject trailing junk and also the case where no digits were
+		 * consumed at all (e.g. an empty argument), which strtoul()
+		 * reports only by leaving rest at the start of the string.
+		 */
+		if (rest == argv[optind] || *rest) {
+			vl = ULONG_MAX;
+			errno = EINVAL;
+		}
+		if (vl == ULONG_MAX && errno) {
+			fprintf(stderr, "%s: %s: %s\n",
+				program_name, argv[optind], strerror(errno));
+			goto error;
+		}
+
+		++optind;
+	}
+
+	/* command */
+	if (optind >= argc)
+		goto error;
+
+	return 0;
+
+error:
+	fprintf(stderr,
+		"Usage: %s [-f | --force] "
+		"[-i | --inherit | --no-inherit] "
+		"{-M | --max | <vector length>} "
+		"<command> [<arguments> ...]\n",
+		program_name);
+	return -1;
+}
+
+/*
+ * Set the SVE vector length to take effect on exec, then exec <command>.
+ *
+ * Exit status follows sh(1) conventions: 2 for usage errors, 127 when the
+ * command is not found, 126 for every other failure.  On success execvp()
+ * does not return.
+ */
+int main(int argc, char **argv)
+{
+ int ret = 126; /* same as sh(1) command-not-executable error */
+ long flags;
+ char *path;
+ int t, e;
+
+ if (parse_options(argc, argv))
+ return 2; /* same as sh(1) builtin incorrect-usage */
+
+ /* Reject any bits that fall outside the prctl length field */
+ if (vl & ~(vl & PR_SVE_VL_LEN_MASK)) {
+ fprintf(stderr, "%s: Invalid vector length %lu\n",
+ program_name, vl);
+ return 2; /* same as sh(1) builtin incorrect-usage */
+ }
+
+ if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+ fprintf(stderr, "%s: Scalable Vector Extension not present\n",
+ program_name);
+
+ if (!force)
+ goto error;
+
+ fputs("Going ahead anyway (--force): "
+ "This is a debug option. Don't rely on it.\n",
+ stderr);
+ }
+
+ flags = PR_SVE_SET_VL_ONEXEC;
+ if (inherit)
+ flags |= PR_SVE_VL_INHERIT;
+
+ t = prctl(PR_SVE_SET_VL, vl | flags);
+ if (t < 0) {
+ fprintf(stderr, "%s: PR_SVE_SET_VL: %s\n",
+ program_name, strerror(errno));
+ goto error;
+ }
+
+ t = prctl(PR_SVE_GET_VL);
+ if (t == -1) {
+ fprintf(stderr, "%s: PR_SVE_GET_VL: %s\n",
+ program_name, strerror(errno));
+ goto error;
+ }
+ /*
+ * NOTE(review): the value computed into flags here is never read
+ * afterwards; only the error check above has any effect.
+ */
+ flags = PR_SVE_VL_LEN_MASK;
+ flags = t & ~flags;
+
+ assert(optind < argc);
+ path = argv[optind];
+
+ execvp(path, &argv[optind]);
+ /* Only reached if execvp() failed; save errno for the message below */
+ e = errno;
+ if (errno == ENOENT)
+ ret = 127; /* same as sh(1) not-found error */
+ fprintf(stderr, "%s: %s: %s\n", program_name, path, strerror(e));
+
+error:
+ /* ret is 126 unless the exec failed with ENOENT (127) */
+ return ret; /* same as sh(1) not-executable error */
+}
diff --git a/tools/testing/selftests/arm64/mte/.gitignore b/tools/testing/selftests/arm64/mte/.gitignore
new file mode 100644
index 000000000..bc3ac63f3
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/.gitignore
@@ -0,0 +1,6 @@
+check_buffer_fill
+check_tags_inclusion
+check_child_memory
+check_mmap_options
+check_ksm_options
+check_user_mem
diff --git a/tools/testing/selftests/arm64/mte/Makefile b/tools/testing/selftests/arm64/mte/Makefile
new file mode 100644
index 000000000..4084ef108
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/Makefile
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020 ARM Limited
+
+CFLAGS += -std=gnu99 -I.
+# Every .c file except the shared helper mte_common_util.c is one test program
+SRCS := $(filter-out mte_common_util.c,$(wildcard *.c))
+PROGS := $(patsubst %.c,%,$(SRCS))
+
+# Add the MTE compiler option
+CFLAGS += -march=armv8.5-a+memtag
+
+# Check whether the compiler accepts the flags above: preprocess an empty
+# input and set mte_cc_support to 1 only if that command succeeds.
+mte_cc_support := $(shell if ($(CC) $(CFLAGS) -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi)
+
+ifeq ($(mte_cc_support),1)
+# Generated binaries to be installed by top KSFT script
+TEST_GEN_PROGS := $(PROGS)
+
+# Get Kernel headers installed and use them.
+KSFT_KHDR_INSTALL := 1
+endif
+
+# Include KSFT lib.mk.
+include ../../lib.mk
+
+ifeq ($(mte_cc_support),1)
+# Each test program also depends on the shared helper sources
+$(TEST_GEN_PROGS): mte_common_util.c mte_common_util.h mte_helper.S
+endif
diff --git a/tools/testing/selftests/arm64/mte/check_buffer_fill.c b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
new file mode 100644
index 000000000..c9fa141eb
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_buffer_fill.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define OVERFLOW_RANGE MT_GRANULE_SIZE
+
+static int sizes[] = {
+ 1, 555, 1033, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE,
+ /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0
+};
+
+enum mte_block_test_alloc {
+ UNTAGGED_TAGGED,
+ TAGGED_UNTAGGED,
+ TAGGED_TAGGED,
+ BLOCK_ALLOC_MAX,
+};
+
+/*
+ * For every entry in sizes[], allocate a buffer, write each byte of it in
+ * turn and verify that no fault was recorded and the data reads back.
+ *
+ * Returns KSFT_PASS if every size succeeds, KSFT_FAIL on the first
+ * allocation failure, recorded fault or data mismatch.
+ */
+static int check_buffer_by_byte(int mem_type, int mode)
+{
+ char *ptr;
+ int i, j, item;
+ bool err;
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ item = sizeof(sizes)/sizeof(int);
+
+ for (i = 0; i < item; i++) {
+ ptr = (char *)mte_allocate_memory(sizes[i], mem_type, 0, true);
+ if (check_allocated_memory(ptr, sizes[i], mem_type, true) != KSFT_PASS)
+ return KSFT_FAIL;
+ mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[i]);
+ /* Set some value in tagged memory */
+ for (j = 0; j < sizes[i]; j++)
+ ptr[j] = '1';
+ mte_wait_after_trig();
+ /* Any fault recorded for these in-bounds writes is a failure */
+ err = cur_mte_cxt.fault_valid;
+ /* Check the buffer whether it is filled. */
+ for (j = 0; j < sizes[i] && !err; j++) {
+ if (ptr[j] != '1')
+ err = true;
+ }
+ mte_free_memory((void *)ptr, sizes[i], mem_type, true);
+
+ if (err)
+ break;
+ }
+ if (!err)
+ return KSFT_PASS;
+ else
+ return KSFT_FAIL;
+}
+
+/*
+ * For every entry in sizes[], allocate a buffer with underflow_range bytes
+ * of extra range in front of it, then write byte by byte from the end of
+ * the buffer down to index -underflow_range, stopping early once a fault
+ * has been recorded.  The outcome is then checked against what the given
+ * mode is expected to do:
+ *
+ *   MTE_NONE_ERR  - no fault; the whole underflow area must be written.
+ *   MTE_ASYNC_ERR - a fault must be recorded; everything written before
+ *                   the loop stopped must be in place.
+ *   MTE_SYNC_ERR  - a fault must be recorded on the first out-of-bounds
+ *                   write, so ptr[-1] must not have been written.
+ *
+ * Returns KSFT_PASS if every size behaves as expected, else KSFT_FAIL.
+ */
+static int check_buffer_underflow_by_byte(int mem_type, int mode,
+					  int underflow_range)
+{
+	char *ptr;
+	int i, j, item, last_index;
+	bool err;
+	char *und_ptr = NULL;
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	item = sizeof(sizes)/sizeof(int);
+	for (i = 0; i < item; i++) {
+		ptr = (char *)mte_allocate_memory_tag_range(sizes[i], mem_type, 0,
+							    underflow_range, 0);
+		if (check_allocated_memory_range(ptr, sizes[i], mem_type,
+						 underflow_range, 0) != KSFT_PASS)
+			return KSFT_FAIL;
+
+		mte_initialize_current_context(mode, (uintptr_t)ptr, -underflow_range);
+		last_index = 0;
+		/* Set some value in tagged memory and make the buffer underflow */
+		for (j = sizes[i] - 1; (j >= -underflow_range) &&
+				       (cur_mte_cxt.fault_valid == false); j--) {
+			ptr[j] = '1';
+			last_index = j;
+		}
+		mte_wait_after_trig();
+		err = false;
+		/* Check whether the buffer is filled */
+		for (j = 0; j < sizes[i]; j++) {
+			if (ptr[j] != '1') {
+				err = true;
+				/* %lx takes an unsigned long, so convert the pointer */
+				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+						j, (unsigned long)ptr);
+				break;
+			}
+		}
+		if (err)
+			goto check_buffer_underflow_by_byte_err;
+
+		switch (mode) {
+		case MTE_NONE_ERR:
+			if (cur_mte_cxt.fault_valid == true || last_index != -underflow_range) {
+				err = true;
+				break;
+			}
+			/* There were no fault so the underflow area should be filled */
+			und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr - underflow_range);
+			for (j = 0 ; j < underflow_range; j++) {
+				if (und_ptr[j] != '1') {
+					err = true;
+					break;
+				}
+			}
+			break;
+		case MTE_ASYNC_ERR:
+			/* Imprecise fault should occur otherwise return error */
+			if (cur_mte_cxt.fault_valid == false) {
+				err = true;
+				break;
+			}
+			/*
+			 * The imprecise fault is checked after the write to the buffer,
+			 * so the underflow area before the fault should be filled.
+			 */
+			und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr);
+			for (j = last_index ; j < 0 ; j++) {
+				if (und_ptr[j] != '1') {
+					err = true;
+					break;
+				}
+			}
+			break;
+		case MTE_SYNC_ERR:
+			/* Precise fault should occur otherwise return error */
+			if (!cur_mte_cxt.fault_valid || (last_index != (-1))) {
+				err = true;
+				break;
+			}
+			/* Underflow area should not be filled */
+			und_ptr = (char *) MT_CLEAR_TAG((size_t) ptr);
+			if (und_ptr[-1] == '1')
+				err = true;
+			break;
+		default:
+			err = true;
+			break;
+		}
+check_buffer_underflow_by_byte_err:
+		mte_free_memory_tag_range((void *)ptr, sizes[i], mem_type, underflow_range, 0);
+		if (err)
+			break;
+	}
+	return (err ? KSFT_FAIL : KSFT_PASS);
+}
+
+/*
+ * For every entry in sizes[], allocate a buffer with overflow_range bytes
+ * of extra range behind it, then write byte by byte from index 0 up to
+ * sizes[i] + overflow_range - 1, stopping early once a fault has been
+ * recorded.  The outcome is then checked against what the given mode is
+ * expected to do:
+ *
+ *   MTE_NONE_ERR  - no fault; the whole overflow area must be written.
+ *   MTE_ASYNC_ERR - a fault must be recorded; everything written before
+ *                   the loop stopped must be in place.
+ *   MTE_SYNC_ERR  - a fault must be recorded at the first byte past the
+ *                   granule-aligned buffer; the overflow area must be
+ *                   untouched.
+ *
+ * Returns KSFT_PASS if every size behaves as expected, else KSFT_FAIL.
+ */
+static int check_buffer_overflow_by_byte(int mem_type, int mode,
+					 int overflow_range)
+{
+	char *ptr;
+	int i, j, item, last_index;
+	bool err;
+	size_t tagged_size, overflow_size;
+	char *over_ptr = NULL;
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	item = sizeof(sizes)/sizeof(int);
+	for (i = 0; i < item; i++) {
+		ptr = (char *)mte_allocate_memory_tag_range(sizes[i], mem_type, 0,
+							    0, overflow_range);
+		if (check_allocated_memory_range(ptr, sizes[i], mem_type,
+						 0, overflow_range) != KSFT_PASS)
+			return KSFT_FAIL;
+
+		/* The buffer size rounded up by MT_ALIGN_UP */
+		tagged_size = MT_ALIGN_UP(sizes[i]);
+
+		mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[i] + overflow_range);
+
+		/* Set some value in tagged memory and make the buffer overflow */
+		for (j = 0, last_index = 0 ; (j < (sizes[i] + overflow_range)) &&
+					     (cur_mte_cxt.fault_valid == false); j++) {
+			ptr[j] = '1';
+			last_index = j;
+		}
+		mte_wait_after_trig();
+		err = false;
+		/* Check whether the buffer is filled */
+		for (j = 0; j < sizes[i]; j++) {
+			if (ptr[j] != '1') {
+				err = true;
+				/* %lx takes an unsigned long, so convert the pointer */
+				ksft_print_msg("Buffer is not filled at index:%d of ptr:0x%lx\n",
+						j, (unsigned long)ptr);
+				break;
+			}
+		}
+		if (err)
+			goto check_buffer_overflow_by_byte_err;
+
+		/* Part of the overflow range lying beyond the aligned buffer */
+		overflow_size = overflow_range - (tagged_size - sizes[i]);
+
+		switch (mode) {
+		case MTE_NONE_ERR:
+			if ((cur_mte_cxt.fault_valid == true) ||
+			    (last_index != (sizes[i] + overflow_range - 1))) {
+				err = true;
+				break;
+			}
+			/* There were no fault so the overflow area should be filled */
+			over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr + tagged_size);
+			for (j = 0 ; j < overflow_size; j++) {
+				if (over_ptr[j] != '1') {
+					err = true;
+					break;
+				}
+			}
+			break;
+		case MTE_ASYNC_ERR:
+			/* Imprecise fault should occur otherwise return error */
+			if (cur_mte_cxt.fault_valid == false) {
+				err = true;
+				break;
+			}
+			/*
+			 * The imprecise fault is checked after the write to the buffer,
+			 * so the overflow area should be filled before the fault.
+			 */
+			over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr);
+			for (j = tagged_size ; j < last_index; j++) {
+				if (over_ptr[j] != '1') {
+					err = true;
+					break;
+				}
+			}
+			break;
+		case MTE_SYNC_ERR:
+			/* Precise fault should occur otherwise return error */
+			if (!cur_mte_cxt.fault_valid || (last_index != tagged_size)) {
+				err = true;
+				break;
+			}
+			/* Overflow area should not be filled */
+			over_ptr = (char *) MT_CLEAR_TAG((size_t) ptr + tagged_size);
+			for (j = 0 ; j < overflow_size; j++) {
+				if (over_ptr[j] == '1')
+					err = true;
+			}
+			break;
+		default:
+			err = true;
+			break;
+		}
+check_buffer_overflow_by_byte_err:
+		mte_free_memory_tag_range((void *)ptr, sizes[i], mem_type, 0, overflow_range);
+		if (err)
+			break;
+	}
+	return (err ? KSFT_FAIL : KSFT_PASS);
+}
+
+/*
+ * Copy a block of the given size with memset()/memcpy() between a source
+ * and a destination buffer, once for each combination in
+ * enum mte_block_test_alloc, and verify that no fault is recorded and that
+ * the destination matches the source.
+ *
+ * Returns KSFT_PASS if every combination succeeds, KSFT_FAIL otherwise.
+ */
+static int check_buffer_by_block_iterate(int mem_type, int mode, size_t size)
+{
+ char *src, *dst;
+ int j, result = KSFT_PASS;
+ enum mte_block_test_alloc alloc_type = UNTAGGED_TAGGED;
+
+ for (alloc_type = UNTAGGED_TAGGED; alloc_type < (int) BLOCK_ALLOC_MAX; alloc_type++) {
+ /* Allocate src/dst; on a failed second allocation release the first */
+ switch (alloc_type) {
+ case UNTAGGED_TAGGED:
+ src = (char *)mte_allocate_memory(size, mem_type, 0, false);
+ if (check_allocated_memory(src, size, mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ dst = (char *)mte_allocate_memory(size, mem_type, 0, true);
+ if (check_allocated_memory(dst, size, mem_type, true) != KSFT_PASS) {
+ mte_free_memory((void *)src, size, mem_type, false);
+ return KSFT_FAIL;
+ }
+
+ break;
+ case TAGGED_UNTAGGED:
+ dst = (char *)mte_allocate_memory(size, mem_type, 0, false);
+ if (check_allocated_memory(dst, size, mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ src = (char *)mte_allocate_memory(size, mem_type, 0, true);
+ if (check_allocated_memory(src, size, mem_type, true) != KSFT_PASS) {
+ mte_free_memory((void *)dst, size, mem_type, false);
+ return KSFT_FAIL;
+ }
+ break;
+ case TAGGED_TAGGED:
+ src = (char *)mte_allocate_memory(size, mem_type, 0, true);
+ if (check_allocated_memory(src, size, mem_type, true) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ dst = (char *)mte_allocate_memory(size, mem_type, 0, true);
+ if (check_allocated_memory(dst, size, mem_type, true) != KSFT_PASS) {
+ mte_free_memory((void *)src, size, mem_type, true);
+ return KSFT_FAIL;
+ }
+ break;
+ default:
+ return KSFT_FAIL;
+ }
+
+ cur_mte_cxt.fault_valid = false;
+ result = KSFT_PASS;
+ mte_initialize_current_context(mode, (uintptr_t)dst, size);
+ /* Set some value in memory and copy*/
+ memset((void *)src, (int)'1', size);
+ memcpy((void *)dst, (void *)src, size);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid) {
+ result = KSFT_FAIL;
+ goto check_buffer_by_block_err;
+ }
+ /* Check the buffer whether it is filled. */
+ for (j = 0; j < size; j++) {
+ if (src[j] != dst[j] || src[j] != '1') {
+ result = KSFT_FAIL;
+ break;
+ }
+ }
+check_buffer_by_block_err:
+ /* A non-zero tag in the pointer tells whether the buffer was tagged */
+ mte_free_memory((void *)src, size, mem_type,
+ MT_FETCH_TAG((uintptr_t)src) ? true : false);
+ mte_free_memory((void *)dst, size, mem_type,
+ MT_FETCH_TAG((uintptr_t)dst) ? true : false);
+ if (result != KSFT_PASS)
+ return result;
+ }
+ return result;
+}
+
+/*
+ * Run the block-copy check for every entry in sizes[] under the given
+ * mode, stopping at the first size that does not pass.
+ *
+ * Returns KSFT_PASS if all sizes pass, otherwise the failing result.
+ */
+static int check_buffer_by_block(int mem_type, int mode)
+{
+	const int count = sizeof(sizes) / sizeof(sizes[0]);
+	int idx = 0;
+	int status;
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	cur_mte_cxt.fault_valid = false;
+	while (idx < count) {
+		status = check_buffer_by_block_iterate(mem_type, mode, sizes[idx]);
+		if (status != KSFT_PASS)
+			return status;
+		idx++;
+	}
+	return KSFT_PASS;
+}
+
+/*
+ * Walk the range [ptr, ptr + size) in MT_GRANULE_SIZE steps and compare the
+ * tag fetched for each step against the expected tag.
+ *
+ * Returns KSFT_PASS if every tag matches, KSFT_FAIL (after logging) on the
+ * first mismatch.
+ */
+static int compare_memory_tags(char *ptr, size_t size, int tag)
+{
+	int offset = 0;
+
+	while (offset < size) {
+		int found = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + offset)));
+
+		if (found != tag) {
+			ksft_print_msg("FAIL: child mte tag mismatch\n");
+			return KSFT_FAIL;
+		}
+		offset += MT_GRANULE_SIZE;
+	}
+	return KSFT_PASS;
+}
+
+/*
+ * For every entry in sizes[], check that freshly allocated memory carries
+ * tag 0, first for an anonymous mapping and then for a mapping backed by a
+ * temporary file.
+ *
+ * Returns KSFT_PASS if all tags are 0, KSFT_FAIL on any allocation failure,
+ * temp-file failure or non-zero tag.
+ */
+static int check_memory_initial_tags(int mem_type, int mode, int mapping)
+{
+ char *ptr;
+ int run, fd;
+ int total = sizeof(sizes)/sizeof(int);
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ for (run = 0; run < total; run++) {
+ /* check initial tags for anonymous mmap */
+ ptr = (char *)mte_allocate_memory(sizes[run], mem_type, mapping, false);
+ if (check_allocated_memory(ptr, sizes[run], mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+ if (compare_memory_tags(ptr, sizes[run], 0) != KSFT_PASS) {
+ mte_free_memory((void *)ptr, sizes[run], mem_type, false);
+ return KSFT_FAIL;
+ }
+ mte_free_memory((void *)ptr, sizes[run], mem_type, false);
+
+ /* check initial tags for file mmap */
+ fd = create_temp_file();
+ if (fd == -1)
+ return KSFT_FAIL;
+ ptr = (char *)mte_allocate_file_memory(sizes[run], mem_type, mapping, false, fd);
+ if (check_allocated_memory(ptr, sizes[run], mem_type, false) != KSFT_PASS) {
+ close(fd);
+ return KSFT_FAIL;
+ }
+ if (compare_memory_tags(ptr, sizes[run], 0) != KSFT_PASS) {
+ mte_free_memory((void *)ptr, sizes[run], mem_type, false);
+ close(fd);
+ return KSFT_FAIL;
+ }
+ mte_free_memory((void *)ptr, sizes[run], mem_type, false);
+ close(fd);
+ }
+ return KSFT_PASS;
+}
+
+/*
+ * Test driver: fills in the page-size dependent entries of sizes[], sets up
+ * MTE and the SIGSEGV handler, then runs the 20 planned buffer tests.
+ *
+ * Returns KSFT_PASS when no test failed, KSFT_FAIL otherwise, or the error
+ * from mte_default_setup() if setup fails.
+ */
+int main(int argc, char *argv[])
+{
+ int err;
+ size_t page_size = getpagesize();
+ int item = sizeof(sizes)/sizeof(int);
+
+ /* The last three sizes[] slots are placeholders for these values */
+ sizes[item - 3] = page_size - 1;
+ sizes[item - 2] = page_size;
+ sizes[item - 1] = page_size + 1;
+
+ err = mte_default_setup();
+ if (err)
+ return err;
+
+ /* Register SIGSEGV handler */
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ /* Set test plan */
+ ksft_set_plan(20);
+
+ /* Buffer by byte tests */
+ evaluate_test(check_buffer_by_byte(USE_MMAP, MTE_SYNC_ERR),
+ "Check buffer correctness by byte with sync err mode and mmap memory\n");
+ evaluate_test(check_buffer_by_byte(USE_MMAP, MTE_ASYNC_ERR),
+ "Check buffer correctness by byte with async err mode and mmap memory\n");
+ evaluate_test(check_buffer_by_byte(USE_MPROTECT, MTE_SYNC_ERR),
+ "Check buffer correctness by byte with sync err mode and mmap/mprotect memory\n");
+ evaluate_test(check_buffer_by_byte(USE_MPROTECT, MTE_ASYNC_ERR),
+ "Check buffer correctness by byte with async err mode and mmap/mprotect memory\n");
+
+ /* Check buffer underflow with underflow size as 16 */
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_SYNC_ERR, MT_GRANULE_SIZE),
+ "Check buffer write underflow by byte with sync mode and mmap memory\n");
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, MT_GRANULE_SIZE),
+ "Check buffer write underflow by byte with async mode and mmap memory\n");
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_NONE_ERR, MT_GRANULE_SIZE),
+ "Check buffer write underflow by byte with tag check fault ignore and mmap memory\n");
+
+ /*
+ * Check buffer underflow with underflow size as page size.
+ * NOTE(review): these three descriptions are identical to the three
+ * granule-sized underflow tests above, so the two groups cannot be
+ * told apart in the test output.
+ */
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_SYNC_ERR, page_size),
+ "Check buffer write underflow by byte with sync mode and mmap memory\n");
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, page_size),
+ "Check buffer write underflow by byte with async mode and mmap memory\n");
+ evaluate_test(check_buffer_underflow_by_byte(USE_MMAP, MTE_NONE_ERR, page_size),
+ "Check buffer write underflow by byte with tag check fault ignore and mmap memory\n");
+
+ /* Check buffer overflow with overflow size as 16 */
+ evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_SYNC_ERR, MT_GRANULE_SIZE),
+ "Check buffer write overflow by byte with sync mode and mmap memory\n");
+ evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_ASYNC_ERR, MT_GRANULE_SIZE),
+ "Check buffer write overflow by byte with async mode and mmap memory\n");
+ evaluate_test(check_buffer_overflow_by_byte(USE_MMAP, MTE_NONE_ERR, MT_GRANULE_SIZE),
+ "Check buffer write overflow by byte with tag fault ignore mode and mmap memory\n");
+
+ /* Buffer by block tests */
+ evaluate_test(check_buffer_by_block(USE_MMAP, MTE_SYNC_ERR),
+ "Check buffer write correctness by block with sync mode and mmap memory\n");
+ evaluate_test(check_buffer_by_block(USE_MMAP, MTE_ASYNC_ERR),
+ "Check buffer write correctness by block with async mode and mmap memory\n");
+ evaluate_test(check_buffer_by_block(USE_MMAP, MTE_NONE_ERR),
+ "Check buffer write correctness by block with tag fault ignore and mmap memory\n");
+
+ /* Initial tags are supposed to be 0 */
+ evaluate_test(check_memory_initial_tags(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check initial tags with private mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_memory_initial_tags(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check initial tags with private mapping, sync error mode and mmap/mprotect memory\n");
+ evaluate_test(check_memory_initial_tags(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED),
+ "Check initial tags with shared mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_memory_initial_tags(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED),
+ "Check initial tags with shared mapping, sync error mode and mmap/mprotect memory\n");
+
+ mte_restore_setup();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_child_memory.c b/tools/testing/selftests/arm64/mte/check_child_memory.c
new file mode 100644
index 000000000..43bd94f85
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_child_memory.c
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/wait.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define BUFFER_SIZE (5 * MT_GRANULE_SIZE)
+#define RUNS (MT_TAG_COUNT)
+#define UNDERFLOW MT_GRANULE_SIZE
+#define OVERFLOW MT_GRANULE_SIZE
+
+static size_t page_size;
+static int sizes[] = {
+ 1, 537, 989, 1269, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE,
+ /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0
+};
+
+/*
+ * Fork a child and verify, inside the child, that the buffer at ptr behaves
+ * as it does in the parent: in-bounds writes record no fault, every granule
+ * carries the same tag as the parent's pointer, and writes to the UNDERFLOW
+ * and OVERFLOW areas around the buffer each record a fault.
+ *
+ * The child reports its result through its exit status.  Returns KSFT_PASS
+ * if the child exited with status 0, KSFT_FAIL otherwise (including fork
+ * failure or a child that did not exit normally).
+ */
+static int check_child_tag_inheritance(char *ptr, int size, int mode)
+{
+ int i, parent_tag, child_tag, fault, child_status;
+ pid_t child;
+
+ parent_tag = MT_FETCH_TAG((uintptr_t)ptr);
+ fault = 0;
+
+ child = fork();
+ if (child == -1) {
+ ksft_print_msg("FAIL: child process creation\n");
+ return KSFT_FAIL;
+ } else if (child == 0) {
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+ /* Do copy on write */
+ memset(ptr, '1', size);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == true) {
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+ for (i = 0 ; i < size ; i += MT_GRANULE_SIZE) {
+ child_tag = MT_FETCH_TAG((uintptr_t)(mte_get_tag_address(ptr + i)));
+ if (parent_tag != child_tag) {
+ ksft_print_msg("FAIL: child mte tag mismatch\n");
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+ }
+ /* Writing below the buffer is expected to record a fault */
+ mte_initialize_current_context(mode, (uintptr_t)ptr, -UNDERFLOW);
+ memset(ptr - UNDERFLOW, '2', UNDERFLOW);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == false) {
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+ /* Writing past the buffer is expected to record a fault */
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size + OVERFLOW);
+ memset(ptr + size, '3', OVERFLOW);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == false) {
+ fault = 1;
+ goto check_child_tag_inheritance_err;
+ }
+check_child_tag_inheritance_err:
+ /* The child never returns to the caller */
+ _exit(fault);
+ }
+ /* Wait for child process to terminate */
+ wait(&child_status);
+ if (WIFEXITED(child_status))
+ fault = WEXITSTATUS(child_status);
+ else
+ fault = 1;
+ return (fault) ? KSFT_FAIL : KSFT_PASS;
+}
+
+/*
+ * For every entry in sizes[], allocate a buffer with UNDERFLOW/OVERFLOW
+ * bytes of extra range around it and run the child tag-inheritance check
+ * on it.
+ *
+ * Returns KSFT_PASS if every size passes, KSFT_FAIL on the first
+ * allocation failure or failed child check.
+ */
+static int check_child_memory_mapping(int mem_type, int mode, int mapping)
+{
+	const int count = sizeof(sizes) / sizeof(sizes[0]);
+	char *buf;
+	int idx, status;
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	for (idx = 0; idx < count; idx++) {
+		buf = (char *)mte_allocate_memory_tag_range(sizes[idx], mem_type, mapping,
+							    UNDERFLOW, OVERFLOW);
+		if (check_allocated_memory_range(buf, sizes[idx], mem_type,
+						 UNDERFLOW, OVERFLOW) != KSFT_PASS)
+			return KSFT_FAIL;
+		status = check_child_tag_inheritance(buf, sizes[idx], mode);
+		/* Release the buffer before acting on the result */
+		mte_free_memory_tag_range((void *)buf, sizes[idx], mem_type, UNDERFLOW, OVERFLOW);
+		if (status == KSFT_FAIL)
+			return status;
+	}
+	return KSFT_PASS;
+}
+
+/*
+ * For every entry in sizes[], map a temporary file large enough for the
+ * buffer plus UNDERFLOW and OVERFLOW bytes, insert tags on the buffer part
+ * and run the child tag-inheritance check on it.
+ *
+ * Returns KSFT_PASS if every size passes, KSFT_FAIL on the first temp-file,
+ * allocation, tag-insertion or child-check failure.
+ */
+static int check_child_file_mapping(int mem_type, int mode, int mapping)
+{
+ char *ptr, *map_ptr;
+ int run, fd, map_size, result = KSFT_PASS;
+ int total = sizeof(sizes)/sizeof(int);
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ for (run = 0; run < total; run++) {
+ fd = create_temp_file();
+ if (fd == -1)
+ return KSFT_FAIL;
+
+ map_size = sizes[run] + OVERFLOW + UNDERFLOW;
+ map_ptr = (char *)mte_allocate_file_memory(map_size, mem_type, mapping, false, fd);
+ if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) {
+ close(fd);
+ return KSFT_FAIL;
+ }
+ /* The buffer under test starts after the underflow area */
+ ptr = map_ptr + UNDERFLOW;
+ mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]);
+ /* Only mte enabled memory will allow tag insertion */
+ ptr = mte_insert_tags((void *)ptr, sizes[run]);
+ if (!ptr || cur_mte_cxt.fault_valid == true) {
+ ksft_print_msg("FAIL: Insert tags on file based memory\n");
+ munmap((void *)map_ptr, map_size);
+ close(fd);
+ return KSFT_FAIL;
+ }
+ result = check_child_tag_inheritance(ptr, sizes[run], mode);
+ mte_clear_tags((void *)ptr, sizes[run]);
+ munmap((void *)map_ptr, map_size);
+ close(fd);
+ if (result != KSFT_PASS)
+ return KSFT_FAIL;
+ }
+ return KSFT_PASS;
+}
+
+/*
+ * Test driver: fills in the page-size dependent entries of sizes[], sets up
+ * MTE and the SIGSEGV/SIGBUS handlers, then runs the 12 planned tests: six
+ * on anonymous memory and six on file-backed memory.
+ *
+ * Returns KSFT_PASS when no test failed, KSFT_FAIL otherwise, or the error
+ * from mte_default_setup() if setup fails.
+ */
+int main(int argc, char *argv[])
+{
+	int err;
+	int item = sizeof(sizes)/sizeof(int);
+
+	page_size = getpagesize();
+	if (!page_size) {
+		ksft_print_msg("ERR: Unable to get page size\n");
+		return KSFT_FAIL;
+	}
+	/* The last three sizes[] slots are placeholders for these values */
+	sizes[item - 3] = page_size - 1;
+	sizes[item - 2] = page_size;
+	sizes[item - 1] = page_size + 1;
+
+	err = mte_default_setup();
+	if (err)
+		return err;
+
+	/* Register SIGSEGV handler */
+	mte_register_signal(SIGSEGV, mte_default_handler);
+	mte_register_signal(SIGBUS, mte_default_handler);
+
+	/* Set test plan */
+	ksft_set_plan(12);
+
+	evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+		"Check child anonymous memory with private mapping, precise mode and mmap memory\n");
+	evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED),
+		"Check child anonymous memory with shared mapping, precise mode and mmap memory\n");
+	evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE),
+		"Check child anonymous memory with private mapping, imprecise mode and mmap memory\n");
+	evaluate_test(check_child_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED),
+		"Check child anonymous memory with shared mapping, imprecise mode and mmap memory\n");
+	evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE),
+		"Check child anonymous memory with private mapping, precise mode and mmap/mprotect memory\n");
+	evaluate_test(check_child_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED),
+		"Check child anonymous memory with shared mapping, precise mode and mmap/mprotect memory\n");
+
+	/*
+	 * File-backed tests.  All six use check_child_file_mapping() so that
+	 * each one exercises what its description says, rather than
+	 * repeating the anonymous-memory tests above.
+	 */
+	evaluate_test(check_child_file_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+		"Check child file memory with private mapping, precise mode and mmap memory\n");
+	evaluate_test(check_child_file_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED),
+		"Check child file memory with shared mapping, precise mode and mmap memory\n");
+	evaluate_test(check_child_file_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE),
+		"Check child file memory with private mapping, imprecise mode and mmap memory\n");
+	evaluate_test(check_child_file_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED),
+		"Check child file memory with shared mapping, imprecise mode and mmap memory\n");
+	evaluate_test(check_child_file_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE),
+		"Check child file memory with private mapping, precise mode and mmap/mprotect memory\n");
+	evaluate_test(check_child_file_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED),
+		"Check child file memory with shared mapping, precise mode and mmap/mprotect memory\n");
+
+	mte_restore_setup();
+	ksft_print_cnts();
+	return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_ksm_options.c b/tools/testing/selftests/arm64/mte/check_ksm_options.c
new file mode 100644
index 000000000..3b23c4d61
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_ksm_options.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define TEST_UNIT 10
+#define PATH_KSM "/sys/kernel/mm/ksm/"
+#define MAX_LOOP 4
+
+static size_t page_sz;
+static unsigned long ksm_sysfs[5];
+
+/*
+ * Read a single unsigned decimal value from the file at path str.
+ *
+ * Returns the parsed value, or 0 (after logging an error) if the file
+ * cannot be opened or no number can be parsed from it.
+ */
+static unsigned long read_sysfs(char *str)
+{
+	FILE *f;
+	unsigned long val = 0;
+
+	f = fopen(str, "r");
+	if (!f) {
+		ksft_print_msg("ERR: missing %s\n", str);
+		return 0;
+	}
+	/* Report a parse failure instead of silently returning 0 */
+	if (fscanf(f, "%lu", &val) != 1) {
+		ksft_print_msg("ERR: parsing %s\n", str);
+		val = 0;
+	}
+	fclose(f);
+	return val;
+}
+
+/*
+ * Write val as an unsigned decimal number to the file at path str.
+ *
+ * Failures to open or to write the file are logged; nothing is returned.
+ */
+static void write_sysfs(char *str, unsigned long val)
+{
+	FILE *f;
+	int failed;
+
+	f = fopen(str, "w");
+	if (!f) {
+		ksft_print_msg("ERR: missing %s\n", str);
+		return;
+	}
+	failed = fprintf(f, "%lu", val) < 0;
+	/*
+	 * stdio buffers the output, so a write that the file rejects may
+	 * only be reported when the stream is flushed by fclose().
+	 */
+	if (fclose(f) == EOF)
+		failed = 1;
+	if (failed)
+		ksft_print_msg("ERR: writing %s\n", str);
+}
+
+/*
+ * Save the current value of five KSM sysfs settings into ksm_sysfs[] and
+ * write the values used by this test.  The array order here must match
+ * mte_ksm_restore().
+ * NOTE(review): read_sysfs() returns 0 when a file is missing, so a failed
+ * read is later "restored" as 0.
+ */
+static void mte_ksm_setup(void)
+{
+ ksm_sysfs[0] = read_sysfs(PATH_KSM "merge_across_nodes");
+ write_sysfs(PATH_KSM "merge_across_nodes", 1);
+ ksm_sysfs[1] = read_sysfs(PATH_KSM "sleep_millisecs");
+ write_sysfs(PATH_KSM "sleep_millisecs", 0);
+ ksm_sysfs[2] = read_sysfs(PATH_KSM "run");
+ write_sysfs(PATH_KSM "run", 1);
+ /* Raise the two limits below by TEST_UNIT over their current values */
+ ksm_sysfs[3] = read_sysfs(PATH_KSM "max_page_sharing");
+ write_sysfs(PATH_KSM "max_page_sharing", ksm_sysfs[3] + TEST_UNIT);
+ ksm_sysfs[4] = read_sysfs(PATH_KSM "pages_to_scan");
+ write_sysfs(PATH_KSM "pages_to_scan", ksm_sysfs[4] + TEST_UNIT);
+}
+
+/*
+ * Write the values saved in ksm_sysfs[] back to the five KSM sysfs
+ * settings, in the same order in which they were saved.
+ */
+static void mte_ksm_restore(void)
+{
+	char *knobs[] = {
+		PATH_KSM "merge_across_nodes",
+		PATH_KSM "sleep_millisecs",
+		PATH_KSM "run",
+		PATH_KSM "max_page_sharing",
+		PATH_KSM "pages_to_scan",
+	};
+	int slot;
+
+	for (slot = 0; slot < 5; slot++)
+		write_sysfs(knobs[slot], ksm_sysfs[slot]);
+}
+
+/*
+ * Wait until the KSM "full_scans" counter has advanced by at least one,
+ * polling once per second and giving up after MAX_LOOP attempts.
+ */
+static void mte_ksm_scan(void)
+{
+ int cur_count = read_sysfs(PATH_KSM "full_scans");
+ int scan_count = cur_count + 1;
+ int max_loop_count = MAX_LOOP;
+
+ while ((cur_count < scan_count) && max_loop_count) {
+ sleep(1);
+ cur_count = read_sysfs(PATH_KSM "full_scans");
+ max_loop_count--;
+ }
+#ifdef DEBUG
+ ksft_print_msg("INFO: pages_shared=%lu pages_sharing=%lu\n",
+ read_sysfs(PATH_KSM "pages_shared"),
+ read_sysfs(PATH_KSM "pages_sharing"));
+#endif
+}
+
+/*
+ * Fill TEST_UNIT pages with identical data, mark them MADV_MERGEABLE, wait
+ * for a KSM scan and check the KSM counters afterwards.
+ *
+ * Returns KSFT_PASS when the counters show the pages were not all merged
+ * (pages_shared < 1 or pages_sharing < TEST_UNIT - 1), KSFT_FAIL otherwise
+ * or when KSM is unavailable, allocation fails or madvise() fails.
+ */
+static int check_madvise_options(int mem_type, int mode, int mapping)
+{
+	char *ptr;
+	int err, ret;
+
+	err = KSFT_FAIL;
+	if (access(PATH_KSM, F_OK) == -1) {
+		ksft_print_msg("ERR: Kernel KSM config not enabled\n");
+		return err;
+	}
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	ptr = mte_allocate_memory(TEST_UNIT * page_sz, mem_type, mapping, true);
+	/*
+	 * NOTE(review): the allocation passes true as its last argument but
+	 * this check passes false; confirm that is intended.
+	 */
+	if (check_allocated_memory(ptr, TEST_UNIT * page_sz, mem_type, false) != KSFT_PASS)
+		return KSFT_FAIL;
+
+	/* Insert same data in all the pages */
+	memset(ptr, 'A', TEST_UNIT * page_sz);
+	ret = madvise(ptr, TEST_UNIT * page_sz, MADV_MERGEABLE);
+	if (ret) {
+		/* Name the advice value that was actually requested */
+		ksft_print_msg("ERR: madvise failed to set MADV_MERGEABLE\n");
+		goto madvise_err;
+	}
+	mte_ksm_scan();
+	/* Tagged pages should not merge */
+	if ((read_sysfs(PATH_KSM "pages_shared") < 1) ||
+	    (read_sysfs(PATH_KSM "pages_sharing") < (TEST_UNIT - 1)))
+		err = KSFT_PASS;
+madvise_err:
+	mte_free_memory(ptr, TEST_UNIT * page_sz, mem_type, true);
+	return err;
+}
+
+/*
+ * Test driver: sets up MTE and the signal handlers, adjusts the KSM sysfs
+ * settings, runs the 4 planned KSM merge tests and restores the settings.
+ *
+ * Returns KSFT_PASS when no test failed, KSFT_FAIL otherwise, or the error
+ * from mte_default_setup() if setup fails.
+ */
+int main(int argc, char *argv[])
+{
+ int err;
+
+ err = mte_default_setup();
+ if (err)
+ return err;
+ page_sz = getpagesize();
+ if (!page_sz) {
+ ksft_print_msg("ERR: Unable to get page size\n");
+ return KSFT_FAIL;
+ }
+ /* Register signal handlers */
+ mte_register_signal(SIGBUS, mte_default_handler);
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ /* Set test plan */
+ ksft_set_plan(4);
+
+ /* Enable KSM */
+ mte_ksm_setup();
+
+ evaluate_test(check_madvise_options(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check KSM mte page merge for private mapping, sync mode and mmap memory\n");
+ evaluate_test(check_madvise_options(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE),
+ "Check KSM mte page merge for private mapping, async mode and mmap memory\n");
+ evaluate_test(check_madvise_options(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED),
+ "Check KSM mte page merge for shared mapping, sync mode and mmap memory\n");
+ evaluate_test(check_madvise_options(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED),
+ "Check KSM mte page merge for shared mapping, async mode and mmap memory\n");
+
+ /* Put the KSM settings back before reporting */
+ mte_ksm_restore();
+ mte_restore_setup();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_mmap_options.c b/tools/testing/selftests/arm64/mte/check_mmap_options.c
new file mode 100644
index 000000000..a04b12c21
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_mmap_options.c
@@ -0,0 +1,266 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define RUNS (MT_TAG_COUNT)
+#define UNDERFLOW MT_GRANULE_SIZE
+#define OVERFLOW MT_GRANULE_SIZE
+#define TAG_CHECK_ON 0
+#define TAG_CHECK_OFF 1
+
+static size_t page_size;
+/*
+ * Buffer sizes exercised by every mapping test in this file. The three
+ * trailing zero entries are placeholders that main() overwrites with
+ * page_size - 1, page_size and page_size + 1 once the page size is known.
+ */
+static int sizes[] = {
+ 1, 537, 989, 1269, MT_GRANULE_SIZE - 1, MT_GRANULE_SIZE,
+ /* page size - 1*/ 0, /* page_size */ 0, /* page size + 1 */ 0
+};
+
+/*
+ * Exercise a tagged buffer with three writes: one inside the buffer, one in
+ * the UNDERFLOW bytes just before it and one in the OVERFLOW bytes just
+ * after it.
+ *
+ * @ptr:       tagged pointer to the start of the buffer
+ * @size:      number of usable bytes at @ptr
+ * @mode:      MTE mode passed on to mte_initialize_current_context()
+ * @tag_check: TAG_CHECK_ON if the out-of-bounds writes must fault,
+ *             TAG_CHECK_OFF if they must not
+ *
+ * Returns KSFT_PASS when the recorded faults match the expectation,
+ * KSFT_FAIL otherwise.
+ */
+static int check_mte_memory(char *ptr, int size, int mode, int tag_check)
+{
+ /* In-bounds write: a fault here is always a failure */
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size);
+ memset(ptr, '1', size);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == true)
+ return KSFT_FAIL;
+
+ /* Underflow write; the negative range marks the region below ptr */
+ mte_initialize_current_context(mode, (uintptr_t)ptr, -UNDERFLOW);
+ memset(ptr - UNDERFLOW, '2', UNDERFLOW);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == false && tag_check == TAG_CHECK_ON)
+ return KSFT_FAIL;
+ if (cur_mte_cxt.fault_valid == true && tag_check == TAG_CHECK_OFF)
+ return KSFT_FAIL;
+
+ /* Overflow write into the bytes that follow the buffer */
+ mte_initialize_current_context(mode, (uintptr_t)ptr, size + OVERFLOW);
+ memset(ptr + size, '3', OVERFLOW);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid == false && tag_check == TAG_CHECK_ON)
+ return KSFT_FAIL;
+ if (cur_mte_cxt.fault_valid == true && tag_check == TAG_CHECK_OFF)
+ return KSFT_FAIL;
+
+ return KSFT_PASS;
+}
+
+/*
+ * For every entry of sizes[], map anonymous memory with room for an
+ * underflow and an overflow area, tag the usable part and verify in-bounds
+ * and out-of-bounds accesses with check_mte_memory().
+ *
+ * @mem_type:  allocation method handed to mte_allocate_memory()
+ * @mode:      MTE tag check mode to switch to
+ * @mapping:   mmap mapping flag (MAP_PRIVATE or MAP_SHARED)
+ * @tag_check: TAG_CHECK_ON or TAG_CHECK_OFF, see check_mte_memory()
+ *
+ * Returns KSFT_PASS if every size passes, KSFT_FAIL on the first failure.
+ */
+static int check_anonymous_memory_mapping(int mem_type, int mode, int mapping, int tag_check)
+{
+	char *ptr, *map_ptr;
+	int run, result, map_size;
+	/* Computed once; the original assigned the same value a second time */
+	int item = sizeof(sizes) / sizeof(sizes[0]);
+
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	for (run = 0; run < item; run++) {
+		map_size = sizes[run] + OVERFLOW + UNDERFLOW;
+		map_ptr = (char *)mte_allocate_memory(map_size, mem_type, mapping, false);
+		if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS)
+			return KSFT_FAIL;
+
+		/* The usable buffer starts after the underflow area */
+		ptr = map_ptr + UNDERFLOW;
+		mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]);
+		/* Only mte enabled memory will allow tag insertion */
+		ptr = mte_insert_tags((void *)ptr, sizes[run]);
+		if (!ptr || cur_mte_cxt.fault_valid == true) {
+			ksft_print_msg("FAIL: Insert tags on anonymous mmap memory\n");
+			munmap((void *)map_ptr, map_size);
+			return KSFT_FAIL;
+		}
+		result = check_mte_memory(ptr, sizes[run], mode, tag_check);
+		/* Always release the mapping before acting on the result */
+		mte_clear_tags((void *)ptr, sizes[run]);
+		mte_free_memory((void *)map_ptr, map_size, mem_type, false);
+		if (result == KSFT_FAIL)
+			return KSFT_FAIL;
+	}
+	return KSFT_PASS;
+}
+
+/*
+ * File-backed counterpart of check_anonymous_memory_mapping(): for every
+ * entry of sizes[] a fresh temporary file is created, mapped with room for
+ * an underflow and an overflow area, tagged and verified with
+ * check_mte_memory().
+ *
+ * Returns KSFT_PASS if every size passes, KSFT_FAIL otherwise.
+ */
+static int check_file_memory_mapping(int mem_type, int mode, int mapping, int tag_check)
+{
+ char *ptr, *map_ptr;
+ int run, fd, map_size;
+ int total = sizeof(sizes)/sizeof(int);
+ int result = KSFT_PASS;
+
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ for (run = 0; run < total; run++) {
+ fd = create_temp_file();
+ if (fd == -1)
+ return KSFT_FAIL;
+
+ map_size = sizes[run] + UNDERFLOW + OVERFLOW;
+ map_ptr = (char *)mte_allocate_file_memory(map_size, mem_type, mapping, false, fd);
+ if (check_allocated_memory(map_ptr, map_size, mem_type, false) != KSFT_PASS) {
+ close(fd);
+ return KSFT_FAIL;
+ }
+ /* The usable buffer starts after the underflow area */
+ ptr = map_ptr + UNDERFLOW;
+ mte_initialize_current_context(mode, (uintptr_t)ptr, sizes[run]);
+ /* Only mte enabled memory will allow tag insertion */
+ ptr = mte_insert_tags((void *)ptr, sizes[run]);
+ if (!ptr || cur_mte_cxt.fault_valid == true) {
+ ksft_print_msg("FAIL: Insert tags on file based memory\n");
+ munmap((void *)map_ptr, map_size);
+ close(fd);
+ return KSFT_FAIL;
+ }
+ result = check_mte_memory(ptr, sizes[run], mode, tag_check);
+ /* Release the mapping and the file before acting on the result */
+ mte_clear_tags((void *)ptr, sizes[run]);
+ munmap((void *)map_ptr, map_size);
+ close(fd);
+ if (result == KSFT_FAIL)
+ break;
+ }
+ return result;
+}
+
+/*
+ * Verify that calling mprotect() without PROT_MTE on tagged memory does not
+ * disable tag checking: after the mprotect() call, check_mte_memory() is
+ * still run with TAG_CHECK_ON. Each size in sizes[] is tested first on an
+ * anonymous mapping and then on a file-backed mapping.
+ *
+ * Returns KSFT_PASS if every size passes, KSFT_FAIL on the first failure.
+ */
+static int check_clear_prot_mte_flag(int mem_type, int mode, int mapping)
+{
+ char *ptr, *map_ptr;
+ int run, prot_flag, result, fd, map_size;
+ int total = sizeof(sizes)/sizeof(int);
+
+ /* Protection flags deliberately without PROT_MTE */
+ prot_flag = PROT_READ | PROT_WRITE;
+ mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+ for (run = 0; run < total; run++) {
+ map_size = sizes[run] + OVERFLOW + UNDERFLOW;
+ /* Anonymous mapping case */
+ ptr = (char *)mte_allocate_memory_tag_range(sizes[run], mem_type, mapping,
+ UNDERFLOW, OVERFLOW);
+ if (check_allocated_memory_range(ptr, sizes[run], mem_type,
+ UNDERFLOW, OVERFLOW) != KSFT_PASS)
+ return KSFT_FAIL;
+ /* ptr points past the underflow area; step back to the mapping start */
+ map_ptr = ptr - UNDERFLOW;
+ /* Try to clear PROT_MTE property and verify it by tag checking */
+ if (mprotect(map_ptr, map_size, prot_flag)) {
+ mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type,
+ UNDERFLOW, OVERFLOW);
+ ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n");
+ return KSFT_FAIL;
+ }
+ result = check_mte_memory(ptr, sizes[run], mode, TAG_CHECK_ON);
+ mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, UNDERFLOW, OVERFLOW);
+ if (result != KSFT_PASS)
+ return KSFT_FAIL;
+
+ /* File-backed mapping case */
+ fd = create_temp_file();
+ if (fd == -1)
+ return KSFT_FAIL;
+ ptr = (char *)mte_allocate_file_memory_tag_range(sizes[run], mem_type, mapping,
+ UNDERFLOW, OVERFLOW, fd);
+ if (check_allocated_memory_range(ptr, sizes[run], mem_type,
+ UNDERFLOW, OVERFLOW) != KSFT_PASS) {
+ close(fd);
+ return KSFT_FAIL;
+ }
+ map_ptr = ptr - UNDERFLOW;
+ /* Try to clear PROT_MTE property and verify it by tag checking */
+ if (mprotect(map_ptr, map_size, prot_flag)) {
+ ksft_print_msg("FAIL: mprotect not ignoring clear PROT_MTE property\n");
+ mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type,
+ UNDERFLOW, OVERFLOW);
+ close(fd);
+ return KSFT_FAIL;
+ }
+ result = check_mte_memory(ptr, sizes[run], mode, TAG_CHECK_ON);
+ mte_free_memory_tag_range((void *)ptr, sizes[run], mem_type, UNDERFLOW, OVERFLOW);
+ close(fd);
+ if (result != KSFT_PASS)
+ return KSFT_FAIL;
+ }
+ return KSFT_PASS;
+}
+
+/*
+ * Test entry point: fills in the page-size dependent entries of sizes[],
+ * installs the fault handlers and runs the anonymous, file-backed and
+ * PROT_MTE-clearing mapping tests in the different MTE modes.
+ */
+int main(int argc, char *argv[])
+{
+ int err;
+ int item = sizeof(sizes)/sizeof(int);
+
+ /* A non-zero setup result is used directly as the exit status */
+ err = mte_default_setup();
+ if (err)
+ return err;
+ page_size = getpagesize();
+ if (!page_size) {
+ ksft_print_msg("ERR: Unable to get page size\n");
+ return KSFT_FAIL;
+ }
+ /* Replace the three placeholder entries at the end of sizes[] */
+ sizes[item - 3] = page_size - 1;
+ sizes[item - 2] = page_size;
+ sizes[item - 1] = page_size + 1;
+
+ /* Register signal handlers */
+ mte_register_signal(SIGBUS, mte_default_handler);
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ /* Set test plan; must match the number of evaluate_test() calls below */
+ ksft_set_plan(22);
+
+ /* With PSTATE.TCO enabled, out-of-bounds accesses must not fault */
+ mte_enable_pstate_tco();
+
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_OFF),
+ "Check anonymous memory with private mapping, sync error mode, mmap memory and tag check off\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_OFF),
+ "Check file memory with private mapping, sync error mode, mmap/mprotect memory and tag check off\n");
+
+ /* With PSTATE.TCO disabled, the "no error" mode must not fault either */
+ mte_disable_pstate_tco();
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_NONE_ERR, MAP_PRIVATE, TAG_CHECK_OFF),
+ "Check anonymous memory with private mapping, no error mode, mmap memory and tag check off\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_NONE_ERR, MAP_PRIVATE, TAG_CHECK_OFF),
+ "Check file memory with private mapping, no error mode, mmap/mprotect memory and tag check off\n");
+
+ /* Anonymous mappings with tag checking expected to fault */
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check anonymous memory with private mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check anonymous memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check anonymous memory with shared mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check anonymous memory with shared mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check anonymous memory with private mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check anonymous memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check anonymous memory with shared mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_anonymous_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check anonymous memory with shared mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+ /* File-backed mappings with tag checking expected to fault */
+ evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check file memory with private mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check file memory with private mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check file memory with shared mapping, sync error mode, mmap memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_SYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check file memory with shared mapping, sync error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check file memory with private mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_PRIVATE, TAG_CHECK_ON),
+ "Check file memory with private mapping, async error mode, mmap/mprotect memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check file memory with shared mapping, async error mode, mmap memory and tag check on\n");
+ evaluate_test(check_file_memory_mapping(USE_MPROTECT, MTE_ASYNC_ERR, MAP_SHARED, TAG_CHECK_ON),
+ "Check file memory with shared mapping, async error mode, mmap/mprotect memory and tag check on\n");
+
+ /* mprotect() without PROT_MTE must leave tag checking in place */
+ evaluate_test(check_clear_prot_mte_flag(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check clear PROT_MTE flags with private mapping, sync error mode and mmap memory\n");
+ evaluate_test(check_clear_prot_mte_flag(USE_MPROTECT, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check clear PROT_MTE flags with private mapping and sync error mode and mmap/mprotect memory\n");
+
+ mte_restore_setup();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_tags_inclusion.c b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
new file mode 100644
index 000000000..deaef1f61
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_tags_inclusion.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/wait.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define BUFFER_SIZE (5 * MT_GRANULE_SIZE)
+#define RUNS (MT_TAG_COUNT * 2)
+#define MTE_LAST_TAG_MASK (0x7FFF)
+
+/*
+ * Check that a tagged pointer can be used to write BUFFER_SIZE bytes without
+ * a fault and, when its tag is non-zero, that a write one byte past the
+ * buffer does fault.
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int verify_mte_pointer_validity(char *ptr, int mode)
+{
+ mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE);
+ /* Check the validity of the tagged pointer */
+ memset((void *)ptr, '1', BUFFER_SIZE);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid)
+ return KSFT_FAIL;
+ /* Proceed further for nonzero tags */
+ if (!MT_FETCH_TAG((uintptr_t)ptr))
+ return KSFT_PASS;
+ mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE + 1);
+ /* Check the validity outside the range */
+ ptr[BUFFER_SIZE] = '2';
+ mte_wait_after_trig();
+ if (!cur_mte_cxt.fault_valid)
+ return KSFT_FAIL;
+ else
+ return KSFT_PASS;
+}
+
+/*
+ * For each tag value below MT_TAG_COUNT, switch to the include mask
+ * MT_INCLUDE_VALID_TAG(tag) and repeatedly insert random tags; the test
+ * fails if the generated tag ever equals the loop's current tag value or
+ * if the tagged pointer fails verify_mte_pointer_validity().
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int check_single_included_tags(int mem_type, int mode)
+{
+ char *ptr;
+ int tag, run, result = KSFT_PASS;
+
+ /* One extra granule is mapped after the buffer for the overflow probe */
+ ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false);
+ if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE,
+ mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ for (tag = 0; (tag < MT_TAG_COUNT) && (result == KSFT_PASS); tag++) {
+ mte_switch_mode(mode, MT_INCLUDE_VALID_TAG(tag));
+ /* Try to catch an excluded tag over a number of tries. */
+ for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) {
+ ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE);
+ /* Check tag value */
+ if (MT_FETCH_TAG((uintptr_t)ptr) == tag) {
+ ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+ MT_FETCH_TAG((uintptr_t)ptr),
+ MT_INCLUDE_VALID_TAG(tag));
+ result = KSFT_FAIL;
+ break;
+ }
+ result = verify_mte_pointer_validity(ptr, mode);
+ }
+ }
+ mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE);
+ return result;
+}
+
+/*
+ * Grow an exclusion mask one tag at a time (bits 0..tag set) and, for each
+ * step, repeatedly insert random tags; the test fails if a generated tag is
+ * numerically below the current loop tag or if the tagged pointer fails
+ * verify_mte_pointer_validity().
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int check_multiple_included_tags(int mem_type, int mode)
+{
+ char *ptr;
+ int tag, run, result = KSFT_PASS;
+ unsigned long excl_mask = 0;
+
+ /* One extra granule is mapped after the buffer for the overflow probe */
+ ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false);
+ if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE,
+ mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ for (tag = 0; (tag < MT_TAG_COUNT - 1) && (result == KSFT_PASS); tag++) {
+ /* Accumulate: every tag up to and including this one is excluded */
+ excl_mask |= 1 << tag;
+ mte_switch_mode(mode, MT_INCLUDE_VALID_TAGS(excl_mask));
+ /* Try to catch an excluded tag over a number of tries. */
+ for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) {
+ ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE);
+ /* Check tag value */
+ if (MT_FETCH_TAG((uintptr_t)ptr) < tag) {
+ ksft_print_msg("FAIL: wrong tag = 0x%x with include mask=0x%x\n",
+ MT_FETCH_TAG((uintptr_t)ptr),
+ MT_INCLUDE_VALID_TAGS(excl_mask));
+ result = KSFT_FAIL;
+ break;
+ }
+ result = verify_mte_pointer_validity(ptr, mode);
+ }
+ }
+ mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE);
+ return result;
+}
+
+/*
+ * Switch to the MT_INCLUDE_TAG_MASK include mask and check, over RUNS
+ * iterations, that every randomly tagged pointer passes
+ * verify_mte_pointer_validity().
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int check_all_included_tags(int mem_type, int mode)
+{
+ char *ptr;
+ int run, result = KSFT_PASS;
+
+ /* One extra granule is mapped after the buffer for the overflow probe */
+ ptr = (char *)mte_allocate_memory(BUFFER_SIZE + MT_GRANULE_SIZE, mem_type, 0, false);
+ if (check_allocated_memory(ptr, BUFFER_SIZE + MT_GRANULE_SIZE,
+ mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ mte_switch_mode(mode, MT_INCLUDE_TAG_MASK);
+ /* Every tag is allowed here, so only pointer validity is checked. */
+ for (run = 0; (run < RUNS) && (result == KSFT_PASS); run++) {
+ ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE);
+ /*
+ * Here tag byte can be between 0x0 to 0xF (full allowed range)
+ * so no need to match so just verify if it is writable.
+ */
+ result = verify_mte_pointer_validity(ptr, mode);
+ }
+ mte_free_memory_tag_range((void *)ptr, BUFFER_SIZE, mem_type, 0, MT_GRANULE_SIZE);
+ return result;
+}
+
+/*
+ * Switch to the MT_EXCLUDE_TAG_MASK include mask and check, over RUNS
+ * iterations, that tag insertion always yields tag 0 and that writing
+ * through the resulting pointer does not fault.
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int check_none_included_tags(int mem_type, int mode)
+{
+ char *ptr;
+ int run;
+
+ ptr = (char *)mte_allocate_memory(BUFFER_SIZE, mem_type, 0, false);
+ if (check_allocated_memory(ptr, BUFFER_SIZE, mem_type, false) != KSFT_PASS)
+ return KSFT_FAIL;
+
+ mte_switch_mode(mode, MT_EXCLUDE_TAG_MASK);
+ /* Try to catch an included tag over a number of tries. */
+ for (run = 0; run < RUNS; run++) {
+ ptr = (char *)mte_insert_tags(ptr, BUFFER_SIZE);
+ /* Here all tags are excluded, so the generated tag value should be 0 */
+ if (MT_FETCH_TAG((uintptr_t)ptr)) {
+ ksft_print_msg("FAIL: included tag value found\n");
+ mte_free_memory((void *)ptr, BUFFER_SIZE, mem_type, true);
+ return KSFT_FAIL;
+ }
+ mte_initialize_current_context(mode, (uintptr_t)ptr, BUFFER_SIZE);
+ /* Check the write validity of the untagged pointer */
+ memset((void *)ptr, '1', BUFFER_SIZE);
+ mte_wait_after_trig();
+ if (cur_mte_cxt.fault_valid)
+ break;
+ }
+ mte_free_memory((void *)ptr, BUFFER_SIZE, mem_type, false);
+ /* fault_valid still holds the outcome of the last write above */
+ if (cur_mte_cxt.fault_valid)
+ return KSFT_FAIL;
+ else
+ return KSFT_PASS;
+}
+
+/*
+ * Test entry point: runs the single, multiple, none and all included-tag
+ * checks in sync mode on mmap memory and reports the results through the
+ * kselftest helpers.
+ */
+int main(int argc, char *argv[])
+{
+ int err;
+
+ /* A non-zero setup result is used directly as the exit status */
+ err = mte_default_setup();
+ if (err)
+ return err;
+
+ /* Register SIGSEGV handler */
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ /* Set test plan; must match the number of evaluate_test() calls below */
+ ksft_set_plan(4);
+
+ evaluate_test(check_single_included_tags(USE_MMAP, MTE_SYNC_ERR),
+ "Check an included tag value with sync mode\n");
+ evaluate_test(check_multiple_included_tags(USE_MMAP, MTE_SYNC_ERR),
+ "Check different included tags value with sync mode\n");
+ evaluate_test(check_none_included_tags(USE_MMAP, MTE_SYNC_ERR),
+ "Check none included tags value with sync mode\n");
+ evaluate_test(check_all_included_tags(USE_MMAP, MTE_SYNC_ERR),
+ "Check all included tags value with sync mode\n");
+
+ mte_restore_setup();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/check_user_mem.c b/tools/testing/selftests/arm64/mte/check_user_mem.c
new file mode 100644
index 000000000..4bfa80f2a
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/check_user_mem.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ucontext.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+static size_t page_sz;
+
+/*
+ * Check how the kernel behaves when read() copies file data into a user
+ * buffer whose second page carries a tag that does not match the pointer.
+ *
+ * A temporary file is filled with 2 * page_sz bytes of 'A'. The first read()
+ * into a consistently tagged buffer must succeed completely. The second
+ * page is then retagged and the read() is repeated: in sync mode it must
+ * come back short without raising a fault, in async mode it must complete
+ * without raising a fault.
+ *
+ * @mem_type: allocation method handed to mte_allocate_memory()
+ * @mode:     MTE_SYNC_ERR or MTE_ASYNC_ERR
+ * @mapping:  mmap mapping flag (MAP_PRIVATE or MAP_SHARED)
+ *
+ * Returns KSFT_PASS or KSFT_FAIL.
+ */
+static int check_usermem_access_fault(int mem_type, int mode, int mapping)
+{
+	int fd, err;
+	size_t i;
+	char val = 'A';
+	size_t len, read_len;
+	void *ptr, *ptr_next;
+
+	err = KSFT_FAIL;
+	len = 2 * page_sz;
+	mte_switch_mode(mode, MTE_ALLOW_NON_ZERO_TAG);
+	fd = create_temp_file();
+	if (fd == -1)
+		return KSFT_FAIL;
+	/* Fill the file; a failed write would make the reads below meaningless */
+	for (i = 0; i < len; i++) {
+		if (write(fd, &val, sizeof(val)) != (ssize_t)sizeof(val)) {
+			ksft_print_msg("FAIL: Unable to fill temporary file\n");
+			close(fd);
+			return KSFT_FAIL;
+		}
+	}
+	lseek(fd, 0, SEEK_SET);
+	ptr = mte_allocate_memory(len, mem_type, mapping, true);
+	if (check_allocated_memory(ptr, len, mem_type, true) != KSFT_PASS) {
+		close(fd);
+		return KSFT_FAIL;
+	}
+	mte_initialize_current_context(mode, (uintptr_t)ptr, len);
+	/* Copy from file into buffer with valid tag */
+	read_len = read(fd, ptr, len);
+	mte_wait_after_trig();
+	if (cur_mte_cxt.fault_valid || read_len < len)
+		goto usermem_acc_err;
+	/* Verify same pattern is read */
+	for (i = 0; i < len; i++)
+		if (((char *)ptr)[i] != val)
+			break;
+	if (i < len)
+		goto usermem_acc_err;
+
+	/* Tag the next half of memory with different value */
+	ptr_next = (void *)((unsigned long)ptr + page_sz);
+	ptr_next = mte_insert_new_tag(ptr_next);
+	mte_set_tag_address_range(ptr_next, page_sz);
+
+	lseek(fd, 0, SEEK_SET);
+	/* Copy from file into buffer with invalid tag */
+	read_len = read(fd, ptr, len);
+	mte_wait_after_trig();
+	/*
+	 * Accessing user memory in kernel with invalid tag should fail in sync
+	 * mode without fault but may not fail in async mode as per the
+	 * implemented MTE userspace support in Arm64 kernel.
+	 */
+	if (mode == MTE_SYNC_ERR &&
+	    !cur_mte_cxt.fault_valid && read_len < len) {
+		err = KSFT_PASS;
+	} else if (mode == MTE_ASYNC_ERR &&
+		   !cur_mte_cxt.fault_valid && read_len == len) {
+		err = KSFT_PASS;
+	}
+usermem_acc_err:
+	mte_free_memory((void *)ptr, len, mem_type, true);
+	close(fd);
+	return err;
+}
+
+/*
+ * Test entry point: runs check_usermem_access_fault() for private and shared
+ * mmap memory in sync and async modes and reports the results through the
+ * kselftest helpers.
+ */
+int main(int argc, char *argv[])
+{
+ int err;
+
+ page_sz = getpagesize();
+ if (!page_sz) {
+ ksft_print_msg("ERR: Unable to get page size\n");
+ return KSFT_FAIL;
+ }
+ /* A non-zero setup result is used directly as the exit status */
+ err = mte_default_setup();
+ if (err)
+ return err;
+
+ /* Register signal handlers */
+ mte_register_signal(SIGSEGV, mte_default_handler);
+
+ /* Set test plan; must match the number of evaluate_test() calls below */
+ ksft_set_plan(4);
+
+ evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_SYNC_ERR, MAP_PRIVATE),
+ "Check memory access from kernel in sync mode, private mapping and mmap memory\n");
+ evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_SYNC_ERR, MAP_SHARED),
+ "Check memory access from kernel in sync mode, shared mapping and mmap memory\n");
+
+ evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_ASYNC_ERR, MAP_PRIVATE),
+ "Check memory access from kernel in async mode, private mapping and mmap memory\n");
+ evaluate_test(check_usermem_access_fault(USE_MMAP, MTE_ASYNC_ERR, MAP_SHARED),
+ "Check memory access from kernel in async mode, shared mapping and mmap memory\n");
+
+ mte_restore_setup();
+ ksft_print_cnts();
+ return ksft_get_fail_cnt() == 0 ? KSFT_PASS : KSFT_FAIL;
+}
diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.c b/tools/testing/selftests/arm64/mte/mte_common_util.c
new file mode 100644
index 000000000..2703bd628
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <linux/auxvec.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+
+#include <asm/hwcap.h>
+
+#include "kselftest.h"
+#include "mte_common_util.h"
+#include "mte_def.h"
+
+#define INIT_BUFFER_SIZE 256
+
+struct mte_fault_cxt cur_mte_cxt;
+static unsigned int mte_cur_mode;
+static unsigned int mte_cur_pstate_tco;
+
+/*
+ * Return true when @addr lies inside the region described by cur_mte_cxt:
+ * [trig_addr, trig_addr + trig_range] for a non-negative range, or
+ * [trig_addr + trig_range, trig_addr] for a negative (underflow) range.
+ * The tag is stripped from trig_addr before comparing.
+ */
+static bool mte_fault_addr_in_range(unsigned long addr)
+{
+	if (cur_mte_cxt.trig_range >= 0)
+		return addr >= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) &&
+		       addr <= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range);
+
+	return addr <= MT_CLEAR_TAG(cur_mte_cxt.trig_addr) &&
+	       addr >= (MT_CLEAR_TAG(cur_mte_cxt.trig_addr) + cur_mte_cxt.trig_range);
+}
+
+/*
+ * Default SIGSEGV/SIGBUS handler of the MTE tests.
+ *
+ * The signal is compared against the expectation stored in cur_mte_cxt and,
+ * when it matches, cur_mte_cxt.fault_valid is set:
+ *  - SEGV_MTEAERR: only the si_code is compared; the pc is left untouched.
+ *  - SEGV_MTESERR: si_code and fault address must match; the pc is advanced
+ *    by 4 so execution resumes after the faulting instruction. A mismatch
+ *    terminates the test with exit(1).
+ *  - any other SIGSEGV code terminates the test with exit(1).
+ *  - SIGBUS: only the fault address is compared; on a match the pc is
+ *    advanced by 4, otherwise the handler just returns.
+ *
+ * @signum: delivered signal number
+ * @si:     signal information (si_addr and si_code are used)
+ * @uc:     ucontext_t of the interrupted code
+ */
+void mte_default_handler(int signum, siginfo_t *si, void *uc)
+{
+	unsigned long addr = (unsigned long)si->si_addr;
+
+	if (signum == SIGSEGV) {
+#ifdef DEBUG
+		/* si_code is an int, so print it with %x rather than %lx */
+		ksft_print_msg("INFO: SIGSEGV signal at pc=%lx, fault addr=%lx, si_code=%x\n",
+			       (unsigned long)((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code);
+#endif
+		if (si->si_code == SEGV_MTEAERR) {
+			if (cur_mte_cxt.trig_si_code == si->si_code)
+				cur_mte_cxt.fault_valid = true;
+			return;
+		}
+		/* Compare the context for precise error */
+		else if (si->si_code == SEGV_MTESERR) {
+			if (cur_mte_cxt.trig_si_code == si->si_code &&
+			    mte_fault_addr_in_range(addr)) {
+				cur_mte_cxt.fault_valid = true;
+				/* Adjust the pc by 4 */
+				((ucontext_t *)uc)->uc_mcontext.pc += 4;
+			} else {
+				ksft_print_msg("Invalid MTE synchronous exception caught!\n");
+				exit(1);
+			}
+		} else {
+			ksft_print_msg("Unknown SIGSEGV exception caught!\n");
+			exit(1);
+		}
+	} else if (signum == SIGBUS) {
+		ksft_print_msg("INFO: SIGBUS signal at pc=%lx, fault addr=%lx, si_code=%x\n",
+			       (unsigned long)((ucontext_t *)uc)->uc_mcontext.pc, addr, si->si_code);
+		if (mte_fault_addr_in_range(addr)) {
+			cur_mte_cxt.fault_valid = true;
+			/* Adjust the pc by 4 */
+			((ucontext_t *)uc)->uc_mcontext.pc += 4;
+		}
+	}
+}
+
+/*
+ * Install @handler as the SA_SIGINFO-style action for @signal with an empty
+ * signal mask. The result of sigaction() is not checked.
+ */
+void mte_register_signal(int signal, void (*handler)(int, siginfo_t *, void *))
+{
+	struct sigaction action;
+
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = SA_SIGINFO;
+	action.sa_sigaction = handler;
+	sigaction(signal, &action, NULL);
+}
+
+/*
+ * Called by the tests right after an access that may trigger a tag fault;
+ * it simply yields the CPU with sched_yield().
+ * NOTE(review): presumably this gives an asynchronous tag check fault a
+ * chance to be delivered before fault_valid is inspected - confirm.
+ */
+void mte_wait_after_trig(void)
+{
+ sched_yield();
+}
+
+/*
+ * Give @ptr a random tag and apply that tag to the memory range starting at
+ * @ptr, with @size rounded up by MT_ALIGN_UP().
+ *
+ * @ptr:  start of the range; must be non-NULL and have none of the
+ *        MT_ALIGN_GRANULE bits set
+ * @size: number of bytes to tag
+ *
+ * Returns the tagged pointer, or NULL if @ptr is NULL or misaligned.
+ */
+void *mte_insert_tags(void *ptr, size_t size)
+{
+	void *tag_ptr;
+	int align_size;
+
+	if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
+		/* %lx needs an unsigned long, not a pointer */
+		ksft_print_msg("FAIL: Addr=%lx: invalid\n", (unsigned long)ptr);
+		return NULL;
+	}
+	align_size = MT_ALIGN_UP(size);
+	tag_ptr = mte_insert_random_tag(ptr);
+	mte_set_tag_address_range(tag_ptr, align_size);
+	return tag_ptr;
+}
+
+/*
+ * Clear the tags of the memory range starting at @ptr, with @size rounded up
+ * by MT_ALIGN_UP(). The tag is stripped from @ptr before the range is
+ * cleared.
+ *
+ * @ptr:  start of the range; must be non-NULL and have none of the
+ *        MT_ALIGN_GRANULE bits set, otherwise a message is printed and
+ *        nothing is cleared
+ * @size: number of bytes to clear
+ */
+void mte_clear_tags(void *ptr, size_t size)
+{
+	if (!ptr || (unsigned long)(ptr) & MT_ALIGN_GRANULE) {
+		/* %lx needs an unsigned long, not a pointer */
+		ksft_print_msg("FAIL: Addr=%lx: invalid\n", (unsigned long)ptr);
+		return;
+	}
+	size = MT_ALIGN_UP(size);
+	ptr = (void *)MT_CLEAR_TAG((unsigned long)ptr);
+	mte_clear_tag_address_range(ptr, size);
+}
+
+/*
+ * Common allocator behind the mte_allocate_*() helpers.
+ *
+ * Allocates @size + @range_before + @range_after bytes:
+ *  - USE_MALLOC:   plain malloc(); the returned pointer is advanced by
+ *                  @range_before.
+ *  - USE_MMAP:     mmap() with PROT_MTE set from the start.
+ *  - USE_MPROTECT: mmap() without PROT_MTE, then mprotect() adds PROT_MTE.
+ *
+ * @mapping: mmap flags from the caller; MAP_PRIVATE is added unless
+ *           MAP_SHARED is set, MAP_ANONYMOUS is added when @fd is -1
+ * @tags:    when true, the @size bytes after @range_before are tagged and
+ *           the tagged pointer to them is returned; when false, the start
+ *           of the mapping is returned untagged
+ * @fd:      file to map, or -1 for anonymous memory
+ *
+ * Returns the pointer described above, or NULL on failure.
+ */
+static void *__mte_allocate_memory_range(size_t size, int mem_type, int mapping,
+					 size_t range_before, size_t range_after,
+					 bool tags, int fd)
+{
+	void *ptr;
+	int prot_flag, map_flag;
+	size_t entire_size = size + range_before + range_after;
+
+	if (mem_type != USE_MALLOC && mem_type != USE_MMAP &&
+	    mem_type != USE_MPROTECT) {
+		ksft_print_msg("FAIL: Invalid allocate request\n");
+		return NULL;
+	}
+	if (mem_type == USE_MALLOC) {
+		ptr = malloc(entire_size);
+		/* Keep a failed allocation NULL instead of NULL + range_before */
+		if (!ptr)
+			return NULL;
+		return (char *)ptr + range_before;
+	}
+
+	prot_flag = PROT_READ | PROT_WRITE;
+	if (mem_type == USE_MMAP)
+		prot_flag |= PROT_MTE;
+
+	map_flag = mapping;
+	if (fd == -1)
+		map_flag = MAP_ANONYMOUS | map_flag;
+	if (!(mapping & MAP_SHARED))
+		map_flag |= MAP_PRIVATE;
+	ptr = mmap(NULL, entire_size, prot_flag, map_flag, fd, 0);
+	if (ptr == MAP_FAILED) {
+		ksft_print_msg("FAIL: mmap allocation\n");
+		return NULL;
+	}
+	if (mem_type == USE_MPROTECT) {
+		if (mprotect(ptr, entire_size, prot_flag | PROT_MTE)) {
+			/* Unmap everything that was mapped, not just @size bytes */
+			munmap(ptr, entire_size);
+			ksft_print_msg("FAIL: mprotect PROT_MTE property\n");
+			return NULL;
+		}
+	}
+	if (tags)
+		ptr = mte_insert_tags(ptr + range_before, size);
+	return ptr;
+}
+
+/*
+ * Allocate anonymous memory (fd = -1) of @size bytes surrounded by
+ * @range_before and @range_after extra bytes, with tagging always enabled.
+ * Thin wrapper around __mte_allocate_memory_range(); returns its result.
+ */
+void *mte_allocate_memory_tag_range(size_t size, int mem_type, int mapping,
+ size_t range_before, size_t range_after)
+{
+ return __mte_allocate_memory_range(size, mem_type, mapping, range_before,
+ range_after, true, -1);
+}
+
+/*
+ * Allocate @size bytes of anonymous memory (fd = -1) with no extra bytes
+ * before or after it; @tags selects whether the memory is tagged.
+ * Thin wrapper around __mte_allocate_memory_range(); returns its result.
+ */
+void *mte_allocate_memory(size_t size, int mem_type, int mapping, bool tags)
+{
+ return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, -1);
+}
+
+/*
+ * Map @size bytes of the file @fd with MTE enabled.
+ *
+ * The file is first grown to @size bytes by writing zero-filled chunks of
+ * INIT_BUFFER_SIZE bytes (plus one final partial chunk) from offset 0, so
+ * that the whole mapping is backed by file data.
+ *
+ * @mem_type: USE_MMAP or USE_MPROTECT; anything else is rejected
+ * @tags:     passed through to __mte_allocate_memory_range()
+ *
+ * Returns the mapped pointer, or NULL on an invalid request, a failed write
+ * or a failed mapping.
+ */
+void *mte_allocate_file_memory(size_t size, int mem_type, int mapping, bool tags, int fd)
+{
+	size_t index;
+	/* Zero-initialized so no indeterminate stack bytes reach the file */
+	char buffer[INIT_BUFFER_SIZE] = { 0 };
+
+	if (mem_type != USE_MPROTECT && mem_type != USE_MMAP) {
+		ksft_print_msg("FAIL: Invalid mmap file request\n");
+		return NULL;
+	}
+	/* Initialize the file for mappable size */
+	lseek(fd, 0, SEEK_SET);
+	for (index = INIT_BUFFER_SIZE; index < size; index += INIT_BUFFER_SIZE) {
+		if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
+			ksft_print_msg("FAIL: Unable to initialize file for mapping\n");
+			return NULL;
+		}
+	}
+	/* Write whatever remains after the last full chunk */
+	index -= INIT_BUFFER_SIZE;
+	if (write(fd, buffer, size - index) != (ssize_t)(size - index)) {
+		ksft_print_msg("FAIL: Unable to initialize file for mapping\n");
+		return NULL;
+	}
+	return __mte_allocate_memory_range(size, mem_type, mapping, 0, 0, tags, fd);
+}
+
+/*
+ * Map the file @fd with MTE enabled, providing @size tagged bytes surrounded
+ * by @range_before and @range_after extra bytes.
+ *
+ * The file is first grown to @size + @range_before + @range_after bytes by
+ * writing zero-filled chunks of INIT_BUFFER_SIZE bytes (plus one final
+ * partial chunk) from offset 0.
+ *
+ * @mem_type: USE_MMAP or USE_MPROTECT; anything else is rejected
+ *
+ * Returns the tagged pointer from __mte_allocate_memory_range(), or NULL on
+ * an invalid request, a failed write or a failed mapping.
+ */
+void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping,
+					 size_t range_before, size_t range_after, int fd)
+{
+	size_t index;
+	/* Zero-initialized so no indeterminate stack bytes reach the file */
+	char buffer[INIT_BUFFER_SIZE] = { 0 };
+	size_t map_size = size + range_before + range_after;
+
+	if (mem_type != USE_MPROTECT && mem_type != USE_MMAP) {
+		ksft_print_msg("FAIL: Invalid mmap file request\n");
+		return NULL;
+	}
+	/* Initialize the file for mappable size */
+	lseek(fd, 0, SEEK_SET);
+	for (index = INIT_BUFFER_SIZE; index < map_size; index += INIT_BUFFER_SIZE) {
+		if (write(fd, buffer, INIT_BUFFER_SIZE) != INIT_BUFFER_SIZE) {
+			ksft_print_msg("FAIL: Unable to initialize file for mapping\n");
+			return NULL;
+		}
+	}
+	/* Write whatever remains after the last full chunk */
+	index -= INIT_BUFFER_SIZE;
+	if (write(fd, buffer, map_size - index) != (ssize_t)(map_size - index)) {
+		ksft_print_msg("FAIL: Unable to initialize file for mapping\n");
+		return NULL;
+	}
+	return __mte_allocate_memory_range(size, mem_type, mapping, range_before,
+					   range_after, true, fd);
+}
+
+/*
+ * Common release path behind the mte_free_*() helpers.
+ *
+ * @ptr points at the usable buffer, i.e. @range_before bytes past the start
+ * of the allocation. malloc memory is handed to free(); mmap/mprotect memory
+ * optionally has its tags cleared (@tags) and is then unmapped together with
+ * the surrounding @range_before and @range_after bytes. Any other @mem_type
+ * only produces a message.
+ */
+static void __mte_free_memory_range(void *ptr, size_t size, int mem_type,
+				    size_t range_before, size_t range_after, bool tags)
+{
+	if (mem_type == USE_MALLOC) {
+		free(ptr - range_before);
+		return;
+	}
+	if (mem_type != USE_MMAP && mem_type != USE_MPROTECT) {
+		ksft_print_msg("FAIL: Invalid free request\n");
+		return;
+	}
+	/* Mapped memory: drop the tags first when asked to, then unmap */
+	if (tags)
+		mte_clear_tags(ptr, size);
+	munmap(ptr - range_before, size + range_before + range_after);
+}
+
+/*
+ * Free memory that has @range_before and @range_after extra bytes around the
+ * @size byte buffer at @ptr; tag clearing is always requested.
+ * Thin wrapper around __mte_free_memory_range().
+ */
+void mte_free_memory_tag_range(void *ptr, size_t size, int mem_type,
+ size_t range_before, size_t range_after)
+{
+ __mte_free_memory_range(ptr, size, mem_type, range_before, range_after, true);
+}
+
+/*
+ * Free @size bytes at @ptr that have no extra bytes before or after them;
+ * @tags selects whether tags are cleared first.
+ * Thin wrapper around __mte_free_memory_range().
+ */
+void mte_free_memory(void *ptr, size_t size, int mem_type, bool tags)
+{
+ __mte_free_memory_range(ptr, size, mem_type, 0, 0, tags);
+}
+
+/*
+ * Arm cur_mte_cxt for the next access that may fault: record the expected
+ * fault address @ptr and @range, derive the expected si_code from @mode
+ * (SEGV_MTESERR for sync, SEGV_MTEAERR for async, 0 otherwise) and reset
+ * fault_valid.
+ */
+void mte_initialize_current_context(int mode, uintptr_t ptr, ssize_t range)
+{
+	unsigned long expected_code;
+
+	switch (mode) {
+	case MTE_SYNC_ERR:
+		expected_code = SEGV_MTESERR;
+		break;
+	case MTE_ASYNC_ERR:
+		expected_code = SEGV_MTEAERR;
+		break;
+	default:
+		expected_code = 0;
+		break;
+	}
+
+	cur_mte_cxt.fault_valid = false;
+	cur_mte_cxt.trig_addr = ptr;
+	cur_mte_cxt.trig_range = range;
+	cur_mte_cxt.trig_si_code = expected_code;
+}
+
+/*
+ * Program the tagged address control of the calling thread through
+ * prctl(PR_SET_TAGGED_ADDR_CTRL): the tagged address ABI is enabled, the tag
+ * check fault mode is selected from @mte_option and @incl_mask is installed
+ * as the tag inclusion mask.
+ *
+ * @mte_option: MTE_NONE_ERR, MTE_SYNC_ERR or MTE_ASYNC_ERR
+ * @incl_mask:  tag inclusion mask, shifted by PR_MTE_TAG_SHIFT before use
+ *
+ * Returns 0 on success, -EINVAL if @mte_option is not one of the three
+ * modes or if prctl() fails.
+ */
+int mte_switch_mode(int mte_option, unsigned long incl_mask)
+{
+	unsigned long en = PR_TAGGED_ADDR_ENABLE;
+
+	/*
+	 * Reject unknown modes outright. The previous combined check only
+	 * failed when the mode was unknown AND the mask was large, so a bad
+	 * mode with a small mask slipped through.
+	 * NOTE(review): incl_mask is not range checked here; the valid upper
+	 * bound lives in mte_def.h - confirm before adding a check.
+	 */
+	switch (mte_option) {
+	case MTE_SYNC_ERR:
+		en |= PR_MTE_TCF_SYNC;
+		break;
+	case MTE_ASYNC_ERR:
+		en |= PR_MTE_TCF_ASYNC;
+		break;
+	case MTE_NONE_ERR:
+		en |= PR_MTE_TCF_NONE;
+		break;
+	default:
+		ksft_print_msg("FAIL: Invalid mte config option\n");
+		return -EINVAL;
+	}
+
+	en |= (incl_mask << PR_MTE_TAG_SHIFT);
+	/* Enable address tagging ABI, mte error reporting mode and tag inclusion mask. */
+	if (prctl(PR_SET_TAGGED_ADDR_CTRL, en, 0, 0, 0) != 0) {
+		ksft_print_msg("FAIL:prctl PR_SET_TAGGED_ADDR_CTRL for mte mode\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Common start-up for every MTE test.
+ *
+ * Checks that the CPU advertises HWCAP2_MTE, remembers the current tag check
+ * fault mode and PSTATE.TCO state so that mte_restore_setup() can put them
+ * back, and then disables PSTATE.TCO.
+ *
+ * Returns 0 on success, KSFT_SKIP when MTE is unavailable and KSFT_FAIL when
+ * the current mode cannot be read.
+ */
+int mte_default_setup(void)
+{
+	unsigned long hwcaps2 = getauxval(AT_HWCAP2);
+	unsigned long en = 0;
+	int ret;
+
+	if (!(hwcaps2 & HWCAP2_MTE)) {
+		ksft_print_msg("SKIP: MTE features unavailable\n");
+		return KSFT_SKIP;
+	}
+	/* Get current mte mode */
+	ret = prctl(PR_GET_TAGGED_ADDR_CTRL, en, 0, 0, 0);
+	if (ret < 0) {
+		ksft_print_msg("FAIL:prctl PR_GET_TAGGED_ADDR_CTRL with error =%d\n", ret);
+		return KSFT_FAIL;
+	}
+	/*
+	 * "No fault" mode is the absence of the sync and async bits, so it
+	 * cannot be detected with a bitwise AND; treat it as the fallback
+	 * instead of relying on mte_cur_mode being zero-initialized.
+	 */
+	if (ret & PR_MTE_TCF_SYNC)
+		mte_cur_mode = MTE_SYNC_ERR;
+	else if (ret & PR_MTE_TCF_ASYNC)
+		mte_cur_mode = MTE_ASYNC_ERR;
+	else
+		mte_cur_mode = MTE_NONE_ERR;
+
+	mte_cur_pstate_tco = mte_get_pstate_tco();
+	/* Disable PSTATE.TCO */
+	mte_disable_pstate_tco();
+	return 0;
+}
+
+/*
+ * Undo mte_default_setup(): switch back to the tag check fault mode saved in
+ * mte_cur_mode (with the MTE_ALLOW_NON_ZERO_TAG include mask) and restore
+ * PSTATE.TCO to the state saved in mte_cur_pstate_tco.
+ */
+void mte_restore_setup(void)
+{
+ mte_switch_mode(mte_cur_mode, MTE_ALLOW_NON_ZERO_TAG);
+ if (mte_cur_pstate_tco == MT_PSTATE_TCO_EN)
+ mte_enable_pstate_tco();
+ else if (mte_cur_pstate_tco == MT_PSTATE_TCO_DIS)
+ mte_disable_pstate_tco();
+}
+
+/*
+ * Create an anonymous temporary file under /dev/shm.
+ *
+ * The file is created with mkstemp() and unlinked immediately, so it only
+ * lives as long as the returned descriptor stays open.
+ *
+ * Returns the open file descriptor, or -1 on failure. Callers test for -1;
+ * the previous "return 0" made them carry on with descriptor 0.
+ */
+int create_temp_file(void)
+{
+	int fd;
+	char filename[] = "/dev/shm/tmp_XXXXXX";
+
+	/* Create a file in the tmpfs filesystem */
+	fd = mkstemp(&filename[0]);
+	if (fd == -1) {
+		ksft_print_msg("FAIL: Unable to open temporary file\n");
+		return -1;
+	}
+	unlink(&filename[0]);
+	return fd;
+}
diff --git a/tools/testing/selftests/arm64/mte/mte_common_util.h b/tools/testing/selftests/arm64/mte/mte_common_util.h
new file mode 100644
index 000000000..195a7d187
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/mte_common_util.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 ARM Limited */
+
+#ifndef _MTE_COMMON_UTIL_H
+#define _MTE_COMMON_UTIL_H
+
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include "mte_def.h"
+#include "kselftest.h"
+
+/*
+ * Values for the mem_type argument of the mte_allocate_*() / mte_free_*()
+ * helpers declared below.
+ * NOTE(review): presumably each value selects how the test buffer is obtained
+ * (malloc, mmap, mmap followed by mprotect) - confirm against the allocator.
+ */
+enum mte_mem_type {
+ USE_MALLOC,
+ USE_MMAP,
+ USE_MPROTECT,
+};
+
+/*
+ * Tag check fault modes accepted by mte_switch_mode(), which translates them
+ * to the PR_MTE_TCF_NONE / PR_MTE_TCF_SYNC / PR_MTE_TCF_ASYNC prctl() flags.
+ */
+enum mte_mode {
+ MTE_NONE_ERR,
+ MTE_SYNC_ERR,
+ MTE_ASYNC_ERR,
+};
+
+struct mte_fault_cxt {
+ /* Address start which triggers mte tag fault */
+ unsigned long trig_addr;
+ /* Address range for mte tag fault and negative value means underflow */
+ ssize_t trig_range;
+ /* siginfo si code */
+ unsigned long trig_si_code;
+ /* Flag to denote if correct fault caught */
+ bool fault_valid;
+};
+
+extern struct mte_fault_cxt cur_mte_cxt;
+
+/* MTE utility functions */
+void mte_default_handler(int signum, siginfo_t *si, void *uc);
+void mte_register_signal(int signal, void (*handler)(int, siginfo_t *, void *));
+void mte_wait_after_trig(void);
+void *mte_allocate_memory(size_t size, int mem_type, int mapping, bool tags);
+void *mte_allocate_memory_tag_range(size_t size, int mem_type, int mapping,
+ size_t range_before, size_t range_after);
+void *mte_allocate_file_memory(size_t size, int mem_type, int mapping,
+ bool tags, int fd);
+void *mte_allocate_file_memory_tag_range(size_t size, int mem_type, int mapping,
+ size_t range_before, size_t range_after, int fd);
+void mte_free_memory(void *ptr, size_t size, int mem_type, bool tags);
+void mte_free_memory_tag_range(void *ptr, size_t size, int mem_type,
+ size_t range_before, size_t range_after);
+void *mte_insert_tags(void *ptr, size_t size);
+void mte_clear_tags(void *ptr, size_t size);
+int mte_default_setup(void);
+void mte_restore_setup(void);
+int mte_switch_mode(int mte_option, unsigned long incl_mask);
+void mte_initialize_current_context(int mode, uintptr_t ptr, ssize_t range);
+
+/* Common utility functions */
+int create_temp_file(void);
+
+/* Assembly MTE utility functions */
+void *mte_insert_random_tag(void *ptr);
+void *mte_insert_new_tag(void *ptr);
+void *mte_get_tag_address(void *ptr);
+void mte_set_tag_address_range(void *ptr, int range);
+void mte_clear_tag_address_range(void *ptr, int range);
+void mte_disable_pstate_tco(void);
+void mte_enable_pstate_tco(void);
+unsigned int mte_get_pstate_tco(void);
+
+/* Test framework static inline functions/macros */
+/*
+ * evaluate_test() - report a test outcome through the kselftest framework.
+ * @err: KSFT_PASS or KSFT_FAIL; any other value is not reported.
+ * @msg: description of the test, printed verbatim.
+ *
+ * The message is passed through "%s" so that a '%' inside @msg is never
+ * interpreted as a conversion specifier.
+ */
+static inline void evaluate_test(int err, const char *msg)
+{
+	if (err == KSFT_PASS)
+		ksft_test_result_pass("%s", msg);
+	else if (err == KSFT_FAIL)
+		ksft_test_result_fail("%s", msg);
+}
+
+/*
+ * check_allocated_memory() - validate a pointer returned by the allocators.
+ *
+ * Fails when @ptr is NULL, or when @tags is set but @ptr carries a zero tag;
+ * in the latter case the buffer is released (as untagged) before returning.
+ *
+ * Return: KSFT_PASS or KSFT_FAIL.
+ */
+static inline int check_allocated_memory(void *ptr, size_t size,
+					 int mem_type, bool tags)
+{
+	if (!ptr) {
+		ksft_print_msg("FAIL: memory allocation\n");
+		return KSFT_FAIL;
+	}
+
+	/* Nothing more to verify for untagged requests or non-zero tags */
+	if (!tags || MT_FETCH_TAG((uintptr_t)ptr))
+		return KSFT_PASS;
+
+	ksft_print_msg("FAIL: tag not found at addr(%p)\n", ptr);
+	mte_free_memory((void *)ptr, size, mem_type, false);
+	return KSFT_FAIL;
+}
+
+/*
+ * check_allocated_memory_range() - validate a pointer returned by the
+ * *_tag_range allocators.
+ *
+ * Fails when @ptr is NULL or carries a zero tag; in the latter case the
+ * buffer is released with mte_free_memory_tag_range() before returning.
+ *
+ * Return: KSFT_PASS or KSFT_FAIL.
+ */
+static inline int check_allocated_memory_range(void *ptr, size_t size, int mem_type,
+					       size_t range_before, size_t range_after)
+{
+	if (!ptr) {
+		ksft_print_msg("FAIL: memory allocation\n");
+		return KSFT_FAIL;
+	}
+
+	if (MT_FETCH_TAG((uintptr_t)ptr))
+		return KSFT_PASS;
+
+	ksft_print_msg("FAIL: tag not found at addr(%p)\n", ptr);
+	mte_free_memory_tag_range((void *)ptr, size, mem_type, range_before,
+				  range_after);
+	return KSFT_FAIL;
+}
+
+#endif /* _MTE_COMMON_UTIL_H */
diff --git a/tools/testing/selftests/arm64/mte/mte_def.h b/tools/testing/selftests/arm64/mte/mte_def.h
new file mode 100644
index 000000000..9b188254b
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/mte_def.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 ARM Limited */
+
+/*
+ * The definitions below may also be found in kernel headers; they are
+ * redefined here to decouple the MTE selftests compilation from them.
+ */
+#ifndef SEGV_MTEAERR
+#define SEGV_MTEAERR 8
+#endif
+#ifndef SEGV_MTESERR
+#define SEGV_MTESERR 9
+#endif
+#ifndef PROT_MTE
+#define PROT_MTE 0x20
+#endif
+#ifndef HWCAP2_MTE
+#define HWCAP2_MTE (1 << 18)
+#endif
+
+#ifndef PR_MTE_TCF_SHIFT
+#define PR_MTE_TCF_SHIFT 1
+#endif
+#ifndef PR_MTE_TCF_NONE
+#define PR_MTE_TCF_NONE (0UL << PR_MTE_TCF_SHIFT)
+#endif
+#ifndef PR_MTE_TCF_SYNC
+#define PR_MTE_TCF_SYNC (1UL << PR_MTE_TCF_SHIFT)
+#endif
+#ifndef PR_MTE_TCF_ASYNC
+#define PR_MTE_TCF_ASYNC (2UL << PR_MTE_TCF_SHIFT)
+#endif
+#ifndef PR_MTE_TAG_SHIFT
+#define PR_MTE_TAG_SHIFT 3
+#endif
+
+/*
+ * MTE Hardware feature definitions below.
+ *
+ * Macro arguments and whole expansions are fully parenthesised so that the
+ * macros can safely be used with compound expressions.
+ */
+#define MT_TAG_SHIFT 56
+#define MT_TAG_MASK 0xFUL
+#define MT_FREE_TAG 0x0UL
+#define MT_GRANULE_SIZE 16
+#define MT_TAG_COUNT 16
+#define MT_INCLUDE_TAG_MASK 0xFFFF
+#define MT_EXCLUDE_TAG_MASK 0x0
+
+#define MT_ALIGN_GRANULE (MT_GRANULE_SIZE - 1)
+#define MT_CLEAR_TAG(x) ((x) & ~(MT_TAG_MASK << MT_TAG_SHIFT))
+#define MT_SET_TAG(x, y) ((x) | ((y) << MT_TAG_SHIFT))
+#define MT_FETCH_TAG(x) (((x) >> MT_TAG_SHIFT) & (MT_TAG_MASK))
+#define MT_ALIGN_UP(x) (((x) + MT_ALIGN_GRANULE) & ~(MT_ALIGN_GRANULE))
+
+#define MT_PSTATE_TCO_SHIFT 25
+#define MT_PSTATE_TCO_MASK (~(0x1 << MT_PSTATE_TCO_SHIFT))
+#define MT_PSTATE_TCO_EN 1
+#define MT_PSTATE_TCO_DIS 0
+
+/* Tag inclusion masks: one bit per tag value, bit n set means tag n allowed */
+#define MT_EXCLUDE_TAG(x) (1 << (x))
+#define MT_INCLUDE_VALID_TAG(x) (MT_INCLUDE_TAG_MASK ^ MT_EXCLUDE_TAG(x))
+#define MT_INCLUDE_VALID_TAGS(x) (MT_INCLUDE_TAG_MASK ^ (x))
+#define MTE_ALLOW_NON_ZERO_TAG MT_INCLUDE_VALID_TAG(0)
diff --git a/tools/testing/selftests/arm64/mte/mte_helper.S b/tools/testing/selftests/arm64/mte/mte_helper.S
new file mode 100644
index 000000000..a02c04cd0
--- /dev/null
+++ b/tools/testing/selftests/arm64/mte/mte_helper.S
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 ARM Limited */
+
+#include "mte_def.h"
+
+#define ENTRY(name) \
+ .globl name ;\
+ .p2align 2;\
+ .type name, @function ;\
+name:
+
+#define ENDPROC(name) \
+ .size name, .-name ;
+
+ .text
+/*
+ * mte_insert_random_tag: Insert a random tag into the pointer. The new tag
+ * might be the same as the source tag if the source pointer has one.
+ * Input:
+ * x0 - source pointer with a tag/no-tag
+ * Return:
+ * x0 - pointer with random tag
+ */
+ENTRY(mte_insert_random_tag)
+ /* xzr as third operand: no additional exclusion mask is supplied here */
+ irg x0, x0, xzr
+ ret
+ENDPROC(mte_insert_random_tag)
+
+/*
+ * mte_insert_new_tag: Insert a new tag that is different from the source tag
+ * if the source pointer has one.
+ * Input:
+ * x0 - source pointer with a tag/no-tag
+ * Return:
+ * x0 - pointer with a random tag other than the source tag
+ */
+ENTRY(mte_insert_new_tag)
+ /* x1 = exclusion mask containing the tag currently held in x0 */
+ gmi x1, x0, xzr
+ /* pick a random tag while excluding the ones set in x1 */
+ irg x0, x0, x1
+ ret
+ENDPROC(mte_insert_new_tag)
+
+/*
+ * mte_get_tag_address: Get the tag from given address.
+ * Input:
+ * x0 - source pointer
+ * Return:
+ * x0 - pointer with appended tag
+ */
+ENTRY(mte_get_tag_address)
+ /* load the tag stored for the memory at [x0] into the tag bits of x0 */
+ ldg x0, [x0]
+ ret
+ENDPROC(mte_get_tag_address)
+
+/*
+ * mte_set_tag_address_range: Set the tag range from the given address
+ * Input:
+ * x0 - source pointer with tag data
+ * w1 - range in bytes (C type: int); zero or negative ranges are a no-op,
+ *      a range that is not a multiple of MT_GRANULE_SIZE is rounded up to
+ *      the next granule
+ * Return:
+ * none
+ */
+ENTRY(mte_set_tag_address_range)
+	/*
+	 * The C prototype passes range as a 32-bit int, so only w1 is
+	 * defined; the upper half of x1 must not be relied upon.
+	 */
+	cmp	w1, #0
+	b.le	2f
+1:
+	stg	x0, [x0, #0x0]
+	add	x0, x0, #MT_GRANULE_SIZE
+	/* stop once the remaining byte count drops to zero or below */
+	subs	w1, w1, #MT_GRANULE_SIZE
+	b.gt	1b
+2:
+	ret
+ENDPROC(mte_set_tag_address_range)
+
+/*
+ * mte_clear_tag_address_range: Clear the tag range from the given address.
+ * Each granule is written with stzg, which stores the tag carried by x0 and
+ * zeroes the granule's data.
+ * Input:
+ * x0 - source pointer with tag data
+ * w1 - range in bytes (C type: int); zero or negative ranges are a no-op,
+ *      a range that is not a multiple of MT_GRANULE_SIZE is rounded up to
+ *      the next granule
+ * Return:
+ * none
+ */
+ENTRY(mte_clear_tag_address_range)
+	/*
+	 * The C prototype passes range as a 32-bit int, so only w1 is
+	 * defined; the upper half of x1 must not be relied upon.
+	 */
+	cmp	w1, #0
+	b.le	2f
+1:
+	stzg	x0, [x0, #0x0]
+	add	x0, x0, #MT_GRANULE_SIZE
+	/* stop once the remaining byte count drops to zero or below */
+	subs	w1, w1, #MT_GRANULE_SIZE
+	b.gt	1b
+2:
+	ret
+ENDPROC(mte_clear_tag_address_range)
+
+/*
+ * mte_enable_pstate_tco: Enable PSTATE.TCO (tag check override) field
+ * Input:
+ * none
+ * Return:
+ * none
+ */
+ENTRY(mte_enable_pstate_tco)
+ msr tco, #MT_PSTATE_TCO_EN
+ ret
+ENDPROC(mte_enable_pstate_tco)
+
+/*
+ * mte_disable_pstate_tco: Disable PSTATE.TCO (tag check override) field
+ * Input:
+ * none
+ * Return:
+ * none
+ */
+ENTRY(mte_disable_pstate_tco)
+ msr tco, #MT_PSTATE_TCO_DIS
+ ret
+ENDPROC(mte_disable_pstate_tco)
+
+/*
+ * mte_get_pstate_tco: Get PSTATE.TCO (tag check override) field
+ * Input:
+ * none
+ * Return:
+ * x0 - 1 if the TCO bit is set, 0 otherwise
+ */
+ENTRY(mte_get_pstate_tco)
+ mrs x0, tco
+ /* keep only the single bit at MT_PSTATE_TCO_SHIFT, moved down to bit 0 */
+ ubfx x0, x0, #MT_PSTATE_TCO_SHIFT, #1
+ ret
+ENDPROC(mte_get_pstate_tco)
diff --git a/tools/testing/selftests/arm64/pauth/.gitignore b/tools/testing/selftests/arm64/pauth/.gitignore
new file mode 100644
index 000000000..155137d92
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/.gitignore
@@ -0,0 +1,2 @@
+exec_target
+pac
diff --git a/tools/testing/selftests/arm64/pauth/Makefile b/tools/testing/selftests/arm64/pauth/Makefile
new file mode 100644
index 000000000..72e290b0b
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/Makefile
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2020 ARM Limited
+
+# preserve CC value from top level Makefile
+ifeq ($(CC),cc)
+CC := $(CROSS_COMPILE)gcc
+endif
+
+CFLAGS += -mbranch-protection=pac-ret
+# check if the compiler supports ARMv8.3 and branch protection with PAuth
+pauth_cc_support := $(shell if ($(CC) $(CFLAGS) -march=armv8.3-a -E -x c /dev/null -o /dev/null 2>&1) then echo "1"; fi)
+
+# only declare the test programs when the compiler can actually build them
+ifeq ($(pauth_cc_support),1)
+TEST_GEN_PROGS := pac
+TEST_GEN_FILES := pac_corruptor.o helper.o
+TEST_GEN_PROGS_EXTENDED := exec_target
+endif
+
+include ../../lib.mk
+
+ifeq ($(pauth_cc_support),1)
+# pac* and aut* instructions are not available on architectures before
+# ARMv8.3. Therefore target ARMv8.3 wherever they are used directly
+$(OUTPUT)/pac_corruptor.o: pac_corruptor.S
+ $(CC) -c $^ -o $@ $(CFLAGS) -march=armv8.3-a
+
+$(OUTPUT)/helper.o: helper.c
+ $(CC) -c $^ -o $@ $(CFLAGS) -march=armv8.3-a
+
+# when -mbranch-protection is enabled and the target architecture is ARMv8.3 or
+# greater, gcc emits pac* instructions which are not in HINT NOP space,
+# preventing the tests from occurring at all. Compile for ARMv8.2 so tests can
+# run on earlier targets and print a meaningful error message
+$(OUTPUT)/exec_target: exec_target.c $(OUTPUT)/helper.o
+ $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a
+
+$(OUTPUT)/pac: pac.c $(OUTPUT)/pac_corruptor.o $(OUTPUT)/helper.o
+ $(CC) $^ -o $@ $(CFLAGS) -march=armv8.2-a
+endif
diff --git a/tools/testing/selftests/arm64/pauth/exec_target.c b/tools/testing/selftests/arm64/pauth/exec_target.c
new file mode 100644
index 000000000..4435600ca
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/exec_target.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/auxv.h>
+
+#include "helper.h"
+
+/*
+ * Worker for the pauth tests: reads one size_t from stdin, signs it with
+ * every pointer authentication key the CPU advertises and writes the
+ * resulting struct signatures to stdout.
+ *
+ * Return: 0 on success, 1 if the value could not be read or the result could
+ * not be written.
+ */
+int main(void)
+{
+	/* zeroed so that keys which cannot be used are reported as 0 */
+	struct signatures signed_vals = { 0 };
+	unsigned long hwcaps;
+	size_t val;
+
+	if (fread(&val, sizeof(size_t), 1, stdin) != 1) {
+		fprintf(stderr, "exec_target: failed to read value to sign\n");
+		return 1;
+	}
+
+	/* don't try to execute illegal (unimplemented) instructions) caller
+	 * should have checked this and keep worker simple
+	 */
+	hwcaps = getauxval(AT_HWCAP);
+
+	if (hwcaps & HWCAP_PACA) {
+		signed_vals.keyia = keyia_sign(val);
+		signed_vals.keyib = keyib_sign(val);
+		signed_vals.keyda = keyda_sign(val);
+		signed_vals.keydb = keydb_sign(val);
+	}
+	signed_vals.keyg = (hwcaps & HWCAP_PACG) ? keyg_sign(val) : 0;
+
+	if (fwrite(&signed_vals, sizeof(struct signatures), 1, stdout) != 1) {
+		fprintf(stderr, "exec_target: failed to write signatures\n");
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/arm64/pauth/helper.c b/tools/testing/selftests/arm64/pauth/helper.c
new file mode 100644
index 000000000..2c201e7d0
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/helper.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#include "helper.h"
+
+/* Sign @ptr in place with the paciza instruction and return the result. */
+size_t keyia_sign(size_t ptr)
+{
+ asm volatile("paciza %0" : "+r" (ptr));
+ return ptr;
+}
+
+/* Sign @ptr in place with the pacizb instruction and return the result. */
+size_t keyib_sign(size_t ptr)
+{
+ asm volatile("pacizb %0" : "+r" (ptr));
+ return ptr;
+}
+
+/* Sign @ptr in place with the pacdza instruction and return the result. */
+size_t keyda_sign(size_t ptr)
+{
+ asm volatile("pacdza %0" : "+r" (ptr));
+ return ptr;
+}
+
+/* Sign @ptr in place with the pacdzb instruction and return the result. */
+size_t keydb_sign(size_t ptr)
+{
+ asm volatile("pacdzb %0" : "+r" (ptr));
+ return ptr;
+}
+
+/*
+ * Compute a generic authentication code for @ptr with the pacga instruction,
+ * using a modifier of 0. Unlike the other helpers the input is not modified;
+ * the code is returned in a separate register.
+ */
+size_t keyg_sign(size_t ptr)
+{
+ /* output is encoded in the upper 32 bits */
+ size_t dest = 0;
+ size_t modifier = 0;
+
+ asm volatile("pacga %0, %1, %2" : "=r" (dest) : "r" (ptr), "r" (modifier));
+
+ return dest;
+}
diff --git a/tools/testing/selftests/arm64/pauth/helper.h b/tools/testing/selftests/arm64/pauth/helper.h
new file mode 100644
index 000000000..652496c7b
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/helper.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 ARM Limited */
+
+#ifndef _HELPER_H_
+#define _HELPER_H_
+
+#include <stdlib.h>
+
+#define NKEYS 5
+
+struct signatures {
+ size_t keyia;
+ size_t keyib;
+ size_t keyda;
+ size_t keydb;
+ size_t keyg;
+};
+
+void pac_corruptor(void);
+
+/* PAuth sign a value with key ia and modifier value 0 */
+size_t keyia_sign(size_t val);
+size_t keyib_sign(size_t val);
+size_t keyda_sign(size_t val);
+size_t keydb_sign(size_t val);
+size_t keyg_sign(size_t val);
+
+#endif
diff --git a/tools/testing/selftests/arm64/pauth/pac.c b/tools/testing/selftests/arm64/pauth/pac.c
new file mode 100644
index 000000000..b743daa77
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/pac.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2020 ARM Limited
+
+#define _GNU_SOURCE
+
+#include <sys/auxv.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <sched.h>
+
+#include "../../kselftest_harness.h"
+#include "helper.h"
+
+#define PAC_COLLISION_ATTEMPTS 10
+/*
+ * The kernel sets TBID by default. So bits 55 and above should remain
+ * untouched no matter what.
+ * The VA space size is 48 bits. Bigger is opt-in.
+ */
+#define PAC_MASK (~0xff80ffffffffffff)
+#define ARBITRARY_VALUE (0x1234)
+#define ASSERT_PAUTH_ENABLED() \
+do { \
+ unsigned long hwcaps = getauxval(AT_HWCAP); \
+ /* data key instructions are not in NOP space. This prevents a SIGILL */ \
+ if (!(hwcaps & HWCAP_PACA)) \
+ SKIP(return, "PAUTH not enabled"); \
+} while (0)
+#define ASSERT_GENERIC_PAUTH_ENABLED() \
+do { \
+ unsigned long hwcaps = getauxval(AT_HWCAP); \
+ /* generic key instructions are not in NOP space. This prevents a SIGILL */ \
+ if (!(hwcaps & HWCAP_PACG)) \
+ SKIP(return, "Generic PAUTH not enabled"); \
+} while (0)
+
+/*
+ * Sign @val with the four address keys (ia, ib, da, db) and store the results
+ * in @sign. The keyg member is left untouched.
+ */
+void sign_specific(struct signatures *sign, size_t val)
+{
+	const size_t ia = keyia_sign(val);
+	const size_t ib = keyib_sign(val);
+	const size_t da = keyda_sign(val);
+	const size_t db = keydb_sign(val);
+
+	sign->keyia = ia;
+	sign->keyib = ib;
+	sign->keyda = da;
+	sign->keydb = db;
+}
+
+/*
+ * Sign @val with all five keys (ia, ib, da, db and the generic key) and store
+ * the results in @sign.
+ */
+void sign_all(struct signatures *sign, size_t val)
+{
+	/* the four address keys first, then the generic key */
+	sign_specific(sign, val);
+	sign->keyg = keyg_sign(val);
+}
+
+/*
+ * Count how many signatures are identical between @old and @new. The generic
+ * key is only compared when @nkeys equals NKEYS.
+ */
+int n_same(struct signatures *old, struct signatures *new, int nkeys)
+{
+	int matches = 0;
+
+	if (old->keyia == new->keyia)
+		matches++;
+	if (old->keyib == new->keyib)
+		matches++;
+	if (old->keyda == new->keyda)
+		matches++;
+	if (old->keydb == new->keydb)
+		matches++;
+	if (nkeys == NKEYS && old->keyg == new->keyg)
+		matches++;
+
+	return matches;
+}
+
+/*
+ * Count the number of pairs of keys within @sign whose PAC bits (PAC_MASK)
+ * are identical.
+ *
+ * @nkeys is the number of keys to consider: NKEYS includes the generic key,
+ * NKEYS - 1 only the four address keys. Values above NKEYS are clamped.
+ */
+int n_same_single_set(struct signatures *sign, int nkeys)
+{
+	/* fixed size: the four address-key slots are always written below */
+	size_t vals[NKEYS];
+	int same = 0;
+
+	if (nkeys > NKEYS)
+		nkeys = NKEYS;
+
+	vals[0] = sign->keyia & PAC_MASK;
+	vals[1] = sign->keyib & PAC_MASK;
+	vals[2] = sign->keyda & PAC_MASK;
+	vals[3] = sign->keydb & PAC_MASK;
+
+	/*
+	 * Only touch the generic key when the caller asked for it: with
+	 * nkeys == NKEYS - 1 sign->keyg has not been filled in.
+	 */
+	if (nkeys == NKEYS)
+		vals[4] = sign->keyg & PAC_MASK;
+
+	for (int i = 0; i < nkeys - 1; i++) {
+		for (int j = i + 1; j < nkeys; j++) {
+			if (vals[i] == vals[j])
+				same += 1;
+		}
+	}
+	return same;
+}
+
+/*
+ * exec_sign_all() - sign @val in a freshly exec()ed worker process.
+ *
+ * Pins the calling process (and therefore the child) to one CPU, forks,
+ * exec()s "exec_target" with its stdin/stdout connected to pipes, sends @val
+ * and reads back the worker's struct signatures into @signed_vals.
+ *
+ * Return: 0 on success, -1 on any failure. All pipe descriptors are closed
+ * before returning.
+ */
+int exec_sign_all(struct signatures *signed_vals, size_t val)
+{
+	int new_stdin[2];
+	int new_stdout[2];
+	int status;
+	int i;
+	int rc = -1;
+	ssize_t written;
+	ssize_t ret;
+	pid_t pid;
+	cpu_set_t mask;
+
+	if (pipe(new_stdin) == -1) {
+		perror("pipe returned error");
+		return -1;
+	}
+
+	if (pipe(new_stdout) == -1) {
+		perror("pipe returned error");
+		close(new_stdin[0]);
+		close(new_stdin[1]);
+		return -1;
+	}
+
+	/*
+	 * pin this process and all its children to a single CPU, so it can also
+	 * guarantee a context switch with its child
+	 */
+	CPU_ZERO(&mask);
+	sched_getaffinity(0, sizeof(mask), &mask);
+
+	/* CPU_SETSIZE is the number of CPUs a cpu_set_t can describe */
+	for (i = 0; i < CPU_SETSIZE; i++)
+		if (CPU_ISSET(i, &mask))
+			break;
+
+	if (i < CPU_SETSIZE) {
+		CPU_ZERO(&mask);
+		CPU_SET(i, &mask);
+		sched_setaffinity(0, sizeof(mask), &mask);
+	}
+
+	pid = fork();
+	if (pid == -1) {
+		perror("fork returned error");
+		close(new_stdin[0]);
+		close(new_stdin[1]);
+		close(new_stdout[0]);
+		close(new_stdout[1]);
+		return -1;
+	}
+
+	// child
+	if (pid == 0) {
+		if (dup2(new_stdin[0], STDIN_FILENO) == -1) {
+			perror("dup2 returned error");
+			exit(1);
+		}
+
+		if (dup2(new_stdout[1], STDOUT_FILENO) == -1) {
+			perror("dup2 returned error");
+			exit(1);
+		}
+
+		close(new_stdin[0]);
+		close(new_stdin[1]);
+		close(new_stdout[0]);
+		close(new_stdout[1]);
+
+		execl("exec_target", "exec_target", (char *)NULL);
+		/* execl() only returns on failure */
+		perror("exec returned error");
+		exit(1);
+	}
+
+	close(new_stdin[0]);
+	close(new_stdout[1]);
+
+	written = write(new_stdin[1], &val, sizeof(size_t));
+	if (written == -1)
+		perror("write returned error");
+	else if (written != (ssize_t)sizeof(size_t))
+		fprintf(stderr, "short write to worker\n");
+	/* closing the write end lets the worker see EOF if the write failed */
+	close(new_stdin[1]);
+
+	/*
+	 * wait for the worker to finish, so that read() reads all data
+	 * will also context switch with worker so that this function can be used
+	 * for context switch tests
+	 */
+	if (waitpid(pid, &status, 0) == -1) {
+		perror("waitpid returned error");
+	} else if (written != (ssize_t)sizeof(size_t)) {
+		/* failure already reported above; the worker has been reaped */
+	} else if (WIFEXITED(status) == 0) {
+		fprintf(stderr, "worker exited unexpectedly\n");
+	} else if (WEXITSTATUS(status) != 0) {
+		fprintf(stderr, "worker exited with error\n");
+	} else {
+		ret = read(new_stdout[0], signed_vals, sizeof(struct signatures));
+		if (ret == -1)
+			perror("read returned error");
+		else if (ret != (ssize_t)sizeof(struct signatures))
+			fprintf(stderr, "short read from worker\n");
+		else
+			rc = 0;
+	}
+
+	close(new_stdout[0]);
+	return rc;
+}
+
+sigjmp_buf jmpbuf;
+/*
+ * Signal handler for the corrupt_pac test: on SIGSEGV or SIGILL jump back to
+ * the sigsetjmp() point saved in jmpbuf; any other signal is ignored.
+ */
+void pac_signal_handler(int signum, siginfo_t *si, void *uc)
+{
+	if (signum != SIGSEGV && signum != SIGILL)
+		return;
+
+	siglongjmp(jmpbuf, 1);
+}
+
+/* check that a corrupted PAC results in SIGSEGV or SIGILL */
+TEST(corrupt_pac)
+{
+ struct sigaction sa;
+
+ ASSERT_PAUTH_ENABLED();
+ /* 0 on the direct call; pac_signal_handler() jumps back here with 1 */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ sa.sa_sigaction = pac_signal_handler;
+ sa.sa_flags = SA_SIGINFO | SA_RESETHAND;
+ sigemptyset(&sa.sa_mask);
+
+ sigaction(SIGSEGV, &sa, NULL);
+ sigaction(SIGILL, &sa, NULL);
+
+ pac_corruptor();
+ /* only reached when pac_corruptor() returned without a signal */
+ ASSERT_TRUE(0) TH_LOG("SIGSEGV/SIGILL signal did not occur");
+ }
+}
+
+/*
+ * There are no separate pac* and aut* controls so checking only the pac*
+ * instructions is sufficient
+ */
+TEST(pac_instructions_not_nop)
+{
+ size_t keyia = 0;
+ size_t keyib = 0;
+ size_t keyda = 0;
+ size_t keydb = 0;
+
+ ASSERT_PAUTH_ENABLED();
+
+ /*
+ * Accumulate the PAC bits produced for several inputs: the result stays 0
+ * only if every signing attempt left the PAC field empty.
+ */
+ for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) {
+ keyia |= keyia_sign(i) & PAC_MASK;
+ keyib |= keyib_sign(i) & PAC_MASK;
+ keyda |= keyda_sign(i) & PAC_MASK;
+ keydb |= keydb_sign(i) & PAC_MASK;
+ }
+
+ ASSERT_NE(0, keyia) TH_LOG("keyia instructions did nothing");
+ ASSERT_NE(0, keyib) TH_LOG("keyib instructions did nothing");
+ ASSERT_NE(0, keyda) TH_LOG("keyda instructions did nothing");
+ ASSERT_NE(0, keydb) TH_LOG("keydb instructions did nothing");
+}
+
+/* Same check as pac_instructions_not_nop, for the generic key only. */
+TEST(pac_instructions_not_nop_generic)
+{
+ size_t keyg = 0;
+
+ ASSERT_GENERIC_PAUTH_ENABLED();
+
+ /* stays 0 only if every attempt produced no bits inside PAC_MASK */
+ for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++)
+ keyg |= keyg_sign(i) & PAC_MASK;
+
+ ASSERT_NE(0, keyg) TH_LOG("keyg instructions did nothing");
+}
+
+/*
+ * Check that the keys of a single thread differ from each other: over
+ * PAC_COLLISION_ATTEMPTS inputs, at least one input must yield no pair of
+ * identical PACs.
+ */
+TEST(single_thread_different_keys)
+{
+ /* start at the largest possible pair count for NKEYS (5) keys */
+ int same = 10;
+ int nkeys = NKEYS;
+ int tmp;
+ struct signatures signed_vals;
+ unsigned long hwcaps = getauxval(AT_HWCAP);
+
+ /* generic and data key instructions are not in NOP space. This prevents a SIGILL */
+ ASSERT_PAUTH_ENABLED();
+ if (!(hwcaps & HWCAP_PACG)) {
+ TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks");
+ nkeys = NKEYS - 1;
+ }
+
+ /*
+ * In Linux the PAC field can be up to 7 bits wide. Even if keys are
+ * different, there is about 5% chance for PACs to collide with
+ * different addresses. This chance rapidly increases with fewer bits
+ * allocated for the PAC (e.g. wider address). A comparison of the keys
+ * directly will be more reliable.
+ * All signed values need to be different at least once out of n
+ * attempts to be certain that the keys are different
+ */
+ for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) {
+ if (nkeys == NKEYS)
+ sign_all(&signed_vals, i);
+ else
+ sign_specific(&signed_vals, i);
+
+ /* keep the smallest number of clashing pairs seen so far */
+ tmp = n_same_single_set(&signed_vals, nkeys);
+ if (tmp < same)
+ same = tmp;
+ }
+
+ ASSERT_EQ(0, same) TH_LOG("%d keys clashed every time", same);
+}
+
+/*
+ * fork() does not change keys. Only exec() does so call a worker program.
+ * Its only job is to sign a value and report back the results
+ */
+TEST(exec_changed_keys)
+{
+ struct signatures new_keys;
+ struct signatures old_keys;
+ int ret;
+ /* upper bound; lowered to the minimum n_same() result in the loop */
+ int same = 10;
+ int nkeys = NKEYS;
+ unsigned long hwcaps = getauxval(AT_HWCAP);
+
+ /* generic and data key instructions are not in NOP space. This prevents a SIGILL */
+ ASSERT_PAUTH_ENABLED();
+ if (!(hwcaps & HWCAP_PACG)) {
+ TH_LOG("WARNING: Generic PAUTH not enabled. Skipping generic key checks");
+ nkeys = NKEYS - 1;
+ }
+
+ for (int i = 0; i < PAC_COLLISION_ATTEMPTS; i++) {
+ /* signatures produced by the exec()ed worker */
+ ret = exec_sign_all(&new_keys, i);
+ ASSERT_EQ(0, ret) TH_LOG("failed to run worker");
+
+ /* signatures produced by this process for the same value */
+ if (nkeys == NKEYS)
+ sign_all(&old_keys, i);
+ else
+ sign_specific(&old_keys, i);
+
+ ret = n_same(&old_keys, &new_keys, nkeys);
+ if (ret < same)
+ same = ret;
+ }
+
+ ASSERT_EQ(0, same) TH_LOG("exec() did not change %d keys", same);
+}
+
+/*
+ * Check that the four address keys of this process produce the same
+ * signatures before and after running the worker process.
+ */
+TEST(context_switch_keep_keys)
+{
+ int ret;
+ /* worker output; its content is not inspected by this test */
+ struct signatures trash;
+ struct signatures before;
+ struct signatures after;
+
+ ASSERT_PAUTH_ENABLED();
+
+ sign_specific(&before, ARBITRARY_VALUE);
+
+ /* will context switch with a process with different keys at least once */
+ ret = exec_sign_all(&trash, ARBITRARY_VALUE);
+ ASSERT_EQ(0, ret) TH_LOG("failed to run worker");
+
+ sign_specific(&after, ARBITRARY_VALUE);
+
+ ASSERT_EQ(before.keyia, after.keyia) TH_LOG("keyia changed after context switching");
+ ASSERT_EQ(before.keyib, after.keyib) TH_LOG("keyib changed after context switching");
+ ASSERT_EQ(before.keyda, after.keyda) TH_LOG("keyda changed after context switching");
+ ASSERT_EQ(before.keydb, after.keydb) TH_LOG("keydb changed after context switching");
+}
+
+/*
+ * Check that the generic key of this process produces the same signature
+ * before and after running the worker process.
+ */
+TEST(context_switch_keep_keys_generic)
+{
+ int ret;
+ /* worker output; its content is not inspected by this test */
+ struct signatures trash;
+ size_t before;
+ size_t after;
+
+ ASSERT_GENERIC_PAUTH_ENABLED();
+
+ before = keyg_sign(ARBITRARY_VALUE);
+
+ /* will context switch with a process with different keys at least once */
+ ret = exec_sign_all(&trash, ARBITRARY_VALUE);
+ ASSERT_EQ(0, ret) TH_LOG("failed to run worker");
+
+ after = keyg_sign(ARBITRARY_VALUE);
+
+ ASSERT_EQ(before, after) TH_LOG("keyg changed after context switching");
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/arm64/pauth/pac_corruptor.S b/tools/testing/selftests/arm64/pauth/pac_corruptor.S
new file mode 100644
index 000000000..aa6588050
--- /dev/null
+++ b/tools/testing/selftests/arm64/pauth/pac_corruptor.S
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2020 ARM Limited */
+
+.global pac_corruptor
+
+.text
+/*
+ * Corrupting a single bit of the PAC ensures the authentication will fail. It
+ * also guarantees no possible collision. TCR_EL1.TBI0 is set by default so no
+ * top byte PAC is tested
+ */
+ pac_corruptor:
+ /* sign the return address held in lr */
+ paciasp
+
+ /* corrupt the top bit of the PAC */
+ eor lr, lr, #1 << 53
+
+ /* authenticate the now-modified lr, then return through it */
+ autiasp
+ ret
diff --git a/tools/testing/selftests/arm64/signal/.gitignore b/tools/testing/selftests/arm64/signal/.gitignore
new file mode 100644
index 000000000..78c902045
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mangle_*
+fake_sigreturn_*
+!*.[ch]
diff --git a/tools/testing/selftests/arm64/signal/Makefile b/tools/testing/selftests/arm64/signal/Makefile
new file mode 100644
index 000000000..ac4ad0005
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/Makefile
@@ -0,0 +1,28 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2019 ARM Limited
+
+# Additional include paths needed by kselftest.h and local headers
+CFLAGS += -D_GNU_SOURCE -std=gnu99 -I.
+
+SRCS := $(filter-out testcases/testcases.c,$(wildcard testcases/*.c))
+PROGS := $(patsubst %.c,%,$(SRCS))
+
+# Generated binaries to be installed by top KSFT script
+TEST_GEN_PROGS := $(notdir $(PROGS))
+
+# Get Kernel headers installed and use them.
+KSFT_KHDR_INSTALL := 1
+
+# Including KSFT lib.mk here will also mangle the TEST_GEN_PROGS list
+# to account for any OUTPUT target-dirs optionally provided by
+# the toplevel makefile
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): $(PROGS)
+ cp $(PROGS) $(OUTPUT)/
+
+# Common test-unit targets to build common-layout test-cases executables
+# Needs secondary expansion to properly include the testcase c-file in pre-reqs
+.SECONDEXPANSION:
+$(PROGS): test_signals.c test_signals_utils.c testcases/testcases.c signals.S $$@.c test_signals.h test_signals_utils.h testcases/testcases.h
+ $(CC) $(CFLAGS) $^ -o $@
diff --git a/tools/testing/selftests/arm64/signal/README b/tools/testing/selftests/arm64/signal/README
new file mode 100644
index 000000000..967a531b2
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/README
@@ -0,0 +1,59 @@
+KSelfTest arm64/signal/
+=======================
+
+Signals Tests
++++++++++++++
+
+- Tests are built around a common main compilation unit: such shared main
+ enforces a standard sequence of operations needed to perform a single
+ signal-test (setup/trigger/run/result/cleanup)
+
+- The above-mentioned ops are configurable on a test-by-test basis: each test
+ is described (and configured) using the descriptor test_signals.h::struct tdescr
+
+- Each signal testcase is compiled into its own executable: a separate
+ executable is used for each test since many tests complete successfully
+ by receiving some kind of fatal signal from the Kernel, so it's safer
+ to run each test unit in its own standalone process, so as to start each
+ test from a clean slate.
+
+- New tests can be simply defined in testcases/ dir providing a proper struct
+ tdescr overriding all the defaults we wish to change (as of now providing a
+ custom run method is mandatory though)
+
+- Signals' test-cases hereafter defined belong currently to two
+ principal families:
+
+ - 'mangle_' tests: a real signal (SIGUSR1) is raised and used as a trigger
+ and then the test case code modifies the signal frame from inside the
+ signal handler itself.
+
+ - 'fake_sigreturn_' tests: a brand new custom artificial sigframe structure
+ is placed on the stack and a sigreturn syscall is called to simulate a
+ real signal return. Tests of this kind usually do not use a trigger; they
+ are simply fired using some simple included assembly trampoline code.
+
+ - Most of these tests are successfully passing if the process gets killed by
+ some fatal signal: usually SIGSEGV or SIGBUS. Since while writing this
+ kind of tests it is extremely easy in fact to end-up injecting other
+ unrelated SEGV bugs in the testcases, it becomes extremely tricky to
+ be really sure that the tests are really addressing what they are meant
+ to address and they are not instead falling apart due to unplanned bugs
+ in the test code.
+ In order to alleviate the misery of the life of such test-developer, a few
+ helpers are provided:
+
+ - a couple of ASSERT_BAD/GOOD_CONTEXT() macros to easily parse a ucontext_t
+ and verify if it is indeed GOOD or BAD (depending on what we were
+ expecting), using the same logic/perspective as in the arm64 Kernel signals
+ routines.
+
+ - a sanity mechanism to be used in 'fake_sigreturn_'-alike tests: enabled by
+ default it takes care to verify that the test-execution had at least
+ successfully progressed up to the stage of triggering the fake sigreturn
+ call.
+
+ In both cases test results are expected in terms of:
+ - some fatal signal sent by the Kernel to the test process
+ or
+ - analyzing some final regs state
diff --git a/tools/testing/selftests/arm64/signal/signals.S b/tools/testing/selftests/arm64/signal/signals.S
new file mode 100644
index 000000000..9f8c1aefc
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/signals.S
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 ARM Limited */
+
+#include <asm/unistd.h>
+
+.section .rodata, "a"
+call_fmt:
+ .asciz "Calling sigreturn with fake sigframe sized:%zd at SP @%08lX\n"
+
+.text
+
+.globl fake_sigreturn
+
+/*
+ * fake_sigreturn x0:&sigframe, x1:sigframe_size, x2:misalign_bytes
+ *
+ * Copies the provided sigframe onto the stack (optionally at a misaligned
+ * address) and issues an rt_sigreturn syscall on it.
+ * x20-x23 are used as scratch registers without being saved: this routine
+ * is not expected to return to its caller.
+ */
+fake_sigreturn:
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+
+ /* keep the arguments across the printf/memcpy calls below */
+ mov x20, x0
+ mov x21, x1
+ mov x22, x2
+
+ /* create space on the stack for fake sigframe 16 bytes-aligned */
+ add x0, x21, x22
+ add x0, x0, #15
+ bic x0, x0, #15 /* round_up(sigframe_size + misalign_bytes, 16) */
+ sub sp, sp, x0
+ add x23, sp, x22 /* new sigframe base with misalignment if any */
+
+ ldr x0, =call_fmt
+ mov x1, x21
+ mov x2, x23
+ bl printf
+
+ /* memcpy the provided content, while still keeping SP aligned */
+ mov x0, x23
+ mov x1, x20
+ mov x2, x21
+ bl memcpy
+
+ /*
+ * Here saving a last minute SP to current->token acts as a marker:
+ * if we got here, we are successfully faking a sigreturn; in other
+ * words we are sure no bad fatal signal has been raised till now
+ * for unrelated reasons, so we should consider the possibly observed
+ * fatal signal like SEGV coming from Kernel restore_sigframe() and
+ * triggered as expected from our test-case.
+ * For simplicity this assumes that current field 'token' is laid out
+ * as first in struct tdescr
+ */
+ ldr x0, current
+ str x23, [x0]
+ /* finally move SP to misaligned address...if any requested */
+ mov sp, x23
+
+ mov x8, #__NR_rt_sigreturn
+ svc #0
+
+ /*
+ * Above sigreturn should not return...looping here leads to a timeout
+ * and ensure proper and clean test failure, instead of jumping around
+ * on a potentially corrupted stack.
+ */
+ b .
diff --git a/tools/testing/selftests/arm64/signal/test_signals.c b/tools/testing/selftests/arm64/signal/test_signals.c
new file mode 100644
index 000000000..416b1ff43
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/test_signals.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Generic test wrapper for arm64 signal tests.
+ *
+ * Each test provides its own tde struct tdescr descriptor to link with
+ * this wrapper. Framework provides common helpers.
+ */
+#include <kselftest.h>
+
+#include "test_signals.h"
+#include "test_signals_utils.h"
+
+struct tdescr *current;
+
+/*
+ * Common entry point of every signal test executable: runs the test described
+ * by the per-test descriptor 'tde' and returns its recorded result.
+ */
+int main(int argc, char *argv[])
+{
+	current = &tde;
+
+	ksft_print_msg("%s :: %s\n", current->name, current->descr);
+
+	/* run and clean up only when both setup and init succeeded */
+	if (test_setup(current)) {
+		if (test_init(current)) {
+			test_run(current);
+			test_cleanup(current);
+		}
+	}
+
+	/* the result is always evaluated, even when setup/init failed */
+	test_result(current);
+
+	return current->result;
+}
diff --git a/tools/testing/selftests/arm64/signal/test_signals.h b/tools/testing/selftests/arm64/signal/test_signals.h
new file mode 100644
index 000000000..ebe8694db
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/test_signals.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 ARM Limited */
+
+#ifndef __TEST_SIGNALS_H__
+#define __TEST_SIGNALS_H__
+
+#include <signal.h>
+#include <stdbool.h>
+#include <ucontext.h>
+
+/*
+ * Using ARCH specific and sanitized Kernel headers installed by KSFT
+ * framework since we asked for it by setting flag KSFT_KHDR_INSTALL
+ * in our Makefile.
+ */
+#include <asm/ptrace.h>
+#include <asm/hwcap.h>
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+/*
+ * Read the system register 'regname' with an MRS instruction and store
+ * the value in 'out'. The register name is pasted into the instruction
+ * text via __stringify(). Note that this expands to a bare { } block,
+ * not to an expression.
+ */
+#define get_regval(regname, out) \
+{ \
+ asm volatile("mrs %0, " __stringify(regname) \
+ : "=r" (out) \
+ : \
+ : "memory"); \
+}
+
+/*
+ * Feature flags used in tdescr.feats_required to specify
+ * any feature required by the test
+ */
+enum {
+ FSSBS_BIT, /* bit position of FEAT_SSBS */
+ FSVE_BIT, /* bit position of FEAT_SVE */
+ FMAX_END /* number of feature bits defined above */
+};
+
+#define FEAT_SSBS (1UL << FSSBS_BIT)
+#define FEAT_SVE (1UL << FSVE_BIT)
+
+/*
+ * A descriptor used to describe and configure a test case.
+ * Fields with a non-trivial meaning are described inline in the following.
+ */
+struct tdescr {
+ /* KEEP THIS FIELD FIRST for easier lookup from assembly */
+ void *token;
+ /* when disabled token based sanity checking is skipped in handler */
+ bool sanity_disabled;
+ /* just a name for the test-case; mandatory field */
+ char *name;
+ char *descr; /* printed next to the name when the test starts */
+ unsigned long feats_required;
+ /* bitmask of effectively supported feats: populated at run-time */
+ unsigned long feats_supported;
+ bool initialized; /* set once test_init() completed successfully */
+ unsigned int minsigstksz; /* AT_MINSIGSTKSZ, or MINSIGSTKSZ when zero */
+ /* signum used as a test trigger. Zero if no trigger-signal is used */
+ int sig_trig;
+ /*
+ * signum considered as a successful test completion.
+ * Zero when no signal is expected on success
+ */
+ int sig_ok;
+ /* signum expected on unsupported CPU features. */
+ int sig_unsupp;
+ /* a timeout in seconds for test completion */
+ unsigned int timeout;
+ bool triggered; /* set by the handler when sig_trig is received */
+ bool pass; /* set when the expected outcome has been observed */
+ unsigned int result; /* final KSFT_* code, also used as exit status */
+ /* optional sa_flags for the installed handler */
+ int sa_flags;
+ ucontext_t saved_uc;
+ /* used by get_current_ctx() */
+ size_t live_sz;
+ ucontext_t *live_uc;
+ volatile sig_atomic_t live_uc_valid;
+ /* optional test private data */
+ void *priv;
+
+ /* a custom setup: called alternatively to default_setup */
+ int (*setup)(struct tdescr *td);
+ /* a custom init: called by default test init after test_setup */
+ bool (*init)(struct tdescr *td);
+ /* a custom cleanup function called before test exits */
+ void (*cleanup)(struct tdescr *td);
+ /* an optional function to be used as a trigger for starting test */
+ int (*trigger)(struct tdescr *td);
+ /*
+ * the actual test-core: invoked differently depending on the
+ * presence of the trigger function above; this is mandatory
+ */
+ int (*run)(struct tdescr *td, siginfo_t *si, ucontext_t *uc);
+ /* an optional function for custom results' processing */
+ void (*check_result)(struct tdescr *td);
+};
+
+extern struct tdescr tde;
+#endif
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
new file mode 100644
index 000000000..22722abc9
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019 ARM Limited */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <sys/auxv.h>
+#include <linux/auxvec.h>
+#include <ucontext.h>
+
+#include <asm/unistd.h>
+
+#include <kselftest.h>
+
+#include "test_signals.h"
+#include "test_signals_utils.h"
+#include "testcases/testcases.h"
+
+
+extern struct tdescr *current;
+
+static int sig_copyctx = SIGTRAP;
+
+static char const *const feats_names[FMAX_END] = {
+ " SSBS ",
+ " SVE ",
+};
+
+#define MAX_FEATS_SZ 128
+static char feats_string[MAX_FEATS_SZ];
+
+/*
+ * Render the feature bits set in 'feats' as a list of names.
+ *
+ * The result is built inside the static 'feats_string' buffer, which is
+ * emptied at every call: the returned pointer stays valid only until the
+ * next invocation, and the helper is not reentrant.
+ */
+static inline char *feats_to_string(unsigned long feats)
+{
+	/* room still available in feats_string, final NUL excluded */
+	size_t flen = MAX_FEATS_SZ - 1;
+
+	/* start from scratch so that repeated calls do not accumulate names */
+	feats_string[0] = '\0';
+
+	for (int i = 0; i < FMAX_END; i++) {
+		if (feats & (1UL << i)) {
+			size_t tlen = strlen(feats_names[i]);
+
+			assert(flen > tlen);
+			/*
+			 * Bound the append by the room available *before*
+			 * accounting for this name, so that a name which
+			 * fits is never truncated.
+			 */
+			strncat(feats_string, feats_names[i], flen);
+			flen -= tlen;
+		}
+	}
+
+	return feats_string;
+}
+
+/* Remove 'signum' from the set of signals blocked for the caller. */
+static void unblock_signal(int signum)
+{
+	sigset_t only_signum;
+
+	/* build a set holding just the requested signal... */
+	sigemptyset(&only_signum);
+	sigaddset(&only_signum, signum);
+	/* ...and drop it from the current mask */
+	sigprocmask(SIG_UNBLOCK, &only_signum, NULL);
+}
+
+/*
+ * Report the test outcome and store the matching KSFT_* code in
+ * td->result; a result already marked as KSFT_SKIP is left untouched.
+ * When 'force_exit' is set the process exits straight away using
+ * td->result as exit status.
+ */
+static void default_result(struct tdescr *td, bool force_exit)
+{
+	bool skipped = (td->result == KSFT_SKIP);
+
+	if (!skipped) {
+		if (td->pass) {
+			fprintf(stderr, "==>> completed. PASS(1)\n");
+			td->result = KSFT_PASS;
+		} else {
+			fprintf(stdout, "==>> completed. FAIL(0)\n");
+			td->result = KSFT_FAIL;
+		}
+	} else {
+		fprintf(stderr, "==>> completed. SKIP.\n");
+	}
+
+	if (!force_exit)
+		return;
+
+	exit(td->result);
+}
+
+/*
+ * The following handle_signal_* helpers are used by main default_handler
+ * and are meant to return true when signal is handled successfully:
+ * when false is returned instead, it means that the signal was somehow
+ * unexpected in that context and it was NOT handled; default_handler will
+ * take care of such unexpected situations.
+ */
+
+/*
+ * Handle the signal expected when a required feature is not supported.
+ *
+ * Returns false, leaving the signal unhandled, when all the required
+ * features are in fact supported; returns true otherwise.
+ */
+static bool handle_signal_unsupported(struct tdescr *td,
+				      siginfo_t *si, void *uc)
+{
+	ucontext_t *ctx = uc;
+
+	if (feats_ok(td))
+		return false;
+
+	/* Mangling PC to avoid loops on original SIGILL */
+	ctx->uc_mcontext.pc += 4;
+
+	if (td->initialized) {
+		fprintf(stderr,
+			"-- RX SIG_UNSUPP on unsupported feat...OK\n");
+		td->pass = 1;
+		/* does not return: exits with the test result */
+		default_result(current, 1);
+	} else {
+		fprintf(stderr,
+			"Got SIG_UNSUPP @test_init. Ignore.\n");
+	}
+
+	return true;
+}
+
+/*
+ * Handle the trigger signal: flag the test as triggered and invoke its
+ * run() callback from signal context. Always returns true.
+ */
+static bool handle_signal_trigger(struct tdescr *td,
+				  siginfo_t *si, void *uc)
+{
+	td->triggered = 1;
+
+	/* ->run was asserted NON-NULL in test_setup() already */
+	(void)td->run(td, si, uc);
+
+	return true;
+}
+
+/*
+ * Handle the signal which marks a successful test completion: after a
+ * few sanity checks the test is flagged as passed and the process exits
+ * through default_result().
+ */
+static bool handle_signal_ok(struct tdescr *td,
+ siginfo_t *si, void *uc)
+{
+ /*
+ * it's a bug in the test code when this assert fails:
+ * if sig_trig was defined, it must have been used before getting here.
+ */
+ assert(!td->sig_trig || td->triggered);
+ fprintf(stderr,
+ "SIG_OK -- SP:0x%llX si_addr@:%p si_code:%d token@:%p offset:%ld\n",
+ ((ucontext_t *)uc)->uc_mcontext.sp,
+ si->si_addr, si->si_code, td->token, td->token - si->si_addr);
+ /*
+ * fake_sigreturn tests, which leave sanity_disabled unset, set, at the
+ * very last time, the token field to the SP address used to place the
+ * fake sigframe: so token==0 means we never made it to the end,
+ * segfaulting well-before, and the test is possibly broken.
+ */
+ if (!td->sanity_disabled && !td->token) {
+ fprintf(stdout,
+ "current->token ZEROED...test is probably broken!\n");
+ abort();
+ }
+ /*
+ * Trying to narrow down the SEGV to the ones generated by Kernel itself
+ * via arm64_notify_segfault(). This is a best-effort check anyway, and
+ * the si_code check may need to change if this aspect of the kernel
+ * ABI changes.
+ */
+ if (td->sig_ok == SIGSEGV && si->si_code != SEGV_ACCERR) {
+ fprintf(stdout,
+ "si_code != SEGV_ACCERR...test is probably broken!\n");
+ abort();
+ }
+ td->pass = 1;
+ /*
+ * Some tests can lead to SEGV loops: in such a case we want to
+ * terminate immediately exiting straight away; some others are not
+ * supposed to outlive the signal handler code, due to the content of
+ * the fake sigframe which caused the signal itself.
+ */
+ default_result(current, 1);
+
+ return true;
+}
+
+/*
+ * Handle the service signal used to grab a context: copy the ucontext
+ * received by the handler into td->live_uc and mark it as valid.
+ * Always returns true.
+ */
+static bool handle_signal_copyctx(struct tdescr *td,
+				  siginfo_t *si, void *uc)
+{
+	ucontext_t *ctx = uc;
+
+	/*
+	 * Mangling PC to avoid loops on original BRK instr: done before
+	 * copying, so that the saved context carries the updated PC too.
+	 */
+	ctx->uc_mcontext.pc += 4;
+
+	memcpy(td->live_uc, ctx, td->live_sz);
+	ASSERT_GOOD_CONTEXT(td->live_uc);
+	td->live_uc_valid = 1;
+
+	fprintf(stderr,
+		"GOOD CONTEXT grabbed from sig_copyctx handler\n");
+
+	return true;
+}
+
+/*
+ * Handler installed for all the signals by default_setup(): dispatch the
+ * received signal to the first matching handle_signal_* helper. Whatever
+ * is left unhandled is reported as a timeout or as an unexpected signal,
+ * and the test is terminated.
+ */
+static void default_handler(int signum, siginfo_t *si, void *uc)
+{
+	if (current->sig_unsupp && signum == current->sig_unsupp &&
+	    handle_signal_unsupported(current, si, uc)) {
+		fprintf(stderr, "Handled SIG_UNSUPP\n");
+		return;
+	}
+
+	if (current->sig_trig && signum == current->sig_trig &&
+	    handle_signal_trigger(current, si, uc)) {
+		fprintf(stderr, "Handled SIG_TRIG\n");
+		return;
+	}
+
+	if (current->sig_ok && signum == current->sig_ok &&
+	    handle_signal_ok(current, si, uc)) {
+		fprintf(stderr, "Handled SIG_OK\n");
+		return;
+	}
+
+	if (signum == sig_copyctx && current->live_uc &&
+	    handle_signal_copyctx(current, si, uc)) {
+		fprintf(stderr, "Handled SIG_COPYCTX\n");
+		return;
+	}
+
+	/* nobody claimed this signal */
+	if (signum == SIGALRM && current->timeout)
+		fprintf(stderr, "-- Timeout !\n");
+	else
+		fprintf(stderr,
+			"-- RX UNEXPECTED SIGNAL: %d\n", signum);
+
+	default_result(current, 1);
+}
+
+/*
+ * Default test setup: route the signals to default_handler, unblock the
+ * ones the test relies upon and arm the optional timeout.
+ *
+ * Always returns 1.
+ */
+static int default_setup(struct tdescr *td)
+{
+	struct sigaction sa;
+
+	/* do not hand over to sigaction() any field left uninitialized */
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = default_handler;
+	sa.sa_flags = SA_SIGINFO | SA_RESTART;
+	sa.sa_flags |= td->sa_flags;
+	sigemptyset(&sa.sa_mask);
+	/* uncatchable signals naturally skipped ... */
+	for (int sig = 1; sig < 32; sig++)
+		sigaction(sig, &sa, NULL);
+	/*
+	 * RT Signals default disposition is Term but they cannot be
+	 * generated by the Kernel in response to our tests; so just catch
+	 * them all and report them as UNEXPECTED signals.
+	 */
+	for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
+		sigaction(sig, &sa, NULL);
+
+	/* just in case...unblock explicitly all we need */
+	if (td->sig_trig)
+		unblock_signal(td->sig_trig);
+	if (td->sig_ok)
+		unblock_signal(td->sig_ok);
+	if (td->sig_unsupp)
+		unblock_signal(td->sig_unsupp);
+
+	/* a zero timeout means no alarm is armed at all */
+	if (td->timeout) {
+		unblock_signal(SIGALRM);
+		alarm(td->timeout);
+	}
+	fprintf(stderr, "Registered handlers for all signals.\n");
+
+	return 1;
+}
+
+/* Default trigger: raise td->sig_trig; returns 1 on success, 0 otherwise. */
+static inline int default_trigger(struct tdescr *td)
+{
+	int err = raise(td->sig_trig);
+
+	return err == 0;
+}
+
+/*
+ * Common test initialization: reject the reserved trigger signal, detect
+ * the minimum signal stack size and the supported CPU features, then run
+ * the optional test specific init.
+ *
+ * Returns 1 on success, 0 on failure; when a required feature is missing
+ * td->result is set to KSFT_SKIP before returning 0.
+ */
+int test_init(struct tdescr *td)
+{
+	if (td->sig_trig == sig_copyctx) {
+		fprintf(stdout,
+			"Signal %d is RESERVED, cannot be used as a trigger. Aborting\n",
+			sig_copyctx);
+		return 0;
+	}
+	/* just in case */
+	unblock_signal(sig_copyctx);
+
+	td->minsigstksz = getauxval(AT_MINSIGSTKSZ);
+	if (!td->minsigstksz)
+		td->minsigstksz = MINSIGSTKSZ;
+	/* minsigstksz is unsigned: use the matching conversion */
+	fprintf(stderr, "Detected MINSTKSIGSZ:%u\n", td->minsigstksz);
+
+	if (td->feats_required) {
+		/* read the hwcaps once and test every feature against them */
+		unsigned long hwcap = getauxval(AT_HWCAP);
+
+		td->feats_supported = 0;
+		/* Checking for CPU required features using the auxval */
+		if (hwcap & HWCAP_SSBS)
+			td->feats_supported |= FEAT_SSBS;
+		if (hwcap & HWCAP_SVE)
+			td->feats_supported |= FEAT_SVE;
+		if (feats_ok(td)) {
+			fprintf(stderr,
+				"Required Features: [%s] supported\n",
+				feats_to_string(td->feats_required &
+						td->feats_supported));
+		} else {
+			fprintf(stderr,
+				"Required Features: [%s] NOT supported\n",
+				feats_to_string(td->feats_required &
+						~td->feats_supported));
+			td->result = KSFT_SKIP;
+			return 0;
+		}
+	}
+
+	/* Perform test specific additional initialization */
+	if (td->init && !td->init(td)) {
+		fprintf(stderr, "FAILED Testcase initialization.\n");
+		return 0;
+	}
+	td->initialized = 1;
+	fprintf(stderr, "Testcase initialized.\n");
+
+	return 1;
+}
+
+/*
+ * Validate the test descriptor and run its setup stage: the custom
+ * td->setup when provided, default_setup otherwise. The value returned
+ * by the chosen setup routine is passed back to the caller.
+ */
+int test_setup(struct tdescr *td)
+{
+	int (*setup_fn)(struct tdescr *td);
+
+	/* assert core invariants symptom of a rotten testcase */
+	assert(current);
+	assert(td);
+	assert(td->name);
+	assert(td->run);
+
+	/* Default result is FAIL if test setup fails */
+	td->result = KSFT_FAIL;
+
+	setup_fn = td->setup ? td->setup : default_setup;
+
+	return setup_fn(td);
+}
+
+/*
+ * Start the test: without a trigger signal td->run is called directly,
+ * otherwise the custom or the default trigger is fired, and td->run gets
+ * invoked from the signal handler.
+ */
+int test_run(struct tdescr *td)
+{
+	/* no trigger signal: plain call, with no siginfo nor ucontext */
+	if (!td->sig_trig)
+		return td->run(td, NULL, NULL);
+
+	if (td->trigger)
+		return td->trigger(td);
+
+	return default_trigger(td);
+}
+
+/*
+ * Evaluate the test outcome: the optional custom check runs only for
+ * tests which have been initialized and not skipped; the default result
+ * processing follows in any case, without forcing an exit.
+ */
+void test_result(struct tdescr *td)
+{
+	bool custom_check = td->initialized && td->result != KSFT_SKIP &&
+			    td->check_result;
+
+	if (custom_check)
+		td->check_result(td);
+
+	default_result(td, 0);
+}
+
+/* Run the test specific cleanup callback, when one has been provided. */
+void test_cleanup(struct tdescr *td)
+{
+	if (!td->cleanup)
+		return;
+
+	td->cleanup(td);
+}
diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.h b/tools/testing/selftests/arm64/signal/test_signals_utils.h
new file mode 100644
index 000000000..6772b5c8d
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.h
@@ -0,0 +1,120 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 ARM Limited */
+
+#ifndef __TEST_SIGNALS_UTILS_H__
+#define __TEST_SIGNALS_UTILS_H__
+
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "test_signals.h"
+
+int test_init(struct tdescr *td);
+int test_setup(struct tdescr *td);
+void test_cleanup(struct tdescr *td);
+int test_run(struct tdescr *td);
+void test_result(struct tdescr *td);
+
+/* Return true when every required feature is among the supported ones. */
+static inline bool feats_ok(struct tdescr *td)
+{
+	unsigned long missing = td->feats_required & ~td->feats_supported;
+
+	return !missing;
+}
+
+/*
+ * Obtaining a valid and full-blown ucontext_t from userspace is tricky:
+ * libc getcontext() does not save all the regs and messes with some of
+ * them (pstate value in particular is not reliable).
+ *
+ * Here we use a service signal to grab the ucontext_t from inside a
+ * dedicated signal handler, since there, it is populated by Kernel
+ * itself in setup_sigframe(). The grabbed context is then stored and
+ * made available in td->live_uc.
+ *
+ * The service signal used is a SIGTRAP induced by a 'brk' instruction,
+ * because here we have to avoid using syscalls to trigger the signal,
+ * since they would cause any SVE sigframe content (if any) to be removed.
+ *
+ * Anyway this function really serves a dual purpose:
+ *
+ * 1. grab a valid sigcontext into td->live_uc for result analysis: in
+ * such case it returns 1.
+ *
+ * 2. detect if, somehow, a previously grabbed live_uc context has been
+ * used actively with a sigreturn: in such a case the execution would have
+ * magically resumed in the middle of this function itself (seen_already==1):
+ * in such a case return 0, since in fact we have not just simply grabbed
+ * the context.
+ *
+ * This latter case is useful to detect when a fake_sigreturn test-case has
+ * unexpectedly survived without hitting a SEGV.
+ *
+ * Note that the case of runtime dynamically sized sigframes (like in SVE
+ * context) is still NOT addressed: sigframe size is supposed to be fixed
+ * at sizeof(ucontext_t).
+ */
+static __always_inline bool get_current_context(struct tdescr *td,
+ ucontext_t *dest_uc)
+{
+ static volatile bool seen_already;
+
+ assert(td && dest_uc);
+ /* it's a genuine invocation..reinit */
+ seen_already = 0;
+ td->live_uc_valid = 0;
+ td->live_sz = sizeof(*dest_uc);
+ memset(dest_uc, 0x00, td->live_sz);
+ /* publish the destination buffer: the sig_copyctx handler fills it */
+ td->live_uc = dest_uc;
+ /*
+ * Grab ucontext_t triggering a SIGTRAP.
+ *
+ * Note that:
+ * - live_uc_valid is declared volatile sig_atomic_t in
+ * struct tdescr since it will be changed inside the
+ * sig_copyctx handler
+ * - the additional 'memory' clobber is there to avoid possible
+ * compiler's assumption on live_uc_valid and the content
+ * pointed by dest_uc, which are all changed inside the signal
+ * handler
+ * - BRK causes a debug exception which is handled by the Kernel
+ * and finally causes the SIGTRAP signal to be delivered to this
+ * test thread. Since such delivery happens on the ret_to_user()
+ * /do_notify_resume() debug exception return-path, we are sure
+ * that the registered SIGTRAP handler has been run to completion
+ * before the execution path is restored here: as a consequence
+ * we can be sure that the volatile sig_atomic_t live_uc_valid
+ * carries a meaningful result. Being in a single thread context
+ * we'll also be sure that any access to memory modified by the
+ * handler (namely ucontext_t) will be visible once returned.
+ * - note that since we are using a breakpoint instruction here
+ * to cause a SIGTRAP, the ucontext_t grabbed from the signal
+ * handler would naturally contain a PC pointing exactly to this
+ * BRK line, which means that, on return from the signal handler,
+ * or if we place the ucontext_t on the stack to fake a sigreturn,
+ * we'll end up in an infinite loop of BRK-SIGTRAP-handler.
+ * For this reason we take care to artificially move forward the
+ * PC to the next instruction while inside the signal handler.
+ */
+ asm volatile ("brk #666"
+ : "+m" (*dest_uc)
+ :
+ : "memory");
+
+ /*
+ * If we get here with seen_already==1 it implies the td->live_uc
+ * context has been used to get back here....this probably means
+ * a test has failed to cause a SEGV...anyway live_uc does not
+ * point to a just acquired copy of ucontext_t...so return 0
+ */
+ if (seen_already) {
+ fprintf(stdout,
+ "Unexpected successful sigreturn detected: live_uc is stale !\n");
+ return 0;
+ }
+ seen_already = 1;
+
+ /* non-zero only if the sig_copyctx handler ran and filled dest_uc */
+ return td->live_uc_valid;
+}
+
+int fake_sigreturn(void *sigframe, size_t sz, int misalign_bytes);
+#endif
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_magic.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_magic.c
new file mode 100644
index 000000000..8dc600a7d
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_magic.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack including a BAD Unknown magic
+ * record: on sigreturn Kernel must spot this attempt and the test
+ * case is expected to be terminated via SEGV.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+/*
+ * Grab a real context into the fake sigframe, append to its __reserved
+ * area a record carrying an unknown magic plus a terminator, and finally
+ * sigreturn through it.
+ *
+ * Returns 0 when there is no room for the extra records, 1 otherwise.
+ */
+static int fake_sigreturn_bad_magic_run(struct tdescr *td,
+					siginfo_t *si, ucontext_t *uc)
+{
+	struct _aarch64_ctx *resv_start = GET_SF_RESV_HEAD(sf);
+	struct _aarch64_ctx *bad_rec;
+
+	/* just to fill the ucontext_t with something real */
+	if (!get_current_context(td, &sf.uc))
+		return 1;
+
+	/* need at least 2*HDR_SZ space: KSFT_BAD_MAGIC + terminator. */
+	bad_rec = get_starting_head(resv_start, HDR_SZ * 2,
+				    GET_SF_RESV_SIZE(sf), NULL);
+	if (!bad_rec)
+		return 0;
+
+	/*
+	 * use a well known NON existent bad magic...something
+	 * we should be pretty sure won't ever be defined in Kernel
+	 */
+	bad_rec->magic = KSFT_BAD_MAGIC;
+	bad_rec->size = HDR_SZ;
+	write_terminator_record(GET_RESV_NEXT_HEAD(bad_rec));
+
+	ASSERT_BAD_CONTEXT(&sf.uc);
+	fake_sigreturn(&sf, sizeof(sf), 0);
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_BAD_MAGIC",
+ .descr = "Trigger a sigreturn with a sigframe with a bad magic",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_bad_magic_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size.c
new file mode 100644
index 000000000..b3c362100
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack including a bad record overflowing
+ * the __reserved space: on sigreturn Kernel must spot this attempt and
+ * the test case is expected to be terminated via SEGV.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+#define MIN_SZ_ALIGN 16
+
+/*
+ * Grab a real context into the fake sigframe, append an esr_context
+ * record whose size overruns the __reserved area, and sigreturn through
+ * it. Returns 0 when there is no room for the records, 1 otherwise.
+ */
+static int fake_sigreturn_bad_size_run(struct tdescr *td,
+ siginfo_t *si, ucontext_t *uc)
+{
+ size_t resv_sz, need_sz, offset;
+ struct _aarch64_ctx *shead = GET_SF_RESV_HEAD(sf), *head;
+
+ /* just to fill the ucontext_t with something real */
+ if (!get_current_context(td, &sf.uc))
+ return 1;
+
+ resv_sz = GET_SF_RESV_SIZE(sf);
+ /* at least HDR_SZ + bad sized esr_context needed */
+ need_sz = sizeof(struct esr_context) + HDR_SZ;
+ head = get_starting_head(shead, need_sz, resv_sz, &offset);
+ if (!head)
+ return 0;
+
+ /*
+ * Use an esr_context to build a fake header with a
+ * size greater than the free __reserved area minus HDR_SZ;
+ * using ESR_MAGIC here since it is not checked for size nor
+ * is limited to one instance.
+ *
+ * At first inject an additional normal esr_context
+ */
+ head->magic = ESR_MAGIC;
+ head->size = sizeof(struct esr_context);
+ /* and terminate properly */
+ write_terminator_record(GET_RESV_NEXT_HEAD(head));
+ ASSERT_GOOD_CONTEXT(&sf.uc);
+
+ /*
+ * now mess with fake esr_context size: leaving less space than
+ * needed while keeping size value 16-aligned
+ *
+ * It must trigger a SEGV from Kernel on:
+ *
+ * resv_sz - offset < sizeof(*head)
+ */
+ /* at first set the maximum good 16-aligned size */
+ head->size = (resv_sz - offset - need_sz + MIN_SZ_ALIGN) & ~0xfUL;
+ /* plus a bit more of 16-aligned sized stuff */
+ head->size += MIN_SZ_ALIGN;
+ /* and terminate properly */
+ write_terminator_record(GET_RESV_NEXT_HEAD(head));
+ ASSERT_BAD_CONTEXT(&sf.uc);
+ fake_sigreturn(&sf, sizeof(sf), 0);
+
+ return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_BAD_SIZE",
+ .descr = "Triggers a sigreturn with a overrun __reserved area",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_bad_size_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size_for_magic0.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size_for_magic0.c
new file mode 100644
index 000000000..a44b88bfc
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_bad_size_for_magic0.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack including a badly sized terminator
+ * record: on sigreturn Kernel must spot this attempt and the test case
+ * is expected to be terminated via SEGV.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+/*
+ * Grab a real context into the fake sigframe, append to its __reserved
+ * area a terminator record (magic 0) with a non-zero size, and finally
+ * sigreturn through it.
+ *
+ * Returns 0 when there is no room for the record, 1 otherwise.
+ */
+static int fake_sigreturn_bad_size_for_magic0_run(struct tdescr *td,
+						  siginfo_t *si, ucontext_t *uc)
+{
+	struct _aarch64_ctx *resv_start = GET_SF_RESV_HEAD(sf);
+	struct _aarch64_ctx *term;
+
+	/* just to fill the ucontext_t with something real */
+	if (!get_current_context(td, &sf.uc))
+		return 1;
+
+	/* at least HDR_SZ for the badly sized terminator. */
+	term = get_starting_head(resv_start, HDR_SZ,
+				 GET_SF_RESV_SIZE(sf), NULL);
+	if (!term)
+		return 0;
+
+	/* a terminator is identified by magic 0: give it a bogus size */
+	term->magic = 0;
+	term->size = HDR_SZ;
+
+	ASSERT_BAD_CONTEXT(&sf.uc);
+	fake_sigreturn(&sf, sizeof(sf), 0);
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_BAD_SIZE_FOR_TERMINATOR",
+ .descr = "Trigger a sigreturn using non-zero size terminator",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_bad_size_for_magic0_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_duplicated_fpsimd.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_duplicated_fpsimd.c
new file mode 100644
index 000000000..afe8915f0
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_duplicated_fpsimd.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack including an additional FPSIMD
+ * record: on sigreturn Kernel must spot this attempt and the test
+ * case is expected to be terminated via SEGV.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+/*
+ * Grab a real context into the fake sigframe, append to its __reserved
+ * area one more fpsimd_context record plus a terminator, and finally
+ * sigreturn through it.
+ *
+ * Returns 0 when there is no room for the extra records, 1 otherwise.
+ */
+static int fake_sigreturn_duplicated_fpsimd_run(struct tdescr *td,
+						siginfo_t *si, ucontext_t *uc)
+{
+	struct _aarch64_ctx *resv_start = GET_SF_RESV_HEAD(sf);
+	struct _aarch64_ctx *dup_rec;
+
+	/* just to fill the ucontext_t with something real */
+	if (!get_current_context(td, &sf.uc))
+		return 1;
+
+	dup_rec = get_starting_head(resv_start,
+				    sizeof(struct fpsimd_context) + HDR_SZ,
+				    GET_SF_RESV_SIZE(sf), NULL);
+	if (!dup_rec)
+		return 0;
+
+	/* Add a spurious fpsimd_context */
+	dup_rec->magic = FPSIMD_MAGIC;
+	dup_rec->size = sizeof(struct fpsimd_context);
+	/* and terminate */
+	write_terminator_record(GET_RESV_NEXT_HEAD(dup_rec));
+
+	ASSERT_BAD_CONTEXT(&sf.uc);
+	fake_sigreturn(&sf, sizeof(sf), 0);
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_DUPLICATED_FPSIMD",
+ .descr = "Triggers a sigreturn including two fpsimd_context",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_duplicated_fpsimd_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_misaligned_sp.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_misaligned_sp.c
new file mode 100644
index 000000000..1e089e66f
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_misaligned_sp.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack at a misaligned SP: on sigreturn
+ * Kernel must spot this attempt and the test case is expected to be
+ * terminated via SEGV.
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+/*
+ * Grab a real context into the fake sigframe and sigreturn through it,
+ * asking fake_sigreturn() for a misalignment of 3 bytes.
+ * Always returns 1.
+ */
+static int fake_sigreturn_misaligned_run(struct tdescr *td,
+					 siginfo_t *si, ucontext_t *uc)
+{
+	/* just to fill the ucontext_t with something real */
+	if (get_current_context(td, &sf.uc)) {
+		/* Forcing sigframe on misaligned SP (16 + 3) */
+		fake_sigreturn(&sf, sizeof(sf), 3);
+	}
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_MISALIGNED_SP",
+ .descr = "Triggers a sigreturn with a misaligned sigframe",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_misaligned_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_missing_fpsimd.c b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_missing_fpsimd.c
new file mode 100644
index 000000000..08ecd8073
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/fake_sigreturn_missing_fpsimd.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Place a fake sigframe on the stack missing the mandatory FPSIMD
+ * record: on sigreturn Kernel must spot this attempt and the test
+ * case is expected to be terminated via SEGV.
+ */
+
+#include <stdio.h>
+#include <signal.h>
+#include <ucontext.h>
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+struct fake_sigframe sf;
+
+/*
+ * Grab a real context into the fake sigframe, overwrite its
+ * fpsimd_context record with a terminator, and sigreturn through it.
+ * Nothing is attempted when the record is not found or when there is
+ * not enough space left. Always returns 1.
+ */
+static int fake_sigreturn_missing_fpsimd_run(struct tdescr *td,
+					     siginfo_t *si, ucontext_t *uc)
+{
+	size_t resv_sz, offset;
+	struct _aarch64_ctx *fpsimd_rec = GET_SF_RESV_HEAD(sf);
+
+	/* just to fill the ucontext_t with something real */
+	if (!get_current_context(td, &sf.uc))
+		return 1;
+
+	resv_sz = GET_SF_RESV_SIZE(sf);
+	fpsimd_rec = get_header(fpsimd_rec, FPSIMD_MAGIC, resv_sz, &offset);
+	if (!fpsimd_rec || resv_sz - offset < HDR_SZ)
+		return 1;
+
+	fprintf(stderr, "Mangling template header. Spare space:%zd\n",
+		resv_sz - offset);
+	/* Just overwrite fpsimd_context */
+	write_terminator_record(fpsimd_rec);
+
+	ASSERT_BAD_CONTEXT(&sf.uc);
+	fake_sigreturn(&sf, sizeof(sf), 0);
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .name = "FAKE_SIGRETURN_MISSING_FPSIMD",
+ .descr = "Triggers a sigreturn with a missing fpsimd_context",
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .timeout = 3, /* seconds */
+ .run = fake_sigreturn_missing_fpsimd_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_compat_toggle.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_compat_toggle.c
new file mode 100644
index 000000000..2cb118b0b
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_compat_toggle.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the execution state bit: this attempt must be spotted by Kernel and
+ * the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * Invoked from the signal handler on the trigger signal: flip the
+ * PSR_MODE32_BIT in the saved pstate, so that the mangled context is
+ * the one used when the handler returns. Always returns 1.
+ */
+static int mangle_invalid_pstate_run(struct tdescr *td, siginfo_t *si,
+				     ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	/* This config should trigger a SIGSEGV by Kernel */
+	uc->uc_mcontext.pstate = uc->uc_mcontext.pstate ^ PSR_MODE32_BIT;
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .sanity_disabled = true, /* no token based sanity check in the handler */
+ .name = "MANGLE_PSTATE_INVALID_STATE_TOGGLE",
+ .descr = "Mangling uc_mcontext with INVALID STATE_TOGGLE",
+ .sig_trig = SIGUSR1, /* signal used to enter the handler and call run */
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .run = mangle_invalid_pstate_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_daif_bits.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_daif_bits.c
new file mode 100644
index 000000000..434b82597
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_daif_bits.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, mangling the
+ * DAIF bits in an illegal manner: this attempt must be spotted by Kernel
+ * and the test case is expected to be terminated via SEGV.
+ *
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+/*
+ * Invoked from the signal handler on the trigger signal: set all the
+ * DAIF bits in the saved pstate, so that the mangled context is the one
+ * used when the handler returns. Always returns 1.
+ */
+static int mangle_invalid_pstate_run(struct tdescr *td, siginfo_t *si,
+				     ucontext_t *uc)
+{
+	ASSERT_GOOD_CONTEXT(uc);
+
+	/*
+	 * This config should trigger a SIGSEGV by Kernel when it checks
+	 * the sigframe consistency in valid_user_regs() routine.
+	 */
+	uc->uc_mcontext.pstate = uc->uc_mcontext.pstate |
+				 (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT);
+
+	return 1;
+}
+
+struct tdescr tde = {
+ .sanity_disabled = true, /* no token based sanity check in the handler */
+ .name = "MANGLE_PSTATE_INVALID_DAIF_BITS",
+ .descr = "Mangling uc_mcontext with INVALID DAIF_BITS",
+ .sig_trig = SIGUSR1, /* signal used to enter the handler and call run */
+ .sig_ok = SIGSEGV, /* receiving a SEGV means success */
+ .run = mangle_invalid_pstate_run,
+};
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1h.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1h.c
new file mode 100644
index 000000000..95f821abd
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1h.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(1h);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1t.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1t.c
new file mode 100644
index 000000000..cc222d8a6
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el1t.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(1t);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2h.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2h.c
new file mode 100644
index 000000000..2188add7d
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2h.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(2h);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2t.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2t.c
new file mode 100644
index 000000000..df32dd5a4
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el2t.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(2t);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3h.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3h.c
new file mode 100644
index 000000000..9e6829b7e
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3h.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(3h);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3t.c b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3t.c
new file mode 100644
index 000000000..5685a4f10
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_el3t.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Try to mangle the ucontext from inside a signal handler, toggling
+ * the mode bit to escalate exception level: this attempt must be spotted
+ * by Kernel and the test case is expected to be terminated via SEGV.
+ */
+
+#include "test_signals_utils.h"
+#include "testcases.h"
+
+#include "mangle_pstate_invalid_mode_template.h"
+
+DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(3t);
diff --git a/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_template.h b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_template.h
new file mode 100644
index 000000000..f5bf1804d
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/mangle_pstate_invalid_mode_template.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 ARM Limited
+ *
+ * Utility macro to ease definition of testcases toggling mode EL
+ */
+
+/*
+ * DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(_mode) - instantiate one testcase
+ *
+ * @_mode: exception-level suffix (e.g. 1h, 2t) that is token-pasted onto
+ *	   PSR_MODE_EL and stringified into the testcase name and description.
+ *
+ * Expands to a static run callback plus the 'tde' test descriptor. The
+ * callback first asserts that the received ucontext validates, then clears
+ * the PSR_MODE_MASK bits of the saved pstate, ORs in PSR_MODE_EL<_mode> and
+ * returns 1. The descriptor uses SIGUSR1 as trigger signal and declares
+ * SIGSEGV as the signal that makes the test succeed.
+ */
+#define DEFINE_TESTCASE_MANGLE_PSTATE_INVALID_MODE(_mode) \
+ \
+static int mangle_invalid_pstate_run(struct tdescr *td, siginfo_t *si, \
+ ucontext_t *uc) \
+{ \
+ ASSERT_GOOD_CONTEXT(uc); \
+ \
+ uc->uc_mcontext.pstate &= ~PSR_MODE_MASK; \
+ uc->uc_mcontext.pstate |= PSR_MODE_EL ## _mode; \
+ \
+ return 1; \
+} \
+ \
+struct tdescr tde = { \
+ .sanity_disabled = true, \
+ .name = "MANGLE_PSTATE_INVALID_MODE_EL"#_mode, \
+ .descr = "Mangling uc_mcontext INVALID MODE EL"#_mode, \
+ .sig_trig = SIGUSR1, \
+ .sig_ok = SIGSEGV, \
+ .run = mangle_invalid_pstate_run, \
+}
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.c b/tools/testing/selftests/arm64/signal/testcases/testcases.c
new file mode 100644
index 000000000..a3ac5c2d8
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) 2019 ARM Limited */
+#include "testcases.h"
+
+/*
+ * Walk the records of a __reserved area looking for a given magic.
+ *
+ * @head: first record header of the area to scan
+ * @magic: magic value to look for (0 looks up the terminator record)
+ * @resv_sz: size in bytes of the area starting at @head
+ * @offset: if not NULL, filled with the byte offset of the matching record
+ *	    from the initial @head; left untouched when nothing is found
+ *
+ * The walk stops at the first record carrying @magic, at the terminator
+ * (magic 0), as soon as a further header would not fit inside @resv_sz, or
+ * when a record declares a size smaller than its own header (such a record
+ * would never let the walk make progress).
+ *
+ * @return: pointer to the matching record header, or NULL if not found.
+ */
+struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic,
+				size_t resv_sz, size_t *offset)
+{
+	size_t offs = 0;
+
+	if (!head || resv_sz < HDR_SZ)
+		return NULL;
+
+	/* Only dereference @head while a full header still fits in the area */
+	while (offs <= resv_sz - HDR_SZ) {
+		if (head->magic == magic) {
+			if (offset)
+				*offset = offs;
+			return head;
+		}
+		/* Terminator reached without finding @magic */
+		if (!head->magic)
+			break;
+		/* Malformed record: bail out instead of spinning in place */
+		if (head->size < HDR_SZ)
+			break;
+
+		offs += head->size;
+		head = GET_RESV_NEXT_HEAD(head);
+	}
+
+	return NULL;
+}
+
+/*
+ * Sanity-check an extra_context record.
+ *
+ * The record must be immediately followed by a terminator (magic and size
+ * both zero); its datap and size fields must be 16-byte aligned and datap
+ * must point right after that terminator.
+ *
+ * @extra: record to check
+ * @err: receives a static string describing the first problem found
+ *
+ * @return: true when the record looks sane; false otherwise, and also when
+ *	    @extra or @err is NULL or *@err was already set on entry.
+ */
+bool validate_extra_context(struct extra_context *extra, char **err)
+{
+	struct _aarch64_ctx *next;
+
+	if (!extra || !err)
+		return false;
+
+	fprintf(stderr, "Validating EXTRA...\n");
+
+	next = GET_RESV_NEXT_HEAD(&extra->head);
+	if (!next || next->magic || next->size) {
+		*err = "Missing terminator after EXTRA context";
+		return false;
+	}
+
+	if (extra->datap & 0x0fUL) {
+		*err = "Extra DATAP misaligned";
+		return false;
+	}
+	if (extra->size & 0x0fUL) {
+		*err = "Extra SIZE misaligned";
+		return false;
+	}
+	if (extra->datap != (uint64_t)next + sizeof(*next)) {
+		*err = "Extra DATAP misplaced (not contiguous)";
+		return false;
+	}
+
+	/* An error string left in place by the caller still counts as failure */
+	return *err == NULL;
+}
+
+/*
+ * Tell whether the size declared by an SVE record is acceptable: either a
+ * bare struct sve_context, or a struct sve_context followed by the vector
+ * length dependent register data (SVE_SIG_CONTEXT_SIZE()), rounded up to a
+ * multiple of 16 bytes.
+ */
+static bool sve_context_size_ok(struct sve_context *sve)
+{
+	size_t regs_sz;
+
+	if (sve->head.size == sizeof(struct sve_context))
+		return true;
+
+	regs_sz = SVE_SIG_CONTEXT_SIZE(sve_vq_from_vl(sve->vl));
+	regs_sz = (regs_sz + 15) & ~(size_t)15;
+
+	return sve->head.size == regs_sz;
+}
+
+/*
+ * Validate the records stored in the __reserved area of a ucontext.
+ *
+ * @uc: context to inspect
+ * @resv_sz: size in bytes of uc->uc_mcontext.__reserved
+ * @err: must point to a NULL-initialised string pointer; on failure it is
+ *	 set to a static string naming the first problem found
+ *
+ * Each record header must be 16-byte aligned, known records must carry the
+ * expected size and appear at most once (FPSIMD, SVE, EXTRA), every header
+ * must fit inside @resv_sz and, once the terminator is reached, an FPSIMD
+ * record must have been seen. Unknown magics are reported on stdout and
+ * skipped.
+ *
+ * @return: true if no problem was found, false otherwise (including when
+ *	    @uc or @err is NULL).
+ */
+bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err)
+{
+	bool terminated = false;
+	size_t offs = 0;
+	int flags = 0;
+	struct extra_context *extra = NULL;
+	struct _aarch64_ctx *head;
+
+	if (!uc || !err)
+		return false;
+
+	head = (struct _aarch64_ctx *)uc->uc_mcontext.__reserved;
+
+	/* Walk till the end terminator verifying __reserved contents */
+	while (head && !terminated && offs < resv_sz) {
+		if ((uint64_t)head & 0x0fUL) {
+			*err = "Misaligned HEAD";
+			return false;
+		}
+
+		switch (head->magic) {
+		case 0:
+			if (head->size)
+				*err = "Bad size for terminator";
+			else
+				terminated = true;
+			break;
+		case FPSIMD_MAGIC:
+			if (flags & FPSIMD_CTX)
+				*err = "Multiple FPSIMD_MAGIC";
+			else if (head->size !=
+				 sizeof(struct fpsimd_context))
+				*err = "Bad size for fpsimd_context";
+			flags |= FPSIMD_CTX;
+			break;
+		case ESR_MAGIC:
+			if (head->size != sizeof(struct esr_context))
+				*err = "Bad size for esr_context";
+			break;
+		case SVE_MAGIC:
+			/*
+			 * The record may carry register data after the
+			 * struct sve_context, so its size is not fixed.
+			 */
+			if (flags & SVE_CTX)
+				*err = "Multiple SVE_MAGIC";
+			else if (!sve_context_size_ok(
+					(struct sve_context *)head))
+				*err = "Bad size for sve_context";
+			flags |= SVE_CTX;
+			break;
+		case EXTRA_MAGIC:
+			if (flags & EXTRA_CTX)
+				*err = "Multiple EXTRA_MAGIC";
+			else if (head->size !=
+				 sizeof(struct extra_context))
+				*err = "Bad size for extra_context";
+			flags |= EXTRA_CTX;
+			extra = (struct extra_context *)head;
+			break;
+		case KSFT_BAD_MAGIC:
+			/*
+			 * This is a BAD magic header defined
+			 * artificially by a testcase and surely
+			 * unknown to the Kernel parse_user_sigframe().
+			 * It MUST cause a Kernel induced SEGV
+			 */
+			*err = "BAD MAGIC !";
+			break;
+		default:
+			/*
+			 * A still unknown Magic: potentially freshly added
+			 * to the Kernel code and still unknown to the
+			 * tests.
+			 */
+			fprintf(stdout,
+				"SKIP Unknown MAGIC: 0x%X - Is KSFT arm64/signal up to date ?\n",
+				head->magic);
+			/*
+			 * A record shorter than its own header would never
+			 * let the walk advance: reject it instead of looping.
+			 */
+			if (head->size < sizeof(*head))
+				*err = "Bad size for unknown record";
+			break;
+		}
+
+		if (*err)
+			return false;
+
+		offs += head->size;
+		if (resv_sz < offs + sizeof(*head)) {
+			*err = "HEAD Overrun";
+			return false;
+		}
+
+		if (flags & EXTRA_CTX)
+			if (!validate_extra_context(extra, err))
+				return false;
+
+		head = GET_RESV_NEXT_HEAD(head);
+	}
+
+	if (terminated && !(flags & FPSIMD_CTX)) {
+		*err = "Missing FPSIMD";
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * This function walks through the records inside the provided reserved area
+ * trying to find enough space to fit @need_sz bytes: if not enough space is
+ * available and an extra_context record is present, it throws away the
+ * extra_context record.
+ *
+ * It returns a pointer to a new header where it is possible to start storing
+ * our need_sz bytes.
+ *
+ * @shead: points to the start of reserved area
+ * @need_sz: needed bytes
+ * @resv_sz: reserved area size in bytes
+ * @offset: if not null, this will be filled with the offset of the return
+ *	    head pointer from @shead
+ *
+ * @return: pointer to a new head where to start storing need_sz bytes, or
+ *	    NULL if space could not be made available.
+ */
+struct _aarch64_ctx *get_starting_head(struct _aarch64_ctx *shead,
+				       size_t need_sz, size_t resv_sz,
+				       size_t *offset)
+{
+	size_t offs = 0;
+	struct _aarch64_ctx *head;
+
+	head = get_terminator(shead, resv_sz, &offs);
+	/* not found a terminator...no need to update offset if any */
+	if (!head)
+		return head;
+	if (resv_sz - offs < need_sz) {
+		/* The amounts are size_t: print them with %zu, not %zd */
+		fprintf(stderr, "Low on space:%zu. Discarding extra_context.\n",
+			resv_sz - offs);
+		/* Reuse the space starting at the extra_context record */
+		head = get_header(shead, EXTRA_MAGIC, resv_sz, &offs);
+		if (!head || resv_sz - offs < need_sz) {
+			fprintf(stderr,
+				"Failed to reclaim space on sigframe.\n");
+			return NULL;
+		}
+	}
+
+	fprintf(stderr, "Available space:%zu\n", resv_sz - offs);
+	if (offset)
+		*offset = offs;
+	return head;
+}
diff --git a/tools/testing/selftests/arm64/signal/testcases/testcases.h b/tools/testing/selftests/arm64/signal/testcases/testcases.h
new file mode 100644
index 000000000..ad884c135
--- /dev/null
+++ b/tools/testing/selftests/arm64/signal/testcases/testcases.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2019 ARM Limited */
+#ifndef __TESTCASES_H__
+#define __TESTCASES_H__
+
+#include <stddef.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <signal.h>
+
+/* Architecture specific sigframe definitions */
+#include <asm/sigcontext.h>
+
+#define FPSIMD_CTX (1 << 0)
+#define SVE_CTX (1 << 1)
+#define EXTRA_CTX (1 << 2)
+
+#define KSFT_BAD_MAGIC 0xdeadbeef
+
+#define HDR_SZ \
+ sizeof(struct _aarch64_ctx)
+
+#define GET_SF_RESV_HEAD(sf) \
+ (struct _aarch64_ctx *)(&(sf).uc.uc_mcontext.__reserved)
+
+#define GET_SF_RESV_SIZE(sf) \
+ sizeof((sf).uc.uc_mcontext.__reserved)
+
+#define GET_UCP_RESV_SIZE(ucp) \
+ sizeof((ucp)->uc_mcontext.__reserved)
+
+/*
+ * Expect @uc to FAIL validation: when validate_reserved() rejects the
+ * context, the reported reason (if any) is logged to stderr and execution
+ * continues; when the context unexpectedly validates, abort() is called.
+ */
+#define ASSERT_BAD_CONTEXT(uc) do { \
+ char *err = NULL; \
+ if (!validate_reserved((uc), GET_UCP_RESV_SIZE((uc)), &err)) { \
+ if (err) \
+ fprintf(stderr, \
+ "Using badly built context - ERR: %s\n",\
+ err); \
+ } else { \
+ abort(); \
+ } \
+} while (0)
+
+/*
+ * Expect @uc to PASS validation: when validate_reserved() rejects the
+ * context, the reported reason (if any) is logged to stderr and abort() is
+ * called; otherwise a confirmation message is logged to stderr.
+ */
+#define ASSERT_GOOD_CONTEXT(uc) do { \
+ char *err = NULL; \
+ if (!validate_reserved((uc), GET_UCP_RESV_SIZE((uc)), &err)) { \
+ if (err) \
+ fprintf(stderr, \
+ "Detected BAD context - ERR: %s\n", err);\
+ abort(); \
+ } else { \
+ fprintf(stderr, "uc context validated.\n"); \
+ } \
+} while (0)
+
+/*
+ * A simple record-walker for the __reserved area: the only assumption it
+ * makes is that it is handed a proper struct _aarch64_ctx header descriptor.
+ *
+ * It makes no assumptions about the content and ordering of the records;
+ * any needed bounds checking must be enforced by the caller if wanted.
+ * This way it can also be used by the caller on maliciously built bad
+ * contexts.
+ *
+ * head->size accounts for both the payload and the _aarch64_ctx header size!
+ */
+#define GET_RESV_NEXT_HEAD(h) \
+ (struct _aarch64_ctx *)((char *)(h) + (h)->size)
+
+/*
+ * A siginfo_t / ucontext_t pair.
+ * NOTE(review): presumably the type that GET_SF_RESV_HEAD() and
+ * GET_SF_RESV_SIZE() expect as 'sf' (both access
+ * (sf).uc.uc_mcontext.__reserved) - confirm against the callers.
+ */
+struct fake_sigframe {
+ siginfo_t info;
+ ucontext_t uc;
+};
+
+
+bool validate_reserved(ucontext_t *uc, size_t resv_sz, char **err);
+
+bool validate_extra_context(struct extra_context *extra, char **err);
+
+struct _aarch64_ctx *get_header(struct _aarch64_ctx *head, uint32_t magic,
+ size_t resv_sz, size_t *offset);
+
+/*
+ * Look up the terminator record, i.e. the record whose magic is 0.
+ * Arguments and return value are those of get_header().
+ */
+static inline struct _aarch64_ctx *get_terminator(struct _aarch64_ctx *head,
+						  size_t resv_sz,
+						  size_t *offset)
+{
+	const uint32_t terminator_magic = 0;
+
+	return get_header(head, terminator_magic, resv_sz, offset);
+}
+
+/*
+ * Turn the record at @tail into a terminator (magic 0, size 0).
+ * A NULL @tail is silently ignored.
+ */
+static inline void write_terminator_record(struct _aarch64_ctx *tail)
+{
+	if (!tail)
+		return;
+
+	tail->size = 0;
+	tail->magic = 0;
+}
+
+struct _aarch64_ctx *get_starting_head(struct _aarch64_ctx *shead,
+ size_t need_sz, size_t resv_sz,
+ size_t *offset);
+#endif
diff --git a/tools/testing/selftests/arm64/tags/.gitignore b/tools/testing/selftests/arm64/tags/.gitignore
new file mode 100644
index 000000000..f4f6c5112
--- /dev/null
+++ b/tools/testing/selftests/arm64/tags/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+tags_test
diff --git a/tools/testing/selftests/arm64/tags/Makefile b/tools/testing/selftests/arm64/tags/Makefile
new file mode 100644
index 000000000..41cb75070
--- /dev/null
+++ b/tools/testing/selftests/arm64/tags/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I../../../../../usr/include/
+TEST_GEN_PROGS := tags_test
+TEST_PROGS := run_tags_test.sh
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/arm64/tags/run_tags_test.sh b/tools/testing/selftests/arm64/tags/run_tags_test.sh
new file mode 100755
index 000000000..745f11379
--- /dev/null
+++ b/tools/testing/selftests/arm64/tags/run_tags_test.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run the tags_test binary from the current directory, print a PASS/FAIL
+# banner and exit with the binary's own status, so that a failure is visible
+# to the caller and not only in the printed output.
+
+echo "--------------------"
+echo "running tags test"
+echo "--------------------"
+./tags_test
+ret=$?
+if [ "$ret" -ne 0 ]; then
+	echo "[FAIL]"
+else
+	echo "[PASS]"
+fi
+exit "$ret"
diff --git a/tools/testing/selftests/arm64/tags/tags_test.c b/tools/testing/selftests/arm64/tags/tags_test.c
new file mode 100644
index 000000000..570116346
--- /dev/null
+++ b/tools/testing/selftests/arm64/tags/tags_test.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/prctl.h>
+#include <sys/utsname.h>
+
+#define SHIFT_TAG(tag)		((uint64_t)(tag) << 56)
+#define SET_TAG(ptr, tag)	(((uint64_t)(ptr) & ~SHIFT_TAG(0xff)) | \
+					SHIFT_TAG(tag))
+
+/*
+ * Call uname() through a pointer whose top byte carries a tag.
+ *
+ * The tag is 0x42 when the PR_SET_TAGGED_ADDR_CTRL prctl() succeeds and 0
+ * otherwise. Returns the result of uname(), or EXIT_FAILURE if the buffer
+ * cannot be allocated.
+ */
+int main(void)
+{
+	static int tbi_enabled = 0;
+	unsigned long tag = 0;
+	struct utsname *buf, *tagged;
+	int err;
+
+	if (prctl(PR_SET_TAGGED_ADDR_CTRL, PR_TAGGED_ADDR_ENABLE, 0, 0, 0) == 0)
+		tbi_enabled = 1;
+
+	buf = malloc(sizeof(*buf));
+	if (!buf) {
+		perror("malloc");
+		return EXIT_FAILURE;
+	}
+
+	if (tbi_enabled)
+		tag = 0x42;
+	tagged = (struct utsname *)SET_TAG(buf, tag);
+	err = uname(tagged);
+	/* Release through the pointer malloc() returned, not the tagged alias */
+	free(buf);
+
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
new file mode 100644
index 000000000..b1b37dcad
--- /dev/null
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_verifier
+test_maps
+test_lru_map
+test_lpm_map
+test_tag
+FEATURE-DUMP.libbpf
+fixdep
+test_dev_cgroup
+/test_progs*
+test_tcpbpf_user
+!test_progs.h
+test_verifier_log
+feature
+test_sock
+test_sock_addr
+urandom_read
+test_sockmap
+test_lirc_mode2_user
+get_cgroup_id_user
+test_skb_cgroup_id_user
+test_socket_cookie
+test_cgroup_storage
+test_flow_dissector
+flow_dissector_load
+test_netcnt
+test_tcpnotify_user
+test_libbpf
+test_tcp_check_syncookie_user
+test_sysctl
+test_current_pid_tgid_new_ns
+xdping
+test_cpp
+*.skel.h
+/no_alu32
+/bpf_gcc
+/tools
+/runqslower
+/bench
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
new file mode 100644
index 000000000..a845724e0
--- /dev/null
+++ b/tools/testing/selftests/bpf/Makefile
@@ -0,0 +1,466 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../../../../scripts/Kbuild.include
+include ../../../scripts/Makefile.arch
+
+CXX ?= $(CROSS_COMPILE)g++
+
+CURDIR := $(abspath .)
+TOOLSDIR := $(abspath ../../..)
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+APIDIR := $(TOOLSINCDIR)/uapi
+GENDIR := $(abspath ../../../../include/generated)
+GENHDR := $(GENDIR)/autoconf.h
+
+ifneq ($(wildcard $(GENHDR)),)
+ GENFLAGS := -DHAVE_GENHDR
+endif
+
+CLANG ?= clang
+LLC ?= llc
+LLVM_OBJCOPY ?= llvm-objcopy
+BPF_GCC ?= $(shell command -v bpf-gcc;)
+SAN_CFLAGS ?=
+CFLAGS += -g -rdynamic -Wall -O2 $(GENFLAGS) $(SAN_CFLAGS) \
+ -I$(CURDIR) -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
+ -I$(TOOLSINCDIR) -I$(APIDIR) \
+ -Dbpf_prog_load=bpf_prog_test_load \
+ -Dbpf_load_program=bpf_test_load_program
+LDLIBS += -lcap -lelf -lz -lrt -lpthread
+
+# Order corresponds to 'make run_tests' order
+TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \
+ test_verifier_log test_dev_cgroup test_tcpbpf_user \
+ test_sock test_sockmap get_cgroup_id_user test_socket_cookie \
+ test_cgroup_storage \
+ test_netcnt test_tcpnotify_user test_sysctl \
+ test_progs-no_alu32 \
+ test_current_pid_tgid_new_ns
+
+# Also test bpf-gcc, if present
+ifneq ($(BPF_GCC),)
+TEST_GEN_PROGS += test_progs-bpf_gcc
+endif
+
+TEST_GEN_FILES =
+TEST_FILES = test_lwt_ip_encap.o \
+ test_tc_edt.o
+
+# Order corresponds to 'make run_tests' order
+TEST_PROGS := test_kmod.sh \
+ test_xdp_redirect.sh \
+ test_xdp_meta.sh \
+ test_xdp_veth.sh \
+ test_offload.py \
+ test_sock_addr.sh \
+ test_tunnel.sh \
+ test_lwt_seg6local.sh \
+ test_lirc_mode2.sh \
+ test_skb_cgroup_id.sh \
+ test_flow_dissector.sh \
+ test_xdp_vlan_mode_generic.sh \
+ test_xdp_vlan_mode_native.sh \
+ test_lwt_ip_encap.sh \
+ test_tcp_check_syncookie.sh \
+ test_tc_tunnel.sh \
+ test_tc_edt.sh \
+ test_xdping.sh \
+ test_bpftool_build.sh \
+ test_bpftool.sh \
+ test_bpftool_metadata.sh \
+
+TEST_PROGS_EXTENDED := with_addr.sh \
+ with_tunnels.sh \
+ tcp_client.py \
+ tcp_server.py \
+ test_xdp_vlan.sh
+
+# Compile but not part of 'make run_tests'
+TEST_GEN_PROGS_EXTENDED = test_sock_addr test_skb_cgroup_id_user \
+ flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
+ test_lirc_mode2_user xdping test_cpp runqslower bench
+
+TEST_CUSTOM_PROGS = urandom_read
+
+# Emit succinct information message describing current building step
+# $1 - generic step name (e.g., CC, LINK, etc);
+# $2 - optional "flavor" specifier; if provided, will be emitted as [flavor];
+# $3 - target (assumed to be file); only file name will be emitted;
+# $4 - optional extra arg, emitted as-is, if provided.
+ifeq ($(V),1)
+Q =
+msg =
+else
+Q = @
+msg = @printf ' %-8s%s %s%s\n' "$(1)" "$(if $(2), [$(2)])" "$(notdir $(3))" "$(if $(4), $(4))";
+MAKEFLAGS += --no-print-directory
+submake_extras := feature_display=0
+endif
+
+# override lib.mk's default rules
+OVERRIDE_TARGETS := 1
+override define CLEAN
+ $(call msg,CLEAN)
+ $(Q)$(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN)
+endef
+
+include ../lib.mk
+
+SCRATCH_DIR := $(OUTPUT)/tools
+BUILD_DIR := $(SCRATCH_DIR)/build
+INCLUDE_DIR := $(SCRATCH_DIR)/include
+BPFOBJ := $(BUILD_DIR)/libbpf/libbpf.a
+RESOLVE_BTFIDS := $(BUILD_DIR)/resolve_btfids/resolve_btfids
+
+# Define simple and short `make test_progs`, `make test_sysctl`, etc targets
+# to build individual tests.
+# NOTE: Semicolon at the end is critical to override lib.mk's default static
+# rule for binaries.
+$(notdir $(TEST_GEN_PROGS) \
+ $(TEST_GEN_PROGS_EXTENDED) \
+ $(TEST_CUSTOM_PROGS)): %: $(OUTPUT)/% ;
+
+$(OUTPUT)/%.o: %.c
+ $(call msg,CC,,$@)
+ $(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
+
+$(OUTPUT)/%:%.c
+ $(call msg,BINARY,,$@)
+ $(Q)$(LINK.c) $^ $(LDLIBS) -o $@
+
+$(OUTPUT)/urandom_read: urandom_read.c
+ $(call msg,BINARY,,$@)
+ $(Q)$(CC) $(LDFLAGS) -o $@ $< $(LDLIBS) -Wl,--build-id=sha1
+
+$(OUTPUT)/test_stub.o: test_stub.c $(BPFOBJ)
+ $(call msg,CC,,$@)
+ $(Q)$(CC) -c $(CFLAGS) -o $@ $<
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
+ ../../../../vmlinux \
+ /sys/kernel/btf/vmlinux \
+ /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+DEFAULT_BPFTOOL := $(SCRATCH_DIR)/sbin/bpftool
+
+$(OUTPUT)/runqslower: $(BPFOBJ) | $(DEFAULT_BPFTOOL)
+ $(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/runqslower \
+ OUTPUT=$(SCRATCH_DIR)/ VMLINUX_BTF=$(VMLINUX_BTF) \
+ BPFOBJ=$(BPFOBJ) BPF_INCLUDE=$(INCLUDE_DIR) && \
+ cp $(SCRATCH_DIR)/runqslower $@
+
+$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/test_stub.o $(BPFOBJ)
+
+$(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
+$(OUTPUT)/test_skb_cgroup_id_user: cgroup_helpers.c
+$(OUTPUT)/test_sock: cgroup_helpers.c
+$(OUTPUT)/test_sock_addr: cgroup_helpers.c
+$(OUTPUT)/test_socket_cookie: cgroup_helpers.c
+$(OUTPUT)/test_sockmap: cgroup_helpers.c
+$(OUTPUT)/test_tcpbpf_user: cgroup_helpers.c
+$(OUTPUT)/test_tcpnotify_user: cgroup_helpers.c trace_helpers.c
+$(OUTPUT)/get_cgroup_id_user: cgroup_helpers.c
+$(OUTPUT)/test_cgroup_storage: cgroup_helpers.c
+$(OUTPUT)/test_netcnt: cgroup_helpers.c
+$(OUTPUT)/test_sock_fields: cgroup_helpers.c
+$(OUTPUT)/test_sysctl: cgroup_helpers.c
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \
+ $(BPFOBJ) | $(BUILD_DIR)/bpftool
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \
+ OUTPUT=$(BUILD_DIR)/bpftool/ \
+ prefix= DESTDIR=$(SCRATCH_DIR)/ install
+ $(Q)mkdir -p $(BUILD_DIR)/bpftool/Documentation
+ $(Q)RST2MAN_OPTS="--exit-status=1" $(MAKE) $(submake_extras) \
+ -C $(BPFTOOLDIR)/Documentation \
+ OUTPUT=$(BUILD_DIR)/bpftool/Documentation/ \
+ prefix= DESTDIR=$(SCRATCH_DIR)/ install
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
+ ../../../include/uapi/linux/bpf.h \
+ | $(INCLUDE_DIR) $(BUILD_DIR)/libbpf
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(BUILD_DIR)/libbpf/ \
+ DESTDIR=$(SCRATCH_DIR) prefix= all install_headers
+
+$(BUILD_DIR)/libbpf $(BUILD_DIR)/bpftool $(BUILD_DIR)/resolve_btfids $(INCLUDE_DIR):
+ $(call msg,MKDIR,,$@)
+ $(Q)mkdir -p $@
+
+$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
+ifeq ($(VMLINUX_H),)
+ $(call msg,GEN,,$@)
+ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+ $(call msg,CP,,$@)
+ $(Q)cp "$(VMLINUX_H)" $@
+endif
+
+# Build the resolve_btfids tool through its own Makefile.
+# The tool's sources are normal prerequisites (listed before '|') so that
+# editing any of them re-runs the sub-make; only the build directory, which
+# merely has to exist, is order-only.
+$(RESOLVE_BTFIDS): $(BPFOBJ) \
+		   $(TOOLSDIR)/bpf/resolve_btfids/main.c \
+		   $(TOOLSDIR)/lib/rbtree.c \
+		   $(TOOLSDIR)/lib/zalloc.c \
+		   $(TOOLSDIR)/lib/string.c \
+		   $(TOOLSDIR)/lib/ctype.c \
+		   $(TOOLSDIR)/lib/str_error_r.c \
+		   | $(BUILD_DIR)/resolve_btfids
+	$(Q)$(MAKE) $(submake_extras) -C $(TOOLSDIR)/bpf/resolve_btfids \
+		OUTPUT=$(BUILD_DIR)/resolve_btfids/ BPFOBJ=$(BPFOBJ)
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '#define __riscv_xlen ' | sed 's/#define /-D/' | sed 's/ /=/')
+endef
+
+# Determine target endianness.
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+ grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+MENDIAN=$(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian)
+
+CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG))
+BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) $(MENDIAN) \
+ -I$(INCLUDE_DIR) -I$(CURDIR) -I$(APIDIR) \
+ -I$(abspath $(OUTPUT)/../usr/include)
+
+CLANG_CFLAGS = $(CLANG_SYS_INCLUDES) \
+ -Wno-compare-distinct-pointer-types
+
+$(OUTPUT)/test_l4lb_noinline.o: BPF_CFLAGS += -fno-inline
+$(OUTPUT)/test_xdp_noinline.o: BPF_CFLAGS += -fno-inline
+
+$(OUTPUT)/flow_dissector_load.o: flow_dissector_load.h
+
+# Build BPF object using Clang
+# $1 - input .c file
+# $2 - output .o file
+# $3 - CFLAGS
+# $4 - LDFLAGS
+define CLANG_BPF_BUILD_RULE
+ $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2)
+ $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \
+ -c $1 -o - || echo "BPF obj compilation failed") | \
+ $(LLC) -mattr=dwarfris -march=bpf -mcpu=v3 $4 -filetype=obj -o $2
+endef
+# Similar to CLANG_BPF_BUILD_RULE, but with disabled alu32
+define CLANG_NOALU32_BPF_BUILD_RULE
+ $(call msg,CLNG-LLC,$(TRUNNER_BINARY),$2)
+ $(Q)($(CLANG) $3 -O2 -target bpf -emit-llvm \
+ -c $1 -o - || echo "BPF obj compilation failed") | \
+ $(LLC) -march=bpf -mcpu=v2 $4 -filetype=obj -o $2
+endef
+# Similar to CLANG_BPF_BUILD_RULE, but using native Clang and bpf LLC
+define CLANG_NATIVE_BPF_BUILD_RULE
+ $(call msg,CLNG-BPF,$(TRUNNER_BINARY),$2)
+ $(Q)($(CLANG) $3 -O2 -emit-llvm \
+ -c $1 -o - || echo "BPF obj compilation failed") | \
+ $(LLC) -march=bpf -mcpu=v3 $4 -filetype=obj -o $2
+endef
+# Build BPF object using GCC
+define GCC_BPF_BUILD_RULE
+ $(call msg,GCC-BPF,$(TRUNNER_BINARY),$2)
+ $(Q)$(BPF_GCC) $3 $4 -O2 -c $1 -o $2
+endef
+
+SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
+
+# Set up extra TRUNNER_XXX "temporary" variables in the environment (relies on
+# $eval()) and pass control to DEFINE_TEST_RUNNER_RULES.
+# Parameters:
+# $1 - test runner base binary name (e.g., test_progs)
+# $2 - test runner extra "flavor" (e.g., no_alu32, gcc-bpf, etc)
+define DEFINE_TEST_RUNNER
+
+TRUNNER_OUTPUT := $(OUTPUT)$(if $2,/)$2
+TRUNNER_BINARY := $1$(if $2,-)$2
+TRUNNER_TEST_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.test.o, \
+ $$(notdir $$(wildcard $(TRUNNER_TESTS_DIR)/*.c)))
+TRUNNER_EXTRA_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, \
+ $$(filter %.c,$(TRUNNER_EXTRA_SOURCES)))
+TRUNNER_EXTRA_HDRS := $$(filter %.h,$(TRUNNER_EXTRA_SOURCES))
+TRUNNER_TESTS_HDR := $(TRUNNER_TESTS_DIR)/tests.h
+TRUNNER_BPF_SRCS := $$(notdir $$(wildcard $(TRUNNER_BPF_PROGS_DIR)/*.c))
+TRUNNER_BPF_OBJS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.o, $$(TRUNNER_BPF_SRCS))
+TRUNNER_BPF_SKELS := $$(patsubst %.c,$$(TRUNNER_OUTPUT)/%.skel.h, \
+ $$(filter-out $(SKEL_BLACKLIST), \
+ $$(TRUNNER_BPF_SRCS)))
+TEST_GEN_FILES += $$(TRUNNER_BPF_OBJS)
+
+# Evaluate rules now with extra TRUNNER_XXX variables above already defined
+$$(eval $$(call DEFINE_TEST_RUNNER_RULES,$1,$2))
+
+endef
+
+# Using TRUNNER_XXX variables, provided by callers of DEFINE_TEST_RUNNER and
+# set up by DEFINE_TEST_RUNNER itself, create test runner build rules with:
+# $1 - test runner base binary name (e.g., test_progs)
+# $2 - test runner extra "flavor" (e.g., no_alu32, gcc-bpf, etc)
+define DEFINE_TEST_RUNNER_RULES
+
+ifeq ($($(TRUNNER_OUTPUT)-dir),)
+$(TRUNNER_OUTPUT)-dir := y
+$(TRUNNER_OUTPUT):
+ $$(call msg,MKDIR,,$$@)
+ $(Q)mkdir -p $$@
+endif
+
+# ensure we set up BPF objects generation rule just once for a given
+# input/output directory combination
+ifeq ($($(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs),)
+$(TRUNNER_BPF_PROGS_DIR)$(if $2,-)$2-bpfobjs := y
+$(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \
+ $(TRUNNER_BPF_PROGS_DIR)/%.c \
+ $(TRUNNER_BPF_PROGS_DIR)/*.h \
+ $$(INCLUDE_DIR)/vmlinux.h \
+ $(wildcard $(BPFDIR)/bpf_*.h) \
+ | $(TRUNNER_OUTPUT) $$(BPFOBJ)
+ $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \
+ $(TRUNNER_BPF_CFLAGS), \
+ $(TRUNNER_BPF_LDFLAGS))
+
+$(TRUNNER_BPF_SKELS): $(TRUNNER_OUTPUT)/%.skel.h: \
+ $(TRUNNER_OUTPUT)/%.o \
+ $(BPFTOOL) \
+ | $(TRUNNER_OUTPUT)
+ $$(call msg,GEN-SKEL,$(TRUNNER_BINARY),$$@)
+ $(Q)$$(BPFTOOL) gen skeleton $$< > $$@
+endif
+
+# ensure we set up tests.h header generation rule just once
+ifeq ($($(TRUNNER_TESTS_DIR)-tests-hdr),)
+$(TRUNNER_TESTS_DIR)-tests-hdr := y
+$(TRUNNER_TESTS_HDR): $(TRUNNER_TESTS_DIR)/*.c
+ $$(call msg,TEST-HDR,$(TRUNNER_BINARY),$$@)
+ $$(shell ( cd $(TRUNNER_TESTS_DIR); \
+ echo '/* Generated header, do not edit */'; \
+ ls *.c 2> /dev/null | \
+ sed -e 's@\([^\.]*\)\.c@DEFINE_TEST(\1)@'; \
+ ) > $$@)
+endif
+
+# compile individual test files
+# Note: we cd into output directory to ensure embedded BPF object is found
+$(TRUNNER_TEST_OBJS): $(TRUNNER_OUTPUT)/%.test.o: \
+ $(TRUNNER_TESTS_DIR)/%.c \
+ $(TRUNNER_EXTRA_HDRS) \
+ $(TRUNNER_BPF_OBJS) \
+ $(TRUNNER_BPF_SKELS) \
+ $$(BPFOBJ) | $(TRUNNER_OUTPUT)
+ $$(call msg,TEST-OBJ,$(TRUNNER_BINARY),$$@)
+ $(Q)cd $$(@D) && $$(CC) -I. $$(CFLAGS) -c $(CURDIR)/$$< $$(LDLIBS) -o $$(@F)
+
+$(TRUNNER_EXTRA_OBJS): $(TRUNNER_OUTPUT)/%.o: \
+ %.c \
+ $(TRUNNER_EXTRA_HDRS) \
+ $(TRUNNER_TESTS_HDR) \
+ $$(BPFOBJ) | $(TRUNNER_OUTPUT)
+ $$(call msg,EXT-OBJ,$(TRUNNER_BINARY),$$@)
+ $(Q)$$(CC) $$(CFLAGS) -c $$< $$(LDLIBS) -o $$@
+
+# only copy extra resources if in flavored build
+$(TRUNNER_BINARY)-extras: $(TRUNNER_EXTRA_FILES) | $(TRUNNER_OUTPUT)
+ifneq ($2,)
+ $$(call msg,EXT-COPY,$(TRUNNER_BINARY),$(TRUNNER_EXTRA_FILES))
+ $(Q)cp -a $$^ $(TRUNNER_OUTPUT)/
+endif
+
+$(OUTPUT)/$(TRUNNER_BINARY): $(TRUNNER_TEST_OBJS) \
+ $(TRUNNER_EXTRA_OBJS) $$(BPFOBJ) \
+ $(RESOLVE_BTFIDS) \
+ | $(TRUNNER_BINARY)-extras
+ $$(call msg,BINARY,,$$@)
+ $(Q)$$(CC) $$(CFLAGS) $$(filter %.a %.o,$$^) $$(LDLIBS) -o $$@
+ $(Q)$(RESOLVE_BTFIDS) --no-fail --btf btf_data.o $$@
+
+endef
+
+# Define test_progs test runner.
+TRUNNER_TESTS_DIR := prog_tests
+TRUNNER_BPF_PROGS_DIR := progs
+TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \
+ network_helpers.c testing_helpers.c \
+ flow_dissector_load.h
+TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read \
+ $(wildcard progs/btf_dump_test_case_*.c)
+TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE
+TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(CLANG_CFLAGS)
+TRUNNER_BPF_LDFLAGS := -mattr=+alu32
+$(eval $(call DEFINE_TEST_RUNNER,test_progs))
+
+# Define test_progs-no_alu32 test runner.
+TRUNNER_BPF_BUILD_RULE := CLANG_NOALU32_BPF_BUILD_RULE
+TRUNNER_BPF_LDFLAGS :=
+$(eval $(call DEFINE_TEST_RUNNER,test_progs,no_alu32))
+
+# Define test_progs BPF-GCC-flavored test runner.
+ifneq ($(BPF_GCC),)
+TRUNNER_BPF_BUILD_RULE := GCC_BPF_BUILD_RULE
+TRUNNER_BPF_CFLAGS := $(BPF_CFLAGS) $(call get_sys_includes,gcc)
+TRUNNER_BPF_LDFLAGS :=
+$(eval $(call DEFINE_TEST_RUNNER,test_progs,bpf_gcc))
+endif
+
+# Define test_maps test runner.
+TRUNNER_TESTS_DIR := map_tests
+TRUNNER_BPF_PROGS_DIR := progs
+TRUNNER_EXTRA_SOURCES := test_maps.c
+TRUNNER_EXTRA_FILES :=
+TRUNNER_BPF_BUILD_RULE := $$(error no BPF objects should be built)
+TRUNNER_BPF_CFLAGS :=
+TRUNNER_BPF_LDFLAGS :=
+$(eval $(call DEFINE_TEST_RUNNER,test_maps))
+
+# Define test_verifier test runner.
+# It is much simpler than test_maps/test_progs and sufficiently different from
+# them (e.g., tests.h is using a completely different pattern), that it's worth
+# just defining all the rules explicitly.
+verifier/tests.h: verifier/*.c
+ $(shell ( cd verifier/; \
+ echo '/* Generated header, do not edit */'; \
+ echo '#ifdef FILL_ARRAY'; \
+ ls *.c 2> /dev/null | sed -e 's@\(.*\)@#include \"\1\"@'; \
+ echo '#endif' \
+ ) > verifier/tests.h)
+$(OUTPUT)/test_verifier: test_verifier.c verifier/tests.h $(BPFOBJ) | $(OUTPUT)
+ $(call msg,BINARY,,$@)
+ $(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
+# Make sure we are able to include and link libbpf against c++.
+$(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
+ $(call msg,CXX,,$@)
+ $(Q)$(CXX) $(CFLAGS) $^ $(LDLIBS) -o $@
+
+# Benchmark runner
+$(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h
+ $(call msg,CC,,$@)
+ $(Q)$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
+$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
+$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
+$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
+ $(OUTPUT)/perfbuf_bench.skel.h
+$(OUTPUT)/bench.o: bench.h testing_helpers.h
+$(OUTPUT)/bench: LDLIBS += -lm
+$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
+ $(OUTPUT)/bench_count.o \
+ $(OUTPUT)/bench_rename.o \
+ $(OUTPUT)/bench_trigger.o \
+ $(OUTPUT)/bench_ringbufs.o
+ $(call msg,BINARY,,$@)
+ $(Q)$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
+
+EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(SCRATCH_DIR) \
+ prog_tests/tests.h map_tests/tests.h verifier/tests.h \
+ feature \
+ $(addprefix $(OUTPUT)/,*.o *.skel.h no_alu32 bpf_gcc)
diff --git a/tools/testing/selftests/bpf/README.rst b/tools/testing/selftests/bpf/README.rst
new file mode 100644
index 000000000..ac9eda830
--- /dev/null
+++ b/tools/testing/selftests/bpf/README.rst
@@ -0,0 +1,104 @@
+==================
+BPF Selftest Notes
+==================
+General instructions on running selftests can be found in
+`Documentation/bpf/bpf_devel_QA.rst`_.
+
+Additional information about selftest failures is
+documented here.
+
+profiler[23] test failures with clang/llvm <12.0.0
+==================================================
+
+With clang/llvm <12.0.0, the profiler[23] test may fail.
+The symptom looks like
+
+.. code-block:: c
+
+ // r9 is a pointer to map_value
+ // r7 is a scalar
+ 17: bf 96 00 00 00 00 00 00 r6 = r9
+ 18: 0f 76 00 00 00 00 00 00 r6 += r7
+ math between map_value pointer and register with unbounded min value is not allowed
+
+ // the instructions below will not be seen in the verifier log
+ 19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1
+ 20: bf 96 00 00 00 00 00 00 r6 = r9
+ // r6 is used here
+
+The verifier will reject such code with the above error.
+At insn 18, r7 is indeed unbounded. The later insn 19 checks the bounds and
+insn 20 undoes the map_value addition. It is currently impossible for the
+verifier to understand such speculative pointer arithmetic.
+Hence
+ https://reviews.llvm.org/D85570
+addresses it on the compiler side. It was committed on llvm 12.
+
+The corresponding C code looks like:
+
+.. code-block:: c
+
+ for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
+ filepart_length = bpf_probe_read_str(payload, ...);
+ if (filepart_length <= MAX_PATH) {
+ barrier_var(filepart_length); // workaround
+ payload += filepart_length;
+ }
+ }
+
+bpf_iter test failures with clang/llvm 10.0.0
+=============================================
+
+With clang/llvm 10.0.0, the following two bpf_iter tests failed:
+ * ``bpf_iter/ipv6_route``
+ * ``bpf_iter/netlink``
+
+The symptom for ``bpf_iter/ipv6_route`` looks like
+
+.. code-block:: c
+
+ 2: (79) r8 = *(u64 *)(r1 +8)
+ ...
+ 14: (bf) r2 = r8
+ 15: (0f) r2 += r1
+ ; BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
+ 16: (7b) *(u64 *)(r8 +64) = r2
+ only read is supported
+
+The symptom for ``bpf_iter/netlink`` looks like
+
+.. code-block:: c
+
+ ; struct netlink_sock *nlk = ctx->sk;
+ 2: (79) r7 = *(u64 *)(r1 +8)
+ ...
+ 15: (bf) r2 = r7
+ 16: (0f) r2 += r1
+ ; BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
+ 17: (7b) *(u64 *)(r7 +0) = r2
+ only read is supported
+
+This is due to an llvm BPF backend bug. The fix
+ https://reviews.llvm.org/D78466
+has been pushed to the llvm 10.x release branch and will be
+available in 10.0.1. The fix is also available in llvm 11.0.0 trunk.
+
+BPF CO-RE-based tests and Clang version
+=======================================
+
+A set of selftests use BPF target-specific built-ins, which might require
+bleeding-edge Clang versions (Clang 12 nightly at this time).
+
+A few sub-tests of the core_reloc test suite (part of the test_progs test
+runner) require the following built-ins, listed with the corresponding Clang
+diffs introducing them to Clang/LLVM. These sub-tests are going to be skipped
+if Clang is too old to support them; they shouldn't cause build failures or
+runtime test failures:
+
+ - __builtin_btf_type_id() ([0], [1], [2]);
+ - __builtin_preserve_type_info(), __builtin_preserve_enum_value() ([3], [4]).
+
+ [0] https://reviews.llvm.org/D74572
+ [1] https://reviews.llvm.org/D74668
+ [2] https://reviews.llvm.org/D85174
+ [3] https://reviews.llvm.org/D83878
+ [4] https://reviews.llvm.org/D83242
diff --git a/tools/testing/selftests/bpf/bench.c b/tools/testing/selftests/bpf/bench.c
new file mode 100644
index 000000000..332ed2f7b
--- /dev/null
+++ b/tools/testing/selftests/bpf/bench.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <argp.h>
+#include <linux/compiler.h>
+#include <sys/time.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/sysinfo.h>
+#include <sys/resource.h>
+#include <signal.h>
+#include "bench.h"
+#include "testing_helpers.h"
+
+/* Global benchmark settings with their defaults; parse_arg() overwrites
+ * individual fields from the command line.
+ */
+struct env env = {
+ .warmup_sec = 1,
+ .duration_sec = 5,
+ .affinity = false,
+ .consumer_cnt = 1,
+ .producer_cnt = 1,
+};
+
+/* libbpf log callback: forward every message to stderr, except that
+ * debug-level messages are dropped unless verbose mode is enabled.
+ * Returns 0 for a dropped message, otherwise the vfprintf() result.
+ */
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	bool suppressed = (level == LIBBPF_DEBUG) && !env.verbose;
+
+	return suppressed ? 0 : vfprintf(stderr, format, args);
+}
+
+/* Raise both the soft and hard RLIMIT_MEMLOCK limits to RLIM_INFINITY.
+ * Returns whatever setrlimit() returns.
+ */
+static int bump_memlock_rlimit(void)
+{
+	struct rlimit unlimited;
+
+	unlimited.rlim_cur = RLIM_INFINITY;
+	unlimited.rlim_max = RLIM_INFINITY;
+
+	return setrlimit(RLIMIT_MEMLOCK, &unlimited);
+}
+
+/* Common libbpf initialisation used by the benchmarks: install the log
+ * callback and lift RLIMIT_MEMLOCK. Failing to lift the limit is reported on
+ * stderr but is not fatal.
+ */
+void setup_libbpf()
+{
+	libbpf_set_print(libbpf_print_fn);
+
+	/* setrlimit() only ever returns 0 or -1, so print errno to show the
+	 * actual cause, and terminate the line so later output stays readable.
+	 */
+	if (bump_memlock_rlimit())
+		fprintf(stderr, "failed to increase RLIMIT_MEMLOCK: %d\n", -errno);
+}
+
+/* Print one progress line for sample #iter: hit and drop rates in millions
+ * per second (plus hits per producer) and how far, in microseconds, the
+ * sampling interval delta_ns deviated from one second.
+ */
+void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns)
+{
+	double elapsed_sec = delta_ns / 1000000000.0;
+	double hits_rate = res->hits / 1000000.0 / elapsed_sec;
+	double drops_rate = res->drops / 1000000.0 / elapsed_sec;
+	double per_prod_rate = hits_rate / env.producer_cnt;
+
+	printf("Iter %3d (%7.3lfus): ",
+	       iter, (delta_ns - 1000000000) / 1000.0);
+	printf("hits %8.3lfM/s (%7.3lfM/prod), drops %8.3lfM/s\n",
+	       hits_rate, per_prod_rate, drops_rate);
+}
+
+/* Print the end-of-run summary: mean and sample standard deviation of the
+ * hits and drops counters over res_cnt samples, each scaled down by 1e6.
+ * The per-producer figure divides the mean hits by env.producer_cnt.
+ */
+void hits_drops_report_final(struct bench_res res[], int res_cnt)
+{
+ int i;
+ double hits_mean = 0.0, drops_mean = 0.0;
+ double hits_stddev = 0.0, drops_stddev = 0.0;
+
+ /* each sample contributes value / res_cnt to the running mean */
+ for (i = 0; i < res_cnt; i++) {
+ hits_mean += res[i].hits / 1000000.0 / (0.0 + res_cnt);
+ drops_mean += res[i].drops / 1000000.0 / (0.0 + res_cnt);
+ }
+
+ /* the variance uses a (res_cnt - 1) divisor, so with fewer than two
+  * samples the deviations are left at 0
+  */
+ if (res_cnt > 1) {
+ for (i = 0; i < res_cnt; i++) {
+ hits_stddev += (hits_mean - res[i].hits / 1000000.0) *
+ (hits_mean - res[i].hits / 1000000.0) /
+ (res_cnt - 1.0);
+ drops_stddev += (drops_mean - res[i].drops / 1000000.0) *
+ (drops_mean - res[i].drops / 1000000.0) /
+ (res_cnt - 1.0);
+ }
+ hits_stddev = sqrt(hits_stddev);
+ drops_stddev = sqrt(drops_stddev);
+ }
+ printf("Summary: hits %8.3lf \u00B1 %5.3lfM/s (%7.3lfM/prod), ",
+ hits_mean, hits_stddev, hits_mean / env.producer_cnt);
+ printf("drops %8.3lf \u00B1 %5.3lfM/s\n",
+ drops_mean, drops_stddev);
+}
+
+/* argp_program_version and argp_program_bug_address are the global hooks
+ * read by glibc's argp; argp_program_doc is passed as .doc of the parser
+ * built in parse_cmdline_args().
+ */
+const char *argp_program_version = "benchmark";
+const char *argp_program_bug_address = "<bpf@vger.kernel.org>";
+const char argp_program_doc[] =
+"benchmark Generic benchmarking framework.\n"
+"\n"
+"This tool runs benchmarks.\n"
+"\n"
+"USAGE: benchmark <bench-name>\n"
+"\n"
+"EXAMPLES:\n"
+" # run 'count-local' benchmark with 1 producer and 1 consumer\n"
+" benchmark count-local\n"
+" # run 'count-local' with 16 producer and 8 consumer thread, pinned to CPUs\n"
+" benchmark -p16 -c8 -a count-local\n";
+
+/* Option keys for the long-only options; the values are not printable
+ * characters, so these options have no short form.
+ */
+enum {
+ ARG_PROD_AFFINITY_SET = 1000,
+ ARG_CONS_AFFINITY_SET = 1001,
+};
+
+/* Options common to all benchmarks; handled by parse_arg(). */
+static const struct argp_option opts[] = {
+ { "list", 'l', NULL, 0, "List available benchmarks"},
+ { "duration", 'd', "SEC", 0, "Duration of benchmark, seconds"},
+ { "warmup", 'w', "SEC", 0, "Warm-up period, seconds"},
+ { "producers", 'p', "NUM", 0, "Number of producer threads"},
+ { "consumers", 'c', "NUM", 0, "Number of consumer threads"},
+ { "verbose", 'v', NULL, 0, "Verbose debug output"},
+ { "affinity", 'a', NULL, 0, "Set consumer/producer thread affinity"},
+ { "prod-affinity", ARG_PROD_AFFINITY_SET, "CPUSET", 0,
+ "Set of CPUs for producer threads; implies --affinity"},
+ { "cons-affinity", ARG_CONS_AFFINITY_SET, "CPUSET", 0,
+ "Set of CPUs for consumer threads; implies --affinity"},
+ {},
+};
+
+/* Defined by the ring buffer benchmarks; provides their extra options. */
+extern struct argp bench_ringbufs_argp;
+
+/* Child parsers chained after the common options. */
+static const struct argp_child bench_parsers[] = {
+ { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
+ {},
+};
+
+/* argp callback for the common benchmark options.
+ *
+ * Stores each recognised option into the global env. Numeric options must be
+ * strictly positive, otherwise an error is printed and argp_usage() is
+ * called. Exactly one positional argument (the benchmark name) is accepted.
+ * Returns 0 when the key was handled and ARGP_ERR_UNKNOWN otherwise, so that
+ * child parsers get a chance to handle it.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	static int pos_args;
+
+	switch (key) {
+	case 'v':
+		env.verbose = true;
+		break;
+	case 'l':
+		env.list = true;
+		break;
+	case 'd':
+		env.duration_sec = strtol(arg, NULL, 10);
+		if (env.duration_sec <= 0) {
+			fprintf(stderr, "Invalid duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'w':
+		env.warmup_sec = strtol(arg, NULL, 10);
+		if (env.warmup_sec <= 0) {
+			fprintf(stderr, "Invalid warm-up duration: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'p':
+		env.producer_cnt = strtol(arg, NULL, 10);
+		if (env.producer_cnt <= 0) {
+			fprintf(stderr, "Invalid producer count: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'c':
+		env.consumer_cnt = strtol(arg, NULL, 10);
+		if (env.consumer_cnt <= 0) {
+			fprintf(stderr, "Invalid consumer count: %s\n", arg);
+			argp_usage(state);
+		}
+		break;
+	case 'a':
+		env.affinity = true;
+		break;
+	case ARG_PROD_AFFINITY_SET:
+		/* an explicit CPU set implies --affinity */
+		env.affinity = true;
+		if (parse_num_list(arg, &env.prod_cpus.cpus,
+				   &env.prod_cpus.cpus_len)) {
+			/* newline-terminated like the other diagnostics, so
+			 * the usage text that follows starts on its own line
+			 */
+			fprintf(stderr, "Invalid format of CPU set for producers.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARG_CONS_AFFINITY_SET:
+		/* an explicit CPU set implies --affinity */
+		env.affinity = true;
+		if (parse_num_list(arg, &env.cons_cpus.cpus,
+				   &env.cons_cpus.cpus_len)) {
+			fprintf(stderr, "Invalid format of CPU set for consumers.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARGP_KEY_ARG:
+		/* only the first positional argument is accepted */
+		if (pos_args++) {
+			fprintf(stderr,
+				"Unrecognized positional argument: %s\n", arg);
+			argp_usage(state);
+		}
+		env.bench_name = strdup(arg);
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+/* Parse the command line into env using the common options plus the child
+ * parsers. Exits with status 1 if parsing fails, or if neither --list nor a
+ * benchmark name was given (after printing the program doc to stderr).
+ */
+static void parse_cmdline_args(int argc, char **argv)
+{
+ static const struct argp argp = {
+ .options = opts,
+ .parser = parse_arg,
+ .doc = argp_program_doc,
+ .children = bench_parsers,
+ };
+ if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
+ exit(1);
+ if (!env.list && !env.bench_name) {
+ argp_help(&argp, stderr, ARGP_HELP_DOC, "bench");
+ exit(1);
+ }
+}
+
+static void collect_measurements(long delta_ns);
+
+/* Timestamp (ns) of the previous sample; initialised in setup_timer(). */
+static __u64 last_time_ns;
+/* SIGALRM handler: take one measurement, passing the time elapsed since the
+ * previous one, then remember the current time for the next tick.
+ */
+static void sigalarm_handler(int signo)
+{
+ long new_time_ns = get_time_ns();
+ long delta_ns = new_time_ns - last_time_ns;
+
+ collect_measurements(delta_ns);
+
+ last_time_ns = new_time_ns;
+}
+
+/* Install sigalarm_handler() for SIGALRM and arm a real-time interval timer
+ * that first fires after one second and then every second. Also records the
+ * starting timestamp. Exits the process if either step fails.
+ */
+static void setup_timer()
+{
+	static struct sigaction sigalarm_action = {
+		.sa_handler = sigalarm_handler,
+	};
+	struct itimerval every_second = {
+		.it_interval = { .tv_sec = 1 },
+		.it_value = { .tv_sec = 1 },
+	};
+
+	last_time_ns = get_time_ns();
+
+	if (sigaction(SIGALRM, &sigalarm_action, NULL) < 0) {
+		fprintf(stderr, "failed to install SIGALRM handler: %d\n", -errno);
+		exit(1);
+	}
+
+	if (setitimer(ITIMER_REAL, &every_second, NULL) < 0) {
+		fprintf(stderr, "failed to arm interval timer: %d\n", -errno);
+		exit(1);
+	}
+}
+
+/* Pin @thread to the single CPU @cpu. Exits the process on failure. */
+static void set_thread_affinity(pthread_t thread, int cpu)
+{
+	cpu_set_t cpuset;
+	int err;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	/* pthread_setaffinity_np() returns the error number directly and
+	 * does not set errno, so report its return value
+	 */
+	err = pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
+	if (err) {
+		fprintf(stderr, "setting affinity to CPU #%d failed: %d\n",
+			cpu, -err);
+		exit(1);
+	}
+}
+
+/* Hand out the next CPU index from @cpu_set.
+ *
+ * Without an explicit CPU list, CPUs are handed out sequentially from
+ * next_cpu. With a list, the next index at or after next_cpu whose entry is
+ * set is returned; running out of entries is fatal.
+ */
+static int next_cpu(struct cpu_set *cpu_set)
+{
+	int idx;
+
+	if (!cpu_set->cpus)
+		return cpu_set->next_cpu++;
+
+	/* find next available CPU */
+	for (idx = cpu_set->next_cpu; idx < cpu_set->cpus_len; idx++) {
+		if (!cpu_set->cpus[idx])
+			continue;
+		cpu_set->next_cpu = idx + 1;
+		return idx;
+	}
+
+	fprintf(stderr, "Not enough CPUs specified, need CPU #%d or higher.\n", idx);
+	exit(1);
+}
+
+/* Run-time state of the current benchmark run. */
+static struct bench_state {
+ int res_cnt; /* number of samples collected so far */
+ struct bench_res *results; /* sample array, allocated in setup_benchmark() */
+ pthread_t *consumers; /* env.consumer_cnt thread handles */
+ pthread_t *producers; /* env.producer_cnt thread handles */
+} state;
+
+/* The selected benchmark; set by setup_benchmark(). */
+const struct bench *bench = NULL;
+
+/* Benchmark descriptors defined in the per-benchmark source files. */
+extern const struct bench bench_count_global;
+extern const struct bench bench_count_local;
+extern const struct bench bench_rename_base;
+extern const struct bench bench_rename_kprobe;
+extern const struct bench bench_rename_kretprobe;
+extern const struct bench bench_rename_rawtp;
+extern const struct bench bench_rename_fentry;
+extern const struct bench bench_rename_fexit;
+extern const struct bench bench_trig_base;
+extern const struct bench bench_trig_tp;
+extern const struct bench bench_trig_rawtp;
+extern const struct bench bench_trig_kprobe;
+extern const struct bench bench_trig_fentry;
+extern const struct bench bench_trig_fentry_sleep;
+extern const struct bench bench_trig_fmodret;
+extern const struct bench bench_rb_libbpf;
+extern const struct bench bench_rb_custom;
+extern const struct bench bench_pb_libbpf;
+extern const struct bench bench_pb_custom;
+
+/* Registry of all benchmarks; searched by name and printed by --list. */
+static const struct bench *benchs[] = {
+ &bench_count_global,
+ &bench_count_local,
+ &bench_rename_base,
+ &bench_rename_kprobe,
+ &bench_rename_kretprobe,
+ &bench_rename_rawtp,
+ &bench_rename_fentry,
+ &bench_rename_fexit,
+ &bench_trig_base,
+ &bench_trig_tp,
+ &bench_trig_rawtp,
+ &bench_trig_kprobe,
+ &bench_trig_fentry,
+ &bench_trig_fentry_sleep,
+ &bench_trig_fmodret,
+ &bench_rb_libbpf,
+ &bench_rb_custom,
+ &bench_pb_libbpf,
+ &bench_pb_custom,
+};
+
+/* Look up the benchmark named by env.bench_name, allocate the per-run state,
+ * run the benchmark's optional validate() and setup() hooks, then start all
+ * consumer threads followed by all producer threads, pinning each one to its
+ * own CPU when affinity is enabled. Each thread receives its index as its
+ * argument. Any failure terminates the process with exit status 1.
+ */
+static void setup_benchmark()
+{
+	int i, err;
+
+	if (!env.bench_name) {
+		fprintf(stderr, "benchmark name is not specified\n");
+		exit(1);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(benchs); i++) {
+		if (strcmp(benchs[i]->name, env.bench_name) == 0) {
+			bench = benchs[i];
+			break;
+		}
+	}
+	if (!bench) {
+		fprintf(stderr, "benchmark '%s' not found\n", env.bench_name);
+		exit(1);
+	}
+
+	printf("Setting up benchmark '%s'...\n", bench->name);
+
+	state.producers = calloc(env.producer_cnt, sizeof(*state.producers));
+	state.consumers = calloc(env.consumer_cnt, sizeof(*state.consumers));
+	/* one result slot per second of warm-up and measurement, plus slack */
+	state.results = calloc(env.duration_sec + env.warmup_sec + 2,
+			       sizeof(*state.results));
+	if (!state.producers || !state.consumers || !state.results)
+		exit(1);
+
+	if (bench->validate)
+		bench->validate();
+	if (bench->setup)
+		bench->setup();
+
+	for (i = 0; i < env.consumer_cnt; i++) {
+		err = pthread_create(&state.consumers[i], NULL,
+				     bench->consumer_thread, (void *)(long)i);
+		if (err) {
+			/* pthread_create() returns the error number and does
+			 * not set errno
+			 */
+			fprintf(stderr, "failed to create consumer thread #%d: %d\n",
+				i, -err);
+			exit(1);
+		}
+		if (env.affinity)
+			set_thread_affinity(state.consumers[i],
+					    next_cpu(&env.cons_cpus));
+	}
+
+	/* unless explicit producer CPU list is specified, continue after
+	 * last consumer CPU
+	 */
+	if (!env.prod_cpus.cpus)
+		env.prod_cpus.next_cpu = env.cons_cpus.next_cpu;
+
+	for (i = 0; i < env.producer_cnt; i++) {
+		err = pthread_create(&state.producers[i], NULL,
+				     bench->producer_thread, (void *)(long)i);
+		if (err) {
+			fprintf(stderr, "failed to create producer thread #%d: %d\n",
+				i, -err);
+			exit(1);
+		}
+		if (env.affinity)
+			set_thread_affinity(state.producers[i],
+					    next_cpu(&env.prod_cpus));
+	}
+
+	printf("Benchmark '%s' started.\n", bench->name);
+}
+
+/* Condition signalled by collect_measurements() when the run is complete;
+ * main() blocks on it.
+ */
+static pthread_mutex_t bench_done_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t bench_done = PTHREAD_COND_INITIALIZER;
+
+/* Take one sample into the next free slot of state.results and, if the
+ * benchmark provides report_progress(), report it. Called from
+ * sigalarm_handler() with the time elapsed since the previous sample.
+ */
+static void collect_measurements(long delta_ns) {
+ int iter = state.res_cnt++;
+ struct bench_res *res = &state.results[iter];
+
+ bench->measure(res);
+
+ if (bench->report_progress)
+ bench->report_progress(iter, res, delta_ns);
+
+ /* once the sample index reaches duration_sec + warmup_sec, wake up the
+  * waiter on bench_done
+  */
+ if (iter == env.duration_sec + env.warmup_sec) {
+ pthread_mutex_lock(&bench_done_mtx);
+ pthread_cond_signal(&bench_done);
+ pthread_mutex_unlock(&bench_done_mtx);
+ }
+}
+
+/* Entry point: parse options, either list the available benchmarks or set
+ * up and run the selected one, wait for the run to finish and print the
+ * final report if the benchmark provides one.
+ */
+int main(int argc, char **argv)
+{
+ parse_cmdline_args(argc, argv);
+
+ if (env.list) {
+ int i;
+
+ printf("Available benchmarks:\n");
+ for (i = 0; i < ARRAY_SIZE(benchs); i++) {
+ printf("- %s\n", benchs[i]->name);
+ }
+ return 0;
+ }
+
+ setup_benchmark();
+
+ setup_timer();
+
+ /* block until bench_done is signalled */
+ /* NOTE(review): the wait has no predicate loop, so a spurious wakeup
+  * would end the run early - confirm whether that is acceptable.
+  */
+ pthread_mutex_lock(&bench_done_mtx);
+ pthread_cond_wait(&bench_done, &bench_done_mtx);
+ pthread_mutex_unlock(&bench_done_mtx);
+
+ if (bench->report_final)
+ /* skip the first env.warmup_sec (warm-up) samples */
+ bench->report_final(state.results + env.warmup_sec,
+ state.res_cnt - env.warmup_sec);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/bench.h b/tools/testing/selftests/bpf/bench.h
new file mode 100644
index 000000000..c1f48a473
--- /dev/null
+++ b/tools/testing/selftests/bpf/bench.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#pragma once
+#include <stdlib.h>
+#include <stdbool.h>
+#include <linux/err.h>
+#include <errno.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <math.h>
+#include <time.h>
+#include <sys/syscall.h>
+
+/* CPU selection state for one group of threads. */
+struct cpu_set {
+ bool *cpus; /* optional per-CPU flags; NULL means "no explicit list" */
+ int cpus_len; /* number of entries in cpus */
+ int next_cpu; /* next CPU index to consider handing out */
+};
+
+/* Global benchmark settings, filled in from the command line. */
+struct env {
+ char *bench_name;
+ int duration_sec;
+ int warmup_sec;
+ bool verbose;
+ bool list;
+ bool affinity;
+ int consumer_cnt;
+ int producer_cnt;
+ struct cpu_set prod_cpus;
+ struct cpu_set cons_cpus;
+};
+
+/* One sample of benchmark counters. */
+struct bench_res {
+ long hits;
+ long drops;
+};
+
+/* Benchmark descriptor. The runner checks validate, setup, report_progress
+ * and report_final for NULL before calling them; producer_thread,
+ * consumer_thread and measure are used unconditionally.
+ */
+struct bench {
+ const char *name;
+ void (*validate)();
+ void (*setup)();
+ void *(*producer_thread)(void *ctx);
+ void *(*consumer_thread)(void *ctx);
+ void (*measure)(struct bench_res* res);
+ void (*report_progress)(int iter, struct bench_res* res, long delta_ns);
+ void (*report_final)(struct bench_res res[], int res_cnt);
+};
+
+/* Counter aligned to 128 bytes (presumably to keep independent counters on
+ * separate cache lines - TODO confirm).
+ */
+struct counter {
+ long value;
+} __attribute__((aligned(128)));
+
+extern struct env env;
+extern const struct bench *bench;
+
+void setup_libbpf();
+void hits_drops_report_progress(int iter, struct bench_res *res, long delta_ns);
+void hits_drops_report_final(struct bench_res res[], int res_cnt);
+
+/* Current CLOCK_MONOTONIC time in nanoseconds. */
+static inline __u64 get_time_ns() {
+	struct timespec now;
+	__u64 ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	ns = (u64)now.tv_sec * 1000000000;
+	ns += now.tv_nsec;
+	return ns;
+}
+
+/* Atomically increment *value by one (relaxed ordering). */
+static inline void atomic_inc(long *value)
+{
+	(void)__atomic_fetch_add(value, 1, __ATOMIC_RELAXED);
+}
+
+/* Atomically add n to *value (relaxed ordering). */
+static inline void atomic_add(long *value, long n)
+{
+	(void)__atomic_fetch_add(value, n, __ATOMIC_RELAXED);
+}
+
+/* Atomically store n into *value and return the previous value (relaxed
+ * ordering).
+ */
+static inline long atomic_swap(long *value, long n)
+{
+	long previous = __atomic_exchange_n(value, n, __ATOMIC_RELAXED);
+
+	return previous;
+}
diff --git a/tools/testing/selftests/bpf/benchs/bench_count.c b/tools/testing/selftests/bpf/benchs/bench_count.c
new file mode 100644
index 000000000..befba7a82
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_count.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bench.h"
+
+/* COUNT-GLOBAL benchmark */
+
+/* State of the count-global benchmark: one counter shared by all producers. */
+static struct count_global_ctx {
+	struct counter hits;
+} count_global_ctx;
+
+/* Producer thread: increment the shared counter forever. */
+static void *count_global_producer(void *input)
+{
+	for (;;)
+		atomic_inc(&count_global_ctx.hits.value);
+	return NULL;
+}
+
+/* Consumer thread: nothing to consume, returns immediately. */
+static void *count_global_consumer(void *input)
+{
+	return NULL;
+}
+
+/* Report the hits accumulated since the previous call and reset the counter. */
+static void count_global_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&count_global_ctx.hits.value, 0);
+}
+
+/* COUNT-local benchmark */
+
+/* State of the count-local benchmark: one counter per producer thread. */
+static struct count_local_ctx {
+	struct counter *hits;
+} count_local_ctx;
+
+/* Allocate the zero-initialised per-producer counters; exits on allocation
+ * failure.
+ */
+static void count_local_setup()
+{
+	struct count_local_ctx *ctx = &count_local_ctx;
+
+	/* The array is indexed by producer thread index and summed over
+	 * env.producer_cnt entries, so it must be sized by the number of
+	 * producers (sizing it by consumer_cnt overflows when there are more
+	 * producers than consumers).
+	 */
+	ctx->hits = calloc(env.producer_cnt, sizeof(*ctx->hits));
+	if (!ctx->hits)
+		exit(1);
+}
+
+/* Producer thread: increment this producer's own counter forever; @input
+ * carries the producer index.
+ */
+static void *count_local_producer(void *input)
+{
+	int self = (long)input;
+	long *my_hits = &count_local_ctx.hits[self].value;
+
+	for (;;)
+		atomic_inc(my_hits);
+	return NULL;
+}
+
+/* Consumer thread: nothing to consume, returns immediately. */
+static void *count_local_consumer(void *input)
+{
+	return NULL;
+}
+
+/* Add the hits of every producer since the previous call to res->hits,
+ * resetting each counter as it is read.
+ */
+static void count_local_measure(struct bench_res *res)
+{
+	struct counter *counters = count_local_ctx.hits;
+	int prod;
+
+	for (prod = 0; prod < env.producer_cnt; prod++)
+		res->hits += atomic_swap(&counters[prod].value, 0);
+}
+
+/* "count-global": all producers increment one shared counter. */
+const struct bench bench_count_global = {
+ .name = "count-global",
+ .producer_thread = count_global_producer,
+ .consumer_thread = count_global_consumer,
+ .measure = count_global_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+/* "count-local": each producer increments its own counter. */
+const struct bench bench_count_local = {
+ .name = "count-local",
+ .setup = count_local_setup,
+ .producer_thread = count_local_producer,
+ .consumer_thread = count_local_consumer,
+ .measure = count_local_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_rename.c b/tools/testing/selftests/bpf/benchs/bench_rename.c
new file mode 100644
index 000000000..a96767409
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_rename.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <fcntl.h>
+#include "bench.h"
+#include "test_overhead.skel.h"
+
+/* BPF triggering benchmarks */
+/* Shared state of the rename benchmarks. */
+static struct ctx {
+ struct test_overhead *skel; /* loaded BPF skeleton */
+ struct counter hits; /* number of successful writes by the producer */
+ int fd; /* /proc/self/comm, opened for writing in setup_ctx() */
+} ctx;
+
+/* Reject configurations these benchmarks do not support: exactly one
+ * producer and exactly one consumer are required.
+ */
+static void validate()
+{
+	bool multi_producer = env.producer_cnt != 1;
+	bool multi_consumer = env.consumer_cnt != 1;
+
+	if (multi_producer) {
+		fprintf(stderr, "benchmark doesn't support multi-producer!\n");
+		exit(1);
+	}
+	if (multi_consumer) {
+		fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+}
+
+/* Producer thread: repeatedly write a fixed name to ctx.fd, counting every
+ * successful write as a hit. A failed write terminates the process.
+ */
+static void *producer(void *input)
+{
+	char buf[] = "test_overhead";
+	int written;
+
+	for (;;) {
+		written = write(ctx.fd, buf, sizeof(buf));
+		if (written < 0) {
+			fprintf(stderr, "write failed\n");
+			exit(1);
+		}
+		atomic_inc(&ctx.hits.value);
+	}
+}
+
+/* Report the hits accumulated since the previous call and reset the counter. */
+static void measure(struct bench_res *res)
+{
+ res->hits = atomic_swap(&ctx.hits.value, 0);
+}
+
+/* Common setup: initialise libbpf, open and load the test_overhead skeleton
+ * and open /proc/self/comm for writing. Exits on any failure.
+ */
+static void setup_ctx()
+{
+ setup_libbpf();
+
+ ctx.skel = test_overhead__open_and_load();
+ if (!ctx.skel) {
+ fprintf(stderr, "failed to open skeleton\n");
+ exit(1);
+ }
+
+ ctx.fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+ if (ctx.fd < 0) {
+ fprintf(stderr, "failed to open /proc/self/comm: %d\n", -errno);
+ exit(1);
+ }
+}
+
+/* Attach @prog; exits on failure. The returned link is not kept, so the
+ * program is never explicitly detached here.
+ */
+static void attach_bpf(struct bpf_program *prog)
+{
+ struct bpf_link *link;
+
+ link = bpf_program__attach(prog);
+ if (IS_ERR(link)) {
+ fprintf(stderr, "failed to attach program!\n");
+ exit(1);
+ }
+}
+
+/* Baseline: common setup only, no BPF program attached. */
+static void setup_base()
+{
+ setup_ctx();
+}
+
+/* Each of the following variants does the common setup and attaches one
+ * program from the skeleton (prog1..prog5); the mapping of program to
+ * attach type is defined by the BPF object, not visible here.
+ */
+static void setup_kprobe()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.prog1);
+}
+
+static void setup_kretprobe()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.prog2);
+}
+
+static void setup_rawtp()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.prog3);
+}
+
+static void setup_fentry()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.prog4);
+}
+
+static void setup_fexit()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.prog5);
+}
+
+/* Consumer thread: nothing to consume, returns immediately. */
+static void *consumer(void *input)
+{
+ return NULL;
+}
+
+/* Benchmark descriptors for the rename benchmarks. They share validate,
+ * producer, consumer, measure and the reporting callbacks, and differ only
+ * in name and setup hook.
+ */
+const struct bench bench_rename_base = {
+ .name = "rename-base",
+ .validate = validate,
+ .setup = setup_base,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rename_kprobe = {
+ .name = "rename-kprobe",
+ .validate = validate,
+ .setup = setup_kprobe,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rename_kretprobe = {
+ .name = "rename-kretprobe",
+ .validate = validate,
+ .setup = setup_kretprobe,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rename_rawtp = {
+ .name = "rename-rawtp",
+ .validate = validate,
+ .setup = setup_rawtp,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rename_fentry = {
+ .name = "rename-fentry",
+ .validate = validate,
+ .setup = setup_fentry,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rename_fexit = {
+ .name = "rename-fexit",
+ .validate = validate,
+ .setup = setup_fexit,
+ .producer_thread = producer,
+ .consumer_thread = consumer,
+ .measure = measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/bench_ringbufs.c b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
new file mode 100644
index 000000000..da87c7f31
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_ringbufs.c
@@ -0,0 +1,566 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <asm/barrier.h>
+#include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <argp.h>
+#include <stdlib.h>
+#include "bench.h"
+#include "ringbuf_bench.skel.h"
+#include "perfbuf_bench.skel.h"
+
+/* Settings of the ring buffer / perf buffer benchmarks with their defaults;
+ * parse_arg() overrides the ones that have a command-line option.
+ */
+static struct {
+ bool back2back;
+ int batch_cnt;
+ bool sampled;
+ int sample_rate;
+ int ringbuf_sz; /* per-ringbuf, in bytes */
+ bool ringbuf_use_output; /* use slower output API */
+ int perfbuf_sz; /* per-CPU size, in pages */
+} args = {
+ .back2back = false,
+ .batch_cnt = 500,
+ .sampled = false,
+ .sample_rate = 500,
+ .ringbuf_sz = 512 * 1024,
+ .ringbuf_use_output = false,
+ .perfbuf_sz = 128,
+};
+
+/* Option keys; non-printable values, so the options are long-only. */
+enum {
+ ARG_RB_BACK2BACK = 2000,
+ ARG_RB_USE_OUTPUT = 2001,
+ ARG_RB_BATCH_CNT = 2002,
+ ARG_RB_SAMPLED = 2003,
+ ARG_RB_SAMPLE_RATE = 2004,
+};
+
+/* Benchmark-specific command-line options. */
+static const struct argp_option opts[] = {
+ { "rb-b2b", ARG_RB_BACK2BACK, NULL, 0, "Back-to-back mode"},
+ { "rb-use-output", ARG_RB_USE_OUTPUT, NULL, 0, "Use bpf_ringbuf_output() instead of bpf_ringbuf_reserve()"},
+ { "rb-batch-cnt", ARG_RB_BATCH_CNT, "CNT", 0, "Set BPF-side record batch count"},
+ { "rb-sampled", ARG_RB_SAMPLED, NULL, 0, "Notification sampling"},
+ { "rb-sample-rate", ARG_RB_SAMPLE_RATE, "RATE", 0, "Notification sample rate"},
+ {},
+};
+
+/* argp callback for the ring buffer benchmark options: stores each
+ * recognised option into args. Negative counts/rates are rejected with an
+ * error message followed by argp_usage(). Returns 0 when the key was handled
+ * and ARGP_ERR_UNKNOWN otherwise.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	switch (key) {
+	case ARG_RB_BACK2BACK:
+		args.back2back = true;
+		break;
+	case ARG_RB_USE_OUTPUT:
+		args.ringbuf_use_output = true;
+		break;
+	case ARG_RB_BATCH_CNT:
+		args.batch_cnt = strtol(arg, NULL, 10);
+		if (args.batch_cnt < 0) {
+			/* newline-terminated so the usage text that follows
+			 * starts on its own line
+			 */
+			fprintf(stderr, "Invalid batch count.\n");
+			argp_usage(state);
+		}
+		break;
+	case ARG_RB_SAMPLED:
+		args.sampled = true;
+		break;
+	case ARG_RB_SAMPLE_RATE:
+		args.sample_rate = strtol(arg, NULL, 10);
+		if (args.sample_rate < 0) {
+			fprintf(stderr, "Invalid perfbuf sample rate.\n");
+			argp_usage(state);
+		}
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+/* exported into benchmark runner */
+const struct argp bench_ringbufs_argp = {
+	.options = opts,
+	.parser = parse_arg,
+};
+
+/* RINGBUF-LIBBPF benchmark */
+
+/* Number of samples consumed in user space since the last measurement. */
+static struct counter buf_hits;
+
+/* Issue one getpgid syscall, ignoring its result (presumably the hook point
+ * of the benchmark's BPF program, which is not visible here - TODO confirm).
+ */
+static inline void bufs_trigger_batch()
+{
+ (void)syscall(__NR_getpgid);
+}
+
+/* Reject unsupported configurations: exactly one consumer is required, and
+ * back-to-back mode allows at most one producer.
+ */
+static void bufs_validate()
+{
+	bool multi_consumer = env.consumer_cnt != 1;
+	bool b2b_multi_producer = args.back2back && env.producer_cnt > 1;
+
+	if (multi_consumer) {
+		fprintf(stderr, "rb-libbpf benchmark doesn't support multi-consumer!\n");
+		exit(1);
+	}
+
+	if (b2b_multi_producer) {
+		fprintf(stderr, "back-to-back mode makes sense only for single-producer case!\n");
+		exit(1);
+	}
+}
+
+/* Producer thread: trigger batches forever, except in back-to-back mode
+ * where only the initial batch is triggered and the thread then returns.
+ */
+static void *bufs_sample_producer(void *input)
+{
+	if (!args.back2back) {
+		for (;;)
+			bufs_trigger_batch();
+	}
+
+	/* initial batch to get everything started */
+	bufs_trigger_batch();
+	return NULL;
+}
+
+/* State of the libbpf-based ring buffer benchmark. */
+static struct ringbuf_libbpf_ctx {
+ struct ringbuf_bench *skel;
+ struct ring_buffer *ringbuf;
+} ringbuf_libbpf_ctx;
+
+/* Report and reset the user-space hit counter and the BPF-side drop counter. */
+static void ringbuf_libbpf_measure(struct bench_res *res)
+{
+ struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
+
+ res->hits = atomic_swap(&buf_hits.value, 0);
+ res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
+}
+
+/* Open the ringbuf_bench skeleton, configure its read-only data and the ring
+ * buffer size from args, then load it. Exits on open/load failure and
+ * returns the loaded skeleton.
+ */
+static struct ringbuf_bench *ringbuf_setup_skeleton()
+{
+ struct ringbuf_bench *skel;
+
+ setup_libbpf();
+
+ skel = ringbuf_bench__open();
+ if (!skel) {
+ fprintf(stderr, "failed to open skeleton\n");
+ exit(1);
+ }
+
+ skel->rodata->batch_cnt = args.batch_cnt;
+ skel->rodata->use_output = args.ringbuf_use_output ? 1 : 0;
+
+ if (args.sampled)
+ /* record data + header take 16 bytes */
+ skel->rodata->wakeup_data_size = args.sample_rate * 16;
+
+ /* NOTE(review): the return value of bpf_map__resize() is not checked */
+ bpf_map__resize(skel->maps.ringbuf, args.ringbuf_sz);
+
+ if (ringbuf_bench__load(skel)) {
+ fprintf(stderr, "failed to load skeleton\n");
+ exit(1);
+ }
+
+ return skel;
+}
+
+/* Per-sample callback: count the sample and return 0; the sample contents
+ * are not inspected.
+ */
+static int buf_process_sample(void *ctx, void *data, size_t len)
+{
+ atomic_inc(&buf_hits.value);
+ return 0;
+}
+
+/* Set up the skeleton, create a libbpf ring_buffer consumer on its ringbuf
+ * map and attach the bench_ringbuf program. Exits on failure.
+ */
+static void ringbuf_libbpf_setup()
+{
+ struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
+ struct bpf_link *link;
+
+ ctx->skel = ringbuf_setup_skeleton();
+ ctx->ringbuf = ring_buffer__new(bpf_map__fd(ctx->skel->maps.ringbuf),
+ buf_process_sample, NULL, NULL);
+ if (!ctx->ringbuf) {
+ fprintf(stderr, "failed to create ringbuf\n");
+ exit(1);
+ }
+
+ link = bpf_program__attach(ctx->skel->progs.bench_ringbuf);
+ if (IS_ERR(link)) {
+ fprintf(stderr, "failed to attach program!\n");
+ exit(1);
+ }
+}
+
+/* Consumer thread: poll the ring buffer with no timeout until polling
+ * returns a negative value; in back-to-back mode each successful poll
+ * triggers the next batch.
+ */
+static void *ringbuf_libbpf_consumer(void *input)
+{
+ struct ringbuf_libbpf_ctx *ctx = &ringbuf_libbpf_ctx;
+
+ while (ring_buffer__poll(ctx->ringbuf, -1) >= 0) {
+ if (args.back2back)
+ bufs_trigger_batch();
+ }
+ fprintf(stderr, "ringbuf polling failed!\n");
+ return NULL;
+}
+
+/* RINGBUF-CUSTOM benchmark */
+/* Hand-rolled view of a BPF ring buffer map mapped into user space. */
+struct ringbuf_custom {
+ __u64 *consumer_pos; /* writable mapping at map offset 0 */
+ __u64 *producer_pos; /* read-only mapping at map offset page_size */
+ __u64 mask; /* ring size - 1, used to wrap positions */
+ void *data; /* start of the data area, one page after producer_pos */
+ int map_fd;
+};
+
+/* State of the custom (non-libbpf) ring buffer consumer benchmark. */
+static struct ringbuf_custom_ctx {
+ struct ringbuf_bench *skel;
+ struct ringbuf_custom ringbuf;
+ int epoll_fd;
+ struct epoll_event event;
+} ringbuf_custom_ctx;
+
+/* Report and reset the user-space hit counter and the BPF-side drop counter. */
+static void ringbuf_custom_measure(struct bench_res *res)
+{
+ struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx;
+
+ res->hits = atomic_swap(&buf_hits.value, 0);
+ res->drops = atomic_swap(&ctx->skel->bss->dropped, 0);
+}
+
+/* Set up the skeleton, mmap the ring buffer map, register the map fd with a
+ * new epoll instance and attach the bench_ringbuf program. Exits on any
+ * failure.
+ */
+static void ringbuf_custom_setup()
+{
+ struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx;
+ const size_t page_size = getpagesize();
+ struct bpf_link *link;
+ struct ringbuf_custom *r;
+ void *tmp;
+ int err;
+
+ ctx->skel = ringbuf_setup_skeleton();
+
+ ctx->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epoll_fd < 0) {
+ fprintf(stderr, "failed to create epoll fd: %d\n", -errno);
+ exit(1);
+ }
+
+ r = &ctx->ringbuf;
+ r->map_fd = bpf_map__fd(ctx->skel->maps.ringbuf);
+ /* NOTE(review): the mask is only valid if ringbuf_sz is a power of two */
+ r->mask = args.ringbuf_sz - 1;
+
+ /* Map writable consumer page */
+ tmp = mmap(NULL, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+ r->map_fd, 0);
+ if (tmp == MAP_FAILED) {
+ fprintf(stderr, "failed to mmap consumer page: %d\n", -errno);
+ exit(1);
+ }
+ r->consumer_pos = tmp;
+
+ /* Map read-only producer page and data pages. */
+ tmp = mmap(NULL, page_size + 2 * args.ringbuf_sz, PROT_READ, MAP_SHARED,
+ r->map_fd, page_size);
+ if (tmp == MAP_FAILED) {
+ fprintf(stderr, "failed to mmap data pages: %d\n", -errno);
+ exit(1);
+ }
+ r->producer_pos = tmp;
+ r->data = tmp + page_size;
+
+ ctx->event.events = EPOLLIN;
+ err = epoll_ctl(ctx->epoll_fd, EPOLL_CTL_ADD, r->map_fd, &ctx->event);
+ if (err < 0) {
+ fprintf(stderr, "failed to epoll add ringbuf: %d\n", -errno);
+ exit(1);
+ }
+
+ link = bpf_program__attach(ctx->skel->progs.bench_ringbuf);
+ if (IS_ERR(link)) {
+ fprintf(stderr, "failed to attach program\n");
+ exit(1);
+ }
+}
+
+/* Flag bits kept in the top two bits of a record's 32-bit length word, and
+ * the size in bytes of the per-record header. The constants are unsigned
+ * because shifting a signed 1 into bit 31 is undefined behaviour in C.
+ */
+#define RINGBUF_BUSY_BIT (1U << 31)
+#define RINGBUF_DISCARD_BIT (1U << 30)
+#define RINGBUF_META_LEN 8
+
+/* Total space a record occupies in the ring: the length word with its top
+ * two flag bits cleared, plus the record header, rounded up to a multiple
+ * of 8 bytes.
+ */
+static inline int roundup_len(__u32 len)
+{
+	/* clear out top 2 bits */
+	__u32 payload = len & 0x3fffffffU;
+	/* add length prefix */
+	__u32 total = payload + RINGBUF_META_LEN;
+
+	/* round up to 8 byte alignment */
+	return (total + 7) / 8 * 8;
+}
+
+/* Consume the records currently available in ring @r: walk from the
+ * consumer position to the producer position, count every record in
+ * buf_hits, publish the new consumer position, and repeat until a pass
+ * finds no new data. Record contents are not inspected.
+ */
+static void ringbuf_custom_process_ring(struct ringbuf_custom *r)
+{
+ unsigned long cons_pos, prod_pos;
+ int *len_ptr, len;
+ bool got_new_data;
+
+ cons_pos = smp_load_acquire(r->consumer_pos);
+ while (true) {
+ got_new_data = false;
+ prod_pos = smp_load_acquire(r->producer_pos);
+ while (cons_pos < prod_pos) {
+ len_ptr = r->data + (cons_pos & r->mask);
+ len = smp_load_acquire(len_ptr);
+
+ /* sample not committed yet, bail out for now */
+ /* NOTE(review): this return skips the consumer_pos store for
+  * records already counted in this pass - confirm intended.
+  */
+ if (len & RINGBUF_BUSY_BIT)
+ return;
+
+ got_new_data = true;
+ cons_pos += roundup_len(len);
+
+ atomic_inc(&buf_hits.value);
+ }
+ /* publish progress, then re-check for records added meanwhile */
+ if (got_new_data)
+ smp_store_release(r->consumer_pos, cons_pos);
+ else
+ break;
+ };
+}
+
+/* Consumer thread: wait on the epoll fd with no timeout and drain the ring
+ * whenever an event is reported, until epoll_wait() fails. In back-to-back
+ * mode a new batch is triggered before every wait.
+ */
+static void *ringbuf_custom_consumer(void *input)
+{
+	struct ringbuf_custom_ctx *ctx = &ringbuf_custom_ctx;
+	int nfds;
+
+	for (;;) {
+		if (args.back2back)
+			bufs_trigger_batch();
+
+		nfds = epoll_wait(ctx->epoll_fd, &ctx->event, 1, -1);
+		if (nfds < 0)
+			break;
+		if (nfds > 0)
+			ringbuf_custom_process_ring(&ctx->ringbuf);
+	}
+
+	fprintf(stderr, "ringbuf polling failed!\n");
+	return NULL;
+}
+
+/* PERFBUF-LIBBPF benchmark */
+/* State shared by the perfbuf setup, measure and consumer callbacks below. */
+static struct perfbuf_libbpf_ctx {
+ struct perfbuf_bench *skel;
+ struct perf_buffer *perfbuf;
+} perfbuf_libbpf_ctx;
+
+/*
+ * Fill @res with the counts accumulated since the previous call, resetting
+ * both counters to zero: hits come from the user-space buf_hits counter,
+ * drops from the "dropped" variable in the skeleton's bss.
+ */
+static void perfbuf_measure(struct bench_res *res)
+{
+	res->hits = atomic_swap(&buf_hits.value, 0);
+	res->drops = atomic_swap(&perfbuf_libbpf_ctx.skel->bss->dropped, 0);
+}
+
+/*
+ * Open the perfbuf_bench skeleton, copy the configured batch count into its
+ * read-only data and load it. Any failure prints a message and terminates
+ * the process, so the returned pointer is always valid.
+ */
+static struct perfbuf_bench *perfbuf_setup_skeleton()
+{
+	struct perfbuf_bench *obj;
+
+	setup_libbpf();
+
+	obj = perfbuf_bench__open();
+	if (obj == NULL) {
+		fprintf(stderr, "failed to open skeleton\n");
+		exit(1);
+	}
+
+	/* rodata is written after open and before load */
+	obj->rodata->batch_cnt = args.batch_cnt;
+
+	if (perfbuf_bench__load(obj) != 0) {
+		fprintf(stderr, "failed to load skeleton\n");
+		exit(1);
+	}
+
+	return obj;
+}
+
+/*
+ * Raw perf buffer event callback: count sample records, ignore lost-record
+ * notifications, and report an error for any other record type.
+ * @input_ctx and @cpu are unused.
+ */
+static enum bpf_perf_event_ret
+perfbuf_process_sample_raw(void *input_ctx, int cpu,
+			   struct perf_event_header *e)
+{
+	if (e->type == PERF_RECORD_SAMPLE) {
+		atomic_inc(&buf_hits.value);
+		return LIBBPF_PERF_EVENT_CONT;
+	}
+
+	if (e->type == PERF_RECORD_LOST)
+		return LIBBPF_PERF_EVENT_CONT;
+
+	return LIBBPF_PERF_EVENT_ERROR;
+}
+
+/*
+ * Setup callback shared by the "pb-libbpf" and "pb-custom" benchmarks.
+ *
+ * Loads the skeleton, creates a raw perf buffer on the skeleton's "perfbuf"
+ * map with a software BPF-output event, and attaches the bench_perfbuf
+ * program. Every failure prints a message and exits.
+ */
+static void perfbuf_libbpf_setup()
+{
+	struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx;
+	struct perf_event_attr attr;
+	struct perf_buffer_raw_opts pb_opts = {
+		.event_cb = perfbuf_process_sample_raw,
+		.ctx = NULL,
+		.attr = &attr,
+	};
+	struct bpf_link *link;
+
+	ctx->skel = perfbuf_setup_skeleton();
+
+	memset(&attr, 0, sizeof(attr));
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	/* notify only every Nth sample */
+	if (args.sampled) {
+		attr.sample_period = args.sample_rate;
+		attr.wakeup_events = args.sample_rate;
+	} else {
+		attr.sample_period = 1;
+		attr.wakeup_events = 1;
+	}
+
+	if (args.sample_rate > args.batch_cnt) {
+		fprintf(stderr, "sample rate %d is too high for given batch count %d\n",
+			args.sample_rate, args.batch_cnt);
+		exit(1);
+	}
+
+	ctx->perfbuf = perf_buffer__new_raw(bpf_map__fd(ctx->skel->maps.perfbuf),
+					    args.perfbuf_sz, &pb_opts);
+	/*
+	 * Reject both NULL and an encoded error pointer: this file already
+	 * checks other libbpf pointer results with IS_ERR(), so a plain NULL
+	 * test could let a failed creation through.
+	 */
+	if (!ctx->perfbuf || IS_ERR(ctx->perfbuf)) {
+		fprintf(stderr, "failed to create perfbuf\n");
+		exit(1);
+	}
+
+	link = bpf_program__attach(ctx->skel->progs.bench_perfbuf);
+	if (IS_ERR(link)) {
+		fprintf(stderr, "failed to attach program\n");
+		exit(1);
+	}
+}
+
+/*
+ * Consumer thread body for "pb-libbpf": poll the perf buffer without a
+ * timeout until polling fails. In back-to-back mode a new batch is triggered
+ * after every successful poll. @input is unused.
+ */
+static void *perfbuf_libbpf_consumer(void *input)
+{
+	struct perf_buffer *pb = perfbuf_libbpf_ctx.perfbuf;
+
+	for (;;) {
+		if (perf_buffer__poll(pb, -1) < 0)
+			break;
+		if (args.back2back)
+			bufs_trigger_batch();
+	}
+
+	fprintf(stderr, "perfbuf polling failed!\n");
+	return NULL;
+}
+
+/* PERFBUF-CUSTOM benchmark */
+
+/* copies of internal libbpf definitions */
+/*
+ * Per-CPU buffer state. perfbuf_custom_consumer() below reads only "base"
+ * from it.
+ */
+struct perf_cpu_buf {
+ struct perf_buffer *pb;
+ void *base; /* mmap()'ed memory */
+ void *buf; /* for reconstructing segmented data */
+ size_t buf_size;
+ int fd;
+ int cpu;
+ int map_key;
+};
+
+/*
+ * Local definition of the perf buffer object so that the custom consumer can
+ * reach its epoll FD, event array and per-CPU geometry directly.
+ * NOTE(review): presumably must stay field-for-field identical to the libbpf
+ * version this is built against -- confirm when updating libbpf.
+ */
+struct perf_buffer {
+ perf_buffer_event_fn event_cb;
+ perf_buffer_sample_fn sample_cb;
+ perf_buffer_lost_fn lost_cb;
+ void *ctx; /* passed into callbacks */
+
+ size_t page_size;
+ size_t mmap_size;
+ struct perf_cpu_buf **cpu_bufs;
+ struct epoll_event *events;
+ int cpu_cnt; /* number of allocated CPU buffers */
+ int epoll_fd; /* epoll FD the custom consumer waits on */
+ int map_fd; /* BPF_MAP_TYPE_PERF_EVENT_ARRAY BPF map FD */
+};
+
+/*
+ * Consumer thread body for "pb-custom".
+ *
+ * Uses the same context and perf buffer as the libbpf variant, but bypasses
+ * perf_buffer__poll(): it waits on the buffer's epoll FD itself and walks
+ * each signalled per-CPU ring by hand, counting PERF_RECORD_SAMPLE records.
+ * Record payloads are never copied out. The loop never terminates normally;
+ * an epoll failure exits the process. @input is unused.
+ */
+static void *perfbuf_custom_consumer(void *input)
+{
+ struct perfbuf_libbpf_ctx *ctx = &perfbuf_libbpf_ctx;
+ struct perf_buffer *pb = ctx->perfbuf;
+ struct perf_cpu_buf *cpu_buf;
+ struct perf_event_mmap_page *header;
+ /* NOTE(review): used as a wrap mask, so mmap_size is assumed to be a
+ * power of two -- confirm.
+ */
+ size_t mmap_mask = pb->mmap_size - 1;
+ struct perf_event_header *ehdr;
+ __u64 data_head, data_tail;
+ size_t ehdr_size;
+ void *base;
+ int i, cnt;
+
+ while (true) {
+ if (args.back2back)
+ bufs_trigger_batch();
+ cnt = epoll_wait(pb->epoll_fd, pb->events, pb->cpu_cnt, -1);
+ if (cnt <= 0) {
+ fprintf(stderr, "perf epoll failed: %d\n", -errno);
+ exit(1);
+ }
+
+ for (i = 0; i < cnt; ++i) {
+ /* each epoll event carries a pointer to its per-CPU buffer */
+ cpu_buf = pb->events[i].data.ptr;
+ header = cpu_buf->base;
+ /* record data starts one page after the control page */
+ base = ((void *)header) + pb->page_size;
+
+ data_head = ring_buffer_read_head(header);
+ data_tail = header->data_tail;
+ while (data_head != data_tail) {
+ ehdr = base + (data_tail & mmap_mask);
+ ehdr_size = ehdr->size;
+
+ if (ehdr->type == PERF_RECORD_SAMPLE)
+ atomic_inc(&buf_hits.value);
+
+ data_tail += ehdr_size;
+ }
+ /* publish the new tail once the whole ring has been walked */
+ ring_buffer_write_tail(header, data_tail);
+ }
+ }
+ /* not reached: the loop above only leaves via exit() */
+ return NULL;
+}
+
+/*
+ * Benchmark descriptors exported by this file. All four share the validate,
+ * producer and reporting callbacks. "pb-custom" reuses the setup and measure
+ * callbacks of "pb-libbpf" and differs only in its consumer thread.
+ */
+const struct bench bench_rb_libbpf = {
+ .name = "rb-libbpf",
+ .validate = bufs_validate,
+ .setup = ringbuf_libbpf_setup,
+ .producer_thread = bufs_sample_producer,
+ .consumer_thread = ringbuf_libbpf_consumer,
+ .measure = ringbuf_libbpf_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_rb_custom = {
+ .name = "rb-custom",
+ .validate = bufs_validate,
+ .setup = ringbuf_custom_setup,
+ .producer_thread = bufs_sample_producer,
+ .consumer_thread = ringbuf_custom_consumer,
+ .measure = ringbuf_custom_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_pb_libbpf = {
+ .name = "pb-libbpf",
+ .validate = bufs_validate,
+ .setup = perfbuf_libbpf_setup,
+ .producer_thread = bufs_sample_producer,
+ .consumer_thread = perfbuf_libbpf_consumer,
+ .measure = perfbuf_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_pb_custom = {
+ .name = "pb-custom",
+ .validate = bufs_validate,
+ .setup = perfbuf_libbpf_setup,
+ .producer_thread = bufs_sample_producer,
+ .consumer_thread = perfbuf_custom_consumer,
+ .measure = perfbuf_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
diff --git a/tools/testing/selftests/bpf/benchs/bench_trigger.c b/tools/testing/selftests/bpf/benchs/bench_trigger.c
new file mode 100644
index 000000000..2a0b6c988
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/bench_trigger.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bench.h"
+#include "trigger_bench.skel.h"
+
+/* BPF triggering benchmarks */
+/* Skeleton handle shared by all trigger benchmarks; filled by setup_ctx(). */
+static struct trigger_ctx {
+ struct trigger_bench *skel;
+} ctx;
+
+/* User-space hit counter, used only by the "trig-base" benchmark. */
+static struct counter base_hits;
+
+/* Refuse to run unless exactly one consumer is configured. */
+static void trigger_validate()
+{
+	if (env.consumer_cnt == 1)
+		return;
+
+	fprintf(stderr, "benchmark doesn't support multi-consumer!\n");
+	exit(1);
+}
+
+/*
+ * Producer for "trig-base": issue the getpgid syscall in an endless loop and
+ * count every call in base_hits. @input is unused.
+ */
+static void *trigger_base_producer(void *input)
+{
+	for (;;) {
+		(void)syscall(__NR_getpgid);
+		atomic_inc(&base_hits.value);
+	}
+
+	/* not reached */
+	return NULL;
+}
+
+/* Report the syscalls counted since the last call and reset the counter. */
+static void trigger_base_measure(struct bench_res *res)
+{
+ res->hits = atomic_swap(&base_hits.value, 0);
+}
+
+/*
+ * Producer for the BPF-backed trigger benchmarks: issue the getpgid syscall
+ * in an endless loop. Nothing is counted here; see trigger_measure().
+ * @input is unused.
+ */
+static void *trigger_producer(void *input)
+{
+	for (;;)
+		(void)syscall(__NR_getpgid);
+
+	/* not reached */
+	return NULL;
+}
+
+/*
+ * Report the value of the "hits" variable in the skeleton's bss and reset
+ * it to zero.
+ */
+static void trigger_measure(struct bench_res *res)
+{
+ res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+}
+
+/*
+ * Initialise libbpf, then open and load the trigger_bench skeleton into the
+ * shared context. Exits the process if the skeleton cannot be obtained.
+ */
+static void setup_ctx()
+{
+	struct trigger_bench *skel;
+
+	setup_libbpf();
+
+	skel = trigger_bench__open_and_load();
+	ctx.skel = skel;
+	if (skel)
+		return;
+
+	fprintf(stderr, "failed to open skeleton\n");
+	exit(1);
+}
+
+/*
+ * Attach @prog, exiting the process if the attach call reports an error.
+ * The resulting link is not kept.
+ */
+static void attach_bpf(struct bpf_program *prog)
+{
+	struct bpf_link *attached = bpf_program__attach(prog);
+
+	if (!IS_ERR(attached))
+		return;
+
+	fprintf(stderr, "failed to attach program!\n");
+	exit(1);
+}
+
+/*
+ * Per-benchmark setup callbacks. Each one loads the skeleton through
+ * setup_ctx() and attaches exactly one of its programs, named after the
+ * attach type being measured.
+ */
+static void trigger_tp_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_tp);
+}
+
+static void trigger_rawtp_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_raw_tp);
+}
+
+static void trigger_kprobe_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_kprobe);
+}
+
+static void trigger_fentry_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fentry);
+}
+
+static void trigger_fentry_sleep_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fentry_sleep);
+}
+
+static void trigger_fmodret_setup()
+{
+ setup_ctx();
+ attach_bpf(ctx.skel->progs.bench_trigger_fmodret);
+}
+
+/* Consumer thread placeholder: returns immediately without doing any work. */
+static void *trigger_consumer(void *input)
+{
+ return NULL;
+}
+
+/*
+ * Benchmark descriptors exported by this file. "trig-base" has no setup
+ * callback and counts hits in user space (trigger_base_producer /
+ * trigger_base_measure). Every other variant attaches one BPF program in
+ * its setup callback and reads the hit count from the skeleton through
+ * trigger_measure().
+ */
+const struct bench bench_trig_base = {
+ .name = "trig-base",
+ .validate = trigger_validate,
+ .producer_thread = trigger_base_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_base_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_tp = {
+ .name = "trig-tp",
+ .validate = trigger_validate,
+ .setup = trigger_tp_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_rawtp = {
+ .name = "trig-rawtp",
+ .validate = trigger_validate,
+ .setup = trigger_rawtp_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_kprobe = {
+ .name = "trig-kprobe",
+ .validate = trigger_validate,
+ .setup = trigger_kprobe_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_fentry = {
+ .name = "trig-fentry",
+ .validate = trigger_validate,
+ .setup = trigger_fentry_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_fentry_sleep = {
+ .name = "trig-fentry-sleep",
+ .validate = trigger_validate,
+ .setup = trigger_fentry_sleep_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
+
+const struct bench bench_trig_fmodret = {
+ .name = "trig-fmodret",
+ .validate = trigger_validate,
+ .setup = trigger_fmodret_setup,
+ .producer_thread = trigger_producer,
+ .consumer_thread = trigger_consumer,
+ .measure = trigger_measure,
+ .report_progress = hits_drops_report_progress,
+ .report_final = hits_drops_report_final,
+};
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_rename.sh b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh
new file mode 100755
index 000000000..7b281dbe4
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_rename.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+for i in base kprobe kretprobe rawtp fentry fexit
+do
+ summary=$(sudo ./bench -w2 -d5 -a rename-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+ printf "%-10s: %s\n" $i "$summary"
+done
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
new file mode 100755
index 000000000..af4aa04ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_ringbufs.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+RUN_BENCH="sudo ./bench -w3 -d10 -a"
+
+# Print the "<n>.<n> ± <n>.<n>M/s" figure that follows the word "hits" in the
+# text passed as arguments. If the pattern does not match, the text is
+# printed unchanged. printf is used instead of echo so that text starting
+# with an echo option such as -n is passed through intact.
+function hits()
+{
+	printf '%s\n' "$*" | sed -E "s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
+}
+
+# Print the "<n>.<n> ± <n>.<n>M/s" figure that follows the word "drops" in the
+# text passed as arguments. If the pattern does not match, the text is
+# printed unchanged. printf is used instead of echo so that text starting
+# with an echo option such as -n is passed through intact.
+function drops()
+{
+	printf '%s\n' "$*" | sed -E "s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"
+}
+
+# Print a section title ($1) preceded by a blank line and underlined with one
+# "=" per character of the title.
+function header()
+{
+	local len=${#1}
+	local i
+
+	printf "\n%s\n" "$1"
+	# Builtin arithmetic loop: no external seq, and the counter stays local.
+	for ((i = 0; i < len; i++)); do printf '='; done
+	printf '\n'
+}
+
+# Print one result row.
+# Arguments: $1 - label for the row
+#            $2 - complete output of one benchmark run
+# Only the last line of $2 is examined; the hits and drops figures are
+# extracted from it with hits() and drops().
+function summarize()
+{
+	local bench="$1"
+	local summary
+
+	# Quote $2 so the line breaks survive and tail really selects the last
+	# line; unquoted, the whole output collapsed into a single line.
+	summary=$(printf '%s\n' "$2" | tail -n1)
+	# $summary is left unquoted on purpose: word splitting collapses the
+	# column padding to single spaces (globbing is disabled by set -f).
+	printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)"
+}
+
+# Benchmark matrix. Each section prints a title and then one summarize row
+# per run. $RUN_BENCH is expanded unquoted so that it splits into the command
+# and its arguments.
+header "Single-producer, parallel producer"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+ summarize $b "$($RUN_BENCH $b)"
+done
+
+header "Single-producer, parallel producer, sampled notification"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+ summarize $b "$($RUN_BENCH --rb-sampled $b)"
+done
+
+# Every implementation is run twice here: plain and with sampled notification.
+header "Single-producer, back-to-back mode"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+ summarize $b "$($RUN_BENCH --rb-b2b $b)"
+ summarize $b-sampled "$($RUN_BENCH --rb-sampled --rb-b2b $b)"
+done
+
+# In the next two sections $b is passed as both batch count and sample rate.
+header "Ringbuf back-to-back, effect of sample rate"
+for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
+ summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b rb-custom)"
+done
+header "Perfbuf back-to-back, effect of sample rate"
+for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
+ summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b pb-custom)"
+done
+
+header "Ringbuf back-to-back, reserve+commit vs output"
+summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)"
+summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)"
+
+header "Ringbuf sampled, reserve+commit vs output"
+summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)"
+summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)"
+
+# Producer and consumer are both given affinity 0 in this section.
+header "Single-producer, consumer/producer competing on the same CPU, low batch count"
+for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
+ summarize $b "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 $b)"
+done
+
+# Here $b is the value passed to -p.
+header "Ringbuf, multi-producer contention"
+for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
+ summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)"
+done
+
diff --git a/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
new file mode 100755
index 000000000..78e83f243
--- /dev/null
+++ b/tools/testing/selftests/bpf/benchs/run_bench_trigger.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -eufo pipefail
+
+for i in base tp rawtp kprobe fentry fmodret
+do
+ summary=$(sudo ./bench -w2 -d5 -a trig-$i | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)
+ printf "%-10s: %s\n" $i "$summary"
+done
diff --git a/tools/testing/selftests/bpf/bpf_legacy.h b/tools/testing/selftests/bpf/bpf_legacy.h
new file mode 100644
index 000000000..719ab56cd
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_legacy.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#ifndef __BPF_LEGACY__
+#define __BPF_LEGACY__
+
+/*
+ * Declare struct ____btf_map_<name>, holding one member of the key type and
+ * one of the value type, and define a zero-initialised variable of that type
+ * in section ".maps.<name>". The "used" attribute is applied to the variable.
+ */
+#define BPF_ANNOTATE_KV_PAIR(name, type_key, type_val) \
+ struct ____btf_map_##name { \
+ type_key key; \
+ type_val value; \
+ }; \
+ struct ____btf_map_##name \
+ __attribute__ ((section(".maps." #name), used)) \
+ ____btf_map_##name = { }
+
+/* LLVM builtin functions that an eBPF C program may use to
+ * emit BPF_LD_ABS and BPF_LD_IND instructions. Each declaration is
+ * bound to its builtin through the asm label.
+ */
+unsigned long long load_byte(void *skb,
+ unsigned long long off) asm("llvm.bpf.load.byte");
+unsigned long long load_half(void *skb,
+ unsigned long long off) asm("llvm.bpf.load.half");
+unsigned long long load_word(void *skb,
+ unsigned long long off) asm("llvm.bpf.load.word");
+
+#endif
+
diff --git a/tools/testing/selftests/bpf/bpf_rand.h b/tools/testing/selftests/bpf/bpf_rand.h
new file mode 100644
index 000000000..59bf3e1a9
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_rand.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_RAND__
+#define __BPF_RAND__
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+
+/*
+ * Return a 64-bit value assembled from two rand() results (one per 32-bit
+ * half), restricted to the bits set in @mask.
+ * NOTE(review): rand() yields at most RAND_MAX, so the top bit(s) of each
+ * half may never be set, depending on the C library -- confirm whether that
+ * matters to callers.
+ */
+static inline uint64_t bpf_rand_mask(uint64_t mask)
+{
+ return (((uint64_t)(uint32_t)rand()) |
+ ((uint64_t)(uint32_t)rand() << 32)) & mask;
+}
+
+/*
+ * Generate bpf_rand_u<x>(shift): a random value limited to mask @m, then
+ * shifted left by @shift bits. The instantiations below provide the 8- to
+ * 64-bit wide variants in steps of 8 bits.
+ */
+#define bpf_rand_ux(x, m) \
+static inline uint64_t bpf_rand_u##x(int shift) \
+{ \
+ return bpf_rand_mask((m)) << shift; \
+}
+
+bpf_rand_ux( 8, 0xffULL)
+bpf_rand_ux(16, 0xffffULL)
+bpf_rand_ux(24, 0xffffffULL)
+bpf_rand_ux(32, 0xffffffffULL)
+bpf_rand_ux(40, 0xffffffffffULL)
+bpf_rand_ux(48, 0xffffffffffffULL)
+bpf_rand_ux(56, 0xffffffffffffffULL)
+bpf_rand_ux(64, 0xffffffffffffffffULL)
+
+/* Seed rand() with the current time. */
+static inline void bpf_semi_rand_init(void)
+{
+ srand(time(NULL));
+}
+
+/*
+ * Return a "semi-random" 64-bit value: one of 39 patterns is chosen with
+ * rand() % 39. Most patterns combine a fixed constant with a random field of
+ * limited width, some are plain constants, and the rest are random fields
+ * shifted by a random amount below 64. Selector value 38 has no case of its
+ * own and takes the default branch.
+ */
+static inline uint64_t bpf_semi_rand_get(void)
+{
+ switch (rand() % 39) {
+ case 0: return 0x000000ff00000000ULL | bpf_rand_u8(0);
+ case 1: return 0xffffffff00000000ULL | bpf_rand_u16(0);
+ case 2: return 0x00000000ffff0000ULL | bpf_rand_u16(0);
+ case 3: return 0x8000000000000000ULL | bpf_rand_u32(0);
+ case 4: return 0x00000000f0000000ULL | bpf_rand_u32(0);
+ case 5: return 0x0000000100000000ULL | bpf_rand_u24(0);
+ case 6: return 0x800ff00000000000ULL | bpf_rand_u32(0);
+ case 7: return 0x7fffffff00000000ULL | bpf_rand_u32(0);
+ case 8: return 0xffffffffffffff00ULL ^ bpf_rand_u32(24);
+ case 9: return 0xffffffffffffff00ULL | bpf_rand_u8(0);
+ case 10: return 0x0000000010000000ULL | bpf_rand_u32(0);
+ case 11: return 0xf000000000000000ULL | bpf_rand_u8(0);
+ case 12: return 0x0000f00000000000ULL | bpf_rand_u8(8);
+ case 13: return 0x000000000f000000ULL | bpf_rand_u8(16);
+ case 14: return 0x0000000000000f00ULL | bpf_rand_u8(32);
+ case 15: return 0x00fff00000000f00ULL | bpf_rand_u8(48);
+ case 16: return 0x00007fffffffffffULL ^ bpf_rand_u32(1);
+ case 17: return 0xffff800000000000ULL | bpf_rand_u8(4);
+ case 18: return 0xffff800000000000ULL | bpf_rand_u8(20);
+ case 19: return (0xffffffc000000000ULL + 0x80000ULL) | bpf_rand_u32(0);
+ case 20: return (0xffffffc000000000ULL - 0x04000000ULL) | bpf_rand_u32(0);
+ case 21: return 0x0000000000000000ULL | bpf_rand_u8(55) | bpf_rand_u32(20);
+ case 22: return 0xffffffffffffffffULL ^ bpf_rand_u8(3) ^ bpf_rand_u32(40);
+ /* from here on the shift amount itself is random, in 0..63 */
+ case 23: return 0x0000000000000000ULL | bpf_rand_u8(bpf_rand_u8(0) % 64);
+ case 24: return 0x0000000000000000ULL | bpf_rand_u16(bpf_rand_u8(0) % 64);
+ case 25: return 0xffffffffffffffffULL ^ bpf_rand_u8(bpf_rand_u8(0) % 64);
+ case 26: return 0xffffffffffffffffULL ^ bpf_rand_u40(bpf_rand_u8(0) % 64);
+ /* fixed constants */
+ case 27: return 0x0000800000000000ULL;
+ case 28: return 0x8000000000000000ULL;
+ case 29: return 0x0000000000000000ULL;
+ case 30: return 0xffffffffffffffffULL;
+ case 31: return bpf_rand_u16(bpf_rand_u8(0) % 64);
+ case 32: return bpf_rand_u24(bpf_rand_u8(0) % 64);
+ case 33: return bpf_rand_u32(bpf_rand_u8(0) % 64);
+ case 34: return bpf_rand_u40(bpf_rand_u8(0) % 64);
+ case 35: return bpf_rand_u48(bpf_rand_u8(0) % 64);
+ case 36: return bpf_rand_u56(bpf_rand_u8(0) % 64);
+ case 37: return bpf_rand_u64(bpf_rand_u8(0) % 64);
+ default: return bpf_rand_u64(0);
+ }
+}
+
+#endif /* __BPF_RAND__ */
diff --git a/tools/testing/selftests/bpf/bpf_rlimit.h b/tools/testing/selftests/bpf/bpf_rlimit.h
new file mode 100644
index 000000000..9dac9b30f
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_rlimit.h
@@ -0,0 +1,28 @@
+#include <sys/resource.h>
+#include <stdio.h>
+
+/*
+ * Constructor: lift RLIMIT_MEMLOCK to infinity. If that is refused, fall
+ * back to raising the previous limits by 1 MiB. Failures are reported with
+ * perror() but never abort the program.
+ */
+static __attribute__((constructor)) void bpf_rlimit_ctor(void)
+{
+	/* zero-initialised so the fallback below never reads garbage */
+	struct rlimit rlim_old = { 0 }, rlim_new = {
+		.rlim_cur = RLIM_INFINITY,
+		.rlim_max = RLIM_INFINITY,
+	};
+
+	if (getrlimit(RLIMIT_MEMLOCK, &rlim_old) < 0)
+		perror("Unable to read memlock rlimit");
+	/* For the sake of running the test cases, we temporarily
+	 * set rlimit to infinity in order for kernel to focus on
+	 * errors from actual test cases and not getting noise
+	 * from hitting memlock limits. The limit is on per-process
+	 * basis and not a global one, hence destructor not really
+	 * needed here.
+	 */
+	if (setrlimit(RLIMIT_MEMLOCK, &rlim_new) < 0) {
+		perror("Unable to lift memlock rlimit");
+		/* Trying out lower limit, but expect potential test
+		 * case failures from this!
+		 */
+		rlim_new.rlim_cur = rlim_old.rlim_cur + (1UL << 20);
+		rlim_new.rlim_max = rlim_old.rlim_max + (1UL << 20);
+		if (setrlimit(RLIMIT_MEMLOCK, &rlim_new) < 0)
+			perror("Unable to raise memlock rlimit");
+	}
+}
diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
new file mode 100644
index 000000000..2915664c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_TCP_HELPERS_H
+#define __BPF_TCP_HELPERS_H
+
+#include <stdbool.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_tracing.h>
+
+/*
+ * Define a BPF program called @name placed in section "struct_ops/<name>";
+ * the argument list is forwarded to BPF_PROG().
+ */
+#define BPF_STRUCT_OPS(name, args...) \
+SEC("struct_ops/"#name) \
+BPF_PROG(name, args)
+
+/* low 32 bits of bpf_jiffies64() */
+#define tcp_jiffies32 ((__u32)bpf_jiffies64())
+
+/*
+ * Reduced local declarations of socket structures, each listing only a
+ * handful of fields and carrying the preserve_access_index attribute.
+ * NOTE(review): presumably these mirror the kernel structures of the same
+ * name and rely on CO-RE relocation for the real field offsets -- confirm.
+ */
+struct sock_common {
+ unsigned char skc_state;
+ __u16 skc_num;
+} __attribute__((preserve_access_index));
+
+enum sk_pacing {
+ SK_PACING_NONE = 0,
+ SK_PACING_NEEDED = 1,
+ SK_PACING_FQ = 2,
+};
+
+struct sock {
+ struct sock_common __sk_common;
+ unsigned long sk_pacing_rate;
+ __u32 sk_pacing_status; /* see enum sk_pacing */
+} __attribute__((preserve_access_index));
+
+struct inet_sock {
+ struct sock sk;
+} __attribute__((preserve_access_index));
+
+struct inet_connection_sock {
+ struct inet_sock icsk_inet;
+ __u8 icsk_ca_state:6,
+ icsk_ca_setsockopt:1,
+ icsk_ca_dst_locked:1;
+ struct {
+ __u8 pending;
+ } icsk_ack;
+ /* 104 bytes of private storage, handed out by inet_csk_ca() */
+ __u64 icsk_ca_priv[104 / sizeof(__u64)];
+} __attribute__((preserve_access_index));
+
+struct request_sock {
+ struct sock_common __req_common;
+} __attribute__((preserve_access_index));
+
+/*
+ * Reduced TCP socket declaration. inet_conn is the first member, which is
+ * what lets tcp_sk() and inet_csk() cast a struct sock pointer to the wider
+ * types.
+ */
+struct tcp_sock {
+ struct inet_connection_sock inet_conn;
+
+ __u32 rcv_nxt;
+ __u32 snd_nxt;
+ __u32 snd_una;
+ __u8 ecn_flags;
+ __u32 delivered;
+ __u32 delivered_ce;
+ __u32 snd_cwnd;
+ __u32 snd_cwnd_cnt;
+ __u32 snd_cwnd_clamp;
+ __u32 snd_ssthresh;
+ __u8 syn_data:1, /* SYN includes data */
+ syn_fastopen:1, /* SYN includes Fast Open option */
+ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
+ syn_fastopen_ch:1, /* Active TFO re-enabling probe */
+ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
+ save_syn:1, /* Save headers of SYN packet */
+ is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
+ syn_smc:1; /* SYN includes SMC */
+ __u32 max_packets_out;
+ __u32 lsndtime;
+ __u32 prior_cwnd;
+ __u64 tcp_mstamp; /* most recent packet received/sent */
+} __attribute__((preserve_access_index));
+
+/* Reinterpret @sk as a connection socket; the const qualifier is dropped. */
+static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
+{
+ return (struct inet_connection_sock *)sk;
+}
+
+/* Return the icsk_ca_priv storage area of @sk as an untyped pointer. */
+static __always_inline void *inet_csk_ca(const struct sock *sk)
+{
+ return (void *)inet_csk(sk)->icsk_ca_priv;
+}
+
+/* Reinterpret @sk as a TCP socket; the const qualifier is dropped. */
+static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
+{
+ return (struct tcp_sock *)sk;
+}
+
+/*
+ * True when the 32-bit difference seq1 - seq2, read as a signed value, is
+ * negative.
+ */
+static __always_inline bool before(__u32 seq1, __u32 seq2)
+{
+ return (__s32)(seq1-seq2) < 0;
+}
+/* after(a, b) is before(b, a) */
+#define after(seq2, seq1) before(seq1, seq2)
+
+/* Single-bit flag values.
+ * NOTE(review): presumably for tcp_sock.ecn_flags -- confirm.
+ */
+#define TCP_ECN_OK 1
+#define TCP_ECN_QUEUE_CWR 2
+#define TCP_ECN_DEMAND_CWR 4
+#define TCP_ECN_SEEN 8
+
+/* Single-bit flag values.
+ * NOTE(review): presumably for inet_connection_sock.icsk_ack.pending -- confirm.
+ */
+enum inet_csk_ack_state_t {
+ ICSK_ACK_SCHED = 1,
+ ICSK_ACK_TIMER = 2,
+ ICSK_ACK_PUSHED = 4,
+ ICSK_ACK_PUSHED2 = 8,
+ ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
+};
+
+/* Event codes taken by the cwnd_event callback of struct tcp_congestion_ops. */
+enum tcp_ca_event {
+ CA_EVENT_TX_START = 0,
+ CA_EVENT_CWND_RESTART = 1,
+ CA_EVENT_COMPLETE_CWR = 2,
+ CA_EVENT_LOSS = 3,
+ CA_EVENT_ECN_NO_CE = 4,
+ CA_EVENT_ECN_IS_CE = 5,
+};
+
+/* Argument type of the pkts_acked callback of struct tcp_congestion_ops. */
+struct ack_sample {
+ __u32 pkts_acked;
+ __s32 rtt_us;
+ __u32 in_flight;
+} __attribute__((preserve_access_index));
+
+/* Argument type of the cong_control callback of struct tcp_congestion_ops. */
+struct rate_sample {
+ __u64 prior_mstamp; /* starting timestamp for interval */
+ __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
+ __s32 delivered; /* number of packets delivered over interval */
+ long interval_us; /* time for tp->delivered to incr "delivered" */
+ __u32 snd_interval_us; /* snd interval for delivered packets */
+ __u32 rcv_interval_us; /* rcv interval for delivered packets */
+ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
+ int losses; /* number of packets marked lost upon ACK */
+ __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
+ __u32 prior_in_flight; /* in flight before this ACK */
+ bool is_app_limited; /* is sample from packet with bubble in pipe? */
+ bool is_retrans; /* is sample from retransmission? */
+ bool is_ack_delayed; /* is this (likely) a delayed ACK? */
+} __attribute__((preserve_access_index));
+
+/* size of the name[] array in struct tcp_congestion_ops */
+#define TCP_CA_NAME_MAX 16
+/* NOTE(review): presumably a bit for tcp_congestion_ops.flags -- confirm */
+#define TCP_CONG_NEEDS_ECN 0x2
+
+/*
+ * Callback table describing a congestion control algorithm. The comment
+ * above each member states whether it is required or optional.
+ */
+struct tcp_congestion_ops {
+ char name[TCP_CA_NAME_MAX];
+ __u32 flags;
+
+ /* initialize private data (optional) */
+ void (*init)(struct sock *sk);
+ /* cleanup private data (optional) */
+ void (*release)(struct sock *sk);
+
+ /* return slow start threshold (required) */
+ __u32 (*ssthresh)(struct sock *sk);
+ /* do new cwnd calculation (required) */
+ void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
+ /* call before changing ca_state (optional) */
+ void (*set_state)(struct sock *sk, __u8 new_state);
+ /* call when cwnd event occurs (optional) */
+ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
+ /* call when ack arrives (optional) */
+ void (*in_ack_event)(struct sock *sk, __u32 flags);
+ /* new value of cwnd after loss (required) */
+ __u32 (*undo_cwnd)(struct sock *sk);
+ /* hook for packet ack accounting (optional) */
+ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
+ /* override sysctl_tcp_min_tso_segs */
+ __u32 (*min_tso_segs)(struct sock *sk);
+ /* returns the multiplier used in tcp_sndbuf_expand (optional) */
+ __u32 (*sndbuf_expand)(struct sock *sk);
+ /* call when packets are delivered to update cwnd and pacing rate,
+ * after all the ca_state processing. (optional)
+ */
+ void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
+};
+
+/*
+ * Plain min/max: each argument may be evaluated twice, so avoid arguments
+ * with side effects.
+ */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define max(a, b) ((a) > (b) ? (a) : (b))
+/*
+ * Smaller of x and y, ignoring a zero operand: if one is zero the other is
+ * returned. Each argument is evaluated once into a temporary.
+ */
+#define min_not_zero(x, y) ({ \
+ typeof(x) __x = (x); \
+ typeof(y) __y = (y); \
+ __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
+/*
+ * Grow tp->snd_cwnd by @acked, but not beyond snd_ssthresh, then clamp it
+ * to snd_cwnd_clamp. Returns the part of @acked that was not used to reach
+ * the threshold.
+ */
+static __always_inline __u32 tcp_slow_start(struct tcp_sock *tp, __u32 acked)
+{
+ __u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
+
+ /* subtract the amount by which cwnd actually grew */
+ acked -= cwnd - tp->snd_cwnd;
+ tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+
+ return acked;
+}
+
+/* True while the congestion window is below the slow start threshold. */
+static __always_inline bool tcp_in_slow_start(const struct tcp_sock *tp)
+{
+ return tp->snd_cwnd < tp->snd_ssthresh;
+}
+
+/*
+ * In slow start: true when snd_cwnd is less than twice max_packets_out.
+ * Otherwise: the value of the is_cwnd_limited bitfield, read through
+ * BPF_CORE_READ_BITFIELD().
+ */
+static __always_inline bool tcp_is_cwnd_limited(const struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ /* If in slow start, ensure cwnd grows to twice what was ACKed. */
+ if (tcp_in_slow_start(tp))
+ return tp->snd_cwnd < 2 * tp->max_packets_out;
+
+ return !!BPF_CORE_READ_BITFIELD(tp, is_cwnd_limited);
+}
+
+/*
+ * Additive increase: add @acked to snd_cwnd_cnt and grow snd_cwnd by one
+ * for every @w accumulated, keeping the remainder in snd_cwnd_cnt. The
+ * result is clamped to snd_cwnd_clamp.
+ * NOTE(review): @w is used as a divisor, so callers must pass a non-zero
+ * value -- confirm.
+ */
+static __always_inline void tcp_cong_avoid_ai(struct tcp_sock *tp, __u32 w, __u32 acked)
+{
+ /* If credits accumulated at a higher w, apply them gently now. */
+ if (tp->snd_cwnd_cnt >= w) {
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd++;
+ }
+
+ tp->snd_cwnd_cnt += acked;
+ if (tp->snd_cwnd_cnt >= w) {
+ /* whole multiples of w become window increments */
+ __u32 delta = tp->snd_cwnd_cnt / w;
+
+ tp->snd_cwnd_cnt -= delta * w;
+ tp->snd_cwnd += delta;
+ }
+ tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/bpf_util.h b/tools/testing/selftests/bpf/bpf_util.h
new file mode 100644
index 000000000..a3352a64c
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_util.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BPF_UTIL__
+#define __BPF_UTIL__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <bpf/libbpf.h> /* libbpf_num_possible_cpus */
+
+/*
+ * Return the number of possible CPUs as reported by libbpf. If libbpf
+ * reports a negative error code, print it as text and exit with status 1.
+ */
+static inline unsigned int bpf_num_possible_cpus(void)
+{
+	int ncpus;
+
+	ncpus = libbpf_num_possible_cpus();
+	if (ncpus >= 0)
+		return ncpus;
+
+	printf("Failed to get # of possible cpus: '%s'!\n", strerror(-ncpus));
+	exit(1);
+}
+
+/* alignment applied to every per-CPU value slot */
+#define __bpf_percpu_val_align __attribute__((__aligned__(8)))
+
+/*
+ * Declare @name as an array with one 8-byte-aligned slot of @type per
+ * possible CPU. The array length is a function call, so the declaration is
+ * a variable-length array. bpf_percpu() accesses the value in slot @cpu.
+ */
+#define BPF_DECLARE_PERCPU(type, name) \
+ struct { type v; /* padding */ } __bpf_percpu_val_align \
+ name[bpf_num_possible_cpus()]
+#define bpf_percpu(name, cpu) name[(cpu)].v
+
+/* Fallback definitions, used only when the macro is not already defined. */
+
+/* number of elements in array x (x must be an array, not a pointer) */
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* size in bytes of member MEMBER of type TYPE */
+#ifndef sizeof_field
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#endif
+
+/* offset of the first byte after MEMBER within TYPE
+ * NOTE(review): relies on offsetof, which none of the headers included
+ * directly above is guaranteed to provide -- confirm <stddef.h> is pulled in.
+ */
+#ifndef offsetofend
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof_field(TYPE, MEMBER))
+#endif
+
+#endif /* __BPF_UTIL__ */
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c
new file mode 100644
index 000000000..033051717
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_helpers.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <ftw.h>
+
+
+#include "cgroup_helpers.h"
+
+/*
+ * To avoid relying on the system setup, when setup_cgroup_environment() is
+ * called we create a new mount namespace. The cgroup2 root is mounted at
+ * CGROUP_MOUNT_PATH.
+ *
+ * Unfortunately, most people don't have cgroupv2 enabled at this point in time.
+ * It's easier to create our own mount namespace and manage it ourselves.
+ *
+ * We assume /mnt exists.
+ */
+
+/* Value passed to nftw() as its file-descriptor limit during cleanup. */
+#define WALK_FD_LIMIT 16
+/* Where the cgroup2 hierarchy is mounted for the tests. */
+#define CGROUP_MOUNT_PATH "/mnt"
+/* Directory below the mount point that holds every test cgroup. */
+#define CGROUP_WORK_DIR "/cgroup-test-work-dir"
+/*
+ * Build "<mount path><work dir><path>" into @buf.
+ * @buf must be a real array, not a pointer, because sizeof(buf) is used as
+ * the snprintf() bound.
+ */
+#define format_cgroup_path(buf, path) \
+ snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \
+ CGROUP_WORK_DIR, path)
+
+/**
+ * enable_all_controllers() - Enable all available cgroup v2 controllers
+ * @cgroup_path: Directory of the cgroup to operate on
+ *
+ * Reads the space-separated controller names from
+ * <cgroup_path>/cgroup.controllers and writes "+<name>" for each of them to
+ * <cgroup_path>/cgroup.subtree_control, in order to increase the code
+ * coverage. An empty cgroup.controllers file is not treated as an error.
+ *
+ * If successful, 0 is returned. On failure an error is logged to stderr and
+ * 1 is returned.
+ */
+static int enable_all_controllers(char *cgroup_path)
+{
+	char path[PATH_MAX + 1];
+	char buf[PATH_MAX];
+	char *c, *c2;
+	int fd, cfd;
+	ssize_t len;
+
+	snprintf(path, sizeof(path), "%s/cgroup.controllers", cgroup_path);
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		log_err("Opening cgroup.controllers: %s", path);
+		return 1;
+	}
+
+	/* Leave room for the terminating NUL added below. */
+	len = read(fd, buf, sizeof(buf) - 1);
+	if (len < 0) {
+		/* Log before close(): close() may overwrite read()'s errno. */
+		log_err("Reading cgroup.controllers: %s", path);
+		close(fd);
+		return 1;
+	}
+	buf[len] = 0;
+	close(fd);
+
+	/* No controllers available? We're probably on cgroup v1. */
+	if (len == 0)
+		return 0;
+
+	snprintf(path, sizeof(path), "%s/cgroup.subtree_control", cgroup_path);
+	cfd = open(path, O_RDWR);
+	if (cfd < 0) {
+		log_err("Opening cgroup.subtree_control: %s", path);
+		return 1;
+	}
+
+	/* One write per controller, each prefixed with '+' to enable it. */
+	for (c = strtok_r(buf, " ", &c2); c; c = strtok_r(NULL, " ", &c2)) {
+		if (dprintf(cfd, "+%s\n", c) <= 0) {
+			log_err("Enabling controller %s: %s", c, path);
+			close(cfd);
+			return 1;
+		}
+	}
+	close(cfd);
+	return 0;
+}
+
+/**
+ * setup_cgroup_environment() - Prepare a private cgroup2 test environment
+ *
+ * Unshares the mount namespace, makes "/" recursively private, mounts
+ * cgroup2 on CGROUP_MOUNT_PATH (an already-busy mount point is accepted),
+ * removes leftovers of earlier runs, creates the work directory and enables
+ * all controllers in it.
+ *
+ * cleanup_cgroup_environment() should be called once testing is complete.
+ *
+ * Returns 0 on success. On failure an error is printed to stderr and 1 is
+ * returned.
+ */
+int setup_cgroup_environment(void)
+{
+	char cgroup_workdir[PATH_MAX - 24];
+
+	format_cgroup_path(cgroup_workdir, "");
+
+	if (unshare(CLONE_NEWNS) != 0) {
+		log_err("unshare");
+		return 1;
+	}
+
+	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0) {
+		log_err("mount fakeroot");
+		return 1;
+	}
+
+	if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL) != 0) {
+		/* EBUSY is tolerated; anything else is fatal. */
+		if (errno != EBUSY) {
+			log_err("mount cgroup2");
+			return 1;
+		}
+	}
+
+	/* Cleanup existing failed runs, now that the environment is setup */
+	cleanup_cgroup_environment();
+
+	if (mkdir(cgroup_workdir, 0777) != 0) {
+		/* A work dir that already exists is fine. */
+		if (errno != EEXIST) {
+			log_err("mkdir cgroup work dir");
+			return 1;
+		}
+	}
+
+	return enable_all_controllers(cgroup_workdir) ? 1 : 0;
+}
+
+/*
+ * nftw() callback: remove @filename when the walk reports it as a
+ * directory, logging (but otherwise ignoring) rmdir() failures.
+ *
+ * The type flag handed over by nftw() is an enumerated value, not a bit
+ * mask, so it is compared against the directory codes explicitly: FTW_D
+ * (pre-order walk) and FTW_DP (post-order walk, as requested by FTW_DEPTH).
+ *
+ * Always returns 0 so that the tree walk continues.
+ */
+static int nftwfunc(const char *filename, const struct stat *statptr,
+		    int fileflags, struct FTW *pfwt)
+{
+	int is_dir = fileflags == FTW_D || fileflags == FTW_DP;
+
+	if (is_dir && rmdir(filename))
+		log_err("Removing cgroup: %s", filename);
+	return 0;
+}
+
+
+/*
+ * Move the calling process into the cgroup at @cgroup_path (a full path,
+ * not relative to the work dir) by writing its pid to cgroup.procs.
+ *
+ * Returns 0 on success, 1 on failure (an error is logged).
+ */
+static int join_cgroup_from_top(char *cgroup_path)
+{
+	char procs_file[PATH_MAX + 1];
+	int procs_fd, failed;
+
+	snprintf(procs_file, sizeof(procs_file), "%s/cgroup.procs",
+		 cgroup_path);
+
+	procs_fd = open(procs_file, O_WRONLY);
+	if (procs_fd < 0) {
+		log_err("Opening Cgroup Procs: %s", procs_file);
+		return 1;
+	}
+
+	failed = dprintf(procs_fd, "%d\n", getpid()) < 0;
+	if (failed)
+		log_err("Joining Cgroup");
+
+	close(procs_fd);
+	return failed;
+}
+
+/**
+ * join_cgroup() - Join a cgroup
+ * @path: The cgroup path, relative to the workdir, to join
+ *
+ * The cgroup must already exist below the cgroup work dir. For example,
+ * passing "/my-cgroup" puts the calling process into
+ * "/cgroup-test-work-dir/my-cgroup".
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int join_cgroup(const char *path)
+{
+	char target[PATH_MAX + 1];
+
+	/* Expand the relative path to a full path under the mount point. */
+	format_cgroup_path(target, path);
+
+	return join_cgroup_from_top(target);
+}
+
+/**
+ * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment
+ *
+ * Moves the calling process to the cgroup at CGROUP_MOUNT_PATH and then
+ * walks the work directory depth-first, removing every directory found,
+ * the work directory itself included.
+ *
+ * Failures are logged to stderr by the helpers and otherwise ignored, so
+ * the function always tries to continue.
+ */
+void cleanup_cgroup_environment(void)
+{
+	char workdir[PATH_MAX + 1];
+
+	/* Leave the test cgroups first so they can be removed. */
+	join_cgroup_from_top(CGROUP_MOUNT_PATH);
+
+	format_cgroup_path(workdir, "");
+	nftw(workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT);
+}
+
+/**
+ * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD
+ * @path: The cgroup path, relative to the workdir, to create
+ *
+ * Creates the cgroup directory below the work dir (an already existing
+ * directory is accepted) and opens it read-only.
+ *
+ * Returns the file descriptor on success. On failure an error is printed
+ * to stderr and -1 is returned.
+ */
+int create_and_get_cgroup(const char *path)
+{
+	char cgroup_path[PATH_MAX + 1];
+	int cg_fd;
+
+	format_cgroup_path(cgroup_path, path);
+
+	if (mkdir(cgroup_path, 0777) != 0) {
+		if (errno != EEXIST) {
+			log_err("mkdiring cgroup %s .. %s", path, cgroup_path);
+			return -1;
+		}
+	}
+
+	cg_fd = open(cgroup_path, O_RDONLY);
+	if (cg_fd >= 0)
+		return cg_fd;
+
+	log_err("Opening Cgroup");
+	return -1;
+}
+
+/**
+ * get_cgroup_id() - Get cgroup id for a particular cgroup path
+ * @path: The cgroup path, relative to the workdir, to look up
+ *
+ * The id is taken from the 8-byte file handle that name_to_handle_at()
+ * reports for the cgroup directory.
+ *
+ * On success, it returns the cgroup id. On failure it returns 0,
+ * which is an invalid cgroup id.
+ * If there is a failure, it prints the error to stderr.
+ */
+unsigned long long get_cgroup_id(const char *path)
+{
+ int dirfd, err, flags, mount_id, fhsize;
+ /* Lets the 8 raw handle bytes be read back as one 64-bit id. */
+ union {
+ unsigned long long cgid;
+ unsigned char raw_bytes[8];
+ } id;
+ char cgroup_workdir[PATH_MAX + 1];
+ struct file_handle *fhp, *fhp2;
+ unsigned long long ret = 0;
+
+ format_cgroup_path(cgroup_workdir, path);
+
+ dirfd = AT_FDCWD;
+ flags = 0;
+ fhsize = sizeof(*fhp);
+ fhp = calloc(1, fhsize);
+ if (!fhp) {
+ log_err("calloc");
+ return 0;
+ }
+ /*
+ * Probe call: handle_bytes is 0 (calloc), so there is no room for a
+ * handle. The call is expected to fail while storing the required
+ * size in handle_bytes; success, or a size other than 8, is an error.
+ */
+ err = name_to_handle_at(dirfd, cgroup_workdir, fhp, &mount_id, flags);
+ if (err >= 0 || fhp->handle_bytes != 8) {
+ log_err("name_to_handle_at");
+ goto free_mem;
+ }
+
+ /* Grow the buffer so the handle itself fits, then ask again. */
+ fhsize = sizeof(struct file_handle) + fhp->handle_bytes;
+ fhp2 = realloc(fhp, fhsize);
+ if (!fhp2) {
+ /* fhp is still valid here and is released at free_mem. */
+ log_err("realloc");
+ goto free_mem;
+ }
+ err = name_to_handle_at(dirfd, cgroup_workdir, fhp2, &mount_id, flags);
+ /* From here on only the reallocated pointer may be used or freed. */
+ fhp = fhp2;
+ if (err < 0) {
+ log_err("name_to_handle_at");
+ goto free_mem;
+ }
+
+ memcpy(id.raw_bytes, fhp->f_handle, 8);
+ ret = id.cgid;
+
+free_mem:
+ free(fhp);
+ return ret;
+}
+
+/**
+ * cgroup_setup_and_join() - Set up the environment, create a cgroup, join it
+ * @path: The cgroup path, relative to the workdir, to create and join
+ *
+ * Convenience wrapper around setup_cgroup_environment(),
+ * create_and_get_cgroup() and join_cgroup().
+ *
+ * On success, returns the file descriptor of the cgroup; the caller owns it.
+ * On failure, prints a message to stderr, undoes the steps already taken
+ * and returns a negative value (-EINVAL, or the negative result of
+ * create_and_get_cgroup()).
+ */
+int cgroup_setup_and_join(const char *path) {
+	int cg_fd;
+
+	if (setup_cgroup_environment()) {
+		fprintf(stderr, "Failed to setup cgroup environment\n");
+		return -EINVAL;
+	}
+
+	cg_fd = create_and_get_cgroup(path);
+	if (cg_fd < 0) {
+		fprintf(stderr, "Failed to create test cgroup\n");
+		cleanup_cgroup_environment();
+		return cg_fd;
+	}
+
+	if (join_cgroup(path)) {
+		fprintf(stderr, "Failed to join cgroup\n");
+		/* Do not leak the cgroup descriptor on this error path. */
+		close(cg_fd);
+		cleanup_cgroup_environment();
+		return -EINVAL;
+	}
+	return cg_fd;
+}
diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h
new file mode 100644
index 000000000..5fe3d88e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/cgroup_helpers.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __CGROUP_HELPERS_H
+#define __CGROUP_HELPERS_H
+#include <errno.h>
+#include <string.h>
+
+/* Text for the current errno, or "None" when errno is 0. */
+#define clean_errno() (errno == 0 ? "None" : strerror(errno))
+/*
+ * Print "(<file>:<line>: errno: <text>) <message>\n" to stderr. MSG must be
+ * a string literal because it is pasted into the format string.
+ * NOTE(review): uses fprintf()/stderr but this header does not include
+ * <stdio.h>; includers presumably provide it - confirm.
+ */
+#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
+ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
+
+
+int cgroup_setup_and_join(const char *path);
+int create_and_get_cgroup(const char *path);
+int join_cgroup(const char *path);
+int setup_cgroup_environment(void);
+void cleanup_cgroup_environment(void);
+unsigned long long get_cgroup_id(const char *path);
+
+#endif
diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config
new file mode 100644
index 000000000..2118e23ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/config
@@ -0,0 +1,41 @@
+CONFIG_BPF=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_NET_CLS_BPF=m
+CONFIG_BPF_EVENTS=y
+CONFIG_TEST_BPF=m
+CONFIG_CGROUP_BPF=y
+CONFIG_NETDEVSIM=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_INGRESS=y
+CONFIG_NET_IPIP=y
+CONFIG_IPV6=y
+CONFIG_NET_IPGRE_DEMUX=y
+CONFIG_NET_IPGRE=y
+CONFIG_IPV6_GRE=y
+CONFIG_CRYPTO_USER_API_HASH=m
+CONFIG_CRYPTO_HMAC=m
+CONFIG_CRYPTO_SHA256=m
+CONFIG_VXLAN=y
+CONFIG_GENEVE=y
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_LWTUNNEL=y
+CONFIG_BPF_STREAM_PARSER=y
+CONFIG_XDP_SOCKETS=y
+CONFIG_FTRACE_SYSCALLS=y
+CONFIG_IPV6_TUNNEL=y
+CONFIG_IPV6_GRE=y
+CONFIG_IPV6_SEG6_BPF=y
+CONFIG_NET_FOU=m
+CONFIG_NET_FOU_IP_TUNNELS=y
+CONFIG_IPV6_FOU=m
+CONFIG_IPV6_FOU_TUNNEL=m
+CONFIG_MPLS=y
+CONFIG_NET_MPLS_GSO=m
+CONFIG_MPLS_ROUTING=m
+CONFIG_MPLS_IPTUNNEL=m
+CONFIG_IPV6_SIT=m
+CONFIG_BPF_JIT=y
+CONFIG_BPF_LSM=y
+CONFIG_SECURITY=y
+CONFIG_LIRC=y
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c
new file mode 100644
index 000000000..3fd83b9dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/flow_dissector_load.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "flow_dissector_load.h"
+
+/* Path the loaded object is pinned at, and removed from on detach. */
+const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
+/* Name of the map passed to bpf_flow_load() as its map_name argument. */
+const char *cfg_map_name = "jmp_table";
+/* true: load and attach (default); false: detach (set by -d). */
+bool cfg_attach = true;
+/* Section name given with -s. */
+char *cfg_section_name;
+/* Path of the BPF object file given with -p. */
+char *cfg_path_name;
+
+/*
+ * Load the object named by cfg_path_name, attach the program found in
+ * cfg_section_name as the flow dissector and pin the object at
+ * cfg_pin_path. Any failing step terminates the process with status 1.
+ */
+static void load_and_attach_program(void)
+{
+	struct bpf_object *obj;
+	int prog_fd;
+
+	if (bpf_flow_load(&obj, cfg_path_name, cfg_section_name,
+			  cfg_map_name, NULL, &prog_fd, NULL))
+		error(1, 0, "bpf_flow_load %s", cfg_path_name);
+
+	if (bpf_prog_attach(prog_fd, 0 /* Ignore */, BPF_FLOW_DISSECTOR, 0))
+		error(1, 0, "bpf_prog_attach %s", cfg_path_name);
+
+	if (bpf_object__pin(obj, cfg_pin_path))
+		error(1, 0, "bpf_object__pin %s", cfg_pin_path);
+}
+
+/*
+ * Detach the flow dissector program and remove the pin directory.
+ * Any failing step terminates the process with status 1.
+ */
+static void detach_program(void)
+{
+	char command[64];
+	int ret;
+
+	ret = bpf_prog_detach(0, BPF_FLOW_DISSECTOR);
+	if (ret)
+		error(1, 0, "bpf_prog_detach");
+
+	/* To unpin, it is necessary and sufficient to just remove this dir */
+	/* Bounded formatting: refuse to run a truncated command line. */
+	ret = snprintf(command, sizeof(command), "rm -r %s", cfg_pin_path);
+	if (ret < 0 || (size_t)ret >= sizeof(command))
+		error(1, 0, "pin path too long: %s", cfg_pin_path);
+
+	ret = system(command);
+	if (ret)
+		error(1, errno, "%s", command);
+}
+
+/*
+ * Parse the command line into the cfg_* globals.
+ *
+ *   -a         attach (default)
+ *   -d         detach; mutually exclusive with -a
+ *   -p <path>  BPF object file to load (required when attaching)
+ *   -s <name>  section name of the program (required when attaching)
+ *
+ * Invalid usage terminates the process with status 1.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	bool attach = false;
+	bool detach = false;
+	int c;
+
+	while ((c = getopt(argc, argv, "adp:s:")) != -1) {
+		switch (c) {
+		case 'a':
+			if (detach)
+				error(1, 0, "attach/detach are exclusive");
+			attach = true;
+			break;
+		case 'd':
+			if (attach)
+				error(1, 0, "attach/detach are exclusive");
+			detach = true;
+			break;
+		case 'p':
+			if (cfg_path_name)
+				error(1, 0, "only one prog name can be given");
+
+			cfg_path_name = optarg;
+			break;
+		case 's':
+			if (cfg_section_name)
+				error(1, 0, "only one section can be given");
+
+			cfg_section_name = optarg;
+			break;
+		default:
+			/*
+			 * getopt() returns '?' for an unknown option or a
+			 * missing argument; do not carry on with a partly
+			 * parsed command line.
+			 */
+			error(1, 0, "usage: %s [-a | -d] [-p prog] [-s section]",
+			      argv[0]);
+		}
+	}
+
+	if (detach)
+		cfg_attach = false;
+
+	if (cfg_attach && !cfg_path_name)
+		error(1, 0, "must provide a path to the BPF program");
+
+	if (cfg_attach && !cfg_section_name)
+		error(1, 0, "must provide a section name");
+}
+
+/* Entry point: parse options, then either attach or detach. */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (!cfg_attach) {
+		detach_program();
+		return 0;
+	}
+
+	load_and_attach_program();
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.h b/tools/testing/selftests/bpf/flow_dissector_load.h
new file mode 100644
index 000000000..7290401ec
--- /dev/null
+++ b/tools/testing/selftests/bpf/flow_dissector_load.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
+#ifndef FLOW_DISSECTOR_LOAD
+#define FLOW_DISSECTOR_LOAD
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+/*
+ * Load a flow dissector object and wire up its program array.
+ *
+ * @obj:           receives the loaded object
+ * @path:          object file to load
+ * @section_name:  section of the main program
+ * @map_name:      map that receives the fds of all other programs
+ * @keys_map_name: optional map to look up (may be NULL)
+ * @prog_fd:       receives the fd of the main program
+ * @keys_fd:       optional; receives the fd of @keys_map_name (may be NULL)
+ *
+ * Every program other than the main one is stored in @map_name at
+ * consecutive indices starting from 0; the result of each map update is
+ * not checked.
+ *
+ * Returns 0 on success, the bpf_prog_load() result if loading fails, a
+ * negative program fd if one is encountered while filling the map, and -1
+ * for every other failure.
+ */
+static inline int bpf_flow_load(struct bpf_object **obj,
+				const char *path,
+				const char *section_name,
+				const char *map_name,
+				const char *keys_map_name,
+				int *prog_fd,
+				int *keys_fd)
+{
+	struct bpf_program *main_prog = NULL, *it;
+	struct bpf_map *jmp_map, *keys_map;
+	int jmp_fd, sub_fd, slot = 0;
+	int err;
+
+	err = bpf_prog_load(path, BPF_PROG_TYPE_FLOW_DISSECTOR, obj, prog_fd);
+	if (err)
+		return err;
+
+	/* Locate the main program by its section name. */
+	bpf_object__for_each_program(it, *obj) {
+		if (!strcmp(section_name, bpf_program__section_name(it))) {
+			main_prog = it;
+			break;
+		}
+	}
+	if (!main_prog)
+		return -1;
+
+	*prog_fd = bpf_program__fd(main_prog);
+	if (*prog_fd < 0)
+		return -1;
+
+	jmp_map = bpf_object__find_map_by_name(*obj, map_name);
+	if (!jmp_map)
+		return -1;
+
+	jmp_fd = bpf_map__fd(jmp_map);
+	if (jmp_fd < 0)
+		return -1;
+
+	/* The keys map is only resolved when both name and out-param exist. */
+	if (keys_map_name && keys_fd) {
+		keys_map = bpf_object__find_map_by_name(*obj, keys_map_name);
+		if (!keys_map)
+			return -1;
+
+		*keys_fd = bpf_map__fd(keys_map);
+		if (*keys_fd < 0)
+			return -1;
+	}
+
+	/* Store every program except the main one in the program array. */
+	bpf_object__for_each_program(it, *obj) {
+		sub_fd = bpf_program__fd(it);
+		if (sub_fd < 0)
+			return sub_fd;
+
+		if (sub_fd == *prog_fd)
+			continue;
+
+		bpf_map_update_elem(jmp_fd, &slot, &sub_fd, BPF_ANY);
+		++slot;
+	}
+
+	return 0;
+}
+
+#endif /* FLOW_DISSECTOR_LOAD */
diff --git a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c
new file mode 100644
index 000000000..b8d6aef99
--- /dev/null
+++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+
+/*
+ * Evaluate @condition as a failure predicate. When it is true, print
+ * "<func>:FAIL:<tag> " followed by the printf-style @format message;
+ * otherwise print "<func>:PASS:<tag>". The macro yields 1 on failure and 0
+ * on success, so it can be used directly as an if () condition. It does not
+ * abort by itself.
+ */
+#define CHECK(condition, tag, format...) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("%s:FAIL:%s ", __func__, tag); \
+ printf(format); \
+ } else { \
+ printf("%s:PASS:%s\n", __func__, tag); \
+ } \
+ __ret; \
+})
+
+/*
+ * Return the fd of the map called @name in @obj, or -1 when the object has
+ * no such map. @test is accepted but not used.
+ */
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *found = bpf_object__find_map_by_name(obj, name);
+
+	return found ? bpf_map__fd(found) : -1;
+}
+
+#define TEST_CGROUP "/test-bpf-get-cgroup-id/"
+
+/*
+ * Check that the cgroup id recorded by the BPF program matches the id that
+ * user space obtains for the same cgroup.
+ *
+ * Steps: join TEST_CGROUP, load get_cgroup_id_kern.o, publish our pid in
+ * "pidmap", attach the program to the sys_enter_nanosleep tracepoint
+ * through a perf event, sleep to trigger it, then compare the id stored in
+ * "cg_ids" with get_cgroup_id().
+ *
+ * Returns 0 when the ids match and 1 on any failure.
+ */
+int main(int argc, char **argv)
+{
+	const char *probe_name = "syscalls/sys_enter_nanosleep";
+	const char *file = "get_cgroup_id_kern.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	int cgroup_fd, cgidmap_fd, pidmap_fd;
+	struct perf_event_attr attr = {};
+	struct bpf_object *obj;
+	__u64 kcgid = 0, ucgid;
+	__u32 key = 0, pid;
+	int exit_code = 1;
+	char buf[256];
+
+	cgroup_fd = cgroup_setup_and_join(TEST_CGROUP);
+	if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno))
+		return 1;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	cgidmap_fd = bpf_find_map(__func__, obj, "cg_ids");
+	if (CHECK(cgidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  cgidmap_fd, errno))
+		goto close_prog;
+
+	pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
+	if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  pidmap_fd, errno))
+		goto close_prog;
+
+	/* Store our pid in slot 0 of the pid map. */
+	pid = getpid();
+	bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
+
+	/* The tracepoint id is needed as the perf event config value. */
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+	/*
+	 * read() does not terminate the data; without this strtol() would
+	 * run into whatever is left of the path that was in buf before.
+	 * The check above guarantees 0 < bytes < sizeof(buf).
+	 */
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	/* attach to this pid so that all bpf invocations will be in the
+	 * cgroup associated with this pid.
+	 */
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* trigger some syscalls */
+	sleep(1);
+
+	err = bpf_map_lookup_elem(cgidmap_fd, &key, &kcgid);
+	if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	ucgid = get_cgroup_id(TEST_CGROUP);
+	/* Trailing newline added so the FAIL line is properly terminated. */
+	if (CHECK(kcgid != ucgid, "compare_cgroup_id",
+		  "kern cgid %llx user cgid %llx\n", kcgid, ucgid))
+		goto close_pmu;
+
+	exit_code = 0;
+	printf("%s:PASS\n", argv[0]);
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	bpf_object__close(obj);
+cleanup_cgroup_env:
+	cleanup_cgroup_environment();
+	return exit_code;
+}
diff --git a/tools/testing/selftests/bpf/gnu/stubs.h b/tools/testing/selftests/bpf/gnu/stubs.h
new file mode 100644
index 000000000..719225b16
--- /dev/null
+++ b/tools/testing/selftests/bpf/gnu/stubs.h
@@ -0,0 +1 @@
+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */
diff --git a/tools/testing/selftests/bpf/map_tests/.gitignore b/tools/testing/selftests/bpf/map_tests/.gitignore
new file mode 100644
index 000000000..89c4a3d37
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+tests.h
diff --git a/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
new file mode 100644
index 000000000..f0a64d8ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/array_map_batch_ops.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_maps.h>
+
+/*
+ * Fill @keys with 0..max_entries-1 and @values with key + 1, then push all
+ * pairs into @map_fd with one batch update. The result is verified with
+ * CHECK().
+ */
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     int *values)
+{
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+		.elem_flags = 0,
+		.flags = 0,
+	);
+	int idx, err;
+
+	for (idx = 0; idx < max_entries; idx++) {
+		keys[idx] = idx;
+		values[idx] = idx + 1;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+/*
+ * Verify that every returned pair satisfies value == key + 1 and mark each
+ * position in @visited; afterwards confirm that all @max_entries positions
+ * were marked.
+ */
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, int *values)
+{
+	int pos;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+
+	for (pos = 0; pos < max_entries; pos++) {
+		CHECK(keys[pos] + 1 != values[pos], "key/value checking",
+		      "error: i %d key %d value %d\n", pos, keys[pos],
+		      values[pos]);
+		visited[pos] = 1;
+	}
+
+	for (pos = 0; pos < max_entries; pos++)
+		CHECK(visited[pos] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", pos);
+}
+
+/*
+ * Batch-operation test for BPF_MAP_TYPE_ARRAY.
+ *
+ * Creates a 10-entry array map, populates it with a batch update and then,
+ * for every step size from 1 to max_entries - 1, reads the whole map back
+ * with bpf_map_lookup_batch() in chunks of 'step' elements and verifies the
+ * returned keys and values.
+ */
+void test_array_map_batch_ops(void)
+{
+	struct bpf_create_map_attr xattr = {
+		.name = "array_map",
+		.map_type = BPF_MAP_TYPE_ARRAY,
+		.key_size = sizeof(int),
+		.value_size = sizeof(int),
+	};
+	int map_fd, *keys, *values, *visited;
+	__u32 count, total, total_success;
+	const __u32 max_entries = 10;
+	__u64 batch = 0;
+	int err, step;
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+		.elem_flags = 0,
+		.flags = 0,
+	);
+
+	xattr.max_entries = max_entries;
+	map_fd = bpf_create_map_xattr(&xattr);
+	CHECK(map_fd == -1,
+	      "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+	keys = malloc(max_entries * sizeof(int));
+	values = malloc(max_entries * sizeof(int));
+	visited = malloc(max_entries * sizeof(int));
+	CHECK(!keys || !values || !visited, "malloc()", "error:%s\n",
+	      strerror(errno));
+
+	/* populate elements to the map */
+	map_batch_update(map_fd, max_entries, keys, values);
+
+	/* test 1: lookup in a loop with various steps. */
+	total_success = 0;
+	for (step = 1; step < max_entries; step++) {
+		map_batch_update(map_fd, max_entries, keys, values);
+		map_batch_verify(visited, max_entries, keys, values);
+		memset(keys, 0, max_entries * sizeof(*keys));
+		memset(values, 0, max_entries * sizeof(*values));
+		batch = 0;
+		total = 0;
+		/* iteratively lookup elements with 'step' elements each. */
+		count = step;
+		while (true) {
+			/* The first call passes no input batch token. */
+			err = bpf_map_lookup_batch(map_fd,
+						   total ? &batch : NULL, &batch,
+						   keys + total,
+						   values + total,
+						   &count, &opts);
+
+			CHECK((err && errno != ENOENT), "lookup with steps",
+			      "error: %s\n", strerror(errno));
+
+			/* count now holds the number of elements returned. */
+			total += count;
+			if (err)
+				break;
+		}
+
+		CHECK(total != max_entries, "lookup with steps",
+		      "total = %u, max_entries = %u\n", total, max_entries);
+
+		map_batch_verify(visited, max_entries, keys, values);
+
+		total_success++;
+	}
+
+	CHECK(total_success == 0, "check total_success",
+	      "unexpected failure\n");
+
+	printf("%s:PASS\n", __func__);
+
+	free(keys);
+	free(values);
+	free(visited);
+}
diff --git a/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
new file mode 100644
index 000000000..976bf415f
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/htab_map_batch_ops.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <bpf_util.h>
+#include <test_maps.h>
+
+/*
+ * Fill @keys with 1..max_entries and @values with matching test data, then
+ * push everything into @map_fd with one batch update.
+ *
+ * For a plain hash, @values is an int array with values[i] = i + 2. For a
+ * per-CPU hash, @values holds one per-CPU slot array per key and the slot
+ * of CPU j is set to i + 2 + j.
+ */
+static void map_batch_update(int map_fd, __u32 max_entries, int *keys,
+			     void *values, bool is_pcpu)
+{
+	typedef BPF_DECLARE_PERCPU(int, value);
+	DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+		.elem_flags = 0,
+		.flags = 0,
+	);
+	value *pcpu = is_pcpu ? (value *)values : NULL;
+	int i, cpu, err;
+
+	for (i = 0; i < max_entries; i++) {
+		keys[i] = i + 1;
+
+		if (!is_pcpu) {
+			((int *)values)[i] = i + 2;
+			continue;
+		}
+
+		for (cpu = 0; cpu < bpf_num_possible_cpus(); cpu++)
+			bpf_percpu(pcpu[i], cpu) = i + 2 + cpu;
+	}
+
+	err = bpf_map_update_batch(map_fd, keys, values, &max_entries, &opts);
+	CHECK(err, "bpf_map_update_batch()", "error:%s\n", strerror(errno));
+}
+
+/*
+ * Verify the data returned by a batch lookup.
+ *
+ * A plain hash must satisfy value == key + 1; a per-CPU hash must satisfy
+ * value(cpu j) == key + 1 + j for every possible CPU. Each checked position
+ * is marked in @visited, and all @max_entries positions must end up marked.
+ */
+static void map_batch_verify(int *visited, __u32 max_entries,
+			     int *keys, void *values, bool is_pcpu)
+{
+	typedef BPF_DECLARE_PERCPU(int, value);
+	value *pcpu = is_pcpu ? (value *)values : NULL;
+	int i, cpu;
+
+	memset(visited, 0, max_entries * sizeof(*visited));
+
+	for (i = 0; i < max_entries; i++) {
+		if (!is_pcpu) {
+			CHECK(keys[i] + 1 != ((int *)values)[i],
+			      "key/value checking",
+			      "error: i %d key %d value %d\n", i, keys[i],
+			      ((int *)values)[i]);
+		} else {
+			for (cpu = 0; cpu < bpf_num_possible_cpus(); cpu++) {
+				CHECK(keys[i] + 1 + cpu != bpf_percpu(pcpu[i], cpu),
+				      "key/value checking",
+				      "error: i %d j %d key %d value %d\n",
+				      i, cpu, keys[i], bpf_percpu(pcpu[i], cpu));
+			}
+		}
+
+		visited[i] = 1;
+	}
+
+	for (i = 0; i < max_entries; i++)
+		CHECK(visited[i] != 1, "visited checking",
+		      "error: keys array at index %d missing\n", i);
+}
+
+/*
+ * Exercise the batch lookup, delete and lookup-and-delete operations on a
+ * hash map (is_pcpu == false) or a per-CPU hash map (is_pcpu == true).
+ *
+ * Tests 1-3 run lookup-and-delete on an empty map, with count == 0 and with
+ * count == max_entries. Test 4 repeats, for every step size from 1 to
+ * max_entries - 1: a chunked lookup, a chunked delete and a chunked
+ * lookup-and-delete, refilling the map in between. Step sizes that hit
+ * ENOSPC are skipped; at least one step size must succeed.
+ */
+void __test_map_lookup_and_delete_batch(bool is_pcpu)
+{
+ __u32 batch, count, total, total_success;
+ typedef BPF_DECLARE_PERCPU(int, value);
+ int map_fd, *keys, *visited, key;
+ const __u32 max_entries = 10;
+ /* Per-CPU value buffer lives on the stack; used only when is_pcpu. */
+ value pcpu_values[max_entries];
+ int err, step, value_size;
+ bool nospace_err;
+ void *values;
+ struct bpf_create_map_attr xattr = {
+ .name = "hash_map",
+ .map_type = is_pcpu ? BPF_MAP_TYPE_PERCPU_HASH :
+ BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ };
+ DECLARE_LIBBPF_OPTS(bpf_map_batch_opts, opts,
+ .elem_flags = 0,
+ .flags = 0,
+ );
+
+ xattr.max_entries = max_entries;
+ map_fd = bpf_create_map_xattr(&xattr);
+ CHECK(map_fd == -1,
+ "bpf_create_map_xattr()", "error:%s\n", strerror(errno));
+
+ /* Size of one element in the user-space value buffer. */
+ value_size = is_pcpu ? sizeof(value) : sizeof(int);
+ keys = malloc(max_entries * sizeof(int));
+ if (is_pcpu)
+ values = pcpu_values;
+ else
+ values = malloc(max_entries * sizeof(int));
+ visited = malloc(max_entries * sizeof(int));
+ CHECK(!keys || !values || !visited, "malloc()",
+ "error:%s\n", strerror(errno));
+
+ /* test 1: lookup/delete an empty hash table, -ENOENT */
+ count = max_entries;
+ err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys,
+ values, &count, &opts);
+ CHECK((err && errno != ENOENT), "empty map",
+ "error: %s\n", strerror(errno));
+
+ /* populate elements to the map */
+ map_batch_update(map_fd, max_entries, keys, values, is_pcpu);
+
+ /* test 2: lookup/delete with count = 0, success */
+ count = 0;
+ err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys,
+ values, &count, &opts);
+ CHECK(err, "count = 0", "error: %s\n", strerror(errno));
+
+ /* test 3: lookup/delete with count = max_entries, success */
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * value_size);
+ count = max_entries;
+ err = bpf_map_lookup_and_delete_batch(map_fd, NULL, &batch, keys,
+ values, &count, &opts);
+ CHECK((err && errno != ENOENT), "count = max_entries",
+ "error: %s\n", strerror(errno));
+ CHECK(count != max_entries, "count = max_entries",
+ "count = %u, max_entries = %u\n", count, max_entries);
+ map_batch_verify(visited, max_entries, keys, values, is_pcpu);
+
+ /* bpf_map_get_next_key() should return -ENOENT for an empty map. */
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err, "bpf_map_get_next_key()", "error: %s\n", strerror(errno));
+
+ /* test 4: lookup/delete in a loop with various steps. */
+ total_success = 0;
+ for (step = 1; step < max_entries; step++) {
+ map_batch_update(map_fd, max_entries, keys, values, is_pcpu);
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * value_size);
+ total = 0;
+ /* iteratively lookup elements with 'step'
+ * elements each
+ */
+ count = step;
+ nospace_err = false;
+ while (true) {
+ /* The first call passes no input batch token. */
+ err = bpf_map_lookup_batch(map_fd,
+ total ? &batch : NULL,
+ &batch, keys + total,
+ values +
+ total * value_size,
+ &count, &opts);
+ /* It is possible that we are failing due to buffer size
+ * not big enough. In such cases, let us just exit and
+ * go with large steps. Note that a buffer size with
+ * max_entries should always work.
+ */
+ if (err && errno == ENOSPC) {
+ nospace_err = true;
+ break;
+ }
+
+ CHECK((err && errno != ENOENT), "lookup with steps",
+ "error: %s\n", strerror(errno));
+
+ total += count;
+ if (err)
+ break;
+
+ }
+ /* This step size did not fit; try the next, larger one. */
+ if (nospace_err == true)
+ continue;
+
+ CHECK(total != max_entries, "lookup with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+ map_batch_verify(visited, max_entries, keys, values, is_pcpu);
+
+ /* Delete the keys just looked up, 'step' keys per call. */
+ total = 0;
+ count = step;
+ while (total < max_entries) {
+ /* Shrink the final chunk to the keys that remain. */
+ if (max_entries - total < step)
+ count = max_entries - total;
+ err = bpf_map_delete_batch(map_fd,
+ keys + total,
+ &count, &opts);
+ CHECK((err && errno != ENOENT), "delete batch",
+ "error: %s\n", strerror(errno));
+ total += count;
+ if (err)
+ break;
+ }
+ CHECK(total != max_entries, "delete with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+
+ /* check map is empty, errno == ENOENT */
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err || errno != ENOENT, "bpf_map_get_next_key()",
+ "error: %s\n", strerror(errno));
+
+ /* iteratively lookup/delete elements with 'step'
+ * elements each
+ */
+ map_batch_update(map_fd, max_entries, keys, values, is_pcpu);
+ memset(keys, 0, max_entries * sizeof(*keys));
+ memset(values, 0, max_entries * value_size);
+ total = 0;
+ count = step;
+ nospace_err = false;
+ while (true) {
+ err = bpf_map_lookup_and_delete_batch(map_fd,
+ total ? &batch : NULL,
+ &batch, keys + total,
+ values +
+ total * value_size,
+ &count, &opts);
+ /* It is possible that we are failing due to buffer size
+ * not big enough. In such cases, let us just exit and
+ * go with large steps. Note that a buffer size with
+ * max_entries should always work.
+ */
+ if (err && errno == ENOSPC) {
+ nospace_err = true;
+ break;
+ }
+
+ CHECK((err && errno != ENOENT), "lookup with steps",
+ "error: %s\n", strerror(errno));
+
+ total += count;
+ if (err)
+ break;
+ }
+
+ if (nospace_err == true)
+ continue;
+
+ CHECK(total != max_entries, "lookup/delete with steps",
+ "total = %u, max_entries = %u\n", total, max_entries);
+
+ map_batch_verify(visited, max_entries, keys, values, is_pcpu);
+ /* Everything was deleted, so the map must be empty again. */
+ err = bpf_map_get_next_key(map_fd, NULL, &key);
+ CHECK(!err, "bpf_map_get_next_key()", "error: %s\n",
+ strerror(errno));
+
+ total_success++;
+ }
+
+ CHECK(total_success == 0, "check total_success",
+ "unexpected failure\n");
+ free(keys);
+ free(visited);
+ /* In the per-CPU case values points at the stack array. */
+ if (!is_pcpu)
+ free(values);
+}
+
+/* Run the batch test on a plain hash map and report PASS. */
+void htab_map_batch_ops(void)
+{
+ __test_map_lookup_and_delete_batch(false);
+ printf("test_%s:PASS\n", __func__);
+}
+
+/* Run the batch test on a per-CPU hash map and report PASS. */
+void htab_percpu_map_batch_ops(void)
+{
+ __test_map_lookup_and_delete_batch(true);
+ printf("test_%s:PASS\n", __func__);
+}
+
+/* Test entry point: run the plain and then the per-CPU hash variant. */
+void test_htab_map_batch_ops(void)
+{
+ htab_map_batch_ops();
+ htab_percpu_map_batch_ops();
+}
diff --git a/tools/testing/selftests/bpf/map_tests/sk_storage_map.c b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
new file mode 100644
index 000000000..e569edc67
--- /dev/null
+++ b/tools/testing/selftests/bpf/map_tests/sk_storage_map.c
@@ -0,0 +1,629 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/compiler.h>
+#include <linux/err.h>
+
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <linux/btf.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <test_btf.h>
+#include <test_maps.h>
+
+static struct bpf_create_map_attr xattr = { /* shared template for every sk_storage map created in this file */
+ .name = "sk_storage_map",
+ .map_type = BPF_MAP_TYPE_SK_STORAGE,
+ .map_flags = BPF_F_NO_PREALLOC, /* test_sk_storage_map_basic() expects EINVAL when this is 0 */
+ .max_entries = 0, /* test_sk_storage_map_basic() expects EINVAL when this is 1 */
+ .key_size = 4,
+ .value_size = 8,
+ .btf_key_type_id = 1, /* type [1] emitted by load_btf(): int */
+ .btf_value_type_id = 3, /* type [3] emitted by load_btf(): struct val */
+ .btf_fd = -1, /* set only around bpf_create_map_xattr() calls, then reset to -1 */
+};
+
+static unsigned int nr_sk_threads_done; /* bumped by notify_thread_done(), dropped by notify_thread_redo() */
+static unsigned int nr_sk_threads_err; /* bumped by notify_thread_err() */
+static unsigned int nr_sk_per_thread = 4096; /* overridable via BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD */
+static unsigned int nr_sk_threads = 4; /* overridable via BPF_SK_STORAGE_MAP_TEST_NR_THREADS */
+static int sk_storage_map = -1; /* map fd published to worker threads; -1 means no map */
+static unsigned int stop; /* set to 1 by stop_handler() and by the stress drivers on exit */
+static int runtime_s = 5; /* alarm() timeout used by the stress tests; overridable via BPF_SK_STORAGE_MAP_TEST_RUNTIME_S */
+
+static bool is_stopped(void)
+{
+ return READ_ONCE(stop); /* true once stop has been set to a non-zero value */
+}
+
+static unsigned int threads_err(void)
+{
+ return READ_ONCE(nr_sk_threads_err); /* number of notify_thread_err() calls so far */
+}
+
+static void notify_thread_err(void)
+{
+ __sync_add_and_fetch(&nr_sk_threads_err, 1); /* atomic increment; result discarded */
+}
+
+static bool wait_for_threads_err(void)
+{
+ /* Sleep in 500us slices until a stop request or a worker error shows up. */
+ while (!(is_stopped() || threads_err()))
+ usleep(500);
+ return !is_stopped();
+}
+
+static unsigned int threads_done(void)
+{
+ return READ_ONCE(nr_sk_threads_done); /* current value of the done counter */
+}
+
+static void notify_thread_done(void)
+{
+ __sync_add_and_fetch(&nr_sk_threads_done, 1); /* atomic increment; result discarded */
+}
+
+static void notify_thread_redo(void)
+{
+ __sync_sub_and_fetch(&nr_sk_threads_done, 1); /* atomic decrement of the done counter; result discarded */
+}
+
+static bool wait_for_threads_done(void)
+{
+ /* Poll every 50us until all workers are done, or a stop/error occurs. */
+ while (threads_done() != nr_sk_threads &&
+ !(is_stopped() || threads_err()))
+ usleep(50);
+ return !(is_stopped() || threads_err());
+}
+
+static bool wait_for_threads_redo(void)
+{
+ /* Poll every 50us until the done counter drains to 0, or a stop/error occurs. */
+ while (threads_done() && !(is_stopped() || threads_err()))
+ usleep(50);
+ return !(is_stopped() || threads_err());
+}
+
+static bool wait_for_map(void)
+{
+ /* Poll every 50us until a map fd is published or a stop is requested. */
+ while (!(READ_ONCE(sk_storage_map) != -1 || is_stopped()))
+ usleep(50);
+ return !is_stopped();
+}
+
+static bool wait_for_map_close(void)
+{
+ while (READ_ONCE(sk_storage_map) != -1 && !is_stopped())
+ ; /* busy-wait: unlike the other wait_for_*() helpers there is no usleep() here */
+
+ return !is_stopped(); /* false when a stop has been requested */
+}
+
+static int load_btf(void) /* builds a raw BTF blob (header + types + strings) on the stack and loads it */
+{
+ const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l"; /* name offsets: 1 "bpf_spin_lock", 15 "val", 19 "cnt", 23 "l" */
+ __u32 btf_raw_types[] = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* struct bpf_spin_lock */ /* [2] */
+ BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4),
+ BTF_MEMBER_ENC(15, 1, 0), /* int val; */
+ /* struct val */ /* [3] */
+ BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */
+ BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */
+ };
+ struct btf_header btf_hdr = {
+ .magic = BTF_MAGIC,
+ .version = BTF_VERSION,
+ .hdr_len = sizeof(struct btf_header),
+ .type_len = sizeof(btf_raw_types),
+ .str_off = sizeof(btf_raw_types), /* the string section directly follows the type section */
+ .str_len = sizeof(btf_str_sec), /* sizeof includes the literal's terminating NUL */
+ };
+ __u8 raw_btf[sizeof(struct btf_header) + sizeof(btf_raw_types) +
+ sizeof(btf_str_sec)];
+
+ memcpy(raw_btf, &btf_hdr, sizeof(btf_hdr)); /* layout: header | types | strings */
+ memcpy(raw_btf + sizeof(btf_hdr), btf_raw_types, sizeof(btf_raw_types));
+ memcpy(raw_btf + sizeof(btf_hdr) + sizeof(btf_raw_types),
+ btf_str_sec, sizeof(btf_str_sec));
+
+ return bpf_load_btf(raw_btf, sizeof(raw_btf), 0, 0, 0); /* result passed through; callers treat -1 as failure */
+}
+
+static int create_sk_storage_map(void) /* returns a new map fd; both failure paths go through CHECK() */
+{
+ int btf_fd, map_fd;
+
+ btf_fd = load_btf();
+ CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
+ btf_fd, errno);
+ xattr.btf_fd = btf_fd; /* lend the BTF fd to the shared template for this one call */
+
+ map_fd = bpf_create_map_xattr(&xattr);
+ xattr.btf_fd = -1; /* restore the template before anything else can use it */
+ close(btf_fd); /* NOTE(review): close() may overwrite the errno reported by the CHECK below */
+ CHECK(map_fd == -1,
+ "bpf_create_map_xattr()", "errno:%d\n", errno);
+
+ return map_fd;
+}
+
+static void *insert_close_thread(void *arg) /* worker started by do_sk_storage_map_stress_free(); arg is unused */
+{
+ struct {
+ int cnt;
+ int lock;
+ } value = { .cnt = 0xeB9F, .lock = 0, };
+ int i, map_fd, err, *sk_fds;
+
+ sk_fds = malloc(sizeof(*sk_fds) * nr_sk_per_thread);
+ if (!sk_fds) {
+ notify_thread_err();
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < nr_sk_per_thread; i++)
+ sk_fds[i] = -1; /* -1 marks a slot with no open socket */
+
+ while (!is_stopped()) {
+ if (!wait_for_map())
+ goto close_all; /* stop requested while waiting: jump straight to the cleanup below */
+
+ map_fd = READ_ONCE(sk_storage_map);
+ for (i = 0; i < nr_sk_per_thread && !is_stopped(); i++) {
+ sk_fds[i] = socket(AF_INET6, SOCK_STREAM, 0);
+ if (sk_fds[i] == -1) {
+ err = -errno;
+ fprintf(stderr, "socket(): errno:%d\n", errno);
+ goto errout;
+ }
+ err = bpf_map_update_elem(map_fd, &sk_fds[i], &value,
+ BPF_NOEXIST);
+ if (err) {
+ err = -errno;
+ fprintf(stderr,
+ "bpf_map_update_elem(): errno:%d\n",
+ errno);
+ goto errout;
+ }
+ }
+
+ notify_thread_done();
+ wait_for_map_close(); /* return value ignored; the loop condition re-checks is_stopped() */
+
+close_all:
+ for (i = 0; i < nr_sk_per_thread; i++) {
+ close(sk_fds[i]); /* also called on slots that still hold -1 */
+ sk_fds[i] = -1;
+ }
+
+ notify_thread_redo(); /* NOTE(review): also runs on the goto path above, where notify_thread_done() was skipped */
+ }
+
+ free(sk_fds);
+ return NULL;
+
+errout:
+ for (i = 0; i < nr_sk_per_thread && sk_fds[i] != -1; i++) /* stops at the first slot holding -1 */
+ close(sk_fds[i]);
+ free(sk_fds);
+ notify_thread_err();
+ return ERR_PTR(err);
+}
+
+static int do_sk_storage_map_stress_free(void) /* returns 0, or a negative errno from setup or from a worker */
+{
+ int i, map_fd = -1, err = 0, nr_threads_created = 0;
+ pthread_t *sk_thread_ids;
+ void *thread_ret;
+
+ sk_thread_ids = malloc(sizeof(pthread_t) * nr_sk_threads);
+ if (!sk_thread_ids) {
+ fprintf(stderr, "malloc(sk_threads): NULL\n");
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_sk_threads; i++) {
+ err = pthread_create(&sk_thread_ids[i], NULL,
+ insert_close_thread, NULL);
+ if (err) {
+ err = -err; /* pthread_create() returns the error number; it does not set errno */
+ goto done;
+ }
+ nr_threads_created++;
+ }
+
+ while (!is_stopped()) {
+ map_fd = create_sk_storage_map();
+ WRITE_ONCE(sk_storage_map, map_fd); /* publish the map; workers start filling it */
+
+ if (!wait_for_threads_done())
+ break;
+
+ WRITE_ONCE(sk_storage_map, -1); /* unpublish, then close the map while workers still hold sockets */
+ close(map_fd);
+ map_fd = -1;
+
+ if (!wait_for_threads_redo())
+ break;
+ }
+
+done:
+ WRITE_ONCE(stop, 1); /* make every worker leave its loop before joining */
+ for (i = 0; i < nr_threads_created; i++) {
+ pthread_join(sk_thread_ids[i], &thread_ret);
+ if (IS_ERR(thread_ret) && !err) { /* keep only the first error seen */
+ err = PTR_ERR(thread_ret);
+ fprintf(stderr, "threads#%u: err:%d\n", i, err);
+ }
+ }
+ free(sk_thread_ids);
+
+ if (map_fd != -1) /* still open when the loop was left between create and close */
+ close(map_fd);
+
+ return err;
+}
+
+static void *update_thread(void *arg)
+{
+ struct {
+ int cnt;
+ int lock;
+ } val = { .cnt = 0xeB9F, .lock = 0, };
+ int fd = READ_ONCE(sk_storage_map);
+ int sk = *(int *)arg;
+ int rc = 0; /* stays 0 if the loop body never runs */
+
+ /* Keep rewriting the socket's element until stopped; EAGAIN is retried. */
+ while (!is_stopped()) {
+ rc = bpf_map_update_elem(fd, &sk, &val, 0);
+ if (!rc || errno == EAGAIN)
+ continue;
+ rc = -errno;
+ fprintf(stderr, "bpf_map_update_elem: %d %d\n", rc, errno);
+ break;
+ }
+
+ /* Leaving the loop without a stop request means a hard failure. */
+ if (is_stopped())
+ return NULL;
+
+ notify_thread_err();
+ return ERR_PTR(rc);
+}
+
+static void *delete_thread(void *arg)
+{
+ int fd = READ_ONCE(sk_storage_map);
+ int sk = *(int *)arg;
+ int rc = 0; /* stays 0 if the loop body never runs */
+
+ /* Keep deleting the socket's element until stopped; ENOENT is retried. */
+ while (!is_stopped()) {
+ rc = bpf_map_delete_elem(fd, &sk);
+ if (!rc || errno == ENOENT)
+ continue;
+ rc = -errno;
+ fprintf(stderr, "bpf_map_delete_elem: %d %d\n", rc, errno);
+ break;
+ }
+
+ /* Leaving the loop without a stop request means a hard failure. */
+ if (is_stopped())
+ return NULL;
+
+ notify_thread_err();
+ return ERR_PTR(rc);
+}
+
+static int do_sk_storage_map_stress_change(void) /* returns 0, or a negative errno from setup or from a worker */
+{
+ int i, sk_fd, map_fd = -1, err = 0, nr_threads_created = 0;
+ pthread_t *sk_thread_ids;
+ void *thread_ret;
+
+ sk_thread_ids = malloc(sizeof(pthread_t) * nr_sk_threads);
+ if (!sk_thread_ids) {
+ fprintf(stderr, "malloc(sk_threads): NULL\n");
+ return -ENOMEM;
+ }
+
+ sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (sk_fd == -1) {
+ err = -errno;
+ goto done;
+ }
+
+ map_fd = create_sk_storage_map();
+ WRITE_ONCE(sk_storage_map, map_fd); /* workers read the fd once, at start-up */
+
+ for (i = 0; i < nr_sk_threads; i++) {
+ if (i & 0x1) /* odd slots update, even slots delete, all on the same socket */
+ err = pthread_create(&sk_thread_ids[i], NULL,
+ update_thread, &sk_fd);
+ else
+ err = pthread_create(&sk_thread_ids[i], NULL,
+ delete_thread, &sk_fd);
+ if (err) {
+ err = -err; /* pthread_create() returns the error number; it does not set errno */
+ goto done;
+ }
+ nr_threads_created++;
+ }
+
+ wait_for_threads_err(); /* blocks until a stop request or the first worker error */
+
+done:
+ WRITE_ONCE(stop, 1); /* make every worker leave its loop before joining */
+ for (i = 0; i < nr_threads_created; i++) {
+ pthread_join(sk_thread_ids[i], &thread_ret);
+ if (IS_ERR(thread_ret) && !err) { /* keep only the first error seen */
+ err = PTR_ERR(thread_ret);
+ fprintf(stderr, "threads#%u: err:%d\n", i, err);
+ }
+ }
+ free(sk_thread_ids);
+
+ if (sk_fd != -1)
+ close(sk_fd);
+ if (map_fd != -1) /* still -1 when socket() failed before the map was created */
+ close(map_fd);
+ return err;
+}
+
+static void stop_handler(int signum) /* installed for SIGTERM, SIGINT and, when runtime_s > 0, SIGALRM */
+{
+ if (signum != SIGALRM) /* the timed-out case stays silent */
+ printf("stopping...\n"); /* NOTE(review): printf() is not on the POSIX async-signal-safe list */
+ WRITE_ONCE(stop, 1);
+}
+
+#define BPF_SK_STORAGE_MAP_TEST_NR_THREADS "BPF_SK_STORAGE_MAP_TEST_NR_THREADS" /* env: overrides nr_sk_threads */
+#define BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD "BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD" /* env: overrides nr_sk_per_thread */
+#define BPF_SK_STORAGE_MAP_TEST_RUNTIME_S "BPF_SK_STORAGE_MAP_TEST_RUNTIME_S" /* env: overrides runtime_s */
+#define BPF_SK_STORAGE_MAP_TEST_NAME "BPF_SK_STORAGE_MAP_TEST_NAME" /* env: "basic", "stress_free" or "stress_change" */
+
+static void test_sk_storage_map_stress_free(void) /* wraps the stress_free driver with signal and RLIMIT_NOFILE setup */
+{
+ struct rlimit rlim_old, rlim_new = {};
+ int err;
+
+ getrlimit(RLIMIT_NOFILE, &rlim_old);
+
+ signal(SIGTERM, stop_handler);
+ signal(SIGINT, stop_handler);
+ if (runtime_s > 0) { /* a non-positive runtime_s means: run until signalled */
+ signal(SIGALRM, stop_handler);
+ alarm(runtime_s);
+ }
+
+ if (rlim_old.rlim_cur < nr_sk_threads * nr_sk_per_thread) { /* one fd per socket, plus 128 of headroom */
+ rlim_new.rlim_cur = nr_sk_threads * nr_sk_per_thread + 128;
+ rlim_new.rlim_max = rlim_new.rlim_cur + 128;
+ err = setrlimit(RLIMIT_NOFILE, &rlim_new);
+ CHECK(err, "setrlimit(RLIMIT_NOFILE)", "rlim_new:%lu errno:%d\n",
+ rlim_new.rlim_cur, errno);
+ }
+
+ err = do_sk_storage_map_stress_free();
+
+ signal(SIGTERM, SIG_DFL);
+ signal(SIGINT, SIG_DFL);
+ if (runtime_s > 0) {
+ signal(SIGALRM, SIG_DFL);
+ alarm(0); /* cancel a still-pending alarm */
+ }
+
+ if (rlim_new.rlim_cur) /* non-zero only when the limit was raised above */
+ setrlimit(RLIMIT_NOFILE, &rlim_old);
+
+ CHECK(err, "test_sk_storage_map_stress_free", "err:%d\n", err);
+}
+
+static void test_sk_storage_map_stress_change(void) /* wraps the stress_change driver with signal setup and teardown */
+{
+ int err;
+
+ signal(SIGTERM, stop_handler);
+ signal(SIGINT, stop_handler);
+ if (runtime_s > 0) { /* a non-positive runtime_s means: run until signalled */
+ signal(SIGALRM, stop_handler);
+ alarm(runtime_s);
+ }
+
+ err = do_sk_storage_map_stress_change();
+
+ signal(SIGTERM, SIG_DFL); /* restore default dispositions */
+ signal(SIGINT, SIG_DFL);
+ if (runtime_s > 0) {
+ signal(SIGALRM, SIG_DFL);
+ alarm(0); /* cancel a still-pending alarm */
+ }
+
+ CHECK(err, "test_sk_storage_map_stress_change", "err:%d\n", err);
+}
+
+static void test_sk_storage_map_basic(void) /* single-threaded update/lookup/delete and bad-attribute checks */
+{
+ struct {
+ int cnt;
+ int lock;
+ } value = { .cnt = 0xeB9f, .lock = 0, }, lookup_value;
+ struct bpf_create_map_attr bad_xattr;
+ int btf_fd, map_fd, sk_fd, err;
+
+ btf_fd = load_btf();
+ CHECK(btf_fd == -1, "bpf_load_btf", "btf_fd:%d errno:%d\n",
+ btf_fd, errno);
+ xattr.btf_fd = btf_fd;
+
+ sk_fd = socket(AF_INET6, SOCK_STREAM, 0);
+ CHECK(sk_fd == -1, "socket()", "sk_fd:%d errno:%d\n",
+ sk_fd, errno);
+
+ map_fd = bpf_create_map_xattr(&xattr);
+ CHECK(map_fd == -1, "bpf_create_map_xattr(good_xattr)",
+ "map_fd:%d errno:%d\n", map_fd, errno);
+
+ /* Add new elem */
+ memcpy(&lookup_value, &value, sizeof(value));
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value,
+ BPF_NOEXIST | BPF_F_LOCK);
+ CHECK(err, "bpf_map_update_elem(BPF_NOEXIST|BPF_F_LOCK)",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(err || lookup_value.cnt != value.cnt,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d cnt:%x(%x)\n",
+ err, errno, lookup_value.cnt, value.cnt);
+
+ /* Bump the cnt and update with BPF_EXIST | BPF_F_LOCK */
+ value.cnt += 1;
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value,
+ BPF_EXIST | BPF_F_LOCK);
+ CHECK(err, "bpf_map_update_elem(BPF_EXIST|BPF_F_LOCK)",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(err || lookup_value.cnt != value.cnt,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d cnt:%x(%x)\n",
+ err, errno, lookup_value.cnt, value.cnt);
+
+ /* Bump the cnt and update with BPF_EXIST */
+ value.cnt += 1;
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value, BPF_EXIST);
+ CHECK(err, "bpf_map_update_elem(BPF_EXIST)",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(err || lookup_value.cnt != value.cnt,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d cnt:%x(%x)\n",
+ err, errno, lookup_value.cnt, value.cnt);
+
+ /* Update with BPF_NOEXIST */
+ value.cnt += 1;
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value,
+ BPF_NOEXIST | BPF_F_LOCK);
+ CHECK(!err || errno != EEXIST,
+ "bpf_map_update_elem(BPF_NOEXIST|BPF_F_LOCK)",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value, BPF_NOEXIST);
+ CHECK(!err || errno != EEXIST, "bpf_map_update_elem(BPF_NOEXIST)",
+ "err:%d errno:%d\n", err, errno);
+ value.cnt -= 1; /* both updates must have failed, so the stored cnt is the previous one */
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(err || lookup_value.cnt != value.cnt,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d cnt:%x(%x)\n",
+ err, errno, lookup_value.cnt, value.cnt);
+
+ /* Bump the cnt again and update with map_flags == 0 */
+ value.cnt += 1;
+ err = bpf_map_update_elem(map_fd, &sk_fd, &value, 0);
+ CHECK(err, "bpf_map_update_elem()", "err:%d errno:%d\n",
+ err, errno);
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(err || lookup_value.cnt != value.cnt,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d cnt:%x(%x)\n",
+ err, errno, lookup_value.cnt, value.cnt);
+
+ /* Test delete elem */
+ err = bpf_map_delete_elem(map_fd, &sk_fd);
+ CHECK(err, "bpf_map_delete_elem()", "err:%d errno:%d\n",
+ err, errno);
+ err = bpf_map_lookup_elem_flags(map_fd, &sk_fd, &lookup_value,
+ BPF_F_LOCK);
+ CHECK(!err || errno != ENOENT,
+ "bpf_map_lookup_elem_flags(BPF_F_LOCK)",
+ "err:%d errno:%d\n", err, errno);
+ err = bpf_map_delete_elem(map_fd, &sk_fd);
+ CHECK(!err || errno != ENOENT, "bpf_map_delete_elem()",
+ "err:%d errno:%d\n", err, errno);
+ /* Creation must fail with EINVAL below; a returned fd (>= 0) is a failure too */
+ memcpy(&bad_xattr, &xattr, sizeof(xattr));
+ bad_xattr.btf_key_type_id = 0;
+ err = bpf_create_map_xattr(&bad_xattr);
+ CHECK(err >= 0 || errno != EINVAL, "bpf_create_map_xattr(bad_xattr)",
+ "err:%d errno:%d\n", err, errno);
+
+ memcpy(&bad_xattr, &xattr, sizeof(xattr));
+ bad_xattr.btf_key_type_id = 3;
+ err = bpf_create_map_xattr(&bad_xattr);
+ CHECK(err >= 0 || errno != EINVAL, "bpf_create_map_xattr(bad_xattr)",
+ "err:%d errno:%d\n", err, errno);
+
+ memcpy(&bad_xattr, &xattr, sizeof(xattr));
+ bad_xattr.max_entries = 1;
+ err = bpf_create_map_xattr(&bad_xattr);
+ CHECK(err >= 0 || errno != EINVAL, "bpf_create_map_xattr(bad_xattr)",
+ "err:%d errno:%d\n", err, errno);
+
+ memcpy(&bad_xattr, &xattr, sizeof(xattr));
+ bad_xattr.map_flags = 0;
+ err = bpf_create_map_xattr(&bad_xattr);
+ CHECK(err >= 0 || errno != EINVAL, "bpf_create_map_xattr(bad_xattr)",
+ "err:%d errno:%d\n", err, errno);
+
+ xattr.btf_fd = -1; /* restore the shared template */
+ close(btf_fd);
+ close(map_fd);
+ close(sk_fd);
+}
+
+void test_sk_storage_map(void) /* entry point: reads the env overrides, then runs the selected sub-test(s) */
+{
+ const char *test_name, *env_opt;
+ bool test_ran = false;
+
+ test_name = getenv(BPF_SK_STORAGE_MAP_TEST_NAME); /* NULL (unset) selects all three sub-tests */
+
+ env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_NR_THREADS);
+ if (env_opt)
+ nr_sk_threads = atoi(env_opt); /* no validation of the parsed value */
+
+ env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_SK_PER_THREAD);
+ if (env_opt)
+ nr_sk_per_thread = atoi(env_opt);
+
+ env_opt = getenv(BPF_SK_STORAGE_MAP_TEST_RUNTIME_S);
+ if (env_opt)
+ runtime_s = atoi(env_opt);
+
+ if (!test_name || !strcmp(test_name, "basic")) {
+ test_sk_storage_map_basic();
+ test_ran = true;
+ }
+ if (!test_name || !strcmp(test_name, "stress_free")) {
+ test_sk_storage_map_stress_free();
+ test_ran = true;
+ }
+ if (!test_name || !strcmp(test_name, "stress_change")) {
+ test_sk_storage_map_stress_change();
+ test_ran = true;
+ }
+
+ if (test_ran)
+ printf("%s:PASS\n", __func__);
+ else
+ CHECK(1, "Invalid test_name", "%s\n", test_name); /* reached only when test_name is set but matched none */
+}
diff --git a/tools/testing/selftests/bpf/netcnt_common.h b/tools/testing/selftests/bpf/netcnt_common.h
new file mode 100644
index 000000000..81084c1c2
--- /dev/null
+++ b/tools/testing/selftests/bpf/netcnt_common.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef __NETCNT_COMMON_H
+#define __NETCNT_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_PERCPU_PACKETS 32
+
+struct percpu_net_cnt { /* five 64-bit counters; NOTE(review): presumably kept per CPU, going by the name -- confirm */
+ __u64 packets;
+ __u64 bytes;
+
+ __u64 prev_ts; /* NOTE(review): time unit is not visible in this header */
+
+ __u64 prev_packets;
+ __u64 prev_bytes;
+};
+
+struct net_cnt { /* packet and byte totals, both 64-bit */
+ __u64 packets;
+ __u64 bytes;
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
new file mode 100644
index 000000000..12ee40284
--- /dev/null
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+
+#include <linux/err.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+#include "bpf_util.h"
+#include "network_helpers.h"
+
+#define clean_errno() (errno == 0 ? "None" : strerror(errno)) /* errno as text, "None" when it is 0 */
+#define log_err(MSG, ...) ({ /* stderr: "(file:line: errno: <text>) MSG\n" */ \
+ int __save = errno; /* remember errno across the fprintf() */ \
+ fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
+ __FILE__, __LINE__, clean_errno(), \
+ ##__VA_ARGS__); \
+ errno = __save; /* callers still see the original errno */ \
+})
+
+struct ipv4_packet pkt_v4 = { /* IPv4/TCP test vector; unlisted fields are zero-initialised */
+ .eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+ .iph.ihl = 5, /* header length in 32-bit words: 20 bytes, no options */
+ .iph.protocol = IPPROTO_TCP,
+ .iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+ .tcp.urg_ptr = 123, /* NOTE(review): stored without a byte-order conversion, unlike tot_len */
+ .tcp.doff = 5, /* TCP header length in 32-bit words: 20 bytes */
+};
+
+struct ipv6_packet pkt_v6 = { /* IPv6/TCP test vector; unlisted fields are zero-initialised */
+ .eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+ .iph.nexthdr = IPPROTO_TCP,
+ .iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+ .tcp.urg_ptr = 123, /* NOTE(review): stored without a byte-order conversion, unlike payload_len */
+ .tcp.doff = 5, /* TCP header length in 32-bit words: 20 bytes */
+};
+
+static int settimeo(int fd, int timeout_ms)
+{
+ struct timeval tv = { .tv_sec = 3 };
+ /* A non-positive timeout_ms keeps the 3 second default set above. */
+ if (timeout_ms > 0) {
+ tv.tv_sec = timeout_ms / 1000;
+ tv.tv_usec = (timeout_ms % 1000) * 1000;
+ }
+ /* Apply the same timeout to the receive side first, then the send side. */
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv,
+ sizeof(tv)) != 0) {
+ log_err("Failed to set SO_RCVTIMEO");
+ return -1;
+ }
+
+ if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv,
+ sizeof(tv)) != 0) {
+ log_err("Failed to set SO_SNDTIMEO");
+ return -1;
+ }
+
+ return 0;
+}
+
+#define save_errno_close(fd) ({ int __save = errno; close(fd); errno = __save; }) /* close(fd), then put back the errno seen before it */
+
+int start_server(int family, int type, const char *addr_str, __u16 port,
+ int timeout_ms) /* returns a bound (and, for SOCK_STREAM, listening) fd, or -1 */
+{
+ struct sockaddr_storage addr = {}; /* zeroed: a NULL addr_str leaves the address part all-zero */
+ socklen_t len;
+ int fd;
+
+ if (make_sockaddr(family, addr_str, port, &addr, &len))
+ return -1;
+
+ fd = socket(family, type, 0);
+ if (fd < 0) {
+ log_err("Failed to create server socket");
+ return -1;
+ }
+
+ if (settimeo(fd, timeout_ms))
+ goto error_close;
+
+ if (bind(fd, (const struct sockaddr *)&addr, len) < 0) {
+ log_err("Failed to bind socket");
+ goto error_close;
+ }
+
+ if (type == SOCK_STREAM) { /* other socket types are returned bound but not listening */
+ if (listen(fd, 1) < 0) {
+ log_err("Failed to listen on socket");
+ goto error_close;
+ }
+ }
+
+ return fd;
+
+error_close:
+ save_errno_close(fd); /* keep the errno of the call that failed */
+ return -1;
+}
+
+int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
+ int timeout_ms) /* returns the new client fd, or -1 */
+{
+ struct sockaddr_storage addr;
+ socklen_t addrlen = sizeof(addr);
+ struct sockaddr_in *addr_in;
+ int fd, ret;
+
+ if (getsockname(server_fd, (struct sockaddr *)&addr, &addrlen)) {
+ log_err("Failed to get server addr");
+ return -1;
+ }
+
+ addr_in = (struct sockaddr_in *)&addr; /* used only to read the address family */
+ fd = socket(addr_in->sin_family, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create client socket");
+ return -1;
+ }
+
+ if (settimeo(fd, timeout_ms))
+ goto error_close;
+
+ ret = sendto(fd, data, data_len, MSG_FASTOPEN, (struct sockaddr *)&addr,
+ addrlen); /* MSG_FASTOPEN: the connect and the first data go out in one call */
+ if (ret != data_len) { /* a short send is treated as a failure too */
+ log_err("sendto(data, %u) != %d", data_len, ret);
+ goto error_close;
+ }
+
+ return fd;
+
+error_close:
+ save_errno_close(fd); /* keep the errno of the call that failed */
+ return -1;
+}
+
+static int connect_fd_to_addr(int fd,
+ const struct sockaddr_storage *addr,
+ socklen_t addrlen) /* returns 0 on success, -1 after logging on failure */
+{
+ if (connect(fd, (const struct sockaddr *)addr, addrlen)) {
+ log_err("Failed to connect to server"); /* log_err() restores errno for the caller */
+ return -1;
+ }
+
+ return 0;
+}
+
+int connect_to_fd(int server_fd, int timeout_ms) /* returns a new fd connected to server_fd's address, or -1 */
+{
+ struct sockaddr_storage addr;
+ struct sockaddr_in *addr_in;
+ socklen_t addrlen, optlen;
+ int fd, type;
+
+ optlen = sizeof(type);
+ if (getsockopt(server_fd, SOL_SOCKET, SO_TYPE, &type, &optlen)) {
+ log_err("getsockopt(SO_TYPE)");
+ return -1;
+ }
+
+ addrlen = sizeof(addr);
+ if (getsockname(server_fd, (struct sockaddr *)&addr, &addrlen)) {
+ log_err("Failed to get server addr");
+ return -1;
+ }
+
+ addr_in = (struct sockaddr_in *)&addr; /* used only to read the address family */
+ fd = socket(addr_in->sin_family, type, 0); /* same family and socket type as the server */
+ if (fd < 0) {
+ log_err("Failed to create client socket");
+ return -1;
+ }
+
+ if (settimeo(fd, timeout_ms))
+ goto error_close;
+
+ if (connect_fd_to_addr(fd, &addr, addrlen))
+ goto error_close;
+
+ return fd;
+
+error_close:
+ save_errno_close(fd); /* keep the errno of the call that failed */
+ return -1;
+}
+
+int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms)
+{
+ struct sockaddr_storage srv_addr;
+ socklen_t srv_len = sizeof(srv_addr);
+ /* Set the send/receive timeouts on the client before connecting. */
+ if (settimeo(client_fd, timeout_ms) != 0)
+ return -1;
+ /* Look up the address server_fd is bound to. */
+ if (getsockname(server_fd, (struct sockaddr *)&srv_addr, &srv_len)) {
+ log_err("Failed to get server addr");
+ return -1;
+ }
+
+ /* Any non-zero helper result is reported as -1. */
+ if (connect_fd_to_addr(client_fd, &srv_addr, srv_len) != 0)
+ return -1;
+ return 0;
+}
+
+int make_sockaddr(int family, const char *addr_str, __u16 port,
+ struct sockaddr_storage *addr, socklen_t *len)
+{
+ if (family == AF_INET) {
+ struct sockaddr_in *v4 = (struct sockaddr_in *)addr;
+ /* A NULL addr_str leaves sin_addr exactly as the caller passed it. */
+ v4->sin_port = htons(port);
+ v4->sin_family = AF_INET;
+ if (addr_str != NULL &&
+ inet_pton(AF_INET, addr_str, &v4->sin_addr) != 1) {
+ log_err("inet_pton(AF_INET, %s)", addr_str);
+ return -1;
+ }
+ if (len != NULL)
+ *len = sizeof(struct sockaddr_in);
+ return 0;
+ }
+ if (family == AF_INET6) {
+ struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)addr;
+ v6->sin6_port = htons(port);
+ v6->sin6_family = AF_INET6;
+ if (addr_str != NULL &&
+ inet_pton(AF_INET6, addr_str, &v6->sin6_addr) != 1) {
+ log_err("inet_pton(AF_INET6, %s)", addr_str);
+ return -1;
+ }
+ if (len != NULL)
+ *len = sizeof(struct sockaddr_in6);
+ return 0;
+ }
+ return -1;
+}
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
new file mode 100644
index 000000000..7205f8afd
--- /dev/null
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __NETWORK_HELPERS_H
+#define __NETWORK_HELPERS_H
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <linux/types.h>
+typedef __u16 __sum16;
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <netinet/tcp.h>
+#include <bpf/bpf_endian.h>
+
+#define MAGIC_VAL 0x1234
+#define NUM_ITER 100000
+#define VIP_NUM 5
+#define MAGIC_BYTES 123 /* used as the IP length field of pkt_v4 / pkt_v6 in network_helpers.c */
+
+/* IPv4 test vector: Ethernet, IPv4 and TCP headers, declared __packed */
+struct ipv4_packet {
+ struct ethhdr eth;
+ struct iphdr iph;
+ struct tcphdr tcp;
+} __packed;
+extern struct ipv4_packet pkt_v4; /* defined and initialised in network_helpers.c */
+
+/* IPv6 test vector: Ethernet, IPv6 and TCP headers, declared __packed */
+struct ipv6_packet {
+ struct ethhdr eth;
+ struct ipv6hdr iph;
+ struct tcphdr tcp;
+} __packed;
+extern struct ipv6_packet pkt_v6; /* defined and initialised in network_helpers.c */
+
+int start_server(int family, int type, const char *addr, __u16 port,
+ int timeout_ms); /* bound (and, for SOCK_STREAM, listening) fd, or -1 */
+int connect_to_fd(int server_fd, int timeout_ms); /* new fd connected to server_fd's address, or -1 */
+int connect_fd_to_fd(int client_fd, int server_fd, int timeout_ms); /* 0 on success, -1 on failure */
+int fastopen_connect(int server_fd, const char *data, unsigned int data_len,
+ int timeout_ms); /* new fd after a MSG_FASTOPEN sendto() of data, or -1 */
+int make_sockaddr(int family, const char *addr_str, __u16 port,
+ struct sockaddr_storage *addr, socklen_t *len); /* 0 on success; -1 on parse failure or a family other than AF_INET/AF_INET6 */
+
+#endif
diff --git a/tools/testing/selftests/bpf/prog_tests/.gitignore b/tools/testing/selftests/bpf/prog_tests/.gitignore
new file mode 100644
index 000000000..89c4a3d37
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+tests.h
diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c
new file mode 100644
index 000000000..7996ec07e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/align.c
@@ -0,0 +1,676 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+#define MAX_INSNS 512 /* capacity of bpf_align_test.insns[] */
+#define MAX_MATCHES 24 /* capacity of bpf_align_test.matches[] */
+
+struct bpf_reg_match { /* one expected register-state string, tied to a line number */
+ unsigned int line; /* NOTE(review): presumably a line of the verifier log -- confirm against the matcher */
+ const char *match; /* expected text, e.g. "R3_w=inv2" */
+};
+
+struct bpf_align_test { /* one test case: a program plus the register states expected for it */
+ const char *descr; /* short test name, e.g. "mov" */
+ struct bpf_insn insns[MAX_INSNS];
+ enum {
+ UNDEF, /* value 0, so it is what an entry gets when .result is not set */
+ ACCEPT,
+ REJECT
+ } result;
+ enum bpf_prog_type prog_type;
+ /* Matches must be in order of increasing line */
+ struct bpf_reg_match matches[MAX_MATCHES];
+};
+
+static struct bpf_align_test tests[] = {
+ /* Four tests of known constants. These aren't staggeringly
+ * interesting since we track exact values now.
+ */
+ {
+ .descr = "mov",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 16),
+ BPF_MOV64_IMM(BPF_REG_3, 32),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv2"},
+ {2, "R3_w=inv4"},
+ {3, "R3_w=inv8"},
+ {4, "R3_w=inv16"},
+ {5, "R3_w=inv32"},
+ },
+ },
+ {
+ .descr = "shift",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 32),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv1"},
+ {2, "R3_w=inv2"},
+ {3, "R3_w=inv4"},
+ {4, "R3_w=inv8"},
+ {5, "R3_w=inv16"},
+ {6, "R3_w=inv1"},
+ {7, "R4_w=inv32"},
+ {8, "R4_w=inv16"},
+ {9, "R4_w=inv8"},
+ {10, "R4_w=inv4"},
+ {11, "R4_w=inv2"},
+ },
+ },
+ {
+ .descr = "addsub",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_4, 8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv4"},
+ {2, "R3_w=inv8"},
+ {3, "R3_w=inv10"},
+ {4, "R4_w=inv8"},
+ {5, "R4_w=inv12"},
+ {6, "R4_w=inv14"},
+ },
+ },
+ {
+ .descr = "mul",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 2),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {1, "R1=ctx(id=0,off=0,imm=0)"},
+ {1, "R10=fp0"},
+ {1, "R3_w=inv7"},
+ {2, "R3_w=inv7"},
+ {3, "R3_w=inv14"},
+ {4, "R3_w=inv56"},
+ },
+ },
+
+ /* Tests using unknown values */
+#define PREP_PKT_POINTERS \
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
+ offsetof(struct __sk_buff, data)), \
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
+ offsetof(struct __sk_buff, data_end))
+
+#define LOAD_UNKNOWN(DST_REG) \
+ PREP_PKT_POINTERS, \
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), \
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8), \
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 1), \
+ BPF_EXIT_INSN(), \
+ BPF_LDX_MEM(BPF_B, DST_REG, BPF_REG_2, 0)
+
+ {
+ .descr = "unknown shift",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_3),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_3, 1),
+ LOAD_UNKNOWN(BPF_REG_4),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 5),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {7, "R0_w=pkt(id=0,off=8,r=8,imm=0)"},
+ {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {8, "R3_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ {9, "R3_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {10, "R3_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {11, "R3_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ {18, "R3=pkt_end(id=0,off=0,imm=0)"},
+ {18, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {19, "R4_w=inv(id=0,umax_value=8160,var_off=(0x0; 0x1fe0))"},
+ {20, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ {21, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {22, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {23, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ },
+ },
+ {
+ .descr = "unknown mul",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_3),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 1),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 4),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 8),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+ {9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+ {11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
+ {12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+ {13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
+ {15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
+ {16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
+ },
+ },
+ {
+ .descr = "packet const offset",
+ .insns = {
+ PREP_PKT_POINTERS,
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+
+ /* Skip over ethernet header. */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_5, 3),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 0),
+ BPF_LDX_MEM(BPF_H, BPF_REG_4, BPF_REG_5, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ {4, "R5_w=pkt(id=0,off=0,r=0,imm=0)"},
+ {5, "R5_w=pkt(id=0,off=14,r=0,imm=0)"},
+ {6, "R4_w=pkt(id=0,off=14,r=0,imm=0)"},
+ {10, "R2=pkt(id=0,off=0,r=18,imm=0)"},
+ {10, "R5=pkt(id=0,off=14,r=18,imm=0)"},
+ {10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
+ {14, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+ {15, "R4_w=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff))"},
+ },
+ },
+ {
+ .descr = "packet variable offset",
+ .insns = {
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+
+ /* First, add a constant to the R5 packet pointer,
+ * then a variable with a known alignment.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ /* Now, test in the other direction. Adding first
+ * the variable offset to R5, then the constant.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ /* Test multiple accumulations of unknown values
+ * into a packet pointer.
+ */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_5, 0),
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Offset is added to packet pointer R5, resulting in
+ * known fixed offset, and variable offset from R6.
+ */
+ {11, "R5_w=pkt(id=1,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total offset is NET_IP_ALIGN + reg->off (0) +
+ * reg->aux_off (14) which is 16. Then the variable
+ * offset is considered using reg->aux_off_align which
+ * is 4 and meets the load's requirements.
+ */
+ {15, "R4=pkt(id=1,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {15, "R5=pkt(id=1,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Variable offset is added to R5 packet pointer,
+ * resulting in auxiliary alignment of 4. To avoid BPF
+ * verifier's precision backtracking logging
+ * interfering we also have a no-op R4 = R5
+ * instruction to validate R5 state. We also check
+ * that R4 is what it should be in such case.
+ */
+ {19, "R4_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {19, "R5_w=pkt(id=2,off=0,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5, resulting in
+ * reg->off of 14.
+ */
+ {20, "R5_w=pkt(id=2,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off
+ * (14) which is 16. Then the variable offset is 4-byte
+ * aligned, so the total offset is 4-byte aligned and
+ * meets the load's requirements.
+ */
+ {24, "R4=pkt(id=2,off=18,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {24, "R5=pkt(id=2,off=14,r=18,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant offset is added to R5 packet pointer,
+ * resulting in reg->off value of 14.
+ */
+ {27, "R5_w=pkt(id=0,off=14,r=8"},
+ /* Variable offset is added to R5, resulting in a
+ * variable offset of (4n). See comment for insn #19
+ * for R4 = R5 trick.
+ */
+ {29, "R4_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ {29, "R5_w=pkt(id=3,off=14,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Constant is added to R5 again, setting reg->off to 18. */
+ {30, "R5_w=pkt(id=3,off=18,r=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* And once more we add a variable; resulting var_off
+ * is still (4n), fixed offset is not changed.
+ * Also, we create a new reg->id.
+ */
+ {32, "R4_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ {32, "R5_w=pkt(id=4,off=18,r=0,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (18)
+ * which is 20. Then the variable offset is (4n), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {35, "R4=pkt(id=4,off=22,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ {35, "R5=pkt(id=4,off=18,r=22,umax_value=2040,var_off=(0x0; 0x7fc)"},
+ },
+ },
+ {
+ .descr = "packet variable offset 2",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Add it to the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ /* Make a (4n) offset from the value we just read */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xff),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ /* Add it to the packet pointer */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {8, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {8, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {9, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* Packet pointer has (4n+2) offset */
+ {11, "R5_w=pkt(id=1,off=0,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ {13, "R4=pkt(id=1,off=4,r=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {15, "R5=pkt(id=1,off=0,r=4,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc)"},
+ /* Newly read value in R6 was shifted left by 2, so has
+ * known alignment of 4.
+ */
+ {18, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Added (4n) to packet pointer's (4n+2) var_off, giving
+ * another (4n+2).
+ */
+ {19, "R5_w=pkt(id=2,off=0,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ {21, "R4=pkt(id=2,off=4,r=0,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {23, "R5=pkt(id=2,off=0,r=4,umin_value=14,umax_value=2054,var_off=(0x2; 0xffc)"},
+ },
+ },
+ {
+ .descr = "dubious pointer arithmetic",
+ .insns = {
+ PREP_PKT_POINTERS,
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* (ptr - ptr) << 2 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_5, 2),
+ /* We have a (4n) value. Let's make a packet offset
+ * out of it. First add 14, to make it a (4n+2)
+ */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ /* Then make sure it's nonnegative */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_5, 0, 1),
+ BPF_EXIT_INSN(),
+ /* Add it to packet pointer */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .matches = {
+ {4, "R5_w=pkt_end(id=0,off=0,imm=0)"},
+ /* (ptr - ptr) << 2 == unknown, (4n) */
+ {6, "R5_w=inv(id=0,smax_value=9223372036854775804,umax_value=18446744073709551612,var_off=(0x0; 0xfffffffffffffffc)"},
+ /* (4n) + 14 == (4n+2). We blow our bounds, because
+ * the add could overflow.
+ */
+ {7, "R5_w=inv(id=0,smin_value=-9223372036854775806,smax_value=9223372036854775806,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+ /* Checked s>=0 */
+ {9, "R5=inv(id=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+ /* packet pointer + nonnegative (4n+2) */
+ {11, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+ {13, "R4_w=pkt(id=1,off=4,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+ /* NET_IP_ALIGN + (4n+2) == (4n), alignment is fine.
+ * We checked the bounds, but it might have been able
+ * to overflow if the packet pointer started in the
+ * upper half of the address space.
+ * So we did not get a 'range' on R6, and the access
+ * attempt will fail.
+ */
+ {15, "R6_w=pkt(id=1,off=0,r=0,umin_value=2,umax_value=9223372036854775806,var_off=(0x2; 0x7ffffffffffffffc)"},
+ }
+ },
+ {
+ .descr = "variable subtraction",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Create another unknown, (4n)-aligned, and subtract
+ * it from the first one
+ */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_7),
+ /* Bounds-check the result */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_6, 0, 1),
+ BPF_EXIT_INSN(),
+ /* Add it to the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_6),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {9, "R6_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {10, "R6_w=inv(id=0,umin_value=14,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* New unknown value in R7 is (4n) */
+ {11, "R7_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
+ /* Subtracting it from R6 blows our unsigned bounds */
+ {12, "R6=inv(id=0,smin_value=-1006,smax_value=1034,umin_value=2,umax_value=18446744073709551614,var_off=(0x2; 0xfffffffffffffffc)"},
+ /* Checked s>= 0 */
+ {14, "R6=inv(id=0,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc))"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
+
+ },
+ },
+ {
+ .descr = "pointer variable subtraction",
+ .insns = {
+ /* Create an unknown offset, (4n+2)-aligned and bounded
+ * to [14,74]
+ */
+ LOAD_UNKNOWN(BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 0xf),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 14),
+ /* Subtract it from the packet pointer */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_5, BPF_REG_6),
+ /* Create another unknown, (4n)-aligned and >= 74.
+ * That in fact means >= 76, since 74 % 4 == 2
+ */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 76),
+ /* Add it to the packet pointer */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_5, BPF_REG_7),
+ /* Check bounds and perform a read */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_5),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_4, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_5, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .matches = {
+ /* Calculated offset in R6 has unknown value, but known
+ * alignment of 4.
+ */
+ {7, "R2_w=pkt(id=0,off=0,r=8,imm=0)"},
+ {10, "R6_w=inv(id=0,umax_value=60,var_off=(0x0; 0x3c))"},
+ /* Adding 14 makes R6 be (4n+2) */
+ {11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
+ /* Subtracting from packet pointer overflows ubounds */
+ {13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
+ /* New unknown value in R7 is (4n), >= 76 */
+ {15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
+ /* Adding it to packet pointer gives nice bounds again */
+ {16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+ /* At the time the word size load is performed from R5,
+ * its total fixed offset is NET_IP_ALIGN + reg->off (0)
+ * which is 2. Then the variable offset is (4n+2), so
+ * the total offset is 4-byte aligned and meets the
+ * load's requirements.
+ */
+ {20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
+ },
+ },
+};
+
+/* Return the length of the program in @fp: scan backwards from slot
+ * MAX_INSNS - 1 for the last slot whose code or imm field is non-zero
+ * and return its index plus one. The result is never less than 1.
+ */
+static int probe_filter_length(const struct bpf_insn *fp)
+{
+	int last = MAX_INSNS - 1;
+
+	/* Step over trailing slots that have both code and imm equal to 0. */
+	while (last > 0 && fp[last].code == 0 && fp[last].imm == 0)
+		last--;
+
+	return last + 1;
+}
+
+/* Log buffer passed to bpf_verify_program() by do_test_single(). */
+static char bpf_vlog[32768];
+
+/* Run a single alignment test case.
+ *
+ * The program in test->insns is passed to bpf_verify_program() with
+ * BPF_F_STRICT_ALIGNMENT. The load outcome must agree with test->result
+ * (a failed load is only acceptable when the test expects REJECT). When
+ * it does, every entry of test->matches[] is looked up in the verifier
+ * log: the log is scanned forward for a line that starts with the
+ * entry's line number, and that line must contain the entry's match
+ * string.
+ *
+ * Returns 0 on success and 1 on failure; on failure the verifier log is
+ * printed.
+ */
+static int do_test_single(struct bpf_align_test *test)
+{
+	struct bpf_insn *prog = test->insns;
+	int prog_type = test->prog_type;
+	char bpf_vlog_copy[32768];
+	const char *line_ptr;
+	/* Parsed with %u below, so it has to be an unsigned int. */
+	unsigned int cur_line;
+	int prog_len, i;
+	int fd_prog;
+	int ret;
+
+	prog_len = probe_filter_length(prog);
+	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
+				     prog, prog_len, BPF_F_STRICT_ALIGNMENT,
+				     "GPL", 0, bpf_vlog, sizeof(bpf_vlog), 2);
+	if (fd_prog < 0 && test->result != REJECT) {
+		printf("Failed to load program.\n");
+		printf("%s", bpf_vlog);
+		ret = 1;
+	} else if (fd_prog >= 0 && test->result == REJECT) {
+		printf("Unexpected success to load!\n");
+		printf("%s", bpf_vlog);
+		ret = 1;
+		close(fd_prog);
+	} else {
+		ret = 0;
+		/* We make a local copy so that we can strtok() it.
+		 * strncpy() does not terminate the destination when the
+		 * source fills it, so terminate the copy explicitly.
+		 */
+		strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy) - 1);
+		bpf_vlog_copy[sizeof(bpf_vlog_copy) - 1] = '\0';
+		line_ptr = strtok(bpf_vlog_copy, "\n");
+		for (i = 0; i < MAX_MATCHES; i++) {
+			struct bpf_reg_match m = test->matches[i];
+
+			/* A NULL match string ends the list. */
+			if (!m.match)
+				break;
+			/* Matches are consumed in order: the scan resumes
+			 * from the log line the previous match stopped at.
+			 */
+			while (line_ptr) {
+				/* Sentinel for lines without a leading number. */
+				cur_line = ~0U;
+				sscanf(line_ptr, "%u: ", &cur_line);
+				if (cur_line == m.line)
+					break;
+				line_ptr = strtok(NULL, "\n");
+			}
+			if (!line_ptr) {
+				printf("Failed to find line %u for match: %s\n",
+				       m.line, m.match);
+				ret = 1;
+				printf("%s", bpf_vlog);
+				break;
+			}
+			if (!strstr(line_ptr, m.match)) {
+				printf("Failed to find match %u: %s\n",
+				       m.line, m.match);
+				ret = 1;
+				printf("%s", bpf_vlog);
+				break;
+			}
+		}
+		if (fd_prog >= 0)
+			close(fd_prog);
+	}
+	return ret;
+}
+
+/* Entry point: run each element of tests[] as its own subtest, skipping
+ * those for which test__start_subtest() returns false.
+ */
+void test_align(void)
+{
+	unsigned int idx = 0;
+
+	while (idx < ARRAY_SIZE(tests)) {
+		struct bpf_align_test *cur = &tests[idx++];
+
+		if (test__start_subtest(cur->descr)) {
+			CHECK_FAIL(do_test_single(cur));
+		}
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
new file mode 100644
index 000000000..a0ee87c8e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_attach_probe.skel.h"
+
+/* Scan /proc/self/maps for the first mapping whose permission field is
+ * "r-xp" and return its start address minus its file offset.
+ *
+ * Returns -errno when the maps file cannot be opened and -EINVAL when no
+ * such mapping is found.
+ */
+ssize_t get_base_addr() {
+	size_t start, offset;
+	char buf[256];
+	FILE *f;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f)
+		return -errno;
+
+	/* Fields: start-end perms offset <rest of line>. The %255s width
+	 * keeps the permission field from overrunning buf.
+	 */
+	while (fscanf(f, "%zx-%*x %255s %zx %*[^\n]\n",
+		      &start, buf, &offset) == 3) {
+		if (strcmp(buf, "r-xp") == 0) {
+			fclose(f);
+			return start - offset;
+		}
+	}
+
+	fclose(f);
+	return -EINVAL;
+}
+
+/* Attach a kprobe and a kretprobe on SYS_NANOSLEEP_KPROBE_NAME and a
+ * uprobe and a uretprobe on get_base_addr() in /proc/self/exe, trigger
+ * each pair, and check the result values the programs leave in the
+ * skeleton's .bss.
+ */
+void test_attach_probe(void)
+{
+	struct test_attach_probe *skel;
+	struct bpf_link *link;
+	size_t uprobe_offset;
+	ssize_t base_addr;
+	int duration = 0;
+
+	/* Offset of get_base_addr() relative to the value it returns. */
+	base_addr = get_base_addr();
+	if (CHECK(base_addr < 0, "get_base_addr",
+		  "failed to find base addr: %zd", base_addr))
+		return;
+	uprobe_offset = (size_t)&get_base_addr - base_addr;
+
+	skel = test_attach_probe__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+	if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n"))
+		goto cleanup;
+
+	/* Each link is stored in skel->links right after a successful attach. */
+	link = bpf_program__attach_kprobe(skel->progs.handle_kprobe,
+					  false /* retprobe */,
+					  SYS_NANOSLEEP_KPROBE_NAME);
+	if (CHECK(IS_ERR(link), "attach_kprobe",
+		  "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	skel->links.handle_kprobe = link;
+
+	link = bpf_program__attach_kprobe(skel->progs.handle_kretprobe,
+					  true /* retprobe */,
+					  SYS_NANOSLEEP_KPROBE_NAME);
+	if (CHECK(IS_ERR(link), "attach_kretprobe",
+		  "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	skel->links.handle_kretprobe = link;
+
+	link = bpf_program__attach_uprobe(skel->progs.handle_uprobe,
+					  false /* retprobe */,
+					  0 /* self pid */,
+					  "/proc/self/exe",
+					  uprobe_offset);
+	if (CHECK(IS_ERR(link), "attach_uprobe",
+		  "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	skel->links.handle_uprobe = link;
+
+	link = bpf_program__attach_uprobe(skel->progs.handle_uretprobe,
+					  true /* retprobe */,
+					  -1 /* any pid */,
+					  "/proc/self/exe",
+					  uprobe_offset);
+	if (CHECK(IS_ERR(link), "attach_uretprobe",
+		  "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	skel->links.handle_uretprobe = link;
+
+	/* trigger & validate kprobe && kretprobe */
+	usleep(1);
+
+	if (CHECK(skel->bss->kprobe_res != 1, "check_kprobe_res",
+		  "wrong kprobe res: %d\n", skel->bss->kprobe_res))
+		goto cleanup;
+	if (CHECK(skel->bss->kretprobe_res != 2, "check_kretprobe_res",
+		  "wrong kretprobe res: %d\n", skel->bss->kretprobe_res))
+		goto cleanup;
+
+	/* trigger & validate uprobe & uretprobe */
+	get_base_addr();
+
+	if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res",
+		  "wrong uprobe res: %d\n", skel->bss->uprobe_res))
+		goto cleanup;
+	if (CHECK(skel->bss->uretprobe_res != 4, "check_uretprobe_res",
+		  "wrong uretprobe res: %d\n", skel->bss->uretprobe_res))
+		goto cleanup;
+
+cleanup:
+	test_attach_probe__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/autoload.c b/tools/testing/selftests/bpf/prog_tests/autoload.c
new file mode 100644
index 000000000..3693f7d13
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/autoload.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <time.h>
+#include "test_autoload.skel.h"
+
+/* Check that the skeleton fails to load with all programs enabled and
+ * loads once autoload is turned off for prog3, and that after attaching
+ * prog1 and prog2 were called while prog3 was not.
+ */
+void test_autoload(void)
+{
+	struct test_autoload *obj;
+	int duration = 0;
+	int rc;
+
+	/* prog3 should be broken, so a plain open_and_load must fail */
+	obj = test_autoload__open_and_load();
+	if (CHECK(obj, "skel_open_and_load", "unexpected success\n"))
+		goto done;
+
+	obj = test_autoload__open();
+	if (CHECK(!obj, "skel_open", "failed to open skeleton\n"))
+		goto done;
+
+	/* leave prog3 out of the load */
+	bpf_program__set_autoload(obj->progs.prog3, false);
+
+	rc = test_autoload__load(obj);
+	if (CHECK(rc, "skel_load", "failed to load skeleton: %d\n", rc))
+		goto done;
+
+	rc = test_autoload__attach(obj);
+	if (CHECK(rc, "skel_attach", "skeleton attach failed: %d\n", rc))
+		goto done;
+
+	usleep(1);
+
+	CHECK(!obj->bss->prog1_called, "prog1", "not called\n");
+	CHECK(!obj->bss->prog2_called, "prog2", "not called\n");
+	CHECK(obj->bss->prog3_called, "prog3", "called?!\n");
+
+done:
+	test_autoload__destroy(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
new file mode 100644
index 000000000..448885b95
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c
@@ -0,0 +1,1074 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include "bpf_iter_ipv6_route.skel.h"
+#include "bpf_iter_netlink.skel.h"
+#include "bpf_iter_bpf_map.skel.h"
+#include "bpf_iter_task.skel.h"
+#include "bpf_iter_task_stack.skel.h"
+#include "bpf_iter_task_file.skel.h"
+#include "bpf_iter_task_btf.skel.h"
+#include "bpf_iter_tcp4.skel.h"
+#include "bpf_iter_tcp6.skel.h"
+#include "bpf_iter_udp4.skel.h"
+#include "bpf_iter_udp6.skel.h"
+#include "bpf_iter_test_kern1.skel.h"
+#include "bpf_iter_test_kern2.skel.h"
+#include "bpf_iter_test_kern3.skel.h"
+#include "bpf_iter_test_kern4.skel.h"
+#include "bpf_iter_bpf_hash_map.skel.h"
+#include "bpf_iter_bpf_percpu_hash_map.skel.h"
+#include "bpf_iter_bpf_array_map.skel.h"
+#include "bpf_iter_bpf_percpu_array_map.skel.h"
+#include "bpf_iter_bpf_sk_storage_map.skel.h"
+#include "bpf_iter_test_kern5.skel.h"
+#include "bpf_iter_test_kern6.skel.h"
+
+/* NOTE(review): not referenced by name in this file's visible code;
+ * presumably consumed by the CHECK() macro - confirm in test_progs.h.
+ */
+static int duration;
+
+/* bpf_iter_test_kern3 is expected to fail open_and_load; a successful
+ * load is reported as a failure and the skeleton is destroyed.
+ */
+static void test_btf_id_or_null(void)
+{
+	struct bpf_iter_test_kern3 *obj;
+
+	obj = bpf_iter_test_kern3__open_and_load();
+	if (!CHECK(obj, "bpf_iter_test_kern3__open_and_load",
+		   "skeleton open_and_load unexpectedly succeeded\n"))
+		return;
+
+	bpf_iter_test_kern3__destroy(obj);
+}
+
+/* Attach @prog as an iterator, create an iterator fd from the link and
+ * read it until read() stops returning data. The contents are ignored;
+ * only a read error is reported.
+ */
+static void do_dummy_read(struct bpf_program *prog)
+{
+	char scratch[16] = {};
+	struct bpf_link *link;
+	int iter_fd, len;
+
+	link = bpf_program__attach_iter(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		return;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (!CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) {
+		/* drain the iterator; the data itself is not inspected */
+		do {
+			len = read(iter_fd, scratch, sizeof(scratch));
+		} while (len > 0);
+		CHECK(len < 0, "read", "read failed: %s\n", strerror(errno));
+
+		close(iter_fd);
+	}
+
+	bpf_link__destroy(link);
+}
+
+/* Load the bpf_iter_ipv6_route skeleton and drain its dump_ipv6_route
+ * program once through do_dummy_read().
+ */
+static void test_ipv6_route(void)
+{
+	struct bpf_iter_ipv6_route *obj = bpf_iter_ipv6_route__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_ipv6_route__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_ipv6_route);
+		bpf_iter_ipv6_route__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_netlink skeleton and drain its dump_netlink program
+ * once through do_dummy_read().
+ */
+static void test_netlink(void)
+{
+	struct bpf_iter_netlink *obj = bpf_iter_netlink__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_netlink__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_netlink);
+		bpf_iter_netlink__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_bpf_map skeleton and drain its dump_bpf_map program
+ * once through do_dummy_read().
+ */
+static void test_bpf_map(void)
+{
+	struct bpf_iter_bpf_map *obj = bpf_iter_bpf_map__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_bpf_map__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_bpf_map);
+		bpf_iter_bpf_map__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_task skeleton and drain its dump_task program once
+ * through do_dummy_read().
+ */
+static void test_task(void)
+{
+	struct bpf_iter_task *obj = bpf_iter_task__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_task__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_task);
+		bpf_iter_task__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_task_stack skeleton and drain its dump_task_stack
+ * program once through do_dummy_read().
+ */
+static void test_task_stack(void)
+{
+	struct bpf_iter_task_stack *obj = bpf_iter_task_stack__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_task_stack__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_task_stack);
+		bpf_iter_task_stack__destroy(obj);
+	}
+}
+
+/* Thread start routine used by test_task_file(): ends the calling thread
+ * right away via pthread_exit(), handing @arg back as its exit value.
+ */
+static void *do_nothing(void *arg)
+{
+ pthread_exit(arg);
+}
+
+/* Run the dump_task_file iterator with bss->tgid set to this process id
+ * while a helper thread has been created, then check that the thread
+ * joined with a NULL exit value and that the program left bss->count
+ * at 0.
+ */
+static void test_task_file(void)
+{
+	struct bpf_iter_task_file *skel;
+	void *thread_ret;
+	pthread_t tid;
+
+	skel = bpf_iter_task_file__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task_file__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	skel->bss->tgid = getpid();
+
+	if (!CHECK(pthread_create(&tid, NULL, &do_nothing, NULL),
+		   "pthread_create", "pthread_create failed\n")) {
+		do_dummy_read(skel->progs.dump_task_file);
+
+		if (!CHECK(pthread_join(tid, &thread_ret) || thread_ret != NULL,
+			   "pthread_join", "pthread_join failed\n")) {
+			CHECK(skel->bss->count != 0, "check_count",
+			      "invalid non pthread file visit count %d\n",
+			      skel->bss->count);
+		}
+	}
+
+	bpf_iter_task_file__destroy(skel);
+}
+
+/* Size in bytes of taskbuf below. */
+#define TASKBUFSZ 32768
+
+/* Receives the iterator output read by do_btf_read(). */
+static char taskbuf[TASKBUFSZ];
+
+/* Attach the dump_task_struct iterator program, read its output into
+ * taskbuf and check that the text contains "(struct task_struct)".
+ *
+ * Returns 1 when the BPF program set bss->skip (the test is then marked
+ * as skipped) and 0 otherwise; other failures are reported via CHECK().
+ */
+static int do_btf_read(struct bpf_iter_task_btf *skel)
+{
+	struct bpf_program *prog = skel->progs.dump_task_struct;
+	struct bpf_iter_task_btf__bss *bss = skel->bss;
+	/* One byte is held back so the data can always be NUL-terminated
+	 * before it is handed to strstr().
+	 */
+	int iter_fd = -1, len = 0, bufleft = TASKBUFSZ - 1;
+	struct bpf_link *link;
+	char *buf = taskbuf;
+	int ret = 0;
+
+	link = bpf_program__attach_iter(prog, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		return ret;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* Accumulate output until EOF, an error, or the buffer is full
+	 * (a zero-length read returns 0 and ends the loop).
+	 */
+	do {
+		len = read(iter_fd, buf, bufleft);
+		if (len > 0) {
+			buf += len;
+			bufleft -= len;
+		}
+	} while (len > 0);
+	*buf = '\0';
+
+	if (bss->skip) {
+		printf("%s:SKIP:no __builtin_btf_type_id\n", __func__);
+		ret = 1;
+		test__skip();
+		goto free_link;
+	}
+
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto free_link;
+
+	CHECK(strstr(taskbuf, "(struct task_struct)") == NULL,
+	      "check for btf representation of task_struct in iter data",
+	      "struct task_struct not found");
+free_link:
+	/* 0 is a valid descriptor; only the -1 initial value and a failed
+	 * bpf_iter_create() leave nothing to close.
+	 */
+	if (iter_fd >= 0)
+		close(iter_fd);
+	bpf_link__destroy(link);
+	return ret;
+}
+
+/* Load the bpf_iter_task_btf skeleton, run do_btf_read() on it and,
+ * unless that returned non-zero, check that the program counted at
+ * least one task and recorded no seq_err.
+ */
+static void test_task_btf(void)
+{
+	struct bpf_iter_task_btf__bss *bss;
+	struct bpf_iter_task_btf *skel;
+
+	skel = bpf_iter_task_btf__open_and_load();
+	if (CHECK(!skel, "bpf_iter_task_btf__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	bss = skel->bss;
+
+	if (do_btf_read(skel) == 0 &&
+	    !CHECK(bss->tasks == 0, "check if iterated over tasks",
+		   "no task iteration, did BPF program run?\n")) {
+		CHECK(bss->seq_err != 0, "check for unexpected err",
+		      "bpf_seq_printf_btf returned %ld", bss->seq_err);
+	}
+
+	bpf_iter_task_btf__destroy(skel);
+}
+
+/* Load the bpf_iter_tcp4 skeleton and drain its dump_tcp4 program once
+ * through do_dummy_read().
+ */
+static void test_tcp4(void)
+{
+	struct bpf_iter_tcp4 *obj = bpf_iter_tcp4__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_tcp4__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_tcp4);
+		bpf_iter_tcp4__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_tcp6 skeleton and drain its dump_tcp6 program once
+ * through do_dummy_read().
+ */
+static void test_tcp6(void)
+{
+	struct bpf_iter_tcp6 *obj = bpf_iter_tcp6__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_tcp6__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_tcp6);
+		bpf_iter_tcp6__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_udp4 skeleton and drain its dump_udp4 program once
+ * through do_dummy_read().
+ */
+static void test_udp4(void)
+{
+	struct bpf_iter_udp4 *obj = bpf_iter_udp4__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_udp4__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_udp4);
+		bpf_iter_udp4__destroy(obj);
+	}
+}
+
+/* Load the bpf_iter_udp6 skeleton and drain its dump_udp6 program once
+ * through do_dummy_read().
+ */
+static void test_udp6(void)
+{
+	struct bpf_iter_udp6 *obj = bpf_iter_udp6__open_and_load();
+
+	if (!CHECK(!obj, "bpf_iter_udp6__open_and_load",
+		   "skeleton open_and_load failed\n")) {
+		do_dummy_read(obj->progs.dump_udp6);
+		bpf_iter_udp6__destroy(obj);
+	}
+}
+
+/* Read everything from @iter_fd into a 16-byte buffer and compare it to
+ * @expected, which must therefore be shorter than 16 bytes. With
+ * @read_one_char set the data is fetched one byte per read() call,
+ * otherwise each call asks for all the space that is left.
+ *
+ * Returns 0 when the data equals @expected and -1 on a read error, on
+ * receiving 16 or more bytes, or on a mismatch.
+ */
+static int do_read_with_fd(int iter_fd, const char *expected,
+			   bool read_one_char)
+{
+	int filled = 0, want, got, mismatch;
+	char data[16] = {};
+
+	want = read_one_char ? 1 : 16;
+	for (;;) {
+		got = read(iter_fd, data + filled, want);
+		if (got <= 0)
+			break;
+
+		filled += got;
+		if (CHECK(filled >= 16, "read", "read len %d\n", got))
+			return -1;
+		want = read_one_char ? 1 : 16 - filled;
+	}
+	if (CHECK(got < 0, "read", "read failed: %s\n", strerror(errno)))
+		return -1;
+
+	mismatch = strcmp(data, expected);
+	if (CHECK(mismatch, "read",
+		  "incorrect read result: buf %s, expected %s\n",
+		  data, expected))
+		return -1;
+
+	return 0;
+}
+
+/* Attach the bpf_iter_test_kern1 skeleton, create an iterator fd from
+ * its dump_task link and check via do_read_with_fd() that it yields
+ * "abcd". @read_one_char is forwarded to do_read_with_fd().
+ */
+static void test_anon_iter(bool read_one_char)
+{
+	struct bpf_iter_test_kern1 *skel;
+	int iter_fd;
+
+	skel = bpf_iter_test_kern1__open_and_load();
+	if (CHECK(!skel, "bpf_iter_test_kern1__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	if (CHECK(bpf_iter_test_kern1__attach(skel),
+		  "bpf_iter_test_kern1__attach",
+		  "skeleton attach failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(skel->links.dump_task));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto out;
+
+	/* a mismatch is reported by do_read_with_fd() itself */
+	do_read_with_fd(iter_fd, "abcd", read_one_char);
+	close(iter_fd);
+
+out:
+	bpf_iter_test_kern1__destroy(skel);
+}
+
+/* Open @path read-only and compare its contents to @expected using
+ * do_read_with_fd() in multi-byte read mode.
+ *
+ * Returns -1 if the open fails, otherwise do_read_with_fd()'s result.
+ */
+static int do_read(const char *path, const char *expected)
+{
+	int fd, rc;
+
+	fd = open(path, O_RDONLY);
+	if (CHECK(fd < 0, "open", "open %s failed: %s\n",
+		  path, strerror(errno)))
+		return -1;
+
+	rc = do_read_with_fd(fd, expected, false);
+	close(fd);
+
+	return rc;
+}
+
+/* Pin an iterator link for bpf_iter_test_kern1's dump_task program at a
+ * bpffs path and check that reading the path yields "abcd"; then switch
+ * the link to bpf_iter_test_kern2's dump_task program and read the path
+ * again, expecting "ABCD".
+ */
+static void test_file_iter(void)
+{
+ const char *path = "/sys/fs/bpf/bpf_iter_test1";
+ struct bpf_iter_test_kern1 *skel1;
+ struct bpf_iter_test_kern2 *skel2;
+ struct bpf_link *link;
+ int err;
+
+ skel1 = bpf_iter_test_kern1__open_and_load();
+ if (CHECK(!skel1, "bpf_iter_test_kern1__open_and_load",
+ "skeleton open_and_load failed\n"))
+ return;
+
+ link = bpf_program__attach_iter(skel1->progs.dump_task, NULL);
+ if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+ goto out;
+
+ /* unlink this path if it exists. */
+ unlink(path);
+
+ err = bpf_link__pin(link, path);
+ if (CHECK(err, "pin_iter", "pin_iter to %s failed: %d\n", path, err))
+ goto free_link;
+
+ err = do_read(path, "abcd");
+ if (err)
+ goto unlink_path;
+
+ /* The file based iterator seems to be working fine. Let us do a link
+ * update of the underlying link and `cat` the iterator again; its
+ * content should change.
+ */
+ skel2 = bpf_iter_test_kern2__open_and_load();
+ if (CHECK(!skel2, "bpf_iter_test_kern2__open_and_load",
+ "skeleton open_and_load failed\n"))
+ goto unlink_path;
+
+ err = bpf_link__update_program(link, skel2->progs.dump_task);
+ if (CHECK(err, "update_prog", "update_prog failed\n"))
+ goto destroy_skel2;
+
+ /* the result is not checked here; do_read() reports a mismatch itself */
+ do_read(path, "ABCD");
+
+ /* cleanup labels undo the setup steps in reverse order */
+destroy_skel2:
+ bpf_iter_test_kern2__destroy(skel2);
+unlink_path:
+ unlink(path);
+free_link:
+ bpf_link__destroy(link);
+out:
+ bpf_iter_test_kern1__destroy(skel1);
+}
+
+/* Exercise seq_file buffer overflow handling of the dump_bpf_map
+ * iterator program in bpf_iter_test_kern4.
+ *
+ * Two array maps are created and their ids are passed to the program
+ * through .bss. rodata->print_len and the expected total read length are
+ * derived from iter_size (page size << 3) according to the mode:
+ *   - @test_e2big_overflow: print_len is (iter_size + 8) / 8 and the
+ *     read is expected to fail with E2BIG;
+ *   - neither flag set: print_len is (iter_size - 8) / 8 and
+ *     2 * (iter_size - 8) bytes are expected in total;
+ *   - @ret1 only: print_len is 1 and 16 bytes are expected in total.
+ * @ret1 is also copied to rodata->ret1.
+ *
+ * Except in the E2BIG mode, the total length read and the
+ * map1_accessed, map2_accessed and map2_seqnum1/2 values recorded by the
+ * program are then verified.
+ */
+static void test_overflow(bool test_e2big_overflow, bool ret1)
+{
+	__u32 map_info_len, total_read_len, expected_read_len;
+	int err, iter_fd, map1_fd, map2_fd, len;
+	struct bpf_map_info map_info = {};
+	struct bpf_iter_test_kern4 *skel;
+	struct bpf_link *link;
+	__u32 iter_size;
+	char *buf;
+
+	skel = bpf_iter_test_kern4__open();
+	if (CHECK(!skel, "bpf_iter_test_kern4__open",
+		  "skeleton open failed\n"))
+		return;
+
+	/* create two maps: bpf program will only do bpf_seq_write
+	 * for these two maps. The goal is one map output almost
+	 * fills seq_file buffer and then the other will trigger
+	 * overflow and needs restart.
+	 */
+	map1_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map1_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto out;
+	map2_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (CHECK(map2_fd < 0, "bpf_create_map",
+		  "map_creation failed: %s\n", strerror(errno)))
+		goto free_map1;
+
+	/* bpf_seq_printf kernel buffer is 8 pages, so one map
+	 * bpf_seq_write will mostly fill it, and the other map
+	 * will partially fill and then trigger overflow and need
+	 * bpf_seq_read restart.
+	 */
+	iter_size = sysconf(_SC_PAGE_SIZE) << 3;
+
+	if (test_e2big_overflow) {
+		skel->rodata->print_len = (iter_size + 8) / 8;
+		expected_read_len = 2 * (iter_size + 8);
+	} else if (!ret1) {
+		skel->rodata->print_len = (iter_size - 8) / 8;
+		expected_read_len = 2 * (iter_size - 8);
+	} else {
+		skel->rodata->print_len = 1;
+		expected_read_len = 2 * 8;
+	}
+	skel->rodata->ret1 = ret1;
+
+	if (CHECK(bpf_iter_test_kern4__load(skel),
+		  "bpf_iter_test_kern4__load", "skeleton load failed\n"))
+		goto free_map2;
+
+	/* setup filtering map_id in bpf program */
+	map_info_len = sizeof(map_info);
+	err = bpf_obj_get_info_by_fd(map1_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map1_id = map_info.id;
+
+	err = bpf_obj_get_info_by_fd(map2_fd, &map_info, &map_info_len);
+	if (CHECK(err, "get_map_info", "get map info failed: %s\n",
+		  strerror(errno)))
+		goto free_map2;
+	skel->bss->map2_id = map_info.id;
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_map, NULL);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto free_map2;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* Report an allocation failure instead of silently skipping the
+	 * rest of the test.
+	 */
+	buf = malloc(expected_read_len);
+	if (CHECK(!buf, "malloc", "malloc failed\n"))
+		goto close_iter;
+
+	/* do read */
+	total_read_len = 0;
+	if (test_e2big_overflow) {
+		while ((len = read(iter_fd, buf, expected_read_len)) > 0)
+			total_read_len += len;
+
+		CHECK(len != -1 || errno != E2BIG, "read",
+		      "expected ret -1, errno E2BIG, but get ret %d, error %s\n",
+		      len, strerror(errno));
+		goto free_buf;
+	} else if (!ret1) {
+		while ((len = read(iter_fd, buf, expected_read_len)) > 0)
+			total_read_len += len;
+
+		if (CHECK(len < 0, "read", "read failed: %s\n",
+			  strerror(errno)))
+			goto free_buf;
+	} else {
+		/* read() signals errors by returning -1 with errno set,
+		 * so a retryable EAGAIN has to be detected through errno;
+		 * the return value itself is never -EAGAIN.
+		 */
+		do {
+			len = read(iter_fd, buf, expected_read_len);
+			if (len > 0)
+				total_read_len += len;
+		} while (len > 0 || (len < 0 && errno == EAGAIN));
+
+		if (CHECK(len < 0, "read", "read failed: %s\n",
+			  strerror(errno)))
+			goto free_buf;
+	}
+
+	if (CHECK(total_read_len != expected_read_len, "read",
+		  "total len %u, expected len %u\n", total_read_len,
+		  expected_read_len))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map1_accessed != 1, "map1_accessed",
+		  "expected 1 actual %d\n", skel->bss->map1_accessed))
+		goto free_buf;
+
+	if (CHECK(skel->bss->map2_accessed != 2, "map2_accessed",
+		  "expected 2 actual %d\n", skel->bss->map2_accessed))
+		goto free_buf;
+
+	CHECK(skel->bss->map2_seqnum1 != skel->bss->map2_seqnum2,
+	      "map2_seqnum", "two different seqnum %lld %lld\n",
+	      skel->bss->map2_seqnum1, skel->bss->map2_seqnum2);
+
+free_buf:
+	free(buf);
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+free_map2:
+	close(map2_fd);
+free_map1:
+	close(map1_fd);
+out:
+	bpf_iter_test_kern4__destroy(skel);
+}
+
+/* Exercise the bpf_hash_map iterator program.
+ *
+ * Attaching the iterator to hashmap2 and hashmap3 is expected to fail, while
+ * hashmap1 must be accepted.  hashmap1 is populated here, the iterator is
+ * read until EOF, and the key/value sums accumulated by the BPF program in
+ * its .bss are compared with the sums computed while populating the map.
+ */
+static void test_bpf_hash_map(void)
+{
+	/* expected_key_c is accumulated for symmetry but not compared below */
+	__u32 expected_key_a = 0, expected_key_b = 0, expected_key_c = 0;
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_bpf_hash_map *skel;
+	int err, i, len, map_fd, iter_fd;
+	union bpf_iter_link_info linfo;
+	__u64 val, expected_val = 0;
+	struct bpf_link *link;
+	struct key_t {
+		int a;
+		int b;
+		int c;
+	} key;
+	char buf[64];
+
+	skel = bpf_iter_bpf_hash_map__open();
+	if (CHECK(!skel, "bpf_iter_bpf_hash_map__open",
+		  "skeleton open failed\n"))
+		return;
+
+	skel->bss->in_test_mode = true;
+
+	/* Test the load result itself: skel is already known to be non-NULL
+	 * at this point, so re-checking it would hide a failed load.
+	 */
+	err = bpf_iter_bpf_hash_map__load(skel);
+	if (CHECK(err, "bpf_iter_bpf_hash_map__load",
+		  "skeleton load failed\n"))
+		goto out;
+
+	/* iterator with hashmap2 and hashmap3 should fail */
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = bpf_map__fd(skel->maps.hashmap2);
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
+	/* on unexpected success the link is valid and must be destroyed */
+	if (CHECK(!IS_ERR(link), "attach_iter",
+		  "attach_iter for hashmap2 unexpected succeeded\n"))
+		goto free_link;
+
+	linfo.map.map_fd = bpf_map__fd(skel->maps.hashmap3);
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
+	if (CHECK(!IS_ERR(link), "attach_iter",
+		  "attach_iter for hashmap3 unexpected succeeded\n"))
+		goto free_link;
+
+	/* hashmap1 should be good, update map values here */
+	map_fd = bpf_map__fd(skel->maps.hashmap1);
+	for (i = 0; i < bpf_map__max_entries(skel->maps.hashmap1); i++) {
+		key.a = i + 1;
+		key.b = i + 2;
+		key.c = i + 3;
+		val = i + 4;
+		expected_key_a += key.a;
+		expected_key_b += key.b;
+		expected_key_c += key.c;
+		expected_val += val;
+
+		err = bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);
+		if (CHECK(err, "map_update", "map_update failed\n"))
+			goto out;
+	}
+
+	linfo.map.map_fd = map_fd;
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* drain the iterator; only the side effects in .bss matter */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto close_iter;
+
+	/* test results */
+	if (CHECK(skel->bss->key_sum_a != expected_key_a,
+		  "key_sum_a", "got %u expected %u\n",
+		  skel->bss->key_sum_a, expected_key_a))
+		goto close_iter;
+	if (CHECK(skel->bss->key_sum_b != expected_key_b,
+		  "key_sum_b", "got %u expected %u\n",
+		  skel->bss->key_sum_b, expected_key_b))
+		goto close_iter;
+	if (CHECK(skel->bss->val_sum != expected_val,
+		  "val_sum", "got %llu expected %llu\n",
+		  skel->bss->val_sum, expected_val))
+		goto close_iter;
+
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+out:
+	bpf_iter_bpf_hash_map__destroy(skel);
+}
+
+/* Exercise the bpf_percpu_hash_map iterator program.
+ *
+ * hashmap1 is filled with one 8-byte slot per possible CPU for every key,
+ * the iterator is read until EOF, and the key/value sums accumulated by the
+ * BPF program are compared with the sums computed while filling the map.
+ */
+static void test_bpf_percpu_hash_map(void)
+{
+	/* expected_key_c is accumulated for symmetry but not compared below */
+	__u32 expected_key_a = 0, expected_key_b = 0, expected_key_c = 0;
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_bpf_percpu_hash_map *skel;
+	int err, i, j, len, map_fd, iter_fd;
+	int nr_cpus = bpf_num_possible_cpus();
+	union bpf_iter_link_info linfo;
+	__u32 expected_val = 0;
+	struct bpf_link *link;
+	struct key_t {
+		int a;
+		int b;
+		int c;
+	} key;
+	char buf[64];
+	void *val = NULL;
+
+	skel = bpf_iter_bpf_percpu_hash_map__open();
+	if (CHECK(!skel, "bpf_iter_bpf_percpu_hash_map__open",
+		  "skeleton open failed\n"))
+		return;
+
+	/* One 8-byte slot per possible CPU.  Allocated after the skeleton is
+	 * open so that every later exit path funnels through "out", which
+	 * frees the buffer.
+	 */
+	val = malloc(8 * nr_cpus);
+	if (CHECK(!val, "malloc", "per-cpu value buffer allocation failed\n"))
+		goto out;
+
+	skel->rodata->num_cpus = nr_cpus;
+
+	/* Test the load result itself, not the already-validated skel. */
+	err = bpf_iter_bpf_percpu_hash_map__load(skel);
+	if (CHECK(err, "bpf_iter_bpf_percpu_hash_map__load",
+		  "skeleton load failed\n"))
+		goto out;
+
+	/* update map values here */
+	map_fd = bpf_map__fd(skel->maps.hashmap1);
+	for (i = 0; i < bpf_map__max_entries(skel->maps.hashmap1); i++) {
+		key.a = i + 1;
+		key.b = i + 2;
+		key.c = i + 3;
+		expected_key_a += key.a;
+		expected_key_b += key.b;
+		expected_key_c += key.c;
+
+		/* only the first 4 bytes of each 8-byte slot are written */
+		for (j = 0; j < nr_cpus; j++) {
+			*(__u32 *)(val + j * 8) = i + j;
+			expected_val += i + j;
+		}
+
+		err = bpf_map_update_elem(map_fd, &key, val, BPF_ANY);
+		if (CHECK(err, "map_update", "map_update failed\n"))
+			goto out;
+	}
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = map_fd;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_hash_map, &opts);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* drain the iterator; only the side effects in .bss matter */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto close_iter;
+
+	/* test results */
+	if (CHECK(skel->bss->key_sum_a != expected_key_a,
+		  "key_sum_a", "got %u expected %u\n",
+		  skel->bss->key_sum_a, expected_key_a))
+		goto close_iter;
+	if (CHECK(skel->bss->key_sum_b != expected_key_b,
+		  "key_sum_b", "got %u expected %u\n",
+		  skel->bss->key_sum_b, expected_key_b))
+		goto close_iter;
+	if (CHECK(skel->bss->val_sum != expected_val,
+		  "val_sum", "got %u expected %u\n",
+		  skel->bss->val_sum, expected_val))
+		goto close_iter;
+
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+out:
+	free(val);
+	bpf_iter_bpf_percpu_hash_map__destroy(skel);
+}
+
+/* Exercise the bpf_array_map iterator program.
+ *
+ * arraymap1 is filled with value = index + 4, the iterator output is read
+ * into a small buffer, and three things are verified: the first key/value
+ * pair found in the output, the key/value sums accumulated by the BPF
+ * program, and finally that every map value equals its index once the
+ * iterator has run (presumably rewritten by the BPF program, which is not
+ * visible here).
+ */
+static void test_bpf_array_map(void)
+{
+	__u64 val, expected_val = 0, res_first_val, first_val = 0;
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	__u32 expected_key = 0, res_first_key;
+	struct bpf_iter_bpf_array_map *skel;
+	union bpf_iter_link_info linfo;
+	int err, i, map_fd, iter_fd;
+	struct bpf_link *link;
+	char buf[64] = {};
+	int len, start;
+
+	skel = bpf_iter_bpf_array_map__open_and_load();
+	if (CHECK(!skel, "bpf_iter_bpf_array_map__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	map_fd = bpf_map__fd(skel->maps.arraymap1);
+	for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) {
+		val = i + 4;
+		expected_key += i;
+		expected_val += val;
+
+		if (i == 0)
+			first_val = val;
+
+		err = bpf_map_update_elem(map_fd, &i, &val, BPF_ANY);
+		if (CHECK(err, "map_update", "map_update failed\n"))
+			goto out;
+	}
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = map_fd;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* accumulate the output; reading stops at EOF or when buf is full */
+	start = 0;
+	while ((len = read(iter_fd, buf + start, sizeof(buf) - start)) > 0)
+		start += len;
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto close_iter;
+
+	/* test results: output starts with a __u32 key followed by a __u64 value */
+	res_first_key = *(__u32 *)buf;
+	res_first_val = *(__u64 *)(buf + sizeof(__u32));
+	if (CHECK(res_first_key != 0 || res_first_val != first_val,
+		  "bpf_seq_write",
+		  "seq_write failure: first key %u vs expected 0, "
+		  " first value %llu vs expected %llu\n",
+		  res_first_key, res_first_val, first_val))
+		goto close_iter;
+
+	if (CHECK(skel->bss->key_sum != expected_key,
+		  "key_sum", "got %u expected %u\n",
+		  skel->bss->key_sum, expected_key))
+		goto close_iter;
+	if (CHECK(skel->bss->val_sum != expected_val,
+		  "val_sum", "got %llu expected %llu\n",
+		  skel->bss->val_sum, expected_val))
+		goto close_iter;
+
+	/* Failures below must still release iter_fd and the link, so they
+	 * jump to close_iter rather than straight to out.
+	 */
+	for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) {
+		err = bpf_map_lookup_elem(map_fd, &i, &val);
+		if (CHECK(err, "map_lookup", "map_lookup failed\n"))
+			goto close_iter;
+		if (CHECK(i != val, "invalid_val",
+			  "got value %llu expected %u\n", val, i))
+			goto close_iter;
+	}
+
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+out:
+	bpf_iter_bpf_array_map__destroy(skel);
+}
+
+/* Exercise the bpf_percpu_array_map iterator program.
+ *
+ * arraymap1 is filled with one 8-byte slot per possible CPU for every index,
+ * the iterator is read until EOF, and the key/value sums accumulated by the
+ * BPF program are compared with the sums computed while filling the map.
+ */
+static void test_bpf_percpu_array_map(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_bpf_percpu_array_map *skel;
+	__u32 expected_key = 0, expected_val = 0;
+	int nr_cpus = bpf_num_possible_cpus();
+	union bpf_iter_link_info linfo;
+	int err, i, j, map_fd, iter_fd;
+	struct bpf_link *link;
+	char buf[64];
+	void *val = NULL;
+	int len;
+
+	skel = bpf_iter_bpf_percpu_array_map__open();
+	if (CHECK(!skel, "bpf_iter_bpf_percpu_array_map__open",
+		  "skeleton open failed\n"))
+		return;
+
+	/* One 8-byte slot per possible CPU.  Allocated after the skeleton is
+	 * open so that every later exit path funnels through "out", which
+	 * frees the buffer.
+	 */
+	val = malloc(8 * nr_cpus);
+	if (CHECK(!val, "malloc", "per-cpu value buffer allocation failed\n"))
+		goto out;
+
+	skel->rodata->num_cpus = nr_cpus;
+
+	/* Test the load result itself, not the already-validated skel. */
+	err = bpf_iter_bpf_percpu_array_map__load(skel);
+	if (CHECK(err, "bpf_iter_bpf_percpu_array_map__load",
+		  "skeleton load failed\n"))
+		goto out;
+
+	/* update map values here */
+	map_fd = bpf_map__fd(skel->maps.arraymap1);
+	for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) {
+		expected_key += i;
+
+		/* only the first 4 bytes of each 8-byte slot are written */
+		for (j = 0; j < nr_cpus; j++) {
+			*(__u32 *)(val + j * 8) = i + j;
+			expected_val += i + j;
+		}
+
+		err = bpf_map_update_elem(map_fd, &i, val, BPF_ANY);
+		if (CHECK(err, "map_update", "map_update failed\n"))
+			goto out;
+	}
+
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = map_fd;
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_array_map, &opts);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* drain the iterator; only the side effects in .bss matter */
+	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+		;
+	if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto close_iter;
+
+	/* test results */
+	if (CHECK(skel->bss->key_sum != expected_key,
+		  "key_sum", "got %u expected %u\n",
+		  skel->bss->key_sum, expected_key))
+		goto close_iter;
+	if (CHECK(skel->bss->val_sum != expected_val,
+		  "val_sum", "got %u expected %u\n",
+		  skel->bss->val_sum, expected_val))
+		goto close_iter;
+
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+out:
+	free(val);
+	bpf_iter_bpf_percpu_array_map__destroy(skel);
+}
+
+/* Exercise the bpf_sk_storage_map iterator program.
+ *
+ * Three IPv6 TCP sockets are created and each gets a distinct value stored
+ * in sk_stg_map.  After the iterator has been read to EOF, the socket count
+ * and value sum recorded by the BPF program must match what was stored.
+ */
+static void test_bpf_sk_storage_map(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	int sock_fd[3] = {-1, -1, -1};
+	int num_sockets = ARRAY_SIZE(sock_fd);
+	struct bpf_iter_bpf_sk_storage_map *skel;
+	int rc, idx, nread, map_fd, iter_fd;
+	__u32 stg_val, expected_val = 0;
+	union bpf_iter_link_info linfo;
+	struct bpf_link *link;
+	char scratch[64];
+
+	skel = bpf_iter_bpf_sk_storage_map__open_and_load();
+	if (CHECK(!skel, "bpf_iter_bpf_sk_storage_map__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	/* create the sockets and attach one storage value to each */
+	map_fd = bpf_map__fd(skel->maps.sk_stg_map);
+	for (idx = 0; idx < num_sockets; idx++) {
+		sock_fd[idx] = socket(AF_INET6, SOCK_STREAM, 0);
+		if (CHECK(sock_fd[idx] < 0, "socket", "errno: %d\n", errno))
+			goto out;
+
+		stg_val = idx + 1;
+		expected_val += stg_val;
+
+		rc = bpf_map_update_elem(map_fd, &sock_fd[idx], &stg_val,
+					 BPF_NOEXIST);
+		if (CHECK(rc, "map_update", "map_update failed\n"))
+			goto out;
+	}
+
+	/* attach the iterator program to the storage map */
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = map_fd;
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_sk_storage_map, &opts);
+	if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+		goto out;
+
+	iter_fd = bpf_iter_create(bpf_link__fd(link));
+	if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+		goto free_link;
+
+	/* read until EOF or error; the data itself is discarded */
+	do {
+		nread = read(iter_fd, scratch, sizeof(scratch));
+	} while (nread > 0);
+	if (CHECK(nread < 0, "read", "read failed: %s\n", strerror(errno)))
+		goto close_iter;
+
+	/* compare what the BPF program saw with what was stored */
+	if (CHECK(skel->bss->ipv6_sk_count != num_sockets,
+		  "ipv6_sk_count", "got %u expected %u\n",
+		  skel->bss->ipv6_sk_count, num_sockets))
+		goto close_iter;
+
+	if (CHECK(skel->bss->val_sum != expected_val,
+		  "val_sum", "got %u expected %u\n",
+		  skel->bss->val_sum, expected_val))
+		goto close_iter;
+
+close_iter:
+	close(iter_fd);
+free_link:
+	bpf_link__destroy(link);
+out:
+	/* slots still holding -1 were never opened */
+	for (idx = 0; idx < num_sockets; idx++) {
+		if (sock_fd[idx] < 0)
+			continue;
+		close(sock_fd[idx]);
+	}
+	bpf_iter_bpf_sk_storage_map__destroy(skel);
+}
+
+/* Negative test: attaching the test_kern5 iterator program to hashmap1 is
+ * expected to fail.  If it succeeds anyway, the failure is reported and the
+ * link is destroyed.
+ */
+static void test_rdonly_buf_out_of_bound(void)
+{
+	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+	struct bpf_iter_test_kern5 *skel;
+	union bpf_iter_link_info linfo;
+	struct bpf_link *link;
+	int attached;
+
+	skel = bpf_iter_test_kern5__open_and_load();
+	if (CHECK(!skel, "bpf_iter_test_kern5__open_and_load",
+		  "skeleton open_and_load failed\n"))
+		return;
+
+	opts.link_info = &linfo;
+	opts.link_info_len = sizeof(linfo);
+	memset(&linfo, 0, sizeof(linfo));
+	linfo.map.map_fd = bpf_map__fd(skel->maps.hashmap1);
+
+	link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts);
+	attached = !IS_ERR(link);
+	if (CHECK(attached, "attach_iter", "unexpected success\n"))
+		bpf_link__destroy(link);
+
+	bpf_iter_test_kern5__destroy(skel);
+}
+
+/* Negative test: opening and loading the test_kern6 skeleton is expected to
+ * fail.  A skeleton that loads anyway is reported and then destroyed.
+ */
+static void test_buf_neg_offset(void)
+{
+	struct bpf_iter_test_kern6 *skel = bpf_iter_test_kern6__open_and_load();
+
+	if (!CHECK(skel, "bpf_iter_test_kern6__open_and_load",
+		   "skeleton open_and_load unexpected success\n"))
+		return;
+
+	bpf_iter_test_kern6__destroy(skel);
+}
+
+void test_bpf_iter(void)
+{
+ if (test__start_subtest("btf_id_or_null"))
+ test_btf_id_or_null();
+ if (test__start_subtest("ipv6_route"))
+ test_ipv6_route();
+ if (test__start_subtest("netlink"))
+ test_netlink();
+ if (test__start_subtest("bpf_map"))
+ test_bpf_map();
+ if (test__start_subtest("task"))
+ test_task();
+ if (test__start_subtest("task_stack"))
+ test_task_stack();
+ if (test__start_subtest("task_file"))
+ test_task_file();
+ if (test__start_subtest("task_btf"))
+ test_task_btf();
+ if (test__start_subtest("tcp4"))
+ test_tcp4();
+ if (test__start_subtest("tcp6"))
+ test_tcp6();
+ if (test__start_subtest("udp4"))
+ test_udp4();
+ if (test__start_subtest("udp6"))
+ test_udp6();
+ if (test__start_subtest("anon"))
+ test_anon_iter(false);
+ if (test__start_subtest("anon-read-one-char"))
+ test_anon_iter(true);
+ if (test__start_subtest("file"))
+ test_file_iter();
+ if (test__start_subtest("overflow"))
+ test_overflow(false, false);
+ if (test__start_subtest("overflow-e2big"))
+ test_overflow(true, false);
+ if (test__start_subtest("prog-ret-1"))
+ test_overflow(false, true);
+ if (test__start_subtest("bpf_hash_map"))
+ test_bpf_hash_map();
+ if (test__start_subtest("bpf_percpu_hash_map"))
+ test_bpf_percpu_hash_map();
+ if (test__start_subtest("bpf_array_map"))
+ test_bpf_array_map();
+ if (test__start_subtest("bpf_percpu_array_map"))
+ test_bpf_percpu_array_map();
+ if (test__start_subtest("bpf_sk_storage_map"))
+ test_bpf_sk_storage_map();
+ if (test__start_subtest("rdonly-buf-out-of-bound"))
+ test_rdonly_buf_out_of_bound();
+ if (test__start_subtest("buf-neg-offset"))
+ test_buf_neg_offset();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
new file mode 100644
index 000000000..284d5921c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_obj_id.c
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+#define nr_iters 2
+
+/* Exercise the BPF object-ID APIs for programs, maps and links.
+ *
+ * 1. Looking up ID 0 must fail with ENOENT for progs, maps and links.
+ * 2. test_obj_id.o is loaded and attached nr_iters times; for each instance
+ *    the prog, map and link info returned by bpf_obj_get_info_by_fd() is
+ *    validated and remembered.
+ * 3. Every prog, map and link ID is walked with the *_get_next_id() calls;
+ *    each of our own objects must be found, and the info fetched through the
+ *    ID-derived fd must match what was remembered in step 2.
+ */
+void test_bpf_obj_id(void)
+{
+	const __u64 array_magic_value = 0xfaceb00c;
+	const __u32 array_key = 0;
+	const char *file = "./test_obj_id.o";
+	const char *expected_prog_name = "test_obj_id";
+	const char *expected_map_name = "test_map_id";
+	const __u64 nsec_per_sec = 1000000000;
+
+	struct bpf_object *objs[nr_iters] = {};
+	struct bpf_link *links[nr_iters] = {};
+	struct bpf_program *prog;
+	int prog_fds[nr_iters], map_fds[nr_iters];
+	/* +1 to test for the info_len returned by kernel */
+	struct bpf_prog_info prog_infos[nr_iters + 1];
+	struct bpf_map_info map_infos[nr_iters + 1];
+	struct bpf_link_info link_infos[nr_iters + 1];
+	/* Each prog only uses one map. +1 to test nr_map_ids
+	 * returned by kernel.
+	 */
+	__u32 map_ids[nr_iters + 1];
+	char jited_insns[128], xlated_insns[128], zeros[128], tp_name[128];
+	__u32 i, next_id, info_len, nr_id_found, duration = 0;
+	struct timespec real_time_ts, boot_time_ts;
+	int err = 0;
+	__u64 array_value;
+	uid_t my_uid = getuid();
+	time_t now, load_time;
+
+	err = bpf_prog_get_fd_by_id(0);
+	CHECK(err >= 0 || errno != ENOENT,
+	      "get-fd-by-notexist-prog-id", "err %d errno %d\n", err, errno);
+
+	err = bpf_map_get_fd_by_id(0);
+	CHECK(err >= 0 || errno != ENOENT,
+	      "get-fd-by-notexist-map-id", "err %d errno %d\n", err, errno);
+
+	err = bpf_link_get_fd_by_id(0);
+	CHECK(err >= 0 || errno != ENOENT,
+	      "get-fd-by-notexist-link-id", "err %d errno %d\n", err, errno);
+
+	/* Check bpf_obj_get_info_by_fd() */
+	bzero(zeros, sizeof(zeros));
+	for (i = 0; i < nr_iters; i++) {
+		now = time(NULL);
+		err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT,
+				    &objs[i], &prog_fds[i]);
+		/* test_obj_id.o is a dumb prog. It should never fail
+		 * to load.
+		 */
+		if (CHECK_FAIL(err))
+			continue;
+
+		/* Insert a magic value to the map */
+		map_fds[i] = bpf_find_map(__func__, objs[i], "test_map_id");
+		if (CHECK_FAIL(map_fds[i] < 0))
+			goto done;
+		err = bpf_map_update_elem(map_fds[i], &array_key,
+					  &array_magic_value, 0);
+		if (CHECK_FAIL(err))
+			goto done;
+
+		prog = bpf_object__find_program_by_title(objs[i],
+							 "raw_tp/sys_enter");
+		if (CHECK_FAIL(!prog))
+			goto done;
+		links[i] = bpf_program__attach(prog);
+		err = libbpf_get_error(links[i]);
+		if (CHECK(err, "prog_attach", "prog #%d, err %d\n", i, err)) {
+			links[i] = NULL;
+			goto done;
+		}
+
+		/* Check getting map info */
+		/* a doubled info_len is passed in; the kernel must report the real size */
+		info_len = sizeof(struct bpf_map_info) * 2;
+		bzero(&map_infos[i], info_len);
+		err = bpf_obj_get_info_by_fd(map_fds[i], &map_infos[i],
+					     &info_len);
+		if (CHECK(err ||
+			  map_infos[i].type != BPF_MAP_TYPE_ARRAY ||
+			  map_infos[i].key_size != sizeof(__u32) ||
+			  map_infos[i].value_size != sizeof(__u64) ||
+			  map_infos[i].max_entries != 1 ||
+			  map_infos[i].map_flags != 0 ||
+			  info_len != sizeof(struct bpf_map_info) ||
+			  strcmp((char *)map_infos[i].name, expected_map_name),
+			  "get-map-info(fd)",
+			  "err %d errno %d type %d(%d) info_len %u(%zu) key_size %u value_size %u max_entries %u map_flags %X name %s(%s)\n",
+			  err, errno,
+			  map_infos[i].type, BPF_MAP_TYPE_ARRAY,
+			  info_len, sizeof(struct bpf_map_info),
+			  map_infos[i].key_size,
+			  map_infos[i].value_size,
+			  map_infos[i].max_entries,
+			  map_infos[i].map_flags,
+			  map_infos[i].name, expected_map_name))
+			goto done;
+
+		/* Check getting prog info */
+		info_len = sizeof(struct bpf_prog_info) * 2;
+		bzero(&prog_infos[i], info_len);
+		bzero(jited_insns, sizeof(jited_insns));
+		bzero(xlated_insns, sizeof(xlated_insns));
+		prog_infos[i].jited_prog_insns = ptr_to_u64(jited_insns);
+		prog_infos[i].jited_prog_len = sizeof(jited_insns);
+		prog_infos[i].xlated_prog_insns = ptr_to_u64(xlated_insns);
+		prog_infos[i].xlated_prog_len = sizeof(xlated_insns);
+		prog_infos[i].map_ids = ptr_to_u64(map_ids + i);
+		prog_infos[i].nr_map_ids = 2;
+		err = clock_gettime(CLOCK_REALTIME, &real_time_ts);
+		if (CHECK_FAIL(err))
+			goto done;
+		err = clock_gettime(CLOCK_BOOTTIME, &boot_time_ts);
+		if (CHECK_FAIL(err))
+			goto done;
+		err = bpf_obj_get_info_by_fd(prog_fds[i], &prog_infos[i],
+					     &info_len);
+		/* (realtime - boottime) + load_time/1e9 yields a wall-clock second count */
+		load_time = (real_time_ts.tv_sec - boot_time_ts.tv_sec)
+			+ (prog_infos[i].load_time / nsec_per_sec);
+		if (CHECK(err ||
+			  prog_infos[i].type != BPF_PROG_TYPE_RAW_TRACEPOINT ||
+			  info_len != sizeof(struct bpf_prog_info) ||
+			  (env.jit_enabled && !prog_infos[i].jited_prog_len) ||
+			  (env.jit_enabled &&
+			   !memcmp(jited_insns, zeros, sizeof(zeros))) ||
+			  !prog_infos[i].xlated_prog_len ||
+			  !memcmp(xlated_insns, zeros, sizeof(zeros)) ||
+			  load_time < now - 60 || load_time > now + 60 ||
+			  prog_infos[i].created_by_uid != my_uid ||
+			  prog_infos[i].nr_map_ids != 1 ||
+			  *(int *)(long)prog_infos[i].map_ids != map_infos[i].id ||
+			  strcmp((char *)prog_infos[i].name, expected_prog_name),
+			  "get-prog-info(fd)",
+			  "err %d errno %d i %d type %d(%d) info_len %u(%zu) "
+			  "jit_enabled %d jited_prog_len %u xlated_prog_len %u "
+			  "jited_prog %d xlated_prog %d load_time %lu(%lu) "
+			  "uid %u(%u) nr_map_ids %u(%u) map_id %u(%u) "
+			  "name %s(%s)\n",
+			  err, errno, i,
+			  /* print the type the condition actually compares against */
+			  prog_infos[i].type, BPF_PROG_TYPE_RAW_TRACEPOINT,
+			  info_len, sizeof(struct bpf_prog_info),
+			  env.jit_enabled,
+			  prog_infos[i].jited_prog_len,
+			  prog_infos[i].xlated_prog_len,
+			  !!memcmp(jited_insns, zeros, sizeof(zeros)),
+			  !!memcmp(xlated_insns, zeros, sizeof(zeros)),
+			  load_time, now,
+			  prog_infos[i].created_by_uid, my_uid,
+			  prog_infos[i].nr_map_ids, 1,
+			  *(int *)(long)prog_infos[i].map_ids, map_infos[i].id,
+			  prog_infos[i].name, expected_prog_name))
+			goto done;
+
+		/* Check getting link info */
+		info_len = sizeof(struct bpf_link_info) * 2;
+		bzero(&link_infos[i], info_len);
+		link_infos[i].raw_tracepoint.tp_name = ptr_to_u64(&tp_name);
+		link_infos[i].raw_tracepoint.tp_name_len = sizeof(tp_name);
+		err = bpf_obj_get_info_by_fd(bpf_link__fd(links[i]),
+					     &link_infos[i], &info_len);
+		if (CHECK(err ||
+			  link_infos[i].type != BPF_LINK_TYPE_RAW_TRACEPOINT ||
+			  link_infos[i].prog_id != prog_infos[i].id ||
+			  link_infos[i].raw_tracepoint.tp_name != ptr_to_u64(&tp_name) ||
+			  strcmp(u64_to_ptr(link_infos[i].raw_tracepoint.tp_name),
+				 "sys_enter") ||
+			  info_len != sizeof(struct bpf_link_info),
+			  "get-link-info(fd)",
+			  "err %d errno %d info_len %u(%zu) type %d(%d) id %d "
+			  "prog_id %d (%d) tp_name %s(%s)\n",
+			  err, errno,
+			  info_len, sizeof(struct bpf_link_info),
+			  link_infos[i].type, BPF_LINK_TYPE_RAW_TRACEPOINT,
+			  link_infos[i].id,
+			  link_infos[i].prog_id, prog_infos[i].id,
+			  (const char *)u64_to_ptr(link_infos[i].raw_tracepoint.tp_name),
+			  "sys_enter"))
+			goto done;
+
+	}
+
+	/* Check bpf_prog_get_next_id() */
+	nr_id_found = 0;
+	next_id = 0;
+	while (!bpf_prog_get_next_id(next_id, &next_id)) {
+		struct bpf_prog_info prog_info = {};
+		__u32 saved_map_id;
+		int prog_fd;
+
+		info_len = sizeof(prog_info);
+
+		prog_fd = bpf_prog_get_fd_by_id(next_id);
+		if (prog_fd < 0 && errno == ENOENT)
+			/* The bpf_prog is in the dead row */
+			continue;
+		if (CHECK(prog_fd < 0, "get-prog-fd(next_id)",
+			  "prog_fd %d next_id %d errno %d\n",
+			  prog_fd, next_id, errno))
+			break;
+
+		for (i = 0; i < nr_iters; i++)
+			if (prog_infos[i].id == next_id)
+				break;
+
+		if (i == nr_iters) {
+			/* not one of ours: drop the fd before moving on */
+			close(prog_fd);
+			continue;
+		}
+
+		nr_id_found++;
+
+		/* Negative test:
+		 * prog_info.nr_map_ids = 1
+		 * prog_info.map_ids = NULL
+		 */
+		prog_info.nr_map_ids = 1;
+		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+		if (CHECK(!err || errno != EFAULT,
+			  "get-prog-fd-bad-nr-map-ids", "err %d errno %d(%d)\n",
+			  err, errno, EFAULT)) {
+			close(prog_fd);
+			break;
+		}
+		bzero(&prog_info, sizeof(prog_info));
+		info_len = sizeof(prog_info);
+
+		saved_map_id = *(int *)((long)prog_infos[i].map_ids);
+		prog_info.map_ids = prog_infos[i].map_ids;
+		prog_info.nr_map_ids = 2;
+		err = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len);
+		/* the insn buffer pointers were not passed this time, so clear
+		 * them in the reference copy before the memcmp()
+		 */
+		prog_infos[i].jited_prog_insns = 0;
+		prog_infos[i].xlated_prog_insns = 0;
+		CHECK(err || info_len != sizeof(struct bpf_prog_info) ||
+		      memcmp(&prog_info, &prog_infos[i], info_len) ||
+		      *(int *)(long)prog_info.map_ids != saved_map_id,
+		      "get-prog-info(next_id->fd)",
+		      "err %d errno %d info_len %u(%zu) memcmp %d map_id %u(%u)\n",
+		      err, errno, info_len, sizeof(struct bpf_prog_info),
+		      memcmp(&prog_info, &prog_infos[i], info_len),
+		      *(int *)(long)prog_info.map_ids, saved_map_id);
+		close(prog_fd);
+	}
+	CHECK(nr_id_found != nr_iters,
+	      "check total prog id found by get_next_id",
+	      "nr_id_found %u(%u)\n",
+	      nr_id_found, nr_iters);
+
+	/* Check bpf_map_get_next_id() */
+	nr_id_found = 0;
+	next_id = 0;
+	while (!bpf_map_get_next_id(next_id, &next_id)) {
+		struct bpf_map_info map_info = {};
+		int map_fd;
+
+		info_len = sizeof(map_info);
+
+		map_fd = bpf_map_get_fd_by_id(next_id);
+		if (map_fd < 0 && errno == ENOENT)
+			/* The bpf_map is in the dead row */
+			continue;
+		if (CHECK(map_fd < 0, "get-map-fd(next_id)",
+			  "map_fd %d next_id %u errno %d\n",
+			  map_fd, next_id, errno))
+			break;
+
+		for (i = 0; i < nr_iters; i++)
+			if (map_infos[i].id == next_id)
+				break;
+
+		if (i == nr_iters) {
+			/* not one of ours: drop the fd before moving on */
+			close(map_fd);
+			continue;
+		}
+
+		nr_id_found++;
+
+		err = bpf_map_lookup_elem(map_fd, &array_key, &array_value);
+		if (CHECK_FAIL(err)) {
+			close(map_fd);
+			goto done;
+		}
+
+		err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+		CHECK(err || info_len != sizeof(struct bpf_map_info) ||
+		      memcmp(&map_info, &map_infos[i], info_len) ||
+		      array_value != array_magic_value,
+		      "check get-map-info(next_id->fd)",
+		      "err %d errno %d info_len %u(%zu) memcmp %d array_value %llu(%llu)\n",
+		      err, errno, info_len, sizeof(struct bpf_map_info),
+		      memcmp(&map_info, &map_infos[i], info_len),
+		      array_value, array_magic_value);
+
+		close(map_fd);
+	}
+	CHECK(nr_id_found != nr_iters,
+	      "check total map id found by get_next_id",
+	      "nr_id_found %u(%u)\n",
+	      nr_id_found, nr_iters);
+
+	/* Check bpf_link_get_next_id() */
+	nr_id_found = 0;
+	next_id = 0;
+	while (!bpf_link_get_next_id(next_id, &next_id)) {
+		struct bpf_link_info link_info;
+		int link_fd, cmp_res;
+
+		info_len = sizeof(link_info);
+		memset(&link_info, 0, info_len);
+
+		link_fd = bpf_link_get_fd_by_id(next_id);
+		if (link_fd < 0 && errno == ENOENT)
+			/* The bpf_link is in the dead row */
+			continue;
+		if (CHECK(link_fd < 0, "get-link-fd(next_id)",
+			  "link_fd %d next_id %u errno %d\n",
+			  link_fd, next_id, errno))
+			break;
+
+		for (i = 0; i < nr_iters; i++)
+			if (link_infos[i].id == next_id)
+				break;
+
+		if (i == nr_iters) {
+			/* not one of ours: drop the fd before moving on */
+			close(link_fd);
+			continue;
+		}
+
+		nr_id_found++;
+
+		err = bpf_obj_get_info_by_fd(link_fd, &link_info, &info_len);
+		/* compare only the fields that precede raw_tracepoint */
+		cmp_res = memcmp(&link_info, &link_infos[i],
+				 offsetof(struct bpf_link_info, raw_tracepoint));
+		CHECK(err || info_len != sizeof(link_info) || cmp_res,
+		      "check get-link-info(next_id->fd)",
+		      "err %d errno %d info_len %u(%zu) memcmp %d\n",
+		      err, errno, info_len, sizeof(struct bpf_link_info),
+		      cmp_res);
+
+		close(link_fd);
+	}
+	CHECK(nr_id_found != nr_iters,
+	      "check total link id found by get_next_id",
+	      "nr_id_found %u(%u)\n", nr_id_found, nr_iters);
+
+done:
+	for (i = 0; i < nr_iters; i++) {
+		bpf_link__destroy(links[i]);
+		bpf_object__close(objs[i]);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
new file mode 100644
index 000000000..37c5494a0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/err.h>
+#include <netinet/tcp.h>
+#include <test_progs.h>
+#include "bpf_dctcp.skel.h"
+#include "bpf_cubic.skel.h"
+
+/* Smaller of two values; note that each argument may be evaluated twice. */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+/* Number of bytes the server thread sends and the client expects (10 MiB). */
+static const unsigned int total_bytes = 10 * 1024 * 1024;
+/* Send/receive timeout applied by settimeo(): 10 seconds. */
+static const struct timeval timeo_sec = { .tv_sec = 10 };
+static const size_t timeo_optlen = sizeof(timeo_sec);
+/* Value do_test() stores in sk_stg_map; test_dctcp() compares stg_result to it. */
+static int expected_stg = 0xeB9F;
+/* stop: flag polled by the send and recv loops to end them early.
+ * duration: not referenced directly in the visible code; presumably consumed
+ * by the CHECK() macro - confirm against test_progs.h.
+ */
+static int stop, duration;
+
+/* Apply timeo_sec as both the receive and the send timeout of @fd.
+ * Returns 0 when both options were set, -1 as soon as one of them fails.
+ */
+static int settimeo(int fd)
+{
+	int rc;
+
+	rc = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+			timeo_optlen);
+	if (!CHECK(rc == -1, "setsockopt(fd, SO_RCVTIMEO)", "errno:%d\n",
+		   errno)) {
+		rc = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
+				timeo_optlen);
+		if (!CHECK(rc == -1, "setsockopt(fd, SO_SNDTIMEO)",
+			   "errno:%d\n", errno))
+			return 0;
+	}
+
+	return -1;
+}
+
+/* Select the congestion control algorithm named @tcp_ca on socket @fd.
+ * Returns 0 on success and -1 when setsockopt() fails.
+ */
+static int settcpca(int fd, const char *tcp_ca)
+{
+	int rc = setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, tcp_ca,
+			    strlen(tcp_ca));
+
+	return CHECK(rc == -1, "setsockopt(fd, TCP_CONGESTION)", "errno:%d\n",
+		     errno) ? -1 : 0;
+}
+
+/* Server thread body: accept one connection on the listening socket passed
+ * in @arg (an fd cast to a pointer) and send total_bytes to the peer, or
+ * stop early once the stop flag is set.
+ *
+ * Returns NULL on success.  On failure the stop flag is raised so the client
+ * loop ends too, and ERR_PTR(-errno) is returned.
+ */
+static void *server(void *arg)
+{
+	int lfd = (int)(long)arg, err = 0, fd;
+	ssize_t nr_sent = 0, bytes = 0;
+	char batch[1500];
+
+	/* Re-issue accept() when it is interrupted by a signal.  Testing
+	 * errno again without calling accept() again would spin forever.
+	 */
+	do {
+		fd = accept(lfd, NULL, NULL);
+	} while (fd == -1 && errno == EINTR);
+	if (fd == -1) {
+		err = -errno;
+		goto done;
+	}
+
+	if (settimeo(fd)) {
+		err = -errno;
+		goto done;
+	}
+
+	/* the payload content is irrelevant, only the byte count matters */
+	while (bytes < total_bytes && !READ_ONCE(stop)) {
+		nr_sent = send(fd, &batch,
+			       min(total_bytes - bytes, sizeof(batch)), 0);
+		if (nr_sent == -1 && errno == EINTR)
+			continue;
+		if (nr_sent == -1) {
+			err = -errno;
+			break;
+		}
+		bytes += nr_sent;
+	}
+
+	CHECK(bytes != total_bytes, "send", "%zd != %u nr_sent:%zd errno:%d\n",
+	      bytes, total_bytes, nr_sent, errno);
+
+done:
+	if (fd != -1)
+		close(fd);
+	if (err) {
+		WRITE_ONCE(stop, 1);
+		return ERR_PTR(err);
+	}
+	return NULL;
+}
+
+/* Transfer total_bytes from a server thread to this thread over a loopback
+ * IPv6 TCP connection on which both sockets use congestion control @tcp_ca.
+ *
+ * @tcp_ca:     name handed to settcpca() for the listening and client sockets
+ * @sk_stg_map: optional map; when non-NULL, expected_stg is stored for the
+ *              client socket before connect() and the entry is expected to
+ *              be gone (ENOENT) after connect()
+ *
+ * Failures are reported through CHECK(); nothing is returned.
+ */
+static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map)
+{
+ struct sockaddr_in6 sa6 = {};
+ ssize_t nr_recv = 0, bytes = 0;
+ int lfd = -1, fd = -1;
+ pthread_t srv_thread;
+ socklen_t addrlen = sizeof(sa6);
+ void *thread_ret;
+ char batch[1500];
+ int err;
+
+ /* reset the flag shared with the server thread */
+ WRITE_ONCE(stop, 0);
+
+ lfd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (CHECK(lfd == -1, "socket", "errno:%d\n", errno))
+ return;
+ fd = socket(AF_INET6, SOCK_STREAM, 0);
+ if (CHECK(fd == -1, "socket", "errno:%d\n", errno)) {
+ close(lfd);
+ return;
+ }
+
+ if (settcpca(lfd, tcp_ca) || settcpca(fd, tcp_ca) ||
+ settimeo(lfd) || settimeo(fd))
+ goto done;
+
+ /* bind, listen and start server thread to accept */
+ sa6.sin6_family = AF_INET6;
+ sa6.sin6_addr = in6addr_loopback;
+ err = bind(lfd, (struct sockaddr *)&sa6, addrlen);
+ if (CHECK(err == -1, "bind", "errno:%d\n", errno))
+ goto done;
+ /* sin6_port was left at zero above; read back the address actually bound */
+ err = getsockname(lfd, (struct sockaddr *)&sa6, &addrlen);
+ if (CHECK(err == -1, "getsockname", "errno:%d\n", errno))
+ goto done;
+ err = listen(lfd, 1);
+ if (CHECK(err == -1, "listen", "errno:%d\n", errno))
+ goto done;
+
+ /* seed the storage of the client socket before it connects */
+ if (sk_stg_map) {
+ err = bpf_map_update_elem(bpf_map__fd(sk_stg_map), &fd,
+ &expected_stg, BPF_NOEXIST);
+ if (CHECK(err, "bpf_map_update_elem(sk_stg_map)",
+ "err:%d errno:%d\n", err, errno))
+ goto done;
+ }
+
+ /* connect to server */
+ err = connect(fd, (struct sockaddr *)&sa6, addrlen);
+ if (CHECK(err == -1, "connect", "errno:%d\n", errno))
+ goto done;
+
+ /* The lookup must now fail with ENOENT.
+ * NOTE(review): presumably the BPF program removes the entry while the
+ * connection is set up - confirm against the BPF side.
+ */
+ if (sk_stg_map) {
+ int tmp_stg;
+
+ err = bpf_map_lookup_elem(bpf_map__fd(sk_stg_map), &fd,
+ &tmp_stg);
+ if (CHECK(!err || errno != ENOENT,
+ "bpf_map_lookup_elem(sk_stg_map)",
+ "err:%d errno:%d\n", err, errno))
+ goto done;
+ }
+
+ /* the server thread is only started once connect() has returned */
+ err = pthread_create(&srv_thread, NULL, server, (void *)(long)lfd);
+ if (CHECK(err != 0, "pthread_create", "err:%d errno:%d\n", err, errno))
+ goto done;
+
+ /* recv total_bytes */
+ while (bytes < total_bytes && !READ_ONCE(stop)) {
+ nr_recv = recv(fd, &batch,
+ min(total_bytes - bytes, sizeof(batch)), 0);
+ if (nr_recv == -1 && errno == EINTR)
+ continue;
+ if (nr_recv == -1)
+ break;
+ bytes += nr_recv;
+ }
+
+ CHECK(bytes != total_bytes, "recv", "%zd != %u nr_recv:%zd errno:%d\n",
+ bytes, total_bytes, nr_recv, errno);
+
+ /* make the server loop end as well, then collect its result */
+ WRITE_ONCE(stop, 1);
+ pthread_join(srv_thread, &thread_ret);
+ CHECK(IS_ERR(thread_ret), "pthread_join", "thread_ret:%ld",
+ PTR_ERR(thread_ret));
+done:
+ close(lfd);
+ close(fd);
+}
+
+/* Load the bpf_cubic skeleton, register its struct_ops map and run the
+ * data-transfer test with "bpf_cubic" as the congestion control name.
+ */
+static void test_cubic(void)
+{
+	struct bpf_cubic *skel = bpf_cubic__open_and_load();
+	struct bpf_link *ops_link;
+
+	if (CHECK(!skel, "bpf_cubic__open_and_load", "failed\n"))
+		return;
+
+	ops_link = bpf_map__attach_struct_ops(skel->maps.cubic);
+	if (CHECK(IS_ERR(ops_link), "bpf_map__attach_struct_ops", "err:%ld\n",
+		  PTR_ERR(ops_link)))
+		goto destroy_skel;
+
+	do_test("bpf_cubic", NULL);
+
+	bpf_link__destroy(ops_link);
+destroy_skel:
+	bpf_cubic__destroy(skel);
+}
+
+/* Load the bpf_dctcp skeleton, register its struct_ops map, run the
+ * data-transfer test with "bpf_dctcp" and the skeleton's sk_stg_map, then
+ * check that the BPF side recorded expected_stg in stg_result.
+ */
+static void test_dctcp(void)
+{
+	struct bpf_dctcp *skel = bpf_dctcp__open_and_load();
+	struct bpf_link *ops_link;
+
+	if (CHECK(!skel, "bpf_dctcp__open_and_load", "failed\n"))
+		return;
+
+	ops_link = bpf_map__attach_struct_ops(skel->maps.dctcp);
+	if (CHECK(IS_ERR(ops_link), "bpf_map__attach_struct_ops", "err:%ld\n",
+		  PTR_ERR(ops_link)))
+		goto destroy_skel;
+
+	do_test("bpf_dctcp", skel->maps.sk_stg_map);
+	CHECK(skel->bss->stg_result != expected_stg,
+	      "Unexpected stg_result", "stg_result (%x) != expected_stg (%x)\n",
+	      skel->bss->stg_result, expected_stg);
+
+	bpf_link__destroy(ops_link);
+destroy_skel:
+	bpf_dctcp__destroy(skel);
+}
+
+/* Entry point of the bpf_tcp_ca test: runs the dctcp subtest and then the
+ * cubic subtest, each only when test__start_subtest() selects it.
+ */
+void test_bpf_tcp_ca(void)
+{
+	if (test__start_subtest("dctcp")) {
+		test_dctcp();
+	}
+
+	if (test__start_subtest("cubic")) {
+		test_cubic();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
new file mode 100644
index 000000000..e698ee6bb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <test_progs.h>
+/* libbpf print callback used while collecting verifier statistics.
+ *
+ * Messages below debug level are printed unchanged.  Debug messages are
+ * dropped unless their format string mentions "verifier log".
+ *
+ * Always returns 0.
+ */
+static int libbpf_debug_print(enum libbpf_print_level level,
+			      const char *format, va_list args)
+{
+	if (level == LIBBPF_DEBUG) {
+		/* Deliberately print with "%s" rather than 'format': only the
+		 * first vararg is emitted (presumably the verifier log buffer
+		 * itself -- confirm against libbpf), not the surrounding text.
+		 */
+		if (strstr(format, "verifier log"))
+			vprintf("%s", args);
+	} else {
+		vprintf(format, args);
+	}
+
+	return 0;
+}
+
+extern int extra_prog_load_log_flags;
+
+/* Load the BPF object 'file' as program type 'type' and close it again.
+ *
+ * Returns the result of bpf_prog_load_xattr().
+ */
+static int check_load(const char *file, enum bpf_prog_type type)
+{
+	struct bpf_prog_load_attr load_attr;
+	struct bpf_object *bpf_obj = NULL;
+	int prog_fd, ret;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.file = file;
+	load_attr.prog_type = type;
+	/* Bit 4 is presumably the verifier-stats log level -- confirm; extra
+	 * bits come from the externally defined extra_prog_load_log_flags.
+	 */
+	load_attr.log_level = 4 | extra_prog_load_log_flags;
+	load_attr.prog_flags = BPF_F_TEST_RND_HI32;
+
+	ret = bpf_prog_load_xattr(&load_attr, &bpf_obj, &prog_fd);
+	/* Closed unconditionally, i.e. also when the load reported an error. */
+	bpf_object__close(bpf_obj);
+
+	return ret;
+}
+
+struct scale_test_def { /* one entry of the table in test_bpf_verif_scale() */
+ const char *file; /* BPF object file to load; also used as the subtest name */
+ enum bpf_prog_type attach_type; /* passed to check_load() as the program type */
+ bool fails; /* true when loading this object is expected to fail */
+};
+
+/* Load every object in the table below as its own subtest and check that
+ * the load outcome matches the entry's 'fails' flag: objects marked as
+ * failing must be rejected, all others must load.
+ *
+ * When env.verifier_stats is set, logging is forced and libbpf output is
+ * routed through libbpf_debug_print() for the duration of the test; the
+ * previous print callback is restored before returning.
+ */
+void test_bpf_verif_scale(void)
+{
+	struct scale_test_def tests[] = {
+		{ "loop3.o", BPF_PROG_TYPE_RAW_TRACEPOINT, true /* fails */ },
+
+		{ "test_verif_scale1.o", BPF_PROG_TYPE_SCHED_CLS },
+		{ "test_verif_scale2.o", BPF_PROG_TYPE_SCHED_CLS },
+		{ "test_verif_scale3.o", BPF_PROG_TYPE_SCHED_CLS },
+
+		{ "pyperf_global.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "pyperf_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		/* full unroll by llvm */
+		{ "pyperf50.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "pyperf100.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "pyperf180.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		/* partial unroll. llvm will unroll loop ~150 times.
+		 * C loop count -> 600.
+		 * Asm loop count -> 4.
+		 * 16k insns in loop body.
+		 * Total of 5 such loops. Total program size ~82k insns.
+		 */
+		{ "pyperf600.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		/* no unroll at all.
+		 * C loop count -> 600.
+		 * ASM loop count -> 600.
+		 * ~110 insns in loop body.
+		 * Total of 5 such loops. Total program size ~1500 insns.
+		 */
+		{ "pyperf600_nounroll.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		{ "loop1.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "loop2.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "loop4.o", BPF_PROG_TYPE_SCHED_CLS },
+		{ "loop5.o", BPF_PROG_TYPE_SCHED_CLS },
+
+		/* partial unroll. 19k insn in a loop.
+		 * Total program size 20.8k insn.
+		 * ~350k processed_insns
+		 */
+		{ "strobemeta.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		/* no unroll, tiny loops */
+		{ "strobemeta_nounroll1.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+		{ "strobemeta_nounroll2.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		/* non-inlined subprogs */
+		{ "strobemeta_subprogs.o", BPF_PROG_TYPE_RAW_TRACEPOINT },
+
+		{ "test_sysctl_loop1.o", BPF_PROG_TYPE_CGROUP_SYSCTL },
+		{ "test_sysctl_loop2.o", BPF_PROG_TYPE_CGROUP_SYSCTL },
+
+		{ "test_xdp_loop.o", BPF_PROG_TYPE_XDP },
+		{ "test_seg6_loop.o", BPF_PROG_TYPE_LWT_SEG6LOCAL },
+	};
+	libbpf_print_fn_t old_print_fn = NULL;
+	int err, i;
+
+	if (env.verifier_stats) {
+		test__force_log();
+		old_print_fn = libbpf_set_print(libbpf_debug_print);
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		const struct scale_test_def *test = &tests[i];
+
+		if (!test__start_subtest(test->file))
+			continue;
+
+		err = check_load(test->file, test->attach_type);
+		/* Check both directions: previously an object marked as
+		 * failing that unexpectedly loaded went unnoticed.  The two
+		 * checks sit on separate lines so the reported line number
+		 * tells the two failure modes apart.
+		 */
+		if (test->fails)
+			CHECK_FAIL(!err);
+		else
+			CHECK_FAIL(err);
+	}
+
+	if (env.verifier_stats)
+		libbpf_set_print(old_print_fn);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c
new file mode 100644
index 000000000..28d22265b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf.c
@@ -0,0 +1,6839 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
+#include <bpf/bpf.h>
+#include <sys/resource.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "../test_btf.h"
+#include "test_progs.h"
+
+#define MAX_INSNS 512
+#define MAX_SUBPROGS 16
+
+static int duration = 0; /* handed to _CHECK() by the CHECK() override below */
+static bool always_log;
+
+#undef CHECK /* drop the earlier CHECK() definition (presumably from test_progs.h) so it can be redefined */
+#define CHECK(condition, format...) _CHECK(condition, "check", duration, format) /* no tag argument: the tag is always "check" */
+
+/* Sentinel that ends every raw_types[] list in the test tables. */
+#define BTF_END_RAW 0xdeadbeef
+/* Placeholder used where a type or member name offset goes; presumably
+ * replaced with a real offset into the test's string section when the raw
+ * BTF is assembled -- confirm against the builder code.
+ */
+#define NAME_TBD 0xdeadb33f
+
+/* NAME_NTH() tags a value by setting all of the upper 16 bits and keeping
+ * an index in the lower 16 bits; IS_NAME_NTH() tests for that tag and
+ * GET_NAME_NTH_IDX() extracts the index again.  Arguments are parenthesized
+ * so that expression arguments such as "a | b" expand correctly.
+ */
+#define NAME_NTH(N) (0xffff0000 | (N))
+#define IS_NAME_NTH(X) (((X) & 0xffff0000) == 0xffff0000)
+#define GET_NAME_NTH_IDX(X) ((X) & 0x0000ffff)
+
+#define MAX_NR_RAW_U32 1024 /* capacity, in __u32 words, of btf_raw_test.raw_types[] */
+#define BTF_LOG_BUF_SIZE 65535 /* size in bytes of btf_log_buf[] */
+
+static char btf_log_buf[BTF_LOG_BUF_SIZE];
+
+static struct btf_header hdr_tmpl = { /* header template: only these three fields are set, the rest start as zero */
+ .magic = BTF_MAGIC,
+ .version = BTF_VERSION,
+ .hdr_len = sizeof(struct btf_header),
+};
+
+/* The different mapv kinds (types) supported by pprint; a test picks one
+ * through btf_raw_test.mapv_kind.
+ */
+enum pprint_mapv_kind_t {
+ PPRINT_MAPV_KIND_BASIC = 0,
+ PPRINT_MAPV_KIND_INT128,
+};
+
+struct btf_raw_test { /* one raw BTF test case; see the raw_tests[] table */
+ const char *descr; /* human-readable test name */
+ const char *str_sec; /* string section contents: NUL-separated names */
+ const char *map_name;
+ const char *err_str; /* error text expected from a failing case -- presumably looked up in the load log; confirm */
+ __u32 raw_types[MAX_NR_RAW_U32]; /* encoded type section, terminated by BTF_END_RAW */
+ __u32 str_sec_size; /* size of str_sec in bytes; table entries use sizeof() on the literal */
+ enum bpf_map_type map_type;
+ __u32 key_size;
+ __u32 value_size;
+ __u32 key_type_id; /* type id within raw_types; ids are numbered from 1 in the table comments */
+ __u32 value_type_id;
+ __u32 max_entries;
+ bool btf_load_err; /* true when loading this BTF is expected to fail */
+ bool map_create_err;
+ bool ordered_map;
+ bool lossless_map;
+ bool percpu_map;
+ int hdr_len_delta;
+ int type_off_delta;
+ int str_off_delta;
+ int str_len_delta;
+ enum pprint_mapv_kind_t mapv_kind;
+};
+
+#define BTF_STR_SEC(str) \
+ .str_sec = str, .str_sec_size = sizeof(str) /* initializes both fields at once; sizeof() only gives the full size when str is a string literal or array, not a pointer */
+
+static struct btf_raw_test raw_tests[] = {
+/* enum E {
+ * E0,
+ * E1,
+ * };
+ *
+ * struct A {
+ * unsigned long long m;
+ * int n;
+ * char o;
+ * [3 bytes hole]
+ * int p[8];
+ * int q[4][8];
+ * enum E r;
+ * };
+ */
+{
+ .descr = "struct test #1",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 6), 180),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ BTF_MEMBER_ENC(NAME_TBD, 6, 384),/* int q[4][8] */
+ BTF_MEMBER_ENC(NAME_TBD, 7, 1408), /* enum E r */
+ /* } */
+ /* int[4][8] */
+ BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [6] */
+ /* enum E */ /* [7] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0q\0r\0E\0E0\0E1"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_test1_map",
+ .key_size = sizeof(int),
+ .value_size = 180,
+ .key_type_id = 1,
+ .value_type_id = 5,
+ .max_entries = 4,
+},
+
+/* typedef struct b Struct_B;
+ *
+ * struct A {
+ * int m;
+ * struct b n[4];
+ * const Struct_B o[4];
+ * };
+ *
+ * struct B {
+ * int m;
+ * int n;
+ * };
+ */
+{
+ .descr = "struct test #2",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* struct b [4] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(4, 1, 4),
+
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 3), 68),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct B n[4] */
+ BTF_MEMBER_ENC(NAME_TBD, 8, 288),/* const Struct_B o[4];*/
+ /* } */
+
+ /* struct B { */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */
+ /* } */
+
+ /* const int */ /* [5] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),
+ /* typedef struct b Struct_B */ /* [6] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), 4),
+ /* const Struct_B */ /* [7] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 6),
+ /* const Struct_B [4] */ /* [8] */
+ BTF_TYPE_ARRAY_ENC(7, 1, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0B\0m\0n\0Struct_B",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0B\0m\0n\0Struct_B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_test2_map",
+ .key_size = sizeof(int),
+ .value_size = 68,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+},
+{
+ .descr = "struct test #3 Invalid member offset",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int64 */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 64, 8),
+
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 16),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* int64 n; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0",
+ .str_sec_size = sizeof("\0A\0m\0n\0"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_test3_map",
+ .key_size = sizeof(int),
+ .value_size = 16,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member bits_offset",
+},
+/*
+ * struct A {
+ * unsigned long long m;
+ * int n;
+ * char o;
+ * [3 bytes hole]
+ * int p[8];
+ * };
+ */
+{
+ .descr = "global data test #1",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_test1_map",
+ .key_size = sizeof(int),
+ .value_size = 48,
+ .key_type_id = 1,
+ .value_type_id = 5,
+ .max_entries = 4,
+},
+/*
+ * struct A {
+ * unsigned long long m;
+ * int n;
+ * char o;
+ * [3 bytes hole]
+ * int p[8];
+ * };
+ * static struct A t; <- in .bss
+ */
+{
+ .descr = "global data test #2",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* .bss section */ /* [7] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 48),
+ BTF_VAR_SECINFO_ENC(6, 0, 48),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 48,
+ .key_type_id = 0,
+ .value_type_id = 7,
+ .max_entries = 1,
+},
+{
+ .descr = "global data test #3",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* static int t */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0t\0.bss",
+ .str_sec_size = sizeof("\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 3,
+ .max_entries = 1,
+},
+{
+ .descr = "global data test #4, unsupported linkage",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* static int t */
+ BTF_VAR_ENC(NAME_TBD, 1, 2), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0t\0.bss",
+ .str_sec_size = sizeof("\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 3,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Linkage not supported",
+},
+{
+ .descr = "global data test #5, invalid var type",
+ .raw_types = {
+ /* static void t */
+ BTF_VAR_ENC(NAME_TBD, 0, 0), /* [1] */
+ /* .bss section */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(1, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0t\0.bss",
+ .str_sec_size = sizeof("\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 2,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #6, invalid var type (fwd type)",
+ .raw_types = {
+ /* union A */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_FWD, 1, 0), 0), /* [1] */
+ /* static union A t */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 2,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type",
+},
+{
+ .descr = "global data test #7, invalid var type (fwd type)",
+ .raw_types = {
+ /* union A */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_FWD, 1, 0), 0), /* [1] */
+ /* static union A t */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(1, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 2,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type",
+},
+{
+ .descr = "global data test #8, invalid var size",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* .bss section */ /* [7] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 48),
+ BTF_VAR_SECINFO_ENC(6, 0, 47),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 48,
+ .key_type_id = 0,
+ .value_type_id = 7,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid size",
+},
+{
+ .descr = "global data test #9, invalid var size",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* .bss section */ /* [7] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 46),
+ BTF_VAR_SECINFO_ENC(6, 0, 48),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 48,
+ .key_type_id = 0,
+ .value_type_id = 7,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid size",
+},
+{
+ .descr = "global data test #10, invalid var size",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* .bss section */ /* [7] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 46),
+ BTF_VAR_SECINFO_ENC(6, 0, 46),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 48,
+ .key_type_id = 0,
+ .value_type_id = 7,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid size",
+},
+{
+ .descr = "global data test #11, multiple section members",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* static int u */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [7] */
+ /* .bss section */ /* [8] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2), 62),
+ BTF_VAR_SECINFO_ENC(6, 10, 48),
+ BTF_VAR_SECINFO_ENC(7, 58, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0u\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0u\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 62,
+ .key_type_id = 0,
+ .value_type_id = 8,
+ .max_entries = 1,
+},
+{
+ .descr = "global data test #12, invalid offset",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* static int u */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [7] */
+ /* .bss section */ /* [8] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2), 62),
+ BTF_VAR_SECINFO_ENC(6, 10, 48),
+ BTF_VAR_SECINFO_ENC(7, 60, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0u\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0u\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 62,
+ .key_type_id = 0,
+ .value_type_id = 8,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid offset+size",
+},
+{
+ .descr = "global data test #13, invalid offset",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* static int u */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [7] */
+ /* .bss section */ /* [8] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2), 62),
+ BTF_VAR_SECINFO_ENC(6, 10, 48),
+ BTF_VAR_SECINFO_ENC(7, 12, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0u\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0u\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 62,
+ .key_type_id = 0,
+ .value_type_id = 8,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid offset",
+},
+{
+ .descr = "global data test #14, invalid offset",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* unsigned long long */
+ BTF_TYPE_INT_ENC(0, 0, 0, 64, 8), /* [2] */
+ /* char */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1), /* [3] */
+ /* int[8] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 8), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 4), 48),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* unsigned long long m;*/
+ BTF_MEMBER_ENC(NAME_TBD, 1, 64),/* int n; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 96),/* char o; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 128),/* int p[8] */
+ /* } */
+ /* static struct A t */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ /* static int u */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [7] */
+ /* .bss section */ /* [8] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2), 62),
+ BTF_VAR_SECINFO_ENC(7, 58, 4),
+ BTF_VAR_SECINFO_ENC(6, 10, 48),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n\0o\0p\0t\0u\0.bss",
+ .str_sec_size = sizeof("\0A\0m\0n\0o\0p\0t\0u\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 62,
+ .key_type_id = 0,
+ .value_type_id = 8,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid offset",
+},
+{
+ .descr = "global data test #15, not var kind",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(1, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0.bss",
+ .str_sec_size = sizeof("\0A\0t\0.bss"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 3,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Not a VAR kind member",
+},
+{
+ .descr = "global data test #16, invalid var referencing sec",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [2] */
+ BTF_VAR_ENC(NAME_TBD, 2, 0), /* [3] */
+ /* a section */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(3, 0, 4),
+ /* a section */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(6, 0, 4),
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #17, invalid var referencing var",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [2] */
+ BTF_VAR_ENC(NAME_TBD, 2, 0), /* [3] */
+ /* a section */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(3, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #18, invalid var loop",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_VAR_ENC(NAME_TBD, 2, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0aaa",
+ .str_sec_size = sizeof("\0A\0t\0aaa"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #19, invalid var referencing var",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_VAR_ENC(NAME_TBD, 3, 0), /* [2] */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [3] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #20, invalid ptr referencing var",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* PTR type_id=3 */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [3] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+{
+ .descr = "global data test #21, var included in struct",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* struct A { */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 32),/* VAR type_id=3; */
+ /* } */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [3] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid member",
+},
+{
+ .descr = "global data test #22, array of var",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 4), /* [2] */
+ BTF_VAR_ENC(NAME_TBD, 1, 0), /* [3] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0t\0s\0a\0a",
+ .str_sec_size = sizeof("\0A\0t\0s\0a\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = 4,
+ .key_type_id = 0,
+ .value_type_id = 4,
+ .max_entries = 1,
+ .btf_load_err = true,
+ .err_str = "Invalid elem",
+},
+{
+ .descr = "var after datasec, ptr followed by modifier",
+ .raw_types = {
+ /* .bss section */ /* [1] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 2),
+ sizeof(void*)+4),
+ BTF_VAR_SECINFO_ENC(4, 0, sizeof(void*)),
+ BTF_VAR_SECINFO_ENC(6, sizeof(void*), 4),
+ /* int */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int* */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+ BTF_VAR_ENC(NAME_TBD, 3, 0), /* [4] */
+ /* const int */ /* [5] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+ BTF_VAR_ENC(NAME_TBD, 5, 0), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b\0c\0",
+ .str_sec_size = sizeof("\0a\0b\0c\0"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = ".bss",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void*)+4,
+ .key_type_id = 0,
+ .value_type_id = 1,
+ .max_entries = 1,
+},
+/* Test member exceeds the size of struct.
+ *
+ * struct A {
+ * int m;
+ * int n;
+ * };
+ */
+{
+ .descr = "size check test #1",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* struct A { */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 32),/* int n; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n",
+ .str_sec_size = sizeof("\0A\0m\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "size_check1_map",
+ .key_size = sizeof(int),
+ .value_size = 1,
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+/* Test member exceeds the size of struct
+ *
+ * struct A {
+ * int m;
+ * int n[2];
+ * };
+ */
+{
+ .descr = "size check test #2",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+ /* int[2] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 2),
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 3 - 1),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* int n[2]; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n",
+ .str_sec_size = sizeof("\0A\0m\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "size_check2_map",
+ .key_size = sizeof(int),
+ .value_size = 1,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+/* Test member exceeds the size of struct
+ *
+ * struct A {
+ * int m;
+ * void *n;
+ * };
+ */
+{
+ .descr = "size check test #3",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+ /* void* */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) + sizeof(void *) - 1),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* void *n; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0n",
+ .str_sec_size = sizeof("\0A\0m\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "size_check3_map",
+ .key_size = sizeof(int),
+ .value_size = 1,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+/* Test member exceeds the size of struct
+ *
+ * enum E {
+ * E0,
+ * E1,
+ * };
+ *
+ * struct A {
+ * int m;
+ * enum E n;
+ * };
+ */
+{
+ .descr = "size check test #4",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+ /* enum E { */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), sizeof(int)),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ /* } */
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), sizeof(int) * 2 - 1),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* enum E n; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0E\0E0\0E1\0A\0m\0n",
+ .str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "size_check4_map",
+ .key_size = sizeof(int),
+ .value_size = 1,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+/* Test member does not exceed the size of struct
+ *
+ * enum E {
+ * E0,
+ * E1,
+ * };
+ *
+ * struct A {
+ * char m;
+ * enum E __attribute__((packed)) n;
+ * };
+ */
+{
+ .descr = "size check test #5",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, sizeof(int)),
+ /* char */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 8, 1),
+ /* enum E { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 1),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ /* } */
+ /* struct A { */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 2),
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0), /* char m; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 8),/* enum E __attribute__((packed)) n; */
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0E\0E0\0E1\0A\0m\0n",
+ .str_sec_size = sizeof("\0E\0E0\0E1\0A\0m\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "size_check5_map",
+ .key_size = sizeof(int),
+ .value_size = 2,
+ .key_type_id = 1,
+ .value_type_id = 4,
+ .max_entries = 4,
+},
+
+/* typedef const void * const_void_ptr;
+ * struct A {
+ * const_void_ptr m;
+ * };
+ */
+{
+ .descr = "void test #1",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ /* const void* */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+ /* typedef const void * const_void_ptr */
+ BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */
+ /* struct A { */ /* [5] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+ /* const_void_ptr m; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 0),
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0const_void_ptr\0A\0m",
+ .str_sec_size = sizeof("\0const_void_ptr\0A\0m"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "void_test1_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *),
+ .key_type_id = 1,
+ .value_type_id = 4,
+ .max_entries = 4,
+},
+
+/* struct A {
+ * const void m;
+ * };
+ */
+{
+ .descr = "void test #2",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ /* struct A { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 8),
+ /* const void m; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ /* } */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m",
+ .str_sec_size = sizeof("\0A\0m"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "void_test2_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *),
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member",
+},
+
+/* typedef const void * const_void_ptr;
+ * const_void_ptr[4]
+ */
+{
+ .descr = "void test #3",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ /* const void* */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2),
+ /* typedef const void * const_void_ptr */
+ BTF_TYPEDEF_ENC(NAME_TBD, 3), /* [4] */
+ /* const_void_ptr[4] */
+ BTF_TYPE_ARRAY_ENC(4, 1, 4), /* [5] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0const_void_ptr",
+ .str_sec_size = sizeof("\0const_void_ptr"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "void_test3_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *) * 4,
+ .key_type_id = 1,
+ .value_type_id = 5,
+ .max_entries = 4,
+},
+
+/* const void[4] */
+{
+ .descr = "void test #4",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ /* const void[4] */ /* [3] */
+ BTF_TYPE_ARRAY_ENC(2, 1, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m",
+ .str_sec_size = sizeof("\0A\0m"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "void_test4_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *) * 4,
+ .key_type_id = 1,
+ .value_type_id = 3,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid elem",
+},
+
+/* Array_A <------------------+
+ * elem_type == Array_B |
+ * | |
+ * | |
+ * Array_B <-------- + |
+ * elem_type == Array_A --+
+ */
+{
+ .descr = "loop test #1",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* Array_A */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 8),
+ /* Array_B */ /* [3] */
+ BTF_TYPE_ARRAY_ENC(2, 1, 8),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test1_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(sizeof(int) * 8),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+/* typedef is _before_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A <------------------+
+ * elem_type == int_array |
+ * | |
+ * | |
+ * Array_B <-------- + |
+ * elem_type == Array_A --+
+ */
+{
+ .descr = "loop test #2",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* typedef Array_B int_array */
+ BTF_TYPEDEF_ENC(1, 4), /* [2] */
+ /* Array_A */
+ BTF_TYPE_ARRAY_ENC(2, 1, 8), /* [3] */
+ /* Array_B */
+ BTF_TYPE_ARRAY_ENC(3, 1, 8), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int_array\0",
+ .str_sec_size = sizeof("\0int_array"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test2_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(sizeof(int) * 8),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+/* Array_A <------------------+
+ * elem_type == Array_B |
+ * | |
+ * | |
+ * Array_B <-------- + |
+ * elem_type == Array_A --+
+ */
+{
+ .descr = "loop test #3",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* Array_A */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 8),
+ /* Array_B */ /* [3] */
+ BTF_TYPE_ARRAY_ENC(2, 1, 8),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test3_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(sizeof(int) * 8),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+/* typedef is _between_ the BTF type of Array_A and Array_B
+ *
+ * typedef Array_B int_array;
+ *
+ * Array_A <------------------+
+ * elem_type == int_array |
+ * | |
+ * | |
+ * Array_B <-------- + |
+ * elem_type == Array_A --+
+ */
+{
+ .descr = "loop test #4",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* Array_A */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 8),
+ /* typedef Array_B int_array */ /* [3] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4),
+ /* Array_B */ /* [4] */
+ BTF_TYPE_ARRAY_ENC(2, 1, 8),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int_array\0",
+ .str_sec_size = sizeof("\0int_array"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test4_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(sizeof(int) * 8),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+/* typedef struct B Struct_B
+ *
+ * struct A {
+ * int x;
+ * Struct_B y;
+ * };
+ *
+ * struct B {
+ * int x;
+ * struct A y;
+ * };
+ */
+{
+ .descr = "loop test #5",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* struct A */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int x; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 32),/* Struct_B y; */
+ /* typedef struct B Struct_B */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */
+ /* struct B */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int x; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A y; */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0x\0y\0Struct_B\0B\0x\0y",
+ .str_sec_size = sizeof("\0A\0x\0y\0Struct_B\0B\0x\0y"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test5_map",
+ .key_size = sizeof(int),
+ .value_size = 8,
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+/* struct A {
+ * int x;
+ * struct A array_a[4];
+ * };
+ */
+{
+ .descr = "loop test #6",
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 4), /* [2] */
+ /* struct A */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0), /* int x; */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 32),/* struct A array_a[4]; */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0x\0y",
+ .str_sec_size = sizeof("\0A\0x\0y"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test6_map",
+ .key_size = sizeof(int),
+ .value_size = 8,
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+{
+ .descr = "loop test #7",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* struct A { */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+ /* const void *m; */
+ BTF_MEMBER_ENC(NAME_TBD, 3, 0),
+ /* CONST type_id=4 */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+ /* PTR type_id=3 */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 3),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m",
+ .str_sec_size = sizeof("\0A\0m"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test7_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+{
+ .descr = "loop test #8",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* struct A { */ /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+ /* const void *m; */
+ BTF_MEMBER_ENC(NAME_TBD, 4, 0),
+ /* struct B { */ /* [3] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), sizeof(void *)),
+ /* const void *n; */
+ BTF_MEMBER_ENC(NAME_TBD, 6, 0),
+ /* CONST type_id=5 */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 5),
+ /* PTR type_id=6 */ /* [5] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 6),
+ /* CONST type_id=7 */ /* [6] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 7),
+ /* PTR type_id=4 */ /* [7] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0m\0B\0n",
+ .str_sec_size = sizeof("\0A\0m\0B\0n"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "loop_test8_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(void *),
+ .key_type_id = 1,
+ .value_type_id = 2,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Loop detected",
+},
+
+{
+ .descr = "string section does not end with null",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int") - 1,
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid string section",
+},
+
+{
+ .descr = "empty string section",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = 0,
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid string section",
+},
+
+{
+ .descr = "empty type section",
+ .raw_types = {
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "No type found",
+},
+
+{
+ .descr = "btf_header test. Longer hdr_len",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .hdr_len_delta = 4,
+ .err_str = "Unsupported btf_header",
+},
+
+{
+ .descr = "btf_header test. Gap between hdr and type",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .type_off_delta = 4,
+ .err_str = "Unsupported section found",
+},
+
+{
+ .descr = "btf_header test. Gap between type and str",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .str_off_delta = 4,
+ .err_str = "Unsupported section found",
+},
+
+{
+ .descr = "btf_header test. Overlap between type and str",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .str_off_delta = -4,
+ .err_str = "Section overlap found",
+},
+
+{
+ .descr = "btf_header test. Larger BTF size",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .str_len_delta = -4,
+ .err_str = "Unsupported section found",
+},
+
+{
+ .descr = "btf_header test. Smaller BTF size",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int",
+ .str_sec_size = sizeof("\0int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "hdr_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .str_len_delta = 4,
+ .err_str = "Total section length too long",
+},
+
+{
+ .descr = "array test. index_type/elem_type \"int\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 16),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "array test. index_type/elem_type \"const int\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const int[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 3, 16),
+ /* CONST type_id=1 */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "array test. index_type \"const int:31\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int:31 */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4),
+ /* int[16] */ /* [3] */
+ BTF_TYPE_ARRAY_ENC(1, 4, 16),
+ /* CONST type_id=2 */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid index",
+},
+
+{
+ .descr = "array test. elem_type \"const int:31\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int:31 */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 31, 4),
+ /* int[16] */ /* [3] */
+ BTF_TYPE_ARRAY_ENC(4, 1, 16),
+ /* CONST type_id=2 */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid array of int",
+},
+
+{
+ .descr = "array test. index_type \"void\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(1, 0, 16),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid index",
+},
+
+{
+ .descr = "array test. index_type \"const void\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(1, 3, 16),
+ /* CONST type_id=0 (void) */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid index",
+},
+
+{
+ .descr = "array test. elem_type \"const void\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 16),
+ /* CONST type_id=0 (void) */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid elem",
+},
+
+{
+ .descr = "array test. elem_type \"const void *\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void *[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 1, 16),
+ /* CONST type_id=4 */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+ /* void* */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "array test. index_type \"const void *\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* const void *[16] */ /* [2] */
+ BTF_TYPE_ARRAY_ENC(3, 3, 16),
+ /* CONST type_id=4 */ /* [3] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 4),
+ /* void* */ /* [4] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid index",
+},
+
+{
+ .descr = "array test. t->size != 0\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* int[16] */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 1),
+ BTF_ARRAY_ENC(1, 1, 16),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "size != 0",
+},
+
+{
+ .descr = "int test. invalid int_data",
+ .raw_types = {
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), 4),
+ 0x10000000,
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid int_data",
+},
+
+{
+ .descr = "invalid BTF_INFO",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_ENC(0, 0x10000000, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info",
+},
+
+{
+ .descr = "fwd test. t->type != 0\"",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* fwd type */ /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 1),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "fwd_test_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "type != 0",
+},
+
+{
+ .descr = "typedef (invalid name, name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPEDEF_ENC(0, 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__int",
+ .str_sec_size = sizeof("\0__int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "typedef_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "typedef (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__!int",
+ .str_sec_size = sizeof("\0__!int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "typedef_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "ptr type (invalid name, name_off <> 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__int",
+ .str_sec_size = sizeof("\0__int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "ptr_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "volatile type (invalid name, name_off <> 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__int",
+ .str_sec_size = sizeof("\0__int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "volatile_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "const type (invalid name, name_off <> 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__int",
+ .str_sec_size = sizeof("\0__int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "const_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "restrict type (invalid name, name_off <> 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 1), /* [2] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), 2), /* [3] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__int",
+ .str_sec_size = sizeof("\0__int"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "restrict_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "fwd type (invalid name, name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__skb",
+ .str_sec_size = sizeof("\0__skb"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "fwd_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "fwd type (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_FWD, 0, 0), 0), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__!skb",
+ .str_sec_size = sizeof("\0__!skb"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "fwd_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "array type (invalid name, name_off <> 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), /* [2] */
+ BTF_ARRAY_ENC(1, 1, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0__skb",
+ .str_sec_size = sizeof("\0__skb"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "struct type (name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0,
+ BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A",
+ .str_sec_size = sizeof("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct type (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A!\0B",
+ .str_sec_size = sizeof("\0A!\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "struct member (name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0,
+ BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A",
+ .str_sec_size = sizeof("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct member (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0B*",
+ .str_sec_size = sizeof("\0A\0B*"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "enum type (name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0,
+ BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1),
+ sizeof(int)), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A\0B",
+ .str_sec_size = sizeof("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "enum_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "enum type (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1),
+ sizeof(int)), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A!\0B",
+ .str_sec_size = sizeof("\0A!\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "enum_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "enum member (invalid name, name_off = 0)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0,
+ BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1),
+ sizeof(int)), /* [2] */
+ BTF_ENUM_ENC(0, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "enum_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "enum member (invalid name, invalid identifier)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0,
+ BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1),
+ sizeof(int)), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0A!",
+ .str_sec_size = sizeof("\0A!"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "enum_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+{
+ .descr = "arraymap invalid btf key (a bit field)",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* 32 bit int with 32 bit offset */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 32, 32, 8),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_map_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 2,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .map_create_err = true,
+},
+
+{
+ .descr = "arraymap invalid btf key (!= 32 bits)",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* 16 bit int with 0 bit offset */ /* [2] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 16, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_map_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 2,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .map_create_err = true,
+},
+
+{
+ .descr = "arraymap invalid btf value (too small)",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_map_check_btf",
+ .key_size = sizeof(int),
+ /* btf_value_size < map->value_size */
+ .value_size = sizeof(__u64),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .map_create_err = true,
+},
+
+{
+ .descr = "arraymap invalid btf value (too big)",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_map_check_btf",
+ .key_size = sizeof(int),
+ /* btf_value_size > map->value_size */
+ .value_size = sizeof(__u16),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .map_create_err = true,
+},
+
+{
+ .descr = "func proto (int (*)(int, unsigned int))",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* int (*)(int, unsigned int) */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (vararg)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int, unsigned int, ...) */
+ BTF_FUNC_PROTO_ENC(0, 3), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_FUNC_PROTO_ARG_ENC(0, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (vararg with name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int b, ... c) */
+ BTF_FUNC_PROTO_ENC(0, 3), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 0),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b\0c",
+ .str_sec_size = sizeof("\0a\0b\0c"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#3",
+},
+
+{
+ .descr = "func proto (arg after vararg)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, ..., unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 3), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 0),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b",
+ .str_sec_size = sizeof("\0a\0b"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#2",
+},
+
+{
+ .descr = "func proto (CONST=>TYPEDEF=>PTR=>FUNC_PROTO)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* typedef void (*func_ptr)(int, unsigned int) */
+ BTF_TYPEDEF_ENC(NAME_TBD, 5), /* [3] */
+ /* const func_ptr */
+ BTF_CONST_ENC(3), /* [4] */
+ BTF_PTR_ENC(6), /* [5] */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [6] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0func_ptr",
+ .str_sec_size = sizeof("\0func_ptr"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (TYPEDEF=>FUNC_PROTO)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [4] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0func_typedef",
+ .str_sec_size = sizeof("\0func_typedef"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (btf_resolve(arg))",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* void (*)(void *const): the arg is CONST [3] -> PTR [4] -> type id 0 */
+ BTF_FUNC_PROTO_ENC(0, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 3),
+ BTF_CONST_ENC(4), /* [3] */
+ BTF_PTR_ENC(0), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (Not all arg has name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int, unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0b",
+ .str_sec_size = sizeof("\0b"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func proto (Bad arg name_off)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int <bad_name_off>) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0x0fffffff, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a",
+ .str_sec_size = sizeof("\0a"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#2",
+},
+
+{
+ .descr = "func proto (Bad arg name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int !!!) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0!!!",
+ .str_sec_size = sizeof("\0a\0!!!"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#2",
+},
+
+{
+ .descr = "func proto (Invalid return type)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* <bad_ret_type> (*)(int, unsigned int) */
+ BTF_FUNC_PROTO_ENC(100, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid return type",
+},
+
+{
+ .descr = "func proto (with func name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void func_proto(int, unsigned int) */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 2), 0), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ BTF_END_RAW,
+ },
+ .str_sec = "\0func_proto",
+ .str_sec_size = sizeof("\0func_proto"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "func proto (const void arg)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(const void) */
+ BTF_FUNC_PROTO_ENC(0, 1), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(0, 4),
+ BTF_CONST_ENC(0), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#1",
+},
+
+{
+ .descr = "func (void func(int a, unsigned int b))",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ /* void func(int a, unsigned int b) */
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b\0func",
+ .str_sec_size = sizeof("\0a\0b\0func"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "func (No func name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ /* void <no_name>(int a, unsigned int b) */
+ BTF_FUNC_ENC(0, 3), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b",
+ .str_sec_size = sizeof("\0a\0b"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "func (Invalid func name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ /* void !!!(int a, unsigned int b) */
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b\0!!!",
+ .str_sec_size = sizeof("\0a\0b\0!!!"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid name",
+},
+
+{
+ .descr = "func (Some arg has no name)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(0, 2),
+ /* void func(int a, unsigned int) */
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0func",
+ .str_sec_size = sizeof("\0a\0func"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid arg#2",
+},
+
+{
+ .descr = "func (Non zero vlen)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [2] */
+ /* void (*)(int a, unsigned int b) */
+ BTF_FUNC_PROTO_ENC(0, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ /* void func(int a, unsigned int b) */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 2), 3), /* [4] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0a\0b\0func",
+ .str_sec_size = sizeof("\0a\0b\0func"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid func linkage",
+},
+
+{
+ .descr = "func (Not referring to FUNC_PROTO)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_ENC(NAME_TBD, 1), /* [2] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0func",
+ .str_sec_size = sizeof("\0func"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid type_id",
+},
+
+{
+ .descr = "invalid int kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_INT, 1, 0), 4), /* [2] */
+ BTF_INT_ENC(0, 0, 32),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "int_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid ptr kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 1, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "ptr_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid array kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 1, 0), 0), /* [2] */
+ BTF_ARRAY_ENC(1, 1, 1),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "array_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid enum kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 1, 1), 4), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "enum_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "valid fwd kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_FWD, 1, 0), 0), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "fwd_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "invalid typedef kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(NAME_TBD,
+ BTF_INFO_ENC(BTF_KIND_TYPEDEF, 1, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "typedef_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid volatile kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 1, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "volatile_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid const kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 1, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "const_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid restrict kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_RESTRICT, 1, 0), 1), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "restrict_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid func kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 0), 0), /* [2] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_FUNC, 1, 0), 2), /* [3] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "invalid func_proto kind_flag",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 1, 0), 0), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC(""),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "func_proto_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid btf_info kind_flag",
+},
+
+{
+ .descr = "valid struct, kind_flag, bitfield_size = 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 8), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(0, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(0, 32)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid struct, kind_flag, int member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(4, 4)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid union, kind_flag, int member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 1, 2), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "union_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid struct, kind_flag, enum member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4),/* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(4, 4)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid union, kind_flag, enum member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 1, 2), 4), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "union_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid struct, kind_flag, typedef member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4),/* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 4, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 5, BTF_MEMBER_OFFSET(4, 4)),
+ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [4] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C\0D\0E"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "valid union, kind_flag, typedef member, bitfield_size != 0",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), /* [2] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 1, 2), 4), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 4, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 5, BTF_MEMBER_OFFSET(4, 0)),
+ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [4] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C\0D\0E"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "union_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "invalid struct, kind_flag, bitfield_size greater than struct size",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(20, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(20, 20)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+{
+ .descr = "invalid struct, kind_flag, bitfield base_type int not regular",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 20, 4), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(20, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(20, 20)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member base type",
+},
+
+{
+ .descr = "invalid struct, kind_flag, base_type int not regular",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 12, 4), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 4), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(8, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(8, 8)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member base type",
+},
+
+{
+ .descr = "invalid union, kind_flag, bitfield_size greater than struct size",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 1, 2), 2), /* [2] */
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(8, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 1, BTF_MEMBER_OFFSET(20, 0)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "union_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Member exceeds struct_size",
+},
+
+{
+ .descr = "invalid struct, kind_flag, int member, bitfield_size = 0, wrong byte alignment",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 12), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 36)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member offset",
+},
+
+{
+ .descr = "invalid struct, kind_flag, enum member, bitfield_size = 0, wrong byte alignment",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4), /* [3] */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 2), 12), /* [4] */
+ /*
+  * NOTE(review): both members below reference type id 2, which by
+  * position is the second int, not the enum [3] named in .descr.
+  * Confirm whether that is intended before changing it; the expected
+  * err_str depends on which type's member check runs.
+  */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 0)),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 36)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+ .btf_load_err = true,
+ .err_str = "Invalid member offset",
+},
+
+{
+ .descr = "128-bit int",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 128, 16), /* [2] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "int_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct, 128-bit int member",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 128, 16), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 16), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct, 120-bit int member bitfield",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 120, 16), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 16), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct, kind_flag, 128-bit int member",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 128, 16), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 16), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 0)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+
+{
+ .descr = "struct, kind_flag, 120-bit int member bitfield",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 128, 16), /* [2] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 1), 16), /* [3] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(120, 0)),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "struct_type_check_btf",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .key_type_id = 1,
+ .value_type_id = 1,
+ .max_entries = 4,
+},
+/*
+ * typedef int arr_t[16];
+ * struct s {
+ * arr_t *a;
+ * };
+ */
+{
+ .descr = "struct->ptr->typedef->array->int size resolution",
+ .raw_types = {
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ BTF_PTR_ENC(3), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */
+ BTF_TYPE_ARRAY_ENC(5, 5, 16), /* [4] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0a\0arr_t"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "ptr_mod_chain_size_resolve_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int) * 16,
+ .key_type_id = 5 /* int */,
+ .value_type_id = 3 /* arr_t */,
+ .max_entries = 4,
+},
+/*
+ * typedef int arr_t[16][8][4];
+ * struct s {
+ * arr_t *a;
+ * };
+ */
+{
+ .descr = "struct->ptr->typedef->multi-array->int size resolution",
+ .raw_types = {
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ BTF_PTR_ENC(3), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */
+ BTF_TYPE_ARRAY_ENC(5, 7, 16), /* [4] */
+ BTF_TYPE_ARRAY_ENC(6, 7, 8), /* [5] */
+ BTF_TYPE_ARRAY_ENC(7, 7, 4), /* [6] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [7] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0a\0arr_t"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "multi_arr_size_resolve_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int) * 16 * 8 * 4,
+ .key_type_id = 7 /* int */,
+ .value_type_id = 3 /* arr_t */,
+ .max_entries = 4,
+},
+/*
+ * typedef int int_t;
+ * typedef int_t arr3_t[4];
+ * typedef arr3_t arr2_t[8];
+ * typedef arr2_t arr1_t[16];
+ * struct s {
+ * arr1_t *a;
+ * };
+ */
+{
+ .descr = "typedef/multi-arr mix size resolution",
+ .raw_types = {
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [1] */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ BTF_PTR_ENC(3), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4), /* [3] */
+ BTF_TYPE_ARRAY_ENC(5, 10, 16), /* [4] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 6), /* [5] */
+ BTF_TYPE_ARRAY_ENC(7, 10, 8), /* [6] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 8), /* [7] */
+ BTF_TYPE_ARRAY_ENC(9, 10, 4), /* [8] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 10), /* [9] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [10] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0a\0arr1_t\0arr2_t\0arr3_t\0int_t"),
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "typedef_arra_mix_size_resolve_map",
+ .key_size = sizeof(int),
+ .value_size = sizeof(int) * 16 * 8 * 4,
+ .key_type_id = 10 /* int */,
+ .value_type_id = 3 /* arr_t */,
+ .max_entries = 4,
+},
+
+}; /* struct btf_raw_test raw_tests[] */
+
+/*
+ * Step from @start to the byte that follows it inside a string section
+ * ending at @end. Returns NULL once @start is the last byte (or beyond),
+ * i.e. when there is no further string to visit.
+ */
+static const char *get_next_str(const char *start, const char *end)
+{
+	if (start >= end - 1)
+		return NULL;
+
+	return start + 1;
+}
+
+/*
+ * Return the size in bytes of the type section described by @raw_types,
+ * i.e. everything in front of the last BTF_END_RAW marker found in the
+ * MAX_NR_RAW_U32-sized array. Returns a negative value if there is no
+ * marker at all.
+ */
+static int get_raw_sec_size(const __u32 *raw_types)
+{
+	int idx = MAX_NR_RAW_U32 - 1;
+
+	/* walk backwards until the terminator (or the array start) is hit */
+	while (idx >= 0 && raw_types[idx] != BTF_END_RAW)
+		idx--;
+
+	if (idx < 0)
+		return idx;
+
+	return idx * sizeof(raw_types[0]);
+}
+
+/*
+ * Build a raw BTF blob (header + type section + string section) from a
+ * test template.
+ *
+ * @hdr:          header template, copied to the start of the blob
+ * @raw_types:    u32 stream terminated by BTF_END_RAW. A NAME_TBD entry is
+ *                replaced by the offset of the next not-yet-consumed string
+ *                of @str; an IS_NAME_NTH() entry is replaced by the offset
+ *                of the N-th (1-based) string of @str.
+ * @str:          string section contents
+ * @str_sec_size: size of @str in bytes
+ * @btf_size:     out: total size of the returned blob
+ * @ret_next_str: optional out: first string not consumed by a NAME_TBD
+ *                entry, or NULL if every string was consumed. It points
+ *                into @str.
+ *
+ * Returns a malloc()ed blob that the caller must free(), or NULL on error.
+ */
+static void *btf_raw_create(const struct btf_header *hdr,
+			    const __u32 *raw_types,
+			    const char *str,
+			    unsigned int str_sec_size,
+			    unsigned int *btf_size,
+			    const char **ret_next_str)
+{
+	const char *next_str = str, *end_str = str + str_sec_size;
+	const char **strs_idx = NULL, **tmp_strs_idx;
+	int strs_cap = 0, strs_cnt = 0, next_str_idx = 0;
+	unsigned int size_needed, offset;
+	struct btf_header *ret_hdr;
+	int i, type_sec_size, err = 0;
+	uint32_t *ret_types;
+	void *raw_btf = NULL;
+
+	type_sec_size = get_raw_sec_size(raw_types);
+	if (CHECK(type_sec_size < 0, "Cannot get nr_raw_types"))
+		return NULL;
+
+	size_needed = sizeof(*hdr) + type_sec_size + str_sec_size;
+	raw_btf = malloc(size_needed);
+	if (CHECK(!raw_btf, "Cannot allocate memory for raw_btf"))
+		return NULL;
+
+	/* Copy header */
+	memcpy(raw_btf, hdr, sizeof(*hdr));
+	offset = sizeof(*hdr);
+
+	/* Index strings: remember where each string of the section starts */
+	while ((next_str = get_next_str(next_str, end_str))) {
+		if (strs_cnt == strs_cap) {
+			strs_cap += max(16, strs_cap / 2);
+			tmp_strs_idx = realloc(strs_idx,
+					       sizeof(*strs_idx) * strs_cap);
+			if (CHECK(!tmp_strs_idx,
+				  "Cannot allocate memory for strs_idx")) {
+				err = -1;
+				goto done;
+			}
+			strs_idx = tmp_strs_idx;
+		}
+		strs_idx[strs_cnt++] = next_str;
+		/* land on the terminating NUL; get_next_str() steps past it */
+		next_str += strlen(next_str);
+	}
+
+	/* Copy type section, resolving name placeholders to string offsets */
+	ret_types = raw_btf + offset;
+	for (i = 0; i < type_sec_size / sizeof(raw_types[0]); i++) {
+		if (raw_types[i] == NAME_TBD) {
+			if (CHECK(next_str_idx == strs_cnt,
+				  "Error in getting next_str #%d",
+				  next_str_idx)) {
+				err = -1;
+				goto done;
+			}
+			ret_types[i] = strs_idx[next_str_idx++] - str;
+		} else if (IS_NAME_NTH(raw_types[i])) {
+			int idx = GET_NAME_NTH_IDX(raw_types[i]);
+
+			if (CHECK(idx <= 0 || idx > strs_cnt,
+				  "Error getting string #%d, strs_cnt:%d",
+				  idx, strs_cnt)) {
+				err = -1;
+				goto done;
+			}
+			ret_types[i] = strs_idx[idx-1] - str;
+		} else {
+			ret_types[i] = raw_types[i];
+		}
+	}
+	offset += type_sec_size;
+
+	/* Copy string section */
+	memcpy(raw_btf + offset, str, str_sec_size);
+
+	ret_hdr = (struct btf_header *)raw_btf;
+	ret_hdr->type_len = type_sec_size;
+	ret_hdr->str_off = type_sec_size;
+	ret_hdr->str_len = str_sec_size;
+
+	*btf_size = size_needed;
+	if (ret_next_str)
+		*ret_next_str =
+			next_str_idx < strs_cnt ? strs_idx[next_str_idx] : NULL;
+
+done:
+	/*
+	 * The index is scratch data only (its entries point into @str, and
+	 * nothing handed back to the caller points into the index itself),
+	 * so release it on success as well as on error.
+	 */
+	free(strs_idx);
+	if (err) {
+		free(raw_btf);
+		return NULL;
+	}
+	return raw_btf;
+}
+
+/*
+ * Run raw_tests[test_num - 1] (test_num is 1-based).
+ *
+ * Builds the raw BTF from the test template, applies the per-test header
+ * deltas, loads it and compares the outcome with test->btf_load_err (and,
+ * if set, looks for test->err_str in the load log). When the load
+ * succeeded, a map is created on top of the BTF and the outcome is compared
+ * with test->map_create_err.
+ */
+static void do_test_raw(unsigned int test_num)
+{
+ struct btf_raw_test *test = &raw_tests[test_num - 1];
+ struct bpf_create_map_attr create_attr = {};
+ int map_fd = -1, btf_fd = -1;
+ unsigned int raw_btf_size;
+ struct btf_header *hdr;
+ void *raw_btf;
+ int err;
+
+ if (!test__start_subtest(test->descr))
+ return;
+
+ raw_btf = btf_raw_create(&hdr_tmpl,
+ test->raw_types,
+ test->str_sec,
+ test->str_sec_size,
+ &raw_btf_size, NULL);
+ if (!raw_btf)
+ return;
+
+ hdr = raw_btf;
+
+ /* let the test skew individual header fields */
+ hdr->hdr_len = (int)hdr->hdr_len + test->hdr_len_delta;
+ hdr->type_off = (int)hdr->type_off + test->type_off_delta;
+ hdr->str_off = (int)hdr->str_off + test->str_off_delta;
+ hdr->str_len = (int)hdr->str_len + test->str_len_delta;
+
+ *btf_log_buf = '\0';
+ btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+ btf_log_buf, BTF_LOG_BUF_SIZE,
+ always_log);
+ free(raw_btf);
+
+ /* err is set when "load failed" disagrees with the expectation */
+ err = ((btf_fd == -1) != test->btf_load_err);
+ if (CHECK(err, "btf_fd:%d test->btf_load_err:%u",
+ btf_fd, test->btf_load_err) ||
+ CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
+ "expected err_str:%s", test->err_str)) {
+ err = -1;
+ goto done;
+ }
+
+ /* an expected load failure leaves nothing to create a map from */
+ if (err || btf_fd == -1)
+ goto done;
+
+ create_attr.name = test->map_name;
+ create_attr.map_type = test->map_type;
+ create_attr.key_size = test->key_size;
+ create_attr.value_size = test->value_size;
+ create_attr.max_entries = test->max_entries;
+ create_attr.btf_fd = btf_fd;
+ create_attr.btf_key_type_id = test->key_type_id;
+ create_attr.btf_value_type_id = test->value_type_id;
+
+ map_fd = bpf_create_map_xattr(&create_attr);
+
+ err = ((map_fd == -1) != test->map_create_err);
+ CHECK(err, "map_fd:%d test->map_create_err:%u",
+ map_fd, test->map_create_err);
+
+done:
+ /* dump the load log on failure, or always when always_log is set */
+ if (*btf_log_buf && (err || always_log))
+ fprintf(stderr, "\n%s", btf_log_buf);
+ if (btf_fd != -1)
+ close(btf_fd);
+ if (map_fd != -1)
+ close(map_fd);
+}
+
+/* One test case for do_test_get_info(). */
+struct btf_get_info_test {
+ /* subtest name passed to test__start_subtest() */
+ const char *descr;
+ /* string section handed to btf_raw_create() */
+ const char *str_sec;
+ /* type section template handed to btf_raw_create() */
+ __u32 raw_types[MAX_NR_RAW_U32];
+ /* size of str_sec in bytes */
+ __u32 str_sec_size;
+ /* added to the raw BTF size to form the info.btf_size given to the kernel */
+ int btf_size_delta;
+ /* optional handler run by do_test_get_info() before its generic checks */
+ int (*special_test)(unsigned int test_num);
+};
+
+static int test_big_btf_info(unsigned int test_num);
+static int test_btf_id(unsigned int test_num);
+
+/*
+ * Test cases executed by do_test_get_info(). The first two vary the size of
+ * the user buffer reported to the kernel via .btf_size_delta; the last two
+ * delegate to a .special_test handler.
+ */
+const struct btf_get_info_test get_info_tests[] = {
+{
+ .descr = "== raw_btf_size+1",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .btf_size_delta = 1,
+},
+{
+ .descr = "== raw_btf_size-3",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .btf_size_delta = -3,
+},
+{
+ .descr = "Large bpf_btf_info",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .special_test = test_big_btf_info,
+},
+{
+ .descr = "BTF ID",
+ .raw_types = {
+ /* int */ /* [1] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4),
+ /* unsigned int */ /* [2] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ .str_sec = "",
+ .str_sec_size = sizeof(""),
+ .special_test = test_btf_id,
+},
+};
+
+/*
+ * Special test for get_info_tests[test_num - 1]: query BTF info with a
+ * user buffer that is larger than struct bpf_btf_info.
+ *
+ * The query must fail while the extra bytes are non-zero and must succeed
+ * (reporting sizeof(struct bpf_btf_info) back in info_len) once they are
+ * zeroed.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int test_big_btf_info(unsigned int test_num)
+{
+ const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+ uint8_t *raw_btf = NULL, *user_btf = NULL;
+ unsigned int raw_btf_size;
+ /* struct bpf_btf_info followed by extra bytes the test controls */
+ struct {
+ struct bpf_btf_info info;
+ uint64_t garbage;
+ } info_garbage;
+ struct bpf_btf_info *info;
+ int btf_fd = -1, err;
+ uint32_t info_len;
+
+ raw_btf = btf_raw_create(&hdr_tmpl,
+ test->raw_types,
+ test->str_sec,
+ test->str_sec_size,
+ &raw_btf_size, NULL);
+
+ if (!raw_btf)
+ return -1;
+
+ *btf_log_buf = '\0';
+
+ user_btf = malloc(raw_btf_size);
+ if (CHECK(!user_btf, "!user_btf")) {
+ err = -1;
+ goto done;
+ }
+
+ btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+ btf_log_buf, BTF_LOG_BUF_SIZE,
+ always_log);
+ if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ /*
+ * GET_INFO should error out if the userspace info
+ * has non-zero trailing bytes.
+ */
+ info = &info_garbage.info;
+ memset(info, 0, sizeof(*info));
+ info_garbage.garbage = 0xdeadbeef;
+ info_len = sizeof(info_garbage);
+ info->btf = ptr_to_u64(user_btf);
+ info->btf_size = raw_btf_size;
+
+ err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+ if (CHECK(!err, "!err")) {
+ err = -1;
+ goto done;
+ }
+
+ /*
+ * GET_INFO should succeed even if info_len is larger than
+ * what the kernel supports, as long as the trailing bytes are
+ * zero. The info length supported by the kernel should also be
+ * returned to userspace.
+ */
+ info_garbage.garbage = 0;
+ err = bpf_obj_get_info_by_fd(btf_fd, info, &info_len);
+ if (CHECK(err || info_len != sizeof(*info),
+ "err:%d errno:%d info_len:%u sizeof(*info):%zu",
+ err, errno, info_len, sizeof(*info))) {
+ err = -1;
+ goto done;
+ }
+
+ fprintf(stderr, "OK");
+
+done:
+ if (*btf_log_buf && (err || always_log))
+ fprintf(stderr, "\n%s", btf_log_buf);
+
+ free(raw_btf);
+ free(user_btf);
+
+ if (btf_fd != -1)
+ close(btf_fd);
+
+ return err;
+}
+
+/*
+ * Special test for get_info_tests[test_num - 1]: exercise BTF ids.
+ *
+ * Steps: load a BTF and read its id; open a second fd from that id and
+ * check both fds report the same id, size and data; create a map using the
+ * BTF and check the BTF fields of struct bpf_map_info; finally check that
+ * the id stays resolvable while only the map is open and stops being
+ * resolvable after the map is closed.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int test_btf_id(unsigned int test_num)
+{
+ const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+ struct bpf_create_map_attr create_attr = {};
+ uint8_t *raw_btf = NULL, *user_btf[2] = {};
+ int btf_fd[2] = {-1, -1}, map_fd = -1;
+ struct bpf_map_info map_info = {};
+ struct bpf_btf_info info[2] = {};
+ unsigned int raw_btf_size;
+ uint32_t info_len;
+ int err, i, ret;
+
+ raw_btf = btf_raw_create(&hdr_tmpl,
+ test->raw_types,
+ test->str_sec,
+ test->str_sec_size,
+ &raw_btf_size, NULL);
+
+ if (!raw_btf)
+ return -1;
+
+ *btf_log_buf = '\0';
+
+ /* one user buffer per fd so the returned BTF data can be compared */
+ for (i = 0; i < 2; i++) {
+ user_btf[i] = malloc(raw_btf_size);
+ if (CHECK(!user_btf[i], "!user_btf[%d]", i)) {
+ err = -1;
+ goto done;
+ }
+ info[i].btf = ptr_to_u64(user_btf[i]);
+ info[i].btf_size = raw_btf_size;
+ }
+
+ btf_fd[0] = bpf_load_btf(raw_btf, raw_btf_size,
+ btf_log_buf, BTF_LOG_BUF_SIZE,
+ always_log);
+ if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ /* Test BPF_OBJ_GET_INFO_BY_ID on btf_id */
+ info_len = sizeof(info[0]);
+ err = bpf_obj_get_info_by_fd(btf_fd[0], &info[0], &info_len);
+ if (CHECK(err, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ /* second fd for the same BTF, obtained through its id */
+ btf_fd[1] = bpf_btf_get_fd_by_id(info[0].id);
+ if (CHECK(btf_fd[1] == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ ret = 0;
+ err = bpf_obj_get_info_by_fd(btf_fd[1], &info[1], &info_len);
+ if (CHECK(err || info[0].id != info[1].id ||
+ info[0].btf_size != info[1].btf_size ||
+ (ret = memcmp(user_btf[0], user_btf[1], info[0].btf_size)),
+ "err:%d errno:%d id0:%u id1:%u btf_size0:%u btf_size1:%u memcmp:%d",
+ err, errno, info[0].id, info[1].id,
+ info[0].btf_size, info[1].btf_size, ret)) {
+ err = -1;
+ goto done;
+ }
+
+ /* Test btf members in struct bpf_map_info */
+ create_attr.name = "test_btf_id";
+ create_attr.map_type = BPF_MAP_TYPE_ARRAY;
+ create_attr.key_size = sizeof(int);
+ create_attr.value_size = sizeof(unsigned int);
+ create_attr.max_entries = 4;
+ create_attr.btf_fd = btf_fd[0];
+ create_attr.btf_key_type_id = 1;
+ create_attr.btf_value_type_id = 2;
+
+ map_fd = bpf_create_map_xattr(&create_attr);
+ if (CHECK(map_fd == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ info_len = sizeof(map_info);
+ err = bpf_obj_get_info_by_fd(map_fd, &map_info, &info_len);
+ if (CHECK(err || map_info.btf_id != info[0].id ||
+ map_info.btf_key_type_id != 1 || map_info.btf_value_type_id != 2,
+ "err:%d errno:%d info.id:%u btf_id:%u btf_key_type_id:%u btf_value_type_id:%u",
+ err, errno, info[0].id, map_info.btf_id, map_info.btf_key_type_id,
+ map_info.btf_value_type_id)) {
+ err = -1;
+ goto done;
+ }
+
+ /* drop both BTF fds; from here on only the map is still open */
+ for (i = 0; i < 2; i++) {
+ close(btf_fd[i]);
+ btf_fd[i] = -1;
+ }
+
+ /*
+ * Test when the BTF ID is removed from the kernel: with the map
+ * still open the id must still resolve to an fd ...
+ */
+ btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
+ if (CHECK(btf_fd[0] == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+ close(btf_fd[0]);
+ btf_fd[0] = -1;
+
+ /*
+ * ... and once the map, which holds the last ref to the BTF and its
+ * btf_id, is closed, the id must no longer resolve.
+ */
+ close(map_fd);
+ map_fd = -1;
+ btf_fd[0] = bpf_btf_get_fd_by_id(map_info.btf_id);
+ if (CHECK(btf_fd[0] != -1, "BTF lingers")) {
+ err = -1;
+ goto done;
+ }
+
+ fprintf(stderr, "OK");
+
+done:
+ if (*btf_log_buf && (err || always_log))
+ fprintf(stderr, "\n%s", btf_log_buf);
+
+ free(raw_btf);
+ if (map_fd != -1)
+ close(map_fd);
+ for (i = 0; i < 2; i++) {
+ free(user_btf[i]);
+ if (btf_fd[i] != -1)
+ close(btf_fd[i]);
+ }
+
+ return err;
+}
+
+/*
+ * Run get_info_tests[test_num - 1] (test_num is 1-based).
+ *
+ * If the test has a special_test handler it is run first; on success the
+ * generic check below still runs. The generic check loads the BTF, asks the
+ * kernel to copy it into a user buffer whose reported size is
+ * raw_btf_size + test->btf_size_delta, and verifies that the reported
+ * btf_size equals raw_btf_size, that the copied prefix matches the raw BTF,
+ * and that bytes past the reported buffer size were left untouched.
+ */
+static void do_test_get_info(unsigned int test_num)
+{
+ const struct btf_get_info_test *test = &get_info_tests[test_num - 1];
+ unsigned int raw_btf_size, user_btf_size, expected_nbytes;
+ uint8_t *raw_btf = NULL, *user_btf = NULL;
+ struct bpf_btf_info info = {};
+ int btf_fd = -1, err, ret;
+ uint32_t info_len;
+
+ if (!test__start_subtest(test->descr))
+ return;
+
+ if (test->special_test) {
+ err = test->special_test(test_num);
+ if (CHECK(err, "failed: %d\n", err))
+ return;
+ }
+
+ raw_btf = btf_raw_create(&hdr_tmpl,
+ test->raw_types,
+ test->str_sec,
+ test->str_sec_size,
+ &raw_btf_size, NULL);
+
+ if (!raw_btf)
+ return;
+
+ *btf_log_buf = '\0';
+
+ user_btf = malloc(raw_btf_size);
+ if (CHECK(!user_btf, "!user_btf")) {
+ err = -1;
+ goto done;
+ }
+
+ btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+ btf_log_buf, BTF_LOG_BUF_SIZE,
+ always_log);
+ if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ user_btf_size = (int)raw_btf_size + test->btf_size_delta;
+ expected_nbytes = min(raw_btf_size, user_btf_size);
+ /* poison the tail the kernel is expected to leave alone */
+ if (raw_btf_size > expected_nbytes)
+ memset(user_btf + expected_nbytes, 0xff,
+ raw_btf_size - expected_nbytes);
+
+ info_len = sizeof(info);
+ info.btf = ptr_to_u64(user_btf);
+ info.btf_size = user_btf_size;
+
+ ret = 0;
+ err = bpf_obj_get_info_by_fd(btf_fd, &info, &info_len);
+ if (CHECK(err || !info.id || info_len != sizeof(info) ||
+ info.btf_size != raw_btf_size ||
+ (ret = memcmp(raw_btf, user_btf, expected_nbytes)),
+ "err:%d errno:%d info.id:%u info_len:%u sizeof(info):%zu raw_btf_size:%u info.btf_size:%u expected_nbytes:%u memcmp:%d",
+ err, errno, info.id, info_len, sizeof(info),
+ raw_btf_size, info.btf_size, expected_nbytes, ret)) {
+ err = -1;
+ goto done;
+ }
+
+ /* every poisoned byte past expected_nbytes must still be 0xff */
+ while (expected_nbytes < raw_btf_size) {
+ fprintf(stderr, "%u...", expected_nbytes);
+ if (CHECK(user_btf[expected_nbytes++] != 0xff,
+ "user_btf[%u]:%x != 0xff", expected_nbytes - 1,
+ user_btf[expected_nbytes - 1])) {
+ err = -1;
+ goto done;
+ }
+ }
+
+ fprintf(stderr, "OK");
+
+done:
+ if (*btf_log_buf && (err || always_log))
+ fprintf(stderr, "\n%s", btf_log_buf);
+
+ free(raw_btf);
+ free(user_btf);
+
+ if (btf_fd != -1)
+ close(btf_fd);
+}
+
+/* One test case for do_test_file(). */
+struct btf_file_test {
+ /* object file parsed and loaded by the test; also the subtest name */
+ const char *file;
+ /* expect map "btf_map" to have a zero BTF key or value type id */
+ bool btf_kv_notfound;
+};
+
+/* Object files exercised by do_test_file(), indexed by test_num - 1. */
+static struct btf_file_test file_tests[] = {
+ { .file = "test_btf_haskv.o", },
+ { .file = "test_btf_newkv.o", },
+ { .file = "test_btf_nokv.o", .btf_kv_notfound = true, },
+};
+
+/*
+ * Run file_tests[test_num - 1] (test_num is 1-based).
+ *
+ * Parses the BTF of the object file (the subtest is skipped when parsing
+ * fails with -ENOENT), loads the object as a tracepoint program and checks
+ * whether map "btf_map" got BTF key/value type ids as expected by
+ * test->btf_kv_notfound. If the object carried a .BTF.ext section, the
+ * program's func_info is fetched from the kernel and the three function
+ * names are compared with the expected ones.
+ */
+static void do_test_file(unsigned int test_num)
+{
+	const struct btf_file_test *test = &file_tests[test_num - 1];
+	const char *expected_fnames[] = {"_dummy_tracepoint",
+					 "test_long_fname_1",
+					 "test_long_fname_2"};
+	struct btf_ext *btf_ext = NULL;
+	struct bpf_prog_info info = {};
+	struct bpf_object *obj = NULL;
+	struct bpf_func_info *finfo;
+	struct bpf_program *prog;
+	__u32 info_len, rec_size;
+	bool has_btf_ext = false;
+	struct btf *btf = NULL;
+	void *func_info = NULL;
+	struct bpf_map *map;
+	int i, err, prog_fd;
+
+	if (!test__start_subtest(test->file))
+		return;
+
+	btf = btf__parse_elf(test->file, &btf_ext);
+	if (IS_ERR(btf)) {
+		if (PTR_ERR(btf) == -ENOENT) {
+			printf("%s:SKIP: No ELF %s found", __func__, BTF_ELF_SEC);
+			test__skip();
+			return;
+		}
+		return;
+	}
+	btf__free(btf);
+	/* cleared so the common exit path cannot free it a second time */
+	btf = NULL;
+
+	has_btf_ext = btf_ext != NULL;
+	btf_ext__free(btf_ext);
+
+	obj = bpf_object__open(test->file);
+	if (CHECK(IS_ERR(obj), "obj: %ld", PTR_ERR(obj)))
+		return;
+
+	prog = bpf_program__next(NULL, obj);
+	if (CHECK(!prog, "Cannot find bpf_prog")) {
+		err = -1;
+		goto done;
+	}
+
+	bpf_program__set_type(prog, BPF_PROG_TYPE_TRACEPOINT);
+	err = bpf_object__load(obj);
+	if (CHECK(err < 0, "bpf_object__load: %d", err))
+		goto done;
+	prog_fd = bpf_program__fd(prog);
+
+	map = bpf_object__find_map_by_name(obj, "btf_map");
+	if (CHECK(!map, "btf_map not found")) {
+		err = -1;
+		goto done;
+	}
+
+	/* err is set when "kv type ids missing" disagrees with expectation */
+	err = (bpf_map__btf_key_type_id(map) == 0 || bpf_map__btf_value_type_id(map) == 0)
+		!= test->btf_kv_notfound;
+	if (CHECK(err, "btf_key_type_id:%u btf_value_type_id:%u test->btf_kv_notfound:%u",
+		  bpf_map__btf_key_type_id(map), bpf_map__btf_value_type_id(map),
+		  test->btf_kv_notfound))
+		goto done;
+
+	if (!has_btf_ext)
+		goto skip;
+
+	/* get necessary program info */
+	info_len = sizeof(struct bpf_prog_info);
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+
+	if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) {
+		fprintf(stderr, "%s\n", btf_log_buf);
+		err = -1;
+		goto done;
+	}
+	if (CHECK(info.nr_func_info != 3,
+		  "incorrect info.nr_func_info (1st) %d",
+		  info.nr_func_info)) {
+		err = -1;
+		goto done;
+	}
+	rec_size = info.func_info_rec_size;
+	if (CHECK(rec_size != sizeof(struct bpf_func_info),
+		  "incorrect info.func_info_rec_size (1st) %d\n", rec_size)) {
+		err = -1;
+		goto done;
+	}
+
+	func_info = malloc(info.nr_func_info * rec_size);
+	if (CHECK(!func_info, "out of memory")) {
+		err = -1;
+		goto done;
+	}
+
+	/* reset info to only retrieve func_info related data */
+	memset(&info, 0, sizeof(info));
+	info.nr_func_info = 3;
+	info.func_info_rec_size = rec_size;
+	info.func_info = ptr_to_u64(func_info);
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+
+	if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) {
+		fprintf(stderr, "%s\n", btf_log_buf);
+		err = -1;
+		goto done;
+	}
+	if (CHECK(info.nr_func_info != 3,
+		  "incorrect info.nr_func_info (2nd) %d",
+		  info.nr_func_info)) {
+		err = -1;
+		goto done;
+	}
+	if (CHECK(info.func_info_rec_size != rec_size,
+		  "incorrect info.func_info_rec_size (2nd) %d",
+		  info.func_info_rec_size)) {
+		err = -1;
+		goto done;
+	}
+
+	err = btf__get_from_id(info.btf_id, &btf);
+	if (CHECK(err, "cannot get btf from kernel, err: %d", err))
+		goto done;
+
+	/* check three functions */
+	finfo = func_info;
+	for (i = 0; i < 3; i++) {
+		const struct btf_type *t;
+		const char *fname;
+
+		t = btf__type_by_id(btf, finfo->type_id);
+		if (CHECK(!t, "btf__type_by_id failure: id %u",
+			  finfo->type_id)) {
+			err = -1;
+			goto done;
+		}
+
+		fname = btf__name_by_offset(btf, t->name_off);
+		/* report a missing name instead of handing NULL to strcmp() */
+		if (CHECK(!fname, "incorrect fname %s", "")) {
+			err = -1;
+			goto done;
+		}
+		err = strcmp(fname, expected_fnames[i]);
+		/* for the second and third functions in .text section,
+		 * the compiler may order them either way.
+		 */
+		if (i && err)
+			err = strcmp(fname, expected_fnames[3 - i]);
+		if (CHECK(err, "incorrect fname %s", fname)) {
+			err = -1;
+			goto done;
+		}
+
+		/* records are rec_size bytes apart */
+		finfo = (void *)finfo + rec_size;
+	}
+
+skip:
+	fprintf(stderr, "OK");
+
+done:
+	/* btf is NULL unless btf__get_from_id() handed one out */
+	btf__free(btf);
+	free(func_info);
+	bpf_object__close(obj);
+}
+
+/*
+ * Enumerator names of the anonymous enum in struct pprint_mapv, listed in
+ * the same order as the enumerators (ENUM_ZERO == 0 ... ENUM_THREE == 3).
+ */
+const char *pprint_enum_str[] = {
+ "ENUM_ZERO",
+ "ENUM_ONE",
+ "ENUM_TWO",
+ "ENUM_THREE",
+};
+
+/*
+ * Map value used by the pretty-print tests. The raw BTF in
+ * pprint_test_template describes this struct member by member, so the
+ * member order, bitfield widths and sizes here must stay in sync with it.
+ */
+struct pprint_mapv {
+ uint32_t ui32;
+ uint16_t ui16;
+ /* 2 bytes hole */
+ int32_t si32;
+ /* three bitfields packed into one 32-bit unit: 2 + 28 + 2 bits */
+ uint32_t unused_bits2a:2,
+ bits28:28,
+ unused_bits2b:2;
+ /* anonymous union: the same 8 bytes as one u64 or as a byte array */
+ union {
+ uint64_t ui64;
+ uint8_t ui8a[8];
+ };
+ /* anonymous enum; names are mirrored in pprint_enum_str[] */
+ enum {
+ ENUM_ZERO,
+ ENUM_ONE,
+ ENUM_TWO,
+ ENUM_THREE,
+ } aenum;
+ uint32_t ui32b;
+ uint32_t bits2c:2;
+ /* two-dimensional array member */
+ uint8_t si8_4[2][2];
+};
+
+#ifdef __SIZEOF_INT128__
+/*
+ * Map value for the 128-bit integer pretty-print test; only built when the
+ * compiler provides __int128. The matching raw BTF is the int128 entry of
+ * pprint_test_template.
+ */
+struct pprint_mapv_int128 {
+ __int128 si128a;
+ __int128 si128b;
+ /* two bitfields carved out of one unsigned __int128 */
+ unsigned __int128 bits3:3;
+ unsigned __int128 bits80:80;
+ unsigned __int128 ui128;
+};
+#endif
+
+static struct btf_raw_test pprint_test_template[] = {
+{
+ .raw_types = {
+ /* unsigned char */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1),
+ /* unsigned short */ /* [2] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2),
+ /* unsigned int */ /* [3] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+ /* int */ /* [4] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ /* unsigned long long */ /* [5] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8),
+ /* 2 bits */ /* [6] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 2, 2),
+ /* 28 bits */ /* [7] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 28, 4),
+ /* uint8_t[8] */ /* [8] */
+ BTF_TYPE_ARRAY_ENC(9, 1, 8),
+ /* typedef unsigned char uint8_t */ /* [9] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 1),
+ /* typedef unsigned short uint16_t */ /* [10] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2),
+ /* typedef unsigned int uint32_t */ /* [11] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 3),
+ /* typedef int int32_t */ /* [12] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4),
+ /* typedef unsigned long long uint64_t *//* [13] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 5),
+ /* union (anon) */ /* [14] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 13, 0),/* uint64_t ui64; */
+ BTF_MEMBER_ENC(NAME_TBD, 8, 0), /* uint8_t ui8a[8]; */
+ /* enum (anon) */ /* [15] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 4), 4),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_ENUM_ENC(NAME_TBD, 2),
+ BTF_ENUM_ENC(NAME_TBD, 3),
+ /* struct pprint_mapv */ /* [16] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 11), 40),
+ BTF_MEMBER_ENC(NAME_TBD, 11, 0), /* uint32_t ui32 */
+ BTF_MEMBER_ENC(NAME_TBD, 10, 32), /* uint16_t ui16 */
+ BTF_MEMBER_ENC(NAME_TBD, 12, 64), /* int32_t si32 */
+ BTF_MEMBER_ENC(NAME_TBD, 6, 96), /* unused_bits2a */
+ BTF_MEMBER_ENC(NAME_TBD, 7, 98), /* bits28 */
+ BTF_MEMBER_ENC(NAME_TBD, 6, 126), /* unused_bits2b */
+ BTF_MEMBER_ENC(0, 14, 128), /* union (anon) */
+ BTF_MEMBER_ENC(NAME_TBD, 15, 192), /* aenum */
+ BTF_MEMBER_ENC(NAME_TBD, 11, 224), /* uint32_t ui32b */
+ BTF_MEMBER_ENC(NAME_TBD, 6, 256), /* bits2c */
+ BTF_MEMBER_ENC(NAME_TBD, 17, 264), /* si8_4 */
+ BTF_TYPE_ARRAY_ENC(18, 1, 2), /* [17] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 2), /* [18] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c\0si8_4"),
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(struct pprint_mapv),
+ .key_type_id = 3, /* unsigned int */
+ .value_type_id = 16, /* struct pprint_mapv */
+ .max_entries = 128,
+},
+
+{
+ /* this type will have the same layout as the
+ * first .raw_types definition, but struct type will
+ * be encoded with kind_flag set.
+ */
+ .raw_types = {
+ /* unsigned char */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1),
+ /* unsigned short */ /* [2] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2),
+ /* unsigned int */ /* [3] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+ /* int */ /* [4] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ /* unsigned long long */ /* [5] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8),
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [6] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [7] */
+ /* uint8_t[8] */ /* [8] */
+ BTF_TYPE_ARRAY_ENC(9, 1, 8),
+ /* typedef unsigned char uint8_t */ /* [9] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 1),
+ /* typedef unsigned short uint16_t */ /* [10] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2),
+ /* typedef unsigned int uint32_t */ /* [11] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 3),
+ /* typedef int int32_t */ /* [12] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4),
+ /* typedef unsigned long long uint64_t *//* [13] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 5),
+ /* union (anon) */ /* [14] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 13, 0),/* uint64_t ui64; */
+ BTF_MEMBER_ENC(NAME_TBD, 8, 0), /* uint8_t ui8a[8]; */
+ /* enum (anon) */ /* [15] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 4), 4),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_ENUM_ENC(NAME_TBD, 2),
+ BTF_ENUM_ENC(NAME_TBD, 3),
+ /* struct pprint_mapv */ /* [16] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 11), 40),
+ BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 0)), /* uint32_t ui32 */
+ BTF_MEMBER_ENC(NAME_TBD, 10, BTF_MEMBER_OFFSET(0, 32)), /* uint16_t ui16 */
+ BTF_MEMBER_ENC(NAME_TBD, 12, BTF_MEMBER_OFFSET(0, 64)), /* int32_t si32 */
+ BTF_MEMBER_ENC(NAME_TBD, 6, BTF_MEMBER_OFFSET(2, 96)), /* unused_bits2a */
+ BTF_MEMBER_ENC(NAME_TBD, 7, BTF_MEMBER_OFFSET(28, 98)), /* bits28 */
+ BTF_MEMBER_ENC(NAME_TBD, 6, BTF_MEMBER_OFFSET(2, 126)), /* unused_bits2b */
+ BTF_MEMBER_ENC(0, 14, BTF_MEMBER_OFFSET(0, 128)), /* union (anon) */
+ BTF_MEMBER_ENC(NAME_TBD, 15, BTF_MEMBER_OFFSET(0, 192)), /* aenum */
+ BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 224)), /* uint32_t ui32b */
+ BTF_MEMBER_ENC(NAME_TBD, 6, BTF_MEMBER_OFFSET(2, 256)), /* bits2c */
+ BTF_MEMBER_ENC(NAME_TBD, 17, 264), /* si8_4 */
+ BTF_TYPE_ARRAY_ENC(18, 1, 2), /* [17] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 2), /* [18] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c\0si8_4"),
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(struct pprint_mapv),
+ .key_type_id = 3, /* unsigned int */
+ .value_type_id = 16, /* struct pprint_mapv */
+ .max_entries = 128,
+},
+
+{
+ /* this type will have the same layout as the
+ * first .raw_types definition. The struct type will
+ * be encoded with kind_flag set, bitfield members
+ * are added typedef/const/volatile, and bitfield members
+ * will have both int and enum types.
+ */
+ .raw_types = {
+ /* unsigned char */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 8, 1),
+ /* unsigned short */ /* [2] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 16, 2),
+ /* unsigned int */ /* [3] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+ /* int */ /* [4] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4),
+ /* unsigned long long */ /* [5] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8),
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [6] */
+ BTF_TYPE_INT_ENC(0, 0, 0, 32, 4), /* [7] */
+ /* uint8_t[8] */ /* [8] */
+ BTF_TYPE_ARRAY_ENC(9, 1, 8),
+ /* typedef unsigned char uint8_t */ /* [9] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 1),
+ /* typedef unsigned short uint16_t */ /* [10] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2),
+ /* typedef unsigned int uint32_t */ /* [11] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 3),
+ /* typedef int int32_t */ /* [12] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 4),
+ /* typedef unsigned long long uint64_t *//* [13] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 5),
+ /* union (anon) */ /* [14] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_UNION, 0, 2), 8),
+ BTF_MEMBER_ENC(NAME_TBD, 13, 0),/* uint64_t ui64; */
+ BTF_MEMBER_ENC(NAME_TBD, 8, 0), /* uint8_t ui8a[8]; */
+ /* enum (anon) */ /* [15] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 4), 4),
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_ENUM_ENC(NAME_TBD, 2),
+ BTF_ENUM_ENC(NAME_TBD, 3),
+ /* struct pprint_mapv */ /* [16] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 11), 40),
+ BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 0)), /* uint32_t ui32 */
+ BTF_MEMBER_ENC(NAME_TBD, 10, BTF_MEMBER_OFFSET(0, 32)), /* uint16_t ui16 */
+ BTF_MEMBER_ENC(NAME_TBD, 12, BTF_MEMBER_OFFSET(0, 64)), /* int32_t si32 */
+ BTF_MEMBER_ENC(NAME_TBD, 17, BTF_MEMBER_OFFSET(2, 96)), /* unused_bits2a */
+ BTF_MEMBER_ENC(NAME_TBD, 7, BTF_MEMBER_OFFSET(28, 98)), /* bits28 */
+ BTF_MEMBER_ENC(NAME_TBD, 19, BTF_MEMBER_OFFSET(2, 126)),/* unused_bits2b */
+ BTF_MEMBER_ENC(0, 14, BTF_MEMBER_OFFSET(0, 128)), /* union (anon) */
+ BTF_MEMBER_ENC(NAME_TBD, 15, BTF_MEMBER_OFFSET(0, 192)), /* aenum */
+ BTF_MEMBER_ENC(NAME_TBD, 11, BTF_MEMBER_OFFSET(0, 224)), /* uint32_t ui32b */
+ BTF_MEMBER_ENC(NAME_TBD, 17, BTF_MEMBER_OFFSET(2, 256)), /* bits2c */
+ BTF_MEMBER_ENC(NAME_TBD, 20, BTF_MEMBER_OFFSET(0, 264)), /* si8_4 */
+ /* typedef unsigned int ___int */ /* [17] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 18),
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), 6), /* [18] */
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), 15), /* [19] */
+ BTF_TYPE_ARRAY_ENC(21, 1, 2), /* [20] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 2), /* [21] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0unsigned char\0unsigned short\0unsigned int\0int\0unsigned long long\0uint8_t\0uint16_t\0uint32_t\0int32_t\0uint64_t\0ui64\0ui8a\0ENUM_ZERO\0ENUM_ONE\0ENUM_TWO\0ENUM_THREE\0pprint_mapv\0ui32\0ui16\0si32\0unused_bits2a\0bits28\0unused_bits2b\0aenum\0ui32b\0bits2c\0___int\0si8_4"),
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(struct pprint_mapv),
+ .key_type_id = 3, /* unsigned int */
+ .value_type_id = 16, /* struct pprint_mapv */
+ .max_entries = 128,
+},
+
+#ifdef __SIZEOF_INT128__
+{
+ /* test int128 */
+ .raw_types = {
+ /* unsigned int */ /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4),
+ /* __int128 */ /* [2] */
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 128, 16),
+ /* unsigned __int128 */ /* [3] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 128, 16),
+ /* struct pprint_mapv_int128 */ /* [4] */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_STRUCT, 1, 5), 64),
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 0)), /* si128a */
+ BTF_MEMBER_ENC(NAME_TBD, 2, BTF_MEMBER_OFFSET(0, 128)), /* si128b */
+ BTF_MEMBER_ENC(NAME_TBD, 3, BTF_MEMBER_OFFSET(3, 256)), /* bits3 */
+ BTF_MEMBER_ENC(NAME_TBD, 3, BTF_MEMBER_OFFSET(80, 259)), /* bits80 */
+ BTF_MEMBER_ENC(NAME_TBD, 3, BTF_MEMBER_OFFSET(0, 384)), /* ui128 */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0unsigned int\0__int128\0unsigned __int128\0pprint_mapv_int128\0si128a\0si128b\0bits3\0bits80\0ui128"),
+ .key_size = sizeof(unsigned int),
+ .value_size = sizeof(struct pprint_mapv_int128),
+ .key_type_id = 1,
+ .value_type_id = 4,
+ .max_entries = 128,
+ .mapv_kind = PPRINT_MAPV_KIND_INT128,
+},
+#endif
+
+};
+
+/*
+ * Per-map-type parameters for the pretty-print tests: one entry per BPF map
+ * type to test.
+ * NOTE(review): the code consuming these flags is outside this part of the
+ * file; the field notes below are inferred from the names - confirm against
+ * the pprint test runner.
+ */
+static struct btf_pprint_test_meta {
+ const char *descr;
+ enum bpf_map_type map_type;
+ const char *map_name;
+ /* presumably: entries are dumped in key order */
+ bool ordered_map;
+ /* presumably: every inserted entry is retained (false for the LRU types) */
+ bool lossless_map;
+ /* set for the PERCPU_* map types */
+ bool percpu_map;
+} pprint_tests_meta[] = {
+{
+ .descr = "BTF pretty print array",
+ .map_type = BPF_MAP_TYPE_ARRAY,
+ .map_name = "pprint_test_array",
+ .ordered_map = true,
+ .lossless_map = true,
+ .percpu_map = false,
+},
+
+{
+ .descr = "BTF pretty print hash",
+ .map_type = BPF_MAP_TYPE_HASH,
+ .map_name = "pprint_test_hash",
+ .ordered_map = false,
+ .lossless_map = true,
+ .percpu_map = false,
+},
+
+{
+ .descr = "BTF pretty print lru hash",
+ .map_type = BPF_MAP_TYPE_LRU_HASH,
+ .map_name = "pprint_test_lru_hash",
+ .ordered_map = false,
+ .lossless_map = false,
+ .percpu_map = false,
+},
+
+{
+ .descr = "BTF pretty print percpu array",
+ .map_type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .map_name = "pprint_test_percpu_array",
+ .ordered_map = true,
+ .lossless_map = true,
+ .percpu_map = true,
+},
+
+{
+ .descr = "BTF pretty print percpu hash",
+ .map_type = BPF_MAP_TYPE_PERCPU_HASH,
+ .map_name = "pprint_test_percpu_hash",
+ .ordered_map = false,
+ .lossless_map = true,
+ .percpu_map = true,
+},
+
+{
+ .descr = "BTF pretty print lru percpu hash",
+ .map_type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
+ .map_name = "pprint_test_lru_percpu_hash",
+ .ordered_map = false,
+ .lossless_map = false,
+ .percpu_map = true,
+},
+
+};
+
+/*
+ * Return the sizeof() of the map value struct selected by @mapv_kind.
+ *
+ * An unrecognised kind trips the assert.  If assertions are compiled out,
+ * 0 is returned instead of running off the end of a non-void function.
+ */
+static size_t get_pprint_mapv_size(enum pprint_mapv_kind_t mapv_kind)
+{
+	switch (mapv_kind) {
+	case PPRINT_MAPV_KIND_BASIC:
+		return sizeof(struct pprint_mapv);
+#ifdef __SIZEOF_INT128__
+	case PPRINT_MAPV_KIND_INT128:
+		return sizeof(struct pprint_mapv_int128);
+#endif
+	default:
+		break;
+	}
+
+	assert(0);
+	return 0;
+}
+
+/*
+ * Fill @mapv with the test pattern for key @i.
+ *
+ * @mapv is treated as @num_cpus consecutive value slots placed
+ * @rounded_value_size bytes apart; every slot is written.  Which struct is
+ * written is chosen by @mapv_kind; any other kind leaves @mapv untouched.
+ */
+static void set_pprint_mapv(enum pprint_mapv_kind_t mapv_kind,
+			    void *mapv, uint32_t i,
+			    int num_cpus, int rounded_value_size)
+{
+	char *slot = mapv;
+	int n;
+
+	switch (mapv_kind) {
+	case PPRINT_MAPV_KIND_BASIC:
+		for (n = 0; n < num_cpus; n++, slot += rounded_value_size) {
+			struct pprint_mapv *val = (void *)slot;
+
+			/* ui32 and si8_4[][] differ from one slot to the next */
+			val->ui32 = i + n;
+			val->si32 = -i;
+			val->unused_bits2a = 3;
+			val->bits28 = i;
+			val->unused_bits2b = 3;
+			val->ui64 = i;
+			val->aenum = i & 0x03;
+			val->ui32b = 4;
+			val->bits2c = 1;
+			val->si8_4[0][0] = (n + i) & 0xff;
+			val->si8_4[0][1] = (n + i + 1) & 0xff;
+			val->si8_4[1][0] = (n + i + 2) & 0xff;
+			val->si8_4[1][1] = (n + i + 3) & 0xff;
+		}
+		break;
+
+#ifdef __SIZEOF_INT128__
+	case PPRINT_MAPV_KIND_INT128:
+		for (n = 0; n < num_cpus; n++, slot += rounded_value_size) {
+			struct pprint_mapv_int128 *val = (void *)slot;
+
+			val->si128a = i;
+			val->si128b = -i;
+			val->bits3 = i & 0x07;
+			val->bits80 = (((unsigned __int128)1) << 64) + i;
+			val->ui128 = (((unsigned __int128)2) << 64) + i;
+		}
+		break;
+#endif
+
+	default:
+		break;
+	}
+}
+
+/*
+ * Format into @expected_line (at most @line_size bytes) the text expected
+ * for the value at @mapv.
+ *
+ * For a per-cpu map the line is prefixed with "\tcpu" and labelled with
+ * @cpu; otherwise it is labelled with @next_key.
+ *
+ * Returns what snprintf() returned, or -1 when @mapv_kind is not handled.
+ */
+ssize_t get_pprint_expected_line(enum pprint_mapv_kind_t mapv_kind,
+				 char *expected_line, ssize_t line_size,
+				 bool percpu_map, unsigned int next_key,
+				 int cpu, void *mapv)
+{
+	const char *prefix = percpu_map ? "\tcpu" : "";
+	unsigned int label_id = percpu_map ? cpu : next_key;
+
+	if (mapv_kind == PPRINT_MAPV_KIND_BASIC) {
+		struct pprint_mapv *val = mapv;
+
+		return snprintf(expected_line, line_size,
+				"%s%u: {%u,0,%d,0x%x,0x%x,0x%x,"
+				"{%llu|[%u,%u,%u,%u,%u,%u,%u,%u]},%s,"
+				"%u,0x%x,[[%d,%d],[%d,%d]]}\n",
+				prefix, label_id,
+				val->ui32, val->si32,
+				val->unused_bits2a,
+				val->bits28,
+				val->unused_bits2b,
+				(__u64)val->ui64,
+				val->ui8a[0], val->ui8a[1],
+				val->ui8a[2], val->ui8a[3],
+				val->ui8a[4], val->ui8a[5],
+				val->ui8a[6], val->ui8a[7],
+				pprint_enum_str[val->aenum],
+				val->ui32b,
+				val->bits2c,
+				val->si8_4[0][0], val->si8_4[0][1],
+				val->si8_4[1][0], val->si8_4[1][1]);
+	}
+
+#ifdef __SIZEOF_INT128__
+	if (mapv_kind == PPRINT_MAPV_KIND_INT128) {
+		struct pprint_mapv_int128 *val = mapv;
+
+		/* the two widest members are printed as high and low 64-bit halves */
+		return snprintf(expected_line, line_size,
+				"%s%u: {0x%lx,0x%lx,0x%lx,"
+				"0x%lx%016lx,0x%lx%016lx}\n",
+				prefix, label_id,
+				(uint64_t)val->si128a,
+				(uint64_t)val->si128b,
+				(uint64_t)val->bits3,
+				(uint64_t)(val->bits80 >> 64),
+				(uint64_t)val->bits80,
+				(uint64_t)(val->ui128 >> 64),
+				(uint64_t)val->ui128);
+	}
+#endif
+
+	return -1;
+}
+
+/*
+ * Compare one line read from the pinned map file with the expected text.
+ *
+ * @expected_line:     expected text, as formatted by snprintf()
+ * @nexpected_line:    return value of that snprintf() call
+ * @expected_line_len: size of the buffer @expected_line was formatted into
+ * @line:              the line actually read
+ *
+ * Returns 0 when the lines match.  Returns -1 when the expected text could
+ * not be formatted completely or when the lines differ; on a mismatch both
+ * lines are printed to stderr.
+ */
+static int check_line(const char *expected_line, int nexpected_line,
+		      int expected_line_len, const char *line)
+{
+	/*
+	 * snprintf() returns the length the complete string would have had,
+	 * so every value >= the buffer size (not only == size) means the text
+	 * was truncated.  A negative value is an output error.
+	 */
+	if (CHECK(nexpected_line < 0 || nexpected_line >= expected_line_len,
+		  "expected_line is too long"))
+		return -1;
+
+	if (strcmp(expected_line, line)) {
+		fprintf(stderr, "unexpected pprint output\n");
+		fprintf(stderr, "expected: %s", expected_line);
+		fprintf(stderr, " read: %s", line);
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Run the pretty-print subtest described by pprint_test_template[@test_num].
+ *
+ * Steps: load the raw BTF, create the map with BTF key/value type ids, pin
+ * it under /sys/fs/bpf, write max_entries elements, then read the pinned
+ * file line by line and compare each line with the text generated from the
+ * same values.  All resources are released at the "done" label.
+ */
+static void do_test_pprint(int test_num)
+{
+	const struct btf_raw_test *test = &pprint_test_template[test_num];
+	enum pprint_mapv_kind_t mapv_kind = test->mapv_kind;
+	struct bpf_create_map_attr create_attr = {};
+	bool ordered_map, lossless_map, percpu_map;
+	int err, ret, num_cpus, rounded_value_size;
+	unsigned int key, nr_read_elems;
+	int map_fd = -1, btf_fd = -1;
+	unsigned int raw_btf_size;
+	char expected_line[255];
+	FILE *pin_file = NULL;
+	/*
+	 * Kept empty until a valid path has been built, so that the early
+	 * error paths never hand an uninitialized buffer to unlink().
+	 */
+	char pin_path[255] = "";
+	size_t line_len = 0;
+	char *line = NULL;
+	void *mapv = NULL;
+	uint8_t *raw_btf;
+	ssize_t nread;
+
+	if (!test__start_subtest(test->descr))
+		return;
+
+	raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
+				 test->str_sec, test->str_sec_size,
+				 &raw_btf_size, NULL);
+
+	if (!raw_btf)
+		return;
+
+	*btf_log_buf = '\0';
+	btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+			      btf_log_buf, BTF_LOG_BUF_SIZE,
+			      always_log);
+	free(raw_btf);
+
+	if (CHECK(btf_fd == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	create_attr.name = test->map_name;
+	create_attr.map_type = test->map_type;
+	create_attr.key_size = test->key_size;
+	create_attr.value_size = test->value_size;
+	create_attr.max_entries = test->max_entries;
+	create_attr.btf_fd = btf_fd;
+	create_attr.btf_key_type_id = test->key_type_id;
+	create_attr.btf_value_type_id = test->value_type_id;
+
+	map_fd = bpf_create_map_xattr(&create_attr);
+	if (CHECK(map_fd == -1, "errno:%d", errno)) {
+		err = -1;
+		goto done;
+	}
+
+	ret = snprintf(pin_path, sizeof(pin_path), "%s/%s",
+		       "/sys/fs/bpf", test->map_name);
+
+	if (CHECK(ret < 0 || (size_t)ret >= sizeof(pin_path),
+		  "pin_path %s/%s is too long",
+		  "/sys/fs/bpf", test->map_name)) {
+		/* do not unlink a truncated path during cleanup */
+		pin_path[0] = '\0';
+		err = -1;
+		goto done;
+	}
+
+	err = bpf_obj_pin(map_fd, pin_path);
+	if (CHECK(err, "bpf_obj_pin(%s): errno:%d.", pin_path, errno))
+		goto done;
+
+	percpu_map = test->percpu_map;
+	num_cpus = percpu_map ? bpf_num_possible_cpus() : 1;
+	rounded_value_size = round_up(get_pprint_mapv_size(mapv_kind), 8);
+	mapv = calloc(num_cpus, rounded_value_size);
+	if (CHECK(!mapv, "mapv allocation failure")) {
+		err = -1;
+		goto done;
+	}
+
+	for (key = 0; key < test->max_entries; key++) {
+		set_pprint_mapv(mapv_kind, mapv, key, num_cpus, rounded_value_size);
+		bpf_map_update_elem(map_fd, &key, mapv, 0);
+	}
+
+	pin_file = fopen(pin_path, "r");
+	if (CHECK(!pin_file, "fopen(%s): errno:%d", pin_path, errno)) {
+		err = -1;
+		goto done;
+	}
+
+	/* Skip the leading lines that start with '#' */
+	while ((nread = getline(&line, &line_len, pin_file)) > 0 &&
+	       *line == '#')
+		;
+
+	if (CHECK(nread <= 0, "Unexpected EOF")) {
+		err = -1;
+		goto done;
+	}
+
+	nr_read_elems = 0;
+	ordered_map = test->ordered_map;
+	lossless_map = test->lossless_map;
+	do {
+		ssize_t nexpected_line;
+		unsigned int next_key;
+		void *cmapv;
+		int cpu;
+
+		/*
+		 * For an ordered map the key is the running element count;
+		 * otherwise it is parsed from the start of the line just read.
+		 */
+		next_key = ordered_map ? nr_read_elems : atoi(line);
+		set_pprint_mapv(mapv_kind, mapv, next_key, num_cpus, rounded_value_size);
+		cmapv = mapv;
+
+		for (cpu = 0; cpu < num_cpus; cpu++) {
+			if (percpu_map) {
+				/* for percpu map, the format looks like:
+				 * <key>: {
+				 *	cpu0: <value_on_cpu0>
+				 *	cpu1: <value_on_cpu1>
+				 *	...
+				 *	cpun: <value_on_cpun>
+				 * }
+				 *
+				 * let us verify the line containing the key here.
+				 */
+				if (cpu == 0) {
+					nexpected_line = snprintf(expected_line,
+								  sizeof(expected_line),
+								  "%u: {\n",
+								  next_key);
+
+					err = check_line(expected_line, nexpected_line,
+							 sizeof(expected_line), line);
+					if (err == -1)
+						goto done;
+				}
+
+				/* read value@cpu */
+				nread = getline(&line, &line_len, pin_file);
+				if (nread < 0)
+					break;
+			}
+
+			nexpected_line = get_pprint_expected_line(mapv_kind, expected_line,
+								  sizeof(expected_line),
+								  percpu_map, next_key,
+								  cpu, cmapv);
+			err = check_line(expected_line, nexpected_line,
+					 sizeof(expected_line), line);
+			if (err == -1)
+				goto done;
+
+			cmapv = cmapv + rounded_value_size;
+		}
+
+		if (percpu_map) {
+			/* skip the last bracket for the percpu map */
+			nread = getline(&line, &line_len, pin_file);
+			if (nread < 0)
+				break;
+		}
+
+		nread = getline(&line, &line_len, pin_file);
+	} while (++nr_read_elems < test->max_entries && nread > 0);
+
+	if (lossless_map &&
+	    CHECK(nr_read_elems < test->max_entries,
+		  "Unexpected EOF. nr_read_elems:%u test->max_entries:%u",
+		  nr_read_elems, test->max_entries)) {
+		err = -1;
+		goto done;
+	}
+
+	if (CHECK(nread > 0, "Unexpected extra pprint output: %s", line)) {
+		err = -1;
+		goto done;
+	}
+
+	err = 0;
+
+done:
+	free(mapv);
+	if (!err)
+		fprintf(stderr, "OK");
+	if (*btf_log_buf && (err || always_log))
+		fprintf(stderr, "\n%s", btf_log_buf);
+	if (btf_fd != -1)
+		close(btf_fd);
+	if (map_fd != -1)
+		close(map_fd);
+	if (pin_file)
+		fclose(pin_file);
+	/* only remove the pin file if its path was actually built */
+	if (pin_path[0])
+		unlink(pin_path);
+	free(line);
+}
+
+/*
+ * Drive all pretty-print subtests: first run template 0 once for every
+ * entry of pprint_tests_meta[], then run each remaining template with the
+ * settings of pprint_tests_meta[0].
+ */
+static void test_pprint(void)
+{
+	const struct btf_pprint_test_meta *meta;
+	struct btf_raw_test *tmpl;
+	unsigned int idx;
+
+	/* test various maps with the first test template */
+	tmpl = &pprint_test_template[0];
+	for (idx = 0; idx < ARRAY_SIZE(pprint_tests_meta); idx++) {
+		meta = &pprint_tests_meta[idx];
+
+		tmpl->descr = meta->descr;
+		tmpl->map_type = meta->map_type;
+		tmpl->map_name = meta->map_name;
+		tmpl->ordered_map = meta->ordered_map;
+		tmpl->lossless_map = meta->lossless_map;
+		tmpl->percpu_map = meta->percpu_map;
+
+		do_test_pprint(0);
+	}
+
+	/* test rest test templates with the first map */
+	meta = &pprint_tests_meta[0];
+	for (idx = 1; idx < ARRAY_SIZE(pprint_test_template); idx++) {
+		tmpl = &pprint_test_template[idx];
+
+		tmpl->descr = meta->descr;
+		tmpl->map_type = meta->map_type;
+		tmpl->map_name = meta->map_name;
+		tmpl->ordered_map = meta->ordered_map;
+		tmpl->lossless_map = meta->lossless_map;
+		tmpl->percpu_map = meta->percpu_map;
+
+		do_test_pprint(idx);
+	}
+}
+
+/*
+ * Expand to the four comma-separated values of one line_info record:
+ * insn_off, file_off, line_off, and a word holding line_num in the bits
+ * above bit 9 and line_col in the low 10 bits.
+ */
+#define BPF_LINE_INFO_ENC(insn_off, file_off, line_off, line_num, line_col) \
+	(insn_off),							     \
+	(file_off),							     \
+	(line_off),							     \
+	(((line_num) << 10) | ((line_col) & 0x3ff))
+
+static struct prog_info_raw_test {
+ const char *descr;
+ const char *str_sec;
+ const char *err_str;
+ __u32 raw_types[MAX_NR_RAW_U32];
+ __u32 str_sec_size;
+ struct bpf_insn insns[MAX_INSNS];
+ __u32 prog_type;
+ __u32 func_info[MAX_SUBPROGS][2];
+ __u32 func_info_rec_size;
+ __u32 func_info_cnt;
+ __u32 line_info[MAX_NR_RAW_U32];
+ __u32 line_info_rec_size;
+ __u32 nr_jited_ksyms;
+ bool expected_prog_load_failure;
+ __u32 dead_code_cnt;
+ __u32 dead_code_mask;
+ __u32 dead_func_cnt;
+ __u32 dead_func_mask;
+} info_raw_tests[] = {
+{
+ .descr = "func_type (main func + one sub)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4), /* [2] */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ENC(1, 2), /* [4] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [5] */
+ BTF_FUNC_ENC(NAME_TBD, 4), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+ .str_sec_size = sizeof("\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB"),
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info = { {0, 5}, {3, 6} },
+ .func_info_rec_size = 8,
+ .func_info_cnt = 2,
+ .line_info = { BTF_END_RAW },
+},
+
+{
+ .descr = "func_type (Incorrect func_info_rec_size)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4), /* [2] */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ENC(1, 2), /* [4] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [5] */
+ BTF_FUNC_ENC(NAME_TBD, 4), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+ .str_sec_size = sizeof("\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB"),
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info = { {0, 5}, {3, 6} },
+ .func_info_rec_size = 4,
+ .func_info_cnt = 2,
+ .line_info = { BTF_END_RAW },
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "func_type (Incorrect func_info_cnt)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4), /* [2] */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ENC(1, 2), /* [4] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [5] */
+ BTF_FUNC_ENC(NAME_TBD, 4), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+ .str_sec_size = sizeof("\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB"),
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info = { {0, 5}, {3, 6} },
+ .func_info_rec_size = 8,
+ .func_info_cnt = 1,
+ .line_info = { BTF_END_RAW },
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "func_type (Incorrect bpf_func_info.insn_off)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 32, 4), /* [2] */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [3] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ENC(1, 2), /* [4] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 2),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 3), /* [5] */
+ BTF_FUNC_ENC(NAME_TBD, 4), /* [6] */
+ BTF_END_RAW,
+ },
+ .str_sec = "\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB",
+ .str_sec_size = sizeof("\0int\0unsigned int\0a\0b\0c\0d\0funcA\0funcB"),
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info = { {0, 5}, {2, 6} },
+ .func_info_rec_size = 8,
+ .func_info_cnt = 2,
+ .line_info = { BTF_END_RAW },
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (No subprog)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1;\0int b=2;\0return a + b;\0return a + b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+},
+
+{
+ .descr = "line_info (No subprog. insn_off >= prog->len)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1;\0int b=2;\0return a + b;\0return a + b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 7),
+ BPF_LINE_INFO_ENC(4, 0, 0, 5, 6),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+ .err_str = "line_info[4].insn_off",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (Zero bpf insn code)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_TYPE_INT_ENC(NAME_TBD, 0, 0, 64, 8), /* [2] */
+ BTF_TYPEDEF_ENC(NAME_TBD, 2), /* [3] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0unsigned long\0u64\0u64 a=1;\0return a;"),
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(1, 0, 0, 2, 9),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+ .err_str = "Invalid insn code at line_info[1]",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (No subprog. zero tailing line_info",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1;\0int b=2;\0return a + b;\0return a + b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10), 0,
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 9), 0,
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8), 0,
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 7), 0,
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info) + sizeof(__u32),
+ .nr_jited_ksyms = 1,
+},
+
+{
+ .descr = "line_info (No subprog. nonzero tailing line_info)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1;\0int b=2;\0return a + b;\0return a + b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10), 0,
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 9), 0,
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8), 0,
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 7), 1,
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info) + sizeof(__u32),
+ .nr_jited_ksyms = 1,
+ .err_str = "nonzero tailing record in line_info",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (subprog)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1+1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+},
+
+{
+ .descr = "line_info (subprog + func_info)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0sub\0main\0int a=1+1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 2,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {5, 3} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+},
+
+{
+ .descr = "line_info (subprog. missing 1st func line info)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1+1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .err_str = "missing bpf_line_info for func#0",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (subprog. missing 2nd func line info)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1+1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .err_str = "missing bpf_line_info for func#1",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (subprog. unordered insn offset)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1+1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .err_str = "Invalid line_info[2].insn_off",
+ .expected_prog_load_failure = true,
+},
+
+{
+ .descr = "line_info (dead start)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0/* dead jmp */\0int a=1;\0int b=2;\0return a + b;\0return a + b;"),
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 7),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 5, 6),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+ .dead_code_cnt = 1,
+ .dead_code_mask = 0x01,
+},
+
+{
+ .descr = "line_info (dead end)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0int a=1;\0int b=2;\0return a + b;\0/* dead jmp */\0return a + b;\0/* dead exit */"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, 1),
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 0,
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 12),
+ BPF_LINE_INFO_ENC(1, 0, NAME_TBD, 2, 11),
+ BPF_LINE_INFO_ENC(2, 0, NAME_TBD, 3, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 4, 9),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 5, 8),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 6, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+ .dead_code_cnt = 2,
+ .dead_code_mask = 0x28,
+},
+
+{
+ .descr = "line_info (dead code + subprog + func_info)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0sub\0main\0int a=1+1;\0/* dead jmp */"
+ "\0/* dead */\0/* dead */\0/* dead */\0/* dead */"
+ "\0/* dead */\0/* dead */\0/* dead */\0/* dead */"
+ "\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 8),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 2,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {14, 3} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(8, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(9, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(10, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(11, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(12, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(14, 0, NAME_TBD, 3, 8),
+ BPF_LINE_INFO_ENC(16, 0, NAME_TBD, 4, 7),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .dead_code_cnt = 9,
+ .dead_code_mask = 0x3fe,
+},
+
+{
+ .descr = "line_info (dead subprog)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0dead\0main\0func\0int a=1+1;\0/* live call */"
+ "\0return 0;\0return 0;\0/* dead */\0/* dead */"
+ "\0/* dead */\0return bla + 1;\0return bla + 1;"
+ "\0return bla + 1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1),
+ BPF_CALL_REL(3),
+ BPF_CALL_REL(5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 3,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {6, 3}, {9, 5} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(8, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(9, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(10, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(11, 0, NAME_TBD, 2, 9),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .dead_code_cnt = 3,
+ .dead_code_mask = 0x70,
+ .dead_func_cnt = 1,
+ .dead_func_mask = 0x2,
+},
+
+{
+ .descr = "line_info (dead last subprog)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0dead\0main\0int a=1+1;\0/* live call */"
+ "\0return 0;\0/* dead */\0/* dead */"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1),
+ BPF_CALL_REL(2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 2,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {5, 3} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 1,
+ .dead_code_cnt = 2,
+ .dead_code_mask = 0x18,
+ .dead_func_cnt = 1,
+ .dead_func_mask = 0x2,
+},
+
+{
+ .descr = "line_info (dead subprog + dead start)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0dead\0main\0func\0int a=1+1;\0/* dead */"
+ "\0return 0;\0return 0;\0return 0;"
+ "\0/* dead */\0/* dead */\0/* dead */\0/* dead */"
+ "\0return b + 1;\0return b + 1;\0return b + 1;"),
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1),
+ BPF_CALL_REL(3),
+ BPF_CALL_REL(5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 3,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {7, 3}, {10, 5} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(8, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(9, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(10, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(11, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(12, 0, NAME_TBD, 2, 9),
+ BPF_LINE_INFO_ENC(13, 0, NAME_TBD, 2, 9),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .dead_code_cnt = 5,
+ .dead_code_mask = 0x1e2,
+ .dead_func_cnt = 1,
+ .dead_func_mask = 0x2,
+},
+
+{
+ .descr = "line_info (dead subprog + dead start w/ move)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [5] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0dead\0main\0func\0int a=1+1;\0/* live call */"
+ "\0return 0;\0return 0;\0/* dead */\0/* dead */"
+ "\0/* dead */\0return bla + 1;\0return bla + 1;"
+ "\0return bla + 1;\0return func(a);\0b+=1;\0return b;"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1),
+ BPF_CALL_REL(3),
+ BPF_CALL_REL(5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_CALL_REL(1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 3,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 4}, {6, 3}, {9, 5} },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(3, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(4, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(5, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(7, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(8, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(9, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(11, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(12, 0, NAME_TBD, 2, 9),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+ .dead_code_cnt = 3,
+ .dead_code_mask = 0x70,
+ .dead_func_cnt = 1,
+ .dead_func_mask = 0x2,
+},
+
+{
+ .descr = "line_info (dead end + subprog start w/ no linfo)",
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ BTF_FUNC_PROTO_ENC(1, 1), /* [2] */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [3] */
+ BTF_FUNC_ENC(NAME_TBD, 2), /* [4] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0x\0main\0func\0/* main linfo */\0/* func linfo */"),
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 1, 3),
+ BPF_CALL_REL(3),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .func_info_cnt = 2,
+ .func_info_rec_size = 8,
+ .func_info = { {0, 3}, {6, 4}, },
+ .line_info = {
+ BPF_LINE_INFO_ENC(0, 0, NAME_TBD, 1, 10),
+ BPF_LINE_INFO_ENC(6, 0, NAME_TBD, 1, 10),
+ BTF_END_RAW,
+ },
+ .line_info_rec_size = sizeof(struct bpf_line_info),
+ .nr_jited_ksyms = 2,
+},
+
+};
+
+/*
+ * Work out how many instructions of @fp are in use.
+ *
+ * The array is scanned backwards from index MAX_INSNS - 1 and the scan stops
+ * at the first slot whose code or imm is non-zero. Slot 0 is never inspected,
+ * so the result is always at least 1.
+ */
+static size_t probe_prog_length(const struct bpf_insn *fp)
+{
+	size_t last = MAX_INSNS - 1;
+
+	/* Skip the all-zero tail of the instruction array. */
+	while (last > 0 && fp[last].code == 0 && fp[last].imm == 0)
+		last--;
+
+	return last + 1;
+}
+
+/*
+ * Copy the raw u32 section @raw_u32 into a freshly allocated buffer, replacing
+ * every NAME_TBD word with the offset (relative to @str) of the next string
+ * returned by get_next_str(), starting the search at @str + @str_off.
+ *
+ * On success the buffer is returned and *@ret_size holds its size in bytes;
+ * the caller owns the buffer. An empty section yields NULL with *@ret_size
+ * set to 0. Failures are reported as ERR_PTR(-EINVAL) or ERR_PTR(-ENOMEM).
+ */
+static __u32 *patch_name_tbd(const __u32 *raw_u32,
+			     const char *str, __u32 str_off,
+			     unsigned int str_sec_size,
+			     unsigned int *ret_size)
+{
+	const int sec_bytes = get_raw_sec_size(raw_u32);
+	const char *str_end = str + str_sec_size;
+	const char *cursor = str + str_off;
+	size_t idx, nr_words;
+	__u32 *patched;
+
+	if (sec_bytes == -1)
+		return ERR_PTR(-EINVAL);
+
+	if (sec_bytes == 0) {
+		*ret_size = 0;
+		return NULL;
+	}
+
+	patched = malloc(sec_bytes);
+	if (!patched)
+		return ERR_PTR(-ENOMEM);
+
+	nr_words = sec_bytes / sizeof(raw_u32[0]);
+	for (idx = 0; idx < nr_words; idx++) {
+		/* Ordinary words are copied through untouched. */
+		if (raw_u32[idx] != NAME_TBD) {
+			patched[idx] = raw_u32[idx];
+			continue;
+		}
+
+		cursor = get_next_str(cursor, str_end);
+		if (CHECK(!cursor, "Error in getting next_str\n")) {
+			free(patched);
+			return ERR_PTR(-EINVAL);
+		}
+		patched[idx] = cursor - str;
+		/* Park on the terminating NUL so the next lookup moves on. */
+		cursor += strlen(cursor);
+	}
+
+	*ret_size = sec_bytes;
+	return patched;
+}
+
+/*
+ * Check the func_info the kernel reports for @prog_fd against @test.
+ *
+ * The first bpf_obj_get_info_by_fd() call only fetches the record count and
+ * record size; the second call retrieves the records themselves. Functions
+ * whose bit is set in test->dead_func_mask are expected to be absent from the
+ * returned records, so the n-th returned record is compared with the n-th
+ * entry of test->func_info[] whose mask bit is clear.
+ *
+ * Returns 0 on success and -1 on any mismatch or failure.
+ */
+static int test_get_finfo(const struct prog_info_raw_test *test,
+			  int prog_fd)
+{
+	struct bpf_prog_info info = {};
+	struct bpf_func_info *finfo;
+	__u32 info_len, rec_size, i;
+	void *func_info = NULL;
+	__u32 nr_func_info;
+	__u32 nr_checked;
+	int err;
+
+	/* get necessary lens */
+	info_len = sizeof(struct bpf_prog_info);
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (CHECK(err == -1, "invalid get info (1st) errno:%d", errno)) {
+		fprintf(stderr, "%s\n", btf_log_buf);
+		return -1;
+	}
+	nr_func_info = test->func_info_cnt - test->dead_func_cnt;
+	if (CHECK(info.nr_func_info != nr_func_info,
+		  "incorrect info.nr_func_info (1st) %d",
+		  info.nr_func_info)) {
+		return -1;
+	}
+
+	rec_size = info.func_info_rec_size;
+	if (CHECK(rec_size != sizeof(struct bpf_func_info),
+		  "incorrect info.func_info_rec_size (1st) %d", rec_size)) {
+		return -1;
+	}
+
+	if (!info.nr_func_info)
+		return 0;
+
+	func_info = malloc(info.nr_func_info * rec_size);
+	if (CHECK(!func_info, "out of memory"))
+		return -1;
+
+	/* reset info to only retrieve func_info related data */
+	memset(&info, 0, sizeof(info));
+	info.nr_func_info = nr_func_info;
+	info.func_info_rec_size = rec_size;
+	info.func_info = ptr_to_u64(func_info);
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (CHECK(err == -1, "invalid get info (2nd) errno:%d", errno)) {
+		fprintf(stderr, "%s\n", btf_log_buf);
+		err = -1;
+		goto done;
+	}
+	if (CHECK(info.nr_func_info != nr_func_info,
+		  "incorrect info.nr_func_info (2nd) %d",
+		  info.nr_func_info)) {
+		err = -1;
+		goto done;
+	}
+	if (CHECK(info.func_info_rec_size != rec_size,
+		  "incorrect info.func_info_rec_size (2nd) %d",
+		  info.func_info_rec_size)) {
+		err = -1;
+		goto done;
+	}
+
+	/*
+	 * Walk every function the test submitted, not just the first
+	 * nr_func_info of them: i indexes test->func_info[] and the dead mask,
+	 * while finfo only advances over the live records the kernel returned.
+	 * Stopping at nr_func_info would leave the entries that follow a dead
+	 * function unverified.
+	 */
+	finfo = func_info;
+	nr_checked = 0;
+	for (i = 0; i < test->func_info_cnt; i++) {
+		if (test->dead_func_mask & (1U << i))
+			continue;
+		/* Never read past the nr_func_info records we allocated. */
+		if (CHECK(nr_checked == nr_func_info,
+			  "dead_func_mask leaves more than %u live funcs",
+			  nr_func_info)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(finfo->type_id != test->func_info[i][1],
+			  "incorrect func_type %u expected %u",
+			  finfo->type_id, test->func_info[i][1])) {
+			err = -1;
+			goto done;
+		}
+		finfo = (void *)finfo + rec_size;
+		nr_checked++;
+	}
+
+	err = 0;
+
+done:
+	free(func_info);
+	return err;
+}
+
+/*
+ * Check the line info the kernel reports for @prog_fd.
+ *
+ * @patched_linfo is the line_info array that was submitted at load time and
+ * @cnt is the number of records expected back. A first
+ * bpf_obj_get_info_by_fd() call validates the counts and record sizes, a
+ * second one fetches line_info and, when the program is JITed, jited
+ * line_info, ksyms and function lengths.
+ *
+ * Returned records must have strictly increasing insn_off and must match the
+ * submitted records once those flagged in test->dead_code_mask are skipped.
+ * For JITed programs every jited line address must either start a new
+ * function (equal to the next ksym) or be strictly increasing and lie within
+ * the current function's length.
+ *
+ * Returns 0 on success and -1 on any mismatch or failure.
+ */
+static int test_get_linfo(const struct prog_info_raw_test *test,
+			  const void *patched_linfo,
+			  __u32 cnt, int prog_fd)
+{
+	__u32 i, info_len, nr_jited_ksyms, nr_jited_func_lens;
+	__u64 *jited_linfo = NULL, *jited_ksyms = NULL;
+	__u32 rec_size, jited_rec_size, jited_cnt;
+	struct bpf_line_info *linfo = NULL;
+	__u32 cur_func_len, ksyms_found;
+	struct bpf_prog_info info = {};
+	__u32 *jited_func_lens = NULL;
+	__u64 cur_func_ksyms;
+	__u32 dead_insns;
+	int err;
+
+	jited_cnt = cnt;
+	rec_size = sizeof(*linfo);
+	jited_rec_size = sizeof(*jited_linfo);
+	/* An explicit nr_jited_ksyms in the test overrides the derived count. */
+	if (test->nr_jited_ksyms)
+		nr_jited_ksyms = test->nr_jited_ksyms;
+	else
+		nr_jited_ksyms = test->func_info_cnt - test->dead_func_cnt;
+	nr_jited_func_lens = nr_jited_ksyms;
+
+	info_len = sizeof(struct bpf_prog_info);
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+	if (CHECK(err == -1, "err:%d errno:%d", err, errno)) {
+		err = -1;
+		goto done;
+	}
+
+	if (!info.jited_prog_len) {
+		/* prog is not jited */
+		jited_cnt = 0;
+		nr_jited_ksyms = 1;
+		nr_jited_func_lens = 1;
+	}
+
+	if (CHECK(info.nr_line_info != cnt ||
+		  info.nr_jited_line_info != jited_cnt ||
+		  info.nr_jited_ksyms != nr_jited_ksyms ||
+		  info.nr_jited_func_lens != nr_jited_func_lens ||
+		  (!info.nr_line_info && info.nr_jited_line_info),
+		  "info: nr_line_info:%u(expected:%u) nr_jited_line_info:%u(expected:%u) nr_jited_ksyms:%u(expected:%u) nr_jited_func_lens:%u(expected:%u)",
+		  info.nr_line_info, cnt,
+		  info.nr_jited_line_info, jited_cnt,
+		  info.nr_jited_ksyms, nr_jited_ksyms,
+		  info.nr_jited_func_lens, nr_jited_func_lens)) {
+		err = -1;
+		goto done;
+	}
+
+	if (CHECK(info.line_info_rec_size != sizeof(struct bpf_line_info) ||
+		  info.jited_line_info_rec_size != sizeof(__u64),
+		  "info: line_info_rec_size:%u(userspace expected:%u) jited_line_info_rec_size:%u(userspace expected:%u)",
+		  info.line_info_rec_size, rec_size,
+		  info.jited_line_info_rec_size, jited_rec_size)) {
+		err = -1;
+		goto done;
+	}
+
+	/* Nothing has been allocated yet, so a plain return is safe here. */
+	if (!cnt)
+		return 0;
+
+	rec_size = info.line_info_rec_size;
+	jited_rec_size = info.jited_line_info_rec_size;
+
+	memset(&info, 0, sizeof(info));
+
+	linfo = calloc(cnt, rec_size);
+	if (CHECK(!linfo, "!linfo")) {
+		err = -1;
+		goto done;
+	}
+	info.nr_line_info = cnt;
+	info.line_info_rec_size = rec_size;
+	info.line_info = ptr_to_u64(linfo);
+
+	if (jited_cnt) {
+		jited_linfo = calloc(jited_cnt, jited_rec_size);
+		jited_ksyms = calloc(nr_jited_ksyms, sizeof(*jited_ksyms));
+		jited_func_lens = calloc(nr_jited_func_lens,
+					 sizeof(*jited_func_lens));
+		if (CHECK(!jited_linfo || !jited_ksyms || !jited_func_lens,
+			  "jited_linfo:%p jited_ksyms:%p jited_func_lens:%p",
+			  jited_linfo, jited_ksyms, jited_func_lens)) {
+			err = -1;
+			goto done;
+		}
+
+		info.nr_jited_line_info = jited_cnt;
+		info.jited_line_info_rec_size = jited_rec_size;
+		info.jited_line_info = ptr_to_u64(jited_linfo);
+		info.nr_jited_ksyms = nr_jited_ksyms;
+		info.jited_ksyms = ptr_to_u64(jited_ksyms);
+		info.nr_jited_func_lens = nr_jited_func_lens;
+		info.jited_func_lens = ptr_to_u64(jited_func_lens);
+	}
+
+	err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+
+	/*
+	 * Only recheck the info.*line_info* fields.
+	 * Other fields are not the concern of this test.
+	 */
+	if (CHECK(err == -1 ||
+		  info.nr_line_info != cnt ||
+		  (jited_cnt && !info.jited_line_info) ||
+		  info.nr_jited_line_info != jited_cnt ||
+		  info.line_info_rec_size != rec_size ||
+		  info.jited_line_info_rec_size != jited_rec_size,
+		  "err:%d errno:%d info: nr_line_info:%u(expected:%u) nr_jited_line_info:%u(expected:%u) line_info_rec_size:%u(expected:%u) jited_linfo_rec_size:%u(expected:%u) line_info:%p jited_line_info:%p",
+		  err, errno,
+		  info.nr_line_info, cnt,
+		  info.nr_jited_line_info, jited_cnt,
+		  info.line_info_rec_size, rec_size,
+		  info.jited_line_info_rec_size, jited_rec_size,
+		  (void *)(long)info.line_info,
+		  (void *)(long)info.jited_line_info)) {
+		err = -1;
+		goto done;
+	}
+
+	/* Count the submitted records that are dead right from index 0. */
+	dead_insns = 0;
+	while (test->dead_code_mask & (1 << dead_insns))
+		dead_insns++;
+
+	CHECK(linfo[0].insn_off, "linfo[0].insn_off:%u",
+	      linfo[0].insn_off);
+	for (i = 1; i < cnt; i++) {
+		const struct bpf_line_info *expected_linfo;
+
+		/*
+		 * Returned record i corresponds to submitted record
+		 * i + dead_insns; skip over any dead ones in between.
+		 */
+		while (test->dead_code_mask & (1 << (i + dead_insns)))
+			dead_insns++;
+
+		expected_linfo = patched_linfo +
+			((i + dead_insns) * test->line_info_rec_size);
+		if (CHECK(linfo[i].insn_off <= linfo[i - 1].insn_off,
+			  "linfo[%u].insn_off:%u <= linfo[%u].insn_off:%u",
+			  i, linfo[i].insn_off,
+			  i - 1, linfo[i - 1].insn_off)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(linfo[i].file_name_off != expected_linfo->file_name_off ||
+			  linfo[i].line_off != expected_linfo->line_off ||
+			  linfo[i].line_col != expected_linfo->line_col,
+			  "linfo[%u] (%u, %u, %u) != (%u, %u, %u)", i,
+			  linfo[i].file_name_off,
+			  linfo[i].line_off,
+			  linfo[i].line_col,
+			  expected_linfo->file_name_off,
+			  expected_linfo->line_off,
+			  expected_linfo->line_col)) {
+			err = -1;
+			goto done;
+		}
+	}
+
+	if (!jited_cnt) {
+		fprintf(stderr, "not jited. skipping jited_line_info check. ");
+		err = 0;
+		goto done;
+	}
+
+	if (CHECK(jited_linfo[0] != jited_ksyms[0],
+		  "jited_linfo[0]:%lx != jited_ksyms[0]:%lx",
+		  (long)(jited_linfo[0]), (long)(jited_ksyms[0]))) {
+		err = -1;
+		goto done;
+	}
+
+	ksyms_found = 1;
+	cur_func_len = jited_func_lens[0];
+	cur_func_ksyms = jited_ksyms[0];
+	for (i = 1; i < jited_cnt; i++) {
+		if (ksyms_found < nr_jited_ksyms &&
+		    jited_linfo[i] == jited_ksyms[ksyms_found]) {
+			/*
+			 * A new function starts here. Its length comes from
+			 * jited_func_lens[], not from the ksym address;
+			 * otherwise the range check below can never fail for
+			 * any function after the first.
+			 */
+			cur_func_ksyms = jited_ksyms[ksyms_found];
+			cur_func_len = jited_func_lens[ksyms_found];
+			ksyms_found++;
+			continue;
+		}
+
+		if (CHECK(jited_linfo[i] <= jited_linfo[i - 1],
+			  "jited_linfo[%u]:%lx <= jited_linfo[%u]:%lx",
+			  i, (long)jited_linfo[i],
+			  i - 1, (long)(jited_linfo[i - 1]))) {
+			err = -1;
+			goto done;
+		}
+
+		if (CHECK(jited_linfo[i] - cur_func_ksyms > cur_func_len,
+			  "jited_linfo[%u]:%lx - %lx > %u",
+			  i, (long)jited_linfo[i], (long)cur_func_ksyms,
+			  cur_func_len)) {
+			err = -1;
+			goto done;
+		}
+	}
+
+	if (CHECK(ksyms_found != nr_jited_ksyms,
+		  "ksyms_found:%u != nr_jited_ksyms:%u",
+		  ksyms_found, nr_jited_ksyms)) {
+		err = -1;
+		goto done;
+	}
+
+	err = 0;
+
+done:
+	free(linfo);
+	free(jited_linfo);
+	free(jited_ksyms);
+	free(jited_func_lens);
+	return err;
+}
+
+/*
+ * Run info_raw_tests[test_num - 1] (test_num is 1-based).
+ *
+ * Steps: build and load the raw BTF, patch NAME_TBD placeholders in the
+ * test's line_info, load the program with BPF_PROG_LOAD, compare the load
+ * outcome with expected_prog_load_failure / err_str and, if the program
+ * loaded, verify the func_info and line_info the kernel reports back.
+ * The BTF fd, program fd and patched line_info are released on every path
+ * that reaches the done label.
+ */
+static void do_test_info_raw(unsigned int test_num)
+{
+ const struct prog_info_raw_test *test = &info_raw_tests[test_num - 1];
+ unsigned int raw_btf_size, linfo_str_off, linfo_size;
+ int btf_fd = -1, prog_fd = -1, err = 0;
+ void *raw_btf, *patched_linfo = NULL;
+ const char *ret_next_str;
+ union bpf_attr attr = {};
+
+ if (!test__start_subtest(test->descr))
+ return;
+
+ raw_btf = btf_raw_create(&hdr_tmpl, test->raw_types,
+ test->str_sec, test->str_sec_size,
+ &raw_btf_size, &ret_next_str);
+ /* Nothing has been acquired yet, so a plain return leaks nothing. */
+ if (!raw_btf)
+ return;
+
+ *btf_log_buf = '\0';
+ btf_fd = bpf_load_btf(raw_btf, raw_btf_size,
+ btf_log_buf, BTF_LOG_BUF_SIZE,
+ always_log);
+ free(raw_btf);
+
+ if (CHECK(btf_fd == -1, "invalid btf_fd errno:%d", errno)) {
+ err = -1;
+ goto done;
+ }
+
+ if (*btf_log_buf && always_log)
+ fprintf(stderr, "\n%s", btf_log_buf);
+ /* Clear the log so the program load below starts from an empty buffer. */
+ *btf_log_buf = '\0';
+
+ /*
+ * NOTE(review): ret_next_str is filled in by btf_raw_create(); presumably
+ * it points at the first string not consumed by raw_types, which is where
+ * the line_info strings begin - confirm against btf_raw_create().
+ */
+ linfo_str_off = ret_next_str - test->str_sec;
+ patched_linfo = patch_name_tbd(test->line_info,
+ test->str_sec, linfo_str_off,
+ test->str_sec_size, &linfo_size);
+ if (IS_ERR(patched_linfo)) {
+ fprintf(stderr, "error in creating raw bpf_line_info");
+ err = -1;
+ goto done;
+ }
+
+ attr.prog_type = test->prog_type;
+ attr.insns = ptr_to_u64(test->insns);
+ attr.insn_cnt = probe_prog_length(test->insns);
+ attr.license = ptr_to_u64("GPL");
+ attr.prog_btf_fd = btf_fd;
+ attr.func_info_rec_size = test->func_info_rec_size;
+ attr.func_info_cnt = test->func_info_cnt;
+ attr.func_info = ptr_to_u64(test->func_info);
+ attr.log_buf = ptr_to_u64(btf_log_buf);
+ attr.log_size = BTF_LOG_BUF_SIZE;
+ attr.log_level = 1;
+ /* line_info is only passed when the test actually supplies records. */
+ if (linfo_size) {
+ attr.line_info_rec_size = test->line_info_rec_size;
+ attr.line_info = ptr_to_u64(patched_linfo);
+ attr.line_info_cnt = linfo_size / attr.line_info_rec_size;
+ }
+
+ prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ /* err is set when the load outcome differs from what the test expects. */
+ err = ((prog_fd == -1) != test->expected_prog_load_failure);
+ if (CHECK(err, "prog_fd:%d expected_prog_load_failure:%u errno:%d",
+ prog_fd, test->expected_prog_load_failure, errno) ||
+ CHECK(test->err_str && !strstr(btf_log_buf, test->err_str),
+ "expected err_str:%s", test->err_str)) {
+ err = -1;
+ goto done;
+ }
+
+ /* An expected load failure: there is no program to inspect. */
+ if (prog_fd == -1)
+ goto done;
+
+ err = test_get_finfo(test, prog_fd);
+ if (err)
+ goto done;
+
+ /* Records flagged dead by the test are not expected back from the kernel. */
+ err = test_get_linfo(test, patched_linfo,
+ attr.line_info_cnt - test->dead_code_cnt,
+ prog_fd);
+ if (err)
+ goto done;
+
+done:
+ if (*btf_log_buf && (err || always_log))
+ fprintf(stderr, "\n%s", btf_log_buf);
+
+ if (btf_fd != -1)
+ close(btf_fd);
+ if (prog_fd != -1)
+ close(prog_fd);
+
+ /* patched_linfo may be NULL, a valid buffer, or an ERR_PTR value. */
+ if (!IS_ERR(patched_linfo))
+ free(patched_linfo);
+}
+
+/* Raw BTF description used as input or expected output of a dedup test. */
+struct btf_raw_data {
+ /* encoded type records; initializers in this file end with BTF_END_RAW */
+ __u32 raw_types[MAX_NR_RAW_U32];
+ /* string section contents, passed to btf_raw_create() */
+ const char *str_sec;
+ /* size of str_sec, passed to btf_raw_create() alongside it */
+ __u32 str_sec_size;
+};
+
+/* One btf__dedup() test case, executed by do_test_dedup(). */
+struct btf_dedup_test {
+ /* subtest name handed to test__start_subtest() */
+ const char *descr;
+ /* BTF that btf__dedup() is run on */
+ struct btf_raw_data input;
+ /* BTF the deduplicated input is compared against */
+ struct btf_raw_data expect;
+ /* options passed to btf__dedup() */
+ struct btf_dedup_opts opts;
+};
+
+/*
+ * Test vectors for do_test_dedup(): each entry supplies an input BTF, the BTF
+ * expected after btf__dedup() has run on it, and the dedup options to use.
+ * Numbers in trailing comments such as [3] are the type IDs of the records.
+ */
+const struct btf_dedup_test dedup_tests[] = {
+
+{
+ .descr = "dedup: unused strings filtering",
+ .input = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 64, 8),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0unused\0int\0foo\0bar\0long"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0long"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: strings deduplication",
+ .input = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+ BTF_TYPE_INT_ENC(NAME_NTH(3), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 64, 8),
+ BTF_TYPE_INT_ENC(NAME_NTH(5), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0long int\0int\0long int\0int"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 64, 8),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0long int"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: struct example #1",
+ /*
+ * struct s {
+ * struct s *next;
+ * const int *a;
+ * int b[16];
+ * int c;
+ * }
+ */
+ .input = {
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* int[16] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */
+ /* struct s { */
+ BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [3] */
+ BTF_MEMBER_ENC(NAME_NTH(3), 4, 0), /* struct s *next; */
+ BTF_MEMBER_ENC(NAME_NTH(4), 5, 64), /* const int *a; */
+ BTF_MEMBER_ENC(NAME_NTH(5), 2, 128), /* int b[16]; */
+ BTF_MEMBER_ENC(NAME_NTH(6), 1, 640), /* int c; */
+ /* ptr -> [3] struct s */
+ BTF_PTR_ENC(3), /* [4] */
+ /* ptr -> [6] const int */
+ BTF_PTR_ENC(6), /* [5] */
+ /* const -> [1] int */
+ BTF_CONST_ENC(1), /* [6] */
+
+ /* full copy of the above */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4), /* [7] */
+ BTF_TYPE_ARRAY_ENC(7, 7, 16), /* [8] */
+ BTF_STRUCT_ENC(NAME_NTH(2), 4, 84), /* [9] */
+ BTF_MEMBER_ENC(NAME_NTH(3), 10, 0),
+ BTF_MEMBER_ENC(NAME_NTH(4), 11, 64),
+ BTF_MEMBER_ENC(NAME_NTH(5), 8, 128),
+ BTF_MEMBER_ENC(NAME_NTH(6), 7, 640),
+ BTF_PTR_ENC(9), /* [10] */
+ BTF_PTR_ENC(12), /* [11] */
+ BTF_CONST_ENC(7), /* [12] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0s\0next\0a\0b\0c\0"),
+ },
+ .expect = {
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(NAME_NTH(4), BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* int[16] */
+ BTF_TYPE_ARRAY_ENC(1, 1, 16), /* [2] */
+ /* struct s { */
+ BTF_STRUCT_ENC(NAME_NTH(6), 4, 84), /* [3] */
+ BTF_MEMBER_ENC(NAME_NTH(5), 4, 0), /* struct s *next; */
+ BTF_MEMBER_ENC(NAME_NTH(1), 5, 64), /* const int *a; */
+ BTF_MEMBER_ENC(NAME_NTH(2), 2, 128), /* int b[16]; */
+ BTF_MEMBER_ENC(NAME_NTH(3), 1, 640), /* int c; */
+ /* ptr -> [3] struct s */
+ BTF_PTR_ENC(3), /* [4] */
+ /* ptr -> [6] const int */
+ BTF_PTR_ENC(6), /* [5] */
+ /* const -> [1] int */
+ BTF_CONST_ENC(1), /* [6] */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0a\0b\0c\0int\0next\0s"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: struct <-> fwd resolution w/ hash collision",
+ /*
+ * // CU 1:
+ * struct x;
+ * struct s {
+ * struct x *x;
+ * };
+ * // CU 2:
+ * struct x {};
+ * struct s {
+ * struct x *x;
+ * };
+ */
+ .input = {
+ .raw_types = {
+ /* CU 1 */
+ BTF_FWD_ENC(NAME_TBD, 0 /* struct fwd */), /* [1] fwd x */
+ BTF_PTR_ENC(1), /* [2] ptr -> [1] */
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [3] struct s */
+ BTF_MEMBER_ENC(NAME_TBD, 2, 0),
+ /* CU 2 */
+ BTF_STRUCT_ENC(NAME_TBD, 0, 0), /* [4] struct x */
+ BTF_PTR_ENC(4), /* [5] ptr -> [4] */
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [6] struct s */
+ BTF_MEMBER_ENC(NAME_TBD, 5, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0x\0s\0x\0x\0s\0x\0"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_PTR_ENC(3), /* [1] ptr -> [3] */
+ BTF_STRUCT_ENC(NAME_TBD, 1, 8), /* [2] struct s */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_STRUCT_ENC(NAME_NTH(2), 0, 0), /* [3] struct x */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0x"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ .dedup_table_size = 1, /* force hash collisions */
+ },
+},
+{
+ .descr = "dedup: void equiv check",
+ /*
+ * // CU 1:
+ * struct s {
+ * struct {} *x;
+ * };
+ * // CU 2:
+ * struct s {
+ * void *x;
+ * };
+ *
+ * The expected output equals the input: nothing is merged.
+ */
+ .input = {
+ .raw_types = {
+ /* CU 1 */
+ BTF_STRUCT_ENC(0, 0, 1), /* [1] struct {} */
+ BTF_PTR_ENC(1), /* [2] ptr -> [1] */
+ BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [3] struct s */
+ BTF_MEMBER_ENC(NAME_NTH(2), 2, 0),
+ /* CU 2 */
+ BTF_PTR_ENC(0), /* [4] ptr -> void */
+ BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [5] struct s */
+ BTF_MEMBER_ENC(NAME_NTH(2), 4, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0x"),
+ },
+ .expect = {
+ .raw_types = {
+ /* CU 1 */
+ BTF_STRUCT_ENC(0, 0, 1), /* [1] struct {} */
+ BTF_PTR_ENC(1), /* [2] ptr -> [1] */
+ BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [3] struct s */
+ BTF_MEMBER_ENC(NAME_NTH(2), 2, 0),
+ /* CU 2 */
+ BTF_PTR_ENC(0), /* [4] ptr -> void */
+ BTF_STRUCT_ENC(NAME_NTH(1), 1, 8), /* [5] struct s */
+ BTF_MEMBER_ENC(NAME_NTH(2), 4, 0),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0s\0x"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ .dedup_table_size = 1, /* force hash collisions */
+ },
+},
+{
+ .descr = "dedup: all possible kinds (no duplicates)",
+ /* input and expected output are identical */
+ .input = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 8), /* [1] int */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 4), /* [2] enum */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_FWD_ENC(NAME_TBD, 1 /* union kind_flag */), /* [3] fwd */
+ BTF_TYPE_ARRAY_ENC(2, 1, 7), /* [4] array */
+ BTF_STRUCT_ENC(NAME_TBD, 1, 4), /* [5] struct */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_UNION_ENC(NAME_TBD, 1, 4), /* [6] union */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [7] typedef */
+ BTF_PTR_ENC(0), /* [8] ptr */
+ BTF_CONST_ENC(8), /* [9] const */
+ BTF_VOLATILE_ENC(8), /* [10] volatile */
+ BTF_RESTRICT_ENC(8), /* [11] restrict */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [12] func_proto */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+ BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_TBD, BTF_INT_SIGNED, 0, 32, 8), /* [1] int */
+ BTF_TYPE_ENC(NAME_TBD, BTF_INFO_ENC(BTF_KIND_ENUM, 0, 2), 4), /* [2] enum */
+ BTF_ENUM_ENC(NAME_TBD, 0),
+ BTF_ENUM_ENC(NAME_TBD, 1),
+ BTF_FWD_ENC(NAME_TBD, 1 /* union kind_flag */), /* [3] fwd */
+ BTF_TYPE_ARRAY_ENC(2, 1, 7), /* [4] array */
+ BTF_STRUCT_ENC(NAME_TBD, 1, 4), /* [5] struct */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_UNION_ENC(NAME_TBD, 1, 4), /* [6] union */
+ BTF_MEMBER_ENC(NAME_TBD, 1, 0),
+ BTF_TYPEDEF_ENC(NAME_TBD, 1), /* [7] typedef */
+ BTF_PTR_ENC(0), /* [8] ptr */
+ BTF_CONST_ENC(8), /* [9] const */
+ BTF_VOLATILE_ENC(8), /* [10] volatile */
+ BTF_RESTRICT_ENC(8), /* [11] restrict */
+ BTF_FUNC_PROTO_ENC(1, 2), /* [12] func_proto */
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 1),
+ BTF_FUNC_PROTO_ARG_ENC(NAME_TBD, 8),
+ BTF_FUNC_ENC(NAME_TBD, 12), /* [13] func */
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0A\0B\0C\0D\0E\0F\0G\0H\0I\0J\0K\0L\0M"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: no int duplicates",
+ /* input and expected output are identical */
+ .input = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8),
+ /* different name */
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 8),
+ /* different encoding */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_CHAR, 0, 32, 8),
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_BOOL, 0, 32, 8),
+ /* different bit offset */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 8, 32, 8),
+ /* different bit size */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
+ /* different byte size */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0some other int"),
+ },
+ .expect = {
+ .raw_types = {
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 8),
+ /* different name */
+ BTF_TYPE_INT_ENC(NAME_NTH(2), BTF_INT_SIGNED, 0, 32, 8),
+ /* different encoding */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_CHAR, 0, 32, 8),
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_BOOL, 0, 32, 8),
+ /* different bit offset */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 8, 32, 8),
+ /* different bit size */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 27, 8),
+ /* different byte size */
+ BTF_TYPE_INT_ENC(NAME_NTH(1), BTF_INT_SIGNED, 0, 32, 4),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0int\0some other int"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: enum fwd resolution",
+ .input = {
+ .raw_types = {
+ /* [1] fwd enum 'e1' before full enum */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 0), 4),
+ /* [2] full enum 'e1' after fwd */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(2), 123),
+ /* [3] full enum 'e2' before fwd */
+ BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(4), 456),
+ /* [4] fwd enum 'e2' after full enum */
+ BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 0), 4),
+ /* [5] incompatible fwd enum with different size */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 0), 1),
+ /* [6] incompatible full enum with different value */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(2), 321),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0e1\0e1_val\0e2\0e2_val"),
+ },
+ .expect = {
+ .raw_types = {
+ /* [1] full enum 'e1' */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(2), 123),
+ /* [2] full enum 'e2' */
+ BTF_TYPE_ENC(NAME_NTH(3), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(4), 456),
+ /* [3] incompatible fwd enum with different size */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 0), 1),
+ /* [4] incompatible full enum with different value */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_ENUM, 0, 1), 4),
+ BTF_ENUM_ENC(NAME_NTH(2), 321),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0e1\0e1_val\0e2\0e2_val"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ },
+},
+{
+ .descr = "dedup: datasec and vars pass-through",
+ /*
+ * The duplicate int [4] is expected to collapse into [1]; both vars
+ * and both datasecs are expected to remain.
+ */
+ .input = {
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* static int t */
+ BTF_VAR_ENC(NAME_NTH(2), 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ /* int, referenced from [5] */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [4] */
+ /* another static int t */
+ BTF_VAR_ENC(NAME_NTH(2), 4, 0), /* [5] */
+ /* another .bss section */ /* [6] */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(5, 0, 4),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0.bss\0t"),
+ },
+ .expect = {
+ .raw_types = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* static int t */
+ BTF_VAR_ENC(NAME_NTH(2), 1, 0), /* [2] */
+ /* .bss section */ /* [3] */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(2, 0, 4),
+ /* another static int t */
+ BTF_VAR_ENC(NAME_NTH(2), 1, 0), /* [4] */
+ /* another .bss section */ /* [5] */
+ BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4),
+ BTF_VAR_SECINFO_ENC(4, 0, 4),
+ BTF_END_RAW,
+ },
+ BTF_STR_SEC("\0.bss\0t"),
+ },
+ .opts = {
+ .dont_resolve_fwds = false,
+ .dedup_table_size = 1
+ },
+},
+
+};
+
+/*
+ * Return the size in bytes of the BTF type record @t: the common
+ * struct btf_type header plus whatever kind-specific data follows it.
+ * An unrecognised kind is reported on stderr and yields -EINVAL.
+ */
+static int btf_type_size(const struct btf_type *t)
+{
+	const __u16 kind = BTF_INFO_KIND(t->info);
+	const __u16 nr_items = BTF_INFO_VLEN(t->info);
+	size_t trailer;
+
+	switch (kind) {
+	/* kinds that consist of the header only */
+	case BTF_KIND_FWD:
+	case BTF_KIND_CONST:
+	case BTF_KIND_VOLATILE:
+	case BTF_KIND_RESTRICT:
+	case BTF_KIND_PTR:
+	case BTF_KIND_TYPEDEF:
+	case BTF_KIND_FUNC:
+		trailer = 0;
+		break;
+	/* kinds followed by one fixed-size record */
+	case BTF_KIND_INT:
+		trailer = sizeof(__u32);
+		break;
+	case BTF_KIND_ARRAY:
+		trailer = sizeof(struct btf_array);
+		break;
+	case BTF_KIND_VAR:
+		trailer = sizeof(struct btf_var);
+		break;
+	/* kinds followed by vlen records */
+	case BTF_KIND_ENUM:
+		trailer = nr_items * sizeof(struct btf_enum);
+		break;
+	case BTF_KIND_STRUCT:
+	case BTF_KIND_UNION:
+		trailer = nr_items * sizeof(struct btf_member);
+		break;
+	case BTF_KIND_FUNC_PROTO:
+		trailer = nr_items * sizeof(struct btf_param);
+		break;
+	case BTF_KIND_DATASEC:
+		trailer = nr_items * sizeof(struct btf_var_secinfo);
+		break;
+	default:
+		fprintf(stderr, "Unsupported BTF_KIND:%u\n", kind);
+		return -EINVAL;
+	}
+
+	return sizeof(struct btf_type) + trailer;
+}
+
+/*
+ * Print every NUL-terminated string found in the @len bytes starting at
+ * @strs to stderr, one per line, numbered from 0.
+ */
+static void dump_btf_strings(const char *strs, __u32 len)
+{
+	const char *end = strs + len;
+	const char *s;
+	int idx;
+
+	/* Each step hops over the string just printed and its terminator. */
+	for (s = strs, idx = 0; s < end; s += strlen(s) + 1, idx++)
+		fprintf(stderr, "string #%d: '%s'\n", idx, s);
+}
+
+/*
+ * Run dedup_tests[test_num - 1] (test_num is 1-based).
+ *
+ * The input and the expected BTF are each built with btf_raw_create() and
+ * parsed with btf__new(). btf__dedup() is then run on the input and the
+ * result is compared with the expected BTF: total raw size, string section
+ * length, every string in order, the number of types, and finally the size
+ * and raw bytes of each type record.
+ *
+ * Both BTF objects are released on every path that reaches the done label.
+ */
+static void do_test_dedup(unsigned int test_num)
+{
+	const struct btf_dedup_test *test = &dedup_tests[test_num - 1];
+	__u32 test_nr_types, expect_nr_types, test_btf_size, expect_btf_size;
+	const struct btf_header *test_hdr, *expect_hdr;
+	struct btf *test_btf = NULL, *expect_btf = NULL;
+	const void *test_btf_data, *expect_btf_data;
+	const char *ret_test_next_str, *ret_expect_next_str;
+	const char *test_strs, *expect_strs;
+	const char *test_str_cur, *test_str_end;
+	const char *expect_str_cur, *expect_str_end;
+	unsigned int raw_btf_size;
+	void *raw_btf;
+	int err = 0, i;
+
+	if (!test__start_subtest(test->descr))
+		return;
+
+	raw_btf = btf_raw_create(&hdr_tmpl, test->input.raw_types,
+				 test->input.str_sec, test->input.str_sec_size,
+				 &raw_btf_size, &ret_test_next_str);
+	/* Nothing has been acquired yet, so a plain return leaks nothing. */
+	if (!raw_btf)
+		return;
+
+	test_btf = btf__new((__u8 *)raw_btf, raw_btf_size);
+	free(raw_btf);
+	if (CHECK(IS_ERR(test_btf), "invalid test_btf errno:%ld",
+		  PTR_ERR(test_btf))) {
+		err = -1;
+		goto done;
+	}
+
+	raw_btf = btf_raw_create(&hdr_tmpl, test->expect.raw_types,
+				 test->expect.str_sec,
+				 test->expect.str_sec_size,
+				 &raw_btf_size, &ret_expect_next_str);
+	if (!raw_btf) {
+		/* test_btf is live at this point; release it via done. */
+		err = -1;
+		goto done;
+	}
+	expect_btf = btf__new((__u8 *)raw_btf, raw_btf_size);
+	free(raw_btf);
+	if (CHECK(IS_ERR(expect_btf), "invalid expect_btf errno:%ld",
+		  PTR_ERR(expect_btf))) {
+		err = -1;
+		goto done;
+	}
+
+	err = btf__dedup(test_btf, NULL, &test->opts);
+	if (CHECK(err, "btf_dedup failed errno:%d", err)) {
+		err = -1;
+		goto done;
+	}
+
+	test_btf_data = btf__get_raw_data(test_btf, &test_btf_size);
+	expect_btf_data = btf__get_raw_data(expect_btf, &expect_btf_size);
+	if (CHECK(test_btf_size != expect_btf_size,
+		  "test_btf_size:%u != expect_btf_size:%u",
+		  test_btf_size, expect_btf_size)) {
+		err = -1;
+		goto done;
+	}
+
+	/* The string section is located str_off bytes past the header. */
+	test_hdr = test_btf_data;
+	test_strs = test_btf_data + sizeof(*test_hdr) + test_hdr->str_off;
+	expect_hdr = expect_btf_data;
+	expect_strs = expect_btf_data + sizeof(*test_hdr) + expect_hdr->str_off;
+	if (CHECK(test_hdr->str_len != expect_hdr->str_len,
+		  "test_hdr->str_len:%u != expect_hdr->str_len:%u",
+		  test_hdr->str_len, expect_hdr->str_len)) {
+		fprintf(stderr, "\ntest strings:\n");
+		dump_btf_strings(test_strs, test_hdr->str_len);
+		fprintf(stderr, "\nexpected strings:\n");
+		dump_btf_strings(expect_strs, expect_hdr->str_len);
+		err = -1;
+		goto done;
+	}
+
+	/* Compare the two string sections one string at a time, in order. */
+	test_str_cur = test_strs;
+	test_str_end = test_strs + test_hdr->str_len;
+	expect_str_cur = expect_strs;
+	expect_str_end = expect_strs + expect_hdr->str_len;
+	while (test_str_cur < test_str_end && expect_str_cur < expect_str_end) {
+		size_t test_len, expect_len;
+
+		test_len = strlen(test_str_cur);
+		expect_len = strlen(expect_str_cur);
+		if (CHECK(test_len != expect_len,
+			  "test_len:%zu != expect_len:%zu "
+			  "(test_str:%s, expect_str:%s)",
+			  test_len, expect_len, test_str_cur, expect_str_cur)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(strcmp(test_str_cur, expect_str_cur),
+			  "test_str:%s != expect_str:%s",
+			  test_str_cur, expect_str_cur)) {
+			err = -1;
+			goto done;
+		}
+		test_str_cur += test_len + 1;
+		expect_str_cur += expect_len + 1;
+	}
+	if (CHECK(test_str_cur != test_str_end,
+		  "test_str_cur:%p != test_str_end:%p",
+		  test_str_cur, test_str_end)) {
+		err = -1;
+		goto done;
+	}
+
+	test_nr_types = btf__get_nr_types(test_btf);
+	expect_nr_types = btf__get_nr_types(expect_btf);
+	if (CHECK(test_nr_types != expect_nr_types,
+		  "test_nr_types:%u != expect_nr_types:%u",
+		  test_nr_types, expect_nr_types)) {
+		err = -1;
+		goto done;
+	}
+
+	/* The loop walks type IDs 1..nr_types. */
+	for (i = 1; i <= test_nr_types; i++) {
+		const struct btf_type *test_type, *expect_type;
+		int test_size, expect_size;
+
+		test_type = btf__type_by_id(test_btf, i);
+		expect_type = btf__type_by_id(expect_btf, i);
+		test_size = btf_type_size(test_type);
+		expect_size = btf_type_size(expect_type);
+
+		if (CHECK(test_size != expect_size,
+			  "type #%d: test_size:%d != expect_size:%u",
+			  i, test_size, expect_size)) {
+			err = -1;
+			goto done;
+		}
+		if (CHECK(memcmp((void *)test_type,
+				 (void *)expect_type,
+				 test_size),
+			  "type #%d: contents differ", i)) {
+			err = -1;
+			goto done;
+		}
+	}
+
+done:
+	/* Either pointer may be NULL, a valid object, or an ERR_PTR value. */
+	if (!IS_ERR(test_btf))
+		btf__free(test_btf);
+	if (!IS_ERR(expect_btf))
+		btf__free(expect_btf);
+}
+
+/*
+ * Entry point of the BTF selftest: runs every raw, get_info, file, info_raw
+ * and dedup test case in that order, then the pretty-print test. The
+ * per-case runners take 1-based test numbers.
+ */
+void test_btf(void)
+{
+	unsigned int idx;
+
+	always_log = env.verbosity > VERBOSE_NONE;
+
+	for (idx = 0; idx < ARRAY_SIZE(raw_tests); idx++)
+		do_test_raw(idx + 1);
+
+	for (idx = 0; idx < ARRAY_SIZE(get_info_tests); idx++)
+		do_test_get_info(idx + 1);
+
+	for (idx = 0; idx < ARRAY_SIZE(file_tests); idx++)
+		do_test_file(idx + 1);
+
+	for (idx = 0; idx < ARRAY_SIZE(info_raw_tests); idx++)
+		do_test_info_raw(idx + 1);
+
+	for (idx = 0; idx < ARRAY_SIZE(dedup_tests); idx++)
+		do_test_dedup(idx + 1);
+
+	test_pprint();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_dump.c b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
new file mode 100644
index 000000000..c60091ee8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_dump.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+static int duration = 0;
+
+/* btf_dump output callback: @ctx is the FILE stream the dump is written to. */
+void btf_dump_printf(void *ctx, const char *fmt, va_list args)
+{
+ FILE *out = ctx;
+
+ vfprintf(out, fmt, args);
+}
+
+/* Table of file-based dump tests, one subtest per entry. */
+static struct btf_dump_test_case {
+ /* subtest name */
+ const char *name;
+ /* base name used to build the "<file>.o" and "<file>.c" paths */
+ const char *file;
+ /* true: pointer size is checked to be 8; false: it is forced to 8 */
+ bool known_ptr_sz;
+ /* dump options; ctx is filled in with the output stream at run time */
+ struct btf_dump_opts opts;
+} btf_dump_test_cases[] = {
+ { .name = "btf_dump: syntax",
+ .file = "btf_dump_test_case_syntax", .known_ptr_sz = true },
+ { .name = "btf_dump: ordering",
+ .file = "btf_dump_test_case_ordering", .known_ptr_sz = false },
+ { .name = "btf_dump: padding",
+ .file = "btf_dump_test_case_padding", .known_ptr_sz = true },
+ { .name = "btf_dump: packing",
+ .file = "btf_dump_test_case_packing", .known_ptr_sz = true },
+ { .name = "btf_dump: bitfields",
+ .file = "btf_dump_test_case_bitfields", .known_ptr_sz = true },
+ { .name = "btf_dump: multidim",
+ .file = "btf_dump_test_case_multidim", .known_ptr_sz = false },
+ { .name = "btf_dump: namespacing",
+ .file = "btf_dump_test_case_namespacing", .known_ptr_sz = false },
+};
+
+/*
+ * Dump every type of @btf through btf_dump_printf() using @opts.
+ * Returns 0 on success, the PTR_ERR() of a failed dumper creation, or the
+ * first non-zero result of btf_dump__dump_type().
+ */
+static int btf_dump_all_types(const struct btf *btf,
+ const struct btf_dump_opts *opts)
+{
+ size_t nr_types = btf__get_nr_types(btf);
+ struct btf_dump *dumper;
+ int ret = 0, type_id;
+
+ dumper = btf_dump__new(btf, NULL, opts, btf_dump_printf);
+ if (IS_ERR(dumper))
+ return PTR_ERR(dumper);
+
+ /* walk type IDs 1..nr_types and stop at the first failing dump */
+ for (type_id = 1; type_id <= nr_types; type_id++) {
+ ret = btf_dump__dump_type(dumper, type_id);
+ if (ret)
+ break;
+ }
+
+ btf_dump__free(dumper);
+ return ret;
+}
+
+/*
+ * Dump all BTF types of "<t->file>.o" as C into a temporary file and diff the
+ * result against the expected output embedded in the test case's .c source.
+ *
+ * Returns 0 on success and non-zero on failure: a negative value for setup
+ * errors, or the non-zero result of the dump or of system() running the diff.
+ * The temporary output file is removed only when the diff succeeds, so a
+ * differing output stays available for inspection. @n is currently unused.
+ */
+static int test_btf_dump_case(int n, struct btf_dump_test_case *t)
+{
+ char test_file[256], out_file[256], diff_cmd[1024];
+ struct btf *btf = NULL;
+ int err = 0, fd = -1;
+ FILE *f = NULL;
+
+ snprintf(test_file, sizeof(test_file), "%s.o", t->file);
+
+ btf = btf__parse_elf(test_file, NULL);
+ if (CHECK(IS_ERR(btf), "btf_parse_elf",
+ "failed to load test BTF: %ld\n", PTR_ERR(btf))) {
+ /* PTR_ERR() is already negative; keep it so, like other paths */
+ err = PTR_ERR(btf);
+ btf = NULL;
+ goto done;
+ }
+
+ /* for tests without t->known_ptr_sz the pointer size is forced to 8
+ * (presumably because their BTF has no "long" or "unsigned long" type
+ * to infer it from); tests with t->known_ptr_sz are expected to report
+ * 8 regardless of host architecture, because BPF target is always
+ * 64-bit
+ */
+ if (!t->known_ptr_sz) {
+ btf__set_pointer_size(btf, 8);
+ } else {
+ CHECK(btf__pointer_size(btf) != 8, "ptr_sz", "exp %d, got %zu\n",
+ 8, btf__pointer_size(btf));
+ }
+
+ snprintf(out_file, sizeof(out_file), "/tmp/%s.output.XXXXXX", t->file);
+ fd = mkstemp(out_file);
+ if (CHECK(fd < 0, "create_tmp", "failed to create file: %d\n", fd)) {
+ err = fd;
+ goto done;
+ }
+ f = fdopen(fd, "w");
+ if (CHECK(f == NULL, "open_tmp", "failed to open file: %s(%d)\n",
+ strerror(errno), errno)) {
+ /* report the failure to the caller instead of returning 0 */
+ err = -1;
+ close(fd);
+ goto done;
+ }
+
+ t->opts.ctx = f;
+ err = btf_dump_all_types(btf, &t->opts);
+ /* fclose() also closes fd, which the stream owns since fdopen() */
+ fclose(f);
+ if (CHECK(err, "btf_dump", "failure during C dumping: %d\n", err)) {
+ goto done;
+ }
+
+ snprintf(test_file, sizeof(test_file), "progs/%s.c", t->file);
+ if (access(test_file, R_OK) == -1)
+ /*
+ * When the test is run with O=, kselftest copies TEST_FILES
+ * without preserving the directory structure.
+ */
+ snprintf(test_file, sizeof(test_file), "%s.c", t->file);
+ /*
+ * Diff test output and expected test output, contained between
+ * START-EXPECTED-OUTPUT and END-EXPECTED-OUTPUT lines in test case.
+ * For expected output lines, everything before '*' is stripped out.
+ * Also lines containing comment start and comment end markers are
+ * ignored.
+ */
+ snprintf(diff_cmd, sizeof(diff_cmd),
+ "awk '/START-EXPECTED-OUTPUT/{out=1;next} "
+ "/END-EXPECTED-OUTPUT/{out=0} "
+ "/\\/\\*|\\*\\//{next} " /* ignore comment start/end lines */
+ "out {sub(/^[ \\t]*\\*/, \"\"); print}' '%s' | diff -u - '%s'",
+ test_file, out_file);
+ err = system(diff_cmd);
+ if (CHECK(err, "diff",
+ "differing test output, output=%s, err=%d, diff cmd:\n%s\n",
+ out_file, err, diff_cmd))
+ goto done;
+
+ remove(out_file);
+
+done:
+ btf__free(btf);
+ return err;
+}
+
+static char *dump_buf;
+static size_t dump_buf_sz;
+static FILE *dump_buf_file;
+
+/*
+ * Check that a btf_dump instance keeps its state while the underlying BTF
+ * grows: dump an initial set of types into a memory stream, append one more
+ * struct, dump all types again and compare what the second pass emitted.
+ */
+void test_btf_dump_incremental(void)
+{
+ struct btf *btf = NULL;
+ struct btf_dump *d = NULL;
+ /* zero-initialize so no field is passed to btf_dump__new() unset */
+ struct btf_dump_opts opts = {};
+ int id, err, i;
+
+ dump_buf_file = open_memstream(&dump_buf, &dump_buf_sz);
+ if (!ASSERT_OK_PTR(dump_buf_file, "dump_memstream"))
+ return;
+ btf = btf__new_empty();
+ if (!ASSERT_OK_PTR(btf, "new_empty"))
+ goto err_out;
+ opts.ctx = dump_buf_file;
+ d = btf_dump__new(btf, NULL, &opts, btf_dump_printf);
+ if (!ASSERT_OK(libbpf_get_error(d), "btf_dump__new"))
+ goto err_out;
+
+ /* First, generate BTF corresponding to the following C code:
+ *
+ * enum { VAL = 1 };
+ *
+ * struct s { int x; };
+ *
+ */
+ id = btf__add_enum(btf, NULL, 4);
+ ASSERT_EQ(id, 1, "enum_id");
+ err = btf__add_enum_value(btf, "VAL", 1);
+ ASSERT_OK(err, "enum_val_ok");
+
+ id = btf__add_int(btf, "int", 4, BTF_INT_SIGNED);
+ ASSERT_EQ(id, 2, "int_id");
+
+ id = btf__add_struct(btf, "s", 4);
+ ASSERT_EQ(id, 3, "struct_id");
+ err = btf__add_field(btf, "x", 2, 0, 0);
+ ASSERT_OK(err, "field_ok");
+
+ for (i = 1; i <= btf__get_nr_types(btf); i++) {
+ err = btf_dump__dump_type(d, i);
+ ASSERT_OK(err, "dump_type_ok");
+ }
+
+ fflush(dump_buf_file);
+ dump_buf[dump_buf_sz] = 0; /* some libc implementations don't do this */
+ ASSERT_STREQ(dump_buf,
+"enum {\n"
+" VAL = 1,\n"
+"};\n"
+"\n"
+"struct s {\n"
+" int x;\n"
+"};\n\n", "c_dump1");
+
+ /* Now, after dumping original BTF, append another struct that embeds
+ * anonymous enum. It also has a name conflict with the first struct:
+ *
+ * struct s___2 {
+ * enum { VAL___2 = 1 } x;
+ * struct s s;
+ * };
+ *
+ * This will test that btf_dump'er maintains internal state properly.
+ * Note that VAL___2 enum value. It's because we've already emitted
+ * that enum as a global anonymous enum, so btf_dump will ensure that
+ * enum values don't conflict;
+ *
+ */
+ /* rewind so the buffer only holds what the second pass emits */
+ fseek(dump_buf_file, 0, SEEK_SET);
+
+ id = btf__add_struct(btf, "s", 4);
+ ASSERT_EQ(id, 4, "struct_id");
+ err = btf__add_field(btf, "x", 1, 0, 0);
+ ASSERT_OK(err, "field_ok");
+ err = btf__add_field(btf, "s", 3, 32, 0);
+ ASSERT_OK(err, "field_ok");
+
+ for (i = 1; i <= btf__get_nr_types(btf); i++) {
+ err = btf_dump__dump_type(d, i);
+ ASSERT_OK(err, "dump_type_ok");
+ }
+
+ fflush(dump_buf_file);
+ dump_buf[dump_buf_sz] = 0; /* some libc implementations don't do this */
+ /* distinct label so a failure here is not mistaken for the first dump */
+ ASSERT_STREQ(dump_buf,
+"struct s___2 {\n"
+" enum {\n"
+" VAL___2 = 1,\n"
+" } x;\n"
+" struct s s;\n"
+"};\n\n" , "c_dump2");
+
+err_out:
+ fclose(dump_buf_file);
+ free(dump_buf);
+ btf_dump__free(d);
+ btf__free(btf);
+}
+
+/* Entry point: one subtest per table entry, plus the incremental subtest. */
+void test_btf_dump() {
+ int idx;
+
+ for (idx = 0; idx < ARRAY_SIZE(btf_dump_test_cases); idx++) {
+ struct btf_dump_test_case *tc = &btf_dump_test_cases[idx];
+
+ /* only run the cases the subtest filter lets through */
+ if (test__start_subtest(tc->name))
+ test_btf_dump_case(idx, tc);
+ }
+ if (test__start_subtest("btf_dump: incremental"))
+ test_btf_dump_incremental();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_endian.c b/tools/testing/selftests/bpf/prog_tests/btf_endian.c
new file mode 100644
index 000000000..8c52d72c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_endian.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <string.h>
+#include <byteswap.h>
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+static int duration = 0;
+
+/*
+ * Exercise BTF endianness switching: load native BTF, flip its endianness,
+ * re-parse the raw bytes as a new BTF object and check that both objects
+ * agree; then add a variable to the original BTF and check that it reads
+ * back correctly from a re-parsed, byte-swapped copy.
+ */
+void test_btf_endian() {
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ enum btf_endianness endian = BTF_LITTLE_ENDIAN;
+#elif __BYTE_ORDER == __BIG_ENDIAN
+ enum btf_endianness endian = BTF_BIG_ENDIAN;
+#else
+#error "Unrecognized __BYTE_ORDER"
+#endif
+ enum btf_endianness swap_endian = 1 - endian;
+ struct btf *btf = NULL, *swap_btf = NULL;
+ const void *raw_data, *swap_raw_data;
+ const struct btf_type *t;
+ const struct btf_header *hdr;
+ __u32 raw_sz, swap_raw_sz;
+ int var_id;
+
+ /* Load BTF in native endianness */
+ btf = btf__parse_elf("btf_dump_test_case_syntax.o", NULL);
+ if (!ASSERT_OK_PTR(btf, "parse_native_btf"))
+ goto err_out;
+
+ ASSERT_EQ(btf__endianness(btf), endian, "endian");
+ btf__set_endianness(btf, swap_endian);
+ ASSERT_EQ(btf__endianness(btf), swap_endian, "endian");
+
+ /* Get raw BTF data in non-native endianness... */
+ raw_data = btf__get_raw_data(btf, &raw_sz);
+ if (!ASSERT_OK_PTR(raw_data, "raw_data_inverted"))
+ goto err_out;
+
+ /* ...and open it as a new BTF instance */
+ swap_btf = btf__new(raw_data, raw_sz);
+ if (!ASSERT_OK_PTR(swap_btf, "parse_swap_btf"))
+ goto err_out;
+
+ ASSERT_EQ(btf__endianness(swap_btf), swap_endian, "endian");
+ ASSERT_EQ(btf__get_nr_types(swap_btf), btf__get_nr_types(btf), "nr_types");
+
+ swap_raw_data = btf__get_raw_data(swap_btf, &swap_raw_sz);
+ if (!ASSERT_OK_PTR(swap_raw_data, "swap_raw_data"))
+ goto err_out;
+
+ /* sizes must agree before raw_sz bytes of both buffers are compared */
+ if (!ASSERT_EQ(raw_sz, swap_raw_sz, "raw_sizes"))
+ goto err_out;
+
+ /* both raw data should be identical (with non-native endianness) */
+ ASSERT_OK(memcmp(raw_data, swap_raw_data, raw_sz), "mem_identical");
+
+ /* make sure that at least BTF header data is really swapped */
+ hdr = swap_raw_data;
+ ASSERT_EQ(bswap_16(hdr->magic), BTF_MAGIC, "btf_magic_swapped");
+
+ /* swap it back to native endianness */
+ btf__set_endianness(swap_btf, endian);
+ swap_raw_data = btf__get_raw_data(swap_btf, &swap_raw_sz);
+ if (!ASSERT_OK_PTR(swap_raw_data, "swap_raw_data"))
+ goto err_out;
+
+ /* now header should have native BTF_MAGIC */
+ hdr = swap_raw_data;
+ ASSERT_EQ(hdr->magic, BTF_MAGIC, "btf_magic_native");
+ ASSERT_EQ(raw_sz, swap_raw_sz, "raw_sizes");
+
+ /* now modify original BTF */
+ var_id = btf__add_var(btf, "some_var", BTF_VAR_GLOBAL_ALLOCATED, 1);
+ /* without a valid ID the lookup further down cannot succeed */
+ if (CHECK(var_id <= 0, "var_id", "failed %d\n", var_id))
+ goto err_out;
+
+ btf__free(swap_btf);
+ swap_btf = NULL;
+
+ btf__set_endianness(btf, swap_endian);
+ raw_data = btf__get_raw_data(btf, &raw_sz);
+ if (!ASSERT_OK_PTR(raw_data, "raw_data_inverted"))
+ goto err_out;
+
+ /* and re-open swapped raw data again */
+ swap_btf = btf__new(raw_data, raw_sz);
+ if (!ASSERT_OK_PTR(swap_btf, "parse_swap_btf"))
+ goto err_out;
+
+ ASSERT_EQ(btf__endianness(swap_btf), swap_endian, "endian");
+ ASSERT_EQ(btf__get_nr_types(swap_btf), btf__get_nr_types(btf), "nr_types");
+
+ /* the type should appear as if it was stored in native endianness */
+ t = btf__type_by_id(swap_btf, var_id);
+ /* do not dereference a missing type; fail the test instead */
+ if (!ASSERT_OK_PTR(t, "var_by_id"))
+ goto err_out;
+ ASSERT_STREQ(btf__str_by_offset(swap_btf, t->name_off), "some_var", "var_name");
+ ASSERT_EQ(btf_var(t)->linkage, BTF_VAR_GLOBAL_ALLOCATED, "var_linkage");
+ ASSERT_EQ(t->type, 1, "var_type");
+
+err_out:
+ btf__free(btf);
+ btf__free(swap_btf);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
new file mode 100644
index 000000000..76ebe4c25
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_map_in_map.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+
+#include "test_btf_map_in_map.skel.h"
+
+static int duration;
+
+/* Return the ID reported for @map's FD, or 0 if the info query fails. */
+static __u32 bpf_map_id(struct bpf_map *map)
+{
+ struct bpf_map_info map_info;
+ __u32 len = sizeof(map_info);
+
+ memset(&map_info, 0, len);
+ if (bpf_obj_get_info_by_fd(bpf_map__fd(map), &map_info, &len))
+ return 0;
+
+ return map_info.id;
+}
+
+/*
+ * Trigger synchronize_rcu() in kernel.
+ *
+ * ARRAY_OF_MAPS/HASH_OF_MAPS lookup/update operations trigger synchronize_rcu()
+ * if looking up an existing non-NULL element or updating the map with a valid
+ * inner map FD. Use this fact to trigger synchronize_rcu(): create map-in-map,
+ * create a trivial ARRAY map, update map-in-map with ARRAY inner map. Then
+ * cleanup. At the end, at least one synchronize_rcu() would be called.
+ */
+static int kern_sync_rcu(void)
+{
+ int inner_fd, outer_fd, ret, key = 0;
+
+ inner_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 4, 1, 0);
+ if (CHECK(inner_fd < 0, "inner_map_create", "failed %d\n", -errno))
+ return -1;
+
+ outer_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL,
+ sizeof(int), inner_fd, 1, 0);
+ if (CHECK(outer_fd < 0, "outer_map_create", "failed %d\n", -errno)) {
+ close(inner_fd);
+ return -1;
+ }
+
+ /* store the inner map in the outer one; report -errno on failure */
+ ret = bpf_map_update_elem(outer_fd, &key, &inner_fd, 0) ? -errno : 0;
+ CHECK(ret, "outer_map_update", "failed %d\n", ret);
+
+ close(inner_fd);
+ close(outer_fd);
+ return ret;
+}
+
+/*
+ * Swap inner maps in and out of the skeleton's three outer maps, check the
+ * values found in the inner maps after each swap, then destroy the skeleton
+ * and verify that inner_map1 and inner_map2 can no longer be opened by ID.
+ */
+static void test_lookup_update(void)
+{
+ int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id;
+ int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd;
+ struct test_btf_map_in_map *skel;
+ int err, key = 0, val, i, fd;
+
+ skel = test_btf_map_in_map__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
+ return;
+
+ err = test_btf_map_in_map__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+
+ map1_fd = bpf_map__fd(skel->maps.inner_map1);
+ map2_fd = bpf_map__fd(skel->maps.inner_map2);
+ map3_fd = bpf_map__fd(skel->maps.inner_map3);
+ map4_fd = bpf_map__fd(skel->maps.inner_map4);
+ map5_fd = bpf_map__fd(skel->maps.inner_map5);
+ outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn);
+ outer_arr_fd = bpf_map__fd(skel->maps.outer_arr);
+ outer_hash_fd = bpf_map__fd(skel->maps.outer_hash);
+
+ /* inner1 = input, inner2 = input + 1, inner3 = input + 2 */
+ bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0);
+ bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0);
+ bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0);
+ skel->bss->input = 1;
+ /* NOTE(review): presumably usleep() triggers the attached BPF program,
+ * which fills the inner maps - confirm against the BPF source
+ */
+ usleep(1);
+ bpf_map_lookup_elem(map1_fd, &key, &val);
+ CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1);
+ bpf_map_lookup_elem(map2_fd, &key, &val);
+ CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2);
+ bpf_map_lookup_elem(map3_fd, &key, &val);
+ CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3);
+
+ /* inner2 = input, inner1 = input + 1, inner4 = input + 2 */
+ bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0);
+ bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0);
+ bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0);
+ skel->bss->input = 3;
+ usleep(1);
+ bpf_map_lookup_elem(map1_fd, &key, &val);
+ CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4);
+ bpf_map_lookup_elem(map2_fd, &key, &val);
+ CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3);
+ bpf_map_lookup_elem(map4_fd, &key, &val);
+ CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5);
+
+ /* inner5 = input + 2 */
+ bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0);
+ skel->bss->input = 5;
+ usleep(1);
+ bpf_map_lookup_elem(map5_fd, &key, &val);
+ CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7);
+
+ /* repeatedly replace the inner map of each outer map; val is reused
+ * here to hold the inner map FD being installed
+ */
+ for (i = 0; i < 5; i++) {
+ val = i % 2 ? map1_fd : map2_fd;
+ err = bpf_map_update_elem(outer_hash_fd, &key, &val, 0);
+ if (CHECK_FAIL(err)) {
+ printf("failed to update hash_of_maps on iter #%d\n", i);
+ goto cleanup;
+ }
+ err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0);
+ if (CHECK_FAIL(err)) {
+ printf("failed to update array_of_maps on iter #%d\n", i);
+ goto cleanup;
+ }
+ val = i % 2 ? map4_fd : map5_fd;
+ err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0);
+ if (CHECK_FAIL(err)) {
+ printf("failed to update array_of_maps (dyn) on iter #%d\n", i);
+ goto cleanup;
+ }
+ }
+
+ /* remember the IDs now; they are looked up after the skeleton is gone */
+ map1_id = bpf_map_id(skel->maps.inner_map1);
+ map2_id = bpf_map_id(skel->maps.inner_map2);
+ CHECK(map1_id == 0, "map1_id", "failed to get ID 1\n");
+ CHECK(map2_id == 0, "map2_id", "failed to get ID 2\n");
+
+ test_btf_map_in_map__destroy(skel);
+ /* cleared so the destroy call at the cleanup label gets NULL */
+ skel = NULL;
+
+ /* we need to either wait for or force synchronize_rcu(), before
+ * checking for "still exists" condition, otherwise map could still be
+ * resolvable by ID, causing false positives.
+ *
+ * Older kernels (5.8 and earlier) freed map only after two
+ * synchronize_rcu()s, so trigger two, to be entirely sure.
+ */
+ CHECK(kern_sync_rcu(), "sync_rcu", "failed\n");
+ CHECK(kern_sync_rcu(), "sync_rcu", "failed\n");
+
+ /* a valid FD here means the map still exists, which is the failure */
+ fd = bpf_map_get_fd_by_id(map1_id);
+ if (CHECK(fd >= 0, "map1_leak", "inner_map1 leaked!\n")) {
+ close(fd);
+ goto cleanup;
+ }
+ fd = bpf_map_get_fd_by_id(map2_id);
+ if (CHECK(fd >= 0, "map2_leak", "inner_map2 leaked!\n")) {
+ close(fd);
+ goto cleanup;
+ }
+
+cleanup:
+ test_btf_map_in_map__destroy(skel);
+}
+
+/*
+ * Try to install inner maps taken from sockarr_sz2 and inner_map_sz2 into
+ * outer_sockarr and outer_arr respectively: the first update is expected to
+ * succeed, the second one to fail.
+ */
+static void test_diff_size(void)
+{
+ struct test_btf_map_in_map *skel;
+ int ret, inner_fd, outer_fd, key = 0;
+
+ skel = test_btf_map_in_map__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
+ return;
+
+ /* outer_sockarr must accept sockarr_sz2 */
+ inner_fd = bpf_map__fd(skel->maps.sockarr_sz2);
+ outer_fd = bpf_map__fd(skel->maps.outer_sockarr);
+ ret = bpf_map_update_elem(outer_fd, &key, &inner_fd, 0);
+ CHECK(ret, "outer_sockarr inner map size check",
+ "cannot use a different size inner_map\n");
+
+ /* outer_arr must reject inner_map_sz2 */
+ inner_fd = bpf_map__fd(skel->maps.inner_map_sz2);
+ outer_fd = bpf_map__fd(skel->maps.outer_arr);
+ ret = bpf_map_update_elem(outer_fd, &key, &inner_fd, 0);
+ CHECK(!ret, "outer_arr inner map size check",
+ "incorrectly updated with a different size inner_map\n");
+
+ test_btf_map_in_map__destroy(skel);
+}
+
+/* Entry point: run each subtest that test__start_subtest() selects. */
+void test_btf_map_in_map(void)
+{
+ if (test__start_subtest("lookup_update"))
+ test_lookup_update();
+
+ if (test__start_subtest("diff_size"))
+ test_diff_size();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
new file mode 100644
index 000000000..d16fd8882
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_skc_cls_ingress.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#define _GNU_SOURCE
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sched.h>
+#include <linux/compiler.h>
+#include <bpf/libbpf.h>
+
+#include "network_helpers.h"
+#include "test_progs.h"
+#include "test_btf_skc_cls_ingress.skel.h"
+
+static struct test_btf_skc_cls_ingress *skel;
+struct sockaddr_in6 srv_sa6;
+static __u32 duration;
+
+#define PROG_PIN_FILE "/sys/fs/bpf/btf_skc_cls_ingress"
+
+/*
+ * Write the string @value to the file at path @sysctl.
+ * Returns 0 when the whole string was written, -1 if the file could not be
+ * opened or the write was short or failed.
+ */
+static int write_sysctl(const char *sysctl, const char *value)
+{
+ int fd, err, len, saved_errno;
+
+ fd = open(sysctl, O_WRONLY);
+ if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n",
+ sysctl, strerror(errno), errno))
+ return -1;
+
+ len = strlen(value);
+ err = write(fd, value, len);
+ /* capture errno from write() before close() can overwrite it */
+ saved_errno = errno;
+ close(fd);
+ if (CHECK(err != len, "write sysctl",
+ "write(%s, %s, %d): err:%d %s (%d)\n",
+ sysctl, value, len, err, strerror(saved_errno),
+ saved_errno))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Move the process into a new network namespace, bring up "lo", attach the
+ * pinned classifier program at lo's ingress and enable the TCP options the
+ * test relies on. Returns 0 on success, -1 on the first failing step.
+ */
+static int prepare_netns(void)
+{
+ /* message now ends in a newline, like every other CHECK in this file */
+ if (CHECK(unshare(CLONE_NEWNET), "create netns",
+ "unshare(CLONE_NEWNET): %s (%d)\n",
+ strerror(errno), errno))
+ return -1;
+
+ if (CHECK(system("ip link set dev lo up"),
+ "ip link set dev lo up", "failed\n"))
+ return -1;
+
+ if (CHECK(system("tc qdisc add dev lo clsact"),
+ "tc qdisc add dev lo clsact", "failed\n"))
+ return -1;
+
+ if (CHECK(system("tc filter add dev lo ingress bpf direct-action object-pinned " PROG_PIN_FILE),
+ "install tc cls-prog at ingress", "failed\n"))
+ return -1;
+
+ /* Ensure 20 bytes options (i.e. in total 40 bytes tcp header) for the
+ * bpf_tcp_gen_syncookie() helper.
+ */
+ if (write_sysctl("/proc/sys/net/ipv4/tcp_window_scaling", "1") ||
+ write_sysctl("/proc/sys/net/ipv4/tcp_timestamps", "1") ||
+ write_sysctl("/proc/sys/net/ipv4/tcp_sack", "1"))
+ return -1;
+
+ return 0;
+}
+
+/* Zero every result field in the skeleton's bss between subtests. */
+static void reset_test(void)
+{
+ skel->bss->linum = 0;
+ skel->bss->gen_cookie = 0;
+ skel->bss->recv_cookie = 0;
+ skel->bss->req_sk_sport = 0;
+ skel->bss->listen_tp_sport = 0;
+ memset(&skel->bss->srv_sa6, 0, sizeof(skel->bss->srv_sa6));
+}
+
+/* Print the line number stored in bss->linum, if it is non-zero. */
+static void print_err_line(void)
+{
+ if (!skel->bss->linum)
+ return;
+
+ printf("bpf prog error at line %u\n", skel->bss->linum);
+}
+
+/*
+ * With tcp_syncookies set to "1", connect to a local IPv6 listener and check
+ * that both listen_tp_sport and req_sk_sport in bss equal the server port,
+ * that no cookie was recorded and that no error line was reported.
+ */
+static void test_conn(void)
+{
+ int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
+ socklen_t addrlen = sizeof(srv_sa6);
+ int srv_port;
+
+ if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1"))
+ return;
+
+ listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+ if (CHECK_FAIL(listen_fd == -1))
+ return;
+
+ err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
+ if (CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d\n", err,
+ errno))
+ goto done;
+ /* hand the listener's address to the BPF program through bss */
+ memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
+ srv_port = ntohs(srv_sa6.sin6_port);
+
+ cli_fd = connect_to_fd(listen_fd, 0);
+ if (CHECK_FAIL(cli_fd == -1))
+ goto done;
+
+ srv_fd = accept(listen_fd, NULL, NULL);
+ if (CHECK_FAIL(srv_fd == -1))
+ goto done;
+
+ if (CHECK(skel->bss->listen_tp_sport != srv_port ||
+ skel->bss->req_sk_sport != srv_port,
+ "Unexpected sk src port",
+ "listen_tp_sport:%u req_sk_sport:%u expected:%u\n",
+ skel->bss->listen_tp_sport, skel->bss->req_sk_sport,
+ srv_port))
+ goto done;
+
+ /* any recorded cookie is treated as a failure in this mode */
+ if (CHECK(skel->bss->gen_cookie || skel->bss->recv_cookie,
+ "Unexpected syncookie states",
+ "gen_cookie:%u recv_cookie:%u\n",
+ skel->bss->gen_cookie, skel->bss->recv_cookie))
+ goto done;
+
+ CHECK(skel->bss->linum, "bpf prog detected error", "at line %u\n",
+ skel->bss->linum);
+
+done:
+ /* close only the descriptors that were actually opened */
+ if (listen_fd != -1)
+ close(listen_fd);
+ if (cli_fd != -1)
+ close(cli_fd);
+ if (srv_fd != -1)
+ close(srv_fd);
+}
+
+/*
+ * With tcp_syncookies set to "2", connect to a local IPv6 listener and check
+ * that listen_tp_sport equals the server port, that req_sk_sport stayed 0,
+ * that gen_cookie is non-zero and equal to recv_cookie, and that no error
+ * line was reported.
+ */
+static void test_syncookie(void)
+{
+ int listen_fd = -1, cli_fd = -1, srv_fd = -1, err;
+ socklen_t addrlen = sizeof(srv_sa6);
+ int srv_port;
+
+ /* Enforce syncookie mode */
+ if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "2"))
+ return;
+
+ listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+ if (CHECK_FAIL(listen_fd == -1))
+ return;
+
+ err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
+ if (CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d\n", err,
+ errno))
+ goto done;
+ /* hand the listener's address to the BPF program through bss */
+ memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
+ srv_port = ntohs(srv_sa6.sin6_port);
+
+ cli_fd = connect_to_fd(listen_fd, 0);
+ if (CHECK_FAIL(cli_fd == -1))
+ goto done;
+
+ srv_fd = accept(listen_fd, NULL, NULL);
+ if (CHECK_FAIL(srv_fd == -1))
+ goto done;
+
+ if (CHECK(skel->bss->listen_tp_sport != srv_port,
+ "Unexpected tp src port",
+ "listen_tp_sport:%u expected:%u\n",
+ skel->bss->listen_tp_sport, srv_port))
+ goto done;
+
+ if (CHECK(skel->bss->req_sk_sport,
+ "Unexpected req_sk src port",
+ "req_sk_sport:%u expected:0\n",
+ skel->bss->req_sk_sport))
+ goto done;
+
+ /* a cookie must have been generated and the same one received back */
+ if (CHECK(!skel->bss->gen_cookie ||
+ skel->bss->gen_cookie != skel->bss->recv_cookie,
+ "Unexpected syncookie states",
+ "gen_cookie:%u recv_cookie:%u\n",
+ skel->bss->gen_cookie, skel->bss->recv_cookie))
+ goto done;
+
+ CHECK(skel->bss->linum, "bpf prog detected error", "at line %u\n",
+ skel->bss->linum);
+
+done:
+ /* close only the descriptors that were actually opened */
+ if (listen_fd != -1)
+ close(listen_fd);
+ if (cli_fd != -1)
+ close(cli_fd);
+ if (srv_fd != -1)
+ close(srv_fd);
+}
+
+/* One subtest: its name and the function that runs it. */
+struct test {
+ const char *desc;
+ void (*run)(void);
+};
+
+/* DEF_TEST(x) expands to { "x", test_x } */
+#define DEF_TEST(name) { #name, test_##name }
+static struct test tests[] = {
+ DEF_TEST(conn),
+ DEF_TEST(syncookie),
+};
+
+/*
+ * Entry point: load the skeleton, pin its cls_ingress program at
+ * PROG_PIN_FILE, run every selected subtest after prepare_netns(), then
+ * unpin the program and destroy the skeleton.
+ */
+void test_btf_skc_cls_ingress(void)
+{
+ int idx, ret;
+
+ skel = test_btf_skc_cls_ingress__open_and_load();
+ if (CHECK(!skel, "test_btf_skc_cls_ingress__open_and_load", "failed\n"))
+ return;
+
+ ret = bpf_program__pin(skel->progs.cls_ingress, PROG_PIN_FILE);
+ if (CHECK(ret, "bpf_program__pin",
+ "cannot pin bpf prog to %s. err:%d\n", PROG_PIN_FILE, ret))
+ goto out_destroy;
+
+ for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+ if (!test__start_subtest(tests[idx].desc))
+ continue;
+
+ /* a failed netns setup ends the whole run */
+ if (prepare_netns())
+ break;
+
+ tests[idx].run();
+
+ print_err_line();
+ reset_test();
+ }
+
+ bpf_program__unpin(skel->progs.cls_ingress, PROG_PIN_FILE);
+out_destroy:
+ test_btf_skc_cls_ingress__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/btf_write.c b/tools/testing/selftests/bpf/prog_tests/btf_write.c
new file mode 100644
index 000000000..314e1e7c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/btf_write.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+static int duration = 0;
+
+/*
+ * Build a BTF object from scratch with the btf__add_*() API, one type kind at
+ * a time, and after each addition read the type back and check its fields.
+ * Type IDs are assigned sequentially, so the expected IDs below depend on the
+ * order of the additions.
+ */
+void test_btf_write() {
+ const struct btf_var_secinfo *vi;
+ const struct btf_type *t;
+ const struct btf_member *m;
+ const struct btf_enum *v;
+ const struct btf_param *p;
+ struct btf *btf;
+ int id, err, str_off;
+
+ btf = btf__new_empty();
+ if (CHECK(IS_ERR(btf), "new_empty", "failed: %ld\n", PTR_ERR(btf)))
+ return;
+
+ str_off = btf__find_str(btf, "int");
+ ASSERT_EQ(str_off, -ENOENT, "int_str_missing_off");
+
+ str_off = btf__add_str(btf, "int");
+ ASSERT_EQ(str_off, 1, "int_str_off");
+
+ str_off = btf__find_str(btf, "int");
+ ASSERT_EQ(str_off, 1, "int_str_found_off");
+
+ /* BTF_KIND_INT */
+ id = btf__add_int(btf, "int", 4, BTF_INT_SIGNED);
+ ASSERT_EQ(id, 1, "int_id");
+
+ t = btf__type_by_id(btf, 1);
+ /* should re-use previously added "int" string */
+ ASSERT_EQ(t->name_off, str_off, "int_name_off");
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "int", "int_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_INT, "int_kind");
+ ASSERT_EQ(t->size, 4, "int_sz");
+ ASSERT_EQ(btf_int_encoding(t), BTF_INT_SIGNED, "int_enc");
+ ASSERT_EQ(btf_int_bits(t), 32, "int_bits");
+
+ /* invalid int size */
+ id = btf__add_int(btf, "bad sz int", 7, 0);
+ ASSERT_ERR(id, "int_bad_sz");
+ /* invalid encoding */
+ id = btf__add_int(btf, "bad enc int", 4, 123);
+ ASSERT_ERR(id, "int_bad_enc");
+ /* NULL name */
+ id = btf__add_int(btf, NULL, 4, 0);
+ ASSERT_ERR(id, "int_bad_null_name");
+ /* empty name */
+ id = btf__add_int(btf, "", 4, 0);
+ ASSERT_ERR(id, "int_bad_empty_name");
+
+ /* PTR/CONST/VOLATILE/RESTRICT */
+ id = btf__add_ptr(btf, 1);
+ ASSERT_EQ(id, 2, "ptr_id");
+ t = btf__type_by_id(btf, 2);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_PTR, "ptr_kind");
+ ASSERT_EQ(t->type, 1, "ptr_type");
+
+ id = btf__add_const(btf, 5); /* points forward to restrict */
+ ASSERT_EQ(id, 3, "const_id");
+ t = btf__type_by_id(btf, 3);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_CONST, "const_kind");
+ ASSERT_EQ(t->type, 5, "const_type");
+
+ id = btf__add_volatile(btf, 3);
+ ASSERT_EQ(id, 4, "volatile_id");
+ t = btf__type_by_id(btf, 4);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_VOLATILE, "volatile_kind");
+ ASSERT_EQ(t->type, 3, "volatile_type");
+
+ id = btf__add_restrict(btf, 4);
+ ASSERT_EQ(id, 5, "restrict_id");
+ t = btf__type_by_id(btf, 5);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_RESTRICT, "restrict_kind");
+ ASSERT_EQ(t->type, 4, "restrict_type");
+
+ /* ARRAY */
+ id = btf__add_array(btf, 1, 2, 10); /* int *[10] */
+ ASSERT_EQ(id, 6, "array_id");
+ t = btf__type_by_id(btf, 6);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_ARRAY, "array_kind");
+ ASSERT_EQ(btf_array(t)->index_type, 1, "array_index_type");
+ ASSERT_EQ(btf_array(t)->type, 2, "array_elem_type");
+ ASSERT_EQ(btf_array(t)->nelems, 10, "array_nelems");
+
+ /* STRUCT */
+ /* adding a field before any struct/union exists must fail */
+ err = btf__add_field(btf, "field", 1, 0, 0);
+ ASSERT_ERR(err, "no_struct_field");
+ id = btf__add_struct(btf, "s1", 8);
+ ASSERT_EQ(id, 7, "struct_id");
+ err = btf__add_field(btf, "f1", 1, 0, 0);
+ ASSERT_OK(err, "f1_res");
+ err = btf__add_field(btf, "f2", 1, 32, 16);
+ ASSERT_OK(err, "f2_res");
+
+ t = btf__type_by_id(btf, 7);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "s1", "struct_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_STRUCT, "struct_kind");
+ ASSERT_EQ(btf_vlen(t), 2, "struct_vlen");
+ ASSERT_EQ(btf_kflag(t), true, "struct_kflag");
+ ASSERT_EQ(t->size, 8, "struct_sz");
+ m = btf_members(t) + 0;
+ ASSERT_STREQ(btf__str_by_offset(btf, m->name_off), "f1", "f1_name");
+ ASSERT_EQ(m->type, 1, "f1_type");
+ ASSERT_EQ(btf_member_bit_offset(t, 0), 0, "f1_bit_off");
+ ASSERT_EQ(btf_member_bitfield_size(t, 0), 0, "f1_bit_sz");
+ m = btf_members(t) + 1;
+ ASSERT_STREQ(btf__str_by_offset(btf, m->name_off), "f2", "f2_name");
+ ASSERT_EQ(m->type, 1, "f2_type");
+ ASSERT_EQ(btf_member_bit_offset(t, 1), 32, "f2_bit_off");
+ ASSERT_EQ(btf_member_bitfield_size(t, 1), 16, "f2_bit_sz");
+
+ /* UNION */
+ id = btf__add_union(btf, "u1", 8);
+ ASSERT_EQ(id, 8, "union_id");
+
+ /* invalid, non-zero offset */
+ err = btf__add_field(btf, "field", 1, 1, 0);
+ ASSERT_ERR(err, "union_bad_field_off");
+
+ err = btf__add_field(btf, "f1", 1, 0, 16);
+ ASSERT_OK(err, "f1_res");
+
+ t = btf__type_by_id(btf, 8);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "u1", "union_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_UNION, "union_kind");
+ ASSERT_EQ(btf_vlen(t), 1, "union_vlen");
+ ASSERT_EQ(btf_kflag(t), true, "union_kflag");
+ ASSERT_EQ(t->size, 8, "union_sz");
+ m = btf_members(t) + 0;
+ ASSERT_STREQ(btf__str_by_offset(btf, m->name_off), "f1", "f1_name");
+ ASSERT_EQ(m->type, 1, "f1_type");
+ ASSERT_EQ(btf_member_bit_offset(t, 0), 0, "f1_bit_off");
+ ASSERT_EQ(btf_member_bitfield_size(t, 0), 16, "f1_bit_sz");
+
+ /* ENUM */
+ id = btf__add_enum(btf, "e1", 4);
+ ASSERT_EQ(id, 9, "enum_id");
+ err = btf__add_enum_value(btf, "v1", 1);
+ ASSERT_OK(err, "v1_res");
+ err = btf__add_enum_value(btf, "v2", 2);
+ ASSERT_OK(err, "v2_res");
+
+ t = btf__type_by_id(btf, 9);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "e1", "enum_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM, "enum_kind");
+ ASSERT_EQ(btf_vlen(t), 2, "enum_vlen");
+ ASSERT_EQ(t->size, 4, "enum_sz");
+ v = btf_enum(t) + 0;
+ ASSERT_STREQ(btf__str_by_offset(btf, v->name_off), "v1", "v1_name");
+ ASSERT_EQ(v->val, 1, "v1_val");
+ v = btf_enum(t) + 1;
+ ASSERT_STREQ(btf__str_by_offset(btf, v->name_off), "v2", "v2_name");
+ ASSERT_EQ(v->val, 2, "v2_val");
+
+ /* FWDs */
+ id = btf__add_fwd(btf, "struct_fwd", BTF_FWD_STRUCT);
+ ASSERT_EQ(id, 10, "struct_fwd_id");
+ t = btf__type_by_id(btf, 10);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "struct_fwd", "fwd_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind");
+ ASSERT_EQ(btf_kflag(t), 0, "fwd_kflag");
+
+ id = btf__add_fwd(btf, "union_fwd", BTF_FWD_UNION);
+ ASSERT_EQ(id, 11, "union_fwd_id");
+ t = btf__type_by_id(btf, 11);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "union_fwd", "fwd_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_FWD, "fwd_kind");
+ ASSERT_EQ(btf_kflag(t), 1, "fwd_kflag");
+
+ /* the enum forward comes back as BTF_KIND_ENUM with no values */
+ id = btf__add_fwd(btf, "enum_fwd", BTF_FWD_ENUM);
+ ASSERT_EQ(id, 12, "enum_fwd_id");
+ t = btf__type_by_id(btf, 12);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "enum_fwd", "fwd_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_ENUM, "enum_fwd_kind");
+ ASSERT_EQ(btf_vlen(t), 0, "enum_fwd_vlen");
+ ASSERT_EQ(t->size, 4, "enum_fwd_sz");
+
+ /* TYPEDEF */
+ id = btf__add_typedef(btf, "typedef1", 1);
+ ASSERT_EQ(id, 13, "typedef_id");
+ t = btf__type_by_id(btf, 13);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "typedef1", "typedef_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_TYPEDEF, "typedef_kind");
+ ASSERT_EQ(t->type, 1, "typedef_type");
+
+ /* FUNC & FUNC_PROTO */
+ /* the func refers to type 15, the prototype that is added right after */
+ id = btf__add_func(btf, "func1", BTF_FUNC_GLOBAL, 15);
+ ASSERT_EQ(id, 14, "func_id");
+ t = btf__type_by_id(btf, 14);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "func1", "func_name");
+ ASSERT_EQ(t->type, 15, "func_type");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_FUNC, "func_kind");
+ ASSERT_EQ(btf_vlen(t), BTF_FUNC_GLOBAL, "func_vlen");
+
+ id = btf__add_func_proto(btf, 1);
+ ASSERT_EQ(id, 15, "func_proto_id");
+ err = btf__add_func_param(btf, "p1", 1);
+ ASSERT_OK(err, "p1_res");
+ err = btf__add_func_param(btf, "p2", 2);
+ ASSERT_OK(err, "p2_res");
+
+ t = btf__type_by_id(btf, 15);
+ ASSERT_EQ(btf_kind(t), BTF_KIND_FUNC_PROTO, "func_proto_kind");
+ ASSERT_EQ(btf_vlen(t), 2, "func_proto_vlen");
+ ASSERT_EQ(t->type, 1, "func_proto_ret_type");
+ p = btf_params(t) + 0;
+ ASSERT_STREQ(btf__str_by_offset(btf, p->name_off), "p1", "p1_name");
+ ASSERT_EQ(p->type, 1, "p1_type");
+ p = btf_params(t) + 1;
+ ASSERT_STREQ(btf__str_by_offset(btf, p->name_off), "p2", "p2_name");
+ ASSERT_EQ(p->type, 2, "p2_type");
+
+ /* VAR */
+ id = btf__add_var(btf, "var1", BTF_VAR_GLOBAL_ALLOCATED, 1);
+ ASSERT_EQ(id, 16, "var_id");
+ t = btf__type_by_id(btf, 16);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "var1", "var_name");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_VAR, "var_kind");
+ ASSERT_EQ(t->type, 1, "var_type");
+ ASSERT_EQ(btf_var(t)->linkage, BTF_VAR_GLOBAL_ALLOCATED, "var_linkage");
+
+ /* DATASECT */
+ id = btf__add_datasec(btf, "datasec1", 12);
+ ASSERT_EQ(id, 17, "datasec_id");
+ err = btf__add_datasec_var_info(btf, 1, 4, 8);
+ ASSERT_OK(err, "v1_res");
+
+ t = btf__type_by_id(btf, 17);
+ ASSERT_STREQ(btf__str_by_offset(btf, t->name_off), "datasec1", "datasec_name");
+ ASSERT_EQ(t->size, 12, "datasec_sz");
+ ASSERT_EQ(btf_kind(t), BTF_KIND_DATASEC, "datasec_kind");
+ ASSERT_EQ(btf_vlen(t), 1, "datasec_vlen");
+ vi = btf_var_secinfos(t) + 0;
+ ASSERT_EQ(vi->type, 1, "v1_type");
+ ASSERT_EQ(vi->offset, 4, "v1_off");
+ ASSERT_EQ(vi->size, 8, "v1_sz");
+
+ btf__free(btf);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
new file mode 100644
index 000000000..48dc5827d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include <network_helpers.h>
+
+#include "progs/cg_storage_multi.h"
+
+#include "cg_storage_multi_egress_only.skel.h"
+#include "cg_storage_multi_isolated.skel.h"
+#include "cg_storage_multi_shared.skel.h"
+
+#define PARENT_CGROUP "/cgroup_storage"
+#define CHILD_CGROUP "/cgroup_storage/child"
+
+static int duration;
+
+/* Look up @key in @map and compare the stored value with @expected.
+ * Returns true when the lookup fails or the values differ, false otherwise.
+ */
+static bool assert_storage(struct bpf_map *map, const void *key,
+			   struct cgroup_value *expected)
+{
+	struct cgroup_value actual;
+	int fd = bpf_map__fd(map);
+
+	if (CHECK(bpf_map_lookup_elem(fd, key, &actual) < 0,
+		  "map-lookup", "errno %d", errno))
+		return true;
+
+	return CHECK(memcmp(&actual, expected, sizeof(actual)),
+		     "assert-storage", "storages differ");
+}
+
+/* Verify that @map holds no entry for @key.
+ * Returns true when the lookup succeeds or fails with anything other than
+ * ENOENT, false when the entry is absent as expected.
+ */
+static bool assert_storage_noexist(struct bpf_map *map, const void *key)
+{
+	struct cgroup_value scratch;
+	int fd = bpf_map__fd(map);
+
+	if (CHECK(bpf_map_lookup_elem(fd, key, &scratch) == 0,
+		  "map-lookup", "succeeded, expected ENOENT"))
+		return true;
+
+	return CHECK(errno != ENOENT,
+		     "map-lookup", "errno %d, expected ENOENT", errno);
+}
+
+/* Join @cgroup_path, start a UDP server on AF_INET, connect a client to it,
+ * send one message and read it back on the server socket.
+ *
+ * Returns false on success and true on any failure. On failure errno still
+ * holds the value set by the step that failed, so callers can report it.
+ */
+static bool connect_send(const char *cgroup_path)
+{
+	int server_fd = -1, client_fd = -1;
+	char message[] = "message";
+	bool res = true;
+	int saved_errno;
+
+	if (join_cgroup(cgroup_path))
+		goto out_clean;
+
+	server_fd = start_server(AF_INET, SOCK_DGRAM, NULL, 0, 0);
+	if (server_fd < 0)
+		goto out_clean;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (client_fd < 0)
+		goto out_clean;
+
+	if (send(client_fd, &message, sizeof(message), 0) < 0)
+		goto out_clean;
+
+	if (read(server_fd, &message, sizeof(message)) < 0)
+		goto out_clean;
+
+	res = false;
+
+out_clean:
+	/* close() on a descriptor that was never opened fails with EBADF and
+	 * would overwrite the errno of the step that actually failed; only
+	 * close what was opened and restore errno afterwards.
+	 */
+	saved_errno = errno;
+	if (client_fd >= 0)
+		close(client_fd);
+	if (server_fd >= 0)
+		close(server_fd);
+	errno = saved_errno;
+	return res;
+}
+
+/*
+ * Subtest for the cg_storage_multi_egress_only skeleton: attach its egress
+ * program first to the parent cgroup and then also to the child cgroup,
+ * send one packet from the child after each step, and check the global
+ * invocation counter together with the per-cgroup storage contents.
+ */
+static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
+{
+ struct cg_storage_multi_egress_only *obj;
+ struct cgroup_value expected_cgroup_value;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_link *parent_link = NULL, *child_link = NULL;
+ bool err;
+
+ /* Only the egress attach type is used here; the cgroup id part of the
+ * key is filled in before each lookup.
+ */
+ key.attach_type = BPF_CGROUP_INET_EGRESS;
+
+ obj = cg_storage_multi_egress_only__open_and_load();
+ if (CHECK(!obj, "skel-load", "errno %d", errno))
+ return;
+
+ /* Attach to parent cgroup, trigger packet from child.
+ * Assert that there is only one run and in that run the storage is
+ * parent cgroup's storage.
+ * Also assert that child cgroup's storage does not exist
+ */
+ parent_link = bpf_program__attach_cgroup(obj->progs.egress,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_link), "parent-cg-attach",
+ "err %ld", PTR_ERR(parent_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "first-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ if (CHECK(obj->bss->invocations != 1,
+ "first-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 1 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
+ if (assert_storage_noexist(obj->maps.cgroup_storage, &key))
+ goto close_bpf_object;
+
+ /* Attach to parent and child cgroup, trigger packet from child.
+ * Assert that there are two additional runs, one that runs with parent
+ * cgroup's storage and one with child cgroup's storage.
+ */
+ child_link = bpf_program__attach_cgroup(obj->progs.egress,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_link), "child-cg-attach",
+ "err %ld", PTR_ERR(child_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "second-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ /* The counter is cumulative: 1 run from the first send plus 2 now. */
+ if (CHECK(obj->bss->invocations != 3,
+ "second-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 1 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+
+close_bpf_object:
+ /* A failed attach leaves an error pointer in the link; skip those. */
+ if (!IS_ERR(parent_link))
+ bpf_link__destroy(parent_link);
+ if (!IS_ERR(child_link))
+ bpf_link__destroy(child_link);
+
+ cg_storage_multi_egress_only__destroy(obj);
+}
+
+/*
+ * Subtest for the cg_storage_multi_isolated skeleton: attach two egress
+ * programs and one ingress program to the parent cgroup, then to the child
+ * as well, send one packet from the child after each step, and check that
+ * the storage looked up with a (cgroup id, attach type) key holds separate
+ * egress and ingress counters.
+ */
+static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd)
+{
+ struct cg_storage_multi_isolated *obj;
+ struct cgroup_value expected_cgroup_value;
+ struct bpf_cgroup_storage_key key;
+ struct bpf_link *parent_egress1_link = NULL, *parent_egress2_link = NULL;
+ struct bpf_link *child_egress1_link = NULL, *child_egress2_link = NULL;
+ struct bpf_link *parent_ingress_link = NULL, *child_ingress_link = NULL;
+ bool err;
+
+ obj = cg_storage_multi_isolated__open_and_load();
+ if (CHECK(!obj, "skel-load", "errno %d", errno))
+ return;
+
+ /* Attach to parent cgroup, trigger packet from child.
+ * Assert that there are three runs, two with parent cgroup egress and
+ * one with parent cgroup ingress, stored in separate parent storages.
+ * Also assert that child cgroup's storages do not exist
+ */
+ parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach",
+ "err %ld", PTR_ERR(parent_egress1_link)))
+ goto close_bpf_object;
+ parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach",
+ "err %ld", PTR_ERR(parent_egress2_link)))
+ goto close_bpf_object;
+ parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach",
+ "err %ld", PTR_ERR(parent_ingress_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "first-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ if (CHECK(obj->bss->invocations != 3,
+ "first-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
+ key.attach_type = BPF_CGROUP_INET_EGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.attach_type = BPF_CGROUP_INET_INGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 1 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
+ key.attach_type = BPF_CGROUP_INET_EGRESS;
+ if (assert_storage_noexist(obj->maps.cgroup_storage, &key))
+ goto close_bpf_object;
+ key.attach_type = BPF_CGROUP_INET_INGRESS;
+ if (assert_storage_noexist(obj->maps.cgroup_storage, &key))
+ goto close_bpf_object;
+
+ /* Attach to parent and child cgroup, trigger packet from child.
+ * Assert that there are six additional runs, parent cgroup egresses and
+ * ingress, child cgroup egresses and ingress.
+ * Assert that egress and ingress storages are separate.
+ */
+ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach",
+ "err %ld", PTR_ERR(child_egress1_link)))
+ goto close_bpf_object;
+ child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach",
+ "err %ld", PTR_ERR(child_egress2_link)))
+ goto close_bpf_object;
+ child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach",
+ "err %ld", PTR_ERR(child_ingress_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "second-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ /* The counter is cumulative: 3 runs from the first send plus 6 now. */
+ if (CHECK(obj->bss->invocations != 9,
+ "second-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
+ key.attach_type = BPF_CGROUP_INET_EGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 4 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.attach_type = BPF_CGROUP_INET_INGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 2 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
+ key.attach_type = BPF_CGROUP_INET_EGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key.attach_type = BPF_CGROUP_INET_INGRESS;
+ expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 1 };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+
+close_bpf_object:
+ /* A failed attach leaves an error pointer in the link; skip those. */
+ if (!IS_ERR(parent_egress1_link))
+ bpf_link__destroy(parent_egress1_link);
+ if (!IS_ERR(parent_egress2_link))
+ bpf_link__destroy(parent_egress2_link);
+ if (!IS_ERR(parent_ingress_link))
+ bpf_link__destroy(parent_ingress_link);
+ if (!IS_ERR(child_egress1_link))
+ bpf_link__destroy(child_egress1_link);
+ if (!IS_ERR(child_egress2_link))
+ bpf_link__destroy(child_egress2_link);
+ if (!IS_ERR(child_ingress_link))
+ bpf_link__destroy(child_ingress_link);
+
+ cg_storage_multi_isolated__destroy(obj);
+}
+
+/*
+ * Subtest for the cg_storage_multi_shared skeleton: same attach/send
+ * sequence as the isolated variant, but the storage is looked up with a
+ * bare cgroup id as key, and a single value per cgroup is expected to carry
+ * both the egress and the ingress packet counts.
+ */
+static void test_shared(int parent_cgroup_fd, int child_cgroup_fd)
+{
+ struct cg_storage_multi_shared *obj;
+ struct cgroup_value expected_cgroup_value;
+ __u64 key;
+ struct bpf_link *parent_egress1_link = NULL, *parent_egress2_link = NULL;
+ struct bpf_link *child_egress1_link = NULL, *child_egress2_link = NULL;
+ struct bpf_link *parent_ingress_link = NULL, *child_ingress_link = NULL;
+ bool err;
+
+ obj = cg_storage_multi_shared__open_and_load();
+ if (CHECK(!obj, "skel-load", "errno %d", errno))
+ return;
+
+ /* Attach to parent cgroup, trigger packet from child.
+ * Assert that there are three runs, two with parent cgroup egress and
+ * one with parent cgroup ingress.
+ * Also assert that child cgroup's storage does not exist
+ */
+ parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach",
+ "err %ld", PTR_ERR(parent_egress1_link)))
+ goto close_bpf_object;
+ parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach",
+ "err %ld", PTR_ERR(parent_egress2_link)))
+ goto close_bpf_object;
+ parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
+ parent_cgroup_fd);
+ if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach",
+ "err %ld", PTR_ERR(parent_ingress_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "first-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ if (CHECK(obj->bss->invocations != 3,
+ "first-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key = get_cgroup_id(PARENT_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) {
+ .egress_pkts = 2,
+ .ingress_pkts = 1,
+ };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key = get_cgroup_id(CHILD_CGROUP);
+ if (assert_storage_noexist(obj->maps.cgroup_storage, &key))
+ goto close_bpf_object;
+
+ /* Attach to parent and child cgroup, trigger packet from child.
+ * Assert that there are six additional runs, parent cgroup egresses and
+ * ingress, child cgroup egresses and ingress.
+ */
+ child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach",
+ "err %ld", PTR_ERR(child_egress1_link)))
+ goto close_bpf_object;
+ child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach",
+ "err %ld", PTR_ERR(child_egress2_link)))
+ goto close_bpf_object;
+ child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress,
+ child_cgroup_fd);
+ if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach",
+ "err %ld", PTR_ERR(child_ingress_link)))
+ goto close_bpf_object;
+ err = connect_send(CHILD_CGROUP);
+ if (CHECK(err, "second-connect-send", "errno %d", errno))
+ goto close_bpf_object;
+ /* The counter is cumulative: 3 runs from the first send plus 6 now. */
+ if (CHECK(obj->bss->invocations != 9,
+ "second-invoke", "invocations=%d", obj->bss->invocations))
+ goto close_bpf_object;
+ key = get_cgroup_id(PARENT_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) {
+ .egress_pkts = 4,
+ .ingress_pkts = 2,
+ };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+ key = get_cgroup_id(CHILD_CGROUP);
+ expected_cgroup_value = (struct cgroup_value) {
+ .egress_pkts = 2,
+ .ingress_pkts = 1,
+ };
+ if (assert_storage(obj->maps.cgroup_storage,
+ &key, &expected_cgroup_value))
+ goto close_bpf_object;
+
+close_bpf_object:
+ /* A failed attach leaves an error pointer in the link; skip those. */
+ if (!IS_ERR(parent_egress1_link))
+ bpf_link__destroy(parent_egress1_link);
+ if (!IS_ERR(parent_egress2_link))
+ bpf_link__destroy(parent_egress2_link);
+ if (!IS_ERR(parent_ingress_link))
+ bpf_link__destroy(parent_ingress_link);
+ if (!IS_ERR(child_egress1_link))
+ bpf_link__destroy(child_egress1_link);
+ if (!IS_ERR(child_egress2_link))
+ bpf_link__destroy(child_egress2_link);
+ if (!IS_ERR(child_ingress_link))
+ bpf_link__destroy(child_ingress_link);
+
+ cg_storage_multi_shared__destroy(obj);
+}
+
+/* Test entry point: join the parent cgroup, create the child cgroup, then
+ * run whichever of the egress_only / isolated / shared subtests are
+ * selected. Both cgroup descriptors are closed on every path.
+ */
+void test_cg_storage_multi(void)
+{
+	int parent_fd, child_fd = -1;
+
+	parent_fd = test__join_cgroup(PARENT_CGROUP);
+	if (!CHECK(parent_fd < 0, "cg-create-parent", "errno %d", errno)) {
+		child_fd = create_and_get_cgroup(CHILD_CGROUP);
+		if (!CHECK(child_fd < 0, "cg-create-child", "errno %d", errno)) {
+			if (test__start_subtest("egress_only"))
+				test_egress_only(parent_fd, child_fd);
+
+			if (test__start_subtest("isolated"))
+				test_isolated(parent_fd, child_fd);
+
+			if (test__start_subtest("shared"))
+				test_shared(parent_fd, child_fd);
+		}
+	}
+
+	close(child_fd);
+	close(parent_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c
new file mode 100644
index 000000000..70e94e783
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_autodetach.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+
+#include "cgroup_helpers.h"
+
+#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null"
+
+static char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/* Load a two-instruction BPF_PROG_TYPE_CGROUP_SKB program that sets r0 to 1
+ * and exits. Returns whatever bpf_load_program() returns; the verifier log
+ * is written to bpf_log_buf.
+ */
+static int prog_load(void)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+	size_t nr_insns = sizeof(insns) / sizeof(insns[0]);
+	int fd;
+
+	fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, insns, nr_insns,
+			      "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	return fd;
+}
+
+/* Attach two programs to a cgroup, remove the cgroup without detaching them,
+ * and wait for the programs to go away: each recorded program id must stop
+ * resolving to an fd within the polling window below.
+ */
+void test_cgroup_attach_autodetach(void)
+{
+	__u32 prog_ids[2] = {0};
+	/* Tell bpf_prog_query() the real capacity of prog_ids[] so it can
+	 * never be asked to fill more slots than the array has.
+	 */
+	__u32 duration = 0, prog_cnt = ARRAY_SIZE(prog_ids), attach_flags;
+	/* Every slot must start at -1: "= {-1}" would leave the second slot
+	 * at 0 and the error path would then close stdin.
+	 */
+	int allow_prog[2] = {-1, -1};
+	void *ptr = NULL;
+	int cg = -1, i;
+	int attempts;
+
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
+		allow_prog[i] = prog_load();
+		if (CHECK(allow_prog[i] < 0, "prog_load",
+			  "verifier output:\n%s\n-------\n", bpf_log_buf))
+			goto err;
+	}
+
+	if (CHECK_FAIL(setup_cgroup_environment()))
+		goto err;
+
+	/* create a cgroup, attach two programs and remember their ids */
+	cg = create_and_get_cgroup("/cg_autodetach");
+	if (CHECK_FAIL(cg < 0))
+		goto err;
+
+	if (CHECK_FAIL(join_cgroup("/cg_autodetach")))
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++)
+		if (CHECK(bpf_prog_attach(allow_prog[i], cg,
+					  BPF_CGROUP_INET_EGRESS,
+					  BPF_F_ALLOW_MULTI),
+			  "prog_attach", "prog[%d], errno=%d\n", i, errno))
+			goto err;
+
+	/* make sure that programs are attached and run some traffic */
+	if (CHECK(bpf_prog_query(cg, BPF_CGROUP_INET_EGRESS, 0, &attach_flags,
+				 prog_ids, &prog_cnt),
+		  "prog_query", "errno=%d\n", errno))
+		goto err;
+	if (CHECK_FAIL(system(PING_CMD)))
+		goto err;
+
+	/* allocate some memory (4Mb) to pin the original cgroup */
+	ptr = malloc(4 * (1 << 20));
+	if (CHECK_FAIL(!ptr))
+		goto err;
+
+	/* close programs and cgroup fd */
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
+		close(allow_prog[i]);
+		allow_prog[i] = -1;
+	}
+
+	close(cg);
+	cg = -1;
+
+	/* leave the cgroup and remove it. don't detach programs */
+	cleanup_cgroup_environment();
+
+	/* wait for the asynchronous auto-detachment.
+	 * wait for no more than 5 sec and give up.
+	 */
+	for (i = 0; i < ARRAY_SIZE(prog_ids); i++) {
+		for (attempts = 5; attempts >= 0; attempts--) {
+			int fd = bpf_prog_get_fd_by_id(prog_ids[i]);
+
+			if (fd < 0)
+				break;
+
+			/* don't leave the fd open */
+			close(fd);
+
+			if (CHECK_FAIL(!attempts))
+				goto err;
+
+			sleep(1);
+		}
+	}
+
+err:
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++)
+		if (allow_prog[i] >= 0)
+			close(allow_prog[i]);
+	if (cg >= 0)
+		close(cg);
+	free(ptr);
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
new file mode 100644
index 000000000..b549fcfac
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+
+#include "cgroup_helpers.h"
+
+#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null"
+
+static char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+static int map_fd = -1;
+
+/* Build and load a BPF_PROG_TYPE_CGROUP_SKB program that
+ *   - atomically adds @val to element 0 of the shared array map (map_fd),
+ *   - atomically adds @val to its cgroup storage,
+ *   - increments the first word of its per-cpu cgroup storage by one,
+ *   - returns @verdict.
+ *
+ * The shared array map is created on first use and kept in the file-level
+ * map_fd. The two cgroup storage maps are private to this call; their fds
+ * are closed before returning on every path.
+ *
+ * Returns the bpf_load_program() result, or -1 if a map cannot be created.
+ */
+static int prog_load_cnt(int verdict, int val)
+{
+	int cgroup_storage_fd, percpu_cgroup_storage_fd;
+
+	if (map_fd < 0)
+		map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, 4, 8, 1, 0);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		return -1;
+	}
+
+	cgroup_storage_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE,
+				sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+	if (cgroup_storage_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		return -1;
+	}
+
+	percpu_cgroup_storage_fd = bpf_create_map(
+		BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+		sizeof(struct bpf_cgroup_storage_key), 8, 0, 0);
+	if (percpu_cgroup_storage_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		/* don't leak the storage map created just above */
+		close(cgroup_storage_fd);
+		return -1;
+	}
+
+	struct bpf_insn prog[] = {
+		BPF_MOV32_IMM(BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+		BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = val */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+
+		BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd),
+		BPF_MOV64_IMM(BPF_REG_2, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+		BPF_MOV64_IMM(BPF_REG_1, val),
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0),
+
+		BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd),
+		BPF_MOV64_IMM(BPF_REG_2, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+		BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1),
+		BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+
+		BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+	int ret;
+
+	ret = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+			       prog, insns_cnt, "GPL", 0,
+			       bpf_log_buf, BPF_LOG_BUF_SIZE);
+
+	/* Both storage map fds are local to this call and no longer needed
+	 * once the load attempt is over; the original only closed one.
+	 */
+	close(cgroup_storage_fd);
+	close(percpu_cgroup_storage_fd);
+	return ret;
+}
+
+/* Exercise multi-program cgroup attachment on a five-level cgroup chain
+ * (/cg1 ... /cg1/cg2/cg3/cg4/cg5) with a mix of BPF_F_ALLOW_MULTI,
+ * BPF_F_ALLOW_OVERRIDE and flag-less attachments.
+ *
+ * Program i adds (1 << i) to the shared counter map each time it runs, so
+ * the counter read after a ping identifies which programs were effective.
+ * The test also covers bpf_prog_query() (including the ENOSPC case),
+ * BPF_F_REPLACE error paths and successful replacement, and detaching.
+ */
+void test_cgroup_attach_multi(void)
+{
+	__u32 prog_ids[4], prog_cnt = 0, attach_flags, saved_prog_id;
+	/* fds start at -1 so the cleanup path never closes fd 0 (stdin)
+	 * when the test bails out before a cgroup has been opened.
+	 */
+	int cg1 = -1, cg2 = -1, cg3 = -1, cg4 = -1, cg5 = -1, key = 0;
+	DECLARE_LIBBPF_OPTS(bpf_prog_attach_opts, attach_opts);
+	int allow_prog[7];
+	unsigned long long value = 0;
+	__u32 duration = 0;
+	int i = 0;
+
+	/* "= {-1}" would only set the first slot; mark every slot unused. */
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++)
+		allow_prog[i] = -1;
+
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++) {
+		allow_prog[i] = prog_load_cnt(1, 1 << i);
+		if (CHECK(allow_prog[i] < 0, "prog_load",
+			  "verifier output:\n%s\n-------\n", bpf_log_buf))
+			goto err;
+	}
+
+	if (CHECK_FAIL(setup_cgroup_environment()))
+		goto err;
+
+	cg1 = create_and_get_cgroup("/cg1");
+	if (CHECK_FAIL(cg1 < 0))
+		goto err;
+	cg2 = create_and_get_cgroup("/cg1/cg2");
+	if (CHECK_FAIL(cg2 < 0))
+		goto err;
+	cg3 = create_and_get_cgroup("/cg1/cg2/cg3");
+	if (CHECK_FAIL(cg3 < 0))
+		goto err;
+	cg4 = create_and_get_cgroup("/cg1/cg2/cg3/cg4");
+	if (CHECK_FAIL(cg4 < 0))
+		goto err;
+	cg5 = create_and_get_cgroup("/cg1/cg2/cg3/cg4/cg5");
+	if (CHECK_FAIL(cg5 < 0))
+		goto err;
+
+	if (CHECK_FAIL(join_cgroup("/cg1/cg2/cg3/cg4/cg5")))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_ALLOW_MULTI),
+		  "prog0_attach_to_cg1_multi", "errno=%d\n", errno))
+		goto err;
+
+	if (CHECK(!bpf_prog_attach(allow_prog[0], cg1, BPF_CGROUP_INET_EGRESS,
+				   BPF_F_ALLOW_MULTI),
+		  "fail_same_prog_attach_to_cg1", "unexpected success\n"))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[1], cg1, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_ALLOW_MULTI),
+		  "prog1_attach_to_cg1_multi", "errno=%d\n", errno))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[2], cg2, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_ALLOW_OVERRIDE),
+		  "prog2_attach_to_cg2_override", "errno=%d\n", errno))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_ALLOW_MULTI),
+		  "prog3_attach_to_cg3_multi", "errno=%d\n", errno))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[4], cg4, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_ALLOW_OVERRIDE),
+		  "prog4_attach_to_cg4_override", "errno=%d\n", errno))
+		goto err;
+
+	if (CHECK(bpf_prog_attach(allow_prog[5], cg5, BPF_CGROUP_INET_EGRESS, 0),
+		  "prog5_attach_to_cg5_none", "errno=%d\n", errno))
+		goto err;
+
+	CHECK_FAIL(system(PING_CMD));
+	CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value));
+	CHECK_FAIL(value != 1 + 2 + 8 + 32);
+
+	/* query the number of effective progs in cg5 */
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_QUERY_EFFECTIVE, NULL, NULL, &prog_cnt));
+	CHECK_FAIL(prog_cnt != 4);
+	/* retrieve prog_ids of effective progs in cg5 */
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_QUERY_EFFECTIVE, &attach_flags,
+				  prog_ids, &prog_cnt));
+	CHECK_FAIL(prog_cnt != 4);
+	CHECK_FAIL(attach_flags != 0);
+	saved_prog_id = prog_ids[0];
+	/* check enospc handling */
+	prog_ids[0] = 0;
+	prog_cnt = 2;
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_QUERY_EFFECTIVE, &attach_flags,
+				  prog_ids, &prog_cnt) != -1);
+	CHECK_FAIL(errno != ENOSPC);
+	CHECK_FAIL(prog_cnt != 4);
+	/* check that prog_ids are returned even when buffer is too small */
+	CHECK_FAIL(prog_ids[0] != saved_prog_id);
+	/* retrieve prog_id of single attached prog in cg5 */
+	prog_ids[0] = 0;
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL,
+				  prog_ids, &prog_cnt));
+	CHECK_FAIL(prog_cnt != 1);
+	CHECK_FAIL(prog_ids[0] != saved_prog_id);
+
+	/* detach bottom program and ping again */
+	if (CHECK(bpf_prog_detach2(-1, cg5, BPF_CGROUP_INET_EGRESS),
+		  "prog_detach_from_cg5", "errno=%d\n", errno))
+		goto err;
+
+	value = 0;
+	CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0));
+	CHECK_FAIL(system(PING_CMD));
+	CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value));
+	CHECK_FAIL(value != 1 + 2 + 8 + 16);
+
+	/* test replace */
+
+	attach_opts.flags = BPF_F_ALLOW_OVERRIDE | BPF_F_REPLACE;
+	attach_opts.replace_prog_fd = allow_prog[0];
+	if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+					 BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "fail_prog_replace_override", "unexpected success\n"))
+		goto err;
+	CHECK_FAIL(errno != EINVAL);
+
+	attach_opts.flags = BPF_F_REPLACE;
+	if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+					 BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "fail_prog_replace_no_multi", "unexpected success\n"))
+		goto err;
+	CHECK_FAIL(errno != EINVAL);
+
+	attach_opts.flags = BPF_F_ALLOW_MULTI | BPF_F_REPLACE;
+	attach_opts.replace_prog_fd = -1;
+	if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+					 BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "fail_prog_replace_bad_fd", "unexpected success\n"))
+		goto err;
+	CHECK_FAIL(errno != EBADF);
+
+	/* replacing a program that is not attached to cgroup should fail */
+	attach_opts.replace_prog_fd = allow_prog[3];
+	if (CHECK(!bpf_prog_attach_xattr(allow_prog[6], cg1,
+					 BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "fail_prog_replace_no_ent", "unexpected success\n"))
+		goto err;
+	CHECK_FAIL(errno != ENOENT);
+
+	/* replace 1st from the top program */
+	attach_opts.replace_prog_fd = allow_prog[0];
+	if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1,
+					BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "prog_replace", "errno=%d\n", errno))
+		goto err;
+
+	/* replace program with itself */
+	attach_opts.replace_prog_fd = allow_prog[6];
+	if (CHECK(bpf_prog_attach_xattr(allow_prog[6], cg1,
+					BPF_CGROUP_INET_EGRESS, &attach_opts),
+		  "prog_replace", "errno=%d\n", errno))
+		goto err;
+
+	value = 0;
+	CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0));
+	CHECK_FAIL(system(PING_CMD));
+	CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value));
+	CHECK_FAIL(value != 64 + 2 + 8 + 16);
+
+	/* detach 3rd from bottom program and ping again */
+	if (CHECK(!bpf_prog_detach2(0, cg3, BPF_CGROUP_INET_EGRESS),
+		  "fail_prog_detach_from_cg3", "unexpected success\n"))
+		goto err;
+
+	if (CHECK(bpf_prog_detach2(allow_prog[3], cg3, BPF_CGROUP_INET_EGRESS),
+		  "prog3_detach_from_cg3", "errno=%d\n", errno))
+		goto err;
+
+	value = 0;
+	CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0));
+	CHECK_FAIL(system(PING_CMD));
+	CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value));
+	CHECK_FAIL(value != 64 + 2 + 16);
+
+	/* detach 2nd from bottom program and ping again */
+	if (CHECK(bpf_prog_detach2(-1, cg4, BPF_CGROUP_INET_EGRESS),
+		  "prog_detach_from_cg4", "errno=%d\n", errno))
+		goto err;
+
+	value = 0;
+	CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0));
+	CHECK_FAIL(system(PING_CMD));
+	CHECK_FAIL(bpf_map_lookup_elem(map_fd, &key, &value));
+	CHECK_FAIL(value != 64 + 2 + 4);
+
+	prog_cnt = 4;
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS,
+				  BPF_F_QUERY_EFFECTIVE, &attach_flags,
+				  prog_ids, &prog_cnt));
+	CHECK_FAIL(prog_cnt != 3);
+	CHECK_FAIL(attach_flags != 0);
+	CHECK_FAIL(bpf_prog_query(cg5, BPF_CGROUP_INET_EGRESS, 0, NULL,
+				  prog_ids, &prog_cnt));
+	CHECK_FAIL(prog_cnt != 0);
+
+err:
+	for (i = 0; i < ARRAY_SIZE(allow_prog); i++)
+		if (allow_prog[i] >= 0)
+			close(allow_prog[i]);
+	/* only close cgroup fds that were actually opened */
+	if (cg1 >= 0)
+		close(cg1);
+	if (cg2 >= 0)
+		close(cg2);
+	if (cg3 >= 0)
+		close(cg3);
+	if (cg4 >= 0)
+		close(cg4);
+	if (cg5 >= 0)
+		close(cg5);
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c
new file mode 100644
index 000000000..9e96f8d87
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_override.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+
+#include "cgroup_helpers.h"
+
+#define FOO "/foo"
+#define BAR "/foo/bar/"
+#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null"
+
+static char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/* Load a two-instruction BPF_PROG_TYPE_CGROUP_SKB program that sets r0 to
+ * @verdict and exits. Returns whatever bpf_load_program() returns; the
+ * verifier log is written to bpf_log_buf.
+ */
+static int prog_load(int verdict)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, verdict),
+		BPF_EXIT_INSN(),
+	};
+	size_t nr_insns = sizeof(insns) / sizeof(insns[0]);
+	int fd;
+
+	fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB, insns, nr_insns,
+			      "GPL", 0, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	return fd;
+}
+
+/*
+ * Exercise BPF_F_ALLOW_OVERRIDE attachment on a parent cgroup (FOO) and its
+ * child (BAR) using two programs: one loaded with verdict 1 (allow_prog) and
+ * one loaded with verdict 0 (drop_prog). After each attach/detach step a
+ * ping is run and its success or failure is checked against what the
+ * currently attached programs should produce.
+ */
+void test_cgroup_attach_override(void)
+{
+ int drop_prog = -1, allow_prog = -1, foo = -1, bar = -1;
+ __u32 duration = 0;
+
+ allow_prog = prog_load(1);
+ if (CHECK(allow_prog < 0, "prog_load_allow",
+ "verifier output:\n%s\n-------\n", bpf_log_buf))
+ goto err;
+
+ drop_prog = prog_load(0);
+ if (CHECK(drop_prog < 0, "prog_load_drop",
+ "verifier output:\n%s\n-------\n", bpf_log_buf))
+ goto err;
+
+ foo = test__join_cgroup(FOO);
+ if (CHECK(foo < 0, "cgroup_join_foo", "cgroup setup failed\n"))
+ goto err;
+
+ /* drop on FOO (overridable): ping from FOO must fail */
+ if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "prog_attach_drop_foo_override",
+ "attach prog to %s failed, errno=%d\n", FOO, errno))
+ goto err;
+
+ if (CHECK(!system(PING_CMD), "ping_fail",
+ "ping unexpectedly succeeded\n"))
+ goto err;
+
+ /* move into BAR with nothing attached there: ping must still fail */
+ bar = test__join_cgroup(BAR);
+ if (CHECK(bar < 0, "cgroup_join_bar", "cgroup setup failed\n"))
+ goto err;
+
+ if (CHECK(!system(PING_CMD), "ping_fail",
+ "ping unexpectedly succeeded\n"))
+ goto err;
+
+ /* allow on BAR: ping must now succeed */
+ if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "prog_attach_allow_bar_override",
+ "attach prog to %s failed, errno=%d\n", BAR, errno))
+ goto err;
+
+ if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n"))
+ goto err;
+
+ /* detach from BAR: ping must fail again */
+ if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS),
+ "prog_detach_bar",
+ "detach prog from %s failed, errno=%d\n", BAR, errno))
+ goto err;
+
+ if (CHECK(!system(PING_CMD), "ping_fail",
+ "ping unexpectedly succeeded\n"))
+ goto err;
+
+ /* re-attach allow on BAR, then detach drop from FOO: ping must succeed */
+ if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "prog_attach_allow_bar_override",
+ "attach prog to %s failed, errno=%d\n", BAR, errno))
+ goto err;
+
+ if (CHECK(bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS),
+ "prog_detach_foo",
+ "detach prog from %s failed, errno=%d\n", FOO, errno))
+ goto err;
+
+ if (CHECK(system(PING_CMD), "ping_ok", "ping failed\n"))
+ goto err;
+
+ /* attaching to BAR again with the override flag must succeed ... */
+ if (CHECK(bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "prog_attach_allow_bar_override",
+ "attach prog to %s failed, errno=%d\n", BAR, errno))
+ goto err;
+
+ /* ... but attaching to BAR with no flags must be rejected */
+ if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0),
+ "fail_prog_attach_allow_bar_none",
+ "attach prog to %s unexpectedly succeeded\n", BAR))
+ goto err;
+
+ if (CHECK(bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS),
+ "prog_detach_bar",
+ "detach prog from %s failed, errno=%d\n", BAR, errno))
+ goto err;
+
+ /* FOO was already detached above, so a second detach must fail */
+ if (CHECK(!bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS),
+ "fail_prog_detach_foo",
+ "double detach from %s unexpectedly succeeded\n", FOO))
+ goto err;
+
+ /* allow on FOO with no flags: every further attach to BAR or FOO below
+ * is expected to be rejected
+ */
+ if (CHECK(bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS, 0),
+ "prog_attach_allow_foo_none",
+ "attach prog to %s failed, errno=%d\n", FOO, errno))
+ goto err;
+
+ if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS, 0),
+ "fail_prog_attach_allow_bar_none",
+ "attach prog to %s unexpectedly succeeded\n", BAR))
+ goto err;
+
+ if (CHECK(!bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "fail_prog_attach_allow_bar_override",
+ "attach prog to %s unexpectedly succeeded\n", BAR))
+ goto err;
+
+ if (CHECK(!bpf_prog_attach(allow_prog, foo, BPF_CGROUP_INET_EGRESS,
+ BPF_F_ALLOW_OVERRIDE),
+ "fail_prog_attach_allow_foo_override",
+ "attach prog to %s unexpectedly succeeded\n", FOO))
+ goto err;
+
+ /* attaching drop_prog to FOO with no flags is expected to succeed */
+ if (CHECK(bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS, 0),
+ "prog_attach_drop_foo_none",
+ "attach prog to %s failed, errno=%d\n", FOO, errno))
+ goto err;
+
+err:
+ /* fds still at -1 are passed to close() as-is on early-exit paths */
+ close(foo);
+ close(bar);
+ close(allow_prog);
+ close(drop_prog);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
new file mode 100644
index 000000000..4d9b514b3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "testing_helpers.h"
+#include "test_cgroup_link.skel.h"
+
+static __u32 duration = 0;
+#define PING_CMD "ping -q -c1 -w1 127.0.0.1 > /dev/null"
+
+static struct test_cgroup_link *skel = NULL;
+
+/*
+ * Zero the skeleton's calls/alt_calls counters, run PING_CMD once, and compare
+ * both counters with the expected values.
+ *
+ * Returns 0 when both counters match, -EINVAL on the first mismatch.
+ */
+int ping_and_check(int exp_calls, int exp_alt_calls)
+{
+	skel->bss->alt_calls = 0;
+	skel->bss->calls = 0;
+	CHECK_FAIL(system(PING_CMD));
+
+	/* short-circuit: alt_calls is only examined when calls matched */
+	if (CHECK(skel->bss->calls != exp_calls, "call_cnt",
+		  "exp %d, got %d\n", exp_calls, skel->bss->calls) ||
+	    CHECK(skel->bss->alt_calls != exp_alt_calls, "alt_call_cnt",
+		  "exp %d, got %d\n", exp_alt_calls, skel->bss->alt_calls))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * End-to-end test of bpf_link-based cgroup egress attachment on the nested
+ * hierarchy /cg1/cg2/cg3/cg4, with the test process joined to the innermost
+ * cgroup. After each change in attachments a ping is sent and the number of
+ * egress / egress_alt invocations is compared with the expected count.
+ *
+ * Covered in order: per-level link attach, effective-program queries, mixing
+ * links with legacy multi and exclusive attachments, in-place program
+ * replacement (plain and compare-exchange), links outliving their cgroup FDs,
+ * forced link detach, and auto-detach when the cgroups are removed.
+ */
+void test_cgroup_link(void)
+{
+	struct {
+		const char *path;
+		int fd;
+	} cgs[] = {
+		{ "/cg1" },
+		{ "/cg1/cg2" },
+		{ "/cg1/cg2/cg3" },
+		{ "/cg1/cg2/cg3/cg4" },
+	};
+	int last_cg = ARRAY_SIZE(cgs) - 1, cg_nr = ARRAY_SIZE(cgs);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, link_upd_opts);
+	struct bpf_link *links[ARRAY_SIZE(cgs)] = {}, *tmp_link;
+	__u32 prog_ids[ARRAY_SIZE(cgs)], prog_cnt = 0, attach_flags, prog_id;
+	struct bpf_link_info info;
+	int i = 0, err, prog_fd;
+	/* true while a legacy (non-link) attachment exists on the last cgroup */
+	bool detach_legacy = false;
+
+	skel = test_cgroup_link__open_and_load();
+	if (CHECK(!skel, "skel_open_load", "failed to open/load skeleton\n"))
+		return;
+	prog_fd = bpf_program__fd(skel->progs.egress);
+
+	err = setup_cgroup_environment();
+	if (CHECK(err, "cg_init", "failed: %d\n", err))
+		goto cleanup;
+
+	for (i = 0; i < cg_nr; i++) {
+		cgs[i].fd = create_and_get_cgroup(cgs[i].path);
+		if (CHECK(cgs[i].fd < 0, "cg_create", "fail: %d\n", cgs[i].fd))
+			goto cleanup;
+	}
+
+	err = join_cgroup(cgs[last_cg].path);
+	if (CHECK(err, "cg_join", "fail: %d\n", err))
+		goto cleanup;
+
+	/* attach the same egress program to every level through a link */
+	for (i = 0; i < cg_nr; i++) {
+		links[i] = bpf_program__attach_cgroup(skel->progs.egress,
+						      cgs[i].fd);
+		if (CHECK(IS_ERR(links[i]), "cg_attach", "i: %d, err: %ld\n",
+			  i, PTR_ERR(links[i])))
+			goto cleanup;
+	}
+
+	ping_and_check(cg_nr, 0);
+
+	/* query the number of effective progs and attach flags in root cg */
+	err = bpf_prog_query(cgs[0].fd, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_QUERY_EFFECTIVE, &attach_flags, NULL,
+			     &prog_cnt);
+	CHECK_FAIL(err);
+	CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
+	if (CHECK(prog_cnt != 1, "effect_cnt", "exp %d, got %d\n", 1, prog_cnt))
+		goto cleanup;
+
+	/*
+	 * query the number of effective progs in last cg; attach flags are not
+	 * requested here (NULL), so there is nothing new to verify about them
+	 */
+	err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_QUERY_EFFECTIVE, NULL, NULL,
+			     &prog_cnt);
+	CHECK_FAIL(err);
+	if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
+		  cg_nr, prog_cnt))
+		goto cleanup;
+
+	/* query the effective prog IDs in last cg */
+	err = bpf_prog_query(cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS,
+			     BPF_F_QUERY_EFFECTIVE, &attach_flags,
+			     prog_ids, &prog_cnt);
+	CHECK_FAIL(err);
+	CHECK_FAIL(attach_flags != BPF_F_ALLOW_MULTI);
+	if (CHECK(prog_cnt != cg_nr, "effect_cnt", "exp %d, got %d\n",
+		  cg_nr, prog_cnt))
+		goto cleanup;
+	/* the same program is attached at every level, so all IDs must match */
+	for (i = 1; i < prog_cnt; i++) {
+		CHECK(prog_ids[i - 1] != prog_ids[i], "prog_id_check",
+		      "idx %d, prev id %d, cur id %d\n",
+		      i, prog_ids[i - 1], prog_ids[i]);
+	}
+
+	/* detach bottom program and ping again */
+	bpf_link__destroy(links[last_cg]);
+	links[last_cg] = NULL;
+
+	ping_and_check(cg_nr - 1, 0);
+
+	/* mix in with non link-based multi-attachments */
+	err = bpf_prog_attach(prog_fd, cgs[last_cg].fd,
+			      BPF_CGROUP_INET_EGRESS, BPF_F_ALLOW_MULTI);
+	if (CHECK(err, "cg_attach_legacy", "errno=%d\n", errno))
+		goto cleanup;
+	detach_legacy = true;
+
+	links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress,
+						    cgs[last_cg].fd);
+	if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n",
+		  PTR_ERR(links[last_cg])))
+		goto cleanup;
+
+	ping_and_check(cg_nr + 1, 0);
+
+	/* detach link */
+	bpf_link__destroy(links[last_cg]);
+	links[last_cg] = NULL;
+
+	/* detach legacy */
+	err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS);
+	if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno))
+		goto cleanup;
+	detach_legacy = false;
+
+	/* attach legacy exclusive prog attachment */
+	err = bpf_prog_attach(prog_fd, cgs[last_cg].fd,
+			      BPF_CGROUP_INET_EGRESS, 0);
+	if (CHECK(err, "cg_attach_exclusive", "errno=%d\n", errno))
+		goto cleanup;
+	detach_legacy = true;
+
+	/* attempt to mix in with multi-attach bpf_link */
+	tmp_link = bpf_program__attach_cgroup(skel->progs.egress,
+					      cgs[last_cg].fd);
+	if (CHECK(!IS_ERR(tmp_link), "cg_attach_fail", "unexpected success!\n")) {
+		bpf_link__destroy(tmp_link);
+		goto cleanup;
+	}
+
+	ping_and_check(cg_nr, 0);
+
+	/* detach */
+	err = bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS);
+	if (CHECK(err, "cg_detach_legacy", "errno=%d\n", errno))
+		goto cleanup;
+	detach_legacy = false;
+
+	ping_and_check(cg_nr - 1, 0);
+
+	/* attach back link-based one */
+	links[last_cg] = bpf_program__attach_cgroup(skel->progs.egress,
+						    cgs[last_cg].fd);
+	if (CHECK(IS_ERR(links[last_cg]), "cg_attach", "err: %ld\n",
+		  PTR_ERR(links[last_cg])))
+		goto cleanup;
+
+	ping_and_check(cg_nr, 0);
+
+	/* check legacy exclusive prog can't be attached */
+	err = bpf_prog_attach(prog_fd, cgs[last_cg].fd,
+			      BPF_CGROUP_INET_EGRESS, 0);
+	if (CHECK(!err, "cg_attach_exclusive", "unexpected success\n")) {
+		bpf_prog_detach2(prog_fd, cgs[last_cg].fd, BPF_CGROUP_INET_EGRESS);
+		goto cleanup;
+	}
+
+	/* replace BPF programs inside their links for all but first link */
+	for (i = 1; i < cg_nr; i++) {
+		err = bpf_link__update_program(links[i], skel->progs.egress_alt);
+		if (CHECK(err, "prog_upd", "link #%d\n", i))
+			goto cleanup;
+	}
+
+	ping_and_check(1, cg_nr - 1);
+
+	/* Attempt program update with wrong expected BPF program */
+	link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress_alt);
+	link_upd_opts.flags = BPF_F_REPLACE;
+	err = bpf_link_update(bpf_link__fd(links[0]),
+			      bpf_program__fd(skel->progs.egress_alt),
+			      &link_upd_opts);
+	if (CHECK(err == 0 || errno != EPERM, "prog_cmpxchg1",
+		  "unexpectedly succeeded, err %d, errno %d\n", err, -errno))
+		goto cleanup;
+
+	/* Compare-exchange single link program from egress to egress_alt */
+	link_upd_opts.old_prog_fd = bpf_program__fd(skel->progs.egress);
+	link_upd_opts.flags = BPF_F_REPLACE;
+	err = bpf_link_update(bpf_link__fd(links[0]),
+			      bpf_program__fd(skel->progs.egress_alt),
+			      &link_upd_opts);
+	if (CHECK(err, "prog_cmpxchg2", "errno %d\n", -errno))
+		goto cleanup;
+
+	/* ping */
+	ping_and_check(0, cg_nr);
+
+	/* close cgroup FDs before detaching links */
+	for (i = 0; i < cg_nr; i++) {
+		if (cgs[i].fd > 0) {
+			close(cgs[i].fd);
+			cgs[i].fd = -1;
+		}
+	}
+
+	/* BPF programs should still get called */
+	ping_and_check(0, cg_nr);
+
+	prog_id = link_info_prog_id(links[0], &info);
+	CHECK(prog_id == 0, "link_info", "failed\n");
+	CHECK(info.cgroup.cgroup_id == 0, "cgroup_id", "unexpected %llu\n", info.cgroup.cgroup_id);
+
+	err = bpf_link__detach(links[0]);
+	if (CHECK(err, "link_detach", "failed %d\n", err))
+		goto cleanup;
+
+	/* cgroup_id should be zero in link_info */
+	prog_id = link_info_prog_id(links[0], &info);
+	CHECK(prog_id == 0, "link_info", "failed\n");
+	CHECK(info.cgroup.cgroup_id != 0, "cgroup_id", "unexpected %llu\n", info.cgroup.cgroup_id);
+
+	/* First BPF program shouldn't be called anymore */
+	ping_and_check(0, cg_nr - 1);
+
+	/* leave cgroup and remove them, don't detach programs */
+	cleanup_cgroup_environment();
+
+	/* BPF programs should have been auto-detached */
+	ping_and_check(0, 0);
+
+cleanup:
+	if (detach_legacy)
+		bpf_prog_detach2(prog_fd, cgs[last_cg].fd,
+				 BPF_CGROUP_INET_EGRESS);
+
+	/* entries may be NULL, a valid link, or an ERR_PTR from a failed attach */
+	for (i = 0; i < cg_nr; i++) {
+		if (!IS_ERR(links[i]))
+			bpf_link__destroy(links[i]);
+	}
+	test_cgroup_link__destroy(skel);
+
+	/* fd is 0 for never-created cgroups and -1 for already-closed ones */
+	for (i = 0; i < cg_nr; i++) {
+		if (cgs[i].fd > 0)
+			close(cgs[i].fd);
+	}
+	cleanup_cgroup_environment();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
new file mode 100644
index 000000000..464edc1c1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cgroup_skb_sk_lookup.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <test_progs.h>
+
+#include "network_helpers.h"
+#include "cgroup_skb_sk_lookup_kern.skel.h"
+
+/*
+ * Start an IPv6 TCP server, publish its port through *g_serv_port, then check
+ * that out_sk cannot connect (expects a failure with errno EINPROGRESS) while
+ * a freshly created client socket connects and is accepted.
+ */
+static void run_lookup_test(__u16 *g_serv_port, int out_sk)
+{
+	int listen_fd, inside_fd = -1, accepted_fd = -1, err;
+	struct sockaddr_in6 bound = {};
+	socklen_t bound_len = sizeof(bound);
+	__u32 duration = 0;
+
+	listen_fd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
+	if (CHECK(listen_fd < 0, "start_server", "failed to start server\n"))
+		return;
+
+	err = getsockname(listen_fd, (struct sockaddr *)&bound, &bound_len);
+	if (CHECK(err, "getsockname", "errno %d\n", errno))
+		goto cleanup;
+
+	*g_serv_port = bound.sin6_port;
+
+	/* Client outside of test cgroup should fail to connect by timeout. */
+	err = connect_fd_to_fd(out_sk, listen_fd, 1000);
+	if (CHECK(!err || errno != EINPROGRESS, "connect_fd_to_fd",
+		  "unexpected result err %d errno %d\n", err, errno))
+		goto cleanup;
+
+	/* Client inside test cgroup should connect just fine. */
+	inside_fd = connect_to_fd(listen_fd, 0);
+	if (CHECK(inside_fd < 0, "connect_to_fd", "errno %d\n", errno))
+		goto cleanup;
+
+	/* last step: falls through to cleanup whether or not accept worked */
+	accepted_fd = accept(listen_fd, NULL, NULL);
+	CHECK(accepted_fd < 0, "accept", "errno %d\n", errno);
+
+cleanup:
+	close(accepted_fd);
+	close(inside_fd);
+	close(listen_fd);
+}
+
+/*
+ * Load the skeleton, join cgroup cg_path, attach ingress_lookup to it and run
+ * the lookup test while the program is attached.
+ */
+static void run_cgroup_bpf_test(const char *cg_path, int out_sk)
+{
+	struct cgroup_skb_sk_lookup_kern *skel;
+	struct bpf_link *ingress_link;
+	__u32 duration = 0;
+	int cg_fd = -1;
+
+	skel = cgroup_skb_sk_lookup_kern__open_and_load();
+	if (CHECK(!skel, "skel_open_load", "open_load failed\n"))
+		return;
+
+	cg_fd = test__join_cgroup(cg_path);
+	if (CHECK(cg_fd < 0, "cgroup_join", "cgroup setup failed\n"))
+		goto cleanup;
+
+	ingress_link = bpf_program__attach_cgroup(skel->progs.ingress_lookup,
+						  cg_fd);
+	if (!CHECK(IS_ERR(ingress_link), "cgroup_attach", "err: %ld\n",
+		   PTR_ERR(ingress_link))) {
+		run_lookup_test(&skel->bss->g_serv_port, out_sk);
+		bpf_link__destroy(ingress_link);
+	}
+
+cleanup:
+	close(cg_fd);
+	cgroup_skb_sk_lookup_kern__destroy(skel);
+}
+
+/* Entry point: create the "outside" socket first, then run the cgroup test. */
+void test_cgroup_skb_sk_lookup(void)
+{
+	int outside_fd;
+
+	/* Create a socket before joining testing cgroup so that its cgroup id
+	 * differs from that of testing cgroup. Moving selftests process to
+	 * testing cgroup won't change cgroup id of an already created socket.
+	 */
+	outside_fd = socket(AF_INET6, SOCK_STREAM, 0);
+	if (CHECK_FAIL(outside_fd < 0))
+		return;
+
+	run_cgroup_bpf_test("/foo", outside_fd);
+	close(outside_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
new file mode 100644
index 000000000..e075d03ab
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c
@@ -0,0 +1,499 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <string.h>
+
+#include <linux/pkt_cls.h>
+#include <netinet/tcp.h>
+
+#include <test_progs.h>
+
+#include "progs/test_cls_redirect.h"
+#include "test_cls_redirect.skel.h"
+#include "test_cls_redirect_subprogs.skel.h"
+
+#define ENCAP_IP INADDR_LOOPBACK
+#define ENCAP_PORT (1234)
+
+static int duration = 0;
+
+/*
+ * One endpoint of a connection: port plus IPv4 or IPv6 address, both copied
+ * verbatim from a sockaddr by fill_addr_port(). Which union member is valid
+ * is determined by the family stored alongside (see struct tuple).
+ */
+struct addr_port {
+ in_port_t port;
+ union {
+ struct in_addr in_addr;
+ struct in6_addr in6_addr;
+ };
+};
+
+/* Address family plus source and destination endpoints of one connection. */
+struct tuple {
+ int family;
+ struct addr_port src;
+ struct addr_port dst;
+};
+
+/*
+ * Create a socket of the given type bound to addr; stream sockets are also
+ * put into listening state. Returns the fd, or -1 on any failure.
+ */
+static int start_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+	int sock = socket(addr->sa_family, type, 0);
+
+	if (CHECK_FAIL(sock == -1))
+		return -1;
+
+	/* listen() is only attempted for SOCK_STREAM and only after bind() worked */
+	if (!CHECK_FAIL(bind(sock, addr, len) == -1) &&
+	    (type != SOCK_STREAM || !CHECK_FAIL(listen(sock, 128) == -1)))
+		return sock;
+
+	close(sock);
+	return -1;
+}
+
+/*
+ * Create a socket of the given type and connect it to addr.
+ * Returns the connected fd, or -1 on failure.
+ */
+static int connect_to_server(const struct sockaddr *addr, socklen_t len,
+			     int type)
+{
+	int sock = socket(addr->sa_family, type, 0);
+
+	if (CHECK_FAIL(sock == -1))
+		return -1;
+
+	if (!CHECK_FAIL(connect(sock, addr, len)))
+		return sock;
+
+	close(sock);
+	return -1;
+}
+
+/*
+ * Copy address and port out of an AF_INET or AF_INET6 sockaddr into *ap.
+ * Returns false (leaving *ap untouched) for any other family.
+ */
+static bool fill_addr_port(const struct sockaddr *sa, struct addr_port *ap)
+{
+	if (sa->sa_family == AF_INET) {
+		const struct sockaddr_in *v4 = (const struct sockaddr_in *)sa;
+
+		ap->in_addr = v4->sin_addr;
+		ap->port = v4->sin_port;
+		return true;
+	}
+
+	if (sa->sa_family == AF_INET6) {
+		const struct sockaddr_in6 *v6 = (const struct sockaddr_in6 *)sa;
+
+		ap->in6_addr = v6->sin6_addr;
+		ap->port = v6->sin6_port;
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Start a server on addr, connect a client to it, and record the client's
+ * view of the connection in *tuple with source and destination swapped.
+ *
+ * On success *server and *conn hold open fds and true is returned. On failure
+ * whatever was opened is closed, the corresponding output is set to -1 and
+ * false is returned.
+ */
+static bool set_up_conn(const struct sockaddr *addr, socklen_t len, int type,
+			int *server, int *conn, struct tuple *tuple)
+{
+	struct sockaddr_storage storage;
+	struct sockaddr *name = (struct sockaddr *)&storage;
+	socklen_t name_len;
+
+	*server = start_server(addr, len, type);
+	if (*server < 0)
+		return false;
+
+	name_len = sizeof(storage);
+	if (CHECK_FAIL(getsockname(*server, name, &name_len)))
+		goto close_server;
+
+	*conn = connect_to_server(name, name_len, type);
+	if (*conn < 0)
+		goto close_server;
+
+	/* We want to simulate packets arriving at conn, so we have to
+	 * swap src and dst.
+	 */
+	name_len = sizeof(storage);
+	if (CHECK_FAIL(getsockname(*conn, name, &name_len)) ||
+	    CHECK_FAIL(!fill_addr_port(name, &tuple->dst)))
+		goto close_conn;
+
+	name_len = sizeof(storage);
+	if (CHECK_FAIL(getpeername(*conn, name, &name_len)) ||
+	    CHECK_FAIL(!fill_addr_port(name, &tuple->src)))
+		goto close_conn;
+
+	tuple->family = storage.ss_family;
+	return true;
+
+close_conn:
+	close(*conn);
+	*conn = -1;
+close_server:
+	close(*server);
+	*server = -1;
+	return false;
+}
+
+/*
+ * Initialise *addr as the loopback address of the given family with port 0.
+ *
+ * Returns the size of the sockaddr variant that was written, or 0 (after
+ * printing a diagnostic to stderr) when family is neither AF_INET nor
+ * AF_INET6.
+ */
+static socklen_t prepare_addr(struct sockaddr_storage *addr, int family)
+{
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)addr;
+		memset(addr4, 0, sizeof(*addr4));
+		addr4->sin_family = family;
+		addr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		return sizeof(*addr4);
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)addr;
+		memset(addr6, 0, sizeof(*addr6));
+		addr6->sin6_family = family;
+		addr6->sin6_addr = in6addr_loopback;
+		return sizeof(*addr6);
+	default:
+		/* terminate the line so later output does not run into it */
+		fprintf(stderr, "Invalid family %d\n", family);
+		return 0;
+	}
+}
+
+/* A run counts as decapsulated when the output packet is shorter than the input. */
+static bool was_decapsulated(struct bpf_prog_test_run_attr *tattr)
+{
+	return tattr->data_size_in > tattr->data_size_out;
+}
+
+/*
+ * Transport of the inner packet. The values index the per-type arrays in
+ * test_cls_redirect_common(); __NR_KIND is their count.
+ */
+enum type {
+ UDP,
+ TCP,
+ __NR_KIND,
+};
+
+/* ONE_HOP makes build_input() emit hop_count 1 and append one next-hop address. */
+enum hops {
+ NO_HOPS,
+ ONE_HOP,
+};
+
+/* TCP flag set on the inner header by build_input(); NONE sets neither. */
+enum flags {
+ NONE,
+ SYN,
+ ACK,
+};
+
+/* UNKNOWN_CONN makes build_input() decrement the source port of the tuple. */
+enum conn {
+ KNOWN_CONN,
+ UNKNOWN_CONN,
+};
+
+/*
+ * Expected outcome: ACCEPT requires was_decapsulated() to be true, FORWARD
+ * requires it to be false.
+ */
+enum result {
+ ACCEPT,
+ FORWARD,
+};
+
+/*
+ * One test scenario. tests[] uses positional initialisers, so the field
+ * order here must not change.
+ */
+struct test_cfg {
+ enum type type;
+ enum result result;
+ enum conn conn;
+ enum hops hops;
+ enum flags flags;
+};
+
+/*
+ * Format a human-readable name for a test case / address family pair into
+ * buf (at most len bytes). Returns whatever snprintf() returns.
+ */
+static int test_str(void *buf, size_t len, const struct test_cfg *test,
+		    int family)
+{
+	const char *flags_str;
+
+	switch (test->flags) {
+	case SYN:
+		flags_str = "SYN";
+		break;
+	case ACK:
+		flags_str = "ACK";
+		break;
+	default:
+		flags_str = "none";
+		break;
+	}
+
+	/* every other field is binary: anything but the tested value is the default */
+	return snprintf(buf, len, "%s %s %s %s (%s, flags: %s)",
+			family == AF_INET6 ? "IPv6" : "IPv4",
+			test->type == UDP ? "UDP" : "TCP",
+			test->result == FORWARD ? "forward" : "accept",
+			test->conn == UNKNOWN_CONN ? "unknown" : "known",
+			test->hops == ONE_HOP ? "one hop" : "no hops",
+			flags_str);
+}
+
+/*
+ * Scenario table; columns are type, expected result, conn, hops, flags
+ * (the field order of struct test_cfg). Each entry is run once per address
+ * family by test_cls_redirect_common().
+ */
+static struct test_cfg tests[] = {
+ { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, SYN },
+ { TCP, ACCEPT, UNKNOWN_CONN, NO_HOPS, ACK },
+ { TCP, FORWARD, UNKNOWN_CONN, ONE_HOP, ACK },
+ { TCP, ACCEPT, KNOWN_CONN, ONE_HOP, ACK },
+ { UDP, ACCEPT, UNKNOWN_CONN, NO_HOPS, NONE },
+ { UDP, FORWARD, UNKNOWN_CONN, ONE_HOP, NONE },
+ { UDP, ACCEPT, KNOWN_CONN, ONE_HOP, NONE },
+};
+
+/*
+ * Fill *encap with the outer headers of a test packet: Ethernet with
+ * ETH_P_IP, an IPv4 header carrying UDP to ENCAP_IP, a UDP header to
+ * ENCAP_PORT, and the GUE headers announcing hop_count hops and inner
+ * protocol proto. Members not named in the initialiser are zero.
+ */
+static void encap_init(encap_headers_t *encap, uint8_t hop_count, uint8_t proto)
+{
+ /* guehdr size in 32-bit units plus one unit per hop; presumably the GUE
+  * header length in words - confirm against progs/test_cls_redirect.h
+  */
+ const uint8_t hlen =
+ (sizeof(struct guehdr) / sizeof(uint32_t)) + hop_count;
+ *encap = (encap_headers_t){
+ .eth = { .h_proto = htons(ETH_P_IP) },
+ .ip = {
+ .ihl = 5,
+ .version = 4,
+ .ttl = IPDEFTTL,
+ .protocol = IPPROTO_UDP,
+ .daddr = htonl(ENCAP_IP)
+ },
+ .udp = {
+ .dest = htons(ENCAP_PORT),
+ },
+ .gue = {
+ .hlen = hlen,
+ .proto_ctype = proto
+ },
+ .unigue = {
+ .hop_count = hop_count
+ },
+ };
+}
+
+/*
+ * Serialise one encapsulated test packet into buf: the outer headers from
+ * encap_init(), an optional next-hop address, the inner IPv4/IPv6 header
+ * built from tuple, and finally the inner TCP or UDP header.
+ *
+ * Returns the number of bytes written, or 0 for an unsupported family or
+ * test type. No bound on buf is checked here; the caller must supply a
+ * buffer large enough for all headers.
+ */
+static size_t build_input(const struct test_cfg *test, void *const buf,
+ const struct tuple *tuple)
+{
+ in_port_t sport = tuple->src.port;
+ encap_headers_t encap;
+ struct iphdr ip;
+ struct ipv6hdr ipv6;
+ struct tcphdr tcp;
+ struct udphdr udp;
+ struct in_addr next_hop;
+ uint8_t *p = buf;
+ int proto;
+
+ /* protocol announced in the GUE header: IPv4-in-IP or IPv6 */
+ proto = IPPROTO_IPIP;
+ if (tuple->family == AF_INET6)
+ proto = IPPROTO_IPV6;
+
+ encap_init(&encap, test->hops == ONE_HOP ? 1 : 0, proto);
+ p = mempcpy(p, &encap, sizeof(encap));
+
+ if (test->hops == ONE_HOP) {
+ /* single next hop: 127.0.0.2 */
+ next_hop = (struct in_addr){ .s_addr = htonl(0x7f000002) };
+ p = mempcpy(p, &next_hop, sizeof(next_hop));
+ }
+
+ /* from here on proto is the inner transport protocol */
+ proto = IPPROTO_TCP;
+ if (test->type == UDP)
+ proto = IPPROTO_UDP;
+
+ switch (tuple->family) {
+ case AF_INET:
+ ip = (struct iphdr){
+ .ihl = 5,
+ .version = 4,
+ .ttl = IPDEFTTL,
+ .protocol = proto,
+ .saddr = tuple->src.in_addr.s_addr,
+ .daddr = tuple->dst.in_addr.s_addr,
+ };
+ p = mempcpy(p, &ip, sizeof(ip));
+ break;
+ case AF_INET6:
+ ipv6 = (struct ipv6hdr){
+ .version = 6,
+ .hop_limit = IPDEFTTL,
+ .nexthdr = proto,
+ .saddr = tuple->src.in6_addr,
+ .daddr = tuple->dst.in6_addr,
+ };
+ p = mempcpy(p, &ipv6, sizeof(ipv6));
+ break;
+ default:
+ return 0;
+ }
+
+ /* presumably so the packet no longer matches the real connection's
+  * source port - TODO confirm against the BPF program
+  */
+ if (test->conn == UNKNOWN_CONN)
+ sport--;
+
+ switch (test->type) {
+ case TCP:
+ tcp = (struct tcphdr){
+ .source = sport,
+ .dest = tuple->dst.port,
+ };
+ if (test->flags == SYN)
+ tcp.syn = true;
+ if (test->flags == ACK)
+ tcp.ack = true;
+ p = mempcpy(p, &tcp, sizeof(tcp));
+ break;
+ case UDP:
+ udp = (struct udphdr){
+ .source = sport,
+ .dest = tuple->dst.port,
+ };
+ p = mempcpy(p, &udp, sizeof(udp));
+ break;
+ default:
+ return 0;
+ }
+
+ /* bytes written so far (void-pointer arithmetic, a GNU C extension) */
+ return (void *)p - buf;
+}
+
+/* Close every strictly positive descriptor among the first n entries of fds. */
+static void close_fds(int *fds, int n)
+{
+	while (n-- > 0) {
+		int fd = *fds++;
+
+		/* zero and negative entries are skipped */
+		if (fd > 0)
+			close(fd);
+	}
+}
+
+/*
+ * Run every entry of tests[] for both address families against prog.
+ *
+ * One UDP and one TCP server/client pair is set up per family first; the
+ * recorded tuples are then used to build input packets that are fed to the
+ * program through bpf_prog_test_run_xattr(). Each run must return
+ * TC_ACT_REDIRECT, and the output size is compared with the input size
+ * according to the expected result of the test case.
+ */
+static void test_cls_redirect_common(struct bpf_program *prog)
+{
+ struct bpf_prog_test_run_attr tattr = {};
+ int families[] = { AF_INET, AF_INET6 };
+ struct sockaddr_storage ss;
+ struct sockaddr *addr;
+ socklen_t slen;
+ int i, j, err;
+ /* zero-initialised so close_fds() skips slots that were never filled */
+ int servers[__NR_KIND][ARRAY_SIZE(families)] = {};
+ int conns[__NR_KIND][ARRAY_SIZE(families)] = {};
+ struct tuple tuples[__NR_KIND][ARRAY_SIZE(families)];
+
+ addr = (struct sockaddr *)&ss;
+ for (i = 0; i < ARRAY_SIZE(families); i++) {
+ slen = prepare_addr(&ss, families[i]);
+ if (CHECK_FAIL(!slen))
+ goto cleanup;
+
+ if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_DGRAM,
+ &servers[UDP][i], &conns[UDP][i],
+ &tuples[UDP][i])))
+ goto cleanup;
+
+ if (CHECK_FAIL(!set_up_conn(addr, slen, SOCK_STREAM,
+ &servers[TCP][i], &conns[TCP][i],
+ &tuples[TCP][i])))
+ goto cleanup;
+ }
+
+ tattr.prog_fd = bpf_program__fd(prog);
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ struct test_cfg *test = &tests[i];
+
+ for (j = 0; j < ARRAY_SIZE(families); j++) {
+ struct tuple *tuple = &tuples[test->type][j];
+ char input[256];
+ char tmp[256];
+
+ test_str(tmp, sizeof(tmp), test, tuple->family);
+ if (!test__start_subtest(tmp))
+ continue;
+
+ /* tmp held the subtest name above; it is reused as the output buffer */
+ tattr.data_out = tmp;
+ tattr.data_size_out = sizeof(tmp);
+
+ tattr.data_in = input;
+ tattr.data_size_in = build_input(test, input, tuple);
+ if (CHECK_FAIL(!tattr.data_size_in))
+ continue;
+
+ err = bpf_prog_test_run_xattr(&tattr);
+ if (CHECK_FAIL(err))
+ continue;
+
+ if (tattr.retval != TC_ACT_REDIRECT) {
+ PRINT_FAIL("expected TC_ACT_REDIRECT, got %d\n",
+ tattr.retval);
+ continue;
+ }
+
+ switch (test->result) {
+ case ACCEPT:
+ if (CHECK_FAIL(!was_decapsulated(&tattr)))
+ continue;
+ break;
+ case FORWARD:
+ if (CHECK_FAIL(was_decapsulated(&tattr)))
+ continue;
+ break;
+ default:
+ PRINT_FAIL("unknown result %d\n", test->result);
+ continue;
+ }
+ }
+ }
+
+cleanup:
+ /* the 2-D arrays are walked as flat int arrays */
+ close_fds((int *)servers, sizeof(servers) / sizeof(servers[0][0]));
+ close_fds((int *)conns, sizeof(conns) / sizeof(conns[0][0]));
+}
+
+/*
+ * Open the test_cls_redirect skeleton, set the encapsulation address and port
+ * in its read-only data, load it and run the common test body.
+ */
+static void test_cls_redirect_inlined(void)
+{
+	struct test_cls_redirect *skel = test_cls_redirect__open();
+	int err;
+
+	if (CHECK(!skel, "skel_open", "failed\n"))
+		return;
+
+	/* rodata must be filled in between open and load */
+	skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+	skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+
+	err = test_cls_redirect__load(skel);
+	if (!CHECK(err, "skel_load", "failed: %d\n", err))
+		test_cls_redirect_common(skel->progs.cls_redirect);
+
+	test_cls_redirect__destroy(skel);
+}
+
+/*
+ * Same as the inlined variant, but using the test_cls_redirect_subprogs
+ * skeleton.
+ */
+static void test_cls_redirect_subprogs(void)
+{
+	struct test_cls_redirect_subprogs *skel;
+	int err;
+
+	skel = test_cls_redirect_subprogs__open();
+	if (CHECK(!skel, "skel_open", "failed\n"))
+		return;
+
+	/* rodata must be filled in between open and load */
+	skel->rodata->ENCAPSULATION_PORT = htons(ENCAP_PORT);
+	skel->rodata->ENCAPSULATION_IP = htonl(ENCAP_IP);
+
+	err = test_cls_redirect_subprogs__load(skel);
+	if (!CHECK(err, "skel_load", "failed: %d\n", err))
+		test_cls_redirect_common(skel->progs.cls_redirect);
+
+	test_cls_redirect_subprogs__destroy(skel);
+}
+
+/* Entry point: run each skeleton variant as its own subtest. */
+void test_cls_redirect(void)
+{
+	static const struct {
+		const char *name;
+		void (*run)(void);
+	} variants[] = {
+		{ "cls_redirect_inlined", test_cls_redirect_inlined },
+		{ "cls_redirect_subprogs", test_cls_redirect_subprogs },
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(variants); i++) {
+		if (test__start_subtest(variants[i].name))
+			variants[i].run();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
new file mode 100644
index 000000000..9229db2f5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+/*
+ * Check that fd's local and peer ports (as reported by getsockname() and
+ * getpeername()) equal expected_local and expected_peer, both given in host
+ * byte order. family selects how the returned sockaddr is interpreted:
+ * AF_INET, anything else is treated as AF_INET6.
+ *
+ * Returns 0 on match, -1 on a syscall failure or mismatch (after logging).
+ */
+static int verify_ports(int family, int fd,
+			__u16 expected_local, __u16 expected_peer)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	__u16 port;
+
+	if (getsockname(fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get server addr");
+		return -1;
+	}
+
+	if (family == AF_INET)
+		port = ((struct sockaddr_in *)&addr)->sin_port;
+	else
+		port = ((struct sockaddr_in6 *)&addr)->sin6_port;
+
+	if (ntohs(port) != expected_local) {
+		log_err("Unexpected local port %d, expected %d", ntohs(port),
+			expected_local);
+		return -1;
+	}
+
+	/*
+	 * len is a value-result argument: getsockname() replaced it with the
+	 * size of the returned address, so restore the full buffer size.
+	 */
+	len = sizeof(addr);
+	if (getpeername(fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get peer addr");
+		return -1;
+	}
+
+	if (family == AF_INET)
+		port = ((struct sockaddr_in *)&addr)->sin_port;
+	else
+		port = ((struct sockaddr_in6 *)&addr)->sin6_port;
+
+	if (ntohs(port) != expected_peer) {
+		log_err("Unexpected peer port %d, expected %d", ntohs(port),
+			expected_peer);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load connect_force_port4.o or connect_force_port6.o (chosen by family),
+ * attach its connect, getpeername and getsockname programs to cgroup_fd,
+ * connect to server_fd and verify the ports the client socket reports.
+ *
+ * Returns 0 on success and non-zero on any failure. The type argument is
+ * not used in this function.
+ */
+static int run_test(int cgroup_fd, int server_fd, int family, int type)
+{
+ bool v4 = family == AF_INET;
+ /* ports the client is expected to report; presumably forced by the BPF
+  * programs - confirm against progs/connect_force_port{4,6}.c
+  */
+ __u16 expected_local_port = v4 ? 22222 : 22223;
+ __u16 expected_peer_port = 60000;
+ struct bpf_prog_load_attr attr = {
+ .file = v4 ? "./connect_force_port4.o" :
+ "./connect_force_port6.o",
+ };
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ int xlate_fd, fd, err;
+ __u32 duration = 0;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &xlate_fd);
+ if (err) {
+ log_err("Failed to load BPF object");
+ return -1;
+ }
+
+ /* connect hook */
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/connect4" :
+ "cgroup/connect6");
+ if (CHECK(!prog, "find_prog", "connect prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_CONNECT :
+ BPF_CGROUP_INET6_CONNECT, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ /* getpeername hook */
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getpeername4" :
+ "cgroup/getpeername6");
+ if (CHECK(!prog, "find_prog", "getpeername prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETPEERNAME :
+ BPF_CGROUP_INET6_GETPEERNAME, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ /* getsockname hook */
+ prog = bpf_object__find_program_by_title(obj, v4 ?
+ "cgroup/getsockname4" :
+ "cgroup/getsockname6");
+ if (CHECK(!prog, "find_prog", "getsockname prog not found\n")) {
+ err = -EIO;
+ goto close_bpf_object;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd, v4 ?
+ BPF_CGROUP_INET4_GETSOCKNAME :
+ BPF_CGROUP_INET6_GETSOCKNAME, 0);
+ if (err) {
+ log_err("Failed to attach BPF program");
+ goto close_bpf_object;
+ }
+
+ fd = connect_to_fd(server_fd, 0);
+ if (fd < 0) {
+ err = -1;
+ goto close_bpf_object;
+ }
+
+ err = verify_ports(family, fd, expected_local_port,
+ expected_peer_port);
+ close(fd);
+
+close_bpf_object:
+ /* NOTE(review): the programs are not explicitly detached from cgroup_fd
+  * before the object is closed - confirm this is intended
+  */
+ bpf_object__close(obj);
+ return err;
+}
+
+/*
+ * Entry point: join the test cgroup, then run the port-forcing test for
+ * TCP and UDP over IPv4 and IPv6, one server at a time. A server that fails
+ * to start ends the sequence.
+ */
+void test_connect_force_port(void)
+{
+	static const struct {
+		int family;
+		int type;
+		int port;
+	} cases[] = {
+		{ AF_INET,  SOCK_STREAM, 60123 },
+		{ AF_INET6, SOCK_STREAM, 60124 },
+		{ AF_INET,  SOCK_DGRAM,  60123 },
+		{ AF_INET6, SOCK_DGRAM,  60124 },
+	};
+	int cgroup_fd, server_fd;
+	size_t idx;
+
+	cgroup_fd = test__join_cgroup("/connect_force_port");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
+
+	for (idx = 0; idx < sizeof(cases) / sizeof(cases[0]); idx++) {
+		server_fd = start_server(cases[idx].family, cases[idx].type,
+					 NULL, cases[idx].port, 0);
+		if (CHECK_FAIL(server_fd < 0))
+			break;
+		CHECK_FAIL(run_test(cgroup_fd, server_fd, cases[idx].family,
+				    cases[idx].type));
+		close(server_fd);
+	}
+
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/core_autosize.c b/tools/testing/selftests/bpf/prog_tests/core_autosize.c
new file mode 100644
index 000000000..981c25145
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/core_autosize.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <bpf/btf.h>
+
+/* real layout and sizes according to test's (32-bit) BTF
+ * needs to be defined before skeleton is included */
+/*
+ * User-space mirror of the struct described by the custom BTF built in
+ * test_core_autosize(): 4-byte ptr and val2, 8-byte val1, 2-byte val3,
+ * 1-byte val4 and one byte of padding.
+ */
+struct test_struct___real {
+ unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */
+ unsigned int val2;
+ unsigned long long val1;
+ unsigned short val3;
+ unsigned char val4;
+ unsigned char _pad;
+};
+
+#include "test_core_autosize.skel.h"
+
+static int duration = 0;
+
+/*
+ * Buffer that test_core_autosize() fills with a single bpf_map_lookup_elem()
+ * on the skeleton's .bss map. There is one group of values per BPF program
+ * (samesized, downsized, probed, signed); presumably the field order mirrors
+ * the globals of the BPF object - verify against progs/test_core_autosize.c.
+ */
+static struct {
+ unsigned long long ptr_samesized;
+ unsigned long long val1_samesized;
+ unsigned long long val2_samesized;
+ unsigned long long val3_samesized;
+ unsigned long long val4_samesized;
+ struct test_struct___real output_samesized;
+
+ unsigned long long ptr_downsized;
+ unsigned long long val1_downsized;
+ unsigned long long val2_downsized;
+ unsigned long long val3_downsized;
+ unsigned long long val4_downsized;
+ struct test_struct___real output_downsized;
+
+ unsigned long long ptr_probed;
+ unsigned long long val1_probed;
+ unsigned long long val2_probed;
+ unsigned long long val3_probed;
+ unsigned long long val4_probed;
+
+ /* not checked by test_core_autosize(): handle_signed is never loaded successfully */
+ unsigned long long ptr_signed;
+ unsigned long long val1_signed;
+ unsigned long long val2_signed;
+ unsigned long long val3_signed;
+ unsigned long long val4_signed;
+ struct test_struct___real output_signed;
+} out;
+
+/*
+ * Build a small BTF with 32-bit pointers, write it to a temporary file and
+ * load the test_core_autosize BPF object against it as the target BTF.
+ *
+ * First pass: handle_signed is excluded from loading, the remaining programs
+ * are attached and the values they store in .bss are compared with the
+ * expected constants. Second pass: the object is re-opened with
+ * handle_signed enabled and loading is expected to fail.
+ *
+ * All resources (stream or descriptor, temp file, BTF, skeleton) are released
+ * through the cleanup label once the temp-file step has been reached.
+ */
+void test_core_autosize(void)
+{
+	char btf_file[] = "/tmp/core_autosize.btf.XXXXXX";
+	int err, fd = -1, zero = 0;
+	int char_id, short_id, int_id, long_long_id, void_ptr_id, id;
+	struct test_core_autosize* skel = NULL;
+	struct bpf_object_load_attr load_attr = {};
+	struct bpf_program *prog;
+	struct bpf_map *bss_map;
+	struct btf *btf = NULL;
+	size_t written;
+	const void *raw_data;
+	__u32 raw_sz;
+	FILE *f = NULL;
+
+	btf = btf__new_empty();
+	if (!ASSERT_OK_PTR(btf, "empty_btf"))
+		return;
+	/* Emit the following struct with 32-bit pointer size:
+	 *
+	 * struct test_struct {
+	 *     void *ptr;
+	 *     unsigned long val2;
+	 *     unsigned long long val1;
+	 *     unsigned short val3;
+	 *     unsigned char val4;
+	 *     char: 8;
+	 * };
+	 *
+	 * This struct is going to be used as the "kernel BTF" for this test.
+	 * It's equivalent memory-layout-wise to test_struct___real above.
+	 */
+
+	/* force 32-bit pointer size */
+	btf__set_pointer_size(btf, 4);
+
+	char_id = btf__add_int(btf, "unsigned char", 1, 0);
+	ASSERT_EQ(char_id, 1, "char_id");
+	short_id = btf__add_int(btf, "unsigned short", 2, 0);
+	ASSERT_EQ(short_id, 2, "short_id");
+	/* "long unsigned int" of 4 byte size tells BTF that sizeof(void *) == 4 */
+	int_id = btf__add_int(btf, "long unsigned int", 4, 0);
+	ASSERT_EQ(int_id, 3, "int_id");
+	long_long_id = btf__add_int(btf, "unsigned long long", 8, 0);
+	ASSERT_EQ(long_long_id, 4, "long_long_id");
+	void_ptr_id = btf__add_ptr(btf, 0);
+	ASSERT_EQ(void_ptr_id, 5, "void_ptr_id");
+
+	/* field offsets below are given in bits */
+	id = btf__add_struct(btf, "test_struct", 20 /* bytes */);
+	ASSERT_EQ(id, 6, "struct_id");
+	err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0);
+	err = err ?: btf__add_field(btf, "val2", int_id, 32, 0);
+	err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0);
+	err = err ?: btf__add_field(btf, "val3", short_id, 128, 0);
+	err = err ?: btf__add_field(btf, "val4", char_id, 144, 0);
+	ASSERT_OK(err, "struct_fields");
+
+	fd = mkstemp(btf_file);
+	if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd))
+		goto cleanup;
+	f = fdopen(fd, "w");
+	if (!ASSERT_OK_PTR(f, "btf_fdopen"))
+		goto cleanup;
+	/*
+	 * The stream now owns the descriptor and fclose() will close it;
+	 * forget fd so it is never closed a second time.
+	 */
+	fd = -1;
+
+	raw_data = btf__get_raw_data(btf, &raw_sz);
+	if (!ASSERT_OK_PTR(raw_data, "raw_data"))
+		goto cleanup;
+	written = fwrite(raw_data, 1, raw_sz, f);
+	if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno))
+		goto cleanup;
+	fflush(f);
+	fclose(f);
+	f = NULL;
+
+	/* open and load BPF program with custom BTF as the kernel BTF */
+	skel = test_core_autosize__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	/* disable handle_signed() for now */
+	prog = bpf_object__find_program_by_name(skel->obj, "handle_signed");
+	if (!ASSERT_OK_PTR(prog, "prog_find"))
+		goto cleanup;
+	bpf_program__set_autoload(prog, false);
+
+	load_attr.obj = skel->obj;
+	load_attr.target_btf_path = btf_file;
+	err = bpf_object__load_xattr(&load_attr);
+	if (!ASSERT_OK(err, "prog_load"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize");
+	if (!ASSERT_OK_PTR(prog, "prog_find"))
+		goto cleanup;
+	skel->links.handle_samesize = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize");
+	if (!ASSERT_OK_PTR(prog, "prog_find"))
+		goto cleanup;
+	skel->links.handle_downsize = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_name(skel->obj, "handle_probed");
+	if (!ASSERT_OK_PTR(prog, "prog_find"))
+		goto cleanup;
+	skel->links.handle_probed = bpf_program__attach(prog);
+	if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach"))
+		goto cleanup;
+
+	/* presumably triggers the attached programs - TODO confirm attach point */
+	usleep(1);
+
+	bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss");
+	if (!ASSERT_OK_PTR(bss_map, "bss_map_find"))
+		goto cleanup;
+
+	err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, (void *)&out);
+	if (!ASSERT_OK(err, "bss_lookup"))
+		goto cleanup;
+
+	ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized");
+	ASSERT_EQ(out.val1_samesized, 0x1020304050607080, "val1_samesized");
+	ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized");
+	ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized");
+	ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized");
+	ASSERT_EQ(out.output_samesized.ptr, 0x01020304, "ptr_samesized");
+	ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized");
+	ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized");
+	ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized");
+	ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized");
+
+	ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized");
+	ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized");
+	ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized");
+	ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized");
+	ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized");
+	ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized");
+	ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized");
+	ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized");
+	ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized");
+	ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized");
+
+	ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed");
+	ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed");
+	ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed");
+	ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed");
+	ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed");
+
+	test_core_autosize__destroy(skel);
+	skel = NULL;
+
+	/* now re-load with handle_signed() enabled, it should fail loading */
+	skel = test_core_autosize__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		goto cleanup;
+
+	load_attr.obj = skel->obj;
+	load_attr.target_btf_path = btf_file;
+	err = bpf_object__load_xattr(&load_attr);
+	if (!ASSERT_ERR(err, "bad_prog_load"))
+		goto cleanup;
+
+cleanup:
+	/* at most one of f / fd is live: fd is dropped once fdopen() succeeds */
+	if (f)
+		fclose(f);
+	if (fd >= 0)
+		close(fd);
+	remove(btf_file);
+	btf__free(btf);
+	test_core_autosize__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/core_extern.c b/tools/testing/selftests/bpf/prog_tests/core_extern.c
new file mode 100644
index 000000000..1931a1585
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/core_extern.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <test_progs.h>
+#include <sys/mman.h>
+#include <sys/utsname.h>
+#include <linux/version.h>
+#include "test_core_extern.skel.h"
+
+/*
+ * Return the running kernel's version encoded with KERNEL_VERSION().
+ *
+ * Returns 0 when the version cannot be determined: either uname() failed or
+ * the release string does not start with "<major>.<minor>.<patch>".
+ */
+static uint32_t get_kernel_version(void)
+{
+	uint32_t major, minor, patch;
+	struct utsname info;
+
+	/* on failure info.release is not filled in, so don't try to parse it */
+	if (uname(&info) < 0)
+		return 0;
+	if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3)
+		return 0;
+	return KERNEL_VERSION(major, minor, patch);
+}
+
+/* Kconfig line that most cases below put in front of their own settings. */
+#define CFG "CONFIG_BPF_SYSCALL=n\n"
+
+/*
+ * Table of subtests executed by test_core_extern(). Each entry supplies a
+ * Kconfig text and either the expected resulting data or the expectation
+ * that loading the skeleton fails.
+ */
+static struct test_case {
+ /* subtest name, passed to test__start_subtest() */
+ const char *name;
+ /* Kconfig contents handed to the skeleton via the .kconfig open option */
+ const char *cfg;
+ /* true if loading the skeleton is expected to fail */
+ bool fails;
+ /*
+  * Expected contents of the skeleton's data section; kern_ver and
+  * missing_val are filled in by test_core_extern() before comparing.
+  */
+ struct test_core_extern__data data;
+} test_cases[] = {
+ { .name = "default search path", .data = { .bpf_syscall = true } },
+ {
+ .name = "custom values",
+ .cfg = "CONFIG_BPF_SYSCALL=n\n"
+ "CONFIG_TRISTATE=m\n"
+ "CONFIG_BOOL=y\n"
+ "CONFIG_CHAR=100\n"
+ "CONFIG_USHORT=30000\n"
+ "CONFIG_INT=123456\n"
+ "CONFIG_ULONG=0xDEADBEEFC0DE\n"
+ "CONFIG_STR=\"abracad\"\n"
+ "CONFIG_MISSING=0",
+ .data = {
+ .bpf_syscall = false,
+ .tristate_val = TRI_MODULE,
+ .bool_val = true,
+ .char_val = 100,
+ .ushort_val = 30000,
+ .int_val = 123456,
+ .ulong_val = 0xDEADBEEFC0DE,
+ .str_val = "abracad",
+ },
+ },
+ /* TRISTATE */
+ { .name = "tristate (y)", .cfg = CFG"CONFIG_TRISTATE=y\n",
+ .data = { .tristate_val = TRI_YES } },
+ { .name = "tristate (n)", .cfg = CFG"CONFIG_TRISTATE=n\n",
+ .data = { .tristate_val = TRI_NO } },
+ { .name = "tristate (m)", .cfg = CFG"CONFIG_TRISTATE=m\n",
+ .data = { .tristate_val = TRI_MODULE } },
+ { .name = "tristate (int)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=1" },
+ { .name = "tristate (bad)", .fails = 1, .cfg = CFG"CONFIG_TRISTATE=M" },
+ /* BOOL */
+ { .name = "bool (y)", .cfg = CFG"CONFIG_BOOL=y\n",
+ .data = { .bool_val = true } },
+ { .name = "bool (n)", .cfg = CFG"CONFIG_BOOL=n\n",
+ .data = { .bool_val = false } },
+ { .name = "bool (tristate)", .fails = 1, .cfg = CFG"CONFIG_BOOL=m" },
+ { .name = "bool (int)", .fails = 1, .cfg = CFG"CONFIG_BOOL=1" },
+ /* CHAR */
+ { .name = "char (tristate)", .cfg = CFG"CONFIG_CHAR=m\n",
+ .data = { .char_val = 'm' } },
+ { .name = "char (bad)", .fails = 1, .cfg = CFG"CONFIG_CHAR=q\n" },
+ { .name = "char (empty)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\n" },
+ { .name = "char (str)", .fails = 1, .cfg = CFG"CONFIG_CHAR=\"y\"\n" },
+ /* STRING */
+ { .name = "str (empty)", .cfg = CFG"CONFIG_STR=\"\"\n",
+ .data = { .str_val = "\0\0\0\0\0\0\0" } },
+ { .name = "str (padded)", .cfg = CFG"CONFIG_STR=\"abra\"\n",
+ .data = { .str_val = "abra\0\0\0" } },
+ { .name = "str (too long)", .cfg = CFG"CONFIG_STR=\"abracada\"\n",
+ .data = { .str_val = "abracad" } },
+ { .name = "str (no value)", .fails = 1, .cfg = CFG"CONFIG_STR=\n" },
+ { .name = "str (bad value)", .fails = 1, .cfg = CFG"CONFIG_STR=bla\n" },
+ /* INTEGERS */
+ {
+ .name = "integer forms",
+ .cfg = CFG
+ "CONFIG_CHAR=0xA\n"
+ "CONFIG_USHORT=0462\n"
+ "CONFIG_INT=-100\n"
+ "CONFIG_ULONG=+1000000000000",
+ .data = {
+ .char_val = 0xA,
+ .ushort_val = 0462,
+ .int_val = -100,
+ .ulong_val = 1000000000000,
+ },
+ },
+ { .name = "int (bad)", .fails = 1, .cfg = CFG"CONFIG_INT=abc" },
+ { .name = "int (str)", .fails = 1, .cfg = CFG"CONFIG_INT=\"abc\"" },
+ { .name = "int (empty)", .fails = 1, .cfg = CFG"CONFIG_INT=" },
+ { .name = "int (mixed)", .fails = 1, .cfg = CFG"CONFIG_INT=123abc" },
+ { .name = "int (max)", .cfg = CFG"CONFIG_INT=2147483647",
+ .data = { .int_val = 2147483647 } },
+ { .name = "int (min)", .cfg = CFG"CONFIG_INT=-2147483648",
+ .data = { .int_val = -2147483648 } },
+ { .name = "int (max+1)", .fails = 1, .cfg = CFG"CONFIG_INT=2147483648" },
+ { .name = "int (min-1)", .fails = 1, .cfg = CFG"CONFIG_INT=-2147483649" },
+ { .name = "ushort (max)", .cfg = CFG"CONFIG_USHORT=65535",
+ .data = { .ushort_val = 65535 } },
+ { .name = "ushort (min)", .cfg = CFG"CONFIG_USHORT=0",
+ .data = { .ushort_val = 0 } },
+ { .name = "ushort (max+1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=65536" },
+ { .name = "ushort (min-1)", .fails = 1, .cfg = CFG"CONFIG_USHORT=-1" },
+ { .name = "u64 (max)", .cfg = CFG"CONFIG_ULONG=0xffffffffffffffff",
+ .data = { .ulong_val = 0xffffffffffffffff } },
+ { .name = "u64 (min)", .cfg = CFG"CONFIG_ULONG=0",
+ .data = { .ulong_val = 0 } },
+ { .name = "u64 (max+1)", .fails = 1, .cfg = CFG"CONFIG_ULONG=0x10000000000000000" },
+};
+
+/*
+ * Run every entry of test_cases[] as its own subtest.
+ *
+ * For each case the skeleton is opened with the case's Kconfig text and
+ * loaded. Cases marked as failing only check that the load is rejected. The
+ * remaining cases attach the programs, give them a chance to run and then
+ * compare the skeleton's data section with the expected data, one 64-bit
+ * word at a time.
+ */
+void test_core_extern(void)
+{
+	struct test_core_extern *skel = NULL;
+	const int nr_words = sizeof(*skel->data) / sizeof(uint64_t);
+	const uint32_t kern_ver = get_kernel_version();
+	uint64_t *actual, *expected;
+	int duration = 0;
+	int err, idx, w;
+
+	for (idx = 0; idx < ARRAY_SIZE(test_cases); idx++) {
+		struct test_case *tc = &test_cases[idx];
+		DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
+			.kconfig = tc->cfg,
+		);
+
+		if (!test__start_subtest(tc->name))
+			continue;
+
+		skel = test_core_extern__open_opts(&open_opts);
+		if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+			goto cleanup;
+
+		err = test_core_extern__load(skel);
+		if (tc->fails) {
+			/* a rejected load is the whole point of this case */
+			CHECK(!err, "skel_load",
+			      "shouldn't succeed open/load of skeleton\n");
+			goto cleanup;
+		}
+		if (CHECK(err, "skel_load",
+			  "failed to open/load skeleton\n"))
+			goto cleanup;
+
+		err = test_core_extern__attach(skel);
+		if (CHECK(err, "attach_raw_tp", "failed attach: %d\n", err))
+			goto cleanup;
+
+		usleep(1);
+
+		/* these two values do not depend on the per-case Kconfig */
+		tc->data.kern_ver = kern_ver;
+		tc->data.missing_val = 0xDEADC0DE;
+
+		actual = (uint64_t *)skel->data;
+		expected = (uint64_t *)&tc->data;
+		for (w = 0; w < nr_words; w++) {
+			CHECK(actual[w] != expected[w], "check_res",
+			      "result #%d: expected %llx, but got %llx\n",
+			      w, (__u64)expected[w], (__u64)actual[w]);
+		}
+cleanup:
+		test_core_extern__destroy(skel);
+		skel = NULL;
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/core_reloc.c b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
new file mode 100644
index 000000000..5b52985cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/core_reloc.c
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "progs/core_reloc_types.h"
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <bpf/btf.h>
+
+static int duration = 0;
+
+/*
+ * Prefix for a compound literal of 'struct struct_name': takes the literal's
+ * address and casts it to const char *. Must be followed by the literal's
+ * brace-enclosed initializer.
+ */
+#define STRUCT_TO_CHAR_PTR(struct_name) (const char *)&(struct struct_name)
+
+/* Field values shared by the input and expected output of the flavors cases. */
+#define FLAVORS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \
+ .a = 42, \
+ .b = 0xc001, \
+ .c = 0xbeef, \
+}
+
+/* Name, BPF object file and target BTF file of a flavors case. */
+#define FLAVORS_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_flavors.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o" \
+
+/* Flavors case that must load and produce FLAVORS_DATA as output. */
+#define FLAVORS_CASE(name) { \
+ FLAVORS_CASE_COMMON(name), \
+ .input = FLAVORS_DATA(core_reloc_##name), \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = FLAVORS_DATA(core_reloc_flavors), \
+ .output_len = sizeof(struct core_reloc_flavors), \
+}
+
+/* Flavors case whose object load is expected to fail. */
+#define FLAVORS_ERR_CASE(name) { \
+ FLAVORS_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Nested field values shared by the input and expected output of nesting cases. */
+#define NESTING_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \
+ .a = { .a = { .a = 42 } }, \
+ .b = { .b = { .b = 0xc001 } }, \
+}
+
+/* Name, BPF object file and target BTF file of a nesting case. */
+#define NESTING_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_nesting.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o"
+
+/* Nesting case that must load and produce NESTING_DATA as output. */
+#define NESTING_CASE(name) { \
+ NESTING_CASE_COMMON(name), \
+ .input = NESTING_DATA(core_reloc_##name), \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = NESTING_DATA(core_reloc_nesting), \
+ .output_len = sizeof(struct core_reloc_nesting) \
+}
+
+/* Nesting case whose object load is expected to fail. */
+#define NESTING_ERR_CASE(name) { \
+ NESTING_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Input for the arrays cases: a few selected array elements are set, the rest stay zero. */
+#define ARRAYS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \
+ .a = { [2] = 1 }, \
+ .b = { [1] = { [2] = { [3] = 2 } } }, \
+ .c = { [1] = { .c = 3 } }, \
+ .d = { [0] = { [0] = { .d = 4 } } }, \
+}
+
+/* Name, BPF object file and target BTF file of an arrays case. */
+#define ARRAYS_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_arrays.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o"
+
+/*
+ * Arrays case that must load; the expected output carries the values placed
+ * into ARRAYS_DATA (1, 2, 3, 4) plus a zero f10c.
+ */
+#define ARRAYS_CASE(name) { \
+ ARRAYS_CASE_COMMON(name), \
+ .input = ARRAYS_DATA(core_reloc_##name), \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_arrays_output) { \
+ .a2 = 1, \
+ .b123 = 2, \
+ .c1c = 3, \
+ .d00d = 4, \
+ .f10c = 0, \
+ }, \
+ .output_len = sizeof(struct core_reloc_arrays_output) \
+}
+
+/* Arrays case whose object load is expected to fail. */
+#define ARRAYS_ERR_CASE(name) { \
+ ARRAYS_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Field values shared by the input and expected output of primitives cases. */
+#define PRIMITIVES_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \
+ .a = 1, \
+ .b = 2, \
+ .c = 3, \
+ .d = (void *)4, \
+ .f = (void *)5, \
+}
+
+/* Name, BPF object file and target BTF file of a primitives case. */
+#define PRIMITIVES_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_primitives.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o"
+
+/* Primitives case that must load and produce PRIMITIVES_DATA as output. */
+#define PRIMITIVES_CASE(name) { \
+ PRIMITIVES_CASE_COMMON(name), \
+ .input = PRIMITIVES_DATA(core_reloc_##name), \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = PRIMITIVES_DATA(core_reloc_primitives), \
+ .output_len = sizeof(struct core_reloc_primitives), \
+}
+
+/* Primitives case whose object load is expected to fail. */
+#define PRIMITIVES_ERR_CASE(name) { \
+ PRIMITIVES_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/*
+ * Case for test_core_reloc_mods.o: the input sets fields a..h (scalars,
+ * pointers, array elements and nested members) to 1..8 and the expected
+ * output holds the same eight values as plain fields.
+ */
+#define MODS_CASE(name) { \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_mods.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) { \
+ .a = 1, \
+ .b = 2, \
+ .c = (void *)3, \
+ .d = (void *)4, \
+ .e = { [2] = 5 }, \
+ .f = { [1] = 6 }, \
+ .g = { .x = 7 }, \
+ .h = { .y = 8 }, \
+ }, \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_mods_output) { \
+ .a = 1, .b = 2, .c = 3, .d = 4, \
+ .e = 5, .f = 6, .g = 7, .h = 8, \
+ }, \
+ .output_len = sizeof(struct core_reloc_mods_output), \
+}
+
+/*
+ * Case for test_core_reloc_ptr_as_arr.o: the input is an array of three
+ * structs with .a = 1, 2, 3 and the expected output is .a = 3, i.e. the
+ * value stored in the last element.
+ */
+#define PTR_AS_ARR_CASE(name) { \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_ptr_as_arr.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .input = (const char *)&(struct core_reloc_##name []){ \
+ { .a = 1 }, \
+ { .a = 2 }, \
+ { .a = 3 }, \
+ }, \
+ .input_len = 3 * sizeof(struct core_reloc_##name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_ptr_as_arr) { \
+ .a = 3, \
+ }, \
+ .output_len = sizeof(struct core_reloc_ptr_as_arr), \
+}
+
+/* Field values shared by the input and expected output of ints cases. */
+#define INTS_DATA(struct_name) STRUCT_TO_CHAR_PTR(struct_name) { \
+ .u8_field = 1, \
+ .s8_field = 2, \
+ .u16_field = 3, \
+ .s16_field = 4, \
+ .u32_field = 5, \
+ .s32_field = 6, \
+ .u64_field = 7, \
+ .s64_field = 8, \
+}
+
+/* Name, BPF object file and target BTF file of an ints case. */
+#define INTS_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_ints.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o"
+
+/* Ints case that must load and produce INTS_DATA as output. */
+#define INTS_CASE(name) { \
+ INTS_CASE_COMMON(name), \
+ .input = INTS_DATA(core_reloc_##name), \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = INTS_DATA(core_reloc_ints), \
+ .output_len = sizeof(struct core_reloc_ints), \
+}
+
+/* Ints case whose object load is expected to fail. */
+#define INTS_ERR_CASE(name) { \
+ INTS_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Name, BPF object file and target BTF file of a field-existence case. */
+#define FIELD_EXISTS_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_existence.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o" \
+
+/*
+ * Common part of a bitfields case. The case name is test_name_prefix
+ * followed by the stringified name; objfile selects the BPF object.
+ */
+#define BITFIELDS_CASE_COMMON(objfile, test_name_prefix, name) \
+ .case_name = test_name_prefix#name, \
+ .bpf_obj_file = objfile, \
+ .btf_src_file = "btf__core_reloc_" #name ".o"
+
+/*
+ * Expands to TWO table entries using the same initializer (__VA_ARGS__) for
+ * both input and expected output: a "probed:" entry for
+ * test_core_reloc_bitfields_probed.o and a "direct:" entry for
+ * test_core_reloc_bitfields_direct.o with direct_raw_tp set.
+ */
+#define BITFIELDS_CASE(name, ...) { \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \
+ "probed:", name), \
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \
+ __VA_ARGS__, \
+ .output_len = sizeof(struct core_reloc_bitfields_output), \
+}, { \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \
+ "direct:", name), \
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_##name) __VA_ARGS__, \
+ .input_len = sizeof(struct core_reloc_##name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_bitfields_output) \
+ __VA_ARGS__, \
+ .output_len = sizeof(struct core_reloc_bitfields_output), \
+ .direct_raw_tp = true, \
+}
+
+
+/* Like BITFIELDS_CASE (two entries), but both loads are expected to fail. */
+#define BITFIELDS_ERR_CASE(name) { \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_probed.o", \
+ "probed:", name), \
+ .fails = true, \
+}, { \
+ BITFIELDS_CASE_COMMON("test_core_reloc_bitfields_direct.o", \
+ "direct:", name), \
+ .direct_raw_tp = true, \
+ .fails = true, \
+}
+
+/*
+ * Name, BPF object file and target BTF file of a size case; also sets
+ * relaxed_core_relocs.
+ */
+#define SIZE_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_size.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o", \
+ .relaxed_core_relocs = true
+
+/*
+ * Expected output of a size case: sizeof() of each field of 'type' as seen
+ * by the host compiler, except for the pointer size which is fixed at 8.
+ */
+#define SIZE_OUTPUT_DATA(type) \
+ STRUCT_TO_CHAR_PTR(core_reloc_size_output) { \
+ .int_sz = sizeof(((type *)0)->int_field), \
+ .struct_sz = sizeof(((type *)0)->struct_field), \
+ .union_sz = sizeof(((type *)0)->union_field), \
+ .arr_sz = sizeof(((type *)0)->arr_field), \
+ .arr_elem_sz = sizeof(((type *)0)->arr_field[0]), \
+ .ptr_sz = 8, /* always 8-byte pointer for BPF */ \
+ .enum_sz = sizeof(((type *)0)->enum_field), \
+ }
+
+/* Size case that must load; it has no input data (input_len is 0). */
+#define SIZE_CASE(name) { \
+ SIZE_CASE_COMMON(name), \
+ .input_len = 0, \
+ .output = SIZE_OUTPUT_DATA(struct core_reloc_##name), \
+ .output_len = sizeof(struct core_reloc_size_output), \
+}
+
+/* Size case whose object load is expected to fail. */
+#define SIZE_ERR_CASE(name) { \
+ SIZE_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Name, BPF object file and target BTF file of a type-based case. */
+#define TYPE_BASED_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_type_based.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o" \
+
+/* Type-based case that must load; __VA_ARGS__ is the expected output initializer. */
+#define TYPE_BASED_CASE(name, ...) { \
+ TYPE_BASED_CASE_COMMON(name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_type_based_output) \
+ __VA_ARGS__, \
+ .output_len = sizeof(struct core_reloc_type_based_output), \
+}
+
+/* Type-based case whose object load is expected to fail. */
+#define TYPE_BASED_ERR_CASE(name) { \
+ TYPE_BASED_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Name, BPF object file and target BTF file of a type-id case. */
+#define TYPE_ID_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_type_id.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o" \
+
+/*
+ * Type-id case that must load. The expected output starts out zeroed and is
+ * filled in at run time by setup_fn.
+ */
+#define TYPE_ID_CASE(name, setup_fn) { \
+ TYPE_ID_CASE_COMMON(name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_type_id_output) {}, \
+ .output_len = sizeof(struct core_reloc_type_id_output), \
+ .setup = setup_fn, \
+}
+
+/* Type-id case whose object load is expected to fail. */
+#define TYPE_ID_ERR_CASE(name) { \
+ TYPE_ID_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+/* Name, BPF object file and target BTF file of an enum-value case. */
+#define ENUMVAL_CASE_COMMON(name) \
+ .case_name = #name, \
+ .bpf_obj_file = "test_core_reloc_enumval.o", \
+ .btf_src_file = "btf__core_reloc_" #name ".o" \
+
+/* Enum-value case that must load; __VA_ARGS__ is the expected output initializer. */
+#define ENUMVAL_CASE(name, ...) { \
+ ENUMVAL_CASE_COMMON(name), \
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_enumval_output) \
+ __VA_ARGS__, \
+ .output_len = sizeof(struct core_reloc_enumval_output), \
+}
+
+/* Enum-value case whose object load is expected to fail. */
+#define ENUMVAL_ERR_CASE(name) { \
+ ENUMVAL_CASE_COMMON(name), \
+ .fails = true, \
+}
+
+struct core_reloc_test_case;
+
+/*
+ * Optional per-case hook run by test_core_reloc() before the BPF object is
+ * opened; a non-zero return makes the case fail and be skipped over.
+ */
+typedef int (*setup_test_fn)(struct core_reloc_test_case *test);
+
+/* Description of one CO-RE relocation subtest. */
+struct core_reloc_test_case {
+ /* subtest name, passed to test__start_subtest() */
+ const char *case_name;
+ /* BPF object file to open */
+ const char *bpf_obj_file;
+ /* BTF file used as relocation target (target_btf_path); may be NULL */
+ const char *btf_src_file;
+ /* input_len bytes copied into the 'in' area of the program's data */
+ const char *input;
+ int input_len;
+ /* output_len bytes compared against the 'out' area of the program's data */
+ const char *output;
+ int output_len;
+ /* true if loading the object is expected to fail */
+ bool fails;
+ /* NOTE(review): set by SIZE_CASE_COMMON but not read in test_core_reloc() - confirm it is still needed */
+ bool relaxed_core_relocs;
+ /* use the "tp_btf/sys_enter" program instead of "raw_tracepoint/sys_enter" */
+ bool direct_raw_tp;
+ /* optional hook, see setup_test_fn */
+ setup_test_fn setup;
+};
+
+/*
+ * Look up the BTF type called @name of kind @kind in @btf.
+ *
+ * Returns the type ID on success. When the lookup does not yield a positive
+ * ID, a test failure is recorded and -1 is returned.
+ */
+static int find_btf_type(const struct btf *btf, const char *name, __u32 kind)
+{
+	const int type_id = btf__find_by_name_kind(btf, name, kind);
+
+	if (!CHECK(type_id <= 0, "find_type_id", "failed to find '%s', kind %d: %d\n", name, kind, type_id))
+		return type_id;
+
+	return -1;
+}
+
+/* Return true if the string at @name_off in @btf equals @expected. */
+static bool type_id_name_eq(const struct btf *btf, __u32 name_off, const char *expected)
+{
+	const char *name = btf__name_by_offset(btf, name_off);
+
+	return name && strcmp(name, expected) == 0;
+}
+
+/* Return true if @t is an integer type named "_Bool". */
+static bool type_id_is_bool(const struct btf *btf, const struct btf_type *t)
+{
+	return t && btf_is_int(t) && type_id_name_eq(btf, t->name_off, "_Bool");
+}
+
+/*
+ * Fill in the "local" half of the expected type-id output of @test.
+ *
+ * The BTF of the BPF object file is parsed and scanned for the anonymous
+ * types the test is interested in (recognized by their marker member or
+ * enumerator name, or by their shape); named types are looked up by name.
+ * The target BTF file is parsed as well, but only to verify that it is
+ * usable.
+ *
+ * Returns 0 on success and -EINVAL if either BTF cannot be parsed.
+ */
+static int setup_type_id_case_local(struct core_reloc_test_case *test)
+{
+	struct core_reloc_type_id_output *exp = (void *)test->output;
+	struct btf *local_btf = btf__parse(test->bpf_obj_file, NULL);
+	struct btf *targ_btf = btf__parse(test->btf_src_file, NULL);
+	const struct btf_type *t, *ref;
+	int i;
+
+	if (CHECK(IS_ERR(local_btf), "local_btf", "failed: %ld\n", PTR_ERR(local_btf)) ||
+	    CHECK(IS_ERR(targ_btf), "targ_btf", "failed: %ld\n", PTR_ERR(targ_btf))) {
+		btf__free(local_btf);
+		btf__free(targ_btf);
+		return -EINVAL;
+	}
+
+	exp->local_anon_struct = -1;
+	exp->local_anon_union = -1;
+	exp->local_anon_enum = -1;
+	exp->local_anon_func_proto_ptr = -1;
+	exp->local_anon_void_ptr = -1;
+	exp->local_anon_arr = -1;
+
+	for (i = 1; i <= btf__get_nr_types(local_btf); i++) {
+		t = btf__type_by_id(local_btf, i);
+		/* we are interested only in anonymous types */
+		if (t->name_off)
+			continue;
+
+		if (btf_is_struct(t)) {
+			if (btf_vlen(t) &&
+			    type_id_name_eq(local_btf, btf_members(t)[0].name_off, "marker_field"))
+				exp->local_anon_struct = i;
+		} else if (btf_is_union(t)) {
+			if (btf_vlen(t) &&
+			    type_id_name_eq(local_btf, btf_members(t)[0].name_off, "marker_field"))
+				exp->local_anon_union = i;
+		} else if (btf_is_enum(t)) {
+			if (btf_vlen(t) &&
+			    type_id_name_eq(local_btf, btf_enum(t)[0].name_off, "MARKER_ENUM_VAL"))
+				exp->local_anon_enum = i;
+		} else if (btf_is_ptr(t)) {
+			/*
+			 * Inspect the pointee through a separate variable, so
+			 * that a pointer to a function returning void is not
+			 * mistaken for a plain void pointer.
+			 */
+			ref = btf__type_by_id(local_btf, t->type);
+			if (!ref)
+				continue;
+			if (btf_is_func_proto(ref)) {
+				/* ptr -> func_proto -> _Bool */
+				if (type_id_is_bool(local_btf, btf__type_by_id(local_btf, ref->type)))
+					exp->local_anon_func_proto_ptr = i;
+			} else if (btf_is_void(ref)) {
+				/* ptr -> void */
+				exp->local_anon_void_ptr = i;
+			}
+		} else if (btf_is_array(t)) {
+			/* _Bool[] */
+			if (type_id_is_bool(local_btf, btf__type_by_id(local_btf, btf_array(t)->type)))
+				exp->local_anon_arr = i;
+		}
+	}
+
+	exp->local_struct = find_btf_type(local_btf, "a_struct", BTF_KIND_STRUCT);
+	exp->local_union = find_btf_type(local_btf, "a_union", BTF_KIND_UNION);
+	exp->local_enum = find_btf_type(local_btf, "an_enum", BTF_KIND_ENUM);
+	exp->local_int = find_btf_type(local_btf, "int", BTF_KIND_INT);
+	exp->local_struct_typedef = find_btf_type(local_btf, "named_struct_typedef", BTF_KIND_TYPEDEF);
+	exp->local_func_proto_typedef = find_btf_type(local_btf, "func_proto_typedef", BTF_KIND_TYPEDEF);
+	exp->local_arr_typedef = find_btf_type(local_btf, "arr_typedef", BTF_KIND_TYPEDEF);
+
+	btf__free(local_btf);
+	btf__free(targ_btf);
+	return 0;
+}
+
+/*
+ * Setup hook for type-id cases in which every target type is expected to be
+ * found: fills in the local type IDs and then the IDs of the corresponding
+ * types in the target BTF file.
+ *
+ * Returns 0 on success, the error from setup_type_id_case_local(), or
+ * -EINVAL if the target BTF cannot be parsed.
+ */
+static int setup_type_id_case_success(struct core_reloc_test_case *test) {
+	struct core_reloc_type_id_output *exp = (void *)test->output;
+	struct btf *targ_btf;
+	int err;
+
+	err = setup_type_id_case_local(test);
+	if (err)
+		return err;
+
+	/* parse the target BTF exactly once, so that nothing is leaked */
+	targ_btf = btf__parse(test->btf_src_file, NULL);
+	if (CHECK(IS_ERR(targ_btf), "targ_btf", "failed: %ld\n", PTR_ERR(targ_btf)))
+		return -EINVAL;
+
+	exp->targ_struct = find_btf_type(targ_btf, "a_struct", BTF_KIND_STRUCT);
+	exp->targ_union = find_btf_type(targ_btf, "a_union", BTF_KIND_UNION);
+	exp->targ_enum = find_btf_type(targ_btf, "an_enum", BTF_KIND_ENUM);
+	exp->targ_int = find_btf_type(targ_btf, "int", BTF_KIND_INT);
+	exp->targ_struct_typedef = find_btf_type(targ_btf, "named_struct_typedef", BTF_KIND_TYPEDEF);
+	exp->targ_func_proto_typedef = find_btf_type(targ_btf, "func_proto_typedef", BTF_KIND_TYPEDEF);
+	exp->targ_arr_typedef = find_btf_type(targ_btf, "arr_typedef", BTF_KIND_TYPEDEF);
+
+	btf__free(targ_btf);
+	return 0;
+}
+
+/*
+ * Setup hook for type-id cases in which no target type is expected to be
+ * found: fills in the local type IDs and expects every target type ID to
+ * be 0.
+ *
+ * Returns 0 on success or the error from setup_type_id_case_local().
+ */
+static int setup_type_id_case_failure(struct core_reloc_test_case *test)
+{
+	struct core_reloc_type_id_output *out = (void *)test->output;
+	int rc = setup_type_id_case_local(test);
+
+	if (rc == 0) {
+		out->targ_struct = 0;
+		out->targ_union = 0;
+		out->targ_enum = 0;
+		out->targ_int = 0;
+		out->targ_struct_typedef = 0;
+		out->targ_func_proto_typedef = 0;
+		out->targ_arr_typedef = 0;
+	}
+
+	return rc;
+}
+
+/*
+ * All CO-RE relocation subtests run by test_core_reloc(). Most entries are
+ * generated by the *_CASE()/*_ERR_CASE() macros defined above; note that each
+ * BITFIELDS_*CASE() use contributes two entries ("probed:" and "direct:").
+ */
+static struct core_reloc_test_case test_cases[] = {
+ /* validate we can find kernel image and use its BTF for relocs */
+ {
+ .case_name = "kernel",
+ .bpf_obj_file = "test_core_reloc_kernel.o",
+ .btf_src_file = NULL, /* load from /lib/modules/$(uname -r) */
+ .input = "",
+ .input_len = 0,
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_kernel_output) {
+ .valid = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+ .comm = "test_progs",
+ .comm_len = sizeof("test_progs"),
+ },
+ .output_len = sizeof(struct core_reloc_kernel_output),
+ },
+
+ /* validate BPF program can use multiple flavors to match against
+ * single target BTF type
+ */
+ FLAVORS_CASE(flavors),
+
+ FLAVORS_ERR_CASE(flavors__err_wrong_name),
+
+ /* various struct/enum nesting and resolution scenarios */
+ NESTING_CASE(nesting),
+ NESTING_CASE(nesting___anon_embed),
+ NESTING_CASE(nesting___struct_union_mixup),
+ NESTING_CASE(nesting___extra_nesting),
+ NESTING_CASE(nesting___dup_compat_types),
+
+ NESTING_ERR_CASE(nesting___err_missing_field),
+ NESTING_ERR_CASE(nesting___err_array_field),
+ NESTING_ERR_CASE(nesting___err_missing_container),
+ NESTING_ERR_CASE(nesting___err_nonstruct_container),
+ NESTING_ERR_CASE(nesting___err_array_container),
+ NESTING_ERR_CASE(nesting___err_dup_incompat_types),
+ NESTING_ERR_CASE(nesting___err_partial_match_dups),
+ NESTING_ERR_CASE(nesting___err_too_deep),
+
+ /* various array access relocation scenarios */
+ ARRAYS_CASE(arrays),
+ ARRAYS_CASE(arrays___diff_arr_dim),
+ ARRAYS_CASE(arrays___diff_arr_val_sz),
+ ARRAYS_CASE(arrays___equiv_zero_sz_arr),
+ ARRAYS_CASE(arrays___fixed_arr),
+
+ ARRAYS_ERR_CASE(arrays___err_too_small),
+ ARRAYS_ERR_CASE(arrays___err_too_shallow),
+ ARRAYS_ERR_CASE(arrays___err_non_array),
+ ARRAYS_ERR_CASE(arrays___err_wrong_val_type),
+ ARRAYS_ERR_CASE(arrays___err_bad_zero_sz_arr),
+
+ /* enum/ptr/int handling scenarios */
+ PRIMITIVES_CASE(primitives),
+ PRIMITIVES_CASE(primitives___diff_enum_def),
+ PRIMITIVES_CASE(primitives___diff_func_proto),
+ PRIMITIVES_CASE(primitives___diff_ptr_type),
+
+ PRIMITIVES_ERR_CASE(primitives___err_non_enum),
+ PRIMITIVES_ERR_CASE(primitives___err_non_int),
+ PRIMITIVES_ERR_CASE(primitives___err_non_ptr),
+
+ /* const/volatile/restrict and typedefs scenarios */
+ MODS_CASE(mods),
+ MODS_CASE(mods___mod_swap),
+ MODS_CASE(mods___typedefs),
+
+ /* handling "ptr is an array" semantics */
+ PTR_AS_ARR_CASE(ptr_as_arr),
+ PTR_AS_ARR_CASE(ptr_as_arr___diff_sz),
+
+ /* int signedness/sizing/bitfield handling */
+ INTS_CASE(ints),
+ INTS_CASE(ints___bool),
+ INTS_CASE(ints___reverse_sign),
+
+ /* validate edge cases of capturing relocations */
+ {
+ .case_name = "misc",
+ .bpf_obj_file = "test_core_reloc_misc.o",
+ .btf_src_file = "btf__core_reloc_misc.o",
+ .input = (const char *)&(struct core_reloc_misc_extensible[]){
+ { .a = 1 },
+ { .a = 2 }, /* not read */
+ { .a = 3 },
+ },
+ /* NOTE(review): only 4 ints are copied, not the whole 3-element array - confirm this is intended */
+ .input_len = 4 * sizeof(int),
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_misc_output) {
+ .a = 1,
+ .b = 1,
+ .c = 0, /* BUG in clang, should be 3 */
+ },
+ .output_len = sizeof(struct core_reloc_misc_output),
+ },
+
+ /* validate field existence checks */
+ {
+ FIELD_EXISTS_CASE_COMMON(existence),
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_existence) {
+ .a = 1,
+ .b = 2,
+ .c = 3,
+ .arr = { 4 },
+ .s = { .x = 5 },
+ },
+ .input_len = sizeof(struct core_reloc_existence),
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) {
+ .a_exists = 1,
+ .b_exists = 1,
+ .c_exists = 1,
+ .arr_exists = 1,
+ .s_exists = 1,
+ .a_value = 1,
+ .b_value = 2,
+ .c_value = 3,
+ .arr_value = 4,
+ .s_value = 5,
+ },
+ .output_len = sizeof(struct core_reloc_existence_output),
+ },
+ {
+ FIELD_EXISTS_CASE_COMMON(existence___minimal),
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___minimal) {
+ .a = 42,
+ },
+ .input_len = sizeof(struct core_reloc_existence___minimal),
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) {
+ .a_exists = 1,
+ .b_exists = 0,
+ .c_exists = 0,
+ .arr_exists = 0,
+ .s_exists = 0,
+ .a_value = 42,
+ .b_value = 0xff000002u,
+ .c_value = 0xff000003u,
+ .arr_value = 0xff000004u,
+ .s_value = 0xff000005u,
+ },
+ .output_len = sizeof(struct core_reloc_existence_output),
+ },
+ {
+ FIELD_EXISTS_CASE_COMMON(existence___wrong_field_defs),
+ .input = STRUCT_TO_CHAR_PTR(core_reloc_existence___wrong_field_defs) {
+ },
+ .input_len = sizeof(struct core_reloc_existence___wrong_field_defs),
+ .output = STRUCT_TO_CHAR_PTR(core_reloc_existence_output) {
+ .a_exists = 0,
+ .b_exists = 0,
+ .c_exists = 0,
+ .arr_exists = 0,
+ .s_exists = 0,
+ .a_value = 0xff000001u,
+ .b_value = 0xff000002u,
+ .c_value = 0xff000003u,
+ .arr_value = 0xff000004u,
+ .s_value = 0xff000005u,
+ },
+ .output_len = sizeof(struct core_reloc_existence_output),
+ },
+
+ /* bitfield relocation checks */
+ BITFIELDS_CASE(bitfields, {
+ .ub1 = 1,
+ .ub2 = 2,
+ .ub7 = 96,
+ .sb4 = -7,
+ .sb20 = -0x76543,
+ .u32 = 0x80000000,
+ .s32 = -0x76543210,
+ }),
+ BITFIELDS_CASE(bitfields___bit_sz_change, {
+ .ub1 = 6,
+ .ub2 = 0xABCDE,
+ .ub7 = 1,
+ .sb4 = -1,
+ .sb20 = -0x17654321,
+ .u32 = 0xBEEF,
+ .s32 = -0x3FEDCBA987654321LL,
+ }),
+ BITFIELDS_CASE(bitfields___bitfield_vs_int, {
+ .ub1 = 0xFEDCBA9876543210LL,
+ .ub2 = 0xA6,
+ .ub7 = -0x7EDCBA987654321LL,
+ .sb4 = -0x6123456789ABCDELL,
+ .sb20 = 0xD00DLL,
+ .u32 = -0x76543,
+ .s32 = 0x0ADEADBEEFBADB0BLL,
+ }),
+ BITFIELDS_CASE(bitfields___just_big_enough, {
+ .ub1 = 0xFLL,
+ .ub2 = 0x0812345678FEDCBALL,
+ }),
+ BITFIELDS_ERR_CASE(bitfields___err_too_big_bitfield),
+
+ /* size relocation checks */
+ SIZE_CASE(size),
+ SIZE_CASE(size___diff_sz),
+ SIZE_ERR_CASE(size___err_ambiguous),
+
+ /* validate type existence and size relocations */
+ TYPE_BASED_CASE(type_based, {
+ .struct_exists = 1,
+ .union_exists = 1,
+ .enum_exists = 1,
+ .typedef_named_struct_exists = 1,
+ .typedef_anon_struct_exists = 1,
+ .typedef_struct_ptr_exists = 1,
+ .typedef_int_exists = 1,
+ .typedef_enum_exists = 1,
+ .typedef_void_ptr_exists = 1,
+ .typedef_func_proto_exists = 1,
+ .typedef_arr_exists = 1,
+ .struct_sz = sizeof(struct a_struct),
+ .union_sz = sizeof(union a_union),
+ .enum_sz = sizeof(enum an_enum),
+ .typedef_named_struct_sz = sizeof(named_struct_typedef),
+ .typedef_anon_struct_sz = sizeof(anon_struct_typedef),
+ .typedef_struct_ptr_sz = sizeof(struct_ptr_typedef),
+ .typedef_int_sz = sizeof(int_typedef),
+ .typedef_enum_sz = sizeof(enum_typedef),
+ .typedef_void_ptr_sz = sizeof(void_ptr_typedef),
+ .typedef_func_proto_sz = sizeof(func_proto_typedef),
+ .typedef_arr_sz = sizeof(arr_typedef),
+ }),
+ TYPE_BASED_CASE(type_based___all_missing, {
+ /* all zeros */
+ }),
+ TYPE_BASED_CASE(type_based___diff_sz, {
+ .struct_exists = 1,
+ .union_exists = 1,
+ .enum_exists = 1,
+ .typedef_named_struct_exists = 1,
+ .typedef_anon_struct_exists = 1,
+ .typedef_struct_ptr_exists = 1,
+ .typedef_int_exists = 1,
+ .typedef_enum_exists = 1,
+ .typedef_void_ptr_exists = 1,
+ .typedef_func_proto_exists = 1,
+ .typedef_arr_exists = 1,
+ .struct_sz = sizeof(struct a_struct___diff_sz),
+ .union_sz = sizeof(union a_union___diff_sz),
+ .enum_sz = sizeof(enum an_enum___diff_sz),
+ .typedef_named_struct_sz = sizeof(named_struct_typedef___diff_sz),
+ .typedef_anon_struct_sz = sizeof(anon_struct_typedef___diff_sz),
+ .typedef_struct_ptr_sz = sizeof(struct_ptr_typedef___diff_sz),
+ .typedef_int_sz = sizeof(int_typedef___diff_sz),
+ .typedef_enum_sz = sizeof(enum_typedef___diff_sz),
+ .typedef_void_ptr_sz = sizeof(void_ptr_typedef___diff_sz),
+ .typedef_func_proto_sz = sizeof(func_proto_typedef___diff_sz),
+ .typedef_arr_sz = sizeof(arr_typedef___diff_sz),
+ }),
+ TYPE_BASED_CASE(type_based___incompat, {
+ .enum_exists = 1,
+ .enum_sz = sizeof(enum an_enum),
+ }),
+ TYPE_BASED_CASE(type_based___fn_wrong_args, {
+ .struct_exists = 1,
+ .struct_sz = sizeof(struct a_struct),
+ }),
+
+ /* BTF_TYPE_ID_LOCAL/BTF_TYPE_ID_TARGET tests */
+ TYPE_ID_CASE(type_id, setup_type_id_case_success),
+ TYPE_ID_CASE(type_id___missing_targets, setup_type_id_case_failure),
+
+ /* Enumerator value existence and value relocations */
+ ENUMVAL_CASE(enumval, {
+ .named_val1_exists = true,
+ .named_val2_exists = true,
+ .named_val3_exists = true,
+ .anon_val1_exists = true,
+ .anon_val2_exists = true,
+ .anon_val3_exists = true,
+ .named_val1 = 1,
+ .named_val2 = 2,
+ .anon_val1 = 0x10,
+ .anon_val2 = 0x20,
+ }),
+ ENUMVAL_CASE(enumval___diff, {
+ .named_val1_exists = true,
+ .named_val2_exists = true,
+ .named_val3_exists = true,
+ .anon_val1_exists = true,
+ .anon_val2_exists = true,
+ .anon_val3_exists = true,
+ .named_val1 = 101,
+ .named_val2 = 202,
+ .anon_val1 = 0x11,
+ .anon_val2 = 0x22,
+ }),
+ ENUMVAL_CASE(enumval___val3_missing, {
+ .named_val1_exists = true,
+ .named_val2_exists = true,
+ .named_val3_exists = false,
+ .anon_val1_exists = true,
+ .anon_val2_exists = true,
+ .anon_val3_exists = false,
+ .named_val1 = 111,
+ .named_val2 = 222,
+ .anon_val1 = 0x111,
+ .anon_val2 = 0x222,
+ }),
+ ENUMVAL_ERR_CASE(enumval___err_missing),
+};
+
+/*
+ * Layout of the memory-mapped "test_cor.bss" map through which
+ * test_core_reloc() exchanges data with the BPF program.
+ */
+struct data {
+ /* test case input, written by test_core_reloc() before attaching */
+ char in[256];
+ /* program output, compared against the test case's expected output */
+ char out[256];
+ /* when found set after the run, the subtest is reported as skipped */
+ bool skip;
+ /* pid/tgid value of the test process, written by test_core_reloc() */
+ uint64_t my_pid_tgid;
+};
+
+/* Round @sz up to the next multiple of the system page size. */
+static size_t roundup_page(size_t sz)
+{
+	const size_t page = (size_t)sysconf(_SC_PAGE_SIZE);
+	const size_t nr_pages = (sz + page - 1) / page;
+
+	return nr_pages * page;
+}
+
+/*
+ * Run every entry of test_cases[] as its own subtest.
+ *
+ * For each case: run the optional setup hook, open the BPF object, load it
+ * against the case's target BTF, map the object's "test_cor.bss" map, copy
+ * the input into it, attach the program to the sys_enter raw tracepoint,
+ * trigger it and compare the program's output with the expected bytes.
+ * Cases marked as failing are expected to be rejected at load time.
+ */
+void test_core_reloc(void)
+{
+	const size_t mmap_sz = roundup_page(sizeof(struct data));
+	struct bpf_object_load_attr load_attr = {};
+	struct core_reloc_test_case *test_case;
+	const char *tp_name, *probe_name;
+	int err, i, equal;
+	struct bpf_link *link = NULL;
+	struct bpf_map *data_map;
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	uint64_t my_pid_tgid;
+	struct data *data;
+	void *mmap_data = NULL;
+
+	my_pid_tgid = getpid() | ((uint64_t)syscall(SYS_gettid) << 32);
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		test_case = &test_cases[i];
+		if (!test__start_subtest(test_case->case_name))
+			continue;
+
+		if (test_case->setup) {
+			err = test_case->setup(test_case);
+			if (CHECK(err, "test_setup", "test #%d setup failed: %d\n", i, err))
+				continue;
+		}
+
+		obj = bpf_object__open_file(test_case->bpf_obj_file, NULL);
+		if (CHECK(IS_ERR(obj), "obj_open", "failed to open '%s': %ld\n",
+			  test_case->bpf_obj_file, PTR_ERR(obj)))
+			continue;
+
+		/* for typed raw tracepoints, NULL should be specified */
+		if (test_case->direct_raw_tp) {
+			probe_name = "tp_btf/sys_enter";
+			tp_name = NULL;
+		} else {
+			probe_name = "raw_tracepoint/sys_enter";
+			tp_name = "sys_enter";
+		}
+
+		prog = bpf_object__find_program_by_title(obj, probe_name);
+		if (CHECK(!prog, "find_probe",
+			  "prog '%s' not found\n", probe_name))
+			goto cleanup;
+
+		/* a missing target BTF file is a test setup problem, report it as such */
+		if (test_case->btf_src_file) {
+			err = access(test_case->btf_src_file, R_OK);
+			if (!ASSERT_OK(err, "btf_src_file"))
+				goto cleanup;
+		}
+
+		load_attr.obj = obj;
+		load_attr.log_level = 0;
+		load_attr.target_btf_path = test_case->btf_src_file;
+		err = bpf_object__load_xattr(&load_attr);
+		if (err) {
+			/* a load failure is an error only if it was not expected */
+			if (!test_case->fails)
+				ASSERT_OK(err, "obj_load");
+			goto cleanup;
+		}
+
+		data_map = bpf_object__find_map_by_name(obj, "test_cor.bss");
+		if (CHECK(!data_map, "find_data_map", "data map not found\n"))
+			goto cleanup;
+
+		mmap_data = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE,
+				 MAP_SHARED, bpf_map__fd(data_map), 0);
+		if (CHECK(mmap_data == MAP_FAILED, "mmap",
+			  ".bss mmap failed: %d\n", errno)) {
+			mmap_data = NULL;
+			goto cleanup;
+		}
+		data = mmap_data;
+
+		memset(mmap_data, 0, sizeof(*data));
+		/*
+		 * Cases without input leave .input NULL; memcpy() must not be
+		 * given a NULL source even for a zero length.
+		 */
+		if (test_case->input_len)
+			memcpy(data->in, test_case->input, test_case->input_len);
+		data->my_pid_tgid = my_pid_tgid;
+
+		link = bpf_program__attach_raw_tracepoint(prog, tp_name);
+		if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n",
+			  PTR_ERR(link)))
+			goto cleanup;
+
+		/* trigger test run */
+		usleep(1);
+
+		if (data->skip) {
+			test__skip();
+			goto cleanup;
+		}
+
+		/* getting this far means the load succeeded; it must not have been expected to fail */
+		if (!ASSERT_EQ(test_case->fails, false, "obj_load_should_fail"))
+			goto cleanup;
+
+		equal = memcmp(data->out, test_case->output,
+			       test_case->output_len) == 0;
+		if (CHECK(!equal, "check_result",
+			  "input/output data don't match\n")) {
+			int j;
+
+			for (j = 0; j < test_case->input_len; j++) {
+				printf("input byte #%d: 0x%02hhx\n",
+				       j, test_case->input[j]);
+			}
+			for (j = 0; j < test_case->output_len; j++) {
+				printf("output byte #%d: EXP 0x%02hhx GOT 0x%02hhx\n",
+				       j, test_case->output[j], data->out[j]);
+			}
+			goto cleanup;
+		}
+
+cleanup:
+		if (mmap_data) {
+			CHECK_FAIL(munmap(mmap_data, mmap_sz));
+			mmap_data = NULL;
+		}
+		if (!IS_ERR_OR_NULL(link))
+			bpf_link__destroy(link);
+		/* also drops a stale error pointer left by a failed attach */
+		link = NULL;
+		bpf_object__close(obj);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/core_retro.c b/tools/testing/selftests/bpf/prog_tests/core_retro.c
new file mode 100644
index 000000000..6acb0e94d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/core_retro.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include "test_core_retro.skel.h"
+
+/*
+ * Load the test_core_retro skeleton, store this process's PID at key 0 of
+ * exp_tgid_map, attach the skeleton, trigger it with usleep(), and verify
+ * that key 0 of the results map holds the same PID.
+ */
+void test_core_retro(void)
+{
+ int err, zero = 0, res, duration = 0, my_pid = getpid();
+ struct test_core_retro *skel;
+
+ /* load program */
+ skel = test_core_retro__open_and_load();
+ /* on failure skel is NULL and is still passed to __destroy() below */
+ if (CHECK(!skel, "skel_load", "skeleton open/load failed\n"))
+ goto out_close;
+
+ /* tell the BPF side which PID it should react to */
+ err = bpf_map_update_elem(bpf_map__fd(skel->maps.exp_tgid_map), &zero, &my_pid, 0);
+ if (CHECK(err, "map_update", "failed to set expected PID: %d\n", errno))
+ goto out_close;
+
+ /* attach probe */
+ err = test_core_retro__attach(skel);
+ if (CHECK(err, "attach_kprobe", "err %d\n", err))
+ goto out_close;
+
+ /* trigger */
+ usleep(1);
+
+ /* read back what the BPF program recorded */
+ err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.results), &zero, &res);
+ if (CHECK(err, "map_lookup", "failed to lookup result: %d\n", errno))
+ goto out_close;
+
+ CHECK(res != my_pid, "pid_check", "got %d != exp %d\n", res, my_pid);
+
+out_close:
+ test_core_retro__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/cpu_mask.c b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c
new file mode 100644
index 000000000..f7c7e2523
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cpu_mask.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <bpf/btf.h>
+#include "bpf/libbpf_internal.h"
+
+static int duration = 0;
+
+/*
+ * Compare a parsed CPU mask against its expected textual form.
+ *
+ * @case_nr: 1-based test case number, used only in messages
+ * @exp:     string of '1' / non-'1' characters, one per CPU index
+ * @mask:    parsed mask, one bool per CPU
+ * @n:       number of valid entries in @mask
+ *
+ * Stops early if @mask is too short to hold a CPU that @exp marks as set.
+ */
+static void validate_mask(int case_nr, const char *exp, bool *mask, int n)
+{
+	int cpu;
+
+	for (cpu = 0; exp[cpu] != '\0'; cpu++) {
+		/* CPU expected to be clear: only inspect mask[] within bounds */
+		if (exp[cpu] != '1') {
+			CHECK(cpu < n && mask[cpu], "cpu_set",
+			      "case #%d: mask differs, expected cpu#%d UNSET\n",
+			      case_nr, cpu);
+			continue;
+		}
+
+		/* CPU expected to be set: the mask must be long enough */
+		if (CHECK(cpu >= n, "mask_short",
+			  "case #%d: mask too short, got n=%d, need at least %d\n",
+			  case_nr, n, cpu + 1))
+			return;
+
+		CHECK(!mask[cpu], "cpu_not_set",
+		      "case #%d: mask differs, expected cpu#%d SET\n",
+		      case_nr, cpu);
+	}
+
+	/* cpu now equals strlen(exp); the mask must not extend past it */
+	CHECK(cpu < n, "mask_long",
+	      "case #%d: mask too long, got n=%d, expected at most %d\n",
+	      case_nr, n, cpu);
+}
+
+/*
+ * Table of CPU mask strings fed to parse_cpu_mask_str() by test_cpu_mask().
+ * cpu_mask: input string
+ * expect:   expected mask, one character per CPU ('1' = set), checked by
+ *           validate_mask(); unused (empty) for failure cases
+ * fails:    true if parsing is expected to return an error
+ */
+static struct {
+ const char *cpu_mask;
+ const char *expect;
+ bool fails;
+} test_cases[] = {
+ { "0\n", "1", false },
+ { "0,2\n", "101", false },
+ { "0-2\n", "111", false },
+ { "0-2,3-4\n", "11111", false },
+ { "0", "1", false },
+ { "0-2", "111", false },
+ { "0,2", "101", false },
+ { "0,1-3", "1111", false },
+ { "0,1,2,3", "1111", false },
+ { "0,2-3,5", "101101", false },
+ { "3-3", "0001", false },
+ { "2-4,6,9-10", "00111010011", false },
+ /* failure cases */
+ { "", "", true },
+ { "0-", "", true },
+ { "0 ", "", true },
+ { "0_1", "", true },
+ { "1-0", "", true },
+ { "-1", "", true },
+};
+
+/*
+ * Run parse_cpu_mask_str() over every entry of test_cases[].
+ *
+ * Entries marked 'fails' must produce an error; all others must parse and
+ * match their expected mask. Case numbers in messages are 1-based.
+ */
+void test_cpu_mask(void)
+{
+	int i, err, n;
+	bool *mask;
+
+	for (i = 0; i < ARRAY_SIZE(test_cases); i++) {
+		mask = NULL;
+		err = parse_cpu_mask_str(test_cases[i].cpu_mask, &mask, &n);
+		if (test_cases[i].fails) {
+			CHECK(!err, "should_fail",
+			      "case #%d: parsing should fail!\n", i + 1);
+		} else if (!CHECK(err, "parse_err",
+				  "case #%d: cpu mask parsing failed: %d\n",
+				  i + 1, err)) {
+			validate_mask(i + 1, test_cases[i].expect, mask, n);
+		}
+		/*
+		 * Release the mask on every path, including an unexpected
+		 * parse error, instead of skipping it via 'continue'.
+		 */
+		free(mask);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/d_path.c b/tools/testing/selftests/bpf/prog_tests/d_path.c
new file mode 100644
index 000000000..0a577a248
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/d_path.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include <sys/stat.h>
+#include <linux/sched.h>
+#include <sys/syscall.h>
+
+#define MAX_PATH_LEN 128
+#define MAX_FILES 7
+
+#include "test_d_path.skel.h"
+
+static int duration;
+
+/*
+ * Reference paths collected in user space by set_pathname(): cnt is the
+ * number of slots handed out so far, paths[] holds the readlink() results
+ * that test_d_path() compares against the BPF-recorded paths.
+ */
+static struct {
+ __u32 cnt;
+ char paths[MAX_FILES][MAX_PATH_LEN];
+} src;
+
+/*
+ * Resolve /proc/<pid>/fd/<fd> and store the link target in the next free
+ * slot of src.paths.
+ *
+ * Returns the number of bytes placed in the slot, or a negative value if
+ * readlink() fails or all MAX_FILES slots are already used.
+ */
+static int set_pathname(int fd, pid_t pid)
+{
+	char buf[MAX_PATH_LEN];
+	ssize_t len;
+	char *dst;
+
+	/* never index past the end of src.paths */
+	if (src.cnt >= MAX_FILES)
+		return -1;
+	dst = src.paths[src.cnt++];
+
+	snprintf(buf, MAX_PATH_LEN, "/proc/%d/fd/%d", pid, fd);
+
+	/*
+	 * readlink() does not NUL-terminate; keep one byte in reserve and
+	 * terminate explicitly so the slot is always a valid C string.
+	 */
+	len = readlink(buf, dst, MAX_PATH_LEN - 1);
+	if (len >= 0)
+		dst[len] = '\0';
+	return len;
+}
+
+/*
+ * Open a fixed set of file descriptors (pipe ends, socket, /proc file,
+ * /dev/urandom, a deleted /tmp file, the /tmp directory), record each one's
+ * /proc/<pid>/fd link target via set_pathname(), then fstat() and close()
+ * every descriptor.
+ *
+ * Returns a negative value on failure; on success the value is whatever the
+ * last set_pathname() call returned. The caller only tests for < 0.
+ */
+static int trigger_fstat_events(pid_t pid)
+{
+ int sockfd = -1, procfd = -1, devfd = -1;
+ int localfd = -1, indicatorfd = -1;
+ int pipefd[2] = { -1, -1 };
+ struct stat fileStat;
+ int ret = -1;
+
+ /* unmountable pseudo-filesystems */
+ if (CHECK(pipe(pipefd) < 0, "trigger", "pipe failed\n"))
+ return ret;
+ /* unmountable pseudo-filesystems */
+ sockfd = socket(AF_INET, SOCK_STREAM, 0);
+ if (CHECK(sockfd < 0, "trigger", "socket failed\n"))
+ goto out_close;
+ /* mountable pseudo-filesystems */
+ procfd = open("/proc/self/comm", O_RDONLY);
+ if (CHECK(procfd < 0, "trigger", "open /proc/self/comm failed\n"))
+ goto out_close;
+ devfd = open("/dev/urandom", O_RDONLY);
+ if (CHECK(devfd < 0, "trigger", "open /dev/urandom failed\n"))
+ goto out_close;
+ localfd = open("/tmp/d_path_loadgen.txt", O_CREAT | O_RDONLY, 0644);
+ if (CHECK(localfd < 0, "trigger", "open /tmp/d_path_loadgen.txt failed\n"))
+ goto out_close;
+ /* bpf_d_path will return path with (deleted) */
+ remove("/tmp/d_path_loadgen.txt");
+ indicatorfd = open("/tmp/", O_PATH);
+ if (CHECK(indicatorfd < 0, "trigger", "open /tmp/ failed\n"))
+ goto out_close;
+
+ /* record reference paths; the order here fixes the src.paths[] indices */
+ ret = set_pathname(pipefd[0], pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for pipe[0]\n"))
+ goto out_close;
+ ret = set_pathname(pipefd[1], pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for pipe[1]\n"))
+ goto out_close;
+ ret = set_pathname(sockfd, pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for socket\n"))
+ goto out_close;
+ ret = set_pathname(procfd, pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for proc\n"))
+ goto out_close;
+ ret = set_pathname(devfd, pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for dev\n"))
+ goto out_close;
+ ret = set_pathname(localfd, pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for file\n"))
+ goto out_close;
+ ret = set_pathname(indicatorfd, pid);
+ if (CHECK(ret < 0, "trigger", "set_pathname failed for dir\n"))
+ goto out_close;
+
+ /* triggers vfs_getattr */
+ fstat(pipefd[0], &fileStat);
+ fstat(pipefd[1], &fileStat);
+ fstat(sockfd, &fileStat);
+ fstat(procfd, &fileStat);
+ fstat(devfd, &fileStat);
+ fstat(localfd, &fileStat);
+ fstat(indicatorfd, &fileStat);
+
+out_close:
+ /* triggers filp_close */
+ /* descriptors that were never opened are still -1 when closed here */
+ close(pipefd[0]);
+ close(pipefd[1]);
+ close(sockfd);
+ close(procfd);
+ close(devfd);
+ close(localfd);
+ close(indicatorfd);
+ return ret;
+}
+
+/*
+ * Attach the test_d_path skeleton, generate fstat()/close() events through
+ * trigger_fstat_events(), and verify for each of the MAX_FILES descriptors
+ * that the paths recorded by the stat and close hooks match the reference
+ * path in src.paths[] and that each recorded return value equals the
+ * recorded path's length plus the terminating NUL.
+ */
+void test_d_path(void)
+{
+	struct test_d_path__bss *bss;
+	struct test_d_path *skel;
+	int err, i;
+
+	skel = test_d_path__open_and_load();
+	if (CHECK(!skel, "setup", "d_path skeleton failed\n"))
+		goto cleanup;
+
+	err = test_d_path__attach(skel);
+	if (CHECK(err, "setup", "attach failed: %d\n", err))
+		goto cleanup;
+
+	bss = skel->bss;
+	bss->my_pid = getpid();
+
+	err = trigger_fstat_events(bss->my_pid);
+	if (err < 0)
+		goto cleanup;
+
+	if (CHECK(!bss->called_stat,
+		  "stat",
+		  "trampoline for security_inode_getattr was not called\n"))
+		goto cleanup;
+
+	if (CHECK(!bss->called_close,
+		  "close",
+		  "trampoline for filp_close was not called\n"))
+		goto cleanup;
+
+	for (i = 0; i < MAX_FILES; i++) {
+		CHECK(strncmp(src.paths[i], bss->paths_stat[i], MAX_PATH_LEN),
+		      "check",
+		      "failed to get stat path[%d]: %s vs %s\n",
+		      i, src.paths[i], bss->paths_stat[i]);
+		CHECK(strncmp(src.paths[i], bss->paths_close[i], MAX_PATH_LEN),
+		      "check",
+		      "failed to get close path[%d]: %s vs %s\n",
+		      i, src.paths[i], bss->paths_close[i]);
+		/* The d_path helper returns size plus NUL char, hence + 1 */
+		CHECK(bss->rets_stat[i] != strlen(bss->paths_stat[i]) + 1,
+		      "check",
+		      "failed to match stat return [%d]: %d vs %zd [%s]\n",
+		      i, bss->rets_stat[i], strlen(bss->paths_stat[i]) + 1,
+		      bss->paths_stat[i]);
+		/*
+		 * The close hook's return value must be checked against the
+		 * close hook's own path, not the stat one.
+		 */
+		CHECK(bss->rets_close[i] != strlen(bss->paths_close[i]) + 1,
+		      "check",
+		      "failed to match close return [%d]: %d vs %zd [%s]\n",
+		      i, bss->rets_close[i], strlen(bss->paths_close[i]) + 1,
+		      bss->paths_close[i]);
+	}
+
+cleanup:
+	test_d_path__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/enable_stats.c b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
new file mode 100644
index 000000000..2cb208591
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/enable_stats.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_enable_stats.skel.h"
+
+/*
+ * Enable BPF run-time statistics, attach and detach the test_enable_stats
+ * skeleton, then query bpf_prog_info for the program and check that
+ * run_time_ns is non-zero and run_cnt equals the counter kept in the
+ * skeleton's bss.
+ */
+void test_enable_stats(void)
+{
+ struct test_enable_stats *skel;
+ int stats_fd, err, prog_fd;
+ struct bpf_prog_info info;
+ __u32 info_len = sizeof(info);
+ int duration = 0;
+
+ skel = test_enable_stats__open_and_load();
+ if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n"))
+ return;
+
+ stats_fd = bpf_enable_stats(BPF_STATS_RUN_TIME);
+ /* no fd to close on this path, so clean up the skeleton and bail out */
+ if (CHECK(stats_fd < 0, "get_stats_fd", "failed %d\n", errno)) {
+ test_enable_stats__destroy(skel);
+ return;
+ }
+
+ err = test_enable_stats__attach(skel);
+ if (CHECK(err, "attach_raw_tp", "err %d\n", err))
+ goto cleanup;
+
+ /* detach before reading the stats so the counters are read after the program is detached */
+ test_enable_stats__detach(skel);
+
+ prog_fd = bpf_program__fd(skel->progs.test_enable_stats);
+ memset(&info, 0, info_len);
+ err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
+ if (CHECK(err, "get_prog_info",
+ "failed to get bpf_prog_info for fd %d\n", prog_fd))
+ goto cleanup;
+ if (CHECK(info.run_time_ns == 0, "check_stats_enabled",
+ "failed to enable run_time_ns stats\n"))
+ goto cleanup;
+
+ CHECK(info.run_cnt != skel->bss->count, "check_run_cnt_valid",
+ "invalid run_cnt stats\n");
+
+cleanup:
+ test_enable_stats__destroy(skel);
+ close(stats_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/endian.c b/tools/testing/selftests/bpf/prog_tests/endian.c
new file mode 100644
index 000000000..1a11612ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/endian.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include "test_endian.skel.h"
+
+static int duration;
+
+#define IN16 0x1234
+#define IN32 0x12345678U
+#define IN64 0x123456789abcdef0ULL
+
+#define OUT16 0x3412
+#define OUT32 0x78563412U
+#define OUT64 0xf0debc9a78563412ULL
+
+/*
+ * Feed IN16/IN32/IN64 to the test_endian BPF program through its bss and
+ * verify that both the computed outputs (out*) and the constant results
+ * (const*) equal the byte-swapped OUT16/OUT32/OUT64 values.
+ */
+void test_endian(void)
+{
+	struct test_endian *skel;
+	struct test_endian__bss *bss;
+	int err;
+
+	skel = test_endian__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+	bss = skel->bss;
+
+	bss->in16 = IN16;
+	bss->in32 = IN32;
+	bss->in64 = IN64;
+
+	err = test_endian__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+		goto cleanup;
+
+	/* trigger the attached program */
+	usleep(1);
+
+	CHECK(bss->out16 != OUT16, "out16", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->out16, (__u64)OUT16);
+	CHECK(bss->out32 != OUT32, "out32", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->out32, (__u64)OUT32);
+	/* tag matches the field under test so failures are attributable */
+	CHECK(bss->out64 != OUT64, "out64", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->out64, (__u64)OUT64);
+
+	CHECK(bss->const16 != OUT16, "const16", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->const16, (__u64)OUT16);
+	CHECK(bss->const32 != OUT32, "const32", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->const32, (__u64)OUT32);
+	CHECK(bss->const64 != OUT64, "const64", "got 0x%llx != exp 0x%llx\n",
+	      (__u64)bss->const64, (__u64)OUT64);
+cleanup:
+	test_endian__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
new file mode 100644
index 000000000..109d0345a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fentry_fexit.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include "fentry_test.skel.h"
+#include "fexit_test.skel.h"
+
+/*
+ * Load and attach both the fentry_test and fexit_test skeletons, run the
+ * fexit skeleton's test1 program once via bpf_prog_test_run(), and check
+ * that the first eight __u64 slots of each skeleton's bss equal 1.
+ */
+void test_fentry_fexit(void)
+{
+ struct fentry_test *fentry_skel = NULL;
+ struct fexit_test *fexit_skel = NULL;
+ __u64 *fentry_res, *fexit_res;
+ __u32 duration = 0, retval;
+ int err, prog_fd, i;
+
+ fentry_skel = fentry_test__open_and_load();
+ if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n"))
+ goto close_prog;
+ fexit_skel = fexit_test__open_and_load();
+ if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n"))
+ goto close_prog;
+
+ err = fentry_test__attach(fentry_skel);
+ if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err))
+ goto close_prog;
+ err = fexit_test__attach(fexit_skel);
+ if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err))
+ goto close_prog;
+
+ prog_fd = bpf_program__fd(fexit_skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ /* NOTE(review): the "ipv6" tag looks copied from another test; no packet is passed here */
+ CHECK(err || retval, "ipv6",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+
+ /* treat each bss as a flat array of __u64 results -- assumes the bss holds only such fields; TODO confirm */
+ fentry_res = (__u64 *)fentry_skel->bss;
+ fexit_res = (__u64 *)fexit_skel->bss;
+ /* NOTE(review): unconditional print, presumably leftover debug output */
+ printf("%lld\n", fentry_skel->bss->test1_result);
+ for (i = 0; i < 8; i++) {
+ CHECK(fentry_res[i] != 1, "result",
+ "fentry_test%d failed err %lld\n", i + 1, fentry_res[i]);
+ CHECK(fexit_res[i] != 1, "result",
+ "fexit_test%d failed err %lld\n", i + 1, fexit_res[i]);
+ }
+
+close_prog:
+ /* either skeleton pointer may still be NULL when reaching this label */
+ fentry_test__destroy(fentry_skel);
+ fexit_test__destroy(fexit_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/fentry_test.c b/tools/testing/selftests/bpf/prog_tests/fentry_test.c
new file mode 100644
index 000000000..04ebbf1cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fentry_test.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include "fentry_test.skel.h"
+
+/*
+ * Load and attach the fentry_test skeleton, run its test1 program once via
+ * bpf_prog_test_run(), and check that the first six __u64 slots of the
+ * skeleton's bss equal 1, stopping at the first mismatch.
+ */
+void test_fentry_test(void)
+{
+ struct fentry_test *fentry_skel = NULL;
+ int err, prog_fd, i;
+ __u32 duration = 0, retval;
+ __u64 *result;
+
+ fentry_skel = fentry_test__open_and_load();
+ if (CHECK(!fentry_skel, "fentry_skel_load", "fentry skeleton failed\n"))
+ goto cleanup;
+
+ err = fentry_test__attach(fentry_skel);
+ if (CHECK(err, "fentry_attach", "fentry attach failed: %d\n", err))
+ goto cleanup;
+
+ prog_fd = bpf_program__fd(fentry_skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ CHECK(err || retval, "test_run",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+
+ /* view the bss as a flat array of __u64 results -- assumes that layout; TODO confirm */
+ result = (__u64 *)fentry_skel->bss;
+ for (i = 0; i < 6; i++) {
+ if (CHECK(result[i] != 1, "result",
+ "fentry_test%d failed err %lld\n", i + 1, result[i]))
+ goto cleanup;
+ }
+
+cleanup:
+ fentry_test__destroy(fentry_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
new file mode 100644
index 000000000..5c0448910
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_bpf2bpf.c
@@ -0,0 +1,373 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <bpf/btf.h>
+
+typedef int (*test_cb)(struct bpf_object *obj);
+
+/*
+ * Verify the per-program result slots stored in the object's internal
+ * (global data) map.
+ *
+ * @obj:      loaded BPF object whose first internal map holds the results
+ * @prog_cnt: number of leading __u64 slots that must each equal 1
+ * @reset:    if true, write the checked slots back as 0 so the map can be
+ *            reused for another run
+ *
+ * Returns 0 if every slot matched (and the optional reset succeeded),
+ * a negative value otherwise.
+ */
+static int check_data_map(struct bpf_object *obj, int prog_cnt, bool reset)
+{
+	struct bpf_map *data_map = NULL, *map;
+	__u64 *result = NULL;
+	const int zero = 0;
+	__u32 duration = 0;
+	int ret = -1, i;
+
+	result = malloc((prog_cnt + 32 /* spare */) * sizeof(__u64));
+	if (CHECK(!result, "alloc_memory", "failed to alloc memory\n"))
+		return -ENOMEM;
+
+	/* pick the first internal map of the object */
+	bpf_object__for_each_map(map, obj)
+		if (bpf_map__is_internal(map)) {
+			data_map = map;
+			break;
+		}
+	if (CHECK(!data_map, "find_data_map", "data map not found\n"))
+		goto out;
+
+	ret = bpf_map_lookup_elem(bpf_map__fd(data_map), &zero, result);
+	if (CHECK(ret, "get_result",
+		  "failed to get output data: %d\n", ret))
+		goto out;
+
+	for (i = 0; i < prog_cnt; i++) {
+		if (CHECK(result[i] != 1, "result",
+			  "fexit_bpf2bpf result[%d] failed err %llu\n",
+			  i, result[i])) {
+			/*
+			 * ret is 0 here from the successful lookup; report
+			 * the mismatch to the caller explicitly.
+			 */
+			ret = -EINVAL;
+			goto out;
+		}
+		/* clear the slot in the local copy for the optional reset */
+		result[i] = 0;
+	}
+	if (reset) {
+		ret = bpf_map_update_elem(bpf_map__fd(data_map), &zero, result, 0);
+		if (CHECK(ret, "reset_result", "failed to reset result\n"))
+			goto out;
+	}
+
+	ret = 0;
+out:
+	free(result);
+	return ret;
+}
+
+/*
+ * Shared driver for the fexit/freplace sub-tests.
+ *
+ * @obj_file:        object containing the tracing/extension programs
+ * @target_obj_file: object containing the program they attach to
+ * @prog_cnt:        number of entries in @prog_name
+ * @prog_name:       section titles of the programs to attach
+ * @run_prog:        if true, run the target on an IPv6 test packet and
+ *                   verify the result slots with check_data_map()
+ * @cb:              optional hook invoked after all programs are attached;
+ *                   a non-zero return aborts the test
+ */
+static void test_fexit_bpf2bpf_common(const char *obj_file,
+				      const char *target_obj_file,
+				      int prog_cnt,
+				      const char **prog_name,
+				      bool run_prog,
+				      test_cb cb)
+{
+	struct bpf_object *obj = NULL, *tgt_obj;
+	struct bpf_program **prog = NULL;
+	struct bpf_link **link = NULL;
+	__u32 duration = 0, retval;
+	int err, tgt_fd, i;
+
+	err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
+			    &tgt_obj, &tgt_fd);
+	if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n",
+		  target_obj_file, err, errno))
+		return;
+	/* declared here because it needs the target fd obtained above */
+	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
+			    .attach_prog_fd = tgt_fd,
+			   );
+
+	link = calloc(prog_cnt, sizeof(*link));
+	prog = calloc(prog_cnt, sizeof(*prog));
+	if (CHECK(!link || !prog, "alloc_memory", "failed to alloc memory"))
+		goto close_prog;
+
+	obj = bpf_object__open_file(obj_file, &opts);
+	if (CHECK(IS_ERR_OR_NULL(obj), "obj_open",
+		  "failed to open %s: %ld\n", obj_file,
+		  PTR_ERR(obj)))
+		goto close_prog;
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "obj_load", "err %d\n", err))
+		goto close_prog;
+
+	for (i = 0; i < prog_cnt; i++) {
+		prog[i] = bpf_object__find_program_by_title(obj, prog_name[i]);
+		if (CHECK(!prog[i], "find_prog", "prog %s not found\n", prog_name[i]))
+			goto close_prog;
+		link[i] = bpf_program__attach_trace(prog[i]);
+		if (CHECK(IS_ERR(link[i]), "attach_trace", "failed to link\n"))
+			goto close_prog;
+	}
+
+	if (cb) {
+		err = cb(obj);
+		if (err)
+			goto close_prog;
+	}
+
+	if (!run_prog)
+		goto close_prog;
+
+	err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6),
+				NULL, NULL, &retval, &duration);
+	CHECK(err || retval, "ipv6",
+	      "err %d errno %d retval %d duration %d\n",
+	      err, errno, retval, duration);
+
+	/* failures are recorded by CHECK() inside; nothing more to do here */
+	check_data_map(obj, prog_cnt, false);
+
+close_prog:
+	/* link is NULL if its allocation failed; do not index it then */
+	if (link) {
+		for (i = 0; i < prog_cnt; i++) {
+			if (!IS_ERR_OR_NULL(link[i]))
+				bpf_link__destroy(link[i]);
+		}
+	}
+	if (!IS_ERR_OR_NULL(obj))
+		bpf_object__close(obj);
+	bpf_object__close(tgt_obj);
+	free(link);
+	free(prog);
+}
+
+/* Attach one fexit program to test_pkt_md_access.o and run the target. */
+static void test_target_no_callees(void)
+{
+ const char *prog_name[] = {
+ "fexit/test_pkt_md_access",
+ };
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf_simple.o",
+ "./test_pkt_md_access.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, true, NULL);
+}
+
+/*
+ * Attach fexit programs to test_pkt_access and three of its subprograms,
+ * then run the target.
+ */
+static void test_target_yes_callees(void)
+{
+ const char *prog_name[] = {
+ "fexit/test_pkt_access",
+ "fexit/test_pkt_access_subprog1",
+ "fexit/test_pkt_access_subprog2",
+ "fexit/test_pkt_access_subprog3",
+ };
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o",
+ "./test_pkt_access.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, true, NULL);
+}
+
+/*
+ * Attach the four fexit programs plus four freplace programs from
+ * fexit_bpf2bpf.o to test_pkt_access.o, then run the target.
+ */
+static void test_func_replace(void)
+{
+ const char *prog_name[] = {
+ "fexit/test_pkt_access",
+ "fexit/test_pkt_access_subprog1",
+ "fexit/test_pkt_access_subprog2",
+ "fexit/test_pkt_access_subprog3",
+ "freplace/get_skb_len",
+ "freplace/get_skb_ifindex",
+ "freplace/get_constant",
+ "freplace/test_pkt_write_access_subprog",
+ };
+ test_fexit_bpf2bpf_common("./fexit_bpf2bpf.o",
+ "./test_pkt_access.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, true, NULL);
+}
+
+/*
+ * Load and attach freplace/do_bind against connect4_prog.o; the target is
+ * not run (run_prog is false), so only load and attach are exercised.
+ */
+static void test_func_replace_verify(void)
+{
+ const char *prog_name[] = {
+ "freplace/do_bind",
+ };
+ test_fexit_bpf2bpf_common("./freplace_connect4.o",
+ "./connect4_prog.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, false, NULL);
+}
+
+/*
+ * Callback for test_func_replace_multi(): attach the already-attached
+ * freplace/get_constant program from @obj a second time, to a freshly
+ * loaded copy of test_pkt_access.o, run that copy on an IPv6 test packet,
+ * and verify and reset the result slot.
+ *
+ * Returns 0 on success and a non-zero value on any failure, so the caller
+ * can abort the rest of the test.
+ */
+static int test_second_attach(struct bpf_object *obj)
+{
+	const char *prog_name = "freplace/get_constant";
+	const char *tgt_name = prog_name + 9; /* cut off freplace/ */
+	const char *tgt_obj_file = "./test_pkt_access.o";
+	struct bpf_program *prog = NULL;
+	struct bpf_object *tgt_obj;
+	__u32 duration = 0, retval;
+	struct bpf_link *link;
+	int err = 0, tgt_fd;
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (CHECK(!prog, "find_prog", "prog %s not found\n", prog_name))
+		return -ENOENT;
+
+	err = bpf_prog_load(tgt_obj_file, BPF_PROG_TYPE_UNSPEC,
+			    &tgt_obj, &tgt_fd);
+	if (CHECK(err, "second_prog_load", "file %s err %d errno %d\n",
+		  tgt_obj_file, err, errno))
+		return err;
+
+	link = bpf_program__attach_freplace(prog, tgt_fd, tgt_name);
+	if (CHECK(IS_ERR(link), "second_link", "failed to attach second link prog_fd %d tgt_fd %d\n", bpf_program__fd(prog), tgt_fd)) {
+		/* err is still 0 from the load above; propagate the failure */
+		err = PTR_ERR(link);
+		link = NULL;
+		goto out;
+	}
+
+	err = bpf_prog_test_run(tgt_fd, 1, &pkt_v6, sizeof(pkt_v6),
+				NULL, NULL, &retval, &duration);
+	if (CHECK(err || retval, "ipv6",
+		  "err %d errno %d retval %d duration %d\n",
+		  err, errno, retval, duration)) {
+		/* a non-zero retval alone must also count as failure */
+		if (!err)
+			err = -EINVAL;
+		goto out;
+	}
+
+	err = check_data_map(obj, 1, true);
+
+out:
+	bpf_link__destroy(link);
+	bpf_object__close(tgt_obj);
+	return err;
+}
+
+/*
+ * Attach freplace/get_constant to test_pkt_access.o, then use
+ * test_second_attach() as the callback before running the target.
+ */
+static void test_func_replace_multi(void)
+{
+ const char *prog_name[] = {
+ "freplace/get_constant",
+ };
+ test_fexit_bpf2bpf_common("./freplace_get_constant.o",
+ "./test_pkt_access.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, true, test_second_attach);
+}
+
+/*
+ * Load test_pkt_access.o, attach the freplace program from
+ * freplace_get_constant.o to it, then open fmod_ret_freplace.o with the
+ * freplace program as its attach target and check that loading it fails.
+ */
+static void test_fmod_ret_freplace(void)
+{
+ struct bpf_object *freplace_obj = NULL, *pkt_obj, *fmod_obj = NULL;
+ const char *freplace_name = "./freplace_get_constant.o";
+ const char *fmod_ret_name = "./fmod_ret_freplace.o";
+ DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts);
+ const char *tgt_name = "./test_pkt_access.o";
+ struct bpf_link *freplace_link = NULL;
+ struct bpf_program *prog;
+ __u32 duration = 0;
+ int err, pkt_fd;
+
+ err = bpf_prog_load(tgt_name, BPF_PROG_TYPE_UNSPEC,
+ &pkt_obj, &pkt_fd);
+ /* the target prog should load fine */
+ if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n",
+ tgt_name, err, errno))
+ return;
+ opts.attach_prog_fd = pkt_fd;
+
+ freplace_obj = bpf_object__open_file(freplace_name, &opts);
+ if (CHECK(IS_ERR_OR_NULL(freplace_obj), "freplace_obj_open",
+ "failed to open %s: %ld\n", freplace_name,
+ PTR_ERR(freplace_obj)))
+ goto out;
+
+ err = bpf_object__load(freplace_obj);
+ if (CHECK(err, "freplace_obj_load", "err %d\n", err))
+ goto out;
+
+ /* use the first program in the freplace object */
+ prog = bpf_program__next(NULL, freplace_obj);
+ freplace_link = bpf_program__attach_trace(prog);
+ if (CHECK(IS_ERR(freplace_link), "freplace_attach_trace", "failed to link\n"))
+ goto out;
+
+ /* retarget the open options at the freplace program itself */
+ opts.attach_prog_fd = bpf_program__fd(prog);
+ fmod_obj = bpf_object__open_file(fmod_ret_name, &opts);
+ if (CHECK(IS_ERR_OR_NULL(fmod_obj), "fmod_obj_open",
+ "failed to open %s: %ld\n", fmod_ret_name,
+ PTR_ERR(fmod_obj)))
+ goto out;
+
+ /* success here is the failure condition: the load is expected to be rejected */
+ err = bpf_object__load(fmod_obj);
+ if (CHECK(!err, "fmod_obj_load", "loading fmod_ret should fail\n"))
+ goto out;
+
+out:
+ /* NOTE(review): these may receive NULL or error pointers on early exits -- presumably tolerated by libbpf; confirm */
+ bpf_link__destroy(freplace_link);
+ bpf_object__close(freplace_obj);
+ bpf_object__close(fmod_obj);
+ bpf_object__close(pkt_obj);
+}
+
+
+/*
+ * Load and attach freplace/cls_redirect against test_cls_redirect.o; the
+ * target is not run (run_prog is false).
+ */
+static void test_func_sockmap_update(void)
+{
+ const char *prog_name[] = {
+ "freplace/cls_redirect",
+ };
+ test_fexit_bpf2bpf_common("./freplace_cls_redirect.o",
+ "./test_cls_redirect.o",
+ ARRAY_SIZE(prog_name),
+ prog_name, false, NULL);
+}
+
+/*
+ * Load @target_obj_file (which must succeed), open @obj_file with the
+ * target as its attach program, and check that loading @obj_file fails.
+ */
+static void test_obj_load_failure_common(const char *obj_file,
+ const char *target_obj_file)
+
+{
+ /*
+ * standalone test that asserts failure to load freplace prog
+ * because of invalid return code.
+ */
+ struct bpf_object *obj = NULL, *pkt_obj;
+ int err, pkt_fd;
+ __u32 duration = 0;
+
+ err = bpf_prog_load(target_obj_file, BPF_PROG_TYPE_UNSPEC,
+ &pkt_obj, &pkt_fd);
+ /* the target prog should load fine */
+ if (CHECK(err, "tgt_prog_load", "file %s err %d errno %d\n",
+ target_obj_file, err, errno))
+ return;
+ /* declared here because it needs the target fd obtained above */
+ DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
+ .attach_prog_fd = pkt_fd,
+ );
+
+ obj = bpf_object__open_file(obj_file, &opts);
+ if (CHECK(IS_ERR_OR_NULL(obj), "obj_open",
+ "failed to open %s: %ld\n", obj_file,
+ PTR_ERR(obj)))
+ goto close_prog;
+
+ /* It should fail to load the program */
+ err = bpf_object__load(obj);
+ if (CHECK(!err, "bpf_obj_load should fail", "err %d\n", err))
+ goto close_prog;
+
+close_prog:
+ if (!IS_ERR_OR_NULL(obj))
+ bpf_object__close(obj);
+ bpf_object__close(pkt_obj);
+}
+
+/* Expect freplace_connect_v4_prog.o to fail loading against connect4_prog.o. */
+static void test_func_replace_return_code(void)
+{
+ /* test invalid return code in the replaced program */
+ test_obj_load_failure_common("./freplace_connect_v4_prog.o",
+ "./connect4_prog.o");
+}
+
+/* Expect freplace_attach_probe.o to fail loading against test_attach_probe.o. */
+static void test_func_map_prog_compatibility(void)
+{
+ /* test with spin lock map value in the replaced program */
+ test_obj_load_failure_common("./freplace_attach_probe.o",
+ "./test_attach_probe.o");
+}
+
+/*
+ * Test entry point: run each fexit/freplace scenario as a named subtest.
+ * A scenario runs only when test__start_subtest() returns non-zero for it.
+ */
+void test_fexit_bpf2bpf(void)
+{
+ if (test__start_subtest("target_no_callees"))
+ test_target_no_callees();
+ if (test__start_subtest("target_yes_callees"))
+ test_target_yes_callees();
+ if (test__start_subtest("func_replace"))
+ test_func_replace();
+ if (test__start_subtest("func_replace_verify"))
+ test_func_replace_verify();
+ if (test__start_subtest("func_sockmap_update"))
+ test_func_sockmap_update();
+ if (test__start_subtest("func_replace_return_code"))
+ test_func_replace_return_code();
+ if (test__start_subtest("func_map_prog_compatibility"))
+ test_func_map_prog_compatibility();
+ if (test__start_subtest("func_replace_multi"))
+ test_func_replace_multi();
+ if (test__start_subtest("fmod_ret_freplace"))
+ test_fmod_ret_freplace();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
new file mode 100644
index 000000000..3b9dbf743
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+
+/* x86-64 fits 55 JITed and 43 interpreted progs into half page */
+#define CNT 40
+
+/*
+ * Load CNT trivial fexit programs targeting bpf_fentry_test1, attach each
+ * one, then load a trivial socket filter and run it once on a zeroed
+ * 128-byte buffer. All descriptors opened along the way are closed before
+ * returning.
+ */
+void test_fexit_stress(void)
+{
+	char test_skb[128] = {};
+	/* 0 marks "never opened" in both fd arrays */
+	int fexit_fd[CNT] = {};
+	int link_fd[CNT] = {};
+	__u32 duration = 0;
+	char error[4096];
+	__u32 prog_ret;
+	int err, i, filter_fd;
+
+	const struct bpf_insn trace_program[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+
+	struct bpf_load_program_attr load_attr = {
+		.prog_type = BPF_PROG_TYPE_TRACING,
+		.license = "GPL",
+		.insns = trace_program,
+		.insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+		.expected_attach_type = BPF_TRACE_FEXIT,
+	};
+
+	const struct bpf_insn skb_program[] = {
+		BPF_MOV64_IMM(BPF_REG_0, 0),
+		BPF_EXIT_INSN(),
+	};
+
+	struct bpf_load_program_attr skb_load_attr = {
+		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+		.license = "GPL",
+		.insns = skb_program,
+		.insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
+	};
+
+	err = libbpf_find_vmlinux_btf_id("bpf_fentry_test1",
+					 load_attr.expected_attach_type);
+	if (CHECK(err <= 0, "find_vmlinux_btf_id", "failed: %d\n", err))
+		goto out;
+	load_attr.attach_btf_id = err;
+
+	for (i = 0; i < CNT; i++) {
+		fexit_fd[i] = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+		if (CHECK(fexit_fd[i] < 0, "fexit loaded",
+			  "failed: %d errno %d\n", fexit_fd[i], errno))
+			goto out;
+		link_fd[i] = bpf_raw_tracepoint_open(NULL, fexit_fd[i]);
+		if (CHECK(link_fd[i] < 0, "fexit attach failed",
+			  "prog %d failed: %d err %d\n", i, link_fd[i], errno))
+			goto out;
+	}
+
+	filter_fd = bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+	if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
+		  filter_fd, errno))
+		goto out;
+
+	err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), NULL,
+				NULL, &prog_ret, NULL);
+	close(filter_fd);
+	CHECK_FAIL(err);
+out:
+	for (i = 0; i < CNT; i++) {
+		/*
+		 * Skip both the 0 sentinel and the negative value left
+		 * behind by a failed load/attach.
+		 */
+		if (link_fd[i] > 0)
+			close(link_fd[i]);
+		if (fexit_fd[i] > 0)
+			close(fexit_fd[i]);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_test.c b/tools/testing/selftests/bpf/prog_tests/fexit_test.c
new file mode 100644
index 000000000..78d7a2765
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/fexit_test.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include "fexit_test.skel.h"
+
+/*
+ * Load and attach the fexit_test skeleton, run its test1 program once via
+ * bpf_prog_test_run(), and check that the first six __u64 slots of the
+ * skeleton's bss equal 1, stopping at the first mismatch.
+ */
+void test_fexit_test(void)
+{
+ struct fexit_test *fexit_skel = NULL;
+ int err, prog_fd, i;
+ __u32 duration = 0, retval;
+ __u64 *result;
+
+ fexit_skel = fexit_test__open_and_load();
+ if (CHECK(!fexit_skel, "fexit_skel_load", "fexit skeleton failed\n"))
+ goto cleanup;
+
+ err = fexit_test__attach(fexit_skel);
+ if (CHECK(err, "fexit_attach", "fexit attach failed: %d\n", err))
+ goto cleanup;
+
+ prog_fd = bpf_program__fd(fexit_skel->progs.test1);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0,
+ NULL, NULL, &retval, &duration);
+ CHECK(err || retval, "test_run",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+
+ /* view the bss as a flat array of __u64 results -- assumes that layout; TODO confirm */
+ result = (__u64 *)fexit_skel->bss;
+ for (i = 0; i < 6; i++) {
+ if (CHECK(result[i] != 1, "result",
+ "fexit_test%d failed err %lld\n", i + 1, result[i]))
+ goto cleanup;
+ }
+
+cleanup:
+ fexit_test__destroy(fexit_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
new file mode 100644
index 000000000..cd6dc80ed
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector.c
@@ -0,0 +1,619 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <error.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
+#include <sys/uio.h>
+
+#include "bpf_flow.skel.h"
+
+#ifndef IP_MF
+#define IP_MF 0x2000
+#endif
+
+/*
+ * Compare two struct bpf_flow_keys values byte-for-byte (memcmp over
+ * sizeof(got)) and hand the result to CHECK_ATTR under the tag 'desc'.
+ * The format string prints every listed field as a "got/expected" pair.
+ *
+ * 'got' and 'expected' must be lvalues (their address is taken) and are
+ * evaluated several times, so pass plain objects without side effects.
+ */
+#define CHECK_FLOW_KEYS(desc, got, expected) \
+ CHECK_ATTR(memcmp(&got, &expected, sizeof(got)) != 0, \
+ desc, \
+ "nhoff=%u/%u " \
+ "thoff=%u/%u " \
+ "addr_proto=0x%x/0x%x " \
+ "is_frag=%u/%u " \
+ "is_first_frag=%u/%u " \
+ "is_encap=%u/%u " \
+ "ip_proto=0x%x/0x%x " \
+ "n_proto=0x%x/0x%x " \
+ "flow_label=0x%x/0x%x " \
+ "sport=%u/%u " \
+ "dport=%u/%u\n", \
+ got.nhoff, expected.nhoff, \
+ got.thoff, expected.thoff, \
+ got.addr_proto, expected.addr_proto, \
+ got.is_frag, expected.is_frag, \
+ got.is_first_frag, expected.is_first_frag, \
+ got.is_encap, expected.is_encap, \
+ got.ip_proto, expected.ip_proto, \
+ got.n_proto, expected.n_proto, \
+ got.flow_label, expected.flow_label, \
+ got.sport, expected.sport, \
+ got.dport, expected.dport)
+
+/* Packed test frame: Ethernet header, IPv4 header, TCP header. */
+struct ipv4_pkt {
+ struct ethhdr eth;
+ struct iphdr iph;
+ struct tcphdr tcp;
+} __packed;
+
+/*
+ * Packed test frame: Ethernet header, outer IPv4 header, inner IPv4
+ * header, TCP header.
+ */
+struct ipip_pkt {
+ struct ethhdr eth;
+ struct iphdr iph;
+ struct iphdr iph_inner;
+ struct tcphdr tcp;
+} __packed;
+
+/*
+ * Packed test frame with one VLAN tag (TCI + encapsulated protocol)
+ * between the Ethernet header and the IPv4/TCP headers.
+ */
+struct svlan_ipv4_pkt {
+ struct ethhdr eth;
+ __u16 vlan_tci;
+ __u16 vlan_proto;
+ struct iphdr iph;
+ struct tcphdr tcp;
+} __packed;
+
+/* Packed test frame: Ethernet header, IPv6 header, TCP header. */
+struct ipv6_pkt {
+ struct ethhdr eth;
+ struct ipv6hdr iph;
+ struct tcphdr tcp;
+} __packed;
+
+/*
+ * Packed test frame: Ethernet header, IPv6 header, a locally declared
+ * fragment header (struct frag_hdr), TCP header.
+ */
+struct ipv6_frag_pkt {
+ struct ethhdr eth;
+ struct ipv6hdr iph;
+ struct frag_hdr {
+ __u8 nexthdr;
+ __u8 reserved;
+ __be16 frag_off;
+ __be32 identification;
+ } ipf;
+ struct tcphdr tcp;
+} __packed;
+
+/*
+ * Packed test frame with two stacked VLAN tags (TCI + protocol each)
+ * between the Ethernet header and the IPv6/TCP headers.
+ */
+struct dvlan_ipv6_pkt {
+ struct ethhdr eth;
+ __u16 vlan_tci;
+ __u16 vlan_proto;
+ __u16 vlan_tci2;
+ __u16 vlan_proto2;
+ struct ipv6hdr iph;
+ struct tcphdr tcp;
+} __packed;
+
+/*
+ * One flow dissector test case.
+ *
+ * name:  tag used when reporting the result.
+ * pkt:   input frame; the union is as large as the biggest layout, and the
+ *        callers pass sizeof(pkt) regardless of which member is filled in.
+ * keys:  flow keys the dissector is expected to produce for 'pkt'.
+ * flags: input flags for the run; a non-zero value is passed to the
+ *        program through the test-run context.
+ */
+struct test {
+ const char *name;
+ union {
+ struct ipv4_pkt ipv4;
+ struct svlan_ipv4_pkt svlan_ipv4;
+ struct ipip_pkt ipip;
+ struct ipv6_pkt ipv6;
+ struct ipv6_frag_pkt ipv6_frag;
+ struct dvlan_ipv6_pkt dvlan_ipv6;
+ } pkt;
+ struct bpf_flow_keys keys;
+ __u32 flags;
+};
+
+#define VLAN_HLEN 4
+
+static __u32 duration;
+/*
+ * Table of input frames and the flow keys expected for each of them.
+ *
+ * Conventions used by the entries:
+ *  - TCP ports are stored as plain 80/8080 both in the packet and in the
+ *    expected keys, so they compare equal without byte swapping.
+ *  - When an entry sets .flags, the same value is also placed in
+ *    .keys.flags because the whole bpf_flow_keys struct is memcmp()ed.
+ *  - The inner IPv4 total length of the ipip entries is computed before
+ *    the byte swap (htons(MAGIC_BYTES - sizeof(struct iphdr))); swapping
+ *    first and subtracting afterwards yields a bogus length.
+ */
+struct test tests[] = {
+	{
+		.name = "ipv4",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "ipv6",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "802.1q-ipv4",
+		.pkt.svlan_ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_8021Q),
+			.vlan_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN + VLAN_HLEN,
+			.thoff = ETH_HLEN + VLAN_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "802.1ad-ipv6",
+		.pkt.dvlan_ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_8021AD),
+			.vlan_proto = __bpf_constant_htons(ETH_P_8021Q),
+			.vlan_proto2 = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN + VLAN_HLEN * 2,
+			.thoff = ETH_HLEN + VLAN_HLEN * 2 +
+				sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "ipv4-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv4-no-frag",
+		.pkt.ipv4 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_TCP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.frag_off = __bpf_constant_htons(IP_MF),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG,
+	},
+	{
+		.name = "ipv6-no-frag",
+		.pkt.ipv6_frag = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_FRAGMENT,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.ipf.nexthdr = IPPROTO_TCP,
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr) +
+				sizeof(struct frag_hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.is_frag = true,
+			.is_first_frag = true,
+		},
+	},
+	{
+		.name = "ipv6-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.sport = 80,
+			.dport = 8080,
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+	},
+	{
+		.name = "ipv6-no-flow-label",
+		.pkt.ipv6 = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.iph.nexthdr = IPPROTO_TCP,
+			.iph.payload_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph.flow_lbl = { 0xb, 0xee, 0xef },
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct ipv6hdr),
+			.addr_proto = ETH_P_IPV6,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IPV6),
+			.flow_label = __bpf_constant_htonl(0xbeeef),
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL,
+	},
+	{
+		.name = "ipip-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			/* Subtract in host order, then convert. */
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES -
+						     sizeof(struct iphdr)),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr) +
+				sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_TCP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+			.sport = 80,
+			.dport = 8080,
+		},
+	},
+	{
+		.name = "ipip-no-encap",
+		.pkt.ipip = {
+			.eth.h_proto = __bpf_constant_htons(ETH_P_IP),
+			.iph.ihl = 5,
+			.iph.protocol = IPPROTO_IPIP,
+			.iph.tot_len = __bpf_constant_htons(MAGIC_BYTES),
+			.iph_inner.ihl = 5,
+			.iph_inner.protocol = IPPROTO_TCP,
+			/* Subtract in host order, then convert. */
+			.iph_inner.tot_len =
+				__bpf_constant_htons(MAGIC_BYTES -
+						     sizeof(struct iphdr)),
+			.tcp.doff = 5,
+			.tcp.source = 80,
+			.tcp.dest = 8080,
+		},
+		.keys = {
+			.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+			.nhoff = ETH_HLEN,
+			.thoff = ETH_HLEN + sizeof(struct iphdr),
+			.addr_proto = ETH_P_IP,
+			.ip_proto = IPPROTO_IPIP,
+			.n_proto = __bpf_constant_htons(ETH_P_IP),
+			.is_encap = true,
+		},
+		.flags = BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP,
+	},
+};
+
+/*
+ * Open /dev/net/tun and bind it to a TAP interface called 'ifname'
+ * (flags: IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS).
+ *
+ * Returns the open file descriptor on success, -1 on failure. On failure
+ * no descriptor is left open and errno still describes the failing call.
+ */
+static int create_tap(const char *ifname)
+{
+	struct ifreq ifr = {
+		.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI | IFF_NAPI_FRAGS,
+	};
+	int fd, ret, saved_errno;
+
+	/*
+	 * ifr is zero-initialized; copying at most size - 1 bytes guarantees
+	 * the name stays NUL-terminated even for an over-long 'ifname'.
+	 */
+	strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name) - 1);
+
+	fd = open("/dev/net/tun", O_RDWR);
+	if (fd < 0)
+		return -1;
+
+	ret = ioctl(fd, TUNSETIFF, &ifr);
+	if (ret) {
+		/* Do not leak the tun fd; keep the ioctl errno for the caller. */
+		saved_errno = errno;
+		close(fd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Write 'len' bytes starting at 'pkt' to 'fd' with a single-element
+ * writev() and return writev()'s result.
+ */
+static int tx_tap(int fd, void *pkt, size_t len)
+{
+	struct iovec vec = {
+		.iov_base = pkt,
+		.iov_len = len,
+	};
+
+	return writev(fd, &vec, 1);
+}
+
+/*
+ * Set IFF_UP on interface 'ifname': read the current flags with
+ * SIOCGIFFLAGS, OR in IFF_UP and write them back with SIOCSIFFLAGS.
+ *
+ * Returns 0 on success, -1 if the socket or either ioctl fails.
+ */
+static int ifup(const char *ifname)
+{
+	struct ifreq req = {};
+	int sock, rc = -1;
+
+	strncpy(req.ifr_name, ifname, sizeof(req.ifr_name));
+
+	sock = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sock < 0)
+		return -1;
+
+	if (ioctl(sock, SIOCGIFFLAGS, &req))
+		goto out;
+
+	req.ifr_flags |= IFF_UP;
+	if (ioctl(sock, SIOCSIFFLAGS, &req))
+		goto out;
+
+	rc = 0;
+out:
+	/* The control socket is only needed for the two ioctls. */
+	close(sock);
+	return rc;
+}
+
+/*
+ * Fill every slot of 'prog_array' with the fd of the program whose
+ * section title is "flow_dissector/<slot index>" inside 'obj'.
+ *
+ * Returns 0 when all max_entries slots were populated, -1 as soon as the
+ * map fd, a program lookup, a program fd or a map update fails.
+ */
+static int init_prog_array(struct bpf_object *obj, struct bpf_map *prog_array)
+{
+	struct bpf_program *prog;
+	int slot, map_fd, prog_fd;
+	char title[32];
+
+	map_fd = bpf_map__fd(prog_array);
+	if (map_fd < 0)
+		return -1;
+
+	for (slot = 0; slot < bpf_map__def(prog_array)->max_entries; slot++) {
+		snprintf(title, sizeof(title), "flow_dissector/%i", slot);
+
+		prog = bpf_object__find_program_by_title(obj, title);
+		if (!prog)
+			return -1;
+
+		prog_fd = bpf_program__fd(prog);
+		if (prog_fd < 0)
+			return -1;
+
+		/* The slot index doubles as the map key. */
+		if (bpf_map_update_elem(map_fd, &slot, &prog_fd, BPF_ANY))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Run the skb-less variant of the table-driven tests: send each eligible
+ * frame through the tap device, then read the dissected keys back from
+ * the 'keys' map (indexed by sport << 16 | dport), compare them with the
+ * expectation and delete the map entry again.
+ */
+static void run_tests_skb_less(int tap_fd, struct bpf_map *keys)
+{
+	/* Keep in sync with 'flags' from eth_get_headlen. */
+	const __u32 eth_get_headlen_flags = BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
+	int idx, rc, keys_fd;
+
+	keys_fd = bpf_map__fd(keys);
+	if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd))
+		return;
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		struct test *t = &tests[idx];
+		/* NOTE(review): CHECK_ATTR() appears to need 'tattr' by name. */
+		struct bpf_prog_test_run_attr tattr = {};
+		struct bpf_flow_keys flow_keys = {};
+		__u32 key = ((__u32)t->keys.sport << 16) | t->keys.dport;
+
+		/*
+		 * Input flags cannot be passed in the skb-less case, so only
+		 * entries whose flags equal the fixed set above are run.
+		 */
+		if (t->flags != eth_get_headlen_flags)
+			continue;
+
+		rc = tx_tap(tap_fd, &t->pkt, sizeof(t->pkt));
+		CHECK(rc < 0, "tx_tap", "err %d errno %d\n", rc, errno);
+
+		rc = bpf_map_lookup_elem(keys_fd, &key, &flow_keys);
+		CHECK_ATTR(rc, t->name, "bpf_map_lookup_elem %d\n", rc);
+
+		CHECK_ATTR(rc, t->name, "skb-less err %d\n", rc);
+		CHECK_FLOW_KEYS(t->name, flow_keys, t->keys);
+
+		rc = bpf_map_delete_elem(keys_fd, &key);
+		CHECK_ATTR(rc, t->name, "bpf_map_delete_elem %d\n", rc);
+	}
+}
+
+/*
+ * Attach the _dissect program directly with bpf_prog_attach(), run the
+ * skb-less tests over 'tap_fd', then detach it with bpf_prog_detach2().
+ */
+static void test_skb_less_prog_attach(struct bpf_flow *skel, int tap_fd)
+{
+	int dissect_fd, rc;
+
+	dissect_fd = bpf_program__fd(skel->progs._dissect);
+	if (CHECK(dissect_fd < 0, "bpf_program__fd", "err %d\n", dissect_fd))
+		return;
+
+	rc = bpf_prog_attach(dissect_fd, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK(rc, "bpf_prog_attach", "err %d errno %d\n", rc, errno))
+		return;
+
+	run_tests_skb_less(tap_fd, skel->maps.last_dissection);
+
+	rc = bpf_prog_detach2(dissect_fd, 0, BPF_FLOW_DISSECTOR);
+	CHECK(rc, "bpf_prog_detach2", "err %d errno %d\n", rc, errno);
+}
+
+/*
+ * Attach the _dissect program to the current network namespace through a
+ * bpf_link, run the skb-less tests over 'tap_fd', then destroy the link.
+ */
+static void test_skb_less_link_create(struct bpf_flow *skel, int tap_fd)
+{
+	struct bpf_link *netns_link;
+	int rc, ns_fd;
+
+	ns_fd = open("/proc/self/ns/net", O_RDONLY);
+	if (CHECK(ns_fd < 0, "open(/proc/self/ns/net)", "err %d\n", errno))
+		return;
+
+	netns_link = bpf_program__attach_netns(skel->progs._dissect, ns_fd);
+	if (!CHECK(IS_ERR(netns_link), "attach_netns", "err %ld\n",
+		   PTR_ERR(netns_link))) {
+		run_tests_skb_less(tap_fd, skel->maps.last_dissection);
+
+		rc = bpf_link__destroy(netns_link);
+		CHECK(rc, "bpf_link__destroy", "err %d\n", rc);
+	}
+
+	/* The namespace fd is released whether or not the attach worked. */
+	close(ns_fd);
+}
+
+/*
+ * Entry point of the flow dissector test.
+ *
+ * 1. Load the bpf_flow skeleton and populate its jmp_table program array.
+ * 2. Run every entry of tests[] through bpf_prog_test_run_xattr() and
+ *    compare the returned flow keys with the expected ones.
+ * 3. Create and bring up the "tap0" device and repeat the eligible tests
+ *    in skb-less mode, once with a directly attached program and once
+ *    with a netns link.
+ *
+ * If the tap device cannot be created or brought up, the skb-less part is
+ * skipped instead of being run against an invalid descriptor.
+ */
+void test_flow_dissector(void)
+{
+	int i, err, prog_fd, keys_fd = -1, tap_fd;
+	struct bpf_flow *skel;
+
+	skel = bpf_flow__open_and_load();
+	if (CHECK(!skel, "skel", "failed to open/load skeleton\n"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs._dissect);
+	if (CHECK(prog_fd < 0, "bpf_program__fd", "err %d\n", prog_fd))
+		goto out_destroy_skel;
+	/* keys_fd is only validated here; the helpers look it up again. */
+	keys_fd = bpf_map__fd(skel->maps.last_dissection);
+	if (CHECK(keys_fd < 0, "bpf_map__fd", "err %d\n", keys_fd))
+		goto out_destroy_skel;
+	err = init_prog_array(skel->obj, skel->maps.jmp_table);
+	if (CHECK(err, "init_prog_array", "err %d\n", err))
+		goto out_destroy_skel;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		/*
+		 * Zeroed so that the comparison below never reads
+		 * indeterminate bytes when the test run itself fails.
+		 */
+		struct bpf_flow_keys flow_keys = {};
+		struct bpf_prog_test_run_attr tattr = {
+			.prog_fd = prog_fd,
+			.data_in = &tests[i].pkt,
+			.data_size_in = sizeof(tests[i].pkt),
+			.data_out = &flow_keys,
+		};
+		static struct bpf_flow_keys ctx = {};
+
+		/* A context is passed only when the entry requests flags. */
+		if (tests[i].flags) {
+			tattr.ctx_in = &ctx;
+			tattr.ctx_size_in = sizeof(ctx);
+			ctx.flags = tests[i].flags;
+		}
+
+		err = bpf_prog_test_run_xattr(&tattr);
+		CHECK_ATTR(tattr.data_size_out != sizeof(flow_keys) ||
+			   err || tattr.retval != 1,
+			   tests[i].name,
+			   "err %d errno %d retval %d duration %d size %u/%zu\n",
+			   err, errno, tattr.retval, tattr.duration,
+			   tattr.data_size_out, sizeof(flow_keys));
+		CHECK_FLOW_KEYS(tests[i].name, flow_keys, tests[i].keys);
+	}
+
+	/* Do the same tests but for skb-less flow dissector.
+	 * We use a known path in the net/tun driver that calls
+	 * eth_get_headlen and we manually export bpf_flow_keys
+	 * via BPF map in this case.
+	 */
+
+	tap_fd = create_tap("tap0");
+	if (CHECK(tap_fd < 0, "create_tap", "tap_fd %d errno %d\n",
+		  tap_fd, errno))
+		goto out_destroy_skel;
+	err = ifup("tap0");
+	if (CHECK(err, "ifup", "err %d errno %d\n", err, errno))
+		goto out_close_tap;
+
+	/* Test direct prog attachment */
+	test_skb_less_prog_attach(skel, tap_fd);
+	/* Test indirect prog attachment via link */
+	test_skb_less_link_create(skel, tap_fd);
+
+out_close_tap:
+	close(tap_fd);
+out_destroy_skel:
+	bpf_flow__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
new file mode 100644
index 000000000..0e8a4d2f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_load_bytes.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Load a tiny flow dissector program that calls bpf_skb_load_bytes() and
+ * returns BPF_DROP if the helper succeeded, BPF_OK if it failed. The test
+ * run over pkt_v4 is expected to report retval 1 together with a full
+ * bpf_flow_keys worth of output.
+ *
+ * If the program cannot be loaded the test stops there instead of running
+ * (and closing) an invalid descriptor.
+ */
+void test_flow_dissector_load_bytes(void)
+{
+	struct bpf_flow_keys flow_keys;
+	__u32 duration = 0, retval, size;
+	struct bpf_insn prog[] = {
+		// BPF_REG_1 - 1st argument: context
+		// BPF_REG_2 - 2nd argument: offset, start at first byte
+		BPF_MOV64_IMM(BPF_REG_2, 0),
+		// BPF_REG_3 - 3rd argument: destination, reserve byte on stack
+		BPF_ALU64_REG(BPF_MOV, BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -1),
+		// BPF_REG_4 - 4th argument: copy one byte
+		BPF_MOV64_IMM(BPF_REG_4, 1),
+		// bpf_skb_load_bytes(ctx, 0, ptr, 1)
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_skb_load_bytes),
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+		// if (ret == 0) return BPF_DROP (2)
+		BPF_MOV64_IMM(BPF_REG_0, BPF_DROP),
+		BPF_EXIT_INSN(),
+		// if (ret != 0) return BPF_OK (0)
+		BPF_MOV64_IMM(BPF_REG_0, BPF_OK),
+		BPF_EXIT_INSN(),
+	};
+	int fd, err;
+
+	/* make sure bpf_skb_load_bytes is not allowed from skb-less context
+	 */
+	fd = bpf_load_program(BPF_PROG_TYPE_FLOW_DISSECTOR, prog,
+			      ARRAY_SIZE(prog), "GPL", 0, NULL, 0);
+	if (CHECK(fd < 0,
+		  "flow_dissector-bpf_skb_load_bytes-load",
+		  "fd %d errno %d\n",
+		  fd, errno))
+		return;
+
+	err = bpf_prog_test_run(fd, 1, &pkt_v4, sizeof(pkt_v4),
+				&flow_keys, &size, &retval, &duration);
+	CHECK(size != sizeof(flow_keys) || err || retval != 1,
+	      "flow_dissector-bpf_skb_load_bytes",
+	      "err %d errno %d retval %d duration %d size %u/%zu\n",
+	      err, errno, retval, duration, size, sizeof(flow_keys));
+
+	/* fd is known to be valid here (checked right after the load). */
+	close(fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
new file mode 100644
index 000000000..172c586b6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/flow_dissector_reattach.c
@@ -0,0 +1,678 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for attaching, detaching, and replacing flow_dissector BPF program.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+
+#include "test_progs.h"
+
+static int init_net = -1;
+
+/*
+ * Ask bpf_prog_query() which BPF_FLOW_DISSECTOR program is attached to
+ * 'netns'. Returns its id when exactly one program is reported, 0 when
+ * the query fails or reports any other count.
+ */
+static __u32 query_attached_prog_id(int netns)
+{
+	__u32 ids[1] = {};
+	__u32 cnt = ARRAY_SIZE(ids);
+	int rc;
+
+	rc = bpf_prog_query(netns, BPF_FLOW_DISSECTOR, 0, NULL, ids, &cnt);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_query");
+		return 0;
+	}
+
+	if (cnt != 1)
+		return 0;
+	return ids[0];
+}
+
+/* True when query_attached_prog_id() reports a non-zero id for 'netns'. */
+static bool prog_is_attached(int netns)
+{
+	__u32 id = query_attached_prog_id(netns);
+
+	return id != 0;
+}
+
+/*
+ * Load a two-instruction program of the given 'type' that just returns
+ * BPF_OK. Returns the value of bpf_load_program(); a negative value is
+ * recorded as a test failure and reported with perror().
+ */
+static int load_prog(enum bpf_prog_type type)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, BPF_OK),
+		BPF_EXIT_INSN(),
+	};
+	int prog_fd = bpf_load_program(type, insns, ARRAY_SIZE(insns), "GPL",
+				       0, NULL, 0);
+
+	if (CHECK_FAIL(prog_fd < 0))
+		perror("bpf_load_program");
+
+	return prog_fd;
+}
+
+/*
+ * Return the id of the program behind fd 'prog' as reported by
+ * bpf_obj_get_info_by_fd(), or 0 if the call fails or returns an info
+ * length different from sizeof(struct bpf_prog_info).
+ */
+static __u32 query_prog_id(int prog)
+{
+	struct bpf_prog_info prog_info = {};
+	__u32 len = sizeof(prog_info);
+	int rc;
+
+	rc = bpf_obj_get_info_by_fd(prog, &prog_info, &len);
+	if (CHECK_FAIL(rc || len != sizeof(prog_info))) {
+		perror("bpf_obj_get_info_by_fd");
+		return 0;
+	}
+
+	return prog_info.id;
+}
+
+/*
+ * Move the calling process into a fresh network namespace and return an
+ * fd referring to it (opened from /proc/self/ns/net).
+ *
+ * Returns -1 on failure; if the namespace was created but could not be
+ * opened, the process is switched back to 'old_net' first.
+ */
+static int unshare_net(int old_net)
+{
+	int rc, ns_fd;
+
+	rc = unshare(CLONE_NEWNET);
+	if (CHECK_FAIL(rc)) {
+		perror("unshare(CLONE_NEWNET)");
+		return -1;
+	}
+
+	ns_fd = open("/proc/self/ns/net", O_RDONLY);
+	if (CHECK_FAIL(ns_fd < 0)) {
+		perror("open(/proc/self/ns/net)");
+		setns(old_net, CLONE_NEWNET);
+		return -1;
+	}
+
+	return ns_fd;
+}
+
+/*
+ * prog attach followed by prog attach: attaching a different program
+ * replaces the current one, attaching the same program again is expected
+ * to fail with EINVAL. The attached program is detached at the end.
+ */
+static void test_prog_attach_prog_attach(int netns, int prog1, int prog2)
+{
+	int rc;
+
+	rc = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_attach(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* A different program must be accepted and take over. */
+	rc = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_attach(prog2) #1");
+		goto out_detach;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2));
+
+	/* The very same program a second time must be rejected. */
+	rc = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(!rc || errno != EINVAL))
+		perror("bpf_prog_attach(prog2) #2");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2));
+
+out_detach:
+	rc = bpf_prog_detach2(prog2, 0, BPF_FLOW_DISSECTOR);
+	if (CHECK_FAIL(rc))
+		perror("bpf_prog_detach");
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * link create followed by link create: while a link for prog1 exists,
+ * creating a second link (prog2) on the same netns is expected to fail
+ * with E2BIG and leave prog1 attached. Closing the first link detaches.
+ */
+static void test_link_create_link_create(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
+	int link1, link2;
+
+	link1 = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &opts);
+	/*
+	 * Check the descriptor that was just returned. Testing the bare
+	 * name 'link' would refer to libc's link() and never catch a
+	 * failed create.
+	 */
+	if (CHECK_FAIL(link1 < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Expect failure creating link when another link exists */
+	errno = 0;
+	link2 = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR, &opts);
+	if (CHECK_FAIL(link2 != -1 || errno != E2BIG))
+		perror("bpf_link_create(prog2) expected E2BIG");
+	if (link2 != -1)
+		close(link2);
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(link1);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * prog attach followed by link create: with prog1 attached directly,
+ * creating a link for prog2 is expected to fail with EEXIST and leave
+ * prog1 in place. prog1 is detached at the end.
+ */
+static void test_prog_attach_link_create(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	int rc, link_fd;
+
+	rc = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_attach(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* The link must be refused while a program is attached directly. */
+	errno = 0;
+	link_fd = bpf_link_create(prog2, netns, BPF_FLOW_DISSECTOR,
+				  &create_opts);
+	if (CHECK_FAIL(link_fd != -1 || errno != EEXIST))
+		perror("bpf_link_create(prog2) expected EEXIST");
+	if (link_fd != -1)
+		close(link_fd);
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	rc = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR);
+	if (CHECK_FAIL(rc))
+		perror("bpf_prog_detach");
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * link create followed by prog attach: while a link for prog1 exists,
+ * a direct attach of prog2 is expected to fail with EEXIST and leave
+ * prog1 attached. Closing the link detaches.
+ */
+static void test_link_create_prog_attach(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR,
+				  &create_opts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* A direct attach must be refused while the link exists. */
+	errno = 0;
+	rc = bpf_prog_attach(prog2, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(!rc || errno != EEXIST))
+		perror("bpf_prog_attach(prog2) expected EEXIST");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * link create followed by prog detach: a program attached through a link
+ * cannot be removed with bpf_prog_detach2(); the call is expected to fail
+ * with EINVAL and prog1 stays attached until the link is closed.
+ */
+static void test_link_create_prog_detach(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR,
+				  &create_opts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Detaching by program fd must be refused while the link exists. */
+	errno = 0;
+	rc = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR);
+	if (CHECK_FAIL(!rc || errno != EINVAL))
+		perror("bpf_prog_detach expected EINVAL");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Attach prog1 directly, detach it again and verify that a query on
+ * 'netns' no longer reports an attached program. prog2 is unused.
+ */
+static void test_prog_attach_detach_query(int netns, int prog1, int prog2)
+{
+	int rc;
+
+	rc = bpf_prog_attach(prog1, 0, BPF_FLOW_DISSECTOR, 0);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_attach(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	rc = bpf_prog_detach2(prog1, 0, BPF_FLOW_DISSECTOR);
+	if (CHECK_FAIL(rc)) {
+		perror("bpf_prog_detach");
+		return;
+	}
+
+	/* After a successful detach nothing may be reported as attached. */
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Create a link for prog1, close its only fd and verify that a query on
+ * 'netns' no longer reports an attached program. prog2 is unused.
+ */
+static void test_link_create_close_query(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	int link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR,
+				  &create_opts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Closing the last link fd is expected to detach the program. */
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update without naming the old program: with flags 0 and
+ * old_prog_fd 0, replacing prog1 by prog2 is expected to succeed.
+ */
+static void test_link_update_no_old_prog(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, copts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, uopts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &copts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* No old program given: the swap to prog2 must go through. */
+	uopts.flags = 0;
+	uopts.old_prog_fd = 0;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(rc))
+		perror("bpf_link_update");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2));
+
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update with BPF_F_REPLACE: when old_prog_fd names the currently
+ * attached prog1, replacing it by prog2 is expected to succeed.
+ */
+static void test_link_update_replace_old_prog(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, copts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, uopts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &copts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* F_REPLACE plus the matching old program: update must succeed. */
+	uopts.flags = BPF_F_REPLACE;
+	uopts.old_prog_fd = prog1;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(rc))
+		perror("bpf_link_update");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog2));
+
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update with the program that is already attached: updating the
+ * prog1 link to prog1 again is expected to succeed and change nothing.
+ * prog2 is unused.
+ */
+static void test_link_update_same_prog(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, copts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, uopts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &copts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Updating to the identical program must be accepted. */
+	uopts.flags = 0;
+	uopts.old_prog_fd = 0;
+	rc = bpf_link_update(link_fd, prog1, &uopts);
+	if (CHECK_FAIL(rc))
+		perror("bpf_link_update");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update with invalid option combinations. Each of the following
+ * is expected to fail with the given errno and leave prog1 attached:
+ *   - old_prog_fd set without BPF_F_REPLACE      -> EINVAL
+ *   - BPF_F_REPLACE with a non-matching old prog -> EPERM
+ *   - BPF_F_REPLACE with old_prog_fd == -1       -> EBADF
+ *   - flags == BPF_F_ALLOW_MULTI                 -> EINVAL
+ * The first three abort the remaining steps when they misbehave.
+ */
+static void test_link_update_invalid_opts(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, copts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, uopts);
+	int rc, link_fd;
+
+	link_fd = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &copts);
+	if (CHECK_FAIL(link_fd < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Old prog FD given, but F_REPLACE missing. */
+	errno = 0;
+	uopts.flags = 0;
+	uopts.old_prog_fd = prog1;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(!rc || errno != EINVAL)) {
+		perror("bpf_link_update expected EINVAL");
+		goto out_close;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Old prog FD does not match the attached program. */
+	errno = 0;
+	uopts.flags = BPF_F_REPLACE;
+	uopts.old_prog_fd = prog2;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(!rc || errno != EPERM)) {
+		perror("bpf_link_update expected EPERM");
+		goto out_close;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Old prog FD is not a valid descriptor. */
+	errno = 0;
+	uopts.flags = BPF_F_REPLACE;
+	uopts.old_prog_fd = -1;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(!rc || errno != EBADF)) {
+		perror("bpf_link_update expected EBADF");
+		goto out_close;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Flags value that the update does not accept. */
+	errno = 0;
+	uopts.flags = BPF_F_ALLOW_MULTI;
+	uopts.old_prog_fd = 0;
+	rc = bpf_link_update(link_fd, prog2, &uopts);
+	if (CHECK_FAIL(!rc || errno != EINVAL))
+		perror("bpf_link_update expected EINVAL");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+out_close:
+	close(link_fd);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update with an unusable new program. Both cases are expected to
+ * fail and leave prog1 attached:
+ *   - new prog fd == -1                          -> EBADF
+ *   - new prog of type BPF_PROG_TYPE_SOCKET_FILTER -> EINVAL
+ * prog2 is unused.
+ */
+static void test_link_update_invalid_prog(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts);
+	int err, link, prog3;
+
+	link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts);
+	if (CHECK_FAIL(link < 0)) {
+		perror("bpf_link_create(prog1)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	/* Expect failure when new prog FD is not valid */
+	errno = 0;
+	update_opts.flags = 0;
+	update_opts.old_prog_fd = 0;
+	err = bpf_link_update(link, -1, &update_opts);
+	if (CHECK_FAIL(!err || errno != EBADF)) {
+		/* The message names the errno that is actually checked. */
+		perror("bpf_link_update expected EBADF");
+		goto out_close_link;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	prog3 = load_prog(BPF_PROG_TYPE_SOCKET_FILTER);
+	if (prog3 < 0)
+		goto out_close_link;
+
+	/* Expect failure when new prog FD type doesn't match */
+	errno = 0;
+	update_opts.flags = 0;
+	update_opts.old_prog_fd = 0;
+	err = bpf_link_update(link, prog3, &update_opts);
+	if (CHECK_FAIL(!err || errno != EINVAL))
+		perror("bpf_link_update expected EINVAL");
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(prog3);
+out_close_link:
+	close(link);
+	CHECK_FAIL(prog_is_attached(netns));
+}
+
+/*
+ * Link update after the namespace is gone: create a link inside a fresh
+ * netns, close the netns fd and switch back to the original namespace,
+ * then expect bpf_link_update() on the orphaned link to fail with
+ * ENOLINK.
+ *
+ * On every exit path the netns fd is closed, and a switch back to
+ * 'netns' (the original namespace passed in) is at least attempted, so
+ * a failure here does not leak the fd into later tests.
+ */
+static void test_link_update_netns_gone(int netns, int prog1, int prog2)
+{
+	DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts);
+	int err, link, old_net;
+
+	old_net = netns;
+	netns = unshare_net(old_net);
+	if (netns < 0)
+		return;
+
+	link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts);
+	if (CHECK_FAIL(link < 0)) {
+		perror("bpf_link_create(prog1)");
+		/* Undo unshare_net(): drop the fd and go back home. */
+		close(netns);
+		if (CHECK_FAIL(setns(old_net, CLONE_NEWNET)))
+			perror("setns(CLONE_NEWNET)");
+		return;
+	}
+	CHECK_FAIL(query_attached_prog_id(netns) != query_prog_id(prog1));
+
+	close(netns);
+	err = setns(old_net, CLONE_NEWNET);
+	if (CHECK_FAIL(err)) {
+		perror("setns(CLONE_NEWNET)");
+		close(link);
+		return;
+	}
+
+	/* Expect failure when netns destroyed */
+	errno = 0;
+	update_opts.flags = 0;
+	update_opts.old_prog_fd = 0;
+	err = bpf_link_update(link, prog2, &update_opts);
+	if (CHECK_FAIL(!err || errno != ENOLINK))
+		perror("bpf_link_update");
+
+	close(link);
+}
+
+/*
+ * Verify the bpf_link_info reported by bpf_obj_get_info_by_fd() for a
+ * netns link at three points:
+ *   1. right after creating the link for prog1 in a fresh netns,
+ *   2. after updating the link to prog2 (only prog_id may change),
+ *   3. after switching back to the original netns and closing the last
+ *      fd to the new one (netns_ino is expected to read 0).
+ *
+ * 'netns' is the original namespace on entry; it is reused below to hold
+ * the fd of the newly unshared namespace.
+ */
+static void test_link_get_info(int netns, int prog1, int prog2)
+{
+ DECLARE_LIBBPF_OPTS(bpf_link_create_opts, create_opts);
+ DECLARE_LIBBPF_OPTS(bpf_link_update_opts, update_opts);
+ struct bpf_link_info info = {};
+ struct stat netns_stat = {};
+ __u32 info_len, link_id;
+ int err, link, old_net;
+
+ old_net = netns;
+ netns = unshare_net(old_net);
+ if (netns < 0)
+ return;
+
+ /* Remember the inode of the new netns for the netns_ino checks. */
+ err = fstat(netns, &netns_stat);
+ if (CHECK_FAIL(err)) {
+ perror("stat(netns)");
+ goto out_resetns;
+ }
+
+ link = bpf_link_create(prog1, netns, BPF_FLOW_DISSECTOR, &create_opts);
+ if (CHECK_FAIL(link < 0)) {
+ perror("bpf_link_create(prog1)");
+ goto out_resetns;
+ }
+
+ info_len = sizeof(info);
+ err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_obj_get_info");
+ goto out_unlink;
+ }
+ CHECK_FAIL(info_len != sizeof(info));
+
+ /* Expect link info to be sane and match prog and netns details */
+ CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS);
+ CHECK_FAIL(info.id == 0);
+ CHECK_FAIL(info.prog_id != query_prog_id(prog1));
+ CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino);
+ CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR);
+
+ update_opts.flags = 0;
+ update_opts.old_prog_fd = 0;
+ err = bpf_link_update(link, prog2, &update_opts);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_link_update(prog2)");
+ goto out_unlink;
+ }
+
+ /* Save the link id before 'info' is overwritten by the next query. */
+ link_id = info.id;
+ info_len = sizeof(info);
+ err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_obj_get_info");
+ goto out_unlink;
+ }
+ CHECK_FAIL(info_len != sizeof(info));
+
+ /* Expect no info change after update except in prog id */
+ CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS);
+ CHECK_FAIL(info.id != link_id);
+ CHECK_FAIL(info.prog_id != query_prog_id(prog2));
+ CHECK_FAIL(info.netns.netns_ino != netns_stat.st_ino);
+ CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR);
+
+ /* Leave netns link is attached to and close last FD to it */
+ err = setns(old_net, CLONE_NEWNET);
+ if (CHECK_FAIL(err)) {
+ perror("setns(NEWNET)");
+ goto out_unlink;
+ }
+ close(netns);
+ /* Mark both as handled so the cleanup labels below skip them. */
+ old_net = -1;
+ netns = -1;
+
+ info_len = sizeof(info);
+ err = bpf_obj_get_info_by_fd(link, &info, &info_len);
+ if (CHECK_FAIL(err)) {
+ perror("bpf_obj_get_info");
+ goto out_unlink;
+ }
+ CHECK_FAIL(info_len != sizeof(info));
+
+ /* Expect netns_ino to change to 0 */
+ CHECK_FAIL(info.type != BPF_LINK_TYPE_NETNS);
+ CHECK_FAIL(info.id != link_id);
+ CHECK_FAIL(info.prog_id != query_prog_id(prog2));
+ CHECK_FAIL(info.netns.netns_ino != 0);
+ CHECK_FAIL(info.netns.attach_type != BPF_FLOW_DISSECTOR);
+
+out_unlink:
+ close(link);
+out_resetns:
+ /* -1 means the namespace switch / close already happened above. */
+ if (old_net != -1)
+ setns(old_net, CLONE_NEWNET);
+ if (netns != -1)
+ close(netns);
+}
+
+/*
+ * Run every flow dissector attach/link subtest against one netns.
+ *
+ * Loads two BPF_PROG_TYPE_FLOW_DISSECTOR programs, passes them to each
+ * subtest enabled by test__start_subtest(), and closes them afterwards.
+ *
+ * @netns: FD of the netns to test in; subtest names get an
+ *         " (init_net)" suffix when it equals the init_net FD
+ */
+static void run_tests(int netns)
+{
+	struct test {
+		const char *test_name;
+		void (*test_func)(int netns, int prog1, int prog2);
+	} tests[] = {
+		{ "prog attach, prog attach",
+		  test_prog_attach_prog_attach },
+		{ "link create, link create",
+		  test_link_create_link_create },
+		{ "prog attach, link create",
+		  test_prog_attach_link_create },
+		{ "link create, prog attach",
+		  test_link_create_prog_attach },
+		{ "link create, prog detach",
+		  test_link_create_prog_detach },
+		{ "prog attach, detach, query",
+		  test_prog_attach_detach_query },
+		{ "link create, close, query",
+		  test_link_create_close_query },
+		{ "link update no old prog",
+		  test_link_update_no_old_prog },
+		{ "link update with replace old prog",
+		  test_link_update_replace_old_prog },
+		{ "link update with same prog",
+		  test_link_update_same_prog },
+		{ "link update invalid opts",
+		  test_link_update_invalid_opts },
+		{ "link update invalid prog",
+		  test_link_update_invalid_prog },
+		{ "link update netns gone",
+		  test_link_update_netns_gone },
+		{ "link get info",
+		  test_link_get_info },
+	};
+	int i, progs[2] = { -1, -1 };
+	char test_name[80];
+
+	for (i = 0; i < ARRAY_SIZE(progs); i++) {
+		progs[i] = load_prog(BPF_PROG_TYPE_FLOW_DISSECTOR);
+		if (progs[i] < 0)
+			goto out_close;
+	}
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		snprintf(test_name, sizeof(test_name),
+			 "flow dissector %s%s",
+			 tests[i].test_name,
+			 netns == init_net ? " (init_net)" : "");
+		if (test__start_subtest(test_name))
+			tests[i].test_func(netns, progs[0], progs[1]);
+	}
+out_close:
+	for (i = 0; i < ARRAY_SIZE(progs); i++) {
+		/*
+		 * Only close real descriptors: a slot is either still -1 or
+		 * holds whatever negative value load_prog() failed with.
+		 */
+		if (progs[i] >= 0)
+			CHECK_FAIL(close(progs[i]));
+	}
+}
+
+/*
+ * Test entry point: run the flow dissector attach/link subtests first in
+ * the netns of PID 1 (/proc/1/ns/net) and then in a namespace obtained
+ * from unshare_net(), restoring the netns the process started in at the
+ * end. The whole test is skipped when prog_is_attached() reports a flow
+ * dissector on init_net.
+ */
+void test_flow_dissector_reattach(void)
+{
+	int err, new_net, saved_net;
+
+	saved_net = open("/proc/self/ns/net", O_RDONLY);
+	if (CHECK_FAIL(saved_net < 0)) {
+		perror("open(/proc/self/ns/net)");
+		return;
+	}
+
+	init_net = open("/proc/1/ns/net", O_RDONLY);
+	if (CHECK_FAIL(init_net < 0)) {
+		perror("open(/proc/1/ns/net)");
+		goto out_close;
+	}
+
+	err = setns(init_net, CLONE_NEWNET);
+	if (CHECK_FAIL(err)) {
+		perror("setns(/proc/1/ns/net)");
+		goto out_close;
+	}
+
+	if (prog_is_attached(init_net)) {
+		test__skip();
+		printf("Can't test with flow dissector attached to init_net\n");
+		goto out_setns;
+	}
+
+	/* First run tests in root network namespace */
+	run_tests(init_net);
+
+	/* Then repeat tests in a non-root namespace */
+	new_net = unshare_net(init_net);
+	if (new_net < 0)
+		goto out_setns;
+	run_tests(new_net);
+	close(new_net);
+
+out_setns:
+	/* Move back to netns we started in. */
+	err = setns(saved_net, CLONE_NEWNET);
+	if (CHECK_FAIL(err))
+		perror("setns(/proc/self/ns/net)");
+
+out_close:
+	/* init_net is negative here if opening /proc/1/ns/net failed */
+	if (init_net >= 0)
+		close(init_net);
+	close(saved_net);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
new file mode 100644
index 000000000..925722217
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/get_stack_raw_tp.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/socket.h>
+#include <test_progs.h>
+
+#define MAX_CNT_RAWTP 10ull
+#define MAX_STACK_RAWTP 100
+
+static int duration = 0;
+
+/*
+ * Full-size sample layout checked by get_stack_print_output(); samples
+ * smaller than this struct are treated there as a bare array of __u64.
+ * NOTE(review): presumably mirrors the struct emitted by the BPF program
+ * in test_get_stack_rawtp - confirm against that source.
+ */
+struct get_stack_trace_t {
+ int pid;
+ /* divided by sizeof(__u64) by the reader to get the number of entries */
+ int kern_stack_size;
+ /* the reader requires both of the following sizes to be > 0 */
+ int user_stack_size;
+ int user_stack_buildid_size;
+ __u64 kern_stack[MAX_STACK_RAWTP];
+ __u64 user_stack[MAX_STACK_RAWTP];
+ struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+/*
+ * Decide whether a captured kernel stack of @num_stack addresses is sane.
+ *
+ * If jit is enabled, we do not have a good way to verify the sanity of
+ * the kernel stack. So we just assume it is good if the stack is not
+ * empty. This could be improved in the future. Without jit, one of the
+ * addresses has to resolve to the "___bpf_prog_run" symbol.
+ */
+static bool kern_stack_is_good(const __u64 *stack, int num_stack)
+{
+	const char *nonjit_func = "___bpf_prog_run";
+	struct ksym *ks;
+	int i;
+
+	if (env.jit_enabled)
+		return num_stack > 0;
+
+	for (i = 0; i < num_stack; i++) {
+		ks = ksym_search(stack[i]);
+		if (ks && (strcmp(ks->name, nonjit_func) == 0))
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Perf buffer sample callback: validate one stack sample.
+ *
+ * @ctx:  unused
+ * @cpu:  unused
+ * @data: sample payload, either a struct get_stack_trace_t or, when
+ *        @size is smaller than that struct, a bare array of __u64
+ * @size: payload size in bytes
+ *
+ * Flags a "kern_stack" and/or "user_stack" CHECK failure for bad samples.
+ */
+static void get_stack_print_output(void *ctx, int cpu, void *data, __u32 size)
+{
+	bool good_kern_stack, good_user_stack;
+	struct get_stack_trace_t *e = data;
+	int num_stack;
+
+	if (size < sizeof(struct get_stack_trace_t)) {
+		num_stack = size / sizeof(__u64);
+		good_kern_stack = kern_stack_is_good(data, num_stack);
+		/* nothing else to inspect in a short sample */
+		good_user_stack = good_kern_stack;
+	} else {
+		num_stack = e->kern_stack_size / sizeof(__u64);
+		good_kern_stack = kern_stack_is_good(e->kern_stack, num_stack);
+		good_user_stack = e->user_stack_size > 0 &&
+				  e->user_stack_buildid_size > 0;
+	}
+
+	/* CHECK is only invoked for samples that are already known bad */
+	if (!good_kern_stack)
+		CHECK(!good_kern_stack, "kern_stack", "corrupted kernel stack\n");
+	if (!good_user_stack)
+		CHECK(!good_user_stack, "user_stack", "corrupted user stack\n");
+}
+
+/*
+ * Test entry point: check that test_get_stack_rawtp_err.o is rejected at
+ * load time, then load test_get_stack_rawtp.o, attach its program to the
+ * sys_enter raw tracepoint, trigger syscalls with nanosleep() and let
+ * get_stack_print_output() validate the samples read from "perfmap".
+ */
+void test_get_stack_raw_tp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	const char *file_err = "./test_get_stack_rawtp_err.o";
+	const char *prog_name = "raw_tracepoint/sys_enter";
+	int i, err, prog_fd, exp_cnt = MAX_CNT_RAWTP;
+	struct perf_buffer_opts pb_opts = {};
+	struct perf_buffer *pb = NULL;
+	struct bpf_link *link = NULL;
+	struct timespec tv = {0, 10};
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	cpu_set_t cpu_set;
+
+	/* Loading the "_err" object is expected to fail */
+	err = bpf_prog_load(file_err, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err >= 0, "prog_load raw tp", "err %d errno %d\n", err, errno)) {
+		/* it loaded after all: release the object instead of leaking it */
+		bpf_object__close(obj);
+		return;
+	}
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	prog = bpf_object__find_program_by_title(obj, prog_name);
+	if (CHECK(!prog, "find_probe", "prog '%s' not found\n", prog_name))
+		goto close_prog;
+
+	map = bpf_object__find_map_by_name(obj, "perfmap");
+	if (CHECK(!map, "bpf_find_map", "not found\n"))
+		goto close_prog;
+
+	err = load_kallsyms();
+	if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	/* Pin this thread to CPU 0 */
+	CPU_ZERO(&cpu_set);
+	CPU_SET(0, &cpu_set);
+	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+	if (CHECK(err, "set_affinity", "err %d, errno %d\n", err, errno))
+		goto close_prog;
+
+	link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
+	if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+		goto close_prog;
+
+	pb_opts.sample_cb = get_stack_print_output;
+	pb = perf_buffer__new(bpf_map__fd(map), 8, &pb_opts);
+	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+		goto close_prog;
+
+	/* trigger some syscall action */
+	for (i = 0; i < MAX_CNT_RAWTP; i++)
+		nanosleep(&tv, NULL);
+
+	/*
+	 * NOTE(review): this loop only ends once the poll results add up to
+	 * MAX_CNT_RAWTP or a poll fails; there is no overall timeout.
+	 */
+	while (exp_cnt > 0) {
+		err = perf_buffer__poll(pb, 100);
+		if (err < 0 && CHECK(err < 0, "pb__poll", "err %d\n", err))
+			goto close_prog;
+		exp_cnt -= err;
+	}
+
+close_prog:
+	if (!IS_ERR_OR_NULL(link))
+		bpf_link__destroy(link);
+	if (!IS_ERR_OR_NULL(pb))
+		perf_buffer__free(pb);
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
new file mode 100644
index 000000000..d884b2ed5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include <test_progs.h>
+#include "test_stacktrace_build_id.skel.h"
+
+/*
+ * Exercise bpf_program__attach_perf_event() for the "oncpu" program
+ * against three perf events opened on CPU 0 for PERF_COUNT_HW_CPU_CYCLES:
+ *  1. without PERF_SAMPLE_CALLCHAIN - attach is expected to fail,
+ *  2. with PERF_SAMPLE_CALLCHAIN - attach is expected to succeed,
+ *  3. additionally with exclude_callchain_kernel - expected to fail.
+ * The test is skipped when the first event cannot be opened with ENOENT
+ * or EOPNOTSUPP.
+ */
+void test_get_stackid_cannot_attach(void)
+{
+ struct perf_event_attr attr = {
+ /* .type = PERF_TYPE_SOFTWARE, */
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .precise_ip = 1,
+ .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_USER |
+ PERF_SAMPLE_BRANCH_NO_FLAGS |
+ PERF_SAMPLE_BRANCH_NO_CYCLES |
+ PERF_SAMPLE_BRANCH_CALL_STACK,
+ .sample_period = 5000,
+ .size = sizeof(struct perf_event_attr),
+ };
+ struct test_stacktrace_build_id *skel;
+ __u32 duration = 0;
+ int pmu_fd, err;
+
+ skel = test_stacktrace_build_id__open();
+ if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+ return;
+
+ /* override program type */
+ bpf_program__set_perf_event(skel->progs.oncpu);
+
+ err = test_stacktrace_build_id__load(skel);
+ if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
+ goto cleanup;
+
+ /* Case 1: sample_type has no PERF_SAMPLE_CALLCHAIN yet */
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
+ printf("%s:SKIP:cannot open PERF_COUNT_HW_CPU_CYCLES with precise_ip > 0\n",
+ __func__);
+ test__skip();
+ goto cleanup;
+ }
+ if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+ pmu_fd, errno))
+ goto cleanup;
+
+ skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+ pmu_fd);
+ CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_no_callchain",
+ "should have failed\n");
+ close(pmu_fd);
+
+ /* add PERF_SAMPLE_CALLCHAIN, attach should succeed */
+ attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+
+ if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+ pmu_fd, errno))
+ goto cleanup;
+
+ skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+ pmu_fd);
+ CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event_callchain",
+ "err: %ld\n", PTR_ERR(skel->links.oncpu));
+ close(pmu_fd);
+
+ /* add exclude_callchain_kernel, attach should fail */
+ attr.exclude_callchain_kernel = 1;
+
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+
+ if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+ pmu_fd, errno))
+ goto cleanup;
+
+ /*
+ * NOTE(review): this overwrites the link stored by the attach above
+ * without destroying it first - confirm that is intended.
+ */
+ skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+ pmu_fd);
+ CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_exclude_callchain_kernel",
+ "should have failed\n");
+ close(pmu_fd);
+
+cleanup:
+ test_stacktrace_build_id__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data.c b/tools/testing/selftests/bpf/prog_tests/global_data.c
new file mode 100644
index 000000000..9efa7e50e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/global_data.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Compare every entry of the "result_number" map with the value the
+ * test expects for that key.
+ *
+ * @obj:      loaded object that owns the map
+ * @duration: forwarded to CHECK()
+ */
+static void test_global_data_number(struct bpf_object *obj, __u32 duration)
+{
+	struct {
+		char *name;
+		uint32_t key;
+		__u64 num;
+	} cases[] = {
+		{ "relocate .bss reference", 0, 0 },
+		{ "relocate .data reference", 1, 42 },
+		{ "relocate .rodata reference", 2, 24 },
+		{ "relocate .bss reference", 3, 0 },
+		{ "relocate .data reference", 4, 0xffeeff },
+		{ "relocate .rodata reference", 5, 0xabab },
+		{ "relocate .bss reference", 6, 1234 },
+		{ "relocate .bss reference", 7, 0 },
+		{ "relocate .rodata reference", 8, 0xab },
+		{ "relocate .rodata reference", 9, 0x1111111111111111 },
+		{ "relocate .rodata reference", 10, ~0 },
+	}, *c;
+	int fd, err;
+	__u64 got;
+
+	fd = bpf_find_map(__func__, obj, "result_number");
+	if (CHECK_FAIL(fd < 0))
+		return;
+
+	/* one lookup and one CHECK per table row, in table order */
+	for (c = cases; c < cases + sizeof(cases) / sizeof(cases[0]); c++) {
+		err = bpf_map_lookup_elem(fd, &c->key, &got);
+		CHECK(err || got != c->num, c->name,
+		      "err %d result %llx expected %llx\n",
+		      err, got, c->num);
+	}
+}
+
+/*
+ * Compare every entry of the "result_string" map with the 32-byte
+ * string the test expects for that key (all 32 bytes are compared).
+ *
+ * @obj:      loaded object that owns the map
+ * @duration: forwarded to CHECK()
+ */
+static void test_global_data_string(struct bpf_object *obj, __u32 duration)
+{
+	struct {
+		char *name;
+		uint32_t key;
+		char str[32];
+	} cases[] = {
+		{ "relocate .rodata reference", 0, "abcdefghijklmnopqrstuvwxyz" },
+		{ "relocate .data reference", 1, "abcdefghijklmnopqrstuvwxyz" },
+		{ "relocate .bss reference", 2, "" },
+		{ "relocate .data reference", 3, "abcdexghijklmnopqrstuvwxyz" },
+		{ "relocate .bss reference", 4, "\0\0hello" },
+	}, *c;
+	char got[32];
+	int fd, err;
+
+	fd = bpf_find_map(__func__, obj, "result_string");
+	if (CHECK_FAIL(fd < 0))
+		return;
+
+	/* one lookup and one CHECK per table row, in table order */
+	for (c = cases; c < cases + sizeof(cases) / sizeof(cases[0]); c++) {
+		err = bpf_map_lookup_elem(fd, &c->key, got);
+		CHECK(err || memcmp(got, c->str, sizeof(got)),
+		      c->name, "err %d result \'%s\' expected \'%s\'\n",
+		      err, got, c->str);
+	}
+}
+
+/*
+ * Value type of the "result_struct" map entries; test_global_data_struct()
+ * compares whole values with memcmp() over sizeof(struct foo).
+ */
+struct foo {
+ __u8 a;
+ __u32 b;
+ __u64 c;
+};
+
+/*
+ * Compare every entry of the "result_struct" map with the struct foo
+ * value the test expects for that key.
+ *
+ * @obj:      loaded object that owns the map
+ * @duration: forwarded to CHECK()
+ */
+static void test_global_data_struct(struct bpf_object *obj, __u32 duration)
+{
+	struct {
+		char *name;
+		uint32_t key;
+		struct foo val;
+	} cases[] = {
+		{ "relocate .rodata reference", 0, { 42, 0xfefeefef, 0x1111111111111111ULL, } },
+		{ "relocate .bss reference", 1, { } },
+		{ "relocate .rodata reference", 2, { } },
+		{ "relocate .data reference", 3, { 41, 0xeeeeefef, 0x2111111111111111ULL, } },
+	}, *c;
+	struct foo got;
+	int fd, err;
+
+	fd = bpf_find_map(__func__, obj, "result_struct");
+	if (CHECK_FAIL(fd < 0))
+		return;
+
+	/* one lookup and one CHECK per table row, in table order */
+	for (c = cases; c < cases + sizeof(cases) / sizeof(cases[0]); c++) {
+		err = bpf_map_lookup_elem(fd, &c->key, &got);
+		CHECK(err || memcmp(&got, &c->val, sizeof(got)),
+		      c->name, "err %d result { %u, %u, %llu } expected { %u, %u, %llu }\n",
+		      err, got.a, got.b, got.c, c->val.a, c->val.b, c->val.c);
+	}
+}
+
+/*
+ * Check that writing element 0 of the internal "test_glo.rodata" map is
+ * refused with EPERM.
+ *
+ * @obj:      loaded object that owns the map
+ * @duration: forwarded to CHECK()
+ */
+static void test_global_data_rdonly(struct bpf_object *obj, __u32 duration)
+{
+	struct bpf_map *rodata;
+	int err = -ENOMEM;
+	int fd, key = 0;
+	__u8 *scratch;
+
+	rodata = bpf_object__find_map_by_name(obj, "test_glo.rodata");
+	if (CHECK_FAIL(!rodata || !bpf_map__is_internal(rodata)))
+		return;
+
+	fd = bpf_map__fd(rodata);
+	if (CHECK_FAIL(fd < 0))
+		return;
+
+	/* content is irrelevant, the update itself has to be rejected */
+	scratch = malloc(bpf_map__def(rodata)->value_size);
+	if (scratch != NULL)
+		err = bpf_map_update_elem(fd, &key, scratch, 0);
+	free(scratch);
+	CHECK(!err || errno != EPERM, "test .rodata read-only map",
+	      "err %d errno %d\n", err, errno);
+}
+
+/*
+ * Test entry point: load ./test_global_data.o, run its program once on
+ * pkt_v4 and then verify the number, string, struct and read-only
+ * result maps, in that order.
+ */
+void test_global_data(void)
+{
+	static void (* const checks[])(struct bpf_object *, __u32) = {
+		test_global_data_number,
+		test_global_data_string,
+		test_global_data_struct,
+		test_global_data_rdonly,
+	};
+	const char *file = "./test_global_data.o";
+	struct bpf_object *obj;
+	__u32 retval, duration = 0;
+	int prog_fd, err;
+	size_t n;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	if (CHECK(err, "load program", "error %d loading %s\n", err, file))
+		return;
+
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, &retval, &duration);
+	CHECK(err || retval, "pass global data run",
+	      "err %d errno %d retval %d duration %d\n",
+	      err, errno, retval, duration);
+
+	/* the map checks run even if the test run above was flagged */
+	for (n = 0; n < sizeof(checks) / sizeof(checks[0]); n++)
+		checks[n](obj, duration);
+
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/global_data_init.c b/tools/testing/selftests/bpf/prog_tests/global_data_init.c
new file mode 100644
index 000000000..ee46b11f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/global_data_init.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/*
+ * Test entry point for bpf_map__set_initial_value() on the internal
+ * "test_glo.rodata" map of ./test_global_data.o:
+ *  - a value one byte short of value_size must be rejected,
+ *  - a value of exactly value_size must be accepted before load and be
+ *    what a lookup of key 0 returns after bpf_object__load(),
+ *  - setting the value again after load must be rejected.
+ */
+void test_global_data_init(void)
+{
+ const char *file = "./test_global_data.o";
+ int err = -ENOMEM, map_fd, zero = 0;
+ __u8 *buff = NULL, *newval = NULL;
+ struct bpf_object *obj;
+ struct bpf_map *map;
+ __u32 duration = 0;
+ size_t sz;
+
+ /* Open only; loading is deferred until the initial value is set */
+ obj = bpf_object__open_file(file, NULL);
+ err = libbpf_get_error(obj);
+ if (CHECK_FAIL(err))
+ return;
+
+ map = bpf_object__find_map_by_name(obj, "test_glo.rodata");
+ if (CHECK_FAIL(!map || !bpf_map__is_internal(map)))
+ goto out;
+
+ sz = bpf_map__def(map)->value_size;
+ newval = malloc(sz);
+ if (CHECK_FAIL(!newval))
+ goto out;
+
+ memset(newval, 0, sz);
+ /* wrong size, should fail */
+ err = bpf_map__set_initial_value(map, newval, sz - 1);
+ if (CHECK(!err, "reject set initial value wrong size", "err %d\n", err))
+ goto out;
+
+ err = bpf_map__set_initial_value(map, newval, sz);
+ if (CHECK(err, "set initial value", "err %d\n", err))
+ goto out;
+
+ err = bpf_object__load(obj);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ map_fd = bpf_map__fd(map);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Read element 0 back; it has to equal the all-zero override */
+ buff = malloc(sz);
+ if (buff)
+ err = bpf_map_lookup_elem(map_fd, &zero, buff);
+ if (CHECK(!buff || err || memcmp(buff, newval, sz),
+ "compare .rodata map data override",
+ "err %d errno %d\n", err, errno))
+ goto out;
+
+ memset(newval, 1, sz);
+ /* object loaded - should fail */
+ err = bpf_map__set_initial_value(map, newval, sz);
+ CHECK(!err, "reject set initial value after load", "err %d\n", err);
+out:
+ /* buff and newval start out NULL, so free() is fine on every path */
+ free(buff);
+ free(newval);
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/hashmap.c b/tools/testing/selftests/bpf/prog_tests/hashmap.c
new file mode 100644
index 000000000..428d48883
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/hashmap.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * Tests for libbpf's hashmap.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+#include "test_progs.h"
+#include "bpf/hashmap.h"
+
+static int duration = 0;
+
+/* Identity hash: the key pointer's integer value is the hash; @ctx is unused. */
+static size_t hash_fn(const void *k, void *ctx)
+{
+	long key = (long)k;
+
+	return key;
+}
+
+/* Keys are equal when their pointer values are equal; @ctx is unused. */
+static bool equal_fn(const void *a, const void *b, void *ctx)
+{
+	long lhs = (long)a, rhs = (long)b;
+
+	return lhs == rhs;
+}
+
+/* Return the smallest power of two that is >= @n (1 for @n of 0 or 1). */
+static inline size_t next_pow_2(size_t n)
+{
+	size_t pow;
+
+	for (pow = 1; pow < n; pow <<= 1)
+		;
+	return pow;
+}
+
+/*
+ * Capacity the tests expect for a map holding @sz elements: the next
+ * power of two of @sz, doubled when sz * 4 / 3 exceeds it.
+ */
+static inline size_t exp_cap(size_t sz)
+{
+	size_t cap = next_pow_2(sz);
+
+	return (sz * 4 / 3 > cap) ? cap << 1 : cap;
+}
+
+#define ELEM_CNT 62
+
+/*
+ * Subtest "generic": drive one hashmap through insert, lookup, update,
+ * iteration, per-key iteration, deletion and clear with ELEM_CNT keys
+ * (0 .. ELEM_CNT - 1), checking size and capacity along the way.
+ * Iteration results are collected in found_msk, one bit per key.
+ */
+static void test_hashmap_generic(void)
+{
+ struct hashmap_entry *entry, *tmp;
+ int err, bkt, found_cnt, i;
+ long long found_msk;
+ struct hashmap *map;
+
+ map = hashmap__new(hash_fn, equal_fn, NULL);
+ if (CHECK(IS_ERR(map), "hashmap__new",
+ "failed to create map: %ld\n", PTR_ERR(map)))
+ return;
+
+ /* Insert key i -> value 1024 + i */
+ for (i = 0; i < ELEM_CNT; i++) {
+ const void *oldk, *k = (const void *)(long)i;
+ void *oldv, *v = (void *)(long)(1024 + i);
+
+ /* update of a key that is not present yet must give -ENOENT */
+ err = hashmap__update(map, k, v, &oldk, &oldv);
+ if (CHECK(err != -ENOENT, "hashmap__update",
+ "unexpected result: %d\n", err))
+ goto cleanup;
+
+ /* odd keys go in via add, even keys via set */
+ if (i % 2) {
+ err = hashmap__add(map, k, v);
+ } else {
+ err = hashmap__set(map, k, v, &oldk, &oldv);
+ if (CHECK(oldk != NULL || oldv != NULL, "check_kv",
+ "unexpected k/v: %p=%p\n", oldk, oldv))
+ goto cleanup;
+ }
+
+ if (CHECK(err, "elem_add", "failed to add k/v %ld = %ld: %d\n",
+ (long)k, (long)v, err))
+ goto cleanup;
+
+ if (CHECK(!hashmap__find(map, k, &oldv), "elem_find",
+ "failed to find key %ld\n", (long)k))
+ goto cleanup;
+ if (CHECK(oldv != v, "elem_val",
+ "found value is wrong: %ld\n", (long)oldv))
+ goto cleanup;
+ }
+
+ if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size",
+ "invalid map size: %zu\n", hashmap__size(map)))
+ goto cleanup;
+ if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap_cap",
+ "unexpected map capacity: %zu\n", hashmap__capacity(map)))
+ goto cleanup;
+
+ /* Full iteration must visit every key exactly with value key + 1024 */
+ found_msk = 0;
+ hashmap__for_each_entry(map, entry, bkt) {
+ long k = (long)entry->key;
+ long v = (long)entry->value;
+
+ found_msk |= 1ULL << k;
+ if (CHECK(v - k != 1024, "check_kv",
+ "invalid k/v pair: %ld = %ld\n", k, v))
+ goto cleanup;
+ }
+ if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt",
+ "not all keys iterated: %llx\n", found_msk))
+ goto cleanup;
+
+ /* Overwrite every value with 256 + i */
+ for (i = 0; i < ELEM_CNT; i++) {
+ const void *oldk, *k = (const void *)(long)i;
+ void *oldv, *v = (void *)(long)(256 + i);
+
+ /* add of a key that already exists must give -EEXIST */
+ err = hashmap__add(map, k, v);
+ if (CHECK(err != -EEXIST, "hashmap__add",
+ "unexpected add result: %d\n", err))
+ goto cleanup;
+
+ if (i % 2)
+ err = hashmap__update(map, k, v, &oldk, &oldv);
+ else
+ err = hashmap__set(map, k, v, &oldk, &oldv);
+
+ if (CHECK(err, "elem_upd",
+ "failed to update k/v %ld = %ld: %d\n",
+ (long)k, (long)v, err))
+ goto cleanup;
+ if (CHECK(!hashmap__find(map, k, &oldv), "elem_find",
+ "failed to find key %ld\n", (long)k))
+ goto cleanup;
+ if (CHECK(oldv != v, "elem_val",
+ "found value is wrong: %ld\n", (long)oldv))
+ goto cleanup;
+ }
+
+ if (CHECK(hashmap__size(map) != ELEM_CNT, "hashmap__size",
+ "invalid updated map size: %zu\n", hashmap__size(map)))
+ goto cleanup;
+ if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap__capacity",
+ "unexpected map capacity: %zu\n", hashmap__capacity(map)))
+ goto cleanup;
+
+ /* Iterate again, this time with the _safe variant */
+ found_msk = 0;
+ hashmap__for_each_entry_safe(map, entry, tmp, bkt) {
+ long k = (long)entry->key;
+ long v = (long)entry->value;
+
+ found_msk |= 1ULL << k;
+ if (CHECK(v - k != 256, "elem_check",
+ "invalid updated k/v pair: %ld = %ld\n", k, v))
+ goto cleanup;
+ }
+ if (CHECK(found_msk != (1ULL << ELEM_CNT) - 1, "elem_cnt",
+ "not all keys iterated after update: %llx\n", found_msk))
+ goto cleanup;
+
+ /* Per-key iteration for key 0 has to find at least one entry */
+ found_cnt = 0;
+ hashmap__for_each_key_entry(map, entry, (void *)0) {
+ found_cnt++;
+ }
+ if (CHECK(!found_cnt, "found_cnt",
+ "didn't find any entries for key 0\n"))
+ goto cleanup;
+
+ /* Delete the entries for key 0 while iterating over them */
+ found_msk = 0;
+ found_cnt = 0;
+ hashmap__for_each_key_entry_safe(map, entry, tmp, (void *)0) {
+ const void *oldk, *k;
+ void *oldv, *v;
+
+ k = entry->key;
+ v = entry->value;
+
+ found_cnt++;
+ found_msk |= 1ULL << (long)k;
+
+ if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del",
+ "failed to delete k/v %ld = %ld\n",
+ (long)k, (long)v))
+ goto cleanup;
+ if (CHECK(oldk != k || oldv != v, "check_old",
+ "invalid deleted k/v: expected %ld = %ld, got %ld = %ld\n",
+ (long)k, (long)v, (long)oldk, (long)oldv))
+ goto cleanup;
+ /* deleting the same key a second time must not succeed */
+ if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del",
+ "unexpectedly deleted k/v %ld = %ld\n",
+ (long)oldk, (long)oldv))
+ goto cleanup;
+ }
+
+ if (CHECK(!found_cnt || !found_msk, "found_entries",
+ "didn't delete any key entries\n"))
+ goto cleanup;
+ if (CHECK(hashmap__size(map) != ELEM_CNT - found_cnt, "elem_cnt",
+ "invalid updated map size (already deleted: %d): %zu\n",
+ found_cnt, hashmap__size(map)))
+ goto cleanup;
+ if (CHECK(hashmap__capacity(map) != exp_cap(hashmap__size(map)),
+ "hashmap__capacity",
+ "unexpected map capacity: %zu\n", hashmap__capacity(map)))
+ goto cleanup;
+
+ /*
+ * Delete everything that is left. found_cnt and found_msk are not
+ * reset here, so together with the loop above they have to cover
+ * all ELEM_CNT keys.
+ */
+ hashmap__for_each_entry_safe(map, entry, tmp, bkt) {
+ const void *oldk, *k;
+ void *oldv, *v;
+
+ k = entry->key;
+ v = entry->value;
+
+ found_cnt++;
+ found_msk |= 1ULL << (long)k;
+
+ if (CHECK(!hashmap__delete(map, k, &oldk, &oldv), "elem_del",
+ "failed to delete k/v %ld = %ld\n",
+ (long)k, (long)v))
+ goto cleanup;
+ if (CHECK(oldk != k || oldv != v, "elem_check",
+ "invalid old k/v: expect %ld = %ld, got %ld = %ld\n",
+ (long)k, (long)v, (long)oldk, (long)oldv))
+ goto cleanup;
+ if (CHECK(hashmap__delete(map, k, &oldk, &oldv), "elem_del",
+ "unexpectedly deleted k/v %ld = %ld\n",
+ (long)k, (long)v))
+ goto cleanup;
+ }
+
+ if (CHECK(found_cnt != ELEM_CNT || found_msk != (1ULL << ELEM_CNT) - 1,
+ "found_cnt",
+ "not all keys were deleted: found_cnt:%d, found_msk:%llx\n",
+ found_cnt, found_msk))
+ goto cleanup;
+ if (CHECK(hashmap__size(map) != 0, "hashmap__size",
+ "invalid updated map size (already deleted: %d): %zu\n",
+ found_cnt, hashmap__size(map)))
+ goto cleanup;
+
+ /* The map is empty now: any iterated entry is a failure */
+ found_cnt = 0;
+ hashmap__for_each_entry(map, entry, bkt) {
+ CHECK(false, "elem_exists",
+ "unexpected map entries left: %ld = %ld\n",
+ (long)entry->key, (long)entry->value);
+ goto cleanup;
+ }
+
+ /* Same expectation after an explicit clear */
+ hashmap__clear(map);
+ hashmap__for_each_entry(map, entry, bkt) {
+ CHECK(false, "elem_exists",
+ "unexpected map entries left: %ld = %ld\n",
+ (long)entry->key, (long)entry->value);
+ goto cleanup;
+ }
+
+cleanup:
+ hashmap__free(map);
+}
+
+/* Hash every key to 0 so that all entries collide; both arguments are unused. */
+static size_t collision_hash_fn(const void *k, void *ctx)
+{
+	const size_t same_hash = 0;
+
+	return same_hash;
+}
+
+/*
+ * Subtest "multimap": append three values under each of two keys in a
+ * map whose hash function returns 0 for every key, then check that full
+ * iteration sees all six values and per-key iteration sees exactly the
+ * values of that key. Values are distinct bits, so they can be OR-ed
+ * into found_msk.
+ */
+static void test_hashmap_multimap(void)
+{
+ void *k1 = (void *)0, *k2 = (void *)1;
+ struct hashmap_entry *entry;
+ struct hashmap *map;
+ long found_msk;
+ int err, bkt;
+
+ /* force collisions */
+ map = hashmap__new(collision_hash_fn, equal_fn, NULL);
+ if (CHECK(IS_ERR(map), "hashmap__new",
+ "failed to create map: %ld\n", PTR_ERR(map)))
+ return;
+
+ /* set up multimap:
+ * [0] -> 1, 2, 4;
+ * [1] -> 8, 16, 32;
+ */
+ err = hashmap__append(map, k1, (void *)1);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+ err = hashmap__append(map, k1, (void *)2);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+ err = hashmap__append(map, k1, (void *)4);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+
+ err = hashmap__append(map, k2, (void *)8);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+ err = hashmap__append(map, k2, (void *)16);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+ err = hashmap__append(map, k2, (void *)32);
+ if (CHECK(err, "elem_add", "failed to add k/v: %d\n", err))
+ goto cleanup;
+
+ if (CHECK(hashmap__size(map) != 6, "hashmap_size",
+ "invalid map size: %zu\n", hashmap__size(map)))
+ goto cleanup;
+
+ /* verify global iteration still works and sees all values */
+ found_msk = 0;
+ hashmap__for_each_entry(map, entry, bkt) {
+ found_msk |= (long)entry->value;
+ }
+ if (CHECK(found_msk != (1 << 6) - 1, "found_msk",
+ "not all keys iterated: %lx\n", found_msk))
+ goto cleanup;
+
+ /* iterate values for the first key (k1) */
+ found_msk = 0;
+ hashmap__for_each_key_entry(map, entry, k1) {
+ found_msk |= (long)entry->value;
+ }
+ if (CHECK(found_msk != (1 | 2 | 4), "found_msk",
+ "invalid k1 values: %lx\n", found_msk))
+ goto cleanup;
+
+ /* iterate values for the second key (k2) */
+ found_msk = 0;
+ hashmap__for_each_key_entry(map, entry, k2) {
+ found_msk |= (long)entry->value;
+ }
+ if (CHECK(found_msk != (8 | 16 | 32), "found_msk",
+ "invalid k2 values: %lx\n", found_msk))
+ goto cleanup;
+
+cleanup:
+ hashmap__free(map);
+}
+
+/*
+ * Subtest "empty": a freshly created map must report size 0 and
+ * capacity 0, must not find or delete key 0, and must not yield any
+ * entry from either the full or the per-key iterator.
+ */
+static void test_hashmap_empty(void)
+{
+	struct hashmap_entry *entry;
+	int bkt;
+	struct hashmap *map;
+	void *k = (void *)0;
+
+	/* regular identity hash; nothing is ever inserted in this subtest */
+	map = hashmap__new(hash_fn, equal_fn, NULL);
+	if (CHECK(IS_ERR(map), "hashmap__new",
+		  "failed to create map: %ld\n", PTR_ERR(map)))
+		/* no map to free; same as the other hashmap subtests */
+		return;
+
+	if (CHECK(hashmap__size(map) != 0, "hashmap__size",
+		  "invalid map size: %zu\n", hashmap__size(map)))
+		goto cleanup;
+	if (CHECK(hashmap__capacity(map) != 0, "hashmap__capacity",
+		  "invalid map capacity: %zu\n", hashmap__capacity(map)))
+		goto cleanup;
+	if (CHECK(hashmap__find(map, k, NULL), "elem_find",
+		  "unexpected find\n"))
+		goto cleanup;
+	if (CHECK(hashmap__delete(map, k, NULL, NULL), "elem_del",
+		  "unexpected delete\n"))
+		goto cleanup;
+
+	hashmap__for_each_entry(map, entry, bkt) {
+		CHECK(false, "elem_found", "unexpected iterated entry\n");
+		goto cleanup;
+	}
+	hashmap__for_each_key_entry(map, entry, k) {
+		CHECK(false, "key_found", "unexpected key entry\n");
+		goto cleanup;
+	}
+
+cleanup:
+	hashmap__free(map);
+}
+
+/*
+ * Test entry point: run the "generic", "multimap" and "empty" hashmap
+ * subtests, each only if test__start_subtest() enables it.
+ */
+void test_hashmap()
+{
+	static const struct {
+		const char *name;
+		void (*run)(void);
+	} subtests[] = {
+		{ "generic", test_hashmap_generic },
+		{ "multimap", test_hashmap_multimap },
+		{ "empty", test_hashmap_empty },
+	};
+	size_t n;
+
+	for (n = 0; n < sizeof(subtests) / sizeof(subtests[0]); n++) {
+		if (test__start_subtest(subtests[n].name))
+			subtests[n].run();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/kfree_skb.c b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
new file mode 100644
index 000000000..42c3a3103
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/kfree_skb.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Header that on_sample() expects at the start of every perf sample;
+ * the packet bytes follow directly after it.
+ */
+struct meta {
+ /* on_sample() requires this to be 1 */
+ int ifindex;
+ /* compared against cb.cb32[0] */
+ __u32 cb32_0;
+ /* compared against cb.cb8[0] */
+ __u8 cb8_0;
+};
+
+/*
+ * Control-block pattern: test_kfree_skb() copies it into skb.cb before
+ * the test run and on_sample() compares the sample metadata against it.
+ * The union allows the same 20 bytes to be read as words or as bytes.
+ */
+static union {
+ __u32 cb32[5];
+ __u8 cb8[20];
+} cb = {
+ .cb32[0] = 0x81828384,
+};
+
+/*
+ * Perf buffer sample callback: validate one sample and report success
+ * through @ctx.
+ *
+ * @ctx:  points to a bool that is set to true when every check passes
+ * @cpu:  unused
+ * @data: struct meta immediately followed by the packet bytes
+ * @size: sample size in bytes; must be 72 + sizeof(struct meta)
+ *
+ * Returns at the first failing check and leaves *@ctx untouched.
+ */
+static void on_sample(void *ctx, int cpu, void *data, __u32 size)
+{
+	struct meta *hdr = (struct meta *)data;
+	struct ipv6_packet *pkt = data + sizeof(*hdr);
+	bool *passed = ctx;
+	int duration = 0;
+
+	if (CHECK(size != 72 + sizeof(*hdr), "check_size", "size %u != %zu\n",
+		  size, 72 + sizeof(*hdr)))
+		return;
+
+	/* spurious kfree_skb not on loopback device */
+	if (CHECK(hdr->ifindex != 1, "check_meta_ifindex",
+		  "meta->ifindex = %d\n", hdr->ifindex))
+		return;
+
+	/* control block must carry the pattern from the global cb */
+	if (CHECK(hdr->cb8_0 != cb.cb8[0], "check_cb8_0", "cb8_0 %x != %x\n",
+		  hdr->cb8_0, cb.cb8[0]))
+		return;
+	if (CHECK(hdr->cb32_0 != cb.cb32[0], "check_cb32_0",
+		  "cb32_0 %x != %x\n",
+		  hdr->cb32_0, cb.cb32[0]))
+		return;
+
+	/* spot-check one field of each packet header */
+	if (CHECK(pkt->eth.h_proto != 0xdd86, "check_eth",
+		  "h_proto %x\n", pkt->eth.h_proto))
+		return;
+	if (CHECK(pkt->iph.nexthdr != 6, "check_ip",
+		  "iph.nexthdr %x\n", pkt->iph.nexthdr))
+		return;
+	if (CHECK(pkt->tcp.doff != 5, "check_tcp",
+		  "tcp.doff %x\n", pkt->tcp.doff))
+		return;
+
+	*passed = true;
+}
+
+/*
+ * Test entry point: load ./test_pkt_access.o (sched_cls) and
+ * ./kfree_skb.o, attach the tp_btf/kfree_skb program plus the fentry and
+ * fexit programs for eth_type_trans, run the sched_cls program once on
+ * pkt_v6 with skb.cb taken from the global cb, then expect
+ *  - on_sample() to accept a sample from "perf_buf_map", and
+ *  - both bools read from the "kfree_sk.bss" map to be true.
+ */
+void test_kfree_skb(void)
+{
+ struct __sk_buff skb = {};
+ struct bpf_prog_test_run_attr tattr = {
+ .data_in = &pkt_v6,
+ .data_size_in = sizeof(pkt_v6),
+ .ctx_in = &skb,
+ .ctx_size_in = sizeof(skb),
+ };
+ struct bpf_prog_load_attr attr = {
+ .file = "./kfree_skb.o",
+ };
+
+ struct bpf_link *link = NULL, *link_fentry = NULL, *link_fexit = NULL;
+ struct bpf_map *perf_buf_map, *global_data;
+ struct bpf_program *prog, *fentry, *fexit;
+ struct bpf_object *obj, *obj2 = NULL;
+ struct perf_buffer_opts pb_opts = {};
+ struct perf_buffer *pb = NULL;
+ int err, kfree_skb_fd;
+ bool passed = false;
+ __u32 duration = 0;
+ const int zero = 0;
+ bool test_ok[2];
+
+ /* obj provides the program that is test-run (its FD goes into tattr) */
+ err = bpf_prog_load("./test_pkt_access.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &tattr.prog_fd);
+ if (CHECK(err, "prog_load sched cls", "err %d errno %d\n", err, errno))
+ return;
+
+ /* obj2 provides the tracing programs and the maps inspected below */
+ err = bpf_prog_load_xattr(&attr, &obj2, &kfree_skb_fd);
+ if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+ goto close_prog;
+
+ prog = bpf_object__find_program_by_title(obj2, "tp_btf/kfree_skb");
+ if (CHECK(!prog, "find_prog", "prog kfree_skb not found\n"))
+ goto close_prog;
+ fentry = bpf_object__find_program_by_title(obj2, "fentry/eth_type_trans");
+ if (CHECK(!fentry, "find_prog", "prog eth_type_trans not found\n"))
+ goto close_prog;
+ fexit = bpf_object__find_program_by_title(obj2, "fexit/eth_type_trans");
+ if (CHECK(!fexit, "find_prog", "prog eth_type_trans not found\n"))
+ goto close_prog;
+
+ global_data = bpf_object__find_map_by_name(obj2, "kfree_sk.bss");
+ if (CHECK(!global_data, "find global data", "not found\n"))
+ goto close_prog;
+
+ link = bpf_program__attach_raw_tracepoint(prog, NULL);
+ if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n", PTR_ERR(link)))
+ goto close_prog;
+ link_fentry = bpf_program__attach_trace(fentry);
+ if (CHECK(IS_ERR(link_fentry), "attach fentry", "err %ld\n",
+ PTR_ERR(link_fentry)))
+ goto close_prog;
+ link_fexit = bpf_program__attach_trace(fexit);
+ if (CHECK(IS_ERR(link_fexit), "attach fexit", "err %ld\n",
+ PTR_ERR(link_fexit)))
+ goto close_prog;
+
+ perf_buf_map = bpf_object__find_map_by_name(obj2, "perf_buf_map");
+ if (CHECK(!perf_buf_map, "find_perf_buf_map", "not found\n"))
+ goto close_prog;
+
+ /* set up perf buffer */
+ pb_opts.sample_cb = on_sample;
+ pb_opts.ctx = &passed;
+ pb = perf_buffer__new(bpf_map__fd(perf_buf_map), 1, &pb_opts);
+ if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+ goto close_prog;
+
+ /* Seed the control block that on_sample() later compares against */
+ memcpy(skb.cb, &cb, sizeof(cb));
+ err = bpf_prog_test_run_xattr(&tattr);
+ duration = tattr.duration;
+ CHECK(err || tattr.retval, "ipv6",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, tattr.retval, duration);
+
+ /* read perf buffer */
+ err = perf_buffer__poll(pb, 100);
+ if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+ goto close_prog;
+
+ /* make sure kfree_skb program was triggered
+ * and it sent expected skb into the perf buffer
+ */
+ CHECK_FAIL(!passed);
+
+ /* Element 0 of the .bss map is read as two bool flags */
+ err = bpf_map_lookup_elem(bpf_map__fd(global_data), &zero, test_ok);
+ if (CHECK(err, "get_result",
+ "failed to get output data: %d\n", err))
+ goto close_prog;
+
+ CHECK_FAIL(!test_ok[0] || !test_ok[1]);
+close_prog:
+ /* pb and the links may still be NULL or an error pointer here */
+ perf_buffer__free(pb);
+ if (!IS_ERR_OR_NULL(link))
+ bpf_link__destroy(link);
+ if (!IS_ERR_OR_NULL(link_fentry))
+ bpf_link__destroy(link_fentry);
+ if (!IS_ERR_OR_NULL(link_fexit))
+ bpf_link__destroy(link_fexit);
+ bpf_object__close(obj);
+ bpf_object__close(obj2);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms.c b/tools/testing/selftests/bpf/prog_tests/ksyms.c
new file mode 100644
index 000000000..b295969b2
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ksyms.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <test_progs.h>
+#include "test_ksyms.skel.h"
+#include <sys/stat.h>
+
+static int duration;
+
+/* Compare the values exported through the test_ksyms skeleton's data section
+ * with what user space observes: the symbol addresses returned by
+ * kallsyms_find() and the st_size of /sys/kernel/btf/vmlinux.
+ */
+void test_ksyms(void)
+{
+	const char *btf_path = "/sys/kernel/btf/vmlinux";
+	struct test_ksyms *skel;
+	struct test_ksyms__data *data;
+	__u64 link_fops_addr, per_cpu_start_addr;
+	struct stat st;
+	__u64 btf_size;
+	int err;
+
+	err = kallsyms_find("bpf_link_fops", &link_fops_addr);
+	if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+		return;
+	if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n"))
+		return;
+
+	err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr);
+	if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+		return;
+	/* report the exact symbol name that was looked up */
+	if (CHECK(err == -ENOENT, "ksym_find", "symbol '__per_cpu_start' not found\n"))
+		return;
+
+	if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno))
+		return;
+	btf_size = st.st_size;
+
+	skel = test_ksyms__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n"))
+		return;
+
+	err = test_ksyms__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+		goto cleanup;
+
+	/* trigger tracepoint */
+	usleep(1);
+
+	data = skel->data;
+	CHECK(data->out__bpf_link_fops != link_fops_addr, "bpf_link_fops",
+	      "got 0x%llx, exp 0x%llx\n",
+	      data->out__bpf_link_fops, link_fops_addr);
+	CHECK(data->out__bpf_link_fops1 != 0, "bpf_link_fops1",
+	      "got %llu, exp %llu\n", data->out__bpf_link_fops1, (__u64)0);
+	CHECK(data->out__btf_size != btf_size, "btf_size",
+	      "got %llu, exp %llu\n", data->out__btf_size, btf_size);
+	CHECK(data->out__per_cpu_start != per_cpu_start_addr, "__per_cpu_start",
+	      "got %llu, exp %llu\n", data->out__per_cpu_start,
+	      per_cpu_start_addr);
+
+cleanup:
+	test_ksyms__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
new file mode 100644
index 000000000..b58b775d1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Google */
+
+#include <test_progs.h>
+#include <bpf/libbpf.h>
+#include <bpf/btf.h>
+#include "test_ksyms_btf.skel.h"
+#include "test_ksyms_btf_null_check.skel.h"
+
+static int duration;
+
+/* Load and attach the test_ksyms_btf skeleton, give its program a chance to
+ * run, then compare the values it stored in the data section with the
+ * symbol addresses reported by kallsyms_find().
+ */
+static void test_basic(void)
+{
+	struct test_ksyms_btf *skel = NULL;
+	struct test_ksyms_btf__data *out;
+	__u64 rq_addr, prog_active_addr;
+	int cur_prog_active;
+	__u32 cur_rq_cpu;
+	int rc;
+
+	rc = kallsyms_find("runqueues", &rq_addr);
+	if (CHECK(rc == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+		return;
+	if (CHECK(rc == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n"))
+		return;
+
+	rc = kallsyms_find("bpf_prog_active", &prog_active_addr);
+	if (CHECK(rc == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
+		return;
+	if (CHECK(rc == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n"))
+		return;
+
+	skel = test_ksyms_btf__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n"))
+		goto cleanup;
+
+	rc = test_ksyms_btf__attach(skel);
+	if (CHECK(rc, "skel_attach", "skeleton attach failed: %d\n", rc))
+		goto cleanup;
+
+	/* trigger tracepoint */
+	usleep(1);
+
+	out = skel->data;
+
+	/* addresses seen by the program must match kallsyms */
+	CHECK(out->out__runqueues_addr != rq_addr, "runqueues_addr",
+	      "got %llu, exp %llu\n",
+	      (unsigned long long)out->out__runqueues_addr,
+	      (unsigned long long)rq_addr);
+	CHECK(out->out__bpf_prog_active_addr != prog_active_addr, "bpf_prog_active_addr",
+	      "got %llu, exp %llu\n",
+	      (unsigned long long)out->out__bpf_prog_active_addr,
+	      (unsigned long long)prog_active_addr);
+
+	/* sanity of the values read through the ksym pointers */
+	CHECK(out->out__rq_cpu == -1, "rq_cpu",
+	      "got %u, exp != -1\n", out->out__rq_cpu);
+	CHECK(out->out__bpf_prog_active < 0, "bpf_prog_active",
+	      "got %d, exp >= 0\n", out->out__bpf_prog_active);
+	CHECK(out->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu",
+	      "got %u, exp 0\n", out->out__cpu_0_rq_cpu);
+
+	/* the "this_*" outputs must agree with their counterparts above */
+	cur_rq_cpu = out->out__this_rq_cpu;
+	CHECK(cur_rq_cpu != out->out__rq_cpu, "this_rq_cpu",
+	      "got %u, exp %u\n", cur_rq_cpu, out->out__rq_cpu);
+
+	cur_prog_active = out->out__this_bpf_prog_active;
+	CHECK(cur_prog_active != out->out__bpf_prog_active, "this_bpf_prog_active",
+	      "got %d, exp %d\n", cur_prog_active,
+	      out->out__bpf_prog_active);
+
+cleanup:
+	test_ksyms_btf__destroy(skel);
+}
+
+/* Loading test_ksyms_btf_null_check is expected to fail; a non-NULL skeleton
+ * means a program missing its null check was accepted.
+ */
+static void test_null_check(void)
+{
+	struct test_ksyms_btf_null_check *loaded =
+		test_ksyms_btf_null_check__open_and_load();
+
+	CHECK(loaded, "skel_open", "unexpected load of a prog missing null check\n");
+
+	test_ksyms_btf_null_check__destroy(loaded);
+}
+
+/* Entry point: skip when kernel BTF has no ".data..percpu" DATASEC,
+ * otherwise run the "basic" and "null_check" subtests.
+ */
+void test_ksyms_btf(void)
+{
+	struct btf *vmlinux_btf;
+	int datasec_id;
+
+	vmlinux_btf = libbpf_find_kernel_btf();
+	if (CHECK(IS_ERR(vmlinux_btf), "btf_exists", "failed to load kernel BTF: %ld\n",
+		  PTR_ERR(vmlinux_btf)))
+		return;
+
+	/* only the lookup result is needed, so release the BTF right away */
+	datasec_id = btf__find_by_name_kind(vmlinux_btf, ".data..percpu",
+					    BTF_KIND_DATASEC);
+	btf__free(vmlinux_btf);
+
+	if (datasec_id < 0) {
+		printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n",
+		       __func__);
+		test__skip();
+		return;
+	}
+
+	if (test__start_subtest("basic"))
+		test_basic();
+
+	if (test__start_subtest("null_check"))
+		test_null_check();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/l4lb_all.c b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
new file mode 100644
index 000000000..807310554
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/l4lb_all.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Load @file as a SCHED_CLS program, seed its vip_map, ch_rings and reals
+ * maps, run it NUM_ITER times on an IPv4 and an IPv6 packet, and verify the
+ * return code, output size, the magic value at the start of the output
+ * buffer, and the per-CPU counters accumulated in the "stats" map.
+ *
+ * Map update/lookup failures are reported as test failures instead of being
+ * ignored, so the stats array is never summed uninitialized.
+ */
+static void test_l4lb(const char *file)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	struct vip key = {.protocol = 6};
+	struct vip_meta {
+		__u32 flags;
+		__u32 vip_num;
+	} value = {.vip_num = VIP_NUM};
+	__u32 stats_key = VIP_NUM;
+	struct vip_stats {
+		__u64 bytes;
+		__u64 pkts;
+	} stats[nr_cpus];
+	struct real_definition {
+		union {
+			__be32 dst;
+			__be32 dstv6[4];
+		};
+		__u8 flags;
+	} real_def = {.dst = MAGIC_VAL};
+	__u32 ch_key = 11, real_num = 3;
+	__u32 duration, retval, size;
+	int err, i, prog_fd, map_fd;
+	__u64 bytes = 0, pkts = 0;
+	struct bpf_object *obj;
+	char buf[128];
+	u32 *magic = (u32 *)buf;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	map_fd = bpf_find_map(__func__, obj, "vip_map");
+	if (map_fd < 0)
+		goto out;
+	if (CHECK_FAIL(bpf_map_update_elem(map_fd, &key, &value, 0)))
+		goto out;
+
+	map_fd = bpf_find_map(__func__, obj, "ch_rings");
+	if (map_fd < 0)
+		goto out;
+	if (CHECK_FAIL(bpf_map_update_elem(map_fd, &ch_key, &real_num, 0)))
+		goto out;
+
+	map_fd = bpf_find_map(__func__, obj, "reals");
+	if (map_fd < 0)
+		goto out;
+	if (CHECK_FAIL(bpf_map_update_elem(map_fd, &real_num, &real_def, 0)))
+		goto out;
+
+	err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v4, sizeof(pkt_v4),
+				buf, &size, &retval, &duration);
+	CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 54 ||
+	      *magic != MAGIC_VAL, "ipv4",
+	      "err %d errno %d retval %d size %d magic %x\n",
+	      err, errno, retval, size, *magic);
+
+	err = bpf_prog_test_run(prog_fd, NUM_ITER, &pkt_v6, sizeof(pkt_v6),
+				buf, &size, &retval, &duration);
+	CHECK(err || retval != 7/*TC_ACT_REDIRECT*/ || size != 74 ||
+	      *magic != MAGIC_VAL, "ipv6",
+	      "err %d errno %d retval %d size %d magic %x\n",
+	      err, errno, retval, size, *magic);
+
+	map_fd = bpf_find_map(__func__, obj, "stats");
+	if (map_fd < 0)
+		goto out;
+	/* a failed lookup would leave stats[] uninitialized */
+	if (CHECK_FAIL(bpf_map_lookup_elem(map_fd, &stats_key, stats)))
+		goto out;
+	for (i = 0; i < nr_cpus; i++) {
+		bytes += stats[i].bytes;
+		pkts += stats[i].pkts;
+	}
+	/* both runs (v4 + v6) contribute NUM_ITER packets each */
+	if (CHECK_FAIL(bytes != MAGIC_BYTES * NUM_ITER * 2 ||
+		       pkts != NUM_ITER * 2))
+		printf("test_l4lb:FAIL:stats %lld %lld\n", bytes, pkts);
+out:
+	bpf_object__close(obj);
+}
+
+/* Run test_l4lb() once per object file, each under its own subtest name. */
+void test_l4lb_all(void)
+{
+	static const struct {
+		const char *subtest;
+		const char *object;
+	} variants[] = {
+		{ "l4lb_inline",   "test_l4lb.o" },
+		{ "l4lb_noinline", "test_l4lb_noinline.o" },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(variants) / sizeof(variants[0]); idx++) {
+		if (test__start_subtest(variants[idx].subtest))
+			test_l4lb(variants[idx].object);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/link_pinning.c b/tools/testing/selftests/bpf/prog_tests/link_pinning.c
new file mode 100644
index 000000000..a743288cf
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/link_pinning.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <sys/stat.h>
+
+#include "test_link_pinning.skel.h"
+
+static int duration = 0;
+
+/* Exercise the pin/unpin life cycle of a link attached from @prog.
+ *
+ * The program is expected to copy bss->in to bss->out whenever it fires, so
+ * each stage writes a new value to bss->in, sleeps briefly and checks that
+ * bss->out followed. Stages: attached, pinned, original link destroyed
+ * (pin keeps it alive), re-opened from the pin path, unpinned (open link
+ * keeps it alive), finally destroyed (output must stop being updated).
+ *
+ * The pin path is removed on every exit so that a failure between pin and
+ * unpin does not leave a stale file behind for the next subtest.
+ */
+void test_link_pinning_subtest(struct bpf_program *prog,
+			       struct test_link_pinning__bss *bss)
+{
+	const char *link_pin_path = "/sys/fs/bpf/pinned_link_test";
+	struct stat statbuf = {};
+	struct bpf_link *link;
+	int err, i;
+
+	link = bpf_program__attach(prog);
+	if (CHECK(IS_ERR(link), "link_attach", "err: %ld\n", PTR_ERR(link)))
+		goto cleanup;
+
+	bss->in = 1;
+	usleep(1);
+	CHECK(bss->out != 1, "res_check1", "exp %d, got %d\n", 1, bss->out);
+
+	/* pin link */
+	err = bpf_link__pin(link, link_pin_path);
+	if (CHECK(err, "link_pin", "err: %d\n", err))
+		goto cleanup;
+
+	CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path1",
+	      "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link));
+
+	/* check that link was pinned */
+	err = stat(link_pin_path, &statbuf);
+	if (CHECK(err, "stat_link", "err %d errno %d\n", err, errno))
+		goto cleanup;
+
+	bss->in = 2;
+	usleep(1);
+	CHECK(bss->out != 2, "res_check2", "exp %d, got %d\n", 2, bss->out);
+
+	/* destroy link, pinned link should keep program attached */
+	bpf_link__destroy(link);
+	link = NULL;
+
+	bss->in = 3;
+	usleep(1);
+	CHECK(bss->out != 3, "res_check3", "exp %d, got %d\n", 3, bss->out);
+
+	/* re-open link from BPFFS */
+	link = bpf_link__open(link_pin_path);
+	if (CHECK(IS_ERR(link), "link_open", "err: %ld\n", PTR_ERR(link)))
+		goto cleanup;
+
+	CHECK(strcmp(link_pin_path, bpf_link__pin_path(link)), "pin_path2",
+	      "exp %s, got %s\n", link_pin_path, bpf_link__pin_path(link));
+
+	/* unpin link from BPFFS, program still attached */
+	err = bpf_link__unpin(link);
+	if (CHECK(err, "link_unpin", "err: %d\n", err))
+		goto cleanup;
+
+	/* still active, as we have FD open now */
+	bss->in = 4;
+	usleep(1);
+	CHECK(bss->out != 4, "res_check4", "exp %d, got %d\n", 4, bss->out);
+
+	bpf_link__destroy(link);
+	link = NULL;
+
+	/* Validate it's finally detached.
+	 * Actual detachment might get delayed a bit, so there is no reliable
+	 * way to validate it immediately here, let's count up for long enough
+	 * and see if eventually output stops being updated
+	 */
+	for (i = 5; i < 10000; i++) {
+		bss->in = i;
+		usleep(1);
+		if (bss->out == i - 1)
+			break;
+	}
+	CHECK(i == 10000, "link_attached", "got to iteration #%d\n", i);
+
+cleanup:
+	if (!IS_ERR(link))
+		bpf_link__destroy(link);
+	/* Best effort: drop a pin left over from an early failure. On the
+	 * success path the file is already gone and this fails harmlessly.
+	 */
+	unlink(link_pin_path);
+}
+
+/* Open and load the test_link_pinning skeleton once and run the pinning
+ * scenario against its raw_tp_prog and tp_btf_prog programs as separate
+ * subtests.
+ */
+void test_link_pinning(void)
+{
+	struct test_link_pinning *obj = test_link_pinning__open_and_load();
+
+	if (CHECK(!obj, "skel_open", "failed to open skeleton\n"))
+		return;
+
+	if (test__start_subtest("pin_raw_tp"))
+		test_link_pinning_subtest(obj->progs.raw_tp_prog, obj->bss);
+
+	if (test__start_subtest("pin_tp_btf"))
+		test_link_pinning_subtest(obj->progs.tp_btf_prog, obj->bss);
+
+	test_link_pinning__destroy(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
new file mode 100644
index 000000000..5a2a689db
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/load_bytes_relative.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Join a fresh cgroup, attach the load_bytes_relative.o program to its
+ * egress hook, open one TCP connection to a local server and check that the
+ * program stored 1 in element 0 of its "test_result" map.
+ */
+void test_load_bytes_relative(void)
+{
+	int server_fd, cgroup_fd, prog_fd, map_fd, client_fd;
+	int err;
+	struct bpf_object *obj;
+	struct bpf_program *prog;
+	struct bpf_map *test_result;
+	__u32 duration = 0;
+
+	__u32 map_key = 0;
+	__u32 map_value = 0;
+
+	cgroup_fd = test__join_cgroup("/load_bytes_relative");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
+
+	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (CHECK_FAIL(server_fd < 0))
+		goto close_cgroup_fd;
+
+	err = bpf_prog_load("./load_bytes_relative.o", BPF_PROG_TYPE_CGROUP_SKB,
+			    &obj, &prog_fd);
+	if (CHECK_FAIL(err))
+		goto close_server_fd;
+
+	test_result = bpf_object__find_map_by_name(obj, "test_result");
+	if (CHECK_FAIL(!test_result))
+		goto close_bpf_object;
+
+	map_fd = bpf_map__fd(test_result);
+	/* record the failure; bailing out silently would pass the test */
+	if (CHECK_FAIL(map_fd < 0))
+		goto close_bpf_object;
+
+	prog = bpf_object__find_program_by_name(obj, "load_bytes_relative");
+	if (CHECK_FAIL(!prog))
+		goto close_bpf_object;
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS,
+			      BPF_F_ALLOW_MULTI);
+	if (CHECK_FAIL(err))
+		goto close_bpf_object;
+
+	/* the connection only exists to generate egress traffic */
+	client_fd = connect_to_fd(server_fd, 0);
+	if (CHECK_FAIL(client_fd < 0))
+		goto close_bpf_object;
+	close(client_fd);
+
+	err = bpf_map_lookup_elem(map_fd, &map_key, &map_value);
+	if (CHECK_FAIL(err))
+		goto close_bpf_object;
+
+	CHECK(map_value != 1, "bpf", "bpf program returned failure");
+
+close_bpf_object:
+	bpf_object__close(obj);
+
+close_server_fd:
+	close(server_fd);
+
+close_cgroup_fd:
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c
new file mode 100644
index 000000000..14a31109d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_init.c
@@ -0,0 +1,214 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Tessares SA <http://www.tessares.net> */
+
+#include <test_progs.h>
+#include "test_map_init.skel.h"
+
+#define TEST_VALUE 0x1234
+#define FILL_VALUE 0xdeadbeef
+
+static int nr_cpus;
+static int duration;
+
+typedef unsigned long long map_key_t;
+typedef unsigned long long map_value_t;
+typedef struct {
+ map_value_t v; /* padding */
+} __bpf_percpu_val_align pcpu_map_value_t;
+
+
+/* Insert keys 1..num into map_fd, each with FILL_VALUE set for every CPU.
+ * Returns 0 on success, -1 as soon as one insertion fails.
+ */
+static int map_populate(int map_fd, int num)
+{
+	pcpu_map_value_t fill[nr_cpus];
+	map_key_t k;
+	int cpu;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		bpf_percpu(fill, cpu) = FILL_VALUE;
+
+	for (k = 1; k <= num; k++) {
+		int rc = bpf_map_update_elem(map_fd, &k, fill, BPF_NOEXIST);
+
+		if (!ASSERT_OK(rc, "bpf_map_update_elem"))
+			return -1;
+	}
+
+	return 0;
+}
+
+/* Open the test_map_init skeleton, override hashmap1's type and max_entries
+ * before loading, load it, and pre-populate the map with keys 1..populate.
+ *
+ * On success returns the skeleton and stores hashmap1's fd in *map_fd. The
+ * fd comes from bpf_map__fd() and belongs to the skeleton, so it must not be
+ * closed separately: destroying the skeleton releases it. Returns NULL, with
+ * the skeleton already destroyed, on any failure.
+ */
+static struct test_map_init *setup(enum bpf_map_type map_type, int map_sz,
+				   int *map_fd, int populate)
+{
+	struct test_map_init *skel;
+	int err;
+
+	skel = test_map_init__open();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return NULL;
+
+	err = bpf_map__set_type(skel->maps.hashmap1, map_type);
+	if (!ASSERT_OK(err, "bpf_map__set_type"))
+		goto error;
+
+	err = bpf_map__set_max_entries(skel->maps.hashmap1, map_sz);
+	if (!ASSERT_OK(err, "bpf_map__set_max_entries"))
+		goto error;
+
+	err = test_map_init__load(skel);
+	if (!ASSERT_OK(err, "skel_load"))
+		goto error;
+
+	*map_fd = bpf_map__fd(skel->maps.hashmap1);
+	if (CHECK(*map_fd < 0, "bpf_map__fd", "failed\n"))
+		goto error;
+
+	err = map_populate(*map_fd, populate);
+	if (!ASSERT_OK(err, "map_populate"))
+		goto error;
+
+	return skel;
+
+error:
+	/* also closes the map fd; closing it here first would double-close */
+	test_map_init__destroy(skel);
+	return NULL;
+}
+
+/* Hand key/value and our pid to the bpf program through its bss, attach it,
+ * issue one getpgid syscall so it runs, then detach again.
+ * Returns 0 on success, -1 if the attach failed.
+ */
+static int prog_run_insert_elem(struct test_map_init *skel, map_key_t key,
+				map_value_t value)
+{
+	skel->bss->inKey = key;
+	skel->bss->inValue = value;
+	skel->bss->inPid = getpid();
+
+	if (!ASSERT_OK(test_map_init__attach(skel), "skel_attach"))
+		return -1;
+
+	/* Let tracepoint trigger */
+	syscall(__NR_getpgid);
+
+	test_map_init__detach(skel);
+
+	return 0;
+}
+
+/* Verify a per-CPU value array: exactly one CPU slot must be non-zero and
+ * that slot must equal @expected. Returns 0 if so, -1 otherwise.
+ */
+static int check_values_one_cpu(pcpu_map_value_t *value, map_value_t expected)
+{
+	int cpu, nonzero = 0;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		map_value_t seen = bpf_percpu(value, cpu);
+
+		if (!seen)
+			continue;
+
+		if (CHECK(seen != expected, "map value",
+			  "unexpected for cpu %d: 0x%llx\n", cpu, seen))
+			return -1;
+		nonzero++;
+	}
+
+	if (CHECK(nonzero != 1, "map value", "set for %d CPUs instead of 1!\n",
+		  nonzero))
+		return -1;
+
+	return 0;
+}
+
+/* PERCPU_HASH scenario:
+ *  - map holds a single element, key=1, filled for every CPU
+ *  - key=1 is deleted from user space
+ *  - the bpf prog re-inserts key=1 with value 0x1234 (it can only write
+ *    the current CPU's slot)
+ *  - key=1 is looked up again: one CPU must hold the prog's value and all
+ *    the others must read 0
+ */
+static void test_pcpu_map_init(void)
+{
+	pcpu_map_value_t readback[nr_cpus];
+	struct test_map_init *skel;
+	map_key_t key = 1;
+	int map_fd, rc;
+
+	/* max 1 elem in map so insertion is forced to reuse freed entry */
+	skel = setup(BPF_MAP_TYPE_PERCPU_HASH, 1, &map_fd, 1);
+	if (!ASSERT_OK_PTR(skel, "prog_setup"))
+		return;
+
+	/* free the only entry so that it can be handed out again */
+	rc = bpf_map_delete_elem(map_fd, &key);
+	if (!ASSERT_OK(rc, "bpf_map_delete_elem"))
+		goto cleanup;
+
+	/* the prog's insertion has to land in the slot just freed */
+	rc = prog_run_insert_elem(skel, key, TEST_VALUE);
+	if (!ASSERT_OK(rc, "prog_run_insert_elem"))
+		goto cleanup;
+
+	/* key=1 must exist again ... */
+	rc = bpf_map_lookup_elem(map_fd, &key, readback);
+	if (!ASSERT_OK(rc, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+	/* ... with the value set on exactly one CPU */
+	check_values_one_cpu(readback, TEST_VALUE);
+
+cleanup:
+	test_map_init__destroy(skel);
+}
+
+/* LRU_PERCPU_HASH scenario:
+ *  - map of size 2 is filled with key=1 and key=2, set for every CPU
+ *  - the bpf prog inserts key=3 (current CPU only; the other CPUs should
+ *    start out as 0)
+ *  - key=3 is looked up and its per-CPU values are verified
+ */
+static void test_pcpu_lru_map_init(void)
+{
+	pcpu_map_value_t readback[nr_cpus];
+	struct test_map_init *skel;
+	map_key_t key = 3;
+	int map_fd, rc;
+
+	/* Two entries in a two-slot LRU map: the map is full after setup */
+	skel = setup(BPF_MAP_TYPE_LRU_PERCPU_HASH, 2, &map_fd, 2);
+	if (!ASSERT_OK_PTR(skel, "prog_setup"))
+		return;
+
+	/* inserting key=3 from the prog must recycle an LRU slot */
+	rc = prog_run_insert_elem(skel, key, TEST_VALUE);
+	if (!ASSERT_OK(rc, "prog_run_insert_elem"))
+		goto cleanup;
+
+	/* key=3 must now be present in place of an older element ... */
+	rc = bpf_map_lookup_elem(map_fd, &key, readback);
+	if (!ASSERT_OK(rc, "bpf_map_lookup_elem"))
+		goto cleanup;
+
+	/* ... with the value set on exactly one CPU */
+	check_values_one_cpu(readback, TEST_VALUE);
+
+cleanup:
+	test_map_init__destroy(skel);
+}
+
+/* Entry point: cache the possible-CPU count for the helpers above and run
+ * both per-CPU map subtests; skipped when only one CPU is possible.
+ */
+void test_map_init(void)
+{
+	nr_cpus = bpf_num_possible_cpus();
+
+	if (nr_cpus > 1) {
+		if (test__start_subtest("pcpu_map_init"))
+			test_pcpu_map_init();
+		if (test__start_subtest("pcpu_lru_map_init"))
+			test_pcpu_lru_map_init();
+		return;
+	}
+
+	printf("%s:SKIP: >1 cpu needed for this test\n", __func__);
+	test__skip();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_lock.c b/tools/testing/selftests/bpf/prog_tests/map_lock.c
new file mode 100644
index 000000000..ce17b1ed8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_lock.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Thread body: run the program whose fd @arg points to 10000 times on
+ * pkt_v4, report a failure if the run errors out or returns non-zero, and
+ * exit with @arg as the thread's return value.
+ */
+static void *spin_lock_thread(void *arg)
+{
+	int prog_fd = *(u32 *) arg;
+	__u32 duration, retval;
+	int err;
+
+	err = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, &retval, &duration);
+	CHECK(err || retval, "",
+	      "err %d errno %d retval %d duration %d\n",
+	      err, errno, retval, duration);
+
+	pthread_exit(arg);
+}
+
+/* Thread body: do 10000 BPF_F_LOCK lookups of key 0 on the map whose fd
+ * @arg points to. Every snapshot must have vars[0] == 0 and vars[1..16]
+ * all equal; the first violation is reported and ends the loop. The thread
+ * always exits with @arg as its return value.
+ */
+static void *parallel_map_access(void *arg)
+{
+	int map_fd = *(u32 *) arg;
+	int vars[17], key = 0;
+	int iter, idx, ref, err;
+
+	for (iter = 0; iter < 10000; iter++) {
+		err = bpf_map_lookup_elem_flags(map_fd, &key, vars, BPF_F_LOCK);
+		if (CHECK_FAIL(err)) {
+			printf("lookup failed\n");
+			goto out;
+		}
+		if (CHECK_FAIL(vars[0] != 0)) {
+			printf("lookup #%d var[0]=%d\n", iter, vars[0]);
+			goto out;
+		}
+
+		/* vars[1] is the reference the remaining slots must match */
+		ref = vars[1];
+		for (idx = 2; idx < 17; idx++) {
+			if (vars[idx] != ref) {
+				printf("lookup #%d var[1]=%d var[%d]=%d\n",
+				       iter, ref, idx, vars[idx]);
+				CHECK_FAIL(vars[idx] != ref);
+				goto out;
+			}
+		}
+	}
+out:
+	pthread_exit(arg);
+}
+
+/* Load test_map_lock.o, then run six threads concurrently: four executing
+ * the program through spin_lock_thread() and two doing BPF_F_LOCK lookups
+ * through parallel_map_access(), one on "hash_map" and one on "array_map".
+ * Each thread is expected to return the pointer it was given as argument.
+ *
+ * NOTE(review): every failure path jumps straight to close_prog; threads
+ * that were already created are not joined before the object is closed and
+ * the stack variables they point to go out of scope - confirm this is
+ * acceptable for a failing run.
+ */
+void test_map_lock(void)
+{
+ const char *file = "./test_map_lock.o";
+ int prog_fd, map_fd[2], vars[17] = {};
+ pthread_t thread_id[6];
+ struct bpf_object *obj = NULL;
+ int err = 0, key = 0, i;
+ void *ret;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+ if (CHECK_FAIL(err)) {
+ printf("test_map_lock:bpf_prog_load errno %d\n", errno);
+ goto close_prog;
+ }
+ map_fd[0] = bpf_find_map(__func__, obj, "hash_map");
+ if (CHECK_FAIL(map_fd[0] < 0))
+ goto close_prog;
+ map_fd[1] = bpf_find_map(__func__, obj, "array_map");
+ if (CHECK_FAIL(map_fd[1] < 0))
+ goto close_prog;
+
+ /* seed key 0 of hash_map with all-zero vars; the result is not checked */
+ bpf_map_update_elem(map_fd[0], &key, vars, BPF_F_LOCK);
+
+ /* threads 0-3: run the program, all sharing &prog_fd as argument */
+ for (i = 0; i < 4; i++)
+ if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+ &spin_lock_thread, &prog_fd)))
+ goto close_prog;
+ /* threads 4-5: locked lookups, one per map (map_fd[0], map_fd[1]) */
+ for (i = 4; i < 6; i++)
+ if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+ &parallel_map_access,
+ &map_fd[i - 4])))
+ goto close_prog;
+ /* each thread must exit with the same pointer it received */
+ for (i = 0; i < 4; i++)
+ if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+ ret != (void *)&prog_fd))
+ goto close_prog;
+ for (i = 4; i < 6; i++)
+ if (CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+ ret != (void *)&map_fd[i - 4]))
+ goto close_prog;
+close_prog:
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/map_ptr.c b/tools/testing/selftests/bpf/prog_tests/map_ptr.c
new file mode 100644
index 000000000..c230a573c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/map_ptr.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <test_progs.h>
+#include <network_helpers.h>
+
+#include "map_ptr_kern.skel.h"
+
+/* Run the cg_skb program from the map_ptr_kern skeleton once on pkt_v4 and
+ * require a non-zero return value; on failure the map type and line number
+ * the program recorded in its bss are printed.
+ */
+void test_map_ptr(void)
+{
+	__u32 duration = 0, retval;
+	struct map_ptr_kern *skel;
+	char buf[128];
+	int prog_fd, err;
+
+	skel = map_ptr_kern__open_and_load();
+	if (CHECK(!skel, "skel_open_load", "open_load failed\n"))
+		return;
+
+	prog_fd = bpf_program__fd(skel->progs.cg_skb);
+	err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4), buf, NULL,
+				&retval, NULL);
+
+	/* the retval check is only meaningful if the run itself succeeded */
+	if (!CHECK(err, "test_run", "err=%d errno=%d\n", err, errno))
+		CHECK(!retval, "retval", "retval=%d map_type=%u line=%u\n", retval,
+		      skel->bss->g_map_type, skel->bss->g_line);
+
+	map_ptr_kern__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/metadata.c b/tools/testing/selftests/bpf/prog_tests/metadata.c
new file mode 100644
index 000000000..2c53eade8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/metadata.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <cgroup_helpers.h>
+#include <network_helpers.h>
+
+#include "metadata_unused.skel.h"
+#include "metadata_used.skel.h"
+
+static int duration;
+
+/* Check whether the program behind prog_fd references the map behind map_fd.
+ *
+ * Fetches the map's id, then queries the program info twice: first to learn
+ * nr_map_ids, then again with a buffer of that size to receive the ids.
+ *
+ * Returns 0 if the map id is in the program's map_ids list, -ENOENT if it is
+ * not, -ENOMEM if the id buffer cannot be allocated, or -errno when one of
+ * the info queries fails.
+ */
+static int prog_holds_map(int prog_fd, int map_fd)
+{
+	struct bpf_prog_info prog_info = {};
+	/* map fds are described by bpf_map_info, not bpf_prog_info */
+	struct bpf_map_info map_info = {};
+	__u32 prog_info_len;
+	__u32 map_info_len;
+	__u32 *map_ids;
+	int nr_maps;
+	int ret;
+	int i;
+
+	map_info_len = sizeof(map_info);
+	ret = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len);
+	if (ret)
+		return -errno;
+
+	/* first pass: only nr_map_ids is of interest */
+	prog_info_len = sizeof(prog_info);
+	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	if (ret)
+		return -errno;
+
+	map_ids = calloc(prog_info.nr_map_ids, sizeof(__u32));
+	if (!map_ids)
+		return -ENOMEM;
+
+	/* second pass: request the ids themselves into map_ids */
+	nr_maps = prog_info.nr_map_ids;
+	memset(&prog_info, 0, sizeof(prog_info));
+	prog_info.nr_map_ids = nr_maps;
+	prog_info.map_ids = ptr_to_u64(map_ids);
+	prog_info_len = sizeof(prog_info);
+
+	ret = bpf_obj_get_info_by_fd(prog_fd, &prog_info, &prog_info_len);
+	if (ret) {
+		ret = -errno;
+		goto free_map_ids;
+	}
+
+	ret = -ENOENT;
+	for (i = 0; i < prog_info.nr_map_ids; i++) {
+		if (map_ids[i] == map_info.id) {
+			ret = 0;
+			break;
+		}
+	}
+
+free_map_ids:
+	free(map_ids);
+	return ret;
+}
+
+/* Load the metadata_unused skeleton and verify that its program holds the
+ * rodata map, that the metadata values read back as "foo" / 1, and that
+ * binding the rodata map to the program a second time succeeds.
+ */
+static void test_metadata_unused(void)
+{
+	struct metadata_unused *obj;
+	int err;
+
+	obj = metadata_unused__open_and_load();
+	if (CHECK(!obj, "skel-load", "errno %d", errno))
+		return;
+
+	err = prog_holds_map(bpf_program__fd(obj->progs.prog),
+			     bpf_map__fd(obj->maps.rodata));
+	/* the skeleton is loaded by now, so it has to be destroyed on failure */
+	if (CHECK(err, "prog-holds-rodata", "errno: %d", err))
+		goto close_bpf_object;
+
+	/* Assert that we can access the metadata in skel and the values are
+	 * what we expect.
+	 */
+	if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "foo",
+			  sizeof(obj->rodata->bpf_metadata_a)),
+		  "bpf_metadata_a", "expected \"foo\", value differ"))
+		goto close_bpf_object;
+	if (CHECK(obj->rodata->bpf_metadata_b != 1, "bpf_metadata_b",
+		  "expected 1, got %d", obj->rodata->bpf_metadata_b))
+		goto close_bpf_object;
+
+	/* Assert that binding metadata map to prog again succeeds. */
+	err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog),
+				bpf_map__fd(obj->maps.rodata), NULL);
+	CHECK(err, "rebind_map", "errno %d, expected 0", errno);
+
+close_bpf_object:
+	metadata_unused__destroy(obj);
+}
+
+/* Load the metadata_used skeleton and verify that its program holds the
+ * rodata map, that the metadata values read back as "bar" / 2, and that
+ * binding the rodata map to the program a second time succeeds.
+ */
+static void test_metadata_used(void)
+{
+	struct metadata_used *obj;
+	int err;
+
+	obj = metadata_used__open_and_load();
+	if (CHECK(!obj, "skel-load", "errno %d", errno))
+		return;
+
+	err = prog_holds_map(bpf_program__fd(obj->progs.prog),
+			     bpf_map__fd(obj->maps.rodata));
+	/* the skeleton is loaded by now, so it has to be destroyed on failure */
+	if (CHECK(err, "prog-holds-rodata", "errno: %d", err))
+		goto close_bpf_object;
+
+	/* Assert that we can access the metadata in skel and the values are
+	 * what we expect.
+	 */
+	if (CHECK(strncmp(obj->rodata->bpf_metadata_a, "bar",
+			  sizeof(obj->rodata->bpf_metadata_a)),
+		  "metadata_a", "expected \"bar\", value differ"))
+		goto close_bpf_object;
+	if (CHECK(obj->rodata->bpf_metadata_b != 2, "metadata_b",
+		  "expected 2, got %d", obj->rodata->bpf_metadata_b))
+		goto close_bpf_object;
+
+	/* Assert that binding metadata map to prog again succeeds. */
+	err = bpf_prog_bind_map(bpf_program__fd(obj->progs.prog),
+				bpf_map__fd(obj->maps.rodata), NULL);
+	CHECK(err, "rebind_map", "errno %d, expected 0", errno);
+
+close_bpf_object:
+	metadata_used__destroy(obj);
+}
+
+/* Entry point: run the "unused" and "used" metadata subtests in order. */
+void test_metadata(void)
+{
+	static const struct {
+		const char *name;
+		void (*fn)(void);
+	} subtests[] = {
+		{ "unused", test_metadata_unused },
+		{ "used",   test_metadata_used },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(subtests) / sizeof(subtests[0]); idx++) {
+		if (test__start_subtest(subtests[idx].name))
+			subtests[idx].fn();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c
new file mode 100644
index 000000000..9c3c5c0f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/mmap.c
@@ -0,0 +1,290 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <sys/mman.h>
+#include "test_mmap.skel.h"
+
+struct map_data {
+ __u64 val[512 * 4];
+};
+
+/* Round sz up to the next multiple of the system page size. */
+static size_t roundup_page(size_t sz)
+{
+	long page_size = sysconf(_SC_PAGE_SIZE);
+	size_t pages = (sz + page_size - 1) / page_size;
+
+	return pages * page_size;
+}
+
+/* End-to-end test of mmap() on BPF maps using the test_mmap skeleton.
+ *
+ * In order, it checks that:
+ *  - rdonly_map cannot be mapped writable but can be mapped read-only;
+ *  - .bss and data_map can be mapped R/W and reflect values written by the
+ *    attached program and by bpf_map_update_elem();
+ *  - data_map cannot be frozen while a R/W mapping exists, can be frozen
+ *    once only R/O mappings remain, and cannot be mapped R/W afterwards;
+ *  - partial unmap / MAP_FIXED re-map sequences keep the data visible;
+ *  - a live mapping keeps the map alive after the skeleton is destroyed,
+ *    and the map id disappears once the last mapping is gone.
+ *
+ * NOTE(review): the two rdonly_map mmap() calls use a literal 4096 while
+ * the rest of the test uses page_size / map_sz - confirm behaviour on
+ * systems whose page size is not 4K.
+ * NOTE(review): the successful read-only mapping of rdonly_map stored in
+ * tmp1 does not appear to be unmapped before tmp1 is reused - confirm.
+ */
+void test_mmap(void)
+{
+ const size_t bss_sz = roundup_page(sizeof(struct test_mmap__bss));
+ const size_t map_sz = roundup_page(sizeof(struct map_data));
+ const int zero = 0, one = 1, two = 2, far = 1500;
+ const long page_size = sysconf(_SC_PAGE_SIZE);
+ int err, duration = 0, i, data_map_fd, data_map_id, tmp_fd, rdmap_fd;
+ struct bpf_map *data_map, *bss_map;
+ void *bss_mmaped = NULL, *map_mmaped = NULL, *tmp0, *tmp1, *tmp2;
+ struct test_mmap__bss *bss_data;
+ struct bpf_map_info map_info;
+ __u32 map_info_sz = sizeof(map_info);
+ struct map_data *map_data;
+ struct test_mmap *skel;
+ __u64 val = 0;
+
+ skel = test_mmap__open_and_load();
+ if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n"))
+ return;
+
+ bss_map = skel->maps.bss;
+ data_map = skel->maps.data_map;
+ data_map_fd = bpf_map__fd(data_map);
+
+ /* a writable mapping of the read-only map must be refused */
+ rdmap_fd = bpf_map__fd(skel->maps.rdonly_map);
+ tmp1 = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, rdmap_fd, 0);
+ if (CHECK(tmp1 != MAP_FAILED, "rdonly_write_mmap", "unexpected success\n")) {
+ munmap(tmp1, 4096);
+ goto cleanup;
+ }
+ /* now double-check if it's mmap()'able at all */
+ tmp1 = mmap(NULL, 4096, PROT_READ, MAP_SHARED, rdmap_fd, 0);
+ if (CHECK(tmp1 == MAP_FAILED, "rdonly_read_mmap", "failed: %d\n", errno))
+ goto cleanup;
+
+ /* get map's ID */
+ memset(&map_info, 0, map_info_sz);
+ err = bpf_obj_get_info_by_fd(data_map_fd, &map_info, &map_info_sz);
+ if (CHECK(err, "map_get_info", "failed %d\n", errno))
+ goto cleanup;
+ data_map_id = map_info.id;
+
+ /* mmap BSS map */
+ bss_mmaped = mmap(NULL, bss_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
+ bpf_map__fd(bss_map), 0);
+ if (CHECK(bss_mmaped == MAP_FAILED, "bss_mmap",
+ ".bss mmap failed: %d\n", errno)) {
+ /* cleanup only unmaps non-NULL pointers */
+ bss_mmaped = NULL;
+ goto cleanup;
+ }
+ /* map as R/W first */
+ map_mmaped = mmap(NULL, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
+ data_map_fd, 0);
+ if (CHECK(map_mmaped == MAP_FAILED, "data_mmap",
+ "data_map mmap failed: %d\n", errno)) {
+ map_mmaped = NULL;
+ goto cleanup;
+ }
+
+ bss_data = bss_mmaped;
+ map_data = map_mmaped;
+
+ /* everything must read as zero before the program is attached */
+ CHECK_FAIL(bss_data->in_val);
+ CHECK_FAIL(bss_data->out_val);
+ CHECK_FAIL(skel->bss->in_val);
+ CHECK_FAIL(skel->bss->out_val);
+ CHECK_FAIL(map_data->val[0]);
+ CHECK_FAIL(map_data->val[1]);
+ CHECK_FAIL(map_data->val[2]);
+ CHECK_FAIL(map_data->val[far]);
+
+ err = test_mmap__attach(skel);
+ if (CHECK(err, "attach_raw_tp", "err %d\n", err))
+ goto cleanup;
+
+ bss_data->in_val = 123;
+ val = 111;
+ CHECK_FAIL(bpf_map_update_elem(data_map_fd, &zero, &val, 0));
+
+ /* give the attached program a chance to run */
+ usleep(1);
+
+ /* same data must be visible through our mmap and the skeleton's bss */
+ CHECK_FAIL(bss_data->in_val != 123);
+ CHECK_FAIL(bss_data->out_val != 123);
+ CHECK_FAIL(skel->bss->in_val != 123);
+ CHECK_FAIL(skel->bss->out_val != 123);
+ CHECK_FAIL(map_data->val[0] != 111);
+ CHECK_FAIL(map_data->val[1] != 222);
+ CHECK_FAIL(map_data->val[2] != 123);
+ CHECK_FAIL(map_data->val[far] != 3 * 123);
+
+ /* and through the regular lookup API as well */
+ CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &zero, &val));
+ CHECK_FAIL(val != 111);
+ CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &one, &val));
+ CHECK_FAIL(val != 222);
+ CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &two, &val));
+ CHECK_FAIL(val != 123);
+ CHECK_FAIL(bpf_map_lookup_elem(data_map_fd, &far, &val));
+ CHECK_FAIL(val != 3 * 123);
+
+ /* data_map freeze should fail due to R/W mmap() */
+ err = bpf_map_freeze(data_map_fd);
+ if (CHECK(!err || errno != EBUSY, "no_freeze",
+ "data_map freeze succeeded: err=%d, errno=%d\n", err, errno))
+ goto cleanup;
+
+ err = mprotect(map_mmaped, map_sz, PROT_READ);
+ if (CHECK(err, "mprotect_ro", "mprotect to r/o failed %d\n", errno))
+ goto cleanup;
+
+ /* unmap R/W mapping */
+ err = munmap(map_mmaped, map_sz);
+ map_mmaped = NULL;
+ if (CHECK(err, "data_map_munmap", "data_map munmap failed: %d\n", errno))
+ goto cleanup;
+
+ /* re-map as R/O now */
+ map_mmaped = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, data_map_fd, 0);
+ if (CHECK(map_mmaped == MAP_FAILED, "data_mmap",
+ "data_map R/O mmap failed: %d\n", errno)) {
+ map_mmaped = NULL;
+ goto cleanup;
+ }
+ /* a R/O mapping must not be upgradable to writable or executable */
+ err = mprotect(map_mmaped, map_sz, PROT_WRITE);
+ if (CHECK(!err, "mprotect_wr", "mprotect() succeeded unexpectedly!\n"))
+ goto cleanup;
+ err = mprotect(map_mmaped, map_sz, PROT_EXEC);
+ if (CHECK(!err, "mprotect_ex", "mprotect() succeeded unexpectedly!\n"))
+ goto cleanup;
+ map_data = map_mmaped;
+
+ /* map/unmap in a loop to test ref counting */
+ for (i = 0; i < 10; i++) {
+ int flags = i % 2 ? PROT_READ : PROT_WRITE;
+ void *p;
+
+ p = mmap(NULL, map_sz, flags, MAP_SHARED, data_map_fd, 0);
+ if (CHECK_FAIL(p == MAP_FAILED))
+ goto cleanup;
+ err = munmap(p, map_sz);
+ if (CHECK_FAIL(err))
+ goto cleanup;
+ }
+
+ /* data_map freeze should now succeed due to no R/W mapping */
+ err = bpf_map_freeze(data_map_fd);
+ if (CHECK(err, "freeze", "data_map freeze failed: err=%d, errno=%d\n",
+ err, errno))
+ goto cleanup;
+
+ /* mapping as R/W now should fail */
+ tmp1 = mmap(NULL, map_sz, PROT_READ | PROT_WRITE, MAP_SHARED,
+ data_map_fd, 0);
+ if (CHECK(tmp1 != MAP_FAILED, "data_mmap", "mmap succeeded\n")) {
+ munmap(tmp1, map_sz);
+ goto cleanup;
+ }
+
+ /* val[0] and val[1] are expected to stay at 111/222 after the freeze,
+ * while val[2] and val[far] follow the new in_val
+ */
+ bss_data->in_val = 321;
+ usleep(1);
+ CHECK_FAIL(bss_data->in_val != 321);
+ CHECK_FAIL(bss_data->out_val != 321);
+ CHECK_FAIL(skel->bss->in_val != 321);
+ CHECK_FAIL(skel->bss->out_val != 321);
+ CHECK_FAIL(map_data->val[0] != 111);
+ CHECK_FAIL(map_data->val[1] != 222);
+ CHECK_FAIL(map_data->val[2] != 321);
+ CHECK_FAIL(map_data->val[far] != 3 * 321);
+
+ /* check some more advanced mmap() manipulations */
+
+ /* reserve a 4-page anonymous region to place the fixed mappings in */
+ tmp0 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_ANONYMOUS,
+ -1, 0);
+ if (CHECK(tmp0 == MAP_FAILED, "adv_mmap0", "errno %d\n", errno))
+ goto cleanup;
+
+ /* map all but last page: pages 1-3 mapped */
+ tmp1 = mmap(tmp0, 3 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED,
+ data_map_fd, 0);
+ if (CHECK(tmp0 != tmp1, "adv_mmap1", "tmp0: %p, tmp1: %p\n", tmp0, tmp1)) {
+ munmap(tmp0, 4 * page_size);
+ goto cleanup;
+ }
+
+ /* unmap second page: pages 1, 3 mapped */
+ err = munmap(tmp1 + page_size, page_size);
+ if (CHECK(err, "adv_mmap2", "errno %d\n", errno)) {
+ munmap(tmp1, 4 * page_size);
+ goto cleanup;
+ }
+
+ /* map page 2 back */
+ tmp2 = mmap(tmp1 + page_size, page_size, PROT_READ,
+ MAP_SHARED | MAP_FIXED, data_map_fd, 0);
+ if (CHECK(tmp2 == MAP_FAILED, "adv_mmap3", "errno %d\n", errno)) {
+ munmap(tmp1, page_size);
+ munmap(tmp1 + 2*page_size, 2 * page_size);
+ goto cleanup;
+ }
+ CHECK(tmp1 + page_size != tmp2, "adv_mmap4",
+ "tmp1: %p, tmp2: %p\n", tmp1, tmp2);
+
+ /* re-map all 4 pages */
+ tmp2 = mmap(tmp1, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED,
+ data_map_fd, 0);
+ if (CHECK(tmp2 == MAP_FAILED, "adv_mmap5", "errno %d\n", errno)) {
+ munmap(tmp1, 4 * page_size); /* unmap page 1 */
+ goto cleanup;
+ }
+ CHECK(tmp1 != tmp2, "adv_mmap6", "tmp1: %p, tmp2: %p\n", tmp1, tmp2);
+
+ /* the re-mapped region must show the same contents as before */
+ map_data = tmp2;
+ CHECK_FAIL(bss_data->in_val != 321);
+ CHECK_FAIL(bss_data->out_val != 321);
+ CHECK_FAIL(skel->bss->in_val != 321);
+ CHECK_FAIL(skel->bss->out_val != 321);
+ CHECK_FAIL(map_data->val[0] != 111);
+ CHECK_FAIL(map_data->val[1] != 222);
+ CHECK_FAIL(map_data->val[2] != 321);
+ CHECK_FAIL(map_data->val[far] != 3 * 321);
+
+ munmap(tmp2, 4 * page_size);
+
+ /* map all 4 pages, but with pg_off=1 page, should fail */
+ tmp1 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED,
+ data_map_fd, page_size /* initial page shift */);
+ if (CHECK(tmp1 != MAP_FAILED, "adv_mmap7", "unexpected success")) {
+ munmap(tmp1, 4 * page_size);
+ goto cleanup;
+ }
+
+ /* this last mapping is the one meant to outlive the skeleton */
+ tmp1 = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, data_map_fd, 0);
+ if (CHECK(tmp1 == MAP_FAILED, "last_mmap", "failed %d\n", errno))
+ goto cleanup;
+
+ /* skel is set to NULL so the destroy call at cleanup gets NULL */
+ test_mmap__destroy(skel);
+ skel = NULL;
+ CHECK_FAIL(munmap(bss_mmaped, bss_sz));
+ bss_mmaped = NULL;
+ CHECK_FAIL(munmap(map_mmaped, map_sz));
+ map_mmaped = NULL;
+
+ /* map should be still held by active mmap */
+ tmp_fd = bpf_map_get_fd_by_id(data_map_id);
+ if (CHECK(tmp_fd < 0, "get_map_by_id", "failed %d\n", errno)) {
+ munmap(tmp1, map_sz);
+ goto cleanup;
+ }
+ close(tmp_fd);
+
+ /* this should release data map finally */
+ munmap(tmp1, map_sz);
+
+ /* we need to wait for RCU grace period */
+ for (i = 0; i < 10000; i++) {
+ /* stop once the id following data_map_id - 1 is past data_map_id */
+ __u32 id = data_map_id - 1;
+ if (bpf_map_get_next_id(id, &id) || id > data_map_id)
+ break;
+ usleep(1);
+ }
+
+ /* should fail to get map FD by non-existing ID */
+ tmp_fd = bpf_map_get_fd_by_id(data_map_id);
+ if (CHECK(tmp_fd >= 0, "get_map_by_id_after",
+ "unexpectedly succeeded %d\n", tmp_fd)) {
+ close(tmp_fd);
+ goto cleanup;
+ }
+
+cleanup:
+ if (bss_mmaped)
+ CHECK_FAIL(munmap(bss_mmaped, bss_sz));
+ if (map_mmaped)
+ CHECK_FAIL(munmap(map_mmaped, map_sz));
+ test_mmap__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/modify_return.c b/tools/testing/selftests/bpf/prog_tests/modify_return.c
new file mode 100644
index 000000000..97fec70c6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/modify_return.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include "modify_return.skel.h"
+
+/* Low 16 bits of x. */
+#define LOWER(x) ((x) & 0xffff)
+/* x shifted right by 16, i.e. everything above the low 16 bits. */
+#define UPPER(x) ((x) >> 16)
+
+
+/*
+ * Load and attach the modify_return skeleton, run the fmod_ret_test program
+ * once with the given input_retval, and check the outcome.
+ *
+ * The 32-bit retval from the test run is split with UPPER()/LOWER(): the
+ * upper half is compared with want_side_effect and the lower half with
+ * want_ret. The fentry/fexit/fmod_ret result fields in .bss must all be 1.
+ */
+static void run_test(__u32 input_retval, __u16 want_side_effect, __s16 want_ret)
+{
+ struct modify_return *skel = NULL;
+ int err, prog_fd;
+ __u32 duration = 0, retval;
+ __u16 side_effect;
+ __s16 ret;
+
+ skel = modify_return__open_and_load();
+ if (CHECK(!skel, "skel_load", "modify_return skeleton failed\n"))
+ goto cleanup;
+
+ err = modify_return__attach(skel);
+ if (CHECK(err, "modify_return", "attach failed: %d\n", err))
+ goto cleanup;
+
+ /* hand the input value to the BPF side through .bss before running */
+ skel->bss->input_retval = input_retval;
+ prog_fd = bpf_program__fd(skel->progs.fmod_ret_test);
+ err = bpf_prog_test_run(prog_fd, 1, NULL, 0, NULL, 0,
+ &retval, &duration);
+
+ /* a failed run is reported but does not stop the checks below */
+ CHECK(err, "test_run", "err %d errno %d\n", err, errno);
+
+ side_effect = UPPER(retval);
+ ret = LOWER(retval);
+
+ CHECK(ret != want_ret, "test_run",
+ "unexpected ret: %d, expected: %d\n", ret, want_ret);
+ CHECK(side_effect != want_side_effect, "modify_return",
+ "unexpected side_effect: %d\n", side_effect);
+
+ CHECK(skel->bss->fentry_result != 1, "modify_return",
+ "fentry failed\n");
+ CHECK(skel->bss->fexit_result != 1, "modify_return",
+ "fexit failed\n");
+ CHECK(skel->bss->fmod_ret_result != 1, "modify_return",
+ "fmod_ret failed\n");
+
+cleanup:
+ /* also reached with skel == NULL when open_and_load failed */
+ modify_return__destroy(skel);
+}
+
+/*
+ * Test entry point: calls run_test() twice, once with input_retval 0 and
+ * once with -EINVAL, each with its own expected side effect and return.
+ */
+void test_modify_return(void)
+{
+ run_test(0 /* input_retval */,
+ 1 /* want_side_effect */,
+ 4 /* want_ret */);
+ run_test(-EINVAL /* input_retval */,
+ 0 /* want_side_effect */,
+ -EINVAL /* want_ret */);
+}
+
diff --git a/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
new file mode 100644
index 000000000..e74dc501b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ns_current_pid_tgid.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */
+#include <test_progs.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/*
+ * Userspace copy of the value kept in the "test_ns_.bss" map; the test below
+ * writes it with bpf_map_update_elem() and reads it back with
+ * bpf_map_lookup_elem().
+ */
+struct bss {
+ __u64 dev; /* st_dev of /proc/self/ns/pid */
+ __u64 ino; /* st_ino of /proc/self/ns/pid */
+ __u64 pid_tgid; /* read back after the run and compared with the expected id */
+ __u64 user_pid_tgid; /* expected id, computed in userspace */
+};
+
+/*
+ * Load test_ns_current_pid_tgid.o, seed its bss map with the caller's
+ * pid/tid and the device/inode of /proc/self/ns/pid, attach the program to
+ * the sys_enter raw tracepoint, trigger a syscall, and check that the
+ * pid_tgid value read back from the map equals the id computed here.
+ */
+void test_ns_current_pid_tgid(void)
+{
+ const char *probe_name = "raw_tracepoint/sys_enter";
+ const char *file = "test_ns_current_pid_tgid.o";
+ int err, key = 0, duration = 0;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_map *bss_map;
+ struct bpf_object *obj;
+ struct bss bss;
+ struct stat st;
+ __u64 id;
+
+ obj = bpf_object__open_file(file, NULL);
+ if (CHECK(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj)))
+ return;
+
+ err = bpf_object__load(obj);
+ if (CHECK(err, "obj_load", "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss");
+ if (CHECK(!bss_map, "find_bss_map", "failed\n"))
+ goto cleanup;
+
+ prog = bpf_object__find_program_by_title(obj, probe_name);
+ if (CHECK(!prog, "find_prog", "prog '%s' not found\n",
+ probe_name))
+ goto cleanup;
+
+ memset(&bss, 0, sizeof(bss));
+ pid_t tid = syscall(SYS_gettid);
+ pid_t pid = getpid();
+
+ /*
+ * tid lands in the upper 32 bits and pid in the lower 32.
+ * NOTE(review): confirm this order matches how the BPF program packs
+ * its value; the two halves only coincide when tid == pid.
+ */
+ id = (__u64) tid << 32 | pid;
+ bss.user_pid_tgid = id;
+
+ if (CHECK_FAIL(stat("/proc/self/ns/pid", &st))) {
+ perror("Failed to stat /proc/self/ns/pid");
+ goto cleanup;
+ }
+
+ bss.dev = st.st_dev;
+ bss.ino = st.st_ino;
+
+ /* publish the expected values before the program is attached */
+ err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0);
+ if (CHECK(err, "setting_bss", "failed to set bss : %d\n", err))
+ goto cleanup;
+
+ link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
+ if (CHECK(IS_ERR(link), "attach_raw_tp", "err %ld\n",
+ PTR_ERR(link))) {
+ /* reset so cleanup does not hand an error pointer to destroy */
+ link = NULL;
+ goto cleanup;
+ }
+
+ /* trigger some syscalls */
+ usleep(1);
+
+ /* NOTE(review): the check tag says "set_bss" although this is a lookup */
+ err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss);
+ if (CHECK(err, "set_bss", "failed to get bss : %d\n", err))
+ goto cleanup;
+
+ if (CHECK(id != bss.pid_tgid, "Compare user pid/tgid vs. bpf pid/tgid",
+ "User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid))
+ goto cleanup;
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/obj_name.c b/tools/testing/selftests/bpf/prog_tests/obj_name.c
new file mode 100644
index 000000000..e178416bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/obj_name.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/*
+ * For each candidate name in tests[], issue a raw BPF_PROG_LOAD and a raw
+ * BPF_MAP_CREATE with that name and check that the syscall succeeds or fails
+ * (with the listed errno) as the table says.
+ */
+void test_obj_name(void)
+{
+ struct {
+ const char *name;
+ int success; /* non-zero when the syscall is expected to succeed */
+ int expected_errno; /* only checked when success == 0 */
+ } tests[] = {
+ { "", 1, 0 },
+ { "_123456789ABCDE", 1, 0 },
+ { "_123456789ABCDEF", 0, EINVAL },
+ { "_123456789ABCD\n", 0, EINVAL },
+ };
+ /* two-instruction program: r0 = 0; exit */
+ struct bpf_insn prog[] = {
+ BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+ __u32 duration = 0;
+ int i;
+
+ for (i = 0; i < sizeof(tests) / sizeof(tests[0]); i++) {
+ /* length including the terminating NUL */
+ size_t name_len = strlen(tests[i].name) + 1;
+ union bpf_attr attr;
+ size_t ncopy;
+ int fd;
+
+ /* test different attr.prog_name during BPF_PROG_LOAD */
+ /* never copy more than the destination field can hold */
+ ncopy = name_len < sizeof(attr.prog_name) ?
+ name_len : sizeof(attr.prog_name);
+ bzero(&attr, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SCHED_CLS;
+ attr.insn_cnt = 2;
+ attr.insns = ptr_to_u64(prog);
+ attr.license = ptr_to_u64("");
+ memcpy(attr.prog_name, tests[i].name, ncopy);
+
+ fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ /* fail on: expected success but no fd, or expected failure but fd/errno differ */
+ CHECK((tests[i].success && fd < 0) ||
+ (!tests[i].success && fd != -1) ||
+ (!tests[i].success && errno != tests[i].expected_errno),
+ "check-bpf-prog-name",
+ "fd %d(%d) errno %d(%d)\n",
+ fd, tests[i].success, errno, tests[i].expected_errno);
+
+ if (fd != -1)
+ close(fd);
+
+ /* test different attr.map_name during BPF_MAP_CREATE */
+ ncopy = name_len < sizeof(attr.map_name) ?
+ name_len : sizeof(attr.map_name);
+ bzero(&attr, sizeof(attr));
+ attr.map_type = BPF_MAP_TYPE_ARRAY;
+ attr.key_size = 4;
+ attr.value_size = 4;
+ attr.max_entries = 1;
+ attr.map_flags = 0;
+ memcpy(attr.map_name, tests[i].name, ncopy);
+ fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
+ CHECK((tests[i].success && fd < 0) ||
+ (!tests[i].success && fd != -1) ||
+ (!tests[i].success && errno != tests[i].expected_errno),
+ "check-bpf-map-name",
+ "fd %d(%d) errno %d(%d)\n",
+ fd, tests[i].success, errno, tests[i].expected_errno);
+
+ if (fd != -1)
+ close(fd);
+ }
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/pe_preserve_elems.c b/tools/testing/selftests/bpf/prog_tests/pe_preserve_elems.c
new file mode 100644
index 000000000..673d38395
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/pe_preserve_elems.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include <linux/bpf.h>
+#include "test_pe_preserve_elems.skel.h"
+
+/*
+ * NOTE(review): no function in this file declares a local `duration`, yet
+ * all of them use CHECK(); presumably the macro refers to this variable --
+ * confirm in test_progs.h.
+ */
+static int duration;
+
+/*
+ * Store a freshly opened perf event fd in `map` at key 0, close the perf fd,
+ * and run `prog` twice: once while the map fd is open and once after it has
+ * been closed.
+ *
+ * The first run must return 0. After the map fd is closed, the run must
+ * return 0 when has_share_pe is true and -ENOENT otherwise.
+ */
+static void test_one_map(struct bpf_map *map, struct bpf_program *prog,
+ bool has_share_pe)
+{
+ int err, key = 0, pfd = -1, mfd = bpf_map__fd(map);
+ DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts);
+ struct perf_event_attr attr = {
+ .size = sizeof(struct perf_event_attr),
+ .type = PERF_TYPE_SOFTWARE,
+ .config = PERF_COUNT_SW_CPU_CLOCK,
+ };
+
+ pfd = syscall(__NR_perf_event_open, &attr, 0 /* pid */,
+ -1 /* any cpu */, -1 /* group id */, 0 /* flags */);
+ if (CHECK(pfd < 0, "perf_event_open", "failed\n"))
+ return;
+
+ err = bpf_map_update_elem(mfd, &key, &pfd, BPF_ANY);
+ /* the perf fd is closed right after the update, whether it worked or not */
+ close(pfd);
+ if (CHECK(err < 0, "bpf_map_update_elem", "failed\n"))
+ return;
+
+ err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts);
+ if (CHECK(err < 0, "bpf_prog_test_run_opts", "failed\n"))
+ return;
+ if (CHECK(opts.retval != 0, "bpf_perf_event_read_value",
+ "failed with %d\n", opts.retval))
+ return;
+
+ /* closing mfd, prog still holds a reference on map */
+ close(mfd);
+
+ err = bpf_prog_test_run_opts(bpf_program__fd(prog), &opts);
+ if (CHECK(err < 0, "bpf_prog_test_run_opts", "failed\n"))
+ return;
+
+ if (has_share_pe) {
+ CHECK(opts.retval != 0, "bpf_perf_event_read_value",
+ "failed with %d\n", opts.retval);
+ } else {
+ CHECK(opts.retval != -ENOENT, "bpf_perf_event_read_value",
+ "should have failed with %d, but got %d\n", -ENOENT,
+ opts.retval);
+ }
+}
+
+/*
+ * Test entry point: load the skeleton and run test_one_map() on array_1
+ * (has_share_pe = false) and on array_2 (has_share_pe = true).
+ */
+void test_pe_preserve_elems(void)
+{
+ struct test_pe_preserve_elems *skel;
+
+ skel = test_pe_preserve_elems__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+ return;
+
+ test_one_map(skel->maps.array_1, skel->progs.read_array_1, false);
+ test_one_map(skel->maps.array_2, skel->progs.read_array_2, true);
+
+ test_pe_preserve_elems__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
new file mode 100644
index 000000000..e35c44490
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/socket.h>
+#include <test_progs.h>
+#include "bpf/libbpf_internal.h"
+#include "test_perf_branches.skel.h"
+
+/*
+ * Callback for the properly configured perf event: validate the sizes the
+ * BPF program recorded in .bss.
+ *
+ * Requires skel->bss->valid to be set, then checks that required_size is
+ * positive, that the stack/global byte counts are non-negative multiples of
+ * sizeof(struct perf_branch_entry), and that the global count is not smaller
+ * than the stack count.
+ */
+static void check_good_sample(struct test_perf_branches *skel)
+{
+	int written_global = skel->bss->written_global_out;
+	int required_size = skel->bss->required_size_out;
+	int written_stack = skel->bss->written_stack_out;
+	int pbe_size = sizeof(struct perf_branch_entry);
+	int duration = 0;
+
+	/* message now ends in "\n" like every other CHECK() message here */
+	if (CHECK(!skel->bss->valid, "output not valid",
+		  "no valid sample from prog\n"))
+		return;
+
+	/*
+	 * It's hard to validate the contents of the branch entries b/c it
+	 * would require some kind of disassembler and also encoding the
+	 * valid jump instructions for supported architectures. So just check
+	 * the easy stuff for now.
+	 */
+	CHECK(required_size <= 0, "read_branches_size", "err %d\n", required_size);
+	CHECK(written_stack < 0, "read_branches_stack", "err %d\n", written_stack);
+	CHECK(written_stack % pbe_size != 0, "read_branches_stack",
+	      "stack bytes written=%d not multiple of struct size=%d\n",
+	      written_stack, pbe_size);
+	CHECK(written_global < 0, "read_branches_global", "err %d\n", written_global);
+	CHECK(written_global % pbe_size != 0, "read_branches_global",
+	      "global bytes written=%d not multiple of struct size=%d\n",
+	      written_global, pbe_size);
+	CHECK(written_global < written_stack, "read_branches_size",
+	      "written_global=%d < written_stack=%d\n", written_global, written_stack);
+}
+
+/*
+ * Callback for the improperly configured perf event: requires
+ * skel->bss->valid to be set, then expects each of the three values the BPF
+ * program recorded to be either -EINVAL or -ENOENT.
+ */
+static void check_bad_sample(struct test_perf_branches *skel)
+{
+	int written_global = skel->bss->written_global_out;
+	int required_size = skel->bss->required_size_out;
+	int written_stack = skel->bss->written_stack_out;
+	int duration = 0;
+
+	/* message now ends in "\n" like every other CHECK() message here */
+	if (CHECK(!skel->bss->valid, "output not valid",
+		  "no valid sample from prog\n"))
+		return;
+
+	CHECK((required_size != -EINVAL && required_size != -ENOENT),
+	      "read_branches_size", "err %d\n", required_size);
+	CHECK((written_stack != -EINVAL && written_stack != -ENOENT),
+	      "read_branches_stack", "written %d\n", written_stack);
+	CHECK((written_global != -EINVAL && written_global != -ENOENT),
+	      "read_branches_global", "written %d\n", written_global);
+}
+
+/*
+ * Shared driver for both subtests: load the skeleton, attach the
+ * perf_branches program to perf_fd, pin the thread to CPU 0, spin to
+ * generate branches, detach, and hand the skeleton to `cb` for validation.
+ */
+static void test_perf_branches_common(int perf_fd,
+ void (*cb)(struct test_perf_branches *))
+{
+ struct test_perf_branches *skel;
+ int err, i, duration = 0;
+ bool detached = false;
+ struct bpf_link *link;
+ /* volatile, presumably so the spin loop below is not optimised away */
+ volatile int j = 0;
+ cpu_set_t cpu_set;
+
+ skel = test_perf_branches__open_and_load();
+ if (CHECK(!skel, "test_perf_branches_load",
+ "perf_branches skeleton failed\n"))
+ return;
+
+ /* attach perf_event */
+ link = bpf_program__attach_perf_event(skel->progs.perf_branches, perf_fd);
+ if (CHECK(IS_ERR(link), "attach_perf_event", "err %ld\n", PTR_ERR(link)))
+ goto out_destroy_skel;
+
+ /* generate some branches on cpu 0 */
+ CPU_ZERO(&cpu_set);
+ CPU_SET(0, &cpu_set);
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+ if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
+ goto out_destroy;
+ /* spin the loop for a while (random high number) */
+ for (i = 0; i < 1000000; ++i)
+ ++j;
+
+ test_perf_branches__detach(skel);
+ detached = true;
+
+ cb(skel);
+out_destroy:
+ bpf_link__destroy(link);
+out_destroy_skel:
+ /* error paths arrive here without having detached yet */
+ if (!detached)
+ test_perf_branches__detach(skel);
+ test_perf_branches__destroy(skel);
+}
+
+/*
+ * Positive case: open a hardware cycles event that samples the branch stack
+ * on CPU 0 and validate the program output with check_good_sample(). The
+ * subtest is skipped when perf_event_open() fails with ENOENT or EOPNOTSUPP.
+ */
+static void test_perf_branches_hw(void)
+{
+ struct perf_event_attr attr = {0};
+ int duration = 0;
+ int pfd;
+
+ /* create perf event */
+ attr.size = sizeof(attr);
+ attr.type = PERF_TYPE_HARDWARE;
+ attr.config = PERF_COUNT_HW_CPU_CYCLES;
+ attr.freq = 1;
+ attr.sample_freq = 4000;
+ attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
+ attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY;
+ pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+
+ /*
+ * Some setups don't support branch records (virtual machines, !x86),
+ * so skip test in this case.
+ */
+ if (pfd == -1) {
+ if (errno == ENOENT || errno == EOPNOTSUPP) {
+ printf("%s:SKIP:no PERF_SAMPLE_BRANCH_STACK\n",
+ __func__);
+ test__skip();
+ return;
+ }
+ /* any other errno is a real failure; the condition is always true here */
+ if (CHECK(pfd < 0, "perf_event_open", "err %d errno %d\n",
+ pfd, errno))
+ return;
+ }
+
+ test_perf_branches_common(pfd, check_good_sample);
+
+ close(pfd);
+}
+
+/*
+ * Tests negative case -- run bpf_read_branch_records() on improperly configured
+ * perf event.
+ *
+ * The event opened here is a software CPU-clock event on CPU 0 with no
+ * branch-stack sampling requested; results are validated with
+ * check_bad_sample().
+ */
+static void test_perf_branches_no_hw(void)
+{
+ struct perf_event_attr attr = {0};
+ int duration = 0;
+ int pfd;
+
+ /* create perf event */
+ attr.size = sizeof(attr);
+ attr.type = PERF_TYPE_SOFTWARE;
+ attr.config = PERF_COUNT_SW_CPU_CLOCK;
+ attr.freq = 1;
+ attr.sample_freq = 4000;
+ pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+ if (CHECK(pfd < 0, "perf_event_open", "err %d\n", pfd))
+ return;
+
+ test_perf_branches_common(pfd, check_bad_sample);
+
+ close(pfd);
+}
+
+/* Test entry point: runs the hardware and the no-hardware subtests. */
+void test_perf_branches(void)
+{
+ if (test__start_subtest("perf_branches_hw"))
+ test_perf_branches_hw();
+ if (test__start_subtest("perf_branches_no_hw"))
+ test_perf_branches_no_hw();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_buffer.c b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
new file mode 100644
index 000000000..8d7547540
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_buffer.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <sys/socket.h>
+#include <test_progs.h>
+#include "test_perf_buffer.skel.h"
+#include "bpf/libbpf_internal.h"
+
+/*
+ * NOTE(review): trigger_on_cpu() and test_perf_buffer() use CHECK() without
+ * a local `duration`; presumably the macro refers to this variable --
+ * confirm in test_progs.h.
+ */
+static int duration;
+
+/* AddressSanitizer sometimes crashes due to data dereference below, due to
+ * this being mmap()'ed memory. Disable instrumentation with
+ * no_sanitize_address attribute
+ *
+ * Sample callback: reads an int from the start of `data`, reports a failure
+ * if it differs from `cpu`, and marks `cpu` in the cpu_set_t passed as ctx.
+ */
+__attribute__((no_sanitize_address))
+static void on_sample(void *ctx, int cpu, void *data, __u32 size)
+{
+ int cpu_data = *(int *)data, duration = 0;
+ cpu_set_t *cpu_seen = ctx;
+
+ /* CHECK() is only evaluated on mismatch, so matching samples report nothing */
+ if (cpu_data != cpu)
+ CHECK(cpu_data != cpu, "check_cpu_data",
+ "cpu_data %d != cpu %d\n", cpu_data, cpu);
+
+ CPU_SET(cpu, cpu_seen);
+}
+
+/*
+ * Pin the calling thread to `cpu` and make one usleep() call from there.
+ * Returns 0 on success, or the pthread_setaffinity_np() error otherwise.
+ */
+int trigger_on_cpu(int cpu)
+{
+ cpu_set_t cpu_set;
+ int err;
+
+ CPU_ZERO(&cpu_set);
+ CPU_SET(cpu, &cpu_set);
+
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+ /* `err &&` short-circuits, so CHECK() only runs on failure */
+ if (err && CHECK(err, "set_affinity", "cpu #%d, err %d\n", cpu, err))
+ return err;
+
+ usleep(1);
+
+ return 0;
+}
+
+/*
+ * Verify that a perf_buffer delivers samples from every online CPU.
+ *
+ * Steps: count online CPUs, load and attach the skeleton, create a perf
+ * buffer whose callback (on_sample) records each CPU it sees, trigger the
+ * probe on every online CPU and poll. Then, per online CPU, check the
+ * per-buffer fd, drain the buffer, trigger again and confirm that consuming
+ * that one buffer reports the CPU.
+ *
+ * Two fixes relative to the previous version:
+ *  - every failure after perf_buffer__new() succeeded now goes through
+ *    out_free_pb, so the perf buffer is no longer leaked;
+ *  - per-buffer calls are indexed with a running buffer index `j` instead
+ *    of the CPU number. The buf_cnt check below requires the number of
+ *    buffers to equal the number of online CPUs, so the two numberings
+ *    diverge as soon as a CPU is offline.
+ */
+void test_perf_buffer(void)
+{
+	int err, on_len, nr_on_cpus = 0, nr_cpus, i, j;
+	struct perf_buffer_opts pb_opts = {};
+	struct test_perf_buffer *skel;
+	cpu_set_t cpu_seen;
+	struct perf_buffer *pb;
+	int last_fd = -1, fd;
+	bool *online;
+
+	nr_cpus = libbpf_num_possible_cpus();
+	if (CHECK(nr_cpus < 0, "nr_cpus", "err %d\n", nr_cpus))
+		return;
+
+	err = parse_cpu_mask_file("/sys/devices/system/cpu/online",
+				  &online, &on_len);
+	if (CHECK(err, "nr_on_cpus", "err %d\n", err))
+		return;
+
+	for (i = 0; i < on_len; i++)
+		if (online[i])
+			nr_on_cpus++;
+
+	/* load program */
+	skel = test_perf_buffer__open_and_load();
+	if (CHECK(!skel, "skel_load", "skeleton open/load failed\n"))
+		goto out_close;
+
+	/* attach probe */
+	err = test_perf_buffer__attach(skel);
+	if (CHECK(err, "attach_kprobe", "err %d\n", err))
+		goto out_close;
+
+	/* set up perf buffer */
+	pb_opts.sample_cb = on_sample;
+	pb_opts.ctx = &cpu_seen;
+	pb = perf_buffer__new(bpf_map__fd(skel->maps.perf_buf_map), 1, &pb_opts);
+	/* pb is an error pointer here, so there is nothing to free */
+	if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+		goto out_close;
+
+	CHECK(perf_buffer__epoll_fd(pb) < 0, "epoll_fd",
+	      "bad fd: %d\n", perf_buffer__epoll_fd(pb));
+
+	/* trigger kprobe on every CPU */
+	CPU_ZERO(&cpu_seen);
+	for (i = 0; i < nr_cpus; i++) {
+		if (i >= on_len || !online[i]) {
+			printf("skipping offline CPU #%d\n", i);
+			continue;
+		}
+
+		if (trigger_on_cpu(i))
+			goto out_free_pb;
+	}
+
+	/* read perf buffer */
+	err = perf_buffer__poll(pb, 100);
+	if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+		goto out_free_pb;
+
+	if (CHECK(CPU_COUNT(&cpu_seen) != nr_on_cpus, "seen_cpu_cnt",
+		  "expect %d, seen %d\n", nr_on_cpus, CPU_COUNT(&cpu_seen)))
+		goto out_free_pb;
+
+	if (CHECK(perf_buffer__buffer_cnt(pb) != nr_on_cpus, "buf_cnt",
+		  "got %zu, expected %d\n", perf_buffer__buffer_cnt(pb), nr_on_cpus))
+		goto out_free_pb;
+
+	/* i walks CPU numbers, j walks buffer indices (online CPUs only) */
+	for (i = 0, j = 0; i < nr_cpus; i++) {
+		if (i >= on_len || !online[i])
+			continue;
+
+		fd = perf_buffer__buffer_fd(pb, j);
+		CHECK(fd < 0 || last_fd == fd, "fd_check", "last fd %d == fd %d\n", last_fd, fd);
+		last_fd = fd;
+
+		err = perf_buffer__consume_buffer(pb, j);
+		if (CHECK(err, "drain_buf", "cpu %d, err %d\n", i, err))
+			goto out_free_pb;
+
+		CPU_CLR(i, &cpu_seen);
+		if (trigger_on_cpu(i))
+			goto out_free_pb;
+
+		err = perf_buffer__consume_buffer(pb, j);
+		if (CHECK(err, "consume_buf", "cpu %d, err %d\n", i, err))
+			goto out_free_pb;
+
+		if (CHECK(!CPU_ISSET(i, &cpu_seen), "cpu_seen", "cpu %d not seen\n", i))
+			goto out_free_pb;
+
+		j++;
+	}
+
+out_free_pb:
+	perf_buffer__free(pb);
+out_close:
+	test_perf_buffer__destroy(skel);
+	free(online);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
new file mode 100644
index 000000000..72c369084
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sched.h>
+#include <test_progs.h>
+#include "perf_event_stackmap.skel.h"
+
+#ifndef noinline
+#define noinline __attribute__((noinline))
+#endif
+
+/*
+ * func_1 .. func_6 form a fixed call chain (func_6 -> func_5 -> ... ->
+ * func_1), all marked noinline. func_1 is the leaf: it bumps a static
+ * counter, sleeps 100us and returns the counter.
+ */
+noinline int func_1(void)
+{
+ static int val = 1;
+
+ val += 1;
+
+ usleep(100);
+ return val;
+}
+
+noinline int func_2(void)
+{
+ return func_1();
+}
+
+noinline int func_3(void)
+{
+ return func_2();
+}
+
+noinline int func_4(void)
+{
+ return func_3();
+}
+
+noinline int func_5(void)
+{
+ return func_4();
+}
+
+/* Top of the chain: calls func_5() 100 times and sums the results. */
+noinline int func_6(void)
+{
+ int i, val = 1;
+
+ for (i = 0; i < 100; i++)
+ val += func_5();
+
+ return val;
+}
+
+/*
+ * Open a hardware cycles event on CPU 0 that samples IP, branch stack and
+ * callchain, attach the skeleton's oncpu program to it, run the func_6()
+ * call chain, and check that the four stack result fields in .data are 2.
+ * The test is skipped if the perf event cannot be opened.
+ */
+void test_perf_event_stackmap(void)
+{
+ struct perf_event_attr attr = {
+ /* .type = PERF_TYPE_SOFTWARE, */
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .precise_ip = 2,
+ .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK |
+ PERF_SAMPLE_CALLCHAIN,
+ .branch_sample_type = PERF_SAMPLE_BRANCH_USER |
+ PERF_SAMPLE_BRANCH_NO_FLAGS |
+ PERF_SAMPLE_BRANCH_NO_CYCLES |
+ PERF_SAMPLE_BRANCH_CALL_STACK,
+ .sample_period = 5000,
+ .size = sizeof(struct perf_event_attr),
+ };
+ struct perf_event_stackmap *skel;
+ __u32 duration = 0;
+ cpu_set_t cpu_set;
+ int pmu_fd, err;
+
+ skel = perf_event_stackmap__open();
+
+ if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+ return;
+
+ err = perf_event_stackmap__load(skel);
+ if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
+ goto cleanup;
+
+ /* pin to CPU 0, the CPU the perf event below is opened on */
+ CPU_ZERO(&cpu_set);
+ CPU_SET(0, &cpu_set);
+ err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+ if (CHECK(err, "set_affinity", "err %d, errno %d\n", err, errno))
+ goto cleanup;
+
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ /* any failure to open the event is treated as "unsupported" and skipped */
+ if (pmu_fd < 0) {
+ printf("%s:SKIP:cpu doesn't support the event\n", __func__);
+ test__skip();
+ goto cleanup;
+ }
+
+ skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+ pmu_fd);
+ if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event",
+ "err %ld\n", PTR_ERR(skel->links.oncpu))) {
+ close(pmu_fd);
+ goto cleanup;
+ }
+
+ /* create kernel and user stack traces for testing */
+ func_6();
+
+ CHECK(skel->data->stackid_kernel != 2, "get_stackid_kernel", "failed\n");
+ CHECK(skel->data->stackid_user != 2, "get_stackid_user", "failed\n");
+ CHECK(skel->data->stack_kernel != 2, "get_stack_kernel", "failed\n");
+ CHECK(skel->data->stack_user != 2, "get_stack_user", "failed\n");
+
+cleanup:
+ perf_event_stackmap__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/pinning.c b/tools/testing/selftests/bpf/prog_tests/pinning.c
new file mode 100644
index 000000000..fcf54b3a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/pinning.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <test_progs.h>
+
+/*
+ * Look up the map called `name` in `obj` and return the id field reported
+ * by bpf_obj_get_info_by_fd() for it.
+ *
+ * Returns 0 when the map is not found. map_info is zero-initialised, so 0 is
+ * also returned if the info query fails without filling it in.
+ */
+__u32 get_map_id(struct bpf_object *obj, const char *name)
+{
+	struct bpf_map_info map_info = {};
+	__u32 map_info_len, duration = 0;
+	struct bpf_map *map;
+	int err;
+
+	map_info_len = sizeof(map_info);
+
+	map = bpf_object__find_map_by_name(obj, name);
+	/* messages now end in "\n", matching the other CHECK() calls in this file */
+	if (CHECK(!map, "find map", "NULL map\n"))
+		return 0;
+
+	err = bpf_obj_get_info_by_fd(bpf_map__fd(map),
+				     &map_info, &map_info_len);
+	CHECK(err, "get map info", "err %d errno %d\n", err, errno);
+	return map_info.id;
+}
+
+/*
+ * Exercise libbpf map pinning end to end:
+ *  1. an object with an invalid pinning value must fail to open (-EINVAL);
+ *  2. loading the valid object pins "pinmap" and nothing else;
+ *  3. a second load reuses the pinned map (same map id);
+ *  4. re-pin / unpin / pin-at-custom-path behaviour of the map and object;
+ *  5. a pin-path clash that makes the load fail must leave no stray pin;
+ *  6. auto-pinning under a custom pin_root_path given via open opts;
+ *  7. pinning at a custom path a map whose fd was supplied via reuse_fd.
+ *
+ * All pin paths and the custom directory are removed on exit regardless of
+ * where the test stopped.
+ *
+ * Two fixes relative to the previous version:
+ *  - if the "invalid" object unexpectedly opens, it is now closed on exit
+ *    instead of being dropped by setting obj to NULL;
+ *  - the "NULL map" CHECK() messages now end in "\n" like the others.
+ */
+void test_pinning(void)
+{
+	const char *file_invalid = "./test_pinning_invalid.o";
+	const char *custpinpath = "/sys/fs/bpf/custom/pinmap";
+	const char *nopinpath = "/sys/fs/bpf/nopinmap";
+	const char *nopinpath2 = "/sys/fs/bpf/nopinmap2";
+	const char *custpath = "/sys/fs/bpf/custom";
+	const char *pinpath = "/sys/fs/bpf/pinmap";
+	const char *file = "./test_pinning.o";
+	__u32 map_id, map_id2, duration = 0;
+	struct stat statbuf = {};
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	int err, map_fd;
+	DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
+		.pin_root_path = custpath,
+	);
+
+	/* check that opening fails with invalid pinning value in map def */
+	obj = bpf_object__open_file(file_invalid, NULL);
+	err = libbpf_get_error(obj);
+	if (CHECK(err != -EINVAL, "invalid open", "err %d errno %d\n", err, errno)) {
+		/*
+		 * err == 0 means the open unexpectedly succeeded and obj is
+		 * a real object that must still be closed; only an error
+		 * pointer is replaced with NULL.
+		 */
+		if (err)
+			obj = NULL;
+		goto out;
+	}
+
+	/* open the valid object file */
+	obj = bpf_object__open_file(file, NULL);
+	err = libbpf_get_error(obj);
+	if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) {
+		obj = NULL;
+		goto out;
+	}
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "default load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that pinmap was pinned */
+	err = stat(pinpath, &statbuf);
+	if (CHECK(err, "stat pinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that nopinmap was *not* pinned */
+	err = stat(nopinpath, &statbuf);
+	if (CHECK(!err || errno != ENOENT, "stat nopinpath",
+		  "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that nopinmap2 was *not* pinned */
+	err = stat(nopinpath2, &statbuf);
+	if (CHECK(!err || errno != ENOENT, "stat nopinpath2",
+		  "err %d errno %d\n", err, errno))
+		goto out;
+
+	map_id = get_map_id(obj, "pinmap");
+	if (!map_id)
+		goto out;
+
+	bpf_object__close(obj);
+
+	obj = bpf_object__open_file(file, NULL);
+	if (CHECK_FAIL(libbpf_get_error(obj))) {
+		obj = NULL;
+		goto out;
+	}
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "default load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that same map ID was reused for second load */
+	map_id2 = get_map_id(obj, "pinmap");
+	if (CHECK(map_id != map_id2, "check reuse",
+		  "err %d errno %d id %d id2 %d\n", err, errno, map_id, map_id2))
+		goto out;
+
+	/* should be no-op to re-pin same map */
+	map = bpf_object__find_map_by_name(obj, "pinmap");
+	if (CHECK(!map, "find map", "NULL map\n"))
+		goto out;
+
+	err = bpf_map__pin(map, NULL);
+	if (CHECK(err, "re-pin map", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* but error to pin at different location */
+	err = bpf_map__pin(map, "/sys/fs/bpf/other");
+	if (CHECK(!err, "pin map different", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* unpin maps with a pin_path set */
+	err = bpf_object__unpin_maps(obj, NULL);
+	if (CHECK(err, "unpin maps", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* and re-pin them... */
+	err = bpf_object__pin_maps(obj, NULL);
+	if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* set pinning path of other map and re-pin all */
+	map = bpf_object__find_map_by_name(obj, "nopinmap");
+	if (CHECK(!map, "find map", "NULL map\n"))
+		goto out;
+
+	err = bpf_map__set_pin_path(map, custpinpath);
+	if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* should only pin the one unpinned map */
+	err = bpf_object__pin_maps(obj, NULL);
+	if (CHECK(err, "pin maps", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that nopinmap was pinned at the custom path */
+	err = stat(custpinpath, &statbuf);
+	if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* remove the custom pin path to re-test it with auto-pinning below */
+	err = unlink(custpinpath);
+	if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	err = rmdir(custpath);
+	if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno))
+		goto out;
+
+	bpf_object__close(obj);
+
+	/* open the valid object file again */
+	obj = bpf_object__open_file(file, NULL);
+	err = libbpf_get_error(obj);
+	if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) {
+		obj = NULL;
+		goto out;
+	}
+
+	/* set pin paths so that nopinmap2 will attempt to reuse the map at
+	 * pinpath (which will fail), but not before pinmap has already been
+	 * reused
+	 */
+	bpf_object__for_each_map(map, obj) {
+		if (!strcmp(bpf_map__name(map), "nopinmap"))
+			err = bpf_map__set_pin_path(map, nopinpath2);
+		else if (!strcmp(bpf_map__name(map), "nopinmap2"))
+			err = bpf_map__set_pin_path(map, pinpath);
+		else
+			continue;
+
+		if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
+			goto out;
+	}
+
+	/* should fail because of map parameter mismatch */
+	err = bpf_object__load(obj);
+	if (CHECK(err != -EINVAL, "param mismatch load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* nopinmap2 should have been pinned and cleaned up again */
+	err = stat(nopinpath2, &statbuf);
+	if (CHECK(!err || errno != ENOENT, "stat nopinpath2",
+		  "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* pinmap should still be there */
+	err = stat(pinpath, &statbuf);
+	if (CHECK(err, "stat pinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	bpf_object__close(obj);
+
+	/* test auto-pinning at custom path with open opt */
+	obj = bpf_object__open_file(file, &opts);
+	if (CHECK_FAIL(libbpf_get_error(obj))) {
+		obj = NULL;
+		goto out;
+	}
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "custom load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* check that pinmap was pinned at the custom path */
+	err = stat(custpinpath, &statbuf);
+	if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	/* remove the custom pin path to re-test it with reuse fd below */
+	err = unlink(custpinpath);
+	if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno))
+		goto out;
+
+	err = rmdir(custpath);
+	if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno))
+		goto out;
+
+	bpf_object__close(obj);
+
+	/* test pinning at custom path with reuse fd */
+	obj = bpf_object__open_file(file, NULL);
+	err = libbpf_get_error(obj);
+	if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) {
+		obj = NULL;
+		goto out;
+	}
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
+				sizeof(__u64), 1, 0);
+	if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd))
+		goto out;
+
+	map = bpf_object__find_map_by_name(obj, "pinmap");
+	if (CHECK(!map, "find map", "NULL map\n"))
+		goto close_map_fd;
+
+	err = bpf_map__reuse_fd(map, map_fd);
+	if (CHECK(err, "reuse pinmap fd", "err %d errno %d\n", err, errno))
+		goto close_map_fd;
+
+	err = bpf_map__set_pin_path(map, custpinpath);
+	if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
+		goto close_map_fd;
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "custom load", "err %d errno %d\n", err, errno))
+		goto close_map_fd;
+
+	/* check that pinmap was pinned at the custom path */
+	err = stat(custpinpath, &statbuf);
+	if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
+		goto close_map_fd;
+
+close_map_fd:
+	close(map_fd);
+out:
+	/* best-effort cleanup: paths that were never created simply fail to unlink */
+	unlink(pinpath);
+	unlink(nopinpath);
+	unlink(nopinpath2);
+	unlink(custpinpath);
+	rmdir(custpath);
+	if (obj)
+		bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
new file mode 100644
index 000000000..44b514fab
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_access.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Load test_pkt_access.o as a SCHED_CLS program and run it 100000 times
+ * each against the pkt_v4 and pkt_v6 test packets; both runs must succeed
+ * with a zero retval.
+ */
+void test_pkt_access(void)
+{
+ const char *file = "./test_pkt_access.o";
+ struct bpf_object *obj;
+ __u32 duration, retval;
+ int err, prog_fd;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = bpf_prog_test_run(prog_fd, 100000, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, &retval, &duration);
+ CHECK(err || retval, "ipv4",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+
+ err = bpf_prog_test_run(prog_fd, 100000, &pkt_v6, sizeof(pkt_v6),
+ NULL, NULL, &retval, &duration);
+ CHECK(err || retval, "ipv6",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
new file mode 100644
index 000000000..939015cd6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/pkt_md_access.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Load test_pkt_md_access.o as a SCHED_CLS program and run it 10 times
+ * against the pkt_v4 test packet; the run must succeed with a zero retval.
+ */
+void test_pkt_md_access(void)
+{
+ const char *file = "./test_pkt_md_access.o";
+ struct bpf_object *obj;
+ __u32 duration, retval;
+ int err, prog_fd;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_SCHED_CLS, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = bpf_prog_test_run(prog_fd, 10, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, &retval, &duration);
+ /* NOTE(review): the check tag is an empty string here */
+ CHECK(err || retval, "",
+ "err %d errno %d retval %d duration %d\n",
+ err, errno, retval, duration);
+
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/probe_read_user_str.c b/tools/testing/selftests/bpf/prog_tests/probe_read_user_str.c
new file mode 100644
index 000000000..e41929813
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/probe_read_user_str.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_probe_read_user_str.skel.h"
+
+static const char str1[] = "mestring";
+static const char str2[] = "mestringalittlebigger";
+static const char str3[] = "mestringblubblubblubblubblub";
+
+static int test_one_str(struct test_probe_read_user_str *skel, const char *str,
+ size_t len)
+{
+ int err, duration = 0;
+ char buf[256];
+ /* Copies len bytes of str into buf for the prog; returns 0 on success, 1 on any failed check */
+ /* Ensure bytes after string are ones */
+ memset(buf, 1, sizeof(buf));
+ memcpy(buf, str, len);
+
+ /* Give prog our userspace pointer */
+ skel->bss->user_ptr = buf;
+
+ /* Trigger tracepoint */
+ usleep(1);
+
+ /* Did helper fail? */
+ if (CHECK(skel->bss->ret < 0, "prog_ret", "prog returned: %ld\n",
+ skel->bss->ret))
+ return 1;
+
+ /* Check that string was copied correctly */
+ err = memcmp(skel->bss->buf, str, len);
+ if (CHECK(err, "memcmp", "prog copied wrong string\n"))
+ return 1;
+
+ /* Now check that no extra trailing bytes were copied */
+ memset(buf, 0, sizeof(buf));
+ err = memcmp(skel->bss->buf + len, buf, sizeof(buf) - len);
+ if (CHECK(err, "memcmp", "trailing bytes were not stripped\n"))
+ return 1;
+
+ return 0;
+}
+
+void test_probe_read_user_str(void)
+{
+ int rc, duration = 0;
+ struct test_probe_read_user_str *skel_obj;
+
+ skel_obj = test_probe_read_user_str__open_and_load();
+ if (CHECK(!skel_obj, "test_probe_read_user_str__open_and_load",
+ "skeleton open and load failed\n"))
+ return;
+
+ /* Hand our pid to the prog so that it ignores every other process */
+ skel_obj->bss->pid = getpid();
+
+ rc = test_probe_read_user_str__attach(skel_obj);
+ if (CHECK(rc, "test_probe_read_user_str__attach",
+ "skeleton attach failed: %d\n", rc))
+ goto out;
+
+ /* Three strings of growing length. || short-circuits, so the first
+ * failing string skips the remaining ones.
+ */
+ if (test_one_str(skel_obj, str1, sizeof(str1)) ||
+ test_one_str(skel_obj, str2, sizeof(str2)) ||
+ test_one_str(skel_obj, str3, sizeof(str3)))
+ goto out;
+out:
+ test_probe_read_user_str__destroy(skel_obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/probe_user.c b/tools/testing/selftests/bpf/prog_tests/probe_user.c
new file mode 100644
index 000000000..7aecfd9e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/probe_user.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+void test_probe_user(void)
+{
+ const char *prog_name = "kprobe/__sys_connect";
+ const char *obj_file = "./test_probe_user.o";
+ DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, );
+ int err, results_map_fd, sock_fd, duration = 0;
+ struct sockaddr curr, orig, tmp;
+ struct sockaddr_in *in = (struct sockaddr_in *)&curr;
+ struct bpf_link *kprobe_link = NULL;
+ struct bpf_program *kprobe_prog;
+ struct bpf_object *obj;
+ static const int zero = 0;
+ /* Open the object, look the kprobe program up by section title, then load */
+ obj = bpf_object__open_file(obj_file, &opts);
+ if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj)))
+ return;
+
+ kprobe_prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK(!kprobe_prog, "find_probe",
+ "prog '%s' not found\n", prog_name))
+ goto cleanup;
+
+ err = bpf_object__load(obj);
+ if (CHECK(err, "obj_load", "err %d\n", err))
+ goto cleanup;
+ /* The prog's results are read back later from its "test_pro.bss" map */
+ results_map_fd = bpf_find_map(__func__, obj, "test_pro.bss");
+ if (CHECK(results_map_fd < 0, "find_bss_map",
+ "err %d\n", results_map_fd))
+ goto cleanup;
+ /* Attach before the connect() call below */
+ kprobe_link = bpf_program__attach(kprobe_prog);
+ if (CHECK(IS_ERR(kprobe_link), "attach_kprobe",
+ "err %ld\n", PTR_ERR(kprobe_link))) {
+ kprobe_link = NULL; /* cleanup must not destroy an error pointer */
+ goto cleanup;
+ }
+ /* Build the address handed to connect() and keep a pristine copy in orig */
+ memset(&curr, 0, sizeof(curr));
+ in->sin_family = AF_INET;
+ in->sin_port = htons(5555);
+ in->sin_addr.s_addr = inet_addr("255.255.255.255");
+ memcpy(&orig, &curr, sizeof(curr));
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (CHECK(sock_fd < 0, "create_sock_fd", "err %d\n", sock_fd))
+ goto cleanup;
+ /* connect()'s return value is not checked here */
+ connect(sock_fd, &curr, sizeof(curr));
+ close(sock_fd);
+ /* tmp receives element 0 of the prog's .bss map */
+ err = bpf_map_lookup_elem(results_map_fd, &zero, &tmp);
+ if (CHECK(err, "get_kprobe_res",
+ "failed to get kprobe res: %d\n", err))
+ goto cleanup;
+ /* The copy stored by the prog must equal the address originally passed in */
+ in = (struct sockaddr_in *)&tmp;
+ if (CHECK(memcmp(&tmp, &orig, sizeof(orig)), "check_kprobe_res",
+ "wrong kprobe res from probe read: %s:%u\n",
+ inet_ntoa(in->sin_addr), ntohs(in->sin_port)))
+ goto cleanup;
+ /* curr itself is expected to be all 0xab bytes by now; presumably the */
+ memset(&tmp, 0xab, sizeof(tmp));
+ /* BPF prog wrote it (see "probe write" below); the prog is not visible here */
+ in = (struct sockaddr_in *)&curr;
+ if (CHECK(memcmp(&curr, &tmp, sizeof(tmp)), "check_kprobe_res",
+ "wrong kprobe res from probe write: %s:%u\n",
+ inet_ntoa(in->sin_addr), ntohs(in->sin_port)))
+ goto cleanup;
+cleanup:
+ bpf_link__destroy(kprobe_link);
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
new file mode 100644
index 000000000..935a294f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/prog_run_xattr.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+void test_prog_run_xattr(void)
+{
+ const char *obj_path = "./test_pkt_access.o";
+ struct bpf_object *bpf_obj;
+ char out_buf[10];
+ int rc;
+ struct bpf_prog_test_run_attr tattr = {
+ .repeat = 1,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .data_out = out_buf,
+ .data_size_out = 5,
+ };
+ /* Exercise how a test run treats the output buffer and its size hint. */
+ rc = bpf_prog_load(obj_path, BPF_PROG_TYPE_SCHED_CLS, &bpf_obj,
+ &tattr.prog_fd);
+ if (CHECK_ATTR(rc, "load", "err %d errno %d\n", rc, errno))
+ return;
+
+ memset(out_buf, 0, sizeof(out_buf));
+ /* 5-byte output hint: the run must report -1/ENOSPC with retval 0 */
+ rc = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(!(rc == -1 && errno == ENOSPC && !tattr.retval), "run",
+ "err %d errno %d retval %d\n", rc, errno, tattr.retval);
+ /* ...while still reporting the full packet size back */
+ CHECK_ATTR(tattr.data_size_out != sizeof(pkt_v4), "data_size_out",
+ "incorrect output size, want %zu have %u\n",
+ sizeof(pkt_v4), tattr.data_size_out);
+ /* ...and leaving the byte just past the 5-byte hint untouched */
+ CHECK_ATTR(out_buf[5] != 0, "overflow",
+ "BPF_PROG_TEST_RUN ignored size hint\n");
+
+ tattr.data_out = NULL;
+ tattr.data_size_out = 0;
+ errno = 0;
+ /* no output buffer at all: a plain successful run is expected */
+ rc = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(rc || errno || tattr.retval, "run_no_output",
+ "err %d errno %d retval %d\n", rc, errno, tattr.retval);
+ /* NULL buffer combined with a non-zero size must yield -EINVAL */
+ tattr.data_size_out = 1;
+ rc = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(rc != -EINVAL, "run_wrong_size_out", "err %d\n", rc);
+
+ bpf_object__close(bpf_obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
new file mode 100644
index 000000000..f47e7b1cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/queue_stack_map.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+enum { /* map flavour selected by test_queue_stack_map_by_type() */
+ QUEUE, /* loads ./test_queue_map.o; values are expected back in insertion order */
+ STACK, /* loads ./test_stack_map.o; values are expected back in reverse order */
+};
+
+static void test_queue_stack_map_by_type(int type)
+{
+ const int MAP_SIZE = 32;
+ __u32 samples[MAP_SIZE], duration, prog_ret, out_len, cur;
+ int n, src, rc, prog_fd, map_in_fd, map_out_fd;
+ char obj_path[32], pkt_out[128];
+ struct bpf_object *bpf_obj;
+ struct iphdr *ip = (void *)pkt_out + sizeof(struct ethhdr);
+
+ /* Random values that will travel through the maps */
+ for (n = 0; n < MAP_SIZE; n++)
+ samples[n] = rand();
+
+ /* Only the two known flavours are handled; anything else returns */
+ if (type != QUEUE && type != STACK)
+ return;
+ strncpy(obj_path,
+ type == QUEUE ? "./test_queue_map.o" : "./test_stack_map.o",
+ sizeof(obj_path));
+
+ rc = bpf_prog_load(obj_path, BPF_PROG_TYPE_SCHED_CLS, &bpf_obj, &prog_fd);
+ if (CHECK_FAIL(rc))
+ return;
+
+ map_in_fd = bpf_find_map(__func__, bpf_obj, "map_in");
+ if (map_in_fd < 0)
+ goto out;
+
+ map_out_fd = bpf_find_map(__func__, bpf_obj, "map_out");
+ if (map_out_fd < 0)
+ goto out;
+
+ /* Preload the input map with all MAP_SIZE values */
+ for (n = 0; n < MAP_SIZE; n++) {
+ rc = bpf_map_update_elem(map_in_fd, NULL, &samples[n], 0);
+ if (CHECK_FAIL(rc))
+ goto out;
+ }
+
+ /* Per run, the eBPF program pushes iph.saddr to the output map,
+ * pops one value from the input map and stores it in iph.daddr.
+ */
+ for (n = 0; n < MAP_SIZE; n++) {
+ /* a queue is expected to pop in insertion order, a stack in
+ * reverse; src is the index of the value expected next
+ */
+ src = (type == QUEUE) ? n : MAP_SIZE - 1 - n;
+ cur = samples[src];
+ pkt_v4.iph.saddr = samples[src] * 5;
+
+ rc = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ pkt_out, &out_len, &prog_ret, &duration);
+ /* stop at the first mismatch; it is reported right after the loop */
+ if (rc || prog_ret || out_len != sizeof(pkt_v4) ||
+ ip->daddr != cur)
+ break;
+ }
+
+ CHECK(rc || prog_ret || out_len != sizeof(pkt_v4) ||
+ ip->daddr != cur, "bpf_map_pop_elem",
+ "err %d errno %d retval %d size %d iph->daddr %u\n",
+ rc, errno, prog_ret, out_len, ip->daddr);
+
+ /* Input map is drained now: the program should return TC_ACT_SHOT */
+ rc = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ pkt_out, &out_len, &prog_ret, &duration);
+ CHECK(rc || prog_ret != 2 /* TC_ACT_SHOT */ ||
+ out_len != sizeof(pkt_v4), "check-queue-stack-map-empty",
+ "err %d errno %d retval %d size %d\n",
+ rc, errno, prog_ret, out_len);
+
+ /* Drain the output map; entry n must be five times samples[n] */
+ for (n = 0; n < MAP_SIZE; n++) {
+ rc = bpf_map_lookup_and_delete_elem(map_out_fd, NULL, &cur);
+ if (rc || cur != samples[n] * 5)
+ break;
+ }
+
+ CHECK(n != MAP_SIZE && (rc || cur != samples[n] * 5),
+ "bpf_map_push_elem", "err %d value %u\n", rc, cur);
+
+out:
+ pkt_v4.iph.saddr = 0;
+ bpf_object__close(bpf_obj);
+}
+
+void test_queue_stack_map(void)
+{
+ for (int kind = QUEUE; kind <= STACK; kind++) /* QUEUE first, then STACK */
+ test_queue_stack_map_by_type(kind);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
new file mode 100644
index 000000000..c5fb19187
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_test_run.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#include <test_progs.h>
+#include <linux/bpf.h>
+#include "bpf/libbpf_internal.h"
+#include "test_raw_tp_test_run.skel.h"
+
+static int duration;
+
+void test_raw_tp_test_run(void)
+{
+ struct bpf_prog_test_run_attr test_attr = {};
+ int comm_fd = -1, err, nr_online, i, prog_fd;
+ __u64 args[2] = {0x1234ULL, 0x5678ULL};
+ int expected_retval = 0x1234 + 0x5678;
+ struct test_raw_tp_test_run *skel;
+ char buf[] = "new_name";
+ bool *online = NULL;
+ DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .ctx_in = args,
+ .ctx_size_in = sizeof(args),
+ .flags = BPF_F_TEST_RUN_ON_CPU,
+ );
+
+ err = parse_cpu_mask_file("/sys/devices/system/cpu/online", &online,
+ &nr_online);
+ if (CHECK(err, "parse_cpu_mask_file", "err %d\n", err))
+ return;
+
+ skel = test_raw_tp_test_run__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+ goto cleanup;
+
+ err = test_raw_tp_test_run__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+ /* Writing to /proc/self/comm renames the task, which should bump count */
+ comm_fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+ if (CHECK(comm_fd < 0, "open /proc/self/comm", "err %d\n", errno))
+ goto cleanup;
+
+ err = write(comm_fd, buf, sizeof(buf));
+ CHECK(err < 0, "task rename", "err %d\n", errno);
+
+ CHECK(skel->bss->count == 0, "check_count", "didn't increase\n");
+ CHECK(skel->data->on_cpu != 0xffffffff, "check_on_cpu", "got wrong value\n");
+ /* A ctx holding only one of the two u64 args must be rejected */
+ prog_fd = bpf_program__fd(skel->progs.rename);
+ test_attr.prog_fd = prog_fd;
+ test_attr.ctx_in = args;
+ test_attr.ctx_size_in = sizeof(__u64);
+
+ err = bpf_prog_test_run_xattr(&test_attr);
+ CHECK(err == 0, "test_run", "should fail for too small ctx\n");
+ /* With the full ctx the run must succeed and return the sum of both args */
+ test_attr.ctx_size_in = sizeof(args);
+ err = bpf_prog_test_run_xattr(&test_attr);
+ CHECK(err < 0, "test_run", "err %d\n", errno);
+ CHECK(test_attr.retval != expected_retval, "check_retval",
+ "expect 0x%x, got 0x%x\n", expected_retval, test_attr.retval);
+ /* Run once per online CPU and check which CPU the prog recorded */
+ for (i = 0; i < nr_online; i++) {
+ if (!online[i])
+ continue;
+
+ opts.cpu = i;
+ opts.retval = 0;
+ err = bpf_prog_test_run_opts(prog_fd, &opts);
+ CHECK(err < 0, "test_run_opts", "err %d\n", errno);
+ CHECK(skel->data->on_cpu != i, "check_on_cpu",
+ "expect %d got %d\n", i, skel->data->on_cpu);
+ CHECK(opts.retval != expected_retval,
+ "check_retval", "expect 0x%x, got 0x%x\n",
+ expected_retval, opts.retval);
+ }
+
+ /* invalid cpu ID should fail with ENXIO */
+ opts.cpu = 0xffffffff;
+ err = bpf_prog_test_run_opts(prog_fd, &opts);
+ CHECK(err != -1 || errno != ENXIO,
+ "test_run_opts_fail",
+ "should fail with ENXIO\n");
+
+ /* non-zero cpu w/o BPF_F_TEST_RUN_ON_CPU should fail with EINVAL */
+ opts.cpu = 1;
+ opts.flags = 0;
+ err = bpf_prog_test_run_opts(prog_fd, &opts);
+ CHECK(err != -1 || errno != EINVAL,
+ "test_run_opts_fail",
+ "should fail with EINVAL\n");
+
+cleanup:
+ close(comm_fd);
+ test_raw_tp_test_run__destroy(skel);
+ free(online);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c
new file mode 100644
index 000000000..9807336a3
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_reject_nbd_invalid.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <linux/nbd.h>
+
+void test_raw_tp_writable_reject_nbd_invalid(void)
+{
+ __u32 duration = 0;
+ char error[4096];
+ int bpf_fd = -1, tp_fd = -1;
+ /* Loads a prog that reads past struct nbd_request; attaching it must fail. */
+ const struct bpf_insn program[] = {
+ /* r6 is our tp buffer */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ /* one byte beyond the end of the nbd_request struct */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6,
+ sizeof(struct nbd_request)),
+ BPF_EXIT_INSN(),
+ };
+
+ struct bpf_load_program_attr load_attr = {
+ .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ .license = "GPL v2",
+ .insns = program,
+ .insns_cnt = sizeof(program) / sizeof(struct bpf_insn),
+ .log_level = 2,
+ };
+
+ bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+ if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable load",
+ "failed: %d errno %d\n", bpf_fd, errno))
+ return;
+
+ /* Attach must fail; an fd to close exists only if it wrongly succeeds */
+ tp_fd = bpf_raw_tracepoint_open("nbd_send_request", bpf_fd);
+ if (CHECK(tp_fd >= 0, "bpf_raw_tracepoint_writable open",
+ "erroneously succeeded\n"))
+ close(tp_fd);
+
+ /* the program fd is released on every path past a successful load */
+ close(bpf_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
new file mode 100644
index 000000000..5c45424ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_run.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <test_progs.h>
+#include <linux/nbd.h>
+
+void test_raw_tp_writable_test_run(void)
+{
+ __u32 duration = 0;
+ char error[4096];
+ /* Tracepoint prog: r6 = ctx[0]; read a word at r6+0; store 42 there; exit */
+ const struct bpf_insn trace_program[] = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+
+ struct bpf_load_program_attr load_attr = {
+ .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ .license = "GPL v2",
+ .insns = trace_program,
+ .insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+ .log_level = 2,
+ };
+
+ int bpf_fd = bpf_load_program_xattr(&load_attr, error, sizeof(error));
+ if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
+ "failed: %d errno %d\n", bpf_fd, errno))
+ return;
+ /* Socket-filter prog that just returns 0; it is what gets test-run below */
+ const struct bpf_insn skb_program[] = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ };
+
+ struct bpf_load_program_attr skb_load_attr = {
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .license = "GPL v2",
+ .insns = skb_program,
+ .insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
+ };
+
+ int filter_fd =
+ bpf_load_program_xattr(&skb_load_attr, error, sizeof(error));
+ if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
+ filter_fd, errno))
+ goto out_bpffd;
+ /* Attach the writable prog to the "bpf_test_finish" raw tracepoint */
+ int tp_fd = bpf_raw_tracepoint_open("bpf_test_finish", bpf_fd);
+ if (CHECK(tp_fd < 0, "bpf_raw_tracepoint_writable opened",
+ "failed: %d errno %d\n", tp_fd, errno))
+ goto out_filterfd;
+
+ char test_skb[128] = {
+ 0,
+ };
+ /* While attached, the test run itself must come back as 42 */
+ __u32 prog_ret;
+ int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
+ 0, &prog_ret, 0);
+ CHECK(err != 42, "test_run",
+ "tracepoint did not modify return value\n");
+ CHECK(prog_ret != 0, "test_run_ret",
+ "socket_filter did not return 0\n");
+ /* Detach by closing the tracepoint fd, then expect a plain 0 again */
+ close(tp_fd);
+
+ err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
+ &prog_ret, 0);
+ CHECK(err != 0, "test_run_notrace",
+ "test_run failed with %d errno %d\n", err, errno);
+ CHECK(prog_ret != 0, "test_run_ret_notrace",
+ "socket_filter did not return 0\n");
+
+out_filterfd:
+ close(filter_fd);
+out_bpffd:
+ close(bpf_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
new file mode 100644
index 000000000..563e12120
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/rdonly_maps.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+struct bss { /* value read from / written to the "test_rdo.bss" map; presumably mirrors the prog's globals */
+ unsigned did_run; /* must be non-zero after the probe was triggered */
+ unsigned iters; /* compared against exp_iters of the subtest */
+ unsigned sum; /* compared against exp_sum of the subtest */
+};
+
+struct rdonly_map_subtest { /* one table entry per subtest of test_rdonly_maps() */
+ const char *subtest_name; /* passed to test__start_subtest() */
+ const char *prog_name; /* program looked up in the object by name */
+ unsigned exp_iters; /* expected bss.iters after one run */
+ unsigned exp_sum; /* expected bss.sum after one run */
+};
+
+void test_rdonly_maps(void)
+{
+ const char *obj_path = "test_rdonly_maps.o";
+ struct rdonly_map_subtest cases[] = {
+ { "skip loop", "skip_loop", 0, 0 },
+ { "part loop", "part_loop", 3, 2 + 3 + 4 },
+ { "full loop", "full_loop", 4, 2 + 3 + 4 + 5 },
+ };
+ int n, rc, map_fd, key = 0, duration = 0;
+ struct bpf_link *link = NULL;
+ struct bpf_program *prog;
+ struct bpf_map *data_map;
+ struct bpf_object *bpf_obj;
+ struct bss res;
+
+ bpf_obj = bpf_object__open_file(obj_path, NULL);
+ if (CHECK(IS_ERR(bpf_obj), "obj_open", "err %ld\n", PTR_ERR(bpf_obj)))
+ return;
+
+ rc = bpf_object__load(bpf_obj);
+ if (CHECK(rc, "obj_load", "err %d errno %d\n", rc, errno))
+ goto cleanup;
+
+ data_map = bpf_object__find_map_by_name(bpf_obj, "test_rdo.bss");
+ if (CHECK(!data_map, "find_bss_map", "failed\n"))
+ goto cleanup;
+ map_fd = bpf_map__fd(data_map);
+ for (n = 0; n < ARRAY_SIZE(cases); n++) {
+ const struct rdonly_map_subtest *tc = &cases[n];
+
+ if (!test__start_subtest(tc->subtest_name))
+ continue;
+
+ prog = bpf_object__find_program_by_name(bpf_obj, tc->prog_name);
+ if (CHECK(!prog, "find_prog", "prog '%s' not found\n",
+ tc->prog_name))
+ goto cleanup;
+ /* start every subtest from zeroed results */
+ memset(&res, 0, sizeof(res));
+ rc = bpf_map_update_elem(map_fd, &key, &res, 0);
+ if (CHECK(rc, "set_bss", "failed to set bss data: %d\n", rc))
+ goto cleanup;
+
+ link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
+ if (CHECK(IS_ERR(link), "attach_prog", "prog '%s', err %ld\n",
+ tc->prog_name, PTR_ERR(link))) {
+ link = NULL;
+ goto cleanup;
+ }
+
+ /* any syscall fires sys_enter; usleep() is the one used here */
+ usleep(1);
+ /* detach before reading the results back */
+ bpf_link__destroy(link);
+ link = NULL;
+
+ rc = bpf_map_lookup_elem(map_fd, &key, &res);
+ if (CHECK(rc, "get_bss", "failed to get bss data: %d\n", rc))
+ goto cleanup;
+ if (CHECK(res.did_run == 0, "check_run",
+ "prog '%s' didn't run?\n", tc->prog_name))
+ goto cleanup;
+ if (CHECK(res.iters != tc->exp_iters, "check_iters",
+ "prog '%s' iters: %d, expected: %d\n",
+ tc->prog_name, res.iters, tc->exp_iters))
+ goto cleanup;
+ if (CHECK(res.sum != tc->exp_sum, "check_sum",
+ "prog '%s' sum: %d, expected: %d\n",
+ tc->prog_name, res.sum, tc->exp_sum))
+ goto cleanup;
+ }
+
+cleanup:
+ bpf_link__destroy(link);
+ bpf_object__close(bpf_obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/reference_tracking.c b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
new file mode 100644
index 000000000..ac1ee10cf
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/reference_tracking.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+void test_reference_tracking(void)
+{
+ const char *obj_path = "test_sk_lookup_kern.o";
+ const char *obj_name = "ref_track";
+ DECLARE_LIBBPF_OPTS(bpf_object_open_opts, open_opts,
+ .object_name = obj_name,
+ .relaxed_maps = true,
+ );
+ struct bpf_object *bpf_obj;
+ struct bpf_program *prog;
+ __u32 duration = 0;
+ int rc = 0;
+
+ bpf_obj = bpf_object__open_file(obj_path, &open_opts);
+ if (CHECK_FAIL(IS_ERR(bpf_obj)))
+ return;
+ /* the object must carry the name requested through open_opts */
+ if (CHECK(strcmp(bpf_object__name(bpf_obj), obj_name), "obj_name",
+ "wrong obj name '%s', expected '%s'\n",
+ bpf_object__name(bpf_obj), obj_name))
+ goto cleanup;
+
+ bpf_object__for_each_program(prog, bpf_obj) {
+ libbpf_print_fn_t saved_print = NULL;
+ const char *title;
+ bool must_fail;
+
+ /* Skip .text sections and subtests that are not selected */
+ title = bpf_program__section_name(prog);
+ if (strstr(title, ".text") || !test__start_subtest(title))
+ continue;
+
+ /* "fail" in the title: the load must be rejected. libbpf output
+ * is muted around that load and the result is inverted.
+ */
+ must_fail = strstr(title, "fail") != NULL;
+ if (must_fail)
+ saved_print = libbpf_set_print(NULL);
+ rc = bpf_program__load(prog, "GPL", 0);
+ if (must_fail)
+ libbpf_set_print(saved_print);
+ rc = must_fail ? !rc : rc;
+
+ CHECK(rc, title, "\n");
+ }
+
+cleanup:
+ bpf_object__close(bpf_obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
new file mode 100644
index 000000000..6ace5e9ef
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/resolve_btfids.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/err.h>
+#include <string.h>
+#include <bpf/btf.h>
+#include <bpf/libbpf.h>
+#include <linux/btf.h>
+#include <linux/kernel.h>
+#define CONFIG_DEBUG_INFO_BTF
+#include <linux/btf_ids.h>
+#include "test_progs.h"
+
+static int duration;
+
+struct symbol { /* one expected BTF symbol */
+ const char *name; /* name compared against the BTF type name */
+ int type; /* BTF_KIND_* the type must have */
+ int id; /* resolved BTF type ID; -1 until __resolve_symbol() fills it in */
+};
+
+struct symbol test_symbols[] = { /* index i is compared with entry i of both BTF_ID lists */
+ { "unused", BTF_KIND_UNKN, 0 }, /* id preset to 0, so it is never resolved */
+ { "S", BTF_KIND_TYPEDEF, -1 },
+ { "T", BTF_KIND_TYPEDEF, -1 },
+ { "U", BTF_KIND_TYPEDEF, -1 },
+ { "S", BTF_KIND_STRUCT, -1 }, /* same name as the typedef above, different kind */
+ { "U", BTF_KIND_UNION, -1 },
+ { "func", BTF_KIND_FUNC, -1 },
+};
+
+/* Align the .BTF_ids section to 4 bytes */
+asm (
+".pushsection " BTF_IDS_SECTION " ,\"a\"; \n"
+".balign 4, 0; \n"
+".popsection; \n");
+
+BTF_ID_LIST(test_list_local) /* entries in the same order as test_symbols[] */
+BTF_ID_UNUSED /* lines up with the "unused" entry at index 0 */
+BTF_ID(typedef, S)
+BTF_ID(typedef, T)
+BTF_ID(typedef, U)
+BTF_ID(struct, S)
+BTF_ID(union, U)
+BTF_ID(func, func)
+
+extern __u32 test_list_global[];
+BTF_ID_LIST_GLOBAL(test_list_global) /* same entries, declared through the _GLOBAL macro */
+BTF_ID_UNUSED
+BTF_ID(typedef, S)
+BTF_ID(typedef, T)
+BTF_ID(typedef, U)
+BTF_ID(struct, S)
+BTF_ID(union, U)
+BTF_ID(func, func)
+
+BTF_SET_START(test_set) /* same symbols without the unused slot; checked for membership and order */
+BTF_ID(typedef, S)
+BTF_ID(typedef, T)
+BTF_ID(typedef, U)
+BTF_ID(struct, S)
+BTF_ID(union, U)
+BTF_ID(func, func)
+BTF_SET_END(test_set)
+
+static int
+__resolve_symbol(struct btf *btf, int type_id)
+{
+ const struct btf_type *t;
+ const char *name;
+ unsigned int idx;
+ /* Record type_id in every still-unresolved test_symbols[] entry it matches. */
+ t = btf__type_by_id(btf, type_id);
+ if (!t) {
+ PRINT_FAIL("Failed to get type for ID %d\n", type_id);
+ return -1;
+ }
+
+ for (idx = 0; idx < ARRAY_SIZE(test_symbols); idx++) {
+ struct symbol *sym = &test_symbols[idx];
+
+ /* skip entries already resolved or of a different BTF kind */
+ if (sym->id != -1 || BTF_INFO_KIND(t->info) != sym->type)
+ continue;
+
+ name = btf__name_by_offset(btf, t->name_off);
+ if (!name) {
+ PRINT_FAIL("Failed to get name for BTF ID %d\n", type_id);
+ return -1;
+ }
+
+ if (strcmp(name, sym->name) == 0)
+ sym->id = type_id;
+ }
+
+ return 0;
+}
+
+static int resolve_symbols(void)
+{
+ struct btf *btf;
+ int type_id, err = 0;
+ __u32 nr;
+ /* Fill test_symbols[].id from btf_data.o; returns 0 on success, -1 on failure */
+ btf = btf__parse_elf("btf_data.o", NULL);
+ if (CHECK(libbpf_get_error(btf), "resolve",
+ "Failed to load BTF from btf_data.o\n"))
+ return -1;
+
+ nr = btf__get_nr_types(btf);
+
+ /* Stop at the first type that cannot be resolved and hand the error
+ * back to the caller instead of reporting success. */
+ for (type_id = 1; type_id <= nr && !err; type_id++)
+ err = __resolve_symbol(btf, type_id);
+
+ btf__free(btf);
+ return err;
+}
+
+int test_resolve_btfids(void)
+{
+ __u32 *ids, *lists[] = { test_list_local, test_list_global };
+ unsigned int k, s;
+ int ret = 0;
+
+ if (resolve_symbols())
+ return -1;
+
+ /* Both ID lists, BTF_ID_LIST(test_list_local) and
+ * BTF_ID_LIST_GLOBAL(test_list_global), must match test_symbols[]
+ */
+ for (s = 0; s < ARRAY_SIZE(lists); s++) {
+ ids = lists[s];
+ for (k = 0; k < ARRAY_SIZE(test_symbols); k++) {
+ ret = CHECK(ids[k] != test_symbols[k].id,
+ "id_check",
+ "wrong ID for %s (%d != %d)\n",
+ test_symbols[k].name,
+ ids[k], test_symbols[k].id);
+ if (ret)
+ return ret;
+ }
+ }
+
+ /* Every ID in BTF_SET_START(test_set) must be a known symbol */
+ for (k = 0; k < test_set.cnt; k++) {
+ bool found = false;
+
+ for (s = 0; s < ARRAY_SIZE(test_symbols); s++) {
+ if (test_symbols[s].id == test_set.ids[k]) {
+ found = true;
+ break;
+ }
+ }
+
+ ret = CHECK(!found, "id_check",
+ "ID %d not found in test_symbols\n",
+ test_set.ids[k]);
+ if (ret)
+ break;
+
+ /* from the second ID on, no ID may be smaller than its predecessor */
+ if (k == 0)
+ continue;
+ ret = CHECK(test_set.ids[k - 1] > test_set.ids[k],
+ "sort_check", "test_set is not sorted\n");
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf.c b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
new file mode 100644
index 000000000..fddbc5db5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/compiler.h>
+#include <asm/barrier.h>
+#include <test_progs.h>
+#include <sys/mman.h>
+#include <sys/epoll.h>
+#include <time.h>
+#include <sched.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/sysinfo.h>
+#include <linux/perf_event.h>
+#include <linux/ring_buffer.h>
+#include "test_ringbuf.skel.h"
+
+#define EDONE 7777
+
+static int duration = 0;
+
+struct sample { /* record handed to process_sample(); presumably mirrors the BPF-side layout */
+ int pid; /* not read by the code visible here */
+ int seq; /* selects which check process_sample() applies */
+ long value; /* payload compared against 333 / 777 */
+ char comm[16]; /* not read by the code visible here */
+};
+
+static int sample_cnt;
+
+static void atomic_inc(int *cnt)
+{
+ (void)__atomic_fetch_add(cnt, 1, __ATOMIC_SEQ_CST); /* old value unused */
+}
+
+static int atomic_xchg(int *cnt, int val)
+{
+ return __atomic_exchange_n(cnt, val, __ATOMIC_SEQ_CST); /* stores val, returns the previous *cnt */
+}
+
+static int process_sample(void *ctx, void *data, size_t len)
+{
+ struct sample *smp = data;
+
+ /* every delivered sample is counted, whatever its seq */
+ atomic_inc(&sample_cnt);
+
+ if (smp->seq == 0) {
+ CHECK(smp->value != 333, "sample1_value", "exp %ld, got %ld\n",
+ 333L, smp->value);
+ } else if (smp->seq == 1) {
+ CHECK(smp->value != 777, "sample2_value", "exp %ld, got %ld\n",
+ 777L, smp->value);
+ /* -EDONE is the marker the caller checks for after the second sample */
+ return -EDONE;
+ }
+
+ /* seq 0 and every seq other than 1 return 0 */
+ return 0;
+}
+
+static struct test_ringbuf *skel;
+static struct ring_buffer *ringbuf;
+
+static void trigger_samples(void) /* (void): a real prototype, not an unprototyped () */
+{
+ skel->bss->dropped = 0;
+ skel->bss->total = 0;
+ skel->bss->discarded = 0;
+
+ /* trigger exactly two samples */
+ skel->bss->value = 333;
+ syscall(__NR_getpgid);
+ skel->bss->value = 777;
+ syscall(__NR_getpgid);
+}
+
+static void *poll_thread(void *input)
+{
+ /* input carries the timeout; the poll result travels back as a pointer */
+ long poll_ret = ring_buffer__poll(ringbuf, (long)input);
+ return (void *)poll_ret;
+}
+
+void test_ringbuf(void)
+{
+ const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample);
+ pthread_t thread;
+ long bg_ret = -1;
+ int err, cnt;
+ /* Checks ringbuf sample delivery, position counters and wakeup flags. */
+ skel = test_ringbuf__open_and_load();
+ if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n"))
+ return;
+
+ /* only trigger BPF program for current process */
+ skel->bss->pid = getpid();
+
+ ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf),
+ process_sample, NULL, NULL);
+ if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n"))
+ goto cleanup;
+
+ err = test_ringbuf__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err))
+ goto cleanup;
+
+ trigger_samples();
+
+ /* 2 submitted + 1 discarded records */
+ CHECK(skel->bss->avail_data != 3 * rec_sz,
+ "err_avail_size", "exp %ld, got %ld\n",
+ 3L * rec_sz, skel->bss->avail_data);
+ CHECK(skel->bss->ring_size != 4096,
+ "err_ring_size", "exp %ld, got %ld\n",
+ 4096L, skel->bss->ring_size);
+ CHECK(skel->bss->cons_pos != 0,
+ "err_cons_pos", "exp %ld, got %ld\n",
+ 0L, skel->bss->cons_pos);
+ CHECK(skel->bss->prod_pos != 3 * rec_sz,
+ "err_prod_pos", "exp %ld, got %ld\n",
+ 3L * rec_sz, skel->bss->prod_pos);
+
+ /* poll for samples */
+ err = ring_buffer__poll(ringbuf, -1);
+
+ /* -EDONE is used as an indicator that we are done */
+ if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err))
+ goto cleanup;
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt);
+
+ /* we expect extra polling to return nothing */
+ err = ring_buffer__poll(ringbuf, 0);
+ if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err))
+ goto cleanup;
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt);
+
+ CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
+ 0L, skel->bss->dropped);
+ CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
+ 2L, skel->bss->total);
+ CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
+ 1L, skel->bss->discarded);
+
+ /* now validate consumer position is updated and returned */
+ trigger_samples();
+ CHECK(skel->bss->cons_pos != 3 * rec_sz,
+ "err_cons_pos", "exp %ld, got %ld\n",
+ 3L * rec_sz, skel->bss->cons_pos);
+ err = ring_buffer__poll(ringbuf, -1);
+ CHECK(err <= 0, "poll_err", "err %d\n", err);
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 2, "cnt", "exp %d samples, got %d\n", 2, cnt);
+
+ /* start poll in background w/ long timeout */
+ err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000);
+ if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err))
+ goto cleanup;
+
+ /* turn off notifications now */
+ skel->bss->flags = BPF_RB_NO_WAKEUP;
+
+ /* give background thread a bit of a time */
+ usleep(50000);
+ trigger_samples();
+ /* sleeping arbitrarily is bad, but no better way to know that
+ * epoll_wait() **DID NOT** unblock in background thread
+ */
+ usleep(50000);
+ /* background poll should still be blocked */
+ err = pthread_tryjoin_np(thread, (void **)&bg_ret);
+ if (CHECK(err != EBUSY, "try_join", "err %d\n", err))
+ goto cleanup;
+
+ /* BPF side did everything right */
+ CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
+ 0L, skel->bss->dropped);
+ CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
+ 2L, skel->bss->total);
+ CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
+ 1L, skel->bss->discarded);
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt);
+
+ /* clear flags to return to "adaptive" notification mode */
+ skel->bss->flags = 0;
+
+ /* produce new samples, no notification should be triggered, because
+ * consumer is now behind
+ */
+ trigger_samples();
+
+ /* background poll should still be blocked */
+ err = pthread_tryjoin_np(thread, (void **)&bg_ret);
+ if (CHECK(err != EBUSY, "try_join", "err %d\n", err))
+ goto cleanup;
+
+ /* still no samples, because consumer is behind */
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 0, "cnt", "exp %d samples, got %d\n", 0, cnt);
+
+ skel->bss->dropped = 0;
+ skel->bss->total = 0;
+ skel->bss->discarded = 0;
+
+ skel->bss->value = 333;
+ syscall(__NR_getpgid);
+ /* now force notifications */
+ skel->bss->flags = BPF_RB_FORCE_WAKEUP;
+ skel->bss->value = 777;
+ syscall(__NR_getpgid);
+
+ /* now we should get a pending notification */
+ usleep(50000);
+ err = pthread_tryjoin_np(thread, (void **)&bg_ret);
+ if (CHECK(err, "join_bg", "err %d\n", err))
+ goto cleanup;
+
+ if (CHECK(bg_ret <= 0, "bg_ret", "epoll_wait result: %ld\n", bg_ret))
+ goto cleanup;
+
+ /* due to timing variations, there could still be non-notified
+ * samples, so consume them here to collect all the samples
+ */
+ err = ring_buffer__consume(ringbuf);
+ CHECK(err < 0, "rb_consume", "failed: %d\n", err);
+
+ /* 3 rounds, 2 samples each */
+ cnt = atomic_xchg(&sample_cnt, 0);
+ CHECK(cnt != 6, "cnt", "exp %d samples, got %d\n", 6, cnt);
+
+ /* BPF side did everything right */
+ CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
+ 0L, skel->bss->dropped);
+ CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
+ 2L, skel->bss->total);
+ CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
+ 1L, skel->bss->discarded);
+
+ test_ringbuf__detach(skel);
+cleanup:
+ ring_buffer__free(ringbuf);
+ test_ringbuf__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
new file mode 100644
index 000000000..d37161e59
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_multi.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include <sys/epoll.h>
+#include "test_ringbuf_multi.skel.h"
+
+static int duration = 0;
+
+/* Layout of one record handed to process_sample() by the ring buffers. */
+struct sample {
+ int pid;
+ int seq; /* process_sample() accepts only 0 (first record) and 1 (second record) */
+ long value; /* payload; checked against 333 (seq 0) and 777 (seq 1) */
+ char comm[16];
+};
+
+/*
+ * Ring buffer callback shared by both rings.
+ *
+ * @ctx carries the ring number (1 or 2) that was passed alongside this
+ * callback when the ring was registered. The record with seq 0 must come
+ * from ring 1 with value 333, the record with seq 1 from ring 2 with value
+ * 777. Any other sequence number is reported and rejected with -1.
+ */
+static int process_sample(void *ctx, void *data, size_t len)
+{
+	const struct sample *rec = data;
+	int ring_id = (unsigned long)ctx;
+
+	if (rec->seq == 0) {
+		CHECK(ring_id != 1, "sample1_ring", "exp %d, got %d\n", 1, ring_id);
+		CHECK(rec->value != 333, "sample1_value", "exp %ld, got %ld\n",
+		      333L, rec->value);
+		return 0;
+	}
+
+	if (rec->seq == 1) {
+		CHECK(ring_id != 2, "sample2_ring", "exp %d, got %d\n", 2, ring_id);
+		CHECK(rec->value != 777, "sample2_value", "exp %ld, got %ld\n",
+		      777L, rec->value);
+		return 0;
+	}
+
+	/* anything beyond the two expected records is an error */
+	CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n",
+	      rec->seq, rec->value);
+	return -1;
+}
+
+/* Select @ring as the target, store @value and fire the getpgid syscall. */
+static void ringbuf_multi_trigger(struct test_ringbuf_multi *skel, int ring,
+				  long value)
+{
+	skel->bss->target_ring = ring;
+	skel->bss->value = value;
+	syscall(__NR_getpgid);
+}
+
+/*
+ * Attach two ring buffer maps to a single ring_buffer manager, trigger three
+ * samples (targets 0, 1 and 2) and verify that exactly two records are
+ * delivered through process_sample() and that the counters kept in the
+ * skeleton's bss (dropped, skipped, total) have the expected values.
+ */
+void test_ringbuf_multi(void)
+{
+	struct test_ringbuf_multi *skel;
+	struct ring_buffer *rb;
+	int ret;
+
+	skel = test_ringbuf_multi__open_and_load();
+	if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n"))
+		return;
+
+	/* only trigger BPF program for current process */
+	skel->bss->pid = getpid();
+
+	/* first ring is tagged with context value 1 ... */
+	rb = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf1),
+			      process_sample, (void *)(long)1, NULL);
+	if (CHECK(!rb, "ringbuf_create", "failed to create ringbuf\n"))
+		goto cleanup;
+
+	/* ... and the second one with context value 2 */
+	ret = ring_buffer__add(rb, bpf_map__fd(skel->maps.ringbuf2),
+			       process_sample, (void *)(long)2);
+	if (CHECK(ret, "ringbuf_add", "failed to add another ring\n"))
+		goto cleanup;
+
+	ret = test_ringbuf_multi__attach(skel);
+	if (CHECK(ret, "skel_attach", "skeleton attachment failed: %d\n", ret))
+		goto cleanup;
+
+	/* trigger few samples, some will be skipped */
+	ringbuf_multi_trigger(skel, 0, 333);
+	/* skipped, no ringbuf in slot 1 */
+	ringbuf_multi_trigger(skel, 1, 555);
+	ringbuf_multi_trigger(skel, 2, 777);
+
+	/* poll for samples, should get 2 ringbufs back */
+	ret = ring_buffer__poll(rb, -1);
+	if (CHECK(ret != 2, "poll_res", "expected 2 records, got %d\n", ret))
+		goto cleanup;
+
+	/* expect extra polling to return nothing */
+	ret = ring_buffer__poll(rb, 0);
+	if (CHECK(ret < 0, "extra_samples", "poll result: %d\n", ret))
+		goto cleanup;
+
+	CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
+	      0L, skel->bss->dropped);
+	CHECK(skel->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n",
+	      1L, skel->bss->skipped);
+	CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
+	      2L, skel->bss->total);
+
+cleanup:
+	ring_buffer__free(rb);
+	test_ringbuf_multi__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/section_names.c b/tools/testing/selftests/bpf/prog_tests/section_names.c
new file mode 100644
index 000000000..8b571890c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/section_names.c
@@ -0,0 +1,215 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+#include <test_progs.h>
+
+static int duration = 0;
+
+/*
+ * One table entry: an ELF section name together with the results expected
+ * from libbpf_prog_type_by_name() and libbpf_attach_type_by_name().
+ */
+struct sec_name_test {
+ const char sec_name[32];
+ /* expected outcome of libbpf_prog_type_by_name(); the types are only compared when rc == 0 */
+ struct {
+ int rc;
+ enum bpf_prog_type prog_type;
+ enum bpf_attach_type expected_attach_type;
+ } expected_load;
+ /* expected outcome of libbpf_attach_type_by_name(); attach_type is only compared when rc == 0 */
+ struct {
+ int rc;
+ enum bpf_attach_type attach_type;
+ } expected_attach;
+};
+
+/*
+ * Section names under test. Each entry is
+ *   { sec_name, { load rc, prog_type, expected_attach_type },
+ *               { attach rc, attach_type } }
+ * and is run through both test_prog_type_by_name() and
+ * test_attach_type_by_name() by test_section_names().
+ */
+static struct sec_name_test tests[] = {
+ {"InvAliD", {-ESRCH, 0, 0}, {-EINVAL, 0} },
+ {"cgroup", {-ESRCH, 0, 0}, {-EINVAL, 0} },
+ {"socket", {0, BPF_PROG_TYPE_SOCKET_FILTER, 0}, {-EINVAL, 0} },
+ {"kprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+ {"uprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+ {"kretprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+ {"uretprobe/", {0, BPF_PROG_TYPE_KPROBE, 0}, {-EINVAL, 0} },
+ {"classifier", {0, BPF_PROG_TYPE_SCHED_CLS, 0}, {-EINVAL, 0} },
+ {"action", {0, BPF_PROG_TYPE_SCHED_ACT, 0}, {-EINVAL, 0} },
+ {"tracepoint/", {0, BPF_PROG_TYPE_TRACEPOINT, 0}, {-EINVAL, 0} },
+ {"tp/", {0, BPF_PROG_TYPE_TRACEPOINT, 0}, {-EINVAL, 0} },
+ {
+ "raw_tracepoint/",
+ {0, BPF_PROG_TYPE_RAW_TRACEPOINT, 0},
+ {-EINVAL, 0},
+ },
+ {"raw_tp/", {0, BPF_PROG_TYPE_RAW_TRACEPOINT, 0}, {-EINVAL, 0} },
+ {"xdp", {0, BPF_PROG_TYPE_XDP, BPF_XDP}, {0, BPF_XDP} },
+ {"perf_event", {0, BPF_PROG_TYPE_PERF_EVENT, 0}, {-EINVAL, 0} },
+ {"lwt_in", {0, BPF_PROG_TYPE_LWT_IN, 0}, {-EINVAL, 0} },
+ {"lwt_out", {0, BPF_PROG_TYPE_LWT_OUT, 0}, {-EINVAL, 0} },
+ {"lwt_xmit", {0, BPF_PROG_TYPE_LWT_XMIT, 0}, {-EINVAL, 0} },
+ {"lwt_seg6local", {0, BPF_PROG_TYPE_LWT_SEG6LOCAL, 0}, {-EINVAL, 0} },
+ {
+ "cgroup_skb/ingress",
+ {0, BPF_PROG_TYPE_CGROUP_SKB, BPF_CGROUP_INET_INGRESS},
+ {0, BPF_CGROUP_INET_INGRESS},
+ },
+ {
+ "cgroup_skb/egress",
+ {0, BPF_PROG_TYPE_CGROUP_SKB, BPF_CGROUP_INET_EGRESS},
+ {0, BPF_CGROUP_INET_EGRESS},
+ },
+ {"cgroup/skb", {0, BPF_PROG_TYPE_CGROUP_SKB, 0}, {-EINVAL, 0} },
+ {
+ "cgroup/sock",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE},
+ {0, BPF_CGROUP_INET_SOCK_CREATE},
+ },
+ {
+ "cgroup/post_bind4",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND},
+ {0, BPF_CGROUP_INET4_POST_BIND},
+ },
+ {
+ "cgroup/post_bind6",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND},
+ {0, BPF_CGROUP_INET6_POST_BIND},
+ },
+ {
+ "cgroup/dev",
+ {0, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_CGROUP_DEVICE},
+ {0, BPF_CGROUP_DEVICE},
+ },
+ {
+ "sockops",
+ {0, BPF_PROG_TYPE_SOCK_OPS, BPF_CGROUP_SOCK_OPS},
+ {0, BPF_CGROUP_SOCK_OPS},
+ },
+ {
+ "sk_skb/stream_parser",
+ {0, BPF_PROG_TYPE_SK_SKB, BPF_SK_SKB_STREAM_PARSER},
+ {0, BPF_SK_SKB_STREAM_PARSER},
+ },
+ {
+ "sk_skb/stream_verdict",
+ {0, BPF_PROG_TYPE_SK_SKB, BPF_SK_SKB_STREAM_VERDICT},
+ {0, BPF_SK_SKB_STREAM_VERDICT},
+ },
+ {"sk_skb", {0, BPF_PROG_TYPE_SK_SKB, 0}, {-EINVAL, 0} },
+ {
+ "sk_msg",
+ {0, BPF_PROG_TYPE_SK_MSG, BPF_SK_MSG_VERDICT},
+ {0, BPF_SK_MSG_VERDICT},
+ },
+ {
+ "lirc_mode2",
+ {0, BPF_PROG_TYPE_LIRC_MODE2, BPF_LIRC_MODE2},
+ {0, BPF_LIRC_MODE2},
+ },
+ {
+ "flow_dissector",
+ {0, BPF_PROG_TYPE_FLOW_DISSECTOR, BPF_FLOW_DISSECTOR},
+ {0, BPF_FLOW_DISSECTOR},
+ },
+ {
+ "cgroup/bind4",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND},
+ {0, BPF_CGROUP_INET4_BIND},
+ },
+ {
+ "cgroup/bind6",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND},
+ {0, BPF_CGROUP_INET6_BIND},
+ },
+ {
+ "cgroup/connect4",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT},
+ {0, BPF_CGROUP_INET4_CONNECT},
+ },
+ {
+ "cgroup/connect6",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT},
+ {0, BPF_CGROUP_INET6_CONNECT},
+ },
+ {
+ "cgroup/sendmsg4",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG},
+ {0, BPF_CGROUP_UDP4_SENDMSG},
+ },
+ {
+ "cgroup/sendmsg6",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG},
+ {0, BPF_CGROUP_UDP6_SENDMSG},
+ },
+ {
+ "cgroup/recvmsg4",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG},
+ {0, BPF_CGROUP_UDP4_RECVMSG},
+ },
+ {
+ "cgroup/recvmsg6",
+ {0, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG},
+ {0, BPF_CGROUP_UDP6_RECVMSG},
+ },
+ {
+ "cgroup/sysctl",
+ {0, BPF_PROG_TYPE_CGROUP_SYSCTL, BPF_CGROUP_SYSCTL},
+ {0, BPF_CGROUP_SYSCTL},
+ },
+ {
+ "cgroup/getsockopt",
+ {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT},
+ {0, BPF_CGROUP_GETSOCKOPT},
+ },
+ {
+ "cgroup/setsockopt",
+ {0, BPF_PROG_TYPE_CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT},
+ {0, BPF_CGROUP_SETSOCKOPT},
+ },
+};
+
+/*
+ * Resolve test->sec_name with libbpf_prog_type_by_name() and compare the
+ * return code against test->expected_load.rc. When the lookup succeeded,
+ * also compare the reported program type and expected attach type.
+ */
+static void test_prog_type_by_name(const struct sec_name_test *test)
+{
+	enum bpf_attach_type got_attach;
+	enum bpf_prog_type got_prog;
+	int ret;
+
+	ret = libbpf_prog_type_by_name(test->sec_name, &got_prog, &got_attach);
+
+	CHECK(ret != test->expected_load.rc, "check_code",
+	      "prog: unexpected rc=%d for %s\n", ret, test->sec_name);
+
+	/* output parameters are only meaningful after a successful lookup */
+	if (ret == 0) {
+		CHECK(got_prog != test->expected_load.prog_type,
+		      "check_prog_type",
+		      "prog: unexpected prog_type=%d for %s\n",
+		      got_prog, test->sec_name);
+
+		CHECK(got_attach != test->expected_load.expected_attach_type,
+		      "check_attach_type",
+		      "prog: unexpected expected_attach_type=%d for %s\n",
+		      got_attach, test->sec_name);
+	}
+}
+
+/*
+ * Resolve test->sec_name with libbpf_attach_type_by_name() and compare the
+ * return code against test->expected_attach.rc. The attach type itself is
+ * compared only when the lookup succeeded.
+ */
+static void test_attach_type_by_name(const struct sec_name_test *test)
+{
+	enum bpf_attach_type got;
+	int ret = libbpf_attach_type_by_name(test->sec_name, &got);
+
+	CHECK(ret != test->expected_attach.rc, "check_ret",
+	      "attach: unexpected rc=%d for %s\n", ret, test->sec_name);
+
+	if (ret == 0)
+		CHECK(got != test->expected_attach.attach_type,
+		      "check_attach_type",
+		      "attach: unexpected attach_type=%d for %s\n",
+		      got, test->sec_name);
+}
+
+/* Run the prog-type and attach-type lookups for every entry of tests[]. */
+void test_section_names(void)
+{
+	struct sec_name_test *cur;
+
+	for (cur = tests; cur < tests + ARRAY_SIZE(tests); cur++) {
+		test_prog_type_by_name(cur);
+		test_attach_type_by_name(cur);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/select_reuseport.c b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
new file mode 100644
index 000000000..821b4146b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/select_reuseport.c
@@ -0,0 +1,879 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+#include <sys/types.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+
+#include "test_progs.h"
+#include "test_select_reuseport_common.h"
+
+#define MAX_TEST_NAME 80
+#define MIN_TCPHDR_LEN 20
+#define UDPHDR_LEN 8
+
+#define TCP_SYNCOOKIE_SYSCTL "/proc/sys/net/ipv4/tcp_syncookies"
+#define TCP_FO_SYSCTL "/proc/sys/net/ipv4/tcp_fastopen"
+#define REUSEPORT_ARRAY_SIZE 32
+
+/* fds of the maps looked up from the loaded object in prepare_bpf_obj() */
+static int result_map, tmp_index_ovr_map, linum_map, data_check_map;
+/* per-result counters the running subtest expects to read back from result_map */
+static __u32 expected_results[NR_RESULTS];
+/* server sockets opened by prepare_sk_fds(); sk_fds[i] is stored at key i of reuseport_array */
+static int sk_fds[REUSEPORT_ARRAY_SIZE];
+/* map fds created by create_maps(); -1 while not open (see cleanup()) */
+static int reuseport_array = -1, outer_map = -1;
+/* map type passed to the last create_maps() call; used in subtest names */
+static enum bpf_map_type inner_map_type;
+/* fd of the first program of the object, attached via SO_ATTACH_REUSEPORT_EBPF */
+static int select_by_skb_data_prog;
+/* original sysctl values; -1 means "nothing to restore" in restore_sysctls() */
+static int saved_tcp_syncookie = -1;
+static struct bpf_object *obj;
+static int saved_tcp_fo = -1;
+/* zero-valued key used for lookups/updates of element 0 */
+static __u32 index_zero;
+/* epoll instance watching every sk_fds[] entry; created in prepare_sk_fds() */
+static int epfd;
+
+/*
+ * Socket address that can hold either an IPv4 or an IPv6 endpoint; the
+ * family member selects which view is in use. srv_sa is the address the
+ * server sockets are bound to (filled in by prepare_sk_fds()).
+ */
+static union sa46 {
+ struct sockaddr_in6 v6;
+ struct sockaddr_in v4;
+ sa_family_t family;
+} srv_sa;
+
+/*
+ * If CHECK_FAIL(condition) is true, print @tag followed by the printf-style
+ * @format message and return from the enclosing void function.
+ * @tag and the format string must be string literals: they are pasted
+ * together at compile time.
+ * NOTE(review): CHECK_FAIL() comes from test_progs.h and presumably records
+ * the failure for the current test - confirm there.
+ */
+#define RET_IF(condition, tag, format...) ({ \
+ if (CHECK_FAIL(condition)) { \
+ printf(tag " " format); \
+ return; \
+ } \
+})
+
+/*
+ * Same as RET_IF() but for functions returning a value: on failure the
+ * enclosing function returns -1 after printing @tag and the message.
+ */
+#define RET_ERR(condition, tag, format...) ({ \
+ if (CHECK_FAIL(condition)) { \
+ printf(tag " " format); \
+ return -1; \
+ } \
+})
+
+/*
+ * Create the socket map of type @inner_type ("reuseport_array") and the
+ * single-entry array-of-maps ("outer_map") that uses it as inner map
+ * template. The resulting fds are stored in the reuseport_array and
+ * outer_map globals.
+ *
+ * Returns 0 on success, -1 if either map could not be created.
+ */
+static int create_maps(enum bpf_map_type inner_type)
+{
+	struct bpf_create_map_attr inner_attr = {
+		.name = "reuseport_array",
+		.map_type = inner_type,
+		.key_size = sizeof(__u32),
+		.value_size = sizeof(__u32),
+		.max_entries = REUSEPORT_ARRAY_SIZE,
+	};
+	struct bpf_create_map_attr outer_attr = {
+		.name = "outer_map",
+		.map_type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+		.key_size = sizeof(__u32),
+		.value_size = sizeof(__u32),
+		.max_entries = 1,
+	};
+
+	inner_map_type = inner_type;
+
+	/* Creating reuseport_array */
+	reuseport_array = bpf_create_map_xattr(&inner_attr);
+	RET_ERR(reuseport_array == -1, "creating reuseport_array",
+		"reuseport_array:%d errno:%d\n", reuseport_array, errno);
+
+	/* Creating outer_map, which needs the inner map fd as template */
+	outer_attr.inner_map_fd = reuseport_array;
+	outer_map = bpf_create_map_xattr(&outer_attr);
+	RET_ERR(outer_map == -1, "creating outer_map",
+		"outer_map:%d errno:%d\n", outer_map, errno);
+
+	return 0;
+}
+
+/*
+ * Open test_select_reuseport_kern.o, make its "outer_map" reuse the fd
+ * created by create_maps(), load the object and cache the fds of the first
+ * program and of the result/tmp_index_ovr/linum/data_check maps in the
+ * corresponding globals.
+ *
+ * Returns 0 on success, -1 on the first failing step (after reporting it).
+ */
+static int prepare_bpf_obj(void)
+{
+	struct bpf_program *prog;
+	struct bpf_map *map;
+	int err;
+
+	obj = bpf_object__open("test_select_reuseport_kern.o");
+	RET_ERR(IS_ERR_OR_NULL(obj), "open test_select_reuseport_kern.o",
+		"obj:%p PTR_ERR(obj):%ld\n", obj, PTR_ERR(obj));
+
+	/* must happen before load so the object uses our pre-created map */
+	map = bpf_object__find_map_by_name(obj, "outer_map");
+	RET_ERR(!map, "find outer_map", "!map\n");
+	err = bpf_map__reuse_fd(map, outer_map);
+	RET_ERR(err, "reuse outer_map", "err:%d\n", err);
+
+	err = bpf_object__load(obj);
+	RET_ERR(err, "load bpf_object", "err:%d\n", err);
+
+	prog = bpf_program__next(NULL, obj);
+	RET_ERR(!prog, "get first bpf_program", "!prog\n");
+	select_by_skb_data_prog = bpf_program__fd(prog);
+	RET_ERR(select_by_skb_data_prog == -1, "get prog fd",
+		"select_by_skb_data_prog:%d\n", select_by_skb_data_prog);
+
+	map = bpf_object__find_map_by_name(obj, "result_map");
+	RET_ERR(!map, "find result_map", "!map\n");
+	result_map = bpf_map__fd(map);
+	RET_ERR(result_map == -1, "get result_map fd",
+		"result_map:%d\n", result_map);
+
+	map = bpf_object__find_map_by_name(obj, "tmp_index_ovr_map");
+	/* newline belongs at the end of the message, not inside the tag */
+	RET_ERR(!map, "find tmp_index_ovr_map", "!map\n");
+	tmp_index_ovr_map = bpf_map__fd(map);
+	RET_ERR(tmp_index_ovr_map == -1, "get tmp_index_ovr_map fd",
+		"tmp_index_ovr_map:%d\n", tmp_index_ovr_map);
+
+	map = bpf_object__find_map_by_name(obj, "linum_map");
+	RET_ERR(!map, "find linum_map", "!map\n");
+	linum_map = bpf_map__fd(map);
+	RET_ERR(linum_map == -1, "get linum_map fd",
+		"linum_map:%d\n", linum_map);
+
+	map = bpf_object__find_map_by_name(obj, "data_check_map");
+	RET_ERR(!map, "find data_check_map", "!map\n");
+	data_check_map = bpf_map__fd(map);
+	RET_ERR(data_check_map == -1, "get data_check_map fd",
+		"data_check_map:%d\n", data_check_map);
+
+	return 0;
+}
+
+/* Zero @sa and set it to the loopback address of @family (port left at 0). */
+static void sa46_init_loopback(union sa46 *sa, sa_family_t family)
+{
+	memset(sa, 0, sizeof(*sa));
+	sa->family = family;
+
+	if (family != AF_INET6) {
+		sa->v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		return;
+	}
+
+	sa->v6.sin6_addr = in6addr_loopback;
+}
+
+/* Zero @sa and set it to the wildcard ("any") address of @family. */
+static void sa46_init_inany(union sa46 *sa, sa_family_t family)
+{
+	memset(sa, 0, sizeof(*sa));
+	sa->family = family;
+
+	if (family != AF_INET6) {
+		sa->v4.sin_addr.s_addr = INADDR_ANY;
+		return;
+	}
+
+	sa->v6.sin6_addr = in6addr_any;
+}
+
+/*
+ * Read the integer stored in the sysctl file @sysctl.
+ *
+ * Returns the parsed value, or -1 (after reporting the failure) when the
+ * file cannot be opened or read. The descriptor is closed on every path and
+ * the buffer is NUL-terminated before it is handed to atoi().
+ */
+static int read_int_sysctl(const char *sysctl)
+{
+	int fd, ret, read_errno;
+	char buf[16];
+
+	fd = open(sysctl, O_RDONLY);
+	RET_ERR(fd == -1, "open(sysctl)",
+		"sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno);
+
+	/* leave room for the terminating NUL */
+	ret = read(fd, buf, sizeof(buf) - 1);
+	/* remember errno before close() gets a chance to change it */
+	read_errno = errno;
+	close(fd);
+	RET_ERR(ret <= 0, "read(sysctl)",
+		"sysctl:%s ret:%d errno:%d\n", sysctl, ret, read_errno);
+
+	buf[ret] = '\0';
+	return atoi(buf);
+}
+
+/*
+ * Write the decimal representation of @v to the sysctl file @sysctl.
+ *
+ * Returns 0 on success, -1 (after reporting the failure) when the file
+ * cannot be opened or the value is not written completely. The descriptor
+ * is closed on every path.
+ */
+static int write_int_sysctl(const char *sysctl, int v)
+{
+	int fd, ret, size, write_errno;
+	char buf[16];
+
+	fd = open(sysctl, O_RDWR);
+	RET_ERR(fd == -1, "open(sysctl)",
+		"sysctl:%s fd:%d errno:%d\n", sysctl, fd, errno);
+
+	size = snprintf(buf, sizeof(buf), "%d", v);
+	ret = write(fd, buf, size);
+	/* remember errno before close() gets a chance to change it */
+	write_errno = errno;
+	close(fd);
+	RET_ERR(ret != size, "write(sysctl)",
+		"sysctl:%s ret:%d size:%d errno:%d\n",
+		sysctl, ret, size, write_errno);
+
+	return 0;
+}
+
+/*
+ * Write back the sysctl values captured at test start. A saved value of -1
+ * means the sysctl was never read successfully, so it is left untouched.
+ */
+static void restore_sysctls(void)
+{
+	if (saved_tcp_fo != -1) {
+		write_int_sysctl(TCP_FO_SYSCTL, saved_tcp_fo);
+	}
+
+	if (saved_tcp_syncookie != -1) {
+		write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, saved_tcp_syncookie);
+	}
+}
+
+/*
+ * Set the low three bits of the tcp_fastopen sysctl while keeping whatever
+ * other bits are currently set. Returns 0 on success, -1 on failure.
+ */
+static int enable_fastopen(void)
+{
+	int current = read_int_sysctl(TCP_FO_SYSCTL);
+
+	return current < 0 ? -1 : write_int_sysctl(TCP_FO_SYSCTL, current | 7);
+}
+
+/* Write 2 to the tcp_syncookies sysctl; returns write_int_sysctl()'s result. */
+static int enable_syncookie(void)
+{
+	const int syncookie_on = 2;
+
+	return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, syncookie_on);
+}
+
+/* Write 0 to the tcp_syncookies sysctl; returns write_int_sysctl()'s result. */
+static int disable_syncookie(void)
+{
+	const int syncookie_off = 0;
+
+	return write_int_sysctl(TCP_SYNCOOKIE_SYSCTL, syncookie_off);
+}
+
+/*
+ * Fetch element 0 of linum_map (reported as "bpf_prog_linum" in failure
+ * messages). Returns -1 if the lookup fails.
+ */
+static long get_linum(void)
+{
+	__u32 line;
+	int ret = bpf_map_lookup_elem(linum_map, &index_zero, &line);
+
+	RET_ERR(ret == -1, "lookup_elem(linum_map)", "err:%d errno:%d\n",
+		ret, errno);
+
+	return line;
+}
+
+/*
+ * Compare element 0 of data_check_map with what is expected for a packet
+ * sent from @cli_fd to srv_sa.
+ *
+ * @type:   SOCK_STREAM or anything else (treated as UDP)
+ * @family: AF_INET6 or anything else (treated as IPv4)
+ * @cmd:    payload that was sent, or NULL when no payload was sent
+ * @cli_fd: client socket; its local address provides the source addr/port
+ *
+ * The fields up to equal_check_end must match exactly, the hash must be
+ * non-zero, and the length must be at least (TCP) or exactly (UDP) the
+ * header length plus the payload size.
+ */
+static void check_data(int type, sa_family_t family, const struct cmd *cmd,
+		       int cli_fd)
+{
+	struct data_check expected = {}, result;
+	union sa46 cli_sa;
+	socklen_t addrlen;
+	int err;
+
+	addrlen = sizeof(cli_sa);
+	err = getsockname(cli_fd, (struct sockaddr *)&cli_sa,
+			  &addrlen);
+	RET_IF(err == -1, "getsockname(cli_fd)", "err:%d errno:%d\n",
+	       err, errno);
+
+	err = bpf_map_lookup_elem(data_check_map, &index_zero, &result);
+	RET_IF(err == -1, "lookup_elem(data_check_map)", "err:%d errno:%d\n",
+	       err, errno);
+
+	if (type == SOCK_STREAM) {
+		expected.len = MIN_TCPHDR_LEN;
+		expected.ip_protocol = IPPROTO_TCP;
+	} else {
+		expected.len = UDPHDR_LEN;
+		expected.ip_protocol = IPPROTO_UDP;
+	}
+
+	if (family == AF_INET6) {
+		expected.eth_protocol = htons(ETH_P_IPV6);
+		/* bound to the wildcard address iff all four words are zero */
+		expected.bind_inany = !srv_sa.v6.sin6_addr.s6_addr32[3] &&
+			!srv_sa.v6.sin6_addr.s6_addr32[2] &&
+			!srv_sa.v6.sin6_addr.s6_addr32[1] &&
+			!srv_sa.v6.sin6_addr.s6_addr32[0];
+
+		memcpy(&expected.skb_addrs[0], cli_sa.v6.sin6_addr.s6_addr32,
+		       sizeof(cli_sa.v6.sin6_addr));
+		memcpy(&expected.skb_addrs[4], &in6addr_loopback,
+		       sizeof(in6addr_loopback));
+		expected.skb_ports[0] = cli_sa.v6.sin6_port;
+		expected.skb_ports[1] = srv_sa.v6.sin6_port;
+	} else {
+		expected.eth_protocol = htons(ETH_P_IP);
+		expected.bind_inany = !srv_sa.v4.sin_addr.s_addr;
+
+		expected.skb_addrs[0] = cli_sa.v4.sin_addr.s_addr;
+		expected.skb_addrs[1] = htonl(INADDR_LOOPBACK);
+		expected.skb_ports[0] = cli_sa.v4.sin_port;
+		expected.skb_ports[1] = srv_sa.v4.sin_port;
+	}
+
+	if (memcmp(&result, &expected, offsetof(struct data_check,
+						equal_check_end))) {
+		printf("unexpected data_check\n");
+		printf(" result: (0x%x, %u, %u)\n",
+		       result.eth_protocol, result.ip_protocol,
+		       result.bind_inany);
+		printf("expected: (0x%x, %u, %u)\n",
+		       expected.eth_protocol, expected.ip_protocol,
+		       expected.bind_inany);
+		RET_IF(1, "data_check result != expected",
+		       "bpf_prog_linum:%ld\n", get_linum());
+	}
+
+	/* message now ends with a newline like every other report here */
+	RET_IF(!result.hash, "data_check result.hash empty",
+	       "result.hash:%u\n", result.hash);
+
+	expected.len += cmd ? sizeof(*cmd) : 0;
+	if (type == SOCK_STREAM)
+		/* TCP headers may carry options, so only a lower bound */
+		RET_IF(expected.len > result.len, "expected.len > result.len",
+		       "expected.len:%u result.len:%u bpf_prog_linum:%ld\n",
+		       expected.len, result.len, get_linum());
+	else
+		RET_IF(expected.len != result.len, "expected.len != result.len",
+		       "expected.len:%u result.len:%u bpf_prog_linum:%ld\n",
+		       expected.len, result.len, get_linum());
+}
+
+/* Map an enum result value to its name; anything else yields "UNKNOWN". */
+static const char *result_to_str(enum result res)
+{
+	static const char * const names[] = {
+		[DROP_ERR_INNER_MAP] = "DROP_ERR_INNER_MAP",
+		[DROP_ERR_SKB_DATA] = "DROP_ERR_SKB_DATA",
+		[DROP_ERR_SK_SELECT_REUSEPORT] = "DROP_ERR_SK_SELECT_REUSEPORT",
+		[DROP_MISC] = "DROP_MISC",
+		[PASS] = "PASS",
+		[PASS_ERR_SK_SELECT_REUSEPORT] = "PASS_ERR_SK_SELECT_REUSEPORT",
+	};
+	unsigned int idx = (unsigned int)res;
+
+	/* out-of-range values and gaps in the table have no name */
+	if (idx >= ARRAY_SIZE(names) || !names[idx])
+		return "UNKNOWN";
+
+	return names[idx];
+}
+
+/*
+ * Read all NR_RESULTS counters from result_map and compare them with
+ * expected_results[]. On the first mismatch both arrays are dumped, the
+ * mismatching result is named, and the test is marked as failed.
+ */
+static void check_results(void)
+{
+	__u32 actual[NR_RESULTS];
+	__u32 idx, first_bad = 0;
+	int ret;
+
+	for (idx = 0; idx < NR_RESULTS; idx++) {
+		ret = bpf_map_lookup_elem(result_map, &idx, &actual[idx]);
+		RET_IF(ret == -1, "lookup_elem(result_map)",
+		       "i:%u err:%d errno:%d\n", idx, ret, errno);
+	}
+
+	/* locate the first counter that differs, if any */
+	idx = 0;
+	while (idx < NR_RESULTS && actual[idx] == expected_results[idx])
+		idx++;
+
+	if (idx == NR_RESULTS)
+		return;
+	first_bad = idx;
+
+	printf("unexpected result\n");
+
+	printf(" result: [");
+	printf("%u", actual[0]);
+	for (idx = 1; idx < NR_RESULTS; idx++)
+		printf(", %u", actual[idx]);
+	printf("]\n");
+
+	printf("expected: [");
+	printf("%u", expected_results[0]);
+	for (idx = 1; idx < NR_RESULTS; idx++)
+		printf(", %u", expected_results[idx]);
+	printf("]\n");
+
+	printf("mismatch on %s (bpf_prog_linum:%ld)\n",
+	       result_to_str(first_bad), get_linum());
+
+	CHECK_FAIL(true);
+}
+
+/*
+ * Create a client socket of @type/@family bound to loopback and send @len
+ * bytes of @data to srv_sa with MSG_FASTOPEN.
+ *
+ * A short or failed sendto() is only treated as an error when @expected is
+ * PASS or greater.
+ *
+ * Returns the client socket fd, which the caller must close, or -1 on
+ * failure. On failure the socket is closed here, so no descriptor leaks.
+ */
+static int send_data(int type, sa_family_t family, void *data, size_t len,
+		     enum result expected)
+{
+	union sa46 cli_sa;
+	int fd, err;
+
+	fd = socket(family, type, 0);
+	RET_ERR(fd == -1, "socket()", "fd:%d errno:%d\n", fd, errno);
+
+	sa46_init_loopback(&cli_sa, family);
+	err = bind(fd, (struct sockaddr *)&cli_sa, sizeof(cli_sa));
+	/* check bind()'s own result, not the (already validated) fd */
+	if (CHECK_FAIL(err == -1)) {
+		printf("bind(cli_sa)" " " "err:%d errno:%d\n", err, errno);
+		goto err_close;
+	}
+
+	err = sendto(fd, data, len, MSG_FASTOPEN, (struct sockaddr *)&srv_sa,
+		     sizeof(srv_sa));
+	if (CHECK_FAIL(err != len && expected >= PASS)) {
+		printf("sendto()" " " "family:%u err:%d errno:%d expected:%d\n",
+		       family, err, errno, expected);
+		goto err_close;
+	}
+
+	return fd;
+
+err_close:
+	close(fd);
+	return -1;
+}
+
+/*
+ * Body of do_test() that runs after the client data has been sent. It never
+ * closes @cli_fd, so do_test() can release it no matter where this function
+ * returns.
+ */
+static void __do_test(int type, sa_family_t family, struct cmd *cmd,
+		      enum result expected, int cli_fd)
+{
+	int nev, srv_fd, recv_errno;
+	struct epoll_event ev;
+	struct cmd rcv_cmd;
+	ssize_t nread;
+
+	/* a server socket must become readable iff the packet should pass */
+	nev = epoll_wait(epfd, &ev, 1, expected >= PASS ? 5 : 0);
+	RET_IF((nev <= 0 && expected >= PASS) ||
+	       (nev > 0 && expected < PASS),
+	       "nev <> expected",
+	       "nev:%d expected:%d type:%d family:%d data:(%d, %d)\n",
+	       nev, expected, type, family,
+	       cmd ? cmd->reuseport_index : -1,
+	       cmd ? cmd->pass_on_failure : -1);
+	check_results();
+	check_data(type, family, cmd, cli_fd);
+
+	/* nothing was delivered for the drop cases, so nothing to receive */
+	if (expected < PASS)
+		return;
+
+	RET_IF(expected != PASS_ERR_SK_SELECT_REUSEPORT &&
+	       cmd->reuseport_index != ev.data.u32,
+	       "check cmd->reuseport_index",
+	       "cmd:(%u, %u) ev.data.u32:%u\n",
+	       cmd->pass_on_failure, cmd->reuseport_index, ev.data.u32);
+
+	srv_fd = sk_fds[ev.data.u32];
+	if (type == SOCK_STREAM) {
+		int new_fd = accept(srv_fd, NULL, 0);
+
+		RET_IF(new_fd == -1, "accept(srv_fd)",
+		       "ev.data.u32:%u new_fd:%d errno:%d\n",
+		       ev.data.u32, new_fd, errno);
+
+		nread = recv(new_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT);
+		/* close before the check so a short read cannot leak new_fd */
+		recv_errno = errno;
+		close(new_fd);
+		RET_IF(nread != sizeof(rcv_cmd),
+		       "recv(new_fd)",
+		       "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n",
+		       ev.data.u32, nread, sizeof(rcv_cmd), recv_errno);
+	} else {
+		nread = recv(srv_fd, &rcv_cmd, sizeof(rcv_cmd), MSG_DONTWAIT);
+		RET_IF(nread != sizeof(rcv_cmd),
+		       "recv(sk_fds)",
+		       "ev.data.u32:%u nread:%zd sizeof(rcv_cmd):%zu errno:%d\n",
+		       ev.data.u32, nread, sizeof(rcv_cmd), errno);
+	}
+}
+
+/*
+ * Send @cmd (or an empty payload when @cmd is NULL) from a fresh client
+ * socket and verify the outcome against @expected: whether a server socket
+ * became readable, the result counters, the data seen by the BPF program
+ * and, for passing cases, which server socket received the payload.
+ *
+ * The client socket is closed on every path, including failed checks and
+ * the expected-drop cases.
+ */
+static void do_test(int type, sa_family_t family, struct cmd *cmd,
+		    enum result expected)
+{
+	int cli_fd;
+
+	cli_fd = send_data(type, family, cmd, cmd ? sizeof(*cmd) : 0,
+			   expected);
+	if (cli_fd < 0)
+		return;
+
+	__do_test(type, family, cmd, expected, cli_fd);
+
+	close(cli_fd);
+}
+
+/*
+ * Send an all-zero command and expect it to be counted as
+ * DROP_ERR_INNER_MAP (test_config() runs this with no inner map installed).
+ */
+static void test_err_inner_map(int type, sa_family_t family)
+{
+	struct cmd cmd = {};
+
+	expected_results[DROP_ERR_INNER_MAP] += 1;
+	do_test(type, family, &cmd, DROP_ERR_INNER_MAP);
+}
+
+/* Send no payload at all and expect a DROP_ERR_SKB_DATA result. */
+static void test_err_skb_data(int type, sa_family_t family)
+{
+	expected_results[DROP_ERR_SKB_DATA] += 1;
+	do_test(type, family, NULL, DROP_ERR_SKB_DATA);
+}
+
+/*
+ * Ask for index REUSEPORT_ARRAY_SIZE, one past the last slot filled by
+ * prepare_sk_fds(), with pass_on_failure off; expect
+ * DROP_ERR_SK_SELECT_REUSEPORT.
+ */
+static void test_err_sk_select_port(int type, sa_family_t family)
+{
+	struct cmd cmd = {};
+
+	cmd.reuseport_index = REUSEPORT_ARRAY_SIZE;
+
+	expected_results[DROP_ERR_SK_SELECT_REUSEPORT] += 1;
+	do_test(type, family, &cmd, DROP_ERR_SK_SELECT_REUSEPORT);
+}
+
+/* Select every index of the reuseport array once; each must PASS. */
+static void test_pass(int type, sa_family_t family)
+{
+	struct cmd cmd;
+	int idx = 0;
+
+	cmd.pass_on_failure = 0;
+	while (idx < REUSEPORT_ARRAY_SIZE) {
+		expected_results[PASS] += 1;
+		cmd.reuseport_index = idx;
+		do_test(type, family, &cmd, PASS);
+		idx++;
+	}
+}
+
+/*
+ * Exercise the syncookie path, where the TCP-SYN and the TCP-ACK are
+ * handled by two different sockets, and verify that the BPF program reset
+ * tmp_index_ovr_map[0] back to -1.
+ *
+ * Syncookies are enabled only around do_test() and switched off again
+ * before any check that may return early, so a failure here cannot leave
+ * them enabled for the subtests that follow.
+ */
+static void test_syncookie(int type, sa_family_t family)
+{
+	int err, tmp_index = 1;
+	struct cmd cmd = {
+		.reuseport_index = 0,
+		.pass_on_failure = 0,
+	};
+
+	/*
+	 * +1 for TCP-SYN and
+	 * +1 for the TCP-ACK (ack the syncookie)
+	 */
+	expected_results[PASS] += 2;
+	/*
+	 * Simulate TCP-SYN and TCP-ACK are handled by two different sk:
+	 * TCP-SYN: select sk_fds[tmp_index = 1] tmp_index is from the
+	 * tmp_index_ovr_map
+	 * TCP-ACK: select sk_fds[reuseport_index = 0] reuseport_index
+	 * is from the cmd.reuseport_index
+	 */
+	err = bpf_map_update_elem(tmp_index_ovr_map, &index_zero,
+				  &tmp_index, BPF_ANY);
+	RET_IF(err == -1, "update_elem(tmp_index_ovr_map, 0, 1)",
+	       "err:%d errno:%d\n", err, errno);
+
+	enable_syncookie();
+	do_test(type, family, &cmd, PASS);
+	/* restore the sysctl before the final check can bail out */
+	disable_syncookie();
+
+	err = bpf_map_lookup_elem(tmp_index_ovr_map, &index_zero,
+				  &tmp_index);
+	RET_IF(err == -1 || tmp_index != -1,
+	       "lookup_elem(tmp_index_ovr_map)",
+	       "err:%d errno:%d tmp_index:%d\n",
+	       err, errno, tmp_index);
+}
+
+/*
+ * Ask for index REUSEPORT_ARRAY_SIZE, one past the last slot filled by
+ * prepare_sk_fds(), but with pass_on_failure set; expect
+ * PASS_ERR_SK_SELECT_REUSEPORT.
+ */
+static void test_pass_on_err(int type, sa_family_t family)
+{
+	struct cmd cmd = {};
+
+	cmd.reuseport_index = REUSEPORT_ARRAY_SIZE;
+	cmd.pass_on_failure = 1;
+
+	expected_results[PASS_ERR_SK_SELECT_REUSEPORT]++;
+	do_test(type, family, &cmd, PASS_ERR_SK_SELECT_REUSEPORT);
+}
+
+/*
+ * Detach the reuseport BPF program through sk_fds[0], check that a second
+ * detach through sk_fds[1] fails with ENOENT, then send one packet and
+ * verify that the sum of all result_map counters did not change, i.e. the
+ * program did not run.
+ *
+ * Skipped when SO_DETACH_REUSEPORT_BPF is not defined at build time. The
+ * client socket is closed on every path once it has been created.
+ */
+static void test_detach_bpf(int type, sa_family_t family)
+{
+#ifdef SO_DETACH_REUSEPORT_BPF
+	__u32 nr_run_before = 0, nr_run_after = 0, tmp, i;
+	struct epoll_event ev;
+	int cli_fd, err, nev;
+	struct cmd cmd = {};
+	int optvalue = 0;
+
+	err = setsockopt(sk_fds[0], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF,
+			 &optvalue, sizeof(optvalue));
+	RET_IF(err == -1, "setsockopt(SO_DETACH_REUSEPORT_BPF)",
+	       "err:%d errno:%d\n", err, errno);
+
+	/* already detached above, so this one has to fail with ENOENT */
+	err = setsockopt(sk_fds[1], SOL_SOCKET, SO_DETACH_REUSEPORT_BPF,
+			 &optvalue, sizeof(optvalue));
+	RET_IF(err == 0 || errno != ENOENT,
+	       "setsockopt(SO_DETACH_REUSEPORT_BPF)",
+	       "err:%d errno:%d\n", err, errno);
+
+	for (i = 0; i < NR_RESULTS; i++) {
+		err = bpf_map_lookup_elem(result_map, &i, &tmp);
+		RET_IF(err == -1, "lookup_elem(result_map)",
+		       "i:%u err:%d errno:%d\n", i, err, errno);
+		nr_run_before += tmp;
+	}
+
+	cli_fd = send_data(type, family, &cmd, sizeof(cmd), PASS);
+	if (cli_fd < 0)
+		return;
+
+	/* from here on every exit goes through "out" to close cli_fd */
+	nev = epoll_wait(epfd, &ev, 1, 5);
+	if (CHECK_FAIL(nev <= 0)) {
+		printf("nev <= 0" " "
+		       "nev:%d expected:1 type:%d family:%d data:(0, 0)\n",
+		       nev, type, family);
+		goto out;
+	}
+
+	for (i = 0; i < NR_RESULTS; i++) {
+		err = bpf_map_lookup_elem(result_map, &i, &tmp);
+		if (CHECK_FAIL(err == -1)) {
+			printf("lookup_elem(result_map)" " "
+			       "i:%u err:%d errno:%d\n", i, err, errno);
+			goto out;
+		}
+		nr_run_after += tmp;
+	}
+
+	if (CHECK_FAIL(nr_run_before != nr_run_after))
+		printf("nr_run_before != nr_run_after" " "
+		       "nr_run_before:%u nr_run_after:%u\n",
+		       nr_run_before, nr_run_after);
+
+out:
+	close(cli_fd);
+#else
+	test__skip();
+#endif
+}
+
+/*
+ * Create REUSEPORT_ARRAY_SIZE server sockets of @type/@family that share
+ * one address via SO_REUSEPORT, attach the BPF program to the first one
+ * created, store every socket in reuseport_array at its own index and add
+ * all of them to a new epoll instance (epfd) with data.u32 set to the
+ * index.
+ *
+ * @inany selects the wildcard address instead of loopback. After the first
+ * bind(), srv_sa is refreshed with getsockname() and the following sockets
+ * bind to that updated address.
+ *
+ * sk_fds[] and epfd are reset to -1 up front so that, after a partial
+ * failure, cleanup_per_test() only ever close()s descriptors opened here
+ * (or -1) instead of stale values from an earlier subtest.
+ */
+static void prepare_sk_fds(int type, sa_family_t family, bool inany)
+{
+	const int first = REUSEPORT_ARRAY_SIZE - 1;
+	int i, err, optval = 1;
+	socklen_t addrlen, namelen;
+	struct epoll_event ev;
+
+	for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++)
+		sk_fds[i] = -1;
+	epfd = -1;
+
+	if (inany)
+		sa46_init_inany(&srv_sa, family);
+	else
+		sa46_init_loopback(&srv_sa, family);
+	addrlen = sizeof(srv_sa);
+
+	/*
+	 * The sk_fds[] is filled from the back such that the order
+	 * is exactly opposite to the (struct sock_reuseport *)reuse->socks[].
+	 */
+	for (i = first; i >= 0; i--) {
+		sk_fds[i] = socket(family, type, 0);
+		RET_IF(sk_fds[i] == -1, "socket()", "sk_fds[%d]:%d errno:%d\n",
+		       i, sk_fds[i], errno);
+		err = setsockopt(sk_fds[i], SOL_SOCKET, SO_REUSEPORT,
+				 &optval, sizeof(optval));
+		RET_IF(err == -1, "setsockopt(SO_REUSEPORT)",
+		       "sk_fds[%d] err:%d errno:%d\n",
+		       i, err, errno);
+
+		/* the program only needs to be attached once per group */
+		if (i == first) {
+			err = setsockopt(sk_fds[i], SOL_SOCKET,
+					 SO_ATTACH_REUSEPORT_EBPF,
+					 &select_by_skb_data_prog,
+					 sizeof(select_by_skb_data_prog));
+			RET_IF(err == -1, "setsockopt(SO_ATTACH_REUSEPORT_EBPF)",
+			       "err:%d errno:%d\n", err, errno);
+		}
+
+		err = bind(sk_fds[i], (struct sockaddr *)&srv_sa, addrlen);
+		RET_IF(err == -1, "bind()", "sk_fds[%d] err:%d errno:%d\n",
+		       i, err, errno);
+
+		if (type == SOCK_STREAM) {
+			err = listen(sk_fds[i], 10);
+			RET_IF(err == -1, "listen()",
+			       "sk_fds[%d] err:%d errno:%d\n",
+			       i, err, errno);
+		}
+
+		err = bpf_map_update_elem(reuseport_array, &i, &sk_fds[i],
+					  BPF_NOEXIST);
+		RET_IF(err == -1, "update_elem(reuseport_array)",
+		       "sk_fds[%d] err:%d errno:%d\n", i, err, errno);
+
+		if (i == first) {
+			/*
+			 * Read back the bound address; a separate length
+			 * variable keeps the addrlen used by bind() intact.
+			 */
+			namelen = sizeof(srv_sa);
+			err = getsockname(sk_fds[i], (struct sockaddr *)&srv_sa,
+					  &namelen);
+			RET_IF(err == -1, "getsockname()",
+			       "sk_fds[%d] err:%d errno:%d\n", i, err, errno);
+		}
+	}
+
+	epfd = epoll_create(1);
+	RET_IF(epfd == -1, "epoll_create(1)",
+	       "epfd:%d errno:%d\n", epfd, errno);
+
+	ev.events = EPOLLIN;
+	for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++) {
+		ev.data.u32 = i;
+		err = epoll_ctl(epfd, EPOLL_CTL_ADD, sk_fds[i], &ev);
+		RET_IF(err, "epoll_ctl(EPOLL_CTL_ADD)", "sk_fds[%d]\n", i);
+	}
+}
+
+/*
+ * Per-subtest setup: create the server sockets, reset
+ * tmp_index_ovr_map[0] to -1 and, unless @no_inner_map is set, install
+ * reuseport_array as element 0 of outer_map.
+ */
+static void setup_per_test(int type, sa_family_t family, bool inany,
+			   bool no_inner_map)
+{
+	int no_override = -1;
+	int ret;
+
+	prepare_sk_fds(type, family, inany);
+
+	ret = bpf_map_update_elem(tmp_index_ovr_map, &index_zero,
+				  &no_override, BPF_ANY);
+	RET_IF(ret == -1, "update_elem(tmp_index_ovr_map, 0, -1)",
+	       "err:%d errno:%d\n", ret, errno);
+
+	/* Install reuseport_array to outer_map? */
+	if (!no_inner_map) {
+		ret = bpf_map_update_elem(outer_map, &index_zero,
+					  &reuseport_array, BPF_ANY);
+		RET_IF(ret == -1, "update_elem(outer_map, 0, reuseport_array)",
+		       "err:%d errno:%d\n", ret, errno);
+	}
+}
+
+/*
+ * Per-subtest teardown: clear expected_results[], close the server sockets
+ * and the epoll fd, zero result_map and linum_map and, unless
+ * @no_inner_map is set, remove element 0 from outer_map.
+ *
+ * The descriptors are closed before the map resets because each reset may
+ * return early on failure; this way a failed reset cannot leak the sockets
+ * or the epoll fd.
+ */
+static void cleanup_per_test(bool no_inner_map)
+{
+	int i, err, zero = 0;
+
+	memset(expected_results, 0, sizeof(expected_results));
+
+	for (i = 0; i < REUSEPORT_ARRAY_SIZE; i++)
+		close(sk_fds[i]);
+	close(epfd);
+
+	for (i = 0; i < NR_RESULTS; i++) {
+		err = bpf_map_update_elem(result_map, &i, &zero, BPF_ANY);
+		RET_IF(err, "reset elem in result_map",
+		       "i:%u err:%d errno:%d\n", i, err, errno);
+	}
+
+	err = bpf_map_update_elem(linum_map, &zero, &zero, BPF_ANY);
+	RET_IF(err, "reset line number in linum_map", "err:%d errno:%d\n",
+	       err, errno);
+
+	/* Delete reuseport_array from outer_map? */
+	if (no_inner_map)
+		return;
+
+	err = bpf_map_delete_elem(outer_map, &index_zero);
+	RET_IF(err == -1, "delete_elem(outer_map)",
+	       "err:%d errno:%d\n", err, errno);
+}
+
+/*
+ * Release everything set up for one map type: close outer_map and
+ * reuseport_array if open (resetting them to -1), close the BPF object if
+ * present (resetting it to NULL) and clear expected_results[].
+ */
+static void cleanup(void)
+{
+	int *map_fds[] = { &outer_map, &reuseport_array };
+	unsigned int n;
+
+	/* outer_map first, then reuseport_array */
+	for (n = 0; n < ARRAY_SIZE(map_fds); n++) {
+		if (*map_fds[n] == -1)
+			continue;
+		close(*map_fds[n]);
+		*map_fds[n] = -1;
+	}
+
+	if (obj) {
+		bpf_object__close(obj);
+		obj = NULL;
+	}
+
+	memset(expected_results, 0, sizeof(expected_results));
+}
+
+/* Short name of the socket map types used by this test, else "unknown". */
+static const char *maptype_str(enum bpf_map_type type)
+{
+	if (type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
+		return "reuseport_sockarray";
+	if (type == BPF_MAP_TYPE_SOCKMAP)
+		return "sockmap";
+	if (type == BPF_MAP_TYPE_SOCKHASH)
+		return "sockhash";
+
+	return "unknown";
+}
+
+/* "IPv4" for AF_INET, "IPv6" for AF_INET6, "unknown" for anything else. */
+static const char *family_str(sa_family_t family)
+{
+	if (family == AF_INET)
+		return "IPv4";
+	if (family == AF_INET6)
+		return "IPv6";
+
+	return "unknown";
+}
+
+/* "TCP" for SOCK_STREAM, "UDP" for SOCK_DGRAM, "unknown" otherwise. */
+static const char *sotype_str(int sotype)
+{
+	if (sotype == SOCK_STREAM)
+		return "TCP";
+	if (sotype == SOCK_DGRAM)
+		return "UDP";
+
+	return "unknown";
+}
+
+#define TEST_INIT(fn_, ...) { .fn = fn_, .name = #fn_, __VA_ARGS__ }
+
+/*
+ * Run every subtest that applies to the given socket type, address family
+ * and bind mode. Each subtest is named
+ * "<maptype> <family>/<sotype> <INANY|LOOPBACK> <function name>" and is
+ * wrapped in setup_per_test()/cleanup_per_test().
+ */
+static void test_config(int sotype, sa_family_t family, bool inany)
+{
+	const struct test {
+		void (*fn)(int sotype, sa_family_t family);
+		const char *name;
+		bool no_inner_map;
+		int need_sotype;
+	} tests[] = {
+		TEST_INIT(test_err_inner_map,
+			  .no_inner_map = true),
+		TEST_INIT(test_err_skb_data),
+		TEST_INIT(test_err_sk_select_port),
+		TEST_INIT(test_pass),
+		TEST_INIT(test_syncookie,
+			  .need_sotype = SOCK_STREAM),
+		TEST_INIT(test_pass_on_err),
+		TEST_INIT(test_detach_bpf),
+	};
+	char subtest_name[MAX_TEST_NAME];
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		const struct test *cur = &tests[idx];
+
+		/* need_sotype == 0 means the test runs for any socket type */
+		if (cur->need_sotype != 0 && cur->need_sotype != sotype)
+			continue; /* test not compatible with socket type */
+
+		snprintf(subtest_name, sizeof(subtest_name), "%s %s/%s %s %s",
+			 maptype_str(inner_map_type),
+			 family_str(family), sotype_str(sotype),
+			 inany ? "INANY" : "LOOPBACK", cur->name);
+
+		if (test__start_subtest(subtest_name)) {
+			setup_per_test(sotype, family, inany,
+				       cur->no_inner_map);
+			cur->fn(sotype, family);
+			cleanup_per_test(cur->no_inner_map);
+		}
+	}
+}
+
+#define BIND_INANY true
+
+/*
+ * Run test_config() for every socket type / family / bind mode combination
+ * in the table: TCP over IPv4 and IPv6, each bound to loopback and to the
+ * wildcard address, and UDP over IPv4 and IPv6 bound to loopback.
+ */
+static void test_all(void)
+{
+	const struct config {
+		int sotype;
+		sa_family_t family;
+		bool inany;
+	} configs[] = {
+		{ .sotype = SOCK_STREAM, .family = AF_INET },
+		{ .sotype = SOCK_STREAM, .family = AF_INET, .inany = BIND_INANY },
+		{ .sotype = SOCK_STREAM, .family = AF_INET6 },
+		{ .sotype = SOCK_STREAM, .family = AF_INET6, .inany = BIND_INANY },
+		{ .sotype = SOCK_DGRAM, .family = AF_INET },
+		{ .sotype = SOCK_DGRAM, .family = AF_INET6 },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(configs); idx++)
+		test_config(configs[idx].sotype, configs[idx].family,
+			    configs[idx].inany);
+}
+
+/*
+ * Run the whole suite with @mt as the type of the socket map. cleanup() is
+ * called whether or not map creation and object preparation succeeded.
+ */
+void test_map_type(enum bpf_map_type mt)
+{
+	/* prepare_bpf_obj() is only attempted once the maps exist */
+	if (!create_maps(mt) && !prepare_bpf_obj())
+		test_all();
+
+	cleanup();
+}
+
+/*
+ * Test entry point: save the tcp_fastopen and tcp_syncookies sysctls,
+ * enable fastopen, disable syncookies, run the suite once per supported
+ * socket map type and finally restore whatever sysctl values were saved.
+ */
+void test_select_reuseport(void)
+{
+	saved_tcp_fo = read_int_sysctl(TCP_FO_SYSCTL);
+	if (saved_tcp_fo >= 0) {
+		saved_tcp_syncookie = read_int_sysctl(TCP_SYNCOOKIE_SYSCTL);
+
+		/* each step is attempted only if the previous one succeeded */
+		if (saved_tcp_syncookie >= 0 &&
+		    !enable_fastopen() &&
+		    !disable_syncookie()) {
+			test_map_type(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+			test_map_type(BPF_MAP_TYPE_SOCKMAP);
+			test_map_type(BPF_MAP_TYPE_SOCKHASH);
+		}
+	}
+
+	restore_sysctls();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal.c b/tools/testing/selftests/bpf/prog_tests/send_signal.c
new file mode 100644
index 000000000..75b72c751
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal.c
@@ -0,0 +1,209 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "test_send_signal_kern.skel.h"
+
+/* Number of SIGUSR1 deliveries seen by this process (static, so starts at 0). */
+static volatile int sigusr1_received;
+
+/* SIGUSR1 handler: count the delivery so the child can report it later. */
+static void sigusr1_handler(int signum)
+{
+	sigusr1_received += 1;
+}
+
+/*
+ * Common body of the send_signal tests.
+ *
+ * Forks a child that installs a SIGUSR1 handler and then synchronises with
+ * the parent over two pipes (pipe_c2p: child -> parent, pipe_p2c: parent ->
+ * child).  The parent loads the BPF skeleton, attaches it either through the
+ * skeleton's auto-attach (@attr == NULL) or to a perf event opened on the
+ * child (@attr != NULL), arms the program with the child's pid and waits for
+ * the child to report whether the signal arrived ('2') or not ('0').
+ *
+ * @attr:          perf event to attach to, or NULL for skeleton auto-attach
+ * @signal_thread: copied into the BPF program's signal_thread variable
+ * @test_name:     tag used in CHECK() messages
+ */
+static void test_send_signal_common(struct perf_event_attr *attr,
+				    bool signal_thread,
+				    const char *test_name)
+{
+	struct test_send_signal_kern *skel;
+	int pipe_c2p[2], pipe_p2c[2];
+	int err = -1, pmu_fd = -1;
+	__u32 duration = 0;
+	/* Zeroed so the sync bytes written below are never uninitialised. */
+	char buf[256] = {0};
+	pid_t pid;
+
+	if (CHECK(pipe(pipe_c2p), test_name,
+		  "pipe pipe_c2p error: %s\n", strerror(errno)))
+		return;
+
+	if (CHECK(pipe(pipe_p2c), test_name,
+		  "pipe pipe_p2c error: %s\n", strerror(errno))) {
+		close(pipe_c2p[0]);
+		close(pipe_c2p[1]);
+		return;
+	}
+
+	pid = fork();
+	if (CHECK(pid < 0, test_name, "fork error: %s\n", strerror(errno))) {
+		close(pipe_c2p[0]);
+		close(pipe_c2p[1]);
+		close(pipe_p2c[0]);
+		close(pipe_p2c[1]);
+		return;
+	}
+
+	if (pid == 0) {
+		int old_prio;
+
+		/* install signal handler and notify parent */
+		signal(SIGUSR1, sigusr1_handler);
+
+		close(pipe_c2p[0]); /* close read */
+		close(pipe_p2c[1]); /* close write */
+
+		/* Boost to a high priority so that we have a higher chance
+		 * that, if an interrupt happens, the underlying task is
+		 * this process.
+		 */
+		errno = 0;
+		old_prio = getpriority(PRIO_PROCESS, 0);
+		ASSERT_OK(errno, "getpriority");
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, -20), "setpriority");
+
+		/* notify parent signal handler is installed */
+		CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+
+		/* make sure parent enabled bpf program to send_signal */
+		CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+
+		/* wait a little for signal handler */
+		sleep(1);
+
+		buf[0] = sigusr1_received ? '2' : '0';
+		CHECK(write(pipe_c2p[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+
+		/* wait for parent notification and exit */
+		CHECK(read(pipe_p2c[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+
+		/* restore the old priority */
+		ASSERT_OK(setpriority(PRIO_PROCESS, 0, old_prio), "setpriority");
+
+		close(pipe_c2p[1]);
+		close(pipe_p2c[0]);
+		exit(0);
+	}
+
+	close(pipe_c2p[1]); /* close write */
+	close(pipe_p2c[0]); /* close read */
+
+	skel = test_send_signal_kern__open_and_load();
+	if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n"))
+		goto skel_open_load_failure;
+
+	if (!attr) {
+		err = test_send_signal_kern__attach(skel);
+		if (CHECK(err, "skel_attach", "skeleton attach failed\n")) {
+			err = -1;
+			goto destroy_skel;
+		}
+	} else {
+		pmu_fd = syscall(__NR_perf_event_open, attr, pid, -1,
+				 -1 /* group id */, 0 /* flags */);
+		if (CHECK(pmu_fd < 0, test_name, "perf_event_open error: %s\n",
+			  strerror(errno))) {
+			err = -1;
+			goto destroy_skel;
+		}
+
+		skel->links.send_signal_perf =
+			bpf_program__attach_perf_event(skel->progs.send_signal_perf, pmu_fd);
+		if (CHECK(IS_ERR(skel->links.send_signal_perf), "attach_perf_event",
+			  "err %ld\n", PTR_ERR(skel->links.send_signal_perf)))
+			goto disable_pmu;
+	}
+
+	/* wait until child signal handler installed */
+	CHECK(read(pipe_c2p[0], buf, 1) != 1, "pipe_read", "err %d\n", -errno);
+
+	/* trigger the bpf send_signal */
+	skel->bss->pid = pid;
+	skel->bss->sig = SIGUSR1;
+	skel->bss->signal_thread = signal_thread;
+
+	/* notify child that bpf program can send_signal now */
+	CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+
+	/* wait for result */
+	err = read(pipe_c2p[0], buf, 1);
+	if (CHECK(err < 0, test_name, "reading pipe error: %s\n", strerror(errno)))
+		goto disable_pmu;
+	if (CHECK(err == 0, test_name, "reading pipe error: size 0\n")) {
+		err = -1;
+		goto disable_pmu;
+	}
+
+	CHECK(buf[0] != '2', test_name, "incorrect result\n");
+
+	/* notify child safe to exit */
+	CHECK(write(pipe_p2c[1], buf, 1) != 1, "pipe_write", "err %d\n", -errno);
+
+disable_pmu:
+	/* pmu_fd stays -1 on the auto-attach path; don't close(-1). */
+	if (pmu_fd >= 0)
+		close(pmu_fd);
+destroy_skel:
+	test_send_signal_kern__destroy(skel);
+skel_open_load_failure:
+	/* Closing our pipe ends unblocks the child before we reap it. */
+	close(pipe_c2p[0]);
+	close(pipe_p2c[1]);
+	wait(NULL);
+}
+
+/* Tracepoint variant: no perf event, the skeleton's own attach is used. */
+static void test_send_signal_tracepoint(bool signal_thread)
+{
+	struct perf_event_attr *no_perf_event = NULL;
+
+	test_send_signal_common(no_perf_event, signal_thread, "tracepoint");
+}
+
+/* Software perf event variant: sample the CPU clock with a period of 1. */
+static void test_send_signal_perf(bool signal_thread)
+{
+	struct perf_event_attr sw_attr = {};
+
+	sw_attr.type = PERF_TYPE_SOFTWARE;
+	sw_attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	sw_attr.sample_period = 1;
+
+	test_send_signal_common(&sw_attr, signal_thread, "perf_sw_event");
+}
+
+/*
+ * Hardware perf event variant (CPU cycles, sample period 1).  The event is
+ * first opened on the current process as a probe; the test is skipped when
+ * that fails with ENOENT.
+ */
+static void test_send_signal_nmi(bool signal_thread)
+{
+	struct perf_event_attr hw_attr = {
+		.sample_period = 1,
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	int probe_fd;
+
+	/* Some setups (e.g. virtual machines) might run with hardware
+	 * perf events disabled. If this is the case, skip this test.
+	 */
+	probe_fd = syscall(__NR_perf_event_open, &hw_attr, 0 /* pid */,
+			   -1 /* cpu */, -1 /* group_fd */, 0 /* flags */);
+	if (probe_fd != -1) {
+		close(probe_fd);
+	} else if (errno == ENOENT) {
+		printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n",
+		       __func__);
+		test__skip();
+		return;
+	}
+	/* Any other error: let the test fail with a more informative message */
+
+	test_send_signal_common(&hw_attr, signal_thread, "perf_hw_event");
+}
+
+/*
+ * Test entry point: register and run each attach flavour as its own subtest,
+ * first with signal_thread == false, then with signal_thread == true.
+ */
+void test_send_signal(void)
+{
+	static const struct {
+		const char *name;
+		void (*run)(bool signal_thread);
+		bool signal_thread;
+	} subtests[] = {
+		{ "send_signal_tracepoint", test_send_signal_tracepoint, false },
+		{ "send_signal_perf", test_send_signal_perf, false },
+		{ "send_signal_nmi", test_send_signal_nmi, false },
+		{ "send_signal_tracepoint_thread", test_send_signal_tracepoint, true },
+		{ "send_signal_perf_thread", test_send_signal_perf, true },
+		{ "send_signal_nmi_thread", test_send_signal_nmi, true },
+	};
+	size_t n;
+
+	for (n = 0; n < sizeof(subtests) / sizeof(subtests[0]); n++) {
+		if (test__start_subtest(subtests[n].name))
+			subtests[n].run(subtests[n].signal_thread);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c
new file mode 100644
index 000000000..189a34a7a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/send_signal_sched_switch.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "test_send_signal_kern.skel.h"
+
+/* Deliberately empty SIGUSR1 handler; the signal needs no action here. */
+static void sigusr1_handler(int signum)
+{
+	(void)signum;
+}
+
+#define THREAD_COUNT 100
+
+/* Thread body: perform 1000 one-microsecond sleeps, then exit. */
+static void *worker(void *p)
+{
+	int remaining = 1000;
+
+	while (remaining-- > 0)
+		usleep(1);
+
+	return NULL;
+}
+
+/*
+ * Load and attach the send_signal BPF skeleton with this process as the
+ * SIGUSR1 target, then start THREAD_COUNT short-lived sleeping threads and
+ * wait for all of them to finish.  There are no assertions on the signal
+ * itself; the test passes if setup and thread creation succeed.
+ */
+void test_send_signal_sched_switch(void)
+{
+	struct test_send_signal_kern *skel;
+	pthread_t threads[THREAD_COUNT];
+	u32 duration = 0;
+	int i, err, started = 0;
+
+	signal(SIGUSR1, sigusr1_handler);
+
+	skel = test_send_signal_kern__open_and_load();
+	if (CHECK(!skel, "skel_open_and_load", "skeleton open_and_load failed\n"))
+		return;
+
+	skel->bss->pid = getpid();
+	skel->bss->sig = SIGUSR1;
+
+	err = test_send_signal_kern__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed\n"))
+		goto destroy_skel;
+
+	for (i = 0; i < THREAD_COUNT; i++) {
+		/* pthread_create() returns the error number; errno is not set. */
+		err = pthread_create(threads + i, NULL, worker, NULL);
+		if (CHECK(err, "pthread_create", "Error creating thread, %s\n",
+			  strerror(err)))
+			break;
+		started++;
+	}
+
+	/* Join whatever was started, even if a later creation failed. */
+	for (i = 0; i < started; i++)
+		pthread_join(threads[i], NULL);
+
+destroy_skel:
+	test_send_signal_kern__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/signal_pending.c b/tools/testing/selftests/bpf/prog_tests/signal_pending.c
new file mode 100644
index 000000000..dfcbddcbe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/signal_pending.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Empty SIGALRM handler, installed through sigalrm_action below. */
+static void sigalrm_handler(int s)
+{
+	(void)s;
+}
+
+/* sigaction descriptor naming the handler; all other fields are zero. */
+static struct sigaction sigalrm_action = {
+	.sa_handler = sigalrm_handler,
+};
+
+/*
+ * Load a 4096-instruction program of type @prog_type (all "r0 = 0" followed
+ * by exit), arm a one-shot 100ms ITIMER_REAL timer, and start a test run
+ * with a repeat count of 0xffffffff.  The test fails if the reported
+ * duration exceeds 500ms.
+ *
+ * The program fd is closed on every path, and the run is skipped if the
+ * program cannot be loaded or the SIGALRM handler/timer cannot be set up
+ * (without the timer nothing would interrupt the long run).
+ */
+static void test_signal_pending_by_type(enum bpf_prog_type prog_type)
+{
+	struct bpf_insn prog[4096];
+	struct itimerval timeo = {
+		.it_value.tv_usec = 100000, /* 100ms */
+	};
+	__u32 duration = 0, retval;
+	int prog_fd;
+	int err;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(prog); i++)
+		prog[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0);
+	prog[ARRAY_SIZE(prog) - 1] = BPF_EXIT_INSN();
+
+	prog_fd = bpf_load_program(prog_type, prog, ARRAY_SIZE(prog),
+				   "GPL", 0, NULL, 0);
+	if (CHECK(prog_fd < 0, "test-run", "errno %d\n", errno))
+		return;
+
+	err = sigaction(SIGALRM, &sigalrm_action, NULL);
+	if (CHECK(err, "test-run-signal-sigaction", "errno %d\n", errno))
+		goto close_prog;
+
+	err = setitimer(ITIMER_REAL, &timeo, NULL);
+	if (CHECK(err, "test-run-signal-timer", "errno %d\n", errno))
+		goto restore_sigalrm;
+
+	err = bpf_prog_test_run(prog_fd, 0xffffffff, &pkt_v4, sizeof(pkt_v4),
+				NULL, NULL, &retval, &duration);
+	CHECK(duration > 500000000, /* 500ms */
+	      "test-run-signal-duration",
+	      "duration %uns > 500ms\n",
+	      duration);
+
+restore_sigalrm:
+	signal(SIGALRM, SIG_DFL);
+close_prog:
+	close(prog_fd);
+}
+
+/*
+ * Test entry point: run the signal-pending check for each program type in
+ * the table.  The @prog_type argument is not used by this function.
+ */
+void test_signal_pending(enum bpf_prog_type prog_type)
+{
+	static const enum bpf_prog_type types[] = {
+		BPF_PROG_TYPE_SOCKET_FILTER,
+		BPF_PROG_TYPE_FLOW_DISSECTOR,
+	};
+	size_t n;
+
+	for (n = 0; n < sizeof(types) / sizeof(types[0]); n++)
+		test_signal_pending_by_type(types[n]);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_assign.c b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
new file mode 100644
index 000000000..e09c5239a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_assign.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+// Copyright (c) 2020 Isovalent, Inc.
+/*
+ * Test that the socket assign program is able to redirect traffic towards a
+ * socket, regardless of whether the destination port or address of the
+ * traffic matches the port or address the socket is bound to.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "test_progs.h"
+
+#define BIND_PORT 1234
+#define CONNECT_PORT 4321
+#define TEST_DADDR (0xC0A80203)
+#define NS_SELF "/proc/self/ns/net"
+#define SERVER_MAP_PATH "/sys/fs/bpf/tc/globals/server_map"
+
+static const struct timeval timeo_sec = { .tv_sec = 3 };
+static const size_t timeo_optlen = sizeof(timeo_sec);
+static int stop, duration;
+
+/*
+ * Prepare the network environment for the test: pick the BPF object that
+ * matches the installed tc (libbpf-based or not), move into a new network
+ * namespace, bring up loopback with local default routes, and load the
+ * classifier on the lo ingress hook through tc.
+ *
+ * Returns true on success, false as soon as any step fails.
+ */
+static bool
+configure_stack(void)
+{
+	char tc_version[128];
+	char tc_cmd[BUFSIZ];
+	char *prog;
+	FILE *tc;
+
+	/* Check whether tc is built with libbpf. */
+	tc = popen("tc -V", "r");
+	if (CHECK_FAIL(!tc))
+		return false;
+	if (CHECK_FAIL(!fgets(tc_version, sizeof(tc_version), tc))) {
+		/* Don't leak the stream (and the child) on a failed read. */
+		pclose(tc);
+		return false;
+	}
+	if (strstr(tc_version, ", libbpf "))
+		prog = "test_sk_assign_libbpf.o";
+	else
+		prog = "test_sk_assign.o";
+	if (CHECK_FAIL(pclose(tc)))
+		return false;
+
+	/* Move to a new networking namespace */
+	if (CHECK_FAIL(unshare(CLONE_NEWNET)))
+		return false;
+
+	/* Configure necessary links, routes */
+	if (CHECK_FAIL(system("ip link set dev lo up")))
+		return false;
+	if (CHECK_FAIL(system("ip route add local default dev lo")))
+		return false;
+	if (CHECK_FAIL(system("ip -6 route add local default dev lo")))
+		return false;
+
+	/* Load qdisc, BPF program */
+	if (CHECK_FAIL(system("tc qdisc add dev lo clsact")))
+		return false;
+	/* Bounded write; tc's stderr is hidden unless run with -vv. */
+	snprintf(tc_cmd, sizeof(tc_cmd), "%s %s %s %s %s",
+		 "tc filter add dev lo ingress bpf",
+		 "direct-action object-file", prog,
+		 "section classifier/sk_assign_test",
+		 (env.verbosity < VERBOSE_VERY) ? " 2>/dev/null" : "verbose");
+	if (CHECK(system(tc_cmd), "BPF load failed;",
+		  "run with -vv for more info\n"))
+		return false;
+
+	return true;
+}
+
+/*
+ * Create a socket of @type in the family of @addr, give it a receive
+ * timeout, bind it to @addr and, for SOCK_STREAM, put it in listening state.
+ *
+ * Returns the socket fd, or -1 on failure (the socket is closed).
+ */
+static int
+start_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+	int sock;
+
+	sock = socket(addr->sa_family, type, 0);
+	if (CHECK_FAIL(sock == -1))
+		return sock;
+
+	/* Steps short-circuit, so each runs only if the previous succeeded. */
+	if (CHECK_FAIL(setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &timeo_sec,
+				  timeo_optlen)) ||
+	    CHECK_FAIL(bind(sock, addr, len) == -1) ||
+	    (type == SOCK_STREAM && CHECK_FAIL(listen(sock, 128) == -1))) {
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
+
+/*
+ * Create a socket of @type in the family of @addr, give it a send timeout
+ * and connect it to @addr.
+ *
+ * Returns the socket fd, or -1 on failure (the socket is closed).
+ */
+static int
+connect_to_server(const struct sockaddr *addr, socklen_t len, int type)
+{
+	int sock;
+
+	sock = socket(addr->sa_family, type, 0);
+	if (CHECK_FAIL(sock == -1))
+		return sock;
+
+	if (CHECK_FAIL(setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &timeo_sec,
+				  timeo_optlen)) ||
+	    CHECK_FAIL(connect(sock, addr, len))) {
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
+
+/*
+ * Return the local port of @fd as stored in its socket address (no byte
+ * order conversion is applied), or 0 if getsockname() fails or the address
+ * family is neither AF_INET nor AF_INET6.
+ */
+static in_port_t
+get_port(int fd)
+{
+	struct sockaddr_storage local;
+	socklen_t local_len = sizeof(local);
+
+	if (CHECK_FAIL(getsockname(fd, (struct sockaddr *)&local, &local_len)))
+		return 0;
+
+	if (local.ss_family == AF_INET)
+		return ((struct sockaddr_in *)&local)->sin_port;
+	if (local.ss_family == AF_INET6)
+		return ((struct sockaddr_in6 *)&local)->sin6_port;
+
+	CHECK(1, "Invalid address family", "%d\n", local.ss_family);
+	return 0;
+}
+
+/*
+ * Receive one message from @srv_client into a scratch buffer and return the
+ * byte count (or the failing call's return value).  read() is used for
+ * SOCK_STREAM, recvfrom() for every other type.  The data is discarded.
+ */
+static ssize_t
+rcv_msg(int srv_client, int type)
+{
+	char scratch[BUFSIZ];
+
+	return type == SOCK_STREAM ?
+		read(srv_client, scratch, sizeof(scratch)) :
+		recvfrom(srv_client, scratch, sizeof(scratch), 0, NULL, NULL);
+}
+
+/*
+ * Connect a client of @type to @addr, send one message and check that it is
+ * received on the server side, then verify the local port of the receiving
+ * socket.  For SOCK_STREAM the connection is accept()ed from @server_fd; for
+ * other types @server_fd itself is the receiving socket.
+ *
+ * Returns 0 on success and 1 on failure.  On failure the global 'stop' flag
+ * is also set.
+ */
+static int
+run_test(int server_fd, const struct sockaddr *addr, socklen_t len, int type)
+{
+ int client = -1, srv_client = -1;
+ char buf[] = "testing";
+ in_port_t port;
+ int ret = 1;
+
+ client = connect_to_server(addr, len, type);
+ if (client == -1) {
+ perror("Cannot connect to server");
+ goto out;
+ }
+
+ if (type == SOCK_STREAM) {
+ srv_client = accept(server_fd, NULL, NULL);
+ if (CHECK_FAIL(srv_client == -1)) {
+ perror("Can't accept connection");
+ goto out;
+ }
+ } else {
+ srv_client = server_fd;
+ }
+ /* sizeof(buf) includes the terminating NUL of "testing". */
+ if (CHECK_FAIL(write(client, buf, sizeof(buf)) != sizeof(buf))) {
+ perror("Can't write on client");
+ goto out;
+ }
+ if (CHECK_FAIL(rcv_msg(srv_client, type) != sizeof(buf))) {
+ perror("Can't read on server");
+ goto out;
+ }
+
+ port = get_port(srv_client);
+ if (CHECK_FAIL(!port))
+ goto out;
+ /* SOCK_STREAM is connected via accept(), so the server's local address
+ * will be the CONNECT_PORT rather than the BIND port that corresponds
+ * to the listen socket. SOCK_DGRAM on the other hand is connectionless
+ * so we can't really do the same check there; the server doesn't ever
+ * create a socket with CONNECT_PORT.
+ */
+ if (type == SOCK_STREAM &&
+ CHECK(port != htons(CONNECT_PORT), "Expected", "port %u but got %u",
+ CONNECT_PORT, ntohs(port)))
+ goto out;
+ else if (type == SOCK_DGRAM &&
+ CHECK(port != htons(BIND_PORT), "Expected",
+ "port %u but got %u", BIND_PORT, ntohs(port)))
+ goto out;
+
+ ret = 0;
+out:
+ /* client may still be -1 here if the connect failed. */
+ close(client);
+ /* Only close the accepted socket; server_fd belongs to the caller. */
+ if (srv_client != server_fd)
+ close(srv_client);
+ if (ret)
+ WRITE_ONCE(stop, 1);
+ return ret;
+}
+
+/*
+ * Fill @addr with an AF_INET or AF_INET6 address for @port.  The address is
+ * loopback by default.  With @rewrite_addr set, IPv4 uses TEST_DADDR and
+ * IPv6 uses the loopback address with its last 32 bits replaced by
+ * TEST_DADDR.  Any other @family only prints an error to stderr and leaves
+ * @addr untouched.
+ */
+static void
+prepare_addr(struct sockaddr *addr, int family, __u16 port, bool rewrite_addr)
+{
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+
+		memset(sin, 0, sizeof(*sin));
+		sin->sin_family = family;
+		sin->sin_port = htons(port);
+		sin->sin_addr.s_addr = rewrite_addr ? htonl(TEST_DADDR)
+						    : htonl(INADDR_LOOPBACK);
+	} else if (family == AF_INET6) {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+		memset(sin6, 0, sizeof(*sin6));
+		sin6->sin6_family = family;
+		sin6->sin6_port = htons(port);
+		sin6->sin6_addr = in6addr_loopback;
+		if (rewrite_addr)
+			sin6->sin6_addr.s6_addr32[3] = htonl(TEST_DADDR);
+	} else {
+		fprintf(stderr, "Invalid family %d", family);
+	}
+}
+
+/* Description of one sk_assign subtest. */
+struct test_sk_cfg {
+ const char *name; /* subtest name passed to test__start_subtest() */
+ int family; /* AF_INET or AF_INET6 */
+ struct sockaddr *addr; /* address buffer, (re)filled by prepare_addr() */
+ socklen_t len; /* size of the buffer @addr points to */
+ int type; /* SOCK_STREAM or SOCK_DGRAM */
+ bool rewrite_addr; /* connect to TEST_DADDR rather than loopback */
+};
+
+/*
+ * Initializer for one struct test_sk_cfg entry.  Expects variables named
+ * 'addr4' and 'addr6' to be in scope at the point of expansion; .addr and
+ * .len refer to whichever of the two matches FAMILY.
+ */
+#define TEST(NAME, FAMILY, TYPE, REWRITE) \
+{ \
+ .name = NAME, \
+ .family = FAMILY, \
+ .addr = (FAMILY == AF_INET) ? (struct sockaddr *)&addr4 \
+ : (struct sockaddr *)&addr6, \
+ .len = (FAMILY == AF_INET) ? sizeof(addr4) : sizeof(addr6), \
+ .type = TYPE, \
+ .rewrite_addr = REWRITE, \
+}
+
+/*
+ * Test entry point.  Saves a handle to the current network namespace, sets
+ * up a fresh namespace with the classifier loaded (configure_stack()), opens
+ * the pinned server map and then, for each subtest: starts a server bound to
+ * BIND_PORT, stores its fd in the map at key 0, and runs a client that
+ * connects to CONNECT_PORT (optionally at a rewritten address).
+ *
+ * On exit the pinned map is unlinked and the original namespace restored.
+ */
+void test_sk_assign(void)
+{
+ struct sockaddr_in addr4;
+ struct sockaddr_in6 addr6;
+ struct test_sk_cfg tests[] = {
+ TEST("ipv4 tcp port redir", AF_INET, SOCK_STREAM, false),
+ TEST("ipv4 tcp addr redir", AF_INET, SOCK_STREAM, true),
+ TEST("ipv6 tcp port redir", AF_INET6, SOCK_STREAM, false),
+ TEST("ipv6 tcp addr redir", AF_INET6, SOCK_STREAM, true),
+ TEST("ipv4 udp port redir", AF_INET, SOCK_DGRAM, false),
+ TEST("ipv4 udp addr redir", AF_INET, SOCK_DGRAM, true),
+ TEST("ipv6 udp port redir", AF_INET6, SOCK_DGRAM, false),
+ TEST("ipv6 udp addr redir", AF_INET6, SOCK_DGRAM, true),
+ };
+ /* Server socket fd; 64 bits wide because it is written as the map value. */
+ __s64 server = -1;
+ int server_map;
+ int self_net;
+ int i;
+
+ /* Keep a handle on the original netns so it can be restored below. */
+ self_net = open(NS_SELF, O_RDONLY);
+ if (CHECK_FAIL(self_net < 0)) {
+ perror("Unable to open "NS_SELF);
+ return;
+ }
+
+ if (!configure_stack()) {
+ perror("configure_stack");
+ goto cleanup;
+ }
+
+ server_map = bpf_obj_get(SERVER_MAP_PATH);
+ if (CHECK_FAIL(server_map < 0)) {
+ perror("Unable to open " SERVER_MAP_PATH);
+ goto cleanup;
+ }
+
+ /* 'stop' is set by run_test() on failure and ends the loop early. */
+ for (i = 0; i < ARRAY_SIZE(tests) && !READ_ONCE(stop); i++) {
+ struct test_sk_cfg *test = &tests[i];
+ const struct sockaddr *addr;
+ const int zero = 0;
+ int err;
+
+ if (!test__start_subtest(test->name))
+ continue;
+ prepare_addr(test->addr, test->family, BIND_PORT, false);
+ addr = (const struct sockaddr *)test->addr;
+ server = start_server(addr, test->len, test->type);
+ if (server == -1)
+ goto close;
+
+ err = bpf_map_update_elem(server_map, &zero, &server, BPF_ANY);
+ if (CHECK_FAIL(err)) {
+ perror("Unable to update server_map");
+ goto close;
+ }
+
+ /* connect to unbound ports */
+ /* The same buffer is rewritten in place; 'addr' still points at it. */
+ prepare_addr(test->addr, test->family, CONNECT_PORT,
+ test->rewrite_addr);
+ if (run_test(server, addr, test->len, test->type))
+ goto close;
+
+ close(server);
+ server = -1;
+ }
+
+close:
+ /* server is -1 here unless a subtest bailed out early. */
+ close(server);
+ close(server_map);
+cleanup:
+ if (CHECK_FAIL(unlink(SERVER_MAP_PATH)))
+ perror("Unable to unlink " SERVER_MAP_PATH);
+ if (CHECK_FAIL(setns(self_net, CLONE_NEWNET)))
+ perror("Failed to setns("NS_SELF")");
+ close(self_net);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
new file mode 100644
index 000000000..b4c9f4a96
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c
@@ -0,0 +1,1383 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+/*
+ * Test BPF attach point for INET socket lookup (BPF_SK_LOOKUP).
+ *
+ * Tests exercise:
+ * - attaching/detaching/querying programs to BPF_SK_LOOKUP hook,
+ * - redirecting socket lookup to a socket selected by BPF program,
+ * - failing a socket lookup on BPF program's request,
+ * - error scenarios for selecting a socket from BPF program,
+ * - accessing BPF program context,
+ * - attaching and running multiple BPF programs.
+ *
+ * Tests run in a dedicated network namespace.
+ */
+
+#define _GNU_SOURCE
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+
+#include "test_progs.h"
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+#include "testing_helpers.h"
+#include "test_sk_lookup.skel.h"
+
+/* External (address, port) pairs the client sends packets to. */
+#define EXT_IP4 "127.0.0.1"
+#define EXT_IP6 "fd00::1"
+#define EXT_PORT 7007
+
+/* Internal (address, port) pairs the server listens/receives at. */
+#define INT_IP4 "127.0.0.2"
+#define INT_IP4_V6 "::ffff:127.0.0.2"
+#define INT_IP6 "fd00::2"
+#define INT_PORT 8008
+
+#define IO_TIMEOUT_SEC 3
+
+/* Identifies a server socket slot; MAX_SERVERS is the number of slots. */
+enum server {
+ SERVER_A = 0,
+ SERVER_B = 1,
+ MAX_SERVERS,
+};
+
+/* NOTE(review): presumably indices of attached lookup programs; the users
+ * of these constants are not visible in this part of the file - confirm.
+ */
+enum {
+ PROG1 = 0,
+ PROG2,
+};
+
+/* An (address, port) pair; the address is kept in textual form. */
+struct inet_addr {
+ const char *ip; /* IPv4 or IPv6 address string */
+ unsigned short port;
+};
+
+/* Parameters of one socket lookup test case. */
+struct test {
+ const char *desc; /* test description */
+ struct bpf_program *lookup_prog; /* program attached to the netns lookup hook */
+ struct bpf_program *reuseport_prog; /* optional; NULL means a single plain server */
+ struct bpf_map *sock_map; /* map the server sockets are inserted into */
+ int sotype; /* SOCK_STREAM or SOCK_DGRAM */
+ struct inet_addr connect_to;
+ struct inet_addr listen_at; /* address the server sockets are bound to */
+ enum server accept_on;
+ bool reuseport_has_conns; /* Add a connected socket to reuseport group */
+};
+
+static __u32 duration; /* for CHECK macro */
+
+/* An address string is treated as IPv6 iff it contains a ':' character. */
+static bool is_ipv6(const char *ip)
+{
+	return strchr(ip, ':') != NULL;
+}
+
+/*
+ * Attach @reuseport_prog to @sock_fd with SO_ATTACH_REUSEPORT_EBPF.
+ *
+ * Returns 0 on success and -1 on failure.  If the program has no valid fd,
+ * errno is set to the negated value returned by bpf_program__fd().
+ */
+static int attach_reuseport(int sock_fd, struct bpf_program *reuseport_prog)
+{
+	int prog_fd = bpf_program__fd(reuseport_prog);
+
+	if (prog_fd < 0) {
+		errno = -prog_fd;
+		return -1;
+	}
+
+	return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
+			  &prog_fd, sizeof(prog_fd)) ? -1 : 0;
+}
+
+/*
+ * Size of the concrete socket address held in @addr: sockaddr_in for
+ * AF_INET, sockaddr_in6 for AF_INET6, and 0 for any other family.
+ */
+static socklen_t inetaddr_len(const struct sockaddr_storage *addr)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		return sizeof(struct sockaddr_in);
+	case AF_INET6:
+		return sizeof(struct sockaddr_in6);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Build the socket address for @ip/@port into @addr and create a socket of
+ * type @sotype in the matching family, with send and receive timeouts of
+ * IO_TIMEOUT_SEC seconds.  The socket is neither bound nor connected.
+ *
+ * Returns the socket fd, or -1 on failure.
+ */
+static int make_socket(int sotype, const char *ip, int port,
+		       struct sockaddr_storage *addr)
+{
+	struct timeval io_timeout = { .tv_sec = IO_TIMEOUT_SEC };
+	int err, family, fd;
+
+	if (is_ipv6(ip))
+		family = AF_INET6;
+	else
+		family = AF_INET;
+
+	err = make_sockaddr(family, ip, port, addr, NULL);
+	if (CHECK(err, "make_address", "failed\n"))
+		return -1;
+
+	fd = socket(addr->ss_family, sotype, 0);
+	if (CHECK(fd < 0, "socket", "failed\n")) {
+		log_err("failed to make socket");
+		return -1;
+	}
+
+	err = setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &io_timeout,
+			 sizeof(io_timeout));
+	if (CHECK(err, "setsockopt(SO_SNDTIMEO)", "failed\n")) {
+		log_err("failed to set SNDTIMEO");
+		goto err_close;
+	}
+
+	err = setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &io_timeout,
+			 sizeof(io_timeout));
+	if (CHECK(err, "setsockopt(SO_RCVTIMEO)", "failed\n")) {
+		log_err("failed to set RCVTIMEO");
+		goto err_close;
+	}
+
+	return fd;
+
+err_close:
+	close(fd);
+	return -1;
+}
+
+/*
+ * Create a server socket of type @sotype bound to @ip/@port.  Stream sockets
+ * are also put into listening state.  When @reuseport_prog is non-NULL,
+ * SO_REUSEPORT is enabled before bind() and the program is attached once the
+ * socket is fully set up.
+ *
+ * Returns the socket fd, or -1 on failure (the socket is closed).
+ */
+static int make_server(int sotype, const char *ip, int port,
+ struct bpf_program *reuseport_prog)
+{
+ struct sockaddr_storage addr = {0};
+ const int one = 1;
+ int err, fd = -1;
+
+ fd = make_socket(sotype, ip, port, &addr);
+ if (fd < 0)
+ return -1;
+
+ /* Enabled for UDPv6 sockets for IPv4-mapped IPv6 to work. */
+ if (sotype == SOCK_DGRAM) {
+ err = setsockopt(fd, SOL_IP, IP_RECVORIGDSTADDR, &one,
+ sizeof(one));
+ if (CHECK(err, "setsockopt(IP_RECVORIGDSTADDR)", "failed\n")) {
+ log_err("failed to enable IP_RECVORIGDSTADDR");
+ goto fail;
+ }
+ }
+
+ /* IPv6 datagram sockets additionally request the IPv6 variant. */
+ if (sotype == SOCK_DGRAM && addr.ss_family == AF_INET6) {
+ err = setsockopt(fd, SOL_IPV6, IPV6_RECVORIGDSTADDR, &one,
+ sizeof(one));
+ if (CHECK(err, "setsockopt(IPV6_RECVORIGDSTADDR)", "failed\n")) {
+ log_err("failed to enable IPV6_RECVORIGDSTADDR");
+ goto fail;
+ }
+ }
+
+ if (sotype == SOCK_STREAM) {
+ err = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one,
+ sizeof(one));
+ if (CHECK(err, "setsockopt(SO_REUSEADDR)", "failed\n")) {
+ log_err("failed to enable SO_REUSEADDR");
+ goto fail;
+ }
+ }
+
+ if (reuseport_prog) {
+ err = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one,
+ sizeof(one));
+ if (CHECK(err, "setsockopt(SO_REUSEPORT)", "failed\n")) {
+ log_err("failed to enable SO_REUSEPORT");
+ goto fail;
+ }
+ }
+
+ err = bind(fd, (void *)&addr, inetaddr_len(&addr));
+ if (CHECK(err, "bind", "failed\n")) {
+ log_err("failed to bind listen socket");
+ goto fail;
+ }
+
+ if (sotype == SOCK_STREAM) {
+ err = listen(fd, SOMAXCONN);
+ if (CHECK(err, "make_server", "listen")) {
+ log_err("failed to listen on port %d", port);
+ goto fail;
+ }
+ }
+
+ /* Late attach reuseport prog so we can have one init path */
+ if (reuseport_prog) {
+ err = attach_reuseport(fd, reuseport_prog);
+ if (CHECK(err, "attach_reuseport", "failed\n")) {
+ log_err("failed to attach reuseport prog");
+ goto fail;
+ }
+ }
+
+ return fd;
+fail:
+ close(fd);
+ return -1;
+}
+
+/*
+ * Create a socket of type @sotype and connect it to @ip/@port.
+ *
+ * Returns the connected socket fd, or -1 on failure.
+ */
+static int make_client(int sotype, const char *ip, int port)
+{
+	struct sockaddr_storage peer = {0};
+	int err, fd;
+
+	fd = make_socket(sotype, ip, port, &peer);
+	if (fd < 0)
+		return -1;
+
+	err = connect(fd, (void *)&peer, inetaddr_len(&peer));
+	if (!CHECK(err, "make_client", "connect"))
+		return fd;
+
+	log_err("failed to connect client socket");
+	close(fd);
+	return -1;
+}
+
+/* Return the SO_COOKIE value of @fd, or 0 if getsockopt() fails. */
+static __u64 socket_cookie(int fd)
+{
+	__u64 cookie;
+	socklen_t optlen = sizeof(cookie);
+	int rc;
+
+	rc = getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &optlen);
+	if (CHECK(rc < 0, "getsockopt(SO_COOKIE)", "%s\n", strerror(errno)))
+		return 0;
+
+	return cookie;
+}
+
+/*
+ * Zero @ctx and populate it from the textual addresses and ports.  The
+ * address family is chosen from @local_ip.  @local_port is stored unchanged
+ * while @remote_port is stored after htons().
+ *
+ * Returns 0 on success, 1 if either address fails to parse.
+ */
+static int fill_sk_lookup_ctx(struct bpf_sk_lookup *ctx, const char *local_ip, __u16 local_port,
+			      const char *remote_ip, __u16 remote_port)
+{
+	void *local_dst, *remote_dst;
+	int rc;
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->local_port = local_port;
+	ctx->remote_port = htons(remote_port);
+	ctx->family = is_ipv6(local_ip) ? AF_INET6 : AF_INET;
+
+	/* Select the address fields matching the family chosen above. */
+	if (ctx->family == AF_INET6) {
+		local_dst = &ctx->local_ip6[0];
+		remote_dst = &ctx->remote_ip6[0];
+	} else {
+		local_dst = &ctx->local_ip4;
+		remote_dst = &ctx->remote_ip4;
+	}
+
+	rc = inet_pton(ctx->family, local_ip, local_dst);
+	if (CHECK(rc != 1, "inet_pton", "local_ip failed\n"))
+		return 1;
+
+	rc = inet_pton(ctx->family, remote_ip, remote_dst);
+	if (CHECK(rc != 1, "inet_pton", "remote_ip failed\n"))
+		return 1;
+
+	return 0;
+}
+
+/* Send the single byte 'a' on @fd.  Returns 0 on success, -1 otherwise. */
+static int send_byte(int fd)
+{
+	static const char payload = 'a';
+	ssize_t sent;
+
+	errno = 0;
+	sent = send(fd, &payload, sizeof(payload), 0);
+	if (!CHECK(sent <= 0, "send_byte", "send"))
+		return 0;
+
+	log_err("failed/partial send");
+	return -1;
+}
+
+/*
+ * Receive up to one byte from @fd and discard it.  Returns 0 if a byte was
+ * received, -1 if recv() returned zero or an error.
+ */
+static int recv_byte(int fd)
+{
+	char byte;
+	ssize_t got;
+
+	got = recv(fd, &byte, sizeof(byte), 0);
+	if (!CHECK(got <= 0, "recv_byte", "recv"))
+		return 0;
+
+	log_err("failed/partial recv");
+	return -1;
+}
+
+/*
+ * Accept one connection on @server_fd, read up to one byte from it and echo
+ * what was read back to the peer.  The accepted socket is always closed.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int tcp_recv_send(int server_fd)
+{
+	int conn_fd, status = -1;
+	char byte[1];
+	ssize_t len;
+
+	conn_fd = accept(server_fd, NULL, NULL);
+	if (CHECK(conn_fd < 0, "accept", "failed\n")) {
+		log_err("failed to accept");
+		return -1;
+	}
+
+	len = recv(conn_fd, byte, sizeof(byte), 0);
+	if (CHECK(len <= 0, "recv", "failed\n")) {
+		log_err("failed/partial recv");
+		goto out;
+	}
+
+	len = send(conn_fd, byte, len, 0);
+	if (CHECK(len <= 0, "send", "failed\n")) {
+		log_err("failed/partial send");
+		goto out;
+	}
+
+	status = 0;
+out:
+	close(conn_fd);
+	return status;
+}
+
+/*
+ * Convert, in place, the IPv4 socket address stored in @ss into the
+ * equivalent IPv4-mapped IPv6 address (::ffff:a.b.c.d) with the same port.
+ *
+ * The IPv4 address is copied out first because the two layouts overlap.
+ * The whole sockaddr_in6 is then cleared so that sin6_flowinfo (which
+ * overlays the old sin_addr) and sin6_scope_id end up as zero instead of
+ * stale bytes.
+ */
+static void v4_to_v6(struct sockaddr_storage *ss)
+{
+	struct sockaddr_in6 *v6 = (struct sockaddr_in6 *)ss;
+	struct sockaddr_in v4 = *(struct sockaddr_in *)ss;
+
+	memset(v6, 0, sizeof(*v6));
+	v6->sin6_family = AF_INET6;
+	v6->sin6_port = v4.sin_port;
+	v6->sin6_addr.s6_addr[10] = 0xff;
+	v6->sin6_addr.s6_addr[11] = 0xff;
+	memcpy(&v6->sin6_addr.s6_addr[12], &v4.sin_addr.s_addr, 4);
+}
+
+/*
+ * Receive one datagram on @server_fd together with its original destination
+ * address (ORIGDSTADDR control message), then send the payload back to the
+ * sender from a new socket bound to that original destination address.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int udp_recv_send(int server_fd)
+{
+ char cmsg_buf[CMSG_SPACE(sizeof(struct sockaddr_storage))];
+ struct sockaddr_storage _src_addr = { 0 };
+ struct sockaddr_storage *src_addr = &_src_addr;
+ struct sockaddr_storage *dst_addr = NULL;
+ struct msghdr msg = { 0 };
+ struct iovec iov = { 0 };
+ struct cmsghdr *cm;
+ char buf[1];
+ int ret, fd;
+ ssize_t n;
+
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ msg.msg_name = src_addr;
+ msg.msg_namelen = sizeof(*src_addr);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cmsg_buf;
+ msg.msg_controllen = sizeof(cmsg_buf);
+
+ errno = 0;
+ n = recvmsg(server_fd, &msg, 0);
+ if (CHECK(n <= 0, "recvmsg", "failed\n")) {
+ log_err("failed to receive");
+ return -1;
+ }
+ if (CHECK(msg.msg_flags & MSG_CTRUNC, "recvmsg", "truncated cmsg\n"))
+ return -1;
+
+ /* Look for the original destination address among the cmsgs. */
+ for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
+ if ((cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_ORIGDSTADDR) ||
+ (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_ORIGDSTADDR)) {
+ /* dst_addr points into cmsg_buf, not at a separate copy. */
+ dst_addr = (struct sockaddr_storage *)CMSG_DATA(cm);
+ break;
+ }
+ log_err("warning: ignored cmsg at level %d type %d",
+ cm->cmsg_level, cm->cmsg_type);
+ }
+ if (CHECK(!dst_addr, "recvmsg", "missing ORIGDSTADDR\n"))
+ return -1;
+
+ /* Server socket bound to IPv4-mapped IPv6 address */
+ if (src_addr->ss_family == AF_INET6 &&
+ dst_addr->ss_family == AF_INET) {
+ v4_to_v6(dst_addr);
+ }
+
+ /* Reply from original destination address. */
+ fd = socket(dst_addr->ss_family, SOCK_DGRAM, 0);
+ if (CHECK(fd < 0, "socket", "failed\n")) {
+ log_err("failed to create tx socket");
+ return -1;
+ }
+
+ ret = bind(fd, (struct sockaddr *)dst_addr, sizeof(*dst_addr));
+ if (CHECK(ret, "bind", "failed\n")) {
+ log_err("failed to bind tx socket");
+ goto out;
+ }
+
+ /* Reuse msg for the reply: msg_name still holds the sender's address
+  * and the iovec still points at the received byte; drop the cmsgs.
+  */
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ n = sendmsg(fd, &msg, 0);
+ if (CHECK(n <= 0, "sendmsg", "failed\n")) {
+ log_err("failed to send echo reply");
+ ret = -1;
+ goto out;
+ }
+
+ ret = 0;
+out:
+ close(fd);
+ return ret;
+}
+
+/*
+ * One TCP echo round trip: client sends a byte, server accepts and echoes
+ * it, client receives the reply.  Stops at the first failing step.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int tcp_echo_test(int client_fd, int server_fd)
+{
+	if (send_byte(client_fd) ||
+	    tcp_recv_send(server_fd) ||
+	    recv_byte(client_fd))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * One UDP echo round trip: client sends a byte, server receives and echoes
+ * it, client receives the reply.  Stops at the first failing step.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int udp_echo_test(int client_fd, int server_fd)
+{
+	if (send_byte(client_fd) ||
+	    udp_recv_send(server_fd) ||
+	    recv_byte(client_fd))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Attach @prog to the network namespace of the calling process.
+ *
+ * Returns the new link, or NULL on failure.  When the attach itself fails,
+ * errno is set from the error encoded in the returned pointer before the
+ * failure is logged.
+ */
+static struct bpf_link *attach_lookup_prog(struct bpf_program *prog)
+{
+	struct bpf_link *lookup_link;
+	int netns_fd;
+
+	netns_fd = open("/proc/self/ns/net", O_RDONLY);
+	if (CHECK(netns_fd < 0, "open", "failed\n")) {
+		log_err("failed to open /proc/self/ns/net");
+		return NULL;
+	}
+
+	lookup_link = bpf_program__attach_netns(prog, netns_fd);
+	if (CHECK(IS_ERR(lookup_link), "bpf_program__attach_netns", "failed\n")) {
+		errno = -PTR_ERR(lookup_link);
+		log_err("failed to attach program '%s' to netns",
+			bpf_program__name(prog));
+		lookup_link = NULL;
+	}
+
+	/* The namespace fd is closed whether or not the attach succeeded. */
+	close(netns_fd);
+	return lookup_link;
+}
+
+/*
+ * Insert @sock_fd into @map at key @index.  BPF_NOEXIST is used, so the
+ * update fails if the slot is already populated.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int update_lookup_map(struct bpf_map *map, int index, int sock_fd)
+{
+	uint64_t sock_val = (uint64_t)sock_fd;
+	int fd, rc;
+
+	fd = bpf_map__fd(map);
+	if (CHECK(fd < 0, "bpf_map__fd", "failed\n")) {
+		errno = -fd;
+		log_err("failed to get map FD");
+		return -1;
+	}
+
+	rc = bpf_map_update_elem(fd, &index, &sock_val, BPF_NOEXIST);
+	if (CHECK(rc, "bpf_map_update_elem", "failed\n")) {
+		log_err("failed to update redir_map @ %d", index);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Attach three lookup programs to the current netns (lookup_pass twice,
+ * then lookup_drop) and check that bpf_prog_query() reports zero attach
+ * flags, a count of three, and program ids matching each link at the same
+ * position.  Afterwards detach the first link and check that its link info
+ * still reports the program id but a zero netns_ino.
+ */
+static void query_lookup_prog(struct test_sk_lookup *skel)
+{
+ struct bpf_link *link[3] = {};
+ struct bpf_link_info info;
+ __u32 attach_flags = 0;
+ __u32 prog_ids[3] = {};
+ __u32 prog_cnt = 3;
+ __u32 prog_id;
+ int net_fd;
+ int err;
+
+ net_fd = open("/proc/self/ns/net", O_RDONLY);
+ if (CHECK(net_fd < 0, "open", "failed\n")) {
+ log_err("failed to open /proc/self/ns/net");
+ return;
+ }
+
+ link[0] = attach_lookup_prog(skel->progs.lookup_pass);
+ if (!link[0])
+ goto close;
+ link[1] = attach_lookup_prog(skel->progs.lookup_pass);
+ if (!link[1])
+ goto detach;
+ link[2] = attach_lookup_prog(skel->progs.lookup_drop);
+ if (!link[2])
+ goto detach;
+
+ err = bpf_prog_query(net_fd, BPF_SK_LOOKUP, 0 /* query flags */,
+ &attach_flags, prog_ids, &prog_cnt);
+ if (CHECK(err, "bpf_prog_query", "failed\n")) {
+ log_err("failed to query lookup prog");
+ goto detach;
+ }
+
+ errno = 0;
+ if (CHECK(attach_flags != 0, "bpf_prog_query",
+ "wrong attach_flags on query: %u", attach_flags))
+ goto detach;
+ if (CHECK(prog_cnt != 3, "bpf_prog_query",
+ "wrong program count on query: %u", prog_cnt))
+ goto detach;
+ /* NOTE(review): link_info_prog_id() is defined elsewhere; presumably it
+  * fills 'info' for the link and returns its program id - confirm.
+  */
+ prog_id = link_info_prog_id(link[0], &info);
+ CHECK(prog_ids[0] != prog_id, "bpf_prog_query",
+ "invalid program #0 id on query: %u != %u\n",
+ prog_ids[0], prog_id);
+ CHECK(info.netns.netns_ino == 0, "netns_ino",
+ "unexpected netns_ino: %u\n", info.netns.netns_ino);
+ prog_id = link_info_prog_id(link[1], &info);
+ CHECK(prog_ids[1] != prog_id, "bpf_prog_query",
+ "invalid program #1 id on query: %u != %u\n",
+ prog_ids[1], prog_id);
+ CHECK(info.netns.netns_ino == 0, "netns_ino",
+ "unexpected netns_ino: %u\n", info.netns.netns_ino);
+ prog_id = link_info_prog_id(link[2], &info);
+ CHECK(prog_ids[2] != prog_id, "bpf_prog_query",
+ "invalid program #2 id on query: %u != %u\n",
+ prog_ids[2], prog_id);
+ CHECK(info.netns.netns_ino == 0, "netns_ino",
+ "unexpected netns_ino: %u\n", info.netns.netns_ino);
+
+ err = bpf_link__detach(link[0]);
+ if (CHECK(err, "link_detach", "failed %d\n", err))
+ goto detach;
+
+ /* prog id is still there, but netns_ino is zeroed out */
+ prog_id = link_info_prog_id(link[0], &info);
+ CHECK(prog_ids[0] != prog_id, "bpf_prog_query",
+ "invalid program #0 id on query: %u != %u\n",
+ prog_ids[0], prog_id);
+ CHECK(info.netns.netns_ino != 0, "netns_ino",
+ "unexpected netns_ino: %u\n", info.netns.netns_ino);
+
+detach:
+ /* Links are destroyed in reverse order of attachment. */
+ if (link[2])
+ bpf_link__destroy(link[2]);
+ if (link[1])
+ bpf_link__destroy(link[1]);
+ if (link[0])
+ bpf_link__destroy(link[0]);
+close:
+ close(net_fd);
+}
+
+/*
+ * Run one redirect scenario described by @t.
+ *
+ * Attaches t->lookup_prog, starts the listening server(s) at t->listen_at
+ * and stores each one in t->sock_map under its index, optionally adds a
+ * self-connected socket to the reuseport group, then connects a client to
+ * t->connect_to and runs an echo exchange against the server at index
+ * t->accept_on. All sockets and the link are released before returning.
+ */
+static void run_lookup_prog(const struct test *t)
+{
+	int srv[] = { [0 ... MAX_SERVERS - 1] = -1 };
+	struct bpf_link *link;
+	int conn_fd = -1;
+	int cli, idx, rc;
+
+	link = attach_lookup_prog(t->lookup_prog);
+	if (link == NULL)
+		return;
+
+	idx = 0;
+	while (idx < ARRAY_SIZE(srv)) {
+		srv[idx] = make_server(t->sotype, t->listen_at.ip,
+				       t->listen_at.port, t->reuseport_prog);
+		if (srv[idx] < 0)
+			goto cleanup;
+
+		rc = update_lookup_map(t->sock_map, idx, srv[idx]);
+		if (rc)
+			goto cleanup;
+
+		/* Without a reuseport program one server is all we need */
+		if (!t->reuseport_prog)
+			break;
+
+		idx++;
+	}
+
+	/*
+	 * Plain UDP lookup treats a reuseport group differently once it
+	 * holds connected sockets. Make sure that putting a connected UDP
+	 * socket into the group leaves reuseport selection under BPF
+	 * socket lookup unaffected.
+	 */
+	if (t->reuseport_has_conns) {
+		struct sockaddr_storage self = {};
+		socklen_t self_len = sizeof(self);
+
+		/* One more member for the reuseport group */
+		conn_fd = make_server(t->sotype, t->listen_at.ip,
+				      t->listen_at.port, t->reuseport_prog);
+		if (conn_fd < 0)
+			goto cleanup;
+
+		/* Connect it to its own bound address */
+		rc = getsockname(conn_fd, (void *)&self, &self_len);
+		if (CHECK(rc, "getsockname", "errno %d\n", errno))
+			goto cleanup;
+		rc = connect(conn_fd, (void *)&self, self_len);
+		if (CHECK(rc, "connect", "errno %d\n", errno))
+			goto cleanup;
+	}
+
+	cli = make_client(t->sotype, t->connect_to.ip, t->connect_to.port);
+	if (cli < 0)
+		goto cleanup;
+
+	if (t->sotype != SOCK_STREAM)
+		udp_echo_test(cli, srv[t->accept_on]);
+	else
+		tcp_echo_test(cli, srv[t->accept_on]);
+
+	close(cli);
+cleanup:
+	if (conn_fd != -1)
+		close(conn_fd);
+	idx = 0;
+	while (idx < ARRAY_SIZE(srv)) {
+		if (srv[idx] != -1)
+			close(srv[idx]);
+		idx++;
+	}
+	bpf_link__destroy(link);
+}
+
+/*
+ * Table-driven redirect tests. Each entry is handed to run_lookup_prog(),
+ * which attaches .lookup_prog, listens at .listen_at, registers the
+ * server(s) in .sock_map, connects a client to .connect_to and expects the
+ * echo on the server with index .accept_on. Entries without
+ * .reuseport_prog start a single server; .accept_on is zero when omitted
+ * (presumably SERVER_A - confirm against the enum definition).
+ * Every entry runs as its own subtest named by .desc.
+ */
+static void test_redirect_lookup(struct test_sk_lookup *skel)
+{
+ const struct test tests[] = {
+ /* TCP (SOCK_STREAM) scenarios */
+ {
+ .desc = "TCP IPv4 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { EXT_IP4, INT_PORT },
+ },
+ {
+ .desc = "TCP IPv4 redir addr",
+ .lookup_prog = skel->progs.redir_ip4,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, EXT_PORT },
+ },
+ {
+ .desc = "TCP IPv4 redir with reuseport",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, INT_PORT },
+ .accept_on = SERVER_B,
+ },
+ {
+ .desc = "TCP IPv4 redir skip reuseport",
+ .lookup_prog = skel->progs.select_sock_a_no_reuseport,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, INT_PORT },
+ .accept_on = SERVER_A,
+ },
+ {
+ .desc = "TCP IPv6 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { EXT_IP6, INT_PORT },
+ },
+ {
+ .desc = "TCP IPv6 redir addr",
+ .lookup_prog = skel->progs.redir_ip6,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, EXT_PORT },
+ },
+ {
+ .desc = "TCP IPv4->IPv6 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4_V6, INT_PORT },
+ },
+ {
+ .desc = "TCP IPv6 redir with reuseport",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, INT_PORT },
+ .accept_on = SERVER_B,
+ },
+ {
+ .desc = "TCP IPv6 redir skip reuseport",
+ .lookup_prog = skel->progs.select_sock_a_no_reuseport,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_STREAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, INT_PORT },
+ .accept_on = SERVER_A,
+ },
+ /* UDP (SOCK_DGRAM) scenarios */
+ {
+ .desc = "UDP IPv4 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { EXT_IP4, INT_PORT },
+ },
+ {
+ .desc = "UDP IPv4 redir addr",
+ .lookup_prog = skel->progs.redir_ip4,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, EXT_PORT },
+ },
+ {
+ .desc = "UDP IPv4 redir with reuseport",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, INT_PORT },
+ .accept_on = SERVER_B,
+ },
+ {
+ .desc = "UDP IPv4 redir and reuseport with conns",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, INT_PORT },
+ .accept_on = SERVER_B,
+ .reuseport_has_conns = true,
+ },
+ {
+ .desc = "UDP IPv4 redir skip reuseport",
+ .lookup_prog = skel->progs.select_sock_a_no_reuseport,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP4, EXT_PORT },
+ .listen_at = { INT_IP4, INT_PORT },
+ .accept_on = SERVER_A,
+ },
+ {
+ .desc = "UDP IPv6 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { EXT_IP6, INT_PORT },
+ },
+ {
+ .desc = "UDP IPv6 redir addr",
+ .lookup_prog = skel->progs.redir_ip6,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, EXT_PORT },
+ },
+ {
+ .desc = "UDP IPv4->IPv6 redir port",
+ .lookup_prog = skel->progs.redir_port,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .listen_at = { INT_IP4_V6, INT_PORT },
+ .connect_to = { EXT_IP4, EXT_PORT },
+ },
+ {
+ .desc = "UDP IPv6 redir and reuseport",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, INT_PORT },
+ .accept_on = SERVER_B,
+ },
+ {
+ .desc = "UDP IPv6 redir and reuseport with conns",
+ .lookup_prog = skel->progs.select_sock_a,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, INT_PORT },
+ .accept_on = SERVER_B,
+ .reuseport_has_conns = true,
+ },
+ {
+ .desc = "UDP IPv6 redir skip reuseport",
+ .lookup_prog = skel->progs.select_sock_a_no_reuseport,
+ .reuseport_prog = skel->progs.select_sock_b,
+ .sock_map = skel->maps.redir_map,
+ .sotype = SOCK_DGRAM,
+ .connect_to = { EXT_IP6, EXT_PORT },
+ .listen_at = { INT_IP6, INT_PORT },
+ .accept_on = SERVER_A,
+ },
+ };
+ const struct test *t;
+
+ /* Run only the entries selected as subtests */
+ for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+ if (test__start_subtest(t->desc))
+ run_lookup_prog(t);
+ }
+}
+
+/*
+ * Expect a connection attempt to be refused while t->lookup_prog is
+ * attached.
+ *
+ * A server listens at t->listen_at and a client socket targets
+ * t->connect_to. For SOCK_DGRAM the failure is observed through a
+ * zero-length recv() after sending one byte; for other socket types the
+ * connect() result itself is used. Either way the failure must carry
+ * errno ECONNREFUSED.
+ */
+static void drop_on_lookup(const struct test *t)
+{
+	struct sockaddr_storage peer = {};
+	struct bpf_link *link;
+	int cli, srv, failed;
+
+	link = attach_lookup_prog(t->lookup_prog);
+	if (link == NULL)
+		return;
+
+	srv = make_server(t->sotype, t->listen_at.ip, t->listen_at.port,
+			  t->reuseport_prog);
+	if (srv < 0)
+		goto out_detach;
+
+	cli = make_socket(t->sotype, t->connect_to.ip, t->connect_to.port,
+			  &peer);
+	if (cli < 0)
+		goto out_close_server;
+
+	failed = connect(cli, (void *)&peer, inetaddr_len(&peer));
+	if (t->sotype == SOCK_DGRAM) {
+		failed = send_byte(cli);
+		if (failed)
+			goto out_close_client;
+
+		/* The error arrives asynchronously; recv() collects it */
+		failed = recv(cli, NULL, 0, 0) == -1;
+	}
+	if (CHECK(!(failed && errno == ECONNREFUSED), "connect",
+		  "unexpected success or error\n"))
+		log_err("expected ECONNREFUSED on connect");
+
+out_close_client:
+	close(cli);
+out_close_server:
+	close(srv);
+out_detach:
+	bpf_link__destroy(link);
+}
+
+/*
+ * Table-driven tests for the lookup_drop program: with it attached, a
+ * connection to an address that does have a listener must be refused
+ * (see drop_on_lookup()). One subtest per entry, named by .desc.
+ *
+ * Every entry listens on exactly the address and port the client connects
+ * to. Otherwise ECONNREFUSED would occur even if the program never
+ * dropped anything, and the subtest could not detect a regression.
+ */
+static void test_drop_on_lookup(struct test_sk_lookup *skel)
+{
+	const struct test tests[] = {
+		{
+			.desc		= "TCP IPv4 drop on lookup",
+			.lookup_prog	= skel->progs.lookup_drop,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { EXT_IP4, EXT_PORT },
+		},
+		{
+			.desc		= "TCP IPv6 drop on lookup",
+			.lookup_prog	= skel->progs.lookup_drop,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			.listen_at	= { EXT_IP6, EXT_PORT },
+		},
+		{
+			.desc		= "UDP IPv4 drop on lookup",
+			.lookup_prog	= skel->progs.lookup_drop,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { EXT_IP4, EXT_PORT },
+		},
+		{
+			.desc		= "UDP IPv6 drop on lookup",
+			.lookup_prog	= skel->progs.lookup_drop,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			/* Same port as connect_to, like the other entries */
+			.listen_at	= { EXT_IP6, EXT_PORT },
+		},
+	};
+	const struct test *t;
+
+	for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+		if (test__start_subtest(t->desc))
+			drop_on_lookup(t);
+	}
+}
+
+/*
+ * Expect a connection attempt to be refused when the reuseport program
+ * attached to the selected socket drops the packet.
+ *
+ * server1 listens at t->listen_at with t->reuseport_prog and is stored in
+ * t->sock_map under SERVER_A. server2 listens at t->connect_to with no
+ * reuseport program and must never be reached. The client then targets
+ * t->connect_to and must fail with ECONNREFUSED (observed via recv() for
+ * SOCK_DGRAM, via connect() otherwise).
+ */
+static void drop_on_reuseport(const struct test *t)
+{
+	struct sockaddr_storage dst = { 0 };
+	int client, server1, server2, err;
+	struct bpf_link *lookup_link;
+	ssize_t n;
+
+	lookup_link = attach_lookup_prog(t->lookup_prog);
+	if (!lookup_link)
+		return;
+
+	server1 = make_server(t->sotype, t->listen_at.ip, t->listen_at.port,
+			      t->reuseport_prog);
+	if (server1 < 0)
+		goto detach;
+
+	err = update_lookup_map(t->sock_map, SERVER_A, server1);
+	if (err)
+		goto close_srv1;	/* server1 is open by now, release it */
+
+	/* second server on destination address we should never reach */
+	server2 = make_server(t->sotype, t->connect_to.ip, t->connect_to.port,
+			      NULL /* reuseport prog */);
+	if (server2 < 0)
+		goto close_srv1;
+
+	client = make_socket(t->sotype, t->connect_to.ip,
+			     t->connect_to.port, &dst);
+	if (client < 0)
+		goto close_srv2;
+
+	err = connect(client, (void *)&dst, inetaddr_len(&dst));
+	if (t->sotype == SOCK_DGRAM) {
+		err = send_byte(client);
+		if (err)
+			goto close_all;
+
+		/* Read out asynchronous error */
+		n = recv(client, NULL, 0, 0);
+		err = n == -1;
+	}
+	if (CHECK(!err || errno != ECONNREFUSED, "connect",
+		  "unexpected success or error\n"))
+		log_err("expected ECONNREFUSED on connect");
+
+close_all:
+	close(client);
+close_srv2:
+	close(server2);
+close_srv1:
+	close(server1);
+detach:
+	bpf_link__destroy(lookup_link);
+}
+
+/*
+ * Table-driven tests for a reuseport program that drops: select_sock_a
+ * picks the socket stored in redir_map, whose reuseport group runs
+ * reuseport_drop, and the connection must be refused (see
+ * drop_on_reuseport()). Covers TCP and UDP over both IPv4 and IPv6; one
+ * subtest per entry, named by .desc.
+ */
+static void test_drop_on_reuseport(struct test_sk_lookup *skel)
+{
+	const struct test tests[] = {
+		{
+			.desc		= "TCP IPv4 drop on reuseport",
+			.lookup_prog	= skel->progs.select_sock_a,
+			.reuseport_prog	= skel->progs.reuseport_drop,
+			.sock_map	= skel->maps.redir_map,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { INT_IP4, INT_PORT },
+		},
+		{
+			.desc		= "TCP IPv6 drop on reuseport",
+			.lookup_prog	= skel->progs.select_sock_a,
+			.reuseport_prog	= skel->progs.reuseport_drop,
+			.sock_map	= skel->maps.redir_map,
+			.sotype		= SOCK_STREAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			.listen_at	= { INT_IP6, INT_PORT },
+		},
+		{
+			.desc		= "UDP IPv4 drop on reuseport",
+			.lookup_prog	= skel->progs.select_sock_a,
+			.reuseport_prog	= skel->progs.reuseport_drop,
+			.sock_map	= skel->maps.redir_map,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP4, EXT_PORT },
+			.listen_at	= { INT_IP4, INT_PORT },
+		},
+		{
+			/* UDP/IPv6 counterpart; used to repeat the TCP IPv6 entry */
+			.desc		= "UDP IPv6 drop on reuseport",
+			.lookup_prog	= skel->progs.select_sock_a,
+			.reuseport_prog	= skel->progs.reuseport_drop,
+			.sock_map	= skel->maps.redir_map,
+			.sotype		= SOCK_DGRAM,
+			.connect_to	= { EXT_IP6, EXT_PORT },
+			.listen_at	= { INT_IP6, INT_PORT },
+		},
+	};
+	const struct test *t;
+
+	for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+		if (test__start_subtest(t->desc))
+			drop_on_reuseport(t);
+	}
+}
+
+/*
+ * Run @lookup_prog once through bpf_prog_test_run_opts() with a
+ * hand-built bpf_sk_lookup context and verify which socket it selected.
+ *
+ * @skel:        loaded skeleton; its redir_map receives the test servers
+ * @lookup_prog: program under test
+ * @remote_ip:   remote address placed in the context (with INT_PORT)
+ * @local_ip:    local address placed in the context (with EXT_PORT); the
+ *               TCP servers are also created on this address
+ *
+ * MAX_SERVERS TCP servers are created and stored in redir_map by index.
+ * After the run, ctx.cookie must be non-zero and equal to the socket
+ * cookie of the SERVER_B socket. All server sockets are closed on every
+ * path that opened them.
+ */
+static void run_sk_assign(struct test_sk_lookup *skel,
+			  struct bpf_program *lookup_prog,
+			  const char *remote_ip, const char *local_ip)
+{
+	int server_fds[] = { [0 ... MAX_SERVERS - 1] = -1 };
+	struct bpf_sk_lookup ctx;
+	__u64 server_cookie;
+	int i, err;
+
+	/* The same buffer is used as input and output context */
+	DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts,
+		.ctx_in = &ctx,
+		.ctx_size_in = sizeof(ctx),
+		.ctx_out = &ctx,
+		.ctx_size_out = sizeof(ctx),
+	);
+
+	/* No socket has been opened yet, so a plain return is fine here */
+	if (fill_sk_lookup_ctx(&ctx, local_ip, EXT_PORT, remote_ip, INT_PORT))
+		return;
+
+	ctx.protocol = IPPROTO_TCP;
+
+	for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+		server_fds[i] = make_server(SOCK_STREAM, local_ip, 0, NULL);
+		if (server_fds[i] < 0)
+			goto close_servers;
+
+		err = update_lookup_map(skel->maps.redir_map, i,
+					server_fds[i]);
+		if (err)
+			goto close_servers;
+	}
+
+	server_cookie = socket_cookie(server_fds[SERVER_B]);
+	if (!server_cookie)
+		goto close_servers;	/* servers are open, do not leak them */
+
+	err = bpf_prog_test_run_opts(bpf_program__fd(lookup_prog), &opts);
+	if (CHECK(err, "test_run", "failed with error %d\n", errno))
+		goto close_servers;
+
+	if (CHECK(ctx.cookie == 0, "ctx.cookie", "no socket selected\n"))
+		goto close_servers;
+
+	CHECK(ctx.cookie != server_cookie, "ctx.cookie",
+	      "selected sk %llu instead of %llu\n", ctx.cookie, server_cookie);
+
+close_servers:
+	for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
+		if (server_fds[i] != -1)
+			close(server_fds[i]);
+	}
+}
+
+/*
+ * IPv4 variant of run_sk_assign(): INT_IP4 is passed as the remote address
+ * and EXT_IP4 as the local address.
+ */
+static void run_sk_assign_v4(struct test_sk_lookup *skel,
+ struct bpf_program *lookup_prog)
+{
+ run_sk_assign(skel, lookup_prog, INT_IP4, EXT_IP4);
+}
+
+/*
+ * IPv6 variant of run_sk_assign(): INT_IP6 is passed as the remote address
+ * and EXT_IP6 as the local address.
+ */
+static void run_sk_assign_v6(struct test_sk_lookup *skel,
+ struct bpf_program *lookup_prog)
+{
+ run_sk_assign(skel, lookup_prog, INT_IP6, EXT_IP6);
+}
+
+/*
+ * Exercise the sk_assign_esocknosupport program with a connected socket
+ * stored in redir_map.
+ *
+ * A server is started at EXT_IP4:EXT_PORT and a first client connects to
+ * it; that connected client socket is stored in redir_map under SERVER_A.
+ * With the program attached, a second client is created towards the same
+ * address. For SOCK_DGRAM one byte is additionally sent by the second
+ * client and received on the server socket. No result is checked in this
+ * function itself.
+ */
+static void run_sk_assign_connected(struct test_sk_lookup *skel,
+				    int sotype)
+{
+	struct bpf_link *link;
+	int srv, peer, cli;
+
+	srv = make_server(sotype, EXT_IP4, EXT_PORT, NULL);
+	if (srv < 0)
+		return;
+
+	peer = make_client(sotype, EXT_IP4, EXT_PORT);
+	if (peer < 0)
+		goto close_srv;
+
+	/* The redirect map now refers to an already connected socket */
+	if (update_lookup_map(skel->maps.redir_map, SERVER_A, peer))
+		goto close_peer;
+
+	link = attach_lookup_prog(skel->progs.sk_assign_esocknosupport);
+	if (link == NULL)
+		goto close_peer;
+
+	/* Attempt to steer a TCP SYN / UDP packet to the connected socket */
+	cli = make_client(sotype, EXT_IP4, EXT_PORT);
+	if (cli >= 0) {
+		if (sotype == SOCK_DGRAM) {
+			send_byte(cli);
+			recv_byte(srv);
+		}
+		close(cli);
+	}
+
+	bpf_link__destroy(link);
+close_peer:
+	close(peer);
+close_srv:
+	close(srv);
+}
+
+/*
+ * Subtests for the sk_assign helper and for context access. Six subtests
+ * run a program through run_sk_assign_v4()/run_sk_assign_v6(); the last
+ * two run run_sk_assign_connected() for TCP and UDP. Subtests are started
+ * in the order listed.
+ */
+static void test_sk_assign_helper(struct test_sk_lookup *skel)
+{
+	const struct {
+		const char *desc;
+		void (*run)(struct test_sk_lookup *skel,
+			    struct bpf_program *lookup_prog);
+		struct bpf_program *prog;
+	} prog_tests[] = {
+		{ "sk_assign returns EEXIST",
+		  run_sk_assign_v4, skel->progs.sk_assign_eexist },
+		{ "sk_assign honors F_REPLACE",
+		  run_sk_assign_v4, skel->progs.sk_assign_replace_flag },
+		{ "sk_assign accepts NULL socket",
+		  run_sk_assign_v4, skel->progs.sk_assign_null },
+		{ "access ctx->sk",
+		  run_sk_assign_v4, skel->progs.access_ctx_sk },
+		{ "narrow access to ctx v4",
+		  run_sk_assign_v4, skel->progs.ctx_narrow_access },
+		{ "narrow access to ctx v6",
+		  run_sk_assign_v6, skel->progs.ctx_narrow_access },
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(prog_tests); i++) {
+		if (!test__start_subtest(prog_tests[i].desc))
+			continue;
+		prog_tests[i].run(skel, prog_tests[i].prog);
+	}
+
+	if (test__start_subtest("sk_assign rejects TCP established"))
+		run_sk_assign_connected(skel, SOCK_STREAM);
+	if (test__start_subtest("sk_assign rejects UDP connected"))
+		run_sk_assign_connected(skel, SOCK_DGRAM);
+}
+
+/* One scenario for run_multi_prog_lookup(): two programs attached together. */
+struct test_multi_prog {
+ /* Subtest name passed to test__start_subtest() */
+ const char *desc;
+ /* Program attached first */
+ struct bpf_program *prog1;
+ /* Program attached second */
+ struct bpf_program *prog2;
+ /* Map that receives the server socket under SERVER_A */
+ struct bpf_map *redir_map;
+ /* Map with one "done" flag per program, keyed by PROG1 / PROG2 */
+ struct bpf_map *run_map;
+ /* errno expected from connect(); zero means connect() must succeed */
+ int expect_errno;
+ /* Address and port the server listens at */
+ struct inet_addr listen_at;
+};
+
+/*
+ * Attach two sk_lookup programs together and verify that both ran and
+ * that connect() ended as the scenario @t expects.
+ *
+ * The per-program "done" flags in t->run_map are cleared first. A TCP
+ * server is started at t->listen_at and stored in t->redir_map under
+ * SERVER_A, then a client connects to EXT_IP4:EXT_PORT.
+ *
+ * connect() outcome:
+ *   - t->expect_errno == 0: any failure is an error;
+ *   - t->expect_errno != 0: connect() must fail with exactly that errno;
+ *     an unexpected success is reported too, so that a scenario expecting
+ *     a refusal cannot pass silently.
+ *
+ * Afterwards both flags in t->run_map must be set.
+ */
+static void run_multi_prog_lookup(const struct test_multi_prog *t)
+{
+	struct sockaddr_storage dst = {};
+	int map_fd, server_fd, client_fd;
+	struct bpf_link *link1, *link2;
+	int prog_idx, done, err;
+
+	map_fd = bpf_map__fd(t->run_map);
+
+	/* Reset the "done" flag of both programs */
+	done = 0;
+	prog_idx = PROG1;
+	err = bpf_map_update_elem(map_fd, &prog_idx, &done, BPF_ANY);
+	if (CHECK(err, "bpf_map_update_elem", "failed\n"))
+		return;
+	prog_idx = PROG2;
+	err = bpf_map_update_elem(map_fd, &prog_idx, &done, BPF_ANY);
+	if (CHECK(err, "bpf_map_update_elem", "failed\n"))
+		return;
+
+	link1 = attach_lookup_prog(t->prog1);
+	if (!link1)
+		return;
+	link2 = attach_lookup_prog(t->prog2);
+	if (!link2)
+		goto out_unlink1;
+
+	server_fd = make_server(SOCK_STREAM, t->listen_at.ip,
+				t->listen_at.port, NULL);
+	if (server_fd < 0)
+		goto out_unlink2;
+
+	err = update_lookup_map(t->redir_map, SERVER_A, server_fd);
+	if (err)
+		goto out_close_server;
+
+	client_fd = make_socket(SOCK_STREAM, EXT_IP4, EXT_PORT, &dst);
+	if (client_fd < 0)
+		goto out_close_server;
+
+	err = connect(client_fd, (void *)&dst, inetaddr_len(&dst));
+	if (CHECK(err && !t->expect_errno, "connect",
+		  "unexpected error %d\n", errno))
+		goto out_close_client;
+	if (CHECK(err && t->expect_errno && errno != t->expect_errno,
+		  "connect", "unexpected error %d\n", errno))
+		goto out_close_client;
+	/* A failure was expected but connect() went through */
+	if (CHECK(!err && t->expect_errno, "connect",
+		  "unexpected success, expected errno %d\n", t->expect_errno))
+		goto out_close_client;
+
+	/* Both programs must have run, whatever their verdict was */
+	done = 0;
+	prog_idx = PROG1;
+	err = bpf_map_lookup_elem(map_fd, &prog_idx, &done);
+	CHECK(err, "bpf_map_lookup_elem", "failed\n");
+	CHECK(!done, "bpf_map_lookup_elem", "PROG1 !done\n");
+
+	done = 0;
+	prog_idx = PROG2;
+	err = bpf_map_lookup_elem(map_fd, &prog_idx, &done);
+	CHECK(err, "bpf_map_lookup_elem", "failed\n");
+	CHECK(!done, "bpf_map_lookup_elem", "PROG2 !done\n");
+
+out_close_client:
+	close(client_fd);
+out_close_server:
+	close(server_fd);
+out_unlink2:
+	bpf_link__destroy(link2);
+out_unlink1:
+	bpf_link__destroy(link1);
+}
+
+/*
+ * Table-driven tests with two programs attached at once, covering the
+ * combinations of pass, drop and redirect. Entries that set .expect_errno
+ * expect connect() to fail with that errno; all others expect it to
+ * succeed (see run_multi_prog_lookup()). The redirect scenarios listen at
+ * INT_IP4:INT_PORT while the client always connects to EXT_IP4:EXT_PORT.
+ */
+static void test_multi_prog_lookup(struct test_sk_lookup *skel)
+{
+ struct test_multi_prog tests[] = {
+ {
+ .desc = "multi prog - pass, pass",
+ .prog1 = skel->progs.multi_prog_pass1,
+ .prog2 = skel->progs.multi_prog_pass2,
+ .listen_at = { EXT_IP4, EXT_PORT },
+ },
+ {
+ .desc = "multi prog - drop, drop",
+ .prog1 = skel->progs.multi_prog_drop1,
+ .prog2 = skel->progs.multi_prog_drop2,
+ .listen_at = { EXT_IP4, EXT_PORT },
+ .expect_errno = ECONNREFUSED,
+ },
+ {
+ .desc = "multi prog - pass, drop",
+ .prog1 = skel->progs.multi_prog_pass1,
+ .prog2 = skel->progs.multi_prog_drop2,
+ .listen_at = { EXT_IP4, EXT_PORT },
+ .expect_errno = ECONNREFUSED,
+ },
+ {
+ .desc = "multi prog - drop, pass",
+ .prog1 = skel->progs.multi_prog_drop1,
+ .prog2 = skel->progs.multi_prog_pass2,
+ .listen_at = { EXT_IP4, EXT_PORT },
+ .expect_errno = ECONNREFUSED,
+ },
+ {
+ .desc = "multi prog - pass, redir",
+ .prog1 = skel->progs.multi_prog_pass1,
+ .prog2 = skel->progs.multi_prog_redir2,
+ .listen_at = { INT_IP4, INT_PORT },
+ },
+ {
+ .desc = "multi prog - redir, pass",
+ .prog1 = skel->progs.multi_prog_redir1,
+ .prog2 = skel->progs.multi_prog_pass2,
+ .listen_at = { INT_IP4, INT_PORT },
+ },
+ {
+ .desc = "multi prog - drop, redir",
+ .prog1 = skel->progs.multi_prog_drop1,
+ .prog2 = skel->progs.multi_prog_redir2,
+ .listen_at = { INT_IP4, INT_PORT },
+ },
+ {
+ .desc = "multi prog - redir, drop",
+ .prog1 = skel->progs.multi_prog_redir1,
+ .prog2 = skel->progs.multi_prog_drop2,
+ .listen_at = { INT_IP4, INT_PORT },
+ },
+ {
+ .desc = "multi prog - redir, redir",
+ .prog1 = skel->progs.multi_prog_redir1,
+ .prog2 = skel->progs.multi_prog_redir2,
+ .listen_at = { INT_IP4, INT_PORT },
+ },
+ };
+ struct test_multi_prog *t;
+
+ for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+ /* The maps are the same for every scenario, so fill them in here */
+ t->redir_map = skel->maps.redir_map;
+ t->run_map = skel->maps.run_map;
+ if (test__start_subtest(t->desc))
+ run_multi_prog_lookup(t);
+ }
+}
+
+/*
+ * Run every test group of this file against the loaded skeleton. Only
+ * "query lookup prog" is gated by test__start_subtest() here; the other
+ * groups start their own subtests internally.
+ */
+static void run_tests(struct test_sk_lookup *skel)
+{
+ if (test__start_subtest("query lookup prog"))
+ query_lookup_prog(skel);
+ test_redirect_lookup(skel);
+ test_drop_on_lookup(skel);
+ test_drop_on_reuseport(skel);
+ test_sk_assign_helper(skel);
+ test_multi_prog_lookup(skel);
+}
+
+/*
+ * Move the calling process into a fresh network namespace and prepare its
+ * loopback device: both IPv6 test addresses are added to lo and the
+ * device is brought up, using the ip command through system().
+ *
+ * Returns 0 on success, -1 if unshare() or any setup command fails.
+ */
+static int switch_netns(void)
+{
+	static const char * const setup_cmds[] = {
+		"ip -6 addr add dev lo " EXT_IP6 "/128",
+		"ip -6 addr add dev lo " INT_IP6 "/128",
+		"ip link set dev lo up",
+	};
+	int rc, i;
+
+	rc = unshare(CLONE_NEWNET);
+	if (CHECK(rc, "unshare", "failed\n")) {
+		log_err("unshare(CLONE_NEWNET)");
+		return -1;
+	}
+
+	/* Stop at the first command that does not exit with status zero */
+	for (i = 0; i < ARRAY_SIZE(setup_cmds); i++) {
+		rc = system(setup_cmds[i]);
+		if (CHECK(rc, "system", "failed\n")) {
+			log_err("system(%s)", setup_cmds[i]);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point of the sk_lookup tests: switch to a private network
+ * namespace, open and load the BPF skeleton, run all test groups and
+ * destroy the skeleton again. Returns early if the namespace setup or the
+ * skeleton load fails.
+ */
+void test_sk_lookup(void)
+{
+	struct test_sk_lookup *skel;
+
+	if (switch_netns())
+		return;
+
+	skel = test_sk_lookup__open_and_load();
+	if (!CHECK(!skel, "skel open_and_load", "failed\n")) {
+		run_tests(skel);
+		test_sk_lookup__destroy(skel);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
new file mode 100644
index 000000000..23915be61
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Run the program from ./test_skb_ctx.o through bpf_prog_test_run_xattr()
+ * with a user-supplied __sk_buff context.
+ *
+ * The first part checks that malformed requests are rejected: a zero
+ * ctx_size_in or ctx_size_out, and non-zero values in the len, tc_index,
+ * hash and sk fields. Each offending value is restored right after its
+ * check so the following run starts from a valid context. The last part
+ * performs a valid run and inspects the context written back to skb.
+ */
+void test_skb_ctx(void)
+{
+ /* Input context; also used as the output buffer (see tattr.ctx_out) */
+ struct __sk_buff skb = {
+ .cb[0] = 1,
+ .cb[1] = 2,
+ .cb[2] = 3,
+ .cb[3] = 4,
+ .cb[4] = 5,
+ .priority = 6,
+ .ifindex = 1,
+ .tstamp = 7,
+ .wire_len = 100,
+ .gso_segs = 8,
+ .mark = 9,
+ .gso_size = 10,
+ };
+ struct bpf_prog_test_run_attr tattr = {
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .ctx_in = &skb,
+ .ctx_size_in = sizeof(skb),
+ .ctx_out = &skb,
+ .ctx_size_out = sizeof(skb),
+ };
+ struct bpf_object *obj;
+ int err;
+ int i;
+
+ err = bpf_prog_load("./test_skb_ctx.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &tattr.prog_fd);
+ if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+ return;
+
+ /* ctx_in != NULL, ctx_size_in == 0 */
+
+ tattr.ctx_size_in = 0;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "ctx_size_in", "err %d errno %d\n", err, errno);
+ tattr.ctx_size_in = sizeof(skb);
+
+ /* ctx_out != NULL, ctx_size_out == 0 */
+
+ tattr.ctx_size_out = 0;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "ctx_size_out", "err %d errno %d\n", err, errno);
+ tattr.ctx_size_out = sizeof(skb);
+
+ /* non-zero [len, tc_index] fields should be rejected*/
+
+ skb.len = 1;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "len", "err %d errno %d\n", err, errno);
+ skb.len = 0;
+
+ skb.tc_index = 1;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "tc_index", "err %d errno %d\n", err, errno);
+ skb.tc_index = 0;
+
+ /* non-zero [hash, sk] fields should be rejected */
+
+ skb.hash = 1;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "hash", "err %d errno %d\n", err, errno);
+ skb.hash = 0;
+
+ skb.sk = (struct bpf_sock *)1;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err == 0, "sk", "err %d errno %d\n", err, errno);
+ skb.sk = 0;
+
+ /* Valid request: the run must succeed and the program must return 0 */
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err != 0 || tattr.retval,
+ "run",
+ "err %d errno %d retval %d\n",
+ err, errno, tattr.retval);
+
+ CHECK_ATTR(tattr.ctx_size_out != sizeof(skb),
+ "ctx_size_out",
+ "incorrect output size, want %zu have %u\n",
+ sizeof(skb), tattr.ctx_size_out);
+
+ /*
+ * cb[], priority, tstamp and mark are each expected to come back one
+ * higher than the value passed in, while ifindex must be unchanged.
+ * Presumably the BPF program increments them - it is not visible here.
+ */
+ for (i = 0; i < 5; i++)
+ CHECK_ATTR(skb.cb[i] != i + 2,
+ "ctx_out_cb",
+ "skb->cb[i] == %d, expected %d\n",
+ skb.cb[i], i + 2);
+ CHECK_ATTR(skb.priority != 7,
+ "ctx_out_priority",
+ "skb->priority == %d, expected %d\n",
+ skb.priority, 7);
+ CHECK_ATTR(skb.ifindex != 1,
+ "ctx_out_ifindex",
+ "skb->ifindex == %d, expected %d\n",
+ skb.ifindex, 1);
+ CHECK_ATTR(skb.tstamp != 8,
+ "ctx_out_tstamp",
+ "skb->tstamp == %lld, expected %d\n",
+ skb.tstamp, 8);
+ CHECK_ATTR(skb.mark != 10,
+ "ctx_out_mark",
+ "skb->mark == %u, expected %d\n",
+ skb.mark, 10);
+
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_helpers.c b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
new file mode 100644
index 000000000..f302ad84a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/skb_helpers.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Load ./test_skb_helpers.o as a SCHED_CLS program and run it once over
+ * pkt_v4 with a user-supplied __sk_buff context. The test only requires
+ * the run itself to succeed; neither the program's return value nor the
+ * output context is inspected here.
+ */
+void test_skb_helpers(void)
+{
+ struct __sk_buff skb = {
+ .wire_len = 100,
+ .gso_segs = 8,
+ .gso_size = 10,
+ };
+ /* The same skb serves as input and output context */
+ struct bpf_prog_test_run_attr tattr = {
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .ctx_in = &skb,
+ .ctx_size_in = sizeof(skb),
+ .ctx_out = &skb,
+ .ctx_size_out = sizeof(skb),
+ };
+ struct bpf_object *obj;
+ int err;
+
+ err = bpf_prog_load("./test_skb_helpers.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &tattr.prog_fd);
+ if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+ return;
+ err = bpf_prog_test_run_xattr(&tattr);
+ CHECK_ATTR(err, "len", "err %d errno %d\n", err, errno);
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/skeleton.c b/tools/testing/selftests/bpf/prog_tests/skeleton.c
new file mode 100644
index 000000000..fe87b77af
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/skeleton.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <test_progs.h>
+
+/*
+ * Packed two-member struct, so there is no padding between a and b.
+ * It is declared before test_skeleton.skel.h is included; presumably the
+ * generated skeleton refers to it (see the in5 / handler_out5 members
+ * used in test_skeleton()) - confirm against the skeleton header.
+ */
+struct s {
+ int a;
+ long long b;
+} __attribute__((packed));
+
+#include "test_skeleton.skel.h"
+
+/*
+ * Exercise the generated test_skeleton through its phases:
+ *   open   - global sections are accessible and hold their initial
+ *            values; kconfig must still be NULL at this point;
+ *   load   - values written before load are still in place afterwards;
+ *   attach - after usleep() triggers the program, the outX variables
+ *            must reflect the inX values set just before attaching.
+ * Finally two extern values seen by the program are compared with the
+ * kconfig section of the skeleton.
+ */
+void test_skeleton(void)
+{
+ int duration = 0, err;
+ struct test_skeleton* skel;
+ struct test_skeleton__bss *bss;
+ struct test_skeleton__data *data;
+ struct test_skeleton__rodata *rodata;
+ struct test_skeleton__kconfig *kcfg;
+
+ skel = test_skeleton__open();
+ if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+ return;
+
+ /* kconfig must not be available before the skeleton is loaded */
+ if (CHECK(skel->kconfig, "skel_kconfig", "kconfig is mmaped()!\n"))
+ goto cleanup;
+
+ bss = skel->bss;
+ data = skel->data;
+ rodata = skel->rodata;
+
+ /* validate values are pre-initialized correctly */
+ CHECK(data->in1 != -1, "in1", "got %d != exp %d\n", data->in1, -1);
+ CHECK(data->out1 != -1, "out1", "got %d != exp %d\n", data->out1, -1);
+ CHECK(data->in2 != -1, "in2", "got %lld != exp %lld\n", data->in2, -1LL);
+ CHECK(data->out2 != -1, "out2", "got %lld != exp %lld\n", data->out2, -1LL);
+
+ CHECK(bss->in3 != 0, "in3", "got %d != exp %d\n", bss->in3, 0);
+ CHECK(bss->out3 != 0, "out3", "got %d != exp %d\n", bss->out3, 0);
+ CHECK(bss->in4 != 0, "in4", "got %lld != exp %lld\n", bss->in4, 0LL);
+ CHECK(bss->out4 != 0, "out4", "got %lld != exp %lld\n", bss->out4, 0LL);
+
+ CHECK(rodata->in.in6 != 0, "in6", "got %d != exp %d\n", rodata->in.in6, 0);
+ CHECK(bss->out6 != 0, "out6", "got %d != exp %d\n", bss->out6, 0);
+
+ /* validate we can pre-setup global variables, even in .bss */
+ data->in1 = 10;
+ data->in2 = 11;
+ bss->in3 = 12;
+ bss->in4 = 13;
+ /* rodata is written here, before load; it is not written again later */
+ rodata->in.in6 = 14;
+
+ err = test_skeleton__load(skel);
+ if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err))
+ goto cleanup;
+
+ /* validate pre-setup values are still there */
+ CHECK(data->in1 != 10, "in1", "got %d != exp %d\n", data->in1, 10);
+ CHECK(data->in2 != 11, "in2", "got %lld != exp %lld\n", data->in2, 11LL);
+ CHECK(bss->in3 != 12, "in3", "got %d != exp %d\n", bss->in3, 12);
+ CHECK(bss->in4 != 13, "in4", "got %lld != exp %lld\n", bss->in4, 13LL);
+ CHECK(rodata->in.in6 != 14, "in6", "got %d != exp %d\n", rodata->in.in6, 14);
+
+ /* now set new values and attach to get them into outX variables */
+ data->in1 = 1;
+ data->in2 = 2;
+ bss->in3 = 3;
+ bss->in4 = 4;
+ bss->in5.a = 5;
+ bss->in5.b = 6;
+ /* Read only now: skel->kconfig was required to be NULL right after open */
+ kcfg = skel->kconfig;
+
+ err = test_skeleton__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+
+ /* trigger tracepoint */
+ usleep(1);
+
+ CHECK(data->out1 != 1, "res1", "got %d != exp %d\n", data->out1, 1);
+ CHECK(data->out2 != 2, "res2", "got %lld != exp %d\n", data->out2, 2);
+ CHECK(bss->out3 != 3, "res3", "got %d != exp %d\n", (int)bss->out3, 3);
+ CHECK(bss->out4 != 4, "res4", "got %lld != exp %d\n", bss->out4, 4);
+ CHECK(bss->handler_out5.a != 5, "res5", "got %d != exp %d\n",
+ bss->handler_out5.a, 5);
+ CHECK(bss->handler_out5.b != 6, "res6", "got %lld != exp %d\n",
+ bss->handler_out5.b, 6);
+ /* out6 is expected to carry the rodata value set before load */
+ CHECK(bss->out6 != 14, "res7", "got %d != exp %d\n", bss->out6, 14);
+
+ CHECK(bss->bpf_syscall != kcfg->CONFIG_BPF_SYSCALL, "ext1",
+ "got %d != exp %d\n", bss->bpf_syscall, kcfg->CONFIG_BPF_SYSCALL);
+ CHECK(bss->kern_ver != kcfg->LINUX_KERNEL_VERSION, "ext2",
+ "got %d != exp %d\n", bss->kern_ver, kcfg->LINUX_KERNEL_VERSION);
+
+cleanup:
+ test_skeleton__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c
new file mode 100644
index 000000000..686b40f11
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/snprintf_btf.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <linux/btf.h>
+#include "netif_receive_skb.skel.h"
+
+/* Demonstrate that bpf_snprintf_btf succeeds and that various data types
+ * are formatted correctly.
+ *
+ * The netif_receive_skb skeleton is opened, loaded and attached, then a
+ * single ping to 127.0.0.1 generates a receive event. The results are
+ * read from the skeleton's .bss: the test is skipped when bss->skip is
+ * set, otherwise bss->ret must be positive and every subtest counted in
+ * bss->num_subtests must have run.
+ */
+void test_snprintf_btf(void)
+{
+	struct netif_receive_skb *skel;
+	struct netif_receive_skb__bss *bss;
+	int err, duration = 0;
+
+	skel = netif_receive_skb__open();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+
+	err = netif_receive_skb__load(skel);
+	if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err))
+		goto cleanup;
+
+	bss = skel->bss;
+
+	err = netif_receive_skb__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+		goto cleanup;
+
+	/* generate receive event */
+	err = system("ping -c 1 127.0.0.1 > /dev/null");
+	if (CHECK(err, "system", "ping failed: %d\n", err))
+		goto cleanup;
+
+	if (bss->skip) {
+		printf("%s:SKIP:no __builtin_btf_type_id\n", __func__);
+		test__skip();
+		goto cleanup;
+	}
+
+	/*
+	 * Make sure netif_receive_skb program was triggered
+	 * and it set expected return values from bpf_snprintf_btf()
+	 * and all tests ran.
+	 */
+	if (CHECK(bss->ret <= 0,
+		  "bpf_snprintf_btf: got return value",
+		  "ret <= 0 %ld test %d\n", bss->ret, bss->ran_subtests))
+		goto cleanup;
+
+	if (CHECK(bss->ran_subtests == 0, "check if subtests ran",
+		  "no subtests ran, did BPF program run?\n"))
+		goto cleanup;
+
+	/* Arguments follow the message: first how many ran, then the total */
+	CHECK(bss->num_subtests != bss->ran_subtests,
+	      "check all subtests ran",
+	      "only ran %d of %d tests\n", bss->ran_subtests,
+	      bss->num_subtests);
+
+cleanup:
+	netif_receive_skb__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
new file mode 100644
index 000000000..e8b5bf707
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#define _GNU_SOURCE
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <linux/compiler.h>
+
+#include "network_helpers.h"
+#include "cgroup_helpers.h"
+#include "test_progs.h"
+#include "bpf_rlimit.h"
+#include "test_sock_fields.skel.h"
+
+/* Keys used by check_result() when looking up entries in linum_map. */
+enum bpf_linum_array_idx {
+ EGRESS_LINUM_IDX,
+ INGRESS_LINUM_IDX,
+ READ_SK_DST_PORT_LINUM_IDX,
+ /* Number of indices defined above; not itself a valid key. */
+ __NR_BPF_LINUM_ARRAY_IDX,
+};
+
+/* Value layout exchanged with the sk_pkt_out_cnt and sk_pkt_out_cnt10 maps:
+ * a bpf_spin_lock followed by the packet counter itself.
+ */
+struct bpf_spinlock_cnt {
+ struct bpf_spin_lock lock;
+ __u32 cnt;
+};
+
+/* Cgroups joined by test_sock_fields(); the BPF programs are attached to
+ * the child one.
+ */
+#define PARENT_CGROUP "/test-bpf-sock-fields"
+#define CHILD_CGROUP "/test-bpf-sock-fields/child"
+/* Payload sent from the accepted socket to the client in test(). */
+#define DATA "Hello BPF!"
+/* sizeof() of a string literal, so the trailing NUL is counted as well. */
+#define DATA_LEN sizeof(DATA)
+
+/* State shared between test_sock_fields(), test() and the check helpers.
+ * The addresses are filled in by getsockname() in test(); the map fds,
+ * the skeleton and the cgroup ids are set up in test_sock_fields().
+ */
+static struct sockaddr_in6 srv_sa6, cli_sa6;
+static int sk_pkt_out_cnt10_fd;
+static struct test_sock_fields *skel;
+static int sk_pkt_out_cnt_fd;
+static __u64 parent_cg_id;
+static __u64 child_cg_id;
+static int linum_map_fd;
+/* NOTE(review): never referenced by name here; presumably used by CHECK(). */
+static __u32 duration;
+
+/* Move the test into a fresh network namespace and bring its loopback
+ * device up. Returns true only when both steps succeeded; the second
+ * step is not attempted if the first one failed.
+ */
+static bool create_netns(void)
+{
+	return ASSERT_OK(unshare(CLONE_NEWNET), "create netns") &&
+	       ASSERT_OK(system("ip link set dev lo up"), "bring up lo");
+}
+
+/* Print every field of a bpf_sock snapshot on a single line tagged with
+ * @prefix. Addresses are shown both as raw words and in presentation form.
+ */
+static void print_sk(const struct bpf_sock *sk, const char *prefix)
+{
+	char saddr4[24], daddr4[24];
+	char saddr6[64], daddr6[64];
+
+	/* Presentation-format copies of the four addresses. */
+	inet_ntop(AF_INET, &sk->src_ip4, saddr4, sizeof(saddr4));
+	inet_ntop(AF_INET6, &sk->src_ip6, saddr6, sizeof(saddr6));
+	inet_ntop(AF_INET, &sk->dst_ip4, daddr4, sizeof(daddr4));
+	inet_ntop(AF_INET6, &sk->dst_ip6, daddr6, sizeof(daddr6));
+
+	/* src_port is printed as stored, dst_port goes through ntohs(). */
+	printf("%s: state:%u bound_dev_if:%u family:%u type:%u protocol:%u mark:%u priority:%u "
+	       "src_ip4:%x(%s) src_ip6:%x:%x:%x:%x(%s) src_port:%u "
+	       "dst_ip4:%x(%s) dst_ip6:%x:%x:%x:%x(%s) dst_port:%u\n",
+	       prefix,
+	       sk->state, sk->bound_dev_if, sk->family, sk->type,
+	       sk->protocol, sk->mark, sk->priority,
+	       sk->src_ip4, saddr4,
+	       sk->src_ip6[0], sk->src_ip6[1],
+	       sk->src_ip6[2], sk->src_ip6[3],
+	       saddr6, sk->src_port,
+	       sk->dst_ip4, daddr4,
+	       sk->dst_ip6[0], sk->dst_ip6[1],
+	       sk->dst_ip6[2], sk->dst_ip6[3],
+	       daddr6, ntohs(sk->dst_port));
+}
+
+/* Print the bpf_tcp_sock counters the test inspects on a single line
+ * tagged with @prefix.
+ */
+static void print_tp(const struct bpf_tcp_sock *tp, const char *prefix)
+{
+	/* Each label mirrors the name of the field printed after it. */
+	printf("%s: snd_cwnd:%u srtt_us:%u rtt_min:%u snd_ssthresh:%u rcv_nxt:%u "
+	       "snd_nxt:%u snd_una:%u mss_cache:%u ecn_flags:%u "
+	       "rate_delivered:%u rate_interval_us:%u packets_out:%u "
+	       "retrans_out:%u total_retrans:%u segs_in:%u data_segs_in:%u "
+	       "segs_out:%u data_segs_out:%u lost_out:%u sacked_out:%u "
+	       "bytes_received:%llu bytes_acked:%llu\n",
+	       prefix,
+	       tp->snd_cwnd, tp->srtt_us, tp->rtt_min, tp->snd_ssthresh,
+	       tp->rcv_nxt, tp->snd_nxt, tp->snd_una, tp->mss_cache,
+	       tp->ecn_flags, tp->rate_delivered, tp->rate_interval_us,
+	       tp->packets_out, tp->retrans_out, tp->total_retrans,
+	       tp->segs_in, tp->data_segs_in, tp->segs_out,
+	       tp->data_segs_out, tp->lost_out, tp->sacked_out,
+	       tp->bytes_received, tp->bytes_acked);
+}
+
+/* Validate everything the BPF programs recorded during test().
+ *
+ * Reads the line-number entries from linum_map, copies the socket and
+ * tcp-socket snapshots out of the skeleton's .bss, prints them and then
+ * checks them against the addresses captured in srv_sa6/cli_sa6 and the
+ * traffic test() generated. Failures are reported through CHECK/ASSERT;
+ * nothing is returned.
+ */
+static void check_result(void)
+{
+ struct bpf_tcp_sock srv_tp, cli_tp, listen_tp;
+ struct bpf_sock srv_sk, cli_sk, listen_sk;
+ __u32 idx, ingress_linum, egress_linum, linum;
+ int err;
+
+ idx = EGRESS_LINUM_IDX;
+ err = bpf_map_lookup_elem(linum_map_fd, &idx, &egress_linum);
+ CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)",
+ "err:%d errno:%d\n", err, errno);
+
+ idx = INGRESS_LINUM_IDX;
+ err = bpf_map_lookup_elem(linum_map_fd, &idx, &ingress_linum);
+ CHECK(err == -1, "bpf_map_lookup_elem(linum_map_fd)",
+ "err:%d errno:%d\n", err, errno);
+
+ /* Unlike the two entries above, this one must be zero to pass. */
+ idx = READ_SK_DST_PORT_LINUM_IDX;
+ err = bpf_map_lookup_elem(linum_map_fd, &idx, &linum);
+ ASSERT_OK(err, "bpf_map_lookup_elem(linum_map_fd, READ_SK_DST_PORT_IDX)");
+ ASSERT_EQ(linum, 0, "failure in read_sk_dst_port on line");
+
+ /* Work on local copies of the snapshots kept in the skeleton's .bss. */
+ memcpy(&srv_sk, &skel->bss->srv_sk, sizeof(srv_sk));
+ memcpy(&srv_tp, &skel->bss->srv_tp, sizeof(srv_tp));
+ memcpy(&cli_sk, &skel->bss->cli_sk, sizeof(cli_sk));
+ memcpy(&cli_tp, &skel->bss->cli_tp, sizeof(cli_tp));
+ memcpy(&listen_sk, &skel->bss->listen_sk, sizeof(listen_sk));
+ memcpy(&listen_tp, &skel->bss->listen_tp, sizeof(listen_tp));
+
+ print_sk(&listen_sk, "listen_sk");
+ print_sk(&srv_sk, "srv_sk");
+ print_sk(&cli_sk, "cli_sk");
+ print_tp(&listen_tp, "listen_tp");
+ print_tp(&srv_tp, "srv_tp");
+ print_tp(&cli_tp, "cli_tp");
+
+ /* Listener: state 10 (presumably TCP_LISTEN - TODO confirm), bound to
+ * the IPv6 loopback address and server port, with no peer address/port.
+ */
+ CHECK(listen_sk.state != 10 ||
+ listen_sk.family != AF_INET6 ||
+ listen_sk.protocol != IPPROTO_TCP ||
+ memcmp(listen_sk.src_ip6, &in6addr_loopback,
+ sizeof(listen_sk.src_ip6)) ||
+ listen_sk.dst_ip6[0] || listen_sk.dst_ip6[1] ||
+ listen_sk.dst_ip6[2] || listen_sk.dst_ip6[3] ||
+ listen_sk.src_port != ntohs(srv_sa6.sin6_port) ||
+ listen_sk.dst_port,
+ "listen_sk",
+ "Unexpected. Check listen_sk output. ingress_linum:%u\n",
+ ingress_linum);
+
+ /* Server side: any non-zero state other than 10, loopback on both
+ * ends. src_port is compared after ntohs() of the sockaddr port while
+ * dst_port is compared with the sockaddr port as is.
+ */
+ CHECK(srv_sk.state == 10 ||
+ !srv_sk.state ||
+ srv_sk.family != AF_INET6 ||
+ srv_sk.protocol != IPPROTO_TCP ||
+ memcmp(srv_sk.src_ip6, &in6addr_loopback,
+ sizeof(srv_sk.src_ip6)) ||
+ memcmp(srv_sk.dst_ip6, &in6addr_loopback,
+ sizeof(srv_sk.dst_ip6)) ||
+ srv_sk.src_port != ntohs(srv_sa6.sin6_port) ||
+ srv_sk.dst_port != cli_sa6.sin6_port,
+ "srv_sk", "Unexpected. Check srv_sk output. egress_linum:%u\n",
+ egress_linum);
+
+ CHECK(!skel->bss->lsndtime, "srv_tp", "Unexpected lsndtime:0\n");
+
+ /* Client side: same rules as the server with the two ports swapped. */
+ CHECK(cli_sk.state == 10 ||
+ !cli_sk.state ||
+ cli_sk.family != AF_INET6 ||
+ cli_sk.protocol != IPPROTO_TCP ||
+ memcmp(cli_sk.src_ip6, &in6addr_loopback,
+ sizeof(cli_sk.src_ip6)) ||
+ memcmp(cli_sk.dst_ip6, &in6addr_loopback,
+ sizeof(cli_sk.dst_ip6)) ||
+ cli_sk.src_port != ntohs(cli_sa6.sin6_port) ||
+ cli_sk.dst_port != srv_sa6.sin6_port,
+ "cli_sk", "Unexpected. Check cli_sk output. egress_linum:%u\n",
+ egress_linum);
+
+ /* The listener must not have accounted any data traffic. */
+ CHECK(listen_tp.data_segs_out ||
+ listen_tp.data_segs_in ||
+ listen_tp.total_retrans ||
+ listen_tp.bytes_acked,
+ "listen_tp",
+ "Unexpected. Check listen_tp output. ingress_linum:%u\n",
+ ingress_linum);
+
+ /* test() sends DATA twice from the accepted socket to the client. */
+ CHECK(srv_tp.data_segs_out != 2 ||
+ srv_tp.data_segs_in ||
+ srv_tp.snd_cwnd != 10 ||
+ srv_tp.total_retrans ||
+ srv_tp.bytes_acked < 2 * DATA_LEN,
+ "srv_tp", "Unexpected. Check srv_tp output. egress_linum:%u\n",
+ egress_linum);
+
+ CHECK(cli_tp.data_segs_out ||
+ cli_tp.data_segs_in != 2 ||
+ cli_tp.snd_cwnd != 10 ||
+ cli_tp.total_retrans ||
+ cli_tp.bytes_received < 2 * DATA_LEN,
+ "cli_tp", "Unexpected. Check cli_tp output. egress_linum:%u\n",
+ egress_linum);
+
+ /* Cgroup ids seen by the BPF side must match the ones user space got. */
+ CHECK(skel->bss->parent_cg_id != parent_cg_id,
+ "parent_cg_id", "%zu != %zu\n",
+ (size_t)skel->bss->parent_cg_id, (size_t)parent_cg_id);
+
+ CHECK(skel->bss->child_cg_id != child_cg_id,
+ "child_cg_id", "%zu != %zu\n",
+ (size_t)skel->bss->child_cg_id, (size_t)child_cg_id);
+}
+
+/* Verify the per-socket packet counters kept in the sk_pkt_out_cnt and
+ * sk_pkt_out_cnt10 maps for the accepted and the client socket.
+ * Both counters are expected to start from 0xeB9F.
+ */
+static void check_sk_pkt_out_cnt(int accept_fd, int cli_fd)
+{
+	const __u32 base = 0xeB9F;
+	struct bpf_spinlock_cnt once = {}, tenfold = {};
+	int err;
+
+	/* Poison both counters so that a lookup which fills nothing in
+	 * cannot pass by accident.
+	 */
+	once.cnt = ~0;
+	tenfold.cnt = ~0;
+	err = bpf_map_lookup_elem(sk_pkt_out_cnt_fd, &accept_fd, &once);
+	if (err == 0)
+		err = bpf_map_lookup_elem(sk_pkt_out_cnt10_fd, &accept_fd,
+					  &tenfold);
+
+	/* The bpf prog only counts for fullsock and
+	 * passive connection did not become fullsock until 3WHS
+	 * had been finished, so the bpf prog only counted two data
+	 * packet out.
+	 */
+	CHECK(err || once.cnt < base + 2 || tenfold.cnt < base + 20,
+	      "bpf_map_lookup_elem(sk_pkt_out_cnt, &accept_fd)",
+	      "err:%d errno:%d pkt_out_cnt:%u pkt_out_cnt10:%u\n",
+	      err, errno, once.cnt, tenfold.cnt);
+
+	once.cnt = ~0;
+	tenfold.cnt = ~0;
+	err = bpf_map_lookup_elem(sk_pkt_out_cnt_fd, &cli_fd, &once);
+	if (err == 0)
+		err = bpf_map_lookup_elem(sk_pkt_out_cnt10_fd, &cli_fd,
+					  &tenfold);
+
+	/* Active connection is fullsock from the beginning.
+	 * 1 SYN and 1 ACK during 3WHS
+	 * 2 Acks on data packet.
+	 *
+	 * The bpf_prog initialized it to 0xeB9F.
+	 */
+	CHECK(err || once.cnt < base + 4 || tenfold.cnt < base + 40,
+	      "bpf_map_lookup_elem(sk_pkt_out_cnt, &cli_fd)",
+	      "err:%d errno:%d pkt_out_cnt:%u pkt_out_cnt10:%u\n",
+	      err, errno, once.cnt, tenfold.cnt);
+}
+
+/* Seed both per-socket counter maps for @sk_fd with @pkt_out_cnt.
+ * BPF_NOEXIST is used, so an entry that already exists is an error.
+ * Returns 0 on success or the failing bpf_map_update_elem() result.
+ */
+static int init_sk_storage(int sk_fd, __u32 pkt_out_cnt)
+{
+	struct bpf_spinlock_cnt seed = { .cnt = pkt_out_cnt };
+	int ret;
+
+	ret = bpf_map_update_elem(sk_pkt_out_cnt_fd, &sk_fd, &seed,
+				  BPF_NOEXIST);
+	if (CHECK(ret, "bpf_map_update_elem(sk_pkt_out_cnt_fd)",
+		  "err:%d errno:%d\n", ret, errno))
+		return ret;
+
+	ret = bpf_map_update_elem(sk_pkt_out_cnt10_fd, &sk_fd, &seed,
+				  BPF_NOEXIST);
+	if (CHECK(ret, "bpf_map_update_elem(sk_pkt_out_cnt10_fd)",
+		  "err:%d errno:%d\n", ret, errno))
+		return ret;
+
+	return 0;
+}
+
+/* Drive one TCP connection over IPv6 loopback and verify what the BPF
+ * programs observed.
+ *
+ * Sequence: start a listener, connect a client, accept, seed the
+ * per-socket storage of the accepted socket, send DATA twice from the
+ * accepted socket to the client, exchange FINs in both directions and
+ * finally run the result checks. Every socket opened here is closed
+ * before returning.
+ */
+static void test(void)
+{
+ int listen_fd = -1, cli_fd = -1, accept_fd = -1, err, i;
+ socklen_t addrlen = sizeof(struct sockaddr_in6);
+ char buf[DATA_LEN];
+
+ /* Prepare listen_fd */
+ listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0xcafe, 0);
+ /* start_server() has logged the error details */
+ if (CHECK_FAIL(listen_fd == -1))
+ goto done;
+
+ err = getsockname(listen_fd, (struct sockaddr *)&srv_sa6, &addrlen);
+ if (CHECK(err, "getsockname(listen_fd)", "err:%d errno:%d\n", err,
+ errno))
+ goto done;
+ /* Publish the server address to the BPF side through .bss. */
+ memcpy(&skel->bss->srv_sa6, &srv_sa6, sizeof(srv_sa6));
+
+ cli_fd = connect_to_fd(listen_fd, 0);
+ if (CHECK_FAIL(cli_fd == -1))
+ goto done;
+
+ err = getsockname(cli_fd, (struct sockaddr *)&cli_sa6, &addrlen);
+ if (CHECK(err, "getsockname(cli_fd)", "err:%d errno:%d\n",
+ err, errno))
+ goto done;
+
+ accept_fd = accept(listen_fd, NULL, NULL);
+ if (CHECK(accept_fd == -1, "accept(listen_fd)",
+ "accept_fd:%d errno:%d\n",
+ accept_fd, errno))
+ goto done;
+
+ /* Same start value that check_sk_pkt_out_cnt() compares against. */
+ if (init_sk_storage(accept_fd, 0xeB9F))
+ goto done;
+
+ for (i = 0; i < 2; i++) {
+ /* Send some data from accept_fd to cli_fd.
+ * MSG_EOR to stop kernel from coalescing two pkts.
+ */
+ err = send(accept_fd, DATA, DATA_LEN, MSG_EOR);
+ if (CHECK(err != DATA_LEN, "send(accept_fd)",
+ "err:%d errno:%d\n", err, errno))
+ goto done;
+
+ err = recv(cli_fd, buf, DATA_LEN, 0);
+ if (CHECK(err != DATA_LEN, "recv(cli_fd)", "err:%d errno:%d\n",
+ err, errno))
+ goto done;
+ }
+
+ /* Close both directions; each recv() must then return 0 (EOF). */
+ shutdown(cli_fd, SHUT_WR);
+ err = recv(accept_fd, buf, 1, 0);
+ if (CHECK(err, "recv(accept_fd) for fin", "err:%d errno:%d\n",
+ err, errno))
+ goto done;
+ shutdown(accept_fd, SHUT_WR);
+ err = recv(cli_fd, buf, 1, 0);
+ if (CHECK(err, "recv(cli_fd) for fin", "err:%d errno:%d\n",
+ err, errno))
+ goto done;
+ check_sk_pkt_out_cnt(accept_fd, cli_fd);
+ check_result();
+
+done:
+ if (accept_fd != -1)
+ close(accept_fd);
+ if (cli_fd != -1)
+ close(cli_fd);
+ if (listen_fd != -1)
+ close(listen_fd);
+}
+
+/* Entry point of the sock_fields test.
+ *
+ * Creates a private netns, joins the parent and child cgroups, loads the
+ * skeleton, attaches its three programs to the child cgroup and then runs
+ * test(). All resources acquired here are released on every exit path.
+ */
+void test_sock_fields(void)
+{
+	int parent_cg_fd = -1, child_cg_fd = -1;
+	struct bpf_link *link;
+
+	/* Use a dedicated netns to have a fixed listen port */
+	if (!create_netns())
+		return;
+
+	/* Create a cgroup, get fd, and join it */
+	parent_cg_fd = test__join_cgroup(PARENT_CGROUP);
+	if (CHECK_FAIL(parent_cg_fd < 0))
+		return;
+	parent_cg_id = get_cgroup_id(PARENT_CGROUP);
+	if (CHECK_FAIL(!parent_cg_id))
+		goto done;
+
+	child_cg_fd = test__join_cgroup(CHILD_CGROUP);
+	if (CHECK_FAIL(child_cg_fd < 0))
+		goto done;
+	child_cg_id = get_cgroup_id(CHILD_CGROUP);
+	if (CHECK_FAIL(!child_cg_id))
+		goto done;
+
+	skel = test_sock_fields__open_and_load();
+	if (CHECK(!skel, "test_sock_fields__open_and_load", "failed\n"))
+		goto done;
+
+	/* Store each link in the skeleton so it is released together with it. */
+	link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields, child_cg_fd);
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(egress_read_sock_fields)"))
+		goto done;
+	skel->links.egress_read_sock_fields = link;
+
+	link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields, child_cg_fd);
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(ingress_read_sock_fields)"))
+		goto done;
+	skel->links.ingress_read_sock_fields = link;
+
+	link = bpf_program__attach_cgroup(skel->progs.read_sk_dst_port, child_cg_fd);
+	if (!ASSERT_OK_PTR(link, "attach_cgroup(read_sk_dst_port)"))
+		goto done;
+	skel->links.read_sk_dst_port = link;
+
+	linum_map_fd = bpf_map__fd(skel->maps.linum_map);
+	sk_pkt_out_cnt_fd = bpf_map__fd(skel->maps.sk_pkt_out_cnt);
+	sk_pkt_out_cnt10_fd = bpf_map__fd(skel->maps.sk_pkt_out_cnt10);
+
+	test();
+
+done:
+	/* skel is still NULL when we bail out before open_and_load succeeded.
+	 * Do not hand a NULL skeleton to the generated helpers
+	 * (NOTE(review): their NULL handling is not visible in this file).
+	 */
+	if (skel) {
+		test_sock_fields__detach(skel);
+		test_sock_fields__destroy(skel);
+		/* The pointer is file-scope; do not leave it dangling. */
+		skel = NULL;
+	}
+	/* Failure of test__join_cgroup() is detected as "< 0" above. */
+	if (child_cg_fd >= 0)
+		close(child_cg_fd);
+	if (parent_cg_fd >= 0)
+		close(parent_cg_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
new file mode 100644
index 000000000..b8b48cac2
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -0,0 +1,304 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+#include <error.h>
+#include <netinet/tcp.h>
+
+#include "test_progs.h"
+#include "test_skmsg_load_helpers.skel.h"
+#include "test_sockmap_update.skel.h"
+#include "test_sockmap_invalid_update.skel.h"
+#include "bpf_iter_sockmap.skel.h"
+
+/* Socket option and option values handed to setsockopt() by
+ * connected_socket_v4() to enter and leave TCP repair mode.
+ */
+#define TCP_REPAIR 19 /* TCP sock is under repair right now */
+
+#define TCP_REPAIR_ON 1
+#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */
+
+/* Create an IPv4 TCP socket and connect() it to 127.0.0.1:80 while
+ * TCP_REPAIR is switched on; repair mode is switched off again (without
+ * window probes) before the socket is returned.
+ *
+ * Returns the socket fd, or -1 after reporting the failure.
+ */
+static int connected_socket_v4(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(80),
+		.sin_addr = { inet_addr("127.0.0.1") },
+	};
+	socklen_t len = sizeof(addr);
+	int s, repair, err;
+
+	s = socket(AF_INET, SOCK_STREAM, 0);
+	if (CHECK_FAIL(s == -1))
+		goto error;
+
+	repair = TCP_REPAIR_ON;
+	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
+	if (CHECK_FAIL(err))
+		goto error;
+
+	err = connect(s, (struct sockaddr *)&addr, len);
+	if (CHECK_FAIL(err))
+		goto error;
+
+	repair = TCP_REPAIR_OFF_NO_WP;
+	err = setsockopt(s, SOL_TCP, TCP_REPAIR, &repair, sizeof(repair));
+	if (CHECK_FAIL(err))
+		goto error;
+
+	return s;
+error:
+	perror(__func__);
+	/* socket() itself may have failed; there is nothing to close then. */
+	if (s != -1)
+		close(s);
+	return -1;
+}
+
+/* Walk every index below the max_entries of @src and check that @dst
+ * agrees with it: an index missing from @src must also be missing from
+ * @dst, and an index present in @src must hold the same 64-bit value in
+ * @dst.
+ */
+static void compare_cookies(struct bpf_map *src, struct bpf_map *dst)
+{
+	__u32 idx, nr_slots = bpf_map__max_entries(src);
+	int rc, duration = 0;
+	int from_fd = bpf_map__fd(src);
+	int to_fd = bpf_map__fd(dst);
+
+	for (idx = 0; idx < nr_slots; idx++) {
+		__u64 want, have;
+
+		rc = bpf_map_lookup_elem(from_fd, &idx, &want);
+		if (rc && errno == ENOENT) {
+			/* Empty in the source: must be empty in the copy. */
+			rc = bpf_map_lookup_elem(to_fd, &idx, &have);
+			CHECK(!rc, "map_lookup_elem(dst)", "element %u not deleted\n", idx);
+			CHECK(rc && errno != ENOENT, "map_lookup_elem(dst)", "%s\n",
+			      strerror(errno));
+			continue;
+		}
+		if (CHECK(rc, "lookup_elem(src)", "%s\n", strerror(errno)))
+			continue;
+
+		rc = bpf_map_lookup_elem(to_fd, &idx, &have);
+		if (CHECK(rc, "lookup_elem(dst)", "%s\n", strerror(errno)))
+			continue;
+
+		CHECK(have != want, "cookie mismatch",
+		      "%llu != %llu (pos %u)\n", have, want, idx);
+	}
+}
+
+/* Create a map, populate it with one socket, and free the map.
+ *
+ * @map_type: map type passed to bpf_create_map().
+ */
+static void test_sockmap_create_update_free(enum bpf_map_type map_type)
+{
+	const int zero = 0;
+	int s, map, err;
+
+	s = connected_socket_v4();
+	if (CHECK_FAIL(s == -1))
+		return;
+
+	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
+	if (CHECK_FAIL(map == -1)) {
+		perror("bpf_create_map");
+		/* No map fd was obtained, so only the socket needs closing. */
+		goto close_sock;
+	}
+
+	err = bpf_map_update_elem(map, &zero, &s, BPF_NOEXIST);
+	if (CHECK_FAIL(err))
+		perror("bpf_map_update");
+
+	close(map);
+close_sock:
+	close(s);
+}
+
+/* Load the test_skmsg_load_helpers skeleton, then attach its msg verdict
+ * program to the skeleton's sock_map and detach it again.
+ *
+ * NOTE(review): @map_type is not used in the body; both callers end up
+ * exercising skel->maps.sock_map. Confirm whether that is intended.
+ */
+static void test_skmsg_helpers(enum bpf_map_type map_type)
+{
+	struct test_skmsg_load_helpers *skel;
+	int prog_fd, map_fd, err;
+
+	skel = test_skmsg_load_helpers__open_and_load();
+	if (CHECK_FAIL(!skel)) {
+		perror("test_skmsg_load_helpers__open_and_load");
+		return;
+	}
+
+	prog_fd = bpf_program__fd(skel->progs.prog_msg_verdict);
+	map_fd = bpf_map__fd(skel->maps.sock_map);
+
+	err = bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_prog_attach");
+	} else {
+		/* Only detach what was actually attached. */
+		err = bpf_prog_detach2(prog_fd, map_fd, BPF_SK_MSG_VERDICT);
+		if (CHECK_FAIL(err))
+			perror("bpf_prog_detach2");
+	}
+
+	test_skmsg_load_helpers__destroy(skel);
+}
+
+/* Put one connected socket into the skeleton's src map, run the
+ * copy_sock_map program once through bpf_prog_test_run_xattr() and check
+ * that the destination map selected by @map_type matches the source.
+ */
+static void test_sockmap_update(enum bpf_map_type map_type)
+{
+	struct bpf_prog_test_run_attr tattr;
+	struct test_sockmap_update *skel;
+	int err, prog_fd, src_fd, duration = 0;
+	struct bpf_map *target;
+	char dummy[14] = {0};
+	const __u32 zero = 0;
+	__s64 sk;
+
+	sk = connected_socket_v4();
+	if (CHECK(sk == -1, "connected_socket_v4", "cannot connect\n"))
+		return;
+
+	skel = test_sockmap_update__open_and_load();
+	if (CHECK(!skel, "open_and_load", "cannot load skeleton\n"))
+		goto close_sk;
+
+	prog_fd = bpf_program__fd(skel->progs.copy_sock_map);
+	src_fd = bpf_map__fd(skel->maps.src);
+	target = map_type == BPF_MAP_TYPE_SOCKMAP ? skel->maps.dst_sock_map :
+						    skel->maps.dst_sock_hash;
+
+	err = bpf_map_update_elem(src_fd, &zero, &sk, BPF_NOEXIST);
+	if (CHECK(err, "update_elem(src)", "errno=%u\n", errno))
+		goto out;
+
+	/* Fields not named here are zero-initialised by the literal. */
+	tattr = (struct bpf_prog_test_run_attr){
+		.prog_fd = prog_fd,
+		.repeat = 1,
+		.data_in = dummy,
+		.data_size_in = sizeof(dummy),
+	};
+
+	err = bpf_prog_test_run_xattr(&tattr);
+	if (CHECK_ATTR(err || !tattr.retval, "bpf_prog_test_run",
+		       "errno=%u retval=%u\n", errno, tattr.retval))
+		goto out;
+
+	compare_cookies(skel->maps.src, target);
+
+out:
+	test_sockmap_update__destroy(skel);
+close_sk:
+	close(sk);
+}
+
+/* The test_sockmap_invalid_update skeleton is expected to be rejected at
+ * load time; a successful load is reported as a failure and the skeleton
+ * is torn down again.
+ */
+static void test_sockmap_invalid_update(void)
+{
+	struct test_sockmap_invalid_update *skel;
+	int duration = 0;
+
+	skel = test_sockmap_invalid_update__open_and_load();
+	if (!CHECK(skel, "open_and_load", "verifier accepted map_update\n"))
+		return;
+
+	test_sockmap_invalid_update__destroy(skel);
+}
+
+/* Fill the source map of the bpf_iter_sockmap skeleton with connected
+ * sockets, run the "copy" iterator program over it and verify both the
+ * counters the program kept in .bss and the contents of the dst map.
+ *
+ * @map_type: BPF_MAP_TYPE_SOCKMAP selects skel->maps.sockmap, anything
+ * else selects skel->maps.sockhash.
+ */
+static void test_sockmap_copy(enum bpf_map_type map_type)
+{
+ DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
+ int err, len, src_fd, iter_fd, duration = 0;
+ union bpf_iter_link_info linfo = {};
+ __u32 i, num_sockets, num_elems;
+ struct bpf_iter_sockmap *skel;
+ __s64 *sock_fd = NULL;
+ struct bpf_link *link;
+ struct bpf_map *src;
+ char buf[64];
+
+ skel = bpf_iter_sockmap__open_and_load();
+ if (CHECK(!skel, "bpf_iter_sockmap__open_and_load", "skeleton open_and_load failed\n"))
+ return;
+
+ /* In both cases one fewer socket than max_entries is inserted; only
+ * the expected element count differs between the two map types.
+ */
+ if (map_type == BPF_MAP_TYPE_SOCKMAP) {
+ src = skel->maps.sockmap;
+ num_elems = bpf_map__max_entries(src);
+ num_sockets = num_elems - 1;
+ } else {
+ src = skel->maps.sockhash;
+ num_elems = bpf_map__max_entries(src) - 1;
+ num_sockets = num_elems;
+ }
+
+ sock_fd = calloc(num_sockets, sizeof(*sock_fd));
+ if (CHECK(!sock_fd, "calloc(sock_fd)", "failed to allocate\n"))
+ goto out;
+
+ /* Mark every slot as "not opened" so cleanup can tell them apart. */
+ for (i = 0; i < num_sockets; i++)
+ sock_fd[i] = -1;
+
+ src_fd = bpf_map__fd(src);
+
+ for (i = 0; i < num_sockets; i++) {
+ sock_fd[i] = connected_socket_v4();
+ if (CHECK(sock_fd[i] == -1, "connected_socket_v4", "cannot connect\n"))
+ goto out;
+
+ err = bpf_map_update_elem(src_fd, &i, &sock_fd[i], BPF_NOEXIST);
+ if (CHECK(err, "map_update", "failed: %s\n", strerror(errno)))
+ goto out;
+ }
+
+ /* Point the iterator at the source map. */
+ linfo.map.map_fd = src_fd;
+ opts.link_info = &linfo;
+ opts.link_info_len = sizeof(linfo);
+ link = bpf_program__attach_iter(skel->progs.copy, &opts);
+ if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n"))
+ goto out;
+
+ iter_fd = bpf_iter_create(bpf_link__fd(link));
+ if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n"))
+ goto free_link;
+
+ /* do some tests */
+ /* Drain the iterator; the data read is discarded. */
+ while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
+ ;
+ if (CHECK(len < 0, "read", "failed: %s\n", strerror(errno)))
+ goto close_iter;
+
+ /* test results */
+ if (CHECK(skel->bss->elems != num_elems, "elems", "got %u expected %u\n",
+ skel->bss->elems, num_elems))
+ goto close_iter;
+
+ if (CHECK(skel->bss->socks != num_sockets, "socks", "got %u expected %u\n",
+ skel->bss->socks, num_sockets))
+ goto close_iter;
+
+ compare_cookies(src, skel->maps.dst);
+
+close_iter:
+ close(iter_fd);
+free_link:
+ bpf_link__destroy(link);
+out:
+ /* sock_fd is NULL when calloc() failed; skip the loop in that case. */
+ for (i = 0; sock_fd && i < num_sockets; i++)
+ if (sock_fd[i] >= 0)
+ close(sock_fd[i]);
+ if (sock_fd)
+ free(sock_fd);
+ bpf_iter_sockmap__destroy(skel);
+}
+
+/* Entry point: registers each subtest by name and runs the ones that
+ * test__start_subtest() accepts, once per map type where the helper takes
+ * a map type argument.
+ */
+void test_sockmap_basic(void)
+{
+ if (test__start_subtest("sockmap create_update_free"))
+ test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKMAP);
+ if (test__start_subtest("sockhash create_update_free"))
+ test_sockmap_create_update_free(BPF_MAP_TYPE_SOCKHASH);
+ if (test__start_subtest("sockmap sk_msg load helpers"))
+ test_skmsg_helpers(BPF_MAP_TYPE_SOCKMAP);
+ if (test__start_subtest("sockhash sk_msg load helpers"))
+ test_skmsg_helpers(BPF_MAP_TYPE_SOCKHASH);
+ if (test__start_subtest("sockmap update"))
+ test_sockmap_update(BPF_MAP_TYPE_SOCKMAP);
+ if (test__start_subtest("sockhash update"))
+ test_sockmap_update(BPF_MAP_TYPE_SOCKHASH);
+ /* This one has no per-map-type variant. */
+ if (test__start_subtest("sockmap update in unsafe context"))
+ test_sockmap_invalid_update();
+ if (test__start_subtest("sockmap copy"))
+ test_sockmap_copy(BPF_MAP_TYPE_SOCKMAP);
+ if (test__start_subtest("sockhash copy"))
+ test_sockmap_copy(BPF_MAP_TYPE_SOCKHASH);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
new file mode 100644
index 000000000..06b86addc
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+/*
+ * Tests for sockmap/sockhash holding kTLS sockets.
+ */
+
+#include "test_progs.h"
+
+/* Size of the buffer run_tests() formats the subtest name into. */
+#define MAX_TEST_NAME 80
+/* Option name passed to setsockopt(IPPROTO_TCP) to attach the "tls" ULP. */
+#define TCP_ULP 31
+
+/* Create a TCP socket of the given address family and put it into the
+ * listening state. No bind() is done here; the caller obtains the local
+ * address with getsockname().
+ *
+ * Returns the listening socket, or -1 after reporting the failure.
+ */
+static int tcp_server(int family)
+{
+	int err, s;
+
+	s = socket(family, SOCK_STREAM, 0);
+	if (CHECK_FAIL(s == -1)) {
+		perror("socket");
+		return -1;
+	}
+
+	err = listen(s, SOMAXCONN);
+	if (CHECK_FAIL(err)) {
+		perror("listen");
+		/* Do not leak the socket when it cannot be used. */
+		close(s);
+		return -1;
+	}
+
+	return s;
+}
+
+/* Dissolve the association of @fd by connecting it to an AF_UNSPEC
+ * address. Returns whatever connect() returns.
+ */
+static int disconnect(int fd)
+{
+	struct sockaddr nowhere = { .sa_family = AF_UNSPEC };
+	int rc;
+
+	rc = connect(fd, &nowhere, sizeof(nowhere));
+	return rc;
+}
+
+/* Disconnect (unhash) a kTLS socket after removing it from sockmap.
+ *
+ * @family: address family for the server and client sockets.
+ * @map:    fd of the sockmap/sockhash the client socket is inserted into.
+ *
+ * Steps: connect a client to a local listener, insert the client into
+ * @map, attach the "tls" ULP, delete the map entry and finally disconnect
+ * the client. Each failing step is reported with perror().
+ */
+static void test_sockmap_ktls_disconnect_after_delete(int family, int map)
+{
+	struct sockaddr_storage addr = {0};
+	socklen_t len = sizeof(addr);
+	int err, cli, srv, zero = 0;
+
+	srv = tcp_server(family);
+	if (srv == -1)
+		return;
+
+	/* Learn which address the listener ended up on. */
+	err = getsockname(srv, (struct sockaddr *)&addr, &len);
+	if (CHECK_FAIL(err)) {
+		perror("getsockname");
+		goto close_srv;
+	}
+
+	cli = socket(family, SOCK_STREAM, 0);
+	if (CHECK_FAIL(cli == -1)) {
+		perror("socket");
+		goto close_srv;
+	}
+
+	err = connect(cli, (struct sockaddr *)&addr, len);
+	if (CHECK_FAIL(err)) {
+		perror("connect");
+		goto close_cli;
+	}
+
+	err = bpf_map_update_elem(map, &zero, &cli, 0);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_map_update_elem");
+		goto close_cli;
+	}
+
+	err = setsockopt(cli, IPPROTO_TCP, TCP_ULP, "tls", strlen("tls"));
+	if (CHECK_FAIL(err)) {
+		perror("setsockopt(TCP_ULP)");
+		goto close_cli;
+	}
+
+	err = bpf_map_delete_elem(map, &zero);
+	if (CHECK_FAIL(err)) {
+		perror("bpf_map_delete_elem");
+		goto close_cli;
+	}
+
+	err = disconnect(cli);
+	if (CHECK_FAIL(err))
+		perror("disconnect");
+
+close_cli:
+	close(cli);
+close_srv:
+	close(srv);
+}
+
+/* Create a one-entry map of @map_type and, if the subtest named after
+ * @family and @map_type is selected, run the disconnect-after-delete
+ * scenario against it. The map is closed on every path that created it.
+ */
+static void run_tests(int family, enum bpf_map_type map_type)
+{
+	char test_name[MAX_TEST_NAME];
+	int map;
+
+	map = bpf_create_map(map_type, sizeof(int), sizeof(int), 1, 0);
+	if (CHECK_FAIL(map == -1)) {
+		perror("bpf_map_create");
+		return;
+	}
+
+	snprintf(test_name, MAX_TEST_NAME,
+		 "sockmap_ktls disconnect_after_delete %s %s",
+		 family == AF_INET ? "IPv4" : "IPv6",
+		 map_type == BPF_MAP_TYPE_SOCKMAP ? "SOCKMAP" : "SOCKHASH");
+
+	/* A subtest that is filtered out must still release the map fd. */
+	if (test__start_subtest(test_name))
+		test_sockmap_ktls_disconnect_after_delete(family, map);
+
+	close(map);
+}
+
+/* Entry point: run the kTLS scenarios for IPv4 then IPv6, each against a
+ * SOCKMAP first and a SOCKHASH second.
+ */
+void test_sockmap_ktls(void)
+{
+	static const int families[] = { AF_INET, AF_INET6 };
+	static const enum bpf_map_type types[] = {
+		BPF_MAP_TYPE_SOCKMAP,
+		BPF_MAP_TYPE_SOCKHASH,
+	};
+	unsigned int f, t;
+
+	for (f = 0; f < sizeof(families) / sizeof(families[0]); f++)
+		for (t = 0; t < sizeof(types) / sizeof(types[0]); t++)
+			run_tests(families[f], types[t]);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
new file mode 100644
index 000000000..d7d65a700
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_listen.c
@@ -0,0 +1,1635 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+/*
+ * Test suite for SOCKMAP/SOCKHASH holding listening sockets.
+ * Covers:
+ * 1. BPF map operations - bpf_map_{update,lookup,delete}_elem
+ * 2. BPF redirect helpers - bpf_{sk,msg}_redirect_map
+ * 3. BPF reuseport helper - bpf_sk_select_reuseport
+ */
+
+#include <linux/compiler.h>
+#include <errno.h>
+#include <error.h>
+#include <limits.h>
+#include <netinet/in.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/select.h>
+#include <unistd.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+#include "test_progs.h"
+#include "test_sockmap_listen.skel.h"
+
+/* Timeout, in seconds, used by xaccept_nonblock() and xrecv_nonblock(). */
+#define IO_TIMEOUT_SEC 30
+/* Size of the message buffer FAIL_LIBBPF() hands to libbpf_strerror(). */
+#define MAX_STRERR_LEN 256
+#define MAX_TEST_NAME 80
+
+/* Report a failure at the calling function/line through error_at_line()
+ * (called with status 0) and mark the test as failed via CHECK_FAIL(true).
+ * @errnum is forwarded to error_at_line() as the error number.
+ */
+#define _FAIL(errnum, fmt...) \
+ ({ \
+ error_at_line(0, (errnum), __func__, __LINE__, fmt); \
+ CHECK_FAIL(true); \
+ })
+/* Failure without an error number. */
+#define FAIL(fmt...) _FAIL(0, fmt)
+/* Failure that also reports the current errno. */
+#define FAIL_ERRNO(fmt...) _FAIL(errno, fmt)
+/* Failure whose text is "<msg>: <libbpf_strerror(err)>". */
+#define FAIL_LIBBPF(err, msg) \
+ ({ \
+ char __buf[MAX_STRERR_LEN]; \
+ libbpf_strerror((err), __buf, sizeof(__buf)); \
+ FAIL("%s: %s", (msg), __buf); \
+ })
+
+/* Wrappers that fail the test on error and report it. */
+/* Each wrapper evaluates to the wrapped call's return value, so callers
+ * can still branch on it; a return of -1 additionally triggers
+ * FAIL_ERRNO() with the name of the call.
+ */
+
+/* accept() preceded by a wait of up to IO_TIMEOUT_SEC seconds. */
+#define xaccept_nonblock(fd, addr, len) \
+ ({ \
+ int __ret = \
+ accept_timeout((fd), (addr), (len), IO_TIMEOUT_SEC); \
+ if (__ret == -1) \
+ FAIL_ERRNO("accept"); \
+ __ret; \
+ })
+
+#define xbind(fd, addr, len) \
+ ({ \
+ int __ret = bind((fd), (addr), (len)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("bind"); \
+ __ret; \
+ })
+
+#define xclose(fd) \
+ ({ \
+ int __ret = close((fd)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("close"); \
+ __ret; \
+ })
+
+#define xconnect(fd, addr, len) \
+ ({ \
+ int __ret = connect((fd), (addr), (len)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("connect"); \
+ __ret; \
+ })
+
+#define xgetsockname(fd, addr, len) \
+ ({ \
+ int __ret = getsockname((fd), (addr), (len)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("getsockname"); \
+ __ret; \
+ })
+
+/* The option name is stringified into the failure message. */
+#define xgetsockopt(fd, level, name, val, len) \
+ ({ \
+ int __ret = getsockopt((fd), (level), (name), (val), (len)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("getsockopt(" #name ")"); \
+ __ret; \
+ })
+
+#define xlisten(fd, backlog) \
+ ({ \
+ int __ret = listen((fd), (backlog)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("listen"); \
+ __ret; \
+ })
+
+/* The option name is stringified into the failure message. */
+#define xsetsockopt(fd, level, name, val, len) \
+ ({ \
+ int __ret = setsockopt((fd), (level), (name), (val), (len)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("setsockopt(" #name ")"); \
+ __ret; \
+ })
+
+#define xsend(fd, buf, len, flags) \
+ ({ \
+ ssize_t __ret = send((fd), (buf), (len), (flags)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("send"); \
+ __ret; \
+ })
+
+/* recv() preceded by a wait of up to IO_TIMEOUT_SEC seconds. */
+#define xrecv_nonblock(fd, buf, len, flags) \
+ ({ \
+ ssize_t __ret = recv_timeout((fd), (buf), (len), (flags), \
+ IO_TIMEOUT_SEC); \
+ if (__ret == -1) \
+ FAIL_ERRNO("recv"); \
+ __ret; \
+ })
+
+/* socket() wrapper: evaluates to the new fd and reports a failure through
+ * FAIL_ERRNO() when socket() returns -1. Arguments are parenthesised so
+ * that expressions can be passed safely, as in the other x-wrappers.
+ */
+#define xsocket(family, sotype, flags)                                        \
+	({                                                                     \
+		int __ret = socket((family), (sotype), (flags));               \
+		if (__ret == -1)                                               \
+			FAIL_ERRNO("socket");                                  \
+		__ret;                                                         \
+	})
+
+/* BPF syscall wrappers: evaluate to the call's return value and report a
+ * failure through FAIL_ERRNO() when that value is -1.
+ */
+#define xbpf_map_delete_elem(fd, key) \
+ ({ \
+ int __ret = bpf_map_delete_elem((fd), (key)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("map_delete"); \
+ __ret; \
+ })
+
+#define xbpf_map_lookup_elem(fd, key, val) \
+ ({ \
+ int __ret = bpf_map_lookup_elem((fd), (key), (val)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("map_lookup"); \
+ __ret; \
+ })
+
+#define xbpf_map_update_elem(fd, key, val, flags) \
+ ({ \
+ int __ret = bpf_map_update_elem((fd), (key), (val), (flags)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("map_update"); \
+ __ret; \
+ })
+
+/* The attach type is stringified into the failure message. */
+#define xbpf_prog_attach(prog, target, type, flags) \
+ ({ \
+ int __ret = \
+ bpf_prog_attach((prog), (target), (type), (flags)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("prog_attach(" #type ")"); \
+ __ret; \
+ })
+
+/* The attach type is stringified into the failure message. */
+#define xbpf_prog_detach2(prog, target, type) \
+ ({ \
+ int __ret = bpf_prog_detach2((prog), (target), (type)); \
+ if (__ret == -1) \
+ FAIL_ERRNO("prog_detach2(" #type ")"); \
+ __ret; \
+ })
+
+/* pthread wrappers: the return value is copied into errno first so that
+ * FAIL_ERRNO() reports it; any non-zero return counts as a failure.
+ */
+#define xpthread_create(thread, attr, func, arg) \
+ ({ \
+ int __ret = pthread_create((thread), (attr), (func), (arg)); \
+ errno = __ret; \
+ if (__ret) \
+ FAIL_ERRNO("pthread_create"); \
+ __ret; \
+ })
+
+#define xpthread_join(thread, retval) \
+ ({ \
+ int __ret = pthread_join((thread), (retval)); \
+ errno = __ret; \
+ if (__ret) \
+ FAIL_ERRNO("pthread_join"); \
+ __ret; \
+ })
+
+/* Wait with select() until @fd is reported readable or @timeout_sec
+ * seconds have passed.
+ *
+ * Returns 0 when select() reports exactly one ready descriptor and -1
+ * otherwise; on timeout errno is set to ETIME.
+ */
+static int poll_read(int fd, unsigned int timeout_sec)
+{
+	struct timeval tv = { .tv_sec = timeout_sec };
+	fd_set readable;
+	int nready;
+
+	FD_ZERO(&readable);
+	FD_SET(fd, &readable);
+
+	nready = select(fd + 1, &readable, NULL, NULL, &tv);
+	if (nready == 1)
+		return 0;
+
+	/* 0 means the timeout expired; other values keep select()'s errno. */
+	if (nready == 0)
+		errno = ETIME;
+	return -1;
+}
+
+/* accept() that first waits up to @timeout_sec seconds for @fd to become
+ * readable. Returns -1 if the wait fails, else the result of accept().
+ */
+static int accept_timeout(int fd, struct sockaddr *addr, socklen_t *len,
+			  unsigned int timeout_sec)
+{
+	int not_ready = poll_read(fd, timeout_sec);
+
+	return not_ready ? -1 : accept(fd, addr, len);
+}
+
+/* recv() that first waits up to @timeout_sec seconds for @fd to become
+ * readable. Returns -1 if the wait fails, else the result of recv().
+ */
+static int recv_timeout(int fd, void *buf, size_t len, int flags,
+			unsigned int timeout_sec)
+{
+	int not_ready = poll_read(fd, timeout_sec);
+
+	return not_ready ? -1 : recv(fd, buf, len, flags);
+}
+
+/* Zero @ss and fill it in as the IPv4 loopback address with port 0;
+ * *@len is set to the size of a struct sockaddr_in.
+ */
+static void init_addr_loopback4(struct sockaddr_storage *ss, socklen_t *len)
+{
+	struct sockaddr_in *sin;
+
+	memset(ss, 0, sizeof(*ss));
+	sin = (struct sockaddr_in *)ss;
+
+	sin->sin_family = AF_INET;
+	sin->sin_port = 0;
+	sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	*len = sizeof(*sin);
+}
+
+/* Zero @ss and fill it in as the IPv6 loopback address with port 0;
+ * *@len is set to the size of a struct sockaddr_in6.
+ */
+static void init_addr_loopback6(struct sockaddr_storage *ss, socklen_t *len)
+{
+	struct sockaddr_in6 *sin6;
+
+	memset(ss, 0, sizeof(*ss));
+	sin6 = (struct sockaddr_in6 *)ss;
+
+	sin6->sin6_family = AF_INET6;
+	sin6->sin6_port = 0;
+	sin6->sin6_addr = in6addr_loopback;
+	*len = sizeof(*sin6);
+}
+
+/* Fill @ss and *@len with the loopback address of @family.
+ * Families other than AF_INET and AF_INET6 are reported as a test
+ * failure and leave the outputs untouched.
+ */
+static void init_addr_loopback(int family, struct sockaddr_storage *ss,
+			       socklen_t *len)
+{
+	if (family == AF_INET)
+		init_addr_loopback4(ss, len);
+	else if (family == AF_INET6)
+		init_addr_loopback6(ss, len);
+	else
+		FAIL("unsupported address family %d", family);
+}
+
+/* View a sockaddr_storage as the generic struct sockaddr that the socket
+ * calls take.
+ */
+static inline struct sockaddr *sockaddr(struct sockaddr_storage *ss)
+{
+	struct sockaddr *generic = (struct sockaddr *)ss;
+
+	return generic;
+}
+
+/* Turn on SO_REUSEPORT for socket @s and attach the reuseport BPF program
+ * @progfd to it. Returns 0 on success and -1 if either option could not
+ * be set; xsetsockopt() has reported the failure by then.
+ */
+static int enable_reuseport(int s, int progfd)
+{
+	int on = 1;
+
+	if (xsetsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof(on)))
+		return -1;
+
+	if (xsetsockopt(s, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &progfd,
+			sizeof(progfd)))
+		return -1;
+
+	return 0;
+}
+
+/* Create a socket bound to the loopback address of @family on an
+ * ephemeral port.
+ *
+ * @family: AF_INET or AF_INET6.
+ * @sotype: socket type; when the SOCK_DGRAM bit is set the socket is
+ *          returned right after bind(), otherwise it is also put into the
+ *          listening state.
+ * @progfd: reuseport BPF program to attach, or a negative value for none.
+ *
+ * Returns the socket fd, or -1 after the failing step has been reported.
+ */
+static int socket_loopback_reuseport(int family, int sotype, int progfd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len;
+	int err, s;
+
+	init_addr_loopback(family, &addr, &len);
+
+	s = xsocket(family, sotype, 0);
+	if (s == -1)
+		return -1;
+
+	if (progfd >= 0) {
+		/* Do not hand back a socket without the requested program. */
+		err = enable_reuseport(s, progfd);
+		if (err)
+			goto close;
+	}
+
+	err = xbind(s, sockaddr(&addr), len);
+	if (err)
+		goto close;
+
+	if (sotype & SOCK_DGRAM)
+		return s;
+
+	err = xlisten(s, SOMAXCONN);
+	if (err)
+		goto close;
+
+	return s;
+close:
+	xclose(s);
+	return -1;
+}
+
+/* socket_loopback_reuseport() without a reuseport program. */
+static int socket_loopback(int family, int sotype)
+{
+	const int no_prog = -1;
+
+	return socket_loopback_reuseport(family, sotype, no_prog);
+}
+
+/* Inserting a value that is not a usable fd must fail: -1 is expected to
+ * give EINVAL and INT_MAX is expected to give EBADF.
+ */
+static void test_insert_invalid(int family, int sotype, int mapfd)
+{
+	u64 value = -1;
+	u32 key = 0;
+	int err;
+
+	err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
+	if (!(err && errno == EINVAL))
+		FAIL_ERRNO("map_update: expected EINVAL");
+
+	value = INT_MAX;
+	err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
+	if (!(err && errno == EBADF))
+		FAIL_ERRNO("map_update: expected EBADF");
+}
+
+/* A socket that has only been created (neither bound nor listening) must
+ * be refused by the map with EOPNOTSUPP.
+ */
+static void test_insert_opened(int family, int sotype, int mapfd)
+{
+	u32 key = 0;
+	u64 value;
+	int sock, err;
+
+	sock = xsocket(family, sotype, 0);
+	if (sock == -1)
+		return;
+
+	value = sock;
+	errno = 0;
+	err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
+	if (!(err && errno == EOPNOTSUPP))
+		FAIL_ERRNO("map_update: expected EOPNOTSUPP");
+
+	xclose(sock);
+}
+
+/* A socket that is bound but not listening must be refused by the map
+ * with EOPNOTSUPP.
+ */
+static void test_insert_bound(int family, int sotype, int mapfd)
+{
+	struct sockaddr_storage addr;
+	socklen_t addrlen;
+	u32 key = 0;
+	u64 value;
+	int sock, err;
+
+	init_addr_loopback(family, &addr, &addrlen);
+
+	sock = xsocket(family, sotype, 0);
+	if (sock == -1)
+		return;
+
+	err = xbind(sock, sockaddr(&addr), addrlen);
+	if (!err) {
+		value = sock;
+		errno = 0;
+		err = bpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
+		if (!(err && errno == EOPNOTSUPP))
+			FAIL_ERRNO("map_update: expected EOPNOTSUPP");
+	}
+
+	xclose(sock);
+}
+
+/* Insert a socket obtained from socket_loopback() at key 0. */
+static void test_insert(int family, int sotype, int mapfd)
+{
+	u32 slot = 0;
+	u64 entry;
+	int fd;
+
+	fd = socket_loopback(family, sotype);
+	if (fd < 0)
+		return;
+
+	entry = fd;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+
+	xclose(fd);
+}
+
+/* Insert a socket at key 0, then delete that entry while the socket is
+ * still open.
+ */
+static void test_delete_after_insert(int family, int sotype, int mapfd)
+{
+	u32 slot = 0;
+	u64 entry;
+	int fd;
+
+	fd = socket_loopback(family, sotype);
+	if (fd < 0)
+		return;
+
+	entry = fd;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+	xbpf_map_delete_elem(mapfd, &slot);
+
+	xclose(fd);
+}
+
+/* Insert a socket at key 0, close it, and check that a subsequent
+ * delete of that key fails with EINVAL or ENOENT.
+ */
+static void test_delete_after_close(int family, int sotype, int mapfd)
+{
+	int err, s;
+	u64 value;
+	u32 key;
+
+	s = socket_loopback(family, sotype);
+	if (s < 0)
+		return;
+
+	key = 0;
+	value = s;
+	xbpf_map_update_elem(mapfd, &key, &value, BPF_NOEXIST);
+
+	xclose(s);
+
+	errno = 0;
+	err = bpf_map_delete_elem(mapfd, &key);
+	if (!err || (errno != EINVAL && errno != ENOENT))
+		/* SOCKMAP and SOCKHASH return different error codes */
+		FAIL_ERRNO("map_delete: expected EINVAL/ENOENT");
+}
+
+/* After inserting a socket, a lookup of its key must return the value
+ * reported by getsockopt(SO_COOKIE) for that socket.
+ */
+static void test_lookup_after_insert(int family, int sotype, int mapfd)
+{
+	u64 want, have;
+	socklen_t optlen = sizeof(want);
+	u32 slot = 0;
+	int fd;
+
+	fd = socket_loopback(family, sotype);
+	if (fd < 0)
+		return;
+
+	have = fd;
+	xbpf_map_update_elem(mapfd, &slot, &have, BPF_NOEXIST);
+
+	xgetsockopt(fd, SOL_SOCKET, SO_COOKIE, &want, &optlen);
+
+	/* Lookup overwrites the fd stored in 'have' with the map value */
+	xbpf_map_lookup_elem(mapfd, &slot, &have);
+
+	if (have != want)
+		FAIL("map_lookup: have %#llx, want %#llx",
+		     (unsigned long long)have, (unsigned long long)want);
+
+	xclose(fd);
+}
+
+/* Once an entry has been deleted, looking up its key must fail with
+ * ENOENT.
+ */
+static void test_lookup_after_delete(int family, int sotype, int mapfd)
+{
+	u32 slot = 0;
+	u64 entry;
+	int fd, ret;
+
+	fd = socket_loopback(family, sotype);
+	if (fd < 0)
+		return;
+
+	entry = fd;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+	xbpf_map_delete_elem(mapfd, &slot);
+
+	errno = 0;
+	ret = bpf_map_lookup_elem(mapfd, &slot, &entry);
+	if (ret == 0 || errno != ENOENT)
+		FAIL_ERRNO("map_lookup: expected ENOENT");
+
+	xclose(fd);
+}
+
+/* Create a private single-entry SOCKMAP with 32-bit values, insert a
+ * socket, and check that looking the entry up fails with ENOSPC.
+ * The @mapfd passed in is not used; the test works on its own map.
+ */
+static void test_lookup_32_bit_value(int family, int sotype, int mapfd)
+{
+	u32 slot = 0, entry32;
+	int fd, map32, ret;
+
+	fd = socket_loopback(family, sotype);
+	if (fd < 0)
+		return;
+
+	map32 = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(slot),
+			       sizeof(entry32), 1, 0);
+	if (map32 < 0) {
+		FAIL_ERRNO("map_create");
+		xclose(fd);
+		return;
+	}
+
+	entry32 = fd;
+	xbpf_map_update_elem(map32, &slot, &entry32, BPF_NOEXIST);
+
+	errno = 0;
+	ret = bpf_map_lookup_elem(map32, &slot, &entry32);
+	if (ret == 0 || errno != ENOSPC)
+		FAIL_ERRNO("map_lookup: expected ENOSPC");
+
+	xclose(map32);
+	xclose(fd);
+}
+
+/* Insert one socket at key 0, then replace it with a second socket
+ * using BPF_EXIST.
+ */
+static void test_update_existing(int family, int sotype, int mapfd)
+{
+	int first, second;
+	u32 slot = 0;
+	u64 entry;
+
+	first = socket_loopback(family, sotype);
+	if (first < 0)
+		return;
+
+	second = socket_loopback(family, sotype);
+	if (second >= 0) {
+		entry = first;
+		xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+
+		entry = second;
+		xbpf_map_update_elem(mapfd, &slot, &entry, BPF_EXIST);
+
+		xclose(second);
+	}
+
+	xclose(first);
+}
+
+/* Cover the path that tears down orphans, i.e. child sockets which were
+ * never accept()'ed, at the time their parent socket gets closed.
+ */
+static void test_destroy_orphan_child(int family, int sotype, int mapfd)
+{
+	struct sockaddr_storage srv_addr;
+	socklen_t srv_len = sizeof(srv_addr);
+	int srv, cli;
+	u32 slot = 0;
+	u64 entry;
+
+	srv = socket_loopback(family, sotype);
+	if (srv < 0)
+		return;
+
+	if (xgetsockname(srv, sockaddr(&srv_addr), &srv_len))
+		goto out;
+
+	entry = srv;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+
+	cli = xsocket(family, sotype, 0);
+	if (cli != -1) {
+		/* Connect but never accept(), leaving the child orphaned */
+		xconnect(cli, sockaddr(&srv_addr), srv_len);
+		xclose(cli);
+	}
+out:
+	xclose(srv);
+}
+
+/* Do a passive open once the listening socket has been taken out of
+ * the SOCKMAP again, to make sure its callbacks were restored properly.
+ */
+static void test_clone_after_delete(int family, int sotype, int mapfd)
+{
+	struct sockaddr_storage srv_addr;
+	socklen_t srv_len = sizeof(srv_addr);
+	int srv, cli;
+	u32 slot = 0;
+	u64 entry;
+
+	srv = socket_loopback(family, sotype);
+	if (srv < 0)
+		return;
+
+	if (xgetsockname(srv, sockaddr(&srv_addr), &srv_len))
+		goto out;
+
+	entry = srv;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+	xbpf_map_delete_elem(mapfd, &slot);
+
+	cli = xsocket(family, sotype, 0);
+	if (cli >= 0) {
+		xconnect(cli, sockaddr(&srv_addr), srv_len);
+		xclose(cli);
+	}
+out:
+	xclose(srv);
+}
+
+/* A child created while its parent sits in a SOCKMAP, but accept()'ed
+ * only after the parent was removed from the SOCKMAP, must be cloned
+ * without the parent's psock state or callbacks.
+ */
+static void test_accept_after_delete(int family, int sotype, int mapfd)
+{
+	struct sockaddr_storage srv_addr;
+	socklen_t srv_len = sizeof(srv_addr);
+	const u32 slot = 0;
+	int srv, cli, child;
+	u64 entry;
+
+	srv = socket_loopback(family, sotype | SOCK_NONBLOCK);
+	if (srv == -1)
+		return;
+
+	if (xgetsockname(srv, sockaddr(&srv_addr), &srv_len))
+		goto out_srv;
+
+	entry = srv;
+	if (xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST))
+		goto out_srv;
+
+	cli = xsocket(family, sotype, 0);
+	if (cli == -1)
+		goto out_srv;
+
+	/* Create child while parent is in sockmap */
+	if (xconnect(cli, sockaddr(&srv_addr), srv_len))
+		goto out_cli;
+
+	/* Remove parent from sockmap */
+	if (xbpf_map_delete_elem(mapfd, &slot))
+		goto out_cli;
+
+	child = xaccept_nonblock(srv, NULL, NULL);
+	if (child == -1)
+		goto out_cli;
+
+	/* Check that child sk_user_data is not set */
+	entry = child;
+	xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST);
+
+	xclose(child);
+out_cli:
+	xclose(cli);
+out_srv:
+	xclose(srv);
+}
+
+/* Check that child socket that got created and accepted while parent
+ * was in a SOCKMAP is cloned without parent psock state or callbacks.
+ *
+ * The parent stays at key 0 for the whole test; the accepted child is
+ * inserted at key 1.
+ */
+static void test_accept_before_delete(int family, int sotype, int mapfd)
+{
+ struct sockaddr_storage addr;
+ const u32 zero = 0, one = 1;
+ int err, s, c, p;
+ socklen_t len;
+ u64 value;
+
+ /* Non-blocking listener, accepted from with xaccept_nonblock() below */
+ s = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (s == -1)
+ return;
+
+ /* Fetch the address the listener got bound to, for connect() */
+ len = sizeof(addr);
+ err = xgetsockname(s, sockaddr(&addr), &len);
+ if (err)
+ goto close_srv;
+
+ value = s;
+ err = xbpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST);
+ if (err)
+ goto close_srv;
+
+ c = xsocket(family, sotype, 0);
+ if (c == -1)
+ goto close_srv;
+
+ /* Create & accept child while parent is in sockmap */
+ err = xconnect(c, sockaddr(&addr), len);
+ if (err)
+ goto close_cli;
+
+ p = xaccept_nonblock(s, NULL, NULL);
+ if (p == -1)
+ goto close_cli;
+
+ /* Check that child sk_user_data is not set */
+ value = p;
+ xbpf_map_update_elem(mapfd, &one, &value, BPF_NOEXIST);
+
+ xclose(p);
+close_cli:
+ xclose(c);
+close_srv:
+ xclose(s);
+}
+
+/* Arguments and completion flag shared between a test and the helper
+ * thread it spawns (connect_accept_thread() or listen_thread()).
+ */
+struct connect_accept_ctx {
+ int sockfd; /* socket the helper thread operates on */
+ unsigned int done; /* set to 1 by the thread on exit, polled via is_thread_done() */
+ unsigned int nr_iter; /* upper bound on the thread's loop iterations */
+};
+
+/* Report whether the helper thread has set its completion flag in @ctx. */
+static bool is_thread_done(struct connect_accept_ctx *ctx)
+{
+	unsigned int finished = READ_ONCE(ctx->done);
+
+	return finished != 0;
+}
+
+/* Thread body: repeatedly connect to and accept from the listening
+ * socket in ctx->sockfd, up to ctx->nr_iter times or until a step fails.
+ * Address family and socket type of the clients are queried from the
+ * listening socket itself.
+ *
+ * @arg points to a struct connect_accept_ctx. ctx->done is set to 1
+ * before returning, on every path. Always returns NULL.
+ */
+static void *connect_accept_thread(void *arg)
+{
+	struct connect_accept_ctx *ctx = arg;
+	struct sockaddr_storage addr;
+	socklen_t addrlen, optlen;
+	int family, socktype;
+	unsigned int i;
+	int err, s;
+
+	s = ctx->sockfd;
+
+	/* Keep the address length in its own variable so that connect()
+	 * below gets the length getsockname() reported, like every other
+	 * caller in this file, rather than sizeof(struct sockaddr_storage).
+	 */
+	addrlen = sizeof(addr);
+	err = xgetsockname(s, sockaddr(&addr), &addrlen);
+	if (err)
+		goto done;
+
+	optlen = sizeof(family);
+	err = xgetsockopt(s, SOL_SOCKET, SO_DOMAIN, &family, &optlen);
+	if (err)
+		goto done;
+
+	optlen = sizeof(socktype);
+	err = xgetsockopt(s, SOL_SOCKET, SO_TYPE, &socktype, &optlen);
+	if (err)
+		goto done;
+
+	for (i = 0; i < ctx->nr_iter; i++) {
+		int c, p;
+
+		c = xsocket(family, socktype, 0);
+		if (c < 0)
+			break;
+
+		err = xconnect(c, sockaddr(&addr), addrlen);
+		if (err) {
+			xclose(c);
+			break;
+		}
+
+		p = xaccept_nonblock(s, NULL, NULL);
+		if (p < 0) {
+			xclose(c);
+			break;
+		}
+
+		xclose(p);
+		xclose(c);
+	}
+done:
+	/* Tell the spawning test to stop its insert/delete loop */
+	WRITE_ONCE(ctx->done, 1);
+	return NULL;
+}
+
+/* Keep inserting and deleting a listening socket at key 0 while a
+ * helper thread (connect_accept_thread) connects to and accepts from
+ * that same socket, until the thread reports completion or a map
+ * operation fails.
+ */
+static void test_syn_recv_insert_delete(int family, int sotype, int mapfd)
+{
+	struct connect_accept_ctx ctx = { 0 };
+	struct sockaddr_storage srv_addr;
+	socklen_t srv_len = sizeof(srv_addr);
+	pthread_t worker;
+	u32 slot = 0;
+	u64 entry;
+	int srv;
+
+	srv = socket_loopback(family, sotype | SOCK_NONBLOCK);
+	if (srv < 0)
+		return;
+
+	if (xgetsockname(srv, sockaddr(&srv_addr), &srv_len))
+		goto out;
+
+	ctx.sockfd = srv;
+	ctx.nr_iter = 1000;
+
+	if (xpthread_create(&worker, NULL, connect_accept_thread, &ctx))
+		goto out;
+
+	entry = srv;
+	while (!is_thread_done(&ctx)) {
+		if (xbpf_map_update_elem(mapfd, &slot, &entry, BPF_NOEXIST))
+			break;
+
+		if (xbpf_map_delete_elem(mapfd, &slot))
+			break;
+	}
+
+	xpthread_join(worker, NULL);
+out:
+	xclose(srv);
+}
+
+/* Thread body: alternate listen() and connect(AF_UNSPEC) on
+ * ctx->sockfd, up to ctx->nr_iter times or until one of the calls fails.
+ * ctx->done is set to 1 before returning. Always returns NULL.
+ */
+static void *listen_thread(void *arg)
+{
+	struct sockaddr unspec = { .sa_family = AF_UNSPEC };
+	struct connect_accept_ctx *ctx = arg;
+	int fd = ctx->sockfd;
+	int round;
+
+	for (round = 0; round < ctx->nr_iter; round++) {
+		if (xlisten(fd, 1))
+			break;
+
+		if (xconnect(fd, &unspec, sizeof(unspec)))
+			break;
+	}
+
+	WRITE_ONCE(ctx->done, 1);
+	return NULL;
+}
+
+/* Race map insert/delete of a socket against a helper thread
+ * (listen_thread) that keeps switching the same socket between
+ * listening and unconnected state.
+ *
+ * Insert may fail with EOPNOTSUPP and delete with EINVAL or ENOENT;
+ * any other error fails the test.
+ */
+static void test_race_insert_listen(int family, int socktype, int mapfd)
+{
+	struct connect_accept_ctx ctx = { 0 };
+	const u32 zero = 0;
+	const int one = 1;
+	pthread_t t;
+	int err, s;
+	u64 value;
+
+	s = xsocket(family, socktype, 0);
+	if (s < 0)
+		return;
+
+	err = xsetsockopt(s, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
+	if (err)
+		goto close;
+
+	ctx.sockfd = s;
+	ctx.nr_iter = 10000;
+
+	/* Use the checked wrapper, as test_syn_recv_insert_delete() does,
+	 * so that a thread creation failure is not silently skipped.
+	 */
+	err = xpthread_create(&t, NULL, listen_thread, &ctx);
+	if (err)
+		goto close;
+
+	value = s;
+	while (!is_thread_done(&ctx)) {
+		err = bpf_map_update_elem(mapfd, &zero, &value, BPF_NOEXIST);
+		/* Expecting EOPNOTSUPP before listen() */
+		if (err && errno != EOPNOTSUPP) {
+			FAIL_ERRNO("map_update");
+			break;
+		}
+
+		err = bpf_map_delete_elem(mapfd, &zero);
+		/* Expecting no entry after unhash on connect(AF_UNSPEC) */
+		if (err && errno != EINVAL && errno != ENOENT) {
+			FAIL_ERRNO("map_delete");
+			break;
+		}
+	}
+
+	xpthread_join(t, NULL);
+close:
+	xclose(s);
+}
+
+/* Reset the SK_DROP and SK_PASS counters in the verdict map to zero. */
+static void zero_verdict_count(int mapfd)
+{
+	const int verdicts[] = { SK_DROP, SK_PASS };
+	unsigned int zero = 0;
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(verdicts); i++) {
+		int key = verdicts[i];
+
+		xbpf_map_update_elem(mapfd, &key, &zero, BPF_ANY);
+	}
+}
+
+/* Selects which end of a connection the redirect helpers write the test
+ * byte to; redir_mode_str() maps the values to log prefixes.
+ */
+enum redir_mode {
+ REDIR_INGRESS, /* write on the connecting (client) socket */
+ REDIR_EGRESS, /* write on the accepted (peer) socket */
+};
+
+/* Map a redirect mode to the string used as log prefix. */
+static const char *redir_mode_str(enum redir_mode mode)
+{
+	if (mode == REDIR_INGRESS)
+		return "ingress";
+
+	if (mode == REDIR_EGRESS)
+		return "egress";
+
+	return "unknown";
+}
+
+/* Set up two established connections (c0 <-> p0 and c1 <-> p1) through
+ * one listening socket, store p0 at key 0 and p1 at key 1 of
+ * @sock_mapfd, then write a single byte and check the outcome.
+ *
+ * The byte is written to c1 for REDIR_INGRESS and to p1 for
+ * REDIR_EGRESS. Afterwards the SK_PASS counter in @verd_mapfd must be 1
+ * and one byte must be readable from c0.
+ */
+static void redir_to_connected(int family, int sotype, int sock_mapfd,
+ int verd_mapfd, enum redir_mode mode)
+{
+ const char *log_prefix = redir_mode_str(mode);
+ struct sockaddr_storage addr;
+ int s, c0, c1, p0, p1;
+ unsigned int pass;
+ socklen_t len;
+ int err, n;
+ u64 value;
+ u32 key;
+ char b;
+
+ /* Start from clean verdict counters */
+ zero_verdict_count(verd_mapfd);
+
+ s = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (s < 0)
+ return;
+
+ len = sizeof(addr);
+ err = xgetsockname(s, sockaddr(&addr), &len);
+ if (err)
+ goto close_srv;
+
+ /* First connection: c0 <-> p0 */
+ c0 = xsocket(family, sotype, 0);
+ if (c0 < 0)
+ goto close_srv;
+ err = xconnect(c0, sockaddr(&addr), len);
+ if (err)
+ goto close_cli0;
+
+ p0 = xaccept_nonblock(s, NULL, NULL);
+ if (p0 < 0)
+ goto close_cli0;
+
+ /* Second connection: c1 <-> p1 */
+ c1 = xsocket(family, sotype, 0);
+ if (c1 < 0)
+ goto close_peer0;
+ err = xconnect(c1, sockaddr(&addr), len);
+ if (err)
+ goto close_cli1;
+
+ p1 = xaccept_nonblock(s, NULL, NULL);
+ if (p1 < 0)
+ goto close_cli1;
+
+ key = 0;
+ value = p0;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_peer1;
+
+ key = 1;
+ value = p1;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_peer1;
+
+ /* Sending side depends on the redirect direction under test */
+ n = write(mode == REDIR_INGRESS ? c1 : p1, "a", 1);
+ if (n < 0)
+ FAIL_ERRNO("%s: write", log_prefix);
+ if (n == 0)
+ FAIL("%s: incomplete write", log_prefix);
+ if (n < 1)
+ goto close_peer1;
+
+ key = SK_PASS;
+ err = xbpf_map_lookup_elem(verd_mapfd, &key, &pass);
+ if (err)
+ goto close_peer1;
+ if (pass != 1)
+ FAIL("%s: want pass count 1, have %d", log_prefix, pass);
+
+ /* The byte is expected to show up on the first connection */
+ n = read(c0, &b, 1);
+ if (n < 0)
+ FAIL_ERRNO("%s: read", log_prefix);
+ if (n == 0)
+ FAIL("%s: incomplete read", log_prefix);
+
+ /* Unwind in reverse order of creation */
+close_peer1:
+ xclose(p1);
+close_cli1:
+ xclose(c1);
+close_peer0:
+ xclose(p0);
+close_cli0:
+ xclose(c0);
+close_srv:
+ xclose(s);
+}
+
+/* Attach the skb stream parser and stream verdict programs to
+ * @inner_map, run redir_to_connected() in REDIR_INGRESS mode, and
+ * detach the programs again.
+ */
+static void test_skb_redir_to_connected(struct test_sockmap_listen *skel,
+					struct bpf_map *inner_map, int family,
+					int sotype)
+{
+	int parser_fd = bpf_program__fd(skel->progs.prog_skb_parser);
+	int verdict_fd = bpf_program__fd(skel->progs.prog_skb_verdict);
+	int verdict_map_fd = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map_fd = bpf_map__fd(inner_map);
+
+	if (xbpf_prog_attach(parser_fd, sock_map_fd,
+			     BPF_SK_SKB_STREAM_PARSER, 0))
+		return;
+
+	if (!xbpf_prog_attach(verdict_fd, sock_map_fd,
+			      BPF_SK_SKB_STREAM_VERDICT, 0)) {
+		redir_to_connected(family, sotype, sock_map_fd,
+				   verdict_map_fd, REDIR_INGRESS);
+
+		xbpf_prog_detach2(verdict_fd, sock_map_fd,
+				  BPF_SK_SKB_STREAM_VERDICT);
+	}
+
+	xbpf_prog_detach2(parser_fd, sock_map_fd, BPF_SK_SKB_STREAM_PARSER);
+}
+
+/* Attach the msg verdict program to @inner_map, run
+ * redir_to_connected() in REDIR_EGRESS mode, and detach it again.
+ */
+static void test_msg_redir_to_connected(struct test_sockmap_listen *skel,
+					struct bpf_map *inner_map, int family,
+					int sotype)
+{
+	int verdict_fd = bpf_program__fd(skel->progs.prog_msg_verdict);
+	int verdict_map_fd = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map_fd = bpf_map__fd(inner_map);
+
+	if (xbpf_prog_attach(verdict_fd, sock_map_fd, BPF_SK_MSG_VERDICT, 0))
+		return;
+
+	redir_to_connected(family, sotype, sock_map_fd, verdict_map_fd,
+			   REDIR_EGRESS);
+
+	xbpf_prog_detach2(verdict_fd, sock_map_fd, BPF_SK_MSG_VERDICT);
+}
+
+/* Establish one connection (c <-> p) through a listening socket, store
+ * the listening socket at key 0 and the accepted peer at key 1 of
+ * @sock_mapfd, then write a single byte and expect it to be dropped.
+ *
+ * The byte is written to c for REDIR_INGRESS and to p for REDIR_EGRESS.
+ * A write failing with EACCES is tolerated silently. After a successful
+ * write the SK_DROP counter in @verd_mapfd must be 1.
+ */
+static void redir_to_listening(int family, int sotype, int sock_mapfd,
+ int verd_mapfd, enum redir_mode mode)
+{
+ const char *log_prefix = redir_mode_str(mode);
+ struct sockaddr_storage addr;
+ int s, c, p, err, n;
+ unsigned int drop;
+ socklen_t len;
+ u64 value;
+ u32 key;
+
+ /* Start from clean verdict counters */
+ zero_verdict_count(verd_mapfd);
+
+ s = socket_loopback(family, sotype | SOCK_NONBLOCK);
+ if (s < 0)
+ return;
+
+ len = sizeof(addr);
+ err = xgetsockname(s, sockaddr(&addr), &len);
+ if (err)
+ goto close_srv;
+
+ c = xsocket(family, sotype, 0);
+ if (c < 0)
+ goto close_srv;
+ err = xconnect(c, sockaddr(&addr), len);
+ if (err)
+ goto close_cli;
+
+ p = xaccept_nonblock(s, NULL, NULL);
+ if (p < 0)
+ goto close_cli;
+
+ /* Key 0 holds the listening socket itself */
+ key = 0;
+ value = s;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_peer;
+
+ key = 1;
+ value = p;
+ err = xbpf_map_update_elem(sock_mapfd, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_peer;
+
+ /* Sending side depends on the redirect direction under test */
+ n = write(mode == REDIR_INGRESS ? c : p, "a", 1);
+ if (n < 0 && errno != EACCES)
+ FAIL_ERRNO("%s: write", log_prefix);
+ if (n == 0)
+ FAIL("%s: incomplete write", log_prefix);
+ /* Skip the counter check whenever nothing was written */
+ if (n < 1)
+ goto close_peer;
+
+ key = SK_DROP;
+ err = xbpf_map_lookup_elem(verd_mapfd, &key, &drop);
+ if (err)
+ goto close_peer;
+ if (drop != 1)
+ FAIL("%s: want drop count 1, have %d", log_prefix, drop);
+
+close_peer:
+ xclose(p);
+close_cli:
+ xclose(c);
+close_srv:
+ xclose(s);
+}
+
+/* Attach the skb stream parser and stream verdict programs to
+ * @inner_map, run redir_to_listening() in REDIR_INGRESS mode, and
+ * detach the programs again.
+ */
+static void test_skb_redir_to_listening(struct test_sockmap_listen *skel,
+					struct bpf_map *inner_map, int family,
+					int sotype)
+{
+	int parser_fd = bpf_program__fd(skel->progs.prog_skb_parser);
+	int verdict_fd = bpf_program__fd(skel->progs.prog_skb_verdict);
+	int verdict_map_fd = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map_fd = bpf_map__fd(inner_map);
+
+	if (xbpf_prog_attach(parser_fd, sock_map_fd,
+			     BPF_SK_SKB_STREAM_PARSER, 0))
+		return;
+
+	if (!xbpf_prog_attach(verdict_fd, sock_map_fd,
+			      BPF_SK_SKB_STREAM_VERDICT, 0)) {
+		redir_to_listening(family, sotype, sock_map_fd,
+				   verdict_map_fd, REDIR_INGRESS);
+
+		xbpf_prog_detach2(verdict_fd, sock_map_fd,
+				  BPF_SK_SKB_STREAM_VERDICT);
+	}
+
+	xbpf_prog_detach2(parser_fd, sock_map_fd, BPF_SK_SKB_STREAM_PARSER);
+}
+
+/* Attach the msg verdict program to @inner_map, run
+ * redir_to_listening() in REDIR_EGRESS mode, and detach it again.
+ */
+static void test_msg_redir_to_listening(struct test_sockmap_listen *skel,
+					struct bpf_map *inner_map, int family,
+					int sotype)
+{
+	int verdict_fd = bpf_program__fd(skel->progs.prog_msg_verdict);
+	int verdict_map_fd = bpf_map__fd(skel->maps.verdict_map);
+	int sock_map_fd = bpf_map__fd(inner_map);
+
+	if (xbpf_prog_attach(verdict_fd, sock_map_fd, BPF_SK_MSG_VERDICT, 0))
+		return;
+
+	redir_to_listening(family, sotype, sock_map_fd, verdict_map_fd,
+			   REDIR_EGRESS);
+
+	xbpf_prog_detach2(verdict_fd, sock_map_fd, BPF_SK_MSG_VERDICT);
+}
+
+/* Create a reuseport socket with @reuseport_prog attached, store it at
+ * key 0 of @sock_map, and send it traffic: a connection that gets
+ * accepted for SOCK_STREAM, a one byte datagram that gets received
+ * otherwise. Afterwards the SK_PASS counter in @verd_map must be 1.
+ */
+static void test_reuseport_select_listening(int family, int sotype,
+ int sock_map, int verd_map,
+ int reuseport_prog)
+{
+ struct sockaddr_storage addr;
+ unsigned int pass;
+ int s, c, err;
+ socklen_t len;
+ u64 value;
+ u32 key;
+
+ /* Start from clean verdict counters */
+ zero_verdict_count(verd_map);
+
+ s = socket_loopback_reuseport(family, sotype | SOCK_NONBLOCK,
+ reuseport_prog);
+ if (s < 0)
+ return;
+
+ len = sizeof(addr);
+ err = xgetsockname(s, sockaddr(&addr), &len);
+ if (err)
+ goto close_srv;
+
+ key = 0;
+ value = s;
+ err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
+ if (err)
+ goto close_srv;
+
+ c = xsocket(family, sotype, 0);
+ if (c < 0)
+ goto close_srv;
+ err = xconnect(c, sockaddr(&addr), len);
+ if (err)
+ goto close_cli;
+
+ if (sotype == SOCK_STREAM) {
+ int p;
+
+ /* The accepted socket itself is not needed, close it right away */
+ p = xaccept_nonblock(s, NULL, NULL);
+ if (p < 0)
+ goto close_cli;
+ xclose(p);
+ } else {
+ char b = 'a';
+ ssize_t n;
+
+ /* Send one byte from the client and receive it on the server socket */
+ n = xsend(c, &b, sizeof(b), 0);
+ if (n == -1)
+ goto close_cli;
+
+ n = xrecv_nonblock(s, &b, sizeof(b), 0);
+ if (n == -1)
+ goto close_cli;
+ }
+
+ key = SK_PASS;
+ err = xbpf_map_lookup_elem(verd_map, &key, &pass);
+ if (err)
+ goto close_cli;
+ if (pass != 1)
+ FAIL("want pass count 1, have %d", pass);
+
+close_cli:
+ xclose(c);
+close_srv:
+ xclose(s);
+}
+
+/* Check that a connected socket stored in @sock_map cannot be used as
+ * the reuseport selection target.
+ *
+ * sock_map[0] first holds the reuseport server socket so that an
+ * initial connection (c0 <-> p0) can be set up, and is then replaced
+ * with the connected socket p0. A second client (c1) contacting the
+ * server must then see ECONNREFUSED, and the SK_DROP counter in
+ * @verd_map must be 1.
+ */
+static void test_reuseport_select_connected(int family, int sotype,
+					    int sock_map, int verd_map,
+					    int reuseport_prog)
+{
+	struct sockaddr_storage addr;
+	int s, c0, c1, p0, err;
+	unsigned int drop;
+	socklen_t len;
+	u64 value;
+	u32 key;
+
+	zero_verdict_count(verd_map);
+
+	s = socket_loopback_reuseport(family, sotype, reuseport_prog);
+	if (s < 0)
+		return;
+
+	/* Populate sock_map[0] to avoid ENOENT on first connection */
+	key = 0;
+	value = s;
+	err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
+	if (err)
+		goto close_srv;
+
+	len = sizeof(addr);
+	err = xgetsockname(s, sockaddr(&addr), &len);
+	if (err)
+		goto close_srv;
+
+	c0 = xsocket(family, sotype, 0);
+	if (c0 < 0)
+		goto close_srv;
+
+	err = xconnect(c0, sockaddr(&addr), len);
+	if (err)
+		goto close_cli0;
+
+	if (sotype == SOCK_STREAM) {
+		p0 = xaccept_nonblock(s, NULL, NULL);
+		if (p0 < 0)
+			goto close_cli0;
+	} else {
+		/* No accept() for datagram sockets: build the connected
+		 * peer by hand, connecting it back to c0's address.
+		 */
+		p0 = xsocket(family, sotype, 0);
+		if (p0 < 0)
+			goto close_cli0;
+
+		/* p0 is open from here on, so unwind through close_peer0 */
+		len = sizeof(addr);
+		err = xgetsockname(c0, sockaddr(&addr), &len);
+		if (err)
+			goto close_peer0;
+
+		err = xconnect(p0, sockaddr(&addr), len);
+		if (err)
+			goto close_peer0;
+	}
+
+	/* Update sock_map[0] to redirect to a connected socket */
+	key = 0;
+	value = p0;
+	err = xbpf_map_update_elem(sock_map, &key, &value, BPF_EXIST);
+	if (err)
+		goto close_peer0;
+
+	c1 = xsocket(family, sotype, 0);
+	if (c1 < 0)
+		goto close_peer0;
+
+	/* addr may hold c0's address by now, fetch the server's again.
+	 * c1, p0 and c0 are all open here, so unwind through close_cli1.
+	 */
+	len = sizeof(addr);
+	err = xgetsockname(s, sockaddr(&addr), &len);
+	if (err)
+		goto close_cli1;
+
+	errno = 0;
+	err = connect(c1, sockaddr(&addr), len);
+	if (sotype == SOCK_DGRAM) {
+		char b = 'a';
+		ssize_t n;
+
+		/* For datagram sockets the refusal only shows up on a
+		 * receive that follows an actual send.
+		 */
+		n = xsend(c1, &b, sizeof(b), 0);
+		if (n == -1)
+			goto close_cli1;
+
+		n = recv_timeout(c1, &b, sizeof(b), 0, IO_TIMEOUT_SEC);
+		err = n == -1;
+	}
+	if (!err || errno != ECONNREFUSED)
+		FAIL_ERRNO("connect: expected ECONNREFUSED");
+
+	key = SK_DROP;
+	err = xbpf_map_lookup_elem(verd_map, &key, &drop);
+	if (err)
+		goto close_cli1;
+	if (drop != 1)
+		FAIL("want drop count 1, have %d", drop);
+
+close_cli1:
+	xclose(c1);
+close_peer0:
+	xclose(p0);
+close_cli0:
+	xclose(c0);
+close_srv:
+	xclose(s);
+}
+
+/* Check that redirecting across reuseport groups is not allowed.
+ *
+ * Two reuseport sockets are created, s1 stored at sock_map[0] and s2 at
+ * sock_map[1]. A client contacting s2 must see ECONNREFUSED, and the
+ * SK_DROP counter in @verd_map must be 1.
+ */
+static void test_reuseport_mixed_groups(int family, int sotype, int sock_map,
+					int verd_map, int reuseport_prog)
+{
+	struct sockaddr_storage addr;
+	int s1, s2, c, err;
+	unsigned int drop;
+	socklen_t len;
+	u64 value;
+	u32 key;
+
+	zero_verdict_count(verd_map);
+
+	/* Create two listeners, each in its own reuseport group */
+	s1 = socket_loopback_reuseport(family, sotype, reuseport_prog);
+	if (s1 < 0)
+		return;
+
+	s2 = socket_loopback_reuseport(family, sotype, reuseport_prog);
+	if (s2 < 0)
+		goto close_srv1;
+
+	key = 0;
+	value = s1;
+	err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
+	if (err)
+		goto close_srv2;
+
+	key = 1;
+	value = s2;
+	err = xbpf_map_update_elem(sock_map, &key, &value, BPF_NOEXIST);
+	/* Bail out rather than continue with a half-populated map */
+	if (err)
+		goto close_srv2;
+
+	/* Connect to s2, reuseport BPF selects s1 via sock_map[0] */
+	len = sizeof(addr);
+	err = xgetsockname(s2, sockaddr(&addr), &len);
+	if (err)
+		goto close_srv2;
+
+	c = xsocket(family, sotype, 0);
+	if (c < 0)
+		goto close_srv2;
+
+	/* Clear errno so the ECONNREFUSED check below cannot be satisfied
+	 * or confused by a value left over from an earlier call.
+	 */
+	errno = 0;
+	err = connect(c, sockaddr(&addr), len);
+	if (sotype == SOCK_DGRAM) {
+		char b = 'a';
+		ssize_t n;
+
+		n = xsend(c, &b, sizeof(b), 0);
+		if (n == -1)
+			goto close_cli;
+
+		n = recv_timeout(c, &b, sizeof(b), 0, IO_TIMEOUT_SEC);
+		err = n == -1;
+	}
+	if (!err || errno != ECONNREFUSED) {
+		FAIL_ERRNO("connect: expected ECONNREFUSED");
+		goto close_cli;
+	}
+
+	/* Expect drop, can't redirect outside of reuseport group */
+	key = SK_DROP;
+	err = xbpf_map_lookup_elem(verd_map, &key, &drop);
+	if (err)
+		goto close_cli;
+	if (drop != 1)
+		FAIL("want drop count 1, have %d", drop);
+
+close_cli:
+	xclose(c);
+close_srv2:
+	xclose(s2);
+close_srv1:
+	xclose(s1);
+}
+
+/* Build one entry of a test table: the function pointer, the function
+ * name as a string, and any further initializers passed in (the op and
+ * reuseport tables use this for an optional socket type restriction).
+ */
+#define TEST(fn, ...) \
+ { \
+ fn, #fn, __VA_ARGS__ \
+ }
+
+/* Delete every key in [0, max_entries) from @map. Deletes that fail
+ * with EINVAL or ENOENT are tolerated; any other error is reported.
+ */
+static void test_ops_cleanup(const struct bpf_map *map)
+{
+	const struct bpf_map_def *def = bpf_map__def(map);
+	int fd = bpf_map__fd(map);
+	u32 key = 0;
+
+	while (key < def->max_entries) {
+		if (bpf_map_delete_elem(fd, &key) &&
+		    errno != EINVAL && errno != ENOENT)
+			FAIL_ERRNO("map_delete: expected EINVAL/ENOENT");
+		key++;
+	}
+}
+
+/* Human-readable name of an address family, for subtest names. */
+static const char *family_str(sa_family_t family)
+{
+	if (family == AF_INET)
+		return "IPv4";
+
+	if (family == AF_INET6)
+		return "IPv6";
+
+	return "unknown";
+}
+
+/* Human-readable name of the type of @map, for subtest names. Returns
+ * "invalid" when the map definition cannot be obtained.
+ */
+static const char *map_type_str(const struct bpf_map *map)
+{
+	const struct bpf_map_def *def = bpf_map__def(map);
+
+	if (IS_ERR(def))
+		return "invalid";
+
+	if (def->type == BPF_MAP_TYPE_SOCKMAP)
+		return "sockmap";
+
+	if (def->type == BPF_MAP_TYPE_SOCKHASH)
+		return "sockhash";
+
+	return "unknown";
+}
+
+/* Human-readable protocol name for a socket type, for subtest names. */
+static const char *sotype_str(int sotype)
+{
+	if (sotype == SOCK_DGRAM)
+		return "UDP";
+
+	if (sotype == SOCK_STREAM)
+		return "TCP";
+
+	return "unknown";
+}
+
+/* Run the map operation tests against @map for one address family and
+ * socket type. Each test runs as its own subtest named
+ * "<map type> <family> <socket type> <function name>", and the map is
+ * emptied with test_ops_cleanup() after every test that ran.
+ * @skel is not used by this function.
+ */
+static void test_ops(struct test_sockmap_listen *skel, struct bpf_map *map,
+ int family, int sotype)
+{
+ const struct op_test {
+ void (*fn)(int family, int sotype, int mapfd);
+ const char *name;
+ /* 0 (left out in TEST()) runs for any type, else only for this one */
+ int sotype;
+ } tests[] = {
+ /* insert */
+ TEST(test_insert_invalid),
+ TEST(test_insert_opened),
+ TEST(test_insert_bound, SOCK_STREAM),
+ TEST(test_insert),
+ /* delete */
+ TEST(test_delete_after_insert),
+ TEST(test_delete_after_close),
+ /* lookup */
+ TEST(test_lookup_after_insert),
+ TEST(test_lookup_after_delete),
+ TEST(test_lookup_32_bit_value),
+ /* update */
+ TEST(test_update_existing),
+ /* races with insert/delete */
+ TEST(test_destroy_orphan_child, SOCK_STREAM),
+ TEST(test_syn_recv_insert_delete, SOCK_STREAM),
+ TEST(test_race_insert_listen, SOCK_STREAM),
+ /* child clone */
+ TEST(test_clone_after_delete, SOCK_STREAM),
+ TEST(test_accept_after_delete, SOCK_STREAM),
+ TEST(test_accept_before_delete, SOCK_STREAM),
+ };
+ const char *family_name, *map_name, *sotype_name;
+ const struct op_test *t;
+ char s[MAX_TEST_NAME];
+ int map_fd;
+
+ family_name = family_str(family);
+ map_name = map_type_str(map);
+ sotype_name = sotype_str(sotype);
+ map_fd = bpf_map__fd(map);
+
+ for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+ snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name,
+ sotype_name, t->name);
+
+ /* Skip tests restricted to a different socket type */
+ if (t->sotype != 0 && t->sotype != sotype)
+ continue;
+
+ if (!test__start_subtest(s))
+ continue;
+
+ t->fn(family, sotype, map_fd);
+ /* Leave an empty map behind for the next test */
+ test_ops_cleanup(map);
+ }
+}
+
+/* Run the redirect tests against @map for one address family. Each
+ * test runs as its own subtest named
+ * "<map type> <family> <function name>"; the socket type is passed on
+ * to the tests but is not part of the name.
+ */
+static void test_redir(struct test_sockmap_listen *skel, struct bpf_map *map,
+ int family, int sotype)
+{
+ const struct redir_test {
+ void (*fn)(struct test_sockmap_listen *skel,
+ struct bpf_map *map, int family, int sotype);
+ const char *name;
+ } tests[] = {
+ TEST(test_skb_redir_to_connected),
+ TEST(test_skb_redir_to_listening),
+ TEST(test_msg_redir_to_connected),
+ TEST(test_msg_redir_to_listening),
+ };
+ const char *family_name, *map_name;
+ const struct redir_test *t;
+ char s[MAX_TEST_NAME];
+
+ family_name = family_str(family);
+ map_name = map_type_str(map);
+
+ for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+ snprintf(s, sizeof(s), "%s %s %s", map_name, family_name,
+ t->name);
+
+ if (!test__start_subtest(s))
+ continue;
+
+ t->fn(skel, map, family, sotype);
+ }
+}
+
+/* Run the reuseport tests against @map for one address family and
+ * socket type. Each test runs as its own subtest named
+ * "<map type> <family> <socket type> <function name>" and receives the
+ * fds of @map, the skeleton's verdict map and its reuseport program.
+ */
+static void test_reuseport(struct test_sockmap_listen *skel,
+ struct bpf_map *map, int family, int sotype)
+{
+ const struct reuseport_test {
+ void (*fn)(int family, int sotype, int socket_map,
+ int verdict_map, int reuseport_prog);
+ const char *name;
+ /* 0 (left out in TEST()) runs for any type, else only for this one */
+ int sotype;
+ } tests[] = {
+ TEST(test_reuseport_select_listening),
+ TEST(test_reuseport_select_connected),
+ TEST(test_reuseport_mixed_groups),
+ };
+ int socket_map, verdict_map, reuseport_prog;
+ const char *family_name, *map_name, *sotype_name;
+ const struct reuseport_test *t;
+ char s[MAX_TEST_NAME];
+
+ family_name = family_str(family);
+ map_name = map_type_str(map);
+ sotype_name = sotype_str(sotype);
+
+ socket_map = bpf_map__fd(map);
+ verdict_map = bpf_map__fd(skel->maps.verdict_map);
+ reuseport_prog = bpf_program__fd(skel->progs.prog_reuseport);
+
+ for (t = tests; t < tests + ARRAY_SIZE(tests); t++) {
+ snprintf(s, sizeof(s), "%s %s %s %s", map_name, family_name,
+ sotype_name, t->name);
+
+ /* Skip tests restricted to a different socket type */
+ if (t->sotype != 0 && t->sotype != sotype)
+ continue;
+
+ if (!test__start_subtest(s))
+ continue;
+
+ t->fn(family, sotype, socket_map, verdict_map, reuseport_prog);
+ }
+}
+
+/* Run all test groups against @map for one address family: the map
+ * operation tests for TCP then UDP, the redirect tests for TCP, and
+ * the reuseport tests for TCP then UDP.
+ */
+static void run_tests(struct test_sockmap_listen *skel, struct bpf_map *map,
+		      int family)
+{
+	static const int sotypes[] = { SOCK_STREAM, SOCK_DGRAM };
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(sotypes); i++)
+		test_ops(skel, map, family, sotypes[i]);
+
+	test_redir(skel, map, family, SOCK_STREAM);
+
+	for (i = 0; i < ARRAY_SIZE(sotypes); i++)
+		test_reuseport(skel, map, family, sotypes[i]);
+}
+
+/* Test entry point: load the BPF skeleton and run every test group for
+ * IPv4 and IPv6, first against the skeleton's sock_map and then against
+ * its sock_hash. The skeleton's test_sockmap flag is set to match the
+ * map under test before each pass.
+ */
+void test_sockmap_listen(void)
+{
+	static const int families[] = { AF_INET, AF_INET6 };
+	struct test_sockmap_listen *skel;
+	unsigned int i;
+
+	skel = test_sockmap_listen__open_and_load();
+	if (!skel) {
+		FAIL("skeleton open/load failed");
+		return;
+	}
+
+	skel->bss->test_sockmap = true;
+	for (i = 0; i < ARRAY_SIZE(families); i++)
+		run_tests(skel, skel->maps.sock_map, families[i]);
+
+	skel->bss->test_sockmap = false;
+	for (i = 0; i < ARRAY_SIZE(families); i++)
+		run_tests(skel, skel->maps.sock_hash, families[i]);
+
+	test_sockmap_listen__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt.c b/tools/testing/selftests/bpf/prog_tests/sockopt.c
new file mode 100644
index 000000000..3e8517a83
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt.c
@@ -0,0 +1,985 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+/* Buffer handed to bpf_load_program_xattr() in load_prog() to receive its log. */
+static char bpf_log_buf[4096];
+/*
+ * When true, load_prog() prints bpf_log_buf to stderr after a failed load.
+ * NOTE(review): nothing visible in this file assigns it, so it stays false.
+ */
+static bool verbose;
+
+/*
+ * Expected outcome of one table entry, as checked by run_test().
+ * Any value other than OK names the single step that must fail.
+ */
+enum sockopt_test_error {
+ /* every step is expected to succeed */
+ OK = 0,
+ /* loading the BPF program is expected to fail */
+ DENY_LOAD,
+ /* attaching the program to the cgroup is expected to fail */
+ DENY_ATTACH,
+ /* getsockopt() is expected to fail with errno == EPERM */
+ EPERM_GETSOCKOPT,
+ /* getsockopt() is expected to fail with errno == EFAULT */
+ EFAULT_GETSOCKOPT,
+ /* setsockopt() is expected to fail with errno == EPERM */
+ EPERM_SETSOCKOPT,
+ /* setsockopt() is expected to fail with errno == EFAULT */
+ EFAULT_SETSOCKOPT,
+};
+
+/*
+ * One table-driven test case: a raw BPF program plus the setsockopt() and
+ * getsockopt() calls to issue while it is attached, and the expected result.
+ */
+static struct sockopt_test {
+ /* human-readable name; also used as the subtest name */
+ const char *descr;
+ /*
+  * Program text. load_prog() counts instructions up to and including
+  * the first BPF_EXIT, so that must be the last instruction.
+  */
+ const struct bpf_insn insns[64];
+ /* attach type passed to bpf_prog_attach() / bpf_prog_detach2() */
+ enum bpf_attach_type attach_type;
+ /* expected_attach_type passed at program load time */
+ enum bpf_attach_type expected_attach_type;
+
+ /* setsockopt() arguments; the call is skipped when set_optlen == 0 */
+ int set_optname;
+ int set_level;
+ const char set_optval[64];
+ socklen_t set_optlen;
+
+ /*
+  * getsockopt() arguments; the call is skipped when get_optlen == 0.
+  * get_optval is the value the call is expected to return.
+  */
+ int get_optname;
+ int get_level;
+ const char get_optval[64];
+ socklen_t get_optlen;
+ /* expected returned optlen; 0 means "same as get_optlen" */
+ socklen_t get_optlen_ret;
+
+ /* which step, if any, is expected to fail */
+ enum sockopt_test_error error;
+} tests[] = {
+
+ /* ==================== getsockopt ==================== */
+
+ {
+ .descr = "getsockopt: no expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = 0,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: wrong expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+ .error = DENY_ATTACH,
+ },
+ {
+ .descr = "getsockopt: bypass bpf hook",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: return EPERM from bpf hook",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+
+ .get_optlen = 1,
+ .error = EPERM_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: no optval bounds check, deny loading",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+
+ /* ctx->optval[0] = 0x80 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x80),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->level",
+ .insns = {
+ /* r6 = ctx->level */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, level)),
+
+ /* if (ctx->level == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = 123,
+
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->level",
+ .insns = {
+ /* ctx->level = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, level)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->optname",
+ .insns = {
+ /* r6 = ctx->optname */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optname)),
+
+ /* if (ctx->optname == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optname = 123,
+
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: read ctx->retval",
+ .insns = {
+ /* r6 = ctx->retval */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+ .get_optlen = 1,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optname",
+ .insns = {
+ /* ctx->optname = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optname)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: read ctx->optlen",
+ .insns = {
+ /* r6 = ctx->optlen */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* if (ctx->optlen == 64) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 64, 4),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+ },
+ {
+ .descr = "getsockopt: deny bigger ctx->optlen",
+ .insns = {
+ /* ctx->optlen = 65 */
+ BPF_MOV64_IMM(BPF_REG_0, 65),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+
+ .error = EFAULT_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: deny arbitrary ctx->retval",
+ .insns = {
+ /* ctx->retval = 123 */
+ BPF_MOV64_IMM(BPF_REG_0, 123),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+
+ .error = EFAULT_GETSOCKOPT,
+ },
+ {
+ .descr = "getsockopt: support smaller ctx->optlen",
+ .insns = {
+ /* ctx->optlen = 32 */
+ BPF_MOV64_IMM(BPF_REG_0, 32),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_optlen = 64,
+ .get_optlen_ret = 32,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optval",
+ .insns = {
+ /* ctx->optval = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: deny writing to ctx->optval_end",
+ .insns = {
+ /* ctx->optval_end = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval_end)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "getsockopt: rewrite value",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r2 = ctx->optval */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ /* r6 = ctx->optval + 1 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+ /* r7 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+ /* ctx->optval[0] = 0xF0 */
+ BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 0xF0),
+ /* } */
+
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1*/
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_GETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .get_optname = IP_TOS,
+
+ .get_optval = { 0xF0 },
+ .get_optlen = 1,
+ },
+
+ /* ==================== setsockopt ==================== */
+
+ {
+ .descr = "setsockopt: no expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = 0,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: wrong expected_attach_type",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_GETSOCKOPT,
+ .error = DENY_ATTACH,
+ },
+ {
+ .descr = "setsockopt: bypass bpf hook",
+ .insns = {
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: return EPERM from bpf hook",
+ .insns = {
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_level = SOL_IP,
+ .set_optname = IP_TOS,
+
+ .set_optlen = 1,
+ .error = EPERM_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: no optval bounds check, deny loading",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+
+ /* r0 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: read ctx->level",
+ .insns = {
+ /* r6 = ctx->level */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, level)),
+
+ /* if (ctx->level == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_level = 123,
+
+ .set_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->level",
+ .insns = {
+ /* ctx->level = SOL_IP */
+ BPF_MOV64_IMM(BPF_REG_0, SOL_IP),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, level)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = 234, /* should be rewritten to SOL_IP */
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: read ctx->optname",
+ .insns = {
+ /* r6 = ctx->optname */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optname)),
+
+ /* if (ctx->optname == 123) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 123, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optname = 123,
+
+ .set_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->optname",
+ .insns = {
+ /* ctx->optname = IP_TOS */
+ BPF_MOV64_IMM(BPF_REG_0, IP_TOS),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optname)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = 456, /* should be rewritten to IP_TOS */
+
+ .set_optval = { 1 << 3 },
+ .set_optlen = 1,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: read ctx->optlen",
+ .insns = {
+ /* r6 = ctx->optlen */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* if (ctx->optlen == 64) { */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 64, 4),
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } else { */
+ /* return 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+ },
+ {
+ .descr = "setsockopt: ctx->optlen == -1 is ok",
+ .insns = {
+ /* ctx->optlen = -1 */
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+ },
+ {
+ .descr = "setsockopt: deny ctx->optlen < 0 (except -1)",
+ .insns = {
+ /* ctx->optlen = -2 */
+ BPF_MOV64_IMM(BPF_REG_0, -2),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 4,
+
+ .error = EFAULT_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: deny ctx->optlen > input optlen",
+ .insns = {
+ /* ctx->optlen = 65 */
+ BPF_MOV64_IMM(BPF_REG_0, 65),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .set_optlen = 64,
+
+ .error = EFAULT_SETSOCKOPT,
+ },
+ {
+ .descr = "setsockopt: allow changing ctx->optlen within bounds",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r2 = ctx->optval */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ /* r6 = ctx->optval + 1 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+
+ /* r7 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 1),
+ /* ctx->optval[0] = 1 << 3 */
+ BPF_ST_MEM(BPF_B, BPF_REG_2, 0, 1 << 3),
+ /* } */
+
+ /* ctx->optlen = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optlen)),
+
+ /* return 1*/
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 1, 1, 1, 1 },
+ .set_optlen = 4,
+ .get_optval = { 1 << 3 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: deny write ctx->retval",
+ .insns = {
+ /* ctx->retval = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny read ctx->retval",
+ .insns = {
+ /* r6 = ctx->retval */
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, retval)),
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny writing to ctx->optval",
+ .insns = {
+ /* ctx->optval = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: deny writing to ctx->optval_end",
+ .insns = {
+ /* ctx->optval_end = 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sockopt, optval_end)),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .error = DENY_LOAD,
+ },
+ {
+ .descr = "setsockopt: allow IP_TOS <= 128",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r7 = ctx->optval + 1 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1),
+
+ /* r8 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 4),
+
+ /* r9 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_6, 0),
+
+ /* if (ctx->optval[0] <= 128) { */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_9, 128, 2),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } */
+
+ /* } else { */
+ /* return 0 (both failed checks above jump here) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 0x80 },
+ .set_optlen = 1,
+ .get_optval = { 0x80 },
+ .get_optlen = 1,
+ },
+ {
+ .descr = "setsockopt: deny IP_TOS > 128",
+ .insns = {
+ /* r6 = ctx->optval */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval)),
+ /* r7 = ctx->optval + 1 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 1),
+
+ /* r8 = ctx->optval_end */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_1,
+ offsetof(struct bpf_sockopt, optval_end)),
+
+ /* if (ctx->optval + 1 <= ctx->optval_end) { */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 4),
+
+ /* r9 = ctx->optval[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_6, 0),
+
+ /* if (ctx->optval[0] <= 128) { */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_9, 128, 2),
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+ /* } */
+
+ /* } else { */
+ /* return 0 (both failed checks above jump here) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* } */
+
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SETSOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+
+ .get_level = SOL_IP,
+ .set_level = SOL_IP,
+
+ .get_optname = IP_TOS,
+ .set_optname = IP_TOS,
+
+ .set_optval = { 0x81 },
+ .set_optlen = 1,
+ .get_optval = { 0x00 },
+ .get_optlen = 1,
+
+ .error = EPERM_SETSOCKOPT,
+ },
+};
+
+/*
+ * Load a BPF_PROG_TYPE_CGROUP_SOCKOPT program from a raw instruction array.
+ *
+ * @insns: instruction array; its length is taken to be everything up to
+ *         and including the first BPF_EXIT instruction
+ * @expected_attach_type: expected attach type to declare at load time
+ *
+ * Returns whatever bpf_load_program_xattr() returns (negative on failure).
+ * On failure the load log is printed to stderr when 'verbose' is set.
+ */
+static int load_prog(const struct bpf_insn *insns,
+		     enum bpf_attach_type expected_attach_type)
+{
+	struct bpf_load_program_attr attr = {
+		.prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT,
+		.expected_attach_type = expected_attach_type,
+		.insns = insns,
+		.license = "GPL",
+		.log_level = 2,
+	};
+	size_t exit_idx = 0;
+	int prog_fd;
+
+	/* locate the terminating exit instruction */
+	while (insns[exit_idx].code != (BPF_JMP | BPF_EXIT))
+		exit_idx++;
+
+	/* the exit instruction itself is part of the program */
+	attr.insns_cnt = exit_idx + 1;
+
+	prog_fd = bpf_load_program_xattr(&attr, bpf_log_buf,
+					 sizeof(bpf_log_buf));
+	if (prog_fd < 0 && verbose)
+		fprintf(stderr, "%s\n", bpf_log_buf);
+
+	return prog_fd;
+}
+
+/*
+ * Execute one table entry.
+ *
+ * Loads test->insns, attaches the program to @cgroup_fd, creates an
+ * AF_INET/SOCK_STREAM socket, then issues the optional setsockopt() and
+ * getsockopt() calls described by @test and compares the results with
+ * the expected ones.
+ *
+ * Returns 0 when the observed behaviour matches test->error, non-zero
+ * otherwise (including the case where an expected failure did not
+ * happen). All resources acquired along the way are released.
+ */
+static int run_test(int cgroup_fd, struct sockopt_test *test)
+{
+	int sock_fd, err, prog_fd;
+	void *optval = NULL;
+	int ret = 0;
+
+	prog_fd = load_prog(test->insns, test->expected_attach_type);
+	if (prog_fd < 0) {
+		if (test->error == DENY_LOAD)
+			return 0;
+
+		log_err("Failed to load BPF program");
+		return -1;
+	}
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, test->attach_type, 0);
+	if (err < 0) {
+		/* nothing is attached, so skip the detach step */
+		if (test->error == DENY_ATTACH)
+			goto close_prog_fd;
+
+		log_err("Failed to attach BPF program");
+		ret = -1;
+		goto close_prog_fd;
+	}
+
+	sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (sock_fd < 0) {
+		log_err("Failed to create AF_INET socket");
+		ret = -1;
+		goto detach_prog;
+	}
+
+	if (test->set_optlen) {
+		err = setsockopt(sock_fd, test->set_level, test->set_optname,
+				 test->set_optval, test->set_optlen);
+		if (err) {
+			/* an expected failure ends the test successfully */
+			if (errno == EPERM && test->error == EPERM_SETSOCKOPT)
+				goto close_sock_fd;
+			if (errno == EFAULT && test->error == EFAULT_SETSOCKOPT)
+				goto free_optval;
+
+			log_err("Failed to call setsockopt");
+			ret = -1;
+			goto close_sock_fd;
+		}
+	}
+
+	if (test->get_optlen) {
+		socklen_t optlen = test->get_optlen;
+		/* get_optlen_ret == 0 means "same length as requested" */
+		socklen_t expected_get_optlen = test->get_optlen_ret ?:
+			test->get_optlen;
+
+		optval = malloc(test->get_optlen);
+		if (!optval) {
+			/* do not hand a NULL buffer to getsockopt()/memcmp() */
+			log_err("Failed to allocate getsockopt buffer");
+			ret = -1;
+			goto close_sock_fd;
+		}
+
+		err = getsockopt(sock_fd, test->get_level, test->get_optname,
+				 optval, &optlen);
+		if (err) {
+			/* an expected failure ends the test successfully */
+			if (errno == EPERM && test->error == EPERM_GETSOCKOPT)
+				goto free_optval;
+			if (errno == EFAULT && test->error == EFAULT_GETSOCKOPT)
+				goto free_optval;
+
+			log_err("Failed to call getsockopt");
+			ret = -1;
+			goto free_optval;
+		}
+
+		if (optlen != expected_get_optlen) {
+			errno = 0;
+			log_err("getsockopt returned unexpected optlen");
+			ret = -1;
+			goto free_optval;
+		}
+
+		if (memcmp(optval, test->get_optval, optlen) != 0) {
+			errno = 0;
+			log_err("getsockopt returned unexpected optval");
+			ret = -1;
+			goto free_optval;
+		}
+	}
+
+	/* reaching this point with an expected error pending is a failure */
+	ret = test->error != OK;
+
+free_optval:
+	free(optval);
+close_sock_fd:
+	close(sock_fd);
+detach_prog:
+	bpf_prog_detach2(prog_fd, cgroup_fd, test->attach_type);
+close_prog_fd:
+	close(prog_fd);
+	return ret;
+}
+
+/*
+ * Test entry point: join the "/sockopt" cgroup and run every entry of
+ * tests[] as its own subtest against it.
+ */
+void test_sockopt(void)
+{
+	int cgroup_fd, i;
+
+	cgroup_fd = test__join_cgroup("/sockopt");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		/* honour the runner's subtest selection instead of ignoring it */
+		if (!test__start_subtest(tests[i].descr))
+			continue;
+
+		CHECK_FAIL(run_test(cgroup_fd, &tests[i]));
+	}
+
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
new file mode 100644
index 000000000..86f97681a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+/* socket option level used for every getsockopt()/setsockopt() in this file */
+#define SOL_CUSTOM 0xdeadbeef
+/*
+ * Option names under SOL_CUSTOM. server_thread() expects the two INHERIT
+ * options to read back as 1 on an accepted socket and the LISTENER option
+ * to read back as 0 there; all three read back as 1 on the listener.
+ */
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+/*
+ * Open a TCP client socket and connect it to the address @server_fd is
+ * bound to.
+ *
+ * Returns the connected socket, or -1 on any failure (the client socket
+ * is closed in that case).
+ */
+static int connect_to_server(int server_fd)
+{
+	struct sockaddr_storage srv_addr;
+	socklen_t addr_len = sizeof(srv_addr);
+	int client;
+
+	client = socket(AF_INET, SOCK_STREAM, 0);
+	if (client < 0) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (getsockname(server_fd, (struct sockaddr *)&srv_addr, &addr_len)) {
+		log_err("Failed to get server addr");
+	} else if (connect(client, (const struct sockaddr *)&srv_addr,
+			   addr_len) < 0) {
+		log_err("Fail to connect to server");
+	} else {
+		return client;
+	}
+
+	/* one of the steps above failed */
+	close(client);
+	return -1;
+}
+
+/*
+ * Read the one-byte SOL_CUSTOM option @optname from @fd and compare it
+ * with @expected. @msg prefixes every diagnostic.
+ *
+ * Returns 0 on a match, 1 when getsockopt() fails or the value differs.
+ */
+static int verify_sockopt(int fd, int optname, const char *msg, char expected)
+{
+	socklen_t value_len = 1;
+	char value = 0;
+
+	if (getsockopt(fd, SOL_CUSTOM, optname, &value, &value_len) != 0) {
+		log_err("%s: failed to call getsockopt", msg);
+		return 1;
+	}
+
+	printf("%s %d: got=0x%x ? expected=0x%x\n", msg, optname, value, expected);
+
+	if (value == expected)
+		return 0;
+
+	log_err("%s: unexpected getsockopt value %d != %d", msg,
+		value, expected);
+	return 1;
+}
+
+/*
+ * Handshake between run_test() and server_thread(): the server thread
+ * signals server_started after its listen() call, and run_test() waits on
+ * it (holding server_started_mtx) before connecting.
+ */
+static pthread_mutex_t server_started_mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t server_started = PTHREAD_COND_INITIALIZER;
+
+/*
+ * Server side of the inheritance test, run in its own thread.
+ *
+ * @arg points to the bound server socket fd. The thread starts listening,
+ * signals server_started, checks the custom options on the listener,
+ * accepts one client and checks the options on the accepted socket.
+ *
+ * Returns the number of failed verify_sockopt() checks cast to a pointer,
+ * or NULL when listen()/accept() fail (those failures are recorded via
+ * CHECK_FAIL).
+ */
+static void *server_thread(void *arg)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd = *(int *)arg;
+	int client_fd;
+	int err = 0;
+
+	err = listen(fd, 1);
+
+	/* signal unconditionally so a waiter is released even if listen() failed */
+	pthread_mutex_lock(&server_started_mtx);
+	pthread_cond_signal(&server_started);
+	pthread_mutex_unlock(&server_started_mtx);
+
+	if (CHECK_FAIL(err < 0)) {
+		perror("Failed to listen on socket");
+		return NULL;
+	}
+
+	/* on the listener all three options are expected to read back as 1 */
+	err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
+	err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
+	err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
+
+	client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+	if (CHECK_FAIL(client_fd < 0)) {
+		perror("Failed to accept client");
+		return NULL;
+	}
+
+	/* on the accepted socket only the INHERIT options are expected set */
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
+	err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
+	err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
+
+	close(client_fd);
+
+	return (void *)(long)err;
+}
+
+/*
+ * Create a TCP socket, set each SOL_CUSTOM option from CUSTOM_INHERIT1
+ * through CUSTOM_LISTENER to 0x01 on it, and bind it to the IPv4
+ * loopback address (port left zero in the address structure).
+ *
+ * Returns the bound socket, or -1 on failure. The socket is not put
+ * into the listening state here.
+ */
+static int start_server(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+	};
+	int optname;
+	char value;
+	int fd;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	optname = CUSTOM_INHERIT1;
+	while (optname <= CUSTOM_LISTENER) {
+		value = 0x01;
+		if (setsockopt(fd, SOL_CUSTOM, optname, &value, 1)) {
+			log_err("Failed to call setsockopt(%d)", optname);
+			goto fail;
+		}
+		optname++;
+	}
+
+	if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		log_err("Failed to bind socket");
+		goto fail;
+	}
+
+	return fd;
+
+fail:
+	close(fd);
+	return -1;
+}
+
+/*
+ * Attach the program found in @obj under section title @title to
+ * @cgroup_fd, using the attach type libbpf derives from the title and
+ * no attach flags.
+ *
+ * Returns 0 on success, -1 (after logging) on any failure.
+ */
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+	enum bpf_attach_type attach_type;
+	enum bpf_prog_type prog_type;
+	struct bpf_program *prog;
+
+	if (libbpf_prog_type_by_name(title, &prog_type, &attach_type)) {
+		log_err("Failed to deduct types for %s BPF program", title);
+		return -1;
+	}
+
+	prog = bpf_object__find_program_by_title(obj, title);
+	if (prog == NULL) {
+		log_err("Failed to find %s BPF program", title);
+		return -1;
+	}
+
+	if (bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+			    attach_type, 0)) {
+		log_err("Failed to attach %s BPF program", title);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Body of the sockopt_inherit test.
+ *
+ * Loads ./sockopt_inherit.o, attaches its getsockopt and setsockopt
+ * programs to @cgroup_fd, starts a server thread, connects a client to
+ * it and verifies the custom options on the client socket. The server
+ * thread's own verification result is collected through pthread_join().
+ * Failures are recorded with CHECK_FAIL; nothing is returned.
+ */
+static void run_test(int cgroup_fd)
+{
+ struct bpf_prog_load_attr attr = {
+ .file = "./sockopt_inherit.o",
+ };
+ int server_fd = -1, client_fd;
+ struct bpf_object *obj;
+ void *server_err;
+ pthread_t tid;
+ int ignored;
+ int err;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+ if (CHECK_FAIL(err))
+ goto close_bpf_object;
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+ if (CHECK_FAIL(err))
+ goto close_bpf_object;
+
+ server_fd = start_server();
+ if (CHECK_FAIL(server_fd < 0))
+ goto close_bpf_object;
+
+ /*
+  * Take the mutex before creating the thread so the thread cannot
+  * signal server_started until pthread_cond_wait() has released it.
+  */
+ pthread_mutex_lock(&server_started_mtx);
+ if (CHECK_FAIL(pthread_create(&tid, NULL, server_thread,
+ (void *)&server_fd))) {
+ pthread_mutex_unlock(&server_started_mtx);
+ goto close_server_fd;
+ }
+ /* NOTE(review): waits without a predicate loop; confirm a spurious wakeup is acceptable here */
+ pthread_cond_wait(&server_started, &server_started_mtx);
+ pthread_mutex_unlock(&server_started_mtx);
+
+ client_fd = connect_to_server(server_fd);
+ /* NOTE(review): this exit path does not join the server thread */
+ if (CHECK_FAIL(client_fd < 0))
+ goto close_server_fd;
+
+ /* all three custom options are expected to read back as 0 on the client */
+ CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0));
+ CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0));
+ CHECK_FAIL(verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0));
+
+ pthread_join(tid, &server_err);
+
+ /* the thread returns its failure count cast to a pointer */
+ err = (int)(long)server_err;
+ CHECK_FAIL(err);
+
+ close(client_fd);
+
+close_server_fd:
+ close(server_fd);
+close_bpf_object:
+ bpf_object__close(obj);
+}
+
+void test_sockopt_inherit(void)
+{
+ int cgroup_fd;
+
+ cgroup_fd = test__join_cgroup("/sockopt_inherit");
+ if (CHECK_FAIL(cgroup_fd < 0))
+ return;
+
+ run_test(cgroup_fd);
+ close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
new file mode 100644
index 000000000..51fac975b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_multi.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+/*
+ * Attach the program stored under section @title in @obj to the cgroup
+ * referred to by @cgroup_fd, using BPF_F_ALLOW_MULTI. The attach type is
+ * looked up from @title.
+ *
+ * Returns 0 on success; logs the failing step and returns -1 otherwise.
+ */
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+	enum bpf_attach_type atype;
+	enum bpf_prog_type ptype;
+	struct bpf_program *bpf_prog;
+
+	if (libbpf_prog_type_by_name(title, &ptype, &atype)) {
+		log_err("Failed to deduct types for %s BPF program", title);
+		return -1;
+	}
+
+	bpf_prog = bpf_object__find_program_by_title(obj, title);
+	if (bpf_prog == NULL) {
+		log_err("Failed to find %s BPF program", title);
+		return -1;
+	}
+
+	if (bpf_prog_attach(bpf_program__fd(bpf_prog), cgroup_fd, atype,
+			    BPF_F_ALLOW_MULTI)) {
+		log_err("Failed to attach %s BPF program", title);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Detach the program stored under section @title in @obj from the cgroup
+ * referred to by @cgroup_fd. Nothing is logged here; callers decide whether
+ * a failure matters.
+ *
+ * Returns 0 on success, -1 if any step fails.
+ */
+static int prog_detach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+	enum bpf_attach_type atype;
+	enum bpf_prog_type ptype;
+	struct bpf_program *bpf_prog;
+
+	if (libbpf_prog_type_by_name(title, &ptype, &atype))
+		return -1;
+
+	bpf_prog = bpf_object__find_program_by_title(obj, title);
+	if (bpf_prog == NULL)
+		return -1;
+
+	if (bpf_prog_detach2(bpf_program__fd(bpf_prog), cgroup_fd, atype))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Exercise the cgroup/getsockopt/child and cgroup/getsockopt/parent programs
+ * of @obj on IP_TOS of @sock_fd:
+ *  - no program attached: the value that was set is read back unchanged;
+ *  - child attached to @cg_child: 0x80 is rewritten to 0x90;
+ *  - parent attached to @cg_parent as well: 0x90 is rewritten to 0xA0;
+ *  - an initial value the programs do not expect makes getsockopt() fail,
+ *    with both programs attached and with only the parent attached;
+ *  - with only the parent attached, 0x90 is rewritten to 0xA0.
+ *
+ * Both programs are detached before returning; the result of that cleanup
+ * is ignored because a program may not be attached on an early exit.
+ *
+ * Returns 0 when every step behaved as expected, non-zero otherwise.
+ */
+static int run_getsockopt_test(struct bpf_object *obj, int cg_parent,
+			       int cg_child, int sock_fd)
+{
+	socklen_t optlen;
+	__u8 buf;
+	int err;
+
+	/* Set IP_TOS to the expected value (0x80). */
+
+	buf = 0x80;
+	err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+	if (err < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	if (buf != 0x80) {
+		log_err("Unexpected getsockopt 0x%x != 0x80 without BPF", buf);
+		err = -1;
+		goto detach;
+	}
+
+	/* Attach child program and make sure it returns new value:
+	 * - kernel:      -> 0x80
+	 * - child:  0x80 -> 0x90
+	 */
+
+	err = prog_attach(obj, cg_child, "cgroup/getsockopt/child");
+	if (err)
+		goto detach;
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	if (buf != 0x90) {
+		log_err("Unexpected getsockopt 0x%x != 0x90", buf);
+		err = -1;
+		goto detach;
+	}
+
+	/* Attach parent program and make sure it returns new value:
+	 * - kernel:      -> 0x80
+	 * - child:  0x80 -> 0x90
+	 * - parent: 0x90 -> 0xA0
+	 */
+
+	err = prog_attach(obj, cg_parent, "cgroup/getsockopt/parent");
+	if (err)
+		goto detach;
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	if (buf != 0xA0) {
+		log_err("Unexpected getsockopt 0x%x != 0xA0", buf);
+		err = -1;
+		goto detach;
+	}
+
+	/* Setting unexpected initial sockopt should return EPERM:
+	 * - kernel: -> 0x40
+	 * - child:  unexpected 0x40, EPERM
+	 * - parent: unexpected 0x40, EPERM
+	 */
+
+	buf = 0x40;
+	err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+	if (err < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (!err) {
+		log_err("Unexpected success from getsockopt(IP_TOS)");
+		/* err is 0 here; report the failure instead of success. */
+		err = -1;
+		goto detach;
+	}
+
+	/* Detach child program and make sure we still get EPERM:
+	 * - kernel: -> 0x40
+	 * - parent: unexpected 0x40, EPERM
+	 */
+
+	err = prog_detach(obj, cg_child, "cgroup/getsockopt/child");
+	if (err) {
+		log_err("Failed to detach child program");
+		goto detach;
+	}
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (!err) {
+		log_err("Unexpected success from getsockopt(IP_TOS)");
+		/* err is 0 here; report the failure instead of success. */
+		err = -1;
+		goto detach;
+	}
+
+	/* Set initial value to the one the parent program expects:
+	 * - kernel:      -> 0x90
+	 * - parent: 0x90 -> 0xA0
+	 */
+
+	buf = 0x90;
+	err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1);
+	if (err < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	buf = 0x00;
+	optlen = 1;
+	err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto detach;
+	}
+
+	if (buf != 0xA0) {
+		log_err("Unexpected getsockopt 0x%x != 0xA0", buf);
+		err = -1;
+		goto detach;
+	}
+
+detach:
+	/* Best-effort cleanup: either program may not be attached here. */
+	prog_detach(obj, cg_child, "cgroup/getsockopt/child");
+	prog_detach(obj, cg_parent, "cgroup/getsockopt/parent");
+
+	return err;
+}
+
+/*
+ * Exercise the cgroup/setsockopt program of @obj on IP_TOS of @sock_fd.
+ * The same program is attached first to @cg_child and then to @cg_parent;
+ * each attached instance is expected to add 0x10 to the value being set.
+ *
+ * The program is detached from both cgroups before returning; the result of
+ * that cleanup is ignored.
+ *
+ * Returns 0 when every step behaved as expected, non-zero otherwise.
+ */
+static int run_setsockopt_test(struct bpf_object *obj, int cg_parent,
+			       int cg_child, int sock_fd)
+{
+	socklen_t len;
+	__u8 tos;
+	int ret;
+
+	/* Baseline without BPF: 0x80 must be read back unchanged. */
+
+	tos = 0x80;
+	ret = setsockopt(sock_fd, SOL_IP, IP_TOS, &tos, 1);
+	if (ret < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	tos = 0x00;
+	len = 1;
+	ret = getsockopt(sock_fd, SOL_IP, IP_TOS, &tos, &len);
+	if (ret) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	if (tos != 0x80) {
+		log_err("Unexpected getsockopt 0x%x != 0x80 without BPF", tos);
+		ret = -1;
+		goto out_detach;
+	}
+
+	/* One instance (child cgroup): 0x80 is stored as 0x90. */
+
+	ret = prog_attach(obj, cg_child, "cgroup/setsockopt");
+	if (ret)
+		goto out_detach;
+
+	tos = 0x80;
+	ret = setsockopt(sock_fd, SOL_IP, IP_TOS, &tos, 1);
+	if (ret < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	tos = 0x00;
+	len = 1;
+	ret = getsockopt(sock_fd, SOL_IP, IP_TOS, &tos, &len);
+	if (ret) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	if (tos != 0x90) {
+		log_err("Unexpected getsockopt 0x%x != 0x80 + 0x10", tos);
+		ret = -1;
+		goto out_detach;
+	}
+
+	/* Two instances (child and parent): 0x80 is stored as 0xA0. */
+
+	ret = prog_attach(obj, cg_parent, "cgroup/setsockopt");
+	if (ret)
+		goto out_detach;
+
+	tos = 0x80;
+	ret = setsockopt(sock_fd, SOL_IP, IP_TOS, &tos, 1);
+	if (ret < 0) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	tos = 0x00;
+	len = 1;
+	ret = getsockopt(sock_fd, SOL_IP, IP_TOS, &tos, &len);
+	if (ret) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto out_detach;
+	}
+
+	if (tos != 0xA0) {
+		log_err("Unexpected getsockopt 0x%x != 0x80 + 2 * 0x10", tos);
+		ret = -1;
+		goto out_detach;
+	}
+
+out_detach:
+	/* Best-effort cleanup: either instance may not be attached here. */
+	prog_detach(obj, cg_child, "cgroup/setsockopt");
+	prog_detach(obj, cg_parent, "cgroup/setsockopt");
+
+	return ret;
+}
+
+/*
+ * Test entry point: join /parent and /parent/child, load ./sockopt_multi.o,
+ * open a TCP socket and run the getsockopt and setsockopt scenarios on it.
+ * Every resource is released on the way out, whichever step failed.
+ */
+void test_sockopt_multi(void)
+{
+	struct bpf_prog_load_attr load_attr = {
+		.file = "./sockopt_multi.o",
+	};
+	struct bpf_object *obj = NULL;
+	int parent_cg = -1, child_cg = -1;
+	int sk = -1;
+	int unused_prog_fd;
+
+	parent_cg = test__join_cgroup("/parent");
+	if (CHECK_FAIL(parent_cg < 0))
+		goto cleanup;
+
+	child_cg = test__join_cgroup("/parent/child");
+	if (CHECK_FAIL(child_cg < 0))
+		goto cleanup;
+
+	if (CHECK_FAIL(bpf_prog_load_xattr(&load_attr, &obj, &unused_prog_fd)))
+		goto cleanup;
+
+	sk = socket(AF_INET, SOCK_STREAM, 0);
+	if (CHECK_FAIL(sk < 0))
+		goto cleanup;
+
+	/* The second scenario runs even if the first one failed. */
+	CHECK_FAIL(run_getsockopt_test(obj, parent_cg, child_cg, sk));
+	CHECK_FAIL(run_setsockopt_test(obj, parent_cg, child_cg, sk));
+
+cleanup:
+	close(sk);
+	bpf_object__close(obj);
+	close(child_cg);
+	close(parent_cg);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
new file mode 100644
index 000000000..d5b44b135
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+
+#include <linux/tcp.h>
+
+#ifndef SOL_TCP
+#define SOL_TCP IPPROTO_TCP
+#endif
+
+#define SOL_CUSTOM 0xdeadbeef
+
+/*
+ * Run a series of setsockopt()/getsockopt() calls on a fresh TCP socket and
+ * check each result against what the attached sockopt programs are expected
+ * to produce (see the per-section comments below).
+ *
+ * Returns 0 if every check passed, -1 on the first failure. The socket and
+ * the two-page scratch buffer are released on both paths.
+ */
+static int getsetsockopt(void)
+{
+	int fd, err;
+	union {
+		char u8[4];
+		__u32 u32;
+		char cc[16]; /* TCP_CA_NAME_MAX */
+		struct tcp_zerocopy_receive zc;
+	} buf = {};
+	socklen_t optlen;
+	char *big_buf = NULL;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	if (fd < 0) {
+		log_err("Failed to create socket");
+		return -1;
+	}
+
+	/* IP_TOS - BPF bypass */
+
+	optlen = getpagesize() * 2;
+	big_buf = calloc(1, optlen);
+	if (!big_buf) {
+		log_err("Couldn't allocate two pages");
+		goto err;
+	}
+
+	*(int *)big_buf = 0x08;
+	err = setsockopt(fd, SOL_IP, IP_TOS, big_buf, optlen);
+	if (err) {
+		log_err("Failed to call setsockopt(IP_TOS)");
+		goto err;
+	}
+
+	memset(big_buf, 0, optlen);
+	optlen = 1;
+	err = getsockopt(fd, SOL_IP, IP_TOS, big_buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(IP_TOS)");
+		goto err;
+	}
+
+	if (*big_buf != 0x08) {
+		log_err("Unexpected getsockopt(IP_TOS) optval 0x%x != 0x08",
+			(int)*big_buf);
+		goto err;
+	}
+
+	/* IP_TTL - EPERM */
+
+	buf.u8[0] = 1;
+	err = setsockopt(fd, SOL_IP, IP_TTL, &buf, 1);
+	if (!err || errno != EPERM) {
+		log_err("Unexpected success from setsockopt(IP_TTL)");
+		goto err;
+	}
+
+	/* SOL_CUSTOM - handled by BPF */
+
+	buf.u8[0] = 0x01;
+	err = setsockopt(fd, SOL_CUSTOM, 0, &buf, 1);
+	if (err) {
+		log_err("Failed to call setsockopt");
+		goto err;
+	}
+
+	buf.u32 = 0x00;
+	optlen = 4;
+	err = getsockopt(fd, SOL_CUSTOM, 0, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt");
+		goto err;
+	}
+
+	if (optlen != 1) {
+		log_err("Unexpected optlen %d != 1", optlen);
+		goto err;
+	}
+	if (buf.u8[0] != 0x01) {
+		log_err("Unexpected buf[0] 0x%02x != 0x01", buf.u8[0]);
+		goto err;
+	}
+
+	/* IP_FREEBIND - BPF can't access optval past PAGE_SIZE */
+
+	optlen = getpagesize() * 2;
+	memset(big_buf, 0, optlen);
+
+	err = setsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, optlen);
+	if (err != 0) {
+		log_err("Failed to call setsockopt, ret=%d", err);
+		goto err;
+	}
+
+	err = getsockopt(fd, SOL_IP, IP_FREEBIND, big_buf, &optlen);
+	if (err != 0) {
+		log_err("Failed to call getsockopt, ret=%d", err);
+		goto err;
+	}
+
+	if (optlen != 1 || *(__u8 *)big_buf != 0x55) {
+		log_err("Unexpected IP_FREEBIND getsockopt, optlen=%d, optval=0x%x",
+			optlen, *(__u8 *)big_buf);
+		/* A mismatch is a test failure, not just a log line. */
+		goto err;
+	}
+
+	/* SO_SNDBUF is overwritten */
+
+	buf.u32 = 0x01010101;
+	err = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf, 4);
+	if (err) {
+		log_err("Failed to call setsockopt(SO_SNDBUF)");
+		goto err;
+	}
+
+	buf.u32 = 0x00;
+	optlen = 4;
+	err = getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(SO_SNDBUF)");
+		goto err;
+	}
+
+	if (buf.u32 != 0x55AA*2) {
+		log_err("Unexpected getsockopt(SO_SNDBUF) 0x%x != 0x55AA*2",
+			buf.u32);
+		goto err;
+	}
+
+	/* TCP_CONGESTION can extend the string */
+
+	strcpy(buf.cc, "nv");
+	err = setsockopt(fd, SOL_TCP, TCP_CONGESTION, &buf, strlen("nv"));
+	if (err) {
+		log_err("Failed to call setsockopt(TCP_CONGESTION)");
+		goto err;
+	}
+
+
+	optlen = sizeof(buf.cc);
+	err = getsockopt(fd, SOL_TCP, TCP_CONGESTION, &buf, &optlen);
+	if (err) {
+		log_err("Failed to call getsockopt(TCP_CONGESTION)");
+		goto err;
+	}
+
+	if (strcmp(buf.cc, "cubic") != 0) {
+		log_err("Unexpected getsockopt(TCP_CONGESTION) %s != %s",
+			buf.cc, "cubic");
+		goto err;
+	}
+
+	/* TCP_ZEROCOPY_RECEIVE triggers */
+	memset(&buf, 0, sizeof(buf));
+	optlen = sizeof(buf.zc);
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (err) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
+	memset(&buf, 0, sizeof(buf));
+	buf.zc.address = 12345; /* rejected by BPF */
+	optlen = sizeof(buf.zc);
+	/* Clear errno so a stale EPERM cannot satisfy the check below. */
+	errno = 0;
+	err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen);
+	if (errno != EPERM) {
+		log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d",
+			err, errno);
+		goto err;
+	}
+
+	free(big_buf);
+	close(fd);
+	return 0;
+err:
+	free(big_buf);
+	close(fd);
+	return -1;
+}
+
+/*
+ * Attach the program stored under section @title in @obj to the cgroup
+ * referred to by @cgroup_fd, with no attach flags. The attach type is
+ * looked up from @title.
+ *
+ * Returns 0 on success; logs the failing step and returns -1 otherwise.
+ */
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+	enum bpf_attach_type atype;
+	enum bpf_prog_type ptype;
+	struct bpf_program *bpf_prog;
+	int prog_fd;
+
+	if (libbpf_prog_type_by_name(title, &ptype, &atype) != 0) {
+		log_err("Failed to deduct types for %s BPF program", title);
+		return -1;
+	}
+
+	bpf_prog = bpf_object__find_program_by_title(obj, title);
+	if (bpf_prog == NULL) {
+		log_err("Failed to find %s BPF program", title);
+		return -1;
+	}
+
+	prog_fd = bpf_program__fd(bpf_prog);
+	if (bpf_prog_attach(prog_fd, cgroup_fd, atype, 0) != 0) {
+		log_err("Failed to attach %s BPF program", title);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Load ./sockopt_sk.o, attach its cgroup/getsockopt and cgroup/setsockopt
+ * programs to @cgroup_fd and, if both attached, run getsetsockopt().
+ * Failures are recorded through CHECK_FAIL(); the object is closed whenever
+ * it was loaded.
+ */
+static void run_test(int cgroup_fd)
+{
+	struct bpf_prog_load_attr load_attr = {
+		.file = "./sockopt_sk.o",
+	};
+	struct bpf_object *obj;
+	int unused_prog_fd;
+
+	if (CHECK_FAIL(bpf_prog_load_xattr(&load_attr, &obj, &unused_prog_fd)))
+		return;
+
+	/* Short-circuit keeps the original order and stops at the first failure. */
+	if (!CHECK_FAIL(prog_attach(obj, cgroup_fd, "cgroup/getsockopt")) &&
+	    !CHECK_FAIL(prog_attach(obj, cgroup_fd, "cgroup/setsockopt")))
+		CHECK_FAIL(getsetsockopt());
+
+	bpf_object__close(obj);
+}
+
+/*
+ * Test entry point: join the /sockopt_sk cgroup and run the test against
+ * it. The cgroup fd is closed once the test has finished.
+ */
+void test_sockopt_sk(void)
+{
+	int cg = test__join_cgroup("/sockopt_sk");
+
+	if (!CHECK_FAIL(cg < 0)) {
+		run_test(cg);
+		close(cg);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/spinlock.c b/tools/testing/selftests/bpf/prog_tests/spinlock.c
new file mode 100644
index 000000000..7577a77a4
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/spinlock.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/*
+ * Thread body: run the program whose fd @arg points to 10000 times on
+ * pkt_v4 and record a failure if the run errors out or returns non-zero.
+ * The thread exits with @arg itself as its result.
+ */
+static void *spin_lock_thread(void *arg)
+{
+	int prog_fd = *(u32 *)arg;
+	__u32 retval;
+	__u32 duration;
+	int rc;
+
+	rc = bpf_prog_test_run(prog_fd, 10000, &pkt_v4, sizeof(pkt_v4),
+			       NULL, NULL, &retval, &duration);
+	CHECK(rc != 0 || retval != 0, "",
+	      "err %d errno %d retval %d duration %d\n",
+	      rc, errno, retval, duration);
+	pthread_exit(arg);
+}
+
+/*
+ * Test entry point: load ./test_spin_lock.o and run it concurrently from
+ * four threads, each of which is expected to exit with the pointer it was
+ * given (&prog_fd).
+ *
+ * Every thread that was started is joined before the object is closed: the
+ * threads dereference &prog_fd, which lives in this stack frame, so
+ * returning while one is still running would leave it with a dangling
+ * pointer and a closed program.
+ */
+void test_spinlock(void)
+{
+	const char *file = "./test_spin_lock.o";
+	pthread_t thread_id[4];
+	struct bpf_object *obj = NULL;
+	int nr_started = 0;
+	int prog_fd;
+	int err = 0, i;
+	void *ret;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_CGROUP_SKB, &obj, &prog_fd);
+	if (CHECK_FAIL(err)) {
+		printf("test_spin_lock:bpf_prog_load errno %d\n", errno);
+		goto close_prog;
+	}
+
+	/* Stop creating threads at the first failure, but remember how many run. */
+	for (i = 0; i < 4; i++) {
+		if (CHECK_FAIL(pthread_create(&thread_id[i], NULL,
+					      &spin_lock_thread, &prog_fd)))
+			break;
+		nr_started++;
+	}
+
+	/* Join all started threads even if an earlier join reported a failure. */
+	for (i = 0; i < nr_started; i++)
+		CHECK_FAIL(pthread_join(thread_id[i], &ret) ||
+			   ret != (void *)&prog_fd);
+
+close_prog:
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
new file mode 100644
index 000000000..e8399ae50
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_stacktrace_build_id.skel.h"
+
+/*
+ * Load and attach the test_stacktrace_build_id skeleton, run ./urandom_read,
+ * then check that stackid_hmap and stackmap hold the same keys and that at
+ * least one stack frame in stackmap carries a build ID found in the text
+ * returned by extract_build_id(). If no build ID matches, the whole sequence
+ * is repeated once with a fresh skeleton. Finally stackmap is compared with
+ * stack_amap through compare_stack_ips().
+ */
+void test_stacktrace_build_id(void)
+{
+
+ int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+ struct test_stacktrace_build_id *skel;
+ int err, stack_trace_len;
+ /* duration is not named below; presumably CHECK() uses it - confirm */
+ __u32 key, previous_key, val, duration = 0;
+ char buf[256];
+ int i, j;
+ struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+ int build_id_matches = 0;
+ int retry = 1; /* number of retries still allowed */
+
+retry:
+ skel = test_stacktrace_build_id__open_and_load();
+ if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n"))
+ return;
+
+ err = test_stacktrace_build_id__attach(skel);
+ if (CHECK(err, "attach_tp", "err %d\n", err))
+ goto cleanup;
+
+ /* find map fds */
+ control_map_fd = bpf_map__fd(skel->maps.control_map);
+ stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap);
+ stackmap_fd = bpf_map__fd(skel->maps.stackmap);
+ stack_amap_fd = bpf_map__fd(skel->maps.stack_amap);
+
+ if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+ goto cleanup;
+ if (CHECK_FAIL(system("./urandom_read")))
+ goto cleanup;
+ /* disable stack trace collection */
+ key = 0;
+ val = 1;
+ /* NOTE(review): the result of this update is not checked. */
+ bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+ /* for every element in stackid_hmap, we can find a corresponding one
+ * in stackmap, and vice versa.
+ */
+ err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+ if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+ if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = extract_build_id(buf, 256);
+
+ if (CHECK(err, "get build_id with readelf",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+ if (CHECK(err, "get_next_key from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto cleanup;
+
+ /* Walk every key in stackmap and look for a matching build ID. */
+ do {
+ char build_id[64];
+
+ err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+ if (CHECK(err, "lookup_elem from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto cleanup;
+ for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+ if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+ id_offs[i].offset != 0) {
+ /* hex-encode 20 build ID bytes: 40 chars plus NUL */
+ for (j = 0; j < 20; ++j)
+ sprintf(build_id + 2 * j, "%02x",
+ id_offs[i].build_id[j] & 0xff);
+ if (strstr(buf, build_id) != NULL)
+ build_id_matches = 1;
+ }
+ previous_key = key;
+ } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+ /* stack_map_get_build_id_offset() is racy and sometimes can return
+ * BPF_STACK_BUILD_ID_IP instead of BPF_STACK_BUILD_ID_VALID;
+ * try it one more time.
+ */
+ if (build_id_matches < 1 && retry--) {
+ /* the retry path opens a new skeleton, so drop this one first */
+ test_stacktrace_build_id__destroy(skel);
+ printf("%s:WARN:Didn't find expected build ID from the map, retrying\n",
+ __func__);
+ goto retry;
+ }
+
+ if (CHECK(build_id_matches < 1, "build id match",
+ "Didn't find expected build ID from the map\n"))
+ goto cleanup;
+
+ stack_trace_len = PERF_MAX_STACK_DEPTH *
+ sizeof(struct bpf_stack_build_id);
+ err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+ CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+ "err %d errno %d\n", err, errno);
+
+cleanup:
+ test_stacktrace_build_id__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
new file mode 100644
index 000000000..11a769e18
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_build_id_nmi.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "test_stacktrace_build_id.skel.h"
+
+/*
+ * Read /proc/sys/kernel/perf_event_max_sample_rate.
+ *
+ * Returns the value parsed from the file. If the file cannot be opened the
+ * default of 5000 is returned silently; a parse failure is reported through
+ * CHECK().
+ */
+static __u64 read_perf_max_sample_freq(void)
+{
+	FILE *fp = fopen("/proc/sys/kernel/perf_event_max_sample_rate", "r");
+	__u64 max_rate = 5000; /* default used when the file is unavailable */
+	__u32 duration = 0;
+
+	if (!fp)
+		return max_rate;
+
+	CHECK(fscanf(fp, "%llu", &max_rate) != 1, "Get max sample rate",
+	      "return default value: 5000,err %d\n", -errno);
+	fclose(fp);
+
+	return max_rate;
+}
+
+/*
+ * Same check as the non-NMI build ID test, but the oncpu program is switched
+ * to a perf_event program and attached to a PERF_COUNT_HW_CPU_CYCLES event
+ * opened on CPU 0. The test is skipped when perf_event_open() fails with
+ * ENOENT. If no build ID matches, the whole sequence is repeated once with a
+ * fresh skeleton.
+ */
+void test_stacktrace_build_id_nmi(void)
+{
+ int control_map_fd, stackid_hmap_fd, stackmap_fd;
+ struct test_stacktrace_build_id *skel;
+ int err, pmu_fd;
+ struct perf_event_attr attr = {
+ .freq = 1,
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ };
+ /* duration is not named below; presumably CHECK() uses it - confirm */
+ __u32 key, previous_key, val, duration = 0;
+ char buf[256];
+ int i, j;
+ struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+ int build_id_matches = 0;
+ int retry = 1; /* number of retries still allowed */
+
+ attr.sample_freq = read_perf_max_sample_freq();
+
+retry:
+ skel = test_stacktrace_build_id__open();
+ if (CHECK(!skel, "skel_open", "skeleton open failed\n"))
+ return;
+
+ /* override program type */
+ bpf_program__set_perf_event(skel->progs.oncpu);
+
+ err = test_stacktrace_build_id__load(skel);
+ if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err))
+ goto cleanup;
+
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ if (pmu_fd < 0 && errno == ENOENT) {
+ printf("%s:SKIP:no PERF_COUNT_HW_CPU_CYCLES\n", __func__);
+ test__skip();
+ goto cleanup;
+ }
+ if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n",
+ pmu_fd, errno))
+ goto cleanup;
+
+ /*
+ * NOTE(review): pmu_fd is closed here only when the attach fails;
+ * presumably the link owns it after a successful attach - confirm
+ * against libbpf.
+ */
+ skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu,
+ pmu_fd);
+ if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event",
+ "err %ld\n", PTR_ERR(skel->links.oncpu))) {
+ close(pmu_fd);
+ goto cleanup;
+ }
+
+ /* find map fds */
+ control_map_fd = bpf_map__fd(skel->maps.control_map);
+ stackid_hmap_fd = bpf_map__fd(skel->maps.stackid_hmap);
+ stackmap_fd = bpf_map__fd(skel->maps.stackmap);
+
+ if (CHECK_FAIL(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")))
+ goto cleanup;
+ /* taskset 0x1 keeps the workload on CPU 0, where the event was opened */
+ if (CHECK_FAIL(system("taskset 0x1 ./urandom_read 100000")))
+ goto cleanup;
+ /* disable stack trace collection */
+ key = 0;
+ val = 1;
+ /* NOTE(review): the result of this update is not checked. */
+ bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+ /* for every element in stackid_hmap, we can find a corresponding one
+ * in stackmap, and vice versa.
+ */
+ err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+ if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+ if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = extract_build_id(buf, 256);
+
+ if (CHECK(err, "get build_id with readelf",
+ "err %d errno %d\n", err, errno))
+ goto cleanup;
+
+ err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+ if (CHECK(err, "get_next_key from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto cleanup;
+
+ /* Walk every key in stackmap and look for a matching build ID. */
+ do {
+ char build_id[64];
+
+ err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+ if (CHECK(err, "lookup_elem from stackmap",
+ "err %d, errno %d\n", err, errno))
+ goto cleanup;
+ for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+ if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+ id_offs[i].offset != 0) {
+ /* hex-encode 20 build ID bytes: 40 chars plus NUL */
+ for (j = 0; j < 20; ++j)
+ sprintf(build_id + 2 * j, "%02x",
+ id_offs[i].build_id[j] & 0xff);
+ if (strstr(buf, build_id) != NULL)
+ build_id_matches = 1;
+ }
+ previous_key = key;
+ } while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+ /* stack_map_get_build_id_offset() is racy and sometimes can return
+ * BPF_STACK_BUILD_ID_IP instead of BPF_STACK_BUILD_ID_VALID;
+ * try it one more time.
+ */
+ if (build_id_matches < 1 && retry--) {
+ /* the retry path opens a new skeleton, so drop this one first */
+ test_stacktrace_build_id__destroy(skel);
+ printf("%s:WARN:Didn't find expected build ID from the map, retrying\n",
+ __func__);
+ goto retry;
+ }
+
+ if (CHECK(build_id_matches < 1, "build id match",
+ "Didn't find expected build ID from the map\n"))
+ goto cleanup;
+
+ /*
+ * We intentionally skip compare_stack_ips(). This is because we
+ * only support one in_nmi() ips-to-build_id translation per cpu
+ * at any time, thus stack_amap here will always fallback to
+ * BPF_STACK_BUILD_ID_IP;
+ */
+
+cleanup:
+ test_stacktrace_build_id__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
new file mode 100644
index 000000000..37269d23d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/*
+ * Test entry point: load ./test_stacktrace_map.o as a tracepoint program,
+ * attach it to sched/sched_switch, let it run for a second, stop collection
+ * through control_map and then check that stackid_hmap and stackmap hold the
+ * same keys and that stackmap and stack_amap pass compare_stack_ips().
+ */
+void test_stacktrace_map(void)
+{
+	const char *sec_name = "tracepoint/sched/sched_switch";
+	const char *obj_path = "./test_stacktrace_map.o";
+	int control_fd, hmap_fd, smap_fd, amap_fd;
+	int prog_fd, trace_bytes, ret;
+	__u32 key = 0, val = 1, duration = 0;
+	struct bpf_program *tp_prog;
+	struct bpf_link *tp_link;
+	struct bpf_object *obj;
+
+	ret = bpf_prog_load(obj_path, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(ret, "prog_load", "err %d errno %d\n", ret, errno))
+		return;
+
+	tp_prog = bpf_object__find_program_by_title(obj, sec_name);
+	if (CHECK(!tp_prog, "find_prog", "prog '%s' not found\n", sec_name))
+		goto close_obj;
+
+	tp_link = bpf_program__attach_tracepoint(tp_prog, "sched", "sched_switch");
+	if (CHECK(IS_ERR(tp_link), "attach_tp", "err %ld\n", PTR_ERR(tp_link)))
+		goto close_obj;
+
+	/* Look up the four maps the program writes to. */
+	control_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK_FAIL(control_fd < 0))
+		goto destroy_link;
+
+	hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK_FAIL(hmap_fd < 0))
+		goto destroy_link;
+
+	smap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK_FAIL(smap_fd < 0))
+		goto destroy_link;
+
+	amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK_FAIL(amap_fd < 0))
+		goto destroy_link;
+
+	/* Let the program see some context switches. */
+	sleep(1);
+
+	/* Writing 1 at key 0 of control_map stops stack trace collection. */
+	bpf_map_update_elem(control_fd, &key, &val, 0);
+
+	/* Each key of stackid_hmap must exist in stackmap, and vice versa. */
+	ret = compare_map_keys(hmap_fd, smap_fd);
+	if (CHECK(ret, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", ret, errno))
+		goto destroy_link;
+
+	ret = compare_map_keys(smap_fd, hmap_fd);
+	if (CHECK(ret, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", ret, errno))
+		goto destroy_link;
+
+	trace_bytes = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	ret = compare_stack_ips(smap_fd, amap_fd, trace_bytes);
+	CHECK(ret, "compare_stack_ips stackmap vs. stack_amap",
+	      "err %d errno %d\n", ret, errno);
+
+destroy_link:
+	bpf_link__destroy(tp_link);
+close_obj:
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
new file mode 100644
index 000000000..404a5498e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/stacktrace_map_raw_tp.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/*
+ * Test entry point: load ./test_stacktrace_map.o as a raw tracepoint
+ * program, attach it to sched_switch, let it run for a second, stop
+ * collection through control_map and then check that stackid_hmap and
+ * stackmap hold the same keys.
+ */
+void test_stacktrace_map_raw_tp(void)
+{
+	const char *sec_name = "tracepoint/sched/sched_switch";
+	const char *obj_path = "./test_stacktrace_map.o";
+	int control_fd, hmap_fd, smap_fd;
+	int prog_fd, ret;
+	__u32 key = 0, val = 1, duration = 0;
+	struct bpf_link *tp_link = NULL;
+	struct bpf_program *tp_prog;
+	struct bpf_object *obj;
+
+	ret = bpf_prog_load(obj_path, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(ret, "prog_load raw tp", "err %d errno %d\n", ret, errno))
+		return;
+
+	tp_prog = bpf_object__find_program_by_title(obj, sec_name);
+	if (CHECK(!tp_prog, "find_prog", "prog '%s' not found\n", sec_name))
+		goto out;
+
+	tp_link = bpf_program__attach_raw_tracepoint(tp_prog, "sched_switch");
+	if (CHECK(IS_ERR(tp_link), "attach_raw_tp", "err %ld\n", PTR_ERR(tp_link)))
+		goto out;
+
+	/* Look up the maps the program writes to. */
+	control_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK_FAIL(control_fd < 0))
+		goto out;
+
+	hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK_FAIL(hmap_fd < 0))
+		goto out;
+
+	smap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK_FAIL(smap_fd < 0))
+		goto out;
+
+	/* Let the program see some context switches. */
+	sleep(1);
+
+	/* Writing 1 at key 0 of control_map stops stack trace collection. */
+	bpf_map_update_elem(control_fd, &key, &val, 0);
+
+	/* Each key of stackid_hmap must exist in stackmap, and vice versa. */
+	ret = compare_map_keys(hmap_fd, smap_fd);
+	if (CHECK(ret, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", ret, errno))
+		goto out;
+
+	ret = compare_map_keys(smap_fd, hmap_fd);
+	CHECK(ret, "compare_map_keys stackmap vs. stackid_hmap",
+	      "err %d errno %d\n", ret, errno);
+
+out:
+	/* tp_link is NULL or an error pointer when the attach did not happen. */
+	if (!IS_ERR_OR_NULL(tp_link))
+		bpf_link__destroy(tp_link);
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/subprogs.c b/tools/testing/selftests/bpf/prog_tests/subprogs.c
new file mode 100644
index 000000000..3f3d2ac4d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/subprogs.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include <time.h>
+#include "test_subprogs.skel.h"
+#include "test_subprogs_unused.skel.h"
+
+static int duration;
+
+/*
+ * Test entry point: load and attach the test_subprogs skeleton, give the
+ * programs a chance to run and compare res1..res4 in the skeleton's .bss
+ * with the expected values. Afterwards the test_subprogs_unused skeleton
+ * must open and load as well.
+ */
+void test_subprogs(void)
+{
+	struct test_subprogs_unused *unused_skel;
+	struct test_subprogs *skel;
+	int ret;
+
+	skel = test_subprogs__open_and_load();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+
+	ret = test_subprogs__attach(skel);
+	if (!CHECK(ret, "skel_attach", "failed to attach skeleton: %d\n", ret)) {
+		usleep(1);
+
+		CHECK(skel->bss->res1 != 12, "res1", "got %d, exp %d\n", skel->bss->res1, 12);
+		CHECK(skel->bss->res2 != 17, "res2", "got %d, exp %d\n", skel->bss->res2, 17);
+		CHECK(skel->bss->res3 != 19, "res3", "got %d, exp %d\n", skel->bss->res3, 19);
+		CHECK(skel->bss->res4 != 36, "res4", "got %d, exp %d\n", skel->bss->res4, 36);
+
+		unused_skel = test_subprogs_unused__open_and_load();
+		ASSERT_OK_PTR(unused_skel, "unused_progs_skel");
+		test_subprogs_unused__destroy(unused_skel);
+	}
+
+	test_subprogs__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tailcalls.c b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
new file mode 100644
index 000000000..ee27d68d2
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tailcalls.c
@@ -0,0 +1,819 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* test_tailcall_1 checks basic functionality by patching multiple locations
+ * in a single program for a single tail call slot with nop->jmp, jmp->nop
+ * and jmp->jmp rewrites. Also checks for nop->nop.
+ */
+static void test_tailcall_1(void)
+{
+ int err, map_fd, prog_fd, main_fd, i, j;
+ struct bpf_map *prog_array;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char prog_name[32];
+ char buff[128] = {};
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall1.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* "classifier" is the entry program that every test run below executes. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ /* "jmp_table" is the program array whose slots are filled and emptied. */
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Phase 1: install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Phase 2: with slots 0..i-1 already removed, a run must return i;
+ * then slot i is removed as well.
+ */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != i, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* All slots are empty now: the run is expected to return 3. */
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Phase 3: refill every slot in the same order as phase 1. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* With the table fully populated again the run must return 0. */
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Phase 4: overwrite the populated slots in reverse, i.e. slot i now
+ * holds program "classifier/<max_entries - 1 - i>".
+ */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ j = bpf_map__def(prog_array)->max_entries - 1 - i;
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", j);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Phase 5: same walk as phase 2, but the expected return value is the
+ * reversed index j of the program sitting in slot i.
+ */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ j = bpf_map__def(prog_array)->max_entries - 1 - i;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != j, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Table is empty again: expect 3 as after phase 2. */
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Phase 6: deleting an already-empty slot must fail with ENOENT, and
+ * the run result must stay 3 after each such attempt.
+ */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err >= 0 || errno != ENOENT))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+ }
+
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_2 checks that patching multiple programs for a single
+ * tail call slot works. It also jumps through several programs and tests
+ * the tail call limit counter.
+ */
+static void test_tailcall_2(void)
+{
+ int err, map_fd, prog_fd, main_fd, i;
+ struct bpf_map *prog_array;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char prog_name[32];
+ char buff[128] = {};
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall2.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by every test run below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Fully populated table: expected return value is 2. */
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 2, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Remove slot 2: expected return value drops to 1. */
+ i = 2;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Remove slot 0 as well: expected return value becomes 3. */
+ i = 0;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_3 checks that the count value of the tail call limit
+ * enforcement matches with expectations.
+ */
+static void test_tailcall_3(void)
+{
+ int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+ struct bpf_map *prog_array, *data_map;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char buff[128] = {};
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall3.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Only slot 0 is populated in this test. */
+ prog = bpf_object__find_program_by_title(obj, "classifier/0");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ i = 0;
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* The counter lives in the object's internal "tailcall.bss" map.
+ * Failures from here on must still go through "out" so that obj is
+ * closed.
+ */
+ data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+ if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+ goto out;
+
+ /* Validate the fd that was just fetched, not the jmp_table fd. */
+ data_fd = bpf_map__fd(data_map);
+ if (CHECK_FAIL(data_fd < 0))
+ goto out;
+
+ /* The counter read from element 0 is expected to be 33. */
+ i = 0;
+ err = bpf_map_lookup_elem(data_fd, &i, &val);
+ CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n",
+ err, errno, val);
+
+ /* With slot 0 removed the run must return 0. */
+ i = 0;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_4 checks that the kernel properly selects an indirect jump
+ * for the case where the key is not known. The key is passed via global
+ * data to select different targets whose return values we can compare.
+ */
+static void test_tailcall_4(void)
+{
+ int err, map_fd, prog_fd, main_fd, data_fd, i;
+ struct bpf_map *prog_array, *data_map;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ static const int zero = 0;
+ char buff[128] = {};
+ char prog_name[32];
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall4.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* The selector is written through the internal "tailcall.bss" map.
+ * Failures here must go through "out" so that obj is closed.
+ */
+ data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+ if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+ goto out;
+
+ /* Validate the fd that was just fetched, not the jmp_table fd. */
+ data_fd = bpf_map__fd(data_map);
+ if (CHECK_FAIL(data_fd < 0))
+ goto out;
+
+ /* Install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Store i as the selector in the data map; the run must return i. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != i, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+ }
+
+ /* Select slot i again, but remove it first: the run must return 3. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_map_update_elem(data_fd, &zero, &i, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+ }
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_5 probes similarly to test_tailcall_4 that the kernel generates
+ * an indirect jump when the keys are const but different from different branches.
+ */
+static void test_tailcall_5(void)
+{
+ int err, map_fd, prog_fd, main_fd, data_fd, i, key[] = { 1111, 1234, 5678 };
+ struct bpf_map *prog_array, *data_map;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ static const int zero = 0;
+ char buff[128] = {};
+ char prog_name[32];
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall5.o", BPF_PROG_TYPE_SCHED_CLS, &obj,
+ &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* The selector is written through the internal "tailcall.bss" map.
+ * Failures here must go through "out" so that obj is closed.
+ */
+ data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+ if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+ goto out;
+
+ /* Validate the fd that was just fetched, not the jmp_table fd. */
+ data_fd = bpf_map__fd(data_map);
+ if (CHECK_FAIL(data_fd < 0))
+ goto out;
+
+ /* Install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Store key[i] as the selector; the run must return i.
+ * NOTE(review): key[] has three entries while the loop is bounded by
+ * max_entries -- assumes jmp_table has at most 3 slots, confirm
+ * against the BPF object.
+ */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != i, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+ }
+
+ /* Select key[i] again, but remove slot i first: the run must return 3. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ err = bpf_map_update_elem(data_fd, &zero, &key[i], BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 3, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+ }
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_bpf2bpf_1 makes sure that tailcalls work correctly in
+ * combination with BPF subprograms
+ */
+static void test_tailcall_bpf2bpf_1(void)
+{
+ int err, map_fd, prog_fd, main_fd, i;
+ struct bpf_map *prog_array;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char prog_name[32];
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall_bpf2bpf1.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* nop -> jmp */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Fully populated table, pkt_v4 as input: expected return value is 1. */
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ 0, &retval, &duration);
+ CHECK(err || retval != 1, "tailcall",
+ "err %d errno %d retval %d\n", err, errno, retval);
+
+ /* jmp -> nop, call subprog that will do tailcall */
+ i = 1;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ /* Slot 1 removed: expected return value is 0. */
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ 0, &retval, &duration);
+ CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* make sure that subprog can access ctx and entry prog that
+ * called this subprog can properly return
+ */
+ i = 0;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ /* Slots 0 and 1 removed: expected return value is twice the size of
+ * the input packet.
+ */
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ 0, &retval, &duration);
+ CHECK(err || retval != sizeof(pkt_v4) * 2,
+ "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_bpf2bpf_2 checks that the count value of the tail call limit
+ * enforcement matches with expectations when tailcall is preceded with
+ * bpf2bpf call.
+ */
+static void test_tailcall_bpf2bpf_2(void)
+{
+ int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+ struct bpf_map *prog_array, *data_map;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char buff[128] = {};
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall_bpf2bpf2.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Only slot 0 is populated in this test. */
+ prog = bpf_object__find_program_by_title(obj, "classifier/0");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ i = 0;
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 1, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* The counter lives in the object's internal "tailcall.bss" map.
+ * Failures from here on must still go through "out" so that obj is
+ * closed.
+ */
+ data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+ if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+ goto out;
+
+ /* Validate the fd that was just fetched, not the jmp_table fd. */
+ data_fd = bpf_map__fd(data_map);
+ if (CHECK_FAIL(data_fd < 0))
+ goto out;
+
+ /* The counter read from element 0 is expected to be 33. */
+ i = 0;
+ err = bpf_map_lookup_elem(data_fd, &i, &val);
+ CHECK(err || val != 33, "tailcall count", "err %d errno %d count %d\n",
+ err, errno, val);
+
+ /* With slot 0 removed the run must return 0. */
+ i = 0;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, buff, sizeof(buff), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != 0, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_bpf2bpf_3 checks that non-trivial amount of stack (up to
+ * 256 bytes) can be used within bpf subprograms that have the tailcalls
+ * in them
+ */
+static void test_tailcall_bpf2bpf_3(void)
+{
+ int err, map_fd, prog_fd, main_fd, i;
+ struct bpf_map *prog_array;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char prog_name[32];
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall_bpf2bpf3.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test runs below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Fully populated table: expected return is three times the packet size. */
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != sizeof(pkt_v4) * 3,
+ "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Remove slot 1: expected return is exactly the packet size. */
+ i = 1;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != sizeof(pkt_v4),
+ "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* Remove slot 0 as well: expected return is twice the packet size. */
+ i = 0;
+ err = bpf_map_delete_elem(map_fd, &i);
+ if (CHECK_FAIL(err))
+ goto out;
+
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != sizeof(pkt_v4) * 2,
+ "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+out:
+ bpf_object__close(obj);
+}
+
+/* test_tailcall_bpf2bpf_4 checks that the tailcall counter is correctly
+ * preserved across tailcalls combined with bpf2bpf calls. To make sure that
+ * the tailcall counter behaves correctly, the bpf program goes through the
+ * following flow:
+ *
+ * entry -> entry_subprog -> tailcall0 -> bpf_func0 -> subprog0 ->
+ * -> tailcall1 -> bpf_func1 -> subprog1 -> tailcall2 -> bpf_func2 ->
+ * subprog2 [here bump global counter] --------^
+ *
+ * We go through first two tailcalls and start counting from the subprog2 where
+ * the loop begins. At the end of the test make sure that the global counter is
+ * equal to 31, because tailcall counter includes the first two tailcalls
+ * whereas global counter is incremented only on loop presented on flow above.
+ */
+static void test_tailcall_bpf2bpf_4(void)
+{
+ int err, map_fd, prog_fd, main_fd, data_fd, i, val;
+ struct bpf_map *prog_array, *data_map;
+ struct bpf_program *prog;
+ struct bpf_object *obj;
+ __u32 retval, duration;
+ char prog_name[32];
+
+ /* Nothing is open yet if the load fails, so a plain return is enough. */
+ err = bpf_prog_load("tailcall_bpf2bpf4.o", BPF_PROG_TYPE_SCHED_CLS,
+ &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* Entry program executed by the test run below. */
+ prog = bpf_object__find_program_by_title(obj, "classifier");
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ main_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(main_fd < 0))
+ goto out;
+
+ prog_array = bpf_object__find_map_by_name(obj, "jmp_table");
+ if (CHECK_FAIL(!prog_array))
+ goto out;
+
+ map_fd = bpf_map__fd(prog_array);
+ if (CHECK_FAIL(map_fd < 0))
+ goto out;
+
+ /* Install program "classifier/<i>" into slot i for every slot. */
+ for (i = 0; i < bpf_map__def(prog_array)->max_entries; i++) {
+ snprintf(prog_name, sizeof(prog_name), "classifier/%i", i);
+
+ prog = bpf_object__find_program_by_title(obj, prog_name);
+ if (CHECK_FAIL(!prog))
+ goto out;
+
+ prog_fd = bpf_program__fd(prog);
+ if (CHECK_FAIL(prog_fd < 0))
+ goto out;
+
+ err = bpf_map_update_elem(map_fd, &i, &prog_fd, BPF_ANY);
+ if (CHECK_FAIL(err))
+ goto out;
+ }
+
+ /* Expected return is three times the size of the input packet. */
+ err = bpf_prog_test_run(main_fd, 1, &pkt_v4, sizeof(pkt_v4), 0,
+ &duration, &retval, NULL);
+ CHECK(err || retval != sizeof(pkt_v4) * 3, "tailcall", "err %d errno %d retval %d\n",
+ err, errno, retval);
+
+ /* The counter lives in the object's internal "tailcall.bss" map.
+ * Failures here must still go through "out" so that obj is closed.
+ */
+ data_map = bpf_object__find_map_by_name(obj, "tailcall.bss");
+ if (CHECK_FAIL(!data_map || !bpf_map__is_internal(data_map)))
+ goto out;
+
+ /* Validate the fd that was just fetched, not the jmp_table fd. */
+ data_fd = bpf_map__fd(data_map);
+ if (CHECK_FAIL(data_fd < 0))
+ goto out;
+
+ /* The counter read from element 0 is expected to be 31. */
+ i = 0;
+ err = bpf_map_lookup_elem(data_fd, &i, &val);
+ CHECK(err || val != 31, "tailcall count", "err %d errno %d count %d\n",
+ err, errno, val);
+
+out:
+ bpf_object__close(obj);
+}
+
+/* Entry point: runs each tailcall subtest that test__start_subtest() lets
+ * through, in the order listed in the table.
+ */
+void test_tailcalls(void)
+{
+ static const struct {
+ const char *name;
+ void (*run)(void);
+ } subtests[] = {
+ { "tailcall_1", test_tailcall_1 },
+ { "tailcall_2", test_tailcall_2 },
+ { "tailcall_3", test_tailcall_3 },
+ { "tailcall_4", test_tailcall_4 },
+ { "tailcall_5", test_tailcall_5 },
+ { "tailcall_bpf2bpf_1", test_tailcall_bpf2bpf_1 },
+ { "tailcall_bpf2bpf_2", test_tailcall_bpf2bpf_2 },
+ { "tailcall_bpf2bpf_3", test_tailcall_bpf2bpf_3 },
+ { "tailcall_bpf2bpf_4", test_tailcall_bpf2bpf_4 },
+ };
+ unsigned int idx;
+
+ for (idx = 0; idx < sizeof(subtests) / sizeof(subtests[0]); idx++) {
+ if (test__start_subtest(subtests[idx].name))
+ subtests[idx].run();
+ }
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
new file mode 100644
index 000000000..1bdc1d86a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_rawtp.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/* Attach a raw tracepoint program to "sys_enter" and check what
+ * bpf_task_fd_query() reports for the resulting fd with a full-size buffer,
+ * a zero length, a NULL buffer and a too-small buffer.
+ */
+void test_task_fd_query_rawtp(void)
+{
+ const char *file = "./test_get_stack_rawtp.o";
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ struct bpf_object *obj;
+ int efd, err, prog_fd;
+ __u32 duration = 0;
+ char buf[256];
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+ if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+ return;
+
+ efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+ if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+ goto close_prog;
+
+ /* query (getpid(), efd) */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+ goto close_efd;
+
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ strcmp(buf, "sys_enter") == 0;
+ if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+ goto close_efd;
+
+ /* test zero len: the call must succeed and report the name length */
+ len = 0;
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+ err, errno))
+ goto close_efd;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter");
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_efd;
+
+ /* test empty buffer: same expectation with a NULL buffer */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+ err, errno))
+ goto close_efd;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter");
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_efd;
+
+ /* test smaller buffer: must fail with ENOSPC, leave the truncated,
+ * terminated name "sy" in buf and still report the full length
+ */
+ len = 3;
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 3)",
+ "err %d errno %d\n", err, errno))
+ goto close_efd;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == strlen("sys_enter") &&
+ strcmp(buf, "sy") == 0;
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_efd;
+
+close_efd:
+ /* Release the raw tracepoint fd; it was previously never closed. */
+ close(efd);
+close_prog:
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
new file mode 100644
index 000000000..3f131b8fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/task_fd_query_tp.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/* Attach the program from test_tracepoint.o to one tracepoint through a perf
+ * event and verify that bpf_task_fd_query() on the perf fd reports a
+ * tracepoint fd type and the expected tracepoint name.
+ *
+ * @probe_name: "<subsystem>/<event>" path component under
+ *              /sys/kernel/debug/tracing/events/, used to read the event id
+ * @tp_name: name bpf_task_fd_query() is expected to return in the buffer
+ */
+static void test_task_fd_query_tp_core(const char *probe_name,
+ const char *tp_name)
+{
+ const char *file = "./test_tracepoint.o";
+ int err, bytes, efd, prog_fd, pmu_fd;
+ struct perf_event_attr attr = {};
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ struct bpf_object *obj = NULL;
+ __u32 duration = 0;
+ char buf[256];
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+ if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+ goto close_prog;
+
+ /* buf first holds the path of the id file, then the id text itself. */
+ snprintf(buf, sizeof(buf),
+ "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+ efd = open(buf, O_RDONLY, 0);
+ if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+ goto close_prog;
+ bytes = read(efd, buf, sizeof(buf));
+ close(efd);
+ if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+ "bytes %d errno %d\n", bytes, errno))
+ goto close_prog;
+ /* read() does not terminate the data; the check above guarantees that
+ * bytes < sizeof(buf), so there is room for the terminator strtol()
+ * needs.
+ */
+ buf[bytes] = '\0';
+
+ attr.config = strtol(buf, NULL, 0);
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ /* Check the fd that was just returned (err still holds the stale
+ * bpf_prog_load result) and skip close() when there is no fd.
+ */
+ if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+ errno))
+ goto close_prog;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+ if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+ if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ /* query (getpid(), pmu_fd) */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name);
+ if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+ goto close_pmu;
+
+close_pmu:
+ close(pmu_fd);
+close_prog:
+ bpf_object__close(obj);
+}
+
+/* Run the tracepoint fd-query check for each (probe path, tracepoint name)
+ * pair in the table, in order.
+ */
+void test_task_fd_query_tp(void)
+{
+ static const char * const probes[][2] = {
+ { "sched/sched_switch", "sched_switch" },
+ { "syscalls/sys_enter_read", "sys_enter_read" },
+ };
+ unsigned int idx;
+
+ for (idx = 0; idx < sizeof(probes) / sizeof(probes[0]); idx++)
+ test_task_fd_query_tp_core(probes[idx][0], probes[idx][1]);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_estats.c b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
new file mode 100644
index 000000000..594307dff
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_estats.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/* Load ./test_tcp_estats.o as a tracepoint program, report the outcome and
+ * close the object again if the load succeeded.
+ */
+void test_tcp_estats(void)
+{
+ struct bpf_object *estats_obj;
+ __u32 duration = 0;
+ int prog_fd;
+ int err;
+
+ err = bpf_prog_load("./test_tcp_estats.o", BPF_PROG_TYPE_TRACEPOINT,
+ &estats_obj, &prog_fd);
+ CHECK(err, "", "err %d errno %d\n", err, errno);
+
+ /* Only a successfully loaded object needs closing. */
+ if (!err)
+ bpf_object__close(estats_obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
new file mode 100644
index 000000000..c85174cdc
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_hdr_options.c
@@ -0,0 +1,610 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <linux/compiler.h>
+
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+#include "test_tcp_hdr_options.h"
+#include "test_tcp_hdr_options.skel.h"
+#include "test_misc_tcp_hdr_options.skel.h"
+
+#define LO_ADDR6 "::1"
+#define CG_NAME "/tcpbpf-hdr-opt-test"
+
+struct bpf_test_option exp_passive_estab_in; /* expected option values; compared with skel->bss in check_hdr_and_close_fds() */
+struct bpf_test_option exp_active_estab_in;
+struct bpf_test_option exp_passive_fin_in;
+struct bpf_test_option exp_active_fin_in;
+struct hdr_stg exp_passive_hdr_stg; /* expected hdr_stg map value looked up with passive_fd */
+struct hdr_stg exp_active_hdr_stg = { .active = true, }; /* expected hdr_stg map value looked up with active_fd */
+
+static struct test_misc_tcp_hdr_options *misc_skel; /* loaded in test_tcp_hdr_options(), used by misc() */
+static struct test_tcp_hdr_options *skel; /* loaded in test_tcp_hdr_options() */
+static int lport_linum_map_fd; /* set by each subtest before it runs its checks */
+static int hdr_stg_map_fd; /* set by the subtests that call check_hdr_stg() */
+static __u32 duration; /* NOTE(review): presumably consumed by the CHECK() macro - confirm in test_progs.h */
+static int cg_fd; /* cgroup joined in test_tcp_hdr_options(); the bpf programs are attached to it */
+
+struct sk_fds { /* sockets and local ports of one test connection, filled by sk_fds_connect() */
+ int srv_fd; /* listening socket returned by start_server() */
+ int passive_fd; /* socket returned by accept() on srv_fd */
+ int active_fd; /* connecting socket */
+ int passive_lport; /* local port of srv_fd, host byte order */
+ int active_lport; /* local port of active_fd, host byte order */
+};
+
+/*
+ * Move the calling process into a new network namespace and bring the
+ * loopback device up in it.
+ *
+ * Returns 0 on success, -1 if unshare() or the "ip" command failed.
+ */
+static int create_netns(void)
+{
+	/* the failure text ends in "\n" like every other message in this file */
+	if (CHECK(unshare(CLONE_NEWNET), "create netns",
+		  "unshare(CLONE_NEWNET): %s (%d)\n",
+		  strerror(errno), errno))
+		return -1;
+
+	if (CHECK(system("ip link set dev lo up"), "run ip cmd",
+		  "failed to bring lo link up\n"))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Write the string @value to the file at path @sysctl.
+ *
+ * Returns 0 when the whole string was written, -1 when the file could not
+ * be opened or the write was short or failed.
+ */
+static int write_sysctl(const char *sysctl, const char *value)
+{
+	int fd, err, len, saved_errno;
+
+	fd = open(sysctl, O_WRONLY);
+	if (CHECK(fd == -1, "open sysctl", "open(%s): %s (%d)\n",
+		  sysctl, strerror(errno), errno))
+		return -1;
+
+	len = strlen(value);
+	err = write(fd, value, len);
+	/* close() may overwrite errno, so keep the value left by write() */
+	saved_errno = errno;
+	close(fd);
+	if (CHECK(err != len, "write sysctl",
+		  "write(%s, %s): err:%d %s (%d)\n",
+		  sysctl, value, err, strerror(saved_errno), saved_errno))
+		return -1;
+
+	return 0;
+}
+
+/* Print the four fields of @hdr_stg to stderr after @prefix (which may be NULL). */
+static void print_hdr_stg(const struct hdr_stg *hdr_stg, const char *prefix)
+{
+	const char *lead = prefix ? prefix : "";
+
+	fprintf(stderr,
+		"%s{active:%u, resend_syn:%u, syncookie:%u, fastopen:%u}\n",
+		lead,
+		hdr_stg->active, hdr_stg->resend_syn,
+		hdr_stg->syncookie, hdr_stg->fastopen);
+}
+
+/* Print the fields of @opt to stderr after @prefix (which may be NULL). */
+static void print_option(const struct bpf_test_option *opt, const char *prefix)
+{
+	const char *lead = prefix ? prefix : "";
+
+	fprintf(stderr,
+		"%s{flags:0x%x, max_delack_ms:%u, rand:0x%x}\n",
+		lead, opt->flags, opt->max_delack_ms, opt->rand);
+}
+
+/* Close the server, passive and active sockets of @sk_fds, in that order. */
+static void sk_fds_close(struct sk_fds *sk_fds)
+{
+	const int fds[] = {
+		sk_fds->srv_fd,
+		sk_fds->passive_fd,
+		sk_fds->active_fd,
+	};
+	int idx;
+
+	for (idx = 0; idx < 3; idx++)
+		close(fds[idx]);
+}
+
+/*
+ * Shut down the write side of the active socket and then of the passive
+ * socket, each time expecting the peer's read() to return 0.
+ *
+ * Returns 0 when both reads returned 0, -1 otherwise.
+ */
+static int sk_fds_shutdown(struct sk_fds *sk_fds)
+{
+	int scratch;
+	int nread;
+
+	/* active side closes first ... */
+	shutdown(sk_fds->active_fd, SHUT_WR);
+	nread = read(sk_fds->passive_fd, &scratch, sizeof(scratch));
+	if (CHECK(nread != 0, "read-after-shutdown(passive_fd):",
+		  "ret:%d %s (%d)\n",
+		  nread, strerror(errno), errno))
+		return -1;
+
+	/* ... then the passive side */
+	shutdown(sk_fds->passive_fd, SHUT_WR);
+	nread = read(sk_fds->active_fd, &scratch, sizeof(scratch));
+	if (CHECK(nread != 0, "read-after-shutdown(active_fd):",
+		  "ret:%d %s (%d)\n",
+		  nread, strerror(errno), errno))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Set up one IPv6 loopback TCP connection and record its sockets and
+ * local ports in @sk_fds.
+ *
+ * @fast_open: when true, connect with fastopen_connect() carrying the
+ *	       "FAST!!!" payload and require the passive side to read the
+ *	       same number of bytes back; otherwise use connect_to_fd().
+ *
+ * Returns 0 on success.  On failure every socket opened here is closed,
+ * all fields of *sk_fds are set to -1 and -1 is returned.
+ */
+static int sk_fds_connect(struct sk_fds *sk_fds, bool fast_open)
+{
+	const char fast[] = "FAST!!!";
+	struct sockaddr_in6 addr6;
+	socklen_t len;
+
+	sk_fds->srv_fd = start_server(AF_INET6, SOCK_STREAM, LO_ADDR6, 0, 0);
+	if (CHECK(sk_fds->srv_fd == -1, "start_server", "%s (%d)\n",
+		  strerror(errno), errno))
+		goto error;
+
+	if (fast_open)
+		sk_fds->active_fd = fastopen_connect(sk_fds->srv_fd, fast,
+						     sizeof(fast), 0);
+	else
+		sk_fds->active_fd = connect_to_fd(sk_fds->srv_fd, 0);
+
+	if (CHECK_FAIL(sk_fds->active_fd == -1)) {
+		/* only the server socket exists at this point */
+		close(sk_fds->srv_fd);
+		goto error;
+	}
+
+	len = sizeof(addr6);
+	if (CHECK(getsockname(sk_fds->srv_fd, (struct sockaddr *)&addr6,
+			      &len), "getsockname(srv_fd)", "%s (%d)\n",
+		  strerror(errno), errno))
+		goto error_close;
+	sk_fds->passive_lport = ntohs(addr6.sin6_port);
+
+	len = sizeof(addr6);
+	if (CHECK(getsockname(sk_fds->active_fd, (struct sockaddr *)&addr6,
+			      &len), "getsockname(active_fd)", "%s (%d)\n",
+		  strerror(errno), errno))
+		goto error_close;
+	sk_fds->active_lport = ntohs(addr6.sin6_port);
+
+	sk_fds->passive_fd = accept(sk_fds->srv_fd, NULL, 0);
+	if (CHECK(sk_fds->passive_fd == -1, "accept(srv_fd)", "%s (%d)\n",
+		  strerror(errno), errno))
+		goto error_close;
+
+	if (fast_open) {
+		char bytes_in[sizeof(fast)];
+		int ret;
+
+		ret = read(sk_fds->passive_fd, bytes_in, sizeof(bytes_in));
+		/* sizeof() yields size_t, which is printed with %zu */
+		if (CHECK(ret != sizeof(fast), "read fastopen syn data",
+			  "expected=%zu actual=%d\n", sizeof(fast), ret)) {
+			close(sk_fds->passive_fd);
+			goto error_close;
+		}
+	}
+
+	return 0;
+
+error_close:
+	close(sk_fds->active_fd);
+	close(sk_fds->srv_fd);
+
+error:
+	/* all-ones bytes make every int field read back as -1 */
+	memset(sk_fds, -1, sizeof(*sk_fds));
+	return -1;
+}
+
+/*
+ * Byte-compare the expected option @exp with the observed one @act.
+ * On mismatch both are printed and -1 is returned; 0 means they match.
+ * @hdr_desc names the option in the failure message.
+ */
+static int check_hdr_opt(const struct bpf_test_option *exp,
+			 const struct bpf_test_option *act,
+			 const char *hdr_desc)
+{
+	if (!CHECK(memcmp(exp, act, sizeof(*exp)),
+		   "expected-vs-actual", "unexpected %s\n", hdr_desc))
+		return 0;
+
+	print_option(exp, "expected: ");
+	print_option(act, " actual: ");
+	return -1;
+}
+
+/*
+ * Look up the hdr_stg map entry keyed by socket @fd and byte-compare it
+ * with @exp.  Returns 0 on a match, -1 when the lookup fails or the
+ * values differ (both are printed in the latter case).  @stg_desc names
+ * the entry in failure messages.
+ */
+static int check_hdr_stg(const struct hdr_stg *exp, int fd,
+			 const char *stg_desc)
+{
+	struct hdr_stg actual;
+
+	if (CHECK(bpf_map_lookup_elem(hdr_stg_map_fd, &fd, &actual),
+		  "map_lookup(hdr_stg_map_fd)", "%s %s (%d)\n",
+		  stg_desc, strerror(errno), errno))
+		return -1;
+
+	if (!CHECK(memcmp(exp, &actual, sizeof(*exp)),
+		   "expected-vs-actual", "unexpected %s\n", stg_desc))
+		return 0;
+
+	print_hdr_stg(exp, "expected: ");
+	print_hdr_stg(&actual, " actual: ");
+	return -1;
+}
+
+/*
+ * Report any entry the lport_linum map holds for the passive or active
+ * local port of @sk_fds.  A successful lookup counts as an error.
+ *
+ * Returns the number of entries found (0, 1 or 2).
+ */
+static int check_error_linum(const struct sk_fds *sk_fds)
+{
+	struct linum_err entry;
+	unsigned int hits = 0;
+	int port;
+
+	port = sk_fds->passive_lport;
+	if (bpf_map_lookup_elem(lport_linum_map_fd, &port, &entry) == 0) {
+		hits++;
+		fprintf(stderr,
+			"bpf prog error out at lport:passive(%d), linum:%u err:%d\n",
+			port, entry.linum, entry.err);
+	}
+
+	port = sk_fds->active_lport;
+	if (bpf_map_lookup_elem(lport_linum_map_fd, &port, &entry) == 0) {
+		hits++;
+		fprintf(stderr,
+			"bpf prog error out at lport:active(%d), linum:%u err:%d\n",
+			port, entry.linum, entry.err);
+	}
+
+	return hits;
+}
+
+/*
+ * Shut the connection down, then compare what the bpf program recorded
+ * (inherit_cb_flags, hdr_stg map entries, received options) with the
+ * exp_* globals.  The comparisons stop at the first failure.  The
+ * lport_linum map is always checked and the sockets are always closed.
+ */
+static void check_hdr_and_close_fds(struct sk_fds *sk_fds)
+{
+	const __u32 expected_inherit_cb_flags =
+		BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
+		BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
+		BPF_SOCK_OPS_STATE_CB_FLAG;
+	bool ok;
+
+	/* each step runs only while every earlier one succeeded */
+	ok = !sk_fds_shutdown(sk_fds);
+
+	ok = ok && !CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags,
+			  "Unexpected inherit_cb_flags", "0x%x != 0x%x\n",
+			  skel->bss->inherit_cb_flags, expected_inherit_cb_flags);
+
+	ok = ok && !check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd,
+				  "passive_hdr_stg");
+	ok = ok && !check_hdr_stg(&exp_active_hdr_stg, sk_fds->active_fd,
+				  "active_hdr_stg");
+
+	ok = ok && !check_hdr_opt(&exp_passive_estab_in,
+				  &skel->bss->passive_estab_in,
+				  "passive_estab_in");
+	ok = ok && !check_hdr_opt(&exp_active_estab_in,
+				  &skel->bss->active_estab_in,
+				  "active_estab_in");
+	ok = ok && !check_hdr_opt(&exp_passive_fin_in,
+				  &skel->bss->passive_fin_in,
+				  "passive_fin_in");
+
+	if (ok)
+		check_hdr_opt(&exp_active_fin_in, &skel->bss->active_fin_in,
+			      "active_fin_in");
+
+	CHECK_FAIL(check_error_linum(sk_fds));
+	sk_fds_close(sk_fds);
+}
+
+/*
+ * Copy the expected "in" options into the matching "out" slots of the
+ * bpf program: what one side is expected to receive is what the other
+ * side is told to send.
+ */
+static void prepare_out(void)
+{
+	/* sent by the active side, expected on the passive side */
+	skel->bss->active_syn_out = exp_passive_estab_in;
+	skel->bss->active_fin_out = exp_passive_fin_in;
+
+	/* sent by the passive side, expected on the active side */
+	skel->bss->passive_synack_out = exp_active_estab_in;
+	skel->bss->passive_fin_out = exp_active_fin_in;
+}
+
+/*
+ * Return the shared test state to its defaults after a subtest: zero
+ * the options in the program's bss and the exp_* globals, restore
+ * test_kind/test_magic, and delete every entry of the lport_linum map.
+ */
+static void reset_test(void)
+{
+	struct bpf_test_option *opts[] = {
+		&skel->bss->passive_synack_out,
+		&skel->bss->passive_fin_out,
+		&skel->bss->passive_estab_in,
+		&skel->bss->passive_fin_in,
+		&skel->bss->active_syn_out,
+		&skel->bss->active_fin_out,
+		&skel->bss->active_estab_in,
+		&skel->bss->active_fin_in,
+		&exp_passive_estab_in,
+		&exp_active_estab_in,
+		&exp_passive_fin_in,
+		&exp_active_fin_in,
+	};
+	int idx, lport, err;
+
+	for (idx = 0; idx < ARRAY_SIZE(opts); idx++)
+		memset(opts[idx], 0, sizeof(struct bpf_test_option));
+
+	skel->bss->inherit_cb_flags = 0;
+
+	skel->data->test_kind = TCPOPT_EXP;
+	skel->data->test_magic = 0xeB9F;
+
+	memset(&exp_passive_hdr_stg, 0, sizeof(exp_passive_hdr_stg));
+	memset(&exp_active_hdr_stg, 0, sizeof(exp_active_hdr_stg));
+	exp_active_hdr_stg.active = true;
+
+	/* walk the map, deleting each key as it is returned */
+	for (err = bpf_map_get_next_key(lport_linum_map_fd, NULL, &lport);
+	     !err;
+	     err = bpf_map_get_next_key(lport_linum_map_fd, &lport, &lport))
+		bpf_map_delete_elem(lport_linum_map_fd, &lport);
+}
+
+/*
+ * Subtest: establish the connection with TCP fastopen and expect the
+ * passive side's hdr_stg to have fastopen set.
+ */
+static void fastopen_estab(void)
+{
+	struct bpf_link *estab_link;
+	struct sk_fds fds;
+
+	hdr_stg_map_fd = bpf_map__fd(skel->maps.hdr_stg_map);
+	lport_linum_map_fd = bpf_map__fd(skel->maps.lport_linum_map);
+
+	exp_passive_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS;
+	exp_passive_estab_in.rand = 0xfa;
+	exp_passive_estab_in.max_delack_ms = 11;
+
+	exp_active_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS;
+	exp_active_estab_in.rand = 0xce;
+	exp_active_estab_in.max_delack_ms = 22;
+
+	exp_passive_hdr_stg.fastopen = true;
+
+	prepare_out();
+
+	/* Allow fastopen without fastopen cookie */
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_fastopen", "1543"))
+		return;
+
+	estab_link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
+	if (CHECK(IS_ERR(estab_link), "attach_cgroup(estab)", "err: %ld\n",
+		  PTR_ERR(estab_link)))
+		return;
+
+	/* the link is released whether or not the connection came up */
+	if (!sk_fds_connect(&fds, true))
+		check_hdr_and_close_fds(&fds);
+
+	bpf_link__destroy(estab_link);
+}
+
+/*
+ * Subtest: establish the connection with syncookies enforced.  The
+ * passive side's hdr_stg is expected to have syncookie set and the
+ * active side's to have resend_syn set.
+ */
+static void syncookie_estab(void)
+{
+	struct bpf_link *link;
+	struct sk_fds sk_fds;
+
+	hdr_stg_map_fd = bpf_map__fd(skel->maps.hdr_stg_map);
+	lport_linum_map_fd = bpf_map__fd(skel->maps.lport_linum_map);
+
+	exp_passive_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS;
+	exp_passive_estab_in.rand = 0xfa;
+	exp_passive_estab_in.max_delack_ms = 11;
+
+	exp_active_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS |
+				    OPTION_F_RESEND;
+	exp_active_estab_in.rand = 0xce;
+	exp_active_estab_in.max_delack_ms = 22;
+
+	exp_passive_hdr_stg.syncookie = true;
+	/* a stray ',' here used to chain this into prepare_out() */
+	exp_active_hdr_stg.resend_syn = true;
+
+	prepare_out();
+
+	/* Clear the RESEND to ensure the bpf prog can learn
+	 * want_cookie and set the RESEND by itself.
+	 */
+	skel->bss->passive_synack_out.flags &= ~OPTION_F_RESEND;
+
+	/* Enforce syncookie mode */
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "2"))
+		return;
+
+	link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
+	if (CHECK(IS_ERR(link), "attach_cgroup(estab)", "err: %ld\n",
+		  PTR_ERR(link)))
+		return;
+
+	if (sk_fds_connect(&sk_fds, false)) {
+		bpf_link__destroy(link);
+		return;
+	}
+
+	check_hdr_and_close_fds(&sk_fds);
+	bpf_link__destroy(link);
+}
+
+/*
+ * Subtest: only the FIN options are expected to be exchanged; each
+ * side's expected fin option carries OPTION_F_RAND and a rand value.
+ */
+static void fin(void)
+{
+	struct bpf_link *estab_link;
+	struct sk_fds fds;
+
+	hdr_stg_map_fd = bpf_map__fd(skel->maps.hdr_stg_map);
+	lport_linum_map_fd = bpf_map__fd(skel->maps.lport_linum_map);
+
+	exp_passive_fin_in.flags = OPTION_F_RAND;
+	exp_passive_fin_in.rand = 0xfa;
+
+	exp_active_fin_in.flags = OPTION_F_RAND;
+	exp_active_fin_in.rand = 0xce;
+
+	prepare_out();
+
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1"))
+		return;
+
+	estab_link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
+	if (CHECK(IS_ERR(estab_link), "attach_cgroup(estab)", "err: %ld\n",
+		  PTR_ERR(estab_link)))
+		return;
+
+	/* the link is released whether or not the connection came up */
+	if (!sk_fds_connect(&fds, false))
+		check_hdr_and_close_fds(&fds);
+
+	bpf_link__destroy(estab_link);
+}
+
+/*
+ * Shared body of the simple_estab and no_exprm_estab subtests.
+ *
+ * @exprm: when false, test_kind is switched to 0xB9 and test_magic to 0
+ *	   before connecting; when true the values left by reset_test()
+ *	   are used unchanged.
+ */
+static void __simple_estab(bool exprm)
+{
+	struct bpf_link *estab_link;
+	struct sk_fds fds;
+
+	hdr_stg_map_fd = bpf_map__fd(skel->maps.hdr_stg_map);
+	lport_linum_map_fd = bpf_map__fd(skel->maps.lport_linum_map);
+
+	exp_passive_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS;
+	exp_passive_estab_in.rand = 0xfa;
+	exp_passive_estab_in.max_delack_ms = 11;
+
+	exp_active_estab_in.flags = OPTION_F_RAND | OPTION_F_MAX_DELACK_MS;
+	exp_active_estab_in.rand = 0xce;
+	exp_active_estab_in.max_delack_ms = 22;
+
+	prepare_out();
+
+	if (!exprm) {
+		skel->data->test_kind = 0xB9;
+		skel->data->test_magic = 0;
+	}
+
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1"))
+		return;
+
+	estab_link = bpf_program__attach_cgroup(skel->progs.estab, cg_fd);
+	if (CHECK(IS_ERR(estab_link), "attach_cgroup(estab)", "err: %ld\n",
+		  PTR_ERR(estab_link)))
+		return;
+
+	/* the link is released whether or not the connection came up */
+	if (!sk_fds_connect(&fds, false))
+		check_hdr_and_close_fds(&fds);
+
+	bpf_link__destroy(estab_link);
+}
+
+/* Subtest: simple establishment with test_kind 0xB9 and no magic. */
+static void no_exprm_estab(void)
+{
+	const bool exprm = false;
+
+	__simple_estab(exprm);
+}
+
+/* Subtest: simple establishment with the default test_kind/test_magic. */
+static void simple_estab(void)
+{
+	const bool exprm = true;
+
+	__simple_estab(exprm);
+}
+
+/*
+ * Subtest using the misc skeleton: send two messages over a fresh
+ * connection, shut it down, and check the packet counters (nr_syn,
+ * nr_data, nr_pure_ack, nr_fin) the bpf program keeps in its bss.
+ */
+static void misc(void)
+{
+	const char send_msg[] = "MISC!!!";
+	const unsigned int nr_data = 2;
+	char recv_msg[sizeof(send_msg)];
+	struct bpf_link *estab_link;
+	struct sk_fds fds;
+	int round, nbytes;
+
+	lport_linum_map_fd = bpf_map__fd(misc_skel->maps.lport_linum_map);
+
+	if (write_sysctl("/proc/sys/net/ipv4/tcp_syncookies", "1"))
+		return;
+
+	estab_link = bpf_program__attach_cgroup(misc_skel->progs.misc_estab,
+						cg_fd);
+	if (CHECK(IS_ERR(estab_link), "attach_cgroup(misc_estab)", "err: %ld\n",
+		  PTR_ERR(estab_link)))
+		return;
+
+	if (sk_fds_connect(&fds, false)) {
+		bpf_link__destroy(estab_link);
+		return;
+	}
+
+	for (round = 0; round < nr_data; round++) {
+		/* MSG_EOR to ensure skb will not be combined */
+		nbytes = send(fds.active_fd, send_msg, sizeof(send_msg),
+			      MSG_EOR);
+		if (CHECK(nbytes != sizeof(send_msg), "send(msg)", "ret:%d\n",
+			  nbytes))
+			goto check_linum;
+
+		nbytes = read(fds.passive_fd, recv_msg, sizeof(recv_msg));
+		if (CHECK(nbytes != sizeof(send_msg), "read(msg)", "ret:%d\n",
+			  nbytes))
+			goto check_linum;
+	}
+
+	if (sk_fds_shutdown(&fds))
+		goto check_linum;
+
+	CHECK(misc_skel->bss->nr_syn != 1, "unexpected nr_syn",
+	      "expected (1) != actual (%u)\n",
+	      misc_skel->bss->nr_syn);
+
+	CHECK(misc_skel->bss->nr_data != nr_data, "unexpected nr_data",
+	      "expected (%u) != actual (%u)\n",
+	      nr_data, misc_skel->bss->nr_data);
+
+	/* The last ACK may have been delayed, so it is either 1 or 2. */
+	CHECK(misc_skel->bss->nr_pure_ack != 1 &&
+	      misc_skel->bss->nr_pure_ack != 2,
+	      "unexpected nr_pure_ack",
+	      "expected (1 or 2) != actual (%u)\n",
+	      misc_skel->bss->nr_pure_ack);
+
+	CHECK(misc_skel->bss->nr_fin != 1, "unexpected nr_fin",
+	      "expected (1) != actual (%u)\n",
+	      misc_skel->bss->nr_fin);
+
+check_linum:
+	/* runs on every path once the connection exists */
+	CHECK_FAIL(check_error_linum(&fds));
+	sk_fds_close(&fds);
+	bpf_link__destroy(estab_link);
+}
+
+struct test { /* one named subtest */
+ const char *desc; /* name passed to test__start_subtest() */
+ void (*run)(void); /* subtest body */
+};
+
+#define DEF_TEST(name) { #name, name } /* entry whose desc is the function's own name */
+static struct test tests[] = { /* iterated in this order by test_tcp_hdr_options() */
+ DEF_TEST(simple_estab),
+ DEF_TEST(no_exprm_estab),
+ DEF_TEST(syncookie_estab),
+ DEF_TEST(fastopen_estab),
+ DEF_TEST(fin),
+ DEF_TEST(misc),
+};
+
+/*
+ * Entry point: load both skeletons, join the test cgroup and run every
+ * selected subtest from tests[], each in a fresh network namespace and
+ * each followed by reset_test().
+ */
+void test_tcp_hdr_options(void)
+{
+	const struct test *cur;
+	int idx;
+
+	skel = test_tcp_hdr_options__open_and_load();
+	if (CHECK(!skel, "open and load skel", "failed"))
+		return;
+
+	misc_skel = test_misc_tcp_hdr_options__open_and_load();
+	if (CHECK(!misc_skel, "open and load misc test skel", "failed"))
+		goto skel_destroy;
+
+	cg_fd = test__join_cgroup(CG_NAME);
+	if (CHECK_FAIL(cg_fd < 0))
+		goto skel_destroy;
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		cur = &tests[idx];
+
+		if (!test__start_subtest(cur->desc))
+			continue;
+
+		/* without a namespace the remaining subtests cannot run */
+		if (create_netns())
+			break;
+
+		cur->run();
+
+		reset_test();
+	}
+
+	close(cg_fd);
+skel_destroy:
+	test_misc_tcp_hdr_options__destroy(misc_skel);
+	test_tcp_hdr_options__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
new file mode 100644
index 000000000..d207e968e
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_rtt.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "cgroup_helpers.h"
+#include "network_helpers.h"
+
+struct tcp_rtt_storage { /* value read from the map in verify_sk(), keyed by the client socket fd */
+ __u32 invoked;
+ __u32 dsack_dups;
+ __u32 delivered;
+ __u32 delivered_ce;
+ __u32 icsk_retransmits;
+};
+
+/* Write the single byte 0x55 to @fd; a failure is recorded and reported. */
+static void send_byte(int fd)
+{
+	const char payload = 0x55;
+
+	if (!CHECK_FAIL(write(fd, &payload, sizeof(payload)) != 1))
+		return;
+
+	perror("Failed to send single byte");
+}
+
+/*
+ * Poll TCP_INFO on @fd until tcpi_unacked drops to 0, sleeping 10us
+ * between polls and giving up after @retries polls.
+ *
+ * Returns 0 once nothing is unacked, the getsockopt() error if the
+ * query fails, and -1 when the retries are used up.
+ */
+static int wait_for_ack(int fd, int retries)
+{
+	struct tcp_info info;
+	socklen_t optlen;
+	int attempt = 0;
+	int err;
+
+	while (attempt < retries) {
+		optlen = sizeof(info);
+		err = getsockopt(fd, SOL_TCP, TCP_INFO, &info, &optlen);
+		if (err < 0) {
+			log_err("Failed to lookup TCP stats");
+			return err;
+		}
+
+		if (!info.tcpi_unacked)
+			return 0;
+
+		usleep(10);
+		attempt++;
+	}
+
+	log_err("Did not receive ACK");
+	return -1;
+}
+
+/*
+ * Read the storage the bpf program keeps for @client_fd from @map_fd and
+ * compare each field with the expected value passed in.
+ *
+ * @msg prefixes every mismatch message.
+ *
+ * Returns -1 when the map lookup fails, otherwise the number of fields
+ * that did not match (0 means everything matched).
+ */
+static int verify_sk(int map_fd, int client_fd, const char *msg, __u32 invoked,
+		     __u32 dsack_dups, __u32 delivered, __u32 delivered_ce,
+		     __u32 icsk_retransmits)
+{
+	int err = 0;
+	struct tcp_rtt_storage val;
+
+	if (CHECK_FAIL(bpf_map_lookup_elem(map_fd, &client_fd, &val) < 0)) {
+		perror("Failed to read socket storage");
+		return -1;
+	}
+
+	/* all counters are __u32, hence %u rather than %d */
+	if (val.invoked != invoked) {
+		log_err("%s: unexpected bpf_tcp_sock.invoked %u != %u",
+			msg, val.invoked, invoked);
+		err++;
+	}
+
+	if (val.dsack_dups != dsack_dups) {
+		log_err("%s: unexpected bpf_tcp_sock.dsack_dups %u != %u",
+			msg, val.dsack_dups, dsack_dups);
+		err++;
+	}
+
+	if (val.delivered != delivered) {
+		log_err("%s: unexpected bpf_tcp_sock.delivered %u != %u",
+			msg, val.delivered, delivered);
+		err++;
+	}
+
+	if (val.delivered_ce != delivered_ce) {
+		log_err("%s: unexpected bpf_tcp_sock.delivered_ce %u != %u",
+			msg, val.delivered_ce, delivered_ce);
+		err++;
+	}
+
+	if (val.icsk_retransmits != icsk_retransmits) {
+		log_err("%s: unexpected bpf_tcp_sock.icsk_retransmits %u != %u",
+			msg, val.icsk_retransmits, icsk_retransmits);
+		err++;
+	}
+
+	return err;
+}
+
+
+/*
+ * Load ./tcp_rtt.o, attach it to @cgroup_fd as a sock_ops program,
+ * connect to @server_fd and verify the per-socket storage right after
+ * the connect and again after one payload byte has been acked.
+ *
+ * Returns 0 on success and a non-zero value on any failure.
+ */
+static int run_test(int cgroup_fd, int server_fd)
+{
+	struct bpf_prog_load_attr attr = {
+		.prog_type = BPF_PROG_TYPE_SOCK_OPS,
+		.file = "./tcp_rtt.o",
+		.expected_attach_type = BPF_CGROUP_SOCK_OPS,
+	};
+	int prog_fd, map_fd, client_fd;
+	struct bpf_object *obj;
+	int err;
+
+	err = bpf_prog_load_xattr(&attr, &obj, &prog_fd);
+	if (err) {
+		log_err("Failed to load BPF object");
+		return -1;
+	}
+
+	/* the storage map is taken to be the first map of the object */
+	map_fd = bpf_map__fd(bpf_map__next(NULL, obj));
+
+	err = bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (err) {
+		log_err("Failed to attach BPF program");
+		goto close_bpf_object;
+	}
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (client_fd < 0) {
+		err = -1;
+		goto close_bpf_object;
+	}
+
+	err += verify_sk(map_fd, client_fd, "syn-ack",
+			 /*invoked=*/1,
+			 /*dsack_dups=*/0,
+			 /*delivered=*/1,
+			 /*delivered_ce=*/0,
+			 /*icsk_retransmits=*/0);
+
+	send_byte(client_fd);
+	if (wait_for_ack(client_fd, 100) >= 0) {
+		err += verify_sk(map_fd, client_fd, "first payload byte",
+				 /*invoked=*/2,
+				 /*dsack_dups=*/0,
+				 /*delivered=*/2,
+				 /*delivered_ce=*/0,
+				 /*icsk_retransmits=*/0);
+	} else {
+		err = -1;
+	}
+
+	close(client_fd);
+
+close_bpf_object:
+	bpf_object__close(obj);
+	return err;
+}
+
+/* Entry point: join the "/tcp_rtt" cgroup, start a server and run the test. */
+void test_tcp_rtt(void)
+{
+	int cgroup_fd, server_fd;
+
+	cgroup_fd = test__join_cgroup("/tcp_rtt");
+	if (CHECK_FAIL(cgroup_fd < 0))
+		return;
+
+	server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
+	if (!CHECK_FAIL(server_fd < 0)) {
+		CHECK_FAIL(run_test(cgroup_fd, server_fd));
+		close(server_fd);
+	}
+
+	close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_bpffs.c b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
new file mode 100644
index 000000000..172c999e5
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_bpffs.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <test_progs.h>
+
+#define TDIR "/sys/kernel/debug"
+
+/*
+ * Scan @file for the text "iter".
+ *
+ * Returns 0 as soon as one chunk read from the file contains "iter",
+ * -1 when the file cannot be opened or no chunk contains it.  A match
+ * that straddles two chunks is not detected.
+ */
+static int read_iter(char *file)
+{
+	/* 1024 should be enough to get contiguous 4 "iter" letters at some point */
+	char buf[1024];
+	int fd, len;
+
+	fd = open(file, 0);
+	if (fd < 0)
+		return -1;
+	/* read one byte less than the buffer so there is room for the NUL */
+	while ((len = read(fd, buf, sizeof(buf) - 1)) > 0) {
+		/* strstr() needs a terminated string; read() does not add one */
+		buf[len] = '\0';
+		if (strstr(buf, "iter")) {
+			close(fd);
+			return 0;
+		}
+	}
+	close(fd);
+	return -1;
+}
+
+/*
+ * Body of the forked bpffs test process; always ends in exit().
+ *
+ * In a private mount namespace it replaces TDIR with a tmpfs, mounts
+ * two bpffs instances below it and checks that fs1/maps.debug and
+ * fs2/progs.debug contain the text "iter".  The exit status is 0 on
+ * success and the failing call's return value otherwise.
+ */
+static int fn(void)
+{
+	int err, duration = 0;
+
+	/*
+	 * Until the mount namespace is unshared and made private, the
+	 * umount(TDIR) in the cleanup below could take effect outside this
+	 * process, so leave directly when either of these two steps fails.
+	 */
+	err = unshare(CLONE_NEWNS);
+	if (CHECK(err, "unshare", "failed: %d\n", errno))
+		exit(err);
+
+	err = mount("", "/", "", MS_REC | MS_PRIVATE, NULL);
+	if (CHECK(err, "mount /", "failed: %d\n", errno))
+		exit(err);
+
+	err = umount(TDIR);
+	if (CHECK(err, "umount " TDIR, "failed: %d\n", errno))
+		goto out;
+
+	err = mount("none", TDIR, "tmpfs", 0, NULL);
+	if (CHECK(err, "mount", "mount root failed: %d\n", errno))
+		goto out;
+
+	err = mkdir(TDIR "/fs1", 0777);
+	if (CHECK(err, "mkdir "TDIR"/fs1", "failed: %d\n", errno))
+		goto out;
+	err = mkdir(TDIR "/fs2", 0777);
+	if (CHECK(err, "mkdir "TDIR"/fs2", "failed: %d\n", errno))
+		goto out;
+
+	err = mount("bpf", TDIR "/fs1", "bpf", 0, NULL);
+	if (CHECK(err, "mount bpffs "TDIR"/fs1", "failed: %d\n", errno))
+		goto out;
+	err = mount("bpf", TDIR "/fs2", "bpf", 0, NULL);
+	if (CHECK(err, "mount bpffs " TDIR "/fs2", "failed: %d\n", errno))
+		goto out;
+
+	err = read_iter(TDIR "/fs1/maps.debug");
+	if (CHECK(err, "reading " TDIR "/fs1/maps.debug", "failed\n"))
+		goto out;
+	err = read_iter(TDIR "/fs2/progs.debug");
+	if (CHECK(err, "reading " TDIR "/fs2/progs.debug", "failed\n"))
+		goto out;
+out:
+	/* best-effort teardown; results are intentionally ignored */
+	umount(TDIR "/fs1");
+	umount(TDIR "/fs2");
+	rmdir(TDIR "/fs1");
+	rmdir(TDIR "/fs2");
+	umount(TDIR);
+	exit(err);
+}
+
+/*
+ * Entry point: run fn() in a forked child and fail the test when the
+ * child's exit status is non-zero.
+ */
+void test_test_bpffs(void)
+{
+	int ret, duration = 0, wstatus = 0;
+	pid_t child;
+
+	child = fork();
+	if (CHECK(child == -1, "clone", "clone failed %d", errno))
+		return;
+	if (!child)
+		fn();	/* exits; does not come back here */
+
+	ret = waitpid(child, &wstatus, 0);
+	if (CHECK(ret == -1 && errno != ECHILD, "waitpid", "failed %d", errno))
+		return;
+
+	CHECK(WEXITSTATUS(wstatus), "bpffs test ", "failed %d",
+	      WEXITSTATUS(wstatus));
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
new file mode 100644
index 000000000..32e4348b7
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+
+const char *err_str; /* text the current subtest expects in the load log, or NULL */
+bool found; /* despite the name: set by libbpf_debug_print() when the log does NOT contain err_str */
+
+/*
+ * libbpf print callback.  Messages other than the LIBBPF_WARN log dump
+ * (format "libbpf: \n%s\n") are printed unchanged.  For the log dump,
+ * the global 'found' is set when err_str is non-NULL and is absent from
+ * the log, and the log is then printed.  Always returns 0.
+ */
+static int libbpf_debug_print(enum libbpf_print_level level,
+			      const char *format, va_list args)
+{
+	char *log_buf;
+
+	if (level == LIBBPF_WARN && !strcmp(format, "libbpf: \n%s\n")) {
+		log_buf = va_arg(args, char *);
+		/* 'found' flags the failure case: expected text missing */
+		if (log_buf && err_str && !strstr(log_buf, err_str))
+			found = true;
+		printf(format, log_buf);
+		return 0;
+	}
+
+	vprintf(format, args);
+	return 0;
+}
+
+extern int extra_prog_load_log_flags;
+
+/*
+ * Try to load the object @file with BPF_F_TEST_RND_HI32 set, resetting
+ * the global 'found' first.  The object is closed again in any case.
+ *
+ * Returns the result of bpf_prog_load_xattr().
+ */
+static int check_load(const char *file)
+{
+	struct bpf_prog_load_attr load_attr;
+	struct bpf_object *obj = NULL;
+	int prog_fd;
+	int ret;
+
+	found = false;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.file = file;
+	load_attr.prog_type = BPF_PROG_TYPE_UNSPEC;
+	load_attr.log_level = extra_prog_load_log_flags;
+	load_attr.prog_flags = BPF_F_TEST_RND_HI32;
+
+	ret = bpf_prog_load_xattr(&load_attr, &obj, &prog_fd);
+	bpf_object__close(obj);
+
+	return ret;
+}
+
+struct test_def { /* one object file to load in test_test_global_funcs() */
+ const char *file; /* object file name; also used as the subtest name */
+ const char *err_str; /* text expected in the load log; left NULL when the load must succeed */
+};
+
+/*
+ * Entry point: load each test_global_func*.o object.  Entries with an
+ * err_str must fail to load and produce that text in the log; entries
+ * without one must load successfully.
+ */
+void test_test_global_funcs(void)
+{
+	struct test_def tests[] = {
+		{ "test_global_func1.o", "combined stack size of 4 calls is 544" },
+		{ "test_global_func2.o" },
+		{ "test_global_func3.o" , "the call stack of 8 frames" },
+		{ "test_global_func4.o" },
+		{ "test_global_func5.o" , "expected pointer to ctx, but got PTR" },
+		{ "test_global_func6.o" , "modified ctx ptr R2" },
+		{ "test_global_func7.o" , "foo() doesn't return scalar" },
+		{ "test_global_func8.o" },
+	};
+	libbpf_print_fn_t saved_print_fn;
+	int err, idx, duration = 0;
+
+	/* route libbpf output through the log-scanning callback */
+	saved_print_fn = libbpf_set_print(libbpf_debug_print);
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		const struct test_def *cur = &tests[idx];
+
+		if (test__start_subtest(cur->file)) {
+			err_str = cur->err_str;
+			err = check_load(cur->file);
+			/* load must fail exactly when an error is expected */
+			CHECK_FAIL(!!err ^ !!err_str);
+			if (err_str)
+				CHECK(found, "", "expected string '%s'", err_str);
+		}
+	}
+
+	libbpf_set_print(saved_print_fn);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
new file mode 100644
index 000000000..91cd6f357
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <linux/limits.h>
+
+#include "local_storage.skel.h"
+#include "network_helpers.h"
+
+/*
+ * Create a temporary file from the template /tmp/fileXXXXXX, then close
+ * and unlink it.  Returns 0 on success or the negative mkstemp() result.
+ */
+int create_and_unlink_file(void)
+{
+	char fname[PATH_MAX] = "/tmp/fileXXXXXX";
+	int tmp_fd = mkstemp(fname);
+
+	if (tmp_fd >= 0) {
+		close(tmp_fd);
+		unlink(fname);
+		return 0;
+	}
+
+	return tmp_fd;
+}
+
+/*
+ * Entry point: attach the local_storage skeleton, monitor this process,
+ * then create/unlink a file and start a server socket, checking after
+ * each that the matching *_storage_result in the program's data is 0.
+ */
+void test_test_local_storage(void)
+{
+	struct local_storage *skel = NULL;
+	int ret, duration = 0, server_sk = -1;
+
+	skel = local_storage__open_and_load();
+	if (CHECK(!skel, "skel_load", "lsm skeleton failed\n"))
+		goto out;
+
+	ret = local_storage__attach(skel);
+	if (CHECK(ret, "attach", "lsm attach failed: %d\n", ret))
+		goto out;
+
+	skel->bss->monitored_pid = getpid();
+
+	/* inode storage */
+	ret = create_and_unlink_file();
+	if (CHECK(ret < 0, "exec_cmd", "err %d errno %d\n", ret, errno))
+		goto out;
+
+	CHECK(skel->data->inode_storage_result != 0, "inode_storage_result",
+	      "inode_local_storage not set\n");
+
+	/* socket storage */
+	server_sk = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0);
+	if (CHECK(server_sk < 0, "start_server", "failed to start server\n"))
+		goto out;
+
+	CHECK(skel->data->sk_storage_result != 0, "sk_storage_result",
+	      "sk_local_storage not set\n");
+
+	close(server_sk);
+
+out:
+	local_storage__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_lsm.c b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
new file mode 100644
index 000000000..6ab29226c
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_lsm.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2020 Google LLC.
+ */
+
+#include <test_progs.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <malloc.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "lsm.skel.h"
+
+char *CMD_ARGS[] = {"true", NULL}; /* argv handed to execvp() by exec_cmd() */
+
+#define GET_PAGE_ADDR(ADDR, PAGE_SIZE) \
+ (char *)(((unsigned long) (ADDR + PAGE_SIZE)) & ~(PAGE_SIZE-1)) /* ADDR + PAGE_SIZE rounded down to a PAGE_SIZE multiple; assumes PAGE_SIZE is a power of two */
+
+/*
+ * Call mprotect() with PROT_READ | PROT_WRITE | PROT_EXEC on one page
+ * inside a three-page alloca() buffer.
+ *
+ * Returns the negative sysconf() result if the page size is unavailable,
+ * otherwise the mprotect() result.
+ */
+int stack_mprotect(void)
+{
+	long page_sz = sysconf(_SC_PAGESIZE);
+	void *stack_buf;
+
+	if (page_sz < 0)
+		return page_sz;
+
+	/* three pages, so the selected page lies inside the buffer */
+	stack_buf = alloca(page_sz * 3);
+	return mprotect(GET_PAGE_ADDR(stack_buf, page_sz), page_sz,
+			PROT_READ | PROT_WRITE | PROT_EXEC);
+}
+
+/*
+ * Fork a child that stores its own pid in *monitored_pid and then execs
+ * CMD_ARGS; the parent waits for it.
+ *
+ * Returns the child's wait status from waitpid().  Returns -EINVAL when
+ * fork() or waitpid() fails, or when the child reports (exit status 127)
+ * that the exec itself failed.
+ */
+int exec_cmd(int *monitored_pid)
+{
+	int child_pid, child_status;
+
+	child_pid = fork();
+	if (child_pid == 0) {
+		*monitored_pid = getpid();
+		execvp(CMD_ARGS[0], CMD_ARGS);
+		/*
+		 * exec failed: leave immediately rather than return into
+		 * the caller, where the child would carry on as a second
+		 * copy of the test process.
+		 */
+		_exit(127);
+	} else if (child_pid > 0) {
+		/* child_status is only valid when waitpid() succeeded */
+		if (waitpid(child_pid, &child_status, 0) != child_pid)
+			return -EINVAL;
+		if (WIFEXITED(child_status) && WEXITSTATUS(child_status) == 127)
+			return -EINVAL;
+		return child_status;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Entry point: attach the lsm skeleton and check three counters kept by
+ * the bpf programs: bprm_count after running a child command,
+ * mprotect_count after an mprotect() call that must fail with EPERM,
+ * and copy_test after three setdomainname syscalls.
+ */
+void test_test_lsm(void)
+{
+	struct lsm *skel = NULL;
+	int ret, duration = 0;
+	int scratch = 1234;
+
+	skel = lsm__open_and_load();
+	if (CHECK(!skel, "skel_load", "lsm skeleton failed\n"))
+		goto out;
+
+	ret = lsm__attach(skel);
+	if (CHECK(ret, "attach", "lsm attach failed: %d\n", ret))
+		goto out;
+
+	/* the child records its own pid as the monitored one */
+	ret = exec_cmd(&skel->bss->monitored_pid);
+	if (CHECK(ret < 0, "exec_cmd", "err %d errno %d\n", ret, errno))
+		goto out;
+
+	CHECK(skel->bss->bprm_count != 1, "bprm_count", "bprm_count = %d\n",
+	      skel->bss->bprm_count);
+
+	/* from here on this process is the monitored one */
+	skel->bss->monitored_pid = getpid();
+
+	ret = stack_mprotect();
+	if (CHECK(errno != EPERM, "stack_mprotect", "want err=EPERM, got %d\n",
+		  errno))
+		goto out;
+
+	CHECK(skel->bss->mprotect_count != 1, "mprotect_count",
+	      "mprotect_count = %d\n", skel->bss->mprotect_count);
+
+	syscall(__NR_setdomainname, &scratch, -2L);
+	syscall(__NR_setdomainname, 0, -3L);
+	syscall(__NR_setdomainname, ~0L, -4L);
+
+	CHECK(skel->bss->copy_test != 3, "copy_test",
+	      "copy_test = %d\n", skel->bss->copy_test);
+
+out:
+	lsm__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_overhead.c b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
new file mode 100644
index 000000000..996668586
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_overhead.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2019 Facebook */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/prctl.h>
+#include <test_progs.h>
+
+#define MAX_CNT 100000
+
+/* Return the CLOCK_MONOTONIC time in nanoseconds. */
+static __u64 time_get_ns(void)
+{
+	struct timespec now;
+	__u64 ns;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	ns = now.tv_sec * 1000000000ull;
+	ns += now.tv_nsec;
+	return ns;
+}
+
+/*
+ * Write "test_overhead" to /proc/self/comm MAX_CNT times and print the
+ * achieved rate, labelled with @prog.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or a write fails.
+ */
+static int test_task_rename(const char *prog)
+{
+	char new_comm[] = "test_overhead";
+	int iter, comm_fd, duration = 0, ret;
+	__u64 t0;
+
+	comm_fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+	if (CHECK(comm_fd < 0, "open /proc", "err %d", errno))
+		return -1;
+
+	t0 = time_get_ns();
+	for (iter = 0; iter < MAX_CNT; iter++) {
+		ret = write(comm_fd, new_comm, sizeof(new_comm));
+		if (ret >= 0)
+			continue;
+
+		/* CHECK is reached only on failure, so nothing is logged per write */
+		CHECK(ret < 0, "task rename", "err %d", errno);
+		close(comm_fd);
+		return -1;
+	}
+	printf("task_rename %s\t%lluK events per sec\n", prog,
+	       MAX_CNT * 1000000ll / (time_get_ns() - t0));
+	close(comm_fd);
+	return 0;
+}
+
+/* Run one measurement labelled @prog; the result code is not used. */
+static void test_run(const char *prog)
+{
+	(void)test_task_rename(prog);
+}
+
+/* Restrict the calling thread to CPU 0; the result is not checked. */
+static void setaffinity(void)
+{
+	const int target_cpu = 0;
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(target_cpu, &mask);
+	sched_setaffinity(0, sizeof(mask), &mask);
+}
+
+/*
+ * Entry point: measure the task-rename rate with no program attached
+ * and then with a kprobe, kretprobe, raw tracepoint, fentry and fexit
+ * program from ./test_overhead.o attached in turn.  The task name read
+ * at the start is written back before returning.
+ */
+void test_test_overhead(void)
+{
+	const char *kprobe_name = "kprobe/__set_task_comm";
+	const char *kretprobe_name = "kretprobe/__set_task_comm";
+	const char *raw_tp_name = "raw_tp/task_rename";
+	const char *fentry_name = "fentry/__set_task_comm";
+	const char *fexit_name = "fexit/__set_task_comm";
+	const char *kprobe_func = "__set_task_comm";
+	struct bpf_program *kprobe_prog, *kretprobe_prog, *raw_tp_prog;
+	struct bpf_program *fentry_prog, *fexit_prog;
+	struct bpf_object *obj;
+	struct bpf_link *link;
+	int err, duration = 0;
+	char comm[16] = {};
+
+	/* remember the current task name; the measurements overwrite it */
+	if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L)))
+		return;
+
+	obj = bpf_object__open_file("./test_overhead.o", NULL);
+	if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj)))
+		return;
+
+	kprobe_prog = bpf_object__find_program_by_title(obj, kprobe_name);
+	if (CHECK(!kprobe_prog, "find_probe",
+		  "prog '%s' not found\n", kprobe_name))
+		goto cleanup;
+	kretprobe_prog = bpf_object__find_program_by_title(obj, kretprobe_name);
+	if (CHECK(!kretprobe_prog, "find_probe",
+		  "prog '%s' not found\n", kretprobe_name))
+		goto cleanup;
+	raw_tp_prog = bpf_object__find_program_by_title(obj, raw_tp_name);
+	if (CHECK(!raw_tp_prog, "find_probe",
+		  "prog '%s' not found\n", raw_tp_name))
+		goto cleanup;
+	fentry_prog = bpf_object__find_program_by_title(obj, fentry_name);
+	if (CHECK(!fentry_prog, "find_probe",
+		  "prog '%s' not found\n", fentry_name))
+		goto cleanup;
+	fexit_prog = bpf_object__find_program_by_title(obj, fexit_name);
+	if (CHECK(!fexit_prog, "find_probe",
+		  "prog '%s' not found\n", fexit_name))
+		goto cleanup;
+	err = bpf_object__load(obj);
+	if (CHECK(err, "obj_load", "err %d\n", err))
+		goto cleanup;
+
+	setaffinity();
+
+	/* base line run */
+	test_run("base");
+
+	/* attach kprobe */
+	link = bpf_program__attach_kprobe(kprobe_prog, false /* retprobe */,
+					  kprobe_func);
+	if (CHECK(IS_ERR(link), "attach_kprobe", "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	test_run("kprobe");
+	bpf_link__destroy(link);
+
+	/* attach kretprobe */
+	link = bpf_program__attach_kprobe(kretprobe_prog, true /* retprobe */,
+					  kprobe_func);
+	if (CHECK(IS_ERR(link), "attach kretprobe", "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	test_run("kretprobe");
+	bpf_link__destroy(link);
+
+	/* attach raw_tp (the check used to be mislabelled "attach fentry") */
+	link = bpf_program__attach_raw_tracepoint(raw_tp_prog, "task_rename");
+	if (CHECK(IS_ERR(link), "attach raw_tp", "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	test_run("raw_tp");
+	bpf_link__destroy(link);
+
+	/* attach fentry */
+	link = bpf_program__attach_trace(fentry_prog);
+	if (CHECK(IS_ERR(link), "attach fentry", "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	test_run("fentry");
+	bpf_link__destroy(link);
+
+	/* attach fexit */
+	link = bpf_program__attach_trace(fexit_prog);
+	if (CHECK(IS_ERR(link), "attach fexit", "err %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	test_run("fexit");
+	bpf_link__destroy(link);
+
+cleanup:
+	/* put the original task name back */
+	prctl(PR_SET_NAME, comm, 0L, 0L, 0L);
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/test_profiler.c b/tools/testing/selftests/bpf/prog_tests/test_profiler.c
new file mode 100644
index 000000000..4ca275101
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/test_profiler.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <test_progs.h>
+#include "progs/profiler.h"
+#include "profiler1.skel.h"
+#include "profiler2.skel.h"
+#include "profiler3.skel.h"
+
+/* Execute @prog once through bpf_prog_test_run_xattr() with a context made of
+ * three u64 values. Returns 0 when the run succeeded and the program's retval
+ * is 0, -1 otherwise (the failure is reported through CHECK).
+ */
+static int sanity_run(struct bpf_program *prog)
+{
+	__u64 ctx_args[] = {1, 2, 3};
+	struct bpf_prog_test_run_attr tattr = {
+		.prog_fd = bpf_program__fd(prog),
+		.ctx_in = ctx_args,
+		.ctx_size_in = sizeof(ctx_args),
+	};
+	__u32 duration = 0;
+	int err;
+
+	err = bpf_prog_test_run_xattr(&tattr);
+	if (!CHECK(err || tattr.retval, "test_run",
+		   "err %d errno %d retval %d duration %d\n",
+		   err, errno, tattr.retval, duration))
+		return 0;
+	return -1;
+}
+
+/* For each of the three profiler skeletons in turn: open+load it, attach it,
+ * and sanity-run its raw_tracepoint__sched_process_exec program. The first
+ * failure jumps to cleanup, which destroys all three skeleton pointers
+ * (those never opened are still NULL at that point).
+ */
+void test_test_profiler(void)
+{
+ struct profiler1 *profiler1_skel = NULL;
+ struct profiler2 *profiler2_skel = NULL;
+ struct profiler3 *profiler3_skel = NULL;
+ /* referenced implicitly by the CHECK() macro */
+ __u32 duration = 0;
+ int err;
+
+ profiler1_skel = profiler1__open_and_load();
+ if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n"))
+ goto cleanup;
+
+ err = profiler1__attach(profiler1_skel);
+ if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err))
+ goto cleanup;
+
+ if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec))
+ goto cleanup;
+
+ profiler2_skel = profiler2__open_and_load();
+ if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n"))
+ goto cleanup;
+
+ err = profiler2__attach(profiler2_skel);
+ if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err))
+ goto cleanup;
+
+ if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec))
+ goto cleanup;
+
+ profiler3_skel = profiler3__open_and_load();
+ if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton failed\n"))
+ goto cleanup;
+
+ err = profiler3__attach(profiler3_skel);
+ if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err))
+ goto cleanup;
+
+ if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec))
+ goto cleanup;
+cleanup:
+ profiler1__destroy(profiler1_skel);
+ profiler2__destroy(profiler2_skel);
+ profiler3__destroy(profiler3_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
new file mode 100644
index 000000000..fb095e5cd
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/tp_attach_query.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/* Attach num_progs copies of ./test_tracepoint.o to the sched_switch
+ * tracepoint, one perf event each, and after every attachment verify via
+ * PERF_EVENT_IOC_QUERY_BPF that the reported program count and ids match
+ * what has been attached so far. A few negative queries (bad pointer,
+ * too-small id array) are exercised while the second program is attached.
+ */
+void test_tp_attach_query(void)
+{
+	const int num_progs = 3;
+	int i, j, bytes, efd, err, prog_fd[num_progs], pmu_fd[num_progs];
+	__u32 duration = 0, info_len, saved_prog_ids[num_progs];
+	const char *file = "./test_tracepoint.o";
+	struct perf_event_query_bpf *query;
+	struct perf_event_attr attr = {};
+	struct bpf_object *obj[num_progs];
+	struct bpf_prog_info prog_info;
+	char buf[256];
+
+	for (i = 0; i < num_progs; i++)
+		obj[i] = NULL;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/sched/sched_switch/id");
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		return;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf),
+		  "read", "bytes %d errno %d\n", bytes, errno))
+		return;
+	/* read() does not NUL-terminate; without this strtol() would run into
+	 * the remains of the path string written by snprintf() above.
+	 */
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW | PERF_SAMPLE_CALLCHAIN;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	query = malloc(sizeof(*query) + sizeof(__u32) * num_progs);
+	/* nothing has been acquired yet, so a plain return is enough */
+	if (CHECK(!query, "malloc", "failed to allocate query buffer\n"))
+		return;
+
+	for (i = 0; i < num_progs; i++) {
+		err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj[i],
+				    &prog_fd[i]);
+		if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+			goto cleanup1;
+
+		bzero(&prog_info, sizeof(prog_info));
+		prog_info.jited_prog_len = 0;
+		prog_info.xlated_prog_len = 0;
+		prog_info.nr_map_ids = 0;
+		info_len = sizeof(prog_info);
+		err = bpf_obj_get_info_by_fd(prog_fd[i], &prog_info, &info_len);
+		if (CHECK(err, "bpf_obj_get_info_by_fd", "err %d errno %d\n",
+			  err, errno))
+			goto cleanup1;
+		saved_prog_ids[i] = prog_info.id;
+
+		pmu_fd[i] = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+				    0 /* cpu 0 */, -1 /* group id */,
+				    0 /* flags */);
+		if (CHECK(pmu_fd[i] < 0, "perf_event_open", "err %d errno %d\n",
+			  pmu_fd[i], errno))
+			goto cleanup2;
+		err = ioctl(pmu_fd[i], PERF_EVENT_IOC_ENABLE, 0);
+		if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+			  err, errno))
+			goto cleanup3;
+
+		if (i == 0) {
+			/* check NULL prog array query */
+			query->ids_len = num_progs;
+			err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query);
+			if (CHECK(err || query->prog_cnt != 0,
+				  "perf_event_ioc_query_bpf",
+				  "err %d errno %d query->prog_cnt %u\n",
+				  err, errno, query->prog_cnt))
+				goto cleanup3;
+		}
+
+		err = ioctl(pmu_fd[i], PERF_EVENT_IOC_SET_BPF, prog_fd[i]);
+		if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+			  err, errno))
+			goto cleanup3;
+
+		if (i == 1) {
+			/* try to get # of programs only */
+			query->ids_len = 0;
+			err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query);
+			if (CHECK(err || query->prog_cnt != 2,
+				  "perf_event_ioc_query_bpf",
+				  "err %d errno %d query->prog_cnt %u\n",
+				  err, errno, query->prog_cnt))
+				goto cleanup3;
+
+			/* try a few negative tests */
+			/* invalid query pointer */
+			err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF,
+				    (struct perf_event_query_bpf *)0x1);
+			if (CHECK(!err || errno != EFAULT,
+				  "perf_event_ioc_query_bpf",
+				  "err %d errno %d\n", err, errno))
+				goto cleanup3;
+
+			/* not enough space */
+			query->ids_len = 1;
+			err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query);
+			if (CHECK(!err || errno != ENOSPC || query->prog_cnt != 2,
+				  "perf_event_ioc_query_bpf",
+				  "err %d errno %d query->prog_cnt %u\n",
+				  err, errno, query->prog_cnt))
+				goto cleanup3;
+		}
+
+		query->ids_len = num_progs;
+		err = ioctl(pmu_fd[i], PERF_EVENT_IOC_QUERY_BPF, query);
+		if (CHECK(err || query->prog_cnt != (i + 1),
+			  "perf_event_ioc_query_bpf",
+			  "err %d errno %d query->prog_cnt %u\n",
+			  err, errno, query->prog_cnt))
+			goto cleanup3;
+		for (j = 0; j < i + 1; j++)
+			if (CHECK(saved_prog_ids[j] != query->ids[j],
+				  "perf_event_ioc_query_bpf",
+				  "#%d saved_prog_id %x query prog_id %x\n",
+				  j, saved_prog_ids[j], query->ids[j]))
+				goto cleanup3;
+	}
+
+	/* Tear down from the last slot to the first. The error paths above
+	 * jump straight into the loop body at the label matching how far
+	 * slot i got, then the loop continues with the earlier, fully set up
+	 * slots.
+	 */
+	i = num_progs - 1;
+	for (; i >= 0; i--) {
+ cleanup3:
+		ioctl(pmu_fd[i], PERF_EVENT_IOC_DISABLE);
+ cleanup2:
+		close(pmu_fd[i]);
+ cleanup1:
+		bpf_object__close(obj[i]);
+	}
+	free(query);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/trace_ext.c b/tools/testing/selftests/bpf/prog_tests/trace_ext.c
new file mode 100644
index 000000000..924441d43
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/trace_ext.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <sys/stat.h>
+#include <linux/sched.h>
+#include <sys/syscall.h>
+
+#include "test_pkt_md_access.skel.h"
+#include "test_trace_ext.skel.h"
+#include "test_trace_ext_tracing.skel.h"
+
+static __u32 duration;
+
+/* Chain three skeletons on top of each other:
+ *   1. test_pkt_md_access is loaded and attached (pkt_fd);
+ *   2. test_trace_ext's test_pkt_md_access_new gets pkt_fd as attach target;
+ *   3. test_trace_ext_tracing's fentry/fexit programs get the extension
+ *      (ext_fd) as attach target.
+ * A single bpf_prog_test_run() on pkt_fd must then leave ext_called non-zero
+ * and fentry_called/fexit_called equal to it.
+ */
+void test_trace_ext(void)
+{
+ struct test_pkt_md_access *skel_pkt = NULL;
+ struct test_trace_ext_tracing *skel_trace = NULL;
+ struct test_trace_ext_tracing__bss *bss_trace;
+ struct test_trace_ext *skel_ext = NULL;
+ struct test_trace_ext__bss *bss_ext;
+ int err, pkt_fd, ext_fd;
+ struct bpf_program *prog;
+ char buf[100];
+ __u32 retval;
+ __u64 len;
+
+ /* open/load/attach test_pkt_md_access */
+ skel_pkt = test_pkt_md_access__open_and_load();
+ if (CHECK(!skel_pkt, "setup", "classifier/test_pkt_md_access open failed\n"))
+ goto cleanup;
+
+ err = test_pkt_md_access__attach(skel_pkt);
+ if (CHECK(err, "setup", "classifier/test_pkt_md_access attach failed: %d\n", err))
+ goto cleanup;
+
+ prog = skel_pkt->progs.test_pkt_md_access;
+ pkt_fd = bpf_program__fd(prog);
+
+ /* open extension */
+ skel_ext = test_trace_ext__open();
+ if (CHECK(!skel_ext, "setup", "freplace/test_pkt_md_access open failed\n"))
+ goto cleanup;
+
+ /* set extension's attach target - test_pkt_md_access */
+ /* NOTE(review): the return value of set_attach_target is ignored here
+  * and below; a failure would presumably surface at load time - confirm.
+  */
+ prog = skel_ext->progs.test_pkt_md_access_new;
+ bpf_program__set_attach_target(prog, pkt_fd, "test_pkt_md_access");
+
+ /* load/attach extension */
+ err = test_trace_ext__load(skel_ext);
+ if (CHECK(err, "setup", "freplace/test_pkt_md_access load failed\n")) {
+ libbpf_strerror(err, buf, sizeof(buf));
+ fprintf(stderr, "%s\n", buf);
+ goto cleanup;
+ }
+
+ err = test_trace_ext__attach(skel_ext);
+ if (CHECK(err, "setup", "freplace/test_pkt_md_access attach failed: %d\n", err))
+ goto cleanup;
+
+ prog = skel_ext->progs.test_pkt_md_access_new;
+ ext_fd = bpf_program__fd(prog);
+
+ /* open tracing */
+ skel_trace = test_trace_ext_tracing__open();
+ if (CHECK(!skel_trace, "setup", "tracing/test_pkt_md_access_new open failed\n"))
+ goto cleanup;
+
+ /* set tracing's attach target - fentry */
+ prog = skel_trace->progs.fentry;
+ bpf_program__set_attach_target(prog, ext_fd, "test_pkt_md_access_new");
+
+ /* set tracing's attach target - fexit */
+ prog = skel_trace->progs.fexit;
+ bpf_program__set_attach_target(prog, ext_fd, "test_pkt_md_access_new");
+
+ /* load/attach tracing */
+ err = test_trace_ext_tracing__load(skel_trace);
+ if (CHECK(err, "setup", "tracing/test_pkt_md_access_new load failed\n")) {
+ libbpf_strerror(err, buf, sizeof(buf));
+ fprintf(stderr, "%s\n", buf);
+ goto cleanup;
+ }
+
+ err = test_trace_ext_tracing__attach(skel_trace);
+ if (CHECK(err, "setup", "tracing/test_pkt_md_access_new attach failed: %d\n", err))
+ goto cleanup;
+
+ /* trigger the test */
+ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ NULL, NULL, &retval, &duration);
+ CHECK(err || retval, "run", "err %d errno %d retval %d\n", err, errno, retval);
+
+ bss_ext = skel_ext->bss;
+ bss_trace = skel_trace->bss;
+
+ /* the extension's call count is the reference for fentry/fexit */
+ len = bss_ext->ext_called;
+
+ CHECK(bss_ext->ext_called == 0,
+ "check", "failed to trigger freplace/test_pkt_md_access\n");
+ CHECK(bss_trace->fentry_called != len,
+ "check", "failed to trigger fentry/test_pkt_md_access_new\n");
+ CHECK(bss_trace->fexit_called != len,
+ "check", "failed to trigger fexit/test_pkt_md_access_new\n");
+
+cleanup:
+ /* destroyed in reverse order of creation */
+ test_trace_ext_tracing__destroy(skel_trace);
+ test_trace_ext__destroy(skel_ext);
+ test_pkt_md_access__destroy(skel_pkt);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/trace_printk.c b/tools/testing/selftests/bpf/prog_tests/trace_printk.c
new file mode 100644
index 000000000..39b0decb1
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/trace_printk.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020, Oracle and/or its affiliates. */
+
+#include <test_progs.h>
+
+#include "trace_printk.skel.h"
+
+#define TRACEBUF "/sys/kernel/debug/tracing/trace_pipe"
+#define SEARCHMSG "testing,testing"
+
+/* Attach the trace_printk skeleton, give its program a chance to run, and
+ * verify that the SEARCHMSG text it emits shows up in TRACEBUF. The trace
+ * pipe is read non-blocking and the read loop is bounded, so a failing test
+ * cannot hang.
+ */
+void test_trace_printk(void)
+{
+	int err, iter = 0, duration = 0, found = 0;
+	struct trace_printk__bss *bss;
+	struct trace_printk *skel;
+	char *buf = NULL;
+	FILE *fp = NULL;
+	size_t buflen = 0;
+	ssize_t nread;
+
+	skel = trace_printk__open();
+	if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+		return;
+
+	err = trace_printk__load(skel);
+	if (CHECK(err, "skel_load", "failed to load skeleton: %d\n", err))
+		goto cleanup;
+
+	bss = skel->bss;
+
+	err = trace_printk__attach(skel);
+	if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+		goto cleanup;
+
+	fp = fopen(TRACEBUF, "r");
+	if (CHECK(fp == NULL, "could not open trace buffer",
+		  "error %d opening %s", errno, TRACEBUF))
+		goto cleanup;
+
+	/* We do not want to wait forever if this test fails... */
+	fcntl(fileno(fp), F_SETFL, O_NONBLOCK);
+
+	/* wait for tracepoint to trigger */
+	usleep(1);
+	trace_printk__detach(skel);
+
+	if (CHECK(bss->trace_printk_ran == 0,
+		  "bpf_trace_printk never ran",
+		  "ran == %d", bss->trace_printk_ran))
+		goto cleanup;
+
+	if (CHECK(bss->trace_printk_ret <= 0,
+		  "bpf_trace_printk returned <= 0 value",
+		  "got %d", bss->trace_printk_ret))
+		goto cleanup;
+
+	/* verify our search string is in the trace buffer */
+	for (;;) {
+		/* getline() is not guaranteed to set errno at EOF, so clear
+		 * it to avoid mistaking a stale EAGAIN for "no data yet".
+		 */
+		errno = 0;
+		nread = getline(&buf, &buflen, fp);
+		if (nread < 0 && errno != EAGAIN)
+			break;
+		/* Only look at buf when a line was actually read: after an
+		 * EAGAIN it still holds the previous line (or nothing at
+		 * all), which must not be counted again.
+		 */
+		if (nread >= 0 && strstr(buf, SEARCHMSG) != NULL)
+			found++;
+		if (found == bss->trace_printk_ran)
+			break;
+		if (++iter > 1000)
+			break;
+	}
+
+	if (CHECK(!found, "message from bpf_trace_printk not found",
+		  "no instance of %s in %s", SEARCHMSG, TRACEBUF))
+		goto cleanup;
+
+cleanup:
+	trace_printk__destroy(skel);
+	free(buf);
+	if (fp)
+		fclose(fp);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
new file mode 100644
index 000000000..781c8d116
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/prctl.h>
+#include <test_progs.h>
+
+#define MAX_TRAMP_PROGS 40
+
+struct inst {
+ struct bpf_object *obj;
+ struct bpf_link *link_fentry;
+ struct bpf_link *link_fexit;
+};
+
+/* Rename the current task by writing "test_overhead" (including its
+ * terminating NUL) to /proc/self/comm. Returns 0 on success, -1 if the file
+ * cannot be opened or the write fails.
+ */
+static int test_task_rename(void)
+{
+	char new_comm[] = "test_overhead";
+	int duration = 0, comm_fd, ret = 0;
+
+	comm_fd = open("/proc/self/comm", O_WRONLY|O_TRUNC);
+	if (CHECK(comm_fd < 0, "open /proc", "err %d", errno))
+		return -1;
+
+	if (write(comm_fd, new_comm, sizeof(new_comm)) < 0) {
+		/* report before close() gets a chance to touch errno */
+		CHECK(1, "task rename", "err %d", errno);
+		ret = -1;
+	}
+	close(comm_fd);
+	return ret;
+}
+
+/* Look up the program titled @name in @obj and attach it with
+ * bpf_program__attach_trace(). Returns the attach result, or
+ * ERR_PTR(-EINVAL) when no such program exists in the object.
+ */
+static struct bpf_link *load(struct bpf_object *obj, const char *name)
+{
+	struct bpf_program *tramp_prog;
+	int duration = 0;
+
+	tramp_prog = bpf_object__find_program_by_title(obj, name);
+	if (!CHECK(!tramp_prog, "find_probe", "prog '%s' not found\n", name))
+		return bpf_program__attach_trace(tramp_prog);
+	return ERR_PTR(-EINVAL);
+}
+
+/* Attach MAX_TRAMP_PROGS fentry/fexit programs (picked at random per slot)
+ * to __set_task_comm, then verify that one more attachment is rejected with
+ * -E2BIG, and finally rename the task once so the attached programs run.
+ */
+void test_trampoline_count(void)
+{
+	const char *fentry_name = "fentry/__set_task_comm";
+	const char *fexit_name = "fexit/__set_task_comm";
+	const char *object = "test_trampoline_count.o";
+	struct inst inst[MAX_TRAMP_PROGS] = {};
+	int err, i = 0, duration = 0;
+	struct bpf_object *obj;
+	struct bpf_link *link;
+	char comm[16] = {};
+
+	/* attach 'allowed' 40 trampoline programs */
+	for (i = 0; i < MAX_TRAMP_PROGS; i++) {
+		obj = bpf_object__open_file(object, NULL);
+		if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) {
+			obj = NULL;
+			goto cleanup;
+		}
+
+		err = bpf_object__load(obj);
+		/* obj is not yet owned by inst[i], so it has to be closed
+		 * through cleanup_extra or it would leak.
+		 */
+		if (CHECK(err, "obj_load", "err %d\n", err))
+			goto cleanup_extra;
+		inst[i].obj = obj;
+		obj = NULL;
+
+		if (rand() % 2) {
+			link = load(inst[i].obj, fentry_name);
+			if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) {
+				link = NULL;
+				goto cleanup;
+			}
+			inst[i].link_fentry = link;
+		} else {
+			link = load(inst[i].obj, fexit_name);
+			if (CHECK(IS_ERR(link), "attach prog", "err %ld\n", PTR_ERR(link))) {
+				link = NULL;
+				goto cleanup;
+			}
+			inst[i].link_fexit = link;
+		}
+	}
+
+	/* and try 1 extra.. */
+	obj = bpf_object__open_file(object, NULL);
+	if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) {
+		obj = NULL;
+		goto cleanup;
+	}
+
+	err = bpf_object__load(obj);
+	if (CHECK(err, "obj_load", "err %d\n", err))
+		goto cleanup_extra;
+
+	/* ..that needs to fail */
+	link = load(obj, fentry_name);
+	if (CHECK(!IS_ERR(link), "cannot attach over the limit", "err %ld\n", PTR_ERR(link))) {
+		bpf_link__destroy(link);
+		goto cleanup_extra;
+	}
+
+	/* with E2BIG error */
+	CHECK(PTR_ERR(link) != -E2BIG, "proper error check", "err %ld\n", PTR_ERR(link));
+
+	/* and finally execute the probe; the original comm is saved first
+	 * and restored afterwards
+	 */
+	if (CHECK_FAIL(prctl(PR_GET_NAME, comm, 0L, 0L, 0L)))
+		goto cleanup_extra;
+	CHECK_FAIL(test_task_rename());
+	CHECK_FAIL(prctl(PR_SET_NAME, comm, 0L, 0L, 0L));
+
+cleanup_extra:
+	bpf_object__close(obj);
+cleanup:
+	/* after a complete loop i == MAX_TRAMP_PROGS; clamp to the last slot */
+	if (i >= MAX_TRAMP_PROGS)
+		i = MAX_TRAMP_PROGS - 1;
+	for (; i >= 0; i--) {
+		bpf_link__destroy(inst[i].link_fentry);
+		bpf_link__destroy(inst[i].link_fexit);
+		bpf_object__close(inst[i].obj);
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/udp_limit.c b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
new file mode 100644
index 000000000..2aba09d4d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/udp_limit.c
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include "udp_limit.skel.h"
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+static int duration;
+
+/* Join the "/udp_limit" cgroup, attach the skeleton's sock and sock_release
+ * programs to it, and check that only one UDP socket can exist at a time:
+ * the second socket() must fail, and a new one must succeed once the first
+ * is closed. Finally the program's invocation and in_use counters in .bss
+ * are compared against the expected values.
+ */
+void test_udp_limit(void)
+{
+ struct udp_limit *skel;
+ int fd1 = -1, fd2 = -1;
+ int cgroup_fd;
+
+ cgroup_fd = test__join_cgroup("/udp_limit");
+ if (CHECK(cgroup_fd < 0, "cg-join", "errno %d", errno))
+ return;
+
+ skel = udp_limit__open_and_load();
+ if (CHECK(!skel, "skel-load", "errno %d", errno))
+ goto close_cgroup_fd;
+
+ /* links are stored in the skeleton, so udp_limit__destroy() below is
+  * the only teardown call made for them
+  */
+ skel->links.sock = bpf_program__attach_cgroup(skel->progs.sock, cgroup_fd);
+ skel->links.sock_release = bpf_program__attach_cgroup(skel->progs.sock_release, cgroup_fd);
+ if (CHECK(IS_ERR(skel->links.sock) || IS_ERR(skel->links.sock_release),
+ "cg-attach", "sock %ld sock_release %ld",
+ PTR_ERR(skel->links.sock),
+ PTR_ERR(skel->links.sock_release)))
+ goto close_skeleton;
+
+ /* BPF program enforces a single UDP socket per cgroup,
+ * verify that.
+ */
+ fd1 = socket(AF_INET, SOCK_DGRAM, 0);
+ if (CHECK(fd1 < 0, "fd1", "errno %d", errno))
+ goto close_skeleton;
+
+ /* this one is expected to be rejected */
+ fd2 = socket(AF_INET, SOCK_DGRAM, 0);
+ if (CHECK(fd2 >= 0, "fd2", "errno %d", errno))
+ goto close_skeleton;
+
+ /* We can reopen again after close. */
+ close(fd1);
+ fd1 = -1;
+
+ fd1 = socket(AF_INET, SOCK_DGRAM, 0);
+ if (CHECK(fd1 < 0, "fd1-again", "errno %d", errno))
+ goto close_skeleton;
+
+ /* Make sure the program was invoked the expected
+ * number of times:
+ * - open fd1 - BPF_CGROUP_INET_SOCK_CREATE
+ * - attempt to open fd2 - BPF_CGROUP_INET_SOCK_CREATE
+ * - close fd1 - BPF_CGROUP_INET_SOCK_RELEASE
+ * - open fd1 again - BPF_CGROUP_INET_SOCK_CREATE
+ */
+ if (CHECK(skel->bss->invocations != 4, "bss-invocations",
+ "invocations=%d", skel->bss->invocations))
+ goto close_skeleton;
+
+ /* We should still have a single socket in use */
+ if (CHECK(skel->bss->in_use != 1, "bss-in_use",
+ "in_use=%d", skel->bss->in_use))
+ goto close_skeleton;
+
+close_skeleton:
+ /* -1 marks a descriptor that is not currently open */
+ if (fd1 >= 0)
+ close(fd1);
+ if (fd2 >= 0)
+ close(fd2);
+ udp_limit__destroy(skel);
+close_cgroup_fd:
+ close(cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/varlen.c b/tools/testing/selftests/bpf/prog_tests/varlen.c
new file mode 100644
index 000000000..dd324b493
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/varlen.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <time.h>
+#include "test_varlen.skel.h"
+
+#define CHECK_VAL(got, exp) \
+ CHECK((got) != (exp), "check", "got %ld != exp %ld\n", \
+ (long)(got), (long)(exp))
+
+/* Feed two NUL-terminated strings to the test_varlen programs through .bss
+ * and check, for each of the four payload buffers, the two recorded lengths,
+ * their total, and that the buffer content equals both strings back to back
+ * (terminators included).
+ */
+void test_varlen(void)
+{
+ int duration = 0, err;
+ struct test_varlen* skel;
+ struct test_varlen__bss *bss;
+ struct test_varlen__data *data;
+ const char str1[] = "Hello, ";
+ const char str2[] = "World!";
+ const char exp_str[] = "Hello, \0World!\0";
+ /* sizeof() on the arrays counts the terminating NUL of each string */
+ const int size1 = sizeof(str1);
+ const int size2 = sizeof(str2);
+
+ skel = test_varlen__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+ return;
+ bss = skel->bss;
+ data = skel->data;
+
+ err = test_varlen__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+
+ bss->test_pid = getpid();
+
+ /* trigger everything */
+ memcpy(bss->buf_in1, str1, size1);
+ memcpy(bss->buf_in2, str2, size2);
+ /* capture is switched on only around the usleep() call */
+ bss->capture = true;
+ usleep(1);
+ bss->capture = false;
+
+ CHECK_VAL(bss->payload1_len1, size1);
+ CHECK_VAL(bss->payload1_len2, size2);
+ CHECK_VAL(bss->total1, size1 + size2);
+ CHECK(memcmp(bss->payload1, exp_str, size1 + size2), "content_check",
+ "doesn't match!\n");
+
+ CHECK_VAL(data->payload2_len1, size1);
+ CHECK_VAL(data->payload2_len2, size2);
+ CHECK_VAL(data->total2, size1 + size2);
+ CHECK(memcmp(data->payload2, exp_str, size1 + size2), "content_check",
+ "doesn't match!\n");
+
+ CHECK_VAL(data->payload3_len1, size1);
+ CHECK_VAL(data->payload3_len2, size2);
+ CHECK_VAL(data->total3, size1 + size2);
+ CHECK(memcmp(data->payload3, exp_str, size1 + size2), "content_check",
+ "doesn't match!\n");
+
+ CHECK_VAL(data->payload4_len1, size1);
+ CHECK_VAL(data->payload4_len2, size2);
+ CHECK_VAL(data->total4, size1 + size2);
+ CHECK(memcmp(data->payload4, exp_str, size1 + size2), "content_check",
+ "doesn't match!\n");
+cleanup:
+ test_varlen__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/vmlinux.c b/tools/testing/selftests/bpf/prog_tests/vmlinux.c
new file mode 100644
index 000000000..72310cfc6
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/vmlinux.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <test_progs.h>
+#include <time.h>
+#include "test_vmlinux.skel.h"
+
+#define MY_TV_NSEC 1337
+
+/* Issue a raw nanosleep syscall for MY_TV_NSEC nanoseconds. The result is
+ * deliberately ignored: the call only serves as a trigger.
+ */
+static void nsleep(void)
+{
+	struct timespec ts = { .tv_nsec = MY_TV_NSEC };
+
+	(void)syscall(__NR_nanosleep, &ts, NULL);
+}
+
+/* Load and attach the test_vmlinux skeleton, call nsleep() once, and check
+ * that each of the five "called" flags in .bss (tp, raw_tp, tp_btf, kprobe,
+ * fentry) has been set.
+ */
+void test_vmlinux(void)
+{
+ int duration = 0, err;
+ struct test_vmlinux* skel;
+ struct test_vmlinux__bss *bss;
+
+ skel = test_vmlinux__open_and_load();
+ if (CHECK(!skel, "skel_open", "failed to open skeleton\n"))
+ return;
+ bss = skel->bss;
+
+ err = test_vmlinux__attach(skel);
+ if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
+ goto cleanup;
+
+ /* trigger everything */
+ nsleep();
+
+ /* every check runs even if an earlier one failed */
+ CHECK(!bss->tp_called, "tp", "not called\n");
+ CHECK(!bss->raw_tp_called, "raw_tp", "not called\n");
+ CHECK(!bss->tp_btf_called, "tp_btf", "not called\n");
+ CHECK(!bss->kprobe_called, "kprobe", "not called\n");
+ CHECK(!bss->fentry_called, "fentry", "not called\n");
+
+cleanup:
+ test_vmlinux__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp.c b/tools/testing/selftests/bpf/prog_tests/xdp.c
new file mode 100644
index 000000000..48921ff74
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Load ./test_xdp.o, insert one IPv4 and one IPv6 entry into its vip2tnl
+ * map, then run the program on pkt_v4 and pkt_v6. Both runs must return
+ * XDP_TX; the expected output is 74 bytes with protocol IPPROTO_IPIP for v4
+ * and 114 bytes with nexthdr IPPROTO_IPV6 for v6.
+ */
+void test_xdp(void)
+{
+ struct vip key4 = {.protocol = 6, .family = AF_INET};
+ struct vip key6 = {.protocol = 6, .family = AF_INET6};
+ struct iptnl_info value4 = {.family = AF_INET};
+ struct iptnl_info value6 = {.family = AF_INET6};
+ const char *file = "./test_xdp.o";
+ struct bpf_object *obj;
+ char buf[128];
+ /* both views point just past the Ethernet header of the output packet */
+ struct ipv6hdr *iph6 = (void *)buf + sizeof(struct ethhdr);
+ struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+ __u32 duration, retval, size;
+ int err, prog_fd, map_fd;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* NOTE(review): no failure is recorded here when the map is missing;
+  * presumably bpf_find_map() reports it itself - confirm.
+  */
+ map_fd = bpf_find_map(__func__, obj, "vip2tnl");
+ if (map_fd < 0)
+ goto out;
+ /* NOTE(review): update results are not checked; a failed update would
+  * only show up through the run checks below.
+  */
+ bpf_map_update_elem(map_fd, &key4, &value4, 0);
+ bpf_map_update_elem(map_fd, &key6, &value6, 0);
+
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+
+ CHECK(err || retval != XDP_TX || size != 74 ||
+ iph->protocol != IPPROTO_IPIP, "ipv4",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_TX || size != 114 ||
+ iph6->nexthdr != IPPROTO_IPV6, "ipv6",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+out:
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
new file mode 100644
index 000000000..d5c98f2cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_adjust_tail.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+
+/* Run ./test_xdp_adjust_tail_shrink.o on pkt_v4 (expected verdict XDP_DROP)
+ * and on pkt_v6 (expected verdict XDP_TX with the output 20 bytes shorter
+ * than the input).
+ */
+void test_xdp_adjust_tail_shrink(void)
+{
+ const char *file = "./test_xdp_adjust_tail_shrink.o";
+ __u32 duration, retval, size, expect_sz;
+ struct bpf_object *obj;
+ int err, prog_fd;
+ char buf[128];
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+
+ CHECK(err || retval != XDP_DROP,
+ "ipv4", "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+
+ expect_sz = sizeof(pkt_v6) - 20; /* Test shrink with 20 bytes */
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_TX || size != expect_sz,
+ "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, retval, size, expect_sz);
+ bpf_object__close(obj);
+}
+
+/* Run ./test_xdp_adjust_tail_grow.o on pkt_v4 (expected verdict XDP_DROP)
+ * and on pkt_v6 (expected verdict XDP_TX with the output 40 bytes longer
+ * than the input).
+ */
+void test_xdp_adjust_tail_grow(void)
+{
+ const char *file = "./test_xdp_adjust_tail_grow.o";
+ struct bpf_object *obj;
+ char buf[4096]; /* avoid segfault: large buf to hold grow results */
+ __u32 duration, retval, size, expect_sz;
+ int err, prog_fd;
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_DROP,
+ "ipv4", "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size);
+
+ expect_sz = sizeof(pkt_v6) + 40; /* Test grow with 40 bytes */
+ err = bpf_prog_test_run(prog_fd, 1, &pkt_v6, sizeof(pkt_v6) /* 74 */,
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != XDP_TX || size != expect_sz,
+ "ipv6", "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, retval, size, expect_sz);
+
+ bpf_object__close(obj);
+}
+
+/* Exercise ./test_xdp_adjust_tail_grow.o with two synthetic packets whose
+ * input size selects the test case:
+ *   - 64 bytes in, output limited to 128: the run is expected to report
+ *     ENOSPC while still returning XDP_TX and a grown size of 192;
+ *   - 128 bytes in, full-size output buffer: the packet is expected to grow
+ *     to max_grow bytes, with the added bytes being zero.
+ * The same buffer is used for input and output, pre-filled with a marker
+ * byte so that bytes written back by the run can be told apart.
+ */
+void test_xdp_adjust_tail_grow2(void)
+{
+ const char *file = "./test_xdp_adjust_tail_grow.o";
+ char buf[4096]; /* avoid segfault: large buf to hold grow results */
+ int tailroom = 320; /* SKB_DATA_ALIGN(sizeof(struct skb_shared_info))*/;
+ struct bpf_object *obj;
+ int err, cnt, i;
+ int max_grow;
+
+ struct bpf_prog_test_run_attr tattr = {
+ .repeat = 1,
+ .data_in = &buf,
+ .data_out = &buf,
+ .data_size_in = 0, /* Per test */
+ .data_size_out = 0, /* Per test */
+ };
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &tattr.prog_fd);
+ if (CHECK_ATTR(err, "load", "err %d errno %d\n", err, errno))
+ return;
+
+ /* Test case-64 */
+ memset(buf, 1, sizeof(buf));
+ tattr.data_size_in = 64; /* Determine test case via pkt size */
+ tattr.data_size_out = 128; /* Limit copy_size */
+ /* Kernel side alloc packet memory area that is zero init */
+ err = bpf_prog_test_run_xattr(&tattr);
+
+ CHECK_ATTR(errno != ENOSPC /* Due limit copy_size in bpf_test_finish */
+ || tattr.retval != XDP_TX
+ || tattr.data_size_out != 192, /* Expected grow size */
+ "case-64",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out);
+
+ /* Extra checks for data contents */
+ CHECK_ATTR(tattr.data_size_out != 192
+ || buf[0] != 1 || buf[63] != 1 /* 0-63 memset to 1 */
+ || buf[64] != 0 || buf[127] != 0 /* 64-127 memset to 0 */
+ || buf[128] != 1 || buf[191] != 1, /*128-191 memset to 1 */
+ "case-64-data",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out);
+
+ /* Test case-128 */
+ memset(buf, 2, sizeof(buf));
+ tattr.data_size_in = 128; /* Determine test case via pkt size */
+ tattr.data_size_out = sizeof(buf); /* Copy everything */
+ err = bpf_prog_test_run_xattr(&tattr);
+
+ max_grow = 4096 - XDP_PACKET_HEADROOM - tailroom; /* 3520 */
+ CHECK_ATTR(err
+ || tattr.retval != XDP_TX
+ || tattr.data_size_out != max_grow,/* Expect max grow size */
+ "case-128",
+ "err %d errno %d retval %d size %d expect-size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out, max_grow);
+
+ /* Extra checks for data content: Count grow size, will contain zeros */
+ /* buf was filled with 2 above, so every zero byte was written by the run */
+ for (i = 0, cnt = 0; i < sizeof(buf); i++) {
+ if (buf[i] == 0)
+ cnt++;
+ }
+ CHECK_ATTR((cnt != (max_grow - tattr.data_size_in)) /* Grow increase */
+ || tattr.data_size_out != max_grow, /* Total grow size */
+ "case-128-data",
+ "err %d errno %d retval %d size %d grow-size %d\n",
+ err, errno, tattr.retval, tattr.data_size_out, cnt);
+
+ bpf_object__close(obj);
+}
+
+/* Entry point: offer each xdp_adjust_tail subtest to the test runner, in
+ * declaration order, and run the ones it selects.
+ */
+void test_xdp_adjust_tail(void)
+{
+	static const struct {
+		const char *name;
+		void (*run)(void);
+	} subtests[] = {
+		{ "xdp_adjust_tail_shrink", test_xdp_adjust_tail_shrink },
+		{ "xdp_adjust_tail_grow", test_xdp_adjust_tail_grow },
+		{ "xdp_adjust_tail_grow2", test_xdp_adjust_tail_grow2 },
+	};
+	size_t idx;
+
+	for (idx = 0; idx < sizeof(subtests) / sizeof(subtests[0]); idx++) {
+		if (test__start_subtest(subtests[idx].name))
+			subtests[idx].run();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
new file mode 100644
index 000000000..15ef35314
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_attach.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+#define IFINDEX_LO 1
+#define XDP_FLAGS_REPLACE (1U << 4)
+
+/* Exercise XDP attach/replace/remove on the loopback device using the
+ * old_fd option of bpf_set_link_xdp_fd_opts(): three copies of ./test_xdp.o
+ * are loaded; the test checks that attaching, replacing and removing only
+ * succeed when old_fd names the currently attached program, and that the
+ * attached program id reported for the device follows along.
+ *
+ * Every failure that occurs after a program has been attached goes through
+ * the "out" label so that nothing is left attached to the device.
+ */
+void test_xdp_attach(void)
+{
+	__u32 duration = 0, id1, id2, id0 = 0, len;
+	struct bpf_object *obj1, *obj2, *obj3;
+	const char *file = "./test_xdp.o";
+	struct bpf_prog_info info = {};
+	int err, fd1, fd2, fd3;
+	DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
+			    .old_fd = -1);
+
+	len = sizeof(info);
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj1, &fd1);
+	if (CHECK_FAIL(err))
+		return;
+	err = bpf_obj_get_info_by_fd(fd1, &info, &len);
+	if (CHECK_FAIL(err))
+		goto out_1;
+	id1 = info.id;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj2, &fd2);
+	if (CHECK_FAIL(err))
+		goto out_1;
+
+	memset(&info, 0, sizeof(info));
+	err = bpf_obj_get_info_by_fd(fd2, &info, &len);
+	if (CHECK_FAIL(err))
+		goto out_2;
+	id2 = info.id;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj3, &fd3);
+	if (CHECK_FAIL(err))
+		goto out_2;
+
+	/* old_fd == -1 with XDP_FLAGS_REPLACE: attach expecting no program */
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd1, XDP_FLAGS_REPLACE,
+				       &opts);
+	if (CHECK(err, "load_ok", "initial load failed"))
+		goto out_close;
+
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	if (CHECK(err || id0 != id1, "id1_check",
+		  "loaded prog id %u != id1 %u, err %d", id0, id1, err))
+		goto out;	/* fd1 is attached: detach before closing */
+
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, XDP_FLAGS_REPLACE,
+				       &opts);
+	if (CHECK(!err, "load_fail", "load with expected id didn't fail"))
+		goto out;
+
+	opts.old_fd = fd1;
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd2, 0, &opts);
+	if (CHECK(err, "replace_ok", "replace valid old_fd failed"))
+		goto out;
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	if (CHECK(err || id0 != id2, "id2_check",
+		  "loaded prog id %u != id2 %u, err %d", id0, id2, err))
+		goto out;	/* a program is attached: detach before closing */
+
+	/* old_fd is still fd1, which is no longer the attached program */
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, fd3, 0, &opts);
+	if (CHECK(!err, "replace_fail", "replace invalid old_fd didn't fail"))
+		goto out;
+
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts);
+	if (CHECK(!err, "remove_fail", "remove invalid old_fd didn't fail"))
+		goto out;
+
+	opts.old_fd = fd2;
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, 0, &opts);
+	if (CHECK(err, "remove_ok", "remove valid old_fd failed"))
+		goto out;
+
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	if (CHECK(err || id0 != 0, "unload_check",
+		  "loaded prog id %u != 0, err %d", id0, err))
+		goto out;	/* something may still be attached */
+out:
+	/* unconditional detach, regardless of what is currently attached */
+	bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+out_close:
+	bpf_object__close(obj3);
+out_2:
+	bpf_object__close(obj2);
+out_1:
+	bpf_object__close(obj1);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
new file mode 100644
index 000000000..2c6c570b2
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_bpf2bpf.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include <net/if.h>
+#include "test_xdp.skel.h"
+#include "test_xdp_bpf2bpf.skel.h"
+
+/* Header expected at the start of each perf sample; on_sample() reads the
+ * packet copy that follows it at data + sizeof(struct meta).
+ */
+struct meta {
+ int ifindex; /* checked against if_nametoindex("lo") in on_sample() */
+ int pkt_len; /* checked against sizeof(pkt_v4) in on_sample() */
+};
+
+/* Perf buffer sample callback (installed through pb_opts.sample_cb).
+ *
+ * @ctx:  points at a bool that is set to true only if every check passes
+ * @cpu:  unused
+ * @data: sample payload, expected to be a struct meta followed by a copy
+ *        of pkt_v4
+ * @size: number of bytes available at @data
+ */
+static void on_sample(void *ctx, int cpu, void *data, __u32 size)
+{
+	int duration = 0;
+	struct meta *meta = (struct meta *)data;
+	struct ipv4_packet *trace_pkt_v4 = data + sizeof(*meta);
+
+	/* Reject short samples before meta or the packet copy is read. */
+	if (CHECK(size < sizeof(pkt_v4) + sizeof(*meta),
+		  "check_size", "size %u < %zu\n",
+		  size, sizeof(pkt_v4) + sizeof(*meta)))
+		return;
+
+	if (CHECK(meta->ifindex != if_nametoindex("lo"), "check_meta_ifindex",
+		  "meta->ifindex = %d\n", meta->ifindex))
+		return;
+
+	/* On mismatch report the value actually received, not the expected
+	 * size, so the failure message is useful for debugging.
+	 */
+	if (CHECK(meta->pkt_len != sizeof(pkt_v4), "check_meta_pkt_len",
+		  "meta->pkt_len = %d\n", meta->pkt_len))
+		return;
+
+	if (CHECK(memcmp(trace_pkt_v4, &pkt_v4, sizeof(pkt_v4)),
+		  "check_packet_content", "content not the same\n"))
+		return;
+
+	*(bool *)ctx = true;
+}
+
+/* Attach fentry/fexit tracing programs to the _xdp_tx_iptunnel XDP program,
+ * run that program once on pkt_v4, then verify both the perf buffer sample
+ * (through on_sample()) and the results stored in the tracing skeleton's bss.
+ */
+void test_xdp_bpf2bpf(void)
+{
+ __u32 duration = 0, retval, size;
+ char buf[128];
+ int err, pkt_fd, map_fd;
+ bool passed = false;
+ struct iphdr *iph = (void *)buf + sizeof(struct ethhdr);
+ struct iptnl_info value4 = {.family = AF_INET};
+ struct test_xdp *pkt_skel = NULL;
+ struct test_xdp_bpf2bpf *ftrace_skel = NULL;
+ struct vip key4 = {.protocol = 6, .family = AF_INET};
+ struct bpf_program *prog;
+ struct perf_buffer *pb = NULL;
+ struct perf_buffer_opts pb_opts = {};
+
+ /* Load XDP program to introspect */
+ pkt_skel = test_xdp__open_and_load();
+ if (CHECK(!pkt_skel, "pkt_skel_load", "test_xdp skeleton failed\n"))
+ return;
+
+ pkt_fd = bpf_program__fd(pkt_skel->progs._xdp_tx_iptunnel);
+
+ /* NOTE(review): the result of this map update is not checked. */
+ map_fd = bpf_map__fd(pkt_skel->maps.vip2tnl);
+ bpf_map_update_elem(map_fd, &key4, &value4, 0);
+
+ /* Load trace program */
+ ftrace_skel = test_xdp_bpf2bpf__open();
+ if (CHECK(!ftrace_skel, "__open", "ftrace skeleton failed\n"))
+ goto out;
+
+ /* Demonstrate the bpf_program__set_attach_target() API rather than
+ * the load with options, i.e. opts.attach_prog_fd.
+ */
+ prog = ftrace_skel->progs.trace_on_entry;
+ bpf_program__set_expected_attach_type(prog, BPF_TRACE_FENTRY);
+ bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel");
+
+ prog = ftrace_skel->progs.trace_on_exit;
+ bpf_program__set_expected_attach_type(prog, BPF_TRACE_FEXIT);
+ bpf_program__set_attach_target(prog, pkt_fd, "_xdp_tx_iptunnel");
+
+ err = test_xdp_bpf2bpf__load(ftrace_skel);
+ if (CHECK(err, "__load", "ftrace skeleton failed\n"))
+ goto out;
+
+ err = test_xdp_bpf2bpf__attach(ftrace_skel);
+ if (CHECK(err, "ftrace_attach", "ftrace attach failed: %d\n", err))
+ goto out;
+
+ /* Set up perf buffer */
+ pb_opts.sample_cb = on_sample;
+ pb_opts.ctx = &passed;
+ pb = perf_buffer__new(bpf_map__fd(ftrace_skel->maps.perf_buf_map),
+ 1, &pb_opts);
+ if (CHECK(IS_ERR(pb), "perf_buf__new", "err %ld\n", PTR_ERR(pb)))
+ goto out;
+
+ /* Run test program */
+ err = bpf_prog_test_run(pkt_fd, 1, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+
+ if (CHECK(err || retval != XDP_TX || size != 74 ||
+ iph->protocol != IPPROTO_IPIP, "ipv4",
+ "err %d errno %d retval %d size %d\n",
+ err, errno, retval, size))
+ goto out;
+
+ /* Make sure bpf_xdp_output() was triggered and it sent the expected
+ * data to the perf ring buffer.
+ */
+ err = perf_buffer__poll(pb, 100);
+ if (CHECK(err < 0, "perf_buffer__poll", "err %d\n", err))
+ goto out;
+
+ /* passed is set by on_sample() only when all of its checks succeed. */
+ CHECK_FAIL(!passed);
+
+ /* Verify test results */
+ if (CHECK(ftrace_skel->bss->test_result_fentry != if_nametoindex("lo"),
+ "result", "fentry failed err %llu\n",
+ ftrace_skel->bss->test_result_fentry))
+ goto out;
+
+ CHECK(ftrace_skel->bss->test_result_fexit != XDP_TX, "result",
+ "fexit failed err %llu\n", ftrace_skel->bss->test_result_fexit);
+
+out:
+ /* NOTE(review): on the IS_ERR(pb) path pb is a non-NULL error pointer
+ * and is still handed to perf_buffer__free(); confirm the library
+ * tolerates that.
+ */
+ if (pb)
+ perf_buffer__free(pb);
+ test_xdp__destroy(pkt_skel);
+ test_xdp_bpf2bpf__destroy(ftrace_skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
new file mode 100644
index 000000000..0176573fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/bpf.h>
+#include <linux/if_link.h>
+#include <test_progs.h>
+
+#include "test_xdp_with_cpumap_helpers.skel.h"
+
+#define IFINDEX_LO 1
+
+/* Check which programs may be attached to a device or stored in a cpumap
+ * entry: attaching xdp_redir_prog or xdp_dummy_cm to IFINDEX_LO in SKB mode
+ * is expected to fail, storing xdp_dummy_cm in the cpumap is expected to
+ * succeed, and storing xdp_dummy_prog in the cpumap is expected to fail.
+ */
+void test_xdp_with_cpumap_helpers(void)
+{
+ struct test_xdp_with_cpumap_helpers *skel;
+ struct bpf_prog_info info = {};
+ struct bpf_cpumap_val val = {
+ .qsize = 192,
+ };
+ __u32 duration = 0, idx = 0;
+ __u32 len = sizeof(info);
+ int err, prog_fd, map_fd;
+
+ skel = test_xdp_with_cpumap_helpers__open_and_load();
+ if (CHECK_FAIL(!skel)) {
+ perror("test_xdp_with_cpumap_helpers__open_and_load");
+ return;
+ }
+
+ /* can not attach program with cpumaps that allow programs
+ * as xdp generic
+ */
+ prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
+ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+ CHECK(err == 0, "Generic attach of program with 8-byte CPUMAP",
+ "should have failed\n");
+
+ /* Fetch the id of xdp_dummy_cm so it can be compared with the id
+ * read back from the cpumap entry below.
+ */
+ prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm);
+ map_fd = bpf_map__fd(skel->maps.cpu_map);
+ err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
+ if (CHECK_FAIL(err))
+ goto out_close;
+
+ val.bpf_prog.fd = prog_fd;
+ err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+ CHECK(err, "Add program to cpumap entry", "err %d errno %d\n",
+ err, errno);
+
+ err = bpf_map_lookup_elem(map_fd, &idx, &val);
+ CHECK(err, "Read cpumap entry", "err %d errno %d\n", err, errno);
+ CHECK(info.id != val.bpf_prog.id, "Expected program id in cpumap entry",
+ "expected %u read %u\n", info.id, val.bpf_prog.id);
+
+ /* can not attach BPF_XDP_CPUMAP program to a device */
+ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+ CHECK(err == 0, "Attach of BPF_XDP_CPUMAP program",
+ "should have failed\n");
+
+ /* val was overwritten by the lookup above, so set it up again. */
+ val.qsize = 192;
+ val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
+ err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+ CHECK(err == 0, "Add non-BPF_XDP_CPUMAP program to cpumap entry",
+ "should have failed\n");
+
+out_close:
+ test_xdp_with_cpumap_helpers__destroy(skel);
+}
+
+/* Entry point: runs the cpumap checks as the "cpumap_with_progs" subtest
+ * when test__start_subtest() selects it.
+ */
+void test_xdp_cpumap_attach(void)
+{
+	if (!test__start_subtest("cpumap_with_progs"))
+		return;
+
+	test_xdp_with_cpumap_helpers();
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
new file mode 100644
index 000000000..88ef3ec8a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <uapi/linux/bpf.h>
+#include <linux/if_link.h>
+#include <test_progs.h>
+
+#include "test_xdp_devmap_helpers.skel.h"
+#include "test_xdp_with_devmap_helpers.skel.h"
+
+#define IFINDEX_LO 1
+
+/* Check which programs may be attached to a device or stored in a devmap
+ * entry: attaching xdp_redir_prog or xdp_dummy_dm to IFINDEX_LO in SKB mode
+ * is expected to fail, storing xdp_dummy_dm in the devmap is expected to
+ * succeed, and storing xdp_dummy_prog in the devmap is expected to fail.
+ */
+void test_xdp_with_devmap_helpers(void)
+{
+ struct test_xdp_with_devmap_helpers *skel;
+ struct bpf_prog_info info = {};
+ struct bpf_devmap_val val = {
+ .ifindex = IFINDEX_LO,
+ };
+ __u32 len = sizeof(info);
+ __u32 duration = 0, idx = 0;
+ int err, dm_fd, map_fd;
+
+
+ skel = test_xdp_with_devmap_helpers__open_and_load();
+ if (CHECK_FAIL(!skel)) {
+ perror("test_xdp_with_devmap_helpers__open_and_load");
+ return;
+ }
+
+ /* can not attach program with DEVMAPs that allow programs
+ * as xdp generic
+ */
+ dm_fd = bpf_program__fd(skel->progs.xdp_redir_prog);
+ err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
+ CHECK(err == 0, "Generic attach of program with 8-byte devmap",
+ "should have failed\n");
+
+ /* Fetch the id of xdp_dummy_dm so it can be compared with the id
+ * read back from the devmap entry below.
+ */
+ dm_fd = bpf_program__fd(skel->progs.xdp_dummy_dm);
+ map_fd = bpf_map__fd(skel->maps.dm_ports);
+ err = bpf_obj_get_info_by_fd(dm_fd, &info, &len);
+ if (CHECK_FAIL(err))
+ goto out_close;
+
+ val.bpf_prog.fd = dm_fd;
+ err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+ CHECK(err, "Add program to devmap entry",
+ "err %d errno %d\n", err, errno);
+
+ err = bpf_map_lookup_elem(map_fd, &idx, &val);
+ CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno);
+ CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry",
+ "expected %u read %u\n", info.id, val.bpf_prog.id);
+
+ /* can not attach BPF_XDP_DEVMAP program to a device */
+ err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
+ CHECK(err == 0, "Attach of BPF_XDP_DEVMAP program",
+ "should have failed\n");
+
+ /* val was overwritten by the lookup above, so set it up again
+ * (1 is the same value as IFINDEX_LO).
+ */
+ val.ifindex = 1;
+ val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
+ err = bpf_map_update_elem(map_fd, &idx, &val, 0);
+ CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry",
+ "should have failed\n");
+
+out_close:
+ test_xdp_with_devmap_helpers__destroy(skel);
+}
+
+/* Negative test: loading the test_xdp_devmap_helpers skeleton is expected to
+ * fail, so a non-NULL skeleton is reported as a failure and then released.
+ */
+void test_neg_xdp_devmap_helpers(void)
+{
+	struct test_xdp_devmap_helpers *skel;
+	__u32 duration = 0;
+
+	skel = test_xdp_devmap_helpers__open_and_load();
+	if (!CHECK(skel,
+		   "Load of XDP program accessing egress ifindex without attach type",
+		   "should have failed\n"))
+		return;
+
+	/* The load unexpectedly succeeded: free what was loaded. */
+	test_xdp_devmap_helpers__destroy(skel);
+}
+
+
+/* Entry point: runs each devmap subtest, in order, when
+ * test__start_subtest() selects it.
+ */
+void test_xdp_devmap_attach(void)
+{
+	static const struct {
+		const char *name;
+		void (*run)(void);
+	} subtests[] = {
+		{ "DEVMAP with programs in entries",
+		  test_xdp_with_devmap_helpers },
+		{ "Verifier check of DEVMAP programs",
+		  test_neg_xdp_devmap_helpers },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(subtests) / sizeof(subtests[0]); i++) {
+		if (test__start_subtest(subtests[i].name))
+			subtests[i].run();
+	}
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_info.c b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
new file mode 100644
index 000000000..d2d7a283d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_info.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/if_link.h>
+#include <test_progs.h>
+
+#define IFINDEX_LO 1
+
+/* Verify bpf_get_link_xdp_id() on IFINDEX_LO: it must report id 0 while
+ * nothing is attached, and after attaching xdp_dummy.o in SKB mode it must
+ * report that program's id for flags 0 and XDP_FLAGS_SKB_MODE, and 0 for
+ * XDP_FLAGS_DRV_MODE.
+ */
+void test_xdp_info(void)
+{
+ __u32 len = sizeof(struct bpf_prog_info), duration = 0, prog_id;
+ const char *file = "./xdp_dummy.o";
+ struct bpf_prog_info info = {};
+ struct bpf_object *obj;
+ int err, prog_fd;
+
+ /* Get prog_id for XDP_ATTACHED_NONE mode */
+
+ err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0);
+ if (CHECK(err, "get_xdp_none", "errno=%d\n", errno))
+ return;
+ if (CHECK(prog_id, "prog_id_none", "unexpected prog_id=%u\n", prog_id))
+ return;
+
+ err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE);
+ if (CHECK(err, "get_xdp_none_skb", "errno=%d\n", errno))
+ return;
+ if (CHECK(prog_id, "prog_id_none_skb", "unexpected prog_id=%u\n",
+ prog_id))
+ return;
+
+ /* Setup prog */
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+ if (CHECK_FAIL(err))
+ return;
+
+ /* info.id is the reference value for the id checks below. */
+ err = bpf_obj_get_info_by_fd(prog_fd, &info, &len);
+ if (CHECK(err, "get_prog_info", "errno=%d\n", errno))
+ goto out_close;
+
+ err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE);
+ if (CHECK(err, "set_xdp_skb", "errno=%d\n", errno))
+ goto out_close;
+
+ /* Get prog_id for single prog mode */
+
+ err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, 0);
+ if (CHECK(err, "get_xdp", "errno=%d\n", errno))
+ goto out;
+ if (CHECK(prog_id != info.id, "prog_id", "prog_id not available\n"))
+ goto out;
+
+ err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_SKB_MODE);
+ if (CHECK(err, "get_xdp_skb", "errno=%d\n", errno))
+ goto out;
+ if (CHECK(prog_id != info.id, "prog_id_skb", "prog_id not available\n"))
+ goto out;
+
+ err = bpf_get_link_xdp_id(IFINDEX_LO, &prog_id, XDP_FLAGS_DRV_MODE);
+ if (CHECK(err, "get_xdp_drv", "errno=%d\n", errno))
+ goto out;
+ if (CHECK(prog_id, "prog_id_drv", "unexpected prog_id=%u\n", prog_id))
+ goto out;
+
+out:
+ /* Detach whatever is attached to IFINDEX_LO (fd -1 requests removal). */
+ bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+out_close:
+ bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
new file mode 100644
index 000000000..6f814999b
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <uapi/linux/if_link.h>
+#include <test_progs.h>
+#include "test_xdp_link.skel.h"
+
+#define IFINDEX_LO 1
+
+/* Exercise how fd-based XDP attachment and BPF-link-based XDP attachment on
+ * IFINDEX_LO interact: neither kind may replace the other, a second link may
+ * not replace an active one, a link's program can be updated in place, and
+ * after bpf_link__detach() the link info reports ifindex 0.
+ *
+ * Two instances of the same skeleton are loaded so that two distinct program
+ * ids (id1, id2) are available.
+ */
+void test_xdp_link(void)
+{
+	__u32 duration = 0, id1, id2, id0 = 0, prog_fd1, prog_fd2, err;
+	DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1);
+	struct test_xdp_link *skel1 = NULL, *skel2 = NULL;
+	struct bpf_link_info link_info;
+	struct bpf_prog_info prog_info;
+	struct bpf_link *link;
+	__u32 link_info_len = sizeof(link_info);
+	__u32 prog_info_len = sizeof(prog_info);
+
+	skel1 = test_xdp_link__open_and_load();
+	if (CHECK(!skel1, "skel_load", "skeleton open and load failed\n"))
+		goto cleanup;
+	prog_fd1 = bpf_program__fd(skel1->progs.xdp_handler);
+
+	skel2 = test_xdp_link__open_and_load();
+	if (CHECK(!skel2, "skel_load", "skeleton open and load failed\n"))
+		goto cleanup;
+	prog_fd2 = bpf_program__fd(skel2->progs.xdp_handler);
+
+	memset(&prog_info, 0, sizeof(prog_info));
+	err = bpf_obj_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len);
+	if (CHECK(err, "fd_info1", "failed %d\n", -errno))
+		goto cleanup;
+	id1 = prog_info.id;
+
+	memset(&prog_info, 0, sizeof(prog_info));
+	err = bpf_obj_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len);
+	if (CHECK(err, "fd_info2", "failed %d\n", -errno))
+		goto cleanup;
+	id2 = prog_info.id;
+
+	/* set initial prog attachment */
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts);
+	if (CHECK(err, "fd_attach", "initial prog attach failed: %d\n", err))
+		goto cleanup;
+
+	/* validate prog ID */
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	CHECK(err || id0 != id1, "id1_check",
+	      "loaded prog id %u != id1 %u, err %d", id0, id1, err);
+
+	/* BPF link is not allowed to replace prog attachment */
+	link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO);
+	if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) {
+		bpf_link__destroy(link);
+		/* best-effort detach prog */
+		opts.old_fd = prog_fd1;
+		bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts);
+		goto cleanup;
+	}
+
+	/* detach BPF program */
+	opts.old_fd = prog_fd1;
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts);
+	if (CHECK(err, "prog_detach", "failed %d\n", err))
+		goto cleanup;
+
+	/* now BPF link should attach successfully */
+	link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO);
+	if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	/* Hand the link to the skeleton so test_xdp_link__destroy() owns it. */
+	skel1->links.xdp_handler = link;
+
+	/* validate prog ID */
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	if (CHECK(err || id0 != id1, "id1_check",
+		  "loaded prog id %u != id1 %u, err %d", id0, id1, err))
+		goto cleanup;
+
+	/* BPF prog attach is not allowed to replace BPF link */
+	opts.old_fd = prog_fd1;
+	err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts);
+	if (CHECK(!err, "prog_attach_fail", "unexpected success\n"))
+		goto cleanup;
+
+	/* Can't force-update when BPF link is active */
+	err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd2, 0);
+	if (CHECK(!err, "prog_update_fail", "unexpected success\n"))
+		goto cleanup;
+
+	/* Can't force-detach when BPF link is active */
+	err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0);
+	if (CHECK(!err, "prog_detach_fail", "unexpected success\n"))
+		goto cleanup;
+
+	/* BPF link is not allowed to replace another BPF link */
+	link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO);
+	if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) {
+		bpf_link__destroy(link);
+		goto cleanup;
+	}
+
+	bpf_link__destroy(skel1->links.xdp_handler);
+	skel1->links.xdp_handler = NULL;
+
+	/* new link attach should succeed */
+	link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO);
+	if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link)))
+		goto cleanup;
+	skel2->links.xdp_handler = link;
+
+	/* The attached program must now be the second one; report id2 (the
+	 * value being compared) in the failure message.
+	 */
+	err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0);
+	if (CHECK(err || id0 != id2, "id2_check",
+		  "loaded prog id %u != id2 %u, err %d", id0, id2, err))
+		goto cleanup;
+
+	/* updating program under active BPF link works as expected */
+	err = bpf_link__update_program(link, skel1->progs.xdp_handler);
+	if (CHECK(err, "link_upd", "failed: %d\n", err))
+		goto cleanup;
+
+	memset(&link_info, 0, sizeof(link_info));
+	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
+	if (CHECK(err, "link_info", "failed: %d\n", err))
+		goto cleanup;
+
+	CHECK(link_info.type != BPF_LINK_TYPE_XDP, "link_type",
+	      "got %u != exp %u\n", link_info.type, BPF_LINK_TYPE_XDP);
+	CHECK(link_info.prog_id != id1, "link_prog_id",
+	      "got %u != exp %u\n", link_info.prog_id, id1);
+	CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex",
+	      "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO);
+
+	err = bpf_link__detach(link);
+	if (CHECK(err, "link_detach", "failed %d\n", err))
+		goto cleanup;
+
+	memset(&link_info, 0, sizeof(link_info));
+	err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len);
+	if (CHECK(err, "link_info", "failed: %d\n", err))
+		goto cleanup;
+	CHECK(link_info.prog_id != id1, "link_prog_id",
+	      "got %u != exp %u\n", link_info.prog_id, id1);
+	/* ifindex should be zeroed out */
+	CHECK(link_info.xdp.ifindex != 0, "link_ifindex",
+	      "got %u != exp %u\n", link_info.xdp.ifindex, 0);
+
+cleanup:
+	/* Both pointers start out NULL, so either may still be NULL here. */
+	test_xdp_link__destroy(skel1);
+	test_xdp_link__destroy(skel2);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
new file mode 100644
index 000000000..0281095de
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_noinline.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+#include <network_helpers.h>
+#include "test_xdp_noinline.skel.h"
+
+/* Populate the vip_map, ch_rings and reals maps, run balancer_ingress_v4 and
+ * balancer_ingress_v6 NUM_ITER times each, and check the returned packet
+ * (retval, size, leading magic word) and the summed per-CPU stats.
+ */
+void test_xdp_noinline(void)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ struct test_xdp_noinline *skel;
+ struct vip key = {.protocol = 6};
+ struct vip_meta {
+ __u32 flags;
+ __u32 vip_num;
+ } value = {.vip_num = VIP_NUM};
+ __u32 stats_key = VIP_NUM;
+ /* One slot per possible CPU; filled by the stats lookup below. */
+ struct vip_stats {
+ __u64 bytes;
+ __u64 pkts;
+ } stats[nr_cpus];
+ struct real_definition {
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+ } real_def = {.dst = MAGIC_VAL};
+ __u32 ch_key = 11, real_num = 3;
+ __u32 duration = 0, retval, size;
+ int err, i;
+ __u64 bytes = 0, pkts = 0;
+ char buf[128];
+ /* First 32-bit word of the output packet, compared with MAGIC_VAL. */
+ u32 *magic = (u32 *)buf;
+
+ skel = test_xdp_noinline__open_and_load();
+ if (CHECK(!skel, "skel_open_and_load", "failed\n"))
+ return;
+
+ /* NOTE(review): the results of these map updates are not checked. */
+ bpf_map_update_elem(bpf_map__fd(skel->maps.vip_map), &key, &value, 0);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.ch_rings), &ch_key, &real_num, 0);
+ bpf_map_update_elem(bpf_map__fd(skel->maps.reals), &real_num, &real_def, 0);
+
+ err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v4),
+ NUM_ITER, &pkt_v4, sizeof(pkt_v4),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != 1 || size != 54 ||
+ *magic != MAGIC_VAL, "ipv4",
+ "err %d errno %d retval %d size %d magic %x\n",
+ err, errno, retval, size, *magic);
+
+ err = bpf_prog_test_run(bpf_program__fd(skel->progs.balancer_ingress_v6),
+ NUM_ITER, &pkt_v6, sizeof(pkt_v6),
+ buf, &size, &retval, &duration);
+ CHECK(err || retval != 1 || size != 74 ||
+ *magic != MAGIC_VAL, "ipv6",
+ "err %d errno %d retval %d size %d magic %x\n",
+ err, errno, retval, size, *magic);
+
+ /* Sum the per-CPU counters; both runs together are expected to account
+ * for NUM_ITER * 2 packets.
+ * NOTE(review): the lookup result is not checked before stats is read.
+ */
+ bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), &stats_key, stats);
+ for (i = 0; i < nr_cpus; i++) {
+ bytes += stats[i].bytes;
+ pkts += stats[i].pkts;
+ }
+ CHECK(bytes != MAGIC_BYTES * NUM_ITER * 2 || pkts != NUM_ITER * 2,
+ "stats", "bytes %lld pkts %lld\n",
+ (unsigned long long)bytes, (unsigned long long)pkts);
+ test_xdp_noinline__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
new file mode 100644
index 000000000..7185bee16
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <test_progs.h>
+
+/* Load xdp_dummy.o and run it 1000000 times on a 128-byte input buffer,
+ * expecting XDP_PASS and an output of the same 128 bytes in size.
+ */
+void test_xdp_perf(void)
+{
+	char in[128], out[128];
+	__u32 duration, retval, size;
+	struct bpf_object *obj;
+	int prog_fd, err;
+
+	err = bpf_prog_load("./xdp_dummy.o", BPF_PROG_TYPE_XDP, &obj, &prog_fd);
+	if (CHECK_FAIL(err))
+		return;
+
+	/* retval, size and duration are filled in by the test run. */
+	err = bpf_prog_test_run(prog_fd, 1000000, in, sizeof(in),
+				out, &size, &retval, &duration);
+
+	CHECK(err || retval != XDP_PASS || size != 128,
+	      "xdp-perf",
+	      "err %d errno %d retval %d size %d\n",
+	      err, errno, retval, size);
+
+	bpf_object__close(obj);
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_cubic.c b/tools/testing/selftests/bpf/progs/bpf_cubic.c
new file mode 100644
index 000000000..6939bfd86
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_cubic.c
@@ -0,0 +1,545 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/* WARNING: This implementation is not necessarily the same
+ * as the tcp_cubic.c. The purpose is mainly for testing
+ * the kernel BPF logic.
+ *
+ * Highlights:
+ * 1. CONFIG_HZ .kconfig map is used.
+ * 2. In bictcp_update(), calculation is changed to use usec
+ * resolution (i.e. USEC_PER_JIFFY) instead of using jiffies.
+ * Thus, usecs_to_jiffies() is not used in the bpf_cubic.c.
+ * 3. In bictcp_update() [under tcp_friendliness], the original
+ * "while (ca->ack_cnt > delta)" loop is changed to the equivalent
+ * "ca->ack_cnt / delta" operation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/tcp.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+/* Bound val to [lo, hi]; the max() result is cast to val's type before
+ * min() is applied.
+ */
+#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)
+
+#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation
+ * max_cwnd = snd_cwnd * beta
+ */
+#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */
+
+/* Two methods of hybrid slow start */
+#define HYSTART_ACK_TRAIN 0x1
+#define HYSTART_DELAY 0x2
+
+/* Number of delay samples for detecting the increase of delay */
+#define HYSTART_MIN_SAMPLES 8
+#define HYSTART_DELAY_MIN (4000U) /* 4ms */
+#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
+#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
+
+/* Tunables read by the congestion-control callbacks below. */
+static int fast_convergence = 1;
+static const int beta = 717; /* = 717/1024 (BICTCP_BETA_SCALE) */
+static int initial_ssthresh;
+static const int bic_scale = 41;
+static int tcp_friendliness = 1;
+
+/* Hybrid slow start settings; hystart_detect is a mask of HYSTART_* bits. */
+static int hystart = 1;
+static int hystart_detect = HYSTART_ACK_TRAIN | HYSTART_DELAY;
+static int hystart_low_window = 16;
+static int hystart_ack_delta_us = 2000;
+
+static const __u32 cube_rtt_scale = (bic_scale * 10); /* 1024*c/rtt */
+static const __u32 beta_scale = 8*(BICTCP_BETA_SCALE+beta) / 3
+ / (BICTCP_BETA_SCALE - beta);
+/* calculate the "K" for (wmax-cwnd) = c/rtt * K^3
+ * so K = cubic_root( (wmax-cwnd)*rtt/c )
+ * the unit of K is bictcp_HZ=2^10, not HZ
+ *
+ * c = bic_scale >> 10
+ * rtt = 100ms
+ *
+ * the following code has been designed and tested for
+ * cwnd < 1 million packets
+ * RTT < 100 seconds
+ * HZ < 1,000,00 (corresponding to 10 nano-second)
+ */
+
+/* 1/c * 2^2*bictcp_HZ * srtt, 2^40 */
+static const __u64 cube_factor = (__u64)(1ull << (10+3*BICTCP_HZ))
+ / (bic_scale * 10);
+
+/* BIC TCP Parameters
+ *
+ * bictcp_reset() zeroes cnt through tcp_cwnd plus found;
+ * bictcp_hystart_reset() re-initializes sample_cnt, round_start, end_seq,
+ * last_ack and curr_rtt.
+ */
+struct bictcp {
+ __u32 cnt; /* increase cwnd by 1 after ACKs */
+ __u32 last_max_cwnd; /* last maximum snd_cwnd */
+ __u32 last_cwnd; /* the last snd_cwnd */
+ __u32 last_time; /* time when updated last_cwnd */
+ __u32 bic_origin_point;/* origin point of bic function */
+ __u32 bic_K; /* time to origin point
+ from the beginning of the current epoch */
+ __u32 delay_min; /* min delay (usec) */
+ __u32 epoch_start; /* beginning of an epoch */
+ __u32 ack_cnt; /* number of acks */
+ __u32 tcp_cwnd; /* estimated tcp cwnd */
+ __u16 unused;
+ __u8 sample_cnt; /* number of samples to decide curr_rtt */
+ __u8 found; /* the exit point is found? */
+ __u32 round_start; /* beginning of each round */
+ __u32 end_seq; /* end_seq of the round */
+ __u32 last_ack; /* last time when the ACK spacing is close */
+ __u32 curr_rtt; /* the minimum rtt of current round */
+};
+
+/* Zero the window/epoch bookkeeping fields of @ca and clear its "found"
+ * flag. The hystart round fields are left untouched.
+ */
+static inline void bictcp_reset(struct bictcp *ca)
+{
+	/* hystart exit flag */
+	ca->found = 0;
+
+	/* ack accounting and epoch state */
+	ca->tcp_cwnd = 0;
+	ca->ack_cnt = 0;
+	ca->epoch_start = 0;
+	ca->delay_min = 0;
+
+	/* cubic curve parameters */
+	ca->bic_K = 0;
+	ca->bic_origin_point = 0;
+
+	/* last observed window state */
+	ca->last_time = 0;
+	ca->last_cwnd = 0;
+	ca->last_max_cwnd = 0;
+	ca->cnt = 0;
+}
+
+/* CONFIG_HZ is declared as a __kconfig extern, so HZ is a value read at
+ * run time rather than a compile-time constant.
+ */
+extern unsigned long CONFIG_HZ __kconfig;
+#define HZ CONFIG_HZ
+#define USEC_PER_MSEC 1000UL
+#define USEC_PER_SEC 1000000UL
+#define USEC_PER_JIFFY (USEC_PER_SEC / HZ)
+
+/* Plain 64-bit unsigned division; callers must pass a non-zero divisor. */
+static __always_inline __u64 div64_u64(__u64 dividend, __u64 divisor)
+{
+ return dividend / divisor;
+}
+
+#define div64_ul div64_u64
+
+#define BITS_PER_U64 (sizeof(__u64) * 8)
+/* Find the last (most significant) set bit of x.
+ *
+ * Returns the 1-based position of that bit (1 for x == 1, 64 when bit 63 is
+ * set) and 0 when x is 0. Each step tests whether the top 32/16/8/4/2/1 bits
+ * are all zero and, if so, lowers the candidate position and shifts x left
+ * so the remaining bits move to the top.
+ */
+static __always_inline int fls64(__u64 x)
+{
+ int num = BITS_PER_U64 - 1;
+
+ if (x == 0)
+ return 0;
+
+ if (!(x & (~0ull << (BITS_PER_U64-32)))) {
+ num -= 32;
+ x <<= 32;
+ }
+ if (!(x & (~0ull << (BITS_PER_U64-16)))) {
+ num -= 16;
+ x <<= 16;
+ }
+ if (!(x & (~0ull << (BITS_PER_U64-8)))) {
+ num -= 8;
+ x <<= 8;
+ }
+ if (!(x & (~0ull << (BITS_PER_U64-4)))) {
+ num -= 4;
+ x <<= 4;
+ }
+ if (!(x & (~0ull << (BITS_PER_U64-2)))) {
+ num -= 2;
+ x <<= 2;
+ }
+ if (!(x & (~0ull << (BITS_PER_U64-1))))
+ num -= 1;
+
+ /* num is the 0-based bit index here; convert to 1-based. */
+ return num + 1;
+}
+
+/* Return the socket's tcp_mstamp converted to the __u32 return type.
+ * NOTE(review): presumably a microsecond timestamp, going by the function
+ * name — confirm against the tcp_sock definition.
+ */
+static __always_inline __u32 bictcp_clock_us(const struct sock *sk)
+{
+ return tcp_sk(sk)->tcp_mstamp;
+}
+
+/* Start a new hystart round for @sk: stamp round_start and last_ack with the
+ * current clock, record snd_nxt as the round's end sequence, and reset the
+ * per-round RTT minimum and sample counter.
+ */
+static __always_inline void bictcp_hystart_reset(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ ca->round_start = ca->last_ack = bictcp_clock_us(sk);
+ ca->end_seq = tp->snd_nxt;
+ ca->curr_rtt = ~0U; /* largest value, so any sample lowers it */
+ ca->sample_cnt = 0;
+}
+
+/* "struct_ops/" prefix is not a requirement
+ * It will be recognized as BPF_PROG_TYPE_STRUCT_OPS
+ * as long as it is used in one of the func ptr
+ * under SEC(".struct_ops").
+ *
+ * Installed as the .init callback: resets the CUBIC state, then either
+ * starts a hystart round or, when hystart is off and initial_ssthresh is
+ * non-zero, seeds snd_ssthresh from it.
+ */
+SEC("struct_ops/bictcp_init")
+void BPF_PROG(bictcp_init, struct sock *sk)
+{
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ bictcp_reset(ca);
+
+ if (hystart)
+ bictcp_hystart_reset(sk);
+
+ if (!hystart && initial_ssthresh)
+ tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
+}
+
+/* No prefix in SEC will also work.
+ * The remaining tcp-cubic functions have an easier way.
+ *
+ * Installed as the .cwnd_event callback. Only CA_EVENT_TX_START is handled;
+ * every other event is ignored.
+ */
+SEC("no-sec-prefix-bictcp_cwnd_event")
+void BPF_PROG(bictcp_cwnd_event, struct sock *sk, enum tcp_ca_event event)
+{
+ if (event == CA_EVENT_TX_START) {
+ struct bictcp *ca = inet_csk_ca(sk);
+ __u32 now = tcp_jiffies32;
+ __s32 delta;
+
+ /* Signed difference between now and the last send time. */
+ delta = now - tcp_sk(sk)->lsndtime;
+
+ /* We were application limited (idle) for a while.
+ * Shift epoch_start to keep cwnd growth to cubic curve.
+ */
+ if (ca->epoch_start && delta > 0) {
+ ca->epoch_start += delta;
+ /* Never let the epoch start lie in the future. */
+ if (after(ca->epoch_start, now))
+ ca->epoch_start = now;
+ }
+ return;
+ }
+}
+
+/*
+ * cbrt(x) MSB values for x MSB values in [0..63].
+ * Precomputed then refined by hand - Willy Tarreau
+ *
+ * For x in [0..63],
+ * v = cbrt(x << 18) - 1
+ * cbrt(x) = (v[x] + 10) >> 6
+ */
+static const __u8 v[] = {
+ /* 0x00 */ 0, 54, 54, 54, 118, 118, 118, 118,
+ /* 0x08 */ 123, 129, 134, 138, 143, 147, 151, 156,
+ /* 0x10 */ 157, 161, 164, 168, 170, 173, 176, 179,
+ /* 0x18 */ 181, 185, 187, 190, 192, 194, 197, 199,
+ /* 0x20 */ 200, 202, 204, 206, 209, 211, 213, 215,
+ /* 0x28 */ 217, 219, 221, 222, 224, 225, 227, 229,
+ /* 0x30 */ 231, 232, 234, 236, 237, 239, 240, 242,
+ /* 0x38 */ 244, 245, 246, 248, 250, 251, 252, 254,
+};
+
+/* calculate the cubic root of x using a table lookup followed by one
+ * Newton-Raphson iteration.
+ * Avg err ~= 0.195%
+ *
+ * @a: value whose cube root is wanted.
+ * Returns the approximate cube root, or 0 if the computed table index falls
+ * outside v[] (a guard that exists for the verifier, see below).
+ */
+static __always_inline __u32 cubic_root(__u64 a)
+{
+ __u32 x, b, shift;
+
+ if (a < 64) {
+ /* a in [0..63] */
+ /* NOTE(review): this path adds 35 while the table comment
+ * above documents 10 — confirm the difference is intended.
+ */
+ return ((__u32)v[(__u32)a] + 35) >> 6;
+ }
+
+ /* b approximates (bit length of a) / 3 - 1 (84/256 is about 1/3);
+ * shift is then the leading bits of a used as the index into v[].
+ */
+ b = fls64(a);
+ b = ((b * 84) >> 8) - 1;
+ shift = (a >> (b * 3));
+
+ /* it is needed for verifier's bound check on v */
+ if (shift >= 64)
+ return 0;
+
+ x = ((__u32)(((__u32)v[shift] + 10) << b)) >> 6;
+
+ /*
+ * Newton-Raphson iteration
+ * 2
+ * x = ( 2 * x + a / x ) / 3
+ * k+1 k k
+ */
+ x = (2 * x + (__u32)div64_u64(a, (__u64)x * (__u64)(x - 1)));
+ /* Multiply by 341 and shift by 10: approximately a division by 3. */
+ x = ((x * 341) >> 10);
+ return x;
+}
+
+/*
+ * Compute congestion window to use.
+ *
+ * @ca:    per-socket CUBIC state, updated in place
+ * @cwnd:  current congestion window
+ * @acked: number of packets acknowledged by this ACK
+ *
+ * The result is stored in ca->cnt, which is never left below 2.
+ */
+static __always_inline void bictcp_update(struct bictcp *ca, __u32 cwnd,
+ __u32 acked)
+{
+ __u32 delta, bic_target, max_cnt;
+ __u64 offs, t;
+
+ ca->ack_cnt += acked; /* count the number of ACKed packets */
+
+ /* Nothing to recompute if cwnd is unchanged and at most HZ/32
+ * jiffies have passed since the last update.
+ */
+ if (ca->last_cwnd == cwnd &&
+ (__s32)(tcp_jiffies32 - ca->last_time) <= HZ / 32)
+ return;
+
+ /* The CUBIC function can update ca->cnt at most once per jiffy.
+ * On all cwnd reduction events, ca->epoch_start is set to 0,
+ * which will force a recalculation of ca->cnt.
+ */
+ if (ca->epoch_start && tcp_jiffies32 == ca->last_time)
+ goto tcp_friendliness;
+
+ ca->last_cwnd = cwnd;
+ ca->last_time = tcp_jiffies32;
+
+ if (ca->epoch_start == 0) {
+ ca->epoch_start = tcp_jiffies32; /* record beginning */
+ ca->ack_cnt = acked; /* start counting */
+ ca->tcp_cwnd = cwnd; /* sync with cubic */
+
+ if (ca->last_max_cwnd <= cwnd) {
+ ca->bic_K = 0;
+ ca->bic_origin_point = cwnd;
+ } else {
+ /* Compute new K based on
+ * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ)
+ */
+ ca->bic_K = cubic_root(cube_factor
+ * (ca->last_max_cwnd - cwnd));
+ ca->bic_origin_point = ca->last_max_cwnd;
+ }
+ }
+
+ /* cubic function - calc*/
+ /* calculate c * time^3 / rtt,
+ * while considering overflow in calculation of time^3
+ * (so time^3 is done by using 64 bit)
+ * and without the support of division of 64bit numbers
+ * (so all divisions are done by using 32 bit)
+ * also NOTE the unit of those variables
+ * time = (t - K) / 2^bictcp_HZ
+ * c = bic_scale >> 10
+ * rtt = (srtt >> 3) / HZ
+ * !!! The following code does not have overflow problems,
+ * if the cwnd < 1 million packets !!!
+ */
+
+ /* Elapsed time of this epoch in usec, plus the minimum delay. */
+ t = (__s32)(tcp_jiffies32 - ca->epoch_start) * USEC_PER_JIFFY;
+ t += ca->delay_min;
+ /* change the unit from usec to bictcp_HZ */
+ t <<= BICTCP_HZ;
+ t /= USEC_PER_SEC;
+
+ /* offs = |t - K|, kept unsigned */
+ if (t < ca->bic_K) /* t - K */
+ offs = ca->bic_K - t;
+ else
+ offs = t - ca->bic_K;
+
+ /* c/rtt * (t-K)^3 */
+ delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ);
+ if (t < ca->bic_K) /* below origin*/
+ bic_target = ca->bic_origin_point - delta;
+ else /* above origin*/
+ bic_target = ca->bic_origin_point + delta;
+
+ /* cubic function - calc bictcp_cnt*/
+ if (bic_target > cwnd) {
+ ca->cnt = cwnd / (bic_target - cwnd);
+ } else {
+ ca->cnt = 100 * cwnd; /* very small increment*/
+ }
+
+ /*
+ * The initial growth of cubic function may be too conservative
+ * when the available bandwidth is still unknown.
+ */
+ if (ca->last_max_cwnd == 0 && ca->cnt > 20)
+ ca->cnt = 20; /* increase cwnd 5% per RTT */
+
+tcp_friendliness:
+ /* TCP Friendly */
+ if (tcp_friendliness) {
+ __u32 scale = beta_scale;
+ __u32 n;
+
+ /* update tcp cwnd */
+ delta = (cwnd * scale) >> 3;
+ /* The "&& delta" test keeps the division below from ever
+ * seeing a zero divisor.
+ */
+ if (ca->ack_cnt > delta && delta) {
+ n = ca->ack_cnt / delta;
+ ca->ack_cnt -= n * delta;
+ ca->tcp_cwnd += n;
+ }
+
+ if (ca->tcp_cwnd > cwnd) { /* if bic is slower than tcp */
+ delta = ca->tcp_cwnd - cwnd;
+ max_cnt = cwnd / delta;
+ if (ca->cnt > max_cnt)
+ ca->cnt = max_cnt;
+ }
+ }
+
+ /* The maximum rate of cwnd increase CUBIC allows is 1 packet per
+ * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT.
+ */
+ ca->cnt = max(ca->cnt, 2U);
+}
+
+/* Or simply use the BPF_STRUCT_OPS to avoid the SEC boiler plate.
+ *
+ * Installed as the .cong_avoid callback. Does nothing unless the connection
+ * is cwnd limited. In slow start it first lets tcp_slow_start() consume
+ * @acked and stops if nothing is left; otherwise it recomputes ca->cnt and
+ * applies additive increase.
+ */
+void BPF_STRUCT_OPS(bictcp_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ if (tcp_in_slow_start(tp)) {
+ /* An ACK beyond the round's end sequence starts a new round. */
+ if (hystart && after(ack, ca->end_seq))
+ bictcp_hystart_reset(sk);
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ bictcp_update(ca, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
+}
+
+/* Installed as the .ssthresh callback: ends the current epoch, records the
+ * new last_max_cwnd, and returns snd_cwnd scaled by beta/BICTCP_BETA_SCALE
+ * with a floor of 2.
+ */
+__u32 BPF_STRUCT_OPS(bictcp_recalc_ssthresh, struct sock *sk)
+{
+	struct bictcp *ca = inet_csk_ca(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	/* end of epoch */
+	ca->epoch_start = 0;
+
+	/* Wmax: remember the plain window unless fast convergence applies,
+	 * i.e. unless the window shrank below the previous maximum while
+	 * fast_convergence is enabled.
+	 */
+	if (!fast_convergence || tp->snd_cwnd >= ca->last_max_cwnd)
+		ca->last_max_cwnd = tp->snd_cwnd;
+	else
+		ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+			/ (2 * BICTCP_BETA_SCALE);
+
+	return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+}
+
+/* Installed as the .set_state callback: on entering TCP_CA_Loss, reset both
+ * the CUBIC state and the hystart round; other states are ignored.
+ */
+void BPF_STRUCT_OPS(bictcp_state, struct sock *sk, __u8 new_state)
+{
+	if (new_state != TCP_CA_Loss)
+		return;
+
+	bictcp_reset(inet_csk_ca(sk));
+	bictcp_hystart_reset(sk);
+}
+
+#define GSO_MAX_SIZE 65536
+
+/* Cushion added to the ack-train threshold to account for TSO/GRO delays.
+ *
+ * Without it, short-RTT flows could end up with too small an ssthresh:
+ * slow start begins with small TSO packets, so ca->delay_min does not
+ * reflect the longer aggregation delay seen once TSO packets grow.
+ * Ideally, even with a very small RTT, at least one TSO packet is sent and
+ * received by GRO while another sits in the qdisc layer. Another 100%
+ * factor is applied because @rate is doubled at this point.
+ *
+ * Returns 0 when the socket has no pacing rate, otherwise the cushion
+ * capped at 1ms (USEC_PER_MSEC).
+ */
+static __always_inline __u32 hystart_ack_delay(struct sock *sk)
+{
+	unsigned long rate = sk->sk_pacing_rate;
+	__u64 cushion;
+
+	if (rate == 0)
+		return 0;
+
+	cushion = div64_ul((__u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate);
+	return min((__u64)USEC_PER_MSEC, cushion);
+}
+
+/* Run the hystart detectors enabled in hystart_detect for one delay sample.
+ * When either detector fires, ca->found is set and snd_ssthresh is set to
+ * the current snd_cwnd.
+ *
+ * @sk:    socket being updated
+ * @delay: delay sample for the current ACK (compared with ca->delay_min,
+ *         which is documented as usec)
+ */
+static __always_inline void hystart_update(struct sock *sk, __u32 delay)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ __u32 threshold;
+
+ if (hystart_detect & HYSTART_ACK_TRAIN) {
+ __u32 now = bictcp_clock_us(sk);
+
+ /* first detection parameter - ack-train detection */
+ if ((__s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
+ ca->last_ack = now;
+
+ threshold = ca->delay_min + hystart_ack_delay(sk);
+
+ /* Hystart ack train triggers if we get ack past
+ * ca->delay_min/2.
+ * Pacing might have delayed packets up to RTT/2
+ * during slow start.
+ */
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ threshold >>= 1;
+
+ if ((__s32)(now - ca->round_start) > threshold) {
+ ca->found = 1;
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+
+ if (hystart_detect & HYSTART_DELAY) {
+ /* obtain the minimum delay of more than sampling packets */
+ if (ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
+ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
+ ca->sample_cnt++;
+ } else {
+ /* Enough samples: fire when the round's minimum RTT
+ * exceeds delay_min by the clamped delay_min/8 margin.
+ */
+ if (ca->curr_rtt > ca->delay_min +
+ HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
+ ca->found = 1;
+ tp->snd_ssthresh = tp->snd_cwnd;
+ }
+ }
+ }
+}
+
+/* Installed as the .pkts_acked callback: tracks the minimum delay seen in
+ * ca->delay_min and, while in slow start with hystart enabled and no exit
+ * point found yet, feeds the sample to hystart_update().
+ */
+void BPF_STRUCT_OPS(bictcp_acked, struct sock *sk,
+ const struct ack_sample *sample)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct bictcp *ca = inet_csk_ca(sk);
+ __u32 delay;
+
+ /* Some calls are for duplicates without timestamps */
+ if (sample->rtt_us < 0)
+ return;
+
+ /* Discard delay samples right after fast recovery */
+ if (ca->epoch_start && (__s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
+ return;
+
+ /* A delay of 0 is bumped to 1 because delay_min == 0 means "unset". */
+ delay = sample->rtt_us;
+ if (delay == 0)
+ delay = 1;
+
+ /* first time call or link delay decreases */
+ if (ca->delay_min == 0 || ca->delay_min > delay)
+ ca->delay_min = delay;
+
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (!ca->found && tcp_in_slow_start(tp) && hystart &&
+ tp->snd_cwnd >= hystart_low_window)
+ hystart_update(sk, delay);
+}
+
+/* Installed as the .undo_cwnd callback: returns the larger of the current
+ * snd_cwnd and prior_cwnd.
+ */
+__u32 BPF_STRUCT_OPS(tcp_reno_undo_cwnd, struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ return max(tp->snd_cwnd, tp->prior_cwnd);
+}
+
+/* struct_ops map that wires the programs above into a congestion-control
+ * algorithm named "bpf_cubic".
+ */
+SEC(".struct_ops")
+struct tcp_congestion_ops cubic = {
+ .init = (void *)bictcp_init,
+ .ssthresh = (void *)bictcp_recalc_ssthresh,
+ .cong_avoid = (void *)bictcp_cong_avoid,
+ .set_state = (void *)bictcp_state,
+ .undo_cwnd = (void *)tcp_reno_undo_cwnd,
+ .cwnd_event = (void *)bictcp_cwnd_event,
+ .pkts_acked = (void *)bictcp_acked,
+ .name = "bpf_cubic",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_dctcp.c b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
new file mode 100644
index 000000000..4dc1a9677
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_dctcp.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+/* WARNING: This implementation is not necessarily the same
+ * as the tcp_dctcp.c. The purpose is mainly for testing
+ * the kernel BPF logic.
+ */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include "bpf_tcp_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+
+int stg_result = 0;
+
+/* Socket-local storage holding one int per socket; dctcp_init() reads the
+ * entry of the socket being initialised into stg_result and deletes it.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, int);
+} sk_stg_map SEC(".maps");
+
+#define DCTCP_MAX_ALPHA 1024U
+
+/* Per-socket DCTCP state, obtained with inet_csk_ca(sk). */
+struct dctcp {
+ __u32 old_delivered; /* tp->delivered at the last dctcp_reset() */
+ __u32 old_delivered_ce; /* tp->delivered_ce at the last dctcp_reset() */
+ __u32 prior_rcv_nxt; /* tp->rcv_nxt seen at init / the last CE event */
+ __u32 dctcp_alpha; /* congestion estimate, capped at DCTCP_MAX_ALPHA */
+ __u32 next_seq; /* tp->snd_nxt at the last dctcp_reset() */
+ __u32 ce_state; /* 1 if the last CE event was ECN_IS_CE, else 0 */
+ __u32 loss_cwnd; /* cwnd saved on reduction, used by dctcp_cwnd_undo() */
+};
+
+/* Fixed tunables of this test implementation. */
+static unsigned int dctcp_shift_g = 4; /* g = 1/2^4 */
+/* alpha assigned in dctcp_init(), after clamping to DCTCP_MAX_ALPHA */
+static unsigned int dctcp_alpha_on_init = DCTCP_MAX_ALPHA;
+
+/* Start a new observation window: snapshot the delivered and CE-marked
+ * delivered counters and remember the current tp->snd_nxt.
+ */
+static __always_inline void dctcp_reset(const struct tcp_sock *tp,
+					 struct dctcp *ca)
+{
+	/* counter snapshots; dctcp_update_alpha() works on the deltas */
+	ca->old_delivered_ce = tp->delivered_ce;
+	ca->old_delivered = tp->delivered;
+
+	/* dctcp_update_alpha() acts once snd_una is no longer before this */
+	ca->next_seq = tp->snd_nxt;
+}
+
+/* init handler (see .init in the struct_ops tables at the end of this file):
+ * seeds the per-socket DCTCP state and, if the socket has an entry in
+ * sk_stg_map, copies its value to stg_result and deletes the entry.
+ */
+SEC("struct_ops/dctcp_init")
+void BPF_PROG(dctcp_init, struct sock *sk)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ int *stg;
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ /* start from the configured alpha, never above DCTCP_MAX_ALPHA */
+ ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
+ ca->loss_cwnd = 0;
+ ca->ce_state = 0;
+
+ /* NULL value and flags 0 are passed - presumably a pure lookup that
+ * does not create a missing entry (confirm against the helper docs)
+ */
+ stg = bpf_sk_storage_get(&sk_stg_map, (void *)tp, NULL, 0);
+ if (stg) {
+ stg_result = *stg;
+ bpf_sk_storage_delete(&sk_stg_map, (void *)tp);
+ }
+ dctcp_reset(tp, ca);
+}
+
+/* ssthresh handler: records the current cwnd in ca->loss_cwnd and returns
+ * cwnd - ((cwnd * alpha) >> 11), with a floor of 2.
+ */
+SEC("struct_ops/dctcp_ssthresh")
+__u32 BPF_PROG(dctcp_ssthresh, struct sock *sk)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ ca->loss_cwnd = tp->snd_cwnd;
+ /* alpha is capped at DCTCP_MAX_ALPHA (1024), so the subtracted term
+ * is at most cwnd / 2
+ */
+ return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
+}
+
+/* in_ack_event handler: once snd_una is no longer before ca->next_seq,
+ * folds the share of CE-marked deliveries since the last dctcp_reset() into
+ * ca->dctcp_alpha and starts a new observation window. flags is not used.
+ */
+SEC("struct_ops/dctcp_update_alpha")
+void BPF_PROG(dctcp_update_alpha, struct sock *sk, __u32 flags)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ /* Expired RTT */
+ if (!before(tp->snd_una, ca->next_seq)) {
+ __u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
+ __u32 alpha = ca->dctcp_alpha;
+
+ /* alpha = (1 - g) * alpha + g * F */
+
+ /* decay term; min_not_zero() is defined outside this file */
+ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
+ if (delivered_ce) {
+ __u32 delivered = tp->delivered - ca->old_delivered;
+
+ /* If dctcp_shift_g == 1, a 32bit value would overflow
+ * after 8 M packets.
+ */
+ delivered_ce <<= (10 - dctcp_shift_g);
+ /* max() guards against dividing by zero */
+ delivered_ce /= max(1U, delivered);
+
+ alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
+ }
+ ca->dctcp_alpha = alpha;
+ dctcp_reset(tp, ca);
+ }
+}
+
+/* Loss reaction: save the current cwnd in ca->loss_cwnd and set ssthresh
+ * to half of the current cwnd, never below 2.
+ */
+static __always_inline void dctcp_react_to_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dctcp *ca = inet_csk_ca(sk);
+	__u32 half = tp->snd_cwnd >> 1U;
+
+	ca->loss_cwnd = tp->snd_cwnd;
+	tp->snd_ssthresh = max(half, 2U);
+}
+
+/* set_state handler: applies the loss reaction when the socket moves into
+ * TCP_CA_Recovery from a different congestion-avoidance state.
+ */
+SEC("struct_ops/dctcp_state")
+void BPF_PROG(dctcp_state, struct sock *sk, __u8 new_state)
+{
+ /* icsk_ca_state still holds the old state here, so the second test
+ * filters out Recovery -> Recovery notifications
+ */
+ if (new_state == TCP_CA_Recovery &&
+ new_state != BPF_CORE_READ_BITFIELD(inet_csk(sk), icsk_ca_state))
+ dctcp_react_to_loss(sk);
+ /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
+ * one loss-adjustment per RTT.
+ */
+}
+
+/* Set TCP_ECN_DEMAND_CWR in tp->ecn_flags when ce_state is 1, clear it for
+ * any other value.
+ */
+static __always_inline void dctcp_ece_ack_cwr(struct sock *sk, __u32 ce_state)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (ce_state != 1) {
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+		return;
+	}
+
+	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ *
+ * evt selects the new state (CA_EVENT_ECN_IS_CE -> 1, anything else -> 0).
+ * *prior_rcv_nxt and *ce_state are both read and updated.
+ */
+static __always_inline
+void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+ __u32 *prior_rcv_nxt, __u32 *ce_state)
+{
+ __u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+ if (*ce_state != new_ce_state) {
+ /* CE state has changed, force an immediate ACK to
+ * reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ /* the delayed ACK must still carry the old CE state */
+ dctcp_ece_ack_cwr(sk, *ce_state);
+ bpf_tcp_send_ack(sk, *prior_rcv_nxt);
+ }
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+ /* record the new state for the next event */
+ *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+ *ce_state = new_ce_state;
+ dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+/* cwnd_event handler: ECN CE / no-CE events drive the CE state machine,
+ * CA_EVENT_LOSS triggers the loss reaction, everything else is ignored.
+ */
+SEC("struct_ops/dctcp_cwnd_event")
+void BPF_PROG(dctcp_cwnd_event, struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
+ switch (ev) {
+ case CA_EVENT_ECN_IS_CE:
+ case CA_EVENT_ECN_NO_CE:
+ dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
+ break;
+ case CA_EVENT_LOSS:
+ dctcp_react_to_loss(sk);
+ break;
+ default:
+ /* Don't care for the rest. */
+ break;
+ }
+}
+
+/* undo_cwnd handler: returns the larger of the current cwnd and the cwnd
+ * saved in ca->loss_cwnd.
+ */
+SEC("struct_ops/dctcp_cwnd_undo")
+__u32 BPF_PROG(dctcp_cwnd_undo, struct sock *sk)
+{
+	const struct dctcp *ca = inet_csk_ca(sk);
+	__u32 cwnd = tcp_sk(sk)->snd_cwnd;
+
+	return max(cwnd, ca->loss_cwnd);
+}
+
+/* cong_avoid handler built from tcp_is_cwnd_limited(), tcp_slow_start() and
+ * tcp_cong_avoid_ai(), all of which are defined outside this file.
+ * ack is not used.
+ */
+SEC("struct_ops/tcp_reno_cong_avoid")
+void BPF_PROG(tcp_reno_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_is_cwnd_limited(sk))
+ return;
+
+ /* In "safe" area, increase. */
+ if (tcp_in_slow_start(tp)) {
+ /* presumably returns the part of acked not consumed by slow
+ * start - confirm against tcp_slow_start()
+ */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ /* In dangerous area, increase slowly. */
+ tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+}
+
+/* Second struct_ops table, "bpf_dctcp_nouse", that sets only the init and
+ * set_state handlers and shares both programs with the dctcp table.
+ */
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp_nouse = {
+ .init = (void *)dctcp_init,
+ .set_state = (void *)dctcp_state,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .name = "bpf_dctcp_nouse",
+};
+
+/* struct_ops table for the "bpf_dctcp" congestion control; it carries the
+ * TCP_CONG_NEEDS_ECN flag and uses every handler defined in this file.
+ */
+SEC(".struct_ops")
+struct tcp_congestion_ops dctcp = {
+ .init = (void *)dctcp_init,
+ .in_ack_event = (void *)dctcp_update_alpha,
+ .cwnd_event = (void *)dctcp_cwnd_event,
+ .ssthresh = (void *)dctcp_ssthresh,
+ .cong_avoid = (void *)tcp_reno_cong_avoid,
+ .undo_cwnd = (void *)dctcp_cwnd_undo,
+ .set_state = (void *)dctcp_state,
+ .flags = TCP_CONG_NEEDS_ECN,
+ .name = "bpf_dctcp",
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
new file mode 100644
index 000000000..5a65f6b51
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <limits.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <linux/if_tunnel.h>
+#include <linux/mpls.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+/* PROG(F) opens the definition of tail-call target F. The section name uses
+ * the macro-expanded value of F ("flow_dissector/0" for IP), while the
+ * function name pastes the unexpanded token (bpf_func_IP).
+ */
+#define PROG(F) PROG_(F, _##F)
+#define PROG_(NUM, NAME) SEC("flow_dissector/"#NUM) int bpf_func##NAME
+
+/* These are the identifiers of the BPF programs that will be used in tail
+ * calls. Name is limited to 16 characters, with the terminating character and
+ * bpf_func_ above, we have only 6 to work with, anything after will be cropped.
+ */
+#define IP 0
+#define IPV6 1
+#define IPV6OP 2 /* Destination/Hop-by-Hop Options IPv6 Ext. Header */
+#define IPV6FR 3 /* Fragmentation IPv6 Extension Header */
+#define MPLS 4
+#define VLAN 5
+#define MAX_PROG 6
+
+#define IP_MF 0x2000
+#define IP_OFFSET 0x1FFF
+#define IP6_MF 0x0001
+#define IP6_OFFSET 0xFFF8
+
+/* VLAN tag as read by the VLAN program: TCI plus the encapsulated
+ * EtherType, both big-endian.
+ */
+struct vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+
+/* First four bytes of a GRE header; parse_ip_proto() skips the optional
+ * checksum, key and sequence fields separately, based on flags.
+ */
+struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+};
+
+/* IPv6 fragment extension header as read by the IPV6FR program. */
+struct frag_hdr {
+ __u8 nexthdr;
+ __u8 reserved;
+ __be16 frag_off; /* tested against IP6_OFFSET in the IPV6FR program */
+ __be32 identification;
+};
+
+/* Tail-call table indexed by the IP .. VLAN program ids defined above. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, MAX_PROG);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+/* Copy of the flow keys from each finished dissection, written by
+ * export_flow_keys() under the key (sport << 16) | dport.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1024);
+ __type(key, __u32);
+ __type(value, struct bpf_flow_keys);
+} last_dissection SEC(".maps");
+
+/* Stores a copy of *keys in last_dissection, keyed by (sport << 16) | dport,
+ * and returns ret unchanged so that callers can write
+ * "return export_flow_keys(keys, verdict);". The result of the map update
+ * is ignored.
+ */
+static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
+ int ret)
+{
+ __u32 key = (__u32)(keys->sport) << 16 | keys->dport;
+ struct bpf_flow_keys val;
+
+ /* the map helper is handed a local copy, not the keys pointer itself */
+ memcpy(&val, keys, sizeof(val));
+ bpf_map_update_elem(&last_dissection, &key, &val, BPF_ANY);
+ return ret;
+}
+
+#define IPV6_FLOWLABEL_MASK __bpf_constant_htonl(0x000FFFFF)
+/* Returns the first 32-bit word of the IPv6 header masked with
+ * IPV6_FLOWLABEL_MASK; the result stays in network byte order.
+ */
+static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr)
+{
+	const __be32 first_word = *(__be32 *)hdr;
+
+	return first_word & IPV6_FLOWLABEL_MASK;
+}
+
+/* Returns a pointer to hdr_size bytes of packet data at offset
+ * skb->flow_keys->thoff. When those bytes end at or before data_end the
+ * pointer goes straight into the packet; otherwise they are copied into
+ * buffer with bpf_skb_load_bytes() and buffer is returned. Returns NULL if
+ * the offset check or the copy fails. buffer must hold at least hdr_size
+ * bytes.
+ */
+static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
+ __u16 hdr_size,
+ void *buffer)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ __u16 thoff = skb->flow_keys->thoff;
+ __u8 *hdr;
+
+ /* Verifies this variable offset does not overflow */
+ if (thoff > (USHRT_MAX - hdr_size))
+ return NULL;
+
+ /* fast path: header is fully inside the directly accessible data */
+ hdr = data + thoff;
+ if (hdr + hdr_size <= data_end)
+ return hdr;
+
+ /* slow path: let the helper copy the bytes into the caller's buffer */
+ if (bpf_skb_load_bytes(skb, thoff, buffer, hdr_size))
+ return NULL;
+
+ return buffer;
+}
+
+/* Dispatches on ETHERTYPE: tail-calls the program registered in jmp_table
+ * for proto (network byte order). Unsupported protocols, and any path where
+ * the tail call does not take over, export the keys and return BPF_DROP.
+ */
+static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+
+ switch (proto) {
+ case bpf_htons(ETH_P_IP):
+ bpf_tail_call_static(skb, &jmp_table, IP);
+ break;
+ case bpf_htons(ETH_P_IPV6):
+ bpf_tail_call_static(skb, &jmp_table, IPV6);
+ break;
+ case bpf_htons(ETH_P_MPLS_MC):
+ case bpf_htons(ETH_P_MPLS_UC):
+ bpf_tail_call_static(skb, &jmp_table, MPLS);
+ break;
+ case bpf_htons(ETH_P_8021Q):
+ case bpf_htons(ETH_P_8021AD):
+ bpf_tail_call_static(skb, &jmp_table, VLAN);
+ break;
+ default:
+ /* Protocol not supported */
+ return export_flow_keys(keys, BPF_DROP);
+ }
+
+ /* reached via the break statements above, i.e. the tail call returned */
+ return export_flow_keys(keys, BPF_DROP);
+}
+
+/* Entry program: starts the dissection from the EtherType already stored
+ * in keys->n_proto.
+ */
+SEC("flow_dissector")
+int _dissect(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+
+ return parse_eth_proto(skb, keys->n_proto);
+}
+
+/* Parses on IPPROTO_*: handles the protocol that follows an IP header at
+ * keys->thoff. ICMP only needs a readable header, TCP/UDP/UDPLITE fill in
+ * the ports, IPIP/IPV6/GRE mark the flow as encapsulated and continue with
+ * the inner packet unless BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP is set.
+ * Every return goes through export_flow_keys(); unknown protocols and
+ * unreadable headers yield BPF_DROP.
+ */
+static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ void *data_end = (void *)(long)skb->data_end;
+ struct icmphdr *icmp, _icmp;
+ struct gre_hdr *gre, _gre;
+ struct ethhdr *eth, _eth;
+ struct tcphdr *tcp, _tcp;
+ struct udphdr *udp, _udp;
+
+ switch (proto) {
+ case IPPROTO_ICMP:
+ icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
+ if (!icmp)
+ return export_flow_keys(keys, BPF_DROP);
+ return export_flow_keys(keys, BPF_OK);
+ case IPPROTO_IPIP:
+ keys->is_encap = true;
+ if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return export_flow_keys(keys, BPF_OK);
+
+ return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
+ case IPPROTO_IPV6:
+ keys->is_encap = true;
+ if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return export_flow_keys(keys, BPF_OK);
+
+ return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
+ case IPPROTO_GRE:
+ gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
+ if (!gre)
+ return export_flow_keys(keys, BPF_DROP);
+
+ if (bpf_htons(gre->flags & GRE_VERSION))
+ /* Only inspect standard GRE packets with version 0 */
+ return export_flow_keys(keys, BPF_OK);
+
+ keys->thoff += sizeof(*gre); /* Step over GRE Flags and Proto */
+ if (GRE_IS_CSUM(gre->flags))
+ keys->thoff += 4; /* Step over chksum and Padding */
+ if (GRE_IS_KEY(gre->flags))
+ keys->thoff += 4; /* Step over key */
+ if (GRE_IS_SEQ(gre->flags))
+ keys->thoff += 4; /* Step over sequence number */
+
+ keys->is_encap = true;
+ if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
+ return export_flow_keys(keys, BPF_OK);
+
+ /* ETH_P_TEB: an Ethernet header follows; step over it and
+ * dispatch on its EtherType
+ */
+ if (gre->proto == bpf_htons(ETH_P_TEB)) {
+ eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
+ &_eth);
+ if (!eth)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->thoff += sizeof(*eth);
+
+ return parse_eth_proto(skb, eth->h_proto);
+ } else {
+ return parse_eth_proto(skb, gre->proto);
+ }
+ case IPPROTO_TCP:
+ tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp);
+ if (!tcp)
+ return export_flow_keys(keys, BPF_DROP);
+
+ /* doff counts 32-bit words; fewer than 5 is shorter than
+ * the fixed TCP header
+ */
+ if (tcp->doff < 5)
+ return export_flow_keys(keys, BPF_DROP);
+
+ /* the header including options must end before data_end */
+ if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->sport = tcp->source;
+ keys->dport = tcp->dest;
+ return export_flow_keys(keys, BPF_OK);
+ case IPPROTO_UDP:
+ case IPPROTO_UDPLITE:
+ udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp);
+ if (!udp)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->sport = udp->source;
+ keys->dport = udp->dest;
+ return export_flow_keys(keys, BPF_OK);
+ default:
+ return export_flow_keys(keys, BPF_DROP);
+ }
+
+ /* not reached: every case above returns */
+ return export_flow_keys(keys, BPF_DROP);
+}
+
+/* IPv6 next-header dispatch: hop-by-hop/destination options and fragment
+ * headers are handed to their tail-call programs, everything else goes to
+ * parse_ip_proto(). Returns BPF_DROP if a tail call does not take over.
+ */
+static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+
+ switch (nexthdr) {
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_DSTOPTS:
+ bpf_tail_call_static(skb, &jmp_table, IPV6OP);
+ break;
+ case IPPROTO_FRAGMENT:
+ bpf_tail_call_static(skb, &jmp_table, IPV6FR);
+ break;
+ default:
+ return parse_ip_proto(skb, nexthdr);
+ }
+
+ /* reached via the break statements above, i.e. the tail call returned */
+ return export_flow_keys(keys, BPF_DROP);
+}
+
+/* IPv4 program: validates the header at keys->thoff, fills in the IPv4
+ * addresses and protocol, advances thoff past the header (ihl words) and
+ * either stops at fragments or continues with parse_ip_proto().
+ */
+PROG(IP)(struct __sk_buff *skb)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ void *data = (void *)(long)skb->data;
+ struct iphdr *iph, _iph;
+ bool done = false;
+
+ iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
+ if (!iph)
+ return export_flow_keys(keys, BPF_DROP);
+
+ /* IP header cannot be smaller than 20 bytes */
+ if (iph->ihl < 5)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->addr_proto = ETH_P_IP;
+ keys->ipv4_src = iph->saddr;
+ keys->ipv4_dst = iph->daddr;
+ keys->ip_proto = iph->protocol;
+
+ /* ihl is in 32-bit words; the next header must start within the data */
+ keys->thoff += iph->ihl << 2;
+ if (data + keys->thoff > data_end)
+ return export_flow_keys(keys, BPF_DROP);
+
+ if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
+ keys->is_frag = true;
+ if (iph->frag_off & bpf_htons(IP_OFFSET)) {
+ /* From second fragment on, packets do not have headers
+ * we can parse.
+ */
+ done = true;
+ } else {
+ keys->is_first_frag = true;
+ /* No need to parse fragmented packet unless
+ * explicitly asked for.
+ */
+ if (!(keys->flags &
+ BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+ done = true;
+ }
+ }
+
+ if (done)
+ return export_flow_keys(keys, BPF_OK);
+
+ return parse_ip_proto(skb, iph->protocol);
+}
+
+/* IPv6 program: fills in addresses, next header and flow label, advances
+ * thoff past the fixed header and continues with parse_ipv6_proto() unless
+ * BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL is set.
+ */
+PROG(IPV6)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct ipv6hdr *ip6h, _ip6h;
+
+ ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
+ if (!ip6h)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->addr_proto = ETH_P_IPV6;
+ /* one copy of 2 * sizeof(saddr) bytes: relies on daddr directly
+ * following saddr and on ipv6_dst directly following ipv6_src
+ */
+ memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
+
+ keys->thoff += sizeof(struct ipv6hdr);
+ keys->ip_proto = ip6h->nexthdr;
+ keys->flow_label = ip6_flowlabel(ip6h);
+
+ if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
+ return export_flow_keys(keys, BPF_OK);
+
+ return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+/* IPv6 hop-by-hop / destination options program: steps over the options
+ * header and continues with the header it announces.
+ */
+PROG(IPV6OP)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct ipv6_opt_hdr *ip6h, _ip6h;
+
+ ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
+ if (!ip6h)
+ return export_flow_keys(keys, BPF_DROP);
+
+ /* hlen is in 8-octets and does not include the first 8 bytes
+ * of the header
+ */
+ keys->thoff += (1 + ip6h->hdrlen) << 3;
+ keys->ip_proto = ip6h->nexthdr;
+
+ return parse_ipv6_proto(skb, ip6h->nexthdr);
+}
+
+/* IPv6 fragment header program: marks the flow as fragmented, steps over
+ * the fragment header and continues with the next header. A first fragment
+ * (offset bits all zero) stops here with BPF_OK unless
+ * BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG is set.
+ */
+PROG(IPV6FR)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct frag_hdr *fragh, _fragh;
+
+ fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh);
+ if (!fragh)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->thoff += sizeof(*fragh);
+ keys->is_frag = true;
+ keys->ip_proto = fragh->nexthdr;
+
+ if (!(fragh->frag_off & bpf_htons(IP6_OFFSET))) {
+ keys->is_first_frag = true;
+
+ /* No need to parse fragmented packet unless
+ * explicitly asked for.
+ */
+ if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
+ return export_flow_keys(keys, BPF_OK);
+ }
+
+ return parse_ipv6_proto(skb, fragh->nexthdr);
+}
+
+/* MPLS program: only checks that one MPLS label is readable at thoff; the
+ * label is not parsed and dissection stops with BPF_OK.
+ */
+PROG(MPLS)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct mpls_label *mpls, _mpls;
+
+ mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls);
+ if (!mpls)
+ return export_flow_keys(keys, BPF_DROP);
+
+ return export_flow_keys(keys, BPF_OK);
+}
+
+/* VLAN program: steps over one 802.1Q tag, or over an 802.1AD tag followed
+ * by an 802.1Q tag, then dispatches on the encapsulated EtherType. Any
+ * other tag stacking is dropped.
+ */
+PROG(VLAN)(struct __sk_buff *skb)
+{
+ struct bpf_flow_keys *keys = skb->flow_keys;
+ struct vlan_hdr *vlan, _vlan;
+
+ /* Account for double-tagging */
+ if (keys->n_proto == bpf_htons(ETH_P_8021AD)) {
+ vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
+ if (!vlan)
+ return export_flow_keys(keys, BPF_DROP);
+
+ /* the outer 802.1AD tag must be followed by an 802.1Q tag */
+ if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->nhoff += sizeof(*vlan);
+ keys->thoff += sizeof(*vlan);
+ }
+
+ vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
+ if (!vlan)
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->nhoff += sizeof(*vlan);
+ keys->thoff += sizeof(*vlan);
+ /* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
+ if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
+ vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
+ return export_flow_keys(keys, BPF_DROP);
+
+ keys->n_proto = vlan->h_vlan_encapsulated_proto;
+ return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h
new file mode 100644
index 000000000..6a1255465
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define bpf_iter_meta bpf_iter_meta___not_used
+#define bpf_iter__bpf_map bpf_iter__bpf_map___not_used
+#define bpf_iter__ipv6_route bpf_iter__ipv6_route___not_used
+#define bpf_iter__netlink bpf_iter__netlink___not_used
+#define bpf_iter__task bpf_iter__task___not_used
+#define bpf_iter__task_file bpf_iter__task_file___not_used
+#define bpf_iter__tcp bpf_iter__tcp___not_used
+#define tcp6_sock tcp6_sock___not_used
+#define bpf_iter__udp bpf_iter__udp___not_used
+#define udp6_sock udp6_sock___not_used
+#define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used
+#define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used
+#define bpf_iter__sockmap bpf_iter__sockmap___not_used
+#define btf_ptr btf_ptr___not_used
+#define BTF_F_COMPACT BTF_F_COMPACT___not_used
+#define BTF_F_NONAME BTF_F_NONAME___not_used
+#define BTF_F_PTR_RAW BTF_F_PTR_RAW___not_used
+#define BTF_F_ZERO BTF_F_ZERO___not_used
+#include "vmlinux.h"
+#undef bpf_iter_meta
+#undef bpf_iter__bpf_map
+#undef bpf_iter__ipv6_route
+#undef bpf_iter__netlink
+#undef bpf_iter__task
+#undef bpf_iter__task_file
+#undef bpf_iter__tcp
+#undef tcp6_sock
+#undef bpf_iter__udp
+#undef udp6_sock
+#undef bpf_iter__bpf_map_elem
+#undef bpf_iter__bpf_sk_storage_map
+#undef bpf_iter__sockmap
+#undef btf_ptr
+#undef BTF_F_COMPACT
+#undef BTF_F_NONAME
+#undef BTF_F_PTR_RAW
+#undef BTF_F_ZERO
+
+/* Metadata reachable through ctx->meta in every iterator context below. */
+struct bpf_iter_meta {
+ struct seq_file *seq; /* output target for BPF_SEQ_PRINTF / bpf_seq_write */
+ __u64 session_id;
+ __u64 seq_num; /* programs in this series print their header when it is 0 */
+} __attribute__((preserve_access_index));
+
+/* Context of SEC("iter/ipv6_route") programs; programs check rt for NULL. */
+struct bpf_iter__ipv6_route {
+ struct bpf_iter_meta *meta;
+ struct fib6_info *rt;
+} __attribute__((preserve_access_index));
+
+/* Context of SEC("iter/netlink") programs; programs check sk for NULL. */
+struct bpf_iter__netlink {
+ struct bpf_iter_meta *meta;
+ struct netlink_sock *sk;
+} __attribute__((preserve_access_index));
+
+/* Iterator context carrying a task_struct - presumably for "iter/task"
+ * programs, none of which are visible in this part of the series.
+ */
+struct bpf_iter__task {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+} __attribute__((preserve_access_index));
+
+/* Iterator context carrying a task together with one fd number and its
+ * struct file - presumably for "iter/task_file" programs (TODO confirm).
+ */
+struct bpf_iter__task_file {
+ struct bpf_iter_meta *meta;
+ struct task_struct *task;
+ __u32 fd;
+ struct file *file;
+} __attribute__((preserve_access_index));
+
+/* Context of SEC("iter/bpf_map") programs; programs check map for NULL. */
+struct bpf_iter__bpf_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+} __attribute__((preserve_access_index));
+
+/* Iterator context carrying a sock_common and a uid - presumably for
+ * "iter/tcp" programs (TODO confirm).
+ */
+struct bpf_iter__tcp {
+ struct bpf_iter_meta *meta;
+ struct sock_common *sk_common;
+ uid_t uid;
+} __attribute__((preserve_access_index));
+
+/* Local declaration of tcp6_sock: a tcp_sock followed by ipv6_pinfo, with
+ * field accesses relocated through preserve_access_index.
+ */
+struct tcp6_sock {
+ struct tcp_sock tcp;
+ struct ipv6_pinfo inet6;
+} __attribute__((preserve_access_index));
+
+/* Iterator context carrying a udp_sock, a uid and a bucket number -
+ * presumably for "iter/udp" programs (TODO confirm). uid and bucket are
+ * each forced onto an 8-byte boundary.
+ */
+struct bpf_iter__udp {
+ struct bpf_iter_meta *meta;
+ struct udp_sock *udp_sk;
+ uid_t uid __attribute__((aligned(8)));
+ int bucket __attribute__((aligned(8)));
+} __attribute__((preserve_access_index));
+
+/* Local declaration of udp6_sock: a udp_sock followed by ipv6_pinfo, with
+ * field accesses relocated through preserve_access_index.
+ */
+struct udp6_sock {
+ struct udp_sock udp;
+ struct ipv6_pinfo inet6;
+} __attribute__((preserve_access_index));
+
+/* Context of SEC("iter/bpf_map_elem") programs. Programs check key and
+ * value for NULL before use. Unlike the structs above, this one is
+ * declared without preserve_access_index.
+ */
+struct bpf_iter__bpf_map_elem {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+ void *key;
+ void *value;
+};
+
+/* Context of SEC("iter/bpf_sk_storage_map") programs: the socket that owns
+ * the storage and a pointer to its value. Programs check both for NULL.
+ */
+struct bpf_iter__bpf_sk_storage_map {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+ struct sock *sk;
+ void *value;
+};
+
+/* Iterator context carrying a map, a key and a socket - presumably for
+ * "iter/sockmap" programs (TODO confirm against bpf_iter_sockmap.c).
+ */
+struct bpf_iter__sockmap {
+ struct bpf_iter_meta *meta;
+ struct bpf_map *map;
+ void *key;
+ struct sock *sk;
+};
+
+/* Local declaration of btf_ptr: a pointer plus a type id and flags. No
+ * user is visible in this part of the series; flags presumably take the
+ * BTF_F_* values (TODO confirm).
+ */
+struct btf_ptr {
+ void *ptr;
+ __u32 type_id;
+ __u32 flags;
+};
+
+/* Local definitions of the BTF_F_* flag bits; the vmlinux.h names were
+ * renamed out of the way by the #define / #undef sequence at the top of
+ * this header.
+ */
+enum {
+ BTF_F_COMPACT = (1ULL << 0),
+ BTF_F_NONAME = (1ULL << 1),
+ BTF_F_PTR_RAW = (1ULL << 2),
+ BTF_F_ZERO = (1ULL << 3),
+};
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c
new file mode 100644
index 000000000..6286023fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Not referenced anywhere else in this file (arraymap1 uses __u32 keys). */
+struct key_t {
+ int a;
+ int b;
+ int c;
+};
+
+/* Three-entry array map with __u64 values. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 3);
+ __type(key, __u32);
+ __type(value, __u64);
+} arraymap1 SEC(".maps");
+
+__u32 key_sum = 0;
+__u64 val_sum = 0;
+
+/* Array map iterator: writes the raw key and value of each element to the
+ * seq file, adds them to key_sum / val_sum and then overwrites the element's
+ * value with its key. Always returns 0.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_array_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+ __u32 *key = ctx->key;
+ __u64 *val = ctx->value;
+
+ /* nothing to do for a call without an element */
+ if (key == (void *)0 || val == (void *)0)
+ return 0;
+
+ bpf_seq_write(ctx->meta->seq, key, sizeof(__u32));
+ bpf_seq_write(ctx->meta->seq, val, sizeof(__u64));
+ key_sum += *key;
+ val_sum += *val;
+ /* write through the value pointer: the map element itself is modified */
+ *val = *key;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
new file mode 100644
index 000000000..6dfce3fd6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* 12-byte key used by hashmap1 and hashmap3. */
+struct key_t {
+ int a;
+ int b;
+ int c;
+};
+
+/* struct key_t -> __u64; the only map that dump_bpf_hash_map() updates. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 3);
+ __type(key, struct key_t);
+ __type(value, __u64);
+} hashmap1 SEC(".maps");
+
+/* __u64 -> __u64: key smaller than struct key_t. The comment inside
+ * dump_bpf_hash_map() says attaching the iterator to this map should be
+ * rejected.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 3);
+ __type(key, __u64);
+ __type(value, __u64);
+} hashmap2 SEC(".maps");
+
+/* struct key_t -> __u32: value smaller than the __u64 the program reads.
+ * The comment inside dump_bpf_hash_map() says attaching the iterator to
+ * this map should be rejected.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 3);
+ __type(key, struct key_t);
+ __type(value, __u32);
+} hashmap3 SEC(".maps");
+
+/* will set before prog run */
+bool in_test_mode = 0;
+
+/* will collect results during prog run */
+__u32 key_sum_a = 0, key_sum_b = 0, key_sum_c = 0;
+__u64 val_sum = 0;
+
+/* Hash map iterator.
+ *
+ * In test mode (in_test_mode set by user space before the run) it updates
+ * and then deletes the current key in hashmap1 and accumulates the key
+ * fields and the value into key_sum_a/b/c and val_sum. Otherwise it prints
+ * one line per element, framed by "map dump starts" / "map dump ends".
+ *
+ * Always returns 0.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+	struct seq_file *seq = ctx->meta->seq;
+	/* bpf_iter_meta::seq_num is __u64; keep its full width instead of
+	 * narrowing it to 32 bits
+	 */
+	__u64 seq_num = ctx->meta->seq_num;
+	struct bpf_map *map = ctx->map;
+	struct key_t *key = ctx->key;
+	struct key_t tmp_key;
+	__u64 *val = ctx->value;
+	__u64 tmp_val = 0;
+	int ret;
+
+	if (in_test_mode) {
+		/* test mode is used by selftests to
+		 * test functionality of bpf_hash_map iter.
+		 *
+		 * the above hashmap1 will have correct size
+		 * and will be accepted, hashmap2 and hashmap3
+		 * should be rejected due to smaller key/value
+		 * size.
+		 */
+		if (key == (void *)0 || val == (void *)0)
+			return 0;
+
+		/* update the value and then delete the <key, value> pair.
+		 * it should not impact the existing 'val' which is still
+		 * accessible under rcu.
+		 */
+		__builtin_memcpy(&tmp_key, key, sizeof(struct key_t));
+		ret = bpf_map_update_elem(&hashmap1, &tmp_key, &tmp_val, 0);
+		if (ret)
+			return 0;
+		ret = bpf_map_delete_elem(&hashmap1, &tmp_key);
+		if (ret)
+			return 0;
+
+		key_sum_a += key->a;
+		key_sum_b += key->b;
+		key_sum_c += key->c;
+		val_sum += *val;
+		return 0;
+	}
+
+	/* non-test mode, the map is prepared with the
+	 * below bpftool command sequence:
+	 * bpftool map create /sys/fs/bpf/m1 type hash \
+	 * key 12 value 8 entries 3 name map1
+	 * bpftool map update id 77 key 0 0 0 1 0 0 0 0 0 0 0 1 \
+	 * value 0 0 0 1 0 0 0 1
+	 * bpftool map update id 77 key 0 0 0 1 0 0 0 0 0 0 0 2 \
+	 * value 0 0 0 1 0 0 0 2
+	 * The bpftool iter command line:
+	 * bpftool iter pin ./bpf_iter_bpf_hash_map.o /sys/fs/bpf/p1 \
+	 * map id 77
+	 * The below output will be:
+	 * map dump starts
+	 * 77: (1000000 0 2000000) (200000001000000)
+	 * 77: (1000000 0 1000000) (100000001000000)
+	 * map dump ends
+	 */
+	if (seq_num == 0)
+		BPF_SEQ_PRINTF(seq, "map dump starts\n");
+
+	/* the call without an element marks the end of the dump */
+	if (key == (void *)0 || val == (void *)0) {
+		BPF_SEQ_PRINTF(seq, "map dump ends\n");
+		return 0;
+	}
+
+	BPF_SEQ_PRINTF(seq, "%d: (%x %d %x) (%llx)\n", map->id,
+		       key->a, key->b, key->c, *val);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
new file mode 100644
index 000000000..08651b23e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* bpf_map iterator: prints a column header on the first call, one line per
+ * map (id, refcnt, usercnt, locked_vm of the owning user) and an END marker
+ * on the call that carries no map. Always returns 0.
+ */
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ __u64 seq_num = ctx->meta->seq_num;
+ struct bpf_map *map = ctx->map;
+
+ /* the call without a map marks the end of the iteration */
+ if (map == (void *)0) {
+ BPF_SEQ_PRINTF(seq, " %%%%%% END %%%%%%\n");
+ return 0;
+ }
+
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " id refcnt usercnt locked_vm\n");
+
+ BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter,
+ map->usercnt.counter,
+ map->memory.user->locked_vm.counter);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
new file mode 100644
index 000000000..85fa710fa
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Not referenced anywhere else in this file (arraymap1 uses __u32 keys). */
+struct key_t {
+ int a;
+ int b;
+ int c;
+};
+
+/* Three-entry per-CPU array map with __u32 values. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 3);
+ __type(key, __u32);
+ __type(value, __u32);
+} arraymap1 SEC(".maps");
+
+/* will set before prog run */
+volatile const __u32 num_cpus = 0;
+
+__u32 key_sum = 0, val_sum = 0;
+
+/* Per-CPU array map iterator: adds each key to key_sum and the value of
+ * every CPU (num_cpus of them, set by user space before load) to val_sum.
+ * Always returns 0.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_percpu_array_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+ __u32 *key = ctx->key;
+ void *pptr = ctx->value;
+ __u32 step;
+ int i;
+
+ if (key == (void *)0 || pptr == (void *)0)
+ return 0;
+
+ key_sum += *key;
+
+ /* per-CPU copies are read 8 bytes apart although the value is a __u32 -
+ * presumably each slot is padded to 8 bytes (TODO confirm)
+ */
+ step = 8;
+ for (i = 0; i < num_cpus; i++) {
+ val_sum += *(__u32 *)pptr;
+ pptr += step;
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c
new file mode 100644
index 000000000..feaaa2b89
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* 12-byte key of hashmap1. */
+struct key_t {
+ int a;
+ int b;
+ int c;
+};
+
+/* Three-entry per-CPU hash map: struct key_t -> __u32. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(max_entries, 3);
+ __type(key, struct key_t);
+ __type(value, __u32);
+} hashmap1 SEC(".maps");
+
+/* will set before prog run */
+volatile const __u32 num_cpus = 0;
+
+/* will collect results during prog run */
+__u32 key_sum_a = 0, key_sum_b = 0, key_sum_c = 0;
+__u32 val_sum = 0;
+
+/* Per-CPU hash map iterator: adds the key fields to key_sum_a/b/c and the
+ * value of every CPU (num_cpus of them, set by user space before load) to
+ * val_sum. Always returns 0.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_percpu_hash_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+ struct key_t *key = ctx->key;
+ void *pptr = ctx->value;
+ __u32 step;
+ int i;
+
+ if (key == (void *)0 || pptr == (void *)0)
+ return 0;
+
+ key_sum_a += key->a;
+ key_sum_b += key->b;
+ key_sum_c += key->c;
+
+ /* per-CPU copies are read 8 bytes apart although the value is a __u32 -
+ * presumably each slot is padded to 8 bytes (TODO confirm)
+ */
+ step = 8;
+ for (i = 0; i < num_cpus; i++) {
+ val_sum += *(__u32 *)pptr;
+ pptr += step;
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c
new file mode 100644
index 000000000..6b70ccaba
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Socket-local storage with one int per socket; the iterator below sums
+ * the stored values.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, int);
+} sk_stg_map SEC(".maps");
+
+__u32 val_sum = 0;
+__u32 ipv6_sk_count = 0;
+
+/* sk_storage iterator: counts AF_INET6 sockets in ipv6_sk_count and adds
+ * every stored value to val_sum. Always returns 0.
+ */
+SEC("iter/bpf_sk_storage_map")
+int dump_bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx)
+{
+ struct sock *sk = ctx->sk;
+ __u32 *val = ctx->value;
+
+ if (sk == (void *)0 || val == (void *)0)
+ return 0;
+
+ if (sk->sk_family == AF_INET6)
+ ipv6_sk_count++;
+
+ val_sum += *val;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
new file mode 100644
index 000000000..d58d9f164
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_ipv6_route.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+extern bool CONFIG_IPV6_SUBTREES __kconfig __weak;
+
+/* ipv6_route iterator: prints one line per route with destination, source
+ * (zeros unless CONFIG_IPV6_SUBTREES), gateway (zeros if none), metric,
+ * refcount, a literal 0, flags and, when present, the device name.
+ * Always returns 0.
+ */
+SEC("iter/ipv6_route")
+int dump_ipv6_route(struct bpf_iter__ipv6_route *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct fib6_info *rt = ctx->rt;
+ const struct net_device *dev;
+ struct fib6_nh *fib6_nh;
+ unsigned int flags;
+ struct nexthop *nh;
+
+ if (rt == (void *)0)
+ return 0;
+
+ fib6_nh = &rt->fib6_nh[0];
+ flags = rt->fib6_flags;
+
+ /* FIXME: nexthop_is_multipath is not handled here. */
+ /* a nexthop object, when present, replaces the route's own fib6_nh */
+ nh = rt->nh;
+ if (rt->nh)
+ fib6_nh = &nh->nh_info->fib6_nh;
+
+ BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
+
+ if (CONFIG_IPV6_SUBTREES)
+ BPF_SEQ_PRINTF(seq, "%pi6 %02x ", &rt->fib6_src.addr,
+ rt->fib6_src.plen);
+ else
+ BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 00 ");
+
+ if (fib6_nh->fib_nh_gw_family) {
+ flags |= RTF_GATEWAY;
+ BPF_SEQ_PRINTF(seq, "%pi6 ", &fib6_nh->fib_nh_gw6);
+ } else {
+ BPF_SEQ_PRINTF(seq, "00000000000000000000000000000000 ");
+ }
+
+ dev = fib6_nh->fib_nh_dev;
+ if (dev)
+ BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x %8s\n", rt->fib6_metric,
+ rt->fib6_ref.refs.counter, 0, flags, dev->name);
+ else
+ BPF_SEQ_PRINTF(seq, "%08x %08x %08x %08x\n", rt->fib6_metric,
+ rt->fib6_ref.refs.counter, 0, flags);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
new file mode 100644
index 000000000..95989f4c9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_netlink.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Return the address of the vfs_inode member of the struct socket_alloc
+ * that contains @socket. The function is explicitly marked noinline.
+ */
+static __attribute__((noinline)) struct inode *SOCK_INODE(struct socket *socket)
+{
+ return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
+}
+
+/*
+ * Print one line per netlink socket: socket pointer, protocol, port id,
+ * first word of the group bitmap, receive/send memory counters, cb_running,
+ * reference count, drop counter and inode number. A column header is
+ * printed before the first socket (seq_num == 0). Always returns 0.
+ */
+SEC("iter/netlink")
+int dump_netlink(struct bpf_iter__netlink *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct netlink_sock *nlk = ctx->sk;
+ unsigned long group, ino;
+ struct inode *inode;
+ struct socket *sk;
+ struct sock *s;
+
+ /* Nothing to print when no socket is supplied. */
+ if (nlk == (void *)0)
+ return 0;
+
+ if (ctx->meta->seq_num == 0)
+ BPF_SEQ_PRINTF(seq, "sk Eth Pid Groups "
+ "Rmem Wmem Dump Locks Drops "
+ "Inode\n");
+
+ s = &nlk->sk;
+ BPF_SEQ_PRINTF(seq, "%pK %-3d ", s, s->sk_protocol);
+
+ /* Only the first word of the groups array is reported. */
+ if (!nlk->groups) {
+ group = 0;
+ } else {
+ /* FIXME: temporarily use bpf_probe_read_kernel here; this needs
+ * verifier support to do direct access.
+ */
+ bpf_probe_read_kernel(&group, sizeof(group), &nlk->groups[0]);
+ }
+ BPF_SEQ_PRINTF(seq, "%-10u %08x %-8d %-8d %-5d %-8d ",
+ nlk->portid, (u32)group,
+ s->sk_rmem_alloc.counter,
+ s->sk_wmem_alloc.refs.counter - 1,
+ nlk->cb_running, s->sk_refcnt.refs.counter);
+
+ /* Note: the local named sk is the struct socket, not the struct sock. */
+ sk = s->sk_socket;
+ if (!sk) {
+ ino = 0;
+ } else {
+ /* FIXME: container_of inside SOCK_INODE has a forced
+ * type conversion, and direct access cannot be used
+ * with current verifier.
+ */
+ inode = SOCK_INODE(sk);
+ bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+ }
+ BPF_SEQ_PRINTF(seq, "%-8u %-8lu\n", s->sk_drops.counter, ino);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c
new file mode 100644
index 000000000..f3af0e30c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_sockmap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * SOCKMAP with 64 entries, __u32 keys and __u64 values. It is not referenced
+ * by the program in this file; presumably the userspace test fills it and
+ * iterates over it - TODO confirm.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 64);
+ __type(key, __u32);
+ __type(value, __u64);
+} sockmap SEC(".maps");
+
+/*
+ * SOCKHASH with the same geometry as sockmap; likewise not referenced by the
+ * program in this file (presumably a second iteration source - TODO confirm).
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 64);
+ __type(key, __u32);
+ __type(value, __u64);
+} sockhash SEC(".maps");
+
+/* SOCKHASH that copy() updates and deletes from. */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 64);
+ __type(key, __u32);
+ __type(value, __u64);
+} dst SEC(".maps");
+
+/* Number of copy() invocations that had a non-NULL key. */
+__u32 elems = 0;
+/* Number of copy() invocations that also had a non-NULL socket. */
+__u32 socks = 0;
+
+/*
+ * Mirror the iterated map element into the dst map. When the element holds
+ * a socket it is stored in dst under the same key; otherwise the key is
+ * deleted from dst.
+ *
+ * Returns 0 when there is no key, 1 when the update fails, and non-zero when
+ * the delete fails with anything other than -ENOENT; 0 otherwise.
+ */
+SEC("iter/sockmap")
+int copy(struct bpf_iter__sockmap *ctx)
+{
+ struct sock *sk = ctx->sk;
+ __u32 tmp, *key = ctx->key;
+ int ret;
+
+ if (!key)
+ return 0;
+
+ elems++;
+
+ /* We need a temporary buffer on the stack, since the verifier doesn't
+ * let us use the pointer from the context as an argument to the helper.
+ */
+ tmp = *key;
+
+ if (sk) {
+ socks++;
+ return bpf_map_update_elem(&dst, &tmp, sk, 0) != 0;
+ }
+
+ /* No socket for this key: drop it from dst; a missing key is not an error. */
+ ret = bpf_map_delete_elem(&dst, &tmp);
+ return ret && ret != -ENOENT;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task.c b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
new file mode 100644
index 000000000..b7f32c160
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Print the tgid and pid of every task the iterator visits, preceded by a
+ * column header on the first task (seq_num == 0). When the iterator passes a
+ * NULL task, an end marker line is printed instead. Always returns 0.
+ */
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ static char info[] = " === END ===";
+
+ if (task == (void *)0) {
+ BPF_SEQ_PRINTF(seq, "%s\n", info);
+ return 0;
+ }
+
+ /* The second column holds task->pid, so label it "pid". */
+ if (ctx->meta->seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " tgid pid\n");
+
+ BPF_SEQ_PRINTF(seq, "%8d %8d\n", task->tgid, task->pid);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
new file mode 100644
index 000000000..a1ddc36f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_btf.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020, Oracle and/or its affiliates. */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include <errno.h>
+
+char _license[] SEC("license") = "GPL";
+
+long tasks = 0;
+long seq_err = 0;
+bool skip = false;
+
+/*
+ * Dump each task_struct through bpf_seq_printf_btf() when the compiler
+ * provides __builtin_btf_type_id; otherwise only set the global skip flag.
+ *
+ * Globals written: tasks (count of successful dumps), seq_err (last
+ * unexpected helper return value), skip (builtin unavailable).
+ * Returns 1 when the helper reports -E2BIG, 0 in every other case.
+ */
+SEC("iter/task")
+int dump_task_struct(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ static struct btf_ptr ptr = { };
+ long ret;
+
+#if __has_builtin(__builtin_btf_type_id)
+ /* Describe the object to print: kernel BTF type id plus its address. */
+ ptr.type_id = bpf_core_type_id_kernel(struct task_struct);
+ ptr.ptr = task;
+
+ if (ctx->meta->seq_num == 0)
+ BPF_SEQ_PRINTF(seq, "Raw BTF task\n");
+
+ ret = bpf_seq_printf_btf(seq, &ptr, sizeof(ptr), 0);
+ switch (ret) {
+ case 0:
+ tasks++;
+ break;
+ case -ERANGE:
+ /* NULL task or task->fs, don't count it as an error. */
+ break;
+ case -E2BIG:
+ /* The only path that returns non-zero from this program. */
+ return 1;
+ default:
+ seq_err = ret;
+ break;
+ }
+#else
+ skip = true;
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
new file mode 100644
index 000000000..b2f7c7c5f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_file.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+int count = 0;
+int tgid = 0;
+
+/*
+ * Print tgid, pid, fd number and file->f_op address for every (task, file)
+ * pair the iterator visits, with a column header before the first entry.
+ *
+ * The global count is reset on the first entry and incremented for every
+ * entry whose task belongs to the thread group in the global tgid but is not
+ * the group leader (tgid != pid). Always returns 0.
+ */
+SEC("iter/task_file")
+int dump_task_file(struct bpf_iter__task_file *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ __u32 fd = ctx->fd;
+ struct file *file = ctx->file;
+
+ if (task == (void *)0 || file == (void *)0)
+ return 0;
+
+ /* The second column holds task->pid, so label it "pid". */
+ if (ctx->meta->seq_num == 0) {
+ count = 0;
+ BPF_SEQ_PRINTF(seq, " tgid pid fd file\n");
+ }
+
+ if (tgid == task->tgid && task->tgid != task->pid)
+ count++;
+
+ BPF_SEQ_PRINTF(seq, "%8d %8d %8d %lx\n", task->tgid, task->pid, fd,
+ (long)file->f_op);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
new file mode 100644
index 000000000..50e59a2e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_task_stack.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define MAX_STACK_TRACE_DEPTH 64
+unsigned long entries[MAX_STACK_TRACE_DEPTH] = {};
+#define SIZE_OF_ULONG (sizeof(unsigned long))
+
+/*
+ * For each task, fetch up to MAX_STACK_TRACE_DEPTH stack entries into the
+ * global entries[] buffer with bpf_get_task_stack() and print the pid, the
+ * number of entries and one "%pB"-formatted line per entry, followed by a
+ * blank line. Tasks for which the helper returns a negative value are
+ * skipped silently. Always returns 0.
+ */
+SEC("iter/task")
+int dump_task_stack(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ long i, retlen;
+
+ if (task == (void *)0)
+ return 0;
+
+ /* retlen is a byte count on success, negative on failure. */
+ retlen = bpf_get_task_stack(task, entries,
+ MAX_STACK_TRACE_DEPTH * SIZE_OF_ULONG, 0);
+ if (retlen < 0)
+ return 0;
+
+ BPF_SEQ_PRINTF(seq, "pid: %8u num_entries: %8u\n", task->pid,
+ retlen / SIZE_OF_ULONG);
+ /* Constant loop bound; only slots that fall inside retlen bytes are printed. */
+ for (i = 0; i < MAX_STACK_TRACE_DEPTH; i++) {
+ if (retlen > i * SIZE_OF_ULONG)
+ BPF_SEQ_PRINTF(seq, "[<0>] %pB\n", (void *)entries[i]);
+ }
+ BPF_SEQ_PRINTF(seq, "\n");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
new file mode 100644
index 000000000..aa96b604b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp4.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Return non-zero when the node's pprev pointer is NULL. */
+static int hlist_unhashed_lockless(const struct hlist_node *h)
+{
+ return !(h->pprev);
+}
+
+/* Return non-zero when timer->entry is hashed (its pprev is non-NULL). */
+static int timer_pending(const struct timer_list * timer)
+{
+ return !hlist_unhashed_lockless(&timer->entry);
+}
+
+extern unsigned CONFIG_HZ __kconfig;
+
+#define USER_HZ 100
+#define NSEC_PER_SEC 1000000000ULL
+/*
+ * Convert a jiffies count @x into USER_HZ ticks, using the CONFIG_HZ value
+ * obtained through the __kconfig extern.
+ */
+static clock_t jiffies_to_clock_t(unsigned long x)
+{
+ /* The implementation here is tailored to a particular
+ * setting of USER_HZ.
+ */
+ u64 tick_nsec = (NSEC_PER_SEC + CONFIG_HZ/2) / CONFIG_HZ;
+ u64 user_hz_nsec = NSEC_PER_SEC / USER_HZ;
+
+ /* When one jiffy is a whole number of USER_HZ periods, scale by the
+ * integer ratio of the two rates.
+ */
+ if ((tick_nsec % user_hz_nsec) == 0) {
+ if (CONFIG_HZ < USER_HZ)
+ return x * (USER_HZ / CONFIG_HZ);
+ else
+ return x / (CONFIG_HZ / USER_HZ);
+ }
+ /* Otherwise convert through nanoseconds per tick. */
+ return x * tick_nsec/user_hz_nsec;
+}
+
+/*
+ * Convert a signed jiffies difference to USER_HZ ticks; zero and negative
+ * deltas yield 0.
+ */
+static clock_t jiffies_delta_to_clock_t(long delta)
+{
+ if (delta <= 0)
+ return 0;
+
+ return jiffies_to_clock_t(delta);
+}
+
+/*
+ * Return the inode number associated with @sk, or 0 when sk->sk_socket is
+ * NULL. The inode is the vfs_inode member of the struct socket_alloc that
+ * contains the socket; i_ino is fetched with bpf_probe_read_kernel(), whose
+ * return value is not checked here.
+ */
+static long sock_i_ino(const struct sock *sk)
+{
+ const struct socket *sk_socket = sk->sk_socket;
+ const struct inode *inode;
+ unsigned long ino;
+
+ if (!sk_socket)
+ return 0;
+
+ inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
+ bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+ return ino;
+}
+
+/* True when icsk_ack.pingpong has reached TCP_PINGPONG_THRESH. */
+static bool
+inet_csk_in_pingpong_mode(const struct inet_connection_sock *icsk)
+{
+ return icsk->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+}
+
+/* True when snd_ssthresh is at or above TCP_INFINITE_SSTHRESH. */
+static bool tcp_in_initial_slowstart(const struct tcp_sock *tcp)
+{
+ return tcp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
+}
+
+/*
+ * Print one line describing a full IPv4 TCP socket: sequence number,
+ * local/remote address and port, state, queue sizes, timer information,
+ * retransmits, uid, probe count, inode, reference count, socket pointer,
+ * rto/ato in clock ticks, ack quick/pingpong bits, snd_cwnd and a final
+ * field that depends on the state (see below). Always returns 0.
+ */
+static int dump_tcp_sock(struct seq_file *seq, struct tcp_sock *tp,
+ uid_t uid, __u32 seq_num)
+{
+ const struct inet_connection_sock *icsk;
+ const struct fastopen_queue *fastopenq;
+ const struct inet_sock *inet;
+ unsigned long timer_expires;
+ const struct sock *sp;
+ __u16 destp, srcp;
+ __be32 dest, src;
+ int timer_active;
+ int rx_queue;
+ int state;
+
+ /* Walk down the embedded structures: tcp_sock -> icsk -> inet -> sock. */
+ icsk = &tp->inet_conn;
+ inet = &icsk->icsk_inet;
+ sp = &inet->sk;
+ fastopenq = &icsk->icsk_accept_queue.fastopenq;
+
+ dest = inet->inet_daddr;
+ src = inet->inet_rcv_saddr;
+ destp = bpf_ntohs(inet->inet_dport);
+ srcp = bpf_ntohs(inet->inet_sport);
+
+ /* timer_active code: 1 = retransmit/reorder/loss-probe timer,
+ * 4 = zero-window probe timer, 2 = sk_timer pending, 0 = none
+ * (expiry set to "now" so the printed delta is 0).
+ */
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ timer_active = 1;
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = icsk->icsk_timeout;
+ } else if (timer_pending(&sp->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sp->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = bpf_jiffies64();
+ }
+
+ /* rx_queue is the accept backlog for listeners, otherwise the
+ * received-but-not-copied byte count clamped at 0.
+ */
+ state = sp->sk_state;
+ if (state == TCP_LISTEN) {
+ rx_queue = sp->sk_ack_backlog;
+ } else {
+ rx_queue = tp->rcv_nxt - tp->copied_seq;
+ if (rx_queue < 0)
+ rx_queue = 0;
+ }
+
+ BPF_SEQ_PRINTF(seq, "%4d: %08X:%04X %08X:%04X ",
+ seq_num, src, srcp, dest, destp);
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d ",
+ state,
+ tp->write_seq - tp->snd_una, rx_queue,
+ timer_active,
+ jiffies_delta_to_clock_t(timer_expires - bpf_jiffies64()),
+ icsk->icsk_retransmits, uid,
+ icsk->icsk_probes_out,
+ sock_i_ino(sp),
+ sp->sk_refcnt.refs.counter);
+ /* Last field: fastopen max_qlen for listeners; otherwise -1 while in
+ * initial slow start, else snd_ssthresh.
+ */
+ BPF_SEQ_PRINTF(seq, "%pK %lu %lu %u %u %d\n",
+ tp,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(icsk),
+ tp->snd_cwnd,
+ state == TCP_LISTEN ? fastopenq->max_qlen
+ : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
+ );
+
+ return 0;
+}
+
+/*
+ * Print one line for an IPv4 time-wait socket. Queue sizes, retransmits,
+ * uid, timeout and inode columns are printed as literal zeros, the timer
+ * code is the constant 3, and the timer value is the remaining time of
+ * tw_timer in clock ticks. @uid is accepted but not used. Always returns 0.
+ */
+static int dump_tw_sock(struct seq_file *seq, struct tcp_timewait_sock *ttw,
+ uid_t uid, __u32 seq_num)
+{
+ struct inet_timewait_sock *tw = &ttw->tw_sk;
+ __u16 destp, srcp;
+ __be32 dest, src;
+ long delta;
+
+ /* Jiffies left until tw_timer expires; may be negative. */
+ delta = tw->tw_timer.expires - bpf_jiffies64();
+ dest = tw->tw_daddr;
+ src = tw->tw_rcv_saddr;
+ destp = bpf_ntohs(tw->tw_dport);
+ srcp = bpf_ntohs(tw->tw_sport);
+
+ BPF_SEQ_PRINTF(seq, "%4d: %08X:%04X %08X:%04X ",
+ seq_num, src, srcp, dest, destp);
+
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+ tw->tw_substate, 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ tw->tw_refcnt.refs.counter, tw);
+
+ return 0;
+}
+
+/*
+ * Print one line for an IPv4 request socket. The state column is the
+ * constant TCP_SYN_RECV, the timer code is the constant 1 and the timer
+ * value is the remaining time of rsk_timer (clamped at 0) in clock ticks.
+ * Always returns 0.
+ */
+static int dump_req_sock(struct seq_file *seq, struct tcp_request_sock *treq,
+ uid_t uid, __u32 seq_num)
+{
+ struct inet_request_sock *irsk = &treq->req;
+ struct request_sock *req = &irsk->req;
+ long ttd;
+
+ ttd = req->rsk_timer.expires - bpf_jiffies64();
+
+ if (ttd < 0)
+ ttd = 0;
+
+ /* ir_num is printed as-is; only the remote port goes through bpf_ntohs(). */
+ BPF_SEQ_PRINTF(seq, "%4d: %08X:%04X %08X:%04X ",
+ seq_num, irsk->ir_loc_addr,
+ irsk->ir_num, irsk->ir_rmt_addr,
+ bpf_ntohs(irsk->ir_rmt_port));
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+ TCP_SYN_RECV, 0, 0, 1, jiffies_to_clock_t(ttd),
+ req->num_timeout, uid, 0, 0, 0, req);
+
+ return 0;
+}
+
+/*
+ * Iterator entry point for IPv4 TCP sockets. Prints a column header on the
+ * first socket, skips sockets whose family is not AF_INET, then tries to
+ * cast the socket to a full TCP socket, a time-wait socket and a request
+ * socket in that order, dispatching to the matching dump helper. Returns 0
+ * when nothing matches, otherwise the helper's return value.
+ */
+SEC("iter/tcp")
+int dump_tcp4(struct bpf_iter__tcp *ctx)
+{
+ struct sock_common *sk_common = ctx->sk_common;
+ struct seq_file *seq = ctx->meta->seq;
+ struct tcp_timewait_sock *tw;
+ struct tcp_request_sock *req;
+ struct tcp_sock *tp;
+ uid_t uid = ctx->uid;
+ __u32 seq_num;
+
+ if (sk_common == (void *)0)
+ return 0;
+
+ /* The header is emitted before the family filter below. */
+ seq_num = ctx->meta->seq_num;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " sl "
+ "local_address "
+ "rem_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt"
+ " uid timeout inode\n");
+
+ if (sk_common->skc_family != AF_INET)
+ return 0;
+
+ /* Each cast helper yields NULL when the socket is not of that kind. */
+ tp = bpf_skc_to_tcp_sock(sk_common);
+ if (tp)
+ return dump_tcp_sock(seq, tp, uid, seq_num);
+
+ tw = bpf_skc_to_tcp_timewait_sock(sk_common);
+ if (tw)
+ return dump_tw_sock(seq, tw, uid, seq_num);
+
+ req = bpf_skc_to_tcp_request_sock(sk_common);
+ if (req)
+ return dump_req_sock(seq, req, uid, seq_num);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
new file mode 100644
index 000000000..b4fbddfa4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_tcp6.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Return non-zero when the node's pprev pointer is NULL. */
+static int hlist_unhashed_lockless(const struct hlist_node *h)
+{
+ return !(h->pprev);
+}
+
+/* Return non-zero when timer->entry is hashed (its pprev is non-NULL). */
+static int timer_pending(const struct timer_list * timer)
+{
+ return !hlist_unhashed_lockless(&timer->entry);
+}
+
+extern unsigned CONFIG_HZ __kconfig;
+
+#define USER_HZ 100
+#define NSEC_PER_SEC 1000000000ULL
+/*
+ * Convert a jiffies count @x into USER_HZ ticks, using the CONFIG_HZ value
+ * obtained through the __kconfig extern.
+ */
+static clock_t jiffies_to_clock_t(unsigned long x)
+{
+ /* The implementation here is tailored to a particular
+ * setting of USER_HZ.
+ */
+ u64 tick_nsec = (NSEC_PER_SEC + CONFIG_HZ/2) / CONFIG_HZ;
+ u64 user_hz_nsec = NSEC_PER_SEC / USER_HZ;
+
+ /* When one jiffy is a whole number of USER_HZ periods, scale by the
+ * integer ratio of the two rates.
+ */
+ if ((tick_nsec % user_hz_nsec) == 0) {
+ if (CONFIG_HZ < USER_HZ)
+ return x * (USER_HZ / CONFIG_HZ);
+ else
+ return x / (CONFIG_HZ / USER_HZ);
+ }
+ /* Otherwise convert through nanoseconds per tick. */
+ return x * tick_nsec/user_hz_nsec;
+}
+
+/*
+ * Convert a signed jiffies difference to USER_HZ ticks; zero and negative
+ * deltas yield 0.
+ */
+static clock_t jiffies_delta_to_clock_t(long delta)
+{
+ if (delta <= 0)
+ return 0;
+
+ return jiffies_to_clock_t(delta);
+}
+
+/*
+ * Return the inode number associated with @sk, or 0 when sk->sk_socket is
+ * NULL. The inode is the vfs_inode member of the struct socket_alloc that
+ * contains the socket; i_ino is fetched with bpf_probe_read_kernel(), whose
+ * return value is not checked here.
+ */
+static long sock_i_ino(const struct sock *sk)
+{
+ const struct socket *sk_socket = sk->sk_socket;
+ const struct inode *inode;
+ unsigned long ino;
+
+ if (!sk_socket)
+ return 0;
+
+ inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
+ bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+ return ino;
+}
+
+/* True when icsk_ack.pingpong has reached TCP_PINGPONG_THRESH. */
+static bool
+inet_csk_in_pingpong_mode(const struct inet_connection_sock *icsk)
+{
+ return icsk->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+}
+
+/* True when snd_ssthresh is at or above TCP_INFINITE_SSTHRESH. */
+static bool tcp_in_initial_slowstart(const struct tcp_sock *tcp)
+{
+ return tcp->snd_ssthresh >= TCP_INFINITE_SSTHRESH;
+}
+
+/*
+ * Print one line describing a full IPv6 TCP socket: sequence number,
+ * local/remote address (four 32-bit words each) and port, state, queue
+ * sizes, timer information, retransmits, uid, probe count, inode, reference
+ * count, socket pointer, rto/ato in clock ticks, ack quick/pingpong bits,
+ * snd_cwnd and a final field that depends on the state (see below).
+ * Always returns 0.
+ */
+static int dump_tcp6_sock(struct seq_file *seq, struct tcp6_sock *tp,
+ uid_t uid, __u32 seq_num)
+{
+ const struct inet_connection_sock *icsk;
+ const struct fastopen_queue *fastopenq;
+ const struct in6_addr *dest, *src;
+ const struct inet_sock *inet;
+ unsigned long timer_expires;
+ const struct sock *sp;
+ __u16 destp, srcp;
+ int timer_active;
+ int rx_queue;
+ int state;
+
+ /* Walk down the embedded structures: tcp6_sock -> tcp -> icsk -> inet -> sock. */
+ icsk = &tp->tcp.inet_conn;
+ inet = &icsk->icsk_inet;
+ sp = &inet->sk;
+ fastopenq = &icsk->icsk_accept_queue.fastopenq;
+
+ dest = &sp->sk_v6_daddr;
+ src = &sp->sk_v6_rcv_saddr;
+ destp = bpf_ntohs(inet->inet_dport);
+ srcp = bpf_ntohs(inet->inet_sport);
+
+ /* timer_active code: 1 = retransmit/reorder/loss-probe timer,
+ * 4 = zero-window probe timer, 2 = sk_timer pending, 0 = none
+ * (expiry set to "now" so the printed delta is 0).
+ */
+ if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
+ icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ timer_active = 1;
+ timer_expires = icsk->icsk_timeout;
+ } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_active = 4;
+ timer_expires = icsk->icsk_timeout;
+ } else if (timer_pending(&sp->sk_timer)) {
+ timer_active = 2;
+ timer_expires = sp->sk_timer.expires;
+ } else {
+ timer_active = 0;
+ timer_expires = bpf_jiffies64();
+ }
+
+ /* rx_queue is the accept backlog for listeners, otherwise the
+ * received-but-not-copied byte count clamped at 0.
+ */
+ state = sp->sk_state;
+ if (state == TCP_LISTEN) {
+ rx_queue = sp->sk_ack_backlog;
+ } else {
+ rx_queue = tp->tcp.rcv_nxt - tp->tcp.copied_seq;
+ if (rx_queue < 0)
+ rx_queue = 0;
+ }
+
+ BPF_SEQ_PRINTF(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X ",
+ seq_num,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp);
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d ",
+ state,
+ tp->tcp.write_seq - tp->tcp.snd_una, rx_queue,
+ timer_active,
+ jiffies_delta_to_clock_t(timer_expires - bpf_jiffies64()),
+ icsk->icsk_retransmits, uid,
+ icsk->icsk_probes_out,
+ sock_i_ino(sp),
+ sp->sk_refcnt.refs.counter);
+ /* Last field: fastopen max_qlen for listeners; otherwise -1 while in
+ * initial slow start, else snd_ssthresh.
+ */
+ BPF_SEQ_PRINTF(seq, "%pK %lu %lu %u %u %d\n",
+ tp,
+ jiffies_to_clock_t(icsk->icsk_rto),
+ jiffies_to_clock_t(icsk->icsk_ack.ato),
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(icsk),
+ tp->tcp.snd_cwnd,
+ state == TCP_LISTEN ? fastopenq->max_qlen
+ : (tcp_in_initial_slowstart(&tp->tcp) ? -1
+ : tp->tcp.snd_ssthresh)
+ );
+
+ return 0;
+}
+
+/*
+ * Print one line for an IPv6 time-wait socket. Queue sizes, retransmits,
+ * uid, timeout and inode columns are printed as literal zeros, the timer
+ * code is the constant 3, and the timer value is the remaining time of
+ * tw_timer in clock ticks. @uid is accepted but not used. Always returns 0.
+ */
+static int dump_tw_sock(struct seq_file *seq, struct tcp_timewait_sock *ttw,
+ uid_t uid, __u32 seq_num)
+{
+ struct inet_timewait_sock *tw = &ttw->tw_sk;
+ const struct in6_addr *dest, *src;
+ __u16 destp, srcp;
+ long delta;
+
+ /* Jiffies left until tw_timer expires; may be negative. */
+ delta = tw->tw_timer.expires - bpf_jiffies64();
+ dest = &tw->tw_v6_daddr;
+ src = &tw->tw_v6_rcv_saddr;
+ destp = bpf_ntohs(tw->tw_dport);
+ srcp = bpf_ntohs(tw->tw_sport);
+
+ BPF_SEQ_PRINTF(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X ",
+ seq_num,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp);
+
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+ tw->tw_substate, 0, 0,
+ 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
+ tw->tw_refcnt.refs.counter, tw);
+
+ return 0;
+}
+
+/*
+ * Print one line for an IPv6 request socket. The state column is the
+ * constant TCP_SYN_RECV, the timer code is the constant 1 and the timer
+ * value is the remaining time of rsk_timer (clamped at 0) in clock ticks.
+ * Always returns 0.
+ */
+static int dump_req_sock(struct seq_file *seq, struct tcp_request_sock *treq,
+ uid_t uid, __u32 seq_num)
+{
+ struct inet_request_sock *irsk = &treq->req;
+ struct request_sock *req = &irsk->req;
+ struct in6_addr *src, *dest;
+ long ttd;
+
+ ttd = req->rsk_timer.expires - bpf_jiffies64();
+ src = &irsk->ir_v6_loc_addr;
+ dest = &irsk->ir_v6_rmt_addr;
+
+ if (ttd < 0)
+ ttd = 0;
+
+ /* ir_num is printed as-is; only the remote port goes through bpf_ntohs(). */
+ BPF_SEQ_PRINTF(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X ",
+ seq_num,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3],
+ irsk->ir_num,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3],
+ bpf_ntohs(irsk->ir_rmt_port));
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n",
+ TCP_SYN_RECV, 0, 0, 1, jiffies_to_clock_t(ttd),
+ req->num_timeout, uid, 0, 0, 0, req);
+
+ return 0;
+}
+
+/*
+ * Iterator entry point for IPv6 TCP sockets. Prints a column header on the
+ * first socket, skips sockets whose family is not AF_INET6, then tries to
+ * cast the socket to a full TCP6 socket, a time-wait socket and a request
+ * socket in that order, dispatching to the matching dump helper. Returns 0
+ * when nothing matches, otherwise the helper's return value.
+ */
+SEC("iter/tcp")
+int dump_tcp6(struct bpf_iter__tcp *ctx)
+{
+ struct sock_common *sk_common = ctx->sk_common;
+ struct seq_file *seq = ctx->meta->seq;
+ struct tcp_timewait_sock *tw;
+ struct tcp_request_sock *req;
+ struct tcp6_sock *tp;
+ uid_t uid = ctx->uid;
+ __u32 seq_num;
+
+ if (sk_common == (void *)0)
+ return 0;
+
+ /* The header is emitted before the family filter below. */
+ seq_num = ctx->meta->seq_num;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, " sl "
+ "local_address "
+ "remote_address "
+ "st tx_queue rx_queue tr tm->when retrnsmt"
+ " uid timeout inode\n");
+
+ if (sk_common->skc_family != AF_INET6)
+ return 0;
+
+ /* Each cast helper yields NULL when the socket is not of that kind. */
+ tp = bpf_skc_to_tcp6_sock(sk_common);
+ if (tp)
+ return dump_tcp6_sock(seq, tp, uid, seq_num);
+
+ tw = bpf_skc_to_tcp_timewait_sock(sk_common);
+ if (tw)
+ return dump_tw_sock(seq, tw, uid, seq_num);
+
+ req = bpf_skc_to_tcp_request_sock(sk_common);
+ if (req)
+ return dump_req_sock(seq, req, uid, seq_num);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
new file mode 100644
index 000000000..c71a7c283
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern1.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'a'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
new file mode 100644
index 000000000..8bdc8dc07
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern2.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define START_CHAR 'A'
+#include "bpf_iter_test_kern_common.h"
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
new file mode 100644
index 000000000..2a4647f20
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern3.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Write the raw bytes of task->tgid to the seq_file for each task.
+ *
+ * NOTE(review): ctx->task is dereferenced without a NULL check, unlike the
+ * other task iterator programs in this directory. This is presumably a
+ * deliberate negative test for the verifier - confirm against the userspace
+ * test before "fixing" it.
+ */
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct task_struct *task = ctx->task;
+ int tgid;
+
+ tgid = task->tgid;
+ bpf_seq_write(seq, &tgid, sizeof(tgid));
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
new file mode 100644
index 000000000..ee49493dc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern4.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Ids of the two maps dump_bpf_map() reports on; all other maps are skipped. */
+__u32 map1_id = 0, map2_id = 0;
+/* How many times each of the two maps was visited. */
+__u32 map1_accessed = 0, map2_accessed = 0;
+/* seq_num observed for map1, and for the first and later visits of map2. */
+__u64 map1_seqnum = 0, map2_seqnum1 = 0, map2_seqnum2 = 0;
+
+/* Read-only knobs (presumably set by userspace before load - TODO confirm):
+ * print_len is the number of seq_num copies written per visit; a non-zero
+ * ret1 makes the first visit of map2 return 1.
+ */
+static volatile const __u32 print_len;
+static volatile const __u32 ret1;
+
+/*
+ * Visit every bpf map but act only on the two whose ids match map1_id or
+ * map2_id: record the sequence number and access count in the globals above,
+ * then write seq_num to the seq_file print_len times.
+ *
+ * Returns 1 on the first visit of map2 when ret1 is non-zero, 0 otherwise.
+ */
+SEC("iter/bpf_map")
+int dump_bpf_map(struct bpf_iter__bpf_map *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct bpf_map *map = ctx->map;
+ __u64 seq_num;
+ int i, ret = 0;
+
+ if (map == (void *)0)
+ return 0;
+
+ /* only dump map1_id and map2_id */
+ if (map->id != map1_id && map->id != map2_id)
+ return 0;
+
+ seq_num = ctx->meta->seq_num;
+ if (map->id == map1_id) {
+ map1_seqnum = seq_num;
+ map1_accessed++;
+ }
+
+ /* First visit of map2 goes to map2_seqnum1, later ones to map2_seqnum2. */
+ if (map->id == map2_id) {
+ if (map2_accessed == 0) {
+ map2_seqnum1 = seq_num;
+ if (ret1)
+ ret = 1;
+ } else {
+ map2_seqnum2 = seq_num;
+ }
+ map2_accessed++;
+ }
+
+ /* fill seq_file buffer */
+ for (i = 0; i < print_len; i++)
+ bpf_seq_write(seq, &seq_num, sizeof(seq_num));
+
+ return ret;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c
new file mode 100644
index 000000000..e3a7575e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Key type of hashmap1: three ints. */
+struct key_t {
+ int a;
+ int b;
+ int c;
+};
+
+/* Hash map with up to 3 entries, struct key_t keys and __u64 values. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 3);
+ __type(key, struct key_t);
+ __type(value, __u64);
+} hashmap1 SEC(".maps");
+
+/* Accumulator written by dump_bpf_hash_map(). */
+__u32 key_sum = 0;
+
+/*
+ * Read a __u32 located sizeof(struct key_t) bytes past the start of the
+ * element key and add it to key_sum. As the comment in the body states,
+ * this read is out of bounds with respect to hashmap1's key; presumably the
+ * program exists to check that such an access is rejected - TODO confirm
+ * against the userspace test.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+ void *key = ctx->key;
+
+ if (key == (void *)0)
+ return 0;
+
+ /* out of bound access w.r.t. hashmap1 */
+ key_sum += *(__u32 *)(key + sizeof(struct key_t));
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c
new file mode 100644
index 000000000..1c7304f56
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u32 value_sum = 0;
+
+/*
+ * Read a __u32 located 4 bytes before the start of the element value and
+ * add it to value_sum. The body's own comment marks this negative offset as
+ * a verifier failure, i.e. the program is expected to be rejected.
+ */
+SEC("iter/bpf_map_elem")
+int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx)
+{
+ void *value = ctx->value;
+
+ if (value == (void *)0)
+ return 0;
+
+ /* negative offset, verifier failure. */
+ value_sum += *(__u32 *)(value - 4);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
new file mode 100644
index 000000000..d5e3df66a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern_common.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+int count = 0;
+
+/*
+ * On each of the first four invocations, write one character to the
+ * seq_file: START_CHAR plus the current value of the global count, which is
+ * then incremented. Later invocations write nothing. START_CHAR must be
+ * defined by the file that includes this header. Always returns 0.
+ */
+SEC("iter/task")
+int dump_task(struct bpf_iter__task *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ char c;
+
+ if (count < 4) {
+ c = START_CHAR + count;
+ bpf_seq_write(seq, &c, sizeof(c));
+ count++;
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
new file mode 100644
index 000000000..f258583af
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp4.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+/*
+ * Return the inode number associated with @sk, or 0 when sk->sk_socket is
+ * NULL. The inode is the vfs_inode member of the struct socket_alloc that
+ * contains the socket; i_ino is fetched with bpf_probe_read_kernel(), whose
+ * return value is not checked here.
+ */
+static long sock_i_ino(const struct sock *sk)
+{
+ const struct socket *sk_socket = sk->sk_socket;
+ const struct inode *inode;
+ unsigned long ino;
+
+ if (!sk_socket)
+ return 0;
+
+ inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
+ bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+ return ino;
+}
+
+/*
+ * Print one line per UDP socket that is not AF_INET6: bucket number,
+ * local/remote address and port, state, send and receive queue sizes,
+ * constant zero timer columns, uid, inode, reference count, socket pointer
+ * and drop counter. A column header is printed before the first socket
+ * (seq_num == 0), ahead of the family filter. Always returns 0.
+ */
+SEC("iter/udp")
+int dump_udp4(struct bpf_iter__udp *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct udp_sock *udp_sk = ctx->udp_sk;
+ struct inet_sock *inet;
+ __u16 srcp, destp;
+ __be32 dest, src;
+ __u32 seq_num;
+ int rqueue;
+
+ if (udp_sk == (void *)0)
+ return 0;
+
+ seq_num = ctx->meta->seq_num;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq,
+ " sl local_address rem_address st tx_queue "
+ "rx_queue tr tm->when retrnsmt uid timeout "
+ "inode ref pointer drops\n");
+
+ /* filter out udp6 sockets */
+ inet = &udp_sk->inet;
+ if (inet->sk.sk_family == AF_INET6)
+ return 0;
+
+ dest = inet->inet_daddr;
+ src = inet->inet_rcv_saddr;
+ srcp = bpf_ntohs(inet->inet_sport);
+ destp = bpf_ntohs(inet->inet_dport);
+ /* Receive queue: rmem_alloc counter minus the socket's forward deficit. */
+ rqueue = inet->sk.sk_rmem_alloc.counter - udp_sk->forward_deficit;
+
+ BPF_SEQ_PRINTF(seq, "%5d: %08X:%04X %08X:%04X ",
+ ctx->bucket, src, srcp, dest, destp);
+
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
+ inet->sk.sk_state,
+ inet->sk.sk_wmem_alloc.refs.counter - 1,
+ rqueue,
+ 0, 0L, 0, ctx->uid, 0,
+ sock_i_ino(&inet->sk),
+ inet->sk.sk_refcnt.refs.counter, udp_sk,
+ inet->sk.sk_drops.counter);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
new file mode 100644
index 000000000..65f93bb03
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_iter_udp6.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include "bpf_iter.h"
+#include "bpf_tracing_net.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define IPV6_SEQ_DGRAM_HEADER \
+ " sl " \
+ "local_address " \
+ "remote_address " \
+ "st tx_queue rx_queue tr tm->when retrnsmt" \
+ " uid timeout inode ref pointer drops\n"
+
+/*
+ * Return the inode number associated with @sk, or 0 when sk->sk_socket is
+ * NULL. The inode is the vfs_inode member of the struct socket_alloc that
+ * contains the socket; i_ino is fetched with bpf_probe_read_kernel(), whose
+ * return value is not checked here.
+ */
+static long sock_i_ino(const struct sock *sk)
+{
+ const struct socket *sk_socket = sk->sk_socket;
+ const struct inode *inode;
+ unsigned long ino;
+
+ if (!sk_socket)
+ return 0;
+
+ inode = &container_of(sk_socket, struct socket_alloc, socket)->vfs_inode;
+ bpf_probe_read_kernel(&ino, sizeof(ino), &inode->i_ino);
+ return ino;
+}
+
+/*
+ * Print one line per UDP socket that bpf_skc_to_udp6_sock() accepts: bucket
+ * number, local/remote IPv6 address (four 32-bit words each) and port,
+ * state, send and receive queue sizes, constant zero timer columns, uid,
+ * inode, reference count, socket pointer and drop counter. The column header
+ * IPV6_SEQ_DGRAM_HEADER is printed before the first socket (seq_num == 0),
+ * ahead of the udp6 filter. Always returns 0.
+ */
+SEC("iter/udp")
+int dump_udp6(struct bpf_iter__udp *ctx)
+{
+ struct seq_file *seq = ctx->meta->seq;
+ struct udp_sock *udp_sk = ctx->udp_sk;
+ const struct in6_addr *dest, *src;
+ struct udp6_sock *udp6_sk;
+ struct inet_sock *inet;
+ __u16 srcp, destp;
+ __u32 seq_num;
+ int rqueue;
+
+ if (udp_sk == (void *)0)
+ return 0;
+
+ seq_num = ctx->meta->seq_num;
+ if (seq_num == 0)
+ BPF_SEQ_PRINTF(seq, IPV6_SEQ_DGRAM_HEADER);
+
+ /* Skip sockets the cast helper does not recognise as udp6; the result
+ * is only used as a filter, the fields below are read via udp_sk.
+ */
+ udp6_sk = bpf_skc_to_udp6_sock(udp_sk);
+ if (udp6_sk == (void *)0)
+ return 0;
+
+ inet = &udp_sk->inet;
+ srcp = bpf_ntohs(inet->inet_sport);
+ destp = bpf_ntohs(inet->inet_dport);
+ /* Receive queue: rmem_alloc counter minus the socket's forward deficit. */
+ rqueue = inet->sk.sk_rmem_alloc.counter - udp_sk->forward_deficit;
+ dest = &inet->sk.sk_v6_daddr;
+ src = &inet->sk.sk_v6_rcv_saddr;
+
+ BPF_SEQ_PRINTF(seq, "%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X ",
+ ctx->bucket,
+ src->s6_addr32[0], src->s6_addr32[1],
+ src->s6_addr32[2], src->s6_addr32[3], srcp,
+ dest->s6_addr32[0], dest->s6_addr32[1],
+ dest->s6_addr32[2], dest->s6_addr32[3], destp);
+
+ BPF_SEQ_PRINTF(seq, "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
+ inet->sk.sk_state,
+ inet->sk.sk_wmem_alloc.refs.counter - 1,
+ rqueue,
+ 0, 0L, 0, ctx->uid, 0,
+ sock_i_ino(&inet->sk),
+ inet->sk.sk_refcnt.refs.counter, udp_sk,
+ inet->sk.sk_drops.counter);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
new file mode 100644
index 000000000..013789112
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/*
+ * Constants and member-path shorthands shared by the networking bpf_iter
+ * programs in this directory. The numeric values are presumably copies of
+ * the corresponding kernel definitions - TODO confirm when updating.
+ */
+#ifndef __BPF_TRACING_NET_H__
+#define __BPF_TRACING_NET_H__
+
+/* Socket address families. */
+#define AF_INET 2
+#define AF_INET6 10
+
+/* Values compared against icsk_pending. */
+#define ICSK_TIME_RETRANS 1
+#define ICSK_TIME_PROBE0 3
+#define ICSK_TIME_LOSS_PROBE 5
+#define ICSK_TIME_REO_TIMEOUT 6
+
+#define IFNAMSIZ 16
+
+/* Route flag OR-ed in when a nexthop has a gateway. */
+#define RTF_GATEWAY 0x0002
+
+#define TCP_INFINITE_SSTHRESH 0x7fffffff
+#define TCP_PINGPONG_THRESH 3
+
+/* Shorthands for members nested in a nexthop's nh_common. */
+#define fib_nh_dev nh_common.nhc_dev
+#define fib_nh_gw_family nh_common.nhc_gw_family
+#define fib_nh_gw6 nh_common.nhc_gw.ipv6
+
+/* Shorthands for struct inet_sock members that live in sk.__sk_common. */
+#define inet_daddr sk.__sk_common.skc_daddr
+#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
+#define inet_dport sk.__sk_common.skc_dport
+
+/* Shorthands for request-socket members that live in req.__req_common. */
+#define ir_loc_addr req.__req_common.skc_rcv_saddr
+#define ir_num req.__req_common.skc_num
+#define ir_rmt_addr req.__req_common.skc_daddr
+#define ir_rmt_port req.__req_common.skc_dport
+#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
+#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
+
+/* Shorthands for struct sock members. */
+#define sk_family __sk_common.skc_family
+#define sk_rmem_alloc sk_backlog.rmem_alloc
+#define sk_refcnt __sk_common.skc_refcnt
+#define sk_state __sk_common.skc_state
+#define sk_v6_daddr __sk_common.skc_v6_daddr
+#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
+
+/* 32-bit word view of a struct in6_addr. */
+#define s6_addr32 in6_u.u6_addr32
+
+/* Shorthands for time-wait socket members that live in __tw_common. */
+#define tw_daddr __tw_common.skc_daddr
+#define tw_rcv_saddr __tw_common.skc_rcv_saddr
+#define tw_dport __tw_common.skc_dport
+#define tw_refcnt __tw_common.skc_refcnt
+#define tw_v6_daddr __tw_common.skc_v6_daddr
+#define tw_v6_rcv_saddr __tw_common.skc_v6_rcv_saddr
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays.c
new file mode 100644
index 000000000..018ed7fbb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_dim.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_dim.c
new file mode 100644
index 000000000..13d662c57
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_dim.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___diff_arr_dim x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_val_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_val_sz.c
new file mode 100644
index 000000000..a351f418c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___diff_arr_val_sz.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___diff_arr_val_sz x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c
new file mode 100644
index 000000000..65eac371b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___equiv_zero_sz_arr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___equiv_zero_sz_arr x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c
new file mode 100644
index 000000000..ecda2b545
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_bad_zero_sz_arr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___err_bad_zero_sz_arr x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_non_array.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_non_array.c
new file mode 100644
index 000000000..a8735009b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_non_array.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___err_non_array x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_shallow.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_shallow.c
new file mode 100644
index 000000000..2a67c28b1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_shallow.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___err_too_shallow x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_small.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_small.c
new file mode 100644
index 000000000..1142c08c9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_too_small.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___err_too_small x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_wrong_val_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_wrong_val_type.c
new file mode 100644
index 000000000..f5a7c832d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___err_wrong_val_type.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___err_wrong_val_type x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c
new file mode 100644
index 000000000..fe1d01232
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_arrays___fixed_arr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_arrays___fixed_arr x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields.c
new file mode 100644
index 000000000..cff6f1836
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_bitfields x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bit_sz_change.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bit_sz_change.c
new file mode 100644
index 000000000..a1cd157d5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bit_sz_change.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_bitfields___bit_sz_change x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bitfield_vs_int.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bitfield_vs_int.c
new file mode 100644
index 000000000..3f2c7b07c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___bitfield_vs_int.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_bitfields___bitfield_vs_int x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___err_too_big_bitfield.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___err_too_big_bitfield.c
new file mode 100644
index 000000000..f9746d6be
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___err_too_big_bitfield.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_bitfields___err_too_big_bitfield x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___just_big_enough.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___just_big_enough.c
new file mode 100644
index 000000000..e7c75a695
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_bitfields___just_big_enough.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_bitfields___just_big_enough x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval.c
new file mode 100644
index 000000000..48e62f3f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_enumval x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___diff.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___diff.c
new file mode 100644
index 000000000..53e5e5a76
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___diff.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_enumval___diff x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___err_missing.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___err_missing.c
new file mode 100644
index 000000000..d024fb2ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___err_missing.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_enumval___err_missing x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___val3_missing.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___val3_missing.c
new file mode 100644
index 000000000..9de6595d2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_enumval___val3_missing.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_enumval___val3_missing x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence.c
new file mode 100644
index 000000000..0b62315ad
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_existence x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___minimal.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___minimal.c
new file mode 100644
index 000000000..aec2dec20
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___minimal.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_existence___minimal x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c
new file mode 100644
index 000000000..d14b49619
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_existence___wrong_field_defs.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_existence___wrong_field_defs x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors.c
new file mode 100644
index 000000000..b74455b91
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_flavors x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors__err_wrong_name.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors__err_wrong_name.c
new file mode 100644
index 000000000..7b6035f86
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_flavors__err_wrong_name.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_flavors__err_wrong_name x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_ints.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints.c
new file mode 100644
index 000000000..7d0f04104
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_ints x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___bool.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___bool.c
new file mode 100644
index 000000000..f93594501
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___bool.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_ints___bool x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___reverse_sign.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___reverse_sign.c
new file mode 100644
index 000000000..aafb1c581
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_ints___reverse_sign.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_ints___reverse_sign x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_misc.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_misc.c
new file mode 100644
index 000000000..ed9ad8b5b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_misc.c
@@ -0,0 +1,5 @@
+#include "core_reloc_types.h"
+
+void f1(struct core_reloc_misc___a x) {}
+void f2(struct core_reloc_misc___b x) {}
+void f3(struct core_reloc_misc_extensible x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_mods.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods.c
new file mode 100644
index 000000000..124197a2e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_mods x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___mod_swap.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___mod_swap.c
new file mode 100644
index 000000000..f8a6592ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___mod_swap.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_mods___mod_swap x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___typedefs.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___typedefs.c
new file mode 100644
index 000000000..5c0d73687
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_mods___typedefs.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_mods___typedefs x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting.c
new file mode 100644
index 000000000..4480fcc0f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___anon_embed.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___anon_embed.c
new file mode 100644
index 000000000..13e108f76
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___anon_embed.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___anon_embed x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___dup_compat_types.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___dup_compat_types.c
new file mode 100644
index 000000000..76b54fda5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___dup_compat_types.c
@@ -0,0 +1,5 @@
+#include "core_reloc_types.h"
+
+void f1(struct core_reloc_nesting___dup_compat_types x) {}
+void f2(struct core_reloc_nesting___dup_compat_types__2 x) {}
+void f3(struct core_reloc_nesting___dup_compat_types__3 x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_container.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_container.c
new file mode 100644
index 000000000..975fb95db
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_container.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_array_container x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_field.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_field.c
new file mode 100644
index 000000000..ad66c67e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_array_field.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_array_field x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_dup_incompat_types.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_dup_incompat_types.c
new file mode 100644
index 000000000..35c5f8da6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_dup_incompat_types.c
@@ -0,0 +1,4 @@
+#include "core_reloc_types.h"
+
+void f1(struct core_reloc_nesting___err_dup_incompat_types__1 x) {}
+void f2(struct core_reloc_nesting___err_dup_incompat_types__2 x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_container.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_container.c
new file mode 100644
index 000000000..142e33204
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_container.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_missing_container x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_field.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_field.c
new file mode 100644
index 000000000..efcae167f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_missing_field.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_missing_field x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_nonstruct_container.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_nonstruct_container.c
new file mode 100644
index 000000000..97aaaedd8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_nonstruct_container.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_nonstruct_container x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_partial_match_dups.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_partial_match_dups.c
new file mode 100644
index 000000000..ffde35086
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_partial_match_dups.c
@@ -0,0 +1,4 @@
+#include "core_reloc_types.h"
+
+void f1(struct core_reloc_nesting___err_partial_match_dups__a x) {}
+void f2(struct core_reloc_nesting___err_partial_match_dups__b x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_too_deep.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_too_deep.c
new file mode 100644
index 000000000..39a2fadd8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___err_too_deep.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___err_too_deep x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___extra_nesting.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___extra_nesting.c
new file mode 100644
index 000000000..a09d9dfb2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___extra_nesting.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___extra_nesting x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___struct_union_mixup.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___struct_union_mixup.c
new file mode 100644
index 000000000..3d8a1a740
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_nesting___struct_union_mixup.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_nesting___struct_union_mixup x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c
new file mode 100644
index 000000000..96b90e392
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c
new file mode 100644
index 000000000..6e87233a3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_enum_def.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_enum_def x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c
new file mode 100644
index 000000000..d9f48e80b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_func_proto.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_func_proto x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c
new file mode 100644
index 000000000..c718f75f8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___diff_ptr_type.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___diff_ptr_type x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c
new file mode 100644
index 000000000..b8a120830
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_enum.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_enum x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c
new file mode 100644
index 000000000..ad8b3c9aa
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_int.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_int x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c
new file mode 100644
index 000000000..e20bc1d42
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_primitives___err_non_ptr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_primitives___err_non_ptr x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr.c
new file mode 100644
index 000000000..8da52432b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_ptr_as_arr x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr___diff_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr___diff_sz.c
new file mode 100644
index 000000000..003acfc9a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_ptr_as_arr___diff_sz.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_ptr_as_arr___diff_sz x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_size.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_size.c
new file mode 100644
index 000000000..3c80903da
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_size.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_size x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_size___diff_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_size___diff_sz.c
new file mode 100644
index 000000000..6dbd14436
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_size___diff_sz.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_size___diff_sz x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_size___err_ambiguous.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_size___err_ambiguous.c
new file mode 100644
index 000000000..f3e9904df
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_size___err_ambiguous.c
@@ -0,0 +1,4 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_size___err_ambiguous1 x,
+ struct core_reloc_size___err_ambiguous2 y) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based.c
new file mode 100644
index 000000000..fc3f69e58
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_based x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___all_missing.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___all_missing.c
new file mode 100644
index 000000000..51511648b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___all_missing.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_based___all_missing x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff_sz.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff_sz.c
new file mode 100644
index 000000000..67db3dceb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___diff_sz.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_based___diff_sz x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___fn_wrong_args.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___fn_wrong_args.c
new file mode 100644
index 000000000..b357fc654
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___fn_wrong_args.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_based___fn_wrong_args x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___incompat.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___incompat.c
new file mode 100644
index 000000000..8ddf20d33
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_based___incompat.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_based___incompat x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id.c
new file mode 100644
index 000000000..abbe5bddc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_id x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id___missing_targets.c b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id___missing_targets.c
new file mode 100644
index 000000000..24e7caf4f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf__core_reloc_type_id___missing_targets.c
@@ -0,0 +1,3 @@
+#include "core_reloc_types.h"
+
+void f(struct core_reloc_type_id___missing_targets x) {}
diff --git a/tools/testing/selftests/bpf/progs/btf_data.c b/tools/testing/selftests/bpf/progs/btf_data.c
new file mode 100644
index 000000000..baa525275
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_data.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+struct S {
+ int a;
+ int b;
+ int c;
+};
+
+union U {
+ int a;
+ int b;
+ int c;
+};
+
+struct S1 {
+ int a;
+ int b;
+ int c;
+};
+
+union U1 {
+ int a;
+ int b;
+ int c;
+};
+
+typedef int T;
+typedef int S;
+typedef int U;
+typedef int T1;
+typedef int S1;
+typedef int U1;
+
+struct root_struct {
+ S m_1;
+ T m_2;
+ U m_3;
+ S1 m_4;
+ T1 m_5;
+ U1 m_6;
+ struct S m_7;
+ struct S1 m_8;
+ union U m_9;
+ union U1 m_10;
+};
+
+int func(struct root_struct *root)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
new file mode 100644
index 000000000..22a7cd8fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_bitfields.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper tests for bitfield.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+#include <stdbool.h>
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct bitfields_only_mixed_types {
+ * int a: 3;
+ * long int b: 2;
+ * _Bool c: 1;
+ * enum {
+ * A = 0,
+ * B = 1,
+ * } d: 1;
+ * short e: 5;
+ * int: 20;
+ * unsigned int f: 30;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+struct bitfields_only_mixed_types {
+ int a: 3;
+ long int b: 2;
+ bool c: 1; /* it's really a _Bool type */
+ enum {
+ A, /* A = 0, dumper is very explicit */
+ B, /* B = 1, same */
+ } d: 1;
+ short e: 5;
+ /* 20-bit padding here */
+ unsigned f: 30; /* this gets aligned on 4-byte boundary */
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct bitfield_mixed_with_others {
+ * char: 4;
+ * int a: 4;
+ * short b;
+ * long int c;
+ * long int d: 8;
+ * int e;
+ * int f;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+struct bitfield_mixed_with_others {
+ char: 4; /* char is enough as a backing field */
+ int a: 4;
+ /* 8-bit implicit padding */
+ short b; /* combined with previous bitfield */
+ /* 4 more bytes of implicit padding */
+ long c;
+ long d: 8;
+ /* 24 bits implicit padding */
+ int e; /* combined with previous bitfield */
+ int f;
+ /* 4 bytes of padding */
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct bitfield_flushed {
+ * int a: 4;
+ * long: 60;
+ * long int b: 16;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+struct bitfield_flushed {
+ int a: 4;
+ long: 0; /* flush until next natural alignment boundary */
+ long b: 16;
+};
+
+int f(struct {
+ struct bitfields_only_mixed_types _1;
+ struct bitfield_mixed_with_others _2;
+ struct bitfield_flushed _3;
+} *_)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
new file mode 100644
index 000000000..ba97165bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_multidim.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper test for multi-dimensional array output.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+typedef int arr_t[2];
+
+typedef int multiarr_t[3][4][5];
+
+typedef int *ptr_arr_t[6];
+
+typedef int *ptr_multiarr_t[7][8][9][10];
+
+typedef int * (*fn_ptr_arr_t[11])();
+
+typedef int * (*fn_ptr_multiarr_t[12][13])();
+
+struct root_struct {
+ arr_t _1;
+ multiarr_t _2;
+ ptr_arr_t _3;
+ ptr_multiarr_t _4;
+ fn_ptr_arr_t _5;
+ fn_ptr_multiarr_t _6;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+int f(struct root_struct *s)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_namespacing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_namespacing.c
new file mode 100644
index 000000000..92a4ad428
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_namespacing.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper test validating no name versioning happens between
+ * independent C namespaces (struct/union/enum vs typedef/enum values).
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct S {
+ int S;
+ int U;
+};
+
+typedef struct S S;
+
+union U {
+ int S;
+ int U;
+};
+
+typedef union U U;
+
+enum E {
+ V = 0,
+};
+
+typedef enum E E;
+
+struct A {};
+
+union B {};
+
+enum C {
+ A = 1,
+ B = 2,
+ C = 3,
+};
+
+struct X {};
+
+union Y {};
+
+enum Z;
+
+typedef int X;
+
+typedef int Y;
+
+typedef int Z;
+
+/*------ END-EXPECTED-OUTPUT ------ */
+
+int f(struct {
+ struct S _1;
+ S _2;
+ union U _3;
+ U _4;
+ enum E _5;
+ E _6;
+ struct A a;
+ union B b;
+ enum C c;
+ struct X x;
+ union Y y;
+ enum Z *z;
+ X xx;
+ Y yy;
+ Z zz;
+} *_)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_ordering.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_ordering.c
new file mode 100644
index 000000000..7c95702ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_ordering.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper test for topological sorting of dependent structs.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct s1 {};
+
+struct s3;
+
+struct s4;
+
+struct s2 {
+ struct s2 *s2;
+ struct s3 *s3;
+ struct s4 *s4;
+};
+
+struct s3 {
+ struct s1 s1;
+ struct s2 s2;
+};
+
+struct s4 {
+ struct s1 s1;
+ struct s3 s3;
+};
+
+struct list_head {
+ struct list_head *next;
+ struct list_head *prev;
+};
+
+struct hlist_node {
+ struct hlist_node *next;
+ struct hlist_node **pprev;
+};
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct callback_head {
+ struct callback_head *next;
+ void (*func)(struct callback_head *);
+};
+
+struct root_struct {
+ struct s4 s4;
+ struct list_head l;
+ struct hlist_node n;
+ struct hlist_head h;
+ struct callback_head cb;
+};
+
+/*------ END-EXPECTED-OUTPUT ------ */
+
+int f(struct root_struct *root)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
new file mode 100644
index 000000000..22dbd1213
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_packing.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper tests for struct packing determination.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct packed_trailing_space {
+ int a;
+ short b;
+} __attribute__((packed));
+
+struct non_packed_trailing_space {
+ int a;
+ short b;
+};
+
+struct packed_fields {
+ short a;
+ int b;
+} __attribute__((packed));
+
+struct non_packed_fields {
+ short a;
+ int b;
+};
+
+struct nested_packed {
+ char: 4;
+ int a: 4;
+ long int b;
+ struct {
+ char c;
+ int d;
+ } __attribute__((packed)) e;
+} __attribute__((packed));
+
+union union_is_never_packed {
+ int a: 4;
+ char b;
+ char c: 1;
+};
+
+union union_does_not_need_packing {
+ struct {
+ long int a;
+ int b;
+ } __attribute__((packed));
+ int c;
+};
+
+union jump_code_union {
+ char code[5];
+ struct {
+ char jump;
+ int offset;
+ } __attribute__((packed));
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct nested_packed_but_aligned_struct {
+ * int x1;
+ * int x2;
+ *};
+ *
+ *struct outer_implicitly_packed_struct {
+ * char y1;
+ * struct nested_packed_but_aligned_struct y2;
+ *} __attribute__((packed));
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Input definitions for the commented-out dump directly above. Here the
+ * inner struct carries the packed attribute and the outer one does not; the
+ * dump above shows the opposite placement (inner plain, outer packed).
+ */
+struct nested_packed_but_aligned_struct {
+ int x1;
+ int x2;
+} __attribute__((packed));
+
+struct outer_implicitly_packed_struct {
+ char y1;
+ struct nested_packed_but_aligned_struct y2;
+};
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct usb_ss_ep_comp_descriptor {
+ * char: 8;
+ * char bDescriptorType;
+ * char bMaxBurst;
+ * short wBytesPerInterval;
+ *};
+ *
+ *struct usb_host_endpoint {
+ * long: 64;
+ * char: 8;
+ * struct usb_ss_ep_comp_descriptor ss_ep_comp;
+ * long: 0;
+ *} __attribute__((packed));
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Input definitions for the commented-out dump directly above. Differences
+ * from that dump: the descriptor is packed here and has a zero-width int
+ * bitfield before wBytesPerInterval; the dump shows neither and instead marks
+ * usb_host_endpoint as packed.
+ */
+struct usb_ss_ep_comp_descriptor {
+ char: 8;
+ char bDescriptorType;
+ char bMaxBurst;
+ int: 0;
+ short wBytesPerInterval;
+} __attribute__((packed));
+
+struct usb_host_endpoint {
+ long: 64;
+ char: 8;
+ struct usb_ss_ep_comp_descriptor ss_ep_comp;
+ long: 0;
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct nested_packed_struct {
+ int a;
+ char b;
+} __attribute__((packed));
+
+struct outer_nonpacked_struct {
+ short a;
+ struct nested_packed_struct b;
+};
+
+struct outer_packed_struct {
+ short a;
+ struct nested_packed_struct b;
+} __attribute__((packed));
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Anchor function: the anonymous struct parameter has one member for each
+ * top-level packing test type in this file (nested helper types are pulled
+ * in through their outer structs). Always returns 0.
+ */
+int f(struct {
+ struct packed_trailing_space _1;
+ struct non_packed_trailing_space _2;
+ struct packed_fields _3;
+ struct non_packed_fields _4;
+ struct nested_packed _5;
+ union union_is_never_packed _6;
+ union union_does_not_need_packing _7;
+ union jump_code_union _8;
+ struct outer_implicitly_packed_struct _9;
+ struct usb_host_endpoint _10;
+ struct outer_nonpacked_struct _11;
+ struct outer_packed_struct _12;
+} *_)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
new file mode 100644
index 000000000..0b3cdffbf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_padding.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper tests for implicit and explicit padding between fields and
+ * at the end of a struct.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct padded_implicitly {
+ int a;
+ long int b;
+ char c;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct padded_explicitly {
+ * int a;
+ * long: 0;
+ * int b;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+struct padded_explicitly {
+ int a;
+ int: 1; /* algo will emit aligning `long: 0;` here */
+ int b;
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct padded_a_lot {
+ int a;
+ long: 64;
+ long: 64;
+ int b;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct padded_cache_line {
+ * int a;
+ * long: 64;
+ * long: 64;
+ * long: 64;
+ * int b;
+ * long: 64;
+ * long: 64;
+ * long: 64;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Input for the commented-out dump directly above: b is declared with a
+ * 32-byte alignment attribute, while the dump spells the resulting gaps out
+ * as explicit 64-bit unnamed long bitfields after a and after b.
+ */
+struct padded_cache_line {
+ int a;
+ int b __attribute__((aligned(32)));
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct zone_padding {
+ * char x[0];
+ *};
+ *
+ *struct zone {
+ * int a;
+ * short b;
+ * long: 0;
+ * struct zone_padding __pad__;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Input for the commented-out dump directly above: zone_padding is a
+ * zero-length-array struct aligned to 8 bytes. The dump drops the alignment
+ * attribute and instead shows a zero-width long bitfield in struct zone just
+ * before __pad__.
+ */
+struct zone_padding {
+ char x[0];
+} __attribute__((__aligned__(8)));
+
+struct zone {
+ int a;
+ short b;
+ struct zone_padding __pad__;
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct padding_wo_named_members {
+ long: 64;
+ long: 64;
+};
+
+struct padding_weird_1 {
+ int a;
+ long: 64;
+ short: 16;
+ short b;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+/*
+ *struct padding_weird_2 {
+ * long: 56;
+ * char a;
+ * long: 56;
+ * char b;
+ * char: 8;
+ *};
+ *
+ */
+/* ------ END-EXPECTED-OUTPUT ------ */
+struct padding_weird_2 {
+ int: 32; /* these paddings will be collapsed into `long: 56;` */
+ short: 16;
+ char: 8;
+ char a;
+ int: 32; /* these paddings will be collapsed into `long: 56;` */
+ short: 16;
+ char: 8;
+ char b;
+ char: 8;
+};
+
+/* ----- START-EXPECTED-OUTPUT ----- */
+struct exact_1byte {
+ char x;
+};
+
+struct padded_1byte {
+ char: 8;
+};
+
+struct exact_2bytes {
+ short x;
+};
+
+struct padded_2bytes {
+ short: 16;
+};
+
+struct exact_4bytes {
+ int x;
+};
+
+struct padded_4bytes {
+ int: 32;
+};
+
+struct exact_8bytes {
+ long x;
+};
+
+struct padded_8bytes {
+ long: 64;
+};
+
+struct ff_periodic_effect {
+ int: 32;
+ short magnitude;
+ long: 0;
+ short phase;
+ long: 0;
+ int: 32;
+ int custom_len;
+ short *custom_data;
+};
+
+struct ib_wc {
+ long: 64;
+ long: 64;
+ int: 32;
+ int byte_len;
+ void *qp;
+ union {} ex;
+ long: 64;
+ int slid;
+ int wc_flags;
+ long: 64;
+ char smac[6];
+ long: 0;
+ char network_hdr_type;
+};
+
+struct acpi_object_method {
+ long: 64;
+ char: 8;
+ char type;
+ short reference_count;
+ char flags;
+ short: 0;
+ char: 8;
+ char sync_level;
+ long: 64;
+ void *node;
+ void *aml_start;
+ union {} dispatch;
+ long: 64;
+ int aml_length;
+};
+
+struct nested_unpacked {
+ int x;
+};
+
+struct nested_packed {
+ struct nested_unpacked a;
+ char c;
+} __attribute__((packed));
+
+struct outer_mixed_but_unpacked {
+ struct nested_packed b1;
+ short a1;
+ struct nested_packed b2;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Anchor function: the anonymous struct parameter has one member for each
+ * padding test type in this file. Member names are grouped numerically
+ * (_1.._8, _100.._107, _200.._203). Always returns 0.
+ */
+int f(struct {
+ struct padded_implicitly _1;
+ struct padded_explicitly _2;
+ struct padded_a_lot _3;
+ struct padded_cache_line _4;
+ struct zone _5;
+ struct padding_wo_named_members _6;
+ struct padding_weird_1 _7;
+ struct padding_weird_2 _8;
+ struct exact_1byte _100;
+ struct padded_1byte _101;
+ struct exact_2bytes _102;
+ struct padded_2bytes _103;
+ struct exact_4bytes _104;
+ struct padded_4bytes _105;
+ struct exact_8bytes _106;
+ struct padded_8bytes _107;
+ struct ff_periodic_effect _200;
+ struct ib_wc _201;
+ struct acpi_object_method _202;
+ struct outer_mixed_but_unpacked _203;
+} *_)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
new file mode 100644
index 000000000..fe43556e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+
+/*
+ * BTF-to-C dumper test for majority of C syntax quirks.
+ *
+ * Copyright (c) 2019 Facebook
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+enum e1 {
+ A = 0,
+ B = 1,
+};
+
+enum e2 {
+ C = 100,
+ D = 4294967295,
+ E = 0,
+};
+
+typedef enum e2 e2_t;
+
+typedef enum {
+ F = 0,
+ G = 1,
+ H = 2,
+} e3_t;
+
+typedef int int_t;
+
+typedef volatile const int * volatile const crazy_ptr_t;
+
+typedef int *****we_need_to_go_deeper_ptr_t;
+
+typedef volatile const we_need_to_go_deeper_ptr_t * restrict * volatile * const * restrict volatile * restrict const * volatile const * restrict volatile const how_about_this_ptr_t;
+
+typedef int *ptr_arr_t[10];
+
+typedef void (*fn_ptr1_t)(int);
+
+typedef void (*printf_fn_t)(const char *, ...);
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+/*
+ * While previous function pointers are pretty trivial (C-syntax-level
+ * trivial), the following are deciphered here for future generations:
+ *
+ * - `fn_ptr2_t`: function, taking anonymous struct as a first arg and pointer
+ * to a function, that takes int and returns int, as a second arg; returning
+ * a pointer to a const pointer to a char. Equivalent to:
+ * typedef struct { int a; } s_t;
+ * typedef int (*fn_t)(int);
+ * typedef char * const * (*fn_ptr2_t)(s_t, fn_t);
+ *
+ * - `fn_complex_t`: pointer to a function returning struct and accepting
+ * union and struct. All structs and unions are anonymous and defined inline.
+ *
+ * - `signal_t`: pointer to a function accepting a pointer to a function as an
+ * argument and returning pointer to a function as a result. Sane equivalent:
+ * typedef void (*signal_handler_t)(int);
+ * typedef signal_handler_t (*signal_ptr_t)(int, signal_handler_t);
+ *
+ * - fn_ptr_arr1_t: array of pointers to a function accepting pointer to
+ * a pointer to an int and returning pointer to a char. Easy.
+ *
+ * - fn_ptr_arr2_t: array of const pointers to a function taking no arguments
+ * and returning a const pointer to a function, that takes pointer to a
+ * `int -> char *` function and returns pointer to a char. Equivalent:
+ * typedef char * (*fn_input_t)(int);
+ * typedef char * (*fn_output_outer_t)(fn_input_t);
+ * typedef const fn_output_outer_t (* fn_output_inner_t)();
+ * typedef const fn_output_inner_t fn_ptr_arr2_t[5];
+ */
+/* ----- START-EXPECTED-OUTPUT ----- */
+typedef char * const * (*fn_ptr2_t)(struct {
+ int a;
+}, int (*)(int));
+
+typedef struct {
+ int a;
+ void (*b)(int, struct {
+ int c;
+ }, union {
+ char d;
+ int e[5];
+ });
+} (*fn_complex_t)(union {
+ void *f;
+ char g[16];
+}, struct {
+ int h;
+});
+
+typedef void (* (*signal_t)(int, void (*)(int)))(int);
+
+typedef char * (*fn_ptr_arr1_t[10])(int **);
+
+typedef char * (* (* const fn_ptr_arr2_t[5])())(char * (*)(int));
+
+struct struct_w_typedefs {
+ int_t a;
+ crazy_ptr_t b;
+ we_need_to_go_deeper_ptr_t c;
+ how_about_this_ptr_t d;
+ ptr_arr_t e;
+ fn_ptr1_t f;
+ printf_fn_t g;
+ fn_ptr2_t h;
+ fn_complex_t i;
+ signal_t j;
+ fn_ptr_arr1_t k;
+ fn_ptr_arr2_t l;
+};
+
+typedef struct {
+ int x;
+ int y;
+ int z;
+} anon_struct_t;
+
+struct struct_fwd;
+
+typedef struct struct_fwd struct_fwd_t;
+
+typedef struct struct_fwd *struct_fwd_ptr_t;
+
+union union_fwd;
+
+typedef union union_fwd union_fwd_t;
+
+typedef union union_fwd *union_fwd_ptr_t;
+
+struct struct_empty {};
+
+struct struct_simple {
+ int a;
+ char b;
+ const int_t *p;
+ struct struct_empty s;
+ enum e2 e;
+ enum {
+ ANON_VAL1 = 1,
+ ANON_VAL2 = 2,
+ } f;
+ int arr1[13];
+ enum e2 arr2[5];
+};
+
+union union_empty {};
+
+union union_simple {
+ void *ptr;
+ int num;
+ int_t num2;
+ union union_empty u;
+};
+
+struct struct_in_struct {
+ struct struct_simple simple;
+ union union_simple also_simple;
+ struct {
+ int a;
+ } not_so_hard_as_well;
+ union {
+ int b;
+ int c;
+ } anon_union_is_good;
+ struct {
+ int d;
+ int e;
+ };
+ union {
+ int f;
+ int g;
+ };
+};
+
+struct struct_with_embedded_stuff {
+ int a;
+ struct {
+ int b;
+ struct {
+ struct struct_with_embedded_stuff *c;
+ const char *d;
+ } e;
+ union {
+ volatile long int f;
+ void * restrict g;
+ };
+ };
+ union {
+ const int_t *h;
+ void (*i)(char, int, void *);
+ } j;
+ enum {
+ K = 100,
+ L = 200,
+ } m;
+ char n[16];
+ struct {
+ char o;
+ int p;
+ void (*q)(int);
+ } r[5];
+ struct struct_in_struct s[10];
+ int t[11];
+};
+
+struct root_struct {
+ enum e1 _1;
+ enum e2 _2;
+ e2_t _2_1;
+ e3_t _2_2;
+ struct struct_w_typedefs _3;
+ anon_struct_t _7;
+ struct struct_fwd *_8;
+ struct_fwd_t *_9;
+ struct_fwd_ptr_t _10;
+ union union_fwd *_11;
+ union_fwd_t *_12;
+ union_fwd_ptr_t _13;
+ struct struct_with_embedded_stuff _14;
+};
+
+/* ------ END-EXPECTED-OUTPUT ------ */
+
+/*
+ * Anchor function: takes a pointer to struct root_struct, whose members
+ * reference the enums, typedefs, forward declarations and structs declared
+ * above. Always returns 0.
+ */
+int f(struct root_struct *s)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/btf_ptr.h b/tools/testing/selftests/bpf/progs/btf_ptr.h
new file mode 100644
index 000000000..c3c9797c6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/btf_ptr.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020, Oracle and/or its affiliates. */
+/* "undefine" structs in vmlinux.h, because we "override" them below */
+#define btf_ptr btf_ptr___not_used
+#define BTF_F_COMPACT BTF_F_COMPACT___not_used
+#define BTF_F_NONAME BTF_F_NONAME___not_used
+#define BTF_F_PTR_RAW BTF_F_PTR_RAW___not_used
+#define BTF_F_ZERO BTF_F_ZERO___not_used
+#include "vmlinux.h"
+#undef btf_ptr
+#undef BTF_F_COMPACT
+#undef BTF_F_NONAME
+#undef BTF_F_PTR_RAW
+#undef BTF_F_ZERO
+
+/*
+ * Local definition of struct btf_ptr. The vmlinux.h version is renamed out of
+ * the way by the #define/#undef pairs around the include above, so this one
+ * is what the including programs see.
+ * NOTE(review): field meanings (pointer to the object, its BTF type id, and
+ * BTF_F_* flags) are inferred from the names -- confirm against the UAPI.
+ */
+struct btf_ptr {
+ void *ptr;
+ __u32 type_id;
+ __u32 flags;
+};
+
+/*
+ * Local definitions of the BTF_F_* constants (the vmlinux.h ones are renamed
+ * away above). Each value is a distinct single bit, bits 0 through 3.
+ */
+enum {
+ BTF_F_COMPACT = (1ULL << 0),
+ BTF_F_NONAME = (1ULL << 1),
+ BTF_F_PTR_RAW = (1ULL << 2),
+ BTF_F_ZERO = (1ULL << 3),
+};
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi.h b/tools/testing/selftests/bpf/progs/cg_storage_multi.h
new file mode 100644
index 000000000..a0778fe78
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __PROGS_CG_STORAGE_MULTI_H
+#define __PROGS_CG_STORAGE_MULTI_H
+
+#include <asm/types.h>
+
+/*
+ * Two 32-bit packet counters, one per direction. Used as the value type of
+ * the cgroup storage maps in the programs that include this header.
+ */
+struct cgroup_value {
+ __u32 egress_pkts;
+ __u32 ingress_pkts;
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
new file mode 100644
index 000000000..44ad46b33
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+
+#include "progs/cg_storage_multi.h"
+
+/*
+ * Cgroup storage map: keyed by struct bpf_cgroup_storage_key, holding one
+ * struct cgroup_value per entry.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct cgroup_value);
+} cgroup_storage SEC(".maps");
+
+/* Global counter bumped once per program run (see the program below). */
+__u32 invocations = 0;
+
+/*
+ * Egress hook: bump the egress packet counter in this program's cgroup
+ * storage and the global invocation counter, then return 1.
+ */
+SEC("cgroup_skb/egress")
+int egress(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c
new file mode 100644
index 000000000..a25373002
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+
+#include "progs/cg_storage_multi.h"
+
+/*
+ * Cgroup storage map: keyed by struct bpf_cgroup_storage_key, holding one
+ * struct cgroup_value per entry.
+ * NOTE(review): the file name suggests this key type gives each attach type
+ * its own storage ("isolated") -- confirm against the map documentation.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct cgroup_value);
+} cgroup_storage SEC(".maps");
+
+/* Global counter bumped once per run by every program in this file. */
+__u32 invocations = 0;
+
+/*
+ * First egress hook: bump the egress packet counter in local cgroup storage
+ * and the global invocation counter, then return 1.
+ */
+SEC("cgroup_skb/egress/1")
+int egress1(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
+
+/*
+ * Second egress hook: same accounting as egress1 -- bump the egress packet
+ * counter in local cgroup storage and the global invocation counter, then
+ * return 1.
+ */
+SEC("cgroup_skb/egress/2")
+int egress2(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
+
+/*
+ * Ingress hook: bump the ingress packet counter in local cgroup storage and
+ * the global invocation counter, then return 1.
+ */
+SEC("cgroup_skb/ingress")
+int ingress(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->ingress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c
new file mode 100644
index 000000000..a149f33bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+
+#include "progs/cg_storage_multi.h"
+
+/*
+ * Cgroup storage map keyed by a plain __u64 (not struct
+ * bpf_cgroup_storage_key), holding one struct cgroup_value per entry.
+ * NOTE(review): the file name suggests this key type makes the storage
+ * shared between attach types -- confirm against the map documentation.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, __u64);
+ __type(value, struct cgroup_value);
+} cgroup_storage SEC(".maps");
+
+/* Global counter bumped once per run by every program in this file. */
+__u32 invocations = 0;
+
+/*
+ * First egress hook: bump the egress packet counter in local cgroup storage
+ * and the global invocation counter, then return 1.
+ */
+SEC("cgroup_skb/egress/1")
+int egress1(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
+
+/*
+ * Second egress hook: same accounting as egress1 -- bump the egress packet
+ * counter in local cgroup storage and the global invocation counter, then
+ * return 1.
+ */
+SEC("cgroup_skb/egress/2")
+int egress2(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
+
+/*
+ * Ingress hook: bump the ingress packet counter in local cgroup storage and
+ * the global invocation counter, then return 1.
+ */
+SEC("cgroup_skb/ingress")
+int ingress(struct __sk_buff *skb)
+{
+	struct cgroup_value *counters;
+
+	counters = bpf_get_local_storage(&cgroup_storage, 0);
+	__sync_fetch_and_add(&counters->ingress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c
new file mode 100644
index 000000000..3f757e30d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cgroup_skb_sk_lookup_kern.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+__u16 g_serv_port = 0;
+
+/* Copy the four 32-bit words of IPv6 address @src into @dst[0..3]. */
+static inline void set_ip(__u32 *dst, const struct in6_addr *src)
+{
+	const __be32 *words = src->in6_u.u6_addr32;
+
+	dst[0] = words[0];
+	dst[1] = words[1];
+	dst[2] = words[2];
+	dst[3] = words[3];
+}
+
+/*
+ * Fill the IPv6 part of @tuple from a received packet's headers with source
+ * and destination swapped: the packet's destination address/port become the
+ * tuple's source, and the packet's source becomes the tuple's destination.
+ */
+static inline void set_tuple(struct bpf_sock_tuple *tuple,
+			     const struct ipv6hdr *ip6h,
+			     const struct tcphdr *tcph)
+{
+	/* Ports first, then addresses; the four stores are independent. */
+	tuple->ipv6.sport = tcph->dest;
+	tuple->ipv6.dport = tcph->source;
+	set_ip(tuple->ipv6.saddr, &ip6h->daddr);
+	set_ip(tuple->ipv6.daddr, &ip6h->saddr);
+}
+
+/*
+ * Look up the TCP socket of the packet's sender in the current netns and
+ * compare cgroup ids between the skb and that socket.
+ *
+ * Returns 1 only when both the direct cgroup id and the level-2 ancestor
+ * cgroup id are non-zero and equal on both sides; returns 0 otherwise,
+ * including when no peer socket is found.
+ */
+static inline int is_allowed_peer_cg(struct __sk_buff *skb,
+				     const struct ipv6hdr *ip6h,
+				     const struct tcphdr *tcph)
+{
+	__u64 own_cg, own_anc, peer_cg, peer_anc;
+	struct bpf_sock_tuple peer;
+	struct bpf_sock *sk;
+
+	set_tuple(&peer, ip6h, tcph);
+
+	sk = bpf_sk_lookup_tcp(skb, &peer, sizeof(peer.ipv6),
+			       BPF_F_CURRENT_NETNS, 0);
+	if (!sk)
+		return 0;
+
+	own_cg = bpf_skb_cgroup_id(skb);
+	peer_cg = bpf_sk_cgroup_id(sk);
+
+	own_anc = bpf_skb_ancestor_cgroup_id(skb, 2);
+	peer_anc = bpf_sk_ancestor_cgroup_id(sk, 2);
+
+	/* All ids are captured; the socket reference can be dropped now. */
+	bpf_sk_release(sk);
+
+	if (!own_cg || own_cg != peer_cg)
+		return 0;
+	if (!own_anc || own_anc != peer_anc)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Ingress filter for IPv6/TCP packets addressed to g_serv_port.
+ *
+ * Returns 1 for anything this program does not police: non-IPv6 packets,
+ * header load failures, non-TCP next header, or a destination port other
+ * than g_serv_port. Returns 0 while g_serv_port is still unset. For packets
+ * to g_serv_port the result is whatever is_allowed_peer_cg() decides.
+ *
+ * NOTE(review): tcph.dest is compared to g_serv_port without conversion, so
+ * g_serv_port is presumably stored in network byte order -- confirm with the
+ * user-space side that sets it.
+ */
+SEC("cgroup_skb/ingress")
+int ingress_lookup(struct __sk_buff *skb)
+{
+	struct ipv6hdr ip6h;
+	struct tcphdr tcph;
+
+	if (skb->protocol != bpf_htons(ETH_P_IPV6))
+		return 1;
+
+	/* For SYN packets coming to listening socket skb->remote_port will be
+	 * zero, so IPv6/TCP headers are loaded to identify remote peer
+	 * instead.
+	 */
+	if (bpf_skb_load_bytes(skb, 0, &ip6h, sizeof(ip6h)))
+		return 1;
+
+	if (ip6h.nexthdr != IPPROTO_TCP)
+		return 1;
+
+	/* The TCP header is read from right after the fixed IPv6 header. */
+	if (bpf_skb_load_bytes(skb, sizeof(ip6h), &tcph, sizeof(tcph)))
+		return 1;
+
+	if (!g_serv_port)
+		return 0;
+
+	if (tcph.dest != g_serv_port)
+		return 1;
+
+	return is_allowed_peer_cg(skb, &ip6h, &tcph);
+}
diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c
new file mode 100644
index 000000000..38ab1ce32
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/connect4_prog.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <linux/if.h>
+#include <errno.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define SRC_REWRITE_IP4 0x7f000004U
+#define DST_REWRITE_IP4 0x7f000001U
+#define DST_REWRITE_PORT4 4444
+
+#ifndef TCP_CA_NAME_MAX
+#define TCP_CA_NAME_MAX 16
+#endif
+
+#ifndef TCP_NOTSENT_LOWAT
+#define TCP_NOTSENT_LOWAT 25
+#endif
+
+#ifndef IFNAMSIZ
+#define IFNAMSIZ 16
+#endif
+
+int _version SEC("version") = 1;
+
+/*
+ * Bind the connecting socket to SRC_REWRITE_IP4 with port 0.
+ * Returns 1 when bpf_bind() succeeds and 0 when it fails.
+ */
+__attribute__ ((noinline)) __weak
+int do_bind(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in src = {};
+
+	src.sin_family = AF_INET;
+	src.sin_port = bpf_htons(0);
+	src.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4);
+
+	return bpf_bind(ctx, (struct sockaddr *)&src, sizeof(src)) == 0;
+}
+
+/*
+ * Read the socket's current TCP congestion-control name with
+ * bpf_getsockopt() and compare it to @expected byte by byte, stopping at the
+ * first NUL.
+ *
+ * Returns 0 when the names match, 1 on the first differing byte or when the
+ * getsockopt call fails.
+ */
+static __inline int verify_cc(struct bpf_sock_addr *ctx,
+ char expected[TCP_CA_NAME_MAX])
+{
+ char buf[TCP_CA_NAME_MAX];
+ int i;
+
+ if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf)))
+ return 1;
+
+ /* Bounded by TCP_CA_NAME_MAX; ends early at the terminating NUL. */
+ for (i = 0; i < TCP_CA_NAME_MAX; i++) {
+ if (buf[i] != expected[i])
+ return 1;
+ if (buf[i] == 0)
+ break;
+ }
+
+ return 0;
+}
+
+/*
+ * Switch the socket's congestion control to "reno" and then to "cubic",
+ * reading the value back with verify_cc() after each change.
+ * Returns 0 when every step succeeds, 1 on the first failure.
+ */
+static __inline int set_cc(struct bpf_sock_addr *ctx)
+{
+	char cubic[TCP_CA_NAME_MAX] = "cubic";
+	char reno[TCP_CA_NAME_MAX] = "reno";
+
+	/* Each pair short-circuits: verify only runs if the set succeeded. */
+	if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &reno, sizeof(reno)) ||
+	    verify_cc(ctx, reno))
+		return 1;
+
+	if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic)) ||
+	    verify_cc(ctx, cubic))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Exercise SO_BINDTODEVICE: bind to "test_sock_addr1", rebind to
+ * "test_sock_addr2", check that a nonexistent device name yields -ENODEV,
+ * and finally pass an empty name.
+ * Returns 0 when every step behaves as expected, 1 otherwise.
+ */
+static __inline int bind_to_device(struct bpf_sock_addr *ctx)
+{
+	char veth1[IFNAMSIZ] = "test_sock_addr1";
+	char veth2[IFNAMSIZ] = "test_sock_addr2";
+	char missing[IFNAMSIZ] = "nonexistent_dev";
+	char del_bind[IFNAMSIZ] = "";
+
+	/* Both real devices must be accepted, in this order. */
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   &veth1, sizeof(veth1)) ||
+	    bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   &veth2, sizeof(veth2)))
+		return 1;
+
+	/* An unknown device must be rejected with exactly -ENODEV. */
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			   &missing, sizeof(missing)) != -ENODEV)
+		return 1;
+
+	return bpf_setsockopt(ctx, SOL_SOCKET, SO_BINDTODEVICE,
+			      &del_bind, sizeof(del_bind)) ? 1 : 0;
+}
+
+/*
+ * Turn SO_KEEPALIVE on, and for SOCK_STREAM sockets also set TCP_KEEPIDLE,
+ * TCP_KEEPINTVL, TCP_KEEPCNT, TCP_SYNCNT and TCP_USER_TIMEOUT to 1, then
+ * turn SO_KEEPALIVE off again.
+ * Returns 0 when every setsockopt succeeds, 1 on the first failure.
+ */
+static __inline int set_keepalive(struct bpf_sock_addr *ctx)
+{
+	int off = 0, on = 1;
+
+	if (bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)))
+		return 1;
+
+	/* TCP-only knobs; evaluation stops at the first one that fails. */
+	if (ctx->type == SOCK_STREAM &&
+	    (bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPIDLE, &on, sizeof(on)) ||
+	     bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPINTVL, &on, sizeof(on)) ||
+	     bpf_setsockopt(ctx, SOL_TCP, TCP_KEEPCNT, &on, sizeof(on)) ||
+	     bpf_setsockopt(ctx, SOL_TCP, TCP_SYNCNT, &on, sizeof(on)) ||
+	     bpf_setsockopt(ctx, SOL_TCP, TCP_USER_TIMEOUT, &on, sizeof(on))))
+		return 1;
+
+	return bpf_setsockopt(ctx, SOL_SOCKET, SO_KEEPALIVE,
+			      &off, sizeof(off)) ? 1 : 0;
+}
+
+/*
+ * For SOCK_STREAM sockets set TCP_NOTSENT_LOWAT to 65535; other socket types
+ * are left alone. Returns 0 on success (or nothing to do), 1 on failure.
+ */
+static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx)
+{
+	int lowat = 65535;
+
+	if (ctx->type != SOCK_STREAM)
+		return 0;
+
+	return bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT,
+			      &lowat, sizeof(lowat)) ? 1 : 0;
+}
+
+/*
+ * connect4 hook. In order:
+ *  1. exercise SO_BINDTODEVICE, keepalive and TCP_NOTSENT_LOWAT options;
+ *  2. look up a TCP or UDP socket (by ctx->type) at the rewrite destination
+ *     and check its source address and port match that destination;
+ *  3. for stream sockets, cycle the congestion control via set_cc();
+ *  4. rewrite the destination to DST_REWRITE_IP4:DST_REWRITE_PORT4 and bind
+ *     the source through do_bind().
+ * Returns 1 when everything succeeds and 0 on any failure.
+ */
+SEC("cgroup/connect4")
+int connect_v4_prog(struct bpf_sock_addr *ctx)
+{
+	struct bpf_sock_tuple dst = {};
+	struct bpf_sock *found;
+	int matches;
+
+	/* Tuple describing the new destination; source side stays zero. */
+	memset(&dst.ipv4.saddr, 0, sizeof(dst.ipv4.saddr));
+	memset(&dst.ipv4.sport, 0, sizeof(dst.ipv4.sport));
+
+	dst.ipv4.daddr = bpf_htonl(DST_REWRITE_IP4);
+	dst.ipv4.dport = bpf_htons(DST_REWRITE_PORT4);
+
+	/* Bind to device and unbind it, then the other socket options. */
+	if (bind_to_device(ctx) || set_keepalive(ctx) ||
+	    set_notsent_lowat(ctx))
+		return 0;
+
+	/* Verify that new destination is available. */
+	if (ctx->type == SOCK_STREAM)
+		found = bpf_sk_lookup_tcp(ctx, &dst, sizeof(dst.ipv4),
+					  BPF_F_CURRENT_NETNS, 0);
+	else if (ctx->type == SOCK_DGRAM)
+		found = bpf_sk_lookup_udp(ctx, &dst, sizeof(dst.ipv4),
+					  BPF_F_CURRENT_NETNS, 0);
+	else
+		return 0;
+
+	if (!found)
+		return 0;
+
+	/* Read what we need, then drop the reference on every path. */
+	matches = found->src_ip4 == dst.ipv4.daddr &&
+		  found->src_port == DST_REWRITE_PORT4;
+	bpf_sk_release(found);
+	if (!matches)
+		return 0;
+
+	/* Rewrite congestion control. */
+	if (ctx->type == SOCK_STREAM && set_cc(ctx))
+		return 0;
+
+	/* Rewrite destination. */
+	ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
+	ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
+
+	return do_bind(ctx) ? 1 : 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/connect6_prog.c b/tools/testing/selftests/bpf/progs/connect6_prog.c
new file mode 100644
index 000000000..506d0f81a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/connect6_prog.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define SRC_REWRITE_IP6_0 0
+#define SRC_REWRITE_IP6_1 0
+#define SRC_REWRITE_IP6_2 0
+#define SRC_REWRITE_IP6_3 6
+
+#define DST_REWRITE_IP6_0 0
+#define DST_REWRITE_IP6_1 0
+#define DST_REWRITE_IP6_2 0
+#define DST_REWRITE_IP6_3 1
+
+#define DST_REWRITE_PORT6 6666
+
+int _version SEC("version") = 1;
+
+/*
+ * connect6 hook. Looks up a TCP or UDP socket (by ctx->type) at the rewrite
+ * destination, checks that its source address and port match that
+ * destination, rewrites the connect destination to
+ * DST_REWRITE_IP6_*:DST_REWRITE_PORT6 and binds the source to
+ * SRC_REWRITE_IP6_* with port 0.
+ * Returns 1 on success and 0 on any failure, including socket types other
+ * than SOCK_STREAM and SOCK_DGRAM.
+ */
+SEC("cgroup/connect6")
+int connect_v6_prog(struct bpf_sock_addr *ctx)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct sockaddr_in6 sa;
+ struct bpf_sock *sk;
+
+ /* Verify that new destination is available. */
+ memset(&tuple.ipv6.saddr, 0, sizeof(tuple.ipv6.saddr));
+ memset(&tuple.ipv6.sport, 0, sizeof(tuple.ipv6.sport));
+
+ tuple.ipv6.daddr[0] = bpf_htonl(DST_REWRITE_IP6_0);
+ tuple.ipv6.daddr[1] = bpf_htonl(DST_REWRITE_IP6_1);
+ tuple.ipv6.daddr[2] = bpf_htonl(DST_REWRITE_IP6_2);
+ tuple.ipv6.daddr[3] = bpf_htonl(DST_REWRITE_IP6_3);
+
+ tuple.ipv6.dport = bpf_htons(DST_REWRITE_PORT6);
+
+ if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM)
+ return 0;
+ else if (ctx->type == SOCK_STREAM)
+ sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof(tuple.ipv6),
+ BPF_F_CURRENT_NETNS, 0);
+ else
+ sk = bpf_sk_lookup_udp(ctx, &tuple, sizeof(tuple.ipv6),
+ BPF_F_CURRENT_NETNS, 0);
+
+ if (!sk)
+ return 0;
+
+ /* The found socket must sit exactly on the rewrite destination. */
+ if (sk->src_ip6[0] != tuple.ipv6.daddr[0] ||
+ sk->src_ip6[1] != tuple.ipv6.daddr[1] ||
+ sk->src_ip6[2] != tuple.ipv6.daddr[2] ||
+ sk->src_ip6[3] != tuple.ipv6.daddr[3] ||
+ sk->src_port != DST_REWRITE_PORT6) {
+ bpf_sk_release(sk);
+ return 0;
+ }
+
+ bpf_sk_release(sk);
+
+ /* Rewrite destination. */
+ ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_0);
+ ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_1);
+ ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
+ ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_3);
+
+ ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+
+ /* Rewrite source. */
+ memset(&sa, 0, sizeof(sa));
+
+ sa.sin6_family = AF_INET6;
+ sa.sin6_port = bpf_htons(0);
+
+ sa.sin6_addr.s6_addr32[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+ sa.sin6_addr.s6_addr32[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+ sa.sin6_addr.s6_addr32[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+ sa.sin6_addr.s6_addr32[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+
+ if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0)
+ return 0;
+
+ return 1;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c
new file mode 100644
index 000000000..739630867
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <stdbool.h>
+
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
+
+/*
+ * IPv4 address and port as taken from ctx->user_ip4 / ctx->user_port; used
+ * by connect4() to remember the destination the caller originally asked for.
+ */
+struct svc_addr {
+ __be32 addr;
+ __be16 port;
+};
+
+/*
+ * Socket-local storage holding one struct svc_addr per socket; written by
+ * connect4() and read back by getpeername4().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
+/*
+ * connect4 hook: always bind the local side to 127.0.0.1:22222. If the
+ * caller is connecting to port 60000, save the original destination in
+ * socket storage and redirect the connection to 127.0.0.1:60123.
+ * Returns 0 when the bind or the storage allocation fails, 1 otherwise.
+ */
+SEC("cgroup/connect4")
+int connect4(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in local = {};
+	struct svc_addr *saved;
+
+	/* Force local address to 127.0.0.1:22222. */
+	local.sin_family = AF_INET;
+	local.sin_port = bpf_htons(22222);
+	local.sin_addr.s_addr = bpf_htonl(0x7f000001);
+
+	if (bpf_bind(ctx, (struct sockaddr *)&local, sizeof(local)) != 0)
+		return 0;
+
+	/* Only the service port is rewired; anything else passes through. */
+	if (ctx->user_port != bpf_htons(60000))
+		return 1;
+
+	/* Rewire service 1.2.3.4:60000 to backend 127.0.0.1:60123. */
+	saved = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+				   BPF_SK_STORAGE_GET_F_CREATE);
+	if (!saved)
+		return 0;
+
+	saved->addr = ctx->user_ip4;
+	saved->port = ctx->user_port;
+
+	ctx->user_ip4 = bpf_htonl(0x7f000001);
+	ctx->user_port = bpf_htons(60123);
+
+	return 1;
+}
+
+/*
+ * getsockname4 hook: when the reported port is 60123, report 1.2.3.4:60000
+ * instead. Always returns 1.
+ */
+SEC("cgroup/getsockname4")
+int getsockname4(struct bpf_sock_addr *ctx)
+{
+	if (ctx->user_port != bpf_htons(60123))
+		return 1;
+
+	/* Expose local server as 1.2.3.4:60000 to client. */
+	ctx->user_ip4 = bpf_htonl(0x01020304);
+	ctx->user_port = bpf_htons(60000);
+
+	return 1;
+}
+
+/*
+ * getpeername4 hook: when the reported peer port is 60123 and an original
+ * destination was saved for this socket, report that saved address and port
+ * instead. Always returns 1.
+ */
+SEC("cgroup/getpeername4")
+int getpeername4(struct bpf_sock_addr *ctx)
+{
+	struct svc_addr *saved;
+
+	if (ctx->user_port != bpf_htons(60123))
+		return 1;
+
+	/* Expose service 1.2.3.4:60000 as peer instead of backend. */
+	saved = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+	if (!saved)
+		return 1;
+
+	ctx->user_ip4 = saved->addr;
+	ctx->user_port = saved->port;
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c
new file mode 100644
index 000000000..c1a2b555e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+char _license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
+
+/*
+ * IPv6 address (four 32-bit words) and port as taken from ctx->user_ip6 /
+ * ctx->user_port; used by connect6() to remember the destination the caller
+ * originally asked for.
+ */
+struct svc_addr {
+ __be32 addr[4];
+ __be16 port;
+};
+
+/*
+ * Socket-local storage holding one struct svc_addr per socket; written by
+ * connect6() and read back by getpeername6().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct svc_addr);
+} service_mapping SEC(".maps");
+
+/*
+ * connect6 hook: always bind the local side to [::1]:22223. If the caller is
+ * connecting to port 60000, save the original destination in socket storage
+ * and redirect the connection to [::1]:60124.
+ * Returns 0 when the bind or the storage allocation fails, 1 otherwise.
+ */
+SEC("cgroup/connect6")
+int connect6(struct bpf_sock_addr *ctx)
+{
+	struct sockaddr_in6 local = {};
+	struct svc_addr *saved;
+
+	/* Force local address to [::1]:22223. */
+	local.sin6_family = AF_INET6;
+	local.sin6_port = bpf_htons(22223);
+	local.sin6_addr.s6_addr32[3] = bpf_htonl(1);
+
+	if (bpf_bind(ctx, (struct sockaddr *)&local, sizeof(local)) != 0)
+		return 0;
+
+	/* Only the service port is rewired; anything else passes through. */
+	if (ctx->user_port != bpf_htons(60000))
+		return 1;
+
+	/* Rewire service [fc00::1]:60000 to backend [::1]:60124. */
+	saved = bpf_sk_storage_get(&service_mapping, ctx->sk, 0,
+				   BPF_SK_STORAGE_GET_F_CREATE);
+	if (!saved)
+		return 0;
+
+	saved->addr[0] = ctx->user_ip6[0];
+	saved->addr[1] = ctx->user_ip6[1];
+	saved->addr[2] = ctx->user_ip6[2];
+	saved->addr[3] = ctx->user_ip6[3];
+	saved->port = ctx->user_port;
+
+	ctx->user_ip6[0] = 0;
+	ctx->user_ip6[1] = 0;
+	ctx->user_ip6[2] = 0;
+	ctx->user_ip6[3] = bpf_htonl(1);
+	ctx->user_port = bpf_htons(60124);
+
+	return 1;
+}
+
+/*
+ * getsockname6 hook: when the reported port is 60124, report
+ * [fc00::1]:60000 instead. Always returns 1.
+ */
+SEC("cgroup/getsockname6")
+int getsockname6(struct bpf_sock_addr *ctx)
+{
+	if (ctx->user_port != bpf_htons(60124))
+		return 1;
+
+	/* Expose local server as [fc00::1]:60000 to client. */
+	ctx->user_ip6[0] = bpf_htonl(0xfc000000);
+	ctx->user_ip6[1] = 0;
+	ctx->user_ip6[2] = 0;
+	ctx->user_ip6[3] = bpf_htonl(1);
+	ctx->user_port = bpf_htons(60000);
+
+	return 1;
+}
+
+/*
+ * getpeername6 hook: when the reported peer port is 60124 and an original
+ * destination was saved for this socket, report that saved address and port
+ * instead. Always returns 1.
+ */
+SEC("cgroup/getpeername6")
+int getpeername6(struct bpf_sock_addr *ctx)
+{
+	struct svc_addr *saved;
+
+	if (ctx->user_port != bpf_htons(60124))
+		return 1;
+
+	/* Expose service [fc00::1]:60000 as peer instead of backend. */
+	saved = bpf_sk_storage_get(&service_mapping, ctx->sk, 0, 0);
+	if (!saved)
+		return 1;
+
+	ctx->user_ip6[0] = saved->addr[0];
+	ctx->user_ip6[1] = saved->addr[1];
+	ctx->user_ip6[2] = saved->addr[2];
+	ctx->user_ip6[3] = saved->addr[3];
+	ctx->user_port = saved->port;
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/core_reloc_types.h b/tools/testing/selftests/bpf/progs/core_reloc_types.h
new file mode 100644
index 000000000..af58ef9a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/core_reloc_types.h
@@ -0,0 +1,1145 @@
+#include <stdint.h>
+#include <stdbool.h>
+
+void preserce_ptr_sz_fn(long x) {} /* empty body; NOTE(review): "preserce" looks like a typo of "preserve" -- name left as-is because it may be referenced elsewhere */
+
+#define __bpf_aligned __attribute__((aligned(8)))
+
+/*
+ * KERNEL
+ */
+
+struct core_reloc_kernel_output {
+ int valid[10];
+ char comm[sizeof("test_progs")];
+ int comm_len;
+};
+
+/*
+ * FLAVORS
+ */
+struct core_reloc_flavors {
+ int a;
+ int b;
+ int c;
+};
+
+/* this is not a flavor, as it doesn't have triple underscore */
+struct core_reloc_flavors__err_wrong_name {
+ int a;
+ int b;
+ int c;
+};
+
+/*
+ * NESTING
+ */
+/* original set up, used to record relocations in BPF program */
+struct core_reloc_nesting_substruct {
+ int a;
+};
+
+union core_reloc_nesting_subunion {
+ int b;
+};
+
+struct core_reloc_nesting {
+ union {
+ struct core_reloc_nesting_substruct a;
+ } a;
+ struct {
+ union core_reloc_nesting_subunion b;
+ } b;
+};
+
+/* inlined anonymous struct/union instead of named structs in original */
+struct core_reloc_nesting___anon_embed {
+ int __just_for_padding;
+ union {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ union {
+ int b;
+ } b;
+ } b;
+};
+
+/* different mix of nested structs/unions than in original */
+struct core_reloc_nesting___struct_union_mixup {
+ int __a;
+ struct {
+ int __a;
+ union {
+ char __a;
+ int a;
+ } a;
+ } a;
+ int __b;
+ union {
+ int __b;
+ union {
+ char __b;
+ int b;
+ } b;
+ } b;
+};
+
+/* extra anon structs/unions, but still valid a.a.a and b.b.b accessors */
+struct core_reloc_nesting___extra_nesting {
+ int __padding;
+ struct {
+ struct {
+ struct {
+ struct {
+ union {
+ int a;
+ } a;
+ };
+ };
+ } a;
+ int __some_more;
+ struct {
+ union {
+ union {
+ union {
+ struct {
+ int b;
+ };
+ } b;
+ };
+ } b;
+ };
+ };
+};
+
+/* three flavors of same struct with different structure but same layout for
+ * a.a.a and b.b.b, thus successfully resolved and relocatable */
+struct core_reloc_nesting___dup_compat_types {
+ char __just_for_padding;
+ /* 3 more bytes of padding */
+ struct {
+ struct {
+ int a; /* offset 4 */
+ } a;
+ } a;
+ long long __more_padding;
+ struct {
+ struct {
+ int b; /* offset 16 */
+ } b;
+ } b;
+};
+
+struct core_reloc_nesting___dup_compat_types__2 {
+ int __aligned_padding;
+ struct {
+ int __trickier_noop[0];
+ struct {
+ char __some_more_noops[0];
+ int a; /* offset 4 */
+ } a;
+ } a;
+ int __more_padding;
+ struct {
+ struct {
+ struct {
+ int __critical_padding;
+ int b; /* offset 16 */
+ } b;
+ int __does_not_matter;
+ };
+ } b;
+ int __more_irrelevant_stuff;
+};
+
+struct core_reloc_nesting___dup_compat_types__3 {
+ char __correct_padding[4];
+ struct {
+ struct {
+ int a; /* offset 4 */
+ } a;
+ } a;
+ /* 8 byte padding due to next struct's alignment */
+ struct {
+ struct {
+ int b;
+ } b;
+ } b __attribute__((aligned(16)));
+};
+
+/* b.b.b field is missing */
+struct core_reloc_nesting___err_missing_field {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ struct {
+ int x;
+ } b;
+ } b;
+};
+
+/* b.b.b field is an array of integers instead of plain int */
+struct core_reloc_nesting___err_array_field {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ struct {
+ int b[1];
+ } b;
+ } b;
+};
+
+/* middle b container is missing */
+struct core_reloc_nesting___err_missing_container {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ int x;
+ } b;
+};
+
+/* middle b container is referenced through pointer instead of being embedded */
+struct core_reloc_nesting___err_nonstruct_container {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ struct {
+ int b;
+ } *b;
+ } b;
+};
+
+/* middle b container is an array of structs instead of plain struct */
+struct core_reloc_nesting___err_array_container {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ struct {
+ struct {
+ int b;
+ } b[1];
+ } b;
+};
+
+/* two flavors of same struct with incompatible layout for b.b.b */
+struct core_reloc_nesting___err_dup_incompat_types__1 {
+ struct {
+ struct {
+ int a; /* offset 0 */
+ } a;
+ } a;
+ struct {
+ struct {
+ int b; /* offset 4 */
+ } b;
+ } b;
+};
+
+struct core_reloc_nesting___err_dup_incompat_types__2 {
+ struct {
+ struct {
+ int a; /* offset 0 */
+ } a;
+ } a;
+ int __extra_padding;
+ struct {
+ struct {
+ int b; /* offset 8 (!) */
+ } b;
+ } b;
+};
+
+/* two flavors of same struct having one of a.a.a and b.b.b, but not both */
+struct core_reloc_nesting___err_partial_match_dups__a {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+};
+
+struct core_reloc_nesting___err_partial_match_dups__b {
+ struct {
+ struct {
+ int b;
+ } b;
+ } b;
+};
+
+struct core_reloc_nesting___err_too_deep {
+ struct {
+ struct {
+ int a;
+ } a;
+ } a;
+ /* 65 levels of nestedness for b.b.b */
+ struct {
+ struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ struct { struct { struct { struct { struct {
+ /* this one is one too much */
+ struct {
+ int b;
+ };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ }; }; }; }; };
+ } b;
+ } b;
+};
+
+/*
+ * ARRAYS
+ */
+struct core_reloc_arrays_output {
+ int a2;
+ char b123;
+ int c1c;
+ int d00d;
+ int f10c;
+};
+
+struct core_reloc_arrays_substruct {
+ int c;
+ int d;
+};
+
+struct core_reloc_arrays {
+ int a[5];
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+/* bigger array dimensions */
+struct core_reloc_arrays___diff_arr_dim {
+ int a[7];
+ char b[3][4][5];
+ struct core_reloc_arrays_substruct c[4];
+ struct core_reloc_arrays_substruct d[2][3];
+ struct core_reloc_arrays_substruct f[1][3];
+};
+
+/* different size of array's value (struct) */
+struct core_reloc_arrays___diff_arr_val_sz {
+ int a[5];
+ char b[2][3][4];
+ struct {
+ int __padding1;
+ int c;
+ int __padding2;
+ } c[3];
+ struct {
+ int __padding1;
+ int d;
+ int __padding2;
+ } d[1][2];
+ struct {
+ int __padding1;
+ int c;
+ int __padding2;
+ } f[][2];
+};
+
+struct core_reloc_arrays___equiv_zero_sz_arr {
+ int a[5];
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ /* equivalent to flexible array; NOTE(review): the struct name says "zero_sz_arr" but f is declared f[][2], not f[0][2] -- confirm which was intended */
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+struct core_reloc_arrays___fixed_arr {
+ int a[5];
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ /* not a flexible array anymore, but within access bounds */
+ struct core_reloc_arrays_substruct f[1][2];
+};
+
+struct core_reloc_arrays___err_too_small {
+ int a[2]; /* this one is too small */
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+struct core_reloc_arrays___err_too_shallow {
+ int a[5];
+ char b[2][3]; /* this one lacks one dimension */
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+struct core_reloc_arrays___err_non_array {
+ int a; /* not an array */
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+struct core_reloc_arrays___err_wrong_val_type {
+ int a[5];
+ char b[2][3][4];
+ int c[3]; /* value is not a struct */
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+struct core_reloc_arrays___err_bad_zero_sz_arr {
+ /* zero-sized array, but not at the end */
+ struct core_reloc_arrays_substruct f[0][2];
+ int a[5];
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+};
+
+/*
+ * PRIMITIVES
+ */
+enum core_reloc_primitives_enum {
+ A = 0,
+ B = 1,
+};
+
+struct core_reloc_primitives {
+ char a;
+ int b;
+ enum core_reloc_primitives_enum c;
+ void *d __bpf_aligned;
+ int (*f)(const char *) __bpf_aligned;
+};
+
+struct core_reloc_primitives___diff_enum_def {
+ char a;
+ int b;
+ void *d __bpf_aligned;
+ int (*f)(const char *) __bpf_aligned;
+ enum {
+ X = 100,
+ Y = 200,
+ } c __bpf_aligned; /* inline enum def with differing set of values */
+};
+
+struct core_reloc_primitives___diff_func_proto {
+ void (*f)(int) __bpf_aligned; /* incompatible function prototype */
+ void *d __bpf_aligned;
+ enum core_reloc_primitives_enum c __bpf_aligned;
+ int b;
+ char a;
+};
+
+struct core_reloc_primitives___diff_ptr_type {
+ const char * const d __bpf_aligned; /* different pointee type + modifiers */
+ char a __bpf_aligned;
+ int b;
+ enum core_reloc_primitives_enum c;
+ int (*f)(const char *) __bpf_aligned;
+};
+
+struct core_reloc_primitives___err_non_enum {
+ char a[1];
+ int b;
+ int c; /* int instead of enum */
+ void *d __bpf_aligned;
+ int (*f)(const char *) __bpf_aligned;
+};
+
+struct core_reloc_primitives___err_non_int {
+ char a[1];
+ int *b __bpf_aligned; /* ptr instead of int */
+ enum core_reloc_primitives_enum c __bpf_aligned;
+ void *d __bpf_aligned;
+ int (*f)(const char *) __bpf_aligned;
+};
+
+struct core_reloc_primitives___err_non_ptr {
+ char a[1];
+ int b;
+ enum core_reloc_primitives_enum c;
+ int d; /* int instead of ptr */
+ int (*f)(const char *) __bpf_aligned;
+};
+
+/*
+ * MODS
+ */
+struct core_reloc_mods_output {
+ int a, b, c, d, e, f, g, h;
+};
+
+typedef const int int_t;
+typedef const char *char_ptr_t __bpf_aligned;
+typedef const int arr_t[7];
+
+struct core_reloc_mods_substruct {
+ int x;
+ int y;
+};
+
+typedef struct {
+ int x;
+ int y;
+} core_reloc_mods_substruct_t;
+
+struct core_reloc_mods {
+ int a;
+ int_t b;
+ char *c __bpf_aligned;
+ char_ptr_t d;
+ int e[3] __bpf_aligned;
+ arr_t f;
+ struct core_reloc_mods_substruct g;
+ core_reloc_mods_substruct_t h;
+};
+
+/* a/b, c/d, e/f, and g/h pairs are swapped */
+struct core_reloc_mods___mod_swap {
+ int b;
+ int_t a;
+ char *d __bpf_aligned;
+ char_ptr_t c;
+ int f[3] __bpf_aligned;
+ arr_t e;
+ struct {
+ int y;
+ int x;
+ } h;
+ core_reloc_mods_substruct_t g;
+};
+
+typedef int int1_t;
+typedef int1_t int2_t;
+typedef int2_t int3_t;
+
+typedef int arr1_t[5];
+typedef arr1_t arr2_t;
+typedef arr2_t arr3_t;
+typedef arr3_t arr4_t;
+
+typedef const char * const volatile fancy_char_ptr_t __bpf_aligned;
+
+typedef core_reloc_mods_substruct_t core_reloc_mods_substruct_tt;
+
+/* we need more typedefs */
+struct core_reloc_mods___typedefs {
+ core_reloc_mods_substruct_tt g;
+ core_reloc_mods_substruct_tt h;
+ arr4_t f;
+ arr4_t e;
+ fancy_char_ptr_t d;
+ fancy_char_ptr_t c;
+ int3_t b __bpf_aligned;
+ int3_t a;
+};
+
+/*
+ * PTR_AS_ARR
+ */
+struct core_reloc_ptr_as_arr {
+ int a;
+};
+
+struct core_reloc_ptr_as_arr___diff_sz {
+ int :32; /* padding */
+ char __some_more_padding;
+ int a;
+};
+
+/*
+ * INTS
+ */
+struct core_reloc_ints {
+ uint8_t u8_field;
+ int8_t s8_field;
+ uint16_t u16_field;
+ int16_t s16_field;
+ uint32_t u32_field;
+ int32_t s32_field;
+ uint64_t u64_field;
+ int64_t s64_field;
+};
+
+/* signed/unsigned types swap */
+struct core_reloc_ints___reverse_sign {
+ int8_t u8_field;
+ uint8_t s8_field;
+ int16_t u16_field;
+ uint16_t s16_field;
+ int32_t u32_field;
+ uint32_t s32_field;
+ int64_t u64_field;
+ uint64_t s64_field;
+};
+
+struct core_reloc_ints___bool {
+ bool u8_field; /* bool instead of uint8 */
+ int8_t s8_field;
+ uint16_t u16_field;
+ int16_t s16_field;
+ uint32_t u32_field;
+ int32_t s32_field;
+ uint64_t u64_field;
+ int64_t s64_field;
+};
+
+/*
+ * MISC
+ */
+struct core_reloc_misc_output {
+ int a, b, c;
+};
+
+struct core_reloc_misc___a {
+ int a1;
+ int a2;
+};
+
+struct core_reloc_misc___b {
+ int b1;
+ int b2;
+};
+
+/* this one extends core_reloc_misc_extensible struct from BPF prog */
+struct core_reloc_misc_extensible {
+ int a;
+ int b;
+ int c;
+ int d;
+};
+
+/*
+ * FIELD EXISTENCE
+ */
+struct core_reloc_existence_output {
+ int a_exists;
+ int a_value;
+ int b_exists;
+ int b_value;
+ int c_exists;
+ int c_value;
+ int arr_exists;
+ int arr_value;
+ int s_exists;
+ int s_value;
+};
+
+struct core_reloc_existence {
+ int a;
+ struct {
+ int b;
+ };
+ int c;
+ int arr[1];
+ struct {
+ int x;
+ } s;
+};
+
+struct core_reloc_existence___minimal {
+ int a;
+};
+
+struct core_reloc_existence___wrong_field_defs {
+ void *a;
+ int b[1];
+ struct{ int x; } c;
+ int arr;
+ int s;
+};
+
+/*
+ * BITFIELDS
+ */
+/* bitfield read results, all as plain integers */
+struct core_reloc_bitfields_output {
+ int64_t ub1;
+ int64_t ub2;
+ int64_t ub7;
+ int64_t sb4;
+ int64_t sb20;
+ int64_t u32;
+ int64_t s32;
+};
+
+struct core_reloc_bitfields {
+ /* unsigned bitfields */
+ uint8_t ub1: 1;
+ uint8_t ub2: 2;
+ uint32_t ub7: 7;
+ /* signed bitfields */
+ int8_t sb4: 4;
+ int32_t sb20: 20;
+ /* non-bitfields */
+ uint32_t u32;
+ int32_t s32;
+};
+
+/* different bit sizes (both up and down) */
+struct core_reloc_bitfields___bit_sz_change {
+ /* unsigned bitfields */
+ uint16_t ub1: 3; /* 1 -> 3 */
+ uint32_t ub2: 20; /* 2 -> 20 */
+ uint8_t ub7: 1; /* 7 -> 1 */
+ /* signed bitfields */
+ int8_t sb4: 1; /* 4 -> 1 */
+ int32_t sb20: 30; /* 20 -> 30 */
+ /* non-bitfields */
+ uint16_t u32; /* 32 -> 16 */
+ int64_t s32 __bpf_aligned; /* 32 -> 64 */
+};
+
+/* turn bitfield into non-bitfield and vice versa */
+struct core_reloc_bitfields___bitfield_vs_int {
+ uint64_t ub1; /* 1 -> 64 non-bitfield */
+ uint8_t ub2; /* 2 -> 8 non-bitfield */
+ int64_t ub7 __bpf_aligned; /* 7 -> 64 non-bitfield signed */
+ int64_t sb4 __bpf_aligned; /* 4 -> 64 non-bitfield signed */
+ uint64_t sb20 __bpf_aligned; /* 20 -> 64 non-bitfield unsigned */
+ int32_t u32: 20; /* 32 non-bitfield -> 20 bitfield */
+ uint64_t s32: 60 __bpf_aligned; /* 32 non-bitfield -> 60 bitfield */
+};
+
+struct core_reloc_bitfields___just_big_enough {
+ uint64_t ub1: 4;
+ uint64_t ub2: 60; /* packed tightly */
+ uint32_t ub7;
+ uint32_t sb4;
+ uint32_t sb20;
+ uint32_t u32;
+ uint32_t s32;
+} __attribute__((packed)) ;
+
+struct core_reloc_bitfields___err_too_big_bitfield {
+ uint64_t ub1: 4;
+ uint64_t ub2: 61; /* packed tightly: ub1 + ub2 span 4 + 61 = 65 bits */
+ uint32_t ub7;
+ uint32_t sb4;
+ uint32_t sb20;
+ uint32_t u32;
+ uint32_t s32;
+} __attribute__((packed)) ;
+
+/*
+ * SIZE
+ */
+struct core_reloc_size_output {
+ int int_sz;
+ int struct_sz;
+ int union_sz;
+ int arr_sz;
+ int arr_elem_sz;
+ int ptr_sz;
+ int enum_sz;
+};
+
+struct core_reloc_size {
+ int int_field;
+ struct { int x; } struct_field;
+ union { int x; } union_field;
+ int arr_field[4];
+ void *ptr_field;
+ enum { VALUE = 123 } enum_field;
+};
+
+struct core_reloc_size___diff_sz {
+ uint64_t int_field;
+ struct { int x; int y; int z; } struct_field;
+ union { int x; char bla[123]; } union_field;
+ char arr_field[10];
+ void *ptr_field;
+ enum { OTHER_VALUE = 0xFFFFFFFFFFFFFFFF } enum_field;
+};
+
+/* Error case of two candidates with the fields (int_field) at the same
+ * offset, but with differing final relocation values: size 4 vs size 1
+ */
+struct core_reloc_size___err_ambiguous1 {
+ /* int at offset 0 */
+ int int_field;
+
+ struct { int x; } struct_field;
+ union { int x; } union_field;
+ int arr_field[4];
+ void *ptr_field;
+ enum { VALUE___1 = 123 } enum_field;
+};
+
+struct core_reloc_size___err_ambiguous2 {
+ /* char at offset 0 */
+ char int_field;
+
+ struct { int x; } struct_field;
+ union { int x; } union_field;
+ int arr_field[4];
+ void *ptr_field;
+ enum { VALUE___2 = 123 } enum_field;
+};
+
+/*
+ * TYPE EXISTENCE & SIZE
+ */
+struct core_reloc_type_based_output {
+ bool struct_exists;
+ bool union_exists;
+ bool enum_exists;
+ bool typedef_named_struct_exists;
+ bool typedef_anon_struct_exists;
+ bool typedef_struct_ptr_exists;
+ bool typedef_int_exists;
+ bool typedef_enum_exists;
+ bool typedef_void_ptr_exists;
+ bool typedef_func_proto_exists;
+ bool typedef_arr_exists;
+
+ int struct_sz;
+ int union_sz;
+ int enum_sz;
+ int typedef_named_struct_sz;
+ int typedef_anon_struct_sz;
+ int typedef_struct_ptr_sz;
+ int typedef_int_sz;
+ int typedef_enum_sz;
+ int typedef_void_ptr_sz;
+ int typedef_func_proto_sz;
+ int typedef_arr_sz;
+};
+
+struct a_struct {
+ int x;
+};
+
+union a_union {
+ int y;
+ int z;
+};
+
+typedef struct a_struct named_struct_typedef;
+
+typedef struct { int x, y, z; } anon_struct_typedef;
+
+typedef struct {
+ int a, b, c;
+} *struct_ptr_typedef;
+
+enum an_enum {
+ AN_ENUM_VAL1 = 1,
+ AN_ENUM_VAL2 = 2,
+ AN_ENUM_VAL3 = 3,
+};
+
+typedef int int_typedef;
+
+typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef;
+
+typedef void *void_ptr_typedef;
+
+typedef int (*func_proto_typedef)(long);
+
+typedef char arr_typedef[20];
+
+struct core_reloc_type_based {
+ struct a_struct f1;
+ union a_union f2;
+ enum an_enum f3;
+ named_struct_typedef f4;
+ anon_struct_typedef f5;
+ struct_ptr_typedef f6;
+ int_typedef f7;
+ enum_typedef f8;
+ void_ptr_typedef f9;
+ func_proto_typedef f10;
+ arr_typedef f11;
+};
+
+/* no types in target */
+struct core_reloc_type_based___all_missing {
+};
+
+/* different type sizes, extra modifiers, anon vs named enums, etc */
+struct a_struct___diff_sz {
+ long x;
+ int y;
+ char z;
+};
+
+union a_union___diff_sz {
+ char yy;
+ char zz;
+};
+
+typedef struct a_struct___diff_sz named_struct_typedef___diff_sz;
+
+typedef struct { long xx, yy, zzz; } anon_struct_typedef___diff_sz;
+
+typedef struct {
+ char aa[1], bb[2], cc[3];
+} *struct_ptr_typedef___diff_sz;
+
+enum an_enum___diff_sz {
+ AN_ENUM_VAL1___diff_sz = 0x123412341234,
+ AN_ENUM_VAL2___diff_sz = 2,
+};
+
+typedef unsigned long int_typedef___diff_sz;
+
+typedef enum an_enum___diff_sz enum_typedef___diff_sz;
+
+typedef const void * const void_ptr_typedef___diff_sz;
+
+typedef int_typedef___diff_sz (*func_proto_typedef___diff_sz)(char);
+
+typedef int arr_typedef___diff_sz[2];
+
+struct core_reloc_type_based___diff_sz {
+ struct a_struct___diff_sz f1;
+ union a_union___diff_sz f2;
+ enum an_enum___diff_sz f3;
+ named_struct_typedef___diff_sz f4;
+ anon_struct_typedef___diff_sz f5;
+ struct_ptr_typedef___diff_sz f6;
+ int_typedef___diff_sz f7;
+ enum_typedef___diff_sz f8;
+ void_ptr_typedef___diff_sz f9;
+ func_proto_typedef___diff_sz f10;
+ arr_typedef___diff_sz f11;
+};
+
+/* incompatibilities between target and local types */
+union a_struct___incompat { /* union instead of struct */
+ int x;
+};
+
+struct a_union___incompat { /* struct instead of union */
+ int y;
+ int z;
+};
+
+/* typedef to union, not to struct */
+typedef union a_struct___incompat named_struct_typedef___incompat;
+
+/* typedef to void pointer, instead of struct */
+typedef void *anon_struct_typedef___incompat;
+
+/* extra pointer indirection */
+typedef struct {
+ int a, b, c;
+} **struct_ptr_typedef___incompat;
+
+/* typedef of a struct with int, instead of int */
+typedef struct { int x; } int_typedef___incompat;
+
+/* typedef to func_proto, instead of enum */
+typedef int (*enum_typedef___incompat)(void);
+
+/* pointer to char instead of void */
+typedef char *void_ptr_typedef___incompat;
+
+/* void return type instead of int */
+typedef void (*func_proto_typedef___incompat)(long);
+
+/* multi-dimensional array instead of a single-dimensional */
+typedef int arr_typedef___incompat[20][2];
+
+struct core_reloc_type_based___incompat {
+ union a_struct___incompat f1;
+ struct a_union___incompat f2;
+ /* the only valid one is enum, to check that something still succeeds */
+ enum an_enum f3;
+ named_struct_typedef___incompat f4;
+ anon_struct_typedef___incompat f5;
+ struct_ptr_typedef___incompat f6;
+ int_typedef___incompat f7;
+ enum_typedef___incompat f8;
+ void_ptr_typedef___incompat f9;
+ func_proto_typedef___incompat f10;
+ arr_typedef___incompat f11;
+};
+
+/* func_proto with incompatible signature */
+typedef void (*func_proto_typedef___fn_wrong_ret1)(long);
+typedef int * (*func_proto_typedef___fn_wrong_ret2)(long);
+typedef struct { int x; } int_struct_typedef;
+typedef int_struct_typedef (*func_proto_typedef___fn_wrong_ret3)(long);
+typedef int (*func_proto_typedef___fn_wrong_arg)(void *);
+typedef int (*func_proto_typedef___fn_wrong_arg_cnt1)(long, long);
+typedef int (*func_proto_typedef___fn_wrong_arg_cnt2)(void);
+
+struct core_reloc_type_based___fn_wrong_args {
+ /* one valid type to make sure relos still work */
+ struct a_struct f1;
+ func_proto_typedef___fn_wrong_ret1 f2;
+ func_proto_typedef___fn_wrong_ret2 f3;
+ func_proto_typedef___fn_wrong_ret3 f4;
+ func_proto_typedef___fn_wrong_arg f5;
+ func_proto_typedef___fn_wrong_arg_cnt1 f6;
+ func_proto_typedef___fn_wrong_arg_cnt2 f7;
+};
+
+/*
+ * TYPE ID MAPPING (LOCAL AND TARGET)
+ */
+struct core_reloc_type_id_output {
+ int local_anon_struct;
+ int local_anon_union;
+ int local_anon_enum;
+ int local_anon_func_proto_ptr;
+ int local_anon_void_ptr;
+ int local_anon_arr;
+
+ int local_struct;
+ int local_union;
+ int local_enum;
+ int local_int;
+ int local_struct_typedef;
+ int local_func_proto_typedef;
+ int local_arr_typedef;
+
+ int targ_struct;
+ int targ_union;
+ int targ_enum;
+ int targ_int;
+ int targ_struct_typedef;
+ int targ_func_proto_typedef;
+ int targ_arr_typedef;
+};
+
+struct core_reloc_type_id {
+ struct a_struct f1;
+ union a_union f2;
+ enum an_enum f3;
+ named_struct_typedef f4;
+ func_proto_typedef f5;
+ arr_typedef f6;
+};
+
+struct core_reloc_type_id___missing_targets {
+ /* nothing */
+};
+
+/*
+ * ENUMERATOR VALUE EXISTENCE AND VALUE RELOCATION
+ */
+struct core_reloc_enumval_output {
+ bool named_val1_exists;
+ bool named_val2_exists;
+ bool named_val3_exists;
+ bool anon_val1_exists;
+ bool anon_val2_exists;
+ bool anon_val3_exists;
+
+ int named_val1;
+ int named_val2;
+ int anon_val1;
+ int anon_val2;
+};
+
+enum named_enum {
+ NAMED_ENUM_VAL1 = 1,
+ NAMED_ENUM_VAL2 = 2,
+ NAMED_ENUM_VAL3 = 3,
+};
+
+typedef enum {
+ ANON_ENUM_VAL1 = 0x10,
+ ANON_ENUM_VAL2 = 0x20,
+ ANON_ENUM_VAL3 = 0x30,
+} anon_enum;
+
+struct core_reloc_enumval {
+ enum named_enum f1;
+ anon_enum f2;
+};
+
+/* differing enumerator values */
+enum named_enum___diff {
+ NAMED_ENUM_VAL1___diff = 101,
+ NAMED_ENUM_VAL2___diff = 202,
+ NAMED_ENUM_VAL3___diff = 303,
+};
+
+typedef enum {
+ ANON_ENUM_VAL1___diff = 0x11,
+ ANON_ENUM_VAL2___diff = 0x22,
+ ANON_ENUM_VAL3___diff = 0x33,
+} anon_enum___diff;
+
+struct core_reloc_enumval___diff {
+ enum named_enum___diff f1;
+ anon_enum___diff f2;
+};
+
+/* missing (optional) third enum value */
+enum named_enum___val3_missing {
+ NAMED_ENUM_VAL1___val3_missing = 111,
+ NAMED_ENUM_VAL2___val3_missing = 222,
+};
+
+typedef enum {
+ ANON_ENUM_VAL1___val3_missing = 0x111,
+ ANON_ENUM_VAL2___val3_missing = 0x222,
+} anon_enum___val3_missing;
+
+struct core_reloc_enumval___val3_missing {
+ enum named_enum___val3_missing f1;
+ anon_enum___val3_missing f2;
+};
+
+/* missing (mandatory) second enum value, should fail */
+enum named_enum___err_missing {
+ NAMED_ENUM_VAL1___err_missing = 1,
+ NAMED_ENUM_VAL3___err_missing = 3,
+};
+
+typedef enum {
+ ANON_ENUM_VAL1___err_missing = 0x111,
+ ANON_ENUM_VAL3___err_missing = 0x222,
+} anon_enum___err_missing;
+
+struct core_reloc_enumval___err_missing {
+ enum named_enum___err_missing f1;
+ anon_enum___err_missing f2;
+};
diff --git a/tools/testing/selftests/bpf/progs/dev_cgroup.c b/tools/testing/selftests/bpf/progs/dev_cgroup.c
new file mode 100644
index 000000000..8924e06bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/dev_cgroup.c
@@ -0,0 +1,60 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+/*
+ * cgroup/dev program: device access filter.
+ *
+ * The low 16 bits of ctx->access_type carry the device type, the high 16
+ * bits the requested access. Returns 1 only for character devices 1:5 and
+ * 1:9; every other request gets 0.
+ *
+ * With DEBUG defined, each request is also logged via bpf_trace_printk() as
+ * "<b|c|?> major:minor  rwm".
+ */
+SEC("cgroup/dev")
+int bpf_prog1(struct bpf_cgroup_dev_ctx *ctx)
+{
+	short type = ctx->access_type & 0xFFFF;
+#ifdef DEBUG
+	short access = ctx->access_type >> 16;
+	/*
+	 * Template layout: [0] device type letter, [2..6] "%d:%d",
+	 * [8] 'r', [9] 'w', [10] 'm', [11] newline. The four blanks after
+	 * the second %d are required: the flag letters below are written
+	 * into indices 8..10, which must lie inside the array.
+	 */
+	char fmt[] = "  %d:%d    \n";
+
+	switch (type) {
+	case BPF_DEVCG_DEV_BLOCK:
+		fmt[0] = 'b';
+		break;
+	case BPF_DEVCG_DEV_CHAR:
+		fmt[0] = 'c';
+		break;
+	default:
+		fmt[0] = '?';
+		break;
+	}
+
+	if (access & BPF_DEVCG_ACC_READ)
+		fmt[8] = 'r';
+
+	if (access & BPF_DEVCG_ACC_WRITE)
+		fmt[9] = 'w';
+
+	if (access & BPF_DEVCG_ACC_MKNOD)
+		fmt[10] = 'm';
+
+	bpf_trace_printk(fmt, sizeof(fmt), ctx->major, ctx->minor);
+#endif
+
+	/* Allow access to /dev/zero and /dev/urandom.
+	 * Forbid everything else.
+	 */
+	if (ctx->major != 1 || type != BPF_DEVCG_DEV_CHAR)
+		return 0;
+
+	switch (ctx->minor) {
+	case 5: /* 1:5 /dev/zero */
+	case 9: /* 1:9 /dev/urandom */
+		return 1;
+	}
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/testing/selftests/bpf/progs/fentry_test.c b/tools/testing/selftests/bpf/progs/fentry_test.c
new file mode 100644
index 000000000..5f645fdab
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fentry_test.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Written by the fentry/bpf_fentry_test1 program: 1 if a == 1, else 0. */
+__u64 test1_result = 0;
+SEC("fentry/bpf_fentry_test1")
+int BPF_PROG(test1, int a)
+{
+	test1_result = (a == 1) ? 1 : 0;
+	return 0;
+}
+
+/* Written by the fentry/bpf_fentry_test2 program: 1 if (a, b) == (2, 3). */
+__u64 test2_result = 0;
+SEC("fentry/bpf_fentry_test2")
+int BPF_PROG(test2, int a, __u64 b)
+{
+	int matched = (a == 2) && (b == 3);
+
+	test2_result = matched;
+	return 0;
+}
+
+/* Written by the fentry/bpf_fentry_test3 program: 1 if (a, b, c) == (4, 5, 6). */
+__u64 test3_result = 0;
+SEC("fentry/bpf_fentry_test3")
+int BPF_PROG(test3, char a, int b, __u64 c)
+{
+	/* Any single mismatch yields 0. */
+	test3_result = !(a != 4 || b != 5 || c != 6);
+	return 0;
+}
+
+/* Written by the fentry/bpf_fentry_test4 program: 1 if the args are 7, 8, 9, 10. */
+__u64 test4_result = 0;
+SEC("fentry/bpf_fentry_test4")
+int BPF_PROG(test4, void *a, char b, int c, __u64 d)
+{
+	int matched = a == (void *)7 && b == 8 && c == 9 && d == 10;
+
+	test4_result = matched ? 1 : 0;
+	return 0;
+}
+
+/* Written by the fentry/bpf_fentry_test5 program: 1 if the args are 11..15. */
+__u64 test5_result = 0;
+SEC("fentry/bpf_fentry_test5")
+int BPF_PROG(test5, __u64 a, void *b, short c, int d, __u64 e)
+{
+	int head_ok = a == 11 && b == (void *)12;
+	int tail_ok = c == 13 && d == 14 && e == 15;
+
+	test5_result = head_ok && tail_ok;
+	return 0;
+}
+
+/* Written by the fentry/bpf_fentry_test6 program: 1 if the args are 16..21. */
+__u64 test6_result = 0;
+SEC("fentry/bpf_fentry_test6")
+int BPF_PROG(test6, __u64 a, void *b, short c, int d, void * e, __u64 f)
+{
+	int scalars_ok = a == 16 && c == 18 && d == 19 && f == 21;
+	int pointers_ok = b == (void *)17 && e == (void *)20;
+
+	test6_result = scalars_ok && pointers_ok;
+	return 0;
+}
+
+struct bpf_fentry_test_t {
+ struct bpf_fentry_test_t *a;
+};
+
+/* Set to 1 by the fentry/bpf_fentry_test7 program when arg is NULL. */
+__u64 test7_result = 0;
+SEC("fentry/bpf_fentry_test7")
+int BPF_PROG(test7, struct bpf_fentry_test_t *arg)
+{
+	/* Left untouched (not reset to 0) when arg is non-NULL. */
+	if (!arg)
+		test7_result = 1;
+	return 0;
+}
+
+/* Set to 1 by the fentry/bpf_fentry_test8 program when arg->a is NULL. */
+__u64 test8_result = 0;
+SEC("fentry/bpf_fentry_test8")
+int BPF_PROG(test8, struct bpf_fentry_test_t *arg)
+{
+	struct bpf_fentry_test_t *next = arg->a;
+
+	/* Left untouched (not reset to 0) when the member is non-NULL. */
+	if (!next)
+		test8_result = 1;
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
new file mode 100644
index 000000000..49a84a3a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/stddef.h>
+#include <linux/if_ether.h>
+#include <linux/ipv6.h>
+#include <linux/bpf.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+
+struct sk_buff {
+ unsigned int len;
+};
+
+/* Set to 1 by the fexit/test_pkt_access program when skb->len == 74 and ret == 0. */
+__u64 test_result = 0;
+SEC("fexit/test_pkt_access")
+int BPF_PROG(test_main, struct sk_buff *skb, int ret)
+{
+	int pkt_len;
+
+	/* Field is read inside __builtin_preserve_access_index(). */
+	__builtin_preserve_access_index(({
+		pkt_len = skb->len;
+	}));
+	if (pkt_len == 74 && ret == 0)
+		test_result = 1;
+	return 0;
+}
+
+/* Set to 1 by the fexit/test_pkt_access_subprog1 program when skb->len == 74 and ret == 148. */
+__u64 test_result_subprog1 = 0;
+SEC("fexit/test_pkt_access_subprog1")
+int BPF_PROG(test_subprog1, struct sk_buff *skb, int ret)
+{
+	int pkt_len;
+
+	/* Field is read inside __builtin_preserve_access_index(). */
+	__builtin_preserve_access_index(({
+		pkt_len = skb->len;
+	}));
+	if (pkt_len == 74 && ret == 148)
+		test_result_subprog1 = 1;
+	return 0;
+}
+
+/* Though test_pkt_access_subprog2() is defined in C as:
+ * static __attribute__ ((noinline))
+ * int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb)
+ * {
+ * return skb->len * val;
+ * }
+ * llvm optimizations remove 'int val' argument and generate BPF assembly:
+ * r0 = *(u32 *)(r1 + 0)
+ * w0 <<= 1
+ * exit
+ * In such a case the verifier falls back to conservative handling, and the
+ * tracing program can access arguments and the return value as u64
+ * instead of their accurate types.
+ */
+/*
+ * Raw context layout used by test_subprog2(): five u64 argument slots
+ * followed by the u64 return value.
+ */
+struct args_subprog2 {
+	__u64 args[5];
+	__u64 ret;
+};
+/* Set to 1 when test_subprog2() sees skb->len == 74 and a 32-bit ret of 148. */
+__u64 test_result_subprog2 = 0;
+SEC("fexit/test_pkt_access_subprog2")
+int test_subprog2(struct args_subprog2 *ctx)
+{
+	struct sk_buff *skb = (void *)ctx->args[0];
+	__u64 ret;
+	int len;
+
+	/* Bail out explicitly if the read fails instead of going on to
+	 * compare whatever len holds afterwards.
+	 */
+	if (bpf_probe_read_kernel(&len, sizeof(len),
+				  __builtin_preserve_access_index(&skb->len)))
+		return 0;
+
+	ret = ctx->ret;
+	/* bpf_prog_load() loads "test_pkt_access.o" with BPF_F_TEST_RND_HI32
+	 * which randomizes upper 32 bits after BPF_ALU32 insns.
+	 * Hence after 'w0 <<= 1' upper bits of $rax are random.
+	 * That is expected and correct. Trim them.
+	 */
+	ret = (__u32) ret;
+	if (len != 74 || ret != 148)
+		return 0;
+	test_result_subprog2 = 1;
+	return 0;
+}
+
+/*
+ * Set to 1 by the fexit/test_pkt_access_subprog3 program when val == 3,
+ * skb->len == 74 and ret == 74 * val.
+ */
+__u64 test_result_subprog3 = 0;
+SEC("fexit/test_pkt_access_subprog3")
+int BPF_PROG(test_subprog3, int val, struct sk_buff *skb, int ret)
+{
+	int pkt_len;
+
+	/* Field is read inside __builtin_preserve_access_index(). */
+	__builtin_preserve_access_index(({
+		pkt_len = skb->len;
+	}));
+	if (pkt_len == 74 && ret == 74 * val && val == 3)
+		test_result_subprog3 = 1;
+	return 0;
+}
+
+/* Set to 1 once new_get_skb_len() has run with skb->len == 74. */
+__u64 test_get_skb_len = 0;
+SEC("freplace/get_skb_len")
+int new_get_skb_len(struct __sk_buff *skb)
+{
+	int pkt_len = skb->len;
+
+	if (pkt_len == 74) {
+		test_get_skb_len = 1;
+		return 74; /* original get_skb_len() returns skb->len */
+	}
+	return 0;
+}
+
+/* Set to 1 once new_get_skb_ifindex() has seen the expected packet and arguments. */
+__u64 test_get_skb_ifindex = 0;
+/*
+ * freplace/get_skb_ifindex program.
+ *
+ * Reads the IPv6 header located 14 bytes into the packet twice -- once via
+ * direct packet access, once via bpf_skb_load_bytes() -- and checks that
+ * nexthdr == 6 and payload_len == htons(123) both times, then checks
+ * ifindex == 1, val == 3 and var == 1.
+ *
+ * Returns 3 and sets test_get_skb_ifindex when every check passes,
+ * 0 otherwise.
+ */
+SEC("freplace/get_skb_ifindex")
+int new_get_skb_ifindex(int val, struct __sk_buff *skb, int var)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ipv6hdr ip6, *ip6p;
+	int ifindex = skb->ifindex;
+
+	/* check that BPF extension can read packet via direct packet access */
+	if (data + 14 + sizeof(ip6) > data_end)
+		return 0;
+	ip6p = data + 14;
+
+	if (ip6p->nexthdr != 6 || ip6p->payload_len != __bpf_constant_htons(123))
+		return 0;
+
+	/* check that legacy packet access helper works too */
+	if (bpf_skb_load_bytes(skb, 14, &ip6, sizeof(ip6)) < 0)
+		return 0;
+	ip6p = &ip6;
+	if (ip6p->nexthdr != 6 || ip6p->payload_len != __bpf_constant_htons(123))
+		return 0;
+
+	if (ifindex != 1 || val != 3 || var != 1)
+		return 0;
+	test_get_skb_ifindex = 1;
+	return 3; /* original get_skb_ifindex() returns val * ifindex * var */
+}
+
+/* Set to 1 once new_get_constant() has been called with val == 123. */
+volatile __u64 test_get_constant = 0;
+SEC("freplace/get_constant")
+int new_get_constant(long val)
+{
+	if (val == 123) {
+		test_get_constant = 1;
+		/* original get_constant() returns val - 122 */
+		return test_get_constant;
+	}
+	return 0;
+}
+
+/* Set to 1 once new_test_pkt_write_access_subprog() has modified the packet. */
+__u64 test_pkt_write_access_subprog = 0;
+/*
+ * freplace/test_pkt_write_access_subprog program.
+ *
+ * Treats the bytes at packet offset 'off' as a TCP header, increments its
+ * checksum field and clears its SYN bit. Returns -1 when 'off' exceeds the
+ * combined size of struct ethhdr and struct ipv6hdr or when the header
+ * would run past data_end; returns 0 after a successful write.
+ */
+SEC("freplace/test_pkt_write_access_subprog")
+int new_test_pkt_write_access_subprog(struct __sk_buff *skb, __u32 off)
+{
+	void *pkt_start = (void *)(long)skb->data;
+	void *pkt_end = (void *)(long)skb->data_end;
+	struct tcphdr *hdr;
+
+	if (off > sizeof(struct ethhdr) + sizeof(struct ipv6hdr))
+		return -1;
+
+	hdr = pkt_start + off;
+	if (hdr + 1 > pkt_end)
+		return -1;
+
+	/* make modifications to the packet data */
+	hdr->check++;
+	hdr->syn = 0;
+
+	test_pkt_write_access_subprog = 1;
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c
new file mode 100644
index 000000000..85c0b516d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fexit_bpf2bpf_simple.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct sk_buff {
+ unsigned int len;
+};
+
+__u64 test_result = 0;
+
+SEC("fexit/test_pkt_md_access")
+int BPF_PROG(test_main2, struct sk_buff *skb, int ret)
+{
+ int len; /* NOTE(review): ret is presumably the traced function's return value - confirm */
+
+ __builtin_preserve_access_index(({
+ len = skb->len;
+ }));
+ if (len != 74 || ret != 0) /* leave test_result at 0 unless both values match */
+ return 0;
+
+ test_result = 1;
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/fexit_test.c b/tools/testing/selftests/bpf/progs/fexit_test.c
new file mode 100644
index 000000000..0952affb2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fexit_test.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u64 test1_result = 0;
+SEC("fexit/bpf_fentry_test1")
+int BPF_PROG(test1, int a, int ret)
+{
+ test1_result = a == 1 && ret == 2;
+ return 0;
+}
+
+__u64 test2_result = 0;
+SEC("fexit/bpf_fentry_test2")
+int BPF_PROG(test2, int a, __u64 b, int ret)
+{
+ test2_result = a == 2 && b == 3 && ret == 5;
+ return 0;
+}
+
+__u64 test3_result = 0;
+SEC("fexit/bpf_fentry_test3")
+int BPF_PROG(test3, char a, int b, __u64 c, int ret)
+{
+ test3_result = a == 4 && b == 5 && c == 6 && ret == 15;
+ return 0;
+}
+
+__u64 test4_result = 0;
+SEC("fexit/bpf_fentry_test4")
+int BPF_PROG(test4, void *a, char b, int c, __u64 d, int ret)
+{
+ test4_result = a == (void *)7 && b == 8 && c == 9 && d == 10 &&
+ ret == 34;
+ return 0;
+}
+
+__u64 test5_result = 0;
+SEC("fexit/bpf_fentry_test5")
+int BPF_PROG(test5, __u64 a, void *b, short c, int d, __u64 e, int ret)
+{
+ test5_result = a == 11 && b == (void *)12 && c == 13 && d == 14 &&
+ e == 15 && ret == 65;
+ return 0;
+}
+
+__u64 test6_result = 0;
+SEC("fexit/bpf_fentry_test6")
+int BPF_PROG(test6, __u64 a, void *b, short c, int d, void *e, __u64 f, int ret)
+{
+ test6_result = a == 16 && b == (void *)17 && c == 18 && d == 19 &&
+ e == (void *)20 && f == 21 && ret == 111;
+ return 0;
+}
+
+struct bpf_fentry_test_t {
+ struct bpf_fentry_test *a;
+};
+
+__u64 test7_result = 0;
+SEC("fexit/bpf_fentry_test7")
+int BPF_PROG(test7, struct bpf_fentry_test_t *arg)
+{
+ if (arg == 0)
+ test7_result = 1;
+ return 0;
+}
+
+__u64 test8_result = 0;
+SEC("fexit/bpf_fentry_test8")
+int BPF_PROG(test8, struct bpf_fentry_test_t *arg)
+{
+ if (arg->a == 0)
+ test8_result = 1;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/fmod_ret_freplace.c b/tools/testing/selftests/bpf/progs/fmod_ret_freplace.c
new file mode 100644
index 000000000..c8943ccee
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/fmod_ret_freplace.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+volatile __u64 test_fmod_ret = 0;
+SEC("fmod_ret/security_new_get_constant")
+int BPF_PROG(fmod_ret_test, long val, int ret)
+{
+ test_fmod_ret = 1;
+ return 120;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/freplace_attach_probe.c b/tools/testing/selftests/bpf/progs/freplace_attach_probe.c
new file mode 100644
index 000000000..bb2a77c5b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/freplace_attach_probe.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define VAR_NUM 2
+
+struct hmap_elem {
+ struct bpf_spin_lock lock;
+ int var[VAR_NUM];
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, struct hmap_elem);
+} hash_map SEC(".maps");
+
+SEC("freplace/handle_kprobe")
+int new_handle_kprobe(struct pt_regs *ctx)
+{
+ struct hmap_elem *val; /* hash_map entry for key 0, NULL when absent */
+ int key = 0;
+
+ val = bpf_map_lookup_elem(&hash_map, &key);
+ if (!val)
+ return 1; /* no entry to update */
+ /* spin_lock in hash map: var[0] is written while holding the element's lock */
+ bpf_spin_lock(&val->lock);
+ val->var[0] = 99;
+ bpf_spin_unlock(&val->lock);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/freplace_cls_redirect.c b/tools/testing/selftests/bpf/progs/freplace_cls_redirect.c
new file mode 100644
index 000000000..68a5a9db9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/freplace_cls_redirect.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+struct bpf_map_def SEC("maps") sock_map = {
+ .type = BPF_MAP_TYPE_SOCKMAP,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 2,
+};
+
+SEC("freplace/cls_redirect")
+int freplace_cls_redirect_test(struct __sk_buff *skb)
+{
+ int ret = 0;
+ const int zero = 0;
+ struct bpf_sock *sk;
+
+ sk = bpf_map_lookup_elem(&sock_map, &zero); /* socket stored at key 0 of sock_map */
+ if (!sk)
+ return TC_ACT_SHOT;
+
+ ret = bpf_map_update_elem(&sock_map, &zero, sk, 0); /* write the same socket back at key 0 */
+ bpf_sk_release(sk); /* released whether or not the update succeeded */
+
+ return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/freplace_connect4.c b/tools/testing/selftests/bpf/progs/freplace_connect4.c
new file mode 100644
index 000000000..a0ae84230
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/freplace_connect4.c
@@ -0,0 +1,18 @@
+#include <linux/stddef.h>
+#include <linux/ipv6.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+SEC("freplace/do_bind")
+int new_do_bind(struct bpf_sock_addr *ctx)
+{
+ struct sockaddr_in sa = {}; /* all-zero address passed to bpf_bind() */
+
+ bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)); /* return value is not checked */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c
new file mode 100644
index 000000000..544e5ac90
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/freplace_connect_v4_prog.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/stddef.h>
+#include <linux/ipv6.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+SEC("freplace/connect_v4_prog")
+int new_connect_v4_prog(struct bpf_sock_addr *ctx)
+{
+ // return a value that's in the invalid range
+ return 255;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/freplace_get_constant.c b/tools/testing/selftests/bpf/progs/freplace_get_constant.c
new file mode 100644
index 000000000..705e4b64d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/freplace_get_constant.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+volatile __u64 test_get_constant = 0;
+SEC("freplace/get_constant")
+int security_new_get_constant(long val)
+{
+ if (val != 123)
+ return 0;
+ test_get_constant = 1;
+ return test_get_constant; /* original get_constant() returns val - 122 */
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c
new file mode 100644
index 000000000..6b42db2fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/get_cgroup_id_kern.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} cg_ids SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} pidmap SEC(".maps");
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int trace(void *ctx)
+{
+ __u32 pid = bpf_get_current_pid_tgid(); /* assignment to __u32 keeps only the low 32 bits */
+ __u32 key = 0, *expected_pid;
+ __u64 *val;
+
+ expected_pid = bpf_map_lookup_elem(&pidmap, &key);
+ if (!expected_pid || *expected_pid != pid) /* act only for the pid stored in pidmap[0] */
+ return 0;
+
+ val = bpf_map_lookup_elem(&cg_ids, &key);
+ if (val)
+ *val = bpf_get_current_cgroup_id(); /* store the current cgroup id in cg_ids[0] */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/progs/kfree_skb.c b/tools/testing/selftests/bpf/progs/kfree_skb.c
new file mode 100644
index 000000000..a46a264ce
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/kfree_skb.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} perf_buf_map SEC(".maps");
+
+#define _(P) (__builtin_preserve_access_index(P))
+
+/* define few struct-s that bpf program needs to access */
+struct callback_head {
+ struct callback_head *next;
+ void (*func)(struct callback_head *head);
+};
+struct dev_ifalias {
+ struct callback_head rcuhead;
+};
+
+struct net_device /* same as kernel's struct net_device */ {
+ int ifindex;
+ struct dev_ifalias *ifalias;
+};
+
+typedef struct {
+ int counter;
+} atomic_t;
+typedef struct refcount_struct {
+ atomic_t refs;
+} refcount_t;
+
+struct sk_buff {
+ /* field names and sizes should match to those in the kernel */
+ unsigned int len, data_len;
+ __u16 mac_len, hdr_len, queue_mapping;
+ struct net_device *dev;
+ /* order of the fields doesn't matter */
+ refcount_t users;
+ unsigned char *data;
+ char __pkt_type_offset[0];
+ char cb[48];
+};
+
+struct meta {
+ int ifindex;
+ __u32 cb32_0;
+ __u8 cb8_0;
+};
+
+/* TRACE_EVENT(kfree_skb,
+ * TP_PROTO(struct sk_buff *skb, void *location),
+ */
+SEC("tp_btf/kfree_skb")
+int BPF_PROG(trace_kfree_skb, struct sk_buff *skb, void *location)
+{
+ struct net_device *dev;
+ struct callback_head *ptr;
+ void *func;
+ int users;
+ unsigned char *data;
+ unsigned short pkt_data;
+ struct meta meta = {};
+ char pkt_type;
+ __u32 *cb32;
+ __u8 *cb8;
+
+ __builtin_preserve_access_index(({
+ users = skb->users.refs.counter;
+ data = skb->data;
+ dev = skb->dev;
+ ptr = dev->ifalias->rcuhead.next;
+ func = ptr->func;
+ cb8 = (__u8 *)&skb->cb;
+ cb32 = (__u32 *)&skb->cb;
+ }));
+
+ meta.ifindex = _(dev->ifindex);
+ meta.cb8_0 = cb8[8]; /* byte at offset 8 of skb->cb */
+ meta.cb32_0 = cb32[2]; /* __u32 at the same byte offset 8 of skb->cb */
+
+ bpf_probe_read_kernel(&pkt_type, sizeof(pkt_type), _(&skb->__pkt_type_offset));
+ pkt_type &= 7; /* keep only the low three bits */
+
+ /* read eth proto: the two bytes at offset 12 of the packet data */
+ bpf_probe_read_kernel(&pkt_data, sizeof(pkt_data), data + 12);
+
+ bpf_printk("rcuhead.next %llx func %llx\n", ptr, func);
+ bpf_printk("skb->len %d users %d pkt_type %x\n",
+ _(skb->len), users, pkt_type);
+ bpf_printk("skb->queue_mapping %d\n", _(skb->queue_mapping));
+ bpf_printk("dev->ifindex %d data %llx pkt_data %x\n",
+ meta.ifindex, data, pkt_data);
+ bpf_printk("cb8_0:%x cb32_0:%x\n", meta.cb8_0, meta.cb32_0);
+
+ if (users != 1 || pkt_data != bpf_htons(0x86dd) || meta.ifindex != 1)
+ /* raw tp ignores return value */
+ return 0;
+
+ /* send the first 72 bytes of the packet, plus meta, to user space */
+ bpf_skb_output(skb, &perf_buf_map, (72ull << 32) | BPF_F_CURRENT_CPU,
+ &meta, sizeof(meta));
+ return 0;
+}
+
+static volatile struct {
+ bool fentry_test_ok;
+ bool fexit_test_ok;
+} result;
+
+SEC("fentry/eth_type_trans")
+int BPF_PROG(fentry_eth_type_trans, struct sk_buff *skb, struct net_device *dev,
+ unsigned short protocol)
+{
+ int len, ifindex;
+
+ __builtin_preserve_access_index(({
+ len = skb->len;
+ ifindex = dev->ifindex;
+ }));
+
+ /* fentry sees full packet including L2 header */
+ if (len != 74 || ifindex != 1)
+ return 0;
+ result.fentry_test_ok = true;
+ return 0;
+}
+
+SEC("fexit/eth_type_trans")
+int BPF_PROG(fexit_eth_type_trans, struct sk_buff *skb, struct net_device *dev,
+ unsigned short protocol)
+{
+ int len, ifindex;
+
+ __builtin_preserve_access_index(({
+ len = skb->len;
+ ifindex = dev->ifindex;
+ }));
+
+ /* fexit sees packet without L2 header that eth_type_trans should have
+ * consumed.
+ */
+ if (len != 60 || protocol != bpf_htons(0x86dd) || ifindex != 1)
+ return 0;
+ result.fexit_test_ok = true;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/load_bytes_relative.c b/tools/testing/selftests/bpf/progs/load_bytes_relative.c
new file mode 100644
index 000000000..dc1d04a7a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/load_bytes_relative.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} test_result SEC(".maps");
+
+SEC("cgroup_skb/egress")
+int load_bytes_relative(struct __sk_buff *skb)
+{
+ struct ethhdr eth;
+ struct iphdr iph;
+
+ __u32 map_key = 0;
+ __u32 test_passed = 0;
+
+ /* MAC header is not set by the time cgroup_skb/egress triggers */
+ if (bpf_skb_load_bytes_relative(skb, 0, &eth, sizeof(eth),
+ BPF_HDR_START_MAC) != -EFAULT)
+ goto fail;
+
+ if (bpf_skb_load_bytes_relative(skb, 0, &iph, sizeof(iph),
+ BPF_HDR_START_NET))
+ goto fail;
+
+ if (bpf_skb_load_bytes_relative(skb, 0xffff, &iph, sizeof(iph),
+ BPF_HDR_START_NET) != -EFAULT)
+ goto fail;
+
+ test_passed = 1;
+
+fail:
+ bpf_map_update_elem(&test_result, &map_key, &test_passed, BPF_ANY);
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c
new file mode 100644
index 000000000..09529e33b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/local_storage.c
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <errno.h>
+#include <linux/bpf.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+#define DUMMY_STORAGE_VALUE 0xdeadbeef
+
+int monitored_pid = 0;
+int inode_storage_result = -1;
+int sk_storage_result = -1;
+
+struct dummy_storage {
+ __u32 value;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_INODE_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct dummy_storage);
+} inode_storage_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct dummy_storage);
+} sk_storage_map SEC(".maps");
+
+/* TODO Use vmlinux.h once BTF pruning for embedded types is fixed.
+ */
+struct sock {} __attribute__((preserve_access_index));
+struct sockaddr {} __attribute__((preserve_access_index));
+struct socket {
+ struct sock *sk;
+} __attribute__((preserve_access_index));
+
+struct inode {} __attribute__((preserve_access_index));
+struct dentry {
+ struct inode *d_inode;
+} __attribute__((preserve_access_index));
+struct file {
+ struct inode *f_inode;
+} __attribute__((preserve_access_index));
+
+
+SEC("lsm/inode_unlink")
+int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct dummy_storage *storage;
+ int err = -1; /* stays non-zero unless the delete below runs and succeeds */
+
+ if (pid != monitored_pid) /* only check unlinks done by the monitored process */
+ return 0;
+
+ storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0;
+
+ if (storage->value != DUMMY_STORAGE_VALUE) /* wrong value: fail, and skip the delete so it cannot mask this */
+ inode_storage_result = -1;
+ else
+ err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode);
+ if (!err) /* success requires the expected value and a clean delete */
+ inode_storage_result = err;
+
+ return 0;
+}
+
+SEC("lsm/socket_bind")
+int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
+ int addrlen)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct dummy_storage *storage;
+ int err = -1; /* stays non-zero unless the delete below runs and succeeds */
+
+ if (pid != monitored_pid) /* only check binds done by the monitored process */
+ return 0;
+
+ storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0;
+
+ if (storage->value != DUMMY_STORAGE_VALUE) /* wrong value: fail, and skip the delete so it cannot mask this */
+ sk_storage_result = -1;
+ else
+ err = bpf_sk_storage_delete(&sk_storage_map, sock->sk);
+ if (!err) /* success requires the expected value and a clean delete */
+ sk_storage_result = err;
+
+ return 0;
+}
+
+SEC("lsm/socket_post_create")
+int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
+ int protocol, int kern)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct dummy_storage *storage;
+
+ if (pid != monitored_pid)
+ return 0;
+
+ storage = bpf_sk_storage_get(&sk_storage_map, sock->sk, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0;
+
+ storage->value = DUMMY_STORAGE_VALUE;
+
+ return 0;
+}
+
+SEC("lsm/file_open")
+int BPF_PROG(file_open, struct file *file)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ struct dummy_storage *storage;
+
+ if (pid != monitored_pid)
+ return 0;
+
+ if (!file->f_inode)
+ return 0;
+
+ storage = bpf_inode_storage_get(&inode_storage_map, file->f_inode, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 0;
+
+ storage->value = DUMMY_STORAGE_VALUE;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop1.c b/tools/testing/selftests/bpf/progs/loop1.c
new file mode 100644
index 000000000..50e66772c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/loop1.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("raw_tracepoint/kfree_skb")
+int nested_loops(volatile struct pt_regs* ctx)
+{
+ int i, j, sum = 0, m;
+
+ for (j = 0; j < 300; j++)
+ for (i = 0; i < j; i++) {
+ if (j & 1)
+ m = PT_REGS_RC(ctx);
+ else
+ m = j;
+ sum += i * m;
+ }
+
+ return sum;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop2.c b/tools/testing/selftests/bpf/progs/loop2.c
new file mode 100644
index 000000000..947bb7e98
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/loop2.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("raw_tracepoint/consume_skb")
+int while_true(volatile struct pt_regs* ctx)
+{
+ int i = 0;
+
+ while (true) {
+ if (PT_REGS_RC(ctx) & 1)
+ i += 3;
+ else
+ i += 7;
+ if (i > 40)
+ break;
+ }
+
+ return i;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop3.c b/tools/testing/selftests/bpf/progs/loop3.c
new file mode 100644
index 000000000..76e93b31c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/loop3.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("raw_tracepoint/consume_skb")
+int while_true(volatile struct pt_regs* ctx)
+{
+ __u64 i = 0, sum = 0;
+ do {
+ i++;
+ sum += PT_REGS_RC(ctx);
+ } while (i < 0x100000000ULL);
+ return sum;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop4.c b/tools/testing/selftests/bpf/progs/loop4.c
new file mode 100644
index 000000000..b35337926
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/loop4.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+SEC("socket")
+int combinations(volatile struct __sk_buff* skb)
+{
+ int ret = 0, i;
+
+#pragma nounroll
+ for (i = 0; i < 20; i++)
+ if (skb->len)
+ ret |= 1 << i;
+ return ret;
+}
diff --git a/tools/testing/selftests/bpf/progs/loop5.c b/tools/testing/selftests/bpf/progs/loop5.c
new file mode 100644
index 000000000..913791923
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/loop5.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#define barrier() __asm__ __volatile__("": : :"memory")
+
+char _license[] SEC("license") = "GPL";
+
+SEC("socket")
+int while_true(volatile struct __sk_buff* skb)
+{
+ int i = 0;
+
+ while (1) {
+ if (skb->len)
+ i += 3;
+ else
+ i += 7;
+ if (i == 9)
+ break;
+ barrier();
+ if (i == 10)
+ break;
+ barrier();
+ if (i == 13)
+ break;
+ barrier();
+ if (i == 14)
+ break;
+ }
+ return i;
+}
diff --git a/tools/testing/selftests/bpf/progs/lsm.c b/tools/testing/selftests/bpf/progs/lsm.c
new file mode 100644
index 000000000..ff4d343b9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/lsm.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <errno.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} array SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} hash SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} lru_hash SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+int monitored_pid = 0;
+int mprotect_count = 0;
+int bprm_count = 0;
+
+SEC("lsm/file_mprotect")
+int BPF_PROG(test_int_hook, struct vm_area_struct *vma,
+ unsigned long reqprot, unsigned long prot, int ret)
+{
+ if (ret != 0)
+ return ret;
+
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ int is_stack = 0;
+
+ is_stack = (vma->vm_start <= vma->vm_mm->start_stack &&
+ vma->vm_end >= vma->vm_mm->start_stack);
+
+ if (is_stack && monitored_pid == pid) {
+ mprotect_count++;
+ ret = -EPERM;
+ }
+
+ return ret;
+}
+
+SEC("lsm.s/bprm_committed_creds")
+int BPF_PROG(test_void_hook, struct linux_binprm *bprm)
+{
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+ char args[64];
+ __u32 key = 0;
+ __u64 *value;
+
+ if (monitored_pid == pid)
+ bprm_count++;
+
+ bpf_copy_from_user(args, sizeof(args), (void *)bprm->vma->vm_mm->arg_start);
+ bpf_copy_from_user(args, sizeof(args), (void *)bprm->mm->arg_start);
+
+ value = bpf_map_lookup_elem(&array, &key);
+ if (value)
+ *value = 0;
+ value = bpf_map_lookup_elem(&hash, &key);
+ if (value)
+ *value = 0;
+ value = bpf_map_lookup_elem(&lru_hash, &key);
+ if (value)
+ *value = 0;
+
+ return 0;
+}
+SEC("lsm/task_free") /* lsm/ is ok, lsm.s/ fails */
+int BPF_PROG(test_task_free, struct task_struct *task)
+{
+ return 0;
+}
+
+int copy_test = 0;
+
+SEC("fentry.s/__x64_sys_setdomainname")
+int BPF_PROG(test_sys_setdomainname, struct pt_regs *regs)
+{
+ void *ptr = (void *)PT_REGS_PARM1(regs);
+ int len = PT_REGS_PARM2(regs);
+ int buf = 0;
+ long ret;
+
+ ret = bpf_copy_from_user(&buf, sizeof(buf), ptr);
+ if (len == -2 && ret == 0 && buf == 1234)
+ copy_test++;
+ if (len == -3 && ret == -EFAULT)
+ copy_test++;
+ if (len == -4 && ret == -EFAULT)
+ copy_test++;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
new file mode 100644
index 000000000..c32540575
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c
@@ -0,0 +1,694 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define LOOP_BOUND 0xf
+#define MAX_ENTRIES 8
+#define HALF_ENTRIES (MAX_ENTRIES >> 1)
+
+_Static_assert(MAX_ENTRIES < LOOP_BOUND, "MAX_ENTRIES must be < LOOP_BOUND");
+
+enum bpf_map_type g_map_type = BPF_MAP_TYPE_UNSPEC;
+__u32 g_line = 0;
+
+#define VERIFY_TYPE(type, func) ({ \
+ g_map_type = type; \
+ if (!func()) \
+ return 0; \
+})
+
+
+#define VERIFY(expr) ({ \
+ g_line = __LINE__; \
+ if (!(expr)) \
+ return 0; \
+})
+
+struct bpf_map_memory {
+ __u32 pages;
+} __attribute__((preserve_access_index));
+
+struct bpf_map {
+ enum bpf_map_type map_type;
+ __u32 key_size;
+ __u32 value_size;
+ __u32 max_entries;
+ __u32 id;
+ struct bpf_map_memory memory;
+} __attribute__((preserve_access_index));
+
+static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size,
+ __u32 value_size, __u32 max_entries)
+{
+ VERIFY(map->map_type == g_map_type);
+ VERIFY(map->key_size == key_size);
+ VERIFY(map->value_size == value_size);
+ VERIFY(map->max_entries == max_entries);
+ VERIFY(map->id > 0);
+ VERIFY(map->memory.pages > 0);
+
+ return 1;
+}
+
+static inline int check_bpf_map_ptr(struct bpf_map *indirect,
+ struct bpf_map *direct)
+{
+ VERIFY(indirect->map_type == direct->map_type);
+ VERIFY(indirect->key_size == direct->key_size);
+ VERIFY(indirect->value_size == direct->value_size);
+ VERIFY(indirect->max_entries == direct->max_entries);
+ VERIFY(indirect->id == direct->id);
+ VERIFY(indirect->memory.pages == direct->memory.pages);
+
+ return 1;
+}
+
+static inline int check(struct bpf_map *indirect, struct bpf_map *direct,
+ __u32 key_size, __u32 value_size, __u32 max_entries)
+{
+ VERIFY(check_bpf_map_ptr(indirect, direct));
+ VERIFY(check_bpf_map_fields(indirect, key_size, value_size,
+ max_entries));
+ return 1;
+}
+
+static inline int check_default(struct bpf_map *indirect,
+ struct bpf_map *direct)
+{
+ VERIFY(check(indirect, direct, sizeof(__u32), sizeof(__u32),
+ MAX_ENTRIES));
+ return 1;
+}
+
+static __noinline int
+check_default_noinline(struct bpf_map *indirect, struct bpf_map *direct)
+{
+ VERIFY(check(indirect, direct, sizeof(__u32), sizeof(__u32),
+ MAX_ENTRIES));
+ return 1;
+}
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+struct bpf_htab {
+ struct bpf_map map;
+ atomic_t count;
+ __u32 n_buckets;
+ __u32 elem_size;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(map_flags, BPF_F_NO_PREALLOC); /* to test bpf_htab.count */
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_hash SEC(".maps");
+
+static inline int check_hash(void)
+{
+ struct bpf_htab *hash = (struct bpf_htab *)&m_hash;
+ struct bpf_map *map = (struct bpf_map *)&m_hash;
+ int i;
+
+ VERIFY(check_default_noinline(&hash->map, map));
+
+ VERIFY(hash->n_buckets == MAX_ENTRIES);
+ VERIFY(hash->elem_size == 64);
+
+ VERIFY(hash->count.counter == 0);
+ for (i = 0; i < HALF_ENTRIES; ++i) {
+ const __u32 key = i;
+ const __u32 val = 1;
+
+ if (bpf_map_update_elem(hash, &key, &val, 0))
+ return 0;
+ }
+ VERIFY(hash->count.counter == HALF_ENTRIES);
+
+ return 1;
+}
+
+struct bpf_array {
+ struct bpf_map map;
+ __u32 elem_size;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_array SEC(".maps");
+
+static inline int check_array(void)
+{
+ struct bpf_array *array = (struct bpf_array *)&m_array;
+ struct bpf_map *map = (struct bpf_map *)&m_array;
+ int i, n_lookups = 0, n_keys = 0;
+
+ VERIFY(check_default(&array->map, map));
+
+ VERIFY(array->elem_size == 8);
+
+ for (i = 0; i < array->map.max_entries && i < LOOP_BOUND; ++i) {
+ const __u32 key = i;
+ __u32 *val = bpf_map_lookup_elem(array, &key);
+
+ ++n_lookups;
+ if (val)
+ ++n_keys;
+ }
+
+ VERIFY(n_lookups == MAX_ENTRIES);
+ VERIFY(n_keys == MAX_ENTRIES);
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_prog_array SEC(".maps");
+
+static inline int check_prog_array(void)
+{
+ struct bpf_array *prog_array = (struct bpf_array *)&m_prog_array;
+ struct bpf_map *map = (struct bpf_map *)&m_prog_array;
+
+ VERIFY(check_default(&prog_array->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_perf_event_array SEC(".maps");
+
+static inline int check_perf_event_array(void)
+{
+ struct bpf_array *perf_event_array = (struct bpf_array *)&m_perf_event_array;
+ struct bpf_map *map = (struct bpf_map *)&m_perf_event_array;
+
+ VERIFY(check_default(&perf_event_array->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_percpu_hash SEC(".maps");
+
+static inline int check_percpu_hash(void)
+{
+ struct bpf_htab *percpu_hash = (struct bpf_htab *)&m_percpu_hash;
+ struct bpf_map *map = (struct bpf_map *)&m_percpu_hash;
+
+ VERIFY(check_default(&percpu_hash->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_percpu_array SEC(".maps");
+
+static inline int check_percpu_array(void)
+{
+ struct bpf_array *percpu_array = (struct bpf_array *)&m_percpu_array;
+ struct bpf_map *map = (struct bpf_map *)&m_percpu_array;
+
+ VERIFY(check_default(&percpu_array->map, map));
+
+ return 1;
+}
+
+struct bpf_stack_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u64);
+} m_stack_trace SEC(".maps");
+
+static inline int check_stack_trace(void)
+{
+ struct bpf_stack_map *stack_trace =
+ (struct bpf_stack_map *)&m_stack_trace;
+ struct bpf_map *map = (struct bpf_map *)&m_stack_trace;
+
+ VERIFY(check(&stack_trace->map, map, sizeof(__u32), sizeof(__u64),
+ MAX_ENTRIES));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_cgroup_array SEC(".maps");
+
+static inline int check_cgroup_array(void)
+{
+ struct bpf_array *cgroup_array = (struct bpf_array *)&m_cgroup_array;
+ struct bpf_map *map = (struct bpf_map *)&m_cgroup_array;
+
+ VERIFY(check_default(&cgroup_array->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_lru_hash SEC(".maps");
+
+static inline int check_lru_hash(void)
+{
+ struct bpf_htab *lru_hash = (struct bpf_htab *)&m_lru_hash;
+ struct bpf_map *map = (struct bpf_map *)&m_lru_hash;
+
+ VERIFY(check_default(&lru_hash->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_PERCPU_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_lru_percpu_hash SEC(".maps");
+
+static inline int check_lru_percpu_hash(void)
+{
+ struct bpf_htab *lru_percpu_hash = (struct bpf_htab *)&m_lru_percpu_hash;
+ struct bpf_map *map = (struct bpf_map *)&m_lru_percpu_hash;
+
+ VERIFY(check_default(&lru_percpu_hash->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct lpm_trie listing only its struct bpf_map
+ * member. preserve_access_index asks clang to emit CO-RE relocations for
+ * accesses made through this type. Presumably a cut-down stand-in for the
+ * kernel structure of the same name - confirm.
+ */
+struct lpm_trie {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+/* Key type of m_lpm_trie: a struct bpf_lpm_trie_key followed by a __u32. */
+struct lpm_key {
+ struct bpf_lpm_trie_key trie_key;
+ __u32 data;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_LPM_TRIE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, struct lpm_key);
+ __type(value, __u32);
+} m_lpm_trie SEC(".maps");
+/*
+ * View m_lpm_trie as struct lpm_trie and as struct bpf_map and pass both to
+ * check() together with sizeof(struct lpm_key), sizeof(__u32) and
+ * MAX_ENTRIES - the key type, value type and max_entries declared for the
+ * map above. Returns 1 if execution gets past VERIFY(). check() and
+ * VERIFY() are defined outside this excerpt.
+ */
+static inline int check_lpm_trie(void)
+{
+ struct lpm_trie *lpm_trie = (struct lpm_trie *)&m_lpm_trie;
+ struct bpf_map *map = (struct bpf_map *)&m_lpm_trie;
+
+ VERIFY(check(&lpm_trie->map, map, sizeof(struct lpm_key), sizeof(__u32),
+ MAX_ENTRIES));
+
+ return 1;
+}
+
+struct inner_map {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} inner_map SEC(".maps");
+/*
+ * Array-of-maps whose inner map template is an anonymous struct repeating
+ * the one-entry ARRAY definition of struct inner_map above. The static
+ * initializer puts &inner_map in the first slot, followed by eight zeros.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+ __array(values, struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+ });
+} m_array_of_maps SEC(".maps") = {
+ .values = { (void *)&inner_map, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+/*
+ * View m_array_of_maps as struct bpf_array and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ * VERIFY() and check_default() are defined outside this excerpt.
+ */
+static inline int check_array_of_maps(void)
+{
+ struct bpf_array *array_of_maps = (struct bpf_array *)&m_array_of_maps;
+ struct bpf_map *map = (struct bpf_map *)&m_array_of_maps;
+
+ VERIFY(check_default(&array_of_maps->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+ __array(values, struct inner_map);
+} m_hash_of_maps SEC(".maps") = {
+ .values = {
+ [2] = &inner_map,
+ },
+};
+/*
+ * The map above uses struct inner_map as its inner map type and statically
+ * places &inner_map at index 2. This helper views m_hash_of_maps as
+ * struct bpf_htab and as struct bpf_map, runs check_default() on the pair
+ * under VERIFY(), and returns 1 afterwards.
+ */
+static inline int check_hash_of_maps(void)
+{
+ struct bpf_htab *hash_of_maps = (struct bpf_htab *)&m_hash_of_maps;
+ struct bpf_map *map = (struct bpf_map *)&m_hash_of_maps;
+
+ VERIFY(check_default(&hash_of_maps->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_dtab exposing only its struct bpf_map
+ * member; preserve_access_index makes accesses through it CO-RE relocatable.
+ * Also used for m_devmap_hash further down in this file.
+ */
+struct bpf_dtab {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_devmap SEC(".maps");
+/*
+ * View m_devmap as struct bpf_dtab and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ */
+static inline int check_devmap(void)
+{
+ struct bpf_dtab *devmap = (struct bpf_dtab *)&m_devmap;
+ struct bpf_map *map = (struct bpf_map *)&m_devmap;
+
+ VERIFY(check_default(&devmap->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_stab exposing only its struct bpf_map
+ * member; preserve_access_index makes accesses through it CO-RE relocatable.
+ */
+struct bpf_stab {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_sockmap SEC(".maps");
+/*
+ * View m_sockmap as struct bpf_stab and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ */
+static inline int check_sockmap(void)
+{
+ struct bpf_stab *sockmap = (struct bpf_stab *)&m_sockmap;
+ struct bpf_map *map = (struct bpf_map *)&m_sockmap;
+
+ VERIFY(check_default(&sockmap->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_cpu_map exposing only its struct bpf_map
+ * member; preserve_access_index makes accesses through it CO-RE relocatable.
+ */
+struct bpf_cpu_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CPUMAP);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_cpumap SEC(".maps");
+/*
+ * View m_cpumap as struct bpf_cpu_map and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ */
+static inline int check_cpumap(void)
+{
+ struct bpf_cpu_map *cpumap = (struct bpf_cpu_map *)&m_cpumap;
+ struct bpf_map *map = (struct bpf_map *)&m_cpumap;
+
+ VERIFY(check_default(&cpumap->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct xsk_map exposing only its struct bpf_map
+ * member; preserve_access_index makes accesses through it CO-RE relocatable.
+ */
+struct xsk_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_XSKMAP);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_xskmap SEC(".maps");
+/*
+ * View m_xskmap as struct xsk_map and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ */
+static inline int check_xskmap(void)
+{
+ struct xsk_map *xskmap = (struct xsk_map *)&m_xskmap;
+ struct bpf_map *map = (struct bpf_map *)&m_xskmap;
+
+ VERIFY(check_default(&xskmap->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_shtab exposing only its struct bpf_map
+ * member; preserve_access_index makes accesses through it CO-RE relocatable.
+ */
+struct bpf_shtab {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_sockhash SEC(".maps");
+/*
+ * View m_sockhash as struct bpf_shtab and as struct bpf_map, run
+ * check_default() on the pair under VERIFY(), and return 1 afterwards.
+ */
+static inline int check_sockhash(void)
+{
+ struct bpf_shtab *sockhash = (struct bpf_shtab *)&m_sockhash;
+ struct bpf_map *map = (struct bpf_map *)&m_sockhash;
+
+ VERIFY(check_default(&sockhash->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_cgroup_storage_map exposing only its
+ * struct bpf_map member (CO-RE relocatable via preserve_access_index).
+ * Shared by the cgroup-storage and per-CPU cgroup-storage checks.
+ */
+struct bpf_cgroup_storage_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, __u32);
+} m_cgroup_storage SEC(".maps");
+/*
+ * The map above declares no max_entries, and the last argument handed to
+ * check() is 0; the other two are the sizes of its key and value types.
+ * Returns 1 if execution gets past VERIFY(). check() and VERIFY() are
+ * defined outside this excerpt.
+ */
+static inline int check_cgroup_storage(void)
+{
+ struct bpf_cgroup_storage_map *cgroup_storage =
+ (struct bpf_cgroup_storage_map *)&m_cgroup_storage;
+ struct bpf_map *map = (struct bpf_map *)&m_cgroup_storage;
+
+ VERIFY(check(&cgroup_storage->map, map,
+ sizeof(struct bpf_cgroup_storage_key), sizeof(__u32), 0));
+
+ return 1;
+}
+/*
+ * Local declaration of struct reuseport_array exposing only its
+ * struct bpf_map member (CO-RE relocatable via preserve_access_index).
+ */
+struct reuseport_array {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_reuseport_sockarray SEC(".maps");
+/*
+ * View m_reuseport_sockarray as struct reuseport_array and as
+ * struct bpf_map, run check_default() on the pair under VERIFY(), and
+ * return 1 afterwards.
+ */
+static inline int check_reuseport_sockarray(void)
+{
+ struct reuseport_array *reuseport_sockarray =
+ (struct reuseport_array *)&m_reuseport_sockarray;
+ struct bpf_map *map = (struct bpf_map *)&m_reuseport_sockarray;
+
+ VERIFY(check_default(&reuseport_sockarray->map, map));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, __u32);
+} m_percpu_cgroup_storage SEC(".maps");
+/*
+ * Per-CPU counterpart of check_cgroup_storage(): reuses
+ * struct bpf_cgroup_storage_map for the typed view and passes the key size,
+ * the value size and 0 to check() (the map declares no max_entries).
+ * Returns 1 if execution gets past VERIFY().
+ */
+static inline int check_percpu_cgroup_storage(void)
+{
+ struct bpf_cgroup_storage_map *percpu_cgroup_storage =
+ (struct bpf_cgroup_storage_map *)&m_percpu_cgroup_storage;
+ struct bpf_map *map = (struct bpf_map *)&m_percpu_cgroup_storage;
+
+ VERIFY(check(&percpu_cgroup_storage->map, map,
+ sizeof(struct bpf_cgroup_storage_key), sizeof(__u32), 0));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_queue_stack exposing only its
+ * struct bpf_map member (CO-RE relocatable via preserve_access_index).
+ * Used for both the QUEUE and the STACK map checks.
+ */
+struct bpf_queue_stack {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_QUEUE);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(value, __u32);
+} m_queue SEC(".maps");
+/*
+ * The queue map declares a value type but no key type, and check() is given
+ * 0 in the position where the other helpers pass a key size, followed by
+ * sizeof(__u32) and MAX_ENTRIES. Returns 1 if execution gets past VERIFY().
+ */
+static inline int check_queue(void)
+{
+ struct bpf_queue_stack *queue = (struct bpf_queue_stack *)&m_queue;
+ struct bpf_map *map = (struct bpf_map *)&m_queue;
+
+ VERIFY(check(&queue->map, map, 0, sizeof(__u32), MAX_ENTRIES));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(value, __u32);
+} m_stack SEC(".maps");
+/*
+ * Same shape as check_queue(): the stack map has no key type, so check() is
+ * called with 0, sizeof(__u32) and MAX_ENTRIES on the struct bpf_queue_stack
+ * and struct bpf_map views of m_stack. Returns 1 past VERIFY().
+ */
+static inline int check_stack(void)
+{
+ struct bpf_queue_stack *stack = (struct bpf_queue_stack *)&m_stack;
+ struct bpf_map *map = (struct bpf_map *)&m_stack;
+
+ VERIFY(check(&stack->map, map, 0, sizeof(__u32), MAX_ENTRIES));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_local_storage_map exposing only its
+ * struct bpf_map member (CO-RE relocatable via preserve_access_index).
+ */
+struct bpf_local_storage_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_sk_storage SEC(".maps");
+/*
+ * The sk_storage map declares no max_entries; check() is called with
+ * sizeof(__u32) for both key and value and 0 as the final argument.
+ * Returns 1 if execution gets past VERIFY().
+ */
+static inline int check_sk_storage(void)
+{
+ struct bpf_local_storage_map *sk_storage =
+ (struct bpf_local_storage_map *)&m_sk_storage;
+ struct bpf_map *map = (struct bpf_map *)&m_sk_storage;
+
+ VERIFY(check(&sk_storage->map, map, sizeof(__u32), sizeof(__u32), 0));
+
+ return 1;
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
+ __uint(max_entries, MAX_ENTRIES);
+ __type(key, __u32);
+ __type(value, __u32);
+} m_devmap_hash SEC(".maps");
+/*
+ * View m_devmap_hash as struct bpf_dtab (the same type check_devmap() uses)
+ * and as struct bpf_map, run check_default() on the pair under VERIFY(),
+ * and return 1 afterwards.
+ */
+static inline int check_devmap_hash(void)
+{
+ struct bpf_dtab *devmap_hash = (struct bpf_dtab *)&m_devmap_hash;
+ struct bpf_map *map = (struct bpf_map *)&m_devmap_hash;
+
+ VERIFY(check_default(&devmap_hash->map, map));
+
+ return 1;
+}
+/*
+ * Local declaration of struct bpf_ringbuf_map exposing only its
+ * struct bpf_map member (CO-RE relocatable via preserve_access_index).
+ */
+struct bpf_ringbuf_map {
+ struct bpf_map map;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 1 << 12);
+} m_ringbuf SEC(".maps");
+/*
+ * The ring buffer map declares neither key nor value type and sets
+ * max_entries to 1 << 12; check() is therefore called with 0, 0 and
+ * 1 << 12. Returns 1 if execution gets past VERIFY().
+ */
+static inline int check_ringbuf(void)
+{
+ struct bpf_ringbuf_map *ringbuf = (struct bpf_ringbuf_map *)&m_ringbuf;
+ struct bpf_map *map = (struct bpf_map *)&m_ringbuf;
+
+ VERIFY(check(&ringbuf->map, map, 0, 0, 1 << 12));
+
+ return 1;
+}
+/*
+ * Program entry point, placed in the "cgroup_skb/egress" section. Invokes
+ * VERIFY_TYPE() once per map type, pairing each BPF_MAP_TYPE_* constant
+ * with its check_*() helper, and returns 1 at the end. VERIFY_TYPE() is
+ * defined outside this excerpt; presumably it returns early when a check
+ * fails - confirm against its definition. ctx is not used.
+ */
+SEC("cgroup_skb/egress")
+int cg_skb(void *ctx)
+{
+ VERIFY_TYPE(BPF_MAP_TYPE_HASH, check_hash);
+ VERIFY_TYPE(BPF_MAP_TYPE_ARRAY, check_array);
+ VERIFY_TYPE(BPF_MAP_TYPE_PROG_ARRAY, check_prog_array);
+ VERIFY_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, check_perf_event_array);
+ VERIFY_TYPE(BPF_MAP_TYPE_PERCPU_HASH, check_percpu_hash);
+ VERIFY_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, check_percpu_array);
+ VERIFY_TYPE(BPF_MAP_TYPE_STACK_TRACE, check_stack_trace);
+ VERIFY_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, check_cgroup_array);
+ VERIFY_TYPE(BPF_MAP_TYPE_LRU_HASH, check_lru_hash);
+ VERIFY_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, check_lru_percpu_hash);
+ VERIFY_TYPE(BPF_MAP_TYPE_LPM_TRIE, check_lpm_trie);
+ VERIFY_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, check_array_of_maps);
+ VERIFY_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, check_hash_of_maps);
+ VERIFY_TYPE(BPF_MAP_TYPE_DEVMAP, check_devmap);
+ VERIFY_TYPE(BPF_MAP_TYPE_SOCKMAP, check_sockmap);
+ VERIFY_TYPE(BPF_MAP_TYPE_CPUMAP, check_cpumap);
+ VERIFY_TYPE(BPF_MAP_TYPE_XSKMAP, check_xskmap);
+ VERIFY_TYPE(BPF_MAP_TYPE_SOCKHASH, check_sockhash);
+ VERIFY_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, check_cgroup_storage);
+ VERIFY_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ check_reuseport_sockarray);
+ VERIFY_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+ check_percpu_cgroup_storage);
+ VERIFY_TYPE(BPF_MAP_TYPE_QUEUE, check_queue);
+ VERIFY_TYPE(BPF_MAP_TYPE_STACK, check_stack);
+ VERIFY_TYPE(BPF_MAP_TYPE_SK_STORAGE, check_sk_storage);
+ VERIFY_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, check_devmap_hash);
+ VERIFY_TYPE(BPF_MAP_TYPE_RINGBUF, check_ringbuf);
+
+ return 1;
+}
+/* Contents of the "version" and "license" ELF sections. */
+__u32 _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/metadata_unused.c b/tools/testing/selftests/bpf/progs/metadata_unused.c
new file mode 100644
index 000000000..672a0d19f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/metadata_unused.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/*
+ * Two read-only variables placed in .rodata whose names carry the
+ * bpf_metadata_ prefix. Nothing in this file reads them: prog() below
+ * returns 0 unconditionally.
+ */
+volatile const char bpf_metadata_a[] SEC(".rodata") = "foo";
+volatile const int bpf_metadata_b SEC(".rodata") = 1;
+
+/*
+ * Minimal cgroup_skb/egress program: ignores its context and always
+ * returns 0. It references neither bpf_metadata_a nor bpf_metadata_b.
+ *
+ * The context is declared as struct __sk_buff, the context type of
+ * cgroup_skb programs (it was previously spelled struct xdp_md, which
+ * belongs to XDP programs; harmless only because ctx is never touched).
+ */
+SEC("cgroup_skb/egress")
+int prog(struct __sk_buff *ctx)
+{
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/metadata_used.c b/tools/testing/selftests/bpf/progs/metadata_used.c
new file mode 100644
index 000000000..b7198e653
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/metadata_used.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/*
+ * Two read-only variables placed in .rodata whose names carry the
+ * bpf_metadata_ prefix. prog() below reads bpf_metadata_b; bpf_metadata_a
+ * is not referenced anywhere in this file.
+ */
+volatile const char bpf_metadata_a[] SEC(".rodata") = "bar";
+volatile const int bpf_metadata_b SEC(".rodata") = 2;
+
+/*
+ * Minimal cgroup_skb/egress program whose return value depends on
+ * bpf_metadata_b: 1 when it is non-zero, 0 otherwise. The context is not
+ * used.
+ *
+ * The context is declared as struct __sk_buff, the context type of
+ * cgroup_skb programs (it was previously spelled struct xdp_md, which
+ * belongs to XDP programs; harmless only because ctx is never touched).
+ */
+SEC("cgroup_skb/egress")
+int prog(struct __sk_buff *ctx)
+{
+ return bpf_metadata_b ? 1 : 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/modify_return.c b/tools/testing/selftests/bpf/progs/modify_return.c
new file mode 100644
index 000000000..8b7466a15
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/modify_return.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+/*
+ * sequence is incremented by each of the three programs below and used to
+ * check the order in which they ran; input_retval is the value
+ * fmod_ret_test returns and, when non-zero, the value fexit_test expects.
+ */
+static int sequence = 0;
+__s32 input_retval = 0;
+
+/* Set by fentry_test: 1 if it performed the first increment of sequence. */
+__u64 fentry_result = 0;
+SEC("fentry/bpf_modify_return_test")
+int BPF_PROG(fentry_test, int a, __u64 b)
+{
+ /* Bump the shared counter and record whether this was step one. */
+ fentry_result = (++sequence == 1);
+ return 0;
+}
+
+/* Set by fmod_ret_test: 1 if it ran as step two and saw ret == 0. */
+__u64 fmod_ret_result = 0;
+SEC("fmod_ret/bpf_modify_return_test")
+int BPF_PROG(fmod_ret_test, int a, int *b, int ret)
+{
+ int step = ++sequence;
+
+ /* As the first fmod_ret program, the incoming ret should still be 0. */
+ fmod_ret_result = (step == 2 && ret == 0);
+
+ /* Whatever is returned here replaces the traced function's result. */
+ return input_retval;
+}
+
+/* Set by fexit_test: 1 if it ran as step three and saw the expected ret. */
+__u64 fexit_result = 0;
+SEC("fexit/bpf_modify_return_test")
+int BPF_PROG(fexit_test, int a, __u64 b, int ret)
+{
+ /* A non-zero input_retval means the return value should have been
+ * replaced by it; otherwise 4 is expected (presumably the unmodified
+ * result for the arguments used by the userspace test - confirm).
+ */
+ __s32 expected = input_retval ? input_retval : 4;
+
+ sequence++;
+ fexit_result = (sequence == 3 && ret == expected);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/netcnt_prog.c b/tools/testing/selftests/bpf/progs/netcnt_prog.c
new file mode 100644
index 000000000..d071adf17
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/netcnt_prog.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <linux/version.h>
+
+#include <bpf/bpf_helpers.h>
+#include "netcnt_common.h"
+
+#define MAX_BPS (3 * 1024 * 1024)
+
+#define REFRESH_TIME_NS 100000000
+#define NS_PER_SEC 1000000000
+/*
+ * Per-CPU cgroup storage holding a struct percpu_net_cnt (declared in
+ * netcnt_common.h, which is not part of this excerpt).
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct percpu_net_cnt);
+} percpu_netcnt SEC(".maps");
+/*
+ * Non-per-CPU cgroup storage holding a struct net_cnt; bpf_nextcnt()
+ * updates its packets and bytes fields with atomic adds.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct net_cnt);
+} netcnt SEC(".maps");
+
+/*
+ * cgroup/skb program: account every packet and decide whether it fits the
+ * MAX_BPS byte budget.
+ *
+ * Each packet is first added to the per-CPU counters. Once the per-CPU
+ * packet count exceeds MAX_PERCPU_PACKETS, the per-CPU packet and byte
+ * counts are atomically folded into the shared counters and reset to 0.
+ *
+ * Returns 1 when the bytes seen since the last refresh are below the
+ * budget derived from the elapsed time, 0 otherwise.
+ */
+SEC("cgroup/skb")
+int bpf_nextcnt(struct __sk_buff *skb)
+{
+ struct percpu_net_cnt *percpu_cnt;
+ struct net_cnt *cnt;
+ __u64 ts, dt;
+ int ret;
+
+ cnt = bpf_get_local_storage(&netcnt, 0);
+ percpu_cnt = bpf_get_local_storage(&percpu_netcnt, 0);
+
+ percpu_cnt->packets++;
+ percpu_cnt->bytes += skb->len;
+
+ /* Flush the per-CPU counts into the shared counters in batches. */
+ if (percpu_cnt->packets > MAX_PERCPU_PACKETS) {
+ __sync_fetch_and_add(&cnt->packets,
+ percpu_cnt->packets);
+ percpu_cnt->packets = 0;
+
+ __sync_fetch_and_add(&cnt->bytes,
+ percpu_cnt->bytes);
+ percpu_cnt->bytes = 0;
+ }
+
+ ts = bpf_ktime_get_ns();
+ dt = ts - percpu_cnt->prev_ts;
+
+ /* Convert the elapsed nanoseconds into a byte budget at MAX_BPS. */
+ dt *= MAX_BPS;
+ dt /= NS_PER_SEC;
+
+ if (cnt->bytes + percpu_cnt->bytes - percpu_cnt->prev_bytes < dt)
+ ret = 1;
+ else
+ ret = 0;
+
+ /*
+ * NOTE(review): dt holds a byte budget at this point, not nanoseconds,
+ * yet it is compared with REFRESH_TIME_NS. Left unchanged here; confirm
+ * the intended refresh condition.
+ */
+ if (dt > REFRESH_TIME_NS) {
+ percpu_cnt->prev_ts = ts;
+ percpu_cnt->prev_packets = cnt->packets;
+ percpu_cnt->prev_bytes = cnt->bytes;
+ }
+
+ return !!ret;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/testing/selftests/bpf/progs/netif_receive_skb.c b/tools/testing/selftests/bpf/progs/netif_receive_skb.c
new file mode 100644
index 000000000..1d8918dfb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/netif_receive_skb.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020, Oracle and/or its affiliates. */
+
+#include "btf_ptr.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#include <errno.h>
+/*
+ * Globals written by the program below:
+ * ret - last bpf_snprintf_btf() result, or a negative errno
+ * recorded by a failing step;
+ * num_subtests - incremented for every subtest encountered;
+ * ran_subtests - incremented for every subtest actually executed;
+ * skip - set to true when __builtin_btf_type_id is unavailable.
+ */
+long ret = 0;
+int num_subtests = 0;
+int ran_subtests = 0;
+bool skip = false;
+/*
+ * STRSIZE is the size of the output buffer (the strdata value and the
+ * length passed to bpf_snprintf_btf()); EXPECTED_STRSIZE is the size of the
+ * expected-string buffers and the maximum number of bytes compared.
+ */
+#define STRSIZE 2048
+#define EXPECTED_STRSIZE 256
+
+#if defined(bpf_target_s390)
+/* NULL points to a readable struct lowcore on s390, so take the last page */
+#define BADPTR ((void *)0xFFFFFFFFFFFFF000ULL)
+#else
+#define BADPTR 0
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+/*
+ * Single-entry per-CPU array whose value is a char[STRSIZE] buffer; the
+ * program looks up key 0 and uses the value as its output string.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, char[STRSIZE]);
+} strdata SEC(".maps");
+/*
+ * Compare at most len bytes of m1 and m2, treating them as unsigned chars.
+ * The scan stops at the first position where the bytes differ or where
+ * either input holds a NUL byte. Returns the difference of the two bytes at
+ * the stopping position, or 0 when all len bytes matched (or len is 0).
+ */
+static int __strncmp(const void *m1, const void *m2, size_t len)
+{
+ const unsigned char *s1 = m1;
+ const unsigned char *s2 = m2;
+ int i, delta = 0;
+
+ for (i = 0; i < len; i++) {
+ delta = s1[i] - s2[i];
+ if (delta || s1[i] == 0 || s2[i] == 0)
+ break;
+ }
+ return delta;
+}
+/*
+ * TEST_BTF(_str, _type, _flags, _expected, ...): one bpf_snprintf_btf()
+ * subtest. Builds a static _type object initialised from the variadic
+ * arguments, formats it into _str with _flags | BTF_F_COMPACT, and compares
+ * the result with _expected over at most EXPECTED_STRSIZE bytes.
+ *
+ * num_subtests is always incremented; the rest is skipped when an earlier
+ * step left ret negative. ret is set to -EINVAL when no kernel type id is
+ * found and to -EBADMSG on a string mismatch (both strings are then logged
+ * with bpf_printk).
+ *
+ * NOTE(review): "if (ret) break;" leaves the macro on any non-zero return,
+ * so if bpf_snprintf_btf() reports a positive length on success the
+ * comparison is never reached - confirm against the helper documentation.
+ *
+ * Only defined when the compiler provides __builtin_btf_type_id.
+ */
+#if __has_builtin(__builtin_btf_type_id)
+#define TEST_BTF(_str, _type, _flags, _expected, ...) \
+ do { \
+ static const char _expectedval[EXPECTED_STRSIZE] = \
+ _expected; \
+ static const char _ptrtype[64] = #_type; \
+ __u64 _hflags = _flags | BTF_F_COMPACT; \
+ static _type _ptrdata = __VA_ARGS__; \
+ static struct btf_ptr _ptr = { }; \
+ int _cmp; \
+ \
+ ++num_subtests; \
+ if (ret < 0) \
+ break; \
+ ++ran_subtests; \
+ _ptr.ptr = &_ptrdata; \
+ _ptr.type_id = bpf_core_type_id_kernel(_type); \
+ if (_ptr.type_id <= 0) { \
+ ret = -EINVAL; \
+ break; \
+ } \
+ ret = bpf_snprintf_btf(_str, STRSIZE, \
+ &_ptr, sizeof(_ptr), _hflags); \
+ if (ret) \
+ break; \
+ _cmp = __strncmp(_str, _expectedval, EXPECTED_STRSIZE); \
+ if (_cmp != 0) { \
+ bpf_printk("(%d) got %s", _cmp, _str); \
+ bpf_printk("(%d) expected %s", _cmp, \
+ _expectedval); \
+ ret = -EBADMSG; \
+ break; \
+ } \
+ } while (0)
+#endif
+
+/* Use where expected data string matches its stringified declaration */
+#define TEST_BTF_C(_str, _type, _flags, ...) \
+ TEST_BTF(_str, _type, _flags, "(" #_type ")" #__VA_ARGS__, \
+ __VA_ARGS__)
+
+/* TRACE_EVENT(netif_receive_skb,
+ * TP_PROTO(struct sk_buff *skb),
+ */
+/*
+ * tp_btf program attached to netif_receive_skb that exercises
+ * bpf_snprintf_btf().
+ *
+ * When __builtin_btf_type_id is available it:
+ * 1. looks up the strdata buffer (returning 0 if that fails);
+ * 2. formats the incoming skb once per entry of flags[], passing that
+ * entry as the helper's flags argument, and logs negative results;
+ * 3. checks that formatting BADPTR yields an error, recording -ERANGE in
+ * ret otherwise;
+ * 4. runs the TEST_BTF()/TEST_BTF_C() subtests for a range of types.
+ * Otherwise it only sets skip to true. Results are reported through the
+ * ret, num_subtests, ran_subtests and skip globals; the function itself
+ * always returns 0.
+ */
+SEC("tp_btf/netif_receive_skb")
+int BPF_PROG(trace_netif_receive_skb, struct sk_buff *skb)
+{
+ static __u64 flags[] = { 0, BTF_F_COMPACT, BTF_F_ZERO, BTF_F_PTR_RAW,
+ BTF_F_NONAME, BTF_F_COMPACT | BTF_F_ZERO |
+ BTF_F_PTR_RAW | BTF_F_NONAME };
+ static struct btf_ptr p = { };
+ __u32 key = 0;
+ int i, __ret;
+ char *str;
+
+#if __has_builtin(__builtin_btf_type_id)
+ str = bpf_map_lookup_elem(&strdata, &key);
+ if (!str)
+ return 0;
+
+ /* Ensure we can write skb string representation */
+ p.type_id = bpf_core_type_id_kernel(struct sk_buff);
+ p.ptr = skb;
+ for (i = 0; i < ARRAY_SIZE(flags); i++) {
+ ++num_subtests;
+ /* Use the i-th flag combination rather than a constant 0. */
+ ret = bpf_snprintf_btf(str, STRSIZE, &p, sizeof(p), flags[i]);
+ if (ret < 0)
+ bpf_printk("returned %d when writing skb", ret);
+ ++ran_subtests;
+ }
+
+ /* Check invalid ptr value */
+ p.ptr = BADPTR;
+ __ret = bpf_snprintf_btf(str, STRSIZE, &p, sizeof(p), 0);
+ if (__ret >= 0) {
+ bpf_printk("printing %llx should generate error, got (%d)",
+ (unsigned long long)BADPTR, __ret);
+ ret = -ERANGE;
+ }
+
+ /* Verify type display for various types. */
+
+ /* simple int */
+ TEST_BTF_C(str, int, 0, 1234);
+ TEST_BTF(str, int, BTF_F_NONAME, "1234", 1234);
+ /* zero value should be printed at toplevel */
+ TEST_BTF(str, int, 0, "(int)0", 0);
+ TEST_BTF(str, int, BTF_F_NONAME, "0", 0);
+ TEST_BTF(str, int, BTF_F_ZERO, "(int)0", 0);
+ TEST_BTF(str, int, BTF_F_NONAME | BTF_F_ZERO, "0", 0);
+ TEST_BTF_C(str, int, 0, -4567);
+ TEST_BTF(str, int, BTF_F_NONAME, "-4567", -4567);
+
+ /* simple char */
+ TEST_BTF_C(str, char, 0, 100);
+ TEST_BTF(str, char, BTF_F_NONAME, "100", 100);
+ /* zero value should be printed at toplevel */
+ TEST_BTF(str, char, 0, "(char)0", 0);
+ TEST_BTF(str, char, BTF_F_NONAME, "0", 0);
+ TEST_BTF(str, char, BTF_F_ZERO, "(char)0", 0);
+ TEST_BTF(str, char, BTF_F_NONAME | BTF_F_ZERO, "0", 0);
+
+ /* simple typedef */
+ TEST_BTF_C(str, uint64_t, 0, 100);
+ TEST_BTF(str, u64, BTF_F_NONAME, "1", 1);
+ /* zero value should be printed at toplevel */
+ TEST_BTF(str, u64, 0, "(u64)0", 0);
+ TEST_BTF(str, u64, BTF_F_NONAME, "0", 0);
+ TEST_BTF(str, u64, BTF_F_ZERO, "(u64)0", 0);
+ TEST_BTF(str, u64, BTF_F_NONAME|BTF_F_ZERO, "0", 0);
+
+ /* typedef struct */
+ TEST_BTF_C(str, atomic_t, 0, {.counter = (int)1,});
+ TEST_BTF(str, atomic_t, BTF_F_NONAME, "{1,}", {.counter = 1,});
+ /* typedef with 0 value should be printed at toplevel */
+ TEST_BTF(str, atomic_t, 0, "(atomic_t){}", {.counter = 0,});
+ TEST_BTF(str, atomic_t, BTF_F_NONAME, "{}", {.counter = 0,});
+ TEST_BTF(str, atomic_t, BTF_F_ZERO, "(atomic_t){.counter = (int)0,}",
+ {.counter = 0,});
+ TEST_BTF(str, atomic_t, BTF_F_NONAME|BTF_F_ZERO,
+ "{0,}", {.counter = 0,});
+
+ /* enum where enum value does (and does not) exist */
+ TEST_BTF_C(str, enum bpf_cmd, 0, BPF_MAP_CREATE);
+ TEST_BTF(str, enum bpf_cmd, 0, "(enum bpf_cmd)BPF_MAP_CREATE", 0);
+ TEST_BTF(str, enum bpf_cmd, BTF_F_NONAME, "BPF_MAP_CREATE",
+ BPF_MAP_CREATE);
+ TEST_BTF(str, enum bpf_cmd, BTF_F_NONAME|BTF_F_ZERO,
+ "BPF_MAP_CREATE", 0);
+
+ TEST_BTF(str, enum bpf_cmd, BTF_F_ZERO, "(enum bpf_cmd)BPF_MAP_CREATE",
+ BPF_MAP_CREATE);
+ TEST_BTF(str, enum bpf_cmd, BTF_F_NONAME|BTF_F_ZERO,
+ "BPF_MAP_CREATE", BPF_MAP_CREATE);
+ TEST_BTF_C(str, enum bpf_cmd, 0, 2000);
+ TEST_BTF(str, enum bpf_cmd, BTF_F_NONAME, "2000", 2000);
+
+ /* simple struct */
+ TEST_BTF_C(str, struct btf_enum, 0,
+ {.name_off = (__u32)3,.val = (__s32)-1,});
+ TEST_BTF(str, struct btf_enum, BTF_F_NONAME, "{3,-1,}",
+ { .name_off = 3, .val = -1,});
+ TEST_BTF(str, struct btf_enum, BTF_F_NONAME, "{-1,}",
+ { .name_off = 0, .val = -1,});
+ TEST_BTF(str, struct btf_enum, BTF_F_NONAME|BTF_F_ZERO, "{0,-1,}",
+ { .name_off = 0, .val = -1,});
+ /* empty struct should be printed */
+ TEST_BTF(str, struct btf_enum, 0, "(struct btf_enum){}",
+ { .name_off = 0, .val = 0,});
+ TEST_BTF(str, struct btf_enum, BTF_F_NONAME, "{}",
+ { .name_off = 0, .val = 0,});
+ TEST_BTF(str, struct btf_enum, BTF_F_ZERO,
+ "(struct btf_enum){.name_off = (__u32)0,.val = (__s32)0,}",
+ { .name_off = 0, .val = 0,});
+
+ /* struct with pointers */
+ TEST_BTF(str, struct list_head, BTF_F_PTR_RAW,
+ "(struct list_head){.next = (struct list_head *)0x0000000000000001,}",
+ { .next = (struct list_head *)1 });
+ /* NULL pointer should not be displayed */
+ TEST_BTF(str, struct list_head, BTF_F_PTR_RAW,
+ "(struct list_head){}",
+ { .next = (struct list_head *)0 });
+
+ /* struct with char array */
+ TEST_BTF(str, struct bpf_prog_info, 0,
+ "(struct bpf_prog_info){.name = (char[])['f','o','o',],}",
+ { .name = "foo",});
+ TEST_BTF(str, struct bpf_prog_info, BTF_F_NONAME,
+ "{['f','o','o',],}",
+ {.name = "foo",});
+ /* leading null char means do not display string */
+ TEST_BTF(str, struct bpf_prog_info, 0,
+ "(struct bpf_prog_info){}",
+ {.name = {'\0', 'f', 'o', 'o'}});
+ /* handle non-printable characters */
+ TEST_BTF(str, struct bpf_prog_info, 0,
+ "(struct bpf_prog_info){.name = (char[])[1,2,3,],}",
+ { .name = {1, 2, 3, 0}});
+
+ /* struct with non-char array */
+ TEST_BTF(str, struct __sk_buff, 0,
+ "(struct __sk_buff){.cb = (__u32[])[1,2,3,4,5,],}",
+ { .cb = {1, 2, 3, 4, 5,},});
+ TEST_BTF(str, struct __sk_buff, BTF_F_NONAME,
+ "{[1,2,3,4,5,],}",
+ { .cb = { 1, 2, 3, 4, 5},});
+ /* For non-char, arrays, show non-zero values only */
+ TEST_BTF(str, struct __sk_buff, 0,
+ "(struct __sk_buff){.cb = (__u32[])[1,],}",
+ { .cb = { 0, 0, 1, 0, 0},});
+
+ /* struct with bitfields */
+ TEST_BTF_C(str, struct bpf_insn, 0,
+ {.code = (__u8)1,.dst_reg = (__u8)0x2,.src_reg = (__u8)0x3,.off = (__s16)4,.imm = (__s32)5,});
+ TEST_BTF(str, struct bpf_insn, BTF_F_NONAME, "{1,0x2,0x3,4,5,}",
+ {.code = 1, .dst_reg = 0x2, .src_reg = 0x3, .off = 4,
+ .imm = 5,});
+#else
+ skip = true;
+#endif
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c
new file mode 100644
index 000000000..25467d13c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+/* Fallback depth used when PERF_MAX_STACK_DEPTH is not already defined. */
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+/*
+ * stack_trace_t is an array of PERF_MAX_STACK_DEPTH __u64 entries; stackmap
+ * is the stack-trace map passed to bpf_get_stackid(), with values of that
+ * size.
+ */
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16384);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(stack_trace_t));
+} stackmap SEC(".maps");
+/*
+ * Single-entry per-CPU array providing a stack_trace_t scratch buffer for
+ * bpf_get_stack().
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, stack_trace_t);
+} stackdata_map SEC(".maps");
+/*
+ * Result flags, all starting at 1; oncpu() sets one of them to 2 when the
+ * corresponding helper call succeeds.
+ */
+long stackid_kernel = 1;
+long stackid_user = 1;
+long stack_kernel = 1;
+long stack_user = 1;
+
+/*
+ * perf_event program that tries bpf_get_stackid() and bpf_get_stack() for
+ * both the kernel stack (flags 0) and the user stack (BPF_F_USER_STACK) and
+ * sets the matching result global to 2 for each call that succeeds.
+ *
+ * Returns 0 in all cases; it returns before the bpf_get_stack() calls if
+ * the stackdata_map scratch buffer cannot be looked up.
+ */
+SEC("perf_event")
+int oncpu(void *ctx)
+{
+ stack_trace_t *trace;
+ __u32 key = 0;
+ long val;
+
+ /* bpf_get_stackid() returns the stack id on success, and 0 is a valid
+ * id; only negative values signal failure.
+ */
+ val = bpf_get_stackid(ctx, &stackmap, 0);
+ if (val >= 0)
+ stackid_kernel = 2;
+ val = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+ if (val >= 0)
+ stackid_user = 2;
+
+ trace = bpf_map_lookup_elem(&stackdata_map, &key);
+ if (!trace)
+ return 0;
+
+ /* bpf_get_stack() returns the number of bytes copied, so require > 0. */
+ val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), 0);
+ if (val > 0)
+ stack_kernel = 2;
+
+ val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), BPF_F_USER_STACK);
+ if (val > 0)
+ stack_user = 2;
+
+ return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/perfbuf_bench.c b/tools/testing/selftests/bpf/progs/perfbuf_bench.c
new file mode 100644
index 000000000..e5ab4836a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/perfbuf_bench.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+/* Perf event array that bench_perfbuf() writes its samples to. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(value_size, sizeof(int));
+ __uint(key_size, sizeof(int));
+} perfbuf SEC(".maps");
+/*
+ * Number of samples bench_perfbuf() emits per invocation. Read-only from
+ * the program's side; presumably set by userspace before load - confirm.
+ */
+const volatile int batch_cnt = 0;
+/*
+ * sample_val is the payload of every sample; dropped counts failed
+ * bpf_perf_event_output() calls. dropped is aligned to 128 bytes,
+ * presumably to keep it away from neighbouring data - confirm.
+ */
+long sample_val = 42;
+long dropped __attribute__((aligned(128))) = 0;
+
+/*
+ * fentry program on __x64_sys_getpgid: on every invocation emits batch_cnt
+ * copies of sample_val to the perfbuf map on the current CPU. Each failed
+ * bpf_perf_event_output() call atomically increments dropped.
+ * Always returns 0.
+ */
+SEC("fentry/__x64_sys_getpgid")
+int bench_perfbuf(void *ctx)
+{
+ int i;
+
+ for (i = 0; i < batch_cnt; i++) {
+ if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU,
+ &sample_val, sizeof(sample_val)))
+ __sync_add_and_fetch(&dropped, 1);
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/profiler.h b/tools/testing/selftests/bpf/progs/profiler.h
new file mode 100644
index 000000000..3bac4fdd4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/profiler.h
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#pragma once
+/*
+ * Size limits used to dimension the arrays and payload buffers of the
+ * structs below. With these values MAX_FILEPATH_LENGTH is 32 * 256 = 8192.
+ */
+#define TASK_COMM_LEN 16
+#define MAX_ANCESTORS 4
+#define MAX_PATH 256
+#define KILL_TARGET_LEN 64
+#define CTL_MAXNAME 10
+#define MAX_ARGS_LEN 4096
+#define MAX_FILENAME_LEN 512
+#define MAX_ENVIRON_LEN 8192
+#define MAX_PATH_DEPTH 32
+#define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH)
+#define MAX_CGROUPS_PATH_DEPTH 8
+/*
+ * Payload capacities. Each MAX_*_PAYLOAD_LEN below is the sum of the
+ * listed component limits and sizes the payload[] array of one event type.
+ */
+#define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN
+/* 256 * 2 + 256 * 8 = 2560 with the values above. */
+#define MAX_CGROUP_PAYLOAD_LEN \
+ (MAX_PATH * 2 + (MAX_PATH * MAX_CGROUPS_PATH_DEPTH))
+
+#define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
+
+#define MAX_SYSCTL_PAYLOAD_LEN \
+ (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH)
+
+#define MAX_KILL_PAYLOAD_LEN \
+ (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \
+ KILL_TARGET_LEN)
+
+#define MAX_EXEC_PAYLOAD_LEN \
+ (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \
+ MAX_ARGS_LEN + MAX_ENVIRON_LEN)
+
+#define MAX_FILEMOD_PAYLOAD_LEN \
+ (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \
+ MAX_FILEPATH_LENGTH)
+/*
+ * Event kind, stored in var_metadata_t.type. INVALID_EVENT is 0 and
+ * MAX_DATA_TYPE_EVENT is the last enumerator.
+ */
+enum data_type {
+ INVALID_EVENT,
+ EXEC_EVENT,
+ FORK_EVENT,
+ KILL_EVENT,
+ SYSCTL_EVENT,
+ FILEMOD_EVENT,
+ MAX_DATA_TYPE_EVENT
+};
+/* Kind of file modification, stored in var_filemod_data_t.fmod_type. */
+enum filemod_type {
+ FMOD_OPEN,
+ FMOD_LINK,
+ FMOD_SYMLINK,
+};
+/*
+ * Ancestor information kept as three separate arrays of MAX_ANCESTORS
+ * entries (pid, exec id, start time). num_ancestors presumably counts the
+ * filled slots - confirm against the code that populates it.
+ */
+struct ancestors_data_t {
+ pid_t ancestor_pids[MAX_ANCESTORS];
+ uint32_t ancestor_exec_ids[MAX_ANCESTORS];
+ uint64_t ancestor_start_times[MAX_ANCESTORS];
+ uint32_t num_ancestors;
+};
+/*
+ * Common header placed first in every var_*_data_t event struct: the event
+ * type, process identity fields, the CPU id, two bpf_stats_* fields and
+ * comm_length (presumably the length of the comm string stored in the
+ * event's payload - confirm).
+ */
+struct var_metadata_t {
+ enum data_type type;
+ pid_t pid;
+ uint32_t exec_id;
+ uid_t uid;
+ gid_t gid;
+ uint64_t start_time;
+ uint32_t cpu_id;
+ uint64_t bpf_stats_num_perf_events;
+ uint64_t bpf_stats_start_ktime_ns;
+ uint8_t comm_length;
+};
+/*
+ * Cgroup description: inode and mtime for a "root" and a "proc" cgroup
+ * node, three 16-bit lengths and cgroup_full_path_root_pos. The lengths
+ * presumably describe strings stored in the owning event's payload -
+ * confirm against the code that fills them.
+ */
+struct cgroup_data_t {
+ ino_t cgroup_root_inode;
+ ino_t cgroup_proc_inode;
+ uint64_t cgroup_root_mtime;
+ uint64_t cgroup_proc_mtime;
+ uint16_t cgroup_root_length;
+ uint16_t cgroup_proc_length;
+ uint16_t cgroup_full_length;
+ int cgroup_full_path_root_pos;
+};
+/*
+ * Sysctl event: common header, cgroup and ancestor information, the
+ * lengths of the sysctl value and path, and a payload buffer of
+ * MAX_SYSCTL_PAYLOAD_LEN bytes.
+ */
+struct var_sysctl_data_t {
+ struct var_metadata_t meta;
+ struct cgroup_data_t cgroup_data;
+ struct ancestors_data_t ancestors_info;
+ uint8_t sysctl_val_length;
+ uint16_t sysctl_path_length;
+ char payload[MAX_SYSCTL_PAYLOAD_LEN];
+};
+/*
+ * Kill event: common header, cgroup and ancestor information, the target
+ * pid and signal, a kill counter with the time of the last kill, two
+ * length fields, and a payload buffer of MAX_KILL_PAYLOAD_LEN bytes.
+ * Unlike the other event structs, payload_length follows the payload.
+ */
+struct var_kill_data_t {
+ struct var_metadata_t meta;
+ struct cgroup_data_t cgroup_data;
+ struct ancestors_data_t ancestors_info;
+ pid_t kill_target_pid;
+ int kill_sig;
+ uint32_t kill_count;
+ uint64_t last_kill_time;
+ uint8_t kill_target_name_length;
+ uint8_t kill_target_cgroup_proc_length;
+ char payload[MAX_KILL_PAYLOAD_LEN];
+ size_t payload_length;
+};
+/*
+ * Exec event: common header, cgroup information, parent process identity,
+ * the lengths of the binary path, command line and environment, and a
+ * payload buffer of MAX_EXEC_PAYLOAD_LEN bytes.
+ */
+struct var_exec_data_t {
+ struct var_metadata_t meta;
+ struct cgroup_data_t cgroup_data;
+ pid_t parent_pid;
+ uint32_t parent_exec_id;
+ uid_t parent_uid;
+ uint64_t parent_start_time;
+ uint16_t bin_path_length;
+ uint16_t cmdline_length;
+ uint16_t environment_length;
+ char payload[MAX_EXEC_PAYLOAD_LEN];
+};
+/*
+ * Fork event: common header, parent process identity, and a payload buffer
+ * of MAX_METADATA_PAYLOAD_LEN bytes. Carries no cgroup information.
+ */
+struct var_fork_data_t {
+ struct var_metadata_t meta;
+ pid_t parent_pid;
+ uint32_t parent_exec_id;
+ uint64_t parent_start_time;
+ char payload[MAX_METADATA_PAYLOAD_LEN];
+};
+/*
+ * File-modification event: common header, cgroup information, the kind of
+ * modification, destination flags, device ids and inodes for a source and
+ * a destination, the lengths of both file paths, and a payload buffer of
+ * MAX_FILEMOD_PAYLOAD_LEN bytes.
+ */
+struct var_filemod_data_t {
+ struct var_metadata_t meta;
+ struct cgroup_data_t cgroup_data;
+ enum filemod_type fmod_type;
+ unsigned int dst_flags;
+ uint32_t src_device_id;
+ uint32_t dst_device_id;
+ ino_t src_inode;
+ ino_t dst_inode;
+ uint16_t src_filepath_length;
+ uint16_t dst_filepath_length;
+ char payload[MAX_FILEMOD_PAYLOAD_LEN];
+};
+/*
+ * Profiler configuration. profiler.inc.h instantiates it as bpf_config and
+ * reads most of the fields through accessor macros.
+ */
+struct profiler_config_struct {
+ bool fetch_cgroups_from_bpf;
+ ino_t cgroup_fs_inode;
+ ino_t cgroup_login_session_inode;
+ uint64_t kill_signals_mask;
+ ino_t inode_filter;
+ uint32_t stale_info_secs;
+ bool use_variable_buffers;
+ bool read_environ_from_exec;
+ bool enable_cgroup_v1_resolver;
+};
+/* Per-function statistics: elapsed time in ns and two 64-bit counters. */
+struct bpf_func_stats_data {
+ uint64_t time_elapsed_ns;
+ uint64_t num_executions;
+ uint64_t num_perf_events;
+};
+/*
+ * Measurement context: a start timestamp in ns plus a pointer to the
+ * bpf_func_stats_data record being updated.
+ */
+struct bpf_func_stats_ctx {
+ uint64_t start_time_ns;
+ struct bpf_func_stats_data* bpf_func_stats_data_val;
+};
+/*
+ * Identifiers for the instrumented profiler functions, numbered from 0.
+ * profiler_bpf_max_function_id is the last enumerator and therefore equals
+ * the number of identifiers before it.
+ */
+enum bpf_function_id {
+ profiler_bpf_proc_sys_write,
+ profiler_bpf_sched_process_exec,
+ profiler_bpf_sched_process_exit,
+ profiler_bpf_sys_enter_kill,
+ profiler_bpf_do_filp_open_ret,
+ profiler_bpf_sched_process_fork,
+ profiler_bpf_vfs_link,
+ profiler_bpf_vfs_symlink,
+ profiler_bpf_max_function_id
+};
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h
new file mode 100644
index 000000000..4896fdf81
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/profiler.inc.h
@@ -0,0 +1,976 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <vmlinux.h>
+#include <bpf/bpf_core_read.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include "profiler.h"
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+/*
+ * Local definitions of open(2) flag bits, file-mode bits and S_IS*() tests.
+ * NOTE(review): presumably duplicated here because vmlinux.h carries types
+ * but no preprocessor macros — confirm.
+ */
+#define O_WRONLY 00000001
+#define O_RDWR 00000002
+#define O_DIRECTORY 00200000
+#define __O_TMPFILE 020000000
+#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
+#define MAX_ERRNO 4095
+#define S_IFMT 00170000
+#define S_IFSOCK 0140000
+#define S_IFLNK 0120000
+#define S_IFREG 0100000
+#define S_IFBLK 0060000
+#define S_IFDIR 0040000
+#define S_IFCHR 0020000
+#define S_IFIFO 0010000
+#define S_ISUID 0004000
+#define S_ISGID 0002000
+#define S_ISVTX 0001000
+#define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK)
+#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
+#define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR)
+#define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK)
+#define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO)
+#define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK)
+/*
+ * True when x, viewed as an unsigned long, falls in the top MAX_ERRNO values
+ * of the address space. The whole expansion is parenthesized so the macro
+ * can be combined safely with !, && and other operators.
+ */
+#define IS_ERR_VALUE(x) ((unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO)
+
+#define KILL_DATA_ARRAY_SIZE 8
+
+/* Fixed-size set of kill records, one slot per distinct sender. */
+struct var_kill_data_arr_t {
+ struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE];
+};
+
+/*
+ * Union of every record type, used as the value type of the scratch map so
+ * a single slot is large enough for whichever record a program builds.
+ */
+union any_profiler_data_t {
+ struct var_exec_data_t var_exec;
+ struct var_kill_data_t var_kill;
+ struct var_sysctl_data_t var_sysctl;
+ struct var_filemod_data_t var_filemod;
+ struct var_fork_data_t var_fork;
+ struct var_kill_data_arr_t var_kill_data_arr;
+};
+
+/*
+ * Global configuration instance, zero-initialized here. The macros below are
+ * shorthand accessors for its individual fields.
+ * NOTE(review): presumably overwritten from userspace before the programs
+ * run — confirm against the loader.
+ */
+volatile struct profiler_config_struct bpf_config = {};
+
+#define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf)
+#define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode)
+#define CGROUP_LOGIN_SESSION_INODE \
+ (bpf_config.cgroup_login_session_inode)
+#define KILL_SIGNALS (bpf_config.kill_signals_mask)
+#define STALE_INFO (bpf_config.stale_info_secs)
+#define INODE_FILTER (bpf_config.inode_filter)
+#define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec)
+#define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver)
+
+/*
+ * Alternative layout of the kernfs attribute structure in which the
+ * timestamps live inside an embedded struct iattr. Used as the fallback when
+ * the direct ia_mtime field does not exist.
+ * NOTE(review): the ___52 suffix presumably refers to kernel 5.2 — confirm.
+ */
+struct kernfs_iattrs___52 {
+ struct iattr ia_iattr;
+};
+
+/*
+ * Alternative layout of struct kernfs_node in which id is a union exposing
+ * a 32-bit ino/generation pair as well as a 64-bit id.
+ * NOTE(review): the ___52 suffix presumably refers to kernel 5.2 — confirm.
+ */
+struct kernfs_node___52 {
+ union /* kernfs_node_id */ {
+ struct {
+ u32 ino;
+ u32 generation;
+ };
+ u64 id;
+ } id;
+};
+
+/* Single per-CPU slot used as scratch space for building event records. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, union any_profiler_data_t);
+} data_heap SEC(".maps");
+
+/* Perf event array through which finished records are emitted. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} events SEC(".maps");
+
+/* Pending kill records, keyed by the pid that was targeted. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, KILL_DATA_ARRAY_SIZE);
+ __type(key, u32);
+ __type(value, struct var_kill_data_arr_t);
+} var_tpid_to_data SEC(".maps");
+
+/* Per-CPU execution statistics, indexed by enum bpf_function_id. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, profiler_bpf_max_function_id);
+ __type(key, u32);
+ __type(value, struct bpf_func_stats_data);
+} bpf_func_stats SEC(".maps");
+
+/* Device ids for which file modifications are reported (presence = allowed). */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, bool);
+ __uint(max_entries, 16);
+} allowed_devices SEC(".maps");
+
+/* Inode numbers of individual files whose modification is reported. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u64);
+ __type(value, bool);
+ __uint(max_entries, 1024);
+} allowed_file_inodes SEC(".maps");
+
+/* Inode numbers of directories; files beneath them are reported. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u64);
+ __type(value, bool);
+ __uint(max_entries, 1024);
+} allowed_directory_inodes SEC(".maps");
+
+/*
+ * Executable inodes for which exec events are suppressed.
+ * NOTE(review): the key type is u32 but the exec program looks it up with the
+ * address of a u64 inode number — confirm the intended key width.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, u32);
+ __type(value, bool);
+ __uint(max_entries, 16);
+} disallowed_exec_inodes SEC(".maps");
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
+#endif
+
+static INLINE bool IS_ERR(const void* ptr)
+{
+ return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+/* Return the upper 32 bits of the value reported by bpf_get_current_pid_tgid(). */
+static INLINE u32 get_userspace_pid()
+{
+	u64 pid_tgid = bpf_get_current_pid_tgid();
+
+	return pid_tgid >> 32;
+}
+
+/* True for tgid 0 and tgid 1, the two ids at which ancestor walks stop. */
+static INLINE bool is_init_process(u32 tgid)
+{
+	/* tgid is unsigned, so "0 or 1" is the same as "at most 1". */
+	return tgid <= 1;
+}
+
+/*
+ * Copy min(len, max) bytes from src to dst with bpf_probe_read(). Returns
+ * the number of bytes copied, or 0 when that length is 0 or the read fails.
+ */
+static INLINE unsigned long
+probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max)
+{
+ len = len < max ? len : max;
+ /*
+ * NOTE(review): both branches perform the same read; splitting "> 1" from
+ * "== 1" looks like a way to keep the size provably non-zero for the
+ * verifier on each path — confirm before simplifying.
+ */
+ if (len > 1) {
+ if (bpf_probe_read(dst, len, src))
+ return 0;
+ } else if (len == 1) {
+ if (bpf_probe_read(dst, 1, src))
+ return 0;
+ }
+ return len;
+}
+
+/*
+ * Find the slot of arr_struct whose record was produced by sender pid spid.
+ * Returns the slot index, or -1 when no slot matches.
+ */
+static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct,
+				     int spid)
+{
+	int slot;
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+	for (slot = 0; slot < ARRAY_SIZE(arr_struct->array); slot++) {
+		if (arr_struct->array[slot].meta.pid == spid)
+			return slot;
+	}
+	return -1;
+}
+
+/*
+ * Walk up the real_parent chain of task and record up to MAX_ANCESTORS
+ * ancestors (tgid, exec id, start time) in ancestors_data. The walk stops at
+ * a NULL parent or at a process with tgid 0 or 1, which is not recorded.
+ * On return num_ancestors holds the number of entries that were written.
+ */
+static INLINE void populate_ancestors(struct task_struct* task,
+				      struct ancestors_data_t* ancestors_data)
+{
+	struct task_struct* parent = task;
+	u32 num_ancestors, ppid;
+
+	ancestors_data->num_ancestors = 0;
+#ifdef UNROLL
+#pragma unroll
+#endif
+	for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) {
+		parent = BPF_CORE_READ(parent, real_parent);
+		if (parent == NULL)
+			break;
+		ppid = BPF_CORE_READ(parent, tgid);
+		if (is_init_process(ppid))
+			break;
+		ancestors_data->ancestor_pids[num_ancestors] = ppid;
+		ancestors_data->ancestor_exec_ids[num_ancestors] =
+			BPF_CORE_READ(parent, self_exec_id);
+		ancestors_data->ancestor_start_times[num_ancestors] =
+			BPF_CORE_READ(parent, start_time);
+		/*
+		 * Slot num_ancestors has just been filled, so the count is one
+		 * more than the index; storing the bare index would make "one
+		 * ancestor" indistinguishable from "none".
+		 */
+		ancestors_data->num_ancestors = num_ancestors + 1;
+	}
+}
+
+/*
+ * Append the names of cgroup_node and its ancestors (leaf first, at most
+ * MAX_CGROUPS_PATH_DEPTH of them) to payload as consecutive NUL-terminated
+ * strings.
+ *
+ * When the walk reaches cgroup_root_node, *root_pos is set to the byte offset
+ * (relative to the initial payload) at which that node's name starts.
+ *
+ * Returns the position just past the last string written.
+ */
+static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
+					  struct kernfs_node* cgroup_root_node,
+					  void* payload,
+					  int* root_pos)
+{
+	void* payload_start = payload;
+	size_t filepart_length;
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+	for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
+		/*
+		 * Stop before touching a NULL node: there is nothing to read,
+		 * and a failed string read must not write into the payload.
+		 */
+		if (!cgroup_node)
+			return payload;
+		filepart_length =
+			bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name));
+		if (cgroup_node == cgroup_root_node)
+			*root_pos = payload - payload_start;
+		/* A failed read yields a huge unsigned value and is skipped here. */
+		if (filepart_length <= MAX_PATH) {
+			barrier_var(filepart_length);
+			payload += filepart_length;
+		}
+		cgroup_node = BPF_CORE_READ(cgroup_node, parent);
+	}
+	return payload;
+}
+
+/*
+ * Return the inode number stored in a kernfs node, coping with two layouts:
+ * if the running kernel's node has an id.ino member it is read through the
+ * ___52 view, otherwise the id field itself is read and widened to u64.
+ */
+static ino_t get_inode_from_kernfs(struct kernfs_node* node)
+{
+ struct kernfs_node___52* node52 = (void*)node;
+
+ if (bpf_core_field_exists(node52->id.ino)) {
+ barrier_var(node52);
+ return BPF_CORE_READ(node52, id.ino);
+ } else {
+ barrier_var(node);
+ return (u64)BPF_CORE_READ(node, id);
+ }
+}
+
+/* Weak kconfig extern: tells at load time whether the pids controller is built. */
+extern bool CONFIG_CGROUP_PIDS __kconfig __weak;
+/*
+ * Local stand-in for the kernel's cgroup subsystem id enum, used only so the
+ * real value of the pids id can be looked up with bpf_core_enum_value().
+ */
+enum cgroup_subsys_id___local {
+ pids_cgrp_id___local = 123, /* value doesn't matter */
+};
+
+/*
+ * Fill cgroup_data for task and append the cgroup name strings to payload.
+ *
+ * Written to cgroup_data: root/proc inode numbers, root/proc mtime
+ * (nanosecond part), and the length of each string appended.
+ * Appended to payload, in order: root cgroup name, the task's cgroup name
+ * and, when FETCH_CGROUPS_FROM_BPF is set, the full path produced by
+ * read_full_cgroup_path().
+ *
+ * Returns the payload position just past the appended data.
+ */
+static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
+ struct task_struct* task,
+ void* payload)
+{
+ /* Default: nodes of the default hierarchy. */
+ struct kernfs_node* root_kernfs =
+ BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn);
+ struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
+
+#if __has_builtin(__builtin_preserve_enum_value)
+ /* Optionally override both nodes with those of the pids subsystem. */
+ if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) {
+ int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local,
+ pids_cgrp_id___local);
+#ifdef UNROLL
+#pragma unroll
+#endif
+ for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys_state* subsys =
+ BPF_CORE_READ(task, cgroups, subsys[i]);
+ if (subsys != NULL) {
+ int subsys_id = BPF_CORE_READ(subsys, ss, id);
+ if (subsys_id == cgrp_id) {
+ proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn);
+ root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn);
+ break;
+ }
+ }
+ }
+ }
+#endif
+
+ cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs);
+ cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs);
+
+ /* The mtime lives in one of two places depending on the kernel layout. */
+ if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) {
+ cgroup_data->cgroup_root_mtime =
+ BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec);
+ cgroup_data->cgroup_proc_mtime =
+ BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec);
+ } else {
+ struct kernfs_iattrs___52* root_iattr =
+ (struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr);
+ cgroup_data->cgroup_root_mtime =
+ BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec);
+
+ struct kernfs_iattrs___52* proc_iattr =
+ (struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr);
+ cgroup_data->cgroup_proc_mtime =
+ BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec);
+ }
+
+ /* Lengths stay 0 for any string whose read fails below. */
+ cgroup_data->cgroup_root_length = 0;
+ cgroup_data->cgroup_proc_length = 0;
+ cgroup_data->cgroup_full_length = 0;
+
+ size_t cgroup_root_length =
+ bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name));
+ barrier_var(cgroup_root_length);
+ /* A negative error return becomes a huge size_t and fails this test. */
+ if (cgroup_root_length <= MAX_PATH) {
+ barrier_var(cgroup_root_length);
+ cgroup_data->cgroup_root_length = cgroup_root_length;
+ payload += cgroup_root_length;
+ }
+
+ size_t cgroup_proc_length =
+ bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name));
+ barrier_var(cgroup_proc_length);
+ if (cgroup_proc_length <= MAX_PATH) {
+ barrier_var(cgroup_proc_length);
+ cgroup_data->cgroup_proc_length = cgroup_proc_length;
+ payload += cgroup_proc_length;
+ }
+
+ if (FETCH_CGROUPS_FROM_BPF) {
+ /* -1 means the root node was not met while walking the path. */
+ cgroup_data->cgroup_full_path_root_pos = -1;
+ void* payload_end_pos = read_full_cgroup_path(proc_kernfs, root_kernfs, payload,
+ &cgroup_data->cgroup_full_path_root_pos);
+ cgroup_data->cgroup_full_length = payload_end_pos - payload;
+ payload = payload_end_pos;
+ }
+
+ return (void*)payload;
+}
+
+/*
+ * Fill the common metadata header and append the task's comm to payload.
+ *
+ * uid and gid come from the current context (bpf_get_current_uid_gid()),
+ * while exec_id, start_time and comm are read from the task argument; pid is
+ * whatever the caller passes. Returns the payload position past the comm
+ * string, or payload unchanged if the comm could not be read.
+ */
+static INLINE void* populate_var_metadata(struct var_metadata_t* metadata,
+ struct task_struct* task,
+ u32 pid, void* payload)
+{
+ u64 uid_gid = bpf_get_current_uid_gid();
+
+ metadata->uid = (u32)uid_gid;
+ metadata->gid = uid_gid >> 32;
+ metadata->pid = pid;
+ metadata->exec_id = BPF_CORE_READ(task, self_exec_id);
+ metadata->start_time = BPF_CORE_READ(task, start_time);
+ metadata->comm_length = 0;
+
+ size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
+ barrier_var(comm_length);
+ /* A negative error return becomes a huge size_t and fails this test. */
+ if (comm_length <= TASK_COMM_LEN) {
+ barrier_var(comm_length);
+ metadata->comm_length = comm_length;
+ payload += comm_length;
+ }
+
+ return (void*)payload;
+}
+
+/*
+ * Build a fresh kill record in the per-CPU scratch slot for a signal sig sent
+ * by spid to tpid, with kill_count set to 1 and last_kill_time set to now.
+ * Returns the record, or NULL if the scratch slot lookup fails. ctx is not
+ * used by this function.
+ */
+static INLINE struct var_kill_data_t*
+get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig)
+{
+ int zero = 0;
+ struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
+
+ if (kill_data == NULL)
+ return NULL;
+ struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+ void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload);
+ payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload);
+ /* Remember how much payload is used so later data can be appended. */
+ size_t payload_length = payload - (void*)kill_data->payload;
+ kill_data->payload_length = payload_length;
+ populate_ancestors(task, &kill_data->ancestors_info);
+ kill_data->meta.type = KILL_EVENT;
+ kill_data->kill_target_pid = tpid;
+ kill_data->kill_sig = sig;
+ kill_data->kill_count = 1;
+ kill_data->last_kill_time = bpf_ktime_get_ns();
+ return kill_data;
+}
+
+/*
+ * Record that the current process sent signal sig to tpid.
+ *
+ * Signals not selected in KILL_SIGNALS (or outside the 0..63 range the mask
+ * can express) are ignored. Otherwise the record set stored under tpid in
+ * var_tpid_to_data is created or updated:
+ *  - no set yet: a new record is built and stored in slot 0;
+ *  - set exists, sender unknown: a new record goes into the first free slot
+ *    (meta.pid == 0); if none is free the event is dropped;
+ *  - set exists, sender known: the record's counter and timestamp are
+ *    bumped if it is younger than STALE_INFO seconds, else it is rebuilt.
+ *
+ * Always returns 0.
+ */
+static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
+{
+	/*
+	 * The mask has 64 bits; shifting by a negative or larger amount is
+	 * undefined in C, so such signal numbers are rejected up front.
+	 */
+	if (sig < 0 || sig >= 64)
+		return 0;
+	if ((KILL_SIGNALS & (1ULL << sig)) == 0)
+		return 0;
+
+	u32 spid = get_userspace_pid();
+	struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
+
+	if (arr_struct == NULL) {
+		struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig);
+		int zero = 0;
+
+		if (kill_data == NULL)
+			return 0;
+		/* The scratch slot doubles as the array that gets stored below. */
+		arr_struct = bpf_map_lookup_elem(&data_heap, &zero);
+		if (arr_struct == NULL)
+			return 0;
+		bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data);
+	} else {
+		int index = get_var_spid_index(arr_struct, spid);
+
+		if (index == -1) {
+			struct var_kill_data_t* kill_data =
+				get_var_kill_data(ctx, spid, tpid, sig);
+			if (kill_data == NULL)
+				return 0;
+#ifdef UNROLL
+#pragma unroll
+#endif
+			for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
+				if (arr_struct->array[i].meta.pid == 0) {
+					bpf_probe_read(&arr_struct->array[i],
+						       sizeof(arr_struct->array[i]), kill_data);
+					bpf_map_update_elem(&var_tpid_to_data, &tpid,
+							    arr_struct, 0);
+
+					return 0;
+				}
+			/* Every slot is taken: drop this event. */
+			return 0;
+		}
+
+		struct var_kill_data_t* kill_data = &arr_struct->array[index];
+
+		u64 delta_sec =
+			(bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000;
+
+		if (delta_sec < STALE_INFO) {
+			kill_data->kill_count++;
+			kill_data->last_kill_time = bpf_ktime_get_ns();
+			bpf_probe_read(&arr_struct->array[index],
+				       sizeof(arr_struct->array[index]),
+				       kill_data);
+		} else {
+			struct var_kill_data_t* kill_data =
+				get_var_kill_data(ctx, spid, tpid, sig);
+			if (kill_data == NULL)
+				return 0;
+			bpf_probe_read(&arr_struct->array[index],
+				       sizeof(arr_struct->array[index]),
+				       kill_data);
+		}
+	}
+	bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0);
+	return 0;
+}
+
+/*
+ * Start accounting for one invocation of program func_id: stamp the entry
+ * time, look up the program's counters and count the execution.
+ */
+static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx,
+				   enum bpf_function_id func_id)
+{
+	struct bpf_func_stats_data* counters;
+	int key = func_id;
+
+	bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns();
+	counters = bpf_map_lookup_elem(&bpf_func_stats, &key);
+	/* Keep the (possibly NULL) pointer for the matching exit call. */
+	bpf_stat_ctx->bpf_func_stats_data_val = counters;
+	if (counters)
+		counters->num_executions++;
+}
+
+/* Finish accounting: add the time since bpf_stats_enter() to the counters. */
+static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx)
+{
+	struct bpf_func_stats_data* counters = bpf_stat_ctx->bpf_func_stats_data_val;
+
+	/* Nothing to update if the counters could not be looked up on entry. */
+	if (!counters)
+		return;
+	counters->time_elapsed_ns += bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns;
+}
+
+/*
+ * Stamp an outgoing record with accounting data just before submission:
+ * the program's running event count (when counters are available), the
+ * invocation's start time and the id of the current CPU.
+ */
+static INLINE void
+bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx,
+				    struct var_metadata_t* meta)
+{
+	struct bpf_func_stats_data* counters = bpf_stat_ctx->bpf_func_stats_data_val;
+
+	if (counters) {
+		counters->num_perf_events += 1;
+		meta->bpf_stats_num_perf_events = counters->num_perf_events;
+	}
+	meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns;
+	meta->cpu_id = bpf_get_smp_processor_id();
+}
+
+/*
+ * Append the name of filp_dentry and of each of its ancestors (leaf first,
+ * at most MAX_PATH_DEPTH components) to payload as consecutive
+ * NUL-terminated strings. The walk ends when a dentry is its own parent or
+ * when a name cannot be read. Returns the total number of bytes appended.
+ */
+static INLINE size_t
+read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
+{
+ size_t length = 0;
+ size_t filepart_length;
+ struct dentry* parent_dentry;
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+ for (int i = 0; i < MAX_PATH_DEPTH; i++) {
+ filepart_length = bpf_probe_read_str(payload, MAX_PATH,
+ BPF_CORE_READ(filp_dentry, d_name.name));
+ barrier_var(filepart_length);
+ /* A negative error return becomes a huge size_t and ends the walk. */
+ if (filepart_length > MAX_PATH)
+ break;
+ barrier_var(filepart_length);
+ payload += filepart_length;
+ length += filepart_length;
+
+ parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
+ /* A dentry that is its own parent marks the top of the tree. */
+ if (filp_dentry == parent_dentry)
+ break;
+ filp_dentry = parent_dentry;
+ }
+
+ return length;
+}
+
+/*
+ * Return true if filp_dentry or one of its ancestors (checking at most
+ * MAX_PATH_DEPTH levels) has an inode number present in
+ * allowed_directory_inodes.
+ */
+static INLINE bool
+is_ancestor_in_allowed_inodes(struct dentry* filp_dentry)
+{
+ struct dentry* parent_dentry;
+#ifdef UNROLL
+#pragma unroll
+#endif
+ for (int i = 0; i < MAX_PATH_DEPTH; i++) {
+ u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino);
+ bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino);
+
+ /* Presence of the key is what matters; the stored value is not read. */
+ if (allowed_dir != NULL)
+ return true;
+ parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
+ if (filp_dentry == parent_dentry)
+ break;
+ filp_dentry = parent_dentry;
+ }
+ return false;
+}
+
+/*
+ * Decide whether a modification of file_dentry should be reported.
+ *
+ * The dentry's device must be in allowed_devices, and either its inode must
+ * be in allowed_file_inodes or one of its parent directories must be in
+ * allowed_directory_inodes.
+ *
+ * *device_id is always written. *file_ino is written only when the device is
+ * allowed, so callers should initialize it beforehand.
+ */
+static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry,
+ u32* device_id,
+ u64* file_ino)
+{
+ u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev);
+ *device_id = dev_id;
+ bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id);
+
+ if (allowed_device == NULL)
+ return false;
+
+ u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino);
+ *file_ino = ino;
+ bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino);
+
+ /* Not listed individually: fall back to the directory allow-list. */
+ if (allowed_file == NULL)
+ if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent)))
+ return false;
+ return true;
+}
+
+/*
+ * kprobe on proc_sys_write: emit a SYSCTL_EVENT containing process metadata,
+ * cgroup info, ancestors, the value being written (read from buf) and the
+ * name of the dentry being written to. Always returns 0.
+ */
+SEC("kprobe/proc_sys_write")
+ssize_t BPF_KPROBE(kprobe__proc_sys_write,
+ struct file* filp, const char* buf,
+ size_t count, loff_t* ppos)
+{
+ struct bpf_func_stats_ctx stats_ctx;
+ bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write);
+
+ u32 pid = get_userspace_pid();
+ int zero = 0;
+ struct var_sysctl_data_t* sysctl_data =
+ bpf_map_lookup_elem(&data_heap, &zero);
+ if (!sysctl_data)
+ goto out;
+
+ struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+ sysctl_data->meta.type = SYSCTL_EVENT;
+ void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload);
+ payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload);
+
+ populate_ancestors(task, &sysctl_data->ancestors_info);
+
+ /* Lengths stay 0 for any string whose read fails below. */
+ sysctl_data->sysctl_val_length = 0;
+ sysctl_data->sysctl_path_length = 0;
+
+ size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf);
+ barrier_var(sysctl_val_length);
+ if (sysctl_val_length <= CTL_MAXNAME) {
+ barrier_var(sysctl_val_length);
+ sysctl_data->sysctl_val_length = sysctl_val_length;
+ payload += sysctl_val_length;
+ }
+
+ size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH,
+ BPF_CORE_READ(filp, f_path.dentry, d_name.name));
+ barrier_var(sysctl_path_length);
+ if (sysctl_path_length <= MAX_PATH) {
+ barrier_var(sysctl_path_length);
+ sysctl_data->sysctl_path_length = sysctl_path_length;
+ payload += sysctl_path_length;
+ }
+
+ bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta);
+ /* Emit only the used part of the record, capped at the struct size. */
+ unsigned long data_len = payload - (void*)sysctl_data;
+ data_len = data_len > sizeof(struct var_sysctl_data_t)
+ ? sizeof(struct var_sysctl_data_t)
+ : data_len;
+ bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len);
+out:
+ bpf_stats_exit(&stats_ctx);
+ return 0;
+}
+
+/*
+ * Tracepoint on entry to the kill syscall: take the target pid and signal
+ * from the first two syscall arguments and hand them to
+ * trace_var_sys_kill(), wrapped in execution statistics.
+ */
+SEC("tracepoint/syscalls/sys_enter_kill")
+int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
+{
+	struct bpf_func_stats_ctx stats_ctx;
+
+	bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill);
+	int pid = ctx->args[0];
+	int sig = ctx->args[1];
+	int ret = trace_var_sys_kill(ctx, pid, sig);
+	bpf_stats_exit(&stats_ctx);
+	return ret;
+}
+
+/*
+ * Raw tracepoint on process exit: if kill records are pending for the
+ * exiting process, complete each one with the process's comm and cgroup
+ * name, emit it as a perf event, and then drop the pending set.
+ * Always returns 0.
+ */
+SEC("raw_tracepoint/sched_process_exit")
+int raw_tracepoint__sched_process_exit(void* ctx)
+{
+	int zero = 0;
+	struct bpf_func_stats_ctx stats_ctx;
+	bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit);
+
+	u32 tpid = get_userspace_pid();
+
+	struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
+	struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
+
+	if (arr_struct == NULL || kill_data == NULL)
+		goto out;
+
+	struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+	struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
+
+#ifdef UNROLL
+#pragma unroll
+#endif
+	for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
+		struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
+
+		if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) {
+			/* Work on a scratch copy so the target's data can be appended. */
+			bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data);
+			void* payload = kill_data->payload;
+			size_t offset = kill_data->payload_length;
+			/*
+			 * An out-of-range stored length means the record cannot be
+			 * completed. Leave the loop rather than the function so the
+			 * pending set is still deleted and the statistics are
+			 * still closed out below.
+			 */
+			if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
+				break;
+			payload += offset;
+
+			kill_data->kill_target_name_length = 0;
+			kill_data->kill_target_cgroup_proc_length = 0;
+
+			size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
+			barrier_var(comm_length);
+			if (comm_length <= TASK_COMM_LEN) {
+				barrier_var(comm_length);
+				kill_data->kill_target_name_length = comm_length;
+				payload += comm_length;
+			}
+
+			size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN,
+								       BPF_CORE_READ(proc_kernfs, name));
+			barrier_var(cgroup_proc_length);
+			if (cgroup_proc_length <= KILL_TARGET_LEN) {
+				barrier_var(cgroup_proc_length);
+				kill_data->kill_target_cgroup_proc_length = cgroup_proc_length;
+				payload += cgroup_proc_length;
+			}
+
+			bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta);
+			/* Emit only the used part of the record, capped at the struct size. */
+			unsigned long data_len = (void*)payload - (void*)kill_data;
+			data_len = data_len > sizeof(struct var_kill_data_t)
+				? sizeof(struct var_kill_data_t)
+				: data_len;
+			bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len);
+		}
+	}
+	bpf_map_delete_elem(&var_tpid_to_data, &tpid);
+out:
+	bpf_stats_exit(&stats_ctx);
+	return 0;
+}
+
+/*
+ * Raw tracepoint on exec: emit an EXEC_EVENT with process metadata, cgroup
+ * info, parent identity, the executable path, the command line and,
+ * when READ_ENVIRON_FROM_EXEC is set, the environment.
+ *
+ * Nothing is emitted when the binary's inode is in disallowed_exec_inodes
+ * or when INODE_FILTER is non-zero and differs from it. Always returns 0.
+ */
+SEC("raw_tracepoint/sched_process_exec")
+int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx)
+{
+	struct bpf_func_stats_ctx stats_ctx;
+	bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec);
+
+	struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2];
+	u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino);
+
+	bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode);
+	if (should_filter_binprm != NULL)
+		goto out;
+
+	int zero = 0;
+	struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero);
+	if (!proc_exec_data)
+		goto out;
+
+	/* Leave through "out" so the elapsed time is still accounted for. */
+	if (INODE_FILTER && inode != INODE_FILTER)
+		goto out;
+
+	u32 pid = get_userspace_pid();
+	struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+	proc_exec_data->meta.type = EXEC_EVENT;
+	proc_exec_data->bin_path_length = 0;
+	proc_exec_data->cmdline_length = 0;
+	proc_exec_data->environment_length = 0;
+	void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid,
+					      proc_exec_data->payload);
+	payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload);
+
+	struct task_struct* parent_task = BPF_CORE_READ(task, real_parent);
+	proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid);
+	proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val);
+	proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id);
+	proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time);
+
+	const char* filename = BPF_CORE_READ(bprm, filename);
+	size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename);
+	barrier_var(bin_path_length);
+	/* A negative error return becomes a huge size_t and fails this test. */
+	if (bin_path_length <= MAX_FILENAME_LEN) {
+		barrier_var(bin_path_length);
+		proc_exec_data->bin_path_length = bin_path_length;
+		payload += bin_path_length;
+	}
+
+	void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start);
+	void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end);
+	unsigned int cmdline_length = probe_read_lim(payload, arg_start,
+						     arg_end - arg_start, MAX_ARGS_LEN);
+
+	if (cmdline_length <= MAX_ARGS_LEN) {
+		barrier_var(cmdline_length);
+		proc_exec_data->cmdline_length = cmdline_length;
+		payload += cmdline_length;
+	}
+
+	if (READ_ENVIRON_FROM_EXEC) {
+		void* env_start = (void*)BPF_CORE_READ(task, mm, env_start);
+		void* env_end = (void*)BPF_CORE_READ(task, mm, env_end);
+		unsigned long env_len = probe_read_lim(payload, env_start,
+						       env_end - env_start, MAX_ENVIRON_LEN);
+		/* Bound the length that is actually added to payload. */
+		if (env_len <= MAX_ENVIRON_LEN) {
+			barrier_var(env_len);
+			proc_exec_data->environment_length = env_len;
+			payload += env_len;
+		}
+	}
+
+	bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta);
+	/* Emit only the used part of the record, capped at the struct size. */
+	unsigned long data_len = payload - (void*)proc_exec_data;
+	data_len = data_len > sizeof(struct var_exec_data_t)
+		? sizeof(struct var_exec_data_t)
+		: data_len;
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len);
+out:
+	bpf_stats_exit(&stats_ctx);
+	return 0;
+}
+
+/*
+ * kretprobe on do_filp_open: emit a FILEMOD_EVENT (FMOD_OPEN) when a file on
+ * an allowed device/inode was successfully opened for writing. Directories,
+ * character/block devices, FIFOs and sockets are ignored. Always returns 0.
+ */
+SEC("kretprobe/do_filp_open")
+int kprobe_ret__do_filp_open(struct pt_regs* ctx)
+{
+ struct bpf_func_stats_ctx stats_ctx;
+ bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret);
+
+ struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx);
+
+ if (filp == NULL || IS_ERR(filp))
+ goto out;
+ unsigned int flags = BPF_CORE_READ(filp, f_flags);
+ /* Only opens that can modify the file are of interest. */
+ if ((flags & (O_RDWR | O_WRONLY)) == 0)
+ goto out;
+ /*
+ * NOTE(review): O_TMPFILE is defined in this file as two bits
+ * (__O_TMPFILE | O_DIRECTORY), so this test also fires when only
+ * O_DIRECTORY is set — confirm that is intended.
+ */
+ if ((flags & O_TMPFILE) > 0)
+ goto out;
+ struct inode* file_inode = BPF_CORE_READ(filp, f_inode);
+ umode_t mode = BPF_CORE_READ(file_inode, i_mode);
+ if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
+ S_ISSOCK(mode))
+ goto out;
+
+ struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry);
+ u32 device_id = 0;
+ u64 file_ino = 0;
+ if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino))
+ goto out;
+
+ int zero = 0;
+ struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+ if (!filemod_data)
+ goto out;
+
+ u32 pid = get_userspace_pid();
+ struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+ /* An open has no source object, so the src_* fields are zeroed. */
+ filemod_data->meta.type = FILEMOD_EVENT;
+ filemod_data->fmod_type = FMOD_OPEN;
+ filemod_data->dst_flags = flags;
+ filemod_data->src_inode = 0;
+ filemod_data->dst_inode = file_ino;
+ filemod_data->src_device_id = 0;
+ filemod_data->dst_device_id = device_id;
+ filemod_data->src_filepath_length = 0;
+ filemod_data->dst_filepath_length = 0;
+
+ void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+ filemod_data->payload);
+ payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+ size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload);
+ barrier_var(len);
+ if (len <= MAX_FILEPATH_LENGTH) {
+ barrier_var(len);
+ payload += len;
+ filemod_data->dst_filepath_length = len;
+ }
+ bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+ /* Emit only the used part of the record, capped at the struct size. */
+ unsigned long data_len = payload - (void*)filemod_data;
+ data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+ bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+ bpf_stats_exit(&stats_ctx);
+ return 0;
+}
+
+/*
+ * kprobe on vfs_link: emit a FILEMOD_EVENT (FMOD_LINK) when either the
+ * existing dentry or the new link's dentry passes the allow-lists. The
+ * record carries device/inode ids and paths for both. Always returns 0.
+ */
+SEC("kprobe/vfs_link")
+int BPF_KPROBE(kprobe__vfs_link,
+	       struct dentry* old_dentry, struct inode* dir,
+	       struct dentry* new_dentry, struct inode** delegated_inode)
+{
+	struct bpf_func_stats_ctx stats_ctx;
+	bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link);
+
+	u32 src_device_id = 0;
+	u64 src_file_ino = 0;
+	u32 dst_device_id = 0;
+	u64 dst_file_ino = 0;
+	/*
+	 * Evaluate both sides unconditionally: each call also fills in the
+	 * device/inode ids reported below, so short-circuiting on the source
+	 * would leave the destination ids at zero.
+	 */
+	bool src_allowed =
+		is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino);
+	bool dst_allowed =
+		is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino);
+	if (!src_allowed && !dst_allowed)
+		goto out;
+
+	int zero = 0;
+	struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+	if (!filemod_data)
+		goto out;
+
+	u32 pid = get_userspace_pid();
+	struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+	filemod_data->meta.type = FILEMOD_EVENT;
+	filemod_data->fmod_type = FMOD_LINK;
+	filemod_data->dst_flags = 0;
+	filemod_data->src_inode = src_file_ino;
+	filemod_data->dst_inode = dst_file_ino;
+	filemod_data->src_device_id = src_device_id;
+	filemod_data->dst_device_id = dst_device_id;
+	filemod_data->src_filepath_length = 0;
+	filemod_data->dst_filepath_length = 0;
+
+	void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+					      filemod_data->payload);
+	payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+	/* Source path first, destination path right after it. */
+	size_t len = read_absolute_file_path_from_dentry(old_dentry, payload);
+	barrier_var(len);
+	if (len <= MAX_FILEPATH_LENGTH) {
+		barrier_var(len);
+		payload += len;
+		filemod_data->src_filepath_length = len;
+	}
+
+	len = read_absolute_file_path_from_dentry(new_dentry, payload);
+	barrier_var(len);
+	if (len <= MAX_FILEPATH_LENGTH) {
+		barrier_var(len);
+		payload += len;
+		filemod_data->dst_filepath_length = len;
+	}
+
+	bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+	/* Emit only the used part of the record, capped at the struct size. */
+	unsigned long data_len = payload - (void*)filemod_data;
+	data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+	bpf_stats_exit(&stats_ctx);
+	return 0;
+}
+
+/*
+ * kprobe on vfs_symlink: emit a FILEMOD_EVENT (FMOD_SYMLINK) when the new
+ * link's dentry passes the allow-lists. The source path is the oldname
+ * string; the destination path is rebuilt from the dentry. Always returns 0.
+ */
+SEC("kprobe/vfs_symlink")
+int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry,
+ const char* oldname)
+{
+ struct bpf_func_stats_ctx stats_ctx;
+ bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink);
+
+ u32 dst_device_id = 0;
+ u64 dst_file_ino = 0;
+ if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino))
+ goto out;
+
+ int zero = 0;
+ struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
+ if (!filemod_data)
+ goto out;
+
+ u32 pid = get_userspace_pid();
+ struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+
+ /* The source is only a path string, so its inode/device are zeroed. */
+ filemod_data->meta.type = FILEMOD_EVENT;
+ filemod_data->fmod_type = FMOD_SYMLINK;
+ filemod_data->dst_flags = 0;
+ filemod_data->src_inode = 0;
+ filemod_data->dst_inode = dst_file_ino;
+ filemod_data->src_device_id = 0;
+ filemod_data->dst_device_id = dst_device_id;
+ filemod_data->src_filepath_length = 0;
+ filemod_data->dst_filepath_length = 0;
+
+ void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
+ filemod_data->payload);
+ payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
+
+ size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname);
+ barrier_var(len);
+ /* A negative error return becomes a huge size_t and fails this test. */
+ if (len <= MAX_FILEPATH_LENGTH) {
+ barrier_var(len);
+ payload += len;
+ filemod_data->src_filepath_length = len;
+ }
+ len = read_absolute_file_path_from_dentry(dentry, payload);
+ barrier_var(len);
+ if (len <= MAX_FILEPATH_LENGTH) {
+ barrier_var(len);
+ payload += len;
+ filemod_data->dst_filepath_length = len;
+ }
+ bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
+ /* Emit only the used part of the record, capped at the struct size. */
+ unsigned long data_len = payload - (void*)filemod_data;
+ data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
+ bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
+out:
+ bpf_stats_exit(&stats_ctx);
+ return 0;
+}
+
+/*
+ * Raw tracepoint on fork: emit a FORK_EVENT describing the child (second
+ * tracepoint argument) together with the pid, exec id and start time of the
+ * parent (first argument). Always returns 0.
+ */
+SEC("raw_tracepoint/sched_process_fork")
+int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx)
+{
+ struct bpf_func_stats_ctx stats_ctx;
+ bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork);
+
+ int zero = 0;
+ struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero);
+ if (!fork_data)
+ goto out;
+
+ struct task_struct* parent = (struct task_struct*)ctx->args[0];
+ struct task_struct* child = (struct task_struct*)ctx->args[1];
+ fork_data->meta.type = FORK_EVENT;
+
+ /*
+ * exec_id, start_time and comm are read from the child; uid/gid come from
+ * the current context inside populate_var_metadata().
+ */
+ void* payload = populate_var_metadata(&fork_data->meta, child,
+ BPF_CORE_READ(child, pid), fork_data->payload);
+ fork_data->parent_pid = BPF_CORE_READ(parent, pid);
+ fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id);
+ fork_data->parent_start_time = BPF_CORE_READ(parent, start_time);
+ bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta);
+
+ /* Emit only the used part of the record, capped at the struct size. */
+ unsigned long data_len = payload - (void*)fork_data;
+ data_len = data_len > sizeof(*fork_data) ? sizeof(*fork_data) : data_len;
+ bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len);
+out:
+ bpf_stats_exit(&stats_ctx);
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/profiler1.c b/tools/testing/selftests/bpf/progs/profiler1.c
new file mode 100644
index 000000000..4df9088bf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/profiler1.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+/*
+ * Build variant 1 of the profiler: barrier_var() is a real compiler barrier,
+ * loops are unrolled, and helper functions are force-inlined.
+ */
+#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
+#define UNROLL
+#define INLINE __always_inline
+#include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler2.c b/tools/testing/selftests/bpf/progs/profiler2.c
new file mode 100644
index 000000000..0f32a3cbf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/profiler2.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+/*
+ * Build variant 2 of the profiler: barrier_var() expands to nothing, loops
+ * are not unrolled, and INLINE is empty so helpers carry no inline hint.
+ */
+#define barrier_var(var) /**/
+/* undef #define UNROLL */
+#define INLINE /**/
+#include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/profiler3.c b/tools/testing/selftests/bpf/progs/profiler3.c
new file mode 100644
index 000000000..6249fc31c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/profiler3.c
@@ -0,0 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+/*
+ * Build variant 3 of the profiler: barrier_var() expands to nothing, loops
+ * are unrolled, and helper functions are marked __noinline.
+ */
+#define barrier_var(var) /**/
+#define UNROLL
+#define INLINE __noinline
+#include "profiler.inc.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf.h b/tools/testing/selftests/bpf/progs/pyperf.h
new file mode 100644
index 000000000..2fb7adafb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf.h
@@ -0,0 +1,280 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define FUNCTION_NAME_LEN 64
+#define FILE_NAME_LEN 128
+#define TASK_COMM_LEN 16
+
+typedef struct { /* byte offsets of fields inside the target's Python objects (embedded in PidData) */
+ int PyThreadState_frame; /* added to the thread-state pointer to read the top frame pointer */
+ int PyThreadState_thread; /* added to the thread-state pointer to read the value compared with pthread_self */
+ int PyFrameObject_back; /* added to a frame pointer to read the previous-frame pointer */
+ int PyFrameObject_code; /* added to a frame pointer to read the code-object pointer */
+ int PyFrameObject_lineno; /* not read by the code in this header */
+ int PyCodeObject_filename; /* added to a code-object pointer to read the filename object pointer */
+ int PyCodeObject_name; /* added to a code-object pointer to read the name object pointer */
+ int String_data; /* added to a string-object pointer to reach its character data */
+ int String_size; /* not read by the code in this header */
+} OffsetConfig;
+
+typedef struct {
+ uintptr_t current_state_addr;
+ uintptr_t tls_key_addr;
+ OffsetConfig offsets;
+ bool use_tls;
+} PidData;
+
+typedef struct {
+ uint32_t success;
+} Stats;
+
+typedef struct {
+ char name[FUNCTION_NAME_LEN];
+ char file[FILE_NAME_LEN];
+} Symbol;
+
+typedef struct { /* per-sample record; only the prefix before 'metadata' is emitted by __on_event */
+ uint32_t pid; /* upper 32 bits of bpf_get_current_pid_tgid() */
+ uint32_t tid; /* lower 32 bits of bpf_get_current_pid_tgid() */
+ char comm[TASK_COMM_LEN]; /* filled by bpf_get_current_comm() */
+ int32_t kernel_stack_id; /* bpf_get_stackid() result with flags 0 */
+ int32_t user_stack_id; /* bpf_get_stackid() result with BPF_F_USER_STACK */
+ bool thread_current; /* selected thread state equals the one read from current_state_addr */
+ bool pthread_match; /* pthread ids matched, or forced to 1 when use_tls is false */
+ bool stack_complete; /* frame walk ended on a NULL frame pointer, or the walk was skipped */
+ int16_t stack_len; /* index of the last recorded frame plus one */
+ int32_t stack[STACK_MAX_LEN]; /* symbol ids; STACK_MAX_LEN is defined by the including .c file */
+
+ int has_meta; /* written as 0 by __on_event */
+ int metadata; /* perf output length is offsetof(Event, metadata), so this and later fields are not sent */
+ char dummy_safeguard;
+} Event;
+
+
+typedef int pid_t;
+
+typedef struct {
+ void* f_back; // PyFrameObject.f_back, previous frame
+ void* f_code; // PyFrameObject.f_code, pointer to PyCodeObject
+ void* co_filename; // PyCodeObject.co_filename
+ void* co_name; // PyCodeObject.co_name
+} FrameData;
+
+#ifdef SUBPROGS /* SUBPROGS builds keep this helper out of line; all other builds inline it */
+__noinline
+#else
+__always_inline
+#endif
+static void *get_thread_state(void *tls_base, PidData *pidData) /* returns the pointer stored in the TLS slot selected by the process's TLS key */
+{
+ void* thread_state;
+ int key;
+
+ bpf_probe_read_user(&key, sizeof(key), (void*)(long)pidData->tls_key_addr); /* fetch the key from user memory; the helper's return value is ignored */
+ bpf_probe_read_user(&thread_state, sizeof(thread_state),
+ tls_base + 0x310 + key * 0x10 + 0x08); /* NOTE(review): the hard-coded offsets presumably mirror the libc per-thread key table layout -- confirm */
+ return thread_state;
+}
+
+static __always_inline bool get_frame_data(void *frame_ptr, PidData *pidData, /* reads one Python frame; returns false when it has no code object */
+ FrameData *frame, Symbol *symbol)
+{
+ // read f_back and f_code from the user-space PyFrameObject at frame_ptr
+ bpf_probe_read_user(&frame->f_back,
+ sizeof(frame->f_back),
+ frame_ptr + pidData->offsets.PyFrameObject_back);
+ bpf_probe_read_user(&frame->f_code,
+ sizeof(frame->f_code),
+ frame_ptr + pidData->offsets.PyFrameObject_code);
+
+ // read co_filename and co_name from the PyCodeObject, if the frame has one
+ if (!frame->f_code)
+ return false; /* symbol is left untouched in this case */
+ bpf_probe_read_user(&frame->co_filename,
+ sizeof(frame->co_filename),
+ frame->f_code + pidData->offsets.PyCodeObject_filename);
+ bpf_probe_read_user(&frame->co_name,
+ sizeof(frame->co_name),
+ frame->f_code + pidData->offsets.PyCodeObject_name);
+ // copy the actual strings into symbol; a NULL object pointer leaves that field as it was
+ if (frame->co_filename)
+ bpf_probe_read_user_str(&symbol->file,
+ sizeof(symbol->file),
+ frame->co_filename +
+ pidData->offsets.String_data);
+ if (frame->co_name)
+ bpf_probe_read_user_str(&symbol->name,
+ sizeof(symbol->name),
+ frame->co_name +
+ pidData->offsets.String_data);
+ return true; /* helper return values above are not checked */
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, PidData);
+} pidmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, Event);
+} eventmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, Symbol);
+ __type(value, int);
+} symbolmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, Stats);
+} statsmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 32);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} perfmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 1000);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(long long) * 127);
+} stackmap SEC(".maps");
+
+#ifdef GLOBAL_FUNC /* linkage and inlining of __on_event are selected by macros from the including .c file */
+__noinline
+#elif defined(SUBPROGS)
+static __noinline
+#else
+static __always_inline
+#endif
+int __on_event(struct bpf_raw_tracepoint_args *ctx) /* builds one Event for the current task and emits it to perfmap; always returns 0 */
+{
+ uint64_t pid_tgid = bpf_get_current_pid_tgid();
+ pid_t pid = (pid_t)(pid_tgid >> 32); /* upper half is used as the pidmap key */
+ PidData* pidData = bpf_map_lookup_elem(&pidmap, &pid);
+ if (!pidData)
+ return 0; /* no pidmap entry for this pid: nothing to sample */
+
+ int zero = 0;
+ Event* event = bpf_map_lookup_elem(&eventmap, &zero); /* the Event lives in a map value, looked up under key 0 */
+ if (!event)
+ return 0;
+
+ event->pid = pid;
+
+ event->tid = (pid_t)pid_tgid; /* truncation keeps the lower 32 bits */
+ bpf_get_current_comm(&event->comm, sizeof(event->comm));
+
+ event->user_stack_id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+ event->kernel_stack_id = bpf_get_stackid(ctx, &stackmap, 0);
+
+ void* thread_state_current = (void*)0;
+ bpf_probe_read_user(&thread_state_current, /* thread-state pointer stored at current_state_addr in the target */
+ sizeof(thread_state_current),
+ (void*)(long)pidData->current_state_addr);
+
+ struct task_struct* task = (struct task_struct*)bpf_get_current_task();
+ void* tls_base = (void*)task; /* NOTE(review): the task pointer stands in for the TLS base -- presumably a selftest simplification; confirm */
+
+ void* thread_state = pidData->use_tls ? get_thread_state(tls_base, pidData)
+ : thread_state_current;
+ event->thread_current = thread_state == thread_state_current;
+
+ if (pidData->use_tls) { /* compare the pthread id in the thread state with the one read at tls_base + 0x10 */
+ uint64_t pthread_created;
+ uint64_t pthread_self;
+ bpf_probe_read_user(&pthread_self, sizeof(pthread_self),
+ tls_base + 0x10);
+
+ bpf_probe_read_user(&pthread_created,
+ sizeof(pthread_created),
+ thread_state +
+ pidData->offsets.PyThreadState_thread);
+ event->pthread_match = pthread_created == pthread_self;
+ } else {
+ event->pthread_match = 1;
+ }
+
+ if (event->pthread_match || !pidData->use_tls) { /* second operand is redundant: pthread_match is already 1 when use_tls is false */
+ void* frame_ptr;
+ FrameData frame;
+ Symbol sym = {};
+ int cur_cpu = bpf_get_smp_processor_id();
+
+ bpf_probe_read_user(&frame_ptr, /* top Python frame of the selected thread state */
+ sizeof(frame_ptr),
+ thread_state +
+ pidData->offsets.PyThreadState_frame);
+
+ int32_t* symbol_counter = bpf_map_lookup_elem(&symbolmap, &sym); /* the all-zero Symbol key holds the id counter */
+ if (symbol_counter == NULL)
+ return 0; /* counter entry missing from symbolmap: give up without emitting */
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
+#else
+#pragma clang loop unroll(full)
+#endif
+ /* Unwind python stack: at most STACK_MAX_LEN frames, following f_back */
+ for (int i = 0; i < STACK_MAX_LEN; ++i) {
+ if (frame_ptr && get_frame_data(frame_ptr, pidData, &frame, &sym)) {
+ int32_t new_symbol_id = *symbol_counter * 64 + cur_cpu; /* candidate id built from the counter and the current CPU */
+ int32_t *symbol_id = bpf_map_lookup_elem(&symbolmap, &sym);
+ if (!symbol_id) { /* symbol not in the map yet: insert it with value 0 and look it up again */
+ bpf_map_update_elem(&symbolmap, &sym, &zero, 0);
+ symbol_id = bpf_map_lookup_elem(&symbolmap, &sym);
+ if (!symbol_id)
+ return 0;
+ }
+ if (*symbol_id == new_symbol_id) /* advance the counter only when the stored id equals the candidate */
+ (*symbol_counter)++;
+ event->stack[i] = *symbol_id;
+ event->stack_len = i + 1;
+ frame_ptr = frame.f_back; /* move to the caller's frame */
+ }
+ }
+ event->stack_complete = frame_ptr == NULL;
+ } else {
+ event->stack_complete = 1; /* no walk was attempted */
+ }
+
+ Stats* stats = bpf_map_lookup_elem(&statsmap, &zero);
+ if (stats)
+ stats->success++; /* plain increment, no atomic operation is used here */
+
+ event->has_meta = 0;
+ bpf_perf_event_output(ctx, &perfmap, 0, event, offsetof(Event, metadata)); /* only the bytes before 'metadata' are sent */
+ return 0;
+}
+
+SEC("raw_tracepoint/kfree_skb")
+int on_event(struct bpf_raw_tracepoint_args* ctx) /* raw tracepoint entry point; returns the OR of the __on_event() results */
+{
+ int ret = 0; /* the unused local 'i' was dropped; no loop is used in this function */
+ ret |= __on_event(ctx); /* NOTE(review): five back-to-back calls, presumably to enlarge the program for verifier stress -- confirm */
+ ret |= __on_event(ctx);
+ ret |= __on_event(ctx);
+ ret |= __on_event(ctx);
+ ret |= __on_event(ctx);
+ return ret;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/pyperf100.c b/tools/testing/selftests/bpf/progs/pyperf100.c
new file mode 100644
index 000000000..29786325d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf100.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#define STACK_MAX_LEN 100
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf180.c b/tools/testing/selftests/bpf/progs/pyperf180.c
new file mode 100644
index 000000000..c39f559d3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf180.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#define STACK_MAX_LEN 180
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf50.c b/tools/testing/selftests/bpf/progs/pyperf50.c
new file mode 100644
index 000000000..ef7ce340a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf50.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#define STACK_MAX_LEN 50
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf600.c b/tools/testing/selftests/bpf/progs/pyperf600.c
new file mode 100644
index 000000000..cb49b89e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf600.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#define STACK_MAX_LEN 600
+/* clang will not unroll the loop 600 times.
+ * Instead it will unroll it to the amount it deemed
+ * appropriate, but the loop will still execute 600 times.
+ * Total program size is around 90k insns
+ */
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c b/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
new file mode 100644
index 000000000..6beff7502
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf600_nounroll.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#define STACK_MAX_LEN 600
+#define NO_UNROLL
+/* clang will not unroll at all.
+ * Total program size is around 2k insns
+ */
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf_global.c b/tools/testing/selftests/bpf/progs/pyperf_global.c
new file mode 100644
index 000000000..079e78a75
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf_global.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define STACK_MAX_LEN 50
+#define GLOBAL_FUNC
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/pyperf_subprogs.c b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c
new file mode 100644
index 000000000..60e27a7f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/pyperf_subprogs.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#define STACK_MAX_LEN 50
+#define SUBPROGS
+#include "pyperf.h"
diff --git a/tools/testing/selftests/bpf/progs/ringbuf_bench.c b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
new file mode 100644
index 000000000..123607d31
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/ringbuf_bench.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+} ringbuf SEC(".maps");
+
+const volatile int batch_cnt = 0;
+const volatile long use_output = 0;
+
+long sample_val = 42;
+long dropped __attribute__((aligned(128))) = 0;
+
+const volatile long wakeup_data_size = 0;
+
+static __always_inline long get_flags() /* picks the ringbuf wakeup flag for the next submit/output call */
+{
+ long sz;
+
+ if (!wakeup_data_size)
+ return 0; /* no threshold configured: pass no flags */
+
+ sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); /* value compared against the configured threshold */
+ return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP; /* force a wakeup once the threshold is reached, suppress it otherwise */
+}
+
+SEC("fentry/__x64_sys_getpgid")
+int bench_ringbuf(void *ctx) /* pushes batch_cnt copies of sample_val into ringbuf per invocation; always returns 0 */
+{
+ long *sample, flags;
+ int i;
+
+ if (!use_output) { /* reserve + submit path */
+ for (i = 0; i < batch_cnt; i++) {
+ sample = bpf_ringbuf_reserve(&ringbuf,
+ sizeof(sample_val), 0);
+ if (!sample) {
+ __sync_add_and_fetch(&dropped, 1); /* reservation failed: count the sample as dropped */
+ } else {
+ *sample = sample_val;
+ flags = get_flags(); /* flags are computed after the record has been reserved */
+ bpf_ringbuf_submit(sample, flags);
+ }
+ }
+ } else { /* bpf_ringbuf_output() path */
+ for (i = 0; i < batch_cnt; i++) {
+ flags = get_flags();
+ if (bpf_ringbuf_output(&ringbuf, &sample_val,
+ sizeof(sample_val), flags))
+ __sync_add_and_fetch(&dropped, 1); /* non-zero return: count the sample as dropped */
+ }
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/sample_map_ret0.c b/tools/testing/selftests/bpf/progs/sample_map_ret0.c
new file mode 100644
index 000000000..1612a3200
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sample_map_ret0.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct bpf_map_def SEC("maps") htab = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(long),
+ .max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") array = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(long),
+ .max_entries = 2,
+};
+
+/* Sample program which should always load for testing control paths; returns 1 if either lookup finds nothing, 0 otherwise. */
+SEC(".text") int func()
+{
+ __u64 key64 = 0; /* NOTE(review): 8-byte variable used as key for 'array', whose key_size is sizeof(__u32) -- confirm this is intentional */
+ __u32 key = 0;
+ long *value;
+
+ value = bpf_map_lookup_elem(&htab, &key); /* hash map lookup with key 0 */
+ if (!value)
+ return 1;
+ value = bpf_map_lookup_elem(&array, &key64); /* array map lookup through the wider key variable */
+ if (!value)
+ return 1;
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/sample_ret0.c b/tools/testing/selftests/bpf/progs/sample_ret0.c
new file mode 100644
index 000000000..fec99750d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sample_ret0.c
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) */
+
+/* Sample program which should always load for testing control paths; it touches no maps or helpers. */
+int func()
+{
+ return 0; /* unconditional */
+}
diff --git a/tools/testing/selftests/bpf/progs/sendmsg4_prog.c b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c
new file mode 100644
index 000000000..092d9da53
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sendmsg4_prog.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define SRC1_IP4 0xAC100001U /* 172.16.0.1 */
+#define SRC2_IP4 0x00000000U
+#define SRC_REWRITE_IP4 0x7f000004U
+#define DST_IP4 0xC0A801FEU /* 192.168.1.254 */
+#define DST_REWRITE_IP4 0x7f000001U
+#define DST_PORT 4040
+#define DST_REWRITE_PORT4 4444
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg4")
+int sendmsg_v4_prog(struct bpf_sock_addr *ctx) /* rewrites source and destination of matching IPv4 datagram sends; returns 0 to reject, 1 otherwise */
+{
+ if (ctx->type != SOCK_DGRAM)
+ return 0; /* only datagram sockets get past this point */
+
+ /* Rewrite source: only SRC1_IP4 or SRC2_IP4 (all zeroes) are accepted as the original source. */
+ if (ctx->msg_src_ip4 == bpf_htonl(SRC1_IP4) ||
+ ctx->msg_src_ip4 == bpf_htonl(SRC2_IP4)) {
+ ctx->msg_src_ip4 = bpf_htonl(SRC_REWRITE_IP4);
+ } else {
+ /* Unexpected source. Reject sendmsg. */
+ return 0;
+ }
+
+ /* Rewrite destination: only the top byte of the network-order address words is compared, plus the port. */
+ if ((ctx->user_ip4 >> 24) == (bpf_htonl(DST_IP4) >> 24) &&
+ ctx->user_port == bpf_htons(DST_PORT)) {
+ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4);
+ ctx->user_port = bpf_htons(DST_REWRITE_PORT4);
+ } else {
+ /* Unexpected destination. Reject sendmsg. */
+ return 0;
+ }
+
+ return 1; /* both rewrites applied */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sendmsg6_prog.c b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c
new file mode 100644
index 000000000..255a432bc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sendmsg6_prog.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define SRC_REWRITE_IP6_0 0
+#define SRC_REWRITE_IP6_1 0
+#define SRC_REWRITE_IP6_2 0
+#define SRC_REWRITE_IP6_3 6
+
+#define DST_REWRITE_IP6_0 0
+#define DST_REWRITE_IP6_1 0
+#define DST_REWRITE_IP6_2 0
+#define DST_REWRITE_IP6_3 1
+
+#define DST_REWRITE_PORT6 6666
+
+int _version SEC("version") = 1;
+
+SEC("cgroup/sendmsg6")
+int sendmsg_v6_prog(struct bpf_sock_addr *ctx) /* rewrites source and destination of matching IPv6 datagram sends; returns 0 to reject, 1 otherwise */
+{
+ if (ctx->type != SOCK_DGRAM)
+ return 0; /* only datagram sockets get past this point */
+
+ /* Rewrite source: only the last 32-bit word of the source address is checked (must be 1 or 0). */
+ if (ctx->msg_src_ip6[3] == bpf_htonl(1) ||
+ ctx->msg_src_ip6[3] == bpf_htonl(0)) {
+ ctx->msg_src_ip6[0] = bpf_htonl(SRC_REWRITE_IP6_0);
+ ctx->msg_src_ip6[1] = bpf_htonl(SRC_REWRITE_IP6_1);
+ ctx->msg_src_ip6[2] = bpf_htonl(SRC_REWRITE_IP6_2);
+ ctx->msg_src_ip6[3] = bpf_htonl(SRC_REWRITE_IP6_3);
+ } else {
+ /* Unexpected source. Reject sendmsg. */
+ return 0;
+ }
+
+ /* Rewrite destination: only the first 32-bit word of the destination address is checked. */
+ if (ctx->user_ip6[0] == bpf_htonl(0xFACEB00C)) {
+ ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_0);
+ ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_1);
+ ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2);
+ ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_3);
+
+ ctx->user_port = bpf_htons(DST_REWRITE_PORT6);
+ } else {
+ /* Unexpected destination. Reject sendmsg. */
+ return 0;
+ }
+
+ return 1; /* both rewrites applied */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/socket_cookie_prog.c b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c
new file mode 100644
index 000000000..0cb5656a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/socket_cookie_prog.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct socket_cookie {
+ __u64 cookie_key;
+ __u32 cookie_value;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct socket_cookie);
+} socket_cookies SEC(".maps");
+
+SEC("cgroup/connect6")
+int set_cookie(struct bpf_sock_addr *ctx) /* records the socket cookie in per-socket storage; returns 1 on every path */
+{
+ struct socket_cookie *p;
+
+ if (ctx->family != AF_INET6 || ctx->user_family != AF_INET6)
+ return 1; /* not IPv6 on both fields: leave the socket alone */
+
+ p = bpf_sk_storage_get(&socket_cookies, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE); /* create the storage entry if it does not exist yet */
+ if (!p)
+ return 1;
+
+ p->cookie_value = 0xFF; /* initial value; update_cookie later ORs the local port above the low 8 bits */
+ p->cookie_key = bpf_get_socket_cookie(ctx);
+
+ return 1;
+}
+
+SEC("sockops")
+int update_cookie(struct bpf_sock_ops *ctx) /* on TCP connect, folds the local port into the stored cookie value; returns 1 on every path */
+{
+ struct bpf_sock *sk; /* declared but never used in this function */
+ struct socket_cookie *p;
+
+ if (ctx->family != AF_INET6)
+ return 1;
+
+ if (ctx->op != BPF_SOCK_OPS_TCP_CONNECT_CB)
+ return 1; /* only the TCP connect callback is handled */
+
+ if (!ctx->sk)
+ return 1;
+
+ p = bpf_sk_storage_get(&socket_cookies, ctx->sk, 0, 0); /* flags 0: look up only, do not create */
+ if (!p)
+ return 1;
+
+ if (p->cookie_key != bpf_get_socket_cookie(ctx))
+ return 1; /* stored key does not match this socket's cookie */
+
+ p->cookie_value = (ctx->local_port << 8) | p->cookie_value; /* local port goes above the low 8 bits, existing value is kept */
+
+ return 1;
+}
+
+int _version SEC("version") = 1;
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
new file mode 100644
index 000000000..ca283af80
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockmap_parse_prog.c
@@ -0,0 +1,37 @@
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb) /* needs 10 readable bytes, writes byte 7, returns skb->len (or SK_DROP on failure) */
+{
+ void *data_end = (void *)(long) skb->data_end;
+ void *data = (void *)(long) skb->data;
+ __u32 lport = skb->local_port; /* read but not used afterwards; see the comment before the write below */
+ __u32 rport = skb->remote_port; /* read but not used afterwards */
+ __u8 *d = data;
+ int err;
+
+ if (data + 10 > data_end) { /* fewer than 10 bytes directly accessible: try to pull them in */
+ err = bpf_skb_pull_data(skb, 10);
+ if (err)
+ return SK_DROP;
+
+ data_end = (void *)(long)skb->data_end; /* reload both pointers after the pull */
+ data = (void *)(long)skb->data;
+ if (data + 10 > data_end)
+ return SK_DROP;
+ }
+
+ /* This write/read is a bit pointless but tests the verifier and
+ * strparser handler for read/write pkt data and access into sk
+ * fields.
+ */
+ d = data; /* re-derive d from the (possibly reloaded) data pointer */
+ d[7] = 1;
+ return skb->len;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c
new file mode 100644
index 000000000..fdb4bf440
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockmap_tcp_msg_prog.c
@@ -0,0 +1,26 @@
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+
+SEC("sk_msg1")
+int bpf_prog1(struct sk_msg_md *msg) /* logs the message length and its first two bytes; drops messages shorter than 8 bytes */
+{
+ void *data_end = (void *)(long) msg->data_end;
+ void *data = (void *)(long) msg->data;
+
+ char *d;
+
+ if (data + 8 > data_end)
+ return SK_DROP; /* bounds check before any byte of the message is read */
+
+ bpf_printk("data length %i\n", (__u64)msg->data_end - (__u64)msg->data); /* a __u64 difference is passed to a %i conversion */
+ d = (char *)data;
+ bpf_printk("hello sendmsg hook %i %i\n", d[0], d[1]);
+
+ return SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
new file mode 100644
index 000000000..4797dc985
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockmap_verdict_prog.c
@@ -0,0 +1,65 @@
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_rx SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_tx SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_msg SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 20);
+ __type(key, int);
+ __type(value, int);
+} sock_map_break SEC(".maps");
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb) /* redirects the skb to a sockmap slot chosen by the first two payload bytes */
+{
+ void *data_end = (void *)(long) skb->data_end;
+ void *data = (void *)(long) skb->data;
+ __u32 lport = skb->local_port; /* read but not used afterwards */
+ __u32 rport = skb->remote_port; /* read but not used afterwards */
+ __u8 *d = data;
+ __u8 sk, map;
+
+ if (data + 8 > data_end)
+ return SK_DROP; /* the 8 bytes accessed below must be in bounds */
+
+ map = d[0]; /* 0 selects sock_map_rx, anything else sock_map_tx */
+ sk = d[1]; /* key passed to bpf_sk_redirect_map() */
+
+ d[0] = 0xd; /* the first 8 bytes are overwritten with the nibbles d,e,a,d,b,e,e,f */
+ d[1] = 0xe;
+ d[2] = 0xa;
+ d[3] = 0xd;
+ d[4] = 0xb;
+ d[5] = 0xe;
+ d[6] = 0xe;
+ d[7] = 0xf;
+
+ if (!map)
+ return bpf_sk_redirect_map(skb, &sock_map_rx, sk, 0);
+ return bpf_sk_redirect_map(skb, &sock_map_tx, sk, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
new file mode 100644
index 000000000..c6d428a8d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM 0xdeadbeef
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+struct sockopt_inherit {
+ __u8 val;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned1_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned2_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} listener_only_map SEC(".maps");
+
+static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx) /* gets (creating if needed) the sk storage matching ctx->optname; may return NULL */
+{
+ if (ctx->optname == CUSTOM_INHERIT1)
+ return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE); /* cloned1_map is declared with BPF_F_CLONE */
+ else if (ctx->optname == CUSTOM_INHERIT2)
+ return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE); /* cloned2_map is declared with BPF_F_CLONE */
+ else /* every other optname, including CUSTOM_LISTENER, uses the map without BPF_F_CLONE */
+ return bpf_sk_storage_get(&listener_only_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx) /* answers SOL_CUSTOM getsockopt with the byte kept in sk storage */
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx); /* storage map is chosen by ctx->optname */
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = storage->val; /* hand the stored byte back to the caller */
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx) /* stores the first option byte of a SOL_CUSTOM setsockopt in sk storage */
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx); /* storage map is chosen by ctx->optname */
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ storage->val = optval[0];
+ ctx->optlen = -1; /* NOTE(review): presumably tells the kernel to skip its own setsockopt handler, as in sockopt_sk.c -- confirm */
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_multi.c b/tools/testing/selftests/bpf/progs/sockopt_multi.c
new file mode 100644
index 000000000..9d8c212dd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_multi.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+SEC("cgroup/getsockopt/child")
+int _getsockopt_child(struct bpf_sockopt *ctx) /* expects IP_TOS == 0x80 and replaces it with 0x90 */
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1; /* any other option passes through untouched */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (optval[0] != 0x80)
+ return 0; /* EPERM, unexpected optval from the kernel */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = 0x90; /* the value _getsockopt_parent checks for */
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/getsockopt/parent")
+int _getsockopt_parent(struct bpf_sockopt *ctx) /* expects IP_TOS == 0x90 and replaces it with 0xA0 */
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1; /* any other option passes through untouched */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (optval[0] != 0x90)
+ return 0; /* EPERM, optval is not the 0x90 that _getsockopt_child writes */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = 0xA0;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx) /* adds 0x10 to the first byte of an IP_TOS setsockopt value */
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
+ return 1; /* any other option passes through untouched */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ optval[0] += 0x10; /* __u8 arithmetic, so the sum wraps modulo 256 */
+ ctx->optlen = 1;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c
new file mode 100644
index 000000000..d3597f81e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <string.h>
+#include <linux/tcp.h>
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE 4096
+#endif
+
+#ifndef SOL_TCP
+#define SOL_TCP IPPROTO_TCP
+#endif
+
+#define SOL_CUSTOM 0xdeadbeef
+
+struct sockopt_sk {
+ __u8 val;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct sockopt_sk);
+} socket_storage_map SEC(".maps");
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx) /* passes a few known options through, answers IP_FREEBIND and SOL_CUSTOM itself, denies the rest */
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+ struct sockopt_sk *storage;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
+ return 1;
+ }
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Not interested in SOL_SOCKET:SO_SNDBUF;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
+ /* Not interested in SOL_TCP:TCP_CONGESTION;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ return 1;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) {
+ /* Verify that TCP_ZEROCOPY_RECEIVE triggers.
+ * It has a custom implementation for performance
+ * reasons.
+ */
+
+ if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ if (((struct tcp_zerocopy_receive *)optval)->address != 0)
+ return 0; /* EPERM, unexpected data */
+
+ return 1;
+ }
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ /* Always export 0x55 */
+ optval[0] = 0x55;
+ ctx->optlen = 1;
+
+ /* Userspace buffer is PAGE_SIZE * 2, but BPF
+ * program can only see the first PAGE_SIZE
+ * bytes of data.
+ */
+ if (optval_end - optval != PAGE_SIZE)
+ return 0; /* EPERM, unexpected data size */
+
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* EPERM, deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE); /* create the per-socket entry if it does not exist yet */
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ if (!ctx->retval)
+ return 0; /* EPERM, kernel should not have handled
+ * SOL_CUSTOM, something is wrong!
+ */
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = storage->val; /* return the byte saved by the setsockopt program */
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx) /* overrides SO_SNDBUF, TCP_CONGESTION and IP_FREEBIND values, stores SOL_CUSTOM in sk storage, denies the rest */
+{
+ __u8 *optval_end = ctx->optval_end;
+ __u8 *optval = ctx->optval;
+ struct sockopt_sk *storage;
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_TOS) {
+ /* Not interested in SOL_IP:IP_TOS;
+ * let next BPF program in the cgroup chain or kernel
+ * handle it.
+ */
+ ctx->optlen = 0; /* bypass optval>PAGE_SIZE */
+ return 1;
+ }
+
+ if (ctx->level == SOL_SOCKET && ctx->optname == SO_SNDBUF) {
+ /* Overwrite SO_SNDBUF value */
+
+ if (optval + sizeof(__u32) > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ *(__u32 *)optval = 0x55AA;
+ ctx->optlen = 4;
+
+ return 1;
+ }
+
+ if (ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION) {
+ /* Always use cubic */
+
+ if (optval + 5 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ memcpy(optval, "cubic", 5); /* 5 bytes, no NUL terminator is copied */
+ ctx->optlen = 5;
+
+ return 1;
+ }
+
+ if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) {
+ /* Original optlen is larger than PAGE_SIZE. */
+ if (ctx->optlen != PAGE_SIZE * 2)
+ return 0; /* EPERM, unexpected data size */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ /* Make sure we can trim the buffer. */
+ optval[0] = 0;
+ ctx->optlen = 1;
+
+ /* Userspace buffer is PAGE_SIZE * 2, but BPF
+ * program can only see the first PAGE_SIZE
+ * bytes of data.
+ */
+ if (optval_end - optval != PAGE_SIZE)
+ return 0; /* EPERM, unexpected data size */
+
+ return 1;
+ }
+
+ if (ctx->level != SOL_CUSTOM)
+ return 0; /* EPERM, deny everything except custom level */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = bpf_sk_storage_get(&socket_storage_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE); /* create the per-socket entry if it does not exist yet */
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ storage->val = optval[0];
+ ctx->optlen = -1; /* BPF has consumed this option, don't call kernel
+ * setsockopt handler.
+ */
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.c b/tools/testing/selftests/bpf/progs/strobemeta.c
new file mode 100644
index 000000000..d3df3d86f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2019 Facebook
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 100
+#define STROBE_MAX_MAP_ENTRIES 20
+/* full unroll by llvm #undef NO_UNROLL */
+#include "strobemeta.h"
+
diff --git a/tools/testing/selftests/bpf/progs/strobemeta.h b/tools/testing/selftests/bpf/progs/strobemeta.h
new file mode 100644
index 000000000..60c93aee2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta.h
@@ -0,0 +1,547 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+
+typedef uint32_t pid_t;
+struct task_struct {};
+
+#define TASK_COMM_LEN 16
+#define PERF_MAX_STACK_DEPTH 127
+
+#define STROBE_TYPE_INVALID 0
+#define STROBE_TYPE_INT 1
+#define STROBE_TYPE_STR 2
+#define STROBE_TYPE_MAP 3
+
+#define STACK_TABLE_EPOCH_SHIFT 20
+#define STROBE_MAX_STR_LEN 1
+#define STROBE_MAX_CFGS 32
+#define STROBE_MAX_PAYLOAD \
+ (STROBE_MAX_STRS * STROBE_MAX_STR_LEN + \
+ STROBE_MAX_MAPS * (1 + STROBE_MAX_MAP_ENTRIES * 2) * STROBE_MAX_STR_LEN)
+
+struct strobe_value_header {
+ /*
+ * meaning depends on type:
+ * 1. int: 0, if value not set, 1 otherwise
+ * 2. str: 1 always, whether value is set or not is determined by ptr
+ * 3. map: 1 always, pointer points to additional struct with number
+ * of entries (up to STROBE_MAX_MAP_ENTRIES)
+ */
+ uint16_t len;
+ /*
+ * _reserved might be used for some future fields/flags, but we always
+ * want to keep strobe_value_header to be 8 bytes, so BPF can read 16
+ * bytes in one go and get both header and value
+ */
+ uint8_t _reserved[6];
+};
+
+/*
+ * strobe_value_generic is used from BPF probe only, but needs to be a union
+ * of strobe_value_int/strobe_value_str/strobe_value_map
+ */
+struct strobe_value_generic {
+ struct strobe_value_header header;
+ union {
+ int64_t val;
+ void *ptr;
+ };
+};
+
+struct strobe_value_int {
+ struct strobe_value_header header;
+ int64_t value;
+};
+
+struct strobe_value_str {
+ struct strobe_value_header header;
+ const char* value;
+};
+
+struct strobe_value_map {
+ struct strobe_value_header header;
+ const struct strobe_map_raw* value;
+};
+
+struct strobe_map_entry {
+ const char* key;
+ const char* val;
+};
+
+/*
+ * Map of C-string key/value pairs with fixed maximum capacity. Each map has
+ * corresponding int64 ID, which application can use (or ignore) in whatever
+ * way appropriate. Map is "write-only", there is no way to get data out of
+ * map. Map is intended to be used to provide metadata for profilers and is
+ * not to be used for internal in-app communication. All methods are
+ * thread-safe.
+ */
+struct strobe_map_raw {
+ /*
+ * general purpose unique ID that's up to application to decide
+ * whether and how to use; for request metadata use case id is unique
+ * request ID that's used to match metadata with stack traces on
+ * Strobelight backend side
+ */
+ int64_t id;
+ /* number of used entries in map */
+ int64_t cnt;
+ /*
+ * having volatile doesn't change anything on BPF side, but clang
+ * emits warnings for passing `volatile const char *` into
+ * bpf_probe_read_user_str that expects just `const char *`
+ */
+ const char* tag;
+ /*
+ * key/value entries, each consisting of 2 pointers to key and value
+ * C strings
+ */
+ struct strobe_map_entry entries[STROBE_MAX_MAP_ENTRIES];
+};
+
+/* Following values define supported values of TLS mode */
+#define TLS_NOT_SET -1
+#define TLS_LOCAL_EXEC 0
+#define TLS_IMM_EXEC 1
+#define TLS_GENERAL_DYN 2
+
+/*
+ * structure that universally represents TLS location (both for static
+ * executables and shared libraries)
+ */
+struct strobe_value_loc {
+ /*
+ * tls_mode defines what TLS mode was used for particular metavariable:
+ * - -1 (TLS_NOT_SET) - no metavariable;
+ * - 0 (TLS_LOCAL_EXEC) - Local Executable mode;
+ * - 1 (TLS_IMM_EXEC) - Immediate Executable mode;
+ * - 2 (TLS_GENERAL_DYN) - General Dynamic mode;
+ * Local Dynamic mode is not yet supported, because it has never been
+ * seen in practice. Mode defines how offset field is interpreted. See
+ * calc_location() below for details.
+ */
+ int64_t tls_mode;
+ /*
+ * TLS_LOCAL_EXEC: offset from thread pointer (fs:0 for x86-64,
+ * tpidr_el0 for aarch64).
+ * TLS_IMM_EXEC: absolute address of GOT entry containing offset
+ * from thread pointer;
+ * TLS_GENERAL_DYN: absolute address of double GOT entry
+ * containing tls_index_t struct;
+ */
+ int64_t offset;
+};
+
+struct strobemeta_cfg {
+ int64_t req_meta_idx;
+ struct strobe_value_loc int_locs[STROBE_MAX_INTS];
+ struct strobe_value_loc str_locs[STROBE_MAX_STRS];
+ struct strobe_value_loc map_locs[STROBE_MAX_MAPS];
+};
+
+struct strobe_map_descr {
+ uint64_t id;
+ int16_t tag_len;
+ /*
+ * cnt <0 - map value isn't set;
+ * 0 - map has id set, but no key/value entries
+ */
+ int16_t cnt;
+ /*
+ * both key_lens[i] and val_lens[i] should be >0 for present key/value
+ * entry
+ */
+ uint16_t key_lens[STROBE_MAX_MAP_ENTRIES];
+ uint16_t val_lens[STROBE_MAX_MAP_ENTRIES];
+};
+
+struct strobemeta_payload {
+ /* req_id has valid request ID, if req_meta_valid == 1 */
+ int64_t req_id;
+ uint8_t req_meta_valid;
+ /*
+ * mask has Nth bit set to 1, if Nth metavar was present and
+ * successfully read
+ */
+ uint64_t int_vals_set_mask;
+ int64_t int_vals[STROBE_MAX_INTS];
+ /* len is >0 for present values */
+ uint16_t str_lens[STROBE_MAX_STRS];
+ /* if map_descrs[i].cnt == -1, metavar is not present/set */
+ struct strobe_map_descr map_descrs[STROBE_MAX_MAPS];
+ /*
+ * payload has compactly packed values of str and map variables in the
+ * form: strval1\0strval2\0map1key1\0map1val1\0map2key1\0map2val1\0
+ * (and so on); str_lens[i], key_lens[i] and val_lens[i] determines
+ * value length
+ */
+ char payload[STROBE_MAX_PAYLOAD];
+};
+
+struct strobelight_bpf_sample {
+ uint64_t ktime;
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
+ int user_stack_id;
+ int kernel_stack_id;
+ int has_meta;
+ struct strobemeta_payload metadata;
+ /*
+ * makes it possible to pass (<real payload size> + 1) as data size to
+ * perf_submit() to avoid perf_submit's paranoia about passing zero as
+ * size, as it deduces that <real payload size> might be
+ * **theoretically** zero
+ */
+ char dummy_safeguard;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 32);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} samples SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16);
+ __uint(key_size, sizeof(uint32_t));
+ __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
+} stacks_0 SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16);
+ __uint(key_size, sizeof(uint32_t));
+ __uint(value_size, sizeof(uint64_t) * PERF_MAX_STACK_DEPTH);
+} stacks_1 SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, uint32_t);
+ __type(value, struct strobelight_bpf_sample);
+} sample_heap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, STROBE_MAX_CFGS);
+ __type(key, pid_t);
+ __type(value, struct strobemeta_cfg);
+} strobemeta_cfgs SEC(".maps");
+
+/* Type for the dtv. */
+/* https://github.com/lattera/glibc/blob/master/nptl/sysdeps/x86_64/tls.h#L34 */
+typedef union dtv {
+ size_t counter;
+ struct {
+ void* val;
+ bool is_static;
+ } pointer;
+} dtv_t;
+
+/* Partial definition for tcbhead_t */
+/* https://github.com/bminor/glibc/blob/master/sysdeps/x86_64/nptl/tls.h#L42 */
+struct tcbhead {
+ void* tcb;
+ dtv_t* dtv;
+};
+
+/*
+ * TLS module/offset information for shared library case.
+ * For x86-64, this is mapped onto two entries in GOT.
+ * For aarch64, this is pointed to by second GOT entry.
+ */
+struct tls_index {
+ uint64_t module;
+ uint64_t offset;
+};
+
+/*
+ * Resolve a metavariable location descriptor into a user-space address.
+ * Returns NULL when the slot is unset (TLS_NOT_SET), or when the TLS block
+ * pointer read through the DTV is NULL or (void *)-1.
+ * NOTE(review): return values of bpf_probe_read_user() are not checked here,
+ * and every mode above TLS_LOCAL_EXEC takes the same tls_index/DTV path -
+ * confirm this is intended for TLS_IMM_EXEC.
+ */
+#ifdef SUBPROGS
+__noinline
+#else
+__always_inline
+#endif
+static void *calc_location(struct strobe_value_loc *loc, void *tls_base)
+{
+ /*
+ * tls_mode value is:
+ * - -1 (TLS_NOT_SET), if no metavar is present;
+ * - 0 (TLS_LOCAL_EXEC), if metavar uses Local Executable mode of TLS
+ * (offset from fs:0 for x86-64 or tpidr_el0 for aarch64);
+ * - 1 (TLS_IMM_EXEC), if metavar uses Immediate Executable mode of TLS;
+ * - 2 (TLS_GENERAL_DYN), if metavar uses General Dynamic mode of TLS;
+ * This schema allows to use something like:
+ * (tls_mode + 1) * (tls_base + offset)
+ * to get NULL for "no metavar" location, or correct pointer for local
+ * executable mode without doing extra ifs.
+ */
+ if (loc->tls_mode <= TLS_LOCAL_EXEC) {
+ /* static executable is simple, we just have offset from
+ * tls_base */
+ void *addr = tls_base + loc->offset;
+ /* multiply by (tls_mode + 1) to get NULL, if we have no
+ * metavar in this slot */
+ return (void *)((loc->tls_mode + 1) * (int64_t)addr);
+ }
+ /*
+ * Other modes are more complicated, we need to jump through few hoops.
+ *
+ * For immediate executable mode (currently supported only for aarch64):
+ * - loc->offset is pointing to a GOT entry containing fixed offset
+ * relative to tls_base;
+ *
+ * For general dynamic mode:
+ * - loc->offset is pointing to a beginning of double GOT entries;
+ * - (for aarch64 only) second entry points to tls_index_t struct;
+ * - (for x86-64 only) two GOT entries are already tls_index_t;
+ * - tls_index_t->module is used to find start of TLS section in
+ * which variable resides;
+ * - tls_index_t->offset provides offset within that TLS section,
+ * pointing to value of variable.
+ */
+ struct tls_index tls_index;
+ dtv_t *dtv;
+ void *tls_ptr;
+
+ bpf_probe_read_user(&tls_index, sizeof(struct tls_index),
+ (void *)loc->offset);
+ /* valid module index is always positive */
+ if (tls_index.module > 0) {
+ /* dtv = ((struct tcbhead *)tls_base)->dtv[tls_index.module] */
+ bpf_probe_read_user(&dtv, sizeof(dtv),
+ &((struct tcbhead *)tls_base)->dtv);
+ dtv += tls_index.module;
+ } else {
+ dtv = NULL;
+ }
+ /* with dtv == NULL this read is issued against address 0 */
+ bpf_probe_read_user(&tls_ptr, sizeof(void *), dtv);
+ /* if pointer has (void *)-1 value, then TLS wasn't initialized yet */
+ return tls_ptr && tls_ptr != (void *)-1
+ ? tls_ptr + tls_index.offset
+ : NULL;
+}
+
+/*
+ * Read the integer metavariable in slot idx.
+ *
+ * Resolves the slot's location with calc_location(); if it is NULL nothing
+ * is touched. Otherwise the generic value is copied from user memory into
+ * *value, its integer is stored in data->int_vals[idx], and bit idx of
+ * data->int_vals_set_mask is set when the value header reports a non-zero
+ * len.
+ */
+#ifdef SUBPROGS
+__noinline
+#else
+__always_inline
+#endif
+static void read_int_var(struct strobemeta_cfg *cfg,
+			 size_t idx, void *tls_base,
+			 struct strobe_value_generic *value,
+			 struct strobemeta_payload *data)
+{
+	void *location = calc_location(&cfg->int_locs[idx], tls_base);
+	if (!location)
+		return;
+
+	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
+	data->int_vals[idx] = value->val;
+	/*
+	 * int_vals_set_mask is 64 bits wide, so build the bit from a 64-bit
+	 * one; a plain int `1 << idx` is not valid for idx >= 31.
+	 */
+	if (value->header.len)
+		data->int_vals_set_mask |= (1ULL << idx);
+}
+
+/*
+ * Read the string metavariable in slot idx into payload.
+ * data->str_lens[idx] is reset to 0 first and is only updated when a string
+ * was copied. Returns the length reported by bpf_probe_read_user_str(), or 0
+ * when the slot has no location or the read failed.
+ */
+static __always_inline uint64_t read_str_var(struct strobemeta_cfg *cfg,
+ size_t idx, void *tls_base,
+ struct strobe_value_generic *value,
+ struct strobemeta_payload *data,
+ void *payload)
+{
+ void *location;
+ uint64_t len;
+
+ data->str_lens[idx] = 0;
+ location = calc_location(&cfg->str_locs[idx], tls_base);
+ if (!location)
+ return 0;
+
+ bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
+ len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, value->ptr);
+ /*
+ * if bpf_probe_read_user_str returns error (<0), due to casting to
+ * unsigned int, it will become big number, so next check is
+ * sufficient to check for errors AND prove to BPF verifier, that
+ * bpf_probe_read_user_str won't return anything bigger than
+ * STROBE_MAX_STR_LEN
+ */
+ if (len > STROBE_MAX_STR_LEN)
+ return 0;
+
+ data->str_lens[idx] = len;
+ return len;
+}
+
+/*
+ * Read the map metavariable in slot idx.
+ *
+ * Fills data->map_descrs[idx] (id, cnt, tag_len, key_lens[], val_lens[]) and
+ * appends the tag followed by each key/value string to payload. When idx
+ * equals cfg->req_meta_idx the map id is also recorded as data->req_id.
+ *
+ * Returns the payload pointer advanced past everything that was written;
+ * payload is returned unchanged when the slot has no location or the map
+ * struct cannot be read from user memory.
+ */
+static __always_inline void *read_map_var(struct strobemeta_cfg *cfg,
+					  size_t idx, void *tls_base,
+					  struct strobe_value_generic *value,
+					  struct strobemeta_payload *data,
+					  void *payload)
+{
+	struct strobe_map_descr* descr = &data->map_descrs[idx];
+	struct strobe_map_raw map;
+	void *location;
+	uint64_t len;
+
+	descr->tag_len = 0; /* presume no tag is set */
+	descr->cnt = -1; /* presume no value is set */
+
+	location = calc_location(&cfg->map_locs[idx], tls_base);
+	if (!location)
+		return payload;
+
+	bpf_probe_read_user(value, sizeof(struct strobe_value_generic), location);
+	if (bpf_probe_read_user(&map, sizeof(struct strobe_map_raw), value->ptr))
+		return payload;
+
+	descr->id = map.id;
+	descr->cnt = map.cnt;
+	if (cfg->req_meta_idx == idx) {
+		data->req_id = map.id;
+		data->req_meta_valid = 1;
+	}
+
+	/*
+	 * len is unsigned, so a negative error from
+	 * bpf_probe_read_user_str() turns into a huge value and fails the
+	 * <= STROBE_MAX_STR_LEN checks below.
+	 */
+	len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN, map.tag);
+	if (len <= STROBE_MAX_STR_LEN) {
+		descr->tag_len = len;
+		payload += len;
+	}
+
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
+#else
+#pragma unroll
+#endif
+	for (int i = 0; i < STROBE_MAX_MAP_ENTRIES; ++i) {
+		/* stop at the number of entries reported by the map itself */
+		if (i >= map.cnt)
+			break;
+
+		descr->key_lens[i] = 0;
+		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
+					      map.entries[i].key);
+		if (len <= STROBE_MAX_STR_LEN) {
+			descr->key_lens[i] = len;
+			payload += len;
+		}
+		descr->val_lens[i] = 0;
+		len = bpf_probe_read_user_str(payload, STROBE_MAX_STR_LEN,
+					      map.entries[i].val);
+		if (len <= STROBE_MAX_STR_LEN) {
+			descr->val_lens[i] = len;
+			payload += len;
+		}
+	}
+
+	return payload;
+}
+
+/*
+ * read_strobe_meta returns NULL, if no metadata was read; otherwise returns
+ * pointer to *right after* payload ends
+ *
+ * NULL is returned only when strobemeta_cfgs has no entry for the key
+ * computed from bpf_get_current_pid_tgid() >> 32. Otherwise all int, str and
+ * map slots are read in that order, with strings appended to data->payload.
+ */
+#ifdef SUBPROGS
+__noinline
+#else
+__always_inline
+#endif
+static void *read_strobe_meta(struct task_struct *task,
+ struct strobemeta_payload *data)
+{
+ pid_t pid = bpf_get_current_pid_tgid() >> 32;
+ /* scratch buffer reused by every read_*_var() call below */
+ struct strobe_value_generic value = {0};
+ struct strobemeta_cfg *cfg;
+ void *tls_base, *payload;
+
+ cfg = bpf_map_lookup_elem(&strobemeta_cfgs, &pid);
+ if (!cfg)
+ return NULL;
+
+ data->int_vals_set_mask = 0;
+ data->req_meta_valid = 0;
+ payload = data->payload;
+ /*
+ * we don't have struct task_struct definition, it should be:
+ * tls_base = (void *)task->thread.fsbase;
+ */
+ tls_base = (void *)task;
+
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
+#else
+#pragma unroll
+#endif
+ for (int i = 0; i < STROBE_MAX_INTS; ++i) {
+ read_int_var(cfg, i, tls_base, &value, data);
+ }
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
+#else
+#pragma unroll
+#endif
+ for (int i = 0; i < STROBE_MAX_STRS; ++i) {
+ payload += read_str_var(cfg, i, tls_base, &value, data, payload);
+ }
+#ifdef NO_UNROLL
+#pragma clang loop unroll(disable)
+#else
+#pragma unroll
+#endif
+ for (int i = 0; i < STROBE_MAX_MAPS; ++i) {
+ payload = read_map_var(cfg, i, tls_base, &value, data, payload);
+ }
+ /*
+ * return pointer right after end of payload, so it's possible to
+ * calculate exact amount of useful data that needs to be sent
+ */
+ return payload;
+}
+
+/*
+ * Raw tracepoint handler: fills slot 0 of sample_heap with pid, comm,
+ * timestamp, optional strobemeta payload and stack ids, then emits the used
+ * part of the sample through the samples perf event array. Always returns 0.
+ */
+SEC("raw_tracepoint/kfree_skb")
+int on_event(struct pt_regs *ctx) {
+ pid_t pid = bpf_get_current_pid_tgid() >> 32;
+ struct strobelight_bpf_sample* sample;
+ struct task_struct *task;
+ uint32_t zero = 0;
+ uint64_t ktime_ns;
+ void *sample_end;
+
+ sample = bpf_map_lookup_elem(&sample_heap, &zero);
+ if (!sample)
+ return 0; /* this will never happen */
+
+ sample->pid = pid;
+ bpf_get_current_comm(&sample->comm, TASK_COMM_LEN);
+ ktime_ns = bpf_ktime_get_ns();
+ sample->ktime = ktime_ns;
+
+ task = (struct task_struct *)bpf_get_current_task();
+ sample_end = read_strobe_meta(task, &sample->metadata);
+ sample->has_meta = sample_end != NULL;
+ /* without metadata the sample ends where the metadata field begins */
+ sample_end = sample_end ? : &sample->metadata;
+
+ /* one bit of the timestamp selects which stack table is used */
+ if ((ktime_ns >> STACK_TABLE_EPOCH_SHIFT) & 1) {
+ sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_1, 0);
+ sample->user_stack_id = bpf_get_stackid(ctx, &stacks_1, BPF_F_USER_STACK);
+ } else {
+ sample->kernel_stack_id = bpf_get_stackid(ctx, &stacks_0, 0);
+ sample->user_stack_id = bpf_get_stackid(ctx, &stacks_0, BPF_F_USER_STACK);
+ }
+
+ uint64_t sample_size = sample_end - (void *)sample;
+ /* should always be true */
+ if (sample_size < sizeof(struct strobelight_bpf_sample))
+ /* size is sample_size + 1; see dummy_safeguard in the sample struct */
+ bpf_perf_event_output(ctx, &samples, 0, sample, 1 + sample_size);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/strobemeta_nounroll1.c b/tools/testing/selftests/bpf/progs/strobemeta_nounroll1.c
new file mode 100644
index 000000000..f0a1669e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta_nounroll1.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2019 Facebook
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 13
+#define STROBE_MAX_MAP_ENTRIES 20
+#define NO_UNROLL
+#include "strobemeta.h"
diff --git a/tools/testing/selftests/bpf/progs/strobemeta_nounroll2.c b/tools/testing/selftests/bpf/progs/strobemeta_nounroll2.c
new file mode 100644
index 000000000..4291a7d64
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta_nounroll2.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2019 Facebook
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 30
+#define STROBE_MAX_MAP_ENTRIES 20
+#define NO_UNROLL
+#include "strobemeta.h"
diff --git a/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c
new file mode 100644
index 000000000..b6c01f8fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/strobemeta_subprogs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+// Copyright (c) 2019 Facebook
+
+#define STROBE_MAX_INTS 2
+#define STROBE_MAX_STRS 25
+#define STROBE_MAX_MAPS 13
+#define STROBE_MAX_MAP_ENTRIES 20
+#define NO_UNROLL
+#define SUBPROGS
+#include "strobemeta.h"
diff --git a/tools/testing/selftests/bpf/progs/tailcall1.c b/tools/testing/selftests/bpf/progs/tailcall1.c
new file mode 100644
index 000000000..7115bcefb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall1.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+#define TAIL_FUNC(x) \
+ SEC("classifier/" #x) \
+ int bpf_func_##x(struct __sk_buff *skb) \
+ { \
+ return x; \
+ }
+TAIL_FUNC(0)
+TAIL_FUNC(1)
+TAIL_FUNC(2)
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ /* Multiple locations to make sure we patch
+ * all of them.
+ */
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ bpf_tail_call_static(skb, &jmp_table, 0);
+
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ bpf_tail_call_static(skb, &jmp_table, 1);
+
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ bpf_tail_call_static(skb, &jmp_table, 2);
+
+ return 3;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall2.c b/tools/testing/selftests/bpf/progs/tailcall2.c
new file mode 100644
index 000000000..0431e4fe7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall2.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 5);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+SEC("classifier/0")
+int bpf_func_0(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ return 0;
+}
+
+SEC("classifier/1")
+int bpf_func_1(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ return 1;
+}
+
+SEC("classifier/2")
+int bpf_func_2(struct __sk_buff *skb)
+{
+ return 2;
+}
+
+SEC("classifier/3")
+int bpf_func_3(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 4);
+ return 3;
+}
+
+SEC("classifier/4")
+int bpf_func_4(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 3);
+ return 4;
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ /* Check multi-prog update. */
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ /* Check tail call limit. */
+ bpf_tail_call_static(skb, &jmp_table, 3);
+ return 3;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall3.c b/tools/testing/selftests/bpf/progs/tailcall3.c
new file mode 100644
index 000000000..739dc2a51
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall3.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+static volatile int count;
+
+/*
+ * Increments the file-level counter, then tail-calls slot 0 of jmp_table.
+ * The return statement is only reached if the tail call does not transfer
+ * control.
+ */
+SEC("classifier/0")
+int bpf_func_0(struct __sk_buff *skb)
+{
+ count++;
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ return 1;
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ return 0;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall4.c b/tools/testing/selftests/bpf/progs/tailcall4.c
new file mode 100644
index 000000000..f82075b47
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall4.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+static volatile int selector;
+
+#define TAIL_FUNC(x) \
+ SEC("classifier/" #x) \
+ int bpf_func_##x(struct __sk_buff *skb) \
+ { \
+ return x; \
+ }
+TAIL_FUNC(0)
+TAIL_FUNC(1)
+TAIL_FUNC(2)
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ bpf_tail_call(skb, &jmp_table, selector);
+ return 3;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall5.c b/tools/testing/selftests/bpf/progs/tailcall5.c
new file mode 100644
index 000000000..ce5450744
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall5.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+static volatile int selector;
+
+#define TAIL_FUNC(x) \
+ SEC("classifier/" #x) \
+ int bpf_func_##x(struct __sk_buff *skb) \
+ { \
+ return x; \
+ }
+TAIL_FUNC(0)
+TAIL_FUNC(1)
+TAIL_FUNC(2)
+
+/*
+ * Picks a jmp_table slot from the file-level selector (1234 -> 1, 5678 -> 2,
+ * anything else -> 0) and tail-calls it with a non-constant index. Returns 3
+ * if the tail call does not transfer control.
+ */
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ int idx = 0;
+
+ if (selector == 1234)
+ idx = 1;
+ else if (selector == 5678)
+ idx = 2;
+
+ bpf_tail_call(skb, &jmp_table, idx);
+ return 3;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c
new file mode 100644
index 000000000..0103f3dd9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf1.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+#define TAIL_FUNC(x) \
+ SEC("classifier/" #x) \
+ int bpf_func_##x(struct __sk_buff *skb) \
+ { \
+ return x; \
+ }
+TAIL_FUNC(0)
+TAIL_FUNC(1)
+
+static __noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 0);
+
+ return skb->len * 2;
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 1);
+
+ return subprog_tail(skb);
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c
new file mode 100644
index 000000000..7b1c04183
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf2.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+/*
+ * Tail-calls slot 1 of jmp_table when load_byte(skb, 0) is non-zero, slot 0
+ * otherwise; returns 1 if no tail call transfers control.
+ * NOTE(review): jmp_table in this file is declared with max_entries 1, so
+ * slot 1 lies outside the table - presumably deliberate; confirm against the
+ * user-space side of the test.
+ */
+static __noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+ if (load_byte(skb, 0))
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ else
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ return 1;
+}
+
+static volatile int count;
+
+SEC("classifier/0")
+int bpf_func_0(struct __sk_buff *skb)
+{
+ count++;
+ return subprog_tail(skb);
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 0);
+
+ return 0;
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c
new file mode 100644
index 000000000..0d5482bea
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf3.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+/*
+ * Tail-calls slot 10 of jmp_table when load_word(skb, 0) or load_half(skb, 0)
+ * is non-zero, slot 1 otherwise; returns skb->len if no tail call transfers
+ * control. The volatile 64-byte array is declared but not otherwise used.
+ * NOTE(review): jmp_table in this file is declared with max_entries 2, so
+ * slot 10 lies outside the table - presumably deliberate; confirm against the
+ * user-space side of the test.
+ */
+__noinline
+int subprog_tail2(struct __sk_buff *skb)
+{
+ volatile char arr[64] = {};
+
+ if (load_word(skb, 0) || load_half(skb, 0))
+ bpf_tail_call_static(skb, &jmp_table, 10);
+ else
+ bpf_tail_call_static(skb, &jmp_table, 1);
+
+ return skb->len;
+}
+
+static __noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+ volatile char arr[64] = {};
+
+ bpf_tail_call_static(skb, &jmp_table, 0);
+
+ return skb->len * 2;
+}
+
+SEC("classifier/0")
+int bpf_func_0(struct __sk_buff *skb)
+{
+ volatile char arr[128] = {};
+
+ return subprog_tail2(skb);
+}
+
+SEC("classifier/1")
+int bpf_func_1(struct __sk_buff *skb)
+{
+ volatile char arr[128] = {};
+
+ return skb->len * 3;
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ volatile char arr[128] = {};
+
+ return subprog_tail(skb);
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
new file mode 100644
index 000000000..9a1b166b7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tailcall_bpf2bpf4.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PROG_ARRAY);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} jmp_table SEC(".maps");
+
+static volatile int count;
+
+__noinline
+int subprog_tail_2(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 2);
+ return skb->len * 3;
+}
+
+__noinline
+int subprog_tail_1(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 1);
+ return skb->len * 2;
+}
+
+__noinline
+int subprog_tail(struct __sk_buff *skb)
+{
+ bpf_tail_call_static(skb, &jmp_table, 0);
+ return skb->len;
+}
+
+SEC("classifier/1")
+int bpf_func_1(struct __sk_buff *skb)
+{
+ return subprog_tail_2(skb);
+}
+
+SEC("classifier/2")
+int bpf_func_2(struct __sk_buff *skb)
+{
+ count++;
+ return subprog_tail_2(skb);
+}
+
+SEC("classifier/0")
+int bpf_func_0(struct __sk_buff *skb)
+{
+ return subprog_tail_1(skb);
+}
+
+SEC("classifier")
+int entry(struct __sk_buff *skb)
+{
+ return subprog_tail(skb);
+}
+
+char __license[] SEC("license") = "GPL";
+int _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/tcp_rtt.c b/tools/testing/selftests/bpf/progs/tcp_rtt.c
new file mode 100644
index 000000000..0cb3204dd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/tcp_rtt.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+struct tcp_rtt_storage {
+ __u32 invoked;
+ __u32 dsack_dups;
+ __u32 delivered;
+ __u32 delivered_ce;
+ __u32 icsk_retransmits;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct tcp_rtt_storage);
+} socket_storage_map SEC(".maps");
+
+/*
+ * sockops program that records TCP counters in per-socket storage.
+ *
+ * On BPF_SOCK_OPS_TCP_CONNECT_CB it sets BPF_SOCK_OPS_RTT_CB_FLAG on the
+ * socket; on BPF_SOCK_OPS_RTT_CB it increments storage->invoked and copies
+ * dsack_dups, delivered, delivered_ce and icsk_retransmits from the
+ * bpf_tcp_sock. Every path returns 1.
+ */
+SEC("sockops")
+int _sockops(struct bpf_sock_ops *ctx)
+{
+ struct tcp_rtt_storage *storage;
+ struct bpf_tcp_sock *tcp_sk;
+ int op = (int) ctx->op;
+ struct bpf_sock *sk;
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ /* storage is looked up (and created if missing) before op is examined */
+ storage = bpf_sk_storage_get(&socket_storage_map, sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ if (!storage)
+ return 1;
+
+ if (op == BPF_SOCK_OPS_TCP_CONNECT_CB) {
+ bpf_sock_ops_cb_flags_set(ctx, BPF_SOCK_OPS_RTT_CB_FLAG);
+ return 1;
+ }
+
+ if (op != BPF_SOCK_OPS_RTT_CB)
+ return 1;
+
+ tcp_sk = bpf_tcp_sock(sk);
+ if (!tcp_sk)
+ return 1;
+
+ storage->invoked++;
+
+ storage->dsack_dups = tcp_sk->dsack_dups;
+ storage->delivered = tcp_sk->delivered;
+ storage->delivered_ce = tcp_sk->delivered_ce;
+ storage->icsk_retransmits = tcp_sk->icsk_retransmits;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c
new file mode 100644
index 000000000..8056a4c6d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int kprobe_res = 0;
+int kretprobe_res = 0;
+int uprobe_res = 0;
+int uretprobe_res = 0;
+
+SEC("kprobe/sys_nanosleep")
+int handle_kprobe(struct pt_regs *ctx)
+{
+ kprobe_res = 1;
+ return 0;
+}
+
+SEC("kretprobe/sys_nanosleep")
+int BPF_KRETPROBE(handle_kretprobe)
+{
+ kretprobe_res = 2;
+ return 0;
+}
+
+SEC("uprobe/trigger_func")
+int handle_uprobe(struct pt_regs *ctx)
+{
+ uprobe_res = 3;
+ return 0;
+}
+
+SEC("uretprobe/trigger_func")
+int handle_uretprobe(struct pt_regs *ctx)
+{
+ uretprobe_res = 4;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_autoload.c b/tools/testing/selftests/bpf/progs/test_autoload.c
new file mode 100644
index 000000000..62c8cdec6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_autoload.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+bool prog1_called = false;
+bool prog2_called = false;
+bool prog3_called = false;
+
+SEC("raw_tp/sys_enter")
+int prog1(const void *ctx)
+{
+ prog1_called = true;
+ return 0;
+}
+
+SEC("raw_tp/sys_exit")
+int prog2(const void *ctx)
+{
+ prog2_called = true;
+ return 0;
+}
+
+struct fake_kernel_struct {
+ int whatever;
+} __attribute__((preserve_access_index));
+
+/*
+ * Writes 123 through ctx reinterpreted as struct fake_kernel_struct and sets
+ * prog3_called.
+ * NOTE(review): the section name says this program will fail if loaded, so it
+ * is presumably meant to be excluded from loading by the user-space test -
+ * confirm there.
+ */
+SEC("fentry/unexisting-kprobe-will-fail-if-loaded")
+int prog3(const void *ctx)
+{
+ struct fake_kernel_struct *fake = (void *)ctx;
+ fake->whatever = 123;
+ prog3_called = true;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_btf_haskv.c b/tools/testing/selftests/bpf/progs/test_btf_haskv.c
new file mode 100644
index 000000000..31538c9ed
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_haskv.c
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+/* Value emitted into the object's "version" section. */
+int _version SEC("version") = 1;
+
+/* Value type of btf_map: a pair of unsigned counters. */
+struct ipv_counts {
+ unsigned int v4;
+ unsigned int v6;
+};
+
+/* Legacy-style ("maps" section) array map with 4 entries, int keys and
+ * struct ipv_counts values.
+ */
+struct bpf_map_def SEC("maps") btf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct ipv_counts),
+ .max_entries = 4,
+};
+
+/* Attach key/value type information (int -> struct ipv_counts) to btf_map. */
+BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts);
+
+/* Bump the v6 counter of entry 0 in btf_map when that entry can be looked
+ * up. Always returns 0. Kept out of line via noinline.
+ */
+__attribute__((noinline))
+int test_long_fname_2(void)
+{
+	int idx = 0;
+	struct ipv_counts *entry = bpf_map_lookup_elem(&btf_map, &idx);
+
+	if (entry)
+		entry->v6++;
+
+	return 0;
+}
+
+/* Non-inlined wrapper that forwards to test_long_fname_2(). */
+__attribute__((noinline))
+int test_long_fname_1(void)
+{
+ return test_long_fname_2();
+}
+
+/* Program entry point in section "dummy_tracepoint"; returns whatever the
+ * test_long_fname_1() call chain returns.
+ */
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(void *arg)
+{
+ return test_long_fname_1();
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
new file mode 100644
index 000000000..c1e0c8c7c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_map_in_map.c
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Array map type with a single int -> int entry; instantiated twice as
+ * inner_map1 and inner_map2.
+ */
+struct inner_map {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} inner_map1 SEC(".maps"),
+ inner_map2 SEC(".maps");
+
+/* Array map with two int -> int entries. It is not referenced by any outer
+ * map initializer in this file.
+ */
+struct inner_map_sz2 {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, int);
+} inner_map_sz2 SEC(".maps");
+
+/* Array-of-maps with 3 slots whose inner map type is given as an anonymous
+ * struct. Slots 0 and 2 are statically initialized with inner_map1 and
+ * inner_map2; slot 1 is left empty.
+ */
+struct outer_arr {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ /* it's possible to use anonymous struct as inner map definition here */
+ __array(values, struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ /* changing max_entries to 2 will fail during load
+ * due to incompatibility with inner_map definition */
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+ });
+} outer_arr SEC(".maps") = {
+ /* (void *) cast is necessary because we didn't use `struct inner_map`
+ * in __array(values, ...)
+ * Actually, a conscious effort is required to screw up initialization
+ * of inner map slots, which is a great thing!
+ */
+ .values = { (void *)&inner_map1, 0, (void *)&inner_map2 },
+};
+
+/* Array map type with 3 int -> int entries and the BPF_F_INNER_MAP flag;
+ * instantiated as inner_map3 and inner_map4.
+ */
+struct inner_map_sz3 {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(map_flags, BPF_F_INNER_MAP);
+ __uint(max_entries, 3);
+ __type(key, int);
+ __type(value, int);
+} inner_map3 SEC(".maps"),
+ inner_map4 SEC(".maps");
+
+/* Array map with 5 int -> int entries and the BPF_F_INNER_MAP flag.
+ * NOTE(review): the struct tag says "sz4" but max_entries is 5.
+ */
+struct inner_map_sz4 {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(map_flags, BPF_F_INNER_MAP);
+ __uint(max_entries, 5);
+ __type(key, int);
+ __type(value, int);
+} inner_map5 SEC(".maps");
+
+/* Array-of-maps with 3 slots, statically filled with inner_map3,
+ * inner_map4 and inner_map5. The inner template declares max_entries 1
+ * while the maps placed in the slots declare 3 and 5; all of them carry
+ * BPF_F_INNER_MAP.
+ * NOTE(review): presumably that flag is what permits the differing
+ * sizes - confirm.
+ */
+struct outer_arr_dyn {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 3);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __array(values, struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(map_flags, BPF_F_INNER_MAP);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+ });
+} outer_arr_dyn SEC(".maps") = {
+ .values = {
+ [0] = (void *)&inner_map3,
+ [1] = (void *)&inner_map4,
+ [2] = (void *)&inner_map5,
+ },
+};
+
+/* Hash-of-maps with up to 5 entries whose inner type is struct inner_map.
+ * Keys 0 and 4 are statically initialized with inner_map2 and inner_map1.
+ */
+struct outer_hash {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 5);
+ __uint(key_size, sizeof(int));
+ /* Here everything works flawlessly due to reuse of struct inner_map
+ * and compiler will complain at the attempt to use non-inner_map
+ * references below. This is great experience.
+ */
+ __array(values, struct inner_map);
+} outer_hash SEC(".maps") = {
+ .values = {
+ [0] = &inner_map2,
+ [4] = &inner_map1,
+ },
+};
+
+/* REUSEPORT_SOCKARRAY map with one entry; used as the inner map of
+ * outer_sockarr.
+ */
+struct sockarr_sz1 {
+ __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sockarr_sz1 SEC(".maps");
+
+/* REUSEPORT_SOCKARRAY map with two entries. It is not referenced by any
+ * outer map initializer in this file.
+ */
+struct sockarr_sz2 {
+ __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, int);
+} sockarr_sz2 SEC(".maps");
+
+/* Array-of-maps with a single slot whose inner type is struct sockarr_sz1;
+ * the slot is statically initialized with sockarr_sz1.
+ */
+struct outer_sockarr_sz1 {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __array(values, struct sockarr_sz1);
+} outer_sockarr SEC(".maps") = {
+ .values = { (void *)&sockarr_sz1 },
+};
+
+/* Base value that handle__sys_enter() writes into the inner maps. */
+int input = 0;
+
+/* raw_tp/sys_enter handler: writes input, input + 1 and input + 2 into
+ * element 0 of the inner maps found at key 0 of outer_arr, outer_hash and
+ * outer_arr_dyn, in that order. Returns 1 as soon as one outer lookup
+ * fails and 0 otherwise. Results of bpf_map_update_elem() are not checked.
+ */
+SEC("raw_tp/sys_enter")
+int handle__sys_enter(void *ctx)
+{
+	struct inner_map *slot_map;
+	int idx = 0;
+	int payload;
+
+	/* outer_arr[0][0] = input */
+	slot_map = bpf_map_lookup_elem(&outer_arr, &idx);
+	if (!slot_map)
+		return 1;
+	payload = input;
+	bpf_map_update_elem(slot_map, &idx, &payload, 0);
+
+	/* outer_hash[0][0] = input + 1 */
+	slot_map = bpf_map_lookup_elem(&outer_hash, &idx);
+	if (!slot_map)
+		return 1;
+	payload = input + 1;
+	bpf_map_update_elem(slot_map, &idx, &payload, 0);
+
+	/* outer_arr_dyn[0][0] = input + 2 */
+	slot_map = bpf_map_lookup_elem(&outer_arr_dyn, &idx);
+	if (!slot_map)
+		return 1;
+	payload = input + 2;
+	bpf_map_update_elem(slot_map, &idx, &payload, 0);
+
+	return 0;
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_btf_newkv.c b/tools/testing/selftests/bpf/progs/test_btf_newkv.c
new file mode 100644
index 000000000..6c5560162
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_newkv.c
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include "bpf_legacy.h"
+
+/* Value emitted into the object's "version" section. */
+int _version SEC("version") = 1;
+
+/* Value type of btf_map, and the annotated value type of btf_map_legacy:
+ * a pair of unsigned counters.
+ */
+struct ipv_counts {
+ unsigned int v4;
+ unsigned int v6;
+};
+
+/* just to validate we can handle maps in multiple sections:
+ * this one lives in the legacy "maps" section, btf_map below in ".maps".
+ * value_size is written as sizeof(long long) while the annotation below
+ * names struct ipv_counts as the value type.
+ */
+struct bpf_map_def SEC("maps") btf_map_legacy = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(long long),
+ .max_entries = 4,
+};
+
+/* Attach key/value type information to btf_map_legacy. */
+BPF_ANNOTATE_KV_PAIR(btf_map_legacy, int, struct ipv_counts);
+
+/* BTF-defined (".maps" section) array map: 4 entries, int keys,
+ * struct ipv_counts values.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 4);
+ __type(key, int);
+ __type(value, struct ipv_counts);
+} btf_map SEC(".maps");
+
+/* Bump the v6 counter of entry 0 in btf_map; when that entry exists, also
+ * perform a lookup in btf_map_legacy so both maps are referenced.
+ * Always returns 0. Kept out of line via noinline.
+ */
+__attribute__((noinline))
+int test_long_fname_2(void)
+{
+	struct ipv_counts *entry;
+	int idx = 0;
+
+	entry = bpf_map_lookup_elem(&btf_map, &idx);
+	if (entry) {
+		entry->v6++;
+
+		/* just verify we can reference both maps */
+		entry = bpf_map_lookup_elem(&btf_map_legacy, &idx);
+	}
+
+	return 0;
+}
+
+/* Non-inlined wrapper that forwards to test_long_fname_2(). */
+__attribute__((noinline))
+int test_long_fname_1(void)
+{
+ return test_long_fname_2();
+}
+
+/* Program entry point in section "dummy_tracepoint"; returns whatever the
+ * test_long_fname_1() call chain returns.
+ */
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(void *arg)
+{
+ return test_long_fname_1();
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_btf_nokv.c b/tools/testing/selftests/bpf/progs/test_btf_nokv.c
new file mode 100644
index 000000000..506da7fd2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_nokv.c
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Value emitted into the object's "version" section. */
+int _version SEC("version") = 1;
+
+/* Value type of btf_map: a pair of unsigned counters. */
+struct ipv_counts {
+ unsigned int v4;
+ unsigned int v6;
+};
+
+/* Legacy-style ("maps" section) array map with 4 entries, int keys and
+ * struct ipv_counts values. Unlike the haskv variant of this test, no
+ * BPF_ANNOTATE_KV_PAIR() accompanies it in this file.
+ */
+struct bpf_map_def SEC("maps") btf_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(struct ipv_counts),
+ .max_entries = 4,
+};
+
+/* Bump the v6 counter of entry 0 in btf_map when that entry can be looked
+ * up. Always returns 0. Kept out of line via noinline.
+ */
+__attribute__((noinline))
+int test_long_fname_2(void)
+{
+	int idx = 0;
+	struct ipv_counts *entry = bpf_map_lookup_elem(&btf_map, &idx);
+
+	if (entry)
+		entry->v6++;
+
+	return 0;
+}
+
+/* Non-inlined wrapper that forwards to test_long_fname_2(). */
+__attribute__((noinline))
+int test_long_fname_1(void)
+{
+ return test_long_fname_2();
+}
+
+/* Program entry point in section "dummy_tracepoint"; returns whatever the
+ * test_long_fname_1() call chain returns.
+ */
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(void *arg)
+{
+ return test_long_fname_1();
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
new file mode 100644
index 000000000..9a6b85dd5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_btf_skc_cls_ingress.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <string.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/if_ether.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "bpf_tcp_helpers.h"
+
+/* Global state of the test program.
+ * srv_sa6.sin6_port is compared against the TCP destination port to pick
+ * out the test traffic; the other variables are written by the program:
+ * listen_tp_sport / req_sk_sport - skc_num of the matched socket
+ * gen_cookie / recv_cookie - results of the syncookie helpers
+ * linum - source line of the first logged failure
+ * NOTE(review): presumably srv_sa6 is filled in and the results are read
+ * by the user-space side of the test - confirm there.
+ */
+struct sockaddr_in6 srv_sa6 = {};
+__u16 listen_tp_sport = 0;
+__u16 req_sk_sport = 0;
+__u32 recv_cookie = 0;
+__u32 gen_cookie = 0;
+__u32 linum = 0;
+
+/* Remember the line of the first failure only: linum is left untouched
+ * once it is non-zero.
+ */
+#define LOG() ({ if (!linum) linum = __LINE__; })
+
+/* Exercise the syncookie helpers on a packet that matched a listening
+ * socket.
+ *
+ * SYN packet: requires a 40-byte TCP header that lies fully inside the
+ * packet, calls bpf_tcp_gen_syncookie() and stores a non-negative result
+ * in gen_cookie.
+ * Non-SYN packet while gen_cookie is set: calls bpf_tcp_check_syncookie()
+ * and, on a non-negative result, stores ntohl(ack_seq) - 1 in recv_cookie.
+ *
+ * -ENOENT from either helper is ignored silently; every other failure is
+ * recorded through LOG().
+ */
+static void test_syncookie_helper(struct ipv6hdr *ip6h, struct tcphdr *th,
+ struct tcp_sock *tp,
+ struct __sk_buff *skb)
+{
+ if (th->syn) {
+ __s64 mss_cookie;
+ void *data_end;
+
+ data_end = (void *)(long)(skb->data_end);
+
+ /* doff is in 32-bit words; only a 40-byte header is accepted */
+ if (th->doff * 4 != 40) {
+ LOG();
+ return;
+ }
+
+ /* bounds check covering the 40 bytes handed to the helper */
+ if ((void *)th + 40 > data_end) {
+ LOG();
+ return;
+ }
+
+ mss_cookie = bpf_tcp_gen_syncookie(tp, ip6h, sizeof(*ip6h),
+ th, 40);
+ if (mss_cookie < 0) {
+ if (mss_cookie != -ENOENT)
+ LOG();
+ } else {
+ /* keep only the low 32 bits of the helper's result */
+ gen_cookie = (__u32)mss_cookie;
+ }
+ } else if (gen_cookie) {
+ /* It was in cookie mode */
+ int ret = bpf_tcp_check_syncookie(tp, ip6h, sizeof(*ip6h),
+ th, sizeof(*th));
+
+ if (ret < 0) {
+ if (ret != -ENOENT)
+ LOG();
+ } else {
+ recv_cookie = bpf_ntohl(th->ack_seq) - 1;
+ }
+ }
+}
+
+/* Handle one IPv6/TCP packet of the test traffic.
+ *
+ * Packets whose destination port differs from srv_sa6.sin6_port are
+ * passed through untouched. Otherwise the socket is looked up with
+ * bpf_skc_lookup_tcp() using the address/port tuple that starts at
+ * ip6h->saddr, and depending on the socket state:
+ * - BPF_TCP_NEW_SYN_RECV: cast to a request sock, assign it to the skb
+ * and record its skc_num in req_sk_sport;
+ * - BPF_TCP_LISTEN: cast to a tcp_sock, assign it to the skb, record its
+ * skc_num in listen_tp_sport and run test_syncookie_helper();
+ * - anything else: assign the looked-up socket itself.
+ * Every path that obtained a socket releases it exactly once before
+ * returning. Failures are recorded through LOG(). Always returns
+ * TC_ACT_OK.
+ */
+static int handle_ip6_tcp(struct ipv6hdr *ip6h, struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple *tuple;
+ struct bpf_sock *bpf_skc;
+ unsigned int tuple_len;
+ struct tcphdr *th;
+ void *data_end;
+
+ data_end = (void *)(long)(skb->data_end);
+
+ /* the TCP header is taken to follow the fixed IPv6 header directly */
+ th = (struct tcphdr *)(ip6h + 1);
+ if (th + 1 > data_end)
+ return TC_ACT_OK;
+
+ /* Is it the testing traffic? */
+ if (th->dest != srv_sa6.sin6_port)
+ return TC_ACT_OK;
+
+ /* reinterpret the packet bytes starting at saddr as the lookup tuple */
+ tuple_len = sizeof(tuple->ipv6);
+ tuple = (struct bpf_sock_tuple *)&ip6h->saddr;
+ if ((void *)tuple + tuple_len > data_end) {
+ LOG();
+ return TC_ACT_OK;
+ }
+
+ bpf_skc = bpf_skc_lookup_tcp(skb, tuple, tuple_len,
+ BPF_F_CURRENT_NETNS, 0);
+ if (!bpf_skc) {
+ LOG();
+ return TC_ACT_OK;
+ }
+
+ if (bpf_skc->state == BPF_TCP_NEW_SYN_RECV) {
+ struct request_sock *req_sk;
+
+ req_sk = (struct request_sock *)bpf_skc_to_tcp_request_sock(bpf_skc);
+ if (!req_sk) {
+ LOG();
+ goto release;
+ }
+
+ if (bpf_sk_assign(skb, req_sk, 0)) {
+ LOG();
+ goto release;
+ }
+
+ req_sk_sport = req_sk->__req_common.skc_num;
+
+ /* released through the casted pointer instead of bpf_skc */
+ bpf_sk_release(req_sk);
+ return TC_ACT_OK;
+ } else if (bpf_skc->state == BPF_TCP_LISTEN) {
+ struct tcp_sock *tp;
+
+ tp = bpf_skc_to_tcp_sock(bpf_skc);
+ if (!tp) {
+ LOG();
+ goto release;
+ }
+
+ if (bpf_sk_assign(skb, tp, 0)) {
+ LOG();
+ goto release;
+ }
+
+ listen_tp_sport = tp->inet_conn.icsk_inet.sk.__sk_common.skc_num;
+
+ test_syncookie_helper(ip6h, th, tp, skb);
+ /* released through the casted pointer instead of bpf_skc */
+ bpf_sk_release(tp);
+ return TC_ACT_OK;
+ }
+
+ /* any other state: assign the looked-up socket as is */
+ if (bpf_sk_assign(skb, bpf_skc, 0))
+ LOG();
+
+release:
+ bpf_sk_release(bpf_skc);
+ return TC_ACT_OK;
+}
+
+/* TC ingress classifier entry point. Hands IPv6 packets whose next header
+ * is TCP to handle_ip6_tcp(); everything else, including packets too short
+ * to hold the Ethernet or IPv6 header, is passed with TC_ACT_OK.
+ */
+SEC("classifier/ingress")
+int cls_ingress(struct __sk_buff *skb)
+{
+	void *pkt_end = (void *)(long)(skb->data_end);
+	struct ethhdr *l2 = (struct ethhdr *)(long)(skb->data);
+	struct ipv6hdr *l3;
+
+	/* Ethernet header must be fully present and announce IPv6 */
+	if (l2 + 1 > pkt_end)
+		return TC_ACT_OK;
+	if (l2->h_proto != bpf_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	/* fixed IPv6 header must be fully present */
+	l3 = (struct ipv6hdr *)(l2 + 1);
+	if (l3 + 1 > pkt_end)
+		return TC_ACT_OK;
+
+	if (l3->nexthdr != IPPROTO_TCP)
+		return TC_ACT_OK;
+
+	return handle_ip6_tcp(l3, skb);
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_cgroup_link.c b/tools/testing/selftests/bpf/progs/test_cgroup_link.c
new file mode 100644
index 000000000..77e47b9e4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cgroup_link.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Invocation counters: calls is incremented by egress(), alt_calls by
+ * egress_alt().
+ */
+int calls = 0;
+int alt_calls = 0;
+
+/* cgroup_skb egress program: atomically increments calls and returns 1. */
+SEC("cgroup_skb/egress1")
+int egress(struct __sk_buff *skb)
+{
+ __sync_fetch_and_add(&calls, 1);
+ return 1;
+}
+
+/* Second cgroup_skb egress program: atomically increments alt_calls and
+ * returns 1.
+ */
+SEC("cgroup_skb/egress2")
+int egress_alt(struct __sk_buff *skb)
+{
+ __sync_fetch_and_add(&alt_calls, 1);
+ return 1;
+}
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "GPL";
+
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.c b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
new file mode 100644
index 000000000..c9f846499
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.c
@@ -0,0 +1,1068 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2019, 2020 Cloudflare
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/bpf.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "test_cls_redirect.h"
+
+/* INLINING selects how the helper functions in this file are compiled:
+ * __noinline when SUBPROGS is defined, __always_inline otherwise.
+ */
+#ifdef SUBPROGS
+#define INLINING __noinline
+#else
+#define INLINING __always_inline
+#endif
+
+/* Offset of the first byte after MEMBER within TYPE. */
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))
+
+/* Host-order masks for iphdr.frag_off; ipv4_is_fragment() applies them
+ * after converting with bpf_htons().
+ */
+#define IP_OFFSET_MASK (0x1FFF)
+#define IP_MF (0x2000)
+
+/* License string emitted into the object's "license" section. */
+char _license[] SEC("license") = "Dual BSD/GPL";
+
+/**
+ * Destination port and IP used for UDP encapsulation.
+ *
+ * Both are declared volatile const without an initializer.
+ * NOTE(review): presumably their values are supplied by the loader before
+ * the program is loaded - confirm against the user-space test.
+ */
+static volatile const __be16 ENCAPSULATION_PORT;
+static volatile const __be32 ENCAPSULATION_IP;
+
+/* Counters stored as the value of metrics_map. The first group counts
+ * processed, accepted and forwarded packets; the errors_total_* group
+ * counts the reasons for which a packet was rejected.
+ */
+typedef struct {
+ uint64_t processed_packets_total;
+ uint64_t l3_protocol_packets_total_ipv4;
+ uint64_t l3_protocol_packets_total_ipv6;
+ uint64_t l4_protocol_packets_total_tcp;
+ uint64_t l4_protocol_packets_total_udp;
+ uint64_t accepted_packets_total_syn;
+ uint64_t accepted_packets_total_syn_cookies;
+ uint64_t accepted_packets_total_last_hop;
+ uint64_t accepted_packets_total_icmp_echo_request;
+ uint64_t accepted_packets_total_established;
+ uint64_t forwarded_packets_total_gue;
+ uint64_t forwarded_packets_total_gre;
+
+ uint64_t errors_total_unknown_l3_proto;
+ uint64_t errors_total_unknown_l4_proto;
+ uint64_t errors_total_malformed_ip;
+ uint64_t errors_total_fragmented_ip;
+ uint64_t errors_total_malformed_icmp;
+ uint64_t errors_total_unwanted_icmp;
+ uint64_t errors_total_malformed_icmp_pkt_too_big;
+ uint64_t errors_total_malformed_tcp;
+ uint64_t errors_total_malformed_udp;
+ uint64_t errors_total_icmp_echo_replies;
+ uint64_t errors_total_malformed_encapsulation;
+ uint64_t errors_total_encap_adjust_failed;
+ uint64_t errors_total_encap_buffer_too_small;
+ uint64_t errors_total_redirect_loop;
+} metrics_t;
+
+/* Classification result for a packet, as returned by the classify_* and
+ * process_icmp* functions. INVALID is deliberately the zero value.
+ */
+typedef enum {
+ INVALID = 0,
+ UNKNOWN,
+ ECHO_REQUEST,
+ SYN,
+ SYN_COOKIE,
+ ESTABLISHED,
+} verdict_t;
+
+/* Source/destination port pair. process_icmpv4() overlays it onto the
+ * sport/dport fields of struct bpf_sock_tuple.
+ */
+typedef struct {
+ uint16_t src, dst;
+} flow_ports_t;
+
+/* flow_ports_t is overlaid onto the sport/dport pair of struct
+ * bpf_sock_tuple (see the cast in process_icmpv4()), so it must span
+ * exactly those two fields, for both the IPv4 and the IPv6 variant.
+ *
+ * The check compares for equality with the span from the start of sport
+ * to the end of dport; a "!= span - 1" comparison would accept every size
+ * except one and therefore would not enforce the stated requirement.
+ */
+_Static_assert(
+	sizeof(flow_ports_t) ==
+		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
+			offsetof(struct bpf_sock_tuple, ipv4.sport),
+	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+_Static_assert(
+	sizeof(flow_ports_t) ==
+		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
+			offsetof(struct bpf_sock_tuple, ipv6.sport),
+	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
+
+/* Return type of the packet handling functions: either a final action
+ * code or CONTINUE_PROCESSING.
+ */
+typedef int ret_t;
+
+/* This is a bit of a hack. We need a return value which allows us to
+ * indicate that the regular flow of the program should continue,
+ * while allowing functions to use XDP_PASS and XDP_DROP, etc.
+ * (The functions in this file return TC_ACT_* codes.)
+ */
+static const ret_t CONTINUE_PROCESSING = -1;
+
+/* Convenience macro to call functions which return ret_t.
+ * Evaluates x once and returns its value from the enclosing function
+ * unless that value is CONTINUE_PROCESSING.
+ */
+#define MAYBE_RETURN(x) \
+ do { \
+ ret_t __ret = x; \
+ if (__ret != CONTINUE_PROCESSING) \
+ return __ret; \
+ } while (0)
+
+/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
+ * or not aligned if the arch supports efficient unaligned access.
+ *
+ * Since the verifier ensures that eBPF packet accesses follow these rules,
+ * we can tell LLVM to emit code as if we always had a larger alignment.
+ * It will yell at us if we end up on a platform where this is not valid.
+ */
+typedef uint8_t *net_ptr __attribute__((align_value(8)));
+
+/* Cursor over the packet data of an skb: head is the current read
+ * position, tail the end of the directly accessible data.
+ */
+typedef struct buf {
+ struct __sk_buff *skb;
+ net_ptr head;
+ /* NB: tail mustn't have alignment other than 1, otherwise
+ * LLVM will go and eliminate code, e.g. when checking packet lengths.
+ */
+ uint8_t *const tail;
+} buf_t;
+
+/* Return the offset of buf->head relative to buf->skb->data. */
+static __always_inline size_t buf_off(const buf_t *buf)
+{
+ /* Clang seems to optimize constructs like
+ * a - b + c
+ * if c is known:
+ * r? = c
+ * r? -= b
+ * r? += a
+ *
+ * This is a problem if a and b are packet pointers,
+ * since the verifier allows subtracting two pointers to
+ * get a scalar, but not a scalar and a pointer.
+ *
+ * Use inline asm to break this optimization.
+ */
+ size_t off = (size_t)buf->head;
+ /* off = head - data, done as one opaque subtraction */
+ asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
+ return off;
+}
+
+/* Copy len bytes at the current position of buf into dst using
+ * bpf_skb_load_bytes(). On success the position is advanced by len and
+ * true is returned; on failure buf is left untouched and false is returned.
+ */
+static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
+{
+	const bool loaded =
+		bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len) == 0;
+
+	if (loaded)
+		buf->head += len;
+
+	return loaded;
+}
+
+/* Advance buf by len bytes without reading them. Returns false, leaving
+ * buf untouched, when the new position would lie beyond skb->len.
+ */
+static __always_inline bool buf_skip(buf_t *buf, const size_t len)
+{
+	/* Check whether off + len is valid in the non-linear part. */
+	const bool in_bounds = buf_off(buf) + len <= buf->skb->len;
+
+	if (in_bounds)
+		buf->head += len;
+
+	return in_bounds;
+}
+
+/* Returns a pointer to the start of buf, or NULL if len is
+ * larger than the remaining data. Consumes len bytes on a successful
+ * call.
+ *
+ * If scratch is not NULL, the function will attempt to load non-linear
+ * data via bpf_skb_load_bytes. On success, scratch is returned.
+ */
+static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
+{
+ /* requested span does not fit between head and tail */
+ if (buf->head + len > buf->tail) {
+ if (scratch == NULL) {
+ return NULL;
+ }
+
+ /* buf_copy() advances head itself when it succeeds */
+ return buf_copy(buf, scratch, len) ? scratch : NULL;
+ }
+
+ /* span is directly accessible: hand out a pointer into the packet */
+ void *ptr = buf->head;
+ buf->head += len;
+ return ptr;
+}
+
+/* Skip the IPv4 options that follow the fixed header, i.e. (ihl - 5) * 4
+ * bytes. Returns true when there is nothing to skip or the skip succeeded,
+ * false when buf_skip() fails.
+ */
+static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
+{
+	if (ipv4->ihl > 5)
+		return buf_skip(buf, (ipv4->ihl - 5) * 4);
+
+	return true;
+}
+
+/* Return true when the header has the IP_MF bit set or carries a non-zero
+ * fragment offset (the IP_OFFSET_MASK bits of frag_off).
+ */
+static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
+{
+	const uint16_t raw = ip->frag_off;
+
+	if ((raw & bpf_htons(IP_MF)) != 0)
+		return true;
+
+	return (raw & bpf_htons(IP_OFFSET_MASK)) != 0;
+}
+
+/* Parse an IPv4 header at the current position of pkt and skip its
+ * options. scratch is used as backing storage when the header is not
+ * directly accessible (see buf_assign()).
+ *
+ * Returns the header, or NULL when it cannot be obtained, when ihl is
+ * below 5, or when the options cannot be skipped.
+ */
+static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
+{
+	struct iphdr *hdr = buf_assign(pkt, sizeof(*hdr), scratch);
+
+	if (hdr == NULL || hdr->ihl < 5)
+		return NULL;
+
+	return pkt_skip_ipv4_options(pkt, hdr) ? hdr : NULL;
+}
+
+/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP.
+ * Returns false when the ports cannot be copied out of pkt; otherwise
+ * *ports holds the pair with source and destination exchanged.
+ */
+static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
+{
+	uint16_t wire_dst;
+
+	if (!buf_copy(pkt, ports, sizeof(*ports)))
+		return false;
+
+	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
+	 * payload which is going towards the eyeball.
+	 */
+	wire_dst = ports->dst;
+	ports->dst = ports->src;
+	ports->src = wire_dst;
+
+	return true;
+}
+
+/* Fold a 32-bit checksum accumulator into 16 bits and return its
+ * one's complement.
+ */
+static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
+{
+	/* The highest reasonable value for an IPv4 header
+	 * checksum requires two folds, so we just do that always.
+	 */
+	const uint32_t once = (csum >> 16) + (csum & 0xffff);
+	const uint32_t twice = (once >> 16) + (once & 0xffff);
+
+	return (uint16_t)~twice;
+}
+
+/* Recompute the checksum of an option-less IPv4 header in place: the check
+ * field is zeroed, the header is summed as 16-bit words and the folded
+ * result is stored back into iph->check.
+ */
+static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
+{
+ iph->check = 0;
+
+ /* An IP header without options is 20 bytes. Two of those
+ * are the checksum, which we always set to zero. Hence,
+ * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
+ * which fits in 32 bit.
+ */
+ _Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
+ uint32_t acc = 0;
+ uint16_t *ipw = (uint16_t *)iph;
+
+ /* fixed trip count (10 words), fully unrolled by the compiler */
+#pragma clang loop unroll(full)
+ for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
+ acc += ipw[i];
+ }
+
+ iph->check = pkt_checksum_fold(acc);
+}
+
+/* Walk the IPv6 extension header chain, starting from ipv6->nexthdr.
+ *
+ * On success returns true with *upper_proto set to the first next-header
+ * value that is not one of the extension headers handled below, and with
+ * pkt advanced past every skipped header. *is_fragment is set to true if
+ * a fragment header was encountered, false otherwise.
+ *
+ * Returns false if a header cannot be read or skipped, or if no upper
+ * layer header shows up within six extension headers.
+ */
+static INLINING
+bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
+ const struct ipv6hdr *ipv6,
+ uint8_t *upper_proto,
+ bool *is_fragment)
+{
+ /* We understand five extension headers.
+ * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
+ * headers should occur once, except Destination Options, which may
+ * occur twice. Hence we give up after 6 headers.
+ */
+ struct {
+ uint8_t next;
+ uint8_t len;
+ } exthdr = {
+ .next = ipv6->nexthdr,
+ };
+ *is_fragment = false;
+
+#pragma clang loop unroll(full)
+ for (int i = 0; i < 6; i++) {
+ switch (exthdr.next) {
+ case IPPROTO_FRAGMENT:
+ *is_fragment = true;
+ /* NB: We don't check that hdrlen == 0 as per spec. */
+ /* fallthrough; */
+
+ case IPPROTO_HOPOPTS:
+ case IPPROTO_ROUTING:
+ case IPPROTO_DSTOPTS:
+ case IPPROTO_MH:
+ /* read the (next header, length) pair that starts the header */
+ if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
+ return false;
+ }
+
+ /* hdrlen is in 8-octet units, and excludes the first 8 octets. */
+ /* the two bytes just copied were already consumed, hence the
+ * subtraction of sizeof(exthdr)
+ */
+ if (!buf_skip(pkt,
+ (exthdr.len + 1) * 8 - sizeof(exthdr))) {
+ return false;
+ }
+
+ /* Decode next header */
+ break;
+
+ default:
+ /* The next header is not one of the known extension
+ * headers, treat it as the upper layer header.
+ *
+ * This handles IPPROTO_NONE.
+ *
+ * Encapsulating Security Payload (50) and Authentication
+ * Header (51) also end up here (and will trigger an
+ * unknown proto error later). They have a custom header
+ * format and seem too esoteric to care about.
+ */
+ *upper_proto = exthdr.next;
+ return true;
+ }
+ }
+
+ /* We never found an upper layer header. */
+ return false;
+}
+
+/* Parse an IPv6 header at the current position of pkt and skip its
+ * extension headers. On success returns the header, with *proto set to the
+ * upper layer protocol and *is_fragment telling whether a fragment header
+ * was seen; returns NULL when the header or its extension chain cannot be
+ * parsed.
+ *
+ * This function has to be inlined, because the verifier otherwise rejects it
+ * due to returning a pointer to the stack. This is technically correct, since
+ * scratch is allocated on the stack. However, this usage should be safe since
+ * it's the callers stack after all.
+ */
+static __always_inline struct ipv6hdr *
+pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
+	       bool *is_fragment)
+{
+	struct ipv6hdr *hdr = buf_assign(pkt, sizeof(*hdr), scratch);
+
+	if (hdr == NULL)
+		return NULL;
+
+	return pkt_skip_ipv6_extension_headers(pkt, hdr, proto, is_fragment) ?
+		       hdr : NULL;
+}
+
+/* Global metrics, per CPU: a single-entry per-CPU array keyed by an
+ * unsigned int, holding one metrics_t.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, unsigned int);
+ __type(value, metrics_t);
+} metrics_map SEC(".maps");
+
+/* Look up the single metrics_t entry (key 0) of metrics_map.
+ * Returns the pointer produced by bpf_map_lookup_elem(), which is NULL
+ * when the lookup fails.
+ */
+static INLINING metrics_t *get_global_metrics(void)
+{
+	/* metrics_map declares its key as unsigned int, so the lookup key
+	 * uses that same type rather than a wider integer.
+	 */
+	unsigned int key = 0;
+
+	return bpf_map_lookup_elem(&metrics_map, &key);
+}
+
+/* Strip the encapsulation and feed the inner packet back into the ingress
+ * path of the interface it arrived on.
+ *
+ * The packet is shrunk at the MAC layer by the size of the encapsulation
+ * headers plus the hop list, minus the Ethernet header that stays in
+ * place. Returns TC_ACT_SHOT if the room adjustment or the checksum level
+ * update fails, otherwise the result of bpf_redirect().
+ */
+static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
+{
+ /* offset of the inner packet: fixed headers + one in_addr per hop */
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
+
+ // Changing the ethertype if the encapsulated packet is ipv6
+ if (encap->gue.proto_ctype == IPPROTO_IPV6) {
+ encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
+ }
+
+ /* negative length: remove encap_overhead bytes */
+ if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
+ BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
+ bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
+ return TC_ACT_SHOT;
+
+ return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
+}
+
+/* Forward the inner packet to next_hop, re-encapsulated in GRE.
+ *
+ * Steps, in order:
+ * 1. decrement the TTL / hop limit of the inner packet, dropping it when
+ * it is already 0 (loop protection);
+ * 2. resize the space between the outer IPv4 header and the inner packet
+ * so that it holds exactly a GRE base header;
+ * 3. rewrite the outer IPv4 header (protocol, addresses, total length,
+ * checksum) and fill in the GRE header;
+ * 4. redirect the packet out of the interface it arrived on.
+ *
+ * Every failure increments the matching metrics counter and returns
+ * TC_ACT_SHOT; on success the result of bpf_redirect() is returned.
+ */
+static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
+ struct in_addr *next_hop, metrics_t *metrics)
+{
+ metrics->forwarded_packets_total_gre++;
+
+ /* offset of the inner packet: fixed headers + one in_addr per hop */
+ const int payload_off =
+ sizeof(*encap) +
+ sizeof(struct in_addr) * encap->unigue.hop_count;
+ /* bytes currently sitting between the outer IPv4 header and the
+ * inner packet
+ */
+ int32_t encap_overhead =
+ payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
+ /* size change needed so that only a GRE base header remains there */
+ int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
+ uint16_t proto = ETH_P_IP;
+
+ /* Loop protection: the inner packet's TTL is decremented as a safeguard
+ * against any forwarding loop. As the only interesting field is the TTL
+ * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
+ * as they handle the split packets if needed (no need for the data to be
+ * in the linear section).
+ */
+ if (encap->gue.proto_ctype == IPPROTO_IPV6) {
+ proto = ETH_P_IPV6;
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
+ &ttl, 1, 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ } else {
+ uint8_t ttl;
+ int rc;
+
+ rc = bpf_skb_load_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
+ 1);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ if (ttl == 0) {
+ metrics->errors_total_redirect_loop++;
+ return TC_ACT_SHOT;
+ }
+
+ /* IPv4 also has a checksum to patch. While the TTL is only one byte,
+ * this function only works for 2 and 4 bytes arguments (the result is
+ * the same).
+ */
+ /* NOTE(review): passing the bare byte values as 2-byte words
+ * presumably relies on the byte order of the target - confirm
+ * before reusing this on a big-endian platform.
+ */
+ rc = bpf_l3_csum_replace(
+ skb, payload_off + offsetof(struct iphdr, check), ttl,
+ ttl - 1, 2);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+
+ ttl--;
+ rc = bpf_skb_store_bytes(
+ skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
+ 0);
+ if (rc != 0) {
+ metrics->errors_total_malformed_encapsulation++;
+ return TC_ACT_SHOT;
+ }
+ }
+
+ if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
+ BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
+ bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
+ metrics->errors_total_encap_adjust_failed++;
+ return TC_ACT_SHOT;
+ }
+
+ /* make sure the new outer headers are directly accessible */
+ if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ /* packet pointers are re-read from skb after the helpers above */
+ buf_t pkt = {
+ .skb = skb,
+ .head = (uint8_t *)(long)skb->data,
+ .tail = (uint8_t *)(long)skb->data_end,
+ };
+
+ /* no scratch buffer: the headers are written in place */
+ encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
+ if (encap_gre == NULL) {
+ metrics->errors_total_encap_buffer_too_small++;
+ return TC_ACT_SHOT;
+ }
+
+ encap_gre->ip.protocol = IPPROTO_GRE;
+ encap_gre->ip.daddr = next_hop->s_addr;
+ encap_gre->ip.saddr = ENCAPSULATION_IP;
+ encap_gre->ip.tot_len =
+ bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
+ encap_gre->gre.flags = 0;
+ encap_gre->gre.protocol = bpf_htons(proto);
+ pkt_ipv4_checksum((void *)&encap_gre->ip);
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+/* Send the encapsulated packet on to next_hop.
+ *
+ * The Ethernet addresses are swapped first. If the hop being used is the
+ * last one in the list and last_hop_gre is set, the packet is handed to
+ * forward_with_gre(). Otherwise the outer IPv4 addresses are rewritten
+ * (saddr = old daddr, daddr = next_hop), the next_hop index is advanced,
+ * the IPv4 checksum is patched, and the packet is redirected out of the
+ * interface it arrived on.
+ *
+ * Returns TC_ACT_SHOT when the checksum patch fails, otherwise the result
+ * of forward_with_gre() or bpf_redirect().
+ */
+static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
+ struct in_addr *next_hop, metrics_t *metrics)
+{
+ /* swap L2 addresses */
+ /* This assumes that packets are received from a router.
+ * So just swapping the MAC addresses here will make the packet go back to
+ * the router, which will send it to the appropriate machine.
+ */
+ unsigned char temp[ETH_ALEN];
+ memcpy(temp, encap->eth.h_dest, sizeof(temp));
+ memcpy(encap->eth.h_dest, encap->eth.h_source,
+ sizeof(encap->eth.h_dest));
+ memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
+ encap->unigue.last_hop_gre) {
+ return forward_with_gre(skb, encap, next_hop, metrics);
+ }
+
+ metrics->forwarded_packets_total_gue++;
+ uint32_t old_saddr = encap->ip.saddr;
+ encap->ip.saddr = encap->ip.daddr;
+ encap->ip.daddr = next_hop->s_addr;
+ if (encap->unigue.next_hop < encap->unigue.hop_count) {
+ encap->unigue.next_hop++;
+ }
+
+ /* Remove ip->saddr, add next_hop->s_addr */
+ const uint64_t off = offsetof(typeof(*encap), ip.check);
+ int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
+ if (ret < 0) {
+ return TC_ACT_SHOT;
+ }
+
+ return bpf_redirect(skb->ifindex, 0);
+}
+
+/* Skip n next-hop entries (struct in_addr each) in pkt. Only n == 0 and
+ * n == 1 are supported: any other value, or a failed skip, yields
+ * TC_ACT_SHOT. Returns CONTINUE_PROCESSING on success.
+ */
+static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
+{
+ switch (n) {
+ case 1:
+ if (!buf_skip(pkt, sizeof(struct in_addr)))
+ return TC_ACT_SHOT;
+ /* fallthrough: one entry skipped, nothing more to do */
+ case 0:
+ return CONTINUE_PROCESSING;
+
+ default:
+ return TC_ACT_SHOT;
+ }
+}
+
+/* Get the next hop from the GLB header.
+ *
+ * Sets next_hop->s_addr to 0 if there are no more hops left.
+ * pkt is positioned just after the variable length GLB header
+ * iff the call is successful.
+ *
+ * Returns CONTINUE_PROCESSING on success and TC_ACT_SHOT when the hop
+ * index lies beyond hop_count or the hop list cannot be read or skipped.
+ */
+static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
+ struct in_addr *next_hop)
+{
+ if (encap->unigue.next_hop > encap->unigue.hop_count) {
+ return TC_ACT_SHOT;
+ }
+
+ /* Skip "used" next hops. */
+ MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
+
+ if (encap->unigue.next_hop == encap->unigue.hop_count) {
+ /* No more next hops, we are at the end of the GLB header. */
+ next_hop->s_addr = 0;
+ return CONTINUE_PROCESSING;
+ }
+
+ if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
+ return TC_ACT_SHOT;
+ }
+
+ /* Skip the remaining next hops (may be zero). */
+ return skip_next_hops(pkt, encap->unigue.hop_count -
+ encap->unigue.next_hop - 1);
+}
+
+/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
+ * This is a kludge that lets us work around verifier limitations:
+ *
+ * fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
+ *
+ * clang will substitute a constant for sizeof, which allows the verifier
+ * to track its value. Based on this, it can figure out the constant
+ * return value, and calling code works while still being "generic" to
+ * IPv4 and IPv6.
+ *
+ * iphlen selects the address family: sizeof(struct iphdr) fills
+ * tuple->ipv4, sizeof(struct ipv6hdr) fills tuple->ipv6. Addresses are
+ * copied from iph, ports from sport/dport as given.
+ *
+ * Returns the size of the tuple variant that was filled, or 0 when iphlen
+ * matches neither header size.
+ */
+static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
+ uint64_t iphlen, uint16_t sport, uint16_t dport)
+{
+ switch (iphlen) {
+ case sizeof(struct iphdr): {
+ struct iphdr *ipv4 = (struct iphdr *)iph;
+ tuple->ipv4.daddr = ipv4->daddr;
+ tuple->ipv4.saddr = ipv4->saddr;
+ tuple->ipv4.sport = sport;
+ tuple->ipv4.dport = dport;
+ return sizeof(tuple->ipv4);
+ }
+
+ case sizeof(struct ipv6hdr): {
+ struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
+ memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
+ sizeof(tuple->ipv6.daddr));
+ memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
+ sizeof(tuple->ipv6.saddr));
+ tuple->ipv6.sport = sport;
+ tuple->ipv6.dport = dport;
+ return sizeof(tuple->ipv6);
+ }
+
+ default:
+ return 0;
+ }
+}
+
+/* Classify a TCP flow by looking up its socket.
+ *
+ * Returns:
+ * UNKNOWN - no socket was found, or a listening socket was found and
+ * the syncookie check did not succeed (or was skipped
+ * because iph or tcp is NULL);
+ * ESTABLISHED - a socket in any state other than BPF_TCP_LISTEN was found;
+ * SYN_COOKIE - a listening socket was found and
+ * bpf_tcp_check_syncookie() returned 0.
+ *
+ * The looked-up socket is released on every path before returning.
+ */
+static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
+ struct bpf_sock_tuple *tuple, uint64_t tuplen,
+ void *iph, struct tcphdr *tcp)
+{
+ struct bpf_sock *sk =
+ bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+ if (sk == NULL) {
+ return UNKNOWN;
+ }
+
+ if (sk->state != BPF_TCP_LISTEN) {
+ bpf_sk_release(sk);
+ return ESTABLISHED;
+ }
+
+ if (iph != NULL && tcp != NULL) {
+ /* Kludge: we've run out of arguments, but need the length of the ip header. */
+ /* derive it from the tuple size: the ipv6 tuple implies an ipv6 header */
+ uint64_t iphlen = sizeof(struct iphdr);
+ if (tuplen == sizeof(tuple->ipv6)) {
+ iphlen = sizeof(struct ipv6hdr);
+ }
+
+ if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
+ sizeof(*tcp)) == 0) {
+ bpf_sk_release(sk);
+ return SYN_COOKIE;
+ }
+ }
+
+ bpf_sk_release(sk);
+ return UNKNOWN;
+}
+
+/* Classify a UDP flow by looking up its socket. Returns ESTABLISHED when a
+ * socket in state BPF_TCP_ESTABLISHED is found and UNKNOWN otherwise. The
+ * looked-up socket is released before returning.
+ */
+static INLINING verdict_t classify_udp(struct __sk_buff *skb,
+				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
+{
+	verdict_t verdict = UNKNOWN;
+	struct bpf_sock *sk;
+
+	sk = bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
+	if (sk == NULL)
+		return UNKNOWN;
+
+	if (sk->state == BPF_TCP_ESTABLISHED)
+		verdict = ESTABLISHED;
+
+	bpf_sk_release(sk);
+	return verdict;
+}
+
+static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
+ struct bpf_sock_tuple *tuple, uint64_t tuplen,
+ metrics_t *metrics)
+{
+ switch (proto) {
+ case IPPROTO_TCP:
+ return classify_tcp(skb, tuple, tuplen, NULL, NULL);
+
+ case IPPROTO_UDP:
+ return classify_udp(skb, tuple, tuplen);
+
+ default:
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+}
+
+/* Classify an ICMPv4 message found at the current position of pkt.
+ *
+ * Echo requests yield ECHO_REQUEST. Echo replies and every type/code
+ * other than "destination unreachable / fragmentation needed" are counted
+ * and rejected with INVALID. For fragmentation-needed messages the quoted
+ * IPv4 header and L4 ports are parsed, turned back into the tuple of the
+ * original flow, and classified through classify_icmp().
+ */
+static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
+{
+ struct icmphdr icmp;
+ if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp.type == ICMP_ECHOREPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp.type == ICMP_ECHO) {
+ return ECHO_REQUEST;
+ }
+
+ if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ /* header of the packet quoted in the ICMP payload */
+ struct iphdr _ip4;
+ const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
+ if (ipv4 == NULL) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ /* The source address in the outer IP header is from the entity that
+ * originated the ICMP message. Use the original IP header to restore
+ * the correct flow tuple.
+ */
+ struct bpf_sock_tuple tuple;
+ tuple.ipv4.saddr = ipv4->daddr;
+ tuple.ipv4.daddr = ipv4->saddr;
+
+ /* sport/dport of the tuple are filled through the flow_ports_t overlay */
+ if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
+ sizeof(tuple.ipv4), metrics);
+}
+
+/* Handle an ICMPv6 message found at the current position of pkt.
+ *
+ * Verdicts:
+ *   - INVALID for a truncated header, an echo reply, any message other than
+ *     echo request / "packet too big", or a quoted packet that is malformed
+ *     or fragmented (the matching error counter in metrics is bumped);
+ *   - ECHO_REQUEST for an echo request;
+ *   - otherwise the verdict of classify_icmp() for the flow described by the
+ *     IPv6 header and L4 ports embedded in the ICMPv6 payload.
+ *
+ * NOTE(review): the helpers below are presumed to consume bytes from pkt as
+ * they parse, so their call order matters -- confirm against buf_copy() and
+ * pkt_parse_ipv6().
+ */
+static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
+{
+ struct icmp6hdr icmp6;
+ if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
+ metrics->errors_total_malformed_icmp++;
+ return INVALID;
+ }
+
+ /* We should never receive encapsulated echo replies. */
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
+ metrics->errors_total_icmp_echo_replies++;
+ return INVALID;
+ }
+
+ if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
+ return ECHO_REQUEST;
+ }
+
+ /* Only "packet too big" errors are of interest from here on. */
+ if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
+ metrics->errors_total_unwanted_icmp++;
+ return INVALID;
+ }
+
+ /* Parse the IPv6 header of the original packet quoted in the payload;
+ * l4_proto and is_fragment are filled in by pkt_parse_ipv6().
+ */
+ bool is_fragment;
+ uint8_t l4_proto;
+ struct ipv6hdr _ipv6;
+ const struct ipv6hdr *ipv6 =
+ pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
+ if (ipv6 == NULL) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ if (is_fragment) {
+ metrics->errors_total_fragmented_ip++;
+ return INVALID;
+ }
+
+ /* Swap source and dest addresses. */
+ struct bpf_sock_tuple tuple;
+ memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
+ memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));
+
+ /* The ports are written in place starting at tuple.ipv6.sport. */
+ if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
+ metrics->errors_total_malformed_icmp_pkt_too_big++;
+ return INVALID;
+ }
+
+ return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
+ metrics);
+}
+
+/* Handle the TCP header at the current position of pkt.
+ *
+ * Counts the packet as TCP, then returns INVALID (and bumps the malformed-TCP
+ * counter) when the header cannot be obtained, SYN when the SYN flag is set,
+ * and otherwise the verdict of classify_tcp() for the flow built from the
+ * given IP header and the TCP source/destination ports.
+ */
+static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
+				      metrics_t *metrics)
+{
+	struct bpf_sock_tuple flow;
+	struct tcphdr scratch;
+	struct tcphdr *th;
+	uint64_t flow_len;
+
+	metrics->l4_protocol_packets_total_tcp++;
+
+	th = buf_assign(pkt, sizeof(scratch), &scratch);
+	if (!th) {
+		metrics->errors_total_malformed_tcp++;
+		return INVALID;
+	}
+
+	if (th->syn)
+		return SYN;
+
+	flow_len = fill_tuple(&flow, iph, iphlen, th->source, th->dest);
+	return classify_tcp(pkt->skb, &flow, flow_len, iph, th);
+}
+
+/* Handle the UDP header at the current position of pkt.
+ *
+ * Counts the packet as UDP, then returns INVALID (and bumps the malformed-UDP
+ * counter) when the header cannot be obtained, and otherwise the verdict of
+ * classify_udp() for the flow built from the given IP header and the UDP
+ * source/destination ports.
+ */
+static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
+				      metrics_t *metrics)
+{
+	struct bpf_sock_tuple flow;
+	struct udphdr scratch;
+	struct udphdr *uh;
+	uint64_t flow_len;
+
+	metrics->l4_protocol_packets_total_udp++;
+
+	uh = buf_assign(pkt, sizeof(scratch), &scratch);
+	if (!uh) {
+		metrics->errors_total_malformed_udp++;
+		return INVALID;
+	}
+
+	flow_len = fill_tuple(&flow, iph, iphlen, uh->source, uh->dest);
+	return classify_udp(pkt->skb, &flow, flow_len);
+}
+
+/* Handle the inner IPv4 packet at the current position of pkt.
+ *
+ * Counts the packet as IPv4 and rejects it as INVALID when the header cannot
+ * be parsed or its version field is not 4 (malformed-IP counter), when it is
+ * a fragment (fragmented-IP counter), or when the L4 protocol is not ICMP,
+ * TCP or UDP (unknown-L4 counter). Otherwise the verdict comes from the
+ * per-protocol handler.
+ */
+static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
+{
+	struct iphdr scratch;
+	struct iphdr *hdr;
+
+	metrics->l3_protocol_packets_total_ipv4++;
+
+	hdr = pkt_parse_ipv4(pkt, &scratch);
+	if (!hdr || hdr->version != 4) {
+		metrics->errors_total_malformed_ip++;
+		return INVALID;
+	}
+
+	if (ipv4_is_fragment(hdr)) {
+		metrics->errors_total_fragmented_ip++;
+		return INVALID;
+	}
+
+	switch (hdr->protocol) {
+	case IPPROTO_ICMP:
+		return process_icmpv4(pkt, metrics);
+	case IPPROTO_TCP:
+		return process_tcp(pkt, hdr, sizeof(*hdr), metrics);
+	case IPPROTO_UDP:
+		return process_udp(pkt, hdr, sizeof(*hdr), metrics);
+	default:
+		break;
+	}
+
+	metrics->errors_total_unknown_l4_proto++;
+	return INVALID;
+}
+
+/* Handle the inner IPv6 packet at the current position of pkt.
+ *
+ * Counts the packet as IPv6 and rejects it as INVALID when the header cannot
+ * be parsed or its version field is not 6 (malformed-IP counter), when
+ * pkt_parse_ipv6() flags it as a fragment (fragmented-IP counter), or when
+ * the L4 protocol is not ICMPv6, TCP or UDP (unknown-L4 counter). Otherwise
+ * the verdict comes from the per-protocol handler.
+ */
+static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
+{
+	struct ipv6hdr scratch;
+	struct ipv6hdr *hdr;
+	bool fragmented;
+	uint8_t next_proto;
+
+	metrics->l3_protocol_packets_total_ipv6++;
+
+	hdr = pkt_parse_ipv6(pkt, &scratch, &next_proto, &fragmented);
+	if (!hdr || hdr->version != 6) {
+		metrics->errors_total_malformed_ip++;
+		return INVALID;
+	}
+
+	if (fragmented) {
+		metrics->errors_total_fragmented_ip++;
+		return INVALID;
+	}
+
+	switch (next_proto) {
+	case IPPROTO_ICMPV6:
+		return process_icmpv6(pkt, metrics);
+	case IPPROTO_TCP:
+		return process_tcp(pkt, hdr, sizeof(*hdr), metrics);
+	case IPPROTO_UDP:
+		return process_udp(pkt, hdr, sizeof(*hdr), metrics);
+	default:
+		break;
+	}
+
+	metrics->errors_total_unknown_l4_proto++;
+	return INVALID;
+}
+
+/* TC classifier entry point.
+ *
+ * Packets that are not IPv4/UDP addressed to ENCAPSULATION_IP and
+ * ENCAPSULATION_PORT are passed through untouched (TC_ACT_OK). Once a packet
+ * is known to be encapsulated traffic for us, any malformed encapsulation is
+ * dropped (TC_ACT_SHOT) and an error counter is bumped. Well-formed packets
+ * are either accepted locally or forwarded to the next hop, depending on the
+ * next-hop list and on the verdict for the inner packet.
+ *
+ * Returns a TC_ACT_* code, or whatever accept_locally() /
+ * forward_to_next_hop() / the MAYBE_RETURN() expansion returns.
+ */
+SEC("classifier/cls_redirect")
+int cls_redirect(struct __sk_buff *skb)
+{
+	metrics_t *metrics = get_global_metrics();
+	if (metrics == NULL) {
+		return TC_ACT_SHOT;
+	}
+
+	metrics->processed_packets_total++;
+
+	/* Pass bogus packets as long as we're not sure they're
+	 * destined for us.
+	 */
+	if (skb->protocol != bpf_htons(ETH_P_IP)) {
+		return TC_ACT_OK;
+	}
+
+	encap_headers_t *encap;
+
+	/* Make sure that all encapsulation headers are available in
+	 * the linear portion of the skb. This makes it easy to manipulate them.
+	 */
+	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
+		return TC_ACT_OK;
+	}
+
+	buf_t pkt = {
+		.skb = skb,
+		.head = (uint8_t *)(long)skb->data,
+		.tail = (uint8_t *)(long)skb->data_end,
+	};
+
+	encap = buf_assign(&pkt, sizeof(*encap), NULL);
+	if (encap == NULL) {
+		return TC_ACT_OK;
+	}
+
+	if (encap->ip.ihl != 5) {
+		/* We never have any options. */
+		return TC_ACT_OK;
+	}
+
+	if (encap->ip.daddr != ENCAPSULATION_IP ||
+	    encap->ip.protocol != IPPROTO_UDP) {
+		return TC_ACT_OK;
+	}
+
+	/* TODO Check UDP length? */
+	if (encap->udp.dest != ENCAPSULATION_PORT) {
+		return TC_ACT_OK;
+	}
+
+	/* We now know that the packet is destined to us, we can
+	 * drop bogus ones.
+	 */
+	if (ipv4_is_fragment((void *)&encap->ip)) {
+		metrics->errors_total_fragmented_ip++;
+		return TC_ACT_SHOT;
+	}
+
+	if (encap->gue.variant != 0) {
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	if (encap->gue.control != 0) {
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	if (encap->gue.flags != 0) {
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	/* hlen counts 32-bit words: the fixed unigue header plus one word
+	 * per hop.
+	 */
+	if (encap->gue.hlen !=
+	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	if (encap->unigue.version != 0) {
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	if (encap->unigue.reserved != 0) {
+		/* Account for this drop like every other malformed
+		 * encapsulation drop; it used to be dropped silently.
+		 */
+		metrics->errors_total_malformed_encapsulation++;
+		return TC_ACT_SHOT;
+	}
+
+	struct in_addr next_hop;
+	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
+
+	/* No further hop: this host is the final destination. */
+	if (next_hop.s_addr == 0) {
+		metrics->accepted_packets_total_last_hop++;
+		return accept_locally(skb, encap);
+	}
+
+	verdict_t verdict;
+	switch (encap->gue.proto_ctype) {
+	case IPPROTO_IPIP:
+		verdict = process_ipv4(&pkt, metrics);
+		break;
+
+	case IPPROTO_IPV6:
+		verdict = process_ipv6(&pkt, metrics);
+		break;
+
+	default:
+		metrics->errors_total_unknown_l3_proto++;
+		return TC_ACT_SHOT;
+	}
+
+	switch (verdict) {
+	case INVALID:
+		/* metrics have already been bumped */
+		return TC_ACT_SHOT;
+
+	case UNKNOWN:
+		return forward_to_next_hop(skb, encap, &next_hop, metrics);
+
+	case ECHO_REQUEST:
+		metrics->accepted_packets_total_icmp_echo_request++;
+		break;
+
+	case SYN:
+		if (encap->unigue.forward_syn) {
+			return forward_to_next_hop(skb, encap, &next_hop,
+						   metrics);
+		}
+
+		metrics->accepted_packets_total_syn++;
+		break;
+
+	case SYN_COOKIE:
+		metrics->accepted_packets_total_syn_cookies++;
+		break;
+
+	case ESTABLISHED:
+		metrics->accepted_packets_total_established++;
+		break;
+	}
+
+	return accept_locally(skb, encap);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect.h b/tools/testing/selftests/bpf/progs/test_cls_redirect.h
new file mode 100644
index 000000000..233b089d1
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */
+/* Copyright 2019, 2020 Cloudflare */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+
+/* offsetof() is used in static asserts, and the libbpf-redefined CO-RE
+ * friendly version breaks compilation for older clang versions <= 15
+ * when invoked in a static assert. Restore original here.
+ */
+#ifdef offsetof
+#undef offsetof
+#define offsetof(type, member) __builtin_offsetof(type, member)
+#endif
+
+/* Base GRE header as laid out in the packet: two 16-bit fields, packed so
+ * that no padding is inserted (4 bytes total).
+ */
+struct gre_base_hdr {
+ uint16_t flags;
+ uint16_t protocol;
+} __attribute__((packed));
+
+/* GUE header (4 bytes). The first byte holds three bitfields; their
+ * declaration order is reversed depending on byte order so that the same
+ * struct can be used on little- and big-endian builds.
+ */
+struct guehdr {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ uint8_t hlen : 5, control : 1, variant : 2;
+#else
+ uint8_t variant : 2, control : 1, hlen : 5;
+#endif
+ /* protocol of the encapsulated packet (compared against IPPROTO_*) */
+ uint8_t proto_ctype;
+ uint16_t flags;
+};
+
+/* Fixed part of the private header that follows struct guehdr (4 bytes,
+ * packed). As in struct guehdr, the bitfields of the first byte are declared
+ * in opposite order for little- and big-endian builds.
+ */
+struct unigue {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ uint8_t _r : 2, last_hop_gre : 1, forward_syn : 1, version : 4;
+#else
+ uint8_t version : 4, forward_syn : 1, last_hop_gre : 1, _r : 2;
+#endif
+ uint8_t reserved;
+ uint8_t next_hop;
+ /* number of next-hop entries that follow this struct in the packet */
+ uint8_t hop_count;
+ // Next hops go here
+} __attribute__((packed));
+
+/* Packed header stack of a GRE-encapsulated frame:
+ * Ethernet, IPv4, GRE base header.
+ */
+typedef struct {
+ struct ethhdr eth;
+ struct iphdr ip;
+ struct gre_base_hdr gre;
+} __attribute__((packed)) encap_gre_t;
+
+/* Packed header stack of a GUE-encapsulated frame:
+ * Ethernet, IPv4, UDP, GUE header, then the fixed part of the unigue header.
+ */
+typedef struct {
+ struct ethhdr eth;
+ struct iphdr ip;
+ struct udphdr udp;
+ struct guehdr gue;
+ struct unigue unigue;
+} __attribute__((packed)) encap_headers_t;
diff --git a/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c
new file mode 100644
index 000000000..eed26b70e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_cls_redirect_subprogs.c
@@ -0,0 +1,2 @@
+#define SUBPROGS
+#include "test_cls_redirect.c"
diff --git a/tools/testing/selftests/bpf/progs/test_core_autosize.c b/tools/testing/selftests/bpf/progs/test_core_autosize.c
new file mode 100644
index 000000000..9a7829c5e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_autosize.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+/* Local flavor whose fields have exactly the same sizes as the matching
+ * fields of struct test_struct___real (apart from the pointer, which is
+ * always 8 bytes on the BPF target).
+ */
+struct test_struct___samesize {
+	void *ptr;
+	unsigned long long val1;
+	unsigned int val2;
+	unsigned short val3;
+	unsigned char val4;
+} __attribute__((preserve_access_index));
+
+/* unsigned fields that have to be downsized by libbpf: every value field is
+ * declared as unsigned long here, wider than or equal to the matching field
+ * of struct test_struct___real
+ */
+struct test_struct___downsize {
+ void *ptr;
+ unsigned long val1;
+ unsigned long val2;
+ unsigned long val3;
+ unsigned long val4;
+ /* total sz: 40 */
+} __attribute__((preserve_access_index));
+
+/* Local flavor with signed integers of the wrong size; relocating accesses
+ * to these fields should be rejected.
+ */
+struct test_struct___signed {
+	void *ptr;
+	long val1;
+	long val2;
+	long val3;
+	long val4;
+} __attribute__((preserve_access_index));
+
+/* real layout and sizes according to test's (32-bit) BTF; note that val2
+ * precedes val1 here, unlike in the local flavors above
+ */
+struct test_struct___real {
+ unsigned int ptr; /* can't use `void *`, it is always 8 byte in BPF target */
+ unsigned int val2;
+ unsigned long long val1;
+ unsigned short val3;
+ unsigned char val4;
+ unsigned char _pad;
+ /* total sz: 20 */
+};
+
+struct test_struct___real input = {
+ .ptr = 0x01020304,
+ .val1 = 0x1020304050607080,
+ .val2 = 0x0a0b0c0d,
+ .val3 = 0xfeed,
+ .val4 = 0xb9,
+ ._pad = 0xff, /* make sure no accidental zeros are present */
+};
+
+unsigned long long ptr_samesized = 0;
+unsigned long long val1_samesized = 0;
+unsigned long long val2_samesized = 0;
+unsigned long long val3_samesized = 0;
+unsigned long long val4_samesized = 0;
+struct test_struct___real output_samesized = {};
+
+unsigned long long ptr_downsized = 0;
+unsigned long long val1_downsized = 0;
+unsigned long long val2_downsized = 0;
+unsigned long long val3_downsized = 0;
+unsigned long long val4_downsized = 0;
+struct test_struct___real output_downsized = {};
+
+unsigned long long ptr_probed = 0;
+unsigned long long val1_probed = 0;
+unsigned long long val2_probed = 0;
+unsigned long long val3_probed = 0;
+unsigned long long val4_probed = 0;
+
+unsigned long long ptr_signed = 0;
+unsigned long long val1_signed = 0;
+unsigned long long val2_signed = 0;
+unsigned long long val3_signed = 0;
+unsigned long long val4_signed = 0;
+struct test_struct___real output_signed = {};
+
+/* Read every field of `input` through the ___samesize flavor into the
+ * *_samesized globals, then copy the same fields into output_samesized
+ * through the same flavor. Always returns 0.
+ */
+SEC("raw_tp/sys_exit")
+int handle_samesize(void *ctx)
+{
+ /* input/output are test_struct___real objects viewed through the flavor */
+ struct test_struct___samesize *in = (void *)&input;
+ struct test_struct___samesize *out = (void *)&output_samesized;
+
+ ptr_samesized = (unsigned long long)in->ptr;
+ val1_samesized = in->val1;
+ val2_samesized = in->val2;
+ val3_samesized = in->val3;
+ val4_samesized = in->val4;
+
+ out->ptr = in->ptr;
+ out->val1 = in->val1;
+ out->val2 = in->val2;
+ out->val3 = in->val3;
+ out->val4 = in->val4;
+
+ return 0;
+}
+
+/* Same as handle_samesize(), but through the ___downsize flavor, whose value
+ * fields are all declared unsigned long. Results go to the *_downsized
+ * globals and to output_downsized. Always returns 0.
+ */
+SEC("raw_tp/sys_exit")
+int handle_downsize(void *ctx)
+{
+ /* input/output are test_struct___real objects viewed through the flavor */
+ struct test_struct___downsize *in = (void *)&input;
+ struct test_struct___downsize *out = (void *)&output_downsized;
+
+ ptr_downsized = (unsigned long long)in->ptr;
+ val1_downsized = in->val1;
+ val2_downsized = in->val2;
+ val3_downsized = in->val3;
+ val4_downsized = in->val4;
+
+ out->ptr = in->ptr;
+ out->val1 = in->val1;
+ out->val2 = in->val2;
+ out->val3 = in->val3;
+ out->val4 = in->val4;
+
+ return 0;
+}
+
+/* bpf_core_read_int(dst, sz, src): read sz bytes from src into the integer
+ * that dst points to. On little-endian builds this is plain bpf_core_read().
+ * On other builds the destination address is advanced by sizeof(*dst) - sz,
+ * so the bytes are written to the tail of *dst instead of its head.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define bpf_core_read_int bpf_core_read
+#else
+#define bpf_core_read_int(dst, sz, src) ({ \
+ /* Prevent "subtraction from stack pointer prohibited" */ \
+ volatile long __off = sizeof(*dst) - (sz); \
+ bpf_core_read((char *)(dst) + __off, sz, src); \
+})
+#endif
+
+/* Read every field of `input` through the ___downsize flavor using an
+ * explicit probe read sized by bpf_core_field_size(), and store the results
+ * in the *_probed globals. Always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int handle_probed(void *ctx)
+{
+ struct test_struct___downsize *in = (void *)&input;
+ __u64 tmp;
+
+ /* tmp is zeroed before each read because a read may fill fewer than
+ * 8 bytes of it.
+ */
+ tmp = 0;
+ bpf_core_read_int(&tmp, bpf_core_field_size(in->ptr), &in->ptr);
+ ptr_probed = tmp;
+
+ tmp = 0;
+ bpf_core_read_int(&tmp, bpf_core_field_size(in->val1), &in->val1);
+ val1_probed = tmp;
+
+ tmp = 0;
+ bpf_core_read_int(&tmp, bpf_core_field_size(in->val2), &in->val2);
+ val2_probed = tmp;
+
+ tmp = 0;
+ bpf_core_read_int(&tmp, bpf_core_field_size(in->val3), &in->val3);
+ val3_probed = tmp;
+
+ tmp = 0;
+ bpf_core_read_int(&tmp, bpf_core_field_size(in->val4), &in->val4);
+ val4_probed = tmp;
+
+ return 0;
+}
+
+/* Access val2..val4 of `input` through the ___signed flavor, whose fields
+ * are signed integers of the wrong size, storing them in the *_signed
+ * globals and in output_signed. ptr and val1 are not touched here.
+ * Always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int handle_signed(void *ctx)
+{
+	struct test_struct___signed *in = (void *)&input;
+	struct test_struct___signed *out = (void *)&output_signed;
+
+	val2_signed = in->val2;
+	val3_signed = in->val3;
+	val4_signed = in->val4;
+
+	out->val2 = in->val2;
+	out->val3 = in->val3;
+	out->val4 = in->val4;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_core_extern.c b/tools/testing/selftests/bpf/progs/test_core_extern.c
new file mode 100644
index 000000000..3ac3603ad
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_extern.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* non-existing BPF helper, to test dead code elimination */
+static int (*bpf_missing_helper)(const void *arg1, int arg2) = (void *) 999;
+
+extern int LINUX_KERNEL_VERSION __kconfig;
+extern bool CONFIG_BPF_SYSCALL __kconfig; /* strong */
+extern enum libbpf_tristate CONFIG_TRISTATE __kconfig __weak;
+extern bool CONFIG_BOOL __kconfig __weak;
+extern char CONFIG_CHAR __kconfig __weak;
+extern uint16_t CONFIG_USHORT __kconfig __weak;
+extern int CONFIG_INT __kconfig __weak;
+extern uint64_t CONFIG_ULONG __kconfig __weak;
+extern const char CONFIG_STR[8] __kconfig __weak;
+extern uint64_t CONFIG_MISSING __kconfig __weak;
+
+uint64_t kern_ver = -1;
+uint64_t bpf_syscall = -1;
+uint64_t tristate_val = -1;
+uint64_t bool_val = -1;
+uint64_t char_val = -1;
+uint64_t ushort_val = -1;
+uint64_t int_val = -1;
+uint64_t ulong_val = -1;
+char str_val[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
+uint64_t missing_val = -1;
+
+/* Copy the value of every __kconfig extern into the matching global so the
+ * values can be inspected from outside the program. Always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int handle_sys_enter(struct pt_regs *ctx)
+{
+ int i;
+
+ kern_ver = LINUX_KERNEL_VERSION;
+ bpf_syscall = CONFIG_BPF_SYSCALL;
+ tristate_val = CONFIG_TRISTATE;
+ bool_val = CONFIG_BOOL;
+ char_val = CONFIG_CHAR;
+ ushort_val = CONFIG_USHORT;
+ int_val = CONFIG_INT;
+ ulong_val = CONFIG_ULONG;
+
+ /* CONFIG_STR and str_val are both 8 bytes long; copy byte by byte */
+ for (i = 0; i < sizeof(CONFIG_STR); i++) {
+ str_val[i] = CONFIG_STR[i];
+ }
+
+ /* The call below targets a helper id that does not exist; the test
+ * depends on this branch never being taken.
+ */
+ if (CONFIG_MISSING)
+ /* invalid, but dead code - never executed */
+ missing_val = bpf_missing_helper(ctx, 123);
+ else
+ missing_val = 0xDEADC0DE;
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c
new file mode 100644
index 000000000..51b3f79df
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_arrays.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_arrays_output {
+ int a2;
+ char b123;
+ int c1c;
+ int d00d;
+ int f01c;
+};
+
+struct core_reloc_arrays_substruct {
+ int c;
+ int d;
+};
+
+struct core_reloc_arrays {
+ int a[5];
+ char b[2][3][4];
+ struct core_reloc_arrays_substruct c[3];
+ struct core_reloc_arrays_substruct d[1][2];
+ struct core_reloc_arrays_substruct f[][2];
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+/* Read one element from each array member of struct core_reloc_arrays
+ * (laid over data.in) into struct core_reloc_arrays_output (laid over
+ * data.out). Returns 1 as soon as a read fails, 0 when all succeed.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_arrays(void *ctx)
+{
+	struct core_reloc_arrays *src = (void *)&data.in;
+	struct core_reloc_arrays_output *dst = (void *)&data.out;
+
+	if (CORE_READ(&dst->a2, &src->a[2]) ||
+	    CORE_READ(&dst->b123, &src->b[1][2][3]) ||
+	    CORE_READ(&dst->c1c, &src->c[1].c) ||
+	    CORE_READ(&dst->d00d, &src->d[0][0].d) ||
+	    CORE_READ(&dst->f01c, &src->f[0][1].c))
+		return 1;
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c
new file mode 100644
index 000000000..56aec2021
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_direct.c
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_bitfields {
+ /* unsigned bitfields */
+ uint8_t ub1: 1;
+ uint8_t ub2: 2;
+ uint32_t ub7: 7;
+ /* signed bitfields */
+ int8_t sb4: 4;
+ int32_t sb20: 20;
+ /* non-bitfields */
+ uint32_t u32;
+ int32_t s32;
+};
+
+/* bitfield read results, all as plain integers */
+struct core_reloc_bitfields_output {
+ int64_t ub1;
+ int64_t ub2;
+ int64_t ub7;
+ int64_t sb4;
+ int64_t sb20;
+ int64_t u32;
+ int64_t s32;
+};
+
+struct pt_regs;
+
+struct trace_sys_enter {
+ struct pt_regs *regs;
+ long id;
+};
+
+/* Read every member of struct core_reloc_bitfields (laid over data.in) with
+ * BPF_CORE_READ_BITFIELD() and store each result as int64_t in
+ * struct core_reloc_bitfields_output (laid over data.out). The two
+ * non-bitfield members are read through the same macro. Always returns 0.
+ */
+SEC("tp_btf/sys_enter")
+int test_core_bitfields_direct(void *ctx)
+{
+ struct core_reloc_bitfields *in = (void *)&data.in;
+ struct core_reloc_bitfields_output *out = (void *)&data.out;
+
+ out->ub1 = BPF_CORE_READ_BITFIELD(in, ub1);
+ out->ub2 = BPF_CORE_READ_BITFIELD(in, ub2);
+ out->ub7 = BPF_CORE_READ_BITFIELD(in, ub7);
+ out->sb4 = BPF_CORE_READ_BITFIELD(in, sb4);
+ out->sb20 = BPF_CORE_READ_BITFIELD(in, sb20);
+ out->u32 = BPF_CORE_READ_BITFIELD(in, u32);
+ out->s32 = BPF_CORE_READ_BITFIELD(in, s32);
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c
new file mode 100644
index 000000000..ab1e647ae
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_bitfields_probed.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_bitfields {
+ /* unsigned bitfields */
+ uint8_t ub1: 1;
+ uint8_t ub2: 2;
+ uint32_t ub7: 7;
+ /* signed bitfields */
+ int8_t sb4: 4;
+ int32_t sb20: 20;
+ /* non-bitfields */
+ uint32_t u32;
+ int32_t s32;
+};
+
+/* bitfield read results, all as plain integers */
+struct core_reloc_bitfields_output {
+ int64_t ub1;
+ int64_t ub2;
+ int64_t ub7;
+ int64_t sb4;
+ int64_t sb20;
+ int64_t u32;
+ int64_t s32;
+};
+
+/* Read every member of struct core_reloc_bitfields (laid over data.in) with
+ * BPF_CORE_READ_BITFIELD_PROBED() and store each result as int64_t in
+ * struct core_reloc_bitfields_output (laid over data.out). The two
+ * non-bitfield members are read through the same macro. Always returns 0.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_bitfields(void *ctx)
+{
+	struct core_reloc_bitfields *in = (void *)&data.in;
+	struct core_reloc_bitfields_output *out = (void *)&data.out;
+
+	out->ub1 = BPF_CORE_READ_BITFIELD_PROBED(in, ub1);
+	out->ub2 = BPF_CORE_READ_BITFIELD_PROBED(in, ub2);
+	out->ub7 = BPF_CORE_READ_BITFIELD_PROBED(in, ub7);
+	out->sb4 = BPF_CORE_READ_BITFIELD_PROBED(in, sb4);
+	out->sb20 = BPF_CORE_READ_BITFIELD_PROBED(in, sb20);
+	out->u32 = BPF_CORE_READ_BITFIELD_PROBED(in, u32);
+	out->s32 = BPF_CORE_READ_BITFIELD_PROBED(in, s32);
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_enumval.c b/tools/testing/selftests/bpf/progs/test_core_reloc_enumval.c
new file mode 100644
index 000000000..e7ef3dada
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_enumval.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+ bool skip;
+} data = {};
+
+enum named_enum {
+ NAMED_ENUM_VAL1 = 1,
+ NAMED_ENUM_VAL2 = 2,
+ NAMED_ENUM_VAL3 = 3,
+};
+
+typedef enum {
+ ANON_ENUM_VAL1 = 0x10,
+ ANON_ENUM_VAL2 = 0x20,
+ ANON_ENUM_VAL3 = 0x30,
+} anon_enum;
+
+struct core_reloc_enumval_output {
+ bool named_val1_exists;
+ bool named_val2_exists;
+ bool named_val3_exists;
+ bool anon_val1_exists;
+ bool anon_val2_exists;
+ bool anon_val3_exists;
+
+ int named_val1;
+ int named_val2;
+ int anon_val1;
+ int anon_val2;
+};
+
+/* Record, in struct core_reloc_enumval_output (laid over data.out), whether
+ * each enumerator of the named and the anonymous (typedef'ed) enum exists
+ * and the value of the first two enumerators of each. When the compiler
+ * lacks __builtin_preserve_enum_value, data.skip is set instead.
+ * Always returns 0.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_enumval(void *ctx)
+{
+#if __has_builtin(__builtin_preserve_enum_value)
+ struct core_reloc_enumval_output *out = (void *)&data.out;
+ enum named_enum named = 0;
+ anon_enum anon = 0;
+
+ /* the enum is passed both as a variable and as a type name */
+ out->named_val1_exists = bpf_core_enum_value_exists(named, NAMED_ENUM_VAL1);
+ out->named_val2_exists = bpf_core_enum_value_exists(enum named_enum, NAMED_ENUM_VAL2);
+ out->named_val3_exists = bpf_core_enum_value_exists(enum named_enum, NAMED_ENUM_VAL3);
+
+ out->anon_val1_exists = bpf_core_enum_value_exists(anon, ANON_ENUM_VAL1);
+ out->anon_val2_exists = bpf_core_enum_value_exists(anon_enum, ANON_ENUM_VAL2);
+ out->anon_val3_exists = bpf_core_enum_value_exists(anon_enum, ANON_ENUM_VAL3);
+
+ out->named_val1 = bpf_core_enum_value(named, NAMED_ENUM_VAL1);
+ out->named_val2 = bpf_core_enum_value(named, NAMED_ENUM_VAL2);
+ /* NAMED_ENUM_VAL3 value is optional */
+
+ out->anon_val1 = bpf_core_enum_value(anon, ANON_ENUM_VAL1);
+ out->anon_val2 = bpf_core_enum_value(anon, ANON_ENUM_VAL2);
+ /* ANON_ENUM_VAL3 value is optional */
+#else
+ data.skip = true;
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c
new file mode 100644
index 000000000..7e45e2bdf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_existence.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_existence_output {
+ int a_exists;
+ int a_value;
+ int b_exists;
+ int b_value;
+ int c_exists;
+ int c_value;
+ int arr_exists;
+ int arr_value;
+ int s_exists;
+ int s_value;
+};
+
+struct core_reloc_existence {
+ struct {
+ int x;
+ } s;
+ int arr[1];
+ int a;
+ struct {
+ int b;
+ };
+ int c;
+};
+
+/* For each of the fields a, b, c, arr and s of struct core_reloc_existence
+ * (laid over data.in), record in struct core_reloc_existence_output (laid
+ * over data.out) whether the field exists and, if so, its value; a missing
+ * field gets a distinct 0xff00000N marker instead. Always returns 0.
+ *
+ * NOTE(review): the existence check is repeated inside each `if` rather
+ * than reusing the stored *_exists value -- presumably so the branch
+ * condition stays a relocated constant; confirm before simplifying.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_existence(void *ctx)
+{
+ struct core_reloc_existence *in = (void *)&data.in;
+ struct core_reloc_existence_output *out = (void *)&data.out;
+
+ out->a_exists = bpf_core_field_exists(in->a);
+ if (bpf_core_field_exists(in->a))
+ out->a_value = BPF_CORE_READ(in, a);
+ else
+ out->a_value = 0xff000001u;
+
+ /* b lives in an anonymous struct member */
+ out->b_exists = bpf_core_field_exists(in->b);
+ if (bpf_core_field_exists(in->b))
+ out->b_value = BPF_CORE_READ(in, b);
+ else
+ out->b_value = 0xff000002u;
+
+ out->c_exists = bpf_core_field_exists(in->c);
+ if (bpf_core_field_exists(in->c))
+ out->c_value = BPF_CORE_READ(in, c);
+ else
+ out->c_value = 0xff000003u;
+
+ /* for the array, the value recorded is its first element */
+ out->arr_exists = bpf_core_field_exists(in->arr);
+ if (bpf_core_field_exists(in->arr))
+ out->arr_value = BPF_CORE_READ(in, arr[0]);
+ else
+ out->arr_value = 0xff000004u;
+
+ /* for the nested struct, the value recorded is its member x */
+ out->s_exists = bpf_core_field_exists(in->s);
+ if (bpf_core_field_exists(in->s))
+ out->s_value = BPF_CORE_READ(in, s.x);
+ else
+ out->s_value = 0xff000005u;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c
new file mode 100644
index 000000000..525acc2f8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_flavors.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_flavors {
+ int a;
+ int b;
+ int c;
+};
+
+/* local flavor with reversed layout */
+struct core_reloc_flavors___reversed {
+ int c;
+ int b;
+ int a;
+};
+
+/* local flavor with nested/overlapping layout */
+struct core_reloc_flavors___weird {
+ struct {
+ int b;
+ };
+ /* a and c overlap in local flavor, but this should still work
+ * correctly with target original flavor
+ */
+ union {
+ int a;
+ int c;
+ };
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+/* Read fields a, b and c from data.in, each through a different local
+ * flavor of struct core_reloc_flavors, into the original-layout struct laid
+ * over data.out. Returns 1 as soon as a read fails, 0 when all succeed.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_flavors(void *ctx)
+{
+	struct core_reloc_flavors___weird *weird = (void *)&data.in;
+	struct core_reloc_flavors___reversed *reversed = (void *)&data.in;
+	struct core_reloc_flavors *orig = (void *)&data.in;
+	struct core_reloc_flavors *dst = (void *)&data.out;
+
+	/* a via the weird layout, b via the reversed layout, c via the
+	 * original layout -- in that order
+	 */
+	if (CORE_READ(&dst->a, &weird->a) ||
+	    CORE_READ(&dst->b, &reversed->b) ||
+	    CORE_READ(&dst->c, &orig->c))
+		return 1;
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c
new file mode 100644
index 000000000..6b5290739
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ints.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_ints {
+ uint8_t u8_field;
+ int8_t s8_field;
+ uint16_t u16_field;
+ int16_t s16_field;
+ uint32_t u32_field;
+ int32_t s32_field;
+ uint64_t u64_field;
+ int64_t s64_field;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+/* Copy every integer field of struct core_reloc_ints from data.in to
+ * data.out, one CORE_READ() per field in declaration order. Returns 1 as
+ * soon as a read fails, 0 when all succeed.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_ints(void *ctx)
+{
+	struct core_reloc_ints *src = (void *)&data.in;
+	struct core_reloc_ints *dst = (void *)&data.out;
+
+	if (CORE_READ(&dst->u8_field, &src->u8_field))
+		return 1;
+	if (CORE_READ(&dst->s8_field, &src->s8_field))
+		return 1;
+	if (CORE_READ(&dst->u16_field, &src->u16_field))
+		return 1;
+	if (CORE_READ(&dst->s16_field, &src->s16_field))
+		return 1;
+	if (CORE_READ(&dst->u32_field, &src->u32_field))
+		return 1;
+	if (CORE_READ(&dst->s32_field, &src->s32_field))
+		return 1;
+	if (CORE_READ(&dst->u64_field, &src->u64_field))
+		return 1;
+	if (CORE_READ(&dst->s64_field, &src->s64_field))
+		return 1;
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c
new file mode 100644
index 000000000..145028b52
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_kernel.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+ bool skip;
+ uint64_t my_pid_tgid;
+} data = {};
+
+struct core_reloc_kernel_output {
+ int valid[10];
+ /* we have test_progs[-flavor], so cut flavor part */
+ char comm[sizeof("test_progs")];
+ int comm_len;
+};
+
+struct task_struct {
+ int pid;
+ int tgid;
+ char comm[16];
+ struct task_struct *group_leader;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+/* Exercise CO-RE reads against the real task_struct of the current task.
+ *
+ * Does nothing (returns 0) unless the current pid/tgid pair equals
+ * data.my_pid_tgid. Returns 1 if the initial pid/tgid reads fail. Otherwise
+ * fills struct core_reloc_kernel_output (laid over data.out): valid[0..9]
+ * hold the outcome of each comparison, comm/comm_len hold the result of
+ * BPF_CORE_READ_STR_INTO(), and 0 is returned.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_core_kernel(void *ctx)
+{
+	struct task_struct *task = (void *)bpf_get_current_task();
+	struct core_reloc_kernel_output *out = (void *)&data.out;
+	uint64_t pid_tgid = bpf_get_current_pid_tgid();
+	/* The helper returns tgid << 32 | pid, so the tgid is the upper half.
+	 * Taking the lower half only worked while pid == tgid.
+	 */
+	uint32_t real_tgid = (uint32_t)(pid_tgid >> 32);
+	int pid, tgid;
+
+	if (data.my_pid_tgid != pid_tgid)
+		return 0;
+
+	if (CORE_READ(&pid, &task->pid) ||
+	    CORE_READ(&tgid, &task->tgid))
+		return 1;
+
+	/* validate pid + tgid matches, using the helper's own layout */
+	out->valid[0] = (((uint64_t)(uint32_t)tgid << 32) |
+			 (uint32_t)pid) == pid_tgid;
+
+	/* test variadic BPF_CORE_READ macros */
+	out->valid[1] = BPF_CORE_READ(task,
+				      tgid) == real_tgid;
+	out->valid[2] = BPF_CORE_READ(task,
+				      group_leader,
+				      tgid) == real_tgid;
+	out->valid[3] = BPF_CORE_READ(task,
+				      group_leader, group_leader,
+				      tgid) == real_tgid;
+	out->valid[4] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      tgid) == real_tgid;
+	out->valid[5] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      group_leader,
+				      tgid) == real_tgid;
+	out->valid[6] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      group_leader, group_leader,
+				      tgid) == real_tgid;
+	out->valid[7] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      group_leader, group_leader, group_leader,
+				      tgid) == real_tgid;
+	out->valid[8] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      group_leader, group_leader, group_leader,
+				      group_leader,
+				      tgid) == real_tgid;
+	out->valid[9] = BPF_CORE_READ(task,
+				      group_leader, group_leader, group_leader,
+				      group_leader, group_leader, group_leader,
+				      group_leader, group_leader,
+				      tgid) == real_tgid;
+
+	/* test BPF_CORE_READ_STR_INTO() returns correct code and contents */
+	out->comm_len = BPF_CORE_READ_STR_INTO(
+		&out->comm, task,
+		group_leader, group_leader, group_leader, group_leader,
+		group_leader, group_leader, group_leader, group_leader,
+		comm);
+
+	return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c
new file mode 100644
index 000000000..d5756dbde
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_misc.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_misc_output {
+ int a, b, c;
+};
+
+struct core_reloc_misc___a {
+ int a1;
+ int a2;
+};
+
+struct core_reloc_misc___b {
+ int b1;
+ int b2;
+};
+
+/* first two members are fixed; the struct can be extended with new fields */
+struct core_reloc_misc_extensible {
+ int a;
+ int b;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_misc(void *ctx)
+{
+ struct core_reloc_misc___a *in_a = (void *)&data.in;
+ struct core_reloc_misc___b *in_b = (void *)&data.in;
+ struct core_reloc_misc_extensible *in_ext = (void *)&data.in;
+ struct core_reloc_misc_output *out = (void *)&data.out;
+
+ /* record two different relocations with the same accessor string */
+ if (CORE_READ(&out->a, &in_a->a1) || /* accessor: 0:0 */
+ CORE_READ(&out->b, &in_b->b1)) /* accessor: 0:0 */
+ return 1;
+
+ /* Validate that relocations capture array-only accesses for structs
+ * with a fixed header but a potentially extendable tail. This reads
+ * the first sizeof(out->c) bytes of in_ext[2] (index 2, i.e. the third
+ * element) of the possibly variably-sized core_reloc_misc_extensible. */
+ if (CORE_READ(&out->c, &in_ext[2])) /* accessor: 2 */
+ return 1;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c
new file mode 100644
index 000000000..8b533db4a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_mods.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_mods_output {
+ int a, b, c, d, e, f, g, h;
+};
+
+typedef const int int_t;
+typedef const char *char_ptr_t;
+typedef const int arr_t[7];
+
+struct core_reloc_mods_substruct {
+ int x;
+ int y;
+};
+
+typedef struct {
+ int x;
+ int y;
+} core_reloc_mods_substruct_t;
+
+struct core_reloc_mods {
+ int a;
+ int_t b;
+ char *c;
+ char_ptr_t d;
+ int e[3];
+ arr_t f;
+ struct core_reloc_mods_substruct g;
+ core_reloc_mods_substruct_t h;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_mods(void *ctx)
+{
+ struct core_reloc_mods *in = (void *)&data.in;
+ struct core_reloc_mods_output *out = (void *)&data.out;
+
+ if (CORE_READ(&out->a, &in->a) ||
+ CORE_READ(&out->b, &in->b) ||
+ CORE_READ(&out->c, &in->c) ||
+ CORE_READ(&out->d, &in->d) ||
+ CORE_READ(&out->e, &in->e[2]) ||
+ CORE_READ(&out->f, &in->f[1]) ||
+ CORE_READ(&out->g, &in->g.x) ||
+ CORE_READ(&out->h, &in->h.y))
+ return 1;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c
new file mode 100644
index 000000000..2b4b6d49c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_nesting.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_nesting_substruct {
+ int a;
+};
+
+union core_reloc_nesting_subunion {
+ int b;
+};
+
+/* int a.a.a and b.b.b accesses */
+struct core_reloc_nesting {
+ union {
+ struct core_reloc_nesting_substruct a;
+ } a;
+ struct {
+ union core_reloc_nesting_subunion b;
+ } b;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_nesting(void *ctx)
+{
+ struct core_reloc_nesting *in = (void *)&data.in;
+ struct core_reloc_nesting *out = (void *)&data.out;
+
+ if (CORE_READ(&out->a.a.a, &in->a.a.a))
+ return 1;
+ if (CORE_READ(&out->b.b.b, &in->b.b.b))
+ return 1;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c
new file mode 100644
index 000000000..2a8975678
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_primitives.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+enum core_reloc_primitives_enum {
+ A = 0,
+ B = 1,
+};
+
+struct core_reloc_primitives {
+ char a;
+ int b;
+ enum core_reloc_primitives_enum c;
+ void *d;
+ int (*f)(const char *);
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_primitives(void *ctx)
+{
+ struct core_reloc_primitives *in = (void *)&data.in;
+ struct core_reloc_primitives *out = (void *)&data.out;
+
+ if (CORE_READ(&out->a, &in->a) ||
+ CORE_READ(&out->b, &in->b) ||
+ CORE_READ(&out->c, &in->c) ||
+ CORE_READ(&out->d, &in->d) ||
+ CORE_READ(&out->f, &in->f))
+ return 1;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c
new file mode 100644
index 000000000..ca61a5183
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_ptr_as_arr.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_ptr_as_arr {
+ int a;
+};
+
+#define CORE_READ(dst, src) bpf_core_read(dst, sizeof(*(dst)), src)
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_ptr_as_arr(void *ctx)
+{
+ struct core_reloc_ptr_as_arr *in = (void *)&data.in;
+ struct core_reloc_ptr_as_arr *out = (void *)&data.out;
+
+ if (CORE_READ(&out->a, &in[2].a))
+ return 1;
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_size.c b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c
new file mode 100644
index 000000000..d7fb6cfc7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_size.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+} data = {};
+
+struct core_reloc_size_output {
+ int int_sz;
+ int struct_sz;
+ int union_sz;
+ int arr_sz;
+ int arr_elem_sz;
+ int ptr_sz;
+ int enum_sz;
+};
+
+struct core_reloc_size {
+ int int_field;
+ struct { int x; } struct_field;
+ union { int x; } union_field;
+ int arr_field[4];
+ void *ptr_field;
+ enum { VALUE = 123 } enum_field;
+};
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_size(void *ctx)
+{
+ struct core_reloc_size *in = (void *)&data.in;
+ struct core_reloc_size_output *out = (void *)&data.out;
+
+ out->int_sz = bpf_core_field_size(in->int_field);
+ out->struct_sz = bpf_core_field_size(in->struct_field);
+ out->union_sz = bpf_core_field_size(in->union_field);
+ out->arr_sz = bpf_core_field_size(in->arr_field);
+ out->arr_elem_sz = bpf_core_field_size(in->arr_field[0]);
+ out->ptr_sz = bpf_core_field_size(in->ptr_field);
+ out->enum_sz = bpf_core_field_size(in->enum_field);
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c
new file mode 100644
index 000000000..fb60f8195
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_based.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+ bool skip;
+} data = {};
+
+struct a_struct {
+ int x;
+};
+
+union a_union {
+ int y;
+ int z;
+};
+
+typedef struct a_struct named_struct_typedef;
+
+typedef struct { int x, y, z; } anon_struct_typedef;
+
+typedef struct {
+ int a, b, c;
+} *struct_ptr_typedef;
+
+enum an_enum {
+ AN_ENUM_VAL1 = 1,
+ AN_ENUM_VAL2 = 2,
+ AN_ENUM_VAL3 = 3,
+};
+
+typedef int int_typedef;
+
+typedef enum { TYPEDEF_ENUM_VAL1, TYPEDEF_ENUM_VAL2 } enum_typedef;
+
+typedef void *void_ptr_typedef;
+
+typedef int (*func_proto_typedef)(long);
+
+typedef char arr_typedef[20];
+
+struct core_reloc_type_based_output {
+ bool struct_exists;
+ bool union_exists;
+ bool enum_exists;
+ bool typedef_named_struct_exists;
+ bool typedef_anon_struct_exists;
+ bool typedef_struct_ptr_exists;
+ bool typedef_int_exists;
+ bool typedef_enum_exists;
+ bool typedef_void_ptr_exists;
+ bool typedef_func_proto_exists;
+ bool typedef_arr_exists;
+
+ int struct_sz;
+ int union_sz;
+ int enum_sz;
+ int typedef_named_struct_sz;
+ int typedef_anon_struct_sz;
+ int typedef_struct_ptr_sz;
+ int typedef_int_sz;
+ int typedef_enum_sz;
+ int typedef_void_ptr_sz;
+ int typedef_func_proto_sz;
+ int typedef_arr_sz;
+};
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_type_based(void *ctx)
+{
+#if __has_builtin(__builtin_preserve_type_info)
+ struct core_reloc_type_based_output *out = (void *)&data.out;
+
+ out->struct_exists = bpf_core_type_exists(struct a_struct);
+ out->union_exists = bpf_core_type_exists(union a_union);
+ out->enum_exists = bpf_core_type_exists(enum an_enum);
+ out->typedef_named_struct_exists = bpf_core_type_exists(named_struct_typedef);
+ out->typedef_anon_struct_exists = bpf_core_type_exists(anon_struct_typedef);
+ out->typedef_struct_ptr_exists = bpf_core_type_exists(struct_ptr_typedef);
+ out->typedef_int_exists = bpf_core_type_exists(int_typedef);
+ out->typedef_enum_exists = bpf_core_type_exists(enum_typedef);
+ out->typedef_void_ptr_exists = bpf_core_type_exists(void_ptr_typedef);
+ out->typedef_func_proto_exists = bpf_core_type_exists(func_proto_typedef);
+ out->typedef_arr_exists = bpf_core_type_exists(arr_typedef);
+
+ out->struct_sz = bpf_core_type_size(struct a_struct);
+ out->union_sz = bpf_core_type_size(union a_union);
+ out->enum_sz = bpf_core_type_size(enum an_enum);
+ out->typedef_named_struct_sz = bpf_core_type_size(named_struct_typedef);
+ out->typedef_anon_struct_sz = bpf_core_type_size(anon_struct_typedef);
+ out->typedef_struct_ptr_sz = bpf_core_type_size(struct_ptr_typedef);
+ out->typedef_int_sz = bpf_core_type_size(int_typedef);
+ out->typedef_enum_sz = bpf_core_type_size(enum_typedef);
+ out->typedef_void_ptr_sz = bpf_core_type_size(void_ptr_typedef);
+ out->typedef_func_proto_sz = bpf_core_type_size(func_proto_typedef);
+ out->typedef_arr_sz = bpf_core_type_size(arr_typedef);
+#else
+ data.skip = true;
+#endif
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_core_reloc_type_id.c b/tools/testing/selftests/bpf/progs/test_core_reloc_type_id.c
new file mode 100644
index 000000000..22aba3f6e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_reloc_type_id.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct {
+ char in[256];
+ char out[256];
+ bool skip;
+} data = {};
+
+/* some types are shared with test_core_reloc_type_based.c */
+struct a_struct {
+ int x;
+};
+
+union a_union {
+ int y;
+ int z;
+};
+
+enum an_enum {
+ AN_ENUM_VAL1 = 1,
+ AN_ENUM_VAL2 = 2,
+ AN_ENUM_VAL3 = 3,
+};
+
+typedef struct a_struct named_struct_typedef;
+
+typedef int (*func_proto_typedef)(long);
+
+typedef char arr_typedef[20];
+
+struct core_reloc_type_id_output {
+ int local_anon_struct;
+ int local_anon_union;
+ int local_anon_enum;
+ int local_anon_func_proto_ptr;
+ int local_anon_void_ptr;
+ int local_anon_arr;
+
+ int local_struct;
+ int local_union;
+ int local_enum;
+ int local_int;
+ int local_struct_typedef;
+ int local_func_proto_typedef;
+ int local_arr_typedef;
+
+ int targ_struct;
+ int targ_union;
+ int targ_enum;
+ int targ_int;
+ int targ_struct_typedef;
+ int targ_func_proto_typedef;
+ int targ_arr_typedef;
+};
+
+/* preserve types even if Clang doesn't support built-in */
+struct a_struct t1 = {};
+union a_union t2 = {};
+enum an_enum t3 = 0;
+named_struct_typedef t4 = {};
+func_proto_typedef t5 = 0;
+arr_typedef t6 = {};
+
+SEC("raw_tracepoint/sys_enter")
+int test_core_type_id(void *ctx)
+{
+ /* We use __builtin_btf_type_id() in this test, but up until the time
+ * __builtin_preserve_type_info() was added it contained a bug that
+ * would make this test fail. The bug was fixed ([0]) together with the
+ * addition of __builtin_preserve_type_info(), so that is what we use
+ * to detect whether this test has to be executed, however strange
+ * that might look.
+ *
+ * [0] https://reviews.llvm.org/D85174
+ */
+#if __has_builtin(__builtin_preserve_type_info)
+ struct core_reloc_type_id_output *out = (void *)&data.out;
+
+ out->local_anon_struct = bpf_core_type_id_local(struct { int marker_field; });
+ out->local_anon_union = bpf_core_type_id_local(union { int marker_field; });
+ out->local_anon_enum = bpf_core_type_id_local(enum { MARKER_ENUM_VAL = 123 });
+ out->local_anon_func_proto_ptr = bpf_core_type_id_local(_Bool(*)(int));
+ out->local_anon_void_ptr = bpf_core_type_id_local(void *);
+ out->local_anon_arr = bpf_core_type_id_local(_Bool[47]);
+
+ out->local_struct = bpf_core_type_id_local(struct a_struct);
+ out->local_union = bpf_core_type_id_local(union a_union);
+ out->local_enum = bpf_core_type_id_local(enum an_enum);
+ out->local_int = bpf_core_type_id_local(int);
+ out->local_struct_typedef = bpf_core_type_id_local(named_struct_typedef);
+ out->local_func_proto_typedef = bpf_core_type_id_local(func_proto_typedef);
+ out->local_arr_typedef = bpf_core_type_id_local(arr_typedef);
+
+ out->targ_struct = bpf_core_type_id_kernel(struct a_struct);
+ out->targ_union = bpf_core_type_id_kernel(union a_union);
+ out->targ_enum = bpf_core_type_id_kernel(enum an_enum);
+ out->targ_int = bpf_core_type_id_kernel(int);
+ out->targ_struct_typedef = bpf_core_type_id_kernel(named_struct_typedef);
+ out->targ_func_proto_typedef = bpf_core_type_id_kernel(func_proto_typedef);
+ out->targ_arr_typedef = bpf_core_type_id_kernel(arr_typedef);
+#else
+ data.skip = true; /* builtin unavailable: record that nothing was measured */
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_core_retro.c b/tools/testing/selftests/bpf/progs/test_core_retro.c
new file mode 100644
index 000000000..20861ec2f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_core_retro.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+struct task_struct {
+ int tgid;
+} __attribute__((preserve_access_index));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} exp_tgid_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} results SEC(".maps");
+
+SEC("tp/raw_syscalls/sys_enter")
+int handle_sys_enter(void *ctx)
+{
+ struct task_struct *task = (void *)bpf_get_current_task();
+ int tgid = BPF_CORE_READ(task, tgid); /* read via the one-field task_struct declared in this file */
+ int zero = 0; /* key used for both single-entry array maps */
+ int real_tgid = bpf_get_current_pid_tgid() >> 32; /* upper 32 bits of the helper result */
+ int *exp_tgid = bpf_map_lookup_elem(&exp_tgid_map, &zero);
+
+ /* only pass through sys_enters from the test process */
+ if (!exp_tgid || *exp_tgid != real_tgid)
+ return 0;
+
+ bpf_map_update_elem(&results, &zero, &tgid, 0); /* store the BPF_CORE_READ value in results[0] */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_d_path.c b/tools/testing/selftests/bpf/progs/test_d_path.c
new file mode 100644
index 000000000..84e1f883f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_d_path.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#define MAX_PATH_LEN 128
+#define MAX_FILES 7
+
+pid_t my_pid = 0;
+__u32 cnt_stat = 0;
+__u32 cnt_close = 0;
+char paths_stat[MAX_FILES][MAX_PATH_LEN] = {};
+char paths_close[MAX_FILES][MAX_PATH_LEN] = {};
+int rets_stat[MAX_FILES] = {};
+int rets_close[MAX_FILES] = {};
+
+int called_stat = 0;
+int called_close = 0;
+
+SEC("fentry/security_inode_getattr")
+int BPF_PROG(prog_stat, struct path *path, struct kstat *stat,
+ __u32 request_mask, unsigned int query_flags)
+{
+ pid_t pid = bpf_get_current_pid_tgid() >> 32; /* upper 32 bits of the helper result */
+ __u32 cnt = cnt_stat; /* snapshot of the next free slot */
+ int ret;
+
+ called_stat = 1; /* set on every invocation, before the pid filter below */
+
+ if (pid != my_pid) /* ignore callers whose id differs from my_pid */
+ return 0;
+
+ if (cnt >= MAX_FILES) /* keep cnt a valid index into paths_stat/rets_stat */
+ return 0;
+ ret = bpf_d_path(path, paths_stat[cnt], MAX_PATH_LEN);
+
+ rets_stat[cnt] = ret; /* keep the helper's return value next to the path it wrote */
+ cnt_stat++; /* the next matching call uses the following slot */
+ return 0;
+}
+
+SEC("fentry/filp_close")
+int BPF_PROG(prog_close, struct file *file, void *id)
+{
+ pid_t pid = bpf_get_current_pid_tgid() >> 32; /* upper 32 bits of the helper result */
+ __u32 cnt = cnt_close; /* snapshot of the next free slot */
+ int ret;
+
+ called_close = 1; /* set on every invocation, before the pid filter below */
+
+ if (pid != my_pid) /* ignore callers whose id differs from my_pid */
+ return 0;
+
+ if (cnt >= MAX_FILES) /* keep cnt a valid index into paths_close/rets_close */
+ return 0;
+ ret = bpf_d_path(&file->f_path,
+ paths_close[cnt], MAX_PATH_LEN);
+
+ rets_close[cnt] = ret; /* keep the helper's return value next to the path it wrote */
+ cnt_close++; /* the next matching call uses the following slot */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_enable_stats.c b/tools/testing/selftests/bpf/progs/test_enable_stats.c
new file mode 100644
index 000000000..01a002ade
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_enable_stats.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+__u64 count = 0;
+
+SEC("raw_tracepoint/sys_enter")
+int test_enable_stats(void *ctx)
+{
+ count += 1;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_endian.c b/tools/testing/selftests/bpf/progs/test_endian.c
new file mode 100644
index 000000000..ddb687c5d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_endian.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define IN16 0x1234
+#define IN32 0x12345678U
+#define IN64 0x123456789abcdef0ULL
+
+__u16 in16 = 0;
+__u32 in32 = 0;
+__u64 in64 = 0;
+
+__u16 out16 = 0;
+__u32 out32 = 0;
+__u64 out64 = 0;
+
+__u16 const16 = 0;
+__u32 const32 = 0;
+__u64 const64 = 0;
+
+SEC("raw_tp/sys_enter")
+int sys_enter(const void *ctx)
+{
+ out16 = __builtin_bswap16(in16); /* in* globals: byte-swapped with the compiler builtins */
+ out32 = __builtin_bswap32(in32);
+ out64 = __builtin_bswap64(in64);
+ const16 = ___bpf_swab16(IN16); /* IN* literals: byte-swapped with the ___bpf_swab* macros */
+ const32 = ___bpf_swab32(IN32);
+ const64 = ___bpf_swab64(IN64);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
new file mode 100644
index 000000000..b6a6eb279
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK_RAWTP 100
+struct stack_trace_t {
+ int pid;
+ int kern_stack_size;
+ int user_stack_size;
+ int user_stack_buildid_size;
+ __u64 kern_stack[MAX_STACK_RAWTP];
+ __u64 user_stack[MAX_STACK_RAWTP];
+ struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(__u32));
+} perfmap SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, struct stack_trace_t);
+} stackdata_map SEC(".maps");
+
+/* Allocate twice the per-cpu space actually needed. For the code below
+ * usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+ * if (usize < 0)
+ * return 0;
+ * ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+ *
+ * If we had value_size = MAX_STACK_RAWTP * sizeof(__u64), the
+ * verifier would complain that the access at "raw_data + usize"
+ * with size "max_len - usize" may be out of bounds.
+ * The maximum "raw_data + usize" is "raw_data + max_len"
+ * and the maximum "max_len - usize" is "max_len", so the verifier
+ * concludes that the maximum buffer access range is
+ * "raw_data[0...max_len * 2 - 1]" and hence rejects the program.
+ *
+ * Doubling the to-be-used max buffer size fixes this verifier
+ * issue and avoids complicated massaging of the C code.
+ * This is an acceptable workaround since there is only one entry here.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64[2 * MAX_STACK_RAWTP]);
+} rawdata_map SEC(".maps");
+
+SEC("raw_tracepoint/sys_enter")
+int bpf_prog1(void *ctx)
+{
+ int max_len, max_buildid_len, total_size;
+ struct stack_trace_t *data;
+ long usize, ksize;
+ void *raw_data;
+ __u32 key = 0;
+
+ data = bpf_map_lookup_elem(&stackdata_map, &key);
+ if (!data)
+ return 0;
+
+ max_len = MAX_STACK_RAWTP * sizeof(__u64);
+ max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
+ data->pid = bpf_get_current_pid_tgid();
+ data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+ max_len, 0);
+ data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+ BPF_F_USER_STACK);
+ data->user_stack_buildid_size = bpf_get_stack(
+ ctx, data->user_stack_buildid, max_buildid_len,
+ BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+ bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+ /* write both kernel and user stacks to the same buffer */
+ raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
+ if (!raw_data)
+ return 0;
+
+ usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+ if (usize < 0)
+ return 0;
+
+ ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+ if (ksize < 0)
+ return 0;
+
+ total_size = usize + ksize;
+ if (total_size > 0 && total_size <= max_len)
+ bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c
new file mode 100644
index 000000000..8941a41c2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_get_stack_rawtp_err.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define MAX_STACK_RAWTP 10
+
+SEC("raw_tracepoint/sys_enter")
+int bpf_prog2(void *ctx)
+{
+ __u64 stack[MAX_STACK_RAWTP];
+ int error;
+
+ /* set all the flags which should return -EINVAL */
+ error = bpf_get_stack(ctx, stack, 0, -1); /* size 0, flags -1 (every bit set) */
+ if (error < 0)
+ goto loop;
+
+ return error;
+loop:
+ while (1) { /* no exit condition; NOTE(review): presumably deliberate so the load fails when the error path is explored - confirm */
+ error++;
+ }
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_global_data.c b/tools/testing/selftests/bpf/progs/test_global_data.c
new file mode 100644
index 000000000..1319be1c5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_data.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Isovalent, Inc.
+
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <string.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 11);
+ __type(key, __u32);
+ __type(value, __u64);
+} result_number SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 5);
+ __type(key, __u32);
+ const char (*value)[32];
+} result_string SEC(".maps");
+
+struct foo {
+ __u8 a;
+ __u32 b;
+ __u64 c;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 5);
+ __type(key, __u32);
+ __type(value, struct foo);
+} result_struct SEC(".maps");
+
+/* Relocation tests for __u64s. */
+static __u64 num0;
+static __u64 num1 = 42;
+static const __u64 num2 = 24;
+static __u64 num3 = 0;
+static __u64 num4 = 0xffeeff;
+static const __u64 num5 = 0xabab;
+static const __u64 num6 = 0xab;
+
+/* Relocation tests for strings. */
+static const char str0[32] = "abcdefghijklmnopqrstuvwxyz";
+static char str1[32] = "abcdefghijklmnopqrstuvwxyz";
+static char str2[32];
+
+/* Relocation tests for structs. */
+static const struct foo struct0 = {
+ .a = 42,
+ .b = 0xfefeefef,
+ .c = 0x1111111111111111ULL,
+};
+static struct foo struct1;
+static const struct foo struct2;
+static struct foo struct3 = {
+ .a = 41,
+ .b = 0xeeeeefef,
+ .c = 0x2111111111111111ULL,
+};
+
+#define test_reloc(map, num, var) \
+ do { \
+ __u32 key = num; \
+ bpf_map_update_elem(&result_##map, &key, var, 0); \
+ } while (0)
+
+SEC("classifier/static_data_load")
+int load_static_data(struct __sk_buff *skb)
+{
+ static const __u64 bar = ~0;
+
+ test_reloc(number, 0, &num0);
+ test_reloc(number, 1, &num1);
+ test_reloc(number, 2, &num2);
+ test_reloc(number, 3, &num3);
+ test_reloc(number, 4, &num4);
+ test_reloc(number, 5, &num5);
+ num4 = 1234;
+ test_reloc(number, 6, &num4);
+ test_reloc(number, 7, &num0);
+ test_reloc(number, 8, &num6);
+
+ test_reloc(string, 0, str0);
+ test_reloc(string, 1, str1);
+ test_reloc(string, 2, str2);
+ str1[5] = 'x';
+ test_reloc(string, 3, str1);
+ __builtin_memcpy(&str2[2], "hello", sizeof("hello"));
+ test_reloc(string, 4, str2);
+
+ test_reloc(struct, 0, &struct0);
+ test_reloc(struct, 1, &struct1);
+ test_reloc(struct, 2, &struct2);
+ test_reloc(struct, 3, &struct3);
+
+ test_reloc(number, 9, &struct0.c);
+ test_reloc(number, 10, &bar);
+
+ return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_global_func1.c b/tools/testing/selftests/bpf/progs/test_global_func1.c
new file mode 100644
index 000000000..880260f6d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func1.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#ifndef MAX_STACK
+#define MAX_STACK (512 - 3 * 32 + 8)
+#endif
+
+static __attribute__ ((noinline))
+int f0(int var, struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+ volatile char buf[MAX_STACK] = {};
+
+ return f0(0, skb) + skb->len;
+}
+
+int f3(int, struct __sk_buff *skb, int);
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+ return f1(skb) + f3(val, skb, 1);
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb, int var)
+{
+ volatile char buf[MAX_STACK] = {};
+
+ return skb->ifindex * val * var;
+}
+
+SEC("classifier/test")
+int test_cls(struct __sk_buff *skb)
+{
+ return f0(1, skb) + f1(skb) + f2(2, skb) + f3(3, skb, 4);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func2.c b/tools/testing/selftests/bpf/progs/test_global_func2.c
new file mode 100644
index 000000000..2c18d8292
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func2.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#define MAX_STACK (512 - 3 * 32)
+#include "test_global_func1.c"
diff --git a/tools/testing/selftests/bpf/progs/test_global_func3.c b/tools/testing/selftests/bpf/progs/test_global_func3.c
new file mode 100644
index 000000000..86f0ecb30
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func3.c
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+ return f1(skb) + val;
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb, int var)
+{
+ return f2(var, skb) + val;
+}
+
+__attribute__ ((noinline))
+int f4(struct __sk_buff *skb)
+{
+ return f3(1, skb, 2);
+}
+
+__attribute__ ((noinline))
+int f5(struct __sk_buff *skb)
+{
+ return f4(skb);
+}
+
+__attribute__ ((noinline))
+int f6(struct __sk_buff *skb)
+{
+ return f5(skb);
+}
+
+__attribute__ ((noinline))
+int f7(struct __sk_buff *skb)
+{
+ return f6(skb);
+}
+
+#ifndef NO_FN8
+__attribute__ ((noinline))
+int f8(struct __sk_buff *skb)
+{
+ return f7(skb);
+}
+#endif
+
+SEC("classifier/test")
+int test_cls(struct __sk_buff *skb)
+{
+#ifndef NO_FN8
+ return f8(skb);
+#else
+ return f7(skb);
+#endif
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func4.c b/tools/testing/selftests/bpf/progs/test_global_func4.c
new file mode 100644
index 000000000..610f75edf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func4.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#define NO_FN8
+#include "test_global_func3.c"
diff --git a/tools/testing/selftests/bpf/progs/test_global_func5.c b/tools/testing/selftests/bpf/progs/test_global_func5.c
new file mode 100644
index 000000000..260c25b82
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func5.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+int f3(int, struct __sk_buff *skb);
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+ return f1(skb) + f3(val, (void *)&val); /* type mismatch */
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb)
+{
+ return skb->ifindex * val;
+}
+
+SEC("classifier/test")
+int test_cls(struct __sk_buff *skb)
+{
+ return f1(skb) + f2(2, skb) + f3(3, skb);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func6.c b/tools/testing/selftests/bpf/progs/test_global_func6.c
new file mode 100644
index 000000000..69e19c64e
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func6.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__attribute__ ((noinline))
+int f1(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+int f3(int, struct __sk_buff *skb);
+
+__attribute__ ((noinline))
+int f2(int val, struct __sk_buff *skb)
+{
+ return f1(skb) + f3(val, skb + 1); /* type mismatch */
+}
+
+__attribute__ ((noinline))
+int f3(int val, struct __sk_buff *skb)
+{
+ return skb->ifindex * val;
+}
+
+SEC("classifier/test")
+int test_cls(struct __sk_buff *skb)
+{
+ return f1(skb) + f2(2, skb) + f3(3, skb);
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func7.c b/tools/testing/selftests/bpf/progs/test_global_func7.c
new file mode 100644
index 000000000..309b3f613
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func7.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__attribute__ ((noinline))
+void foo(struct __sk_buff *skb)
+{
+ skb->tc_index = 0;
+}
+
+SEC("classifier/test")
+int test_cls(struct __sk_buff *skb)
+{
+ foo(skb);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_global_func8.c b/tools/testing/selftests/bpf/progs/test_global_func8.c
new file mode 100644
index 000000000..d55a6544b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_global_func8.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__noinline int foo(struct __sk_buff *skb)
+{ /* skb is accepted but never read. */
+ return bpf_get_prandom_u32(); /* the helper's value is returned as an int */
+}
+
+SEC("cgroup_skb/ingress")
+int test_cls(struct __sk_buff *skb)
+{ /* Program entry point: maps foo()'s result onto a 0/1 return value. */
+ if (!foo(skb)) /* foo() returned 0 */
+ return 0;
+
+ return 1; /* foo() returned anything non-zero */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_jhash.h b/tools/testing/selftests/bpf/progs/test_jhash.h
new file mode 100644
index 000000000..c300734d2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_jhash.h
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <features.h>
+
+typedef unsigned int u32;
+
+static __always_inline u32 rol32(u32 word, unsigned int shift)
+{ /* Rotates the 32-bit word left by shift bits. */
+ return (word << shift) | (word >> ((-shift) & 31)); /* (-shift) & 31 keeps the right-shift count within 0..31 */
+}
+
+#define __jhash_mix(a, b, c) /* mixes a, b and c in place: six subtract / xor-with-rotate / add steps */ \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+#define __jhash_final(a, b, c) /* final mix of a, b and c in place: seven xor / subtract-rotated steps */ \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+#define JHASH_INITVAL 0xdeadbeef
+
+static ATTR /* ATTR is not defined in this header; NOTE(review): presumably supplied by the including file */
+u32 jhash(const void *key, u32 length, u32 initval)
+{ /* Hashes `length` bytes starting at `key`, seeded with `initval`; the result is the final value of c. */
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval; /* all three state words start from the same seed */
+
+ while (length > 12) { /* consume 12 bytes (three volatile u32 loads) per pass while more than 12 remain */
+ a += *(volatile u32 *)(k);
+ b += *(volatile u32 *)(k + 4);
+ c += *(volatile u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) { /* 0..12 bytes left; no case above `case 0` has a break, so each falls through to the ones below */
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ c ^= a; /* extra xor applied just before the final mix */
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break; /* length == 0 jumps straight here, skipping the final mix */
+ }
+
+ return c;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms.c b/tools/testing/selftests/bpf/progs/test_ksyms.c
new file mode 100644
index 000000000..6c9cbb5a3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ksyms.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+__u64 out__bpf_link_fops = -1;
+__u64 out__bpf_link_fops1 = -1;
+__u64 out__btf_size = -1;
+__u64 out__per_cpu_start = -1;
+
+extern const void bpf_link_fops __ksym;
+extern const void __start_BTF __ksym;
+extern const void __stop_BTF __ksym;
+extern const void __per_cpu_start __ksym;
+/* non-existing symbol, weak, default to zero */
+extern const void bpf_link_fops1 __ksym __weak;
+
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{ /* Stores the addresses of the __ksym externs declared above into the out__* globals; ctx is unused. */
+ out__bpf_link_fops = (__u64)&bpf_link_fops;
+ out__btf_size = (__u64)(&__stop_BTF - &__start_BTF); /* distance between the two BTF boundary symbols */
+ out__per_cpu_start = (__u64)&__per_cpu_start;
+
+ out__bpf_link_fops1 = (__u64)&bpf_link_fops1; /* the weak, non-existing symbol (see its declaration above) */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c
new file mode 100644
index 000000000..bb8ea9270
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf.c
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Google */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+__u64 out__runqueues_addr = -1;
+__u64 out__bpf_prog_active_addr = -1;
+
+__u32 out__rq_cpu = -1; /* percpu struct fields */
+int out__bpf_prog_active = -1; /* percpu int */
+
+__u32 out__this_rq_cpu = -1;
+int out__this_bpf_prog_active = -1;
+
+__u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */
+
+extern const struct rq runqueues __ksym; /* struct type global var. */
+extern const int bpf_prog_active __ksym; /* int type global var. */
+
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{ /* Records ksym addresses and per-CPU values read through both helpers into the out__* globals; ctx is unused. */
+ struct rq *rq;
+ int *active;
+ __u32 cpu;
+
+ out__runqueues_addr = (__u64)&runqueues;
+ out__bpf_prog_active_addr = (__u64)&bpf_prog_active;
+
+ cpu = bpf_get_smp_processor_id(); /* CPU id used for the bpf_per_cpu_ptr() lookups below */
+
+ /* test bpf_per_cpu_ptr() */
+ rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
+ if (rq) /* every bpf_per_cpu_ptr() result here is NULL-checked before use */
+ out__rq_cpu = rq->cpu;
+ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+ if (active)
+ out__bpf_prog_active = *active;
+
+ rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0); /* same lookup with a fixed CPU index of 0 */
+ if (rq) /* should always be valid, but we can't spare the check. */
+ out__cpu_0_rq_cpu = rq->cpu;
+
+ /* test bpf_this_cpu_ptr */
+ rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
+ out__this_rq_cpu = rq->cpu; /* dereferenced without a NULL check, unlike the bpf_per_cpu_ptr() results */
+ active = (int *)bpf_this_cpu_ptr(&bpf_prog_active);
+ out__this_bpf_prog_active = *active; /* likewise unchecked */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c
new file mode 100644
index 000000000..8bc8f7c63
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_null_check.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+
+#include <bpf/bpf_helpers.h>
+
+extern const struct rq runqueues __ksym; /* struct type global var. */
+extern const int bpf_prog_active __ksym; /* int type global var. */
+
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{ /* Checks one bpf_per_cpu_ptr() result for NULL but dereferences the other unchecked; ctx is unused. */
+ struct rq *rq;
+ int *active;
+ __u32 cpu;
+
+ cpu = bpf_get_smp_processor_id();
+ rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu); /* rq is never compared against NULL below */
+ active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
+ if (active) { /* only `active` is validated */
+ /* READ_ONCE */
+ *(volatile int *)active;
+ /* !rq has not been tested, so verifier should reject. */
+ *(volatile int *)(&rq->cpu);
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_l4lb.c b/tools/testing/selftests/bpf/progs/test_l4lb.c
new file mode 100644
index 000000000..33493911d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_l4lb.c
@@ -0,0 +1,473 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include "test_iptunnel_common.h"
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+
+static inline __u32 rol32(__u32 word, unsigned int shift)
+{ /* Rotates the 32-bit word left by shift bits. */
+ return (word << shift) | (word >> ((-shift) & 31)); /* (-shift) & 31 keeps the right-shift count within 0..31 */
+}
+
+/* Copy of jhash from the kernel sources, kept here to make sure LLVM
+ * can compile it into a valid sequence of BPF instructions.
+ */
+#define __jhash_mix(a, b, c) /* mixes a, b and c in place: six subtract / xor-with-rotate / add steps */ \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+#define __jhash_final(a, b, c) /* final mix of a, b and c in place: seven xor / subtract-rotated steps */ \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+#define JHASH_INITVAL 0xdeadbeef
+
+typedef unsigned int u32;
+
+static inline u32 jhash(const void *key, u32 length, u32 initval)
+{ /* Hashes `length` bytes starting at `key`, seeded with `initval`; the result is the final value of c. */
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval; /* all three state words start from the same seed */
+
+ while (length > 12) { /* consume 12 bytes (three u32 loads) per pass while more than 12 remain */
+ a += *(u32 *)(k);
+ b += *(u32 *)(k + 4);
+ c += *(u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) { /* 0..12 bytes left; no case above `case 0` has a break, so each falls through to the ones below */
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break; /* length == 0 jumps straight here, skipping the final mix */
+ }
+
+ return c;
+}
+
+static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{ /* Hashes three already-loaded words: adds initval to each, runs the final mix, returns c. */
+ a += initval;
+ b += initval;
+ c += initval;
+ __jhash_final(a, b, c);
+ return c;
+}
+
+static inline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{ /* Hashes two words by passing 0 as the third word to __jhash_nwords(). */
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); /* (2 << 2) == 8; NOTE(review): presumably the byte length of two u32 words */
+}
+
+#define PCKT_FRAGMENTED 65343 /* 0xff3f; IPv4 packets with any of these frag_off bits set are dropped */
+#define IPV4_HDR_LEN_NO_OPT 20 /* offset step past an IPv4 header, which is accepted only when ihl == 5 */
+#define IPV4_PLUS_ICMP_HDR 28 /* offset step applied after parse_icmp() asks the caller to continue */
+#define IPV6_PLUS_ICMP_HDR 48 /* offset step applied after parse_icmpv6() asks the caller to continue */
+#define RING_SIZE 2 /* number of ch_rings slots per VIP */
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0) /* tested in real_definition.flags */
+#define F_HASH_NO_SRC_PORT (1 << 0) /* tested in vip_meta.flags */
+#define F_ICMP (1 << 0) /* set in packet_description.flags */
+#define F_SYN_SET (1 << 1) /* set in packet_description.flags */
+
+struct packet_description {
+ union {
+ __be32 src;
+ __be32 srcv6[4];
+ };
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ union {
+ __u32 ports;
+ __u16 port16[2];
+ };
+ __u8 proto;
+ __u8 flags;
+};
+
+struct ctl_value {
+ union {
+ __u64 value;
+ __u32 ifindex;
+ __u8 mac[6];
+ };
+};
+
+struct vip_meta {
+ __u32 flags;
+ __u32 vip_num;
+};
+
+struct real_definition {
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+};
+
+struct vip_stats {
+ __u64 bytes;
+ __u64 pkts;
+};
+
+struct eth_hdr {
+ unsigned char eth_dest[ETH_ALEN];
+ unsigned char eth_source[ETH_ALEN];
+ unsigned short eth_proto;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, struct vip);
+ __type(value, struct vip_meta);
+} vip_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CH_RINGS_SIZE);
+ __type(key, __u32);
+ __type(value, __u32);
+} ch_rings SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_REALS);
+ __type(key, __u32);
+ __type(value, struct real_definition);
+} reals SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, __u32);
+ __type(value, struct vip_stats);
+} stats SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CTL_MAP_SIZE);
+ __type(key, __u32);
+ __type(value, struct ctl_value);
+} ctl_array SEC(".maps");
+
+static __always_inline __u32 get_packet_hash(struct packet_description *pckt,
+ bool ipv6)
+{ /* Hashes the packet's source address together with its ports word. */
+ if (ipv6)
+ return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), /* the 16-byte source is first folded to one word */
+ pckt->ports, CH_RINGS_SIZE);
+ else
+ return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); /* the 32-bit source is used directly */
+}
+
+static __always_inline bool get_packet_dst(struct real_definition **real,
+ struct packet_description *pckt,
+ struct vip_meta *vip_info,
+ bool is_ipv6)
+{ /* Picks the backend for pckt: ch_rings yields an index into reals. Returns false if either lookup misses. */
+ __u32 hash = get_packet_hash(pckt, is_ipv6) % RING_SIZE; /* slot within this VIP's ring */
+ __u32 key = RING_SIZE * vip_info->vip_num + hash; /* each VIP owns RING_SIZE consecutive ch_rings entries */
+ __u32 *real_pos;
+
+ real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+ if (!real_pos)
+ return false;
+ key = *real_pos; /* key is reused as the index into the reals map */
+ *real = bpf_map_lookup_elem(&reals, &key); /* the lookup result is stored in *real even when it is NULL */
+ if (!(*real))
+ return false;
+ return true;
+}
+
+static __always_inline int parse_icmpv6(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{ /* Inspects the ICMPv6 header at data + off; for ICMPV6_PKT_TOOBIG it fills pckt from the IPv6 header that follows. */
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG) /* every other ICMPv6 type yields TC_ACT_OK */
+ return TC_ACT_OK;
+ off += sizeof(struct icmp6hdr);
+ ip6h = data + off; /* IPv6 header located right after the ICMPv6 header */
+ if (ip6h + 1 > data_end)
+ return TC_ACT_SHOT;
+ pckt->proto = ip6h->nexthdr;
+ pckt->flags |= F_ICMP;
+ memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16); /* addresses are swapped: that header's daddr becomes our src */
+ memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
+ return TC_ACT_UNSPEC; /* process_packet() returns early only on results >= 0 */
+}
+
+static __always_inline int parse_icmp(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{ /* Inspects the ICMP header at data + off; for "fragmentation needed" it fills pckt from the IPv4 header that follows. */
+ struct icmphdr *icmp_hdr;
+ struct iphdr *iph;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (icmp_hdr->type != ICMP_DEST_UNREACH ||
+ icmp_hdr->code != ICMP_FRAG_NEEDED) /* any other type/code pair yields TC_ACT_OK */
+ return TC_ACT_OK;
+ off += sizeof(struct icmphdr);
+ iph = data + off; /* IPv4 header located right after the ICMP header */
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5) /* only an ihl of exactly 5 is accepted */
+ return TC_ACT_SHOT;
+ pckt->proto = iph->protocol;
+ pckt->flags |= F_ICMP;
+ pckt->src = iph->daddr; /* addresses are swapped: that header's daddr becomes our src */
+ pckt->dst = iph->saddr;
+ return TC_ACT_UNSPEC; /* process_packet() returns early only on results >= 0 */
+}
+
+static __always_inline bool parse_udp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{ /* Copies the UDP ports at data + off into pckt->port16; returns false if the header does not fit. */
+ struct udphdr *udp;
+ udp = data + off;
+
+ if (udp + 1 > data_end)
+ return false;
+
+ if (!(pckt->flags & F_ICMP)) { /* normal packet: [0] = source port, [1] = destination port */
+ pckt->port16[0] = udp->source;
+ pckt->port16[1] = udp->dest;
+ } else { /* F_ICMP set by an ICMP parser: the two ports are stored swapped */
+ pckt->port16[0] = udp->dest;
+ pckt->port16[1] = udp->source;
+ }
+ return true;
+}
+
+static __always_inline bool parse_tcp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{ /* Copies the TCP ports at data + off into pckt->port16 and notes SYN; returns false if the header does not fit. */
+ struct tcphdr *tcp;
+
+ tcp = data + off;
+ if (tcp + 1 > data_end)
+ return false;
+
+ if (tcp->syn) /* remember that the SYN bit was set */
+ pckt->flags |= F_SYN_SET;
+
+ if (!(pckt->flags & F_ICMP)) { /* normal packet: [0] = source port, [1] = destination port */
+ pckt->port16[0] = tcp->source;
+ pckt->port16[1] = tcp->dest;
+ } else { /* F_ICMP set by an ICMP parser: the two ports are stored swapped */
+ pckt->port16[0] = tcp->dest;
+ pckt->port16[1] = tcp->source;
+ }
+ return true;
+}
+
+static __always_inline int process_packet(void *data, __u64 off, void *data_end,
+ bool is_ipv6, struct __sk_buff *skb)
+{ /* Parses the L3/L4 headers at data + off, looks up the VIP and a backend, sets the tunnel key and redirects; returns a TC_ACT_* code or the bpf_redirect() result. */
+ void *pkt_start = (void *)(long)skb->data;
+ struct packet_description pckt = {};
+ struct eth_hdr *eth = pkt_start;
+ struct bpf_tunnel_key tkey = {};
+ struct vip_stats *data_stats;
+ struct real_definition *dst;
+ struct vip_meta *vip_info;
+ struct ctl_value *cval;
+ __u32 v4_intf_pos = 1; /* ctl_array index read when the backend is IPv4 */
+ __u32 v6_intf_pos = 2; /* ctl_array index read when the backend has F_IPV6 set */
+ struct ipv6hdr *ip6h;
+ struct vip vip = {};
+ struct iphdr *iph;
+ int tun_flag = 0;
+ __u16 pkt_bytes;
+ __u64 iph_len;
+ __u32 ifindex;
+ __u8 protocol;
+ __u32 vip_num;
+ int action;
+
+ tkey.tunnel_ttl = 64;
+ if (is_ipv6) {
+ ip6h = data + off;
+ if (ip6h + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+
+ iph_len = sizeof(struct ipv6hdr);
+ protocol = ip6h->nexthdr;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(ip6h->payload_len);
+ off += iph_len;
+ if (protocol == IPPROTO_FRAGMENT) { /* a fragment next-header is dropped */
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_ICMPV6) {
+ action = parse_icmpv6(data, data_end, off, &pckt);
+ if (action >= 0) /* the parser reached a verdict; only a negative result continues */
+ return action;
+ off += IPV6_PLUS_ICMP_HDR; /* NOTE(review): presumably moves off to the transport header inside the ICMPv6 payload */
+ } else {
+ memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
+ memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
+ }
+ } else {
+ iph = data + off;
+ if (iph + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5) /* only an ihl of exactly 5 is accepted */
+ return TC_ACT_SHOT;
+
+ protocol = iph->protocol;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(iph->tot_len);
+ off += IPV4_HDR_LEN_NO_OPT;
+
+ if (iph->frag_off & PCKT_FRAGMENTED) /* any masked frag_off bit set: drop */
+ return TC_ACT_SHOT;
+ if (protocol == IPPROTO_ICMP) {
+ action = parse_icmp(data, data_end, off, &pckt);
+ if (action >= 0) /* the parser reached a verdict; only a negative result continues */
+ return action;
+ off += IPV4_PLUS_ICMP_HDR; /* NOTE(review): presumably moves off to the transport header inside the ICMP payload */
+ } else {
+ pckt.src = iph->saddr;
+ pckt.dst = iph->daddr;
+ }
+ }
+ protocol = pckt.proto; /* re-read: the ICMP parsers overwrite pckt.proto with the embedded header's protocol */
+
+ if (protocol == IPPROTO_TCP) {
+ if (!parse_tcp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_UDP) {
+ if (!parse_udp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else { /* anything other than TCP or UDP is dropped */
+ return TC_ACT_SHOT;
+ }
+
+ if (is_ipv6) /* build the vip_map key from destination address, port and protocol */
+ memcpy(vip.daddr.v6, pckt.dstv6, 16);
+ else
+ vip.daddr.v4 = pckt.dst;
+
+ vip.dport = pckt.port16[1];
+ vip.protocol = pckt.proto;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info) { /* no exact match: retry with the destination port cleared */
+ vip.dport = 0;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info)
+ return TC_ACT_SHOT;
+ pckt.port16[1] = 0; /* the cleared port is also what gets hashed */
+ }
+
+ if (vip_info->flags & F_HASH_NO_SRC_PORT) /* zero port16[0] so it does not influence the hash */
+ pckt.port16[0] = 0;
+
+ if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
+ return TC_ACT_SHOT;
+
+ if (dst->flags & F_IPV6) { /* IPv6 backend: v6 ctl entry, 16-byte remote address, IPv6 tunnel flag */
+ cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ memcpy(tkey.remote_ipv6, dst->dstv6, 16);
+ tun_flag = BPF_F_TUNINFO_IPV6;
+ } else { /* IPv4 backend: v4 ctl entry, tun_flag stays 0 */
+ cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ tkey.remote_ipv4 = dst->dst;
+ }
+ vip_num = vip_info->vip_num;
+ data_stats = bpf_map_lookup_elem(&stats, &vip_num); /* counters in the stats map are keyed by vip_num */
+ if (!data_stats)
+ return TC_ACT_SHOT;
+ data_stats->pkts++;
+ data_stats->bytes += pkt_bytes;
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag); /* return value is not checked */
+ *(u32 *)eth->eth_dest = tkey.remote_ipv4; /* overwrites the first 4 bytes of the Ethernet destination field */
+ return bpf_redirect(ifindex, 0);
+}
+
+SEC("l4lb-demo")
+int balancer_ingress(struct __sk_buff *ctx)
+{ /* Program entry point: validates the Ethernet header, then dispatches on its protocol field. */
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct eth_hdr *eth = data;
+ __u32 eth_proto;
+ __u32 nh_off;
+
+ nh_off = sizeof(struct eth_hdr); /* offset of the network header passed on to process_packet() */
+ if (data + nh_off > data_end) /* packet too short to hold an Ethernet header */
+ return TC_ACT_SHOT;
+ eth_proto = eth->eth_proto;
+ if (eth_proto == bpf_htons(ETH_P_IP))
+ return process_packet(data, nh_off, data_end, false, ctx);
+ else if (eth_proto == bpf_htons(ETH_P_IPV6))
+ return process_packet(data, nh_off, data_end, true, ctx);
+ else /* every other protocol value is dropped */
+ return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
new file mode 100644
index 000000000..b9e2753f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c
@@ -0,0 +1,470 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include "test_iptunnel_common.h"
+#include <bpf/bpf_endian.h>
+
+static __always_inline __u32 rol32(__u32 word, unsigned int shift)
+{ /* Rotates the 32-bit word left by shift bits. */
+ return (word << shift) | (word >> ((-shift) & 31)); /* (-shift) & 31 keeps the right-shift count within 0..31 */
+}
+
+/* Copy of jhash from the kernel sources, kept here to make sure LLVM
+ * can compile it into a valid sequence of BPF instructions.
+ */
+#define __jhash_mix(a, b, c) /* mixes a, b and c in place: six subtract / xor-with-rotate / add steps */ \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+
+#define __jhash_final(a, b, c) /* final mix of a, b and c in place: seven xor / subtract-rotated steps */ \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+
+#define JHASH_INITVAL 0xdeadbeef
+
+typedef unsigned int u32;
+
+static __noinline u32 jhash(const void *key, u32 length, u32 initval)
+{ /* Hashes `length` bytes starting at `key`, seeded with `initval`; the result is the final value of c. */
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval; /* all three state words start from the same seed */
+
+ while (length > 12) { /* consume 12 bytes (three u32 loads) per pass while more than 12 remain */
+ a += *(u32 *)(k);
+ b += *(u32 *)(k + 4);
+ c += *(u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) { /* 0..12 bytes left; no case above `case 0` has a break, so each falls through to the ones below */
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break; /* length == 0 jumps straight here, skipping the final mix */
+ }
+
+ return c;
+}
+
+static __noinline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{ /* Hashes three already-loaded words: adds initval to each, runs the final mix, returns c. */
+ a += initval;
+ b += initval;
+ c += initval;
+ __jhash_final(a, b, c);
+ return c;
+}
+
+static __noinline u32 jhash_2words(u32 a, u32 b, u32 initval)
+{ /* Hashes two words by passing 0 as the third word to __jhash_nwords(). */
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); /* (2 << 2) == 8; NOTE(review): presumably the byte length of two u32 words */
+}
+
+#define PCKT_FRAGMENTED 65343 /* 0xff3f; IPv4 packets with any of these frag_off bits set are dropped */
+#define IPV4_HDR_LEN_NO_OPT 20 /* offset step past an IPv4 header, which is accepted only when ihl == 5 */
+#define IPV4_PLUS_ICMP_HDR 28 /* offset step applied after parse_icmp() asks the caller to continue */
+#define IPV6_PLUS_ICMP_HDR 48 /* offset step applied after parse_icmpv6() asks the caller to continue */
+#define RING_SIZE 2 /* number of ch_rings slots per VIP */
+#define MAX_VIPS 12
+#define MAX_REALS 5
+#define CTL_MAP_SIZE 16
+#define CH_RINGS_SIZE (MAX_VIPS * RING_SIZE)
+#define F_IPV6 (1 << 0) /* tested in real_definition.flags */
+#define F_HASH_NO_SRC_PORT (1 << 0) /* tested in vip_meta.flags */
+#define F_ICMP (1 << 0) /* set in packet_description.flags */
+#define F_SYN_SET (1 << 1) /* set in packet_description.flags */
+
+struct packet_description {
+ union {
+ __be32 src;
+ __be32 srcv6[4];
+ };
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ union {
+ __u32 ports;
+ __u16 port16[2];
+ };
+ __u8 proto;
+ __u8 flags;
+};
+
+struct ctl_value {
+ union {
+ __u64 value;
+ __u32 ifindex;
+ __u8 mac[6];
+ };
+};
+
+struct vip_meta {
+ __u32 flags;
+ __u32 vip_num;
+};
+
+struct real_definition {
+ union {
+ __be32 dst;
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+};
+
+struct vip_stats {
+ __u64 bytes;
+ __u64 pkts;
+};
+
+struct eth_hdr {
+ unsigned char eth_dest[ETH_ALEN];
+ unsigned char eth_source[ETH_ALEN];
+ unsigned short eth_proto;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, struct vip);
+ __type(value, struct vip_meta);
+} vip_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CH_RINGS_SIZE);
+ __type(key, __u32);
+ __type(value, __u32);
+} ch_rings SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_REALS);
+ __type(key, __u32);
+ __type(value, struct real_definition);
+} reals SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, MAX_VIPS);
+ __type(key, __u32);
+ __type(value, struct vip_stats);
+} stats SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, CTL_MAP_SIZE);
+ __type(key, __u32);
+ __type(value, struct ctl_value);
+} ctl_array SEC(".maps");
+
+static __noinline __u32 get_packet_hash(struct packet_description *pckt, bool ipv6)
+{ /* Hashes the packet's source address together with its ports word. */
+ if (ipv6)
+ return jhash_2words(jhash(pckt->srcv6, 16, MAX_VIPS), /* the 16-byte source is first folded to one word */
+ pckt->ports, CH_RINGS_SIZE);
+ else
+ return jhash_2words(pckt->src, pckt->ports, CH_RINGS_SIZE); /* the 32-bit source is used directly */
+}
+
+static __noinline bool get_packet_dst(struct real_definition **real,
+ struct packet_description *pckt,
+ struct vip_meta *vip_info,
+ bool is_ipv6)
+{ /* Picks the backend for pckt: ch_rings yields an index into reals. Returns false on an unexpected hash or a lookup miss. */
+ __u32 hash = get_packet_hash(pckt, is_ipv6); /* full hash is kept so it can be compared with the expected values */
+ __u32 key = RING_SIZE * vip_info->vip_num + hash % RING_SIZE; /* each VIP owns RING_SIZE consecutive ch_rings entries */
+ __u32 *real_pos;
+
+ if (hash != 0x358459b7 /* jhash of ipv4 packet */ &&
+ hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
+ return 0; /* hash matches neither hard-coded value: report failure */
+
+ real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+ if (!real_pos)
+ return false;
+ key = *real_pos; /* key is reused as the index into the reals map */
+ *real = bpf_map_lookup_elem(&reals, &key); /* the lookup result is stored in *real even when it is NULL */
+ if (!(*real))
+ return false;
+ return true;
+}
+
+static __noinline int parse_icmpv6(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{ /* Inspects the ICMPv6 header at data + off; for ICMPV6_PKT_TOOBIG it fills pckt from the IPv6 header that follows. */
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (icmp_hdr->icmp6_type != ICMPV6_PKT_TOOBIG) /* every other ICMPv6 type yields TC_ACT_OK */
+ return TC_ACT_OK;
+ off += sizeof(struct icmp6hdr);
+ ip6h = data + off; /* IPv6 header located right after the ICMPv6 header */
+ if (ip6h + 1 > data_end)
+ return TC_ACT_SHOT;
+ pckt->proto = ip6h->nexthdr;
+ pckt->flags |= F_ICMP;
+ memcpy(pckt->srcv6, ip6h->daddr.s6_addr32, 16); /* addresses are swapped: that header's daddr becomes our src */
+ memcpy(pckt->dstv6, ip6h->saddr.s6_addr32, 16);
+ return TC_ACT_UNSPEC; /* process_packet() returns early only on results >= 0 */
+}
+
+static __noinline int parse_icmp(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{ /* Inspects the ICMP header at data + off; for "fragmentation needed" it fills pckt from the IPv4 header that follows. */
+ struct icmphdr *icmp_hdr;
+ struct iphdr *iph;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (icmp_hdr->type != ICMP_DEST_UNREACH ||
+ icmp_hdr->code != ICMP_FRAG_NEEDED) /* any other type/code pair yields TC_ACT_OK */
+ return TC_ACT_OK;
+ off += sizeof(struct icmphdr);
+ iph = data + off; /* IPv4 header located right after the ICMP header */
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5) /* only an ihl of exactly 5 is accepted */
+ return TC_ACT_SHOT;
+ pckt->proto = iph->protocol;
+ pckt->flags |= F_ICMP;
+ pckt->src = iph->daddr; /* addresses are swapped: that header's daddr becomes our src */
+ pckt->dst = iph->saddr;
+ return TC_ACT_UNSPEC; /* process_packet() returns early only on results >= 0 */
+}
+
+static __noinline bool parse_udp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{ /* Copies the UDP ports at data + off into pckt->port16; returns false if the header does not fit. */
+ struct udphdr *udp;
+ udp = data + off;
+
+ if (udp + 1 > data_end)
+ return false;
+
+ if (!(pckt->flags & F_ICMP)) { /* normal packet: [0] = source port, [1] = destination port */
+ pckt->port16[0] = udp->source;
+ pckt->port16[1] = udp->dest;
+ } else { /* F_ICMP set by an ICMP parser: the two ports are stored swapped */
+ pckt->port16[0] = udp->dest;
+ pckt->port16[1] = udp->source;
+ }
+ return true;
+}
+
+static __noinline bool parse_tcp(void *data, __u64 off, void *data_end,
+ struct packet_description *pckt)
+{ /* Copies the TCP ports at data + off into pckt->port16 and notes SYN; returns false if the header does not fit. */
+ struct tcphdr *tcp;
+
+ tcp = data + off;
+ if (tcp + 1 > data_end)
+ return false;
+
+ if (tcp->syn) /* remember that the SYN bit was set */
+ pckt->flags |= F_SYN_SET;
+
+ if (!(pckt->flags & F_ICMP)) { /* normal packet: [0] = source port, [1] = destination port */
+ pckt->port16[0] = tcp->source;
+ pckt->port16[1] = tcp->dest;
+ } else { /* F_ICMP set by an ICMP parser: the two ports are stored swapped */
+ pckt->port16[0] = tcp->dest;
+ pckt->port16[1] = tcp->source;
+ }
+ return true;
+}
+
+static __noinline int process_packet(void *data, __u64 off, void *data_end,
+ bool is_ipv6, struct __sk_buff *skb)
+{ /* Parses the L3/L4 headers at data + off, looks up the VIP and a backend, sets the tunnel key and redirects; returns a TC_ACT_* code or the bpf_redirect() result. */
+ void *pkt_start = (void *)(long)skb->data;
+ struct packet_description pckt = {};
+ struct eth_hdr *eth = pkt_start;
+ struct bpf_tunnel_key tkey = {};
+ struct vip_stats *data_stats;
+ struct real_definition *dst;
+ struct vip_meta *vip_info;
+ struct ctl_value *cval;
+ __u32 v4_intf_pos = 1; /* ctl_array index read when the backend is IPv4 */
+ __u32 v6_intf_pos = 2; /* ctl_array index read when the backend has F_IPV6 set */
+ struct ipv6hdr *ip6h;
+ struct vip vip = {};
+ struct iphdr *iph;
+ int tun_flag = 0;
+ __u16 pkt_bytes;
+ __u64 iph_len;
+ __u32 ifindex;
+ __u8 protocol;
+ __u32 vip_num;
+ int action;
+
+ tkey.tunnel_ttl = 64;
+ if (is_ipv6) {
+ ip6h = data + off;
+ if (ip6h + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+
+ iph_len = sizeof(struct ipv6hdr);
+ protocol = ip6h->nexthdr;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(ip6h->payload_len);
+ off += iph_len;
+ if (protocol == IPPROTO_FRAGMENT) { /* a fragment next-header is dropped */
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_ICMPV6) {
+ action = parse_icmpv6(data, data_end, off, &pckt);
+ if (action >= 0) /* the parser reached a verdict; only a negative result continues */
+ return action;
+ off += IPV6_PLUS_ICMP_HDR; /* NOTE(review): presumably moves off to the transport header inside the ICMPv6 payload */
+ } else {
+ memcpy(pckt.srcv6, ip6h->saddr.s6_addr32, 16);
+ memcpy(pckt.dstv6, ip6h->daddr.s6_addr32, 16);
+ }
+ } else {
+ iph = data + off;
+ if (iph + 1 > data_end) /* header would run past the end of the packet */
+ return TC_ACT_SHOT;
+ if (iph->ihl != 5) /* only an ihl of exactly 5 is accepted */
+ return TC_ACT_SHOT;
+
+ protocol = iph->protocol;
+ pckt.proto = protocol;
+ pkt_bytes = bpf_ntohs(iph->tot_len);
+ off += IPV4_HDR_LEN_NO_OPT;
+
+ if (iph->frag_off & PCKT_FRAGMENTED) /* any masked frag_off bit set: drop */
+ return TC_ACT_SHOT;
+ if (protocol == IPPROTO_ICMP) {
+ action = parse_icmp(data, data_end, off, &pckt);
+ if (action >= 0) /* the parser reached a verdict; only a negative result continues */
+ return action;
+ off += IPV4_PLUS_ICMP_HDR; /* NOTE(review): presumably moves off to the transport header inside the ICMP payload */
+ } else {
+ pckt.src = iph->saddr;
+ pckt.dst = iph->daddr;
+ }
+ }
+ protocol = pckt.proto; /* re-read: the ICMP parsers overwrite pckt.proto with the embedded header's protocol */
+
+ if (protocol == IPPROTO_TCP) {
+ if (!parse_tcp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else if (protocol == IPPROTO_UDP) {
+ if (!parse_udp(data, off, data_end, &pckt))
+ return TC_ACT_SHOT;
+ } else { /* anything other than TCP or UDP is dropped */
+ return TC_ACT_SHOT;
+ }
+
+ if (is_ipv6) /* build the vip_map key from destination address, port and protocol */
+ memcpy(vip.daddr.v6, pckt.dstv6, 16);
+ else
+ vip.daddr.v4 = pckt.dst;
+
+ vip.dport = pckt.port16[1];
+ vip.protocol = pckt.proto;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info) { /* no exact match: retry with the destination port cleared */
+ vip.dport = 0;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info)
+ return TC_ACT_SHOT;
+ pckt.port16[1] = 0; /* the cleared port is also what gets hashed */
+ }
+
+ if (vip_info->flags & F_HASH_NO_SRC_PORT) /* zero port16[0] so it does not influence the hash */
+ pckt.port16[0] = 0;
+
+ if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6))
+ return TC_ACT_SHOT;
+
+ if (dst->flags & F_IPV6) { /* IPv6 backend: v6 ctl entry, 16-byte remote address, IPv6 tunnel flag */
+ cval = bpf_map_lookup_elem(&ctl_array, &v6_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ memcpy(tkey.remote_ipv6, dst->dstv6, 16);
+ tun_flag = BPF_F_TUNINFO_IPV6;
+ } else { /* IPv4 backend: v4 ctl entry, tun_flag stays 0 */
+ cval = bpf_map_lookup_elem(&ctl_array, &v4_intf_pos);
+ if (!cval)
+ return TC_ACT_SHOT;
+ ifindex = cval->ifindex;
+ tkey.remote_ipv4 = dst->dst;
+ }
+ vip_num = vip_info->vip_num;
+ data_stats = bpf_map_lookup_elem(&stats, &vip_num); /* counters in the stats map are keyed by vip_num */
+ if (!data_stats)
+ return TC_ACT_SHOT;
+ data_stats->pkts++;
+ data_stats->bytes += pkt_bytes;
+ bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), tun_flag); /* return value is not checked */
+ *(u32 *)eth->eth_dest = tkey.remote_ipv4; /* overwrites the first 4 bytes of the Ethernet destination field */
+ return bpf_redirect(ifindex, 0);
+}
+
+SEC("l4lb-demo")
+int balancer_ingress(struct __sk_buff *ctx)
+{ /* Program entry point: validates the Ethernet header, then dispatches on its protocol field. */
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct eth_hdr *eth = data;
+ __u32 eth_proto;
+ __u32 nh_off;
+
+ nh_off = sizeof(struct eth_hdr); /* offset of the network header passed on to process_packet() */
+ if (data + nh_off > data_end) /* packet too short to hold an Ethernet header */
+ return TC_ACT_SHOT;
+ eth_proto = eth->eth_proto;
+ if (eth_proto == bpf_htons(ETH_P_IP))
+ return process_packet(data, nh_off, data_end, false, ctx);
+ else if (eth_proto == bpf_htons(ETH_P_IPV6))
+ return process_packet(data, nh_off, data_end, true, ctx);
+ else /* every other protocol value is dropped */
+ return TC_ACT_SHOT;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_link_pinning.c b/tools/testing/selftests/bpf/progs/test_link_pinning.c
new file mode 100644
index 000000000..bbf2a5264
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_link_pinning.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int in = 0;
+int out = 0;
+
+SEC("raw_tp/sys_enter")
+int raw_tp_prog(const void *ctx)
+{ /* Copies the global `in` into the global `out`; ctx is unused. */
+ out = in;
+ return 0;
+}
+
+SEC("tp_btf/sys_enter")
+int tp_btf_prog(const void *ctx)
+{ /* Same body as raw_tp_prog above, placed in the "tp_btf/sys_enter" section instead. */
+ out = in;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c
new file mode 100644
index 000000000..7a6620671
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lirc_mode2_kern.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include <bpf/bpf_helpers.h>
+
+SEC("lirc_mode2")
+int bpf_decoder(unsigned int *sample)
+{ /* For pulse samples only: bits 16 and 17 of the value select which report helper(s) to call. Always returns 0. */
+ if (LIRC_IS_PULSE(*sample)) {
+ unsigned int duration = LIRC_VALUE(*sample);
+
+ if (duration & 0x10000) /* bit 16 set: low 16 bits are passed to bpf_rc_keydown() with 0x40 as its second argument */
+ bpf_rc_keydown(sample, 0x40, duration & 0xffff, 0);
+ if (duration & 0x20000) /* bit 17 set: bits 8-15 and bits 0-7 are passed to bpf_rc_pointer_rel() */
+ bpf_rc_pointer_rel(sample, (duration >> 8) & 0xff,
+ duration & 0xff);
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
new file mode 100644
index 000000000..d6cb986e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct grehdr {
+ __be16 flags;
+ __be16 protocol;
+};
+
+SEC("encap_gre")
+int bpf_lwt_encap_gre(struct __sk_buff *skb)
+{ /* Builds an IPv4 + GRE header on the stack and pushes it onto skb; BPF_DROP on failure, else BPF_LWT_REROUTE. */
+ struct encap_hdr {
+ struct iphdr iph;
+ struct grehdr greh;
+ } hdr;
+ int err;
+
+ memset(&hdr, 0, sizeof(struct encap_hdr)); /* every field not assigned below stays zero */
+
+ hdr.iph.ihl = 5;
+ hdr.iph.version = 4;
+ hdr.iph.ttl = 0x40;
+ hdr.iph.protocol = 47; /* IPPROTO_GRE */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* address constants are spelled per host byte order */
+ hdr.iph.saddr = 0x640110ac; /* 172.16.1.100 */
+ hdr.iph.daddr = 0x641010ac; /* 172.16.16.100 */
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+ hdr.iph.saddr = 0xac100164; /* 172.16.1.100 */
+ hdr.iph.daddr = 0xac101064; /* 172.16.16.100 */
+#else
+#error "Fix your compiler's __BYTE_ORDER__?!"
+#endif
+ hdr.iph.tot_len = bpf_htons(skb->len + sizeof(struct encap_hdr)); /* original length plus the whole pushed header */
+
+ hdr.greh.protocol = skb->protocol; /* GRE protocol field is taken from the skb */
+
+ err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+ sizeof(struct encap_hdr));
+ if (err) /* any non-zero result from the helper drops the packet */
+ return BPF_DROP;
+
+ return BPF_LWT_REROUTE;
+}
+
+SEC("encap_gre6")
+int bpf_lwt_encap_gre6(struct __sk_buff *skb)
+{ /* Builds an IPv6 + GRE header on the stack and pushes it onto skb; BPF_DROP on failure, else BPF_LWT_REROUTE. */
+ struct encap_hdr {
+ struct ipv6hdr ip6hdr;
+ struct grehdr greh;
+ } hdr;
+ int err;
+
+ memset(&hdr, 0, sizeof(struct encap_hdr)); /* every field and address byte not assigned below stays zero */
+
+ hdr.ip6hdr.version = 6;
+ hdr.ip6hdr.payload_len = bpf_htons(skb->len + sizeof(struct grehdr)); /* adds only the GRE header size, not the IPv6 header */
+ hdr.ip6hdr.nexthdr = 47; /* IPPROTO_GRE */
+ hdr.ip6hdr.hop_limit = 0x40;
+ /* fb01::1 */
+ hdr.ip6hdr.saddr.s6_addr[0] = 0xfb;
+ hdr.ip6hdr.saddr.s6_addr[1] = 1;
+ hdr.ip6hdr.saddr.s6_addr[15] = 1;
+ /* fb10::1 */
+ hdr.ip6hdr.daddr.s6_addr[0] = 0xfb;
+ hdr.ip6hdr.daddr.s6_addr[1] = 0x10;
+ hdr.ip6hdr.daddr.s6_addr[15] = 1;
+
+ hdr.greh.protocol = skb->protocol; /* GRE protocol field is taken from the skb */
+
+ err = bpf_lwt_push_encap(skb, BPF_LWT_ENCAP_IP, &hdr,
+ sizeof(struct encap_hdr));
+ if (err) /* any non-zero result from the helper drops the packet */
+ return BPF_DROP;
+
+ return BPF_LWT_REROUTE;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
new file mode 100644
index 000000000..48ff2b2ad
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_lwt_seg6local.c
@@ -0,0 +1,426 @@
+#include <stddef.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <linux/seg6_local.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Packet parsing state machine helpers.
+ *
+ * cursor_advance() evaluates to the value _cursor had on entry and then
+ * moves _cursor forward by _len (GNU statement expression).
+ */
+#define cursor_advance(_cursor, _len) \
+ ({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+
+#define SR6_FLAG_ALERT (1 << 4) /* value the programs below store in / expect from srh->flags */
+
+#define BPF_PACKET_HEADER __attribute__((packed)) /* applied to every on-wire header struct below */
+/*
+ * Packed IPv6 fixed-header layout. The code below uses its size as the
+ * offset of the SRH and reads next_header from it.
+ */
+struct ip6_t {
+ unsigned int ver:4;
+ unsigned int priority:8;
+ unsigned int flow_label:20;
+ unsigned short payload_len;
+ unsigned char next_header;
+ unsigned char hop_limit;
+ unsigned long long src_hi;
+ unsigned long long src_lo;
+ unsigned long long dst_hi;
+ unsigned long long dst_lo;
+} BPF_PACKET_HEADER;
+/*
+ * 128-bit IPv6 address split into two 64-bit halves. The programs below
+ * fill both halves through bpf_cpu_to_be64().
+ */
+struct ip6_addr_t {
+ unsigned long long hi;
+ unsigned long long lo;
+} BPF_PACKET_HEADER;
+/*
+ * Segment Routing Header as parsed by these programs. The code treats the
+ * total SRH length as (hdrlen + 1) << 3 bytes and the number of entries in
+ * segments[] as first_segment + 1.
+ */
+struct ip6_srh_t {
+ unsigned char nexthdr;
+ unsigned char hdrlen;
+ unsigned char type;
+ unsigned char segments_left;
+ unsigned char first_segment;
+ unsigned char flags;
+ unsigned short tag;
+
+ struct ip6_addr_t segments[0];
+} BPF_PACKET_HEADER;
+/*
+ * TLV header inside the SRH: one byte of type, one byte of len, then the
+ * value. The TLV walkers below advance by sizeof(header) + len.
+ */
+struct sr6_tlv_t {
+ unsigned char type;
+ unsigned char len;
+ unsigned char value[0];
+} BPF_PACKET_HEADER;
+/*
+ * Return a pointer to the Segment Routing Header that directly follows the
+ * IPv6 header at the start of the packet, or NULL when the packet is not
+ * IPv6, its next header is not 43, the routing type is not 4, or one of
+ * the headers does not fit before skb->data_end.
+ */
+static __always_inline struct ip6_srh_t *get_srh(struct __sk_buff *skb)
+{
+ void *cursor, *data_end;
+ struct ip6_srh_t *srh;
+ struct ip6_t *ip;
+ uint8_t *ipver;
+
+ data_end = (void *)(long)skb->data_end;
+ cursor = (void *)(long)skb->data;
+ ipver = (uint8_t *)cursor;
+ /* Each packet read below is preceded by a bounds check against data_end. */
+ if ((void *)ipver + sizeof(*ipver) > data_end)
+ return NULL;
+
+ if ((*ipver >> 4) != 6) /* IP version is the high nibble of the first byte */
+ return NULL;
+
+ ip = cursor_advance(cursor, sizeof(*ip));
+ if ((void *)ip + sizeof(*ip) > data_end)
+ return NULL;
+
+ if (ip->next_header != 43) /* 43: IPv6 Routing header */
+ return NULL;
+
+ srh = cursor_advance(cursor, sizeof(*srh));
+ if ((void *)srh + sizeof(*srh) > data_end)
+ return NULL;
+
+ if (srh->type != 4) /* routing type 4: Segment Routing */
+ return NULL;
+
+ return srh;
+}
+/*
+ * Replace the padding TLV at offset pad_off: when the sizes differ, resize
+ * the SRH there by (new_pad - old_pad) bytes; then, if new_pad is non-zero,
+ * write a fresh padding TLV of new_pad bytes at pad_off.
+ * Returns 0 on success or the error returned by the failing helper.
+ */
+static __always_inline
+int update_tlv_pad(struct __sk_buff *skb, uint32_t new_pad,
+ uint32_t old_pad, uint32_t pad_off)
+{
+ int err;
+
+ if (new_pad != old_pad) {
+ err = bpf_lwt_seg6_adjust_srh(skb, pad_off,
+ (int) new_pad - (int) old_pad);
+ if (err)
+ return err;
+ }
+
+ if (new_pad > 0) {
+ char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0};
+ struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf;
+
+ pad_tlv->type = SR6_TLV_PADDING;
+ pad_tlv->len = new_pad - 2; /* len excludes the 2-byte type/len header */
+ /* Only the first new_pad bytes of the zeroed buffer are written. */
+ err = bpf_lwt_seg6_store_bytes(skb, pad_off,
+ (void *)pad_tlv_buf, new_pad);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+/*
+ * Walk the TLVs that follow the segment list (at most 10 iterations) and
+ * check that *tlv_off falls on a TLV boundary. All offsets are relative to
+ * skb->data.
+ *
+ * The walk stops at the end of the SRH or at the first padding or HMAC
+ * TLV. On return *pad_off is the offset of the padding TLV if one was
+ * found (and *pad_size its total size, header included); otherwise
+ * *pad_off is the offset where the walk stopped and *pad_size is left
+ * untouched. A *tlv_off of -1 is replaced by the offset where the walk
+ * stopped.
+ *
+ * Returns 0 on success, -EINVAL if *tlv_off is not a valid boundary, or
+ * the error returned by bpf_skb_load_bytes().
+ */
+static __always_inline
+int is_valid_tlv_boundary(struct __sk_buff *skb, struct ip6_srh_t *srh,
+ uint32_t *tlv_off, uint32_t *pad_size,
+ uint32_t *pad_off)
+{
+ uint32_t srh_off, cur_off;
+ int offset_valid = 0;
+ int err;
+
+ srh_off = (char *)srh - (char *)(long)skb->data;
+ // cur_off = end of segments, start of possible TLVs
+ cur_off = srh_off + sizeof(*srh) +
+ sizeof(struct ip6_addr_t) * (srh->first_segment + 1);
+
+ *pad_off = 0;
+
+ // we can only go as far as ~10 TLVs due to the BPF max stack size
+ #pragma clang loop unroll(full)
+ for (int i = 0; i < 10; i++) {
+ struct sr6_tlv_t tlv;
+
+ if (cur_off == *tlv_off)
+ offset_valid = 1;
+
+ if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3)) // at or past the end of the SRH
+ break;
+
+ err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv));
+ if (err)
+ return err;
+
+ if (tlv.type == SR6_TLV_PADDING) {
+ *pad_size = tlv.len + sizeof(tlv);
+ *pad_off = cur_off;
+
+ if (*tlv_off == srh_off) { // requested offset equals the SRH start: use the padding TLV's offset
+ *tlv_off = cur_off;
+ offset_valid = 1;
+ }
+ break;
+
+ } else if (tlv.type == SR6_TLV_HMAC) {
+ break;
+ }
+
+ cur_off += sizeof(tlv) + tlv.len;
+ } // we reached the padding or HMAC TLVs, or the end of the SRH
+
+ if (*pad_off == 0) // no padding TLV found
+ *pad_off = cur_off;
+
+ if (*tlv_off == -1)
+ *tlv_off = cur_off;
+ else if (!offset_valid)
+ return -EINVAL;
+
+ return 0;
+}
+/*
+ * Insert the TLV pointed to by itlv (tlv_size bytes are stored) into the
+ * SRH at tlv_off, then rewrite the padding TLV so that the SRH up to and
+ * including the padding is a multiple of 8 bytes.
+ *
+ * tlv_off is relative to the start of the SRH; -1 is passed through to
+ * is_valid_tlv_boundary(), which replaces it with the offset where its TLV
+ * walk stopped. Padding and HMAC TLVs are rejected with -EINVAL.
+ * Returns 0 on success or the error returned by the failing helper.
+ */
+static __always_inline
+int add_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh, uint32_t tlv_off,
+ struct sr6_tlv_t *itlv, uint8_t tlv_size)
+{
+ uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+ uint8_t len_remaining, new_pad;
+ uint32_t pad_off = 0;
+ uint32_t pad_size = 0;
+ uint32_t partial_srh_len;
+ int err;
+
+ if (tlv_off != -1)
+ tlv_off += srh_off; // make the offset relative to skb->data
+
+ if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC)
+ return -EINVAL;
+
+ err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+ if (err)
+ return err;
+ // grow the SRH by the size of the new TLV, then write the TLV into the gap
+ err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len);
+ if (err)
+ return err;
+
+ err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size);
+ if (err)
+ return err;
+
+ // the following can't be moved inside update_tlv_pad because the
+ // bpf verifier has some issues with it
+ pad_off += sizeof(*itlv) + itlv->len; // the padding moved back by the inserted TLV's size
+ partial_srh_len = pad_off - srh_off;
+ len_remaining = partial_srh_len % 8;
+ new_pad = 8 - len_remaining;
+
+ if (new_pad == 1) // cannot pad for 1 byte only
+ new_pad = 9;
+ else if (new_pad == 8) // already a multiple of 8: no padding TLV
+ new_pad = 0;
+
+ return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+/*
+ * Remove the TLV located tlv_off bytes after the start of the SRH, then
+ * rewrite the padding TLV so that the SRH up to and including the padding
+ * is a multiple of 8 bytes.
+ * Returns 0 on success or the error returned by the failing helper
+ * (-EINVAL from is_valid_tlv_boundary() if tlv_off is not a TLV boundary).
+ */
+static __always_inline
+int delete_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh,
+ uint32_t tlv_off)
+{
+ uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+ uint8_t len_remaining, new_pad;
+ uint32_t partial_srh_len;
+ uint32_t pad_off = 0;
+ uint32_t pad_size = 0;
+ struct sr6_tlv_t tlv;
+ int err;
+
+ tlv_off += srh_off; // make the offset relative to skb->data
+
+ err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+ if (err)
+ return err;
+ // read the TLV header to learn how many bytes have to be removed
+ err = bpf_skb_load_bytes(skb, tlv_off, &tlv, sizeof(tlv));
+ if (err)
+ return err;
+
+ err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, -(sizeof(tlv) + tlv.len));
+ if (err)
+ return err;
+
+ pad_off -= sizeof(tlv) + tlv.len; // the padding moved forward by the removed TLV's size
+ partial_srh_len = pad_off - srh_off;
+ len_remaining = partial_srh_len % 8;
+ new_pad = 8 - len_remaining;
+ if (new_pad == 1) // cannot pad for 1 byte only
+ new_pad = 9;
+ else if (new_pad == 8) // already a multiple of 8: no padding TLV
+ new_pad = 0;
+
+ return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+/*
+ * Return 1 if the first TLV after the segment list is an Egress TLV of
+ * length 18 whose 16-byte address (4 bytes into the TLV) is fd00::4, and
+ * 0 otherwise, including when a packet load fails. The offset is computed
+ * from the packet start, i.e. with the SRH right behind the IPv6 header.
+ */
+static __always_inline
+int has_egr_tlv(struct __sk_buff *skb, struct ip6_srh_t *srh)
+{
+ int tlv_offset = sizeof(struct ip6_t) + sizeof(struct ip6_srh_t) +
+ ((srh->first_segment + 1) << 4); // 16 bytes per segment
+ struct sr6_tlv_t tlv;
+
+ if (bpf_skb_load_bytes(skb, tlv_offset, &tlv, sizeof(struct sr6_tlv_t)))
+ return 0;
+
+ if (tlv.type == SR6_TLV_EGRESS && tlv.len == 18) {
+ struct ip6_addr_t egr_addr;
+
+ if (bpf_skb_load_bytes(skb, tlv_offset + 4, &egr_addr, 16))
+ return 0;
+
+ // check if egress TLV value is correct
+ if (bpf_be64_to_cpu(egr_addr.hi) == 0xfd00000000000000 &&
+ bpf_be64_to_cpu(egr_addr.lo) == 0x4)
+ return 1;
+ }
+
+ return 0;
+}
+
+// Push an SRH with segments fd00::1, fd00::2, fd00::3, fd00::4.
+// Returns BPF_DROP if bpf_lwt_push_encap() fails, BPF_REDIRECT otherwise.
+SEC("encap_srh")
+int __encap_srh(struct __sk_buff *skb)
+{
+ unsigned long long hi = 0xfd00000000000000;
+ struct ip6_addr_t *seg;
+ struct ip6_srh_t *srh;
+ char srh_buf[72]; // room for 4 segments
+ int err;
+
+ srh = (struct ip6_srh_t *)srh_buf;
+ srh->nexthdr = 0;
+ srh->hdrlen = 8; // (8 + 1) * 8 = 72 bytes = sizeof(srh_buf)
+ srh->type = 4;
+ srh->segments_left = 3;
+ srh->first_segment = 3;
+ srh->flags = 0;
+ srh->tag = 0;
+
+ seg = (struct ip6_addr_t *)((char *)srh + sizeof(*srh));
+
+ #pragma clang loop unroll(full)
+ for (unsigned long long lo = 0; lo < 4; lo++) {
+ seg->lo = bpf_cpu_to_be64(4 - lo); // segments[] is filled as fd00::4 down to fd00::1
+ seg->hi = bpf_cpu_to_be64(hi);
+ seg = (struct ip6_addr_t *)((char *)seg + sizeof(*seg));
+ }
+ // encap type 0 is presumably BPF_LWT_ENCAP_SEG6 -- TODO confirm against linux/bpf.h
+ err = bpf_lwt_push_encap(skb, 0, (void *)srh, sizeof(srh_buf));
+ if (err)
+ return BPF_DROP;
+
+ return BPF_REDIRECT;
+}
+
+// Add an Egress TLV fd00::4, add the flag A,
+// and apply End.X action to fc42::1
+SEC("add_egr_x")
+int __add_egr_x(struct __sk_buff *skb)
+{
+ unsigned long long hi = 0xfc42000000000000;
+ unsigned long long lo = 0x1;
+ struct ip6_srh_t *srh = get_srh(skb);
+ uint8_t new_flags = SR6_FLAG_ALERT;
+ struct ip6_addr_t addr;
+ int err, offset;
+
+ if (srh == NULL)
+ return BPF_DROP;
+ // TLV bytes: type 2, len 18, two zero bytes, then the 16-byte address fd00::4
+ uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4};
+ // (hdrlen + 1) << 3 is the current SRH length, so the TLV is added at its end
+ err = add_tlv(skb, srh, (srh->hdrlen+1) << 3,
+ (struct sr6_tlv_t *)&tlv, 20);
+ if (err)
+ return BPF_DROP;
+ // overwrite the SRH flags byte; the SRH is taken to start right after the IPv6 header
+ offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+ err = bpf_lwt_seg6_store_bytes(skb, offset,
+ (void *)&new_flags, sizeof(new_flags));
+ if (err)
+ return BPF_DROP;
+
+ addr.lo = bpf_cpu_to_be64(lo);
+ addr.hi = bpf_cpu_to_be64(hi);
+ err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
+ (void *)&addr, sizeof(addr));
+ if (err)
+ return BPF_DROP;
+ return BPF_REDIRECT;
+}
+
+// Pop the Egress TLV, reset the flags, change the tag to 2442 and finally
+// do a simple End action (BPF_OK); any failed check or helper gives BPF_DROP
+SEC("pop_egr")
+int __pop_egr(struct __sk_buff *skb)
+{
+ struct ip6_srh_t *srh = get_srh(skb);
+ uint16_t new_tag = bpf_htons(2442);
+ uint8_t new_flags = 0;
+ int err, offset;
+
+ if (srh == NULL)
+ return BPF_DROP;
+
+ if (srh->flags != SR6_FLAG_ALERT)
+ return BPF_DROP;
+
+ if (srh->hdrlen != 11) // 4 segments + Egress TLV + Padding TLV
+ return BPF_DROP;
+
+ if (!has_egr_tlv(skb, srh))
+ return BPF_DROP;
+ // SRH-relative offset of the first TLV: 8-byte fixed part + 16 bytes per segment
+ err = delete_tlv(skb, srh, 8 + (srh->first_segment + 1) * 16);
+ if (err)
+ return BPF_DROP;
+
+ offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags);
+ if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_flags,
+ sizeof(new_flags)))
+ return BPF_DROP;
+
+ offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, tag);
+ if (bpf_lwt_seg6_store_bytes(skb, offset, (void *)&new_tag,
+ sizeof(new_tag)))
+ return BPF_DROP;
+
+ return BPF_OK;
+}
+
+// Inspect if the Egress TLV and flag have been removed, if the tag is correct,
+// then apply an End.T action (table 117) to reach the last segment
+SEC("inspect_t")
+int __inspect_t(struct __sk_buff *skb)
+{
+ struct ip6_srh_t *srh = get_srh(skb);
+ int table = 117;
+ int err;
+
+ if (srh == NULL)
+ return BPF_DROP;
+
+ if (srh->flags != 0)
+ return BPF_DROP;
+
+ if (srh->tag != bpf_htons(2442))
+ return BPF_DROP;
+
+ if (srh->hdrlen != 8) // 4 segments
+ return BPF_DROP;
+
+ err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_T,
+ (void *)&table, sizeof(table));
+
+ if (err)
+ return BPF_DROP;
+
+ return BPF_REDIRECT;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_map_in_map.c b/tools/testing/selftests/bpf/progs/test_map_in_map.c
new file mode 100644
index 000000000..1cfeb940c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_map_in_map.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+/* Single-entry array-of-maps; xdp_mimtest0() fetches its inner map at key 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(map_flags, 0);
+ __uint(key_size, sizeof(__u32));
+ /* must be sizeof(__u32) for map in map */
+ __uint(value_size, sizeof(__u32));
+} mim_array SEC(".maps");
+/* Single-entry hash-of-maps; xdp_mimtest0() fetches its inner map at key 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(map_flags, 0);
+ __uint(key_size, sizeof(int));
+ /* must be sizeof(__u32) for map in map */
+ __uint(value_size, sizeof(__u32));
+} mim_hash SEC(".maps");
+/*
+ * Exercise both map-in-map types: fetch the inner map stored at key 0 of
+ * mim_array, write 123 at key 0 and read it back, then fetch the inner
+ * map of mim_hash and write 123 there too. Returns XDP_DROP if an outer
+ * lookup fails or the read-back value is not 123, XDP_PASS otherwise.
+ */
+SEC("xdp_mimtest")
+int xdp_mimtest0(struct xdp_md *ctx)
+{
+ int value = 123;
+ int *value_p;
+ int key = 0;
+ void *map;
+
+ map = bpf_map_lookup_elem(&mim_array, &key);
+ if (!map)
+ return XDP_DROP;
+ /* The update's return value is not checked; the read-back below catches a failure. */
+ bpf_map_update_elem(map, &key, &value, 0);
+ value_p = bpf_map_lookup_elem(map, &key);
+ if (!value_p || *value_p != 123)
+ return XDP_DROP;
+
+ map = bpf_map_lookup_elem(&mim_hash, &key);
+ if (!map)
+ return XDP_DROP;
+
+ bpf_map_update_elem(map, &key, &value, 0);
+
+ return XDP_PASS;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_map_init.c b/tools/testing/selftests/bpf/progs/test_map_init.c
new file mode 100644
index 000000000..c89d28ead
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_map_init.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Tessares SA <http://www.tessares.net> */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+__u64 inKey = 0; /* key written to hashmap1 -- presumably set by the user-space test */
+__u64 inValue = 0; /* value written to hashmap1 -- presumably set by the user-space test */
+__u32 inPid = 0; /* compared with the upper 32 bits of bpf_get_current_pid_tgid() */
+/* Two-entry per-CPU hash map (__u64 -> __u64) updated by sysenter_getpgid(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __uint(max_entries, 2);
+ __type(key, __u64);
+ __type(value, __u64);
+} hashmap1 SEC(".maps");
+
+/*
+ * On entry to getpgid(), insert inKey -> inValue into hashmap1 with
+ * BPF_NOEXIST, but only when the upper 32 bits of
+ * bpf_get_current_pid_tgid() equal inPid. Always returns 0.
+ */
+SEC("tp/syscalls/sys_enter_getpgid")
+int sysenter_getpgid(const void *ctx)
+{
+ /* Just do it for once, when called from our own test prog. This
+ * ensures the map value is only updated for a single CPU.
+ */
+ int cur_pid = bpf_get_current_pid_tgid() >> 32;
+
+ if (cur_pid == inPid)
+ bpf_map_update_elem(&hashmap1, &inKey, &inValue, BPF_NOEXIST);
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_map_lock.c b/tools/testing/selftests/bpf/progs/test_map_lock.c
new file mode 100644
index 000000000..b5c07ae7b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_map_lock.c
@@ -0,0 +1,62 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+#define VAR_NUM 16
+/* Value type of hash_map: a spin lock plus the VAR_NUM ints it protects. */
+struct hmap_elem {
+ struct bpf_spin_lock lock;
+ int var[VAR_NUM];
+};
+/* Single-entry hash map whose value carries its own bpf_spin_lock. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, struct hmap_elem);
+} hash_map SEC(".maps");
+/* Value type of array_map: a spin lock plus the VAR_NUM ints it protects. */
+struct array_elem {
+ struct bpf_spin_lock lock;
+ int var[VAR_NUM];
+};
+/* Single-entry array map whose value carries its own bpf_spin_lock. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct array_elem);
+} array_map SEC(".maps");
+/*
+ * Fill every var[] slot of element 0 in hash_map and then in array_map
+ * with one random value, each time while holding that element's spin
+ * lock. Returns 0 on success and 1 if either lookup fails.
+ */
+SEC("map_lock_demo")
+int bpf_map_lock_test(struct __sk_buff *skb)
+{
+ struct hmap_elem zero = {}, *val;
+ int rnd = bpf_get_prandom_u32();
+ int key = 0, err = 1, i;
+ struct array_elem *q;
+
+ val = bpf_map_lookup_elem(&hash_map, &key);
+ if (!val)
+ goto err;
+ /* spin_lock in hash map */
+ bpf_spin_lock(&val->lock);
+ for (i = 0; i < VAR_NUM; i++)
+ val->var[i] = rnd;
+ bpf_spin_unlock(&val->lock);
+
+ /* spin_lock in array */
+ q = bpf_map_lookup_elem(&array_map, &key);
+ if (!q)
+ goto err;
+ bpf_spin_lock(&q->lock);
+ for (i = 0; i < VAR_NUM; i++)
+ q->var[i] = rnd;
+ bpf_spin_unlock(&q->lock);
+ err = 0;
+err:
+ return err;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
new file mode 100644
index 000000000..6077a0250
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_misc_tcp_hdr_options.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <stddef.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#define BPF_PROG_TEST_TCP_HDR_OPTIONS
+#include "test_tcp_hdr_options.h"
+
+__u16 last_addr16_n = __bpf_htons(1); /* expected s6_addr16[7] of both addresses in the saved SYN */
+__u16 active_lport_n = 0; /* th->source recorded when the active side writes its SYN */
+__u16 active_lport_h = 0; /* skops->local_port recorded at the same time */
+__u16 passive_lport_n = 0; /* __bpf_htons(passive_lport_h) */
+__u16 passive_lport_h = 0; /* skops->local_port recorded in the LISTEN callback */
+
+/* options received at passive side */
+unsigned int nr_pure_ack = 0;
+unsigned int nr_data = 0;
+unsigned int nr_syn = 0;
+unsigned int nr_fin = 0;
+
+/* Check the header received from the active side.
+ *
+ * Exercises the error and success paths of bpf_load_hdr_opt() for the
+ * 0xB9 option and for the TCPOPT_EXP option with magic 0xeB9F. When
+ * check_syn is true the lookups use BPF_LOAD_HDR_OPT_TCP_SYN and the
+ * saved SYN is additionally fetched through bpf_getsockopt() with
+ * TCP_BPF_SYN_IP and TCP_BPF_SYN, and its addresses and ports are compared
+ * with the recorded globals. Unexpected results go through RET_CG_ERR();
+ * otherwise CG_OK is returned.
+ */
+static int __check_active_hdr_in(struct bpf_sock_ops *skops, bool check_syn)
+{
+ union {
+ struct tcphdr th;
+ struct ipv6hdr ip6;
+ struct tcp_exprm_opt exprm_opt;
+ struct tcp_opt reg_opt;
+ __u8 data[100]; /* IPv6 (40) + Max TCP hdr (60) */
+ } hdr = {};
+ __u64 load_flags = check_syn ? BPF_LOAD_HDR_OPT_TCP_SYN : 0;
+ struct tcphdr *pth;
+ int ret;
+
+ hdr.reg_opt.kind = 0xB9;
+
+ /* The option is 4 bytes long instead of 2 bytes */
+ ret = bpf_load_hdr_opt(skops, &hdr.reg_opt, 2, load_flags);
+ if (ret != -ENOSPC)
+ RET_CG_ERR(ret);
+
+ /* Test searching magic with regular kind */
+ hdr.reg_opt.len = 4;
+ ret = bpf_load_hdr_opt(skops, &hdr.reg_opt, sizeof(hdr.reg_opt),
+ load_flags);
+ if (ret != -EINVAL)
+ RET_CG_ERR(ret);
+ /* With len reset to 0 the same search must find the 4-byte option 0xB9 {0xfa, 0xce}. */
+ hdr.reg_opt.len = 0;
+ ret = bpf_load_hdr_opt(skops, &hdr.reg_opt, sizeof(hdr.reg_opt),
+ load_flags);
+ if (ret != 4 || hdr.reg_opt.len != 4 || hdr.reg_opt.kind != 0xB9 ||
+ hdr.reg_opt.data[0] != 0xfa || hdr.reg_opt.data[1] != 0xce)
+ RET_CG_ERR(ret);
+
+ /* Test searching experimental option with invalid kind length */
+ hdr.exprm_opt.kind = TCPOPT_EXP;
+ hdr.exprm_opt.len = 5;
+ hdr.exprm_opt.magic = 0;
+ ret = bpf_load_hdr_opt(skops, &hdr.exprm_opt, sizeof(hdr.exprm_opt),
+ load_flags);
+ if (ret != -EINVAL)
+ RET_CG_ERR(ret);
+
+ /* Test searching experimental option with 0 magic value */
+ hdr.exprm_opt.len = 4;
+ ret = bpf_load_hdr_opt(skops, &hdr.exprm_opt, sizeof(hdr.exprm_opt),
+ load_flags);
+ if (ret != -ENOMSG)
+ RET_CG_ERR(ret);
+ /* With the magic 0xeB9F the experimental option must be found. */
+ hdr.exprm_opt.magic = __bpf_htons(0xeB9F);
+ ret = bpf_load_hdr_opt(skops, &hdr.exprm_opt, sizeof(hdr.exprm_opt),
+ load_flags);
+ if (ret != 4 || hdr.exprm_opt.len != 4 ||
+ hdr.exprm_opt.kind != TCPOPT_EXP ||
+ hdr.exprm_opt.magic != __bpf_htons(0xeB9F))
+ RET_CG_ERR(ret);
+
+ if (!check_syn)
+ return CG_OK;
+
+ /* Test loading from skops->syn_skb if sk_state == TCP_NEW_SYN_RECV
+ *
+ * Test loading from tp->saved_syn for other sk_state.
+ */
+ ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP, &hdr.ip6,
+ sizeof(hdr.ip6));
+ if (ret != -ENOSPC) /* a buffer of only sizeof(hdr.ip6) is expected to be too small */
+ RET_CG_ERR(ret);
+ /* The addresses are still checked after the -ENOSPC result above. */
+ if (hdr.ip6.saddr.s6_addr16[7] != last_addr16_n ||
+ hdr.ip6.daddr.s6_addr16[7] != last_addr16_n)
+ RET_CG_ERR(0);
+
+ ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN_IP, &hdr, sizeof(hdr));
+ if (ret < 0)
+ RET_CG_ERR(ret);
+
+ pth = (struct tcphdr *)(&hdr.ip6 + 1); /* TCP header right after the IPv6 header */
+ if (pth->dest != passive_lport_n || pth->source != active_lport_n)
+ RET_CG_ERR(0);
+
+ ret = bpf_getsockopt(skops, SOL_TCP, TCP_BPF_SYN, &hdr, sizeof(hdr));
+ if (ret < 0)
+ RET_CG_ERR(ret);
+
+ if (hdr.th.dest != passive_lport_n || hdr.th.source != active_lport_n)
+ RET_CG_ERR(0);
+
+ return CG_OK;
+}
+/* Run the active-side header checks with check_syn set to true. */
+static int check_active_syn_in(struct bpf_sock_ops *skops)
+{
+ return __check_active_hdr_in(skops, true);
+}
+/*
+ * Run the active-side header checks with check_syn set to false, then
+ * classify the current packet: bump nr_data when the skb is longer than
+ * its TCP header, nr_fin when FIN is set, and nr_pure_ack for an ACK
+ * without FIN whose skb is exactly the TCP header.
+ */
+static int check_active_hdr_in(struct bpf_sock_ops *skops)
+{
+ struct tcphdr *th;
+
+ if (__check_active_hdr_in(skops, false) == CG_ERR)
+ return CG_ERR;
+
+ th = skops->skb_data;
+ if (th + 1 > skops->skb_data_end) /* the whole TCP header must lie inside the skb data */
+ RET_CG_ERR(0);
+
+ if (tcp_hdrlen(th) < skops->skb_len)
+ nr_data++;
+
+ if (th->fin)
+ nr_fin++;
+
+ if (th->ack && !th->fin && tcp_hdrlen(th) == skops->skb_len)
+ nr_pure_ack++;
+
+ return CG_OK;
+}
+/* Reserve 12 bytes of TCP header option space for the active side. */
+static int active_opt_len(struct bpf_sock_ops *skops)
+{
+ int err;
+
+ /* Reserve more than enough to allow the -EEXIST test in
+ * the write_active_opt().
+ */
+ err = bpf_reserve_hdr_opt(skops, 12, 0);
+ if (err)
+ RET_CG_ERR(err);
+
+ return CG_OK;
+}
+/*
+ * Write the experimental option (magic 0xeB9F) and the 0xB9 option into
+ * the outgoing header, check that storing each one a second time gives
+ * -EEXIST, and check that both can be loaded back. On a SYN it also
+ * records the active port numbers and verifies that the window scale
+ * option already exists and cannot be stored again.
+ */
+static int write_active_opt(struct bpf_sock_ops *skops)
+{
+ struct tcp_exprm_opt exprm_opt = {};
+ struct tcp_opt win_scale_opt = {};
+ struct tcp_opt reg_opt = {};
+ struct tcphdr *th;
+ int err, ret;
+
+ exprm_opt.kind = TCPOPT_EXP;
+ exprm_opt.len = 4;
+ exprm_opt.magic = __bpf_htons(0xeB9F);
+
+ reg_opt.kind = 0xB9;
+ reg_opt.len = 4;
+ reg_opt.data[0] = 0xfa;
+ reg_opt.data[1] = 0xce;
+
+ win_scale_opt.kind = TCPOPT_WINDOW;
+
+ err = bpf_store_hdr_opt(skops, &exprm_opt, sizeof(exprm_opt), 0);
+ if (err)
+ RET_CG_ERR(err);
+
+ /* Store the same exprm option */
+ err = bpf_store_hdr_opt(skops, &exprm_opt, sizeof(exprm_opt), 0);
+ if (err != -EEXIST)
+ RET_CG_ERR(err);
+ /* Same first-store / duplicate-store check for the 0xB9 option. */
+ err = bpf_store_hdr_opt(skops, &reg_opt, sizeof(reg_opt), 0);
+ if (err)
+ RET_CG_ERR(err);
+ err = bpf_store_hdr_opt(skops, &reg_opt, sizeof(reg_opt), 0);
+ if (err != -EEXIST)
+ RET_CG_ERR(err);
+
+ /* Check the option has been written and can be searched */
+ ret = bpf_load_hdr_opt(skops, &exprm_opt, sizeof(exprm_opt), 0);
+ if (ret != 4 || exprm_opt.len != 4 || exprm_opt.kind != TCPOPT_EXP ||
+ exprm_opt.magic != __bpf_htons(0xeB9F))
+ RET_CG_ERR(ret);
+
+ reg_opt.len = 0;
+ ret = bpf_load_hdr_opt(skops, &reg_opt, sizeof(reg_opt), 0);
+ if (ret != 4 || reg_opt.len != 4 || reg_opt.kind != 0xB9 ||
+ reg_opt.data[0] != 0xfa || reg_opt.data[1] != 0xce)
+ RET_CG_ERR(ret);
+
+ th = skops->skb_data;
+ if (th + 1 > skops->skb_data_end) /* the whole TCP header must lie inside the skb data */
+ RET_CG_ERR(0);
+
+ if (th->syn) {
+ active_lport_h = skops->local_port;
+ active_lport_n = th->source;
+
+ /* Search the win scale option written by kernel
+ * in the SYN packet.
+ */
+ ret = bpf_load_hdr_opt(skops, &win_scale_opt,
+ sizeof(win_scale_opt), 0);
+ if (ret != 3 || win_scale_opt.len != 3 ||
+ win_scale_opt.kind != TCPOPT_WINDOW)
+ RET_CG_ERR(ret);
+
+ /* Write the win scale option that kernel
+ * has already written.
+ */
+ err = bpf_store_hdr_opt(skops, &win_scale_opt,
+ sizeof(win_scale_opt), 0);
+ if (err != -EEXIST)
+ RET_CG_ERR(err);
+ }
+
+ return CG_OK;
+}
+/*
+ * HDR_OPT_LEN callback: for a SYN-ACK, verify the received SYN instead of
+ * reserving space; otherwise fail if called on the passive port and
+ * reserve option space for the active side.
+ */
+static int handle_hdr_opt_len(struct bpf_sock_ops *skops)
+{
+ __u8 tcp_flags = skops_tcp_flags(skops);
+
+ if ((tcp_flags & TCPHDR_SYNACK) == TCPHDR_SYNACK)
+ /* Check the SYN from bpf_sock_ops_kern->syn_skb */
+ return check_active_syn_in(skops);
+
+ /* Passive side should have cleared the write hdr cb by now */
+ if (skops->local_port == passive_lport_h)
+ RET_CG_ERR(0);
+
+ return active_opt_len(skops);
+}
+/* WRITE_HDR_OPT callback: an error on the passive port, otherwise write the active-side options. */
+static int handle_write_hdr_opt(struct bpf_sock_ops *skops)
+{
+ if (skops->local_port == passive_lport_h)
+ RET_CG_ERR(0);
+
+ return write_active_opt(skops);
+}
+/* PARSE_HDR_OPT callback: an error on the active port, otherwise check the received header. */
+static int handle_parse_hdr(struct bpf_sock_ops *skops)
+{
+ /* Passive side is not writing any non-standard/unknown
+ * option, so the active side should never be called.
+ */
+ if (skops->local_port == active_lport_h)
+ RET_CG_ERR(0);
+
+ return check_active_hdr_in(skops);
+}
+/*
+ * PASSIVE_ESTABLISHED callback: clear the write-header-option callback
+ * flag, re-check the SYN, count it in nr_syn, then check the options on
+ * the current packet.
+ */
+static int handle_passive_estab(struct bpf_sock_ops *skops)
+{
+ int err;
+
+ /* No more write hdr cb */
+ bpf_sock_ops_cb_flags_set(skops,
+ skops->bpf_sock_ops_cb_flags &
+ ~BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
+
+ /* Recheck the SYN but check the tp->saved_syn this time */
+ err = check_active_syn_in(skops);
+ if (err == CG_ERR)
+ return err;
+
+ nr_syn++;
+
+ /* The ack has header option written by the active side also */
+ return check_active_hdr_in(skops);
+}
+/*
+ * sockops entry point: dispatch on skops->op. LISTEN records the passive
+ * port, enables TCP_SAVE_SYN and sets the header callback flags; CONNECT
+ * only sets the flags; the header-option and passive-established
+ * callbacks go to their handlers. Every other op returns CG_OK.
+ */
+SEC("sockops/misc_estab")
+int misc_estab(struct bpf_sock_ops *skops)
+{
+ int true_val = 1;
+
+ switch (skops->op) {
+ case BPF_SOCK_OPS_TCP_LISTEN_CB:
+ passive_lport_h = skops->local_port;
+ passive_lport_n = __bpf_htons(passive_lport_h);
+ bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
+ &true_val, sizeof(true_val));
+ set_hdr_cb_flags(skops, 0);
+ break;
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ set_hdr_cb_flags(skops, 0);
+ break;
+ case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
+ return handle_parse_hdr(skops);
+ case BPF_SOCK_OPS_HDR_OPT_LEN_CB:
+ return handle_hdr_opt_len(skops);
+ case BPF_SOCK_OPS_WRITE_HDR_OPT_CB:
+ return handle_write_hdr_opt(skops);
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ return handle_passive_estab(skops);
+ }
+
+ return CG_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_mmap.c b/tools/testing/selftests/bpf/progs/test_mmap.c
new file mode 100644
index 000000000..4eb42cff5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_mmap.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+/* 4096-entry char array map created with BPF_F_MMAPABLE | BPF_F_RDONLY_PROG. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 4096);
+ __uint(map_flags, BPF_F_MMAPABLE | BPF_F_RDONLY_PROG);
+ __type(key, __u32);
+ __type(value, char);
+} rdonly_map SEC(".maps");
+/* mmap-able array of 2048 __u64 slots that test_mmap() writes to. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 512 * 4); /* at least 4 pages of data */
+ __uint(map_flags, BPF_F_MMAPABLE);
+ __type(key, __u32);
+ __type(value, __u64);
+} data_map SEC(".maps");
+
+__u64 in_val = 0; /* input value read by test_mmap() */
+__u64 out_val = 0; /* test_mmap() copies in_val here */
+/*
+ * On every sys_enter: copy in_val to out_val, store in_val in
+ * data_map[2], store data_map[0] * 2 in data_map[1] (when the lookup
+ * succeeds) and store in_val * 3 in data_map[1500]. Always returns 0.
+ */
+SEC("raw_tracepoint/sys_enter")
+int test_mmap(void *ctx)
+{
+ int zero = 0, one = 1, two = 2, far = 1500;
+ __u64 val, *p;
+
+ out_val = in_val;
+
+ /* data_map[2] = in_val; */
+ bpf_map_update_elem(&data_map, &two, (const void *)&in_val, 0);
+
+ /* data_map[1] = data_map[0] * 2; */
+ p = bpf_map_lookup_elem(&data_map, &zero);
+ if (p) {
+ val = (*p) * 2;
+ bpf_map_update_elem(&data_map, &one, &val, 0);
+ }
+
+ /* data_map[far] = in_val * 3; */
+ val = in_val * 3;
+ bpf_map_update_elem(&data_map, &far, &val, 0);
+
+ return 0;
+}
+
diff --git a/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c
new file mode 100644
index 000000000..1dca70a6d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ns_current_pid_tgid.c
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Carlos Neira cneirabustos@gmail.com */
+
+#include <linux/bpf.h>
+#include <stdint.h>
+#include <bpf/bpf_helpers.h>
+/*
+ * State shared with trace(): dev, ino and user_pid_tgid are read by the
+ * program (presumably filled in by user space -- confirm against the test
+ * runner); pid_tgid is written by the program on a match.
+ */
+static volatile struct {
+ __u64 dev;
+ __u64 ino;
+ __u64 pid_tgid;
+ __u64 user_pid_tgid;
+} res;
+
+/*
+ * On every sys_enter, query the current task's pid/tgid for the pid
+ * namespace identified by res.dev / res.ino and, when the combined
+ * (tgid << 32 | pid) value equals res.user_pid_tgid, store it in
+ * res.pid_tgid. Always returns 0; a non-zero result from
+ * bpf_get_ns_current_pid_tgid() leaves res untouched.
+ */
+SEC("raw_tracepoint/sys_enter")
+int trace(void *ctx)
+{
+ __u64 ns_pid_tgid, expected_pid;
+ struct bpf_pidns_info nsdata;
+
+ if (bpf_get_ns_current_pid_tgid(res.dev, res.ino, &nsdata,
+ sizeof(struct bpf_pidns_info)))
+ return 0;
+
+ /* Same packing as bpf_get_current_pid_tgid(): tgid in the upper half. */
+ ns_pid_tgid = (__u64)nsdata.tgid << 32 | nsdata.pid;
+ expected_pid = res.user_pid_tgid;
+
+ if (expected_pid != ns_pid_tgid)
+ return 0;
+
+ res.pid_tgid = ns_pid_tgid;
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_obj_id.c b/tools/testing/selftests/bpf/progs/test_obj_id.c
new file mode 100644
index 000000000..ded71b3ff
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_obj_id.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/* Single-entry array map (__u32 -> __u64) referenced by test_obj_id(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} test_map_id SEC(".maps");
+/*
+ * Look up key 0 in test_map_id and discard the result; always returns 0.
+ * NOTE(review): the lookup presumably exists only so the program
+ * references the map -- confirm against the user-space test.
+ */
+SEC("raw_tp/sys_enter")
+int test_obj_id(void *ctx)
+{
+ __u32 key = 0;
+ __u64 *value;
+
+ value = bpf_map_lookup_elem(&test_map_id, &key);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c
new file mode 100644
index 000000000..abb7344b5
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_overhead.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/ptrace.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct task_struct;
+/* kprobe on __set_task_comm: returns 1 when tsk is NULL, 0 otherwise. */
+SEC("kprobe/__set_task_comm")
+int BPF_KPROBE(prog1, struct task_struct *tsk, const char *buf, bool exec)
+{
+ return !tsk;
+}
+/* kretprobe on __set_task_comm: returns the probed function's return value unchanged. */
+SEC("kretprobe/__set_task_comm")
+int BPF_KRETPROBE(prog2, int ret)
+{
+ return ret;
+}
+/* raw tracepoint task_rename: returns 1 when the first raw argument is 0, else 0. */
+SEC("raw_tp/task_rename")
+int prog3(struct bpf_raw_tracepoint_args *ctx)
+{
+ return !ctx->args[0];
+}
+/* fentry on __set_task_comm: empty body, always returns 0. */
+SEC("fentry/__set_task_comm")
+int BPF_PROG(prog4, struct task_struct *tsk, const char *buf, bool exec)
+{
+ return 0;
+}
+/* fexit on __set_task_comm: empty body, always returns 0. */
+SEC("fexit/__set_task_comm")
+int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec)
+{
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_pe_preserve_elems.c b/tools/testing/selftests/bpf/progs/test_pe_preserve_elems.c
new file mode 100644
index 000000000..fb22de7c3
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pe_preserve_elems.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+/* Single-entry perf event array with default flags; read by read_array_1(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} array_1 SEC(".maps");
+/* Single-entry perf event array created with BPF_F_PRESERVE_ELEMS; read by read_array_2(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+ __uint(map_flags, BPF_F_PRESERVE_ELEMS);
+} array_2 SEC(".maps");
+/* Read the perf event at index 0 of array_1 and return the helper's result. */
+SEC("raw_tp/sched_switch")
+int BPF_PROG(read_array_1)
+{
+ struct bpf_perf_event_value val;
+
+ return bpf_perf_event_read_value(&array_1, 0, &val, sizeof(val));
+}
+/* Read the perf event at index 0 of array_2 and return the helper's result. */
+SEC("raw_tp/task_rename")
+int BPF_PROG(read_array_2)
+{
+ struct bpf_perf_event_value val;
+
+ return bpf_perf_event_read_value(&array_2, 0, &val, sizeof(val));
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c
new file mode 100644
index 000000000..a1ccc831c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+int valid = 0; /* set to 1 once perf_branches() has filled in the three values below */
+int required_size_out = 0; /* result of the BPF_F_GET_BRANCH_RECORDS_SIZE query */
+int written_stack_out = 0; /* helper result for the on-stack buffer */
+int written_global_out = 0; /* helper result for the global fpbe buffer */
+/* Global destination buffer for bpf_read_branch_records(): 30 entries of three __u64 each. */
+struct {
+ __u64 _a;
+ __u64 _b;
+ __u64 _c;
+} fpbe[30] = {0};
+/*
+ * Call bpf_read_branch_records() three ways -- into a stack buffer, as a
+ * size query with BPF_F_GET_BRANCH_RECORDS_SIZE, and into the global fpbe
+ * buffer -- and publish the three results through the *_out globals,
+ * setting valid to 1. Returns 1 without publishing when a buffer read
+ * yields 0, and 0 otherwise.
+ */
+SEC("perf_event")
+int perf_branches(void *ctx)
+{
+ __u64 entries[4 * 3] = {0}; /* presumably 4 records of 3 __u64 each, mirroring fpbe's element layout */
+ int required_size, written_stack, written_global;
+
+ /* write to stack */
+ written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
+ /* ignore spurious events */
+ if (!written_stack)
+ return 1;
+
+ /* get required size */
+ required_size = bpf_read_branch_records(ctx, NULL, 0,
+ BPF_F_GET_BRANCH_RECORDS_SIZE);
+ /* write to the global buffer */
+ written_global = bpf_read_branch_records(ctx, fpbe, sizeof(fpbe), 0);
+ /* ignore spurious events */
+ if (!written_global)
+ return 1;
+
+ required_size_out = required_size;
+ written_stack_out = written_stack;
+ written_global_out = written_global;
+ valid = 1;
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_perf_buffer.c b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
new file mode 100644
index 000000000..8207a2dc2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_perf_buffer.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+/* Perf event array that handle_sys_enter() emits samples into. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} perf_buf_map SEC(".maps");
+/*
+ * On every sys_enter, emit the current CPU id as a sizeof(int) sample into
+ * perf_buf_map using BPF_F_CURRENT_CPU. The helper's return value is
+ * ignored; always returns 0.
+ */
+SEC("tp/raw_syscalls/sys_enter")
+int handle_sys_enter(void *ctx)
+{
+ int cpu = bpf_get_smp_processor_id();
+
+ bpf_perf_event_output(ctx, &perf_buf_map, BPF_F_CURRENT_CPU,
+ &cpu, sizeof(cpu));
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_pinning.c b/tools/testing/selftests/bpf/progs/test_pinning.c
new file mode 100644
index 000000000..4ef263029
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pinning.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+/* Array map declared with pinning set to LIBBPF_PIN_BY_NAME. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+ __uint(pinning, LIBBPF_PIN_BY_NAME);
+} pinmap SEC(".maps");
+/* Hash map declared without any pinning attribute. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} nopinmap SEC(".maps");
+/* Hash map with pinning explicitly set to LIBBPF_PIN_NONE. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+ __uint(pinning, LIBBPF_PIN_NONE);
+} nopinmap2 SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_pinning_invalid.c b/tools/testing/selftests/bpf/progs/test_pinning_invalid.c
new file mode 100644
index 000000000..5412e0c73
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pinning_invalid.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+
+struct { /* array map carrying a raw numeric pinning value instead of a LIBBPF_PIN_* name */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+ __uint(pinning, 2); /* invalid on purpose: this file is the negative pinning test */
+} nopinmap3 SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_pkt_access.c b/tools/testing/selftests/bpf/progs/test_pkt_access.c
new file mode 100644
index 000000000..852051064
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pkt_access.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+int _version SEC("version") = 1;
+
+/* llvm will optimize both subprograms into exactly the same BPF assembly
+ *
+ * Disassembly of section .text:
+ *
+ * 0000000000000000 test_pkt_access_subprog1:
+ * ; return skb->len * 2;
+ * 0: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0)
+ * 1: 64 00 00 00 01 00 00 00 w0 <<= 1
+ * 2: 95 00 00 00 00 00 00 00 exit
+ *
+ * 0000000000000018 test_pkt_access_subprog2:
+ * ; return skb->len * val;
+ * 3: 61 10 00 00 00 00 00 00 r0 = *(u32 *)(r1 + 0)
+ * 4: 64 00 00 00 01 00 00 00 w0 <<= 1
+ * 5: 95 00 00 00 00 00 00 00 exit
+ *
+ * Which makes it an interesting test for BTF-enabled verifier.
+ */
+static __attribute__ ((noinline)) /* noinline: must stay a separate function */
+int test_pkt_access_subprog1(volatile struct __sk_buff *skb)
+{
+ return skb->len * 2; /* constant multiplier; subprog2 takes the multiplier as an argument */
+}
+
+static __attribute__ ((noinline)) /* noinline: must stay a separate function */
+int test_pkt_access_subprog2(int val, volatile struct __sk_buff *skb)
+{
+ return skb->len * val; /* the only caller in this file passes val == 2 */
+}
+
+#define MAX_STACK (512 - 2 * 32)
+
+__attribute__ ((noinline)) /* non-static, unlike subprog1/subprog2 */
+int get_skb_len(struct __sk_buff *skb)
+{
+ volatile char buf[MAX_STACK] = {}; /* never read; occupies MAX_STACK bytes of this frame */
+
+ return skb->len;
+}
+
+__attribute__ ((noinline)) /* non-static helper taking a scalar argument only */
+int get_constant(long val)
+{
+ return val - 122; /* the caller in this file passes 123, so the result is 1 */
+}
+
+int get_skb_ifindex(int, struct __sk_buff *skb, int);
+
+__attribute__ ((noinline)) /* calls three other non-inlined functions */
+int test_pkt_access_subprog3(int val, struct __sk_buff *skb)
+{
+ return get_skb_len(skb) * get_skb_ifindex(val, skb, get_constant(123)); /* len * ifindex * val * 1 */
+}
+
+__attribute__ ((noinline)) /* defined after its caller; forward-declared earlier in the file */
+int get_skb_ifindex(int val, struct __sk_buff *skb, int var)
+{
+ volatile char buf[MAX_STACK] = {}; /* never read; occupies MAX_STACK bytes of this frame */
+
+ return skb->ifindex * val * var;
+}
+
+__attribute__ ((noinline)) /* returns 0 after modifying the packet, -1 if a bounds check fails */
+int test_pkt_write_access_subprog(struct __sk_buff *skb, __u32 off)
+{
+ void *data = (void *)(long)skb->data;
+ void *data_end = (void *)(long)skb->data_end;
+ struct tcphdr *tcp = NULL;
+
+ if (off > sizeof(struct ethhdr) + sizeof(struct ipv6hdr)) /* cap 'off' at Ethernet + IPv6 header size */
+ return -1;
+
+ tcp = data + off;
+ if (tcp + 1 > data_end) /* the whole tcphdr must lie inside the packet */
+ return -1;
+ /* make modification to the packet data */
+ tcp->check++;
+ return 0;
+}
+
+SEC("classifier/test_pkt_access")
+int test_pkt_access(struct __sk_buff *skb)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct ethhdr *eth = (struct ethhdr *)(data);
+ struct tcphdr *tcp = NULL; /* stays NULL unless an IPv4 or IPv6 header is found */
+ __u8 proto = 255; /* overwritten with the L4 protocol once L3 is parsed */
+ __u64 ihl_len;
+
+ if (eth + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(eth + 1);
+
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+ ihl_len = iph->ihl * 4; /* ihl scaled by 4 to obtain a byte offset */
+ proto = iph->protocol;
+ tcp = (struct tcphdr *)((void *)(iph) + ihl_len);
+ } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(eth + 1);
+
+ if (ip6h + 1 > data_end)
+ return TC_ACT_SHOT;
+ ihl_len = sizeof(*ip6h);
+ proto = ip6h->nexthdr;
+ tcp = (struct tcphdr *)((void *)(ip6h) + ihl_len);
+ }
+
+ if (test_pkt_access_subprog1(skb) != skb->len * 2) /* each subprogram must agree with the inline value */
+ return TC_ACT_SHOT;
+ if (test_pkt_access_subprog2(2, skb) != skb->len * 2)
+ return TC_ACT_SHOT;
+ if (test_pkt_access_subprog3(3, skb) != skb->len * 3 * skb->ifindex)
+ return TC_ACT_SHOT;
+ if (tcp) {
+ if (test_pkt_write_access_subprog(skb, (void *)tcp - data)) /* offset of tcp from packet start */
+ return TC_ACT_SHOT;
+ if (((void *)(tcp) + 20) > data_end || proto != 6) /* need 20 bytes at tcp and protocol number 6 */
+ return TC_ACT_SHOT;
+ barrier(); /* to force ordering of checks */
+ if (((void *)(tcp) + 18) > data_end) /* narrower bound, checked after the wider one */
+ return TC_ACT_SHOT;
+ if (tcp->urg_ptr == 123) /* TC_ACT_OK is returned only for this urg_ptr value */
+ return TC_ACT_OK;
+ }
+
+ return TC_ACT_UNSPEC;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_pkt_md_access.c b/tools/testing/selftests/bpf/progs/test_pkt_md_access.c
new file mode 100644
index 000000000..610c74ea9
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pkt_md_access.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* narrow load is taken at the field's own address */
+#define TEST_FIELD(TYPE, FIELD, MASK) \
+ { \
+ TYPE tmp = *(volatile TYPE *)&skb->FIELD; \
+ if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK)) \
+ return TC_ACT_SHOT; \
+ }
+#else /* other byte order: narrow load is taken from the last TYPE-sized slot of the field */
+#define TEST_FIELD_OFFSET(a, b) ((sizeof(a) - sizeof(b)) / sizeof(b)) /* index of last b-sized slot in a */
+#define TEST_FIELD(TYPE, FIELD, MASK) \
+ { \
+ TYPE tmp = *((volatile TYPE *)&skb->FIELD + \
+ TEST_FIELD_OFFSET(skb->FIELD, TYPE)); \
+ if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK)) \
+ return TC_ACT_SHOT; \
+ }
+#endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+
+SEC("classifier/test_pkt_md_access")
+int test_pkt_md_access(struct __sk_buff *skb) /* TC_ACT_SHOT on the first mismatch, else TC_ACT_OK */
+{
+ TEST_FIELD(__u8, len, 0xFF); /* each line: narrow read must equal the masked __u32 read */
+ TEST_FIELD(__u16, len, 0xFFFF);
+ TEST_FIELD(__u32, len, 0xFFFFFFFF);
+ TEST_FIELD(__u16, protocol, 0xFFFF); /* protocol is checked at 16 and 32 bits only */
+ TEST_FIELD(__u32, protocol, 0xFFFFFFFF);
+ TEST_FIELD(__u8, hash, 0xFF);
+ TEST_FIELD(__u16, hash, 0xFFFF);
+ TEST_FIELD(__u32, hash, 0xFFFFFFFF);
+
+ return TC_ACT_OK;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_probe_read_user_str.c b/tools/testing/selftests/bpf/progs/test_probe_read_user_str.c
new file mode 100644
index 000000000..3ae398b75
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_probe_read_user_str.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+#include <sys/types.h>
+
+pid_t pid = 0; /* compared with the upper 32 bits of bpf_get_current_pid_tgid(); presumably set from user space */
+long ret = 0; /* return value of the last bpf_probe_read_user_str() call */
+void *user_ptr = 0; /* source address handed to the helper; presumably set from user space */
+char buf[256] = {}; /* destination buffer for the string read */
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int on_write(void *ctx) /* NOTE: despite the name, this is attached to sys_enter_nanosleep */
+{
+ if (pid != (bpf_get_current_pid_tgid() >> 32)) /* ignore every process except the selected one */
+ return 0;
+
+ ret = bpf_probe_read_user_str(buf, sizeof(buf), user_ptr); /* result is published through 'ret' and 'buf' */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_probe_user.c b/tools/testing/selftests/bpf/progs/test_probe_user.c
new file mode 100644
index 000000000..89b3532cc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_probe_user.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <netinet/in.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+static struct sockaddr_in old;
+
+SEC("kprobe/__sys_connect")
+int BPF_KPROBE(handle_sys_connect) /* saves, then overwrites, the buffer behind the 2nd argument */
+{
+ void *ptr = (void *)PT_REGS_PARM2(ctx); /* second argument of the probed function */
+ struct sockaddr_in new;
+
+ bpf_probe_read_user(&old, sizeof(old), ptr); /* keep a copy of the current contents in 'old' */
+ __builtin_memset(&new, 0xab, sizeof(new)); /* replacement is a recognizable 0xab fill pattern */
+ bpf_probe_write_user(ptr, &new, sizeof(new)); /* helper return values are deliberately not checked here */
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_queue_map.c b/tools/testing/selftests/bpf/progs/test_queue_map.c
new file mode 100644
index 000000000..87db1f9da
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_queue_map.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Politecnico di Torino
+#define MAP_TYPE BPF_MAP_TYPE_QUEUE /* must be defined before the include: the header uses it as the map type */
+#include "test_queue_stack_map.h"
diff --git a/tools/testing/selftests/bpf/progs/test_queue_stack_map.h b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h
new file mode 100644
index 000000000..4dd9806ad
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_queue_stack_map.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Politecnico di Torino
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/pkt_cls.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+
+struct { /* source map: _test() pops one __u32 from here per packet */
+ __uint(type, MAP_TYPE); /* supplied by the .c file that includes this header */
+ __uint(max_entries, 32);
+ __uint(map_flags, 0);
+ __uint(key_size, 0); /* keyless map: only values are stored */
+ __uint(value_size, sizeof(__u32));
+} map_in SEC(".maps");
+
+struct { /* sink map: _test() pushes the packet's saddr here */
+ __uint(type, MAP_TYPE); /* supplied by the .c file that includes this header */
+ __uint(max_entries, 32);
+ __uint(map_flags, 0);
+ __uint(key_size, 0); /* keyless map: only values are stored */
+ __uint(value_size, sizeof(__u32));
+} map_out SEC(".maps");
+
+SEC("test")
+int _test(struct __sk_buff *skb) /* TC_ACT_OK after one pop and one push, TC_ACT_SHOT otherwise */
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct ethhdr *eth = (struct ethhdr *)(data);
+ __u32 value;
+ int err;
+
+ if (eth + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ struct iphdr *iph = (struct iphdr *)(eth + 1); /* no h_proto check: the frame is assumed to be IPv4 */
+
+ if (iph + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ err = bpf_map_pop_elem(&map_in, &value); /* fails (and drops) when map_in is empty */
+ if (err)
+ return TC_ACT_SHOT;
+
+ iph->daddr = value; /* popped value replaces the destination address */
+
+ err = bpf_map_push_elem(&map_out, &iph->saddr, 0); /* source address is recorded in map_out */
+ if (err)
+ return TC_ACT_SHOT;
+
+ return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_raw_tp_test_run.c b/tools/testing/selftests/bpf/progs/test_raw_tp_test_run.c
new file mode 100644
index 000000000..4c63cc87b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_raw_tp_test_run.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+__u32 count = 0; /* incremented on every invocation of the program below */
+__u32 on_cpu = 0xffffffff; /* CPU id recorded on a matching invocation; all-ones until then */
+
+SEC("raw_tp/task_rename")
+int BPF_PROG(rename, struct task_struct *task, char *comm) /* arguments are compared as raw integers, never dereferenced */
+{
+
+ count++;
+ if ((__u64) task == 0x1234ULL && (__u64) comm == 0x5678ULL) { /* sentinel argument values */
+ on_cpu = bpf_get_smp_processor_id(); /* record which CPU ran this invocation */
+ return (long)task + (long)comm; /* 0x1234 + 0x5678 == 0x68ac */
+ }
+
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_rdonly_maps.c b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c
new file mode 100644
index 000000000..ecbeea2df
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_rdonly_maps.c
@@ -0,0 +1,83 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+static volatile const struct { /* read-only input data walked by the three loop programs */
+ unsigned a[4];
+ /*
+ * if the struct's size is a multiple of 16, the compiler will put it
+ * into the .rodata.cst16 section, which is not recognized by libbpf;
+ * work around this by ensuring we don't have a 16-aligned struct
+ */
+ char _y;
+} rdonly_values = { .a = {2, 3, 4, 5} };
+
+static volatile struct { /* results written by whichever loop program ran last */
+ unsigned did_run; /* set to 1 at the end of each program */
+ unsigned iters; /* number of loop iterations executed */
+ unsigned sum; /* sum of the array elements visited */
+} res;
+
+SEC("raw_tracepoint/sys_enter:skip_loop")
+int skip_loop(struct pt_regs *ctx)
+{
+ /* prevent the compiler from optimizing everything out */
+ unsigned * volatile p = (void *)&rdonly_values.a;
+ unsigned iters = 0, sum = 0;
+
+ /* we should never enter this loop: a[0] is 2, which is even */
+ while (*p & 1) {
+ iters++;
+ sum += *p;
+ p++;
+ }
+ res.did_run = 1;
+ res.iters = iters; /* expected to stay 0 */
+ res.sum = sum; /* expected to stay 0 */
+ return 0;
+}
+
+SEC("raw_tracepoint/sys_enter:part_loop")
+int part_loop(struct pt_regs *ctx)
+{
+ /* prevent the compiler from optimizing everything out */
+ unsigned * volatile p = (void *)&rdonly_values.a;
+ unsigned iters = 0, sum = 0;
+
+ /* validate verifier can derive loop termination */
+ while (*p < 5) { /* with a = {2, 3, 4, 5} this stops at the last element */
+ iters++;
+ sum += *p;
+ p++;
+ }
+ res.did_run = 1;
+ res.iters = iters; /* 3 for the initializer above */
+ res.sum = sum; /* 2 + 3 + 4 == 9 for the initializer above */
+ return 0;
+}
+
+SEC("raw_tracepoint/sys_enter:full_loop")
+int full_loop(struct pt_regs *ctx)
+{
+ /* prevent the compiler from optimizing everything out */
+ unsigned * volatile p = (void *)&rdonly_values.a;
+ int i = sizeof(rdonly_values.a) / sizeof(rdonly_values.a[0]); /* element count: 4 */
+ unsigned iters = 0, sum = 0;
+
+ /* validate verifier can allow full loop as well */
+ while (i > 0 ) {
+ iters++;
+ sum += *p;
+ p++;
+ i--;
+ }
+ res.did_run = 1;
+ res.iters = iters; /* 4: every element is visited */
+ res.sum = sum; /* 2 + 3 + 4 + 5 == 14 for the initializer above */
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf.c b/tools/testing/selftests/bpf/progs/test_ringbuf.c
new file mode 100644
index 000000000..8ba9959b0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct sample { /* record layout written into the ring buffer */
+ int pid; /* copied from the global 'pid' input */
+ int seq; /* value of the global 'seq' counter when the sample was built */
+ long value; /* copied from the global 'value' input */
+ char comm[16]; /* filled by bpf_get_current_comm() */
+};
+
+struct { /* the single ring buffer used by test_ringbuf() */
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 1 << 12); /* 4096 */
+} ringbuf SEC(".maps");
+
+/* inputs */
+int pid = 0;
+long value = 0;
+long flags = 0;
+
+/* outputs */
+long total = 0;
+long discarded = 0;
+long dropped = 0;
+
+long avail_data = 0;
+long ring_size = 0;
+long cons_pos = 0;
+long prod_pos = 0;
+
+/* inner state */
+long seq = 0;
+
+/* Build one struct sample per matching getpgid() entry; odd seq goes via output+discard, even via submit. */
+SEC("tp/syscalls/sys_enter_getpgid")
+int test_ringbuf(void *ctx)
+{
+ int cur_pid = bpf_get_current_pid_tgid() >> 32; /* upper 32 bits of the helper's result */
+ struct sample *sample;
+
+ if (cur_pid != pid) /* only the process selected through 'pid' is traced */
+ return 0;
+
+ sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0);
+ if (!sample) { /* reservation failed: count it and bail out */
+ __sync_fetch_and_add(&dropped, 1);
+ return 1;
+ }
+
+ sample->pid = pid;
+ bpf_get_current_comm(sample->comm, sizeof(sample->comm));
+ sample->value = value;
+
+ sample->seq = seq++;
+ __sync_fetch_and_add(&total, 1);
+
+ if (sample->seq & 1) {
+ /* copy from reserved sample to a new one... */
+ bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags);
+ /* ...and then discard reserved sample */
+ bpf_ringbuf_discard(sample, flags);
+ __sync_fetch_and_add(&discarded, 1);
+ } else {
+ bpf_ringbuf_submit(sample, flags);
+ }
+
+ avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); /* snapshot ring state into the output globals */
+ ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE);
+ cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS);
+ prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c
new file mode 100644
index 000000000..edf3b6953
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_ringbuf_multi.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+
+struct sample { /* record layout written into whichever ring buffer is selected */
+ int pid; /* copied from the global 'pid' input */
+ int seq; /* value of the global 'total' counter when the sample was built */
+ long value; /* copied from the global 'value' input */
+ char comm[16]; /* filled by bpf_get_current_comm() */
+};
+
+struct ringbuf_map { /* named type so that ringbuf_arr can declare it as its inner map type */
+ __uint(type, BPF_MAP_TYPE_RINGBUF);
+ __uint(max_entries, 1 << 12); /* 4096 */
+} ringbuf1 SEC(".maps"),
+ ringbuf2 SEC(".maps"); /* two instances of the same definition */
+
+struct { /* outer map: selects a ring buffer by index (see target_ring) */
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 4);
+ __type(key, int);
+ __array(values, struct ringbuf_map);
+} ringbuf_arr SEC(".maps") = {
+ .values = {
+ [0] = &ringbuf1,
+ [2] = &ringbuf2, /* slots 1 and 3 are left without an inner map */
+ },
+};
+
+/* inputs */
+int pid = 0;
+int target_ring = 0;
+long value = 0;
+
+/* outputs */
+long total = 0;
+long dropped = 0;
+long skipped = 0;
+
+/* Submit one struct sample per matching getpgid() entry to the ring buffer chosen by target_ring. */
+SEC("tp/syscalls/sys_enter_getpgid")
+int test_ringbuf(void *ctx)
+{
+ int cur_pid = bpf_get_current_pid_tgid() >> 32; /* upper 32 bits of the helper's result */
+ struct sample *sample;
+ void *rb;
+
+ if (cur_pid != pid) /* only the process selected through 'pid' is traced */
+ return 0;
+
+ rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring);
+ if (!rb) { /* no inner ring buffer at that index */
+ skipped += 1;
+ return 1;
+ }
+
+ sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0);
+ if (!sample) { /* reservation failed: count it and bail out */
+ dropped += 1;
+ return 1;
+ }
+
+ sample->pid = pid;
+ bpf_get_current_comm(sample->comm, sizeof(sample->comm));
+ sample->value = value;
+
+ sample->seq = total; /* sequence number is the count of samples submitted so far */
+ total += 1;
+
+ bpf_ringbuf_submit(sample, 0);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_seg6_loop.c b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
new file mode 100644
index 000000000..a7278f064
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_seg6_loop.c
@@ -0,0 +1,260 @@
+#include <stddef.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <linux/seg6_local.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Packet parsing state machine helpers. */
+#define cursor_advance(_cursor, _len) \
+ ({ void *_tmp = _cursor; _cursor += _len; _tmp; })
+
+#define SR6_FLAG_ALERT (1 << 4)
+
+#define BPF_PACKET_HEADER __attribute__((packed))
+
+struct ip6_t { /* packed overlay used to read the IPv6 header at the start of the packet */
+ unsigned int ver:4;
+ unsigned int priority:8;
+ unsigned int flow_label:20;
+ unsigned short payload_len;
+ unsigned char next_header; /* get_srh() requires 43 here */
+ unsigned char hop_limit;
+ unsigned long long src_hi; /* addresses are held as two 64-bit halves */
+ unsigned long long src_lo;
+ unsigned long long dst_hi;
+ unsigned long long dst_lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_addr_t { /* one 128-bit address split into two 64-bit halves */
+ unsigned long long hi;
+ unsigned long long lo;
+} BPF_PACKET_HEADER;
+
+struct ip6_srh_t { /* packed overlay for the segment routing header that follows ip6_t */
+ unsigned char nexthdr;
+ unsigned char hdrlen; /* used as ((hdrlen + 1) << 3) to get the header length in bytes */
+ unsigned char type; /* get_srh() requires 4 here */
+ unsigned char segments_left;
+ unsigned char first_segment; /* (first_segment + 1) segments precede the TLV area */
+ unsigned char flags;
+ unsigned short tag;
+
+ struct ip6_addr_t segments[0]; /* variable-length segment list */
+} BPF_PACKET_HEADER;
+
+struct sr6_tlv_t { /* 2-byte TLV header followed by a variable-length value */
+ unsigned char type;
+ unsigned char len; /* length of value[] only; the 2 header bytes are not counted */
+ unsigned char value[0];
+} BPF_PACKET_HEADER;
+
+static __always_inline struct ip6_srh_t *get_srh(struct __sk_buff *skb) /* NULL unless IPv6 + routing header type 4 */
+{
+ void *cursor, *data_end;
+ struct ip6_srh_t *srh;
+ struct ip6_t *ip;
+ uint8_t *ipver;
+
+ data_end = (void *)(long)skb->data_end;
+ cursor = (void *)(long)skb->data;
+ ipver = (uint8_t *)cursor; /* first packet byte; its high nibble is the IP version */
+
+ if ((void *)ipver + sizeof(*ipver) > data_end)
+ return NULL;
+
+ if ((*ipver >> 4) != 6)
+ return NULL;
+
+ ip = cursor_advance(cursor, sizeof(*ip)); /* yields the old cursor, then moves it past the header */
+ if ((void *)ip + sizeof(*ip) > data_end)
+ return NULL;
+
+ if (ip->next_header != 43) /* 43 is the IPv6 Routing extension header */
+ return NULL;
+
+ srh = cursor_advance(cursor, sizeof(*srh));
+ if ((void *)srh + sizeof(*srh) > data_end) /* only the fixed part of the SRH is bounds-checked */
+ return NULL;
+
+ if (srh->type != 4) /* routing type 4 is the Segment Routing Header */
+ return NULL;
+
+ return srh;
+}
+
+static __always_inline int update_tlv_pad(struct __sk_buff *skb, /* resize and rewrite the padding TLV at pad_off */
+ uint32_t new_pad, uint32_t old_pad,
+ uint32_t pad_off)
+{
+ int err;
+
+ if (new_pad != old_pad) { /* grow or shrink the SRH by the size difference */
+ err = bpf_lwt_seg6_adjust_srh(skb, pad_off,
+ (int) new_pad - (int) old_pad);
+ if (err)
+ return err;
+ }
+
+ if (new_pad > 0) { /* new_pad == 0 means no padding TLV is written */
+ char pad_tlv_buf[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0};
+ struct sr6_tlv_t *pad_tlv = (struct sr6_tlv_t *) pad_tlv_buf;
+
+ pad_tlv->type = SR6_TLV_PADDING;
+ pad_tlv->len = new_pad - 2; /* len excludes the 2-byte type/len header */
+
+ err = bpf_lwt_seg6_store_bytes(skb, pad_off,
+ (void *)pad_tlv_buf, new_pad);
+ if (err)
+ return err;
+ }
+
+ return 0; /* 0 on success, otherwise the failing helper's error code */
+}
+
+static __always_inline int is_valid_tlv_boundary(struct __sk_buff *skb, /* 0 if *tlv_off is a TLV boundary */
+ struct ip6_srh_t *srh,
+ uint32_t *tlv_off,
+ uint32_t *pad_size,
+ uint32_t *pad_off)
+{
+ uint32_t srh_off, cur_off;
+ int offset_valid = 0;
+ int err;
+
+ srh_off = (char *)srh - (char *)(long)skb->data;
+ // cur_off = end of segments, start of possible TLVs
+ cur_off = srh_off + sizeof(*srh) +
+ sizeof(struct ip6_addr_t) * (srh->first_segment + 1);
+
+ *pad_off = 0;
+
+ // walk at most 100 TLVs; the pragma below keeps the loop rolled.
+ // workaround: define induction variable "i" as "long" instead
+ // of "int" to prevent alu32 sub-register spilling.
+ #pragma clang loop unroll(disable)
+ for (long i = 0; i < 100; i++) {
+ struct sr6_tlv_t tlv;
+
+ if (cur_off == *tlv_off) /* requested offset coincides with a TLV start */
+ offset_valid = 1;
+
+ if (cur_off >= srh_off + ((srh->hdrlen + 1) << 3)) /* reached the end of the SRH */
+ break;
+
+ err = bpf_skb_load_bytes(skb, cur_off, &tlv, sizeof(tlv));
+ if (err)
+ return err;
+
+ if (tlv.type == SR6_TLV_PADDING) {
+ *pad_size = tlv.len + sizeof(tlv); /* full size of the padding TLV, header included */
+ *pad_off = cur_off;
+
+ if (*tlv_off == srh_off) { /* caller's offset equals the SRH start: retarget it to the padding */
+ *tlv_off = cur_off;
+ offset_valid = 1;
+ }
+ break;
+
+ } else if (tlv.type == SR6_TLV_HMAC) {
+ break;
+ }
+
+ cur_off += sizeof(tlv) + tlv.len; /* step over this TLV */
+ } // we reached the padding or HMAC TLVs, or the end of the SRH
+
+ if (*pad_off == 0) /* no padding TLV seen: padding would start where the walk stopped */
+ *pad_off = cur_off;
+
+ if (*tlv_off == -1) /* -1 (as uint32_t) means "insert where the walk stopped" */
+ *tlv_off = cur_off;
+ else if (!offset_valid)
+ return -EINVAL;
+
+ return 0;
+}
+
+static __always_inline int add_tlv(struct __sk_buff *skb, /* insert itlv at tlv_off, then fix up the padding */
+ struct ip6_srh_t *srh, uint32_t tlv_off,
+ struct sr6_tlv_t *itlv, uint8_t tlv_size)
+{
+ uint32_t srh_off = (char *)srh - (char *)(long)skb->data;
+ uint8_t len_remaining, new_pad;
+ uint32_t pad_off = 0;
+ uint32_t pad_size = 0;
+ uint32_t partial_srh_len;
+ int err;
+
+ if (tlv_off != -1) /* caller passes an SRH-relative offset; make it packet-relative */
+ tlv_off += srh_off;
+
+ if (itlv->type == SR6_TLV_PADDING || itlv->type == SR6_TLV_HMAC) /* these two types cannot be added here */
+ return -EINVAL;
+
+ err = is_valid_tlv_boundary(skb, srh, &tlv_off, &pad_size, &pad_off);
+ if (err)
+ return err;
+
+ err = bpf_lwt_seg6_adjust_srh(skb, tlv_off, sizeof(*itlv) + itlv->len); /* make room for the new TLV */
+ if (err)
+ return err;
+
+ err = bpf_lwt_seg6_store_bytes(skb, tlv_off, (void *)itlv, tlv_size);
+ if (err)
+ return err;
+
+ // the following can't be moved inside update_tlv_pad because the
+ // bpf verifier has some issues with it
+ pad_off += sizeof(*itlv) + itlv->len; /* padding now sits after the inserted TLV */
+ partial_srh_len = pad_off - srh_off;
+ len_remaining = partial_srh_len % 8;
+ new_pad = 8 - len_remaining; /* bytes needed to reach the next multiple of 8 */
+
+ if (new_pad == 1) // cannot pad for 1 byte only
+ new_pad = 9;
+ else if (new_pad == 8) /* already a multiple of 8: no padding needed */
+ new_pad = 0;
+
+ return update_tlv_pad(skb, new_pad, pad_size, pad_off);
+}
+
+// Add an Egress TLV fc00::4 (NOTE(review): tlv[4] below is 0xfd, i.e. fd00::4
+// -- confirm which is intended), add the flag A, and apply End.X to fc42::1
+SEC("lwt_seg6local")
+int __add_egr_x(struct __sk_buff *skb)
+{
+ unsigned long long hi = 0xfc42000000000000; /* hi:lo together spell fc42::1 */
+ unsigned long long lo = 0x1;
+ struct ip6_srh_t *srh = get_srh(skb);
+ uint8_t new_flags = SR6_FLAG_ALERT;
+ struct ip6_addr_t addr;
+ int err, offset;
+
+ if (srh == NULL)
+ return BPF_DROP;
+
+ uint8_t tlv[20] = {2, 18, 0, 0, 0xfd, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, /* type 2, len 18, then 18 value bytes */
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4};
+
+ err = add_tlv(skb, srh, (srh->hdrlen+1) << 3, /* offset == SRH byte length, i.e. the end of the SRH */
+ (struct sr6_tlv_t *)&tlv, 20);
+ if (err)
+ return BPF_DROP;
+
+ offset = sizeof(struct ip6_t) + offsetof(struct ip6_srh_t, flags); /* packet offset of the SRH flags byte */
+ err = bpf_lwt_seg6_store_bytes(skb, offset,
+ (void *)&new_flags, sizeof(new_flags));
+ if (err)
+ return BPF_DROP;
+
+ addr.lo = bpf_cpu_to_be64(lo); /* address is handed to the helper in big-endian form */
+ addr.hi = bpf_cpu_to_be64(hi);
+ err = bpf_lwt_seg6_action(skb, SEG6_LOCAL_ACTION_END_X,
+ (void *)&addr, sizeof(addr));
+ if (err)
+ return BPF_DROP;
+ return BPF_REDIRECT;
+}
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c
new file mode 100644
index 000000000..26e77dcc7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <linux/if_ether.h>
+
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+#include "test_select_reuseport_common.h"
+
+int _version SEC("version") = 1;
+
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+struct { /* one-slot map-of-maps; slot 0 is looked up to obtain the reuseport array */
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 1);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(__u32));
+} outer_map SEC(".maps");
+
+struct { /* per-result counters, indexed by enum result */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, NR_RESULTS);
+ __type(key, __u32);
+ __type(value, __u32);
+} result_map SEC(".maps");
+
+struct { /* one-shot override of the socket index; -1 means "no override" */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, int);
+} tmp_index_ovr_map SEC(".maps");
+
+struct { /* source line (__LINE__) of the GOTO_DONE() that ended the last run */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} linum_map SEC(".maps");
+
+struct { /* copy of the struct data_check gathered during the last run */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, struct data_check);
+} data_check_map SEC(".maps");
+
+#define GOTO_DONE(_result) ({ \
+ result = (_result); \
+ linum = __LINE__; \
+ goto done; \
+}) /* records the result and the calling source line, then jumps to the caller's 'done' label */
+
+SEC("sk_reuseport")
+int _select_by_skb_data(struct sk_reuseport_md *reuse_md) /* picks a socket from the index carried in the payload */
+{
+ __u32 linum, index = 0, flags = 0, index_zero = 0;
+ __u32 *result_cnt, *linum_value; /* NOTE(review): linum_value is never used */
+ struct data_check data_check = {};
+ struct cmd *cmd, cmd_copy;
+ void *data, *data_end;
+ void *reuseport_array;
+ enum result result;
+ int *index_ovr;
+ int err;
+
+ data = reuse_md->data;
+ data_end = reuse_md->data_end;
+ data_check.len = reuse_md->len; /* mirror the context fields into data_check for later export */
+ data_check.eth_protocol = reuse_md->eth_protocol;
+ data_check.ip_protocol = reuse_md->ip_protocol;
+ data_check.hash = reuse_md->hash;
+ data_check.bind_inany = reuse_md->bind_inany;
+ if (data_check.eth_protocol == bpf_htons(ETH_P_IP)) {
+ if (bpf_skb_load_bytes_relative(reuse_md,
+ offsetof(struct iphdr, saddr),
+ data_check.skb_addrs, 8, /* 8 bytes starting at saddr */
+ BPF_HDR_START_NET))
+ GOTO_DONE(DROP_MISC);
+ } else { /* anything that is not ETH_P_IP is read with the IPv6 layout */
+ if (bpf_skb_load_bytes_relative(reuse_md,
+ offsetof(struct ipv6hdr, saddr),
+ data_check.skb_addrs, 32, /* 32 bytes starting at saddr */
+ BPF_HDR_START_NET))
+ GOTO_DONE(DROP_MISC);
+ }
+
+ /*
+ * The ip_protocol could be a compile time decision
+ * if the bpf_prog.o is dedicated to either TCP or
+ * UDP.
+ *
+ * Otherwise, reuse_md->ip_protocol or
+ * the protocol field in the iphdr can be used.
+ */
+ if (data_check.ip_protocol == IPPROTO_TCP) {
+ struct tcphdr *th = data; /* 'data' is treated as the start of the L4 header */
+
+ if (th + 1 > data_end)
+ GOTO_DONE(DROP_MISC);
+
+ data_check.skb_ports[0] = th->source;
+ data_check.skb_ports[1] = th->dest;
+
+ if (th->fin)
+ /* The connection is being torn down at the end of a
+ * test. It can't contain a cmd, so return early.
+ */
+ return SK_PASS;
+
+ if ((th->doff << 2) + sizeof(*cmd) > data_check.len) /* doff << 2 is the TCP header length in bytes */
+ GOTO_DONE(DROP_ERR_SKB_DATA);
+ if (bpf_skb_load_bytes(reuse_md, th->doff << 2, &cmd_copy, /* TCP path always copies the cmd out */
+ sizeof(cmd_copy)))
+ GOTO_DONE(DROP_MISC);
+ cmd = &cmd_copy;
+ } else if (data_check.ip_protocol == IPPROTO_UDP) {
+ struct udphdr *uh = data;
+
+ if (uh + 1 > data_end)
+ GOTO_DONE(DROP_MISC);
+
+ data_check.skb_ports[0] = uh->source;
+ data_check.skb_ports[1] = uh->dest;
+
+ if (sizeof(struct udphdr) + sizeof(*cmd) > data_check.len)
+ GOTO_DONE(DROP_ERR_SKB_DATA);
+ if (data + sizeof(struct udphdr) + sizeof(*cmd) > data_end) { /* cmd not fully within [data, data_end) */
+ if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr),
+ &cmd_copy, sizeof(cmd_copy)))
+ GOTO_DONE(DROP_MISC);
+ cmd = &cmd_copy;
+ } else { /* cmd is directly addressable: read it in place */
+ cmd = data + sizeof(struct udphdr);
+ }
+ } else {
+ GOTO_DONE(DROP_MISC);
+ }
+
+ reuseport_array = bpf_map_lookup_elem(&outer_map, &index_zero);
+ if (!reuseport_array)
+ GOTO_DONE(DROP_ERR_INNER_MAP);
+
+ index = cmd->reuseport_index;
+ index_ovr = bpf_map_lookup_elem(&tmp_index_ovr_map, &index_zero);
+ if (!index_ovr)
+ GOTO_DONE(DROP_MISC);
+
+ if (*index_ovr != -1) { /* an override is pending: use it once, then clear it */
+ index = *index_ovr;
+ *index_ovr = -1;
+ }
+ err = bpf_sk_select_reuseport(reuse_md, reuseport_array, &index,
+ flags);
+ if (!err)
+ GOTO_DONE(PASS);
+
+ if (cmd->pass_on_failure) /* the payload decides whether a failed selection passes or drops */
+ GOTO_DONE(PASS_ERR_SK_SELECT_REUSEPORT);
+ else
+ GOTO_DONE(DROP_ERR_SK_SELECT_REUSEPORT);
+
+done:
+ result_cnt = bpf_map_lookup_elem(&result_map, &result);
+ if (!result_cnt)
+ return SK_DROP;
+
+ bpf_map_update_elem(&linum_map, &index_zero, &linum, BPF_ANY); /* export where and with what data we finished */
+ bpf_map_update_elem(&data_check_map, &index_zero, &data_check, BPF_ANY);
+
+ (*result_cnt)++;
+ return result < PASS ? SK_DROP : SK_PASS; /* results ordered before PASS in enum result map to SK_DROP */
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_send_signal_kern.c b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
new file mode 100644
index 000000000..b4233d3ef
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_send_signal_kern.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+__u32 sig = 0, pid = 0, status = 0, signal_thread = 0;
+
+static __always_inline int bpf_send_signal_test(void *ctx)
+{
+ int ret;
+
+ if (status != 0 || sig == 0 || pid == 0)
+ return 0;
+
+ if ((bpf_get_current_pid_tgid() >> 32) == pid) {
+ if (signal_thread)
+ ret = bpf_send_signal_thread(sig);
+ else
+ ret = bpf_send_signal(sig);
+ if (ret == 0)
+ status = 1;
+ }
+
+ return 0;
+}
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int send_signal_tp(void *ctx)
+{
+ return bpf_send_signal_test(ctx);
+}
+
+SEC("tracepoint/sched/sched_switch")
+int send_signal_tp_sched(void *ctx)
+{
+ return bpf_send_signal_test(ctx);
+}
+
+SEC("perf_event")
+int send_signal_perf(void *ctx)
+{
+ return bpf_send_signal_test(ctx);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c
new file mode 100644
index 000000000..77fd42f83
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c
@@ -0,0 +1,197 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Cloudflare Ltd.
+// Copyright (c) 2020 Isovalent, Inc.
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#if defined(IPROUTE2_HAVE_LIBBPF) /* defined by test_sk_assign_libbpf.c before it includes this file */
+/* Use a new-style map definition. */
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __type(key, int);
+ __type(value, __u64);
+ __uint(pinning, LIBBPF_PIN_BY_NAME);
+ __uint(max_entries, 1);
+} server_map SEC(".maps");
+#else /* !IPROUTE2_HAVE_LIBBPF: same one-entry sockmap in the legacy iproute2 layout */
+/* Pin map under /sys/fs/bpf/tc/globals/<map name> */
+#define PIN_GLOBAL_NS 2
+
+/* Must match struct bpf_elf_map layout from iproute2 */
+struct {
+ __u32 type;
+ __u32 size_key;
+ __u32 size_value;
+ __u32 max_elem;
+ __u32 flags;
+ __u32 id;
+ __u32 pinning;
+} server_map SEC("maps") = {
+ .type = BPF_MAP_TYPE_SOCKMAP,
+ .size_key = sizeof(int),
+ .size_value = sizeof(__u64),
+ .max_elem = 1,
+ .pinning = PIN_GLOBAL_NS,
+};
+#endif /* IPROUTE2_HAVE_LIBBPF */
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Locate the address/port tuple inside the packet without copying it.
+ * Returns a pointer to saddr in the IPv4/IPv6 header and sets *ipv4 and *tcp;
+ * returns NULL if a header is truncated, IPv4 options are present, or L4 is
+ * not TCP/UDP. Non-IP frames return 'data' with *ipv4 and *tcp left untouched.
+ */
+static inline struct bpf_sock_tuple *
+get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct bpf_sock_tuple *result;
+ struct ethhdr *eth;
+ __u8 proto = 0;
+
+ eth = (struct ethhdr *)(data);
+ if (eth + 1 > data_end)
+ return NULL;
+
+ if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(data + sizeof(*eth));
+
+ if (iph + 1 > data_end)
+ return NULL;
+ if (iph->ihl != 5)
+ /* Options are not supported */
+ return NULL;
+ proto = iph->protocol;
+ *ipv4 = true;
+ result = (struct bpf_sock_tuple *)&iph->saddr; /* tuple is read in place from the header */
+ } else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + sizeof(*eth));
+
+ if (ip6h + 1 > data_end)
+ return NULL;
+ proto = ip6h->nexthdr;
+ *ipv4 = false;
+ result = (struct bpf_sock_tuple *)&ip6h->saddr; /* tuple is read in place from the header */
+ } else {
+ return (struct bpf_sock_tuple *)data; /* non-NULL, so the caller does not drop non-IP frames here */
+ }
+
+ if (proto != IPPROTO_TCP && proto != IPPROTO_UDP)
+ return NULL;
+
+ *tcp = (proto == IPPROTO_TCP);
+ return result;
+}
+
+/* UDP path: assign an existing matching socket, else the server_map socket when dport is 4321. */
+static inline int
+handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
+{
+ struct bpf_sock *sk;
+ const int zero = 0;
+ size_t tuple_len;
+ __be16 dport;
+ int ret;
+
+ tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+ if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) /* tuple points into the packet: bounds-check it */
+ return TC_ACT_SHOT;
+
+ sk = bpf_sk_lookup_udp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ if (sk)
+ goto assign;
+
+ dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport;
+ if (dport != bpf_htons(4321)) /* only port 4321 is redirected; everything else passes */
+ return TC_ACT_OK;
+
+ sk = bpf_map_lookup_elem(&server_map, &zero);
+ if (!sk)
+ return TC_ACT_SHOT;
+
+assign:
+ ret = bpf_sk_assign(skb, sk, 0);
+ bpf_sk_release(sk); /* drop the reference taken by either lookup */
+ return ret;
+}
+
+/* TCP path: assign a matching non-listening socket, else the listening server_map socket for dport 4321. */
+static inline int
+handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4)
+{
+ struct bpf_sock *sk;
+ const int zero = 0;
+ size_t tuple_len;
+ __be16 dport;
+ int ret;
+
+ tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+ if ((void *)tuple + tuple_len > (void *)(long)skb->data_end) /* tuple points into the packet: bounds-check it */
+ return TC_ACT_SHOT;
+
+ sk = bpf_skc_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+ if (sk) {
+ if (sk->state != BPF_TCP_LISTEN) /* a non-listening match is assigned as is */
+ goto assign;
+ bpf_sk_release(sk); /* listening match is released; fall through to the server_map lookup */
+ }
+
+ dport = ipv4 ? tuple->ipv4.dport : tuple->ipv6.dport;
+ if (dport != bpf_htons(4321)) /* only port 4321 is redirected; everything else passes */
+ return TC_ACT_OK;
+
+ sk = bpf_map_lookup_elem(&server_map, &zero);
+ if (!sk)
+ return TC_ACT_SHOT;
+
+ if (sk->state != BPF_TCP_LISTEN) { /* the server socket must be listening */
+ bpf_sk_release(sk);
+ return TC_ACT_SHOT;
+ }
+
+assign:
+ ret = bpf_sk_assign(skb, sk, 0);
+ bpf_sk_release(sk); /* drop the reference taken by whichever lookup produced sk */
+ return ret;
+}
+
+/* Entry point: parse the tuple, dispatch to the TCP or UDP handler, map its result to a TC action. */
+SEC("classifier/sk_assign_test")
+int bpf_sk_assign_test(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple *tuple;
+ bool ipv4 = false;
+ bool tcp = false;
+ int ret = 0;
+
+ tuple = get_tuple(skb, &ipv4, &tcp);
+ if (!tuple)
+ return TC_ACT_SHOT;
+
+ /* Note that the verifier socket return type for bpf_skc_lookup_tcp()
+ * differs from bpf_sk_lookup_udp(), so even though the C-level type is
+ * the same here, if we try to share the implementations they will
+ * fail to verify because we're crossing pointer types.
+ */
+ if (tcp)
+ ret = handle_tcp(skb, tuple, ipv4);
+ else
+ ret = handle_udp(skb, tuple, ipv4); /* also taken when get_tuple() left *tcp untouched */
+
+ return ret == 0 ? TC_ACT_OK : TC_ACT_SHOT; /* any non-zero handler result drops the packet */
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
new file mode 100644
index 000000000..dcf46adfd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_assign_libbpf.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define IPROUTE2_HAVE_LIBBPF
+#include "test_sk_assign.c"
diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup.c b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
new file mode 100644
index 000000000..ac6f7f205
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup.c
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// Copyright (c) 2020 Cloudflare
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+#define IP4(a, b, c, d) \
+ bpf_htonl((((__u32)(a) & 0xffU) << 24) | \
+ (((__u32)(b) & 0xffU) << 16) | \
+ (((__u32)(c) & 0xffU) << 8) | \
+ (((__u32)(d) & 0xffU) << 0))
+#define IP6(aaaa, bbbb, cccc, dddd) \
+ { bpf_htonl(aaaa), bpf_htonl(bbbb), bpf_htonl(cccc), bpf_htonl(dddd) }
+
+/* Macros for least-significant byte and word accesses. */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define LSE_INDEX(index, size) (index)
+#else
+#define LSE_INDEX(index, size) ((size) - (index) - 1)
+#endif
+#define LSB(value, index) \
+ (((__u8 *)&(value))[LSE_INDEX((index), sizeof(value))])
+#define LSW(value, index) \
+ (((__u16 *)&(value))[LSE_INDEX((index), sizeof(value) / 2)])
+
+#define MAX_SOCKS 32
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, MAX_SOCKS);
+ __type(key, __u32);
+ __type(value, __u64);
+} redir_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, int);
+} run_map SEC(".maps");
+
+enum {
+ PROG1 = 0,
+ PROG2,
+};
+
+enum {
+ SERVER_A = 0,
+ SERVER_B,
+};
+
+/* Addressable key/value constants for convenience */
+static const int KEY_PROG1 = PROG1;
+static const int KEY_PROG2 = PROG2;
+static const int PROG_DONE = 1;
+
+static const __u32 KEY_SERVER_A = SERVER_A;
+static const __u32 KEY_SERVER_B = SERVER_B;
+
+static const __u16 SRC_PORT = bpf_htons(8008);
+static const __u32 SRC_IP4 = IP4(127, 0, 0, 2);
+static const __u32 SRC_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000002);
+
+static const __u16 DST_PORT = 7007; /* Host byte order */
+static const __u32 DST_IP4 = IP4(127, 0, 0, 1);
+static const __u32 DST_IP6[] = IP6(0xfd000000, 0x0, 0x0, 0x00000001);
+
+/* sk_lookup program that unconditionally returns SK_PASS. */
+SEC("sk_lookup/lookup_pass")
+int lookup_pass(struct bpf_sk_lookup *ctx)
+{
+ return SK_PASS;
+}
+
+/* sk_lookup program that unconditionally returns SK_DROP. */
+SEC("sk_lookup/lookup_drop")
+int lookup_drop(struct bpf_sk_lookup *ctx)
+{
+ return SK_DROP;
+}
+
+/* sk_reuseport program that unconditionally returns SK_PASS. */
+SEC("sk_reuseport/reuse_pass")
+int reuseport_pass(struct sk_reuseport_md *ctx)
+{
+ return SK_PASS;
+}
+
+/* sk_reuseport program that unconditionally returns SK_DROP. */
+SEC("sk_reuseport/reuse_drop")
+int reuseport_drop(struct sk_reuseport_md *ctx)
+{
+ return SK_DROP;
+}
+
+/* Steer lookups whose local port is DST_PORT to the socket stored under
+ * KEY_SERVER_A in redir_map. Lookups for any other port, or a missing map
+ * entry, are passed through; a failed assignment drops the packet.
+ */
+SEC("sk_lookup/redir_port")
+int redir_port(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	int rc;
+
+	if (ctx->local_port == DST_PORT) {
+		server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+		if (server) {
+			rc = bpf_sk_assign(ctx, server, 0);
+			bpf_sk_release(server);
+			if (rc)
+				return SK_DROP;
+		}
+	}
+
+	return SK_PASS;
+}
+
+/* Steer IPv4 lookups addressed to DST_IP4:DST_PORT to the socket stored
+ * under KEY_SERVER_A in redir_map. Anything else, or a missing map entry,
+ * is passed through; a failed assignment drops the packet.
+ */
+SEC("sk_lookup/redir_ip4")
+int redir_ip4(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	int rc;
+
+	if (ctx->family != AF_INET ||
+	    ctx->local_port != DST_PORT ||
+	    ctx->local_ip4 != DST_IP4)
+		return SK_PASS;
+
+	server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+	if (!server)
+		return SK_PASS;
+
+	rc = bpf_sk_assign(ctx, server, 0);
+	bpf_sk_release(server);
+	if (rc)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+/* Steer IPv6 lookups addressed to DST_IP6 port DST_PORT to the socket
+ * stored under KEY_SERVER_A in redir_map. Anything else, or a missing map
+ * entry, is passed through; a failed assignment drops the packet.
+ */
+SEC("sk_lookup/redir_ip6")
+int redir_ip6(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	bool addr_match;
+	int rc;
+
+	if (ctx->family != AF_INET6)
+		return SK_PASS;
+	if (ctx->local_port != DST_PORT)
+		return SK_PASS;
+
+	/* All four 32-bit words of the local address must equal DST_IP6. */
+	addr_match = ctx->local_ip6[0] == DST_IP6[0] &&
+		     ctx->local_ip6[1] == DST_IP6[1] &&
+		     ctx->local_ip6[2] == DST_IP6[2] &&
+		     ctx->local_ip6[3] == DST_IP6[3];
+	if (!addr_match)
+		return SK_PASS;
+
+	server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+	if (!server)
+		return SK_PASS;
+
+	rc = bpf_sk_assign(ctx, server, 0);
+	bpf_sk_release(server);
+	if (rc)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+/* Select the socket stored under KEY_SERVER_A in redir_map for every lookup.
+ * A missing map entry passes the lookup through; a failed assignment drops.
+ */
+SEC("sk_lookup/select_sock_a")
+int select_sock_a(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	int rc;
+
+	server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+	if (!server)
+		return SK_PASS;
+
+	rc = bpf_sk_assign(ctx, server, 0);
+	bpf_sk_release(server);
+	if (rc)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+/* Select the socket stored under KEY_SERVER_A in redir_map, passing
+ * BPF_SK_LOOKUP_F_NO_REUSEPORT to bpf_sk_assign(). A missing map entry or a
+ * failed assignment drops the packet.
+ */
+SEC("sk_lookup/select_sock_a_no_reuseport")
+int select_sock_a_no_reuseport(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	int rc;
+
+	server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+	if (!server)
+		return SK_DROP;
+
+	rc = bpf_sk_assign(ctx, server, BPF_SK_LOOKUP_F_NO_REUSEPORT);
+	bpf_sk_release(server);
+	if (rc)
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+/* sk_reuseport program that selects the redir_map entry at KEY_SERVER_B.
+ * Returns SK_DROP when bpf_sk_select_reuseport() fails, SK_PASS otherwise.
+ */
+SEC("sk_reuseport/select_sock_b")
+int select_sock_b(struct sk_reuseport_md *ctx)
+{
+	/* The helper takes the key by address, so copy the constant locally. */
+	__u32 idx = KEY_SERVER_B;
+
+	if (bpf_sk_select_reuseport(ctx, &redir_map, &idx, 0))
+		return SK_DROP;
+
+	return SK_PASS;
+}
+
+/* Check that bpf_sk_assign() returns -EEXIST if socket already selected.
+ *
+ * SERVER_B is assigned first; a second assignment of SERVER_A without flags
+ * must then fail with -EEXIST. Returns SK_PASS only when that is observed,
+ * SK_DROP on any other outcome.
+ */
+SEC("sk_lookup/sk_assign_eexist")
+int sk_assign_eexist(struct bpf_sk_lookup *ctx)
+{
+ struct bpf_sock *sk;
+ int err, ret;
+
+ ret = SK_DROP;
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+ if (!sk)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, 0);
+ if (err)
+ goto out;
+ /* Drop the SERVER_B reference before 'sk' is reused for SERVER_A. */
+ bpf_sk_release(sk);
+
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+ if (!sk)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, 0);
+ if (err != -EEXIST) {
+ bpf_printk("sk_assign returned %d, expected %d\n",
+ err, -EEXIST);
+ goto out;
+ }
+
+ ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+ /* 'sk' is NULL here only when a map lookup failed. */
+ if (sk)
+ bpf_sk_release(sk);
+ return ret;
+}
+
+/* Check that bpf_sk_assign(BPF_SK_LOOKUP_F_REPLACE) can override selection.
+ *
+ * SERVER_A is assigned first, then SERVER_B is assigned with
+ * BPF_SK_LOOKUP_F_REPLACE, which must succeed. Returns SK_PASS only when
+ * both assignments succeed, SK_DROP otherwise.
+ */
+SEC("sk_lookup/sk_assign_replace_flag")
+int sk_assign_replace_flag(struct bpf_sk_lookup *ctx)
+{
+ struct bpf_sock *sk;
+ int err, ret;
+
+ ret = SK_DROP;
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+ if (!sk)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, 0);
+ if (err)
+ goto out;
+ /* Drop the SERVER_A reference before 'sk' is reused for SERVER_B. */
+ bpf_sk_release(sk);
+
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+ if (!sk)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+ if (err) {
+ bpf_printk("sk_assign returned %d, expected 0\n", err);
+ goto out;
+ }
+
+ ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+ /* 'sk' is NULL here only when a map lookup failed. */
+ if (sk)
+ bpf_sk_release(sk);
+ return ret;
+}
+
+/* Check that bpf_sk_assign(sk=NULL) is accepted.
+ *
+ * Sequence exercised: assign NULL with no selection (expects 0), assign
+ * SERVER_B with REPLACE (expects 0 and ctx->sk == sk), assign NULL without
+ * flags while a socket is selected (expects -EEXIST), assign NULL with
+ * REPLACE (expects 0), then assign SERVER_B with REPLACE again (expects 0).
+ * Returns SK_PASS only when every step behaves as expected.
+ */
+SEC("sk_lookup/sk_assign_null")
+int sk_assign_null(struct bpf_sk_lookup *ctx)
+{
+ struct bpf_sock *sk = NULL;
+ int err, ret;
+
+ ret = SK_DROP;
+
+ err = bpf_sk_assign(ctx, NULL, 0);
+ if (err) {
+ bpf_printk("sk_assign returned %d, expected 0\n", err);
+ goto out;
+ }
+
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+ if (!sk)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+ if (err) {
+ bpf_printk("sk_assign returned %d, expected 0\n", err);
+ goto out;
+ }
+
+ /* The selected socket must be visible through the context. */
+ if (ctx->sk != sk)
+ goto out;
+ /* Clearing without the REPLACE flag must be refused. */
+ err = bpf_sk_assign(ctx, NULL, 0);
+ if (err != -EEXIST)
+ goto out;
+ err = bpf_sk_assign(ctx, NULL, BPF_SK_LOOKUP_F_REPLACE);
+ if (err)
+ goto out;
+ err = bpf_sk_assign(ctx, sk, BPF_SK_LOOKUP_F_REPLACE);
+ if (err)
+ goto out;
+
+ ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+ if (sk)
+ bpf_sk_release(sk);
+ return ret;
+}
+
+/* Check that selected sk is accessible through context.
+ *
+ * Reads ctx->sk before any assignment, after assigning SERVER_A, after
+ * clearing the selection, and after assigning SERVER_B, checking the
+ * pointer value and the family/type/state fields at each stage.
+ * Returns SK_PASS only when every check holds, SK_DROP otherwise.
+ */
+SEC("sk_lookup/access_ctx_sk")
+int access_ctx_sk(struct bpf_sk_lookup *ctx)
+{
+ struct bpf_sock *sk1 = NULL, *sk2 = NULL;
+ int err, ret;
+
+ ret = SK_DROP;
+
+ /* Try accessing unassigned (NULL) ctx->sk field */
+ if (ctx->sk && ctx->sk->family != AF_INET)
+ goto out;
+
+ /* Assign a value to ctx->sk */
+ sk1 = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+ if (!sk1)
+ goto out;
+ err = bpf_sk_assign(ctx, sk1, 0);
+ if (err)
+ goto out;
+ if (ctx->sk != sk1)
+ goto out;
+
+ /* Access ctx->sk fields */
+ if (ctx->sk->family != AF_INET ||
+ ctx->sk->type != SOCK_STREAM ||
+ ctx->sk->state != BPF_TCP_LISTEN)
+ goto out;
+
+ /* Reset selection */
+ err = bpf_sk_assign(ctx, NULL, BPF_SK_LOOKUP_F_REPLACE);
+ if (err)
+ goto out;
+ if (ctx->sk)
+ goto out;
+
+ /* Assign another socket */
+ sk2 = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+ if (!sk2)
+ goto out;
+ err = bpf_sk_assign(ctx, sk2, BPF_SK_LOOKUP_F_REPLACE);
+ if (err)
+ goto out;
+ if (ctx->sk != sk2)
+ goto out;
+
+ /* Access reassigned ctx->sk fields */
+ if (ctx->sk->family != AF_INET ||
+ ctx->sk->type != SOCK_STREAM ||
+ ctx->sk->state != BPF_TCP_LISTEN)
+ goto out;
+
+ ret = SK_PASS; /* Success, redirect to KEY_SERVER_B */
+out:
+ /* Both references are held until here; release whichever were obtained. */
+ if (sk1)
+ bpf_sk_release(sk1);
+ if (sk2)
+ bpf_sk_release(sk2);
+ return ret;
+}
+
+/* Check narrow loads from ctx fields that support them.
+ *
+ * Narrow loads of size >= target field size from a non-zero offset
+ * are not covered because they give bogus results, that is the
+ * verifier ignores the offset.
+ *
+ * Every byte (LSB) and halfword (LSW) of family, protocol, remote_port,
+ * local_port and the IPv4/IPv6 address fields is compared against the
+ * expected constant. The address fields of the family that is not in use
+ * are expected to read as zero. Returns SK_DROP on the first mismatch;
+ * otherwise the socket at KEY_SERVER_B is selected (if present) and
+ * SK_PASS is returned.
+ */
+SEC("sk_lookup/ctx_narrow_access")
+int ctx_narrow_access(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *sk;
+	int err, family;
+	bool v4;
+
+	v4 = (ctx->family == AF_INET);
+
+	/* Narrow loads from family field */
+	if (LSB(ctx->family, 0) != (v4 ? AF_INET : AF_INET6) ||
+	    LSB(ctx->family, 1) != 0 || LSB(ctx->family, 2) != 0 || LSB(ctx->family, 3) != 0)
+		return SK_DROP;
+	if (LSW(ctx->family, 0) != (v4 ? AF_INET : AF_INET6))
+		return SK_DROP;
+
+	/* Narrow loads from protocol field */
+	if (LSB(ctx->protocol, 0) != IPPROTO_TCP ||
+	    LSB(ctx->protocol, 1) != 0 || LSB(ctx->protocol, 2) != 0 || LSB(ctx->protocol, 3) != 0)
+		return SK_DROP;
+	if (LSW(ctx->protocol, 0) != IPPROTO_TCP)
+		return SK_DROP;
+
+	/* Narrow loads from remote_port field. Expect SRC_PORT. */
+	if (LSB(ctx->remote_port, 0) != ((SRC_PORT >> 0) & 0xff) ||
+	    LSB(ctx->remote_port, 1) != ((SRC_PORT >> 8) & 0xff) ||
+	    LSB(ctx->remote_port, 2) != 0 || LSB(ctx->remote_port, 3) != 0)
+		return SK_DROP;
+	if (LSW(ctx->remote_port, 0) != SRC_PORT)
+		return SK_DROP;
+
+	/* Narrow loads from local_port field. Expect DST_PORT. */
+	if (LSB(ctx->local_port, 0) != ((DST_PORT >> 0) & 0xff) ||
+	    LSB(ctx->local_port, 1) != ((DST_PORT >> 8) & 0xff) ||
+	    LSB(ctx->local_port, 2) != 0 || LSB(ctx->local_port, 3) != 0)
+		return SK_DROP;
+	if (LSW(ctx->local_port, 0) != DST_PORT)
+		return SK_DROP;
+
+	/* Narrow loads from IPv4 fields */
+	if (v4) {
+		/* Expect SRC_IP4 in remote_ip4 */
+		if (LSB(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xff) ||
+		    LSB(ctx->remote_ip4, 1) != ((SRC_IP4 >> 8) & 0xff) ||
+		    LSB(ctx->remote_ip4, 2) != ((SRC_IP4 >> 16) & 0xff) ||
+		    LSB(ctx->remote_ip4, 3) != ((SRC_IP4 >> 24) & 0xff))
+			return SK_DROP;
+		if (LSW(ctx->remote_ip4, 0) != ((SRC_IP4 >> 0) & 0xffff) ||
+		    LSW(ctx->remote_ip4, 1) != ((SRC_IP4 >> 16) & 0xffff))
+			return SK_DROP;
+
+		/* Expect DST_IP4 in local_ip4 */
+		if (LSB(ctx->local_ip4, 0) != ((DST_IP4 >> 0) & 0xff) ||
+		    LSB(ctx->local_ip4, 1) != ((DST_IP4 >> 8) & 0xff) ||
+		    LSB(ctx->local_ip4, 2) != ((DST_IP4 >> 16) & 0xff) ||
+		    LSB(ctx->local_ip4, 3) != ((DST_IP4 >> 24) & 0xff))
+			return SK_DROP;
+		if (LSW(ctx->local_ip4, 0) != ((DST_IP4 >> 0) & 0xffff) ||
+		    LSW(ctx->local_ip4, 1) != ((DST_IP4 >> 16) & 0xffff))
+			return SK_DROP;
+	} else {
+		/* Expect 0.0.0.0 IPs when family != AF_INET */
+		if (LSB(ctx->remote_ip4, 0) != 0 || LSB(ctx->remote_ip4, 1) != 0 ||
+		    LSB(ctx->remote_ip4, 2) != 0 || LSB(ctx->remote_ip4, 3) != 0)
+			return SK_DROP;
+		if (LSW(ctx->remote_ip4, 0) != 0 || LSW(ctx->remote_ip4, 1) != 0)
+			return SK_DROP;
+
+		if (LSB(ctx->local_ip4, 0) != 0 || LSB(ctx->local_ip4, 1) != 0 ||
+		    LSB(ctx->local_ip4, 2) != 0 || LSB(ctx->local_ip4, 3) != 0)
+			return SK_DROP;
+		if (LSW(ctx->local_ip4, 0) != 0 || LSW(ctx->local_ip4, 1) != 0)
+			return SK_DROP;
+	}
+
+	/* Narrow loads from IPv6 fields */
+	if (!v4) {
+		/* Expect SRC_IP6 in remote_ip6 */
+		if (LSB(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xff) ||
+		    LSB(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 8) & 0xff) ||
+		    LSB(ctx->remote_ip6[0], 2) != ((SRC_IP6[0] >> 16) & 0xff) ||
+		    LSB(ctx->remote_ip6[0], 3) != ((SRC_IP6[0] >> 24) & 0xff) ||
+		    LSB(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xff) ||
+		    LSB(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 8) & 0xff) ||
+		    LSB(ctx->remote_ip6[1], 2) != ((SRC_IP6[1] >> 16) & 0xff) ||
+		    LSB(ctx->remote_ip6[1], 3) != ((SRC_IP6[1] >> 24) & 0xff) ||
+		    LSB(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xff) ||
+		    LSB(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 8) & 0xff) ||
+		    LSB(ctx->remote_ip6[2], 2) != ((SRC_IP6[2] >> 16) & 0xff) ||
+		    LSB(ctx->remote_ip6[2], 3) != ((SRC_IP6[2] >> 24) & 0xff) ||
+		    LSB(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xff) ||
+		    LSB(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 8) & 0xff) ||
+		    LSB(ctx->remote_ip6[3], 2) != ((SRC_IP6[3] >> 16) & 0xff) ||
+		    LSB(ctx->remote_ip6[3], 3) != ((SRC_IP6[3] >> 24) & 0xff))
+			return SK_DROP;
+		if (LSW(ctx->remote_ip6[0], 0) != ((SRC_IP6[0] >> 0) & 0xffff) ||
+		    LSW(ctx->remote_ip6[0], 1) != ((SRC_IP6[0] >> 16) & 0xffff) ||
+		    LSW(ctx->remote_ip6[1], 0) != ((SRC_IP6[1] >> 0) & 0xffff) ||
+		    LSW(ctx->remote_ip6[1], 1) != ((SRC_IP6[1] >> 16) & 0xffff) ||
+		    LSW(ctx->remote_ip6[2], 0) != ((SRC_IP6[2] >> 0) & 0xffff) ||
+		    LSW(ctx->remote_ip6[2], 1) != ((SRC_IP6[2] >> 16) & 0xffff) ||
+		    LSW(ctx->remote_ip6[3], 0) != ((SRC_IP6[3] >> 0) & 0xffff) ||
+		    LSW(ctx->remote_ip6[3], 1) != ((SRC_IP6[3] >> 16) & 0xffff))
+			return SK_DROP;
+		/* Expect DST_IP6 in local_ip6 */
+		if (LSB(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xff) ||
+		    LSB(ctx->local_ip6[0], 1) != ((DST_IP6[0] >> 8) & 0xff) ||
+		    LSB(ctx->local_ip6[0], 2) != ((DST_IP6[0] >> 16) & 0xff) ||
+		    LSB(ctx->local_ip6[0], 3) != ((DST_IP6[0] >> 24) & 0xff) ||
+		    LSB(ctx->local_ip6[1], 0) != ((DST_IP6[1] >> 0) & 0xff) ||
+		    LSB(ctx->local_ip6[1], 1) != ((DST_IP6[1] >> 8) & 0xff) ||
+		    LSB(ctx->local_ip6[1], 2) != ((DST_IP6[1] >> 16) & 0xff) ||
+		    LSB(ctx->local_ip6[1], 3) != ((DST_IP6[1] >> 24) & 0xff) ||
+		    LSB(ctx->local_ip6[2], 0) != ((DST_IP6[2] >> 0) & 0xff) ||
+		    LSB(ctx->local_ip6[2], 1) != ((DST_IP6[2] >> 8) & 0xff) ||
+		    LSB(ctx->local_ip6[2], 2) != ((DST_IP6[2] >> 16) & 0xff) ||
+		    LSB(ctx->local_ip6[2], 3) != ((DST_IP6[2] >> 24) & 0xff) ||
+		    LSB(ctx->local_ip6[3], 0) != ((DST_IP6[3] >> 0) & 0xff) ||
+		    LSB(ctx->local_ip6[3], 1) != ((DST_IP6[3] >> 8) & 0xff) ||
+		    LSB(ctx->local_ip6[3], 2) != ((DST_IP6[3] >> 16) & 0xff) ||
+		    LSB(ctx->local_ip6[3], 3) != ((DST_IP6[3] >> 24) & 0xff))
+			return SK_DROP;
+		if (LSW(ctx->local_ip6[0], 0) != ((DST_IP6[0] >> 0) & 0xffff) ||
+		    LSW(ctx->local_ip6[0], 1) != ((DST_IP6[0] >> 16) & 0xffff) ||
+		    LSW(ctx->local_ip6[1], 0) != ((DST_IP6[1] >> 0) & 0xffff) ||
+		    LSW(ctx->local_ip6[1], 1) != ((DST_IP6[1] >> 16) & 0xffff) ||
+		    LSW(ctx->local_ip6[2], 0) != ((DST_IP6[2] >> 0) & 0xffff) ||
+		    LSW(ctx->local_ip6[2], 1) != ((DST_IP6[2] >> 16) & 0xffff) ||
+		    LSW(ctx->local_ip6[3], 0) != ((DST_IP6[3] >> 0) & 0xffff) ||
+		    LSW(ctx->local_ip6[3], 1) != ((DST_IP6[3] >> 16) & 0xffff))
+			return SK_DROP;
+	} else {
+		/* Expect :: IPs when family != AF_INET6 */
+		if (LSB(ctx->remote_ip6[0], 0) != 0 || LSB(ctx->remote_ip6[0], 1) != 0 ||
+		    LSB(ctx->remote_ip6[0], 2) != 0 || LSB(ctx->remote_ip6[0], 3) != 0 ||
+		    LSB(ctx->remote_ip6[1], 0) != 0 || LSB(ctx->remote_ip6[1], 1) != 0 ||
+		    LSB(ctx->remote_ip6[1], 2) != 0 || LSB(ctx->remote_ip6[1], 3) != 0 ||
+		    LSB(ctx->remote_ip6[2], 0) != 0 || LSB(ctx->remote_ip6[2], 1) != 0 ||
+		    LSB(ctx->remote_ip6[2], 2) != 0 || LSB(ctx->remote_ip6[2], 3) != 0 ||
+		    LSB(ctx->remote_ip6[3], 0) != 0 || LSB(ctx->remote_ip6[3], 1) != 0 ||
+		    LSB(ctx->remote_ip6[3], 2) != 0 || LSB(ctx->remote_ip6[3], 3) != 0)
+			return SK_DROP;
+		if (LSW(ctx->remote_ip6[0], 0) != 0 || LSW(ctx->remote_ip6[0], 1) != 0 ||
+		    LSW(ctx->remote_ip6[1], 0) != 0 || LSW(ctx->remote_ip6[1], 1) != 0 ||
+		    LSW(ctx->remote_ip6[2], 0) != 0 || LSW(ctx->remote_ip6[2], 1) != 0 ||
+		    LSW(ctx->remote_ip6[3], 0) != 0 || LSW(ctx->remote_ip6[3], 1) != 0)
+			return SK_DROP;
+
+		if (LSB(ctx->local_ip6[0], 0) != 0 || LSB(ctx->local_ip6[0], 1) != 0 ||
+		    LSB(ctx->local_ip6[0], 2) != 0 || LSB(ctx->local_ip6[0], 3) != 0 ||
+		    LSB(ctx->local_ip6[1], 0) != 0 || LSB(ctx->local_ip6[1], 1) != 0 ||
+		    LSB(ctx->local_ip6[1], 2) != 0 || LSB(ctx->local_ip6[1], 3) != 0 ||
+		    LSB(ctx->local_ip6[2], 0) != 0 || LSB(ctx->local_ip6[2], 1) != 0 ||
+		    LSB(ctx->local_ip6[2], 2) != 0 || LSB(ctx->local_ip6[2], 3) != 0 ||
+		    LSB(ctx->local_ip6[3], 0) != 0 || LSB(ctx->local_ip6[3], 1) != 0 ||
+		    LSB(ctx->local_ip6[3], 2) != 0 || LSB(ctx->local_ip6[3], 3) != 0)
+			return SK_DROP;
+		/* Halfword loads of local_ip6; this check previously repeated
+		 * the remote_ip6 test, leaving local_ip6 halfwords unchecked.
+		 */
+		if (LSW(ctx->local_ip6[0], 0) != 0 || LSW(ctx->local_ip6[0], 1) != 0 ||
+		    LSW(ctx->local_ip6[1], 0) != 0 || LSW(ctx->local_ip6[1], 1) != 0 ||
+		    LSW(ctx->local_ip6[2], 0) != 0 || LSW(ctx->local_ip6[2], 1) != 0 ||
+		    LSW(ctx->local_ip6[3], 0) != 0 || LSW(ctx->local_ip6[3], 1) != 0)
+			return SK_DROP;
+	}
+
+	/* Success, redirect to KEY_SERVER_B */
+	sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_B);
+	if (sk) {
+		bpf_sk_assign(ctx, sk, 0);
+		bpf_sk_release(sk);
+	}
+	return SK_PASS;
+}
+
+/* Check that sk_assign rejects SERVER_A socket with -ESOCKTNOSUPPORT.
+ *
+ * Returns SK_PASS when bpf_sk_assign() fails with exactly that error,
+ * SK_DROP when the map lookup fails or any other result is returned.
+ */
+SEC("sk_lookup/sk_assign_esocknosupport")
+int sk_assign_esocknosupport(struct bpf_sk_lookup *ctx)
+{
+ struct bpf_sock *sk;
+ int err, ret;
+
+ ret = SK_DROP;
+ sk = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+ if (!sk)
+ goto out;
+
+ err = bpf_sk_assign(ctx, sk, 0);
+ if (err != -ESOCKTNOSUPPORT) {
+ bpf_printk("sk_assign returned %d, expected %d\n",
+ err, -ESOCKTNOSUPPORT);
+ goto out;
+ }
+
+ ret = SK_PASS; /* Success, pass to regular lookup */
+out:
+ if (sk)
+ bpf_sk_release(sk);
+ return ret;
+}
+
+/* Store PROG_DONE under KEY_PROG1 in run_map, then return SK_PASS. */
+SEC("sk_lookup/multi_prog_pass1")
+int multi_prog_pass1(struct bpf_sk_lookup *ctx)
+{
+ bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+ return SK_PASS;
+}
+
+/* Store PROG_DONE under KEY_PROG2 in run_map, then return SK_PASS. */
+SEC("sk_lookup/multi_prog_pass2")
+int multi_prog_pass2(struct bpf_sk_lookup *ctx)
+{
+ bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+ return SK_PASS;
+}
+
+/* Store PROG_DONE under KEY_PROG1 in run_map, then return SK_DROP. */
+SEC("sk_lookup/multi_prog_drop1")
+int multi_prog_drop1(struct bpf_sk_lookup *ctx)
+{
+ bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+ return SK_DROP;
+}
+
+/* Store PROG_DONE under KEY_PROG2 in run_map, then return SK_DROP. */
+SEC("sk_lookup/multi_prog_drop2")
+int multi_prog_drop2(struct bpf_sk_lookup *ctx)
+{
+ bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+ return SK_DROP;
+}
+
+/* Assign the socket stored under KEY_SERVER_A in redir_map to the lookup.
+ * Returns SK_DROP when the map entry is missing or bpf_sk_assign() fails,
+ * SK_PASS otherwise.
+ */
+static __always_inline int select_server_a(struct bpf_sk_lookup *ctx)
+{
+	struct bpf_sock *server;
+	int rc;
+
+	server = bpf_map_lookup_elem(&redir_map, &KEY_SERVER_A);
+	if (!server)
+		return SK_DROP;
+
+	rc = bpf_sk_assign(ctx, server, 0);
+	bpf_sk_release(server);
+
+	return rc ? SK_DROP : SK_PASS;
+}
+
+/* Call select_server_a(), store PROG_DONE under KEY_PROG1 in run_map and
+ * return SK_PASS. The value saved in 'ret' is not consulted afterwards, so
+ * the verdict does not depend on whether the selection succeeded.
+ */
+SEC("sk_lookup/multi_prog_redir1")
+int multi_prog_redir1(struct bpf_sk_lookup *ctx)
+{
+ int ret;
+
+ ret = select_server_a(ctx);
+ bpf_map_update_elem(&run_map, &KEY_PROG1, &PROG_DONE, BPF_ANY);
+ return SK_PASS;
+}
+
+/* Call select_server_a(), store PROG_DONE under KEY_PROG2 in run_map and
+ * return SK_PASS. The value saved in 'ret' is not consulted afterwards, so
+ * the verdict does not depend on whether the selection succeeded.
+ */
+SEC("sk_lookup/multi_prog_redir2")
+int multi_prog_redir2(struct bpf_sk_lookup *ctx)
+{
+ int ret;
+
+ ret = select_server_a(ctx);
+ bpf_map_update_elem(&run_map, &KEY_PROG2, &PROG_DONE, BPF_ANY);
+ return SK_PASS;
+}
+
+char _license[] SEC("license") = "Dual BSD/GPL";
+__u32 _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
new file mode 100644
index 000000000..e83d0b48d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sk_lookup_kern.c
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Locate the socket tuple inside an IPv4 or IPv6 TCP packet.
+ *
+ * data/data_end: packet bounds; nh_off: offset of the network header;
+ * eth_proto: ethertype in network byte order; ipv4: set to true for
+ * ETH_P_IP and false for ETH_P_IPV6.
+ *
+ * Returns a pointer to the header's saddr field viewed as a
+ * struct bpf_sock_tuple, or NULL when the ethertype is neither IPv4 nor
+ * IPv6, the network header does not fit in the packet, or the L4 protocol
+ * is not TCP.
+ */
+static struct bpf_sock_tuple *get_tuple(void *data, __u64 nh_off,
+					void *data_end, __u16 eth_proto,
+					bool *ipv4)
+{
+	struct bpf_sock_tuple *result;
+	__u8 proto = 0;
+	__u64 ihl_len;
+
+	if (eth_proto == bpf_htons(ETH_P_IP)) {
+		struct iphdr *iph = (struct iphdr *)(data + nh_off);
+
+		if (iph + 1 > data_end)
+			return NULL;
+		ihl_len = iph->ihl * 4;
+		proto = iph->protocol;
+		*ipv4 = true;
+		result = (struct bpf_sock_tuple *)&iph->saddr;
+	} else if (eth_proto == bpf_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h = (struct ipv6hdr *)(data + nh_off);
+
+		if (ip6h + 1 > data_end)
+			return NULL;
+		ihl_len = sizeof(*ip6h);
+		proto = ip6h->nexthdr;
+		/* IPv6 packet: callers must size the tuple as tuple->ipv6. */
+		*ipv4 = false;
+		result = (struct bpf_sock_tuple *)&ip6h->saddr;
+	} else {
+		/* Unknown ethertype: bail out before touching ihl_len/result,
+		 * which are only initialised in the branches above.
+		 */
+		return NULL;
+	}
+
+	if (data + nh_off + ihl_len > data_end || proto != IPPROTO_TCP)
+		return NULL;
+
+	return result;
+}
+
+/* Look up the TCP socket matching the packet's tuple and release it.
+ *
+ * Returns TC_ACT_SHOT when the Ethernet header or tuple does not fit in the
+ * packet (or no tuple can be extracted), TC_ACT_OK when a socket was found
+ * and TC_ACT_UNSPEC when none was found.
+ */
+SEC("classifier/sk_lookup_success")
+int bpf_sk_lookup_test0(struct __sk_buff *skb)
+{
+	void *data_end = (void *)(long)skb->data_end;
+	void *data = (void *)(long)skb->data;
+	struct ethhdr *eth = (struct ethhdr *)(data);
+	struct bpf_sock_tuple *tuple;
+	struct bpf_sock *sk;
+	size_t tuple_len;
+	bool ipv4;
+
+	if (eth + 1 > data_end)
+		return TC_ACT_SHOT;
+
+	tuple = get_tuple(data, sizeof(*eth), data_end, eth->h_proto, &ipv4);
+	if (!tuple)
+		return TC_ACT_SHOT;
+
+	/* Bounds-check exactly the bytes handed to the helper. The cast to
+	 * void * keeps the addition in bytes; adding to the typed pointer
+	 * would scale the offset by sizeof(*tuple).
+	 */
+	tuple_len = ipv4 ? sizeof(tuple->ipv4) : sizeof(tuple->ipv6);
+	if ((void *)tuple + tuple_len > data_end)
+		return TC_ACT_SHOT;
+
+	sk = bpf_sk_lookup_tcp(skb, tuple, tuple_len, BPF_F_CURRENT_NETNS, 0);
+	bpf_printk("sk=%d\n", sk ? 1 : 0);
+	if (sk)
+		bpf_sk_release(sk);
+	return sk ? TC_ACT_OK : TC_ACT_UNSPEC;
+}
+
+/* Look up a socket with a zeroed on-stack tuple and release it when one is
+ * returned. Always returns 0.
+ */
+SEC("classifier/sk_lookup_success_simple")
+int bpf_sk_lookup_test1(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ if (sk)
+ bpf_sk_release(sk);
+ return 0;
+}
+
+/* Misuse case: reads sk->family after the socket reference has already been
+ * passed to bpf_sk_release(). The "fail_" section name suggests the program
+ * is expected to be rejected - confirm against the loader test.
+ */
+SEC("classifier/fail_use_after_free")
+int bpf_sk_lookup_uaf(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+ __u32 family = 0;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ if (sk) {
+ bpf_sk_release(sk);
+ /* Access through the already-released reference. */
+ family = sk->family;
+ }
+ return family;
+}
+
+/* Misuse case: advances the NULL-checked socket pointer before releasing it.
+ * The "fail_" section name suggests the program is expected to be rejected -
+ * confirm against the loader test.
+ */
+SEC("classifier/fail_modify_sk_pointer")
+int bpf_sk_lookup_modptr(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+ /* NOTE(review): 'family' is never used in this function. */
+ __u32 family;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ if (sk) {
+ /* Pointer arithmetic on the socket pointer prior to release. */
+ sk += 1;
+ bpf_sk_release(sk);
+ }
+ return 0;
+}
+
+/* Misuse case: advances the socket pointer before it has been NULL-checked.
+ * The "fail_" section name suggests the program is expected to be rejected -
+ * confirm against the loader test.
+ */
+SEC("classifier/fail_modify_sk_or_null_pointer")
+int bpf_sk_lookup_modptr_or_null(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+ /* NOTE(review): 'family' is never used in this function. */
+ __u32 family;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ /* Pointer arithmetic happens before the NULL check below. */
+ sk += 1;
+ if (sk)
+ bpf_sk_release(sk);
+ return 0;
+}
+
+/* Misuse case: the result of bpf_sk_lookup_tcp() is discarded and never
+ * passed to bpf_sk_release(). The "fail_" section name suggests the program
+ * is expected to be rejected - confirm against the loader test.
+ */
+SEC("classifier/fail_no_release")
+int bpf_sk_lookup_test2(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+
+ bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ return 0;
+}
+
+/* Misuse case: calls bpf_sk_release() twice on the same lookup result, with
+ * no NULL check. The "fail_" section name suggests the program is expected
+ * to be rejected - confirm against the loader test.
+ */
+SEC("classifier/fail_release_twice")
+int bpf_sk_lookup_test3(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ bpf_sk_release(sk);
+ bpf_sk_release(sk);
+ return 0;
+}
+
+/* Misuse case: passes the lookup result to bpf_sk_release() without first
+ * checking it for NULL. The "fail_" section name suggests the program is
+ * expected to be rejected - confirm against the loader test.
+ */
+SEC("classifier/fail_release_unchecked")
+int bpf_sk_lookup_test4(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ struct bpf_sock *sk;
+
+ sk = bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+ bpf_sk_release(sk);
+ return 0;
+}
+
+/* Helper that performs a socket lookup with a zeroed tuple and discards the
+ * result without releasing it.
+ */
+void lookup_no_release(struct __sk_buff *skb)
+{
+ struct bpf_sock_tuple tuple = {};
+ bpf_sk_lookup_tcp(skb, &tuple, sizeof(tuple), BPF_F_CURRENT_NETNS, 0);
+}
+
+/* Misuse case: calls lookup_no_release(), which looks up a socket and never
+ * releases it. The "fail_" section name suggests the program is expected to
+ * be rejected - confirm against the loader test.
+ */
+SEC("classifier/fail_no_release_subcall")
+int bpf_sk_lookup_test5(struct __sk_buff *skb)
+{
+ lookup_no_release(skb);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c
new file mode 100644
index 000000000..552f20906
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_skb_cgroup_id_kern.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+
+#include <string.h>
+
+#include <bpf/bpf_helpers.h>
+
+#define NUM_CGROUP_LEVELS 4
+
+struct bpf_map_def SEC("maps") cgroup_ids = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u64),
+ .max_entries = NUM_CGROUP_LEVELS,
+};
+
+/* Fetch the ancestor cgroup id of skb at the given level and store it in
+ * cgroup_ids, keyed by that level.
+ */
+static __always_inline void log_nth_level(struct __sk_buff *skb, __u32 level)
+{
+	__u64 ancestor_id = bpf_skb_ancestor_cgroup_id(skb, level);
+
+	/* [1] &level passed to external function that may change it, it's
+	 * incompatible with loop unroll.
+	 */
+	bpf_map_update_elem(&cgroup_ids, &level, &ancestor_id, 0);
+}
+
+/* Record the ancestor cgroup id of the skb for levels 0 through 3 in
+ * cgroup_ids and return TC_ACT_OK.
+ */
+SEC("cgroup_id_logger")
+int log_cgroup_id(struct __sk_buff *skb)
+{
+ /* Loop unroll can't be used here due to [1]. Unrolling manually.
+ * Number of calls should be in sync with NUM_CGROUP_LEVELS.
+ */
+ log_nth_level(skb, 0);
+ log_nth_level(skb, 1);
+ log_nth_level(skb, 2);
+ log_nth_level(skb, 3);
+
+ return TC_ACT_OK;
+}
+
+int _version SEC("version") = 1;
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_skb_ctx.c b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
new file mode 100644
index 000000000..b02ea589c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_skb_ctx.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
+
+/* Validate and bump selected __sk_buff fields.
+ *
+ * Each cb[i] (i = 0..4) must hold i + 1 and is then incremented; priority,
+ * tstamp and mark are incremented. wire_len must be 100, gso_segs 8 and
+ * gso_size 10. Returns 1 on the first mismatch, 0 otherwise.
+ */
+SEC("skb_ctx")
+int process(struct __sk_buff *skb)
+{
+	int idx;
+
+	#pragma clang loop unroll(full)
+	for (idx = 0; idx < 5; idx++) {
+		if (skb->cb[idx] != idx + 1)
+			return 1;
+		skb->cb[idx]++;
+	}
+
+	skb->priority++;
+	skb->tstamp++;
+	skb->mark++;
+
+	if (skb->wire_len != 100 ||
+	    skb->gso_segs != 8 ||
+	    skb->gso_size != 10)
+		return 1;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_skb_helpers.c b/tools/testing/selftests/bpf/progs/test_skb_helpers.c
new file mode 100644
index 000000000..bb3fbf1a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_skb_helpers.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#define TEST_COMM_LEN 16
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGROUP_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, u32);
+} cgroup_map SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+/* Call bpf_get_current_task(), bpf_probe_read_kernel() and
+ * bpf_probe_read_kernel_str() from a classifier program. The values read
+ * (tgid and comm of the current task) are not used; always returns 0.
+ */
+SEC("classifier/test_skb_helpers")
+int test_skb_helpers(struct __sk_buff *skb)
+{
+	struct task_struct *cur = (struct task_struct *)bpf_get_current_task();
+	char name[TEST_COMM_LEN];
+	__u32 tgid;
+
+	bpf_probe_read_kernel(&tgid, sizeof(tgid), &cur->tgid);
+	bpf_probe_read_kernel_str(&name, sizeof(name), &cur->comm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_skeleton.c b/tools/testing/selftests/bpf/progs/test_skeleton.c
new file mode 100644
index 000000000..374ccef70
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_skeleton.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <stdbool.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+struct s {
+ int a;
+ long long b;
+} __attribute__((packed));
+
+/* .data section */
+int in1 = -1;
+long long in2 = -1;
+
+/* .bss section */
+char in3 = '\0';
+long long in4 __attribute__((aligned(64))) = 0;
+struct s in5 = {};
+
+/* .rodata section */
+const volatile struct {
+ const int in6;
+} in = {};
+
+/* .data section */
+int out1 = -1;
+long long out2 = -1;
+
+/* .bss section */
+char out3 = 0;
+long long out4 = 0;
+int out6 = 0;
+
+extern bool CONFIG_BPF_SYSCALL __kconfig;
+extern int LINUX_KERNEL_VERSION __kconfig;
+bool bpf_syscall = 0;
+int kern_ver = 0;
+
+/* Copy every in* global into the matching out* variable and record the two
+ * __kconfig externs in bpf_syscall and kern_ver. Always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int handler(const void *ctx)
+{
+	/* out5 is function-local static storage rather than a file-level global. */
+	static volatile struct s out5;
+
+	/* Kconfig-provided values. */
+	kern_ver = LINUX_KERNEL_VERSION;
+	bpf_syscall = CONFIG_BPF_SYSCALL;
+
+	/* Mirror inputs into outputs. */
+	out6 = in.in6;
+	out5 = in5;
+	out4 = in4;
+	out3 = in3;
+	out2 = in2;
+	out1 = in1;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c
new file mode 100644
index 000000000..45e8fc75a
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_skmsg_load_helpers.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Isovalent, Inc.
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, __u64);
+} sock_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, __u64);
+} sock_hash SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, __u32);
+ __type(value, __u64);
+} socket_storage SEC(".maps");
+
+/* sk_msg verdict program exercising task and socket-storage helpers.
+ *
+ * Stores the upper 32 bits of bpf_get_current_pid_tgid() in the socket's
+ * storage, reads the current task's tgid with bpf_probe_read_kernel() and
+ * compares the two. Returns SK_DROP when the storage cannot be obtained or
+ * the values differ, SK_PASS otherwise. The storage entry is deleted before
+ * returning a verdict from the comparison.
+ */
+SEC("sk_msg")
+int prog_msg_verdict(struct sk_msg_md *msg)
+{
+	struct task_struct *cur = (struct task_struct *)bpf_get_current_task();
+	__u32 helper_tgid, task_tgid;
+	__u64 *slot;
+	int rc;
+
+	helper_tgid = bpf_get_current_pid_tgid() >> 32;
+
+	slot = bpf_sk_storage_get(&socket_storage, msg->sk, 0, BPF_SK_STORAGE_GET_F_CREATE);
+	if (!slot)
+		return SK_DROP;
+	*slot = helper_tgid;
+
+	bpf_probe_read_kernel(&task_tgid, sizeof(task_tgid), &cur->tgid);
+	rc = (helper_tgid != task_tgid) ? SK_DROP : SK_PASS;
+
+	bpf_sk_storage_delete(&socket_storage, (void *)msg->sk);
+	return rc;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c
new file mode 100644
index 000000000..43b31aa1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/bpf.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "bpf_tcp_helpers.h"
+
+enum bpf_linum_array_idx {
+ EGRESS_LINUM_IDX, /* linum_map slot used by egress_read_sock_fields() */
+ INGRESS_LINUM_IDX, /* linum_map slot used by ingress_read_sock_fields() */
+ READ_SK_DST_PORT_LINUM_IDX, /* linum_map slot used by read_sk_dst_port() */
+ __NR_BPF_LINUM_ARRAY_IDX, /* number of slots; sizes linum_map */
+};
+
+struct { /* linum_map[idx] holds the __LINE__ last stored by RET_LOG() for program idx */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, __NR_BPF_LINUM_ARRAY_IDX);
+ __type(key, __u32);
+ __type(value, __u32);
+} linum_map SEC(".maps");
+
+struct bpf_spinlock_cnt { /* value type of the two sk-storage maps below */
+ struct bpf_spin_lock lock;
+ __u32 cnt;
+};
+
+struct { /* the egress program adds 1 to cnt here without taking the lock */
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct bpf_spinlock_cnt);
+} sk_pkt_out_cnt SEC(".maps");
+
+struct { /* the egress program adds 10 to cnt here while holding the lock */
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct bpf_spinlock_cnt);
+} sk_pkt_out_cnt10 SEC(".maps");
+
+struct bpf_tcp_sock listen_tp = {}; /* written by ingress_read_sock_fields() */
+struct sockaddr_in6 srv_sa6 = {}; /* only read by the programs; NOTE(review): presumably filled by user space — confirm */
+struct bpf_tcp_sock cli_tp = {}; /* written by egress_read_sock_fields() */
+struct bpf_tcp_sock srv_tp = {}; /* written by egress_read_sock_fields() */
+struct bpf_sock listen_sk = {}; /* written by ingress_read_sock_fields() */
+struct bpf_sock srv_sk = {}; /* written by egress_read_sock_fields() */
+struct bpf_sock cli_sk = {}; /* written by egress_read_sock_fields() */
+__u64 parent_cg_id = 0; /* set on the server-socket path of the egress program */
+__u64 child_cg_id = 0; /* set on the server-socket path of the egress program */
+__u64 lsndtime = 0; /* set on the server-socket path of the egress program */
+
+static bool is_loopback6(__u32 *a6) /* true when the four words spell the IPv6 address ::1 */
+{
+ return a6[0] == 0 && a6[1] == 0 && a6[2] == 0 && bpf_htonl(1) == a6[3];
+}
+
+static void skcpy(struct bpf_sock *dst, /* copy the bpf_sock fields listed below from src to dst */
+ const struct bpf_sock *src)
+{ /* NOTE(review): presumably member-by-member because src is a verifier-checked socket pointer — confirm */
+ dst->bound_dev_if = src->bound_dev_if;
+ dst->family = src->family;
+ dst->type = src->type;
+ dst->protocol = src->protocol;
+ dst->mark = src->mark;
+ dst->priority = src->priority;
+ dst->src_ip4 = src->src_ip4;
+ dst->src_ip6[0] = src->src_ip6[0];
+ dst->src_ip6[1] = src->src_ip6[1];
+ dst->src_ip6[2] = src->src_ip6[2];
+ dst->src_ip6[3] = src->src_ip6[3];
+ dst->src_port = src->src_port;
+ dst->dst_ip4 = src->dst_ip4;
+ dst->dst_ip6[0] = src->dst_ip6[0];
+ dst->dst_ip6[1] = src->dst_ip6[1];
+ dst->dst_ip6[2] = src->dst_ip6[2];
+ dst->dst_ip6[3] = src->dst_ip6[3];
+ dst->dst_port = src->dst_port;
+ dst->state = src->state;
+}
+
+static void tpcpy(struct bpf_tcp_sock *dst, /* copy the bpf_tcp_sock fields listed below from src to dst */
+ const struct bpf_tcp_sock *src)
+{ /* NOTE(review): presumably member-by-member because src is a verifier-checked socket pointer — confirm */
+ dst->snd_cwnd = src->snd_cwnd;
+ dst->srtt_us = src->srtt_us;
+ dst->rtt_min = src->rtt_min;
+ dst->snd_ssthresh = src->snd_ssthresh;
+ dst->rcv_nxt = src->rcv_nxt;
+ dst->snd_nxt = src->snd_nxt;
+ dst->snd_una = src->snd_una;
+ dst->mss_cache = src->mss_cache;
+ dst->ecn_flags = src->ecn_flags;
+ dst->rate_delivered = src->rate_delivered;
+ dst->rate_interval_us = src->rate_interval_us;
+ dst->packets_out = src->packets_out;
+ dst->retrans_out = src->retrans_out;
+ dst->total_retrans = src->total_retrans;
+ dst->segs_in = src->segs_in;
+ dst->data_segs_in = src->data_segs_in;
+ dst->segs_out = src->segs_out;
+ dst->data_segs_out = src->data_segs_out;
+ dst->lost_out = src->lost_out;
+ dst->sacked_out = src->sacked_out;
+ dst->bytes_received = src->bytes_received;
+ dst->bytes_acked = src->bytes_acked;
+}
+
+/* Always return CG_OK so that no pkt will be filtered out */
+#define CG_OK 1
+/* RET_LOG(): store __LINE__ in linum_map[linum_idx] and return CG_OK; needs locals linum and linum_idx */
+#define RET_LOG() ({ \
+ linum = __LINE__; \
+ bpf_map_update_elem(&linum_map, &linum_idx, &linum, BPF_ANY); \
+ return CG_OK; \
+})
+
+SEC("cgroup_skb/egress") /* snapshots client/server socket state and counts egress packets */
+int egress_read_sock_fields(struct __sk_buff *skb)
+{
+ struct bpf_spinlock_cnt cli_cnt_init = { .lock = 0, .cnt = 0xeB9F };
+ struct bpf_spinlock_cnt *pkt_out_cnt, *pkt_out_cnt10;
+ struct bpf_tcp_sock *tp, *tp_ret;
+ struct bpf_sock *sk, *sk_ret;
+ __u32 linum, linum_idx;
+ struct tcp_sock *ktp;
+ /* linum and linum_idx are consumed by RET_LOG() */
+ linum_idx = EGRESS_LINUM_IDX;
+
+ sk = skb->sk;
+ if (!sk)
+ RET_LOG();
+
+ /* Not the testing egress traffic, or a listening socket
+ * (BPF_TCP_LISTEN), which is copied at the ingress side instead.
+ */
+ if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
+ sk->state == BPF_TCP_LISTEN)
+ return CG_OK;
+
+ if (sk->src_port == bpf_ntohs(srv_sa6.sin6_port)) {
+ /* Server socket */
+ sk_ret = &srv_sk;
+ tp_ret = &srv_tp;
+ } else if (sk->dst_port == srv_sa6.sin6_port) {
+ /* Client socket */
+ sk_ret = &cli_sk;
+ tp_ret = &cli_tp;
+ } else {
+ /* Not the testing egress traffic */
+ return CG_OK;
+ }
+
+ /* It must be a fullsock for cgroup_skb/egress prog */
+ sk = bpf_sk_fullsock(sk);
+ if (!sk)
+ RET_LOG();
+
+ /* Not the testing egress traffic */
+ if (sk->protocol != IPPROTO_TCP)
+ return CG_OK;
+
+ tp = bpf_tcp_sock(sk);
+ if (!tp)
+ RET_LOG();
+ /* Snapshot the socket into the globals selected above */
+ skcpy(sk_ret, sk);
+ tpcpy(tp_ret, tp);
+
+ if (sk_ret == &srv_sk) {
+ ktp = bpf_skc_to_tcp_sock(sk);
+
+ if (!ktp)
+ RET_LOG();
+
+ lsndtime = ktp->lsndtime;
+
+ child_cg_id = bpf_sk_cgroup_id(ktp);
+ if (!child_cg_id)
+ RET_LOG();
+
+ parent_cg_id = bpf_sk_ancestor_cgroup_id(ktp, 2);
+ if (!parent_cg_id)
+ RET_LOG();
+
+ /* The userspace has created it for srv sk */
+ pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, ktp, 0, 0);
+ pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10, ktp,
+ 0, 0);
+ } else {
+ pkt_out_cnt = bpf_sk_storage_get(&sk_pkt_out_cnt, sk,
+ &cli_cnt_init,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ pkt_out_cnt10 = bpf_sk_storage_get(&sk_pkt_out_cnt10,
+ sk, &cli_cnt_init,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ }
+
+ if (!pkt_out_cnt || !pkt_out_cnt10)
+ RET_LOG();
+
+ /* Even though both cnt and cnt10 have a lock defined in their BTF,
+ * intentionally one cnt takes the lock while the other does not,
+ * as a test for the spinlock support in BPF_MAP_TYPE_SK_STORAGE.
+ */
+ pkt_out_cnt->cnt += 1;
+ bpf_spin_lock(&pkt_out_cnt10->lock);
+ pkt_out_cnt10->cnt += 10;
+ bpf_spin_unlock(&pkt_out_cnt10->lock);
+
+ return CG_OK;
+}
+
+SEC("cgroup_skb/ingress") /* snapshots the listening server socket into listen_sk / listen_tp */
+int ingress_read_sock_fields(struct __sk_buff *skb)
+{
+ struct bpf_tcp_sock *tp;
+ __u32 linum, linum_idx;
+ struct bpf_sock *sk;
+ /* linum and linum_idx are consumed by RET_LOG() */
+ linum_idx = INGRESS_LINUM_IDX;
+
+ sk = skb->sk;
+ if (!sk)
+ RET_LOG();
+
+ /* Not the testing ingress traffic to the server */
+ if (sk->family != AF_INET6 || !is_loopback6(sk->src_ip6) ||
+ sk->src_port != bpf_ntohs(srv_sa6.sin6_port))
+ return CG_OK;
+
+ /* Only interested in TCP_LISTEN */
+ if (sk->state != BPF_TCP_LISTEN)
+ return CG_OK;
+
+ /* It must be a fullsock for cgroup_skb/ingress prog */
+ sk = bpf_sk_fullsock(sk);
+ if (!sk)
+ RET_LOG();
+
+ tp = bpf_tcp_sock(sk);
+ if (!tp)
+ RET_LOG();
+
+ skcpy(&listen_sk, sk);
+ tpcpy(&listen_tp, tp);
+
+ return CG_OK;
+}
+
+static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk)
+{
+ __u32 *word = (__u32 *)&sk->dst_port;
+ return word[0] == bpf_htonl(0xcafe0000); /* one 4-byte read starting at dst_port */
+}
+/* One 2-byte read at dst_port, compared with bpf_htons(0xcafe) */
+static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk)
+{
+ __u16 *half = (__u16 *)&sk->dst_port;
+ return half[0] == bpf_htons(0xcafe);
+}
+/* Two 1-byte reads at dst_port: first byte 0xca, second byte 0xfe */
+static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk)
+{
+ __u8 *byte = (__u8 *)&sk->dst_port;
+ return byte[0] == 0xca && byte[1] == 0xfe;
+}
+
+SEC("cgroup_skb/egress") /* checks dst_port through word, half-word and byte reads */
+int read_sk_dst_port(struct __sk_buff *skb)
+{
+ __u32 linum, linum_idx;
+ struct bpf_sock *sk;
+ /* linum and linum_idx are consumed by RET_LOG() */
+ linum_idx = READ_SK_DST_PORT_LINUM_IDX;
+
+ sk = skb->sk;
+ if (!sk)
+ RET_LOG();
+
+ /* Ignore everything but the SYN from the client socket */
+ if (sk->state != BPF_TCP_SYN_SENT)
+ return CG_OK;
+ /* Each failed comparison records its own line number via RET_LOG() */
+ if (!sk_dst_port__load_word(sk))
+ RET_LOG();
+ if (!sk_dst_port__load_half(sk))
+ RET_LOG();
+ if (!sk_dst_port__load_byte(sk))
+ RET_LOG();
+
+ return CG_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockhash_kern.c b/tools/testing/selftests/bpf/progs/test_sockhash_kern.c
new file mode 100644
index 000000000..e67559164
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockhash_kern.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#undef SOCKMAP /* SOCKMAP undefined: the header's #else branches (the *_hash helpers) are compiled */
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKHASH /* map type for the header's TEST_MAP_TYPE maps */
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_invalid_update.c b/tools/testing/selftests/bpf/progs/test_sockmap_invalid_update.c
new file mode 100644
index 000000000..02a59e220
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_invalid_update.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct { /* single-slot sockmap written by bpf_sockmap() in this file */
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} map SEC(".maps");
+
+SEC("sockops") /* stores skops->sk into map slot 0 whenever skops->sk is non-NULL */
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+ __u32 key = 0;
+ /* NOTE(review): the file name suggests this update is meant to be rejected — confirm against the test runner */
+ if (skops->sk)
+ bpf_map_update_elem(&map, &key, skops->sk, 0);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.c b/tools/testing/selftests/bpf/progs/test_sockmap_kern.c
new file mode 100644
index 000000000..677b2ed1c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.c
@@ -0,0 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+#define SOCKMAP /* SOCKMAP defined: the header's #ifdef SOCKMAP branches (the *_map helpers) are compiled */
+#define TEST_MAP_TYPE BPF_MAP_TYPE_SOCKMAP /* map type for the header's TEST_MAP_TYPE maps */
+#include "./test_sockmap_kern.h"
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
new file mode 100644
index 000000000..5cb90ca29
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h
@@ -0,0 +1,375 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Sockmap sample program connects a client and a backend together
+ * using cgroups.
+ *
+ * client:X <---> frontend:80 client:X <---> backend:80
+ *
+ * For simplicity we hard code values here and bind 1:1. The hard
+ * coded values are part of the setup in sockmap.sh script that
+ * is associated with this BPF program.
+ *
+ * The bpf_printk is verbose and prints information as connections
+ * are established and verdicts are decided.
+ */
+
+struct { /* redirect target of bpf_prog2(); filled by the sockops program bpf_sockmap() */
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map SEC(".maps");
+
+struct { /* not referenced by any program in this header */
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_txmsg SEC(".maps");
+
+struct { /* redirect target of the sk_msg program bpf_prog6() */
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} sock_map_redir SEC(".maps");
+
+struct { /* [0]: byte count passed to bpf_msg_apply_bytes() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_apply_bytes SEC(".maps");
+
+struct { /* [0]: byte count passed to bpf_msg_cork_bytes() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_cork_bytes SEC(".maps");
+
+struct { /* [0],[1]: pull_data args; [2],[3]: push_data args; [4],[5]: pop_data args */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 6);
+ __type(key, int);
+ __type(value, int);
+} sock_bytes SEC(".maps");
+
+struct { /* [0]: when non-zero, flags for the redirect in bpf_prog6() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, int);
+} sock_redir_flags SEC(".maps");
+
+struct { /* [0]: read by bpf_prog2(); [1]: read by bpf_prog3(); [2]: read by bpf_prog1() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 3);
+ __type(key, int);
+ __type(value, int);
+} sock_skb_opts SEC(".maps");
+
+struct { /* redirect target used by bpf_prog3() */
+ __uint(type, TEST_MAP_TYPE);
+ __uint(max_entries, 20);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} tls_sock_map SEC(".maps");
+
+SEC("sk_skb1")
+int bpf_prog1(struct __sk_buff *skb)
+{
+ int opt_idx = 2;
+ int *override = bpf_map_lookup_elem(&sock_skb_opts, &opt_idx);
+
+ /* sock_skb_opts[2], when present and non-zero, replaces skb->len as the result */
+ if (!override || !*override)
+ return skb->len;
+ return *override;
+}
+
+SEC("sk_skb2")
+int bpf_prog2(struct __sk_buff *skb)
+{
+ __u32 lport = skb->local_port;
+ int *f, ret, zero = 0;
+ __u64 flags = 0;
+
+ /* Default redirect key: 10 for local port 10000, otherwise 1 */
+ if (lport == 10000)
+ ret = 10;
+ else
+ ret = 1;
+
+ /* sock_skb_opts[0], when non-zero, forces key 3 and supplies the flags */
+ f = bpf_map_lookup_elem(&sock_skb_opts, &zero);
+ if (f && *f) {
+ ret = 3;
+ flags = *f;
+ }
+
+#ifdef SOCKMAP
+ return bpf_sk_redirect_map(skb, &sock_map, ret, flags);
+#else
+ return bpf_sk_redirect_hash(skb, &sock_map, &ret, flags);
+#endif
+
+}
+
+static inline void bpf_write_pass(struct __sk_buff *skb, int offset)
+{
+ int err = bpf_skb_pull_data(skb, 6 + offset);
+ void *data_end;
+ char *c;
+ /* Nothing is written when the pull fails */
+ if (err)
+ return;
+ /* skb->data and skb->data_end are read only after the pull */
+ c = (char *)(long)skb->data;
+ data_end = (void *)(long)skb->data_end;
+ /* Bounds check before overwriting 4 bytes at c + offset with "PASS" */
+ if (c + 5 + offset < data_end)
+ memcpy(c + offset, "PASS", 4);
+}
+
+SEC("sk_skb3")
+int bpf_prog3(struct __sk_buff *skb)
+{
+ int err, *f, ret = SK_PASS;
+ const int one = 1;
+
+ f = bpf_map_lookup_elem(&sock_skb_opts, &one);
+ if (f && *f) {
+ __u64 flags = 0;
+
+ ret = 0;
+ flags = *f;
+ /* Shrink by 13 bytes, grow by 4, stamp "PASS" at offset 0, then redirect */
+ err = bpf_skb_adjust_room(skb, -13, 0, 0);
+ if (err)
+ return SK_DROP;
+ err = bpf_skb_adjust_room(skb, 4, 0, 0);
+ if (err)
+ return SK_DROP;
+ bpf_write_pass(skb, 0);
+#ifdef SOCKMAP
+ return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags);
+#else
+ return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags);
+#endif
+ }
+ /* Re-read the option: it is non-zero here only if it changed after the first lookup */
+ f = bpf_map_lookup_elem(&sock_skb_opts, &one);
+ if (f && *f)
+ ret = SK_DROP;
+ err = bpf_skb_adjust_room(skb, 4, 0, 0);
+ if (err)
+ return SK_DROP;
+ bpf_write_pass(skb, 13);
+ return ret;
+}
+
+SEC("sockops")
+int bpf_sockmap(struct bpf_sock_ops *skops)
+{
+ __u32 lport, rport;
+ int op, err = 0, ret;
+
+ /* Add newly established sockets on the test ports to sock_map */
+ op = (int) skops->op;
+
+ switch (op) {
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ lport = skops->local_port;
+ rport = skops->remote_port;
+ /* Passive side with local port 10000 is stored under key 1 */
+ if (lport == 10000) {
+ ret = 1;
+#ifdef SOCKMAP
+ err = bpf_sock_map_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#else
+ err = bpf_sock_hash_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#endif
+ }
+ break;
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ lport = skops->local_port;
+ rport = skops->remote_port;
+ /* Active side with remote port 10001 (after bpf_ntohl) is stored under key 10 */
+ if (bpf_ntohl(rport) == 10001) {
+ ret = 10;
+#ifdef SOCKMAP
+ err = bpf_sock_map_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#else
+ err = bpf_sock_hash_update(skops, &sock_map, &ret,
+ BPF_NOEXIST);
+#endif
+ }
+ break;
+ default:
+ break;
+ }
+ /* The update result in err is not acted on; the program always returns 0 */
+ return 0;
+}
+
+SEC("sk_msg1") /* runs the configured apply/cork/pull/push/pop steps, then passes */
+int bpf_prog4(struct sk_msg_md *msg)
+{
+ int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5;
+ int *start, *end, *start_push, *end_push, *start_pop, *pop, err = 0;
+ /* Each step is skipped when its map lookup returns NULL */
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+ start = bpf_map_lookup_elem(&sock_bytes, &zero); /* sock_bytes[0], [1]: pull_data arguments */
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two); /* sock_bytes[2], [3]: push_data arguments */
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push) {
+ err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
+ if (err) /* only a failed push changes the verdict */
+ return SK_DROP;
+ }
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four); /* sock_bytes[4], [5]: pop_data arguments */
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+ return SK_PASS;
+}
+
+SEC("sk_msg2") /* same steps as bpf_prog4(), but ends with a redirect through sock_map_redir */
+int bpf_prog6(struct sk_msg_md *msg)
+{
+ int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0;
+ int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop, *f;
+ int err = 0;
+ __u64 flags = 0;
+ /* Each step is skipped when its map lookup returns NULL */
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+ /* sock_bytes[0], [1]: pull_data arguments */
+ start = bpf_map_lookup_elem(&sock_bytes, &zero);
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+ /* sock_bytes[2], [3]: push_data arguments; a failed push drops the message */
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push) {
+ err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
+ if (err)
+ return SK_DROP;
+ }
+ /* sock_bytes[4], [5]: pop_data arguments */
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+ /* Non-zero sock_redir_flags[0] switches the redirect key from 0 to 2 and supplies the flags */
+ f = bpf_map_lookup_elem(&sock_redir_flags, &zero);
+ if (f && *f) {
+ key = 2;
+ flags = *f;
+ }
+#ifdef SOCKMAP
+ return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags);
+#else
+ return bpf_msg_redirect_hash(msg, &sock_map_redir, &key, flags);
+#endif
+}
+
+SEC("sk_msg3")
+int bpf_prog8(struct sk_msg_md *msg)
+{
+ int ret = 0, *bytes, zero = 0;
+
+ /* Pass only if sock_apply_bytes[0] exists and applying it succeeds; */
+ /* every other outcome drops the message. */
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes) {
+ ret = bpf_msg_apply_bytes(msg, *bytes);
+ if (ret)
+ return SK_DROP;
+ } else {
+ return SK_DROP;
+ }
+ return SK_PASS;
+}
+SEC("sk_msg4") /* corks the message until sock_cork_bytes[0] bytes are available */
+int bpf_prog9(struct sk_msg_md *msg)
+{
+ void *data_end = (void *)(long) msg->data_end;
+ void *data = (void *)(long) msg->data;
+ int ret = 0, *bytes, zero = 0;
+ /* A missing map entry means pass without corking */
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes) {
+ if (((__u64)data_end - (__u64)data) >= *bytes) /* already holds at least *bytes: pass */
+ return SK_PASS;
+ ret = bpf_msg_cork_bytes(msg, *bytes);
+ if (ret)
+ return SK_DROP;
+ }
+ return SK_PASS;
+}
+
+SEC("sk_msg5") /* same steps as bpf_prog4(), but the verdicts are swapped */
+int bpf_prog10(struct sk_msg_md *msg)
+{
+ int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop;
+ int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, err = 0;
+ /* Each step is skipped when its map lookup returns NULL */
+ bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero);
+ if (bytes)
+ bpf_msg_apply_bytes(msg, *bytes);
+ bytes = bpf_map_lookup_elem(&sock_cork_bytes, &zero);
+ if (bytes)
+ bpf_msg_cork_bytes(msg, *bytes);
+ start = bpf_map_lookup_elem(&sock_bytes, &zero);
+ end = bpf_map_lookup_elem(&sock_bytes, &one);
+ if (start && end)
+ bpf_msg_pull_data(msg, *start, *end, 0);
+ start_push = bpf_map_lookup_elem(&sock_bytes, &two);
+ end_push = bpf_map_lookup_elem(&sock_bytes, &three);
+ if (start_push && end_push) {
+ err = bpf_msg_push_data(msg, *start_push, *end_push, 0);
+ if (err) /* NOTE(review): a failed push passes here, the opposite of bpf_prog4() — presumably intentional, confirm */
+ return SK_PASS;
+ }
+ start_pop = bpf_map_lookup_elem(&sock_bytes, &four);
+ pop = bpf_map_lookup_elem(&sock_bytes, &five);
+ if (start_pop && pop)
+ bpf_msg_pop_data(msg, *start_pop, *pop, 0);
+ return SK_DROP; /* normal completion drops the message */
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_listen.c b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
new file mode 100644
index 000000000..a3a366c57
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_listen.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+
+#include <errno.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+struct { /* target map used when test_sockmap is true */
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, __u64);
+} sock_map SEC(".maps");
+
+struct { /* target map used when test_sockmap is false */
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, __u64);
+} sock_hash SEC(".maps");
+
+struct { /* per-verdict hit counters, indexed by the verdict value each program returns */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, unsigned int);
+} verdict_map SEC(".maps");
+
+static volatile bool test_sockmap; /* toggled by user-space */
+
+SEC("sk_skb/stream_parser") /* parser that returns the full skb length */
+int prog_skb_parser(struct __sk_buff *skb)
+{
+ return skb->len;
+}
+
+SEC("sk_skb/stream_verdict")
+int prog_skb_verdict(struct __sk_buff *skb)
+{
+ unsigned int *tally;
+ __u32 slot = 0;
+ int rc;
+
+ /* Redirect to entry 0 of whichever map type test_sockmap selects */
+ rc = test_sockmap ? bpf_sk_redirect_map(skb, &sock_map, slot, 0)
+ : bpf_sk_redirect_hash(skb, &sock_hash, &slot, 0);
+
+ /* Count the outcome in verdict_map, keyed by the verdict itself */
+ tally = bpf_map_lookup_elem(&verdict_map, &rc);
+ if (tally)
+ *tally += 1;
+
+ return rc;
+}
+
+SEC("sk_msg")
+int prog_msg_verdict(struct sk_msg_md *msg)
+{
+ unsigned int *tally;
+ __u32 slot = 0;
+ int rc;
+
+ /* Redirect to entry 0 of whichever map type test_sockmap selects */
+ rc = test_sockmap ? bpf_msg_redirect_map(msg, &sock_map, slot, 0)
+ : bpf_msg_redirect_hash(msg, &sock_hash, &slot, 0);
+
+ /* Count the outcome in verdict_map, keyed by the verdict itself */
+ tally = bpf_map_lookup_elem(&verdict_map, &rc);
+ if (tally)
+ *tally += 1;
+
+ return rc;
+}
+
+SEC("sk_reuseport")
+int prog_reuseport(struct sk_reuseport_md *reuse)
+{
+ unsigned int *tally;
+ int rc, outcome = SK_PASS;
+ __u32 slot = 0;
+
+ /* Same helper for both map types; only the map argument differs */
+ rc = test_sockmap ? bpf_sk_select_reuseport(reuse, &sock_map, &slot, 0)
+ : bpf_sk_select_reuseport(reuse, &sock_hash, &slot, 0);
+ if (rc)
+ outcome = SK_DROP;
+
+ tally = bpf_map_lookup_elem(&verdict_map, &outcome);
+ if (tally)
+ *tally += 1;
+
+ return outcome;
+}
+
+int _version SEC("version") = 1;
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_update.c b/tools/testing/selftests/bpf/progs/test_sockmap_update.c
new file mode 100644
index 000000000..9d0c9f28c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_update.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Cloudflare
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct { /* source sockmap: copy_sock_map() looks up slot 0 here */
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} src SEC(".maps");
+
+struct { /* first destination: sockmap, slot 0 */
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} dst_sock_map SEC(".maps");
+
+struct { /* second destination: sockhash, key 0 */
+ __uint(type, BPF_MAP_TYPE_SOCKHASH);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u64);
+} dst_sock_hash SEC(".maps");
+
+SEC("classifier/copy_sock_map") /* copies the socket in src[0] into both destination maps */
+int copy_sock_map(void *ctx)
+{
+ struct bpf_sock *sk;
+ bool failed = false;
+ __u32 key = 0;
+ /* An empty source slot is reported as SK_DROP */
+ sk = bpf_map_lookup_elem(&src, &key);
+ if (!sk)
+ return SK_DROP;
+ /* Both updates are attempted even when the first one fails */
+ if (bpf_map_update_elem(&dst_sock_map, &key, sk, 0))
+ failed = true;
+
+ if (bpf_map_update_elem(&dst_sock_hash, &key, sk, 0))
+ failed = true;
+ /* Release the socket obtained from the lookup above */
+ bpf_sk_release(sk);
+ return failed ? SK_DROP : SK_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock.c b/tools/testing/selftests/bpf/progs/test_spin_lock.c
new file mode 100644
index 000000000..0d31a3b35
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_spin_lock.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <linux/version.h>
+#include <bpf/bpf_helpers.h>
+
+struct hmap_elem { /* hash-map value: counter, lock in the middle, trailing pad */
+ volatile int cnt;
+ struct bpf_spin_lock lock;
+ int test_padding;
+};
+
+struct { /* one-entry hash map used for the first locked section */
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct hmap_elem);
+} hmap SEC(".maps");
+
+struct cls_elem { /* cgroup-storage value: lock first, then counter */
+ struct bpf_spin_lock lock;
+ volatile int cnt;
+};
+
+struct { /* cgroup local storage used for the third locked section */
+ __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+ __type(key, struct bpf_cgroup_storage_key);
+ __type(value, struct cls_elem);
+} cls_map SEC(".maps");
+
+struct bpf_vqueue { /* array-map value for the virtual queue demo */
+ struct bpf_spin_lock lock;
+ /* 4 byte hole */
+ unsigned long long lasttime;
+ int credit;
+ unsigned int rate;
+};
+
+struct { /* one-entry array map used for the second locked section */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, int);
+ __type(value, struct bpf_vqueue);
+} vqueue SEC(".maps");
+
+#define CREDIT_PER_NS(delta, rate) (((delta) * (rate)) >> 20) /* (delta * rate) >> 20; both arguments parenthesized */
+
+SEC("spin_lock_demo") /* takes bpf_spin_lock in a hash map, an array map and cgroup storage */
+int bpf_sping_lock_test(struct __sk_buff *skb)
+{
+ volatile int credit = 0, max_credit = 100, pkt_len = 64;
+ struct hmap_elem zero = {}, *val;
+ unsigned long long curtime;
+ struct bpf_vqueue *q;
+ struct cls_elem *cls;
+ int key = 0;
+ int err = 0;
+ /* Create hmap[0] on first use; fail with err = 1 if it still cannot be found */
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (!val) {
+ bpf_map_update_elem(&hmap, &key, &zero, 0);
+ val = bpf_map_lookup_elem(&hmap, &key);
+ if (!val) {
+ err = 1;
+ goto err;
+ }
+ }
+ /* spin_lock in hash map run time test */
+ bpf_spin_lock(&val->lock);
+ if (val->cnt)
+ val->cnt--;
+ else
+ val->cnt++;
+ if (val->cnt != 0 && val->cnt != 1)
+ err = 1;
+ bpf_spin_unlock(&val->lock);
+
+ /* spin_lock in array. virtual queue demo */
+ q = bpf_map_lookup_elem(&vqueue, &key);
+ if (!q) /* NOTE(review): err is not set here, so this path returns whatever the hash-map section left in err */
+ goto err;
+ curtime = bpf_ktime_get_ns();
+ bpf_spin_lock(&q->lock);
+ q->credit += CREDIT_PER_NS(curtime - q->lasttime, q->rate);
+ q->lasttime = curtime;
+ if (q->credit > max_credit)
+ q->credit = max_credit;
+ q->credit -= pkt_len;
+ credit = q->credit;
+ bpf_spin_unlock(&q->lock);
+
+ /* spin_lock in cgroup local storage */
+ cls = bpf_get_local_storage(&cls_map, 0);
+ bpf_spin_lock(&cls->lock);
+ cls->cnt++;
+ bpf_spin_unlock(&cls->lock);
+
+err:
+ return err;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_stack_map.c b/tools/testing/selftests/bpf/progs/test_stack_map.c
new file mode 100644
index 000000000..31c3880e6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_stack_map.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Politecnico di Torino
+#define MAP_TYPE BPF_MAP_TYPE_STACK /* NOTE(review): presumably consumed by the header included next — confirm */
+#include "test_queue_stack_map.h"
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
new file mode 100644
index 000000000..0cf013463
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_build_id.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127 /* fallback when the included headers do not define it */
+#endif
+
+struct { /* control_map[0] != 0 makes oncpu() return without recording */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} control_map SEC(".maps");
+
+struct { /* one entry per stack id returned by bpf_get_stackid() */
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16384);
+ __type(key, __u32);
+ __type(value, __u32);
+} stackid_hmap SEC(".maps");
+/* One stack trace: PERF_MAX_STACK_DEPTH build-id records */
+typedef struct bpf_stack_build_id stack_trace_t[PERF_MAX_STACK_DEPTH];
+
+struct { /* stack-trace map created with BPF_F_STACK_BUILD_ID */
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 128);
+ __uint(map_flags, BPF_F_STACK_BUILD_ID);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(stack_trace_t));
+} stackmap SEC(".maps");
+
+struct { /* stack_amap[stack id] receives the trace written by bpf_get_stack() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 128);
+ __type(key, __u32);
+ __type(value, stack_trace_t);
+} stack_amap SEC(".maps");
+
+/* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
+struct random_urandom_args { /* context layout passed to oncpu() */
+ unsigned long long pad;
+ int got_bits;
+ int pool_left;
+ int input_left;
+};
+
+SEC("tracepoint/random/urandom_read")
+int oncpu(struct random_urandom_args *args)
+{
+ __u32 buf_len = sizeof(stack_trace_t);
+ __u32 stack_id = 0, marker = 0, *paused;
+ void *slot;
+
+ /* A non-zero control_map[0] pauses collection */
+ paused = bpf_map_lookup_elem(&control_map, &stack_id);
+ if (paused && *paused)
+ return 0;
+
+ /* The size of stackmap and stackid_hmap should be the same */
+ stack_id = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
+ if ((int)stack_id < 0)
+ return 0;
+
+ bpf_map_update_elem(&stackid_hmap, &stack_id, &marker, 0);
+ slot = bpf_map_lookup_elem(&stack_amap, &stack_id);
+ if (slot)
+ bpf_get_stack(args, slot, buf_len,
+ BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
new file mode 100644
index 000000000..00ed48672
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127 /* fallback when the included headers do not define it */
+#endif
+
+struct { /* control_map[0] != 0 makes oncpu() return without recording */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, __u32);
+ __type(value, __u32);
+} control_map SEC(".maps");
+
+struct { /* one entry per stack id returned by bpf_get_stackid() */
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16384);
+ __type(key, __u32);
+ __type(value, __u32);
+} stackid_hmap SEC(".maps");
+/* One stack trace: PERF_MAX_STACK_DEPTH 64-bit entries */
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+
+struct { /* stack-trace map filled by bpf_get_stackid() */
+ __uint(type, BPF_MAP_TYPE_STACK_TRACE);
+ __uint(max_entries, 16384);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(stack_trace_t));
+} stackmap SEC(".maps");
+
+struct { /* stack_amap[stack id] receives the trace written by bpf_get_stack() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 16384);
+ __type(key, __u32);
+ __type(value, stack_trace_t);
+} stack_amap SEC(".maps");
+
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args { /* context layout passed to oncpu() */
+ unsigned long long pad;
+ char prev_comm[16];
+ int prev_pid;
+ int prev_prio;
+ long long prev_state;
+ char next_comm[16];
+ int next_pid;
+ int next_prio;
+};
+
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+ __u32 buf_len = sizeof(stack_trace_t);
+ __u32 stack_id = 0, marker = 0, *paused;
+ void *slot;
+
+ paused = bpf_map_lookup_elem(&control_map, &stack_id);
+ if (paused && *paused)
+ return 0; /* a non-zero control_map[0] pauses collection */
+
+ /* The size of stackmap and stackid_hmap should be the same */
+ stack_id = bpf_get_stackid(ctx, &stackmap, 0);
+ if ((int)stack_id < 0)
+ return 0;
+
+ bpf_map_update_elem(&stackid_hmap, &stack_id, &marker, 0);
+ slot = bpf_map_lookup_elem(&stack_amap, &stack_id);
+ if (slot)
+ bpf_get_stack(ctx, slot, buf_len, 0);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_subprogs.c b/tools/testing/selftests/bpf/progs/test_subprogs.c
new file mode 100644
index 000000000..d3c5673c0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_subprogs.c
@@ -0,0 +1,103 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+const char LICENSE[] SEC("license") = "GPL";
+
+__noinline int sub1(int x)
+{
+ return x + 1;
+}
+/* Forward declaration: sub5() is defined after its caller sub2() */
+static __noinline int sub5(int v);
+/* Non-static subprogram that calls the static sub5(); returns sub5(y + 2) */
+__noinline int sub2(int y)
+{
+ return sub5(y + 2);
+}
+/* Static subprogram that calls the non-static sub1(); returns z + 3 + sub1(4) */
+static __noinline int sub3(int z)
+{
+ return z + 3 + sub1(4);
+}
+/* Static subprogram that calls both sub3() and sub1(); returns w + sub3(5) + sub1(6) */
+static __noinline int sub4(int w)
+{
+ return w + sub3(5) + sub1(6);
+}
+
+/* sub5() is an identity function, just to test weirder function layouts and
+ * call patterns
+ */
+static __noinline int sub5(int v)
+{
+ return sub1(v) - 1; /* compensates sub1()'s + 1 */
+}
+
+/* unfortunately the verifier rejects `struct task_struct *t` as an unknown
+ * pointer type, so we need to accept the pointer as an integer and then cast
+ * it inside the function
+ */
+__noinline int get_task_tgid(uintptr_t t)
+{
+ /* this ensures that CO-RE relocs work in multi-subprogs .text */
+ return BPF_CORE_READ((struct task_struct *)(void *)t, tgid);
+}
+
+int res1 = 0; /* written by prog1() */
+int res2 = 0; /* written by prog2() */
+int res3 = 0; /* written by prog3() */
+int res4 = 0; /* written by prog4() */
+
+SEC("raw_tp/sys_enter") /* stores sub1(1) + sub3(2) in res1 */
+int prog1(void *ctx)
+{
+ /* perform some CO-RE relocations to ensure they work with multi-prog
+ * sections correctly
+ */
+ struct task_struct *t = (void *)bpf_get_current_task();
+ /* Return 1 without touching res1 when either read yields 0 */
+ if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t))
+ return 1;
+
+ res1 = sub1(1) + sub3(2); /* (1 + 1) + (2 + 3 + (4 + 1)) = 12 */
+ return 0;
+}
+
+SEC("raw_tp/sys_exit") /* stores sub2(3) + sub3(4) in res2 */
+int prog2(void *ctx)
+{
+ struct task_struct *t = (void *)bpf_get_current_task();
+ /* Return 1 without touching res2 when either read yields 0 */
+ if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t))
+ return 1;
+
+ res2 = sub2(3) + sub3(4); /* (3 + 2) + (4 + 3 + (4 + 1)) = 17 */
+ return 0;
+}
+
+/* prog3 has the same section name as prog1 */
+SEC("raw_tp/sys_enter") /* stores sub3(5) + 6 in res3 */
+int prog3(void *ctx)
+{
+ struct task_struct *t = (void *)bpf_get_current_task();
+ /* Return 1 without touching res3 when either read yields 0 */
+ if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t))
+ return 1;
+
+ res3 = sub3(5) + 6; /* (5 + 3 + (4 + 1)) + 6 = 19 */
+ return 0;
+}
+
+/* prog4 has the same section name as prog2 */
+SEC("raw_tp/sys_exit") /* stores sub4(7) + sub1(8) in res4 */
+int prog4(void *ctx)
+{
+ struct task_struct *t = (void *)bpf_get_current_task();
+ /* Return 1 without touching res4 when either read yields 0 */
+ if (!BPF_CORE_READ(t, pid) || !get_task_tgid((uintptr_t)t))
+ return 1;
+
+ res4 = sub4(7) + sub1(8); /* (7 + (5 + 3 + (4 + 1)) + (6 + 1)) + (8 + 1) = 36 */
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_subprogs_unused.c b/tools/testing/selftests/bpf/progs/test_subprogs_unused.c
new file mode 100644
index 000000000..bc49e050d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_subprogs_unused.c
@@ -0,0 +1,21 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+
+const char LICENSE[] SEC("license") = "GPL";
+
+/* Non-static, never-inlined subprogram that nothing in this file calls;
+ * yields x + 1.
+ */
+__attribute__((unused)) __noinline int unused1(int x)
+{
+	int bumped = x + 1;
+
+	return bumped;
+}
+
+/* Static, never-inlined subprogram that nothing in this file calls;
+ * yields x + 2.
+ */
+static __attribute__((unused)) __noinline int unused2(int x)
+{
+	int bumped = x + 2;
+
+	return bumped;
+}
+
+/* Only program in this object; it calls neither unused1() nor unused2() and
+ * always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int main_prog(void *ctx)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
new file mode 100644
index 000000000..553a282d8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop1.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* tcp_mem sysctl has only 3 ints, but this test is doing TCP_MEM_LOOPS */
+#define TCP_MEM_LOOPS 28 /* because 30 doesn't fit into 512 bytes of stack */
+#define MAX_ULONG_STR_LEN 7
+#define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN)
+
+const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string";
+/* Returns 1 when the name fetched with bpf_sysctl_get_name() is byte-for-byte
+ * equal to tcp_mem_name (terminating NUL included), 0 otherwise.
+ */
+static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned char i;
+ char name[sizeof(tcp_mem_name)];
+ int ret;
+
+ memset(name, 0, sizeof(name));
+ ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0);
+ /* reject helper errors and any return value other than the expected
+ * name length
+ */
+ if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
+ return 0;
+
+ /* compare one byte per iteration; unrolling is explicitly disabled */
+#pragma clang loop unroll(disable)
+ for (i = 0; i < sizeof(tcp_mem_name); ++i)
+ if (name[i] != tcp_mem_name[i])
+ return 0;
+
+ return 1;
+}
+
+/* cgroup/sysctl program: returns 0 for writes, for sysctls whose name is not
+ * tcp_mem_name, and on any helper or parse failure. Otherwise parses
+ * TCP_MEM_LOOPS unsigned longs from the current value and returns whether the
+ * first three are strictly increasing.
+ */
+SEC("cgroup/sysctl")
+int sysctl_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned long tcp_mem[TCP_MEM_LOOPS] = {};
+ char value[MAX_VALUE_STR_LEN];
+ unsigned char i, off = 0;
+ /* a workaround to prevent compiler from generating
+ * codes verifier cannot handle yet.
+ */
+ volatile int ret;
+
+ if (ctx->write)
+ return 0;
+
+ if (!is_tcp_mem(ctx))
+ return 0;
+
+ ret = bpf_sysctl_get_current_value(ctx, value, MAX_VALUE_STR_LEN);
+ if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
+ return 0;
+
+ /* parse one number per iteration, advancing off past what was consumed */
+#pragma clang loop unroll(disable)
+ for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
+ ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
+ tcp_mem + i);
+ if (ret <= 0 || ret > MAX_ULONG_STR_LEN)
+ return 0;
+ /* the mask keeps the increment within 0..MAX_ULONG_STR_LEN */
+ off += ret & MAX_ULONG_STR_LEN;
+ }
+
+ return tcp_mem[0] < tcp_mem[1] && tcp_mem[1] < tcp_mem[2];
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
new file mode 100644
index 000000000..2b64bc563
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_loop2.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* tcp_mem sysctl has only 3 ints, but this test is doing TCP_MEM_LOOPS */
+#define TCP_MEM_LOOPS 20 /* because 30 doesn't fit into 512 bytes of stack */
+#define MAX_ULONG_STR_LEN 7
+#define MAX_VALUE_STR_LEN (TCP_MEM_LOOPS * MAX_ULONG_STR_LEN)
+
+const char tcp_mem_name[] = "net/ipv4/tcp_mem/very_very_very_very_long_pointless_string_to_stress_byte_loop";
+/* Never-inlined variant: returns 1 when the name fetched with
+ * bpf_sysctl_get_name() is byte-for-byte equal to tcp_mem_name (terminating
+ * NUL included), 0 otherwise.
+ */
+static __attribute__((noinline)) int is_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned char i;
+ char name[sizeof(tcp_mem_name)];
+ int ret;
+
+ memset(name, 0, sizeof(name));
+ ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0);
+ /* reject helper errors and any return value other than the expected
+ * name length
+ */
+ if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
+ return 0;
+
+ /* compare one byte per iteration; unrolling is explicitly disabled */
+#pragma clang loop unroll(disable)
+ for (i = 0; i < sizeof(tcp_mem_name); ++i)
+ if (name[i] != tcp_mem_name[i])
+ return 0;
+
+ return 1;
+}
+
+
+/* cgroup/sysctl program: returns 0 for writes, for sysctls whose name is not
+ * tcp_mem_name, and on any helper or parse failure. Otherwise parses
+ * TCP_MEM_LOOPS unsigned longs from the current value and returns whether the
+ * first three are strictly increasing.
+ */
+SEC("cgroup/sysctl")
+int sysctl_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned long tcp_mem[TCP_MEM_LOOPS] = {};
+ char value[MAX_VALUE_STR_LEN];
+ unsigned char i, off = 0;
+ int ret;
+
+ if (ctx->write)
+ return 0;
+
+ if (!is_tcp_mem(ctx))
+ return 0;
+
+ ret = bpf_sysctl_get_current_value(ctx, value, MAX_VALUE_STR_LEN);
+ if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
+ return 0;
+
+ /* parse one number per iteration, advancing off past what was consumed */
+#pragma clang loop unroll(disable)
+ for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
+ ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
+ tcp_mem + i);
+ if (ret <= 0 || ret > MAX_ULONG_STR_LEN)
+ return 0;
+ /* the mask keeps the increment within 0..MAX_ULONG_STR_LEN */
+ off += ret & MAX_ULONG_STR_LEN;
+ }
+
+ return tcp_mem[0] < tcp_mem[1] && tcp_mem[1] < tcp_mem[2];
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_sysctl_prog.c b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
new file mode 100644
index 000000000..5489823c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sysctl_prog.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <stdint.h>
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf_helpers.h>
+
+/* Max supported length of a string with unsigned long in base 10 (pow2 - 1). */
+#define MAX_ULONG_STR_LEN 0xF
+
+/* Max supported length of sysctl value string (pow2). */
+#define MAX_VALUE_STR_LEN 0x40
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+const char tcp_mem_name[] = "net/ipv4/tcp_mem";
+/* Returns 1 when the name fetched with bpf_sysctl_get_name() is byte-for-byte
+ * equal to tcp_mem_name (terminating NUL included), 0 otherwise.
+ */
+static __always_inline int is_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned char i;
+ char name[sizeof(tcp_mem_name)];
+ int ret;
+
+ memset(name, 0, sizeof(name));
+ ret = bpf_sysctl_get_name(ctx, name, sizeof(name), 0);
+ /* reject helper errors and any return value other than the expected
+ * name length
+ */
+ if (ret < 0 || ret != sizeof(tcp_mem_name) - 1)
+ return 0;
+
+ /* byte-wise comparison; this variant asks for full unrolling */
+#pragma clang loop unroll(full)
+ for (i = 0; i < sizeof(tcp_mem_name); ++i)
+ if (name[i] != tcp_mem_name[i])
+ return 0;
+
+ return 1;
+}
+
+/* cgroup/sysctl program: returns 0 for writes, for sysctls whose name is not
+ * tcp_mem_name, and on any helper or parse failure. Otherwise parses three
+ * unsigned longs from the current value and returns whether they are strictly
+ * increasing.
+ */
+SEC("cgroup/sysctl")
+int sysctl_tcp_mem(struct bpf_sysctl *ctx)
+{
+ unsigned long tcp_mem[3] = {0, 0, 0};
+ char value[MAX_VALUE_STR_LEN];
+ unsigned char i, off = 0;
+ volatile int ret;
+
+ if (ctx->write)
+ return 0;
+
+ if (!is_tcp_mem(ctx))
+ return 0;
+
+ ret = bpf_sysctl_get_current_value(ctx, value, MAX_VALUE_STR_LEN);
+ if (ret < 0 || ret >= MAX_VALUE_STR_LEN)
+ return 0;
+
+ /* parse one number per iteration, advancing off past what was consumed */
+#pragma clang loop unroll(full)
+ for (i = 0; i < ARRAY_SIZE(tcp_mem); ++i) {
+ ret = bpf_strtoul(value + off, MAX_ULONG_STR_LEN, 0,
+ tcp_mem + i);
+ if (ret <= 0 || ret > MAX_ULONG_STR_LEN)
+ return 0;
+ /* the mask keeps the increment within 0..MAX_ULONG_STR_LEN */
+ off += ret & MAX_ULONG_STR_LEN;
+ }
+
+
+ return tcp_mem[0] < tcp_mem[1] && tcp_mem[1] < tcp_mem[2];
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_edt.c b/tools/testing/selftests/bpf/progs/test_tc_edt.c
new file mode 100644
index 000000000..bf28814bf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tc_edt.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/stddef.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/pkt_cls.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* the maximum delay we are willing to add (drop packets beyond that) */
+#define TIME_HORIZON_NS (2000 * 1000 * 1000)
+#define NS_PER_SEC 1000000000
+#define ECN_HORIZON_NS 5000000
+#define THROTTLE_RATE_BPS (5 * 1000 * 1000)
+
+/* flow_key => last_tstamp timestamp used */
+struct bpf_map_def SEC("maps") flow_map = {
+ .type = BPF_MAP_TYPE_HASH,
+ .key_size = sizeof(uint32_t),
+ .value_size = sizeof(uint64_t),
+ .max_entries = 1,
+};
+
+/* Pace a packet against the single timestamp kept in flow_map.
+ *
+ * Returns TC_ACT_OK when the packet may proceed (possibly with skb->tstamp
+ * pushed into the future) and TC_ACT_SHOT when it would land beyond
+ * TIME_HORIZON_NS or a map update fails.
+ */
+static inline int throttle_flow(struct __sk_buff *skb)
+{
+ int key = 0;
+ uint64_t *last_tstamp = bpf_map_lookup_elem(&flow_map, &key);
+ /* delay derived from skb->len and THROTTLE_RATE_BPS, in nanoseconds */
+ uint64_t delay_ns = ((uint64_t)skb->len) * NS_PER_SEC /
+ THROTTLE_RATE_BPS;
+ uint64_t now = bpf_ktime_get_ns();
+ uint64_t tstamp, next_tstamp = 0;
+
+ /* no map entry yet: next_tstamp stays 0, so the first packet passes */
+ if (last_tstamp)
+ next_tstamp = *last_tstamp + delay_ns;
+
+ /* never use a departure time that is already in the past */
+ tstamp = skb->tstamp;
+ if (tstamp < now)
+ tstamp = now;
+
+ /* should we throttle? */
+ if (next_tstamp <= tstamp) {
+ if (bpf_map_update_elem(&flow_map, &key, &tstamp, BPF_ANY))
+ return TC_ACT_SHOT;
+ return TC_ACT_OK;
+ }
+
+ /* do not queue past the time horizon */
+ if (next_tstamp - now >= TIME_HORIZON_NS)
+ return TC_ACT_SHOT;
+
+ /* set ecn bit, if needed */
+ if (next_tstamp - now >= ECN_HORIZON_NS)
+ bpf_skb_ecn_set_ce(skb);
+
+ /* BPF_EXIST: only an already-present entry may be updated here */
+ if (bpf_map_update_elem(&flow_map, &key, &next_tstamp, BPF_EXIST))
+ return TC_ACT_SHOT;
+ skb->tstamp = next_tstamp;
+
+ return TC_ACT_OK;
+}
+
+/* Throttle TCP segments addressed to port 9000; pass all other TCP traffic.
+ * Returns TC_ACT_SHOT when the TCP header does not fit inside the packet.
+ */
+static inline int handle_tcp(struct __sk_buff *skb, struct tcphdr *tcp)
+{
+	void *pkt_end = (void *)(long)skb->data_end;
+
+	/* drop malformed packets */
+	if ((void *)(tcp + 1) > pkt_end)
+		return TC_ACT_SHOT;
+
+	if (tcp->dest != bpf_htons(9000))
+		return TC_ACT_OK;
+
+	return throttle_flow(skb);
+}
+
+/* Validate the Ethernet and IPv4 headers and hand TCP packets to handle_tcp().
+ * Returns TC_ACT_SHOT for packets too short for their headers and TC_ACT_OK
+ * for non-TCP packets.
+ */
+static inline int handle_ipv4(struct __sk_buff *skb)
+{
+ void *data_end = (void *)(long)skb->data_end;
+ void *data = (void *)(long)skb->data;
+ struct iphdr *iph;
+ uint32_t ihl;
+
+ /* drop malformed packets */
+ if (data + sizeof(struct ethhdr) > data_end)
+ return TC_ACT_SHOT;
+ iph = (struct iphdr *)(data + sizeof(struct ethhdr));
+ if ((void *)(iph + 1) > data_end)
+ return TC_ACT_SHOT;
+ /* ihl counts 32-bit words; convert to bytes */
+ ihl = iph->ihl * 4;
+ if (((void *)iph) + ihl > data_end)
+ return TC_ACT_SHOT;
+
+ /* the TCP header starts ihl bytes after the start of the IP header */
+ if (iph->protocol == IPPROTO_TCP)
+ return handle_tcp(skb, (struct tcphdr *)(((void *)iph) + ihl));
+
+ return TC_ACT_OK;
+}
+
+/* Program in section cls_test: IPv4 packets go through handle_ipv4(), every
+ * other protocol passes with TC_ACT_OK.
+ */
+SEC("cls_test") int tc_prog(struct __sk_buff *skb)
+{
+	if (skb->protocol != bpf_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return handle_ipv4(skb);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh.c b/tools/testing/selftests/bpf/progs/test_tc_neigh.c
new file mode 100644
index 000000000..b985ac4e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tc_neigh.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#ifndef ctx_ptr
+# define ctx_ptr(field) (void *)(long)(field)
+#endif
+
+#define ip4_src 0xac100164 /* 172.16.1.100 */
+#define ip4_dst 0xac100264 /* 172.16.2.100 */
+
+#define ip6_src { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+ 0x00, 0x01, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
+#define ip6_dst { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
+ 0x00, 0x02, 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe }
+
+#ifndef v6_equal
+# define v6_equal(a, b) (a.s6_addr32[0] == b.s6_addr32[0] && \
+ a.s6_addr32[1] == b.s6_addr32[1] && \
+ a.s6_addr32[2] == b.s6_addr32[2] && \
+ a.s6_addr32[3] == b.s6_addr32[3])
+#endif
+
+enum {
+ dev_src,
+ dev_dst,
+};
+
+struct bpf_map_def SEC("maps") ifindex_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 2,
+};
+
+/* True when the packet carries a complete Ethernet + IPv4 header whose
+ * destination address equals addr; false otherwise (including when the
+ * packet is too short).
+ */
+static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
+					    __be32 addr)
+{
+	void *pkt_end = ctx_ptr(skb->data_end);
+	void *pkt = ctx_ptr(skb->data);
+	struct iphdr *hdr;
+
+	if (pkt + sizeof(struct ethhdr) > pkt_end)
+		return false;
+
+	hdr = (struct iphdr *)(pkt + sizeof(struct ethhdr));
+	if ((void *)(hdr + 1) > pkt_end)
+		return false;
+
+	if (hdr->daddr == addr)
+		return true;
+
+	return false;
+}
+
+/* True when the packet carries a complete Ethernet + IPv6 header whose
+ * destination address equals addr; false otherwise (including when the
+ * packet is too short).
+ */
+static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
+					    struct in6_addr addr)
+{
+	void *pkt_end = ctx_ptr(skb->data_end);
+	void *pkt = ctx_ptr(skb->data);
+	struct ipv6hdr *hdr;
+
+	if (pkt + sizeof(struct ethhdr) > pkt_end)
+		return false;
+
+	hdr = (struct ipv6hdr *)(pkt + sizeof(struct ethhdr));
+	if ((void *)(hdr + 1) > pkt_end)
+		return false;
+
+	if (v6_equal(hdr->daddr, addr))
+		return true;
+
+	return false;
+}
+
+/* Look up slot `which` of ifindex_map; returns the stored ifindex, or 0 when
+ * the lookup yields no entry.
+ */
+static __always_inline int get_dev_ifindex(int which)
+{
+	int *slot = bpf_map_lookup_elem(&ifindex_map, &which);
+
+	if (!slot)
+		return 0;
+
+	return *slot;
+}
+
+/* Program in section chk_egress: returns TC_ACT_SHOT for packets shorter than
+ * an Ethernet header or whose first three 32-bit words are all zero, and
+ * TC_ACT_OK otherwise.
+ */
+SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
+{
+	void *pkt_end = ctx_ptr(skb->data_end);
+	void *pkt = ctx_ptr(skb->data);
+	__u32 *words = pkt;
+
+	if (pkt + sizeof(struct ethhdr) > pkt_end)
+		return TC_ACT_SHOT;
+
+	/* any non-zero word lets the packet through */
+	if (words[0] || words[1] || words[2])
+		return TC_ACT_OK;
+
+	return TC_ACT_SHOT;
+}
+
+/* Program in section dst_ingress: packets destined to ip4_src / ip6_src get
+ * their first ETH_ALEN * 2 bytes overwritten with zeroes and are handed to
+ * bpf_redirect_neigh() with the ifindex stored under dev_src. Everything else
+ * passes with TC_ACT_OK; a failed store yields TC_ACT_SHOT.
+ */
+SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
+{
+	__u8 blank_macs[ETH_ALEN * 2];
+	bool to_remote = false;
+
+	switch (skb->protocol) {
+	case __bpf_constant_htons(ETH_P_IP):
+		to_remote = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_src));
+		break;
+	case __bpf_constant_htons(ETH_P_IPV6):
+		to_remote = is_remote_ep_v6(skb, (struct in6_addr)ip6_src);
+		break;
+	default:
+		break;
+	}
+
+	if (to_remote) {
+		__builtin_memset(&blank_macs, 0, sizeof(blank_macs));
+		if (bpf_skb_store_bytes(skb, 0, &blank_macs,
+					sizeof(blank_macs), 0) < 0)
+			return TC_ACT_SHOT;
+
+		return bpf_redirect_neigh(get_dev_ifindex(dev_src), NULL, 0, 0);
+	}
+
+	return TC_ACT_OK;
+}
+
+/* Program in section src_ingress: packets destined to ip4_dst / ip6_dst get
+ * their first ETH_ALEN * 2 bytes overwritten with zeroes and are handed to
+ * bpf_redirect_neigh() with the ifindex stored under dev_dst. Everything else
+ * passes with TC_ACT_OK; a failed store yields TC_ACT_SHOT.
+ */
+SEC("src_ingress") int tc_src(struct __sk_buff *skb)
+{
+	__u8 blank_macs[ETH_ALEN * 2];
+	bool to_remote = false;
+
+	switch (skb->protocol) {
+	case __bpf_constant_htons(ETH_P_IP):
+		to_remote = is_remote_ep_v4(skb, __bpf_constant_htonl(ip4_dst));
+		break;
+	case __bpf_constant_htons(ETH_P_IPV6):
+		to_remote = is_remote_ep_v6(skb, (struct in6_addr)ip6_dst);
+		break;
+	default:
+		break;
+	}
+
+	if (to_remote) {
+		__builtin_memset(&blank_macs, 0, sizeof(blank_macs));
+		if (bpf_skb_store_bytes(skb, 0, &blank_macs,
+					sizeof(blank_macs), 0) < 0)
+			return TC_ACT_SHOT;
+
+		return bpf_redirect_neigh(get_dev_ifindex(dev_dst), NULL, 0, 0);
+	}
+
+	return TC_ACT_OK;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
new file mode 100644
index 000000000..d82ed3457
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tc_neigh_fib.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#ifndef ctx_ptr
+# define ctx_ptr(field) (void *)(long)(field)
+#endif
+
+#define AF_INET 2
+#define AF_INET6 10
+
+/* Fill *fib_params from the packet's IPv4 header.
+ * Returns 0 on success, -1 when the packet is too short to hold an Ethernet
+ * header followed by a struct iphdr.
+ */
+static __always_inline int fill_fib_params_v4(struct __sk_buff *skb,
+ struct bpf_fib_lookup *fib_params)
+{
+ void *data_end = ctx_ptr(skb->data_end);
+ void *data = ctx_ptr(skb->data);
+ struct iphdr *ip4h;
+
+ if (data + sizeof(struct ethhdr) > data_end)
+ return -1;
+
+ ip4h = (struct iphdr *)(data + sizeof(struct ethhdr));
+ if ((void *)(ip4h + 1) > data_end)
+ return -1;
+
+ /* ports are left at 0; tot_len is converted to host byte order */
+ fib_params->family = AF_INET;
+ fib_params->tos = ip4h->tos;
+ fib_params->l4_protocol = ip4h->protocol;
+ fib_params->sport = 0;
+ fib_params->dport = 0;
+ fib_params->tot_len = bpf_ntohs(ip4h->tot_len);
+ fib_params->ipv4_src = ip4h->saddr;
+ fib_params->ipv4_dst = ip4h->daddr;
+
+ return 0;
+}
+
+/* Fill *fib_params from the packet's IPv6 header.
+ * Returns 0 on success, -1 when the packet is too short to hold an Ethernet
+ * header followed by a struct ipv6hdr.
+ */
+static __always_inline int fill_fib_params_v6(struct __sk_buff *skb,
+ struct bpf_fib_lookup *fib_params)
+{
+ /* view the ipv6_src / ipv6_dst fields as struct in6_addr so the
+ * addresses can be copied by assignment below
+ */
+ struct in6_addr *src = (struct in6_addr *)fib_params->ipv6_src;
+ struct in6_addr *dst = (struct in6_addr *)fib_params->ipv6_dst;
+ void *data_end = ctx_ptr(skb->data_end);
+ void *data = ctx_ptr(skb->data);
+ struct ipv6hdr *ip6h;
+
+ if (data + sizeof(struct ethhdr) > data_end)
+ return -1;
+
+ ip6h = (struct ipv6hdr *)(data + sizeof(struct ethhdr));
+ if ((void *)(ip6h + 1) > data_end)
+ return -1;
+
+ /* flowinfo and ports are left at 0; tot_len is taken from payload_len */
+ fib_params->family = AF_INET6;
+ fib_params->flowinfo = 0;
+ fib_params->l4_protocol = ip6h->nexthdr;
+ fib_params->sport = 0;
+ fib_params->dport = 0;
+ fib_params->tot_len = bpf_ntohs(ip6h->payload_len);
+ *src = ip6h->saddr;
+ *dst = ip6h->daddr;
+
+ return 0;
+}
+
+/* Program in section chk_egress: returns TC_ACT_SHOT for packets shorter than
+ * an Ethernet header or whose first three 32-bit words are all zero, and
+ * TC_ACT_OK otherwise.
+ */
+SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
+{
+	void *pkt_end = ctx_ptr(skb->data_end);
+	void *pkt = ctx_ptr(skb->data);
+	__u32 *words = pkt;
+
+	if (pkt + sizeof(struct ethhdr) > pkt_end)
+		return TC_ACT_SHOT;
+
+	/* any non-zero word lets the packet through */
+	if (words[0] || words[1] || words[2])
+		return TC_ACT_OK;
+
+	return TC_ACT_SHOT;
+}
+
+/* Redirect IPv4/IPv6 packets according to a bpf_fib_lookup() result.
+ *
+ * Returns TC_ACT_OK when the packet is not IP, cannot be parsed, or the
+ * lookup reports NOT_FWDED or an error. Otherwise the first ETH_ALEN * 2
+ * bytes are zeroed and the packet is redirected: through
+ * bpf_redirect_neigh() on NO_NEIGH, or through bpf_redirect() with the MAC
+ * addresses from the lookup on SUCCESS. Any other lookup result, or a failed
+ * store/bounds check, yields TC_ACT_SHOT.
+ */
+static __always_inline int tc_redir(struct __sk_buff *skb)
+{
+ struct bpf_fib_lookup fib_params = { .ifindex = skb->ingress_ifindex };
+ __u8 zero[ETH_ALEN * 2];
+ int ret = -1;
+
+ /* ret stays -1 for protocols other than IPv4/IPv6 */
+ switch (skb->protocol) {
+ case __bpf_constant_htons(ETH_P_IP):
+ ret = fill_fib_params_v4(skb, &fib_params);
+ break;
+ case __bpf_constant_htons(ETH_P_IPV6):
+ ret = fill_fib_params_v6(skb, &fib_params);
+ break;
+ }
+
+ if (ret)
+ return TC_ACT_OK;
+
+ ret = bpf_fib_lookup(skb, &fib_params, sizeof(fib_params), 0);
+ if (ret == BPF_FIB_LKUP_RET_NOT_FWDED || ret < 0)
+ return TC_ACT_OK;
+
+ __builtin_memset(&zero, 0, sizeof(zero));
+ if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
+ return TC_ACT_SHOT;
+
+ if (ret == BPF_FIB_LKUP_RET_NO_NEIGH) {
+ struct bpf_redir_neigh nh_params = {};
+
+ /* pass the next hop taken from the lookup's ipv6_dst field */
+ nh_params.nh_family = fib_params.family;
+ __builtin_memcpy(&nh_params.ipv6_nh, &fib_params.ipv6_dst,
+ sizeof(nh_params.ipv6_nh));
+
+ return bpf_redirect_neigh(fib_params.ifindex, &nh_params,
+ sizeof(nh_params), 0);
+
+ } else if (ret == BPF_FIB_LKUP_RET_SUCCESS) {
+ /* re-read the packet pointers after the store above */
+ void *data_end = ctx_ptr(skb->data_end);
+ struct ethhdr *eth = ctx_ptr(skb->data);
+
+ if (eth + 1 > data_end)
+ return TC_ACT_SHOT;
+
+ __builtin_memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
+ __builtin_memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
+
+ return bpf_redirect(fib_params.ifindex, 0);
+ }
+
+ return TC_ACT_SHOT;
+}
+
+/* these are identical, but keep them separate for compatibility with the
+ * section names expected by test_tc_redirect.sh
+ */
+/* dst_ingress entry point: delegates to tc_redir() */
+SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
+{
+ return tc_redir(skb);
+}
+
+/* src_ingress entry point: delegates to tc_redir() */
+SEC("src_ingress") int tc_src(struct __sk_buff *skb)
+{
+ return tc_redir(skb);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_peer.c b/tools/testing/selftests/bpf/progs/test_tc_peer.c
new file mode 100644
index 000000000..fc84a7685
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tc_peer.c
@@ -0,0 +1,45 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <linux/bpf.h>
+#include <linux/stddef.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+
+enum {
+ dev_src,
+ dev_dst,
+};
+
+struct bpf_map_def SEC("maps") ifindex_map = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(int),
+ .value_size = sizeof(int),
+ .max_entries = 2,
+};
+
+/* Look up slot `which` of ifindex_map; returns the stored ifindex, or 0 when
+ * the lookup yields no entry.
+ */
+static __always_inline int get_dev_ifindex(int which)
+{
+	int *slot = bpf_map_lookup_elem(&ifindex_map, &which);
+
+	if (!slot)
+		return 0;
+
+	return *slot;
+}
+
+/* Program in section chk_egress: unconditionally returns TC_ACT_SHOT */
+SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
+{
+ return TC_ACT_SHOT;
+}
+
+/* Program in section dst_ingress: hands the packet to bpf_redirect_peer()
+ * with the ifindex stored under dev_src.
+ */
+SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
+{
+	int ifindex = get_dev_ifindex(dev_src);
+
+	return bpf_redirect_peer(ifindex, 0);
+}
+
+/* Program in section src_ingress: hands the packet to bpf_redirect_peer()
+ * with the ifindex stored under dev_dst.
+ */
+SEC("src_ingress") int tc_src(struct __sk_buff *skb)
+{
+	int ifindex = get_dev_ifindex(dev_dst);
+
+	return bpf_redirect_peer(ifindex, 0);
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tc_tunnel.c b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
new file mode 100644
index 000000000..37bce7a7c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tc_tunnel.c
@@ -0,0 +1,536 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* In-place tunneling */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <linux/stddef.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/mpls.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/pkt_cls.h>
+#include <linux/types.h>
+
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_helpers.h>
+
+static const int cfg_port = 8000;
+
+static const int cfg_udp_src = 20000;
+
+#define UDP_PORT 5555
+#define MPLS_OVER_UDP_PORT 6635
+#define ETH_OVER_UDP_PORT 7777
+
+/* MPLS label 1000 with S bit (last label) set and ttl of 255. */
+static const __u32 mpls_label = __bpf_constant_htonl(1000 << 12 |
+ MPLS_LS_S_MASK | 0xff);
+
+struct gre_hdr {
+ __be16 flags;
+ __be16 protocol;
+} __attribute__((packed));
+
+union l4hdr {
+ struct udphdr udp;
+ struct gre_hdr gre;
+};
+
+struct v4hdr {
+ struct iphdr ip;
+ union l4hdr l4hdr;
+ __u8 pad[16]; /* enough space for L2 header */
+} __attribute__((packed));
+
+struct v6hdr {
+ struct ipv6hdr ip;
+ union l4hdr l4hdr;
+ __u8 pad[16]; /* enough space for L2 header */
+} __attribute__((packed));
+
+/* Recompute the IPv4 header checksum in place.
+ *
+ * Sums the sizeof(*iph) / 2 16-bit words of the header with the check field
+ * zeroed, folds the carries back into the low 16 bits and stores the
+ * complement in iph->check. Only the fixed-size struct iphdr is summed, so
+ * IP options, if any, are not covered.
+ */
+static __always_inline void set_ipv4_csum(struct iphdr *iph)
+{
+	__u16 *iph16 = (__u16 *)iph;
+	__u32 csum;
+	int i;
+
+	iph->check = 0;
+
+#pragma clang loop unroll(full)
+	for (i = 0, csum = 0; i < sizeof(*iph) >> 1; i++)
+		csum += *iph16++;
+
+	/* fold twice: the first fold can itself carry out of 16 bits, and
+	 * that carry would be lost when the result is truncated into check
+	 */
+	csum = (csum & 0xffff) + (csum >> 16);
+	csum = (csum & 0xffff) + (csum >> 16);
+
+	iph->check = ~csum;
+}
+
+/* Encapsulate a TCP packet addressed to cfg_port in an outer IPv4 header.
+ *
+ * encap_proto selects the outer L4 layer (IPPROTO_GRE, IPPROTO_UDP, or none
+ * for IPPROTO_IPIP / IPPROTO_IPV6); l2_proto optionally adds an MPLS label
+ * (ETH_P_MPLS_UC) or a copy of the Ethernet header (ETH_P_TEB) after it.
+ *
+ * Returns TC_ACT_OK when the packet is left untouched or was encapsulated,
+ * TC_ACT_SHOT when a helper fails once encapsulation has started.
+ */
+static __always_inline int encap_ipv4(struct __sk_buff *skb, __u8 encap_proto,
+ __u16 l2_proto)
+{
+ __u16 udp_dst = UDP_PORT;
+ struct iphdr iph_inner;
+ struct v4hdr h_outer;
+ struct tcphdr tcph;
+ int olen, l2_len;
+ int tcp_off;
+ __u64 flags;
+
+ /* Most tests encapsulate a packet into a tunnel with the same
+ * network protocol, and derive the outer header fields from
+ * the inner header.
+ *
+ * The 6in4 case tests different inner and outer protocols. As
+ * the inner is ipv6, but the outer expects an ipv4 header as
+ * input, manually build a struct iphdr based on the ipv6hdr.
+ */
+ if (encap_proto == IPPROTO_IPV6) {
+ const __u32 saddr = (192 << 24) | (168 << 16) | (1 << 8) | 1;
+ const __u32 daddr = (192 << 24) | (168 << 16) | (1 << 8) | 2;
+ struct ipv6hdr iph6_inner;
+
+ /* Read the IPv6 header */
+ if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph6_inner,
+ sizeof(iph6_inner)) < 0)
+ return TC_ACT_OK;
+
+ /* Derive the IPv4 header fields from the IPv6 header */
+ memset(&iph_inner, 0, sizeof(iph_inner));
+ iph_inner.version = 4;
+ iph_inner.ihl = 5;
+ iph_inner.tot_len = bpf_htons(sizeof(iph6_inner) +
+ bpf_ntohs(iph6_inner.payload_len));
+ iph_inner.ttl = iph6_inner.hop_limit - 1;
+ iph_inner.protocol = iph6_inner.nexthdr;
+ iph_inner.saddr = __bpf_constant_htonl(saddr);
+ iph_inner.daddr = __bpf_constant_htonl(daddr);
+
+ tcp_off = sizeof(iph6_inner);
+ } else {
+ if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
+ sizeof(iph_inner)) < 0)
+ return TC_ACT_OK;
+
+ tcp_off = sizeof(iph_inner);
+ }
+
+ /* filter only packets we want */
+ if (iph_inner.ihl != 5 || iph_inner.protocol != IPPROTO_TCP)
+ return TC_ACT_OK;
+
+ if (bpf_skb_load_bytes(skb, ETH_HLEN + tcp_off,
+ &tcph, sizeof(tcph)) < 0)
+ return TC_ACT_OK;
+
+ if (tcph.dest != __bpf_constant_htons(cfg_port))
+ return TC_ACT_OK;
+
+ /* olen accumulates the size of everything inserted before the inner
+ * network header
+ */
+ olen = sizeof(h_outer.ip);
+ l2_len = 0;
+
+ flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV4;
+
+ /* the L2 choice fixes both the extra length and the UDP port used */
+ switch (l2_proto) {
+ case ETH_P_MPLS_UC:
+ l2_len = sizeof(mpls_label);
+ udp_dst = MPLS_OVER_UDP_PORT;
+ break;
+ case ETH_P_TEB:
+ l2_len = ETH_HLEN;
+ udp_dst = ETH_OVER_UDP_PORT;
+ break;
+ }
+ flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
+
+ switch (encap_proto) {
+ case IPPROTO_GRE:
+ flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
+ olen += sizeof(h_outer.l4hdr.gre);
+ h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto);
+ h_outer.l4hdr.gre.flags = 0;
+ break;
+ case IPPROTO_UDP:
+ flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP;
+ olen += sizeof(h_outer.l4hdr.udp);
+ h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
+ h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
+ h_outer.l4hdr.udp.check = 0;
+ /* UDP length: inner packet + UDP header + L2 header */
+ h_outer.l4hdr.udp.len = bpf_htons(bpf_ntohs(iph_inner.tot_len) +
+ sizeof(h_outer.l4hdr.udp) +
+ l2_len);
+ break;
+ case IPPROTO_IPIP:
+ case IPPROTO_IPV6:
+ break;
+ default:
+ return TC_ACT_OK;
+ }
+
+ /* add L2 encap (if specified) */
+ switch (l2_proto) {
+ case ETH_P_MPLS_UC:
+ *((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+ break;
+ case ETH_P_TEB:
+ if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
+ ETH_HLEN))
+ return TC_ACT_SHOT;
+ break;
+ }
+ olen += l2_len;
+
+ /* add room between mac and network header */
+ if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
+ return TC_ACT_SHOT;
+
+ /* prepare new outer network header */
+ h_outer.ip = iph_inner;
+ h_outer.ip.tot_len = bpf_htons(olen +
+ bpf_ntohs(h_outer.ip.tot_len));
+ h_outer.ip.protocol = encap_proto;
+
+ set_ipv4_csum((void *)&h_outer.ip);
+
+ /* store new outer network header */
+ if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
+ BPF_F_INVALIDATE_HASH) < 0)
+ return TC_ACT_SHOT;
+
+ /* if changing outer proto type, update eth->h_proto */
+ if (encap_proto == IPPROTO_IPV6) {
+ struct ethhdr eth;
+
+ if (bpf_skb_load_bytes(skb, 0, &eth, sizeof(eth)) < 0)
+ return TC_ACT_SHOT;
+ eth.h_proto = bpf_htons(ETH_P_IP);
+ if (bpf_skb_store_bytes(skb, 0, &eth, sizeof(eth), 0) < 0)
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/* Encapsulate a packet whose TCP destination port is cfg_port in an outer
+ * IPv6 header.
+ *
+ * encap_proto selects the outer L4 layer (IPPROTO_GRE, IPPROTO_UDP, or none
+ * for IPPROTO_IPV6); l2_proto optionally adds an MPLS label (ETH_P_MPLS_UC)
+ * or a copy of the Ethernet header (ETH_P_TEB) after it.
+ *
+ * Returns TC_ACT_OK when the packet is left untouched or was encapsulated,
+ * TC_ACT_SHOT when a helper fails once encapsulation has started.
+ */
+static __always_inline int encap_ipv6(struct __sk_buff *skb, __u8 encap_proto,
+				      __u16 l2_proto)
+{
+	__u16 udp_dst = UDP_PORT;
+	struct ipv6hdr iph_inner;
+	struct v6hdr h_outer;
+	struct tcphdr tcph;
+	int olen, l2_len;
+	__u16 tot_len;
+	__u64 flags;
+
+	if (bpf_skb_load_bytes(skb, ETH_HLEN, &iph_inner,
+			       sizeof(iph_inner)) < 0)
+		return TC_ACT_OK;
+
+	/* filter only packets we want */
+	if (bpf_skb_load_bytes(skb, ETH_HLEN + sizeof(iph_inner),
+			       &tcph, sizeof(tcph)) < 0)
+		return TC_ACT_OK;
+
+	if (tcph.dest != __bpf_constant_htons(cfg_port))
+		return TC_ACT_OK;
+
+	/* olen accumulates the size of everything inserted before the inner
+	 * network header
+	 */
+	olen = sizeof(h_outer.ip);
+	l2_len = 0;
+
+	flags = BPF_F_ADJ_ROOM_FIXED_GSO | BPF_F_ADJ_ROOM_ENCAP_L3_IPV6;
+
+	/* the L2 choice fixes both the extra length and the UDP port used */
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		l2_len = sizeof(mpls_label);
+		udp_dst = MPLS_OVER_UDP_PORT;
+		break;
+	case ETH_P_TEB:
+		l2_len = ETH_HLEN;
+		udp_dst = ETH_OVER_UDP_PORT;
+		break;
+	}
+	flags |= BPF_F_ADJ_ROOM_ENCAP_L2(l2_len);
+
+	switch (encap_proto) {
+	case IPPROTO_GRE:
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_GRE;
+		olen += sizeof(h_outer.l4hdr.gre);
+		h_outer.l4hdr.gre.protocol = bpf_htons(l2_proto);
+		h_outer.l4hdr.gre.flags = 0;
+		break;
+	case IPPROTO_UDP:
+		flags |= BPF_F_ADJ_ROOM_ENCAP_L4_UDP;
+		olen += sizeof(h_outer.l4hdr.udp);
+		h_outer.l4hdr.udp.source = __bpf_constant_htons(cfg_udp_src);
+		h_outer.l4hdr.udp.dest = bpf_htons(udp_dst);
+		/* UDP length covers the UDP header and its whole payload:
+		 * the optional L2 header plus the inner IPv6 packet. The L2
+		 * header must be counted, as encap_ipv4() does.
+		 */
+		tot_len = bpf_ntohs(iph_inner.payload_len) + sizeof(iph_inner) +
+			  sizeof(h_outer.l4hdr.udp) + l2_len;
+		h_outer.l4hdr.udp.check = 0;
+		h_outer.l4hdr.udp.len = bpf_htons(tot_len);
+		break;
+	case IPPROTO_IPV6:
+		break;
+	default:
+		return TC_ACT_OK;
+	}
+
+	/* add L2 encap (if specified) */
+	switch (l2_proto) {
+	case ETH_P_MPLS_UC:
+		*((__u32 *)((__u8 *)&h_outer + olen)) = mpls_label;
+		break;
+	case ETH_P_TEB:
+		if (bpf_skb_load_bytes(skb, 0, (__u8 *)&h_outer + olen,
+				       ETH_HLEN))
+			return TC_ACT_SHOT;
+		break;
+	}
+	olen += l2_len;
+
+	/* add room between mac and network header */
+	if (bpf_skb_adjust_room(skb, olen, BPF_ADJ_ROOM_MAC, flags))
+		return TC_ACT_SHOT;
+
+	/* prepare new outer network header */
+	h_outer.ip = iph_inner;
+	h_outer.ip.payload_len = bpf_htons(olen +
+					   bpf_ntohs(h_outer.ip.payload_len));
+
+	h_outer.ip.nexthdr = encap_proto;
+
+	/* store new outer network header */
+	if (bpf_skb_store_bytes(skb, ETH_HLEN, &h_outer, olen,
+				BPF_F_INVALIDATE_HASH) < 0)
+		return TC_ACT_SHOT;
+
+	return TC_ACT_OK;
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_IPIP and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ipip_none")
+int __encap_ipip_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_IPIP, ETH_P_IP);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_GRE and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_gre_none")
+int __encap_gre_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_GRE, ETH_P_IP);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_GRE and an MPLS label.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_gre_mpls")
+int __encap_gre_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_GRE, ETH_P_MPLS_UC);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_GRE and an Ethernet (TEB) header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_gre_eth")
+int __encap_gre_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_GRE, ETH_P_TEB);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_UDP and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_udp_none")
+int __encap_udp_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_UDP, ETH_P_IP);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_UDP and an MPLS label.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_udp_mpls")
+int __encap_udp_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_UDP, ETH_P_MPLS_UC);
+}
+
+/* IPv4 packets: encap_ipv4() with IPPROTO_UDP and an Ethernet (TEB) header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_udp_eth")
+int __encap_udp_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IP))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_UDP, ETH_P_TEB);
+}
+
+/* IPv6 packets: encap_ipv4() with IPPROTO_IPV6 (IPv6 inside an IPv4 outer
+ * header) and no extra L2 header. Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_sit_none")
+int __encap_sit_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv4(skb, IPPROTO_IPV6, ETH_P_IP);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_IPV6 and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6tnl_none")
+int __encap_ip6tnl_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_IPV6, ETH_P_IPV6);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_GRE and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6gre_none")
+int __encap_ip6gre_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_GRE, ETH_P_IPV6);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_GRE and an MPLS label.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6gre_mpls")
+int __encap_ip6gre_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_GRE, ETH_P_MPLS_UC);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_GRE and an Ethernet (TEB) header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6gre_eth")
+int __encap_ip6gre_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_GRE, ETH_P_TEB);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_UDP and no extra L2 header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6udp_none")
+int __encap_ip6udp_none(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_UDP, ETH_P_IPV6);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_UDP and an MPLS label.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6udp_mpls")
+int __encap_ip6udp_mpls(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_UDP, ETH_P_MPLS_UC);
+}
+
+/* IPv6 packets: encap_ipv6() with IPPROTO_UDP and an Ethernet (TEB) header.
+ * Other protocols pass with TC_ACT_OK.
+ */
+SEC("encap_ip6udp_eth")
+int __encap_ip6udp_eth(struct __sk_buff *skb)
+{
+	if (skb->protocol != __bpf_constant_htons(ETH_P_IPV6))
+		return TC_ACT_OK;
+
+	return encap_ipv6(skb, IPPROTO_UDP, ETH_P_TEB);
+}
+
+/* Remove the outer encapsulation headers from a packet.
+ *
+ * @off:   offset of the outer network header within the packet
+ * @len:   length of the outer network header
+ * @proto: protocol number taken from the outer network header
+ *
+ * The number of bytes to strip is @len plus, for GRE and UDP, the L4 header
+ * and any MPLS label or Ethernet header it announces (via the GRE protocol
+ * field or the UDP destination port).
+ *
+ * Returns TC_ACT_OK when the packet was decapsulated, when @proto is not a
+ * handled encapsulation, or when the L4 header cannot be read; TC_ACT_SHOT
+ * when bpf_skb_adjust_room() fails.
+ */
+static int decap_internal(struct __sk_buff *skb, int off, int len, char proto)
+{
+	struct gre_hdr greh;
+	struct udphdr udph;
+	int olen = len;
+
+	switch (proto) {
+	case IPPROTO_IPIP:
+	case IPPROTO_IPV6:
+		/* no L4 or L2 header to account for */
+		break;
+	case IPPROTO_GRE:
+		olen += sizeof(struct gre_hdr);
+		if (bpf_skb_load_bytes(skb, off + len, &greh, sizeof(greh)) < 0)
+			return TC_ACT_OK;
+		switch (bpf_ntohs(greh.protocol)) {
+		case ETH_P_MPLS_UC:
+			olen += sizeof(mpls_label);
+			break;
+		case ETH_P_TEB:
+			olen += ETH_HLEN;
+			break;
+		}
+		break;
+	case IPPROTO_UDP:
+		olen += sizeof(struct udphdr);
+		if (bpf_skb_load_bytes(skb, off + len, &udph, sizeof(udph)) < 0)
+			return TC_ACT_OK;
+		switch (bpf_ntohs(udph.dest)) {
+		case MPLS_OVER_UDP_PORT:
+			olen += sizeof(mpls_label);
+			break;
+		case ETH_OVER_UDP_PORT:
+			olen += ETH_HLEN;
+			break;
+		}
+		break;
+	default:
+		return TC_ACT_OK;
+	}
+
+	/* a negative length shrinks the packet by olen bytes */
+	if (bpf_skb_adjust_room(skb, -olen, BPF_ADJ_ROOM_MAC,
+				BPF_F_ADJ_ROOM_FIXED_GSO))
+		return TC_ACT_SHOT;
+
+	return TC_ACT_OK;
+}
+
+/* Decapsulate a packet with an outer IPv4 header. Packets whose header cannot
+ * be read or that carry IP options (ihl != 5) pass with TC_ACT_OK.
+ */
+static int decap_ipv4(struct __sk_buff *skb)
+{
+	struct iphdr outer;
+	int err;
+
+	err = bpf_skb_load_bytes(skb, ETH_HLEN, &outer, sizeof(outer));
+	if (err < 0)
+		return TC_ACT_OK;
+
+	if (outer.ihl != 5)
+		return TC_ACT_OK;
+
+	return decap_internal(skb, ETH_HLEN, sizeof(outer), outer.protocol);
+}
+
+/* Decapsulate a packet with an outer IPv6 header. Packets whose header cannot
+ * be read pass with TC_ACT_OK.
+ */
+static int decap_ipv6(struct __sk_buff *skb)
+{
+	struct ipv6hdr outer;
+	int err;
+
+	err = bpf_skb_load_bytes(skb, ETH_HLEN, &outer, sizeof(outer));
+	if (err < 0)
+		return TC_ACT_OK;
+
+	return decap_internal(skb, ETH_HLEN, sizeof(outer), outer.nexthdr);
+}
+
+/* Program in section decap: dispatches on skb->protocol to the IPv4 or IPv6
+ * decapsulation routine; other protocols pass with TC_ACT_OK.
+ */
+SEC("decap")
+int decap_f(struct __sk_buff *skb)
+{
+	__u32 proto = skb->protocol;
+
+	if (proto == __bpf_constant_htons(ETH_P_IP))
+		return decap_ipv4(skb);
+	if (proto == __bpf_constant_htons(ETH_P_IPV6))
+		return decap_ipv6(skb);
+
+	/* does not match, ignore */
+	return TC_ACT_OK;
+}
+
+char __license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
new file mode 100644
index 000000000..47cbe2eea
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcp_check_syncookie_kern.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+
+#include <string.h>
+
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <sys/socket.h>
+#include <linux/tcp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/*
+ * Result slots written by check_syncookie():
+ *   key 0 - ack_seq - 1 of a packet for which bpf_tcp_check_syncookie()
+ *           returned 0
+ *   key 1 - low 32 bits of the bpf_tcp_gen_syncookie() result (the cookie)
+ *   key 2 - high 32 bits of the bpf_tcp_gen_syncookie() result (the MSS)
+ */
+struct bpf_map_def SEC("maps") results = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u32),
+ .max_entries = 3,
+};
+
+/*
+ * Generate a SYN cookie for a pure SYN segment.
+ *
+ * Returns the value of bpf_tcp_gen_syncookie() when @tcph is a SYN without
+ * ACK whose header is exactly 24 bytes long and lies fully inside the
+ * packet; returns 0 in every other case.
+ */
+static __always_inline __s64 gen_syncookie(void *data_end, struct bpf_sock *sk,
+					   void *iph, __u32 ip_size,
+					   struct tcphdr *tcph)
+{
+	__u32 hdr_bytes = tcph->doff * 4;
+
+	/* nothing to generate unless this is a SYN without ACK */
+	if (!tcph->syn || tcph->ack)
+		return 0;
+
+	// packet should only have an MSS option
+	if (hdr_bytes != 24)
+		return 0;
+
+	/* the whole TCP header, options included, must be in the packet */
+	if ((void *)tcph + hdr_bytes > data_end)
+		return 0;
+
+	return bpf_tcp_gen_syncookie(sk, iph, ip_size, tcph, hdr_bytes);
+}
+
+/*
+ * Shared body of the clsact and XDP programs.
+ *
+ * Parses Ethernet + IPv4/IPv6 + TCP out of [data, data_end), looks up the
+ * socket matching the packet's 4-tuple in the current netns and, when that
+ * socket is a listener, stores into the "results" map:
+ *   key 1 - cookie from bpf_tcp_gen_syncookie() (low 32 bits of its result)
+ *   key 2 - MSS from bpf_tcp_gen_syncookie() (high 32 bits of its result)
+ *   key 0 - ack_seq - 1, when bpf_tcp_check_syncookie() returned 0
+ *
+ * @ctx:      program context, passed on to bpf_skc_lookup_tcp()
+ * @data:     first byte of packet data
+ * @data_end: one past the last byte of packet data
+ *
+ * Packets that are too short, carry IPv4 options, or are not TCP are
+ * ignored. The socket reference taken by the lookup is always released.
+ */
+static __always_inline void check_syncookie(void *ctx, void *data,
+					    void *data_end)
+{
+	struct bpf_sock_tuple tup;
+	struct bpf_sock *sk;
+	struct ethhdr *ethh;
+	struct iphdr *ipv4h;
+	struct ipv6hdr *ipv6h;
+	struct tcphdr *tcph;
+	int ret;
+	__u32 key_mss = 2;
+	__u32 key_gen = 1;
+	__u32 key = 0;
+	__s64 seq_mss;
+
+	ethh = data;
+	if (ethh + 1 > data_end)
+		return;
+
+	switch (bpf_ntohs(ethh->h_proto)) {
+	case ETH_P_IP:
+		ipv4h = data + sizeof(struct ethhdr);
+		if (ipv4h + 1 > data_end)
+			return;
+
+		/* the TCP header is assumed to follow a 20 byte IP header */
+		if (ipv4h->ihl != 5)
+			return;
+
+		/* same filter as the IPv6 branch: anything but TCP would be
+		 * misread as a TCP header below
+		 */
+		if (ipv4h->protocol != IPPROTO_TCP)
+			return;
+
+		tcph = data + sizeof(struct ethhdr) + sizeof(struct iphdr);
+		if (tcph + 1 > data_end)
+			return;
+
+		tup.ipv4.saddr = ipv4h->saddr;
+		tup.ipv4.daddr = ipv4h->daddr;
+		tup.ipv4.sport = tcph->source;
+		tup.ipv4.dport = tcph->dest;
+
+		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv4),
+					BPF_F_CURRENT_NETNS, 0);
+		if (!sk)
+			return;
+
+		/* from here on sk holds a reference: leave via "release" */
+		if (sk->state != BPF_TCP_LISTEN)
+			goto release;
+
+		seq_mss = gen_syncookie(data_end, sk, ipv4h, sizeof(*ipv4h),
+					tcph);
+
+		ret = bpf_tcp_check_syncookie(sk, ipv4h, sizeof(*ipv4h),
+					      tcph, sizeof(*tcph));
+		break;
+
+	case ETH_P_IPV6:
+		ipv6h = data + sizeof(struct ethhdr);
+		if (ipv6h + 1 > data_end)
+			return;
+
+		if (ipv6h->nexthdr != IPPROTO_TCP)
+			return;
+
+		tcph = data + sizeof(struct ethhdr) + sizeof(struct ipv6hdr);
+		if (tcph + 1 > data_end)
+			return;
+
+		memcpy(tup.ipv6.saddr, &ipv6h->saddr, sizeof(tup.ipv6.saddr));
+		memcpy(tup.ipv6.daddr, &ipv6h->daddr, sizeof(tup.ipv6.daddr));
+		tup.ipv6.sport = tcph->source;
+		tup.ipv6.dport = tcph->dest;
+
+		sk = bpf_skc_lookup_tcp(ctx, &tup, sizeof(tup.ipv6),
+					BPF_F_CURRENT_NETNS, 0);
+		if (!sk)
+			return;
+
+		/* from here on sk holds a reference: leave via "release" */
+		if (sk->state != BPF_TCP_LISTEN)
+			goto release;
+
+		seq_mss = gen_syncookie(data_end, sk, ipv6h, sizeof(*ipv6h),
+					tcph);
+
+		ret = bpf_tcp_check_syncookie(sk, ipv6h, sizeof(*ipv6h),
+					      tcph, sizeof(*tcph));
+		break;
+
+	default:
+		return;
+	}
+
+	if (seq_mss > 0) {
+		/* low half is the cookie, high half the MSS */
+		__u32 cookie = (__u32)seq_mss;
+		__u32 mss = seq_mss >> 32;
+
+		bpf_map_update_elem(&results, &key_gen, &cookie, 0);
+		bpf_map_update_elem(&results, &key_mss, &mss, 0);
+	}
+
+	if (ret == 0) {
+		__u32 cookie = bpf_ntohl(tcph->ack_seq) - 1;
+
+		bpf_map_update_elem(&results, &key, &cookie, 0);
+	}
+
+release:
+	bpf_sk_release(sk);
+}
+
+/* tc (clsact) flavour: run the syncookie check and always pass the packet. */
+SEC("clsact/check_syncookie")
+int check_syncookie_clsact(struct __sk_buff *skb)
+{
+	void *pkt = (void *)(long)skb->data;
+	void *pkt_end = (void *)(long)skb->data_end;
+
+	check_syncookie(skb, pkt, pkt_end);
+
+	return TC_ACT_OK;
+}
+
+/* XDP flavour: run the syncookie check and always pass the packet. */
+SEC("xdp/check_syncookie")
+int check_syncookie_xdp(struct xdp_md *ctx)
+{
+	void *pkt = (void *)(long)ctx->data;
+	void *pkt_end = (void *)(long)ctx->data_end;
+
+	check_syncookie(ctx, pkt, pkt_end);
+
+	return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_estats.c b/tools/testing/selftests/bpf/progs/test_tcp_estats.c
new file mode 100644
index 000000000..adc83a54c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcp_estats.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+/* This program shows clang/llvm is able to generate code pattern
+ * like:
+ * _tcp_send_active_reset:
+ * 0: bf 16 00 00 00 00 00 00 r6 = r1
+ * ......
+ * 335: b7 01 00 00 0f 00 00 00 r1 = 15
+ * 336: 05 00 48 00 00 00 00 00 goto 72
+ *
+ * LBB0_3:
+ * 337: b7 01 00 00 01 00 00 00 r1 = 1
+ * 338: 63 1a d0 ff 00 00 00 00 *(u32 *)(r10 - 48) = r1
+ * 408: b7 01 00 00 03 00 00 00 r1 = 3
+ *
+ * LBB0_4:
+ * 409: 71 a2 fe ff 00 00 00 00 r2 = *(u8 *)(r10 - 2)
+ * 410: bf a7 00 00 00 00 00 00 r7 = r10
+ * 411: 07 07 00 00 b8 ff ff ff r7 += -72
+ * 412: bf 73 00 00 00 00 00 00 r3 = r7
+ * 413: 0f 13 00 00 00 00 00 00 r3 += r1
+ * 414: 73 23 2d 00 00 00 00 00 *(u8 *)(r3 + 45) = r2
+ *
+ * From the above code snippet, the code generated by the compiler
+ * is reasonable. The "r1" is assigned to different values in basic
+ * blocks "_tcp_send_active_reset" and "LBB0_3", and used in "LBB0_4".
+ * The verifier should be able to handle such code patterns.
+ */
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/ipv6.h>
+#include <linux/version.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+
+#define _(P) ({typeof(P) val = 0; bpf_probe_read_kernel(&val, sizeof(val), &P); val;})
+#define TCP_ESTATS_MAGIC 0xBAADBEEF
+
+/* This test case needs "sock" and "pt_regs" data structure.
+ * Recursively, "sock" needs "sock_common" and "inet_sock".
+ * However, this is a unit test case only for
+ * verifier purpose without bpf program execution.
+ * We can safely mock much simpler data structures, basically
+ * only taking the necessary fields from kernel headers.
+ */
+typedef __u32 __bitwise __portpair;
+typedef __u64 __bitwise __addrpair;
+
+struct sock_common {
+ unsigned short skc_family;
+ union {
+ __addrpair skc_addrpair;
+ struct {
+ __be32 skc_daddr;
+ __be32 skc_rcv_saddr;
+ };
+ };
+ union {
+ __portpair skc_portpair;
+ struct {
+ __be16 skc_dport;
+ __u16 skc_num;
+ };
+ };
+ struct in6_addr skc_v6_daddr;
+ struct in6_addr skc_v6_rcv_saddr;
+};
+
+struct sock {
+ struct sock_common __sk_common;
+#define sk_family __sk_common.skc_family
+#define sk_v6_daddr __sk_common.skc_v6_daddr
+#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr
+};
+
+struct inet_sock {
+ struct sock sk;
+#define inet_daddr sk.__sk_common.skc_daddr
+#define inet_dport sk.__sk_common.skc_dport
+ __be32 inet_saddr;
+ __be16 inet_sport;
+};
+
+struct pt_regs {
+ long di;
+};
+
+/* View a mocked "struct sock" as the "struct inet_sock" that embeds it as
+ * its first member. This is a plain pointer cast; nothing is checked.
+ */
+static inline struct inet_sock *inet_sk(const struct sock *sk)
+{
+ return (struct inet_sock *)sk;
+}
+
+/* Define various data structures for state recording.
+ * Some fields are not used due to test simplification.
+ */
+enum tcp_estats_addrtype {
+ TCP_ESTATS_ADDRTYPE_IPV4 = 1,
+ TCP_ESTATS_ADDRTYPE_IPV6 = 2
+};
+
+enum tcp_estats_event_type {
+ TCP_ESTATS_ESTABLISH,
+ TCP_ESTATS_PERIODIC,
+ TCP_ESTATS_TIMEOUT,
+ TCP_ESTATS_RETRANSMIT_TIMEOUT,
+ TCP_ESTATS_RETRANSMIT_OTHER,
+ TCP_ESTATS_SYN_RETRANSMIT,
+ TCP_ESTATS_SYNACK_RETRANSMIT,
+ TCP_ESTATS_TERM,
+ TCP_ESTATS_TX_RESET,
+ TCP_ESTATS_RX_RESET,
+ TCP_ESTATS_WRITE_TIMEOUT,
+ TCP_ESTATS_CONN_TIMEOUT,
+ TCP_ESTATS_ACK_LATENCY,
+ TCP_ESTATS_NEVENTS,
+};
+
+struct tcp_estats_event {
+ int pid;
+ int cpu;
+ unsigned long ts;
+ unsigned int magic;
+ enum tcp_estats_event_type event_type;
+};
+
+/* The below data structure is packed in order for
+ * llvm compiler to generate expected code.
+ */
+struct tcp_estats_conn_id {
+ unsigned int localaddressType;
+ struct {
+ unsigned char data[16];
+ } localaddress;
+ struct {
+ unsigned char data[16];
+ } remaddress;
+ unsigned short localport;
+ unsigned short remport;
+} __attribute__((__packed__));
+
+struct tcp_estats_basic_event {
+ struct tcp_estats_event event;
+ struct tcp_estats_conn_id conn_id;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 1024);
+ __type(key, __u32);
+ __type(value, struct tcp_estats_basic_event);
+} ev_record_map SEC(".maps");
+
+struct dummy_tracepoint_args {
+ unsigned long long pad;
+ struct sock *sock;
+};
+
+/* Fill the generic event header: magic marker, current bpf_ktime_get_ns()
+ * timestamp and the event type. The pid and cpu fields are not written here.
+ */
+static __always_inline void tcp_estats_ev_init(struct tcp_estats_event *event,
+ enum tcp_estats_event_type type)
+{
+ event->magic = TCP_ESTATS_MAGIC;
+ event->ts = bpf_ktime_get_ns();
+ event->event_type = type;
+}
+
+/* Copy four bytes from @from to @to one byte at a time. Each source byte is
+ * fetched through the _() macro, i.e. via bpf_probe_read_kernel().
+ */
+static __always_inline void unaligned_u32_set(unsigned char *to, __u8 *from)
+{
+ to[0] = _(from[0]);
+ to[1] = _(from[1]);
+ to[2] = _(from[2]);
+ to[3] = _(from[3]);
+}
+
+/* Mark @conn_id as IPv4 and copy the 4-byte source and destination
+ * addresses into the first four bytes of localaddress / remaddress.
+ * The remaining 12 bytes of each address field are left untouched.
+ */
+static __always_inline void conn_id_ipv4_init(struct tcp_estats_conn_id *conn_id,
+ __be32 *saddr, __be32 *daddr)
+{
+ conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV4;
+
+ unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr);
+ unaligned_u32_set(conn_id->remaddress.data, (__u8 *)daddr);
+}
+
+/* Mark @conn_id as IPv6 and copy both 16-byte addresses, one 32-bit word
+ * at a time: @saddr and @daddr are each treated as arrays of four __be32.
+ */
+static __always_inline void conn_id_ipv6_init(struct tcp_estats_conn_id *conn_id,
+ __be32 *saddr, __be32 *daddr)
+{
+ conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV6;
+
+ /* local address, words 0..3 */
+ unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr);
+ unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32),
+ (__u8 *)(saddr + 1));
+ unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 2,
+ (__u8 *)(saddr + 2));
+ unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 3,
+ (__u8 *)(saddr + 3));
+
+ /* remote address, words 0..3 */
+ unaligned_u32_set(conn_id->remaddress.data,
+ (__u8 *)(daddr));
+ unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32),
+ (__u8 *)(daddr + 1));
+ unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 2,
+ (__u8 *)(daddr + 2));
+ unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 3,
+ (__u8 *)(daddr + 3));
+}
+
+/* Populate @conn_id from @sk: both ports are read through the _() macro,
+ * then the address pair is copied by the IPv6 helper when the socket family
+ * is AF_INET6 and by the IPv4 helper for every other family.
+ */
+static __always_inline void tcp_estats_conn_id_init(struct tcp_estats_conn_id *conn_id,
+ struct sock *sk)
+{
+ conn_id->localport = _(inet_sk(sk)->inet_sport);
+ conn_id->remport = _(inet_sk(sk)->inet_dport);
+
+ if (_(sk->sk_family) == AF_INET6)
+ conn_id_ipv6_init(conn_id,
+ sk->sk_v6_rcv_saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32);
+ else
+ conn_id_ipv4_init(conn_id,
+ &inet_sk(sk)->inet_saddr,
+ &inet_sk(sk)->inet_daddr);
+}
+
+/* Initialise both halves of a basic event: the generic header in @event
+ * and the connection identity in @conn_id (taken from @sk).
+ */
+static __always_inline void tcp_estats_init(struct sock *sk,
+ struct tcp_estats_event *event,
+ struct tcp_estats_conn_id *conn_id,
+ enum tcp_estats_event_type type)
+{
+ tcp_estats_ev_init(event, type);
+ tcp_estats_conn_id_init(conn_id, sk);
+}
+
+/* Build a zero-initialised basic event of the given @type for @sk and store
+ * it in ev_record_map under a pseudo-random 32-bit key.
+ */
+static __always_inline void send_basic_event(struct sock *sk,
+ enum tcp_estats_event_type type)
+{
+ struct tcp_estats_basic_event ev;
+ __u32 key = bpf_get_prandom_u32();
+
+ /* clear everything first; the init helpers fill only some fields */
+ memset(&ev, 0, sizeof(ev));
+ tcp_estats_init(sk, &ev.event, &ev.conn_id, type);
+ bpf_map_update_elem(&ev_record_map, &key, &ev, BPF_ANY);
+}
+
+/* Program entry point: record a TCP_ESTATS_TX_RESET event for the socket
+ * carried in the tracepoint arguments; does nothing when it is NULL.
+ * Always returns 0.
+ */
+SEC("dummy_tracepoint")
+int _dummy_tracepoint(struct dummy_tracepoint_args *arg)
+{
+ if (!arg->sock)
+ return 0;
+
+ send_basic_event(arg->sock, TCP_ESTATS_TX_RESET);
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
new file mode 100644
index 000000000..678bd0fad
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcp_hdr_options.c
@@ -0,0 +1,626 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include <stddef.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+#include <linux/bpf.h>
+#include <linux/types.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#define BPF_PROG_TEST_TCP_HDR_OPTIONS
+#include "test_tcp_hdr_options.h"
+
+#ifndef sizeof_field
+#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER))
+#endif
+
+__u8 test_kind = TCPOPT_EXP;
+__u16 test_magic = 0xeB9F;
+__u32 inherit_cb_flags = 0;
+
+struct bpf_test_option passive_synack_out = {};
+struct bpf_test_option passive_fin_out = {};
+
+struct bpf_test_option passive_estab_in = {};
+struct bpf_test_option passive_fin_in = {};
+
+struct bpf_test_option active_syn_out = {};
+struct bpf_test_option active_fin_out = {};
+
+struct bpf_test_option active_estab_in = {};
+struct bpf_test_option active_fin_in = {};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct hdr_stg);
+} hdr_stg_map SEC(".maps");
+
+/* True when args[0] of the callback is BPF_WRITE_HDR_TCP_SYNACK_COOKIE. */
+static bool skops_want_cookie(const struct bpf_sock_ops *skops)
+{
+	__u32 write_kind = skops->args[0];
+
+	return write_kind == BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+}
+
+/* True when args[0] of the callback is BPF_WRITE_HDR_TCP_CURRENT_MSS. */
+static bool skops_current_mss(const struct bpf_sock_ops *skops)
+{
+	__u32 write_kind = skops->args[0];
+
+	return write_kind == BPF_WRITE_HDR_TCP_CURRENT_MSS;
+}
+
+/*
+ * Number of header bytes the test option occupies for the given @flags.
+ *
+ * Returns 0 when no flag is set. Otherwise: one byte for the flags, one
+ * byte per set flag above OPTION_RESEND, plus the option framing, which is
+ * TCP_BPF_EXPOPT_BASE_LEN for an experimental option and 2 otherwise.
+ */
+static __u8 option_total_len(__u8 flags)
+{
+	__u8 bit, nbytes;
+
+	if (!flags)
+		return 0;
+
+	nbytes = 1; /* +1 for flags */
+
+	/* RESEND bit does not use a byte */
+	bit = OPTION_RESEND + 1;
+	while (bit < __NR_OPTION_FLAGS) {
+		if (TEST_OPTION_FLAGS(flags, bit))
+			nbytes++;
+		bit++;
+	}
+
+	if (test_kind != TCPOPT_EXP)
+		return nbytes + 2; /* +1 kind, +1 kind-len */
+
+	return nbytes + TCP_BPF_EXPOPT_BASE_LEN;
+}
+
+/*
+ * Serialise @test_opt into @data: the flags byte first, then one byte for
+ * each of max_delack_ms and rand whose flag is set, in that order.
+ */
+static void write_test_option(const struct bpf_test_option *test_opt,
+			      __u8 *data)
+{
+	__u8 pos = 0;
+
+	data[pos] = test_opt->flags;
+	pos++;
+
+	if (TEST_OPTION_FLAGS(test_opt->flags, OPTION_MAX_DELACK_MS)) {
+		data[pos] = test_opt->max_delack_ms;
+		pos++;
+	}
+
+	if (TEST_OPTION_FLAGS(test_opt->flags, OPTION_RAND))
+		data[pos] = test_opt->rand;
+}
+
+/*
+ * Write @test_opt into the outgoing TCP header with bpf_store_hdr_opt().
+ *
+ * The option is framed either as a regular option of kind test_kind or,
+ * when test_kind is TCPOPT_EXP, as an experimental option carrying
+ * test_magic. Returns CG_OK on success; a helper failure is reported
+ * through RET_CG_ERR().
+ */
+static int store_option(struct bpf_sock_ops *skops,
+			const struct bpf_test_option *test_opt)
+{
+	union {
+		struct tcp_exprm_opt exprm;
+		struct tcp_opt regular;
+	} write_opt;
+	int err;
+
+	if (test_kind != TCPOPT_EXP) {
+		/* regular option: kind, len, payload */
+		write_opt.regular.kind = test_kind;
+		write_opt.regular.len = option_total_len(test_opt->flags);
+		write_opt.regular.data32 = 0;
+		write_test_option(test_opt, write_opt.regular.data);
+		err = bpf_store_hdr_opt(skops, &write_opt.regular,
+					sizeof(write_opt.regular), 0);
+	} else {
+		/* experimental option: kind, len, magic, payload */
+		write_opt.exprm.kind = TCPOPT_EXP;
+		write_opt.exprm.len = option_total_len(test_opt->flags);
+		write_opt.exprm.magic = __bpf_htons(test_magic);
+		write_opt.exprm.data32 = 0;
+		write_test_option(test_opt, write_opt.exprm.data);
+		err = bpf_store_hdr_opt(skops, &write_opt.exprm,
+					sizeof(write_opt.exprm), 0);
+	}
+
+	if (err)
+		RET_CG_ERR(err);
+
+	return CG_OK;
+}
+
+/*
+ * Inverse of write_test_option(): decode the bytes at @start into @opt.
+ * The flags byte comes first and tells which optional bytes follow.
+ * Always returns 0.
+ */
+static int parse_test_option(struct bpf_test_option *opt, const __u8 *start)
+{
+	const __u8 *cur = start;
+
+	opt->flags = *cur;
+	cur++;
+
+	if (TEST_OPTION_FLAGS(opt->flags, OPTION_MAX_DELACK_MS)) {
+		opt->max_delack_ms = *cur;
+		cur++;
+	}
+
+	if (TEST_OPTION_FLAGS(opt->flags, OPTION_RAND))
+		opt->rand = *cur;
+
+	return 0;
+}
+
+/*
+ * Search for the test option with bpf_load_hdr_opt() and decode it into
+ * @test_opt.
+ *
+ * @from_syn selects BPF_LOAD_HDR_OPT_TCP_SYN as the load flag; otherwise no
+ * flag is passed. Returns the negative helper result when the search
+ * fails, else the result of parse_test_option().
+ */
+static int load_option(struct bpf_sock_ops *skops,
+		       struct bpf_test_option *test_opt, bool from_syn)
+{
+	union {
+		struct tcp_exprm_opt exprm;
+		struct tcp_opt regular;
+	} search_opt;
+	int load_flags = 0;
+	int ret;
+
+	if (from_syn)
+		load_flags = BPF_LOAD_HDR_OPT_TCP_SYN;
+
+	if (test_kind != TCPOPT_EXP) {
+		/* regular option: match on kind only */
+		search_opt.regular.kind = test_kind;
+		search_opt.regular.len = 0;
+		search_opt.regular.data32 = 0;
+		ret = bpf_load_hdr_opt(skops, &search_opt.regular,
+				       sizeof(search_opt.regular), load_flags);
+		if (ret < 0)
+			return ret;
+
+		return parse_test_option(test_opt, search_opt.regular.data);
+	}
+
+	/* experimental option: match on kind plus the 2 byte magic */
+	search_opt.exprm.kind = TCPOPT_EXP;
+	search_opt.exprm.len = 4;
+	search_opt.exprm.magic = __bpf_htons(test_magic);
+	search_opt.exprm.data32 = 0;
+	ret = bpf_load_hdr_opt(skops, &search_opt.exprm,
+			       sizeof(search_opt.exprm), load_flags);
+	if (ret < 0)
+		return ret;
+
+	return parse_test_option(test_opt, search_opt.exprm.data);
+}
+
+/*
+ * Reserve header space for the SYNACK test option.
+ *
+ * Nothing is reserved when no SYNACK option is configured or when the SYN
+ * did not carry the test option (-ENOMSG from load_option()).
+ */
+static int synack_opt_len(struct bpf_sock_ops *skops)
+{
+	struct bpf_test_option syn_opt = {};
+	__u8 need;
+	int err;
+
+	if (!passive_synack_out.flags)
+		return CG_OK;
+
+	err = load_option(skops, &syn_opt, true);
+
+	/* bpf_test_option is not found */
+	if (err == -ENOMSG)
+		return CG_OK;
+
+	if (err)
+		RET_CG_ERR(err);
+
+	need = option_total_len(passive_synack_out.flags);
+	if (!need)
+		return CG_OK;
+
+	err = bpf_reserve_hdr_opt(skops, need, 0);
+	if (err)
+		RET_CG_ERR(err);
+
+	return CG_OK;
+}
+
+/*
+ * Write the SYNACK test option. A private copy of passive_synack_out is
+ * sent, with OPTION_RESEND added when the SYNACK is a syncookie one.
+ */
+static int write_synack_opt(struct bpf_sock_ops *skops)
+{
+	struct bpf_test_option synack_opt;
+
+	if (!passive_synack_out.flags) {
+		/* We should not even be called since no header
+		 * space has been reserved.
+		 */
+		RET_CG_ERR(0);
+	}
+
+	synack_opt = passive_synack_out;
+	if (skops_want_cookie(skops)) {
+		SET_OPTION_FLAGS(synack_opt.flags, OPTION_RESEND);
+	}
+
+	return store_option(skops, &synack_opt);
+}
+
+/* Reserve header space for the SYN test option, if one is configured. */
+static int syn_opt_len(struct bpf_sock_ops *skops)
+{
+	__u8 need;
+	int err;
+
+	if (!active_syn_out.flags)
+		return CG_OK;
+
+	need = option_total_len(active_syn_out.flags);
+	if (!need)
+		return CG_OK;
+
+	err = bpf_reserve_hdr_opt(skops, need, 0);
+	if (err)
+		RET_CG_ERR(err);
+
+	return CG_OK;
+}
+
+static int write_syn_opt(struct bpf_sock_ops *skops)
+{
+ if (!active_syn_out.flags)
+ RET_CG_ERR(0);
+
+ return store_option(skops, &active_syn_out);
+}
+
+/*
+ * Reserve header space for the FIN test option. Which option applies
+ * (active_fin_out or passive_fin_out) is decided by the "active" bit kept
+ * in the socket's hdr_stg storage.
+ */
+static int fin_opt_len(struct bpf_sock_ops *skops)
+{
+	struct bpf_test_option *fin_opt;
+	struct hdr_stg *stg;
+	__u8 need;
+	int err;
+
+	if (!skops->sk)
+		RET_CG_ERR(0);
+
+	stg = bpf_sk_storage_get(&hdr_stg_map, skops->sk, NULL, 0);
+	if (!stg)
+		RET_CG_ERR(0);
+
+	fin_opt = stg->active ? &active_fin_out : &passive_fin_out;
+
+	need = option_total_len(fin_opt->flags);
+	if (!need)
+		return CG_OK;
+
+	err = bpf_reserve_hdr_opt(skops, need, 0);
+	if (err)
+		RET_CG_ERR(err);
+
+	return CG_OK;
+}
+
+/*
+ * Write the FIN test option chosen by the socket's "active" bit. Missing
+ * socket, missing storage or an empty option are all reported as errors.
+ */
+static int write_fin_opt(struct bpf_sock_ops *skops)
+{
+	struct bpf_test_option *fin_opt;
+	struct hdr_stg *stg;
+
+	if (!skops->sk)
+		RET_CG_ERR(0);
+
+	stg = bpf_sk_storage_get(&hdr_stg_map, skops->sk, NULL, 0);
+	if (!stg)
+		RET_CG_ERR(0);
+
+	fin_opt = stg->active ? &active_fin_out : &passive_fin_out;
+
+	if (!fin_opt->flags)
+		RET_CG_ERR(0);
+
+	return store_option(skops, fin_opt);
+}
+
+/*
+ * Tell whether the SYN option has to be resent in later packets.
+ *
+ * Returns 1 or 0 according to the resend_syn bit in the socket storage,
+ * or -1 when the socket or its storage is not available.
+ */
+static int resend_in_ack(struct bpf_sock_ops *skops)
+{
+	struct hdr_stg *stg;
+
+	if (!skops->sk)
+		return -1;
+
+	stg = bpf_sk_storage_get(&hdr_stg_map, skops->sk, NULL, 0);
+	if (!stg)
+		return -1;
+
+	return stg->resend_syn ? 1 : 0;
+}
+
+/*
+ * Header space for a packet without payload: the SYN option is reserved
+ * again while a resend is pending, otherwise nothing is needed.
+ */
+static int nodata_opt_len(struct bpf_sock_ops *skops)
+{
+	int pending = resend_in_ack(skops);
+
+	if (pending < 0)
+		RET_CG_ERR(0);
+
+	if (!pending)
+		return CG_OK;
+
+	return syn_opt_len(skops);
+}
+
+/*
+ * Write options for a packet without payload: the SYN option is written
+ * again while a resend is pending, otherwise nothing is written.
+ */
+static int write_nodata_opt(struct bpf_sock_ops *skops)
+{
+	int pending = resend_in_ack(skops);
+
+	if (pending < 0)
+		RET_CG_ERR(0);
+
+	if (!pending)
+		return CG_OK;
+
+	return write_syn_opt(skops);
+}
+
+/*
+ * Header space for a packet that carries payload. It behaves exactly like
+ * the nodata variant; the split mostly exists to show how a caller can
+ * branch on skops->skb_len.
+ */
+static int data_opt_len(struct bpf_sock_ops *skops)
+{
+	int verdict = nodata_opt_len(skops);
+
+	return verdict;
+}
+
+/* Write options for a packet that carries payload: same as without. */
+static int write_data_opt(struct bpf_sock_ops *skops)
+{
+	int verdict = write_nodata_opt(skops);
+
+	return verdict;
+}
+
+/*
+ * Reserve header space for the MSS-calculation callback: the length of a
+ * test option with every flag in OPTION_MASK set.
+ */
+static int current_mss_opt_len(struct bpf_sock_ops *skops)
+{
+	/* Reserve maximum that may be needed */
+	__u8 max_need = option_total_len(OPTION_MASK);
+	int err;
+
+	err = bpf_reserve_hdr_opt(skops, max_need, 0);
+	if (err)
+		RET_CG_ERR(err);
+
+	return CG_OK;
+}
+
+/*
+ * BPF_SOCK_OPS_HDR_OPT_LEN_CB handler: pick the reservation routine that
+ * matches the packet about to be built. The checks run in this order:
+ * SYNACK, SYN, FIN, MSS calculation, payload present, no payload.
+ */
+static int handle_hdr_opt_len(struct bpf_sock_ops *skops)
+{
+	__u8 flags = skops_tcp_flags(skops);
+	bool is_synack = (flags & TCPHDR_SYNACK) == TCPHDR_SYNACK;
+
+	if (is_synack)
+		return synack_opt_len(skops);
+
+	if (flags & TCPHDR_SYN)
+		return syn_opt_len(skops);
+
+	if (flags & TCPHDR_FIN)
+		return fin_opt_len(skops);
+
+	/* The kernel is calculating the MSS */
+	if (skops_current_mss(skops))
+		return current_mss_opt_len(skops);
+
+	return skops->skb_len ? data_opt_len(skops) : nodata_opt_len(skops);
+}
+
+/*
+ * BPF_SOCK_OPS_WRITE_HDR_OPT_CB handler: pick the writer that matches the
+ * outgoing packet. SYNACK, SYN and FIN are told apart by TCP flags; every
+ * other packet is classified as data or no-data by comparing skb_len with
+ * the TCP header length.
+ */
+static int handle_write_hdr_opt(struct bpf_sock_ops *skops)
+{
+	__u8 flags = skops_tcp_flags(skops);
+	bool is_synack = (flags & TCPHDR_SYNACK) == TCPHDR_SYNACK;
+	struct tcphdr *hdr;
+
+	if (is_synack)
+		return write_synack_opt(skops);
+
+	if (flags & TCPHDR_SYN)
+		return write_syn_opt(skops);
+
+	if (flags & TCPHDR_FIN)
+		return write_fin_opt(skops);
+
+	/* the basic TCP header must be readable before it is inspected */
+	hdr = skops->skb_data;
+	if (hdr + 1 > skops->skb_data_end)
+		RET_CG_ERR(0);
+
+	if (skops->skb_len > tcp_hdrlen(hdr))
+		return write_data_opt(skops);
+
+	return write_nodata_opt(skops);
+}
+
+/*
+ * Set TCP_BPF_DELACK_MAX on the socket. The value is given in
+ * milliseconds and handed to bpf_setsockopt() in microseconds.
+ */
+static int set_delack_max(struct bpf_sock_ops *skops, __u8 max_delack_ms)
+{
+	__u32 usecs = max_delack_ms;
+
+	usecs *= 1000;
+
+	return bpf_setsockopt(skops, SOL_TCP, TCP_BPF_DELACK_MAX,
+			      &usecs, sizeof(usecs));
+}
+
+/*
+ * Set TCP_BPF_RTO_MIN on the socket from the peer's maximum delayed-ACK
+ * time. The value is given in milliseconds and handed to bpf_setsockopt()
+ * in microseconds.
+ */
+static int set_rto_min(struct bpf_sock_ops *skops, __u8 peer_max_delack_ms)
+{
+	__u32 usecs = peer_max_delack_ms;
+
+	usecs *= 1000;
+
+	return bpf_setsockopt(skops, SOL_TCP, TCP_BPF_RTO_MIN, &usecs,
+			      sizeof(usecs));
+}
+
+/* BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB handler.
+ *
+ * Loads the test option from the received packet into active_estab_in
+ * (a missing option, -ENOMSG, is tolerated), creates the socket's hdr_stg
+ * entry with active = true and resend_syn taken from the OPTION_RESEND
+ * flag, adjusts the header callback flags and finally applies the
+ * delayed-ACK maximum and minimum RTO derived from the two options.
+ *
+ * Returns CG_OK, or leaves through RET_CG_ERR() on any failure.
+ */
+static int handle_active_estab(struct bpf_sock_ops *skops)
+{
+ struct hdr_stg init_stg = {
+ .active = true,
+ };
+ int err;
+
+ err = load_option(skops, &active_estab_in, false);
+ if (err && err != -ENOMSG)
+ RET_CG_ERR(err);
+
+ init_stg.resend_syn = TEST_OPTION_FLAGS(active_estab_in.flags,
+ OPTION_RESEND);
+ if (!skops->sk || !bpf_sk_storage_get(&hdr_stg_map, skops->sk,
+ &init_stg,
+ BPF_SK_STORAGE_GET_F_CREATE))
+ RET_CG_ERR(0);
+
+ if (init_stg.resend_syn)
+ /* Don't clear the write_hdr cb now because
+ * the ACK may get lost and retransmit may
+ * be needed.
+ *
+ * PARSE_ALL_HDR cb flag is set to learn if this
+ * resend_syn option has been received by the peer.
+ *
+ * The header option will be resent until a valid
+ * packet is received at handle_parse_hdr()
+ * and all hdr cb flags will be cleared in
+ * handle_parse_hdr().
+ */
+ set_parse_all_hdr_cb_flags(skops);
+ else if (!active_fin_out.flags)
+ /* No options will be written from now */
+ clear_hdr_cb_flags(skops);
+
+ /* our own advertised max delayed-ACK bounds our delayed ACKs */
+ if (active_syn_out.max_delack_ms) {
+ err = set_delack_max(skops, active_syn_out.max_delack_ms);
+ if (err)
+ RET_CG_ERR(err);
+ }
+
+ /* the peer's advertised max delayed-ACK becomes our minimum RTO */
+ if (active_estab_in.max_delack_ms) {
+ err = set_rto_min(skops, active_estab_in.max_delack_ms);
+ if (err)
+ RET_CG_ERR(err);
+ }
+
+ return CG_OK;
+}
+
+/* BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB handler.
+ *
+ * Records the callback flags seen on this socket in inherit_cb_flags,
+ * loads the peer's test option into passive_estab_in (from the saved SYN,
+ * or from the current packet when the saved SYN is unavailable), adjusts
+ * the header callback flags, creates the socket's hdr_stg entry and applies
+ * the delayed-ACK maximum and minimum RTO derived from the options.
+ *
+ * Returns CG_OK, or leaves through RET_CG_ERR() on any failure.
+ */
+static int handle_passive_estab(struct bpf_sock_ops *skops)
+{
+ struct hdr_stg init_stg = {};
+ struct tcphdr *th;
+ int err;
+
+ inherit_cb_flags = skops->bpf_sock_ops_cb_flags;
+
+ err = load_option(skops, &passive_estab_in, true);
+ if (err == -ENOENT) {
+ /* saved_syn is not found. It was in syncookie mode.
+ * We have asked the active side to resend the options
+ * in ACK, so try to find the bpf_test_option from ACK now.
+ */
+ err = load_option(skops, &passive_estab_in, false);
+ init_stg.syncookie = true;
+ }
+
+ /* ENOMSG: The bpf_test_option is not found which is fine.
+ * Bail out now for all other errors.
+ */
+ if (err && err != -ENOMSG)
+ RET_CG_ERR(err);
+
+ /* the basic TCP header must be readable before th->syn is used */
+ th = skops->skb_data;
+ if (th + 1 > skops->skb_data_end)
+ RET_CG_ERR(0);
+
+ if (th->syn) {
+ /* Fastopen */
+
+ /* Cannot clear cb_flags to stop write_hdr cb.
+ * synack is not sent yet for fast open.
+ * Even if it was, the synack may need to be retransmitted.
+ *
+ * PARSE_ALL_HDR cb flag is set to learn
+ * if synack has reached the peer.
+ * All cb_flags will be cleared in handle_parse_hdr().
+ */
+ set_parse_all_hdr_cb_flags(skops);
+ init_stg.fastopen = true;
+ } else if (!passive_fin_out.flags) {
+ /* No options will be written from now */
+ clear_hdr_cb_flags(skops);
+ }
+
+ if (!skops->sk ||
+ !bpf_sk_storage_get(&hdr_stg_map, skops->sk, &init_stg,
+ BPF_SK_STORAGE_GET_F_CREATE))
+ RET_CG_ERR(0);
+
+ /* our own advertised max delayed-ACK bounds our delayed ACKs */
+ if (passive_synack_out.max_delack_ms) {
+ err = set_delack_max(skops, passive_synack_out.max_delack_ms);
+ if (err)
+ RET_CG_ERR(err);
+ }
+
+ /* the peer's advertised max delayed-ACK becomes our minimum RTO */
+ if (passive_estab_in.max_delack_ms) {
+ err = set_rto_min(skops, passive_estab_in.max_delack_ms);
+ if (err)
+ RET_CG_ERR(err);
+ }
+
+ return CG_OK;
+}
+
+/* BPF_SOCK_OPS_PARSE_HDR_OPT_CB handler.
+ *
+ * Runs for a received packet once the socket's hdr_stg entry exists. It
+ * clears the callback flags that were only kept to learn whether earlier
+ * options reached the peer, and, when the packet has FIN set, loads the
+ * test option into active_fin_in or passive_fin_in depending on the
+ * socket's "active" bit.
+ *
+ * Returns CG_OK, or leaves through RET_CG_ERR() on any failure.
+ */
+static int handle_parse_hdr(struct bpf_sock_ops *skops)
+{
+ struct hdr_stg *hdr_stg;
+ struct tcphdr *th;
+
+ if (!skops->sk)
+ RET_CG_ERR(0);
+
+ /* the basic TCP header must be readable before th->fin is used */
+ th = skops->skb_data;
+ if (th + 1 > skops->skb_data_end)
+ RET_CG_ERR(0);
+
+ hdr_stg = bpf_sk_storage_get(&hdr_stg_map, skops->sk, NULL, 0);
+ if (!hdr_stg)
+ RET_CG_ERR(0);
+
+ if (hdr_stg->resend_syn || hdr_stg->fastopen)
+ /* The PARSE_ALL_HDR cb flag was turned on
+ * to ensure that the previously written
+ * options have reached the peer.
+ * Those previously written options include:
+ * - Active side: resend_syn in ACK during syncookie
+ * or
+ * - Passive side: SYNACK during fastopen
+ *
+ * A valid packet has been received here after
+ * the 3WHS, so the PARSE_ALL_HDR cb flag
+ * can be cleared now.
+ */
+ clear_parse_all_hdr_cb_flags(skops);
+
+ if (hdr_stg->resend_syn && !active_fin_out.flags)
+ /* Active side resent the syn option in ACK
+ * because the server was in syncookie mode.
+ * A valid packet has been received, so
+ * clear header cb flags if there is no
+ * more option to send.
+ */
+ clear_hdr_cb_flags(skops);
+
+ if (hdr_stg->fastopen && !passive_fin_out.flags)
+ /* Passive side was in fastopen.
+ * A valid packet has been received, so
+ * the SYNACK has reached the peer.
+ * Clear header cb flags if there is no more
+ * option to send.
+ */
+ clear_hdr_cb_flags(skops);
+
+ if (th->fin) {
+ struct bpf_test_option *fin_opt;
+ int err;
+
+ if (hdr_stg->active)
+ fin_opt = &active_fin_in;
+ else
+ fin_opt = &passive_fin_in;
+
+ /* a FIN without the test option (-ENOMSG) is not an error */
+ err = load_option(skops, fin_opt, false);
+ if (err && err != -ENOMSG)
+ RET_CG_ERR(err);
+ }
+
+ return CG_OK;
+}
+
+/*
+ * sockops entry point: route each callback to its handler.
+ *
+ * A listener gets TCP_SAVE_SYN switched on and the header callbacks
+ * enabled together with the state callback; a connecting socket gets the
+ * header callbacks only. Operations not listed here return CG_OK.
+ */
+SEC("sockops/estab")
+int estab(struct bpf_sock_ops *skops)
+{
+	int enable = 1;
+	__u32 op = skops->op;
+
+	if (op == BPF_SOCK_OPS_TCP_LISTEN_CB) {
+		bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
+			       &enable, sizeof(enable));
+		set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
+		return CG_OK;
+	}
+
+	if (op == BPF_SOCK_OPS_TCP_CONNECT_CB) {
+		set_hdr_cb_flags(skops, 0);
+		return CG_OK;
+	}
+
+	if (op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB)
+		return handle_parse_hdr(skops);
+
+	if (op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+		return handle_hdr_opt_len(skops);
+
+	if (op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+		return handle_write_hdr_opt(skops);
+
+	if (op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
+		return handle_passive_estab(skops);
+
+	if (op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
+		return handle_active_estab(skops);
+
+	return CG_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
new file mode 100644
index 000000000..3e6912e4d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "test_tcpbpf.h"
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 4);
+ __type(key, __u32);
+ __type(value, struct tcpbpf_globals);
+} global_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 2);
+ __type(key, __u32);
+ __type(value, int);
+} sockopt_results SEC(".maps");
+
+/*
+ * Set bit @event in the event_map field of global_map[0].
+ *
+ * The entry is read, modified in a stack copy and written back; when the
+ * lookup fails the copy starts out zeroed.
+ */
+static inline void update_event_map(int event)
+{
+	struct tcpbpf_globals snapshot = {0};
+	struct tcpbpf_globals *cur;
+	__u32 key = 0;
+
+	cur = bpf_map_lookup_elem(&global_map, &key);
+	if (cur != NULL)
+		snapshot = *cur;
+
+	snapshot.event_map |= (1 << event);
+	bpf_map_update_elem(&global_map, &key, &snapshot,
+			    BPF_ANY);
+}
+
+int _version SEC("version") = 1;
+
+/* sockops test program.
+ *
+ * First runs a series of inline-asm loads/stores on the bpf_sock_ops
+ * context with fixed register choices, then records the operation in
+ * global_map via update_event_map() and handles the individual callbacks:
+ * callback-flag setting results, TCP_SAVED_SYN / TCP_SAVE_SYN socket option
+ * results (stored in sockopt_results) and per-connection statistics when a
+ * socket reaches BPF_TCP_CLOSE.
+ *
+ * skops->reply is set to rv and the program always returns 1.
+ */
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+ char header[sizeof(struct ipv6hdr) + sizeof(struct tcphdr)];
+ struct bpf_sock_ops *reuse = skops;
+ struct tcphdr *thdr;
+ int good_call_rv = 0;
+ int bad_call_rv = 0;
+ int save_syn = 1;
+ int rv = -1;
+ int v = 0;
+ int op;
+
+ /* Test reading fields in bpf_sock_ops using single register */
+ asm volatile (
+ "%[reuse] = *(u32 *)(%[reuse] +96)"
+ : [reuse] "+r"(reuse)
+ :);
+
+ /* same u32 load at offset 96, but with distinct dst and src registers */
+ asm volatile (
+ "%[op] = *(u32 *)(%[skops] +96)"
+ : [op] "+r"(op)
+ : [skops] "r"(skops)
+ :);
+
+ /* load the u32 at offset 164 into r8 and store it straight back */
+ asm volatile (
+ "r9 = %[skops];\n"
+ "r8 = *(u32 *)(r9 +164);\n"
+ "*(u32 *)(r9 +164) = r8;\n"
+ :: [skops] "r"(skops)
+ : "r9", "r8");
+
+ /* The next three blocks load the u64 at offset 184 (presumably the
+ * skops->sk pointer - TODO confirm) and, when it is non-zero, read a
+ * u32 at offset 4 behind it. They differ only in the registers used:
+ * r1 for everything, r9 for everything, then r1 as base and r2 as
+ * destination.
+ */
+ asm volatile (
+ "r1 = %[skops];\n"
+ "r1 = *(u64 *)(r1 +184);\n"
+ "if r1 == 0 goto +1;\n"
+ "r1 = *(u32 *)(r1 +4);\n"
+ :: [skops] "r"(skops):"r1");
+
+ asm volatile (
+ "r9 = %[skops];\n"
+ "r9 = *(u64 *)(r9 +184);\n"
+ "if r9 == 0 goto +1;\n"
+ "r9 = *(u32 *)(r9 +4);\n"
+ :: [skops] "r"(skops):"r9");
+
+ asm volatile (
+ "r1 = %[skops];\n"
+ "r2 = *(u64 *)(r1 +184);\n"
+ "if r2 == 0 goto +1;\n"
+ "r2 = *(u32 *)(r2 +4);\n"
+ :: [skops] "r"(skops):"r1", "r2");
+
+ /* op from the asm block above is overwritten by the regular read */
+ op = (int) skops->op;
+
+ update_event_map(op);
+
+ switch (op) {
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ /* Test failure to set largest cb flag (assumes not defined) */
+ bad_call_rv = bpf_sock_ops_cb_flags_set(skops, 0x80);
+ /* Set callback */
+ good_call_rv = bpf_sock_ops_cb_flags_set(skops,
+ BPF_SOCK_OPS_STATE_CB_FLAG);
+ /* Update results */
+ {
+ __u32 key = 0;
+ struct tcpbpf_globals g, *gp;
+
+ gp = bpf_map_lookup_elem(&global_map, &key);
+ if (!gp)
+ break;
+ g = *gp;
+ g.bad_cb_test_rv = bad_call_rv;
+ g.good_cb_test_rv = good_call_rv;
+ bpf_map_update_elem(&global_map, &key, &g,
+ BPF_ANY);
+ }
+ break;
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ skops->sk_txhash = 0x12345f;
+ v = 0xff;
+ rv = bpf_setsockopt(skops, SOL_IPV6, IPV6_TCLASS, &v,
+ sizeof(v));
+ if (skops->family == AF_INET6) {
+ /* fetch the saved SYN: IPv6 header followed by TCP header */
+ v = bpf_getsockopt(skops, IPPROTO_TCP, TCP_SAVED_SYN,
+ header, (sizeof(struct ipv6hdr) +
+ sizeof(struct tcphdr)));
+ if (!v) {
+ int offset = sizeof(struct ipv6hdr);
+
+ /* sockopt_results[1] = syn bit of the saved header */
+ thdr = (struct tcphdr *)(header + offset);
+ v = thdr->syn;
+ __u32 key = 1;
+
+ bpf_map_update_elem(&sockopt_results, &key, &v,
+ BPF_ANY);
+ }
+ }
+ break;
+ case BPF_SOCK_OPS_RTO_CB:
+ break;
+ case BPF_SOCK_OPS_RETRANS_CB:
+ break;
+ case BPF_SOCK_OPS_STATE_CB:
+ /* only transitions into BPF_TCP_CLOSE (args[1]) are recorded */
+ if (skops->args[1] == BPF_TCP_CLOSE) {
+ __u32 key = 0;
+ struct tcpbpf_globals g, *gp;
+
+ gp = bpf_map_lookup_elem(&global_map, &key);
+ if (!gp)
+ break;
+ g = *gp;
+ /* args[0] is compared against the state being left */
+ if (skops->args[0] == BPF_TCP_LISTEN) {
+ g.num_listen++;
+ } else {
+ g.total_retrans = skops->total_retrans;
+ g.data_segs_in = skops->data_segs_in;
+ g.data_segs_out = skops->data_segs_out;
+ g.bytes_received = skops->bytes_received;
+ g.bytes_acked = skops->bytes_acked;
+ }
+ g.num_close_events++;
+ bpf_map_update_elem(&global_map, &key, &g,
+ BPF_ANY);
+ }
+ break;
+ case BPF_SOCK_OPS_TCP_LISTEN_CB:
+ bpf_sock_ops_cb_flags_set(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
+ v = bpf_setsockopt(skops, IPPROTO_TCP, TCP_SAVE_SYN,
+ &save_syn, sizeof(save_syn));
+ /* Update global map w/ result of setsock opt */
+ __u32 key = 0;
+
+ bpf_map_update_elem(&sockopt_results, &key, &v, BPF_ANY);
+ break;
+ default:
+ rv = -1;
+ }
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
new file mode 100644
index 000000000..ac63410bb
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tcpnotify_kern.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stddef.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/tcp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "test_tcpnotify.h"
+
+/* Array map with 4 slots of struct tcpnotify_globals; bpf_testcb() reads and rewrites slot 0. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 4);
+ __type(key, __u32);
+ __type(value, struct tcpnotify_globals);
+} global_map SEC(".maps");
+
+/* Perf event array (2 entries) that bpf_testcb() passes to bpf_perf_event_output(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(max_entries, 2);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(__u32));
+} perf_event_map SEC(".maps");
+
+int _version SEC("version") = 1;
+
+/*
+ * sockops program. Callbacks whose remote port (after bpf_ntohl()) is not
+ * TESTPORT get reply -1 and return 0. For the others:
+ *  - TIMEOUT_INIT/RWND_INIT/NEEDS_ECN/BASE_RTT/RTO_CB: reply 1;
+ *  - connect/listen/established: enable the RETRANS and RTO callback flags,
+ *    reply 1;
+ *  - RETRANS_CB: copy global_map[0], store skops->total_retrans, bump ncalls,
+ *    write it back and emit a struct tcp_notifier through perf_event_map;
+ *  - anything else: reply -1.
+ * The reply is stored in skops->reply and the program returns 1.
+ */
+SEC("sockops")
+int bpf_testcb(struct bpf_sock_ops *skops)
+{
+ int rv = -1;
+ int op;
+
+ op = (int) skops->op;
+
+ /* Ignore sockets that do not talk to the test port. */
+ if (bpf_ntohl(skops->remote_port) != TESTPORT) {
+ skops->reply = -1;
+ return 0;
+ }
+
+ switch (op) {
+ case BPF_SOCK_OPS_TIMEOUT_INIT:
+ case BPF_SOCK_OPS_RWND_INIT:
+ case BPF_SOCK_OPS_NEEDS_ECN:
+ case BPF_SOCK_OPS_BASE_RTT:
+ case BPF_SOCK_OPS_RTO_CB:
+ rv = 1;
+ break;
+
+ case BPF_SOCK_OPS_TCP_CONNECT_CB:
+ case BPF_SOCK_OPS_TCP_LISTEN_CB:
+ case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+ case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+ bpf_sock_ops_cb_flags_set(skops, (BPF_SOCK_OPS_RETRANS_CB_FLAG|
+ BPF_SOCK_OPS_RTO_CB_FLAG));
+ rv = 1;
+ break;
+ case BPF_SOCK_OPS_RETRANS_CB: {
+ __u32 key = 0;
+ struct tcpnotify_globals g, *gp;
+ /* Fixed marker payload sent with every retransmit notification. */
+ struct tcp_notifier msg = {
+ .type = 0xde,
+ .subtype = 0xad,
+ .source = 0xbe,
+ .hash = 0xef,
+ };
+
+ rv = 1;
+
+ /* Update results */
+ gp = bpf_map_lookup_elem(&global_map, &key);
+ if (!gp)
+ break;
+ /* Work on a stack copy, then write the whole value back. */
+ g = *gp;
+ g.total_retrans = skops->total_retrans;
+ g.ncalls++;
+ bpf_map_update_elem(&global_map, &key, &g,
+ BPF_ANY);
+ bpf_perf_event_output(skops, &perf_event_map,
+ BPF_F_CURRENT_CPU,
+ &msg, sizeof(msg));
+ }
+ break;
+ default:
+ rv = -1;
+ }
+ skops->reply = rv;
+ return 1;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_trace_ext.c b/tools/testing/selftests/bpf/progs/test_trace_ext.c
new file mode 100644
index 000000000..d19a634d0
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_trace_ext.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <stdbool.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <bpf/bpf_tracing.h>
+
+__u64 ext_called = 0;
+
+/*
+ * freplace program attached in place of test_pkt_md_access: records
+ * skb->len in ext_called and returns 0.
+ */
+SEC("freplace/test_pkt_md_access")
+int test_pkt_md_access_new(struct __sk_buff *skb)
+{
+ ext_called = skb->len;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_trace_ext_tracing.c b/tools/testing/selftests/bpf/progs/test_trace_ext_tracing.c
new file mode 100644
index 000000000..52f3baf98
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_trace_ext_tracing.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+__u64 fentry_called = 0;
+
+/* fentry program on test_pkt_md_access_new: stores skb->len in fentry_called. */
+SEC("fentry/test_pkt_md_access_new")
+int BPF_PROG(fentry, struct sk_buff *skb)
+{
+ fentry_called = skb->len;
+ return 0;
+}
+
+__u64 fexit_called = 0;
+
+/* fexit program on test_pkt_md_access_new: stores skb->len in fexit_called. */
+SEC("fexit/test_pkt_md_access_new")
+int BPF_PROG(fexit, struct sk_buff *skb)
+{
+ fexit_called = skb->len;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tracepoint.c b/tools/testing/selftests/bpf/progs/test_tracepoint.c
new file mode 100644
index 000000000..4b825ee12
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tracepoint.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
+struct sched_switch_args {
+ /* NOTE(review): presumably covers the common tracepoint fields that precede prev_comm in the format file - confirm. */
+ unsigned long long pad;
+ char prev_comm[16];
+ int prev_pid;
+ int prev_prio;
+ long long prev_state;
+ char next_comm[16];
+ int next_pid;
+ int next_prio;
+};
+
+/* Tracepoint program for sched/sched_switch; it ignores ctx and returns 0. */
+SEC("tracepoint/sched/sched_switch")
+int oncpu(struct sched_switch_args *ctx)
+{
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/progs/test_trampoline_count.c b/tools/testing/selftests/bpf/progs/test_trampoline_count.c
new file mode 100644
index 000000000..f030e469d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_trampoline_count.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdbool.h>
+#include <stddef.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+struct task_struct;
+
+/* fentry program on __set_task_comm; ignores its arguments and returns 0. */
+SEC("fentry/__set_task_comm")
+int BPF_PROG(prog1, struct task_struct *tsk, const char *buf, bool exec)
+{
+ return 0;
+}
+
+/* fexit program on __set_task_comm; ignores its arguments and returns 0. */
+SEC("fexit/__set_task_comm")
+int BPF_PROG(prog2, struct task_struct *tsk, const char *buf, bool exec)
+{
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_tunnel_kern.c b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
new file mode 100644
index 000000000..ba6eadfec
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_tunnel_kern.c
@@ -0,0 +1,681 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2016 VMware
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/pkt_cls.h>
+#include <linux/erspan.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/*
+ * Print "ERROR line:<__LINE__> ret:<ret>" through bpf_trace_printk(). The
+ * format string is copied into a local array inside the do/while block, so
+ * the macro can be used as a single statement.
+ */
+#define ERROR(ret) do {\
+ char fmt[] = "ERROR line:%d ret:%d\n";\
+ bpf_trace_printk(fmt, sizeof(fmt), __LINE__, ret); \
+ } while (0)
+
+int _version SEC("version") = 1;
+
+/* Geneve option as filled in by the geneve set programs and read back by the get programs. */
+struct geneve_opt {
+ __be16 opt_class;
+ __u8 type;
+ __u8 length:5; /* set to 2 by the programs in this file, counted in 4-byte multiples */
+ __u8 r3:1;
+ __u8 r2:1;
+ __u8 r1:1;
+ __u8 opt_data[8]; /* hard-coded to 8 bytes */
+};
+
+/* VXLAN tunnel option passed to bpf_skb_{set,get}_tunnel_opt(): the group policy (GBP) value. */
+struct vxlan_metadata {
+ __u32 gbp;
+};
+
+/*
+ * Fill a zeroed bpf_tunnel_key (remote 172.16.1.100, id 2, tos 0, ttl 64) and
+ * pass it to bpf_skb_set_tunnel_key() with BPF_F_ZERO_CSUM_TX |
+ * BPF_F_SEQ_NUMBER. Returns TC_ACT_SHOT after ERROR() if the helper fails,
+ * TC_ACT_OK otherwise.
+ */
+SEC("gre_set_tunnel")
+int _gre_set_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key with bpf_skb_get_tunnel_key() (flags 0) and print
+ * tunnel_id and remote_ipv4 via bpf_trace_printk(). Returns TC_ACT_SHOT
+ * after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("gre_get_tunnel")
+int _gre_get_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+ char fmt[] = "key %d remote ip 0x%x\n";
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt), key.tunnel_id, key.remote_ipv4);
+ return TC_ACT_OK;
+}
+
+/*
+ * Fill a zeroed bpf_tunnel_key for remote ::11 (id 2, tos 0, ttl 64, label
+ * 0xabcde) and pass it to bpf_skb_set_tunnel_key() with BPF_F_TUNINFO_IPV6 |
+ * BPF_F_ZERO_CSUM_TX | BPF_F_SEQ_NUMBER. Returns TC_ACT_SHOT after ERROR()
+ * if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6gretap_set_tunnel")
+int _ip6gretap_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+ key.tunnel_label = 0xabcde;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_SEQ_NUMBER);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key with BPF_F_TUNINFO_IPV6 and print tunnel_id, the last
+ * word of remote_ipv6 (unconverted) and tunnel_label. Returns TC_ACT_SHOT
+ * after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6gretap_get_tunnel")
+int _ip6gretap_get_tunnel(struct __sk_buff *skb)
+{
+ char fmt[] = "key %d remote ip6 ::%x label %x\n";
+ struct bpf_tunnel_key key;
+ int ret;
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Set an IPv4 tunnel key (remote 172.16.1.100, id 2, ttl 64) with
+ * BPF_F_ZERO_CSUM_TX, then attach ERSPAN metadata with
+ * bpf_skb_set_tunnel_opt(): version 1 with index 123 when ERSPAN_V1 is
+ * defined, otherwise version 2 with direction 1 and hwid 7. Returns
+ * TC_ACT_SHOT after ERROR() if either helper fails, TC_ACT_OK otherwise.
+ */
+SEC("erspan_set_tunnel")
+int _erspan_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ struct erspan_metadata md;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_ZERO_CSUM_TX);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ __builtin_memset(&md, 0, sizeof(md));
+#ifdef ERSPAN_V1
+ md.version = 1;
+ md.u.index = bpf_htonl(123);
+#else
+ __u8 direction = 1;
+ __u8 hwid = 7;
+
+ md.version = 2;
+ md.u.md2.dir = direction;
+ /* hwid is split: low 4 bits in hwid, next 2 bits in hwid_upper */
+ md.u.md2.hwid = hwid & 0xf;
+ md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
+ ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key (flags 0) and the ERSPAN metadata of skb and print
+ * them with bpf_trace_printk(): first tunnel_id, remote_ipv4 and the ERSPAN
+ * version, then either the v1 index (ERSPAN_V1 builds) or the v2 direction,
+ * hwid and timestamp. Returns TC_ACT_SHOT after ERROR() if either helper
+ * fails, TC_ACT_OK otherwise.
+ */
+SEC("erspan_get_tunnel")
+int _erspan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "key %d remote ip 0x%x erspan version %d\n";
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv4, md.version);
+
+#ifdef ERSPAN_V1
+	/* index is only used by the v1 layout; declaring it here avoids an
+	 * unused variable in builds without ERSPAN_V1.
+	 */
+	char fmt2[] = "\tindex %x\n";
+	__u32 index;
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	/* reassemble hwid from its upper and lower bit-fields */
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+			md.u.md2.dir,
+			(md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+			bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+	return TC_ACT_OK;
+}
+
+/*
+ * Set an IPv6 tunnel key (remote_ipv6[3] = htonl(0x11), id 2, ttl 64) with
+ * BPF_F_TUNINFO_IPV6, then attach ERSPAN metadata with
+ * bpf_skb_set_tunnel_opt(): version 1 with index 123 when ERSPAN_V1 is
+ * defined, otherwise version 2 with direction 0 and hwid 17. Returns
+ * TC_ACT_SHOT after ERROR() if either helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip4ip6erspan_set_tunnel")
+int _ip4ip6erspan_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ struct erspan_metadata md;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = bpf_htonl(0x11);
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ __builtin_memset(&md, 0, sizeof(md));
+
+#ifdef ERSPAN_V1
+ md.u.index = bpf_htonl(123);
+ md.version = 1;
+#else
+ __u8 direction = 0;
+ __u8 hwid = 17;
+
+ md.version = 2;
+ md.u.md2.dir = direction;
+ /* hwid is split: low 4 bits in hwid, next 2 bits in hwid_upper */
+ md.u.md2.hwid = hwid & 0xf;
+ md.u.md2.hwid_upper = (hwid >> 4) & 0x3;
+#endif
+
+ ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the IPv6 tunnel key (BPF_F_TUNINFO_IPV6) and the ERSPAN metadata of
+ * skb and print them with bpf_trace_printk(): first tunnel_id, the last
+ * word of the remote IPv6 address and the ERSPAN version, then either the
+ * v1 index (ERSPAN_V1 builds) or the v2 direction, hwid and timestamp.
+ * Returns TC_ACT_SHOT after ERROR() if either helper fails, TC_ACT_OK
+ * otherwise.
+ */
+SEC("ip4ip6erspan_get_tunnel")
+int _ip4ip6erspan_get_tunnel(struct __sk_buff *skb)
+{
+	char fmt[] = "ip6erspan get key %d remote ip6 ::%x erspan version %d\n";
+	struct bpf_tunnel_key key;
+	struct erspan_metadata md;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	/* The key was requested as IPv6 and the format prints "::%x", so
+	 * report remote_ipv6[3] (the word the set program fills in), as the
+	 * other ip6 getters in this file do, instead of remote_ipv4.
+	 */
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv6[3], md.version);
+
+#ifdef ERSPAN_V1
+	/* index is only used by the v1 layout; declaring it here avoids an
+	 * unused variable in builds without ERSPAN_V1.
+	 */
+	char fmt2[] = "\tindex %x\n";
+	__u32 index;
+
+	index = bpf_ntohl(md.u.index);
+	bpf_trace_printk(fmt2, sizeof(fmt2), index);
+#else
+	char fmt2[] = "\tdirection %d hwid %x timestamp %u\n";
+
+	/* reassemble hwid from its upper and lower bit-fields */
+	bpf_trace_printk(fmt2, sizeof(fmt2),
+			md.u.md2.dir,
+			(md.u.md2.hwid_upper << 4) + md.u.md2.hwid,
+			bpf_ntohl(md.u.md2.timestamp));
+#endif
+
+	return TC_ACT_OK;
+}
+
+/*
+ * Set an IPv4 tunnel key (remote 172.16.1.100, id 2, ttl 64) with
+ * BPF_F_ZERO_CSUM_TX, then attach a vxlan_metadata option with
+ * gbp = 0x800FF. Returns TC_ACT_SHOT after ERROR() if either helper fails,
+ * TC_ACT_OK otherwise.
+ */
+SEC("vxlan_set_tunnel")
+int _vxlan_set_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+ struct vxlan_metadata md;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+ key.tunnel_id = 2;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_ZERO_CSUM_TX);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ md.gbp = 0x800FF; /* Set VXLAN Group Policy extension */
+ ret = bpf_skb_set_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key (flags 0) and the vxlan_metadata option and print
+ * tunnel_id, remote_ipv4 and gbp. Returns TC_ACT_SHOT after ERROR() if
+ * either helper fails, TC_ACT_OK otherwise.
+ */
+SEC("vxlan_get_tunnel")
+int _vxlan_get_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+ struct vxlan_metadata md;
+ char fmt[] = "key %d remote ip 0x%x vxlan gbp 0x%x\n";
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ ret = bpf_skb_get_tunnel_opt(skb, &md, sizeof(md));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv4, md.gbp);
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fill a zeroed bpf_tunnel_key for remote ::11 (id 22, tos 0, ttl 64) and
+ * pass it to bpf_skb_set_tunnel_key() with BPF_F_TUNINFO_IPV6. Returns
+ * TC_ACT_SHOT after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6vxlan_set_tunnel")
+int _ip6vxlan_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+ key.tunnel_id = 22;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key with BPF_F_TUNINFO_IPV6 and print tunnel_id, the last
+ * word of remote_ipv6 (unconverted) and tunnel_label. Returns TC_ACT_SHOT
+ * after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6vxlan_get_tunnel")
+int _ip6vxlan_get_tunnel(struct __sk_buff *skb)
+{
+ char fmt[] = "key %d remote ip6 ::%x label %x\n";
+ struct bpf_tunnel_key key;
+ int ret;
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv6[3], key.tunnel_label);
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Set an IPv4 tunnel key (remote 172.16.1.100, id 2, ttl 64) with
+ * BPF_F_ZERO_CSUM_TX, then attach one Geneve option (class 0x102, type
+ * 0x08, length 2, first four data bytes htonl(0xdeadbeef)) with
+ * bpf_skb_set_tunnel_opt(). Returns TC_ACT_SHOT after ERROR() if either
+ * helper fails, TC_ACT_OK otherwise.
+ */
+SEC("geneve_set_tunnel")
+int _geneve_set_tunnel(struct __sk_buff *skb)
+{
+	int ret;	/* the unused second status variable was dropped */
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+
+	__builtin_memset(&key, 0x0, sizeof(key));
+	key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+	key.tunnel_id = 2;
+	key.tunnel_tos = 0;
+	key.tunnel_ttl = 64;
+
+	__builtin_memset(&gopt, 0x0, sizeof(gopt));
+	gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+	gopt.type = 0x08;
+	gopt.r1 = 0;
+	gopt.r2 = 0;
+	gopt.r3 = 0;
+	gopt.length = 2; /* 4-byte multiple */
+	/* only the first 4 of the 8 data bytes are written; the rest stay 0 */
+	*(int *) &gopt.opt_data = bpf_htonl(0xdeadbeef);
+
+	ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_ZERO_CSUM_TX);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key (flags 0) and the Geneve option and print tunnel_id,
+ * remote_ipv4 and the option class. A failing key lookup returns
+ * TC_ACT_SHOT after ERROR(); a failing option lookup is tolerated and
+ * reported as class 0.
+ */
+SEC("geneve_get_tunnel")
+int _geneve_get_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+ struct geneve_opt gopt;
+ char fmt[] = "key %d remote ip 0x%x geneve class 0x%x\n";
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
+ if (ret < 0)
+ gopt.opt_class = 0;
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ key.tunnel_id, key.remote_ipv4, gopt.opt_class);
+ return TC_ACT_OK;
+}
+
+/*
+ * Set an IPv6 tunnel key (remote ::11, id 22, ttl 64) with
+ * BPF_F_TUNINFO_IPV6, then attach one Geneve option (class 0x102, type
+ * 0x08, length 2, first four data bytes htonl(0xfeedbeef)) with
+ * bpf_skb_set_tunnel_opt(). Returns TC_ACT_SHOT after ERROR() if either
+ * helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6geneve_set_tunnel")
+int _ip6geneve_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key;
+ struct geneve_opt gopt;
+ int ret;
+
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+ key.tunnel_id = 22;
+ key.tunnel_tos = 0;
+ key.tunnel_ttl = 64;
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ __builtin_memset(&gopt, 0x0, sizeof(gopt));
+ gopt.opt_class = bpf_htons(0x102); /* Open Virtual Networking (OVN) */
+ gopt.type = 0x08;
+ gopt.r1 = 0;
+ gopt.r2 = 0;
+ gopt.r3 = 0;
+ gopt.length = 2; /* 4-byte multiple */
+ /* only the first 4 of the 8 data bytes are written; the rest stay 0 */
+ *(int *) &gopt.opt_data = bpf_htonl(0xfeedbeef);
+
+ ret = bpf_skb_set_tunnel_opt(skb, &gopt, sizeof(gopt));
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the IPv6 tunnel key (BPF_F_TUNINFO_IPV6) and the Geneve option and
+ * print tunnel_id, the last word of the remote IPv6 address and the option
+ * class. A failing key lookup returns TC_ACT_SHOT after ERROR(); a failing
+ * option lookup is tolerated and reported as class 0.
+ */
+SEC("ip6geneve_get_tunnel")
+int _ip6geneve_get_tunnel(struct __sk_buff *skb)
+{
+	/* The key is an IPv6 one, so label and print it like the other ip6
+	 * getters in this file (remote_ipv6[3] is the word the set program
+	 * fills in) rather than as an IPv4 address.
+	 */
+	char fmt[] = "key %d remote ip6 ::%x geneve class 0x%x\n";
+	struct bpf_tunnel_key key;
+	struct geneve_opt gopt;
+	int ret;
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	ret = bpf_skb_get_tunnel_opt(skb, &gopt, sizeof(gopt));
+	if (ret < 0)
+		gopt.opt_class = 0;
+
+	bpf_trace_printk(fmt, sizeof(fmt),
+			key.tunnel_id, key.remote_ipv6[3], gopt.opt_class);
+
+	return TC_ACT_OK;
+}
+
+/*
+ * Check that an IPv4 header fits between skb->data and skb->data_end, then
+ * set a tunnel key with ttl 64 and, only when the packet's protocol is
+ * ICMP, remote 172.16.1.100 (flags 0). Returns TC_ACT_SHOT after ERROR()
+ * if the length check or the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ipip_set_tunnel")
+int _ipip_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key = {};
+ void *data = (void *)(long)skb->data;
+ struct iphdr *iph = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int ret;
+
+ /* single length check */
+ if (data + sizeof(*iph) > data_end) {
+ ERROR(1);
+ return TC_ACT_SHOT;
+ }
+
+ key.tunnel_ttl = 64;
+ if (iph->protocol == IPPROTO_ICMP) {
+ key.remote_ipv4 = 0xac100164; /* 172.16.1.100 */
+ }
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the tunnel key (flags 0) and print remote_ipv4. Returns TC_ACT_SHOT
+ * after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ipip_get_tunnel")
+int _ipip_get_tunnel(struct __sk_buff *skb)
+{
+ int ret;
+ struct bpf_tunnel_key key;
+ char fmt[] = "remote ip 0x%x\n";
+
+ ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ bpf_trace_printk(fmt, sizeof(fmt), key.remote_ipv4);
+ return TC_ACT_OK;
+}
+
+/*
+ * Check that an IPv4 header fits between skb->data and skb->data_end, then
+ * set an IPv6 tunnel key (BPF_F_TUNINFO_IPV6) with ttl 64 and, only when the
+ * packet's protocol is ICMP, remote ::11. Returns TC_ACT_SHOT after ERROR()
+ * if the length check or the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ipip6_set_tunnel")
+int _ipip6_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key = {};
+ void *data = (void *)(long)skb->data;
+ struct iphdr *iph = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int ret;
+
+ /* single length check */
+ if (data + sizeof(*iph) > data_end) {
+ ERROR(1);
+ return TC_ACT_SHOT;
+ }
+
+ /* NOTE(review): key is already initialised with "= {}" above; this memset looks redundant - confirm before removing. */
+ __builtin_memset(&key, 0x0, sizeof(key));
+ key.tunnel_ttl = 64;
+ if (iph->protocol == IPPROTO_ICMP) {
+ key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+ }
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the IPv6 tunnel key (BPF_F_TUNINFO_IPV6) and print the first and
+ * last words of the remote address, byte-swapped for display. Returns
+ * TC_ACT_SHOT after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ipip6_get_tunnel")
+int _ipip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	/* The set program stores the address with bpf_htonl(), so use the
+	 * network-to-host direction when reading it back for printing.
+	 */
+	bpf_trace_printk(fmt, sizeof(fmt), bpf_ntohl(key.remote_ipv6[0]),
+			 bpf_ntohl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+/*
+ * Check that an IPv6 header fits between skb->data and skb->data_end, then
+ * set an IPv6 tunnel key (BPF_F_TUNINFO_IPV6) with ttl 64 and, only when the
+ * next header is 58 (ICMP), remote ::11. Returns TC_ACT_SHOT after ERROR()
+ * if the length check or the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6ip6_set_tunnel")
+int _ip6ip6_set_tunnel(struct __sk_buff *skb)
+{
+ struct bpf_tunnel_key key = {};
+ void *data = (void *)(long)skb->data;
+ struct ipv6hdr *iph = data;
+ void *data_end = (void *)(long)skb->data_end;
+ int ret;
+
+ /* single length check */
+ if (data + sizeof(*iph) > data_end) {
+ ERROR(1);
+ return TC_ACT_SHOT;
+ }
+
+ key.tunnel_ttl = 64;
+ if (iph->nexthdr == 58 /* NEXTHDR_ICMP */) {
+ key.remote_ipv6[3] = bpf_htonl(0x11); /* ::11 */
+ }
+
+ ret = bpf_skb_set_tunnel_key(skb, &key, sizeof(key),
+ BPF_F_TUNINFO_IPV6);
+ if (ret < 0) {
+ ERROR(ret);
+ return TC_ACT_SHOT;
+ }
+
+ return TC_ACT_OK;
+}
+
+/*
+ * Fetch the IPv6 tunnel key (BPF_F_TUNINFO_IPV6) and print the first and
+ * last words of the remote address, byte-swapped for display. Returns
+ * TC_ACT_SHOT after ERROR() if the helper fails, TC_ACT_OK otherwise.
+ */
+SEC("ip6ip6_get_tunnel")
+int _ip6ip6_get_tunnel(struct __sk_buff *skb)
+{
+	int ret;
+	struct bpf_tunnel_key key;
+	char fmt[] = "remote ip6 %x::%x\n";
+
+	ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key),
+				     BPF_F_TUNINFO_IPV6);
+	if (ret < 0) {
+		ERROR(ret);
+		return TC_ACT_SHOT;
+	}
+
+	/* The set program stores the address with bpf_htonl(), so use the
+	 * network-to-host direction when reading it back for printing.
+	 */
+	bpf_trace_printk(fmt, sizeof(fmt), bpf_ntohl(key.remote_ipv6[0]),
+			 bpf_ntohl(key.remote_ipv6[3]));
+	return TC_ACT_OK;
+}
+
+/*
+ * Call bpf_skb_get_xfrm_state() for index 0 and, when it succeeds, print
+ * reqid, spi and remote_ipv4 (the latter two after bpf_ntohl()). Always
+ * returns TC_ACT_OK; a failing lookup just skips the print.
+ */
+SEC("xfrm_get_state")
+int _xfrm_get_state(struct __sk_buff *skb)
+{
+ struct bpf_xfrm_state x;
+ char fmt[] = "reqid %d spi 0x%x remote ip 0x%x\n";
+ int ret;
+
+ ret = bpf_skb_get_xfrm_state(skb, 0, &x, sizeof(x), 0);
+ if (ret < 0)
+ return TC_ACT_OK;
+
+ bpf_trace_printk(fmt, sizeof(fmt), x.reqid, bpf_ntohl(x.spi),
+ bpf_ntohl(x.remote_ipv4));
+ return TC_ACT_OK;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_varlen.c b/tools/testing/selftests/bpf/progs/test_varlen.c
new file mode 100644
index 000000000..913acdffd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_varlen.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define MAX_LEN 256
+
+/* Source strings that every handler below copies from. */
+char buf_in1[MAX_LEN] = {};
+char buf_in2[MAX_LEN] = {};
+
+/* Handlers do nothing unless the current pid equals test_pid and capture is set. */
+int test_pid = 0;
+bool capture = false;
+
+/* .bss */
+/* Results of handler64_unsigned. */
+__u64 payload1_len1 = 0;
+__u64 payload1_len2 = 0;
+__u64 total1 = 0;
+char payload1[MAX_LEN + MAX_LEN] = {};
+
+/* .data */
+/* Results of handler32_unsigned. */
+int payload2_len1 = -1;
+int payload2_len2 = -1;
+int total2 = -1;
+char payload2[MAX_LEN + MAX_LEN] = { 1 };
+
+/* Results of handler64_signed. */
+int payload3_len1 = -1;
+int payload3_len2 = -1;
+int total3= -1;
+char payload3[MAX_LEN + MAX_LEN] = { 1 };
+
+/* Results of handler32_signed. */
+int payload4_len1 = -1;
+int payload4_len2 = -1;
+int total4= -1;
+char payload4[MAX_LEN + MAX_LEN] = { 1 };
+
+/*
+ * raw_tp/sys_enter handler using a u64 length. Copies buf_in1 and then
+ * buf_in2 into payload1 with bpf_probe_read_kernel_str(), advancing the
+ * write pointer by each returned length that is <= MAX_LEN, and records both
+ * lengths and the total number of bytes written in payload1_len1,
+ * payload1_len2 and total1.
+ */
+SEC("raw_tp/sys_enter")
+int handler64_unsigned(void *regs)
+{
+ int pid = bpf_get_current_pid_tgid() >> 32;
+ void *payload = payload1;
+ u64 len;
+
+ /* ignore irrelevant invocations */
+ if (test_pid != pid || !capture)
+ return 0;
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
+ if (len <= MAX_LEN) {
+ payload += len;
+ payload1_len1 = len;
+ }
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]);
+ if (len <= MAX_LEN) {
+ payload += len;
+ payload1_len2 = len;
+ }
+
+ total1 = payload - (void *)payload1;
+
+ return 0;
+}
+
+/*
+ * raw_tp/sys_exit handler using a signed long length. Copies buf_in1 and
+ * then buf_in2 into payload3 with bpf_probe_read_kernel_str(), advancing the
+ * write pointer by each returned length that is >= 0, and records both
+ * lengths and the total in payload3_len1, payload3_len2 and total3.
+ */
+SEC("raw_tp/sys_exit")
+int handler64_signed(void *regs)
+{
+ int pid = bpf_get_current_pid_tgid() >> 32;
+ void *payload = payload3;
+ long len;
+
+ /* ignore irrelevant invocations */
+ if (test_pid != pid || !capture)
+ return 0;
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
+ if (len >= 0) {
+ payload += len;
+ payload3_len1 = len;
+ }
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]);
+ if (len >= 0) {
+ payload += len;
+ payload3_len2 = len;
+ }
+ total3 = payload - (void *)payload3;
+
+ return 0;
+}
+
+/*
+ * tp/raw_syscalls/sys_enter handler using a u32 length. Copies buf_in1 and
+ * then buf_in2 into payload2 with bpf_probe_read_kernel_str(), advancing the
+ * write pointer by each returned length that is <= MAX_LEN, and records both
+ * lengths and the total in payload2_len1, payload2_len2 and total2.
+ */
+SEC("tp/raw_syscalls/sys_enter")
+int handler32_unsigned(void *regs)
+{
+ int pid = bpf_get_current_pid_tgid() >> 32;
+ void *payload = payload2;
+ u32 len;
+
+ /* ignore irrelevant invocations */
+ if (test_pid != pid || !capture)
+ return 0;
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
+ if (len <= MAX_LEN) {
+ payload += len;
+ payload2_len1 = len;
+ }
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]);
+ if (len <= MAX_LEN) {
+ payload += len;
+ payload2_len2 = len;
+ }
+
+ total2 = payload - (void *)payload2;
+
+ return 0;
+}
+
+/*
+ * tp/raw_syscalls/sys_exit handler using a signed int length. Copies buf_in1
+ * and then buf_in2 into payload4 with bpf_probe_read_kernel_str(), advancing
+ * the write pointer by each returned length that is >= 0, and records both
+ * lengths and the total in payload4_len1, payload4_len2 and total4.
+ */
+SEC("tp/raw_syscalls/sys_exit")
+int handler32_signed(void *regs)
+{
+ int pid = bpf_get_current_pid_tgid() >> 32;
+ void *payload = payload4;
+ int len;
+
+ /* ignore irrelevant invocations */
+ if (test_pid != pid || !capture)
+ return 0;
+
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in1[0]);
+ if (len >= 0) {
+ payload += len;
+ payload4_len1 = len;
+ }
+ len = bpf_probe_read_kernel_str(payload, MAX_LEN, &buf_in2[0]);
+ if (len >= 0) {
+ payload += len;
+ payload4_len2 = len;
+ }
+ total4 = payload - (void *)payload4;
+
+ return 0;
+}
+
+/*
+ * tp/syscalls/sys_exit_getpid handler: calls bpf_probe_read_kernel() with a
+ * source address of 0 and returns 1 if the helper reports an error, 0
+ * otherwise.
+ * NOTE(review): reading address 0 looks deliberate (exercising the failure
+ * path) - confirm against the user-space side of the test.
+ */
+SEC("tp/syscalls/sys_exit_getpid")
+int handler_exit(void *regs)
+{
+ long bla;
+
+ if (bpf_probe_read_kernel(&bla, sizeof(bla), 0))
+ return 1;
+ else
+ return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale1.c b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
new file mode 100644
index 000000000..d38153dab
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale1.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#define ATTR __attribute__((noinline))
+#include "test_jhash.h"
+
+/*
+ * Expands macro C 90 times (3 x C30) with nh_off = 14. Each expansion points
+ * ptr at data + i, leaves that expansion via break when ptr + nh_off would
+ * pass data_end, and otherwise stores jhash(ptr, nh_off, ctx->cb[0] + i) in
+ * ctx->tc_index and increments i. Always returns 0.
+ * ATTR is defined as noinline before test_jhash.h is included; presumably
+ * that header applies it to jhash() - verify there.
+ */
+SEC("scale90_noinline")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ void *ptr;
+ int ret = 0, nh_off, i = 0;
+
+ nh_off = 14;
+
+ /* pragma unroll doesn't work on large loops */
+
+#define C do { \
+ ptr = data + i; \
+ if (ptr + nh_off > data_end) \
+ break; \
+ ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \
+ } while (0);
+#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;
+ C30;C30;C30; /* 90 calls */
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale2.c b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
new file mode 100644
index 000000000..f024154c7
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale2.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#define ATTR __always_inline
+#include "test_jhash.h"
+
+/*
+ * Expands macro C 90 times (3 x C30) with nh_off = 14. Each expansion points
+ * ptr at data + i, leaves that expansion via break when ptr + nh_off would
+ * pass data_end, and otherwise stores jhash(ptr, nh_off, ctx->cb[0] + i) in
+ * ctx->tc_index and increments i. Always returns 0.
+ * ATTR is defined as __always_inline before test_jhash.h is included;
+ * presumably that header applies it to jhash() - verify there.
+ */
+SEC("scale90_inline")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ void *ptr;
+ int ret = 0, nh_off, i = 0;
+
+ nh_off = 14;
+
+ /* pragma unroll doesn't work on large loops */
+
+#define C do { \
+ ptr = data + i; \
+ if (ptr + nh_off > data_end) \
+ break; \
+ ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \
+ } while (0);
+#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;
+ C30;C30;C30; /* 90 calls */
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_verif_scale3.c b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
new file mode 100644
index 000000000..9beb5bf80
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_verif_scale3.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+#define ATTR __attribute__((noinline))
+#include "test_jhash.h"
+
+/*
+ * Expands macro C 90 times (3 x C30) with nh_off = 32. Each expansion points
+ * ptr at data + i, leaves that expansion via break when ptr + nh_off would
+ * pass data_end, and otherwise stores jhash(ptr, nh_off, ctx->cb[0] + i) in
+ * ctx->tc_index and increments i. Always returns 0.
+ * ATTR is defined as noinline before test_jhash.h is included; presumably
+ * that header applies it to jhash() - verify there.
+ */
+SEC("scale90_noinline32")
+int balancer_ingress(struct __sk_buff *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ void *ptr;
+ int ret = 0, nh_off, i = 0;
+
+ nh_off = 32;
+
+ /* pragma unroll doesn't work on large loops */
+
+#define C do { \
+ ptr = data + i; \
+ if (ptr + nh_off > data_end) \
+ break; \
+ ctx->tc_index = jhash(ptr, nh_off, ctx->cb[0] + i++); \
+ } while (0);
+#define C30 C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;C;
+ C30;C30;C30; /* 90 calls */
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_vmlinux.c b/tools/testing/selftests/bpf/progs/test_vmlinux.c
new file mode 100644
index 000000000..e9dfa0313
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_vmlinux.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+
+#include "vmlinux.h"
+#include <asm/unistd.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+#define MY_TV_NSEC 1337
+
+bool tp_called = false;
+bool raw_tp_called = false;
+bool tp_btf_called = false;
+bool kprobe_called = false;
+bool fentry_called = false;
+
+/*
+ * tp/syscalls/sys_enter_nanosleep handler. Sets tp_called when args->id is
+ * __NR_nanosleep and the tv_nsec field read from the user timespec pointed
+ * to by args->args[0] equals MY_TV_NSEC. Always returns 0.
+ */
+SEC("tp/syscalls/sys_enter_nanosleep")
+int handle__tp(struct trace_event_raw_sys_enter *args)
+{
+ struct __kernel_timespec *ts;
+ long tv_nsec;
+
+ if (args->id != __NR_nanosleep)
+ return 0;
+
+ ts = (void *)args->args[0];
+ /* bail out if the user read fails or the value is not the marker */
+ if (bpf_probe_read_user(&tv_nsec, sizeof(ts->tv_nsec), &ts->tv_nsec) ||
+ tv_nsec != MY_TV_NSEC)
+ return 0;
+
+ tp_called = true;
+ return 0;
+}
+
+/*
+ * raw_tp/sys_enter handler. Sets raw_tp_called when id is __NR_nanosleep and
+ * the tv_nsec field read from the user timespec taken from
+ * PT_REGS_PARM1_CORE(regs) equals MY_TV_NSEC. Always returns 0.
+ */
+SEC("raw_tp/sys_enter")
+int BPF_PROG(handle__raw_tp, struct pt_regs *regs, long id)
+{
+ struct __kernel_timespec *ts;
+ long tv_nsec;
+
+ if (id != __NR_nanosleep)
+ return 0;
+
+ ts = (void *)PT_REGS_PARM1_CORE(regs);
+ /* bail out if the user read fails or the value is not the marker */
+ if (bpf_probe_read_user(&tv_nsec, sizeof(ts->tv_nsec), &ts->tv_nsec) ||
+ tv_nsec != MY_TV_NSEC)
+ return 0;
+
+ raw_tp_called = true;
+ return 0;
+}
+
+/*
+ * tp_btf/sys_enter handler. Sets tp_btf_called when id is __NR_nanosleep and
+ * the tv_nsec field read from the user timespec taken from
+ * PT_REGS_PARM1_CORE(regs) equals MY_TV_NSEC. Always returns 0.
+ */
+SEC("tp_btf/sys_enter")
+int BPF_PROG(handle__tp_btf, struct pt_regs *regs, long id)
+{
+ struct __kernel_timespec *ts;
+ long tv_nsec;
+
+ if (id != __NR_nanosleep)
+ return 0;
+
+ ts = (void *)PT_REGS_PARM1_CORE(regs);
+ /* bail out if the user read fails or the value is not the marker */
+ if (bpf_probe_read_user(&tv_nsec, sizeof(ts->tv_nsec), &ts->tv_nsec) ||
+ tv_nsec != MY_TV_NSEC)
+ return 0;
+
+ tp_btf_called = true;
+ return 0;
+}
+
+/* kprobe on hrtimer_start_range_ns: sets kprobe_called when tim equals MY_TV_NSEC. Always returns 0. */
+SEC("kprobe/hrtimer_start_range_ns")
+int BPF_KPROBE(handle__kprobe, struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode)
+{
+ if (tim == MY_TV_NSEC)
+ kprobe_called = true;
+ return 0;
+}
+
+/* fentry on hrtimer_start_range_ns: sets fentry_called when tim equals MY_TV_NSEC. Always returns 0. */
+SEC("fentry/hrtimer_start_range_ns")
+int BPF_PROG(handle__fentry, struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode)
+{
+ if (tim == MY_TV_NSEC)
+ fentry_called = true;
+ return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp.c b/tools/testing/selftests/bpf/progs/test_xdp.c
new file mode 100644
index 000000000..31f9bce37
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp.c
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016,2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "test_iptunnel_common.h"
+
+int _version SEC("version") = 1;
+/* Per-CPU array of 256 __u64 counters; count_tx() bumps the slot keyed by protocol. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 256);
+ __type(key, __u32);
+ __type(value, __u64);
+} rxcnt SEC(".maps");
+/* Hash map from struct vip to struct iptnl_info; looked up by handle_ipv4()/handle_ipv6(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_IPTNL_ENTRIES);
+ __type(key, struct vip);
+ __type(value, struct iptnl_info);
+} vip2tnl SEC(".maps");
+
+/* Increment the rxcnt counter keyed by @protocol; do nothing if the lookup fails. */
+static __always_inline void count_tx(__u32 protocol)
+{
+	__u64 *slot = bpf_map_lookup_elem(&rxcnt, &protocol);
+
+	if (!slot)
+		return;
+
+	*slot += 1;
+}
+/*
+ * Return the dest field of the TCP or UDP header located at @trans_data,
+ * exactly as stored in the header (no byte-order conversion is applied).
+ * Returns -1 when that header would extend past @data_end and 0 for any
+ * protocol other than TCP or UDP.
+ */
+static __always_inline int get_dport(void *trans_data, void *data_end,
+ __u8 protocol)
+{
+ struct tcphdr *th;
+ struct udphdr *uh;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)trans_data;
+ if (th + 1 > data_end) /* whole TCP header must be inside the packet */
+ return -1;
+ return th->dest;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)trans_data;
+ if (uh + 1 > data_end) /* whole UDP header must be inside the packet */
+ return -1;
+ return uh->dest;
+ default:
+ return 0;
+ }
+}
+/*
+ * Fill @new_eth: the source MAC is the old frame's destination MAC, the
+ * destination MAC is tnl->dmac and the EtherType is @h_proto as passed in.
+ */
+static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+ const struct ethhdr *old_eth,
+ const struct iptnl_info *tnl,
+ __be16 h_proto)
+{
+ memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+ memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
+ new_eth->h_proto = h_proto;
+}
+/*
+ * IPv4 path: look up (protocol, AF_INET, daddr, dport) in vip2tnl and, on a
+ * hit with an AF_INET tunnel, prepend an outer IPv4 header (IPPROTO_IPIP)
+ * plus a new Ethernet header and return XDP_TX.  Returns XDP_PASS when no
+ * matching tunnel exists and XDP_DROP on any bounds or head-adjust failure.
+ */
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct iphdr *iph = data + sizeof(struct ethhdr);
+ __u16 *next_iph;
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+ __u32 csum = 0;
+ int i;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+ /* The transport header is taken right after the fixed-size IPv4 header. */
+ dport = get_dport(iph + 1, data_end, iph->protocol);
+ if (dport == -1)
+ return XDP_DROP;
+ /* Build the lookup key; the inner total length is saved in host order. */
+ vip.protocol = iph->protocol;
+ vip.family = AF_INET;
+ vip.daddr.v4 = iph->daddr;
+ vip.dport = dport;
+ payload_len = bpf_ntohs(iph->tot_len);
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v4-in-v4 */
+ if (!tnl || tnl->family != AF_INET)
+ return XDP_PASS;
+ /* Make room for the outer header by moving the packet start back. */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+ return XDP_DROP;
+ /* Packet pointers are re-read from the context after the head adjust. */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+ /* The old Ethernet header now sits sizeof(struct iphdr) into the packet. */
+ new_eth = data;
+ iph = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*iph);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ iph + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
+ /* Fill in the outer IPv4 header. */
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = 0;
+ iph->protocol = IPPROTO_IPIP;
+ iph->check = 0;
+ iph->tos = 0;
+ iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
+ iph->daddr = tnl->daddr.v4;
+ iph->saddr = tnl->saddr.v4;
+ iph->ttl = 8;
+ /* Header checksum: sum the 16-bit words, fold once, then complement. */
+ next_iph = (__u16 *)iph;
+#pragma clang loop unroll(full)
+ for (i = 0; i < sizeof(*iph) >> 1; i++)
+ csum += *next_iph++;
+
+ iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+/*
+ * IPv6 path: look up (nexthdr, AF_INET6, daddr, dport) in vip2tnl and, on a
+ * hit with an AF_INET6 tunnel, prepend an outer IPv6 header (IPPROTO_IPV6)
+ * plus a new Ethernet header and return XDP_TX.  Returns XDP_PASS when no
+ * matching tunnel exists and XDP_DROP on any bounds or head-adjust failure.
+ */
+static __always_inline int handle_ipv6(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+ /* The transport header is taken right after the fixed IPv6 header. */
+ dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+ if (dport == -1)
+ return XDP_DROP;
+ /* Build the lookup key; payload_len is kept as stored in the header. */
+ vip.protocol = ip6h->nexthdr;
+ vip.family = AF_INET6;
+ memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+ vip.dport = dport;
+ payload_len = ip6h->payload_len;
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v6-in-v6 */
+ if (!tnl || tnl->family != AF_INET6)
+ return XDP_PASS;
+ /* Make room for the outer header by moving the packet start back. */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+ return XDP_DROP;
+ /* Packet pointers are re-read from the context after the head adjust. */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+ /* The old Ethernet header now sits sizeof(struct ipv6hdr) into the packet. */
+ new_eth = data;
+ ip6h = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*ip6h);
+
+ if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
+ ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
+ /* Outer header; its payload length is the inner length plus one IPv6 header. */
+ ip6h->version = 6;
+ ip6h->priority = 0;
+ memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+ ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
+ ip6h->nexthdr = IPPROTO_IPV6;
+ ip6h->hop_limit = 8;
+ memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+ memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+/*
+ * XDP entry point: dispatch on the EtherType of the incoming frame.
+ * IPv4 and IPv6 frames go to their tunnel handlers; frames that are too
+ * short for an Ethernet header, or carry any other EtherType, are dropped.
+ */
+SEC("xdp_tx_iptunnel")
+int _xdp_tx_iptunnel(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	if (h_proto == bpf_htons(ETH_P_IP))
+		return handle_ipv4(xdp);
+	if (h_proto == bpf_htons(ETH_P_IPV6))
+		return handle_ipv6(xdp);
+
+	return XDP_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
new file mode 100644
index 000000000..3d66599ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_grow.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+/*
+ * Grow the packet tail by an amount chosen from the incoming packet length.
+ * Each recognised length selects one test case; any other length returns
+ * XDP_ABORTED.  Returns XDP_DROP when bpf_xdp_adjust_tail() fails and
+ * XDP_TX otherwise.
+ */
+SEC("xdp_adjust_tail_grow")
+int _xdp_adjust_tail_grow(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	unsigned int data_len = data_end - data;
+	int offset = 0;
+
+	/* The data length determines the test case. */
+	switch (data_len) {
+	case 54: /* sizeof(pkt_v4) */
+		offset = 4096; /* test too large offset */
+		break;
+	case 74: /* sizeof(pkt_v6) */
+		offset = 40;
+		break;
+	case 64:
+		offset = 128;
+		break;
+	case 128:
+		offset = 4096 - 256 - 320 - data_len; /* Max tail grow 3520 */
+		break;
+	default:
+		return XDP_ABORTED; /* No matching test */
+	}
+
+	if (bpf_xdp_adjust_tail(xdp, offset))
+		return XDP_DROP;
+	return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c
new file mode 100644
index 000000000..22065a9cf
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_adjust_tail_shrink.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+
+/*
+ * Shrink the packet tail: by 256 bytes when the packet is 54 bytes long
+ * (more than the packet holds), by 20 bytes otherwise.  Returns XDP_DROP
+ * when bpf_xdp_adjust_tail() fails and XDP_TX otherwise.
+ */
+SEC("xdp_adjust_tail_shrink")
+int _xdp_adjust_tail_shrink(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	/* 54 == sizeof(pkt_v4): shrink too much on purpose */
+	int offset = (data_end - data == 54) ? 256 : 20;
+
+	if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+		return XDP_DROP;
+	return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c
new file mode 100644
index 000000000..a038e827f
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_bpf2bpf.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+
+char _license[] SEC("license") = "GPL";
+/* Local declaration listing only the field that trace_on_entry() reads. */
+struct net_device {
+ /* The structure does not need to contain every member, because
+ * "preserve_access_index" lets BTF fix up the field accesses.
+ */
+ int ifindex;
+} __attribute__((preserve_access_index));
+/* Local declaration; trace_on_entry() follows ->dev to reach the ifindex. */
+struct xdp_rxq_info {
+ /* The structure does not need to contain every member, because
+ * "preserve_access_index" lets BTF fix up the field accesses.
+ */
+ struct net_device *dev;
+ __u32 queue_index;
+} __attribute__((preserve_access_index));
+/* Local declaration of the argument type received by the fentry/fexit programs below. */
+struct xdp_buff {
+ void *data; /* read by trace_on_entry() to compute the packet length */
+ void *data_end; /* read by trace_on_entry() to compute the packet length */
+ void *data_meta;
+ void *data_hard_start;
+ unsigned long handle;
+ struct xdp_rxq_info *rxq; /* followed to rxq->dev->ifindex */
+} __attribute__((preserve_access_index));
+/* Record that trace_on_entry() passes to bpf_xdp_output(). */
+struct meta {
+ int ifindex; /* xdp->rxq->dev->ifindex */
+ int pkt_len; /* data_end - data */
+};
+/* Perf event array that trace_on_entry() writes to through bpf_xdp_output(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} perf_buf_map SEC(".maps");
+/*
+ * fentry program: emits a struct meta (ifindex and packet length) through
+ * perf_buf_map and stores the ifindex in test_result_fentry.  "FUNC" in the
+ * section name is a placeholder - presumably replaced with the real attach
+ * target by the loader; confirm against the user-space test.
+ */
+__u64 test_result_fentry = 0;
+SEC("fentry/FUNC")
+int BPF_PROG(trace_on_entry, struct xdp_buff *xdp)
+{
+ struct meta meta;
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+
+ meta.ifindex = xdp->rxq->dev->ifindex;
+ meta.pkt_len = data_end - data;
+ bpf_xdp_output(xdp, &perf_buf_map,
+ ((__u64) meta.pkt_len << 32) | /* packet length in the upper 32 flag bits */
+ BPF_F_CURRENT_CPU,
+ &meta, sizeof(meta));
+
+ test_result_fentry = xdp->rxq->dev->ifindex;
+ return 0;
+}
+/* fexit program: stores the traced function's return value in test_result_fexit. */
+__u64 test_result_fexit = 0;
+SEC("fexit/FUNC")
+int BPF_PROG(trace_on_exit, struct xdp_buff *xdp, int ret)
+{
+ test_result_fexit = ret;
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c
new file mode 100644
index 000000000..b360ba2bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_devmap_helpers.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* fails to load without expected_attach_type = BPF_XDP_DEVMAP
+ * because of access to egress_ifindex
+ */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/*
+ * Logs the ingress ifindex, egress ifindex and packet length through
+ * bpf_trace_printk() and returns XDP_PASS.
+ */
+SEC("xdp_dm_log")
+int xdpdm_devlog(struct xdp_md *ctx)
+{
+ char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n";
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ unsigned int len = data_end - data;
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ ctx->ingress_ifindex, ctx->egress_ifindex, len);
+
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_link.c b/tools/testing/selftests/bpf/progs/test_xdp_link.c
new file mode 100644
index 000000000..eb93ea95d
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_link.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+char LICENSE[] SEC("license") = "GPL";
+/* Minimal XDP program: inspects nothing and returns 0 for every packet. */
+SEC("xdp/handler")
+int xdp_handler(struct xdp_md *xdp)
+{
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_loop.c b/tools/testing/selftests/bpf/progs/test_xdp_loop.c
new file mode 100644
index 000000000..fcabcda30
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_loop.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/tcp.h>
+#include <linux/pkt_cls.h>
+#include <sys/socket.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include "test_iptunnel_common.h"
+
+int _version SEC("version") = 1;
+/* Per-CPU array of 256 __u64 counters; count_tx() bumps the slot keyed by protocol. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 256);
+ __type(key, __u32);
+ __type(value, __u64);
+} rxcnt SEC(".maps");
+/* Hash map from struct vip to struct iptnl_info; looked up by handle_ipv4()/handle_ipv6(). */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, MAX_IPTNL_ENTRIES);
+ __type(key, struct vip);
+ __type(value, struct iptnl_info);
+} vip2tnl SEC(".maps");
+
+/* Increment the rxcnt counter keyed by @protocol; do nothing if the lookup fails. */
+static __always_inline void count_tx(__u32 protocol)
+{
+	__u64 *slot = bpf_map_lookup_elem(&rxcnt, &protocol);
+
+	if (!slot)
+		return;
+
+	*slot += 1;
+}
+/*
+ * Return the dest field of the TCP or UDP header located at @trans_data,
+ * exactly as stored in the header (no byte-order conversion is applied).
+ * Returns -1 when that header would extend past @data_end and 0 for any
+ * protocol other than TCP or UDP.
+ */
+static __always_inline int get_dport(void *trans_data, void *data_end,
+ __u8 protocol)
+{
+ struct tcphdr *th;
+ struct udphdr *uh;
+
+ switch (protocol) {
+ case IPPROTO_TCP:
+ th = (struct tcphdr *)trans_data;
+ if (th + 1 > data_end) /* whole TCP header must be inside the packet */
+ return -1;
+ return th->dest;
+ case IPPROTO_UDP:
+ uh = (struct udphdr *)trans_data;
+ if (uh + 1 > data_end) /* whole UDP header must be inside the packet */
+ return -1;
+ return uh->dest;
+ default:
+ return 0;
+ }
+}
+/*
+ * Fill @new_eth: the source MAC is the old frame's destination MAC, the
+ * destination MAC is tnl->dmac and the EtherType is @h_proto as passed in.
+ */
+static __always_inline void set_ethhdr(struct ethhdr *new_eth,
+ const struct ethhdr *old_eth,
+ const struct iptnl_info *tnl,
+ __be16 h_proto)
+{
+ memcpy(new_eth->h_source, old_eth->h_dest, sizeof(new_eth->h_source));
+ memcpy(new_eth->h_dest, tnl->dmac, sizeof(new_eth->h_dest));
+ new_eth->h_proto = h_proto;
+}
+/*
+ * IPv4 path: look up (protocol, AF_INET, daddr, dport) in vip2tnl and, on a
+ * hit with an AF_INET tunnel, prepend an outer IPv4 header (IPPROTO_IPIP)
+ * plus a new Ethernet header and return XDP_TX.  Returns XDP_PASS when no
+ * matching tunnel exists and XDP_DROP on any bounds or head-adjust failure.
+ * In this file the checksum loop is explicitly kept as a loop (not unrolled).
+ */
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct iphdr *iph = data + sizeof(struct ethhdr);
+ __u16 *next_iph;
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+ __u32 csum = 0;
+ int i;
+
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+ /* The transport header is taken right after the fixed-size IPv4 header. */
+ dport = get_dport(iph + 1, data_end, iph->protocol);
+ if (dport == -1)
+ return XDP_DROP;
+ /* Build the lookup key; the inner total length is saved in host order. */
+ vip.protocol = iph->protocol;
+ vip.family = AF_INET;
+ vip.daddr.v4 = iph->daddr;
+ vip.dport = dport;
+ payload_len = bpf_ntohs(iph->tot_len);
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v4-in-v4 */
+ if (!tnl || tnl->family != AF_INET)
+ return XDP_PASS;
+ /* Make room for the outer header by moving the packet start back. */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+ return XDP_DROP;
+ /* Packet pointers are re-read from the context after the head adjust. */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+ /* The old Ethernet header now sits sizeof(struct iphdr) into the packet. */
+ new_eth = data;
+ iph = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*iph);
+
+ if (new_eth + 1 > data_end ||
+ old_eth + 1 > data_end ||
+ iph + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IP));
+ /* Fill in the outer IPv4 header. */
+ iph->version = 4;
+ iph->ihl = sizeof(*iph) >> 2;
+ iph->frag_off = 0;
+ iph->protocol = IPPROTO_IPIP;
+ iph->check = 0;
+ iph->tos = 0;
+ iph->tot_len = bpf_htons(payload_len + sizeof(*iph));
+ iph->daddr = tnl->daddr.v4;
+ iph->saddr = tnl->saddr.v4;
+ iph->ttl = 8;
+ /* Header checksum: sum the 16-bit words, fold once, then complement. */
+ next_iph = (__u16 *)iph;
+#pragma clang loop unroll(disable)
+ for (i = 0; i < sizeof(*iph) >> 1; i++)
+ csum += *next_iph++;
+
+ iph->check = ~((csum & 0xffff) + (csum >> 16));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+/*
+ * IPv6 path: look up (nexthdr, AF_INET6, daddr, dport) in vip2tnl and, on a
+ * hit with an AF_INET6 tunnel, prepend an outer IPv6 header (IPPROTO_IPV6)
+ * plus a new Ethernet header and return XDP_TX.  Returns XDP_PASS when no
+ * matching tunnel exists and XDP_DROP on any bounds or head-adjust failure.
+ */
+static __always_inline int handle_ipv6(struct xdp_md *xdp)
+{
+ void *data_end = (void *)(long)xdp->data_end;
+ void *data = (void *)(long)xdp->data;
+ struct iptnl_info *tnl;
+ struct ethhdr *new_eth;
+ struct ethhdr *old_eth;
+ struct ipv6hdr *ip6h = data + sizeof(struct ethhdr);
+ __u16 payload_len;
+ struct vip vip = {};
+ int dport;
+
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+ /* The transport header is taken right after the fixed IPv6 header. */
+ dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
+ if (dport == -1)
+ return XDP_DROP;
+ /* Build the lookup key; payload_len is kept as stored in the header. */
+ vip.protocol = ip6h->nexthdr;
+ vip.family = AF_INET6;
+ memcpy(vip.daddr.v6, ip6h->daddr.s6_addr32, sizeof(vip.daddr));
+ vip.dport = dport;
+ payload_len = ip6h->payload_len;
+
+ tnl = bpf_map_lookup_elem(&vip2tnl, &vip);
+ /* It only does v6-in-v6 */
+ if (!tnl || tnl->family != AF_INET6)
+ return XDP_PASS;
+ /* Make room for the outer header by moving the packet start back. */
+ if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+ return XDP_DROP;
+ /* Packet pointers are re-read from the context after the head adjust. */
+ data = (void *)(long)xdp->data;
+ data_end = (void *)(long)xdp->data_end;
+ /* The old Ethernet header now sits sizeof(struct ipv6hdr) into the packet. */
+ new_eth = data;
+ ip6h = data + sizeof(*new_eth);
+ old_eth = data + sizeof(*ip6h);
+
+ if (new_eth + 1 > data_end || old_eth + 1 > data_end ||
+ ip6h + 1 > data_end)
+ return XDP_DROP;
+
+ set_ethhdr(new_eth, old_eth, tnl, bpf_htons(ETH_P_IPV6));
+ /* Outer header; its payload length is the inner length plus one IPv6 header. */
+ ip6h->version = 6;
+ ip6h->priority = 0;
+ memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+ ip6h->payload_len = bpf_htons(bpf_ntohs(payload_len) + sizeof(*ip6h));
+ ip6h->nexthdr = IPPROTO_IPV6;
+ ip6h->hop_limit = 8;
+ memcpy(ip6h->saddr.s6_addr32, tnl->saddr.v6, sizeof(tnl->saddr.v6));
+ memcpy(ip6h->daddr.s6_addr32, tnl->daddr.v6, sizeof(tnl->daddr.v6));
+
+ count_tx(vip.protocol);
+
+ return XDP_TX;
+}
+
+/*
+ * XDP entry point: dispatch on the EtherType of the incoming frame.
+ * IPv4 and IPv6 frames go to their tunnel handlers; frames that are too
+ * short for an Ethernet header, or carry any other EtherType, are dropped.
+ */
+SEC("xdp_tx_iptunnel")
+int _xdp_tx_iptunnel(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	if (h_proto == bpf_htons(ETH_P_IP))
+		return handle_ipv4(xdp);
+	if (h_proto == bpf_htons(ETH_P_IPV6))
+		return handle_ipv6(xdp);
+
+	return XDP_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_meta.c b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
new file mode 100644
index 000000000..a7c4a7d49
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_meta.c
@@ -0,0 +1,53 @@
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+/* round_up(x, y) rounds x up to a multiple of y using a (y - 1) mask, so y is assumed to be a power of two; ctx_ptr() converts a ctx field to a pointer. */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define ctx_ptr(ctx, mem) (void *)(unsigned long)ctx->mem
+/*
+ * Classifier side of the test: compares the first ETH_ALEN bytes of the
+ * metadata area with the first ETH_ALEN bytes of the packet.  Returns
+ * TC_ACT_OK when they are equal and TC_ACT_SHOT when they differ or when
+ * either area is too short.
+ */
+SEC("t")
+int ing_cls(struct __sk_buff *ctx)
+{
+ __u8 *data, *data_meta, *data_end;
+ __u32 diff = 0;
+
+ data_meta = ctx_ptr(ctx, data_meta);
+ data_end = ctx_ptr(ctx, data_end);
+ data = ctx_ptr(ctx, data);
+ /* The metadata area must hold ETH_ALEN rounded up to a multiple of 4 bytes. */
+ if (data + ETH_ALEN > data_end ||
+ data_meta + round_up(ETH_ALEN, 4) > data)
+ return TC_ACT_SHOT;
+ /* Compare bytes 0-3 as one __u32, then bytes 4-5 as one __u16. */
+ diff |= ((__u32 *)data_meta)[0] ^ ((__u32 *)data)[0];
+ diff |= ((__u16 *)data_meta)[2] ^ ((__u16 *)data)[2];
+
+ return diff ? TC_ACT_SHOT : TC_ACT_OK;
+}
+/*
+ * XDP side of the test: reserves round_up(ETH_ALEN, 4) bytes of metadata
+ * in front of the packet and copies the first ETH_ALEN packet bytes into
+ * it.  Returns XDP_PASS on success and XDP_DROP when the metadata adjust
+ * or a bounds check fails.
+ */
+SEC("x")
+int ing_xdp(struct xdp_md *ctx)
+{
+ __u8 *data, *data_meta, *data_end;
+ int ret;
+
+ ret = bpf_xdp_adjust_meta(ctx, -round_up(ETH_ALEN, 4));
+ if (ret < 0)
+ return XDP_DROP;
+ /* Pointers are read from the context only after the metadata adjust. */
+ data_meta = ctx_ptr(ctx, data_meta);
+ data_end = ctx_ptr(ctx, data_end);
+ data = ctx_ptr(ctx, data);
+
+ if (data + ETH_ALEN > data_end ||
+ data_meta + round_up(ETH_ALEN, 4) > data)
+ return XDP_DROP;
+
+ __builtin_memcpy(data_meta, data, ETH_ALEN);
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
new file mode 100644
index 000000000..3a67921f6
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017 Facebook
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/pkt_cls.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* Rotate the 32-bit @word left by @shift bits. */
+static __always_inline __u32 rol32(__u32 word, unsigned int shift)
+{
+	__u32 upper = word << shift;
+	__u32 lower = word >> ((-shift) & 31);
+
+	return upper | lower;
+}
+
+/* Copy of jhash from the kernel sources, kept here to make sure LLVM
+ * can compile it into a valid sequence of BPF instructions.
+ * __jhash_mix() mixes the three 32-bit state words a, b and c in place.
+ */
+#define __jhash_mix(a, b, c) \
+{ \
+ a -= c; a ^= rol32(c, 4); c += b; \
+ b -= a; b ^= rol32(a, 6); a += c; \
+ c -= b; c ^= rol32(b, 8); b += a; \
+ a -= c; a ^= rol32(c, 16); c += b; \
+ b -= a; b ^= rol32(a, 19); a += c; \
+ c -= b; c ^= rol32(b, 4); b += a; \
+}
+/* Final mixing of the three state words, in place; callers return c as the hash. */
+#define __jhash_final(a, b, c) \
+{ \
+ c ^= b; c -= rol32(b, 14); \
+ a ^= c; a -= rol32(c, 11); \
+ b ^= a; b -= rol32(a, 25); \
+ c ^= b; c -= rol32(b, 16); \
+ a ^= c; a -= rol32(c, 4); \
+ b ^= a; b -= rol32(a, 14); \
+ c ^= b; c -= rol32(b, 24); \
+}
+/* Constant added to the initial hash state by jhash() and jhash_2words(). */
+#define JHASH_INITVAL 0xdeadbeef
+/* Short alias used by the hash helpers below. */
+typedef unsigned int u32;
+/*
+ * Hash @length bytes starting at @key, seeded with @initval, and return
+ * the 32-bit result.  Input is consumed in 12-byte groups; the remaining
+ * 1-12 bytes are folded in by the switch below.
+ */
+static __noinline
+u32 jhash(const void *key, u32 length, u32 initval)
+{
+ u32 a, b, c;
+ const unsigned char *k = key;
+
+ a = b = c = JHASH_INITVAL + length + initval;
+ /* Mix full 12-byte groups for as long as more than 12 bytes remain. */
+ while (length > 12) {
+ a += *(u32 *)(k);
+ b += *(u32 *)(k + 4);
+ c += *(u32 *)(k + 8);
+ __jhash_mix(a, b, c);
+ length -= 12;
+ k += 12;
+ }
+ switch (length) { /* every case deliberately falls through to the next */
+ case 12: c += (u32)k[11]<<24;
+ case 11: c += (u32)k[10]<<16;
+ case 10: c += (u32)k[9]<<8;
+ case 9: c += k[8];
+ case 8: b += (u32)k[7]<<24;
+ case 7: b += (u32)k[6]<<16;
+ case 6: b += (u32)k[5]<<8;
+ case 5: b += k[4];
+ case 4: a += (u32)k[3]<<24;
+ case 3: a += (u32)k[2]<<16;
+ case 2: a += (u32)k[1]<<8;
+ case 1: a += k[0];
+ __jhash_final(a, b, c);
+ case 0: /* Nothing left to add */
+ break;
+ }
+
+ return c;
+}
+/* Hash three 32-bit words: add @initval to each, run __jhash_final() and return c. */
+__noinline
+u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval)
+{
+ a += initval;
+ b += initval;
+ c += initval;
+ __jhash_final(a, b, c);
+ return c;
+}
+/* Hash two 32-bit words; the third word is 0 and the seed is @initval + JHASH_INITVAL + (2 << 2). */
+__noinline
+u32 jhash_2words(u32 a, u32 b, u32 initval)
+{
+ return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
+}
+/* Addresses, ports and protocol of one flow; this is the key type of lru_cache. */
+struct flow_key {
+ union {
+ __be32 src; /* shares storage with srcv6[0] */
+ __be32 srcv6[4];
+ };
+ union {
+ __be32 dst; /* shares storage with dstv6[0] */
+ __be32 dstv6[4];
+ };
+ union {
+ __u32 ports; /* both ports as one word, as hashed by get_packet_hash() */
+ __u16 port16[2]; /* filled by parse_udp() / parse_tcp() */
+ };
+ __u8 proto;
+};
+/* Per-packet parse state passed between the parsing and lookup helpers. */
+struct packet_description {
+ struct flow_key flow;
+ __u8 flags; /* bit 0: set by parse_icmp()/parse_icmpv6(); bit 1: set by parse_tcp() when syn is set */
+};
+/* Value type of ctl_array: one slot viewable as a 64-bit value, an ifindex or a MAC address. */
+struct ctl_value {
+ union {
+ __u64 value;
+ __u32 ifindex;
+ __u8 mac[6]; /* copied into the new Ethernet header by encap_v4()/encap_v6() */
+ };
+};
+/* Key type of vip_map. */
+struct vip_definition {
+ union {
+ __be32 vip; /* shares storage with vipv6[0] */
+ __be32 vipv6[4];
+ };
+ __u16 port;
+ __u16 family;
+ __u8 proto;
+};
+/* Value type of vip_map. */
+struct vip_meta {
+ __u32 flags; /* bits 1, 2 and 3 are tested in get_packet_dst() */
+ __u32 vip_num; /* get_packet_dst() derives the ch_rings key from 2 * vip_num */
+};
+/* Value type of lru_cache. */
+struct real_pos_lru {
+ __u32 pos; /* get_packet_dst() stores the key it used for the reals lookup */
+ __u64 atime; /* bpf_ktime_get_ns() timestamp, written for UDP flows */
+};
+/* Value type of the reals map; the address is used by encap_v4()/encap_v6(). */
+struct real_definition {
+ union {
+ __be32 dst; /* shares storage with dstv6[0] */
+ __be32 dstv6[4];
+ };
+ __u8 flags;
+};
+/* Value type of the stats map; a pair of 64-bit counters. */
+struct lb_stats {
+ __u64 v2; /* get_packet_dst() keeps a bpf_ktime_get_ns() timestamp here */
+ __u64 v1; /* get_packet_dst() keeps a running count here */
+};
+/* Hash map from struct vip_definition to struct vip_meta, up to 512 entries. */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 512);
+ __type(key, struct vip_definition);
+ __type(value, struct vip_meta);
+} vip_map SEC(".maps");
+/* LRU hash from struct flow_key to struct real_pos_lru, up to 300 entries. */
+struct {
+ __uint(type, BPF_MAP_TYPE_LRU_HASH);
+ __uint(max_entries, 300);
+ __uint(map_flags, 1U << 1); /* NOTE(review): presumably BPF_F_NO_COMMON_LRU - confirm against linux/bpf.h */
+ __type(key, struct flow_key);
+ __type(value, struct real_pos_lru);
+} lru_cache SEC(".maps");
+/* Array of 12 * 655 __u32 values; get_packet_dst() reads an index into reals from it. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 12 * 655);
+ __type(key, __u32);
+ __type(value, __u32);
+} ch_rings SEC(".maps");
+/* Array of 40 struct real_definition entries, indexed by the value read from ch_rings. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 40);
+ __type(key, __u32);
+ __type(value, struct real_definition);
+} reals SEC(".maps");
+/* Per-CPU array of 515 struct lb_stats entries; get_packet_dst() uses index 512 + 2. */
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 515);
+ __type(key, __u32);
+ __type(value, struct lb_stats);
+} stats SEC(".maps");
+/* Array of 16 struct ctl_value entries. */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 16);
+ __type(key, __u32);
+ __type(value, struct ctl_value);
+} ctl_array SEC(".maps");
+/* Ethernet header layout used by this file: destination MAC, source MAC, protocol. */
+struct eth_hdr {
+ unsigned char eth_dest[6];
+ unsigned char eth_source[6];
+ unsigned short eth_proto; /* written and read as stored in the frame */
+};
+
+/*
+ * Return the byte offset of the transport header from the start of the
+ * frame: Ethernet header plus one IP header, plus an ICMP header and a
+ * second IP header when @is_icmp is set.
+ */
+static __noinline __u64 calc_offset(bool is_ipv6, bool is_icmp)
+{
+	__u64 ip_len = is_ipv6 ? sizeof(struct ipv6hdr) : sizeof(struct iphdr);
+	__u64 icmp_len = is_ipv6 ? sizeof(struct icmp6hdr)
+				 : sizeof(struct icmphdr);
+	__u64 off = sizeof(struct eth_hdr) + ip_len;
+
+	if (is_icmp)
+		off += icmp_len + ip_len;
+
+	return off;
+}
+
+/*
+ * Copy the UDP ports of the packet into pckt->flow.port16[].  When bit 0 of
+ * pckt->flags is set the two ports are stored swapped.  Returns false when
+ * the UDP header does not fit before @data_end, true otherwise.
+ */
+static __attribute__ ((noinline))
+bool parse_udp(void *data, void *data_end,
+	       bool is_ipv6, struct packet_description *pckt)
+{
+	bool is_icmp = (pckt->flags & (1 << 0)) != 0;
+	struct udphdr *udp = data + calc_offset(is_ipv6, is_icmp);
+
+	if (udp + 1 > data_end)
+		return false;
+
+	if (is_icmp) {
+		pckt->flow.port16[0] = udp->dest;
+		pckt->flow.port16[1] = udp->source;
+	} else {
+		pckt->flow.port16[0] = udp->source;
+		pckt->flow.port16[1] = udp->dest;
+	}
+	return true;
+}
+
+/*
+ * Copy the TCP ports of the packet into pckt->flow.port16[] and set bit 1
+ * of pckt->flags when the syn field is set.  When bit 0 of pckt->flags is
+ * set the two ports are stored swapped.  Returns false when the TCP header
+ * does not fit before @data_end, true otherwise.
+ */
+static __attribute__ ((noinline))
+bool parse_tcp(void *data, void *data_end,
+	       bool is_ipv6, struct packet_description *pckt)
+{
+	bool is_icmp = (pckt->flags & (1 << 0)) != 0;
+	struct tcphdr *tcp = data + calc_offset(is_ipv6, is_icmp);
+
+	if (tcp + 1 > data_end)
+		return false;
+
+	if (tcp->syn)
+		pckt->flags |= (1 << 1);
+
+	if (is_icmp) {
+		pckt->flow.port16[0] = tcp->dest;
+		pckt->flow.port16[1] = tcp->source;
+	} else {
+		pckt->flow.port16[0] = tcp->source;
+		pckt->flow.port16[1] = tcp->dest;
+	}
+	return true;
+}
+
+/*
+ * Prepend an outer IPv6 header (nexthdr IPPROTO_IPV6) and a new Ethernet
+ * header to the packet.
+ *
+ * @xdp:       packet context; its head is moved back by one IPv6 header
+ * @cval:      supplies the destination MAC of the new Ethernet header
+ * @pckt:      parsed flow; source address word 3 and port16[0] feed the
+ *             last word of the outer source address
+ * @dst:       supplies the outer destination address (dstv6)
+ * @pkt_bytes: inner length; the outer payload_len is this plus one IPv6
+ *             header
+ *
+ * Returns 1 on success, 0 when the head adjust or a bounds check fails.
+ */
+static __attribute__ ((noinline))
+bool encap_v6(struct xdp_md *xdp, struct ctl_value *cval,
+	      struct packet_description *pckt,
+	      struct real_definition *dst, __u32 pkt_bytes)
+{
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+	struct ipv6hdr *ip6h;
+	__u32 ip_suffix;
+	void *data_end;
+	void *data;
+
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct ipv6hdr)))
+		return 0;
+	/* Packet pointers must be re-read after the head adjust. */
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	new_eth = data;
+	ip6h = data + sizeof(struct eth_hdr);
+	old_eth = data + sizeof(struct ipv6hdr);
+	if (new_eth + 1 > data_end ||
+	    old_eth + 1 > data_end || ip6h + 1 > data_end)
+		return 0;
+	memcpy(new_eth->eth_dest, cval->mac, 6);
+	memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
+	/*
+	 * EtherType in network byte order.  Equal to the former literal 56710
+	 * on little-endian hosts, and also correct on big-endian ones.
+	 */
+	new_eth->eth_proto = bpf_htons(ETH_P_IPV6);
+	ip6h->version = 6;
+	ip6h->priority = 0;
+	memset(ip6h->flow_lbl, 0, sizeof(ip6h->flow_lbl));
+
+	ip6h->nexthdr = IPPROTO_IPV6;
+	ip_suffix = pckt->flow.srcv6[3] ^ pckt->flow.port16[0];
+	ip6h->payload_len =
+		bpf_htons(pkt_bytes + sizeof(struct ipv6hdr));
+	ip6h->hop_limit = 4;
+
+	ip6h->saddr.in6_u.u6_addr32[0] = 1;
+	ip6h->saddr.in6_u.u6_addr32[1] = 2;
+	ip6h->saddr.in6_u.u6_addr32[2] = 3;
+	ip6h->saddr.in6_u.u6_addr32[3] = ip_suffix;
+	memcpy(ip6h->daddr.in6_u.u6_addr32, dst->dstv6, 16);
+	return 1;
+}
+
+/*
+ * Write an outer IPv4 header (protocol IPPROTO_IPIP) and a new Ethernet
+ * header in front of the packet, then move the head forward again by one
+ * IPv4 header before returning.
+ *
+ * @xdp:       packet context
+ * @cval:      supplies the destination MAC of the new Ethernet header
+ * @pckt:      parsed flow; port16[0] and src feed the outer source address
+ * @dst:       real server; dst->dst is XORed into the outer source address
+ * @pkt_bytes: inner length; the outer tot_len is this plus one IPv4 header
+ *
+ * Returns 1 on success, 0 when a head adjust or a bounds check fails.
+ */
+static __attribute__ ((noinline))
+bool encap_v4(struct xdp_md *xdp, struct ctl_value *cval,
+	      struct packet_description *pckt,
+	      struct real_definition *dst, __u32 pkt_bytes)
+{
+
+	__u32 ip_suffix = bpf_ntohs(pckt->flow.port16[0]);
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+	__u16 *next_iph_u16;
+	struct iphdr *iph;
+	__u32 csum = 0;
+	void *data_end;
+	void *data;
+
+	ip_suffix <<= 15;
+	ip_suffix ^= pckt->flow.src;
+	if (bpf_xdp_adjust_head(xdp, 0 - (int)sizeof(struct iphdr)))
+		return 0;
+	/* Packet pointers must be re-read after the head adjust. */
+	data = (void *)(long)xdp->data;
+	data_end = (void *)(long)xdp->data_end;
+	new_eth = data;
+	iph = data + sizeof(struct eth_hdr);
+	old_eth = data + sizeof(struct iphdr);
+	if (new_eth + 1 > data_end ||
+	    old_eth + 1 > data_end || iph + 1 > data_end)
+		return 0;
+	memcpy(new_eth->eth_dest, cval->mac, 6);
+	memcpy(new_eth->eth_source, old_eth->eth_dest, 6);
+	/*
+	 * EtherType in network byte order.  Equal to the former literal 8 on
+	 * little-endian hosts, and also correct on big-endian ones.
+	 */
+	new_eth->eth_proto = bpf_htons(ETH_P_IP);
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->frag_off = 0;
+	iph->protocol = IPPROTO_IPIP;
+	iph->check = 0;
+	iph->tos = 1;
+	iph->tot_len = bpf_htons(pkt_bytes + sizeof(struct iphdr));
+	/* don't update iph->daddr, since it will overwrite old eth_proto
+	 * and multiple iterations of bpf_prog_run() will fail
+	 */
+
+	iph->saddr = ((0xFFFF0000 & ip_suffix) | 4268) ^ dst->dst;
+	iph->ttl = 4;
+
+	/* Header checksum: sum the 16-bit words, fold once, then complement. */
+	next_iph_u16 = (__u16 *) iph;
+#pragma clang loop unroll(full)
+	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
+		csum += *next_iph_u16++;
+	iph->check = ~((csum & 0xffff) + (csum >> 16));
+	/* Move the head forward again by the size of the header just built. */
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
+		return 0;
+	return 1;
+}
+
+/*
+ * Strip one outer IPv6 header: rebuild the Ethernet header
+ * sizeof(struct ipv6hdr) bytes further into the packet, then move the head
+ * forward by that amount.
+ *
+ * @xdp:      packet context
+ * @data:     in/out packet start; refreshed from @xdp on success
+ * @data_end: in/out packet end; refreshed from @xdp on success
+ * @inner_v4: selects the IPv4 EtherType for the inner packet, IPv6 otherwise
+ *
+ * No bounds check is performed here before the new header is written -
+ * presumably the caller has already validated the packet; confirm at the
+ * call site.  Returns 1 on success, 0 when the head adjust fails.
+ */
+static __attribute__ ((noinline))
+bool decap_v6(struct xdp_md *xdp, void **data, void **data_end, bool inner_v4)
+{
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+
+	old_eth = *data;
+	new_eth = *data + sizeof(struct ipv6hdr);
+	memcpy(new_eth->eth_source, old_eth->eth_source, 6);
+	memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
+	/*
+	 * EtherType in network byte order.  Equal to the former literals
+	 * 8 and 56710 on little-endian hosts, and also correct on
+	 * big-endian ones.
+	 */
+	if (inner_v4)
+		new_eth->eth_proto = bpf_htons(ETH_P_IP);
+	else
+		new_eth->eth_proto = bpf_htons(ETH_P_IPV6);
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct ipv6hdr)))
+		return 0;
+	*data = (void *)(long)xdp->data;
+	*data_end = (void *)(long)xdp->data_end;
+	return 1;
+}
+
+/*
+ * Strip one outer IPv4 header: rebuild the Ethernet header
+ * sizeof(struct iphdr) bytes further into the packet, then move the head
+ * forward by that amount.
+ *
+ * @xdp:      packet context
+ * @data:     in/out packet start; refreshed from @xdp on success
+ * @data_end: in/out packet end; refreshed from @xdp on success
+ *
+ * No bounds check is performed here before the new header is written -
+ * presumably the caller has already validated the packet; confirm at the
+ * call site.  Returns 1 on success, 0 when the head adjust fails.
+ */
+static __attribute__ ((noinline))
+bool decap_v4(struct xdp_md *xdp, void **data, void **data_end)
+{
+	struct eth_hdr *new_eth;
+	struct eth_hdr *old_eth;
+
+	old_eth = *data;
+	new_eth = *data + sizeof(struct iphdr);
+	memcpy(new_eth->eth_source, old_eth->eth_source, 6);
+	memcpy(new_eth->eth_dest, old_eth->eth_dest, 6);
+	/*
+	 * EtherType in network byte order.  Equal to the former literal 8 on
+	 * little-endian hosts, and also correct on big-endian ones.
+	 */
+	new_eth->eth_proto = bpf_htons(ETH_P_IP);
+	if (bpf_xdp_adjust_head(xdp, (int)sizeof(struct iphdr)))
+		return 0;
+	*data = (void *)(long)xdp->data;
+	*data_end = (void *)(long)xdp->data_end;
+	return 1;
+}
+/*
+ * Swap the source and destination MAC addresses of the Ethernet header at
+ * @data and return XDP_TX.  @data_end is not used and no bounds check is
+ * done here.
+ */
+static __attribute__ ((noinline))
+int swap_mac_and_send(void *data, void *data_end)
+{
+ unsigned char tmp_mac[6];
+ struct eth_hdr *eth;
+
+ eth = data;
+ memcpy(tmp_mac, eth->eth_source, 6);
+ memcpy(eth->eth_source, eth->eth_dest, 6);
+ memcpy(eth->eth_dest, tmp_mac, 6);
+ return XDP_TX;
+}
+
+/*
+ * Turn the ICMP packet at @data into a reply in place: set the ICMP type
+ * to 0, adjust the ICMP checksum, swap the IPv4 source and destination
+ * addresses, set the TTL to 4, recompute the IPv4 header checksum and swap
+ * the MAC addresses.
+ *
+ * Returns XDP_DROP when the Ethernet + IPv4 + ICMP headers do not fit
+ * before @data_end, otherwise the result of swap_mac_and_send().
+ */
+static __attribute__ ((noinline))
+int send_icmp_reply(void *data, void *data_end)
+{
+	struct icmphdr *icmp_hdr;
+	__u16 *next_iph_u16;
+	__u32 tmp_addr = 0;
+	struct iphdr *iph;
+	__u32 csum = 0;
+	__u64 off = 0;
+
+	if (data + sizeof(struct eth_hdr)
+	     + sizeof(struct iphdr) + sizeof(struct icmphdr) > data_end)
+		return XDP_DROP;
+	off += sizeof(struct eth_hdr);
+	iph = data + off;
+	off += sizeof(struct iphdr);
+	icmp_hdr = data + off;
+	icmp_hdr->type = 0;
+	/* Checksum is patched incrementally rather than recomputed. */
+	icmp_hdr->checksum += 0x0007;
+	iph->ttl = 4;
+	tmp_addr = iph->daddr;
+	iph->daddr = iph->saddr;
+	iph->saddr = tmp_addr;
+	iph->check = 0;
+	/* Header checksum: sum the 16-bit words, fold once, then complement. */
+	next_iph_u16 = (__u16 *) iph;
+#pragma clang loop unroll(full)
+	for (int i = 0; i < sizeof(struct iphdr) >> 1; i++)
+		csum += *next_iph_u16++;
+	iph->check = ~((csum & 0xffff) + (csum >> 16));
+	return swap_mac_and_send(data, data_end);
+}
+/*
+ * Turn the ICMPv6 packet at @data into a reply in place: set the ICMPv6
+ * type to 129, adjust the checksum, set the hop limit to 4, swap the IPv6
+ * source and destination addresses and swap the MAC addresses.  Returns
+ * XDP_DROP when the Ethernet + IPv6 + ICMPv6 headers do not fit before
+ * @data_end, otherwise the result of swap_mac_and_send().
+ */
+static __attribute__ ((noinline))
+int send_icmp6_reply(void *data, void *data_end)
+{
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+ __be32 tmp_addr[4];
+ __u64 off = 0;
+
+ if (data + sizeof(struct eth_hdr)
+ + sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) > data_end)
+ return XDP_DROP;
+ off += sizeof(struct eth_hdr);
+ ip6h = data + off;
+ off += sizeof(struct ipv6hdr);
+ icmp_hdr = data + off;
+ icmp_hdr->icmp6_type = 129; /* NOTE(review): presumably the echo-reply type - confirm against linux/icmpv6.h */
+ icmp_hdr->icmp6_cksum -= 0x0001; /* checksum patched incrementally, not recomputed */
+ ip6h->hop_limit = 4;
+ memcpy(tmp_addr, ip6h->saddr.in6_u.u6_addr32, 16);
+ memcpy(ip6h->saddr.in6_u.u6_addr32, ip6h->daddr.in6_u.u6_addr32, 16);
+ memcpy(ip6h->daddr.in6_u.u6_addr32, tmp_addr, 16);
+ return swap_mac_and_send(data, data_end);
+}
+/*
+ * Inspect the ICMPv6 header at @data + @off.  Type 128 is answered via
+ * send_icmp6_reply(); any type other than 3 returns XDP_PASS.  For type 3
+ * the IPv6 header embedded after the ICMPv6 header is used to fill @pckt
+ * with source and destination swapped, bit 0 of pckt->flags is set and -1
+ * is returned.  Returns XDP_DROP when a header does not fit before
+ * @data_end.
+ */
+static __attribute__ ((noinline))
+int parse_icmpv6(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{
+ struct icmp6hdr *icmp_hdr;
+ struct ipv6hdr *ip6h;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end)
+ return XDP_DROP;
+ if (icmp_hdr->icmp6_type == 128)
+ return send_icmp6_reply(data, data_end);
+ if (icmp_hdr->icmp6_type != 3)
+ return XDP_PASS;
+ off += sizeof(struct icmp6hdr);
+ ip6h = data + off; /* IPv6 header carried inside the ICMPv6 message */
+ if (ip6h + 1 > data_end)
+ return XDP_DROP;
+ pckt->flow.proto = ip6h->nexthdr;
+ pckt->flags |= (1 << 0);
+ memcpy(pckt->flow.srcv6, ip6h->daddr.in6_u.u6_addr32, 16);
+ memcpy(pckt->flow.dstv6, ip6h->saddr.in6_u.u6_addr32, 16);
+ return -1;
+}
+/*
+ * Inspect the ICMP header at @data + @off.  Type 8 is answered via
+ * send_icmp_reply(); anything other than type 3 / code 4 returns XDP_PASS.
+ * For type 3 / code 4 the IPv4 header embedded after the ICMP header is
+ * used to fill @pckt with source and destination swapped, bit 0 of
+ * pckt->flags is set and -1 is returned.  Returns XDP_DROP when a header
+ * does not fit before @data_end or the embedded header has ihl != 5.
+ */
+static __attribute__ ((noinline))
+int parse_icmp(void *data, void *data_end, __u64 off,
+ struct packet_description *pckt)
+{
+ struct icmphdr *icmp_hdr;
+ struct iphdr *iph;
+
+ icmp_hdr = data + off;
+ if (icmp_hdr + 1 > data_end)
+ return XDP_DROP;
+ if (icmp_hdr->type == 8)
+ return send_icmp_reply(data, data_end);
+ if ((icmp_hdr->type != 3) || (icmp_hdr->code != 4))
+ return XDP_PASS;
+ off += sizeof(struct icmphdr);
+ iph = data + off; /* IPv4 header carried inside the ICMP message */
+ if (iph + 1 > data_end)
+ return XDP_DROP;
+ if (iph->ihl != 5) /* only headers without options are accepted */
+ return XDP_DROP;
+ pckt->flow.proto = iph->protocol;
+ pckt->flags |= (1 << 0);
+ pckt->flow.src = iph->daddr;
+ pckt->flow.dst = iph->saddr;
+ return -1;
+}
+
+/*
+ * Hash the flow in @pckt.  With @hash_16bytes the 16-byte source address
+ * is first reduced with jhash(); otherwise the 32-bit source address is
+ * used directly.  The result is combined with the ports by jhash_2words().
+ */
+static __attribute__ ((noinline))
+__u32 get_packet_hash(struct packet_description *pckt,
+		      bool hash_16bytes)
+{
+	__u32 src_word;
+
+	if (hash_16bytes)
+		src_word = jhash(pckt->flow.srcv6, 16, 12);
+	else
+		src_word = pckt->flow.src;
+
+	return jhash_2words(src_word, pckt->flow.ports, 24);
+}
+/* Choose a backend for @pckt and store it in *@real.
+ *
+ * Returns 0 when the packet hash is not one of the two expected constants or
+ * when the ch_rings / reals lookups fail, and 1 otherwise. When bit 1 of
+ * @vip_info->flags is clear and no early return is taken, the chosen
+ * position is also written to @lru_map under the packet's flow.
+ */
+__attribute__ ((noinline))
+static bool get_packet_dst(struct real_definition **real,
+ struct packet_description *pckt,
+ struct vip_meta *vip_info,
+ bool is_ipv6, void *lru_map)
+{
+ struct real_pos_lru new_dst_lru = { };
+ bool hash_16bytes = is_ipv6;
+ __u32 *real_pos, hash, key;
+ __u64 cur_time;
+
+ if (vip_info->flags & (1 << 2)) /* vip flag bit 2 forces the 16-byte hash */
+ hash_16bytes = 1;
+ if (vip_info->flags & (1 << 3)) { /* vip flag bit 3: overwrite port16[0] and clear srcv6 before hashing */
+ pckt->flow.port16[0] = pckt->flow.port16[1];
+ memset(pckt->flow.srcv6, 0, 16);
+ }
+ hash = get_packet_hash(pckt, hash_16bytes);
+ if (hash != 0x358459b7 /* jhash of ipv4 packet */ &&
+ hash != 0x2f4bc6bb /* jhash of ipv6 packet */)
+ return 0;
+ key = 2 * vip_info->vip_num + hash % 2; /* two ch_rings slots per vip_num */
+ real_pos = bpf_map_lookup_elem(&ch_rings, &key);
+ if (!real_pos)
+ return 0;
+ key = *real_pos; /* key is reused as the index into reals */
+ *real = bpf_map_lookup_elem(&reals, &key);
+ if (!(*real))
+ return 0;
+ if (!(vip_info->flags & (1 << 1))) {
+ __u32 conn_rate_key = 512 + 2; /* stats slot 514 */
+ struct lb_stats *conn_rate_stats =
+ bpf_map_lookup_elem(&stats, &conn_rate_key);
+
+ if (!conn_rate_stats)
+ return 1;
+ cur_time = bpf_ktime_get_ns();
+ if ((cur_time - conn_rate_stats->v2) >> 32 > 0xffFFFF) { /* v2 holds the timestamp of the last reset */
+ conn_rate_stats->v1 = 1;
+ conn_rate_stats->v2 = cur_time;
+ } else {
+ conn_rate_stats->v1 += 1;
+ if (conn_rate_stats->v1 >= 1) /* NOTE(review): holds right after the increment unless v1 wrapped to 0 - confirm intended */
+ return 1;
+ }
+ if (pckt->flow.proto == IPPROTO_UDP) /* only UDP entries carry an access time */
+ new_dst_lru.atime = cur_time;
+ new_dst_lru.pos = key;
+ bpf_map_update_elem(lru_map, &pckt->flow, &new_dst_lru, 0);
+ }
+ return 1;
+}
+/* Look up the flow of @pckt in @lru_map and, on a hit, resolve the stored
+ * position through the reals map into *@real.
+ *
+ * *@real is left untouched when there is no entry, or when a UDP entry's
+ * atime is more than 300000 bpf_ktime_get_ns() units old. For UDP hits the
+ * entry's atime is refreshed.
+ */
+__attribute__ ((noinline))
+static void connection_table_lookup(struct real_definition **real,
+ struct packet_description *pckt,
+ void *lru_map)
+{
+
+ struct real_pos_lru *dst_lru;
+ __u64 cur_time;
+ __u32 key;
+
+ dst_lru = bpf_map_lookup_elem(lru_map, &pckt->flow);
+ if (!dst_lru)
+ return;
+ if (pckt->flow.proto == IPPROTO_UDP) {
+ cur_time = bpf_ktime_get_ns();
+ if (cur_time - dst_lru->atime > 300000) /* stale UDP entry: treat as a miss */
+ return;
+ dst_lru->atime = cur_time;
+ }
+ key = dst_lru->pos;
+ *real = bpf_map_lookup_elem(&reals, &key); /* may store NULL if the position is not in reals */
+}
+
+/* don't believe your eyes!
+ * below function has 6 arguments whereas bpf and llvm allow maximum of 5
+ * but since it's _static_ llvm can optimize one argument away
+ *
+ * Parses the IPv6 header at @data + @off: stores nexthdr in *@protocol and
+ * @pckt->flow.proto, and the payload length (host order) in *@pkt_bytes.
+ * Returns XDP_DROP for a truncated header or nexthdr 45, the parse_icmpv6()
+ * verdict when that is >= 0 (nexthdr 59), and -1 when the caller should
+ * keep processing the packet.
+ *
+ * NOTE(review): 45 and 59 are one above the IANA numbers of the IPv6
+ * fragment header (44) and ICMPv6 (58) - confirm whether this is intended.
+ */
+__attribute__ ((noinline))
+static int process_l3_headers_v6(struct packet_description *pckt,
+ __u8 *protocol, __u64 off,
+ __u16 *pkt_bytes, void *data,
+ void *data_end)
+{
+ struct ipv6hdr *ip6h;
+ __u64 iph_len;
+ int action;
+
+ ip6h = data + off;
+ if (ip6h + 1 > data_end) /* bounds check before any field access */
+ return XDP_DROP;
+ iph_len = sizeof(struct ipv6hdr);
+ *protocol = ip6h->nexthdr;
+ pckt->flow.proto = *protocol;
+ *pkt_bytes = bpf_ntohs(ip6h->payload_len);
+ off += iph_len; /* off now points just past the fixed IPv6 header */
+ if (*protocol == 45) {
+ return XDP_DROP;
+ } else if (*protocol == 59) {
+ action = parse_icmpv6(data, data_end, off, pckt);
+ if (action >= 0) /* a real XDP verdict; -1 means continue */
+ return action;
+ } else {
+ memcpy(pckt->flow.srcv6, ip6h->saddr.in6_u.u6_addr32, 16);
+ memcpy(pckt->flow.dstv6, ip6h->daddr.in6_u.u6_addr32, 16);
+ }
+ return -1;
+}
+/* Parses the IPv4 header at @data + @off: stores the protocol in *@protocol
+ * and @pckt->flow.proto, and tot_len (host order) in *@pkt_bytes.
+ *
+ * Returns XDP_DROP for a truncated header, for ihl != 5 or when
+ * (frag_off & 65343) is non-zero; the parse_icmp() verdict when that is
+ * >= 0; and -1 when the caller should keep processing the packet.
+ */
+__attribute__ ((noinline))
+static int process_l3_headers_v4(struct packet_description *pckt,
+ __u8 *protocol, __u64 off,
+ __u16 *pkt_bytes, void *data,
+ void *data_end)
+{
+ struct iphdr *iph;
+ __u64 iph_len; /* not referenced below */
+ int action;
+
+ iph = data + off;
+ if (iph + 1 > data_end) /* bounds check before any field access */
+ return XDP_DROP;
+ if (iph->ihl != 5)
+ return XDP_DROP;
+ *protocol = iph->protocol;
+ pckt->flow.proto = *protocol;
+ *pkt_bytes = bpf_ntohs(iph->tot_len);
+ off += 20; /* ihl == 5 was checked above, so the header is 20 bytes */
+ if (iph->frag_off & 65343) /* 65343 == 0xFF3F; presumably MF bit + fragment offset as seen on a little-endian host - confirm */
+ return XDP_DROP;
+ if (*protocol == IPPROTO_ICMP) {
+ action = parse_icmp(data, data_end, off, pckt);
+ if (action >= 0) /* a real XDP verdict; -1 means continue */
+ return action;
+ } else {
+ pckt->flow.src = iph->saddr;
+ pckt->flow.dst = iph->daddr;
+ }
+ return -1;
+}
+/* Core load-balancing path for one packet.
+ *
+ * Parses the L3 header at @data + @off and then the TCP/UDP header, looks
+ * the VIP up in vip_map, picks a real via the LRU map or get_packet_dst(),
+ * encapsulates with encap_v6()/encap_v4() and updates the stats map.
+ *
+ * Returns the L3 parser's verdict when it is >= 0, XDP_TX for protocols
+ * other than TCP/UDP, XDP_PASS when no VIP matches, and XDP_DROP on every
+ * other path - including the final one after a successful encapsulation.
+ */
+__attribute__ ((noinline))
+static int process_packet(void *data, __u64 off, void *data_end,
+ bool is_ipv6, struct xdp_md *xdp)
+{
+
+ struct real_definition *dst = NULL;
+ struct packet_description pckt = { };
+ struct vip_definition vip = { };
+ struct lb_stats *data_stats;
+ struct eth_hdr *eth = data; /* not referenced below */
+ void *lru_map = &lru_cache;
+ struct vip_meta *vip_info;
+ __u32 lru_stats_key = 513;
+ __u32 mac_addr_pos = 0;
+ __u32 stats_key = 512;
+ struct ctl_value *cval;
+ __u16 pkt_bytes;
+ __u64 iph_len; /* not referenced below */
+ __u8 protocol;
+ __u32 vip_num;
+ int action;
+
+ if (is_ipv6)
+ action = process_l3_headers_v6(&pckt, &protocol, off,
+ &pkt_bytes, data, data_end);
+ else
+ action = process_l3_headers_v4(&pckt, &protocol, off,
+ &pkt_bytes, data, data_end);
+ if (action >= 0) /* the L3 parser already produced a verdict */
+ return action;
+ protocol = pckt.flow.proto; /* the ICMP parsers may have replaced it with the embedded header's protocol */
+ if (protocol == IPPROTO_TCP) {
+ if (!parse_tcp(data, data_end, is_ipv6, &pckt))
+ return XDP_DROP;
+ } else if (protocol == IPPROTO_UDP) {
+ if (!parse_udp(data, data_end, is_ipv6, &pckt))
+ return XDP_DROP;
+ } else {
+ return XDP_TX;
+ }
+
+ if (is_ipv6)
+ memcpy(vip.vipv6, pckt.flow.dstv6, 16);
+ else
+ vip.vip = pckt.flow.dst;
+ vip.port = pckt.flow.port16[1];
+ vip.proto = pckt.flow.proto;
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info) {
+ vip.port = 0; /* retry the lookup with port 0 */
+ vip_info = bpf_map_lookup_elem(&vip_map, &vip);
+ if (!vip_info)
+ return XDP_PASS;
+ if (!(vip_info->flags & (1 << 4))) /* unless vip flag bit 4 is set, clear port16[1] */
+ pckt.flow.port16[1] = 0;
+ }
+ if (data_end - data > 1400) /* frames longer than 1400 bytes are dropped */
+ return XDP_DROP;
+ data_stats = bpf_map_lookup_elem(&stats, &stats_key);
+ if (!data_stats)
+ return XDP_DROP;
+ data_stats->v1 += 1;
+ if (!dst) { /* always true here: dst is still NULL */
+ if (vip_info->flags & (1 << 0)) /* vip flag bit 0 clears port16[0] */
+ pckt.flow.port16[0] = 0;
+ if (!(pckt.flags & (1 << 1)) && !(vip_info->flags & (1 << 1)))
+ connection_table_lookup(&dst, &pckt, lru_map);
+ if (dst) /* LRU hit: skip get_packet_dst() */
+ goto out;
+ if (pckt.flow.proto == IPPROTO_TCP) {
+ struct lb_stats *lru_stats =
+ bpf_map_lookup_elem(&stats, &lru_stats_key);
+
+ if (!lru_stats)
+ return XDP_DROP;
+ if (pckt.flags & (1 << 1))
+ lru_stats->v1 += 1;
+ else
+ lru_stats->v2 += 1;
+ }
+ if (!get_packet_dst(&dst, &pckt, vip_info, is_ipv6, lru_map))
+ return XDP_DROP;
+ data_stats->v2 += 1;
+ }
+out:
+ cval = bpf_map_lookup_elem(&ctl_array, &mac_addr_pos);
+ if (!cval)
+ return XDP_DROP;
+ if (dst->flags & (1 << 0)) { /* real flag bit 0 selects encap_v6() */
+ if (!encap_v6(xdp, cval, &pckt, dst, pkt_bytes))
+ return XDP_DROP;
+ } else {
+ if (!encap_v4(xdp, cval, &pckt, dst, pkt_bytes))
+ return XDP_DROP;
+ }
+ vip_num = vip_info->vip_num;
+ data_stats = bpf_map_lookup_elem(&stats, &vip_num); /* per-VIP counters live at index vip_num */
+ if (!data_stats)
+ return XDP_DROP;
+ data_stats->v1 += 1;
+ data_stats->v2 += pkt_bytes;
+
+ data = (void *)(long)xdp->data; /* re-read the packet pointers from xdp */
+ data_end = (void *)(long)xdp->data_end;
+ if (data + 4 > data_end)
+ return XDP_DROP;
+ *(u32 *)data = dst->dst; /* stamp the chosen real's dst into the first 4 bytes */
+ return XDP_DROP;
+}
+/* XDP entry point: hands frames whose eth_proto is ETH_P_IP to
+ * process_packet() (is_ipv6 = 0) and drops everything else, including
+ * frames shorter than struct eth_hdr.
+ */
+SEC("xdp-test-v4")
+int balancer_ingress_v4(struct xdp_md *ctx)
+{
+ void *data = (void *)(long)ctx->data;
+ void *data_end = (void *)(long)ctx->data_end;
+ struct eth_hdr *eth = data;
+ __u32 eth_proto;
+ __u32 nh_off;
+
+ nh_off = sizeof(struct eth_hdr);
+ if (data + nh_off > data_end) /* bounds check before reading eth->eth_proto */
+ return XDP_DROP;
+ eth_proto = bpf_ntohs(eth->eth_proto);
+ if (eth_proto == ETH_P_IP)
+ return process_packet(data, nh_off, data_end, 0, ctx);
+ else
+ return XDP_DROP;
+}
+/* XDP entry point: hands frames whose eth_proto is ETH_P_IPV6 to
+ * process_packet() (is_ipv6 = 1) and drops everything else, including
+ * frames shorter than struct eth_hdr.
+ */
+SEC("xdp-test-v6")
+int balancer_ingress_v6(struct xdp_md *ctx)
+{
+ void *data = (void *)(long)ctx->data;
+ void *data_end = (void *)(long)ctx->data_end;
+ struct eth_hdr *eth = data;
+ __u32 eth_proto;
+ __u32 nh_off;
+
+ nh_off = sizeof(struct eth_hdr);
+ if (data + nh_off > data_end) /* bounds check before reading eth->eth_proto */
+ return XDP_DROP;
+ eth_proto = bpf_ntohs(eth->eth_proto);
+ if (eth_proto == ETH_P_IPV6)
+ return process_packet(data, nh_off, data_end, 1, ctx);
+ else
+ return XDP_DROP;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_redirect.c
new file mode 100644
index 000000000..a5337cd94
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_redirect.c
@@ -0,0 +1,28 @@
+/* Copyright (c) 2017 VMware
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int _version SEC("version") = 1;
+/* Redirect every packet to ifindex 111 (flags 0). */
+SEC("redirect_to_111")
+int xdp_redirect_to_111(struct xdp_md *xdp)
+{
+ return bpf_redirect(111, 0);
+}
+SEC("redirect_to_222") /* redirect every packet to ifindex 222 (flags 0) */
+int xdp_redirect_to_222(struct xdp_md *xdp)
+{
+ return bpf_redirect(222, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_vlan.c b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
new file mode 100644
index 000000000..134768f6b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_vlan.c
@@ -0,0 +1,292 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright(c) 2018 Jesper Dangaard Brouer.
+ *
+ * XDP/TC VLAN manipulation example
+ *
+ * GOTCHA: Remember to disable NIC hardware offloading of VLANs,
+ * else the VLAN tags are NOT inlined in the packet payload:
+ *
+ * # ethtool -K ixgbe2 rxvlan off
+ *
+ * Verify setting:
+ * # ethtool -k ixgbe2 | grep rx-vlan-offload
+ * rx-vlan-offload: off
+ *
+ */
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+/* linux/if_vlan.h does not expose this as UAPI, so mirror the needed parts here
+ *
+ * struct vlan_hdr - vlan header
+ * @h_vlan_TCI: priority and VLAN ID
+ * @h_vlan_encapsulated_proto: packet type ID or len
+ */
+struct _vlan_hdr {
+ __be16 h_vlan_TCI;
+ __be16 h_vlan_encapsulated_proto;
+};
+#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */
+#define VLAN_PRIO_SHIFT 13
+#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */
+#define VLAN_TAG_PRESENT VLAN_CFI_MASK
+#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
+#define VLAN_N_VID 4096
+/* Result of parse_eth_frame(); callers zero-initialise it first. */
+struct parse_pkt {
+ __u16 l3_proto; /* ethertype after any VLAN tags, host byte order */
+ __u16 l3_offset; /* byte offset of the L3 header from the Ethernet header */
+ __u16 vlan_outer; /* VLAN ID of the outer tag (TCI & VLAN_VID_MASK) */
+ __u16 vlan_inner; /* VLAN ID of the inner tag (TCI & VLAN_VID_MASK) */
+ __u8 vlan_outer_offset; /* byte offset of the outer tag; stays 0 when absent */
+ __u8 vlan_inner_offset; /* byte offset of the inner tag; stays 0 when absent */
+};
+
+char _license[] SEC("license") = "GPL";
+/* Parse the Ethernet header and up to two VLAN tags starting at @eth.
+ *
+ * Fills @pkt with the VLAN ID and byte offset of any outer/inner tag, the
+ * L3 protocol in host byte order and the offset of the L3 header. Fields of
+ * a tag that is not present are left as the caller initialised them.
+ * Returns false when the Ethernet header plus two VLAN headers would not
+ * fit before @data_end, true otherwise.
+ */
+static __always_inline
+bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt)
+{
+ __u16 eth_type;
+ __u8 offset;
+
+ offset = sizeof(*eth);
+ /* Make sure packet is large enough for parsing eth + 2 VLAN headers */
+ if ((void *)eth + offset + (2*sizeof(struct _vlan_hdr)) > data_end)
+ return false;
+
+ eth_type = eth->h_proto; /* kept in network byte order until the end */
+
+ /* Handle outer VLAN tag */
+ if (eth_type == bpf_htons(ETH_P_8021Q)
+ || eth_type == bpf_htons(ETH_P_8021AD)) {
+ struct _vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ pkt->vlan_outer_offset = offset;
+ pkt->vlan_outer = bpf_ntohs(vlan_hdr->h_vlan_TCI)
+ & VLAN_VID_MASK;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ offset += sizeof(*vlan_hdr);
+ }
+
+ /* Handle inner (double) VLAN tag */
+ if (eth_type == bpf_htons(ETH_P_8021Q)
+ || eth_type == bpf_htons(ETH_P_8021AD)) {
+ struct _vlan_hdr *vlan_hdr;
+
+ vlan_hdr = (void *)eth + offset;
+ pkt->vlan_inner_offset = offset;
+ pkt->vlan_inner = bpf_ntohs(vlan_hdr->h_vlan_TCI)
+ & VLAN_VID_MASK;
+ eth_type = vlan_hdr->h_vlan_encapsulated_proto;
+ offset += sizeof(*vlan_hdr);
+ }
+
+ pkt->l3_proto = bpf_ntohs(eth_type); /* Convert to host-byte-order */
+ pkt->l3_offset = offset;
+
+ return true;
+}
+
+/* Hint, VLANs are chosen to hit network-byte-order issues */
+#define TESTVLAN 4011 /* 0xFAB */
+// #define TO_VLAN 4000 /* 0xFA0 (hint 0x0A0 = 160) */
+/* Return XDP_ABORTED for frames that are too short to parse or whose outer
+ * VLAN ID equals TESTVLAN; pass everything else.
+ */
+SEC("xdp_drop_vlan_4011")
+int xdp_prognum0(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Drop specific VLAN ID example */
+ if (pkt.vlan_outer == TESTVLAN)
+ return XDP_ABORTED;
+ /*
+ * Using XDP_ABORTED makes it possible to record this event,
+ * via tracepoint xdp:xdp_exception like:
+ * # perf record -a -e xdp:xdp_exception
+ * # perf script
+ */
+ return XDP_PASS;
+}
+/*
+Commands to set up a VLAN on Linux to test that packets get dropped:
+
+ export ROOTDEV=ixgbe2
+ export VLANID=4011
+ ip link add link $ROOTDEV name $ROOTDEV.$VLANID type vlan id $VLANID
+ ip link set dev $ROOTDEV.$VLANID up
+
+ ip link set dev $ROOTDEV mtu 1508
+ ip addr add 100.64.40.11/24 dev $ROOTDEV.$VLANID
+
+Load prog with ip tool:
+
+ ip link set $ROOTDEV xdp off
+ ip link set $ROOTDEV xdp object xdp_vlan01_kern.o section xdp_drop_vlan_4011
+
+*/
+
+/* Changing the VLAN ID to zero has the same practical effect as removing the VLAN. */
+#define TO_VLAN 0
+/* Rewrite the outer VLAN ID from TESTVLAN to TO_VLAN in place and pass the
+ * packet; frames that are too short to parse return XDP_ABORTED.
+ */
+SEC("xdp_vlan_change")
+int xdp_prognum1(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Change specific VLAN ID */
+ if (pkt.vlan_outer == TESTVLAN) {
+ struct _vlan_hdr *vlan_hdr = data + pkt.vlan_outer_offset;
+
+ /* Modifying VLAN, preserve top 4 bits */
+ vlan_hdr->h_vlan_TCI =
+ bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000)
+ | TO_VLAN);
+ }
+
+ return XDP_PASS;
+}
+
+/*
+ * Show XDP+TC can cooperate, on creating a VLAN rewriter.
+ * 1. Create a XDP prog that can "pop"/remove a VLAN header.
+ * 2. Create a TC-bpf prog that egress can add a VLAN header.
+ */
+
+#ifndef ETH_ALEN /* Ethernet MAC address length */
+#define ETH_ALEN 6 /* bytes */
+#endif
+#define VLAN_HDR_SZ 4 /* bytes */
+/* Pop the outer VLAN tag: move both MAC addresses VLAN_HDR_SZ bytes forward
+ * with memmove, then advance the packet start by the same amount. Frames
+ * without an outer tag are passed unchanged; unparsable frames return
+ * XDP_ABORTED.
+ */
+SEC("xdp_vlan_remove_outer")
+int xdp_prognum2(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct parse_pkt pkt = { 0 };
+ char *dest;
+
+ if (!parse_eth_frame(data, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Skip packet if no outer VLAN was detected */
+ if (pkt.vlan_outer_offset == 0)
+ return XDP_PASS;
+
+ /* Moving Ethernet header, dest overlap with src, memmove handle this */
+ dest = data;
+ dest+= VLAN_HDR_SZ;
+ /*
+ * Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by
+ * only moving two MAC addrs (12 bytes), not overwriting last 2 bytes
+ */
+ __builtin_memmove(dest, data, ETH_ALEN * 2);
+ /* Note: LLVM built-in memmove inlining require size to be constant */
+
+ /* Move start of packet header seen by Linux kernel stack */
+ bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ); /* return value is not checked */
+
+ return XDP_PASS;
+}
+/* Shift the first 12 bytes at @data (six 16-bit words) forward by 4 bytes
+ * using 16-bit stores, highest word first so no source word is overwritten
+ * before it is read. Not called in the visible part of this file.
+ */
+static __always_inline
+void shift_mac_4bytes_16bit(void *data)
+{
+ __u16 *p = data;
+
+ p[7] = p[5]; /* delete p[7] was vlan_hdr->h_vlan_TCI */
+ p[6] = p[4]; /* delete p[6] was ethhdr->h_proto */
+ p[5] = p[3];
+ p[4] = p[2];
+ p[3] = p[1];
+ p[2] = p[0];
+}
+/* Shift the first 12 bytes at @data (three 32-bit words) forward by 4 bytes
+ * using 32-bit stores, highest word first so no source word is overwritten
+ * before it is read.
+ */
+static __always_inline
+void shift_mac_4bytes_32bit(void *data)
+{
+ __u32 *p = data;
+
+ /* Assuming VLAN hdr present. The 4 bytes in p[3] that gets
+ * overwritten, is ethhdr->h_proto and vlan_hdr->h_vlan_TCI.
+ * The vlan_hdr->h_vlan_encapsulated_proto take over role as
+ * ethhdr->h_proto.
+ */
+ p[3] = p[2];
+ p[2] = p[1];
+ p[1] = p[0];
+}
+/* Same effect as xdp_vlan_remove_outer, but the MAC addresses are moved
+ * with shift_mac_4bytes_32bit() instead of memmove.
+ */
+SEC("xdp_vlan_remove_outer2")
+int xdp_prognum3(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *orig_eth = data;
+ struct parse_pkt pkt = { 0 };
+
+ if (!parse_eth_frame(orig_eth, data_end, &pkt))
+ return XDP_ABORTED;
+
+ /* Skip packet if no outer VLAN was detected */
+ if (pkt.vlan_outer_offset == 0)
+ return XDP_PASS;
+
+ /* Simply shift down MAC addrs 4 bytes, overwrite h_proto + TCI */
+ shift_mac_4bytes_32bit(data);
+
+ /* Move start of packet header seen by Linux kernel stack */
+ bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ); /* return value is not checked */
+
+ return XDP_PASS;
+}
+
+/*=====================================
+ * BELOW: TC-hook based ebpf programs
+ * ====================================
+ * The TC-clsact eBPF programs (currently) need to be attached via TC commands
+ */
+/* TC program: push an 802.1Q tag with VLAN ID TESTVLAN onto every skb. */
+SEC("tc_vlan_push")
+int _tc_progA(struct __sk_buff *ctx)
+{
+ bpf_skb_vlan_push(ctx, bpf_htons(ETH_P_8021Q), TESTVLAN); /* return value is not checked */
+
+ return TC_ACT_OK;
+}
+/*
+Commands to setup TC to use above bpf prog:
+
+export ROOTDEV=ixgbe2
+export FILE=xdp_vlan01_kern.o
+
+# Re-attach clsact to clear/flush existing role
+tc qdisc del dev $ROOTDEV clsact 2> /dev/null ;\
+tc qdisc add dev $ROOTDEV clsact
+
+# Attach BPF prog EGRESS
+tc filter add dev $ROOTDEV egress \
+ prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
+
+tc filter show dev $ROOTDEV egress
+*/
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
new file mode 100644
index 000000000..59ee4f182
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+#define IFINDEX_LO 1
+/* CPUMAP with 4 entries; values are struct bpf_cpumap_val. */
+struct {
+ __uint(type, BPF_MAP_TYPE_CPUMAP);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_cpumap_val));
+ __uint(max_entries, 4);
+} cpu_map SEC(".maps");
+/* Redirect every packet through cpu_map entry 1. */
+SEC("xdp_redir")
+int xdp_redir_prog(struct xdp_md *ctx)
+{
+ return bpf_redirect_map(&cpu_map, 1, 0);
+}
+/* Plain XDP program that passes every packet. */
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+/* Program in the xdp_cpumap section: drops packets whose ingress_ifindex is
+ * IFINDEX_LO and passes all others.
+ */
+SEC("xdp_cpumap/dummy_cm")
+int xdp_dummy_cm(struct xdp_md *ctx)
+{
+ if (ctx->ingress_ifindex == IFINDEX_LO)
+ return XDP_DROP;
+
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
new file mode 100644
index 000000000..0ac086497
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/* DEVMAP with 4 entries; values are struct bpf_devmap_val. */
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(key_size, sizeof(__u32));
+ __uint(value_size, sizeof(struct bpf_devmap_val));
+ __uint(max_entries, 4);
+} dm_ports SEC(".maps");
+/* Redirect every packet through dm_ports entry 1. */
+SEC("xdp_redir")
+int xdp_redir_prog(struct xdp_md *ctx)
+{
+ return bpf_redirect_map(&dm_ports, 1, 0);
+}
+
+/* invalid program on DEVMAP entry;
+ * SEC name means expected attach type not set.
+ * The program itself simply passes every packet.
+ */
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+
+/* valid program on DEVMAP entry via SEC name;
+ * has access to egress and ingress ifindex.
+ * Logs both ifindexes and the packet length via bpf_trace_printk(), then
+ * passes the packet.
+ */
+SEC("xdp_devmap/map_prog")
+int xdp_dummy_dm(struct xdp_md *ctx)
+{
+ char fmt[] = "devmap redirect: dev %u -> dev %u len %u\n";
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ unsigned int len = data_end - data; /* packet length in bytes */
+
+ bpf_trace_printk(fmt, sizeof(fmt),
+ ctx->ingress_ifindex, ctx->egress_ifindex, len);
+
+ return XDP_PASS;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/trace_printk.c b/tools/testing/selftests/bpf/progs/trace_printk.c
new file mode 100644
index 000000000..8ca7f399b
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/trace_printk.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020, Oracle and/or its affiliates.
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+/* Globals written by sys_enter() below. */
+int trace_printk_ret = 0; /* last return value of bpf_trace_printk() */
+int trace_printk_ran = 0; /* incremented on every sys_enter() run */
+/* On every raw_syscalls/sys_enter tracepoint hit, bump trace_printk_ran,
+ * print it with bpf_trace_printk() and record the helper's return value in
+ * trace_printk_ret.
+ */
+SEC("tp/raw_syscalls/sys_enter")
+int sys_enter(void *ctx)
+{
+ static const char fmt[] = "testing,testing %d\n";
+
+ trace_printk_ret = bpf_trace_printk(fmt, sizeof(fmt),
+ ++trace_printk_ran);
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c
new file mode 100644
index 000000000..9a4d09590
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/trigger_bench.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+
+#include <linux/bpf.h>
+#include <asm/unistd.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+
+char _license[] SEC("license") = "GPL";
+/* Shared counter incremented atomically by every bench_trigger_* program. */
+long hits = 0;
+/* Tracepoint variant: count each sys_enter_getpgid hit. */
+SEC("tp/syscalls/sys_enter_getpgid")
+int bench_trigger_tp(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+/* Raw tracepoint variant: runs on every sys_enter, counts only getpgid. */
+SEC("raw_tp/sys_enter")
+int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
+{
+ if (id == __NR_getpgid) /* filter on the syscall number */
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+/* kprobe variant: count each __x64_sys_getpgid entry. */
+SEC("kprobe/__x64_sys_getpgid")
+int bench_trigger_kprobe(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+/* fentry variant: count each __x64_sys_getpgid entry. */
+SEC("fentry/__x64_sys_getpgid")
+int bench_trigger_fentry(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+/* fentry.s (sleepable section) variant: count each __x64_sys_getpgid entry. */
+SEC("fentry.s/__x64_sys_getpgid")
+int bench_trigger_fentry_sleep(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return 0;
+}
+/* fmod_ret variant: count each __x64_sys_getpgid call and return -22. */
+SEC("fmod_ret/__x64_sys_getpgid")
+int bench_trigger_fmodret(void *ctx)
+{
+ __sync_add_and_fetch(&hits, 1);
+ return -22; /* numerically -EINVAL on Linux */
+}
diff --git a/tools/testing/selftests/bpf/progs/udp_limit.c b/tools/testing/selftests/bpf/progs/udp_limit.c
new file mode 100644
index 000000000..165e3c2dd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/udp_limit.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <sys/socket.h>
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+
+int invocations = 0, in_use = 0;
+/* Per-socket storage holding one int; sock() tags it with 0xdeadbeef. */
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, int);
+} sk_map SEC(".maps");
+
+/* cgroup/sock_create hook: allow at most one datagram socket at a time.
+ *
+ * Non-datagram sockets are always allowed (return 1). For datagram sockets
+ * the sk_map storage is tagged with 0xdeadbeef and invocations is bumped;
+ * creation is denied (return 0) when the storage cannot be obtained or when
+ * in_use is already positive, otherwise in_use is incremented and 1 is
+ * returned.
+ */
+SEC("cgroup/sock_create")
+int sock(struct bpf_sock *ctx)
+{
+	int *sk_storage;
+
+	if (ctx->type != SOCK_DGRAM)
+		return 1;
+
+	sk_storage = bpf_sk_storage_get(&sk_map, ctx, 0,
+					BPF_SK_STORAGE_GET_F_CREATE);
+	if (!sk_storage)
+		return 0;
+	/* Tag the socket so sock_release() can recognise it. */
+	*sk_storage = 0xdeadbeef;
+
+	__sync_fetch_and_add(&invocations, 1);
+
+	if (in_use > 0) {
+		/* BPF_CGROUP_INET_SOCK_RELEASE is _not_ called
+		 * when we return an error from the BPF
+		 * program!
+		 */
+		return 0;
+	}
+
+	__sync_fetch_and_add(&in_use, 1);
+	return 1;
+}
+
+/* cgroup/sock_release hook: undo the accounting done by sock().
+ *
+ * Non-datagram sockets return 1 immediately. For datagram sockets the hook
+ * returns 0 unless the sk_map storage exists and carries the 0xdeadbeef tag;
+ * otherwise it bumps invocations, decrements in_use and returns 1.
+ */
+SEC("cgroup/sock_release")
+int sock_release(struct bpf_sock *ctx)
+{
+	int *sk_storage;
+
+	if (ctx->type != SOCK_DGRAM)
+		return 1;
+
+	/* No create flag: only sockets already tagged by sock() have storage. */
+	sk_storage = bpf_sk_storage_get(&sk_map, ctx, 0, 0);
+	if (!sk_storage || *sk_storage != 0xdeadbeef)
+		return 0;
+
+	__sync_fetch_and_add(&invocations, 1);
+	__sync_fetch_and_add(&in_use, -1);
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/xdp_dummy.c b/tools/testing/selftests/bpf/progs/xdp_dummy.c
new file mode 100644
index 000000000..ea25e8881
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_dummy.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define KBUILD_MODNAME "xdp_dummy"
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/* Minimal XDP program: pass every packet. */
+SEC("xdp_dummy")
+int xdp_dummy_prog(struct xdp_md *ctx)
+{
+ return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_redirect_map.c b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c
new file mode 100644
index 000000000..d037262c8
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_redirect_map.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/* DEVMAP with 8 int-keyed, int-valued entries used by the programs below. */
+struct {
+ __uint(type, BPF_MAP_TYPE_DEVMAP);
+ __uint(max_entries, 8);
+ __uint(key_size, sizeof(int));
+ __uint(value_size, sizeof(int));
+} tx_port SEC(".maps");
+/* Redirect every packet through tx_port entry 0. */
+SEC("redirect_map_0")
+int xdp_redirect_map_0(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 0, 0);
+}
+/* Redirect every packet through tx_port entry 1. */
+SEC("redirect_map_1")
+int xdp_redirect_map_1(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 1, 0);
+}
+/* Redirect every packet through tx_port entry 2. */
+SEC("redirect_map_2")
+int xdp_redirect_map_2(struct xdp_md *xdp)
+{
+ return bpf_redirect_map(&tx_port, 2, 0);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdp_tx.c b/tools/testing/selftests/bpf/progs/xdp_tx.c
new file mode 100644
index 000000000..5f725c720
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdp_tx.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <bpf/bpf_helpers.h>
+/* Minimal XDP program: return XDP_TX for every packet. */
+SEC("xdp")
+int xdp_tx(struct xdp_md *xdp)
+{
+ return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/progs/xdping_kern.c b/tools/testing/selftests/bpf/progs/xdping_kern.c
new file mode 100644
index 000000000..6b9ca40bd
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xdping_kern.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#define KBUILD_MODNAME "foo"
+#include <stddef.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <linux/icmp.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+#include "xdping.h"
+/* Hash map of up to 256 struct pinginfo entries; xdping_client() looks
+ * entries up by the IPv4 source address of the received reply.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 256);
+ __type(key, __u32);
+ __type(value, struct pinginfo);
+} ping_map SEC(".maps");
+/* Swap the first two 6-byte fields at @data (destination and source MAC of
+ * an Ethernet header) using 16-bit loads and stores.
+ */
+static __always_inline void swap_src_dst_mac(void *data)
+{
+ unsigned short *p = data;
+ unsigned short dst[3];
+
+ dst[0] = p[0]; /* save the first 6 bytes */
+ dst[1] = p[1];
+ dst[2] = p[2];
+ p[0] = p[3]; /* second 6 bytes -> first 6 bytes */
+ p[1] = p[4];
+ p[2] = p[5];
+ p[3] = dst[0]; /* saved bytes -> second 6 bytes */
+ p[4] = dst[1];
+ p[5] = dst[2];
+}
+/* Fold the bits above bit 15 of @sum into the low 16 bits (two rounds) and
+ * return the bitwise complement, truncated to 16 bits.
+ */
+static __always_inline __u16 csum_fold_helper(__wsum sum)
+{
+ sum = (sum & 0xffff) + (sum >> 16);
+ return ~((sum & 0xffff) + (sum >> 16)); /* second round absorbs the carry of the first */
+}
+/* Checksum @data_size bytes starting at @data_start: sum them with
+ * bpf_csum_diff() (empty "from" buffer, seed 0) and fold the result with
+ * csum_fold_helper(). The callers in this file pass the ICMP header.
+ */
+static __always_inline __u16 ipv4_csum(void *data_start, int data_size)
+{
+ __wsum sum;
+
+ sum = bpf_csum_diff(0, 0, data_start, data_size, 0);
+ return csum_fold_helper(sum);
+}
+
+#define ICMP_ECHO_LEN 64
+/* Decide whether the packet in @ctx is an IPv4 ICMP message of @type with
+ * exactly ICMP_ECHO_LEN bytes after the IP header.
+ *
+ * Returns XDP_TX when it is, XDP_PASS otherwise (including packets too
+ * short to hold Ethernet + IP headers + ICMP_ECHO_LEN bytes).
+ * NOTE(review): iph->ihl is not inspected; the offsets assume an
+ * option-less IP header - confirm.
+ */
+static __always_inline int icmp_check(struct xdp_md *ctx, int type)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct icmphdr *icmph;
+ struct iphdr *iph;
+
+ if (data + sizeof(*eth) + sizeof(*iph) + ICMP_ECHO_LEN > data_end) /* one bounds check covers every access below */
+ return XDP_PASS;
+
+ if (eth->h_proto != bpf_htons(ETH_P_IP))
+ return XDP_PASS;
+
+ iph = data + sizeof(*eth);
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return XDP_PASS;
+
+ if (bpf_ntohs(iph->tot_len) - sizeof(*iph) != ICMP_ECHO_LEN)
+ return XDP_PASS;
+
+ icmph = data + sizeof(*eth) + sizeof(*iph);
+
+ if (icmph->type != type)
+ return XDP_PASS;
+
+ return XDP_TX;
+}
+/* Client side of xdping: on a matching ICMP echo reply, record the round
+ * trip time in ping_map and turn the reply into the next echo request.
+ *
+ * Returns whatever icmp_check() returned when that is not XDP_TX, XDP_PASS
+ * when there is no ping_map entry for the sender, the sequence number does
+ * not match or the entry has no room for further samples, and XDP_TX after
+ * rewriting the packet into a new request.
+ */
+SEC("xdpclient")
+int xdping_client(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end;
+ void *data = (void *)(long)ctx->data;
+ struct pinginfo *pinginfo = NULL;
+ struct ethhdr *eth = data;
+ struct icmphdr *icmph;
+ struct iphdr *iph;
+ __u64 recvtime;
+ __be32 raddr;
+ __be16 seq;
+ int ret;
+ __u8 i;
+
+ ret = icmp_check(ctx, ICMP_ECHOREPLY);
+
+ if (ret != XDP_TX)
+ return ret;
+
+ iph = data + sizeof(*eth);
+ icmph = data + sizeof(*eth) + sizeof(*iph);
+ raddr = iph->saddr; /* the remote address doubles as the ping_map key */
+
+ /* Record time reply received. */
+ recvtime = bpf_ktime_get_ns();
+ pinginfo = bpf_map_lookup_elem(&ping_map, &raddr);
+ if (!pinginfo || pinginfo->seq != icmph->un.echo.sequence)
+ return XDP_PASS;
+
+ if (pinginfo->start) {
+#pragma clang loop unroll(full)
+ for (i = 0; i < XDPING_MAX_COUNT; i++) { /* find the first unused times[] slot */
+ if (pinginfo->times[i] == 0)
+ break;
+ }
+ /* verifier is fussy here... */
+ if (i < XDPING_MAX_COUNT) {
+ pinginfo->times[i] = recvtime -
+ pinginfo->start;
+ pinginfo->start = 0;
+ i++; /* i now counts the samples stored so far */
+ }
+ /* No more space for values? */
+ if (i == pinginfo->count || i == XDPING_MAX_COUNT)
+ return XDP_PASS;
+ }
+
+ /* Now convert reply back into echo request. */
+ swap_src_dst_mac(data);
+ iph->saddr = iph->daddr;
+ iph->daddr = raddr;
+ icmph->type = ICMP_ECHO;
+ seq = bpf_htons(bpf_ntohs(icmph->un.echo.sequence) + 1);
+ icmph->un.echo.sequence = seq;
+ icmph->checksum = 0; /* must be zero while the new checksum is computed */
+ icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN);
+
+ pinginfo->seq = seq;
+ pinginfo->start = bpf_ktime_get_ns();
+
+ return XDP_TX;
+}
+/* Server side of xdping: answer a matching ICMP echo request in XDP.
+ *
+ * Returns whatever icmp_check() returned when that is not XDP_TX; otherwise
+ * swaps MAC and IP addresses, rewrites the type to ICMP_ECHOREPLY,
+ * recomputes the ICMP checksum and returns XDP_TX.
+ */
+SEC("xdpserver")
+int xdping_server(struct xdp_md *ctx)
+{
+ void *data_end = (void *)(long)ctx->data_end; /* not referenced below */
+ void *data = (void *)(long)ctx->data;
+ struct ethhdr *eth = data;
+ struct icmphdr *icmph;
+ struct iphdr *iph;
+ __be32 raddr;
+ int ret;
+
+ ret = icmp_check(ctx, ICMP_ECHO);
+
+ if (ret != XDP_TX)
+ return ret;
+
+ iph = data + sizeof(*eth);
+ icmph = data + sizeof(*eth) + sizeof(*iph);
+ raddr = iph->saddr;
+
+ /* Now convert request into echo reply. */
+ swap_src_dst_mac(data);
+ iph->saddr = iph->daddr;
+ iph->daddr = raddr;
+ icmph->type = ICMP_ECHOREPLY;
+ icmph->checksum = 0; /* must be zero while the new checksum is computed */
+ icmph->checksum = ipv4_csum(icmph, ICMP_ECHO_LEN);
+
+ return XDP_TX;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/settings b/tools/testing/selftests/bpf/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/bpf/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py
new file mode 100755
index 000000000..bfff82be3
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_client.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+ buf = b''
+ while len(buf) < n:
+ rem = n - len(buf)
+ try: s = sock.recv(rem)
+ except (socket.error) as e: return b''
+ buf += s
+ return buf
+
+def send(sock, s):
+ total = len(s)
+ count = 0
+ while count < total:
+ try: n = sock.send(s)
+ except (socket.error) as e: n = 0
+ if n == 0:
+ return count;
+ count += n
+ return count
+
+
+serverPort = int(sys.argv[1])
+
+# create active socket
+sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+try:
+ sock.connect(('::1', serverPort))
+except socket.error as e:
+ sys.exit(1)
+
+buf = b''
+n = 0
+while n < 1000:
+ buf += b'+'
+ n += 1
+
+sock.settimeout(1);
+n = send(sock, buf)
+n = read(sock, 500)
+sys.exit(0)
diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py
new file mode 100755
index 000000000..42ab8882f
--- /dev/null
+++ b/tools/testing/selftests/bpf/tcp_server.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+#
+# SPDX-License-Identifier: GPL-2.0
+#
+
+import sys, os, os.path, getopt
+import socket, time
+import subprocess
+import select
+
+def read(sock, n):
+ buf = b''
+ while len(buf) < n:
+ rem = n - len(buf)
+ try: s = sock.recv(rem)
+ except (socket.error) as e: return b''
+ buf += s
+ return buf
+
+def send(sock, s):
+ total = len(s)
+ count = 0
+ while count < total:
+ try: n = sock.send(s)
+ except (socket.error) as e: n = 0
+ if n == 0:
+ return count;
+ count += n
+ return count
+
+
+SERVER_PORT = 12877 # initial value only; the real port is read back after bind()
+MAX_PORTS = 2 # backlog passed to listen()
+
+serverPort = SERVER_PORT
+serverSocket = None
+
+# create passive socket
+serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
+
+try: serverSocket.bind(('::1', 0)) # port 0: let the kernel pick a free port
+except socket.error as msg:
+ print('bind fails: ' + str(msg)) # NOTE(review): execution continues after a failed bind - confirm intended
+
+sn = serverSocket.getsockname()
+serverPort = sn[1] # the port actually bound
+
+cmdStr = ("./tcp_client.py %d &") % (serverPort) # trailing '&' runs the client in the background
+os.system(cmdStr)
+
+buf = b'' # 500-byte reply payload made of '.' characters
+n = 0
+while n < 500:
+ buf += b'.'
+ n += 1
+
+serverSocket.listen(MAX_PORTS)
+readList = [serverSocket] # sockets watched by select(); accepted clients are appended
+
+while True:
+ readyRead, readyWrite, inError = \
+ select.select(readList, [], [], 2)
+
+ if len(readyRead) > 0: # otherwise the 2-second select() timeout expired
+ waitCount = 0 # not referenced anywhere else in this script
+ for sock in readyRead:
+ if sock == serverSocket: # new connection on the listening socket
+ (clientSocket, address) = serverSocket.accept()
+ address = str(address[0])
+ readList.append(clientSocket)
+ else: # data from the client: read the request, send the reply, exit
+ sock.settimeout(1);
+ s = read(sock, 1000)
+ n = send(sock, buf)
+ sock.close()
+ serverSocket.close()
+ sys.exit(0)
+ else:
+ print('Select timeout!')
+ sys.exit(1)
diff --git a/tools/testing/selftests/bpf/test_bpftool.py b/tools/testing/selftests/bpf/test_bpftool.py
new file mode 100644
index 000000000..4fed2dc25
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2020 SUSE LLC.
+
+import collections
+import functools
+import json
+import os
+import socket
+import subprocess
+import unittest
+
+
+# Add the source tree of bpftool and /usr/local/sbin to PATH
+cur_dir = os.path.dirname(os.path.realpath(__file__))
+bpftool_dir = os.path.abspath(os.path.join(cur_dir, "..", "..", "..", "..",
+ "tools", "bpf", "bpftool"))
+os.environ["PATH"] = bpftool_dir + ":/usr/local/sbin:" + os.environ["PATH"]
+
+
+class IfaceNotFoundError(Exception):
+ pass
+
+
+class UnprivilegedUserError(Exception):
+ pass
+
+
+def _bpftool(args, json=True):
+ _args = ["bpftool"]
+ if json:
+ _args.append("-j")
+ _args.extend(args)
+
+ return subprocess.check_output(_args)
+
+
+def bpftool(args):
+ return _bpftool(args, json=False).decode("utf-8")
+
+
+def bpftool_json(args):
+ res = _bpftool(args)
+ return json.loads(res)
+
+
+def get_default_iface():
+ for iface in socket.if_nameindex():
+ if iface[1] != "lo":
+ return iface[1]
+ raise IfaceNotFoundError("Could not find any network interface to probe")
+
+
+def default_iface(f):
+ @functools.wraps(f)
+ def wrapper(*args, **kwargs):
+ iface = get_default_iface()
+ return f(*args, iface, **kwargs)
+ return wrapper
+
+
+class TestBpftool(unittest.TestCase):
+ @classmethod
+ def setUpClass(cls):
+ if os.getuid() != 0:
+ raise UnprivilegedUserError(
+ "This test suite needs root privileges")
+
+ @default_iface
+ def test_feature_dev_json(self, iface):
+ unexpected_helpers = [
+ "bpf_probe_write_user",
+ "bpf_trace_printk",
+ ]
+ expected_keys = [
+ "syscall_config",
+ "program_types",
+ "map_types",
+ "helpers",
+ "misc",
+ ]
+
+ res = bpftool_json(["feature", "probe", "dev", iface])
+ # Check if the result has all expected keys.
+ self.assertCountEqual(res.keys(), expected_keys)
+ # Check if unexpected helpers are not included in helpers probes
+ # result.
+ for helpers in res["helpers"].values():
+ for unexpected_helper in unexpected_helpers:
+ self.assertNotIn(unexpected_helper, helpers)
+
+ def test_feature_kernel(self):
+ test_cases = [
+ bpftool_json(["feature", "probe", "kernel"]),
+ bpftool_json(["feature", "probe"]),
+ bpftool_json(["feature"]),
+ ]
+ unexpected_helpers = [
+ "bpf_probe_write_user",
+ "bpf_trace_printk",
+ ]
+ expected_keys = [
+ "syscall_config",
+ "system_config",
+ "program_types",
+ "map_types",
+ "helpers",
+ "misc",
+ ]
+
+ for tc in test_cases:
+ # Check if the result has all expected keys.
+ self.assertCountEqual(tc.keys(), expected_keys)
+ # Check if unexpected helpers are not included in helpers probes
+ # result.
+ for helpers in tc["helpers"].values():
+ for unexpected_helper in unexpected_helpers:
+ self.assertNotIn(unexpected_helper, helpers)
+
+ def test_feature_kernel_full(self):
+ test_cases = [
+ bpftool_json(["feature", "probe", "kernel", "full"]),
+ bpftool_json(["feature", "probe", "full"]),
+ ]
+ expected_helpers = [
+ "bpf_probe_write_user",
+ "bpf_trace_printk",
+ ]
+
+ for tc in test_cases:
+ # Check if expected helpers are included at least once in any
+ # helpers list for any program type. Unfortunately we cannot assume
+ # that they will be included in all program types or a specific
+ # subset of programs. It depends on the kernel version and
+ # configuration.
+ found_helpers = False
+
+ for helpers in tc["helpers"].values():
+ if all(expected_helper in helpers
+ for expected_helper in expected_helpers):
+ found_helpers = True
+ break
+
+ self.assertTrue(found_helpers)
+
+ def test_feature_kernel_full_vs_not_full(self):
+ full_res = bpftool_json(["feature", "probe", "full"])
+ not_full_res = bpftool_json(["feature", "probe"])
+ not_full_set = set()
+ full_set = set()
+
+ for helpers in full_res["helpers"].values():
+ for helper in helpers:
+ full_set.add(helper)
+
+ for helpers in not_full_res["helpers"].values():
+ for helper in helpers:
+ not_full_set.add(helper)
+
+ self.assertCountEqual(full_set - not_full_set,
+ {"bpf_probe_write_user", "bpf_trace_printk"})
+ self.assertCountEqual(not_full_set - full_set, set())
+
+ def test_feature_macros(self):
+ expected_patterns = [
+ r"/\*\*\* System call availability \*\*\*/",
+ r"#define HAVE_BPF_SYSCALL",
+ r"/\*\*\* eBPF program types \*\*\*/",
+ r"#define HAVE.*PROG_TYPE",
+ r"/\*\*\* eBPF map types \*\*\*/",
+ r"#define HAVE.*MAP_TYPE",
+ r"/\*\*\* eBPF helper functions \*\*\*/",
+ r"#define HAVE.*HELPER",
+ r"/\*\*\* eBPF misc features \*\*\*/",
+ ]
+
+ res = bpftool(["feature", "probe", "macros"])
+ for pattern in expected_patterns:
+ self.assertRegex(res, pattern)
diff --git a/tools/testing/selftests/bpf/test_bpftool.sh b/tools/testing/selftests/bpf/test_bpftool.sh
new file mode 100755
index 000000000..66690778e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2020 SUSE LLC.
+
+# "python3 -m unittest" imports test_bpftool from the current directory, so
+# switch to the directory holding this script to make the test runnable from
+# anywhere.
+cd "$(dirname "$(realpath "$0")")" || exit 1
+
+python3 -m unittest -v test_bpftool.TestBpftool
diff --git a/tools/testing/selftests/bpf/test_bpftool_build.sh b/tools/testing/selftests/bpf/test_bpftool_build.sh
new file mode 100755
index 000000000..2db3c60e1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_build.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+
+case $1 in
+ -h|--help)
+ echo -e "$0 [-j <n>]"
+ echo -e "\tTest the different ways of building bpftool."
+ echo -e ""
+ echo -e "\tOptions:"
+ echo -e "\t\t-j <n>:\tPass -j flag to 'make'."
+ exit 0
+ ;;
+esac
+
+J=$*
+
+# Assume script is located under tools/testing/selftests/bpf/. We want to start
+# build attempts from the top of kernel repository.
+# The paths are quoted so that a checkout below a directory whose name
+# contains whitespace still resolves correctly.
+SCRIPT_REL_PATH=$(realpath --relative-to="$PWD" "$0")
+SCRIPT_REL_DIR=$(dirname "$SCRIPT_REL_PATH")
+KDIR_ROOT_DIR=$(realpath "$PWD/$SCRIPT_REL_DIR/../../../../")
+# Nothing below makes sense from another directory, so stop if cd fails.
+cd "$KDIR_ROOT_DIR" || exit 1
+if [ ! -e tools/bpf/bpftool/Makefile ]; then
+	echo -e "skip: bpftool files not found!\n"
+	exit 0
+fi
+
+# Set to 1 as soon as any build attempt fails; used as the exit status.
+ERROR=0
+# Directory currently used as build output, if any.
+TMPDIR=
+
+# If one build fails, continue but return non-0 on exit.
+# EXIT handler: remove a leftover temporary directory, then exit with $ERROR.
+return_value() {
+	if [ -d "$TMPDIR" ] ; then
+		# Quoted so that a path with whitespace is removed as one argument.
+		rm -rf -- "$TMPDIR"
+	fi
+	exit "$ERROR"
+}
+trap return_value EXIT
+
+check() {
+ local dir=$(realpath $1)
+
+ echo -n "binary: "
+ # Returns non-null if file is found (and "false" is run)
+ find $dir -type f -executable -name bpftool -print -exec false {} + && \
+ ERROR=1 && printf "FAILURE: Did not find bpftool\n"
+}
+
+make_and_clean() {
+ echo -e "\$PWD: $PWD"
+ echo -e "command: make -s $* >/dev/null"
+ make $J -s $* >/dev/null
+ if [ $? -ne 0 ] ; then
+ ERROR=1
+ fi
+ if [ $# -ge 1 ] ; then
+ check ${@: -1}
+ else
+ check .
+ fi
+ (
+ if [ $# -ge 1 ] ; then
+ cd ${@: -1}
+ fi
+ make -s clean
+ )
+ echo
+}
+
+make_with_tmpdir() {
+ local ARGS
+
+ TMPDIR=$(mktemp -d)
+ if [ $# -ge 2 ] ; then
+ ARGS=${@:1:(($# - 1))}
+ fi
+ echo -e "\$PWD: $PWD"
+ echo -e "command: make -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null"
+ make $J -s $ARGS ${@: -1}=$TMPDIR/ >/dev/null
+ if [ $? -ne 0 ] ; then
+ ERROR=1
+ fi
+ check $TMPDIR
+ rm -rf -- $TMPDIR
+ echo
+}
+
+make_doc_and_clean() {
+ echo -e "\$PWD: $PWD"
+ echo -e "command: make -s $* doc >/dev/null"
+ RST2MAN_OPTS="--exit-status=1" make $J -s $* doc
+ if [ $? -ne 0 ] ; then
+ ERROR=1
+ printf "FAILURE: Errors or warnings when building documentation\n"
+ fi
+ (
+ if [ $# -ge 1 ] ; then
+ cd ${@: -1}
+ fi
+ make -s doc-clean
+ )
+ echo
+}
+
+echo "Trying to build bpftool"
+echo -e "... through kbuild\n"
+
+if [ -f ".config" ] ; then
+	make_and_clean tools/bpf
+
+	## $OUTPUT is overwritten in kbuild Makefile, and thus cannot be passed
+	## down from toplevel Makefile to bpftool's Makefile.
+
+	# make_with_tmpdir tools/bpf OUTPUT
+	echo -e "skip: make tools/bpf OUTPUT=<dir> (not supported)\n"
+
+	make_with_tmpdir tools/bpf O
+else
+	echo -e "skip: make tools/bpf (no .config found)\n"
+	echo -e "skip: make tools/bpf OUTPUT=<dir> (not supported)\n"
+	echo -e "skip: make tools/bpf O=<dir> (no .config found)\n"
+fi
+
+echo -e "... from kernel source tree\n"
+
+make_and_clean -C tools/bpf/bpftool
+
+make_with_tmpdir -C tools/bpf/bpftool OUTPUT
+
+make_with_tmpdir -C tools/bpf/bpftool O
+
+echo -e "... from tools/\n"
+# The EXIT trap exits with $ERROR, so record the failure before leaving.
+cd tools/ || { ERROR=1; exit 1; }
+
+make_and_clean bpf
+
+## In tools/bpf/Makefile, function "descend" is called and passes $(O) and
+## $(OUTPUT). We would like $(OUTPUT) to have "bpf/bpftool/" appended before
+## calling bpftool's Makefile, but this is not the case as the "descend"
+## function focuses on $(O)/$(subdir). However, in the present case, updating
+## $(O) to have $(OUTPUT) recomputed from it in bpftool's Makefile does not
+## work, because $(O) is not defined from command line and $(OUTPUT) is not
+## updated in tools/scripts/Makefile.include.
+##
+## Workarounds would require to a) edit "descend" or use an alternative way to
+## call bpftool's Makefile, b) modify the conditions to update $(OUTPUT) and
+## other variables in tools/scripts/Makefile.include (at the risk of breaking
+## the build of other tools), or c) append manually the "bpf/bpftool" suffix to
+## $(OUTPUT) in bpf's Makefile, which may break if targets for other directories
+## use "descend" in the future.
+
+# make_with_tmpdir bpf OUTPUT
+echo -e "skip: make bpf OUTPUT=<dir> (not supported)\n"
+
+make_with_tmpdir bpf O
+
+echo -e "... from bpftool's dir\n"
+# The EXIT trap exits with $ERROR, so record the failure before leaving.
+cd bpf/bpftool || { ERROR=1; exit 1; }
+
+make_and_clean
+
+make_with_tmpdir OUTPUT
+
+make_with_tmpdir O
+
+echo -e "Checking documentation build\n"
+# From tools/bpf/bpftool
+make_doc_and_clean
diff --git a/tools/testing/selftests/bpf/test_bpftool_metadata.sh b/tools/testing/selftests/bpf/test_bpftool_metadata.sh
new file mode 100755
index 000000000..1bf81b494
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_metadata.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+TESTNAME=bpftool_metadata
+BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
+BPF_DIR=$BPF_FS/test_$TESTNAME
+
+_cleanup()
+{
+ set +e
+ rm -rf $BPF_DIR 2> /dev/null
+}
+
+# EXIT handler used while the test directory is being created: report SKIP,
+# clean up and exit with the kselftest skip code.
+cleanup_skip()
+{
+	printf '%s\n' "selftests: $TESTNAME [SKIP]"
+	_cleanup
+
+	exit "$ksft_skip"
+}
+
+# EXIT handler for the test proper: report PASS when the status on entry is
+# 0 and FAILED otherwise, then clean up.
+cleanup()
+{
+	case "$?" in
+	0)
+		echo "selftests: $TESTNAME [PASS]"
+		;;
+	*)
+		echo "selftests: $TESTNAME [FAILED]"
+		;;
+	esac
+	_cleanup
+}
+
+# Preconditions: root privileges, a mounted bpffs and a working bpftool.
+if [ "$(id -u)" -ne 0 ]; then
+	echo "selftests: $TESTNAME [SKIP] Need root privileges"
+	exit $ksft_skip
+fi
+
+if [ -z "$BPF_FS" ]; then
+	echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted"
+	exit $ksft_skip
+fi
+
+if ! bpftool version > /dev/null 2>&1; then
+	echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool"
+	exit $ksft_skip
+fi
+
+# From here on any failing command aborts the script and runs the EXIT trap.
+set -e
+
+# A failure to create the test directory is reported as SKIP ...
+trap cleanup_skip EXIT
+
+mkdir "$BPF_DIR"
+
+# ... while any later failure is reported as FAILED.
+trap cleanup EXIT
+
+bpftool prog load metadata_unused.o "$BPF_DIR/unused"
+
+METADATA_PLAIN="$(bpftool prog)"
+echo "$METADATA_PLAIN" | grep 'a = "foo"' > /dev/null
+echo "$METADATA_PLAIN" | grep 'b = 1' > /dev/null
+
+bpftool prog --json | grep '"metadata":{"a":"foo","b":1}' > /dev/null
+
+bpftool map | grep 'metadata.rodata' > /dev/null
+
+rm "$BPF_DIR/unused"
+
+bpftool prog load metadata_used.o "$BPF_DIR/used"
+
+METADATA_PLAIN="$(bpftool prog)"
+echo "$METADATA_PLAIN" | grep 'a = "bar"' > /dev/null
+echo "$METADATA_PLAIN" | grep 'b = 2' > /dev/null
+
+bpftool prog --json | grep '"metadata":{"a":"bar","b":2}' > /dev/null
+
+bpftool map | grep 'metadata.rodata' > /dev/null
+
+rm "$BPF_DIR/used"
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_btf.h b/tools/testing/selftests/bpf/test_btf.h
new file mode 100644
index 000000000..2023725f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_btf.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019 Facebook */
+
+#ifndef _TEST_BTF_H
+#define _TEST_BTF_H
+
+/*
+ * Pack a BTF "info" word: kind_flag (normalized to 0/1) goes to bit 31,
+ * kind is shifted left by 24 and vlen is masked with BTF_MAX_VLEN.
+ */
+#define BTF_INFO_ENC(kind, kind_flag, vlen) \
+ ((!!(kind_flag) << 31) | ((kind) << 24) | ((vlen) & BTF_MAX_VLEN))
+
+/* Expands to three comma-separated values: name, info, size_or_type. */
+#define BTF_TYPE_ENC(name, info, size_or_type) \
+ (name), (info), (size_or_type)
+
+/* Pack the extra INT word: encoding << 24, bits_offset << 16, nr_bits. */
+#define BTF_INT_ENC(encoding, bits_offset, nr_bits) \
+ ((encoding) << 24 | (bits_offset) << 16 | (nr_bits))
+/* A BTF_KIND_INT type entry followed by its BTF_INT_ENC() word. */
+#define BTF_TYPE_INT_ENC(name, encoding, bits_offset, bits, sz) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_INT, 0, 0), sz), \
+ BTF_INT_ENC(encoding, bits_offset, bits)
+
+/* A BTF_KIND_FWD type entry; kind_flag is passed through to the info word. */
+#define BTF_FWD_ENC(name, kind_flag) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FWD, kind_flag, 0), 0)
+
+/* Expands to three comma-separated values: type, index_type, nr_elems. */
+#define BTF_ARRAY_ENC(type, index_type, nr_elems) \
+ (type), (index_type), (nr_elems)
+/* An anonymous BTF_KIND_ARRAY type entry followed by BTF_ARRAY_ENC(). */
+#define BTF_TYPE_ARRAY_ENC(type, index_type, nr_elems) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_ARRAY, 0, 0), 0), \
+ BTF_ARRAY_ENC(type, index_type, nr_elems)
+
+/* A BTF_KIND_STRUCT type entry with nr_elems as vlen and sz as size. */
+#define BTF_STRUCT_ENC(name, nr_elems, sz) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, nr_elems), sz)
+
+/* A BTF_KIND_UNION type entry with nr_elems as vlen and sz as size. */
+#define BTF_UNION_ENC(name, nr_elems, sz) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_UNION, 0, nr_elems), sz)
+
+/* A BTF_KIND_VAR type entry followed by its linkage value. */
+#define BTF_VAR_ENC(name, type, linkage) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), type), (linkage)
+/* Expands to three comma-separated values: type, offset, size. */
+#define BTF_VAR_SECINFO_ENC(type, offset, size) \
+ (type), (offset), (size)
+
+/* Expands to three comma-separated values: name, type, bits_offset. */
+#define BTF_MEMBER_ENC(name, type, bits_offset) \
+ (name), (type), (bits_offset)
+/* Expands to two comma-separated values: name, val. */
+#define BTF_ENUM_ENC(name, val) (name), (val)
+/* Combine bitfield_size (shifted left by 24) with bits_offset. */
+#define BTF_MEMBER_OFFSET(bitfield_size, bits_offset) \
+ ((bitfield_size) << 24 | (bits_offset))
+
+/* A BTF_KIND_TYPEDEF type entry naming type. */
+#define BTF_TYPEDEF_ENC(name, type) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_TYPEDEF, 0, 0), type)
+
+/* An anonymous BTF_KIND_PTR type entry referring to type. */
+#define BTF_PTR_ENC(type) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), type)
+
+/* An anonymous BTF_KIND_CONST type entry referring to type. */
+#define BTF_CONST_ENC(type) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_CONST, 0, 0), type)
+
+/* An anonymous BTF_KIND_VOLATILE type entry referring to type. */
+#define BTF_VOLATILE_ENC(type) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_VOLATILE, 0, 0), type)
+
+/* An anonymous BTF_KIND_RESTRICT type entry referring to type. */
+#define BTF_RESTRICT_ENC(type) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_RESTRICT, 0, 0), type)
+
+/* An anonymous BTF_KIND_FUNC_PROTO type entry; nargs is stored as vlen. */
+#define BTF_FUNC_PROTO_ENC(ret_type, nargs) \
+ BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, nargs), ret_type)
+
+/* Expands to two comma-separated values: name, type. */
+#define BTF_FUNC_PROTO_ARG_ENC(name, type) \
+ (name), (type)
+
+/* A BTF_KIND_FUNC type entry referring to func_proto. */
+#define BTF_FUNC_ENC(name, func_proto) \
+ BTF_TYPE_ENC(name, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), func_proto)
+
+#endif /* _TEST_BTF_H */
diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c
new file mode 100644
index 000000000..d946252a2
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_cgroup_storage.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <assert.h>
+#include <bpf/bpf.h>
+#include <linux/filter.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/sysinfo.h>
+
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+#define TEST_CGROUP "/test-bpf-cgroup-storage-buf/"
+
+/*
+ * Load a cgroup skb program that bumps a counter in per-cpu cgroup storage
+ * and one in regular cgroup storage, attach it to a test cgroup as an egress
+ * program, send pings and verify both counters.
+ *
+ * Returns 0 on success and EXIT_FAILURE otherwise.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_insn prog[] = {
+		BPF_LD_MAP_FD(BPF_REG_1, 0), /* percpu map fd */
+		BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_local_storage),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 0x1),
+		BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+
+		BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd */
+		BPF_MOV64_IMM(BPF_REG_2, 0), /* flags, not used */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_get_local_storage),
+		BPF_MOV64_IMM(BPF_REG_1, 1),
+		BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+		BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1),
+		BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+		BPF_EXIT_INSN(),
+	};
+	size_t insns_cnt = sizeof(prog) / sizeof(struct bpf_insn);
+	int error = EXIT_FAILURE;
+	int map_fd, percpu_map_fd, prog_fd, cgroup_fd;
+	struct bpf_cgroup_storage_key key;
+	unsigned long long value;
+	unsigned long long *percpu_value;
+	int cpu, nproc;
+
+	nproc = get_nprocs_conf();
+	percpu_value = malloc(sizeof(*percpu_value) * nproc);
+	if (!percpu_value) {
+		printf("Not enough memory for per-cpu area (%d cpus)\n", nproc);
+		/* Nothing has been set up yet, so there is nothing to undo. */
+		goto out;
+	}
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_CGROUP_STORAGE, sizeof(key),
+				sizeof(value), 0, 0);
+	if (map_fd < 0) {
+		printf("Failed to create map: %s\n", strerror(errno));
+		goto out_free;
+	}
+
+	percpu_map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
+				       sizeof(key), sizeof(value), 0, 0);
+	if (percpu_map_fd < 0) {
+		printf("Failed to create map: %s\n", strerror(errno));
+		goto out_free;
+	}
+
+	/* Patch the map fds into the two BPF_LD_MAP_FD instructions. */
+	prog[0].imm = percpu_map_fd;
+	prog[7].imm = map_fd;
+	prog_fd = bpf_load_program(BPF_PROG_TYPE_CGROUP_SKB,
+				   prog, insns_cnt, "GPL", 0,
+				   bpf_log_buf, BPF_LOG_BUF_SIZE);
+	if (prog_fd < 0) {
+		printf("Failed to load bpf program: %s\n", bpf_log_buf);
+		goto out_free;
+	}
+
+	cgroup_fd = cgroup_setup_and_join(TEST_CGROUP);
+	if (cgroup_fd < 0) {
+		printf("Failed to create test cgroup\n");
+		goto out_free;
+	}
+
+	/* Attach the bpf program */
+	if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) {
+		printf("Failed to attach bpf program\n");
+		goto err;
+	}
+
+	if (bpf_map_get_next_key(map_fd, NULL, &key)) {
+		printf("Failed to get the first key in cgroup storage\n");
+		goto err;
+	}
+
+	if (bpf_map_lookup_elem(map_fd, &key, &value)) {
+		printf("Failed to lookup cgroup storage 0\n");
+		goto err;
+	}
+
+	/* Seed every per-cpu slot with a known starting value. */
+	for (cpu = 0; cpu < nproc; cpu++)
+		percpu_value[cpu] = 1000;
+
+	if (bpf_map_update_elem(percpu_map_fd, &key, percpu_value, 0)) {
+		printf("Failed to update the data in the cgroup storage\n");
+		goto err;
+	}
+
+	/* Every second packet should be dropped */
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0);
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null"));
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0);
+
+	/* Check the counter in the cgroup local storage */
+	if (bpf_map_lookup_elem(map_fd, &key, &value)) {
+		printf("Failed to lookup cgroup storage\n");
+		goto err;
+	}
+
+	if (value != 3) {
+		printf("Unexpected data in the cgroup storage: %llu\n", value);
+		goto err;
+	}
+
+	/* Bump the counter in the cgroup local storage */
+	value++;
+	if (bpf_map_update_elem(map_fd, &key, &value, 0)) {
+		printf("Failed to update the data in the cgroup storage\n");
+		goto err;
+	}
+
+	/* Every second packet should be dropped */
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0);
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null"));
+	assert(system("ping localhost -c 1 -W 1 -q > /dev/null") == 0);
+
+	/* Check the final value of the counter in the cgroup local storage */
+	if (bpf_map_lookup_elem(map_fd, &key, &value)) {
+		printf("Failed to lookup the cgroup storage\n");
+		goto err;
+	}
+
+	if (value != 7) {
+		printf("Unexpected data in the cgroup storage: %llu\n", value);
+		goto err;
+	}
+
+	/* Check the final value of the counter in the percpu local storage */
+
+	for (cpu = 0; cpu < nproc; cpu++)
+		percpu_value[cpu] = 0;
+
+	if (bpf_map_lookup_elem(percpu_map_fd, &key, percpu_value)) {
+		printf("Failed to lookup the per-cpu cgroup storage\n");
+		goto err;
+	}
+
+	/* Sum over all cpus: the program may have run on any of them. */
+	value = 0;
+	for (cpu = 0; cpu < nproc; cpu++)
+		value += percpu_value[cpu];
+
+	if (value != nproc * 1000 + 6) {
+		printf("Unexpected data in the per-cpu cgroup storage\n");
+		goto err;
+	}
+
+	error = 0;
+	printf("test_cgroup_storage:PASS\n");
+
+err:
+	cleanup_cgroup_environment();
+
+out_free:
+	free(percpu_value);
+
+out:
+	return error;
+}
diff --git a/tools/testing/selftests/bpf/test_cpp.cpp b/tools/testing/selftests/bpf/test_cpp.cpp
new file mode 100644
index 000000000..a8d2e9a87
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_cpp.cpp
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+#include <iostream>
+#include <bpf/libbpf.h>
+#include <bpf/bpf.h>
+#include <bpf/btf.h>
+#include "test_core_extern.skel.h"
+
+/* do nothing, just make sure we can link successfully */
+
+/*
+ * Calls one function from each libbpf header plus the generated skeleton so
+ * that the linker has to resolve them from C++, then prints "DONE!".
+ */
+int main(int argc, char *argv[])
+{
+	/* libbpf.h */
+	libbpf_set_print(NULL);
+
+	/* bpf.h */
+	bpf_prog_get_fd_by_id(0);
+
+	/* btf.h */
+	btf__new(NULL, 0);
+
+	/* BPF skeleton */
+	struct test_core_extern *skel = test_core_extern__open_and_load();
+
+	test_core_extern__destroy(skel);
+
+	std::cout << "DONE!" << std::endl;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c
new file mode 100644
index 000000000..ec53b1ef9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_current_pid_tgid_new_ns.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Carlos Neira cneirabustos@gmail.com */
+#define _GNU_SOURCE
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sched.h>
+#include <sys/wait.h>
+#include <sys/mount.h>
+#include "test_progs.h"
+
+/*
+ * Evaluate condition, where a non-zero value means failure. On failure print
+ * "<function>:FAIL:<tag> " followed by the printf-style format and its
+ * arguments; on success print "<function>:PASS:<tag>". The whole expression
+ * evaluates to 1 on failure and 0 on success.
+ */
+#define CHECK_NEWNS(condition, tag, format...) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("%s:FAIL:%s ", __func__, tag); \
+ printf(format); \
+ } else { \
+ printf("%s:PASS:%s\n", __func__, tag); \
+ } \
+ __ret; \
+})
+
+/*
+ * Value exchanged with the BPF object through its "test_ns_.bss" map.
+ * main() fills in dev, ino and user_pid_tgid before attaching the program
+ * and afterwards compares pid_tgid against the id it computed itself.
+ */
+struct bss {
+ __u64 dev;
+ __u64 ino;
+ __u64 pid_tgid;
+ __u64 user_pid_tgid;
+};
+
+/*
+ * Wait for child @pid and turn its wait status into an exit code: the
+ * child's own exit status if it exited normally, 1 otherwise.
+ */
+static int wait_for_child(pid_t pid)
+{
+	int status;
+
+	if (waitpid(pid, &status, 0) == -1)
+		return 1;
+
+	return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
+}
+
+/*
+ * Unshare the PID and mount namespaces, fork twice and, in the grandchild,
+ * remount /proc, load test_ns_current_pid_tgid.o, attach its raw tracepoint
+ * program and compare the pid/tgid it records with the values seen from
+ * user space.
+ *
+ * Returns 0 on success and 1 on failure; each parent passes on the result
+ * of the child it waits for.
+ */
+int main(int argc, char **argv)
+{
+	const char *probe_name = "raw_tracepoint/sys_enter";
+	const char *file = "test_ns_current_pid_tgid.o";
+	struct bpf_link *link = NULL;
+	struct bpf_program *prog;
+	struct bpf_map *bss_map;
+	struct bpf_object *obj;
+	int exit_code = 1;
+	int err, key = 0;
+	struct bss bss;
+	struct stat st;
+	pid_t pid, tid;
+	__u64 id;
+
+	printf("Testing bpf_get_ns_current_pid_tgid helper in new ns\n");
+
+	if (stat("/proc/self/ns/pid", &st)) {
+		perror("stat failed on /proc/self/ns/pid ns\n");
+		printf("%s:FAILED\n", argv[0]);
+		return exit_code;
+	}
+
+	if (CHECK_NEWNS(unshare(CLONE_NEWPID | CLONE_NEWNS),
+			"unshare CLONE_NEWPID | CLONE_NEWNS", "error errno=%d\n", errno))
+		return exit_code;
+
+	pid = fork();
+	if (pid == -1) {
+		perror("Fork() failed\n");
+		printf("%s:FAILED\n", argv[0]);
+		return exit_code;
+	}
+
+	if (pid > 0) {
+		usleep(5);
+		/* Pass the child's result on instead of always reporting success. */
+		return wait_for_child(pid);
+	}
+
+	/* First child: fork once more and relay the grandchild's result. */
+	pid = fork();
+	if (pid == -1) {
+		perror("Fork() failed\n");
+		printf("%s:FAILED\n", argv[0]);
+		return exit_code;
+	}
+
+	if (pid > 0)
+		return wait_for_child(pid);
+
+	/* Grandchild: this is where the actual test runs. */
+	if (CHECK_NEWNS(mount("none", "/proc", NULL, MS_PRIVATE|MS_REC, NULL),
+			"Unmounting proc", "Cannot umount proc! errno=%d\n", errno))
+		return exit_code;
+
+	if (CHECK_NEWNS(mount("proc", "/proc", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL),
+			"Mounting proc", "Cannot mount proc! errno=%d\n", errno))
+		return exit_code;
+
+	obj = bpf_object__open_file(file, NULL);
+	if (CHECK_NEWNS(IS_ERR(obj), "obj_open", "err %ld\n", PTR_ERR(obj)))
+		return exit_code;
+
+	err = bpf_object__load(obj);
+	if (CHECK_NEWNS(err, "obj_load", "err %d errno %d\n", err, errno))
+		goto cleanup;
+
+	bss_map = bpf_object__find_map_by_name(obj, "test_ns_.bss");
+	if (CHECK_NEWNS(!bss_map, "find_bss_map", "failed\n"))
+		goto cleanup;
+
+	prog = bpf_object__find_program_by_title(obj, probe_name);
+	if (CHECK_NEWNS(!prog, "find_prog", "prog '%s' not found\n",
+			probe_name))
+		goto cleanup;
+
+	memset(&bss, 0, sizeof(bss));
+	tid = syscall(SYS_gettid);
+	pid = getpid();
+
+	/* The id the BPF side is expected to record for this task. */
+	id = (__u64) tid << 32 | pid;
+	bss.user_pid_tgid = id;
+
+	if (CHECK_NEWNS(stat("/proc/self/ns/pid", &st),
+			"stat new ns", "Failed to stat /proc/self/ns/pid errno=%d\n", errno))
+		goto cleanup;
+
+	bss.dev = st.st_dev;
+	bss.ino = st.st_ino;
+
+	err = bpf_map_update_elem(bpf_map__fd(bss_map), &key, &bss, 0);
+	if (CHECK_NEWNS(err, "setting_bss", "failed to set bss : %d\n", err))
+		goto cleanup;
+
+	link = bpf_program__attach_raw_tracepoint(prog, "sys_enter");
+	if (CHECK_NEWNS(IS_ERR(link), "attach_raw_tp", "err %ld\n",
+			PTR_ERR(link))) {
+		link = NULL;
+		goto cleanup;
+	}
+
+	/* trigger some syscalls */
+	usleep(1);
+
+	err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &key, &bss);
+	if (CHECK_NEWNS(err, "set_bss", "failed to get bss : %d\n", err))
+		goto cleanup;
+
+	if (CHECK_NEWNS(id != bss.pid_tgid, "Compare user pid/tgid vs. bpf pid/tgid",
+			"User pid/tgid %llu BPF pid/tgid %llu\n", id, bss.pid_tgid))
+		goto cleanup;
+
+	exit_code = 0;
+	printf("%s:PASS\n", argv[0]);
+cleanup:
+	/* Destroy the link only if one was actually created. */
+	if (link)
+		bpf_link__destroy(link);
+	bpf_object__close(obj);
+
+	return exit_code;
+}
diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c
new file mode 100644
index 000000000..804dddd97
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_dev_cgroup.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/time.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+
+#define DEV_CGROUP_PROG "./dev_cgroup.o"
+
+#define TEST_CGROUP "/test-bpf-based-device-cgroup/"
+
+/*
+ * Load the device cgroup program, attach it to a test cgroup and check with
+ * mknod and dd that only the expected device accesses succeed.
+ *
+ * Returns 0 on success and EXIT_FAILURE otherwise.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int error = EXIT_FAILURE;
+	int prog_fd, cgroup_fd;
+	__u32 prog_cnt;
+
+	if (bpf_prog_load(DEV_CGROUP_PROG, BPF_PROG_TYPE_CGROUP_DEVICE,
+			  &obj, &prog_fd)) {
+		printf("Failed to load DEV_CGROUP program\n");
+		goto out;
+	}
+
+	cgroup_fd = cgroup_setup_and_join(TEST_CGROUP);
+	if (cgroup_fd < 0) {
+		printf("Failed to create test cgroup\n");
+		goto out;
+	}
+
+	/* Attach bpf program */
+	if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_DEVICE, 0)) {
+		printf("Failed to attach DEV_CGROUP program\n");
+		goto err;
+	}
+
+	if (bpf_prog_query(cgroup_fd, BPF_CGROUP_DEVICE, 0, NULL, NULL,
+			   &prog_cnt)) {
+		printf("Failed to query attached programs\n");
+		goto err;
+	}
+
+	/* All operations with /dev/zero and /dev/urandom are allowed,
+	 * everything else is forbidden.
+	 */
+	assert(system("rm -f /tmp/test_dev_cgroup_null") == 0);
+	assert(system("mknod /tmp/test_dev_cgroup_null c 1 3"));
+	assert(system("rm -f /tmp/test_dev_cgroup_null") == 0);
+
+	/* /dev/zero is whitelisted */
+	assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0);
+	assert(system("mknod /tmp/test_dev_cgroup_zero c 1 5") == 0);
+	assert(system("rm -f /tmp/test_dev_cgroup_zero") == 0);
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=64") == 0);
+
+	/* src is allowed, target is forbidden */
+	assert(system("dd if=/dev/urandom of=/dev/full count=64"));
+
+	/* src is forbidden, target is allowed */
+	assert(system("dd if=/dev/random of=/dev/zero count=64"));
+
+	error = 0;
+	printf("test_dev_cgroup:PASS\n");
+
+err:
+	cleanup_cgroup_environment();
+
+out:
+	return error;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c
new file mode 100644
index 000000000..01f0c634d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.c
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Inject packets with all sorts of encapsulation into the kernel.
+ *
+ * IPv4/IPv6 outer layer 3
+ * GRE/GUE/BARE outer layer 4, where bare is IPIP/SIT/IPv4-in-IPv6/..
+ * IPv4/IPv6 inner layer 3
+ */
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <asm/byteorder.h>
+#include <error.h>
+#include <errno.h>
+#include <linux/if_packet.h>
+#include <linux/if_ether.h>
+#include <linux/ipv6.h>
+#include <netinet/ip.h>
+#include <netinet/in.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define CFG_PORT_INNER 8000
+
+/* Add some protocol definitions that do not exist in userspace */
+
+/*
+ * Minimal 4-byte GRE header as written by build_gre_header(): only the
+ * protocol field is ever filled in by this file.
+ */
+struct grehdr {
+ uint16_t unused;
+ uint16_t protocol; /* stored in network byte order (htons) */
+} __attribute__((packed));
+
+/*
+ * 4-byte GUE header, accessible either field by field or as one 32-bit
+ * word.  The bitfield order of the first byte depends on the target's
+ * bitfield endianness.  build_gue_header() only sets proto_ctype.
+ */
+struct guehdr {
+ union {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8 hlen:5,
+ control:1,
+ version:2;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u8 version:2,
+ control:1,
+ hlen:5;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+ __u8 proto_ctype; /* IP protocol number of the encapsulated packet */
+ __be16 flags;
+ };
+ __be32 word; /* whole header viewed as a single big-endian word */
+ };
+};
+
+/* Run-time configuration, filled in by parse_opts(). */
+static uint8_t cfg_dsfield_inner; /* -X: DS field of the inner header */
+static uint8_t cfg_dsfield_outer; /* -x: DS field of the outer header */
+static uint8_t cfg_encap_proto; /* -e: IPPROTO_* of the encap, 0 for none */
+static bool cfg_expect_failure = false; /* -F: no packet may arrive */
+static int cfg_l3_extra = AF_UNSPEC; /* optional SIT prefix */
+static int cfg_l3_inner = AF_UNSPEC; /* -i: inner address family */
+static int cfg_l3_outer = AF_UNSPEC; /* -o: outer address family */
+static int cfg_num_pkt = 10; /* -n: packets to send when -t is not used */
+static int cfg_num_secs = 0; /* -t: run for this many seconds instead */
+static char cfg_payload_char = 'a'; /* byte the payload is filled with */
+static int cfg_payload_len = 100; /* -l: payload length in bytes */
+static int cfg_port_gue = 6080; /* UDP destination port of the GUE header */
+static bool cfg_only_rx; /* -R: receive only */
+static bool cfg_only_tx; /* -T: transmit only */
+static int cfg_src_port = 9; /* -f: UDP source port */
+
+/* The single packet built by build_packet() and sent by do_tx(). */
+static char buf[ETH_DATA_LEN];
+
+/*
+ * Define a static struct sockaddr_in called @name.  @addr4 and @port are
+ * given in host byte order and converted at compile time.
+ */
+#define INIT_ADDR4(name, addr4, port) \
+ static struct sockaddr_in name = { \
+ .sin_family = AF_INET, \
+ .sin_port = __constant_htons(port), \
+ .sin_addr.s_addr = __constant_htonl(addr4), \
+ };
+
+/*
+ * Define a static struct sockaddr_in6 called @name.  @addr6 is a struct
+ * in6_addr initializer, @port is in host byte order.
+ */
+#define INIT_ADDR6(name, addr6, port) \
+ static struct sockaddr_in6 name = { \
+ .sin6_family = AF_INET6, \
+ .sin6_port = __constant_htons(port), \
+ .sin6_addr = addr6, \
+ };
+
+/* Default IPv4 addresses for the inner, outer and extra headers. */
+INIT_ADDR4(in_daddr4, INADDR_LOOPBACK, CFG_PORT_INNER)
+INIT_ADDR4(in_saddr4, INADDR_LOOPBACK + 2, 0)
+INIT_ADDR4(out_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(out_saddr4, INADDR_LOOPBACK + 1, 0)
+INIT_ADDR4(extra_daddr4, INADDR_LOOPBACK, 0)
+INIT_ADDR4(extra_saddr4, INADDR_LOOPBACK + 1, 0)
+
+/* Default IPv6 addresses: everything is the loopback address. */
+INIT_ADDR6(in_daddr6, IN6ADDR_LOOPBACK_INIT, CFG_PORT_INNER)
+INIT_ADDR6(in_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(out_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_daddr6, IN6ADDR_LOOPBACK_INIT, 0)
+INIT_ADDR6(extra_saddr6, IN6ADDR_LOOPBACK_INIT, 0)
+
+/* Return the current gettimeofday() time, expressed in milliseconds. */
+static unsigned long util_gettime(void)
+{
+	struct timeval now;
+	unsigned long msec;
+
+	gettimeofday(&now, NULL);
+
+	msec = now.tv_sec * 1000;
+	msec += now.tv_usec / 1000;
+
+	return msec;
+}
+
+/*
+ * Print "@msg: <address>" to stderr for an IPv4 or IPv6 socket address.
+ * Exits the program on any other address family or if the address cannot
+ * be formatted.
+ */
+static void util_printaddr(const char *msg, struct sockaddr *addr)
+{
+	char text[INET6_ADDRSTRLEN];
+	void *raw = addr;
+
+	/* locate the raw address bytes inside the family-specific sockaddr */
+	if (addr->sa_family == PF_INET)
+		raw = &((struct sockaddr_in *)addr)->sin_addr;
+	else if (addr->sa_family == PF_INET6)
+		raw = &((struct sockaddr_in6 *)addr)->sin6_addr;
+	else
+		error(1, 0, "printaddr: unsupported family %u\n",
+		      addr->sa_family);
+
+	if (inet_ntop(addr->sa_family, raw, text, sizeof(text)) == NULL)
+		error(1, errno, "inet_ntop");
+
+	fprintf(stderr, "%s: %s\n", msg, text);
+}
+
+/* Return the plain (unfolded) sum of @num_u16 16-bit words at @start. */
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+	unsigned long total = 0;
+
+	while (num_u16-- > 0)
+		total += *start++;
+
+	return total;
+}
+
+/*
+ * Compute the 16-bit ones' complement checksum over @num_u16 halfwords at
+ * @start, seeded with the partial sum @sum (e.g. a pseudo-header sum).
+ */
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+			      unsigned long sum)
+{
+	unsigned long acc = sum + add_csum_hword(start, num_u16);
+
+	/* fold the carries back in until the value fits in 16 bits */
+	for (; acc >> 16; )
+		acc = (acc >> 16) + (acc & 0xffff);
+
+	return ~acc;
+}
+
+/*
+ * Write a 20-byte IPv4 header (no options) at @header.
+ *
+ * @proto:       IP protocol number of what follows the header
+ * @src, @dst:   addresses, already in network byte order
+ * @payload_len: number of bytes following the header
+ * @tos:         value for the TOS/DS field
+ */
+static void build_ipv4_header(void *header, uint8_t proto,
+			      uint32_t src, uint32_t dst,
+			      int payload_len, uint8_t tos)
+{
+	struct iphdr *iph = header;
+
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->tos = tos;
+	iph->ttl = 8;
+	iph->tot_len = htons(sizeof(*iph) + payload_len);
+	iph->id = htons(1337);
+	iph->protocol = proto;
+	iph->saddr = src;
+	iph->daddr = dst;
+	/*
+	 * The checksum field must be zero while the header is summed; do not
+	 * rely on the buffer never having been written before.
+	 */
+	iph->check = 0;
+	/* ihl counts 32-bit words, so ihl << 1 is the header in halfwords */
+	iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
+}
+
+/*
+ * Store @dsfield in the traffic class bits of an IPv6 header, i.e. bits
+ * 4..11 of the first 16-bit word, leaving the other bits untouched.
+ */
+static void ipv6_set_dsfield(struct ipv6hdr *ip6h, uint8_t dsfield)
+{
+	uint16_t *first = (uint16_t *)ip6h;
+	uint16_t host = ntohs(*first);
+
+	host = (host & 0xF00F) | (((uint16_t) dsfield) << 4);
+	*first = htons(host);
+}
+
+/*
+ * Write an IPv6 header at @header.
+ *
+ * @proto:       next-header value
+ * @src, @dst:   only the sin6_addr members are used
+ * @payload_len: number of bytes following the header
+ * @dsfield:     traffic class value
+ */
+static void build_ipv6_header(void *header, uint8_t proto,
+			      struct sockaddr_in6 *src,
+			      struct sockaddr_in6 *dst,
+			      int payload_len, uint8_t dsfield)
+{
+	struct ipv6hdr *hdr6 = header;
+
+	hdr6->version = 6;
+	hdr6->payload_len = htons(payload_len);
+	hdr6->nexthdr = proto;
+	hdr6->hop_limit = 8;
+	ipv6_set_dsfield(hdr6, dsfield);
+
+	memcpy(&hdr6->daddr, &dst->sin6_addr, sizeof(hdr6->daddr));
+	memcpy(&hdr6->saddr, &src->sin6_addr, sizeof(hdr6->saddr));
+}
+
+/*
+ * UDP checksum over an IPv4 pseudo-header plus @num_words halfwords starting
+ * at @udph.  udph->check must be zero when this is called.
+ */
+static uint16_t build_udp_v4_csum(const struct iphdr *iph,
+				  const struct udphdr *udph,
+				  int num_words)
+{
+	/*
+	 * saddr and daddr are adjacent, so together they span
+	 * 2 * sizeof(saddr) bytes == sizeof(saddr) halfwords.
+	 */
+	int addr_hwords = sizeof(iph->saddr);
+	unsigned long seed;
+
+	seed = add_csum_hword((void *) &iph->saddr, addr_hwords);
+	seed += htons(IPPROTO_UDP);
+	seed += udph->len;
+
+	return build_ip_csum((void *) udph, num_words, seed);
+}
+
+/*
+ * UDP checksum over an IPv6 pseudo-header plus @num_words halfwords starting
+ * at @udph.  udph->check must be zero when this is called.
+ */
+static uint16_t build_udp_v6_csum(const struct ipv6hdr *ip6h,
+				  const struct udphdr *udph,
+				  int num_words)
+{
+	/*
+	 * saddr and daddr are adjacent, so together they span
+	 * 2 * sizeof(saddr) bytes == sizeof(saddr) halfwords.
+	 */
+	int addr_hwords = sizeof(ip6h->saddr);
+	unsigned long seed;
+
+	seed = add_csum_hword((void *) &ip6h->saddr, addr_hwords);
+	seed += htons(ip6h->nexthdr);
+	seed += ip6h->payload_len;
+
+	return build_ip_csum((void *) udph, num_words, seed);
+}
+
+/*
+ * Write a UDP header at @header and compute its checksum.
+ *
+ * @payload_len: bytes following the UDP header
+ * @dport:       destination port, host byte order
+ * @family:      AF_INET if the IP header directly in front of @header is
+ *               IPv4, anything else is treated as IPv6
+ *
+ * The IP header in front of @header and the payload behind it must already
+ * be in place.  For an odd datagram length the byte right after the payload
+ * is summed as padding and must therefore be zero.
+ */
+static void build_udp_header(void *header, int payload_len,
+			     uint16_t dport, int family)
+{
+	struct udphdr *udph = header;
+	int len = sizeof(*udph) + payload_len;
+	/* round up so the last byte of an odd-length datagram is included */
+	int num_u16 = (len + 1) >> 1;
+
+	udph->source = htons(cfg_src_port);
+	udph->dest = htons(dport);
+	udph->len = htons(len);
+	udph->check = 0;
+	if (family == AF_INET)
+		udph->check = build_udp_v4_csum(header - sizeof(struct iphdr),
+						udph, num_u16);
+	else
+		udph->check = build_udp_v6_csum(header - sizeof(struct ipv6hdr),
+						udph, num_u16);
+
+	/* RFC 768: a computed checksum of zero is transmitted as all ones */
+	if (!udph->check)
+		udph->check = 0xffff;
+}
+
+/*
+ * Set the protocol of the GUE header at @header to @proto.  No other
+ * header field is written.
+ */
+static void build_gue_header(void *header, uint8_t proto)
+{
+	((struct guehdr *)header)->proto_ctype = proto;
+}
+
+/*
+ * Set the protocol of the GRE header at @header to @proto (an ETH_P_*
+ * value in host byte order).  No other header field is written.
+ */
+static void build_gre_header(void *header, uint16_t proto)
+{
+	((struct grehdr *)header)->protocol = htons(proto);
+}
+
+/* Header size for @family: IPv4 for AF_INET, IPv6 for anything else. */
+static int l3_length(int family)
+{
+	return family == AF_INET ? sizeof(struct iphdr)
+				 : sizeof(struct ipv6hdr);
+}
+
+/*
+ * Build the test packet in the global buf according to the cfg_* settings
+ * and return its total length in bytes.
+ *
+ * Layout, front to back (absent parts have length zero):
+ *   [extra L3][outer L3][outer L4: GRE, or UDP + GUE][inner L3][UDP][payload]
+ *
+ * Exits the program if the packet does not fit in buf.
+ */
+static int build_packet(void)
+{
+ int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
+ int el3_len = 0;
+
+ if (cfg_l3_extra)
+ el3_len = l3_length(cfg_l3_extra);
+
+ /* calculate header offsets */
+ if (cfg_encap_proto) {
+ ol3_len = l3_length(cfg_l3_outer);
+
+ /* any other encap protocol ("bare") has no outer L4 header */
+ if (cfg_encap_proto == IPPROTO_GRE)
+ ol4_len = sizeof(struct grehdr);
+ else if (cfg_encap_proto == IPPROTO_UDP)
+ ol4_len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ }
+
+ il3_len = l3_length(cfg_l3_inner);
+ il4_len = sizeof(struct udphdr);
+
+ /* ">=" rather than ">": one spare byte is needed for the csum padding */
+ if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
+ sizeof(buf))
+ error(1, 0, "packet too large\n");
+
+ /*
+ * Fill packet from inside out, to calculate correct checksums.
+ * But create ip before udp headers, as udp uses ip for pseudo-sum.
+ */
+ memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
+ cfg_payload_char, cfg_payload_len);
+
+ /* add zero byte for udp csum padding */
+ buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;
+
+ /* inner L3 header */
+ switch (cfg_l3_inner) {
+ case PF_INET:
+ build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
+ IPPROTO_UDP,
+ in_saddr4.sin_addr.s_addr,
+ in_daddr4.sin_addr.s_addr,
+ il4_len + cfg_payload_len,
+ cfg_dsfield_inner);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
+ IPPROTO_UDP,
+ &in_saddr6, &in_daddr6,
+ il4_len + cfg_payload_len,
+ cfg_dsfield_inner);
+ break;
+ }
+
+ /* inner UDP header */
+ build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
+ cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);
+
+ /* without encapsulation the packet is complete at this point */
+ if (!cfg_encap_proto)
+ return il3_len + il4_len + cfg_payload_len;
+
+ /* outer L3 header */
+ switch (cfg_l3_outer) {
+ case PF_INET:
+ build_ipv4_header(buf + el3_len, cfg_encap_proto,
+ out_saddr4.sin_addr.s_addr,
+ out_daddr4.sin_addr.s_addr,
+ ol4_len + il3_len + il4_len + cfg_payload_len,
+ cfg_dsfield_outer);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf + el3_len, cfg_encap_proto,
+ &out_saddr6, &out_daddr6,
+ ol4_len + il3_len + il4_len + cfg_payload_len,
+ cfg_dsfield_outer);
+ break;
+ }
+
+ /* outer L4 header: the GUE header sits directly in front of the inner L3 */
+ switch (cfg_encap_proto) {
+ case IPPROTO_UDP:
+ build_gue_header(buf + el3_len + ol3_len + ol4_len -
+ sizeof(struct guehdr),
+ cfg_l3_inner == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6);
+ build_udp_header(buf + el3_len + ol3_len,
+ sizeof(struct guehdr) + il3_len + il4_len +
+ cfg_payload_len,
+ cfg_port_gue, cfg_l3_outer);
+ break;
+ case IPPROTO_GRE:
+ build_gre_header(buf + el3_len + ol3_len,
+ cfg_l3_inner == PF_INET ? ETH_P_IP
+ : ETH_P_IPV6);
+ break;
+ }
+
+ /* optional extra L3 header at the very front, carrying the outer packet */
+ switch (cfg_l3_extra) {
+ case PF_INET:
+ build_ipv4_header(buf,
+ cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6,
+ extra_saddr4.sin_addr.s_addr,
+ extra_daddr4.sin_addr.s_addr,
+ ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len, 0);
+ break;
+ case PF_INET6:
+ build_ipv6_header(buf,
+ cfg_l3_outer == PF_INET ? IPPROTO_IPIP
+ : IPPROTO_IPV6,
+ &extra_saddr6, &extra_daddr6,
+ ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len, 0);
+ break;
+ }
+
+ return el3_len + ol3_len + ol4_len + il3_len + il4_len +
+ cfg_payload_len;
+}
+
+/*
+ * Open the transmit socket: a raw IP socket of the family of the outermost
+ * header that will be sent (extra, else outer, else inner), connected to
+ * the destination address of that header.  Exits the program on failure.
+ */
+static int setup_tx(void)
+{
+	socklen_t dst_len;
+	int family, fd;
+	void *dst;
+
+	if (cfg_l3_extra) {
+		family = cfg_l3_extra;
+		if (family == PF_INET) {
+			dst = &extra_daddr4;
+			dst_len = sizeof(extra_daddr4);
+		} else {
+			dst = &extra_daddr6;
+			dst_len = sizeof(extra_daddr6);
+		}
+	} else if (cfg_l3_outer) {
+		family = cfg_l3_outer;
+		if (family == PF_INET) {
+			dst = &out_daddr4;
+			dst_len = sizeof(out_daddr4);
+		} else {
+			dst = &out_daddr6;
+			dst_len = sizeof(out_daddr6);
+		}
+	} else {
+		family = cfg_l3_inner;
+		if (family == PF_INET) {
+			dst = &in_daddr4;
+			dst_len = sizeof(in_daddr4);
+		} else {
+			dst = &in_daddr6;
+			dst_len = sizeof(in_daddr6);
+		}
+	}
+
+	fd = socket(family, SOCK_RAW, IPPROTO_RAW);
+	if (fd == -1)
+		error(1, errno, "socket tx");
+
+	if (connect(fd, dst, dst_len))
+		error(1, errno, "connect tx");
+
+	return fd;
+}
+
+/*
+ * Open the receive socket: a datagram socket of the inner address family,
+ * bound to the inner destination address.  Exits the program on failure.
+ */
+static int setup_rx(void)
+{
+	socklen_t local_len;
+	void *local;
+	int fd;
+
+	fd = socket(cfg_l3_inner, SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket rx");
+
+	if (cfg_l3_inner == PF_INET) {
+		local = &in_daddr4;
+		local_len = sizeof(in_daddr4);
+	} else {
+		local = &in_daddr6;
+		local_len = sizeof(in_daddr6);
+	}
+
+	if (bind(fd, local, local_len))
+		error(1, errno, "bind rx");
+
+	return fd;
+}
+
+/*
+ * Send one packet of @len bytes on @fd.  Returns the number of packets sent
+ * (always 1); exits the program on error or short write.
+ */
+static int do_tx(int fd, const char *pkt, int len)
+{
+	int ret;
+
+	ret = write(fd, pkt, len);
+	if (ret == -1)
+		error(1, errno, "send");
+	/* a short write does not set errno, so do not report a stale one */
+	if (ret != len)
+		error(1, 0, "send: len (%d < %d)\n", ret, len);
+
+	return 1;
+}
+
+/*
+ * Wait up to @timeout milliseconds for @events on @fd.  Returns the poll()
+ * result (0 on timeout); exits the program on a poll error or if an event
+ * other than POLLIN is reported.
+ */
+static int do_poll(int fd, short events, int timeout)
+{
+	struct pollfd pfd;
+	int ret;
+
+	pfd.fd = fd;
+	pfd.events = events;
+	pfd.revents = 0;
+
+	ret = poll(&pfd, 1, timeout);
+	if (ret == -1)
+		error(1, errno, "poll");
+	/* poll() succeeded here, so errno is stale: do not print it */
+	if (ret && !(pfd.revents & POLLIN))
+		error(1, 0, "poll: unexpected event 0x%x\n", pfd.revents);
+
+	return ret;
+}
+
+/*
+ * Drain @fd without blocking and return the number of datagrams read.  Only
+ * the first byte of each datagram is read and it must equal
+ * cfg_payload_char; exits the program on a mismatch or a receive error.
+ */
+static int do_rx(int fd)
+{
+	int received = 0;
+	char first;
+
+	for (;;) {
+		int ret = recv(fd, &first, 1, MSG_DONTWAIT);
+
+		if (ret == -1) {
+			/* nothing more queued */
+			if (errno == EAGAIN)
+				break;
+			error(1, errno, "recv");
+		}
+		if (first != cfg_payload_char)
+			error(1, 0, "recv: payload mismatch");
+		received++;
+	}
+
+	return received;
+}
+
+/*
+ * Run the test: open the sockets, build the packet once, then send it
+ * repeatedly while draining the receive socket.
+ *
+ * With -t the loop runs for cfg_num_secs seconds and prints (and resets) the
+ * counters about once per second; otherwise it stops after cfg_num_pkt
+ * transmitted packets.
+ *
+ * Returns 0 on success: all sent packets were received, or, when failure is
+ * expected, none were.  Returns 1 otherwise.
+ */
+static int do_main(void)
+{
+	unsigned long tstop, treport, tcur;
+	int fdt = -1, fdr = -1, len, tx = 0, rx = 0;
+
+	if (!cfg_only_tx)
+		fdr = setup_rx();
+	if (!cfg_only_rx)
+		fdt = setup_tx();
+
+	len = build_packet();
+
+	tcur = util_gettime();
+	treport = tcur + 1000;
+	tstop = tcur + (cfg_num_secs * 1000);
+
+	while (1) {
+		if (!cfg_only_rx)
+			tx += do_tx(fdt, buf, len);
+
+		if (!cfg_only_tx)
+			rx += do_rx(fdr);
+
+		if (cfg_num_secs) {
+			tcur = util_gettime();
+			if (tcur >= tstop)
+				break;
+			if (tcur >= treport) {
+				fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+				tx = 0;
+				rx = 0;
+				treport = tcur + 1000;
+			}
+		} else {
+			if (tx == cfg_num_pkt)
+				break;
+		}
+	}
+
+	/*
+	 * read straggler packets, if any.  Only with a receive socket: in
+	 * tx-only mode fdr is -1 and recv() on it would fail with EBADF.
+	 */
+	if (fdr != -1 && rx < tx) {
+		tstop = util_gettime() + 100;
+		while (rx < tx) {
+			tcur = util_gettime();
+			if (tcur >= tstop)
+				break;
+
+			do_poll(fdr, POLLIN, tstop - tcur);
+			rx += do_rx(fdr);
+		}
+	}
+
+	fprintf(stderr, "pkts: tx=%u rx=%u\n", tx, rx);
+
+	if (fdr != -1 && close(fdr))
+		error(1, errno, "close rx");
+	if (fdt != -1 && close(fdt))
+		error(1, errno, "close tx");
+
+	/*
+	 * success (== 0) only if received all packets
+	 * unless failure is expected, in which case none must arrive.
+	 */
+	if (cfg_expect_failure)
+		return rx != 0;
+	else
+		return rx != tx;
+}
+
+
+/* Print the command line synopsis to stderr and exit with status 1. */
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+	fprintf(stderr, "Usage: %s [-e gre|gue|bare|none] [-i 4|6] [-l len] "
+			"[-O 4|6] [-o 4|6] [-n num] [-t secs] [-R] [-T] "
+			"[-s <osrc>] [-d <odst>] [-S <isrc>] [-D <idst>] "
+			"[-x <otos>] [-X <itos>] [-f <isport>] [-F]\n",
+		filepath);
+	exit(1);
+}
+
+/*
+ * Convert the textual address @optarg of @family into binary form at @addr.
+ * Exits the program if the string is not a valid address.
+ */
+static void parse_addr(int family, void *addr, const char *optarg)
+{
+	switch (inet_pton(family, optarg, addr)) {
+	case -1:
+		error(1, errno, "inet_pton");
+		break;
+	case 0:
+		error(1, 0, "inet_pton: bad string");
+		break;
+	default:
+		break;
+	}
+}
+
+/* Parse an IPv4 address string into the sin_addr member of @addr. */
+static void parse_addr4(struct sockaddr_in *addr, const char *optarg)
+{
+	struct in_addr *dst = &addr->sin_addr;
+
+	parse_addr(AF_INET, dst, optarg);
+}
+
+/* Parse an IPv6 address string into the sin6_addr member of @addr. */
+static void parse_addr6(struct sockaddr_in6 *addr, const char *optarg)
+{
+	struct in6_addr *dst = &addr->sin6_addr;
+
+	parse_addr(AF_INET6, dst, optarg);
+}
+
+/*
+ * Map the option argument "4" or "6" to PF_INET or PF_INET6.  Any other
+ * string prints the usage text and exits.
+ */
+static int parse_protocol_family(const char *filepath, const char *optarg)
+{
+	if (strcmp(optarg, "4") == 0)
+		return PF_INET;
+	else if (strcmp(optarg, "6") == 0)
+		return PF_INET6;
+
+	/* does not return */
+	usage(filepath);
+}
+
+/*
+ * Parse the command line into the cfg_* globals and the address variables,
+ * then validate the combination of options.  Exits the program on invalid
+ * input.
+ *
+ * Address options depend on the address family chosen so far, so -s/-d must
+ * follow -o and -S/-D must follow -i.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
+		switch (c) {
+		case 'd':
+			if (cfg_l3_outer == AF_UNSPEC)
+				error(1, 0, "-d must be preceded by -o");
+			if (cfg_l3_outer == AF_INET)
+				parse_addr4(&out_daddr4, optarg);
+			else
+				parse_addr6(&out_daddr6, optarg);
+			break;
+		case 'D':
+			if (cfg_l3_inner == AF_UNSPEC)
+				error(1, 0, "-D must be preceded by -i");
+			if (cfg_l3_inner == AF_INET)
+				parse_addr4(&in_daddr4, optarg);
+			else
+				parse_addr6(&in_daddr6, optarg);
+			break;
+		case 'e':
+			if (!strcmp(optarg, "gre"))
+				cfg_encap_proto = IPPROTO_GRE;
+			else if (!strcmp(optarg, "gue"))
+				cfg_encap_proto = IPPROTO_UDP;
+			else if (!strcmp(optarg, "bare"))
+				cfg_encap_proto = IPPROTO_IPIP;
+			else if (!strcmp(optarg, "none"))
+				cfg_encap_proto = IPPROTO_IP;	/* == 0 */
+			else
+				usage(argv[0]);
+			break;
+		case 'f':
+			cfg_src_port = strtol(optarg, NULL, 0);
+			break;
+		case 'F':
+			cfg_expect_failure = true;
+			break;
+		case 'h':
+			usage(argv[0]);
+			break;
+		case 'i':
+			cfg_l3_inner = parse_protocol_family(argv[0], optarg);
+			break;
+		case 'l':
+			cfg_payload_len = strtol(optarg, NULL, 0);
+			break;
+		case 'n':
+			cfg_num_pkt = strtol(optarg, NULL, 0);
+			break;
+		case 'o':
+			cfg_l3_outer = parse_protocol_family(argv[0], optarg);
+			break;
+		case 'O':
+			cfg_l3_extra = parse_protocol_family(argv[0], optarg);
+			break;
+		case 'R':
+			cfg_only_rx = true;
+			break;
+		case 's':
+			/* same rule as -d: the family must be known first */
+			if (cfg_l3_outer == AF_UNSPEC)
+				error(1, 0, "-s must be preceded by -o");
+			if (cfg_l3_outer == AF_INET)
+				parse_addr4(&out_saddr4, optarg);
+			else
+				parse_addr6(&out_saddr6, optarg);
+			break;
+		case 'S':
+			/* same rule as -D: the family must be known first */
+			if (cfg_l3_inner == AF_UNSPEC)
+				error(1, 0, "-S must be preceded by -i");
+			if (cfg_l3_inner == AF_INET)
+				parse_addr4(&in_saddr4, optarg);
+			else
+				parse_addr6(&in_saddr6, optarg);
+			break;
+		case 't':
+			cfg_num_secs = strtol(optarg, NULL, 0);
+			break;
+		case 'T':
+			cfg_only_tx = true;
+			break;
+		case 'x':
+			cfg_dsfield_outer = strtol(optarg, NULL, 0);
+			break;
+		case 'X':
+			cfg_dsfield_inner = strtol(optarg, NULL, 0);
+			break;
+		default:
+			/* unknown option or missing argument */
+			usage(argv[0]);
+		}
+	}
+
+	if (cfg_only_rx && cfg_only_tx)
+		error(1, 0, "options: cannot combine rx-only and tx-only");
+
+	if (cfg_encap_proto && cfg_l3_outer == AF_UNSPEC)
+		error(1, 0, "options: must specify outer with encap");
+	else if ((!cfg_encap_proto) && cfg_l3_outer != AF_UNSPEC)
+		error(1, 0, "options: cannot combine no-encap and outer");
+	else if ((!cfg_encap_proto) && cfg_l3_extra != AF_UNSPEC)
+		error(1, 0, "options: cannot combine no-encap and extra");
+
+	/* the inner family defaults to IPv6 */
+	if (cfg_l3_inner == AF_UNSPEC)
+		cfg_l3_inner = AF_INET6;
+	/* "bare" encapsulation of an IPv6 inner packet uses IPPROTO_IPV6 */
+	if (cfg_l3_inner == AF_INET6 && cfg_encap_proto == IPPROTO_IPIP)
+		cfg_encap_proto = IPPROTO_IPV6;
+
+	/* RFC 6040 4.2:
+	 *   on decap, if outer encountered congestion (CE == 0x3),
+	 *   but inner cannot encode ECN (NoECT == 0x0), then drop packet.
+	 */
+	if (((cfg_dsfield_outer & 0x3) == 0x3) &&
+	    ((cfg_dsfield_inner & 0x3) == 0x0))
+		cfg_expect_failure = true;
+}
+
+/*
+ * Print the configured addresses to stderr: always the inner pair, then the
+ * encap protocol and the outer pair if an outer header is configured, then
+ * the extra pair if an extra header is configured.
+ */
+static void print_opts(void)
+{
+	if (cfg_l3_inner == PF_INET6) {
+		util_printaddr("inner.dest6", (void *) &in_daddr6);
+		util_printaddr("inner.source6", (void *) &in_saddr6);
+	} else {
+		util_printaddr("inner.dest4", (void *) &in_daddr4);
+		util_printaddr("inner.source4", (void *) &in_saddr4);
+	}
+
+	if (!cfg_l3_outer)
+		return;
+
+	fprintf(stderr, "encap proto:   %u\n", cfg_encap_proto);
+
+	if (cfg_l3_outer == PF_INET6) {
+		util_printaddr("outer.dest6", (void *) &out_daddr6);
+		util_printaddr("outer.source6", (void *) &out_saddr6);
+	} else {
+		util_printaddr("outer.dest4", (void *) &out_daddr4);
+		util_printaddr("outer.source4", (void *) &out_saddr4);
+	}
+
+	if (!cfg_l3_extra)
+		return;
+
+	/* select by the extra header's own family, not the outer one */
+	if (cfg_l3_extra == PF_INET6) {
+		util_printaddr("extra.dest6", (void *) &extra_daddr6);
+		util_printaddr("extra.source6", (void *) &extra_saddr6);
+	} else {
+		util_printaddr("extra.dest4", (void *) &extra_daddr4);
+		util_printaddr("extra.source4", (void *) &extra_saddr4);
+	}
+}
+
+/*
+ * Entry point: parse and print the options, run the test and use its result
+ * as the exit status.
+ */
+int main(int argc, char **argv)
+{
+	int status;
+
+	parse_opts(argc, argv);
+	print_opts();
+	status = do_main();
+
+	return status;
+}
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
new file mode 100755
index 000000000..174b72a64
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Load BPF flow dissector and verify it correctly dissects traffic
+export TESTNAME=test_flow_dissector
+unmount=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ "$UID" != 0 ]; then
+	echo "$msg please run this as root" >&2
+	exit $ksft_skip
+fi
+
+# This test needs to be run in a network namespace with in_netns.sh. Check if
+# this is the case and run it with in_netns.sh if it is being run in the root
+# namespace.
+if [[ -z $(ip netns identify $$) ]]; then
+	err=0
+	if bpftool="$(command -v bpftool)"; then
+		echo "Testing global flow dissector..."
+
+		"$bpftool" prog loadall ./bpf_flow.o /sys/fs/bpf/flow \
+			type flow_dissector
+
+		# Nothing is attached in the root namespace yet, so attaching
+		# from a new namespace is expected to succeed.
+		if ! unshare --net "$bpftool" prog attach pinned \
+			/sys/fs/bpf/flow/flow_dissector flow_dissector; then
+			echo "Unexpected unsuccessful attach in namespace" >&2
+			err=1
+		fi
+
+		"$bpftool" prog attach pinned /sys/fs/bpf/flow/flow_dissector \
+			flow_dissector
+
+		# With a dissector attached in the root namespace, attaching
+		# from a new namespace is expected to fail.
+		if unshare --net "$bpftool" prog attach pinned \
+			/sys/fs/bpf/flow/flow_dissector flow_dissector; then
+			echo "Unexpected successful attach in namespace" >&2
+			err=1
+		fi
+
+		if ! "$bpftool" prog detach pinned \
+			/sys/fs/bpf/flow/flow_dissector flow_dissector; then
+			echo "Failed to detach flow dissector" >&2
+			err=1
+		fi
+
+		rm -rf /sys/fs/bpf/flow
+	else
+		echo "Skipping root flow dissector test, bpftool not found" >&2
+	fi
+
+	# Run the rest of the tests in a net namespace.
+	../net/in_netns.sh "$0" "$@"
+	err=$(( err + $? ))
+
+	if (( err == 0 )); then
+		echo "selftests: $TESTNAME [PASS]";
+	else
+		echo "selftests: $TESTNAME [FAILED]";
+	fi
+
+	exit $err
+fi
+
+# Determine selftest success via shell exit code
+exit_handler()
+{
+ set +e
+
+ # Cleanup
+ tc filter del dev lo ingress pref 1337 2> /dev/null
+ tc qdisc del dev lo ingress 2> /dev/null
+ ./flow_dissector_load -d 2> /dev/null
+ if [ $unmount -ne 0 ]; then
+ umount bpffs 2> /dev/null
+ fi
+}
+
+# Exit script immediately (well caught by trap handler) if any
+# program/thing exits with a non-zero status.
+set -e
+
+# Run the cleanup on exit and on INT, QUIT and ABRT.  SIGKILL cannot be
+# trapped, so it is not listed.
+trap exit_handler EXIT INT QUIT ABRT
+
+# Mount BPF file system
+if /bin/mount | grep -q /sys/fs/bpf; then
+	echo "bpffs already mounted"
+else
+	echo "bpffs not mounted. Mounting..."
+	# remember to unmount it again in the exit handler
+	unmount=1
+	/bin/mount bpffs /sys/fs/bpf -t bpf
+fi
+
+# Attach BPF program
+./flow_dissector_load -p bpf_flow.o -s flow_dissector
+
+# Setup
+tc qdisc add dev lo ingress
+echo 0 > /proc/sys/net/ipv4/conf/default/rp_filter
+echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
+echo 0 > /proc/sys/net/ipv4/conf/lo/rp_filter
+
+echo "Testing IPv4..."
+# Drops all IP/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
+ udp src_port 9 action drop
+
+# Send 10 IPv4/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 4 -f 8
+# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 4 -f 9 -F
+# Send 10 IPv4/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 4 -f 10
+
+echo "Testing IPIP..."
+# Send 10 IPv4/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e bare -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 10
+
+echo "Testing IPv4 + GRE..."
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 8. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 8
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 9. Filter should drop all.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 9 -F
+# Send 10 IPv4/GRE/IPv4/UDP packets from port 10. Filter should not drop any.
+./with_addr.sh ./with_tunnels.sh ./test_flow_dissector -o 4 -e gre -i 4 \
+ -D 192.168.0.1 -S 1.1.1.1 -f 10
+
+tc filter del dev lo ingress pref 1337
+
+echo "Testing port range..."
+# Drops all IP/UDP packets coming from port 8-10
+tc filter add dev lo parent ffff: protocol ip pref 1337 flower ip_proto \
+ udp src_port 8-10 action drop
+
+# Send 10 IPv4/UDP packets from port 7. Filter should not drop any.
+./test_flow_dissector -i 4 -f 7
+# Send 10 IPv4/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 4 -f 9 -F
+# Send 10 IPv4/UDP packets from port 11. Filter should not drop any.
+./test_flow_dissector -i 4 -f 11
+
+tc filter del dev lo ingress pref 1337
+
+echo "Testing IPv6..."
+# Drops all IPv6/UDP packets coming from port 9
+tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
+ udp src_port 9 action drop
+
+# Send 10 IPv6/UDP packets from port 8. Filter should not drop any.
+./test_flow_dissector -i 6 -f 8
+# Send 10 IPv6/UDP packets from port 9. Filter should drop all.
+./test_flow_dissector -i 6 -f 9 -F
+# Send 10 IPv6/UDP packets from port 10. Filter should not drop any.
+./test_flow_dissector -i 6 -f 10
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_ftrace.sh b/tools/testing/selftests/bpf/test_ftrace.sh
new file mode 100755
index 000000000..20de7bb87
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_ftrace.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Run the fentry and fexit test_progs tests while the function_graph tracer
+# is active on bpf_prog_test* functions.  Exits with the status of the last
+# failing test_progs run, or 0 if both passed.
+
+TR=/sys/kernel/debug/tracing
+clear_trace() { # reset trace output
+	echo > "$TR/trace"
+}
+
+disable_tracing() { # stop trace recording
+	echo 0 > "$TR/tracing_on"
+}
+
+enable_tracing() { # start trace recording
+	echo 1 > "$TR/tracing_on"
+}
+
+reset_tracer() { # reset the current tracer
+	echo nop > "$TR/current_tracer"
+}
+
+disable_tracing
+clear_trace
+
+echo "" > "$TR/set_ftrace_filter"
+echo '*printk* *console* *wake* *serial* *lock*' > "$TR/set_ftrace_notrace"
+
+echo "bpf_prog_test*" > "$TR/set_graph_function"
+echo "" > "$TR/set_graph_notrace"
+
+echo function_graph > "$TR/current_tracer"
+
+# Remember a test failure, but always run both tests and restore the tracer
+# state before exiting.
+rc=0
+enable_tracing
+./test_progs -t fentry || rc=$?
+./test_progs -t fexit || rc=$?
+disable_tracing
+clear_trace
+
+reset_tracer
+
+exit "$rc"
diff --git a/tools/testing/selftests/bpf/test_iptunnel_common.h b/tools/testing/selftests/bpf/test_iptunnel_common.h
new file mode 100644
index 000000000..1d5ba839d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_iptunnel_common.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2016 Facebook
+ */
+#ifndef _TEST_IPTNL_COMMON_H
+#define _TEST_IPTNL_COMMON_H
+
+#include <linux/types.h>
+
+#define MAX_IPTNL_ENTRIES 256U
+
+/*
+ * NOTE(review): presumably the lookup key describing a virtual IP, i.e.
+ * destination address, port, family and protocol; users of this struct are
+ * not visible here, confirm against them.
+ */
+struct vip {
+ /* destination address: IPv6 as four 32-bit words, or a single IPv4 word */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 dport;
+ __u16 family; /* presumably AF_INET or AF_INET6; confirm against users */
+ __u8 protocol;
+};
+
+/*
+ * NOTE(review): presumably the tunnel parameters associated with a struct
+ * vip, i.e. tunnel source/destination address, family and a MAC address;
+ * users of this struct are not visible here, confirm against them.
+ */
+struct iptnl_info {
+ /* source address: IPv6 as four 32-bit words, or a single IPv4 word */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } saddr;
+ /* destination address, same representation as saddr */
+ union {
+ __u32 v6[4];
+ __u32 v4;
+ } daddr;
+ __u16 family; /* presumably AF_INET or AF_INET6; confirm against users */
+ __u8 dmac[6]; /* 6-byte MAC address */
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_kmod.sh b/tools/testing/selftests/bpf/test_kmod.sh
new file mode 100755
index 000000000..4f6444bcd
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_kmod.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+msg="skip all tests:"
+if [ "$(id -u)" != "0" ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
+if [ "$building_out_of_srctree" ]; then
+ # We are in linux-build/kselftest/bpf
+ OUTPUT=../../
+else
+ # We are in linux/tools/testing/selftests/bpf
+ OUTPUT=../../../../
+fi
+
+test_run()
+{
+ sysctl -w net.core.bpf_jit_enable=$1 2>&1 > /dev/null
+ sysctl -w net.core.bpf_jit_harden=$2 2>&1 > /dev/null
+
+ echo "[ JIT enabled:$1 hardened:$2 ]"
+ dmesg -C
+ if [ -f ${OUTPUT}/lib/test_bpf.ko ]; then
+ insmod ${OUTPUT}/lib/test_bpf.ko 2> /dev/null
+ if [ $? -ne 0 ]; then
+ rc=1
+ fi
+ else
+ # Use modprobe dry run to check for missing test_bpf module
+ if ! /sbin/modprobe -q -n test_bpf; then
+ echo "test_bpf: [SKIP]"
+ elif /sbin/modprobe -q test_bpf; then
+ echo "test_bpf: ok"
+ else
+ echo "test_bpf: [FAIL]"
+ rc=1
+ fi
+ fi
+ rmmod test_bpf 2> /dev/null
+ dmesg | grep FAIL
+}
+
+test_save()
+{
+ JE=`sysctl -n net.core.bpf_jit_enable`
+ JH=`sysctl -n net.core.bpf_jit_harden`
+}
+
+test_restore()
+{
+ sysctl -w net.core.bpf_jit_enable=$JE 2>&1 > /dev/null
+ sysctl -w net.core.bpf_jit_harden=$JH 2>&1 > /dev/null
+}
+
+rc=0
+test_save
+test_run 0 0
+test_run 1 0
+test_run 1 1
+test_run 1 2
+test_restore
+exit $rc
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2.sh b/tools/testing/selftests/bpf/test_lirc_mode2.sh
new file mode 100755
index 000000000..5252b91f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=$ksft_skip
+
+msg="skip all tests:"
+if [ $UID != 0 ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+modprobe rc-loopback
+
+for i in /sys/class/rc/rc*
+do
+ if grep -q DRV_NAME=rc-loopback $i/uevent
+ then
+ LIRCDEV=$(grep DEVNAME= $i/lirc*/uevent | sed sQDEVNAME=Q/dev/Q)
+ INPUTDEV=$(grep DEVNAME= $i/input*/event*/uevent | sed sQDEVNAME=Q/dev/Q)
+ fi
+done
+
+if [ -n "$LIRCDEV" ];
+then
+ TYPE=lirc_mode2
+ ./test_lirc_mode2_user $LIRCDEV $INPUTDEV
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ echo -e ${RED}"FAIL: $TYPE"${NC}
+ else
+ echo -e ${GREEN}"PASS: $TYPE"${NC}
+ fi
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/bpf/test_lirc_mode2_user.c b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
new file mode 100644
index 000000000..fb5fd6841
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lirc_mode2_user.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+// A lirc chardev is a device representing a consumer IR (cir) device which
+// can receive infrared signals from remote control and/or transmit IR.
+//
+// IR is sent as a series of pulses and space somewhat like morse code. The
+// BPF program can decode this into scancodes so that rc-core can translate
+// this into input key codes using the rc keymap.
+//
+// This test works by sending IR over rc-loopback, so the IR is processed by
+// BPF and then decoded into scancodes. The lirc chardev must be the one
+// associated with rc-loopback, see the output of ir-keytable(1).
+//
+// The following CONFIG options must be enabled for the test to succeed:
+// CONFIG_RC_CORE=y
+// CONFIG_BPF_RAWIR_EVENT=y
+// CONFIG_RC_LOOPBACK=y
+
+// Steps:
+// 1. Open the /dev/lircN device for rc-loopback (given on command line)
+// 2. Attach bpf_lirc_mode2 program which decodes some IR.
+// 3. Send some IR to the same IR device; since it is loopback, this will
+// end up in the bpf program
+// 4. bpf program should decode IR and report keycode
+// 5. We can read keycode from same /dev/lirc device
+
+#include <linux/bpf.h>
+#include <linux/lirc.h>
+#include <linux/input.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "bpf_util.h"
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+/*
+ * Usage: test_lirc_mode2_user /dev/lircN /dev/input/eventM
+ *
+ * Loads the lirc_mode2 BPF program, checks attach/detach/query behaviour on
+ * the lirc device and verifies that raw IR written to the lirc device shows
+ * up as decoded events on the input device.
+ *
+ * Returns 0 on success, 1 on any failure and 2 on wrong usage.
+ */
+int main(int argc, char **argv)
+{
+	struct bpf_object *obj;
+	int ret, lircfd, progfd, inputfd;
+	int testir1 = 0x1dead;
+	int testir2 = 0x20101;
+	u32 prog_ids[10], prog_flags[10], prog_cnt;
+
+	if (argc != 3) {
+		printf("Usage: %s /dev/lircN /dev/input/eventM\n", argv[0]);
+		return 2;
+	}
+
+	ret = bpf_prog_load("test_lirc_mode2_kern.o",
+			    BPF_PROG_TYPE_LIRC_MODE2, &obj, &progfd);
+	if (ret) {
+		printf("Failed to load bpf program\n");
+		return 1;
+	}
+
+	lircfd = open(argv[1], O_RDWR | O_NONBLOCK);
+	if (lircfd == -1) {
+		printf("failed to open lirc device %s: %m\n", argv[1]);
+		return 1;
+	}
+
+	/* Let's try detach it before it was ever attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret != -1 || errno != ENOENT) {
+		printf("bpf_prog_detach2 not attached should fail: %m\n");
+		return 1;
+	}
+
+	inputfd = open(argv[2], O_RDONLY | O_NONBLOCK);
+	if (inputfd == -1) {
+		/* report the input device path, not the lirc one */
+		printf("failed to open input device %s: %m\n", argv[2]);
+		return 1;
+	}
+
+	/* Nothing may be attached yet */
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 0) {
+		printf("Expected nothing to be attached\n");
+		return 1;
+	}
+
+	ret = bpf_prog_attach(progfd, lircfd, BPF_LIRC_MODE2, 0);
+	if (ret) {
+		printf("Failed to attach bpf to lirc device: %m\n");
+		return 1;
+	}
+
+	/* Write raw IR */
+	ret = write(lircfd, &testir1, sizeof(testir1));
+	if (ret != sizeof(testir1)) {
+		printf("Failed to send test IR message: %m\n");
+		return 1;
+	}
+
+	struct pollfd pfd = { .fd = inputfd, .events = POLLIN };
+	struct input_event event;
+
+	/*
+	 * Read input events until the scancode event for the first message
+	 * shows up.  inputfd is non-blocking, so if nothing arrives within
+	 * the poll timeout the read fails and so does the test.
+	 */
+	for (;;) {
+		poll(&pfd, 1, 100);
+
+		/* Read decoded IR */
+		ret = read(inputfd, &event, sizeof(event));
+		if (ret != sizeof(event)) {
+			printf("Failed to read decoded IR: %m\n");
+			return 1;
+		}
+
+		if (event.type == EV_MSC && event.code == MSC_SCAN &&
+		    event.value == 0xdead) {
+			break;
+		}
+	}
+
+	/* Write raw IR */
+	ret = write(lircfd, &testir2, sizeof(testir2));
+	if (ret != sizeof(testir2)) {
+		printf("Failed to send test IR message: %m\n");
+		return 1;
+	}
+
+	/* The second message must show up as a relative Y movement of 1 */
+	for (;;) {
+		poll(&pfd, 1, 100);
+
+		/* Read decoded IR */
+		ret = read(inputfd, &event, sizeof(event));
+		if (ret != sizeof(event)) {
+			printf("Failed to read decoded IR: %m\n");
+			return 1;
+		}
+
+		if (event.type == EV_REL && event.code == REL_Y &&
+		    event.value == 1) {
+			break;
+		}
+	}
+
+	/* Exactly one program must be attached now */
+	prog_cnt = 10;
+	ret = bpf_prog_query(lircfd, BPF_LIRC_MODE2, 0, prog_flags, prog_ids,
+			     &prog_cnt);
+	if (ret) {
+		printf("Failed to query bpf programs on lirc device: %m\n");
+		return 1;
+	}
+
+	if (prog_cnt != 1) {
+		printf("Expected one program to be attached\n");
+		return 1;
+	}
+
+	/* Let's try detaching it now it is actually attached */
+	ret = bpf_prog_detach2(progfd, lircfd, BPF_LIRC_MODE2);
+	if (ret) {
+		printf("bpf_prog_detach2: returned %m\n");
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c
new file mode 100644
index 000000000..006be3963
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lpm_map.c
@@ -0,0 +1,804 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Randomized tests for eBPF longest-prefix-match maps
+ *
+ * This program runs randomized tests against the lpm-bpf-map. It implements a
+ * "Trivial Longest Prefix Match" (tlpm) based on simple, linear, singly linked
+ * lists. The implementation should be pretty straightforward.
+ *
+ * Based on tlpm, this inserts randomized data into bpf-lpm-maps and verifies
+ * the trie-based bpf-map implementation behaves the same way as tlpm.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/bpf.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include <arpa/inet.h>
+#include <sys/time.h>
+
+#include <bpf/bpf.h>
+
+#include "bpf_util.h"
+#include "bpf_rlimit.h"
+
+/* One entry of the "trivial LPM" reference list that the kernel trie is
+ * compared against.
+ */
+struct tlpm_node {
+	struct tlpm_node *next;	/* next entry of the singly linked list */
+	size_t n_bits;		/* length of the prefix held in @key, in bits */
+	uint8_t key[];		/* prefix bytes; tlpm_add() stores (n_bits + 7) / 8 of them */
+};
+
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+ const uint8_t *key,
+ size_t n_bits);
+
+/* Insert @key/@n_bits into @list and return the (possibly new) list head.
+ *
+ * If an entry with the same prefix and the same length already exists, its
+ * key bytes are refreshed in place and the head is returned unchanged.
+ * Otherwise a freshly allocated entry is prepended.
+ */
+static struct tlpm_node *tlpm_add(struct tlpm_node *list,
+				  const uint8_t *key,
+				  size_t n_bits)
+{
+	const size_t n_bytes = (n_bits + 7) / 8;
+	struct tlpm_node *entry = tlpm_match(list, key, n_bits);
+
+	if (entry && entry->n_bits == n_bits) {
+		/* an equivalent entry exists: just 'overwrite' it */
+		memcpy(entry->key, key, n_bytes);
+		return list;
+	}
+
+	/* no equivalent entry: allocate one and make it the new head */
+	entry = malloc(sizeof(*entry) + n_bytes);
+	assert(entry);
+
+	memcpy(entry->key, key, n_bytes);
+	entry->n_bits = n_bits;
+	entry->next = list;
+
+	return entry;
+}
+
+/* Release every entry of @list. */
+static void tlpm_clear(struct tlpm_node *list)
+{
+	struct tlpm_node *next;
+
+	for ( ; list; list = next) {
+		/* remember the successor before the entry is freed */
+		next = list->next;
+		free(list);
+	}
+}
+
+/* Longest-prefix match of @key/@n_bits against @list.
+ *
+ * Every entry is visited; an entry qualifies when all of its n_bits leading
+ * bits equal the corresponding bits of @key (so an entry longer than @n_bits
+ * never qualifies). The qualifying entry with the largest n_bits is
+ * returned, or NULL if none qualifies.
+ */
+static struct tlpm_node *tlpm_match(struct tlpm_node *list,
+				    const uint8_t *key,
+				    size_t n_bits)
+{
+	struct tlpm_node *longest = NULL;
+	struct tlpm_node *cur;
+	size_t bit;
+
+	for (cur = list; cur; cur = cur->next) {
+		/* count how many leading bits of @key agree with this entry */
+		bit = 0;
+		while (bit < n_bits && bit < cur->n_bits) {
+			uint8_t mask = 1 << (7 - bit % 8);
+
+			if ((key[bit / 8] ^ cur->key[bit / 8]) & mask)
+				break;
+			++bit;
+		}
+
+		/* skip entries whose prefix was not matched in full */
+		if (bit < cur->n_bits)
+			continue;
+
+		if (!longest || bit > longest->n_bits)
+			longest = cur;
+	}
+
+	return longest;
+}
+
+/* Remove the entry that is exactly @key/@n_bits from @list, if present, and
+ * return the resulting list head. A match with a shorter prefix is not
+ * touched.
+ */
+static struct tlpm_node *tlpm_delete(struct tlpm_node *list,
+				     const uint8_t *key,
+				     size_t n_bits)
+{
+	struct tlpm_node *victim = tlpm_match(list, key, n_bits);
+	struct tlpm_node *prev;
+
+	/* only an entry of exactly @n_bits counts as a hit */
+	if (!victim || victim->n_bits != n_bits)
+		return list;
+
+	if (victim == list) {
+		/* unlinking the head: its successor becomes the new head */
+		list = victim->next;
+		free(victim);
+		return list;
+	}
+
+	/* locate the predecessor of the victim */
+	prev = list;
+	while (prev && prev->next != victim)
+		prev = prev->next;
+
+	if (!prev) {
+		/* should never get here */
+		assert(0);
+		return list;
+	}
+
+	prev->next = victim->next;
+	free(victim);
+	return list;
+}
+
+/* Static sanity checks of the tlpm reference implementation itself: add,
+ * match and delete on a handful of hand-picked one- and two-byte prefixes.
+ * No BPF map is involved.
+ */
+static void test_lpm_basic(void)
+{
+	struct tlpm_node *list = NULL, *t1, *t2;
+
+	/* very basic, static tests to verify tlpm works as expected */
+
+	/* an empty list never matches */
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+	/* a single 0xff/8 entry: matches itself and any longer key starting
+	 * with 0xff, but neither different bytes nor a shorter lookup key
+	 */
+	t1 = list = tlpm_add(list, (uint8_t[]){ 0xff }, 8);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0x00 }, 16));
+	assert(!tlpm_match(list, (uint8_t[]){ 0x7f }, 8));
+	assert(!tlpm_match(list, (uint8_t[]){ 0xfe }, 8));
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 7));
+
+	/* add the longer 0xffff/16: it wins only for full 16-bit lookups */
+	t2 = list = tlpm_add(list, (uint8_t[]){ 0xff, 0xff }, 16);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t2 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 15));
+	assert(!tlpm_match(list, (uint8_t[]){ 0x7f, 0xff }, 16));
+
+	/* after deleting the /16, lookups fall back to the /8 entry */
+	list = tlpm_delete(list, (uint8_t[]){ 0xff, 0xff }, 16);
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+	assert(t1 == tlpm_match(list, (uint8_t[]){ 0xff, 0xff }, 16));
+
+	/* deleting the last entry leaves nothing to match */
+	list = tlpm_delete(list, (uint8_t[]){ 0xff }, 8);
+	assert(!tlpm_match(list, (uint8_t[]){ 0xff }, 8));
+
+	tlpm_clear(list);
+}
+
+/* Check that tlpm lookups do not depend on the order in which entries were
+ * inserted. No BPF map is involved.
+ */
+static void test_lpm_order(void)
+{
+	struct tlpm_node *t1, *t2, *l1 = NULL, *l2 = NULL;
+	size_t i, j;
+
+	/* Verify the tlpm implementation works correctly regardless of the
+	 * order of entries. Insert a random set of entries into @l1, and copy
+	 * the same data in reverse order into @l2. Then verify a lookup of
+	 * random keys will yield the same result in both sets.
+	 */
+
+	/* Use "& 0xff" (as test_lpm_map() does) rather than "% 0xff", which
+	 * could never produce the byte value 0xff.
+	 */
+	for (i = 0; i < (1 << 12); ++i)
+		l1 = tlpm_add(l1, (uint8_t[]){
+					rand() & 0xff,
+					rand() & 0xff,
+				}, rand() % 16 + 1);
+
+	/* tlpm_add() prepends, so walking @l1 from its head reverses the order */
+	for (t1 = l1; t1; t1 = t1->next)
+		l2 = tlpm_add(l2, t1->key, t1->n_bits);
+
+	for (i = 0; i < (1 << 8); ++i) {
+		uint8_t key[] = { rand() & 0xff, rand() & 0xff };
+
+		t1 = tlpm_match(l1, key, 16);
+		t2 = tlpm_match(l2, key, 16);
+
+		/* both lists must agree on hit/miss ... */
+		assert(!t1 == !t2);
+		if (t1) {
+			/* ... and on the matched prefix, bit for bit */
+			assert(t1->n_bits == t2->n_bits);
+			for (j = 0; j < t1->n_bits; ++j)
+				assert((t1->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (t2->key[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	tlpm_clear(l1);
+	tlpm_clear(l2);
+}
+
+/* Randomized comparison of the kernel LPM trie against the tlpm reference.
+ *
+ * @keysize: number of key data bytes (the trie key is a struct
+ *           bpf_lpm_trie_key header followed by @keysize bytes).
+ *
+ * The map value is @keysize + 1 bytes: a copy of the inserted prefix bytes
+ * followed by one byte holding the prefix length, so that a lookup result can
+ * be compared with the tlpm entry that matched.
+ */
+static void test_lpm_map(int keysize)
+{
+	size_t i, j, n_matches, n_matches_after_delete, n_nodes, n_lookups;
+	struct tlpm_node *t, *list = NULL;
+	struct bpf_lpm_trie_key *key;
+	uint8_t *data, *value;
+	int r, map;
+
+	/* Compare behavior of tlpm vs. bpf-lpm. Create a randomized set of
+	 * prefixes and insert it into both tlpm and bpf-lpm. Then run some
+	 * randomized lookups and verify both maps return the same result.
+	 */
+
+	n_matches = 0;
+	n_matches_after_delete = 0;
+	n_nodes = 1 << 8;
+	n_lookups = 1 << 16;
+
+	/* scratch buffer for random lookup keys */
+	data = alloca(keysize);
+	memset(data, 0, keysize);
+
+	/* prefix bytes plus one trailing byte for the prefix length */
+	value = alloca(keysize + 1);
+	memset(value, 0, keysize + 1);
+
+	key = alloca(sizeof(*key) + keysize);
+	memset(key, 0, sizeof(*key) + keysize);
+
+	map = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+			     sizeof(*key) + keysize,
+			     keysize + 1,
+			     4096,
+			     BPF_F_NO_PREALLOC);
+	assert(map >= 0);
+
+	/* insert the same random prefixes into tlpm and into the trie */
+	for (i = 0; i < n_nodes; ++i) {
+		for (j = 0; j < keysize; ++j)
+			value[j] = rand() & 0xff;
+		/* random prefix length in [0, 8 * keysize] */
+		value[keysize] = rand() % (8 * keysize + 1);
+
+		list = tlpm_add(list, value, value[keysize]);
+
+		key->prefixlen = value[keysize];
+		memcpy(key->data, value, keysize);
+		r = bpf_map_update_elem(map, key, value, 0);
+		assert(!r);
+	}
+
+	/* random full-length lookups must give the same answer in both */
+	for (i = 0; i < n_lookups; ++i) {
+		for (j = 0; j < keysize; ++j)
+			data[j] = rand() & 0xff;
+
+		t = tlpm_match(list, data, 8 * keysize);
+
+		key->prefixlen = 8 * keysize;
+		memcpy(key->data, data, keysize);
+		r = bpf_map_lookup_elem(map, key, value);
+		assert(!r || errno == ENOENT);
+		/* tlpm hit <=> trie lookup succeeded */
+		assert(!t == !!r);
+
+		if (t) {
+			++n_matches;
+			assert(t->n_bits == value[keysize]);
+			for (j = 0; j < t->n_bits; ++j)
+				assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (value[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	/* Remove the first half of the elements in the tlpm and the
+	 * corresponding nodes from the bpf-lpm. Then run the same
+	 * large number of random lookups in both and make sure they match.
+	 * Note: we need to count the number of nodes actually inserted
+	 * since there may have been duplicates.
+	 */
+	for (i = 0, t = list; t; i++, t = t->next)
+		;
+	for (j = 0; j < i / 2; ++j) {
+		/* always delete the current list head from both structures */
+		key->prefixlen = list->n_bits;
+		memcpy(key->data, list->key, keysize);
+		r = bpf_map_delete_elem(map, key);
+		assert(!r);
+		list = tlpm_delete(list, list->key, list->n_bits);
+		assert(list);
+	}
+	for (i = 0; i < n_lookups; ++i) {
+		for (j = 0; j < keysize; ++j)
+			data[j] = rand() & 0xff;
+
+		t = tlpm_match(list, data, 8 * keysize);
+
+		key->prefixlen = 8 * keysize;
+		memcpy(key->data, data, keysize);
+		r = bpf_map_lookup_elem(map, key, value);
+		assert(!r || errno == ENOENT);
+		assert(!t == !!r);
+
+		if (t) {
+			++n_matches_after_delete;
+			assert(t->n_bits == value[keysize]);
+			for (j = 0; j < t->n_bits; ++j)
+				assert((t->key[j / 8] & (1 << (7 - j % 8))) ==
+				       (value[j / 8] & (1 << (7 - j % 8))));
+		}
+	}
+
+	close(map);
+	tlpm_clear(list);
+
+	/* With up to 256 random nodes in the map (n_nodes insert attempts,
+	 * minus duplicates), we are pretty likely to match something on
+	 * every lookup. For statistics, use this:
+	 *
+	 *     printf("          nodes: %zu\n"
+	 *            "        lookups: %zu\n"
+	 *            "        matches: %zu\n"
+	 *            "matches(delete): %zu\n",
+	 *            n_nodes, n_lookups, n_matches, n_matches_after_delete);
+	 */
+}
+
+/* Test the implementation with some 'real world' examples */
+
+/* Exercise the LPM trie with IPv4 and IPv6 address prefixes: insert a few
+ * overlapping ranges, then check that full-length lookups return the value
+ * of the longest matching prefix, or ENOENT when nothing matches.
+ */
+static void test_lpm_ipaddr(void)
+{
+	struct bpf_lpm_trie_key *key_ipv4;
+	struct bpf_lpm_trie_key *key_ipv6;
+	size_t key_size_ipv4;
+	size_t key_size_ipv6;
+	int map_fd_ipv4;
+	int map_fd_ipv6;
+	__u64 value;
+
+	/* trie key header followed by 4 (IPv4) or 16 (IPv6) address bytes */
+	key_size_ipv4 = sizeof(*key_ipv4) + sizeof(__u32);
+	key_size_ipv6 = sizeof(*key_ipv6) + sizeof(__u32) * 4;
+	key_ipv4 = alloca(key_size_ipv4);
+	key_ipv6 = alloca(key_size_ipv6);
+
+	map_fd_ipv4 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv4, sizeof(value),
+				     100, BPF_F_NO_PREALLOC);
+	assert(map_fd_ipv4 >= 0);
+
+	map_fd_ipv6 = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				     key_size_ipv6, sizeof(value),
+				     100, BPF_F_NO_PREALLOC);
+	assert(map_fd_ipv6 >= 0);
+
+	/* Fill in some IPv4 and IPv6 address ranges */
+	value = 1;
+	key_ipv4->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 2;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 3;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 5;
+	key_ipv4->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 4;
+	key_ipv4->prefixlen = 23;
+	inet_pton(AF_INET, "192.168.0.0", key_ipv4->data);
+	assert(bpf_map_update_elem(map_fd_ipv4, key_ipv4, &value, 0) == 0);
+
+	value = 0xdeadbeef;
+	key_ipv6->prefixlen = 64;
+	inet_pton(AF_INET6, "2a00:1450:4001:814::200e", key_ipv6->data);
+	assert(bpf_map_update_elem(map_fd_ipv6, key_ipv6, &value, 0) == 0);
+
+	/* Set prefixlen to maximum for lookups */
+	key_ipv4->prefixlen = 32;
+	key_ipv6->prefixlen = 128;
+
+	/* Test some lookups that should come back with a value */
+	inet_pton(AF_INET, "192.168.128.23", key_ipv4->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0);
+	assert(value == 3);
+
+	inet_pton(AF_INET, "192.168.0.1", key_ipv4->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == 0);
+	assert(value == 2);
+
+	inet_pton(AF_INET6, "2a00:1450:4001:814::", key_ipv6->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0);
+	assert(value == 0xdeadbeef);
+
+	inet_pton(AF_INET6, "2a00:1450:4001:814::1", key_ipv6->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == 0);
+	assert(value == 0xdeadbeef);
+
+	/* Test some lookups that should not match any entry */
+	inet_pton(AF_INET, "10.0.0.1", key_ipv4->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 &&
+	       errno == ENOENT);
+
+	inet_pton(AF_INET, "11.11.11.11", key_ipv4->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv4, key_ipv4, &value) == -1 &&
+	       errno == ENOENT);
+
+	inet_pton(AF_INET6, "2a00:ffff::", key_ipv6->data);
+	assert(bpf_map_lookup_elem(map_fd_ipv6, key_ipv6, &value) == -1 &&
+	       errno == ENOENT);
+
+	close(map_fd_ipv4);
+	close(map_fd_ipv6);
+}
+
+/* Exercise deletion from the LPM trie: deleting keys that are not present
+ * must fail with ENOENT, and after each successful delete a full-length
+ * lookup must fall back to the next-longest remaining prefix.
+ */
+static void test_lpm_delete(void)
+{
+	struct bpf_lpm_trie_key *key;
+	size_t key_size;
+	int map_fd;
+	__u64 value;
+
+	key_size = sizeof(*key) + sizeof(__u32);
+	key = alloca(key_size);
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE,
+				key_size, sizeof(value),
+				100, BPF_F_NO_PREALLOC);
+	assert(map_fd >= 0);
+
+	/* Add nodes:
+	 * 192.168.0.0/16   (1)
+	 * 192.168.0.0/24   (2)
+	 * 192.168.128.0/24 (3)
+	 * 192.168.1.0/24   (4)
+	 *
+	 *         (1)
+	 *        /   \
+	 *     (IM)    (3)
+	 *    /   \
+	 *   (2)  (4)
+	 */
+	value = 1;
+	key->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 2;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 3;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	value = 4;
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key->data);
+	assert(bpf_map_update_elem(map_fd, key, &value, 0) == 0);
+
+	/* look up an address that no prefix covers */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "10.0.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 &&
+	       errno == ENOENT);
+
+	/* remove non-existent nodes */
+	key->prefixlen = 30; // unused prefix so far
+	inet_pton(AF_INET, "192.255.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == -1 &&
+	       errno == ENOENT);
+
+	key->prefixlen = 16; // same prefix as the root node
+	inet_pton(AF_INET, "192.255.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == -1 &&
+	       errno == ENOENT);
+
+	/* assert initial lookup */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 2);
+
+	/* remove leaf node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	/* the same address now resolves to the /16 entry */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.0.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 1);
+
+	/* remove leaf (and intermediary) node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.1.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 1);
+
+	/* remove root node */
+	key->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	/* the remaining 192.168.128.0/24 entry must still be reachable */
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.128.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == 0);
+	assert(value == 3);
+
+	/* remove last node */
+	key->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key->data);
+	assert(bpf_map_delete_elem(map_fd, key) == 0);
+
+	key->prefixlen = 32;
+	inet_pton(AF_INET, "192.168.128.1", key->data);
+	assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 &&
+	       errno == ENOENT);
+
+	close(map_fd);
+}
+
+/* Exercise bpf_map_get_next_key() on an LPM trie.
+ *
+ * The map is grown one prefix at a time (from empty up to five entries).
+ * After each insertion the whole map is walked, starting with a NULL key,
+ * and the test asserts the exact sequence of keys returned, ending with
+ * ENOENT. It also checks that a key which is not present in the map yields
+ * the first key of the iteration order.
+ */
+static void test_lpm_get_next_key(void)
+{
+	struct bpf_lpm_trie_key *key_p, *next_key_p;
+	size_t key_size;
+	__u32 value = 0;
+	int map_fd;
+
+	key_size = sizeof(*key_p) + sizeof(__u32);
+	key_p = alloca(key_size);
+	next_key_p = alloca(key_size);
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, sizeof(value),
+				100, BPF_F_NO_PREALLOC);
+	assert(map_fd >= 0);
+
+	/* empty tree. get_next_key should return ENOENT */
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* get and verify the first key, get the second one should fail. */
+	key_p->prefixlen = 16;
+	inet_pton(AF_INET, "192.168.0.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 16 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168);
+
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* no exact matching key should get the first one in post order.
+	 * Pass the non-matching key (192.168.0.0/8) itself as the current
+	 * key; with a NULL key this step would only repeat the check above.
+	 */
+	key_p->prefixlen = 8;
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	/* add one more element (total two) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.128.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 128);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* Add one more element (total three) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.0.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* Add one more element (total four) */
+	key_p->prefixlen = 24;
+	inet_pton(AF_INET, "192.168.1.0", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 1);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* Add one more element (total five) */
+	key_p->prefixlen = 28;
+	inet_pton(AF_INET, "192.168.1.128", key_p->data);
+	assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+
+	memset(key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+	assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+	       key_p->data[1] == 168 && key_p->data[2] == 0);
+
+	memset(next_key_p, 0, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 28 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 1 &&
+	       next_key_p->data[3] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 1);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168);
+
+	memcpy(key_p, next_key_p, key_size);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
+	       errno == ENOENT);
+
+	/* no exact matching key should return the first one in post order */
+	key_p->prefixlen = 22;
+	inet_pton(AF_INET, "192.168.1.0", key_p->data);
+	assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+	assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+	       next_key_p->data[1] == 168 && next_key_p->data[2] == 0);
+
+	close(map_fd);
+}
+
+/* Number of distinct keys each worker thread cycles through. */
+#define MAX_TEST_KEYS 4
+/* Per-thread parameters for lpm_test_command(). */
+struct lpm_mt_test_info {
+	int cmd; /* 0: update, 1: delete, 2: lookup, 3: get_next_key */
+	int iter; /* number of passes over all MAX_TEST_KEYS keys */
+	int map_fd; /* LPM trie map the command is issued against */
+	struct {
+		__u32 prefixlen; /* prefix length in bits */
+		__u32 data; /* IPv4 address bytes, filled by inet_pton(AF_INET) */
+	} key[MAX_TEST_KEYS];
+};
+
+/* Thread body for test_lpm_multi_thread().
+ *
+ * @arg: struct lpm_mt_test_info describing which map operation (info->cmd)
+ *       to repeat, on which map, and for how many iterations.
+ *
+ * Runs info->iter passes over the MAX_TEST_KEYS keys, issuing the selected
+ * operation for each key. Updates must succeed; delete/lookup/get_next_key
+ * may also fail with ENOENT (get_next_key additionally with ENOMEM).
+ *
+ * Exits the thread with @arg as its result so the joiner can tell that the
+ * thread ran to completion.
+ */
+static void *lpm_test_command(void *arg)
+{
+	int i, j, ret, iter, key_size;
+	struct lpm_mt_test_info *info = arg;
+	struct bpf_lpm_trie_key *key_p, *next_key_p;
+
+	key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32);
+	key_p = alloca(key_size);
+	/* Allocate the get_next_key output buffer once, up front. alloca()
+	 * memory is only released when the function returns, so allocating
+	 * it inside the loops would grow the stack on every iteration.
+	 */
+	next_key_p = alloca(key_size);
+
+	for (iter = 0; iter < info->iter; iter++)
+		for (i = 0; i < MAX_TEST_KEYS; i++) {
+			/* first half of iterations in forward order,
+			 * and second half in backward order.
+			 */
+			j = (iter < (info->iter / 2)) ? i : MAX_TEST_KEYS - i - 1;
+			key_p->prefixlen = info->key[j].prefixlen;
+			memcpy(key_p->data, &info->key[j].data, sizeof(__u32));
+			if (info->cmd == 0) {
+				__u32 value = j;
+				/* update must succeed */
+				assert(bpf_map_update_elem(info->map_fd, key_p, &value, 0) == 0);
+			} else if (info->cmd == 1) {
+				ret = bpf_map_delete_elem(info->map_fd, key_p);
+				assert(ret == 0 || errno == ENOENT);
+			} else if (info->cmd == 2) {
+				__u32 value;
+				ret = bpf_map_lookup_elem(info->map_fd, key_p, &value);
+				assert(ret == 0 || errno == ENOENT);
+			} else {
+				ret = bpf_map_get_next_key(info->map_fd, key_p, next_key_p);
+				assert(ret == 0 || errno == ENOENT || errno == ENOMEM);
+			}
+		}
+
+	// Pass successful exit info back to the main thread
+	pthread_exit((void *)info);
+}
+
+/* Fill @info with the iteration count, the target @map_fd and the four IPv4
+ * prefixes used by the multi-thread test. info->cmd is left for the caller
+ * to set.
+ */
+static void setup_lpm_mt_test_info(struct lpm_mt_test_info *info, int map_fd)
+{
+	static const struct {
+		__u32 prefixlen;
+		const char *addr;
+	} prefixes[MAX_TEST_KEYS] = {
+		{ 16, "192.168.0.0" },
+		{ 24, "192.168.0.0" },
+		{ 24, "192.168.128.0" },
+		{ 24, "192.168.1.0" },
+	};
+	int k;
+
+	info->iter = 2000;
+	info->map_fd = map_fd;
+
+	for (k = 0; k < MAX_TEST_KEYS; k++) {
+		info->key[k].prefixlen = prefixes[k].prefixlen;
+		inet_pton(AF_INET, prefixes[k].addr, &info->key[k].data);
+	}
+}
+
+/* Run update, delete, lookup and get_next_key concurrently, one thread per
+ * operation, against a single LPM trie, and check that every thread runs to
+ * completion.
+ */
+static void test_lpm_multi_thread(void)
+{
+	struct lpm_mt_test_info info[4];
+	size_t key_size, value_size;
+	pthread_t thread_id[4];
+	int i, map_fd;
+	void *ret;
+
+	/* create a trie */
+	value_size = sizeof(__u32);
+	/* key data is one IPv4 address; size it independently of the value */
+	key_size = sizeof(struct bpf_lpm_trie_key) + sizeof(__u32);
+	map_fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, key_size, value_size,
+				100, BPF_F_NO_PREALLOC);
+	/* fail here rather than in a worker thread using an invalid fd */
+	assert(map_fd >= 0);
+
+	/* create 4 threads to test update, delete, lookup and get_next_key */
+	setup_lpm_mt_test_info(&info[0], map_fd);
+	for (i = 0; i < 4; i++) {
+		if (i != 0)
+			memcpy(&info[i], &info[0], sizeof(info[i]));
+		info[i].cmd = i;
+		assert(pthread_create(&thread_id[i], NULL, &lpm_test_command, &info[i]) == 0);
+	}
+
+	/* each thread hands its own info pointer back on success */
+	for (i = 0; i < 4; i++)
+		assert(pthread_join(thread_id[i], &ret) == 0 && ret == (void *)&info[i]);
+
+	close(map_fd);
+}
+
+/* Run all LPM tests in a fixed order; any failure aborts via assert(). */
+int main(void)
+{
+	int keysize;
+
+	/* fixed seed: the pseudo random tests must be reproducible */
+	srand(0xf00ba1);
+
+	test_lpm_basic();
+	test_lpm_order();
+
+	/* key data of 1..16 bytes, i.e. 8, 16, 24, 32, ... 128 bit keys */
+	keysize = 1;
+	while (keysize <= 16) {
+		test_lpm_map(keysize);
+		++keysize;
+	}
+
+	test_lpm_ipaddr();
+	test_lpm_delete();
+	test_lpm_get_next_key();
+	test_lpm_multi_thread();
+
+	printf("test_lpm: OK\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_lru_map.c b/tools/testing/selftests/bpf/test_lru_map.c
new file mode 100644
index 000000000..6a5349f9e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lru_map.c
@@ -0,0 +1,903 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2016 Facebook
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <sys/wait.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+#include "bpf_rlimit.h"
+#include "../../../include/linux/filter.h"
+
+#define LOCAL_FREE_TARGET (128)
+#define PERCPU_FREE_TARGET (4)
+
+static int nr_cpus;
+
+/* Create a BPF map of @map_type with @size entries whose key and value are
+ * both unsigned long long. Returns the map fd; on failure the error is
+ * reported with perror() and -1 is returned.
+ */
+static int create_map(int map_type, int map_flags, unsigned int size)
+{
+	const int fd = bpf_create_map(map_type,
+				      sizeof(unsigned long long),
+				      sizeof(unsigned long long),
+				      size, map_flags);
+
+	if (fd == -1)
+		perror("bpf_create_map");
+
+	return fd;
+}
+
+/* Look up @key in map @fd from inside a BPF program rather than through the
+ * lookup syscall.
+ *
+ * A tiny SCHED_CLS program is loaded and run once with
+ * bpf_prog_test_run(). It calls the map_lookup_elem helper on @fd and, on a
+ * hit, copies the first 8 bytes of the found value into a one-element array
+ * map, from which the result is then read back into @value.
+ * NOTE(review): as the name suggests, the point is presumably that a
+ * program-side lookup marks the LRU reference bit - confirm against the
+ * kernel LRU implementation.
+ *
+ * Returns 0 on success, -1 if the scratch map or program could not be
+ * created, the test run failed, or the key was not found.
+ */
+static int bpf_map_lookup_elem_with_ref_bit(int fd, unsigned long long key,
+					    void *value)
+{
+	struct bpf_load_program_attr prog;
+	struct bpf_create_map_attr map;
+	struct bpf_insn insns[] = {
+		/* r9 = address of the scratch array value (fd patched below) */
+		BPF_LD_MAP_VALUE(BPF_REG_9, 0, 0),
+		/* r1 = map to look up in */
+		BPF_LD_MAP_FD(BPF_REG_1, fd),
+		/* store @key on the stack at fp-8 and point r2 at it */
+		BPF_LD_IMM64(BPF_REG_3, key),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+		BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, 0),
+		BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+		/* miss (r0 == NULL): skip ahead to "r0 = 1" */
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+		/* hit: copy 8 bytes of the value into the scratch map */
+		BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+		BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_1, 0),
+		/* return 42 on a hit ... */
+		BPF_MOV64_IMM(BPF_REG_0, 42),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+		/* ... and 1 on a miss */
+		BPF_MOV64_IMM(BPF_REG_0, 1),
+		BPF_EXIT_INSN(),
+	};
+	/* dummy input buffer handed to the test run */
+	__u8 data[64] = {};
+	int mfd, pfd, ret, zero = 0;
+	__u32 retval = 0;
+
+	/* scratch map: a single 8-byte slot the program writes the value to */
+	memset(&map, 0, sizeof(map));
+	map.map_type = BPF_MAP_TYPE_ARRAY;
+	map.key_size = sizeof(int);
+	map.value_size = sizeof(unsigned long long);
+	map.max_entries = 1;
+
+	mfd = bpf_create_map_xattr(&map);
+	if (mfd < 0)
+		return -1;
+
+	/* patch the scratch map fd into the first instruction */
+	insns[0].imm = mfd;
+
+	memset(&prog, 0, sizeof(prog));
+	prog.prog_type = BPF_PROG_TYPE_SCHED_CLS;
+	prog.insns = insns;
+	prog.insns_cnt = ARRAY_SIZE(insns);
+	prog.license = "GPL";
+
+	pfd = bpf_load_program_xattr(&prog, NULL, 0);
+	if (pfd < 0) {
+		close(mfd);
+		return -1;
+	}
+
+	ret = bpf_prog_test_run(pfd, 1, data, sizeof(data),
+				NULL, NULL, &retval, NULL);
+	if (ret < 0 || retval != 42) {
+		/* run failed, or the program reported a miss */
+		ret = -1;
+	} else {
+		/* fetch what the program stored in slot 0 */
+		assert(!bpf_map_lookup_elem(mfd, &zero, value));
+		ret = 0;
+	}
+	close(pfd);
+	close(mfd);
+	return ret;
+}
+
+/* Check that every key of @map1 is also present in @map0 with the same
+ * first value slot. Returns 1 if so; otherwise prints the offending key and
+ * returns 0.
+ */
+static int map_subset(int map0, int map1)
+{
+	unsigned long long value0[nr_cpus], value1[nr_cpus];
+	unsigned long long cursor = 0;
+
+	for (;;) {
+		/* advance to the next key of @map1; stop when there is none */
+		if (bpf_map_get_next_key(map1, &cursor, &cursor))
+			break;
+
+		assert(!bpf_map_lookup_elem(map1, &cursor, value1));
+
+		if (bpf_map_lookup_elem(map0, &cursor, value0)) {
+			printf("key:%llu not found from map. %s(%d)\n",
+			       cursor, strerror(errno), errno);
+			return 0;
+		}
+
+		if (value0[0] == value1[0])
+			continue;
+
+		printf("key:%llu value0:%llu != value1:%llu\n",
+		       cursor, value0[0], value1[0]);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Return 1 when @lru_map and @expected hold the same keys with matching
+ * first value slots (each is a subset of the other), 0 otherwise.
+ */
+static int map_equal(int lru_map, int expected)
+{
+	if (!map_subset(lru_map, expected))
+		return 0;
+
+	return map_subset(expected, lru_map) ? 1 : 0;
+}
+
+/* Pin @pid to the first CPU at or after *@next_to_try that
+ * sched_setaffinity() accepts.
+ *
+ * On return *@next_to_try is one past the last CPU that was tried, so a
+ * later call continues with the following CPU. Returns 0 if a CPU was
+ * found, -1 if all CPUs below nr_cpus were rejected.
+ */
+static int sched_next_online(int pid, int *next_to_try)
+{
+	cpu_set_t mask;
+	int cpu = *next_to_try;
+	int rc = -1;
+
+	for ( ; cpu < nr_cpus; ) {
+		CPU_ZERO(&mask);
+		CPU_SET(cpu, &mask);
+		cpu++;
+		if (sched_setaffinity(pid, sizeof(mask), &mask) == 0) {
+			rc = 0;
+			break;
+		}
+	}
+
+	*next_to_try = cpu;
+	return rc;
+}
+
+/* Size of the LRU map is 2
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1
+ * Add Key=3
+ * => Key=2 will be removed by LRU
+ * Iterate map. Only found key=1 and key=3
+ */
+/* Basic LRU sanity test on a map with room for two keys (2 * nr_cpus entries
+ * when BPF_F_NO_COMMON_LRU is set): checks the update-flag semantics
+ * (BPF_NOEXIST / BPF_EXIST / invalid flags) and that, after key=1 has been
+ * looked up from a BPF program, inserting key=3 evicts key=2.
+ * The final LRU map content is compared against a plain hash map that holds
+ * the expected keys.
+ */
+static void test_lru_sanity0(int map_type, int map_flags)
+{
+	unsigned long long key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	int next_cpu = 0;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	/* pin the test to a single CPU */
+	assert(sched_next_online(0, &next_cpu) != -1);
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+	else
+		lru_map_fd = create_map(map_type, map_flags, 2);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* insert key=1 element */
+
+	key = 1;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* BPF_NOEXIST means: add new element if it doesn't exist */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+	       /* key=1 already exists */
+	       && errno == EEXIST);
+
+	/* an invalid flags value must be rejected */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, -1) == -1 &&
+	       errno == EINVAL);
+
+	/* insert key=2 element */
+
+	/* check that key=2 is not found */
+	key = 2;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* BPF_EXIST means: update existing element */
+	assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+	       /* key=2 is not there */
+	       errno == ENOENT);
+
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+	/* insert key=3 element */
+
+	/* check that key=3 is not found */
+	key = 3;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* check that key=1 can be found and mark the ref bit to
+	 * stop LRU from removing key=1
+	 */
+	key = 1;
+	assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+	assert(value[0] == 1234);
+
+	key = 3;
+	assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+	assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+				    BPF_NOEXIST));
+
+	/* key=2 has been removed from the LRU */
+	key = 2;
+	assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+	       errno == ENOENT);
+
+	/* only key=1 and key=3 are expected to remain */
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
+/* Size of the LRU map is 1.5*tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Lookup 1 to tgt_free/2
+ * Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
+ * => 1+tgt_free/2 to tgt_free will be removed by LRU
+ */
+/* Common-LRU eviction test on a map of 1.5 * @tgt_free entries.
+ *
+ * @tgt_free must be even. Keys 1..tgt_free are inserted, the first half is
+ * looked up from a BPF program, then tgt_free more keys are inserted. The
+ * LRU map must end up holding exactly the looked-up half plus the newly
+ * inserted keys, which is what the expected hash map is filled with.
+ * Returns immediately when BPF_F_NO_COMMON_LRU is set.
+ */
+static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
+{
+	unsigned long long key, end_key, value[nr_cpus];
+	int lru_map_fd, expected_map_fd;
+	unsigned int batch_size;
+	unsigned int map_size;
+	int next_cpu = 0;
+
+	if (map_flags & BPF_F_NO_COMMON_LRU)
+		/* This test is only applicable to common LRU list */
+		return;
+
+	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+	       map_flags);
+
+	assert(sched_next_online(0, &next_cpu) != -1);
+
+	/* tgt_free must be even so that it splits into two equal batches */
+	batch_size = tgt_free / 2;
+	assert(batch_size * 2 == tgt_free);
+
+	map_size = tgt_free + batch_size;
+	lru_map_fd = create_map(map_type, map_flags, map_size);
+	assert(lru_map_fd != -1);
+
+	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+	assert(expected_map_fd != -1);
+
+	value[0] = 1234;
+
+	/* Insert 1 to tgt_free (+tgt_free keys) */
+	end_key = 1 + tgt_free;
+	for (key = 1; key < end_key; key++)
+		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+					    BPF_NOEXIST));
+
+	/* Lookup 1 to tgt_free/2 */
+	end_key = 1 + batch_size;
+	for (key = 1; key < end_key; key++) {
+		assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+		/* looked-up keys are expected to survive the eviction */
+		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+					    BPF_NOEXIST));
+	}
+
+	/* Insert 1+tgt_free to 2*tgt_free
+	 * => 1+tgt_free/2 to tgt_free will be
+	 * removed by LRU
+	 */
+	key = 1 + tgt_free;
+	end_key = key + tgt_free;
+	for (; key < end_key; key++) {
+		assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+					    BPF_NOEXIST));
+		assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+					    BPF_NOEXIST));
+	}
+
+	assert(map_equal(lru_map_fd, expected_map_fd));
+
+	close(expected_map_fd);
+	close(lru_map_fd);
+
+	printf("Pass\n");
+}
+
+/* Size of the LRU map 1.5 * tgt_free
+ * Insert 1 to tgt_free (+tgt_free keys)
+ * Update 1 to tgt_free/2
+ * => The original 1 to tgt_free/2 will be removed due to
+ * the LRU shrink process
+ * Re-insert 1 to tgt_free/2 again and do a lookup immediately
+ * Insert 1+tgt_free to tgt_free*3/2
+ * Insert 1+tgt_free*3/2 to tgt_free*5/2
+ * => Key 1+tgt_free to tgt_free*3/2
+ * will be removed from LRU because it has never
+ * been lookup and ref bit is not set
+ */
+static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
+{
+ unsigned long long key, value[nr_cpus];
+ unsigned long long end_key;
+ int lru_map_fd, expected_map_fd;
+ unsigned int batch_size;
+ unsigned int map_size;
+ int next_cpu = 0;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ /* This test is only applicable to common LRU list */
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ batch_size = tgt_free / 2;
+ assert(batch_size * 2 == tgt_free);
+
+ map_size = tgt_free + batch_size;
+ lru_map_fd = create_map(map_type, map_flags, map_size);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* Insert 1 to tgt_free (+tgt_free keys) */
+ end_key = 1 + tgt_free;
+ for (key = 1; key < end_key; key++)
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* Any bpf_map_update_elem will require to acquire a new node
+ * from LRU first.
+ *
+ * The local list is running out of free nodes.
+ * It gets from the global LRU list which tries to
+ * shrink the inactive list to get tgt_free
+ * number of free nodes.
+ *
+ * Hence, the oldest key 1 to tgt_free/2
+ * are removed from the LRU list.
+ */
+ key = 1;
+ if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_delete_elem(lru_map_fd, &key));
+ } else {
+ assert(bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_EXIST));
+ }
+
+ /* Re-insert 1 to tgt_free/2 again and do a lookup
+ * immeidately.
+ */
+ end_key = 1 + batch_size;
+ value[0] = 4321;
+ for (key = 1; key < end_key; key++) {
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+ assert(value[0] == 4321);
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ value[0] = 1234;
+
+ /* Insert 1+tgt_free to tgt_free*3/2 */
+ end_key = 1 + tgt_free + batch_size;
+ for (key = 1 + tgt_free; key < end_key; key++)
+ /* These newly added but not referenced keys will be
+ * gone during the next LRU shrink.
+ */
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
+ end_key = key + tgt_free;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd));
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map is 2*tgt_free
+ * It is to test the active/inactive list rotation
+ * Insert 1 to 2*tgt_free (+2*tgt_free keys)
+ * Lookup key 1 to tgt_free*3/2
+ * Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
+ * => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
+ */
+static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
+{
+ unsigned long long key, end_key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+ unsigned int batch_size;
+ unsigned int map_size;
+ int next_cpu = 0;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ /* This test is only applicable to common LRU list */
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ batch_size = tgt_free / 2;
+ assert(batch_size * 2 == tgt_free); /* tgt_free must be even */
+
+ map_size = tgt_free * 2;
+ lru_map_fd = create_map(map_type, map_flags, map_size);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
+ end_key = 1 + (2 * tgt_free);
+ for (key = 1; key < end_key; key++)
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* Lookup key 1 to tgt_free*3/2
+ * Each key looked up is also recorded in expected_map_fd.
+ * NOTE(review): end_key is exclusive and has no "1 +" here, so
+ * the last key looked up is tgt_free*3/2 - 1 -- confirm intended.
+ */
+ end_key = tgt_free + batch_size;
+ for (key = 1; key < end_key; key++) {
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ /* Add 1+2*tgt_free to tgt_free*5/2
+ * (+tgt_free/2 keys)
+ */
+ key = 2 * tgt_free + 1;
+ end_key = key + batch_size;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd)); /* compare LRU map with expectation */
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Test deletion
+ * Size of the LRU map is 3*tgt_free (times nr_cpus with
+ * BPF_F_NO_COMMON_LRU)
+ * Insert 1 to 2*tgt_free
+ * Re-inserting key 1 with BPF_NOEXIST must fail
+ * Lookup 1 to tgt_free
+ * Delete 1+tgt_free to 2*tgt_free; deleting each key twice must fail
+ * the second time
+ * Insert 1+2*tgt_free to 4*tgt_free
+ * => expected map holds 1 to tgt_free and 1+2*tgt_free to 4*tgt_free
+ */
+static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
+{
+ int lru_map_fd, expected_map_fd;
+ unsigned long long key, value[nr_cpus];
+ unsigned long long end_key;
+ int next_cpu = 0;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags,
+ 3 * tgt_free * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, 3 * tgt_free);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0,
+ 3 * tgt_free);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= 2 * tgt_free; key++)
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ key = 1;
+ assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); /* key 1 exists: must fail */
+
+ for (key = 1; key <= tgt_free; key++) {
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ for (; key <= 2 * tgt_free; key++) {
+ assert(!bpf_map_delete_elem(lru_map_fd, &key));
+ assert(bpf_map_delete_elem(lru_map_fd, &key)); /* already deleted: must fail */
+ }
+
+ end_key = key + 2 * tgt_free;
+ for (; key < end_key; key++) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd)); /* compare LRU map with expectation */
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+static void do_test_lru_sanity5(unsigned long long last_key, int map_fd)
+{
+ unsigned long long key, value[nr_cpus];
+
+ /* One step of test_lru_sanity5, run in a forked child:
+ * last_key must be present, last_key + 1 is inserted and read
+ * back, and afterwards last_key must be gone.
+ *
+ * Ensure the last key inserted by previous CPU can be found
+ */
+ assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, last_key, value));
+ value[0] = 1234;
+
+ key = last_key + 1;
+ assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_lookup_elem_with_ref_bit(map_fd, key, value));
+
+ /* Cannot find the last key because it was removed by LRU */
+ assert(bpf_map_lookup_elem(map_fd, &last_key, value) == -1 &&
+ errno == ENOENT);
+}
+
+/* Test map with only one element
+ * Key 0 is inserted first. Then, for as long as sched_next_online()
+ * does not return -1, a child is forked that runs
+ * do_test_lru_sanity5() on the current key; the parent waits for it,
+ * requires exit status 0 and moves on to the next key.
+ * NOTE(review): sched_next_online() is defined elsewhere; it presumably
+ * moves the process to the next online CPU -- confirm.
+ */
+static void test_lru_sanity5(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int next_cpu = 0;
+ int map_fd;
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ return;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ map_fd = create_map(map_type, map_flags, 1);
+ assert(map_fd != -1);
+
+ value[0] = 1234;
+ key = 0;
+ assert(!bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST));
+
+ while (sched_next_online(0, &next_cpu) != -1) {
+ pid_t pid;
+
+ pid = fork();
+ if (pid == 0) {
+ do_test_lru_sanity5(key, map_fd); /* child: a failed assert gives a non-zero status */
+ exit(0);
+ } else if (pid == -1) {
+ printf("couldn't spawn process to test key:%llu\n",
+ key);
+ exit(1);
+ } else {
+ int status;
+
+ assert(waitpid(pid, &status, 0) == pid);
+ assert(status == 0);
+ key++; /* the child inserted key + 1; test that one next */
+ }
+ }
+
+ close(map_fd);
+ /* At least one key should be tested */
+ assert(key > 0);
+
+ printf("Pass\n");
+}
+
+/* Test list rotation for BPF_F_NO_COMMON_LRU map
+ * Size of the LRU map is 2*tgt_free*nr_cpus, the expected map 2*tgt_free
+ * Insert 1 to tgt_free
+ * Insert 1+tgt_free to 2*tgt_free, looking up 1 to tgt_free before
+ * every single insert so that their ref bit stays set
+ * Insert 1+2*tgt_free to 3*tgt_free
+ * => expected map holds 1 to tgt_free and 1+2*tgt_free to 3*tgt_free
+ */
+static void test_lru_sanity6(int map_type, int map_flags, int tgt_free)
+{
+ int lru_map_fd, expected_map_fd;
+ unsigned long long key, value[nr_cpus];
+ unsigned int map_size = tgt_free * 2;
+ int next_cpu = 0;
+
+ if (!(map_flags & BPF_F_NO_COMMON_LRU))
+ return; /* only meaningful with BPF_F_NO_COMMON_LRU */
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
+ assert(expected_map_fd != -1);
+
+ lru_map_fd = create_map(map_type, map_flags, map_size * nr_cpus);
+ assert(lru_map_fd != -1);
+
+ value[0] = 1234;
+
+ for (key = 1; key <= tgt_free; key++) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ for (; key <= tgt_free * 2; key++) {
+ unsigned long long stable_key;
+
+ /* Make ref bit sticky for key: [1, tgt_free] */
+ for (stable_key = 1; stable_key <= tgt_free; stable_key++) {
+ /* Mark the ref bit */
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd,
+ stable_key, value));
+ }
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST)); /* not recorded in expected_map_fd */
+ }
+
+ for (; key <= tgt_free * 3; key++) {
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value,
+ BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+ }
+
+ assert(map_equal(lru_map_fd, expected_map_fd)); /* compare LRU map with expectation */
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map is 2
+ * (2 * nr_cpus when BPF_F_NO_COMMON_LRU is set)
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (datapath)
+ * Lookup Key=2 (syscall)
+ * Add Key=3
+ * => Key=2 will be removed by LRU
+ * Iterate map. Only found key=1 and key=3
+ *
+ * Along the way the BPF_NOEXIST/BPF_EXIST error paths are checked
+ * (EEXIST for an existing key, ENOENT for a missing one).
+ */
+static void test_lru_sanity7(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+ int next_cpu = 0;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, 2);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* insert key=1 element (expected to survive, so also recorded) */
+
+ key = 1;
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* BPF_NOEXIST means: add new element if it doesn't exist */
+ assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+ /* key=1 already exists */
+ && errno == EEXIST);
+
+ /* insert key=2 element */
+
+ /* check that key=2 is not found */
+ key = 2;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* BPF_EXIST means: update existing element */
+ assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+ /* key=2 is not there */
+ errno == ENOENT);
+
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST)); /* key=2: LRU map only */
+
+ /* insert key=3 element */
+
+ /* check that key=3 is not found */
+ key = 3;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* check that key=1 can be found and mark the ref bit to
+ * stop LRU from removing key=1
+ */
+ key = 1;
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+ assert(value[0] == 1234);
+
+ /* check that key=2 can be found and do _not_ mark ref bit.
+ * this will be evicted on next update.
+ */
+ key = 2;
+ assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+ assert(value[0] == 1234);
+
+ key = 3;
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* key=2 has been removed from the LRU */
+ key = 2;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ assert(map_equal(lru_map_fd, expected_map_fd)); /* expected map holds key=1 and key=3 */
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+/* Size of the LRU map is 2
+ * (2 * nr_cpus when BPF_F_NO_COMMON_LRU is set)
+ * Add key=1 (+1 key)
+ * Add key=2 (+1 key)
+ * Lookup Key=1 (syscall)
+ * Lookup Key=2 (datapath)
+ * Add Key=3
+ * => Key=1 will be removed by LRU
+ * Iterate map. Only found key=2 and key=3
+ *
+ * Along the way the BPF_NOEXIST/BPF_EXIST error paths are checked
+ * (EEXIST for an existing key, ENOENT for a missing one).
+ */
+static void test_lru_sanity8(int map_type, int map_flags)
+{
+ unsigned long long key, value[nr_cpus];
+ int lru_map_fd, expected_map_fd;
+ int next_cpu = 0;
+
+ printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
+ map_flags);
+
+ assert(sched_next_online(0, &next_cpu) != -1);
+
+ if (map_flags & BPF_F_NO_COMMON_LRU)
+ lru_map_fd = create_map(map_type, map_flags, 2 * nr_cpus);
+ else
+ lru_map_fd = create_map(map_type, map_flags, 2);
+ assert(lru_map_fd != -1);
+
+ expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
+ assert(expected_map_fd != -1);
+
+ value[0] = 1234;
+
+ /* insert key=1 element (LRU map only: it is expected to be evicted) */
+
+ key = 1;
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+
+ /* BPF_NOEXIST means: add new element if it doesn't exist */
+ assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST) == -1
+ /* key=1 already exists */
+ && errno == EEXIST);
+
+ /* insert key=2 element */
+
+ /* check that key=2 is not found */
+ key = 2;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* BPF_EXIST means: update existing element */
+ assert(bpf_map_update_elem(lru_map_fd, &key, value, BPF_EXIST) == -1 &&
+ /* key=2 is not there */
+ errno == ENOENT);
+
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* insert key=3 element */
+
+ /* check that key=3 is not found */
+ key = 3;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ /* check that key=1 can be found and do _not_ mark ref bit.
+ * this will be evicted on next update.
+ */
+ key = 1;
+ assert(!bpf_map_lookup_elem(lru_map_fd, &key, value));
+ assert(value[0] == 1234);
+
+ /* check that key=2 can be found and mark the ref bit to
+ * stop LRU from removing key=2
+ */
+ key = 2;
+ assert(!bpf_map_lookup_elem_with_ref_bit(lru_map_fd, key, value));
+ assert(value[0] == 1234);
+
+ key = 3;
+ assert(!bpf_map_update_elem(lru_map_fd, &key, value, BPF_NOEXIST));
+ assert(!bpf_map_update_elem(expected_map_fd, &key, value,
+ BPF_NOEXIST));
+
+ /* key=1 has been removed from the LRU */
+ key = 1;
+ assert(bpf_map_lookup_elem(lru_map_fd, &key, value) == -1 &&
+ errno == ENOENT);
+
+ assert(map_equal(lru_map_fd, expected_map_fd)); /* expected map holds key=2 and key=3 */
+
+ close(expected_map_fd);
+ close(lru_map_fd);
+
+ printf("Pass\n");
+}
+
+int main(int argc, char **argv)
+{
+ int map_types[] = {BPF_MAP_TYPE_LRU_HASH,
+ BPF_MAP_TYPE_LRU_PERCPU_HASH};
+ int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
+ int t, f;
+
+ setbuf(stdout, NULL); /* make stdout unbuffered */
+
+ nr_cpus = bpf_num_possible_cpus();
+ assert(nr_cpus != -1);
+ printf("nr_cpus:%d\n\n", nr_cpus);
+
+ for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) { /* every flag variant ... */
+ unsigned int tgt_free = (map_flags[f] & BPF_F_NO_COMMON_LRU) ?
+ PERCPU_FREE_TARGET : LOCAL_FREE_TARGET;
+
+ for (t = 0; t < sizeof(map_types) / sizeof(*map_types); t++) { /* ... times every map type */
+ test_lru_sanity0(map_types[t], map_flags[f]);
+ test_lru_sanity1(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity2(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity3(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity5(map_types[t], map_flags[f]);
+ test_lru_sanity6(map_types[t], map_flags[f], tgt_free);
+ test_lru_sanity7(map_types[t], map_flags[f]);
+ test_lru_sanity8(map_types[t], map_flags[f]);
+
+ printf("\n");
+ }
+ }
+
+ return 0; /* reached only if no assert fired */
+}
diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
new file mode 100755
index 000000000..6c69c42b1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh
@@ -0,0 +1,475 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Setup/topology:
+#
+# NS1 NS2 NS3
+# veth1 <---> veth2 veth3 <---> veth4 (the top route)
+# veth5 <---> veth6 veth7 <---> veth8 (the bottom route)
+#
+# each vethN gets IPv[4|6]_N address
+#
+# IPv*_SRC = IPv*_1
+# IPv*_DST = IPv*_4
+#
+# all tests test pings from IPv*_SRC to IPv*_DST
+#
+# by default, routes are configured to allow packets to go
+# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route)
+#
+# a GRE device is installed in NS3 with IPv*_GRE, and
+# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8
+# (the bottom route)
+#
+# Tests:
+#
+# 1. routes NS2->IPv*_DST are brought down, so the only way a ping
+# from IP*_SRC to IP*_DST can work is via IPv*_GRE
+#
+# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1
+# that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+# ping: SRC->[encap at veth1:egress]->GRE:decap->DST
+# ping replies go DST->SRC directly
+#
+# 2b. in an ingress test, a bpf LWT_IN program is installed on veth2
+# that encaps the packets with an IP/GRE header to route to IPv*_GRE
+#
+# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST
+# ping replies go DST->SRC directly
+
+if [[ $EUID -ne 0 ]]; then # everything below uses ip netns / sysctl
+ echo "This script must be run as root"
+ echo "FAIL"
+ exit 1
+fi
+
+readonly NS1="ns1-$(mktemp -u XXXXXX)" # mktemp -u only generates a random suffix
+readonly NS2="ns2-$(mktemp -u XXXXXX)"
+readonly NS3="ns3-$(mktemp -u XXXXXX)"
+
+readonly IPv4_1="172.16.1.100" # IPv4_N is the address assigned to vethN
+readonly IPv4_2="172.16.2.100"
+readonly IPv4_3="172.16.3.100"
+readonly IPv4_4="172.16.4.100"
+readonly IPv4_5="172.16.5.100"
+readonly IPv4_6="172.16.6.100"
+readonly IPv4_7="172.16.7.100"
+readonly IPv4_8="172.16.8.100"
+readonly IPv4_GRE="172.16.16.100" # address of gre_dev in NS3
+
+readonly IPv4_SRC=$IPv4_1
+readonly IPv4_DST=$IPv4_4
+
+readonly IPv6_1="fb01::1" # IPv6_N is the address assigned to vethN
+readonly IPv6_2="fb02::1"
+readonly IPv6_3="fb03::1"
+readonly IPv6_4="fb04::1"
+readonly IPv6_5="fb05::1"
+readonly IPv6_6="fb06::1"
+readonly IPv6_7="fb07::1"
+readonly IPv6_8="fb08::1"
+readonly IPv6_GRE="fb10::1" # address of gre6_dev in NS3
+
+readonly IPv6_SRC=$IPv6_1
+readonly IPv6_DST=$IPv6_4
+
+TEST_STATUS=0 # reset by setup(); set to 1 by any failing check
+TESTS_SUCCEEDED=0 # counters updated by process_test_results()
+TESTS_FAILED=0
+
+TMPFILE="" # capture file for test_gso; created by setup()
+
+process_test_results()
+{
+ if [[ "${TEST_STATUS}" -eq 0 ]] ; then
+ echo "PASS"
+ TESTS_SUCCEEDED=$((TESTS_SUCCEEDED+1))
+ else
+ echo "FAIL"
+ TESTS_FAILED=$((TESTS_FAILED+1))
+ fi
+}
+
+# Print both counters and terminate the script: exit status 0 when no test
+# failed, 1 otherwise.
+print_test_summary_and_exit()
+{
+	echo "passed tests: ${TESTS_SUCCEEDED}"
+	echo "failed tests: ${TESTS_FAILED}"
+
+	[ "${TESTS_FAILED}" -eq "0" ] && exit 0
+	exit 1
+}
+
+setup()
+{
+ set -e # exit on error; the EXIT trap then runs cleanup
+ TEST_STATUS=0
+
+ # create devices and namespaces
+ ip netns add "${NS1}"
+ ip netns add "${NS2}"
+ ip netns add "${NS3}"
+
+ # rp_filter gets confused by what these tests are doing, so disable it
+ ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0
+ ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0
+ ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0
+ ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0
+ ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0
+ ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0
+
+ # disable IPv6 DAD because it sometimes takes too long and fails tests
+ ip netns exec ${NS1} sysctl -wq net.ipv6.conf.all.accept_dad=0
+ ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.accept_dad=0
+ ip netns exec ${NS3} sysctl -wq net.ipv6.conf.all.accept_dad=0
+ ip netns exec ${NS1} sysctl -wq net.ipv6.conf.default.accept_dad=0
+ ip netns exec ${NS2} sysctl -wq net.ipv6.conf.default.accept_dad=0
+ ip netns exec ${NS3} sysctl -wq net.ipv6.conf.default.accept_dad=0
+
+ ip link add veth1 type veth peer name veth2
+ ip link add veth3 type veth peer name veth4
+ ip link add veth5 type veth peer name veth6
+ ip link add veth7 type veth peer name veth8
+
+ ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1
+ ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1
+
+ ip link set veth1 netns ${NS1}
+ ip link set veth2 netns ${NS2}
+ ip link set veth3 netns ${NS2}
+ ip link set veth4 netns ${NS3}
+ ip link set veth5 netns ${NS1}
+ ip link set veth6 netns ${NS2}
+ ip link set veth7 netns ${NS2}
+ ip link set veth8 netns ${NS3}
+
+ if [ ! -z "${VRF}" ] ; then # VRF mode: put the NS1/NS2 veths into VRF "red" (table 1001)
+ ip -netns ${NS1} link add red type vrf table 1001
+ ip -netns ${NS1} link set red up
+ ip -netns ${NS1} route add table 1001 unreachable default metric 8192
+ ip -netns ${NS1} -6 route add table 1001 unreachable default metric 8192
+ ip -netns ${NS1} link set veth1 vrf red
+ ip -netns ${NS1} link set veth5 vrf red
+
+ ip -netns ${NS2} link add red type vrf table 1001
+ ip -netns ${NS2} link set red up
+ ip -netns ${NS2} route add table 1001 unreachable default metric 8192
+ ip -netns ${NS2} -6 route add table 1001 unreachable default metric 8192
+ ip -netns ${NS2} link set veth2 vrf red
+ ip -netns ${NS2} link set veth3 vrf red
+ ip -netns ${NS2} link set veth6 vrf red
+ ip -netns ${NS2} link set veth7 vrf red
+ fi
+
+ # configure addresses: the top route (1-2-3-4)
+ ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1
+ ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2
+ ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3
+ ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4
+ ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1
+ ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2
+ ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3
+ ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4
+
+ # configure addresses: the bottom route (5-6-7-8)
+ ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5
+ ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6
+ ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7
+ ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8
+ ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5
+ ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6
+ ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7
+ ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8
+
+ ip -netns ${NS1} link set dev veth1 up
+ ip -netns ${NS2} link set dev veth2 up
+ ip -netns ${NS2} link set dev veth3 up
+ ip -netns ${NS3} link set dev veth4 up
+ ip -netns ${NS1} link set dev veth5 up
+ ip -netns ${NS2} link set dev veth6 up
+ ip -netns ${NS2} link set dev veth7 up
+ ip -netns ${NS3} link set dev veth8 up
+
+ # configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default;
+ # the bottom route to specific bottom addresses
+ # (${VRF} is left unquoted on purpose: "vrf red" must expand to two
+ # words and an empty value to none)
+
+ # NS1
+ # top route
+ ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1 ${VRF}
+ ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} ${VRF} # go top by default
+ ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1 ${VRF}
+ ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2} ${VRF} # go top by default
+ # bottom route
+ ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5 ${VRF}
+ ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6} ${VRF}
+ ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6} ${VRF}
+ ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5 ${VRF}
+ ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6} ${VRF}
+ ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6} ${VRF}
+
+ # NS2
+ # top route
+ ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2 ${VRF}
+ ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3 ${VRF}
+ # bottom route
+ ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6 ${VRF}
+ ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7 ${VRF}
+
+ # NS3
+ # top route
+ ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4
+ ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3}
+ ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3}
+ ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4
+ ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3}
+ ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3}
+ # bottom route
+ ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8
+ ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7}
+ ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7}
+ ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8
+ ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7}
+ ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7}
+
+ # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route
+ ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255
+ ip -netns ${NS3} link set gre_dev up
+ ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev
+ ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6} ${VRF}
+ ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8} ${VRF}
+
+
+ # configure IPv6 GRE device in NS3, and a route to it via the "bottom" route
+ ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255
+ ip -netns ${NS3} link set gre6_dev up
+ ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev
+ ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF}
+
+ TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) # capture file used by test_gso
+
+ sleep 1 # reduce flakiness
+ set +e # from here on failing commands are reported by the tests, not fatal
+}
+
+cleanup()
+{
+ if [ -f ${TMPFILE} ] ; then
+ rm ${TMPFILE}
+ fi
+
+ ip netns del ${NS1} 2> /dev/null
+ ip netns del ${NS2} 2> /dev/null
+ ip netns del ${NS3} 2> /dev/null
+}
+
+trap cleanup EXIT
+
+# Delete the NS1 and NS2 routes that lead to the GRE addresses.
+# ${VRF} is deliberately unquoted: "vrf red" has to split into two words
+# and an empty value has to disappear.
+remove_routes_to_gredev()
+{
+	# IPv4
+	ip -netns "${NS1}" route del "${IPv4_GRE}" dev veth5 ${VRF}
+	ip -netns "${NS2}" route del "${IPv4_GRE}" dev veth7 ${VRF}
+
+	# IPv6
+	ip -netns "${NS1}" -6 route del "${IPv6_GRE}/128" dev veth5 ${VRF}
+	ip -netns "${NS2}" -6 route del "${IPv6_GRE}/128" dev veth7 ${VRF}
+}
+
+# Install "unreachable" routes for the GRE addresses in NS1 and NS2.
+# ${VRF} is deliberately unquoted: "vrf red" has to split into two words
+# and an empty value has to disappear.
+add_unreachable_routes_to_gredev()
+{
+	# IPv4
+	ip -netns "${NS1}" route add unreachable "${IPv4_GRE}/32" ${VRF}
+	ip -netns "${NS2}" route add unreachable "${IPv4_GRE}/32" ${VRF}
+
+	# IPv6
+	ip -netns "${NS1}" -6 route add unreachable "${IPv6_GRE}/128" ${VRF}
+	ip -netns "${NS2}" -6 route add unreachable "${IPv6_GRE}/128" ${VRF}
+}
+
+test_ping()
+{
+ local readonly PROTO=$1
+ local readonly EXPECTED=$2
+ local RET=0
+
+ if [ "${PROTO}" == "IPv4" ] ; then
+ ip netns exec ${NS1} ping -c 1 -W 1 -I veth1 ${IPv4_DST} 2>&1 > /dev/null
+ RET=$?
+ elif [ "${PROTO}" == "IPv6" ] ; then
+ ip netns exec ${NS1} ping6 -c 1 -W 1 -I veth1 ${IPv6_DST} 2>&1 > /dev/null
+ RET=$?
+ else
+ echo " test_ping: unknown PROTO: ${PROTO}"
+ TEST_STATUS=1
+ fi
+
+ if [ "0" != "${RET}" ]; then
+ RET=1
+ fi
+
+ if [ "${EXPECTED}" != "${RET}" ] ; then
+ echo " test_ping failed: expected: ${EXPECTED}; got ${RET}"
+ TEST_STATUS=1
+ fi
+}
+
+test_gso()
+{
+ local readonly PROTO=$1
+ local readonly PKT_SZ=5000
+ local IP_DST=""
+ : > ${TMPFILE} # trim the capture file
+
+ # check that nc is present
+ command -v nc >/dev/null 2>&1 || \
+ { echo >&2 "nc is not available: skipping TSO tests"; return; }
+
+ # listen on port 9000, capture TCP into $TMPFILE
+ if [ "${PROTO}" == "IPv4" ] ; then
+ IP_DST=${IPv4_DST}
+ ip netns exec ${NS3} bash -c \
+ "nc -4 -l -p 9000 > ${TMPFILE} &"
+ elif [ "${PROTO}" == "IPv6" ] ; then
+ IP_DST=${IPv6_DST}
+ ip netns exec ${NS3} bash -c \
+ "nc -6 -l -p 9000 > ${TMPFILE} &"
+ RET=$?
+ else
+ echo " test_gso: unknown PROTO: ${PROTO}"
+ TEST_STATUS=1
+ fi
+ sleep 1 # let nc start listening
+
+ # send a packet larger than MTU
+ ip netns exec ${NS1} bash -c \
+ "dd if=/dev/zero bs=$PKT_SZ count=1 > /dev/tcp/${IP_DST}/9000 2>/dev/null"
+ sleep 2 # let the packet get delivered
+
+ # verify we received all expected bytes
+ SZ=$(stat -c %s ${TMPFILE})
+ if [ "$SZ" != "$PKT_SZ" ] ; then
+ echo " test_gso failed: ${PROTO}"
+ TEST_STATUS=1
+ fi
+}
+
+test_egress()
+{
+ local readonly ENCAP=$1
+ echo "starting egress ${ENCAP} encap test ${VRF}"
+ setup
+
+ # by default, pings work
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # remove NS2->DST routes, ping fails
+ ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF}
+ ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF}
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # install replacement routes (LWT/eBPF), pings succeed
+ if [ "${ENCAP}" == "IPv4" ] ; then
+ ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \
+ test_lwt_ip_encap.o sec encap_gre dev veth1 ${VRF}
+ ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \
+ test_lwt_ip_encap.o sec encap_gre dev veth1 ${VRF}
+ elif [ "${ENCAP}" == "IPv6" ] ; then
+ ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \
+ test_lwt_ip_encap.o sec encap_gre6 dev veth1 ${VRF}
+ ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \
+ test_lwt_ip_encap.o sec encap_gre6 dev veth1 ${VRF}
+ else
+ echo " unknown encap ${ENCAP}"
+ TEST_STATUS=1
+ fi
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # skip GSO tests with VRF: VRF routing needs properly assigned
+ # source IP/device, which is easy to do with ping and hard with dd/nc.
+ if [ -z "${VRF}" ] ; then
+ test_gso IPv4
+ test_gso IPv6
+ fi
+
+ # a negative test: remove routes to GRE devices: ping fails
+ remove_routes_to_gredev
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # another negative test
+ add_unreachable_routes_to_gredev
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ cleanup
+ process_test_results
+}
+
+test_ingress()
+{
+ local readonly ENCAP=$1
+ echo "starting ingress ${ENCAP} encap test ${VRF}"
+ setup
+
+ # need to wait a bit for IPv6 to autoconf, otherwise
+ # ping6 sometimes fails with "unable to bind to address"
+
+ # by default, pings work
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # remove NS2->DST routes, pings fail
+ ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF}
+ ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF}
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # install replacement routes (LWT/eBPF), pings succeed
+ if [ "${ENCAP}" == "IPv4" ] ; then
+ ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \
+ test_lwt_ip_encap.o sec encap_gre dev veth2 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \
+ test_lwt_ip_encap.o sec encap_gre dev veth2 ${VRF}
+ elif [ "${ENCAP}" == "IPv6" ] ; then
+ ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \
+ test_lwt_ip_encap.o sec encap_gre6 dev veth2 ${VRF}
+ ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \
+ test_lwt_ip_encap.o sec encap_gre6 dev veth2 ${VRF}
+ else
+ echo "FAIL: unknown encap ${ENCAP}"
+ TEST_STATUS=1
+ fi
+ test_ping IPv4 0
+ test_ping IPv6 0
+
+ # a negative test: remove routes to GRE devices: ping fails
+ remove_routes_to_gredev
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ # another negative test
+ add_unreachable_routes_to_gredev
+ test_ping IPv4 1
+ test_ping IPv6 1
+
+ cleanup
+ process_test_results
+}
+
+VRF=""
+test_egress IPv4
+test_egress IPv6
+test_ingress IPv4
+test_ingress IPv6
+
+VRF="vrf red"
+test_egress IPv4
+test_egress IPv6
+test_ingress IPv4
+test_ingress IPv6
+
+print_test_summary_and_exit
diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
new file mode 100755
index 000000000..5620919fd
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh
@@ -0,0 +1,149 @@
+#!/bin/bash
+# Connects 6 network namespaces through veths.
+# Each NS may have different IPv6 global scope addresses :
+# NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6
+# fb00::1 fd00::1 fd00::2 fd00::3 fb00::6
+# fc42::1 fd00::4
+#
+# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in an
+# IPv6 header with a Segment Routing Header, with segments :
+# fd00::1 -> fd00::2 -> fd00::3 -> fd00::4
+#
+# 3 fd00::/16 IPv6 addresses are bound to seg6local End.BPF actions :
+# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1
+# - fd00::2 : remove the TLV, change the flags, add a tag
+# - fd00::3 : apply an End.T action to fd00::4, through routing table 117
+#
+# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet.
+# Each End.BPF action will validate the operations applied on the SRH by the
+# previous BPF program in the chain, otherwise the packet is dropped.
+#
+# A UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this
+# datagram can be read on NS6 when binding to fb00::6.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# The test has to run as root; otherwise report a skip.
+msg="skip all tests:"
+if [ "$UID" != 0 ]; then
+	echo "$msg please run this as root" >&2
+	exit "$ksft_skip"
+fi
+
+# Scratch file that receives the output of the listener started in ns6.
+# It is created with mktemp because the script runs as root and a fixed,
+# predictable name under /tmp could be pre-created (for example as a
+# symlink) by another user.
+TMP_FILE=$(mktemp /tmp/selftest_lwt_seg6local.XXXXXX) || exit 1
+
+# Trap handler: print the overall verdict based on the exit status that is
+# current when the handler starts, then remove the six namespaces and the
+# scratch file.
+cleanup()
+{
+	# "$?" has to be consumed before any other command overwrites it.
+	case "$?" in
+	0)
+		echo "selftests: test_lwt_seg6local [PASS]"
+		;;
+	*)
+		echo "selftests: test_lwt_seg6local [FAILED]"
+		;;
+	esac
+
+	# Teardown is best effort: stop aborting on errors and hide the
+	# diagnostics of deletions that fail.
+	set +e
+	local ns
+	for ns in ns1 ns2 ns3 ns4 ns5 ns6; do
+		ip netns del "$ns" 2> /dev/null
+	done
+	rm -f $TMP_FILE
+}
+
+# From here on, any command that fails aborts the script.
+set -e
+
+ip netns add ns1
+ip netns add ns2
+ip netns add ns3
+ip netns add ns4
+ip netns add ns5
+ip netns add ns6
+
+# Run cleanup on exit (0) and on signals 2, 3, 6 and 9.
+# NOTE(review): signal 9 (SIGKILL) cannot be caught, so listing it has no
+# effect; the trap is also only installed after the namespaces above exist.
+trap cleanup 0 2 3 6 9
+
+# One veth pair per link of the ns1 - ns2 - ... - ns6 chain.
+ip link add veth1 type veth peer name veth2
+ip link add veth3 type veth peer name veth4
+ip link add veth5 type veth peer name veth6
+ip link add veth7 type veth peer name veth8
+ip link add veth9 type veth peer name veth10
+
+# Move each end of every pair into the namespace on its side of the link.
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+ip link set veth3 netns ns2
+ip link set veth4 netns ns3
+ip link set veth5 netns ns3
+ip link set veth6 netns ns4
+ip link set veth7 netns ns4
+ip link set veth8 netns ns5
+ip link set veth9 netns ns5
+ip link set veth10 netns ns6
+
+# Bring all veth interfaces up, plus the loopback of ns6.
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+ip netns exec ns2 ip link set dev veth3 up
+ip netns exec ns3 ip link set dev veth4 up
+ip netns exec ns3 ip link set dev veth5 up
+ip netns exec ns4 ip link set dev veth6 up
+ip netns exec ns4 ip link set dev veth7 up
+ip netns exec ns5 ip link set dev veth8 up
+ip netns exec ns5 ip link set dev veth9 up
+ip netns exec ns6 ip link set dev veth10 up
+ip netns exec ns6 ip link set dev lo up
+
+# All link scope addresses and routes required between veths
+ip netns exec ns1 ip -6 addr add fb00::12/16 dev veth1 scope link
+ip netns exec ns1 ip -6 route add fb00::21 dev veth1 scope link
+ip netns exec ns2 ip -6 addr add fb00::21/16 dev veth2 scope link
+ip netns exec ns2 ip -6 addr add fb00::34/16 dev veth3 scope link
+ip netns exec ns2 ip -6 route add fb00::43 dev veth3 scope link
+ip netns exec ns3 ip -6 route add fb00::65 dev veth5 scope link
+ip netns exec ns3 ip -6 addr add fb00::43/16 dev veth4 scope link
+ip netns exec ns3 ip -6 addr add fb00::56/16 dev veth5 scope link
+ip netns exec ns4 ip -6 addr add fb00::65/16 dev veth6 scope link
+ip netns exec ns4 ip -6 addr add fb00::78/16 dev veth7 scope link
+ip netns exec ns4 ip -6 route add fb00::87 dev veth7 scope link
+ip netns exec ns5 ip -6 addr add fb00::87/16 dev veth8 scope link
+ip netns exec ns5 ip -6 addr add fb00::910/16 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 dev veth9 scope link
+ip netns exec ns5 ip -6 route add fb00::109 table 117 dev veth9 scope link
+ip netns exec ns6 ip -6 addr add fb00::109/16 dev veth10 scope link
+
+# ns1: owns the source address fb00::1 and reaches fb00::6 through ns2.
+ip netns exec ns1 ip -6 addr add fb00::1/16 dev lo
+ip netns exec ns1 ip -6 route add fb00::6 dev veth1 via fb00::21
+
+# ns2: traffic to fb00::6 is handed to the encap_srh BPF section; fd00::1
+# is reached through ns3.
+ip netns exec ns2 ip -6 route add fb00::6 encap bpf in obj test_lwt_seg6local.o sec encap_srh dev veth2
+ip netns exec ns2 ip -6 route add fd00::1 dev veth3 via fb00::43 scope link
+
+# ns3: fd00::1 is bound to the End.BPF program in section add_egr_x;
+# fc42::1 is reached through ns4.
+ip netns exec ns3 ip -6 route add fc42::1 dev veth5 via fb00::65
+ip netns exec ns3 ip -6 route add fd00::1 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec add_egr_x dev veth4
+
+# ns4: fd00::2 is bound to the End.BPF program in section pop_egr; ns4 owns
+# fc42::1 and forwards fd00::3 to ns5.
+ip netns exec ns4 ip -6 route add fd00::2 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec pop_egr dev veth6
+ip netns exec ns4 ip -6 addr add fc42::1 dev lo
+ip netns exec ns4 ip -6 route add fd00::3 dev veth7 via fb00::87
+
+# ns5: fd00::3 is bound to the End.BPF program in section inspect_t; the
+# route to fd00::4 lives in routing table 117.
+ip netns exec ns5 ip -6 route add fd00::4 table 117 dev veth9 via fb00::109
+ip netns exec ns5 ip -6 route add fd00::3 encap seg6local action End.BPF endpoint obj test_lwt_seg6local.o sec inspect_t dev veth8
+
+# ns6: owns the final destination fb00::6 and the last segment fd00::4.
+ip netns exec ns6 ip -6 addr add fb00::6/16 dev lo
+ip netns exec ns6 ip -6 addr add fd00::4/16 dev lo
+
+# Enable IPv6 forwarding in ns1..ns5.
+ip netns exec ns1 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns2 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns3 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns4 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ns5 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+# Enable seg6 processing in ns6 (globally, on lo and on veth10).
+ip netns exec ns6 sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null
+ip netns exec ns6 sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null
+
+# Start a background UDP listener on port 7330 in ns6 that writes what it
+# receives to $TMP_FILE, then send "foobar" from fb00::1 in ns1 to fb00::6.
+ip netns exec ns6 nc -l -6 -u -d 7330 > $TMP_FILE &
+ip netns exec ns1 bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330"
+sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment
+# $! is the background listener started above.
+kill -TERM $!
+
+# The test passes only if the listener captured exactly the payload sent.
+if [[ $(< $TMP_FILE) != "foobar" ]]; then
+ exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c
new file mode 100644
index 000000000..179e680e8
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_maps.c
@@ -0,0 +1,1776 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Testsuite for eBPF maps
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <linux/bpf.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+#include "bpf_rlimit.h"
+#include "test_maps.h"
+
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
+static int skips;
+
+static int map_flags;
+
+/* Exercise the basic operations of a BPF_MAP_TYPE_HASH map limited to two
+ * entries: update with every flag, lookup, delete and key iteration, checking
+ * the errno reported for each expected failure.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_hashmap(unsigned int task, void *data)
+{
+ long long key, next_key, first_key, value;
+ int fd;
+
+ fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ 2, map_flags);
+ if (fd < 0) {
+ printf("Failed to create hashmap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ key = 1;
+ value = 1234;
+ /* Insert key=1 element. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+
+ value = 0;
+ /* BPF_NOEXIST means add new element if it doesn't exist. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ /* key=1 already exists. */
+ errno == EEXIST);
+
+ /* -1 is an invalid flag. */
+ assert(bpf_map_update_elem(fd, &key, &value, -1) == -1 &&
+ errno == EINVAL);
+
+ /* Check that key=1 can be found. */
+ assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 1234);
+
+ key = 2;
+ /* Check that key=2 is not found. */
+ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+
+ /* BPF_EXIST means update existing element. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 &&
+ /* key=2 is not there. */
+ errno == ENOENT);
+
+ /* Insert key=2 element. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == 0);
+
+ /* key=1 and key=2 were inserted, check that key=0 cannot be
+ * inserted due to max_entries limit.
+ */
+ key = 0;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Update existing element, though the map is full. */
+ key = 1;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
+ key = 2;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+ key = 3;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Check that key = 0 doesn't exist. */
+ key = 0;
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+
+ /* Iterate over two elements. */
+ assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 &&
+ (first_key == 1 || first_key == 2));
+ /* key is 0 here, which is not in the map; the call is expected to
+ * hand back the first key again.
+ */
+ assert(bpf_map_get_next_key(fd, &key, &next_key) == 0 &&
+ (next_key == first_key));
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
+ (next_key == 1 || next_key == 2) &&
+ (next_key != first_key));
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ /* Delete both elements. */
+ key = 1;
+ assert(bpf_map_delete_elem(fd, &key) == 0);
+ key = 2;
+ assert(bpf_map_delete_elem(fd, &key) == 0);
+ /* A second delete of key=2 must report that it is gone. */
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+
+ key = 0;
+ /* Check that map is empty. */
+ assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 &&
+ errno == ENOENT);
+ assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ close(fd);
+}
+
+/* Create, and immediately close, a two-entry hash map for every combination
+ * of power-of-two key sizes (1 .. 512 bytes) and power-of-two value sizes
+ * (1 .. 1 << 18 bytes). Running out of memory ends the test quietly; any
+ * other creation failure is fatal.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_hashmap_sizes(unsigned int task, void *data)
+{
+	int key_size, value_size, map_fd;
+
+	for (key_size = 1; key_size <= 512; key_size <<= 1) {
+		for (value_size = 1; value_size <= 1 << 18; value_size <<= 1) {
+			map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, key_size,
+						value_size, 2, map_flags);
+			if (map_fd >= 0) {
+				close(map_fd);
+				usleep(10); /* give kernel time to destroy */
+				continue;
+			}
+
+			/* ENOMEM stops the test without failing it. */
+			if (errno == ENOMEM)
+				return;
+
+			printf("Failed to create hashmap key=%d value=%d '%s'\n",
+			       key_size, value_size, strerror(errno));
+			exit(1);
+		}
+	}
+}
+
+/* Per-CPU variant of the basic hash map test: a BPF_MAP_TYPE_PERCPU_HASH map
+ * limited to two entries is put through update, lookup, delete and key
+ * iteration, with each per-CPU slot holding "cpu index + 100".
+ *
+ * expected_key_mask collects the inserted keys as a bitmask (keys 1 and 2)
+ * so the iteration can check that it only sees keys that were inserted.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_hashmap_percpu(unsigned int task, void *data)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ BPF_DECLARE_PERCPU(long, value);
+ long long key, next_key, first_key;
+ int expected_key_mask = 0;
+ int fd, i;
+
+ fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
+ sizeof(bpf_percpu(value, 0)), 2, map_flags);
+ if (fd < 0) {
+ printf("Failed to create hashmap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ /* Give every CPU slot a distinct, recognisable value. */
+ for (i = 0; i < nr_cpus; i++)
+ bpf_percpu(value, i) = i + 100;
+
+ key = 1;
+ /* Insert key=1 element. */
+ assert(!(expected_key_mask & key));
+ assert(bpf_map_update_elem(fd, &key, value, BPF_ANY) == 0);
+ expected_key_mask |= key;
+
+ /* BPF_NOEXIST means add new element if it doesn't exist. */
+ assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 &&
+ /* key=1 already exists. */
+ errno == EEXIST);
+
+ /* -1 is an invalid flag. */
+ assert(bpf_map_update_elem(fd, &key, value, -1) == -1 &&
+ errno == EINVAL);
+
+ /* Check that key=1 can be found. Value could be 0 if the lookup
+ * was run from a different CPU.
+ */
+ /* Slot 0 is overwritten first so that the assert below only passes
+ * if the lookup really filled the buffer.
+ */
+ bpf_percpu(value, 0) = 1;
+ assert(bpf_map_lookup_elem(fd, &key, value) == 0 &&
+ bpf_percpu(value, 0) == 100);
+
+ key = 2;
+ /* Check that key=2 is not found. */
+ assert(bpf_map_lookup_elem(fd, &key, value) == -1 && errno == ENOENT);
+
+ /* BPF_EXIST means update existing element. */
+ assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == -1 &&
+ /* key=2 is not there. */
+ errno == ENOENT);
+
+ /* Insert key=2 element. */
+ assert(!(expected_key_mask & key));
+ assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == 0);
+ expected_key_mask |= key;
+
+ /* key=1 and key=2 were inserted, check that key=0 cannot be
+ * inserted due to max_entries limit.
+ */
+ key = 0;
+ assert(bpf_map_update_elem(fd, &key, value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Check that key = 0 doesn't exist. */
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+
+ /* Iterate over two elements. */
+ assert(bpf_map_get_next_key(fd, NULL, &first_key) == 0 &&
+ ((expected_key_mask & first_key) == first_key));
+ /* key is still 0 (not in the map) when the walk starts, so the first
+ * key returned is expected to equal first_key.
+ */
+ while (!bpf_map_get_next_key(fd, &key, &next_key)) {
+ if (first_key) {
+ assert(next_key == first_key);
+ first_key = 0;
+ }
+ /* Each key must have been inserted and must show up only once. */
+ assert((expected_key_mask & next_key) == next_key);
+ expected_key_mask &= ~next_key;
+
+ assert(bpf_map_lookup_elem(fd, &next_key, value) == 0);
+
+ for (i = 0; i < nr_cpus; i++)
+ assert(bpf_percpu(value, i) == i + 100);
+
+ key = next_key;
+ }
+ /* The walk must have ended because no further key exists. */
+ assert(errno == ENOENT);
+
+ /* Update with BPF_EXIST. */
+ key = 1;
+ assert(bpf_map_update_elem(fd, &key, value, BPF_EXIST) == 0);
+
+ /* Delete both elements. */
+ key = 1;
+ assert(bpf_map_delete_elem(fd, &key) == 0);
+ key = 2;
+ assert(bpf_map_delete_elem(fd, &key) == 0);
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == ENOENT);
+
+ key = 0;
+ /* Check that map is empty. */
+ assert(bpf_map_get_next_key(fd, NULL, &next_key) == -1 &&
+ errno == ENOENT);
+ assert(bpf_map_get_next_key(fd, &key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ close(fd);
+}
+
+/* Create a BPF_MAP_TYPE_HASH map with long long keys and values, the current
+ * map_flags and room for max_entries elements, then fill it completely with
+ * the pairs key == value == 0 .. max_entries - 1.
+ *
+ * Returns the map file descriptor; the caller is responsible for closing it.
+ * Creation and update failures are reported through CHECK().
+ */
+static int helper_fill_hashmap(int max_entries)
+{
+	int i, fd, ret;
+	long long key, value;
+
+	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+			    max_entries, map_flags);
+	CHECK(fd < 0,
+	      "failed to create hashmap",
+	      "err: %s, flags: 0x%x\n", strerror(errno), map_flags);
+
+	for (i = 0; i < max_entries; i++) {
+		key = i; value = key;
+		ret = bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST);
+		/* A failed update returns -1 and leaves the cause in errno,
+		 * so errno, not the return value, is what strerror() needs.
+		 */
+		CHECK(ret != 0,
+		      "can't update hashmap",
+		      "err: %s\n", strerror(errno));
+	}
+
+	return fd;
+}
+
+/* Walk a completely filled 1000-entry hash map three times:
+ *   1. a plain walk that looks up every key it is handed;
+ *   2. a walk that increments each value, fetching the following key before
+ *      the current element is updated;
+ *   3. a plain walk verifying that every value is now key + 1.
+ * Each walk must visit exactly 1000 elements.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_hashmap_walk(unsigned int task, void *data)
+{
+	int map_fd, seen, max_entries = 1000;
+	long long cur, val, nxt;
+	bool more;
+
+	map_fd = helper_fill_hashmap(max_entries);
+
+	/* Pass 1: NULL requests the first key, later calls continue from
+	 * the key seen last.
+	 */
+	seen = 0;
+	while (bpf_map_get_next_key(map_fd, seen ? &cur : NULL, &nxt) == 0) {
+		cur = nxt;
+		assert(bpf_map_lookup_elem(map_fd, &cur, &val) == 0);
+		seen++;
+	}
+
+	assert(seen == max_entries);
+
+	/* Pass 2: bump every value by one. The body runs at least once and
+	 * stops after the element for which no successor was found.
+	 */
+	assert(bpf_map_get_next_key(map_fd, NULL, &cur) == 0);
+	seen = 0;
+	do {
+		more = bpf_map_get_next_key(map_fd, &cur, &nxt) == 0;
+		assert(bpf_map_lookup_elem(map_fd, &cur, &val) == 0);
+		val++;
+		assert(bpf_map_update_elem(map_fd, &cur, &val, BPF_EXIST) == 0);
+		cur = nxt;
+		seen++;
+	} while (more);
+
+	assert(seen == max_entries);
+
+	/* Pass 3: every value must now be exactly one above its key. */
+	seen = 0;
+	while (bpf_map_get_next_key(map_fd, seen ? &cur : NULL, &nxt) == 0) {
+		cur = nxt;
+		assert(bpf_map_lookup_elem(map_fd, &cur, &val) == 0);
+		assert(val - 1 == cur);
+		seen++;
+	}
+
+	assert(seen == max_entries);
+	close(map_fd);
+}
+
+/* Check that two hash maps created with BPF_F_ZERO_SEED and filled
+ * identically by helper_fill_hashmap(3) hand out their keys in the same
+ * order when iterated in lockstep.
+ *
+ * map_flags is modified for the duration of the test and restored before
+ * returning.
+ */
+static void test_hashmap_zero_seed(void)
+{
+	int i, first, second, old_flags;
+	long long key, next_first, next_second;
+
+	old_flags = map_flags;
+	map_flags |= BPF_F_ZERO_SEED;
+
+	first = helper_fill_hashmap(3);
+	second = helper_fill_hashmap(3);
+
+	for (i = 0; ; i++) {
+		/* NULL requests the first key; afterwards both maps are
+		 * continued from the key seen last.
+		 */
+		void *key_ptr = !i ? NULL : &key;
+
+		/* The walk ends when the first map has no further key. */
+		if (bpf_map_get_next_key(first, key_ptr, &next_first) != 0)
+			break;
+
+		/* The format ends in a newline like every other CHECK()
+		 * message in this file.
+		 */
+		CHECK(bpf_map_get_next_key(second, key_ptr, &next_second) != 0,
+		      "next_key for second map must succeed",
+		      "key_ptr: %p\n", key_ptr);
+		CHECK(next_first != next_second,
+		      "keys must match",
+		      "i: %d first: %lld second: %lld\n", i,
+		      next_first, next_second);
+
+		key = next_first;
+	}
+
+	map_flags = old_flags;
+	close(first);
+	close(second);
+}
+
+/* Exercise the basic operations of a two-element BPF_MAP_TYPE_ARRAY map:
+ * update, lookup, the max_entries limit, key iteration, and the fact that
+ * deleting an element is rejected. The map is created with flags 0 rather
+ * than map_flags.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_arraymap(unsigned int task, void *data)
+{
+ int key, next_key, fd;
+ long long value;
+
+ fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
+ 2, 0);
+ if (fd < 0) {
+ printf("Failed to create arraymap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ key = 1;
+ value = 1234;
+ /* Insert key=1 element. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+
+ value = 0;
+ /* BPF_NOEXIST is refused for key=1 with EEXIST. */
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == EEXIST);
+
+ /* Check that key=1 can be found. */
+ assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 1234);
+
+ key = 0;
+ /* Check that key=0 is also found and zero initialized. */
+ assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 0);
+
+ /* key=0 and key=1 were inserted, check that key=2 cannot be inserted
+ * due to max_entries limit.
+ */
+ key = 2;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Check that key = 2 doesn't exist. */
+ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+
+ /* Iterate over two elements. */
+ assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 &&
+ next_key == 0);
+ /* key is 2 here (out of range); the walk is expected to restart
+ * at index 0.
+ */
+ assert(bpf_map_get_next_key(fd, &key, &next_key) == 0 &&
+ next_key == 0);
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
+ next_key == 1);
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ /* Delete shouldn't succeed. */
+ key = 1;
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL);
+
+ close(fd);
+}
+
+/* Per-CPU variant of the basic array map test: a two-element
+ * BPF_MAP_TYPE_PERCPU_ARRAY map (created with flags 0) is put through update,
+ * lookup, the max_entries limit, key iteration and a rejected delete, with
+ * each per-CPU slot initially holding "cpu index + 100".
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_arraymap_percpu(unsigned int task, void *data)
+{
+ unsigned int nr_cpus = bpf_num_possible_cpus();
+ BPF_DECLARE_PERCPU(long, values);
+ int key, next_key, fd, i;
+
+ fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
+ sizeof(bpf_percpu(values, 0)), 2, 0);
+ if (fd < 0) {
+ printf("Failed to create arraymap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ /* Give every CPU slot a distinct, recognisable value. */
+ for (i = 0; i < nr_cpus; i++)
+ bpf_percpu(values, i) = i + 100;
+
+ key = 1;
+ /* Insert key=1 element. */
+ assert(bpf_map_update_elem(fd, &key, values, BPF_ANY) == 0);
+
+ /* Slot 0 is changed locally; the refused update below must not store
+ * it, which the lookup that follows verifies.
+ */
+ bpf_percpu(values, 0) = 0;
+ assert(bpf_map_update_elem(fd, &key, values, BPF_NOEXIST) == -1 &&
+ errno == EEXIST);
+
+ /* Check that key=1 can be found. */
+ assert(bpf_map_lookup_elem(fd, &key, values) == 0 &&
+ bpf_percpu(values, 0) == 100);
+
+ key = 0;
+ /* Check that key=0 is also found and zero initialized. */
+ assert(bpf_map_lookup_elem(fd, &key, values) == 0 &&
+ bpf_percpu(values, 0) == 0 &&
+ bpf_percpu(values, nr_cpus - 1) == 0);
+
+ /* Check that key=2 cannot be inserted due to max_entries limit. */
+ key = 2;
+ assert(bpf_map_update_elem(fd, &key, values, BPF_EXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Check that key = 2 doesn't exist. */
+ assert(bpf_map_lookup_elem(fd, &key, values) == -1 && errno == ENOENT);
+
+ /* Iterate over two elements. */
+ assert(bpf_map_get_next_key(fd, NULL, &next_key) == 0 &&
+ next_key == 0);
+ /* key is 2 here (out of range); the walk is expected to restart
+ * at index 0.
+ */
+ assert(bpf_map_get_next_key(fd, &key, &next_key) == 0 &&
+ next_key == 0);
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == 0 &&
+ next_key == 1);
+ assert(bpf_map_get_next_key(fd, &next_key, &next_key) == -1 &&
+ errno == ENOENT);
+
+ /* Delete shouldn't succeed. */
+ key = 1;
+ assert(bpf_map_delete_elem(fd, &key) == -1 && errno == EINVAL);
+
+ close(fd);
+}
+
+/* Store the per-CPU pattern "cpu index + 10" in every slot of a 2000-entry
+ * BPF_MAP_TYPE_PERCPU_ARRAY map, then read each slot back and verify that
+ * the pattern is intact for all CPUs.
+ */
+static void test_arraymap_percpu_many_keys(void)
+{
+	unsigned int nr_cpus = bpf_num_possible_cpus();
+	BPF_DECLARE_PERCPU(long, values);
+	/* nr_keys is kept modest on purpose: a larger count would stress
+	 * the percpu allocator more than anything else.
+	 */
+	unsigned int nr_keys = 2000;
+	int slot, map_fd, cpu;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(slot),
+				sizeof(bpf_percpu(values, 0)), nr_keys, 0);
+	if (map_fd < 0) {
+		printf("Failed to create per-cpu arraymap '%s'!\n",
+		       strerror(errno));
+		exit(1);
+	}
+
+	cpu = 0;
+	while (cpu < nr_cpus) {
+		bpf_percpu(values, cpu) = cpu + 10;
+		cpu++;
+	}
+
+	slot = 0;
+	while (slot < nr_keys) {
+		assert(bpf_map_update_elem(map_fd, &slot, values, BPF_ANY) == 0);
+		slot++;
+	}
+
+	for (slot = 0; slot < nr_keys; slot++) {
+		/* Wipe the buffer so stale contents cannot satisfy the
+		 * check that follows the lookup.
+		 */
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			bpf_percpu(values, cpu) = 0;
+
+		assert(bpf_map_lookup_elem(map_fd, &slot, values) == 0);
+
+		for (cpu = 0; cpu < nr_cpus; cpu++)
+			assert(bpf_percpu(values, cpu) == cpu + 10);
+	}
+
+	close(map_fd);
+}
+
+/* Smoke test: a two-entry BPF_MAP_TYPE_DEVMAP map with __u32 keys and values
+ * must be creatable. The map is closed again right away.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_devmap(unsigned int task, void *data)
+{
+	__u32 key, value;
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key),
+				sizeof(value), 2, 0);
+	if (map_fd >= 0) {
+		close(map_fd);
+		return;
+	}
+
+	printf("Failed to create devmap '%s'!\n", strerror(errno));
+	exit(1);
+}
+
+/* Smoke test: a two-entry BPF_MAP_TYPE_DEVMAP_HASH map with __u32 keys and
+ * values must be creatable. The map is closed again right away.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_devmap_hash(unsigned int task, void *data)
+{
+	__u32 key, value;
+	int map_fd;
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH, sizeof(key),
+				sizeof(value), 2, 0);
+	if (map_fd >= 0) {
+		close(map_fd);
+		return;
+	}
+
+	printf("Failed to create devmap_hash '%s'!\n", strerror(errno));
+	exit(1);
+}
+
+/* Exercise a BPF_MAP_TYPE_QUEUE map of MAP_SIZE entries: creation rules
+ * (key size must be 0, BPF_F_NO_PREALLOC is refused), push until full, peek,
+ * push with BPF_EXIST on a full queue, pop everything in FIFO order, and the
+ * operations that are expected to fail with EINVAL.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_queuemap(unsigned int task, void *data)
+{
+ const int MAP_SIZE = 32;
+ __u32 vals[MAP_SIZE + MAP_SIZE/2], val;
+ int fd, i;
+
+ /* Fill test values to be used */
+ for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++)
+ vals[i] = rand();
+
+ /* Invalid key size */
+ fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 4, sizeof(val), MAP_SIZE,
+ map_flags);
+ assert(fd < 0 && errno == EINVAL);
+
+ fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(val), MAP_SIZE,
+ map_flags);
+ /* Queue map does not support BPF_F_NO_PREALLOC */
+ if (map_flags & BPF_F_NO_PREALLOC) {
+ assert(fd < 0 && errno == EINVAL);
+ return;
+ }
+ if (fd < 0) {
+ printf("Failed to create queuemap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ /* Push MAP_SIZE elements */
+ for (i = 0; i < MAP_SIZE; i++)
+ assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
+
+ /* Check that element cannot be pushed due to max_entries limit */
+ assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+ errno == E2BIG);
+
+ /* Peek element */
+ assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[0]);
+
+ /* Replace half elements */
+ /* The pops below expect vals[MAP_SIZE/2] onwards, i.e. each
+ * BPF_EXIST push on the full queue drops the oldest element.
+ */
+ for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++)
+ assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0);
+
+ /* Pop all elements */
+ for (i = MAP_SIZE/2; i < MAP_SIZE + MAP_SIZE/2; i++)
+ assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 &&
+ val == vals[i]);
+
+ /* Check that there are not elements left */
+ assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+ errno == ENOENT);
+
+ /* Check that non supported functions set errno to EINVAL */
+ assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
+ assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+
+ close(fd);
+}
+
+/* Exercise a BPF_MAP_TYPE_STACK map of MAP_SIZE entries: creation rules
+ * (key size must be 0, BPF_F_NO_PREALLOC is refused), push until full, peek,
+ * push with BPF_EXIST on a full stack, pop everything in LIFO order, and the
+ * operations that are expected to fail with EINVAL.
+ *
+ * The task and data parameters are not used by this test.
+ */
+static void test_stackmap(unsigned int task, void *data)
+{
+ const int MAP_SIZE = 32;
+ __u32 vals[MAP_SIZE + MAP_SIZE/2], val;
+ int fd, i;
+
+ /* Fill test values to be used */
+ for (i = 0; i < MAP_SIZE + MAP_SIZE/2; i++)
+ vals[i] = rand();
+
+ /* Invalid key size */
+ fd = bpf_create_map(BPF_MAP_TYPE_STACK, 4, sizeof(val), MAP_SIZE,
+ map_flags);
+ assert(fd < 0 && errno == EINVAL);
+
+ fd = bpf_create_map(BPF_MAP_TYPE_STACK, 0, sizeof(val), MAP_SIZE,
+ map_flags);
+ /* Stack map does not support BPF_F_NO_PREALLOC */
+ if (map_flags & BPF_F_NO_PREALLOC) {
+ assert(fd < 0 && errno == EINVAL);
+ return;
+ }
+ if (fd < 0) {
+ printf("Failed to create stackmap '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ /* Push MAP_SIZE elements */
+ for (i = 0; i < MAP_SIZE; i++)
+ assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
+
+ /* Check that element cannot be pushed due to max_entries limit */
+ assert(bpf_map_update_elem(fd, NULL, &val, 0) == -1 &&
+ errno == E2BIG);
+
+ /* Peek element */
+ /* i equals MAP_SIZE after the push loop, so vals[i - 1] is the value
+ * pushed last.
+ */
+ assert(bpf_map_lookup_elem(fd, NULL, &val) == 0 && val == vals[i - 1]);
+
+ /* Replace half elements */
+ /* The pops below stop at vals[MAP_SIZE/2], i.e. each BPF_EXIST push
+ * on the full stack drops the oldest element.
+ */
+ for (i = MAP_SIZE; i < MAP_SIZE + MAP_SIZE/2; i++)
+ assert(bpf_map_update_elem(fd, NULL, &vals[i], BPF_EXIST) == 0);
+
+ /* Pop all elements */
+ for (i = MAP_SIZE + MAP_SIZE/2 - 1; i >= MAP_SIZE/2; i--)
+ assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 &&
+ val == vals[i]);
+
+ /* Check that there are not elements left */
+ assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == -1 &&
+ errno == ENOENT);
+
+ /* Check that non supported functions set errno to EINVAL */
+ assert(bpf_map_delete_elem(fd, NULL) == -1 && errno == EINVAL);
+ assert(bpf_map_get_next_key(fd, NULL, NULL) == -1 && errno == EINVAL);
+
+ close(fd);
+}
+
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+#include <sys/select.h>
+#include <linux/err.h>
+#define SOCKMAP_PARSE_PROG "./sockmap_parse_prog.o"
+#define SOCKMAP_VERDICT_PROG "./sockmap_verdict_prog.o"
+#define SOCKMAP_TCP_MSG_PROG "./sockmap_tcp_msg_prog.o"
+static void test_sockmap(unsigned int tasks, void *data)
+{
+ struct bpf_map *bpf_map_rx, *bpf_map_tx, *bpf_map_msg, *bpf_map_break;
+ int map_fd_msg = 0, map_fd_rx = 0, map_fd_tx = 0, map_fd_break;
+ int ports[] = {50200, 50201, 50202, 50204};
+ int err, i, fd, udp, sfd[6] = {0xdeadbeef};
+ u8 buf[20] = {0x0, 0x5, 0x3, 0x2, 0x1, 0x0};
+ int parse_prog, verdict_prog, msg_prog;
+ struct sockaddr_in addr;
+ int one = 1, s, sc, rc;
+ struct bpf_object *obj;
+ struct timeval to;
+ __u32 key, value;
+ pid_t pid[tasks];
+ fd_set w;
+
+ /* Create some sockets to use with sockmap */
+ for (i = 0; i < 2; i++) {
+ sfd[i] = socket(AF_INET, SOCK_STREAM, 0);
+ if (sfd[i] < 0)
+ goto out;
+ err = setsockopt(sfd[i], SOL_SOCKET, SO_REUSEADDR,
+ (char *)&one, sizeof(one));
+ if (err) {
+ printf("failed to setsockopt\n");
+ goto out;
+ }
+ err = ioctl(sfd[i], FIONBIO, (char *)&one);
+ if (err < 0) {
+ printf("failed to ioctl\n");
+ goto out;
+ }
+ memset(&addr, 0, sizeof(struct sockaddr_in));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+ addr.sin_port = htons(ports[i]);
+ err = bind(sfd[i], (struct sockaddr *)&addr, sizeof(addr));
+ if (err < 0) {
+ printf("failed to bind: err %i: %i:%i\n",
+ err, i, sfd[i]);
+ goto out;
+ }
+ err = listen(sfd[i], 32);
+ if (err < 0) {
+ printf("failed to listen\n");
+ goto out;
+ }
+ }
+
+ for (i = 2; i < 4; i++) {
+ sfd[i] = socket(AF_INET, SOCK_STREAM, 0);
+ if (sfd[i] < 0)
+ goto out;
+ err = setsockopt(sfd[i], SOL_SOCKET, SO_REUSEADDR,
+ (char *)&one, sizeof(one));
+ if (err) {
+ printf("set sock opt\n");
+ goto out;
+ }
+ memset(&addr, 0, sizeof(struct sockaddr_in));
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+ addr.sin_port = htons(ports[i - 2]);
+ err = connect(sfd[i], (struct sockaddr *)&addr, sizeof(addr));
+ if (err) {
+ printf("failed to connect\n");
+ goto out;
+ }
+ }
+
+
+ for (i = 4; i < 6; i++) {
+ sfd[i] = accept(sfd[i - 4], NULL, NULL);
+ if (sfd[i] < 0) {
+ printf("accept failed\n");
+ goto out;
+ }
+ }
+
+ /* Test sockmap with connected sockets */
+ fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
+ sizeof(key), sizeof(value),
+ 6, 0);
+ if (fd < 0) {
+ if (!bpf_probe_map_type(BPF_MAP_TYPE_SOCKMAP, 0)) {
+ printf("%s SKIP (unsupported map type BPF_MAP_TYPE_SOCKMAP)\n",
+ __func__);
+ skips++;
+ for (i = 0; i < 6; i++)
+ close(sfd[i]);
+ return;
+ }
+
+ printf("Failed to create sockmap %i\n", fd);
+ goto out_sockmap;
+ }
+
+ /* Test update with unsupported UDP socket */
+ udp = socket(AF_INET, SOCK_DGRAM, 0);
+ i = 0;
+ err = bpf_map_update_elem(fd, &i, &udp, BPF_ANY);
+ if (!err) {
+ printf("Failed socket SOCK_DGRAM allowed '%i:%i'\n",
+ i, udp);
+ goto out_sockmap;
+ }
+
+ /* Test update without programs */
+ for (i = 0; i < 6; i++) {
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+ if (err) {
+ printf("Failed noprog update sockmap '%i:%i'\n",
+ i, sfd[i]);
+ goto out_sockmap;
+ }
+ }
+
+ /* Test attaching/detaching bad fds */
+ err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0);
+ if (!err) {
+ printf("Failed invalid parser prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (!err) {
+ printf("Failed invalid verdict prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(-1, fd, BPF_SK_MSG_VERDICT, 0);
+ if (!err) {
+ printf("Failed invalid msg verdict prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(-1, fd, __MAX_BPF_ATTACH_TYPE, 0);
+ if (!err) {
+ printf("Failed unknown prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_PARSER);
+ if (!err) {
+ printf("Failed empty parser prog detach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach(fd, BPF_SK_SKB_STREAM_VERDICT);
+ if (!err) {
+ printf("Failed empty verdict prog detach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach(fd, BPF_SK_MSG_VERDICT);
+ if (!err) {
+ printf("Failed empty msg verdict prog detach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach(fd, __MAX_BPF_ATTACH_TYPE);
+ if (!err) {
+ printf("Detach invalid prog successful\n");
+ goto out_sockmap;
+ }
+
+ /* Load SK_SKB program and Attach */
+ err = bpf_prog_load(SOCKMAP_PARSE_PROG,
+ BPF_PROG_TYPE_SK_SKB, &obj, &parse_prog);
+ if (err) {
+ printf("Failed to load SK_SKB parse prog\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_load(SOCKMAP_TCP_MSG_PROG,
+ BPF_PROG_TYPE_SK_MSG, &obj, &msg_prog);
+ if (err) {
+ printf("Failed to load SK_SKB msg prog\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_load(SOCKMAP_VERDICT_PROG,
+ BPF_PROG_TYPE_SK_SKB, &obj, &verdict_prog);
+ if (err) {
+ printf("Failed to load SK_SKB verdict prog\n");
+ goto out_sockmap;
+ }
+
+ bpf_map_rx = bpf_object__find_map_by_name(obj, "sock_map_rx");
+ if (IS_ERR(bpf_map_rx)) {
+ printf("Failed to load map rx from verdict prog\n");
+ goto out_sockmap;
+ }
+
+ map_fd_rx = bpf_map__fd(bpf_map_rx);
+ if (map_fd_rx < 0) {
+ printf("Failed to get map rx fd\n");
+ goto out_sockmap;
+ }
+
+ bpf_map_tx = bpf_object__find_map_by_name(obj, "sock_map_tx");
+ if (IS_ERR(bpf_map_tx)) {
+ printf("Failed to load map tx from verdict prog\n");
+ goto out_sockmap;
+ }
+
+ map_fd_tx = bpf_map__fd(bpf_map_tx);
+ if (map_fd_tx < 0) {
+ printf("Failed to get map tx fd\n");
+ goto out_sockmap;
+ }
+
+ bpf_map_msg = bpf_object__find_map_by_name(obj, "sock_map_msg");
+ if (IS_ERR(bpf_map_msg)) {
+ printf("Failed to load map msg from msg_verdict prog\n");
+ goto out_sockmap;
+ }
+
+ map_fd_msg = bpf_map__fd(bpf_map_msg);
+ if (map_fd_msg < 0) {
+ printf("Failed to get map msg fd\n");
+ goto out_sockmap;
+ }
+
+ bpf_map_break = bpf_object__find_map_by_name(obj, "sock_map_break");
+ if (IS_ERR(bpf_map_break)) {
+ printf("Failed to load map tx from verdict prog\n");
+ goto out_sockmap;
+ }
+
+ map_fd_break = bpf_map__fd(bpf_map_break);
+ if (map_fd_break < 0) {
+ printf("Failed to get map tx fd\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(parse_prog, map_fd_break,
+ BPF_SK_SKB_STREAM_PARSER, 0);
+ if (!err) {
+ printf("Allowed attaching SK_SKB program to invalid map\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(parse_prog, map_fd_rx,
+ BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err) {
+ printf("Failed stream parser bpf prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(verdict_prog, map_fd_rx,
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err) {
+ printf("Failed stream verdict bpf prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(msg_prog, map_fd_msg, BPF_SK_MSG_VERDICT, 0);
+ if (err) {
+ printf("Failed msg verdict bpf prog attach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_attach(verdict_prog, map_fd_rx,
+ __MAX_BPF_ATTACH_TYPE, 0);
+ if (!err) {
+ printf("Attached unknown bpf prog\n");
+ goto out_sockmap;
+ }
+
+ /* Test map update elem afterwards fd lives in fd and map_fd */
+ for (i = 2; i < 6; i++) {
+ err = bpf_map_update_elem(map_fd_rx, &i, &sfd[i], BPF_ANY);
+ if (err) {
+ printf("Failed map_fd_rx update sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ err = bpf_map_update_elem(map_fd_tx, &i, &sfd[i], BPF_ANY);
+ if (err) {
+ printf("Failed map_fd_tx update sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ }
+
+ /* Test map delete elem and remove send/recv sockets */
+ for (i = 2; i < 4; i++) {
+ err = bpf_map_delete_elem(map_fd_rx, &i);
+ if (err) {
+ printf("Failed delete sockmap rx %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ err = bpf_map_delete_elem(map_fd_tx, &i);
+ if (err) {
+ printf("Failed delete sockmap tx %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ }
+
+ /* Put sfd[2] (sending fd below) into msg map to test sendmsg bpf */
+ i = 0;
+ err = bpf_map_update_elem(map_fd_msg, &i, &sfd[2], BPF_ANY);
+ if (err) {
+ printf("Failed map_fd_msg update sockmap %i\n", err);
+ goto out_sockmap;
+ }
+
+ /* Test map send/recv */
+ for (i = 0; i < 2; i++) {
+ buf[0] = i;
+ buf[1] = 0x5;
+ sc = send(sfd[2], buf, 20, 0);
+ if (sc < 0) {
+ printf("Failed sockmap send\n");
+ goto out_sockmap;
+ }
+
+ FD_ZERO(&w);
+ FD_SET(sfd[3], &w);
+ to.tv_sec = 30;
+ to.tv_usec = 0;
+ s = select(sfd[3] + 1, &w, NULL, NULL, &to);
+ if (s == -1) {
+ perror("Failed sockmap select()");
+ goto out_sockmap;
+ } else if (!s) {
+ printf("Failed sockmap unexpected timeout\n");
+ goto out_sockmap;
+ }
+
+ if (!FD_ISSET(sfd[3], &w)) {
+ printf("Failed sockmap select/recv\n");
+ goto out_sockmap;
+ }
+
+ rc = recv(sfd[3], buf, sizeof(buf), 0);
+ if (rc < 0) {
+ printf("Failed sockmap recv\n");
+ goto out_sockmap;
+ }
+ }
+
+ /* Negative null entry lookup from datapath should be dropped */
+ buf[0] = 1;
+ buf[1] = 12;
+ sc = send(sfd[2], buf, 20, 0);
+ if (sc < 0) {
+ printf("Failed sockmap send\n");
+ goto out_sockmap;
+ }
+
+ /* Push fd into same slot */
+ i = 2;
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST);
+ if (!err) {
+ printf("Failed allowed sockmap dup slot BPF_NOEXIST\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+ if (err) {
+ printf("Failed sockmap update new slot BPF_ANY\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST);
+ if (err) {
+ printf("Failed sockmap update new slot BPF_EXIST\n");
+ goto out_sockmap;
+ }
+
+ /* Delete the elems without programs */
+ for (i = 2; i < 6; i++) {
+ err = bpf_map_delete_elem(fd, &i);
+ if (err) {
+ printf("Failed delete sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ }
+ }
+
+ /* Test having multiple maps open and set with programs on same fds */
+ err = bpf_prog_attach(parse_prog, fd,
+ BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err) {
+ printf("Failed fd bpf parse prog attach\n");
+ goto out_sockmap;
+ }
+ err = bpf_prog_attach(verdict_prog, fd,
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err) {
+ printf("Failed fd bpf verdict prog attach\n");
+ goto out_sockmap;
+ }
+
+ for (i = 4; i < 6; i++) {
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY);
+ if (!err) {
+ printf("Failed allowed duplicate programs in update ANY sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST);
+ if (!err) {
+ printf("Failed allowed duplicate program in update NOEXIST sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST);
+ if (!err) {
+ printf("Failed allowed duplicate program in update EXIST sockmap %i '%i:%i'\n",
+ err, i, sfd[i]);
+ goto out_sockmap;
+ }
+ }
+
+ /* Test tasks number of forked operations */
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ for (i = 0; i < 6; i++) {
+ bpf_map_delete_elem(map_fd_tx, &i);
+ bpf_map_delete_elem(map_fd_rx, &i);
+ bpf_map_update_elem(map_fd_tx, &i,
+ &sfd[i], BPF_ANY);
+ bpf_map_update_elem(map_fd_rx, &i,
+ &sfd[i], BPF_ANY);
+ }
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("Couldn't spawn #%d process!\n", i);
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+
+ err = bpf_prog_detach2(parse_prog, map_fd_rx, __MAX_BPF_ATTACH_TYPE);
+ if (!err) {
+ printf("Detached an invalid prog type.\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach2(parse_prog, map_fd_rx, BPF_SK_SKB_STREAM_PARSER);
+ if (err) {
+ printf("Failed parser prog detach\n");
+ goto out_sockmap;
+ }
+
+ err = bpf_prog_detach2(verdict_prog, map_fd_rx, BPF_SK_SKB_STREAM_VERDICT);
+ if (err) {
+ printf("Failed parser prog detach\n");
+ goto out_sockmap;
+ }
+
+ /* Test map close sockets and empty maps */
+ for (i = 0; i < 6; i++) {
+ bpf_map_delete_elem(map_fd_tx, &i);
+ bpf_map_delete_elem(map_fd_rx, &i);
+ close(sfd[i]);
+ }
+ close(fd);
+ close(map_fd_rx);
+ bpf_object__close(obj);
+ return;
+out:
+ for (i = 0; i < 6; i++)
+ close(sfd[i]);
+ printf("Failed to create sockmap '%i:%s'!\n", i, strerror(errno));
+ exit(1);
+out_sockmap:
+ for (i = 0; i < 6; i++) {
+ if (map_fd_tx)
+ bpf_map_delete_elem(map_fd_tx, &i);
+ if (map_fd_rx)
+ bpf_map_delete_elem(map_fd_rx, &i);
+ close(sfd[i]);
+ }
+ close(fd);
+ exit(1);
+}
+
+ #define MAPINMAP_PROG "./test_map_in_map.o"
+
+ /*
+  * Exercise map-in-map support using the maps declared in MAPINMAP_PROG.
+  *
+  * A small hash map is created and registered as the inner map of both
+  * "mim_array" and "mim_hash" before the object is loaded.  After loading,
+  * the same inner map fd is stored at key 0 of each outer map.
+  *
+  * Any failure prints a diagnostic and terminates the process with exit(1).
+  */
+ static void test_map_in_map(void)
+ {
+ 	struct bpf_object *obj;
+ 	struct bpf_map *map;
+ 	int mim_fd, fd, err;
+ 	int pos = 0;
+
+ 	obj = bpf_object__open(MAPINMAP_PROG);
+ 	/* Depending on the libbpf mode, failure is NULL or an ERR_PTR. */
+ 	if (!obj || IS_ERR(obj)) {
+ 		printf("Failed to open test prog '%s'!\n", MAPINMAP_PROG);
+ 		exit(1);
+ 	}
+
+ 	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int), sizeof(int),
+ 			    2, 0);
+ 	if (fd < 0) {
+ 		/* Report before closing the object so errno is not clobbered. */
+ 		printf("Failed to create hashmap '%s'!\n", strerror(errno));
+ 		bpf_object__close(obj);
+ 		exit(1);
+ 	}
+
+ 	map = bpf_object__find_map_by_name(obj, "mim_array");
+ 	if (!map || IS_ERR(map)) {
+ 		printf("Failed to load array of maps from test prog\n");
+ 		goto out_map_in_map;
+ 	}
+ 	err = bpf_map__set_inner_map_fd(map, fd);
+ 	if (err) {
+ 		printf("Failed to set inner_map_fd for array of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	map = bpf_object__find_map_by_name(obj, "mim_hash");
+ 	if (!map || IS_ERR(map)) {
+ 		printf("Failed to load hash of maps from test prog\n");
+ 		goto out_map_in_map;
+ 	}
+ 	err = bpf_map__set_inner_map_fd(map, fd);
+ 	if (err) {
+ 		printf("Failed to set inner_map_fd for hash of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	/* Without this check a failed load only surfaces as a bad map fd. */
+ 	err = bpf_object__load(obj);
+ 	if (err) {
+ 		printf("Failed to load test prog\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	map = bpf_object__find_map_by_name(obj, "mim_array");
+ 	if (!map || IS_ERR(map)) {
+ 		printf("Failed to load array of maps from test prog\n");
+ 		goto out_map_in_map;
+ 	}
+ 	mim_fd = bpf_map__fd(map);
+ 	if (mim_fd < 0) {
+ 		printf("Failed to get descriptor for array of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	err = bpf_map_update_elem(mim_fd, &pos, &fd, 0);
+ 	if (err) {
+ 		printf("Failed to update array of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	map = bpf_object__find_map_by_name(obj, "mim_hash");
+ 	if (!map || IS_ERR(map)) {
+ 		printf("Failed to load hash of maps from test prog\n");
+ 		goto out_map_in_map;
+ 	}
+ 	mim_fd = bpf_map__fd(map);
+ 	if (mim_fd < 0) {
+ 		printf("Failed to get descriptor for hash of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	err = bpf_map_update_elem(mim_fd, &pos, &fd, 0);
+ 	if (err) {
+ 		printf("Failed to update hash of maps\n");
+ 		goto out_map_in_map;
+ 	}
+
+ 	close(fd);
+ 	bpf_object__close(obj);
+ 	return;
+
+ out_map_in_map:
+ 	close(fd);
+ 	bpf_object__close(obj);
+ 	exit(1);
+ }
+
+ /* Number of entries (max_entries) shared by the map tests that follow. */
+ #define MAP_SIZE (32 * 1024)
+
+ /*
+  * Fill a hash map keyed by a large struct up to MAP_SIZE entries, then
+  * check overflow, iteration and lookup behaviour.
+  *
+  * Exits with status 1 if the map cannot be created; every other
+  * expectation is enforced with assert().
+  */
+ static void test_map_large(void)
+ {
+ struct bigkey {
+ int a;
+ char b[116];
+ long long c;
+ } key;
+ int fd, i, value;
+
+ fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ MAP_SIZE, map_flags);
+ if (fd < 0) {
+ printf("Failed to create large map '%s'!\n", strerror(errno));
+ exit(1);
+ }
+
+ /* Insert MAP_SIZE distinct keys; only .c varies, the rest is zeroed. */
+ for (i = 0; i < MAP_SIZE; i++) {
+ key = (struct bigkey) { .c = i };
+ value = i;
+
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == 0);
+ }
+
+ /* The map is full now, so one more new key must be rejected. */
+ key.c = -1;
+ assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ errno == E2BIG);
+
+ /* Iterate through all elements. */
+ assert(bpf_map_get_next_key(fd, NULL, &key) == 0);
+ /* c == -1 was never inserted; presumably this restarts the walk. */
+ key.c = -1;
+ for (i = 0; i < MAP_SIZE; i++)
+ assert(bpf_map_get_next_key(fd, &key, &key) == 0);
+ assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+
+ /* The key with c == 0 was stored with value 0 by the fill loop. */
+ key.c = 0;
+ assert(bpf_map_lookup_elem(fd, &key, &value) == 0 && value == 0);
+ /* No key with a == 1 was ever inserted. */
+ key.a = 1;
+ assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+
+ close(fd);
+ }
+
+#define run_parallel(N, FN, DATA) \
+ printf("Fork %u tasks to '" #FN "'\n", N); \
+ __run_parallel(N, FN, DATA)
+
+static void __run_parallel(unsigned int tasks,
+ void (*fn)(unsigned int task, void *data),
+ void *data)
+{
+ pid_t pid[tasks];
+ int i;
+
+ fflush(stdout);
+
+ for (i = 0; i < tasks; i++) {
+ pid[i] = fork();
+ if (pid[i] == 0) {
+ fn(i, data);
+ exit(0);
+ } else if (pid[i] == -1) {
+ printf("Couldn't spawn #%d process!\n", i);
+ exit(1);
+ }
+ }
+
+ for (i = 0; i < tasks; i++) {
+ int status;
+
+ assert(waitpid(pid[i], &status, 0) == pid[i]);
+ assert(status == 0);
+ }
+}
+
+ /*
+  * Stress the basic hash and array map tests by running each of them in
+  * 100 forked processes at once via run_parallel().
+  */
+ static void test_map_stress(void)
+ {
+ run_parallel(100, test_hashmap, NULL);
+ run_parallel(100, test_hashmap_percpu, NULL);
+ run_parallel(100, test_hashmap_sizes, NULL);
+ run_parallel(100, test_hashmap_walk, NULL);
+
+ run_parallel(100, test_arraymap, NULL);
+ run_parallel(100, test_arraymap_percpu, NULL);
+ }
+
+ #define TASKS 1024
+
+ #define DO_UPDATE 1
+ #define DO_DELETE 0
+
+ /*
+  * Worker for run_parallel(): handles every TASKS-th key starting at @fn.
+  *
+  * @fn:   index of this worker, also the first key it touches
+  * @data: int[2] where [0] is the map fd and [1] is DO_UPDATE or DO_DELETE
+  *
+  * In update mode each key is inserted (BPF_NOEXIST) and then overwritten
+  * (BPF_EXIST) with value == key; in delete mode each key is removed.
+  */
+ static void test_update_delete(unsigned int fn, void *data)
+ {
+ 	const int *args = data;
+ 	int fd = args[0];
+ 	int do_update = args[1];
+ 	int key, value;
+ 	int i = fn;
+
+ 	while (i < MAP_SIZE) {
+ 		value = i;
+ 		key = value;
+
+ 		if (!do_update) {
+ 			assert(bpf_map_delete_elem(fd, &key) == 0);
+ 		} else {
+ 			assert(bpf_map_update_elem(fd, &key, &value,
+ 						   BPF_NOEXIST) == 0);
+ 			assert(bpf_map_update_elem(fd, &key, &value,
+ 						   BPF_EXIST) == 0);
+ 		}
+
+ 		i += TASKS;
+ 	}
+ }
+
+ /*
+  * Populate and then empty one hash map from TASKS forked children that
+  * all share the same map fd, verifying the contents after each phase.
+  *
+  * Exits with status 1 if the map cannot be created; all other
+  * expectations are enforced with assert().
+  */
+ static void test_map_parallel(void)
+ {
+ 	int i, fd, key = 0, value = 0;
+ 	int data[2];
+
+ 	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ 			    MAP_SIZE, map_flags);
+ 	if (fd < 0) {
+ 		printf("Failed to create map for parallel test '%s'!\n",
+ 		       strerror(errno));
+ 		exit(1);
+ 	}
+
+ 	/* Use the same fd in children to add elements to this map:
+ 	 * child_0 adds key=0, key=1024, key=2048, ...
+ 	 * child_1 adds key=1, key=1025, key=2049, ...
+ 	 * child_1023 adds key=1023, ...
+ 	 */
+ 	data[0] = fd;
+ 	data[1] = DO_UPDATE;
+ 	run_parallel(TASKS, test_update_delete, data);
+
+ 	/* Check that key=0 is already there. */
+ 	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+ 	       errno == EEXIST);
+
+ 	/* Check that all elements were inserted. */
+ 	assert(bpf_map_get_next_key(fd, NULL, &key) == 0);
+ 	key = -1;
+ 	for (i = 0; i < MAP_SIZE; i++)
+ 		assert(bpf_map_get_next_key(fd, &key, &key) == 0);
+ 	assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+
+ 	/* Another check for all elements */
+ 	for (i = 0; i < MAP_SIZE; i++) {
+ 		key = MAP_SIZE - i - 1;
+
+ 		assert(bpf_map_lookup_elem(fd, &key, &value) == 0 &&
+ 		       value == key);
+ 	}
+
+ 	/* Now let's delete all elements in parallel. */
+ 	data[1] = DO_DELETE;
+ 	run_parallel(TASKS, test_update_delete, data);
+
+ 	/* Nothing should be left. */
+ 	key = -1;
+ 	assert(bpf_map_get_next_key(fd, NULL, &key) == -1 && errno == ENOENT);
+ 	assert(bpf_map_get_next_key(fd, &key, &key) == -1 && errno == ENOENT);
+
+ 	/* Release the map; this test runs once per map_flags setting. */
+ 	close(fd);
+ }
+
+ /*
+  * A hash map created with BPF_F_RDONLY must reject updates with EPERM
+  * while lookups and key iteration on the (empty) map report ENOENT.
+  */
+ static void test_map_rdonly(void)
+ {
+ 	int key = 1;
+ 	int value = 1234;
+ 	int fd;
+
+ 	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ 			    MAP_SIZE, map_flags | BPF_F_RDONLY);
+ 	if (fd < 0) {
+ 		printf("Failed to create map for read only test '%s'!\n",
+ 		       strerror(errno));
+ 		exit(1);
+ 	}
+
+ 	/* Writing key=1 through the read-only fd has to fail. */
+ 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == -1 &&
+ 	       errno == EPERM);
+
+ 	/* Reads are permitted, but nothing was stored. */
+ 	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == ENOENT);
+ 	assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == ENOENT);
+
+ 	close(fd);
+ }
+
+ /*
+  * A hash map created with BPF_F_WRONLY must accept updates while
+  * lookups and key iteration fail with EPERM.
+  */
+ static void test_map_wronly_hash(void)
+ {
+ 	int key = 1;
+ 	int value = 1234;
+ 	int fd;
+
+ 	fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
+ 			    MAP_SIZE, map_flags | BPF_F_WRONLY);
+ 	if (fd < 0) {
+ 		printf("Failed to create map for write only test '%s'!\n",
+ 		       strerror(errno));
+ 		exit(1);
+ 	}
+
+ 	/* Storing key=1 through the write-only fd is allowed. */
+ 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+
+ 	/* Neither values nor keys may be read back. */
+ 	assert(bpf_map_lookup_elem(fd, &key, &value) == -1 && errno == EPERM);
+ 	assert(bpf_map_get_next_key(fd, &key, &value) == -1 && errno == EPERM);
+
+ 	close(fd);
+ }
+
+ /*
+  * Write-only variant for BPF_MAP_TYPE_STACK / BPF_MAP_TYPE_QUEUE: a push
+  * must succeed while peek and pop fail with EPERM.
+  *
+  * @map_type: must be BPF_MAP_TYPE_QUEUE or BPF_MAP_TYPE_STACK (asserted)
+  *
+  * When map_flags contains BPF_F_NO_PREALLOC, map creation itself is
+  * expected to fail with EINVAL and the function returns early.
+  */
+ static void test_map_wronly_stack_or_queue(enum bpf_map_type map_type)
+ {
+ 	int value = 1234;
+ 	int fd;
+
+ 	assert(map_type == BPF_MAP_TYPE_QUEUE ||
+ 	       map_type == BPF_MAP_TYPE_STACK);
+
+ 	fd = bpf_create_map(map_type, 0, sizeof(value), MAP_SIZE,
+ 			    map_flags | BPF_F_WRONLY);
+
+ 	/* Stack/Queue maps do not support BPF_F_NO_PREALLOC */
+ 	if (map_flags & BPF_F_NO_PREALLOC) {
+ 		assert(fd < 0 && errno == EINVAL);
+ 		return;
+ 	}
+
+ 	if (fd < 0) {
+ 		printf("Failed to create map '%s'!\n", strerror(errno));
+ 		exit(1);
+ 	}
+
+ 	/* Push one element. */
+ 	assert(bpf_map_update_elem(fd, NULL, &value, BPF_ANY) == 0);
+
+ 	/* Peek element should fail */
+ 	assert(bpf_map_lookup_elem(fd, NULL, &value) == -1 && errno == EPERM);
+
+ 	/* Pop element should fail */
+ 	assert(bpf_map_lookup_and_delete_elem(fd, NULL, &value) == -1 &&
+ 	       errno == EPERM);
+
+ 	close(fd);
+ }
+
+ /* Run the write-only map checks for hash, stack and queue maps. */
+ static void test_map_wronly(void)
+ {
+ test_map_wronly_hash();
+ test_map_wronly_stack_or_queue(BPF_MAP_TYPE_STACK);
+ test_map_wronly_stack_or_queue(BPF_MAP_TYPE_QUEUE);
+ }
+
+ /*
+  * Create @n IPv6 sockets of @type that form one SO_REUSEPORT group.
+  *
+  * @type:          socket type passed to socket(), e.g. SOCK_STREAM
+  * @map_fd:        reuseport array used for the negative update checks
+  * @map_elem_size: value size of @map_fd, sizeof(__u64) or sizeof(__u32)
+  * @fds64:         out: the @n socket fds
+  * @sk_cookies:    out: SO_COOKIE of each socket
+  * @n:             number of sockets to create
+  *
+  * The first socket binds to port 0; getsockname() then stores the port
+  * that was assigned in s6 so the remaining sockets bind to the same one.
+  * Along the way it checks that the map rejects an unbound socket and,
+  * for SOCK_STREAM, a bound but non-listening socket (both EINVAL).
+  * Any failed CHECK() terminates the process.
+  */
+ static void prepare_reuseport_grp(int type, int map_fd, size_t map_elem_size,
+ 				  __s64 *fds64, __u64 *sk_cookies,
+ 				  unsigned int n)
+ {
+ 	/*
+ 	 * Zero-initialise the whole address: sin6_flowinfo and
+ 	 * sin6_scope_id are never assigned below and must not reach
+ 	 * bind() as stack garbage.
+ 	 */
+ 	struct sockaddr_in6 s6 = { .sin6_family = AF_INET6 };
+ 	socklen_t optlen, addrlen;
+ 	const __u32 index0 = 0;
+ 	const int optval = 1;
+ 	unsigned int i;
+ 	u64 sk_cookie;
+ 	void *value;
+ 	__s32 fd32;
+ 	__s64 fd64;
+ 	int err;
+
+ 	s6.sin6_addr = in6addr_any;
+ 	s6.sin6_port = 0;
+ 	addrlen = sizeof(s6);
+ 	optlen = sizeof(sk_cookie);
+
+ 	for (i = 0; i < n; i++) {
+ 		fd64 = socket(AF_INET6, type, 0);
+ 		CHECK(fd64 == -1, "socket()",
+ 		      "sock_type:%d fd64:%lld errno:%d\n",
+ 		      type, fd64, errno);
+
+ 		err = setsockopt(fd64, SOL_SOCKET, SO_REUSEPORT,
+ 				 &optval, sizeof(optval));
+ 		CHECK(err == -1, "setsockopt(SO_REUSEPORT)",
+ 		      "err:%d errno:%d\n", err, errno);
+
+ 		/* Pick the fd representation matching the map value size. */
+ 		if (map_elem_size == sizeof(__u64))
+ 			value = &fd64;
+ 		else {
+ 			assert(map_elem_size == sizeof(__u32));
+ 			fd32 = (__s32)fd64;
+ 			value = &fd32;
+ 		}
+
+ 		/* reuseport_array does not allow unbound sk */
+ 		err = bpf_map_update_elem(map_fd, &index0, value, BPF_ANY);
+ 		CHECK(err != -1 || errno != EINVAL,
+ 		      "reuseport array update unbound sk",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+
+ 		err = bind(fd64, (struct sockaddr *)&s6, sizeof(s6));
+ 		CHECK(err == -1, "bind()",
+ 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ 		if (i == 0) {
+ 			/* Remember the assigned port for the other sockets. */
+ 			err = getsockname(fd64, (struct sockaddr *)&s6,
+ 					  &addrlen);
+ 			CHECK(err == -1, "getsockname()",
+ 			      "sock_type:%d err:%d errno:%d\n",
+ 			      type, err, errno);
+ 		}
+
+ 		err = getsockopt(fd64, SOL_SOCKET, SO_COOKIE, &sk_cookie,
+ 				 &optlen);
+ 		CHECK(err == -1, "getsockopt(SO_COOKIE)",
+ 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ 		if (type == SOCK_STREAM) {
+ 			/*
+ 			 * reuseport_array does not allow
+ 			 * non-listening tcp sk.
+ 			 */
+ 			err = bpf_map_update_elem(map_fd, &index0, value,
+ 						  BPF_ANY);
+ 			CHECK(err != -1 || errno != EINVAL,
+ 			      "reuseport array update non-listening sk",
+ 			      "sock_type:%d err:%d errno:%d\n",
+ 			      type, err, errno);
+ 			err = listen(fd64, 0);
+ 			CHECK(err == -1, "listen()",
+ 			      "sock_type:%d, err:%d errno:%d\n",
+ 			      type, err, errno);
+ 		}
+
+ 		fds64[i] = fd64;
+ 		sk_cookies[i] = sk_cookie;
+ 	}
+ }
+
+ /*
+  * Functional test for BPF_MAP_TYPE_REUSEPORT_SOCKARRAY.
+  *
+  * Covers, for both SOCK_STREAM and SOCK_DGRAM groups: out-of-range and
+  * empty-slot errors, the BPF_EXIST / BPF_NOEXIST / BPF_ANY update flags,
+  * rejection of a socket that is already in the array, delete and re-add,
+  * cookie lookup, and automatic removal when the socket is closed.
+  * Afterwards it checks that SOCK_RAW sockets are refused and that a map
+  * with 32-bit values accepts an fd but cannot return a cookie.
+  *
+  * Any failed CHECK() terminates the process.
+  */
+ static void test_reuseport_array(void)
+ {
+ /*
+  * After an update: keep using the same fd index if the update failed
+  * (@err non-zero), otherwise switch to the other socket of the pair.
+  */
+ #define REUSEPORT_FD_IDX(err, last) ({ (err) ? (last) : !(last); })
+
+ 	const __u32 array_size = 4, index0 = 0, index3 = 3;
+ 	int types[2] = { SOCK_STREAM, SOCK_DGRAM }, type;
+ 	__u64 grpa_cookies[2], sk_cookie, map_cookie;
+ 	__s64 grpa_fds64[2] = { -1, -1 }, fd64 = -1;
+ 	const __u32 bad_index = array_size;
+ 	int map_fd, err, t, f;
+ 	__u32 fds_idx = 0;
+ 	int fd;
+
+ 	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ 				sizeof(__u32), sizeof(__u64), array_size, 0);
+ 	CHECK(map_fd == -1, "reuseport array create",
+ 	      "map_fd:%d, errno:%d\n", map_fd, errno);
+
+ 	/* Test lookup/update/delete with invalid index */
+ 	err = bpf_map_delete_elem(map_fd, &bad_index);
+ 	CHECK(err != -1 || errno != E2BIG, "reuseport array del >=max_entries",
+ 	      "err:%d errno:%d\n", err, errno);
+
+ 	err = bpf_map_update_elem(map_fd, &bad_index, &fd64, BPF_ANY);
+ 	CHECK(err != -1 || errno != E2BIG,
+ 	      "reuseport array update >=max_entries",
+ 	      "err:%d errno:%d\n", err, errno);
+
+ 	err = bpf_map_lookup_elem(map_fd, &bad_index, &map_cookie);
+ 	CHECK(err != -1 || errno != ENOENT,
+ 	      "reuseport array lookup >=max_entries",
+ 	      "err:%d errno:%d\n", err, errno);
+
+ 	/* Test lookup/delete non existence elem */
+ 	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ 	CHECK(err != -1 || errno != ENOENT,
+ 	      "reuseport array lookup not-exist elem",
+ 	      "err:%d errno:%d\n", err, errno);
+ 	err = bpf_map_delete_elem(map_fd, &index3);
+ 	CHECK(err != -1 || errno != ENOENT,
+ 	      "reuseport array del not-exist elem",
+ 	      "err:%d errno:%d\n", err, errno);
+
+ 	for (t = 0; t < ARRAY_SIZE(types); t++) {
+ 		type = types[t];
+
+ 		prepare_reuseport_grp(type, map_fd, sizeof(__u64), grpa_fds64,
+ 				      grpa_cookies, ARRAY_SIZE(grpa_fds64));
+
+ 		/* Test BPF_* update flags */
+ 		/* BPF_EXIST failure case */
+ 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ 					  BPF_EXIST);
+ 		CHECK(err != -1 || errno != ENOENT,
+ 		      "reuseport array update empty elem BPF_EXIST",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+ 		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ 		/* BPF_NOEXIST success case */
+ 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ 					  BPF_NOEXIST);
+ 		CHECK(err == -1,
+ 		      "reuseport array update empty elem BPF_NOEXIST",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+ 		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ 		/* BPF_EXIST success case. */
+ 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ 					  BPF_EXIST);
+ 		CHECK(err == -1,
+ 		      "reuseport array update same elem BPF_EXIST",
+ 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
+ 		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ 		/* BPF_NOEXIST failure case */
+ 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ 					  BPF_NOEXIST);
+ 		CHECK(err != -1 || errno != EEXIST,
+ 		      "reuseport array update non-empty elem BPF_NOEXIST",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+ 		fds_idx = REUSEPORT_FD_IDX(err, fds_idx);
+
+ 		/* BPF_ANY case (always succeed) */
+ 		err = bpf_map_update_elem(map_fd, &index3, &grpa_fds64[fds_idx],
+ 					  BPF_ANY);
+ 		CHECK(err == -1,
+ 		      "reuseport array update same sk with BPF_ANY",
+ 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ 		fd64 = grpa_fds64[fds_idx];
+ 		sk_cookie = grpa_cookies[fds_idx];
+
+ 		/* The same sk cannot be added to reuseport_array twice */
+ 		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_ANY);
+ 		CHECK(err != -1 || errno != EBUSY,
+ 		      "reuseport array update same sk with same index",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+
+ 		err = bpf_map_update_elem(map_fd, &index0, &fd64, BPF_ANY);
+ 		CHECK(err != -1 || errno != EBUSY,
+ 		      "reuseport array update same sk with different index",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+
+ 		/* Test delete elem */
+ 		err = bpf_map_delete_elem(map_fd, &index3);
+ 		CHECK(err == -1, "reuseport array delete sk",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+
+ 		/* Add it back with BPF_NOEXIST */
+ 		err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
+ 		CHECK(err == -1,
+ 		      "reuseport array re-add with BPF_NOEXIST after del",
+ 		      "sock_type:%d err:%d errno:%d\n", type, err, errno);
+
+ 		/* Test cookie */
+ 		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ 		CHECK(err == -1 || sk_cookie != map_cookie,
+ 		      "reuseport array lookup re-added sk",
+ 		      "sock_type:%d err:%d errno:%d sk_cookie:0x%llx map_cookie:0x%llx\n",
+ 		      type, err, errno, sk_cookie, map_cookie);
+
+ 		/* Test elem removed by close() */
+ 		for (f = 0; f < ARRAY_SIZE(grpa_fds64); f++)
+ 			close(grpa_fds64[f]);
+ 		err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ 		CHECK(err != -1 || errno != ENOENT,
+ 		      "reuseport array lookup after close()",
+ 		      "sock_type:%d err:%d errno:%d\n",
+ 		      type, err, errno);
+ 	}
+
+ 	/* Test SOCK_RAW */
+ 	fd64 = socket(AF_INET6, SOCK_RAW, IPPROTO_UDP);
+ 	/* Report the failed fd; err still holds an unrelated earlier result. */
+ 	CHECK(fd64 == -1, "socket(SOCK_RAW)", "fd64:%lld errno:%d\n",
+ 	      fd64, errno);
+ 	err = bpf_map_update_elem(map_fd, &index3, &fd64, BPF_NOEXIST);
+ 	CHECK(err != -1 || errno != ENOTSUPP, "reuseport array update SOCK_RAW",
+ 	      "err:%d errno:%d\n", err, errno);
+ 	close(fd64);
+
+ 	/* Close the 64 bit value map */
+ 	close(map_fd);
+
+ 	/* Test 32 bit fd */
+ 	map_fd = bpf_create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ 				sizeof(__u32), sizeof(__u32), array_size, 0);
+ 	CHECK(map_fd == -1, "reuseport array create",
+ 	      "map_fd:%d, errno:%d\n", map_fd, errno);
+ 	prepare_reuseport_grp(SOCK_STREAM, map_fd, sizeof(__u32), &fd64,
+ 			      &sk_cookie, 1);
+ 	fd = fd64;
+ 	err = bpf_map_update_elem(map_fd, &index3, &fd, BPF_NOEXIST);
+ 	CHECK(err == -1, "reuseport array update 32 bit fd",
+ 	      "err:%d errno:%d\n", err, errno);
+ 	/* The lookup on the 32-bit value map is expected to fail (ENOSPC). */
+ 	err = bpf_map_lookup_elem(map_fd, &index3, &map_cookie);
+ 	CHECK(err != -1 || errno != ENOSPC,
+ 	      "reuseport array lookup 32 bit fd",
+ 	      "err:%d errno:%d\n", err, errno);
+ 	close(fd);
+ 	close(map_fd);
+ }
+
+ /*
+  * Run every map test in this file once, in a fixed order, using the
+  * current value of the global map_flags.
+  */
+ static void run_all_tests(void)
+ {
+ test_hashmap(0, NULL);
+ test_hashmap_percpu(0, NULL);
+ test_hashmap_walk(0, NULL);
+ test_hashmap_zero_seed();
+
+ test_arraymap(0, NULL);
+ test_arraymap_percpu(0, NULL);
+
+ test_arraymap_percpu_many_keys();
+
+ test_devmap(0, NULL);
+ test_devmap_hash(0, NULL);
+ test_sockmap(0, NULL);
+
+ test_map_large();
+ test_map_parallel();
+ test_map_stress();
+
+ test_map_rdonly();
+ test_map_wronly();
+
+ test_reuseport_array();
+
+ test_queuemap(0, NULL);
+ test_stackmap(0, NULL);
+
+ test_map_in_map();
+ }
+
+ /*
+  * First expansion of map_tests/tests.h: turn every DEFINE_TEST(name)
+  * entry into an extern declaration of test_<name>().
+  */
+ #define DEFINE_TEST(name) extern void test_##name(void);
+ #include <map_tests/tests.h>
+ #undef DEFINE_TEST
+
+ /*
+  * Entry point: run the built-in tests with map_flags == 0 and again with
+  * BPF_F_NO_PREALLOC, then call every test listed in map_tests/tests.h.
+  * Prints a summary line with the skip count and returns 0.
+  */
+ int main(void)
+ {
+ srand(time(NULL));
+
+ map_flags = 0;
+ run_all_tests();
+
+ map_flags = BPF_F_NO_PREALLOC;
+ run_all_tests();
+
+ /* Second expansion: each DEFINE_TEST(name) becomes a call. */
+ #define DEFINE_TEST(name) test_##name();
+ #include <map_tests/tests.h>
+ #undef DEFINE_TEST
+
+ printf("test_maps: OK, %d SKIPPED\n", skips);
+ return 0;
+ }
diff --git a/tools/testing/selftests/bpf/test_maps.h b/tools/testing/selftests/bpf/test_maps.h
new file mode 100644
index 000000000..77d8587ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_maps.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TEST_MAPS_H
+#define _TEST_MAPS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+ /*
+  * CHECK(condition, tag, format...) - abort the test program on failure.
+  *
+  * When @condition is non-zero, prints "<function>(<line>):FAIL:<tag> "
+  * followed by the printf-style @format message to stdout and calls
+  * exit(-1).  When @condition is zero nothing is printed and execution
+  * continues.
+  */
+ #define CHECK(condition, tag, format...) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("%s(%d):FAIL:%s ", __func__, __LINE__, tag); \
+ printf(format); \
+ exit(-1); \
+ } \
+ })
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c
new file mode 100644
index 000000000..a7b9a69f4
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_netcnt.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/sysinfo.h>
+#include <sys/time.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+#include "netcnt_common.h"
+
+#define BPF_PROG "./netcnt_prog.o"
+#define TEST_CGROUP "/test-network-counters/"
+
+ /*
+  * Look up the map called @name in @obj and return its file descriptor.
+  *
+  * @test: label printed as prefix of the failure message
+  * @obj:  loaded BPF object to search
+  * @name: name of the map
+  *
+  * Returns the result of bpf_map__fd() for the map, or -1 after printing
+  * a FAIL line when no map with that name exists.
+  */
+ static int bpf_find_map(const char *test, struct bpf_object *obj,
+ 			const char *name)
+ {
+ 	struct bpf_map *found = bpf_object__find_map_by_name(obj, name);
+
+ 	if (found)
+ 		return bpf_map__fd(found);
+
+ 	printf("%s:FAIL:map '%s' not found\n", test, name);
+ 	return -1;
+ }
+
+ /*
+  * Network counter test: attach the BPF_PROG egress program to
+  * TEST_CGROUP, flood-ping ::1 with 10000 packets and verify that the
+  * shared plus per-cpu cgroup storage counters add up to exactly 10000
+  * packets and 10000 * 104 bytes.
+  *
+  * Returns 0 on success and EXIT_FAILURE otherwise.
+  */
+ int main(int argc, char **argv)
+ {
+ 	struct percpu_net_cnt *percpu_netcnt;
+ 	struct bpf_cgroup_storage_key key;
+ 	int map_fd, percpu_map_fd;
+ 	int error = EXIT_FAILURE;
+ 	struct net_cnt netcnt;
+ 	struct bpf_object *obj;
+ 	int prog_fd, cgroup_fd;
+ 	unsigned long packets;
+ 	unsigned long bytes;
+ 	int cpu, nproc;
+ 	__u32 prog_cnt;
+
+ 	nproc = get_nprocs_conf();
+ 	percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc);
+ 	if (!percpu_netcnt) {
+ 		printf("Not enough memory for per-cpu area (%d cpus)\n", nproc);
+ 		/* Nothing has been set up yet, so there is nothing to undo. */
+ 		goto out;
+ 	}
+
+ 	if (bpf_prog_load(BPF_PROG, BPF_PROG_TYPE_CGROUP_SKB,
+ 			  &obj, &prog_fd)) {
+ 		printf("Failed to load bpf program\n");
+ 		/* No cgroup environment exists yet; only free the buffer. */
+ 		goto out_free;
+ 	}
+
+ 	cgroup_fd = cgroup_setup_and_join(TEST_CGROUP);
+ 	if (cgroup_fd < 0)
+ 		goto err;
+
+ 	/* Attach bpf program */
+ 	if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) {
+ 		printf("Failed to attach bpf program\n");
+ 		goto err;
+ 	}
+
+ 	/*
+ 	 * system() runs the command with /bin/sh, which need not be bash:
+ 	 * the bash-only "&>" redirection would be parsed as "run in the
+ 	 * background" followed by an empty redirect, making the probe
+ 	 * succeed even when ping6 is missing.  Use portable redirections.
+ 	 */
+ 	if (system("command -v ping6 >/dev/null 2>&1") == 0)
+ 		assert(!system("ping6 ::1 -c 10000 -f -q > /dev/null"));
+ 	else
+ 		assert(!system("ping -6 ::1 -c 10000 -f -q > /dev/null"));
+
+ 	if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL,
+ 			   &prog_cnt)) {
+ 		printf("Failed to query attached programs\n");
+ 		goto err;
+ 	}
+
+ 	map_fd = bpf_find_map(__func__, obj, "netcnt");
+ 	if (map_fd < 0) {
+ 		printf("Failed to find bpf map with net counters\n");
+ 		goto err;
+ 	}
+
+ 	percpu_map_fd = bpf_find_map(__func__, obj, "percpu_netcnt");
+ 	if (percpu_map_fd < 0) {
+ 		printf("Failed to find bpf map with percpu net counters\n");
+ 		goto err;
+ 	}
+
+ 	if (bpf_map_get_next_key(map_fd, NULL, &key)) {
+ 		printf("Failed to get key in cgroup storage\n");
+ 		goto err;
+ 	}
+
+ 	if (bpf_map_lookup_elem(map_fd, &key, &netcnt)) {
+ 		printf("Failed to lookup cgroup storage\n");
+ 		goto err;
+ 	}
+
+ 	if (bpf_map_lookup_elem(percpu_map_fd, &key, &percpu_netcnt[0])) {
+ 		printf("Failed to lookup percpu cgroup storage\n");
+ 		goto err;
+ 	}
+
+ 	/* Some packets can be still in per-cpu cache, but not more than
+ 	 * MAX_PERCPU_PACKETS.
+ 	 */
+ 	packets = netcnt.packets;
+ 	bytes = netcnt.bytes;
+ 	for (cpu = 0; cpu < nproc; cpu++) {
+ 		if (percpu_netcnt[cpu].packets > MAX_PERCPU_PACKETS) {
+ 			printf("Unexpected percpu value: %llu\n",
+ 			       percpu_netcnt[cpu].packets);
+ 			goto err;
+ 		}
+
+ 		packets += percpu_netcnt[cpu].packets;
+ 		bytes += percpu_netcnt[cpu].bytes;
+ 	}
+
+ 	/* No packets should be lost */
+ 	if (packets != 10000) {
+ 		printf("Unexpected packet count: %lu\n", packets);
+ 		goto err;
+ 	}
+
+ 	/* Let's check that bytes counter matches the number of packets
+ 	 * multiplied by the size of ipv6 ICMP packet.
+ 	 */
+ 	if (bytes != packets * 104) {
+ 		printf("Unexpected bytes count: %lu\n", bytes);
+ 		goto err;
+ 	}
+
+ 	error = 0;
+ 	printf("test_netcnt:PASS\n");
+
+ err:
+ 	cleanup_cgroup_environment();
+ out_free:
+ 	free(percpu_netcnt);
+
+ out:
+ 	return error;
+ }
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py
new file mode 100755
index 000000000..edaffd43d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_offload.py
@@ -0,0 +1,1406 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2017 Netronome Systems, Inc.
+# Copyright (c) 2019 Mellanox Technologies. All rights reserved
+#
+# This software is licensed under the GNU General License Version 2,
+# June 1991 as shown in the file COPYING in the top-level directory of this
+# source tree.
+#
+# THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+# WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+# BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+# OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+# THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+from datetime import datetime
+import argparse
+import errno
+import json
+import os
+import pprint
+import random
+import re
+import stat
+import string
+import struct
+import subprocess
+import time
+import traceback
+
+# Module-level state shared by the helpers below.
+# File object for the optional verbose log; log() does nothing while it is None.
+logfile = None
+# Current heading depth; log_get_sec() renders it as a run of "*" characters.
+log_level = 1
+# When True, check_extack() returns without checking anything.
+skip_extack = False
+# Directory containing this script; used to locate the sample BPF objects.
+bpf_test_dir = os.path.dirname(os.path.realpath(__file__))
+# Formatter used by log() for data that is not already a string.
+pp = pprint.PrettyPrinter()
+devs = [] # devices we created for clean up
+files = [] # files to be removed
+netns = [] # net namespaces to be removed
+
+def log_get_sec(level=0):
+ return "*" * (log_level + level)
+
+def log_level_inc(add=1):
+ global log_level
+ log_level += add
+
+def log_level_dec(sub=1):
+ global log_level
+ log_level -= sub
+
+def log_level_set(level):
+ global log_level
+ log_level = level
+
+def log(header, data, level=None):
+ """
+ Output to an optional log.
+ """
+ if logfile is None:
+ return
+ if level is not None:
+ log_level_set(level)
+
+ if not isinstance(data, str):
+ data = pp.pformat(data)
+
+ if len(header):
+ logfile.write("\n" + log_get_sec() + " ")
+ logfile.write(header)
+ if len(header) and len(data.strip()):
+ logfile.write("\n")
+ logfile.write(data)
+
+def skip(cond, msg):
+ if not cond:
+ return
+ print("SKIP: " + msg)
+ log("SKIP: " + msg, "", level=1)
+ os.sys.exit(0)
+
+def fail(cond, msg):
+ if not cond:
+ return
+ print("FAIL: " + msg)
+ tb = "".join(traceback.extract_stack().format())
+ print(tb)
+ log("FAIL: " + msg, tb, level=1)
+ os.sys.exit(1)
+
+def start_test(msg):
+ log(msg, "", level=1)
+ log_level_inc()
+ print(msg)
+
+def cmd(cmd, shell=True, include_stderr=False, background=False, fail=True):
+ """
+ Run a command in subprocess and return tuple of (retval, stdout);
+ optionally return stderr as well as third value.
+ """
+ proc = subprocess.Popen(cmd, shell=shell, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if background:
+ msg = "%s START: %s" % (log_get_sec(1),
+ datetime.now().strftime("%H:%M:%S.%f"))
+ log("BKG " + proc.args, msg)
+ return proc
+
+ return cmd_result(proc, include_stderr=include_stderr, fail=fail)
+
+# Wait for a process started through cmd() to finish and collect its results.
+# Returns (returncode, stdout), or (returncode, stdout, stderr) when
+# include_stderr is set.  Raises Exception on a non-zero exit status when
+# `fail` is true.
+def cmd_result(proc, include_stderr=False, fail=False):
+ stdout, stderr = proc.communicate()
+ stdout = stdout.decode("utf-8")
+ stderr = stderr.decode("utf-8")
+ proc.stdout.close()
+ proc.stderr.close()
+
+ # stderr gets a leading newline (so it starts on its own line in the log
+ # entry below) and loses one trailing newline.  The returned stderr keeps
+ # that leading newline.
+ stderr = "\n" + stderr
+ if stderr[-1] == "\n":
+ stderr = stderr[:-1]
+
+ sec = log_get_sec(1)
+ log("CMD " + proc.args,
+ "RETCODE: %d\n%s STDOUT:\n%s%s STDERR:%s\n%s END: %s" %
+ (proc.returncode, sec, stdout, sec, stderr,
+ sec, datetime.now().strftime("%H:%M:%S.%f")))
+
+ if proc.returncode != 0 and fail:
+ # Trim one more trailing newline before embedding stderr in the message.
+ if len(stderr) > 0 and stderr[-1] == "\n":
+ stderr = stderr[:-1]
+ raise Exception("Command failed: %s\n%s" % (proc.args, stderr))
+
+ if include_stderr:
+ return proc.returncode, stdout, stderr
+ else:
+ return proc.returncode, stdout
+
+def rm(f):
+ cmd("rm -f %s" % (f))
+ if f in files:
+ files.remove(f)
+
+def tool(name, args, flags, JSON=True, ns="", fail=True, include_stderr=False):
+ params = ""
+ if JSON:
+ params += "%s " % (flags["json"])
+
+ if ns != "":
+ ns = "ip netns exec %s " % (ns)
+
+ if include_stderr:
+ ret, stdout, stderr = cmd(ns + name + " " + params + args,
+ fail=fail, include_stderr=True)
+ else:
+ ret, stdout = cmd(ns + name + " " + params + args,
+ fail=fail, include_stderr=False)
+
+ if JSON and len(stdout.strip()) != 0:
+ out = json.loads(stdout)
+ else:
+ out = stdout
+
+ if include_stderr:
+ return ret, out, stderr
+ else:
+ return ret, out
+
+def bpftool(args, JSON=True, ns="", fail=True, include_stderr=False):
+ return tool("bpftool", args, {"json":"-p"}, JSON=JSON, ns=ns,
+ fail=fail, include_stderr=include_stderr)
+
+def bpftool_prog_list(expected=None, ns=""):
+ _, progs = bpftool("prog show", JSON=True, ns=ns, fail=True)
+ # Remove the base progs
+ for p in base_progs:
+ if p in progs:
+ progs.remove(p)
+ if expected is not None:
+ if len(progs) != expected:
+ fail(True, "%d BPF programs loaded, expected %d" %
+ (len(progs), expected))
+ return progs
+
+def bpftool_map_list(expected=None, ns=""):
+ _, maps = bpftool("map show", JSON=True, ns=ns, fail=True)
+ # Remove the base maps
+ maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names]
+ if expected is not None:
+ if len(maps) != expected:
+ fail(True, "%d BPF maps loaded, expected %d" %
+ (len(maps), expected))
+ return maps
+
+def bpftool_prog_list_wait(expected=0, n_retry=20):
+ for i in range(n_retry):
+ nprogs = len(bpftool_prog_list())
+ if nprogs == expected:
+ return
+ time.sleep(0.05)
+ raise Exception("Time out waiting for program counts to stabilize want %d, have %d" % (expected, nprogs))
+
+def bpftool_map_list_wait(expected=0, n_retry=20):
+ for i in range(n_retry):
+ nmaps = len(bpftool_map_list())
+ if nmaps == expected:
+ return
+ time.sleep(0.05)
+ raise Exception("Time out waiting for map counts to stabilize want %d, have %d" % (expected, nmaps))
+
+def bpftool_prog_load(sample, file_name, maps=[], prog_type="xdp", dev=None,
+ fail=True, include_stderr=False):
+ args = "prog load %s %s" % (os.path.join(bpf_test_dir, sample), file_name)
+ if prog_type is not None:
+ args += " type " + prog_type
+ if dev is not None:
+ args += " dev " + dev
+ if len(maps):
+ args += " map " + " map ".join(maps)
+
+ res = bpftool(args, fail=fail, include_stderr=include_stderr)
+ if res[0] == 0:
+ files.append(file_name)
+ return res
+
+def ip(args, force=False, JSON=True, ns="", fail=True, include_stderr=False):
+ if force:
+ args = "-force " + args
+ return tool("ip", args, {"json":"-j"}, JSON=JSON, ns=ns,
+ fail=fail, include_stderr=include_stderr)
+
+def tc(args, JSON=True, ns="", fail=True, include_stderr=False):
+ return tool("tc", args, {"json":"-p"}, JSON=JSON, ns=ns,
+ fail=fail, include_stderr=include_stderr)
+
+def ethtool(dev, opt, args, fail=True):
+ return cmd("ethtool %s %s %s" % (opt, dev["ifname"], args), fail=fail)
+
+def bpf_obj(name, sec=".text", path=bpf_test_dir,):
+ return "obj %s sec %s" % (os.path.join(path, name), sec)
+
+def bpf_pinned(name):
+ return "pinned %s" % (name)
+
+def bpf_bytecode(bytecode):
+ return "bytecode \"%s\"" % (bytecode)
+
+def mknetns(n_retry=10):
+ for i in range(n_retry):
+ name = ''.join([random.choice(string.ascii_letters) for i in range(8)])
+ ret, _ = ip("netns add %s" % (name), fail=False)
+ if ret == 0:
+ netns.append(name)
+ return name
+ return None
+
+def int2str(fmt, val):
+ ret = []
+ for b in struct.pack(fmt, val):
+ ret.append(int(b))
+ return " ".join(map(lambda x: str(x), ret))
+
+def str2int(strtab):
+ inttab = []
+ for i in strtab:
+ inttab.append(int(i, 16))
+ ba = bytearray(inttab)
+ if len(strtab) == 4:
+ fmt = "I"
+ elif len(strtab) == 8:
+ fmt = "Q"
+ else:
+ raise Exception("String array of len %d can't be unpacked to an int" %
+ (len(strtab)))
+ return struct.unpack(fmt, ba)[0]
+
+class DebugfsDir:
+ """
+ Class for accessing DebugFS directories as a dictionary.
+ """
+
+ # Snapshot the directory at `path` into self._dict at construction time.
+ def __init__(self, path):
+ self.path = path
+ self._dict = self._debugfs_dir_read(path)
+
+ def __len__(self):
+ return len(self._dict.keys())
+
+ # An int key selects the N-th entry in the dict's key order; any other
+ # key is looked up by name.
+ def __getitem__(self, key):
+ if type(key) is int:
+ key = list(self._dict.keys())[key]
+ return self._dict[key]
+
+ # Write `value` to the file through the shell, then read the file back
+ # and cache what it now reports.
+ def __setitem__(self, key, value):
+ log("DebugFS set %s = %s" % (key, value), "")
+ log_level_inc()
+
+ cmd("echo '%s' > %s/%s" % (value, self.path, key))
+ log_level_dec()
+
+ _, out = cmd('cat %s/%s' % (self.path, key))
+ self._dict[key] = out.strip()
+
+ # Build the dictionary for `path`: regular files map to their stripped
+ # contents, sub-directories map to nested DebugfsDir objects.
+ def _debugfs_dir_read(self, path):
+ dfs = {}
+
+ log("DebugFS state for %s" % (path), "")
+ # Raised by 2 here and lowered in two steps at the end, so the final
+ # "DebugFS state" entry is logged one level above the commands.
+ log_level_inc(add=2)
+
+ _, out = cmd('ls ' + path)
+ for f in out.split():
+ # The "ports" entry is never descended into.
+ if f == "ports":
+ continue
+
+ p = os.path.join(path, f)
+ # Skip entries without the owner-read permission bit.
+ if not os.stat(p).st_mode & stat.S_IRUSR:
+ continue
+
+ if os.path.isfile(p):
+ # We need to init trap_flow_action_cookie before read it
+ if f == "trap_flow_action_cookie":
+ cmd('echo deadbeef > %s/%s' % (path, f))
+ _, out = cmd('cat %s/%s' % (path, f))
+ dfs[f] = out.strip()
+ elif os.path.isdir(p):
+ dfs[f] = DebugfsDir(p)
+ else:
+ raise Exception("%s is neither file nor directory" % (p))
+
+ log_level_dec()
+ log("DebugFS state", dfs)
+ log_level_dec()
+
+ return dfs
+
+class NetdevSimDev:
+ """
+ Class for netdevsim bus device and its attributes.
+ """
+ @staticmethod
+ def ctrl_write(path, val):
+ fullpath = os.path.join("/sys/bus/netdevsim/", path)
+ try:
+ with open(fullpath, "w") as f:
+ f.write(val)
+ except OSError as e:
+ log("WRITE %s: %r" % (fullpath, val), -e.errno)
+ raise e
+ log("WRITE %s: %r" % (fullpath, val), 0)
+
+ def __init__(self, port_count=1):
+ addr = 0
+ while True:
+ try:
+ self.ctrl_write("new_device", "%u %u" % (addr, port_count))
+ except OSError as e:
+ if e.errno == errno.ENOSPC:
+ addr += 1
+ continue
+ raise e
+ break
+ self.addr = addr
+
+ # As probe of netdevsim device might happen from a workqueue,
+ # so wait here until all netdevs appear.
+ self.wait_for_netdevs(port_count)
+
+ ret, out = cmd("udevadm settle", fail=False)
+ if ret:
+ raise Exception("udevadm settle failed")
+ ifnames = self.get_ifnames()
+
+ devs.append(self)
+ self.dfs_dir = "/sys/kernel/debug/netdevsim/netdevsim%u/" % addr
+
+ self.nsims = []
+ for port_index in range(port_count):
+ self.nsims.append(NetdevSim(self, port_index, ifnames[port_index]))
+
+ def get_ifnames(self):
+ ifnames = []
+ listdir = os.listdir("/sys/bus/netdevsim/devices/netdevsim%u/net/" % self.addr)
+ for ifname in listdir:
+ ifnames.append(ifname)
+ ifnames.sort()
+ return ifnames
+
+ def wait_for_netdevs(self, port_count):
+ timeout = 5
+ timeout_start = time.time()
+
+ while True:
+ try:
+ ifnames = self.get_ifnames()
+ except FileNotFoundError as e:
+ ifnames = []
+ if len(ifnames) == port_count:
+ break
+ if time.time() < timeout_start + timeout:
+ continue
+ raise Exception("netdevices did not appear within timeout")
+
+ def dfs_num_bound_progs(self):
+ path = os.path.join(self.dfs_dir, "bpf_bound_progs")
+ _, progs = cmd('ls %s' % (path))
+ return len(progs.split())
+
+ def dfs_get_bound_progs(self, expected):
+ progs = DebugfsDir(os.path.join(self.dfs_dir, "bpf_bound_progs"))
+ if expected is not None:
+ if len(progs) != expected:
+ fail(True, "%d BPF programs bound, expected %d" %
+ (len(progs), expected))
+ return progs
+
+ def remove(self):
+ self.ctrl_write("del_device", "%u" % (self.addr, ))
+ devs.remove(self)
+
+ def remove_nsim(self, nsim):
+ self.nsims.remove(nsim)
+ self.ctrl_write("devices/netdevsim%u/del_port" % (self.addr, ),
+ "%u" % (nsim.port_index, ))
+
+class NetdevSim:
+ """
+ Class for netdevsim netdevice and its attributes.
+ """
+
+ def __init__(self, nsimdev, port_index, ifname):
+ # In case udev renamed the netdev to according to new schema,
+ # check if the name matches the port_index.
+ nsimnamere = re.compile("eni\d+np(\d+)")
+ match = nsimnamere.match(ifname)
+ if match and int(match.groups()[0]) != port_index + 1:
+ raise Exception("netdevice name mismatches the expected one")
+
+ self.nsimdev = nsimdev
+ self.port_index = port_index
+ self.ns = ""
+ self.dfs_dir = "%s/ports/%u/" % (nsimdev.dfs_dir, port_index)
+ self.dfs_refresh()
+ _, [self.dev] = ip("link show dev %s" % ifname)
+
+ def __getitem__(self, key):
+ return self.dev[key]
+
+ def remove(self):
+ self.nsimdev.remove_nsim(self)
+
+ def dfs_refresh(self):
+ self.dfs = DebugfsDir(self.dfs_dir)
+ return self.dfs
+
+ def dfs_read(self, f):
+ path = os.path.join(self.dfs_dir, f)
+ _, data = cmd('cat %s' % (path))
+ return data.strip()
+
+ def wait_for_flush(self, bound=0, total=0, n_retry=20):
+ for i in range(n_retry):
+ nbound = self.nsimdev.dfs_num_bound_progs()
+ nprogs = len(bpftool_prog_list())
+ if nbound == bound and nprogs == total:
+ return
+ time.sleep(0.05)
+ raise Exception("Time out waiting for program counts to stabilize want %d/%d, have %d bound, %d loaded" % (bound, total, nbound, nprogs))
+
+ def set_ns(self, ns):
+ name = "1" if ns == "" else ns
+ ip("link set dev %s netns %s" % (self.dev["ifname"], name), ns=self.ns)
+ self.ns = ns
+
+ def set_mtu(self, mtu, fail=True):
+ return ip("link set dev %s mtu %d" % (self.dev["ifname"], mtu),
+ fail=fail)
+
+ def set_xdp(self, bpf, mode, force=False, JSON=True, verbose=False,
+ fail=True, include_stderr=False):
+ if verbose:
+ bpf += " verbose"
+ return ip("link set dev %s xdp%s %s" % (self.dev["ifname"], mode, bpf),
+ force=force, JSON=JSON,
+ fail=fail, include_stderr=include_stderr)
+
+ def unset_xdp(self, mode, force=False, JSON=True,
+ fail=True, include_stderr=False):
+ return ip("link set dev %s xdp%s off" % (self.dev["ifname"], mode),
+ force=force, JSON=JSON,
+ fail=fail, include_stderr=include_stderr)
+
+ def ip_link_show(self, xdp):
+ _, link = ip("link show dev %s" % (self['ifname']))
+ if len(link) > 1:
+ raise Exception("Multiple objects on ip link show")
+ if len(link) < 1:
+ return {}
+ fail(xdp != "xdp" in link,
+ "XDP program not reporting in iplink (reported %s, expected %s)" %
+ ("xdp" in link, xdp))
+ return link[0]
+
+ def tc_add_ingress(self):
+ tc("qdisc add dev %s ingress" % (self['ifname']))
+
+ def tc_del_ingress(self):
+ tc("qdisc del dev %s ingress" % (self['ifname']))
+
+ def tc_flush_filters(self, bound=0, total=0):
+ self.tc_del_ingress()
+ self.tc_add_ingress()
+ self.wait_for_flush(bound=bound, total=total)
+
+ def tc_show_ingress(self, expected=None):
+ # No JSON support, oh well...
+ flags = ["skip_sw", "skip_hw", "in_hw"]
+ named = ["protocol", "pref", "chain", "handle", "id", "tag"]
+
+ args = "-s filter show dev %s ingress" % (self['ifname'])
+ _, out = tc(args, JSON=False)
+
+ filters = []
+ lines = out.split('\n')
+ for line in lines:
+ words = line.split()
+ if "handle" not in words:
+ continue
+ fltr = {}
+ for flag in flags:
+ fltr[flag] = flag in words
+ for name in named:
+ try:
+ idx = words.index(name)
+ fltr[name] = words[idx + 1]
+ except ValueError:
+ pass
+ filters.append(fltr)
+
+ if expected is not None:
+ fail(len(filters) != expected,
+ "%d ingress filters loaded, expected %d" %
+ (len(filters), expected))
+ return filters
+
+ def cls_filter_op(self, op, qdisc="ingress", prio=None, handle=None,
+ chain=None, cls="", params="",
+ fail=True, include_stderr=False):
+ spec = ""
+ if prio is not None:
+ spec += " prio %d" % (prio)
+ if handle:
+ spec += " handle %s" % (handle)
+ if chain is not None:
+ spec += " chain %d" % (chain)
+
+ return tc("filter {op} dev {dev} {qdisc} {spec} {cls} {params}"\
+ .format(op=op, dev=self['ifname'], qdisc=qdisc, spec=spec,
+ cls=cls, params=params),
+ fail=fail, include_stderr=include_stderr)
+
+ def cls_bpf_add_filter(self, bpf, op="add", prio=None, handle=None,
+ chain=None, da=False, verbose=False,
+ skip_sw=False, skip_hw=False,
+ fail=True, include_stderr=False):
+ cls = "bpf " + bpf
+
+ params = ""
+ if da:
+ params += " da"
+ if verbose:
+ params += " verbose"
+ if skip_sw:
+ params += " skip_sw"
+ if skip_hw:
+ params += " skip_hw"
+
+ return self.cls_filter_op(op=op, prio=prio, handle=handle, cls=cls,
+ chain=chain, params=params,
+ fail=fail, include_stderr=include_stderr)
+
+ def set_ethtool_tc_offloads(self, enable, fail=True):
+ args = "hw-tc-offload %s" % ("on" if enable else "off")
+ return ethtool(self, "-K", args, fail=fail)
+
+################################################################################
+def clean_up():
+ global files, netns, devs
+
+ for dev in devs:
+ dev.remove()
+ for f in files:
+ cmd("rm -f %s" % (f))
+ for ns in netns:
+ cmd("ip netns delete %s" % (ns))
+ files = []
+ netns = []
+
+def pin_prog(file_name, idx=0):
+ progs = bpftool_prog_list(expected=(idx + 1))
+ prog = progs[idx]
+ bpftool("prog pin id %d %s" % (prog["id"], file_name))
+ files.append(file_name)
+
+ return file_name, bpf_pinned(file_name)
+
+def pin_map(file_name, idx=0, expected=1):
+ maps = bpftool_map_list(expected=expected)
+ m = maps[idx]
+ bpftool("map pin id %d %s" % (m["id"], file_name))
+ files.append(file_name)
+
+ return file_name, bpf_pinned(file_name)
+
+def check_dev_info_removed(prog_file=None, map_file=None):
+ bpftool_prog_list(expected=0)
+ ret, err = bpftool("prog show pin %s" % (prog_file), fail=False)
+ fail(ret == 0, "Showing prog with removed device did not fail")
+ fail(err["error"].find("No such device") == -1,
+ "Showing prog with removed device expected ENODEV, error is %s" %
+ (err["error"]))
+
+ bpftool_map_list(expected=0)
+ ret, err = bpftool("map show pin %s" % (map_file), fail=False)
+ fail(ret == 0, "Showing map with removed device did not fail")
+ fail(err["error"].find("No such device") == -1,
+ "Showing map with removed device expected ENODEV, error is %s" %
+ (err["error"]))
+
+# Check the device information bpftool reports for the single loaded program
+# and the two loaded maps, as seen from net namespace `ns`.
+# `other_ns` true means the listing is made from a namespace other than the
+# device's, in which case no ifname may be reported.
+# NOTE(review): prog_file, map_file and removed are not used in this body,
+# and `sim` below is a module-level name, not a parameter.
+def check_dev_info(other_ns, ns, prog_file=None, map_file=None, removed=False):
+ progs = bpftool_prog_list(expected=1, ns=ns)
+ prog = progs[0]
+
+ fail("dev" not in prog.keys(), "Device parameters not reported")
+ dev = prog["dev"]
+ fail("ifindex" not in dev.keys(), "Device parameters not reported")
+ fail("ns_dev" not in dev.keys(), "Device parameters not reported")
+ fail("ns_inode" not in dev.keys(), "Device parameters not reported")
+
+ if not other_ns:
+ fail("ifname" not in dev.keys(), "Ifname not reported")
+ fail(dev["ifname"] != sim["ifname"],
+ "Ifname incorrect %s vs %s" % (dev["ifname"], sim["ifname"]))
+ else:
+ fail("ifname" in dev.keys(), "Ifname is reported for other ns")
+
+ # Every map must report the same device as the program.
+ maps = bpftool_map_list(expected=2, ns=ns)
+ for m in maps:
+ fail("dev" not in m.keys(), "Device parameters not reported")
+ fail(dev != m["dev"], "Map's device different than program's")
+
+def check_extack(output, reference, args):
+ if skip_extack:
+ return
+ lines = output.split("\n")
+ comp = len(lines) >= 2 and lines[1] == 'Error: ' + reference
+ fail(not comp, "Missing or incorrect netlink extack message")
+
+def check_extack_nsim(output, reference, args):
+ check_extack(output, "netdevsim: " + reference, args)
+
+def check_no_extack(res, needle):
+ fail((res[1] + res[2]).count(needle) or (res[1] + res[2]).count("Warning:"),
+ "Found '%s' in command output, leaky extack?" % (needle))
+
+def check_verifier_log(output, reference):
+ lines = output.split("\n")
+ for l in reversed(lines):
+ if l == reference:
+ return
+ fail(True, "Missing or incorrect message from netdevsim in verifier log")
+
+def check_multi_basic(two_xdps):
+ fail(two_xdps["mode"] != 4, "Bad mode reported with multiple programs")
+ fail("prog" in two_xdps, "Base program reported in multi program mode")
+ fail(len(two_xdps["attached"]) != 2,
+ "Wrong attached program count with two programs")
+ fail(two_xdps["attached"][0]["prog"]["id"] ==
+ two_xdps["attached"][1]["prog"]["id"],
+ "Offloaded and other programs have the same id")
+
+def test_spurios_extack(sim, obj, skip_hw, needle):
+ res = sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=skip_hw,
+ include_stderr=True)
+ check_no_extack(res, needle)
+ res = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1,
+ skip_hw=skip_hw, include_stderr=True)
+ check_no_extack(res, needle)
+ res = sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf",
+ include_stderr=True)
+ check_no_extack(res, needle)
+
+# Exercise attaching an offloaded XDP program together with a second program
+# in mode `modename` (reported by ip as mode number `modeid`).
+# The device is removed at the end and replaced by a fresh one; the new
+# [simdev, sim] pair is returned for the caller to keep using.
+# NOTE(review): `args` passed to check_extack() below is not a parameter of
+# this function; it resolves to a module-level name.
+def test_multi_prog(simdev, sim, obj, modename, modeid):
+ start_test("Test multi-attachment XDP - %s + offload..." %
+ (modename or "default", ))
+ # Start with the offloaded program alone.
+ sim.set_xdp(obj, "offload")
+ xdp = sim.ip_link_show(xdp=True)["xdp"]
+ offloaded = sim.dfs_read("bpf_offloaded_id")
+ fail("prog" not in xdp, "Base program not reported in single program mode")
+ fail(len(xdp["attached"]) != 1,
+ "Wrong attached program count with one program")
+
+ # Add the second program in the requested mode.
+ sim.set_xdp(obj, modename)
+ two_xdps = sim.ip_link_show(xdp=True)["xdp"]
+
+ fail(xdp["attached"][0] not in two_xdps["attached"],
+ "Offload program not reported after other activated")
+ check_multi_basic(two_xdps)
+
+ offloaded2 = sim.dfs_read("bpf_offloaded_id")
+ fail(offloaded != offloaded2,
+ "Offload ID changed after loading other program")
+
+ start_test("Test multi-attachment XDP - replace...")
+ ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True)
+ fail(ret == 0, "Replaced one of programs without -force")
+ check_extack(err, "XDP program already attached.", args)
+
+ start_test("Test multi-attachment XDP - remove without mode...")
+ ret, _, err = sim.unset_xdp("", force=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Removed program without a mode flag")
+ check_extack(err, "More than one program loaded, unset mode is ambiguous.", args)
+
+ # Drop the offloaded program; only the other one must remain.
+ sim.unset_xdp("offload")
+ xdp = sim.ip_link_show(xdp=True)["xdp"]
+ offloaded = sim.dfs_read("bpf_offloaded_id")
+
+ fail(xdp["mode"] != modeid, "Bad mode reported after multiple programs")
+ fail("prog" not in xdp,
+ "Base program not reported after multi program mode")
+ fail(xdp["attached"][0] not in two_xdps["attached"],
+ "Offload program not reported after other activated")
+ fail(len(xdp["attached"]) != 1,
+ "Wrong attached program count with remaining programs")
+ fail(offloaded != "0", "Offload ID reported with only other program left")
+
+ start_test("Test multi-attachment XDP - reattach...")
+ sim.set_xdp(obj, "offload")
+ two_xdps = sim.ip_link_show(xdp=True)["xdp"]
+
+ fail(xdp["attached"][0] not in two_xdps["attached"],
+ "Other program not reported after offload activated")
+ check_multi_basic(two_xdps)
+
+ start_test("Test multi-attachment XDP - device remove...")
+ simdev.remove()
+
+ # Hand a fresh device with TC offloads enabled back to the caller.
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.set_ethtool_tc_offloads(True)
+ return [simdev, sim]
+
+# Parse command line
+parser = argparse.ArgumentParser()
+parser.add_argument("--log", help="output verbose log to given file")
+args = parser.parse_args()
+if args.log:
+ logfile = open(args.log, 'w+')
+ logfile.write("# -*-Org-*-")
+
+log("Prepare...", "", level=1)
+log_level_inc()
+
+# Check permissions
+skip(os.getuid() != 0, "test must be run as root")
+
+# Check tools
+ret, progs = bpftool("prog", fail=False)
+skip(ret != 0, "bpftool not installed")
+# Programs and maps already present before the test starts; the list helpers
+# filter these out of their results.
+base_progs = progs
+_, base_maps = bpftool("map")
+base_map_names = [
+ 'pid_iter.rodata' # created on each bpftool invocation
+]
+
+# Check netdevsim
+ret, out = cmd("modprobe netdevsim", fail=False)
+skip(ret != 0, "netdevsim module could not be loaded")
+
+# Check debugfs
+_, out = cmd("mount")
+if out.find("/sys/kernel/debug type debugfs") == -1:
+ cmd("mount -t debugfs none /sys/kernel/debug")
+
+# Check samples are compiled
+samples = ["sample_ret0.o", "sample_map_ret0.o"]
+for s in samples:
+ ret, out = cmd("ls %s/%s" % (bpf_test_dir, s), fail=False)
+ skip(ret != 0, "sample %s/%s not found, please compile it" %
+ (bpf_test_dir, s))
+
+# Check if iproute2 is built with libmnl (needed by extack support)
+# The command is expected to fail; only its error text is inspected.
+_, _, err = cmd("tc qdisc delete dev lo handle 0",
+ fail=False, include_stderr=True)
+if err.find("Error: Failed to find qdisc with specified handle.") == -1:
+ print("Warning: no extack message in iproute2 output, libmnl missing?")
+ log("Warning: no extack message in iproute2 output, libmnl missing?", "")
+ skip_extack = True
+
+# Check if net namespaces seem to work
+ns = mknetns()
+skip(ns is None, "Could not create a net namespace")
+cmd("ip netns delete %s" % (ns))
+# The probe namespace is already deleted; forget it so clean up does not retry.
+netns = []
+
+try:
+ obj = bpf_obj("sample_ret0.o")
+ bytecode = bpf_bytecode("1,6 0 0 4294967295,")
+
+ start_test("Test destruction of generic XDP...")
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.set_xdp(obj, "generic")
+ simdev.remove()
+ bpftool_prog_list_wait(expected=0)
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.tc_add_ingress()
+
+ start_test("Test TC non-offloaded...")
+ ret, _ = sim.cls_bpf_add_filter(obj, skip_hw=True, fail=False)
+ fail(ret != 0, "Software TC filter did not load")
+
+ start_test("Test TC non-offloaded isn't getting bound...")
+ ret, _ = sim.cls_bpf_add_filter(obj, fail=False)
+ fail(ret != 0, "Software TC filter did not load")
+ simdev.dfs_get_bound_progs(expected=0)
+
+ sim.tc_flush_filters()
+
+ start_test("Test TC offloads are off by default...")
+ ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "TC filter loaded without enabling TC offloads")
+ check_extack(err, "TC offload is disabled on net device.", args)
+ sim.wait_for_flush()
+
+ sim.set_ethtool_tc_offloads(True)
+ sim.dfs["bpf_tc_non_bound_accept"] = "Y"
+
+ start_test("Test TC offload by default...")
+ ret, _ = sim.cls_bpf_add_filter(obj, fail=False)
+ fail(ret != 0, "Software TC filter did not load")
+ simdev.dfs_get_bound_progs(expected=0)
+ ingress = sim.tc_show_ingress(expected=1)
+ fltr = ingress[0]
+ fail(not fltr["in_hw"], "Filter not offloaded by default")
+
+ sim.tc_flush_filters()
+
+ start_test("Test TC cBPF bytcode tries offload by default...")
+ ret, _ = sim.cls_bpf_add_filter(bytecode, fail=False)
+ fail(ret != 0, "Software TC filter did not load")
+ simdev.dfs_get_bound_progs(expected=0)
+ ingress = sim.tc_show_ingress(expected=1)
+ fltr = ingress[0]
+ fail(not fltr["in_hw"], "Bytecode not offloaded by default")
+
+ sim.tc_flush_filters()
+ sim.dfs["bpf_tc_non_bound_accept"] = "N"
+
+ start_test("Test TC cBPF unbound bytecode doesn't offload...")
+ ret, _, err = sim.cls_bpf_add_filter(bytecode, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "TC bytecode loaded for offload")
+ check_extack_nsim(err, "netdevsim configured to reject unbound programs.",
+ args)
+ sim.wait_for_flush()
+
+ start_test("Test non-0 chain offload...")
+ ret, _, err = sim.cls_bpf_add_filter(obj, chain=1, prio=1, handle=1,
+ skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Offloaded a filter to chain other than 0")
+ check_extack(err, "Driver supports only offload of chain 0.", args)
+ sim.tc_flush_filters()
+
+ start_test("Test TC replace...")
+ sim.cls_bpf_add_filter(obj, prio=1, handle=1)
+ sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1)
+ sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+ sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_sw=True)
+ sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_sw=True)
+ sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+ sim.cls_bpf_add_filter(obj, prio=1, handle=1, skip_hw=True)
+ sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1, skip_hw=True)
+ sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+ start_test("Test TC replace bad flags...")
+ for i in range(3):
+ for j in range(3):
+ ret, _ = sim.cls_bpf_add_filter(obj, op="replace", prio=1, handle=1,
+ skip_sw=(j == 1), skip_hw=(j == 2),
+ fail=False)
+ fail(bool(ret) != bool(j),
+ "Software TC incorrect load in replace test, iteration %d" %
+ (j))
+ sim.cls_filter_op(op="delete", prio=1, handle=1, cls="bpf")
+
+ start_test("Test spurious extack from the driver...")
+ test_spurios_extack(sim, obj, False, "netdevsim")
+ test_spurios_extack(sim, obj, True, "netdevsim")
+
+ sim.set_ethtool_tc_offloads(False)
+
+ test_spurios_extack(sim, obj, False, "TC offload is disabled")
+ test_spurios_extack(sim, obj, True, "TC offload is disabled")
+
+ sim.set_ethtool_tc_offloads(True)
+
+ sim.tc_flush_filters()
+
+ start_test("Test TC offloads failure...")
+ sim.dfs["dev/bpf_bind_verifier_accept"] = 0
+ ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "TC filter did not reject with TC offloads enabled")
+ check_verifier_log(err, "[netdevsim] Hello from netdevsim!")
+ sim.dfs["dev/bpf_bind_verifier_accept"] = 1
+
+ start_test("Test TC offloads work...")
+ ret, _, err = sim.cls_bpf_add_filter(obj, verbose=True, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret != 0, "TC filter did not load with TC offloads enabled")
+
+ start_test("Test TC offload basics...")
+ dfs = simdev.dfs_get_bound_progs(expected=1)
+ progs = bpftool_prog_list(expected=1)
+ ingress = sim.tc_show_ingress(expected=1)
+
+ dprog = dfs[0]
+ prog = progs[0]
+ fltr = ingress[0]
+ fail(fltr["skip_hw"], "TC does reports 'skip_hw' on offloaded filter")
+ fail(not fltr["in_hw"], "TC does not report 'in_hw' for offloaded filter")
+ fail(not fltr["skip_sw"], "TC does not report 'skip_sw' back")
+
+ start_test("Test TC offload is device-bound...")
+ fail(str(prog["id"]) != fltr["id"], "Program IDs don't match")
+ fail(prog["tag"] != fltr["tag"], "Program tags don't match")
+ fail(fltr["id"] != dprog["id"], "Program IDs don't match")
+ fail(dprog["state"] != "xlated", "Offloaded program state not translated")
+ fail(dprog["loaded"] != "Y", "Offloaded program is not loaded")
+
+ start_test("Test disabling TC offloads is rejected while filters installed...")
+ ret, _ = sim.set_ethtool_tc_offloads(False, fail=False)
+ fail(ret == 0, "Driver should refuse to disable TC offloads with filters installed...")
+ sim.set_ethtool_tc_offloads(True)
+
+ start_test("Test qdisc removal frees things...")
+ sim.tc_flush_filters()
+ sim.tc_show_ingress(expected=0)
+
+ start_test("Test disabling TC offloads is OK without filters...")
+ ret, _ = sim.set_ethtool_tc_offloads(False, fail=False)
+ fail(ret != 0,
+ "Driver refused to disable TC offloads without filters installed...")
+
+ sim.set_ethtool_tc_offloads(True)
+
+ start_test("Test destroying device gets rid of TC filters...")
+ sim.cls_bpf_add_filter(obj, skip_sw=True)
+ simdev.remove()
+ bpftool_prog_list_wait(expected=0)
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.set_ethtool_tc_offloads(True)
+
+ start_test("Test destroying device gets rid of XDP...")
+ sim.set_xdp(obj, "offload")
+ simdev.remove()
+ bpftool_prog_list_wait(expected=0)
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.set_ethtool_tc_offloads(True)
+
+ start_test("Test XDP prog reporting...")
+ sim.set_xdp(obj, "drv")
+ ipl = sim.ip_link_show(xdp=True)
+ progs = bpftool_prog_list(expected=1)
+ fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
+ "Loaded program has wrong ID")
+
+ start_test("Test XDP prog replace without force...")
+ ret, _ = sim.set_xdp(obj, "drv", fail=False)
+ fail(ret == 0, "Replaced XDP program without -force")
+ sim.wait_for_flush(total=1)
+
+ start_test("Test XDP prog replace with force...")
+ ret, _ = sim.set_xdp(obj, "drv", force=True, fail=False)
+ fail(ret != 0, "Could not replace XDP program with -force")
+ bpftool_prog_list_wait(expected=1)
+ ipl = sim.ip_link_show(xdp=True)
+ progs = bpftool_prog_list(expected=1)
+ fail(ipl["xdp"]["prog"]["id"] != progs[0]["id"],
+ "Loaded program has wrong ID")
+ fail("dev" in progs[0].keys(),
+ "Device parameters reported for non-offloaded program")
+
+ start_test("Test XDP prog replace with bad flags...")
+ ret, _, err = sim.set_xdp(obj, "generic", force=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Replaced XDP program with a program in different mode")
+ check_extack(err,
+ "Native and generic XDP can't be active at the same time.",
+ args)
+
+ start_test("Test MTU restrictions...")
+ ret, _ = sim.set_mtu(9000, fail=False)
+ fail(ret == 0,
+ "Driver should refuse to increase MTU to 9000 with XDP loaded...")
+ sim.unset_xdp("drv")
+ bpftool_prog_list_wait(expected=0)
+ sim.set_mtu(9000)
+ ret, _, err = sim.set_xdp(obj, "drv", fail=False, include_stderr=True)
+ fail(ret == 0, "Driver should refuse to load program with MTU of 9000...")
+ check_extack_nsim(err, "MTU too large w/ XDP enabled.", args)
+ sim.set_mtu(1500)
+
+ sim.wait_for_flush()
+ start_test("Test non-offload XDP attaching to HW...")
+ bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/nooffload")
+ nooffload = bpf_pinned("/sys/fs/bpf/nooffload")
+ ret, _, err = sim.set_xdp(nooffload, "offload",
+ fail=False, include_stderr=True)
+ fail(ret == 0, "attached non-offloaded XDP program to HW")
+ check_extack_nsim(err, "xdpoffload of non-bound program.", args)
+ rm("/sys/fs/bpf/nooffload")
+
+ start_test("Test offload XDP attaching to drv...")
+ bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload",
+ dev=sim['ifname'])
+ offload = bpf_pinned("/sys/fs/bpf/offload")
+ ret, _, err = sim.set_xdp(offload, "drv", fail=False, include_stderr=True)
+ fail(ret == 0, "attached offloaded XDP program to drv")
+ check_extack(err, "Using device-bound program without HW_MODE flag is not supported.", args)
+ rm("/sys/fs/bpf/offload")
+ sim.wait_for_flush()
+
+ start_test("Test XDP load failure...")
+ sim.dfs["dev/bpf_bind_verifier_accept"] = 0
+ ret, _, err = bpftool_prog_load("sample_ret0.o", "/sys/fs/bpf/offload",
+ dev=sim['ifname'], fail=False, include_stderr=True)
+ fail(ret == 0, "verifier should fail on load")
+ check_verifier_log(err, "[netdevsim] Hello from netdevsim!")
+ sim.dfs["dev/bpf_bind_verifier_accept"] = 1
+ sim.wait_for_flush()
+
+ start_test("Test XDP offload...")
+ _, _, err = sim.set_xdp(obj, "offload", verbose=True, include_stderr=True)
+ ipl = sim.ip_link_show(xdp=True)
+ link_xdp = ipl["xdp"]["prog"]
+ progs = bpftool_prog_list(expected=1)
+ prog = progs[0]
+ fail(link_xdp["id"] != prog["id"], "Loaded program has wrong ID")
+
+ start_test("Test XDP offload is device bound...")
+ dfs = simdev.dfs_get_bound_progs(expected=1)
+ dprog = dfs[0]
+
+ fail(prog["id"] != link_xdp["id"], "Program IDs don't match")
+ fail(prog["tag"] != link_xdp["tag"], "Program tags don't match")
+ fail(str(link_xdp["id"]) != dprog["id"], "Program IDs don't match")
+ fail(dprog["state"] != "xlated", "Offloaded program state not translated")
+ fail(dprog["loaded"] != "Y", "Offloaded program is not loaded")
+
+ start_test("Test removing XDP program many times...")
+ sim.unset_xdp("offload")
+ sim.unset_xdp("offload")
+ sim.unset_xdp("drv")
+ sim.unset_xdp("drv")
+ sim.unset_xdp("")
+ sim.unset_xdp("")
+ bpftool_prog_list_wait(expected=0)
+
+ start_test("Test attempt to use a program for a wrong device...")
+ simdev2 = NetdevSimDev()
+ sim2, = simdev2.nsims
+ sim2.set_xdp(obj, "offload")
+ pin_file, pinned = pin_prog("/sys/fs/bpf/tmp")
+
+ ret, _, err = sim.set_xdp(pinned, "offload",
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Pinned program loaded for a different device accepted")
+ check_extack_nsim(err, "program bound to different dev.", args)
+ simdev2.remove()
+ ret, _, err = sim.set_xdp(pinned, "offload",
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Pinned program loaded for a removed device accepted")
+ check_extack_nsim(err, "xdpoffload of non-bound program.", args)
+ rm(pin_file)
+ bpftool_prog_list_wait(expected=0)
+
+ simdev, sim = test_multi_prog(simdev, sim, obj, "", 1)
+ simdev, sim = test_multi_prog(simdev, sim, obj, "drv", 1)
+ simdev, sim = test_multi_prog(simdev, sim, obj, "generic", 2)
+
+ start_test("Test mixing of TC and XDP...")
+ sim.tc_add_ingress()
+ sim.set_xdp(obj, "offload")
+ ret, _, err = sim.cls_bpf_add_filter(obj, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Loading TC when XDP active should fail")
+ check_extack_nsim(err, "driver and netdev offload states mismatch.", args)
+ sim.unset_xdp("offload")
+ sim.wait_for_flush()
+
+ sim.cls_bpf_add_filter(obj, skip_sw=True)
+ ret, _, err = sim.set_xdp(obj, "offload", fail=False, include_stderr=True)
+ fail(ret == 0, "Loading XDP when TC active should fail")
+ check_extack_nsim(err, "TC program is already loaded.", args)
+
+ start_test("Test binding TC from pinned...")
+ pin_file, pinned = pin_prog("/sys/fs/bpf/tmp")
+ sim.tc_flush_filters(bound=1, total=1)
+ sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True)
+ sim.tc_flush_filters(bound=1, total=1)
+
+ start_test("Test binding XDP from pinned...")
+ sim.set_xdp(obj, "offload")
+ pin_file, pinned = pin_prog("/sys/fs/bpf/tmp2", idx=1)
+
+ sim.set_xdp(pinned, "offload", force=True)
+ sim.unset_xdp("offload")
+ sim.set_xdp(pinned, "offload", force=True)
+ sim.unset_xdp("offload")
+
+ start_test("Test offload of wrong type fails...")
+ ret, _ = sim.cls_bpf_add_filter(pinned, da=True, skip_sw=True, fail=False)
+ fail(ret == 0, "Managed to attach XDP program to TC")
+
+ start_test("Test asking for TC offload of two filters...")
+ sim.cls_bpf_add_filter(obj, da=True, skip_sw=True)
+ ret, _, err = sim.cls_bpf_add_filter(obj, da=True, skip_sw=True,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "Managed to offload two TC filters at the same time")
+ check_extack_nsim(err, "driver and netdev offload states mismatch.", args)
+
+ sim.tc_flush_filters(bound=2, total=2)
+
+ start_test("Test if netdev removal waits for translation...")
+ delay_msec = 500
+ sim.dfs["dev/bpf_bind_verifier_delay"] = delay_msec
+ start = time.time()
+ cmd_line = "tc filter add dev %s ingress bpf %s da skip_sw" % \
+ (sim['ifname'], obj)
+ tc_proc = cmd(cmd_line, background=True, fail=False)
+ # Wait for the verifier to start
+ while simdev.dfs_num_bound_progs() <= 2:
+ pass
+ simdev.remove()
+ end = time.time()
+ ret, _ = cmd_result(tc_proc, fail=False)
+ time_diff = end - start
+ log("Time", "start:\t%s\nend:\t%s\ndiff:\t%s" % (start, end, time_diff))
+
+ fail(ret == 0, "Managed to load TC filter on a unregistering device")
+ delay_sec = delay_msec * 0.001
+ fail(time_diff < delay_sec, "Removal process took %s, expected %s" %
+ (time_diff, delay_sec))
+
+ # Remove all pinned files and reinstantiate the netdev
+ clean_up()
+ bpftool_prog_list_wait(expected=0)
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ map_obj = bpf_obj("sample_map_ret0.o")
+ start_test("Test loading program with maps...")
+ sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+
+ start_test("Test bpftool bound info reporting (own ns)...")
+ check_dev_info(False, "")
+
+ start_test("Test bpftool bound info reporting (other ns)...")
+ ns = mknetns()
+ sim.set_ns(ns)
+ check_dev_info(True, "")
+
+ start_test("Test bpftool bound info reporting (remote ns)...")
+ check_dev_info(False, ns)
+
+ start_test("Test bpftool bound info reporting (back to own ns)...")
+ sim.set_ns("")
+ check_dev_info(False, "")
+
+ prog_file, _ = pin_prog("/sys/fs/bpf/tmp_prog")
+ map_file, _ = pin_map("/sys/fs/bpf/tmp_map", idx=1, expected=2)
+ simdev.remove()
+
+ start_test("Test bpftool bound info reporting (removed dev)...")
+ check_dev_info_removed(prog_file=prog_file, map_file=map_file)
+
+ # Remove all pinned files and reinstantiate the netdev
+ clean_up()
+ bpftool_prog_list_wait(expected=0)
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+
+ start_test("Test map update (no flags)...")
+ sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+ maps = bpftool_map_list(expected=2)
+ array = maps[0] if maps[0]["type"] == "array" else maps[1]
+ htab = maps[0] if maps[0]["type"] == "hash" else maps[1]
+ for m in maps:
+ for i in range(2):
+ bpftool("map update id %d key %s value %s" %
+ (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+ for m in maps:
+ ret, _ = bpftool("map update id %d key %s value %s" %
+ (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+ fail=False)
+ fail(ret == 0, "added too many entries")
+
+ start_test("Test map update (exists)...")
+ for m in maps:
+ for i in range(2):
+ bpftool("map update id %d key %s value %s exist" %
+ (m["id"], int2str("I", i), int2str("Q", i * 3)))
+
+ for m in maps:
+ ret, err = bpftool("map update id %d key %s value %s exist" %
+ (m["id"], int2str("I", 3), int2str("Q", 3 * 3)),
+ fail=False)
+ fail(ret == 0, "updated non-existing key")
+ fail(err["error"].find("No such file or directory") == -1,
+ "expected ENOENT, error is '%s'" % (err["error"]))
+
+ start_test("Test map update (noexist)...")
+ for m in maps:
+ for i in range(2):
+ ret, err = bpftool("map update id %d key %s value %s noexist" %
+ (m["id"], int2str("I", i), int2str("Q", i * 3)),
+ fail=False)
+ fail(ret == 0, "updated existing key")
+ fail(err["error"].find("File exists") == -1,
+ "expected EEXIST, error is '%s'" % (err["error"]))
+
+ start_test("Test map dump...")
+ for m in maps:
+ _, entries = bpftool("map dump id %d" % (m["id"]))
+ for i in range(2):
+ key = str2int(entries[i]["key"])
+ fail(key != i, "expected key %d, got %d" % (key, i))
+ val = str2int(entries[i]["value"])
+ fail(val != i * 3, "expected value %d, got %d" % (val, i * 3))
+
+ start_test("Test map getnext...")
+ for m in maps:
+ _, entry = bpftool("map getnext id %d" % (m["id"]))
+ key = str2int(entry["next_key"])
+ fail(key != 0, "next key %d, expected %d" % (key, 0))
+ _, entry = bpftool("map getnext id %d key %s" %
+ (m["id"], int2str("I", 0)))
+ key = str2int(entry["next_key"])
+ fail(key != 1, "next key %d, expected %d" % (key, 1))
+ ret, err = bpftool("map getnext id %d key %s" %
+ (m["id"], int2str("I", 1)), fail=False)
+ fail(ret == 0, "got next key past the end of map")
+ fail(err["error"].find("No such file or directory") == -1,
+ "expected ENOENT, error is '%s'" % (err["error"]))
+
+ start_test("Test map delete (htab)...")
+ for i in range(2):
+ bpftool("map delete id %d key %s" % (htab["id"], int2str("I", i)))
+
+ start_test("Test map delete (array)...")
+ for i in range(2):
+ ret, err = bpftool("map delete id %d key %s" %
+ (htab["id"], int2str("I", i)), fail=False)
+ fail(ret == 0, "removed entry from an array")
+ fail(err["error"].find("No such file or directory") == -1,
+ "expected ENOENT, error is '%s'" % (err["error"]))
+
+ start_test("Test map remove...")
+ sim.unset_xdp("offload")
+ bpftool_map_list_wait(expected=0)
+ simdev.remove()
+
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.set_xdp(map_obj, "offload", JSON=False) # map fixup msg breaks JSON
+ simdev.remove()
+ bpftool_map_list_wait(expected=0)
+
+ start_test("Test map creation fail path...")
+ simdev = NetdevSimDev()
+ sim, = simdev.nsims
+ sim.dfs["bpf_map_accept"] = "N"
+ ret, _ = sim.set_xdp(map_obj, "offload", JSON=False, fail=False)
+ fail(ret == 0,
+ "netdevsim didn't refuse to create a map with offload disabled")
+
+ simdev.remove()
+
+ start_test("Test multi-dev ASIC program reuse...")
+ simdevA = NetdevSimDev()
+ simA, = simdevA.nsims
+ simdevB = NetdevSimDev(3)
+ simB1, simB2, simB3 = simdevB.nsims
+ sims = (simA, simB1, simB2, simB3)
+ simB = (simB1, simB2, simB3)
+
+ bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA",
+ dev=simA['ifname'])
+ progA = bpf_pinned("/sys/fs/bpf/nsimA")
+ bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB",
+ dev=simB1['ifname'])
+ progB = bpf_pinned("/sys/fs/bpf/nsimB")
+
+ simA.set_xdp(progA, "offload", JSON=False)
+ for d in simdevB.nsims:
+ d.set_xdp(progB, "offload", JSON=False)
+
+ start_test("Test multi-dev ASIC cross-dev replace...")
+ ret, _ = simA.set_xdp(progB, "offload", force=True, JSON=False, fail=False)
+ fail(ret == 0, "cross-ASIC program allowed")
+ for d in simdevB.nsims:
+ ret, _ = d.set_xdp(progA, "offload", force=True, JSON=False, fail=False)
+ fail(ret == 0, "cross-ASIC program allowed")
+
+ start_test("Test multi-dev ASIC cross-dev install...")
+ for d in sims:
+ d.unset_xdp("offload")
+
+ ret, _, err = simA.set_xdp(progB, "offload", force=True, JSON=False,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "cross-ASIC program allowed")
+ check_extack_nsim(err, "program bound to different dev.", args)
+ for d in simdevB.nsims:
+ ret, _, err = d.set_xdp(progA, "offload", force=True, JSON=False,
+ fail=False, include_stderr=True)
+ fail(ret == 0, "cross-ASIC program allowed")
+ check_extack_nsim(err, "program bound to different dev.", args)
+
+ start_test("Test multi-dev ASIC cross-dev map reuse...")
+
+ mapA = bpftool("prog show %s" % (progA))[1]["map_ids"][0]
+ mapB = bpftool("prog show %s" % (progB))[1]["map_ids"][0]
+
+ ret, _ = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_",
+ dev=simB3['ifname'],
+ maps=["idx 0 id %d" % (mapB)],
+ fail=False)
+ fail(ret != 0, "couldn't reuse a map on the same ASIC")
+ rm("/sys/fs/bpf/nsimB_")
+
+ ret, _, err = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimA_",
+ dev=simA['ifname'],
+ maps=["idx 0 id %d" % (mapB)],
+ fail=False, include_stderr=True)
+ fail(ret == 0, "could reuse a map on a different ASIC")
+ fail(err.count("offload device mismatch between prog and map") == 0,
+ "error message missing for cross-ASIC map")
+
+ ret, _, err = bpftool_prog_load("sample_map_ret0.o", "/sys/fs/bpf/nsimB_",
+ dev=simB1['ifname'],
+ maps=["idx 0 id %d" % (mapA)],
+ fail=False, include_stderr=True)
+ fail(ret == 0, "could reuse a map on a different ASIC")
+ fail(err.count("offload device mismatch between prog and map") == 0,
+ "error message missing for cross-ASIC map")
+
+ start_test("Test multi-dev ASIC cross-dev destruction...")
+ bpftool_prog_list_wait(expected=2)
+
+ simdevA.remove()
+ bpftool_prog_list_wait(expected=1)
+
+ ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+ fail(ifnameB != simB1['ifname'], "program not bound to original device")
+ simB1.remove()
+ bpftool_prog_list_wait(expected=1)
+
+ start_test("Test multi-dev ASIC cross-dev destruction - move...")
+ ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+ fail(ifnameB not in (simB2['ifname'], simB3['ifname']),
+ "program not bound to remaining devices")
+
+ simB2.remove()
+ ifnameB = bpftool("prog show %s" % (progB))[1]["dev"]["ifname"]
+ fail(ifnameB != simB3['ifname'], "program not bound to remaining device")
+
+ simB3.remove()
+ simdevB.remove()
+ bpftool_prog_list_wait(expected=0)
+
+ start_test("Test multi-dev ASIC cross-dev destruction - orphaned...")
+ ret, out = bpftool("prog show %s" % (progB), fail=False)
+ fail(ret == 0, "got information about orphaned program")
+ fail("error" not in out, "no error reported for get info on orphaned")
+ fail(out["error"] != "can't get prog info: No such device",
+ "wrong error for get info on orphaned")
+
+ print("%s: OK" % (os.path.basename(__file__)))
+
+finally:
+ log("Clean up...", "", level=1)
+ log_level_inc()
+ clean_up()
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
new file mode 100644
index 000000000..4a13477ae
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -0,0 +1,751 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2017 Facebook
+ */
+#define _GNU_SOURCE
+#include "test_progs.h"
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+#include <argp.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <string.h>
+#include <execinfo.h> /* backtrace */
+
+#define EXIT_NO_TEST 2
+#define EXIT_ERR_SETUP_INFRA 3
+
+/* global test runner state; declared extern in test_progs.h, zero-initialized here */
+struct test_env env = {};
+
+struct prog_test_def {
+ const char *test_name; /* stringified test name produced by DEFINE_TEST() */
+ int test_num; /* 1-based index, assigned in main() */
+ void (*run_test)(void); /* test entry point, test_<name>() */
+ bool force_log; /* set by test__force_log(); makes dump_test_log() print the captured log */
+ int error_cnt; /* incremented by test__fail() */
+ int skip_cnt; /* incremented by test__skip(), consumed by skip_account() */
+ bool tested; /* set in main() once run_test() has returned */
+ bool need_cgroup_cleanup; /* set by test__join_cgroup(); main() then calls cleanup_cgroup_environment() */
+
+ char *subtest_name; /* strdup()'d name of the active sub-test, NULL when none is active */
+ int subtest_num; /* incremented on every test__start_subtest() call */
+
+ /* store counts before subtest started */
+ int old_error_cnt;
+};
+
+/* Replacement for the C runtime library's usleep() that always goes through
+ * the nanosleep() system call. Selftests frequently call usleep() purely to
+ * trigger kprobes and tracepoints, so the syscall must really be issued.
+ *
+ * @usec is split into whole seconds and the nanosecond remainder; the raw
+ * syscall() result is returned to the caller.
+ */
+int usleep(useconds_t usec)
+{
+	struct timespec req;
+
+	req.tv_sec = usec / 1000000;
+	req.tv_nsec = (usec % 1000000) * 1000;
+
+	return syscall(__NR_nanosleep, &req, NULL);
+}
+
+/* Decide whether test (or sub-test) number @num called @name is selected.
+ *
+ * Order of precedence:
+ *  1. any blacklist substring found in @name -> do not run;
+ *  2. any whitelist substring found in @name -> run;
+ *  3. neither a whitelist nor a number set was given -> run;
+ *  4. otherwise run only if @num is flagged in the number set.
+ */
+static bool should_run(struct test_selector *sel, int num, const char *name)
+{
+	const struct str_set *deny = &sel->blacklist;
+	const struct str_set *allow = &sel->whitelist;
+	int idx;
+
+	for (idx = 0; idx < deny->cnt; idx++)
+		if (strstr(name, deny->strs[idx]))
+			return false;
+
+	for (idx = 0; idx < allow->cnt; idx++)
+		if (strstr(name, allow->strs[idx]))
+			return true;
+
+	if (allow->cnt == 0 && sel->num_set == NULL)
+		return true;
+
+	if (num >= sel->num_set_len)
+		return false;
+
+	return sel->num_set[num];
+}
+
+/* Emit the output captured for @test, then rewind the capture stream.
+ *
+ * Does nothing when stdout is not redirected (stdout == env.stdout). The
+ * captured text is copied to the real stdout only if the runner is verbose,
+ * the test asked for it via force_log, or @failed is set; a newline is
+ * appended when the captured text does not already end with one.
+ */
+static void dump_test_log(const struct prog_test_def *test, bool failed)
+{
+	bool want_output;
+
+	if (stdout == env.stdout)
+		return;
+
+	fflush(stdout); /* exports env.log_buf & env.log_cnt */
+
+	want_output = env.verbosity > VERBOSE_NONE || test->force_log || failed;
+	if (want_output && env.log_cnt) {
+		env.log_buf[env.log_cnt] = '\0';
+		fprintf(env.stdout, "%s", env.log_buf);
+		if (env.log_buf[env.log_cnt - 1] != '\n')
+			fprintf(env.stdout, "\n");
+	}
+
+	fseeko(stdout, 0, SEEK_SET); /* rewind */
+}
+
+/* If the current test recorded any skips, count exactly one skip in the
+ * global statistics and clear the per-test skip counter.
+ */
+static void skip_account(void)
+{
+	if (!env.test->skip_cnt)
+		return;
+
+	env.skip_cnt++;
+	env.test->skip_cnt = 0;
+}
+
+static void stdio_restore(void);
+
+/* A bunch of tests set custom affinity per-thread and/or per-process. Reset
+ * it after each test/sub-test.
+ *
+ * Both the process mask and the calling thread's mask are set to CPUs
+ * 0 .. env.nr_cpus - 1. If either call fails, stdio is restored, a message
+ * is printed to stderr and the runner exits with EXIT_ERR_SETUP_INFRA.
+ */
+static void reset_affinity(void)
+{
+	cpu_set_t cpuset;
+	int i, err;
+
+	CPU_ZERO(&cpuset);
+	for (i = 0; i < env.nr_cpus; i++)
+		CPU_SET(i, &cpuset);
+
+	err = sched_setaffinity(0, sizeof(cpuset), &cpuset);
+	if (err < 0) {
+		stdio_restore();
+		fprintf(stderr, "Failed to reset process affinity: %d!\n", err);
+		exit(EXIT_ERR_SETUP_INFRA);
+	}
+
+	/* pthread_setaffinity_np() reports failure with a positive error
+	 * number, never a negative value, so test for non-zero.
+	 */
+	err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
+	if (err) {
+		stdio_restore();
+		fprintf(stderr, "Failed to reset thread affinity: %d!\n", err);
+		exit(EXIT_ERR_SETUP_INFRA);
+	}
+}
+
+/* Open /proc/self/ns/net and keep the descriptor in env.saved_netns_fd so
+ * that restore_netns() can return to this namespace later. Exits with
+ * EXIT_ERR_SETUP_INFRA if the file cannot be opened.
+ */
+static void save_netns(void)
+{
+	int fd = open("/proc/self/ns/net", O_RDONLY);
+
+	env.saved_netns_fd = fd;
+	if (fd != -1)
+		return;
+
+	perror("open(/proc/self/ns/net)");
+	exit(EXIT_ERR_SETUP_INFRA);
+}
+
+/* Switch back to the network namespace saved by save_netns().
+ *
+ * On failure stdio is restored, the error is reported and the runner exits
+ * with EXIT_ERR_SETUP_INFRA.
+ */
+static void restore_netns(void)
+{
+	if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) {
+		stdio_restore();
+		/* name the flag that is actually passed to setns() */
+		perror("setns(CLONE_NEWNET)");
+		exit(EXIT_ERR_SETUP_INFRA);
+	}
+}
+
+/* Finish the currently running sub-test: update the pass/fail statistics,
+ * account skips, dump the captured log, print the "#N/M name:RESULT" line
+ * and release the stored sub-test name.
+ *
+ * A sub-test counts as failed when the test's error counter changed since
+ * test__start_subtest() recorded it.
+ */
+void test__end_subtest()
+{
+	struct prog_test_def *cur = env.test;
+	bool failed = cur->error_cnt != cur->old_error_cnt;
+
+	if (failed)
+		env.fail_cnt++;
+	else
+		env.sub_succ_cnt++;
+	skip_account();
+
+	dump_test_log(cur, failed);
+
+	fprintf(env.stdout, "#%d/%d %s:%s\n",
+		cur->test_num, cur->subtest_num,
+		cur->subtest_name, failed ? "FAIL" : "OK");
+
+	free(cur->subtest_name);
+	cur->subtest_name = NULL;
+}
+
+/* Begin a sub-test called @name inside the current test.
+ *
+ * Any sub-test still open is finalized first and the sub-test number is
+ * advanced. Returns true when the caller should run the sub-test; returns
+ * false when @name is missing/empty, when the sub-test is filtered out by
+ * env.subtest_selector, or when the name cannot be duplicated.
+ */
+bool test__start_subtest(const char *name)
+{
+	struct prog_test_def *cur = env.test;
+	char *name_copy;
+
+	if (cur->subtest_name != NULL)
+		test__end_subtest();
+
+	cur->subtest_num += 1;
+
+	if (name == NULL || name[0] == '\0') {
+		fprintf(env.stderr,
+			"Subtest #%d didn't provide sub-test name!\n",
+			cur->subtest_num);
+		return false;
+	}
+
+	if (!should_run(&env.subtest_selector, cur->subtest_num, name))
+		return false;
+
+	name_copy = strdup(name);
+	cur->subtest_name = name_copy;
+	if (name_copy == NULL) {
+		fprintf(env.stderr,
+			"Subtest #%d: failed to copy subtest name!\n",
+			cur->subtest_num);
+		return false;
+	}
+
+	/* remember the error count so the end of the sub-test can diff it */
+	cur->old_error_cnt = cur->error_cnt;
+
+	return true;
+}
+
+/* Ask for the current test's captured log to be printed regardless of the
+ * test result or the verbosity level.
+ */
+void test__force_log()
+{
+	struct prog_test_def *cur = env.test;
+
+	cur->force_log = true;
+}
+
+/* Record one skip against the current test. */
+void test__skip(void)
+{
+	struct prog_test_def *cur = env.test;
+
+	cur->skip_cnt += 1;
+}
+
+void test__fail(void)
+{
+ env.test->error_cnt++;
+}
+
+/* Create the cgroup at @path and move the calling process into it.
+ *
+ * The cgroup environment is set up the first time a test calls this, and
+ * the test is flagged so main() cleans it up afterwards.
+ *
+ * Returns the cgroup file descriptor on success. On failure a diagnostic is
+ * written to stderr and a negative value is returned: the value from
+ * create_and_get_cgroup() if creation failed, -1 otherwise.
+ */
+int test__join_cgroup(const char *path)
+{
+	struct prog_test_def *cur = env.test;
+	int cg_fd;
+
+	if (!cur->need_cgroup_cleanup) {
+		if (setup_cgroup_environment() != 0) {
+			fprintf(stderr,
+				"#%d %s: Failed to setup cgroup environment\n",
+				cur->test_num, cur->test_name);
+			return -1;
+		}
+
+		cur->need_cgroup_cleanup = true;
+	}
+
+	cg_fd = create_and_get_cgroup(path);
+	if (cg_fd < 0) {
+		fprintf(stderr,
+			"#%d %s: Failed to create cgroup '%s' (errno=%d)\n",
+			cur->test_num, cur->test_name, path, errno);
+		return cg_fd;
+	}
+
+	if (join_cgroup(path) == 0)
+		return cg_fd;
+
+	fprintf(stderr,
+		"#%d %s: Failed to join cgroup '%s' (errno=%d)\n",
+		cur->test_num, cur->test_name, path, errno);
+	return -1;
+}
+
+/* Look up the map called @name in @obj and return its file descriptor.
+ *
+ * When no such map exists, a "FAIL" line prefixed with @test is printed,
+ * the current test is marked as failed and -1 is returned.
+ */
+int bpf_find_map(const char *test, struct bpf_object *obj, const char *name)
+{
+	struct bpf_map *found = bpf_object__find_map_by_name(obj, name);
+
+	if (found)
+		return bpf_map__fd(found);
+
+	fprintf(stdout, "%s:FAIL:map '%s' not found\n", test, name);
+	test__fail();
+	return -1;
+}
+
+/* Report whether the BPF JIT is enabled according to
+ * /proc/sys/net/core/bpf_jit_enable.
+ *
+ * Returns true when the first byte of the sysctl file is anything but '0';
+ * returns false when the file cannot be opened or read.
+ */
+static bool is_jit_enabled(void)
+{
+	const char *jit_sysctl = "/proc/sys/net/core/bpf_jit_enable";
+	bool enabled = false;
+	int sysctl_fd;
+
+	/* open(path, flags): the access mode belongs in the flags argument */
+	sysctl_fd = open(jit_sysctl, O_RDONLY);
+	if (sysctl_fd != -1) {
+		char tmpc;
+
+		if (read(sysctl_fd, &tmpc, sizeof(tmpc)) == 1)
+			enabled = (tmpc != '0');
+		close(sysctl_fd);
+	}
+
+	return enabled;
+}
+
+/* Check that every key of the map behind @map1_fd can also be looked up in
+ * the map behind @map2_fd.
+ *
+ * Returns 0 when all keys were found. Returns the error of the failing call
+ * when the first key cannot be fetched or a lookup fails, and -1 when key
+ * iteration stops with an errno other than ENOENT.
+ */
+int compare_map_keys(int map1_fd, int map2_fd)
+{
+	char val_buf[PERF_MAX_STACK_DEPTH *
+		     sizeof(struct bpf_stack_build_id)];
+	__u32 key, next_key;
+	int err;
+
+	err = bpf_map_get_next_key(map1_fd, NULL, &key);
+	if (err)
+		return err;
+
+	for (;;) {
+		err = bpf_map_lookup_elem(map2_fd, &key, val_buf);
+		if (err)
+			return err;
+
+		if (bpf_map_get_next_key(map1_fd, &key, &next_key) != 0)
+			break;
+
+		key = next_key;
+	}
+
+	return errno == ENOENT ? 0 : -1;
+}
+
+/* For every key in the map behind @smap_fd, compare the first
+ * @stack_trace_len bytes of the value stored under that key in @smap_fd
+ * and in @amap_fd.
+ *
+ * Returns 0 when all values match. Returns -ENOMEM when the comparison
+ * buffers cannot be allocated, the lookup error when a lookup fails, and
+ * -1 on a mismatch or when key iteration stops with an errno other than
+ * ENOENT.
+ */
+int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
+{
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char *val_buf1, *val_buf2;
+	int i, err = 0;
+
+	val_buf1 = malloc(stack_trace_len);
+	val_buf2 = malloc(stack_trace_len);
+	if (!val_buf1 || !val_buf2) {
+		/* free(NULL) is a no-op, so the common exit path is safe */
+		err = -ENOMEM;
+		goto out;
+	}
+
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			goto out;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			goto out;
+		for (i = 0; i < stack_trace_len; i++) {
+			if (val_buf1[i] != val_buf2[i]) {
+				err = -1;
+				goto out;
+			}
+		}
+		/* after the first round, iterate from the key just visited */
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		err = -1;
+
+out:
+	free(val_buf1);
+	free(val_buf2);
+	return err;
+}
+
+/* Copy the "Build ID" line that readelf prints for ./urandom_read into
+ * @build_id as a NUL-terminated string.
+ *
+ * @size is the capacity of @build_id in bytes; a longer line is truncated
+ * so that the terminating NUL always fits inside the buffer.
+ *
+ * Returns 0 on success, -1 when @size is 0, when the pipeline cannot be
+ * started or when it produced no output.
+ */
+int extract_build_id(char *build_id, size_t size)
+{
+	FILE *fp;
+	char *line = NULL;
+	size_t len = 0;
+	ssize_t nread;
+	int err = -1;
+
+	if (!size)
+		return -1;
+
+	fp = popen("readelf -n ./urandom_read | grep 'Build ID'", "r");
+	if (fp == NULL)
+		return -1;
+
+	nread = getline(&line, &len, fp);
+	pclose(fp);
+	if (nread == -1)
+		goto out;
+
+	/* use the number of characters actually read (not the capacity of
+	 * getline()'s buffer) and leave room for the terminator
+	 */
+	if ((size_t)nread >= size)
+		nread = size - 1;
+	memcpy(build_id, line, nread);
+	build_id[nread] = '\0';
+	err = 0;
+out:
+	/* getline() may allocate the buffer even when it fails */
+	free(line);
+	return err;
+}
+
+/* extern declarations for test funcs: each DEFINE_TEST(name) entry of
+ * prog_tests/tests.h expands to a prototype for test_<name>()
+ */
+#define DEFINE_TEST(name) extern void test_##name(void);
+#include <prog_tests/tests.h>
+#undef DEFINE_TEST
+
+static struct prog_test_def prog_test_defs[] = { /* same list again, expanded to one table entry per test */
+#define DEFINE_TEST(name) { \
+ .test_name = #name, \
+ .run_test = &test_##name, \
+},
+#include <prog_tests/tests.h>
+#undef DEFINE_TEST
+};
+const int prog_test_cnt = ARRAY_SIZE(prog_test_defs); /* number of entries iterated by main() */
+
+const char *argp_program_version = "test_progs 0.1"; /* NOTE(review): presumably the glibc argp hook used for --version; confirm */
+const char *argp_program_bug_address = "<bpf@vger.kernel.org>"; /* NOTE(review): presumably the glibc argp hook shown in --help; confirm */
+const char argp_program_doc[] = "BPF selftests test runner"; /* passed as .doc of the argp parser in main() */
+
+enum ARG_KEYS { /* option keys handled by parse_arg(); each value is the short option letter */
+ ARG_TEST_NUM = 'n', /* select tests (and sub-tests after '/') by number */
+ ARG_TEST_NAME = 't', /* select tests (and sub-tests after '/') by name substring */
+ ARG_TEST_NAME_BLACKLIST = 'b', /* exclude tests (and sub-tests after '/') by name substring */
+ ARG_VERIFIER_STATS = 's', /* sets env.verifier_stats */
+ ARG_VERBOSE = 'v', /* raises env.verbosity */
+ ARG_GET_TEST_CNT = 'c', /* only count the selected tests */
+ ARG_LIST_TEST_NAMES = 'l', /* only list the selected tests */
+};
+
+static const struct argp_option opts[] = { /* option table handed to argp via .options in main() */
+ { "num", ARG_TEST_NUM, "NUM", 0,
+ "Run test number NUM only " },
+ { "name", ARG_TEST_NAME, "NAMES", 0,
+ "Run tests with names containing any string from NAMES list" },
+ { "name-blacklist", ARG_TEST_NAME_BLACKLIST, "NAMES", 0,
+ "Don't run tests with names containing any string from NAMES list" },
+ { "verifier-stats", ARG_VERIFIER_STATS, NULL, 0,
+ "Output verifier statistics", },
+ { "verbose", ARG_VERBOSE, "LEVEL", OPTION_ARG_OPTIONAL, /* LEVEL is the extra "v"/"vv" that parse_arg() compares against */
+ "Verbose output (use -vv or -vvv for progressively verbose output)" },
+ { "count", ARG_GET_TEST_CNT, NULL, 0,
+ "Get number of selected top-level tests " },
+ { "list", ARG_LIST_TEST_NAMES, NULL, 0,
+ "List test names that would run (without running them) " },
+ {}, /* empty entry terminates the table */
+};
+
+/* Print callback installed with libbpf_set_print(): forwards libbpf
+ * messages to stdout, dropping LIBBPF_DEBUG messages unless the runner
+ * verbosity is at least VERBOSE_VERY. Always returns 0.
+ */
+static int libbpf_print_fn(enum libbpf_print_level level,
+			   const char *format, va_list args)
+{
+	bool is_debug = level == LIBBPF_DEBUG;
+
+	if (is_debug && env.verbosity < VERBOSE_VERY)
+		return 0;
+
+	vfprintf(stdout, format, args);
+	return 0;
+}
+
+/* Release every string held by @set and then the pointer array itself.
+ * A NULL @set is accepted and ignored; the struct str_set object is not
+ * freed.
+ */
+static void free_str_set(const struct str_set *set)
+{
+	int idx = 0;
+
+	if (set == NULL)
+		return;
+
+	while (idx < set->cnt) {
+		free((void *)set->strs[idx]);
+		idx++;
+	}
+	free(set->strs);
+}
+
+/* Split the comma-separated list @s into individually allocated strings
+ * and store them in @set.
+ *
+ * @set is reset to empty first; whatever it held before is not freed here.
+ * On success @set owns the new strings (release with free_str_set()).
+ *
+ * Returns 0 on success and -ENOMEM on allocation failure, in which case
+ * @set is left empty and nothing is leaked.
+ */
+static int parse_str_list(const char *s, struct str_set *set)
+{
+	char *input, *state = NULL, *next, **tmp, **strs = NULL;
+	int cnt = 0;
+
+	input = strdup(s);
+	if (!input)
+		return -ENOMEM;
+
+	set->cnt = 0;
+	set->strs = NULL;
+
+	/* strtok_r() takes the string on the first call and NULL afterwards */
+	while ((next = strtok_r(state ? NULL : input, ",", &state))) {
+		tmp = realloc(strs, sizeof(*strs) * (cnt + 1));
+		if (!tmp)
+			goto err;
+		strs = tmp;
+
+		strs[cnt] = strdup(next);
+		if (!strs[cnt])
+			goto err;
+
+		cnt++;
+	}
+
+	set->cnt = cnt;
+	set->strs = (const char **)strs;
+	free(input);
+	return 0;
+err:
+	/* strs[0..cnt-1] were duplicated successfully; release them too */
+	while (cnt > 0)
+		free(strs[--cnt]);
+	free(strs);
+	free(input);
+	return -ENOMEM;
+}
+
+extern int extra_prog_load_log_flags;
+
+/* argp option callback: store one parsed command-line option in the
+ * struct test_env that was handed to argp_parse() (state->input).
+ *
+ * -n, -t and -b accept an optional "/SUBTEST" suffix: the text after the
+ * first '/' goes to env->subtest_selector and the text before it to
+ * env->test_selector. @arg is modified in place to split the two parts.
+ *
+ * -v without an argument selects VERBOSE_NORMAL; "-vv" arrives here as
+ * arg "v" (VERBOSE_VERY) and "-vvv" as arg "vv" (VERBOSE_SUPER).
+ *
+ * Returns 0 on success, ARGP_ERR_UNKNOWN for keys not handled here,
+ * -EINVAL for unparsable numbers or verbosity levels and -ENOMEM when a
+ * name list cannot be parsed.
+ */
+static error_t parse_arg(int key, char *arg, struct argp_state *state)
+{
+	/* local alias; intentionally shadows the global env */
+	struct test_env *env = state->input;
+
+	switch (key) {
+	case ARG_TEST_NUM: {
+		char *subtest_str = strchr(arg, '/');
+
+		if (subtest_str) {
+			*subtest_str = '\0';
+			if (parse_num_list(subtest_str + 1,
+					   &env->subtest_selector.num_set,
+					   &env->subtest_selector.num_set_len)) {
+				fprintf(stderr,
+					"Failed to parse subtest numbers.\n");
+				return -EINVAL;
+			}
+		}
+		if (parse_num_list(arg, &env->test_selector.num_set,
+				   &env->test_selector.num_set_len)) {
+			fprintf(stderr, "Failed to parse test numbers.\n");
+			return -EINVAL;
+		}
+		break;
+	}
+	case ARG_TEST_NAME: {
+		char *subtest_str = strchr(arg, '/');
+
+		if (subtest_str) {
+			*subtest_str = '\0';
+			if (parse_str_list(subtest_str + 1,
+					   &env->subtest_selector.whitelist))
+				return -ENOMEM;
+		}
+		if (parse_str_list(arg, &env->test_selector.whitelist))
+			return -ENOMEM;
+		break;
+	}
+	case ARG_TEST_NAME_BLACKLIST: {
+		char *subtest_str = strchr(arg, '/');
+
+		if (subtest_str) {
+			*subtest_str = '\0';
+			if (parse_str_list(subtest_str + 1,
+					   &env->subtest_selector.blacklist))
+				return -ENOMEM;
+		}
+		if (parse_str_list(arg, &env->test_selector.blacklist))
+			return -ENOMEM;
+		break;
+	}
+	case ARG_VERIFIER_STATS:
+		env->verifier_stats = true;
+		break;
+	case ARG_VERBOSE:
+		env->verbosity = VERBOSE_NORMAL;
+		if (arg) {
+			if (strcmp(arg, "v") == 0) {
+				env->verbosity = VERBOSE_VERY;
+				extra_prog_load_log_flags = 1;
+			} else if (strcmp(arg, "vv") == 0) {
+				env->verbosity = VERBOSE_SUPER;
+				extra_prog_load_log_flags = 2;
+			} else {
+				/* list every level accepted above */
+				fprintf(stderr,
+					"Unrecognized verbosity setting ('%s'), only -v, -vv and -vvv are supported\n",
+					arg);
+				return -EINVAL;
+			}
+		}
+		break;
+	case ARG_GET_TEST_CNT:
+		env->get_test_cnt = true;
+		break;
+	case ARG_LIST_TEST_NAMES:
+		env->list_test_names = true;
+		break;
+	case ARGP_KEY_ARG:
+		/* positional arguments are not accepted */
+		argp_usage(state);
+		break;
+	case ARGP_KEY_END:
+		break;
+	default:
+		return ARGP_ERR_UNKNOWN;
+	}
+	return 0;
+}
+
+/* Remember the real stdout/stderr in env and, unless the runner is
+ * verbose, point both streams at an in-memory buffer (env.log_buf /
+ * env.log_cnt) so test output can be shown selectively later.
+ *
+ * If the memory stream cannot be created, the error is reported and the
+ * streams are left untouched. Compiled to a no-op without glibc.
+ */
+static void stdio_hijack(void)
+{
+#ifdef __GLIBC__
+	FILE *capture;
+
+	env.stdout = stdout;
+	env.stderr = stderr;
+
+	/* verbose runs keep printing straight to stdout */
+	if (env.verbosity > VERBOSE_NONE)
+		return;
+
+	/* stdout and stderr -> buffer */
+	fflush(stdout);
+
+	capture = open_memstream(&env.log_buf, &env.log_cnt);
+	if (capture == NULL) {
+		perror("open_memstream");
+		return;
+	}
+
+	stdout = capture;
+	stderr = capture;
+#endif
+}
+
+/* Undo stdio_hijack(): close the capture stream, free and reset the log
+ * buffer, and point stdout/stderr back at the streams saved in env.
+ * Nothing happens when stdout is not currently redirected. Compiled to a
+ * no-op without glibc.
+ */
+static void stdio_restore(void)
+{
+#ifdef __GLIBC__
+	if (stdout != env.stdout) {
+		fclose(stdout);
+		free(env.log_buf);
+
+		env.log_buf = NULL;
+		env.log_cnt = 0;
+
+		stdout = env.stdout;
+		stderr = env.stderr;
+	}
+#endif
+}
+
+/*
+ * Detect whether test_progs was started as a "flavored" runner and, if so,
+ * change into the sub-directory named after the flavor so the matching BPF
+ * objects get loaded.
+ *
+ * The flavor is taken from the executable name: @exec_name is expected to
+ * look like some/path/to/test_progs[-flavor]. The part after the last '/'
+ * is searched for its last '-'; whatever follows is the flavor.
+ *
+ * Returns 0 when @exec_name has no '/' or no "-flavor" suffix, otherwise
+ * the result of chdir().
+ */
+int cd_flavor_subdir(const char *exec_name)
+{
+	const char *base, *flavor;
+
+	base = strrchr(exec_name, '/');
+	if (base == NULL)
+		return 0;
+
+	flavor = strrchr(base + 1, '-');
+	if (flavor == NULL)
+		return 0;
+
+	flavor += 1; /* step over the '-' itself */
+	if (env.verbosity > VERBOSE_NONE)
+		fprintf(stdout, "Switching to flavor '%s' subdirectory...\n", flavor);
+
+	return chdir(flavor);
+}
+
+#define MAX_BACKTRACE_SZ 128
+/* Signal handler installed by main() for SIGSEGV: capture a backtrace,
+ * flush the log of the test that was running, restore the real stdio
+ * streams, and print the signal number plus the stack trace to stderr.
+ */
+void crash_handler(int signum)
+{
+	void *frames[MAX_BACKTRACE_SZ];
+	size_t depth = backtrace(frames, ARRAY_SIZE(frames));
+
+	if (env.test)
+		dump_test_log(env.test, true);
+	if (env.stdout)
+		stdio_restore();
+
+	fprintf(stderr, "Caught signal #%d!\nStack trace:\n", signum);
+	backtrace_symbols_fd(frames, depth, STDERR_FILENO);
+}
+
+int main(int argc, char **argv)
+{ /* Parse options, run every selected entry of prog_test_defs[] and print
+   * a summary. Returns EXIT_NO_TEST when nothing was counted, EXIT_FAILURE
+   * when anything failed and EXIT_SUCCESS otherwise; early setup errors are
+   * returned as-is.
+   */
+ static const struct argp argp = {
+ .options = opts,
+ .parser = parse_arg,
+ .doc = argp_program_doc,
+ };
+ struct sigaction sigact = {
+ .sa_handler = crash_handler,
+ .sa_flags = SA_RESETHAND,
+ };
+ int err, i;
+
+ sigaction(SIGSEGV, &sigact, NULL); /* print a backtrace if a test crashes */
+
+ err = argp_parse(&argp, argc, argv, 0, NULL, &env); /* &env becomes state->input in parse_arg() */
+ if (err)
+ return err;
+
+ err = cd_flavor_subdir(argv[0]);
+ if (err)
+ return err;
+
+ libbpf_set_print(libbpf_print_fn);
+
+ srand(time(NULL));
+
+ env.jit_enabled = is_jit_enabled();
+ env.nr_cpus = libbpf_num_possible_cpus();
+ if (env.nr_cpus < 0) {
+ fprintf(stderr, "Failed to get number of CPUs: %d!\n",
+ env.nr_cpus);
+ return -1;
+ }
+
+ save_netns();
+ stdio_hijack(); /* from here on test output may be captured in env.log_buf */
+ for (i = 0; i < prog_test_cnt; i++) {
+ struct prog_test_def *test = &prog_test_defs[i];
+
+ env.test = test;
+ test->test_num = i + 1; /* test numbers are 1-based */
+
+ if (!should_run(&env.test_selector,
+ test->test_num, test->test_name))
+ continue;
+
+ if (env.get_test_cnt) { /* -c: count the test without running it */
+ env.succ_cnt++;
+ continue;
+ }
+
+ if (env.list_test_names) { /* -l: print the name without running it */
+ fprintf(env.stdout, "%s\n", test->test_name);
+ env.succ_cnt++;
+ continue;
+ }
+
+ test->run_test();
+ /* ensure last sub-test is finalized properly */
+ if (test->subtest_name)
+ test__end_subtest();
+
+ test->tested = true;
+ if (test->error_cnt)
+ env.fail_cnt++;
+ else
+ env.succ_cnt++;
+ skip_account();
+
+ dump_test_log(test, test->error_cnt);
+
+ fprintf(env.stdout, "#%d %s:%s\n",
+ test->test_num, test->test_name,
+ test->error_cnt ? "FAIL" : "OK");
+
+ reset_affinity(); /* undo per-test state before the next test runs */
+ restore_netns();
+ if (test->need_cgroup_cleanup)
+ cleanup_cgroup_environment();
+ }
+ stdio_restore();
+
+ if (env.get_test_cnt) {
+ printf("%d\n", env.succ_cnt);
+ goto out;
+ }
+
+ if (env.list_test_names)
+ goto out;
+
+ fprintf(stdout, "Summary: %d/%d PASSED, %d SKIPPED, %d FAILED\n",
+ env.succ_cnt, env.sub_succ_cnt, env.skip_cnt, env.fail_cnt);
+
+out:
+ free_str_set(&env.test_selector.blacklist);
+ free_str_set(&env.test_selector.whitelist);
+ free(env.test_selector.num_set);
+ free_str_set(&env.subtest_selector.blacklist);
+ free_str_set(&env.subtest_selector.whitelist);
+ free(env.subtest_selector.num_set);
+ close(env.saved_netns_fd);
+
+ if (env.succ_cnt + env.fail_cnt + env.skip_cnt == 0) /* nothing was selected or counted */
+ return EXIT_NO_TEST;
+
+ return env.fail_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h
new file mode 100644
index 000000000..1d429d67f
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_progs.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <time.h>
+#include <signal.h>
+
+#include <linux/types.h>
+typedef __u16 __sum16;
+#include <arpa/inet.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/filter.h>
+#include <linux/perf_event.h>
+#include <linux/socket.h>
+#include <linux/unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "test_iptunnel_common.h"
+#include "bpf_util.h"
+#include <bpf/bpf_endian.h>
+#include "trace_helpers.h"
+#include "testing_helpers.h"
+#include "flow_dissector_load.h"
+
+/* Output verbosity levels; presumably ordered from quietest to most verbose. */
+enum verbosity {
+ VERBOSE_NONE,
+ VERBOSE_NORMAL,
+ VERBOSE_VERY,
+ VERBOSE_SUPER,
+};
+
+/* An array of strings with its element count; released with free_str_set(). */
+struct str_set {
+ const char **strs;
+ int cnt;
+};
+
+/*
+ * Selection criteria for tests or sub-tests. main() hands the test selector
+ * to should_run() together with a test's number and name, and on exit frees
+ * both string sets with free_str_set() and num_set with free().
+ */
+struct test_selector {
+ struct str_set whitelist;
+ struct str_set blacklist;
+ bool *num_set;
+ int num_set_len;
+};
+
+/* Global state of one test_progs run; the single instance is `env`. */
+struct test_env {
+ struct test_selector test_selector;
+ struct test_selector subtest_selector;
+ bool verifier_stats;
+ enum verbosity verbosity;
+
+ bool jit_enabled;
+ /* when set, main() only counts the selected tests and prints the count */
+ bool get_test_cnt;
+ /* when set, main() prints each selected test's name instead of running it */
+ bool list_test_names;
+
+ /* test currently being processed by main()'s loop */
+ struct prog_test_def *test;
+ /* stream main() writes test names and per-test result lines to */
+ FILE *stdout;
+ FILE *stderr;
+ char *log_buf;
+ size_t log_cnt;
+ /* result of libbpf_num_possible_cpus(), filled in by main() */
+ int nr_cpus;
+
+ int succ_cnt; /* successful tests */
+ int sub_succ_cnt; /* successful sub-tests */
+ int fail_cnt; /* total failed tests + sub-tests */
+ int skip_cnt; /* skipped tests */
+
+ /* closed by main() on exit */
+ int saved_netns_fd;
+};
+
+extern struct test_env env;
+
+/*
+ * Test-framework entry points implemented outside this header.
+ * test__force_log() is declared with an explicit (void) parameter list so
+ * that it is a real prototype, like its siblings.
+ */
+extern void test__force_log(void);
+extern bool test__start_subtest(const char *name);
+extern void test__skip(void);
+/* Called by PRINT_FAIL(), _CHECK() and CHECK_FAIL() when a check fails. */
+extern void test__fail(void);
+extern int test__join_cgroup(const char *path);
+
+/*
+ * Unconditionally record a failure via test__fail() and print
+ * "<function>:FAIL:<line> " followed by the printf-style message to stdout.
+ */
+#define PRINT_FAIL(format...) \
+ ({ \
+ test__fail(); \
+ fprintf(stdout, "%s:FAIL:%d ", __func__, __LINE__); \
+ fprintf(stdout, ##format); \
+ })
+
+/*
+ * Core check helper. A true (non-zero) condition means FAILURE: test__fail()
+ * is called and "<function>:FAIL:<tag> " plus the printf-style message is
+ * printed to stdout. Otherwise a PASS line carrying `duration` (printed as
+ * nsec) is emitted. errno as it was after evaluating the condition is restored
+ * before the macro yields its value: 1 on failure, 0 on pass.
+ */
+#define _CHECK(condition, tag, duration, format...) ({ \
+ int __ret = !!(condition); \
+ int __save_errno = errno; \
+ if (__ret) { \
+ test__fail(); \
+ fprintf(stdout, "%s:FAIL:%s ", __func__, tag); \
+ fprintf(stdout, ##format); \
+ } else { \
+ fprintf(stdout, "%s:PASS:%s %d nsec\n", \
+ __func__, tag, duration); \
+ } \
+ errno = __save_errno; \
+ __ret; \
+})
+
+/*
+ * Message-less check: when the condition is true, record a failure via
+ * test__fail() and print "<function>:FAIL:<line>". Nothing is printed on
+ * success. errno is preserved; the macro yields 1 on failure, 0 otherwise.
+ */
+#define CHECK_FAIL(condition) ({ \
+ int __ret = !!(condition); \
+ int __save_errno = errno; \
+ if (__ret) { \
+ test__fail(); \
+ fprintf(stdout, "%s:FAIL:%d\n", __func__, __LINE__); \
+ } \
+ errno = __save_errno; \
+ __ret; \
+})
+
+/*
+ * Convenience wrappers around _CHECK(). CHECK() expects a variable named
+ * `duration` to be in scope at the call site; CHECK_ATTR() expects a variable
+ * named `tattr` with a `duration` member.
+ */
+#define CHECK(condition, tag, format...) \
+ _CHECK(condition, tag, duration, format)
+#define CHECK_ATTR(condition, tag, format...) \
+ _CHECK(condition, tag, tattr.duration, format)
+
+/*
+ * Assert that actual == expected. Both arguments are evaluated once; on
+ * mismatch they are printed cast to long long. Yields true when equal.
+ * The local `duration` exists only to satisfy CHECK().
+ */
+#define ASSERT_EQ(actual, expected, name) ({ \
+ static int duration = 0; \
+ typeof(actual) ___act = (actual); \
+ typeof(expected) ___exp = (expected); \
+ bool ___ok = ___act == ___exp; \
+ CHECK(!___ok, (name), \
+ "unexpected %s: actual %lld != expected %lld\n", \
+ (name), (long long)(___act), (long long)(___exp)); \
+ ___ok; \
+})
+
+/*
+ * Assert that two C strings compare equal with strcmp(). Both pointers are
+ * passed straight to strcmp(), so neither may be NULL. Yields true when the
+ * strings are equal.
+ */
+#define ASSERT_STREQ(actual, expected, name) ({ \
+ static int duration = 0; \
+ const char *___act = actual; \
+ const char *___exp = expected; \
+ bool ___ok = strcmp(___act, ___exp) == 0; \
+ CHECK(!___ok, (name), \
+ "unexpected %s: actual '%s' != expected '%s'\n", \
+ (name), ___act, ___exp); \
+ ___ok; \
+})
+
+/* Assert that res equals 0; yields true when it does. */
+#define ASSERT_OK(res, name) ({ \
+ static int duration = 0; \
+ long long ___res = (res); \
+ bool ___ok = ___res == 0; \
+ CHECK(!___ok, (name), "unexpected error: %lld\n", ___res); \
+ ___ok; \
+})
+
+/* Assert that res is negative (an error code); yields true when it is. */
+#define ASSERT_ERR(res, name) ({ \
+ static int duration = 0; \
+ long long ___res = (res); \
+ bool ___ok = ___res < 0; \
+ CHECK(!___ok, (name), "unexpected success: %lld\n", ___res); \
+ ___ok; \
+})
+
+/* Assert that ptr is NULL; yields true when it is. */
+#define ASSERT_NULL(ptr, name) ({ \
+ static int duration = 0; \
+ const void *___res = (ptr); \
+ bool ___ok = !___res; \
+ CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \
+ ___ok; \
+})
+
+/*
+ * Assert that ptr is neither NULL nor an error pointer (!IS_ERR_OR_NULL()).
+ * On failure the value of PTR_ERR(ptr) is printed. Yields true for a valid
+ * pointer.
+ */
+#define ASSERT_OK_PTR(ptr, name) ({ \
+ static int duration = 0; \
+ const void *___res = (ptr); \
+ bool ___ok = !IS_ERR_OR_NULL(___res); \
+ CHECK(!___ok, (name), \
+ "unexpected error: %ld\n", PTR_ERR(___res)); \
+ ___ok; \
+})
+
+/*
+ * Assert that ptr is an error pointer according to IS_ERR(). On failure the
+ * unexpected pointer value is printed. Yields true when ptr is an error
+ * pointer. The local `duration` exists only to satisfy CHECK().
+ */
+#define ASSERT_ERR_PTR(ptr, name) ({ \
+	static int duration = 0; \
+	const void *___res = (ptr); \
+	bool ___ok = IS_ERR(___res); \
+	CHECK(!___ok, (name), "unexpected pointer: %p\n", ___res); \
+	___ok; \
+})
+
+/* Convert a pointer to a __u64, going through unsigned long. */
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+	unsigned long addr = (unsigned long) ptr;
+
+	return (__u64) addr;
+}
+
+/* Convert a __u64 back to a pointer, going through unsigned long. */
+static inline void *u64_to_ptr(__u64 ptr)
+{
+	unsigned long addr = (unsigned long) ptr;
+
+	return (void *) addr;
+}
+
+int bpf_find_map(const char *test, struct bpf_object *obj, const char *name);
+int compare_map_keys(int map1_fd, int map2_fd);
+int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len);
+int extract_build_id(char *build_id, size_t size);
+
+/*
+ * Name used for the nanosleep syscall kprobe: x86_64 and s390x builds use an
+ * architecture-prefixed symbol name, every other build uses "sys_nanosleep".
+ */
+#ifdef __x86_64__
+#define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep"
+#elif defined(__s390x__)
+#define SYS_NANOSLEEP_KPROBE_NAME "__s390x_sys_nanosleep"
+#else
+#define SYS_NANOSLEEP_KPROBE_NAME "sys_nanosleep"
+#endif
diff --git a/tools/testing/selftests/bpf/test_select_reuseport_common.h b/tools/testing/selftests/bpf/test_select_reuseport_common.h
new file mode 100644
index 000000000..08eb2a9f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_select_reuseport_common.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef __TEST_SELECT_REUSEPORT_COMMON_H
+#define __TEST_SELECT_REUSEPORT_COMMON_H
+
+#include <linux/types.h>
+
+/* Result codes; NR_RESULTS is last, so it equals the number of codes above. */
+enum result {
+ DROP_ERR_INNER_MAP,
+ DROP_ERR_SKB_DATA,
+ DROP_ERR_SK_SELECT_REUSEPORT,
+ DROP_MISC,
+ PASS,
+ PASS_ERR_SK_SELECT_REUSEPORT,
+ NR_RESULTS,
+};
+
+struct cmd {
+ __u32 reuseport_index;
+ __u32 pass_on_failure;
+};
+
+/* NOTE(review): field semantics are not visible here - confirm against users. */
+struct data_check {
+ __u32 ip_protocol;
+ __u32 skb_addrs[8];
+ __u16 skb_ports[2];
+ __u16 eth_protocol;
+ __u8 bind_inany;
+ /*
+  * Zero-length marker; presumably the fields above it are the ones
+  * compared for equality - TODO confirm.
+  */
+ __u8 equal_check_end[0];
+
+ __u32 len;
+ __u32 hash;
+};
+
+#endif
diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id.sh b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh
new file mode 100755
index 000000000..a9bc6f82a
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_skb_cgroup_id.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Facebook
+
+set -eu
+
+# Ping the link-local all-nodes multicast address on ${TEST_IF} until one ping
+# succeeds, trying up to ${MAX_PING_TRIES} times and sleeping one second
+# between attempts.
+# Globals: MAX_PING_TRIES, PING6, TEST_IF (read)
+# Exits the script with status 1 when no attempt succeeds.
+wait_for_ip()
+{
+	local _i
+	# printf instead of "echo -n": the latter is not portable under /bin/sh.
+	printf '%s' "Wait for testing link-local IP to become available "
+	for _i in $(seq "${MAX_PING_TRIES}"); do
+		printf '.'
+		# PING6 stays unquoted on purpose: it may hold "ping -6".
+		if $PING6 -c 1 -W 1 "ff02::1%${TEST_IF}" >/dev/null 2>&1; then
+			echo " OK"
+			return
+		fi
+		sleep 1
+	done
+	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
+	exit 1
+}
+
+# Create the veth pair, wait for it to answer pings, attach the BPF program to
+# the egress hook of ${TEST_IF} and store the attached program's id.
+# Globals: TEST_IF, TEST_IF_PEER, BPF_PROG_OBJ, BPF_PROG_SECTION (read),
+#          BPF_PROG_ID (written)
+setup()
+{
+	# Create testing interfaces not to interfere with current environment.
+	ip link add dev "${TEST_IF}" type veth peer name "${TEST_IF_PEER}"
+	ip link set "${TEST_IF}" up
+	ip link set "${TEST_IF_PEER}" up
+
+	wait_for_ip
+
+	tc qdisc add dev "${TEST_IF}" clsact
+	tc filter add dev "${TEST_IF}" egress bpf obj "${BPF_PROG_OBJ}" \
+		sec "${BPF_PROG_SECTION}" da
+
+	# Extract the word that follows " id " in the filter listing.
+	BPF_PROG_ID=$(tc filter show dev "${TEST_IF}" egress | \
+			awk '/ id / {sub(/.* id /, "", $0); print($1)}')
+}
+
+# Remove both test interfaces. Errors are ignored on purpose: an interface may
+# never have been created, or may already be gone.
+cleanup()
+{
+	ip link del "${TEST_IF}" 2>/dev/null || :
+	ip link del "${TEST_IF_PEER}" 2>/dev/null || :
+}
+
+# Entry point: install the cleanup trap, set up the test environment and run
+# the user-space checker with the interface name and the BPF program id.
+main()
+{
+	# Clean up on normal exit and on signals 2, 3, 6 and 15.
+	trap cleanup EXIT 2 3 6 15
+	setup
+	"${PROG}" "${TEST_IF}" "${BPF_PROG_ID}"
+}
+
+# Configuration shared by the functions above.
+DIR=$(dirname "$0")
+TEST_IF="test_cgid_1"
+TEST_IF_PEER="test_cgid_2"
+MAX_PING_TRIES=5
+BPF_PROG_OBJ="${DIR}/test_skb_cgroup_id_kern.o"
+BPF_PROG_SECTION="cgroup_id_logger"
+BPF_PROG_ID=0
+PROG="${DIR}/test_skb_cgroup_id_user"
+# Prefer a dedicated ping6 binary; otherwise fall back to "ping -6".
+if type ping6 >/dev/null 2>&1; then
+	PING6="ping6"
+else
+	PING6="ping -6"
+fi
+
+main
diff --git a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
new file mode 100644
index 000000000..4a6430672
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c
@@ -0,0 +1,181 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+#define CGROUP_PATH "/skb_cgroup_test"
+#define NUM_CGROUP_LEVELS 4
+
+/* RFC 4291, Section 2.7.1 */
+#define LINKLOCAL_MULTICAST "ff02::1"
+
+/*
+ * Fill *dst with an IPv6 socket address for ip on port 1025, scoped to the
+ * interface named iface. Returns 0 on success, or -1 (after logging) when ip
+ * does not parse or the interface index cannot be resolved.
+ */
+static int mk_dst_addr(const char *ip, const char *iface,
+		       struct sockaddr_in6 *dst)
+{
+	unsigned int ifindex;
+
+	memset(dst, 0, sizeof(*dst));
+	dst->sin6_family = AF_INET6;
+	dst->sin6_port = htons(1025);
+
+	if (inet_pton(AF_INET6, ip, &dst->sin6_addr) != 1) {
+		log_err("Invalid IPv6: %s", ip);
+		return -1;
+	}
+
+	ifindex = if_nametoindex(iface);
+	dst->sin6_scope_id = ifindex;
+	if (ifindex == 0) {
+		log_err("Failed to get index of iface: %s", iface);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send one UDP datagram carrying "msg" to the link-local all-nodes multicast
+ * address through interface iface. Returns 0 on success and -1 on any
+ * failure; the socket, once created, is always closed.
+ */
+static int send_packet(const char *iface)
+{
+	struct sockaddr_in6 dst;
+	char msg[] = "msg";
+	int ret = -1;
+	int fd = -1;
+
+	if (mk_dst_addr(LINKLOCAL_MULTICAST, iface, &dst))
+		goto done;
+
+	fd = socket(AF_INET6, SOCK_DGRAM, 0);
+	if (fd == -1) {
+		log_err("Failed to create UDP socket");
+		goto done;
+	}
+
+	if (sendto(fd, &msg, sizeof(msg), 0, (const struct sockaddr *)&dst,
+		   sizeof(dst)) == -1) {
+		log_err("Failed to send datagram");
+		goto done;
+	}
+
+	ret = 0;
+done:
+	if (fd >= 0)
+		close(fd);
+	return ret;
+}
+
+/*
+ * Return an fd for the first map reported for the BPF program with id
+ * prog_id, or a negative value on failure (each failure is reported with
+ * log_err()). The temporary program fd is closed before returning.
+ */
+int get_map_fd_by_prog_id(int prog_id)
+{
+ struct bpf_prog_info info = {};
+ __u32 info_len = sizeof(info);
+ __u32 map_ids[1];
+ int prog_fd = -1;
+ int map_fd = -1;
+
+ prog_fd = bpf_prog_get_fd_by_id(prog_id);
+ if (prog_fd < 0) {
+ log_err("Failed to get fd by prog id %d", prog_id);
+ goto err;
+ }
+
+ /* Provide room for exactly one map id, stored in map_ids[]. */
+ info.nr_map_ids = 1;
+ info.map_ids = (__u64) (unsigned long) map_ids;
+
+ if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
+ log_err("Failed to get info by prog fd %d", prog_fd);
+ goto err;
+ }
+
+ if (!info.nr_map_ids) {
+ log_err("No maps found for prog fd %d", prog_fd);
+ goto err;
+ }
+
+ map_fd = bpf_map_get_fd_by_id(map_ids[0]);
+ if (map_fd < 0)
+ log_err("Failed to get fd by map id %d", map_ids[0]);
+ /* Success also falls through here: map_fd holds the result either way. */
+err:
+ if (prog_fd >= 0)
+ close(prog_fd);
+ return map_fd;
+}
+
+/*
+ * Compare the values stored under keys 0..NUM_CGROUP_LEVELS-1 of the map
+ * obtained via get_map_fd_by_prog_id(prog_id) with the expected cgroup ids.
+ * Returns 0 when every level matches and -1 on the first lookup failure or
+ * mismatch.
+ */
+int check_ancestor_cgroup_ids(int prog_id)
+{
+ __u64 actual_ids[NUM_CGROUP_LEVELS], expected_ids[NUM_CGROUP_LEVELS];
+ __u32 level;
+ int err = 0;
+ int map_fd;
+
+ expected_ids[0] = get_cgroup_id("/.."); /* root cgroup */
+ expected_ids[1] = get_cgroup_id("");
+ expected_ids[2] = get_cgroup_id(CGROUP_PATH);
+ expected_ids[3] = 0; /* non-existent cgroup */
+
+ map_fd = get_map_fd_by_prog_id(prog_id);
+ if (map_fd < 0)
+ goto err;
+
+ /* The level number itself is the map key. */
+ for (level = 0; level < NUM_CGROUP_LEVELS; ++level) {
+ if (bpf_map_lookup_elem(map_fd, &level, &actual_ids[level])) {
+ log_err("Failed to lookup key %d", level);
+ goto err;
+ }
+ if (actual_ids[level] != expected_ids[level]) {
+ log_err("%llx (actual) != %llx (expected), level: %u\n",
+ actual_ids[level], expected_ids[level], level);
+ goto err;
+ }
+ }
+
+ goto out;
+err:
+ err = -1;
+out:
+ if (map_fd >= 0)
+ close(map_fd);
+ return err;
+}
+
+/*
+ * Usage: <prog> iface prog_id
+ *
+ * Joins the test cgroup, sends one packet through iface and verifies the
+ * cgroup ids recorded by the BPF program with id prog_id. Prints "[PASS]" or
+ * "[FAIL]" and returns 0 or -1 accordingly.
+ */
+int main(int argc, char **argv)
+{
+	int cgfd = -1;
+	int err = 0;
+
+	if (argc < 3) {
+		fprintf(stderr, "Usage: %s iface prog_id\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	cgfd = cgroup_setup_and_join(CGROUP_PATH);
+	if (cgfd < 0)
+		goto err;
+
+	if (send_packet(argv[1]))
+		goto err;
+
+	if (check_ancestor_cgroup_ids(atoi(argv[2])))
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Only close the cgroup fd when setup actually produced one. */
+	if (cgfd >= 0)
+		close(cgfd);
+	cleanup_cgroup_environment();
+	printf("[%s]\n", err ? "FAIL" : "PASS");
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c
new file mode 100644
index 000000000..9613f7538
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock.c
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <stdio.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <linux/filter.h>
+
+#include <bpf/bpf.h>
+
+#include "cgroup_helpers.h"
+#include <bpf/bpf_endian.h>
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+
+#define CG_PATH "/foo"
+#define MAX_INSNS 512
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+static bool verbose = false;
+
+/* One test case of tests[], consumed by run_test_case(). */
+struct sock_test {
+ const char *descr;
+ /* BPF prog properties */
+ struct bpf_insn insns[MAX_INSNS];
+ enum bpf_attach_type expected_attach_type;
+ enum bpf_attach_type attach_type;
+ /* Socket properties */
+ int domain;
+ int type;
+ /* Endpoint to bind() to */
+ const char *ip;
+ unsigned short port;
+ /* Expected test result */
+ enum {
+ /* loading the program is expected to fail */
+ LOAD_REJECT,
+ /* attaching the program is expected to fail */
+ ATTACH_REJECT,
+ /* binding the socket is expected to fail with EPERM */
+ BIND_REJECT,
+ /* load, attach and bind are all expected to succeed */
+ SUCCESS,
+ } result;
+};
+
+static struct sock_test tests[] = {
+ {
+ "bind4 load with invalid access: src_ip6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip6[0])),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ LOAD_REJECT,
+ },
+ {
+ "bind4 load with invalid access: mark",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, mark)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ LOAD_REJECT,
+ },
+ {
+ "bind6 load with invalid access: src_ip4",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip4)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ LOAD_REJECT,
+ },
+ {
+ "sock_create load with invalid access: src_port",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET_SOCK_CREATE,
+ BPF_CGROUP_INET_SOCK_CREATE,
+ 0,
+ 0,
+ NULL,
+ 0,
+ LOAD_REJECT,
+ },
+ {
+ "sock_create load w/o expected_attach_type (compat mode)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ 0,
+ BPF_CGROUP_INET_SOCK_CREATE,
+ AF_INET,
+ SOCK_STREAM,
+ "127.0.0.1",
+ 8097,
+ SUCCESS,
+ },
+ {
+ "sock_create load w/ expected_attach_type",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET_SOCK_CREATE,
+ BPF_CGROUP_INET_SOCK_CREATE,
+ AF_INET,
+ SOCK_STREAM,
+ "127.0.0.1",
+ 8097,
+ SUCCESS,
+ },
+ {
+ "attach type mismatch bind4 vs bind6",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ ATTACH_REJECT,
+ },
+ {
+ "attach type mismatch bind6 vs bind4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ ATTACH_REJECT,
+ },
+ {
+ "attach type mismatch default vs bind4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ 0,
+ BPF_CGROUP_INET4_POST_BIND,
+ 0,
+ 0,
+ NULL,
+ 0,
+ ATTACH_REJECT,
+ },
+ {
+ "attach type mismatch bind6 vs sock_create",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET_SOCK_CREATE,
+ 0,
+ 0,
+ NULL,
+ 0,
+ ATTACH_REJECT,
+ },
+ {
+ "bind4 reject all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ "0.0.0.0",
+ 0,
+ BIND_REJECT,
+ },
+ {
+ "bind6 reject all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ "::",
+ 0,
+ BIND_REJECT,
+ },
+ {
+ "bind6 deny specific IP & port",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (ip == expected && port == expected) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip6[3])),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+ __bpf_constant_ntohl(0x00000001), 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ "::1",
+ 8193,
+ BIND_REJECT,
+ },
+ {
+ "bind4 allow specific IP & port",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (ip == expected && port == expected) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_ip4)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7,
+ __bpf_constant_ntohl(0x7F000001), 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock, src_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ "127.0.0.1",
+ 4098,
+ SUCCESS,
+ },
+ {
+ "bind4 allow all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET4_POST_BIND,
+ BPF_CGROUP_INET4_POST_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ "0.0.0.0",
+ 0,
+ SUCCESS,
+ },
+ {
+ "bind6 allow all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ BPF_CGROUP_INET6_POST_BIND,
+ BPF_CGROUP_INET6_POST_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ "::",
+ 0,
+ SUCCESS,
+ },
+};
+
+/*
+ * Return the number of instructions in fp, a MAX_INSNS-sized array: one more
+ * than the index of the last instruction whose code or imm is non-zero. The
+ * result is never less than 1.
+ */
+static size_t probe_prog_length(const struct bpf_insn *fp)
+{
+	size_t last = MAX_INSNS - 1;
+
+	/* Walk back over trailing all-zero slots, stopping at index 0. */
+	while (last > 0 && fp[last].code == 0 && fp[last].imm == 0)
+		last--;
+
+	return last + 1;
+}
+
+/*
+ * Load prog as a GPL-licensed BPF_PROG_TYPE_CGROUP_SOCK program with the given
+ * expected attach type. Returns the result of bpf_load_program_xattr(); when
+ * that is negative and verbose is set, the verifier log is printed to stderr.
+ */
+static int load_sock_prog(const struct bpf_insn *prog,
+			  enum bpf_attach_type attach_type)
+{
+	struct bpf_load_program_attr load_attr;
+	int fd;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK;
+	load_attr.expected_attach_type = attach_type;
+	load_attr.insns = prog;
+	load_attr.insns_cnt = probe_prog_length(prog);
+	load_attr.license = "GPL";
+	load_attr.log_level = 2;
+
+	fd = bpf_load_program_xattr(&load_attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	if (fd < 0 && verbose)
+		fprintf(stderr, "%s\n", bpf_log_buf);
+
+	return fd;
+}
+
+/*
+ * Attach progfd to the cgroup cgfd with the given attach type, passing
+ * BPF_F_ALLOW_OVERRIDE. Returns the result of bpf_prog_attach().
+ */
+static int attach_sock_prog(int cgfd, int progfd,
+ enum bpf_attach_type attach_type)
+{
+ return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE);
+}
+
+/*
+ * Create a socket of the given domain/type and bind() it to ip:port.
+ * Returns 0 on success and -1 on any failure (socket creation, unsupported
+ * domain, unparsable ip, or bind()). The socket is closed before returning.
+ *
+ * The caller inspects errno after a failure, so errno is saved before the
+ * cleanup and restored afterwards, and close() is skipped when no socket was
+ * created (close(-1) would overwrite errno with EBADF).
+ */
+static int bind_sock(int domain, int type, const char *ip, unsigned short port)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in6 *addr6;
+	struct sockaddr_in *addr4;
+	int saved_errno;
+	int sockfd = -1;
+	socklen_t len;
+	int err = 0;
+
+	sockfd = socket(domain, type, 0);
+	if (sockfd < 0)
+		goto err;
+
+	memset(&addr, 0, sizeof(addr));
+
+	if (domain == AF_INET) {
+		len = sizeof(struct sockaddr_in);
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = domain;
+		addr4->sin_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1)
+			goto err;
+	} else if (domain == AF_INET6) {
+		len = sizeof(struct sockaddr_in6);
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = domain;
+		addr6->sin6_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1)
+			goto err;
+	} else {
+		goto err;
+	}
+
+	if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1)
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Keep the errno of the failing call intact across the cleanup. */
+	saved_errno = errno;
+	if (sockfd >= 0)
+		close(sockfd);
+	errno = saved_errno;
+	return err;
+}
+
+/*
+ * Run one test case against the cgroup cgfd: load the program, attach it and
+ * bind a socket, comparing the first step that fails (or overall success)
+ * with test->result. Prints the description followed by "[PASS]" or "[FAIL]"
+ * and returns 0 or -1 accordingly.
+ */
+static int run_test_case(int cgfd, const struct sock_test *test)
+{
+	int progfd = -1;
+	int err = 0;
+
+	printf("Test case: %s .. ", test->descr);
+	progfd = load_sock_prog(test->insns, test->expected_attach_type);
+	if (progfd < 0) {
+		if (test->result == LOAD_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (attach_sock_prog(cgfd, progfd, test->attach_type) == -1) {
+		if (test->result == ATTACH_REJECT)
+			goto out;
+		else
+			goto err;
+	}
+
+	if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) {
+		/* sys_bind() may fail for different reasons, errno has to be
+		 * checked to confirm that BPF program rejected it.
+		 */
+		if (test->result == BIND_REJECT && errno == EPERM)
+			goto out;
+		else
+			goto err;
+	}
+
+
+	if (test->result != SUCCESS)
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Nothing to detach or close when the program never loaded. */
+	if (progfd >= 0) {
+		/* Detaching w/o checking return code: best effort attempt. */
+		bpf_prog_detach(cgfd, test->attach_type);
+		close(progfd);
+	}
+	printf("[%s]\n", err ? "FAIL" : "PASS");
+	return err;
+}
+
+/*
+ * Run every entry of tests[] against cgfd, print a pass/fail summary and
+ * return 0 when all passed, -1 otherwise.
+ */
+static int run_tests(int cgfd)
+{
+	int failed = 0;
+	int passed = 0;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); ++idx) {
+		if (run_test_case(cgfd, &tests[idx]) == 0)
+			++passed;
+		else
+			++failed;
+	}
+
+	printf("Summary: %d PASSED, %d FAILED\n", passed, failed);
+	if (failed)
+		return -1;
+	return 0;
+}
+
+/*
+ * Join the test cgroup, run all test cases and tear the cgroup environment
+ * down again. Returns 0 when everything passed and -1 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	int cgfd = -1;
+	int err = 0;
+
+	cgfd = cgroup_setup_and_join(CG_PATH);
+	if (cgfd < 0)
+		goto err;
+
+	if (run_tests(cgfd))
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/* Only close the cgroup fd when setup actually produced one. */
+	if (cgfd >= 0)
+		close(cgfd);
+	cleanup_cgroup_environment();
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c
new file mode 100644
index 000000000..b8c72c1d9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock_addr.c
@@ -0,0 +1,1655 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+
+#include <linux/filter.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+
+#ifndef ENOTSUPP
+# define ENOTSUPP 524
+#endif
+
+#define CG_PATH "/foo"
+#define CONNECT4_PROG_PATH "./connect4_prog.o"
+#define CONNECT6_PROG_PATH "./connect6_prog.o"
+#define SENDMSG4_PROG_PATH "./sendmsg4_prog.o"
+#define SENDMSG6_PROG_PATH "./sendmsg6_prog.o"
+
+#define SERV4_IP "192.168.1.254"
+#define SERV4_REWRITE_IP "127.0.0.1"
+#define SRC4_IP "172.16.0.1"
+#define SRC4_REWRITE_IP "127.0.0.4"
+#define SERV4_PORT 4040
+#define SERV4_REWRITE_PORT 4444
+
+#define SERV6_IP "face:b00c:1234:5678::abcd"
+#define SERV6_REWRITE_IP "::1"
+#define SERV6_V4MAPPED_IP "::ffff:192.168.0.4"
+#define SRC6_IP "::1"
+#define SRC6_REWRITE_IP "::6"
+#define WILDCARD6_IP "::"
+#define SERV6_PORT 6060
+#define SERV6_REWRITE_PORT 6666
+
+#define INET_NTOP_BUF 40
+
+struct sock_addr_test;
+
+typedef int (*load_fn)(const struct sock_addr_test *test);
+typedef int (*info_fn)(int, struct sockaddr *, socklen_t *);
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/*
+ * One entry of tests[]. loadfn is the callback that loads the BPF program for
+ * the test case and receives the test case itself.
+ */
+struct sock_addr_test {
+ const char *descr;
+ /* BPF prog properties */
+ load_fn loadfn;
+ enum bpf_attach_type expected_attach_type;
+ enum bpf_attach_type attach_type;
+ /* Socket properties */
+ int domain;
+ int type;
+ /* IP:port pairs for BPF prog to override */
+ const char *requested_ip;
+ unsigned short requested_port;
+ const char *expected_ip;
+ unsigned short expected_port;
+ const char *expected_src_ip;
+ /* Expected test result */
+ enum {
+ LOAD_REJECT,
+ ATTACH_REJECT,
+ ATTACH_OKAY,
+ SYSCALL_EPERM,
+ SYSCALL_ENOTSUPP,
+ SUCCESS,
+ } expected_result;
+};
+
+static int bind4_prog_load(const struct sock_addr_test *test);
+static int bind6_prog_load(const struct sock_addr_test *test);
+static int connect4_prog_load(const struct sock_addr_test *test);
+static int connect6_prog_load(const struct sock_addr_test *test);
+static int sendmsg_allow_prog_load(const struct sock_addr_test *test);
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test);
+static int recvmsg_allow_prog_load(const struct sock_addr_test *test);
+static int recvmsg_deny_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test);
+static int recvmsg4_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test);
+static int recvmsg6_rw_asm_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test);
+static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test);
+
+static struct sock_addr_test tests[] = {
+ /* bind */
+ {
+ "bind4: load prog with wrong expected attach type",
+ bind4_prog_load,
+ BPF_CGROUP_INET6_BIND,
+ BPF_CGROUP_INET4_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "bind4: attach prog with wrong attach type",
+ bind4_prog_load,
+ BPF_CGROUP_INET4_BIND,
+ BPF_CGROUP_INET6_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "bind4: rewrite IP & TCP port in",
+ bind4_prog_load,
+ BPF_CGROUP_INET4_BIND,
+ BPF_CGROUP_INET4_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ NULL,
+ SUCCESS,
+ },
+ {
+ "bind4: rewrite IP & UDP port in",
+ bind4_prog_load,
+ BPF_CGROUP_INET4_BIND,
+ BPF_CGROUP_INET4_BIND,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ NULL,
+ SUCCESS,
+ },
+ {
+ "bind6: load prog with wrong expected attach type",
+ bind6_prog_load,
+ BPF_CGROUP_INET4_BIND,
+ BPF_CGROUP_INET6_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "bind6: attach prog with wrong attach type",
+ bind6_prog_load,
+ BPF_CGROUP_INET6_BIND,
+ BPF_CGROUP_INET4_BIND,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "bind6: rewrite IP & TCP port in",
+ bind6_prog_load,
+ BPF_CGROUP_INET6_BIND,
+ BPF_CGROUP_INET6_BIND,
+ AF_INET6,
+ SOCK_STREAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ NULL,
+ SUCCESS,
+ },
+ {
+ "bind6: rewrite IP & UDP port in",
+ bind6_prog_load,
+ BPF_CGROUP_INET6_BIND,
+ BPF_CGROUP_INET6_BIND,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ NULL,
+ SUCCESS,
+ },
+
+ /* connect */
+ {
+ "connect4: load prog with wrong expected attach type",
+ connect4_prog_load,
+ BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET4_CONNECT,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "connect4: attach prog with wrong attach type",
+ connect4_prog_load,
+ BPF_CGROUP_INET4_CONNECT,
+ BPF_CGROUP_INET6_CONNECT,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "connect4: rewrite IP & TCP port",
+ connect4_prog_load,
+ BPF_CGROUP_INET4_CONNECT,
+ BPF_CGROUP_INET4_CONNECT,
+ AF_INET,
+ SOCK_STREAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SRC4_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "connect4: rewrite IP & UDP port",
+ connect4_prog_load,
+ BPF_CGROUP_INET4_CONNECT,
+ BPF_CGROUP_INET4_CONNECT,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SRC4_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "connect6: load prog with wrong expected attach type",
+ connect6_prog_load,
+ BPF_CGROUP_INET4_CONNECT,
+ BPF_CGROUP_INET6_CONNECT,
+ AF_INET6,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "connect6: attach prog with wrong attach type",
+ connect6_prog_load,
+ BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET4_CONNECT,
+ AF_INET,
+ SOCK_STREAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "connect6: rewrite IP & TCP port",
+ connect6_prog_load,
+ BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET6_CONNECT,
+ AF_INET6,
+ SOCK_STREAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "connect6: rewrite IP & UDP port",
+ connect6_prog_load,
+ BPF_CGROUP_INET6_CONNECT,
+ BPF_CGROUP_INET6_CONNECT,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SUCCESS,
+ },
+
+ /* sendmsg */
+ {
+ "sendmsg4: load prog with wrong expected attach type",
+ sendmsg4_rw_asm_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP4_SENDMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "sendmsg4: attach prog with wrong attach type",
+ sendmsg4_rw_asm_prog_load,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "sendmsg4: rewrite IP & port (asm)",
+ sendmsg4_rw_asm_prog_load,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP4_SENDMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SRC4_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg4: rewrite IP & port (C)",
+ sendmsg4_rw_c_prog_load,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP4_SENDMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SRC4_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg4: deny call",
+ sendmsg_deny_prog_load,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP4_SENDMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_IP,
+ SERV4_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SRC4_REWRITE_IP,
+ SYSCALL_EPERM,
+ },
+ {
+ "sendmsg6: load prog with wrong expected attach type",
+ sendmsg6_rw_asm_prog_load,
+ BPF_CGROUP_UDP4_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "sendmsg6: attach prog with wrong attach type",
+ sendmsg6_rw_asm_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP4_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_REJECT,
+ },
+ {
+ "sendmsg6: rewrite IP & port (asm)",
+ sendmsg6_rw_asm_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg6: rewrite IP & port (C)",
+ sendmsg6_rw_c_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg6: IPv4-mapped IPv6",
+ sendmsg6_rw_v4mapped_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SYSCALL_ENOTSUPP,
+ },
+ {
+ "sendmsg6: set dst IP = [::] (BSD'ism)",
+ sendmsg6_rw_wildcard_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg6: preserve dst IP = [::] (BSD'ism)",
+ sendmsg_allow_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ WILDCARD6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_PORT,
+ SRC6_IP,
+ SUCCESS,
+ },
+ {
+ "sendmsg6: deny call",
+ sendmsg_deny_prog_load,
+ BPF_CGROUP_UDP6_SENDMSG,
+ BPF_CGROUP_UDP6_SENDMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_IP,
+ SERV6_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SRC6_REWRITE_IP,
+ SYSCALL_EPERM,
+ },
+
+ /* recvmsg */
+ {
+ "recvmsg4: return code ok",
+ recvmsg_allow_prog_load,
+ BPF_CGROUP_UDP4_RECVMSG,
+ BPF_CGROUP_UDP4_RECVMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_OKAY,
+ },
+ {
+ "recvmsg4: return code !ok",
+ recvmsg_deny_prog_load,
+ BPF_CGROUP_UDP4_RECVMSG,
+ BPF_CGROUP_UDP4_RECVMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "recvmsg6: return code ok",
+ recvmsg_allow_prog_load,
+ BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_UDP6_RECVMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ ATTACH_OKAY,
+ },
+ {
+ "recvmsg6: return code !ok",
+ recvmsg_deny_prog_load,
+ BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_UDP6_RECVMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ NULL,
+ 0,
+ NULL,
+ 0,
+ NULL,
+ LOAD_REJECT,
+ },
+ {
+ "recvmsg4: rewrite IP & port (asm)",
+ recvmsg4_rw_asm_prog_load,
+ BPF_CGROUP_UDP4_RECVMSG,
+ BPF_CGROUP_UDP4_RECVMSG,
+ AF_INET,
+ SOCK_DGRAM,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SERV4_REWRITE_IP,
+ SERV4_REWRITE_PORT,
+ SERV4_IP,
+ SUCCESS,
+ },
+ {
+ "recvmsg6: rewrite IP & port (asm)",
+ recvmsg6_rw_asm_prog_load,
+ BPF_CGROUP_UDP6_RECVMSG,
+ BPF_CGROUP_UDP6_RECVMSG,
+ AF_INET6,
+ SOCK_DGRAM,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SERV6_REWRITE_IP,
+ SERV6_REWRITE_PORT,
+ SERV6_IP,
+ SUCCESS,
+ },
+};
+
+/*
+ * Fill @addr with an AF_INET or AF_INET6 socket address built from the
+ * textual @ip and the host-order @port.
+ *
+ * @addr_len is the size of the buffer behind @addr. The whole buffer is
+ * zeroed first and must be at least as large as the family's sockaddr type.
+ *
+ * Returns 0 on success, -1 on unsupported family, short buffer or bad @ip.
+ */
+static int mk_sockaddr(int domain, const char *ip, unsigned short port,
+		       struct sockaddr *addr, socklen_t addr_len)
+{
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_in *sin;
+
+	switch (domain) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	default:
+		log_err("Unsupported address family");
+		return -1;
+	}
+
+	memset(addr, 0, addr_len);
+
+	if (domain == AF_INET6) {
+		if (addr_len < sizeof(struct sockaddr_in6))
+			return -1;
+
+		sin6 = (struct sockaddr_in6 *)addr;
+		sin6->sin6_family = domain;
+		sin6->sin6_port = htons(port);
+		if (inet_pton(domain, ip, (void *)&sin6->sin6_addr) == 1)
+			return 0;
+
+		log_err("Invalid IPv6: %s", ip);
+		return -1;
+	}
+
+	/* Only AF_INET can reach this point. */
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -1;
+
+	sin = (struct sockaddr_in *)addr;
+	sin->sin_family = domain;
+	sin->sin_port = htons(port);
+	if (inet_pton(domain, ip, (void *)&sin->sin_addr) == 1)
+		return 0;
+
+	log_err("Invalid IPv4: %s", ip);
+	return -1;
+}
+
+/*
+ * Load @insns_cnt raw instructions from @insns as a cgroup sock_addr
+ * program, using the expected attach type of @test.
+ *
+ * The verifier log is printed only when the load fails and the test case
+ * did not expect a load rejection.
+ *
+ * Returns whatever bpf_load_program_xattr() returned.
+ */
+static int load_insns(const struct sock_addr_test *test,
+		      const struct bpf_insn *insns, size_t insns_cnt)
+{
+	struct bpf_load_program_attr attr;
+	int fd;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+	attr.expected_attach_type = test->expected_attach_type;
+	attr.insns = insns;
+	attr.insns_cnt = insns_cnt;
+	attr.license = "GPL";
+
+	fd = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+	if (fd >= 0 || test->expected_result == LOAD_REJECT)
+		return fd;
+
+	log_err(">>> Loading program error.\n"
+		">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
+	return fd;
+}
+
+/* [1] These testing programs try to read different context fields, including
+ * narrow loads of different sizes from user_ip4 and user_ip6, and write to
+ * those allowed to be overridden.
+ *
+ * [2] BPF_LD_IMM64 & BPF_JMP_REG are used below whenever there is a need to
+ * compare a register with an unsigned 32-bit integer. BPF_JMP_IMM can't be
+ * used in such cases since it accepts only a _signed_ 32-bit integer as its
+ * IMM argument. Also note that BPF_LD_IMM64 expands to 2 instructions, which
+ * matters when counting jump offsets.
+ */
+
+static int bind4_prog_load(const struct sock_addr_test *test)
+{ /* Build and load the IPv4 bind test program from raw instructions.
+ *
+ * The program checks that the socket is AF_INET, that its type is
+ * SOCK_DGRAM or SOCK_STREAM, and that user_ip4/user_port match
+ * SERV4_IP/SERV4_PORT (read with byte, half-word and word loads). When
+ * every check passes it overwrites user_ip4 and user_port with
+ * SERV4_REWRITE_IP/SERV4_REWRITE_PORT. It returns 1 in either case.
+ *
+ * Returns the result of load_insns(), or -1 if an address can't be built.
+ */
+ union {
+ uint8_t u4_addr8[4];
+ uint16_t u4_addr16[2];
+ uint32_t u4_addr32;
+ } ip4, port;
+ struct sockaddr_in addr4_rw;
+
+ if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) {
+ log_err("Invalid IPv4: %s", SERV4_IP);
+ return -1;
+ }
+
+ port.u4_addr32 = htons(SERV4_PORT);
+
+ if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT,
+ (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1)
+ return -1;
+
+ /* See [1].
+ * Every conditional jump below that carries a large offset targets the
+ * final "return 1"; the offsets are hand-counted, so adding or removing
+ * an instruction means adjusting all jumps that precede it.
+ */
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 32),
+
+ /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, type)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1),
+ BPF_JMP_A(1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 28),
+
+ /* 1st_byte_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 26),
+
+ /* 2nd_byte_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4) + 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[1], 24),
+
+ /* 3rd_byte_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4) + 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[2], 22),
+
+ /* 4th_byte_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4) + 3),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[3], 20),
+
+ /* 1st_half_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 18),
+
+ /* 2nd_half_of_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4) + 2),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[1], 16),
+
+ /* whole_user_ip4 == expected && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+ BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */
+ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 12),
+
+ /* 1st_byte_of_user_port == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr8[0], 10),
+
+ /* 1st_half_of_user_port == expected && */
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_port)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, port.u4_addr16[0], 8),
+
+ /* user_port == expected) { */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_port)),
+ BPF_LD_IMM64(BPF_REG_8, port.u4_addr32), /* See [2]. */
+ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4),
+
+ /* user_ip4 = addr4_rw.sin_addr */
+ BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_addr.s_addr),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+
+ /* user_port = addr4_rw.sin_port */
+ BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+ /* } */
+
+ /* return 1 (also the target of every failed check above) */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int bind6_prog_load(const struct sock_addr_test *test)
+{ /* Build and load the IPv6 bind test program from raw instructions.
+ *
+ * The program checks that the socket is AF_INET6 and that selected parts
+ * of user_ip6 (one byte, one half-word and the last word) match SERV6_IP.
+ * When every check passes it overwrites user_ip6 and user_port with
+ * SERV6_REWRITE_IP/SERV6_REWRITE_PORT. It returns 1 in either case.
+ *
+ * Returns the result of load_insns(), or -1 if an address can't be built.
+ */
+ struct sockaddr_in6 addr6_rw;
+ struct in6_addr ip6;
+
+ if (inet_pton(AF_INET6, SERV6_IP, (void *)&ip6) != 1) {
+ log_err("Invalid IPv6: %s", SERV6_IP);
+ return -1;
+ }
+
+ if (mk_sockaddr(AF_INET6, SERV6_REWRITE_IP, SERV6_REWRITE_PORT,
+ (struct sockaddr *)&addr6_rw, sizeof(addr6_rw)) == -1)
+ return -1;
+
+ /* See [1].
+ * All conditional jumps below target the final "return 1"; the offsets
+ * are hand-counted (each STORE_IPV6_WORD() is 2 instructions).
+ */
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET6 && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18),
+
+ /* 5th_byte_of_user_ip6 == expected && */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip6[1])),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr[4], 16),
+
+ /* 3rd_half_of_user_ip6 == expected && */
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip6[1])),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr16[2], 14),
+
+ /* last_word_of_user_ip6 == expected) { */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, user_ip6[3])),
+ BPF_LD_IMM64(BPF_REG_8, ip6.s6_addr32[3]), /* See [2]. */
+ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 10),
+
+
+#define STORE_IPV6_WORD(N) \
+ BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_addr.s6_addr32[N]), \
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \
+ offsetof(struct bpf_sock_addr, user_ip6[N]))
+
+ /* user_ip6 = addr6_rw.sin6_addr, one 32-bit word at a time */
+ STORE_IPV6_WORD(0),
+ STORE_IPV6_WORD(1),
+ STORE_IPV6_WORD(2),
+ STORE_IPV6_WORD(3),
+
+ /* user_port = addr6_rw.sin6_port */
+ BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+
+ /* } */
+
+ /* return 1 (also the target of every failed check above) */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+/*
+ * Load a cgroup sock_addr program from the object file at @path, using the
+ * expected attach type of @test and the BPF_F_TEST_RND_HI32 program flag.
+ *
+ * A load failure is logged unless the test case expects LOAD_REJECT.
+ *
+ * Returns the program fd on success, -1 on failure.
+ */
+static int load_path(const struct sock_addr_test *test, const char *path)
+{
+	struct bpf_prog_load_attr load_attr;
+	struct bpf_object *obj;
+	int fd;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.file = path;
+	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+	load_attr.expected_attach_type = test->expected_attach_type;
+	load_attr.prog_flags = BPF_F_TEST_RND_HI32;
+
+	if (bpf_prog_load_xattr(&load_attr, &obj, &fd) == 0)
+		return fd;
+
+	if (test->expected_result != LOAD_REJECT)
+		log_err(">>> Loading program (%s) error.\n", path);
+	return -1;
+}
+
+/* Load the connect4 program from the object file at CONNECT4_PROG_PATH. */
+static int connect4_prog_load(const struct sock_addr_test *test)
+{
+	const char *obj_path = CONNECT4_PROG_PATH;
+
+	return load_path(test, obj_path);
+}
+
+/* Load the connect6 program from the object file at CONNECT6_PROG_PATH. */
+static int connect6_prog_load(const struct sock_addr_test *test)
+{
+	const char *obj_path = CONNECT6_PROG_PATH;
+
+	return load_path(test, obj_path);
+}
+
+/*
+ * Load a two-instruction program that does nothing except return @rc.
+ * Returns the result of load_insns().
+ */
+static int xmsg_ret_only_prog_load(const struct sock_addr_test *test,
+				   int32_t rc)
+{
+	struct bpf_insn prog[] = {
+		BPF_MOV64_IMM(BPF_REG_0, rc),	/* r0 = rc */
+		BPF_EXIT_INSN(),		/* return r0 */
+	};
+	size_t prog_len = sizeof(prog) / sizeof(prog[0]);
+
+	return load_insns(test, prog, prog_len);
+}
+
+/* sendmsg program that only returns 1. */
+static int sendmsg_allow_prog_load(const struct sock_addr_test *test)
+{
+	const int32_t rc = 1;
+
+	return xmsg_ret_only_prog_load(test, rc);
+}
+
+/* sendmsg program that only returns 0. */
+static int sendmsg_deny_prog_load(const struct sock_addr_test *test)
+{
+	const int32_t rc = 0;
+
+	return xmsg_ret_only_prog_load(test, rc);
+}
+
+/* recvmsg program that only returns 1. */
+static int recvmsg_allow_prog_load(const struct sock_addr_test *test)
+{
+	const int32_t rc = 1;
+
+	return xmsg_ret_only_prog_load(test, rc);
+}
+
+/* recvmsg program that only returns 0. */
+static int recvmsg_deny_prog_load(const struct sock_addr_test *test)
+{
+	const int32_t rc = 0;
+
+	return xmsg_ret_only_prog_load(test, rc);
+}
+
+static int sendmsg4_rw_asm_prog_load(const struct sock_addr_test *test)
+{ /* Build and load the IPv4 sendmsg rewrite program from raw instructions.
+ *
+ * For AF_INET/SOCK_DGRAM sockets the program sets msg_src_ip4 to
+ * SRC4_REWRITE_IP and user_ip4/user_port to
+ * SERV4_REWRITE_IP/SERV4_REWRITE_PORT. It returns 1 in either case.
+ *
+ * Returns the result of load_insns(), or -1 if an address can't be built.
+ */
+ struct sockaddr_in dst4_rw_addr;
+ struct in_addr src4_rw_ip;
+
+ if (inet_pton(AF_INET, SRC4_REWRITE_IP, (void *)&src4_rw_ip) != 1) {
+ log_err("Invalid IPv4: %s", SRC4_REWRITE_IP);
+ return -1;
+ }
+
+ if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT,
+ (struct sockaddr *)&dst4_rw_addr,
+ sizeof(dst4_rw_addr)) == -1)
+ return -1;
+
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 8),
+
+ /* sk.type == SOCK_DGRAM) { */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, type)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 6),
+
+ /* msg_src_ip4 = src4_rw_ip */
+ BPF_MOV32_IMM(BPF_REG_7, src4_rw_ip.s_addr),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, msg_src_ip4)),
+
+ /* user_ip4 = dst4_rw_addr.sin_addr */
+ BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_addr.s_addr),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+
+ /* user_port = dst4_rw_addr.sin_port */
+ BPF_MOV32_IMM(BPF_REG_7, dst4_rw_addr.sin_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+ /* } */
+
+ /* return 1 (both jumps above land here, skipping the rewrite) */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+static int recvmsg4_rw_asm_prog_load(const struct sock_addr_test *test)
+{ /* Build and load the IPv4 recvmsg rewrite program from raw instructions.
+ *
+ * For AF_INET/SOCK_DGRAM sockets the program sets user_ip4/user_port to
+ * SERV4_IP/SERV4_PORT. It returns 1 in either case.
+ *
+ * Returns the result of load_insns(), or -1 if the address can't be built.
+ */
+ struct sockaddr_in src4_rw_addr;
+
+ if (mk_sockaddr(AF_INET, SERV4_IP, SERV4_PORT,
+ (struct sockaddr *)&src4_rw_addr,
+ sizeof(src4_rw_addr)) == -1)
+ return -1;
+
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET && */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 6),
+
+ /* sk.type == SOCK_DGRAM) { */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, type)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 4),
+
+ /* user_ip4 = src4_rw_addr.sin_addr */
+ BPF_MOV32_IMM(BPF_REG_7, src4_rw_addr.sin_addr.s_addr),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_ip4)),
+
+ /* user_port = src4_rw_addr.sin_port */
+ BPF_MOV32_IMM(BPF_REG_7, src4_rw_addr.sin_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+ /* } */
+
+ /* return 1 (both jumps above land here, skipping the rewrite) */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+/* Load the sendmsg4 program from the object file at SENDMSG4_PROG_PATH. */
+static int sendmsg4_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	const char *obj_path = SENDMSG4_PROG_PATH;
+
+	return load_path(test, obj_path);
+}
+
+static int sendmsg6_rw_dst_asm_prog_load(const struct sock_addr_test *test,
+ const char *rw_dst_ip)
+{ /* Build and load the IPv6 sendmsg rewrite program from raw instructions.
+ *
+ * For AF_INET6 sockets the program sets msg_src_ip6 to SRC6_REWRITE_IP,
+ * user_ip6 to @rw_dst_ip and user_port to SERV6_REWRITE_PORT. It returns 1
+ * in either case.
+ *
+ * Returns the result of load_insns(), or -1 if an address can't be built.
+ */
+ struct sockaddr_in6 dst6_rw_addr;
+ struct in6_addr src6_rw_ip;
+
+ if (inet_pton(AF_INET6, SRC6_REWRITE_IP, (void *)&src6_rw_ip) != 1) {
+ log_err("Invalid IPv6: %s", SRC6_REWRITE_IP);
+ return -1;
+ }
+
+ if (mk_sockaddr(AF_INET6, rw_dst_ip, SERV6_REWRITE_PORT,
+ (struct sockaddr *)&dst6_rw_addr,
+ sizeof(dst6_rw_addr)) == -1)
+ return -1;
+
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET6) {
+ * The jump skips 2 * 8 store instructions plus the 2 port
+ * instructions and lands on the final "return 1".
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18),
+
+#define STORE_IPV6_WORD_N(DST, SRC, N) \
+ BPF_MOV32_IMM(BPF_REG_7, SRC[N]), \
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \
+ offsetof(struct bpf_sock_addr, DST[N]))
+
+#define STORE_IPV6(DST, SRC) \
+ STORE_IPV6_WORD_N(DST, SRC, 0), \
+ STORE_IPV6_WORD_N(DST, SRC, 1), \
+ STORE_IPV6_WORD_N(DST, SRC, 2), \
+ STORE_IPV6_WORD_N(DST, SRC, 3)
+
+ STORE_IPV6(msg_src_ip6, src6_rw_ip.s6_addr32),
+ STORE_IPV6(user_ip6, dst6_rw_addr.sin6_addr.s6_addr32),
+
+ /* user_port = dst6_rw_addr.sin6_port */
+ BPF_MOV32_IMM(BPF_REG_7, dst6_rw_addr.sin6_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+
+ /* } */
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+/* sendmsg6 rewrite program whose destination IP is SERV6_REWRITE_IP. */
+static int sendmsg6_rw_asm_prog_load(const struct sock_addr_test *test)
+{
+	const char *dst_ip = SERV6_REWRITE_IP;
+
+	return sendmsg6_rw_dst_asm_prog_load(test, dst_ip);
+}
+
+static int recvmsg6_rw_asm_prog_load(const struct sock_addr_test *test)
+{ /* Build and load the IPv6 recvmsg rewrite program from raw instructions.
+ *
+ * For AF_INET6 sockets the program sets user_ip6/user_port to
+ * SERV6_IP/SERV6_PORT. It returns 1 in either case.
+ *
+ * Returns the result of load_insns(), or -1 if the address can't be built.
+ */
+ struct sockaddr_in6 src6_rw_addr;
+
+ if (mk_sockaddr(AF_INET6, SERV6_IP, SERV6_PORT,
+ (struct sockaddr *)&src6_rw_addr,
+ sizeof(src6_rw_addr)) == -1)
+ return -1;
+
+ struct bpf_insn insns[] = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+
+ /* if (sk.family == AF_INET6) {
+ * The jump skips the 8 store instructions plus the 2 port
+ * instructions and lands on the final "return 1".
+ */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6,
+ offsetof(struct bpf_sock_addr, family)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 10),
+
+ STORE_IPV6(user_ip6, src6_rw_addr.sin6_addr.s6_addr32),
+
+ /* user_port = src6_rw_addr.sin6_port */
+ BPF_MOV32_IMM(BPF_REG_7, src6_rw_addr.sin6_port),
+ BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7,
+ offsetof(struct bpf_sock_addr, user_port)),
+ /* } */
+
+ /* return 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ };
+
+ return load_insns(test, insns, sizeof(insns) / sizeof(struct bpf_insn));
+}
+
+/* sendmsg6 rewrite program whose destination IP is SERV6_V4MAPPED_IP. */
+static int sendmsg6_rw_v4mapped_prog_load(const struct sock_addr_test *test)
+{
+	const char *dst_ip = SERV6_V4MAPPED_IP;
+
+	return sendmsg6_rw_dst_asm_prog_load(test, dst_ip);
+}
+
+/* sendmsg6 rewrite program whose destination IP is WILDCARD6_IP. */
+static int sendmsg6_rw_wildcard_prog_load(const struct sock_addr_test *test)
+{
+	const char *dst_ip = WILDCARD6_IP;
+
+	return sendmsg6_rw_dst_asm_prog_load(test, dst_ip);
+}
+
+/* Load the sendmsg6 program from the object file at SENDMSG6_PROG_PATH. */
+static int sendmsg6_rw_c_prog_load(const struct sock_addr_test *test)
+{
+	const char *obj_path = SENDMSG6_PROG_PATH;
+
+	return load_path(test, obj_path);
+}
+
+/*
+ * Compare two socket addresses of family AF_INET or AF_INET6.
+ *
+ * Ports are compared only when @cmp_port is non-zero.
+ *
+ * Returns 0 when the addresses match, 1 when they differ, and -1 when the
+ * families differ or the family is neither AF_INET nor AF_INET6.
+ */
+static int cmp_addr(const struct sockaddr_storage *addr1,
+		    const struct sockaddr_storage *addr2, int cmp_port)
+{
+	const struct sockaddr_in6 *a6, *b6;
+	const struct sockaddr_in *a4, *b4;
+
+	if (addr1->ss_family != addr2->ss_family)
+		return -1;
+
+	switch (addr1->ss_family) {
+	case AF_INET:
+		a4 = (const struct sockaddr_in *)addr1;
+		b4 = (const struct sockaddr_in *)addr2;
+		if (cmp_port && a4->sin_port != b4->sin_port)
+			return 1;
+		return a4->sin_addr.s_addr != b4->sin_addr.s_addr;
+	case AF_INET6:
+		a6 = (const struct sockaddr_in6 *)addr1;
+		b6 = (const struct sockaddr_in6 *)addr2;
+		if (cmp_port && a6->sin6_port != b6->sin6_port)
+			return 1;
+		return memcmp(&a6->sin6_addr, &b6->sin6_addr,
+			      sizeof(struct in6_addr)) != 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Fetch an address of socket @sock1 through @fn (getsockname() or
+ * getpeername() style callback) and compare it with @addr2 via cmp_addr().
+ *
+ * Returns -1 if @fn fails, otherwise the result of cmp_addr().
+ */
+static int cmp_sock_addr(info_fn fn, int sock1,
+			 const struct sockaddr_storage *addr2, int cmp_port)
+{
+	struct sockaddr_storage queried;
+	socklen_t queried_len = sizeof(queried);
+
+	memset(&queried, 0, queried_len);
+
+	if (fn(sock1, (struct sockaddr *)&queried, (socklen_t *)&queried_len))
+		return -1;
+
+	return cmp_addr(&queried, addr2, cmp_port);
+}
+
+/* Compare only the IP of @sock1's local address with @addr2. */
+static int cmp_local_ip(int sock1, const struct sockaddr_storage *addr2)
+{
+	const int cmp_port = 0;
+
+	return cmp_sock_addr(getsockname, sock1, addr2, cmp_port);
+}
+
+/* Compare IP and port of @sock1's local address with @addr2. */
+static int cmp_local_addr(int sock1, const struct sockaddr_storage *addr2)
+{
+	const int cmp_port = 1;
+
+	return cmp_sock_addr(getsockname, sock1, addr2, cmp_port);
+}
+
+/* Compare IP and port of @sock1's peer address with @addr2. */
+static int cmp_peer_addr(int sock1, const struct sockaddr_storage *addr2)
+{
+	const int cmp_port = 1;
+
+	return cmp_sock_addr(getpeername, sock1, addr2, cmp_port);
+}
+
+/*
+ * Create a socket of @type in @addr's family and bind it to @addr. Stream
+ * sockets are additionally put into listening state.
+ *
+ * Returns the socket fd, or -1 on failure (the socket is closed then).
+ */
+static int start_server(int type, const struct sockaddr_storage *addr,
+			socklen_t addr_len)
+{
+	int sock;
+
+	sock = socket(addr->ss_family, type, 0);
+	if (sock == -1) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	if (bind(sock, (const struct sockaddr *)addr, addr_len) == -1) {
+		log_err("Failed to bind server socket");
+		close(sock);
+		return -1;
+	}
+
+	if (type == SOCK_STREAM && listen(sock, 128) == -1) {
+		log_err("Failed to listen on server socket");
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
+
+/*
+ * Create a socket of @type in @addr's family (AF_INET or AF_INET6 only)
+ * and connect it to @addr.
+ *
+ * Returns the connected socket fd, or -1 on failure. The socket is closed
+ * on failure, but only if it was actually created: close() is no longer
+ * called with an invalid descriptor.
+ */
+static int connect_to_server(int type, const struct sockaddr_storage *addr,
+			     socklen_t addr_len)
+{
+	int domain = addr->ss_family;
+	int fd;
+
+	if (domain != AF_INET && domain != AF_INET6) {
+		log_err("Unsupported address family");
+		return -1;
+	}
+
+	fd = socket(domain, type, 0);
+	if (fd == -1) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) {
+		log_err("Fail to connect to server");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Initialise @cmsg as an IP_PKTINFO (AF_INET) or IPV6_PKTINFO (AF_INET6)
+ * control message. The pktinfo payload is zeroed and then its source
+ * address field is set to SRC4_IP or SRC6_IP respectively.
+ *
+ * Returns 0 on success, -1 on unsupported @domain or unparsable address.
+ */
+int init_pktinfo(int domain, struct cmsghdr *cmsg)
+{
+	struct in6_pktinfo *info6;
+	struct in_pktinfo *info4;
+
+	switch (domain) {
+	case AF_INET:
+		cmsg->cmsg_level = SOL_IP;
+		cmsg->cmsg_type = IP_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
+		info4 = (struct in_pktinfo *)CMSG_DATA(cmsg);
+		memset(info4, 0, sizeof(struct in_pktinfo));
+		if (inet_pton(domain, SRC4_IP,
+			      (void *)&info4->ipi_spec_dst) != 1)
+			return -1;
+		return 0;
+	case AF_INET6:
+		cmsg->cmsg_level = SOL_IPV6;
+		cmsg->cmsg_type = IPV6_PKTINFO;
+		cmsg->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
+		info6 = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+		memset(info6, 0, sizeof(struct in6_pktinfo));
+		if (inet_pton(domain, SRC6_IP,
+			      (void *)&info6->ipi6_addr) != 1)
+			return -1;
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Create a socket of @type in @addr's family (AF_INET or AF_INET6 only)
+ * and send one byte to @addr with sendmsg(2) using @flags.
+ *
+ * When @set_cmsg is non-zero a pktinfo control message prepared by
+ * init_pktinfo() is attached to the message.
+ *
+ * If sendmsg() fails, its errno is stored in *@syscall_err. The value is
+ * captured before logging, because the logging call itself may modify
+ * errno. *@syscall_err is left untouched on every other path.
+ *
+ * Returns the socket fd on success, -1 on failure (the socket, if one was
+ * created, is closed then).
+ */
+static int sendmsg_to_server(int type, const struct sockaddr_storage *addr,
+			     socklen_t addr_len, int set_cmsg, int flags,
+			     int *syscall_err)
+{
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in6_pktinfo))];
+		struct cmsghdr align;
+	} control6;
+	union {
+		char buf[CMSG_SPACE(sizeof(struct in_pktinfo))];
+		struct cmsghdr align;
+	} control4;
+	struct msghdr hdr;
+	struct iovec iov;
+	char data = 'a';
+	int saved_errno;
+	int domain;
+	int fd = -1;
+
+	domain = addr->ss_family;
+
+	if (domain != AF_INET && domain != AF_INET6) {
+		log_err("Unsupported address family");
+		goto err;
+	}
+
+	fd = socket(domain, type, 0);
+	if (fd == -1) {
+		log_err("Failed to create client socket");
+		goto err;
+	}
+
+	memset(&iov, 0, sizeof(iov));
+	iov.iov_base = &data;
+	iov.iov_len = sizeof(data);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = (void *)addr;
+	hdr.msg_namelen = addr_len;
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+
+	if (set_cmsg) {
+		if (domain == AF_INET) {
+			hdr.msg_control = &control4;
+			hdr.msg_controllen = sizeof(control4.buf);
+		} else if (domain == AF_INET6) {
+			hdr.msg_control = &control6;
+			hdr.msg_controllen = sizeof(control6.buf);
+		}
+		if (init_pktinfo(domain, CMSG_FIRSTHDR(&hdr))) {
+			log_err("Fail to init pktinfo");
+			goto err;
+		}
+	}
+
+	if (sendmsg(fd, &hdr, flags) != sizeof(data)) {
+		/* Save errno first: logging may overwrite it. */
+		saved_errno = errno;
+		log_err("Fail to send message to server");
+		*syscall_err = saved_errno;
+		goto err;
+	}
+
+	goto out;
+err:
+	/* fd is still -1 if we failed before or while creating the socket. */
+	if (fd != -1)
+		close(fd);
+	fd = -1;
+out:
+	return fd;
+}
+
+/*
+ * Open a stream connection to @addr by sending data with MSG_FASTOPEN and
+ * no control message. The errno reported by sendmsg_to_server() is
+ * discarded.
+ *
+ * Returns the socket fd, or -1 on failure.
+ */
+static int fastconnect_to_server(const struct sockaddr_storage *addr,
+				 socklen_t addr_len)
+{
+	int ignored_errno;
+	int fd;
+
+	fd = sendmsg_to_server(SOCK_STREAM, addr, addr_len, /*set_cmsg*/0,
+			       MSG_FASTOPEN, &ignored_errno);
+	return fd;
+}
+
+/*
+ * Wait up to 2 seconds for @sockfd to become readable, then receive one
+ * message from it, storing the sender's address in @src_addr.
+ *
+ * Returns the result of recvmsg(), or -1 if the socket did not become
+ * readable in time.
+ */
+static int recvmsg_from_client(int sockfd, struct sockaddr_storage *src_addr)
+{
+	struct timeval timeout = { .tv_sec = 2, .tv_usec = 0 };
+	char payload[64];
+	struct iovec vec = {
+		.iov_base = payload,
+		.iov_len = sizeof(payload),
+	};
+	struct msghdr msg = {
+		.msg_name = src_addr,
+		.msg_namelen = sizeof(struct sockaddr_storage),
+		.msg_iov = &vec,
+		.msg_iovlen = 1,
+	};
+	fd_set readable;
+	int nready;
+
+	FD_ZERO(&readable);
+	FD_SET(sockfd, &readable);
+
+	nready = select(sockfd + 1, &readable, NULL, NULL, &timeout);
+	if (nready <= 0)
+		return -1;
+	if (!FD_ISSET(sockfd, &readable))
+		return -1;
+
+	return recvmsg(sockfd, &msg, 0);
+}
+
+/*
+ * Build the socket addresses a test case works with from its textual
+ * description in @test:
+ *   @expected_addr     <- expected_ip:expected_port
+ *   @requested_addr    <- requested_ip:requested_port
+ *   @expected_src_addr <- expected_src_ip with port 0; filled (and
+ *                         dereferenced) only if expected_src_ip is set
+ *
+ * Returns 0 on success, -1 if any address can't be built.
+ */
+static int init_addrs(const struct sock_addr_test *test,
+		      struct sockaddr_storage *requested_addr,
+		      struct sockaddr_storage *expected_addr,
+		      struct sockaddr_storage *expected_src_addr)
+{
+	const socklen_t len = sizeof(struct sockaddr_storage);
+	int rc;
+
+	rc = mk_sockaddr(test->domain, test->expected_ip, test->expected_port,
+			 (struct sockaddr *)expected_addr, len);
+	if (rc == -1)
+		return -1;
+
+	rc = mk_sockaddr(test->domain, test->requested_ip,
+			 test->requested_port,
+			 (struct sockaddr *)requested_addr, len);
+	if (rc == -1)
+		return -1;
+
+	if (!test->expected_src_ip)
+		return 0;
+
+	rc = mk_sockaddr(test->domain, test->expected_src_ip, 0,
+			 (struct sockaddr *)expected_src_addr, len);
+	return rc == -1 ? -1 : 0;
+}
+
+/*
+ * Bind a server to the requested address, check that it actually got bound
+ * to the expected address, and connect to the expected address to make
+ * sure the server is reachable there.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int run_bind_test_case(const struct sock_addr_test *test)
+{
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int err = -1;
+
+	if (init_addrs(test, &requested_addr, &expected_addr, NULL))
+		goto out;
+
+	servfd = start_server(test->type, &requested_addr, addr_len);
+	if (servfd == -1)
+		goto out;
+
+	if (cmp_local_addr(servfd, &expected_addr))
+		goto out;
+
+	/* Try to connect to server just in case */
+	clientfd = connect_to_server(test->type, &expected_addr, addr_len);
+	if (clientfd == -1)
+		goto out;
+
+	err = 0;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
+}
+
+/*
+ * Start a server on the expected address, connect a client to the
+ * requested address and verify that the client's peer address and local IP
+ * are the expected ones. For stream sockets the same check is repeated
+ * with a TCP Fast Open connection.
+ *
+ * Returns 0 on success, -1 on any failure.
+ */
+static int run_connect_test_case(const struct sock_addr_test *test)
+{
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_src_addr;
+	struct sockaddr_storage requested_addr;
+	struct sockaddr_storage expected_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int err = 0;
+
+	if (init_addrs(test, &requested_addr, &expected_addr,
+		       &expected_src_addr))
+		goto err;
+
+	/* Prepare server to connect to */
+	servfd = start_server(test->type, &expected_addr, addr_len);
+	if (servfd == -1)
+		goto err;
+
+	clientfd = connect_to_server(test->type, &requested_addr, addr_len);
+	if (clientfd == -1)
+		goto err;
+
+	/* Make sure src and dst addrs were overridden properly */
+	if (cmp_peer_addr(clientfd, &expected_addr))
+		goto err;
+
+	if (cmp_local_ip(clientfd, &expected_src_addr))
+		goto err;
+
+	if (test->type == SOCK_STREAM) {
+		/*
+		 * Release the first connection before clientfd is reused,
+		 * otherwise its descriptor would be leaked.
+		 */
+		close(clientfd);
+
+		/* Test TCP Fast Open scenario */
+		clientfd = fastconnect_to_server(&requested_addr, addr_len);
+		if (clientfd == -1)
+			goto err;
+
+		/* Make sure src and dst addrs were overridden properly */
+		if (cmp_peer_addr(clientfd, &expected_addr))
+			goto err;
+
+		if (cmp_local_ip(clientfd, &expected_src_addr))
+			goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
+}
+
+/*
+ * Datagram-only test: start a server, send it a message from a fresh
+ * client socket and check the source address the server sees. This is done
+ * once without a pktinfo control message and, while set_cmsg <= @max_cmsg,
+ * again with one.
+ *
+ * Returns 0 on success, the errno stored by sendmsg_to_server() if sending
+ * failed, and -1 on any other failure.
+ */
+static int run_xmsg_test_case(const struct sock_addr_test *test, int max_cmsg)
+{
+	socklen_t addr_len = sizeof(struct sockaddr_storage);
+	struct sockaddr_storage expected_addr;
+	struct sockaddr_storage server_addr;
+	struct sockaddr_storage sendmsg_addr;
+	struct sockaddr_storage recvmsg_addr;
+	int clientfd = -1;
+	int servfd = -1;
+	int set_cmsg = 0;
+	int err = 0;
+
+	if (test->type != SOCK_DGRAM)
+		goto err;
+
+	if (init_addrs(test, &sendmsg_addr, &server_addr, &expected_addr))
+		goto err;
+
+	/* Prepare server to sendmsg to */
+	servfd = start_server(test->type, &server_addr, addr_len);
+	if (servfd == -1)
+		goto err;
+
+	while (set_cmsg <= max_cmsg) {
+		/* Drop the client socket left over from the previous round. */
+		if (clientfd >= 0)
+			close(clientfd);
+
+		clientfd = sendmsg_to_server(test->type, &sendmsg_addr,
+					     addr_len, set_cmsg, /*flags*/0,
+					     &err);
+		/* A sendmsg() errno is handed back to the caller as is. */
+		if (err)
+			goto out;
+		if (clientfd == -1)
+			goto err;
+
+		/* The destination rewrite is verified by receiving the
+		 * message on the server rather than by getpeername(2) on the
+		 * client, since getpeername(2) doesn't work with unconnected
+		 * datagram sockets.
+		 *
+		 * The source address is taken from recvmsg(2) as well:
+		 * getsockname(2) can't be used since the socket is
+		 * unconnected and the source chosen for one specific packet
+		 * may differ from the default one getsockname(2) returns.
+		 */
+		if (recvmsg_from_client(servfd, &recvmsg_addr) == -1)
+			goto err;
+
+		if (cmp_addr(&recvmsg_addr, &expected_addr, /*cmp_port*/0))
+			goto err;
+
+		++set_cmsg;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(clientfd);
+	close(servfd);
+	return err;
+}
+
+/*
+ * Run a single test case against the cgroup referred to by @cgfd:
+ * load the program, attach it, run the scenario matching the attach type
+ * and compare the outcome with test->expected_result.
+ *
+ * Prints the test description and a PASS/FAIL verdict.
+ *
+ * Returns 0 if the outcome matched the expectation, -1 otherwise.
+ */
+static int run_test_case(int cgfd, const struct sock_addr_test *test)
+{
+	int progfd = -1;
+	int err = 0;
+
+	printf("Test case: %s .. ", test->descr);
+
+	progfd = test->loadfn(test);
+	if (test->expected_result == LOAD_REJECT && progfd < 0)
+		goto out;
+	else if (test->expected_result == LOAD_REJECT || progfd < 0)
+		goto err;
+
+	err = bpf_prog_attach(progfd, cgfd, test->attach_type,
+			      BPF_F_ALLOW_OVERRIDE);
+	if (test->expected_result == ATTACH_REJECT && err) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	} else if (test->expected_result == ATTACH_REJECT || err) {
+		goto err;
+	} else if (test->expected_result == ATTACH_OKAY) {
+		err = 0;
+		goto out;
+	}
+
+	switch (test->attach_type) {
+	case BPF_CGROUP_INET4_BIND:
+	case BPF_CGROUP_INET6_BIND:
+		err = run_bind_test_case(test);
+		break;
+	case BPF_CGROUP_INET4_CONNECT:
+	case BPF_CGROUP_INET6_CONNECT:
+		err = run_connect_test_case(test);
+		break;
+	case BPF_CGROUP_UDP4_SENDMSG:
+	case BPF_CGROUP_UDP6_SENDMSG:
+		err = run_xmsg_test_case(test, 1);
+		break;
+	case BPF_CGROUP_UDP4_RECVMSG:
+	case BPF_CGROUP_UDP6_RECVMSG:
+		err = run_xmsg_test_case(test, 0);
+		break;
+	default:
+		goto err;
+	}
+
+	if (test->expected_result == SYSCALL_EPERM && err == EPERM) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
+	if (test->expected_result == SYSCALL_ENOTSUPP && err == ENOTSUPP) {
+		err = 0; /* error was expected, reset it */
+		goto out;
+	}
+
+	if (err || test->expected_result != SUCCESS)
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	/*
+	 * Clean up only if a program was actually loaded. The load check
+	 * above treats any negative value as failure, so the same test is
+	 * used here instead of comparing against -1 only.
+	 */
+	if (progfd >= 0) {
+		/* Detaching w/o checking return code: best effort attempt. */
+		bpf_prog_detach(cgfd, test->attach_type);
+		close(progfd);
+	}
+	printf("[%s]\n", err ? "FAIL" : "PASS");
+	return err;
+}
+
+/*
+ * Run every entry of tests[] against cgroup @cgfd and print a summary.
+ * Returns 0 if all test cases passed, -1 if at least one failed.
+ */
+static int run_tests(int cgfd)
+{
+	int passed = 0;
+	int failed = 0;
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		if (run_test_case(cgfd, &tests[idx]) == 0)
+			passed++;
+		else
+			failed++;
+	}
+
+	printf("Summary: %d PASSED, %d FAILED\n", passed, failed);
+
+	if (failed)
+		return -1;
+	return 0;
+}
+
+/*
+ * Entry point. The binary is meant to be started by its wrapper script,
+ * which passes at least one argument; a direct run without arguments is
+ * skipped with exit status 0.
+ *
+ * Returns 0 if the cgroup could be set up and all tests passed, -1
+ * otherwise.
+ */
+int main(int argc, char **argv)
+{
+	int cgfd;
+	int err = -1;
+
+	if (argc < 2) {
+		fprintf(stderr,
+			"%s has to be run via %s.sh. Skip direct run.\n",
+			argv[0], argv[0]);
+		exit(0);
+	}
+
+	cgfd = cgroup_setup_and_join(CG_PATH);
+	if (cgfd >= 0 && run_tests(cgfd) == 0)
+		err = 0;
+
+	close(cgfd);
+	cleanup_cgroup_environment();
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh
new file mode 100755
index 000000000..3b9fdb809
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sock_addr.sh
@@ -0,0 +1,58 @@
+#!/bin/sh
+
+set -eu
+
+ping_once()
+{
+ type ping${1} >/dev/null 2>&1 && PING="ping${1}" || PING="ping -${1}"
+ $PING -q -c 1 -W 1 ${2%%/*} >/dev/null 2>&1
+}
+
+# Wait until both TEST_IPv4 and TEST_IPv6 answer a ping, trying at most
+# MAX_PING_TRIES times with a one second pause between attempts.
+# Exits the script with status 1 if the addresses never become reachable.
+wait_for_ip()
+{
+	local _i
+	# printf instead of "echo -n": the latter is not portable under sh.
+	printf '%s' "Wait for testing IPv4/IPv6 to become available "
+	for _i in $(seq "${MAX_PING_TRIES}"); do
+		printf '.'
+		if ping_once 4 "${TEST_IPv4}" && ping_once 6 "${TEST_IPv6}"; then
+			echo " OK"
+			return
+		fi
+		# ping can fail immediately (e.g. address not usable yet);
+		# pause so the retries are not all spent at once.
+		sleep 1
+	done
+	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
+	exit 1
+}
+
+# Create the veth pair TEST_IF/TEST_IF_PEER, bring both ends up, assign the
+# test addresses to TEST_IF and wait until they are reachable.
+setup()
+{
+	# Create testing interfaces not to interfere with current environment.
+	ip link add dev "${TEST_IF}" type veth peer name "${TEST_IF_PEER}"
+	ip link set "${TEST_IF}" up
+	ip link set "${TEST_IF_PEER}" up
+
+	ip -4 addr add "${TEST_IPv4}" dev "${TEST_IF}"
+	ip -6 addr add "${TEST_IPv6}" dev "${TEST_IF}"
+	wait_for_ip
+}
+
+cleanup()
+{
+ ip link del ${TEST_IF} 2>/dev/null || :
+ ip link del ${TEST_IF_PEER} 2>/dev/null || :
+}
+
+# Install the cleanup handler, prepare the test network and run the test
+# binary. The argument tells the binary it was started by this script.
+main()
+{
+	# Signals 2, 3, 6 and 15 spelled by name.
+	trap cleanup EXIT INT QUIT ABRT TERM
+	setup
+	./test_sock_addr setup_done
+}
+
+BASENAME=$(basename $0 .sh)
+TEST_IF="${BASENAME}1"
+TEST_IF_PEER="${BASENAME}2"
+TEST_IPv4="127.0.0.4/8"
+TEST_IPv6="::6/128"
+MAX_PING_TRIES=5
+
+main
diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c
new file mode 100644
index 000000000..ca7ca87e9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_socket_cookie.c
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+
+#include <string.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/foo"
+#define SOCKET_COOKIE_PROG "./socket_cookie_prog.o"
+
+struct socket_cookie {
+ __u64 cookie_key;
+ __u32 cookie_value;
+};
+
+/*
+ * Open an IPv6 TCP socket, bind it to the loopback address with port 0 and
+ * start listening on it.
+ *
+ * Returns the listening fd, or -1 after logging which step failed.
+ */
+static int start_server(void)
+{
+	struct sockaddr_in6 sa;
+	int sock;
+
+	sock = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sock == -1) {
+		log_err("Failed to create server socket");
+		return sock;
+	}
+
+	memset(&sa, 0, sizeof(sa));
+	sa.sin6_family = AF_INET6;
+	sa.sin6_addr = in6addr_loopback;
+	sa.sin6_port = 0;
+
+	if (bind(sock, (const struct sockaddr *)&sa, sizeof(sa)) == -1) {
+		log_err("Failed to bind server socket");
+	} else if (listen(sock, 128) == -1) {
+		log_err("Failed to listen on server socket");
+	} else {
+		return sock;
+	}
+
+	/* Only reached when bind() or listen() failed. */
+	close(sock);
+	return -1;
+}
+
+/*
+ * Create an IPv6 TCP client socket and connect it to whatever address
+ * @server_fd is bound to (queried with getsockname()).
+ *
+ * Returns the connected fd, or -1 after logging which step failed.
+ */
+static int connect_to_server(int server_fd)
+{
+	struct sockaddr_storage peer;
+	socklen_t peer_len = sizeof(peer);
+	int sock;
+
+	sock = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sock == -1) {
+		log_err("Failed to create client socket");
+		return sock;
+	}
+
+	if (getsockname(server_fd, (struct sockaddr *)&peer, &peer_len)) {
+		log_err("Failed to get server addr");
+	} else if (connect(sock, (const struct sockaddr *)&peer,
+			   peer_len) == -1) {
+		log_err("Fail to connect to server");
+	} else {
+		return sock;
+	}
+
+	/* Only reached when getsockname() or connect() failed. */
+	close(sock);
+	return -1;
+}
+
+/*
+ * Check that the BPF program stored the expected cookie value for the
+ * client socket.
+ *
+ * @map:       map to inspect; NULL is reported as "not found".
+ * @client_fd: connected client socket, used both as the lookup key and to
+ *             obtain the local port the expected value is derived from.
+ *
+ * The expected value is (local port << 8) | 0xFF.
+ *
+ * Returns 0 when the stored value matches, non-zero otherwise.
+ */
+static int validate_map(struct bpf_map *map, int client_fd)
+{
+	__u32 cookie_expected_value;
+	struct sockaddr_in6 addr;
+	socklen_t len = sizeof(addr);
+	struct socket_cookie val;
+	int err = 0;
+	int map_fd;
+
+	if (!map) {
+		log_err("Map not found in BPF object");
+		goto err;
+	}
+
+	map_fd = bpf_map__fd(map);
+
+	/*
+	 * The lookup result must be checked: on failure @val is left
+	 * uninitialised and the comparison below would be meaningless.
+	 */
+	err = bpf_map_lookup_elem(map_fd, &client_fd, &val);
+	if (err) {
+		log_err("Can't find socket cookie in map");
+		goto out;
+	}
+
+	err = getsockname(client_fd, (struct sockaddr *)&addr, &len);
+	if (err) {
+		log_err("Can't get client local addr");
+		goto out;
+	}
+
+	cookie_expected_value = (ntohs(addr.sin6_port) << 8) | 0xFF;
+	if (val.cookie_value != cookie_expected_value) {
+		log_err("Unexpected value in map: %x != %x", val.cookie_value,
+			cookie_expected_value);
+		goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	return err;
+}
+
+/*
+ * Load SOCKET_COOKIE_PROG, attach every program in it to the cgroup @cgfd
+ * (attach type derived from the program's section name), open a loopback
+ * server/client connection and validate the first map of the object.
+ *
+ * Prints "PASSED" or "FAILED" and returns 0 on success, non-zero on failure.
+ */
+static int run_test(int cgfd)
+{
+	enum bpf_attach_type attach_type;
+	struct bpf_prog_load_attr attr;
+	struct bpf_program *prog;
+	/*
+	 * Must start out NULL: when loading fails the object pointer is not
+	 * known to have been written, yet the common exit path below still
+	 * hands it to bpf_object__close().
+	 */
+	struct bpf_object *pobj = NULL;
+	const char *prog_name;
+	int server_fd = -1;
+	int client_fd = -1;
+	int prog_fd = -1;
+	int err = 0;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.file = SOCKET_COOKIE_PROG;
+	attr.prog_type = BPF_PROG_TYPE_UNSPEC;
+	attr.prog_flags = BPF_F_TEST_RND_HI32;
+
+	err = bpf_prog_load_xattr(&attr, &pobj, &prog_fd);
+	if (err) {
+		log_err("Failed to load %s", attr.file);
+		goto out;
+	}
+
+	bpf_object__for_each_program(prog, pobj) {
+		prog_name = bpf_program__section_name(prog);
+
+		if (libbpf_attach_type_by_name(prog_name, &attach_type))
+			goto err;
+
+		err = bpf_prog_attach(bpf_program__fd(prog), cgfd, attach_type,
+				      BPF_F_ALLOW_OVERRIDE);
+		if (err) {
+			log_err("Failed to attach prog %s", prog_name);
+			goto out;
+		}
+	}
+
+	server_fd = start_server();
+	if (server_fd == -1)
+		goto err;
+
+	client_fd = connect_to_server(server_fd);
+	if (client_fd == -1)
+		goto err;
+
+	if (validate_map(bpf_map__next(NULL, pobj), client_fd))
+		goto err;
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(client_fd);
+	close(server_fd);
+	bpf_object__close(pobj);
+	printf("%s\n", err ? "FAILED" : "PASSED");
+	return err;
+}
+
+/*
+ * Entry point: join the test cgroup, run the socket cookie test and clean
+ * the cgroup environment up again. Returns 0 on success, -1 on failure.
+ */
+int main(int argc, char **argv)
+{
+	int status = -1;
+	int cgfd;
+
+	cgfd = cgroup_setup_and_join(CG_PATH);
+	if (cgfd >= 0 && !run_test(cgfd))
+		status = 0;
+
+	/* Teardown runs on every path, including a failed cgroup setup. */
+	close(cgfd);
+	cleanup_cgroup_environment();
+	return status;
+}
diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c
new file mode 100644
index 000000000..427ca00a3
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockmap.c
@@ -0,0 +1,2026 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2017-2018 Covalent IO, Inc. http://covalent.io
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <sched.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/sendfile.h>
+
+#include <linux/netlink.h>
+#include <linux/socket.h>
+#include <linux/sock_diag.h>
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <linux/tls.h>
+#include <assert.h>
+#include <libgen.h>
+
+#include <getopt.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_util.h"
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+/*
+ * Loop condition of forever_ping_pong(): the ping/pong loop keeps going
+ * while this is non-zero.
+ * NOTE(review): presumably set and cleared via running_handler(), whose
+ * definition is outside this part of the file - confirm.
+ */
+int running;
+static void running_handler(int a);
+
+/* Fallback values for the kTLS socket options when the headers lack them. */
+#ifndef TCP_ULP
+# define TCP_ULP 31
+#endif
+#ifndef SOL_TLS
+# define SOL_TLS 282
+#endif
+
+/* randomly selected ports for testing on lo */
+#define S1_PORT 10000
+#define S2_PORT 10001
+
+/*
+ * BPF_SOCKMAP_FILENAME is compared against env.type in test_end_subtest()
+ * to label a result "sockmap" or "sockhash".
+ */
+#define BPF_SOCKMAP_FILENAME "test_sockmap_kern.o"
+#define BPF_SOCKHASH_FILENAME "test_sockhash_kern.o"
+#define CG_PATH "/sockmap"
+
+/*
+ * global sockets, (re)created by sockmap_init_sockets():
+ *   s1, s2 - listening sockets bound to S1_PORT / S2_PORT on 127.0.0.1
+ *   c1, c2 - client sockets connected to s1 / s2
+ *   p1, p2 - sockets accepted from s1 / s2 (peers of c1 / c2)
+ */
+int s1, s2, c1, c2, p1, p2;
+/* NOTE(review): the three counters below are not used in this part of the file. */
+int test_cnt;
+int passed;
+int failed;
+/* Map and program fds; run_options() addresses them by fixed index. */
+int map_fd[9];
+struct bpf_map *maps[9];
+int prog_fd[11];
+
+/*
+ * Per-test knobs. run_options() selects the tx program from
+ * pass/redir/apply/cork/drop and writes the remaining values into the BPF
+ * maps. test_reset() zeroes all of them except ktls and peek_flag.
+ */
+int txmsg_pass;
+int txmsg_redir;
+int txmsg_drop;
+int txmsg_apply;
+int txmsg_cork;
+int txmsg_start;
+int txmsg_end;
+int txmsg_start_push;
+int txmsg_end_push;
+int txmsg_start_pop;
+int txmsg_pop;
+int txmsg_ingress;
+int txmsg_redir_skb;
+int txmsg_ktls_skb;
+int txmsg_ktls_skb_drop;
+int txmsg_ktls_skb_redir;
+/* Not touched by test_reset(): enable kTLS on the sockets in sendmsg_test(). */
+int ktls;
+/* Not touched by test_reset(): msg_loop() does an extra MSG_PEEK receive. */
+int peek_flag;
+int skb_use_parser;
+int txmsg_omit_skb_parser;
+
+/*
+ * Command line option table, terminated by an all-zero entry.
+ *
+ * Entries with a non-NULL flag pointer are boolean switches tied directly
+ * to one of the global knobs; the others carry a short option character in
+ * .val. usage() prints the table using exactly this distinction.
+ * NOTE(review): the getopt call that consumes this table is outside this
+ * part of the file.
+ */
+static const struct option long_options[] = {
+ {"help", no_argument, NULL, 'h' },
+ {"cgroup", required_argument, NULL, 'c' },
+ {"rate", required_argument, NULL, 'r' },
+ {"verbose", optional_argument, NULL, 'v' },
+ {"iov_count", required_argument, NULL, 'i' },
+ {"length", required_argument, NULL, 'l' },
+ {"test", required_argument, NULL, 't' },
+ {"data_test", no_argument, NULL, 'd' },
+ {"txmsg", no_argument, &txmsg_pass, 1 },
+ {"txmsg_redir", no_argument, &txmsg_redir, 1 },
+ {"txmsg_drop", no_argument, &txmsg_drop, 1 },
+ {"txmsg_apply", required_argument, NULL, 'a'},
+ {"txmsg_cork", required_argument, NULL, 'k'},
+ {"txmsg_start", required_argument, NULL, 's'},
+ {"txmsg_end", required_argument, NULL, 'e'},
+ {"txmsg_start_push", required_argument, NULL, 'p'},
+ {"txmsg_end_push", required_argument, NULL, 'q'},
+ {"txmsg_start_pop", required_argument, NULL, 'w'},
+ {"txmsg_pop", required_argument, NULL, 'x'},
+ {"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
+ {"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 },
+ {"ktls", no_argument, &ktls, 1 },
+ {"peek", no_argument, &peek_flag, 1 },
+ {"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1},
+ {"whitelist", required_argument, NULL, 'n' },
+ {"blacklist", required_argument, NULL, 'b' },
+ {0, 0, NULL, 0 }
+};
+
+/* Bookkeeping for the current test run; see the test_*() helpers. */
+struct test_env {
+ const char *type; /* BPF object name (o->map) of the current subtest */
+ const char *subtest; /* title of the current subtest */
+ const char *prepend; /* optional label printed before the title */
+
+ int test_num; /* bumped by test_start_subtest() */
+ int subtest_num; /* bumped by test_start(), reset per subtest */
+
+ int succ_cnt; /* bumped by test_pass() */
+ int fail_cnt; /* bumped by test_fail() */
+ int fail_last; /* fail_cnt snapshot taken when the subtest started */
+};
+
+/* The single global instance used by the helpers. */
+struct test_env env;
+
+/* Options describing one test invocation, passed down to the test loops. */
+struct sockmap_options {
+ int verbose; /* > 0 enables diagnostics, > 1 enables statistics output */
+ bool base; /* sendmsg_test(): receive on p1 instead of p2 */
+ bool sendpage; /* transmit with msg_loop_sendpage() instead of msg_loop() */
+ bool data_test; /* fill tx buffers with a byte pattern and verify it on rx */
+ bool drop_expected; /* sends are expected to fail; set when txmsg_drop is on */
+ int iov_count; /* number of iovec entries per message */
+ int iov_length; /* bytes per iovec entry */
+ int rate; /* number of send iterations (sleep seconds for ping/pong) */
+ char *map; /* BPF object file name; copied to env.type */
+ char *whitelist; /* NOTE(review): not used in this part of the file */
+ char *blacklist; /* NOTE(review): not used in this part of the file */
+ char *prepend; /* label copied to env.prepend */
+};
+
+/* One named subtest: a title and the function implementing it. */
+struct _test {
+ char *title; /* copied to env.subtest by test_start_subtest() */
+ void (*tester)(int cg_fd, struct sockmap_options *opt);
+};
+
+/* Account for one more test case inside the current subtest. */
+static void test_start(void)
+{
+	env.subtest_num += 1;
+}
+
+/* Record one failure in the global test environment. */
+static void test_fail(void)
+{
+	env.fail_cnt += 1;
+}
+
+/* Record one success in the global test environment. */
+static void test_pass(void)
+{
+	env.succ_cnt += 1;
+}
+
+/*
+ * Put every per-test knob back to zero so that one subtest cannot leak its
+ * configuration into the next one. ktls and peek_flag are left alone.
+ */
+static void test_reset(void)
+{
+	/* verdict selection */
+	txmsg_pass = 0;
+	txmsg_drop = 0;
+	txmsg_redir = 0;
+
+	/* apply / cork byte counts */
+	txmsg_apply = 0;
+	txmsg_cork = 0;
+
+	/* pull, push and pop windows */
+	txmsg_start = 0;
+	txmsg_end = 0;
+	txmsg_start_push = 0;
+	txmsg_end_push = 0;
+	txmsg_start_pop = 0;
+	txmsg_pop = 0;
+
+	/* ingress / skb redirection */
+	txmsg_ingress = 0;
+	txmsg_redir_skb = 0;
+
+	/* kTLS skb handling */
+	txmsg_ktls_skb = 0;
+	txmsg_ktls_skb_drop = 0;
+	txmsg_ktls_skb_redir = 0;
+
+	/* stream parser selection */
+	txmsg_omit_skb_parser = 0;
+	skb_use_parser = 0;
+}
+
+/*
+ * Begin a new subtest: bump the test number, restart the per-subtest case
+ * counter, snapshot the failure count, remember the labels used when the
+ * result is printed and reset every per-test knob. Always returns 0.
+ */
+static int test_start_subtest(const struct _test *t, struct sockmap_options *o)
+{
+	/* counters */
+	env.test_num += 1;
+	env.subtest_num = 0;
+	env.fail_last = env.fail_cnt;
+
+	/* labels for test_end_subtest() */
+	env.type = o->map;
+	env.prepend = o->prepend;
+	env.subtest = t->title;
+
+	test_reset();
+	return 0;
+}
+
+/*
+ * Finish the current subtest: it counts as passed when no failure was
+ * recorded since test_start_subtest(). Prints one result line to stdout.
+ */
+static void test_end_subtest(void)
+{
+	const char *kind = "sockhash";
+	const char *verdict = "FAIL";
+	const char *label = "";
+
+	if (strcmp(env.type, BPF_SOCKMAP_FILENAME) == 0)
+		kind = "sockmap";
+	if (env.prepend)
+		label = env.prepend;
+
+	if (env.fail_cnt == env.fail_last) {
+		test_pass();
+		verdict = "OK";
+	}
+
+	fprintf(stdout, "#%2d/%2d %8s:%s:%s:%s\n",
+		env.test_num, env.subtest_num,
+		kind, label, env.subtest, verdict);
+}
+
+/* Print the final pass/fail totals to stdout. */
+static void test_print_results(void)
+{
+	printf("Pass: %d Fail: %d\n", env.succ_cnt, env.fail_cnt);
+}
+
+/*
+ * Print the command line synopsis followed by every entry of long_options.
+ * Flag-style options show the current value of the variable they control,
+ * the others show their short option character.
+ */
+static void usage(char *argv[])
+{
+	const struct option *opt;
+
+	printf(" Usage: %s --cgroup <cgroup_path>\n", argv[0]);
+	printf(" options:\n");
+
+	/* The table ends with an entry whose name is NULL. */
+	for (opt = long_options; opt->name; opt++) {
+		printf(" --%-12s", opt->name);
+		if (opt->flag)
+			printf(" flag (internal value:%d)\n", *opt->flag);
+		else
+			printf(" -%c\n", opt->val);
+	}
+	printf("\n");
+}
+
+/*
+ * Map a socket fd to the name of the global it is stored in, for use in
+ * diagnostics. The globals are checked in the order c1, c2, s1, s2, p1, p2
+ * and the first match wins; "unknown" is returned when nothing matches.
+ */
+char *sock_to_string(int s)
+{
+	const struct {
+		int fd;
+		char *label;
+	} known[] = {
+		{ c1, "client1" },
+		{ c2, "client2" },
+		{ s1, "server1" },
+		{ s2, "server2" },
+		{ p1, "peer1" },
+		{ p2, "peer2" },
+	};
+	unsigned int idx;
+
+	for (idx = 0; idx < sizeof(known) / sizeof(known[0]); idx++) {
+		if (known[idx].fd == s)
+			return known[idx].label;
+	}
+	return "unknown";
+}
+
+/*
+ * Turn socket @s into a kTLS socket: select the "tls" ULP, install
+ * TLS 1.2 / AES-GCM-128 crypto info for both directions and set the send
+ * and receive buffer sizes.
+ *
+ * Returns 0 on success or -EINVAL after printing which step failed.
+ */
+static int sockmap_init_ktls(int verbose, int s)
+{
+	struct tls12_crypto_info_aes_gcm_128 tls_tx = {
+		.info = {
+			.version = TLS_1_2_VERSION,
+			.cipher_type = TLS_CIPHER_AES_GCM_128,
+		},
+	};
+	struct tls12_crypto_info_aes_gcm_128 tls_rx = {
+		.info = {
+			.version = TLS_1_2_VERSION,
+			.cipher_type = TLS_CIPHER_AES_GCM_128,
+		},
+	};
+	/* Only compares @s against the global fds, so resolve it once. */
+	const char *who = sock_to_string(s);
+	int so_buf = 6553500;
+	int rc;
+
+	/* IPPROTO_TCP has the value 6. */
+	rc = setsockopt(s, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (rc) {
+		fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", who, rc);
+		return -EINVAL;
+	}
+
+	rc = setsockopt(s, SOL_TLS, TLS_TX, &tls_tx, sizeof(tls_tx));
+	if (rc) {
+		fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", who, rc);
+		return -EINVAL;
+	}
+
+	rc = setsockopt(s, SOL_TLS, TLS_RX, &tls_rx, sizeof(tls_rx));
+	if (rc) {
+		fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", who, rc);
+		return -EINVAL;
+	}
+
+	rc = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf));
+	if (rc) {
+		fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", who, rc);
+		return -EINVAL;
+	}
+
+	rc = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf));
+	if (rc) {
+		fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", who, rc);
+		return -EINVAL;
+	}
+
+	if (verbose)
+		fprintf(stdout, "socket(%s) kTLS enabled\n", who);
+	return 0;
+}
+/* Report @what through perror() and return the errno seen on entry. */
+static int sockmap_report_errno(const char *what)
+{
+	int saved = errno;
+
+	/* perror() may itself modify errno, hence the copy. */
+	perror(what);
+	return saved;
+}
+
+/*
+ * Create the six global test sockets on 127.0.0.1:
+ *   s1/s2 listen on S1_PORT/S2_PORT (SO_REUSEADDR, non-blocking),
+ *   c1/c2 connect to them and p1/p2 are the accepted peers.
+ *
+ * @verbose: when > 1 the resulting fd numbers are printed.
+ *
+ * Returns 0 on success or the errno of the failing call. Sockets created
+ * before a failure are left in the globals.
+ * NOTE(review): the globals are reset to 0 rather than -1 here, so a caller
+ * that closes all six after a partial failure also closes fd 0 - confirm
+ * whether that is intended.
+ */
+static int sockmap_init_sockets(int verbose)
+{
+	static const char * const socket_err[4] = {
+		"socket s1 failed()", "socket s2 failed()",
+		"socket c1 failed()", "socket c2 failed()",
+	};
+	static const char * const ioctl_err[2] = {
+		"ioctl s1 failed()", "ioctl s2 failed()",
+	};
+	int i, err, one = 1;
+	struct sockaddr_in addr;
+	int *fds[4] = {&s1, &s2, &c1, &c2};
+
+	s1 = s2 = p1 = p2 = c1 = c2 = 0;
+
+	/* Init sockets */
+	for (i = 0; i < 4; i++) {
+		*fds[i] = socket(AF_INET, SOCK_STREAM, 0);
+		if (*fds[i] < 0)
+			return sockmap_report_errno(socket_err[i]);
+	}
+
+	/* Allow reuse (server sockets only) */
+	for (i = 0; i < 2; i++) {
+		err = setsockopt(*fds[i], SOL_SOCKET, SO_REUSEADDR,
+				 (char *)&one, sizeof(one));
+		if (err)
+			return sockmap_report_errno("setsockopt failed()");
+	}
+
+	/* Non-blocking sockets (server sockets only) */
+	for (i = 0; i < 2; i++) {
+		err = ioctl(*fds[i], FIONBIO, (char *)&one);
+		if (err < 0)
+			return sockmap_report_errno(ioctl_err[i]);
+	}
+
+	/* Bind server sockets */
+	memset(&addr, 0, sizeof(struct sockaddr_in));
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+
+	addr.sin_port = htons(S1_PORT);
+	err = bind(s1, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0)
+		return sockmap_report_errno("bind s1 failed()");
+
+	addr.sin_port = htons(S2_PORT);
+	err = bind(s2, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0)
+		return sockmap_report_errno("bind s2 failed()");
+
+	/* Listen server sockets */
+	err = listen(s1, 32);
+	if (err < 0)
+		return sockmap_report_errno("listen s1 failed()");
+
+	err = listen(s2, 32);
+	if (err < 0)
+		return sockmap_report_errno("listen s2 failed()");
+
+	/* Initiate Connect */
+	addr.sin_port = htons(S1_PORT);
+	err = connect(c1, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0 && errno != EINPROGRESS)
+		return sockmap_report_errno("connect c1 failed()");
+
+	addr.sin_port = htons(S2_PORT);
+	err = connect(c2, (struct sockaddr *)&addr, sizeof(addr));
+	if (err < 0 && errno != EINPROGRESS)
+		return sockmap_report_errno("connect c2 failed()");
+
+	/* Accept Connections */
+	p1 = accept(s1, NULL, NULL);
+	if (p1 < 0)
+		return sockmap_report_errno("accept s1 failed()");
+
+	p2 = accept(s2, NULL, NULL);
+	if (p2 < 0)
+		return sockmap_report_errno("accept s2 failed()");
+
+	if (verbose > 1) {
+		printf("connected sockets: c1 <-> p1, c2 <-> p2\n");
+		printf("cgroups binding: c1(%i) <-> s1(%i) - - - c2(%i) <-> s2(%i)\n",
+			c1, s1, c2, s2);
+	}
+	return 0;
+}
+
+/* Byte counters and timestamps filled in by the send/receive loops. */
+struct msg_stats {
+ size_t bytes_sent; /* sum of the positive send results */
+ size_t bytes_recvd; /* accumulated recvmsg() results */
+ struct timespec start; /* CLOCK_MONOTONIC when the loop started */
+ struct timespec end; /* CLOCK_MONOTONIC when the loop stopped */
+};
+
+/*
+ * Transmit @cnt chunks of @iov_length bytes on @fd using sendfile() from a
+ * temporary file that holds the byte pattern 0, 1, 2, ... (wrapping at 256).
+ *
+ * When opt->drop_expected is set every sendfile() is expected to fail and a
+ * successful one is reported as -EIO; otherwise a failing sendfile() is an
+ * error and its return value is passed on. Positive results are added to
+ * s->bytes_sent, and s->start / s->end bracket the loop (s->end is only
+ * written when the loop runs to completion).
+ *
+ * Returns 0 on success, 1 if the temporary file cannot be created, or a
+ * negative value as described above.
+ */
+static int msg_loop_sendpage(int fd, int iov_length, int cnt,
+			     struct msg_stats *s,
+			     struct sockmap_options *opt)
+{
+	const bool expect_drop = opt->drop_expected;
+	unsigned char pattern = 0;
+	int ret = 0;
+	int n, src_fd;
+	FILE *src;
+
+	src = tmpfile();
+	if (!src) {
+		perror("create file for sendpage");
+		return 1;
+	}
+
+	/* One byte per iteration; the unsigned char wraps on its own. */
+	for (n = 0; n < iov_length * cnt; n++)
+		fputc(pattern++, src);
+	fflush(src);
+	fseek(src, 0, SEEK_SET);
+
+	src_fd = fileno(src);
+
+	clock_gettime(CLOCK_MONOTONIC, &s->start);
+	for (n = 0; n < cnt; n++) {
+		int sent;
+
+		errno = 0;
+		sent = sendfile(fd, src_fd, NULL, iov_length);
+
+		if (sent < 0 && !expect_drop) {
+			perror("sendpage loop error");
+			ret = sent;
+			goto done;
+		}
+		if (sent >= 0 && expect_drop) {
+			printf("sendpage loop error expected: %i errno %i\n",
+			       sent, errno);
+			ret = -EIO;
+			goto done;
+		}
+
+		if (sent > 0)
+			s->bytes_sent += sent;
+	}
+	clock_gettime(CLOCK_MONOTONIC, &s->end);
+done:
+	fclose(src);
+	return ret;
+}
+
+/*
+ * Release every buffer referenced by @msg's iovec array, then the array
+ * itself, and leave @msg with an empty (NULL, 0) vector. Safe to call on a
+ * zero-initialised msghdr.
+ */
+static void msg_free_iov(struct msghdr *msg)
+{
+	size_t idx = 0;
+
+	while (idx < msg->msg_iovlen) {
+		free(msg->msg_iov[idx].iov_base);
+		idx++;
+	}
+
+	free(msg->msg_iov);
+	msg->msg_iovlen = 0;
+	msg->msg_iov = NULL;
+}
+
+/*
+ * Allocate @iov_count zero-filled buffers of @iov_length bytes each and
+ * attach them to @msg.
+ *
+ * @data, @xmit: when both are true the buffers are filled with one running
+ *               byte pattern 0, 1, 2, ... that continues across buffers and
+ *               wraps at 256.
+ *
+ * Returns 0 on success. On allocation failure everything allocated so far
+ * is released, @msg is left untouched and -ENOMEM is returned.
+ */
+static int msg_alloc_iov(struct msghdr *msg,
+			 int iov_count, int iov_length,
+			 bool data, bool xmit)
+{
+	unsigned char k = 0;
+	struct iovec *iov;
+	int i;
+
+	iov = calloc(iov_count, sizeof(struct iovec));
+	if (!iov)
+		return -ENOMEM;
+
+	for (i = 0; i < iov_count; i++) {
+		unsigned char *d = calloc(iov_length, sizeof(char));
+
+		if (!d) {
+			fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count);
+			goto unwind_iov;
+		}
+		iov[i].iov_base = d;
+		iov[i].iov_len = iov_length;
+
+		if (data && xmit) {
+			int j;
+
+			for (j = 0; j < iov_length; j++)
+				d[j] = k++;
+		}
+	}
+
+	msg->msg_iov = iov;
+	msg->msg_iovlen = iov_count;
+
+	return 0;
+unwind_iov:
+	/*
+	 * Unwind through the local array: msg->msg_iov has not been assigned
+	 * yet at this point. The array itself must be released as well.
+	 */
+	for (i--; i >= 0 ; i--)
+		free(iov[i].iov_base);
+	free(iov);
+	return -ENOMEM;
+}
+
+/*
+ * Check that the received buffers in @msg carry the byte pattern written by
+ * the sender: 0, 1, 2, ... restarting at 0 after every @chunk_sz bytes.
+ *
+ * @size:     number of bytes to verify; checking stops once it reaches zero.
+ * @chunk_sz: pattern period in bytes.
+ *
+ * With txmsg_ktls_skb set, the first buffer must start with the four bytes
+ * "PASS", which are skipped before the pattern check.
+ *
+ * Returns 0 when everything checked matches, -EIO on the first mismatch.
+ *
+ * NOTE(review): the byte index j is initialised only once and never reset
+ * between buffers, so with equally sized buffers only iov[0] is actually
+ * compared. Left unchanged on purpose: the pattern position (k, bytes_cnt)
+ * is not carried across calls, so widening the check needs a coordinated
+ * change in the caller - confirm before touching.
+ */
+static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
+{
+	int i, j = 0, bytes_cnt = 0;
+	unsigned char k = 0;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		unsigned char *d = msg->msg_iov[i].iov_base;
+
+		/* Special case test for skb ingress + ktls */
+		if (i == 0 && txmsg_ktls_skb) {
+			if (msg->msg_iov[i].iov_len < 4)
+				return -EIO;
+			if (memcmp(d, "PASS", 4) != 0) {
+				fprintf(stderr,
+					"detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n",
+					i, 0, d[0], d[1], d[2], d[3]);
+				return -EIO;
+			}
+			j = 4; /* advance index past PASS header */
+		}
+
+		for (; j < msg->msg_iov[i].iov_len && size; j++) {
+			if (d[j] != k++) {
+				/*
+				 * Only peek at the following byte when it is
+				 * still inside this buffer; print 0 otherwise.
+				 */
+				unsigned char next =
+					(size_t)j + 1 < msg->msg_iov[i].iov_len ?
+					d[j + 1] : 0;
+
+				/* The cast keeps k - 1 a byte when k wrapped to 0. */
+				fprintf(stderr,
+					"detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n",
+					i, j, d[j], (unsigned char)(k - 1),
+					next, k);
+				return -EIO;
+			}
+			bytes_cnt++;
+			if (bytes_cnt == chunk_sz) {
+				k = 0;
+				bytes_cnt = 0;
+			}
+			size--;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Send (@tx true) or receive (@tx false) test traffic on @fd.
+ *
+ * @iov_count, @iov_length: shape of the message buffers.
+ * @cnt: number of sendmsg() calls, or the number of messages the receiver
+ *       expects when computing its byte target.
+ * @s:   statistics, updated with byte counts and start/end timestamps.
+ * @opt: drop_expected inverts the success condition of sendmsg();
+ *       data_test enables pattern fill (tx) and verification (rx);
+ *       verbose and sendpage are consulted on the receive side.
+ *
+ * The receiver keeps reading until the expected number of bytes (reduced by
+ * the bytes the pop helper removes) has arrived, and fails when select()
+ * times out (300 ms with txmsg_cork, 3 s otherwise).
+ *
+ * Returns 0 on success and a non-zero value on failure: the errno of the
+ * failing call, -EIO for a protocol-level failure, or the msg_alloc_iov()
+ * error.
+ */
+static int msg_loop(int fd, int iov_count, int iov_length, int cnt,
+		    struct msg_stats *s, bool tx,
+		    struct sockmap_options *opt)
+{
+	struct msghdr msg = {0}, msg_peek = {0};
+	int err, i, flags = MSG_NOSIGNAL;
+	bool drop = opt->drop_expected;
+	bool data = opt->data_test;
+
+	/*
+	 * Allocation failures carry their own error code; errno is not a
+	 * reliable source for it, so return err directly.
+	 */
+	err = msg_alloc_iov(&msg, iov_count, iov_length, data, tx);
+	if (err)
+		goto out;
+	if (peek_flag) {
+		err = msg_alloc_iov(&msg_peek, iov_count, iov_length, data, tx);
+		if (err)
+			goto out;
+	}
+
+	if (tx) {
+		clock_gettime(CLOCK_MONOTONIC, &s->start);
+		for (i = 0; i < cnt; i++) {
+			int sent;
+
+			errno = 0;
+			sent = sendmsg(fd, &msg, flags);
+
+			if (!drop && sent < 0) {
+				perror("sendmsg loop error");
+				goto out_errno;
+			} else if (drop && sent >= 0) {
+				fprintf(stderr,
+					"sendmsg loop error expected: %i errno %i\n",
+					sent, errno);
+				errno = -EIO;
+				goto out_errno;
+			}
+			if (sent > 0)
+				s->bytes_sent += sent;
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	} else {
+		int slct, recvp = 0, recv, max_fd = fd;
+		float total_bytes, txmsg_pop_total;
+		int fd_flags = O_NONBLOCK;
+		struct timeval timeout;
+		fd_set w;
+
+		/*
+		 * NOTE(review): this passes O_NONBLOCK as the fcntl command,
+		 * not as an F_SETFL argument, so it most likely does not make
+		 * the socket non-blocking. Left as is because switching the
+		 * socket mode would change how the tests behave - confirm.
+		 */
+		fcntl(fd, fd_flags);
+		/* Account for pop bytes noting each iteration of apply will
+		 * call msg_pop_data helper so we need to account for this
+		 * by calculating the number of apply iterations. Note user
+		 * of the tool can create cases where no data is sent by
+		 * manipulating pop/push/pull/etc. For example txmsg_apply 1
+		 * with txmsg_pop 1 will try to apply 1B at a time but each
+		 * iteration will then pop 1B so no data will ever be sent.
+		 * This is really only useful for testing edge cases in code
+		 * paths.
+		 */
+		total_bytes = (float)iov_count * (float)iov_length * (float)cnt;
+		if (txmsg_apply)
+			txmsg_pop_total = txmsg_pop * (total_bytes / txmsg_apply);
+		else
+			txmsg_pop_total = txmsg_pop * cnt;
+		total_bytes -= txmsg_pop_total;
+		err = clock_gettime(CLOCK_MONOTONIC, &s->start);
+		if (err < 0)
+			perror("recv start time");
+		while (s->bytes_recvd < total_bytes) {
+			/* Re-armed every round: select() may modify it. */
+			if (txmsg_cork) {
+				timeout.tv_sec = 0;
+				timeout.tv_usec = 300000;
+			} else {
+				timeout.tv_sec = 3;
+				timeout.tv_usec = 0;
+			}
+
+			/* FD sets */
+			FD_ZERO(&w);
+			FD_SET(fd, &w);
+
+			slct = select(max_fd + 1, &w, NULL, NULL, &timeout);
+			if (slct == -1) {
+				perror("select()");
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			} else if (!slct) {
+				if (opt->verbose)
+					fprintf(stderr, "unexpected timeout: recved %zu/%f pop_total %f\n", s->bytes_recvd, total_bytes, txmsg_pop_total);
+				errno = -EIO;
+				clock_gettime(CLOCK_MONOTONIC, &s->end);
+				goto out_errno;
+			}
+
+			errno = 0;
+			if (peek_flag) {
+				flags |= MSG_PEEK;
+				recvp = recvmsg(fd, &msg_peek, flags);
+				if (recvp < 0) {
+					if (errno != EWOULDBLOCK) {
+						clock_gettime(CLOCK_MONOTONIC, &s->end);
+						goto out_errno;
+					}
+				}
+				flags = 0;
+			}
+
+			recv = recvmsg(fd, &msg, flags);
+			if (recv < 0) {
+				if (errno != EWOULDBLOCK) {
+					clock_gettime(CLOCK_MONOTONIC, &s->end);
+					perror("recv failed()");
+					goto out_errno;
+				}
+			}
+
+			/*
+			 * A tolerated EWOULDBLOCK leaves recv at -1; adding
+			 * that to the unsigned counter would corrupt it.
+			 */
+			if (recv > 0)
+				s->bytes_recvd += recv;
+
+			if (data) {
+				int chunk_sz = opt->sendpage ?
+						iov_length * cnt :
+						iov_length * iov_count;
+
+				errno = msg_verify_data(&msg, recv, chunk_sz);
+				if (errno) {
+					perror("data verify msg failed");
+					goto out_errno;
+				}
+				if (recvp) {
+					errno = msg_verify_data(&msg_peek,
+								recvp,
+								chunk_sz);
+					if (errno) {
+						perror("data verify msg_peek failed");
+						goto out_errno;
+					}
+				}
+			}
+		}
+		clock_gettime(CLOCK_MONOTONIC, &s->end);
+	}
+
+	goto out;
+out_errno:
+	/* Capture errno before the cleanup calls get a chance to change it. */
+	err = errno;
+out:
+	msg_free_iov(&msg);
+	msg_free_iov(&msg_peek);
+	return err;
+}
+
+/* Divisor used by sendmsg_test() to print byte rates in GB/s. */
+static float giga = 1000000000;
+
+/*
+ * Bytes sent per elapsed whole second. Only tv_sec is considered and the
+ * division is done on integers before conversion to float; the caller must
+ * make sure at least one second has elapsed.
+ */
+static inline float sentBps(struct msg_stats s)
+{
+	time_t elapsed_sec = s.end.tv_sec - s.start.tv_sec;
+
+	return s.bytes_sent / elapsed_sec;
+}
+
+/*
+ * Bytes received per elapsed whole second. Only tv_sec is considered and
+ * the division is done on integers before conversion to float; the caller
+ * must make sure at least one second has elapsed.
+ */
+static inline float recvdBps(struct msg_stats s)
+{
+	time_t elapsed_sec = s.end.tv_sec - s.start.tv_sec;
+
+	return s.bytes_recvd / elapsed_sec;
+}
+
+/*
+ * Run one transmit/receive test: fork a receiver and a transmitter child,
+ * wait for both and report the first non-zero exit status.
+ *
+ * The receiver reads from p1 when opt->base is set and from p2 otherwise;
+ * the transmitter always writes to c1, through msg_loop_sendpage() when
+ * opt->sendpage is set and through msg_loop() otherwise. With the global
+ * ktls flag set the sockets are switched to kTLS first.
+ *
+ * Returns 0 on success, a child's non-zero exit status, the kTLS setup
+ * error, or the errno of a failed fork().
+ */
+static int sendmsg_test(struct sockmap_options *opt)
+{
+	float sent_Bps = 0, recvd_Bps = 0;
+	int rx_fd, txpid, rxpid, err = 0;
+	struct msg_stats s = {0};
+	int iov_count = opt->iov_count;
+	int iov_buf = opt->iov_length;
+	int rx_status, tx_status;
+	int cnt = opt->rate;
+	pid_t reaped;
+
+	errno = 0;
+
+	if (opt->base)
+		rx_fd = p1;
+	else
+		rx_fd = p2;
+
+	if (ktls) {
+		/* Redirecting into non-TLS socket which sends into a TLS
+		 * socket is not a valid test. So in this case lets not
+		 * enable kTLS but still run the test.
+		 */
+		if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
+			err = sockmap_init_ktls(opt->verbose, rx_fd);
+			if (err)
+				return err;
+		}
+		err = sockmap_init_ktls(opt->verbose, c1);
+		if (err)
+			return err;
+	}
+
+	rxpid = fork();
+	if (rxpid == 0) {
+		/* Receiver: expects the buffer size reduced by the popped bytes. */
+		iov_buf -= (txmsg_pop - txmsg_start_pop + 1);
+		if (opt->drop_expected || txmsg_ktls_skb_drop)
+			_exit(0);
+
+		if (!iov_buf) /* zero bytes sent case */
+			_exit(0);
+
+		if (opt->sendpage)
+			iov_count = 1;
+		err = msg_loop(rx_fd, iov_count, iov_buf,
+			       cnt, &s, false, opt);
+		if (opt->verbose > 1)
+			fprintf(stderr,
+				"msg_loop_rx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		if (opt->verbose > 1)
+			fprintf(stdout,
+				"rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga,
+				peek_flag ? "(peek_msg)" : "");
+		/* Receive errors are tolerated for cork tests. */
+		if (err && txmsg_cork)
+			err = 0;
+		exit(err ? 1 : 0);
+	} else if (rxpid == -1) {
+		err = errno; /* perror() may change errno */
+		perror("msg_loop_rx");
+		return err;
+	}
+
+	txpid = fork();
+	if (txpid == 0) {
+		if (opt->sendpage)
+			err = msg_loop_sendpage(c1, iov_buf, cnt, &s, opt);
+		else
+			err = msg_loop(c1, iov_count, iov_buf,
+				       cnt, &s, true, opt);
+
+		if (err)
+			fprintf(stderr,
+				"msg_loop_tx: iov_count %i iov_buf %i cnt %i err %i\n",
+				iov_count, iov_buf, cnt, err);
+		if (s.end.tv_sec - s.start.tv_sec) {
+			sent_Bps = sentBps(s);
+			recvd_Bps = recvdBps(s);
+		}
+		if (opt->verbose > 1)
+			fprintf(stdout,
+				"tx_sendmsg: TX: %zuB %fB/s %f GB/s RX: %zuB %fB/s %fGB/s\n",
+				s.bytes_sent, sent_Bps, sent_Bps/giga,
+				s.bytes_recvd, recvd_Bps, recvd_Bps/giga);
+		exit(err ? 1 : 0);
+	} else if (txpid == -1) {
+		err = errno; /* perror() may change errno */
+		perror("msg_loop_tx");
+		/* Do not leave the already running receiver behind. */
+		kill(rxpid, SIGKILL);
+		waitpid(rxpid, NULL, 0);
+		return err;
+	}
+
+	/*
+	 * Keep waitpid() outside of assert(): the whole assert expression is
+	 * compiled out under NDEBUG and the children would never be reaped.
+	 */
+	reaped = waitpid(rxpid, &rx_status, 0);
+	assert(reaped == rxpid);
+	reaped = waitpid(txpid, &tx_status, 0);
+	assert(reaped == txpid);
+	(void)reaped;
+
+	if (WIFEXITED(rx_status)) {
+		err = WEXITSTATUS(rx_status);
+		if (err) {
+			fprintf(stderr, "rx thread exited with err %d.\n", err);
+			goto out;
+		}
+	}
+	if (WIFEXITED(tx_status)) {
+		err = WEXITSTATUS(tx_status);
+		if (err)
+			fprintf(stderr, "tx thread exited with err %d.\n", err);
+	}
+out:
+	return err;
+}
+
+/*
+ * Bounce a 1 KiB buffer between the test sockets for as long as the global
+ * "running" flag stays set: one buffer is sent on c1, after which anything
+ * received on c1, c2, p1 or p2 is echoed back on the same socket.
+ *
+ * @rate: seconds to sleep between rounds (0 disables the sleep).
+ * @opt:  when opt->verbose is set a dot is printed per round.
+ *
+ * Returns 0 when the loop ends (flag cleared, select() error, or 10 s
+ * without traffic) and a negative value when send() or recv() fails.
+ */
+static int forever_ping_pong(int rate, struct sockmap_options *opt)
+{
+	const int fds[4] = {c1, c2, p1, p2};
+	struct timeval timeout;
+	char buf[1024] = {0};
+	int sc, n, max_fd = -1;
+
+	/* Do not assume p2 is the highest descriptor; compute it. */
+	for (n = 0; n < 4; n++) {
+		if (fds[n] > max_fd)
+			max_fd = fds[n];
+	}
+
+	/* Ping/Pong data from client to server */
+	sc = send(c1, buf, sizeof(buf), 0);
+	if (sc < 0) {
+		perror("send failed()");
+		return sc;
+	}
+
+	do {
+		int s, rc, i;
+		fd_set w;
+
+		/*
+		 * select() may update the timeout with the time left, so it
+		 * has to be re-armed before every call.
+		 */
+		timeout.tv_sec = 10;
+		timeout.tv_usec = 0;
+
+		/* FD sets */
+		FD_ZERO(&w);
+		FD_SET(c1, &w);
+		FD_SET(c2, &w);
+		FD_SET(p1, &w);
+		FD_SET(p2, &w);
+
+		s = select(max_fd + 1, &w, NULL, NULL, &timeout);
+		if (s == -1) {
+			perror("select()");
+			break;
+		} else if (!s) {
+			fprintf(stderr, "unexpected timeout\n");
+			break;
+		}
+
+		for (i = 0; i <= max_fd && s > 0; ++i) {
+			if (!FD_ISSET(i, &w))
+				continue;
+
+			s--;
+
+			rc = recv(i, buf, sizeof(buf), 0);
+			if (rc < 0) {
+				if (errno != EWOULDBLOCK) {
+					perror("recv failed()");
+					return rc;
+				}
+				/* Nothing was read: there is nothing to echo. */
+				continue;
+			}
+
+			if (rc == 0) {
+				close(i);
+				break;
+			}
+
+			sc = send(i, buf, rc, 0);
+			if (sc < 0) {
+				perror("send failed()");
+				return sc;
+			}
+		}
+
+		if (rate)
+			sleep(rate);
+
+		if (opt->verbose) {
+			printf(".");
+			fflush(stdout);
+
+		}
+	} while (running);
+
+	return 0;
+}
+
+/* Test selectors understood by run_options(). */
+enum {
+ SELFTESTS, /* NOTE(review): not handled by run_options(); presumably dispatched elsewhere */
+ PING_PONG, /* forever_ping_pong() */
+ SENDMSG, /* sendmsg_test() with base = false, sendpage = false */
+ BASE, /* sendmsg_test() with base = true, sendpage = false; skips BPF attach */
+ BASE_SENDPAGE, /* sendmsg_test() with base = true, sendpage = true; skips BPF attach */
+ SENDPAGE, /* sendmsg_test() with base = false, sendpage = true */
+};
+
+/*
+ * Configure the BPF side according to the global txmsg_* knobs, run the
+ * selected test and undo the configuration again.
+ *
+ * @options: test options; base, sendpage and drop_expected are overwritten
+ *           here depending on @test and txmsg_drop.
+ * @cg_fd:   cgroup fd the sock_ops program is attached to.
+ * @test:    one of PING_PONG, SENDMSG, SENDPAGE, BASE, BASE_SENDPAGE.
+ *
+ * Map slots as used below: map_fd[0] sockmap with the skb parser/verdict
+ * programs, map_fd[1] txmsg sockmap, map_fd[2] redirect targets, map_fd[3]
+ * apply bytes, map_fd[4] cork bytes, map_fd[5] start/end/push/pop values
+ * (keys 0-5), map_fd[6] txmsg flags, map_fd[7] skb flags, map_fd[8] TLS
+ * sockmap.
+ *
+ * Returns the result of the test, or the error of the setup step that
+ * failed. Failures while attaching the skb/cgroup programs return
+ * immediately without running the cleanup at "out".
+ */
+static int run_options(struct sockmap_options *options, int cg_fd, int test)
+{
+ int i, key, next_key, err, tx_prog_fd = -1, zero = 0;
+
+ /* If base test skip BPF setup */
+ if (test == BASE || test == BASE_SENDPAGE)
+ goto run;
+
+ /* Attach programs to sockmap */
+ if (!txmsg_omit_skb_parser) {
+ err = bpf_prog_attach(prog_fd[0], map_fd[0],
+ BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
+ prog_fd[0], map_fd[0], err, strerror(errno));
+ return err;
+ }
+ }
+
+ err = bpf_prog_attach(prog_fd[1], map_fd[0],
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err) {
+ fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n",
+ err, strerror(errno));
+ return err;
+ }
+
+ /* Attach programs to TLS sockmap */
+ if (txmsg_ktls_skb) {
+ if (!txmsg_omit_skb_parser) {
+ err = bpf_prog_attach(prog_fd[0], map_fd[8],
+ BPF_SK_SKB_STREAM_PARSER, 0);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
+ prog_fd[0], map_fd[8], err, strerror(errno));
+ return err;
+ }
+ }
+
+ err = bpf_prog_attach(prog_fd[2], map_fd[8],
+ BPF_SK_SKB_STREAM_VERDICT, 0);
+ if (err) {
+ fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n",
+ err, strerror(errno));
+ return err;
+ }
+ }
+
+ /* Attach to cgroups */
+ err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+ if (err) {
+ fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n",
+ err, strerror(errno));
+ return err;
+ }
+
+run:
+ /* The sockets are created only after the sock_ops program is attached. */
+ err = sockmap_init_sockets(options->verbose);
+ if (err) {
+ fprintf(stderr, "ERROR: test socket failed: %d\n", err);
+ goto out;
+ }
+
+ /* Attach txmsg program to sockmap */
+ /* First matching knob wins; 0 means no tx program was requested. */
+ if (txmsg_pass)
+ tx_prog_fd = prog_fd[4];
+ else if (txmsg_redir)
+ tx_prog_fd = prog_fd[5];
+ else if (txmsg_apply)
+ tx_prog_fd = prog_fd[6];
+ else if (txmsg_cork)
+ tx_prog_fd = prog_fd[7];
+ else if (txmsg_drop)
+ tx_prog_fd = prog_fd[8];
+ else
+ tx_prog_fd = 0;
+
+ if (tx_prog_fd) {
+ /* NOTE: this i shadows the function-level i; it is used as the map key. */
+ int redir_fd, i = 0;
+
+ err = bpf_prog_attach(tx_prog_fd,
+ map_fd[1], BPF_SK_MSG_VERDICT, 0);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_prog_attach (txmsg): %d (%s)\n",
+ err, strerror(errno));
+ goto out;
+ }
+
+ /* Key 0 of the txmsg sockmap is the sending client c1. */
+ err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg): %d (%s\n",
+ err, strerror(errno));
+ goto out;
+ }
+
+ /* Redirect target: c2 when redirecting, otherwise c1 itself. */
+ if (txmsg_redir)
+ redir_fd = c2;
+ else
+ redir_fd = c1;
+
+ err = bpf_map_update_elem(map_fd[2], &i, &redir_fd, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg): %d (%s\n",
+ err, strerror(errno));
+ goto out;
+ }
+
+ if (txmsg_apply) {
+ err = bpf_map_update_elem(map_fd[3],
+ &i, &txmsg_apply, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (apply_bytes): %d (%s\n",
+ err, strerror(errno));
+ goto out;
+ }
+ }
+
+ if (txmsg_cork) {
+ err = bpf_map_update_elem(map_fd[4],
+ &i, &txmsg_cork, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (cork_bytes): %d (%s\n",
+ err, strerror(errno));
+ goto out;
+ }
+ }
+
+ /* map_fd[5] keys: 0 start, 1 end, 2 start_push, 3 end_push, 4 start_pop, 5 pop. */
+ if (txmsg_start) {
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_start, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_start): %d (%s)\n",
+ err, strerror(errno));
+ goto out;
+ }
+ }
+
+ if (txmsg_end) {
+ i = 1;
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_end, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_end): %d (%s)\n",
+ err, strerror(errno));
+ goto out;
+ }
+ }
+
+ if (txmsg_start_push) {
+ i = 2;
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_start_push, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_start_push): %d (%s)\n",
+ err, strerror(errno));
+ goto out;
+ }
+ }
+
+ if (txmsg_end_push) {
+ i = 3;
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_end_push, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem %i@%i (txmsg_end_push): %d (%s)\n",
+ txmsg_end_push, i, err, strerror(errno));
+ goto out;
+ }
+ }
+
+ /* The pop values are written even when zero; errors are ignored then. */
+ if (txmsg_start_pop) {
+ i = 4;
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_start_pop, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem %i@%i (txmsg_start_pop): %d (%s)\n",
+ txmsg_start_pop, i, err, strerror(errno));
+ goto out;
+ }
+ } else {
+ i = 4;
+ bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_start_pop, BPF_ANY);
+ }
+
+ if (txmsg_pop) {
+ i = 5;
+ err = bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_pop, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem %i@%i (txmsg_pop): %d (%s)\n",
+ txmsg_pop, i, err, strerror(errno));
+ goto out;
+ }
+ } else {
+ i = 5;
+ bpf_map_update_elem(map_fd[5],
+ &i, &txmsg_pop, BPF_ANY);
+
+ }
+
+ /* From here on update failures are only reported, not fatal. */
+ if (txmsg_ingress) {
+ int in = BPF_F_INGRESS;
+
+ i = 0;
+ err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
+ err, strerror(errno));
+ }
+ i = 1;
+ err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n",
+ err, strerror(errno));
+ }
+ err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n",
+ err, strerror(errno));
+ }
+
+ i = 2;
+ err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n",
+ err, strerror(errno));
+ }
+ }
+
+ if (txmsg_ktls_skb) {
+ int ingress = BPF_F_INGRESS;
+
+ /* Key 0 of the TLS sockmap is the receiving peer p2. */
+ i = 0;
+ err = bpf_map_update_elem(map_fd[8], &i, &p2, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
+ err, strerror(errno));
+ }
+
+ if (txmsg_ktls_skb_redir) {
+ i = 1;
+ err = bpf_map_update_elem(map_fd[7],
+ &i, &ingress, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
+ err, strerror(errno));
+ }
+ }
+
+ /* Stores the value 1 under key 1; the result is not checked. */
+ if (txmsg_ktls_skb_drop) {
+ i = 1;
+ err = bpf_map_update_elem(map_fd[7], &i, &i, BPF_ANY);
+ }
+ }
+
+ if (txmsg_redir_skb) {
+ /* Redirect to p2 for the sendmsg/sendpage tests, else to p1. */
+ int skb_fd = (test == SENDMSG || test == SENDPAGE) ?
+ p2 : p1;
+ int ingress = BPF_F_INGRESS;
+
+ i = 0;
+ err = bpf_map_update_elem(map_fd[7],
+ &i, &ingress, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n",
+ err, strerror(errno));
+ }
+
+ i = 3;
+ err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY);
+ if (err) {
+ fprintf(stderr,
+ "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n",
+ err, strerror(errno));
+ }
+ }
+ }
+
+ /* The result of this update is overwritten by the test result below. */
+ if (skb_use_parser) {
+ i = 2;
+ err = bpf_map_update_elem(map_fd[7], &i, &skb_use_parser, BPF_ANY);
+ }
+
+ if (txmsg_drop)
+ options->drop_expected = true;
+
+ /* Dispatch; an unknown selector leaves err at its previous value. */
+ if (test == PING_PONG)
+ err = forever_ping_pong(options->rate, options);
+ else if (test == SENDMSG) {
+ options->base = false;
+ options->sendpage = false;
+ err = sendmsg_test(options);
+ } else if (test == SENDPAGE) {
+ options->base = false;
+ options->sendpage = true;
+ err = sendmsg_test(options);
+ } else if (test == BASE) {
+ options->base = true;
+ options->sendpage = false;
+ err = sendmsg_test(options);
+ } else if (test == BASE_SENDPAGE) {
+ options->base = true;
+ options->sendpage = true;
+ err = sendmsg_test(options);
+ } else
+ fprintf(stderr, "unknown test\n");
+out:
+ /* Detach and zero all the maps */
+ bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS);
+ bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER);
+ bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT);
+ bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER);
+ bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT);
+
+ /* NOTE(review): also true for the "no tx program" value 0 - confirm intent. */
+ if (tx_prog_fd >= 0)
+ bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT);
+
+ /* NOTE(review): only map_fd[0..7] are cleared; map_fd[8] is not - confirm. */
+ for (i = 0; i < 8; i++) {
+ key = next_key = 0;
+ bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+ while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) {
+ bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY);
+ key = next_key;
+ }
+ }
+
+ /* All six global sockets are closed, whether or not they were opened. */
+ close(s1);
+ close(s2);
+ close(p1);
+ close(p2);
+ close(c1);
+ close(c2);
+ return err;
+}
+
+/* Translate a test mode into the label shown in the verbose per-test line.
+ * Only SENDMSG and SENDPAGE have names; anything else reads "unknown".
+ */
+static char *test_to_str(int test)
+{
+	if (test == SENDMSG)
+		return "sendmsg";
+	if (test == SENDPAGE)
+		return "sendpage";
+	return "unknown";
+}
+
+/* Append src to the NUL-terminated string held in dst, truncating so that
+ * the result, terminator included, never occupies more than dst_cap bytes.
+ */
+static void append_str(char *dst, const char *src, size_t dst_cap)
+{
+	size_t room = dst_cap - strlen(dst);
+
+	/* room includes the slot for the terminator; with 1 or less left
+	 * there is no space for any payload byte.
+	 */
+	if (room > 1)
+		strncat(dst, src, room - 1);
+}
+
+/* Capacity, in bytes, of the option summary string built by test_options() */
+#define OPTSTRING 60
+
+/* Build a comma-separated summary of the currently enabled txmsg/ktls/peek
+ * knobs in @options, which must point to at least OPTSTRING bytes. Text that
+ * does not fit is silently truncated by append_str().
+ */
+static void test_options(char *options)
+{
+	char piece[OPTSTRING];
+
+	memset(options, 0, OPTSTRING);
+
+	/* plain verdict flags */
+	if (txmsg_pass)
+		append_str(options, "pass,", OPTSTRING);
+	if (txmsg_redir)
+		append_str(options, "redir,", OPTSTRING);
+	if (txmsg_drop)
+		append_str(options, "drop,", OPTSTRING);
+
+	/* knobs that carry a value are formatted into a scratch buffer first */
+	if (txmsg_apply) {
+		snprintf(piece, sizeof(piece), "apply %d,", txmsg_apply);
+		append_str(options, piece, OPTSTRING);
+	}
+	if (txmsg_cork) {
+		snprintf(piece, sizeof(piece), "cork %d,", txmsg_cork);
+		append_str(options, piece, OPTSTRING);
+	}
+	if (txmsg_start) {
+		snprintf(piece, sizeof(piece), "start %d,", txmsg_start);
+		append_str(options, piece, OPTSTRING);
+	}
+	if (txmsg_end) {
+		snprintf(piece, sizeof(piece), "end %d,", txmsg_end);
+		append_str(options, piece, OPTSTRING);
+	}
+	if (txmsg_start_pop) {
+		snprintf(piece, sizeof(piece), "pop (%d,%d),",
+			 txmsg_start_pop, txmsg_start_pop + txmsg_pop);
+		append_str(options, piece, OPTSTRING);
+	}
+
+	/* remaining boolean flags */
+	if (txmsg_ingress)
+		append_str(options, "ingress,", OPTSTRING);
+	if (txmsg_redir_skb)
+		append_str(options, "redir_skb,", OPTSTRING);
+	if (txmsg_ktls_skb)
+		append_str(options, "ktls_skb,", OPTSTRING);
+	if (ktls)
+		append_str(options, "ktls,", OPTSTRING);
+	if (peek_flag)
+		append_str(options, "peek,", OPTSTRING);
+}
+
+/* Run one test configuration through run_options() and account for it.
+ *
+ * @cgrp: cgroup fd forwarded to run_options()
+ * @test: test mode; SENDPAGE selects the sendpage path, anything else
+ *        clears it
+ * @opt:  options for this run; sendpage and drop_expected are overwritten
+ *
+ * Bumps test_cnt and either passed or failed, and in verbose mode prints
+ * the parameters followed by PASS/FAILED. Returns run_options()'s result.
+ */
+static int __test_exec(int cgrp, int test, struct sockmap_options *opt)
+{
+	/* A fixed-size stack buffer: the summary never exceeds OPTSTRING
+	 * bytes, and this removes the unchecked heap allocation that
+	 * test_options() would otherwise memset() through a NULL pointer.
+	 */
+	char options[OPTSTRING];
+	int err;
+
+	opt->sendpage = (test == SENDPAGE);
+	opt->drop_expected = txmsg_drop ? true : false;
+
+	test_options(options);
+
+	if (opt->verbose) {
+		fprintf(stdout,
+			" [TEST %i]: (%i, %i, %i, %s, %s): ",
+			test_cnt, opt->rate, opt->iov_count, opt->iov_length,
+			test_to_str(test), options);
+		fflush(stdout);
+	}
+
+	err = run_options(opt, cgrp, test);
+	if (opt->verbose)
+		fprintf(stdout, " %s\n", !err ? "PASS" : "FAILED");
+
+	test_cnt++;
+	if (err)
+		failed++;
+	else
+		passed++;
+
+	return err;
+}
+
+/* Run a single test for the object file named in opt->map: the sockmap
+ * object (BPF_SOCKMAP_FILENAME) is driven with SENDMSG, any other object
+ * with SENDPAGE. A non-zero result is reported through test_fail().
+ */
+static void test_exec(int cgrp, struct sockmap_options *opt)
+{
+	int mode = strcmp(opt->map, BPF_SOCKMAP_FILENAME) ? SENDPAGE : SENDMSG;
+
+	test_start();
+	if (__test_exec(cgrp, mode, opt))
+		test_fail();
+}
+
+/* Three runs at rate 1: (iov_count 1, iov_length 1), then
+ * (iov_count 1024, iov_length 1), then (iov_count 1, iov_length 1024).
+ */
+static void test_send_one(struct sockmap_options *opt, int cgrp)
+{
+	opt->rate = 1;
+	opt->iov_count = 1;
+	opt->iov_length = 1;
+	test_exec(cgrp, opt);
+
+	opt->rate = 1;
+	opt->iov_count = 1024;
+	opt->iov_length = 1;
+	test_exec(cgrp, opt);
+
+	opt->rate = 1;
+	opt->iov_count = 1;
+	opt->iov_length = 1024;
+	test_exec(cgrp, opt);
+}
+
+/* Two runs with a single small iov at a high rate: rate 512 with
+ * iov_length 3, then rate 100 with iov_length 5.
+ */
+static void test_send_many(struct sockmap_options *opt, int cgrp)
+{
+	opt->rate = 512;
+	opt->iov_count = 1;
+	opt->iov_length = 3;
+	test_exec(cgrp, opt);
+
+	opt->rate = 100;
+	opt->iov_length = 5;
+	opt->iov_count = 1;
+	test_exec(cgrp, opt);
+}
+
+/* One run with a large payload: rate 2, iov_count 1024, iov_length 256. */
+static void test_send_large(struct sockmap_options *opt, int cgrp)
+{
+	opt->rate = 2;
+	opt->iov_count = 1024;
+	opt->iov_length = 256;
+	test_exec(cgrp, opt);
+}
+
+/* Run the one/many/large send suites back to back with the currently
+ * configured txmsg knobs, then yield the CPU.
+ */
+static void test_send(struct sockmap_options *opt, int cgrp)
+{
+ test_send_one(opt, cgrp);
+ test_send_many(opt, cgrp);
+ test_send_large(opt, cgrp);
+ sched_yield();
+}
+
+/* Subtest: enable txmsg_pass and run the full send suite. */
+static void test_txmsg_pass(int cgrp, struct sockmap_options *opt)
+{
+ /* Test small and large iov_count values with pass/redir/apply/cork */
+ txmsg_pass = 1;
+ test_send(opt, cgrp);
+}
+
+/* Subtest: enable txmsg_redir and run the full send suite. */
+static void test_txmsg_redir(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_redir = 1;
+ test_send(opt, cgrp);
+}
+
+/* Subtest: enable txmsg_drop and run the full send suite. */
+static void test_txmsg_drop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_drop = 1;
+ test_send(opt, cgrp);
+}
+
+/* Subtest: clear pass and drop, enable redir together with ingress, and
+ * run the full send suite.
+ */
+static void test_txmsg_ingress_redir(int cgrp, struct sockmap_options *opt)
+{
+	txmsg_drop = 0;
+	txmsg_pass = 0;
+	txmsg_redir = 1;
+	txmsg_ingress = 1;
+	test_send(opt, cgrp);
+}
+
+/* Subtest: skb verdict programs, first with ktls + txmsg_ktls_skb enabled
+ * and then with the skb parser omitted. Data verification is forced on for
+ * the duration; opt->data_test and ktls are restored before returning.
+ */
+static void test_txmsg_skb(int cgrp, struct sockmap_options *opt)
+{
+ /* saved so the caller's settings can be put back at the end */
+ bool data = opt->data_test;
+ int k = ktls;
+
+ opt->data_test = true;
+ ktls = 1;
+
+ /* start from a clean verdict state, then enable pass + ktls skb */
+ txmsg_pass = txmsg_drop = 0;
+ txmsg_ingress = txmsg_redir = 0;
+ txmsg_ktls_skb = 1;
+ txmsg_pass = 1;
+
+ /* Using data verification so ensure iov layout is
+ * expected from test receiver side. e.g. has enough
+ * bytes to write test code.
+ */
+ opt->iov_length = 100;
+ opt->iov_count = 1;
+ opt->rate = 1;
+ test_exec(cgrp, opt);
+
+ /* same layout with the skb drop flag set */
+ txmsg_ktls_skb_drop = 1;
+ test_exec(cgrp, opt);
+
+ /* same layout with the skb redirect flag set instead */
+ txmsg_ktls_skb_drop = 0;
+ txmsg_ktls_skb_redir = 1;
+ test_exec(cgrp, opt);
+ txmsg_ktls_skb_redir = 0;
+
+ /* Tests that omit skb_parser */
+ txmsg_omit_skb_parser = 1;
+ ktls = 0;
+ txmsg_ktls_skb = 0;
+ test_exec(cgrp, opt);
+
+ txmsg_ktls_skb_drop = 1;
+ test_exec(cgrp, opt);
+ txmsg_ktls_skb_drop = 0;
+
+ txmsg_ktls_skb_redir = 1;
+ test_exec(cgrp, opt);
+
+ /* repeat the redirect case with ktls switched back on */
+ ktls = 1;
+ test_exec(cgrp, opt);
+ txmsg_omit_skb_parser = 0;
+
+ /* NOTE(review): txmsg_ktls_skb_redir is still 1 and txmsg_pass is
+ * still 1 here; presumably the next subtest's setup resets them -
+ * confirm.
+ */
+ opt->data_test = data;
+ ktls = k;
+}
+
+/* Cork tests that leave data hanging. A buggy user program can cork data
+ * and never flush it, leaving it on the ring; these cases exercise that
+ * pattern and are slow because each one waits for a timeout. Covered:
+ * pass + cork + apply, redir + cork, and redir + cork + apply. The cork
+ * size of 4097 is chosen with send_large so that cork size and send size
+ * do not line up.
+ */
+static void test_txmsg_cork_hangs(int cgrp, struct sockmap_options *opt)
+{
+	/* pass, cork and apply both 4097 */
+	txmsg_redir = 0;
+	txmsg_pass = 1;
+	txmsg_apply = 4097;
+	txmsg_cork = 4097;
+	test_send_large(opt, cgrp);
+
+	/* redirect, cork only */
+	txmsg_redir = 1;
+	txmsg_pass = 0;
+	txmsg_cork = 4097;
+	txmsg_apply = 0;
+	test_send_large(opt, cgrp);
+
+	/* redirect, cork and apply both 4097 */
+	txmsg_redir = 1;
+	txmsg_pass = 0;
+	txmsg_cork = 4097;
+	txmsg_apply = 4097;
+	test_send_large(opt, cgrp);
+}
+
+/* Subtest: pull-data via the txmsg_start/txmsg_end window, alone and in
+ * combination with redirect and cork.
+ */
+static void test_txmsg_pull(int cgrp, struct sockmap_options *opt)
+{
+	/* Test basic start/end */
+	txmsg_start = 1;
+	txmsg_end = 2;
+	test_send(opt, cgrp);
+
+	/* Test >4k pull */
+	txmsg_start = 4096;
+	txmsg_end = 9182;
+	test_send_large(opt, cgrp);
+
+	/* Test pull + redirect: redirect has to be switched on here,
+	 * otherwise this case merely repeats the basic start/end test.
+	 */
+	txmsg_redir = 1;
+	txmsg_start = 1;
+	txmsg_end = 2;
+	test_send(opt, cgrp);
+
+	/* Test pull + cork */
+	txmsg_redir = 0;
+	txmsg_cork = 512;
+	txmsg_start = 1;
+	txmsg_end = 2;
+	test_send_many(opt, cgrp);
+
+	/* Test pull + cork + redirect */
+	txmsg_redir = 1;
+	txmsg_cork = 512;
+	txmsg_start = 1;
+	txmsg_end = 2;
+	test_send_many(opt, cgrp);
+}
+
+/* Subtest: pop-data via txmsg_start_pop/txmsg_pop, alone and combined with
+ * redirect and cork.
+ */
+static void test_txmsg_pop(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic pop */
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
+
+ /* Test pop with >4k */
+ txmsg_start_pop = 4096;
+ txmsg_pop = 4096;
+ test_send_large(opt, cgrp);
+
+ /* Test pop + redirect */
+ txmsg_redir = 1;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
+
+ /* Test pop + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
+
+ /* Test pop + redirect + cork */
+ txmsg_redir = 1;
+ txmsg_cork = 4;
+ txmsg_start_pop = 1;
+ txmsg_pop = 2;
+ test_send_many(opt, cgrp);
+}
+
+/* Subtest: push-data via txmsg_start_push/txmsg_end_push, alone and
+ * combined with redirect and cork.
+ */
+static void test_txmsg_push(int cgrp, struct sockmap_options *opt)
+{
+ /* Test basic push */
+ txmsg_start_push = 1;
+ txmsg_end_push = 1;
+ test_send(opt, cgrp);
+
+ /* Test push with start and end both at 4096 */
+ txmsg_start_push = 4096;
+ txmsg_end_push = 4096;
+ test_send_large(opt, cgrp);
+
+ /* Test push + redirect */
+ txmsg_redir = 1;
+ txmsg_start_push = 1;
+ txmsg_end_push = 2;
+ test_send_many(opt, cgrp);
+
+ /* Test push + cork */
+ txmsg_redir = 0;
+ txmsg_cork = 512;
+ txmsg_start_push = 1;
+ txmsg_end_push = 2;
+ test_send_many(opt, cgrp);
+}
+
+/* Subtest: configure push (1..10) and pop (start 5, length 4) together and
+ * run the large send.
+ */
+static void test_txmsg_push_pop(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_start_push = 1;
+ txmsg_end_push = 10;
+ txmsg_start_pop = 5;
+ txmsg_pop = 4;
+ test_send_large(opt, cgrp);
+}
+
+/* Subtest: apply_bytes without cork. An apply of 1 is run against the
+ * one-shot sends and an apply of 1024 against the large send, each first
+ * with pass and then with redirect.
+ */
+static void test_txmsg_apply(int cgrp, struct sockmap_options *opt)
+{
+	/* apply 1, pass */
+	txmsg_cork = 0;
+	txmsg_apply = 1;
+	txmsg_redir = 0;
+	txmsg_pass = 1;
+	test_send_one(opt, cgrp);
+
+	/* apply 1, redirect */
+	txmsg_cork = 0;
+	txmsg_apply = 1;
+	txmsg_redir = 1;
+	txmsg_pass = 0;
+	test_send_one(opt, cgrp);
+
+	/* apply 1024, pass */
+	txmsg_cork = 0;
+	txmsg_apply = 1024;
+	txmsg_redir = 0;
+	txmsg_pass = 1;
+	test_send_large(opt, cgrp);
+
+	/* apply 1024, redirect */
+	txmsg_cork = 0;
+	txmsg_apply = 1024;
+	txmsg_redir = 1;
+	txmsg_pass = 0;
+	test_send_large(opt, cgrp);
+}
+
+/* Subtest: cork of 1 with pass, first without apply and then with an
+ * apply of 1, each over the full send suite.
+ */
+static void test_txmsg_cork(int cgrp, struct sockmap_options *opt)
+{
+	/* cork only */
+	txmsg_cork = 1;
+	txmsg_apply = 0;
+	txmsg_redir = 0;
+	txmsg_pass = 1;
+	test_send(opt, cgrp);
+
+	/* cork + apply */
+	txmsg_cork = 1;
+	txmsg_apply = 1;
+	txmsg_redir = 0;
+	txmsg_pass = 1;
+	test_send(opt, cgrp);
+}
+
+/* Subtest: pass verdict with skb_use_parser set to 512, run once with
+ * iov_length 256, iov_count 1 and rate 2.
+ */
+static void test_txmsg_ingress_parser(int cgrp, struct sockmap_options *opt)
+{
+ txmsg_pass = 1;
+ skb_use_parser = 512;
+ opt->iov_length = 256;
+ opt->iov_count = 1;
+ opt->rate = 2;
+ test_exec(cgrp, opt);
+}
+
+/* Names of the maps looked up in the BPF object by populate_progs(). The
+ * position in this table is the index used for map_fd[] and maps[], so the
+ * order must not change.
+ */
+char *map_names[] = {
+ "sock_map",
+ "sock_map_txmsg",
+ "sock_map_redir",
+ "sock_apply_bytes",
+ "sock_cork_bytes",
+ "sock_bytes",
+ "sock_redir_flags",
+ "sock_skb_opts",
+ "tls_sock_map",
+};
+
+/* Expected attach type for each program, applied by populate_progs() in
+ * the order the programs are iterated in the object. Entries pair up
+ * index-for-index with prog_type[].
+ */
+int prog_attach_type[] = {
+ BPF_SK_SKB_STREAM_PARSER,
+ BPF_SK_SKB_STREAM_VERDICT,
+ BPF_SK_SKB_STREAM_VERDICT,
+ BPF_CGROUP_SOCK_OPS,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+ BPF_SK_MSG_VERDICT,
+};
+
+/* Program type for each program, applied by populate_progs() in the order
+ * the programs are iterated in the object. Entries pair up index-for-index
+ * with prog_attach_type[].
+ */
+int prog_type[] = {
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_SK_SKB,
+ BPF_PROG_TYPE_SOCK_OPS,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+ BPF_PROG_TYPE_SK_MSG,
+};
+
+/* Open and load the BPF object @bpf_file and fill the global prog_fd[],
+ * maps[] and map_fd[] tables from it.
+ *
+ * Programs get their type and expected attach type from prog_type[] and
+ * prog_attach_type[] in iteration order; maps are found by name through
+ * map_names[].
+ *
+ * Returns 0 on success and -1 on any failure (open, too many programs,
+ * load, or a map without a valid fd). On failure after a successful open
+ * the object is closed again.
+ */
+static int populate_progs(char *bpf_file)
+{
+	const int nr_types = sizeof(prog_type) / sizeof(prog_type[0]);
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int i = 0;
+	long err;
+
+	obj = bpf_object__open(bpf_file);
+	err = libbpf_get_error(obj);
+	if (err) {
+		char err_buf[256];
+
+		libbpf_strerror(err, err_buf, sizeof(err_buf));
+		printf("Unable to load eBPF objects in file '%s' : %s\n",
+		       bpf_file, err_buf);
+		return -1;
+	}
+
+	bpf_object__for_each_program(prog, obj) {
+		/* never index past the end of the type tables */
+		if (i >= nr_types) {
+			fprintf(stderr,
+				"ERROR: '%s' has more than %d programs\n",
+				bpf_file, nr_types);
+			goto err_close;
+		}
+		bpf_program__set_type(prog, prog_type[i]);
+		bpf_program__set_expected_attach_type(prog,
+						      prog_attach_type[i]);
+		i++;
+	}
+
+	/* without a successful load none of the fds below are usable */
+	err = bpf_object__load(obj);
+	if (err) {
+		fprintf(stderr, "ERROR: (%ld) loading '%s' failed\n",
+			err, bpf_file);
+		goto err_close;
+	}
+
+	i = 0;
+	bpf_object__for_each_program(prog, obj) {
+		prog_fd[i] = bpf_program__fd(prog);
+		i++;
+	}
+
+	for (i = 0; i < sizeof(map_fd)/sizeof(int); i++) {
+		maps[i] = bpf_object__find_map_by_name(obj, map_names[i]);
+		map_fd[i] = bpf_map__fd(maps[i]);
+		if (map_fd[i] < 0) {
+			fprintf(stderr, "load_bpf_file: (%i) %s\n",
+				map_fd[i], strerror(errno));
+			goto err_close;
+		}
+	}
+
+	return 0;
+
+err_close:
+	bpf_object__close(obj);
+	return -1;
+}
+
+/* Table of subtests run by __test_selftests(), in this order. Each entry
+ * pairs a title with its tester function; the title is also what whitelist
+ * and blacklist entries are matched against.
+ * NOTE(review): the last title reads "text" where the others read "test";
+ * it is a runtime string that filters match on, so it is left as is.
+ */
+struct _test test[] = {
+ {"txmsg test passthrough", test_txmsg_pass},
+ {"txmsg test redirect", test_txmsg_redir},
+ {"txmsg test drop", test_txmsg_drop},
+ {"txmsg test ingress redirect", test_txmsg_ingress_redir},
+ {"txmsg test skb", test_txmsg_skb},
+ {"txmsg test apply", test_txmsg_apply},
+ {"txmsg test cork", test_txmsg_cork},
+ {"txmsg test hanging corks", test_txmsg_cork_hangs},
+ {"txmsg test push_data", test_txmsg_push},
+ {"txmsg test pull-data", test_txmsg_pull},
+ {"txmsg test pop-data", test_txmsg_pop},
+ {"txmsg test push/pop data", test_txmsg_push_pop},
+ {"txmsg text ingress parser", test_txmsg_ingress_parser},
+};
+
+/* Decide whether subtest @t is selected by the whitelist in @opt.
+ *
+ * The whitelist is a comma-separated list; an entry matches when it is a
+ * substring of opt->prepend (if set), opt->map or the subtest title.
+ *
+ * Returns 0 when there is no whitelist or some entry matches, -EINVAL when
+ * nothing matches, and -ENOMEM when the list cannot be duplicated.
+ */
+static int check_whitelist(struct _test *t, struct sockmap_options *opt)
+{
+	char *entry, *ptr;
+	int ret = -EINVAL;
+
+	if (!opt->whitelist)
+		return 0;
+
+	/* strtok() modifies its input, so tokenize a private copy */
+	ptr = strdup(opt->whitelist);
+	if (!ptr)
+		return -ENOMEM;
+
+	for (entry = strtok(ptr, ","); entry; entry = strtok(NULL, ",")) {
+		if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+		    strstr(opt->map, entry) != 0 ||
+		    strstr(t->title, entry) != 0) {
+			ret = 0;
+			break;
+		}
+	}
+
+	/* release the copy on every path; it used to be leaked */
+	free(ptr);
+	return ret;
+}
+
+/* Decide whether subtest @t is excluded by the blacklist in @opt.
+ *
+ * The blacklist is a comma-separated list; an entry matches when it is a
+ * substring of opt->prepend (if set), opt->map or the subtest title.
+ *
+ * Returns 0 when some entry matches (the subtest is blacklisted), -EINVAL
+ * when there is no blacklist or nothing matches, and -ENOMEM when the list
+ * cannot be duplicated.
+ */
+static int check_blacklist(struct _test *t, struct sockmap_options *opt)
+{
+	char *entry, *ptr;
+	int ret = -EINVAL;
+
+	if (!opt->blacklist)
+		return -EINVAL;
+
+	/* strtok() modifies its input, so tokenize a private copy */
+	ptr = strdup(opt->blacklist);
+	if (!ptr)
+		return -ENOMEM;
+
+	for (entry = strtok(ptr, ","); entry; entry = strtok(NULL, ",")) {
+		if ((opt->prepend && strstr(opt->prepend, entry) != 0) ||
+		    strstr(opt->map, entry) != 0 ||
+		    strstr(t->title, entry) != 0) {
+			ret = 0;
+			break;
+		}
+	}
+
+	/* release the copy on every path; it used to be leaked */
+	free(ptr);
+	return ret;
+}
+
+/* Load the BPF object named by opt->map and run every subtest from test[]
+ * that passes the whitelist and is not blacklisted. Returns the
+ * populate_progs() result; individual subtest outcomes are tracked by the
+ * subtests themselves.
+ */
+static int __test_selftests(int cg_fd, struct sockmap_options *opt)
+{
+	int i, err;
+
+	err = populate_progs(opt->map);
+	if (err < 0) {
+		fprintf(stderr, "ERROR: (%i) load bpf failed\n", err);
+		return err;
+	}
+
+	/* Tests basic commands and APIs */
+	for (i = 0; i < sizeof(test)/sizeof(struct _test); i++) {
+		struct _test t = test[i];
+
+		/* selected means: whitelisted and then not blacklisted */
+		if (check_whitelist(&t, opt) == 0 &&
+		    check_blacklist(&t, opt) != 0) {
+			test_start_subtest(&t, opt);
+			t.tester(cg_fd, opt);
+			test_end_subtest();
+		}
+	}
+
+	return err;
+}
+
+/* Run the selftests against the sockmap BPF object. */
+static void test_selftests_sockmap(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKMAP_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
+
+/* Run the selftests against the sockhash BPF object. */
+static void test_selftests_sockhash(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ __test_selftests(cg_fd, opt);
+}
+
+/* Run the selftests against the sockhash BPF object with the global ktls
+ * flag set for the duration. opt->prepend is set to "ktls" and is not
+ * cleared afterwards.
+ */
+static void test_selftests_ktls(int cg_fd, struct sockmap_options *opt)
+{
+ opt->map = BPF_SOCKHASH_FILENAME;
+ opt->prepend = "ktls";
+ ktls = 1;
+ __test_selftests(cg_fd, opt);
+ ktls = 0;
+}
+
+/* Run the sockmap, sockhash and ktls selftest passes in that order and
+ * print the results. Always returns 0; failures of the individual passes
+ * are not propagated through the return value.
+ */
+static int test_selftest(int cg_fd, struct sockmap_options *opt)
+{
+
+ test_selftests_sockmap(cg_fd, opt);
+ test_selftests_sockhash(cg_fd, opt);
+ test_selftests_ktls(cg_fd, opt);
+ test_print_results();
+ return 0;
+}
+
+/* Entry point. Parses the command line, joins (or creates) the cgroup and
+ * then either runs the built-in selftests (the default) or a single test
+ * selected with -t using the sockmap BPF object.
+ *
+ * Returns the result of the selected run, 1 when the BPF object cannot be
+ * loaded, or a negative value when option handling or cgroup setup fails.
+ */
+int main(int argc, char **argv)
+{
+	int iov_count = 1, length = 1024, rate = 1;
+	struct sockmap_options options = {0};
+	int opt, longindex, err, cg_fd = 0;
+	char *bpf_file = BPF_SOCKMAP_FILENAME;
+	int test = SELFTESTS;
+	bool cg_created = 0;
+
+	while ((opt = getopt_long(argc, argv, ":dhv:c:r:i:l:t:p:q:n:b:",
+				  long_options, &longindex)) != -1) {
+		switch (opt) {
+		case 's':
+			txmsg_start = atoi(optarg);
+			break;
+		case 'e':
+			txmsg_end = atoi(optarg);
+			break;
+		case 'p':
+			txmsg_start_push = atoi(optarg);
+			break;
+		case 'q':
+			txmsg_end_push = atoi(optarg);
+			break;
+		case 'w':
+			txmsg_start_pop = atoi(optarg);
+			break;
+		case 'x':
+			txmsg_pop = atoi(optarg);
+			break;
+		case 'a':
+			txmsg_apply = atoi(optarg);
+			break;
+		case 'k':
+			txmsg_cork = atoi(optarg);
+			break;
+		case 'c':
+			/* Both values are open flags; O_RDONLY was previously
+			 * passed in the mode argument, which open() only
+			 * looks at when creating a file.
+			 */
+			cg_fd = open(optarg, O_DIRECTORY | O_RDONLY);
+			if (cg_fd < 0) {
+				fprintf(stderr,
+					"ERROR: (%i) open cg path failed: %s\n",
+					cg_fd, optarg);
+				return cg_fd;
+			}
+			break;
+		case 'r':
+			rate = atoi(optarg);
+			break;
+		case 'v':
+			options.verbose = 1;
+			if (optarg)
+				options.verbose = atoi(optarg);
+			break;
+		case 'i':
+			iov_count = atoi(optarg);
+			break;
+		case 'l':
+			length = atoi(optarg);
+			break;
+		case 'd':
+			options.data_test = true;
+			break;
+		case 't':
+			if (strcmp(optarg, "ping") == 0) {
+				test = PING_PONG;
+			} else if (strcmp(optarg, "sendmsg") == 0) {
+				test = SENDMSG;
+			} else if (strcmp(optarg, "base") == 0) {
+				test = BASE;
+			} else if (strcmp(optarg, "base_sendpage") == 0) {
+				test = BASE_SENDPAGE;
+			} else if (strcmp(optarg, "sendpage") == 0) {
+				test = SENDPAGE;
+			} else {
+				usage(argv);
+				return -1;
+			}
+			break;
+		case 'n':
+			options.whitelist = strdup(optarg);
+			if (!options.whitelist)
+				return -ENOMEM;
+			break;
+		case 'b':
+			options.blacklist = strdup(optarg);
+			if (!options.blacklist)
+				return -ENOMEM;
+			/* explicit break instead of falling into case 0 */
+			break;
+		case 0:
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return -1;
+		}
+	}
+
+	/* no cgroup given with -c: create one and join it */
+	if (!cg_fd) {
+		cg_fd = cgroup_setup_and_join(CG_PATH);
+		if (cg_fd < 0)
+			return cg_fd;
+		cg_created = 1;
+	}
+
+	if (test == SELFTESTS) {
+		err = test_selftest(cg_fd, &options);
+		goto out;
+	}
+
+	err = populate_progs(bpf_file);
+	if (err) {
+		fprintf(stderr, "populate program: (%s) %s\n",
+			bpf_file, strerror(errno));
+		/* same exit status as before, but via the common cleanup so
+		 * the cgroup, its fd and the filter strings are released
+		 */
+		err = 1;
+		goto out;
+	}
+	running = 1;
+
+	/* catch SIGINT */
+	signal(SIGINT, running_handler);
+
+	options.iov_count = iov_count;
+	options.iov_length = length;
+	options.rate = rate;
+
+	err = run_options(&options, cg_fd, test);
+out:
+	if (options.whitelist)
+		free(options.whitelist);
+	if (options.blacklist)
+		free(options.blacklist);
+	if (cg_created)
+		cleanup_cgroup_environment();
+	close(cg_fd);
+	return err;
+}
+
+/* Signal handler (installed for SIGINT in main): clears the global running
+ * flag. The signal number is unused.
+ */
+void running_handler(int a)
+{
+ running = 0;
+}
diff --git a/tools/testing/selftests/bpf/test_stub.c b/tools/testing/selftests/bpf/test_stub.c
new file mode 100644
index 000000000..47e132726
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_stub.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+/* Copyright (C) 2019 Netronome Systems, Inc. */
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <string.h>
+
+/* Log level handed to the loader by bpf_prog_test_load() and
+ * bpf_test_load_program(); 0 unless changed by the including test.
+ */
+int extra_prog_load_log_flags = 0;
+
+/* Load the BPF object @file as program type @type through
+ * bpf_prog_load_xattr(), always with BPF_F_TEST_RND_HI32 set and with the
+ * log level taken from extra_prog_load_log_flags. @pobj and @prog_fd are
+ * passed straight to the loader, whose return value is returned unchanged.
+ */
+int bpf_prog_test_load(const char *file, enum bpf_prog_type type,
+		       struct bpf_object **pobj, int *prog_fd)
+{
+	struct bpf_prog_load_attr load_attr;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.log_level = extra_prog_load_log_flags;
+	load_attr.prog_flags = BPF_F_TEST_RND_HI32;
+	load_attr.expected_attach_type = 0;
+	load_attr.prog_type = type;
+	load_attr.file = file;
+
+	return bpf_prog_load_xattr(&load_attr, pobj, prog_fd);
+}
+
+/* Load a raw instruction array as program type @type through
+ * bpf_load_program_xattr(), always with BPF_F_TEST_RND_HI32 set, no program
+ * name, and the log level taken from extra_prog_load_log_flags. @log_buf
+ * and @log_buf_sz are passed straight to the loader, whose return value is
+ * returned unchanged.
+ */
+int bpf_test_load_program(enum bpf_prog_type type, const struct bpf_insn *insns,
+			  size_t insns_cnt, const char *license,
+			  __u32 kern_version, char *log_buf,
+			  size_t log_buf_sz)
+{
+	struct bpf_load_program_attr attr;
+
+	memset(&attr, 0, sizeof(attr));
+
+	/* what to load */
+	attr.insns = insns;
+	attr.insns_cnt = insns_cnt;
+	attr.license = license;
+	attr.kern_version = kern_version;
+	attr.name = NULL;
+
+	/* how to load it */
+	attr.prog_type = type;
+	attr.expected_attach_type = 0;
+	attr.prog_flags = BPF_F_TEST_RND_HI32;
+	attr.log_level = extra_prog_load_log_flags;
+
+	return bpf_load_program_xattr(&attr, log_buf, log_buf_sz);
+}
diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c
new file mode 100644
index 000000000..a20a91924
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sysctl.c
@@ -0,0 +1,1636 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2019 Facebook
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/filter.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <bpf/bpf_endian.h>
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/foo"
+#define MAX_INSNS 512
+#define FIXUP_SYSCTL_VALUE 0
+
+char bpf_log_buf[BPF_LOG_BUF_SIZE];
+
+/* Description of one sysctl BPF test case. */
+struct sysctl_test {
+ /* human-readable test name */
+ const char *descr;
+ /* presumably the index of the instruction patched with the sysctl
+ * value (see FIXUP_SYSCTL_VALUE) - confirm against the runner
+ */
+ size_t fixup_value_insn;
+ /* inline program, at most MAX_INSNS instructions */
+ struct bpf_insn insns[MAX_INSNS];
+ /* presumably an object file to load instead of insns - confirm */
+ const char *prog_file;
+ /* attach type used for the program, e.g. BPF_CGROUP_SYSCTL */
+ enum bpf_attach_type attach_type;
+ /* sysctl name in path form, e.g. "kernel/ostype" */
+ const char *sysctl;
+ /* open(2) flags for the sysctl file, e.g. O_RDONLY or O_WRONLY */
+ int open_flags;
+ /* file position; the file_pos tests set it to the value their
+ * program compares against
+ */
+ int seek;
+ /* value used by the write (O_WRONLY) tests */
+ const char *newval;
+ /* presumably the value a read is expected to return - confirm */
+ const char *oldval;
+ /* expected outcome of the test */
+ enum {
+ LOAD_REJECT,
+ ATTACH_REJECT,
+ OP_EPERM,
+ SUCCESS,
+ } result;
+};
+
+static struct sysctl_test tests[] = {
+ {
+ .descr = "sysctl wrong attach_type",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = 0,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = ATTACH_REJECT,
+ },
+ {
+ .descr = "sysctl:read allow all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl:read deny all",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "ctx:write sysctl:read read ok",
+ .insns = {
+ /* If (write) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, write)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "ctx:write sysctl:write read ok",
+ .insns = {
+ /* If (write) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, write)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 1, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/domainname",
+ .open_flags = O_WRONLY,
+ .newval = "(none)", /* same as default, should fail anyway */
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "ctx:write sysctl:write read ok narrow",
+ .insns = {
+ /* u64 w = (u16)write & 1; */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, write)),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, write) + 2),
+#endif
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_7, 1),
+ /* return 1 - w; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/domainname",
+ .open_flags = O_WRONLY,
+ .newval = "(none)", /* same as default, should fail anyway */
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "ctx:write sysctl:read write reject",
+ .insns = {
+ /* write = X */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sysctl, write)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = LOAD_REJECT,
+ },
+ {
+ .descr = "ctx:file_pos sysctl:read read ok",
+ .insns = {
+ /* If (file_pos == X) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, file_pos)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 3, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .seek = 3,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "ctx:file_pos sysctl:read read ok narrow",
+ .insns = {
+ /* If (file_pos == X) */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, file_pos)),
+#else
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_1,
+ offsetof(struct bpf_sysctl, file_pos) + 3),
+#endif
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 4, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .seek = 4,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "ctx:file_pos sysctl:read write ok",
+ .insns = {
+ /* file_pos = X */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct bpf_sysctl, file_pos)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .oldval = "nux\n",
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_name sysctl_value:base ok",
+ .insns = {
+ /* sysctl_get_name arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_name arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* sysctl_get_name arg4 (flags) */
+ BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
+
+ /* sysctl_get_name(ctx, buf, buf_len, flags) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, sizeof("tcp_mem") - 1, 6),
+ /* buf == "tcp_mem\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x7463705f6d656d00ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_name sysctl_value:base E2BIG truncated",
+ .insns = {
+ /* sysctl_get_name arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_name arg3 (buf_len) too small */
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+
+ /* sysctl_get_name arg4 (flags) */
+ BPF_MOV64_IMM(BPF_REG_4, BPF_F_SYSCTL_BASE_NAME),
+
+ /* sysctl_get_name(ctx, buf, buf_len, flags) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+ /* buf[0:7] == "tcp_me\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x7463705f6d650000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_name sysctl:full ok",
+ .insns = {
+ /* sysctl_get_name arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_name arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 17),
+
+ /* sysctl_get_name arg4 (flags) */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+
+ /* sysctl_get_name(ctx, buf, buf_len, flags) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 16, 14),
+
+ /* buf[0:8] == "net/ipv4" && */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69707634ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
+
+ /* buf[8:16] == "/tcp_mem" && */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x2f7463705f6d656dULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+ /* buf[16:24] == "\0") */
+ BPF_LD_IMM64(BPF_REG_8, 0x0ULL),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_name sysctl:full E2BIG truncated",
+ .insns = {
+ /* sysctl_get_name arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_name arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 16),
+
+ /* sysctl_get_name arg4 (flags) */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+
+ /* sysctl_get_name(ctx, buf, buf_len, flags) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 10),
+
+ /* buf[0:8] == "net/ipv4" && */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69707634ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+ /* buf[8:16] == "/tcp_me\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x2f7463705f6d6500ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_name sysctl:full E2BIG truncated small",
+ .insns = {
+ /* sysctl_get_name arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_name arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+
+ /* sysctl_get_name arg4 (flags) */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+
+ /* sysctl_get_name(ctx, buf, buf_len, flags) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_name),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+ /* buf[0:8] == "net/ip\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x6e65742f69700000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_current_value sysctl:read ok, gt",
+ .insns = {
+ /* sysctl_get_current_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_current_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* sysctl_get_current_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
+
+ /* buf[0:6] == "Linux\n\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_current_value sysctl:read ok, eq",
+ .insns = {
+ /* sysctl_get_current_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 7),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_current_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+
+ /* sysctl_get_current_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 6, 6),
+
+ /* buf[0:6] == "Linux\n\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e75780a0000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_current_value sysctl:read E2BIG truncated",
+ .insns = {
+ /* sysctl_get_current_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_7, BPF_REG_0, 6),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_current_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 6),
+
+ /* sysctl_get_current_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 6),
+
+ /* buf[0:6] == "Linux\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x4c696e7578000000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "kernel/ostype",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_current_value sysctl:read EINVAL",
+ .insns = {
+ /* sysctl_get_current_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_current_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* sysctl_get_current_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 4),
+
+ /* buf[0:8] is NUL-filled) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv6/conf/lo/stable_secret", /* -EIO */
+ .open_flags = O_RDONLY,
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "sysctl_get_current_value sysctl:write ok",
+ .fixup_value_insn = 6,
+ .insns = {
+ /* sysctl_get_current_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_current_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* sysctl_get_current_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_current_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 6),
+
+ /* buf[0:4] == expected) */
+ BPF_LD_IMM64(BPF_REG_8, FIXUP_SYSCTL_VALUE),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_WRONLY,
+ .newval = "600", /* same as default, should fail anyway */
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "sysctl_get_new_value sysctl:read EINVAL",
+ .insns = {
+ /* sysctl_get_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* sysctl_get_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_get_new_value sysctl:write ok",
+ .insns = {
+ /* sysctl_get_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 4),
+
+ /* sysctl_get_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+
+ /* buf[0:4] == "606\0") */
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+ bpf_ntohl(0x36303600), 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_WRONLY,
+ .newval = "606",
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "sysctl_get_new_value sysctl:write ok long",
+ .insns = {
+ /* sysctl_get_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 24),
+
+ /* sysctl_get_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 23, 14),
+
+ /* buf[0:8] == "3000000 " && */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3330303030303020ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 10),
+
+ /* buf[8:16] == "4000000 " && */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3430303030303020ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 8),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 6),
+
+ /* buf[16:24] == "6000000\0") */
+ BPF_LD_IMM64(BPF_REG_8,
+ bpf_be64_to_cpu(0x3630303030303000ULL)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 16),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_WRONLY,
+ .newval = "3000000 4000000 6000000",
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "sysctl_get_new_value sysctl:write E2BIG",
+ .insns = {
+ /* sysctl_get_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_7, BPF_REG_0, 3),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_get_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 3),
+
+ /* sysctl_get_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_get_new_value),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -E2BIG, 4),
+
+ /* buf[0:3] == "60\0") */
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9,
+ bpf_ntohl(0x36300000), 2),
+
+ /* return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1),
+
+ /* else return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_WRONLY,
+ .newval = "606",
+ .result = OP_EPERM,
+ },
+ {
+ .descr = "sysctl_set_new_value sysctl:read EINVAL",
+ .insns = {
+ /* sysctl_set_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_set_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 3),
+
+ /* sysctl_set_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ .descr = "sysctl_set_new_value sysctl:write ok",
+ .fixup_value_insn = 2,
+ .insns = {
+ /* sysctl_set_new_value arg2 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_LD_IMM64(BPF_REG_0, FIXUP_SYSCTL_VALUE),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+
+ /* sysctl_set_new_value arg3 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_3, 3),
+
+ /* sysctl_set_new_value(ctx, buf, buf_len) */
+ BPF_EMIT_CALL(BPF_FUNC_sysctl_set_new_value),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_WRONLY,
+ .newval = "606",
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul one number string",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul multi number string",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /* "600 602\0" */
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3630302036303200ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 18),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 600, 16),
+
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_0),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 602, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul buf_len = 0, reject",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = LOAD_REJECT,
+ },
+ {
+ "bpf_strtoul supported base, ok",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x30373700)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 8),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 63, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul unsupported base, EINVAL",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x36303000)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 3),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul buf with spaces only, EINVAL",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0d0c0a09)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtoul negative number, EINVAL",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /* " -6\0" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0a2d3600)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -EINVAL, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtol negative number, ok",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /* " -6\0" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x0a2d3600)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 10),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 3, 4),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, -6, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtol hex number, ok",
+ .insns = {
+ /* arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ /* "0xfe" */
+ BPF_MOV64_IMM(BPF_REG_0,
+ bpf_ntohl(0x30786665)),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, 4),
+ /* res == expected) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 254, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtol max long",
+ .insns = {
+ /* arg1 (buf) 9223372036854775807 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3932323333373230ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3336383534373735ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3830370000000000ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 19),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+ /* if (ret == expected && */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 19, 6),
+ /* res == expected) */
+ BPF_LD_IMM64(BPF_REG_8, 0x7fffffffffffffffULL),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_9, BPF_REG_7, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_8, BPF_REG_9, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "bpf_strtol overflow, ERANGE",
+ .insns = {
+ /* arg1 (buf) 9223372036854775808 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -24),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3932323333373230ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3336383534373735ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 8),
+ BPF_LD_IMM64(BPF_REG_0,
+ bpf_be64_to_cpu(0x3830380000000000ULL)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 16),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 19),
+
+ /* arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ BPF_EMIT_CALL(BPF_FUNC_strtol),
+
+ /* if (ret == expected) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, -ERANGE, 2),
+
+ /* return ALLOW; */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_A(1),
+
+ /* else return DENY; */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+ {
+ "C prog: deny all writes",
+ .prog_file = "./test_sysctl_prog.o",
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_WRONLY,
+ .newval = "123 456 789",
+ .result = OP_EPERM,
+ },
+ {
+ "C prog: deny access by name",
+ .prog_file = "./test_sysctl_prog.o",
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/route/mtu_expires",
+ .open_flags = O_RDONLY,
+ .result = OP_EPERM,
+ },
+ {
+ "C prog: read tcp_mem",
+ .prog_file = "./test_sysctl_prog.o",
+ .attach_type = BPF_CGROUP_SYSCTL,
+ .sysctl = "net/ipv4/tcp_mem",
+ .open_flags = O_RDONLY,
+ .result = SUCCESS,
+ },
+};
+
+/*
+ * Return the number of instructions in use in fp, which must hold at least
+ * MAX_INSNS entries.
+ *
+ * The array is walked backwards until a slot with a non-zero code or imm
+ * field is found; slots after it are treated as unused. Slot 0 is never
+ * inspected, so the result is always at least 1.
+ */
+static size_t probe_prog_length(const struct bpf_insn *fp)
+{
+	size_t last = MAX_INSNS - 1;
+
+	while (last > 0 && fp[last].code == 0 && fp[last].imm == 0)
+		--last;
+
+	return last + 1;
+}
+
+/*
+ * Patch a BPF_LD_IMM64 instruction so that it loads the bytes found in buf.
+ *
+ * buf, buf_len: raw bytes to embed. At most sizeof(uint64_t) bytes are
+ *               accepted; a shorter value is padded with trailing zero bytes.
+ * prog:         instruction array to patch.
+ * insn_num:     index of the instruction to patch. The low 32 bits of the
+ *               value go to prog[insn_num].imm and the high 32 bits to
+ *               prog[insn_num + 1].imm.
+ *
+ * Returns 0 on success, -1 if the value does not fit or if prog[insn_num] is
+ * not a BPF_LD | BPF_DW | BPF_IMM instruction.
+ */
+static int fixup_sysctl_value(const char *buf, size_t buf_len,
+			      struct bpf_insn *prog, size_t insn_num)
+{
+	union {
+		uint8_t raw[sizeof(uint64_t)];
+		uint64_t num;
+	} value = {};
+
+	if (buf_len > sizeof(value)) {
+		/* buf_len is a size_t, so the matching conversion is %zu. */
+		log_err("Value is too big (%zu) to use in fixup", buf_len);
+		return -1;
+	}
+	if (prog[insn_num].code != (BPF_LD | BPF_DW | BPF_IMM)) {
+		log_err("Can fixup only BPF_LD_IMM64 insns");
+		return -1;
+	}
+
+	/* Bytes not covered by buf keep the zero from the initializer. */
+	memcpy(value.raw, buf, buf_len);
+	prog[insn_num].imm = (uint32_t)value.num;
+	prog[insn_num + 1].imm = (uint32_t)(value.num >> 32);
+
+	return 0;
+}
+
+/*
+ * Load the instruction array embedded in test as a
+ * BPF_PROG_TYPE_CGROUP_SYSCTL program.
+ *
+ * When test->fixup_value_insn is non-zero, the current content of
+ * sysctl_path is read first and patched into the instruction at that index
+ * (see fixup_sysctl_value()). Index 0 therefore cannot be used as a fixup
+ * target, because 0 means "no fixup".
+ *
+ * Returns whatever bpf_load_program_xattr() returns, or -1 if the sysctl
+ * could not be read or the fixup failed. A load failure is logged together
+ * with the verifier output unless the test expects LOAD_REJECT.
+ */
+static int load_sysctl_prog_insns(struct sysctl_test *test,
+ const char *sysctl_path)
+{
+ struct bpf_insn *prog = test->insns;
+ struct bpf_load_program_attr attr;
+ int ret;
+
+ memset(&attr, 0, sizeof(struct bpf_load_program_attr));
+ attr.prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL;
+ attr.insns = prog;
+ attr.insns_cnt = probe_prog_length(attr.insns);
+ attr.license = "GPL";
+
+ if (test->fixup_value_insn) {
+ char buf[128];
+ ssize_t len;
+ int fd;
+
+ fd = open(sysctl_path, O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ log_err("open(%s) failed", sysctl_path);
+ return -1;
+ }
+ len = read(fd, buf, sizeof(buf));
+ if (len == -1) {
+ log_err("read(%s) failed", sysctl_path);
+ close(fd);
+ return -1;
+ }
+ close(fd);
+ /* All bytes read are passed on; values over 8 bytes are rejected. */
+ if (fixup_sysctl_value(buf, len, prog, test->fixup_value_insn))
+ return -1;
+ }
+
+ ret = bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE);
+ if (ret < 0 && test->result != LOAD_REJECT) {
+ log_err(">>> Loading program error.\n"
+ ">>> Verifier output:\n%s\n-------\n", bpf_log_buf);
+ }
+
+ return ret;
+}
+
+/*
+ * Load the object file named by test->prog_file as a
+ * BPF_PROG_TYPE_CGROUP_SYSCTL program.
+ *
+ * Returns the program fd reported by bpf_prog_load_xattr(), or -1 when
+ * loading fails. The failure is logged unless the test expects LOAD_REJECT.
+ */
+static int load_sysctl_prog_file(struct sysctl_test *test)
+{
+	struct bpf_prog_load_attr load_attr;
+	struct bpf_object *obj;
+	int fd;
+
+	memset(&load_attr, 0, sizeof(load_attr));
+	load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL;
+	load_attr.file = test->prog_file;
+
+	if (!bpf_prog_load_xattr(&load_attr, &obj, &fd))
+		return fd;
+
+	if (test->result != LOAD_REJECT)
+		log_err(">>> Loading program (%s) error.\n",
+			test->prog_file);
+	return -1;
+}
+
+/*
+ * Load the program for test: from an object file when test->prog_file is
+ * set, otherwise from the instruction array embedded in the test.
+ */
+static int load_sysctl_prog(struct sysctl_test *test, const char *sysctl_path)
+{
+	if (test->prog_file)
+		return load_sysctl_prog_file(test);
+
+	return load_sysctl_prog_insns(test, sysctl_path);
+}
+
+/*
+ * Perform the sysctl access described by test on sysctl_path.
+ *
+ * The file is opened with test->open_flags (plus O_CLOEXEC) and, when
+ * test->seek is non-zero, positioned at that offset. An O_RDONLY test reads
+ * the value and, if test->oldval is set, checks that the value starts with
+ * it. An O_WRONLY test writes test->newval. Any other open_flags value is
+ * reported as an error.
+ *
+ * Returns 0 on success and a negative value on failure. A failing open(),
+ * read() or write() is not logged here; run_test_case() inspects errno after
+ * this function returns.
+ */
+static int access_sysctl(const char *sysctl_path,
+			 const struct sysctl_test *test)
+{
+	int err = 0;
+	int fd;
+
+	fd = open(sysctl_path, test->open_flags | O_CLOEXEC);
+	if (fd < 0)
+		return fd;
+
+	if (test->seek && lseek(fd, test->seek, SEEK_SET) == -1) {
+		log_err("lseek(%d) failed", test->seek);
+		goto err;
+	}
+
+	if (test->open_flags == O_RDONLY) {
+		char buf[128];
+		ssize_t len;
+
+		/*
+		 * read() does not NUL-terminate, and buf is used as a C
+		 * string below, so keep one byte for the terminator.
+		 */
+		len = read(fd, buf, sizeof(buf) - 1);
+		if (len == -1)
+			goto err;
+		buf[len] = '\0';
+
+		if (test->oldval &&
+		    strncmp(buf, test->oldval, strlen(test->oldval))) {
+			log_err("Read value %s != %s", buf, test->oldval);
+			goto err;
+		}
+	} else if (test->open_flags == O_WRONLY) {
+		if (!test->newval) {
+			log_err("New value for sysctl is not set");
+			goto err;
+		}
+		if (write(fd, test->newval, strlen(test->newval)) == -1)
+			goto err;
+	} else {
+		log_err("Unexpected sysctl access: neither read nor write");
+		goto err;
+	}
+
+	goto out;
+err:
+	err = -1;
+out:
+	close(fd);
+	return err;
+}
+
+/*
+ * Run one test case against the cgroup referred to by cgfd.
+ *
+ * Steps: load the program, attach it to the cgroup with
+ * BPF_F_ALLOW_OVERRIDE, then access /proc/sys/<test->sysctl>. A failure at
+ * a given step counts as a pass only when test->result names that outcome
+ * (LOAD_REJECT, ATTACH_REJECT, or OP_EPERM with errno == EPERM). If every
+ * step succeeds, the test passes only when test->result is SUCCESS.
+ *
+ * Prints the test description followed by PASS or FAIL.
+ * Returns 0 on pass, -1 on fail.
+ */
+static int run_test_case(int cgfd, struct sysctl_test *test)
+{
+ enum bpf_attach_type atype = test->attach_type;
+ char sysctl_path[128];
+ int progfd = -1;
+ int err = 0;
+
+ printf("Test case: %s .. ", test->descr);
+
+ snprintf(sysctl_path, sizeof(sysctl_path), "/proc/sys/%s",
+ test->sysctl);
+
+ progfd = load_sysctl_prog(test, sysctl_path);
+ if (progfd < 0) {
+ if (test->result == LOAD_REJECT)
+ goto out;
+ else
+ goto err;
+ }
+
+ if (bpf_prog_attach(progfd, cgfd, atype, BPF_F_ALLOW_OVERRIDE) == -1) {
+ if (test->result == ATTACH_REJECT)
+ goto out;
+ else
+ goto err;
+ }
+
+ /* Clear errno so the EPERM check below only sees this access. */
+ errno = 0;
+ if (access_sysctl(sysctl_path, test) == -1) {
+ if (test->result == OP_EPERM && errno == EPERM)
+ goto out;
+ else
+ goto err;
+ }
+
+ if (test->result != SUCCESS) {
+ log_err("Unexpected success");
+ goto err;
+ }
+
+ goto out;
+err:
+ err = -1;
+out:
+ /* Detaching w/o checking return code: best effort attempt. */
+ if (progfd != -1)
+ bpf_prog_detach(cgfd, atype);
+ /* NOTE(review): close() also runs when progfd is negative (load failed). */
+ close(progfd);
+ printf("[%s]\n", err ? "FAIL" : "PASS");
+ return err;
+}
+
+/*
+ * Run every entry of tests[] against the cgroup cgfd and print a summary.
+ *
+ * Returns 0 when all test cases passed, -1 if at least one failed.
+ */
+static int run_tests(int cgfd)
+{
+	int failed = 0;
+	int passed = 0;
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(tests)) {
+		if (run_test_case(cgfd, &tests[idx]) == 0)
+			passed++;
+		else
+			failed++;
+		idx++;
+	}
+
+	printf("Summary: %d PASSED, %d FAILED\n", passed, failed);
+	if (failed)
+		return -1;
+	return 0;
+}
+
+/*
+ * Join the test cgroup, run all sysctl test cases and clean up.
+ *
+ * Returns 0 when the cgroup could be set up and every test passed, -1
+ * otherwise. The cleanup calls run on every path.
+ */
+int main(int argc, char **argv)
+{
+	int status = -1;
+	int cgfd;
+
+	cgfd = cgroup_setup_and_join(CG_PATH);
+	if (cgfd >= 0 && !run_tests(cgfd))
+		status = 0;
+
+	close(cgfd);
+	cleanup_cgroup_environment();
+	return status;
+}
diff --git a/tools/testing/selftests/bpf/test_tag.c b/tools/testing/selftests/bpf/test_tag.c
new file mode 100644
index 000000000..6272c784c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tag.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <time.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <sched.h>
+#include <limits.h>
+#include <assert.h>
+
+#include <sys/socket.h>
+
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/if_alg.h>
+
+#include <bpf/bpf.h>
+
+#include "../../../include/linux/filter.h"
+#include "bpf_rlimit.h"
+
+/*
+ * Instruction buffer shared by the whole test: filled by the bpf_gen_*()
+ * generators, loaded by bpf_try_load_prog() and hashed by tag_from_alg().
+ */
+static struct bpf_insn prog[BPF_MAXINSNS];
+
+/*
+ * Fill the first insns slots of prog[] with 64-bit move-immediate
+ * instructions carrying rand() values, then overwrite the last of them with
+ * an exit instruction.
+ *
+ * fd_map is not used here; the parameter is present so that the function
+ * matches the filler callback type taken by bpf_try_load_prog().
+ */
+static void bpf_gen_imm_prog(unsigned int insns, int fd_map)
+{
+	int pos = 0;
+
+	srand(time(NULL));
+	while (pos < insns) {
+		prog[pos] = BPF_ALU64_IMM(BPF_MOV, pos % BPF_REG_10, rand());
+		pos++;
+	}
+	/* The final generated move is replaced by the program exit. */
+	prog[pos - 1] = BPF_EXIT_INSN();
+}
+
+/*
+ * Fill the first insns slots of prog[] with map-fd loads that reference
+ * fd_map, ending in an exit instruction.
+ *
+ * The loop advances two slots per iteration and copies sizeof(tmp) bytes
+ * each time - presumably BPF_LD_MAP_FD expands to two instructions; confirm
+ * against the macro definition.
+ */
+static void bpf_gen_map_prog(unsigned int insns, int fd_map)
+{
+ int i, j = 0;
+
+ for (i = 0; i + 1 < insns; i += 2) {
+ struct bpf_insn tmp[] = {
+ BPF_LD_MAP_FD(j++ % BPF_REG_10, fd_map)
+ };
+
+ memcpy(&prog[i], tmp, sizeof(tmp));
+ }
+ /* Even count: slot insns - 2 becomes a plain move before the exit. */
+ if (insns % 2 == 0)
+ prog[insns - 2] = BPF_ALU64_IMM(BPF_MOV, i % BPF_REG_10, 42);
+ prog[insns - 1] = BPF_EXIT_INSN();
+}
+
+/*
+ * Generate a program of insns instructions with bpf_filler, load it as a
+ * BPF_PROG_TYPE_SCHED_CLS program and return its fd. Loading is asserted
+ * to succeed.
+ *
+ * When a map fd was used (fd_map > 0), prog[] is regenerated afterwards
+ * with fd 0 in its place.
+ * NOTE(review): presumably so that the later AF_ALG hash of prog[] is
+ * computed with the map fd zeroed - confirm against how the kernel
+ * computes the tag.
+ */
+static int bpf_try_load_prog(int insns, int fd_map,
+			     void (*bpf_filler)(unsigned int insns,
+						int fd_map))
+{
+	int fd;
+
+	bpf_filler(insns, fd_map);
+	fd = bpf_load_program(BPF_PROG_TYPE_SCHED_CLS, prog, insns, "", 0,
+			      NULL, 0);
+	assert(fd > 0);
+
+	if (fd_map <= 0)
+		return fd;
+
+	bpf_filler(insns, 0);
+	return fd;
+}
+
+/*
+ * Convert one hexadecimal digit (0-9, a-f, A-F) to its value.
+ *
+ * Returns 0..15 for a valid digit and -1 for any other character.
+ */
+static int __hex2bin(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	/*
+	 * tolower() is only defined for EOF and values representable as
+	 * unsigned char, so a possibly negative plain char must be cast.
+	 */
+	ch = tolower((unsigned char)ch);
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	return -1;
+}
+
+/*
+ * Decode count bytes from the hexadecimal string src into dst; every output
+ * byte consumes two characters of src.
+ *
+ * Returns 0 on success, or -1 as soon as a character pair contains a
+ * non-hex character. Bytes decoded before that point stay in dst.
+ */
+static int hex2bin(uint8_t *dst, const char *src, size_t count)
+{
+	size_t n;
+
+	for (n = 0; n < count; n++) {
+		int hi = __hex2bin(src[2 * n]);
+		int lo = __hex2bin(src[2 * n + 1]);
+
+		if (hi < 0 || lo < 0)
+			return -1;
+		dst[n] = (hi << 4) | lo;
+	}
+	return 0;
+}
+
+/*
+ * Read the program tag of fd_prog from this process's fdinfo file.
+ *
+ * The first line starting with "prog_tag:\t" is located and the hex digits
+ * after the prefix are decoded into len bytes of tag. The function asserts
+ * that the file can be opened and that such a line was found and decoded.
+ */
+static void tag_from_fdinfo(int fd_prog, uint8_t *tag, uint32_t len)
+{
+	static const char prefix[] = "prog_tag:\t";
+	const int prefix_len = sizeof(prefix) - 1;
+	char line[256];
+	int err = -1;
+	FILE *fp;
+
+	/* line first holds the path, then each line read from the file. */
+	snprintf(line, sizeof(line), "/proc/%d/fdinfo/%d", getpid(),
+		 fd_prog);
+	fp = fopen(line, "r");
+	assert(fp);
+
+	while (fgets(line, sizeof(line), fp)) {
+		if (!strncmp(line, prefix, prefix_len)) {
+			err = hex2bin(tag, line + prefix_len, len);
+			break;
+		}
+	}
+
+	fclose(fp);
+	assert(!err);
+}
+
+/*
+ * Compute a tag for the first insns instructions of prog[] through an
+ * AF_ALG "hash"/"sha1" socket and store the first len bytes of the result
+ * in tag.
+ *
+ * Every step (socket, bind, accept, write, read) is asserted to succeed.
+ */
+static void tag_from_alg(int insns, uint8_t *tag, uint32_t len)
+{
+ static const struct sockaddr_alg alg = {
+ .salg_family = AF_ALG,
+ .salg_type = "hash",
+ .salg_name = "sha1",
+ };
+ int fd_base, fd_alg, ret;
+ ssize_t size;
+
+ fd_base = socket(AF_ALG, SOCK_SEQPACKET, 0);
+ assert(fd_base > 0);
+
+ ret = bind(fd_base, (struct sockaddr *)&alg, sizeof(alg));
+ assert(!ret);
+
+ fd_alg = accept(fd_base, NULL, 0);
+ assert(fd_alg > 0);
+
+ /* From here on insns is a byte count, not an instruction count. */
+ insns *= sizeof(struct bpf_insn);
+ size = write(fd_alg, prog, insns);
+ assert(size == insns);
+
+ size = read(fd_alg, tag, len);
+ assert(size == len);
+
+ close(fd_alg);
+ close(fd_base);
+}
+
+/*
+ * Print prefix followed by the len bytes of tag as two-digit lowercase hex
+ * and a newline.
+ */
+static void tag_dump(const char *prefix, uint8_t *tag, uint32_t len)
+{
+	const uint8_t *byte = tag;
+	const uint8_t *end = tag + len;
+
+	printf("%s", prefix);
+	while (byte < end)
+		printf("%02x", *byte++);
+	printf("\n");
+}
+
+/*
+ * Report a mismatch between the fdinfo tag (ftag) and the AF_ALG tag (atag)
+ * for a program of insns instructions, dump both tags and exit with
+ * status 1. The message mentions the map unless fd_map is negative.
+ */
+static void tag_exit_report(int insns, int fd_map, uint8_t *ftag,
+			    uint8_t *atag, uint32_t len)
+{
+	const char *suffix = " with map";
+
+	if (fd_map < 0)
+		suffix = "";
+
+	printf("Program tag mismatch for %d insns%s!\n", insns, suffix);
+
+	tag_dump(" fdinfo result: ", ftag, len);
+	tag_dump(" af_alg result: ", atag, len);
+	exit(1);
+}
+
+/*
+ * For every program size from start_insns up to BPF_MAXINSNS: generate and
+ * load a program with bpf_filler, then compare the tag read from fdinfo
+ * with the tag computed through AF_ALG. The first mismatch ends the process
+ * via tag_exit_report().
+ *
+ * *tests is incremented once per size that was checked.
+ */
+static void do_test(uint32_t *tests, int start_insns, int fd_map,
+		    void (*bpf_filler)(unsigned int insns, int fd))
+{
+	int insns = start_insns;
+
+	while (insns <= BPF_MAXINSNS) {
+		uint8_t ftag[8], atag[sizeof(ftag)];
+		int fd_prog;
+
+		fd_prog = bpf_try_load_prog(insns, fd_map, bpf_filler);
+		tag_from_fdinfo(fd_prog, ftag, sizeof(ftag));
+		tag_from_alg(insns, atag, sizeof(atag));
+		if (memcmp(ftag, atag, sizeof(ftag)) != 0)
+			tag_exit_report(insns, fd_map, ftag, atag,
+					sizeof(ftag));
+
+		close(fd_prog);
+		sched_yield();
+		++*tests;
+		insns++;
+	}
+}
+
+/*
+ * Create a one-entry hash map, then run five rounds of both the
+ * immediate-only and the map-referencing tag tests, and print the number of
+ * checks performed.
+ */
+int main(void)
+{
+	uint32_t tests = 0;
+	int round = 0;
+	int fd_map;
+
+	fd_map = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int),
+				sizeof(int), 1, BPF_F_NO_PREALLOC);
+	assert(fd_map > 0);
+
+	while (round < 5) {
+		do_test(&tests, 2, -1, bpf_gen_imm_prog);
+		do_test(&tests, 3, fd_map, bpf_gen_map_prog);
+		round++;
+	}
+
+	printf("test_tag: OK (%u tests)\n", tests);
+	close(fd_map);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_tc_edt.sh b/tools/testing/selftests/bpf/test_tc_edt.sh
new file mode 100755
index 000000000..daa7d1b8d
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tc_edt.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test installs a TC bpf program that throttles a TCP flow
+# with dst port = 9000 down to 5MBps. Then it measures actual
+# throughput of the flow.
+
+# The script creates network namespaces and TC state, so refuse to run
+# without root.
+if [[ $EUID -ne 0 ]]; then
+	echo "This script must be run as root"
+	echo "FAIL"
+	exit 1
+fi
+
+# check that nc, dd, and timeout are present
+command -v nc >/dev/null 2>&1 || \
+	{ echo >&2 "nc is not available"; exit 1; }
+# Report the tool that is actually missing (this used to say "nc").
+command -v dd >/dev/null 2>&1 || \
+	{ echo >&2 "dd is not available"; exit 1; }
+command -v timeout >/dev/null 2>&1 || \
+	{ echo >&2 "timeout is not available"; exit 1; }
+
+readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
+readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
+
+readonly IP_SRC="172.16.1.100"
+readonly IP_DST="172.16.2.100"
+
+# Delete both test namespaces; installed as the EXIT trap.
+cleanup()
+{
+	local ns
+
+	for ns in "${NS_SRC}" "${NS_DST}"; do
+		ip netns del "${ns}"
+	done
+}
+
+trap cleanup EXIT
+
+set -e # exit on error
+
+# Two namespaces joined by a veth pair: veth_src goes into NS_SRC and
+# veth_dst into NS_DST.
+ip netns add "${NS_SRC}"
+ip netns add "${NS_DST}"
+ip link add veth_src type veth peer name veth_dst
+ip link set veth_src netns "${NS_SRC}"
+ip link set veth_dst netns "${NS_DST}"
+
+# Address both ends, bring the links up and add a host route to the peer.
+ip -netns "${NS_SRC}" addr add "${IP_SRC}/24" dev veth_src
+ip -netns "${NS_DST}" addr add "${IP_DST}/24" dev veth_dst
+
+ip -netns "${NS_SRC}" link set dev veth_src up
+ip -netns "${NS_DST}" link set dev veth_dst up
+
+ip -netns "${NS_SRC}" route add "${IP_DST}/32" dev veth_src
+ip -netns "${NS_DST}" route add "${IP_SRC}/32" dev veth_dst
+
+# set up TC on TX
+ip netns exec "${NS_SRC}" tc qdisc add dev veth_src root fq
+ip netns exec "${NS_SRC}" tc qdisc add dev veth_src clsact
+ip netns exec "${NS_SRC}" tc filter add dev veth_src egress \
+	bpf da obj test_tc_edt.o sec cls_test
+
+
+# start the listener
+ip netns exec ${NS_DST} bash -c \
+ "nc -4 -l -p 9000 >/dev/null &"
+declare -i NC_PID=$!
+sleep 1
+
+declare -ir TIMEOUT=20
+declare -ir EXPECTED_BPS=5000000
+
+# run the load, capture RX bytes on DST
+declare -ir RX_BYTES_START=$( ip netns exec ${NS_DST} \
+ cat /sys/class/net/veth_dst/statistics/rx_bytes )
+
+set +e
+ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero \
+ bs=1000 count=1000000 > /dev/tcp/${IP_DST}/9000 2>/dev/null"
+set -e
+
+declare -ir RX_BYTES_END=$( ip netns exec ${NS_DST} \
+ cat /sys/class/net/veth_dst/statistics/rx_bytes )
+
+declare -ir ACTUAL_BPS=$(( ($RX_BYTES_END - $RX_BYTES_START) / $TIMEOUT ))
+
+awk '{printf "elapsed: %d sec; bps difference: %.2f%%\n",
+	$1, ($2-$3)*100.0/$3}' <<< "$TIMEOUT $ACTUAL_BPS $EXPECTED_BPS"
+
+# Pass the test if the actual bps is within 1% of the expected bps.
+# The difference is usually about 0.1% on a 20-sec test, and approaches
+# zero the longer the test runs.
+declare -ir RES=$( awk 'function abs(x){return ((x < 0.0) ? -x : x)}
+	{if (abs(($1-$2)*100.0/$2) > 1.0) { print "1" }
+	else { print "0"} }' <<< "$ACTUAL_BPS $EXPECTED_BPS" )
+if [[ "${RES}" != "0" ]]; then
+	echo "FAIL"
+	exit 1
+fi
+echo "PASS"
diff --git a/tools/testing/selftests/bpf/test_tc_redirect.sh b/tools/testing/selftests/bpf/test_tc_redirect.sh
new file mode 100755
index 000000000..8868aa1ca
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tc_redirect.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
+# between src and dst. The netns fwd has veth links to each src and dst. The
+# client is in src and server in dst. The test installs a TC BPF program to each
+# host facing veth in fwd which calls into i) bpf_redirect_neigh() to perform the
+# neigh addr population and redirect or ii) bpf_redirect_peer() for namespace
+# switch from ingress side; it also installs a checker prog on the egress side
+# to drop unexpected traffic.
+
+if [[ $EUID -ne 0 ]]; then
+ echo "This script must be run as root"
+ echo "FAIL"
+ exit 1
+fi
+
+# check that needed tools are present
+command -v nc >/dev/null 2>&1 || \
+ { echo >&2 "nc is not available"; exit 1; }
+command -v dd >/dev/null 2>&1 || \
+ { echo >&2 "dd is not available"; exit 1; }
+command -v timeout >/dev/null 2>&1 || \
+ { echo >&2 "timeout is not available"; exit 1; }
+command -v ping >/dev/null 2>&1 || \
+ { echo >&2 "ping is not available"; exit 1; }
+if command -v ping6 >/dev/null 2>&1; then PING6=ping6; else PING6=ping; fi
+command -v perl >/dev/null 2>&1 || \
+ { echo >&2 "perl is not available"; exit 1; }
+command -v jq >/dev/null 2>&1 || \
+ { echo >&2 "jq is not available"; exit 1; }
+command -v bpftool >/dev/null 2>&1 || \
+ { echo >&2 "bpftool is not available"; exit 1; }
+
+readonly GREEN='\033[0;92m'
+readonly RED='\033[0;31m'
+readonly NC='\033[0m' # No Color
+
+readonly PING_ARG="-c 3 -w 10 -q"
+
+readonly TIMEOUT=10
+
+readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
+readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
+readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
+
+readonly IP4_SRC="172.16.1.100"
+readonly IP4_DST="172.16.2.100"
+
+readonly IP6_SRC="::1:dead:beef:cafe"
+readonly IP6_DST="::2:dead:beef:cafe"
+
+readonly IP4_SLL="169.254.0.1"
+readonly IP4_DLL="169.254.0.2"
+readonly IP4_NET="169.254.0.0"
+
+# Delete the src, fwd and dst namespaces, in that order.
+netns_cleanup()
+{
+	local ns
+
+	for ns in "${NS_SRC}" "${NS_FWD}" "${NS_DST}"; do
+		ip netns del "${ns}"
+	done
+}
+
+# Build the src <-> fwd <-> dst topology: three namespaces, two veth pairs
+# whose *_fwd ends live in NS_FWD, /32 and /128 addresses, explicit routes
+# and static neighbour entries pointing at the fwd-side MAC addresses.
+# Sets the globals fmac_src and fmac_dst.
+netns_setup()
+{
+	local ns
+
+	for ns in "${NS_SRC}" "${NS_FWD}" "${NS_DST}"; do
+		ip netns add "${ns}"
+	done
+
+	ip link add veth_src type veth peer name veth_src_fwd
+	ip link add veth_dst type veth peer name veth_dst_fwd
+
+	ip link set veth_src netns "${NS_SRC}"
+	ip link set veth_src_fwd netns "${NS_FWD}"
+
+	ip link set veth_dst netns "${NS_DST}"
+	ip link set veth_dst_fwd netns "${NS_FWD}"
+
+	ip -netns "${NS_SRC}" addr add "${IP4_SRC}/32" dev veth_src
+	ip -netns "${NS_DST}" addr add "${IP4_DST}/32" dev veth_dst
+
+	# The fwd netns automatically get a v6 LL address / routes, but also
+	# needs v4 one in order to start ARP probing. IP4_NET route is added
+	# to the endpoints so that the ARP processing will reply.
+
+	ip -netns "${NS_FWD}" addr add "${IP4_SLL}/32" dev veth_src_fwd
+	ip -netns "${NS_FWD}" addr add "${IP4_DLL}/32" dev veth_dst_fwd
+
+	ip -netns "${NS_SRC}" addr add "${IP6_SRC}/128" dev veth_src nodad
+	ip -netns "${NS_DST}" addr add "${IP6_DST}/128" dev veth_dst nodad
+
+	ip -netns "${NS_SRC}" link set dev veth_src up
+	ip -netns "${NS_FWD}" link set dev veth_src_fwd up
+
+	ip -netns "${NS_DST}" link set dev veth_dst up
+	ip -netns "${NS_FWD}" link set dev veth_dst_fwd up
+
+	# Routes on the src side of the topology.
+	ip -netns "${NS_SRC}" route add "${IP4_DST}/32" dev veth_src scope global
+	ip -netns "${NS_SRC}" route add "${IP4_NET}/16" dev veth_src scope global
+	ip -netns "${NS_FWD}" route add "${IP4_SRC}/32" dev veth_src_fwd scope global
+
+	ip -netns "${NS_SRC}" route add "${IP6_DST}/128" dev veth_src scope global
+	ip -netns "${NS_FWD}" route add "${IP6_SRC}/128" dev veth_src_fwd scope global
+
+	# Routes on the dst side of the topology.
+	ip -netns "${NS_DST}" route add "${IP4_SRC}/32" dev veth_dst scope global
+	ip -netns "${NS_DST}" route add "${IP4_NET}/16" dev veth_dst scope global
+	ip -netns "${NS_FWD}" route add "${IP4_DST}/32" dev veth_dst_fwd scope global
+
+	ip -netns "${NS_DST}" route add "${IP6_SRC}/128" dev veth_dst scope global
+	ip -netns "${NS_FWD}" route add "${IP6_DST}/128" dev veth_dst_fwd scope global
+
+	# Static neighbour entries: each endpoint resolves its remote peer to
+	# the MAC address of the fwd-side veth it is attached to.
+	fmac_src=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_src_fwd/address)
+	fmac_dst=$(ip netns exec "${NS_FWD}" cat /sys/class/net/veth_dst_fwd/address)
+
+	ip -netns "${NS_SRC}" neigh add "${IP4_DST}" dev veth_src lladdr $fmac_src
+	ip -netns "${NS_DST}" neigh add "${IP4_SRC}" dev veth_dst lladdr $fmac_dst
+
+	ip -netns "${NS_SRC}" neigh add "${IP6_DST}" dev veth_src lladdr $fmac_src
+	ip -netns "${NS_DST}" neigh add "${IP6_SRC}" dev veth_dst lladdr $fmac_dst
+}
+
+# Run TCPv4, TCPv6, ICMPv4 and ICMPv6 checks from NS_SRC towards NS_DST.
+# Prints a coloured PASS/FAIL line per check and exits 1 on the first
+# failure. errexit is switched off for the duration so that each status
+# can be inspected, and switched back on at the end.
+netns_test_connectivity()
+{
+	set +e
+
+	ip netns exec "${NS_DST}" bash -c "nc -4 -l -p 9004 &"
+	ip netns exec "${NS_DST}" bash -c "nc -6 -l -p 9006 &"
+
+	TEST="TCPv4 connectivity test"
+	if ! ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"; then
+		echo -e "${TEST}: ${RED}FAIL${NC}"
+		exit 1
+	fi
+	echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+	TEST="TCPv6 connectivity test"
+	if ! ip netns exec "${NS_SRC}" bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"; then
+		echo -e "${TEST}: ${RED}FAIL${NC}"
+		exit 1
+	fi
+	echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+	TEST="ICMPv4 connectivity test"
+	if ! ip netns exec "${NS_SRC}" ping $PING_ARG "${IP4_DST}"; then
+		echo -e "${TEST}: ${RED}FAIL${NC}"
+		exit 1
+	fi
+	echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+	TEST="ICMPv6 connectivity test"
+	if ! ip netns exec "${NS_SRC}" $PING6 $PING_ARG "${IP6_DST}"; then
+		echo -e "${TEST}: ${RED}FAIL${NC}"
+		exit 1
+	fi
+	echo -e "${TEST}: ${GREEN}PASS${NC}"
+
+	set -e
+}
+
+hex_mem_str()
+{
+ perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
+}
+
+netns_setup_bpf()
+{
+ local obj=$1
+ local use_forwarding=${2:-0}
+
+ ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
+ ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress
+ ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress
+
+ ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
+ ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress
+ ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress
+
+ if [ "$use_forwarding" -eq "1" ]; then
+ # bpf_fib_lookup() checks if forwarding is enabled
+ ip netns exec ${NS_FWD} sysctl -w net.ipv4.ip_forward=1
+ ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_dst_fwd.forwarding=1
+ ip netns exec ${NS_FWD} sysctl -w net.ipv6.conf.veth_src_fwd.forwarding=1
+ return 0
+ fi
+
+ veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex)
+ veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex)
+
+ progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
+ for prog in $progs; do
+ map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]')
+ if [ ! -z "$map" ]; then
+ bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src)
+ bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst)
+ fi
+ done
+}
+
+trap netns_cleanup EXIT
+set -e
+
+# Run 1: test_tc_neigh.o, ifindex map filled in by netns_setup_bpf.
+netns_setup
+netns_setup_bpf test_tc_neigh.o
+netns_test_connectivity
+netns_cleanup
+
+# Run 2: test_tc_neigh_fib.o with forwarding enabled in the fwd netns.
+netns_setup
+netns_setup_bpf test_tc_neigh_fib.o 1
+netns_test_connectivity
+netns_cleanup
+
+# Run 3: test_tc_peer.o; the final teardown is left to the EXIT trap.
+netns_setup
+netns_setup_bpf test_tc_peer.o
+netns_test_connectivity
diff --git a/tools/testing/selftests/bpf/test_tc_tunnel.sh b/tools/testing/selftests/bpf/test_tc_tunnel.sh
new file mode 100755
index 000000000..7c76b841b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tc_tunnel.sh
@@ -0,0 +1,295 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# In-place tunneling
+
+# must match the port that the bpf program filters on
+readonly port=8000
+
+readonly ns_prefix="ns-$$-"
+readonly ns1="${ns_prefix}1"
+readonly ns2="${ns_prefix}2"
+
+readonly ns1_v4=192.168.1.1
+readonly ns2_v4=192.168.1.2
+readonly ns1_v6=fd::1
+readonly ns2_v6=fd::2
+
+# Must match port used by bpf program
+readonly udpport=5555
+# MPLSoverUDP
+readonly mplsudpport=6635
+readonly mplsproto=137
+
+readonly infile="$(mktemp)"
+readonly outfile="$(mktemp)"
+
+setup() {
+ ip netns add "${ns1}"
+ ip netns add "${ns2}"
+
+ ip link add dev veth1 mtu 1500 netns "${ns1}" type veth \
+ peer name veth2 mtu 1500 netns "${ns2}"
+
+ ip netns exec "${ns1}" ethtool -K veth1 tso off
+
+ ip -netns "${ns1}" link set veth1 up
+ ip -netns "${ns2}" link set veth2 up
+
+ ip -netns "${ns1}" -4 addr add "${ns1_v4}/24" dev veth1
+ ip -netns "${ns2}" -4 addr add "${ns2_v4}/24" dev veth2
+ ip -netns "${ns1}" -6 addr add "${ns1_v6}/64" dev veth1 nodad
+ ip -netns "${ns2}" -6 addr add "${ns2_v6}/64" dev veth2 nodad
+
+ # clamp route to reserve room for tunnel headers
+ ip -netns "${ns1}" -4 route flush table main
+ ip -netns "${ns1}" -6 route flush table main
+ ip -netns "${ns1}" -4 route add "${ns2_v4}" mtu 1458 dev veth1
+ ip -netns "${ns1}" -6 route add "${ns2_v6}" mtu 1438 dev veth1
+
+ sleep 1
+
+ dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
+}
+
+cleanup() {
+ ip netns del "${ns2}"
+ ip netns del "${ns1}"
+
+ if [[ -f "${outfile}" ]]; then
+ rm "${outfile}"
+ fi
+ if [[ -f "${infile}" ]]; then
+ rm "${infile}"
+ fi
+
+ if [[ -n $server_pid ]]; then
+ kill $server_pid 2> /dev/null
+ fi
+}
+
+# Start a background nc listener in ns2 on ${port} that writes what it
+# receives to ${outfile}; its PID is stored in server_pid.
+server_listen() {
+	ip netns exec "${ns2}" \
+		nc "${netcat_opt}" -l -p "${port}" > "${outfile}" &
+	server_pid=$!
+	sleep 0.2
+}
+
+# Send ${infile} from ns1 to ${addr2}:${port} and print nc's exit status.
+# The function's own return value is that of the final echo.
+client_connect() {
+	ip netns exec "${ns1}" \
+		timeout 2 nc "${netcat_opt}" -w 1 "${addr2}" "${port}" \
+		< "${infile}"
+	echo $?
+}
+
+# Wait for the listener to finish, then compare the SHA-1 of what was sent
+# with the SHA-1 of what was received; exit 1 on a mismatch.
+verify_data() {
+	wait "${server_pid}"
+	server_pid=
+	# sha1sum returns two fields [sha1] [filepath]
+	# convert to bash array and access first elem
+	insum=($(sha1sum ${infile}))
+	outsum=($(sha1sum ${outfile}))
+	if [[ "${insum[0]}" == "${outsum[0]}" ]]; then
+		return 0
+	fi
+	echo "data mismatch"
+	exit 1
+}
+
+set -e
+
+# no arguments: automated test, run all
+if [[ "$#" -eq "0" ]]; then
+	# This instance only dispatches to child invocations and never
+	# reaches the 'trap cleanup EXIT' further down, so the scratch files
+	# created by mktemp at the top of the script would be left behind.
+	# Remove them on every exit path (including a child failure under
+	# 'set -e').
+	trap 'rm -f -- "${infile}" "${outfile}"' EXIT
+
+	echo "ipip"
+	$0 ipv4 ipip none 100
+
+	echo "ip6ip6"
+	$0 ipv6 ip6tnl none 100
+
+	echo "sit"
+	$0 ipv6 sit none 100
+
+	for mac in none mpls eth ; do
+		echo "ip gre $mac"
+		$0 ipv4 gre $mac 100
+
+		echo "ip6 gre $mac"
+		$0 ipv6 ip6gre $mac 100
+
+		echo "ip gre $mac gso"
+		$0 ipv4 gre $mac 2000
+
+		echo "ip6 gre $mac gso"
+		$0 ipv6 ip6gre $mac 2000
+
+		echo "ip udp $mac"
+		$0 ipv4 udp $mac 100
+
+		echo "ip6 udp $mac"
+		$0 ipv6 ip6udp $mac 100
+
+		echo "ip udp $mac gso"
+		$0 ipv4 udp $mac 2000
+
+		echo "ip6 udp $mac gso"
+		$0 ipv6 ip6udp $mac 2000
+	done
+
+	echo "OK. All tests passed"
+	exit 0
+fi
+
+if [[ "$#" -ne "4" ]]; then
+ echo "Usage: $0"
+ echo " or: $0 <ipv4|ipv6> <tuntype> <none|mpls|eth> <data_len>"
+ exit 1
+fi
+
+case "$1" in
+"ipv4")
+ readonly addr1="${ns1_v4}"
+ readonly addr2="${ns2_v4}"
+ readonly ipproto=4
+ readonly netcat_opt=-${ipproto}
+ readonly foumod=fou
+ readonly foutype=ipip
+ readonly fouproto=4
+ readonly fouproto_mpls=${mplsproto}
+ readonly gretaptype=gretap
+ ;;
+"ipv6")
+ readonly addr1="${ns1_v6}"
+ readonly addr2="${ns2_v6}"
+ readonly ipproto=6
+ readonly netcat_opt=-${ipproto}
+ readonly foumod=fou6
+ readonly foutype=ip6tnl
+ readonly fouproto="41 -6"
+ readonly fouproto_mpls="${mplsproto} -6"
+ readonly gretaptype=ip6gretap
+ ;;
+*)
+ echo "unknown arg: $1"
+ exit 1
+ ;;
+esac
+
+readonly tuntype=$2
+readonly mac=$3
+readonly datalen=$4
+
+echo "encap ${addr1} to ${addr2}, type ${tuntype}, mac ${mac} len ${datalen}"
+
+trap cleanup EXIT
+
+setup
+
+# basic communication works
+echo "test basic connectivity"
+server_listen
+client_connect
+verify_data
+
+# clientside, insert bpf program to encap all TCP to port ${port}
+# client can no longer connect
+ip netns exec "${ns1}" tc qdisc add dev veth1 clsact
+ip netns exec "${ns1}" tc filter add dev veth1 egress \
+ bpf direct-action object-file ./test_tc_tunnel.o \
+ section "encap_${tuntype}_${mac}"
+echo "test bpf encap without decap (expect failure)"
+server_listen
+! client_connect
+
+if [[ "$tuntype" =~ "udp" ]]; then
+ # Set up fou tunnel.
+ ttype="${foutype}"
+ targs="encap fou encap-sport auto encap-dport $udpport"
+ # fou may be a module; allow this to fail.
+ modprobe "${foumod}" ||true
+ if [[ "$mac" == "mpls" ]]; then
+ dport=${mplsudpport}
+ dproto=${fouproto_mpls}
+ tmode="mode any ttl 255"
+ else
+ dport=${udpport}
+ dproto=${fouproto}
+ fi
+ ip netns exec "${ns2}" ip fou add port $dport ipproto ${dproto}
+ targs="encap fou encap-sport auto encap-dport $dport"
+elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+ ttype=$gretaptype
+else
+ ttype=$tuntype
+ targs=""
+fi
+
+# The tunnel endpoints normally use the same addresses as the inner flow;
+# tunnel address family differs from inner for SIT
+link_addr1="${addr1}"
+link_addr2="${addr2}"
+if [[ "${tuntype}" == "sit" ]]; then
+	link_addr1="${ns1_v4}"
+	link_addr2="${ns2_v4}"
+fi
+
+# serverside, insert decap module
+# server is still running
+# client can connect again
+ip netns exec "${ns2}" \
+	ip link add name testtun0 type "${ttype}" \
+	${tmode} remote "${link_addr1}" local "${link_addr2}" $targs
+
+expect_tun_fail=0
+
+if [[ "$tuntype" == "ip6udp" && "$mac" == "mpls" ]]; then
+ # No support for MPLS IPv6 fou tunnel; expect failure.
+ expect_tun_fail=1
+elif [[ "$tuntype" =~ "udp" && "$mac" == "eth" ]]; then
+ # No support for TEB fou tunnel; expect failure.
+ expect_tun_fail=1
+elif [[ "$tuntype" =~ "gre" && "$mac" == "eth" ]]; then
+ # Share ethernet address between tunnel/veth2 so L2 decap works.
+ ethaddr=$(ip netns exec "${ns2}" ip link show veth2 | \
+ awk '/ether/ { print $2 }')
+ ip netns exec "${ns2}" ip link set testtun0 address $ethaddr
+elif [[ "$mac" == "mpls" ]]; then
+ modprobe mpls_iptunnel ||true
+ modprobe mpls_gso ||true
+ ip netns exec "${ns2}" sysctl -qw net.mpls.platform_labels=65536
+ ip netns exec "${ns2}" ip -f mpls route add 1000 dev lo
+ ip netns exec "${ns2}" ip link set lo up
+ ip netns exec "${ns2}" sysctl -qw net.mpls.conf.testtun0.input=1
+ ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.lo.rp_filter=0
+fi
+
+# Because packets are decapped by the tunnel they arrive on testtun0 from
+# the IP stack perspective. Ensure reverse path filtering is disabled
+# otherwise we drop the TCP SYN as arriving on testtun0 instead of the
+# expected veth2 (veth2 is where 192.168.1.2 is configured).
+ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.all.rp_filter=0
+# rp needs to be disabled for both all and testtun0 as the rp value is
+# selected as the max of the "all" and device-specific values.
+ip netns exec "${ns2}" sysctl -qw net.ipv4.conf.testtun0.rp_filter=0
+ip netns exec "${ns2}" ip link set dev testtun0 up
+if [[ "$expect_tun_fail" != 1 ]]; then
+	echo "test bpf encap with tunnel device decap"
+	client_connect
+	verify_data
+	# Restart the listener for the next test.
+	server_listen
+else
+	# This tunnel mode is not supported, so we expect failure.
+	echo "test bpf encap with tunnel device decap (expect failure)"
+	! client_connect
+fi
+
+# bpf_skb_net_shrink does not take tunnel flags yet, cannot update L3.
+# SIT therefore stops here, before the BPF decap test.
+if [[ "${tuntype}" == "sit" ]]; then
+	echo OK
+	exit 0
+fi
+
+# serverside, use BPF for decap
+# (the tunnel device is removed and a tc ingress program takes over)
+ip netns exec "${ns2}" ip link del dev testtun0
+ip netns exec "${ns2}" tc qdisc add dev veth2 clsact
+ip netns exec "${ns2}" tc filter add dev veth2 ingress \
+	bpf direct-action object-file ./test_tc_tunnel.o \
+	section decap
+echo "test bpf encap with bpf decap"
+client_connect
+verify_data
+
+echo OK
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
new file mode 100755
index 000000000..9b3617d77
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2018 Facebook
+# Copyright (c) 2019 Cloudflare
+
+set -eu
+
+# Ping address $1 from ns1 up to MAX_PING_TRIES times, one second apart,
+# printing a progress dot per attempt. Returns as soon as a ping succeeds;
+# exits 1 with an error on stderr if none does.
+wait_for_ip()
+{
+	local _try
+	printf "Wait for IP %s to become available " "$1"
+	_try=0
+	while [ "${_try}" -lt "${MAX_PING_TRIES}" ]; do
+		_try=$((_try + 1))
+		printf "."
+		if ns1_exec ping -c 1 -W 1 "$1" >/dev/null 2>&1; then
+			echo " OK"
+			return
+		fi
+		sleep 1
+	done
+	echo 1>&2 "ERROR: Timeout waiting for test IP to become available."
+	exit 1
+}
+
+# Filter stdin: for each line containing " id ", print the first word that
+# follows the last " id " on that line.
+get_prog_id()
+{
+	awk 'index($0, " id ") { sub(/.* id /, ""); print $1 }'
+}
+
+# Run the given command line inside network namespace ns1.
+ns1_exec()
+{
+	ip netns exec ns1 "$@"
+}
+
+# Create ns1, bring up its loopback, apply the TCP sysctls used by the
+# test and wait until both loopback addresses answer ping.
+setup()
+{
+	local _knob _addr
+
+	ip netns add ns1
+	ns1_exec ip link set lo up
+
+	for _knob in \
+		net.ipv4.tcp_syncookies=2 \
+		net.ipv4.tcp_window_scaling=0 \
+		net.ipv4.tcp_timestamps=0 \
+		net.ipv4.tcp_sack=0
+	do
+		ns1_exec sysctl -w "${_knob}"
+	done
+
+	for _addr in 127.0.0.1 ::1; do
+		wait_for_ip "${_addr}"
+	done
+}
+
+# Best-effort removal of ns1: errors are silenced and never fail the
+# caller.
+cleanup()
+{
+	ip netns del ns1 2>/dev/null || true
+}
+
+# Run the user-space checker twice against ${TEST_IF} in ns1: first with
+# the program attached as a clsact ingress filter, then attached as XDP.
+# The id of the attached program is passed to ${PROG} each time.
+main()
+{
+	trap cleanup EXIT 2 3 6 15
+	setup
+
+	printf "Testing clsact..."
+	ns1_exec tc qdisc add dev "${TEST_IF}" clsact
+	ns1_exec tc filter add dev "${TEST_IF}" ingress \
+		bpf obj "${BPF_PROG_OBJ}" sec "${CLSACT_SECTION}" da
+
+	BPF_PROG_ID=$(ns1_exec tc filter show dev "${TEST_IF}" ingress | get_prog_id)
+	ns1_exec "${PROG}" "${BPF_PROG_ID}"
+	ns1_exec tc qdisc del dev "${TEST_IF}" clsact
+
+	printf "Testing XDP..."
+	ns1_exec ip link set "${TEST_IF}" xdp \
+		object "${BPF_PROG_OBJ}" section "${XDP_SECTION}"
+	BPF_PROG_ID=$(ns1_exec ip link show "${TEST_IF}" | get_prog_id)
+	ns1_exec "${PROG}" "${BPF_PROG_ID}"
+}
+
+# Locate the BPF object and the user-space checker next to this script.
+# $0 is quoted so that a path containing whitespace is not word-split.
+DIR=$(dirname "$0")
+TEST_IF=lo
+MAX_PING_TRIES=5
+BPF_PROG_OBJ="${DIR}/test_tcp_check_syncookie_kern.o"
+CLSACT_SECTION="clsact/check_syncookie"
+XDP_SECTION="xdp/check_syncookie"
+BPF_PROG_ID=0
+PROG="${DIR}/test_tcp_check_syncookie_user"
+
+main
diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
new file mode 100644
index 000000000..b9e991d43
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2018 Facebook
+// Copyright (c) 2019 Cloudflare
+
+#include <limits.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "cgroup_helpers.h"
+
+/*
+ * Create a TCP socket of @addr's family, bind it to @addr and start
+ * listening (backlog 128). Returns the listening fd, or -1 after logging
+ * the step that failed.
+ */
+static int start_server(const struct sockaddr *addr, socklen_t len)
+{
+	int fd = socket(addr->sa_family, SOCK_STREAM, 0);
+
+	if (fd == -1) {
+		log_err("Failed to create server socket");
+		return -1;
+	}
+
+	if (bind(fd, addr, len) == -1) {
+		log_err("Failed to bind server socket");
+		close(fd);
+		return -1;
+	}
+
+	if (listen(fd, 128) == -1) {
+		log_err("Failed to listen on server socket");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Open a TCP connection to the address @server_fd is bound to.
+ * Returns the connected fd, or -1 after logging the step that failed.
+ */
+static int connect_to_server(int server_fd)
+{
+	struct sockaddr_storage addr;
+	socklen_t len = sizeof(addr);
+	int fd;
+
+	if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+		log_err("Failed to get server addr");
+		return -1;
+	}
+
+	fd = socket(addr.ss_family, SOCK_STREAM, 0);
+	if (fd == -1) {
+		log_err("Failed to create client socket");
+		return -1;
+	}
+
+	if (connect(fd, (const struct sockaddr *)&addr, len) == -1) {
+		log_err("Fail to connect to server");
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Open the first map used by the program with id @prog_id.
+ * On success *@xdp tells whether the program is of type XDP and the map fd
+ * is returned; on any failure an error is logged and a negative value is
+ * returned. The temporary program fd is always closed.
+ */
+static int get_map_fd_by_prog_id(int prog_id, bool *xdp)
+{
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	__u32 map_ids[1];
+	int map_fd = -1;
+	int prog_fd;
+
+	prog_fd = bpf_prog_get_fd_by_id(prog_id);
+	if (prog_fd < 0) {
+		log_err("Failed to get fd by prog id %d", prog_id);
+		return map_fd;
+	}
+
+	/* Ask for at most one map id, stored in map_ids[]. */
+	info.nr_map_ids = 1;
+	info.map_ids = (__u64)(unsigned long)map_ids;
+
+	if (bpf_obj_get_info_by_fd(prog_fd, &info, &info_len)) {
+		log_err("Failed to get info by prog fd %d", prog_fd);
+	} else if (!info.nr_map_ids) {
+		log_err("No maps found for prog fd %d", prog_fd);
+	} else {
+		*xdp = info.type == BPF_PROG_TYPE_XDP;
+
+		map_fd = bpf_map_get_fd_by_id(map_ids[0]);
+		if (map_fd < 0)
+			log_err("Failed to get fd by map id %d", map_ids[0]);
+	}
+
+	close(prog_fd);
+	return map_fd;
+}
+
+/*
+ * Clear result slots 0..2 in @results_fd, make one connection through
+ * @server_fd and check what the BPF program recorded:
+ *   slot 0 must be non-zero,
+ *   slot 1 must equal slot 0,
+ *   slot 2 must lie in [536, USHRT_MAX].
+ * For XDP, a zero slot 1 ends the test early as a pass.
+ * Returns 0 on success and 1 on failure.
+ */
+static int run_test(int server_fd, int results_fd, bool xdp)
+{
+	int client = -1, srv_client = -1;
+	int ret = 1;	/* assume failure until a success path is reached */
+	__u32 key = 0;
+	__u32 key_gen = 1;
+	__u32 key_mss = 2;
+	__u32 value = 0;
+	__u32 value_gen = 0;
+	__u32 value_mss = 0;
+
+	/* Evaluated left to right, stopping at the first failing update. */
+	if (bpf_map_update_elem(results_fd, &key, &value, 0) < 0 ||
+	    bpf_map_update_elem(results_fd, &key_gen, &value_gen, 0) < 0 ||
+	    bpf_map_update_elem(results_fd, &key_mss, &value_mss, 0) < 0) {
+		log_err("Can't clear results");
+		goto out;
+	}
+
+	client = connect_to_server(server_fd);
+	if (client == -1)
+		goto out;
+
+	srv_client = accept(server_fd, NULL, 0);
+	if (srv_client == -1) {
+		log_err("Can't accept connection");
+		goto out;
+	}
+
+	if (bpf_map_lookup_elem(results_fd, &key, &value) < 0) {
+		log_err("Can't lookup result");
+		goto out;
+	}
+
+	if (value == 0) {
+		log_err("Didn't match syncookie: %u", value);
+		goto out;
+	}
+
+	if (bpf_map_lookup_elem(results_fd, &key_gen, &value_gen) < 0) {
+		log_err("Can't lookup result");
+		goto out;
+	}
+
+	if (xdp && value_gen == 0) {
+		// SYN packets do not get passed through generic XDP, skip the
+		// rest of the test.
+		printf("Skipping XDP cookie check\n");
+		ret = 0;
+		goto out;
+	}
+
+	if (bpf_map_lookup_elem(results_fd, &key_mss, &value_mss) < 0) {
+		log_err("Can't lookup result");
+		goto out;
+	}
+
+	if (value != value_gen) {
+		log_err("BPF generated cookie does not match kernel one");
+		goto out;
+	}
+
+	if (value_mss < 536 || value_mss > USHRT_MAX) {
+		log_err("Unexpected MSS retrieved");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	close(client);
+	close(srv_client);
+	return ret;
+}
+
+/*
+ * Usage: <prog> prog_id
+ *
+ * Look up the results map of program @prog_id, start an IPv4 and an IPv6
+ * loopback server on ephemeral ports and run the check against each.
+ * Prints "ok" and returns 0 when both pass, returns 1 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	struct sockaddr_in addr4;
+	struct sockaddr_in6 addr6;
+	int server = -1;
+	int server_v6 = -1;
+	int results = -1;
+	int err = 1;	/* cleared only once every step has succeeded */
+	bool xdp;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s prog_id\n", argv[0]);
+		exit(1);
+	}
+
+	results = get_map_fd_by_prog_id(atoi(argv[1]), &xdp);
+	if (results < 0) {
+		log_err("Can't get map");
+		goto out;
+	}
+
+	memset(&addr4, 0, sizeof(addr4));
+	addr4.sin_family = AF_INET;
+	addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	addr4.sin_port = 0;
+
+	memset(&addr6, 0, sizeof(addr6));
+	addr6.sin6_family = AF_INET6;
+	addr6.sin6_addr = in6addr_loopback;
+	addr6.sin6_port = 0;
+
+	server = start_server((const struct sockaddr *)&addr4, sizeof(addr4));
+	if (server == -1)
+		goto out;
+
+	server_v6 = start_server((const struct sockaddr *)&addr6,
+				 sizeof(addr6));
+	if (server_v6 == -1)
+		goto out;
+
+	if (run_test(server, results, xdp))
+		goto out;
+
+	if (run_test(server_v6, results, xdp))
+		goto out;
+
+	printf("ok\n");
+	err = 0;
+out:
+	close(server);
+	close(server_v6);
+	close(results);
+	return err;
+}
diff --git a/tools/testing/selftests/bpf/test_tcp_hdr_options.h b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
new file mode 100644
index 000000000..6118e3ab6
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcp_hdr_options.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2020 Facebook */
+
+#ifndef _TEST_TCP_HDR_OPTIONS_H
+#define _TEST_TCP_HDR_OPTIONS_H
+
+struct bpf_test_option {
+ __u8 flags;
+ __u8 max_delack_ms;
+ __u8 rand;
+} __attribute__((packed));
+
+enum {
+ OPTION_RESEND,
+ OPTION_MAX_DELACK_MS,
+ OPTION_RAND,
+ __NR_OPTION_FLAGS,
+};
+
+#define OPTION_F_RESEND (1 << OPTION_RESEND)
+#define OPTION_F_MAX_DELACK_MS (1 << OPTION_MAX_DELACK_MS)
+#define OPTION_F_RAND (1 << OPTION_RAND)
+#define OPTION_MASK ((1 << __NR_OPTION_FLAGS) - 1)
+
+#define TEST_OPTION_FLAGS(flags, option) (1 & ((flags) >> (option)))
+#define SET_OPTION_FLAGS(flags, option) ((flags) |= (1 << (option)))
+
+/* Store in bpf_sk_storage */
+struct hdr_stg {
+ bool active;
+ bool resend_syn; /* active side only */
+ bool syncookie; /* passive side only */
+ bool fastopen; /* passive side only */
+};
+
+struct linum_err {
+ unsigned int linum;
+ int err;
+};
+
+#define TCPHDR_FIN 0x01
+#define TCPHDR_SYN 0x02
+#define TCPHDR_RST 0x04
+#define TCPHDR_PSH 0x08
+#define TCPHDR_ACK 0x10
+#define TCPHDR_URG 0x20
+#define TCPHDR_ECE 0x40
+#define TCPHDR_CWR 0x80
+#define TCPHDR_SYNACK (TCPHDR_SYN | TCPHDR_ACK)
+
+#define TCPOPT_EOL 0
+#define TCPOPT_NOP 1
+#define TCPOPT_WINDOW 3
+#define TCPOPT_EXP 254
+
+#define TCP_BPF_EXPOPT_BASE_LEN 4
+#define MAX_TCP_HDR_LEN 60
+#define MAX_TCP_OPTION_SPACE 40
+
+#ifdef BPF_PROG_TEST_TCP_HDR_OPTIONS
+
+#define CG_OK 1
+#define CG_ERR 0
+
+#ifndef SOL_TCP
+#define SOL_TCP 6
+#endif
+
+struct tcp_exprm_opt {
+ __u8 kind;
+ __u8 len;
+ __u16 magic;
+ union {
+ __u8 data[4];
+ __u32 data32;
+ };
+} __attribute__((packed));
+
+struct tcp_opt {
+ __u8 kind;
+ __u8 len;
+ union {
+ __u8 data[4];
+ __u32 data32;
+ };
+} __attribute__((packed));
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 2);
+ __type(key, int);
+ __type(value, struct linum_err);
+} lport_linum_map SEC(".maps");
+
+/* TCP header length in bytes: the data offset field counts 32-bit words. */
+static inline unsigned int tcp_hdrlen(const struct tcphdr *th)
+{
+	return th->doff * 4;
+}
+
+/* Return the skb_tcp_flags field of @skops, narrowed to 8 bits. */
+static inline __u8 skops_tcp_flags(const struct bpf_sock_ops *skops)
+{
+	__u8 flags = skops->skb_tcp_flags;
+
+	return flags;
+}
+
+/*
+ * Turn off the PARSE_UNKNOWN_HDR_OPT and WRITE_HDR_OPT callback flags on
+ * @skops, leaving every other callback flag as it is.
+ */
+static inline void clear_hdr_cb_flags(struct bpf_sock_ops *skops)
+{
+	__u32 hdr_flags = BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
+			  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG;
+
+	bpf_sock_ops_cb_flags_set(skops,
+				  skops->bpf_sock_ops_cb_flags & ~hdr_flags);
+}
+
+/*
+ * Turn on the PARSE_UNKNOWN_HDR_OPT and WRITE_HDR_OPT callback flags on
+ * @skops, together with any additional flags passed in @extra.
+ */
+static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra)
+{
+	__u32 hdr_flags = BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
+			  BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG;
+
+	bpf_sock_ops_cb_flags_set(skops,
+				  skops->bpf_sock_ops_cb_flags |
+				  hdr_flags | extra);
+}
+/* Turn off the PARSE_ALL_HDR_OPT callback flag on @skops. */
+static inline void
+clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops)
+{
+	__u32 parse_all = BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG;
+
+	bpf_sock_ops_cb_flags_set(skops,
+				  skops->bpf_sock_ops_cb_flags & ~parse_all);
+}
+
+/* Turn on the PARSE_ALL_HDR_OPT callback flag on @skops. */
+static inline void
+set_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops)
+{
+	__u32 parse_all = BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG;
+
+	bpf_sock_ops_cb_flags_set(skops,
+				  skops->bpf_sock_ops_cb_flags | parse_all);
+}
+
+#define RET_CG_ERR(__err) ({ \
+ struct linum_err __linum_err; \
+ int __lport; \
+ \
+ __linum_err.linum = __LINE__; \
+ __linum_err.err = __err; \
+ __lport = skops->local_port; \
+ bpf_map_update_elem(&lport_linum_map, &__lport, &__linum_err, BPF_NOEXIST); \
+ clear_hdr_cb_flags(skops); \
+ clear_parse_all_hdr_cb_flags(skops); \
+ return CG_ERR; \
+})
+
+#endif /* BPF_PROG_TEST_TCP_HDR_OPTIONS */
+
+#endif /* _TEST_TCP_HDR_OPTIONS_H */
diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h
new file mode 100644
index 000000000..6220b95cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf.h
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _TEST_TCPBPF_H
+#define _TEST_TCPBPF_H
+
+struct tcpbpf_globals {
+ __u32 event_map;
+ __u32 total_retrans;
+ __u32 data_segs_in;
+ __u32 data_segs_out;
+ __u32 bad_cb_test_rv;
+ __u32 good_cb_test_rv;
+ __u64 bytes_received;
+ __u64 bytes_acked;
+ __u32 num_listen;
+ __u32 num_close_events;
+};
+#endif
diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c
new file mode 100644
index 000000000..74a9e4998
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <sys/types.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#include "test_tcpbpf.h"
+
+/* 3 comes from one listening socket + both ends of the connection */
+#define EXPECTED_CLOSE_EVENTS 3
+
+/*
+ * Compare @expected with @actual; on mismatch print both values and
+ * decrement the variable "ret", which must exist in the caller's scope.
+ *
+ * @fmt is a printf conversion WITHOUT the leading '%' (e.g. "d", PRIu32).
+ * @actual is stringified into the message, so pass a readable expression.
+ */
+#define EXPECT_EQ(expected, actual, fmt) \
+ do { \
+ if ((expected) != (actual)) { \
+ printf(" Value of: " #actual "\n" \
+ " Actual: %" fmt "\n" \
+ " Expected: %" fmt "\n", \
+ (actual), (expected)); \
+ ret--; \
+ } \
+ } while (0)
+
+/*
+ * Check the counters in @result against the values this test expects.
+ *
+ * Returns 0 when every field matches; otherwise a negative number whose
+ * magnitude is the count of mismatching fields (each failed EXPECT_EQ
+ * decrements ret).
+ */
+int verify_result(const struct tcpbpf_globals *result)
+{
+ __u32 expected_events;
+ int ret = 0;
+
+ /* one bit per sock_ops callback that must have been observed */
+ expected_events = ((1 << BPF_SOCK_OPS_TIMEOUT_INIT) |
+ (1 << BPF_SOCK_OPS_RWND_INIT) |
+ (1 << BPF_SOCK_OPS_TCP_CONNECT_CB) |
+ (1 << BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB) |
+ (1 << BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB) |
+ (1 << BPF_SOCK_OPS_NEEDS_ECN) |
+ (1 << BPF_SOCK_OPS_STATE_CB) |
+ (1 << BPF_SOCK_OPS_TCP_LISTEN_CB));
+
+ EXPECT_EQ(expected_events, result->event_map, "#" PRIx32);
+ EXPECT_EQ(501ULL, result->bytes_received, "llu");
+ EXPECT_EQ(1002ULL, result->bytes_acked, "llu");
+ EXPECT_EQ(1, result->data_segs_in, PRIu32);
+ EXPECT_EQ(1, result->data_segs_out, PRIu32);
+ EXPECT_EQ(0x80, result->bad_cb_test_rv, PRIu32);
+ EXPECT_EQ(0, result->good_cb_test_rv, PRIu32);
+ EXPECT_EQ(1, result->num_listen, PRIu32);
+ EXPECT_EQ(EXPECTED_CLOSE_EVENTS, result->num_close_events, PRIu32);
+
+ return ret;
+}
+
+/*
+ * Check the two entries of the map behind @sock_map_fd: key 0 must hold 0
+ * and key 1 must hold 1.
+ *
+ * Returns 0 on success, otherwise a negative count of failed checks.
+ * NOTE(review): "res" is compared even when the lookup failed, in which
+ * case its value is whatever was left in it.
+ */
+int verify_sockopt_result(int sock_map_fd)
+{
+ __u32 key = 0;
+ int ret = 0;
+ int res;
+ int rv;
+
+ /* check setsockopt for SAVE_SYN */
+ rv = bpf_map_lookup_elem(sock_map_fd, &key, &res);
+ EXPECT_EQ(0, rv, "d");
+ EXPECT_EQ(0, res, "d");
+ key = 1;
+ /* check getsockopt for SAVED_SYN */
+ rv = bpf_map_lookup_elem(sock_map_fd, &key, &res);
+ EXPECT_EQ(0, rv, "d");
+ EXPECT_EQ(1, res, "d");
+ return ret;
+}
+
+/*
+ * Look up the map called @name in @obj and return its file descriptor.
+ * When no such map exists, print a failure line tagged with @test and
+ * return -1.
+ */
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *found = bpf_object__find_map_by_name(obj, name);
+
+	if (found)
+		return bpf_map__fd(found);
+
+	printf("%s:FAIL:map '%s' not found\n", test, name);
+	return -1;
+}
+
+/*
+ * Load test_tcpbpf_kern.o, attach it as a sock_ops program to the cgroup
+ * "/foo", run ./tcp_server.py, then validate the contents of the
+ * "global_map" and "sockopt_results" maps.
+ *
+ * Returns 0 when every check passes and EXIT_FAILURE otherwise.
+ */
+int main(int argc, char **argv)
+{
+	const char *file = "test_tcpbpf_kern.o";
+	int prog_fd, map_fd, sock_map_fd;
+	struct tcpbpf_globals g = {0};
+	const char *cg_path = "/foo";
+	int error = EXIT_FAILURE;
+	struct bpf_object *obj;
+	int cg_fd = -1;
+	int retry = 10;
+	__u32 key = 0;
+	int rv;
+
+	cg_fd = cgroup_setup_and_join(cg_path);
+	if (cg_fd < 0)
+		goto err;
+
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		/* report the attach result, not the pending exit code */
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       rv, strerror(errno));
+		goto err;
+	}
+
+	if (system("./tcp_server.py")) {
+		printf("FAILED: TCP server\n");
+		goto err;
+	}
+
+	map_fd = bpf_find_map(__func__, obj, "global_map");
+	if (map_fd < 0)
+		goto err;
+
+	sock_map_fd = bpf_find_map(__func__, obj, "sockopt_results");
+	if (sock_map_fd < 0)
+		goto err;
+
+retry_lookup:
+	rv = bpf_map_lookup_elem(map_fd, &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	/*
+	 * The close events may not all have been counted yet; re-read the
+	 * map a bounded number of times before judging the result.
+	 */
+	if (g.num_close_events != EXPECTED_CLOSE_EVENTS && retry--) {
+		/* num_close_events is a __u32, so print it as unsigned */
+		printf("Unexpected number of close events (%u), retrying!\n",
+		       g.num_close_events);
+		usleep(100);
+		goto retry_lookup;
+	}
+
+	if (verify_result(&g)) {
+		printf("FAILED: Wrong stats\n");
+		goto err;
+	}
+
+	if (verify_sockopt_result(sock_map_fd)) {
+		printf("FAILED: Wrong sockopt stats\n");
+		goto err;
+	}
+
+	printf("PASSED!\n");
+	error = 0;
+err:
+	/* common exit path: detach, drop the cgroup fd, remove the cgroup */
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	close(cg_fd);
+	cleanup_cgroup_environment();
+	return error;
+}
diff --git a/tools/testing/selftests/bpf/test_tcpnotify.h b/tools/testing/selftests/bpf/test_tcpnotify.h
new file mode 100644
index 000000000..8b6cea030
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpnotify.h
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Use a guard of our own: the previous _TEST_TCPBPF_H is the guard of
+ * test_tcpbpf.h, so including both headers would silently drop one of them.
+ */
+#ifndef _TEST_TCPNOTIFY_H
+#define _TEST_TCPNOTIFY_H
+
+/* Record read by test_tcpnotify_user.c from the "global_map" map (key 0). */
+struct tcpnotify_globals {
+	__u32 total_retrans;
+	/* compared in user space with the number of perf events received */
+	__u32 ncalls;
+};
+
+/*
+ * Payload of one perf event. The user-space callback only counts events
+ * whose four bytes are 0xde, 0xad, 0xbe, 0xef.
+ */
+struct tcp_notifier {
+	__u8 type;
+	__u8 subtype;
+	__u8 source;
+	__u8 hash;
+};
+
+/* TCP port that the user-space test drops with iptables and connects to */
+#define TESTPORT 12877
+#endif
diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c
new file mode 100644
index 000000000..73da7fe8c
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <asm/types.h>
+#include <sys/syscall.h>
+#include <errno.h>
+#include <string.h>
+#include <linux/bpf.h>
+#include <sys/socket.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include <sys/ioctl.h>
+#include <linux/rtnetlink.h>
+#include <signal.h>
+#include <linux/perf_event.h>
+#include <linux/err.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#include "test_tcpnotify.h"
+#include "trace_helpers.h"
+
+#define SOCKET_BUFFER_SIZE (getpagesize() < 8192L ? getpagesize() : 8192L)
+
+pthread_t tid;
+int rx_callbacks;
+
+/*
+ * perf buffer sample callback: count the sample in rx_callbacks when its
+ * payload carries the expected 0xde 0xad 0xbe 0xef marker, ignore it
+ * otherwise.
+ */
+static void dummyfn(void *ctx, int cpu, void *data, __u32 size)
+{
+	const struct tcp_notifier *note = data;
+
+	if (note->type == 0xde && note->subtype == 0xad &&
+	    note->source == 0xbe && note->hash == 0xef)
+		rx_callbacks++;
+}
+
+/*
+ * Poll @pb forever with a 100 ms timeout. Only returns when
+ * perf_buffer__poll() fails with something other than -EINTR.
+ */
+void tcp_notifier_poller(struct perf_buffer *pb)
+{
+	int rc;
+
+	for (;;) {
+		rc = perf_buffer__poll(pb, 100);
+		if (rc >= 0 || rc == -EINTR)
+			continue;
+
+		printf("failed perf_buffer__poll: %d\n", rc);
+		return;
+	}
+}
+
+/* pthread entry point: @arg is the perf buffer to poll; it is returned as-is. */
+static void *poller_thread(void *arg)
+{
+	tcp_notifier_poller((struct perf_buffer *)arg);
+	return arg;
+}
+
+/*
+ * Return 0 when at least one call was recorded in @result and the number
+ * of recorded calls equals the number of perf events received
+ * (rx_callbacks); return 1 otherwise.
+ */
+int verify_result(const struct tcpnotify_globals *result)
+{
+	if (result->ncalls > 0 && result->ncalls == rx_callbacks)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Load test_tcpnotify_kern.o, attach it as a sock_ops program to the
+ * cgroup "/foo", start a thread polling the "perf_event_map" perf buffer,
+ * then connect with nc to TESTPORT while iptables drops that port.
+ * Finally compare the call count stored in "global_map" with the number
+ * of perf events received.
+ *
+ * Returns 0 on success and EXIT_FAILURE on any failure.
+ */
+int main(int argc, char **argv)
+{
+	const char *file = "test_tcpnotify_kern.o";
+	struct bpf_map *perf_map, *global_map;
+	struct perf_buffer_opts pb_opts = {};
+	struct tcpnotify_globals g = {0};
+	struct perf_buffer *pb = NULL;
+	const char *cg_path = "/foo";
+	int prog_fd, rv, cg_fd = -1;
+	int error = EXIT_FAILURE;
+	struct bpf_object *obj;
+	char test_script[80];
+	cpu_set_t cpuset;
+	__u32 key = 0;
+
+	/* pin the calling thread to CPU 0 */
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+
+	cg_fd = cgroup_setup_and_join(cg_path);
+	if (cg_fd < 0)
+		goto err;
+
+	if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) {
+		printf("FAILED: load_bpf_file failed for: %s\n", file);
+		goto err;
+	}
+
+	rv = bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SOCK_OPS, 0);
+	if (rv) {
+		/* report the attach result, not the pending exit code */
+		printf("FAILED: bpf_prog_attach: %d (%s)\n",
+		       rv, strerror(errno));
+		goto err;
+	}
+
+	perf_map = bpf_object__find_map_by_name(obj, "perf_event_map");
+	if (!perf_map) {
+		printf("FAIL:map '%s' not found\n", "perf_event_map");
+		goto err;
+	}
+
+	global_map = bpf_object__find_map_by_name(obj, "global_map");
+	if (!global_map) {
+		printf("FAIL:map '%s' not found\n", "global_map");
+		/* take the common exit path so the cgroup is cleaned up */
+		goto err;
+	}
+
+	pb_opts.sample_cb = dummyfn;
+	pb = perf_buffer__new(bpf_map__fd(perf_map), 8, &pb_opts);
+	if (IS_ERR(pb)) {
+		printf("FAILED: perf_buffer__new: %ld\n", PTR_ERR(pb));
+		goto err;
+	}
+
+	pthread_create(&tid, NULL, poller_thread, pb);
+
+	/* drop traffic to TESTPORT while nc tries to connect */
+	sprintf(test_script,
+		"iptables -A INPUT -p tcp --dport %d -j DROP",
+		TESTPORT);
+	if (system(test_script)) {
+		printf("FAILED: execute command: %s, err %d\n", test_script, -errno);
+		goto err;
+	}
+
+	/* a failing nc is reported but does not abort the test */
+	sprintf(test_script,
+		"nc 127.0.0.1 %d < /etc/passwd > /dev/null 2>&1 ",
+		TESTPORT);
+	if (system(test_script))
+		printf("execute command: %s, err %d\n", test_script, -errno);
+
+	sprintf(test_script,
+		"iptables -D INPUT -p tcp --dport %d -j DROP",
+		TESTPORT);
+	if (system(test_script)) {
+		printf("FAILED: execute command: %s, err %d\n", test_script, -errno);
+		goto err;
+	}
+
+	rv = bpf_map_lookup_elem(bpf_map__fd(global_map), &key, &g);
+	if (rv != 0) {
+		printf("FAILED: bpf_map_lookup_elem returns %d\n", rv);
+		goto err;
+	}
+
+	/* give the poller thread time to consume pending perf events */
+	sleep(10);
+
+	if (verify_result(&g)) {
+		printf("FAILED: Wrong stats Expected %d calls, got %d\n",
+		       g.ncalls, rx_callbacks);
+		goto err;
+	}
+
+	printf("PASSED!\n");
+	error = 0;
+err:
+	bpf_prog_detach(cg_fd, BPF_CGROUP_SOCK_OPS);
+	close(cg_fd);
+	cleanup_cgroup_environment();
+	if (!IS_ERR_OR_NULL(pb))
+		perf_buffer__free(pb);
+	return error;
+}
diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh
new file mode 100755
index 000000000..1ccbe804e
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_tunnel.sh
@@ -0,0 +1,798 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# End-to-end eBPF tunnel test suite
+# The script tests BPF network tunnel implementation.
+#
+# Topology:
+# ---------
+# root namespace | at_ns0 namespace
+# |
+# ----------- | -----------
+# | tnl dev | | | tnl dev | (overlay network)
+# ----------- | -----------
+# metadata-mode | native-mode
+# with bpf |
+# |
+# ---------- | ----------
+# | veth1 | --------- | veth0 | (underlay network)
+# ---------- peer ----------
+#
+#
+# Device Configuration
+# --------------------
+# Root namespace with metadata-mode tunnel + BPF
+# Device names and addresses:
+# veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay)
+# tunnel dev <type>11, ex: gre11, IPv4: 10.1.1.200, IPv6: 1::22 (overlay)
+#
+# Namespace at_ns0 with native tunnel
+# Device names and addresses:
+# veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay)
+# tunnel dev <type>00, ex: gre00, IPv4: 10.1.1.100, IPv6: 1::11 (overlay)
+#
+#
+# End-to-end ping packet flow
+# ---------------------------
+# Most of the tests start by namespace creation, device configuration,
+# then ping the underlay and overlay network. When doing 'ping 10.1.1.100'
+# from root namespace, the following operations happen:
+# 1) Route lookup shows 10.1.1.100/24 belongs to tnl dev, fwd to tnl dev.
+# 2) Tnl device's egress BPF program is triggered and sets the tunnel metadata,
+# with remote_ip=172.16.1.100 (veth0 in at_ns0) and others.
+# 3) Outer tunnel header is prepended and the packet is routed to veth1's egress
+# 4) veth0's ingress queue receives the tunneled packet at namespace at_ns0
+# 5) Tunnel protocol handler, ex: vxlan_rcv, decapsulates the packet
+# 6) The packet is forwarded to the overlay tnl dev
+
+PING_ARG="-c 3 -w 10 -q"
+ret=0
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+# Build the underlay: create namespace at_ns0 and a veth pair, move veth0
+# into at_ns0 with 172.16.1.100/24 and keep veth1 in the root namespace
+# with 172.16.1.200/24.
+config_device()
+{
+ ip netns add at_ns0
+ ip link add veth0 type veth peer name veth1
+ ip link set veth0 netns at_ns0
+ ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip link set dev veth1 up mtu 1500
+ ip addr add dev veth1 172.16.1.200/24
+}
+
+# Create the GRE overlay devices.
+# Globals read: TYPE (link type), DEV_NS (device in at_ns0), DEV (device in
+# the root namespace).
+# at_ns0 gets a device with fixed local/remote endpoints, "seq" and key 2
+# (10.1.1.100/24); the root namespace gets an "external" device with key 2
+# (10.1.1.200/24).
+add_gre_tunnel()
+{
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local 172.16.1.100 remote 172.16.1.200
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ # root namespace
+ ip link add dev $DEV type $TYPE key 2 external
+ ip link set dev $DEV up
+ ip addr add dev $DEV 10.1.1.200/24
+}
+
+# Create the GRE-over-IPv6 overlay devices (also used by test_ip6gre).
+# Globals read: TYPE, DEV_NS, DEV.
+# Adds IPv6 underlay addresses ::11 (veth0) and ::22 (veth1), then a device
+# in at_ns0 with fixed endpoints, flowlabel 0xbcdef and key 2, and an
+# "external" device in the root namespace. Both overlay devices get an
+# IPv4 and an fc80:: IPv6 address.
+add_ip6gretap_tunnel()
+{
+
+ # assign ipv6 address
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \
+ local ::11 remote ::22
+
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip addr add dev $DEV fc80::200/24
+ ip link set dev $DEV up
+}
+
+# Create the ERSPAN overlay devices.
+# Arguments: $1 - "v1" selects erspan_ver 1 (index 123); any other value
+#            selects erspan_ver 2 (direction egress, hwid 3).
+# Globals read: TYPE, DEV_NS, DEV.
+add_erspan_tunnel()
+{
+ # at_ns0 namespace
+ if [ "$1" == "v1" ]; then
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local 172.16.1.100 remote 172.16.1.200 \
+ erspan_ver 1 erspan 123
+ else
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local 172.16.1.100 remote 172.16.1.200 \
+ erspan_ver 2 erspan_dir egress erspan_hwid 3
+ fi
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip link set dev $DEV up
+ ip addr add dev $DEV 10.1.1.200/24
+}
+
+# Create the ERSPAN-over-IPv6 overlay devices.
+# Arguments: $1 - "v1" selects erspan_ver 1 (index 123); any other value
+#            selects erspan_ver 2 (direction egress, hwid 7).
+# Globals read: TYPE, DEV_NS, DEV.
+# Also adds the IPv6 underlay addresses ::11 (veth0) and ::22 (veth1).
+add_ip6erspan_tunnel()
+{
+
+ # assign ipv6 address
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # at_ns0 namespace
+ if [ "$1" == "v1" ]; then
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local ::11 remote ::22 \
+ erspan_ver 1 erspan 123
+ else
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE seq key 2 \
+ local ::11 remote ::22 \
+ erspan_ver 2 erspan_dir egress erspan_hwid 7
+ fi
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip link set dev $DEV up
+}
+
+# Create the VXLAN overlay devices (VNI 2, UDP port 4789, GBP enabled).
+# Globals read: TYPE, DEV_NS, DEV.
+# Both ends get fixed MAC addresses and static ARP entries for the peer,
+# and at_ns0 marks all outgoing packets with 0x800FF via iptables.
+add_vxlan_tunnel()
+{
+ # Set static ARP entry here because iptables set-mark works
+ # on L3 packet, as a result not applying to ARP packets,
+ # causing errors at get_tunnel_{key/opt}.
+
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE \
+ id 2 dstport 4789 gbp remote 172.16.1.200
+ ip netns exec at_ns0 \
+ ip link set dev $DEV_NS address 52:54:00:d9:01:00 up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 arp -s 10.1.1.200 52:54:00:d9:02:00
+ ip netns exec at_ns0 iptables -A OUTPUT -j MARK --set-mark 0x800FF
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external gbp dstport 4789
+ ip link set dev $DEV address 52:54:00:d9:02:00 up
+ ip addr add dev $DEV 10.1.1.200/24
+ arp -s 10.1.1.100 52:54:00:d9:01:00
+}
+
+# Create the VXLAN-over-IPv6 overlay devices (VNI 22, UDP port 4789).
+# Globals read: TYPE, DEV_NS, DEV.
+# Adds the IPv6 underlay addresses ::11 (veth0) and ::22 (veth1); the IPv4
+# underlay addresses are left in place (the removal lines are commented
+# out).
+add_ip6vxlan_tunnel()
+{
+ #ip netns exec at_ns0 ip -4 addr del 172.16.1.100 dev veth0
+ ip netns exec at_ns0 ip -6 addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ #ip -4 addr del 172.16.1.200 dev veth1
+ ip -6 addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE id 22 dstport 4789 \
+ local ::11 remote ::22
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external dstport 4789
+ ip addr add dev $DEV 10.1.1.200/24
+ ip link set dev $DEV up
+}
+
+# Create the GENEVE overlay devices (VNI 2, UDP port 6081).
+# Globals read: TYPE, DEV_NS, DEV.
+# at_ns0 gets a device with a fixed remote (10.1.1.100/24); the root
+# namespace gets an "external" device (10.1.1.200/24).
+add_geneve_tunnel()
+{
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE \
+ id 2 dstport 6081 remote 172.16.1.200
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ # root namespace
+ ip link add dev $DEV type $TYPE dstport 6081 external
+ ip link set dev $DEV up
+ ip addr add dev $DEV 10.1.1.200/24
+}
+
+# Create the GENEVE-over-IPv6 overlay devices (VNI 22).
+# Globals read: TYPE, DEV_NS, DEV.
+# Adds the IPv6 underlay addresses ::11 (veth0) and ::22 (veth1) first.
+add_ip6geneve_tunnel()
+{
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE id 22 \
+ remote ::22 # geneve has no local option
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip link set dev $DEV up
+}
+
+# Create the IPIP overlay devices.
+# Globals read: TYPE, DEV_NS, DEV.
+# at_ns0 gets a device with fixed local/remote endpoints (10.1.1.100/24);
+# the root namespace gets an "external" device (10.1.1.200/24).
+add_ipip_tunnel()
+{
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE \
+ local 172.16.1.100 remote 172.16.1.200
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip link set dev $DEV up
+ ip addr add dev $DEV 10.1.1.200/24
+}
+
+# Create the ip6tnl overlay devices used by the IPIP6 and IP6IP6 tests.
+# Globals read: TYPE, DEV_NS, DEV.
+# Adds the IPv6 underlay addresses ::11 (veth0) and ::22 (veth1); each
+# overlay device gets an IPv4 address and a 1::/96 IPv6 address.
+add_ip6tnl_tunnel()
+{
+ ip netns exec at_ns0 ip addr add ::11/96 dev veth0
+ ip netns exec at_ns0 ip link set dev veth0 up
+ ip addr add dev veth1 ::22/96
+ ip link set dev veth1 up
+
+ # at_ns0 namespace
+ ip netns exec at_ns0 \
+ ip link add dev $DEV_NS type $TYPE \
+ local ::11 remote ::22
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24
+ ip netns exec at_ns0 ip addr add dev $DEV_NS 1::11/96
+ ip netns exec at_ns0 ip link set dev $DEV_NS up
+
+ # root namespace
+ ip link add dev $DEV type $TYPE external
+ ip addr add dev $DEV 10.1.1.200/24
+ ip addr add dev $DEV 1::22/96
+ ip link set dev $DEV up
+}
+
+# gretap test: ping across the overlay in both directions.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL when any checked ping failed, 0 otherwise.
+test_gre()
+{
+	TYPE=gretap
+	DEV_NS=gretap00
+	DEV=gretap11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_gre_tunnel
+	attach_bpf $DEV gre_set_tunnel gre_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6gre test: IPv4 and IPv6 pings over a GRE-over-IPv6 tunnel.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Only the pings followed by check_err influence the verdict.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_ip6gre()
+{
+	TYPE=ip6gre
+	DEV_NS=ip6gre00
+	DEV=ip6gre11
+	ret=0
+
+	check $TYPE
+	config_device
+	# the ip6gretap setup works for ip6gre as well
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 $PING_ARG fc80::200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6gretap test: IPv4 and IPv6 pings over a gretap-over-IPv6 tunnel.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Only the pings followed by check_err influence the verdict.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_ip6gretap()
+{
+	TYPE=ip6gretap
+	DEV_NS=ip6gretap00
+	DEV=ip6gretap11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6gretap_tunnel
+	attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# overlay: ipv4 over ipv6
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	# overlay: ipv6 over ipv6
+	ip netns exec at_ns0 ping6 $PING_ARG fc80::200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# erspan test: ping across the overlay in both directions.
+# Arguments: $1 - ERSPAN version selector handed to add_erspan_tunnel.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_erspan()
+{
+	TYPE=erspan
+	DEV_NS=erspan00
+	DEV=erspan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_erspan_tunnel $1
+	attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6erspan test: IPv4 ping from at_ns0 over an ERSPAN-over-IPv6 tunnel.
+# Arguments: $1 - ERSPAN version selector handed to add_ip6erspan_tunnel.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# The underlay ping6 is not checked. Returns 1 after printing FAIL,
+# 0 otherwise.
+test_ip6erspan()
+{
+	TYPE=ip6erspan
+	DEV_NS=ip6erspan00
+	DEV=ip6erspan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6erspan_tunnel $1
+	attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel
+	ping6 $PING_ARG ::11
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# vxlan test: ping across the overlay in both directions.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_vxlan()
+{
+	TYPE=vxlan
+	DEV_NS=vxlan00
+	DEV=vxlan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_vxlan_tunnel
+	attach_bpf $DEV vxlan_set_tunnel vxlan_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6vxlan test: IPv4 pings in both directions over a VXLAN-over-IPv6
+# tunnel. TYPE stays "vxlan", so the result line is printed as "ip6$TYPE".
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_ip6vxlan()
+{
+	TYPE=vxlan
+	DEV_NS=ip6vxlan00
+	DEV=ip6vxlan11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6vxlan_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ip6vxlan_set_tunnel ip6vxlan_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# ip4 over ip6
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: ip6$TYPE"${NC}
+		return 1
+	fi
+}
+
+# geneve test: ping across the overlay in both directions.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_geneve()
+{
+	TYPE=geneve
+	DEV_NS=geneve00
+	DEV=geneve11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_geneve_tunnel
+	attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6geneve test: IPv4 pings in both directions over a GENEVE-over-IPv6
+# tunnel. TYPE stays "geneve", so the result line is printed as "ip6$TYPE".
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_ip6geneve()
+{
+	TYPE=geneve
+	DEV_NS=ip6geneve00
+	DEV=ip6geneve11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6geneve_tunnel
+	attach_bpf $DEV ip6geneve_set_tunnel ip6geneve_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: ip6$TYPE"${NC}
+		return 1
+	fi
+}
+
+# ipip test: ping across the overlay in both directions.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# Returns 1 after printing FAIL, 0 otherwise.
+test_ipip()
+{
+	TYPE=ipip
+	DEV_NS=ipip00
+	DEV=ipip11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ipip_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ipip6 test: IPv4 pings in both directions over an ip6tnl tunnel.
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# The underlay ping6 is not checked. Returns 1 after printing FAIL,
+# 0 otherwise.
+test_ipip6()
+{
+	TYPE=ip6tnl
+	DEV_NS=ipip6tnl00
+	DEV=ipip6tnl11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6tnl_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# ip4 over ip6
+	ping $PING_ARG 10.1.1.100
+	check_err $?
+	ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: $TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: $TYPE"${NC}
+		return 1
+	fi
+}
+
+# ip6ip6 test: IPv6 pings in both directions over an ip6tnl tunnel.
+# The result line is printed as "ip6$TYPE".
+# Globals written: TYPE, DEV_NS, DEV, ret.
+# The underlay ping6 is not checked. Returns 1 after printing FAIL,
+# 0 otherwise.
+test_ip6ip6()
+{
+	TYPE=ip6tnl
+	DEV_NS=ip6ip6tnl00
+	DEV=ip6ip6tnl11
+	ret=0
+
+	check $TYPE
+	config_device
+	add_ip6tnl_tunnel
+	ip link set dev veth1 mtu 1500
+	attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel
+	# underlay
+	ping6 $PING_ARG ::11
+	# ip6 over ip6
+	ping6 $PING_ARG 1::11
+	check_err $?
+	ip netns exec at_ns0 ping6 $PING_ARG 1::22
+	check_err $?
+	cleanup
+
+	if [ $ret -eq 0 ]; then
+		echo -e ${GREEN}"PASS: ip6$TYPE"${NC}
+	else
+		echo -e ${RED}"FAIL: ip6$TYPE"${NC}
+		return 1
+	fi
+}
+
+# Install matching ESP tunnel-mode xfrm states and policies in at_ns0 and in
+# the root namespace, plus the overlay addresses and routes on the veth
+# devices.
+# Globals written: auth, enc, spi_in_to_out, spi_out_to_in.
+# SPI 0x1 / reqid 1 covers at_ns0 -> root; SPI 0x2 / reqid 2 covers
+# root -> at_ns0.
+setup_xfrm_tunnel()
+{
+ # keys: "0x" followed by forty '1' digits / thirty-two '2' digits
+ auth=0x$(printf '1%.0s' {1..40})
+ enc=0x$(printf '2%.0s' {1..32})
+ spi_in_to_out=0x1
+ spi_out_to_in=0x2
+ # at_ns0 namespace
+ # at_ns0 -> root
+ ip netns exec at_ns0 \
+ ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+ spi $spi_in_to_out reqid 1 mode tunnel \
+ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+ ip netns exec at_ns0 \
+ ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir out \
+ tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+ mode tunnel
+ # root -> at_ns0
+ ip netns exec at_ns0 \
+ ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+ spi $spi_out_to_in reqid 2 mode tunnel \
+ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+ ip netns exec at_ns0 \
+ ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir in \
+ tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+ mode tunnel
+ # address & route
+ ip netns exec at_ns0 \
+ ip addr add dev veth0 10.1.1.100/32
+ ip netns exec at_ns0 \
+ ip route add 10.1.1.200 dev veth0 via 172.16.1.200 \
+ src 10.1.1.100
+
+ # root namespace
+ # at_ns0 -> root
+ ip xfrm state add src 172.16.1.100 dst 172.16.1.200 proto esp \
+ spi $spi_in_to_out reqid 1 mode tunnel \
+ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+ ip xfrm policy add src 10.1.1.100/32 dst 10.1.1.200/32 dir in \
+ tmpl src 172.16.1.100 dst 172.16.1.200 proto esp reqid 1 \
+ mode tunnel
+ # root -> at_ns0
+ ip xfrm state add src 172.16.1.200 dst 172.16.1.100 proto esp \
+ spi $spi_out_to_in reqid 2 mode tunnel \
+ auth-trunc 'hmac(sha1)' $auth 96 enc 'cbc(aes)' $enc
+ ip xfrm policy add src 10.1.1.200/32 dst 10.1.1.100/32 dir out \
+ tmpl src 172.16.1.200 dst 172.16.1.100 proto esp reqid 2 \
+ mode tunnel
+ # address & route
+ ip addr add dev veth1 10.1.1.200/32
+ ip route add 10.1.1.100 dev veth1 via 172.16.1.100 src 10.1.1.200
+}
+
+test_xfrm_tunnel()
+{
+ config_device
+ > /sys/kernel/debug/tracing/trace
+ setup_xfrm_tunnel
+ tc qdisc add dev veth1 clsact
+ tc filter add dev veth1 proto ip ingress bpf da obj test_tunnel_kern.o \
+ sec xfrm_get_state
+ ip netns exec at_ns0 ping $PING_ARG 10.1.1.200
+ sleep 1
+ grep "reqid 1" /sys/kernel/debug/tracing/trace
+ check_err $?
+ grep "spi 0x1" /sys/kernel/debug/tracing/trace
+ check_err $?
+ grep "remote ip 0xac100164" /sys/kernel/debug/tracing/trace
+ check_err $?
+ cleanup
+
+ if [ $ret -ne 0 ]; then
+ echo -e ${RED}"FAIL: xfrm tunnel"${NC}
+ return 1
+ fi
+ echo -e ${GREEN}"PASS: xfrm tunnel"${NC}
+}
+
+# Attach programs from test_tunnel_kern.o to a tunnel device.
+# Arguments: $1 - device, $2 - ELF section for egress, $3 - ELF section
+#            for ingress.
+# Globals written: DEV, SET, GET.
+attach_bpf()
+{
+	DEV=$1 SET=$2 GET=$3
+
+	tc qdisc add dev $DEV clsact
+	tc filter add dev $DEV egress bpf da obj test_tunnel_kern.o sec $SET
+	tc filter add dev $DEV ingress bpf da obj test_tunnel_kern.o sec $GET
+}
+
+# Remove everything a test may have created: the at_ns0 namespace, veth1,
+# every root-namespace tunnel device and the xfrm policies/states.
+# Errors are discarded because most of these objects do not exist for any
+# given test.
+cleanup()
+{
+	local dev
+
+	ip netns delete at_ns0 2> /dev/null
+	for dev in veth1 ipip11 ipip6tnl11 ip6ip6tnl11 gretap11 ip6gre11 \
+		   ip6gretap11 vxlan11 ip6vxlan11 geneve11 ip6geneve11 \
+		   erspan11 ip6erspan11; do
+		ip link del $dev 2> /dev/null
+	done
+	ip xfrm policy delete dir out src 10.1.1.200/32 dst 10.1.1.100/32 2> /dev/null
+	ip xfrm policy delete dir in src 10.1.1.100/32 dst 10.1.1.200/32 2> /dev/null
+	ip xfrm state delete src 172.16.1.100 dst 172.16.1.200 proto esp spi 0x1 2> /dev/null
+	ip xfrm state delete src 172.16.1.200 dst 172.16.1.100 proto esp spi 0x2 2> /dev/null
+}
+
+# Signal handler: announce the signal, clean up and exit with status 0.
+# NOTE(review): the script registers this for signals 2 and 9, but SIGKILL
+# cannot be caught, so in practice only SIGINT reaches it.
+cleanup_exit()
+{
+ echo "CATCH SIGKILL or SIGINT, cleanup and exit"
+ cleanup
+ exit 0
+}
+
+# Check that "ip link help" mentions link type $1.
+# Returns 0 when it does; otherwise prints a SKIP line, runs cleanup and
+# returns 1. The test functions in this script do not act on the result.
+check()
+{
+	if ip link help 2>&1 | grep -q "\s$1\s"; then
+		return 0
+	fi
+
+	echo "SKIP $1: iproute2 not support"
+	cleanup
+	return 1
+}
+
+enable_debug()
+{
+ echo 'file ip_gre.c +p' > /sys/kernel/debug/dynamic_debug/control
+ echo 'file ip6_gre.c +p' > /sys/kernel/debug/dynamic_debug/control
+ echo 'file vxlan.c +p' > /sys/kernel/debug/dynamic_debug/control
+ echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control
+ echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control
+}
+
+check_err()
+{
+ if [ $ret -eq 0 ]; then
+ ret=$1
+ fi
+}
+
+bpf_tunnel_test()
+{
+ local errors=0
+
+ echo "Testing GRE tunnel..."
+ test_gre
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6GRE tunnel..."
+ test_ip6gre
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6GRETAP tunnel..."
+ test_ip6gretap
+ errors=$(( $errors + $? ))
+
+ echo "Testing ERSPAN tunnel..."
+ test_erspan v2
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6ERSPAN tunnel..."
+ test_ip6erspan v2
+ errors=$(( $errors + $? ))
+
+ echo "Testing VXLAN tunnel..."
+ test_vxlan
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6VXLAN tunnel..."
+ test_ip6vxlan
+ errors=$(( $errors + $? ))
+
+ echo "Testing GENEVE tunnel..."
+ test_geneve
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6GENEVE tunnel..."
+ test_ip6geneve
+ errors=$(( $errors + $? ))
+
+ echo "Testing IPIP tunnel..."
+ test_ipip
+ errors=$(( $errors + $? ))
+
+ echo "Testing IPIP6 tunnel..."
+ test_ipip6
+ errors=$(( $errors + $? ))
+
+ echo "Testing IP6IP6 tunnel..."
+ test_ip6ip6
+ errors=$(( $errors + $? ))
+
+ echo "Testing IPSec tunnel..."
+ test_xfrm_tunnel
+ errors=$(( $errors + $? ))
+
+ return $errors
+}
+
+trap cleanup 0 3 6
+trap cleanup_exit 2 9
+
+cleanup
+bpf_tunnel_test
+
+if [ $? -ne 0 ]; then
+ echo -e "$(basename $0): ${RED}FAIL${NC}"
+ exit 1
+fi
+echo -e "$(basename $0): ${GREEN}PASS${NC}"
+exit 0
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
new file mode 100644
index 000000000..961c17b46
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -0,0 +1,1244 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Testsuite for eBPF verifier
+ *
+ * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2017 Facebook
+ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io
+ */
+
+#include <endian.h>
+#include <asm/types.h>
+#include <linux/types.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <limits.h>
+#include <assert.h>
+
+#include <sys/capability.h>
+
+#include <linux/unistd.h>
+#include <linux/filter.h>
+#include <linux/bpf_perf_event.h>
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/btf.h>
+
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#ifdef HAVE_GENHDR
+# include "autoconf.h"
+#else
+# if defined(__i386) || defined(__x86_64) || defined(__s390x__) || defined(__aarch64__)
+# define CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS 1
+# endif
+#endif
+#include "bpf_rlimit.h"
+#include "bpf_rand.h"
+#include "bpf_util.h"
+#include "test_btf.h"
+#include "../../../include/linux/filter.h"
+
+#define MAX_INSNS BPF_MAXINSNS
+#define MAX_TEST_INSNS 1000000
+#define MAX_FIXUPS 8
+#define MAX_NR_MAPS 21
+#define MAX_TEST_RUNS 8
+#define POINTER_VALUE 0xcafe4all
+#define TEST_DATA_LEN 64
+
+#define F_NEEDS_EFFICIENT_UNALIGNED_ACCESS (1 << 0)
+#define F_LOAD_WITH_STRICT_ALIGNMENT (1 << 1)
+
+#define UNPRIV_SYSCTL "kernel/unprivileged_bpf_disabled"
+static bool unpriv_disabled = false;
+static int skips;
+static bool verbose = false;
+
+/* Description of one verifier test case: the program, the map fixups it
+ * needs, and the expected load/run results for privileged and
+ * unprivileged runs.
+ */
+struct bpf_test {
+ /* test name printed by do_test() */
+ const char *descr;
+ /* static program; its length is found by probe_filter_length() */
+ struct bpf_insn insns[MAX_INSNS];
+ /* program generated by fill_helper; allocated in do_test_fixup() and
+  * freed in do_test_single(). When set it is used instead of insns.
+  */
+ struct bpf_insn *fill_insns;
+ /* Each fixup_* array is a zero-terminated list of instruction indexes.
+  * do_test_fixup() creates the matching map (or prog array) and stores
+  * its fd in the imm field of every listed instruction.
+  */
+ int fixup_map_hash_8b[MAX_FIXUPS];
+ int fixup_map_hash_48b[MAX_FIXUPS];
+ int fixup_map_hash_16b[MAX_FIXUPS];
+ int fixup_map_array_48b[MAX_FIXUPS];
+ int fixup_map_sockmap[MAX_FIXUPS];
+ int fixup_map_sockhash[MAX_FIXUPS];
+ int fixup_map_xskmap[MAX_FIXUPS];
+ int fixup_map_stacktrace[MAX_FIXUPS];
+ int fixup_prog1[MAX_FIXUPS];
+ int fixup_prog2[MAX_FIXUPS];
+ int fixup_map_in_map[MAX_FIXUPS];
+ int fixup_cgroup_storage[MAX_FIXUPS];
+ int fixup_percpu_cgroup_storage[MAX_FIXUPS];
+ int fixup_map_spin_lock[MAX_FIXUPS];
+ int fixup_map_array_ro[MAX_FIXUPS];
+ int fixup_map_array_wo[MAX_FIXUPS];
+ int fixup_map_array_small[MAX_FIXUPS];
+ int fixup_sk_storage_map[MAX_FIXUPS];
+ int fixup_map_event_output[MAX_FIXUPS];
+ int fixup_map_reuseport_array[MAX_FIXUPS];
+ int fixup_map_ringbuf[MAX_FIXUPS];
+ /* expected verifier log text: a substring for rejected programs, or a
+  * tab-separated sequence of substrings for VERBOSE_ACCEPT (see
+  * cmp_str_seq()). errstr_unpriv, when set, overrides it for
+  * unprivileged runs.
+  */
+ const char *errstr;
+ const char *errstr_unpriv;
+ /* when non-zero, the "processed N" count expected in the verifier log
+  * of the privileged run
+  */
+ uint32_t insn_processed;
+ /* number of instructions in fill_insns, set by the fill helper */
+ int prog_len;
+ /* expected load result; result_unpriv overrides result for
+  * unprivileged runs unless it is UNDEF
+  */
+ enum {
+ UNDEF,
+ ACCEPT,
+ REJECT,
+ VERBOSE_ACCEPT,
+ } result, result_unpriv;
+ /* 0 is treated as BPF_PROG_TYPE_SOCKET_FILTER by do_test_single() */
+ enum bpf_prog_type prog_type;
+ /* F_* flags defined above; other bits are passed on as prog flags */
+ uint8_t flags;
+ /* optional generator that fills fill_insns and sets prog_len */
+ void (*fill_helper)(struct bpf_test *self);
+ /* number of test runs: 0 means one run, a negative value disables
+  * running the loaded program
+  */
+ int runs;
+#define bpf_testdata_struct_t \
+ struct { \
+ uint32_t retval, retval_unpriv; \
+ union { \
+ __u8 data[TEST_DATA_LEN]; \
+ __u64 data64[TEST_DATA_LEN / 8]; \
+ }; \
+ }
+ /* expected return value(s) and input data; the anonymous member
+  * overlays retvals[0] so single-run tests can set .retval directly
+  */
+ union {
+ bpf_testdata_struct_t;
+ bpf_testdata_struct_t retvals[MAX_TEST_RUNS];
+ };
+ enum bpf_attach_type expected_attach_type;
+ /* for BPF_PROG_TYPE_TRACING tests: name resolved to a BTF id through
+  * libbpf_find_vmlinux_btf_id()
+  */
+ const char *kfunc;
+};
+
+/* Note we want this to be 64 bit aligned so that the end of our array is
+ * actually the end of the structure.
+ */
+#define MAX_ENTRIES 11
+
+/* Map value made of an index followed by MAX_ENTRIES ints; its size is
+ * used as the value size of the "48b" hash/array maps in do_test_fixup().
+ */
+struct test_val {
+ unsigned int index;
+ int foo[MAX_ENTRIES];
+};
+
+/* Map value of two 64-bit fields; its size is used as the value size of
+ * the "16b" hash map in do_test_fixup().
+ */
+struct other_val {
+ long long foo;
+ long long bar;
+};
+
+/* Fill helper: emits 5 rounds of PUSH_CNT x {load skb->data[0], vlan_push}
+ * followed by PUSH_CNT x {load skb->data[0], vlan_pop}, then pads the
+ * program to a fixed length. Every check jumps to the error label at
+ * insn[len - 2] on failure. Writes self->fill_insns and self->prog_len.
+ */
+static void bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self)
+{
+	/* test: {skb->data[0], vlan_push} x 51 + {skb->data[0], vlan_pop} x 51 */
+#define PUSH_CNT 51
+	/* jump range is limited to 16 bit. PUSH_CNT of ld_abs needs room */
+	unsigned int len = (1 << 15) - PUSH_CNT * 2 * 5 * 6;
+	struct bpf_insn *insn = self->fill_insns;
+	int i = 0, j, k;
+
+	insn[i++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	for (k = 0; k < 5; k++) {
+		for (j = 0; j < PUSH_CNT; j++) {
+			insn[i++] = BPF_LD_ABS(BPF_B, 0);
+			/* jump to error label */
+			insn[i] = BPF_JMP32_IMM(BPF_JNE, BPF_REG_0, 0x34,
+						len - i - 3);
+			i++;
+			insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+			insn[i++] = BPF_MOV64_IMM(BPF_REG_2, 1);
+			insn[i++] = BPF_MOV64_IMM(BPF_REG_3, 2);
+			/* statement terminated with ';' (was a stray ',') */
+			insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+						 BPF_FUNC_skb_vlan_push);
+			insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0,
+					      len - i - 3);
+			i++;
+		}
+
+		for (j = 0; j < PUSH_CNT; j++) {
+			insn[i++] = BPF_LD_ABS(BPF_B, 0);
+			insn[i] = BPF_JMP32_IMM(BPF_JNE, BPF_REG_0, 0x34,
+						len - i - 3);
+			i++;
+			insn[i++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
+			/* statement terminated with ';' (was a stray ',') */
+			insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+						 BPF_FUNC_skb_vlan_pop);
+			insn[i] = BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0,
+					      len - i - 3);
+			i++;
+		}
+	}
+
+	/* pad up to the tail; the success path then skips the error label */
+	for (; i < len - 3; i++)
+		insn[i] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0xbef);
+	insn[len - 3] = BPF_JMP_A(1);
+	/* error label */
+	insn[len - 2] = BPF_MOV32_IMM(BPF_REG_0, 0);
+	insn[len - 1] = BPF_EXIT_INSN();
+	self->prog_len = len;
+}
+
+/* Fill helper: one conditional jump that skips over a long run of ld_abs
+ * instructions. Writes self->fill_insns and self->prog_len.
+ */
+static void bpf_fill_jump_around_ld_abs(struct bpf_test *self)
+{
+	/* jump range is limited to 16 bit. every ld_abs is replaced by 6 insns,
+	 * but on arches like arm, ppc etc, there will be one BPF_ZEXT inserted
+	 * to extend the error value of the inlined ld_abs sequence which then
+	 * contains 7 insns. so, set the dividend to 7 so the testcase could
+	 * work on all arches.
+	 */
+	unsigned int len = (1 << 15) / 7;
+	struct bpf_insn *out = self->fill_insns;
+	int pos = 0;
+
+	out[pos++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	out[pos++] = BPF_LD_ABS(BPF_B, 0);
+	/* the offset is computed from the index of the jump itself */
+	out[pos] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 10, len - pos - 2);
+	for (pos++; pos < len - 1; pos++)
+		out[pos] = BPF_LD_ABS(BPF_B, 1);
+	out[pos] = BPF_EXIT_INSN();
+	self->prog_len = pos + 1;
+}
+
+/* Fill helper: XORs a series of semi-random 64-bit immediates into R0 and
+ * folds the result down to 32 bit. On entry self->retval bounds the number
+ * of generated instructions; on exit it holds the value the program is
+ * expected to return.
+ */
+static void bpf_fill_rand_ld_dw(struct bpf_test *self)
+{
+	struct bpf_insn *out = self->fill_insns;
+	uint64_t acc = 0;
+	int pos = 0;
+
+	out[pos++] = BPF_MOV32_IMM(BPF_REG_0, 0);
+	/* each round emits a two-slot ld_imm64 plus one xor */
+	for (; pos < self->retval; pos += 3) {
+		uint64_t imm = bpf_semi_rand_get();
+		struct bpf_insn ld[2] = { BPF_LD_IMM64(BPF_REG_1, imm) };
+
+		acc ^= imm;
+		out[pos] = ld[0];
+		out[pos + 1] = ld[1];
+		out[pos + 2] = BPF_ALU64_REG(BPF_XOR, BPF_REG_0, BPF_REG_1);
+	}
+	/* fold the upper half of R0 into the lower half */
+	out[pos++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_0);
+	out[pos++] = BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32);
+	out[pos++] = BPF_ALU64_REG(BPF_XOR, BPF_REG_0, BPF_REG_1);
+	out[pos] = BPF_EXIT_INSN();
+	self->prog_len = pos + 1;
+
+	/* mirror the fold on the host side to get the expected result */
+	acc ^= (acc >> 32);
+	self->retval = (uint32_t)acc;
+}
+
+#define MAX_JMP_SEQ 8192
+
+/* Fill helper: a sequence of MAX_JMP_SEQ (8k) conditional jumps followed by
+ * padding. Sets self->prog_len and expects the program to return 42.
+ */
+static void bpf_fill_scale1(struct bpf_test *self)
+{
+	struct bpf_insn *out = self->fill_insns;
+	int pos = 0, jmp;
+
+	out[pos++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	/* test to check that the long sequence of jumps is acceptable */
+	for (jmp = 1; jmp <= MAX_JMP_SEQ; jmp++) {
+		out[pos++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					  BPF_FUNC_get_prandom_u32);
+		out[pos++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0,
+					 bpf_semi_rand_get(), 2);
+		out[pos++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
+		out[pos++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+					 -8 * (jmp % 64 + 1));
+	}
+	/* is_state_visited() doesn't allocate state for pruning for every jump.
+	 * Hence multiply jmps by 4 to accommodate that heuristic
+	 */
+	for (; pos < MAX_TEST_INSNS - MAX_JMP_SEQ * 4; pos++)
+		out[pos] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
+	out[pos] = BPF_EXIT_INSN();
+	self->prog_len = pos + 1;
+	self->retval = 42;
+}
+
+/* Fill helper: same 8k jump sequence as bpf_fill_scale1(), but placed in
+ * the innermost function after FUNC_NEST nested calls (function depth 8).
+ * Sets self->prog_len and expects the program to return 42.
+ */
+static void bpf_fill_scale2(struct bpf_test *self)
+{
+	struct bpf_insn *out = self->fill_insns;
+	int pos = 0, n;
+
+#define FUNC_NEST 7
+	/* each level calls the next instruction pair and exits on return */
+	for (n = 0; n < FUNC_NEST; n++) {
+		out[pos++] = BPF_CALL_REL(1);
+		out[pos++] = BPF_EXIT_INSN();
+	}
+	out[pos++] = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
+	/* test to check that the long sequence of jumps is acceptable */
+	for (n = 1; n <= MAX_JMP_SEQ; n++) {
+		out[pos++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+					  BPF_FUNC_get_prandom_u32);
+		out[pos++] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0,
+					 bpf_semi_rand_get(), 2);
+		out[pos++] = BPF_MOV64_REG(BPF_REG_1, BPF_REG_10);
+		out[pos++] = BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6,
+					 -8 * (n % (64 - 4 * FUNC_NEST) + 1));
+	}
+	for (; pos < MAX_TEST_INSNS - MAX_JMP_SEQ * 4; pos++)
+		out[pos] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 42);
+	out[pos] = BPF_EXIT_INSN();
+	self->prog_len = pos + 1;
+	self->retval = 42;
+}
+
+/* Fill helper dispatcher: self->retval selects which scale test to build
+ * (1 or 2). Any other value produces an empty program (prog_len 0).
+ */
+static void bpf_fill_scale(struct bpf_test *self)
+{
+	switch (self->retval) {
+	case 1:
+		/* plain call: a void function must not return an expression */
+		bpf_fill_scale1(self);
+		break;
+	case 2:
+		bpf_fill_scale2(self);
+		break;
+	default:
+		self->prog_len = 0;
+		break;
+	}
+}
+
+/* BPF_SK_LOOKUP expands to 13 instructions; take that into account when computing instruction indexes for map fixups. */
+#define BPF_SK_LOOKUP(func) \
+ /* struct bpf_sock_tuple tuple = {} */ \
+ BPF_MOV64_IMM(BPF_REG_2, 0), \
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -16), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -24), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -32), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -40), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -48), \
+ /* sk = func(ctx, &tuple, sizeof tuple, 0, 0) */ \
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), \
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48), \
+ BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)), \
+ BPF_MOV64_IMM(BPF_REG_4, 0), \
+ BPF_MOV64_IMM(BPF_REG_5, 0), \
+ BPF_EMIT_CALL(BPF_FUNC_ ## func)
+
+/* BPF_DIRECT_PKT_R2 contains 7 instructions, it initializes default return
+ * value into 0 and does necessary preparation for direct packet access
+ * through r2. The allowed access range is 8 bytes.
+ */
+#define BPF_DIRECT_PKT_R2 \
+ BPF_MOV64_IMM(BPF_REG_0, 0), \
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, \
+ offsetof(struct __sk_buff, data)), \
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, \
+ offsetof(struct __sk_buff, data_end)), \
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), \
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8), \
+ BPF_JMP_REG(BPF_JLE, BPF_REG_4, BPF_REG_3, 1), \
+ BPF_EXIT_INSN()
+
+/* BPF_RAND_UEXT_R7 contains 4 instructions, it initializes R7 into a random
+ * positive u32, and zero-extend it into 64-bit.
+ */
+#define BPF_RAND_UEXT_R7 \
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, \
+ BPF_FUNC_get_prandom_u32), \
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), \
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 33), \
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_7, 33)
+
+/* BPF_RAND_SEXT_R7 contains 5 instructions, it initializes R7 into a random
+ * negative u32, and sign-extend it into 64-bit.
+ */
+#define BPF_RAND_SEXT_R7 \
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, \
+ BPF_FUNC_get_prandom_u32), \
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), \
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_7, 0x80000000), \
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_7, 32), \
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_7, 32)
+
+/* Table of all verifier test cases. The entries come from
+ * verifier/tests.h, which is included with FILL_ARRAY defined.
+ */
+static struct bpf_test tests[] = {
+#define FILL_ARRAY
+#include <verifier/tests.h>
+#undef FILL_ARRAY
+};
+
+/* Return the length of a statically defined program: one past the index
+ * of the last instruction in the MAX_INSNS-sized array whose code or imm
+ * is non-zero (at least 1).
+ */
+static int probe_filter_length(const struct bpf_insn *fp)
+{
+	int last = MAX_INSNS - 1;
+
+	/* walk back over the zero-filled tail of the array */
+	while (last > 0 && fp[last].code == 0 && fp[last].imm == 0)
+		last--;
+
+	return last + 1;
+}
+
+/* Probe whether @map_type is supported. If it is not, print a SKIP note,
+ * bump the global skip counter and return true; otherwise return false.
+ */
+static bool skip_unsupported_map(enum bpf_map_type map_type)
+{
+	if (bpf_probe_map_type(map_type, 0))
+		return false;
+
+	printf("SKIP (unsupported map type %d)\n", map_type);
+	skips++;
+	return true;
+}
+
+/* Create a map of the given type and geometry. Hash maps always get
+ * BPF_F_NO_PREALLOC in addition to @extra_flags.
+ *
+ * Returns the map fd on success. On failure returns -1 when the map type
+ * is unsupported (counted as a skip), otherwise prints an error and
+ * returns the negative result of bpf_create_map().
+ */
+static int __create_map(uint32_t type, uint32_t size_key,
+			uint32_t size_value, uint32_t max_elem,
+			uint32_t extra_flags)
+{
+	uint32_t flags = extra_flags;
+	int fd;
+
+	if (type == BPF_MAP_TYPE_HASH)
+		flags |= BPF_F_NO_PREALLOC;
+
+	fd = bpf_create_map(type, size_key, size_value, max_elem, flags);
+	if (fd >= 0)
+		return fd;
+
+	if (skip_unsupported_map(type))
+		return -1;
+
+	printf("Failed to create hash map '%s'!\n", strerror(errno));
+	return fd;
+}
+
+/* Convenience wrapper around __create_map() with no extra map flags. */
+static int create_map(uint32_t type, uint32_t size_key,
+		      uint32_t size_value, uint32_t max_elem)
+{
+	const uint32_t no_extra_flags = 0;
+
+	return __create_map(type, size_key, size_value, max_elem,
+			    no_extra_flags);
+}
+
+/* Store a fixed struct test_val (index = 7 * sizeof(int),
+ * foo[6] = 0xabcdef12) at key @index of map @fd.
+ *
+ * Does nothing when @fd is negative: the failed map creation has already
+ * been reported, and the test then fails at program load instead of
+ * aborting the whole test binary here.
+ */
+static void update_map(int fd, int index)
+{
+	struct test_val value = {
+		.index = (6 + 1) * sizeof(int),
+		.foo[6] = 0xabcdef12,
+	};
+	int err;
+
+	if (fd < 0)
+		return;
+
+	/* Keep the call outside assert() so it still runs under NDEBUG. */
+	err = bpf_map_update_elem(fd, &index, &value, 0);
+	assert(!err);
+	(void)err;
+}
+
+/* Load a two-instruction program of type @prog_type that returns @ret.
+ * Returns the result of bpf_load_program() (the program fd, or negative
+ * on failure).
+ */
+static int create_prog_dummy_simple(enum bpf_prog_type prog_type, int ret)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_0, ret),
+		BPF_EXIT_INSN(),
+	};
+	const size_t cnt = ARRAY_SIZE(insns);
+
+	return bpf_load_program(prog_type, insns, cnt, "GPL", 0, NULL, 0);
+}
+
+/* Load a program of type @prog_type that tail-calls slot @idx of the prog
+ * array @mfd and returns @ret if the tail call falls through.
+ * Returns the result of bpf_load_program().
+ */
+static int create_prog_dummy_loop(enum bpf_prog_type prog_type, int mfd,
+				  int idx, int ret)
+{
+	struct bpf_insn insns[] = {
+		BPF_MOV64_IMM(BPF_REG_3, idx),
+		BPF_LD_MAP_FD(BPF_REG_2, mfd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+			     BPF_FUNC_tail_call),
+		BPF_MOV64_IMM(BPF_REG_0, ret),
+		BPF_EXIT_INSN(),
+	};
+	const size_t cnt = ARRAY_SIZE(insns);
+
+	return bpf_load_program(prog_type, insns, cnt, "GPL", 0, NULL, 0);
+}
+
+/* Create a prog array of @max_elem slots and populate it with three dummy
+ * programs: one returning 42 at @p1key, one tail-calling slot @p2key
+ * (falling back to 41) at @p2key, and one returning 24 at @p3key.
+ *
+ * Returns the prog array fd, or -1 on failure (a skip is counted when the
+ * map type is unsupported). The dummy program fds are always closed
+ * before returning.
+ */
+static int create_prog_array(enum bpf_prog_type prog_type, uint32_t max_elem,
+			     int p1key, int p2key, int p3key)
+{
+	int mfd, p1fd, p2fd, p3fd;
+	bool ok;
+
+	mfd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, sizeof(int),
+			     sizeof(int), max_elem, 0);
+	if (mfd < 0) {
+		if (skip_unsupported_map(BPF_MAP_TYPE_PROG_ARRAY))
+			return -1;
+		printf("Failed to create prog array '%s'!\n", strerror(errno));
+		return -1;
+	}
+
+	p1fd = create_prog_dummy_simple(prog_type, 42);
+	p2fd = create_prog_dummy_loop(prog_type, mfd, p2key, 41);
+	p3fd = create_prog_dummy_simple(prog_type, 24);
+
+	/* && short-circuits, so the updates stop at the first failure */
+	ok = p1fd >= 0 && p2fd >= 0 && p3fd >= 0 &&
+	     bpf_map_update_elem(mfd, &p1key, &p1fd, BPF_ANY) >= 0 &&
+	     bpf_map_update_elem(mfd, &p2key, &p2fd, BPF_ANY) >= 0 &&
+	     bpf_map_update_elem(mfd, &p3key, &p3fd, BPF_ANY) >= 0;
+	if (!ok) {
+		close(mfd);
+		mfd = -1;
+	}
+
+	/* only close descriptors that were actually opened */
+	if (p3fd >= 0)
+		close(p3fd);
+	if (p2fd >= 0)
+		close(p2fd);
+	if (p1fd >= 0)
+		close(p1fd);
+	return mfd;
+}
+
+/* Create a one-slot array-of-maps whose inner map template is a one-entry
+ * int array.
+ *
+ * Returns the outer map fd, or a negative value on failure (a skip is
+ * counted when a map type is unsupported). The inner map fd is closed on
+ * every path once the outer map creation has been attempted.
+ */
+static int create_map_in_map(void)
+{
+	int inner_map_fd, outer_map_fd;
+
+	inner_map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+				      sizeof(int), 1, 0);
+	if (inner_map_fd < 0) {
+		if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY))
+			return -1;
+		printf("Failed to create array '%s'!\n", strerror(errno));
+		return inner_map_fd;
+	}
+
+	outer_map_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, NULL,
+					     sizeof(int), inner_map_fd, 1, 0);
+	if (outer_map_fd < 0) {
+		/* fall through to the close() below so the inner map is not
+		 * leaked on the skip path
+		 */
+		if (skip_unsupported_map(BPF_MAP_TYPE_ARRAY_OF_MAPS))
+			outer_map_fd = -1;
+		else
+			printf("Failed to create array of maps '%s'!\n",
+			       strerror(errno));
+	}
+
+	close(inner_map_fd);
+
+	return outer_map_fd;
+}
+
+/* Create a cgroup storage map (per-cpu variant when @percpu is true) with
+ * TEST_DATA_LEN-byte values.
+ *
+ * Returns the map fd; on failure returns -1 when the type is unsupported
+ * (counted as a skip), otherwise prints an error and returns the negative
+ * result of bpf_create_map().
+ */
+static int create_cgroup_storage(bool percpu)
+{
+	enum bpf_map_type type = BPF_MAP_TYPE_CGROUP_STORAGE;
+	int fd;
+
+	if (percpu)
+		type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE;
+
+	fd = bpf_create_map(type, sizeof(struct bpf_cgroup_storage_key),
+			    TEST_DATA_LEN, 0, 0);
+	if (fd >= 0)
+		return fd;
+
+	if (skip_unsupported_map(type))
+		return -1;
+
+	printf("Failed to create cgroup storage '%s'!\n", strerror(errno));
+	return fd;
+}
+
+/* struct bpf_spin_lock {
+ * int val;
+ * };
+ * struct val {
+ * int cnt;
+ * struct bpf_spin_lock l;
+ * };
+ */
+/* String section for the types below. Byte offsets into it:
+ * 1 = "bpf_spin_lock", 15 = "val", 19 = "cnt", 23 = "l".
+ */
+static const char btf_str_sec[] = "\0bpf_spin_lock\0val\0cnt\0l";
+/* Raw BTF type section describing the C types shown above; the [n] notes
+ * give the type id of each entry. Used by load_btf().
+ */
+static __u32 btf_raw_types[] = {
+ /* int */
+ BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */
+ /* struct bpf_spin_lock */ /* [2] */
+ BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 1), 4),
+ BTF_MEMBER_ENC(15, 1, 0), /* int val; */
+ /* struct val */ /* [3] */
+ BTF_TYPE_ENC(15, BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 2), 8),
+ BTF_MEMBER_ENC(19, 1, 0), /* int cnt; */
+ BTF_MEMBER_ENC(23, 2, 32),/* struct bpf_spin_lock l; */
+};
+
+/* Assemble a raw BTF blob (header + btf_raw_types + btf_str_sec) and load
+ * it into the kernel.
+ *
+ * Returns the BTF fd on success, -1 on allocation or load failure.
+ */
+static int load_btf(void)
+{
+	struct btf_header hdr = {
+		.magic = BTF_MAGIC,
+		.version = BTF_VERSION,
+		.hdr_len = sizeof(struct btf_header),
+		.type_len = sizeof(btf_raw_types),
+		.str_off = sizeof(btf_raw_types),
+		.str_len = sizeof(btf_str_sec),
+	};
+	/* char pointers: arithmetic on void * is a GNU extension */
+	char *raw_btf, *ptr;
+	int btf_fd;
+
+	raw_btf = malloc(sizeof(hdr) + sizeof(btf_raw_types) +
+			 sizeof(btf_str_sec));
+	if (!raw_btf)
+		return -1;
+
+	ptr = raw_btf;
+	memcpy(ptr, &hdr, sizeof(hdr));
+	ptr += sizeof(hdr);
+	memcpy(ptr, btf_raw_types, hdr.type_len);
+	ptr += hdr.type_len;
+	memcpy(ptr, btf_str_sec, hdr.str_len);
+	ptr += hdr.str_len;
+
+	btf_fd = bpf_load_btf(raw_btf, ptr - raw_btf, 0, 0, 0);
+	free(raw_btf);
+	if (btf_fd < 0)
+		return -1;
+	return btf_fd;
+}
+
+/* Create a one-entry array map whose 8-byte value is described by BTF
+ * type [3] (struct val, which embeds a struct bpf_spin_lock).
+ *
+ * Returns the map fd, -1 if the BTF could not be loaded, or the negative
+ * result of bpf_create_map_xattr() on failure.
+ */
+static int create_map_spin_lock(void)
+{
+	struct bpf_create_map_attr attr = {
+		.name = "test_map",
+		.map_type = BPF_MAP_TYPE_ARRAY,
+		.key_size = 4,
+		.value_size = 8,
+		.max_entries = 1,
+		.btf_key_type_id = 1,
+		.btf_value_type_id = 3,
+	};
+	int fd, btf_fd;
+
+	btf_fd = load_btf();
+	if (btf_fd < 0)
+		return -1;
+	attr.btf_fd = btf_fd;
+	fd = bpf_create_map_xattr(&attr);
+	/* the BTF fd is only needed for map creation; close it here, as
+	 * create_sk_storage_map() does, so it is not leaked per test
+	 */
+	close(btf_fd);
+	if (fd < 0)
+		printf("Failed to create map with spin_lock\n");
+	return fd;
+}
+
+/* Create an sk_storage map whose 8-byte value is described by BTF type
+ * [3]. Returns the map fd, -1 if the BTF could not be loaded, or the
+ * negative result of bpf_create_map_xattr() on failure.
+ */
+static int create_sk_storage_map(void)
+{
+	struct bpf_create_map_attr attr;
+	int map_fd;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.name = "test_map";
+	attr.map_type = BPF_MAP_TYPE_SK_STORAGE;
+	attr.key_size = 4;
+	attr.value_size = 8;
+	attr.max_entries = 0;
+	attr.map_flags = BPF_F_NO_PREALLOC;
+	attr.btf_key_type_id = 1;
+	attr.btf_value_type_id = 3;
+
+	attr.btf_fd = load_btf();
+	if ((int)attr.btf_fd < 0)
+		return -1;
+
+	map_fd = bpf_create_map_xattr(&attr);
+	/* the BTF fd is not needed once creation has been attempted */
+	close(attr.btf_fd);
+	if (map_fd < 0)
+		printf("Failed to create sk_storage_map\n");
+	return map_fd;
+}
+
+static char bpf_vlog[UINT_MAX >> 8];
+
+/* Store @fd in the imm field of every instruction whose index is listed in
+ * the zero-terminated array @fixup.
+ */
+static void fixup_patch_imm(struct bpf_insn *prog, const int *fixup, int fd)
+{
+	for (; *fixup; fixup++)
+		prog[*fixup].imm = fd;
+}
+
+/* Prepare @test for loading: run its fill helper (if any) and, for every
+ * non-empty fixup list, create the corresponding map or prog array, record
+ * its fd in @map_fds and patch that fd into the listed instructions of
+ * @prog. Slots of @map_fds that are not needed are left untouched.
+ */
+static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
+			  struct bpf_insn *prog, int *map_fds)
+{
+	if (test->fill_helper) {
+		test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn));
+		test->fill_helper(test);
+	}
+
+	/* Allocating HTs with 1 elem is fine here, since we only test
+	 * for verifier and not do a runtime lookup, so the only thing
+	 * that really matters is value size in this case.
+	 */
+	if (test->fixup_map_hash_8b[0]) {
+		map_fds[0] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(long long), 1);
+		fixup_patch_imm(prog, test->fixup_map_hash_8b, map_fds[0]);
+	}
+
+	if (test->fixup_map_hash_48b[0]) {
+		map_fds[1] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct test_val), 1);
+		fixup_patch_imm(prog, test->fixup_map_hash_48b, map_fds[1]);
+	}
+
+	if (test->fixup_map_hash_16b[0]) {
+		map_fds[2] = create_map(BPF_MAP_TYPE_HASH, sizeof(long long),
+					sizeof(struct other_val), 1);
+		fixup_patch_imm(prog, test->fixup_map_hash_16b, map_fds[2]);
+	}
+
+	if (test->fixup_map_array_48b[0]) {
+		map_fds[3] = create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					sizeof(struct test_val), 1);
+		update_map(map_fds[3], 0);
+		fixup_patch_imm(prog, test->fixup_map_array_48b, map_fds[3]);
+	}
+
+	if (test->fixup_prog1[0]) {
+		map_fds[4] = create_prog_array(prog_type, 4, 0, 1, 2);
+		fixup_patch_imm(prog, test->fixup_prog1, map_fds[4]);
+	}
+
+	if (test->fixup_prog2[0]) {
+		map_fds[5] = create_prog_array(prog_type, 8, 7, 1, 2);
+		fixup_patch_imm(prog, test->fixup_prog2, map_fds[5]);
+	}
+
+	if (test->fixup_map_in_map[0]) {
+		map_fds[6] = create_map_in_map();
+		fixup_patch_imm(prog, test->fixup_map_in_map, map_fds[6]);
+	}
+
+	if (test->fixup_cgroup_storage[0]) {
+		map_fds[7] = create_cgroup_storage(false);
+		fixup_patch_imm(prog, test->fixup_cgroup_storage, map_fds[7]);
+	}
+
+	if (test->fixup_percpu_cgroup_storage[0]) {
+		map_fds[8] = create_cgroup_storage(true);
+		fixup_patch_imm(prog, test->fixup_percpu_cgroup_storage,
+				map_fds[8]);
+	}
+
+	if (test->fixup_map_sockmap[0]) {
+		map_fds[9] = create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int),
+					sizeof(int), 1);
+		fixup_patch_imm(prog, test->fixup_map_sockmap, map_fds[9]);
+	}
+
+	if (test->fixup_map_sockhash[0]) {
+		map_fds[10] = create_map(BPF_MAP_TYPE_SOCKHASH, sizeof(int),
+					 sizeof(int), 1);
+		fixup_patch_imm(prog, test->fixup_map_sockhash, map_fds[10]);
+	}
+
+	if (test->fixup_map_xskmap[0]) {
+		map_fds[11] = create_map(BPF_MAP_TYPE_XSKMAP, sizeof(int),
+					 sizeof(int), 1);
+		fixup_patch_imm(prog, test->fixup_map_xskmap, map_fds[11]);
+	}
+
+	if (test->fixup_map_stacktrace[0]) {
+		map_fds[12] = create_map(BPF_MAP_TYPE_STACK_TRACE, sizeof(u32),
+					 sizeof(u64), 1);
+		fixup_patch_imm(prog, test->fixup_map_stacktrace, map_fds[12]);
+	}
+
+	if (test->fixup_map_spin_lock[0]) {
+		map_fds[13] = create_map_spin_lock();
+		fixup_patch_imm(prog, test->fixup_map_spin_lock, map_fds[13]);
+	}
+
+	if (test->fixup_map_array_ro[0]) {
+		map_fds[14] = __create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					   sizeof(struct test_val), 1,
+					   BPF_F_RDONLY_PROG);
+		update_map(map_fds[14], 0);
+		fixup_patch_imm(prog, test->fixup_map_array_ro, map_fds[14]);
+	}
+
+	if (test->fixup_map_array_wo[0]) {
+		map_fds[15] = __create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					   sizeof(struct test_val), 1,
+					   BPF_F_WRONLY_PROG);
+		update_map(map_fds[15], 0);
+		fixup_patch_imm(prog, test->fixup_map_array_wo, map_fds[15]);
+	}
+
+	if (test->fixup_map_array_small[0]) {
+		map_fds[16] = __create_map(BPF_MAP_TYPE_ARRAY, sizeof(int),
+					   1, 1, 0);
+		update_map(map_fds[16], 0);
+		fixup_patch_imm(prog, test->fixup_map_array_small, map_fds[16]);
+	}
+
+	if (test->fixup_sk_storage_map[0]) {
+		map_fds[17] = create_sk_storage_map();
+		fixup_patch_imm(prog, test->fixup_sk_storage_map, map_fds[17]);
+	}
+
+	if (test->fixup_map_event_output[0]) {
+		map_fds[18] = __create_map(BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+					   sizeof(int), sizeof(int), 1, 0);
+		fixup_patch_imm(prog, test->fixup_map_event_output, map_fds[18]);
+	}
+
+	if (test->fixup_map_reuseport_array[0]) {
+		map_fds[19] = __create_map(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+					   sizeof(u32), sizeof(u64), 1, 0);
+		fixup_patch_imm(prog, test->fixup_map_reuseport_array,
+				map_fds[19]);
+	}
+
+	if (test->fixup_map_ringbuf[0]) {
+		map_fds[20] = create_map(BPF_MAP_TYPE_RINGBUF, 0,
+					 0, 4096);
+		fixup_patch_imm(prog, test->fixup_map_ringbuf, map_fds[20]);
+	}
+}
+
+/* Overlay used by set_admin() and is_admin(), which cast a cap_t to this
+ * type to read and write the raw effective bits of the second 32-bit
+ * capability word.
+ * NOTE(review): presumably mirrors the private layout behind libcap's
+ * cap_t; confirm against the libcap version in use.
+ */
+struct libcap {
+ struct __user_cap_header_struct hdr;
+ struct __user_cap_data_struct data[2];
+};
+
+/* Adjust the effective capability set of the process. CAP_SYS_ADMIN is
+ * always cleared; CAP_NET_ADMIN, CAP_PERFMON and CAP_BPF are set when
+ * @admin is true and cleared otherwise.
+ *
+ * Returns 0 on success, -1 on failure (after printing the failing step).
+ */
+static int set_admin(bool admin)
+{
+	/* need CAP_BPF, CAP_NET_ADMIN, CAP_PERFMON to load progs */
+	const cap_value_t cap_sys_admin = CAP_SYS_ADMIN;
+	const cap_value_t cap_net_admin = CAP_NET_ADMIN;
+	/* bits of CAP_PERFMON (38) and CAP_BPF (39) in the second word */
+	const int high_caps = (1 << (38 - 32)) | (1 << (39 - 32));
+	struct libcap *raw;
+	int ret = -1;
+	cap_t caps;
+
+	caps = cap_get_proc();
+	if (!caps) {
+		perror("cap_get_proc");
+		return -1;
+	}
+	raw = (struct libcap *)caps;
+
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_sys_admin, CAP_CLEAR)) {
+		perror("cap_set_flag clear admin");
+		goto out;
+	}
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_net_admin,
+			 admin ? CAP_SET : CAP_CLEAR)) {
+		perror("cap_set_flag set_or_clear net");
+		goto out;
+	}
+
+	/* libcap is likely old and simply ignores CAP_BPF and CAP_PERFMON,
+	 * so update effective bits manually
+	 */
+	if (admin)
+		raw->data[1].effective |= high_caps;
+	else
+		raw->data[1].effective &= ~high_caps;
+
+	if (cap_set_proc(caps)) {
+		perror("cap_set_proc");
+		goto out;
+	}
+	ret = 0;
+out:
+	if (cap_free(caps))
+		perror("cap_free");
+	return ret;
+}
+
+/* Run the loaded program @fd_prog once on @data and compare its return
+ * value with @expected_val. For unprivileged tests the admin capabilities
+ * are raised only around the test run.
+ *
+ * Returns 0 when the run matched or was not possible (ENOTSUPP/EPERM),
+ * 1 on a return value mismatch, and the bpf_prog_test_run() error for any
+ * other failure. A mismatch is ignored when @expected_val is
+ * POINTER_VALUE.
+ */
+static int do_prog_test_run(int fd_prog, bool unpriv, uint32_t expected_val,
+			    void *data, size_t size_data)
+{
+	__u8 tmp[TEST_DATA_LEN << 2];
+	__u32 size_tmp = sizeof(tmp);
+	uint32_t retval;
+	int err, saved_errno;
+
+	if (unpriv)
+		set_admin(true);
+	err = bpf_prog_test_run(fd_prog, 1, data, size_data,
+				tmp, &size_tmp, &retval, NULL);
+	/* set_admin() makes library calls that may overwrite errno, so
+	 * capture the test-run error before dropping privileges again
+	 */
+	saved_errno = errno;
+	if (unpriv)
+		set_admin(false);
+	if (err && saved_errno != 524/*ENOTSUPP*/ && saved_errno != EPERM) {
+		printf("Unexpected bpf_prog_test_run error ");
+		return err;
+	}
+	if (!err && retval != expected_val &&
+	    expected_val != POINTER_VALUE) {
+		printf("FAIL retval %d != %d ", retval, expected_val);
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Check that every tab-separated fragment of @exp occurs in @log, in
+ * order, each one searched for after the end of the previous match.
+ *
+ * Returns true when all fragments were found. Prints a diagnostic and
+ * returns false when a fragment is missing, empty, or does not fit the
+ * 80-byte scratch buffer.
+ */
+static bool cmp_str_seq(const char *log, const char *exp)
+{
+	char needle[80];
+
+	for (;;) {
+		const char *sep = strchr(exp, '\t');
+		const char *hit;
+		size_t n;
+
+		/* the last fragment runs to the end of the string */
+		if (!sep)
+			sep = exp + strlen(exp);
+
+		n = sep - exp;
+		if (n == 0 || n >= sizeof(needle)) {
+			printf("FAIL\nTestcase bug\n");
+			return false;
+		}
+		memcpy(needle, exp, n);
+		needle[n] = '\0';
+
+		hit = strstr(log, needle);
+		if (!hit) {
+			printf("FAIL\nUnexpected verifier log in successful load!\n"
+			       "EXP: %s\nRES:\n", needle);
+			return false;
+		}
+		log = hit + n;
+
+		if (!*sep)
+			return true;
+		exp = sep + 1;
+	}
+}
+
+/* Load and, when possible, run one test case either unprivileged
+ * (@unpriv true) or privileged, compare the outcome with the test's
+ * expectations, print the result and bump *@passes or *@errors.
+ *
+ * Every exit path goes through close_fds, so the generated instruction
+ * buffer, the program fd and all map fds are released even when the test
+ * is skipped or fails early.
+ */
+static void do_test_single(struct bpf_test *test, bool unpriv,
+			   int *passes, int *errors)
+{
+	int fd_prog = -1, expected_ret, alignment_prevented_execution;
+	int prog_len, prog_type = test->prog_type;
+	struct bpf_insn *prog = test->insns;
+	struct bpf_load_program_attr attr;
+	int run_errs, run_successes;
+	int map_fds[MAX_NR_MAPS];
+	const char *expected_err;
+	int fixup_skips;
+	__u32 pflags;
+	int i, err;
+
+	for (i = 0; i < MAX_NR_MAPS; i++)
+		map_fds[i] = -1;
+
+	if (!prog_type)
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	fixup_skips = skips;
+	do_test_fixup(test, prog_type, prog, map_fds);
+	if (test->fill_insns) {
+		prog = test->fill_insns;
+		prog_len = test->prog_len;
+	} else {
+		prog_len = probe_filter_length(prog);
+	}
+	/* If there were some map skips during fixup due to missing bpf
+	 * features, skip this test. Leave through close_fds so that maps
+	 * created before the skip and the fill buffer are not leaked.
+	 */
+	if (fixup_skips != skips)
+		goto close_fds;
+
+	pflags = BPF_F_TEST_RND_HI32;
+	if (test->flags & F_LOAD_WITH_STRICT_ALIGNMENT)
+		pflags |= BPF_F_STRICT_ALIGNMENT;
+	if (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS)
+		pflags |= BPF_F_ANY_ALIGNMENT;
+	if (test->flags & ~3)
+		pflags |= test->flags;
+
+	expected_ret = unpriv && test->result_unpriv != UNDEF ?
+		       test->result_unpriv : test->result;
+	expected_err = unpriv && test->errstr_unpriv ?
+		       test->errstr_unpriv : test->errstr;
+	memset(&attr, 0, sizeof(attr));
+	attr.prog_type = prog_type;
+	attr.expected_attach_type = test->expected_attach_type;
+	attr.insns = prog;
+	attr.insns_cnt = prog_len;
+	attr.license = "GPL";
+	if (verbose)
+		attr.log_level = 1;
+	else if (expected_ret == VERBOSE_ACCEPT)
+		attr.log_level = 2;
+	else
+		attr.log_level = 4;
+	attr.prog_flags = pflags;
+
+	if (prog_type == BPF_PROG_TYPE_TRACING && test->kfunc) {
+		/* Check the lookup result in a signed local: libbpf declares
+		 * the attr field as unsigned, so a "< 0" test on the field
+		 * itself can never detect a failed lookup.
+		 */
+		int attach_btf_id;
+
+		attach_btf_id = libbpf_find_vmlinux_btf_id(test->kfunc,
+						attr.expected_attach_type);
+		if (attach_btf_id < 0) {
+			printf("FAIL\nFailed to find BTF ID for '%s'!\n",
+			       test->kfunc);
+			(*errors)++;
+			goto close_fds;
+		}
+		attr.attach_btf_id = attach_btf_id;
+	}
+
+	fd_prog = bpf_load_program_xattr(&attr, bpf_vlog, sizeof(bpf_vlog));
+
+	/* BPF_PROG_TYPE_TRACING requires more setup and
+	 * bpf_probe_prog_type won't give correct answer
+	 */
+	if (fd_prog < 0 && prog_type != BPF_PROG_TYPE_TRACING &&
+	    !bpf_probe_prog_type(prog_type, 0)) {
+		printf("SKIP (unsupported program type %d)\n", prog_type);
+		skips++;
+		goto close_fds;
+	}
+
+	alignment_prevented_execution = 0;
+
+	if (expected_ret == ACCEPT || expected_ret == VERBOSE_ACCEPT) {
+		if (fd_prog < 0) {
+			printf("FAIL\nFailed to load prog '%s'!\n",
+			       strerror(errno));
+			goto fail_log;
+		}
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+		if (fd_prog >= 0 &&
+		    (test->flags & F_NEEDS_EFFICIENT_UNALIGNED_ACCESS))
+			alignment_prevented_execution = 1;
+#endif
+		if (expected_ret == VERBOSE_ACCEPT &&
+		    !cmp_str_seq(bpf_vlog, expected_err))
+			goto fail_log;
+	} else {
+		if (fd_prog >= 0) {
+			printf("FAIL\nUnexpected success to load!\n");
+			goto fail_log;
+		}
+		if (!expected_err || !strstr(bpf_vlog, expected_err)) {
+			printf("FAIL\nUnexpected error message!\n\tEXP: %s\n\tRES: %s\n",
+			       expected_err, bpf_vlog);
+			goto fail_log;
+		}
+	}
+
+	if (!unpriv && test->insn_processed) {
+		uint32_t insn_processed;
+		char *proc;
+
+		proc = strstr(bpf_vlog, "processed ");
+		/* the log may lack the statistics line; do not dereference
+		 * a NULL match
+		 */
+		if (!proc) {
+			printf("FAIL\nNo insn_processed count in verifier log\n");
+			goto fail_log;
+		}
+		insn_processed = atoi(proc + 10);
+		if (test->insn_processed != insn_processed) {
+			printf("FAIL\nUnexpected insn_processed %u vs %u\n",
+			       insn_processed, test->insn_processed);
+			goto fail_log;
+		}
+	}
+
+	if (verbose)
+		printf(", verifier log:\n%s", bpf_vlog);
+
+	run_errs = 0;
+	run_successes = 0;
+	if (!alignment_prevented_execution && fd_prog >= 0 && test->runs >= 0) {
+		uint32_t expected_val;
+
+		/* runs == 0 means "run once" */
+		if (!test->runs)
+			test->runs = 1;
+
+		for (i = 0; i < test->runs; i++) {
+			if (unpriv && test->retvals[i].retval_unpriv)
+				expected_val = test->retvals[i].retval_unpriv;
+			else
+				expected_val = test->retvals[i].retval;
+
+			err = do_prog_test_run(fd_prog, unpriv, expected_val,
+					       test->retvals[i].data,
+					       sizeof(test->retvals[i].data));
+			if (err) {
+				printf("(run %d/%d) ", i + 1, test->runs);
+				run_errs++;
+			} else {
+				run_successes++;
+			}
+		}
+	}
+
+	if (!run_errs) {
+		(*passes)++;
+		if (run_successes > 1)
+			printf("%d cases ", run_successes);
+		printf("OK");
+		if (alignment_prevented_execution)
+			printf(" (NOTE: not executed due to unknown alignment)");
+		printf("\n");
+	} else {
+		printf("\n");
+		goto fail_log;
+	}
+close_fds:
+	if (test->fill_insns) {
+		free(test->fill_insns);
+		/* do not keep a dangling pointer in the test table */
+		test->fill_insns = NULL;
+	}
+	if (fd_prog >= 0)
+		close(fd_prog);
+	for (i = 0; i < MAX_NR_MAPS; i++)
+		if (map_fds[i] >= 0)
+			close(map_fds[i]);
+	sched_yield();
+	return;
+fail_log:
+	(*errors)++;
+	printf("%s", bpf_vlog);
+	goto close_fds;
+}
+
+/* Report whether the process currently has CAP_BPF, CAP_PERFMON and
+ * CAP_NET_ADMIN in its effective set. Returns false when the capability
+ * state cannot be read.
+ */
+static bool is_admin(void)
+{
+	/* bits of CAP_PERFMON (38) and CAP_BPF (39) in the second word */
+	const int perfmon_bit = 1 << (38/* CAP_PERFMON */ - 32);
+	const int bpf_bit = 1 << (39/* CAP_BPF */ - 32);
+	cap_flag_value_t net_priv = CAP_CLEAR;
+	bool has_perfmon, has_bpf;
+	struct libcap *raw;
+	cap_t caps;
+
+#ifdef CAP_IS_SUPPORTED
+	if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
+		perror("cap_get_flag");
+		return false;
+	}
+#endif
+	caps = cap_get_proc();
+	if (!caps) {
+		perror("cap_get_proc");
+		return false;
+	}
+
+	/* CAP_BPF and CAP_PERFMON are read straight from the raw bits */
+	raw = (struct libcap *)caps;
+	has_bpf = (raw->data[1].effective & bpf_bit) != 0;
+	has_perfmon = (raw->data[1].effective & perfmon_bit) != 0;
+
+	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+		perror("cap_get_flag NET");
+	if (cap_free(caps))
+		perror("cap_free");
+
+	if (!has_bpf || !has_perfmon)
+		return false;
+	return net_priv == CAP_SET;
+}
+
+/* Read the kernel/unprivileged_bpf_disabled sysctl and set the global
+ * unpriv_disabled flag when its first digit is non-zero, or when the
+ * sysctl file cannot be opened.
+ */
+static void get_unpriv_disabled(void)
+{
+	char buf[2];
+	FILE *fp;
+
+	fp = fopen("/proc/sys/"UNPRIV_SYSCTL, "r");
+	if (!fp) {
+		perror("fopen /proc/sys/"UNPRIV_SYSCTL);
+		unpriv_disabled = true;
+		return;
+	}
+	/* buf holds one character plus the terminating NUL */
+	if (fgets(buf, sizeof(buf), fp) == buf && atoi(buf))
+		unpriv_disabled = true;
+	fclose(fp);
+}
+
+/* Tell whether @test is also run unprivileged: that is the case for an
+ * unset program type, socket filters and cgroup skb programs.
+ */
+static bool test_as_unpriv(struct bpf_test *test)
+{
+	const enum bpf_prog_type type = test->prog_type;
+
+	if (!type)
+		return true;
+	if (type == BPF_PROG_TYPE_SOCKET_FILTER)
+		return true;
+	return type == BPF_PROG_TYPE_CGROUP_SKB;
+}
+
+/* Run tests [@from, @to). Each test gets an unprivileged pass ("/u") when
+ * its program type allows it and a privileged pass ("/p") unless the
+ * whole run is unprivileged (@unpriv). Prints a summary and returns
+ * EXIT_FAILURE if any test failed, EXIT_SUCCESS otherwise.
+ */
+static int do_test(bool unpriv, unsigned int from, unsigned int to)
+{
+	int i, passes = 0, errors = 0;
+
+	for (i = from; i < to; i++) {
+		struct bpf_test *test = &tests[i];
+
+		/* Program types that are not supported by non-root we
+		 * skip right away.
+		 */
+		if (test_as_unpriv(test)) {
+			if (unpriv_disabled) {
+				printf("#%d/u %s SKIP\n", i, test->descr);
+				skips++;
+			} else {
+				/* drop privileges only for this pass */
+				if (!unpriv)
+					set_admin(false);
+				printf("#%d/u %s ", i, test->descr);
+				do_test_single(test, true, &passes, &errors);
+				if (!unpriv)
+					set_admin(true);
+			}
+		}
+
+		if (!unpriv) {
+			printf("#%d/p %s ", i, test->descr);
+			do_test_single(test, false, &passes, &errors);
+		} else {
+			printf("#%d/p %s SKIP\n", i, test->descr);
+			skips++;
+		}
+	}
+
+	printf("Summary: %d PASSED, %d SKIPPED, %d FAILED\n", passes,
+	       skips, errors);
+	if (errors)
+		return EXIT_FAILURE;
+	return EXIT_SUCCESS;
+}
+
+/* Usage: test_verifier [-v] [<test> | <first> <last>]
+ *
+ * With no numbers all tests run; one number selects a single test, two
+ * numbers select an inclusive range. Out-of-range numbers are ignored and
+ * all tests run. -v enables verbose verifier logs.
+ */
+int main(int argc, char **argv)
+{
+	unsigned int from = 0, to = ARRAY_SIZE(tests);
+	bool unpriv = !is_admin();
+	int arg = 1;
+
+	if (argc > 1 && !strcmp(argv[1], "-v")) {
+		verbose = true;
+		arg = 2;
+		argc--;
+	}
+
+	switch (argc) {
+	case 3: {
+		unsigned int first = atoi(argv[arg]);
+		unsigned int last = atoi(argv[arg + 1]);
+
+		if (first < to && last < to) {
+			from = first;
+			to = last + 1;
+		}
+		break;
+	}
+	case 2: {
+		unsigned int only = atoi(argv[arg]);
+
+		if (only < to) {
+			from = only;
+			to = only + 1;
+		}
+		break;
+	}
+	default:
+		break;
+	}
+
+	get_unpriv_disabled();
+	if (unpriv && unpriv_disabled) {
+		printf("Cannot run as unprivileged user with sysctl %s.\n",
+		       UNPRIV_SYSCTL);
+		return EXIT_FAILURE;
+	}
+
+	bpf_semi_rand_init();
+	return do_test(unpriv, from, to);
+}
diff --git a/tools/testing/selftests/bpf/test_verifier_log.c b/tools/testing/selftests/bpf/test_verifier_log.c
new file mode 100644
index 000000000..8d6918c3b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_verifier_log.c
@@ -0,0 +1,174 @@
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
+
+#include <bpf/bpf.h>
+
+#include "bpf_rlimit.h"
+
+#define LOG_SIZE (1 << 20)
+
+#define err(str...) printf("ERROR: " str)
+
+/* Program handed to BPF_PROG_LOAD by load(): sixteen identical moves of 0
+ * into R0, a call to map_lookup_elem and an exit. check_ret() treats a
+ * successful load as a test failure, i.e. this sample is expected to be
+ * rejected so that a verifier log is produced.
+ */
+static const struct bpf_insn code_sample[] = {
+ /* We need a few instructions to pass the min log length */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+};
+
+/* Widen a pointer to the __u64 representation used by union bpf_attr fields. */
+static inline __u64 ptr_to_u64(const void *ptr)
+{
+	unsigned long addr = (unsigned long) ptr;
+
+	return (__u64) addr;
+}
+
+/* Issue a raw BPF_PROG_LOAD of code_sample as a socket filter.
+ *
+ * @log:       verifier log buffer (may be NULL)
+ * @log_len:   size reported to the kernel for @log
+ * @log_level: verifier log level
+ *
+ * Returns the syscall result unchanged (errno is set by syscall()).
+ */
+static int load(char *log, size_t log_len, int log_level)
+{
+	const __u32 insn_cnt = sizeof(code_sample) / sizeof(struct bpf_insn);
+	union bpf_attr attr;
+
+	/* Clear the whole union so unused fields are zero */
+	memset(&attr, 0, sizeof(attr));
+
+	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	attr.insns = ptr_to_u64(code_sample);
+	attr.insn_cnt = insn_cnt;
+	attr.license = ptr_to_u64("GPL");
+
+	attr.log_level = log_level;
+	attr.log_size = log_len;
+	attr.log_buf = ptr_to_u64(log);
+
+	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+/* Validate the outcome of load(): the only acceptable result is a negative
+ * return with errno equal to @exp_errno. Anything else terminates the test
+ * with exit(1); an unexpectedly returned fd is closed first.
+ */
+static void check_ret(int ret, int exp_errno)
+{
+	if (ret > 0) {
+		close(ret);
+		err("broken sample loaded successfully!?\n");
+		exit(1);
+	}
+
+	/* ret <= 0 from here on */
+	if (ret < 0 && errno == exp_errno)
+		return;
+
+	err("Program load returned: ret:%d/errno:%d, expected ret:%d/errno:%d\n",
+	    ret, errno, -1, exp_errno);
+	exit(1);
+}
+
+/* Require every one of the @len bytes at @buf to still hold the value 1;
+ * print @msg and exit(1) as soon as a different byte is seen.
+ */
+static void check_ones(const char *buf, size_t len, const char *msg)
+{
+	size_t pos;
+
+	for (pos = 0; pos < len; pos++) {
+		if (buf[pos] == 1)
+			continue;
+		err("%s", msg);
+		exit(1);
+	}
+}
+
+/* Load the sample with a usable log buffer and validate what was written.
+ *
+ * @log:       buffer handed to the kernel
+ * @buf_len:   real size of @log (pre-filled with 1s over this length)
+ * @log_len:   size reported to the kernel
+ * @exp_len:   expected strlen() of the log, 0 to skip the length check
+ * @exp_errno: errno the load must fail with
+ * @full_log:  reference contents @log is compared against
+ *
+ * Any mismatch prints an error and terminates with exit(1).
+ */
+static void test_log_good(char *log, size_t buf_len, size_t log_len,
+			  size_t exp_len, int exp_errno, const char *full_log)
+{
+	size_t len;
+	int ret;
+
+	/* Poison the buffer so untouched bytes are recognisable afterwards */
+	memset(log, 1, buf_len);
+
+	ret = load(log, log_len, 1);
+	check_ret(ret, exp_errno);
+
+	len = strnlen(log, buf_len);
+	if (len == buf_len) {
+		err("verifier did not NULL terminate the log\n");
+		exit(1);
+	}
+	if (exp_len && len != exp_len) {
+		/* size_t is unsigned: %zu, not %zd */
+		err("incorrect log length expected:%zu have:%zu\n",
+		    exp_len, len);
+		exit(1);
+	}
+
+	/* A poison byte inside the string means part of it was never written */
+	if (strchr(log, 1)) {
+		err("verifier leaked a byte through\n");
+		exit(1);
+	}
+
+	check_ones(log + len + 1, buf_len - len - 1,
+		   "verifier wrote bytes past NULL termination\n");
+
+	if (memcmp(full_log, log, LOG_SIZE)) {
+		err("log did not match expected output\n");
+		exit(1);
+	}
+}
+
+/* Load with log parameters that must be refused with EINVAL and, when a
+ * buffer was supplied, confirm its LOG_SIZE bytes are still all 1s.
+ */
+static void test_log_bad(char *log, size_t log_len, int log_level)
+{
+	int ret = load(log, log_len, log_level);
+
+	check_ret(ret, EINVAL);
+	if (!log)
+		return;
+
+	check_ones(log, LOG_SIZE,
+		   "verifier touched log with bad parameters\n");
+}
+
+/* Exercise the verifier log interface: first with parameter combinations
+ * that test_log_bad() expects to fail with EINVAL, then with an oversized,
+ * an exact and a series of undersized log buffers, each compared against
+ * the reference log captured by the oversized run.
+ */
+int main(int argc, char **argv)
+{
+ char full_log[LOG_SIZE];
+ char log[LOG_SIZE];
+ size_t want_len;
+ int i;
+
+ /* test_log_bad() checks that the buffer still holds only 1s afterwards */
+ memset(log, 1, LOG_SIZE);
+
+ /* Test incorrect attr */
+ printf("Test log_level 0...\n");
+ test_log_bad(log, LOG_SIZE, 0);
+
+ printf("Test log_size < 128...\n");
+ test_log_bad(log, 15, 1);
+
+ printf("Test log_buff = NULL...\n");
+ test_log_bad(NULL, LOG_SIZE, 1);
+
+ /* Test with log big enough */
+ /* full_log is both the target and the reference here, so this run
+ * captures the complete log that later runs are compared against.
+ */
+ printf("Test oversized buffer...\n");
+ test_log_good(full_log, LOG_SIZE, LOG_SIZE, 0, EACCES, full_log);
+
+ want_len = strlen(full_log);
+
+ printf("Test exact buffer...\n");
+ test_log_good(log, LOG_SIZE, want_len + 2, want_len, EACCES, full_log);
+
+ printf("Test undersized buffers...\n");
+ for (i = 0; i < 64; i++) {
+ /* Shorten the reference by moving its terminator to offset
+ * want_len - i and restoring the 1-fill right behind it, which
+ * is what test_log_good() expects of a truncated log.
+ */
+ full_log[want_len - i + 1] = 1;
+ full_log[want_len - i] = 0;
+
+ test_log_good(log, LOG_SIZE, want_len + 1 - i, want_len - i,
+ ENOSPC, full_log);
+ }
+
+ printf("test_verifier_log: OK\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/test_xdp_meta.sh b/tools/testing/selftests/bpf/test_xdp_meta.sh
new file mode 100755
index 000000000..637fcf4fe
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_meta.sh
@@ -0,0 +1,52 @@
+#!/bin/sh
+
+cleanup()
+{
+ if [ "$?" = "0" ]; then
+ echo "selftests: test_xdp_meta [PASS]";
+ else
+ echo "selftests: test_xdp_meta [FAILED]";
+ fi
+
+ set +e
+ ip link del veth1 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+}
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Probe for XDP support in ip before creating anything; without it the test
+# is reported as skipped rather than passed.
+if ! ip link set dev lo xdp off > /dev/null 2>&1; then
+	echo "selftests: [SKIP] Could not run test without the ip xdp support"
+	exit $ksft_skip
+fi
+set -e
+
+ip netns add ns1
+ip netns add ns2
+
+# 0 is EXIT; 2, 3 and 6 are INT, QUIT and ABRT. KILL (9) cannot be caught by
+# a shell trap, so it is not listed.
+trap cleanup 0 2 3 6
+
+ip link add veth1 type veth peer name veth2
+
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+
+ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth1
+ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth2
+
+ip netns exec ns1 tc qdisc add dev veth1 clsact
+ip netns exec ns2 tc qdisc add dev veth2 clsact
+
+# Section "t" goes on tc ingress, section "x" is attached as the XDP program
+ip netns exec ns1 tc filter add dev veth1 ingress bpf da obj test_xdp_meta.o sec t
+ip netns exec ns2 tc filter add dev veth2 ingress bpf da obj test_xdp_meta.o sec t
+
+ip netns exec ns1 ip link set dev veth1 xdp obj test_xdp_meta.o sec x
+ip netns exec ns2 ip link set dev veth2 xdp obj test_xdp_meta.o sec x
+
+ip netns exec ns1 ip link set dev veth1 up
+ip netns exec ns2 ip link set dev veth2 up
+
+# With set -e a failing ping ends the script; cleanup then reports FAILED
+ip netns exec ns1 ping -c 1 10.1.1.22
+ip netns exec ns2 ping -c 1 10.1.1.11
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh
new file mode 100755
index 000000000..c03385088
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Create 2 namespaces with two veth peers, and
+# forward packets in-between using generic XDP
+#
+# NS1(veth11) NS2(veth22)
+# | |
+# | |
+# (veth1, ------ (veth2,
+# id:111) id:222)
+# | xdp forwarding |
+# ------------------
+
+ret=0
+
+setup()
+{
+
+ local xdpmode=$1
+
+ ip netns add ns1
+ ip netns add ns2
+
+ ip link add veth1 index 111 type veth peer name veth11 netns ns1
+ ip link add veth2 index 222 type veth peer name veth22 netns ns2
+
+ ip link set veth1 up
+ ip link set veth2 up
+ ip -n ns1 link set dev veth11 up
+ ip -n ns2 link set dev veth22 up
+
+ ip -n ns1 addr add 10.1.1.11/24 dev veth11
+ ip -n ns2 addr add 10.1.1.22/24 dev veth22
+}
+
+cleanup()
+{
+ ip link del veth1 2> /dev/null
+ ip link del veth2 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+}
+
+test_xdp_redirect()
+{
+ local xdpmode=$1
+
+ setup
+
+ ip link set dev veth1 $xdpmode off &> /dev/null
+ if [ $? -ne 0 ];then
+ echo "selftests: test_xdp_redirect $xdpmode [SKIP]"
+ return 0
+ fi
+
+ ip -n ns1 link set veth11 $xdpmode obj xdp_dummy.o sec xdp_dummy &> /dev/null
+ ip -n ns2 link set veth22 $xdpmode obj xdp_dummy.o sec xdp_dummy &> /dev/null
+ ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null
+ ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null
+
+ if ip netns exec ns1 ping -c 1 10.1.1.22 &> /dev/null &&
+ ip netns exec ns2 ping -c 1 10.1.1.11 &> /dev/null; then
+ echo "selftests: test_xdp_redirect $xdpmode [PASS]";
+ else
+ ret=1
+ echo "selftests: test_xdp_redirect $xdpmode [FAILED]";
+ fi
+
+ cleanup
+}
+
+set -e
+trap cleanup 2 3 6 9
+
+test_xdp_redirect xdpgeneric
+test_xdp_redirect xdpdrv
+
+exit $ret
diff --git a/tools/testing/selftests/bpf/test_xdp_veth.sh b/tools/testing/selftests/bpf/test_xdp_veth.sh
new file mode 100755
index 000000000..995278e68
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_veth.sh
@@ -0,0 +1,118 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Create 3 namespaces with 3 veth peers, and
+# forward packets in-between using native XDP
+#
+# XDP_TX
+# NS1(veth11) NS2(veth22) NS3(veth33)
+# | | |
+# | | |
+# (veth1, (veth2, (veth3,
+# id:111) id:122) id:133)
+# ^ | ^ | ^ |
+# | | XDP_REDIRECT | | XDP_REDIRECT | |
+# | ------------------ ------------------ |
+# -----------------------------------------
+# XDP_REDIRECT
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+TESTNAME=xdp_veth
+BPF_FS=$(awk '$3 == "bpf" {print $2; exit}' /proc/mounts)
+BPF_DIR=$BPF_FS/test_$TESTNAME
+
+# Best-effort teardown shared by both exit handlers: removes the veth
+# devices, the namespaces and the pin directory, ignoring failures.
+_cleanup()
+{
+	set +e
+	ip link del veth1 2> /dev/null
+	ip link del veth2 2> /dev/null
+	ip link del veth3 2> /dev/null
+	ip netns del ns1 2> /dev/null
+	ip netns del ns2 2> /dev/null
+	ip netns del ns3 2> /dev/null
+	# Quoted so a bpffs mount point containing whitespace or glob
+	# characters cannot turn into several rm operands.
+	rm -rf -- "$BPF_DIR" 2> /dev/null
+}
+
+# Exit handler used while the setup is still in progress: report SKIP, tear
+# down and leave with the kselftest skip code.
+cleanup_skip()
+{
+	printf 'selftests: %s [SKIP]\n' "$TESTNAME"
+	_cleanup
+
+	exit "$ksft_skip"
+}
+
+# Exit handler for the actual test phase: print PASS or FAILED according to
+# the exit status that is current when the handler starts, then tear down.
+cleanup()
+{
+	# "$?" has to be consumed before any other command overwrites it
+	case "$?" in
+	0)
+		echo "selftests: $TESTNAME [PASS]"
+		;;
+	*)
+		echo "selftests: $TESTNAME [FAILED]"
+		;;
+	esac
+	_cleanup
+}
+
+# --- Preconditions: each missing requirement is reported as SKIP ---
+if [ $(id -u) -ne 0 ]; then
+ echo "selftests: $TESTNAME [SKIP] Need root privileges"
+ exit $ksft_skip
+fi
+
+if ! ip link set dev lo xdp off > /dev/null 2>&1; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without the ip xdp support"
+ exit $ksft_skip
+fi
+
+if [ -z "$BPF_FS" ]; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without bpffs mounted"
+ exit $ksft_skip
+fi
+
+if ! bpftool version > /dev/null 2>&1; then
+ echo "selftests: $TESTNAME [SKIP] Could not run test without bpftool"
+ exit $ksft_skip
+fi
+
+set -e
+
+# While the setup below runs, any failing command (set -e) leaves through
+# cleanup_skip, i.e. a broken setup is reported as SKIP, not as FAILED.
+trap cleanup_skip EXIT
+
+ip netns add ns1
+ip netns add ns2
+ip netns add ns3
+
+# veth1..3 stay in the current namespace with fixed ifindexes; their peers
+# veth11/22/33 are created directly inside ns1/ns2/ns3.
+ip link add veth1 index 111 type veth peer name veth11 netns ns1
+ip link add veth2 index 122 type veth peer name veth22 netns ns2
+ip link add veth3 index 133 type veth peer name veth33 netns ns3
+
+ip link set veth1 up
+ip link set veth2 up
+ip link set veth3 up
+
+# Only the two ping endpoints (ns1, ns3) get an address
+ip -n ns1 addr add 10.1.1.11/24 dev veth11
+ip -n ns3 addr add 10.1.1.33/24 dev veth33
+
+ip -n ns1 link set dev veth11 up
+ip -n ns2 link set dev veth22 up
+ip -n ns3 link set dev veth33 up
+
+mkdir $BPF_DIR
+bpftool prog loadall \
+ xdp_redirect_map.o $BPF_DIR/progs type xdp \
+ pinmaps $BPF_DIR/maps
+# tx_port entries 0/1/2 hold 122/133/111, the ifindexes given to veth2,
+# veth3 and veth1 above (key and value are each written as four bytes).
+bpftool map update pinned $BPF_DIR/maps/tx_port key 0 0 0 0 value 122 0 0 0
+bpftool map update pinned $BPF_DIR/maps/tx_port key 1 0 0 0 value 133 0 0 0
+bpftool map update pinned $BPF_DIR/maps/tx_port key 2 0 0 0 value 111 0 0 0
+ip link set dev veth1 xdp pinned $BPF_DIR/progs/redirect_map_0
+ip link set dev veth2 xdp pinned $BPF_DIR/progs/redirect_map_1
+ip link set dev veth3 xdp pinned $BPF_DIR/progs/redirect_map_2
+
+ip -n ns1 link set dev veth11 xdp obj xdp_dummy.o sec xdp_dummy
+ip -n ns2 link set dev veth22 xdp obj xdp_tx.o sec xdp
+ip -n ns3 link set dev veth33 xdp obj xdp_dummy.o sec xdp_dummy
+
+# Setup is complete: from here on the exit status decides PASS/FAILED
+trap cleanup EXIT
+
+ip netns exec ns1 ping -c 1 -W 1 10.1.1.33
+
+exit 0
diff --git a/tools/testing/selftests/bpf/test_xdp_vlan.sh b/tools/testing/selftests/bpf/test_xdp_vlan.sh
new file mode 100755
index 000000000..bb8b0da91
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -0,0 +1,228 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Author: Jesper Dangaard Brouer <hawk@kernel.org>
+
+# Wrapper scripts may preset TESTNAME; fall back to the generic name when it
+# is unset or empty
+: "${TESTNAME:=xdp_vlan}"
+
+# XDP attach mode used unless --mode overrides it
+XDP_MODE=xdpgeneric
+
+usage() {
+ echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
+ echo ""
+ echo "Usage: $0 [-vfh]"
+ echo " -v | --verbose : Verbose"
+ echo " --flush : Flush before starting (e.g. after --interactive)"
+ echo " --interactive : Keep netns setup running after test-run"
+ echo " --mode=XXX : Choose XDP mode (xdp | xdpgeneric | xdpdrv)"
+ echo ""
+}
+
+# Check an XDP mode keyword.
+# Arguments: $1 - candidate mode
+# Returns:   0 for xdpgeneric, xdpdrv or xdp; 1 for anything else
+valid_xdp_mode()
+{
+	local candidate=$1
+	local known
+
+	for known in xdpgeneric xdpdrv xdp; do
+		if [ "$candidate" = "$known" ]; then
+			return 0
+		fi
+	done
+	return 1
+}
+
+cleanup()
+{
+ local status=$?
+
+ if [ "$status" = "0" ]; then
+ echo "selftests: $TESTNAME [PASS]";
+ else
+ echo "selftests: $TESTNAME [FAILED]";
+ fi
+
+ if [ -n "$INTERACTIVE" ]; then
+ echo "Namespace setup still active explore with:"
+ echo " ip netns exec ns1 bash"
+ echo " ip netns exec ns2 bash"
+ exit $status
+ fi
+
+ set +e
+ ip link del veth1 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+}
+
+# Using external program "getopt" to get --long-options
+#
+# All short options are plain flags, so none of them carries a ':' in the
+# -o spec; a ':' after 'i' would make getopt demand an argument for -i that
+# the loop below never consumes. Only --mode takes a value.
+OPTIONS=$(getopt -o hvfi \
+	--long verbose,flush,help,interactive,debug,mode: -- "$@")
+if (( $? != 0 )); then
+	usage
+	echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
+	exit 2
+fi
+eval set -- "$OPTIONS"
+
+## --- Parse command line arguments / parameters ---
+while true; do
+	case "$1" in
+	-v | --verbose)
+		export VERBOSE=yes
+		shift
+		;;
+	-i | --interactive | --debug )
+		INTERACTIVE=yes
+		shift
+		;;
+	-f | --flush )
+		cleanup
+		shift
+		;;
+	--mode )
+		# getopt has split "--mode=X" into two words: drop the
+		# option name, take the value, then drop the value
+		shift
+		XDP_MODE=$1
+		shift
+		;;
+	-- )
+		shift
+		break
+		;;
+	-h | --help )
+		usage;
+		echo "selftests: $TESTNAME [SKIP] usage help info requested"
+		exit 0
+		;;
+	* )
+		shift
+		break
+		;;
+	esac
+done
+
+if [ "$EUID" -ne 0 ]; then
+ echo "selftests: $TESTNAME [FAILED] need root privileges"
+ exit 1
+fi
+
+valid_xdp_mode $XDP_MODE
+if [ $? -ne 0 ]; then
+ echo "selftests: $TESTNAME [FAILED] unknown XDP mode ($XDP_MODE)"
+ exit 1
+fi
+
+ip link set dev lo xdpgeneric off 2>/dev/null > /dev/null
+if [ $? -ne 0 ]; then
+ echo "selftests: $TESTNAME [SKIP] need ip xdp support"
+ exit 0
+fi
+
+# Interactive mode likely requires us to clean up netns
+if [ -n "$INTERACTIVE" ]; then
+ ip link del veth1 2> /dev/null
+ ip netns del ns1 2> /dev/null
+ ip netns del ns2 2> /dev/null
+fi
+
+# Exit on failure
+set -e
+
+# Some shell-tools dependencies
+which ip > /dev/null
+which tc > /dev/null
+which ethtool > /dev/null
+
+# Make rest of shell verbose, showing comments as doc/info
+if [ -n "$VERBOSE" ]; then
+ set -v
+fi
+
+# Create two namespaces
+ip netns add ns1
+ip netns add ns2
+
+# Run cleanup if failing or on kill
+# (0 is EXIT; 2, 3 and 6 are INT, QUIT and ABRT; 9 is KILL, which a shell
+# cannot trap)
+trap cleanup 0 2 3 6 9
+
+# Create veth pair
+ip link add veth1 type veth peer name veth2
+
+# Move veth1 and veth2 into the respective namespaces
+ip link set veth1 netns ns1
+ip link set veth2 netns ns2
+
+# NOTICE: XDP requires the VLAN header inside the packet payload
+# - Thus, disable VLAN offloading driver features
+# - For veth REMEMBER TX side VLAN-offload
+#
+# Disable rx-vlan-offload (mostly needed on ns1)
+ip netns exec ns1 ethtool -K veth1 rxvlan off
+ip netns exec ns2 ethtool -K veth2 rxvlan off
+#
+# Disable tx-vlan-offload (mostly needed on ns2)
+ip netns exec ns2 ethtool -K veth2 txvlan off
+ip netns exec ns1 ethtool -K veth1 txvlan off
+
+export IPADDR1=100.64.41.1
+export IPADDR2=100.64.41.2
+
+# In ns1/veth1 add IP-addr on plain net_device
+ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1
+ip netns exec ns1 ip link set veth1 up
+
+# In ns2/veth2 create VLAN device
+export VLAN=4011
+export DEVNS2=veth2
+ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
+ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
+ip netns exec ns2 ip link set $DEVNS2 up
+ip netns exec ns2 ip link set $DEVNS2.$VLAN up
+
+# Bring up lo in netns (to avoid confusing people using --interactive)
+ip netns exec ns1 ip link set lo up
+ip netns exec ns2 ip link set lo up
+
+# At this point, the hosts cannot reach each other,
+# because ns2 is using VLAN tags on the packets.
+
+# The "|| echo" keeps the expected ping failure from tripping set -e
+ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Success: First ping must fail"'
+
+
+# Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags
+# ----------------------------------------------------------------------
+# In ns1: ingress use XDP to remove VLAN tags
+export DEVNS1=veth1
+export FILE=test_xdp_vlan.o
+
+# First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
+export XDP_PROG=xdp_vlan_change
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
+
+# In ns1: egress use TC to add back VLAN tag 4011
+# (del cmd)
+# tc qdisc del dev $DEVNS1 clsact 2> /dev/null
+#
+ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact
+ip netns exec ns1 tc filter add dev $DEVNS1 egress \
+ prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
+
+# Now the namespaces can reach each other, test with ping:
+ip netns exec ns2 ping -i 0.2 -W 2 -c 2 $IPADDR1
+ip netns exec ns1 ping -i 0.2 -W 2 -c 2 $IPADDR2
+
+# Second test: Replace the xdp prog with one that fully removes the vlan header
+#
+# Catch kernel bug for generic-XDP, that didn't allow us to
+# remove a VLAN header, because skb->protocol still contains the VLAN
+# ETH_P_8021Q indication, and this causes overwriting of our changes.
+#
+export XDP_PROG=xdp_vlan_remove_outer2
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE off
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
+
+# Now the namespaces should still be able to reach each other, test with ping:
+ip netns exec ns2 ping -i 0.2 -W 2 -c 2 $IPADDR1
+ip netns exec ns1 ping -i 0.2 -W 2 -c 2 $IPADDR2
diff --git a/tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh b/tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
new file mode 100755
index 000000000..c515326d6
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test generic-XDP
+export TESTNAME=xdp_vlan_mode_generic
+./test_xdp_vlan.sh --mode=xdpgeneric
diff --git a/tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh b/tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh
new file mode 100755
index 000000000..5cf7ce1f1
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test native-XDP
+export TESTNAME=xdp_vlan_mode_native
+./test_xdp_vlan.sh --mode=xdpdrv
diff --git a/tools/testing/selftests/bpf/test_xdping.sh b/tools/testing/selftests/bpf/test_xdping.sh
new file mode 100755
index 000000000..c2f0ddb45
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdping.sh
@@ -0,0 +1,99 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# xdping tests
+# Here we setup and teardown configuration required to run
+# xdping, exercising its options.
+#
+# Setup is similar to test_tunnel tests but without the tunnel.
+#
+# Topology:
+# ---------
+# root namespace | xdp_ns0 namespace
+# |
+# ---------- | ----------
+# | veth1 | --------- | veth0 |
+# ---------- peer ----------
+#
+# Device Configuration
+# --------------------
+# Root namespace with BPF
+# Device names and addresses:
+# veth1 IP: 10.1.1.200
+# xdp added to veth1, xdpings originate from here.
+#
+# Namespace xdp_ns0 with BPF
+# Device names and addresses:
+# veth0 IPv4: 10.1.1.100
+# For some tests xdping run in server mode here.
+#
+
+# Address and namespace of the xdping target, and the local veth1 address
+TARGET_IP="10.1.1.100"
+TARGET_NS="xdp_ns0"
+
+LOCAL_IP="10.1.1.200"
+
+readonly TARGET_IP TARGET_NS LOCAL_IP
+
+# Build the test topology: a veth pair with veth0 moved into $TARGET_NS
+# (address $TARGET_IP) and veth1 left in the current namespace (address
+# $LOCAL_IP); both ends are brought up.
+setup()
+{
+ ip netns add $TARGET_NS
+ ip link add veth0 type veth peer name veth1
+ ip link set veth0 netns $TARGET_NS
+ ip netns exec $TARGET_NS ip addr add ${TARGET_IP}/24 dev veth0
+ ip addr add ${LOCAL_IP}/24 dev veth1
+ ip netns exec $TARGET_NS ip link set veth0 up
+ ip link set veth1 up
+}
+
+cleanup()
+{
+ set +e
+ ip netns delete $TARGET_NS 2>/dev/null
+ ip link del veth1 2>/dev/null
+ if [[ $server_pid -ne 0 ]]; then
+ kill -TERM $server_pid
+ fi
+}
+
+test()
+{
+ client_args="$1"
+ server_args="$2"
+
+ echo "Test client args '$client_args'; server args '$server_args'"
+
+ server_pid=0
+ if [[ -n "$server_args" ]]; then
+ ip netns exec $TARGET_NS ./xdping $server_args &
+ server_pid=$!
+ sleep 10
+ fi
+ ./xdping $client_args $TARGET_IP
+
+ if [[ $server_pid -ne 0 ]]; then
+ kill -TERM $server_pid
+ server_pid=0
+ fi
+
+ echo "Test client args '$client_args'; server args '$server_args': PASS"
+}
+
+# Any failing command from here on ends the script; the EXIT trap below
+# then tears the topology down.
+set -e
+
+# 0 means "no server running"; cleanup() and test() both rely on that
+server_pid=0
+
+trap cleanup EXIT
+
+setup
+
+# Each client variant is run twice: without a server ("") and with an
+# xdping server started in the target namespace.
+for server_args in "" "-I veth0 -s -S" ; do
+ # client in skb mode
+ client_args="-I veth1 -S"
+ test "$client_args" "$server_args"
+
+ # client with count of 10 RTT measurements.
+ client_args="-I veth1 -S -c 10"
+ test "$client_args" "$server_args"
+done
+
+echo "OK. All tests passed"
+exit 0
diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c
new file mode 100644
index 000000000..800d503e5
--- /dev/null
+++ b/tools/testing/selftests/bpf/testing_helpers.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (C) 2020 Facebook, Inc. */
+#include <stdlib.h>
+#include <errno.h>
+#include "testing_helpers.h"
+
+/* Parse a comma-separated list of non-negative numbers and inclusive
+ * ranges, e.g. "1,3-5,8", into a boolean membership array.
+ *
+ * @s:           NUL-terminated list to parse
+ * @num_set:     on success receives a malloc'ed array in which element i is
+ *               true iff i was selected; the caller owns (and frees) it
+ * @num_set_len: on success receives the number of elements in *num_set
+ *
+ * Returns 0 on success. On failure returns a negative errno (-EINVAL for
+ * malformed input such as an empty or negative number, a reversed range, a
+ * dangling "N-" or an empty list; -ENOMEM; or the error reported by
+ * strtol()) and leaves *num_set and *num_set_len untouched.
+ */
+int parse_num_list(const char *s, bool **num_set, int *num_set_len)
+{
+	int i, set_len = 0, new_len, num, start = 0, end = -1;
+	bool *set = NULL, *tmp, parsing_end = false;
+	char *next;
+	int err;
+
+	while (s[0]) {
+		errno = 0;
+		num = strtol(s, &next, 10);
+		if (errno) {
+			err = -errno;
+			goto err_free;
+		}
+		/* No digits consumed (e.g. ",,") or a negative value: neither
+		 * is a valid index into the set.
+		 */
+		if (next == s || num < 0) {
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (parsing_end)
+			end = num;
+		else
+			start = num;
+
+		if (!parsing_end && *next == '-') {
+			/* First half of a range; go fetch the upper bound */
+			s = next + 1;
+			parsing_end = true;
+			continue;
+		} else if (*next == ',') {
+			parsing_end = false;
+			s = next + 1;
+			end = num;
+		} else if (*next == '\0') {
+			parsing_end = false;
+			s = next;
+			end = num;
+		} else {
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (start > end) {
+			err = -EINVAL;
+			goto err_free;
+		}
+
+		if (end + 1 > set_len) {
+			new_len = end + 1;
+			tmp = realloc(set, new_len * sizeof(*set));
+			if (!tmp) {
+				err = -ENOMEM;
+				goto err_free;
+			}
+			/* Clear the gap below the new range; the range itself
+			 * is filled in right after.
+			 */
+			for (i = set_len; i < start; i++)
+				tmp[i] = false;
+			set = tmp;
+			set_len = new_len;
+		}
+		for (i = start; i <= end; i++)
+			set[i] = true;
+	}
+
+	/* A trailing "N-" never received its upper bound; an empty input
+	 * never allocated anything.
+	 */
+	if (parsing_end || !set) {
+		err = -EINVAL;
+		goto err_free;
+	}
+
+	*num_set = set;
+	*num_set_len = set_len;
+
+	return 0;
+
+err_free:
+	free(set);
+	return err;
+}
+
+/* Fill @info with the kernel's information about @link and return the id of
+ * the attached program. On failure a message is printed and 0 is returned.
+ */
+__u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info)
+{
+	__u32 info_len = sizeof(*info);
+	int fd, err;
+
+	memset(info, 0, sizeof(*info));
+	fd = bpf_link__fd(link);
+	err = bpf_obj_get_info_by_fd(fd, info, &info_len);
+	if (!err)
+		return info->prog_id;
+
+	printf("failed to get link info: %d\n", -errno);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h
new file mode 100644
index 000000000..d4f8e7496
--- /dev/null
+++ b/tools/testing/selftests/bpf/testing_helpers.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (C) 2020 Facebook, Inc. */
+#ifndef __TESTING_HELPERS_H
+#define __TESTING_HELPERS_H
+
+#include <stdbool.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+/* Parse a list such as "1,3-5" into a malloc'ed bool array (*set, with
+ * *set_len elements); returns 0 or a negative errno.
+ */
+int parse_num_list(const char *s, bool **set, int *set_len);
+/* Fill *info for @link and return its prog_id, or 0 on failure. */
+__u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info);
+
+#endif /* __TESTING_HELPERS_H */
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 000000000..1bbd1d983
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+/* Directory prefix that read_trace_pipe() prepends to "trace_pipe" */
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
+/* Capacity of the symbol table filled by load_kallsyms() */
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+/* Number of valid entries in syms[]; stays 0 until load_kallsyms() ran */
+static int sym_cnt;
+
+/* qsort() comparator ordering struct ksym entries by ascending addr.
+ *
+ * The addresses are compared rather than subtracted: the difference of two
+ * longs does not fit the int return value and may come out with the wrong
+ * sign once truncated.
+ */
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	long a = ((const struct ksym *)p1)->addr;
+	long b = ((const struct ksym *)p2)->addr;
+
+	return (a > b) - (a < b);
+}
+
+/* Read /proc/kallsyms into syms[] and sort the table by address.
+ *
+ * Entries with a zero address are skipped; parsing stops at the first line
+ * that does not match "<addr> <type> <name>".
+ *
+ * Returns 0 on success, -ENOENT if /proc/kallsyms cannot be opened, or
+ * -EFBIG if the file holds more symbols than syms[] has room for.
+ */
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (fgets(buf, sizeof(buf), f)) {
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		/* Never write past the end of the static table */
+		if (i >= MAX_SYMS) {
+			fclose(f);
+			return -EFBIG;
+		}
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	fclose(f);
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+/* Look up the symbol covering address @key in the table built by
+ * load_kallsyms().
+ *
+ * Returns NULL when no symbols are loaded, the entry whose address equals
+ * @key, otherwise the entry directly below @key when @key lies between two
+ * neighbouring entries, and the first table entry when it does not.
+ */
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+
+	/* kallsyms not loaded. return NULL */
+	if (sym_cnt <= 0)
+		return NULL;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+		long addr = syms[mid].addr;
+
+		/* Compare instead of subtracting: a long difference squeezed
+		 * into an int can flip sign and misdirect the search.
+		 */
+		if (key < addr)
+			end = mid;
+		else if (key > addr)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+/* Linear search of the loaded symbol table for an exact name match.
+ * Returns the symbol's address, or 0 when @name is not in the table.
+ */
+long ksym_get_addr(const char *name)
+{
+	const struct ksym *sym, *last = syms + sym_cnt;
+
+	for (sym = syms; sym < last; sym++)
+		if (!strcmp(sym->name, name))
+			return sym->addr;
+
+	return 0;
+}
+
+/* Scan /proc/kallsyms directly for @sym, without building the symbol table
+ * first, and store its address in *@addr.
+ *
+ * Returns 0 when found, -ENOENT when the symbol is not listed and -EINVAL
+ * when /proc/kallsyms cannot be opened.
+ */
+int kallsyms_find(const char *sym, unsigned long long *addr)
+{
+	unsigned long long value;
+	char type, name[500];
+	int err = -ENOENT;
+	FILE *f;
+
+	f = fopen("/proc/kallsyms", "r");
+	if (!f)
+		return -EINVAL;
+
+	while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
+		if (strcmp(name, sym))
+			continue;
+		*addr = value;
+		err = 0;
+		break;
+	}
+
+	fclose(f);
+	return err;
+}
+
+/* Copy everything read from the tracing trace_pipe file to stdout.
+ * Returns only if the file cannot be opened; otherwise loops forever.
+ */
+void read_trace_pipe(void)
+{
+	static char buf[4096];
+	int trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+
+	if (trace_fd < 0)
+		return;
+
+	for (;;) {
+		/* Leave room for the terminator added below */
+		ssize_t sz = read(trace_fd, buf, sizeof(buf) - 1);
+
+		if (sz <= 0)
+			continue;
+		buf[sz] = 0;
+		puts(buf);
+	}
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 000000000..f62fdef9e
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+#include <bpf/libbpf.h>
+
+/* One kernel symbol as read from /proc/kallsyms */
+struct ksym {
+ long addr; /* symbol address */
+ char *name; /* symbol name, heap-allocated by load_kallsyms() */
+};
+
+/* Read /proc/kallsyms into an address-sorted table; 0 on success, negative
+ * errno if the file cannot be opened.
+ */
+int load_kallsyms(void);
+/* Find the table entry for address @key; NULL if nothing has been loaded. */
+struct ksym *ksym_search(long key);
+/* Address of the loaded symbol called @name, or 0 if there is none. */
+long ksym_get_addr(const char *name);
+
+/* open kallsyms and find addresses on the fly, faster than load + search. */
+int kallsyms_find(const char *sym, unsigned long long *addr);
+
+/* Echo the tracing trace_pipe to stdout; does not return once it is open. */
+void read_trace_pipe(void);
+
+#endif
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
new file mode 100644
index 000000000..db7810527
--- /dev/null
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -0,0 +1,35 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#define BUF_SIZE 256
+
+/* Issue @count reads of BUF_SIZE bytes from @fd; the data and the result of
+ * each read are discarded. Kept out of line via the noinline attribute.
+ */
+static __attribute__((noinline))
+void urandom_read(int fd, int count)
+{
+	char buf[BUF_SIZE];
+	int remaining = count;
+
+	while (remaining-- > 0)
+		read(fd, buf, BUF_SIZE);
+}
+
+/* Usage: urandom_read [count]
+ *
+ * Reads from /dev/urandom @count times (4 unless exactly one argument is
+ * given). Exits with 1 if the device cannot be opened, 0 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	int count;
+	int fd;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	count = (argc == 2) ? atoi(argv[1]) : 4;
+
+	urandom_read(fd, count);
+
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/verifier/.gitignore b/tools/testing/selftests/bpf/verifier/.gitignore
new file mode 100644
index 000000000..89c4a3d37
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+tests.h
diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c
new file mode 100644
index 000000000..7d7ebee5c
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/and.c
@@ -0,0 +1,68 @@
+/* Looks up a map value, loads one byte from it, ANDs that byte with -4,
+ * shifts it left by 2 and adds the result to the value pointer before
+ * storing through it. The load is expected to be REJECTed with the
+ * "R0 max value is outside of the allowed memory range" message.
+ */
+{
+ "invalid and of negative number",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, -4),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R0 max value is outside of the allowed memory range",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+/* Derives an offset in R3 from a loaded map value through a chain of 32-bit
+ * MOD/ADD/AND/RSH/SUB/MUL operations and adds it to the value pointer
+ * before storing through it. The load is expected to be REJECTed with the
+ * "R0 max value is outside of the allowed memory range" message.
+ */
+{
+ "invalid range check",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 12),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+ BPF_ALU32_IMM(BPF_MOD, BPF_REG_1, 2),
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_ALU32_REG(BPF_AND, BPF_REG_9, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_9, 1),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_9, 1),
+ BPF_MOV32_IMM(BPF_REG_3, 1),
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_3, BPF_REG_9),
+ BPF_ALU32_IMM(BPF_MUL, BPF_REG_3, 0x10000000),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3),
+ BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_3, 0),
+ /* NOTE(review): the source operand is the literal 0 rather than a
+ * BPF_REG_* name - confirm this is intended.
+ */
+ BPF_MOV64_REG(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R0 max value is outside of the allowed memory range",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+/* Privileged load: expected to be ACCEPTed with a return value of 0.
+ * Unprivileged load: expected to be REJECTed with "R1 !read_ok".
+ */
+{
+ "check known subreg with unknown reg",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 32),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xFFFF1234),
+ /* Upper bits are unknown but AND above masks out 1 zero'ing lower bits */
+ BPF_JMP32_IMM(BPF_JLT, BPF_REG_0, 1, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 512),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 0
+},
diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c
new file mode 100644
index 000000000..1b138cd2b
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/array_access.c
@@ -0,0 +1,379 @@
+{
+ "valid map access into an array with a constant", /* 8-byte store at offset 0 of the looked-up value */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte zero at fp-8, used as the lookup key */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(), /* r0 still holds the lookup result here */
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "valid map access into an array with a register",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "valid map access into an array with a variable",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, MAX_ENTRIES, 3),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "valid map access into an array with a signed variable", /* index is clamped with signed compares on both ends before use */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), /* r0 == 0: jump straight to EXIT */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = index read from the map value */
+ BPF_JMP32_IMM(BPF_JSGT, BPF_REG_1, 0xffffffff, 1), /* floor check: 0xffffffff is -1 as a signed 32-bit imm; w1 > -1 skips the reset */
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* negative index: reset to 0 */
+ BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), /* ceiling check: MAX_ENTRIES > r1 (signed) skips the reset */
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* index too large: reset to 0 */
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), /* index * 4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid map access into an array with a constant", /* constant store offset lies past the end of the value */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, (MAX_ENTRIES + 1) << 2,
+ offsetof(struct test_val, foo)), /* 8-byte store at a fixed insn offset */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr = "invalid access to map value, value_size=48 off=48 size=8", /* off=48 is the (MAX_ENTRIES + 1) << 2 store offset */
+ .result = REJECT,
+},
+{
+ "invalid map access into an array with a register",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_1, MAX_ENTRIES + 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R0 min value is outside of the allowed memory range",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid map access into an array with a variable",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R0 unbounded memory access, make sure to bounds check any such access",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid map access into an array with no floor check", /* only a signed upper-bound check is done on the index */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: jump straight to EXIT */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* 64-bit index read from the map value */
+ BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), /* MAX_ENTRIES > r1 (signed) skips the reset; negative r1 passes too */
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), /* index * 4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr_unpriv = "R0 leaks addr",
+ .errstr = "R0 unbounded memory access",
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid map access into an array with a invalid max check", /* upper bound is off by one: index == MAX_ENTRIES is let through */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: jump straight to EXIT */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = index read from the map value */
+ BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES + 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1), /* MAX_ENTRIES + 1 > r1 (unsigned) skips the reset */
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), /* index * 4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr_unpriv = "R0 leaks addr",
+ .errstr = "invalid access to map value, value_size=48 off=44 size=8", /* 44 + 8 > 48; off=44 presumably is MAX_ENTRIES << 2 */
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid map access into an array with a invalid max check", /* NOTE(review): same name as the preceding entry, but this one adds two map value pointers (see errstr) */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), /* first lookup gave 0: jump straight to EXIT */
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* keep the first lookup result in r8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* second lookup gave 0: jump straight to EXIT */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8), /* second lookup result += first lookup result */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3, 11 }, /* indices of the two BPF_LD_MAP_FD insns (each takes two insn slots) */
+ .errstr = "R0 pointer += pointer",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "valid read map access into a read-only array 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_ro = { 3 },
+ .result = ACCEPT,
+ .retval = 28,
+},
+{
+ "valid read map access into a read-only array 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_csum_diff),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_array_ro = { 3 },
+ .result = ACCEPT,
+ .retval = 65507,
+},
+{
+ "invalid write map access into a read-only array 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_ro = { 3 },
+ .result = REJECT,
+ .errstr = "write into map forbidden",
+},
+{
+ "invalid write map access into a read-only array 2", /* helper is handed the read-only map value in r3 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* save the entry value of r1 across the lookup call */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: jump straight to EXIT */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* arg1 = saved entry r1 */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* arg2 = 0 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), /* arg3 = map value pointer */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* arg4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_load_bytes),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_array_ro = { 4 }, /* index of BPF_LD_MAP_FD; one higher than in sibling tests because of the leading MOV */
+ .result = REJECT,
+ .errstr = "write into map forbidden",
+},
+{
+ "valid write map access into a write-only array 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_wo = { 3 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "valid write map access into a write-only array 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_4, 8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_load_bytes),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_array_wo = { 4 },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "invalid read map access into a write-only array 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_wo = { 3 },
+ .result = REJECT,
+ .errstr = "read from map forbidden",
+},
+{
+ "invalid read map access into a write-only array 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_array_wo = { 3 },
+ .result = REJECT,
+ .errstr = "read from map forbidden",
+},
diff --git a/tools/testing/selftests/bpf/verifier/basic.c b/tools/testing/selftests/bpf/verifier/basic.c
new file mode 100644
index 000000000..de84f0d57
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/basic.c
@@ -0,0 +1,23 @@
+{
+ "empty prog", /* program with no instructions at all */
+ .insns = {
+ },
+ .errstr = "last insn is not an exit or jmp",
+ .result = REJECT,
+},
+{
+ "only exit insn", /* exits without ever writing r0 */
+ .insns = {
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok",
+ .result = REJECT,
+},
+{
+ "no bpf_exit", /* program whose only (and last) insn is a register move */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_0, BPF_REG_2),
+ },
+ .errstr = "not an exit",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/basic_call.c b/tools/testing/selftests/bpf/verifier/basic_call.c
new file mode 100644
index 000000000..a8c6ab4c1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/basic_call.c
@@ -0,0 +1,50 @@
+{
+ "invalid call insn1", /* call opcode with the BPF_X source bit also set */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL | BPF_X, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unknown opcode 8d", /* 8d: presumably the BPF_JMP | BPF_CALL | BPF_X opcode byte in hex */
+ .result = REJECT,
+},
+{
+ "invalid call insn2", /* call insn with a non-zero value in a field that must stay zero */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 1, 0), /* differs from a plain call only by the 1 in the 4th argument (presumably the off field) */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_CALL uses reserved",
+ .result = REJECT,
+},
+{
+ "invalid function call", /* call with an id that names no helper */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, 1234567), /* the same number is echoed in errstr */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid func unknown#1234567",
+ .result = REJECT,
+},
+{
+ "invalid argument register", /* same helper called twice without setting r1 again in between */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid), /* r1 was not re-initialised after the first call */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 !read_ok",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "non-invalid argument register", /* same two calls, but r1 is saved in r6 and restored between them */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1), /* r6 = entry value of r1 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_1, BPF_REG_6), /* restore r1 before the second call */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_cgroup_classid),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/basic_instr.c b/tools/testing/selftests/bpf/verifier/basic_instr.c
new file mode 100644
index 000000000..071dbc889
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/basic_instr.c
@@ -0,0 +1,219 @@
+{
+ "add+sub+mul", /* straight-line 64-bit arithmetic on constants */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 2), /* r1 = 3 */
+ BPF_MOV64_IMM(BPF_REG_2, 3),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_2), /* r1 = 3 - 3 = 0 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), /* r1 = -1 */
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 3), /* r1 = -3 */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = -3,
+},
+{
+ "xor32 zero extend check", /* a 32-bit XOR of a register with itself must leave the whole 64-bit register zero */
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_2, -1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 32), /* populate the upper 32 bits */
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 0xffff), /* and some of the lower bits */
+ BPF_ALU32_REG(BPF_XOR, BPF_REG_2, BPF_REG_2), /* 32-bit r2 ^= r2 */
+ BPF_MOV32_IMM(BPF_REG_0, 2), /* r0 = 2 unless overwritten below */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 1), /* 64-bit compare: r2 != 0 skips the r0 = 1 below */
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1, /* i.e. r2 compared equal to 0 */
+},
+{
+ "arsh32 on imm",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 5),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "arsh32 on imm 2", /* 32-bit arithmetic right shift of a value whose bit 31 is set */
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 0x1122334485667788), /* low 32 bits are 0x85667788 */
+ BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 7),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = -16069393, /* 0x85667788 as a signed 32-bit value, shifted right by 7 with sign fill */
+},
+{
+ "arsh32 on reg",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 5),
+ BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "arsh32 on reg 2", /* 32-bit arithmetic right shift by a register amount; bit 31 of the operand is clear */
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 0xffff55667788), /* low 32 bits are 0x55667788 */
+ BPF_MOV64_IMM(BPF_REG_1, 15),
+ BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 43724, /* 0x55667788 >> 15 */
+},
+{
+ "arsh64 on imm",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_0, 5),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "arsh64 on reg",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 5),
+ BPF_ALU64_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "lsh64 by 0 imm", /* shifting left by an immediate 0 must leave the value unchanged */
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1), /* r0 = 1: the "pass" return value */
+ BPF_LD_IMM64(BPF_REG_1, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1), /* r1 still 1: skip the r0 = 2 below */
+ BPF_MOV64_IMM(BPF_REG_0, 2), /* only reached if the shift changed r1 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "rsh64 by 0 imm",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000LL),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 0),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "arsh64 by 0 imm",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000LL),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 0),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "lsh64 by 0 reg",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 1),
+ BPF_LD_IMM64(BPF_REG_2, 0),
+ BPF_ALU64_REG(BPF_LSH, BPF_REG_1, BPF_REG_2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "rsh64 by 0 reg",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000LL),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_LD_IMM64(BPF_REG_3, 0),
+ BPF_ALU64_REG(BPF_RSH, BPF_REG_1, BPF_REG_3),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "arsh64 by 0 reg",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000LL),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1),
+ BPF_LD_IMM64(BPF_REG_3, 0),
+ BPF_ALU64_REG(BPF_ARSH, BPF_REG_1, BPF_REG_3),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "invalid 64-bit BPF_END", /* BPF_END combined with the ALU64 class is expected to be refused */
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ { /* hand-built insn: spelled out field by field instead of via a macro */
+ .code = BPF_ALU64 | BPF_END | BPF_TO_LE,
+ .dst_reg = BPF_REG_0,
+ .src_reg = 0,
+ .off = 0,
+ .imm = 32,
+ },
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unknown opcode d7", /* d7: presumably the .code value built above, in hex */
+ .result = REJECT,
+},
+{
+ "mov64 src == dst",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_2),
+ // Check bounds are OK
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "mov64 src != dst",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_3),
+ // Check bounds are OK
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/basic_stack.c b/tools/testing/selftests/bpf/verifier/basic_stack.c
new file mode 100644
index 000000000..f995777dd
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/basic_stack.c
@@ -0,0 +1,64 @@
+{
+ "stack out of bounds", /* store at a positive offset from r10 */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), /* offset is +8, unlike the negative offsets used elsewhere */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid write to stack",
+ .result = REJECT,
+},
+{
+ "uninitialized stack1", /* helper is given a stack pointer whose slot was never written */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8; nothing has been stored there */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 2 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr = "invalid indirect read from stack",
+ .result = REJECT,
+},
+{
+ "uninitialized stack2", /* direct load from a stack slot that was never written */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -8), /* reads fp-8 with no prior store */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid read from stack",
+ .result = REJECT,
+},
+{
+ "invalid fp arithmetic", /* BPF_SUB applied to a copy of the frame pointer */
+ /* If this ever gets changed, make sure JITs can deal with it. */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = fp */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 8), /* the insn errstr refers to */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 subtraction from stack pointer",
+ .result = REJECT,
+},
+{
+ "non-invalid fp arithmetic", /* addresses the stack via the insn offset, with no arithmetic on r10 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), /* 8-byte store of r0 at fp-8 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "misaligned read from stack", /* 8-byte load at an offset that is not a multiple of 8 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, -4), /* BPF_DW access at fp-4 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned stack access",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/basic_stx_ldx.c b/tools/testing/selftests/bpf/verifier/basic_stx_ldx.c
new file mode 100644
index 000000000..7a0aab3f2
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/basic_stx_ldx.c
@@ -0,0 +1,45 @@
+{
+ "invalid src register in STX", /* STX with an out-of-range source register number */
+ .insns = {
+ BPF_STX_MEM(BPF_B, BPF_REG_10, -1, -1), /* src is given as -1; errstr reports it as R15 (presumably truncated to the register field width) */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid",
+ .result = REJECT,
+},
+{
+ "invalid dst register in STX", /* STX with an out-of-range destination register number */
+ .insns = {
+ BPF_STX_MEM(BPF_B, 14, BPF_REG_10, -1), /* dst is the raw number 14, echoed in errstr */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+},
+{
+ "invalid dst register in ST", /* ST (immediate store) with an out-of-range destination register number */
+ .insns = {
+ BPF_ST_MEM(BPF_B, 14, -1, -1), /* dst is the raw number 14, echoed in errstr */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R14 is invalid",
+ .result = REJECT,
+},
+{
+ "invalid src register in LDX", /* LDX with an out-of-range source register number */
+ .insns = {
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, 12, 0), /* src is the raw number 12, echoed in errstr */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R12 is invalid",
+ .result = REJECT,
+},
+{
+ "invalid dst register in LDX", /* LDX with an out-of-range destination register number */
+ .insns = {
+ BPF_LDX_MEM(BPF_B, 11, BPF_REG_1, 0), /* dst is the raw number 11, echoed in errstr */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R11 is invalid",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c
new file mode 100644
index 000000000..e061e8799
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/bounds.c
@@ -0,0 +1,755 @@
+{
+ "subtraction bounds (map value) variant 1", /* difference of two bytes, logically shifted right, used as a pointer offset */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), /* r0 == 0: jump to the r0 = 0 epilogue */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = byte 0 of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0xff, 7), /* r1 > 0xff: jump to the epilogue */
+ BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_0, 1), /* r3 = byte 1 of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, 0xff, 5), /* r3 > 0xff: jump to the epilogue */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_3), /* r1 = [-0xff, 0xff] */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 56), /* keeps only the top byte: 0 if r1 was >= 0, 0xff if it was negative */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* epilogue reached by the three jumps above */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr = "R0 max value is outside of the allowed memory range",
+ .result = REJECT,
+},
+{
+ "subtraction bounds (map value) variant 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0xff, 6),
+ BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, 0xff, 4),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .errstr = "R0 min value is negative, either use unsigned index or do a if (index >=0) check.",
+ .errstr_unpriv = "R1 has unknown scalar with mixed signed bounds",
+ .result = REJECT,
+},
+{
+ "check subtraction on pointers for unpriv",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_MAP_FD(BPF_REG_ARG1, 0),
+ BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 9),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_FP),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_0),
+ BPF_LD_MAP_FD(BPF_REG_ARG1, 0),
+ BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_ARG2, 0, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_9, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1, 9 },
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R9 pointer -= pointer prohibited",
+},
+{
+ "bounds check based on zero-extended MOV",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ /* r2 = 0x0000'0000'ffff'ffff */
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff),
+ /* r2 = 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+ /* no-op */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ /* access at offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT
+},
+{
+ "bounds check based on sign-extended MOV. test1", /* 64-bit MOV of 0xffffffff, shifted right by 32, used as a pointer offset */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the r0 = 0 epilogue */
+ /* r2 = 0xffff'ffff'ffff'ffff */
+ BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+ /* r2 = 0xffff'ffff */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 32),
+ /* r0 = <oob pointer> */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ /* access to OOB pointer */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ .errstr = "map_value pointer and 4294967295", /* 4294967295 == 0xffff'ffff, the value of r2 after the shift */
+ .result = REJECT
+},
+{
+ "bounds check based on sign-extended MOV. test2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ /* r2 = 0xffff'ffff'ffff'ffff */
+ BPF_MOV64_IMM(BPF_REG_2, 0xffffffff),
+ /* r2 = 0xfff'ffff */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 36),
+ /* r0 = <oob pointer> */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ /* access to OOB pointer */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .errstr = "R0 min value is outside of the allowed memory range",
+ .result = REJECT
+},
+{
+ "bounds check based on reg_off + var_off + insn_off. test1", /* offset built from a variable add, a constant add and the insn offset */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)), /* r6 = mark field read via the entry value of r1 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* NOTE(review): +4 lands on the BPF_LDX_MEM through r0 below, not on the r0 = 0 epilogue - confirm this is intended */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), /* r6 = [0, 1] */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 29) - 1), /* r6 = [2^29 - 1, 2^29] */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6), /* variable part of the offset */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1), /* constant part of the offset */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3), /* plus insn offset 3 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of BPF_LD_MAP_FD; the leading LDX shifts it by one */
+ .errstr = "value_size=8 off=1073741825", /* 1073741825 == ((1 << 29) - 1) + ((1 << 29) - 1) + 3 */
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "bounds check based on reg_off + var_off + insn_off. test2", /* as test1, but the variable part starts at (1 << 30) - 1 */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)), /* r6 = mark field read via the entry value of r1 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* NOTE(review): +4 lands on the BPF_LDX_MEM through r0 below, not on the r0 = 0 epilogue - confirm this is intended */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_6, 1), /* r6 = [0, 1] */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, (1 << 30) - 1), /* r6 = [2^30 - 1, 2^30] */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, (1 << 29) - 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of BPF_LD_MAP_FD; the leading LDX shifts it by one */
+ .errstr = "value 1073741823", /* 1073741823 == (1 << 30) - 1 */
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "bounds check after truncation of non-boundary-crossing range",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+ /* r1 = [0x00, 0xff] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ /* r2 = 0x10'0000'0000 */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 36),
+ /* r1 = [0x10'0000'0000, 0x10'0000'00ff] */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+ /* r1 = [0x10'7fff'ffff, 0x10'8000'00fe] */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+ /* r1 = [0x00, 0xff] */
+ BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 0x7fffffff),
+ /* r1 = 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8),
+ /* no-op */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ /* access at offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT
+},
+{
+ "bounds check after truncation of boundary-crossing range (1)", /* range straddling 2^32 is truncated by a 32-bit ALU op */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r0 == 0: jump straight to the final EXIT */
+ /* r1 = [0x00, 0xff] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), /* r1 = [0x7fff'ffc0, 0x8000'00bf] */
+ /* r1 = [0xffff'ff80, 0x1'0000'007f] */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1),
+ /* r1 = [0xffff'ff80, 0xffff'ffff] or
+ * [0x0000'0000, 0x0000'007f]
+ */
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 0), /* adds 0: only the 32-bit truncation matters */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+ /* r1 = [0x00, 0xff] or
+ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+ */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1),
+ /* error on OOB pointer computation */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* index of the BPF_LD_MAP_FD insn in .insns */
+ /* not actually fully unbounded, but the bound is very high */
+ .errstr = "value -4294967168 makes map_value pointer be out of bounds", /* -4294967168 == 0xffff'ffff'0000'0080 read as signed 64-bit */
+ .result = REJECT,
+},
+{ /* Test entry: like the ALU32 variant of this test, but r1 is truncated by a 32-bit register move; expected result REJECT. */
+ "bounds check after truncation of boundary-crossing range (2)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ /* r1 = [0x00, 0xff] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u8 *)(r0 + 0) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), /* r1 += 0x7fffffc0 */
+ /* r1 = [0xffff'ff80, 0x1'0000'007f] */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0xffffff80 >> 1), /* r1 += 0x7fffffc0 (0xffffff80 added in total) */
+ /* r1 = [0xffff'ff80, 0xffff'ffff] or
+ * [0x0000'0000, 0x0000'007f]
+ * difference to previous test: truncation via MOV32
+ * instead of ALU32.
+ */
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_1), /* 32-bit move of r1 onto itself: the truncation step */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), /* r1 -= 0x7fffffc0 */
+ /* r1 = [0x00, 0xff] or
+ * [0xffff'ffff'0000'0080, 0xffff'ffff'ffff'ffff]
+ */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 0xffffff80 >> 1), /* r1 -= 0x7fffffc0 (0xffffff80 subtracted in total) */
+ /* error on OOB pointer computation */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "value -4294967168 makes map_value pointer be out of bounds",
+ .result = REJECT,
+},
+{ /* Test entry: a constant that wraps to 0 under a 32-bit add is used as offset on the lookup result; expected result ACCEPT. */
+ "bounds check after wrapping 32-bit addition",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* if r0 == 0 goto +5 */
+ /* r1 = 0x7fff'ffff */
+ BPF_MOV64_IMM(BPF_REG_1, 0x7fffffff),
+ /* r1 = 0xffff'fffe */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff), /* 64-bit add, no wrap yet */
+ /* r1 = 0 */
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 2), /* 32-bit add: 0xfffffffe + 2 wraps to 0 */
+ /* no-op */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ /* access at offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* r0 = *(u8 *)(r0 + 0) */
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT
+},
+{ /* Test entry: a 32-bit left shift by a count of 32, masked to 16 bits, is used as offset on the lookup result; expected result REJECT. */
+ "bounds check after shift with oversized count operand",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ BPF_MOV64_IMM(BPF_REG_2, 32), /* r2 = 32, the shift count */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+ /* r1 = (u32)1 << (u32)32 = ? */
+ BPF_ALU32_REG(BPF_LSH, BPF_REG_1, BPF_REG_2), /* 32-bit r1 <<= r2 */
+ /* r1 = [0x0000, 0xffff] */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xffff), /* r1 &= 0xffff */
+ /* computes unknown pointer, potentially OOB */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ /* potentially OOB access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* r0 = *(u8 *)(r0 + 0) */
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "R0 max value is outside of the allowed memory range",
+ .result = REJECT
+},
+{ /* Test entry: a value that may be -1 is right-shifted twice with BPF_RSH and used as offset on the lookup result; expected result REJECT. */
+ "bounds check after right shift of maybe-negative number",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ /* r1 = [0x00, 0xff] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u8 *)(r0 + 0) */
+ /* r1 = [-0x01, 0xfe] */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1), /* r1 -= 1 */
+ /* r1 = 0 or 0xff'ffff'ffff'ffff */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* r1 >>= 8 */
+ /* r1 = 0 or 0xffff'ffff'ffff */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 8), /* r1 >>= 8 */
+ /* computes unknown pointer, potentially OOB */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ /* potentially OOB access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0), /* r0 = *(u8 *)(r0 + 0) */
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "R0 unbounded memory access",
+ .result = REJECT
+},
+{ /* Test entry: a 32-bit right shift and 32-bit subtract are applied to a register holding 1<<32, then added to the lookup result; expected result REJECT. */
+ "bounds check after 32-bit right shift with 64-bit input",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ /* r1 = 2 */
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ /* r1 = 1<<32 */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 31), /* 64-bit r1 <<= 31 */
+ /* r1 = 0 (NOT 2!) */
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 31), /* 32-bit r1 >>= 31 */
+ /* r1 = 0xffff'fffe (NOT 0!) */
+ BPF_ALU32_IMM(BPF_SUB, BPF_REG_1, 2), /* 32-bit r1 -= 2 */
+ /* error on computing OOB pointer */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ /* exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "math between map_value pointer and 4294967294 is not allowed", /* 4294967294 == 0xfffffffe */
+ .result = REJECT,
+},
+{ /* Test entry: 0x7ffffffe is added to the lookup result before an 8-byte load; expected result REJECT. */
+ "bounds check map access with off+size signed 32bit overflow. test1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7ffffffe), /* r0 += 0x7ffffffe (2147483646) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* r0 = *(u64 *)(r0 + 0) */
+ BPF_JMP_A(0), /* goto +0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "map_value pointer and 2147483646",
+ .result = REJECT
+},
+{ /* Test entry: 0x1fffffff is added three times to the lookup result before an 8-byte load; expected result REJECT. */
+ "bounds check map access with off+size signed 32bit overflow. test2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), /* r0 += 0x1fffffff */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), /* r0 += 0x1fffffff (0x3ffffffe == 1073741822 added so far) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x1fffffff), /* r0 += 0x1fffffff, third time */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* r0 = *(u64 *)(r0 + 0) */
+ BPF_JMP_A(0), /* goto +0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "pointer offset 1073741822",
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .result = REJECT
+},
+{ /* Test entry: 0x1fffffff is subtracted twice from the lookup result before an 8-byte load at offset 2; expected result REJECT. */
+ "bounds check map access with off+size signed 32bit overflow. test3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff), /* r0 -= 0x1fffffff */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 0x1fffffff), /* r0 -= 0x1fffffff (0x3ffffffe == 1073741822 subtracted in total) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2), /* r0 = *(u64 *)(r0 + 2) */
+ BPF_JMP_A(0), /* goto +0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "pointer offset -1073741822",
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .result = REJECT
+},
+{ /* Test entry: the constant 1000000 * 1000000 is added to the lookup result before an 8-byte load at offset 2; expected result REJECT. */
+ "bounds check map access with off+size signed 32bit overflow. test4",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_MOV64_IMM(BPF_REG_1, 1000000), /* r1 = 1000000 */
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_1, 1000000), /* r1 *= 1000000, giving 1000000000000 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 2), /* r0 = *(u64 *)(r0 + 2) */
+ BPF_JMP_A(0), /* goto +0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr = "map_value pointer and 1000000000000",
+ .result = REJECT
+},
+{ /* Test entry: a 64-bit add sits between two 32-bit compares of r1; the load through r0 == 0 must be seen as unreachable. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check mixed 32bit and 64bit arithmetic. test1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_MOV64_IMM(BPF_REG_1, -1), /* r1 = -1 */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), /* r1 <<= 32 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), /* r1 += 1 */
+ /* r1 = 0xffffFFFF00000001 */
+ BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 1, 3), /* 32-bit compare: if (u32)r1 > 1 goto +3 */
+ /* check ALU64 op keeps 32bit bounds */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), /* 64-bit r1 += 1 */
+ BPF_JMP32_IMM(BPF_JGT, BPF_REG_1, 2, 1), /* 32-bit compare: if (u32)r1 > 2 goto +1 */
+ BPF_JMP_A(1), /* goto +1 (skip the load) */
+ /* invalid ldx if bounds are lost above */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), /* r0 = *(u64 *)(r0 - 1), with r0 == 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT
+},
+{ /* Test entry: a 32-bit add on r1 is followed by a 64-bit compare against 3; the load through r0 == 0 must be seen as unreachable. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check mixed 32bit and 64bit arithmetic. test2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_MOV64_IMM(BPF_REG_1, -1), /* r1 = -1 */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32), /* r1 <<= 32 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1), /* r1 += 1 */
+ /* r1 = 0xffffFFFF00000001 */
+ BPF_MOV64_IMM(BPF_REG_2, 3), /* r2 = 3 */
+ /* r1 = 0x2 */
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_1, 1), /* 32-bit r1 += 1 */
+ /* check ALU32 op zero extends 64bit bounds */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 1), /* 64-bit compare: if r1 > r2 goto +1 */
+ BPF_JMP_A(1), /* goto +1 (skip the load) */
+ /* invalid ldx if bounds are lost above */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, -1), /* r0 = *(u64 *)(r0 - 1), with r0 == 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT
+},
+{ /* Test entry: a zero produced by 32-bit moves is added to the data value, which is range-checked against data_end before a 4-byte load; expected result ACCEPT. */
+ "assigning 32bit bounds to 64bit for wA = 0, wB = wA",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)), /* r8 = 32-bit load of __sk_buff.data_end via r1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data)), /* r7 = 32-bit load of __sk_buff.data via r1 */
+ BPF_MOV32_IMM(BPF_REG_9, 0), /* 32-bit r9 = 0 (the "wA = 0" of the test name) */
+ BPF_MOV32_REG(BPF_REG_2, BPF_REG_9), /* 32-bit r2 = r9 (the "wB = wA" of the test name) */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), /* r6 = r7 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_2), /* r6 += r2 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8), /* r3 += 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_8, 1), /* if r3 > r8 goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_6, 0), /* r5 = *(u32 *)(r6 + 0) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Test entry: r1 = 0 ^ 1 is compared with 0; an 8-byte load at offset 8 sits on the r1 == 0 path. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check for reg = 0, reg xor 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_MOV64_IMM(BPF_REG_1, 0), /* r1 = 0 */
+ BPF_ALU64_IMM(BPF_XOR, BPF_REG_1, 1), /* r1 ^= 1, so r1 == 1 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* if r1 != 0 goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: 32-bit form of "r1 = 0 ^ 1, compare with 0"; an 8-byte load at offset 8 sits on the r1 == 0 path. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check for reg32 = 0, reg32 xor 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* 32-bit r1 = 0 */
+ BPF_ALU32_IMM(BPF_XOR, BPF_REG_1, 1), /* 32-bit r1 ^= 1, so r1 == 1 */
+ BPF_JMP32_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* 32-bit compare: if r1 != 0 goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: r1 = 2 ^ 3 is compared with 0 (unsigned >); an 8-byte load at offset 8 sits on the fall-through path. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check for reg = 2, reg xor 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_MOV64_IMM(BPF_REG_1, 2), /* r1 = 2 */
+ BPF_ALU64_IMM(BPF_XOR, BPF_REG_1, 3), /* r1 ^= 3, so r1 == 1 */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0, 1), /* if r1 > 0 (unsigned) goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: r1 is loaded from the lookup result, xored with 3 and compared with 0; the load at offset 8 is on the r1 == 0 path. Expected result REJECT. */
+ "bounds check for reg = any, reg xor 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u64 *)(r0 + 0) */
+ BPF_ALU64_IMM(BPF_XOR, BPF_REG_1, 3), /* r1 ^= 3 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* if r1 != 0 goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value",
+ .errstr_unpriv = "invalid access to map value",
+},
+{ /* Test entry: 32-bit form: r1 is loaded from the lookup result, xored with 3 and compared with 0; the load at offset 8 is on the r1 == 0 path. Expected result REJECT. */
+ "bounds check for reg32 = any, reg32 xor 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u64 *)(r0 + 0) */
+ BPF_ALU32_IMM(BPF_XOR, BPF_REG_1, 3), /* 32-bit r1 ^= 3 */
+ BPF_JMP32_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* 32-bit compare: if r1 != 0 goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value",
+ .errstr_unpriv = "invalid access to map value",
+},
+{ /* Test entry: r1 is first constrained to be > 0, xored with 3, then compared >= 0; the load at offset 8 is on the fall-through path. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check for reg > 0, reg xor 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u64 *)(r0 + 0) */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_1, 0, 3), /* if r1 <= 0 (unsigned) goto +3 */
+ BPF_ALU64_IMM(BPF_XOR, BPF_REG_1, 3), /* r1 ^= 3 */
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 1), /* if r1 >= 0 (unsigned) goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: 32-bit form: r1 is constrained to be > 0, xored with 3, then compared >= 0; the load at offset 8 is on the fall-through path. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds check for reg32 > 0, reg32 xor 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if r0 != 0 goto +1 (skip the exit) */
+ BPF_EXIT_INSN(), /* reached only when r0 == 0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u64 *)(r0 + 0) */
+ BPF_JMP32_IMM(BPF_JLE, BPF_REG_1, 0, 3), /* 32-bit compare: if r1 <= 0 (unsigned) goto +3 */
+ BPF_ALU32_IMM(BPF_XOR, BPF_REG_1, 3), /* 32-bit r1 ^= 3 */
+ BPF_JMP32_IMM(BPF_JGE, BPF_REG_1, 0, 1), /* 32-bit compare: if r1 >= 0 (unsigned) goto +1 (skip the load) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8), /* r0 = *(u64 *)(r0 + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: a 32-bit value loaded from the lookup result is compared with 0 and with 0x7fffffff using 64-bit jumps. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds checks after 32-bit truncation. test 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* if r0 == 0 goto +4 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u32 *)(r0 + 0) */
+ /* This used to reduce the max bound to 0x7fffffff */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), /* if r1 == 0 goto +1 */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 0x7fffffff, 1), /* if r1 > 0x7fffffff (unsigned) goto +1, skipping "r0 = 0" */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{ /* Test entry: a 32-bit value loaded from the lookup result goes through a 64-bit signed compare, then a 32-bit signed compare. Expected ACCEPT, REJECT for result_unpriv. */
+ "bounds checks after 32-bit truncation. test 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* if r0 == 0 goto +4 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = *(u32 *)(r0 + 0) */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_1, 1, 1), /* 64-bit signed compare: if r1 < 1 goto +1 */
+ BPF_JMP32_IMM(BPF_JSLT, BPF_REG_1, 0, 1), /* 32-bit signed compare: if r1 < 0 goto +1, skipping "r0 = 0" */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index 3 is the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/bounds_deduction.c b/tools/testing/selftests/bpf/verifier/bounds_deduction.c
new file mode 100644
index 000000000..91869aea6
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/bounds_deduction.c
@@ -0,0 +1,136 @@
+{ /* Test entry: constant r0 = 1 passes a signed >= 1 compare, then r1 is subtracted from it; expected result REJECT. */
+ "check deducing bounds from const, 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 0), /* if r0 >= 1 (signed) goto +0; both outcomes reach the next insn */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "R0 tried to subtract pointer from scalar",
+ .result = REJECT,
+},
+{ /* Test entry: constant r0 = 1 is checked with signed >= 1 and <= 1, then subtracted from r1. Expected ACCEPT with retval 1, REJECT for result_unpriv. */
+ "check deducing bounds from const, 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), /* if r0 >= 1 (signed) goto +1 (skip the exit) */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 1, 1), /* if r0 <= 1 (signed) goto +1 (skip the exit) */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), /* r1 -= r0; r0 itself is left at 1 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Test entry: constant r0 = 0 passes a signed <= 0 compare, then r1 is subtracted from it; expected result REJECT. */
+ "check deducing bounds from const, 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), /* if r0 <= 0 (signed) goto +0; both outcomes reach the next insn */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "R0 tried to subtract pointer from scalar",
+ .result = REJECT,
+},
+{ /* Test entry: constant r0 = 0 is checked with signed <= 0 and >= 0, then subtracted from r6 (a copy of r1). Expected ACCEPT, REJECT for result_unpriv. */
+ "check deducing bounds from const, 4",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 1), /* if r0 <= 0 (signed) goto +1 (skip the exit) */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), /* if r0 >= 0 (signed) goto +1 (skip the exit) */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_6, BPF_REG_0), /* r6 -= r0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R6 has pointer with unsupported alu operation",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{ /* Test entry: constant r0 = 0 is compared signed >= 1; the fall-through subtracts r1 from r0; expected result REJECT. */
+ "check deducing bounds from const, 5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 1, 1), /* if r0 >= 1 (signed) goto +1 (skip the subtract) */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "R0 tried to subtract pointer from scalar",
+ .result = REJECT,
+},
+{ /* Test entry: constant r0 = 0 is compared signed >= 0; the taken branch subtracts r1 from r0; expected result REJECT. */
+ "check deducing bounds from const, 6",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), /* if r0 >= 0 (signed) goto +1 (skip the exit) */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "R0 tried to subtract pointer from scalar",
+ .result = REJECT,
+},
+{ /* Test entry: constant r0 = ~0 is subtracted from r1, which is then used as base for a load of __sk_buff.mark; expected result REJECT. */
+ "check deducing bounds from const, 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, ~0), /* r0 = ~0 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), /* if r0 >= 0 (signed) goto +0; both outcomes reach the next insn */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), /* r1 -= r0 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)), /* r0 = 32-bit load at r1 + offsetof(struct __sk_buff, mark) */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "dereference of modified ctx ptr",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Test entry: constant r0 = ~0 is added to r1 on the fall-through of a signed >= 0 compare, then r1 is used as base for a load of __sk_buff.mark; expected result REJECT. */
+ "check deducing bounds from const, 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, ~0), /* r0 = ~0 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 1), /* if r0 >= 0 (signed) goto +1 (skip the add) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* r1 += r0 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)), /* r0 = 32-bit load at r1 + offsetof(struct __sk_buff, mark) */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "dereference of modified ctx ptr",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Test entry: constant r0 = 0 passes a signed >= 0 compare, then r1 is subtracted from it; expected result REJECT. */
+ "check deducing bounds from const, 9",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 0), /* if r0 >= 0 (signed) goto +0; both outcomes reach the next insn */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .errstr = "R0 tried to subtract pointer from scalar",
+ .result = REJECT,
+},
+{ /* Test entry: constant r0 = 0 is negated after a signed <= 0 compare, then r1 is subtracted from it; expected result REJECT. */
+ "check deducing bounds from const, 10",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_0, 0, 0), /* if r0 <= 0 (signed) goto +0; both outcomes reach the next insn */
+ /* Marks reg as unknown. */
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_0, 0), /* r0 = -r0 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1; r1 is never written by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "math between ctx pointer and register with unbounded min value is not allowed",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c
new file mode 100644
index 000000000..bf82b923c
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/bounds_mix_sign_unsign.c
@@ -0,0 +1,411 @@
+{ /* Test entry: the ktime_get_ns result is bounded by an unsigned compare against 2 and a signed compare against 4, then used as store offset; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, positive bounds",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, 2), /* r2 = 2 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 3), /* if r2 >= r1 (unsigned) goto +3 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 4, 2), /* if r1 > 4 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the ktime_get_ns result is checked with an unsigned compare against -1 and a signed compare against 1, then used as store offset; expected result REJECT. */
+ "bounds checks mixing signed and unsigned",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), /* if r1 > r2 (unsigned) goto +3 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the checked value is copied into r8 via "r8 = 0; r8 += r1" before the signed compare and the pointer add; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 2",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), /* if r1 > r2 (unsigned) goto +5 */
+ BPF_MOV64_IMM(BPF_REG_8, 0), /* r8 = 0 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_1), /* r8 += r1 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 1, 2), /* if r8 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8), /* r0 += r8 */
+ BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), /* *(u8 *)(r8 + 0) = 0; note the base register is r8, not r0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the checked value is copied into r8 with a register move before the signed compare and the pointer add; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 3",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* if r0 == 0 goto +7 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 4), /* if r1 > r2 (unsigned) goto +4 */
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* r8 = r1 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 1, 2), /* if r8 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_8), /* r0 += r8 */
+ BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0), /* *(u8 *)(r8 + 0) = 0; note the base register is r8, not r0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the ktime_get_ns result is masked with 1 before the signed compare and the pointer add; expected result ACCEPT. */
+ "bounds checks mixing signed and unsigned, variant 4",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, 1), /* r2 = 1 */
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2), /* r1 &= r2, so r1 is 0 or 1 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: after the unsigned and signed compares the value is subtracted from "lookup result + 4" rather than added; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 5",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 5), /* if r1 > r2 (unsigned) goto +5 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 4), /* if r1 > 1 (signed) goto +4 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 4), /* r0 += 4 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* r0 -= r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: no map here; the checked value plus 1 is passed in r4 to skb_load_bytes; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_1), /* r9 = r1, kept across the ktime_get_ns call */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_9), /* r1 = r9, the original r1 */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* r2 = 0 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10), /* r3 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -512), /* r3 = r10 - 512 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -16), /* r4 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_6, -1), /* r6 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_6, 5), /* if r4 > r6 (unsigned) goto +5 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_4, 1, 4), /* if r4 > 1 (signed) goto +4 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1), /* r4 += 1 */
+ BPF_MOV64_IMM(BPF_REG_5, 0), /* r5 = 0 */
+ BPF_ST_MEM(BPF_H, BPF_REG_10, -512, 0), /* *(u16 *)(r10 - 512) = 0 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), /* call skb_load_bytes with r1..r4 as set above */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R4 min value is negative, either use unsigned",
+ .result = REJECT,
+},
+{ /* Test entry: the unsigned compare is against 1024 * 1024 * 1024 instead of -1, followed by the signed compare against 1; expected result ACCEPT. */
+ "bounds checks mixing signed and unsigned, variant 7",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* if r0 == 0 goto +6 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, 1024 * 1024 * 1024), /* r2 = 1024 * 1024 * 1024 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, 3), /* if r1 > r2 (unsigned) goto +3 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: the unsigned compare is written as "-1 > r1" with an explicit early exit on the fall-through; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 8",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), /* if r2 > r1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the unsigned compare uses the 64-bit immediate -9223372036854775808ULL as the upper limit; expected result ACCEPT. */
+ "bounds checks mixing signed and unsigned, variant 9",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), /* if r0 == 0 goto +9 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_LD_IMM64(BPF_REG_2, -9223372036854775808ULL), /* r2 = 64-bit immediate */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), /* if r2 > r1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+},
+{ /* Test entry: the unsigned compare is "0 > r1", followed by the signed compare against 1 and the pointer add; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 10",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* r2 = 0 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 2), /* if r2 > r1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the unsigned compare is "-1 >= r1", so the early exit after it is marked as a dead branch; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 11",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), /* if r2 >= r1 (unsigned) goto +2 */
+ /* Dead branch. */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the unsigned compare is "-6 >= r1", followed by the signed compare against 1 and the pointer add; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 12",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* if r0 == 0 goto +8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -6), /* r2 = -6 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), /* if r2 >= r1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: the checked value is added to a constant r7 = 1, and r7 is then bounded with a signed compare against 4 before the pointer add; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 13",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* if r0 == 0 goto +5 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, 2), /* r2 = 2 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), /* if r2 >= r1 (unsigned) goto +2 */
+ BPF_MOV64_IMM(BPF_REG_7, 1), /* r7 = 1 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_7, 0, 2), /* if r7 > 0 (signed) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_1), /* r7 += r1 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_7, 4, 2), /* if r7 > 4 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_7), /* r0 += r7 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: which compare runs first depends on __sk_buff.mark (saved in r9); two backward jumps at the end rejoin the main path; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 14",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)), /* r9 = 32-bit load of __sk_buff.mark via r1 */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* if r0 == 0 goto +7 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* r2 = -1 */
+ BPF_MOV64_IMM(BPF_REG_8, 2), /* r8 = 2 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_9, 42, 6), /* if r9 == 42 goto +6 */
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_8, BPF_REG_1, 3), /* if r8 > r1 (signed) goto +3 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 1, 2), /* if r1 > 1 (signed) goto +2 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_2, -3), /* if r1 > r2 (unsigned) goto -3 (backward jump) */
+ BPF_JMP_IMM(BPF_JA, 0, 0, -7), /* goto -7 (backward jump) */
+ },
+ .fixup_map_hash_8b = { 6 }, /* insn index 6 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
+{ /* Test entry: after the unsigned compare "-6 >= r1" the value is added to the lookup result, and the sum in r0 is then compared with 1; expected result REJECT. */
+ "bounds checks mixing signed and unsigned, variant 15",
+ .insns = {
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), /* call ktime_get_ns, result in r0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16), /* *(u64 *)(r10 - 16) = r0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* r2 = r10 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8, address of the zeroed slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 = map fd (placeholder 0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* call map_lookup_elem, result in r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* if r0 == 0 goto +3 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16), /* r1 = *(u64 *)(r10 - 16), the saved ktime value */
+ BPF_MOV64_IMM(BPF_REG_2, -6), /* r2 = -6 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_2, BPF_REG_1, 2), /* if r2 >= r1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* r0 += r1 */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_0, 1, 2), /* if r0 > 1 (unsigned) goto +2 (skip the early exit) */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_B, BPF_REG_0, 0, 0), /* *(u8 *)(r0 + 0) = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* insn index 5 is the BPF_LD_MAP_FD above */
+ .errstr = "unbounded min value",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/bpf_get_stack.c b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c
new file mode 100644
index 000000000..69b048cf4
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/bpf_get_stack.c
@@ -0,0 +1,44 @@
+{
+ "bpf_get_stack return R0 within range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)/2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)/2),
+ BPF_MOV64_IMM(BPF_REG_4, 256),
+ BPF_EMIT_CALL(BPF_FUNC_get_stack),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_1, BPF_REG_8, 16),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)/2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_EMIT_CALL(BPF_FUNC_get_stack),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
new file mode 100644
index 000000000..eb888c847
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -0,0 +1,2034 @@
+{
+ "calls: basic sanity",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .result = ACCEPT,
+},
+{
+ "calls: not on unpriviledged",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "calls: div by 0 in subprog",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV32_IMM(BPF_REG_2, 0),
+ BPF_MOV32_IMM(BPF_REG_3, 1),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_3, BPF_REG_2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "calls: multiple ret types in subprog 1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "R0 invalid mem access 'inv'",
+},
+{
+ "calls: multiple ret types in subprog 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 9),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+ offsetof(struct __sk_buff, data)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 16 },
+ .result = REJECT,
+ .errstr = "R0 min value is outside of the allowed memory range",
+},
+{
+ "calls: overlapping caller/callee",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "last insn is not an exit or jmp",
+ .result = REJECT,
+},
+{
+ "calls: wrong recursive calls",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "calls: wrong src reg",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 2, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "BPF_CALL uses reserved fields",
+ .result = REJECT,
+},
+{
+ "calls: wrong off value",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, -1, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "BPF_CALL uses reserved fields",
+ .result = REJECT,
+},
+{
+ "calls: jump back loop",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "back-edge from insn 0 to 0",
+ .result = REJECT,
+},
+{
+ "calls: conditional call",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "calls: conditional call 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .result = ACCEPT,
+},
+{
+ "calls: conditional call 3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "back-edge from insn",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "calls: conditional call 4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -5),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .result = ACCEPT,
+},
+{
+ "calls: conditional call 5",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -6),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "calls: conditional call 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "infinite loop detected",
+ .result = REJECT,
+},
+{
+ "calls: using r0 returned by callee",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .result = ACCEPT,
+},
+{
+ "calls: using uninit r0 from callee",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "calls: callee is using r1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_ACT,
+ .result = ACCEPT,
+ .retval = TEST_DATA_LEN,
+},
+{
+ "calls: callee using args1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = POINTER_VALUE,
+},
+{
+ "calls: callee using wrong args2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "R2 !read_ok",
+ .result = REJECT,
+},
+{
+ "calls: callee using two args",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_6,
+ offsetof(struct __sk_buff, len)),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = TEST_DATA_LEN + TEST_DATA_LEN - ETH_HLEN - ETH_HLEN,
+},
+{
+ "calls: callee changing pkt pointers",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_8, BPF_REG_7, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ /* clear_all_pkt_pointers() has to walk all frames
+ * to make sure that pkt pointers in the caller
+ * are cleared when callee is calling a helper that
+ * adjusts packet size
+ */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_xdp_adjust_head),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R6 invalid mem access 'inv'",
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: ptr null check in subprog",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .fixup_map_hash_48b = { 3 },
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "calls: two calls with args",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = TEST_DATA_LEN + TEST_DATA_LEN,
+},
+{
+ "calls: calls with stack arith",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -64),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "calls: calls with misaligned stack access",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -61),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -63),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+ .errstr = "misaligned stack access",
+ .result = REJECT,
+},
+{
+ "calls: calls control flow, jump test",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 43),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 43,
+},
+{
+ "calls: calls control flow, jump test 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 43),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "jump out of range from insn 1 to 4",
+ .result = REJECT,
+},
+{
+ "calls: two calls with bad jump",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "jump out of range from insn 11 to 9",
+ .result = REJECT,
+},
+{
+ "calls: recursive call. test1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "back-edge",
+ .result = REJECT,
+},
+{
+ "calls: recursive call. test2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "back-edge",
+ .result = REJECT,
+},
+{
+ "calls: unreachable code",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "unreachable insn 6",
+ .result = REJECT,
+},
+{
+ "calls: invalid call",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -4),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "invalid destination",
+ .result = REJECT,
+},
+{
+ "calls: invalid call 2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 0x7fffffff),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "invalid destination",
+ .result = REJECT,
+},
+{
+ "calls: jumping across function bodies. test1",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "calls: jumping across function bodies. test2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "calls: call without exit",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -2),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "not an exit",
+ .result = REJECT,
+},
+{
+ "calls: call into middle of ld_imm64",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "last insn",
+ .result = REJECT,
+},
+{
+ "calls: call into middle of other call",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "last insn",
+ .result = REJECT,
+},
+{
+ "calls: subprog call with ld_abs in main prog",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_vlan_push),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "calls: two calls with bad fallthrough",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .errstr = "not an exit",
+ .result = REJECT,
+},
+{
+ "calls: two calls with stack read",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = ACCEPT,
+},
+{
+ "calls: two calls with stack write",
+ .insns = {
+ /* main prog */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 7),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_8),
+ /* write into stack frame of main prog */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* read from stack frame of main prog */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = ACCEPT,
+},
+{
+ "calls: stack overflow using two frames (pre-call access)",
+ .insns = {
+ /* prog 1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* prog 2 */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "combined stack size",
+ .result = REJECT,
+},
+{
+ "calls: stack overflow using two frames (post-call access)",
+ .insns = {
+ /* prog 1 */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 2),
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_EXIT_INSN(),
+
+ /* prog 2 */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "combined stack size",
+ .result = REJECT,
+},
+{
+ "calls: stack depth check using three frames. test1",
+ .insns = {
+ /* main */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+ BPF_EXIT_INSN(),
+ /* B */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ /* stack_main=32, stack_A=256, stack_B=64
+ * and max(main+A, main+A+B) < 512
+ */
+ .result = ACCEPT,
+},
+{
+ "calls: stack depth check using three frames. test2",
+ .insns = {
+ /* main */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 5), /* call B */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -32, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+ BPF_EXIT_INSN(),
+ /* B */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -3), /* call A */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ /* stack_main=32, stack_A=64, stack_B=256
+ * and max(main+A, main+A+B) < 512
+ */
+ .result = ACCEPT,
+},
+{
+ "calls: stack depth check using three frames. test3",
+ .insns = {
+ /* main */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 8), /* call B */
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_6, 0, 1),
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -64, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 10, 1),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -224, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+ /* B */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 1),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, -6), /* call A */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -256, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ /* stack_main=64, stack_A=224, stack_B=256
+ * and max(main+A, main+A+B) > 512
+ */
+ .errstr = "combined stack",
+ .result = REJECT,
+},
+{
+ "calls: stack depth check using three frames. test4",
+ /* void main(void) {
+ * func1(0);
+ * func1(1);
+ * func2(1);
+ * }
+ * void func1(int alloc_or_recurse) {
+ * if (alloc_or_recurse) {
+ * frame_pointer[-300] = 0;
+ * } else {
+ * func2(alloc_or_recurse);
+ * }
+ * }
+ * void func2(int alloc_or_recurse) {
+ * if (alloc_or_recurse) {
+ * frame_pointer[-300] = 0;
+ * }
+ * }
+ */
+ .insns = {
+ /* main */
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 6), /* call A */
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 4), /* call A */
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 7), /* call B */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2),
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */
+ BPF_EXIT_INSN(),
+ /* B */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -300, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = REJECT,
+ .errstr = "combined stack",
+},
+{
+ "calls: stack depth check using three frames. test5",
+ .insns = {
+ /* main */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */
+ BPF_EXIT_INSN(),
+ /* B */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */
+ BPF_EXIT_INSN(),
+ /* C */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */
+ BPF_EXIT_INSN(),
+ /* D */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */
+ BPF_EXIT_INSN(),
+ /* E */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */
+ BPF_EXIT_INSN(),
+ /* F */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */
+ BPF_EXIT_INSN(),
+ /* G */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
+ BPF_EXIT_INSN(),
+ /* H */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "call stack",
+ .result = REJECT,
+},
+{
+ "calls: stack depth check in dead code",
+ .insns = {
+ /* main */
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */
+ BPF_EXIT_INSN(),
+ /* A */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 2), /* call B */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* B */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */
+ BPF_EXIT_INSN(),
+ /* C */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */
+ BPF_EXIT_INSN(),
+ /* D */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */
+ BPF_EXIT_INSN(),
+ /* E */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */
+ BPF_EXIT_INSN(),
+ /* F */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */
+ BPF_EXIT_INSN(),
+ /* G */
+ BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */
+ BPF_EXIT_INSN(),
+ /* H */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "call stack",
+ .result = REJECT,
+},
+{
+ "calls: spill into caller stack frame",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "cannot spill",
+ .result = REJECT,
+},
+{
+ "calls: write into caller stack frame",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "calls: write into callee stack frame",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, -8),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .errstr = "cannot return stack pointer",
+ .result = REJECT,
+},
+{
+ "calls: two calls with stack write and void return",
+ .insns = {
+ /* main prog */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* write into stack frame of main prog */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
+ BPF_EXIT_INSN(), /* void return */
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = ACCEPT,
+},
+{
+ "calls: ambiguous return value",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "allowed for",
+ .result_unpriv = REJECT,
+ .errstr = "R0 !read_ok",
+ .result = REJECT,
+},
+{
+ "calls: two calls that return map_value",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8),
+
+ /* fetch map_value_ptr from the stack of this function */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ /* fetch second map_value_ptr from the stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -16),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ /* call 3rd function twice */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* first time with fp-8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ /* second time with fp-16 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /* lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr into stack frame of main prog */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* return 0 */
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .fixup_map_hash_8b = { 23 },
+ .result = ACCEPT,
+},
+{
+ "calls: two calls that return map_value with bool condition",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ /* call 3rd function twice */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* first time with fp-8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (r6 = its fp-8) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ /* second time with fp-16 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+ /* fetch second map_value_ptr from the stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /* lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* return 0 */
+ /* write map_value_ptr into stack frame of main prog */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(), /* return 1 */
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .fixup_map_hash_8b = { 23 },
+ .result = ACCEPT,
+},
+{
+ "calls: two calls that return map_value with incorrect bool check",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ /* call 3rd function twice */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* first time with fp-8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 9),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (r6 = its fp-8) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ /* second time with fp-16 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ /* fetch second map_value_ptr from the stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ /* lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* return 0 */
+ /* write map_value_ptr into stack frame of main prog */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(), /* return 1 */
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .fixup_map_hash_8b = { 23 },
+ .result = REJECT,
+ .errstr = "invalid read from stack R7 off=-16 size=8",
+},
+{
+ "calls: two calls that receive map_value via arg=ptr_stack_of_caller. test1",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* 1st lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+
+ /* 2nd lookup from map */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */
+ BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+
+ /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* if arg2 == 1 do *arg1 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg1) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+ /* if arg4 == 1 do *arg3 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg3) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 12, 22 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=8 off=2 size=8",
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: two calls that receive map_value via arg=ptr_stack_of_caller. test2",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* 1st lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+
+ /* 2nd lookup from map */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), /* 20 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, /* 24 */
+ BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+
+ /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* 30 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* 34 */
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* if arg2 == 1 do *arg1 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg1) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+ /* if arg4 == 1 do *arg3 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg3) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 12, 22 },
+ .result = ACCEPT,
+},
+{
+ "calls: two jumps that receive map_value via arg=ptr_stack_of_jumper. test3",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* 1st lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -24, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+
+ /* 2nd lookup from map */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -24),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_9, 0), // 26
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ /* write map_value_ptr into stack frame of main prog at fp-16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+
+ /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), // 30
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), // 34
+ BPF_JMP_IMM(BPF_JA, 0, 0, -30),
+
+ /* subprog 2 */
+ /* if arg2 == 1 do *arg1 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+ /* fetch map_value_ptr from the stack of this function */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+ /* if arg4 == 1 do *arg3 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+ /* fetch map_value_ptr from the stack of this function */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -8),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 12, 22 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=8 off=2 size=8",
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: two calls that receive map_value_ptr_or_null via arg. test1",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* 1st lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+
+ /* 2nd lookup from map */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+
+ /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* if arg2 == 1 do *arg1 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg1) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+ /* if arg4 == 1 do *arg3 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg3) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 12, 22 },
+ .result = ACCEPT,
+},
+{
+ "calls: two calls that receive map_value_ptr_or_null via arg. test2",
+ .insns = {
+ /* main prog */
+ /* pass fp-16, fp-8 into a function */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ /* 1st lookup from map */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+
+ /* 2nd lookup from map */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr_or_null into stack frame of main prog at fp-16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+
+ /* call 3rd func with fp-8, 0|1, fp-16, 0|1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_9),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ /* if arg2 == 1 do *arg1 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 1, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg1) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+
+ /* if arg4 == 0 do *arg3 = 0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_4, 0, 2),
+ /* fetch map_value_ptr from the stack of main prog (via arg3) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ /* write into map value */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_8b = { 12, 22 },
+ .result = REJECT,
+ .errstr = "R0 invalid mem access 'inv'",
+},
+{
+ "calls: pkt_ptr spill into caller stack",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ /* spill unchecked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ /* now the pkt range is verified, read pkt_ptr from stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = POINTER_VALUE,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ /* Marking is still kept, but not in all cases safe. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ /* spill unchecked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ /* now the pkt range is verified, read pkt_ptr from stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "invalid access to packet",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ /* Marking is still kept and safe here. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ /* spill unchecked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* now the pkt range is verified, read pkt_ptr from stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 4",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ /* Check marking propagated. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_ST_MEM(BPF_W, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ /* spill unchecked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 5",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ /* spill checked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "same insn cannot be used with different",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ /* spill checked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "R4 invalid mem access",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ /* spill checked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "R4 invalid mem access",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 8",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ /* spill checked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: pkt_ptr spill into caller stack 9",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_4, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ /* spill unchecked pkt_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ /* don't read back pkt_ptr from stack here */
+ /* write 4 bytes into packet */
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "invalid access to packet",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "calls: caller stack init to zero or map_value_or_null",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ /* fetch map_value_or_null or const_zero from stack */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ /* store into map_value */
+ BPF_ST_MEM(BPF_W, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ /* if (ctx == 0) return; */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 8),
+ /* else bpf_map_lookup() and *(fp - 8) = r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ /* write map_value_ptr_or_null into stack frame of main prog at fp-8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 13 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "calls: stack init to zero and pruning",
+ .insns = {
+ /* first make allocated_stack 16 byte */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+ /* now fork the execution such that the false branch
+ * of JGT insn will be verified second and it skips zero
+ * init of fp-8 stack slot. If stack liveness marking
+ * is missing live_read marks from call map_lookup
+ * processing then pruning will incorrectly assume
+ * that fp-8 stack slot was unused in the fall-through
+ * branch and will accept the program incorrectly
+ */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_1, 2, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 6 },
+ .errstr = "invalid indirect read from stack R2 off -8+0 size 8",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "calls: ctx read at start of subprog",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "calls: cross frame pruning",
+ .insns = {
+ /* r8 = !!random();
+ * call pruner()
+ * if (r8)
+ * do something bad;
+ */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_9, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "calls: cross frame pruning - liveness propagation",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/cfg.c b/tools/testing/selftests/bpf/verifier/cfg.c
new file mode 100644
index 000000000..4eb76ed73
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/cfg.c
@@ -0,0 +1,73 @@
+{
+ "unreachable",
+ .insns = {
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+},
+{
+ "unreachable2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable",
+ .result = REJECT,
+},
+{
+ "out of range jump",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "out of range jump2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "jump out of range",
+ .result = REJECT,
+},
+{
+ "loop (back-edge)",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable insn 1",
+ .errstr_unpriv = "back-edge",
+ .result = REJECT,
+},
+{
+ "loop2 (back-edge)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unreachable insn 4",
+ .errstr_unpriv = "back-edge",
+ .result = REJECT,
+},
+{
+ "conditional loop",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, -3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "infinite loop detected",
+ .errstr_unpriv = "back-edge",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/cgroup_inv_retcode.c b/tools/testing/selftests/bpf/verifier/cgroup_inv_retcode.c
new file mode 100644
index 000000000..6d65fe3e7
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/cgroup_inv_retcode.c
@@ -0,0 +1,72 @@
+{
+ "bpf_exit with invalid return code. test1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 has value (0x0; 0xffffffff)",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 has value (0x0; 0x3)",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 has value (0x2; 0x0)",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 is not a known value (ctx)",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{
+ "bpf_exit with invalid return code. test7",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 has unknown scalar value",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
diff --git a/tools/testing/selftests/bpf/verifier/cgroup_skb.c b/tools/testing/selftests/bpf/verifier/cgroup_skb.c
new file mode 100644
index 000000000..52e4c03b0
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/cgroup_skb.c
@@ -0,0 +1,197 @@
+{
+ "direct packet read test#1 for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, queue_mapping)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, protocol)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_present)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "invalid bpf_context access off=76 size=4",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "direct packet read test#2 for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_tci)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_proto)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, priority)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, priority)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, ingress_ifindex)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+ offsetof(struct __sk_buff, hash)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "direct packet read test#3 for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+ offsetof(struct __sk_buff, napi_id)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_4,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_5,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_7,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_8,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "direct packet read test#4 for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, family)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip4)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip4)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_5, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[3])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[3])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_port)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, local_port)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid access of tc_classid for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid access of data_meta for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data_meta)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid access of flow_keys for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, flow_keys)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid write access to napi_id for CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_9, BPF_REG_1,
+ offsetof(struct __sk_buff, napi_id)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_9,
+ offsetof(struct __sk_buff, napi_id)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "write tstamp from CGROUP_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tstamp)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "invalid bpf_context access off=152 size=8",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "read tstamp from CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tstamp)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
diff --git a/tools/testing/selftests/bpf/verifier/cgroup_storage.c b/tools/testing/selftests/bpf/verifier/cgroup_storage.c
new file mode 100644
index 000000000..97057c0a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/cgroup_storage.c
@@ -0,0 +1,220 @@
+{
+ "valid cgroup storage access",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid cgroup storage access 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 1 into func bpf_get_local_storage",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid cgroup storage access 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "fd 1 is not pointing to valid bpf_map",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid cgroup storage access 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 256),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=64 off=256 size=4",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid cgroup storage access 4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=64 off=-2 size=4",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid cgroup storage access 5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 7),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "get_local_storage() doesn't support non-zero flags",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid cgroup storage access 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "get_local_storage() doesn't support non-zero flags",
+ .errstr_unpriv = "R2 leaks addr into helper function",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "valid per-cpu cgroup storage access",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_percpu_cgroup_storage = { 1 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid per-cpu cgroup storage access 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 1 into func bpf_get_local_storage",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid per-cpu cgroup storage access 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "fd 1 is not pointing to valid bpf_map",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid per-cpu cgroup storage access 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 256),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_percpu_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=64 off=256 size=4",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid per-cpu cgroup storage access 4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, -2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=64 off=-2 size=4",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid per-cpu cgroup storage access 5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 7),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_percpu_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "get_local_storage() doesn't support non-zero flags",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "invalid per-cpu cgroup storage access 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_percpu_cgroup_storage = { 1 },
+ .result = REJECT,
+ .errstr = "get_local_storage() doesn't support non-zero flags",
+ .errstr_unpriv = "R2 leaks addr into helper function",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
diff --git a/tools/testing/selftests/bpf/verifier/const_or.c b/tools/testing/selftests/bpf/verifier/const_or.c
new file mode 100644
index 000000000..0719b0dde
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/const_or.c
@@ -0,0 +1,60 @@
+{
+ "constant register |= constant should keep constant type",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+ BPF_MOV64_IMM(BPF_REG_2, 34),
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 13),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "constant register |= constant should not bypass stack boundary checks",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+ BPF_MOV64_IMM(BPF_REG_2, 34),
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_2, 24),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect access to stack R1 off=-48 size=58",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "constant register |= constant register should keep constant type",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+ BPF_MOV64_IMM(BPF_REG_2, 34),
+ BPF_MOV64_IMM(BPF_REG_4, 13),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "constant register |= constant register should not bypass stack boundary checks",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -48),
+ BPF_MOV64_IMM(BPF_REG_2, 34),
+ BPF_MOV64_IMM(BPF_REG_4, 24),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect access to stack R1 off=-48 size=58",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c
new file mode 100644
index 000000000..93d6b1641
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ctx.c
@@ -0,0 +1,198 @@
+{
+ "context stores via ST",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, offsetof(struct __sk_buff, mark), 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_ST stores into R1 ctx is not allowed",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "context stores via XADD",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1,
+ BPF_REG_0, offsetof(struct __sk_buff, mark), 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_XADD stores into R1 ctx is not allowed",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "arithmetic ops make PTR_TO_CTX unusable",
+ .insns = {
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1,
+ offsetof(struct __sk_buff, data) -
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "dereference of modified ctx ptr",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "pass unmodified ctx pointer to helper",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_csum_update),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "pass modified ctx pointer to helper, 1",
+ .insns = {
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_csum_update),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "dereference of modified ctx ptr",
+},
+{
+ "pass modified ctx pointer to helper, 2",
+ .insns = {
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .errstr_unpriv = "dereference of modified ctx ptr",
+ .errstr = "dereference of modified ctx ptr",
+},
+{
+ "pass modified ctx pointer to helper, 3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_csum_update),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "variable ctx access var_off=(0x0; 0x4)",
+},
+{
+ "pass ctx or null check, 1: ctx",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+ .result = ACCEPT,
+},
+{
+ "pass ctx or null check, 2: null",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+ .result = ACCEPT,
+},
+{
+ "pass ctx or null check, 3: 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+ .result = REJECT,
+ .errstr = "R1 type=inv expected=ctx",
+},
+{
+ "pass ctx or null check, 4: ctx - const",
+ .insns = {
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -612),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG,
+ .result = REJECT,
+ .errstr = "dereference of modified ctx ptr",
+},
+{
+ "pass ctx or null check, 5: null (connect)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ .expected_attach_type = BPF_CGROUP_INET4_CONNECT,
+ .result = ACCEPT,
+},
+{
+ "pass ctx or null check, 6: null (bind)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_netns_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+ .expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+ .result = ACCEPT,
+},
+{
+ "pass ctx or null check, 7: ctx (bind)",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+ .expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+ .result = ACCEPT,
+},
+{
+ "pass ctx or null check, 8: null (bind)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_get_socket_cookie),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+ .expected_attach_type = BPF_CGROUP_INET4_POST_BIND,
+ .result = REJECT,
+ .errstr = "R1 type=inv expected=ctx",
+},
diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
new file mode 100644
index 000000000..fd3b62a08
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ctx_sk_lookup.c
@@ -0,0 +1,493 @@
+{
+ "valid 1,2,4,8-byte reads from bpf_sk_lookup",
+ .insns = {
+ /* 1-byte read from family field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family) + 3),
+ /* 2-byte read from family field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family) + 2),
+ /* 4-byte read from family field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family)),
+
+ /* 1-byte read from protocol field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol) + 3),
+ /* 2-byte read from protocol field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol) + 2),
+ /* 4-byte read from protocol field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol)),
+
+ /* 1-byte read from remote_ip4 field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4) + 3),
+ /* 2-byte read from remote_ip4 field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4) + 2),
+ /* 4-byte read from remote_ip4 field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4)),
+
+ /* 1-byte read from remote_ip6 field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 6),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 7),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 9),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 10),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 11),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 13),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 14),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 15),
+ /* 2-byte read from remote_ip6 field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 6),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 10),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 14),
+ /* 4-byte read from remote_ip6 field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6) + 12),
+
+ /* 1-byte read from remote_port field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port) + 3),
+ /* 2-byte read from remote_port field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port) + 2),
+ /* 4-byte read from remote_port field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port)),
+
+ /* 1-byte read from local_ip4 field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4) + 3),
+ /* 2-byte read from local_ip4 field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4) + 2),
+ /* 4-byte read from local_ip4 field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4)),
+
+ /* 1-byte read from local_ip6 field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 6),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 7),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 9),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 10),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 11),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 13),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 14),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 15),
+ /* 2-byte read from local_ip6 field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 6),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 10),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 14),
+ /* 4-byte read from local_ip6 field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6) + 12),
+
+ /* 1-byte read from local_port field */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port) + 3),
+ /* 2-byte read from local_port field */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port)),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port) + 2),
+ /* 4-byte read from local_port field */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port)),
+
+ /* 8-byte read from sk field */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, sk)),
+
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+ .runs = -1,
+},
+/* invalid 8-byte reads from 4-byte fields in bpf_sk_lookup */
+{
+ "invalid 8-byte read from bpf_sk_lookup family field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, family)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup protocol field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, protocol)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup remote_ip4 field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip4)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup remote_ip6 field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_ip6)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup remote_port field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, remote_port)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup local_ip4 field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip4)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup local_ip6 field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_ip6)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 8-byte read from bpf_sk_lookup local_port field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, local_port)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+/* invalid 1,2,4-byte reads from 8-byte fields in bpf_sk_lookup */
+{
+ "invalid 4-byte read from bpf_sk_lookup sk field",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, sk)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 2-byte read from bpf_sk_lookup sk field",
+ .insns = {
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, sk)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 1-byte read from bpf_sk_lookup sk field",
+ .insns = {
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_sk_lookup, sk)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+/* out of bounds and unaligned reads from bpf_sk_lookup */
+{
+ "invalid 4-byte read past end of bpf_sk_lookup",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ sizeof(struct bpf_sk_lookup)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 4-byte unaligned read from bpf_sk_lookup at odd offset",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 1), /* 4-byte load from ctx (R1) at offset 1 */
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 4-byte unaligned read from bpf_sk_lookup at even offset",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 2), /* 4-byte load from ctx (R1) at offset 2: even but not a multiple of 4 */
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+/* in-bound and out-of-bound writes to bpf_sk_lookup */
+{
+ "invalid 8-byte write to bpf_sk_lookup",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 4-byte write to bpf_sk_lookup",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 2-byte write to bpf_sk_lookup",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 1-byte write to bpf_sk_lookup",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
+{
+ "invalid 4-byte write past end of bpf_sk_lookup",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0xcafe4a11U),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ sizeof(struct bpf_sk_lookup)),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_LOOKUP,
+ .expected_attach_type = BPF_SK_LOOKUP,
+},
diff --git a/tools/testing/selftests/bpf/verifier/ctx_sk_msg.c b/tools/testing/selftests/bpf/verifier/ctx_sk_msg.c
new file mode 100644
index 000000000..c6c69220a
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ctx_sk_msg.c
@@ -0,0 +1,181 @@
+{
+ "valid access family in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, family)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "valid access remote_ip4 in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "valid access local_ip4 in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "valid access remote_port in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_port)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "valid access local_port in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_port)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "valid access remote_ip6 in SK_MSG", /* 4-byte load of each remote_ip6[] word */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, remote_ip6[3])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG, /* offsets above are struct sk_msg_md offsets, so load as SK_MSG like the sibling tests */
+},
+{
+ "valid access local_ip6 in SK_MSG", /* 4-byte load of each local_ip6[] word */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, local_ip6[3])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG, /* offsets above are struct sk_msg_md offsets, so load as SK_MSG like the sibling tests */
+},
+{
+ "valid access size in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct sk_msg_md, size)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "invalid 64B read of size in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, size)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "invalid read past end of SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, size) + 4),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "invalid read offset in SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, family) + 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet read for SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, data)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+ offsetof(struct sk_msg_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "direct packet write for SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, data)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+ offsetof(struct sk_msg_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
+{
+ "overlapping checks for direct packet access SK_MSG",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1,
+ offsetof(struct sk_msg_md, data)),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1,
+ offsetof(struct sk_msg_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_MSG,
+},
diff --git a/tools/testing/selftests/bpf/verifier/ctx_skb.c b/tools/testing/selftests/bpf/verifier/ctx_skb.c
new file mode 100644
index 000000000..2e16b8e26
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ctx_skb.c
@@ -0,0 +1,1091 @@
+{
+ "access skb fields ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, queue_mapping)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, protocol)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_present)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, vlan_tci)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, napi_id)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "access skb fields bad1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -4), /* 4-byte load from ctx (R1) at negative offset -4 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "access skb fields bad2",
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 9),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 },
+ .errstr = "different pointers",
+ .errstr_unpriv = "R1 pointer comparison",
+ .result = REJECT,
+},
+{
+ "access skb fields bad3",
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, pkt_type)),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -12),
+ },
+ .fixup_map_hash_8b = { 6 },
+ .errstr = "different pointers",
+ .errstr_unpriv = "R1 pointer comparison",
+ .result = REJECT,
+},
+{
+ "access skb fields bad4",
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 3),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -13),
+ },
+ .fixup_map_hash_8b = { 7 },
+ .errstr = "different pointers",
+ .errstr_unpriv = "R1 pointer comparison",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff family",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, family)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff remote_ip4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff local_ip4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff remote_ip6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff local_ip6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff remote_port",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_port)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "invalid access __sk_buff local_port", /* loads local_port; remote_port has its own entry */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_port)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{
+ "valid access __sk_buff family",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, family)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff remote_ip4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff local_ip4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip4)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff remote_ip6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_ip6[3])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff local_ip6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_ip6[3])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff remote_port",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, remote_port)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "valid access __sk_buff local_port", /* loads local_port; remote_port has its own entry */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, local_port)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "invalid access of tc_classid for SK_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .errstr = "invalid bpf_context access",
+},
+{
+ "invalid access of skb->mark for SK_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .errstr = "invalid bpf_context access",
+},
+{
+ "check skb->mark is not writeable by SK_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .errstr = "invalid bpf_context access",
+},
+{
+ "check skb->tc_index is writeable by SK_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "check skb->priority is writeable by SK_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, priority)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{
+ "direct packet read for SK_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{ /* SK_SKB: bounds-checks data + 8 against data_end, then stores one byte at data + 0; expected result is ACCEPT. */
+ "direct packet write for SK_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the store */
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), /* R2 is both the destination base and the source value */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{ /* SK_SKB: two bounds checks (data + 8, then data + 6) precede a 2-byte load at data + 6; expected result is ACCEPT. */
+ "overlapping checks for direct packet access SK_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4), /* data + 8 > data_end: jump to "R0 = 0" */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* data + 6 > data_end: skip the load */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+},
+{ /* No .prog_type set: 32-bit store of R1 itself to __sk_buff.mark; expected result is REJECT. */
+ "check skb->mark is not writeable by sockets",
+ .insns = {
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .errstr_unpriv = "R1 leaks addr",
+ .result = REJECT,
+},
+{ /* No .prog_type set: 32-bit store of R1 itself to __sk_buff.tc_index; expected result is REJECT. */
+ "check skb->tc_index is not writeable by sockets",
+ .insns = {
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .errstr_unpriv = "R1 leaks addr",
+ .result = REJECT,
+},
+{ /* Byte stores of R0 (0) to, then byte loads from, offsets +0..+3 of cb[0]..cb[4]; expected result is ACCEPT. */
+ "check cb access: byte",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 2),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 3),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1]) + 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1]) + 2),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1]) + 3),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2]) + 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2]) + 2),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2]) + 3),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3]) + 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3]) + 2),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3]) + 3),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 2),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0]) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0]) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0]) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1]) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1]) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1]) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2]) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2]) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2]) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3]) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3]) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3]) + 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4]) + 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4]) + 2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4]) + 3),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Byte store of R0 (0) at offsetof(hash) + 0; expected result is REJECT. */
+ "__sk_buff->hash, offset 0, byte store not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, hash)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* Byte store of R0 (0) at offsetof(tc_index) + 3; expected result is REJECT. */
+ "__sk_buff->tc_index, offset 3, byte store not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tc_index) + 3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* Byte load of hash at offset +0 (little-endian build) or +3 (otherwise); expected result is ACCEPT. */
+ "check skb->hash byte load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash)),
+#else
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 3),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Byte load of hash at offset +1 (same on either byte order); expected result is ACCEPT. */
+ "check skb->hash byte load permitted 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Byte load of hash at offset +2 (same on either byte order); expected result is ACCEPT. */
+ "check skb->hash byte load permitted 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Byte load of hash at offset +3 (little-endian build) or +0 (otherwise); expected result is ACCEPT. */
+ "check skb->hash byte load permitted 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 3),
+#else
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash)),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* CGROUP_SOCK: byte store of R0 (0) at the __sk_buff cb[0] offset; expected result is REJECT. */
+ "check cb access: byte, wrong type",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{ /* Half-word stores of R0 (0) to, then loads from, offsets +0 and +2 of cb[0]..cb[4]; expected result is ACCEPT. */
+ "check cb access: half",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 2),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1]) + 2),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2]) + 2),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3]) + 2),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0]) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1]) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2]) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3]) + 2),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4]) + 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Half-word store at cb[0] + 1 with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: half, unaligned",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* Half-word store of R0 (0) at offsetof(hash) + 0; expected result is REJECT. */
+ "check __sk_buff->hash, offset 0, half store not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, hash)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* Half-word store of R0 (0) at offsetof(tc_index) + 2; expected result is REJECT. */
+ "check __sk_buff->tc_index, offset 2, half store not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tc_index) + 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* Half-word load of hash at offset +0 (little-endian build) or +2 (otherwise); expected result is ACCEPT. */
+ "check skb->hash half load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash)),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 2),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Half-word load of hash at offset +2 (little-endian build) or +0 (otherwise); expected result is ACCEPT. */
+ "check skb->hash half load permitted 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 2),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash)),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* Half-word load of hash at odd offset +1 (little-endian build) or +3 (otherwise); expected result is REJECT. */
+ "check skb->hash half load not permitted, unaligned 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 1),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 3),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Half-word load of hash at odd offset +3 (little-endian build) or +1 (otherwise); expected result is REJECT. */
+ "check skb->hash half load not permitted, unaligned 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 3),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, hash) + 1),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* CGROUP_SOCK: half-word store of R0 (0) at the __sk_buff cb[0] offset; expected result is REJECT. */
+ "check cb access: half, wrong type",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_H, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{ /* 32-bit stores of R0 (0) to, then loads from, cb[0]..cb[4]; expected result is ACCEPT. */
+ "check cb access: word",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* 32-bit store at cb[0] + 2 with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: word, unaligned 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0]) + 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 32-bit store at cb[4] + 1 with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: word, unaligned 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 32-bit store at cb[4] + 2 with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: word, unaligned 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 32-bit store at cb[4] + 3 with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: word, unaligned 4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4]) + 3),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 64-bit stores of R0 (0) to, then loads from, cb[0] and cb[2]; expected result is ACCEPT. */
+ "check cb access: double",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{ /* 64-bit store starting at cb[1] with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: double, unaligned 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[1])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 64-bit store starting at cb[3] with F_LOAD_WITH_STRICT_ALIGNMENT; expected result is REJECT. */
+ "check cb access: double, unaligned 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "misaligned context access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{ /* 64-bit store starting at cb[4] (the test name calls this out of bounds); expected result is REJECT. */
+ "check cb access: double, oob 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* 64-bit load starting at cb[4] (the test name calls this out of bounds); expected result is REJECT. */
+ "check cb access: double, oob 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* 64-bit store of R0 (0) at offsetof(ifindex); expected result is REJECT. */
+ "check __sk_buff->ifindex dw store not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, ifindex)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* 64-bit load at offsetof(ifindex); expected result is REJECT. */
+ "check __sk_buff->ifindex dw load not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, ifindex)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* CGROUP_SOCK: 64-bit store of R0 (0) at the __sk_buff cb[0] offset; expected result is REJECT. */
+ "check cb access: double, wrong type",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK,
+},
+{ /* SCHED_ACT: 32-bit load at cb[0] + 256; expected result is REJECT (errstr_unpriv is the empty string). */
+ "check out of range skb->cb access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0]) + 256),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .errstr_unpriv = "",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_ACT,
+},
+{ /* No .prog_type set: loads cb[4], mark and tc_index, then stores R1 itself to cb[0] and cb[2]; ACCEPT, unpriv REJECT. */
+ "write skb fields from socket prog",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[4])),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), /* branch over the mark load when R0 >= 0 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1), /* branch over the cb[0] store when R0 >= 0 */
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[2])),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .errstr_unpriv = "R1 leaks addr",
+ .result_unpriv = REJECT,
+},
+{ /* SCHED_CLS: copies cb[0] to mark, rewrites tc_index, cb[3] and (64-bit) tstamp; ACCEPT, unpriv REJECT. */
+ "write skb fields from tc_cls_act prog",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tc_index)),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[3])),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tstamp)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, tstamp)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* Half-word load of data at offset +0 (little-endian build) or +2 (otherwise); expected result is REJECT. */
+ "check skb->data half load not permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data) + 2),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+},
+{ /* CGROUP_SKB: 32-bit load of gso_segs into R0; expected result is ACCEPT. */
+ "read gso_segs from CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_segs)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* CGROUP_SKB: 32-bit load of gso_segs into R1, overwriting the base register; same name as the entry before it. */
+ "read gso_segs from CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_segs)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* CGROUP_SKB: 32-bit store of R0 (0) to gso_segs; REJECT for both privileged and unprivileged runs. */
+ "write gso_segs from CGROUP_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, gso_segs)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .result_unpriv = REJECT,
+ .errstr = "invalid bpf_context access off=164 size=4", /* message pins the numeric offset of gso_segs */
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* SCHED_CLS: 32-bit load of gso_segs into R0; expected result is ACCEPT. */
+ "read gso_segs from CLS",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_segs)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* CGROUP_SKB: 32-bit load of gso_size into R0; expected result is ACCEPT. */
+ "read gso_size from CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_size)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* CGROUP_SKB: 32-bit load of gso_size into R1, overwriting the base register; same name as the entry before it. */
+ "read gso_size from CGROUP_SKB",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_size)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* CGROUP_SKB: 32-bit store of R0 (0) to gso_size; REJECT for both privileged and unprivileged runs. */
+ "write gso_size from CGROUP_SKB",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, gso_size)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .result_unpriv = REJECT,
+ .errstr = "invalid bpf_context access off=176 size=4", /* message pins the numeric offset of gso_size */
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{ /* SCHED_CLS: 32-bit load of gso_size into R0; expected result is ACCEPT. */
+ "read gso_size from CLS",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, gso_size)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* No .prog_type set: 32-bit load of wire_len; expected result is REJECT. */
+ "check wire_len is not readable by sockets",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, wire_len)),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access",
+ .result = REJECT,
+},
+{ /* SCHED_CLS: 32-bit load of wire_len into R0; expected result is ACCEPT. */
+ "check wire_len is readable by tc classifier",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, wire_len)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{ /* SCHED_CLS: 32-bit store of R1 itself to wire_len; expected result is REJECT. */
+ "check wire_len is not writable by tc classifier",
+ .insns = {
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, wire_len)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "invalid bpf_context access",
+ .errstr_unpriv = "R1 leaks addr",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/d_path.c b/tools/testing/selftests/bpf/verifier/d_path.c
new file mode 100644
index 000000000..b98839637
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/d_path.c
@@ -0,0 +1,37 @@
+{ /* TRACING/FENTRY on "dentry_open": calls BPF_FUNC_d_path with R1 = *(R1 + 0), R2 = R10 - 8, R3 = 8; expects ACCEPT. */
+ "d_path accept",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 0), /* zero the 8 bytes at R10 - 8 before the call */
+ BPF_LD_IMM64(BPF_REG_3, 8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_d_path),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACING,
+ .expected_attach_type = BPF_TRACE_FENTRY,
+ .kfunc = "dentry_open",
+},
+{ /* Same instructions as "d_path accept" but with .kfunc = "d_path"; expected result is REJECT. */
+ "d_path reject",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 0), /* zero the 8 bytes at R10 - 8 before the call */
+ BPF_LD_IMM64(BPF_REG_3, 8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_d_path),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "helper call is not allowed in probe",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACING,
+ .expected_attach_type = BPF_TRACE_FENTRY,
+ .kfunc = "d_path",
+},
diff --git a/tools/testing/selftests/bpf/verifier/dead_code.c b/tools/testing/selftests/bpf/verifier/dead_code.c
new file mode 100644
index 000000000..721ec9391
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/dead_code.c
@@ -0,0 +1,161 @@
+{ /* Insns 1-2 are only reachable through the "R0 (7) >= 10" back-branch; ACCEPT with retval 7, unpriv REJECT. */
+ "dead code: start",
+ .insns = {
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* jump over the next two insns to "R0 = 7" */
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, -4), /* target is the R9 load at insn 1 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R0 = 7; the "R0 >= 0" branch jumps over the following JGE insn; ACCEPT with retval 7. */
+ "dead code: mid 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 1),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* Tests bit 0 of a random R0 twice; "R0 = 7; exit" is only reachable via the second JSET; ACCEPT with retval 1. */
+ "dead code: mid 2",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 4), /* bit 0 set: jump to "R0 = 1" */
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1), /* same test again: target is "R0 = 7" */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* jump to "R0 = 1" */
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* R0 = 7; the final exit is only reachable through the "R0 >= 10" branch; ACCEPT with retval 7. */
+ "dead code: end 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, 1),
+ BPF_EXIT_INSN(),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R0 = 7; the trailing "R0 = 12; exit" is only reachable through the "R0 >= 10" branch; ACCEPT with retval 7. */
+ "dead code: end 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 12),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R0 = 7; the four insns after the exit are only reachable through the "R0 >= 8" branch; ACCEPT with retval 7. */
+ "dead code: end 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 10, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 12),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -5), /* jumps back to the exit at insn 2 */
+ },
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R0 = 7; the call insn and the "R0 = 12; exit" it targets are only reachable through the "R0 >= 8" branch. */
+ "dead code: tail of main + func",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 8, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* call with src_reg = 1, imm = 1: target is two insns ahead */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 12),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R0 = 7; both call insns and the code they target are only reachable through the "R0 >= 8" branch. */
+ "dead code: tail of main + two functions",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 8, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* call with src_reg = 1, imm = 1: target is the second call insn */
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* call with src_reg = 1, imm = 1: target is "R0 = 12" */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 12),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R1 = 7; the called code sets R0 = 7 and its "R1 >= 7" branch jumps over the call back to "R0 = 12; exit". */
+ "dead code: function in the middle and mid of another func",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), /* call with src_reg = 1, imm = 3: target is "R0 = 7" */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 12),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 7),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 7, 1), /* R1 >= 7: skip the call below */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), /* call with src_reg = 1, imm = -5: target is "R0 = 12" */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 7,
+},
+{ /* R1 = 2; the "R1 >= 2" branch jumps over "R1 = 5", then the called code returns R1 in R0; retval 2. */
+ "dead code: middle of main before call",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 2, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 5),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* call with src_reg = 1, imm = 1: target is "R0 = R1" */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{ /* R1 = 2; the called code begins with a zero-offset JA, then returns R1 in R0; retval 2. */
+ "dead code: start of a function",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* call with src_reg = 1, imm = 1: target is the JA below */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* jump with offset 0: lands on the next insn */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "function calls to other bpf functions are allowed for",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c
new file mode 100644
index 000000000..ae7253660
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c
@@ -0,0 +1,656 @@
+{
+ "pkt_end - pkt_start is allowed",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = TEST_DATA_LEN,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 14),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 15),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 7),
+ BPF_LDX_MEM(BPF_B, BPF_REG_4, BPF_REG_3, 12),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_4, 14),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, len)),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_2, 49),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_2, 49),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_3, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_context access off=76",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+},
+{
+ "direct packet access: test4 (write)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test5 (pkt_end >= reg, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test6 (pkt_end >= reg, bad access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid access to packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test7 (pkt_end >= reg, both accesses)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid access to packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test8 (double test, variant 1)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 4),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test9 (double test, variant 2)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test10 (write invalid)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid access to packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test11 (shift, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 144),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 3),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 1,
+},
+{
+ "direct packet access: test12 (and, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 144),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 1,
+},
+{
+ "direct packet access: test13 (branches, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 13),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_4, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 14),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 24),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 23),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_5, 15),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 1,
+},
+{
+ "direct packet access: test14 (pkt_ptr += 0, CONST_IMM, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 22),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 7),
+ BPF_MOV64_IMM(BPF_REG_5, 12),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_5, 4),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 1,
+},
+{
+ "direct packet access: test15 (spill with xadd)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 8),
+ BPF_MOV64_IMM(BPF_REG_5, 4096),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0),
+ BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 invalid mem access 'inv'",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test16 (arith on data_end)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 16),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R3 pointer arithmetic on pkt_end",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test17 (pruning, alignment)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 14),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 1, 4),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, -4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_A(-6),
+ },
+ .errstr = "misaligned packet access off 2+(0x0; 0x0)+15+-4 size 4",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{
+ "direct packet access: test18 (imm += pkt_ptr, 1)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_IMM(BPF_REG_0, 8),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test19 (imm += pkt_ptr, 2)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 3),
+ BPF_MOV64_IMM(BPF_REG_4, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2),
+ BPF_STX_MEM(BPF_B, BPF_REG_4, BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test20 (x += pkt_ptr, 1)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_IMM(BPF_REG_0, 0xffffffff),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0x7fff),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0x7fff - 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test21 (x += pkt_ptr, 2)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 9),
+ BPF_MOV64_IMM(BPF_REG_4, 0xffffffff),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_4, 0x7fff),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0x7fff - 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test22 (x += pkt_ptr, 3)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_3, -16),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -16),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 11),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+ BPF_MOV64_IMM(BPF_REG_4, 0xffffffff),
+ BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_STX_MEM(BPF_H, BPF_REG_4, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test23 (x += pkt_ptr, 4)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xffff),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_0, 31),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0xffff - 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)",
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test24 (x += pkt_ptr, 5)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_IMM(BPF_REG_0, 0xffffffff),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 0xff),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_0, 64),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 0x7fff - 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "direct packet access: test25 (marking on <, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_0, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -4),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test26 (marking on <, bad access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_0, BPF_REG_3, 3),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -3),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test27 (marking on <=, good access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_0, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 1,
+},
+{
+ "direct packet access: test28 (marking on <=, bad access)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -4),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "direct packet access: test29 (reg > pkt_end in subprog)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/direct_stack_access_wraparound.c b/tools/testing/selftests/bpf/verifier/direct_stack_access_wraparound.c
new file mode 100644
index 000000000..698e3779f
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/direct_stack_access_wraparound.c
@@ -0,0 +1,40 @@
+{
+ "direct stack access with 32-bit wraparound. test1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x7fffffff),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fp pointer and 2147483647",
+ .result = REJECT
+},
+{
+ "direct stack access with 32-bit wraparound. test2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x3fffffff),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fp pointer and 1073741823",
+ .result = REJECT
+},
+{
+ "direct stack access with 32-bit wraparound. test3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0x1fffffff),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fp pointer offset 1073741822",
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .result = REJECT
+},
diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c
new file mode 100644
index 000000000..988f46a1a
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c
@@ -0,0 +1,347 @@
+{
+ "direct map access, write test 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 40),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 32),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 6",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 40),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 4, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "R1 min value is outside of the allowed memory range",
+},
+{
+ "direct map access, write test 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, -1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 4, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "direct value offset of 4294967295 is not allowed",
+},
+{
+ "direct map access, write test 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, -1, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 9",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 48),
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 4242),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value pointer",
+},
+{
+ "direct map access, write test 10",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 47),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 4),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 11",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 48),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 4),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value pointer",
+},
+{
+ "direct map access, write test 12",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, (1<<29)),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 4),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "direct value offset of 536870912 is not allowed",
+},
+{
+ "direct map access, write test 13",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, (1<<29)-1),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 4),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value pointer, value_size=48 off=536870911",
+},
+{
+ "direct map access, write test 14",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 47),
+ BPF_LD_MAP_VALUE(BPF_REG_2, 0, 46),
+ BPF_ST_MEM(BPF_H, BPF_REG_2, 0, 0xffff),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1, 3 },
+ .result = ACCEPT,
+ .retval = 0xff,
+},
+{
+ "direct map access, write test 15",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 46),
+ BPF_LD_MAP_VALUE(BPF_REG_2, 0, 46),
+ BPF_ST_MEM(BPF_H, BPF_REG_2, 0, 0xffff),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1, 3 },
+ .result = ACCEPT,
+ .retval = 0xffff,
+},
+{
+ "direct map access, write test 16",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 46),
+ BPF_LD_MAP_VALUE(BPF_REG_2, 0, 47),
+ BPF_ST_MEM(BPF_H, BPF_REG_2, 0, 0xffff),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1, 3 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=48 off=47 size=2",
+},
+{
+ "direct map access, write test 17",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 46),
+ BPF_LD_MAP_VALUE(BPF_REG_2, 0, 46),
+ BPF_ST_MEM(BPF_H, BPF_REG_2, 1, 0xffff),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1, 3 },
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=48 off=47 size=2",
+},
+{
+ "direct map access, write test 18",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 0),
+ BPF_ST_MEM(BPF_H, BPF_REG_1, 0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_small = { 1 },
+ .result = REJECT,
+ .errstr = "R1 min value is outside of the allowed memory range",
+},
+{
+ "direct map access, write test 19",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 0),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_small = { 1 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "direct map access, write test 20",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_VALUE(BPF_REG_1, 0, 1),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_small = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to map value pointer",
+},
+{
+ "direct map access, invalid insn test 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0, 1, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid bpf_ld_imm64 insn",
+},
+{
+ "direct map access, invalid insn test 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 1, 0, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "BPF_LD_IMM64 uses reserved fields",
+},
+{
+ "direct map access, invalid insn test 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, ~0, 0, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "BPF_LD_IMM64 uses reserved fields",
+},
+{
+ "direct map access, invalid insn test 4",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, 0, ~0, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid bpf_ld_imm64 insn",
+},
+{
+ "direct map access, invalid insn test 5",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_VALUE, ~0, ~0, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid bpf_ld_imm64 insn",
+},
+{
+ "direct map access, invalid insn test 6",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_FD, ~0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "BPF_LD_IMM64 uses reserved fields",
+},
+{
+ "direct map access, invalid insn test 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_FD, 0, ~0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid bpf_ld_imm64 insn",
+},
+{
+ "direct map access, invalid insn test 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_FD, ~0, ~0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid bpf_ld_imm64 insn",
+},
+{
+ "direct map access, invalid insn test 9",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_FD, 0, 0, 0, 47),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 },
+ .result = REJECT,
+ .errstr = "unrecognized bpf_ld_imm64 insn",
+},
diff --git a/tools/testing/selftests/bpf/verifier/div0.c b/tools/testing/selftests/bpf/verifier/div0.c
new file mode 100644
index 000000000..7685edfbc
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/div0.c
@@ -0,0 +1,184 @@
+{
+ "DIV32 by 0, zero check 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "DIV32 by 0, zero check 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42), /* r0 is not written again, so it is the value returned */
+ BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL), /* non-zero as a 64-bit value, but the low 32 bits are 0 */
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), /* 32-bit divide of r2 by r1 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "DIV64 by 0, zero check",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "MOD32 by 0, zero check 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "MOD32 by 0, zero check 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "MOD64 by 0, zero check",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "DIV32 by 0, zero check ok, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 2),
+ BPF_MOV32_IMM(BPF_REG_2, 16),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_2, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 8,
+},
+{
+ "DIV32 by 0, zero check 1, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* divisor */
+ BPF_MOV32_IMM(BPF_REG_0, 1), /* dividend is kept in r0, so .retval observes the result */
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), /* r0 = 1 / 0 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0, /* expected result of the divide by zero */
+},
+{
+ "DIV32 by 0, zero check 2, cls",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "DIV64 by 0, zero check, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_ALU64_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "MOD32 by 0, zero check ok, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 42),
+ BPF_MOV32_IMM(BPF_REG_1, 3),
+ BPF_MOV32_IMM(BPF_REG_2, 5),
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_2, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "MOD32 by 0, zero check 1, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* divisor */
+ BPF_MOV32_IMM(BPF_REG_0, 1), /* dividend is kept in r0, so .retval observes the result */
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), /* r0 = 1 % 0 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1, /* expected: same value as the dividend loaded above */
+},
+{
+ "MOD32 by 0, zero check 2, cls",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_1, 0xffffffff00000000LL),
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "MOD64 by 0, zero check 1, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 2),
+ BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "MOD64 by 0, zero check 2, cls",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* divisor */
+ BPF_MOV32_IMM(BPF_REG_0, -1), /* dividend is kept in r0, so .retval observes the result */
+ BPF_ALU64_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), /* 64-bit r0 %= 0 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = -1, /* expected: same value as the dividend loaded above */
+},
diff --git a/tools/testing/selftests/bpf/verifier/div_overflow.c b/tools/testing/selftests/bpf/verifier/div_overflow.c
new file mode 100644
index 000000000..acab4f008
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/div_overflow.c
@@ -0,0 +1,110 @@
+/* Make sure that JITs use udiv/umod: otherwise INT_MIN / -1
+ * overflows and raises an exception, much as a division by
+ * zero would.
+ */
+{
+ "DIV32 overflow, check 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "DIV32 overflow, check 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+ BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "DIV64 overflow, check 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_LD_IMM64(BPF_REG_2, LLONG_MIN),
+ BPF_ALU64_REG(BPF_DIV, BPF_REG_2, BPF_REG_1), /* r2 = LLONG_MIN / -1 */
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 1), /* quotient == 0: skip the next insn and return 0 */
+ BPF_MOV32_IMM(BPF_REG_0, 1), /* any other quotient: return 1 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0, /* i.e. the quotient is expected to be 0 */
+},
+{
+ "DIV64 overflow, check 2",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_1, LLONG_MIN),
+ BPF_ALU64_IMM(BPF_DIV, BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_1, 1),
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "MOD32 overflow, check 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = INT_MIN,
+},
+{
+ "MOD32 overflow, check 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, INT_MIN),
+ BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, -1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = INT_MIN,
+},
+{
+ "MOD64 overflow, check 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_LD_IMM64(BPF_REG_2, LLONG_MIN),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), /* r3 keeps a copy of the original dividend */
+ BPF_ALU64_REG(BPF_MOD, BPF_REG_2, BPF_REG_1), /* r2 = LLONG_MIN % -1 */
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1), /* remainder != dividend: skip the next insn and return 0 */
+ BPF_MOV32_IMM(BPF_REG_0, 1), /* remainder == dividend: return 1 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1, /* i.e. the remainder is expected to equal the dividend */
+},
+{
+ "MOD64 overflow, check 2",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_2, LLONG_MIN),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_MOD, BPF_REG_2, -1),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_3, BPF_REG_2, 1),
+ BPF_MOV32_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c
new file mode 100644
index 000000000..c5e805980
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/event_output.c
@@ -0,0 +1,119 @@
+/* instructions used to output a skb based software event, produced
+ * from code snippet:
+ * struct TMP {
+ * uint64_t tmp;
+ * } tt;
+ * tt.tmp = 5;
+ * bpf_perf_event_output(skb, &connection_tracking_event_map, 0,
+ * &tt, sizeof(tt));
+ * return 1;
+ *
+ * the bpf assembly from llvm is:
+ * 0: b7 02 00 00 05 00 00 00 r2 = 5
+ * 1: 7b 2a f8 ff 00 00 00 00 *(u64 *)(r10 - 8) = r2
+ * 2: bf a4 00 00 00 00 00 00 r4 = r10
+ * 3: 07 04 00 00 f8 ff ff ff r4 += -8
+ * 4: 18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0ll
+ * 6: b7 03 00 00 00 00 00 00 r3 = 0
+ * 7: b7 05 00 00 08 00 00 00 r5 = 8
+ * 8: 85 00 00 00 19 00 00 00 call 25
+ * 9: b7 00 00 00 01 00 00 00 r0 = 1
+ * 10: 95 00 00 00 00 00 00 00 exit
+ *
+ * The code lives here rather than in fill_helpers because the map fixup is
+ * applied to the insns, not to a filled prog.
+ */
+
+#define __PERF_EVENT_INSNS__ /* insn list shared by every "perfevent for ..." test below */ \
+ BPF_MOV64_IMM(BPF_REG_2, 5), /* r2 = 5 */ \
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(r10 - 8) = r2 */ \
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), /* r4 = r10 */ \
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 += -8 */ \
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* insn index 4: the index listed in .fixup_map_event_output */ \
+ BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ \
+ BPF_MOV64_IMM(BPF_REG_5, 8), /* r5 = 8 */ \
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, \
+ BPF_FUNC_perf_event_output), /* call bpf_perf_event_output */ \
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1 */ \
+ BPF_EXIT_INSN(),
+{
+ "perfevent for sockops",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for tc",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for lwt out",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_LWT_OUT,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for xdp",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for socket filter",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for sk_skb",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup skb",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup dev",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup sysctl",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "perfevent for cgroup sockopt",
+ .insns = { __PERF_EVENT_INSNS__ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT,
+ .expected_attach_type = BPF_CGROUP_SETSOCKOPT,
+ .fixup_map_event_output = { 4 },
+ .result = ACCEPT,
+ .retval = 1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c
new file mode 100644
index 000000000..0ab7f1dfc
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c
@@ -0,0 +1,616 @@
+{
+ "helper access to variable memory: stack, bitwise AND + JMP, correct bounds",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), /* r1 = fp - 64: the stack buffer handed to the helper */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), /* these eight stores zero fp-64 .. fp-1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_IMM(BPF_REG_2, 16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128), /* r2 is stored at r1 - 128 and read straight back */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64), /* after the mask r2 can only be 0 or 64 */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2), /* 0 >= r2: jump over the next two insns (no call) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel), /* called with r1 = stack buffer, r2 = masked length, r3 = 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, bitwise AND, zero included",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect read from stack R1 off -64+0 size 64",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, bitwise AND + JMP, wrong max",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 65),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect access to stack R1 off=-64 size=65",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP, correct bounds",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_IMM(BPF_REG_2, 16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP (signed), correct bounds",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_MOV64_IMM(BPF_REG_2, 16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP, bounds + offset",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 5),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect access to stack R1 off=-64 size=65",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP, wrong max",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 65, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect access to stack R1 off=-64 size=65",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP, no max check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ /* because max wasn't checked, signed min is negative */
+ .errstr = "R2 min value is negative, either use unsigned or 'var &= const'",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP, no min check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 64, 3),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect read from stack R1 off -64+0 size 64",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: stack, JMP (signed), no min check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, 64, 3),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: map, JMP, correct bounds",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val), 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: map, JMP, wrong max",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val) + 1, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 },
+ .errstr = "invalid access to map value, value_size=48 off=0 size=49",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: map adjusted, JMP, correct bounds",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20),
+ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val) - 20, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: map adjusted, JMP, wrong max",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 20),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_6),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_2, sizeof(struct test_val) - 19, 4),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_4, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 },
+ .errstr = "R1 min value is outside of the allowed memory range",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size = 0 allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size > 0 not allowed on NULL (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 64),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 type=inv expected=fp",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 7),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "helper access to variable memory: size possible = 0 allowed on != NULL packet pointer (ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 7),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_csum_diff),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 0 /* csum_diff of 64-byte packet */,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "helper access to variable memory: size = 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 type=inv expected=fp",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size > 0 not allowed on NULL (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 type=inv expected=fp",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size possible = 0 allowed on != NULL stack pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: size possible = 0 allowed on != NULL map pointer (!ARG_PTR_TO_MEM_OR_NULL)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_2, 8, 2),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: 8 bytes leak",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = 8 bytes loaded from r1 + 8 (presumably the program context) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), /* r1 = fp - 64 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40), /* the next store is to -24: the slot at fp-32 is never written */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -128),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -128),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 63),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* r2 = (r2 & 63) + 1, i.e. between 1 and 64 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect read from stack R1 off -64+32 size 64", /* -64+32 is the unwritten fp-32 slot */
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to variable memory: 8 bytes no leak (init memory)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* NOTE(review): repeats the previous insn; looks redundant - confirm before removing */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -64), /* these eight stores zero fp-64 .. fp-1, with no gap */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -56),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -48),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -40),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -32),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -16),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -64), /* r1 = fp - 64 */
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 32),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 32), /* r2 = (0 & 32) + 32 = 32 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_10, -16),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/helper_packet_access.c b/tools/testing/selftests/bpf/verifier/helper_packet_access.c
new file mode 100644
index 000000000..ae54587e9
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/helper_packet_access.c
@@ -0,0 +1,460 @@
+{ /* XDP: data + 8 is compared with data_end before the packet pointer is passed to map_update_elem; marked ACCEPT. */
+ "helper access to packet: test1, valid packet_ptr range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 5), /* forward branch when data + 8 > data_end */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), /* r2 and r3 both hold the packet pointer for the call */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 }, /* presumably the insn index of BPF_LD_MAP_FD to patch; confirm in the test runner */
+ .result_unpriv = ACCEPT,
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{ /* XDP: the packet pointer in r2 reaches map_lookup_elem with no comparison against data_end; marked REJECT. */
+ "helper access to packet: test2, unchecked packet_ptr",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data; data_end is never loaded */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{ /* XDP: a byte read from the packet is added to the packet pointer and the sum + 8 is re-checked against data_end before the lookup; marked ACCEPT. */
+ "helper access to packet: test3, variable add",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 10), /* first check: data + 8 against data_end */
+ BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_2, 0), /* r5 = first packet byte */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_5), /* r4 = data + r5 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 4), /* second check: r4 + 8 against data_end */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_4), /* key argument = the adjusted packet pointer */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 11 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{ /* XDP: the map lookup sits on the taken side of the data + 4 > data_end branch; marked REJECT. */
+ "helper access to packet: test4, packet_ptr with bad range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 2), /* taken when data + 4 > data_end; skips the early exit below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* fall-through path exits without calling the helper */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* r2 is still the unadjusted data pointer */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 7 },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{ /* XDP: only r2 + 7 is compared with data_end before r2 is used as the lookup key; marked REJECT. */
+ "helper access to packet: test5, packet_ptr with too short range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* r2 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 7), /* r4 = r2 + 7, one byte short of r2 + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 3),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 6 }, /* NOTE(review): name suggests an 8-byte key, which would exceed the 7 checked bytes; confirm */
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{ /* SCHED_CLS counterpart of test1: data + 8 is compared with data_end before map_update_elem; marked ACCEPT. */
+ "helper access to packet: test6, cls valid packet_ptr range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 5), /* forward branch when data + 8 > data_end */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), /* r2 and r3 both hold the packet pointer for the call */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS counterpart of test2: packet pointer reaches map_lookup_elem with no data_end comparison; marked REJECT. */
+ "helper access to packet: test7, cls unchecked packet_ptr",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* data_end is never loaded in this program */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS counterpart of test3: packet pointer plus a packet byte, re-checked against data_end before the lookup; marked ACCEPT. */
+ "helper access to packet: test8, cls variable add",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 10), /* first check: data + 8 against data_end */
+ BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_2, 0), /* r5 = first packet byte */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_5), /* r4 = data + r5 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 4), /* second check: r4 + 8 against data_end */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_4), /* key argument = the adjusted packet pointer */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 11 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS counterpart of test4: the lookup sits on the taken side of the data + 4 > data_end branch; marked REJECT. */
+ "helper access to packet: test9, cls packet_ptr with bad range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 4),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 2), /* taken when data + 4 > data_end; skips the early exit below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* fall-through path exits without calling the helper */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), /* r2 is still the unadjusted data pointer */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 7 },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS counterpart of test5: only r2 + 7 is compared with data_end before r2 is used as the lookup key; marked REJECT. */
+ "helper access to packet: test10, cls packet_ptr with too short range",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* r2 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 7), /* r4 = r2 + 7, one byte short of r2 + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 3),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 6 },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: a bounds-checked packet pointer is passed in r3 to skb_store_bytes; marked REJECT with the errstr below. */
+ "helper access to packet: test11, cls unsuitable helper 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 7), /* r3 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_7, 4),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 42),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_store_bytes), /* r1 is still the ctx; r3 is the packet pointer */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "helper access to the packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: a packet pointer is passed in r3 to skb_load_bytes after a data + 8 check; marked REJECT with the errstr below. */
+ "helper access to packet: test12, cls unsuitable helper 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = data, copied before r6 is advanced */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_7, 3),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes), /* r1 is still the ctx; r3 is the packet pointer */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "helper access to the packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: csum_diff is called with r1 = data + 1 and r2 = 4 after (data + 1) + 7 is checked against data_end; marked ACCEPT. */
+ "helper access to packet: test13, cls helper ok",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6), /* forward branch when r6 + 7 > data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 reset to r6 for the call */
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* size 4, inside the 7 checked bytes */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: the checked pointer r6 + 7 is moved back by 4 and passed to csum_diff with size 4; marked ACCEPT. */
+ "helper access to packet: test14, cls helper ok sub",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 4), /* r1 = data + 4 */
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* 4 bytes from data + 4 end at data + 8 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: the checked pointer r6 + 7 is moved back by 12, i.e. to data - 4, before csum_diff; marked REJECT. */
+ "helper access to packet: test15, cls helper fail sub",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 12), /* r1 = data - 4, before the start of the packet */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: csum_diff size is 8 although only r6 + 7 was compared with data_end; marked REJECT. */
+ "helper access to packet: test16, cls helper fail range 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6), /* checks r6 + 7 only */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, 8), /* size 8 from r6 is one byte more than was checked */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: csum_diff is called with a negative constant size (-9) in r2; marked REJECT with the errstr below. */
+ "helper access to packet: test17, cls helper fail range 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, -9), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R2 min value is negative",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: csum_diff is called with r2 = ~0 (all bits set) as the size; marked REJECT with the errstr below. */
+ "helper access to packet: test18, cls helper fail range 3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, ~0), /* ~0 is -1 as a signed immediate */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R2 min value is negative",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: csum_diff is called on a checked packet pointer with size 0 in r2; marked ACCEPT. */
+ "helper access to packet: test19, cls helper range zero",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* zero-length access */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: the data_end pointer itself (r7) is passed as the csum_diff buffer argument; marked REJECT with the errstr below. */
+ "helper access to packet: test20, pkt end as input",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), /* r1 = data_end, not a data pointer */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R1 type=pkt_end expected=fp",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{ /* SCHED_CLS: r1 is left at r6 + 7 (never reset to r6) when csum_diff is called with size 4; marked REJECT. */
+ "helper access to packet: test21, wrong reg",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1), /* r6 = data + 1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_7, 6),
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* no r1 = r6 precedes this, unlike the sibling csum_diff tests */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_csum_diff),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c
new file mode 100644
index 000000000..1c7882ddf
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c
@@ -0,0 +1,953 @@
+{ /* Looks up a map value and passes it to probe_read_kernel with size sizeof(struct test_val); marked ACCEPT. */
+ "helper access to map: full range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = map value pointer */
+ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val)),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Looks up a map value and passes it to probe_read_kernel with size 8; marked ACCEPT. */
+ "helper access to map: partial range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = map value pointer */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Looks up a map value and passes it to trace_printk with size 0 in r2; marked REJECT with the errstr below. */
+ "helper access to map: empty range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* zero size */
+ BPF_EMIT_CALL(BPF_FUNC_trace_printk), /* this case uses trace_printk, not probe_read_kernel */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=0 size=0",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Passes a map value to probe_read_kernel with size sizeof(struct test_val) + 8; marked REJECT with the errstr below. */
+ "helper access to map: out-of-bound range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, sizeof(struct test_val) + 8), /* 8 bytes more than the value; errstr reports size=56 vs value_size=48 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=0 size=56",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Passes a map value to probe_read_kernel with size -8 in r2; marked REJECT with the errstr below. */
+ "helper access to map: negative range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_2, -8), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo); helper size is the remainder of struct test_val; marked ACCEPT. */
+ "helper access to adjusted map (via const imm): full range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), /* r1 = value + offsetof(foo) */
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) - offsetof(struct test_val, foo)), /* size = bytes from foo to the end of the struct */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo); helper size is 8; marked ACCEPT. */
+ "helper access to adjusted map (via const imm): partial range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), /* r1 = value + offsetof(foo) */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo), then passed to trace_printk with size 0; marked REJECT. */
+ "helper access to adjusted map (via const imm): empty range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)), /* errstr below reports this as off=4 */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* zero size */
+ BPF_EMIT_CALL(BPF_FUNC_trace_printk),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=4 size=0",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo); helper size is 8 bytes past the end of the value; marked REJECT. */
+ "helper access to adjusted map (via const imm): out-of-bound range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) - offsetof(struct test_val, foo) + 8), /* errstr below reports off=4 size=52 against value_size=48 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=4 size=52",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo); helper size is -8, larger in magnitude than the adjustment; marked REJECT. */
+ "helper access to adjusted map (via const imm): negative range (> adjustment)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_2, -8), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by the immediate offsetof(foo); helper size is -1, smaller in magnitude than the adjustment; marked REJECT. */
+ "helper access to adjusted map (via const imm): negative range (< adjustment)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a constant held in r3; helper size is the remainder of struct test_val; marked ACCEPT. */
+ "helper access to adjusted map (via const reg): full range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct test_val, foo)), /* the offset goes through a register instead of an ALU immediate */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* r1 = value + offsetof(foo) */
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) - offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a constant held in r3; helper size is 8; marked ACCEPT. */
+ "helper access to adjusted map (via const reg): partial range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct test_val, foo)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* r1 = value + offsetof(foo) */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer plus a register holding 0, then passed to trace_printk with size 0; marked REJECT with the errstr below. */
+ "helper access to adjusted map (via const reg): empty range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 is 0 here, not offsetof(foo) as in the sibling const reg cases */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* adds 0, so the pointer value is unchanged */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* zero size */
+ BPF_EMIT_CALL(BPF_FUNC_trace_printk),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R1 min value is outside of the allowed memory range",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a constant held in r3; helper size is 8 bytes past the end of the value; marked REJECT. */
+ "helper access to adjusted map (via const reg): out-of-bound range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct test_val, foo)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* r1 = value + offsetof(foo) */
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) -
+ offsetof(struct test_val, foo) + 8), /* errstr below reports off=4 size=52 against value_size=48 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=4 size=52",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a constant held in r3; helper size is -8; marked REJECT with the errstr below. */
+ "helper access to adjusted map (via const reg): negative range (> adjustment)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct test_val, foo)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_MOV64_IMM(BPF_REG_2, -8), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a constant held in r3; helper size is -1; marked REJECT with the errstr below. */
+ "helper access to adjusted map (via const reg): negative range (< adjustment)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct test_val, foo)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_MOV64_IMM(BPF_REG_2, -1), /* negative size argument */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R2 min value is negative",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a value loaded from the map and capped at offsetof(foo); helper size is the remainder of the struct; marked ACCEPT. */
+ "helper access to adjusted map (via variable): full range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct test_val, foo), 4), /* branch forward unless r3 <= offsetof(foo) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* r1 = value + r3 */
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) - offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a value loaded from the map and capped at offsetof(foo); helper size is 8; marked ACCEPT. */
+ "helper access to adjusted map (via variable): partial range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct test_val, foo), 4), /* branch forward unless r3 <= offsetof(foo) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* r1 = value + r3 */
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a capped variable, then passed to trace_printk with size 0; marked REJECT with the errstr below. */
+ "helper access to adjusted map (via variable): empty range",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct test_val, foo), 3), /* branch forward unless r3 <= offsetof(foo) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* zero size */
+ BPF_EMIT_CALL(BPF_FUNC_trace_printk),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R1 min value is outside of the allowed memory range",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Map value pointer advanced by a loaded value with no upper-bound branch before the helper call; marked REJECT with the errstr below. */
+ "helper access to adjusted map (via variable): no max check",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* added directly; no JGT on r3 precedes this */
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "R1 unbounded memory access",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Variable offset capped at offsetof(foo), but the helper size is one byte more than the remainder of the struct; marked REJECT. */
+ "helper access to adjusted map (via variable): wrong max check",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0 after the lookup: branch forward past the helper call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct test_val, foo), 4), /* branch forward unless r3 <= offsetof(foo) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_MOV64_IMM(BPF_REG_2,
+ sizeof(struct test_val) -
+ offsetof(struct test_val, foo) + 1), /* errstr below reports off=4 size=45 against value_size=48 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .errstr = "invalid access to map value, value_size=48 off=4 size=45",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* One-byte store at map value + r3, placed on the taken side of the r3 < 32 branch; marked ACCEPT. */
+ "helper access to map: bounds check using <, good access",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_3, 32, 2), /* taken when r3 < 32: skips the early exit below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(), /* fall-through (r3 >= 32) exits without the store */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* byte store at value + r3 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* One-byte store at map value + r3, placed on the fall-through side of the r3 < 32 branch (r3 >= 32); marked REJECT. */
+ "helper access to map: bounds check using <, bad access",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero 8-byte key at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit word at the start of the map value */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_3, 32, 4), /* taken when r3 < 32: jumps over the store to the final exit */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* reached only when r3 >= 32, so r3 has no upper bound here */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 },
+ .result = REJECT,
+ .errstr = "R1 unbounded memory access",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using <=, good access", /* the store is reached only when the unsigned check r3 <= 32 is taken */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_3, 32, 2), /* r3 <= 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* one-byte store of 0 at r1 (= r0 + r3) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using <=, bad access", /* branch sense inverted: the store sits on the r3 > 32 fall-through path */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump forward 4 insns */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_3, 32, 4), /* r3 <= 32: jump past the store to the last "r0 = 0; exit" */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3), /* fall-through: r3 was above 32 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "R1 unbounded memory access",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<, good access", /* signed upper check (s< 32) followed by a signed lower check (s< 0) */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, 32, 2), /* r3 s< 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, 0, -3), /* r3 s< 0: jump back to the "r0 = 0; exit" pair above */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* one-byte store of 0 at r1 (= r0 + r3) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<, good access 2", /* same as the s< good-access case, but the lower check compares against -3 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* 32-bit load (the "bad access" variant uses BPF_DW here) */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, 32, 2), /* r3 s< 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, -3, -3), /* r3 s< -3: jump back to the "r0 = 0; exit" pair above */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* one-byte store of 0 at r1 (= r0 + r3) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<, bad access", /* 64-bit load plus a lower check against -3 leaves r3 possibly negative */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), /* 64-bit load (the "good access 2" variant uses BPF_W here) */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, 32, 2), /* r3 s< 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_3, -3, -3), /* r3 s< -3: jump back; -3..-1 still fall through */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "R1 min value is negative",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<=, good access", /* signed upper check (s<= 32) followed by a signed lower check (s<= 0) */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, 32, 2), /* r3 s<= 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, 0, -3), /* r3 s<= 0: jump back to the "r0 = 0; exit" pair above */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* one-byte store of 0 at r1 (= r0 + r3) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<=, good access 2", /* same as the s<= good-access case, but the lower check compares against -3 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* 32-bit load (the "bad access" variant uses BPF_DW here) */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, 32, 2), /* r3 s<= 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, -3, -3), /* r3 s<= -3: jump back to the "r0 = 0; exit" pair above */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0), /* one-byte store of 0 at r1 (= r0 + r3) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "helper access to map: bounds check using s<=, bad access", /* 64-bit load plus a lower check against -3 leaves r3 possibly negative */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), /* 64-bit load (the "good access 2" variant uses BPF_W here) */
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, 32, 2), /* r3 s<= 32: skip the early return below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_3, -3, -3), /* r3 s<= -3: jump back; -2 and -1 still fall through */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_3),
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "R1 min value is negative",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map lookup helper access to map", /* the pointer returned by one lookup is passed in r2 to a second lookup */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 = result of the first lookup */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 8 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map update helper access to map", /* the looked-up pointer is passed as both r2 and r3 of map_update_elem */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_IMM(BPF_REG_4, 0), /* r4 = 0 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), /* r3 = looked-up pointer */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 = looked-up pointer */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 10 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map update helper access to map: wrong size", /* pointer from the 8b map is handed to an update on the 16b map */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_IMM(BPF_REG_4, 0), /* r4 = 0 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_0), /* r3 = looked-up pointer */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 = looked-up pointer */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_update_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* first BPF_LD_MAP_FD (used by the lookup) */
+ .fixup_map_hash_16b = { 10 }, /* second BPF_LD_MAP_FD (used by the update) */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=8 off=0 size=16",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const imm)", /* looked-up pointer advanced by an immediate, then passed in r2 to a second lookup */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, offsetof(struct other_val, bar)), /* r2 = r0 + offsetof(other_val, bar) */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 9 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const imm): out-of-bound 1", /* immediate offset too large: .errstr reports off=12 size=8 against value_size=16 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, sizeof(struct other_val) - 4), /* r2 = r0 + sizeof(other_val) - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 9 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=16 off=12 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const imm): out-of-bound 2", /* negative immediate offset: .errstr reports off=-4 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = r0 - 4 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 9 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=16 off=-4 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const reg)", /* like the const-imm case, but the offset is first loaded into r3 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, offsetof(struct other_val, bar)), /* r3 = constant offset */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 10 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const reg): out-of-bound 1", /* register offset too large: .errstr reports off=12 size=8 against value_size=16 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, sizeof(struct other_val) - 4), /* r3 = sizeof(other_val) - 4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 10 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=16 off=12 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via const reg): out-of-bound 2", /* negative register offset: .errstr reports off=-4 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, -4), /* r3 = -4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 10 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=16 off=-4 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via variable)", /* offset read from the map value and bounded by offsetof(other_val, bar) before use */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct other_val, bar), 4), /* r3 > offsetof(bar): jump to the exit */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 11 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via variable): no max check", /* r3 is added to the pointer without any comparison first */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3, with r3 unchecked */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 10 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "R2 unbounded memory access, make sure to bounds check any such access",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "map helper access to adjusted map (via variable): wrong max check", /* bound is offsetof(bar) + 1: .errstr reports off=9 size=8 against value_size=16 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* store an 8-byte zero at r2 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: jump to the exit */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_0, 0), /* r3 = 32-bit load from r0 + 0 */
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_3, offsetof(struct other_val, bar) + 1, 4), /* r3 > offsetof(bar) + 1: jump to the exit */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* r2 = r0 + r3 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 3, 11 }, /* insn indices of the two BPF_LD_MAP_FD above */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=16 off=9 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/int_ptr.c b/tools/testing/selftests/bpf/verifier/int_ptr.c
new file mode 100644
index 000000000..070893fb2
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/int_ptr.c
@@ -0,0 +1,160 @@
+{
+ "ARG_PTR_TO_LONG uninitialized", /* the res slot at r10 - 16 is never written before the call */
+ .insns = {
+ /* bpf_strtoul arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 8 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x00303036), /* bytes 0x36 0x30 0x30 0x00 if stored little-endian */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* 8-byte store of r0 at r7 */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* bpf_strtoul arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* bpf_strtoul arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* bpf_strtoul arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 16; no store follows */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ /* bpf_strtoul() */
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .errstr = "invalid indirect read from stack R4 off -16+0 size 8",
+},
+{
+ "ARG_PTR_TO_LONG half-uninitialized", /* only the first 4 bytes of the 8-byte res slot are written */
+ .insns = {
+ /* bpf_strtoul arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 8 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x00303036), /* bytes 0x36 0x30 0x30 0x00 if stored little-endian */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* 8-byte store of r0 at r7 */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* bpf_strtoul arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* bpf_strtoul arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* bpf_strtoul arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 16 */
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0), /* 4-byte store only; .errstr points at offset -16+4 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ /* bpf_strtoul() */
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .errstr = "invalid indirect read from stack R4 off -16+4 size 8",
+},
+{
+ "ARG_PTR_TO_LONG misaligned", /* res pointer is r10 - 20, which is not a multiple of 8 */
+ .insns = {
+ /* bpf_strtoul arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 8 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x00303036), /* bytes 0x36 0x30 0x30 0x00 if stored little-endian */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* 8-byte store of r0 at r7 */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* bpf_strtoul arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* bpf_strtoul arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* bpf_strtoul arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -12), /* r7 = r10 - 20 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0), /* 4-byte zero at r10 - 20 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 4), /* 8-byte zero at r10 - 16 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ /* bpf_strtoul() */
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .errstr = "misaligned stack access off (0x0; 0x0)+-20+0 size 8",
+},
+{
+ "ARG_PTR_TO_LONG size < sizeof(long)", /* res pointer is r10 - 4, so only 4 bytes remain below r10 */
+ .insns = {
+ /* bpf_strtoul arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -16), /* r7 = r10 - 16 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x00303036), /* bytes 0x36 0x30 0x30 0x00 if stored little-endian */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* 8-byte store of r0 at r7 */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* bpf_strtoul arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* bpf_strtoul arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* bpf_strtoul arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 12), /* r7 = r10 - 4 */
+ BPF_STX_MEM(BPF_W, BPF_REG_7, BPF_REG_0, 0), /* 4-byte store at r10 - 4 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ /* bpf_strtoul() */
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+ .errstr = "invalid indirect access to stack R4 off=-4 size=8",
+},
+{
+ "ARG_PTR_TO_LONG initialized", /* positive case: all 8 bytes of the res slot at r10 - 16 are written first */
+ .insns = {
+ /* bpf_strtoul arg1 (buf) */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 8 */
+ BPF_MOV64_IMM(BPF_REG_0, 0x00303036), /* bytes 0x36 0x30 0x30 0x00 if stored little-endian */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* 8-byte store of r0 at r7 */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+
+ /* bpf_strtoul arg2 (buf_len) */
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+
+ /* bpf_strtoul arg3 (flags) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+
+ /* bpf_strtoul arg4 (res) */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8), /* r7 = r10 - 16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0), /* full 8-byte store at r10 - 16 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_7),
+
+ /* bpf_strtoul() */
+ BPF_EMIT_CALL(BPF_FUNC_strtoul),
+
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL,
+},
diff --git a/tools/testing/selftests/bpf/verifier/jit.c b/tools/testing/selftests/bpf/verifier/jit.c
new file mode 100644
index 000000000..c33adf344
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/jit.c
@@ -0,0 +1,107 @@
+{
+ "jit: lsh, rsh, arsh by 1", /* any failed comparison exits early with r0 == 1; success returns 2 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* r0 = 1: value returned by the early exits */
+ BPF_MOV64_IMM(BPF_REG_1, 0xff),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 1), /* 64-bit shift left by 1 */
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 1), /* 32-bit shift left by 1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x3fc, 1), /* 0xff << 2 == 0x3fc: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 1), /* 64-bit logical shift right by 1 */
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_1, 1), /* 32-bit logical shift right by 1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0xff, 1), /* back to 0xff: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 1), /* 64-bit arithmetic shift right by 1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0x7f, 1), /* 0xff >> 1 == 0x7f: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jit: mov32 for ldimm64, 1", /* upper half of a 64-bit immediate must equal a 32-bit-sized 64-bit immediate */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_1, 0xfeffffffffffffffULL),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_1, 32), /* r1 = 0xfeffffff */
+ BPF_LD_IMM64(BPF_REG_2, 0xfeffffffULL),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), /* equal: skip "r0 = 1" and return 2 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jit: mov32 for ldimm64, 2", /* 0x1ffffffff and 0xffffffff differ only in bit 32 and must compare unequal */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_1, 0x1ffffffffULL),
+ BPF_LD_IMM64(BPF_REG_2, 0xffffffffULL),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_2, 1), /* if taken, "r0 = 2" is skipped and 1 is returned */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jit: various mul tests", /* five multiply checks; each mismatch returns 1, full success returns 2 */
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL), /* r2 = value the 64-bit products are compared against */
+ BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+ BPF_LD_IMM64(BPF_REG_1, 0xefefefULL),
+ BPF_ALU64_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), /* 64-bit mul with r0 as destination */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2), /* match: skip the "r0 = 1; exit" pair */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+ BPF_ALU64_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), /* same 64-bit mul with r3 as destination */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV32_REG(BPF_REG_2, BPF_REG_2), /* 32-bit move of r2 onto itself */
+ BPF_LD_IMM64(BPF_REG_0, 0xfefefeULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), /* 32-bit mul with r0 as destination */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_3, 0xfefefeULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_3, BPF_REG_1), /* same 32-bit mul with r3 as destination */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_3, BPF_REG_2, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LD_IMM64(BPF_REG_0, 0x952a7bbcULL), /* r0 = value the last 32-bit product is compared against */
+ BPF_LD_IMM64(BPF_REG_1, 0xfefefeULL),
+ BPF_LD_IMM64(BPF_REG_2, 0xeeff0d413122ULL),
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_2, BPF_REG_1), /* 32-bit mul whose destination holds a value wider than 32 bits */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jit: jsgt, jslt", /* 0x80000000 held in a 64-bit register must compare as signed-greater than 0 */
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_1, 0x80000000ULL), /* bit 31 set, bits 32..63 clear */
+ BPF_LD_IMM64(BPF_REG_2, 0x0ULL),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_1, BPF_REG_2, 2), /* r1 s> r2: skip the "r0 = 1; exit" pair */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_JMP_REG(BPF_JSLT, BPF_REG_2, BPF_REG_1, 2), /* r2 s< r1: skip the "r0 = 1; exit" pair */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
diff --git a/tools/testing/selftests/bpf/verifier/jmp32.c b/tools/testing/selftests/bpf/verifier/jmp32.c
new file mode 100644
index 000000000..1c857b2fb
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/jmp32.c
@@ -0,0 +1,866 @@
+{
+ "jset32: BPF_K", /* 32-bit JSET against immediates, run on three 64-bit inputs */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ /* reg, high bits shouldn't be tested */
+ BPF_JMP32_IMM(BPF_JSET, BPF_REG_7, -2, 1), /* taken (to the exit) if any of bits 1..31 is set */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the exit */
+ BPF_EXIT_INSN(),
+
+ BPF_JMP32_IMM(BPF_JSET, BPF_REG_7, 1, 1), /* bit 0 set: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { 1ULL << 63, }
+ },
+ { .retval = 2,
+ .data64 = { 1, }
+ },
+ { .retval = 2,
+ .data64 = { 1ULL << 63 | 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jset32: BPF_X", /* 32-bit JSET against registers whose bit 63 is set */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_LD_IMM64(BPF_REG_8, 0x8000000000000000), /* low 32 bits of r8 are zero */
+ BPF_JMP32_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), /* if taken, lands on the exit below */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the exit */
+ BPF_EXIT_INSN(),
+
+ BPF_LD_IMM64(BPF_REG_8, 0x8000000000000001), /* low 32 bits of r8 are 1 */
+ BPF_JMP32_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), /* bit 0 of r7 set: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { 1ULL << 63, }
+ },
+ { .retval = 2,
+ .data64 = { 1, }
+ },
+ { .retval = 2,
+ .data64 = { 1ULL << 63 | 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jset32: ignores upper bits", /* both registers have only bit 63 set: 64-bit JSET must match, 32-bit JSET must not */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_7, 0x8000000000000000),
+ BPF_LD_IMM64(BPF_REG_8, 0x8000000000000000),
+ BPF_JMP_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), /* 64-bit test: taken skips the exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP32_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1), /* 32-bit test: if taken, "r0 = 2" is skipped */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jset32: min/max deduction", /* after JSET 0x10 is taken, the JGE 0x10 below guards a load through r9 */
+ .insns = {
+ BPF_RAND_UEXT_R7, /* NOTE(review): macro defined outside this chunk; presumably puts an unknown zero-extended 32-bit value in r7 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP32_IMM(BPF_JSET, BPF_REG_7, 0x10, 1), /* bit 4 set: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x10, 1), /* r7 >= 0x10: skip the load */
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), /* NOTE(review): r9 is not written by any insn visible here */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jeq32: BPF_K", /* 32-bit JEQ against immediate -1, run on two inputs */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_IMM(BPF_JEQ, BPF_REG_7, -1, 1), /* equal: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 2, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { -2, }
+ },
+ { .retval = 2,
+ .data64 = { -1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jeq32: BPF_X", /* 32-bit JEQ against a register whose upper 32 bits differ from every input */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_LD_IMM64(BPF_REG_8, 0x7000000000000001), /* low 32 bits of r8 are 1 */
+ BPF_JMP32_REG(BPF_JEQ, BPF_REG_7, BPF_REG_8, 1), /* equal: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { 2, }
+ },
+ { .retval = 2,
+ .data64 = { 1, }
+ },
+ { .retval = 2,
+ .data64 = { 1ULL << 63 | 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jeq32: min/max deduction", /* after JEQ 0x10 is taken, the signed >= 0xf check below guards a load through r9 */
+ .insns = {
+ BPF_RAND_UEXT_R7, /* NOTE(review): macro defined outside this chunk; presumably puts an unknown zero-extended 32-bit value in r7 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP32_IMM(BPF_JEQ, BPF_REG_7, 0x10, 1), /* r7 == 0x10: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP32_IMM(BPF_JSGE, BPF_REG_7, 0xf, 1), /* r7 s>= 0xf: skip the load */
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), /* NOTE(review): r9 is not written by any insn visible here */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jne32: BPF_K", /* 32-bit JNE against immediate -1, run on two inputs */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_IMM(BPF_JNE, BPF_REG_7, -1, 1), /* not equal: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 2, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { 1, }
+ },
+ { .retval = 0,
+ .data64 = { -1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jne32: BPF_X", /* 32-bit JNE against a register with bit 63 set and low word 1 */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_LD_IMM64(BPF_REG_8, 0x8000000000000001), /* low 32 bits of r8 are 1 */
+ BPF_JMP32_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 1), /* not equal: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { 1, }
+ },
+ { .retval = 2,
+ .data64 = { 2, }
+ },
+ { .retval = 2,
+ .data64 = { 1ULL << 63 | 2, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jne32: min/max deduction", /* when the 32-bit JNE falls through, the 64-bit JNE on the same value guards a load through r9 */
+ .insns = {
+ BPF_RAND_UEXT_R7, /* NOTE(review): macro defined outside this chunk; presumably puts an unknown zero-extended 32-bit value in r7 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP32_IMM(BPF_JNE, BPF_REG_7, 0x10, 1), /* r7 != 0x10: skip the 64-bit compare and exit */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x10, 1), /* reached only when the 32-bit compare saw 0x10; taken lands on the load */
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0), /* NOTE(review): r9 is not written by any insn visible here */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jge32: BPF_K", /* 32-bit unsigned >= against immediate UINT_MAX - 1 */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, UINT_MAX - 1, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 2,
+ .data64 = { UINT_MAX - 1, }
+ },
+ { .retval = 0,
+ .data64 = { 0, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jge32: BPF_X", /* 32-bit unsigned >= against a register with a non-zero upper word */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LD_IMM64(BPF_REG_8, UINT_MAX | 1ULL << 32), /* low 32 bits = UINT_MAX, bit 32 also set */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_REG(BPF_JGE, BPF_REG_7, BPF_REG_8, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 0,
+ .data64 = { INT_MAX, }
+ },
+ { .retval = 0,
+ .data64 = { (UINT_MAX - 1) | 2ULL << 32, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jge32: min/max deduction", /* a second JGE against the same 32-bit bound guards a load through scalar r0 */
+ .insns = {
+ BPF_RAND_UEXT_R7, /* NOTE(review): macro defined outside this chunk; presumably puts an unknown zero-extended 32-bit value in r7 */
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32), /* low 32 bits = 0x7ffffff0, bit 32 also set */
+ BPF_JMP32_REG(BPF_JGE, BPF_REG_7, BPF_REG_8, 1), /* taken: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP32_IMM(BPF_JGE, BPF_REG_7, 0x7ffffff0, 1), /* same bound as an immediate: taken skips the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* load through r0, which holds the constant 2 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jgt32: BPF_K", /* 32-bit unsigned > against immediate UINT_MAX - 1 */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_IMM(BPF_JGT, BPF_REG_7, UINT_MAX - 1, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX - 1, }
+ },
+ { .retval = 0,
+ .data64 = { 0, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jgt32: BPF_X", /* 32-bit unsigned > against a register with a non-zero upper word */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LD_IMM64(BPF_REG_8, (UINT_MAX - 1) | 1ULL << 32), /* low 32 bits = UINT_MAX - 1, bit 32 also set */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX - 1, }
+ },
+ { .retval = 0,
+ .data64 = { (UINT_MAX - 1) | 2ULL << 32, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jgt32: min/max deduction", /* a 64-bit JGT against the same bound guards a load through scalar r0 */
+ .insns = {
+ BPF_RAND_UEXT_R7, /* NOTE(review): macro defined outside this chunk; presumably puts an unknown zero-extended 32-bit value in r7 */
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32), /* low 32 bits = 0x7ffffff0, bit 32 also set */
+ BPF_JMP32_REG(BPF_JGT, BPF_REG_7, BPF_REG_8, 1), /* taken: skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_7, 0x7ffffff0, 1), /* 64-bit compare (BPF_JMP_IMM, not JMP32): taken skips the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* load through r0, which holds the constant 2 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jle32: BPF_K", /* 32-bit unsigned <= against immediate INT_MAX */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, INT_MAX, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 2,
+ .data64 = { INT_MAX - 1, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 2,
+ .data64 = { INT_MAX, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jle32: BPF_X", /* 32-bit unsigned <= against a register with a non-zero upper word */
+ .insns = {
+ BPF_DIRECT_PKT_R2, /* NOTE(review): macro defined outside this chunk; presumably leaves a readable data pointer in r2 */
+ BPF_LD_IMM64(BPF_REG_8, (INT_MAX - 1) | 2ULL << 32), /* low 32 bits = INT_MAX - 1, bit 33 also set */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0), /* r7 = 64-bit load from r2 + 0 */
+ BPF_JMP32_REG(BPF_JLE, BPF_REG_7, BPF_REG_8, 1), /* taken: skip the exit and return 2 */
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3, /* NOTE(review): presumably one .retvals entry per run */
+ .retvals = {
+ { .retval = 0,
+ .data64 = { INT_MAX | 1ULL << 32, }
+ },
+ { .retval = 2,
+ .data64 = { INT_MAX - 2, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jle32: min/max deduction",
+ .insns = {
+ BPF_RAND_UEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JLE, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP32_IMM(BPF_JLE, BPF_REG_7, 0x7ffffff0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jlt32: BPF_K",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_IMM(BPF_JLT, BPF_REG_7, INT_MAX, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 0,
+ .data64 = { INT_MAX, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 2,
+ .data64 = { INT_MAX - 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jlt32: BPF_X",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LD_IMM64(BPF_REG_8, INT_MAX | 2ULL << 32),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_REG(BPF_JLT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 0,
+ .data64 = { INT_MAX | 1ULL << 32, }
+ },
+ { .retval = 0,
+ .data64 = { UINT_MAX, }
+ },
+ { .retval = 2,
+ .data64 = { (INT_MAX - 1) | 3ULL << 32, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jlt32: min/max deduction",
+ .insns = {
+ BPF_RAND_UEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JLT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_7, 0x7ffffff0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jsge32: BPF_K",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_IMM(BPF_JSGE, BPF_REG_7, -1, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { 0, }
+ },
+ { .retval = 2,
+ .data64 = { -1, }
+ },
+ { .retval = 0,
+ .data64 = { -2, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsge32: BPF_X",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LD_IMM64(BPF_REG_8, (__u32)-1 | 2ULL << 32),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_REG(BPF_JSGE, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { -1, }
+ },
+ { .retval = 2,
+ .data64 = { 0x7fffffff | 1ULL << 32, }
+ },
+ { .retval = 0,
+ .data64 = { -2, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsge32: min/max deduction",
+ .insns = {
+ BPF_RAND_UEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JSGE, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0x7ffffff0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jsgt32: BPF_K",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_IMM(BPF_JSGT, BPF_REG_7, -1, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 0,
+ .data64 = { (__u32)-2, }
+ },
+ { .retval = 0,
+ .data64 = { -1, }
+ },
+ { .retval = 2,
+ .data64 = { 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsgt32: BPF_X",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffffe | 1ULL << 32),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_REG(BPF_JSGT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 0,
+ .data64 = { 0x7ffffffe, }
+ },
+ { .retval = 0,
+ .data64 = { 0x1ffffffffULL, }
+ },
+ { .retval = 2,
+ .data64 = { 0x7fffffff, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsgt32: min/max deduction",
+ .insns = {
+ BPF_RAND_SEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, (__u32)(-2) | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JSGT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_7, -2, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jsle32: BPF_K",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_IMM(BPF_JSLE, BPF_REG_7, -1, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { (__u32)-2, }
+ },
+ { .retval = 2,
+ .data64 = { -1, }
+ },
+ { .retval = 0,
+ .data64 = { 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsle32: BPF_X",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffffe | 1ULL << 32),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_REG(BPF_JSLE, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { 0x7ffffffe, }
+ },
+ { .retval = 2,
+ .data64 = { (__u32)-1, }
+ },
+ { .retval = 0,
+ .data64 = { 0x7fffffff | 2ULL << 32, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jsle32: min/max deduction",
+ .insns = {
+ BPF_RAND_UEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, 0x7ffffff0 | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JSLE, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSLE, BPF_REG_7, 0x7ffffff0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jslt32: BPF_K",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_IMM(BPF_JSLT, BPF_REG_7, -1, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { (__u32)-2, }
+ },
+ { .retval = 0,
+ .data64 = { -1, }
+ },
+ { .retval = 0,
+ .data64 = { 1, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jslt32: BPF_X",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LD_IMM64(BPF_REG_8, 0x7fffffff | 1ULL << 32),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+ BPF_JMP32_REG(BPF_JSLT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 3,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { 0x7ffffffe, }
+ },
+ { .retval = 2,
+ .data64 = { 0xffffffff, }
+ },
+ { .retval = 0,
+ .data64 = { 0x7fffffff | 2ULL << 32, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jslt32: min/max deduction",
+ .insns = {
+ BPF_RAND_SEXT_R7,
+ BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, 2),
+ BPF_LD_IMM64(BPF_REG_8, (__u32)(-1) | 1ULL << 32),
+ BPF_JMP32_REG(BPF_JSLT, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP32_IMM(BPF_JSLT, BPF_REG_7, -1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jgt32: range bound deduction, reg op imm",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid),
+ BPF_JMP32_IMM(BPF_JGT, BPF_REG_0, 1, 5),
+ BPF_MOV32_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6),
+ BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_48b = { 4 },
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jgt32: range bound deduction, reg1 op reg2, reg1 unknown",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_JMP32_REG(BPF_JGT, BPF_REG_0, BPF_REG_2, 5),
+ BPF_MOV32_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6),
+ BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_48b = { 4 },
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jle32: range bound deduction, reg1 op reg2, reg2 unknown",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_get_cgroup_classid),
+ BPF_MOV32_IMM(BPF_REG_2, 1),
+ BPF_JMP32_REG(BPF_JLE, BPF_REG_2, BPF_REG_0, 5),
+ BPF_MOV32_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_6, 32),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_6),
+ BPF_ST_MEM(BPF_B, BPF_REG_8, 0, 0),
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_48b = { 4 },
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/jset.c b/tools/testing/selftests/bpf/verifier/jset.c
new file mode 100644
index 000000000..11fc68da7
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/jset.c
@@ -0,0 +1,169 @@
+{
+ "jset: functional",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+
+ /* reg, bit 63 or bit 0 set, taken */
+ BPF_LD_IMM64(BPF_REG_8, 0x8000000000000001),
+ BPF_JMP_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1),
+ BPF_EXIT_INSN(),
+
+ /* reg, bit 62, not taken */
+ BPF_LD_IMM64(BPF_REG_8, 0x4000000000000000),
+ BPF_JMP_REG(BPF_JSET, BPF_REG_7, BPF_REG_8, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* imm, any bit set, taken */
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_7, -1, 1),
+ BPF_EXIT_INSN(),
+
+ /* imm, bit 31 set, taken */
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_7, 0x80000000, 1),
+ BPF_EXIT_INSN(),
+
+ /* all good - return r0 == 2 */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .runs = 7,
+ .retvals = {
+ { .retval = 2,
+ .data64 = { (1ULL << 63) | (1U << 31) | (1U << 0), }
+ },
+ { .retval = 2,
+ .data64 = { (1ULL << 63) | (1U << 31), }
+ },
+ { .retval = 2,
+ .data64 = { (1ULL << 31) | (1U << 0), }
+ },
+ { .retval = 2,
+ .data64 = { (__u32)-1, }
+ },
+ { .retval = 2,
+ .data64 = { ~0x4000000000000000ULL, }
+ },
+ { .retval = 0,
+ .data64 = { 0, }
+ },
+ { .retval = 0,
+ .data64 = { ~0ULL, }
+ },
+ },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jset: sign-extend",
+ .insns = {
+ BPF_DIRECT_PKT_R2,
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_2, 0),
+
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_7, 0x80000000, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 2,
+ .data = { 1, 0, 0, 0, 0, 0, 0, 1, },
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "jset: known const compare",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .retval = 1,
+ .result = ACCEPT,
+},
+{
+ "jset: known const compare bad",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "!read_ok",
+ .result_unpriv = REJECT,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "jset: unknown const compare taken",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "!read_ok",
+ .result_unpriv = REJECT,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "jset: unknown const compare not taken",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "!read_ok",
+ .result_unpriv = REJECT,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "jset: half-known const compare",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_0, 2),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_0, 3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jset: range",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xff),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0xf0, 3),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 0x10, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSET, BPF_REG_1, 0x10, 1),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0x10, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_8, BPF_REG_9, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+ .errstr_unpriv = "R9 !read_ok",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/jump.c b/tools/testing/selftests/bpf/verifier/jump.c
new file mode 100644
index 000000000..6f951d1ff
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/jump.c
@@ -0,0 +1,375 @@
+{
+ "jump test 1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, -8),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 pointer comparison",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jump test 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 14),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 11),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 8),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -40, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -48, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 pointer comparison",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jump test 3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -8, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 19),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 1, 3),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -16, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 15),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 2, 3),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -32, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -32),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 11),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 3, 3),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -40, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -40),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 7),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 4, 3),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -48, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -48),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 5, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, -56, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -56),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 24 },
+ .errstr_unpriv = "R1 pointer comparison",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = -ENOENT,
+},
+{
+ "jump test 4",
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 2),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 3),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 pointer comparison",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jump test 5",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_3, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_2, -8),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 pointer comparison",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "jump test 6",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_0, BPF_REG_1, 16),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -20),
+ },
+ .result = ACCEPT,
+ .retval = 2,
+},
+{
+ "jump test 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 2, 16),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -20),
+ },
+ .result = ACCEPT,
+ .retval = 3,
+},
+{
+ "jump test 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_0, BPF_REG_1, 16),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -20),
+ },
+ .result = ACCEPT,
+ .retval = 3,
+},
+{
+ "jump/call test 9",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 2, 16),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -20),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "jump out of range from insn 1 to 4",
+},
+{
+ "jump/call test 10",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 2, 16),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -20),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "last insn is not an exit or jmp",
+},
+{
+ "jump/call test 11",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 2, 26),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -31),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 3,
+},
diff --git a/tools/testing/selftests/bpf/verifier/junk_insn.c b/tools/testing/selftests/bpf/verifier/junk_insn.c
new file mode 100644
index 000000000..89d690f19
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/junk_insn.c
@@ -0,0 +1,45 @@
+{
+ "junk insn",
+ .insns = {
+ BPF_RAW_INSN(0, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unknown opcode 00",
+ .result = REJECT,
+},
+{
+ "junk insn2",
+ .insns = {
+ BPF_RAW_INSN(1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_LDX uses reserved fields",
+ .result = REJECT,
+},
+{
+ "junk insn3",
+ .insns = {
+ BPF_RAW_INSN(-1, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unknown opcode ff",
+ .result = REJECT,
+},
+{
+ "junk insn4",
+ .insns = {
+ BPF_RAW_INSN(-1, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "unknown opcode ff",
+ .result = REJECT,
+},
+{
+ "junk insn5",
+ .insns = {
+ BPF_RAW_INSN(0x7f, -1, -1, -1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "BPF_ALU uses reserved fields",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/ld_abs.c b/tools/testing/selftests/bpf/verifier/ld_abs.c
new file mode 100644
index 000000000..f6599d2ec
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ld_abs.c
@@ -0,0 +1,286 @@
+{
+ "ld_abs: check calling conv, r1", /* R1 is written before BPF_LD_ABS and read after it; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_abs: check calling conv, r2", /* R2 is written before BPF_LD_ABS and read after it; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_abs: check calling conv, r3", /* R3 is written before BPF_LD_ABS and read after it; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R3 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_abs: check calling conv, r4", /* R4 is written before BPF_LD_ABS and read after it; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R4 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_abs: check calling conv, r5", /* R5 is written before BPF_LD_ABS and read after it; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R5 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_abs: check calling conv, r7", /* same pattern as the r1..r5 cases but with R7; here the read after BPF_LD_ABS is accepted */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_7, 0),
+ BPF_LD_ABS(BPF_W, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "ld_abs: tests on r6 and skb data reload helper", /* BPF_LD_ABS before and after a skb_vlan_push call, with R6 cleared and restored around it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), /* keep a copy of R6 in R7 ... */
+ BPF_MOV64_IMM(BPF_REG_6, 0), /* ... and zero R6 for the duration of the call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_vlan_push),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), /* restore R6 before the next BPF_LD_ABS group */
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 42 /* ultimate return value */,
+},
+{
+ "ld_abs: invalid op 1", /* BPF_LD_ABS with the BPF_DW size must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_DW, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "unknown opcode",
+},
+{
+ "ld_abs: invalid op 2", /* BPF_LD_IND with the BPF_DW size must be rejected */
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_0, 256),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_IND(BPF_DW, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "unknown opcode",
+},
+{
+ "ld_abs: nmap reduced", /* mixes BPF_LD_ABS/BPF_LD_IND with stack spills and compares packet bytes against constants; returns 256 on a full match, 0 otherwise */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_H, 12),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 28), /* mismatch: jump to the "return 0" tail */
+ BPF_LD_ABS(BPF_H, 12),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 26),
+ BPF_MOV32_IMM(BPF_REG_0, 18),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -64),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -64),
+ BPF_LD_IND(BPF_W, BPF_REG_7, 14), /* R7 (18) + 14 = packet offset 32 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -60),
+ BPF_MOV32_IMM(BPF_REG_0, 280971478), /* == 0x10bf48d6, cf. .data bytes 32..35 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -60),
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7), /* loaded word minus constant; zero means equal */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 15),
+ BPF_LD_ABS(BPF_H, 12),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0x806, 13),
+ BPF_MOV32_IMM(BPF_REG_0, 22),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -56),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -56),
+ BPF_LD_IND(BPF_H, BPF_REG_7, 14), /* R7 (22) + 14 = packet offset 36 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -52),
+ BPF_MOV32_IMM(BPF_REG_0, 17366), /* == 0x43d6, cf. .data bytes 36..37 */
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -48),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_10, -48),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -52),
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_7),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV32_IMM(BPF_REG_0, 256), /* all comparisons matched */
+ BPF_EXIT_INSN(),
+ BPF_MOV32_IMM(BPF_REG_0, 0), /* shared target of the mismatch jumps */
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x06, 0, /* bytes 12..13 hold 0x08 0x06, matching the 0x806 comparisons */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0x10, 0xbf, 0x48, 0xd6, 0x43, 0xd6, /* bytes 32..37 */
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 256,
+},
+{
+ "ld_abs: div + abs, test 1", /* builds an index from two loaded bytes and a 32-bit divide, then uses it for BPF_LD_IND */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, 3), /* .data[3] = 40 */
+ BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2), /* 40 / 2 = 20 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+ BPF_LD_ABS(BPF_B, 4), /* .data[4] = 50 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0), /* 20 + 50 = 70 */
+ BPF_LD_IND(BPF_B, BPF_REG_8, -70), /* 70 - 70 = offset 0, i.e. .data[0] = 10 */
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 10, 20, 30, 40, 50,
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 10,
+},
+{
+ "ld_abs: div + abs, test 2", /* same as test 1 except the second BPF_LD_ABS uses offset 128; expected return value is 0 */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, 3),
+ BPF_ALU64_IMM(BPF_MOV, BPF_REG_2, 2),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_8, BPF_REG_0),
+ BPF_LD_ABS(BPF_B, 128), /* offset 128 is well beyond the five bytes listed in .data */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_0),
+ BPF_LD_IND(BPF_B, BPF_REG_8, -70),
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 10, 20, 30, 40, 50,
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "ld_abs: div + abs, test 3", /* divides a loaded byte by a register holding 0; expected return value is 0 */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+ BPF_LD_ABS(BPF_B, 3),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), /* divisor R7 is 0 */
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 10, 20, 30, 40, 50,
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "ld_abs: div + abs, test 4", /* like test 3 but the BPF_LD_ABS offset is 256; expected return value is 0 */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_MOV, BPF_REG_7, 0),
+ BPF_LD_ABS(BPF_B, 256),
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_7), /* divisor R7 is 0 */
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 10, 20, 30, 40, 50,
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "ld_abs: vlan + abs, test 1", /* no inline instructions: the program comes from the fill helper named below */
+ .insns = { },
+ .data = {
+ 0x34,
+ },
+ .fill_helper = bpf_fill_ld_abs_vlan_push_pop, /* defined outside this fragment */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 0xbef,
+},
+{
+ "ld_abs: vlan + abs, test 2", /* BPF_LD_ABS before and after skb_vlan_push, run against a packet whose first byte is 0x34 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_6), /* keep a copy of R6 in R7 ... */
+ BPF_MOV64_IMM(BPF_REG_6, 0), /* ... and zero R6 for the duration of the call */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_skb_vlan_push),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7), /* restore R6 before the next BPF_LD_ABS group */
+ BPF_LD_ABS(BPF_B, 0),
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .data = {
+ 0x34,
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "ld_abs: jump around ld_abs", /* no inline instructions: the program comes from the fill helper named below */
+ .insns = { },
+ .data = {
+ 10, 11,
+ },
+ .fill_helper = bpf_fill_jump_around_ld_abs, /* defined outside this fragment */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 10, /* equals the first byte of .data */
+},
diff --git a/tools/testing/selftests/bpf/verifier/ld_dw.c b/tools/testing/selftests/bpf/verifier/ld_dw.c
new file mode 100644
index 000000000..0f18e62f0
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ld_dw.c
@@ -0,0 +1,45 @@
+{
+ "ld_dw: xor semi-random 64 bit imms, test 1", /* program is generated by bpf_fill_rand_ld_dw; the five ld_dw entries differ only in .retval */
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_rand_ld_dw,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 4090, /* NOTE(review): how the helper relates to this value is not visible here - confirm in the runner */
+},
+{
+ "ld_dw: xor semi-random 64 bit imms, test 2", /* program is generated by bpf_fill_rand_ld_dw; the five ld_dw entries differ only in .retval */
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_rand_ld_dw,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 2047, /* NOTE(review): how the helper relates to this value is not visible here - confirm in the runner */
+},
+{
+ "ld_dw: xor semi-random 64 bit imms, test 3", /* program is generated by bpf_fill_rand_ld_dw; the five ld_dw entries differ only in .retval */
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_rand_ld_dw,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 511, /* NOTE(review): how the helper relates to this value is not visible here - confirm in the runner */
+},
+{
+ "ld_dw: xor semi-random 64 bit imms, test 4", /* program is generated by bpf_fill_rand_ld_dw; the five ld_dw entries differ only in .retval */
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_rand_ld_dw,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 5, /* NOTE(review): how the helper relates to this value is not visible here - confirm in the runner */
+},
+{
+ "ld_dw: xor semi-random 64 bit imms, test 5", /* program is generated by bpf_fill_rand_ld_dw; the five ld_dw entries differ only in .retval */
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_rand_ld_dw,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1000000 - 6, /* NOTE(review): how the helper relates to this value is not visible here - confirm in the runner */
+},
diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c
new file mode 100644
index 000000000..f9297900c
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c
@@ -0,0 +1,146 @@
+{
+ "test1 ld_imm64", /* a conditional jump with offset 1 ahead of a run of BPF_LD_IMM64 instructions; must be rejected */
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .errstr_unpriv = "R1 pointer comparison", /* unprivileged runs are expected to fail on the R1 comparison instead */
+ .result = REJECT,
+},
+{
+ "test2 ld_imm64", /* same as test1 but without the BPF_MOV64_IMM before the exit; must be rejected */
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid BPF_LD_IMM insn",
+ .errstr_unpriv = "R1 pointer comparison", /* unprivileged runs are expected to fail on the R1 comparison instead */
+ .result = REJECT,
+},
+{
+ "test3 ld_imm64", /* a single raw BPF_LD | BPF_IMM | BPF_DW instruction placed in front of regular BPF_LD_IMM64s; must be rejected */
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test4 ld_imm64", /* a raw BPF_LD | BPF_IMM | BPF_DW instruction followed directly by exit; must be rejected */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test6 ld_imm64", /* raw BPF_LD | BPF_IMM | BPF_DW followed by an all-zero raw instruction; accepted */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
+ BPF_RAW_INSN(0, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "test7 ld_imm64", /* like test6 but both raw instructions carry 1 in their last argument; accepted, returns 1 */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 1),
+ BPF_RAW_INSN(0, 0, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "test8 ld_imm64", /* differs from test7 only in the 4th argument of the first raw instruction (1 instead of 0); must be rejected */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 1, 1),
+ BPF_RAW_INSN(0, 0, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "uses reserved fields",
+ .result = REJECT,
+},
+{
+ "test9 ld_imm64", /* differs from test7 only in the 4th argument of the second raw instruction (1 instead of 0); must be rejected */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 1),
+ BPF_RAW_INSN(0, 0, 0, 1, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test10 ld_imm64", /* second raw instruction has BPF_REG_1 as its 2nd argument; must be rejected */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 1),
+ BPF_RAW_INSN(0, BPF_REG_1, 0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test11 ld_imm64", /* second raw instruction has BPF_REG_1 as its 3rd argument; must be rejected */
+ .insns = {
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 1),
+ BPF_RAW_INSN(0, 0, BPF_REG_1, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test12 ld_imm64", /* first raw instruction has BPF_REG_1 as its 3rd argument; rejected with a bpf_map-related error */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(0, 0, 0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "not pointing to valid bpf_map",
+ .result = REJECT,
+},
+{
+ "test13 ld_imm64", /* both raw instructions have BPF_REG_1 as their 3rd argument; must be rejected */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, BPF_REG_1, 0, 1),
+ BPF_RAW_INSN(0, 0, BPF_REG_1, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid bpf_ld_imm64 insn",
+ .result = REJECT,
+},
+{
+ "test14 ld_imm64: reject 2nd imm != 0", /* BPF_PSEUDO_MAP_FD load whose second instruction carries a non-zero last argument */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, BPF_REG_1,
+ BPF_PSEUDO_MAP_FD, 0, 0),
+ BPF_RAW_INSN(0, 0, 0, 0, 0xfefefe), /* the non-zero value the test name refers to */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 1 }, /* index 1 is the raw BPF_LD | BPF_IMM | BPF_DW instruction above */
+ .errstr = "unrecognized bpf_ld_imm64 insn",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/ld_ind.c b/tools/testing/selftests/bpf/verifier/ld_ind.c
new file mode 100644
index 000000000..079734227
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ld_ind.c
@@ -0,0 +1,72 @@
+{
+ "ld_ind: check calling conv, r1", /* R1 is used as the BPF_LD_IND index and read afterwards; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_1, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_ind: check calling conv, r2", /* R2 is used as the BPF_LD_IND index and read afterwards; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_2, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_2, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_ind: check calling conv, r3", /* R3 is used as the BPF_LD_IND index and read afterwards; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_3, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R3 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_ind: check calling conv, r4", /* R4 is used as the BPF_LD_IND index and read afterwards; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_4, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_4), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R4 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_ind: check calling conv, r5", /* R5 is used as the BPF_LD_IND index and read afterwards; the read must be rejected */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_5, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_5, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_5), /* the read that should trigger the error */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R5 !read_ok",
+ .result = REJECT,
+},
+{
+ "ld_ind: check calling conv, r7", /* same pattern as the r1..r5 cases but with R7; accepted, and R7's value (1) is returned */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_7, 1),
+ BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/leak_ptr.c b/tools/testing/selftests/bpf/verifier/leak_ptr.c
new file mode 100644
index 000000000..d6eec17f2
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/leak_ptr.c
@@ -0,0 +1,67 @@
+{
+ "leak pointer into ctx 1", /* atomically adds a map pointer (R2) into ctx cb[0]; rejected for both privileged and unprivileged runs */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 2 }, /* index 2 is the BPF_LD_MAP_FD instruction */
+ .errstr_unpriv = "R2 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .errstr = "BPF_XADD stores into R1 ctx is not allowed",
+},
+{
+ "leak pointer into ctx 2", /* like "ctx 1" but the value added into ctx cb[0] is R10; rejected in both modes */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R10 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .errstr = "BPF_XADD stores into R1 ctx is not allowed",
+},
+{
+ "leak pointer into ctx 3", /* plain store of a map pointer (R2) into ctx cb[0]: rejected unprivileged, accepted privileged */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2,
+ offsetof(struct __sk_buff, cb[0])),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 }, /* index 1 is the BPF_LD_MAP_FD instruction */
+ .errstr_unpriv = "R2 leaks addr into ctx",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "leak pointer into map val", /* atomically adds the saved R1 (in R6) into a looked-up map value: rejected unprivileged, accepted privileged */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* keep the program's initial R1 in R6 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* skip the three stores below when the lookup returned 0 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
+ BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index 4 is the BPF_LD_MAP_FD instruction */
+ .errstr_unpriv = "R6 leaks addr into mem",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/loops1.c b/tools/testing/selftests/bpf/verifier/loops1.c
new file mode 100644
index 000000000..1af37187d
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/loops1.c
@@ -0,0 +1,206 @@
+{
+ "bounded loop, count to 4", /* R0 starts at 0 and is incremented by 1 while R0 < 4 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .retval = 4,
+},
+{
+ "bounded loop, count to 20", /* R0 starts at 0 and is incremented by 3 while R0 < 20 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 20, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "bounded loop, count from positive unknown to 4", /* loop counter starts from a get_prandom_u32 result that passed a signed >= 0 check */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_0, 0, 2), /* signed-negative values jump straight to the exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .retval = 4,
+},
+{
+ "bounded loop, count from totally unknown to 4", /* loop counter starts from an unchecked get_prandom_u32 result */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "bounded loop, count to 4 with equality", /* same counting loop, but the back-edge condition is R0 != 4 instead of R0 < 4 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 4, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "bounded loop, start in the middle", /* first pass enters the loop at its condition, skipping the increment; expected to be rejected */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(1), /* jump over the ADD, landing on the JLT */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "back-edge",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .retval = 4, /* NOTE(review): set although .result is REJECT - presumably unused; confirm */
+},
+{
+ "bounded loop containing a forward jump", /* counting loop whose body contains a conditional jump with offset 0 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_0, BPF_REG_0, 0), /* R0 == R0, offset 0: both outcomes continue at the next instruction */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -3), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .retval = 4,
+},
+{
+ "bounded loop that jumps out rather than in", /* the loop exit is a forward conditional jump; the back-edge is unconditional */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 1),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_6, 10000, 2), /* leave the loop: jump to the exit */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_JMP_A(-4), /* back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "infinite loop after a conditional jump", /* the fall-through path of the first jump enters an ADD/JMP_A loop that has no exit */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 5),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, 2), /* taken branch goes to the exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_JMP_A(-2), /* unconditional jump back to the ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "program is too large",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "bounded recursion", /* a subprogram that calls itself while R1 < 4; expected to be rejected */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* relative call to the ADD two instructions below */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 4, 1), /* R1 < 4: skip the exit and reach the recursive call */
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, -5), /* relative call back to the ADD above (same subprogram) */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "back-edge",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "infinite loop in two jumps", /* two jumps bounce between each other while R0 is never modified */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_A(0), /* offset 0: continues at the JLT below */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 4, -2), /* back to the JMP_A */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "loop detected",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "infinite loop: three-jump trick", /* three identical ADD/AND/JLT stages; the last one jumps back to the first */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1), /* masks R0 to 0 or 1, so the "< 2" test below always holds */
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, 1), /* skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, 1), /* skip the exit */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 1),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_0, 2, -11), /* back to the first ADD */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "loop detected",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "not-taken loop with back jump to 1st insn", /* back jump targets instruction 0 but its condition (123 == 4) is false */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 123),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 4, -2), /* would go back to the MOV above */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .retval = 123,
+},
+{
+ "taken loop with back jump to 1st insn", /* subprogram loop whose back-edge targets the subprogram's first instruction; sums 10+9+...+1 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* relative call to the ADD two instructions below */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1), /* accumulate R1 into R2 */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, -3), /* back to the ADD while R1 != 0 */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .retval = 55,
+},
+{
+ "taken loop with back jump to 1st insn, 2", /* same as the previous test but the back-edge uses BPF_JMP32_IMM */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* relative call to the ADD two instructions below */
+ BPF_EXIT_INSN(),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1), /* accumulate R1 into R2 */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_1, 1),
+ BPF_JMP32_IMM(BPF_JNE, BPF_REG_1, 0, -3), /* back to the ADD while R1 != 0 */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .retval = 55,
+},
diff --git a/tools/testing/selftests/bpf/verifier/lwt.c b/tools/testing/selftests/bpf/verifier/lwt.c
new file mode 100644
index 000000000..2cab6a396
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/lwt.c
@@ -0,0 +1,189 @@
+{
+ "invalid direct packet write for LWT_IN", /* bounds-checked one-byte store into packet data; rejected for this program type */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the store */
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), /* the packet write under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "cannot write into packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "invalid direct packet write for LWT_OUT", /* bounds-checked one-byte store into packet data; rejected for this program type */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the store */
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), /* the packet write under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "cannot write into packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_OUT,
+},
+{
+ "direct packet write for LWT_XMIT", /* the same bounds-checked packet store as the LWT_IN/LWT_OUT cases; accepted for LWT_XMIT */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the store */
+ BPF_STX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0), /* the packet write under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_XMIT,
+},
+{
+ "direct packet read for LWT_IN", /* bounds-checked one-byte load from packet data; accepted */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the load */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), /* the packet read under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "direct packet read for LWT_OUT", /* bounds-checked one-byte load from packet data; accepted */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the load */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), /* the packet read under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_OUT,
+},
+{
+ "direct packet read for LWT_XMIT", /* bounds-checked one-byte load from packet data; accepted */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1), /* data + 8 > data_end: skip the load */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0), /* the packet read under test */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_XMIT,
+},
+{
+ "overlapping checks for direct packet access", /* two bounds checks (data + 8, then data + 6) precede a 2-byte load at offset 6 */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 4), /* first check failed: jump to "R0 = 0; exit" */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* second check failed: skip the load */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_2, 6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_XMIT,
+},
+{
+ "make headroom for LWT_XMIT", /* calls skb_change_head twice (34, then 42) on the same context, kept in R6 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_MOV64_IMM(BPF_REG_2, 34),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_skb_change_head),
+ /* split for s390 to succeed */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* reload the saved context into R1 for the second call */
+ BPF_MOV64_IMM(BPF_REG_2, 42),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_skb_change_head),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_XMIT,
+},
+{
+ "invalid access of tc_classid for LWT_IN", /* 4-byte read of __sk_buff.tc_classid must be rejected */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access", /* NOTE(review): no .prog_type is set here, unlike the other LWT tests in this file - confirm the runner's default type is intended */
+},
+{
+ "invalid access of tc_classid for LWT_OUT", /* 4-byte read of __sk_buff.tc_classid must be rejected */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access", /* NOTE(review): no .prog_type is set here, unlike the other LWT tests in this file - confirm the runner's default type is intended */
+},
+{
+ "invalid access of tc_classid for LWT_XMIT", /* 4-byte read of __sk_buff.tc_classid must be rejected */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access", /* NOTE(review): no .prog_type is set here, unlike the other LWT tests in this file - confirm the runner's default type is intended */
+},
+{
+ "check skb->tc_classid half load not permitted for lwt prog", /* 2-byte read of tc_classid must be rejected; the byte offset used depends on host endianness */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid)),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, tc_classid) + 2),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid bpf_context access",
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
diff --git a/tools/testing/selftests/bpf/verifier/map_in_map.c b/tools/testing/selftests/bpf/verifier/map_in_map.c
new file mode 100644
index 000000000..2798927ee
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/map_in_map.c
@@ -0,0 +1,62 @@
+{
+ "map in map access", /* looks up the outer map, null-checks the result, then uses it as the map argument of a second lookup */
+ .insns = {
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* first lookup returned 0: skip the second lookup */
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* result of the first lookup becomes the map argument */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_in_map = { 3 }, /* index 3 is the BPF_LD_MAP_FD instruction */
+ .result = ACCEPT,
+},
+{
+ "invalid inner map pointer", /* like "map in map access" but 8 is added to the inner map pointer before the second lookup; must be rejected */
+ .insns = {
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* first lookup returned 0: skip the second lookup */
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* the pointer arithmetic that should trigger the error */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_in_map = { 3 }, /* index 3 is the BPF_LD_MAP_FD instruction */
+ .errstr = "R1 pointer arithmetic on map_ptr prohibited",
+ .result = REJECT,
+},
+{
+ "forgot null checking on the inner map pointer", /* like "map in map access" but without the JEQ null check between the two lookups; must be rejected */
+ .insns = {
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(0, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked lookup result used as the map argument */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_in_map = { 3 }, /* index 3 is the BPF_LD_MAP_FD instruction */
+ .errstr = "R1 type=map_value_or_null expected=map_ptr",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/map_ptr.c b/tools/testing/selftests/bpf/verifier/map_ptr.c
new file mode 100644
index 000000000..2f551cb24
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/map_ptr.c
@@ -0,0 +1,98 @@
+{
+ "bpf_map_ptr: read with negative offset rejected", /* 8-byte load at offset -8 from a map pointer; rejected in both modes, with different messages */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* overwrites the R1 value set on the previous line */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index 1 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+ .result = REJECT,
+ .errstr = "R1 is bpf_array invalid negative access: off=-8",
+},
+{
+ "bpf_map_ptr: write rejected", /* 8-byte store through a map pointer; rejected in both modes, with different messages */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_2, 0), /* the store under test */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index 3 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+ .result = REJECT,
+ .errstr = "only read from bpf_array is supported",
+},
+{
+ "bpf_map_ptr: read non-existent field rejected", /* 4-byte load at offset 1 from a map pointer; rejected in both modes */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1, 1), /* off 1 / size 4, as quoted in .errstr below */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index 1 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+ .result = REJECT,
+ .errstr = "cannot access ptr member ops with moff 0 in struct bpf_map with off 1 size 4",
+},
+{
+ "bpf_map_ptr: read ops field accepted", /* 8-byte load at offset 0 from a map pointer: accepted privileged, rejected unprivileged */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index 1 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "bpf_array access is allowed only to CAP_PERFMON and CAP_SYS_ADMIN",
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "bpf_map_ptr: r = 0, map_ptr = map_ptr + r", /* adds a zero-valued register to a map pointer before a lookup: accepted privileged, rejected unprivileged */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* map pointer (R1) += 0 (R0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 4 }, /* index 4 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R1 has pointer with unsupported alu operation",
+ .result = ACCEPT,
+},
+{
+ "bpf_map_ptr: r = 0, r = r + map_ptr", /* adds a map pointer to a zero-valued register before a lookup: accepted privileged, rejected unprivileged */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* 0 (R1) += map pointer (R0) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 4 }, /* index 4 is the BPF_LD_MAP_FD instruction */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 has pointer with unsupported alu operation",
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c
new file mode 100644
index 000000000..1f2b8c4cb
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/map_ptr_mixing.c
@@ -0,0 +1,100 @@
+{
+ "calls: two calls returning different map pointers for lookup (hash, array)",
+ .insns = {
+ /* main prog */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_CALL_REL(11),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_CALL_REL(12),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ /* subprog 1 */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* subprog 2 */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_hash_48b = { 13 },
+ .fixup_map_array_48b = { 16 },
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "calls: two calls returning different map pointers for lookup (hash, map in map)",
+ .insns = {
+ /* main prog */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_CALL_REL(11),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_CALL_REL(12),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ /* subprog 1 */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ /* subprog 2 */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .fixup_map_in_map = { 16 },
+ .fixup_map_array_48b = { 13 },
+ .result = REJECT,
+ .errstr = "only read from bpf_array is supported",
+},
+{
+ "cond: two branches returning different map pointers for lookup (tail, tail)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 3),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5 },
+ .fixup_prog2 = { 2 },
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "tail_call abusing map_ptr",
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "cond: two branches returning same map pointers for lookup (tail, tail)",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_1,
+ offsetof(struct __sk_buff, mark)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 3),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog2 = { 2, 5 },
+ .result_unpriv = ACCEPT,
+ .result = ACCEPT,
+ .retval = 42,
+},
diff --git a/tools/testing/selftests/bpf/verifier/map_ret_val.c b/tools/testing/selftests/bpf/verifier/map_ret_val.c
new file mode 100644
index 000000000..bdd0e8d18
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/map_ret_val.c
@@ -0,0 +1,65 @@
+{
+ "invalid map_fd for function call",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_delete_elem),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "fd 0 is not pointing to valid bpf_map",
+ .result = REJECT,
+},
+{
+ "don't check return value before access",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .errstr = "R0 invalid mem access 'map_value_or_null'",
+ .result = REJECT,
+},
+{
+ "access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .errstr = "misaligned value access",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
+{
+ "sometimes access memory with incorrect alignment",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .errstr = "R0 invalid mem access",
+ .errstr_unpriv = "R0 leaks addr",
+ .result = REJECT,
+ .flags = F_LOAD_WITH_STRICT_ALIGNMENT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/masking.c b/tools/testing/selftests/bpf/verifier/masking.c
new file mode 100644
index 000000000..6e1358c54
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/masking.c
@@ -0,0 +1,322 @@
+{
+ "masking, test out of bounds 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 5),
+ BPF_MOV32_IMM(BPF_REG_2, 5 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 1),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 3",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0xffffffff),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 4",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0xffffffff),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 5",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 6",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 5),
+ BPF_MOV32_IMM(BPF_REG_2, 5 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 9",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0xffffffff),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 10",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0xffffffff),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 11",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test out of bounds 12",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test in bounds 1",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 4),
+ BPF_MOV32_IMM(BPF_REG_2, 5 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 4,
+},
+{
+ "masking, test in bounds 2",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test in bounds 3",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0xfffffffe),
+ BPF_MOV32_IMM(BPF_REG_2, 0xffffffff - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0xfffffffe,
+},
+{
+ "masking, test in bounds 4",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0xabcde),
+ BPF_MOV32_IMM(BPF_REG_2, 0xabcdef - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0xabcde,
+},
+{
+ "masking, test in bounds 5",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 0),
+ BPF_MOV32_IMM(BPF_REG_2, 1 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "masking, test in bounds 6",
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_1, 46),
+ BPF_MOV32_IMM(BPF_REG_2, 47 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_1),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_1, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 46,
+},
+{
+ "masking, test in bounds 7",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, -46),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 47 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 46,
+},
+{
+ "masking, test in bounds 8",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, -47),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_3, -1),
+ BPF_MOV32_IMM(BPF_REG_2, 47 - 1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_3),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_2, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0),
+ BPF_ALU64_IMM(BPF_ARSH, BPF_REG_2, 63),
+ BPF_ALU64_REG(BPF_AND, BPF_REG_3, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
diff --git a/tools/testing/selftests/bpf/verifier/meta_access.c b/tools/testing/selftests/bpf/verifier/meta_access.c
new file mode 100644
index 000000000..205292b8d
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/meta_access.c
@@ -0,0 +1,235 @@
+{
+ "meta access, test1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_0, 8),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet, off=-8",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test3",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test4",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_4),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test5",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_4, 3),
+ BPF_MOV64_IMM(BPF_REG_2, -8),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_xdp_adjust_meta),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R3 !read_ok",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test6",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test7",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 8),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test8",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test9",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 0xFFFF),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test10",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_IMM(BPF_REG_5, 42),
+ BPF_MOV64_IMM(BPF_REG_6, 24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+ BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_5, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid access to packet",
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test11",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_IMM(BPF_REG_5, 42),
+ BPF_MOV64_IMM(BPF_REG_6, 24),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8),
+ BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8),
+ BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_5, BPF_REG_5, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "meta access, test12",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_3),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_4, 5),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_3, 0),
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, 16),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_5, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
diff --git a/tools/testing/selftests/bpf/verifier/perf_event_sample_period.c b/tools/testing/selftests/bpf/verifier/perf_event_sample_period.c
new file mode 100644
index 000000000..471c1a595
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/perf_event_sample_period.c
@@ -0,0 +1,59 @@
+{
+ "check bpf_perf_event_data->sample_period byte load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period)),
+#else
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period) + 7),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+ "check bpf_perf_event_data->sample_period half load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period)),
+#else
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period) + 6),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+ "check bpf_perf_event_data->sample_period word load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period)),
+#else
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period) + 4),
+#endif
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+ "check bpf_perf_event_data->sample_period dword load permitted",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/precise.c b/tools/testing/selftests/bpf/verifier/precise.c
new file mode 100644
index 000000000..6dc8003ff
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/precise.c
@@ -0,0 +1,194 @@
+{
+ "precise: test 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_6, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .fixup_map_array_48b = { 1 },
+ .result = VERBOSE_ACCEPT,
+ .errstr =
+ "26: (85) call bpf_probe_read_kernel#113\
+ last_idx 26 first_idx 20\
+ regs=4 stack=0 before 25\
+ regs=4 stack=0 before 24\
+ regs=4 stack=0 before 23\
+ regs=4 stack=0 before 22\
+ regs=4 stack=0 before 20\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 19 first_idx 10\
+ regs=4 stack=0 before 19\
+ regs=200 stack=0 before 18\
+ regs=300 stack=0 before 17\
+ regs=201 stack=0 before 15\
+ regs=201 stack=0 before 14\
+ regs=200 stack=0 before 13\
+ regs=200 stack=0 before 12\
+ regs=200 stack=0 before 11\
+ regs=200 stack=0 before 10\
+ parent already had regs=0 stack=0 marks",
+},
+{
+ "precise: test 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_6, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8), /* map_value_ptr -= map_value_ptr */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_9),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 8, 1),
+ BPF_EXIT_INSN(),
+
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 1), /* R2=inv(umin=1, umax=8) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_probe_read_kernel),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+ .fixup_map_array_48b = { 1 },
+ .result = VERBOSE_ACCEPT,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr =
+ "26: (85) call bpf_probe_read_kernel#113\
+ last_idx 26 first_idx 22\
+ regs=4 stack=0 before 25\
+ regs=4 stack=0 before 24\
+ regs=4 stack=0 before 23\
+ regs=4 stack=0 before 22\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 20 first_idx 20\
+ regs=4 stack=0 before 20\
+ parent didn't have regs=4 stack=0 marks\
+ last_idx 19 first_idx 17\
+ regs=4 stack=0 before 19\
+ regs=200 stack=0 before 18\
+ regs=300 stack=0 before 17\
+ parent already had regs=0 stack=0 marks",
+},
+{
+ "precise: cross frame pruning",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_8, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_8, 1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_IMM(BPF_REG_9, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_MOV64_IMM(BPF_REG_9, 1),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_8, 1, 1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "precise: ST insn causing spi > allocated_stack",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_3, -8, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "5: (2d) if r4 > r0 goto pc+0\
+ last_idx 5 first_idx 5\
+ parent didn't have regs=10 stack=0 marks\
+ last_idx 4 first_idx 2\
+ regs=10 stack=0 before 4\
+ regs=10 stack=0 before 3\
+ regs=0 stack=1 before 2\
+ last_idx 5 first_idx 5\
+ parent didn't have regs=1 stack=0 marks",
+ .result = VERBOSE_ACCEPT,
+ .retval = -1,
+},
+{
+ "precise: STX insn causing spi > allocated_stack",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_3, 123, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8),
+ BPF_MOV64_IMM(BPF_REG_0, -1),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_4, BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "last_idx 6 first_idx 6\
+ parent didn't have regs=10 stack=0 marks\
+ last_idx 5 first_idx 3\
+ regs=10 stack=0 before 5\
+ regs=10 stack=0 before 4\
+ regs=0 stack=1 before 3\
+ last_idx 6 first_idx 6\
+ parent didn't have regs=1 stack=0 marks\
+ last_idx 5 first_idx 3\
+ regs=1 stack=0 before 5",
+ .result = VERBOSE_ACCEPT,
+ .retval = -1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
new file mode 100644
index 000000000..fc4e30126
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/prevent_map_lookup.c
@@ -0,0 +1,29 @@
+{
+ "prevent map lookup in stack trace",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_stacktrace = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 7 into func bpf_map_lookup_elem",
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+ "prevent map lookup in prog array",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog2 = { 3 },
+ .result = REJECT,
+ .errstr = "cannot pass map_type 3 into func bpf_map_lookup_elem",
+},
diff --git a/tools/testing/selftests/bpf/verifier/raw_stack.c b/tools/testing/selftests/bpf/verifier/raw_stack.c
new file mode 100644
index 000000000..cc8e8c3cd
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/raw_stack.c
@@ -0,0 +1,305 @@
+{
+ "raw_stack: no skb_load_bytes", /* reads a stack slot that no instruction has written */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ /* Call to skb_load_bytes() omitted, so nothing fills the slot. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* r0 = *(u64 *)(r6 + 0) */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid read from stack R6 off=-8 size=8",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, negative len", /* helper call with r4 = -8 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, -8), /* r4 = -8, the value the expected error complains about */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* r0 = *(u64 *)(r6 + 0) */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R4 min value is negative",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, negative len 2", /* same as "negative len" but r4 = ~0 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, ~0), /* r4 = ~0 (all bits set) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* r0 = *(u64 *)(r6 + 0) */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R4 min value is negative",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, zero len", /* helper call with r4 = 0 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 0), /* r4 = 0 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* r0 = *(u64 *)(r6 + 0) */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid zero-sized read",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, no init", /* slot is never stored to by the program, only passed to the helper */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* read the slot back after the call */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, init", /* like "no init", but the slot is stored to before the call */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xcafe), /* *(u64 *)(r6 + 0) = 0xcafe */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* read the slot back after the call */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, spilled regs around bounds", /* r1 saved on both sides of the helper's 8-byte buffer */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), /* r6 = r10 - 16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* *(u64 *)(r6 - 8) = r1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* *(u64 *)(r6 + 8) = r1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6, the slot between the two saved copies */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8), /* reload the copy below the buffer */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8), /* reload the copy above the buffer */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)), /* both reloaded values are dereferenced as __sk_buff */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2,
+ offsetof(struct __sk_buff, priority)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2), /* r0 += r2 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, spilled regs corruption", /* r1 saved in the very slot handed to the helper */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* r6 = r10 - 8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* *(u64 *)(r6 + 0) = r1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6: same slot as the saved r1 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* reload the slot after the call */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)), /* dereferencing the reloaded value is the expected failure */
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R0 invalid mem access 'inv'",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "raw_stack: skb_load_bytes, spilled regs corruption 2", /* three saved copies of r1; the middle one shares the helper's buffer */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), /* r6 = r10 - 16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* *(u64 *)(r6 - 8) = r1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* *(u64 *)(r6 + 0) = r1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* *(u64 *)(r6 + 8) = r1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6: the middle slot */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, 0), /* r3 = contents of the middle slot */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2,
+ offsetof(struct __sk_buff, priority)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_3,
+ offsetof(struct __sk_buff, pkt_type)), /* dereference of r3: the access named in .errstr */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R3 invalid mem access 'inv'",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "raw_stack: skb_load_bytes, spilled regs + data", /* middle slot is only used as a number, never dereferenced */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -16), /* r6 = r10 - 16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, -8), /* *(u64 *)(r6 - 8) = r1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* *(u64 *)(r6 + 0) = r1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 8), /* *(u64 *)(r6 + 8) = r1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6: the middle slot */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_6, 8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_6, 0), /* r3 = contents of the middle slot */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_2,
+ offsetof(struct __sk_buff, priority)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), /* r3 is added, not used as an address */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 1", /* buffer pointer at r10 - 513 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -513), /* r6 = r10 - 513 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid indirect access to stack R3 off=-513 size=8",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 2", /* 8-byte buffer starting at r10 - 1 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -1), /* r6 = r10 - 1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 8), /* r4 = 8 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid indirect access to stack R3 off=-1 size=8",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 3", /* 0xffffffff used for both the pointer offset and r4 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 0xffffffff), /* r6 = r10 + imm 0xffffffff */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), /* r4 = imm 0xffffffff; .errstr reports it as negative */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R4 min value is negative",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 4", /* r4 = 0x7fffffff with the buffer at r10 - 1 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -1), /* r6 = r10 - 1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 0x7fffffff), /* r4 = 0x7fffffff */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R4 unbounded memory access, use 'var &= const' or 'if (var < const)'",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 5", /* r4 = 0x7fffffff with the buffer at r10 - 512 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -512), /* r6 = r10 - 512 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 0x7fffffff), /* r4 = 0x7fffffff */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R4 unbounded memory access, use 'var &= const' or 'if (var < const)'",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, invalid access 6", /* r4 = 0 with the buffer at r10 - 512 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -512), /* r6 = r10 - 512 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 0), /* r4 = 0 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid zero-sized read",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "raw_stack: skb_load_bytes, large access", /* 512-byte buffer starting at r10 - 512 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 4), /* r2 = 4 */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -512), /* r6 = r10 - 512 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_6), /* r3 = r6 */
+ BPF_MOV64_IMM(BPF_REG_4, 512), /* r4 = 512 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_6, 0), /* read the first 8 bytes back */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/raw_tp_writable.c b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c
new file mode 100644
index 000000000..95b5d70a1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/raw_tp_writable.c
@@ -0,0 +1,34 @@
+{
+ "raw_tracepoint_writable: reject variable offset", /* store through a buffer pointer offset by a map value */
+ .insns = {
+ /* r6 is our tp buffer */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* map fd 0 is presumably patched via .fixup_map_hash_8b */
+ /* move the key (== 0) to r10-8 */
+ BPF_MOV32_IMM(BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0),
+ /* lookup in the map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+ BPF_FUNC_map_lookup_elem),
+
+ /* exit clean if null */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+
+ /* shift the buffer pointer to a variable location */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), /* r0 = 32-bit value read from the map element */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_6, BPF_REG_0), /* r6 += r0 */
+ /* clobber whatever's there */
+ BPF_MOV64_IMM(BPF_REG_7, 4242),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_7, 0), /* the store through the shifted pointer */
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1, }, /* 1 = position of BPF_LD_MAP_FD in .insns */
+ .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ .errstr = "R6 invalid variable buffer offset: off=0, var_off=(0x0; 0xffffffff)", /* NOTE(review): no .result is set here, unlike neighbouring tests - confirm the harness default means "rejected" */
+},
diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c
new file mode 100644
index 000000000..525d810b1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c
@@ -0,0 +1,939 @@
+{
+ "reference tracking: leak potential reference", /* lookup result copied to r6, then exit without release */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; result is taken from r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: leak potential reference to sock_common", /* skc_lookup_tcp variant of the leak test */
+ .insns = {
+ BPF_SK_LOOKUP(skc_lookup_tcp), /* macro defined outside this chunk; result is taken from r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* leak reference */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: leak potential reference on stack", /* reference stored at r10 - 8, never released */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 = r10 - 8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), /* *(u64 *)(r4 + 0) = r0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: leak potential reference on stack 2", /* stored reference is then overwritten with 0 */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 = r10 - 8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), /* *(u64 *)(r4 + 0) = r0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), /* *(u64 *)(r4 + 0) = 0, replacing the stored reference */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: zero potential reference", /* r0 overwritten with 0 instead of being released */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference: r0 = 0 drops the only copy */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: zero potential reference to sock_common", /* skc_lookup_tcp variant of the zeroing test */
+ .insns = {
+ BPF_SK_LOOKUP(skc_lookup_tcp),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* leak reference: r0 = 0 drops the only copy */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: copy and zero potential references", /* both copies (r0 and r7) are overwritten with 0 */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = r0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+ BPF_MOV64_IMM(BPF_REG_7, 0), /* leak reference: the last copy is zeroed */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking: release reference without check", /* sk_release() called with no NULL test */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ /* reference in r0 may be NULL */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* r2 = 0 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "type=sock_or_null expected=sock",
+ .result = REJECT,
+},
+{
+ "reference tracking: release reference to sock_common without check", /* skc_lookup_tcp variant, no NULL test */
+ .insns = {
+ BPF_SK_LOOKUP(skc_lookup_tcp),
+ /* reference in r0 may be NULL */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* r2 = 0 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "type=sock_common_or_null expected=sock",
+ .result = REJECT,
+},
+{
+ "reference tracking: release reference", /* NULL-checked release */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release call */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: release reference to sock_common", /* NULL-checked release, skc_lookup_tcp variant */
+ .insns = {
+ BPF_SK_LOOKUP(skc_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release call */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: release reference 2", /* NULL check written with JNE and an early exit */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if (r0 != 0) skip the early exit */
+ BPF_EXIT_INSN(), /* r0 == 0 path */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: release reference twice", /* second sk_release() on a saved copy of the same pointer */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0, copy kept across the first call */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the first release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* second release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "type=inv expected=sock",
+ .result = REJECT,
+},
+{
+ "reference tracking: release reference twice inside branch", /* both releases sit on the non-NULL path */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* goto end: skips both releases when r0 == 0 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* second release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "type=inv expected=sock",
+ .result = REJECT,
+},
+{
+ "reference tracking: alloc, check, free in one subbranch", /* one branch exits while still holding the lookup result */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)), /* r2 = skb->data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)), /* r3 = skb->data_end */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16), /* r0 = r2 + 16 */
+ /* if (offsetof(skb, mark) > data_len) exit; (the code compares r0 against r3) */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+ offsetof(struct __sk_buff, mark)), /* r6 = 32-bit load through r2 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 1), /* mark == 0? */
+ /* Leak reference in R0 */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "reference tracking: alloc, check, free in both subbranches", /* each branch NULL-checks and releases */
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)), /* r2 = skb->data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)), /* r3 = skb->data_end */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 16), /* r0 = r2 + 16 */
+ /* if (offsetof(skb, mark) > data_len) exit; (the code compares r0 against r3) */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_0, BPF_REG_3, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_6, BPF_REG_2,
+ offsetof(struct __sk_buff, mark)), /* r6 = 32-bit load through r2 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), /* mark == 0? then use the second release sequence */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* sk NULL? */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "reference tracking in call: free reference in subprog", /* callee NULL-checks and releases the caller's reference */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), /* call with offset 2: targets subprog 1 below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), /* r2 = r1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 1), /* if (r2 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking in call: free reference in subprog and outside", /* released by the callee, then again by the caller */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* unchecked reference */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0, copy kept across the call */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), /* call with offset 3: targets subprog 1 below */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* release in the caller after the callee returned */
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), /* r2 = r1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 1), /* if (r2 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "type=inv expected=sock",
+ .result = REJECT,
+},
+{
+ "reference tracking in call: alloc & leak reference in subprog", /* callee looks up a socket; nobody releases it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 = r10 - 8, handed to the callee */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 3), /* call with offset 3: targets subprog 1 below */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_4), /* r6 = r4, kept across the lookup */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ /* spill unchecked sk_ptr into stack of caller */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking in call: alloc in subprog, release outside", /* callee returns the socket; caller releases it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 4), /* call with offset 4: targets subprog 1 below */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = value returned by the callee */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_EXIT_INSN(), /* return sk */
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = POINTER_VALUE,
+ .result = ACCEPT,
+},
+{
+ "reference tracking in call: sk_ptr leak into caller stack", /* two call levels; the socket is stored but never released */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 = r10 - 8 in the main program */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), /* call with offset 2: targets subprog 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8), /* r5 = r10 - 8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), /* save r4 so it can be reloaded after the call */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), /* call with offset 5: targets subprog 2 */
+ /* spill unchecked sk_ptr into stack of caller */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0), /* reload the saved r4 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), /* *(u64 *)(r4 + 0) = r0 */
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{
+ "reference tracking in call: sk_ptr spill into caller stack", /* like the leak variant, but NULL-checked and released */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), /* r4 = r10 - 8 in the main program */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 2), /* call with offset 2: targets subprog 1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+
+ /* subprog 1 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8), /* r5 = r10 - 8 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_5, BPF_REG_4, 0), /* save r4 so it can be reloaded after the call */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 8), /* call with offset 8: targets subprog 2 */
+ /* spill unchecked sk_ptr into stack of caller */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_5, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_5, 0), /* reload the saved r4 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_0, 0), /* *(u64 *)(r4 + 0) = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) skip reload and release */
+ /* now the sk_ptr is verified, free the reference */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_4, 0), /* r1 = pointer read back from the slot */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+
+ /* subprog 2 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: allow LD_ABS", /* LD_ABS issued only after the release sequence */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_LD_ABS(BPF_B, 0), /* byte, half-word and word loads at offset 0 */
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: forbid LD_ABS while holding reference", /* LD_ABS placed between lookup and release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_LD_ABS(BPF_B, 0), /* issued before any release */
+ BPF_LD_ABS(BPF_H, 0),
+ BPF_LD_ABS(BPF_W, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "BPF_LD_[ABS|IND] cannot be mixed with socket references",
+ .result = REJECT,
+},
+{
+ "reference tracking: allow LD_IND", /* LD_IND issued only after the release sequence */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* if (r0 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_7, 1), /* r7 = 1 */
+ BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7), /* r0 = r7, matching .retval = 1 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1,
+},
+{
+ "reference tracking: forbid LD_IND while holding reference", /* LD_IND placed between lookup and release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* r6 = r1 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = r0, copy of the lookup result */
+ BPF_MOV64_IMM(BPF_REG_7, 1), /* r7 = 1 */
+ BPF_LD_IND(BPF_W, BPF_REG_7, -0x200000), /* issued before any release */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_7),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_4), /* r1 = r4 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), /* if (r1 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "BPF_LD_[ABS|IND] cannot be mixed with socket references",
+ .result = REJECT,
+},
+{
+ "reference tracking: check reference or tail call", /* tail call only on the path where the lookup result is 0 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* r7 = r1, kept for the tail call */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ /* if (sk) bpf_sk_release() */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 7), /* r1 != 0: jump forward past the tail-call sequence */
+ /* bpf_tail_call() */
+ BPF_MOV64_IMM(BPF_REG_3, 3), /* r3 = 3 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map; the 0 fd is presumably patched via .fixup_prog1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), /* r1 = r7 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 17 }, /* presumably the index of BPF_LD_MAP_FD once BPF_SK_LOOKUP is expanded */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: release reference then tail call", /* tail call follows the NULL-checked release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* r7 = r1, kept for the tail call */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ /* if (sk) bpf_sk_release() */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1), /* if (r1 == 0) skip the release */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ /* bpf_tail_call() */
+ BPF_MOV64_IMM(BPF_REG_3, 3), /* r3 = 3 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map; the 0 fd is presumably patched via .fixup_prog1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), /* r1 = r7 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 18 }, /* presumably the index of BPF_LD_MAP_FD once BPF_SK_LOOKUP is expanded */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: leak possible reference over tail call", /* tail call issued before the NULL check and release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* r7 = r1, kept for the tail call */
+ /* Look up socket and store in REG_6 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ /* bpf_tail_call() (the first insn below is the store into REG_6) */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 3), /* r3 = 3 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map; the 0 fd is presumably patched via .fixup_prog1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), /* r1 = r7 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ /* if (sk) bpf_sk_release() */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 16 }, /* presumably the index of BPF_LD_MAP_FD once BPF_SK_LOOKUP is expanded */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "tail_call would lead to reference leak",
+ .result = REJECT,
+},
+{
+ "reference tracking: leak checked reference over tail call", /* tail call after the NULL check but before the release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_1), /* r7 = r1, kept for the tail call */
+ /* Look up socket and store in REG_6 */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ /* if (!sk) goto end */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7),
+ /* bpf_tail_call() */
+ BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map; the 0 fd is presumably patched via .fixup_prog1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), /* r1 = r7 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6, the saved lookup result */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 17 }, /* presumably the index of BPF_LD_MAP_FD once BPF_SK_LOOKUP is expanded */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "tail_call would lead to reference leak",
+ .result = REJECT,
+},
+{
+ "reference tracking: mangle and release sock_or_null", /* arithmetic on the pointer before the NULL check */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5), /* r1 += 5, done before the NULL check */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "R1 pointer arithmetic on sock_or_null prohibited",
+ .result = REJECT,
+},
+{
+ "reference tracking: mangle and release sock", /* arithmetic on the pointer after the NULL check */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) skip the add and the release */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 5), /* r1 += 5, on the non-NULL path */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "R1 pointer arithmetic on sock prohibited",
+ .result = REJECT,
+},
+{
+ "reference tracking: access member", /* 4-byte read through the checked pointer, then release */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* if (r0 == 0) jump to the exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4), /* r2 = *(u32 *)(r0 + 4) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: write to member", /* store through the checked socket pointer */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* if (r0 == 0) jump past the store and the release */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_LD_IMM64(BPF_REG_2, 42), /* r2 = 42 */
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_2,
+ offsetof(struct bpf_sock, mark)), /* 32-bit store of r2 to the mark field via r1 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_LD_IMM64(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "cannot write into sock",
+ .result = REJECT,
+},
+{
+ "reference tracking: invalid 64-bit access of member", /* 8-byte read at offset 0 of the socket */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* if (r0 == 0) jump to the exit */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), /* r2 = *(u64 *)(r0 + 0): the access named in .errstr */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "invalid sock access off=0 size=8",
+ .result = REJECT,
+},
+{
+ "reference tracking: access after release", /* r1 is read after the sk_release() call */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* if (r0 == 0) jump to the exit */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0), /* r2 = *(u32 *)(r1 + 0), using r1 after the call */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "!read_ok",
+ .result = REJECT,
+},
+{
+ "reference tracking: direct access for lookup", /* lookup call spelled out with r2 pointing at skb->data */
+ .insns = {
+ /* Check that the packet is at least 64B long */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct __sk_buff, data)), /* r2 = skb->data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)), /* r3 = skb->data_end */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 64), /* r0 = r2 + 64 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 9), /* too short: jump to the exit */
+ /* sk = sk_lookup_tcp(ctx, skb->data, ...) */
+ BPF_MOV64_IMM(BPF_REG_3, sizeof(struct bpf_sock_tuple)),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_5, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* if (r0 == 0) jump to the exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_0, 4), /* r2 = *(u32 *)(r0 + 4) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = r6 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "reference tracking: use ptr from bpf_tcp_sock() after release", /* tcp_sock() result is read after the socket was released */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if (r0 != 0) skip the early exit */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = lookup result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock), /* r0 = tcp_sock(r1) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* if (r0 != 0) skip the release-and-exit below */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = tcp_sock() result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* release the looked-up socket held in r6 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, offsetof(struct bpf_tcp_sock, snd_cwnd)), /* read via r7 after the release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid mem access",
+},
+{
+ "reference tracking: use ptr from bpf_sk_fullsock() after release", /* sk_fullsock() result is read after the socket was released */
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* if (r0 != 0) skip the early exit */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = lookup result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), /* r0 = sk_fullsock(r1) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* if (r0 != 0) skip the release-and-exit below */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = sk_fullsock() result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* release the looked-up socket held in r6 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, offsetof(struct bpf_sock, type)), /* read via r7 after the release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid mem access",
+},
+{ /* Chain tcp_sock() -> sk_fullsock(), release the original sk, then read via the chained pointer. */
+ "reference tracking: use ptr from bpf_sk_fullsock(tp) after release",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL tp: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* pass tp to sk_fullsock() */
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = sk, argument for the release below */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 now holds the sk_fullsock(tp) result */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), /* NULL check happens only after the release */
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct bpf_sock, type)),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT, /* the read through r6 after the release is the expected failure */
+ .errstr = "invalid mem access",
+},
+{ /* Release through the bpf_tcp_sock() pointer, then read through the original sk. */
+ "reference tracking: use sk after bpf_sk_release(tp)",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL tp: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = tp (not sk) */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct bpf_sock, type)), /* read via sk after the release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT, /* the read through r6 after the release is the expected failure */
+ .errstr = "invalid mem access",
+},
+{ /* Read through a bpf_get_listener_sock() result after releasing the looked-up sk. */
+ "reference tracking: use ptr from bpf_get_listener_sock() after bpf_sk_release(sk)",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_get_listener_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL listener: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), /* r1 = sk, argument for the release below */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = listener socket */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct bpf_sock, src_port)), /* read via listener after sk release */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT, /* unlike the tcp_sock/fullsock variants, this read is expected to pass */
+},
+{ /* Call bpf_sk_release() on the bpf_get_listener_sock() result instead of the looked-up sk. */
+ "reference tracking: bpf_sk_release(listen_sk)",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_get_listener_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL listener: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = listener socket */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* the release the errstr below refers to */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct bpf_sock, type)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* proper release of the looked-up sk */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "reference has not been acquired before",
+},
+{
+ /* !bpf_sk_fullsock(sk) is checked but !bpf_tcp_sock(sk) is not checked */
+ "reference tracking: tp->snd_cwnd after bpf_sk_fullsock(sk) and bpf_tcp_sock(sk)",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = sk_fullsock(sk) result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* r8 = tcp_sock(sk) result */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0, 3), /* only r7 is NULL-checked */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_8, offsetof(struct bpf_tcp_sock, snd_cwnd)), /* read via unchecked r8 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid mem access",
+},
+{ /* Compare the lookup result against 0 twice; release only on the non-NULL path. */
+ "reference tracking: branch tracking valid pointer null comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), /* r3 stays 1 when r6 != 0 */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), /* r6 == 0: jump past the release to exit */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{ /* Compare the lookup result against a non-zero constant and skip the release on a match. */
+ "reference tracking: branch tracking valid pointer value comparison",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 4), /* r6 == 0: jump to exit */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 1234, 2), /* r6 == 1234: jumps to exit without releasing */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .errstr = "Unreleased reference",
+ .result = REJECT,
+},
+{ /* Release the reference through the pointer returned by bpf_skc_to_tcp_sock(). */
+ "reference tracking: bpf_sk_release(btf_tcp_sock)",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL result: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* r1 = skc_to_tcp_sock() result */
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .result_unpriv = REJECT, /* unprivileged run is expected to fail with the message below */
+ .errstr_unpriv = "unknown func",
+},
+{ /* Dereference a bpf_skc_to_tcp_sock() result after the looked-up sk was released. */
+ "reference tracking: use ptr from bpf_skc_to_tcp_sock() after release",
+ .insns = {
+ BPF_SK_LOOKUP(sk_lookup_tcp), /* macro defined outside this chunk; presumably leaves sk or NULL in r0 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(), /* r0 == 0: nothing to release */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = sk */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3), /* non-NULL result: skip the release-and-exit path */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = skc_to_tcp_sock() result */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release), /* release sk ... */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), /* ... then read one byte through r7 */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid mem access",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "unknown func",
+},
+{ /* Store a ringbuf record pointer into a map value after the record was discarded. */
+ "reference tracking: try to leak released ptr reg",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* 4-byte key 0 at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), /* r9 = map value pointer */
+
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_EMIT_CALL(BPF_FUNC_ringbuf_reserve),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* r8 = reserved ringbuf record */
+
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_ringbuf_discard), /* record is given back here */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+
+ BPF_STX_MEM(BPF_DW, BPF_REG_9, BPF_REG_8, 0), /* write the stale r8 into the map value */
+ BPF_EXIT_INSN()
+ },
+ .fixup_map_array_48b = { 4 }, /* insn index of the first BPF_LD_MAP_FD */
+ .fixup_map_ringbuf = { 11 }, /* second BPF_LD_MAP_FD, assuming the first occupies two insn slots */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R8 !read_ok"
+},
diff --git a/tools/testing/selftests/bpf/verifier/regalloc.c b/tools/testing/selftests/bpf/verifier/regalloc.c
new file mode 100644
index 000000000..4ad7e05de
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/regalloc.c
@@ -0,0 +1,269 @@
+{ /* r0 and r2 hold the same random value; each is bounds-checked on one side only. */
+ "regalloc basic",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4), /* upper bound tested on r0 only */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), /* lower bound tested on r2 only */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 20 + 20 = 40, 8-byte read ends at 48 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Same shape as "regalloc basic" but with a bound of 24, so the access can reach offset 48. */
+ "regalloc negative",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4), /* upper bound 24 tested on r0 only */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3), /* lower bound tested on r2 only */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), /* max offset 24 + 24 = 48: one byte past the value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=48 off=48 size=1",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Lower bound on r2 comes from a register-register compare where r2 is the source operand. */
+ "regalloc src_reg mark",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5), /* upper bound tested on r0 only */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), /* exit when 0 >= r2, i.e. r2 <= 0 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 20 + 20 = 40, 8-byte read ends at 48 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Same shape as "regalloc src_reg mark" but with a bound of 22, so an 8-byte read can start at 44. */
+ "regalloc src_reg negative",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5), /* upper bound 22 tested on r0 only */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3), /* exit when 0 >= r2, i.e. r2 <= 0 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 22 + 22 = 44, 8-byte read would end at 52 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=48 off=44 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Bound learned on r0 must survive a spill of its copy r2 to the stack and a fill into r3. */
+ "regalloc and spill",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7),
+ /* r0 has upper bound that should propagate into r2 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), /* exit when 0 >= r3 */
+ /* r3 has lower and upper bounds */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 20, 8-byte read ends at 28 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Same shape as "regalloc and spill" but with a bound of 48, so the read can start past the value. */
+ "regalloc and spill negative",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7),
+ /* r0 has upper bound that should propagate into r2 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
+ BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2), /* exit when 0 >= r3 */
+ /* r3 has lower and upper bounds */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 48: outside the 48-byte value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = REJECT,
+ .errstr = "invalid access to map value, value_size=48 off=48 size=8",
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Three registers (r0, r2, r4) carry the same value; the checks touch only r0 and r2. */
+ "regalloc three regs",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), /* NULL lookup: jump to exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 is a copy of r0 */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), /* r4 is a copy of r2 */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5), /* upper bound tested on r0 only */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), /* lower bound tested on r2 only */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4), /* r4 was never compared directly */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 3 * 12 = 36, 8-byte read ends at 44 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* Copies of the random value live in r8/r9 across a call before being bounds-checked. */
+ "regalloc after call",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10), /* NULL lookup: jump to the first exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), /* r8 and r9 both copy r0 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6), /* presumably a bpf-to-bpf call (src_reg 1) to the 2-insn tail below */
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4), /* upper bound tested on r8 only */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3), /* lower bound tested on r9 only */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* max offset 20 + 20 = 40, 8-byte read ends at 48 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* call target: returns 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* The bounds checks and map access happen inside the called code, on arguments r1/r2/r3. */
+ "regalloc in callee",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* NULL lookup: jump to the caller's exit */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = map value pointer */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r1 and r2 both copy r0 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_7), /* r3 = map value pointer */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1), /* presumably a bpf-to-bpf call (src_reg 1) to the code after the next exit */
+ BPF_EXIT_INSN(),
+ BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5), /* upper bound tested on r1 only */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4), /* lower bound tested on r2 only */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), /* max offset 20 + 20 = 40, 8-byte read ends at 48 */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 0), /* out-of-range path returns 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* A spilled map-value pointer must not be confused with a scalar compared against 20. */
+ "regalloc, spill, JEQ",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), /* spill r0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0), /* offset 0: both outcomes continue at the next insn */
+ /* The verifier will walk the rest twice with r0 == 0 and r0 == map_value */
+ BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 20, 0), /* offset 0 again: only splits verifier state */
+ /* The verifier will walk the rest two more times with r0 == 20 and r0 == unknown */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 with map_value */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0, 1), /* skip ldx if map_value == NULL */
+ /* Buggy verifier will think that r3 == 20 here */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), /* read from map_value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 4 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/runtime_jit.c b/tools/testing/selftests/bpf/verifier/runtime_jit.c
new file mode 100644
index 000000000..94c399d1f
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/runtime_jit.c
@@ -0,0 +1,231 @@
+{ /* Tail call with index 0. */
+ "runtime/jit: tail_call within bounds, prog once",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 42, /* not the fall-through 1; presumably set by the tail-called program (not visible here) */
+},
+{ /* Tail call with index 1. */
+ "runtime/jit: tail_call within bounds, prog loop",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 1), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 41, /* not the fall-through 1; presumably set by the tail-called program (not visible here) */
+},
+{ /* Tail call with index 3; the expected result is the program's own fall-through value. */
+ "runtime/jit: tail_call within bounds, no prog",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 3), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 1, /* matches the fall-through r0 above */
+},
+{ /* Tail call with index 2. */
+ "runtime/jit: tail_call within bounds, key 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 2), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 24, /* not the fall-through 1; presumably set by the tail-called program (not visible here) */
+},
+{ /* Two paths reach one tail_call, both with index 2; cb[0] == 13 takes the jump. */
+ "runtime/jit: tail_call within bounds, key 2 / key 2, first branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 13),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* taken here; presumably lands on the second index/map setup */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5, 9 }, /* both map loads, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result = ACCEPT,
+ .retval = 24, /* same expected value as the single-path "key 2" test */
+},
+{ /* Two paths reach one tail_call, both with index 2; cb[0] == 14 falls through the jump. */
+ "runtime/jit: tail_call within bounds, key 2 / key 2, second branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 14),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* not taken here (14 != 13) */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5, 9 }, /* both map loads, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result = ACCEPT,
+ .retval = 24, /* same expected value as the single-path "key 2" test */
+},
+{ /* Two paths reach one tail_call with different indices (0 and 2); cb[0] == 13 takes the jump. */
+ "runtime/jit: tail_call within bounds, key 0 / key 2, first branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 13),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* taken here; presumably lands on the index-2 setup */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5, 9 }, /* both map loads, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result = ACCEPT,
+ .retval = 24, /* same expected value as the single-path "key 2" test */
+},
+{ /* Two paths reach one tail_call with different indices (0 and 2); cb[0] == 14 falls through. */
+ "runtime/jit: tail_call within bounds, key 0 / key 2, second branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 14),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* not taken here (14 != 13) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 2),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5, 9 }, /* both map loads, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result = ACCEPT,
+ .retval = 42, /* same expected value as the single-path index-0 test */
+},
+{ /* Two paths reach one tail_call with index 0 but with maps fixed up differently (prog1 vs prog2). */
+ "runtime/jit: tail_call within bounds, different maps, first branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 13),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* taken here; presumably lands on the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5 }, /* first map load */
+ .fixup_prog2 = { 9 }, /* second map load, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "tail_call abusing map_ptr",
+ .result = ACCEPT,
+ .retval = 1, /* matches the fall-through r0 above */
+},
+{ /* Same program as the "first branch" variant, but cb[0] == 14 so the jump is not taken. */
+ "runtime/jit: tail_call within bounds, different maps, second branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 14),
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_0,
+ offsetof(struct __sk_buff, cb[0])), /* store the selector byte in cb[0] */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, cb[0])), /* read it back */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 13, 4), /* not taken here (14 != 13) */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 3), /* skip the second setup */
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 1), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 5 }, /* first map load */
+ .fixup_prog2 = { 9 }, /* second map load, assuming BPF_LD_MAP_FD occupies two insn slots */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "tail_call abusing map_ptr",
+ .result = ACCEPT,
+ .retval = 42, /* same expected value as the single-path index-0 test on the prog1 map */
+},
+{ /* Tail call with index 256; the expected result is the program's own fall-through value. */
+ "runtime/jit: tail_call out of bounds",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 256), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 2), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 2, /* matches the fall-through r0 above */
+},
+{ /* Tail call with index -1; the expected result is the program's own fall-through value. */
+ "runtime/jit: pass negative index to tail_call",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, -1), /* r3 = index passed to tail_call */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 2), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 2, /* matches the fall-through r0 above */
+},
+{ /* Tail call with a 64-bit index whose low 32 bits are zero. */
+ "runtime/jit: pass > 32bit index to tail_call",
+ .insns = {
+ BPF_LD_IMM64(BPF_REG_3, 0x100000000ULL), /* r3 = 1 << 32 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 2), /* fall-through return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 2 }, /* map load index, assuming BPF_LD_IMM64 occupies two insn slots */
+ .result = ACCEPT,
+ .retval = 42, /* same expected value as the index-0 test */
+ /* Verifier rewrite for unpriv skips tail call here. */
+ .retval_unpriv = 2,
+},
diff --git a/tools/testing/selftests/bpf/verifier/scale.c b/tools/testing/selftests/bpf/verifier/scale.c
new file mode 100644
index 000000000..7f868d480
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/scale.c
@@ -0,0 +1,18 @@
+{ /* No inline instructions: the program body is produced by the fill helper. */
+ "scale: scale test 1",
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_scale, /* defined outside this chunk */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 1, /* NOTE(review): presumably also selects the variant inside bpf_fill_scale; confirm there */
+},
+{ /* No inline instructions: the program body is produced by the fill helper. */
+ "scale: scale test 2",
+ .insns = { },
+ .data = { },
+ .fill_helper = bpf_fill_scale, /* defined outside this chunk */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .retval = 2, /* NOTE(review): presumably also selects the variant inside bpf_fill_scale; confirm there */
+},
diff --git a/tools/testing/selftests/bpf/verifier/search_pruning.c b/tools/testing/selftests/bpf/verifier/search_pruning.c
new file mode 100644
index 000000000..7e36078f8
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/search_pruning.c
@@ -0,0 +1,192 @@
+{ /* Two paths meet before exit: one with r0 loaded from the map value, one with r0 = frame pointer. */
+ "pointer/scalar confusion in state equality check (way 1)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* NULL lookup: go to the r0 = r10 path */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* r0 = 8 bytes read from the map value */
+ BPF_JMP_A(1), /* skip the r0 = r10 assignment */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), /* r0 = frame pointer */
+ BPF_JMP_A(0), /* both paths join here */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = POINTER_VALUE,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 leaks addr as return value"
+},
+{ /* Same two r0 states as "way 1" with the branch sense inverted, so the paths are laid out in the other order. */
+ "pointer/scalar confusion in state equality check (way 2)",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), /* non-NULL lookup: go to the map-value load */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), /* r0 = frame pointer */
+ BPF_JMP_A(1), /* skip the load, go to exit */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* r0 = 8 bytes read from the map value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = POINTER_VALUE,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 leaks addr as return value"
+},
+{ /* r0 is written only on the not-taken side of each branch, so a path reaches exit with r0 unset. */
+ "liveness pruning and write screening",
+ .insns = {
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* branch conditions teach us nothing about R2 */
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1), /* taken: skips the r0 write below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_2, 0, 1), /* taken: skips the r0 write below */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{ /* Index loaded from the map value is checked with a signed compare only, then used as an offset. */
+ "varlen_map_value_access pruning",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* 8-byte key 0 at fp-8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* NULL lookup: jump to exit */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* r1 = value read from the map */
+ BPF_MOV32_IMM(BPF_REG_2, MAX_ENTRIES),
+ BPF_JMP_REG(BPF_JSGT, BPF_REG_2, BPF_REG_1, 1), /* MAX_ENTRIES s> r1: keep r1 as is */
+ BPF_MOV32_IMM(BPF_REG_1, 0), /* otherwise reset r1 to 0 */
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_1, 2), /* r1 *= 4 (32-bit) */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* offset 0: both paths join at the next insn */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of BPF_LD_MAP_FD */
+ .errstr_unpriv = "R0 leaks addr",
+ .errstr = "R0 unbounded memory access",
+ .result_unpriv = REJECT,
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* r4 is 0 or 1 depending on the map value; only the non-zero path reaches the invalid store via r6. */
+ "search pruning: all branches should be verified (nop operation)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* 8-byte key 0 at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), /* NULL lookup: jump to exit */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0xbeef, 2), /* value == 0xbeef: take the r4 = 1 path */
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_A(1),
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -16), /* spill r4 (0 or 1) to fp-16 */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16), /* fill r5 from fp-16 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_5, 0, 2), /* r5 == 0: skip the store, go to exit */
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xdead), /* store through scalar r6 = 0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of BPF_LD_MAP_FD */
+ .errstr = "R6 invalid mem access 'inv'",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* One path writes fp-16, the other writes fp-24; both then read fp-16. */
+ "search pruning: all branches should be verified (invalid stack access)",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* 8-byte key 0 at fp-8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* NULL lookup: jump to exit */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0xbeef, 2), /* value == 0xbeef: take the fp-24 path */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -16),
+ BPF_JMP_A(1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -24), /* fp-16 is left unwritten on this path */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16), /* read fp-16 on both paths */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of BPF_LD_MAP_FD */
+ .errstr = "invalid read from stack off -16+0 size 8",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{ /* One path spills/fills through the stack and the other does not; both then run the same no-op jumps. */
+ "allocated_stack",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_7, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: skip all stack traffic */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), /* spill r6 to fp-8 ... */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, -8), /* ... and fill it back */
+ BPF_STX_MEM(BPF_B, BPF_REG_10, BPF_REG_7, -9), /* spill one byte of r7 to fp-9 ... */
+ BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_10, -9), /* ... and fill it back */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0), /* offset-0 jumps: no control-flow effect */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = ACCEPT,
+ .insn_processed = 15, /* NOTE(review): presumably the expected verifier insn count; checked by the harness outside this chunk */
+},
+/* The test performs a conditional 64-bit write to the stack location
+ * fp[-8], followed by an unconditional 8-bit write to fp[-8]; data is
+ * then read from fp[-8]. This sequence is unsafe.
+ *
+ * The test would be mistakenly marked as safe w/o dst register parent
+ * preservation in verifier.c:copy_register_state() function.
+ *
+ * Note the usage of BPF_F_TEST_STATE_FREQ to force creation of the
+ * checkpoint state after conditional 64-bit assignment.
+ */
+{ /* Conditional 8-byte stack write, unconditional 1-byte write, then an 8-byte read of the same slot. */
+ "write tracking and register parent chain bug",
+ .insns = {
+ /* r6 = ktime_get_ns() */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ /* r0 = ktime_get_ns() */
+ BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns),
+ /* if r0 > r6 goto +1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_6, 1), /* taken: the 64-bit store below is skipped */
+ /* *(u64 *)(r10 - 8) = 0xdeadbeef */
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -8, 0xdeadbeef),
+ /* r1 = 42 */
+ BPF_MOV64_IMM(BPF_REG_1, 42),
+ /* *(u8 *)(r10 - 8) = r1 */
+ BPF_STX_MEM(BPF_B, BPF_REG_FP, BPF_REG_1, -8), /* only one byte of the slot is written on every path */
+ /* r2 = *(u64 *)(r10 - 8) */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8),
+ /* exit(0) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .flags = BPF_F_TEST_STATE_FREQ,
+ .errstr = "invalid read from stack off -8+1 size 8",
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c
new file mode 100644
index 000000000..8c224eac9
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/sock.c
@@ -0,0 +1,733 @@
+{
+ "skb->sk: no NULL check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid mem access 'sock_common_or_null'",
+},
+{
+ "skb->sk: sk->family [non fullsock field]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_sock, family)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "skb->sk: sk->type [fullsock field]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_sock, type)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid sock_common access",
+},
+{
+ "bpf_sk_fullsock(skb->sk): no !skb->sk check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "type=sock_common_or_null expected=sock_common",
+},
+{
+ "sk_fullsock(skb->sk): no NULL check on ret",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid mem access 'sock_or_null'",
+},
+{
+ "sk_fullsock(skb->sk): sk->type [fullsock field]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->family [non fullsock field]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, family)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->state [narrow load]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, state)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_port [word load] (backward compatibility)",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_port [half load]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_port [half load] (invalid)",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid sock access",
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_port [byte load]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port)),
+ BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_port [byte load] (invalid)",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid sock access",
+},
+{
+ "sk_fullsock(skb->sk): past sk->dst_port [half load] (invalid)",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, dst_port)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid sock access",
+},
+{
+ "sk_fullsock(skb->sk): sk->dst_ip6 [load 2nd byte]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_ip6[0]) + 1),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->type [narrow load]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): sk->protocol [narrow load]",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, protocol)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "sk_fullsock(skb->sk): beyond last field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, rx_queue_mapping)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid sock access",
+},
+{
+ "bpf_tcp_sock(skb->sk): no !skb->sk check",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "type=sock_common_or_null expected=sock_common",
+},
+{
+ "bpf_tcp_sock(skb->sk): no NULL check on ret",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid mem access 'tcp_sock_or_null'",
+},
+{
+ "bpf_tcp_sock(skb->sk): tp->snd_cwnd",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_tcp_sock(skb->sk): tp->bytes_acked",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, bytes_acked)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_tcp_sock(skb->sk): beyond last field",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_tcp_sock, bytes_acked)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = REJECT,
+ .errstr = "invalid tcp_sock access",
+},
+{
+ "bpf_tcp_sock(bpf_sk_fullsock(skb->sk)): tp->snd_cwnd",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_tcp_sock, snd_cwnd)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_release(skb->sk)",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "reference has not been acquired before",
+},
+{
+ "bpf_sk_release(bpf_sk_fullsock(skb->sk))",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "reference has not been acquired before",
+},
+{
+ "bpf_sk_release(bpf_tcp_sock(skb->sk))",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_tcp_sock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "reference has not been acquired before",
+},
+{
+ "sk_storage_get(map, skb->sk, NULL, 0): value == NULL",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_sk_storage_map = { 11 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "sk_storage_get(map, skb->sk, 1, 1): value == 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ BPF_MOV64_IMM(BPF_REG_3, 1),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_sk_storage_map = { 11 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "R3 type=inv expected=fp",
+},
+{
+ "sk_storage_get(map, skb->sk, &stack_value, 1): stack_value",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_4, 1),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_sk_storage_map = { 14 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
+{
+ "sk_storage_get(map, skb->sk, &stack_value, 1): partially init stack_value",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), /* 32-bit store at fp[-8]; the preceding "stack_value" test uses BPF_DW here */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), /* r1 = skb->sk */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), /* if r1 != 0 skip the early exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_EMIT_CALL(BPF_FUNC_sk_fullsock),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), /* if r0 != 0 skip the early exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_4, 1), /* r4 (4th argument) = 1 */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -8), /* r3 (3rd argument) = fp - 8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0), /* r2 (2nd argument) = bpf_sk_fullsock() result */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* r1 (1st argument) = map; fd placeholder is 0 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_storage_get),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_sk_storage_map = { 14 }, /* 14 is the index of the BPF_LD_MAP_FD insn above */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid indirect read from stack",
+},
+{
+ "bpf_map_lookup_elem(smap, &key)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_sk_storage_map = { 3 },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "cannot pass map_type 24 into func bpf_map_lookup_elem",
+},
+{
+ "bpf_map_lookup_elem(xskmap, &key); xs->queue_id",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_xdp_sock, queue_id)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_xskmap = { 3 },
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .result = ACCEPT,
+},
+{
+ "bpf_map_lookup_elem(sockmap, &key)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = REJECT,
+ .errstr = "Unreleased reference id=2 alloc_insn=5",
+},
+{
+ "bpf_map_lookup_elem(sockhash, &key)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = REJECT,
+ .errstr = "Unreleased reference id=2 alloc_insn=5",
+},
+{
+ "bpf_map_lookup_elem(sockmap, &key); sk->type [fullsock field]; bpf_sk_release(sk)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_map_lookup_elem(sockhash, &key); sk->type [fullsock field]; bpf_sk_release(sk)",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, type)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 3 },
+ .prog_type = BPF_PROG_TYPE_SK_SKB,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, reuseport_array, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_reuseport_array = { 4 },
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, sockmap, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4),
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockmap = { 4 },
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},
+{
+ "bpf_sk_select_reuseport(ctx, sockhash, &key, flags)",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_4, 0), /* r4 = flags = 0 */
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* 32-bit key = 0 stored at fp[-4] */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -4), /* r3 = &key */
+ BPF_LD_MAP_FD(BPF_REG_2, 0), /* r2 = map; fd placeholder is 0 */
+ BPF_EMIT_CALL(BPF_FUNC_sk_select_reuseport), /* r1 is untouched since program entry */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_sockhash = { 4 }, /* insn 4 is the BPF_LD_MAP_FD; use a sockhash as the test name states */
+ .prog_type = BPF_PROG_TYPE_SK_REUSEPORT,
+ .result = ACCEPT,
+},
+{
+ "mark null check on return value of bpf_skc_to helpers",
+ .insns = {
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), /* r1 = skb->sk */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), /* if r1 != 0 skip the early exit */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* keep sk in r6 for the second call */
+ BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_sock),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = first helper result; never compared against 0 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_EMIT_CALL(BPF_FUNC_skc_to_tcp_request_sock),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* r8 = second helper result */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_8, 0, 2), /* only r8 is checked against 0 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0), /* load through the unchecked r7; the test expects REJECT */
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = REJECT,
+ .errstr = "invalid mem access",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "unknown func",
+},
diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c
new file mode 100644
index 000000000..0b943897a
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/spill_fill.c
@@ -0,0 +1,106 @@
+{
+ "check valid spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ /* fill it back into R2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+ /* the load R0 = *(R2 + 8) is left disabled on the next line */
+ /* BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), /* instead, return the filled value itself in R0 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R0 leaks addr",
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .retval = POINTER_VALUE,
+},
+{
+ "check valid spill/fill, skb mark",
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = ACCEPT,
+},
+{
+ "check valid spill/fill, ptr to mem",
+ .insns = {
+ /* reserve 8 byte ringbuf memory */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_2, 8),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve),
+ /* store a pointer to the reserved memory in R6 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ /* check whether the reservation was successful */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
+ /* spill R6(mem) into the stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8),
+ /* fill it back in R7 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8),
+ /* should be able to access *(R7) = 0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0),
+ /* submit the reserved ringbuf memory */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_ringbuf = { 1 },
+ .result = ACCEPT,
+ .result_unpriv = ACCEPT,
+},
+{
+ "check corrupted spill/fill",
+ .insns = {
+ /* spill R1(ctx) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ /* overwrite one byte (fp[-7]) inside the spilled R1 pointer */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -7, 0x23),
+ /* The 64-bit fill back into R0 is fine for privileged programs;
+ * R0 now becomes SCALAR_VALUE.
+ */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ /* Load from R0 should fail. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 8),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "attempt to corrupt spilled",
+ .errstr = "R0 invalid mem access 'inv",
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "check corrupted spill/fill, LSB",
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ BPF_ST_MEM(BPF_H, BPF_REG_10, -8, 0xcafe),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "attempt to corrupt spilled",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = POINTER_VALUE,
+},
+{
+ "check corrupted spill/fill, MSB",
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0x12345678),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "attempt to corrupt spilled",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = POINTER_VALUE,
+},
diff --git a/tools/testing/selftests/bpf/verifier/spin_lock.c b/tools/testing/selftests/bpf/verifier/spin_lock.c
new file mode 100644
index 000000000..781621fac
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/spin_lock.c
@@ -0,0 +1,333 @@
+{
+ "spin_lock: test1 success",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* same R6 + 4 address for the unlock */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0), /* read at R6 + 0, not at the R6 + 4 address given to the helpers */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test2 direct ld/st",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* read through R1 = R6 + 4, the very address handed to the lock helper */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "cannot be accessed directly",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test3 direct ld/st",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 1), /* 4-byte read at R6 + 1 spans offsets 1..4, reaching offset 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "cannot be accessed directly",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, /* the read above is at an odd offset */
+},
+{
+ "spin_lock: test4 direct ld/st",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_6, 3), /* 2-byte read at R6 + 3 spans offsets 3..4, reaching offset 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "cannot be accessed directly",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, /* the read above is at an odd offset */
+},
+{
+ "spin_lock: test5 call within a locked region",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32), /* helper call placed between lock and unlock */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "calls are not allowed",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test6 missing unlock",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* when R0 != 0, jump over the unlock call below */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "unlock is missing",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test7 unlock without lock",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* when R1 != 0, jump over the lock call below */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock), /* reached on both paths, including the one that skipped the lock */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "without taking a lock",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test8 double lock",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock), /* second lock call on the same R6 + 4 address, with no unlock in between */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "calls are not allowed", /* same message as test5 */
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test9 different lock",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* first lookup result kept in R6 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 again for the second lookup */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* second lookup result kept in R7 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock), /* lock taken through R6 + 4 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock), /* unlock issued through R7 + 4 instead */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3, 11 }, /* indexes of the two BPF_LD_MAP_FD when each is counted as two insns */
+ .result = REJECT,
+ .errstr = "unlock of different lock",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test10 lock in subprog without unlock",
+ .insns = {
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* keep the lookup result in R6 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 5), /* src_reg 1, imm 5: presumably a relative call to the subprog 5 insns ahead - confirm */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock), /* insns after the first program exit: lock, then exit with no unlock */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 3 }, /* 3 is the index of BPF_LD_MAP_FD when it is counted as two insns */
+ .result = REJECT,
+ .errstr = "unlock is missing",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "spin_lock: test11 ld_abs under lock",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* copy the entry value of R1 into R6 before R1 is reused */
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0), /* zero the 4 bytes at fp-4 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* R2 = fp-4 */
+ BPF_LD_MAP_FD(BPF_REG_1,
+ 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* skip the exit when the lookup returned non-zero */
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* lookup result kept in R7 here (R6 is already in use) */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4), /* R1 = lookup result + 4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_lock),
+ BPF_LD_ABS(BPF_B, 0), /* LD_ABS placed between lock and unlock */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 4),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_spin_unlock),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_spin_lock = { 4 }, /* 4 rather than 3: one extra insn precedes BPF_LD_MAP_FD in this test */
+ .result = REJECT,
+ .errstr = "inside bpf_spin_lock",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/stack_ptr.c b/tools/testing/selftests/bpf/verifier/stack_ptr.c
new file mode 100644
index 000000000..8ab94d65f
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/stack_ptr.c
@@ -0,0 +1,359 @@
+{
+ "PTR_TO_STACK store/load",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -10), /* R1 = fp - 10 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 2, 0xfaceb00c), /* store at fp - 10 + 2 = fp - 8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 2), /* read the same slot back into R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0xfaceb00c,
+},
+{
+ "PTR_TO_STACK store/load - bad alignment on off",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* R1 = fp - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 2, 0xfaceb00c), /* 8-byte store at fp - 8 + 2 = fp - 6 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 2),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "misaligned stack access off (0x0; 0x0)+-8+2 size 8",
+},
+{
+ "PTR_TO_STACK store/load - bad alignment on reg",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -10), /* R1 = fp - 10 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), /* 8-byte store at fp - 10 + 8 = fp - 2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "misaligned stack access off (0x0; 0x0)+-10+8 size 8",
+},
+{
+ "PTR_TO_STACK store/load - out of bounds low",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -80000), /* R1 = fp - 80000 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), /* store at fp - 80000 + 8 = fp - 79992 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid write to stack R1 off=-79992 size=8",
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+},
+{
+ "PTR_TO_STACK store/load - out of bounds high",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* R1 = fp - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_1, 8, 0xfaceb00c), /* store at fp - 8 + 8 = fp + 0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 8),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid write to stack R1 off=0 size=8",
+},
+{
+ "PTR_TO_STACK check high 1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -1), /* R1 = fp - 1 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* 1-byte store at fp - 1 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0), /* read it back as the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK check high 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* R1 = fp, no arithmetic on it */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), /* fp - 1 reached through the insn offset only */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK check high 3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), /* explicit add of 0: R1 = fp + 0 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, -1, 42), /* 1-byte store at fp - 1 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, -1),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .result_unpriv = REJECT, /* unlike "check high 2", which has no unpriv expectation */
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK check high 4",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), /* R1 = fp + 0 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* 1-byte store at fp + 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "invalid write to stack R1 off=0 size=1",
+ .result = REJECT,
+},
+{
+ "PTR_TO_STACK check high 5",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), /* R1 = fp + (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* store far above fp */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "invalid write to stack R1",
+},
+{
+ "PTR_TO_STACK check high 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), /* R1 = fp + (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42), /* plus an insn offset of SHRT_MAX on top */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "invalid write to stack",
+},
+{
+ "PTR_TO_STACK check high 7",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, (1 << 29) - 1), /* second add: R1 = fp + 2 * (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MAX, 42),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MAX),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "fp pointer offset", /* different message from "check high 6", which adds only once */
+},
+{
+ "PTR_TO_STACK check low 1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -512), /* R1 = fp - 512 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* 1-byte store at fp - 512 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK check low 2",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), /* R1 = fp - 513 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 1, 42), /* store at fp - 513 + 1 = fp - 512 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK check low 3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -513), /* R1 = fp - 513 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* 1-byte store at fp - 513 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "invalid write to stack R1 off=-513 size=1",
+ .result = REJECT,
+},
+{
+ "PTR_TO_STACK check low 4",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, INT_MIN), /* add the smallest int immediate to fp */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "math between fp pointer",
+},
+{
+ "PTR_TO_STACK check low 5",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), /* R1 = fp - (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* store far below fp */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "invalid write to stack",
+},
+{
+ "PTR_TO_STACK check low 6",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), /* R1 = fp - (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42), /* plus an insn offset of SHRT_MIN on top */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid write to stack",
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+},
+{
+ "PTR_TO_STACK check low 7",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -((1 << 29) - 1)), /* second add: R1 = fp - 2 * (2^29 - 1) */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, SHRT_MIN, 42),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, SHRT_MIN),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range",
+ .errstr = "fp pointer offset", /* different message from "check low 6", which adds only once */
+},
+{
+ "PTR_TO_STACK mixed reg/k, 1",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), /* immediate add: R1 = fp - 3 */
+ BPF_MOV64_IMM(BPF_REG_2, -3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), /* register add: R1 = fp - 6 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK mixed reg/k, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* zero fp-8..fp-1 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0), /* zero fp-16..fp-9 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), /* immediate add: R1 = fp - 3 */
+ BPF_MOV64_IMM(BPF_REG_2, -3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), /* register add: R1 = fp - 6 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42), /* write 42 at fp - 6 through R1 */
+ BPF_MOV64_REG(BPF_REG_5, BPF_REG_10),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_5, -6), /* read fp - 6 through a fresh copy of fp */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "PTR_TO_STACK mixed reg/k, 3",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -3), /* immediate add: R1 = fp - 3 */
+ BPF_MOV64_IMM(BPF_REG_2, -3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), /* register add: R1 = fp - 6 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), /* return R2, which still holds -3 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = -3,
+},
+{
+ "PTR_TO_STACK reg",
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_MOV64_IMM(BPF_REG_2, -3),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2), /* register add only: R1 = fp - 3 */
+ BPF_ST_MEM(BPF_B, BPF_REG_1, 0, 42),
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 42,
+},
+{
+ "stack pointer arithmetic",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 0), /* jump with offset 0, i.e. to the next insn */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -10), /* R7 = fp - 20 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_1), /* R2 = fp - 20 + 4 = fp - 16 */
+ BPF_ST_MEM(0, BPF_REG_2, 4, 0), /* store at fp - 12; size given as literal 0 - presumably BPF_W, confirm */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, 8), /* R2 = fp - 20 + 8 = fp - 12 */
+ BPF_ST_MEM(0, BPF_REG_2, 4, 0), /* store at fp - 8; same literal-0 size field */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "store PTR_TO_STACK in R10 to array map using BPF_B",
+ .insns = {
+ /* Load pointer to map. */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = fp - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero the 8 bytes R2 points at */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), /* non-zero result: skip the "return 2" pair below */
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = lookup result, used as the store target */
+ /* Copy R10 to R9. */
+ BPF_MOV64_REG(BPF_REG_9, BPF_REG_10),
+ /* Pollute other registers with unaligned values. */
+ BPF_MOV64_IMM(BPF_REG_2, -1),
+ BPF_MOV64_IMM(BPF_REG_3, -1),
+ BPF_MOV64_IMM(BPF_REG_4, -1),
+ BPF_MOV64_IMM(BPF_REG_5, -1),
+ BPF_MOV64_IMM(BPF_REG_6, -1),
+ BPF_MOV64_IMM(BPF_REG_7, -1),
+ BPF_MOV64_IMM(BPF_REG_8, -1),
+ /* Store both R9 and R10 with BPF_B and read back. */
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_10, 0), /* byte store of R10 ... */
+ BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_1, 0), /* ... read back into R2 */
+ BPF_STX_MEM(BPF_B, BPF_REG_1, BPF_REG_9, 0), /* byte store of R9 (copy of R10) ... */
+ BPF_LDX_MEM(BPF_B, BPF_REG_3, BPF_REG_1, 0), /* ... read back into R3 */
+ /* Should read back as same value. */
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_2, BPF_REG_3, 2), /* equal: skip the "return 1" pair, return 42 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* 3 is the index of BPF_LD_MAP_FD above */
+ .result = ACCEPT,
+ .retval = 42,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/subreg.c b/tools/testing/selftests/bpf/verifier/subreg.c
new file mode 100644
index 000000000..4c4133c80
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/subreg.c
@@ -0,0 +1,533 @@
+/* This file contains sub-register zero extension checks for insns defining
+ * sub-registers, meaning:
+ * - All insns under BPF_ALU class. Their BPF_ALU32 variants or narrow width
+ * forms (BPF_END) could define sub-registers.
+ * - Narrow direct loads, BPF_B/H/W | BPF_LDX.
+ * - BPF_LD is not exposed to JIT back-ends, so no need for testing.
+ *
+ * "get_prandom_u32" is used to initialize the low 32 bits of some registers so
+ * that neither the verifier nor a JIT back-end can fold the register back into
+ * a constant, which they could otherwise do when range info shows that the
+ * register holds a constant.
+ */
+{
+ "add32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result */
+ BPF_LD_IMM64(BPF_REG_0, 0x100000000ULL), /* R0 = bit 32 set, low 32 bits zero */
+ BPF_ALU32_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "add32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ /* An insn could have no effect on the low 32-bit, for example:
+ * a = a + 0
+ * a = a | 0
+ * a = a & -1
+ * But, they should still zero high 32-bit.
+ */
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_0, -2), /* second round with a different immediate */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "sub32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result */
+ BPF_LD_IMM64(BPF_REG_0, 0x1ffffffffULL), /* R0 = bit 32 set, low 32 bits all ones */
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "sub32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_SUB, BPF_REG_0, 0), /* first round: subtract 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_SUB, BPF_REG_0, 1), /* second round: subtract 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "mul32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result */
+ BPF_LD_IMM64(BPF_REG_0, 0x100000001ULL), /* R0 = bit 32 set, low 32 bits = 1 */
+ BPF_ALU32_REG(BPF_MUL, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "mul32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, 1), /* first round: multiply by 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_MUL, BPF_REG_0, -1), /* second round: multiply by -1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "div32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result, used as divisor */
+ BPF_MOV64_IMM(BPF_REG_0, -1), /* R0 = -1 via a 64-bit move */
+ BPF_ALU32_REG(BPF_DIV, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "div32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, 1), /* first round: divide by 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_DIV, BPF_REG_0, 2), /* second round: divide by 2 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "or32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result */
+ BPF_LD_IMM64(BPF_REG_0, 0x100000001ULL), /* R0 = bit 32 set, low 32 bits = 1 */
+ BPF_ALU32_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "or32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_0, 0), /* first round: OR with 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_OR, BPF_REG_0, 1), /* second round: OR with 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "and32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000ULL), /* bit 32 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_0), /* R1 = bit 32 | helper result */
+ BPF_LD_IMM64(BPF_REG_0, 0x1ffffffffULL), /* R0 = bit 32 set, low 32 bits all ones */
+ BPF_ALU32_REG(BPF_AND, BPF_REG_0, BPF_REG_1), /* 32-bit op under test; both operands have bit 32 set */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "and32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_0, -1), /* first round: AND with -1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_0, -2), /* second round: AND with -2 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "lsh32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000ULL), /* bit 32 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* shift count 1 in R1 */
+ BPF_ALU32_REG(BPF_LSH, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "lsh32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_0, 0), /* first round: shift by 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_LSH, BPF_REG_0, 1), /* second round: shift by 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "rsh32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* shift count 1 in R1 */
+ BPF_ALU32_REG(BPF_RSH, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "rsh32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, 0), /* first round: shift by 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, 1), /* second round: shift by 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "neg32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0), /* NEG emitted through the IMM macro with immediate 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "mod32 reg zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = helper result, used as divisor */
+ BPF_MOV64_IMM(BPF_REG_0, -1), /* R0 = -1 via a 64-bit move */
+ BPF_ALU32_REG(BPF_MOD, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "mod32 imm zero extend check",
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* bit 36 set, low 32 bits zero */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, 1), /* first round: modulo 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only the upper 32 bits of R0 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* save the first result in R6 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_MOD, BPF_REG_0, 2), /* second round: modulo 2 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results into the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "xor32 reg zero extend check", /* 32-bit XOR with a random register; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_0), /* R1 = random operand */
+ BPF_LD_IMM64(BPF_REG_0, 0x100000000ULL), /* 1ULL << 32: only bit 32 of R0 is set */
+ BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "xor32 imm zero extend check", /* 32-bit XOR with immediate 1; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_XOR, BPF_REG_0, 1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "mov32 reg zero extend check", /* 32-bit register move where source and destination both have bit 32 set; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x100000000ULL), /* 1ULL << 32 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_1, BPF_REG_0), /* source R1 = random value with bit 32 set */
+ BPF_LD_IMM64(BPF_REG_0, 0x100000000ULL), /* destination R0 also starts with bit 32 set */
+ BPF_MOV32_REG(BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "mov32 imm zero extend check", /* 32-bit move of immediate 0 and of 1; bits 63:32 of both results are OR-ed into the return value */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_MOV32_IMM(BPF_REG_0, 0), /* 32-bit op under test, immediate 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* stash the first result */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_MOV32_IMM(BPF_REG_0, 1), /* 32-bit op under test, immediate 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* any upper bit surviving either 32-bit op would make this non-zero */
+},
+{
+ "arsh32 reg zero extend check", /* 32-bit ARSH by a register holding 1; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_MOV64_IMM(BPF_REG_1, 1), /* R1 is reused as the shift count */
+ BPF_ALU32_REG(BPF_ARSH, BPF_REG_0, BPF_REG_1), /* 32-bit op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "arsh32 imm zero extend check", /* 32-bit ARSH by immediate 0 and by 1; bits 63:32 of both results are OR-ed into the return value */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 0), /* 32-bit op under test, shift count 0 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* stash the first result */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1),
+ BPF_ALU32_IMM(BPF_ARSH, BPF_REG_0, 1), /* 32-bit op under test, shift count 1 */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* combine both results */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* any upper bit surviving either 32-bit op would make this non-zero */
+},
+{
+ "end16 (to_le) reg zero extend check", /* 16-bit to-little-endian conversion of a register with random upper and lower halves; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), /* R6 = first random value moved into bits 63:32 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* R0 = second random value with the first in its upper half */
+ BPF_ENDIAN(BPF_TO_LE, BPF_REG_0, 16), /* op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "end32 (to_le) reg zero extend check", /* 32-bit to-little-endian conversion of a register with random upper and lower halves; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), /* R6 = first random value moved into bits 63:32 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* R0 = second random value with the first in its upper half */
+ BPF_ENDIAN(BPF_TO_LE, BPF_REG_0, 32), /* op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "end16 (to_be) reg zero extend check", /* 16-bit to-big-endian conversion of a register with random upper and lower halves; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), /* R6 = first random value moved into bits 63:32 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* R0 = second random value with the first in its upper half */
+ BPF_ENDIAN(BPF_TO_BE, BPF_REG_0, 16), /* op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "end32 (to_be) reg zero extend check", /* 32-bit to-big-endian conversion of a register with random upper and lower halves; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_6, 32), /* R6 = first random value moved into bits 63:32 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_6), /* R0 = second random value with the first in its upper half */
+ BPF_ENDIAN(BPF_TO_BE, BPF_REG_0, 32), /* op under test */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared */
+},
+{
+ "ldx_b zero extend check", /* 1-byte load into a register whose upper half is non-zero; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -4), /* R6 = R10 - 4: a 4-byte stack slot */
+ BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0xfaceb00c), /* fill the slot with a known word */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_6, 0), /* load under test: byte-sized */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared by the load */
+},
+{
+ "ldx_h zero extend check", /* 2-byte load into a register whose upper half is non-zero; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -4), /* R6 = R10 - 4: a 4-byte stack slot */
+ BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0xfaceb00c), /* fill the slot with a known word */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_6, 0), /* load under test: half-word-sized */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared by the load */
+},
+{
+ "ldx_w zero extend check", /* 4-byte load into a register whose upper half is non-zero; bits 63:32 of the result are returned */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -4), /* R6 = R10 - 4: a 4-byte stack slot */
+ BPF_ST_MEM(BPF_W, BPF_REG_6, 0, 0xfaceb00c), /* fill the slot with a known word */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_prandom_u32),
+ BPF_LD_IMM64(BPF_REG_1, 0x1000000000ULL), /* 1ULL << 36 */
+ BPF_ALU64_REG(BPF_OR, BPF_REG_0, BPF_REG_1), /* make the upper half of R0 non-zero */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0), /* load under test: word-sized */
+ BPF_ALU64_IMM(BPF_RSH, BPF_REG_0, 32), /* keep only what was in bits 63:32 */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = 0, /* upper half must have been cleared by the load */
+},
diff --git a/tools/testing/selftests/bpf/verifier/uninit.c b/tools/testing/selftests/bpf/verifier/uninit.c
new file mode 100644
index 000000000..987a5871f
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/uninit.c
@@ -0,0 +1,39 @@
+{
+ "read uninitialized register", /* the program reads R2 without ever writing it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), /* R2 is the source and was never set by this program */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R2 !read_ok", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "read invalid register", /* the source register operand is not a valid register number */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, -1), /* source given as -1; presumably it lands in a 4-bit field as 15, matching the message below (confirm against the insn encoding) */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R15 is invalid", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "program doesn't init R0 before exit", /* exits without ever writing R0 */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_2, BPF_REG_1), /* only R2 is written; R0 stays untouched */
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "program doesn't init R0 before exit in all branches", /* R0 is written on the fall-through path only */
+ .insns = {
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, 0, 2), /* taken branch skips both R0 writes and lands on the exit */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R0 !read_ok", /* expected verifier message */
+ .errstr_unpriv = "R1 pointer comparison", /* a different message is expected for the unpriv run */
+ .result = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c
new file mode 100644
index 000000000..9dfb68c8c
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/unpriv.c
@@ -0,0 +1,538 @@
+{
+ "unpriv: return pointer", /* returns the value of R10 in R0 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_10), /* R0 = R10, which then becomes the return value */
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT, /* the unpriv run is expected to fail with the message below */
+ .errstr_unpriv = "R0 leaks addr",
+ .retval = POINTER_VALUE,
+},
+{
+ "unpriv: add const to pointer", /* adds a constant to the incoming R1; expected to be accepted */
+ .insns = {
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* R1 += 8 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "unpriv: add pointer to pointer", /* adds R10 to the incoming R1 */
+ .insns = {
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_10), /* R1 += R10 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R1 pointer += pointer", /* expected verifier message */
+},
+{
+ "unpriv: neg pointer", /* negates the incoming R1 */
+ .insns = {
+ BPF_ALU64_IMM(BPF_NEG, BPF_REG_1, 0), /* R1 = -R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT, /* the unpriv run is expected to fail with the message below */
+ .errstr_unpriv = "R1 pointer arithmetic",
+},
+{
+ "unpriv: cmp pointer with const", /* compares the incoming R1 against 0 */
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), /* jump offset 0: both outcomes continue at the next insn */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT, /* the unpriv run is expected to fail with the message below */
+ .errstr_unpriv = "R1 pointer comparison",
+},
+{
+ "unpriv: cmp pointer with pointer", /* compares the incoming R1 against R10 */
+ .insns = {
+ BPF_JMP_REG(BPF_JEQ, BPF_REG_1, BPF_REG_10, 0), /* jump offset 0: both outcomes continue at the next insn */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT, /* the unpriv run is expected to fail with the message below */
+ .errstr_unpriv = "R10 pointer comparison",
+},
+{
+ "unpriv: check that printk is disallowed", /* calls the trace_printk helper from a tracepoint program */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* zero the 8 bytes at R10 - 8 */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* R1 = address of that stack slot */
+ BPF_MOV64_IMM(BPF_REG_2, 8), /* R2 = 8, the size of the slot */
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_1), /* R3 = same stack address */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_trace_printk),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "unknown func bpf_trace_printk#6", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_TRACEPOINT,
+},
+{
+ "unpriv: pass pointer to helper function", /* passes a stack address in R2, R3 and R4 to map_update_elem */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* zero the 8 bytes at R10 - 8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = address of that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_2), /* R3 = same stack address */
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_2), /* R4 = same stack address; the unpriv message below names R4 */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R4 leaks addr",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: indirectly pass pointer on stack to helper function", /* a stack slot holding a stored R10 is handed to map_lookup_elem via R2 */
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), /* store R10 itself at R10 - 8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = address of the slot that now holds a pointer */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "invalid indirect read from stack R2 off -8+0 size 8", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: mangle pointer on stack 1", /* overwrites 4 bytes at the start of a slot that holds a stored pointer */
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), /* store R10 (8 bytes) at R10 - 8 */
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -8, 0), /* 4-byte store of 0 at the same offset */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "attempt to corrupt spilled", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: mangle pointer on stack 2", /* overwrites the last byte of a slot that holds a stored pointer */
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), /* store R10 (8 bytes) at R10 - 8 */
+ BPF_ST_MEM(BPF_B, BPF_REG_10, -1, 0), /* 1-byte store of 0 at R10 - 1, inside that slot */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "attempt to corrupt spilled", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: read pointer from stack in small chunks", /* reads back only 4 bytes of an 8-byte stored pointer */
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_10, -8), /* store R10 (8 bytes) at R10 - 8 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), /* 4-byte load from the same slot */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid size", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "unpriv: write pointer into ctx", /* stores R1 through R1 itself; rejected in both runs, with different messages */
+ .insns = {
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0), /* *(u64 *)(R1 + 0) = R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 leaks addr", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .errstr = "invalid bpf_context access", /* expected message otherwise */
+ .result = REJECT,
+},
+{
+ "unpriv: spill/fill of ctx", /* stores R1 to the stack and loads it straight back */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill R1 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* fill R1 from the same slot */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+},
+{
+ "unpriv: spill/fill of ctx 2", /* spills and refills R1, then passes it to the get_hash_recalc helper */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill R1 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* fill R1 from the same slot */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), /* helper receives the refilled R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of ctx 3", /* the spilled R1 is overwritten with R10 before being refilled and passed to a helper */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill R1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, 0), /* overwrite the slot with R10 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* R1 now receives the R10 value */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R1 type=fp expected=ctx", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of ctx 4", /* an XADD modifies the slot holding the spilled R1 before it is refilled and passed to a helper */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* spill R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_10, BPF_REG_0, -8, 0), /* XADD of R0 (1) at R10 - 8, i.e. onto the spilled value */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* refill R1 from the modified slot */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "R1 type=inv expected=ctx", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers stx", /* one slot receives either a stack address or R1 depending on the branch; a single store then writes through the loaded value */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_3, 42), /* value written by the final store */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the shared slot */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), /* R1 == 0: skip the three insns that store the stack address */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16), /* R2 = R10 - 16 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0), /* slot = stack address */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* R1 != 0: skip the store of R1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* slot = R1 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* R1 = whichever value the slot holds */
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+ offsetof(struct __sk_buff, mark)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "same insn cannot be used with different pointers", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers stx - ctx and sock", /* one slot receives either the socket lookup result or the saved R1; the loaded value is written through and then passed to sk_release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the incoming R1 (skb) in R8 across the lookup */
+ /* struct bpf_sock *sock = bpf_sock_lookup(...); */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ /* u64 foo; */
+ /* void *target = &foo; */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ /* if (skb != NULL) *target = sock; (the JEQ skips the store when skb == 0) */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+ /* if (skb == NULL) *target = skb; (the JNE skips the store when skb != 0) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ /* struct __sk_buff *skb = *target; */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+ /* skb->mark = 42; */
+ BPF_MOV64_IMM(BPF_REG_3, 42),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+ offsetof(struct __sk_buff, mark)),
+ /* if (sk) bpf_sk_release(sk) -- R1 here is the value loaded from *target */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "type=ctx expected=sock", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers stx - leak sock", /* same slot-sharing sequence as the "ctx and sock" case, but the program exits without calling sk_release */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the incoming R1 (skb) in R8 across the lookup */
+ /* struct bpf_sock *sock = bpf_sock_lookup(...); */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ /* u64 foo; */
+ /* void *target = &foo; */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ /* if (skb != NULL) *target = sock; (the JEQ skips the store when skb == 0) */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+ /* if (skb == NULL) *target = skb; (the JNE skips the store when skb != 0) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ /* struct __sk_buff *skb = *target; */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+ /* skb->mark = 42; */
+ BPF_MOV64_IMM(BPF_REG_3, 42),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+ offsetof(struct __sk_buff, mark)),
+ BPF_EXIT_INSN(), /* exit with no sk_release call on any path */
+ },
+ .result = REJECT,
+ //.errstr = "same insn cannot be used with different pointers",
+ .errstr = "Unreleased reference", /* expected verifier message; the alternative above is left commented out */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers stx - sock and ctx (read)", /* one slot receives either the saved R1 or the socket lookup result; a single load then reads through the loaded value */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the incoming R1 (skb) in R8 across the lookup */
+ /* struct bpf_sock *sock = bpf_sock_lookup(...); */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ /* u64 foo; */
+ /* void *target = &foo; */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ /* if (skb) *target = skb */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ /* else *target = sock */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+ /* struct bpf_sock *sk = *target; */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+ /* if (sk) u32 foo = sk->mark; bpf_sk_release(sk); */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 2), /* sk == 0: skip both the load and the release */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct bpf_sock, mark)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "same insn cannot be used with different pointers", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers stx - sock and ctx (write)", /* one slot receives either the saved R1 or the socket lookup result; a single store then writes through the loaded value */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the incoming R1 (skb) in R8 across the lookup */
+ /* struct bpf_sock *sock = bpf_sock_lookup(...); */
+ BPF_SK_LOOKUP(sk_lookup_tcp),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ /* u64 foo; */
+ /* void *target = &foo; */
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ /* if (skb) *target = skb */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+ /* else *target = sock */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0),
+ /* struct bpf_sock *sk = *target; */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0),
+ /* if (sk) sk->mark = 42; bpf_sk_release(sk); */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), /* sk == 0: skip the move, the store and the release */
+ BPF_MOV64_IMM(BPF_REG_3, 42),
+ BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_3,
+ offsetof(struct bpf_sock, mark)),
+ BPF_EMIT_CALL(BPF_FUNC_sk_release),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ //.errstr = "same insn cannot be used with different pointers",
+ .errstr = "cannot write into sock", /* expected verifier message; the alternative above is left commented out */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "unpriv: spill/fill of different pointers ldx", /* one slot receives either a stack address or R1 depending on the branch; a single load then reads through the loaded value */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the shared slot */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), /* R1 == 0: skip the three insns that store the stack address */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2,
+ -(__s32)offsetof(struct bpf_perf_event_data,
+ sample_period) - 8),
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_2, 0), /* slot = stack address */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1), /* R1 != 0: skip the store of R1 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), /* slot = R1 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), /* R1 = whichever value the slot holds */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1,
+ offsetof(struct bpf_perf_event_data, sample_period)),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "same insn cannot be used with different pointers", /* expected verifier message */
+ .prog_type = BPF_PROG_TYPE_PERF_EVENT,
+},
+{
+ "unpriv: write pointer into map elem value", /* stores the map_lookup_elem result into the location it points to */
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* zero the 8 bytes at R10 - 8 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = address of that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* R0 == 0: skip the store */
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0), /* *(u64 *)(R0 + 0) = R0 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "alu32: mov u32 const", /* R7 is built from constants with 32-bit ops; a load through R7 sits behind a branch on R0 == 0 */
+ .insns = {
+ BPF_MOV32_IMM(BPF_REG_7, 0), /* R7 = 0 */
+ BPF_ALU32_IMM(BPF_AND, BPF_REG_7, 1), /* R7 &= 1 */
+ BPF_MOV32_REG(BPF_REG_0, BPF_REG_7), /* R0 = R7 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* R0 == 0: skip the load below */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0), /* load through R7, which holds a number rather than a pointer */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R7 invalid mem access 'inv'", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .retval = 0,
+},
+{
+ "unpriv: partial copy of pointer", /* copies R10 with a 32-bit move */
+ .insns = {
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_10), /* 32-bit move of R10 into R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R10 partial copy", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: pass pointer to tail_call", /* passes the incoming R1 to the tail_call helper in R3 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_3, BPF_REG_1), /* R3 = incoming R1 */
+ BPF_LD_MAP_FD(BPF_REG_2, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_tail_call),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_prog1 = { 1 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R3 leaks addr into helper", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: cmp map pointer with zero", /* compares a register loaded via BPF_LD_MAP_FD against 0 */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* overwrites the 0 just placed in R1 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 0), /* jump offset 0: both outcomes continue at the next insn */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R1 pointer comparison", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: write into frame pointer", /* uses R10 as the destination of a move */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_10, BPF_REG_1), /* R10 = R1 */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "frame pointer is read only", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "unpriv: spill/fill frame pointer", /* stores R10 to the stack and loads it back into R10 */
+ .insns = {
+ BPF_ALU64_REG(BPF_MOV, BPF_REG_6, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), /* R6 = R10 - 8: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, 0), /* spill R10 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, 0), /* load with R10 as the destination */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "frame pointer is read only", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "unpriv: cmp of frame pointer", /* compares R10 against 0 */
+ .insns = {
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_10, 0, 0), /* jump offset 0: both outcomes continue at the next insn */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R10 pointer comparison", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: adding of fp, reg", /* forms a stack address as constant-register + R10, then stores through it */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_1, 0),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_10), /* R1 = 0 + R10 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), /* store R0 at R1 - 8 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: adding of fp, imm", /* copies R10, adds immediate 0 to the copy, then stores through it */
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 0), /* R1 = R10 + 0 */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, -8), /* store R0 at R1 - 8 */
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R1 stack pointer arithmetic goes out of range", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "unpriv: cmp of stack pointer", /* compares an address derived from R10 against 0 */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_2, 0, 0), /* jump offset 0: both outcomes continue at the next insn */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr_unpriv = "R2 pointer comparison", /* expected message for the unpriv run */
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value.c b/tools/testing/selftests/bpf/verifier/value.c
new file mode 100644
index 000000000..0e42592b1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/value.c
@@ -0,0 +1,104 @@
+{
+ "map element value store of cleared call register", /* uses R1 as a store source after the helper call without rewriting it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* R0 == 0: skip the store */
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, 0), /* R1 was last written before the call */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R1 !read_ok",
+ .errstr = "R1 !read_ok", /* same message expected in both runs */
+ .result = REJECT,
+ .result_unpriv = REJECT,
+},
+{
+ "map element value with unaligned store", /* 8-byte stores at odd byte offsets from a map_lookup_elem result, directly and via copies of the pointer */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 17), /* R0 == 0: skip all 17 insns below and land on the exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3), /* R0 = value + 3 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 2, 43),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, -2, 44),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* R8 = copy of the adjusted pointer */
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, 0, 32),
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, 2, 33),
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, -2, 34),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_8, 5), /* R8 = value + 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, 0, 22),
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, 4, 23),
+ BPF_ST_MEM(BPF_DW, BPF_REG_8, -7, 24),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_8), /* R7 = copy of R8 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, 3), /* R7 = value + 11 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 22),
+ BPF_ST_MEM(BPF_DW, BPF_REG_7, 4, 23),
+ BPF_ST_MEM(BPF_DW, BPF_REG_7, -4, 24),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "map element value with unaligned load", /* 8-byte loads at odd byte offsets from a map_lookup_elem result, directly and via a copy of the pointer */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), /* R0 == 0: skip all 11 insns below and land on the exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* R1 = first 32-bit word of the value */
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, MAX_ENTRIES, 9), /* R1 >= MAX_ENTRIES: skip the remaining 9 insns to the exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 3), /* R0 = value + 3 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 2),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_0), /* R8 = copy of the adjusted pointer */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_8, 2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 5), /* R0 = value + 8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_0, 4),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "map element value is preserved across register spilling", /* an offset-adjusted lookup result is stored to the stack, loaded back into R3 and stored through */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* R0 == 0: skip all 7 insns below and land on the exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, offsetof(struct test_val, foo)), /* R0 = address of the foo member */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), /* store through the pointer before spilling */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), /* R1 = R10 - 184: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* spill R0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), /* fill into R3 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), /* store through the refilled pointer */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_adj_spill.c b/tools/testing/selftests/bpf/verifier/value_adj_spill.c
new file mode 100644
index 000000000..7135e8021
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/value_adj_spill.c
@@ -0,0 +1,43 @@
+{
+ "map element value is preserved across register spilling", /* an unadjusted lookup result is stored to the stack, loaded back into R3 and stored through */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* R0 == 0: skip all 6 insns below and land on the exit */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 42), /* store through the pointer before spilling */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -184), /* R1 = R10 - 184: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* spill R0 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), /* fill into R3 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), /* store through the refilled pointer */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+},
+{
+ "map element value or null is marked on register spilling", /* the lookup result is spilled before the NULL check on R0; the refilled copy is then stored through */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -152), /* R1 = R10 - 152: the spill slot */
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), /* spill R0 while it has not been NULL-checked yet */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* R0 == 0: skip the fill and the store */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_1, 0), /* fill into R3 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_3, 0, 42), /* store through the refilled pointer */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 leaks addr", /* expected message for the unpriv run */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c
new file mode 100644
index 000000000..ed1c2cea1
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c
@@ -0,0 +1,95 @@
+{
+ "map element value illegal alu op, 1", /* 64-bit AND applied to a map_lookup_elem result */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* R0 == 0: skip to the exit */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_0, 8), /* op under test */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr = "R0 bitwise operator &= on pointer", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "map element value illegal alu op, 2", /* 32-bit ADD applied to a map_lookup_elem result */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* R0 == 0: skip to the exit */
+ BPF_ALU32_IMM(BPF_ADD, BPF_REG_0, 0), /* op under test: 32-bit add, even of 0 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr = "R0 32-bit pointer arithmetic prohibited", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "map element value illegal alu op, 3", /* 64-bit DIV applied to a map_lookup_elem result */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* R0 == 0: skip to the exit */
+ BPF_ALU64_IMM(BPF_DIV, BPF_REG_0, 42), /* op under test */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr = "R0 pointer arithmetic with /= operator", /* expected verifier message */
+ .result = REJECT,
+},
+{
+ "map element value illegal alu op, 4", /* 64-bit byte swap applied to a map_lookup_elem result, followed by a store through it */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* R0 == 0: skip to the exit */
+ BPF_ENDIAN(BPF_FROM_BE, BPF_REG_0, 64), /* op under test */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "R0 pointer arithmetic prohibited", /* expected message for the unpriv run */
+ .errstr = "invalid mem access 'inv'", /* expected message otherwise */
+ .result = REJECT,
+ .result_unpriv = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "map element value illegal alu op, 5", /* the lookup result is spilled, modified in place on the stack with XADD, refilled and stored through */
+ .insns = {
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), /* zero that stack slot */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* R0 == 0: skip all 7 insns below and land on the exit */
+ BPF_MOV64_IMM(BPF_REG_3, 4096), /* addend for the XADD */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* R2 = R10 - 8 again (R2 is rebuilt after the call) */
+ BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), /* spill R0 */
+ BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0), /* XADD of 4096 onto the spilled value */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0), /* refill R0 from the modified slot */
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), /* store through the refilled value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* insn index of the BPF_LD_MAP_FD above */
+ .errstr_unpriv = "leaking pointer from stack off -8", /* expected message for the unpriv run */
+ .errstr = "R0 invalid mem access 'inv'", /* expected message otherwise */
+ .result = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_or_null.c b/tools/testing/selftests/bpf/verifier/value_or_null.c
new file mode 100644
index 000000000..3ecb70a3d
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/value_or_null.c
@@ -0,0 +1,171 @@
+{ /* Expect ACCEPT: r4 copies r0; the NULL check is on r0, the store goes through r4. */
+ "multiple registers share map_lookup_elem result",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = r0, copied before the NULL check */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect REJECT: r4 gets -2 then +2 applied before the lookup result is NULL-checked. */
+ "alu ops on ptr_to_map_value_or_null, 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = unchecked lookup result */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -2), /* arithmetic before the NULL check */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, 2), /* net offset is zero */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .errstr = "R4 pointer arithmetic on map_value_or_null",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect REJECT: r4 is ANDed with -1 before the lookup result is NULL-checked. */
+ "alu ops on ptr_to_map_value_or_null, 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = unchecked lookup result */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_4, -1), /* AND with all-ones, before the NULL check */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .errstr = "R4 pointer arithmetic on map_value_or_null",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect REJECT: r4 is left-shifted before the lookup result is NULL-checked. */
+ "alu ops on ptr_to_map_value_or_null, 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = unchecked lookup result */
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_4, 1), /* shift left by 1, before the NULL check */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .errstr = "R4 pointer arithmetic on map_value_or_null",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect REJECT: r4 holds the 1st lookup result but is used after a 2nd helper call. */
+ "invalid memory access with multiple map_lookup_elem calls",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the call arguments in r8/r7 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = result of the 1st lookup */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8), /* restore the arguments for the 2nd call */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* checks only the 2nd result */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0), /* uses r4 from before the 2nd call */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R4 !read_ok",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect ACCEPT: r4 is assigned after the (conditionally skipped) 2nd lookup. */
+ "valid indirect map_lookup_elem access with 2nd lookup in branch",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_1), /* keep the call arguments in r8/r7 */
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_2),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_2, 10),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0, 3), /* r2 is 10: jumps over the 2nd lookup */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_8),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_4, BPF_REG_0), /* r4 = most recent lookup result */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), /* r0 == 0: skip the store */
+ BPF_ST_MEM(BPF_DW, BPF_REG_4, 0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS
+},
+{ /* Expect REJECT: on the JGE-taken path r1 has no upper bound when added to r0. */
+ "invalid map access from else condition",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: skip to exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = first 32-bit word of the value */
+ BPF_JMP_IMM(BPF_JGE, BPF_REG_1, MAX_ENTRIES-1, 1), /* r1 >= MAX_ENTRIES-1: skip the increment */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),
+ BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 2), /* reached from both branches */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, offsetof(struct test_val, foo)),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .errstr = "R0 unbounded memory access",
+ .result = REJECT,
+ .errstr_unpriv = "R0 leaks addr",
+ .result_unpriv = REJECT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Expect ACCEPT even though the program contains a write to r10 after two r6 tests. */
+ "map lookup and null branch prediction",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_1, 10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8), /* *(u64 *)(r10 - 8) = 10 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 2), /* r6 == 0: jump to exit */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_6, 0, 1), /* r6 != 0 on this path: jump to exit */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_10, 10), /* only reached if both jumps fall through */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 4 }, /* index of the BPF_LD_MAP_FD insn */
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+},
diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
new file mode 100644
index 000000000..d8765a4d5
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -0,0 +1,911 @@
+{ /* Expect ACCEPT: r1 is one of two maps picked by skb->len; scalar 4 += lookup result. */
+ "map access: known scalar += value_ptr from different maps",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)), /* r0 = skb->len */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), /* len == 1: go to the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 5 (occupies two slots) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), /* len != 1: jump over the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 8 (occupies two slots) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 5 }, /* first BPF_LD_MAP_FD */
+ .fixup_map_array_48b = { 8 }, /* second BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* ACCEPT privileged, REJECT unprivileged: r0 -= 4 then r0 += 4 on either map's value. */
+ "map access: value_ptr -= known scalar from different maps",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)), /* r0 = skb->len */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), /* len == 1: go to the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 5 (occupies two slots) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), /* len != 1: jump over the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 8 (occupies two slots) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* offset -4 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_16b = { 5 }, /* first BPF_LD_MAP_FD */
+ .fixup_map_array_48b = { 8 }, /* second BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 min value is outside of the allowed memory range",
+ .retval = 1,
+},
+{ /* Expect ACCEPT: same program as the hash_16b variant, but both fixups are *_48b maps. */
+ "map access: known scalar += value_ptr from different maps, but same value properties",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1,
+ offsetof(struct __sk_buff, len)), /* r0 = skb->len */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 3), /* len == 1: go to the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 5 (occupies two slots) */
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 1, 2), /* len != 1: jump over the 2nd map load */
+ BPF_LD_MAP_FD(BPF_REG_1, 0), /* insn 8 (occupies two slots) */
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_48b = { 5 }, /* first BPF_LD_MAP_FD */
+ .fixup_map_array_48b = { 8 }, /* second BPF_LD_MAP_FD */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* At the shared ADD, r2 is a map value pointer on branch A and a scalar on branch B. */
+ "map access: mixing value pointer and scalar, 1",
+ .insns = {
+ // load map value pointer into r0 and r2
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_ARG1, 0),
+ BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* non-NULL: skip the early exit */
+ BPF_EXIT_INSN(),
+ // load some number from the map into r1
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ // depending on r1, branch:
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 3),
+ // branch A
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ BPF_JMP_A(2),
+ // branch B
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0x100000),
+ // common instruction
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+ // depending on r1, branch:
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ // branch A
+ BPF_JMP_A(4),
+ // branch B
+ BPF_MOV64_IMM(BPF_REG_0, 0x13371337),
+ // verifier follows fall-through
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0x100000, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ // fake-dead code; targeted from branch A to
+ // prevent dead code sanitization
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R2 pointer comparison prohibited",
+ .retval = 0,
+},
+{ /* Like test 1 with the branches swapped: scalar on branch A, pointer on branch B. */
+ "map access: mixing value pointer and scalar, 2",
+ .insns = {
+ // load map value pointer into r0 and r2
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_ARG1, 0),
+ BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* non-NULL: skip the early exit */
+ BPF_EXIT_INSN(),
+ // load some number from the map into r1
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ // depending on r1, branch:
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3),
+ // branch A
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0x100000),
+ BPF_JMP_A(2),
+ // branch B
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV64_IMM(BPF_REG_3, 0),
+ // common instruction
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3),
+ // depending on r1, branch:
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 1),
+ // branch A
+ BPF_JMP_A(4),
+ // branch B
+ BPF_MOV64_IMM(BPF_REG_0, 0x13371337),
+ // verifier follows fall-through
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_2, 0x100000, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ // fake-dead code; targeted from branch A to
+ // prevent dead code sanitization, rejected
+ // via branch B however
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 invalid mem access 'inv'",
+ .retval = 0,
+},
+{ /* Expect ACCEPT: one ADD is reached with two different pairs of known scalars. */
+ "sanitation: alu with different scalars 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_ARG1, 0),
+ BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG2, -16),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), /* non-NULL: skip the early exit */
+ BPF_EXIT_INSN(),
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = first 32-bit word of the value */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 3), /* r1 == 0: take the 42/0x100001 pair */
+ BPF_MOV64_IMM(BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_3, 0x100000),
+ BPF_JMP_A(2), /* skip the other pair */
+ BPF_MOV64_IMM(BPF_REG_2, 42),
+ BPF_MOV64_IMM(BPF_REG_3, 0x100001),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_3), /* shared by both paths */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0x100000, /* 0 + 0x100000, i.e. the r1 != 0 path */
+},
+{ /* Expect ACCEPT: adds the return values of two map_delete_elem calls. */
+ "sanitation: alu with different scalars 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), /* keep the map argument for the 2nd call */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_ST_MEM(BPF_DW, BPF_REG_FP, -16, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_delete_elem),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = 1st return value */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_FP),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -16),
+ BPF_EMIT_CALL(BPF_FUNC_map_delete_elem),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = 2nd return value */
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_7), /* r8 = r6 + r7 */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_8),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 1 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = -EINVAL * 2, /* presumably each delete returns -EINVAL here - TODO confirm */
+},
+{ /* Expect ACCEPT: map-free variant; adds two registers that each hold EINVAL * -1. */
+ "sanitation: alu with different scalars 3",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, EINVAL),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, -1),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_0), /* r7 = EINVAL * -1 */
+ BPF_MOV64_IMM(BPF_REG_0, EINVAL),
+ BPF_ALU64_IMM(BPF_MUL, BPF_REG_0, -1),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), /* r6 = EINVAL * -1 */
+ BPF_MOV64_REG(BPF_REG_8, BPF_REG_6),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_8, BPF_REG_7), /* r8 = r6 + r7 */
+ BPF_MOV64_REG(BPF_REG_0, BPF_REG_8),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .retval = -EINVAL * 2,
+},
+{ /* r0 += 48 then r0 -= 48 before the load: ACCEPT privileged, REJECT unprivileged. */
+ "map access: value_ptr += known scalar, upper oob arith, test 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 48), /* presumably the value size, per the fixup name */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 48 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .retval = 1,
+},
+{ /* r0 += 49 then r0 -= 49 before the load: ACCEPT privileged, REJECT unprivileged. */
+ "map access: value_ptr += known scalar, upper oob arith, test 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 49),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 49 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .retval = 1,
+},
+{ /* r0 += 47 then r0 -= 47 before the load: expect ACCEPT (no unpriv override given). */
+ "map access: value_ptr += known scalar, upper oob arith, test 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 47),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 47 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect REJECT: r0 += 47 then r0 -= 48 leaves offset -1 at the load. */
+ "map access: value_ptr -= known scalar, lower oob arith, test 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 47),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 47 */
+ BPF_MOV64_IMM(BPF_REG_1, 48),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* offset -1 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R0 min value is outside of the allowed memory range",
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+},
+{ /* r0 goes +47, -48, +1 (offset 0 at the load): ACCEPT privileged, REJECT unprivileged. */
+ "map access: value_ptr -= known scalar, lower oob arith, test 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 47),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 47 */
+ BPF_MOV64_IMM(BPF_REG_1, 48),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* offset -1 */
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .retval = 1,
+},
+{ /* Expect ACCEPT: r0 += 47 then r0 -= 47, offset 0 at the load. */
+ "map access: value_ptr -= known scalar, lower oob arith, test 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 47),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 47 */
+ BPF_MOV64_IMM(BPF_REG_1, 47),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect ACCEPT: scalar 4 in r1 has the value pointer added to it, then r1 is read. */
+ "map access: known scalar += value_ptr",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect ACCEPT: r0 += 4, then a 1-byte load through r0. */
+ "map access: value_ptr += known scalar, 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 4 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect REJECT: r0 += 49, then a 1-byte load through r0. */
+ "map access: value_ptr += known scalar, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 49),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 49 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "invalid access to map value",
+},
+{ /* Expect REJECT: r0 += -1, then a 1-byte load through r0. */
+ "map access: value_ptr += known scalar, 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset -1 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "invalid access to map value",
+},
+{ /* Expect ACCEPT: r0 goes +5, -2, -1 (net offset 2) before a 1-byte load. */
+ "map access: value_ptr += known scalar, 4",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 7), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 5),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 5 */
+ BPF_MOV64_IMM(BPF_REG_1, -2),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 3 */
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 2 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect ACCEPT: returns the 32-bit word at offset (6 + 1) * sizeof(int) of the value. */
+ "map access: value_ptr += known scalar, 5",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip to exit */
+ BPF_MOV64_IMM(BPF_REG_1, (6 + 1) * sizeof(int)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0xabcdef12, /* presumably pre-stored in the map by the harness - TODO confirm */
+},
+{ /* Expect ACCEPT: two adds totalling 7 * sizeof(int), then a 32-bit load through r0. */
+ "map access: value_ptr += known scalar, 6",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: skip to exit */
+ BPF_MOV64_IMM(BPF_REG_1, (3 + 1) * sizeof(int)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 4 * sizeof(int) */
+ BPF_MOV64_IMM(BPF_REG_1, 3 * sizeof(int)),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 7 * sizeof(int) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0xabcdef12, /* presumably pre-stored in the map by the harness - TODO confirm */
+},
+{ /* Expect ACCEPT: writes 0x12345678 at offset 0, moves r0 by +2 and -2, reads it back. */
+ "map access: value_ptr += N, value_ptr -= N known scalar",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: skip to exit */
+ BPF_MOV32_IMM(BPF_REG_1, 0x12345678),
+ BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* store the marker at offset 0 */
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 2), /* offset 2 (immediate form) */
+ BPF_MOV64_IMM(BPF_REG_1, 2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* back to offset 0 (register form) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0x12345678,
+},
+{ /* Expect ACCEPT: r1 = (byte from the value) & 0xf, r1 += r0, 1-byte load through r1. */
+ "map access: unknown scalar += value_ptr, 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), /* mask to 0..15 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect ACCEPT: r1 = (word from the value) & 31, r1 += r0, 32-bit load through r1. */
+ "map access: unknown scalar += value_ptr, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip to exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), /* mask to 0..31 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0xabcdef12, /* depends on map contents set up outside this file */
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* As test 2, preceded by r0 += -1 and r0 += 1: ACCEPT privileged, REJECT unprivileged. */
+ "map access: unknown scalar += value_ptr, 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), /* r0 == 0: skip to exit */
+ BPF_MOV64_IMM(BPF_REG_1, -1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset -1 */
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* back to offset 0 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), /* mask to 0..31 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* scalar (dst) += pointer (src) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .retval = 0xabcdef12, /* depends on map contents set up outside this file */
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Expect REJECT: r0 is first moved to offset 19, so adding up to 31 more is too far. */
+ "map access: unknown scalar += value_ptr, 4",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), /* r0 == 0: skip to exit */
+ BPF_MOV64_IMM(BPF_REG_1, 19),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 19 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), /* mask to 0..31 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_0), /* offset 19..50 */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0), /* 4-byte load at up to offset 50 */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R1 max value is outside of the allowed memory range",
+ .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range",
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Expect ACCEPT: r0 += (byte from the value) & 0xf, then a 1-byte load through r0. */
+ "map access: value_ptr += unknown scalar, 1",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), /* mask to 0..15 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* pointer (dst) += scalar (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect ACCEPT: r0 += (word from the value) & 31, then a 32-bit load through r0. */
+ "map access: value_ptr += unknown scalar, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip to exit */
+ BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 31), /* mask to 0..31 */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* pointer (dst) += scalar (src) */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0), /* the loaded word is the return value */
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 0xabcdef12, /* depends on map contents set up outside this file */
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Expect ACCEPT: r0 += r3 where r3 = (x & 1) | 1, guarded by an r2 > r3 comparison. */
+ "map access: value_ptr += unknown scalar, 3",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), /* r0 == 0: jump to "r0 = 2" below */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 8),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 16),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_3, 1),
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_3, 1), /* (x & 1) | 1 is always 1 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_2, BPF_REG_3, 4), /* r2 > r3: jump to "r0 = 2" below */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_3), /* pointer (dst) += scalar (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_JMP_IMM(BPF_JA, 0, 0, -3), /* back to the BPF_EXIT_INSN above */
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect REJECT: the lookup result is added to itself. */
+ "map access: value_ptr += value_ptr",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), /* r0 == 0: skip the access */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_0), /* pointer += pointer */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R0 pointer += pointer prohibited",
+},
+{ /* Expect REJECT: the value pointer is subtracted from the scalar 4 held in r1. */
+ "map access: known scalar -= value_ptr",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), /* scalar (dst) -= pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R1 tried to subtract pointer from scalar",
+},
+{ /* Expect REJECT: r0 -= 4 leaves offset -4 at the load. */
+ "map access: value_ptr -= known scalar",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 4),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* offset -4 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R0 min value is outside of the allowed memory range",
+},
+{ /* Expect ACCEPT: r0 += 6 then r0 -= 4, offset 2 at the load. */
+ "map access: value_ptr -= known scalar, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 5), /* r0 == 0: skip the access */
+ BPF_MOV64_IMM(BPF_REG_1, 6),
+ BPF_MOV64_IMM(BPF_REG_2, 4),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1), /* offset 6 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_2), /* offset 2 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = ACCEPT,
+ .retval = 1,
+},
+{ /* Expect REJECT: the value pointer is subtracted from r1 = (byte from the value) & 0xf. */
+ "map access: unknown scalar -= value_ptr",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), /* mask to 0..15 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_1, BPF_REG_0), /* scalar (dst) -= pointer (src) */
+ BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R1 tried to subtract pointer from scalar",
+},
+{ /* Expect REJECT: r0 -= (byte from the value) & 0xf, starting from offset 0. */
+ "map access: value_ptr -= unknown scalar",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), /* *(u64 *)(r10 - 8) = 0 */
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = r10 - 8 */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4), /* r0 == 0: skip the access */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0), /* r1 = value not known at verification time */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf), /* mask to 0..15 */
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1), /* offset -15..0 */
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 }, /* index of the BPF_LD_MAP_FD insn */
+ .result = REJECT,
+ .errstr = "R0 min value is negative",
+},
+{
+ "map access: value_ptr -= unknown scalar, 2",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0xf),
+ BPF_ALU64_IMM(BPF_OR, BPF_REG_1, 0x7),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x7),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_1),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 },
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
+ .retval = 1,
+},
+{
+ "map access: value_ptr -= value_ptr",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_0, BPF_REG_0),
+ BPF_LDX_MEM(BPF_B, BPF_REG_1, BPF_REG_0, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 3 },
+ .result = REJECT,
+ .errstr = "R0 invalid mem access 'inv'",
+ .errstr_unpriv = "R0 pointer -= pointer prohibited",
+},
+{
+ "map access: trying to leak tained dst reg",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
+ BPF_MOV32_IMM(BPF_REG_1, 0xFFFFFFFF),
+ BPF_MOV32_REG(BPF_REG_1, BPF_REG_1),
+ BPF_ALU64_REG(BPF_SUB, BPF_REG_2, BPF_REG_1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_array_48b = { 4 },
+ .result = REJECT,
+ .errstr = "math between map_value pointer and 4294967295 is not allowed",
+},
+{
+ "32bit pkt_ptr -= scalar",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 40),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_8, 2),
+ BPF_ALU32_REG(BPF_MOV, BPF_REG_4, BPF_REG_7),
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_6, BPF_REG_4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "32bit scalar -= pkt_ptr",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_8, BPF_REG_1,
+ offsetof(struct __sk_buff, data_end)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_1,
+ offsetof(struct __sk_buff, data)),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_7),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, 40),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_6, BPF_REG_8, 2),
+ BPF_ALU32_REG(BPF_MOV, BPF_REG_4, BPF_REG_6),
+ BPF_ALU32_REG(BPF_SUB, BPF_REG_4, BPF_REG_7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .result = ACCEPT,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
diff --git a/tools/testing/selftests/bpf/verifier/var_off.c b/tools/testing/selftests/bpf/verifier/var_off.c
new file mode 100644
index 000000000..eab1f7f56
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/var_off.c
@@ -0,0 +1,343 @@
+{
+ "variable-offset ctx access",
+ .insns = {
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ /* add it to skb. We now have either &skb->len or
+ * &skb->pkt_type, but we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_2),
+ /* dereference it */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "variable ctx access var_off=(0x0; 0x4)",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "variable-offset stack read, priv vs unpriv",
+ .insns = {
+ /* Fill the top 8 bytes of the stack */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8),
+ /* add it to fp. We now have either fp-4 or fp-8, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* dereference it for a stack read */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .result_unpriv = REJECT,
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "variable-offset stack read, uninitialized",
+ .insns = {
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8),
+ /* add it to fp. We now have either fp-4 or fp-8, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* dereference it for a stack read */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "invalid variable-offset read from stack R2",
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "variable-offset stack write, priv vs unpriv",
+ .insns = {
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 8-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-8 or fp-16, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it for a stack write */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ /* Now read from the address we just wrote. This shows
+ * that, after a variable-offset write, a privileged
+ * program can read the slots that were in the range of
+ * that write (even if the verifier doesn't actually know
+ * if the slot being read was really written to or not).
+ */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_2, 0),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ /* Variable stack access is rejected for unprivileged.
+ */
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+},
+{
+ "variable-offset stack write clobbers spilled regs",
+ .insns = {
+ /* Dummy instruction; needed because we need to patch the next one
+ * and we can't patch the first instruction.
+ */
+ BPF_MOV64_IMM(BPF_REG_6, 0),
+ /* Make R0 a map ptr */
+ BPF_LD_MAP_FD(BPF_REG_0, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 8-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 8),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-8 or fp-16, but
+ * we don't know which.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Spill R0(map ptr) into stack */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ /* Dereference the unknown value for a stack write */
+ BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0),
+ /* Fill the register back into R2 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8),
+ /* Try to dereference R2 for a memory load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 1 },
+ /* The unprivileged case is not too interesting; variable
+ * stack access is rejected.
+ */
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .result_unpriv = REJECT,
+ /* In the privileged case, dereferencing a spilled-and-then-filled
+ * register is rejected because the previous variable offset stack
+ * write might have overwritten the spilled pointer (i.e. we lose track
+ * of the spilled register when we analyze the write).
+ */
+ .errstr = "R2 invalid mem access 'inv'",
+ .result = REJECT,
+},
+{
+ "indirect variable-offset stack access, unbounded",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 6),
+ BPF_MOV64_IMM(BPF_REG_3, 28),
+ /* Fill the top 16 bytes of the stack. */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value. */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_1, offsetof(struct bpf_sock_ops,
+ bytes_received)),
+ /* Check the lower bound but don't check the upper one. */
+ BPF_JMP_IMM(BPF_JSLT, BPF_REG_4, 0, 4),
+ /* Point the lower bound to initialized stack. Offset is now in range
+ * from fp-16 to fp+0x7fffffffffffffef, i.e. max value is unbounded.
+ */
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_4, 16),
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_10),
+ BPF_MOV64_IMM(BPF_REG_5, 8),
+ /* Dereference it indirectly. */
+ BPF_EMIT_CALL(BPF_FUNC_getsockopt),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid unbounded variable-offset indirect access to stack R4",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
+{
+ "indirect variable-offset stack access, max out of bound",
+ .insns = {
+ /* Fill the top 8 bytes of the stack */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 8),
+ /* add it to fp. We now have either fp-4 or fp-8, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* dereference it indirectly */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 },
+ .errstr = "invalid variable-offset indirect access to stack R2",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "indirect variable-offset stack access, min out of bound",
+ .insns = {
+ /* Fill the top 8 bytes of the stack */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 516),
+ /* add it to fp. We now have either fp-516 or fp-512, but
+ * we don't know which
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* dereference it indirectly */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 },
+ .errstr = "invalid variable-offset indirect access to stack R2",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "indirect variable-offset stack access, max_off+size > max_initialized",
+ .insns = {
+ /* Fill only the second from top 8 bytes of the stack. */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+ /* Get an unknown value. */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned. */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-12 or fp-16, but we don't know
+ * which. fp-12 size 8 is partially uninitialized stack.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it indirectly. */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 },
+ .errstr = "invalid indirect read from stack R2 var_off",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "indirect variable-offset stack access, min_off < min_initialized",
+ .insns = {
+ /* Fill only the top 8 bytes of the stack. */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned. */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-12 or fp-16, but we don't know
+ * which. fp-16 size 8 is entirely uninitialized stack.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it indirectly. */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 5 },
+ .errstr = "invalid indirect read from stack R2 var_off",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
+{
+ "indirect variable-offset stack access, priv vs unpriv",
+ .insns = {
+ /* Fill the top 16 bytes of the stack. */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value. */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned. */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-12 or fp-16, we don't know
+ * which, but either way it points to initialized stack.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it indirectly. */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 6 },
+ .errstr_unpriv = "R2 variable stack access prohibited for !root",
+ .result_unpriv = REJECT,
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_CGROUP_SKB,
+},
+{
+ "indirect variable-offset stack access, uninitialized",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_2, 6),
+ BPF_MOV64_IMM(BPF_REG_3, 28),
+ /* Fill the top 16 bytes of the stack, except for fp-12..fp-9. */
+ BPF_ST_MEM(BPF_W, BPF_REG_10, -16, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value. */
+ BPF_LDX_MEM(BPF_W, BPF_REG_4, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned. */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_4, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_4, 16),
+ /* Add it to fp. We now have either fp-12 or fp-16, we don't know
+ * which, but either way the 8-byte access covers unwritten fp-12..fp-9.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_10),
+ BPF_MOV64_IMM(BPF_REG_5, 8),
+ /* Dereference it indirectly. */
+ BPF_EMIT_CALL(BPF_FUNC_getsockopt),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "invalid indirect read from stack R4 var_off",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_SOCK_OPS,
+},
+{
+ "indirect variable-offset stack access, ok",
+ .insns = {
+ /* Fill the top 16 bytes of the stack. */
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 0),
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ /* Get an unknown value. */
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
+ /* Make it small and 4-byte aligned. */
+ BPF_ALU64_IMM(BPF_AND, BPF_REG_2, 4),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_2, 16),
+ /* Add it to fp. We now have either fp-12 or fp-16, we don't know
+ * which, but either way it points to initialized stack.
+ */
+ BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_10),
+ /* Dereference it indirectly. */
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 6 },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_LWT_IN,
+},
diff --git a/tools/testing/selftests/bpf/verifier/wide_access.c b/tools/testing/selftests/bpf/verifier/wide_access.c
new file mode 100644
index 000000000..ccade9312
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/wide_access.c
@@ -0,0 +1,73 @@
+#define BPF_SOCK_ADDR_STORE(field, off, res, err) \
+{ \
+ "wide store to bpf_sock_addr." #field "[" #off "]", \
+ .insns = { \
+ BPF_MOV64_IMM(BPF_REG_0, 1), \
+ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, \
+ offsetof(struct bpf_sock_addr, field[off])), \
+ BPF_EXIT_INSN(), \
+ }, \
+ .result = res, \
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \
+ .errstr = err, \
+}
+
+/* user_ip6[0] is u64 aligned */
+BPF_SOCK_ADDR_STORE(user_ip6, 0, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_STORE(user_ip6, 1, REJECT,
+ "invalid bpf_context access off=12 size=8"),
+BPF_SOCK_ADDR_STORE(user_ip6, 2, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_STORE(user_ip6, 3, REJECT,
+ "invalid bpf_context access off=20 size=8"),
+
+/* msg_src_ip6[0] is _not_ u64 aligned */
+BPF_SOCK_ADDR_STORE(msg_src_ip6, 0, REJECT,
+ "invalid bpf_context access off=44 size=8"),
+BPF_SOCK_ADDR_STORE(msg_src_ip6, 1, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_STORE(msg_src_ip6, 2, REJECT,
+ "invalid bpf_context access off=52 size=8"),
+BPF_SOCK_ADDR_STORE(msg_src_ip6, 3, REJECT,
+ "invalid bpf_context access off=56 size=8"),
+
+#undef BPF_SOCK_ADDR_STORE
+
+#define BPF_SOCK_ADDR_LOAD(field, off, res, err) \
+{ \
+ "wide load from bpf_sock_addr." #field "[" #off "]", \
+ .insns = { \
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, \
+ offsetof(struct bpf_sock_addr, field[off])), \
+ BPF_MOV64_IMM(BPF_REG_0, 1), \
+ BPF_EXIT_INSN(), \
+ }, \
+ .result = res, \
+ .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, \
+ .expected_attach_type = BPF_CGROUP_UDP6_SENDMSG, \
+ .errstr = err, \
+}
+
+/* user_ip6[0] is u64 aligned */
+BPF_SOCK_ADDR_LOAD(user_ip6, 0, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_LOAD(user_ip6, 1, REJECT,
+ "invalid bpf_context access off=12 size=8"),
+BPF_SOCK_ADDR_LOAD(user_ip6, 2, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_LOAD(user_ip6, 3, REJECT,
+ "invalid bpf_context access off=20 size=8"),
+
+/* msg_src_ip6[0] is _not_ u64 aligned */
+BPF_SOCK_ADDR_LOAD(msg_src_ip6, 0, REJECT,
+ "invalid bpf_context access off=44 size=8"),
+BPF_SOCK_ADDR_LOAD(msg_src_ip6, 1, ACCEPT,
+ NULL),
+BPF_SOCK_ADDR_LOAD(msg_src_ip6, 2, REJECT,
+ "invalid bpf_context access off=52 size=8"),
+BPF_SOCK_ADDR_LOAD(msg_src_ip6, 3, REJECT,
+ "invalid bpf_context access off=56 size=8"),
+
+#undef BPF_SOCK_ADDR_LOAD
diff --git a/tools/testing/selftests/bpf/verifier/xadd.c b/tools/testing/selftests/bpf/verifier/xadd.c
new file mode 100644
index 000000000..c5de2e62c
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/xadd.c
@@ -0,0 +1,97 @@
+{
+ "xadd/w check unaligned stack",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "misaligned stack access off",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "xadd/w check unaligned map",
+ .insns = {
+ BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+ BPF_LD_MAP_FD(BPF_REG_1, 0),
+ BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_1, 1),
+ BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3),
+ BPF_EXIT_INSN(),
+ },
+ .fixup_map_hash_8b = { 3 },
+ .result = REJECT,
+ .errstr = "misaligned value access off",
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+},
+{
+ "xadd/w check unaligned pkt",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 2),
+ BPF_MOV64_IMM(BPF_REG_0, 99),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 6),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0),
+ BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0),
+ BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1),
+ BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = REJECT,
+ .errstr = "BPF_XADD stores into R2 pkt is not allowed",
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "xadd/w check whether src/dst got mangled, 1",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 3,
+},
+{
+ "xadd/w check whether src/dst got mangled, 2",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0),
+ BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+ BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+ BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3),
+ BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8),
+ BPF_EXIT_INSN(),
+ BPF_MOV64_IMM(BPF_REG_0, 42),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_SCHED_CLS,
+ .retval = 3,
+},
diff --git a/tools/testing/selftests/bpf/verifier/xdp.c b/tools/testing/selftests/bpf/verifier/xdp.c
new file mode 100644
index 000000000..5ac390508
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/xdp.c
@@ -0,0 +1,14 @@
+{
+ "XDP, using ifindex from netdev",
+ .insns = {
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, ingress_ifindex)),
+ BPF_JMP_IMM(BPF_JLT, BPF_REG_2, 1, 1),
+ BPF_MOV64_IMM(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .retval = 1,
+},
diff --git a/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
new file mode 100644
index 000000000..b4ec228eb
--- /dev/null
+++ b/tools/testing/selftests/bpf/verifier/xdp_direct_packet_access.c
@@ -0,0 +1,1468 @@
+{
+ "XDP pkt read, pkt_end mangling, bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R3 pointer arithmetic on pkt_end",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "XDP pkt read, pkt_end mangling, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_ALU64_IMM(BPF_SUB, BPF_REG_3, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R3 pointer arithmetic on pkt_end",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+},
+{
+ "XDP pkt read, pkt_data' > pkt_end, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' > pkt_end, bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' > pkt_end, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 0),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' > pkt_end, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' > pkt_end, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_end > pkt_data', good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_end > pkt_data', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_end > pkt_data', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_end > pkt_data', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_end > pkt_data', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' < pkt_end, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' < pkt_end, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data' < pkt_end, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 7 < data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' < pkt_end, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 < data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' < pkt_end, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end < pkt_data', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end, but the read extends to data + 12; expected to be rejected. */
+ "XDP pkt read, pkt_end < pkt_data', bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data + 4, data + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_end < pkt_data', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 9 <= data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end < pkt_data', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9), /* r1 = data + 9 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data + 7 <= data_end (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_end < pkt_data', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 < data_end; the 4-byte read inside that range is expected to be accepted. */
+ "XDP pkt read, pkt_data' >= pkt_end, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data + 3, data + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data + 6 < data_end (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_data' >= pkt_end, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), /* r1 = data + 6 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_data' >= pkt_end, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data + 3, data + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 7 < data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' >= pkt_end, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 < data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' >= pkt_end, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end >= pkt_data', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end, but the read extends to data + 12; expected to be rejected. */
+ "XDP pkt read, pkt_end >= pkt_data', bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data + 4, data + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* No JA here, so the load runs only when the r3 >= r1 test fails; expected to be rejected. */
+ "XDP pkt read, pkt_end >= pkt_data', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* reached only when r3 < r1; 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 9 <= data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end >= pkt_data', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9), /* r1 = data + 9 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data + 7 <= data_end (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_end >= pkt_data', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' <= pkt_end, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1), /* if r1 <= r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 <= data_end, but the read extends to data + 12; expected to be rejected. */
+ "XDP pkt read, pkt_data' <= pkt_end, bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1), /* if r1 <= r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data + 4, data + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* No JA here, so the load runs only when the r1 <= r3 test fails; expected to be rejected. */
+ "XDP pkt read, pkt_data' <= pkt_end, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1), /* if r1 <= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* reached only when r1 > r3; 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 9 <= data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data' <= pkt_end, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9), /* r1 = data + 9 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1), /* if r1 <= r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data + 7 <= data_end (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_data' <= pkt_end, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1), /* if r1 <= r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 < data_end; the 4-byte read inside that range is expected to be accepted. */
+ "XDP pkt read, pkt_end <= pkt_data', good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), /* if r3 <= r1: skip the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data + 3, data + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data + 6 < data_end (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_end <= pkt_data', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), /* r1 = data + 6 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), /* if r3 <= r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_end <= pkt_data', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data + 3, data + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 7 < data_end (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end <= pkt_data', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data + 7 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), /* if r3 <= r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data + 8 < data_end (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_end <= pkt_data', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, offsetof(struct xdp_md, data)), /* r2 = data */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1,
+ offsetof(struct xdp_md, data_end)), /* r3 = data_end */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data + 8 */
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1), /* if r3 <= r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data, data + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' > pkt_data, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* if r1 > r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data, but the read extends to data_meta + 12; expected to be rejected. */
+ "XDP pkt read, pkt_meta' > pkt_data, bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* if r1 > r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data_meta + 4, data_meta + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_meta' > pkt_data, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 9 <= data (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' > pkt_data, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9), /* r1 = data_meta + 9 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* if r1 > r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data_meta + 7 <= data (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_meta' > pkt_data, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data_meta + 7 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_1, BPF_REG_3, 1), /* if r1 > r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data; the 4-byte read inside that range is expected to be accepted. */
+ "XDP pkt read, pkt_data > pkt_meta', good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), /* if r3 > r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data_meta + 3, data_meta + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data_meta + 6 < data (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_data > pkt_meta', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), /* r1 = data_meta + 6 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), /* if r3 > r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* No JA here, so the load runs only when the r3 > r1 test fails; expected to be rejected. */
+ "XDP pkt read, pkt_data > pkt_meta', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), /* if r3 > r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* reached only when r3 <= r1; 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 7 < data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data > pkt_meta', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data_meta + 7 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), /* if r3 > r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data > pkt_meta', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGT, BPF_REG_3, BPF_REG_1, 1), /* if r3 > r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data; the 4-byte read inside that range is expected to be accepted. */
+ "XDP pkt read, pkt_meta' < pkt_data, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data_meta + 3, data_meta + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data_meta + 6 < data (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_meta' < pkt_data, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), /* r1 = data_meta + 6 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* No JA here, so the load runs only when the r1 < r3 test fails; expected to be rejected. */
+ "XDP pkt read, pkt_meta' < pkt_data, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* reached only when r1 >= r3; 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 7 < data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' < pkt_data, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data_meta + 7 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' < pkt_data, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_1, BPF_REG_3, 1), /* if r1 < r3: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data < pkt_meta', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data, but the read extends to data_meta + 12; expected to be rejected. */
+ "XDP pkt read, pkt_data < pkt_meta', bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data_meta + 4, data_meta + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_data < pkt_meta', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 9 <= data (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data < pkt_meta', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9), /* r1 = data_meta + 9 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data_meta + 7 <= data (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_data < pkt_meta', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data_meta + 7 */
+ BPF_JMP_REG(BPF_JLT, BPF_REG_3, BPF_REG_1, 1), /* if r3 < r1: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data; the 4-byte read inside that range is expected to be accepted. */
+ "XDP pkt read, pkt_meta' >= pkt_data, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data_meta + 3, data_meta + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies only data_meta + 6 < data (7 bytes); the 8-byte read is expected to be rejected. */
+ "XDP pkt read, pkt_meta' >= pkt_data, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6), /* r1 = data_meta + 6 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Jump offset 0 means the comparison guards nothing; the read is expected to be rejected. */
+ "XDP pkt read, pkt_meta' >= pkt_data, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 0), /* offset 0: both outcomes continue at the load */
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5), /* 4-byte read of [data_meta + 3, data_meta + 7) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 7 < data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' >= pkt_data, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7), /* r1 = data_meta + 7 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 < data (9 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_meta' >= pkt_data, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_1, BPF_REG_3, 1), /* if r1 >= r3: skip the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data (8 bytes); the 8-byte read is expected to be accepted. */
+ "XDP pkt read, pkt_data >= pkt_meta', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8), /* 8-byte read of [data_meta, data_meta + 8) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{ /* Load path implies data_meta + 8 <= data, but the read extends to data_meta + 12; expected to be rejected. */
+ "XDP pkt read, pkt_data >= pkt_meta', bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)), /* r2 = data_meta */
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)), /* r3 = data */
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8), /* r1 = data_meta + 8 */
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1), /* if r3 >= r1: skip the JA, go to the load */
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1), /* otherwise jump over the load */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4), /* 8-byte read of [data_meta + 4, data_meta + 12) */
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data >= pkt_meta', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data >= pkt_meta', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data >= pkt_meta', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_meta' <= pkt_data, corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_meta' <= pkt_data, bad access 1",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -4),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_meta' <= pkt_data, bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_meta' <= pkt_data, corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 9),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -9),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_meta' <= pkt_data, corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_1, BPF_REG_3, 1),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data <= pkt_meta', good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data <= pkt_meta', corner case -1, bad access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 6),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -6),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data <= pkt_meta', bad access 2",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 0),
+ BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, -5),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .errstr = "R1 offset is outside of the packet",
+ .result = REJECT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data <= pkt_meta', corner case, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 7),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -7),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
+{
+ "XDP pkt read, pkt_data <= pkt_meta', corner case +1, good access",
+ .insns = {
+ BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1,
+ offsetof(struct xdp_md, data_meta)),
+ BPF_LDX_MEM(BPF_W, BPF_REG_3, BPF_REG_1, offsetof(struct xdp_md, data)),
+ BPF_MOV64_REG(BPF_REG_1, BPF_REG_2),
+ BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 8),
+ BPF_JMP_REG(BPF_JLE, BPF_REG_3, BPF_REG_1, 1),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_1, -8),
+ BPF_MOV64_IMM(BPF_REG_0, 0),
+ BPF_EXIT_INSN(),
+ },
+ .result = ACCEPT,
+ .prog_type = BPF_PROG_TYPE_XDP,
+ .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
+},
diff --git a/tools/testing/selftests/bpf/with_addr.sh b/tools/testing/selftests/bpf/with_addr.sh
new file mode 100755
index 000000000..ffcd3953f
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_addr.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# add private ipv4 and ipv6 addresses to loopback
+
+readonly V6_INNER='100::a/128'
+readonly V4_INNER='192.168.0.1/32'
+
+if getopts ":s" opt; then
+ readonly SIT_DEV_NAME='sixtofourtest0'
+ readonly V6_SIT='2::/64'
+ readonly V4_SIT='172.17.0.1/32'
+ shift
+fi
+
+# Print "error: <message>" on stderr and terminate the script with status 1.
+fail() {
+	echo "error: $*" >&2
+	exit 1
+}
+
+setup() {
+ ip -6 addr add "${V6_INNER}" dev lo || fail 'failed to setup v6 address'
+ ip -4 addr add "${V4_INNER}" dev lo || fail 'failed to setup v4 address'
+
+ if [[ -n "${V6_SIT}" ]]; then
+ ip link add "${SIT_DEV_NAME}" type sit remote any local any \
+ || fail 'failed to add sit'
+ ip link set dev "${SIT_DEV_NAME}" up \
+ || fail 'failed to bring sit device up'
+ ip -6 addr add "${V6_SIT}" dev "${SIT_DEV_NAME}" \
+ || fail 'failed to setup v6 SIT address'
+ ip -4 addr add "${V4_SIT}" dev "${SIT_DEV_NAME}" \
+ || fail 'failed to setup v4 SIT address'
+ fi
+
+ sleep 2 # avoid race causing bind to fail
+}
+
+cleanup() {
+ if [[ -n "${V6_SIT}" ]]; then
+ ip -4 addr del "${V4_SIT}" dev "${SIT_DEV_NAME}"
+ ip -6 addr del "${V6_SIT}" dev "${SIT_DEV_NAME}"
+ ip link del "${SIT_DEV_NAME}"
+ fi
+
+ ip -4 addr del "${V4_INNER}" dev lo
+ ip -6 addr del "${V6_INNER}" dev lo
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"
diff --git a/tools/testing/selftests/bpf/with_tunnels.sh b/tools/testing/selftests/bpf/with_tunnels.sh
new file mode 100755
index 000000000..e24949ed3
--- /dev/null
+++ b/tools/testing/selftests/bpf/with_tunnels.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# setup tunnels for flow dissection test
+
+readonly SUFFIX="test_$(mktemp -u XXXX)"
+CONFIG="remote 127.0.0.2 local 127.0.0.1 dev lo"
+
+setup() {
+ ip link add "ipip_${SUFFIX}" type ipip ${CONFIG}
+ ip link add "gre_${SUFFIX}" type gre ${CONFIG}
+ ip link add "sit_${SUFFIX}" type sit ${CONFIG}
+
+ echo "tunnels before test:"
+ ip tunnel show
+
+ ip link set "ipip_${SUFFIX}" up
+ ip link set "gre_${SUFFIX}" up
+ ip link set "sit_${SUFFIX}" up
+}
+
+
+cleanup() {
+ ip tunnel del "ipip_${SUFFIX}"
+ ip tunnel del "gre_${SUFFIX}"
+ ip tunnel del "sit_${SUFFIX}"
+
+ echo "tunnels after test:"
+ ip tunnel show
+}
+
+trap cleanup EXIT
+
+setup
+"$@"
+exit "$?"
diff --git a/tools/testing/selftests/bpf/xdping.c b/tools/testing/selftests/bpf/xdping.c
new file mode 100644
index 000000000..842d9155d
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdping.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <sys/resource.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+
+#include "bpf/bpf.h"
+#include "bpf/libbpf.h"
+
+#include "xdping.h"
+
+static int ifindex;
+static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+
+/*
+ * Reset the XDP program fd of the interface to -1 (i.e. remove the program
+ * this tool installed). Installed as the SIGINT/SIGTERM handler, in which
+ * case sig is non-zero and the process exits with status 1; main() also calls
+ * it directly with sig == 0 and then simply continues.
+ */
+static void cleanup(int sig)
+{
+	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+	if (!sig)
+		return;
+
+	exit(1);
+}
+
+/*
+ * Print the RTT samples recorded in the ping map for raddr.
+ *
+ * fd:    ping map file descriptor
+ * count: number of samples expected
+ * raddr: map key; also printed as the IPv4 peer address
+ *
+ * Returns 0 when all count samples were present (the map entry is deleted
+ * afterwards), 1 when the lookup fails or fewer samples were recorded.
+ */
+static int get_stats(int fd, __u16 count, __u32 raddr)
+{
+	struct in_addr peer = { .s_addr = raddr };
+	struct pinginfo info = { 0 };
+	char peerbuf[INET_ADDRSTRLEN];
+	__u16 seen = 0;
+
+	printf("\nXDP RTT data:\n");
+
+	if (bpf_map_lookup_elem(fd, &raddr, &info) != 0) {
+		perror("bpf_map_lookup elem");
+		return 1;
+	}
+
+	/* A zero sample marks the end of the recorded data. */
+	while (seen < count && info.times[seen] != 0) {
+		printf("64 bytes from %s: icmp_seq=%d ttl=64 time=%#.5f ms\n",
+		       inet_ntop(AF_INET, &peer, peerbuf, sizeof(peerbuf)),
+		       count + seen + 1,
+		       (double)info.times[seen] / 1000000);
+		seen++;
+	}
+
+	if (seen < count) {
+		fprintf(stderr, "Expected %d samples, got %d.\n", count, seen);
+		return 1;
+	}
+
+	bpf_map_delete_elem(fd, &raddr);
+
+	return 0;
+}
+
+/* Print the command line synopsis for program name prog on stderr. */
+static void show_usage(const char *prog)
+{
+	static const char usage_fmt[] =
+		"usage: %s [OPTS] -I interface destination\n\n"
+		"OPTS:\n"
+		" -c count Stop after sending count requests\n"
+		" (default %d, max %d)\n"
+		" -I interface interface name\n"
+		" -N Run in driver mode\n"
+		" -s Server mode\n"
+		" -S Run in skb mode\n";
+
+	fprintf(stderr, usage_fmt,
+		prog, XDPING_DEFAULT_COUNT, XDPING_MAX_COUNT);
+}
+
+/*
+ * xdping entry point.
+ *
+ * Parses the command line, loads "<argv[0]>_kern.o", installs its
+ * "xdpclient" program (or "xdpserver" with -s) on the interface given with
+ * -I and then either
+ *  - server mode: sleeps until SIGINT/SIGTERM, whose handler removes the
+ *    program and exits, or
+ *  - client mode: seeds the ping map for the destination, runs ping(8) and
+ *    prints the RTTs recorded in the map via get_stats().
+ *
+ * Returns 0 on success, non-zero on any failure.
+ */
+int main(int argc, char **argv)
+{
+	__u32 mode_flags = XDP_FLAGS_DRV_MODE | XDP_FLAGS_SKB_MODE;
+	struct addrinfo *a, hints = { .ai_family = AF_INET };
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	__u16 count = XDPING_DEFAULT_COUNT;
+	struct pinginfo pinginfo = { 0 };
+	const char *optstr = "c:I:NsS";
+	struct bpf_program *main_prog;
+	int prog_fd = -1, map_fd = -1;
+	struct sockaddr_in rin;
+	struct bpf_object *obj;
+	struct bpf_map *map;
+	char *ifname = NULL;
+	char filename[256];
+	int opt, ret = 1;
+	__u32 raddr = 0;
+	int server = 0;
+	int req_count;
+	char cmd[256];
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+		switch (opt) {
+		case 'c':
+			/*
+			 * Range-check as int before narrowing to __u16;
+			 * otherwise e.g. 65537 would wrap to a valid 1.
+			 */
+			req_count = atoi(optarg);
+			if (req_count < 1 || req_count > XDPING_MAX_COUNT) {
+				fprintf(stderr,
+					"min count is 1, max count is %d\n",
+					XDPING_MAX_COUNT);
+				return 1;
+			}
+			count = req_count;
+			break;
+		case 'I':
+			ifname = optarg;
+			ifindex = if_nametoindex(ifname);
+			if (!ifindex) {
+				fprintf(stderr, "Could not get interface %s\n",
+					ifname);
+				return 1;
+			}
+			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		case 's':
+			/* use server program */
+			server = 1;
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		default:
+			show_usage(basename(argv[0]));
+			return 1;
+		}
+	}
+
+	if (!ifname) {
+		show_usage(basename(argv[0]));
+		return 1;
+	}
+	/* Client mode needs a destination after the options. */
+	if (!server && optind == argc) {
+		show_usage(basename(argv[0]));
+		return 1;
+	}
+
+	if ((xdp_flags & mode_flags) == mode_flags) {
+		fprintf(stderr, "-N or -S can be specified, not both.\n");
+		show_usage(basename(argv[0]));
+		return 1;
+	}
+
+	if (!server) {
+		/* Only supports IPv4; see hints initialization above. */
+		if (getaddrinfo(argv[optind], NULL, &hints, &a) || !a) {
+			fprintf(stderr, "Could not resolve %s\n", argv[optind]);
+			return 1;
+		}
+		memcpy(&rin, a->ai_addr, sizeof(rin));
+		raddr = rin.sin_addr.s_addr;
+		freeaddrinfo(a);
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	/* The BPF object is expected next to the binary as <argv[0]>_kern.o. */
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (bpf_prog_load(filename, BPF_PROG_TYPE_XDP, &obj, &prog_fd)) {
+		fprintf(stderr, "load of %s failed\n", filename);
+		return 1;
+	}
+
+	main_prog = bpf_object__find_program_by_title(obj,
+						      server ? "xdpserver" :
+							       "xdpclient");
+	if (main_prog)
+		prog_fd = bpf_program__fd(main_prog);
+	if (!main_prog || prog_fd < 0) {
+		fprintf(stderr, "could not find xdping program\n");
+		return 1;
+	}
+
+	map = bpf_map__next(NULL, obj);
+	if (map)
+		map_fd = bpf_map__fd(map);
+	if (!map || map_fd < 0) {
+		fprintf(stderr, "Could not find ping map\n");
+		goto done;
+	}
+
+	signal(SIGINT, cleanup);
+	signal(SIGTERM, cleanup);
+
+	printf("Setting up XDP for %s, please wait...\n", ifname);
+
+	printf("XDP setup disrupts network connectivity, hit Ctrl+C to quit\n");
+
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd, xdp_flags) < 0) {
+		fprintf(stderr, "Link set xdp fd failed for %s\n", ifname);
+		goto done;
+	}
+
+	if (server) {
+		close(prog_fd);
+		close(map_fd);
+		printf("Running server on %s; press Ctrl+C to exit...\n",
+		       ifname);
+		/*
+		 * Sleep instead of spinning; SIGINT/SIGTERM run cleanup(),
+		 * which exits the process.
+		 */
+		for (;;)
+			pause();
+	}
+
+	/* Start xdping-ing from last regular ping reply, e.g. for a count
+	 * of 10 ICMP requests, we start xdping-ing using reply with seq number
+	 * 10. The reason the last "real" ping RTT is much higher is that
+	 * the ping program sees the ICMP reply associated with the last
+	 * XDP-generated packet, so ping doesn't get a reply until XDP is done.
+	 */
+	pinginfo.seq = htons(count);
+	pinginfo.count = count;
+
+	if (bpf_map_update_elem(map_fd, &raddr, &pinginfo, BPF_ANY)) {
+		fprintf(stderr, "could not communicate with BPF map: %s\n",
+			strerror(errno));
+		cleanup(0);
+		goto done;
+	}
+
+	/* We need to wait for XDP setup to complete. */
+	sleep(10);
+
+	snprintf(cmd, sizeof(cmd), "ping -c %d -I %s %s",
+		 count, ifname, argv[optind]);
+
+	printf("\nNormal ping RTT data\n");
+	printf("[Ignore final RTT; it is distorted by XDP using the reply]\n");
+
+	ret = system(cmd);
+
+	if (!ret)
+		ret = get_stats(map_fd, count, raddr);
+
+	cleanup(0);
+
+done:
+	if (prog_fd > 0)
+		close(prog_fd);
+	if (map_fd > 0)
+		close(map_fd);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/bpf/xdping.h b/tools/testing/selftests/bpf/xdping.h
new file mode 100644
index 000000000..afc578df7
--- /dev/null
+++ b/tools/testing/selftests/bpf/xdping.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#define XDPING_MAX_COUNT 10
+#define XDPING_DEFAULT_COUNT 4
+
+/*
+ * Value type of the xdping map. xdping.c looks entries up, updates and
+ * deletes them using the remote IPv4 address (a __u32) as the key.
+ */
+struct pinginfo {
+ /* NOTE(review): presumably the send timestamp of the request in flight; not written by xdping.c - confirm against the BPF program */
+ __u64 start;
+ /* ICMP sequence number in network byte order; xdping.c seeds it with htons(count) */
+ __be16 seq;
+ /* number of requests to time; xdping.c stores the -c value here */
+ __u16 count;
+ /* explicit padding: start + seq + count occupy 12 bytes, pad brings times[] to offset 16 */
+ __u32 pad;
+ /* one RTT sample per request; 0 means "no sample". xdping.c prints times[i] / 1000000 as ms, so presumably nanoseconds */
+ __u64 times[XDPING_MAX_COUNT];
+};
diff --git a/tools/testing/selftests/breakpoints/.gitignore b/tools/testing/selftests/breakpoints/.gitignore
new file mode 100644
index 000000000..def2e97da
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+breakpoint_test
+step_after_suspend_test
diff --git a/tools/testing/selftests/breakpoints/Makefile b/tools/testing/selftests/breakpoints/Makefile
new file mode 100644
index 000000000..9ec2c78de
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+# Taken from perf makefile
+# Machine name of the build host; "not" when uname is unavailable.
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+# Unless ARCH is given by the caller, derive it from uname_M, folding both
+# i?86 and x86_64 into "x86".
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+# Built on every architecture.
+TEST_GEN_PROGS := step_after_suspend_test
+
+# The debug-register based test is x86 only.
+ifeq ($(ARCH),x86)
+TEST_GEN_PROGS += breakpoint_test
+endif
+# The watchpoint byte-select test is built when ARCH is aarch64 or arm64.
+ifneq (,$(filter $(ARCH),aarch64 arm64))
+TEST_GEN_PROGS += breakpoint_test_arm64
+endif
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/breakpoints/breakpoint_test.c b/tools/testing/selftests/breakpoints/breakpoint_test.c
new file mode 100644
index 000000000..3266cc929
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/breakpoint_test.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2011 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Selftests for breakpoints (and more generally the do_debug() path) in x86.
+ */
+
+
+#include <sys/ptrace.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <sys/user.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define COUNT_ISN_BPS 4
+#define COUNT_WPS 4
+
+/*
+ * Breakpoint access modes, as passed to toggle_breakpoint(), which translates
+ * them into the type bits it writes to u_debugreg[7].
+ */
+enum {
+ /* instruction breakpoint (used by launch_instruction_breakpoints()) */
+ BP_X = 1,
+ /* read/write watchpoint; exercised with read accesses */
+ BP_RW = 2,
+ /* write watchpoint */
+ BP_W = 4,
+};
+
+static pid_t child_pid;
+
+/*
+ * Ensures the child and parent are always "talking" about
+ * the same test sequence. (ie: that we haven't forgotten
+ * to call check_trapped() somewhere).
+ */
+static int nr_tests;
+
+/*
+ * Store addr in slot n of the traced child's debug registers by poking
+ * u_debugreg[n] of its struct user. Bails out of the whole test on failure.
+ */
+static void set_breakpoint_addr(void *addr, int n)
+{
+	if (ptrace(PTRACE_POKEUSER, child_pid,
+		   offsetof(struct user, u_debugreg[n]), addr) == 0)
+		return;
+
+	ksft_exit_fail_msg("Can't set breakpoint addr: %s\n",
+			   strerror(errno));
+}
+
+/*
+ * Enable (set != 0) or disable (set == 0) breakpoint slot n in the traced
+ * child's debug control register, u_debugreg[7].
+ *
+ * n:      breakpoint slot, matching the address register index
+ * type:   one of BP_X, BP_W, BP_RW
+ * len:    watched length in bytes: 1, 2, 4 or 8
+ * local:  include the slot's local-enable bit (and bit 8)
+ * global: include the slot's global-enable bit (and bit 9)
+ *
+ * Bails out of the whole test on an invalid type/len or a ptrace failure,
+ * like set_breakpoint_addr() does.
+ */
+static void toggle_breakpoint(int n, int type, int len,
+			      int local, int global, int set)
+{
+	int ret;
+
+	int xtype = 0, xlen = 0;
+	unsigned long vdr7, dr7;
+
+	switch (type) {
+	case BP_X:
+		xtype = 0;
+		break;
+	case BP_W:
+		xtype = 1;
+		break;
+	case BP_RW:
+		xtype = 3;
+		break;
+	default:
+		/* Do not program the register from an unknown mode. */
+		ksft_exit_fail_msg("Invalid breakpoint type: %d\n", type);
+	}
+
+	switch (len) {
+	case 1:
+		xlen = 0;
+		break;
+	case 2:
+		xlen = 4;
+		break;
+	case 4:
+		xlen = 0xc;
+		break;
+	case 8:
+		xlen = 8;
+		break;
+	default:
+		ksft_exit_fail_msg("Invalid breakpoint length: %d\n", len);
+	}
+
+	/* PEEKUSER returns the register value, so errors show up in errno only. */
+	errno = 0;
+	dr7 = ptrace(PTRACE_PEEKUSER, child_pid,
+		     offsetof(struct user, u_debugreg[7]), 0);
+	if (errno)
+		ksft_exit_fail_msg("Can't read dr7: %s\n", strerror(errno));
+
+	/* Slot n has a 4-bit len/type field starting at bit 16 + 4 * n ... */
+	vdr7 = (xlen | xtype) << 16;
+	vdr7 <<= 4 * n;
+
+	/* ... and a local/global enable bit pair starting at bit 2 * n. */
+	if (local) {
+		vdr7 |= 1 << (2 * n);
+		vdr7 |= 1 << 8;
+	}
+	if (global) {
+		vdr7 |= 2 << (2 * n);
+		vdr7 |= 1 << 9;
+	}
+
+	if (set)
+		dr7 |= vdr7;
+	else
+		dr7 &= ~vdr7;
+
+	ret = ptrace(PTRACE_POKEUSER, child_pid,
+		     offsetof(struct user, u_debugreg[7]), dr7);
+	if (ret)
+		ksft_exit_fail_msg("Can't set dr7: %s\n", strerror(errno));
+}
+
+/* Dummy variables to test read/write accesses */
+static unsigned long long dummy_var[4];
+
+/* Dummy functions to test execution accesses */
+static void dummy_func(void) { }
+static void dummy_func1(void) { }
+static void dummy_func2(void) { }
+static void dummy_func3(void) { }
+
+static void (*dummy_funcs[])(void) = {
+ dummy_func,
+ dummy_func1,
+ dummy_func2,
+ dummy_func3,
+};
+
+/*
+ * Written in the child's memory by the tracing parent (check_success() pokes
+ * a 1 here) whenever it saw the child stop with SIGTRAP.
+ */
+static int trapped;
+
+/*
+ * Called by the child after every access that is expected to trap. If the
+ * parent did not flag a trap, signal ourselves with SIGUSR1 so that the
+ * parent wakes up and notices the failure. Also re-arms the flag and advances
+ * the child's test counter.
+ */
+static void check_trapped(void)
+{
+	if (trapped == 0)
+		kill(getpid(), SIGUSR1);
+
+	nr_tests++;
+	trapped = 0;
+}
+
+/*
+ * Store an all-ones value of len bytes (1, 2, 4 or 8) into each of the four
+ * dummy variables in turn, calling check_trapped() after every store. Other
+ * lengths perform no store but still call check_trapped().
+ */
+static void write_var(int len)
+{
+	int idx;
+
+	for (idx = 0; idx < 4; idx++) {
+		void *target = &dummy_var[idx];
+
+		if (len == 1)
+			*(char *)target = 0xff;
+		else if (len == 2)
+			*(short *)target = 0xffff;
+		else if (len == 4)
+			*(int *)target = 0xffffffff;
+		else if (len == 8)
+			*(long long *)target = 0xffffffffffffffffLL;
+
+		check_trapped();
+	}
+}
+
+/*
+ * Load len bytes (1, 2, 4 or 8) from each of the four dummy variables in
+ * turn, calling check_trapped() after every load.
+ *
+ * The loaded value is not needed, only the memory access itself. The loads go
+ * through volatile-qualified pointers so that the compiler must emit them;
+ * plain loads into otherwise unused locals may be optimized away, in which
+ * case the read watchpoints would never fire.
+ */
+static void read_var(int len)
+{
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		switch (len) {
+		case 1:
+			(void)*(volatile char *)&dummy_var[i];
+			break;
+		case 2:
+			(void)*(volatile short *)&dummy_var[i];
+			break;
+		case 4:
+			(void)*(volatile int *)&dummy_var[i];
+			break;
+		case 8:
+			(void)*(volatile long long *)&dummy_var[i];
+			break;
+		}
+		check_trapped();
+	}
+}
+
+/*
+ * Child side of the test: perform the execute/write/read accesses and the
+ * two explicit traps in exactly the order in which launch_tests() arms them
+ * in the parent.
+ *
+ * The parent runs every group once per enabled local/global combination
+ * (local only, global only, both), i.e. three times; the flag values do not
+ * matter here, only the number of passes does.
+ */
+static void trigger_tests(void)
+{
+	const int nr_enable_modes = 3;
+	int len, mode, i;
+
+	if (ptrace(PTRACE_TRACEME, 0, NULL, 0)) {
+		ksft_print_msg("Can't be traced? %s\n", strerror(errno));
+		return;
+	}
+
+	/* Wake up father so that it sets up the first test */
+	kill(getpid(), SIGUSR1);
+
+	/* Test instruction breakpoints */
+	for (mode = 0; mode < nr_enable_modes; mode++) {
+		for (i = 0; i < COUNT_ISN_BPS; i++) {
+			dummy_funcs[i]();
+			check_trapped();
+		}
+	}
+
+	/* Test write watchpoints */
+	for (len = 1; len <= sizeof(long); len <<= 1)
+		for (mode = 0; mode < nr_enable_modes; mode++)
+			write_var(len);
+
+	/* Test read/write watchpoints (on read accesses) */
+	for (len = 1; len <= sizeof(long); len <<= 1)
+		for (mode = 0; mode < nr_enable_modes; mode++)
+			read_var(len);
+
+	/* Icebp trap */
+	asm(".byte 0xf1\n");
+	check_trapped();
+
+	/* Int 3 trap */
+	asm("int $3\n");
+	check_trapped();
+
+	kill(getpid(), SIGUSR1);
+}
+
+/*
+ * Parent side of one test step: wait for the child to stop and report the
+ * result under the description msg.
+ *
+ * The step passes when the child stopped with SIGTRAP and its nr_tests
+ * counter equals ours, i.e. both sides are at the same test. In that case a 1
+ * is poked into the child's `trapped` flag so that its check_trapped() knows
+ * the trap was seen.
+ *
+ * msg is reported through "%s" rather than used as the format string, so a
+ * '%' in a description can never be interpreted as a conversion.
+ */
+static void check_success(const char *msg)
+{
+	int child_nr_tests;
+	int status;
+	int ret;
+
+	/* Wait for the child to SIGTRAP */
+	if (wait(&status) == -1)
+		ksft_exit_fail_msg("wait() failed: %s\n", strerror(errno));
+
+	ret = 0;
+
+	/* WSTOPSIG() is only meaningful for a stopped child. */
+	if (WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP) {
+		child_nr_tests = ptrace(PTRACE_PEEKDATA, child_pid,
+					&nr_tests, 0);
+		if (child_nr_tests == nr_tests)
+			ret = 1;
+		if (ptrace(PTRACE_POKEDATA, child_pid, &trapped, 1))
+			ksft_exit_fail_msg("Can't poke: %s\n", strerror(errno));
+	}
+
+	nr_tests++;
+
+	if (ret)
+		ksft_test_result_pass("%s", msg);
+	else
+		ksft_test_result_fail("%s", msg);
+}
+
+/*
+ * For each of the COUNT_ISN_BPS dummy functions: point the matching
+ * breakpoint slot at it, enable it as an execute breakpoint with the given
+ * local/global flags, resume the child, record the result and disable the
+ * slot again.
+ *
+ * buf is scratch space for the test description.
+ */
+static void launch_instruction_breakpoints(char *buf, int local, int global)
+{
+	int slot = 0;
+
+	while (slot < COUNT_ISN_BPS) {
+		set_breakpoint_addr(dummy_funcs[slot], slot);
+		toggle_breakpoint(slot, BP_X, 1, local, global, 1);
+
+		ptrace(PTRACE_CONT, child_pid, NULL, 0);
+		sprintf(buf, "Test breakpoint %d with local: %d global: %d\n",
+			slot, local, global);
+		check_success(buf);
+
+		toggle_breakpoint(slot, BP_X, 1, local, global, 0);
+		slot++;
+	}
+}
+
+/*
+ * For each of the COUNT_WPS dummy variables: point the matching slot at it,
+ * enable a watchpoint of the given mode (BP_W or BP_RW) and length with the
+ * given local/global flags, resume the child, record the result and disable
+ * the slot again.
+ *
+ * buf is scratch space for the test description; any mode other than BP_W is
+ * described as "read".
+ */
+static void launch_watchpoints(char *buf, int mode, int len,
+			       int local, int global)
+{
+	const char *mode_str = (mode == BP_W) ? "write" : "read";
+	int slot = 0;
+
+	while (slot < COUNT_WPS) {
+		set_breakpoint_addr(&dummy_var[slot], slot);
+		toggle_breakpoint(slot, mode, len, local, global, 1);
+
+		ptrace(PTRACE_CONT, child_pid, NULL, 0);
+		sprintf(buf,
+			"Test %s watchpoint %d with len: %d local: %d global: %d\n",
+			mode_str, slot, len, local, global);
+		check_success(buf);
+
+		toggle_breakpoint(slot, mode, len, local, global, 0);
+		slot++;
+	}
+}
+
+/*
+ * Set the breakpoints and check the child successfully triggers them.
+ *
+ * Announces the test plan first. Every group runs once per enabled
+ * local/global combination (3 of them); watchpoints additionally run once per
+ * length 1, 2, 4, ... up to sizeof(long). The number of lengths is counted
+ * with the same loop that drives the tests, so the plan also matches when
+ * sizeof(long) is 4 (three lengths, where sizeof(long) / 2 would claim two).
+ */
+static void launch_tests(void)
+{
+	char buf[1024];
+	unsigned int tests = 0;
+	unsigned int nr_lens = 0;
+	int len, local, global;
+
+	for (len = 1; len <= sizeof(long); len <<= 1)
+		nr_lens++;
+
+	tests += 3 * COUNT_ISN_BPS;		/* instruction breakpoints */
+	tests += nr_lens * 3 * COUNT_WPS;	/* write watchpoints */
+	tests += nr_lens * 3 * COUNT_WPS;	/* read/write watchpoints */
+	tests += 2;				/* icebp and int 3 */
+	ksft_set_plan(tests);
+
+	/* Instruction breakpoints */
+	for (local = 0; local < 2; local++) {
+		for (global = 0; global < 2; global++) {
+			if (!local && !global)
+				continue;
+			launch_instruction_breakpoints(buf, local, global);
+		}
+	}
+
+	/* Write watchpoint */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				launch_watchpoints(buf, BP_W, len,
+						   local, global);
+			}
+		}
+	}
+
+	/* Read-Write watchpoint */
+	for (len = 1; len <= sizeof(long); len <<= 1) {
+		for (local = 0; local < 2; local++) {
+			for (global = 0; global < 2; global++) {
+				if (!local && !global)
+					continue;
+				launch_watchpoints(buf, BP_RW, len,
+						   local, global);
+			}
+		}
+	}
+
+	/* Icebp traps */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success("Test icebp\n");
+
+	/* Int 3 traps */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success("Test int 3 trap\n");
+
+	/* Let the child run on to its final SIGUSR1. */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+}
+
+/*
+ * Fork the child that performs the accesses (trigger_tests()) and drive it
+ * from the parent with launch_tests(). A failing fork() is reported right
+ * away instead of surfacing later as an unrelated ptrace error.
+ */
+int main(int argc, char **argv)
+{
+	pid_t pid;
+
+	ksft_print_header();
+
+	pid = fork();
+	if (pid < 0)
+		ksft_exit_fail_msg("fork() failed: %s\n", strerror(errno));
+	if (!pid) {
+		trigger_tests();
+		exit(0);
+	}
+
+	child_pid = pid;
+
+	/* The child signals itself with SIGUSR1 once it is ready to be set up. */
+	wait(NULL);
+
+	launch_tests();
+
+	/* Collect the child's final SIGUSR1 stop. */
+	wait(NULL);
+
+	ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c b/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c
new file mode 100644
index 000000000..ad41ea690
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/breakpoint_test_arm64.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Original Code by Pavel Labath <labath@google.com>
+ *
+ * Code modified by Pratyush Anand <panand@redhat.com>
+ * for testing different byte select for each access size.
+ */
+
+#define _GNU_SOURCE
+
+#include <asm/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/ptrace.h>
+#include <sys/param.h>
+#include <sys/uio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <errno.h>
+#include <signal.h>
+
+#include "../kselftest.h"
+
+static volatile uint8_t var[96] __attribute__((__aligned__(32)));
+
+/*
+ * Traced child: request tracing, stop with SIGSTOP so that the parent can
+ * install its watchpoint, then perform one store of `size` bytes to
+ * var[32 + wr] and exit with status 0. Exits with status 1 if tracing or
+ * stopping fails, or if the target address is not a multiple of size.
+ */
+static void child(int size, int wr)
+{
+	volatile uint8_t *dst = &var[32 + wr];
+
+	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) {
+		ksft_print_msg(
+			"ptrace(PTRACE_TRACEME) failed: %s\n",
+			strerror(errno));
+		_exit(1);
+	}
+
+	if (raise(SIGSTOP)) {
+		ksft_print_msg(
+			"raise(SIGSTOP) failed: %s\n", strerror(errno));
+		_exit(1);
+	}
+
+	if ((uintptr_t) dst % size != 0) {
+		ksft_print_msg(
+			"Wrong address write for the given size: %s\n",
+			strerror(errno));
+		_exit(1);
+	}
+
+	/* 16 and 32 byte stores are done with a register-pair store. */
+	if (size == 1)
+		*dst = 47;
+	else if (size == 2)
+		*(uint16_t *)dst = 47;
+	else if (size == 4)
+		*(uint32_t *)dst = 47;
+	else if (size == 8)
+		*(uint64_t *)dst = 47;
+	else if (size == 16)
+		__asm__ volatile ("stp x29, x30, %0" : "=m" (dst[0]));
+	else if (size == 32)
+		__asm__ volatile ("stp q29, q30, %0" : "=m" (dst[0]));
+
+	_exit(0);
+}
+
+/*
+ * Install a write watchpoint covering `size` bytes at var[32 + wp] in the
+ * traced process pid, using debug register slot 0 of NT_ARM_HW_WATCH.
+ *
+ * Returns true on success; on failure prints a diagnostic and returns false.
+ */
+static bool set_watchpoint(pid_t pid, int size, int wp)
+{
+ const volatile uint8_t *addr = &var[32 + wp];
+ /* position of addr inside its 8-byte aligned doubleword */
+ const int offset = (uintptr_t)addr % 8;
+ /* one bit per watched byte: `size` consecutive bits starting at `offset` */
+ const unsigned int byte_mask = ((1 << size) - 1) << offset;
+ const unsigned int type = 2; /* Write */
+ const unsigned int enable = 1;
+ /* control word: byte mask from bit 5, access type from bit 3, enable in bit 0 */
+ const unsigned int control = byte_mask << 5 | type << 3 | enable;
+ struct user_hwdebug_state dreg_state;
+ struct iovec iov;
+
+ memset(&dreg_state, 0, sizeof(dreg_state));
+ /* the register gets the 8-byte aligned base; byte_mask selects the bytes within it */
+ dreg_state.dbg_regs[0].addr = (uintptr_t)(addr - offset);
+ dreg_state.dbg_regs[0].ctrl = control;
+ iov.iov_base = &dreg_state;
+ /* pass the header plus exactly one register slot */
+ iov.iov_len = offsetof(struct user_hwdebug_state, dbg_regs) +
+ sizeof(dreg_state.dbg_regs[0]);
+ if (ptrace(PTRACE_SETREGSET, pid, NT_ARM_HW_WATCH, &iov) == 0)
+ return true;
+
+ /* NOTE(review): on EIO both messages below are printed, and errno is read again after the first print - confirm that is intended */
+ if (errno == EIO)
+ ksft_print_msg(
+ "ptrace(PTRACE_SETREGSET, NT_ARM_HW_WATCH) not supported on this hardware: %s\n",
+ strerror(errno));
+
+ ksft_print_msg(
+ "ptrace(PTRACE_SETREGSET, NT_ARM_HW_WATCH) failed: %s\n",
+ strerror(errno));
+ return false;
+}
+
+/*
+ * Run one watchpoint experiment in a fresh child process.
+ *
+ * wr_size: size in bytes of the store the child performs
+ * wp_size: number of bytes covered by the watchpoint
+ * wr:      offset of the store, relative to var[32]
+ * wp:      offset of the watchpoint, relative to var[32]
+ *
+ * Returns true when the child stopped with a SIGTRAP whose si_code is
+ * TRAP_HWBKPT, false when it did not or when any step failed. Failures are
+ * only logged with ksft_print_msg(): the caller reports exactly one test
+ * result per call, so reporting one here as well would break the plan.
+ */
+static bool run_test(int wr_size, int wp_size, int wr, int wp)
+{
+	int status;
+	siginfo_t siginfo;
+	pid_t pid = fork();
+	pid_t wpid;
+
+	if (pid < 0) {
+		ksft_print_msg(
+			"fork() failed: %s\n", strerror(errno));
+		return false;
+	}
+	if (pid == 0)
+		child(wr_size, wr);
+
+	/* The child stops itself with SIGSTOP once it is traceable. */
+	wpid = waitpid(pid, &status, __WALL);
+	if (wpid != pid) {
+		ksft_print_msg(
+			"waitpid() failed: %s\n", strerror(errno));
+		return false;
+	}
+	if (!WIFSTOPPED(status)) {
+		ksft_print_msg(
+			"child did not stop: %s\n", strerror(errno));
+		return false;
+	}
+	if (WSTOPSIG(status) != SIGSTOP) {
+		ksft_print_msg("child did not stop with SIGSTOP\n");
+		return false;
+	}
+
+	if (!set_watchpoint(pid, wp_size, wp))
+		return false;
+
+	if (ptrace(PTRACE_CONT, pid, NULL, NULL) < 0) {
+		ksft_print_msg(
+			"ptrace(PTRACE_CONT) failed: %s\n",
+			strerror(errno));
+		return false;
+	}
+
+	/* Bound the wait; SIGALRM interrupts waitpid() after 3 seconds. */
+	alarm(3);
+	wpid = waitpid(pid, &status, __WALL);
+	if (wpid != pid) {
+		ksft_print_msg(
+			"waitpid() failed: %s\n", strerror(errno));
+		return false;
+	}
+	alarm(0);
+	/* The child ran to _exit() without hitting the watchpoint. */
+	if (WIFEXITED(status)) {
+		ksft_print_msg("child did not single-step\n");
+		return false;
+	}
+	if (!WIFSTOPPED(status)) {
+		ksft_print_msg("child did not stop\n");
+		return false;
+	}
+	if (WSTOPSIG(status) != SIGTRAP) {
+		ksft_print_msg("child did not stop with SIGTRAP\n");
+		return false;
+	}
+	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo) != 0) {
+		ksft_print_msg(
+			"ptrace(PTRACE_GETSIGINFO): %s\n",
+			strerror(errno));
+		return false;
+	}
+	if (siginfo.si_code != TRAP_HWBKPT) {
+		ksft_print_msg(
+			"Unexpected si_code %d\n", siginfo.si_code);
+		return false;
+	}
+
+	/* The watchpoint fired; the stopped child is no longer needed. */
+	kill(pid, SIGKILL);
+	wpid = waitpid(pid, &status, 0);
+	if (wpid != pid) {
+		ksft_print_msg(
+			"waitpid() failed: %s\n", strerror(errno));
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Deliberately empty SIGALRM handler. main() installs it with sa_flags == 0,
+ * so the alarm(3) armed in run_test() makes a blocked waitpid() return
+ * instead of terminating the process.
+ */
+static void sigalrm(int sig)
+{
+}
+
+/*
+ * Run the watchpoint matrix and report one result per combination.
+ *
+ * First pass: for every store size 1..32 and every store offset 0..32 in
+ * steps of that size, place the watchpoint one size below, on, and one size
+ * above the store. A trap is expected exactly when the watchpoint offset
+ * equals the store offset.
+ *
+ * Second pass: for every store size, store at offset -size with an 8-byte
+ * watchpoint at offset -8; a trap is always expected.
+ */
+int main(int argc, char **argv)
+{
+	struct sigaction act;
+	bool succeeded = true;
+	int size, wr, wp;
+
+	ksft_print_header();
+	ksft_set_plan(213);
+
+	act.sa_handler = sigalrm;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	sigaction(SIGALRM, &act, NULL);
+
+	for (size = 1; size <= 32; size *= 2) {
+		for (wr = 0; wr <= 32; wr += size) {
+			for (wp = wr - size; wp <= wr + size; wp += size) {
+				bool hit = run_test(size, MIN(size, 8), wr, wp);
+
+				if (hit == (wr == wp)) {
+					ksft_test_result_pass(
+						"Test size = %d write offset = %d watchpoint offset = %d\n",
+						size, wr, wp);
+					continue;
+				}
+
+				ksft_test_result_fail(
+					"Test size = %d write offset = %d watchpoint offset = %d\n",
+					size, wr, wp);
+				succeeded = false;
+			}
+		}
+	}
+
+	for (size = 1; size <= 32; size *= 2) {
+		if (run_test(size, 8, -size, -8)) {
+			ksft_test_result_pass(
+				"Test size = %d write offset = %d watchpoint offset = -8\n",
+				size, -size);
+			continue;
+		}
+
+		ksft_test_result_fail(
+			"Test size = %d write offset = %d watchpoint offset = -8\n",
+			size, -size);
+		succeeded = false;
+	}
+
+	if (!succeeded)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/breakpoints/step_after_suspend_test.c b/tools/testing/selftests/breakpoints/step_after_suspend_test.c
new file mode 100644
index 000000000..2cf6f10ab
--- /dev/null
+++ b/tools/testing/selftests/breakpoints/step_after_suspend_test.c
@@ -0,0 +1,226 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2016 Google, Inc.
+ */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <sys/stat.h>
+#include <sys/timerfd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+/*
+ * Child half of the test. Pins itself to @cpu, asks to be traced by its
+ * parent, stops itself with SIGSTOP and, once resumed, exits with status 0.
+ * Any failing step is reported and ends the child with status 1.
+ * This function never returns.
+ */
+void child(int cpu)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask)) {
+		ksft_print_msg("sched_setaffinity() failed: %s\n",
+			       strerror(errno));
+		_exit(1);
+	}
+
+	if (ptrace(PTRACE_TRACEME, 0, NULL, NULL)) {
+		ksft_print_msg("ptrace(PTRACE_TRACEME) failed: %s\n",
+			       strerror(errno));
+		_exit(1);
+	}
+
+	/* Hand control to the tracer. */
+	if (raise(SIGSTOP)) {
+		ksft_print_msg("raise(SIGSTOP) failed: %s\n", strerror(errno));
+		_exit(1);
+	}
+
+	_exit(0);
+}
+
+/*
+ * Fork a child pinned to @cpu and check that it can be single-stepped.
+ *
+ * Sequence: wait for the child's SIGSTOP, issue PTRACE_SINGLESTEP, expect a
+ * SIGTRAP stop, then PTRACE_CONT and expect the child to exit.
+ *
+ * Returns KSFT_PASS on success, KSFT_SKIP when PTRACE_SINGLESTEP fails with
+ * EIO (reported as unsupported on this architecture), KSFT_FAIL otherwise.
+ */
+int run_test(int cpu)
+{
+	int status;
+	pid_t pid = fork();
+	pid_t wpid;
+
+	if (pid < 0) {
+		ksft_print_msg("fork() failed: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (pid == 0)
+		child(cpu);	/* does not return */
+
+	wpid = waitpid(pid, &status, __WALL);
+	if (wpid != pid) {
+		ksft_print_msg("waitpid() failed: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (!WIFSTOPPED(status)) {
+		ksft_print_msg("child did not stop: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (WSTOPSIG(status) != SIGSTOP) {
+		ksft_print_msg("child did not stop with SIGSTOP: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	if (ptrace(PTRACE_SINGLESTEP, pid, NULL, NULL) < 0) {
+		if (errno == EIO) {
+			ksft_print_msg(
+				"ptrace(PTRACE_SINGLESTEP) not supported on this architecture: %s\n",
+				strerror(errno));
+			return KSFT_SKIP;
+		}
+		ksft_print_msg("ptrace(PTRACE_SINGLESTEP) failed: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	wpid = waitpid(pid, &status, __WALL);
+	if (wpid != pid) {
+		/* Was "$s", which printed the literal text, not the error. */
+		ksft_print_msg("waitpid() failed: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (WIFEXITED(status)) {
+		ksft_print_msg("child did not single-step: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (!WIFSTOPPED(status)) {
+		ksft_print_msg("child did not stop: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (WSTOPSIG(status) != SIGTRAP) {
+		ksft_print_msg("child did not stop with SIGTRAP: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	if (ptrace(PTRACE_CONT, pid, NULL, NULL) < 0) {
+		ksft_print_msg("ptrace(PTRACE_CONT) failed: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	wpid = waitpid(pid, &status, __WALL);
+	if (wpid != pid) {
+		ksft_print_msg("waitpid() failed: %s\n", strerror(errno));
+		return KSFT_FAIL;
+	}
+	if (!WIFEXITED(status)) {
+		ksft_print_msg("child did not exit after PTRACE_CONT: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	return KSFT_PASS;
+}
+
+/*
+ * Trigger a suspend cycle by writing "mem" to /sys/power/state.
+ *
+ * Before suspending, a CLOCK_BOOTTIME_ALARM timerfd is armed for 5 seconds.
+ * NOTE(review): presumably that alarm is what resumes the machine -- confirm.
+ *
+ * Skips the whole test when not run as root; any other failure ends the
+ * test through ksft_exit_fail_msg().
+ */
+void suspend(void)
+{
+	int power_state_fd;
+	int timerfd;
+	int err;
+	struct itimerspec spec = {};
+
+	if (getuid() != 0)
+		ksft_exit_skip("Please run the test as root - Exiting.\n");
+
+	power_state_fd = open("/sys/power/state", O_RDWR);
+	if (power_state_fd < 0)
+		ksft_exit_fail_msg(
+			"open(\"/sys/power/state\") failed (%s)\n",
+			strerror(errno));
+
+	timerfd = timerfd_create(CLOCK_BOOTTIME_ALARM, 0);
+	if (timerfd < 0)
+		ksft_exit_fail_msg("timerfd_create() failed\n");
+
+	/* One-shot timer: it_interval stays zero. */
+	spec.it_value.tv_sec = 5;
+	err = timerfd_settime(timerfd, 0, &spec, NULL);
+	if (err < 0)
+		ksft_exit_fail_msg("timerfd_settime() failed\n");
+
+	if (write(power_state_fd, "mem", strlen("mem")) != strlen("mem"))
+		ksft_exit_fail_msg("Failed to enter Suspend state\n");
+
+	close(timerfd);
+	close(power_state_fd);
+}
+
+/*
+ * Entry point. Optionally suspends the machine (skipped with -n), then runs
+ * run_test() once for every CPU in the process affinity mask and reports one
+ * kselftest result per CPU.
+ */
+int main(int argc, char **argv)
+{
+	int opt;
+	bool do_suspend = true;
+	bool succeeded = true;
+	unsigned int tests = 0;
+	cpu_set_t available_cpus;
+	int err;
+	int cpu;
+
+	ksft_print_header();
+
+	while ((opt = getopt(argc, argv, "n")) != -1) {
+		if (opt != 'n') {
+			printf("Usage: %s [-n]\n", argv[0]);
+			printf("        -n: do not trigger a suspend/resume cycle before the test\n");
+			return -1;
+		}
+		do_suspend = false;
+	}
+
+	err = sched_getaffinity(0, sizeof(available_cpus), &available_cpus);
+	if (err < 0)
+		ksft_exit_fail_msg("sched_getaffinity() failed\n");
+
+	/* One planned test per CPU present in the affinity mask. */
+	tests = CPU_COUNT(&available_cpus);
+
+	if (do_suspend)
+		suspend();
+
+	ksft_set_plan(tests);
+	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		int outcome;
+
+		if (!CPU_ISSET(cpu, &available_cpus))
+			continue;
+
+		outcome = run_test(cpu);
+		if (outcome == KSFT_PASS) {
+			ksft_test_result_pass("CPU %d\n", cpu);
+		} else if (outcome == KSFT_SKIP) {
+			ksft_test_result_skip("CPU %d\n", cpu);
+		} else if (outcome == KSFT_FAIL) {
+			ksft_test_result_fail("CPU %d\n", cpu);
+			succeeded = false;
+		}
+	}
+
+	if (succeeded)
+		ksft_exit_pass();
+	else
+		ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/capabilities/.gitignore b/tools/testing/selftests/capabilities/.gitignore
new file mode 100644
index 000000000..426d9adca
--- /dev/null
+++ b/tools/testing/selftests/capabilities/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_execve
+validate_cap
diff --git a/tools/testing/selftests/capabilities/Makefile b/tools/testing/selftests/capabilities/Makefile
new file mode 100644
index 000000000..6e9d98d45
--- /dev/null
+++ b/tools/testing/selftests/capabilities/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_FILES := validate_cap
+TEST_GEN_PROGS := test_execve
+
+CFLAGS += -O2 -g -std=gnu99 -Wall
+LDLIBS += -lcap-ng -lrt -ldl
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/capabilities/test_execve.c b/tools/testing/selftests/capabilities/test_execve.c
new file mode 100644
index 000000000..df0ef02b4
--- /dev/null
+++ b/tools/testing/selftests/capabilities/test_execve.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <cap-ng.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <limits.h>
+#include <libgen.h>
+#include <malloc.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+
+#include "../kselftest.h"
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+#endif
+
+static int nerrs;
+static pid_t mpid; /* main() pid is used to avoid duplicate test counts */
+
+/*
+ * Format @fmt/@ap into a 4096-byte buffer and write the result to the
+ * existing file @filename in a single write().
+ *
+ * When @enoent_ok is true a missing file (ENOENT on open) is silently
+ * ignored. Every other failure, including truncated formatting and short
+ * writes, ends the test through ksft_exit_fail_msg().
+ */
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+{
+	char text[4096];
+	int text_len;
+	ssize_t nwritten;
+	int out_fd;
+
+	text_len = vsnprintf(text, sizeof(text), fmt, ap);
+	if (text_len < 0)
+		ksft_exit_fail_msg("vsnprintf failed - %s\n", strerror(errno));
+
+	if (text_len >= sizeof(text))
+		ksft_exit_fail_msg("vsnprintf output truncated\n");
+
+	out_fd = open(filename, O_WRONLY);
+	if (out_fd < 0) {
+		if (enoent_ok && errno == ENOENT)
+			return;
+		ksft_exit_fail_msg("open of %s failed - %s\n",
+				   filename, strerror(errno));
+	}
+
+	nwritten = write(out_fd, text, text_len);
+	if (nwritten < 0)
+		ksft_exit_fail_msg("write to %s failed - %s\n",
+				   filename, strerror(errno));
+	else if (nwritten != text_len)
+		ksft_exit_fail_msg("short write to %s\n", filename);
+
+	if (close(out_fd) != 0)
+		ksft_exit_fail_msg("close of %s failed - %s\n",
+				   filename, strerror(errno));
+}
+
+/*
+ * printf-style write to @filename; calls vmaybe_write_file() with
+ * enoent_ok = true.
+ */
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(true, filename, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * printf-style write to @filename; calls vmaybe_write_file() with
+ * enoent_ok = false.
+ */
+static void write_file(char *filename, char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmaybe_write_file(false, filename, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Enter a private mount namespace in which the tests run as @inner_uid.
+ *
+ * First tries unshare(CLONE_NEWNS) alone; if that works the real UID is
+ * switched to @inner_uid with PR_SET_KEEPCAPS set and the permitted
+ * capabilities are re-added to the effective set. Otherwise a new user
+ * namespace is created and @inner_uid is mapped onto the caller's UID. If
+ * neither works the test is skipped.
+ *
+ * Returns true when the first path was taken (global UIDs in use), false
+ * when a user namespace is in use.
+ */
+static bool create_and_enter_ns(uid_t inner_uid)
+{
+	uid_t outer_uid;
+	gid_t outer_gid;
+	int i;
+	bool have_outer_privilege;
+
+	outer_uid = getuid();
+	outer_gid = getgid();
+
+	/*
+	 * TODO: If we're already root, we could skip creating the userns.
+	 */
+
+	if (unshare(CLONE_NEWNS) == 0) {
+		ksft_print_msg("[NOTE]\tUsing global UIDs for tests\n");
+		if (prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0) != 0)
+			ksft_exit_fail_msg("PR_SET_KEEPCAPS - %s\n",
+					   strerror(errno));
+		if (setresuid(inner_uid, inner_uid, -1) != 0)
+			ksft_exit_fail_msg("setresuid - %s\n", strerror(errno));
+
+		// Re-enable effective caps
+		capng_get_caps_process();
+		/*
+		 * CAP_LAST_CAP is itself a valid capability number, so the
+		 * bound is inclusive; "<" skipped the highest capability.
+		 */
+		for (i = 0; i <= CAP_LAST_CAP; i++)
+			if (capng_have_capability(CAPNG_PERMITTED, i))
+				capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, i);
+		if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+			ksft_exit_fail_msg(
+					"capng_apply - %s\n", strerror(errno));
+
+		have_outer_privilege = true;
+	} else if (unshare(CLONE_NEWUSER | CLONE_NEWNS) == 0) {
+		ksft_print_msg("[NOTE]\tUsing a user namespace for tests\n");
+		/* setgroups may not exist; a missing file is tolerated. */
+		maybe_write_file("/proc/self/setgroups", "deny");
+		write_file("/proc/self/uid_map", "%d %d 1", inner_uid, outer_uid);
+		write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
+
+		have_outer_privilege = false;
+	} else {
+		ksft_exit_skip("must be root or be able to create a userns\n");
+	}
+
+	if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL) != 0)
+		ksft_exit_fail_msg("remount everything private - %s\n",
+				   strerror(errno));
+
+	return have_outer_privilege;
+}
+
+/*
+ * Mount a fresh tmpfs (mode 0777) over the current directory and chdir()
+ * back to the same path so the working directory refers to the new mount.
+ * Any failure ends the test.
+ */
+static void chdir_to_tmpfs(void)
+{
+	char here[PATH_MAX];
+
+	if (getcwd(here, sizeof(here)) != here)
+		ksft_exit_fail_msg("getcwd - %s\n", strerror(errno));
+
+	if (mount("private_tmp", ".", "tmpfs", 0, "mode=0777"))
+		ksft_exit_fail_msg("mount private tmpfs - %s\n",
+				   strerror(errno));
+
+	/* Re-resolve the path; it now lands on the tmpfs just mounted. */
+	if (chdir(here))
+		ksft_exit_fail_msg("chdir to private tmpfs - %s\n",
+				   strerror(errno));
+}
+
+/*
+ * Copy the file @fromname, looked up relative to directory fd @fromfd, to a
+ * newly created file @toname (mode 0700, must not already exist).
+ * Any failure ends the test through ksft_exit_fail_msg().
+ */
+static void copy_fromat_to(int fromfd, const char *fromname, const char *toname)
+{
+	int from = openat(fromfd, fromname, O_RDONLY);
+	if (from == -1)
+		ksft_exit_fail_msg("open copy source - %s\n", strerror(errno));
+
+	int to = open(toname, O_CREAT | O_WRONLY | O_EXCL, 0700);
+	/* Previously unchecked: a failed open surfaced later as "write". */
+	if (to == -1)
+		ksft_exit_fail_msg("open copy destination - %s\n",
+				   strerror(errno));
+
+	while (true) {
+		char buf[4096];
+		ssize_t sz = read(from, buf, sizeof(buf));
+		if (sz == 0)
+			break;
+		if (sz < 0)
+			ksft_exit_fail_msg("read - %s\n", strerror(errno));
+
+		if (write(to, buf, sz) != sz)
+			/* no short writes on tmpfs */
+			ksft_exit_fail_msg("write - %s\n", strerror(errno));
+	}
+
+	close(from);
+	close(to);
+}
+
+/*
+ * Fork and, in the parent, wait for the child.
+ *
+ * Returns true in the child (with its own error counter reset) and false in
+ * the parent. The parent bumps nerrs when the child did not exit normally or
+ * exited non-zero; otherwise it records a pass, except in the process whose
+ * pid is mpid.
+ */
+static bool fork_wait(void)
+{
+	int status;
+	pid_t child = fork();
+
+	if (child < 0) {
+		ksft_exit_fail_msg("fork - %s\n", strerror(errno));
+		return false;
+	}
+
+	if (child == 0) {
+		nerrs = 0;
+		return true;
+	}
+
+	if (waitpid(child, &status, 0) != child || !WIFEXITED(status)) {
+		ksft_print_msg("Child died\n");
+		nerrs++;
+	} else if (WEXITSTATUS(status) != 0) {
+		ksft_print_msg("Child failed\n");
+		nerrs++;
+	} else if (getpid() != mpid) {
+		/* don't print this message for mpid */
+		ksft_test_result_pass("Passed\n");
+	}
+
+	return false;
+}
+
+/*
+ * Replace the current process with the helper @name, passing the four
+ * expected capability states as "1"/"0" arguments in the order effective,
+ * permitted, inheritable, ambient. Only returns control if execl() fails, in
+ * which case the test is aborted.
+ */
+static void exec_other_validate_cap(const char *name,
+				    bool eff, bool perm, bool inh, bool ambient)
+{
+	const char *want_eff = eff ? "1" : "0";
+	const char *want_perm = perm ? "1" : "0";
+	const char *want_inh = inh ? "1" : "0";
+	const char *want_amb = ambient ? "1" : "0";
+
+	execl(name, name, want_eff, want_perm, want_inh, want_amb, NULL);
+	ksft_exit_fail_msg("execl - %s\n", strerror(errno));
+}
+
+/* Same as exec_other_validate_cap() for the plain "./validate_cap" helper. */
+static void exec_validate_cap(bool eff, bool perm, bool inh, bool ambient)
+{
+ exec_other_validate_cap("./validate_cap", eff, perm, inh, ambient);
+}
+
+/*
+ * Run the capability inheritance tests as @uid.
+ *
+ * @our_path is the directory holding the validate_cap helper; it is copied
+ * into a private tmpfs (plus setuid/setgid variants when the namespace was
+ * entered with real privilege). The function then checks the ambient
+ * capability manipulation rules via prctl() and, for each scenario, forks
+ * and execs a helper copy that verifies the capability sets after execve().
+ *
+ * Returns 0 when no error was counted, 1 otherwise.
+ */
+static int do_tests(int uid, const char *our_path)
+{
+	bool have_outer_privilege = create_and_enter_ns(uid);
+
+	int ourpath_fd = open(our_path, O_RDONLY | O_DIRECTORY);
+	if (ourpath_fd == -1)
+		ksft_exit_fail_msg("open '%s' - %s\n",
+				   our_path, strerror(errno));
+
+	chdir_to_tmpfs();
+
+	copy_fromat_to(ourpath_fd, "validate_cap", "validate_cap");
+
+	if (have_outer_privilege) {
+		gid_t gid = getegid();
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_suidroot");
+		if (chown("validate_cap_suidroot", 0, -1) != 0)
+			ksft_exit_fail_msg("chown - %s\n", strerror(errno));
+		if (chmod("validate_cap_suidroot", S_ISUID | 0700) != 0)
+			ksft_exit_fail_msg("chmod - %s\n", strerror(errno));
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_suidnonroot");
+		if (chown("validate_cap_suidnonroot", uid + 1, -1) != 0)
+			ksft_exit_fail_msg("chown - %s\n", strerror(errno));
+		if (chmod("validate_cap_suidnonroot", S_ISUID | 0700) != 0)
+			ksft_exit_fail_msg("chmod - %s\n", strerror(errno));
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_sgidroot");
+		if (chown("validate_cap_sgidroot", -1, 0) != 0)
+			ksft_exit_fail_msg("chown - %s\n", strerror(errno));
+		if (chmod("validate_cap_sgidroot", S_ISGID | 0710) != 0)
+			ksft_exit_fail_msg("chmod - %s\n", strerror(errno));
+
+		copy_fromat_to(ourpath_fd, "validate_cap",
+			       "validate_cap_sgidnonroot");
+		if (chown("validate_cap_sgidnonroot", -1, gid + 1) != 0)
+			ksft_exit_fail_msg("chown - %s\n", strerror(errno));
+		if (chmod("validate_cap_sgidnonroot", S_ISGID | 0710) != 0)
+			ksft_exit_fail_msg("chmod - %s\n", strerror(errno));
+	}
+
+	/* All copies are done; don't leak the directory fd into the helpers. */
+	close(ourpath_fd);
+
+	capng_get_caps_process();
+
+	/* Make sure that i starts out clear */
+	capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		ksft_exit_fail_msg("capng_apply - %s\n", strerror(errno));
+
+	if (uid == 0) {
+		ksft_print_msg("[RUN]\tRoot => ep\n");
+		if (fork_wait())
+			exec_validate_cap(true, true, false, false);
+	} else {
+		ksft_print_msg("[RUN]\tNon-root => no caps\n");
+		if (fork_wait())
+			exec_validate_cap(false, false, false, false);
+	}
+
+	ksft_print_msg("Check cap_ambient manipulation rules\n");
+
+	/* We should not be able to add ambient caps yet. */
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != -1 || errno != EPERM) {
+		if (errno == EINVAL)
+			ksft_test_result_fail(
+				"PR_CAP_AMBIENT_RAISE isn't supported\n");
+		else
+			ksft_test_result_fail(
+				"PR_CAP_AMBIENT_RAISE should have failed with EPERM on a non-inheritable cap\n");
+		return 1;
+	}
+	ksft_test_result_pass(
+		"PR_CAP_AMBIENT_RAISE failed on non-inheritable cap\n");
+
+	/* Inheritable but not permitted: raising must still be refused. */
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_RAW);
+	capng_update(CAPNG_DROP, CAPNG_PERMITTED, CAP_NET_RAW);
+	capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, CAP_NET_RAW);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		ksft_exit_fail_msg("capng_apply - %s\n", strerror(errno));
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_RAW, 0, 0, 0) != -1 || errno != EPERM) {
+		ksft_test_result_fail(
+			"PR_CAP_AMBIENT_RAISE should have failed on a non-permitted cap\n");
+		return 1;
+	}
+	ksft_test_result_pass(
+		"PR_CAP_AMBIENT_RAISE failed on non-permitted cap\n");
+
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		ksft_exit_fail_msg("capng_apply - %s\n", strerror(errno));
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		ksft_test_result_fail(
+			"PR_CAP_AMBIENT_RAISE should have succeeded\n");
+		return 1;
+	}
+	ksft_test_result_pass("PR_CAP_AMBIENT_RAISE worked\n");
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 1) {
+		ksft_test_result_fail("PR_CAP_AMBIENT_IS_SET is broken\n");
+		return 1;
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0, 0) != 0)
+		ksft_exit_fail_msg("PR_CAP_AMBIENT_CLEAR_ALL - %s\n",
+				   strerror(errno));
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		ksft_test_result_fail(
+			"PR_CAP_AMBIENT_CLEAR_ALL didn't work\n");
+		return 1;
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+		ksft_exit_fail_msg("PR_CAP_AMBIENT_RAISE - %s\n",
+				   strerror(errno));
+
+	/* Dropping the inheritable bit must also clear the ambient bit. */
+	capng_update(CAPNG_DROP, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		ksft_exit_fail_msg("capng_apply - %s\n", strerror(errno));
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0) {
+		ksft_test_result_fail("Dropping I should have dropped A\n");
+		return 1;
+	}
+
+	ksft_test_result_pass("Basic manipulation appears to work\n");
+
+	capng_update(CAPNG_ADD, CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE);
+	if (capng_apply(CAPNG_SELECT_CAPS) != 0)
+		ksft_exit_fail_msg("capng_apply - %s\n", strerror(errno));
+	if (uid == 0) {
+		ksft_print_msg("[RUN]\tRoot +i => eip\n");
+		if (fork_wait())
+			exec_validate_cap(true, true, true, false);
+	} else {
+		ksft_print_msg("[RUN]\tNon-root +i => i\n");
+		if (fork_wait())
+			exec_validate_cap(false, false, true, false);
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, CAP_NET_BIND_SERVICE, 0, 0, 0) != 0)
+		ksft_exit_fail_msg("PR_CAP_AMBIENT_RAISE - %s\n",
+				   strerror(errno));
+
+	ksft_print_msg("[RUN]\tUID %d +ia => eipa\n", uid);
+	if (fork_wait())
+		exec_validate_cap(true, true, true, true);
+
+	/* The remaining tests need real privilege */
+
+	if (!have_outer_privilege) {
+		ksft_test_result_skip("SUID/SGID tests (needs privilege)\n");
+		goto done;
+	}
+
+	if (uid == 0) {
+		ksft_print_msg("[RUN]\tRoot +ia, suidroot => eipa\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_suidroot",
+						true, true, true, true);
+
+		ksft_print_msg("[RUN]\tRoot +ia, suidnonroot => ip\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_suidnonroot",
+						false, true, true, false);
+
+		ksft_print_msg("[RUN]\tRoot +ia, sgidroot => eipa\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						true, true, true, true);
+
+		if (fork_wait()) {
+			ksft_print_msg(
+				"[RUN]\tRoot, gid != 0, +ia, sgidroot => eip\n");
+			if (setresgid(1, 1, 1) != 0)
+				ksft_exit_fail_msg("setresgid - %s\n",
+						   strerror(errno));
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						true, true, true, false);
+		}
+
+		ksft_print_msg("[RUN]\tRoot +ia, sgidnonroot => eip\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidnonroot",
+						true, true, true, false);
+	} else {
+		ksft_print_msg("[RUN]\tNon-root +ia, sgidnonroot => i\n");
+		if (fork_wait())
+			exec_other_validate_cap("./validate_cap_sgidnonroot",
+					false, false, true, false);
+
+		if (fork_wait()) {
+			ksft_print_msg("[RUN]\tNon-root +ia, sgidroot => i\n");
+			if (setresgid(1, 1, 1) != 0)
+				ksft_exit_fail_msg("setresgid - %s\n",
+						   strerror(errno));
+			exec_other_validate_cap("./validate_cap_sgidroot",
+						false, false, true, false);
+		}
+	}
+
+done:
+	ksft_print_cnts();
+	return nerrs ? 1 : 0;
+}
+
+/*
+ * Entry point. Works out the directory this binary lives in, then runs
+ * do_tests() in two forked children in turn: first with uid 0 (plan of 12
+ * results), then with uid 1 (plan of 9 results). The parent's exit status
+ * reflects whether either child was counted as an error.
+ */
+int main(int argc, char **argv)
+{
+	char *argv0_copy, *our_path;
+
+	/* Find our path; dirname() is given a scratch copy of argv[0]. */
+	argv0_copy = strdup(argv[0]);
+	if (!argv0_copy)
+		ksft_exit_fail_msg("strdup - %s\n", strerror(errno));
+	our_path = strdup(dirname(argv0_copy));
+	if (!our_path)
+		ksft_exit_fail_msg("strdup - %s\n", strerror(errno));
+	free(argv0_copy);
+
+	mpid = getpid();
+
+	if (fork_wait()) {
+		ksft_print_header();
+		ksft_set_plan(12);
+		ksft_print_msg("[RUN]\t+++ Tests with uid == 0 +++\n");
+		return do_tests(0, our_path);
+	}
+
+	ksft_print_msg("==================================================\n");
+
+	if (fork_wait()) {
+		ksft_print_header();
+		ksft_set_plan(9);
+		ksft_print_msg("[RUN]\t+++ Tests with uid != 0 +++\n");
+		return do_tests(1, our_path);
+	}
+
+	return nerrs != 0;
+}
diff --git a/tools/testing/selftests/capabilities/validate_cap.c b/tools/testing/selftests/capabilities/validate_cap.c
new file mode 100644
index 000000000..cdfc94268
--- /dev/null
+++ b/tools/testing/selftests/capabilities/validate_cap.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <cap-ng.h>
+#include <linux/capability.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include <sys/auxv.h>
+
+#include "../kselftest.h"
+
+#ifndef PR_CAP_AMBIENT
+#define PR_CAP_AMBIENT 47
+# define PR_CAP_AMBIENT_IS_SET 1
+# define PR_CAP_AMBIENT_RAISE 2
+# define PR_CAP_AMBIENT_LOWER 3
+# define PR_CAP_AMBIENT_CLEAR_ALL 4
+#endif
+
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 19)
+# define HAVE_GETAUXVAL
+#endif
+
+/*
+ * Interpret argv[i] as a boolean: "1" is true, "0" is false. Any other
+ * string aborts the program through ksft_exit_fail_msg().
+ */
+static bool bool_arg(char **argv, int i)
+{
+	const char *text = argv[i];
+
+	if (strcmp(text, "0") == 0)
+		return false;
+	if (strcmp(text, "1") == 0)
+		return true;
+
+	ksft_exit_fail_msg("wrong argv[%d]\n", i);
+	return false;
+}
+
+/*
+ * Helper executed by the capability tests. Takes exactly four "0"/"1"
+ * arguments giving the expected state of CAP_NET_BIND_SERVICE in the
+ * effective, permitted, inheritable and ambient sets, in that order, and
+ * compares them with the process's actual state.
+ *
+ * Returns 0 when all four match, 1 on the first mismatch.
+ */
+int main(int argc, char **argv)
+{
+	const char *atsec = "";
+
+	/*
+	 * Be careful just in case a setgid or setcapped copy of this
+	 * helper gets out.
+	 */
+
+	if (argc != 5)
+		ksft_exit_fail_msg("wrong argc\n");
+
+#ifdef HAVE_GETAUXVAL
+	/* Only used to annotate the mismatch messages below. */
+	if (getauxval(AT_SECURE))
+		atsec = " (AT_SECURE is set)";
+	else
+		atsec = " (AT_SECURE is not set)";
+#endif
+
+	capng_get_caps_process();
+
+	if (capng_have_capability(CAPNG_EFFECTIVE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 1)) {
+		ksft_print_msg("Wrong effective state%s\n", atsec);
+		return 1;
+	}
+
+	if (capng_have_capability(CAPNG_PERMITTED, CAP_NET_BIND_SERVICE) != bool_arg(argv, 2)) {
+		ksft_print_msg("Wrong permitted state%s\n", atsec);
+		return 1;
+	}
+
+	if (capng_have_capability(CAPNG_INHERITABLE, CAP_NET_BIND_SERVICE) != bool_arg(argv, 3)) {
+		ksft_print_msg("Wrong inheritable state%s\n", atsec);
+		return 1;
+	}
+
+	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, CAP_NET_BIND_SERVICE, 0, 0, 0) != bool_arg(argv, 4)) {
+		ksft_print_msg("Wrong ambient state%s\n", atsec);
+		return 1;
+	}
+
+	/* The old "%s:" format plus a "validate_cap:" argument printed "::". */
+	ksft_print_msg("%s: Capabilities after execve were correct\n",
+		       "validate_cap");
+	return 0;
+}
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
new file mode 100644
index 000000000..84cfcabea
--- /dev/null
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test_memcontrol
+test_core
+test_freezer
+test_kmem \ No newline at end of file
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
new file mode 100644
index 000000000..f027d9335
--- /dev/null
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wall -pthread
+
+all:
+
+TEST_FILES := with_stress.sh
+TEST_PROGS := test_stress.sh
+TEST_GEN_PROGS = test_memcontrol
+TEST_GEN_PROGS += test_kmem
+TEST_GEN_PROGS += test_core
+TEST_GEN_PROGS += test_freezer
+
+include ../lib.mk
+
+$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
new file mode 100644
index 000000000..5b16c7b0a
--- /dev/null
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -0,0 +1,578 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "cgroup_util.h"
+#include "../clone3/clone3_selftests.h"
+
+/*
+ * Read up to @max_len - 1 bytes from @path into @buf with a single read()
+ * and NUL-terminate the result.
+ *
+ * Returns the number of bytes read, or a negative value if the file could
+ * not be opened or read (in which case @buf is not terminated).
+ */
+static ssize_t read_text(const char *path, char *buf, size_t max_len)
+{
+	ssize_t nread;
+	int fd = open(path, O_RDONLY);
+
+	if (fd < 0)
+		return fd;
+
+	nread = read(fd, buf, max_len - 1);
+	if (nread >= 0)
+		buf[nread] = 0;
+
+	close(fd);
+	return nread;
+}
+
+/*
+ * Append @len bytes from @buf to the existing file @path with one write().
+ *
+ * Returns the number of bytes written, or a negative value when the file
+ * cannot be opened or the write fails.
+ */
+static ssize_t write_text(const char *path, char *buf, ssize_t len)
+{
+	ssize_t nwritten;
+	int fd = open(path, O_WRONLY | O_APPEND);
+
+	if (fd < 0)
+		return fd;
+
+	/* The descriptor is closed whether or not the write succeeded. */
+	nwritten = write(fd, buf, len);
+	close(fd);
+
+	return nwritten;
+}
+
+/*
+ * Build the path "<root>/<name>" in a freshly malloc()ed string.
+ *
+ * Returns the string (to be released with free()), or NULL if the
+ * allocation fails.
+ */
+char *cg_name(const char *root, const char *name)
+{
+	/* +2: one byte for the '/' separator, one for the terminating NUL. */
+	size_t len = strlen(root) + strlen(name) + 2;
+	char *ret = malloc(len);
+
+	if (!ret)
+		return NULL;
+
+	snprintf(ret, len, "%s/%s", root, name);
+
+	return ret;
+}
+
+/*
+ * Build the path "<root>/<name>_<index>" in a freshly malloc()ed string.
+ *
+ * The buffer is sized from the formatted length, so no value of @index is
+ * truncated (the previous fixed slack of 10 bytes was too small for indices
+ * with more than seven characters).
+ *
+ * Returns the string (to be released with free()), or NULL on failure.
+ */
+char *cg_name_indexed(const char *root, const char *name, int index)
+{
+	int need = snprintf(NULL, 0, "%s/%s_%d", root, name, index);
+	char *ret;
+
+	if (need < 0)
+		return NULL;
+
+	ret = malloc((size_t)need + 1);
+	if (!ret)
+		return NULL;
+
+	snprintf(ret, (size_t)need + 1, "%s/%s_%d", root, name, index);
+
+	return ret;
+}
+
+/*
+ * Build the path "<cgroup>/<control>" in a freshly malloc()ed string.
+ *
+ * Returns the string (to be released with free()), or NULL if the
+ * allocation fails.
+ */
+char *cg_control(const char *cgroup, const char *control)
+{
+	/* +2: one byte for the '/' separator, one for the terminating NUL. */
+	size_t len = strlen(cgroup) + strlen(control) + 2;
+	char *ret = malloc(len);
+
+	if (!ret)
+		return NULL;
+
+	snprintf(ret, len, "%s/%s", cgroup, control);
+
+	return ret;
+}
+
+/*
+ * Read the file "<cgroup>/<control>" into @buf (capacity @len) through
+ * read_text(), which stores at most @len - 1 bytes plus a NUL.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+
+	return read_text(path, buf, len) < 0 ? -1 : 0;
+}
+
+/*
+ * Compare the start of "<cgroup>/<control>" with @expected.
+ *
+ * The read buffer holds strlen(@expected) + 1 bytes, so at most
+ * strlen(@expected) bytes of the file take part in the comparison.
+ *
+ * Returns the strcmp() result (0 on match), or -1 when @expected is NULL,
+ * the allocation fails or the file cannot be read.
+ */
+int cg_read_strcmp(const char *cgroup, const char *control,
+		   const char *expected)
+{
+	int ret = -1;
+	size_t size;
+	char *buf;
+
+	/* A NULL expectation is rejected outright. */
+	if (!expected)
+		return -1;
+
+	size = strlen(expected) + 1;
+	buf = malloc(size);
+	if (!buf)
+		return -1;
+
+	if (!cg_read(cgroup, control, buf, size))
+		ret = strcmp(expected, buf);
+
+	free(buf);
+	return ret;
+}
+
+/*
+ * Look for @needle in the first PAGE_SIZE - 1 bytes of "<cgroup>/<control>".
+ *
+ * Returns 0 when found, -1 when absent or when the file cannot be read.
+ */
+int cg_read_strstr(const char *cgroup, const char *control, const char *needle)
+{
+	char contents[PAGE_SIZE];
+
+	if (cg_read(cgroup, control, contents, sizeof(contents)))
+		return -1;
+
+	if (!strstr(contents, needle))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Read "<cgroup>/<control>" and convert its leading text with atol().
+ *
+ * Returns the parsed value, or -1 when the file cannot be read.
+ */
+long cg_read_long(const char *cgroup, const char *control)
+{
+	char text[128];
+	int failed = cg_read(cgroup, control, text, sizeof(text));
+
+	return failed ? -1 : atol(text);
+}
+
+/*
+ * Read "<cgroup>/<control>", find the first occurrence of @key and convert
+ * the text immediately following it with atol().
+ *
+ * Returns the parsed value, or -1 when the file cannot be read or @key is
+ * not present.
+ */
+long cg_read_key_long(const char *cgroup, const char *control, const char *key)
+{
+	char contents[PAGE_SIZE];
+	char *hit;
+
+	if (cg_read(cgroup, control, contents, sizeof(contents)))
+		return -1;
+
+	hit = strstr(contents, key);
+
+	return hit ? atol(hit + strlen(key)) : -1;
+}
+
+/*
+ * Count the newline-separated tokens in "<cgroup>/<control>". Tokenising is
+ * done with strtok(), so empty lines are not counted.
+ *
+ * Returns the count, or -1 when the file cannot be read.
+ */
+long cg_read_lc(const char *cgroup, const char *control)
+{
+	char contents[PAGE_SIZE];
+	const char delim[] = "\n";
+	long lines = 0;
+	char *tok;
+
+	if (cg_read(cgroup, control, contents, sizeof(contents)))
+		return -1;
+
+	tok = strtok(contents, delim);
+	while (tok) {
+		lines++;
+		tok = strtok(NULL, delim);
+	}
+
+	return lines;
+}
+
+/*
+ * Write the NUL-terminated string @buf to "<cgroup>/<control>".
+ *
+ * Returns 0 only when the whole string was written, -1 otherwise.
+ */
+int cg_write(const char *cgroup, const char *control, char *buf)
+{
+	char path[PATH_MAX];
+	ssize_t want = strlen(buf);
+
+	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
+
+	return write_text(path, buf, want) == want ? 0 : -1;
+}
+
+/*
+ * Find the mount point of the first cgroup2 filesystem listed in
+ * /proc/self/mounts and copy it, NUL-terminated, into @root (capacity @len).
+ *
+ * Returns 0 on success. Returns -1 when the mounts file cannot be read, no
+ * cgroup2 mount is listed, or the mount point does not fit in @root.
+ */
+int cg_find_unified_root(char *root, size_t len)
+{
+	char buf[10 * PAGE_SIZE];
+	char *fs, *mount, *type;
+	const char delim[] = "\n\t ";
+
+	if (read_text("/proc/self/mounts", buf, sizeof(buf)) <= 0)
+		return -1;
+
+	/*
+	 * Example:
+	 * cgroup /sys/fs/cgroup cgroup2 rw,seclabel,noexec,relatime 0 0
+	 */
+	for (fs = strtok(buf, delim); fs; fs = strtok(NULL, delim)) {
+		mount = strtok(NULL, delim);
+		type = strtok(NULL, delim);
+		/* Skip the options, dump and pass fields. */
+		strtok(NULL, delim);
+		strtok(NULL, delim);
+		strtok(NULL, delim);
+
+		/*
+		 * The buffer may end in the middle of a record (the file is
+		 * read with a bounded buffer), leaving these fields NULL.
+		 */
+		if (!mount || !type)
+			break;
+
+		if (strcmp(type, "cgroup2") == 0) {
+			/* strncpy() alone would not terminate on overflow. */
+			if (strlen(mount) >= len)
+				return -1;
+			strncpy(root, mount, len);
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+/*
+ * Create the directory @cgroup with mode 0755.
+ * Returns the mkdir() result.
+ */
+int cg_create(const char *cgroup)
+{
+ return mkdir(cgroup, 0755);
+}
+
+/*
+ * Poll "<cgroup>/cgroup.procs" until it contains at least @count lines.
+ *
+ * Up to 11 reads are made, with a 100 ms sleep after each unsuccessful one.
+ * Returns 0 once enough lines are seen, -1 on read failure or when the
+ * attempts run out.
+ */
+int cg_wait_for_proc_count(const char *cgroup, int count)
+{
+	char buf[10 * PAGE_SIZE] = {0};
+	int retries_left = 10;
+
+	do {
+		const char *nl;
+		int nr = 0;
+
+		if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
+			return -1;
+
+		/* One newline per listed process. */
+		for (nl = strchr(buf, '\n'); nl; nl = strchr(nl + 1, '\n'))
+			nr++;
+
+		if (nr >= count)
+			return 0;
+
+		usleep(100000);
+	} while (retries_left-- > 0);
+
+	return -1;
+}
+
+/*
+ * Send SIGKILL to the processes listed in "<cgroup>/cgroup.procs".
+ *
+ * Only the first PAGE_SIZE - 1 bytes of the file are examined. Returns 0
+ * when the list was walked to its end, -1 when the file cannot be read or a
+ * kill() call fails.
+ */
+int cg_killall(const char *cgroup)
+{
+ char buf[PAGE_SIZE];
+ char *ptr = buf;
+
+ if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf)))
+ return -1;
+
+ while (ptr < buf + sizeof(buf)) {
+ /* strtol() advances ptr to just past the digits it consumed. */
+ int pid = strtol(ptr, &ptr, 10);
+
+ /* No number parsed (or a literal 0): stop. */
+ if (pid == 0)
+ break;
+ /*
+ * Step over the separator. A number that ends exactly at the
+ * terminating NUL leaves the loop here, before kill() is called.
+ */
+ if (*ptr)
+ ptr++;
+ else
+ break;
+ if (kill(pid, SIGKILL))
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Remove the directory @cgroup. While rmdir() reports EBUSY, the processes
+ * in the group are killed and the removal is retried after 100 us; there is
+ * no upper bound on the number of retries.
+ *
+ * Returns 0 on success or when the directory does not exist, otherwise the
+ * failing rmdir() result.
+ */
+int cg_destroy(const char *cgroup)
+{
+	int ret;
+
+	for (;;) {
+		ret = rmdir(cgroup);
+		if (!ret || errno != EBUSY)
+			break;
+
+		cg_killall(cgroup);
+		usleep(100);
+	}
+
+	/* An already-missing cgroup counts as destroyed. */
+	if (ret && errno == ENOENT)
+		ret = 0;
+
+	return ret;
+}
+
+/*
+ * Write the decimal form of @pid to "<cgroup>/cgroup.procs".
+ * Returns the cg_write() result (0 on success, -1 on failure).
+ */
+int cg_enter(const char *cgroup, int pid)
+{
+	char pid_text[64];
+	int rc;
+
+	snprintf(pid_text, sizeof(pid_text), "%d", pid);
+	rc = cg_write(cgroup, "cgroup.procs", pid_text);
+
+	return rc;
+}
+
+/*
+ * Write "0" to "<cgroup>/cgroup.procs".
+ * NOTE(review): "0" presumably designates the calling process -- confirm
+ * against the cgroup v2 documentation.
+ */
+int cg_enter_current(const char *cgroup)
+{
+ return cg_write(cgroup, "cgroup.procs", "0");
+}
+
+/*
+ * Write "0" to "<cgroup>/cgroup.threads".
+ * NOTE(review): "0" presumably designates the calling thread -- confirm
+ * against the cgroup v2 documentation.
+ */
+int cg_enter_current_thread(const char *cgroup)
+{
+ return cg_write(cgroup, "cgroup.threads", "0");
+}
+
+/*
+ * Run @fn(@cgroup, @arg) in a forked child that first moves itself into
+ * @cgroup, and wait for it to finish.
+ *
+ * Returns the child's exit status when it exited normally (EXIT_FAILURE if
+ * it could not enter the cgroup), the negative fork() result when forking
+ * fails, and -1 when waiting fails or the child did not exit normally.
+ */
+int cg_run(const char *cgroup,
+	   int (*fn)(const char *cgroup, void *arg),
+	   void *arg)
+{
+	int pid, retcode;
+
+	pid = fork();
+	if (pid < 0) {
+		return pid;
+	} else if (pid == 0) {
+		char buf[64];
+
+		snprintf(buf, sizeof(buf), "%d", getpid());
+		if (cg_write(cgroup, "cgroup.procs", buf))
+			exit(EXIT_FAILURE);
+		exit(fn(cgroup, arg));
+	} else {
+		/* Without this check retcode could be read uninitialised. */
+		if (waitpid(pid, &retcode, 0) != pid)
+			return -1;
+		if (WIFEXITED(retcode))
+			return WEXITSTATUS(retcode);
+		else
+			return -1;
+	}
+}
+
+/*
+ * Create a child process directly inside the cgroup referred to by
+ * @cgroup_fd, using clone3() with CLONE_INTO_CGROUP.
+ *
+ * Returns the sys_clone3() result when the call is attempted and fails for
+ * a reason other than ENOSYS or E2BIG, or succeeds. When the build lacks
+ * CLONE_ARGS_SIZE_VER2, or the call fails with ENOSYS or E2BIG, errno is set
+ * to ENOSYS and -ENOSYS is returned so callers can fall back.
+ */
+pid_t clone_into_cgroup(int cgroup_fd)
+{
+#ifdef CLONE_ARGS_SIZE_VER2
+ pid_t pid;
+
+ struct __clone_args args = {
+ .flags = CLONE_INTO_CGROUP,
+ .exit_signal = SIGCHLD,
+ .cgroup = cgroup_fd,
+ };
+
+ pid = sys_clone3(&args, sizeof(struct __clone_args));
+ /*
+ * Verify that this is a genuine test failure:
+ * ENOSYS -> clone3() not available
+ * E2BIG -> CLONE_INTO_CGROUP not available
+ */
+ if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
+ goto pretend_enosys;
+
+ return pid;
+
+pretend_enosys:
+#endif
+ errno = ENOSYS;
+ return -ENOSYS;
+}
+
+/*
+ * Wait for @pid with waitid(), retrying on EINTR. @options selects the
+ * state changes of interest (WEXITED, WSTOPPED, WCONTINUED); __WALL and
+ * __WNOTHREAD are always added.
+ *
+ * Returns -1 when waitid() fails or none of the checks below matches.
+ * NOTE(review): the W*() macros are applied to info.si_status rather than
+ * to a wait()-style status word -- confirm this is intended.
+ */
+int clone_reap(pid_t pid, int options)
+{
+ int ret;
+ siginfo_t info = {
+ .si_signo = 0,
+ };
+
+again:
+ ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
+ if (ret < 0) {
+ if (errno == EINTR)
+ goto again;
+ return -1;
+ }
+
+ if (options & WEXITED) {
+ if (WIFEXITED(info.si_status))
+ return WEXITSTATUS(info.si_status);
+ }
+
+ if (options & WSTOPPED) {
+ if (WIFSTOPPED(info.si_status))
+ return WSTOPSIG(info.si_status);
+ }
+
+ if (options & WCONTINUED) {
+ if (WIFCONTINUED(info.si_status))
+ return 0;
+ }
+
+ return -1;
+}
+
+/*
+ * Open @dir with O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH.
+ * Returns the open() result.
+ */
+int dirfd_open_opath(const char *dir)
+{
+ return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
+}
+
+/*
+ * Close @fd (when it is a valid descriptor) without disturbing errno, so a
+ * preceding failure can still be reported after cleanup.
+ *
+ * Wrapped in do { } while (0) so the macro behaves as a single statement,
+ * including in an unbraced if/else; use it with a trailing semicolon.
+ */
+#define close_prot_errno(fd)                                                   \
+	do {                                                                   \
+		if ((fd) >= 0) {                                               \
+			int _e_ = errno;                                       \
+			close(fd);                                             \
+			errno = _e_;                                           \
+		}                                                              \
+	} while (0)
+
+/*
+ * Start @fn(@cgroup, @arg) in a child created directly inside @cgroup via
+ * clone_into_cgroup(), without waiting for it.
+ *
+ * Returns the child's pid in the parent, -1 when the cgroup directory
+ * cannot be opened, or the negative clone_into_cgroup() result. The child
+ * never returns from this function: it exits with @fn's return value.
+ */
+static int clone_into_cgroup_run_nowait(const char *cgroup,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg)
+{
+ int cgroup_fd;
+ pid_t pid;
+
+ cgroup_fd = dirfd_open_opath(cgroup);
+ if (cgroup_fd < 0)
+ return -1;
+
+ pid = clone_into_cgroup(cgroup_fd);
+ /* Close the directory fd while keeping errno from the clone attempt. */
+ close_prot_errno(cgroup_fd);
+ if (pid == 0)
+ exit(fn(cgroup, arg));
+
+ return pid;
+}
+
+/*
+ * Start @fn(@cgroup, @arg) in a child process inside @cgroup and return
+ * without waiting for it.
+ *
+ * clone_into_cgroup_run_nowait() is tried first. If it fails with errno set
+ * to ENOSYS, a plain fork() is used and the child moves itself into the
+ * cgroup by writing its pid to cgroup.procs.
+ *
+ * Returns the child's pid, -1 on a clone failure other than ENOSYS, or the
+ * fork() result when the fallback path fails.
+ */
+int cg_run_nowait(const char *cgroup,
+		  int (*fn)(const char *cgroup, void *arg),
+		  void *arg)
+{
+	char pid_text[64];
+	int pid;
+
+	pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
+	if (pid > 0)
+		return pid;
+
+	/* Genuine test failure. */
+	if (pid < 0 && errno != ENOSYS)
+		return -1;
+
+	pid = fork();
+	if (pid != 0)
+		return pid;
+
+	/* Child: join the cgroup by hand, then run the payload. */
+	snprintf(pid_text, sizeof(pid_text), "%d", getpid());
+	if (cg_write(cgroup, "cgroup.procs", pid_text))
+		exit(EXIT_FAILURE);
+	exit(fn(cgroup, arg));
+}
+
+/*
+ * Open an unnamed temporary file in the current directory
+ * (O_TMPFILE | O_RDWR | O_EXCL). Returns the open() result.
+ */
+int get_temp_fd(void)
+{
+ return open(".", O_TMPFILE | O_RDWR | O_EXCL);
+}
+
+/*
+ * Grow the file behind @fd by @size bytes with ftruncate() and then issue
+ * sequential PAGE_SIZE reads, one per PAGE_SIZE step of the new file size.
+ * The read() results are deliberately ignored.
+ * NOTE(review): the reads presumably exist to populate the page cache and
+ * start at the descriptor's current offset -- confirm against callers.
+ *
+ * Returns 0 on success, -1 when fstat() or ftruncate() fails.
+ */
+int alloc_pagecache(int fd, size_t size)
+{
+	char buf[PAGE_SIZE];
+	struct stat st;
+	size_t i;	/* size_t: an int counter overflows for sizes > INT_MAX */
+
+	if (fstat(fd, &st))
+		goto cleanup;
+
+	size += st.st_size;
+
+	if (ftruncate(fd, size))
+		goto cleanup;
+
+	for (i = 0; i < size; i += sizeof(buf))
+		read(fd, buf, sizeof(buf));
+
+	return 0;
+
+cleanup:
+	return -1;
+}
+
+/*
+ * Allocate a heap buffer whose size in bytes is carried in @arg, write one
+ * byte at every PAGE_SIZE step so the memory is actually touched, then free
+ * it. @cgroup is unused; the signature matches the cg_run() callback type.
+ *
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+int alloc_anon(const char *cgroup, void *arg)
+{
+	size_t size = (unsigned long)arg;
+	char *buf, *ptr;
+
+	buf = malloc(size);
+	/* Previously a failed allocation was dereferenced in the loop. */
+	if (!buf)
+		return -1;
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	free(buf);
+	return 0;
+}
+
+/*
+ * Report whether /proc/swaps contains more than one non-empty line.
+ * NOTE(review): the first line is presumably a column header -- confirm.
+ *
+ * Returns 1 if so, 0 if not, -1 when the file is unreadable or empty.
+ */
+int is_swap_enabled(void)
+{
+	char contents[PAGE_SIZE];
+	const char delim[] = "\n";
+	int lines = 0;
+	char *tok;
+
+	if (read_text("/proc/swaps", contents, sizeof(contents)) <= 0)
+		return -1;
+
+	tok = strtok(contents, delim);
+	while (tok) {
+		lines++;
+		tok = strtok(NULL, delim);
+	}
+
+	return lines > 1;
+}
+
+/*
+ * Write @score in decimal to /proc/<pid>/oom_score_adj.
+ *
+ * Returns 0 on success, or a negative value when the file cannot be opened
+ * or written.
+ */
+int set_oom_adj_score(int pid, int score)
+{
+	char path[PATH_MAX];
+	int fd, written;
+
+	sprintf(path, "/proc/%d/oom_score_adj", pid);
+
+	fd = open(path, O_WRONLY | O_APPEND);
+	if (fd < 0)
+		return fd;
+
+	written = dprintf(fd, "%d", score);
+	close(fd);
+
+	return written < 0 ? written : 0;
+}
+
+/*
+ * Read the procfs entry @item of process @pid into @buf (capacity @size).
+ *
+ * A @pid of 0 selects the caller itself: /proc/thread-self when @thread is
+ * true, /proc/self otherwise. Returns the read_text() result.
+ */
+ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size)
+{
+	char path[PATH_MAX];
+
+	if (pid) {
+		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);
+	} else {
+		const char *self = thread ? "thread-self" : "self";
+
+		snprintf(path, sizeof(path), "/proc/%s/%s", self, item);
+	}
+
+	return read_text(path, buf, size);
+}
+
+/*
+ * Look for @needle in the first PAGE_SIZE - 1 bytes of the procfs entry
+ * @item of @pid (see proc_read_text() for how @pid and @thread select it).
+ *
+ * Returns 0 when found, -1 when absent or when the entry cannot be read.
+ */
+int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
+{
+	char contents[PAGE_SIZE];
+
+	if (proc_read_text(pid, thread, item, contents, sizeof(contents)) < 0)
+		return -1;
+
+	if (!strstr(contents, needle))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check that a process can be cloned directly into @cgroup: the child exits
+ * immediately with EXIT_SUCCESS and the parent reaps it.
+ *
+ * Returns 0 when the clone succeeded, -1 when the cgroup directory cannot be
+ * opened or the clone fails.
+ */
+int clone_into_cgroup_run_wait(const char *cgroup)
+{
+ int cgroup_fd;
+ pid_t pid;
+
+ cgroup_fd = dirfd_open_opath(cgroup);
+ if (cgroup_fd < 0)
+ return -1;
+
+ pid = clone_into_cgroup(cgroup_fd);
+ /* Close the directory fd while keeping errno from the clone attempt. */
+ close_prot_errno(cgroup_fd);
+ if (pid < 0)
+ return -1;
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ /*
+ * We don't care whether this fails. We only care whether the initial
+ * clone succeeded.
+ */
+ (void)clone_reap(pid, WEXITED);
+ return 0;
+}
diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h
new file mode 100644
index 000000000..5a1305dd1
--- /dev/null
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdbool.h>
+#include <stdlib.h>
+
+/* Page size assumed by the tests when sizing buffers and strides. */
+#define PAGE_SIZE 4096
+
+/* Number of elements in a true array (not a pointer). */
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/*
+ * Convert mebibytes to bytes. The argument is parenthesized so that
+ * compound expressions such as MB(a | b) shift the whole value.
+ */
+#define MB(x) ((x) << 20)
+
+/*
+ * Checks if two given values differ by at most err% of their sum.
+ *
+ * The difference is a long, so labs() is used: abs() takes an int and
+ * would silently truncate differences that do not fit in 32 bits.
+ */
+static inline int values_close(long a, long b, int err)
+{
+	return labs(a - b) <= (a + b) / 100 * err;
+}
+
+/*
+ * Helper API shared by the cgroup selftests; test_core.c and
+ * test_freezer.c both include this header.
+ */
+
+/* cgroup path construction and control-file access */
+extern int cg_find_unified_root(char *root, size_t len);
+extern char *cg_name(const char *root, const char *name);
+extern char *cg_name_indexed(const char *root, const char *name, int index);
+extern char *cg_control(const char *cgroup, const char *control);
+extern int cg_create(const char *cgroup);
+extern int cg_destroy(const char *cgroup);
+extern int cg_read(const char *cgroup, const char *control,
+ char *buf, size_t len);
+extern int cg_read_strcmp(const char *cgroup, const char *control,
+ const char *expected);
+extern int cg_read_strstr(const char *cgroup, const char *control,
+ const char *needle);
+extern long cg_read_long(const char *cgroup, const char *control);
+/* Declared without "extern"; linkage is the same as for its neighbours. */
+long cg_read_key_long(const char *cgroup, const char *control, const char *key);
+extern long cg_read_lc(const char *cgroup, const char *control);
+extern int cg_write(const char *cgroup, const char *control, char *buf);
+/* running and moving tasks */
+extern int cg_run(const char *cgroup,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg);
+extern int cg_enter(const char *cgroup, int pid);
+extern int cg_enter_current(const char *cgroup);
+extern int cg_enter_current_thread(const char *cgroup);
+extern int cg_run_nowait(const char *cgroup,
+ int (*fn)(const char *cgroup, void *arg),
+ void *arg);
+/* memory and miscellaneous helpers */
+extern int get_temp_fd(void);
+extern int alloc_pagecache(int fd, size_t size);
+extern int alloc_anon(const char *cgroup, void *arg);
+/* Returns -1 if /proc/swaps cannot be read, otherwise 1 or 0. */
+extern int is_swap_enabled(void);
+/* Writes score to /proc/<pid>/oom_score_adj; 0 on success. */
+extern int set_oom_adj_score(int pid, int score);
+extern int cg_wait_for_proc_count(const char *cgroup, int count);
+extern int cg_killall(const char *cgroup);
+/* pid == 0 means /proc/self, or /proc/thread-self when thread is true. */
+extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
+/* Returns 0 if needle occurs in /proc/<pid>/<item>, -1 otherwise. */
+extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
+/* clone-into-cgroup helpers */
+extern pid_t clone_into_cgroup(int cgroup_fd);
+extern int clone_reap(pid_t pid, int options);
+extern int clone_into_cgroup_run_wait(const char *cgroup);
+extern int dirfd_open_opath(const char *dir);
diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c
new file mode 100644
index 000000000..600123503
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -0,0 +1,888 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <linux/limits.h>
+#include <linux/sched.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+/*
+ * Fill @buf with @size bytes read from /dev/urandom, which touches every
+ * byte of the buffer. Reads interrupted by a signal (EINTR) are retried.
+ *
+ * Returns 0 on success and -1 if the device cannot be opened or read.
+ */
+static int touch_anon(char *buf, size_t size)
+{
+	char *cursor = buf;
+	size_t left = size;
+	int rc = 0;
+	int fd = open("/dev/urandom", O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	while (left > 0) {
+		ssize_t got = read(fd, cursor, left);
+
+		if (got >= 0) {
+			cursor += got;
+			left -= got;
+			continue;
+		}
+
+		if (errno == EINTR)
+			continue;
+
+		rc = -1;
+		break;
+	}
+
+	close(fd);
+	return rc;
+}
+
+/*
+ * Map @arg bytes of anonymous memory, touch all of it and then sleep until
+ * the parent process changes (i.e. the original parent went away).
+ *
+ * @cgroup is unused; the signature matches the callback type taken by
+ * cg_run_nowait(). Returns 0 once re-parented, -1 if mapping or touching
+ * the memory fails.
+ */
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	/* Anonymous mappings take fd -1; some systems reject any other value. */
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   -1, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	/* Keep the memory charged until the parent disappears. */
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Ten times in a row: create the cgroup, start a child that allocates and
+ * touches 100MB and then idles, wait for the child to show up in the
+ * cgroup, kill every process in it, poll "cgroup.procs" until it reads
+ * empty and finally rmdir the cgroup. This exercises races between dying
+ * processes leaving the cgroup and the cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	char procs[PAGE_SIZE];
+	char *cg_test;
+	int child_pid;
+	int round;
+	int ret = KSFT_FAIL;
+
+	cg_test = cg_name(root, "cg_test");
+	if (!cg_test)
+		goto cleanup;
+
+	for (round = 0; round < 10; round++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter the cgroup, then kill it */
+		if (cg_wait_for_proc_count(cg_test, 1) || cg_killall(cg_test))
+			goto cleanup;
+
+		/* poll until the cgroup no longer lists any process */
+		for (;;) {
+			if (cg_read(cg_test, "cgroup.procs", procs, sizeof(procs)))
+				goto cleanup;
+			if (!procs[0])
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test) || waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
+/*
+ * Hierarchy (number of processes in parentheses):
+ *
+ * A(0) - B(0) - C(1)
+ *        \ D(0)
+ *
+ * With the test process in C, "cgroup.events" must report "populated 1"
+ * for A, B and C and "populated 0" for D. After the process moves back to
+ * the root, all four must report "populated 0".
+ *
+ * The second half clones a child directly into D, checks that D becomes
+ * populated while the child is alive, and finally verifies that cloning
+ * into D fails once D has been removed. A kernel without clone-into-cgroup
+ * support (ENOSYS) passes the test.
+ */
+static int test_cgcore_populated(const char *root)
+{
+	int ret = KSFT_FAIL;
+	int err;
+	char *cg_test_a, *cg_test_b, *cg_test_c, *cg_test_d;
+	int cgroup_fd = -EBADF;
+	pid_t pid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
+	cg_test_c = cg_name(root, "cg_test_a/cg_test_b/cg_test_c");
+	cg_test_d = cg_name(root, "cg_test_a/cg_test_b/cg_test_d");
+	if (!(cg_test_a && cg_test_b && cg_test_c && cg_test_d))
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b) ||
+	    cg_create(cg_test_c) || cg_create(cg_test_d))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_c))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_a, "cgroup.events", "populated 1\n") ||
+	    cg_read_strcmp(cg_test_b, "cgroup.events", "populated 1\n") ||
+	    cg_read_strcmp(cg_test_c, "cgroup.events", "populated 1\n") ||
+	    cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	if (cg_enter_current(root))
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_a, "cgroup.events", "populated 0\n") ||
+	    cg_read_strcmp(cg_test_b, "cgroup.events", "populated 0\n") ||
+	    cg_read_strcmp(cg_test_c, "cgroup.events", "populated 0\n") ||
+	    cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Test that we can directly clone into a new cgroup. */
+	cgroup_fd = dirfd_open_opath(cg_test_d);
+	if (cgroup_fd < 0)
+		goto cleanup;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0) {
+		if (errno != ENOSYS)
+			goto cleanup;
+		goto cleanup_pass;
+	}
+
+	/* Child: stop so that the parent can observe D while populated. */
+	if (!pid)
+		exit(raise(SIGSTOP) ? EXIT_FAILURE : EXIT_SUCCESS);
+
+	err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
+
+	/* Always resume and reap the child before acting on the result. */
+	(void)clone_reap(pid, WSTOPPED);
+	(void)kill(pid, SIGCONT);
+	(void)clone_reap(pid, WEXITED);
+
+	if (err || cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Remove cgroup; the open fd now refers to a dead cgroup. */
+	cg_destroy(cg_test_d);
+	free(cg_test_d);
+	cg_test_d = NULL;
+
+	/* Cloning into the removed cgroup is expected to fail. */
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0)
+		goto cleanup_pass;
+	if (!pid)
+		exit(EXIT_SUCCESS);
+	(void)clone_reap(pid, WEXITED);
+	goto cleanup;
+
+cleanup_pass:
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cg_test_d)
+		cg_destroy(cg_test_d);
+	if (cg_test_c)
+		cg_destroy(cg_test_c);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_d);
+	free(cg_test_c);
+	free(cg_test_b);
+	free(cg_test_a);
+	if (cgroup_fd >= 0)
+		close(cgroup_fd);
+	return ret;
+}
+
+/*
+ * A (domain threaded) - B (threaded) - C (domain)
+ *
+ * Once B is switched to "threaded", C must read back "domain invalid" from
+ * its "cgroup.type" file and must be unusable: both entering C and cloning
+ * into C have to fail with EOPNOTSUPP. A clone that fails with ENOSYS
+ * (no clone-into-cgroup support) is accepted as a pass.
+ */
+static int test_cgcore_invalid_domain(const char *root)
+{
+	char *grandparent, *parent, *child;
+	int ret = KSFT_FAIL;
+
+	grandparent = cg_name(root, "cg_test_grandparent");
+	parent = cg_name(root, "cg_test_grandparent/cg_test_parent");
+	child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child");
+	if (!parent || !child || !grandparent)
+		goto cleanup;
+
+	if (cg_create(grandparent) || cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_read_strcmp(child, "cgroup.type", "domain invalid\n"))
+		goto cleanup;
+
+	/* Migration into the invalid domain has to fail with EOPNOTSUPP. */
+	if (!cg_enter_current(child) || errno != EOPNOTSUPP)
+		goto cleanup;
+
+	/* So does a direct clone, unless the kernel lacks the feature. */
+	if (!clone_into_cgroup_run_wait(child))
+		goto cleanup;
+
+	if (errno != ENOSYS && errno != EOPNOTSUPP)
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	if (grandparent)
+		cg_destroy(grandparent);
+	free(child);
+	free(parent);
+	free(grandparent);
+	return ret;
+}
+
+/*
+ * Switching a child to "threaded" must make the parent's "cgroup.type"
+ * read "domain threaded".
+ */
+static int test_cgcore_parent_becomes_threaded(const char *root)
+{
+	char *parent, *child;
+	int ret = KSFT_FAIL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	if (cg_write(child, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (!cg_read_strcmp(parent, "cgroup.type", "domain threaded\n"))
+		ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Test that there's no internal process constraint on threaded cgroups.
+ * You can add threads/processes on a parent with a controller enabled.
+ *
+ * The test is skipped when the "cpu" controller is not listed in the
+ * root's "cgroup.controllers" or cannot be enabled in the root's
+ * "cgroup.subtree_control".
+ */
+static int test_cgcore_no_internal_process_constraint_on_threads(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent = NULL, *child = NULL;
+
+	if (cg_read_strstr(root, "cgroup.controllers", "cpu") ||
+	    cg_write(root, "cgroup.subtree_control", "+cpu")) {
+		ret = KSFT_SKIP;
+		goto cleanup;
+	}
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_write(child, "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+cpu"))
+		goto cleanup;
+
+	/* Entering the parent must work despite the enabled controller. */
+	if (cg_enter_current(parent))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	/* Move back to the root once so the test cgroups can be removed. */
+	cg_enter_current(root);
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Enabling a controller in a child's "cgroup.subtree_control" must fail
+ * while the parent has not enabled it.
+ */
+static int test_cgcore_top_down_constraint_enable(const char *root)
+{
+	char *parent, *child;
+	int ret = KSFT_FAIL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	/* The write is expected to be rejected. */
+	if (cg_write(child, "cgroup.subtree_control", "+memory"))
+		ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Disabling a controller in the parent must fail while a child still has
+ * it enabled in its own "cgroup.subtree_control".
+ */
+static int test_cgcore_top_down_constraint_disable(const char *root)
+{
+	char *parent, *child;
+	int ret = KSFT_FAIL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory") ||
+	    cg_write(child, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	/* The removal is expected to be rejected. */
+	if (cg_write(parent, "cgroup.subtree_control", "-memory"))
+		ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Internal process constraint: once a domain parent has a controller
+ * enabled for its children, neither migrating into it nor cloning into it
+ * may succeed.
+ */
+static int test_cgcore_internal_process_constraint(const char *root)
+{
+	char *parent, *child;
+	int ret = KSFT_FAIL;
+
+	parent = cg_name(root, "cg_test_parent");
+	child = cg_name(root, "cg_test_parent/cg_test_child");
+	if (!parent || !child)
+		goto cleanup;
+
+	if (cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	/* Both attempts have to fail, checked in this order. */
+	if (cg_enter_current(parent) && clone_into_cgroup_run_wait(parent))
+		ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	if (parent)
+		cg_destroy(parent);
+	free(child);
+	free(parent);
+	return ret;
+}
+
+/* Thread body that blocks in pause() and returns pause()'s result. */
+static void *dummy_thread_fn(void *arg)
+{
+	int rc = pause();
+
+	return (void *)(size_t)rc;
+}
+
+/*
+ * Threadgroup migration: after spawning 13 extra threads in "cg_src" and
+ * moving the process to "cg_dst", "cgroup.threads" of the destination must
+ * list all 14 threads.
+ */
+static int test_cgcore_proc_migration(const char *root)
+{
+	int ret = KSFT_FAIL;
+	int t, c_threads = 0, n_threads = 13;
+	char *src, *dst;
+	pthread_t threads[n_threads];
+
+	src = cg_name(root, "cg_src");
+	dst = cg_name(root, "cg_dst");
+	if (!src || !dst)
+		goto cleanup;
+
+	if (cg_create(src) || cg_create(dst))
+		goto cleanup;
+
+	if (cg_enter_current(src))
+		goto cleanup;
+
+	/* c_threads tracks how many threads need cancelling on cleanup. */
+	while (c_threads < n_threads) {
+		if (pthread_create(&threads[c_threads], NULL, dummy_thread_fn, NULL))
+			goto cleanup;
+		++c_threads;
+	}
+
+	cg_enter_current(dst);
+	if (cg_read_lc(dst, "cgroup.threads") == n_threads + 1)
+		ret = KSFT_PASS;
+
+cleanup:
+	for (t = 0; t < c_threads; ++t)
+		pthread_cancel(threads[t]);
+
+	for (t = 0; t < c_threads; ++t)
+		pthread_join(threads[t], NULL);
+
+	cg_enter_current(root);
+
+	if (dst)
+		cg_destroy(dst);
+	if (src)
+		cg_destroy(src);
+	free(dst);
+	free(src);
+	return ret;
+}
+
+/*
+ * Thread body: bounce the calling thread between grps[1] and grps[2] for
+ * 1000 iterations and, after each move, check that /proc/thread-self/cgroup
+ * contains "0::<path relative to grps[0]>" for the cgroup just entered.
+ *
+ * @arg is a char *[3]: the root followed by the two cgroups.
+ * Returns NULL on success and (void *)-1 on the first mismatch.
+ */
+static void *migrating_thread_fn(void *arg)
+{
+	int g, i, n_iterations = 1000;
+	char **grps = arg;
+	char lines[3][PATH_MAX];
+
+	for (g = 1; g < 3; ++g)
+		snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0]));
+
+	for (i = 0; i < n_iterations; ++i) {
+		int idx = (i % 2) + 1;
+
+		cg_enter_current_thread(grps[idx]);
+
+		if (proc_read_strstr(0, 1, "cgroup", lines[idx]))
+			return (void *)-1;
+	}
+
+	return NULL;
+}
+
+/*
+ * Single thread migration: inside a domain with two threaded children, a
+ * helper thread migrates itself back and forth between them. Afterwards
+ * the main thread must still be reported in the source cgroup.
+ */
+static int test_cgcore_thread_migration(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *dom;
+	char line[PATH_MAX];
+	char *grps[3] = { (char *)root, NULL, NULL };
+	pthread_t thr;
+	void *retval;
+
+	dom = cg_name(root, "cg_dom");
+	grps[1] = cg_name(root, "cg_dom/cg_src");
+	grps[2] = cg_name(root, "cg_dom/cg_dst");
+	if (!grps[1] || !grps[2] || !dom)
+		goto cleanup;
+
+	if (cg_create(dom) || cg_create(grps[1]) || cg_create(grps[2]))
+		goto cleanup;
+
+	if (cg_write(grps[1], "cgroup.type", "threaded") ||
+	    cg_write(grps[2], "cgroup.type", "threaded"))
+		goto cleanup;
+
+	if (cg_enter_current(grps[1]))
+		goto cleanup;
+
+	if (pthread_create(&thr, NULL, migrating_thread_fn, grps))
+		goto cleanup;
+
+	/* A non-NULL thread result signals a failed check in the helper. */
+	if (pthread_join(thr, &retval) || retval)
+		goto cleanup;
+
+	snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0]));
+	if (!proc_read_strstr(0, 1, "cgroup", line))
+		ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (grps[2])
+		cg_destroy(grps[2]);
+	if (grps[1])
+		cg_destroy(grps[1]);
+	if (dom)
+		cg_destroy(dom);
+	free(grps[2]);
+	free(grps[1]);
+	free(dom);
+	return ret;
+}
+
+/*
+ * The permission check for a cgroup migration must use the credentials
+ * that were in effect when "cgroup.procs" was opened, not those at write
+ * time: a file opened under a lesser euid must still reject the write with
+ * EACCES after the original euid has been restored.
+ */
+static int test_cgcore_lesser_euid_open(const char *root)
+{
+	const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a, *cg_test_b;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	uid_t saved_uid;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	/* Open the destination's procs file while running as test_euid. */
+	saved_uid = geteuid();
+	if (seteuid(test_euid))
+		goto cleanup;
+
+	cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR);
+
+	if (seteuid(saved_uid) || cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	/* The write must fail with EACCES despite the restored euid. */
+	if (write(cg_test_b_procs_fd, "0", 1) < 0 && errno == EACCES)
+		ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
+/* Argument block shared with lesser_ns_open_thread_fn(). */
+struct lesser_ns_open_thread_arg {
+ const char *path; /* in: file to open with O_RDWR */
+ int fd; /* out: result of open() */
+ int err; /* out: errno sampled right after open() */
+};
+
+/*
+ * clone() entry point: open the path named in the argument block
+ * read-write and hand the resulting descriptor and errno back through the
+ * same block. Always returns 0.
+ */
+static int lesser_ns_open_thread_fn(void *arg)
+{
+	struct lesser_ns_open_thread_arg *req = arg;
+	int fd = open(req->path, O_RDWR);
+
+	req->fd = fd;
+	req->err = errno;
+	return 0;
+}
+
+/*
+ * The permission check for a cgroup migration must use the cgroup
+ * namespace that was in effect when "cgroup.procs" was opened, not the one
+ * at write time. The file is opened by a helper cloned with
+ * CLONE_NEWCGROUP (sharing files and memory with the test), and a later
+ * write through that descriptor must fail with ENOENT.
+ */
+static int test_cgcore_lesser_ns_open(const char *root)
+{
+	static char stack[65536];
+	const uid_t test_euid = 65534; /* usually nobody, any !root is fine */
+	int ret = KSFT_FAIL;
+	char *cg_test_a, *cg_test_b;
+	char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+	int cg_test_b_procs_fd = -1;
+	struct lesser_ns_open_thread_arg targ = { .fd = -1 };
+	pid_t pid;
+	int status;
+
+	cg_test_a = cg_name(root, "cg_test_a");
+	cg_test_b = cg_name(root, "cg_test_b");
+	if (!cg_test_a || !cg_test_b)
+		goto cleanup;
+
+	cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs");
+	cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs");
+	if (!cg_test_a_procs || !cg_test_b_procs)
+		goto cleanup;
+
+	if (cg_create(cg_test_a) || cg_create(cg_test_b))
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_b))
+		goto cleanup;
+
+	if (chown(cg_test_a_procs, test_euid, -1) ||
+	    chown(cg_test_b_procs, test_euid, -1))
+		goto cleanup;
+
+	/* The helper's stack grows down, so pass the end of the buffer. */
+	targ.path = cg_test_b_procs;
+	pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack),
+		    CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD,
+		    &targ);
+	if (pid < 0)
+		goto cleanup;
+
+	if (waitpid(pid, &status, 0) < 0 || !WIFEXITED(status))
+		goto cleanup;
+
+	cg_test_b_procs_fd = targ.fd;
+	if (cg_test_b_procs_fd < 0)
+		goto cleanup;
+
+	if (cg_enter_current(cg_test_a))
+		goto cleanup;
+
+	/* The write must fail with ENOENT. */
+	if (write(cg_test_b_procs_fd, "0", 1) < 0 && errno == ENOENT)
+		ret = KSFT_PASS;
+
+cleanup:
+	cg_enter_current(root);
+	if (cg_test_b_procs_fd >= 0)
+		close(cg_test_b_procs_fd);
+	if (cg_test_b)
+		cg_destroy(cg_test_b);
+	if (cg_test_a)
+		cg_destroy(cg_test_a);
+	free(cg_test_b_procs);
+	free(cg_test_a_procs);
+	free(cg_test_b);
+	free(cg_test_a);
+	return ret;
+}
+
+/*
+ * Table of test cases run by main(), in array order. T(x) pairs a test
+ * function with its own name as a string so that results can be reported
+ * by name.
+ */
+#define T(x) { x, #x }
+struct corecg_test {
+ int (*fn)(const char *root); /* returns a KSFT_* result code */
+ const char *name; /* stringified function name */
+} tests[] = {
+ T(test_cgcore_internal_process_constraint),
+ T(test_cgcore_top_down_constraint_enable),
+ T(test_cgcore_top_down_constraint_disable),
+ T(test_cgcore_no_internal_process_constraint_on_threads),
+ T(test_cgcore_parent_becomes_threaded),
+ T(test_cgcore_invalid_domain),
+ T(test_cgcore_populated),
+ T(test_cgcore_proc_migration),
+ T(test_cgcore_thread_migration),
+ T(test_cgcore_destroy),
+ T(test_cgcore_lesser_euid_open),
+ T(test_cgcore_lesser_ns_open),
+};
+#undef T
+
+/*
+ * Locate the cgroup v2 root, make sure the memory controller is enabled in
+ * the root's "cgroup.subtree_control" and run every entry of tests[].
+ *
+ * Exits via ksft_exit_skip() if the setup is not possible; otherwise
+ * returns EXIT_FAILURE if any test neither passed nor was skipped.
+ */
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int i, ret = EXIT_SUCCESS;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	/* Only try to enable the controller when it is not listed yet. */
+	if (cg_read_strstr(root, "cgroup.subtree_control", "memory") &&
+	    cg_write(root, "cgroup.subtree_control", "+memory"))
+		ksft_exit_skip("Failed to set memory controller\n");
+
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		int rc = tests[i].fn(root);
+
+		if (rc == KSFT_PASS) {
+			ksft_test_result_pass("%s\n", tests[i].name);
+		} else if (rc == KSFT_SKIP) {
+			ksft_test_result_skip("%s\n", tests[i].name);
+		} else {
+			ret = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", tests[i].name);
+		}
+	}
+
+	return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c
new file mode 100644
index 000000000..23d8fa4a3
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_freezer.c
@@ -0,0 +1,905 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <stdbool.h>
+#include <linux/limits.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <sys/inotify.h>
+#include <string.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+/*
+ * DEBUG is defined unconditionally here, so debug() always expands to an
+ * fprintf() on stderr; without DEBUG it would expand to nothing.
+ */
+#define DEBUG
+#ifdef DEBUG
+#define debug(args...) fprintf(stderr, args)
+#else
+#define debug(args...)
+#endif
+
+/*
+ * Compare the "frozen" value in the cgroup's cgroup.events file with the
+ * expected state. Returns 0 on a match, -1 (after a debug message)
+ * otherwise.
+ */
+static int cg_check_frozen(const char *cgroup, bool frozen)
+{
+	const char *expected = frozen ? "frozen 1" : "frozen 0";
+
+	if (cg_read_strstr(cgroup, "cgroup.events", expected) == 0)
+		return 0;
+
+	if (frozen)
+		debug("Cgroup %s isn't frozen\n", cgroup);
+	else
+		debug("Cgroup %s is frozen\n", cgroup);
+
+	return -1;
+}
+
+/*
+ * Request a freeze ("1") or thaw ("0") by writing to the cgroup's
+ * cgroup.freeze file, without waiting for the state change. Returns the
+ * result of cg_write().
+ */
+static int cg_freeze_nowait(const char *cgroup, bool freeze)
+{
+	char *value = freeze ? "1" : "0";
+
+	return cg_write(cgroup, "cgroup.freeze", value);
+}
+
+/*
+ * Prepare for waiting on cgroup.events file.
+ *
+ * Creates an inotify instance watching the cgroup's cgroup.events file for
+ * modifications. Returns the inotify fd (to be closed by the caller) or -1
+ * on failure.
+ */
+static int cg_prepare_for_wait(const char *cgroup)
+{
+	char *events;
+	int fd, wd;
+
+	fd = inotify_init1(0);
+	if (fd == -1) {
+		debug("Error: inotify_init1() failed\n");
+		return fd;
+	}
+
+	/* cg_control() hands back an allocated path that must be freed. */
+	events = cg_control(cgroup, "cgroup.events");
+	if (!events) {
+		debug("Error: cg_control() failed\n");
+		close(fd);
+		return -1;
+	}
+
+	wd = inotify_add_watch(fd, events, IN_MODIFY);
+	free(events);
+	if (wd == -1) {
+		debug("Error: inotify_add_watch() failed\n");
+		close(fd);
+		fd = -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Wait for an event. If there are no events for 10 seconds,
+ * treat this as an error.
+ *
+ * Returns 0 once @fd is readable and -1 on timeout, on a poll() failure
+ * other than EINTR, or if poll() reports the fd without POLLIN.
+ */
+static int cg_wait_for(int fd)
+{
+	int ret = -1;
+	struct pollfd fds = {
+		.fd = fd,
+		.events = POLLIN,
+	};
+
+	while (true) {
+		ret = poll(&fds, 1, 10000);
+
+		if (ret == -1) {
+			/* Interrupted by a signal: simply wait again. */
+			if (errno == EINTR)
+				continue;
+			debug("Error: poll() failed\n");
+			break;
+		}
+
+		/* poll() returns 0 when the timeout expired without events. */
+		if (ret == 0) {
+			debug("Error: timed out waiting for an event\n");
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * A wake-up without POLLIN (e.g. an error condition) would
+		 * be reported again immediately, so fail instead of spinning.
+		 */
+		ret = (fds.revents & POLLIN) ? 0 : -1;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Attach a task to the given cgroup and wait for a cgroup frozen event.
+ * All transient events (e.g. populated) are ignored: after each event the
+ * frozen state is re-checked, for at most 10 events.
+ *
+ * Returns 0 as soon as the cgroup reports the expected state, a negative
+ * value if the watch cannot be set up, or non-zero if attaching fails,
+ * waiting fails or the state never matches.
+ */
+static int cg_enter_and_wait_for_frozen(const char *cgroup, int pid,
+					bool frozen)
+{
+	int fd, ret = -1;
+	int attempts;
+
+	fd = cg_prepare_for_wait(cgroup);
+	if (fd < 0)
+		return fd;
+
+	ret = cg_enter(cgroup, pid);
+	if (ret)
+		goto out;
+
+	for (attempts = 0; attempts < 10; attempts++) {
+		ret = cg_wait_for(fd);
+		if (ret)
+			break;
+
+		/* Stop at the first event that leaves the expected state. */
+		ret = cg_check_frozen(cgroup, frozen);
+		if (!ret)
+			break;
+	}
+
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * Freeze or thaw the cgroup, wait for a modification event on its
+ * cgroup.events file and then verify that the requested state was reached.
+ * Returns 0 on success, non-zero on any failure along the way.
+ */
+static int cg_freeze_wait(const char *cgroup, bool freeze)
+{
+	int rc;
+	int fd = cg_prepare_for_wait(cgroup);
+
+	if (fd < 0)
+		return fd;
+
+	rc = cg_freeze_nowait(cgroup, freeze);
+	if (rc) {
+		debug("Error: cg_freeze_nowait() failed\n");
+	} else {
+		rc = cg_wait_for(fd);
+		if (!rc)
+			rc = cg_check_frozen(cgroup, freeze);
+	}
+
+	close(fd);
+	return rc;
+}
+
+/*
+ * Child body: poll once per millisecond until the parent pid changes
+ * (i.e. the process was re-parented), then report whether the parent is
+ * still the original one.
+ */
+static int child_fn(const char *cgroup, void *arg)
+{
+	const int orig_parent = getppid();
+
+	for (;;) {
+		if (getppid() != orig_parent)
+			break;
+		usleep(1000);
+	}
+
+	return getppid() == orig_parent;
+}
+
+/*
+ * Basic freezer test: populate a cgroup with 100 running processes, check
+ * that it starts out unfrozen, freeze it and thaw it again. Cleanup
+ * destroys the cgroup.
+ */
+static int test_cgfreezer_simple(const char *root)
+{
+	char *cgroup;
+	int nr;
+	int ret = KSFT_FAIL;
+
+	cgroup = cg_name(root, "cg_test_simple");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	for (nr = 0; nr < 100; nr++)
+		cg_run_nowait(cgroup, child_fn, NULL);
+
+	if (cg_wait_for_proc_count(cgroup, 100) ||
+	    cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true) || cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * The test creates the following hierarchy:
+ *        A
+ *    / / \ \
+ *   B  E  I K
+ *  /\  |
+ * C  D F
+ *      |
+ *      G
+ *      |
+ *      H
+ *
+ * with a process in C, H and 3 processes in K.
+ * Then it tries to freeze and unfreeze the whole tree.
+ */
+static int test_cgfreezer_tree(const char *root)
+{
+	/*
+	 * Layout of the tree: .parent is an index into cgroup[], or -1 for
+	 * a direct child of @root. Parents always precede their children.
+	 */
+	static const struct {
+		int parent;
+		const char *name;
+	} nodes[10] = {
+		{ -1, "cg_test_tree_A" },	/* 0: A */
+		{ 0, "B" },			/* 1: B */
+		{ 1, "C" },			/* 2: C */
+		{ 1, "D" },			/* 3: D */
+		{ 0, "E" },			/* 4: E */
+		{ 4, "F" },			/* 5: F */
+		{ 5, "G" },			/* 6: G */
+		{ 6, "H" },			/* 7: H */
+		{ 0, "I" },			/* 8: I */
+		{ 0, "K" },			/* 9: K */
+	};
+	char *cgroup[10] = {0};
+	int ret = KSFT_FAIL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		const char *base = nodes[i].parent < 0 ?
+				   root : cgroup[nodes[i].parent];
+
+		cgroup[i] = cg_name(base, nodes[i].name);
+		if (!cgroup[i])
+			goto cleanup;
+	}
+
+	for (i = 0; i < 10; i++)
+		if (cg_create(cgroup[i]))
+			goto cleanup;
+
+	cg_run_nowait(cgroup[2], child_fn, NULL);
+	cg_run_nowait(cgroup[7], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+	cg_run_nowait(cgroup[9], child_fn, NULL);
+
+	/*
+	 * Wait until all child processes will enter
+	 * corresponding cgroups.
+	 */
+
+	if (cg_wait_for_proc_count(cgroup[2], 1) ||
+	    cg_wait_for_proc_count(cgroup[7], 1) ||
+	    cg_wait_for_proc_count(cgroup[9], 3))
+		goto cleanup;
+
+	/*
+	 * Freeze B.
+	 */
+	if (cg_freeze_wait(cgroup[1], true))
+		goto cleanup;
+
+	/*
+	 * Freeze F.
+	 */
+	if (cg_freeze_wait(cgroup[5], true))
+		goto cleanup;
+
+	/*
+	 * Freeze G.
+	 */
+	if (cg_freeze_wait(cgroup[6], true))
+		goto cleanup;
+
+	/*
+	 * Check that A and E are not frozen.
+	 */
+	if (cg_check_frozen(cgroup[0], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[4], false))
+		goto cleanup;
+
+	/*
+	 * Freeze A. Check that A, B and E are frozen.
+	 */
+	if (cg_freeze_wait(cgroup[0], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[1], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[4], true))
+		goto cleanup;
+
+	/*
+	 * Unfreeze B, F and G
+	 */
+	if (cg_freeze_nowait(cgroup[1], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[5], false))
+		goto cleanup;
+
+	if (cg_freeze_nowait(cgroup[6], false))
+		goto cleanup;
+
+	/*
+	 * Check that C and H are still frozen.
+	 */
+	if (cg_check_frozen(cgroup[2], true))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[7], true))
+		goto cleanup;
+
+	/*
+	 * Unfreeze A. Check that A, C and K are not frozen.
+	 */
+	if (cg_freeze_wait(cgroup[0], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[2], false))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup[9], false))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	/*
+	 * Tear down children before parents. Entries that were never
+	 * allocated are skipped rather than ending the loop, so a partial
+	 * setup failure still releases everything created before it.
+	 */
+	for (i = 9; i >= 0; i--) {
+		if (!cgroup[i])
+			continue;
+		cg_destroy(cgroup[i]);
+		free(cgroup[i]);
+	}
+
+	return ret;
+}
+
+/*
+ * A fork bomb emulator: two consecutive fork() calls multiply the caller,
+ * then each resulting process polls once per millisecond until its parent
+ * pid changes.
+ */
+static int forkbomb_fn(const char *cgroup, void *arg)
+{
+	int orig_parent;
+
+	fork();
+	fork();
+
+	/* Sample the parent only after forking: each copy has its own. */
+	orig_parent = getppid();
+
+	for (;;) {
+		if (getppid() != orig_parent)
+			break;
+		usleep(1000);
+	}
+
+	return getppid() == orig_parent;
+}
+
+/*
+ * Start a fork bomb inside a cgroup, give it 100ms to spread, freeze the
+ * cgroup, kill everything in it and check that the process count drops to
+ * zero.
+ */
+static int test_cgfreezer_forkbomb(const char *root)
+{
+	char *cgroup;
+	int ret = KSFT_FAIL;
+
+	cgroup = cg_name(root, "cg_forkbomb_test");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	cg_run_nowait(cgroup, forkbomb_fn, NULL);
+
+	usleep(100000);
+
+	if (cg_freeze_wait(cgroup, true) || cg_killall(cgroup))
+		goto cleanup;
+
+	if (!cg_wait_for_proc_count(cgroup, 0))
+		ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Freeze a cgroup, then create a child cgroup underneath it and start a
+ * task in the child. Both the child and the parent must report the frozen
+ * state afterwards.
+ */
+static int test_cgfreezer_mkdir(const char *root)
+{
+	char *parent, *child = NULL;
+	int ret = KSFT_FAIL;
+	int pid;
+
+	parent = cg_name(root, "cg_test_mkdir_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_mkdir_B");
+	if (!child)
+		goto cleanup;
+
+	/* The parent is frozen before the child cgroup exists. */
+	if (cg_create(parent) || cg_freeze_wait(parent, true) ||
+	    cg_create(child))
+		goto cleanup;
+
+	pid = cg_run_nowait(child, child_fn, NULL);
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(child, 1))
+		goto cleanup;
+
+	if (cg_check_frozen(child, true) || cg_check_frozen(parent, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * Create two nested cgroups, freeze the parent and remove the child. The
+ * parent must stay frozen, and a child re-created without thawing the
+ * parent must come up frozen as well.
+ */
+static int test_cgfreezer_rmdir(const char *root)
+{
+	char *parent, *child = NULL;
+	int ret = KSFT_FAIL;
+
+	parent = cg_name(root, "cg_test_rmdir_A");
+	if (!parent)
+		goto cleanup;
+
+	child = cg_name(parent, "cg_test_rmdir_B");
+	if (!child)
+		goto cleanup;
+
+	if (cg_create(parent) || cg_create(child))
+		goto cleanup;
+
+	if (cg_freeze_wait(parent, true))
+		goto cleanup;
+
+	/* Removing the child must not thaw the parent. */
+	if (cg_destroy(child) || cg_check_frozen(parent, true))
+		goto cleanup;
+
+	/* A fresh child below a frozen parent starts out frozen. */
+	if (cg_create(child) || cg_check_frozen(child, true))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (child)
+		cg_destroy(child);
+	free(child);
+	if (parent)
+		cg_destroy(parent);
+	free(parent);
+	return ret;
+}
+
+/*
+ * The test creates two cgroups: A and B, runs a process in A
+ * and performs several migrations:
+ * 1) A (running) -> B (frozen)
+ * 2) B (frozen) -> A (running)
+ * 3) A (frozen) -> B (frozen)
+ *
+ * On each step it checks the actual state of both cgroups.
+ *
+ * In the code below A is cgroup[0] and B is cgroup[1].
+ *
+ * @root: cgroup root handed in by main(); both test cgroups are named
+ *        relative to it.
+ *
+ * Returns KSFT_PASS when every step succeeds and KSFT_FAIL otherwise.
+ */
+static int test_cgfreezer_migrate(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup[2] = {0};
+ int pid;
+
+ cgroup[0] = cg_name(root, "cg_test_migrate_A");
+ if (!cgroup[0])
+ goto cleanup;
+
+ cgroup[1] = cg_name(root, "cg_test_migrate_B");
+ if (!cgroup[1])
+ goto cleanup;
+
+ if (cg_create(cgroup[0]))
+ goto cleanup;
+
+ if (cg_create(cgroup[1]))
+ goto cleanup;
+
+ pid = cg_run_nowait(cgroup[0], child_fn, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_wait_for_proc_count(cgroup[0], 1))
+ goto cleanup;
+
+ /*
+ * Migrate from A (running) to B (frozen)
+ */
+ if (cg_freeze_wait(cgroup[1], true))
+ goto cleanup;
+
+ if (cg_enter_and_wait_for_frozen(cgroup[1], pid, true))
+ goto cleanup;
+
+ if (cg_check_frozen(cgroup[0], false))
+ goto cleanup;
+
+ /*
+ * Migrate from B (frozen) to A (running)
+ */
+ if (cg_enter_and_wait_for_frozen(cgroup[0], pid, false))
+ goto cleanup;
+
+ if (cg_check_frozen(cgroup[1], true))
+ goto cleanup;
+
+ /*
+ * Migrate from A (frozen) to B (frozen)
+ */
+ if (cg_freeze_wait(cgroup[0], true))
+ goto cleanup;
+
+ if (cg_enter_and_wait_for_frozen(cgroup[1], pid, true))
+ goto cleanup;
+
+ if (cg_check_frozen(cgroup[0], true))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup[0])
+ cg_destroy(cgroup[0]);
+ free(cgroup[0]);
+ if (cgroup[1])
+ cg_destroy(cgroup[1]);
+ free(cgroup[1]);
+ return ret;
+}
+
+/*
+ * The test checks that ptrace works with a tracing process in a frozen cgroup.
+ *
+ * Sequence: run one task in a new cgroup, freeze the cgroup, then
+ * PTRACE_SEIZE and PTRACE_INTERRUPT the task from this process, wait for
+ * it, read its siginfo and detach. The cgroup is checked to be frozen
+ * both while the task is traced and after the detach.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS when every step succeeds and KSFT_FAIL otherwise.
+ */
+static int test_cgfreezer_ptrace(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+ siginfo_t siginfo;
+ int pid;
+
+ cgroup = cg_name(root, "cg_test_ptrace");
+ if (!cgroup)
+ goto cleanup;
+
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ pid = cg_run_nowait(cgroup, child_fn, NULL);
+ if (pid < 0)
+ goto cleanup;
+
+ if (cg_wait_for_proc_count(cgroup, 1))
+ goto cleanup;
+
+ if (cg_freeze_wait(cgroup, true))
+ goto cleanup;
+
+ if (ptrace(PTRACE_SEIZE, pid, NULL, NULL))
+ goto cleanup;
+
+ if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL))
+ goto cleanup;
+
+ /* The wait status is not inspected; only the wait itself matters. */
+ waitpid(pid, NULL, 0);
+
+ /*
+ * Cgroup has to remain frozen, however the test task
+ * is in traced state.
+ */
+ if (cg_check_frozen(cgroup, true))
+ goto cleanup;
+
+ if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo))
+ goto cleanup;
+
+ if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+ goto cleanup;
+
+ if (cg_check_frozen(cgroup, true))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * Check if the process is stopped.
+ *
+ * Reads the "stat" text of @pid and looks for the "(test_freezer) T "
+ * marker in it. Returns 0 when the marker is present, -1 when the stat
+ * text can't be read or the marker is missing.
+ */
+static int proc_check_stopped(int pid)
+{
+	char text[PAGE_SIZE];
+
+	if (proc_read_text(pid, 0, "stat", text, sizeof(text)) == -1) {
+		debug("Can't get %d stat\n", pid);
+		return -1;
+	}
+
+	if (strstr(text, "(test_freezer) T "))
+		return 0;
+
+	debug("Process %d in the unexpected state: %s\n", pid, text);
+	return -1;
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a stopped process.
+ *
+ * A task is started in a new cgroup and sent SIGSTOP. The cgroup is
+ * expected not to report frozen at that point, then to freeze and
+ * unfreeze on request, and the task is expected to still be stopped
+ * afterwards.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS when every step succeeds and KSFT_FAIL otherwise.
+ */
+static int test_cgfreezer_stopped(const char *root)
+{
+	int pid, ret = KSFT_FAIL;
+	char *cgroup = NULL;
+
+	cgroup = cg_name(root, "cg_test_stopped");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, child_fn, NULL);
+	/*
+	 * Bail out like the sibling tests do: a negative pid must never
+	 * reach kill(), where it would not address the test task.
+	 */
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup, 1))
+		goto cleanup;
+
+	if (kill(pid, SIGSTOP))
+		goto cleanup;
+
+	if (cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	/* Unfreezing must not have resumed the SIGSTOP'ed task. */
+	if (proc_check_stopped(pid))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a ptraced process.
+ *
+ * A task is started in a new cgroup, seized and interrupted with ptrace,
+ * and only then the cgroup is frozen and unfrozen again. Finally the
+ * task's siginfo is read and the tracer detaches.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS when every step succeeds and KSFT_FAIL otherwise.
+ */
+static int test_cgfreezer_ptraced(const char *root)
+{
+	int pid, ret = KSFT_FAIL;
+	char *cgroup = NULL;
+	siginfo_t siginfo;
+
+	cgroup = cg_name(root, "cg_test_ptraced");
+	if (!cgroup)
+		goto cleanup;
+
+	if (cg_create(cgroup))
+		goto cleanup;
+
+	pid = cg_run_nowait(cgroup, child_fn, NULL);
+	/* Same guard as the sibling tests: don't ptrace an invalid pid. */
+	if (pid < 0)
+		goto cleanup;
+
+	if (cg_wait_for_proc_count(cgroup, 1))
+		goto cleanup;
+
+	if (ptrace(PTRACE_SEIZE, pid, NULL, NULL))
+		goto cleanup;
+
+	if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL))
+		goto cleanup;
+
+	/* The wait status is not inspected; only the wait itself matters. */
+	waitpid(pid, NULL, 0);
+
+	if (cg_check_frozen(cgroup, false))
+		goto cleanup;
+
+	if (cg_freeze_wait(cgroup, true))
+		goto cleanup;
+
+	/*
+	 * cg_check_frozen(cgroup, true) will fail here,
+	 * because the task is in the TRACEd state.
+	 */
+	if (cg_freeze_wait(cgroup, false))
+		goto cleanup;
+
+	if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo))
+		goto cleanup;
+
+	if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+		goto cleanup;
+
+	ret = KSFT_PASS;
+
+cleanup:
+	if (cgroup)
+		cg_destroy(cgroup);
+	free(cgroup);
+	return ret;
+}
+
+/*
+ * Callback run inside the test cgroup: vfork() a child that sleeps in an
+ * endless loop. Only the vfork() caller ever returns from here, and it
+ * returns vfork()'s result. Both parameters are unused.
+ */
+static int vfork_fn(const char *cgroup, void *arg)
+{
+	int child = vfork();
+
+	if (child != 0)
+		return child;
+
+	/* vfork child: never exits or execs on its own. */
+	for (;;)
+		sleep(1);
+}
+
+/*
+ * Test that it's possible to freeze a cgroup with a process,
+ * which called vfork() and is waiting for a child.
+ *
+ * The test waits until the cgroup holds two processes (the vfork()
+ * caller and its child) and then requests the freeze.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS when every step succeeds and KSFT_FAIL otherwise.
+ */
+static int test_cgfreezer_vfork(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cgroup = NULL;
+
+ cgroup = cg_name(root, "cg_test_vfork");
+ if (!cgroup)
+ goto cleanup;
+
+ if (cg_create(cgroup))
+ goto cleanup;
+
+ /* Return value is not checked; the process-count wait below covers it. */
+ cg_run_nowait(cgroup, vfork_fn, NULL);
+
+ if (cg_wait_for_proc_count(cgroup, 2))
+ goto cleanup;
+
+ if (cg_freeze_wait(cgroup, true))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (cgroup)
+ cg_destroy(cgroup);
+ free(cgroup);
+ return ret;
+}
+
+/*
+ * Table of freezer test cases, run in this order by main().
+ * T(fn) expands to an initializer holding the function pointer and the
+ * function's name as a string; the macro is undefined right after use.
+ */
+#define T(x) { x, #x }
+struct cgfreezer_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_cgfreezer_simple),
+ T(test_cgfreezer_tree),
+ T(test_cgfreezer_forkbomb),
+ T(test_cgfreezer_mkdir),
+ T(test_cgfreezer_rmdir),
+ T(test_cgfreezer_migrate),
+ T(test_cgfreezer_ptrace),
+ T(test_cgfreezer_stopped),
+ T(test_cgfreezer_ptraced),
+ T(test_cgfreezer_vfork),
+};
+#undef T
+
+/*
+ * Locate the cgroup v2 root, run every entry of tests[] against it and
+ * report each outcome through the kselftest helpers. The exit status is
+ * EXIT_FAILURE if any test returned something other than KSFT_PASS or
+ * KSFT_SKIP, EXIT_SUCCESS otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	char root[PATH_MAX];
+	int status = EXIT_SUCCESS;
+	int idx;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		const char *name = tests[idx].name;
+		int result = tests[idx].fn(root);
+
+		if (result == KSFT_PASS) {
+			ksft_test_result_pass("%s\n", name);
+		} else if (result == KSFT_SKIP) {
+			ksft_test_result_skip("%s\n", name);
+		} else {
+			status = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", name);
+		}
+	}
+
+	return status;
+}
diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c
new file mode 100644
index 000000000..0941aa161
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_kmem.c
@@ -0,0 +1,450 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <sys/sysinfo.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+
+/*
+ * Memory cgroup charging and vmstat data aggregation is performed using
+ * percpu batches 32 pages big (look at MEMCG_CHARGE_BATCH). So the maximum
+ * discrepancy between charge and vmstat entries is number of cpus multiplied
+ * by 32 pages multiplied by 2.
+ *
+ * NOTE(review): 4096 is presumably the page size in bytes; it is hard-coded,
+ * so confirm the bound on systems with a different page size.
+ * get_nprocs() is called each time the macro is expanded.
+ */
+#define MAX_VMSTAT_ERROR (4096 * 32 * 2 * get_nprocs())
+
+
+/*
+ * stat() a series of distinct, long, non-existent path names. @arg is not
+ * a pointer: it carries the number of names to try. @cgroup is unused.
+ * The result of each stat() call is deliberately ignored. Always returns 0.
+ */
+static int alloc_dcache(const char *cgroup, void *arg)
+{
+	unsigned long count = (unsigned long)arg;
+	unsigned long n;
+	char path[128];
+	struct stat sb;
+
+	for (n = 0; n < count; n++) {
+		snprintf(path, sizeof(path),
+			 "/something-non-existent-with-a-long-name-%64lu-%d",
+			 n, getpid());
+		stat(path, &sb);
+	}
+
+	return 0;
+}
+
+/*
+ * This test allocates 100000 of negative dentries with long names.
+ * Then it checks that "slab" in memory.stat is larger than 1M.
+ * Then it sets memory.high to 1M and checks that at least 1/2
+ * of slab memory has been reclaimed.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise, including when
+ * memory.high can't be written.
+ */
+static int test_kmem_basic(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg = NULL;
+	long slab0, slab1, current;
+
+	cg = cg_name(root, "kmem_basic_test");
+	if (!cg)
+		goto cleanup;
+
+	if (cg_create(cg))
+		goto cleanup;
+
+	if (cg_run(cg, alloc_dcache, (void *)100000))
+		goto cleanup;
+
+	slab0 = cg_read_key_long(cg, "memory.stat", "slab ");
+	if (slab0 < (1 << 20))
+		goto cleanup;
+
+	/*
+	 * Without the limit in place the numbers read below say nothing
+	 * about reclaim, so a failed write fails the test right away.
+	 */
+	if (cg_write(cg, "memory.high", "1M"))
+		goto cleanup;
+
+	slab1 = cg_read_key_long(cg, "memory.stat", "slab ");
+	if (slab1 <= 0)
+		goto cleanup;
+
+	current = cg_read_long(cg, "memory.current");
+	if (current <= 0)
+		goto cleanup;
+
+	if (slab1 < slab0 / 2 && current < slab0 / 2)
+		ret = KSFT_PASS;
+cleanup:
+	cg_destroy(cg);
+	free(cg);
+
+	return ret;
+}
+
+/*
+ * Thread entry point: runs alloc_dcache() with a fixed count of 100.
+ * @arg is ignored. Always returns NULL.
+ */
+static void *alloc_kmem_fn(void *arg)
+{
+ alloc_dcache(NULL, (void *)100);
+ return NULL;
+}
+
+/*
+ * Start 2 * get_nprocs() threads running alloc_kmem_fn() and join them.
+ * Both parameters are unused.
+ *
+ * Returns 0 when all threads were created and joined, -1 when the thread
+ * array can't be allocated or a thread can't be created, or the first
+ * non-zero pthread_join() result.
+ */
+static int alloc_kmem_smp(const char *cgroup, void *arg)
+{
+	const int nr_threads = 2 * get_nprocs();
+	pthread_t *threads;
+	unsigned long t;
+	int err = -1;
+
+	threads = calloc(nr_threads, sizeof(*threads));
+	if (!threads)
+		return -1;
+
+	for (t = 0; t < nr_threads; t++)
+		if (pthread_create(&threads[t], NULL, alloc_kmem_fn,
+				   (void *)t))
+			goto out;
+
+	for (t = 0; t < nr_threads; t++) {
+		err = pthread_join(threads[t], NULL);
+		if (err)
+			break;
+	}
+
+out:
+	free(threads);
+	return err;
+}
+
+/*
+ * Run @fn in @times freshly created child cgroups of @parent, one after
+ * another. Each child is named via cg_name_indexed(parent, "child", i),
+ * and is destroyed again before the next one is created.
+ *
+ * @parent: cgroup under which the children are created.
+ * @fn:     callback executed through cg_run() in each child.
+ * @arg:    opaque argument forwarded unchanged to @fn.
+ * @times:  number of child cgroups to go through.
+ *
+ * Returns 0 if every child was created and @fn succeeded in it, -1 on the
+ * first failure (name allocation, cgroup creation or the callback).
+ */
+static int cg_run_in_subcgroups(const char *parent,
+				int (*fn)(const char *cgroup, void *arg),
+				void *arg, int times)
+{
+	char *child;
+	int i, ret;
+
+	for (i = 0; i < times; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		if (!child)
+			return -1;
+
+		ret = cg_create(child);
+		if (!ret)
+			/* Forward the caller's argument to the callback. */
+			ret = cg_run(child, fn, arg);
+
+		/* The child is removed whether or not the steps succeeded. */
+		cg_destroy(child);
+		free(child);
+
+		if (ret)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * The test creates and destroys a large number of cgroups. In each cgroup it
+ * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS
+ * threads. Then it checks the sanity of numbers on the parent level:
+ * the total size of the cgroups should be roughly equal to
+ * anon + file + slab + kernel_stack.
+ *
+ * "Roughly" means the two values differ by less than MAX_VMSTAT_ERROR.
+ * On a mismatch the individual counters are printed to stdout.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_kmem_memcg_deletion(const char *root)
+{
+	long current, slab, anon, file, kernel_stack, sum;
+	int ret = KSFT_FAIL;
+	char *parent;
+
+	parent = cg_name(root, "kmem_memcg_deletion_test");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	if (cg_run_in_subcgroups(parent, alloc_kmem_smp, NULL, 100))
+		goto cleanup;
+
+	current = cg_read_long(parent, "memory.current");
+	slab = cg_read_key_long(parent, "memory.stat", "slab ");
+	anon = cg_read_key_long(parent, "memory.stat", "anon ");
+	file = cg_read_key_long(parent, "memory.stat", "file ");
+	kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack ");
+	if (current < 0 || slab < 0 || anon < 0 || file < 0 ||
+	    kernel_stack < 0)
+		goto cleanup;
+
+	sum = slab + anon + file + kernel_stack;
+	/* labs(), not abs(): the difference is a long and must not be
+	 * narrowed to int before the comparison.
+	 */
+	if (labs(sum - current) < MAX_VMSTAT_ERROR) {
+		ret = KSFT_PASS;
+	} else {
+		printf("memory.current = %ld\n", current);
+		printf("slab + anon + file + kernel_stack = %ld\n", sum);
+		printf("slab = %ld\n", slab);
+		printf("anon = %ld\n", anon);
+		printf("file = %ld\n", file);
+		printf("kernel_stack = %ld\n", kernel_stack);
+	}
+
+cleanup:
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * The test reads the entire /proc/kpagecgroup. If the operation went
+ * successfully (and the kernel didn't panic), the test is treated as passed.
+ *
+ * Returns KSFT_PASS when the file was read up to end-of-file, KSFT_FAIL
+ * when it can't be opened or a read fails. @root is unused.
+ */
+static int test_kmem_proc_kpagecgroup(const char *root)
+{
+	unsigned long chunk[128];
+	ssize_t nread;
+	int fd, ret;
+
+	fd = open("/proc/kpagecgroup", O_RDONLY);
+	if (fd < 0)
+		return KSFT_FAIL;
+
+	/* Drain the file; the data itself is discarded. */
+	while ((nread = read(fd, chunk, sizeof(chunk))) > 0)
+		;
+
+	ret = (nread == 0) ? KSFT_PASS : KSFT_FAIL;
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Thread entry point that only sleeps for 100 seconds, keeping the thread
+ * alive while its creator inspects the cgroup. @arg is ignored.
+ */
+static void *pthread_wait_fn(void *arg)
+{
+ sleep(100);
+ return NULL;
+}
+
+/*
+ * Create 1000 threads running pthread_wait_fn() and, while they exist,
+ * read "kernel_stack " from @cgroup's memory.stat. The threads are not
+ * joined. @arg is unused.
+ *
+ * Returns 0 when the value read is at least 4096 * 1000, -1 otherwise or
+ * when the thread array or a thread can't be created.
+ */
+static int spawn_1000_threads(const char *cgroup, void *arg)
+{
+	const int nr_threads = 1000;
+	pthread_t *threads;
+	unsigned long n;
+	long kstack;
+	int ret = -1;
+
+	threads = calloc(nr_threads, sizeof(*threads));
+	if (!threads)
+		return -1;
+
+	for (n = 0; n < nr_threads; n++)
+		if (pthread_create(&threads[n], NULL, pthread_wait_fn,
+				   (void *)n))
+			goto out;
+
+	kstack = cg_read_key_long(cgroup, "memory.stat", "kernel_stack ");
+	if (kstack >= 4096 * 1000)
+		ret = 0;
+
+out:
+	free(threads);
+	return ret;
+}
+
+/*
+ * The test spawns a process, which spawns 1000 threads. Then it checks
+ * that memory.stat's kernel_stack is at least 1000 pages large.
+ *
+ * The check itself is done by spawn_1000_threads() inside the spawned
+ * process; this function only turns its result into a test verdict.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_kmem_kernel_stacks(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *cg = NULL;
+
+ cg = cg_name(root, "kmem_kernel_stacks_test");
+ if (!cg)
+ goto cleanup;
+
+ if (cg_create(cg))
+ goto cleanup;
+
+ if (cg_run(cg, spawn_1000_threads, NULL))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+cleanup:
+ cg_destroy(cg);
+ free(cg);
+
+ return ret;
+}
+
+/*
+ * This test sequentially creates 30 child cgroups, allocates some
+ * kernel memory in each of them, and deletes them. Then it checks
+ * that the number of dying cgroups on the parent level is 0.
+ *
+ * The counter is polled up to 5 times, one second apart, before the
+ * test gives up.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_kmem_dead_cgroups(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent;
+ long dead;
+ int i;
+
+ parent = cg_name(root, "kmem_dead_cgroups_test");
+ if (!parent)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
+ goto cleanup;
+
+ for (i = 0; i < 5; i++) {
+ dead = cg_read_key_long(parent, "cgroup.stat",
+ "nr_dying_descendants ");
+ if (dead == 0) {
+ ret = KSFT_PASS;
+ break;
+ }
+ /*
+ * Reclaiming cgroups might take some time,
+ * let's wait a bit and repeat.
+ */
+ sleep(1);
+ }
+
+cleanup:
+ cg_destroy(parent);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * This test creates a sub-tree with 1000 memory cgroups.
+ * Then it checks that the memory.current on the parent level
+ * is greater than 0 and approximately matches the percpu value
+ * from memory.stat.
+ *
+ * "Approximately" means the two values differ by less than
+ * MAX_VMSTAT_ERROR. On a mismatch both values are printed to stdout.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise. Every exit path
+ * goes through the cleanup labels, so the children that were created and
+ * the parent are always removed.
+ */
+static int test_percpu_basic(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *parent, *child;
+	long current, percpu;
+	int i;
+
+	parent = cg_name(root, "percpu_basic_test");
+	if (!parent)
+		goto cleanup;
+
+	if (cg_create(parent))
+		goto cleanup;
+
+	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+		goto cleanup;
+
+	for (i = 0; i < 1000; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		/* Don't return directly: earlier children and the parent
+		 * still have to be torn down.
+		 */
+		if (!child)
+			goto cleanup_children;
+
+		if (cg_create(child)) {
+			free(child);
+			goto cleanup_children;
+		}
+
+		free(child);
+	}
+
+	current = cg_read_long(parent, "memory.current");
+	percpu = cg_read_key_long(parent, "memory.stat", "percpu ");
+
+	/* labs(), not abs(): the difference is a long. */
+	if (current > 0 && percpu > 0 &&
+	    labs(current - percpu) < MAX_VMSTAT_ERROR)
+		ret = KSFT_PASS;
+	else
+		printf("memory.current %ld\npercpu %ld\n",
+		       current, percpu);
+
+cleanup_children:
+	for (i = 0; i < 1000; i++) {
+		child = cg_name_indexed(parent, "child", i);
+		/* Skip a name that could not be built instead of handing
+		 * NULL to cg_destroy().
+		 */
+		if (!child)
+			continue;
+
+		cg_destroy(child);
+		free(child);
+	}
+
+cleanup:
+	cg_destroy(parent);
+	free(parent);
+
+	return ret;
+}
+
+/*
+ * Table of kmem test cases, run in this order by main().
+ * T(fn) expands to an initializer holding the function pointer and the
+ * function's name as a string; the macro is undefined right after use.
+ */
+#define T(x) { x, #x }
+struct kmem_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_kmem_basic),
+ T(test_kmem_memcg_deletion),
+ T(test_kmem_proc_kpagecgroup),
+ T(test_kmem_kernel_stacks),
+ T(test_kmem_dead_cgroups),
+ T(test_percpu_basic),
+};
+#undef T
+
+/*
+ * Locate the cgroup v2 root, make sure the memory controller is usable
+ * there, then run every entry of tests[] and report each outcome through
+ * the kselftest helpers. The exit status is EXIT_FAILURE if any test
+ * returned something other than KSFT_PASS or KSFT_SKIP.
+ */
+int main(int argc, char **argv)
+{
+	char root[PATH_MAX];
+	int status = EXIT_SUCCESS;
+	int idx;
+
+	if (cg_find_unified_root(root, sizeof(root)))
+		ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+	/*
+	 * Check that memory controller is available:
+	 * memory is listed in cgroup.controllers
+	 */
+	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+		ksft_exit_skip("memory controller isn't available\n");
+
+	/* Enable the controller for children only if it isn't already. */
+	if (cg_read_strstr(root, "cgroup.subtree_control", "memory") &&
+	    cg_write(root, "cgroup.subtree_control", "+memory"))
+		ksft_exit_skip("Failed to set memory controller\n");
+
+	for (idx = 0; idx < ARRAY_SIZE(tests); idx++) {
+		const char *name = tests[idx].name;
+		int result = tests[idx].fn(root);
+
+		if (result == KSFT_PASS) {
+			ksft_test_result_pass("%s\n", name);
+		} else if (result == KSFT_SKIP) {
+			ksft_test_result_skip("%s\n", name);
+		} else {
+			status = EXIT_FAILURE;
+			ksft_test_result_fail("%s\n", name);
+		}
+	}
+
+	return status;
+}
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
new file mode 100644
index 000000000..c19a97dd0
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -0,0 +1,1228 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+
+#include <linux/limits.h>
+#include <linux/oom.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <errno.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+/*
+ * This test creates two nested cgroups with and without enabling
+ * the memory controller.
+ *
+ * In the first pair "+memory" is written to the parent's
+ * cgroup.subtree_control before the child is created; in the second pair
+ * it is not. The child's cgroup.controllers is then checked for "memory"
+ * in both cases, with opposite expectations.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise. The cleanup labels
+ * fall through, so a jump to one of them also runs all labels below it.
+ */
+static int test_memcg_subtree_control(const char *root)
+{
+ char *parent, *child, *parent2 = NULL, *child2 = NULL;
+ int ret = KSFT_FAIL;
+ char buf[PAGE_SIZE];
+
+ /* Create two nested cgroups with the memory controller enabled */
+ parent = cg_name(root, "memcg_test_0");
+ child = cg_name(root, "memcg_test_0/memcg_test_1");
+ if (!parent || !child)
+ goto cleanup_free;
+
+ if (cg_create(parent))
+ goto cleanup_free;
+
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup_parent;
+
+ if (cg_create(child))
+ goto cleanup_parent;
+
+ if (cg_read_strstr(child, "cgroup.controllers", "memory"))
+ goto cleanup_child;
+
+ /* Create two nested cgroups without enabling memory controller */
+ parent2 = cg_name(root, "memcg_test_1");
+ child2 = cg_name(root, "memcg_test_1/memcg_test_1");
+ if (!parent2 || !child2)
+ goto cleanup_free2;
+
+ if (cg_create(parent2))
+ goto cleanup_free2;
+
+ if (cg_create(child2))
+ goto cleanup_parent2;
+
+ if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
+ goto cleanup_all;
+
+ /* Here a zero result from cg_read_strstr() is treated as the failure. */
+ if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
+ goto cleanup_all;
+
+ ret = KSFT_PASS;
+
+cleanup_all:
+ cg_destroy(child2);
+cleanup_parent2:
+ cg_destroy(parent2);
+cleanup_free2:
+ free(parent2);
+ free(child2);
+cleanup_child:
+ cg_destroy(child);
+cleanup_parent:
+ cg_destroy(parent);
+cleanup_free:
+ free(parent);
+ free(child);
+
+ return ret;
+}
+
+/*
+ * Allocate 50M with malloc(), write one byte per PAGE_SIZE step and then
+ * compare @cgroup's counters against the allocation: memory.current must
+ * be at least the allocation size and close to it, and the "anon " value
+ * from memory.stat must be close to memory.current. @arg is unused.
+ *
+ * Returns 0 when all checks pass, -1 otherwise or when malloc() fails.
+ */
+static int alloc_anon_50M_check(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	char *buf, *ptr;
+	long anon, current;
+	int ret = -1;
+
+	buf = malloc(size);
+	/* The touch loop below would dereference a NULL buffer. */
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	current = cg_read_long(cgroup, "memory.current");
+	if (current < size)
+		goto cleanup;
+
+	if (!values_close(size, current, 3))
+		goto cleanup;
+
+	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
+	if (anon < 0)
+		goto cleanup;
+
+	if (!values_close(anon, current, 3))
+		goto cleanup;
+
+	ret = 0;
+cleanup:
+	free(buf);
+	return ret;
+}
+
+/*
+ * Fill 50M of pagecache through a temporary file and compare @cgroup's
+ * counters against it: memory.current must be at least 50M and the
+ * "file " value from memory.stat must be close to memory.current.
+ * @arg is unused.
+ *
+ * Returns 0 when all checks pass and -1 otherwise.
+ */
+static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	long usage, cached;
+	int ret = -1;
+	int fd;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		return -1;
+
+	if (!alloc_pagecache(fd, size)) {
+		usage = cg_read_long(cgroup, "memory.current");
+		if (!(usage < size)) {
+			cached = cg_read_key_long(cgroup, "memory.stat",
+						  "file ");
+			if (cached >= 0 && values_close(cached, usage, 10))
+				ret = 0;
+		}
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * This test creates a memory cgroup, allocates
+ * some anonymous memory and some pagecache
+ * and checks memory.current and some memory.stat values.
+ *
+ * memory.current of the fresh cgroup is required to read as 0 before
+ * anything is allocated in it.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_memcg_current(const char *root)
+{
+ int ret = KSFT_FAIL;
+ long current;
+ char *memcg;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ current = cg_read_long(memcg, "memory.current");
+ if (current != 0)
+ goto cleanup;
+
+ if (cg_run(memcg, alloc_anon_50M_check, NULL))
+ goto cleanup;
+
+ if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/*
+ * Fill 50M of pagecache through an already open file. @arg is not a
+ * pointer: it carries the file descriptor. @cgroup is unused.
+ * Returns whatever alloc_pagecache() returns.
+ */
+static int alloc_pagecache_50M(const char *cgroup, void *arg)
+{
+ int fd = (long)arg;
+
+ return alloc_pagecache(fd, MB(50));
+}
+
+/*
+ * Fill 50M of pagecache through the file descriptor carried in @arg, then
+ * stay alive, polling once a second, for as long as getppid() keeps
+ * returning the value seen on entry. @cgroup is unused.
+ *
+ * Returns -1 if the pagecache allocation fails, 0 once the parent pid
+ * has changed.
+ */
+static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
+{
+	const int parent = getppid();
+	int fd = (long)arg;
+
+	if (alloc_pagecache(fd, MB(50)))
+		return -1;
+
+	for (;;) {
+		if (getppid() != parent)
+			break;
+		sleep(1);
+	}
+
+	return 0;
+}
+
+/*
+ * Run alloc_anon() with the given arguments, then stay alive, polling
+ * once a second, for as long as getppid() keeps returning the value seen
+ * on entry.
+ *
+ * Returns -1 if alloc_anon() fails, 0 once the parent pid has changed.
+ */
+static int alloc_anon_noexit(const char *cgroup, void *arg)
+{
+	const int parent = getppid();
+
+	if (alloc_anon(cgroup, arg))
+		return -1;
+
+	for (;;) {
+		if (getppid() != parent)
+			break;
+		sleep(1);
+	}
+
+	return 0;
+}
+
+/*
+ * Wait until processes are killed asynchronously by the OOM killer
+ * If we exceed a timeout, fail.
+ *
+ * Polls cgroup.procs up to 10 times, 100000 microseconds apart.
+ * Returns 0 as soon as it compares equal to an empty string, -1 if it
+ * never does.
+ */
+static int cg_test_proc_killed(const char *cgroup)
+{
+	int tries_left = 10;
+
+	while (tries_left-- > 0) {
+		if (!cg_read_strcmp(cgroup, "cgroup.procs", ""))
+			return 0;
+
+		usleep(100000);
+	}
+
+	return -1;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A memory.min = 50M, memory.max = 200M
+ * A/B memory.min = 50M, memory.current = 50M
+ * A/B/C memory.min = 75M, memory.current = 50M
+ * A/B/D memory.min = 25M, memory.current = 50M
+ * A/B/E memory.min = 500M, memory.current = 0
+ * A/B/F memory.min = 0, memory.current = 50M
+ *
+ * Usages are pagecache, but the test keeps a running
+ * process in every leaf cgroup.
+ * Then it creates A/G and creates a significant
+ * memory pressure in it.
+ *
+ * A/B memory.current ~= 50M
+ * A/B/C memory.current ~= 33M
+ * A/B/D memory.current ~= 17M
+ * A/B/E memory.current ~= 0
+ *
+ * After that it tries to allocate more than there is
+ * unprotected memory in A available, and
+ * checks that memory.min protects pagecache even
+ * in this case.
+ *
+ * In the code, A is parent[0], A/B is parent[1], A/G is parent[2] and
+ * C, D, E, F are children[0..3]. No process is started in children[2] (E).
+ *
+ * Returns KSFT_PASS on success, KSFT_SKIP when memory.min of the fresh
+ * cgroup A does not read as 0, and KSFT_FAIL otherwise.
+ */
+static int test_memcg_min(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent[3] = {NULL};
+ char *children[4] = {NULL};
+ long c[4];
+ int i, attempts;
+ int fd;
+
+ fd = get_temp_fd();
+ if (fd < 0)
+ goto cleanup;
+
+ parent[0] = cg_name(root, "memcg_test_0");
+ if (!parent[0])
+ goto cleanup;
+
+ parent[1] = cg_name(parent[0], "memcg_test_1");
+ if (!parent[1])
+ goto cleanup;
+
+ parent[2] = cg_name(parent[0], "memcg_test_2");
+ if (!parent[2])
+ goto cleanup;
+
+ if (cg_create(parent[0]))
+ goto cleanup;
+
+ if (cg_read_long(parent[0], "memory.min")) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_write(parent[0], "memory.max", "200M"))
+ goto cleanup;
+
+ if (cg_write(parent[0], "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_create(parent[1]))
+ goto cleanup;
+
+ if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_create(parent[2]))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ children[i] = cg_name_indexed(parent[1], "child_memcg", i);
+ if (!children[i])
+ goto cleanup;
+
+ if (cg_create(children[i]))
+ goto cleanup;
+
+ if (i == 2)
+ continue;
+
+ cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
+ (void *)(long)fd);
+ }
+
+ if (cg_write(parent[0], "memory.min", "50M"))
+ goto cleanup;
+ if (cg_write(parent[1], "memory.min", "50M"))
+ goto cleanup;
+ if (cg_write(children[0], "memory.min", "75M"))
+ goto cleanup;
+ if (cg_write(children[1], "memory.min", "25M"))
+ goto cleanup;
+ if (cg_write(children[2], "memory.min", "500M"))
+ goto cleanup;
+ if (cg_write(children[3], "memory.min", "0"))
+ goto cleanup;
+
+ /* Give the background allocations a few seconds to reach ~150M. */
+ attempts = 0;
+ while (!values_close(cg_read_long(parent[1], "memory.current"),
+ MB(150), 3)) {
+ if (attempts++ > 5)
+ break;
+ sleep(1);
+ }
+
+ if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
+ goto cleanup;
+
+ if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(children); i++)
+ c[i] = cg_read_long(children[i], "memory.current");
+
+ if (!values_close(c[0], MB(33), 10))
+ goto cleanup;
+
+ if (!values_close(c[1], MB(17), 10))
+ goto cleanup;
+
+ if (!values_close(c[2], 0, 1))
+ goto cleanup;
+
+ /* This larger allocation is expected to fail; success is an error. */
+ if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
+ goto cleanup;
+
+ if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
+ if (!children[i])
+ continue;
+
+ cg_destroy(children[i]);
+ free(children[i]);
+ }
+
+ for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
+ if (!parent[i])
+ continue;
+
+ cg_destroy(parent[i]);
+ free(parent[i]);
+ }
+ close(fd);
+ return ret;
+}
+
+/*
+ * First, this test creates the following hierarchy:
+ * A memory.low = 50M, memory.max = 200M
+ * A/B memory.low = 50M, memory.current = 50M
+ * A/B/C memory.low = 75M, memory.current = 50M
+ * A/B/D memory.low = 25M, memory.current = 50M
+ * A/B/E memory.low = 500M, memory.current = 0
+ * A/B/F memory.low = 0, memory.current = 50M
+ *
+ * Usages are pagecache.
+ * Then it creates A/G and creates a significant
+ * memory pressure in it.
+ *
+ * Then it checks actual memory usages and expects that:
+ * A/B memory.current ~= 50M
+ * A/B/C memory.current ~= 33M
+ * A/B/D memory.current ~= 17M
+ * A/B/E memory.current ~= 0
+ *
+ * After that it tries to allocate more than there is
+ * unprotected memory in A available,
+ * and checks low and oom events in memory.events.
+ *
+ * In the code, A is parent[0], A/B is parent[1], A/G is parent[2] and
+ * C, D, E, F are children[0..3]. Nothing is allocated in children[2] (E).
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_memcg_low(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent[3] = {NULL};
+ char *children[4] = {NULL};
+ long low, oom;
+ long c[4];
+ int i;
+ int fd;
+
+ fd = get_temp_fd();
+ if (fd < 0)
+ goto cleanup;
+
+ parent[0] = cg_name(root, "memcg_test_0");
+ if (!parent[0])
+ goto cleanup;
+
+ parent[1] = cg_name(parent[0], "memcg_test_1");
+ if (!parent[1])
+ goto cleanup;
+
+ parent[2] = cg_name(parent[0], "memcg_test_2");
+ if (!parent[2])
+ goto cleanup;
+
+ if (cg_create(parent[0]))
+ goto cleanup;
+
+ if (cg_read_long(parent[0], "memory.low"))
+ goto cleanup;
+
+ if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_write(parent[0], "memory.max", "200M"))
+ goto cleanup;
+
+ if (cg_write(parent[0], "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_create(parent[1]))
+ goto cleanup;
+
+ if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ if (cg_create(parent[2]))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ children[i] = cg_name_indexed(parent[1], "child_memcg", i);
+ if (!children[i])
+ goto cleanup;
+
+ if (cg_create(children[i]))
+ goto cleanup;
+
+ if (i == 2)
+ continue;
+
+ if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
+ goto cleanup;
+ }
+
+ if (cg_write(parent[0], "memory.low", "50M"))
+ goto cleanup;
+ if (cg_write(parent[1], "memory.low", "50M"))
+ goto cleanup;
+ if (cg_write(children[0], "memory.low", "75M"))
+ goto cleanup;
+ if (cg_write(children[1], "memory.low", "25M"))
+ goto cleanup;
+ if (cg_write(children[2], "memory.low", "500M"))
+ goto cleanup;
+ if (cg_write(children[3], "memory.low", "0"))
+ goto cleanup;
+
+ if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
+ goto cleanup;
+
+ if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
+ goto cleanup;
+
+ for (i = 0; i < ARRAY_SIZE(children); i++)
+ c[i] = cg_read_long(children[i], "memory.current");
+
+ if (!values_close(c[0], MB(33), 10))
+ goto cleanup;
+
+ if (!values_close(c[1], MB(17), 10))
+ goto cleanup;
+
+ if (!values_close(c[2], 0, 1))
+ goto cleanup;
+
+ /* Unlike the memory.min test, this allocation is expected to succeed. */
+ if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
+ fprintf(stderr,
+ "memory.low prevents from allocating anon memory\n");
+ goto cleanup;
+ }
+
+ /*
+ * No child may report oom events; only children[0] and children[1]
+ * are expected to report low events.
+ */
+ for (i = 0; i < ARRAY_SIZE(children); i++) {
+ oom = cg_read_key_long(children[i], "memory.events", "oom ");
+ low = cg_read_key_long(children[i], "memory.events", "low ");
+
+ if (oom)
+ goto cleanup;
+ if (i < 2 && low <= 0)
+ goto cleanup;
+ if (i >= 2 && low)
+ goto cleanup;
+ }
+
+ ret = KSFT_PASS;
+
+cleanup:
+ for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
+ if (!children[i])
+ continue;
+
+ cg_destroy(children[i]);
+ free(children[i]);
+ }
+
+ for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
+ if (!parent[i])
+ continue;
+
+ cg_destroy(parent[i]);
+ free(parent[i]);
+ }
+ close(fd);
+ return ret;
+}
+
+/*
+ * Push 50M of pagecache through a temporary file and check that
+ * @cgroup's memory.current ends up above 29M and not above 30M.
+ * @arg is unused.
+ *
+ * Returns 0 when the check passes and -1 otherwise.
+ */
+static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
+{
+	size_t size = MB(50);
+	long usage;
+	int ret = -1;
+	int fd;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		return -1;
+
+	if (!alloc_pagecache(fd, size)) {
+		usage = cg_read_long(cgroup, "memory.current");
+		if (usage > MB(29) && usage <= MB(30))
+			ret = 0;
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * This test checks that memory.high limits the amount of
+ * memory which can be consumed by either anonymous memory
+ * or pagecache.
+ *
+ * memory.high is expected to read "max" initially and is then set to 30M
+ * with memory.swap.max set to 0. At the end the "high " counter in
+ * memory.events must be positive.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_memcg_high(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *memcg;
+ long high;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ if (cg_read_strcmp(memcg, "memory.high", "max\n"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.high", "30M"))
+ goto cleanup;
+
+ if (cg_run(memcg, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ /* The 50M pagecache check is expected to fail under the 30M limit. */
+ if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
+ goto cleanup;
+
+ if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
+ goto cleanup;
+
+ high = cg_read_key_long(memcg, "memory.events", "high ");
+ if (high <= 0)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/*
+ * This test checks that memory.max limits the amount of
+ * memory which can be consumed by either anonymous memory
+ * or pagecache.
+ *
+ * memory.max is expected to read "max" initially and is then set to 30M
+ * with memory.swap.max set to 0. At the end memory.current must be
+ * non-zero and at most 30M, and the "max " counter in memory.events must
+ * be positive.
+ *
+ * @root: cgroup root handed in by main().
+ *
+ * Returns KSFT_PASS on success and KSFT_FAIL otherwise.
+ */
+static int test_memcg_max(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *memcg;
+ long current, max;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ if (cg_read_strcmp(memcg, "memory.max", "max\n"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.max", "30M"))
+ goto cleanup;
+
+ /* Should be killed by OOM killer */
+ if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
+ goto cleanup;
+
+ current = cg_read_long(memcg, "memory.current");
+ if (current > MB(30) || !current)
+ goto cleanup;
+
+ max = cg_read_key_long(memcg, "memory.events", "max ");
+ if (max <= 0)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/*
+ * Allocate 50M with malloc(), write one byte per PAGE_SIZE step and then
+ * check @cgroup's counters: memory.current must be non-zero and close to
+ * the limit carried in @arg, and memory.swap.current must be non-zero
+ * with memory.current + memory.swap.current close to the allocation size.
+ *
+ * @arg is not a pointer: it carries the expected memory limit in bytes.
+ *
+ * Returns 0 when all checks pass, -1 otherwise or when malloc() fails.
+ */
+static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
+{
+	long mem_max = (long)arg;
+	size_t size = MB(50);
+	char *buf, *ptr;
+	long mem_current, swap_current;
+	int ret = -1;
+
+	buf = malloc(size);
+	/* The touch loop below would dereference a NULL buffer. */
+	if (buf == NULL) {
+		fprintf(stderr, "malloc() failed\n");
+		return -1;
+	}
+
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;
+
+	mem_current = cg_read_long(cgroup, "memory.current");
+	if (!mem_current || !values_close(mem_current, mem_max, 3))
+		goto cleanup;
+
+	swap_current = cg_read_long(cgroup, "memory.swap.current");
+	if (!swap_current ||
+	    !values_close(mem_current + swap_current, size, 3))
+		goto cleanup;
+
+	ret = 0;
+cleanup:
+	free(buf);
+	return ret;
+}
+
+/*
+ * This test checks that memory.swap.max limits the amount of
+ * anonymous memory which can be swapped out.
+ */
+static int test_memcg_swap_max(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *memcg;
+ long max;
+
+ /* Nothing to test without swap; no cgroup has been created yet. */
+ if (!is_swap_enabled())
+ return KSFT_SKIP;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ /* Any non-zero read of memory.swap.current skips the test. */
+ if (cg_read_long(memcg, "memory.swap.current")) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ /* Both limits must start out as "max" on a new cgroup. */
+ if (cg_read_strcmp(memcg, "memory.max", "max\n"))
+ goto cleanup;
+
+ if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "30M"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.max", "30M"))
+ goto cleanup;
+
+ /* Should be killed by OOM killer */
+ if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ /* Exactly one oom and one oom_kill event are expected. */
+ if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
+ goto cleanup;
+
+ if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
+ goto cleanup;
+
+ /* 50M allocation: the helper gets 30M as the expected memory.current. */
+ if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
+ goto cleanup;
+
+ max = cg_read_key_long(memcg, "memory.events", "max ");
+ if (max <= 0)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM. Then it checks for oom and oom_kill events in
+ * memory.events.
+ */
+static int test_memcg_oom_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *memcg;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.max", "30M"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "0"))
+ goto cleanup;
+
+ /* A zero return from the 100M allocation is a test failure. */
+ if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ /* cgroup.procs must read back as an empty string. */
+ if (cg_read_strcmp(memcg, "cgroup.procs", ""))
+ goto cleanup;
+
+ /* Exactly one oom and one oom_kill event are expected. */
+ if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
+ goto cleanup;
+
+ if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/* Arguments handed to tcp_server() through its void pointer argument. */
+struct tcp_server_args {
+ /* Port to bind to, in host byte order (tcp_server() applies htons()). */
+ unsigned short port;
+ /* pipe() pair: tcp_server() closes ctl[0] and writes status to ctl[1]. */
+ int ctl[2];
+};
+
+/*
+ * TCP server side of the socket accounting test.
+ *
+ * @cgroup: unused here
+ * @arg:    pointer to a struct tcp_server_args
+ *
+ * Closes ctl[0] and reports through ctl[1]: errno if bind() fails, 0 once
+ * the socket is listening. Then accepts one connection and keeps writing
+ * 1M buffers (contents unspecified) to it until write() fails.
+ *
+ * Returns 0 only if that final write() failed with ECONNRESET, -1 for any
+ * other outcome, including a failing accept().
+ */
+static int tcp_server(const char *cgroup, void *arg)
+{
+	struct tcp_server_args *srv_args = arg;
+	struct sockaddr_in6 saddr = { 0 };
+	socklen_t slen = sizeof(saddr);
+	int sk, client_sk, ctl_fd, yes = 1, ret = -1;
+
+	close(srv_args->ctl[0]);
+	ctl_fd = srv_args->ctl[1];
+
+	saddr.sin6_family = AF_INET6;
+	saddr.sin6_addr = in6addr_any;
+	saddr.sin6_port = htons(srv_args->port);
+
+	sk = socket(AF_INET6, SOCK_STREAM, 0);
+	if (sk < 0)
+		return ret;
+
+	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
+		goto cleanup;
+
+	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
+		/* Pass the bind() error to the reader of the control pipe. */
+		write(ctl_fd, &errno, sizeof(errno));
+		goto cleanup;
+	}
+
+	if (listen(sk, 1))
+		goto cleanup;
+
+	/* Report "listening" (0) through the control pipe. */
+	ret = 0;
+	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
+		ret = -1;
+		goto cleanup;
+	}
+
+	/*
+	 * Reset the result before accept(): previously a failing accept()
+	 * jumped to cleanup with ret still 0 and was reported as success.
+	 */
+	ret = -1;
+	client_sk = accept(sk, NULL, NULL);
+	if (client_sk < 0)
+		goto cleanup;
+
+	for (;;) {
+		uint8_t buf[0x100000];
+
+		if (write(client_sk, buf, sizeof(buf)) <= 0) {
+			if (errno == ECONNRESET)
+				ret = 0;
+			break;
+		}
+	}
+
+	close(client_sk);
+
+cleanup:
+	close(sk);
+	return ret;
+}
+
+/*
+ * TCP client side of the socket accounting test.
+ *
+ * @cgroup: cgroup whose memory.current and memory.stat "sock " are read
+ * @port:   port on localhost to connect to
+ *
+ * After connecting, reads up to 0x10 times from the socket; after each read
+ * it compares memory.current with the "sock " value from memory.stat.
+ *
+ * Returns KSFT_PASS as soon as values_close(current, sock, 10) holds.
+ * Returns the getaddrinfo() error if name resolution fails, -1 if connect()
+ * fails, and KSFT_FAIL otherwise (socket() failure, read failure, negative
+ * counters, current < sock, or retries exhausted).
+ */
+static int tcp_client(const char *cgroup, unsigned short port)
+{
+	const char server[] = "localhost";
+	struct addrinfo *ai;
+	char servport[6];
+	int retries = 0x10; /* nice round number */
+	int sk, ret;
+
+	/* port is unsigned; "%hd" would format ports above 32767 as negative. */
+	snprintf(servport, sizeof(servport), "%hu", port);
+	ret = getaddrinfo(server, servport, NULL, &ai);
+	if (ret)
+		return ret;
+
+	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
+	if (sk < 0) {
+		/*
+		 * ret is still 0 from getaddrinfo() and the caller compares
+		 * the result against KSFT_PASS (0 in kselftest.h), so a
+		 * failing socket() must not fall through with that value.
+		 */
+		ret = KSFT_FAIL;
+		goto free_ainfo;
+	}
+
+	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
+	if (ret < 0)
+		goto close_sk;
+
+	ret = KSFT_FAIL;
+	while (retries--) {
+		uint8_t buf[0x100000];
+		long current, sock;
+
+		if (read(sk, buf, sizeof(buf)) <= 0)
+			goto close_sk;
+
+		current = cg_read_long(cgroup, "memory.current");
+		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");
+
+		if (current < 0 || sock < 0)
+			goto close_sk;
+
+		if (current < sock)
+			goto close_sk;
+
+		if (values_close(current, sock, 10)) {
+			ret = KSFT_PASS;
+			break;
+		}
+	}
+
+close_sk:
+	close(sk);
+free_ainfo:
+	freeaddrinfo(ai);
+	return ret;
+}
+
+/*
+ * This test checks socket memory accounting.
+ * The test forks a TCP server that listens on a random port between 1000
+ * and 61000. Once it gets a client connection, it starts writing to
+ * its socket.
+ * The TCP client interleaves reads from the socket with checks of whether
+ * memory.current and the "sock" entry of memory.stat are similar.
+ */
+static int test_memcg_sock(const char *root)
+{
+ int bind_retries = 5, ret = KSFT_FAIL, pid, err;
+ unsigned short port;
+ char *memcg;
+
+ memcg = cg_name(root, "memcg_test");
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ /* Try up to five random ports until the server reports success. */
+ while (bind_retries--) {
+ struct tcp_server_args args;
+
+ if (pipe(args.ctl))
+ goto cleanup;
+
+ port = args.port = 1000 + rand() % 60000;
+
+ pid = cg_run_nowait(memcg, tcp_server, &args);
+ if (pid < 0)
+ goto cleanup;
+
+ /* Keep only the read end; the server writes its status to ctl[1]. */
+ close(args.ctl[1]);
+ if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
+ goto cleanup;
+ close(args.ctl[0]);
+
+ /* 0 means the server is listening; EADDRINUSE means retry. */
+ if (!err)
+ break;
+ if (err != EADDRINUSE)
+ goto cleanup;
+
+ waitpid(pid, NULL, 0);
+ }
+
+ /* Every attempt hit a busy port: skip instead of failing. */
+ if (err == EADDRINUSE) {
+ ret = KSFT_SKIP;
+ goto cleanup;
+ }
+
+ if (tcp_client(memcg, port) != KSFT_PASS)
+ goto cleanup;
+
+ /* err is reused here to hold the server's wait status. */
+ waitpid(pid, &err, 0);
+ if (WEXITSTATUS(err))
+ goto cleanup;
+
+ if (cg_read_long(memcg, "memory.current") < 0)
+ goto cleanup;
+
+ /* The "sock " entry of memory.stat must read 0 once the server is gone. */
+ if (cg_read_key_long(memcg, "memory.stat", "sock "))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes in the leaf (but not the parent) were killed.
+ */
+static int test_memcg_oom_group_leaf_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent, *child;
+
+ parent = cg_name(root, "memcg_test_0");
+ child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+ if (!parent || !child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_create(child))
+ goto cleanup;
+
+ /* Enable the memory controller for the parent's children. */
+ if (cg_write(parent, "cgroup.subtree_control", "+memory"))
+ goto cleanup;
+
+ /* All limits and the group-OOM flag are set on the child only. */
+ if (cg_write(child, "memory.max", "50M"))
+ goto cleanup;
+
+ if (cg_write(child, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(child, "memory.oom.group", "1"))
+ goto cleanup;
+
+ /* Background allocators: one in the parent, two in the child. */
+ cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+ /* A zero return from the 100M allocation in the child is a failure. */
+ if (!cg_run(child, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ if (cg_test_proc_killed(child))
+ goto cleanup;
+
+ /* The child must have recorded at least one oom_kill ... */
+ if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
+ goto cleanup;
+
+ /* ... and the parent's oom_kill counter must read 0. */
+ if (cg_read_key_long(parent, "memory.events", "oom_kill ") != 0)
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ /* Destroy the child before the parent that contains it. */
+ if (child)
+ cg_destroy(child);
+ if (parent)
+ cg_destroy(parent);
+ free(child);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes in the parent and leaf were killed.
+ */
+static int test_memcg_oom_group_parent_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *parent, *child;
+
+ parent = cg_name(root, "memcg_test_0");
+ child = cg_name(root, "memcg_test_0/memcg_test_1");
+
+ if (!parent || !child)
+ goto cleanup;
+
+ if (cg_create(parent))
+ goto cleanup;
+
+ if (cg_create(child))
+ goto cleanup;
+
+ /* Limits and the group-OOM flag are set on the parent this time. */
+ if (cg_write(parent, "memory.max", "80M"))
+ goto cleanup;
+
+ if (cg_write(parent, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(parent, "memory.oom.group", "1"))
+ goto cleanup;
+
+ /* Background allocators: one in the parent, two in the child. */
+ cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+ cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
+
+ /* A zero return from the 100M allocation in the child is a failure. */
+ if (!cg_run(child, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ /* Both cgroups are checked with cg_test_proc_killed(). */
+ if (cg_test_proc_killed(child))
+ goto cleanup;
+ if (cg_test_proc_killed(parent))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ /* Destroy the child before the parent that contains it. */
+ if (child)
+ cg_destroy(child);
+ if (parent)
+ cg_destroy(parent);
+ free(child);
+ free(parent);
+
+ return ret;
+}
+
+/*
+ * This test disables swapping and tries to allocate anonymous memory
+ * up to OOM with memory.oom.group set. Then it checks that all
+ * processes were killed except those set with OOM_SCORE_ADJ_MIN
+ */
+static int test_memcg_oom_group_score_events(const char *root)
+{
+ int ret = KSFT_FAIL;
+ char *memcg;
+ int safe_pid;
+
+ memcg = cg_name(root, "memcg_test_0");
+
+ if (!memcg)
+ goto cleanup;
+
+ if (cg_create(memcg))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.max", "50M"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.swap.max", "0"))
+ goto cleanup;
+
+ if (cg_write(memcg, "memory.oom.group", "1"))
+ goto cleanup;
+
+ /* This process gets OOM_SCORE_ADJ_MIN and is expected to survive. */
+ safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+ if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
+ goto cleanup;
+
+ cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
+ if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
+ goto cleanup;
+
+ /* The oom_kill counter must read exactly 3. */
+ if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
+ goto cleanup;
+
+ /* kill() only succeeds if safe_pid still exists. */
+ if (kill(safe_pid, SIGKILL))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup:
+ if (memcg)
+ cg_destroy(memcg);
+ free(memcg);
+
+ return ret;
+}
+
+
+/* T(fn) builds a table entry from a test function and its stringified name. */
+#define T(x) { x, #x }
+/* Test table walked in order by main(); each entry takes the cgroup root. */
+struct memcg_test {
+ int (*fn)(const char *root);
+ const char *name;
+} tests[] = {
+ T(test_memcg_subtree_control),
+ T(test_memcg_current),
+ T(test_memcg_min),
+ T(test_memcg_low),
+ T(test_memcg_high),
+ T(test_memcg_max),
+ T(test_memcg_oom_events),
+ T(test_memcg_swap_max),
+ T(test_memcg_sock),
+ T(test_memcg_oom_group_leaf_events),
+ T(test_memcg_oom_group_parent_events),
+ T(test_memcg_oom_group_score_events),
+};
+#undef T
+
+/*
+ * Locate the cgroup v2 root, make sure the memory controller is usable and
+ * run every entry of tests[] in order.
+ *
+ * Returns EXIT_SUCCESS unless a test returned something other than
+ * KSFT_PASS or KSFT_SKIP, in which case EXIT_FAILURE is returned.
+ */
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+ int i, ret = EXIT_SUCCESS;
+
+ if (cg_find_unified_root(root, sizeof(root)))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that memory controller is available:
+ * memory is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "memory"))
+ ksft_exit_skip("memory controller isn't available\n");
+
+ /* Enable it in the root's subtree_control if it is not listed there. */
+ if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
+ if (cg_write(root, "cgroup.subtree_control", "+memory"))
+ ksft_exit_skip("Failed to set memory controller\n");
+
+ for (i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ /* Any other result is a failure; later tests still run. */
+ ret = EXIT_FAILURE;
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/cgroup/test_stress.sh b/tools/testing/selftests/cgroup/test_stress.sh
new file mode 100755
index 000000000..3c9c4554d
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_stress.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+./with_stress.sh -s subsys -s fork "${OUTPUT:-.}"/test_core
diff --git a/tools/testing/selftests/cgroup/with_stress.sh b/tools/testing/selftests/cgroup/with_stress.sh
new file mode 100755
index 000000000..e28c35008
--- /dev/null
+++ b/tools/testing/selftests/cgroup/with_stress.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Fork stress: keep spawning a short-lived external process, pausing 10ms
+# between spawns, until the function is killed.
+stress_fork()
+{
+	while : ; do
+		/usr/bin/true
+		sleep 0.01
+	done
+}
+
+stress_subsys()
+{
+ local verb=+
+ while true ; do
+ echo $verb$subsys_ctrl >$sysfs/cgroup.subtree_control
+ [ $verb = "+" ] && verb=- || verb=+
+ # incommensurable period with other stresses
+ sleep 0.011
+ done
+}
+
+init_and_check()
+{
+ sysfs=`mount -t cgroup2 | head -1 | awk '{ print $3 }'`
+ if [ ! -d "$sysfs" ]; then
+ echo "Skipping: cgroup2 is not mounted" >&2
+ exit $ksft_skip
+ fi
+
+ if ! echo +$subsys_ctrl >$sysfs/cgroup.subtree_control ; then
+ echo "Skipping: cannot enable $subsys_ctrl in $sysfs" >&2
+ exit $ksft_skip
+ fi
+
+ if ! echo -$subsys_ctrl >$sysfs/cgroup.subtree_control ; then
+ echo "Skipping: cannot disable $subsys_ctrl in $sysfs" >&2
+ exit $ksft_skip
+ fi
+}
+
+# Names of the stress_* functions to run, and the PIDs of their background jobs.
+declare -a stresses
+declare -a stress_pids
+duration=5
+rc=0
+subsys_ctrl=cpuset
+sysfs=
+
+while getopts c:d:hs: opt; do
+	case $opt in
+	c)
+		subsys_ctrl=$OPTARG
+		;;
+	d)
+		duration=$OPTARG
+		;;
+	h)
+		echo "Usage $0 [ -s stress ] ... [ -d duration ] [-c controller] cmd args .."
+		echo -e "\t default duration $duration seconds"
+		echo -e "\t default controller $subsys_ctrl"
+		exit
+		;;
+	s)
+		# Only accept names for which a stress_<name> function exists.
+		func=stress_$OPTARG
+		if [ "x$(type -t "$func")" != "xfunction" ] ; then
+			echo "Unknown stress $OPTARG"
+			exit 1
+		fi
+		stresses+=("$func")
+		;;
+	esac
+done
+shift $((OPTIND - 1))
+
+init_and_check
+
+# Start every requested stress in the background and remember its PID.
+for s in "${stresses[@]}" ; do
+	$s &
+	stress_pids+=($!)
+done
+
+
+time=0
+start=$(date +%s)
+
+# Re-run the command until it fails or $duration seconds have elapsed.
+while [ "$time" -lt "$duration" ] ; do
+	# "$@" keeps each argument intact; $* would re-split on whitespace.
+	"$@"
+	rc=$?
+	[ $rc -eq 0 ] || break
+	time=$(($(date +%s) - start))
+done
+
+for pid in "${stress_pids[@]}" ; do
+	kill -SIGTERM "$pid"
+	wait "$pid"
+done
+
+exit $rc
diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore
new file mode 100644
index 000000000..83c0f6246
--- /dev/null
+++ b/tools/testing/selftests/clone3/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+clone3
+clone3_clear_sighand
+clone3_set_tid
+clone3_cap_checkpoint_restore
diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile
new file mode 100644
index 000000000..ef7564cb7
--- /dev/null
+++ b/tools/testing/selftests/clone3/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -g -I../../../../usr/include/
+LDLIBS += -lcap
+
+TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid \
+ clone3_cap_checkpoint_restore
+
+include ../lib.mk
diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c
new file mode 100644
index 000000000..cd4582129
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Based on Christian Brauner's clone3() example */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+#include "clone3_selftests.h"
+
+/* Selects how call_clone3() adjusts flags and exit_signal before the call. */
+enum test_mode {
+ /* Keep the caller's flags and the default SIGCHLD exit_signal. */
+ CLONE3_ARGS_NO_TEST,
+ /* Set both flags and exit_signal to 0. */
+ CLONE3_ARGS_ALL_0,
+ /* exit_signal = 0xbadc0ded00000000 */
+ CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
+ /* exit_signal = 0x80000000 */
+ CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
+ /* exit_signal = 0x100 */
+ CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
+ /* exit_signal = 0xf0 */
+ CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
+};
+
+/*
+ * Invoke clone3() once and reap the child.
+ *
+ * @flags:     value for clone_args.flags (overridden by CLONE3_ARGS_ALL_0)
+ * @size:      size passed to clone3(); 0 selects sizeof(struct __clone_args)
+ * @test_mode: selects the flags/exit_signal tweak applied before the call
+ *
+ * The arguments live in a struct followed by two spare 64-bit words. When
+ * @size exceeds sizeof(struct __clone_args) the second spare word is set to
+ * 1, so sizes that reach it hand non-zero trailing data to the kernel.
+ *
+ * Returns -errno if clone3() or waitpid() fails, otherwise the child's
+ * exit status (0 on success). The child itself exits with EXIT_SUCCESS.
+ */
+static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
+{
+	struct __clone_args args = {
+		.flags = flags,
+		.exit_signal = SIGCHLD,
+	};
+
+	struct clone_args_extended {
+		struct __clone_args args;
+		__aligned_u64 excess_space[2];
+	} args_ext;
+
+	pid_t pid = -1;
+	int status;
+	int err;
+
+	memset(&args_ext, 0, sizeof(args_ext));
+	if (size > sizeof(struct __clone_args))
+		args_ext.excess_space[1] = 1;
+
+	if (size == 0)
+		size = sizeof(struct __clone_args);
+
+	switch (test_mode) {
+	case CLONE3_ARGS_NO_TEST:
+		/*
+		 * Uses default 'flags' and 'SIGCHLD'
+		 * assignment.
+		 */
+		break;
+	case CLONE3_ARGS_ALL_0:
+		args.flags = 0;
+		args.exit_signal = 0;
+		break;
+	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG:
+		args.exit_signal = 0xbadc0ded00000000ULL;
+		break;
+	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG:
+		args.exit_signal = 0x0000000080000000ULL;
+		break;
+	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG:
+		args.exit_signal = 0x0000000000000100ULL;
+		break;
+	case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
+		args.exit_signal = 0x00000000000000f0ULL;
+		break;
+	}
+
+	memcpy(&args_ext.args, &args, sizeof(struct __clone_args));
+
+	pid = sys_clone3((struct __clone_args *)&args_ext, size);
+	if (pid < 0) {
+		/* Save errno first: the logging call may overwrite it. */
+		err = errno;
+		ksft_print_msg("%s - Failed to create new process\n",
+				strerror(err));
+		return -err;
+	}
+
+	if (pid == 0) {
+		ksft_print_msg("I am the child, my PID is %d\n", getpid());
+		_exit(EXIT_SUCCESS);
+	}
+
+	ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+			getpid(), pid);
+
+	if (waitpid(-1, &status, __WALL) < 0) {
+		/* Same reason as above: keep the waitpid() error intact. */
+		err = errno;
+		ksft_print_msg("Child returned %s\n", strerror(err));
+		return -err;
+	}
+	if (WEXITSTATUS(status))
+		return WEXITSTATUS(status);
+
+	return 0;
+}
+
+/*
+ * Run call_clone3() with the given parameters and record one kselftest
+ * result: pass if its return value equals @expected, fail otherwise.
+ */
+static void test_clone3(uint64_t flags, size_t size, int expected,
+ enum test_mode test_mode)
+{
+ int ret;
+
+ ksft_print_msg(
+ "[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)\n",
+ getpid(), flags, size);
+ ret = call_clone3(flags, size, test_mode);
+ ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
+ getpid(), ret, expected);
+ if (ret != expected)
+ ksft_test_result_fail(
+ "[%d] Result (%d) is different than expected (%d)\n",
+ getpid(), ret, expected);
+ else
+ ksft_test_result_pass(
+ "[%d] Result (%d) matches expectation (%d)\n",
+ getpid(), ret, expected);
+}
+
+/*
+ * Run the 17 planned clone3() argument checks. Cases that expect
+ * CLONE_NEWPID to succeed are reported as skipped when not running as uid 0.
+ */
+int main(int argc, char *argv[])
+{
+ uid_t uid = getuid();
+
+ ksft_print_header();
+ /* Must match the number of results reported below. */
+ ksft_set_plan(17);
+ test_clone3_supported();
+
+ /* Just a simple clone3() should return 0.*/
+ test_clone3(0, 0, 0, CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() in a new PID NS.*/
+ if (uid == 0)
+ test_clone3(CLONE_NEWPID, 0, 0, CLONE3_ARGS_NO_TEST);
+ else
+ ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
+
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0. */
+ test_clone3(0, CLONE_ARGS_SIZE_VER0, 0, CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 */
+ test_clone3(0, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() with sizeof(struct clone_args) + 8 */
+ test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() with exit_signal having highest 32 bits non-zero */
+ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG);
+
+ /* Do a clone3() with negative 32-bit exit_signal */
+ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG);
+
+ /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
+ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG);
+
+ /* Do a clone3() with NSIG < exit_signal < CSIG */
+ test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG);
+
+ /* All-zero args plus 8 extra bytes: only the zeroed spare word is covered. */
+ test_clone3(0, sizeof(struct __clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
+
+ /* 16 extra bytes reach the spare word call_clone3() sets to 1. */
+ test_clone3(0, sizeof(struct __clone_args) + 16, -E2BIG,
+ CLONE3_ARGS_ALL_0);
+
+ /* Twice the struct size is expected to be rejected as well. */
+ test_clone3(0, sizeof(struct __clone_args) * 2, -E2BIG,
+ CLONE3_ARGS_ALL_0);
+
+ /* Do a clone3() with > page size */
+ test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 in a new PID NS. */
+ if (uid == 0)
+ test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0, 0,
+ CLONE3_ARGS_NO_TEST);
+ else
+ ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
+
+ /* Do a clone3() with CLONE_ARGS_SIZE_VER0 - 8 in a new PID NS */
+ test_clone3(CLONE_NEWPID, CLONE_ARGS_SIZE_VER0 - 8, -EINVAL,
+ CLONE3_ARGS_NO_TEST);
+
+ /* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */
+ if (uid == 0)
+ test_clone3(CLONE_NEWPID, sizeof(struct __clone_args) + 8, 0,
+ CLONE3_ARGS_NO_TEST);
+ else
+ ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
+
+ /* Do a clone3() with > page size in a new PID NS */
+ test_clone3(CLONE_NEWPID, getpagesize() + 8, -E2BIG,
+ CLONE3_ARGS_NO_TEST);
+
+ return !ksft_get_fail_cnt() ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
new file mode 100644
index 000000000..52d3f0364
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_cap_checkpoint_restore.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Based on Christian Brauner's clone3() example.
+ * These tests are assuming to be running in the host's
+ * PID namespace.
+ */
+
+/* capabilities related code based on selftests/bpf/test_verifier.c */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/capability.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest_harness.h"
+#include "clone3_selftests.h"
+
+#ifndef MAX_PID_NS_LEVEL
+#define MAX_PID_NS_LEVEL 32
+#endif
+
+/* Flush stdout and stderr, then terminate with _exit(@ret). */
+static void child_exit(int ret)
+{
+ fflush(stdout);
+ fflush(stderr);
+ _exit(ret);
+}
+
+/*
+ * Call clone3() with the given set_tid array and reap the child.
+ *
+ * @_metadata:    harness metadata, needed by TH_LOG()
+ * @set_tid:      array of requested PIDs; set_tid[0] is what the child checks
+ * @set_tid_size: number of entries in @set_tid
+ *
+ * The child exits with EXIT_SUCCESS if getpid() equals set_tid[0] and with
+ * EXIT_FAILURE otherwise.
+ *
+ * Returns -errno if clone3() or waitpid() fails, -1 if the child did not
+ * exit normally, otherwise the child's exit status.
+ */
+static int call_clone3_set_tid(struct __test_metadata *_metadata,
+			       pid_t *set_tid, size_t set_tid_size)
+{
+	int status;
+	int err;
+	pid_t pid = -1;
+
+	struct __clone_args args = {
+		.exit_signal = SIGCHLD,
+		.set_tid = ptr_to_u64(set_tid),
+		.set_tid_size = set_tid_size,
+	};
+
+	pid = sys_clone3(&args, sizeof(args));
+	if (pid < 0) {
+		/* Save errno first: the logging call may overwrite it. */
+		err = errno;
+		TH_LOG("%s - Failed to create new process", strerror(err));
+		return -err;
+	}
+
+	if (pid == 0) {
+		TH_LOG("I am the child, my PID is %d (expected %d)", getpid(), set_tid[0]);
+
+		if (set_tid[0] != getpid())
+			child_exit(EXIT_FAILURE);
+		child_exit(EXIT_SUCCESS);
+	}
+
+	TH_LOG("I am the parent (%d). My child's pid is %d", getpid(), pid);
+
+	if (waitpid(pid, &status, 0) < 0) {
+		/* Same reason as above: keep the waitpid() error intact. */
+		err = errno;
+		TH_LOG("Child returned %s", strerror(err));
+		return -err;
+	}
+
+	if (!WIFEXITED(status))
+		return -1;
+
+	return WEXITSTATUS(status);
+}
+
+/*
+ * Logging wrapper around call_clone3_set_tid(): logs the requested PID
+ * before the call and the result after it, then returns that result.
+ */
+static int test_clone3_set_tid(struct __test_metadata *_metadata,
+ pid_t *set_tid, size_t set_tid_size)
+{
+ int ret;
+
+ TH_LOG("[%d] Trying clone3() with CLONE_SET_TID to %d", getpid(), set_tid[0]);
+ ret = call_clone3_set_tid(_metadata, set_tid, set_tid_size);
+ TH_LOG("[%d] clone3() with CLONE_SET_TID %d says:%d", getpid(), set_tid[0], ret);
+ return ret;
+}
+
+/*
+ * Layout that set_capability() casts a cap_t to so it can set capability
+ * bits directly in data[1].
+ * NOTE(review): assumes this matches libcap's internal cap_t layout - confirm.
+ */
+struct libcap {
+ struct __user_cap_header_struct hdr;
+ struct __user_cap_data_struct data[2];
+};
+
+/*
+ * Reduce the process capabilities to CAP_SETUID, CAP_SETGID and capability
+ * number 40 (CAP_CHECKPOINT_RESTORE), in both the effective and permitted
+ * sets.
+ *
+ * Returns 0 on success, -1 if cap_get_proc(), cap_clear() or cap_set_proc()
+ * fails.
+ */
+static int set_capability(void)
+{
+ cap_value_t cap_values[] = { CAP_SETUID, CAP_SETGID };
+ struct libcap *cap;
+ int ret = -1;
+ cap_t caps;
+
+ caps = cap_get_proc();
+ if (!caps) {
+ perror("cap_get_proc");
+ return -1;
+ }
+
+ /* Drop all capabilities */
+ if (cap_clear(caps)) {
+ perror("cap_clear");
+ goto out;
+ }
+
+ /* NOTE(review): the return values of cap_set_flag() are not checked. */
+ cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET);
+ cap_set_flag(caps, CAP_PERMITTED, 2, cap_values, CAP_SET);
+
+ /* Set bit 40 by hand through the struct libcap view of caps. */
+ cap = (struct libcap *) caps;
+
+ /* 40 -> CAP_CHECKPOINT_RESTORE */
+ cap->data[1].effective |= 1 << (40 - 32);
+ cap->data[1].permitted |= 1 << (40 - 32);
+
+ if (cap_set_proc(caps)) {
+ perror("cap_set_proc");
+ goto out;
+ }
+ ret = 0;
+out:
+ if (cap_free(caps))
+ perror("cap_free");
+ return ret;
+}
+
+/*
+ * Check that clone3() with set_tid is refused with -EPERM for an
+ * unprivileged user and succeeds once CAP_CHECKPOINT_RESTORE has been set
+ * again. Skipped when not started as uid 0.
+ */
+TEST(clone3_cap_checkpoint_restore)
+{
+ pid_t pid;
+ int status;
+ int ret = 0;
+ pid_t set_tid[1];
+
+ test_clone3_supported();
+
+ EXPECT_EQ(getuid(), 0)
+ SKIP(return, "Skipping all tests as non-root");
+
+ memset(&set_tid, 0, sizeof(set_tid));
+
+ /* Find the current active PID */
+ pid = fork();
+ if (pid == 0) {
+ TH_LOG("Child has PID %d", getpid());
+ child_exit(EXIT_SUCCESS);
+ }
+ ASSERT_GT(waitpid(pid, &status, 0), 0)
+ TH_LOG("Waiting for child %d failed", pid);
+
+ /* After the child has finished, its PID should be free. */
+ set_tid[0] = pid;
+
+ ASSERT_EQ(set_capability(), 0)
+ TH_LOG("Could not set CAP_CHECKPOINT_RESTORE");
+
+ /* PR_SET_KEEPCAPS is set before the switch to uid/gid 65534 below. */
+ ASSERT_EQ(prctl(PR_SET_KEEPCAPS, 1, 0, 0, 0), 0);
+
+ EXPECT_EQ(setgid(65534), 0)
+ TH_LOG("Failed to setgid(65534)");
+ ASSERT_EQ(setuid(65534), 0);
+
+ set_tid[0] = pid;
+ /* This would fail without CAP_CHECKPOINT_RESTORE */
+ ASSERT_EQ(test_clone3_set_tid(_metadata, set_tid, 1), -EPERM);
+ ASSERT_EQ(set_capability(), 0)
+ TH_LOG("Could not set CAP_CHECKPOINT_RESTORE");
+ /* This should work as we have CAP_CHECKPOINT_RESTORE as non-root */
+ ASSERT_EQ(test_clone3_set_tid(_metadata, set_tid, 1), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c
new file mode 100644
index 000000000..47a8c0fc3
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+#include "clone3_selftests.h"
+
+#ifndef CLONE_CLEAR_SIGHAND
+#define CLONE_CLEAR_SIGHAND 0x100000000ULL
+#endif
+
+/* Empty handler; installed for SIGUSR1 and SIGUSR2 by the test below. */
+static void nop_handler(int signo)
+{
+}
+
+/*
+ * Reap @pid, calling waitpid() again whenever it fails with EINTR.
+ *
+ * Returns the child's exit status if it exited normally, or -1 if waitpid()
+ * failed for another reason or the child did not exit normally.
+ */
+static int wait_for_pid(pid_t pid)
+{
+	int wstatus;
+
+	while (waitpid(pid, &wstatus, 0) == -1) {
+		if (errno != EINTR)
+			return -1;
+	}
+
+	return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : -1;
+}
+
+/*
+ * Check CLONE_CLEAR_SIGHAND: it must be rejected together with
+ * CLONE_SIGHAND, and on its own the child must see SIG_DFL for SIGUSR1 and
+ * SIGUSR2 even though the parent installed handlers for both.
+ * Any failure ends the program through ksft_exit_fail_msg().
+ */
+static void test_clone3_clear_sighand(void)
+{
+ int ret;
+ pid_t pid;
+ struct __clone_args args = {};
+ struct sigaction act;
+
+ /*
+ * Check that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are mutually
+ * exclusive.
+ */
+ args.flags |= CLONE_CLEAR_SIGHAND | CLONE_SIGHAND;
+ args.exit_signal = SIGCHLD;
+ pid = sys_clone3(&args, sizeof(args));
+ if (pid > 0)
+ ksft_exit_fail_msg(
+ "clone3(CLONE_CLEAR_SIGHAND | CLONE_SIGHAND) succeeded\n");
+
+ act.sa_handler = nop_handler;
+ ret = sigemptyset(&act.sa_mask);
+ if (ret < 0)
+ ksft_exit_fail_msg("%s - sigemptyset() failed\n",
+ strerror(errno));
+
+ act.sa_flags = 0;
+
+ /* Register signal handler for SIGUSR1 */
+ ret = sigaction(SIGUSR1, &act, NULL);
+ if (ret < 0)
+ ksft_exit_fail_msg(
+ "%s - sigaction(SIGUSR1, &act, NULL) failed\n",
+ strerror(errno));
+
+ /* Register signal handler for SIGUSR2 */
+ ret = sigaction(SIGUSR2, &act, NULL);
+ if (ret < 0)
+ ksft_exit_fail_msg(
+ "%s - sigaction(SIGUSR2, &act, NULL) failed\n",
+ strerror(errno));
+
+ /* Check that CLONE_CLEAR_SIGHAND works. */
+ args.flags = CLONE_CLEAR_SIGHAND;
+ pid = sys_clone3(&args, sizeof(args));
+ if (pid < 0)
+ ksft_exit_fail_msg("%s - clone3(CLONE_CLEAR_SIGHAND) failed\n",
+ strerror(errno));
+
+ if (pid == 0) {
+ /* Child: both dispositions must read back as SIG_DFL. */
+ ret = sigaction(SIGUSR1, NULL, &act);
+ if (ret < 0)
+ exit(EXIT_FAILURE);
+
+ if (act.sa_handler != SIG_DFL)
+ exit(EXIT_FAILURE);
+
+ ret = sigaction(SIGUSR2, NULL, &act);
+ if (ret < 0)
+ exit(EXIT_FAILURE);
+
+ if (act.sa_handler != SIG_DFL)
+ exit(EXIT_FAILURE);
+
+ exit(EXIT_SUCCESS);
+ }
+
+ /* Parent: a non-zero result from the child is a test failure. */
+ ret = wait_for_pid(pid);
+ if (ret)
+ ksft_exit_fail_msg(
+ "Failed to clear signal handler for child process\n");
+
+ ksft_test_result_pass("Cleared signal handlers for child process\n");
+}
+
+/* Entry point: plan one test, check clone3() support, run the test. */
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(1);
+ test_clone3_supported();
+
+ test_clone3_clear_sighand();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h
new file mode 100644
index 000000000..e81ffaaee
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _CLONE3_SELFTESTS_H
+#define _CLONE3_SELFTESTS_H
+
+#define _GNU_SOURCE
+#include <sched.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <stdint.h>
+#include <syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
+
+#ifndef CLONE_INTO_CGROUP
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#endif
+
+#ifndef __NR_clone3
+#define __NR_clone3 -1
+#endif
+
+/*
+ * Local definition of the clone3() argument block used by these tests.
+ * The CLONE_ARGS_SIZE_VER* macros, defined here only if missing, give the
+ * struct size up to the field that precedes each of them.
+ */
+struct __clone_args {
+ __aligned_u64 flags;
+ __aligned_u64 pidfd;
+ __aligned_u64 child_tid;
+ __aligned_u64 parent_tid;
+ __aligned_u64 exit_signal;
+ __aligned_u64 stack;
+ __aligned_u64 stack_size;
+ __aligned_u64 tls;
+#ifndef CLONE_ARGS_SIZE_VER0
+#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
+#endif
+ __aligned_u64 set_tid;
+ __aligned_u64 set_tid_size;
+#ifndef CLONE_ARGS_SIZE_VER1
+#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#endif
+ __aligned_u64 cgroup;
+#ifndef CLONE_ARGS_SIZE_VER2
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
+#endif
+};
+
+/*
+ * Raw clone3() wrapper. stdout and stderr are flushed before the syscall
+ * (presumably so buffered output is not duplicated in the child).
+ * Returns whatever syscall() returns.
+ */
+static pid_t sys_clone3(struct __clone_args *args, size_t size)
+{
+ fflush(stdout);
+ fflush(stderr);
+ return syscall(__NR_clone3, args, size);
+}
+
+/*
+ * Probe for clone3() support. Exits with a kselftest skip if the syscall
+ * number is unknown or the call fails with ENOSYS, and with a failure if
+ * the deliberately invalid call creates a child. Otherwise it logs that
+ * clone3() is supported and returns.
+ */
+static inline void test_clone3_supported(void)
+{
+ pid_t pid;
+ struct __clone_args args = {};
+
+ /* __NR_clone3 falls back to -1 above when the headers lack it. */
+ if (__NR_clone3 < 0)
+ ksft_exit_skip("clone3() syscall is not supported\n");
+
+ /* Set to something that will always cause EINVAL. */
+ args.exit_signal = -1;
+ pid = sys_clone3(&args, sizeof(args));
+ /* An unexpectedly created child just exits. */
+ if (!pid)
+ exit(EXIT_SUCCESS);
+
+ if (pid > 0) {
+ wait(NULL);
+ ksft_exit_fail_msg(
+ "Managed to create child process with invalid exit_signal\n");
+ }
+
+ if (errno == ENOSYS)
+ ksft_exit_skip("clone3() syscall is not supported\n");
+
+ ksft_print_msg("clone3() syscall supported\n");
+}
+
+#endif /* _CLONE3_SELFTESTS_H */
diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c
new file mode 100644
index 000000000..0229e9ebb
--- /dev/null
+++ b/tools/testing/selftests/clone3/clone3_set_tid.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Based on Christian Brauner's clone3() example.
+ * These tests assume that they are running in the host's
+ * PID namespace.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include "../kselftest.h"
+#include "clone3_selftests.h"
+
+#ifndef MAX_PID_NS_LEVEL
+#define MAX_PID_NS_LEVEL 32
+#endif
+
+static int pipe_1[2];
+static int pipe_2[2];
+
+/*
+ * Terminate a child process with exit status @ret.
+ *
+ * stdout and stderr are flushed by hand because the process is ended with
+ * _exit() rather than exit().
+ */
+static void child_exit(int ret)
+{
+ fflush(stdout);
+ fflush(stderr);
+ _exit(ret);
+}
+
+/*
+ * Create a child with clone3(), asking the kernel for the PIDs listed in
+ * @set_tid (set_tid[0] is the PID the child itself compares against).
+ *
+ * @set_tid:      array of requested PIDs, passed as clone_args.set_tid
+ * @set_tid_size: number of entries the kernel is told to use
+ * @flags:        clone3() flags
+ * @expected_pid: PID the parent expects for the child; 0 disables the check
+ * @wait_for_it:  if true, the child writes one byte to pipe_1 and then
+ *                blocks reading pipe_2 before it exits
+ *
+ * Returns the child's exit status, -errno if clone3() or waitpid() failed,
+ * or -1 if the PID did not match or the child did not exit normally.
+ */
+static int call_clone3_set_tid(pid_t *set_tid,
+			       size_t set_tid_size,
+			       int flags,
+			       int expected_pid,
+			       bool wait_for_it)
+{
+	int status;
+	int saved_errno;
+	pid_t pid = -1;
+
+	struct __clone_args args = {
+		.flags = flags,
+		.exit_signal = SIGCHLD,
+		.set_tid = ptr_to_u64(set_tid),
+		.set_tid_size = set_tid_size,
+	};
+
+	pid = sys_clone3(&args, sizeof(args));
+	if (pid < 0) {
+		/* Latch errno first: the print below may overwrite it. */
+		saved_errno = errno;
+		ksft_print_msg("%s - Failed to create new process\n",
+			       strerror(saved_errno));
+		return -saved_errno;
+	}
+
+	if (pid == 0) {
+		int ret;
+		char tmp = 0;
+		int exit_code = EXIT_SUCCESS;
+
+		ksft_print_msg("I am the child, my PID is %d (expected %d)\n",
+			       getpid(), set_tid[0]);
+		if (wait_for_it) {
+			ksft_print_msg("[%d] Child is ready and waiting\n",
+				       getpid());
+
+			/* Signal the parent that the child is ready */
+			close(pipe_1[0]);
+			ret = write(pipe_1[1], &tmp, 1);
+			if (ret != 1) {
+				ksft_print_msg(
+					"Writing to pipe returned %d\n", ret);
+				exit_code = EXIT_FAILURE;
+			}
+			close(pipe_1[1]);
+
+			/* Block until the parent tells us to finish. */
+			close(pipe_2[1]);
+			ret = read(pipe_2[0], &tmp, 1);
+			if (ret != 1) {
+				ksft_print_msg(
+					"Reading from pipe returned %d\n", ret);
+				exit_code = EXIT_FAILURE;
+			}
+			close(pipe_2[0]);
+		}
+
+		if (set_tid[0] != getpid())
+			child_exit(EXIT_FAILURE);
+		child_exit(exit_code);
+	}
+
+	if (expected_pid == 0 || expected_pid == pid) {
+		ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
+			       getpid(), pid);
+	} else {
+		ksft_print_msg(
+			"Expected child pid %d does not match actual pid %d\n",
+			expected_pid, pid);
+		return -1;
+	}
+
+	if (waitpid(pid, &status, 0) < 0) {
+		saved_errno = errno;
+		ksft_print_msg("Child returned %s\n", strerror(saved_errno));
+		return -saved_errno;
+	}
+
+	if (!WIFEXITED(status))
+		return -1;
+
+	return WEXITSTATUS(status);
+}
+
+/*
+ * Run one clone3()/set_tid scenario through call_clone3_set_tid() and record
+ * a kselftest pass when its return value equals @expected, a fail otherwise.
+ * The remaining arguments are forwarded unchanged.
+ */
+static void test_clone3_set_tid(pid_t *set_tid,
+				size_t set_tid_size,
+				int flags,
+				int expected,
+				int expected_pid,
+				bool wait_for_it)
+{
+	int ret;
+
+	ksft_print_msg(
+		"[%d] Trying clone3() with CLONE_SET_TID to %d and 0x%x\n",
+		getpid(), set_tid[0], flags);
+
+	ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid,
+				  wait_for_it);
+
+	ksft_print_msg(
+		"[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n",
+		getpid(), set_tid[0], ret, expected);
+
+	if (ret == expected) {
+		ksft_test_result_pass(
+			"[%d] Result (%d) matches expectation (%d)\n",
+			getpid(), ret, expected);
+		return;
+	}
+
+	ksft_test_result_fail(
+		"[%d] Result (%d) is different than expected (%d)\n",
+		getpid(), ret, expected);
+}
+/*
+ * Test driver: exercises clone3() with set_tid for invalid sizes, invalid
+ * PIDs, PIDs that are already taken, and nested PID namespaces, then checks
+ * the NSpid line of a child created across three namespace levels.
+ *
+ * Most scenarios need root; as non-root they are reported as skipped.
+ */
+int main(int argc, char *argv[])
+{
+	FILE *f;
+	char buf = 0;
+	/* getline() requires a NULL (or malloc'ed) buffer pointer on entry. */
+	char *line = NULL;
+	int status;
+	int ret = -1;
+	size_t len = 0;
+	int pid_max = 0;
+	uid_t uid = getuid();
+	char proc_path[100] = {0};
+	pid_t pid, ns_pid;
+	/* Zeroed so the final comparison is defined if no NSpid line is seen. */
+	pid_t ns1 = 0, ns2 = 0, ns3 = 0;
+	pid_t set_tid[MAX_PID_NS_LEVEL * 2];
+
+	ksft_print_header();
+	ksft_set_plan(29);
+	test_clone3_supported();
+
+	if (pipe(pipe_1) < 0 || pipe(pipe_2) < 0)
+		ksft_exit_fail_msg("pipe() failed\n");
+
+	f = fopen("/proc/sys/kernel/pid_max", "r");
+	if (f == NULL)
+		ksft_exit_fail_msg(
+			"%s - Could not open /proc/sys/kernel/pid_max\n",
+			strerror(errno));
+	if (fscanf(f, "%d", &pid_max) != 1) {
+		fclose(f);
+		ksft_exit_fail_msg(
+			"Could not parse /proc/sys/kernel/pid_max\n");
+	}
+	fclose(f);
+	ksft_print_msg("/proc/sys/kernel/pid_max %d\n", pid_max);
+
+	/* Try invalid settings */
+	memset(&set_tid, 0, sizeof(set_tid));
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
+			-EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
+
+	/*
+	 * This can actually work if this test is running in a
+	 * MAX_PID_NS_LEVEL - 1 nested PID namespace.
+	 */
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
+
+	memset(&set_tid, 0xff, sizeof(set_tid));
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0,
+			-EINVAL, 0, 0);
+
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0);
+
+	/*
+	 * This can actually work if this test is running in a
+	 * MAX_PID_NS_LEVEL - 1 nested PID namespace.
+	 */
+	test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0);
+
+	memset(&set_tid, 0, sizeof(set_tid));
+	/* Try with an invalid PID */
+	set_tid[0] = 0;
+	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+
+	set_tid[0] = -1;
+	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+
+	/* Claim that the set_tid array actually contains 2 elements. */
+	test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0);
+
+	/* Try it in a new PID namespace */
+	if (uid == 0)
+		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+	else
+		ksft_test_result_skip("Clone3() with set_tid requires root\n");
+
+	/* Try with a valid PID (1) this should return -EEXIST. */
+	set_tid[0] = 1;
+	if (uid == 0)
+		test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0);
+	else
+		ksft_test_result_skip("Clone3() with set_tid requires root\n");
+
+	/* Try it in a new PID namespace */
+	if (uid == 0)
+		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0);
+	else
+		ksft_test_result_skip("Clone3() with set_tid requires root\n");
+
+	/* pid_max should fail everywhere */
+	set_tid[0] = pid_max;
+	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+
+	if (uid == 0)
+		test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+	else
+		ksft_test_result_skip("Clone3() with set_tid requires root\n");
+
+	if (uid != 0) {
+		/*
+		 * All remaining tests require root. Tell the framework
+		 * that all those tests are skipped as non-root.
+		 */
+		ksft_cnt.ksft_xskip += ksft_plan - ksft_test_num();
+		ret = 0;
+		goto out;
+	}
+
+	/* Find the current active PID */
+	pid = fork();
+	if (pid < 0)
+		ksft_exit_fail_msg("fork() failed: %s\n", strerror(errno));
+	if (pid == 0) {
+		ksft_print_msg("Child has PID %d\n", getpid());
+		child_exit(EXIT_SUCCESS);
+	}
+	if (waitpid(pid, &status, 0) < 0)
+		ksft_exit_fail_msg("Waiting for child %d failed\n", pid);
+
+	/* After the child has finished, its PID should be free. */
+	set_tid[0] = pid;
+	test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0);
+
+	/* This should fail as there is no PID 1 in that namespace */
+	test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0);
+
+	/*
+	 * Creating a process with PID 1 in the newly created most nested
+	 * PID namespace and PID 'pid' in the parent PID namespace. This
+	 * needs to work.
+	 */
+	set_tid[0] = 1;
+	set_tid[1] = pid;
+	test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0);
+
+	ksft_print_msg("unshare PID namespace\n");
+	if (unshare(CLONE_NEWPID) == -1)
+		ksft_exit_fail_msg("unshare(CLONE_NEWPID) failed: %s\n",
+				strerror(errno));
+
+	set_tid[0] = pid;
+
+	/* This should fail as there is no PID 1 in that namespace */
+	test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0);
+
+	/* Let's create a PID 1 */
+	ns_pid = fork();
+	if (ns_pid < 0)
+		ksft_exit_fail_msg("fork() failed: %s\n", strerror(errno));
+	if (ns_pid == 0) {
+		/*
+		 * This and the next test cases check that all pid-s are
+		 * released on error paths.
+		 */
+		set_tid[0] = 43;
+		set_tid[1] = -1;
+		test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0);
+
+		set_tid[0] = 43;
+		set_tid[1] = pid;
+		test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0);
+
+		ksft_print_msg("Child in PID namespace has PID %d\n", getpid());
+		set_tid[0] = 2;
+		test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0);
+
+		set_tid[0] = 1;
+		set_tid[1] = -1;
+		set_tid[2] = pid;
+		/* This should fail as there is invalid PID at level '1'. */
+		test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0);
+
+		set_tid[0] = 1;
+		set_tid[1] = 42;
+		set_tid[2] = pid;
+		/*
+		 * This should fail as there are not enough active PID
+		 * namespaces. Again assuming this is running in the host's
+		 * PID namespace. Not yet nested.
+		 */
+		test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0);
+
+		/*
+		 * This should work and from the parent we should see
+		 * something like 'NSpid: pid 42 1'.
+		 */
+		test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true);
+
+		/* Report the number of failed sub-tests to the parent. */
+		child_exit(ksft_cnt.ksft_fail);
+	}
+
+	/* Parent keeps only the read end of pipe_1 and write end of pipe_2. */
+	close(pipe_1[1]);
+	close(pipe_2[0]);
+	if (read(pipe_1[0], &buf, 1) > 0)
+		ksft_print_msg("[%d] Child is ready and waiting\n", getpid());
+
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/status", pid);
+	f = fopen(proc_path, "r");
+	if (f == NULL)
+		ksft_exit_fail_msg(
+			"%s - Could not open %s\n",
+			strerror(errno), proc_path);
+
+	while (getline(&line, &len, f) != -1) {
+		if (strstr(line, "NSpid")) {
+			int i;
+
+			/* Verify that all generated PIDs are as expected. */
+			i = sscanf(line, "NSpid:\t%d\t%d\t%d",
+				   &ns3, &ns2, &ns1);
+			if (i != 3) {
+				ksft_print_msg(
+					"Unexpected 'NSPid:' entry: %s",
+					line);
+				ns1 = ns2 = ns3 = 0;
+			}
+			break;
+		}
+	}
+	fclose(f);
+	free(line);
+	/* pipe_2[0] was already closed above; closing it again is a bug. */
+
+	/* Tell the clone3()'d child to finish. */
+	if (write(pipe_2[1], &buf, 1) != 1)
+		ksft_print_msg("Failed to notify the clone3()'d child\n");
+	close(pipe_2[1]);
+
+	if (waitpid(ns_pid, &status, 0) < 0) {
+		/* Latch errno before printing so the result stays non-zero. */
+		ret = -errno;
+		ksft_print_msg("Child returned %s\n", strerror(-ret));
+		goto out;
+	}
+
+	if (!WIFEXITED(status))
+		ksft_test_result_fail("Child error\n");
+
+	/* Fold the six sub-tests run by the forked child into our counters. */
+	ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status));
+	ksft_cnt.ksft_fail = WEXITSTATUS(status);
+
+	if (ns3 == pid && ns2 == 42 && ns1 == 1)
+		ksft_test_result_pass(
+			"PIDs in all namespaces as expected (%d,%d,%d)\n",
+			ns3, ns2, ns1);
+	else
+		ksft_test_result_fail(
+			"PIDs in all namespaces not as expected (%d,%d,%d)\n",
+			ns3, ns2, ns1);
+
+	ret = 0;
+out:
+	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/core/.gitignore b/tools/testing/selftests/core/.gitignore
new file mode 100644
index 000000000..6e6712ce5
--- /dev/null
+++ b/tools/testing/selftests/core/.gitignore
@@ -0,0 +1 @@
+close_range_test
diff --git a/tools/testing/selftests/core/Makefile b/tools/testing/selftests/core/Makefile
new file mode 100644
index 000000000..f6f2d6f47
--- /dev/null
+++ b/tools/testing/selftests/core/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/
+
+TEST_GEN_PROGS := close_range_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/core/close_range_test.c b/tools/testing/selftests/core/close_range_test.c
new file mode 100644
index 000000000..0a2679584
--- /dev/null
+++ b/tools/testing/selftests/core/close_range_test.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kernel.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+#include "../clone3/clone3_selftests.h"
+
+#ifndef __NR_close_range
+#define __NR_close_range -1
+#endif
+
+#ifndef CLOSE_RANGE_UNSHARE
+#define CLOSE_RANGE_UNSHARE (1U << 1)
+#endif
+
+/*
+ * Raw close_range() system call wrapper: passes @fd, @max_fd and @flags
+ * straight to syscall(). When the headers lack __NR_close_range the number
+ * is defined as -1 above.
+ */
+static inline int sys_close_range(unsigned int fd, unsigned int max_fd,
+ unsigned int flags)
+{
+ return syscall(__NR_close_range, fd, max_fd, flags);
+}
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/*
+ * Open 101 descriptors on /dev/null and close them in several
+ * close_range() calls, checking after each call which ones are still open.
+ */
+TEST(core_close_range)
+{
+	int i, ret;
+	int open_fds[101];
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				SKIP(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	/*
+	 * An invalid flags value must be rejected. The same call detects a
+	 * missing syscall; that check has to happen before the expectation,
+	 * because an EXPECT handler block only runs when the expectation
+	 * fails and ENOSYS also yields -1.
+	 */
+	ret = sys_close_range(open_fds[0], open_fds[100], -1);
+	if (ret < 0 && errno == ENOSYS)
+		SKIP(return, "close_range() syscall not supported");
+	EXPECT_EQ(-1, ret);
+
+	EXPECT_EQ(0, sys_close_range(open_fds[0], open_fds[50], 0));
+
+	for (i = 0; i <= 50; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	for (i = 51; i <= 100; i++)
+		EXPECT_GT(fcntl(open_fds[i], F_GETFL), -1);
+
+	/*
+	 * create a couple of gaps; go through open_fds[] so the gaps are
+	 * guaranteed to fall inside the range closed next, whatever fd
+	 * numbers the harness already had open.
+	 */
+	close(open_fds[57]);
+	close(open_fds[78]);
+	close(open_fds[81]);
+	close(open_fds[82]);
+	close(open_fds[84]);
+	close(open_fds[90]);
+
+	EXPECT_EQ(0, sys_close_range(open_fds[51], open_fds[92], 0));
+
+	for (i = 51; i <= 92; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	for (i = 93; i <= 100; i++)
+		EXPECT_GT(fcntl(open_fds[i], F_GETFL), -1);
+
+	/* test that the kernel caps and still closes all fds */
+	EXPECT_EQ(0, sys_close_range(open_fds[93], open_fds[99], 0));
+
+	for (i = 93; i <= 99; i++)
+		EXPECT_EQ(-1, fcntl(open_fds[i], F_GETFL));
+
+	/* The last descriptor was outside the range and must survive. */
+	EXPECT_GT(fcntl(open_fds[100], F_GETFL), -1);
+
+	EXPECT_EQ(0, sys_close_range(open_fds[100], open_fds[100], 0));
+
+	EXPECT_EQ(-1, fcntl(open_fds[100], F_GETFL));
+}
+
+/*
+ * Same sequence as core_close_range, but run with CLOSE_RANGE_UNSHARE in a
+ * child created by clone3(CLONE_FILES). The child reports the outcome
+ * through its exit status.
+ */
+TEST(close_range_unshare)
+{
+	int i, ret, status;
+	pid_t pid;
+	int open_fds[101];
+	/* sys_clone3() takes the selftest's own struct __clone_args. */
+	struct __clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				SKIP(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = sys_close_range(open_fds[0], open_fds[50],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 0; i <= 50; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		for (i = 51; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) == -1)
+				exit(EXIT_FAILURE);
+
+		/*
+		 * create a couple of gaps; indexed through open_fds[] so
+		 * they always land inside the range closed next.
+		 */
+		close(open_fds[57]);
+		close(open_fds[78]);
+		close(open_fds[81]);
+		close(open_fds[82]);
+		close(open_fds[84]);
+		close(open_fds[90]);
+
+		ret = sys_close_range(open_fds[51], open_fds[92],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 51; i <= 92; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		for (i = 93; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) == -1)
+				exit(EXIT_FAILURE);
+
+		/* test that the kernel caps and still closes all fds */
+		ret = sys_close_range(open_fds[93], open_fds[99],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 93; i <= 99; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		if (fcntl(open_fds[100], F_GETFL) == -1)
+			exit(EXIT_FAILURE);
+
+		ret = sys_close_range(open_fds[100], open_fds[100],
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		if (fcntl(open_fds[100], F_GETFL) != -1)
+			exit(EXIT_FAILURE);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+/*
+ * In a clone3(CLONE_FILES) child, call close_range() with
+ * CLOSE_RANGE_UNSHARE and an upper bound of UINT_MAX, then verify that
+ * every descriptor opened by the test is closed in the child.
+ */
+TEST(close_range_unshare_capped)
+{
+	int i, ret, status;
+	pid_t pid;
+	int open_fds[101];
+	/* sys_clone3() takes the selftest's own struct __clone_args. */
+	struct __clone_args args = {
+		.flags = CLONE_FILES,
+		.exit_signal = SIGCHLD,
+	};
+
+	for (i = 0; i < ARRAY_SIZE(open_fds); i++) {
+		int fd;
+
+		fd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+		ASSERT_GE(fd, 0) {
+			if (errno == ENOENT)
+				SKIP(return, "Skipping test since /dev/null does not exist");
+		}
+
+		open_fds[i] = fd;
+	}
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		ret = sys_close_range(open_fds[0], UINT_MAX,
+				      CLOSE_RANGE_UNSHARE);
+		if (ret)
+			exit(EXIT_FAILURE);
+
+		for (i = 0; i <= 100; i++)
+			if (fcntl(open_fds[i], F_GETFL) != -1)
+				exit(EXIT_FAILURE);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/cpu-hotplug/Makefile b/tools/testing/selftests/cpu-hotplug/Makefile
new file mode 100644
index 000000000..d8be047ee
--- /dev/null
+++ b/tools/testing/selftests/cpu-hotplug/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+TEST_PROGS := cpu-on-off-test.sh
+
+include ../lib.mk
+
+# Run cpu-on-off-test.sh with -a (the script's "all cpus" mode) and print a
+# FAIL marker if the script exits with a non-zero status.
+run_full_test:
+ @/bin/bash ./cpu-on-off-test.sh -a || echo "cpu-hotplug selftests: [FAIL]"
+
+clean:
diff --git a/tools/testing/selftests/cpu-hotplug/config b/tools/testing/selftests/cpu-hotplug/config
new file mode 100644
index 000000000..d4aca2ad5
--- /dev/null
+++ b/tools/testing/selftests/cpu-hotplug/config
@@ -0,0 +1 @@
+CONFIG_NOTIFIER_ERROR_INJECTION=y
diff --git a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
new file mode 100755
index 000000000..0d26b5e3f
--- /dev/null
+++ b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
@@ -0,0 +1,293 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+SYSFS=
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Check that the test can run at all and collect the CPU lists from sysfs.
+# Exits with $ksft_skip when not root, sysfs is not mounted, no cpuN
+# directory exists, or only one CPU is online.
+#
+# Sets: SYSFS, online_cpus, online_max, present_cpus, present_max,
+#       offline_cpus ("0" when nothing is offline) and offline_max.
+#
+prerequisite()
+{
+	msg="skip all tests:"
+
+	if [ "$UID" != 0 ]; then
+		echo "$msg must be run as root" >&2
+		exit $ksft_skip
+	fi
+
+	taskset -p 01 $$
+
+	SYSFS=$(mount -t sysfs | head -1 | awk '{ print $3 }')
+
+	if [ ! -d "$SYSFS" ]; then
+		echo "$msg sysfs is not mounted" >&2
+		exit $ksft_skip
+	fi
+
+	if ! ls "$SYSFS"/devices/system/cpu/cpu* > /dev/null 2>&1; then
+		echo "$msg cpu hotplug is not supported" >&2
+		exit $ksft_skip
+	fi
+
+	echo "CPU online/offline summary:"
+	online_cpus=$(cat "$SYSFS/devices/system/cpu/online")
+	# The sysfs lists look like "0-3" or "0,2-5": the highest CPU follows
+	# the last '-' or ','. Stripping only up to '-' would take a list
+	# such as "0,2" for a single CPU.
+	online_max=${online_cpus##*[-,]}
+
+	if [[ "$online_cpus" = "$online_max" ]]; then
+		echo "$msg since there is only one cpu: $online_cpus"
+		exit $ksft_skip
+	fi
+
+	present_cpus=$(cat "$SYSFS/devices/system/cpu/present")
+	present_max=${present_cpus##*[-,]}
+	echo "present_cpus = $present_cpus present_max = $present_max"
+
+	echo -e "\t Cpus in online state: $online_cpus"
+
+	offline_cpus=$(cat "$SYSFS/devices/system/cpu/offline")
+	if [[ -z "$offline_cpus" ]]; then
+		offline_cpus=0
+	else
+		offline_max=${offline_cpus##*[-,]}
+	fi
+	echo -e "\t Cpus in offline state: $offline_cpus"
+}
+
+#
+# list all hot-pluggable CPUs
+#
+# $1: optional regex matched against each cpuN/online file ("0" or "1");
+#     defaults to ".*", i.e. every CPU that has an online file.
+# Prints one CPU number per line.
+#
+hotpluggable_cpus()
+{
+	local state=${1:-.\*}
+	local cpu
+
+	for cpu in "$SYSFS"/devices/system/cpu/cpu*; do
+		# The pattern must be quoted: an unquoted ".*" is glob-expanded
+		# by the shell before grep ever sees it.
+		if [ -f "$cpu/online" ] && grep -q -- "$state" "$cpu/online"; then
+			echo "${cpu##/*/cpu}"
+		fi
+	done
+}
+
+# Print the hot-pluggable CPUs whose online file contains 0.
+# (The spelling of the name is kept because the rest of the script calls it.)
+hotplaggable_offline_cpus()
+{
+	hotpluggable_cpus "0"
+}
+
+# Print the hot-pluggable CPUs whose online file contains 1.
+hotpluggable_online_cpus()
+{
+	hotpluggable_cpus "1"
+}
+
+# $1: cpu number. Succeeds when cpu$1/online contains a 1.
+cpu_is_online()
+{
+	grep -q 1 "$SYSFS/devices/system/cpu/cpu$1/online"
+}
+
+# $1: cpu number. Succeeds when cpu$1/online contains a 0.
+cpu_is_offline()
+{
+	grep -q 0 "$SYSFS/devices/system/cpu/cpu$1/online"
+}
+
+# $1: cpu number. Writes 1 to cpu$1/online; the status is that of the write.
+online_cpu()
+{
+	echo 1 > "$SYSFS/devices/system/cpu/cpu$1/online"
+}
+
+# $1: cpu number. Writes 0 to cpu$1/online; the status is that of the write.
+offline_cpu()
+{
+	echo 0 > "$SYSFS/devices/system/cpu/cpu$1/online"
+}
+
+# $1: cpu number. Bring the CPU online and exit the script with status 1 if
+# the write fails or the CPU does not report online afterwards.
+online_cpu_expect_success()
+{
+	local target=$1
+
+	online_cpu $target || {
+		echo $FUNCNAME $target: unexpected fail >&2
+		exit 1
+	}
+
+	cpu_is_online $target || {
+		echo $FUNCNAME $target: unexpected offline >&2
+		exit 1
+	}
+}
+
+# $1: cpu number. Try to bring the CPU online and exit the script with
+# status 1 if that succeeds or the CPU does not report offline afterwards.
+online_cpu_expect_fail()
+{
+	local target=$1
+
+	if online_cpu $target 2> /dev/null; then
+		echo $FUNCNAME $target: unexpected success >&2
+		exit 1
+	fi
+
+	cpu_is_offline $target || {
+		echo $FUNCNAME $target: unexpected online >&2
+		exit 1
+	}
+}
+
+# $1: cpu number. Take the CPU offline and exit the script with status 1 if
+# the write fails or the CPU does not report offline afterwards.
+offline_cpu_expect_success()
+{
+	local cpu=$1
+
+	if ! offline_cpu $cpu; then
+		echo $FUNCNAME $cpu: unexpected fail >&2
+		exit 1
+	elif ! cpu_is_offline $cpu; then
+		# The CPU is still online here, so say so.
+		echo $FUNCNAME $cpu: unexpected online >&2
+		exit 1
+	fi
+}
+
+# $1: cpu number. Try to take the CPU offline and exit the script with
+# status 1 if that succeeds or the CPU does not report online afterwards.
+offline_cpu_expect_fail()
+{
+	local target=$1
+
+	if offline_cpu $target 2> /dev/null; then
+		echo $FUNCNAME $target: unexpected success >&2
+		exit 1
+	fi
+
+	cpu_is_online $target || {
+		echo $FUNCNAME $target: unexpected offline >&2
+		exit 1
+	}
+}
+
+# Defaults; -e, -a and -p below override error, allcpus and priority, the
+# CPU list variables are filled in by prerequisite().
+error=-12
+allcpus=0
+priority=0
+online_cpus=0
+online_max=0
+offline_cpus=0
+offline_max=0
+present_cpus=0
+present_max=0
+
+while getopts e:ahp: opt; do
+	case $opt in
+	e) error=$OPTARG ;;
+	a) allcpus=1 ;;
+	p) priority=$OPTARG ;;
+	h)
+		echo "Usage $0 [ -a ] [ -e errno ] [ -p notifier-priority ]"
+		echo -e "\t default offline one cpu"
+		echo -e "\t run with -a option to offline all cpus"
+		exit
+		;;
+	esac
+done
+
+# The injected error must be a negative errno value.
+if ! { [ "$error" -ge -4095 ] && [ "$error" -lt 0 ]; }; then
+	echo "error code must be -4095 <= errno < 0" >&2
+	exit 1
+fi
+
+prerequisite
+
+#
+# Safe test (default) - offline and online one cpu
+#
+if [ "$allcpus" -eq 0 ]; then
+	echo "Limited scope test: one hotplug cpu"
+	echo -e "\t (leaves cpu in the original state):"
+	echo -e "\t online to offline to online: cpu $online_max"
+	offline_cpu_expect_success $online_max
+	online_cpu_expect_success $online_max
+
+	# prerequisite() leaves offline_cpus at "0" when nothing is offline,
+	# otherwise it holds a CPU list such as "4-7". A numeric -gt would
+	# evaluate that list as arithmetic (4-7 = -3) and skip this part.
+	if [[ "$offline_cpus" != "0" ]]; then
+		echo -e "\t offline to online to offline: cpu $present_max"
+		online_cpu_expect_success $present_max
+		offline_cpu_expect_success $present_max
+		# NOTE(review): this leaves the cpu online, which does not
+		# match the message printed above - confirm the intent.
+		online_cpu $present_max
+	fi
+	exit 0
+else
+	echo "Full scope test: all hotplug cpus"
+	echo -e "\t online all offline cpus"
+	echo -e "\t offline all online cpus"
+	echo -e "\t online all offline cpus"
+fi
+
+#
+# Online all hot-pluggable CPUs
+#
+for cpu in $(hotplaggable_offline_cpus); do
+	online_cpu_expect_success $cpu
+done
+
+#
+# Offline all hot-pluggable CPUs
+#
+for cpu in $(hotpluggable_online_cpus); do
+	offline_cpu_expect_success $cpu
+done
+
+#
+# Online all hot-pluggable CPUs again
+#
+for cpu in $(hotplaggable_offline_cpus); do
+	online_cpu_expect_success $cpu
+done
+
+#
+# Test with cpu notifier error injection
+#
+
+DEBUGFS=$(mount -t debugfs | head -1 | awk '{ print $3 }')
+NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/cpu
+
+# Reload the cpu-notifier-error-inject module with the requested priority,
+# then exit with $ksft_skip unless debugfs is mounted and the module's
+# notifier-error-inject/cpu directory exists.
+prerequisite_extra()
+{
+	msg="skip extra tests:"
+
+	/sbin/modprobe -q -r cpu-notifier-error-inject
+	/sbin/modprobe -q cpu-notifier-error-inject priority=$priority
+
+	[ -d "$DEBUGFS" ] || {
+		echo $msg debugfs is not mounted >&2
+		exit $ksft_skip
+	}
+
+	[ -d $NOTIFIER_ERR_INJECT_DIR ] || {
+		echo $msg cpu-notifier-error-inject module is not available >&2
+		exit $ksft_skip
+	}
+}
+
+prerequisite_extra
+
+# Shorthand for the two error-injection control files used below.
+down_prepare_err=$NOTIFIER_ERR_INJECT_DIR/actions/CPU_DOWN_PREPARE/error
+up_prepare_err=$NOTIFIER_ERR_INJECT_DIR/actions/CPU_UP_PREPARE/error
+
+#
+# Offline all hot-pluggable CPUs
+#
+echo 0 > $down_prepare_err
+for cpu in $(hotpluggable_online_cpus); do
+	offline_cpu_expect_success $cpu
+done
+
+#
+# Test CPU hot-add error handling (offline => online)
+#
+echo $error > $up_prepare_err
+for cpu in $(hotplaggable_offline_cpus); do
+	online_cpu_expect_fail $cpu
+done
+
+#
+# Online all hot-pluggable CPUs
+#
+echo 0 > $up_prepare_err
+for cpu in $(hotplaggable_offline_cpus); do
+	online_cpu_expect_success $cpu
+done
+
+#
+# Test CPU hot-remove error handling (online => offline)
+#
+echo $error > $down_prepare_err
+for cpu in $(hotpluggable_online_cpus); do
+	offline_cpu_expect_fail $cpu
+done
+
+# Clear the injected error and unload the module again.
+echo 0 > $down_prepare_err
+/sbin/modprobe -q -r cpu-notifier-error-inject
diff --git a/tools/testing/selftests/cpufreq/Makefile b/tools/testing/selftests/cpufreq/Makefile
new file mode 100644
index 000000000..c86ca8342
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+TEST_PROGS := main.sh
+TEST_FILES := cpu.sh cpufreq.sh governor.sh module.sh special-tests.sh
+
+include ../lib.mk
+
+clean:
diff --git a/tools/testing/selftests/cpufreq/config b/tools/testing/selftests/cpufreq/config
new file mode 100644
index 000000000..27ff72ebd
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/config
@@ -0,0 +1,15 @@
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_STAT=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
+CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_PI_LIST=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUG_MUTEXES=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_LOCKDEP=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
diff --git a/tools/testing/selftests/cpufreq/cpu.sh b/tools/testing/selftests/cpufreq/cpu.sh
new file mode 100755
index 000000000..39fdcdfb8
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/cpu.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# CPU helpers
+
+# protect against multiple inclusion: the first time through FILE_CPU is
+# empty and gets set; any later source of this file returns at once.
+if [ -z "$FILE_CPU" ]; then
+	FILE_CPU=DONE
+else
+	return 0
+fi
+
+source cpufreq.sh
+
+# Run the given command once per cpuN entry under $CPUROOT, appending the
+# entry name (e.g. "cpu3") as the last argument.
+# $@: command and its leading arguments
+for_each_cpu()
+{
+	local cpu
+
+	# Glob instead of parsing ls output; cpu[0-9]* is anchored, so only
+	# names that start with cpu<digit> are visited.
+	for cpu in "$CPUROOT"/cpu[0-9]*; do
+		[ -e "$cpu" ] || continue
+		"$@" "${cpu##*/}"
+	done
+}
+
+# Like for_each_cpu, but only for entries whose number starts with 1-9,
+# which leaves out cpu0.
+# $@: command and its leading arguments
+for_each_non_boot_cpu()
+{
+	local cpu
+
+	for cpu in "$CPUROOT"/cpu[1-9]*; do
+		[ -e "$cpu" ] || continue
+		"$@" "${cpu##*/}"
+	done
+}
+
+#$1: cpu
+# Announce the action, then write 0 to $CPUROOT/$1/online.
+offline_cpu()
+{
+	# Pass the name as an argument, never as part of the format string.
+	printf 'Offline %s\n' "$1"
+	echo 0 > "$CPUROOT/$1/online"
+}
+
+#$1: cpu
+# Announce the action, then write 1 to $CPUROOT/$1/online.
+online_cpu()
+{
+	# Pass the name as an argument, never as part of the format string.
+	printf 'Online %s\n' "$1"
+	echo 1 > "$CPUROOT/$1/online"
+}
+
+#$1: cpu
+# Take the CPU offline and bring it straight back online.
+reboot_cpu()
+{
+	offline_cpu "$1"
+	online_cpu "$1"
+}
+
+# Reboot CPUs
+# param: number of times we want to run the loop
+#
+# Each pass takes every non-boot CPU offline and then brings them all back.
+reboot_cpus()
+{
+	printf "** Test: Running ${FUNCNAME[0]} for $1 loops **\n\n"
+
+	# for-in over a pre-expanded list: callees may reuse the name "i".
+	for i in $(seq 1 $1); do
+		for_each_non_boot_cpu offline_cpu
+		for_each_non_boot_cpu online_cpu
+		printf "\n"
+	done
+
+	printf "\n%s\n\n" "------------------------------------------------"
+}
+
+# Prints warning for all CPUs with missing cpufreq directory
+# (runs cpu_should_have_cpufreq_directory on every CPU).
+print_unmanaged_cpus()
+{
+	for_each_cpu "cpu_should_have_cpufreq_directory"
+}
+
+# Counts CPUs with cpufreq directories
+# Prints the number of cpuN entries under $CPUROOT that contain a cpufreq
+# directory. The result is also left in the global "count", as before.
+count_cpufreq_managed_cpus()
+{
+	local cpu_dir
+
+	count=0
+
+	for cpu_dir in "$CPUROOT"/cpu[0-9]*; do
+		if [ -d "$cpu_dir/cpufreq" ]; then
+			count=$((count + 1))
+		fi
+	done
+
+	echo "$count"
+}
diff --git a/tools/testing/selftests/cpufreq/cpufreq.sh b/tools/testing/selftests/cpufreq/cpufreq.sh
new file mode 100755
index 000000000..b583a2fb4
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/cpufreq.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# protect against multiple inclusion: the first time through FILE_CPUFREQ
+# is empty and gets set; any later source of this file returns at once.
+if [ -z "$FILE_CPUFREQ" ]; then
+	FILE_CPUFREQ=DONE
+else
+	return 0
+fi
+
+source cpu.sh
+
+
+# $1: cpu
+# Print a warning when $CPUROOT/$1 has no cpufreq directory.
+cpu_should_have_cpufreq_directory()
+{
+	if [ ! -d "$CPUROOT/$1/cpufreq" ]; then
+		printf 'Warning: No cpufreq directory present for %s\n' "$1"
+	fi
+}
+
+# $1: cpu
+# Print a warning when $CPUROOT/$1 does have a cpufreq directory.
+cpu_should_not_have_cpufreq_directory()
+{
+	if [ -d "$CPUROOT/$1/cpufreq" ]; then
+		printf 'Warning: cpufreq directory present for %s\n' "$1"
+	fi
+}
+
+# Run the given command once per policyN entry under $CPUFREQROOT,
+# appending the entry name (e.g. "policy0") as the last argument.
+# $@: command and its leading arguments
+for_each_policy()
+{
+	local policy
+
+	for policy in "$CPUFREQROOT"/policy[0-9]*; do
+		[ -e "$policy" ] || continue
+		"$@" "${policy##*/}"
+	done
+}
+
+# Like for_each_policy, but every invocation is started in the background.
+# The function does not wait for them; that is left to the caller.
+# $@: command and its leading arguments
+for_each_policy_concurrent()
+{
+	local policy
+
+	for policy in "$CPUFREQROOT"/policy[0-9]*; do
+		[ -e "$policy" ] || continue
+		"$@" "${policy##*/}" &
+	done
+}
+
+# $1: Path
+# Print "<name>:<contents>" for every regular file in the directory and
+# descend into every other entry.
+read_cpufreq_files_in_dir()
+{
+	local dir=$1
+	local path
+
+	printf 'Printing directory: %s\n\n' "$dir"
+
+	for path in "$dir"/*; do
+		# An empty directory leaves the pattern unexpanded.
+		[ -e "$path" ] || continue
+
+		if [ -f "$path" ]; then
+			printf '%s:' "${path##*/}"
+			cat "$path"
+		else
+			printf '\n'
+			read_cpufreq_files_in_dir "$path"
+		fi
+	done
+	printf '\n'
+}
+
+
+# Print a test banner, dump every file below $CPUFREQROOT via
+# read_cpufreq_files_in_dir, then print a separator line.
+read_all_cpufreq_files()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ read_cpufreq_files_in_dir $CPUFREQROOT
+
+ printf "%s\n\n" "------------------------------------------------"
+}
+
+
+# UPDATE CPUFREQ FILES
+
+# $1: directory path
+# For every regular file whose mode string contains a 'w', read its current
+# value and write that same value back. Other entries are descended into.
+update_cpufreq_files_in_dir()
+{
+	local dir=$1
+	local path name wfile val
+
+	printf 'Updating directory: %s\n\n' "$dir"
+
+	for path in "$dir"/*; do
+		# An empty directory leaves the pattern unexpanded.
+		[ -e "$path" ] || continue
+		name=${path##*/}
+
+		if [ -f "$path" ]; then
+			# file is writable ? Decided from the mode string, not
+			# from an access check (root may write to anything).
+			wfile=$(ls -l "$path" | awk '$1 ~ /^.*w.*/ { print $NF; }')
+
+			# scaling_setspeed is a special file and we
+			# should skip updating it
+			if [ -n "$wfile" ] && [ "$name" != "scaling_setspeed" ]; then
+				val=$(cat "$path")
+				printf 'Writing %s to: %s\n' "$val" "$name"
+				echo "$val" > "$path"
+			fi
+		else
+			printf '\n'
+			update_cpufreq_files_in_dir "$path"
+		fi
+	done
+
+	printf '\n'
+}
+
+# Update all writable files with their existing values
+# Prints a test banner, runs update_cpufreq_files_in_dir on $CPUFREQROOT,
+# then prints a separator line.
+update_all_cpufreq_files()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ update_cpufreq_files_in_dir $CPUFREQROOT
+
+ printf "%s\n\n" "------------------------------------------------"
+}
+
+
+# CHANGE CPU FREQUENCIES
+
+# $1: policy
+# Print the contents of the policy's scaling_cur_freq file.
+find_current_freq()
+{
+	cat "$CPUFREQROOT/$1/scaling_cur_freq"
+}
+
+# $1: policy
+# $2: frequency
+# Announce the change, then write the frequency to scaling_setspeed.
+set_cpu_frequency()
+{
+	printf 'Change frequency for %s to %s\n' "$1" "$2"
+	echo "$2" > "$CPUFREQROOT/$1/scaling_setspeed"
+}
+
+# $1: policy
+# Switch the policy to the userspace governor, set every frequency listed
+# in scaling_available_frequencies in turn, then restore the governor that
+# was active before. Returns early if userspace is not available.
+test_all_frequencies()
+{
+	local policy=$1
+	local filepath="$CPUFREQROOT/$policy"
+	local found freqs freq
+
+	backup_governor "$policy"
+
+	# switch_governor prints 1 when the governor is not available.
+	found=$(switch_governor "$policy" "userspace")
+	if [ "$found" = 1 ]; then
+		printf '%s: userspace governor not available for: %s\n' \
+			"${FUNCNAME[0]}" "$policy"
+		return;
+	fi
+
+	printf 'Switched governor for %s to userspace\n\n' "$policy"
+
+	freqs=$(cat "$filepath/scaling_available_frequencies")
+	printf 'Available frequencies for %s: %s\n\n' "$policy" "$freqs"
+
+	# Set all frequencies one-by-one ($freqs is split on purpose)
+	for freq in $freqs; do
+		set_cpu_frequency "$policy" "$freq"
+	done
+
+	printf '\n'
+
+	restore_governor "$policy"
+}
+
+# $1: loop count
+# Prints a test banner, then runs test_all_frequencies on every policy $1
+# times and finishes with a separator line.
+shuffle_frequency_for_all_cpus()
+{
+ printf "** Test: Running ${FUNCNAME[0]} for $1 loops **\n\n"
+
+ for i in `seq 1 $1`; do
+ for_each_policy test_all_frequencies
+ done
+ printf "\n%s\n\n" "------------------------------------------------"
+}
+
+# Basic cpufreq tests
+# Exits the script when no CPU has a cpufreq directory; otherwise runs the
+# file read/update, hotplug, frequency and governor tests in that order.
+cpufreq_basic_tests()
+{
+	printf "*** RUNNING CPUFREQ SANITY TESTS ***\n"
+	printf "====================================\n\n"
+
+	count=$(count_cpufreq_managed_cpus)
+	if ! [ $count = 0 ]; then
+		printf "CPUFreq manages: $count CPUs\n\n"
+	else
+		printf "No cpu is managed by cpufreq core, exiting\n"
+		exit;
+	fi
+
+	# Detect & print which CPUs are not managed by cpufreq
+	print_unmanaged_cpus
+
+	# read/update all cpufreq files
+	read_all_cpufreq_files
+	update_all_cpufreq_files
+
+	# hotplug cpus
+	reboot_cpus 5
+
+	# Test all frequencies
+	shuffle_frequency_for_all_cpus 2
+
+	# Test all governors
+	shuffle_governors_for_all_cpus 1
+}
+
+# Suspend/resume
+# $1: "suspend" or "hibernate", $2: loop count
+#
+# Writes "mem" (suspend) or "disk" (hibernate) to $SYSFS/power/state $2
+# times and runs cpufreq_basic_tests after each write.
+# Returns 1 when the state file is missing, $1 is not a valid option, or
+# the state file does not list the requested state.
+do_suspend()
+{
+	local state_file="$SYSFS/power/state"
+
+	printf '** Test: Running %s: Trying %s for %s loops **\n\n' \
+		"${FUNCNAME[0]}" "$1" "$2"
+
+	# Is the directory available
+	if [ ! -d "$SYSFS/power/" ] || [ ! -f "$state_file" ]; then
+		printf '%s not available\n' "$state_file"
+		return 1
+	fi
+
+	case $1 in
+	suspend)
+		filename="mem"
+		;;
+	hibernate)
+		filename="disk"
+		;;
+	*)
+		printf '%s is not a valid option\n' "$1"
+		return 1
+		;;
+	esac
+
+	if ! grep -q -- "$filename" "$state_file"; then
+		printf "Tried to %s but %s isn't present in %s\n" \
+			"$1" "$filename" "$state_file"
+		return 1;
+	fi
+
+	# for-in over a pre-expanded list: the tests called below reuse the
+	# global name "i" for their own loops.
+	for i in $(seq 1 "$2"); do
+		printf 'Starting %s\n' "$1"
+		echo "$filename" > "$state_file"
+		printf 'Came out of %s\n' "$1"
+
+		printf 'Do basic tests after finishing %s to verify cpufreq state\n\n' "$1"
+		cpufreq_basic_tests
+	done
+}
diff --git a/tools/testing/selftests/cpufreq/governor.sh b/tools/testing/selftests/cpufreq/governor.sh
new file mode 100755
index 000000000..fe37df79c
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/governor.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test governors
+
+# protect against multiple inclusion
+if [ $FILE_GOVERNOR ]; then
+ return 0
+else
+ FILE_GOVERNOR=DONE
+fi
+
+source cpu.sh
+source cpufreq.sh
+
+CUR_GOV=
+CUR_FREQ=
+
+# Locate the sysfs directory that holds a governor's tunables
+# $1: policy, $2: governor
+# Prints the global directory ($CPUFREQROOT/<governor>) if it exists,
+# otherwise the per-policy one, otherwise the literal word INVALID.
+find_gov_directory()
+{
+ local candidate
+
+ # global location first, per-policy location second
+ for candidate in $CPUFREQROOT/$2 $CPUFREQROOT/$1/$2; do
+ if [ -d $candidate ]; then
+ printf "$candidate\n"
+ return
+ fi
+ done
+
+ printf "INVALID\n"
+}
+
+# $1: policy
+# Prints the content of the policy's scaling_governor file, i.e. the
+# governor currently selected for that policy.
+find_current_governor()
+{
+ cat $CPUFREQROOT/$1/scaling_governor
+}
+
+# $1: policy
+backup_governor()
+{
+ CUR_GOV=$(find_current_governor $1)
+
+ printf "Governor backup done for $1: $CUR_GOV\n"
+
+ if [ $CUR_GOV == "userspace" ]; then
+ CUR_FREQ=$(find_current_freq $1)
+ printf "Governor frequency backup done for $1: $CUR_FREQ\n"
+ fi
+
+ printf "\n"
+}
+
+# $1: policy
+#
+# Switches the policy back to the governor stored in the global CUR_GOV.
+# When that governor is "userspace" the frequency stored in CUR_FREQ is
+# re-applied as well.
+restore_governor()
+{
+ __switch_governor $1 $CUR_GOV
+
+ printf "Governor restored for $1 to $CUR_GOV\n"
+
+ # Quoted: an empty CUR_GOV must compare as "not userspace" rather than
+ # break the test
+ if [ "$CUR_GOV" == "userspace" ]; then
+ set_cpu_frequency $1 $CUR_FREQ
+ printf "Governor frequency restored for $1: $CUR_FREQ\n"
+ fi
+
+ printf "\n"
+}
+
+# param:
+# $1: policy, $2: governor
+# Selects the governor by writing its name to the policy's
+# scaling_governor file. No availability check is done here.
+__switch_governor()
+{
+ echo $2 > $CPUFREQROOT/$1/scaling_governor
+}
+
+# param:
+# $1: cpu, $2: governor
+# Same as __switch_governor, but addresses the file through the CPU's own
+# directory ($CPUROOT/<cpu>/cpufreq) instead of a policy directory.
+__switch_governor_for_cpu()
+{
+ echo $2 > $CPUROOT/$1/cpufreq/scaling_governor
+}
+
+# SWITCH GOVERNORS
+
+# $1: cpu, $2: governor
+# NOTE(review): $1 is used as a directory name below $CPUFREQROOT, exactly
+# like the policy argument of the other helpers in this file - confirm
+# whether "cpu" or "policy" is the intended description.
+#
+# Switches to the governor only if it is listed in
+# scaling_available_governors. The result is reported on stdout, not via
+# the return status: prints 1 if the governor is unavailable, 0 after
+# switching.
+switch_governor()
+{
+ local filepath=$CPUFREQROOT/$1/scaling_available_governors
+
+ # check if governor is available
+ # (grep matches substrings, so this counts lines containing $2)
+ local found=$(cat $filepath | grep $2 | wc -l)
+ if [ $found = 0 ]; then
+ echo 1;
+ return
+ fi
+
+ __switch_governor $1 $2
+ echo 0;
+}
+
+# $1: policy, $2: governor
+switch_show_governor()
+{
+ cur_gov=find_current_governor
+ if [ $cur_gov == "userspace" ]; then
+ cur_freq=find_current_freq
+ fi
+
+ # switch governor
+ __switch_governor $1 $2
+
+ printf "\nSwitched governor for $1 to $2\n\n"
+
+ if [ $2 == "userspace" -o $2 == "powersave" -o $2 == "performance" ]; then
+ printf "No files to read for $2 governor\n\n"
+ return
+ fi
+
+ # show governor files
+ local govpath=$(find_gov_directory $1 $2)
+ read_cpufreq_files_in_dir $govpath
+}
+
+# $1: function to be called, $2: policy
+#
+# Calls "$1 <policy> <governor>" once for every governor listed in the
+# policy's scaling_available_governors file. The governor that was active
+# beforehand is saved first and restored afterwards.
+call_for_each_governor()
+{
+ local filepath=$CPUFREQROOT/$2/scaling_available_governors
+
+ # Exit if cpu isn't managed by cpufreq core
+ if [ ! -f $filepath ]; then
+ return;
+ fi
+
+ backup_governor $2
+
+ local governors=$(cat $filepath)
+ printf "Available governors for $2: $governors\n"
+
+ # unquoted on purpose: the list is split on whitespace
+ for governor in $governors; do
+ $1 $2 $governor
+ done
+
+ restore_governor $2
+}
+
+# $1: loop count
+#
+# Repeats $1 times: hand call_for_each_governor and switch_show_governor
+# to for_each_policy, so that every available governor is switched to and
+# shown for the policies.
+shuffle_governors_for_all_cpus()
+{
+ printf "** Test: Running ${FUNCNAME[0]} for $1 loops **\n\n"
+
+ for i in `seq 1 $1`; do
+ for_each_policy call_for_each_governor switch_show_governor
+ done
+ printf "%s\n\n" "------------------------------------------------"
+}
diff --git a/tools/testing/selftests/cpufreq/main.sh b/tools/testing/selftests/cpufreq/main.sh
new file mode 100755
index 000000000..31f8c9a76
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/main.sh
@@ -0,0 +1,198 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source cpu.sh
+source cpufreq.sh
+source governor.sh
+source module.sh
+source special-tests.sh
+
+FUNC=basic # do basic tests by default
+OUTFILE=cpufreq_selftest
+SYSFS=
+CPUROOT=
+CPUFREQROOT=
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Print the usage text and terminate the script with exit status 2.
+helpme()
+{
+ printf "Usage: $0 [-h] [-todg args]
+ [-h <help>]
+ [-o <output-file-for-dump>]
+ [-t <basic: Basic cpufreq testing
+ suspend: suspend/resume,
+ hibernate: hibernate/resume,
+ modtest: test driver or governor modules. Only to be used with -d or -g options,
+ sptest1: Simple governor switch to produce lockdep.
+ sptest2: Concurrent governor switch to produce lockdep.
+ sptest3: Governor races, shuffle between governors quickly.
+ sptest4: CPU hotplugs with updates to cpufreq files.>]
+ [-d <driver's module name: only with \"-t modtest\">]
+ [-g <governor's module name: only with \"-t modtest\">]
+ \n"
+ exit 2
+}
+
+# Verify the environment and initialise the SYSFS, CPUROOT and CPUFREQROOT
+# globals.
+# Exits with $ksft_skip when not run as root, and with 2 when sysfs, the
+# cpu directories or the cpufreq directory cannot be found.
+prerequisite()
+{
+ msg="skip all tests:"
+
+ if [ $UID != 0 ]; then
+ echo $msg must be run as root >&2
+ exit $ksft_skip
+ fi
+
+ # set the CPU affinity mask of this shell to 01 (CPU 0 only)
+ taskset -p 01 $$
+
+ # mount point (third field) of the first sysfs entry reported by mount
+ SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+ if [ ! -d "$SYSFS" ]; then
+ echo $msg sysfs is not mounted >&2
+ exit 2
+ fi
+
+ CPUROOT=$SYSFS/devices/system/cpu
+ CPUFREQROOT="$CPUROOT/cpufreq"
+
+ if ! ls $CPUROOT/cpu* > /dev/null 2>&1; then
+ echo $msg cpus not available in sysfs >&2
+ exit 2
+ fi
+
+ if ! ls $CPUROOT/cpufreq > /dev/null 2>&1; then
+ echo $msg cpufreq directory not available in sysfs >&2
+ exit 2
+ fi
+}
+
+# Parse the command line options passed as arguments to this function.
+# Sets the globals FUNC (-t), OUTFILE (-o), DRIVER_MOD (-d) and
+# GOVERNOR_MOD (-g); -h and unknown options print the usage text through
+# helpme, which exits.
+parse_arguments()
+{
+ while getopts ht:o:d:g: arg
+ do
+ case $arg in
+ h) # --help
+ helpme
+ ;;
+
+ t) # --func_type (Function to perform: basic, suspend, hibernate, modtest, sptest1/2/3/4 (default: basic))
+ FUNC=$OPTARG
+ ;;
+
+ o) # --output-file (Output file to store dumps)
+ OUTFILE=$OPTARG
+ ;;
+
+ d) # --driver-mod-name (Name of the driver module)
+ DRIVER_MOD=$OPTARG
+ ;;
+
+ g) # --governor-mod-name (Name of the governor module)
+ GOVERNOR_MOD=$OPTARG
+ ;;
+
+ \?)
+ helpme
+ ;;
+ esac
+ done
+}
+
+do_test()
+{
+ # Check if CPUs are managed by cpufreq or not
+ count=$(count_cpufreq_managed_cpus)
+
+ if [ $count = 0 -a $FUNC != "modtest" ]; then
+ echo "No cpu is managed by cpufreq core, exiting"
+ exit 2;
+ fi
+
+ case "$FUNC" in
+ "basic")
+ cpufreq_basic_tests
+ ;;
+
+ "suspend")
+ do_suspend "suspend" 1
+ ;;
+
+ "hibernate")
+ do_suspend "hibernate" 1
+ ;;
+
+ "modtest")
+ # Do we have modules in place?
+ if [ -z $DRIVER_MOD ] && [ -z $GOVERNOR_MOD ]; then
+ echo "No driver or governor module passed with -d or -g"
+ exit 2;
+ fi
+
+ if [ $DRIVER_MOD ]; then
+ if [ $GOVERNOR_MOD ]; then
+ module_test $DRIVER_MOD $GOVERNOR_MOD
+ else
+ module_driver_test $DRIVER_MOD
+ fi
+ else
+ if [ $count = 0 ]; then
+ echo "No cpu is managed by cpufreq core, exiting"
+ exit 2;
+ fi
+
+ module_governor_test $GOVERNOR_MOD
+ fi
+ ;;
+
+ "sptest1")
+ simple_lockdep
+ ;;
+
+ "sptest2")
+ concurrent_lockdep
+ ;;
+
+ "sptest3")
+ governor_race
+ ;;
+
+ "sptest4")
+ hotplug_with_updates
+ ;;
+
+ *)
+ echo "Invalid [-f] function type"
+ helpme
+ ;;
+ esac
+}
+
+# Reset the three dump files of a run
+# $1: file name
+# Each of $1.txt, $1.dmesg_cpufreq.txt and $1.dmesg_full.txt is overwritten
+# with a single empty line.
+clear_dumps()
+{
+ local suffix
+
+ for suffix in txt dmesg_cpufreq.txt dmesg_full.txt; do
+ echo "" > $1.$suffix
+ done
+}
+
+# $1: output file name
+# Appends the kernel log lines containing "cpufreq" to
+# $1.dmesg_cpufreq.txt and the complete kernel log to $1.dmesg_full.txt.
+dmesg_dumps()
+{
+ dmesg | grep cpufreq >> $1.dmesg_cpufreq.txt
+
+ # We may need the full logs as well
+ dmesg >> $1.dmesg_full.txt
+}
+
+# Parse arguments
+parse_arguments $@
+
+# Make sure all requirements are met
+prerequisite
+
+# Run requested functions
+clear_dumps $OUTFILE
+do_test >> $OUTFILE.txt
+dmesg_dumps $OUTFILE
diff --git a/tools/testing/selftests/cpufreq/module.sh b/tools/testing/selftests/cpufreq/module.sh
new file mode 100755
index 000000000..22563cd12
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/module.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Modules specific tests cases
+
+# protect against multiple inclusion
+if [ $FILE_MODULE ]; then
+ return 0
+else
+ FILE_MODULE=DONE
+fi
+
+source cpu.sh
+source cpufreq.sh
+source governor.sh
+
+# Smoke test: load a module and unload it again
+# $1: module
+# A failing insmod or rmmod terminates the whole script.
+test_basic_insmod_rmmod()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ # load the module
+ printf "Inserting $1 module\n"
+ if ! insmod $1; then
+ printf "Insmod $1 failed\n"
+ exit;
+ fi
+
+ # unload the module
+ printf "Removing $1 module\n"
+ if ! rmmod $1; then
+ printf "rmmod $1 failed\n"
+ exit;
+ fi
+
+ printf "\n"
+}
+
+# Insert cpufreq driver module and perform basic tests
+# $1: cpufreq-driver module to insert
+# $2: If we want to play with CPUs (1) or not (0)
+#
+# With $2 = 1 the non-boot CPUs are taken offline before the module is
+# inserted and brought back online afterwards. After the module has been
+# removed again, every CPU is checked through
+# cpu_should_not_have_cpufreq_directory.
+# Returns early (without that check) if insmod or rmmod fails.
+module_driver_test_single()
+{
+ printf "** Test: Running ${FUNCNAME[0]} for driver $1 and cpus_hotplug=$2 **\n\n"
+
+ if [ $2 -eq 1 ]; then
+ # offline all non-boot CPUs
+ for_each_non_boot_cpu offline_cpu
+ printf "\n"
+ fi
+
+ # insert module
+ printf "Inserting $1 module\n\n"
+ insmod $1
+ if [ $? != 0 ]; then
+ printf "Insmod $1 failed\n"
+ return;
+ fi
+
+ if [ $2 -eq 1 ]; then
+ # online all non-boot CPUs
+ for_each_non_boot_cpu online_cpu
+ printf "\n"
+ fi
+
+ # run basic tests
+ cpufreq_basic_tests
+
+ # remove module
+ printf "Removing $1 module\n\n"
+ rmmod $1
+ if [ $? != 0 ]; then
+ printf "rmmod $1 failed\n"
+ return;
+ fi
+
+ # There shouldn't be any cpufreq directories now.
+ for_each_cpu cpu_should_not_have_cpufreq_directory
+ printf "\n"
+}
+
+# $1: cpufreq-driver module to insert
+#
+# Full driver module test: a plain insmod/rmmod cycle, then the
+# single-module test once without and once with CPU hotplug around the
+# insertion. Returns early if the module file cannot be listed.
+module_driver_test()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ # check if module is present or not
+ ls $1 > /dev/null
+ if [ $? != 0 ]; then
+ printf "$1: not present in `pwd` folder\n"
+ return;
+ fi
+
+ # test basic module tests
+ test_basic_insmod_rmmod $1
+
+ # Do simple module test
+ module_driver_test_single $1 0
+
+ # Remove CPUs before inserting module and then bring them back
+ module_driver_test_single $1 1
+ printf "\n"
+}
+
+# find governor name based on governor module name
+# $1: governor module name
+find_gov_name()
+{
+ if [ $1 = "cpufreq_ondemand.ko" ]; then
+ printf "ondemand"
+ elif [ $1 = "cpufreq_conservative.ko" ]; then
+ printf "conservative"
+ elif [ $1 = "cpufreq_userspace.ko" ]; then
+ printf "userspace"
+ elif [ $1 = "cpufreq_performance.ko" ]; then
+ printf "performance"
+ elif [ $1 = "cpufreq_powersave.ko" ]; then
+ printf "powersave"
+ elif [ $1 = "cpufreq_schedutil.ko" ]; then
+ printf "schedutil"
+ fi
+}
+
+# $1: governor string, $2: governor module, $3: policy
+# example: module_governor_test_single "ondemand" "cpufreq_ondemand.ko" 2
+#
+# Switches the policy to the governor and then tries to remove the
+# governor's module. The removal is expected to fail while the governor is
+# in use; if it succeeds a warning is printed and the module is inserted
+# again. The previous governor is restored at the end.
+module_governor_test_single()
+{
+ printf "** Test: Running ${FUNCNAME[0]} for $3 **\n\n"
+
+ backup_governor $3
+
+ # switch to new governor
+ printf "Switch from $CUR_GOV to $1\n"
+ switch_show_governor $3 $1
+
+ # try removing module, it should fail as governor is used
+ printf "Removing $2 module\n\n"
+ rmmod $2
+ if [ $? = 0 ]; then
+ printf "WARN: rmmod $2 succeeded even if governor is used\n"
+ insmod $2
+ else
+ printf "Pass: unable to remove $2 while it is being used\n\n"
+ fi
+
+ # switch back to old governor
+ printf "Switchback to $CUR_GOV from $1\n"
+ restore_governor $3
+ printf "\n"
+}
+
+# Insert cpufreq governor module and perform basic tests
+# $1: cpufreq-governor module to insert
+#
+# After a plain insmod/rmmod cycle the module is inserted again,
+# module_governor_test_single is run through for_each_policy, and the
+# module is removed. Returns early if the module file cannot be listed or
+# if insmod/rmmod fails.
+module_governor_test()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ # check if module is present or not
+ ls $1 > /dev/null
+ if [ $? != 0 ]; then
+ printf "$1: not present in `pwd` folder\n"
+ return;
+ fi
+
+ # test basic module tests
+ test_basic_insmod_rmmod $1
+
+ # insert module
+ printf "Inserting $1 module\n\n"
+ insmod $1
+ if [ $? != 0 ]; then
+ printf "Insmod $1 failed\n"
+ return;
+ fi
+
+ # switch to new governor for each cpu
+ # (the governor name is derived from the module file name)
+ for_each_policy module_governor_test_single $(find_gov_name $1) $1
+
+ # remove module
+ printf "Removing $1 module\n\n"
+ rmmod $1
+ if [ $? != 0 ]; then
+ printf "rmmod $1 failed\n"
+ return;
+ fi
+ printf "\n"
+}
+
+# test modules: driver and governor
+# $1: driver module, $2: governor module
+#
+# Exercises both insertion orders: first the governor module tests with the
+# driver already inserted, then the driver module tests with the governor
+# already inserted. Returns early as soon as an insmod or rmmod fails.
+module_test()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n\n"
+
+ # check if modules are present or not
+ ls $1 $2 > /dev/null
+ if [ $? != 0 ]; then
+ printf "$1 or $2: is not present in `pwd` folder\n"
+ return;
+ fi
+
+ # TEST1: Insert gov after driver
+ # insert driver module
+ printf "Inserting $1 module\n\n"
+ insmod $1
+ if [ $? != 0 ]; then
+ printf "Insmod $1 failed\n"
+ return;
+ fi
+
+ # run governor tests
+ module_governor_test $2
+
+ # remove driver module
+ printf "Removing $1 module\n\n"
+ rmmod $1
+ if [ $? != 0 ]; then
+ printf "rmmod $1 failed\n"
+ return;
+ fi
+
+ # TEST2: Insert driver after governor
+ # insert governor module
+ printf "Inserting $2 module\n\n"
+ insmod $2
+ if [ $? != 0 ]; then
+ printf "Insmod $2 failed\n"
+ return;
+ fi
+
+ # run driver tests
+ module_driver_test $1
+
+ # remove governor module
+ printf "Removing $2 module\n\n"
+ rmmod $2
+ if [ $? != 0 ]; then
+ printf "rmmod $2 failed\n"
+ return;
+ fi
+}
diff --git a/tools/testing/selftests/cpufreq/special-tests.sh b/tools/testing/selftests/cpufreq/special-tests.sh
new file mode 100755
index 000000000..8d40505dc
--- /dev/null
+++ b/tools/testing/selftests/cpufreq/special-tests.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Special test cases reported by people
+
+# Testcase 1: Reported here: http://marc.info/?l=linux-pm&m=140618592709858&w=2
+
+# protect against multiple inclusion
+if [ $FILE_SPECIAL ]; then
+ return 0
+else
+ FILE_SPECIAL=DONE
+fi
+
+source cpu.sh
+source cpufreq.sh
+source governor.sh
+
+# Test 1
+# $1: policy
+#
+# Switches the policy to ondemand, reads every file in the ondemand
+# tunables directory and then switches to conservative. Returns early when
+# the ondemand directory cannot be found.
+__simple_lockdep()
+{
+ # switch to ondemand
+ __switch_governor $1 "ondemand"
+
+ # cat ondemand files
+ # find_gov_directory reports a missing directory as the word INVALID,
+ # not as an empty string, so both have to be rejected here
+ local ondir=$(find_gov_directory $1 "ondemand")
+ if [ -z "$ondir" ] || [ "$ondir" = "INVALID" ]; then
+ printf "${FUNCNAME[0]}: Ondemand directory not created, quit\n"
+ return
+ fi
+
+ cat $ondir/*
+
+ # switch to conservative
+ __switch_governor $1 "conservative"
+}
+
+# Entry point of Test 1: hands __simple_lockdep to for_each_policy.
+simple_lockdep()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n"
+
+ for_each_policy __simple_lockdep
+}
+
+# Test 2
+# $1: policy
+# Runs __simple_lockdep 101 times (seq 0 100) for the given policy.
+__concurrent_lockdep()
+{
+ for i in `seq 0 100`; do
+ __simple_lockdep $1
+ done
+}
+
+# Entry point of Test 2: hands __concurrent_lockdep to
+# for_each_policy_concurrent (defined outside this file; presumably runs
+# the policies in parallel - confirm in cpufreq.sh).
+concurrent_lockdep()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n"
+
+ for_each_policy_concurrent __concurrent_lockdep
+}
+
+# Test 3
+# Starts 1000 pairs of background writers; each pair writes "ondemand" and
+# "userspace" to the scaling_governor file of every policy directory. The
+# writers are not waited for.
+quick_shuffle()
+{
+ # this is called concurrently from governor_race
+ for I in `seq 1000`
+ do
+ echo ondemand | sudo tee $CPUFREQROOT/policy*/scaling_governor &
+ echo userspace | sudo tee $CPUFREQROOT/policy*/scaling_governor &
+ done
+}
+
+# Entry point of Test 3: launches 8 quick_shuffle instances in the
+# background and returns without waiting for them.
+governor_race()
+{
+ printf "** Test: Running ${FUNCNAME[0]} **\n"
+
+ # run 8 concurrent instances
+ for I in `seq 8`
+ do
+ quick_shuffle &
+ done
+}
+
+# Test 4
+# $1: cpu
+#
+# Races CPU hotplug against cpufreq updates: a background loop calls
+# reboot_cpu 5000 times while the foreground loop writes every available
+# frequency to scaling_min_freq, also 5000 times. The original
+# scaling_min_freq value is written back at the end; the background loop is
+# not waited for.
+hotplug_with_updates_cpu()
+{
+ local filepath="$CPUROOT/$1/cpufreq"
+
+ # switch to ondemand
+ __switch_governor_for_cpu $1 "ondemand"
+
+ # hotplug loop, runs in the background
+ for i in `seq 1 5000`
+ do
+ reboot_cpu $1
+ done &
+
+ local freqs=$(cat $filepath/scaling_available_frequencies)
+ local oldfreq=$(cat $filepath/scaling_min_freq)
+
+ for j in `seq 1 5000`
+ do
+ # Set all frequencies one-by-one
+ for freq in $freqs; do
+ echo $freq > $filepath/scaling_min_freq
+ done
+ done
+
+ # restore old freq
+ echo $oldfreq > $filepath/scaling_min_freq
+}
+
+# Entry point of Test 4: hands hotplug_with_updates_cpu to
+# for_each_non_boot_cpu.
+hotplug_with_updates()
+{
+ for_each_non_boot_cpu hotplug_with_updates_cpu
+}
diff --git a/tools/testing/selftests/dmabuf-heaps/Makefile b/tools/testing/selftests/dmabuf-heaps/Makefile
new file mode 100644
index 000000000..604b43ece
--- /dev/null
+++ b/tools/testing/selftests/dmabuf-heaps/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -static -O3 -Wl,-no-as-needed -Wall
+
+TEST_GEN_PROGS = dmabuf-heap
+
+include ../lib.mk
diff --git a/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
new file mode 100644
index 000000000..909da9cdd
--- /dev/null
+++ b/tools/testing/selftests/dmabuf-heaps/dmabuf-heap.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#include <linux/dma-buf.h>
+#include <drm/drm.h>
+
+#include "../../../../include/uapi/linux/dma-heap.h"
+
+#define DEVPATH "/dev/dma_heap"
+
+/*
+ * Return non-zero when the DRM device behind @fd reports the driver name
+ * "vgem", 0 otherwise (including when the version ioctl fails).
+ */
+static int check_vgem(int fd)
+{
+ drm_version_t version = { 0 };
+ /*
+  * Zero-filled so the buffer is a valid C string no matter how many
+  * bytes the ioctl writes: only name_len (4) bytes are offered to the
+  * kernel and the fifth byte stays the terminator for strcmp().
+  */
+ char name[5] = { 0 };
+ int ret;
+
+ version.name_len = 4;
+ version.name = name;
+
+ ret = ioctl(fd, DRM_IOCTL_VERSION, &version);
+ if (ret)
+ return 0;
+
+ return !strcmp(name, "vgem");
+}
+
+/*
+ * Probe /dev/dri/card0 .. /dev/dri/card15 and return an open read-write
+ * descriptor for the first one that check_vgem() accepts, or -1 when no
+ * such device exists.
+ */
+static int open_vgem(void)
+{
+ const char *drmstr = "/dev/dri/card";
+ char name[80];
+ int i, fd;
+
+ for (i = 0; i < 16; i++) {
+ snprintf(name, 80, "%s%u", drmstr, i);
+
+ fd = open(name, O_RDWR);
+ if (fd < 0)
+ continue;
+
+ /* found it: hand the descriptor to the caller */
+ if (check_vgem(fd))
+ return fd;
+
+ /* some other DRM driver, keep looking */
+ close(fd);
+ }
+ return -1;
+}
+
+/*
+ * Import @dma_buf_fd into the DRM device @vgem_fd through
+ * DRM_IOCTL_PRIME_FD_TO_HANDLE.
+ * On success (ioctl returns 0) the resulting handle is stored in @handle;
+ * otherwise @handle is left untouched. Returns the ioctl's return value.
+ */
+static int import_vgem_fd(int vgem_fd, int dma_buf_fd, uint32_t *handle)
+{
+ struct drm_prime_handle import_handle = {
+ .fd = dma_buf_fd,
+ .flags = 0,
+ .handle = 0,
+ };
+ int ret;
+
+ ret = ioctl(vgem_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &import_handle);
+ if (ret == 0)
+ *handle = import_handle.handle;
+ return ret;
+}
+
+/*
+ * Release @handle on the DRM device @vgem_fd with DRM_IOCTL_GEM_CLOSE.
+ * The ioctl result is ignored.
+ */
+static void close_handle(int vgem_fd, uint32_t handle)
+{
+ struct drm_gem_close close = {
+ .handle = handle,
+ };
+
+ ioctl(vgem_fd, DRM_IOCTL_GEM_CLOSE, &close);
+}
+
+/*
+ * Open the heap device DEVPATH/@name read-write.
+ * Returns the file descriptor, or a negative value when the path cannot be
+ * built (formatting error or name too long for the buffer) or open() fails.
+ */
+static int dmabuf_heap_open(char *name)
+{
+ int ret, fd;
+ char buf[256];
+
+ ret = snprintf(buf, sizeof(buf), "%s/%s", DEVPATH, name);
+ if (ret < 0) {
+ printf("snprintf failed!\n");
+ return ret;
+ }
+ /* a return value >= the buffer size means the path was truncated */
+ if ((size_t)ret >= sizeof(buf)) {
+ printf("heap name %s is too long!\n", name);
+ return -1;
+ }
+
+ fd = open(buf, O_RDWR);
+ if (fd < 0)
+ printf("open %s failed!\n", buf);
+ return fd;
+}
+
+/*
+ * Allocate @len bytes from the heap open on @fd with DMA_HEAP_IOCTL_ALLOC,
+ * passing @fd_flags and @heap_flags through unchanged.
+ * Returns -EINVAL when @dmabuf_fd is NULL and the (negative) ioctl result
+ * on failure; on success the new dma-buf descriptor is stored in
+ * @dmabuf_fd and the ioctl result is returned.
+ */
+static int dmabuf_heap_alloc_fdflags(int fd, size_t len, unsigned int fd_flags,
+ unsigned int heap_flags, int *dmabuf_fd)
+{
+ struct dma_heap_allocation_data data = {
+ .len = len,
+ .fd = 0,
+ .fd_flags = fd_flags,
+ .heap_flags = heap_flags,
+ };
+ int ret;
+
+ if (!dmabuf_fd)
+ return -EINVAL;
+
+ ret = ioctl(fd, DMA_HEAP_IOCTL_ALLOC, &data);
+ if (ret < 0)
+ return ret;
+ *dmabuf_fd = (int)data.fd;
+ return ret;
+}
+
+/*
+ * Convenience wrapper around dmabuf_heap_alloc_fdflags() that always
+ * requests O_RDWR | O_CLOEXEC for the new descriptor; @flags are the heap
+ * flags.
+ */
+static int dmabuf_heap_alloc(int fd, size_t len, unsigned int flags,
+ int *dmabuf_fd)
+{
+ return dmabuf_heap_alloc_fdflags(fd, len, O_RDWR | O_CLOEXEC, flags,
+ dmabuf_fd);
+}
+
+/*
+ * Issue DMA_BUF_IOCTL_SYNC on the dma-buf @fd. @start_stop is
+ * DMA_BUF_SYNC_START or DMA_BUF_SYNC_END at the call sites in this file and
+ * is combined with DMA_BUF_SYNC_RW. A failure is only reported on stdout.
+ */
+static void dmabuf_sync(int fd, int start_stop)
+{
+ struct dma_buf_sync sync = {
+ .flags = start_stop | DMA_BUF_SYNC_RW,
+ };
+ int ret;
+
+ ret = ioctl(fd, DMA_BUF_IOCTL_SYNC, &sync);
+ if (ret)
+ printf("sync failed %d\n", errno);
+}
+
+#define ONE_MEG (1024 * 1024)
+
+/*
+ * Allocate a 1 MiB buffer from heap @heap_name, map it and write to it
+ * between sync start/end calls, import it into a vgem device and write to
+ * it once more.
+ * Returns 0 on success and a negative value on the first failing step;
+ * every resource acquired up to that point is released.
+ */
+static int test_alloc_and_import(char *heap_name)
+{
+ int heap_fd = -1, dmabuf_fd = -1, importer_fd = -1;
+ uint32_t handle = 0;
+ /* MAP_FAILED doubles as the "nothing mapped" marker for the cleanup */
+ void *p = MAP_FAILED;
+ int ret;
+
+ printf("Testing heap: %s\n", heap_name);
+
+ heap_fd = dmabuf_heap_open(heap_name);
+ if (heap_fd < 0)
+ return -1;
+
+ printf("Allocating 1 MEG\n");
+ ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0, &dmabuf_fd);
+ if (ret) {
+ printf("Allocation Failed!\n");
+ ret = -1;
+ goto out;
+ }
+ /* mmap and write a simple pattern */
+ p = mmap(NULL,
+ ONE_MEG,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ dmabuf_fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ ret = -1;
+ goto out;
+ }
+ printf("mmap passed\n");
+
+ dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START);
+ memset(p, 1, ONE_MEG / 2);
+ memset((char *)p + ONE_MEG / 2, 0, ONE_MEG / 2);
+ dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END);
+
+ importer_fd = open_vgem();
+ if (importer_fd < 0) {
+ ret = importer_fd;
+ printf("Failed to open vgem\n");
+ goto out;
+ }
+
+ ret = import_vgem_fd(importer_fd, dmabuf_fd, &handle);
+ if (ret < 0) {
+ printf("Failed to import buffer\n");
+ goto out;
+ }
+ printf("import passed\n");
+
+ dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_START);
+ memset(p, 0xff, ONE_MEG);
+ dmabuf_sync(dmabuf_fd, DMA_BUF_SYNC_END);
+ printf("syncs passed\n");
+
+ close_handle(importer_fd, handle);
+ ret = 0;
+
+out:
+ /* only unmap a mapping that was actually established */
+ if (p != MAP_FAILED)
+ munmap(p, ONE_MEG);
+ if (importer_fd >= 0)
+ close(importer_fd);
+ if (dmabuf_fd >= 0)
+ close(dmabuf_fd);
+ if (heap_fd >= 0)
+ close(heap_fd);
+
+ return ret;
+}
+
+/*
+ * Test the ioctl version compatibility w/ a smaller structure than expected.
+ *
+ * The allocation request is issued with a locally declared structure that
+ * lacks the heap_flags member; the ioctl number is built from
+ * DMA_HEAP_IOC_MAGIC, number 0x0 and that smaller size. @flags is accepted
+ * for signature symmetry but not used.
+ * Returns -EINVAL when @dmabuf_fd is NULL, the negative ioctl result on
+ * failure, and otherwise stores the new descriptor in @dmabuf_fd.
+ */
+static int dmabuf_heap_alloc_older(int fd, size_t len, unsigned int flags,
+ int *dmabuf_fd)
+{
+ int ret;
+ unsigned int older_alloc_ioctl;
+ struct dma_heap_allocation_data_smaller {
+ __u64 len;
+ __u32 fd;
+ __u32 fd_flags;
+ } data = {
+ .len = len,
+ .fd = 0,
+ .fd_flags = O_RDWR | O_CLOEXEC,
+ };
+
+ older_alloc_ioctl = _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,
+ struct dma_heap_allocation_data_smaller);
+ if (!dmabuf_fd)
+ return -EINVAL;
+
+ ret = ioctl(fd, older_alloc_ioctl, &data);
+ if (ret < 0)
+ return ret;
+ *dmabuf_fd = (int)data.fd;
+ return ret;
+}
+
+/*
+ * Test the ioctl version compatibility w/ a larger structure than expected.
+ *
+ * The allocation request is issued with a locally declared structure that
+ * carries three extra, non-zero trailing members; the ioctl number is built
+ * from DMA_HEAP_IOC_MAGIC, number 0x0 and that larger size.
+ * Returns -EINVAL when @dmabuf_fd is NULL, the negative ioctl result on
+ * failure, and otherwise stores the new descriptor in @dmabuf_fd.
+ */
+static int dmabuf_heap_alloc_newer(int fd, size_t len, unsigned int flags,
+ int *dmabuf_fd)
+{
+ int ret;
+ unsigned int newer_alloc_ioctl;
+ struct dma_heap_allocation_data_bigger {
+ __u64 len;
+ __u32 fd;
+ __u32 fd_flags;
+ __u64 heap_flags;
+ __u64 garbage1;
+ __u64 garbage2;
+ __u64 garbage3;
+ } data = {
+ .len = len,
+ .fd = 0,
+ .fd_flags = O_RDWR | O_CLOEXEC,
+ .heap_flags = flags,
+ .garbage1 = 0xffffffff,
+ .garbage2 = 0x88888888,
+ .garbage3 = 0x11111111,
+ };
+
+ newer_alloc_ioctl = _IOWR(DMA_HEAP_IOC_MAGIC, 0x0,
+ struct dma_heap_allocation_data_bigger);
+ if (!dmabuf_fd)
+ return -EINVAL;
+
+ ret = ioctl(fd, newer_alloc_ioctl, &data);
+ if (ret < 0)
+ return ret;
+
+ *dmabuf_fd = (int)data.fd;
+ return ret;
+}
+
+/*
+ * Check that heap @heap_name accepts allocation requests issued with a
+ * smaller and with a larger argument structure than the current one.
+ * Returns 0 when both allocations succeed, -1 otherwise.
+ */
+static int test_alloc_compat(char *heap_name)
+{
+ int heap_fd = -1, dmabuf_fd = -1;
+ int ret;
+
+ heap_fd = dmabuf_heap_open(heap_name);
+ if (heap_fd < 0)
+ return -1;
+
+ printf("Testing (theoretical)older alloc compat\n");
+ ret = dmabuf_heap_alloc_older(heap_fd, ONE_MEG, 0, &dmabuf_fd);
+ if (ret) {
+ printf("Older compat allocation failed!\n");
+ ret = -1;
+ goto out;
+ }
+ close(dmabuf_fd);
+ /*
+  * Forget the closed descriptor: if the next allocation fails, the
+  * cleanup below must not close the same number a second time.
+  */
+ dmabuf_fd = -1;
+
+ printf("Testing (theoretical)newer alloc compat\n");
+ ret = dmabuf_heap_alloc_newer(heap_fd, ONE_MEG, 0, &dmabuf_fd);
+ if (ret) {
+ printf("Newer compat allocation failed!\n");
+ ret = -1;
+ goto out;
+ }
+ printf("Ioctl compatibility tests passed\n");
+out:
+ if (dmabuf_fd >= 0)
+ close(dmabuf_fd);
+ if (heap_fd >= 0)
+ close(heap_fd);
+
+ return ret;
+}
+
+/*
+ * Check that three malformed allocation requests are rejected: one issued
+ * on descriptor 0 instead of the heap, one with heap flags 0x111111 and one
+ * with fd flags ~(O_RDWR | O_CLOEXEC).
+ * Returns 0 when all three fail as expected, -1 when the heap cannot be
+ * opened or any of them unexpectedly succeeds.
+ */
+static int test_alloc_errors(char *heap_name)
+{
+ int heap_fd = -1, dmabuf_fd = -1;
+ int ret;
+
+ heap_fd = dmabuf_heap_open(heap_name);
+ if (heap_fd < 0)
+ return -1;
+
+ printf("Testing expected error cases\n");
+ /* descriptor 0 is not a dma-heap device */
+ ret = dmabuf_heap_alloc(0, ONE_MEG, 0x111111, &dmabuf_fd);
+ if (!ret) {
+ printf("Did not see expected error (invalid fd)!\n");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dmabuf_heap_alloc(heap_fd, ONE_MEG, 0x111111, &dmabuf_fd);
+ if (!ret) {
+ printf("Did not see expected error (invalid heap flags)!\n");
+ ret = -1;
+ goto out;
+ }
+
+ ret = dmabuf_heap_alloc_fdflags(heap_fd, ONE_MEG,
+ ~(O_RDWR | O_CLOEXEC), 0, &dmabuf_fd);
+ if (!ret) {
+ printf("Did not see expected error (invalid fd flags)!\n");
+ ret = -1;
+ goto out;
+ }
+
+ printf("Expected error checking passed\n");
+ ret = 0;
+out:
+ /* only set if one of the requests unexpectedly succeeded */
+ if (dmabuf_fd >= 0)
+ close(dmabuf_fd);
+ if (heap_fd >= 0)
+ close(heap_fd);
+
+ return ret;
+}
+
+/*
+ * Run the three heap tests against every entry of DEVPATH, stopping at the
+ * first failure.
+ * Returns the result of the last test that ran; -1 when DEVPATH cannot be
+ * opened or contains no entries besides "." and "..".
+ */
+int main(void)
+{
+ DIR *d;
+ struct dirent *dir;
+ int ret = -1;
+
+ d = opendir(DEVPATH);
+ if (!d) {
+ printf("No %s directory?\n", DEVPATH);
+ return -1;
+ }
+
+ while ((dir = readdir(d)) != NULL) {
+ /* the length includes the terminator, so these are exact matches */
+ if (!strncmp(dir->d_name, ".", 2))
+ continue;
+ if (!strncmp(dir->d_name, "..", 3))
+ continue;
+
+ ret = test_alloc_and_import(dir->d_name);
+ if (ret)
+ break;
+
+ ret = test_alloc_compat(dir->d_name);
+ if (ret)
+ break;
+
+ ret = test_alloc_errors(dir->d_name);
+ if (ret)
+ break;
+ }
+ closedir(d);
+
+ return ret;
+}
diff --git a/tools/testing/selftests/drivers/.gitignore b/tools/testing/selftests/drivers/.gitignore
new file mode 100644
index 000000000..ca74f2e1c
--- /dev/null
+++ b/tools/testing/selftests/drivers/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/dma-buf/udmabuf
diff --git a/tools/testing/selftests/drivers/dma-buf/Makefile b/tools/testing/selftests/drivers/dma-buf/Makefile
new file mode 100644
index 000000000..79cb16b4e
--- /dev/null
+++ b/tools/testing/selftests/drivers/dma-buf/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -I../../../../../usr/include/
+
+TEST_GEN_PROGS := udmabuf
+
+top_srcdir ?=../../../../..
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/drivers/dma-buf/config b/tools/testing/selftests/drivers/dma-buf/config
new file mode 100644
index 000000000..d708515cf
--- /dev/null
+++ b/tools/testing/selftests/drivers/dma-buf/config
@@ -0,0 +1 @@
+CONFIG_UDMABUF=y
diff --git a/tools/testing/selftests/drivers/dma-buf/udmabuf.c b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
new file mode 100644
index 000000000..4de902ea1
--- /dev/null
+++ b/tools/testing/selftests/drivers/dma-buf/udmabuf.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/fcntl.h>
+#include <malloc.h>
+
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <linux/memfd.h>
+#include <linux/udmabuf.h>
+
+#define TEST_PREFIX "drivers/dma-buf/udmabuf"
+#define NUM_PAGES 4
+
+/* Local wrapper that invokes the memfd_create system call directly. */
+static int memfd_create(const char *name, unsigned int flags)
+{
+ return syscall(__NR_memfd_create, name, flags);
+}
+
+/*
+ * udmabuf selftest.
+ *
+ * Creates a sealed memfd of NUM_PAGES pages and issues four UDMABUF_CREATE
+ * requests: three that must be rejected (unaligned offset, size that is not
+ * a page multiple, descriptor that is not a memfd) and one that must work.
+ * Exits with 77 when /dev/udmabuf, memfd_create or sealing is unavailable,
+ * with 1 on a test failure, and returns 0 on success.
+ */
+int main(int argc, char *argv[])
+{
+ struct udmabuf_create create;
+ int devfd, memfd, buf, ret;
+ off_t size;
+ void *mem;
+
+ devfd = open("/dev/udmabuf", O_RDWR);
+ if (devfd < 0) {
+ printf("%s: [skip,no-udmabuf]\n", TEST_PREFIX);
+ exit(77);
+ }
+
+ memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
+ if (memfd < 0) {
+ printf("%s: [skip,no-memfd]\n", TEST_PREFIX);
+ exit(77);
+ }
+
+ /* the seal is added before the file is sized below */
+ ret = fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);
+ if (ret < 0) {
+ printf("%s: [skip,fcntl-add-seals]\n", TEST_PREFIX);
+ exit(77);
+ }
+
+
+ size = getpagesize() * NUM_PAGES;
+ ret = ftruncate(memfd, size);
+ if (ret == -1) {
+ printf("%s: [FAIL,memfd-truncate]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ memset(&create, 0, sizeof(create));
+
+ /* should fail (offset not page aligned) */
+ create.memfd = memfd;
+ create.offset = getpagesize()/2;
+ create.size = getpagesize();
+ buf = ioctl(devfd, UDMABUF_CREATE, &create);
+ if (buf >= 0) {
+ printf("%s: [FAIL,test-1]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ /* should fail (size not multiple of page) */
+ create.memfd = memfd;
+ create.offset = 0;
+ create.size = getpagesize()/2;
+ buf = ioctl(devfd, UDMABUF_CREATE, &create);
+ if (buf >= 0) {
+ printf("%s: [FAIL,test-2]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ /* should fail (not memfd) */
+ create.memfd = 0; /* stdin */
+ create.offset = 0;
+ create.size = size;
+ buf = ioctl(devfd, UDMABUF_CREATE, &create);
+ if (buf >= 0) {
+ printf("%s: [FAIL,test-3]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ /* should work */
+ create.memfd = memfd;
+ create.offset = 0;
+ create.size = size;
+ buf = ioctl(devfd, UDMABUF_CREATE, &create);
+ if (buf < 0) {
+ printf("%s: [FAIL,test-4]\n", TEST_PREFIX);
+ exit(1);
+ }
+
+ /* the success line goes to stderr, unlike the skip/FAIL lines above */
+ fprintf(stderr, "%s: ok\n", TEST_PREFIX);
+ close(buf);
+ close(memfd);
+ close(devfd);
+ return 0;
+}
diff --git a/tools/testing/selftests/drivers/gpu/drm_mm.sh b/tools/testing/selftests/drivers/gpu/drm_mm.sh
new file mode 100755
index 000000000..b789dc825
--- /dev/null
+++ b/tools/testing/selftests/drivers/gpu/drm_mm.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs API tests for struct drm_mm (DRM range manager)
+
+# Dry run (-n) first: if the test module cannot be resolved the test is
+# skipped with exit code 77.
+if ! /sbin/modprobe -n -q test-drm_mm; then
+ echo "drivers/gpu/drm_mm: [skip]"
+ exit 77
+fi
+
+# The result of loading the module decides between ok and FAIL; on success
+# the module is removed again.
+if /sbin/modprobe -q test-drm_mm; then
+ /sbin/modprobe -q -r test-drm_mm
+ echo "drivers/gpu/drm_mm: ok"
+else
+ echo "drivers/gpu/drm_mm: [FAIL]"
+ exit 1
+fi
diff --git a/tools/testing/selftests/drivers/gpu/i915.sh b/tools/testing/selftests/drivers/gpu/i915.sh
new file mode 100755
index 000000000..d3895bc71
--- /dev/null
+++ b/tools/testing/selftests/drivers/gpu/i915.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs hardware independent tests for i915 (drivers/gpu/drm/i915)
+
+# i915 is removed first so that it can be loaded with the selftest
+# parameter; if the removal fails the test is skipped with exit code 77.
+if ! /sbin/modprobe -q -r i915; then
+ echo "drivers/gpu/i915: [SKIP]"
+ exit 77
+fi
+
+# The result of loading the module with mock_selftests=-1 decides between
+# ok and FAIL; on success the module is removed again.
+if /sbin/modprobe -q i915 mock_selftests=-1; then
+ /sbin/modprobe -q -r i915
+ echo "drivers/gpu/i915: ok"
+else
+ echo "drivers/gpu/i915: [FAIL]"
+ exit 1
+fi
diff --git a/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh
new file mode 100755
index 000000000..bdffe698e
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/blackhole_routes.sh
@@ -0,0 +1,201 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that blackhole routes are marked as offloaded and that packets hitting
+# them are dropped by the ASIC and not by the kernel.
+#
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | 2001:db8:1::1/64 |
+# | | |
+# | | default via 192.0.2.2 |
+# | | default via 2001:db8:1::2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | 2001:db8:1::2/64 |
+# | |
+# | 2001:db8:2::2/64 |
+# | 198.51.100.2/24 |
+# | + $rp2 |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | default via 2001:db8:2::2 |
+# | | |
+# | | 2001:db8:2::1/64 |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ blackhole_ipv4
+ blackhole_ipv6
+"
+NUM_NETIFS=4
+: ${TIMEOUT:=20000} # ms
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+h1_destroy()
+{
+ ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+ ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2
+}
+
+h2_destroy()
+{
+ ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
+}
+
+# Bring up both router ports, attach a clsact qdisc to $rp1 (the tc filters
+# of the blackhole tests are installed on its ingress) and assign the
+# router addresses shown in the topology diagram.
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ tc qdisc add dev $rp1 clsact
+
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
+ __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64
+}
+
+# Undo router_create in reverse order: remove the addresses, delete the
+# clsact qdisc and take both router ports down.
+router_destroy()
+{
+ __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64
+ __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64
+
+ tc qdisc del dev $rp1 clsact
+
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.1 ": h1->h2"
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::1 ": h1->h2"
+}
+
+# IPv4 blackhole test.
+# Installs a blackhole route covering H2's address plus a software-only
+# (skip_hw) tc filter on $rp1 ingress that matches the ICMP packets from H1
+# to H2. The test passes when the route is reported as offloaded, the ping
+# fails and the filter has counted 0 packets. Route and filter are removed
+# at the end.
+blackhole_ipv4()
+{
+ # Transmit packets from H1 to H2 and make sure they are dropped by the
+ # ASIC and not by the kernel
+ RET=0
+
+ ip -4 route add blackhole 198.51.100.0/30
+ tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \
+ skip_hw dst_ip 198.51.100.1 src_ip 192.0.2.1 ip_proto icmp \
+ action pass
+
+ busywait "$TIMEOUT" wait_for_offload ip -4 route show 198.51.100.0/30
+ check_err $? "route not marked as offloaded when should"
+
+ ping_do $h1 198.51.100.1
+ check_fail $? "ping passed when should not"
+
+ # a hit on the software filter would mean the packet reached the kernel
+ tc_check_packets "dev $rp1 ingress" 101 0
+ check_err $? "packets trapped and not dropped by ASIC"
+
+ log_test "IPv4 blackhole route"
+
+ tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower
+ ip -4 route del blackhole 198.51.100.0/30
+}
+
+# IPv6 blackhole test.
+# Installs a blackhole route covering H2's address plus a software-only
+# (skip_hw) tc filter on $rp1 ingress that matches the ICMPv6 packets from
+# H1 to H2. The test passes when the route is reported as offloaded, the
+# ping fails and the filter has counted 0 packets. Route and filter are
+# removed at the end.
+blackhole_ipv6()
+{
+ RET=0
+
+ ip -6 route add blackhole 2001:db8:2::/120
+ tc filter add dev $rp1 ingress protocol ipv6 pref 1 handle 101 flower \
+ skip_hw dst_ip 2001:db8:2::1 src_ip 2001:db8:1::1 \
+ ip_proto icmpv6 action pass
+
+ busywait "$TIMEOUT" wait_for_offload ip -6 route show 2001:db8:2::/120
+ check_err $? "route not marked as offloaded when should"
+
+ ping6_do $h1 2001:db8:2::1
+ check_fail $? "ping passed when should not"
+
+ # a hit on the software filter would mean the packet reached the kernel
+ tc_check_packets "dev $rp1 ingress" 101 0
+ check_err $? "packets trapped and not dropped by ASIC"
+
+ log_test "IPv6 blackhole route"
+
+ tc filter del dev $rp1 ingress protocol ipv6 pref 1 handle 101 flower
+ ip -6 route del blackhole 2001:db8:2::/120
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ router_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh
new file mode 100755
index 000000000..89b55e946
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test generic devlink-trap functionality over mlxsw. These tests are not
+# specific to a single trap, but do not check the devlink-trap common
+# infrastructure either.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ dev_del_test
+"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2
+}
+
+switch_create()
+{
+ ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0 # VLAN-aware bridge, multicast snooping off
+
+ ip link set dev $swp1 master br0 # enslave both switch ports to br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+}
+
+switch_destroy()
+{
+ ip link set dev $swp2 down # bring ports down in the reverse order of switch_create
+ ip link set dev $swp1 down
+
+ ip link del dev br0 # ports are not un-enslaved explicitly; only the bridge is deleted
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+dev_del_test()
+{
+ local trap_name="source_mac_is_multicast"
+ local smac=01:02:03:04:05:06 # first octet 01: the multicast bit of the source MAC is set
+ local num_iter=5
+ local mz_pid
+ local i
+
+ $MZ $h1 -c 0 -p 100 -a $smac -b bcast -t ip -q & # traffic generator is left running in the background
+ mz_pid=$!
+
+ # The purpose of this test is to make sure we correctly dismantle a
+ # port while packets are trapped from it. This is done by reloading
+ # the driver while the 'source_mac_is_multicast' trap is triggered.
+ RET=0
+
+ for i in $(seq 1 $num_iter); do
+ log_info "Iteration $i / $num_iter"
+
+ devlink_trap_action_set $trap_name "trap"
+ sleep 1
+
+ devlink_reload
+ # Allow netdevices to be re-created following the reload
+ sleep 20
+
+ cleanup # tear down and rebuild the whole topology after every reload
+ setup_prepare
+ setup_wait
+ done
+
+ log_test "Device delete"
+
+ kill $mz_pid && wait $mz_pid &> /dev/null # stop the background generator and reap it quietly
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh
new file mode 100755
index 000000000..b32ba5fec
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_acl_drops.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap ACL drops functionality over mlxsw.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ ingress_flow_action_drop_test
+ egress_flow_action_drop_test
+"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2
+}
+
+switch_create()
+{
+ ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0 # VLAN-aware bridge, multicast snooping off
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact # clsact qdiscs host the ingress/egress flower filters the tests install
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]} # host 1 port
+ swp1=${NETIFS[p2]} # switch port; presumably the peer of p1 - confirm against the testbed wiring
+
+ swp2=${NETIFS[p3]} # second switch port
+ h2=${NETIFS[p4]} # host 2 port
+
+ h1mac=$(mac_get $h1) # MACs are cached as globals; the tests use them in tc matches and mz arguments
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ingress_flow_action_drop_test()
+{
+ local mz_pid
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower src_mac $h1mac action pass # counting filter: matches h1's packets leaving $swp2
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 \
+ flower dst_ip 192.0.2.2 action drop # the ingress ACL drop under test
+
+ $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -d 1msec -q &
+ mz_pid=$!
+
+ RET=0
+
+ devlink_trap_drop_test ingress_flow_action_drop $swp2 101 # helper gets the trap name and the $swp2 filter handle
+
+ log_test "ingress_flow_action_drop"
+
+ tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower # the drop rule is removed here, by hand
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 # helper is handed the mz PID and the $swp2 filter's proto/pref/handle
+}
+
+egress_flow_action_drop_test()
+{
+ local mz_pid
+
+ tc filter add dev $swp2 egress protocol ip pref 2 handle 102 \
+ flower src_mac $h1mac action pass # counting filter at pref 2, i.e. after the drop rule at pref 1
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_ip 192.0.2.2 action drop # the egress ACL drop under test, on the same hook
+
+ $MZ $h1 -c 0 -p 100 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -d 1msec -q &
+ mz_pid=$!
+
+ RET=0
+
+ devlink_trap_drop_test egress_flow_action_drop $swp2 102 # note handle 102: the counting filter, not the drop rule
+
+ log_test "egress_flow_action_drop"
+
+ tc filter del dev $swp2 egress protocol ip pref 1 handle 101 flower # the drop rule is removed here, by hand
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 2 102 # helper is handed the mz PID and the counting filter's proto/pref/handle
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh
new file mode 100755
index 000000000..a37273473
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_control.sh
@@ -0,0 +1,688 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap control trap functionality over mlxsw. Each registered
+# control packet trap is tested to make sure it is triggered under the right
+# conditions.
+#
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | 2001:db8:1::1/64 |
+# | | |
+# | | default via 192.0.2.2 |
+# | | default via 2001:db8:1::2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | 2001:db8:1::2/64 |
+# | |
+# | 2001:db8:2::2/64 |
+# | 198.51.100.2/24 |
+# | + $rp2 |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | default via 2001:db8:2::2 |
+# | | |
+# | | 2001:db8:2::1/64 |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ stp_test
+ lacp_test
+ lldp_test
+ igmp_query_test
+ igmp_v1_report_test
+ igmp_v2_report_test
+ igmp_v3_report_test
+ igmp_v2_leave_test
+ mld_query_test
+ mld_v1_report_test
+ mld_v2_report_test
+ mld_v1_done_test
+ ipv4_dhcp_test
+ ipv6_dhcp_test
+ arp_request_test
+ arp_response_test
+ ipv6_neigh_solicit_test
+ ipv6_neigh_advert_test
+ ipv4_bfd_test
+ ipv6_bfd_test
+ ipv4_ospf_test
+ ipv6_ospf_test
+ ipv4_bgp_test
+ ipv6_bgp_test
+ ipv4_vrrp_test
+ ipv6_vrrp_test
+ ipv4_pim_test
+ ipv6_pim_test
+ uc_loopback_test
+ local_route_test
+ external_route_test
+ ipv6_uc_dip_link_local_scope_test
+ ipv4_router_alert_test
+ ipv6_router_alert_test
+ ipv6_dip_all_nodes_test
+ ipv6_dip_all_routers_test
+ ipv6_router_solicit_test
+ ipv6_router_advert_test
+ ipv6_redirect_test
+ ptp_event_test
+ ptp_general_test
+ flow_action_sample_test
+ flow_action_trap_test
+"
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+h1_destroy()
+{
+ ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+ ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2
+}
+
+h2_destroy()
+{
+ ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
+}
+
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64 # the gateway addresses h1's default routes point at
+ __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64 # the gateway addresses h2's default routes point at
+}
+
+router_destroy()
+{
+ __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64
+ __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64
+
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ router_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+stp_test()
+{
+ devlink_trap_stats_test "STP" "stp" $MZ $h1 -c 1 -t bpdu -q
+}
+
+lacp_payload_get()
+{
+ local source_mac=$1; shift
+ local p
+
+ p=$(:
+ )"01:80:C2:00:00:02:"$( : ETH daddr
+ )"$source_mac:"$( : ETH saddr
+ )"88:09:"$( : ETH type
+ )
+ echo $p
+}
+
+lacp_test()
+{
+ local h1mac=$(mac_get $h1)
+
+ devlink_trap_stats_test "LACP" "lacp" $MZ $h1 -c 1 \
+ $(lacp_payload_get $h1mac) -p 100 -q
+}
+
+lldp_payload_get()
+{
+ local source_mac=$1; shift
+ local p
+
+ p=$(:
+ )"01:80:C2:00:00:0E:"$( : ETH daddr
+ )"$source_mac:"$( : ETH saddr
+ )"88:CC:"$( : ETH type
+ )
+ echo $p
+}
+
+lldp_test()
+{
+ local h1mac=$(mac_get $h1)
+
+ devlink_trap_stats_test "LLDP" "lldp" $MZ $h1 -c 1 \
+ $(lldp_payload_get $h1mac) -p 100 -q
+}
+
+igmp_query_test()
+{
+ # IGMP (IP Protocol 2) Membership Query (Type 0x11)
+ devlink_trap_stats_test "IGMP Membership Query" "igmp_query" \
+ $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \
+ -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=11 -p 100 -q
+}
+
+igmp_v1_report_test()
+{
+ # IGMP (IP Protocol 2) Version 1 Membership Report (Type 0x12), sent to multicast 224.0.0.1
+ devlink_trap_stats_test "IGMP Version 1 Membership Report" \
+ "igmp_v1_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \
+ -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=12 -p 100 -q
+}
+
+igmp_v2_report_test()
+{
+ # IGMP (IP Protocol 2) Version 2 Membership Report (Type 0x16), sent to multicast 224.0.0.1
+ devlink_trap_stats_test "IGMP Version 2 Membership Report" \
+ "igmp_v2_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \
+ -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=16 -p 100 -q
+}
+
+igmp_v3_report_test()
+{
+ # IGMP (IP Protocol 2) Version 3 Membership Report (Type 0x22), sent to multicast 224.0.0.1
+ devlink_trap_stats_test "IGMP Version 3 Membership Report" \
+ "igmp_v3_report" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:01 \
+ -A 192.0.2.1 -B 224.0.0.1 -t ip proto=2,p=22 -p 100 -q
+}
+
+igmp_v2_leave_test()
+{
+ # IGMP (IP Protocol 2) Version 2 Leave Group (Type 0x17)
+ devlink_trap_stats_test "IGMP Version 2 Leave Group" \
+ "igmp_v2_leave" $MZ $h1 -c 1 -a own -b 01:00:5E:00:00:02 \
+ -A 192.0.2.1 -B 224.0.0.2 -t ip proto=2,p=17 -p 100 -q
+}
+
+mld_payload_get()
+{
+ local type=$1; shift # ICMPv6 type; the callers pass it in decimal (130, 131, 132, 143)
+ local p
+
+ type=$(printf "%x" $type) # re-encode as hex to match the other payload bytes
+ p=$(:
+ )"3A:"$( : Next Header - ICMPv6
+ )"00:"$( : Hdr Ext Len
+ )"00:00:00:00:00:00:"$( : Options and Padding
+ )"$type:"$( : ICMPv6.type
+ )"00:"$( : ICMPv6.code
+ )"00:"$( : ICMPv6.checksum
+ )
+ echo $p # colon-separated hex bytes with a trailing colon; callers splice it into mz's payload= argument
+}
+
+mld_query_test()
+{
+ # MLD Multicast Listener Query (Type 130)
+ devlink_trap_stats_test "MLD Multicast Listener Query" "mld_query" \
+ $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1 \
+ -t ip hop=1,next=0,payload=$(mld_payload_get 130) -p 100 -q
+}
+
+mld_v1_report_test()
+{
+ # MLD Version 1 Multicast Listener Report (Type 131)
+ devlink_trap_stats_test "MLD Version 1 Multicast Listener Report" \
+ "mld_v1_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \
+ -t ip hop=1,next=0,payload=$(mld_payload_get 131) -p 100 -q
+}
+
+mld_v2_report_test()
+{
+ # MLD Version 2 Multicast Listener Report (Type 143)
+ devlink_trap_stats_test "MLD Version 2 Multicast Listener Report" \
+ "mld_v2_report" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \
+ -t ip hop=1,next=0,payload=$(mld_payload_get 143) -p 100 -q
+}
+
+mld_v1_done_test()
+{
+ # MLD Version 1 Multicast Listener Done (Type 132)
+ devlink_trap_stats_test "MLD Version 1 Multicast Listener Done" \
+ "mld_v1_done" $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::16 \
+ -t ip hop=1,next=0,payload=$(mld_payload_get 132) -p 100 -q
+}
+
+ipv4_dhcp_test()
+{
+ devlink_trap_stats_test "IPv4 DHCP Port 67" "ipv4_dhcp" \
+ $MZ $h1 -c 1 -a own -b bcast -A 0.0.0.0 -B 255.255.255.255 \
+ -t udp sp=68,dp=67 -p 100 -q
+
+ devlink_trap_stats_test "IPv4 DHCP Port 68" "ipv4_dhcp" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -A 192.0.2.1 \
+ -B 255.255.255.255 -t udp sp=67,dp=68 -p 100 -q
+}
+
+ipv6_dhcp_test()
+{
+ devlink_trap_stats_test "IPv6 DHCP Port 547" "ipv6_dhcp" \
+ $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=546,dp=547 \
+ -p 100 -q
+
+ devlink_trap_stats_test "IPv6 DHCP Port 546" "ipv6_dhcp" \
+ $MZ $h1 -6 -c 1 -A fe80::1 -B ff02::1:2 -t udp sp=547,dp=546 \
+ -p 100 -q
+}
+
+arp_request_test()
+{
+ devlink_trap_stats_test "ARP Request" "arp_request" \
+ $MZ $h1 -c 1 -a own -b bcast -t arp request -p 100 -q
+}
+
+arp_response_test()
+{
+ devlink_trap_stats_test "ARP Response" "arp_response" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) -t arp reply -p 100 -q
+}
+
+icmpv6_header_get()
+{
+ local type=$1; shift # ICMPv6 type; the callers pass it in decimal (133-137)
+ local p
+
+ type=$(printf "%x" $type) # re-encode as hex to match the other payload bytes
+ p=$(:
+ )"$type:"$( : ICMPv6.type
+ )"00:"$( : ICMPv6.code
+ )"00:"$( : ICMPv6.checksum
+ )
+ echo $p # colon-separated hex bytes with a trailing colon; callers splice it into mz's payload= argument
+}
+
+ipv6_neigh_solicit_test()
+{
+ devlink_trap_stats_test "IPv6 Neighbour Solicitation" \
+ "ipv6_neigh_solicit" $MZ $h1 -6 -c 1 \
+ -A fe80::1 -B ff02::1:ff00:02 \
+ -t ip hop=1,next=58,payload=$(icmpv6_header_get 135) -p 100 -q
+}
+
+ipv6_neigh_advert_test()
+{
+ devlink_trap_stats_test "IPv6 Neighbour Advertisement" \
+ "ipv6_neigh_advert" $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A fe80::1 -B 2001:db8:1::2 \
+ -t ip hop=1,next=58,payload=$(icmpv6_header_get 136) -p 100 -q
+}
+
+ipv4_bfd_test()
+{
+ devlink_trap_stats_test "IPv4 BFD Control - Port 3784" "ipv4_bfd" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3784 -p 100 -q
+
+ devlink_trap_stats_test "IPv4 BFD Echo - Port 3785" "ipv4_bfd" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.2 -t udp sp=49153,dp=3785 -p 100 -q
+}
+
+ipv6_bfd_test()
+{
+ devlink_trap_stats_test "IPv6 BFD Control - Port 3784" "ipv6_bfd" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::2 \
+ -t udp sp=49153,dp=3784 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 BFD Echo - Port 3785" "ipv6_bfd" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::2 \
+ -t udp sp=49153,dp=3785 -p 100 -q
+}
+
+ipv4_ospf_test()
+{
+ devlink_trap_stats_test "IPv4 OSPF - Multicast" "ipv4_ospf" \
+ $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:05 \
+ -A 192.0.2.1 -B 224.0.0.5 -t ip proto=89 -p 100 -q
+
+ devlink_trap_stats_test "IPv4 OSPF - Unicast" "ipv4_ospf" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip proto=89 -p 100 -q
+}
+
+ipv6_ospf_test()
+{
+ devlink_trap_stats_test "IPv6 OSPF - Multicast" "ipv6_ospf" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:05 \
+ -A fe80::1 -B ff02::5 -t ip next=89 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 OSPF - Unicast" "ipv6_ospf" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::2 -t ip next=89 -p 100 -q
+}
+
+ipv4_bgp_test()
+{
+ devlink_trap_stats_test "IPv4 BGP" "ipv4_bgp" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.2 -t tcp sp=54321,dp=179,flags=rst \
+ -p 100 -q
+}
+
+ipv6_bgp_test()
+{
+ devlink_trap_stats_test "IPv6 BGP" "ipv6_bgp" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::2 \
+ -t tcp sp=54321,dp=179,flags=rst -p 100 -q
+}
+
+ipv4_vrrp_test()
+{
+ devlink_trap_stats_test "IPv4 VRRP" "ipv4_vrrp" \
+ $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:12 \
+ -A 192.0.2.1 -B 224.0.0.18 -t ip proto=112 -p 100 -q
+}
+
+ipv6_vrrp_test()
+{
+ devlink_trap_stats_test "IPv6 VRRP" "ipv6_vrrp" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:12 \
+ -A fe80::1 -B ff02::12 -t ip next=112 -p 100 -q
+}
+
+ipv4_pim_test()
+{
+ devlink_trap_stats_test "IPv4 PIM - Multicast" "ipv4_pim" \
+ $MZ $h1 -c 1 -a own -b 01:00:5e:00:00:0d \
+ -A 192.0.2.1 -B 224.0.0.13 -t ip proto=103 -p 100 -q
+
+ devlink_trap_stats_test "IPv4 PIM - Unicast" "ipv4_pim" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip proto=103 -p 100 -q
+}
+
+ipv6_pim_test()
+{
+ devlink_trap_stats_test "IPv6 PIM - Multicast" "ipv6_pim" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:0d \
+ -A fe80::1 -B ff02::d -t ip next=103 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 PIM - Unicast" "ipv6_pim" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A fe80::1 -B 2001:db8:1::2 -t ip next=103 -p 100 -q
+}
+
+uc_loopback_test()
+{
+ # Add neighbours to the fake destination IPs, so that the packets are
+ # routed in the device and not trapped due to an unresolved neighbour
+ # exception.
+ ip -4 neigh add 192.0.2.3 lladdr 00:11:22:33:44:55 nud permanent \
+ dev $rp1
+ ip -6 neigh add 2001:db8:1::3 lladdr 00:11:22:33:44:55 nud permanent \
+ dev $rp1
+
+ devlink_trap_stats_test "IPv4 Unicast Loopback" "uc_loopback" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 192.0.2.3 -t udp sp=54321,dp=12345 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 Unicast Loopback" "uc_loopback" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::3 -t udp sp=54321,dp=12345 \
+ -p 100 -q
+
+ ip -6 neigh del 2001:db8:1::3 dev $rp1
+ ip -4 neigh del 192.0.2.3 dev $rp1
+}
+
+local_route_test()
+{
+ # Use a fake source IP to prevent the trap from being triggered twice
+ # when the router sends back a port unreachable message.
+ devlink_trap_stats_test "IPv4 Local Route" "local_route" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.3 -B 192.0.2.2 -t udp sp=54321,dp=12345 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 Local Route" "local_route" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::3 -B 2001:db8:1::2 -t udp sp=54321,dp=12345 \
+ -p 100 -q # same UDP source/destination ports as the IPv4 case
+}
+
+external_route_test()
+{
+ # Add a dummy device through which the incoming packets should be
+ # routed.
+ ip link add name dummy10 up type dummy
+ ip address add 203.0.113.1/24 dev dummy10
+ ip -6 address add 2001:db8:10::1/64 dev dummy10
+
+ devlink_trap_stats_test "IPv4 External Route" "external_route" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 203.0.113.2 -t udp sp=54321,dp=12345 -p 100 -q
+
+ devlink_trap_stats_test "IPv6 External Route" "external_route" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:10::2 -t udp sp=54321,dp=12345 \
+ -p 100 -q # same UDP source/destination ports as the IPv4 case
+
+ ip -6 address del 2001:db8:10::1/64 dev dummy10 # undo the dummy10 setup in reverse order
+ ip address del 203.0.113.1/24 dev dummy10
+ ip link del dev dummy10
+}
+
+ipv6_uc_dip_link_local_scope_test()
+{
+ # Add a dummy link-local prefix route to allow the packet to be routed.
+ ip -6 route add fe80:1::/64 dev $rp2
+
+ devlink_trap_stats_test \
+ "IPv6 Unicast Destination IP With Link-Local Scope" \
+ "ipv6_uc_dip_link_local_scope" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A fe80::1 -B fe80:1::2 -t udp sp=54321,dp=12345 \
+ -p 100 -q # destination fe80:1::2 falls inside the dummy route added above
+
+ ip -6 route del fe80:1::/64 dev $rp2
+}
+
+ipv4_router_alert_get()
+{
+ local p
+
+ # https://en.wikipedia.org/wiki/IPv4#Options
+ p=$(:
+ )"94:"$( : Option Number
+ )"04:"$( : Option Length
+ )"00:00:"$( : Option Data
+ )
+ echo $p
+}
+
+ipv4_router_alert_test()
+{
+ devlink_trap_stats_test "IPv4 Router Alert" "ipv4_router_alert" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 198.51.100.3 \
+ -t ip option=$(ipv4_router_alert_get) -p 100 -q
+}
+
+ipv6_router_alert_get()
+{
+ local p
+
+ # https://en.wikipedia.org/wiki/IPv6_packet#Hop-by-hop_options_and_destination_options
+ # https://tools.ietf.org/html/rfc2711#section-2.1
+ p=$(:
+ )"11:"$( : Next Header - UDP
+ )"00:"$( : Hdr Ext Len
+ )"05:02:00:00:00:00:"$( : Option Data
+ )
+ echo $p
+}
+
+ipv6_router_alert_test()
+{
+ devlink_trap_stats_test "IPv6 Router Alert" "ipv6_router_alert" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A 2001:db8:1::1 -B 2001:db8:1::3 \
+ -t ip next=0,payload=$(ipv6_router_alert_get) -p 100 -q
+}
+
+ipv6_dip_all_nodes_test()
+{
+ devlink_trap_stats_test "IPv6 Destination IP \"All Nodes Address\"" \
+ "ipv6_dip_all_nodes" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \
+ -A 2001:db8:1::1 -B ff02::1 -t udp sp=12345,dp=54321 -p 100 -q
+}
+
+ipv6_dip_all_routers_test()
+{
+ devlink_trap_stats_test "IPv6 Destination IP \"All Routers Address\"" \
+ "ipv6_dip_all_routers" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \
+ -A 2001:db8:1::1 -B ff02::2 -t udp sp=12345,dp=54321 -p 100 -q
+}
+
+ipv6_router_solicit_test()
+{
+ devlink_trap_stats_test "IPv6 Router Solicitation" \
+ "ipv6_router_solicit" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:02 \
+ -A fe80::1 -B ff02::2 \
+ -t ip hop=1,next=58,payload=$(icmpv6_header_get 133) -p 100 -q
+}
+
+ipv6_router_advert_test()
+{
+ devlink_trap_stats_test "IPv6 Router Advertisement" \
+ "ipv6_router_advert" \
+ $MZ $h1 -6 -c 1 -a own -b 33:33:00:00:00:01 \
+ -A fe80::1 -B ff02::1 \
+ -t ip hop=1,next=58,payload=$(icmpv6_header_get 134) -p 100 -q
+}
+
+ipv6_redirect_test()
+{
+ devlink_trap_stats_test "IPv6 Redirect Message" \
+ "ipv6_redirect" \
+ $MZ $h1 -6 -c 1 -a own -b $(mac_get $rp1) \
+ -A fe80::1 -B 2001:db8:1::2 \
+ -t ip hop=1,next=58,payload=$(icmpv6_header_get 137) -p 100 -q
+}
+
+ptp_event_test()
+{
+ # PTP is only supported on Spectrum-1, for now.
+ [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return # any other device ID: leave without running or logging the test
+
+ # PTP Sync (0)
+ devlink_trap_stats_test "PTP Time-Critical Event Message" "ptp_event" \
+ $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \
+ -A 192.0.2.1 -B 224.0.1.129 \
+ -t udp sp=12345,dp=319,payload=10 -p 100 -q
+}
+
+ptp_general_test()
+{
+ # PTP is only supported on Spectrum-1, for now.
+ [[ "$DEVLINK_VIDDID" != "15b3:cb84" ]] && return # any other device ID: leave without running or logging the test
+
+ # PTP Announce (b)
+ devlink_trap_stats_test "PTP General Message" "ptp_general" \
+ $MZ $h1 -c 1 -a own -b 01:00:5e:00:01:81 \
+ -A 192.0.2.1 -B 224.0.1.129 \
+ -t udp sp=12345,dp=320,payload=1b -p 100 -q
+}
+
+flow_action_sample_test()
+{
+ # Install a filter that samples every incoming packet.
+ tc qdisc add dev $rp1 clsact
+ tc filter add dev $rp1 ingress proto all pref 1 handle 101 matchall \
+ skip_sw action sample rate 1 group 1 # skip_sw: the filter is installed in hardware only
+
+ devlink_trap_stats_test "Flow Sampling" "flow_action_sample" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q
+
+ tc filter del dev $rp1 ingress proto all pref 1 handle 101 matchall # remove filter, then qdisc: reverse of setup
+ tc qdisc del dev $rp1 clsact
+}
+
+flow_action_trap_test()
+{
+ # Install a filter that traps a specific flow.
+ tc qdisc add dev $rp1 clsact
+ tc filter add dev $rp1 ingress proto ip pref 1 handle 101 flower \
+ skip_sw ip_proto udp src_port 12345 dst_port 54321 action trap # ports match the mz packet sent below
+
+ devlink_trap_stats_test "Flow Trapping (Logging)" "flow_action_trap" \
+ $MZ $h1 -c 1 -a own -b $(mac_get $rp1) \
+ -A 192.0.2.1 -B 198.51.100.1 -t udp sp=12345,dp=54321 -p 100 -q
+
+ tc filter del dev $rp1 ingress proto ip pref 1 handle 101 flower # remove filter, then qdisc: reverse of setup
+ tc qdisc del dev $rp1 clsact
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh
new file mode 100755
index 000000000..a4c2812e9
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l2_drops.sh
@@ -0,0 +1,430 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap L2 drops functionality over mlxsw. Each registered L2 drop
+# packet trap is tested to make sure it is triggered under the right
+# conditions.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ source_mac_is_multicast_test
+ vlan_tag_mismatch_test
+ ingress_vlan_filter_test
+ ingress_stp_filter_test
+ port_list_is_empty_test
+ port_loopback_filter_test
+"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2
+}
+
+switch_create()
+{
+ ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+source_mac_is_multicast_test()
+{
+ local trap_name="source_mac_is_multicast"
+ local smac=01:02:03:04:05:06 # first octet 01: the multicast bit of the source MAC is set
+ local mz_pid
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower src_mac $smac action drop # egress filter on $swp2; its handle is passed to the drop-test helper
+
+ $MZ $h1 -c 0 -p 100 -a $smac -b bcast -t ip -d 1msec -q & # background traffic using the multicast source MAC
+ mz_pid=$!
+
+ RET=0
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ log_test "Source MAC is multicast"
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101 # helper is handed the mz PID and the filter's proto/pref/handle
+}
+
+__vlan_tag_mismatch_test()
+{
+ local trap_name="vlan_tag_mismatch"
+ local dmac=de:ad:be:ef:13:37
+ local opt=$1; shift # extra mz option: empty for the untagged caller, "-Q 0" for the prio-tagged one
+ local mz_pid
+
+ # Remove PVID flag. This should prevent untagged and prio-tagged
+ # packets from entering the bridge.
+ bridge vlan add vid 1 dev $swp1 untagged master
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 "$opt" -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & # NOTE(review): quoted, so empty opt becomes an empty argument and "-Q 0" one word - confirm $MZ accepts both
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Add PVID and make sure packets are no longer dropped.
+ bridge vlan add vid 1 dev $swp1 pvid untagged master
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+}
+
+vlan_tag_mismatch_untagged_test()
+{
+ RET=0
+
+ __vlan_tag_mismatch_test
+
+ log_test "VLAN tag mismatch - untagged packets"
+}
+
+vlan_tag_mismatch_vid_0_test()
+{
+ RET=0
+
+ __vlan_tag_mismatch_test "-Q 0"
+
+ log_test "VLAN tag mismatch - prio-tagged packets"
+}
+
+vlan_tag_mismatch_test()
+{
+ vlan_tag_mismatch_untagged_test
+ vlan_tag_mismatch_vid_0_test
+}
+
+ingress_vlan_filter_test()
+{
+ local trap_name="ingress_vlan_filter"
+ local dmac=de:ad:be:ef:13:37
+ local mz_pid
+ local vid=10
+
+ bridge vlan add vid $vid dev $swp2 master # VLAN exists on the egress port only; $swp1 gets it later
+
+ RET=0
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & # background traffic tagged with $vid
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Add the VLAN on the bridge port and make sure packets are no longer
+ # dropped.
+ bridge vlan add vid $vid dev $swp1 master
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ log_test "Ingress VLAN filter"
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+
+ bridge vlan del vid $vid dev $swp1 master # remove the test VLAN from both ports
+ bridge vlan del vid $vid dev $swp2 master
+}
+
+__ingress_stp_filter_test()
+{
+ local trap_name="ingress_spanning_tree_filter"
+ local dmac=de:ad:be:ef:13:37
+ local state=$1; shift # numeric bridge_slave state to test; the callers pass 1 and 2
+ local mz_pid
+ local vid=20
+
+ bridge vlan add vid $vid dev $swp2 master
+ bridge vlan add vid $vid dev $swp1 master
+ ip link set dev $swp1 type bridge_slave state $state # put the ingress port into the state under test
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 -Q $vid -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & # background traffic tagged with $vid
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Change STP state to forwarding and make sure packets are no longer
+ # dropped.
+ ip link set dev $swp1 type bridge_slave state 3 # the port is left in state 3 when the function returns
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+
+ bridge vlan del vid $vid dev $swp1 master # remove the test VLAN from both ports
+ bridge vlan del vid $vid dev $swp2 master
+}
+
+ingress_stp_filter_listening_test()
+{
+ local state=$1; shift
+
+ RET=0
+
+ __ingress_stp_filter_test $state
+
+ log_test "Ingress STP filter - listening state"
+}
+
+ingress_stp_filter_learning_test()
+{
+ local state=$1; shift
+
+ RET=0
+
+ __ingress_stp_filter_test $state
+
+ log_test "Ingress STP filter - learning state"
+}
+
+ingress_stp_filter_test()
+{
+ ingress_stp_filter_listening_test 1 # bridge_slave state 1; the callee logs it as "listening state"
+ ingress_stp_filter_learning_test 2 # bridge_slave state 2; the callee logs it as "learning state"
+}
+
+port_list_is_empty_uc_test()
+{
+ local trap_name="port_list_is_empty"
+ local dmac=de:ad:be:ef:13:37
+ local mz_pid
+
+ # Disable unicast flooding on both ports, so that packets cannot egress
+ # any port.
+ ip link set dev $swp1 type bridge_slave flood off
+ ip link set dev $swp2 type bridge_slave flood off
+
+ RET=0
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & # background unicast traffic to $dmac
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Allow packets to be flooded to one port.
+ ip link set dev $swp2 type bridge_slave flood on
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ log_test "Port list is empty - unicast"
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+
+ ip link set dev $swp1 type bridge_slave flood on # $swp2 was re-enabled above; this restores $swp1
+}
+
+port_list_is_empty_mc_test()
+{
+ local trap_name="port_list_is_empty"
+ local dmac=01:00:5e:00:00:01
+ local dip=239.0.0.1
+ local mz_pid
+
+ # Disable multicast flooding on both ports, so that packets cannot
+ # egress any port. We also need to flush IP addresses from the bridge
+ # in order to prevent packets from being flooded to the router port.
+ ip link set dev $swp1 type bridge_slave mcast_flood off
+ ip link set dev $swp2 type bridge_slave mcast_flood off
+ ip address flush dev br0 # NOTE(review): nothing in this function re-adds addresses to br0 afterwards
+
+ RET=0
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -B $dip -d 1msec -q & # background multicast traffic to $dmac / $dip
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Allow packets to be flooded to one port.
+ ip link set dev $swp2 type bridge_slave mcast_flood on
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ log_test "Port list is empty - multicast"
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+
+ ip link set dev $swp1 type bridge_slave mcast_flood on # $swp2 was re-enabled above; this restores $swp1
+}
+
+port_list_is_empty_test()
+{
+ port_list_is_empty_uc_test
+ port_list_is_empty_mc_test
+}
+
+port_loopback_filter_uc_test()
+{
+ local trap_name="port_loopback_filter"
+ local dmac=de:ad:be:ef:13:37
+ local mz_pid
+
+ # Make sure packets can only egress the input port.
+ ip link set dev $swp2 type bridge_slave flood off
+
+ RET=0
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 \
+ flower dst_mac $dmac action drop
+
+ $MZ $h1 -c 0 -p 100 -a own -b $dmac -t ip -d 1msec -q & # background unicast traffic to $dmac
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp2 101
+
+ # Allow packets to be flooded.
+ ip link set dev $swp2 type bridge_slave flood on # this also restores the setting changed at the top
+ devlink_trap_action_set $trap_name "trap"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle when packets should not be dropped"
+ devlink_trap_group_stats_idle_test $(devlink_trap_group_get $trap_name)
+ check_err $? "Trap group stats not idle with when packets should not be dropped"
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_fail $? "Packets not forwarded when should" # a zero count on the egress filter is the failure case here
+
+ devlink_trap_action_set $trap_name "drop" # put the trap action back to "drop" before cleaning up
+
+ log_test "Port loopback filter - unicast"
+
+ devlink_trap_drop_cleanup $mz_pid $swp2 ip 1 101
+}
+
+port_loopback_filter_test()
+{
+ port_loopback_filter_uc_test
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh
new file mode 100755
index 000000000..269b26806
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_drops.sh
@@ -0,0 +1,660 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap L3 drops functionality over mlxsw. Each registered L3 drop
+# packet trap is tested to make sure it is triggered under the right
+# conditions.
+
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | 2001:db8:1::1/64 |
+# | | |
+# | | default via 192.0.2.2 |
+# | | default via 2001:db8:1::2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | 2001:db8:1::2/64 |
+# | |
+# | 2001:db8:2::2/64 |
+# | 198.51.100.2/24 |
+# | + $rp2 |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | default via 2001:db8:2::2 |
+# | | |
+# | | 2001:db8:2::1/64 |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ non_ip_test
+ uc_dip_over_mc_dmac_test
+ dip_is_loopback_test
+ sip_is_mc_test
+ sip_is_loopback_test
+ ip_header_corrupted_test
+ ipv4_sip_is_limited_bc_test
+ ipv6_mc_dip_reserved_scope_test
+ ipv6_mc_dip_interface_local_scope_test
+ blackhole_route_test
+ irif_disabled_test
+ erif_disabled_test
+"
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+h1_destroy()
+{
+ ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Configure host H2: addresses on $h2 plus IPv4/IPv6 default routes via the
+# router addresses on $rp2.
+h2_create()
+{
+ local host_vrf=v$h2
+
+ simple_if_init $h2 $h2_ipv4/24 $h2_ipv6/64
+
+ ip -4 route add default vrf $host_vrf nexthop via 198.51.100.2
+ ip -6 route add default vrf $host_vrf nexthop via 2001:db8:2::2
+}
+
+h2_destroy()
+{
+ ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 $h2_ipv4/24 $h2_ipv6/64
+}
+
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ tc qdisc add dev $rp2 clsact
+
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
+ __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64
+}
+
+# Undo router_create in reverse order: addresses, clsact qdisc, link state.
+router_destroy()
+{
+ local router_port
+
+ __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64
+ __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64
+
+ tc qdisc del dev $rp2 clsact
+
+ for router_port in $rp2 $rp1; do
+     ip link set dev $router_port down
+ done
+}
+
+# Resolve interface names, MAC addresses and host addresses into the globals
+# used by the tests, then build the hosts and the router.
+setup_prepare()
+{
+ # Host addresses used by the tests and by h2_create/h2_destroy.
+ h1_ipv4=192.0.2.1
+ h1_ipv6=2001:db8:1::1
+ h2_ipv4=198.51.100.1
+ h2_ipv6=2001:db8:2::1
+
+ # Interfaces: p1/p2 are $h1/$rp1, p3/p4 are $rp2/$h2.
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ # MAC addresses used when hand-crafting frames from $h1 to the router.
+ h1mac=$(mac_get $h1)
+ rp1mac=$(mac_get $rp1)
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+
+ router_create
+}
+
+# Tear down everything built by setup_prepare, in reverse order. Registered
+# as the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+# Check that ordinary traffic is not caught by a trap.
+# Arguments: $1 - name of the devlink trap
+# Sets the trap action to "trap", pings $h2_ipv4 from $h1, records an error
+# if the ping fails, and sets the action back to "drop".
+ping_check()
+{
+ # Declared local so the name does not leak into the global scope.
+ local trap_name=$1; shift
+
+ devlink_trap_action_set $trap_name "trap"
+ ping_do $h1 $h2_ipv4
+ check_err $? "Packets that should not be trapped were trapped"
+ devlink_trap_action_set $trap_name "drop"
+}
+
+# Exercise the "non_ip" trap: send frames from $h1 to the router MAC whose
+# EtherType field is 00:00 and run them through devlink_trap_drop_test.
+non_ip_test()
+{
+ local trap_name="non_ip"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 for traffic towards $h2_ipv4.
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower dst_ip $h2_ipv4 action drop
+
+ # Generate non-IP packets to the router
+ $MZ $h1 -c 0 -p 100 -d 1msec -B $h2_ipv4 -q "$rp1mac $h1mac \
+ 00:00 de:ad:be:ef" &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "Non IP"
+
+ # Hand the generator PID and the filter's device/protocol/pref/handle
+ # to the library teardown helper.
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101
+}
+
+# Exercise the "uc_dip_over_mc_dmac" trap for one address family.
+# Arguments: $1 - description appended to the test log line
+#            $2 - tc protocol of the egress filter ("ip" or "ipv6")
+#            $3 - unicast destination IP of the generated packets
+#            $4 - optional extra flags for $MZ (defaults to empty)
+__uc_dip_over_mc_dmac_test()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="uc_dip_over_mc_dmac"
+ local dmac=01:02:03:04:05:06
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 matching the generated UDP flow.
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower ip_proto udp src_port 54321 dst_port 12345 action drop
+
+ # Generate IP packets with a unicast IP and a multicast destination MAC
+ $MZ $h1 $flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -b $dmac \
+ -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "Unicast destination IP over multicast destination MAC: $desc"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101
+}
+
+# Run the unicast-DIP-over-multicast-DMAC check for IPv4 and then IPv6,
+# targeting H2's addresses.
+uc_dip_over_mc_dmac_test()
+{
+ __uc_dip_over_mc_dmac_test "IPv4" "ip" $h2_ipv4
+ __uc_dip_over_mc_dmac_test "IPv6" "ipv6" $h2_ipv6 "-6"
+}
+
+# Exercise the "sip_is_loopback_address" trap for one address family.
+# Arguments: $1 - description appended to the test log line
+#            $2 - tc protocol of the egress filter ("ip" or "ipv6")
+#            $3 - loopback source IP placed in the generated packets
+#            $4 - destination IP of the generated packets
+#            $5 - optional extra flags for $MZ (defaults to empty)
+__sip_is_loopback_test()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="sip_is_loopback_address"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the offending source IP.
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower src_ip $sip action drop
+
+ # Generate packets with loopback source IP
+ $MZ $h1 $flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -A $sip \
+ -b $rp1mac -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "Source IP is loopback address: $desc"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101
+}
+
+# Run the loopback-source-IP check for IPv4 (127.0.0.0/8) and then IPv6
+# (::1), with H2 as the destination.
+sip_is_loopback_test()
+{
+ __sip_is_loopback_test "IPv4" "ip" "127.0.0.0/8" $h2_ipv4
+ __sip_is_loopback_test "IPv6" "ipv6" "::1" $h2_ipv6 "-6"
+}
+
+# Exercise the "dip_is_loopback_address" trap for one address family.
+# Arguments: $1 - description appended to the test log line
+#            $2 - tc protocol of the egress filter ("ip" or "ipv6")
+#            $3 - loopback destination IP of the generated packets
+#            $4 - optional extra flags for $MZ (defaults to empty)
+__dip_is_loopback_test()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="dip_is_loopback_address"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the offending
+ # destination IP.
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower dst_ip $dip action drop
+
+ # Generate packets with loopback destination IP
+ $MZ $h1 $flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -b $rp1mac \
+ -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "Destination IP is loopback address: $desc"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101
+}
+
+# Run the loopback-destination-IP check for IPv4 (127.0.0.0/8) and then
+# IPv6 (::1).
+dip_is_loopback_test()
+{
+ __dip_is_loopback_test "IPv4" "ip" "127.0.0.0/8"
+ __dip_is_loopback_test "IPv6" "ipv6" "::1" "-6"
+}
+
+# Exercise the "sip_is_mc" trap for one address family.
+# Arguments: $1 - description appended to the test log line
+#            $2 - tc protocol of the egress filter ("ip" or "ipv6")
+#            $3 - multicast source IP placed in the generated packets
+#            $4 - destination IP of the generated packets
+#            $5 - optional extra flags for $MZ (defaults to empty)
+__sip_is_mc_test()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="sip_is_mc"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the offending source IP.
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower src_ip $sip action drop
+
+ # Generate packets with multicast source IP
+ $MZ $h1 $flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -A $sip \
+ -b $rp1mac -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "Source IP is multicast: $desc"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101
+}
+
+# Run the multicast-source-IP check for IPv4 (239.1.1.1) and then IPv6
+# (FF02::2), with H2 as the destination.
+sip_is_mc_test()
+{
+ __sip_is_mc_test "IPv4" "ip" "239.1.1.1" $h2_ipv4
+ __sip_is_mc_test "IPv6" "ipv6" "FF02::2" $h2_ipv6 "-6"
+}
+
+# Exercise the "ipv4_sip_is_limited_bc" trap: send UDP packets from $h1 to
+# H2 with 255.255.255.255 as source IP and run them through
+# devlink_trap_drop_test.
+ipv4_sip_is_limited_bc_test()
+{
+ local trap_name="ipv4_sip_is_limited_bc"
+ local sip=255.255.255.255
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the offending source IP.
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower src_ip $sip action drop
+
+ # Generate packets with limited broadcast source IP
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 0 -p 100 -A $sip -b $rp1mac \
+ -B $h2_ipv4 -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "IPv4 source IP is limited broadcast"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101
+}
+
+ipv4_payload_get()
+{
+ local ipver=$1; shift
+ local ihl=$1; shift
+ local checksum=$1; shift
+
+ p=$(:
+ )"08:00:"$( : ETH type
+ )"$ipver"$( : IP version
+ )"$ihl:"$( : IHL
+ )"00:"$( : IP TOS
+ )"00:F4:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"30:"$( : IP TTL
+ )"01:"$( : IP proto
+ )"$checksum:"$( : IP header csum
+ )"$h1_ipv4:"$( : IP saddr
+ )"$h2_ipv4:"$( : IP daddr
+ )
+ echo $p
+}
+
+# Exercise the "ip_header_corrupted" trap with a hand-built IPv4 header.
+# Arguments: $1 - description used in the test log line
+#            $2 - IP version nibble to place in the header
+#            $3 - IHL nibble to place in the header
+#            $4 - header checksum to place in the header, as "XX:XX"
+__ipv4_header_corrupted_test()
+{
+ local desc=$1; shift
+ local ipver=$1; shift
+ local ihl=$1; shift
+ local checksum=$1; shift
+ local trap_name="ip_header_corrupted"
+ local payload
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 for traffic towards $h2_ipv4.
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower dst_ip $h2_ipv4 action drop
+
+ payload=$(ipv4_payload_get $ipver $ihl $checksum)
+
+ # Generate packets with corrupted IP header
+ $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "IP header corrupted: $desc: IPv4"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ip" 1 101
+}
+
+ipv6_payload_get()
+{
+ local ipver=$1; shift
+
+ p=$(:
+ )"86:DD:"$( : ETH type
+ )"$ipver"$( : IP version
+ )"0:0:"$( : Traffic class
+ )"0:00:00:"$( : Flow label
+ )"00:00:"$( : Payload length
+ )"01:"$( : Next header
+ )"04:"$( : Hop limit
+ )"$h1_ipv6:"$( : IP saddr
+ )"$h2_ipv6:"$( : IP daddr
+ )
+ echo $p
+}
+
+# Exercise the "ip_header_corrupted" trap with a hand-built IPv6 header.
+# Arguments: $1 - description used in the test log line
+#            $2 - IP version nibble to place in the header
+__ipv6_header_corrupted_test()
+{
+ local desc=$1; shift
+ local ipver=$1; shift
+ local trap_name="ip_header_corrupted"
+ local payload
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # The generated frames carry EtherType 86:DD and $h2_ipv6 as
+ # destination (see ipv6_payload_get), so the egress filter has to be
+ # an IPv6 one; an IPv4 filter on $h2_ipv4 can never match them.
+ tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \
+     flower dst_ip $h2_ipv6 action drop
+
+ payload=$(ipv6_payload_get $ipver)
+
+ # Generate packets with corrupted IP header
+ $MZ $h1 -c 0 -d 1msec -a $h1mac -b $rp1mac -q p=$payload &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "IP header corrupted: $desc: IPv6"
+
+ # The protocol must match the one the filter was installed with.
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101
+}
+
+# Run the corrupted-header checks: three IPv4 variants (wrong version, wrong
+# IHL, wrong checksum) and one IPv6 variant (wrong version).
+ip_header_corrupted_test()
+{
+ # Each test uses one wrong value. The three values below are correct.
+ local ipv="4"
+ local ihl="5"
+ local checksum="00:F4"
+
+ __ipv4_header_corrupted_test "wrong IP version" 5 $ihl $checksum
+ __ipv4_header_corrupted_test "wrong IHL" $ipv 4 $checksum
+ __ipv4_header_corrupted_test "wrong checksum" $ipv $ihl "00:00"
+ __ipv6_header_corrupted_test "wrong IP version" 5
+}
+
+# Exercise the "ipv6_mc_dip_reserved_scope" trap: send IPv6 UDP packets from
+# $h1 to FF00:: and run them through devlink_trap_drop_test.
+ipv6_mc_dip_reserved_scope_test()
+{
+ local trap_name="ipv6_mc_dip_reserved_scope"
+ local dip=FF00::
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the destination IP.
+ tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \
+ flower dst_ip $dip action drop
+
+ # Generate packets with reserved scope destination IP
+ $MZ $h1 -6 -t udp "sp=54321,dp=12345" -c 0 -p 100 -b \
+ "33:33:00:00:00:00" -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "IPv6 multicast destination IP reserved scope"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101
+}
+
+# Exercise the "ipv6_mc_dip_interface_local_scope" trap: send IPv6 UDP
+# packets from $h1 to FF01:: and run them through devlink_trap_drop_test.
+ipv6_mc_dip_interface_local_scope_test()
+{
+ local trap_name="ipv6_mc_dip_interface_local_scope"
+ local dip=FF01::
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ # Egress filter (handle 101) on $rp2 keyed on the destination IP.
+ tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \
+ flower dst_ip $dip action drop
+
+ # Generate packets with interface local scope destination IP
+ $MZ $h1 -6 -t udp "sp=54321,dp=12345" -c 0 -p 100 -b \
+ "33:33:00:00:00:00" -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+
+ log_test "IPv6 multicast destination IP interface-local scope"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 "ipv6" 1 101
+}
+
+__blackhole_route_test()
+{
+ local flags=$1; shift
+ local subnet=$1; shift
+ local proto=$1; shift
+ local dip=$1; shift
+ local ip_proto=${1:-"icmp"}; shift
+ local trap_name="blackhole_route"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ ip -$flags route add blackhole $subnet
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower skip_hw dst_ip $dip ip_proto $ip_proto action drop
+
+ # Generate packets to the blackhole route
+ $MZ $h1 -$flags -t udp "sp=54321,dp=12345" -c 0 -p 100 -b $rp1mac \
+ -B $dip -d 1msec -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $rp2 101
+ log_test "Blackhole route: IPv$flags"
+
+ devlink_trap_drop_cleanup $mz_pid $rp2 $proto 1 101
+ ip -$flags route del blackhole $subnet
+}
+
+# Run the blackhole route check for IPv4 and then IPv6, sending traffic to
+# H2's addresses while a blackhole route for the given subnet is installed.
+blackhole_route_test()
+{
+ __blackhole_route_test "4" "198.51.100.0/30" "ip" $h2_ipv4
+ __blackhole_route_test "6" "2001:db8:2::/120" "ipv6" $h2_ipv6 "icmpv6"
+}
+
+# Exercise the "irif_disabled" trap: route traffic from $h1 through a
+# temporary bridge (br0) on top of $rp1, flush br0's address while traffic is
+# flowing, and require that the trap's packet or byte counter changed.
+irif_disabled_test()
+{
+ local trap_name="irif_disabled"
+ local t0_packets t0_bytes
+ local t1_packets t1_bytes
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ devlink_trap_action_set $trap_name "trap"
+
+ # When RIF of a physical port ("Sub-port RIF") is destroyed, we first
+ # block the STP of the {Port, VLAN} so packets cannot get into the RIF.
+ # Using bridge enables us to see this trap because when bridge is
+ # destroyed, there is a small time window that packets can go into the
+ # RIF, while it is disabled.
+ ip link add dev br0 type bridge
+ ip link set dev $rp1 master br0
+ ip address flush dev $rp1
+ __addr_add_del br0 add 192.0.2.2/24
+ ip li set dev br0 up
+
+ # Counter snapshot before any traffic is generated.
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t0_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ # Generate packets to h2 through br0 RIF that will be removed later
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 0 -p 100 -a own -b $rp1mac \
+ -B $h2_ipv4 -q &
+ mz_pid=$!
+
+ # Wait before removing br0 RIF to allow packets to go into the bridge.
+ sleep 1
+
+ # Flushing address will dismantle the RIF
+ ip address flush dev br0
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t1_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ # Fail only if both counters are unchanged.
+ if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then
+ check_err 1 "Trap stats idle when packets should be trapped"
+ fi
+
+ log_test "Ingress RIF disabled"
+
+ # Stop the generator and restore $rp1 as a standalone router port with
+ # the addresses that were flushed above.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ ip link set dev $rp1 nomaster
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
+ ip link del dev br0 type bridge
+ devlink_trap_action_set $trap_name "drop"
+}
+
+erif_disabled_test()
+{
+ local trap_name="erif_disabled"
+ local t0_packets t0_bytes
+ local t1_packets t1_bytes
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+
+ devlink_trap_action_set $trap_name "trap"
+ ip link add dev br0 type bridge
+ ip add flush dev $rp1
+ ip link set dev $rp1 master br0
+ __addr_add_del br0 add 192.0.2.2/24
+ ip link set dev br0 up
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t0_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ rp2mac=$(mac_get $rp2)
+
+ # Generate packets that should go out through br0 RIF that will be
+ # removed later
+ $MZ $h2 -t udp "sp=54321,dp=12345" -c 0 -p 100 -a own -b $rp2mac \
+ -B 192.0.2.1 -q &
+ mz_pid=$!
+
+ sleep 5
+ # Unlinking the port from the bridge will disable the RIF associated
+ # with br0 as it is no longer an upper of any mlxsw port.
+ ip link set dev $rp1 nomaster
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t1_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then
+ check_err 1 "Trap stats idle when packets should be trapped"
+ fi
+
+ log_test "Egress RIF disabled"
+
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
+ ip link del dev br0 type bridge
+ devlink_trap_action_set $trap_name "drop"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh
new file mode 100755
index 000000000..1d157b1bd
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_l3_exceptions.sh
@@ -0,0 +1,552 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap L3 exceptions functionality over mlxsw.
+# Check all exception traps to make sure they are triggered under the right
+# conditions.
+
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | 2001:db8:1::1/64 |
+# | | |
+# | | default via 192.0.2.2 |
+# | | default via 2001:db8:1::2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | 2001:db8:1::2/64 |
+# | |
+# | 2001:db8:2::2/64 |
+# | 198.51.100.2/24 |
+# | + $rp2 |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | default via 2001:db8:2::2 |
+# | | |
+# | | 2001:db8:2::1/64 |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ mtu_value_is_too_small_test
+ ttl_value_is_too_small_test
+ mc_reverse_path_forwarding_test
+ reject_route_test
+ unresolved_neigh_test
+ ipv4_lpm_miss_test
+ ipv6_lpm_miss_test
+"
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+require_command $MCD
+require_command $MC_CLI
+table_name=selftests
+
+# Configure host H1: addresses on $h1, IPv4/IPv6 default routes via the
+# router, and a clsact qdisc so tests can install ingress filters on $h1.
+h1_create()
+{
+ local host_vrf=v$h1
+
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+ ip -4 route add default vrf $host_vrf nexthop via 192.0.2.2
+ ip -6 route add default vrf $host_vrf nexthop via 2001:db8:1::2
+
+ tc qdisc add dev $h1 clsact
+}
+
+# Undo h1_create in reverse order: qdisc, default routes, interface setup.
+h1_destroy()
+{
+ local host_vrf=v$h1
+
+ tc qdisc del dev $h1 clsact
+
+ ip -6 route del default vrf $host_vrf nexthop via 2001:db8:1::2
+ ip -4 route del default vrf $host_vrf nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Configure host H2: addresses on $h2 plus IPv4/IPv6 default routes via the
+# router addresses on $rp2.
+h2_create()
+{
+ local host_vrf=v$h2
+
+ simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64
+
+ ip -4 route add default vrf $host_vrf nexthop via 198.51.100.2
+ ip -6 route add default vrf $host_vrf nexthop via 2001:db8:2::2
+}
+
+h2_destroy()
+{
+ ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
+}
+
+# Bring up both router ports, attach a clsact qdisc to $rp2 and assign the
+# router addresses.
+router_create()
+{
+ local router_port
+
+ for router_port in $rp1 $rp2; do
+     ip link set dev $router_port up
+ done
+
+ tc qdisc add dev $rp2 clsact
+
+ __addr_add_del $rp1 add 192.0.2.2/24 2001:db8:1::2/64
+ __addr_add_del $rp2 add 198.51.100.2/24 2001:db8:2::2/64
+}
+
+# Undo router_create in reverse order: addresses, clsact qdisc, link state.
+router_destroy()
+{
+ local router_port
+
+ __addr_add_del $rp2 del 198.51.100.2/24 2001:db8:2::2/64
+ __addr_add_del $rp1 del 192.0.2.2/24 2001:db8:1::2/64
+
+ tc qdisc del dev $rp2 clsact
+
+ for router_port in $rp2 $rp1; do
+     ip link set dev $router_port down
+ done
+}
+
+# Resolve interface names and the router MAC into globals, start the
+# multicast routing daemon, then build the hosts and the router.
+setup_prepare()
+{
+ # Interfaces: p1/p2 are $h1/$rp1, p3/p4 are $rp2/$h2.
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ # Destination MAC used by the tests when sending from $h1 to the router.
+ rp1mac=$(mac_get $rp1)
+
+ start_mcd
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+
+ router_create
+}
+
+# Tear down everything built by setup_prepare, in reverse order, and stop
+# the multicast routing daemon last. Registered as the EXIT trap at the
+# bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+
+ kill_mcd
+}
+
+# Sanity check run at the start of each test: ping H2 (198.51.100.1) from
+# $h1 and record an error if the ping fails.
+# Callers pass the trap name as an argument, but it is not used here.
+ping_check()
+{
+ ping_do $h1 198.51.100.1
+ check_err $? "Packets that should not be trapped were trapped"
+}
+
+# Record a test error unless the trap currently has the expected action.
+# Arguments: $1 - trap name
+#            $2 - expected action
+trap_action_check()
+{
+ local trap_name=$1; shift
+ local expected_action=$1; shift
+ # Declared local so the value does not leak into the global scope.
+ local action
+
+ action=$(devlink_trap_action_get $trap_name)
+ # Both operands are quoted: an empty $expected_action would otherwise
+ # turn the test into a syntax error instead of a mismatch.
+ if [ "$action" != "$expected_action" ]; then
+     check_err 1 "Trap $trap_name has wrong action: $action"
+ fi
+}
+
+# Exercise the "mtu_value_is_too_small" trap: lower the MTU of $rp2 to 1300,
+# send 1400-byte UDP packets with the don't-fragment flag from $h1, run
+# devlink_trap_exception_test and require that the ICMP "fragmentation
+# needed" filter on $h1 ingress was hit.
+mtu_value_is_too_small_test()
+{
+ local trap_name="mtu_value_is_too_small"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ # type - Destination Unreachable
+ # code - Fragmentation Needed and Don't Fragment was Set
+ tc filter add dev $h1 ingress protocol ip pref 1 handle 101 \
+ flower skip_hw ip_proto icmp type 3 code 4 action pass
+
+ mtu_set $rp2 1300
+
+ # Generate IP packets bigger than router's MTU with don't fragment
+ # flag on.
+ $MZ $h1 -t udp "sp=54321,dp=12345,df" -p 1400 -c 0 -d 1msec -b $rp1mac \
+ -B 198.51.100.1 -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ tc_check_packets_hitting "dev $h1 ingress" 101
+ check_err $? "Packets were not received to h1"
+
+ log_test "MTU value is too small"
+
+ mtu_restore $rp2
+
+ # Stop the generator and remove the ingress filter.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower
+}
+
+# Exercise the "ttl_value_is_too_small" trap for one TTL value.
+# Arguments: $1 - TTL to place in the generated packets
+# Requires that the ICMP "time exceeded" filter on $h1 ingress was hit.
+__ttl_value_is_too_small_test()
+{
+ local ttl_val=$1; shift
+ local trap_name="ttl_value_is_too_small"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ # type - Time Exceeded
+ # code - Time to Live exceeded in Transit
+ tc filter add dev $h1 ingress protocol ip pref 1 handle 101 \
+ flower skip_hw ip_proto icmp type 11 code 0 action pass
+
+ # Generate IP packets with small TTL
+ $MZ $h1 -t udp "ttl=$ttl_val,sp=54321,dp=12345" -c 0 -d 1msec \
+ -b $rp1mac -B 198.51.100.1 -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ tc_check_packets_hitting "dev $h1 ingress" 101
+ check_err $? "Packets were not received to h1"
+
+ log_test "TTL value is too small: TTL=$ttl_val"
+
+ # Stop the generator and remove the ingress filter.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $h1 ingress protocol ip pref 1 handle 101 flower
+}
+
+# Run the small-TTL check for TTL 0 and then TTL 1.
+ttl_value_is_too_small_test()
+{
+ local small_ttl
+
+ for small_ttl in 0 1; do
+     __ttl_value_is_too_small_test $small_ttl
+ done
+}
+
+start_mcd()
+{
+ SMCROUTEDIR="$(mktemp -d)"
+ for ((i = 1; i <= $NUM_NETIFS; ++i)); do
+ echo "phyint ${NETIFS[p$i]} enable" >> \
+ $SMCROUTEDIR/$table_name.conf
+ done
+
+ $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \
+ -P $SMCROUTEDIR/$table_name.pid
+}
+
+# Stop the multicast routing daemon and remove the temporary directory
+# created by start_mcd.
+kill_mcd()
+{
+ # NOTE(review): pkill matches by process name pattern, so this is not
+ # limited to the instance started by start_mcd - confirm that is
+ # acceptable, given a PID file is written to $SMCROUTEDIR.
+ pkill $MCD
+ rm -rf $SMCROUTEDIR
+}
+
+# Exercise the "mc_reverse_path_forwarding" trap for one address family.
+# A multicast route with $rp1 as input and $rp2 as output is installed, but
+# the traffic is generated from $h2; the test then runs
+# devlink_trap_exception_test and requires that no packet hit the egress
+# filter on $rp2.
+# Arguments: $1 - description appended to the test log line
+#            $2 - source IP of the multicast flow
+#            $3 - multicast destination IP
+#            $4 - multicast destination MAC
+#            $5 - tc protocol of the egress filter ("ip" or "ipv6")
+#            $6 - optional extra flags for $MZ (defaults to empty)
+__mc_reverse_path_forwarding_test()
+{
+ local desc=$1; shift
+ local src_ip=$1; shift
+ local dst_ip=$1; shift
+ local dst_mac=$1; shift
+ local proto=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="mc_reverse_path_forwarding"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower dst_ip $dst_ip ip_proto udp action drop
+
+ # NOTE(review): this multicast route is not removed at the end of the
+ # function; it presumably goes away when kill_mcd stops the daemon.
+ $MC_CLI -I $table_name add $rp1 $src_ip $dst_ip $rp2
+
+ # Generate packets to multicast address.
+ $MZ $h2 $flags -t udp "sp=54321,dp=12345" -c 0 -p 128 \
+ -a 00:11:22:33:44:55 -b $dst_mac \
+ -A $src_ip -B $dst_ip -q &
+
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ tc_check_packets "dev $rp2 egress" 101 0
+ check_err $? "Packets were not dropped"
+
+ log_test "Multicast reverse path forwarding: $desc"
+
+ # Stop the generator and remove the egress filter.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower
+}
+
+# Run the multicast reverse-path-forwarding check for IPv4 and then IPv6,
+# using H1's addresses as the flow source.
+mc_reverse_path_forwarding_test()
+{
+ __mc_reverse_path_forwarding_test "IPv4" "192.0.2.1" "225.1.2.3" \
+ "01:00:5e:01:02:03" "ip"
+ __mc_reverse_path_forwarding_test "IPv6" "2001:db8:1::1" "ff0e::3" \
+ "33:33:00:00:00:03" "ipv6" "-6"
+}
+
+# Exercise the "reject_route" trap for one address family: install an
+# unreachable route, send UDP traffic from $h1 to an address it covers, run
+# devlink_trap_exception_test and require that the given ICMP type/code
+# filter on $h1 ingress was hit.
+# Arguments: $1 - description appended to the test log line
+#            $2 - destination IP of the generated packets
+#            $3 - tc protocol of the ingress filter ("ip" or "ipv6")
+#            $4 - IP protocol of the expected reply ("icmp" or "icmpv6")
+#            $5 - expected ICMP type
+#            $6 - expected ICMP code
+#            $7 - prefix to install as an unreachable route
+#            $8 - optional extra flags for $MZ (defaults to empty)
+__reject_route_test()
+{
+ local desc=$1; shift
+ local dst_ip=$1; shift
+ local proto=$1; shift
+ local ip_proto=$1; shift
+ local type=$1; shift
+ local code=$1; shift
+ local unreachable=$1; shift
+ local flags=${1:-""}; shift
+ local trap_name="reject_route"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ tc filter add dev $h1 ingress protocol $proto pref 1 handle 101 flower \
+ skip_hw ip_proto $ip_proto type $type code $code action pass
+
+ ip route add unreachable $unreachable
+
+ # Generate packets to h2. The destination IP is unreachable.
+ $MZ $flags $h1 -t udp "sp=54321,dp=12345" -c 0 -d 1msec -b $rp1mac \
+ -B $dst_ip -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ tc_check_packets_hitting "dev $h1 ingress" 101
+ check_err $? "ICMP packet was not received to h1"
+
+ log_test "Reject route: $desc"
+
+ # Stop the generator, then remove the route and the ingress filter.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ ip route del unreachable $unreachable
+ tc filter del dev $h1 ingress protocol $proto pref 1 handle 101 flower
+}
+
+# Run the reject route check for IPv4 and then IPv6, with H2's address as
+# the destination and an unreachable route covering it.
+reject_route_test()
+{
+ # type - Destination Unreachable
+ # code - Host Unreachable
+ __reject_route_test "IPv4" 198.51.100.1 "ip" "icmp" 3 1 \
+ "198.51.100.0/26"
+ # type - Destination Unreachable
+ # code - No Route
+ __reject_route_test "IPv6" 2001:db8:2::1 "ipv6" "icmpv6" 1 0 \
+ "2001:db8:2::0/66" "-6"
+}
+
+__host_miss_test()
+{
+ local desc=$1; shift
+ local dip=$1; shift
+ local trap_name="unresolved_neigh"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ ip neigh flush dev $rp2
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ # Generate packets to h2 (will incur a unresolved neighbor).
+ # The ping should pass and devlink counters should be increased.
+ ping_do $h1 $dip
+ check_err $? "ping failed: $desc"
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ if [[ $t0_packets -eq $t1_packets ]]; then
+ check_err 1 "Trap counter did not increase"
+ fi
+
+ log_test "Unresolved neigh: host miss: $desc"
+}
+
+__invalid_nexthop_test()
+{
+ local desc=$1; shift
+ local dip=$1; shift
+ local extra_add=$1; shift
+ local subnet=$1; shift
+ local via_add=$1; shift
+ local trap_name="unresolved_neigh"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ ip address add $extra_add/$subnet dev $h2
+
+ # Check that correct route does not trigger unresolved_neigh
+ ip $flags route add $dip via $extra_add dev $rp2
+
+ # Generate packets in order to discover all neighbours.
+ # Without it, counters of unresolved_neigh will be increased
+ # during neighbours discovery and the check below will fail
+ # for a wrong reason
+ ping_do $h1 $dip
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+ ping_do $h1 $dip
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ if [[ $t0_packets -ne $t1_packets ]]; then
+ check_err 1 "Trap counter increased when it should not"
+ fi
+
+ ip $flags route del $dip via $extra_add dev $rp2
+
+ # Check that route to nexthop that does not exist trigger
+ # unresolved_neigh
+ ip $flags route add $dip via $via_add dev $h2
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+ ping_do $h1 $dip
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ if [[ $t0_packets -eq $t1_packets ]]; then
+ check_err 1 "Trap counter did not increase"
+ fi
+
+ ip $flags route del $dip via $via_add dev $h2
+ ip address del $extra_add/$subnet dev $h2
+ log_test "Unresolved neigh: nexthop does not exist: $desc"
+}
+
+# Run the unresolved-neighbour checks: host miss for IPv4 and IPv6, then
+# the invalid-nexthop variant for IPv4 and IPv6.
+unresolved_neigh_test()
+{
+ __host_miss_test "IPv4" 198.51.100.1
+ __host_miss_test "IPv6" 2001:db8:2::1
+ __invalid_nexthop_test "IPv4" 198.51.100.1 198.51.100.3 24 198.51.100.4
+ __invalid_nexthop_test "IPv6" 2001:db8:2::1 2001:db8:2::3 64 \
+ 2001:db8:2::4
+}
+
+vrf_without_routes_create()
+{
+ # VRF creating makes the links to be down and then up again.
+ # By default, IPv6 address is not saved after link becomes down.
+ # Save IPv6 address using sysctl configuration.
+ sysctl_set net.ipv6.conf.$rp1.keep_addr_on_down 1
+ sysctl_set net.ipv6.conf.$rp2.keep_addr_on_down 1
+
+ ip link add dev vrf1 type vrf table 101
+ ip link set dev $rp1 master vrf1
+ ip link set dev $rp2 master vrf1
+ ip link set dev vrf1 up
+
+ # Wait for rp1 and rp2 to be up
+ setup_wait
+}
+
+# Undo vrf_without_routes_create: release both router ports from vrf1,
+# delete the VRF, restore the sysctls and wait for the ports to come up.
+vrf_without_routes_destroy()
+{
+ local router_port
+
+ for router_port in $rp1 $rp2; do
+     ip link set dev $router_port nomaster
+ done
+ ip link del dev vrf1
+
+ # Restore in the reverse order of the sysctl_set calls.
+ for router_port in $rp2 $rp1; do
+     sysctl_restore net.ipv6.conf.$router_port.keep_addr_on_down
+ done
+
+ # Wait for interfaces to be up
+ setup_wait
+}
+
+# Exercise the "ipv4_lpm_miss" trap: place the router ports in a VRF without
+# routes, send UDP traffic from $h1 to 203.0.113.1 and run
+# devlink_trap_exception_test.
+ipv4_lpm_miss_test()
+{
+ local trap_name="ipv4_lpm_miss"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ # Create a VRF without a default route
+ vrf_without_routes_create
+
+ # Generate packets through a VRF without a matching route.
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 0 -d 1msec -b $rp1mac \
+ -B 203.0.113.1 -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ log_test "LPM miss: IPv4"
+
+ # Stop the generator and move the router ports out of the VRF again.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ vrf_without_routes_destroy
+}
+
+# Exercise the "ipv6_lpm_miss" trap: place the router ports in a VRF without
+# routes, send IPv6 UDP traffic from $h1 to 2001:db8::1 and run
+# devlink_trap_exception_test.
+ipv6_lpm_miss_test()
+{
+ local trap_name="ipv6_lpm_miss"
+ local expected_action="trap"
+ local mz_pid
+
+ RET=0
+
+ ping_check $trap_name
+ trap_action_check $trap_name $expected_action
+
+ # Create a VRF without a default route
+ vrf_without_routes_create
+
+ # Generate packets through a VRF without a matching route.
+ $MZ -6 $h1 -t udp "sp=54321,dp=12345" -c 0 -d 1msec -b $rp1mac \
+ -B 2001:db8::1 -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ log_test "LPM miss: IPv6"
+
+ # Stop the generator and move the router ports out of the VRF again.
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ vrf_without_routes_destroy
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh
new file mode 100755
index 000000000..508a702f0
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_policer.sh
@@ -0,0 +1,361 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap policer functionality over mlxsw.
+
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | |
+# | | default via 192.0.2.2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | |
+# | 198.51.100.2/24 |
+# | + $rp2 |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ rate_limits_test
+ burst_limits_test
+ rate_test
+ burst_test
+"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Configure host H1 on $h1: address 192.0.2.1/24 through simple_if_init,
+# MTU 10000, and an IPv4 default route in VRF v$h1 via 192.0.2.2.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+ mtu_set $h1 10000
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+}
+
+# Undo h1_create in reverse order: default route, MTU, then the interface.
+h1_destroy()
+{
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ mtu_restore $h1
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+# Configure host H2 on $h2: address 198.51.100.1/24 through simple_if_init,
+# MTU 10000, and an IPv4 default route in VRF v$h2 via 198.51.100.2.
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/24
+ mtu_set $h2 10000
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+}
+
+# Undo h2_create in reverse order: default route, MTU, then the interface.
+h2_destroy()
+{
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ mtu_restore $h2
+ simple_if_fini $h2 198.51.100.1/24
+}
+
+# Configure the router ports $rp1 and $rp2 (link up, addresses, MTU 10000),
+# install a blackhole route for 198.51.100.100 and switch the
+# blackhole_route trap from its "drop" action to "trap". The rate and burst
+# tests send traffic to that address and read the trap's counters.
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ __addr_add_del $rp1 add 192.0.2.2/24
+ __addr_add_del $rp2 add 198.51.100.2/24
+ mtu_set $rp1 10000
+ mtu_set $rp2 10000
+
+ ip -4 route add blackhole 198.51.100.100
+
+ devlink trap set $DEVLINK_DEV trap blackhole_route action trap
+}
+
+# Undo router_create in reverse order: trap action back to "drop", blackhole
+# route removed, MTU and addresses restored, ports set down.
+router_destroy()
+{
+ devlink trap set $DEVLINK_DEV trap blackhole_route action drop
+
+ ip -4 route del blackhole 198.51.100.100
+
+ mtu_restore $rp2
+ mtu_restore $rp1
+ __addr_add_del $rp2 del 198.51.100.2/24
+ __addr_add_del $rp1 del 192.0.2.2/24
+
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+# Map the four test interfaces from NETIFS to their roles (p1 -> $h1,
+# p2 -> $rp1, p3 -> $rp2, p4 -> $h2), record the MAC address of $rp1 for use
+# as the destination MAC of generated traffic, and build the topology.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp1_mac=$(mac_get $rp1)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+}
+
+# EXIT handler: tear the topology down in the reverse order of
+# setup_prepare, then reload the devlink device.
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+
+ # Reload to ensure devlink-trap settings are back to default.
+ devlink_reload
+}
+
+# Check the accepted range of the policer rate on policer 1: rates 0 and
+# 2000000001 must be rejected, rates 1 and 2000000000 must be accepted.
+rate_limits_test()
+{
+ RET=0
+
+ # Out-of-range values: the command is expected to fail, so its output
+ # is discarded
+ devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null
+ check_fail $? "Policer rate was changed to rate lower than limit"
+ devlink trap policer set $DEVLINK_DEV policer 1 \
+ rate 2000000001 &> /dev/null
+ check_fail $? "Policer rate was changed to rate higher than limit"
+
+ # Boundary values: the command is expected to succeed
+ devlink trap policer set $DEVLINK_DEV policer 1 rate 1
+ check_err $? "Failed to set policer rate to minimum"
+ devlink trap policer set $DEVLINK_DEV policer 1 rate 2000000000
+ check_err $? "Failed to set policer rate to maximum"
+
+ log_test "Trap policer rate limits"
+}
+
+# Check the accepted burst sizes on policer 1: 0, 17 (not a power of 2),
+# 8 (below the minimum) and 2^25 (above the maximum) must be rejected;
+# 16 and 2^24 must be accepted.
+burst_limits_test()
+{
+ RET=0
+
+ # Invalid values: the command is expected to fail, so its output is
+ # discarded
+ devlink trap policer set $DEVLINK_DEV policer 1 burst 0 &> /dev/null
+ check_fail $? "Policer burst size was changed to 0"
+ devlink trap policer set $DEVLINK_DEV policer 1 burst 17 &> /dev/null
+ check_fail $? "Policer burst size was changed to burst size that is not power of 2"
+ devlink trap policer set $DEVLINK_DEV policer 1 burst 8 &> /dev/null
+ check_fail $? "Policer burst size was changed to burst size lower than limit"
+ devlink trap policer set $DEVLINK_DEV policer 1 \
+ burst $((2**25)) &> /dev/null
+ check_fail $? "Policer burst size was changed to burst size higher than limit"
+
+ # Boundary values: the command is expected to succeed
+ devlink trap policer set $DEVLINK_DEV policer 1 burst 16
+ check_err $? "Failed to set policer burst size to minimum"
+ devlink trap policer set $DEVLINK_DEV policer 1 burst $((2**24))
+ check_err $? "Failed to set policer burst size to maximum"
+
+ log_test "Trap policer burst size limits"
+}
+
+trap_rate_get()
+{
+ local t0 t1
+
+ t0=$(devlink_trap_rx_packets_get blackhole_route)
+ sleep 10
+ t1=$(devlink_trap_rx_packets_get blackhole_route)
+
+ echo $(((t1 - t0) / 10))
+}
+
+policer_drop_rate_get()
+{
+ local id=$1; shift
+ local t0 t1
+
+ t0=$(devlink_trap_policer_rx_dropped_get $id)
+ sleep 10
+ t1=$(devlink_trap_policer_rx_dropped_get $id)
+
+ echo $(((t1 - t0) / 10))
+}
+
+# Run the rate test for a single policer. The policer is set to 1000 pps
+# with burst 512 and bound to the l3_drops trap group; three phases follow:
+# 1. traffic at the highest rate: trapped rate within +-10% of 1000 pps and
+#    a non-zero policer drop rate;
+# 2. traffic at 1000 pps: zero policer drop rate;
+# 3. policer unbound, traffic at the highest rate: trapped rate above
+#    1000 pps and zero policer drop rate.
+# Arguments: $1 - policer ID
+__rate_test()
+{
+ local rate pct drop_rate
+ local id=$1; shift
+
+ RET=0
+
+ devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 512
+ devlink trap group set $DEVLINK_DEV group l3_drops policer $id
+
+ # Send packets at highest possible rate and make sure they are dropped
+ # by the policer. Make sure measured received rate is about 1000 pps
+ log_info "=== Tx rate: Highest, Policer rate: 1000 pps ==="
+
+ start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac
+
+ sleep 5 # Take measurements when rate is stable
+
+ rate=$(trap_rate_get)
+ # Deviation from 1000 pps in percent (integer arithmetic)
+ pct=$((100 * (rate - 1000) / 1000))
+ # (( )) exits non-zero when the deviation is outside +-10%
+ ((-10 <= pct && pct <= 10))
+ check_err $? "Expected rate 1000 pps, got $rate pps, which is $pct% off. Required accuracy is +-10%"
+ log_info "Expected rate 1000 pps, measured rate $rate pps"
+
+ drop_rate=$(policer_drop_rate_get $id)
+ (( drop_rate > 0 ))
+ check_err $? "Expected non-zero policer drop rate, got 0"
+ log_info "Measured policer drop rate of $drop_rate pps"
+
+ stop_traffic
+
+ # Send packets at a rate of 1000 pps and make sure they are not dropped
+ # by the policer
+ log_info "=== Tx rate: 1000 pps, Policer rate: 1000 pps ==="
+
+ start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -d 1msec
+
+ sleep 5 # Take measurements when rate is stable
+
+ drop_rate=$(policer_drop_rate_get $id)
+ (( drop_rate == 0 ))
+ check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps"
+ log_info "Measured policer drop rate of $drop_rate pps"
+
+ stop_traffic
+
+ # Unbind the policer and send packets at highest possible rate. Make
+ # sure they are not dropped by the policer and that the measured
+ # received rate is higher than 1000 pps
+ log_info "=== Tx rate: Highest, Policer rate: No policer ==="
+
+ devlink trap group set $DEVLINK_DEV group l3_drops nopolicer
+
+ start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac
+
+ rate=$(trap_rate_get)
+ (( rate > 1000 ))
+ check_err $? "Expected rate higher than 1000 pps, got $rate pps"
+ log_info "Measured rate $rate pps"
+
+ drop_rate=$(policer_drop_rate_get $id)
+ (( drop_rate == 0 ))
+ check_err $? "Expected zero policer drop rate, got a drop rate of $drop_rate pps"
+ log_info "Measured policer drop rate of $drop_rate pps"
+
+ stop_traffic
+
+ log_test "Trap policer rate"
+}
+
+# Run __rate_test once for every policer ID printed by
+# devlink_trap_policer_ids_get, separating the runs with a blank line.
+rate_test()
+{
+ local id
+
+ for id in $(devlink_trap_policer_ids_get); do
+ echo
+ log_info "Running rate test for policer $id"
+ __rate_test $id
+ done
+}
+
+# Run the burst test for a single policer. The policer is set to 1000 pps
+# with burst 512 and bound to the l3_drops trap group. A burst of 16 packets
+# must then be fully received with no policer drops. After unbinding the
+# policer, a burst of 64 packets must likewise be fully received with no
+# policer drops. Counters are sampled before and after each burst.
+# Arguments: $1 - policer ID
+__burst_test()
+{
+ local t0_rx t0_drop t1_rx t1_drop rx drop
+ local id=$1; shift
+
+ RET=0
+
+ devlink trap policer set $DEVLINK_DEV policer $id rate 1000 burst 512
+ devlink trap group set $DEVLINK_DEV group l3_drops policer $id
+
+ # Send a burst of 16 packets and make sure that 16 are received
+ # and that none are dropped by the policer
+ log_info "=== Tx burst size: 16, Policer burst size: 512 ==="
+
+ t0_rx=$(devlink_trap_rx_packets_get blackhole_route)
+ t0_drop=$(devlink_trap_policer_rx_dropped_get $id)
+
+ start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 16
+
+ t1_rx=$(devlink_trap_rx_packets_get blackhole_route)
+ t1_drop=$(devlink_trap_policer_rx_dropped_get $id)
+
+ rx=$((t1_rx - t0_rx))
+ (( rx == 16 ))
+ check_err $? "Expected burst size of 16 packets, got $rx packets"
+ log_info "Expected burst size of 16 packets, measured burst size of $rx packets"
+
+ drop=$((t1_drop - t0_drop))
+ (( drop == 0 ))
+ check_err $? "Expected zero policer drops, got $drop"
+ log_info "Measured policer drops of $drop packets"
+
+ # Unbind the policer and send a burst of 64 packets. Make sure that
+ # 64 packets are received and that none are dropped by the policer
+ log_info "=== Tx burst size: 64, Policer burst size: No policer ==="
+
+ devlink trap group set $DEVLINK_DEV group l3_drops nopolicer
+
+ t0_rx=$(devlink_trap_rx_packets_get blackhole_route)
+ t0_drop=$(devlink_trap_policer_rx_dropped_get $id)
+
+ start_traffic $h1 192.0.2.1 198.51.100.100 $rp1_mac -c 64
+
+ t1_rx=$(devlink_trap_rx_packets_get blackhole_route)
+ t1_drop=$(devlink_trap_policer_rx_dropped_get $id)
+
+ rx=$((t1_rx - t0_rx))
+ (( rx == 64 ))
+ check_err $? "Expected burst size of 64 packets, got $rx packets"
+ log_info "Expected burst size of 64 packets, measured burst size of $rx packets"
+
+ drop=$((t1_drop - t0_drop))
+ (( drop == 0 ))
+ check_err $? "Expected zero policer drops, got $drop"
+ log_info "Measured policer drops of $drop packets"
+
+ log_test "Trap policer burst size"
+}
+
+# Run __burst_test once for every policer ID printed by
+# devlink_trap_policer_ids_get, separating the runs with a blank line.
+burst_test()
+{
+ local id
+
+ for id in $(devlink_trap_policer_ids_get); do
+ echo
+ log_info "Running burst size test for policer $id"
+ __burst_test $id
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh
new file mode 100755
index 000000000..8817851da
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_ipip.sh
@@ -0,0 +1,263 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap tunnel exceptions functionality over mlxsw.
+# Check all exception traps to make sure they are triggered under the right
+# conditions.
+
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.2.1/28 | |
+# +-------------------|-----+
+# |
+# +-------------------|-----+
+# | SW1 | |
+# | $swp1 + |
+# | 192.0.2.2/28 |
+# | |
+# | + g1 (gre) |
+# | loc=192.0.2.65 |
+# | rem=192.0.2.66 |
+# | tos=inherit |
+# | |
+# | + $rp1 |
+# | | 198.51.100.1/28 |
+# +--|----------------------+
+# |
+# +--|----------------------+
+# | | VRF2 |
+# | + $rp2 |
+# | 198.51.100.2/28 |
+# +-------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ decap_error_test
+"
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+# Configure host H1: initialize $h1 with 192.0.2.1/28 through simple_if_init.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+# Configure VRF2: initialize $rp2 with 198.51.100.2/28 through
+# simple_if_init. Test traffic is later sent from VRF v$rp2.
+vrf2_create()
+{
+ simple_if_init $rp2 198.51.100.2/28
+}
+
+# Undo vrf2_create.
+vrf2_destroy()
+{
+ simple_if_fini $rp2 198.51.100.2/28
+}
+
+# Configure the switch: $swp1 with 192.0.2.2/28 and a clsact qdisc (the tests
+# attach egress filters to it), GRE tunnel g1 between 192.0.2.65 and
+# 192.0.2.66 with "tos inherit", and $rp1 with 198.51.100.1/28.
+switch_create()
+{
+ __addr_add_del $swp1 add 192.0.2.2/28
+ tc qdisc add dev $swp1 clsact
+ ip link set dev $swp1 up
+
+ tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit
+ __addr_add_del g1 add 192.0.2.65/32
+ ip link set dev g1 up
+
+ __addr_add_del $rp1 add 198.51.100.1/28
+ ip link set dev $rp1 up
+}
+
+# Undo switch_create in reverse order: $rp1, tunnel g1, then $swp1.
+switch_destroy()
+{
+ ip link set dev $rp1 down
+ __addr_add_del $rp1 del 198.51.100.1/28
+
+ ip link set dev g1 down
+ __addr_add_del g1 del 192.0.2.65/32
+ tunnel_destroy g1
+
+ ip link set dev $swp1 down
+ tc qdisc del dev $swp1 clsact
+ __addr_add_del $swp1 del 192.0.2.2/28
+}
+
+# Map the four test interfaces from NETIFS to their roles (p1 -> $h1,
+# p2 -> $swp1, p3 -> $rp1, p4 -> $rp2) and build the topology.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ rp1=${NETIFS[p3]}
+ rp2=${NETIFS[p4]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ switch_create
+ vrf2_create
+}
+
+# EXIT handler: tear the topology down in the reverse order of
+# setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ vrf2_destroy
+ switch_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+# Print the colon-separated hex payload used by ecn_decap_test: a GRE header
+# without a key, followed by an inner IPv4 header from 192.0.1.1 to
+# 192.0.2.1 with TOS 00.
+#
+# The string is built by concatenating quoted hex fragments. Each
+# "$( : text )" between them is a command substitution that runs the no-op
+# ":" and expands to nothing, so it only serves as an inline comment naming
+# the preceding field.
+ecn_payload_get()
+{
+ p=$(:
+ )"0"$( : GRE flags
+ )"0:00:"$( : Reserved + version
+ )"08:00:"$( : ETH protocol type
+ )"4"$( : IP version
+ )"5:"$( : IHL
+ )"00:"$( : IP TOS
+ )"00:14:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"30:"$( : IP TTL
+ )"01:"$( : IP proto
+ )"E7:E6:"$( : IP header csum
+ )"C0:00:01:01:"$( : IP saddr : 192.0.1.1
+ )"C0:00:02:01:"$( : IP daddr : 192.0.2.1
+ )
+ echo $p
+}
+
+# Send GRE packets whose outer TOS carries the given ECN value while the
+# inner header (see ecn_payload_get) has TOS 00, and check that the
+# decap_error trap is hit and nothing is forwarded out of $swp1.
+# Arguments: $1 - description prefix for the test log
+#            $2 - name of the outer ECN codepoint, for the test log
+#            $3 - outer TOS value passed to mausezahn
+ecn_decap_test()
+{
+ local trap_name="decap_error"
+ local desc=$1; shift
+ local ecn_desc=$1; shift
+ local outer_tos=$1; shift
+ local mz_pid
+
+ RET=0
+
+ # Egress filter matching the inner packet; it counts decapsulated
+ # packets that were forwarded towards $h1
+ tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+ flower src_ip 192.0.1.1 dst_ip 192.0.2.1 action pass
+
+ rp1_mac=$(mac_get $rp1)
+ rp2_mac=$(mac_get $rp2)
+ payload=$(ecn_payload_get)
+
+ # Background sender in VRF v$rp2; proto=47 is GRE
+ ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -a $rp2_mac -b $rp1_mac \
+ -A 192.0.2.66 -B 192.0.2.65 -t ip \
+ len=48,tos=$outer_tos,proto=47,p=$payload -q &
+
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ # The egress filter must not have seen any packet
+ tc_check_packets "dev $swp1 egress" 101 0
+ check_err $? "Packets were not dropped"
+
+ log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
+
+ # Stop the background sender and reap it, discarding the output of wait
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
+}
+
+# Print a colon-separated hex payload: a GRE header with caller-chosen flags
+# and optional key, followed by an inner IPv4 header from 192.0.1.1 to
+# 192.0.2.1.
+# Arguments: $1 - first hex digit of the GRE header (flags)
+#            $2 - hex text inserted as the GRE key, including its trailing
+#                 ":"; empty when the caller passes no second argument
+#
+# Each "$( : text )" is a command substitution that runs the no-op ":" and
+# expands to nothing; it only serves as an inline comment naming the
+# preceding field.
+ipip_payload_get()
+{
+ local flags=$1; shift
+ local key=$1; shift
+
+ p=$(:
+ )"$flags"$( : GRE flags
+ )"0:00:"$( : Reserved + version
+ )"08:00:"$( : ETH protocol type
+ )"$key"$( : Key
+ )"4"$( : IP version
+ )"5:"$( : IHL
+ )"00:"$( : IP TOS
+ )"00:14:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"30:"$( : IP TTL
+ )"01:"$( : IP proto
+ )"E7:E6:"$( : IP header csum
+ )"C0:00:01:01:"$( : IP saddr : 192.0.1.1
+ )"C0:00:02:01:"$( : IP daddr : 192.0.2.1
+ )
+ echo $p
+}
+
+# Send GRE packets that should not match tunnel g1 and check that the
+# decap_error trap is hit and nothing is forwarded out of $swp1.
+# Arguments: $1 - test description for the log
+#            $2 - outer source IP address of the generated packets
+#            remaining - forwarded to ipip_payload_get (GRE flags, then an
+#                        optional key)
+no_matching_tunnel_test()
+{
+ local trap_name="decap_error"
+ local desc=$1; shift
+ local sip=$1; shift
+ local mz_pid
+
+ RET=0
+
+ # Egress filter matching the inner packet; it counts decapsulated
+ # packets that were forwarded towards $h1
+ tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+ flower src_ip 192.0.1.1 dst_ip 192.0.2.1 action pass
+
+ rp1_mac=$(mac_get $rp1)
+ rp2_mac=$(mac_get $rp2)
+ payload=$(ipip_payload_get "$@")
+
+ # Background sender in VRF v$rp2; proto=47 is GRE
+ ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -a $rp2_mac -b $rp1_mac \
+ -A $sip -B 192.0.2.65 -t ip len=48,proto=47,p=$payload -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ # The egress filter must not have seen any packet
+ tc_check_packets "dev $swp1 egress" 101 0
+ check_err $? "Packets were not dropped"
+
+ log_test "$desc"
+
+ # Stop the background sender and reap it, discarding the output of wait
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
+}
+
+decap_error_test()
+{
+ # Correct source IP - the remote address
+ local sip=192.0.2.66
+
+ ecn_decap_test "Decap error" "ECT(1)" 01
+ ecn_decap_test "Decap error" "ECT(0)" 02
+ ecn_decap_test "Decap error" "CE" 03
+
+ no_matching_tunnel_test "Decap error: Source IP check failed" \
+ 192.0.2.68 "0"
+ no_matching_tunnel_test \
+ "Decap error: Key exists but was not expected" $sip "2" ":E9:"
+
+ # Destroy the tunnel and create new one with key
+ __addr_add_del g1 del 192.0.2.65/32
+ tunnel_destroy g1
+
+ tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit key 233
+ __addr_add_del g1 add 192.0.2.65/32
+
+ no_matching_tunnel_test \
+ "Decap error: Key does not exist but was expected" $sip "0"
+ no_matching_tunnel_test \
+ "Decap error: Packet has a wrong key field" $sip "2" "E8:"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
new file mode 100755
index 000000000..10e0f3dbc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/devlink_trap_tunnel_vxlan.sh
@@ -0,0 +1,327 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test devlink-trap tunnel drops and exceptions functionality over mlxsw.
+# Check all traps to make sure they are triggered under the right
+# conditions.
+
+# +--------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/28 |
+# +----|---------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | +--|--------------------------------------------------------------------+ |
+# | | + $swp1 BR1 (802.1d) | |
+# | | | |
+# | | + vx1 (vxlan) | |
+# | | local 192.0.2.17 | |
+# | | id 1000 dstport $VXPORT | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | + $rp1 |
+# | | 192.0.2.17/28 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | VRF2 |
+# | + $rp2 |
+# | 192.0.2.18/28 |
+# | |
+# +-------------------------------------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ decap_error_test
+ overlay_smac_is_mc_test
+"
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+: ${VXPORT:=4789}
+export VXPORT
+
+# Configure host H1: initialize $h1 with 192.0.2.1/28 through simple_if_init.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+# Configure the switch: a VLAN-unaware bridge br1 containing $swp1 and the
+# VXLAN device vx1 (VNI 1000, local 192.0.2.17, UDP port $VXPORT), a clsact
+# qdisc on $swp1 (the tests attach egress filters to it), and the underlay
+# address 192.0.2.17/28 on $rp1.
+switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0
+ # Make sure the bridge uses the MAC address of the local port and not
+ # that of the VxLAN's device.
+ ip link set dev br1 address $(mac_get $swp1)
+ ip link set dev br1 up
+
+ tc qdisc add dev $swp1 clsact
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+
+ ip link add name vx1 type vxlan id 1000 local 192.0.2.17 \
+ dstport "$VXPORT" nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx1 master br1
+ ip link set dev vx1 up
+
+ ip address add dev $rp1 192.0.2.17/28
+ ip link set dev $rp1 up
+}
+
+# Undo switch_create in reverse order: $rp1, vx1, $swp1, then br1.
+switch_destroy()
+{
+ ip link set dev $rp1 down
+ ip address del dev $rp1 192.0.2.17/28
+
+ ip link set dev vx1 down
+ ip link set dev vx1 nomaster
+ ip link del dev vx1
+
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev br1 down
+ ip link del dev br1
+}
+
+# Configure VRF2: initialize $rp2 with 192.0.2.18/28 through simple_if_init.
+# Test traffic is later sent from VRF v$rp2.
+vrf2_create()
+{
+ simple_if_init $rp2 192.0.2.18/28
+}
+
+# Undo vrf2_create.
+vrf2_destroy()
+{
+ simple_if_fini $rp2 192.0.2.18/28
+}
+
+# Map the four test interfaces from NETIFS to their roles (p1 -> $h1,
+# p2 -> $swp1, p3 -> $rp1, p4 -> $rp2) and build the topology.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ rp1=${NETIFS[p3]}
+ rp2=${NETIFS[p4]}
+
+ vrf_prepare
+ forwarding_enable
+ h1_create
+ switch_create
+ vrf2_create
+}
+
+# EXIT handler: tear the topology down in the reverse order of
+# setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ vrf2_destroy
+ switch_destroy
+ h1_destroy
+ forwarding_restore
+ vrf_cleanup
+}
+
+# Print the colon-separated hex payload used by ecn_decap_test: a VXLAN
+# header for VNI 1000 followed by an inner Ethernet frame addressed to the
+# MAC of $h1, carrying an IPv4 header from 192.0.2.3 to 192.0.2.1 with
+# TOS 00.
+#
+# Each "$( : text )" is a command substitution that runs the no-op ":" and
+# expands to nothing; it only serves as an inline comment naming the
+# preceding field.
+ecn_payload_get()
+{
+ dest_mac=$(mac_get $h1)
+ p=$(:
+ )"08:"$( : VXLAN flags
+ )"00:00:00:"$( : VXLAN reserved
+ )"00:03:e8:"$( : VXLAN VNI : 1000
+ )"00:"$( : VXLAN reserved
+ )"$dest_mac:"$( : ETH daddr
+ )"00:00:00:00:00:00:"$( : ETH saddr
+ )"08:00:"$( : ETH type
+ )"45:"$( : IP version + IHL
+ )"00:"$( : IP TOS
+ )"00:14:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"40:"$( : IP TTL
+ )"00:"$( : IP proto
+ )"D6:E5:"$( : IP header csum
+ )"c0:00:02:03:"$( : IP saddr: 192.0.2.3
+ )"c0:00:02:01:"$( : IP daddr: 192.0.2.1
+ )
+ echo $p
+}
+
+# Send VXLAN packets whose outer TOS carries the given ECN value while the
+# inner header (see ecn_payload_get) has TOS 00, and check that the
+# decap_error trap is hit and nothing is forwarded out of $swp1.
+# Arguments: $1 - description prefix for the test log
+#            $2 - name of the outer ECN codepoint, for the test log
+#            $3 - outer TOS value passed to mausezahn
+ecn_decap_test()
+{
+ local trap_name="decap_error"
+ local desc=$1; shift
+ local ecn_desc=$1; shift
+ local outer_tos=$1; shift
+ local mz_pid
+
+ RET=0
+
+ # Egress filter matching the inner packet; it counts decapsulated
+ # packets that were forwarded towards $h1
+ tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+ flower src_ip 192.0.2.3 dst_ip 192.0.2.1 action pass
+
+ rp1_mac=$(mac_get $rp1)
+ payload=$(ecn_payload_get)
+
+ # Background sender in VRF v$rp2, UDP towards the VXLAN port
+ ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac -B 192.0.2.17 \
+ -t udp sp=12345,dp=$VXPORT,tos=$outer_tos,p=$payload -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ # The egress filter must not have seen any packet
+ tc_check_packets "dev $swp1 egress" 101 0
+ check_err $? "Packets were not dropped"
+
+ log_test "$desc: Inner ECN is not ECT and outer is $ecn_desc"
+
+ # Stop the background sender and reap it, discarding the output of wait
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
+}
+
+# Print a colon-separated hex payload like the one from ecn_payload_get, but
+# with a non-zero first VXLAN reserved field (01:00:00) and a zeroed inner
+# IP header checksum.
+#
+# Each "$( : text )" is a command substitution that runs the no-op ":" and
+# expands to nothing; it only serves as an inline comment naming the
+# preceding field.
+reserved_bits_payload_get()
+{
+ dest_mac=$(mac_get $h1)
+ p=$(:
+ )"08:"$( : VXLAN flags
+ )"01:00:00:"$( : VXLAN reserved
+ )"00:03:e8:"$( : VXLAN VNI : 1000
+ )"00:"$( : VXLAN reserved
+ )"$dest_mac:"$( : ETH daddr
+ )"00:00:00:00:00:00:"$( : ETH saddr
+ )"08:00:"$( : ETH type
+ )"45:"$( : IP version + IHL
+ )"00:"$( : IP TOS
+ )"00:14:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"40:"$( : IP TTL
+ )"00:"$( : IP proto
+ )"00:00:"$( : IP header csum
+ )"c0:00:02:03:"$( : IP saddr: 192.0.2.3
+ )"c0:00:02:01:"$( : IP daddr: 192.0.2.1
+ )
+ echo $p
+}
+
+short_payload_get()
+{
+ dest_mac=$(mac_get $h1)
+ p=$(:
+ )"08:"$( : VXLAN flags
+ )"01:00:00:"$( : VXLAN reserved
+ )"00:03:e8:"$( : VXLAN VNI : 1000
+ )"00:"$( : VXLAN reserved
+ )
+ echo $p
+}
+
+# Send corrupted VXLAN packets and check that the decap_error trap is hit
+# and nothing is forwarded out of $swp1.
+# Arguments: $1 - test description for the log
+#            $2 - name of the function that prints the payload to send
+corrupted_packet_test()
+{
+ local trap_name="decap_error"
+ local desc=$1; shift
+ local payload_get=$1; shift
+ local mz_pid
+
+ RET=0
+
+ # When the packet is too short there is no inner packet at all,
+ # so the matching will always succeed
+ tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+ flower skip_hw src_ip 192.0.2.3 dst_ip 192.0.2.1 action pass
+
+ rp1_mac=$(mac_get $rp1)
+ # Call the payload generator whose name was passed in
+ payload=$($payload_get)
+ ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac \
+ -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q &
+ mz_pid=$!
+
+ devlink_trap_exception_test $trap_name
+
+ # The egress filter must not have seen any packet
+ tc_check_packets "dev $swp1 egress" 101 0
+ check_err $? "Packets were not dropped"
+
+ log_test "$desc"
+
+ # Stop the background sender and reap it, discarding the output of wait
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $swp1 egress protocol ip pref 1 handle 101 flower
+}
+
+# Exercise the conditions expected to raise the decap_error trap on the
+# VXLAN tunnel: outer/inner ECN mismatches, reserved VXLAN bits in use, and
+# a packet with no inner L2 header.
+decap_error_test()
+{
+ ecn_decap_test "Decap error" "ECT(1)" 01
+ ecn_decap_test "Decap error" "ECT(0)" 02
+ ecn_decap_test "Decap error" "CE" 03
+
+ corrupted_packet_test "Decap error: Reserved bits in use" \
+ "reserved_bits_payload_get"
+ corrupted_packet_test "Decap error: No L2 header" "short_payload_get"
+}
+
+# Print a colon-separated hex payload: a VXLAN header for VNI 1000 followed
+# by an inner Ethernet frame addressed to the MAC of $h1 whose source MAC is
+# 01:02:03:04:05:06, the address matched by the filter in
+# overlay_smac_is_mc_test.
+#
+# Each "$( : text )" is a command substitution that runs the no-op ":" and
+# expands to nothing; it only serves as an inline comment naming the
+# preceding field.
+mc_smac_payload_get()
+{
+ dest_mac=$(mac_get $h1)
+ source_mac=01:02:03:04:05:06
+ p=$(:
+ )"08:"$( : VXLAN flags
+ )"00:00:00:"$( : VXLAN reserved
+ )"00:03:e8:"$( : VXLAN VNI : 1000
+ )"00:"$( : VXLAN reserved
+ )"$dest_mac:"$( : ETH daddr
+ )"$source_mac:"$( : ETH saddr
+ )"08:00:"$( : ETH type
+ )"45:"$( : IP version + IHL
+ )"00:"$( : IP TOS
+ )"00:14:"$( : IP total length
+ )"00:00:"$( : IP identification
+ )"20:00:"$( : IP flags + frag off
+ )"40:"$( : IP TTL
+ )"00:"$( : IP proto
+ )"00:00:"$( : IP header csum
+ )"c0:00:02:03:"$( : IP saddr: 192.0.2.3
+ )"c0:00:02:01:"$( : IP daddr: 192.0.2.1
+ )
+ echo $p
+}
+
+# Check the overlay_smac_is_mc trap: send VXLAN packets whose inner source
+# MAC is 01:02:03:04:05:06 (see mc_smac_payload_get) and hand the checking
+# over to devlink_trap_drop_test.
+overlay_smac_is_mc_test()
+{
+ local trap_name="overlay_smac_is_mc"
+ local mz_pid
+
+ RET=0
+
+ # The matching will be checked on devlink_trap_drop_test()
+ # and the filter will be removed on devlink_trap_drop_cleanup()
+ tc filter add dev $swp1 egress protocol ip pref 1 handle 101 \
+ flower src_mac 01:02:03:04:05:06 action pass
+
+ rp1_mac=$(mac_get $rp1)
+ payload=$(mc_smac_payload_get)
+
+ # Background sender in VRF v$rp2, UDP towards the VXLAN port
+ ip vrf exec v$rp2 $MZ $rp2 -c 0 -d 1msec -b $rp1_mac \
+ -B 192.0.2.17 -t udp sp=12345,dp=$VXPORT,p=$payload -q &
+ mz_pid=$!
+
+ devlink_trap_drop_test $trap_name $swp1 101
+
+ log_test "Overlay source MAC is multicast"
+
+ # The sender's PID is passed on so the cleanup helper can deal with it
+ devlink_trap_drop_cleanup $mz_pid $swp1 "ip" 1 101
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/extack.sh b/tools/testing/selftests/drivers/net/mlxsw/extack.sh
new file mode 100755
index 000000000..7a0a99c1d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/extack.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test operations that we expect to report extended ack.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ netdev_pre_up_test
+ vxlan_vlan_add_test
+ vxlan_bridge_create_test
+ bridge_create_test
+"
+NUM_NETIFS=2
+source $lib_dir/lib.sh
+
+# Map the two test interfaces from NETIFS to $swp1 and $swp2 and bring both
+# up.
+setup_prepare()
+{
+ swp1=${NETIFS[p1]}
+ swp2=${NETIFS[p2]}
+
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+}
+
+# EXIT handler: set both ports down, in the reverse order of setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+}
+
+# Check that bringing up a VXLAN device with an unsupported configuration is
+# rejected with an error message mentioning mlxsw_spectrum. Two bridges, each
+# with a VXLAN device and a switch port, are created; vx2 is then given a
+# TTL that differs from the one on vx1 and set up again.
+netdev_pre_up_test()
+{
+ RET=0
+
+ ip link add name br1 up type bridge vlan_filtering 0 mcast_snooping 0
+ ip link add name vx1 up type vxlan id 1000 \
+ local 192.0.2.17 remote 192.0.2.18 \
+ dstport 4789 nolearning noudpcsum tos inherit ttl 100
+
+ ip link set dev vx1 master br1
+ check_err $?
+
+ ip link set dev $swp1 master br1
+ check_err $?
+
+ ip link add name br2 up type bridge vlan_filtering 0 mcast_snooping 0
+ ip link add name vx2 up type vxlan id 2000 \
+ local 192.0.2.17 remote 192.0.2.18 \
+ dstport 4789 nolearning noudpcsum tos inherit ttl 100
+
+ ip link set dev vx2 master br2
+ check_err $?
+
+ ip link set dev $swp2 master br2
+ check_err $?
+
+ # Unsupported configuration: mlxsw demands that all offloaded VXLAN
+ # devices have the same TTL.
+ ip link set dev vx2 down
+ ip link set dev vx2 type vxlan ttl 200
+
+ # First attempt: only the failure itself is checked
+ ip link set dev vx2 up &>/dev/null
+ check_fail $?
+
+ # Second attempt: stderr is piped to grep while stdout is discarded, so
+ # the check passes only if the error text mentions mlxsw_spectrum
+ ip link set dev vx2 up 2>&1 >/dev/null | grep -q mlxsw_spectrum
+ check_err $?
+
+ log_test "extack - NETDEV_PRE_UP"
+
+ ip link del dev vx2
+ ip link del dev br2
+
+ ip link del dev vx1
+ ip link del dev br1
+}
+
+# Check that mapping a VLAN as PVID/untagged on a VXLAN device with an
+# unsupported configuration (created without "noudpcsum") produces an error
+# message mentioning mlxsw_spectrum once a switch port shares the
+# VLAN-aware bridge.
+vxlan_vlan_add_test()
+{
+ RET=0
+
+ ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0
+
+ # Unsupported configuration: mlxsw demands VXLAN with "noudpcsum".
+ ip link add name vx1 up type vxlan id 1000 \
+ local 192.0.2.17 remote 192.0.2.18 \
+ dstport 4789 tos inherit ttl 100
+
+ ip link set dev vx1 master br1
+ check_err $?
+
+ bridge vlan add dev vx1 vid 1
+ check_err $?
+
+ ip link set dev $swp1 master br1
+ check_err $?
+
+ # stderr is piped to grep while stdout is discarded, so the check
+ # passes only if the error text mentions mlxsw_spectrum
+ bridge vlan add dev vx1 vid 1 pvid untagged 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $?
+
+ log_test "extack - map VLAN at VXLAN device"
+
+ ip link del dev vx1
+ ip link del dev br1
+}
+
+# Check that enslaving a switch port to a bridge that already contains an
+# unsupported VXLAN device (created without "noudpcsum") produces an error
+# message mentioning mlxsw_spectrum, for both a VLAN-aware and a
+# VLAN-unaware bridge.
+vxlan_bridge_create_test()
+{
+ RET=0
+
+ # Unsupported configuration: mlxsw demands VXLAN with "noudpcsum".
+ ip link add name vx1 up type vxlan id 1000 \
+ local 192.0.2.17 remote 192.0.2.18 \
+ dstport 4789 tos inherit ttl 100
+
+ # Test with VLAN-aware bridge.
+ ip link add name br1 up type bridge vlan_filtering 1 mcast_snooping 0
+
+ ip link set dev vx1 master br1
+
+ # stderr is piped to grep while stdout is discarded, so the check
+ # passes only if the error text mentions mlxsw_spectrum
+ ip link set dev $swp1 master br1 2>&1 > /dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $?
+
+ # Test with VLAN-unaware bridge.
+ ip link set dev br1 type bridge vlan_filtering 0
+
+ ip link set dev $swp1 master br1 2>&1 > /dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $?
+
+ log_test "extack - bridge creation with VXLAN"
+
+ ip link del dev br1
+ ip link del dev vx1
+}
+
+# Check that enslaving a switch port to a second VLAN-aware bridge produces
+# an error message mentioning mlxsw_spectrum.
+bridge_create_test()
+{
+ RET=0
+
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link add name br2 up type bridge vlan_filtering 1
+
+ ip link set dev $swp1 master br1
+ check_err $?
+
+ # Only one VLAN-aware bridge is supported, so this should fail with
+ # an extack.
+ # stderr is piped to grep while stdout is discarded.
+ ip link set dev $swp2 master br2 2>&1 > /dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $?
+
+ log_test "extack - multiple VLAN-aware bridges creation"
+
+ ip link del dev br2
+ ip link del dev br1
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/fib.sh b/tools/testing/selftests/drivers/net/mlxsw/fib.sh
new file mode 100755
index 000000000..eab79b9e5
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/fib.sh
@@ -0,0 +1,256 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test is for checking the FIB offload API on top of mlxsw.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ ipv4_identical_routes
+ ipv4_tos
+ ipv4_metric
+ ipv4_replace
+ ipv4_delete
+ ipv4_plen
+ ipv4_replay
+ ipv4_flush
+ ipv4_local_replace
+ ipv6_add
+ ipv6_metric
+ ipv6_append_single
+ ipv6_replace_single
+ ipv6_metric_multipath
+ ipv6_append_multipath
+ ipv6_replace_multipath
+ ipv6_append_multipath_to_single
+ ipv6_delete_single
+ ipv6_delete_multipath
+ ipv6_replay_single
+ ipv6_replay_multipath
+ ipv6_local_replace
+"
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source $lib_dir/fib_offload_lib.sh
+
+# Run the shared fib_ipv4_identical_routes_test in namespace "testns1".
+ipv4_identical_routes()
+{
+ fib_ipv4_identical_routes_test "testns1"
+}
+
+# Run the shared fib_ipv4_tos_test in namespace "testns1".
+ipv4_tos()
+{
+ fib_ipv4_tos_test "testns1"
+}
+
+# Run the shared fib_ipv4_metric_test in namespace "testns1".
+ipv4_metric()
+{
+ fib_ipv4_metric_test "testns1"
+}
+
+# Run the shared fib_ipv4_replace_test in namespace "testns1".
+ipv4_replace()
+{
+ fib_ipv4_replace_test "testns1"
+}
+
+# Run the shared fib_ipv4_delete_test in namespace "testns1".
+ipv4_delete()
+{
+ fib_ipv4_delete_test "testns1"
+}
+
+# Run the shared fib_ipv4_plen_test in namespace "testns1".
+ipv4_plen()
+{
+ fib_ipv4_plen_test "testns1"
+}
+
+# Run the shared fib_ipv4_replay_metric_test in namespace "testns1" against
+# the devlink device $DEVLINK_DEV.
+ipv4_replay_metric()
+{
+ fib_ipv4_replay_metric_test "testns1" "$DEVLINK_DEV"
+}
+
+# Run the shared fib_ipv4_replay_tos_test in namespace "testns1" against the
+# devlink device $DEVLINK_DEV.
+ipv4_replay_tos()
+{
+ fib_ipv4_replay_tos_test "testns1" "$DEVLINK_DEV"
+}
+
+# Run the shared fib_ipv4_replay_plen_test in namespace "testns1" against
+# the devlink device $DEVLINK_DEV.
+ipv4_replay_plen()
+{
+ fib_ipv4_replay_plen_test "testns1" "$DEVLINK_DEV"
+}
+
+# Run the three IPv4 replay tests (metric, TOS, prefix length) in sequence.
+ipv4_replay()
+{
+ ipv4_replay_metric
+ ipv4_replay_tos
+ ipv4_replay_plen
+}
+
+# Run the shared fib_ipv4_flush_test in namespace "testns1".
+ipv4_flush()
+{
+ fib_ipv4_flush_test "testns1"
+}
+
+# Check how identical IPv4 /32 routes in the local and main tables interact
+# with respect to hardware programming, using a dummy device in namespace
+# "testns1". The route state is verified with fib4_trap_check.
+# NOTE(review): judging by the messages, the last argument of
+# fib4_trap_check says whether the check is expected to fail; confirm in
+# fib_offload_lib.sh.
+ipv4_local_replace()
+{
+ local ns="testns1"
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # A route added to the main table must not replace an identical route
+ # that already exists in the local table.
+ ip -n $ns route add table local 192.0.2.1/32 dev dummy1
+ fib4_trap_check $ns "table local 192.0.2.1/32 dev dummy1" false
+ check_err $? "Local table route not in hardware when should"
+
+ ip -n $ns route add table main 192.0.2.1/32 dev dummy1
+ fib4_trap_check $ns "table main 192.0.2.1/32 dev dummy1" true
+ check_err $? "Main table route in hardware when should not"
+
+ fib4_trap_check $ns "table local 192.0.2.1/32 dev dummy1" false
+ check_err $? "Local table route was replaced when should not"
+
+ # Test that local routes can replace routes in main table.
+ ip -n $ns route add table main 192.0.2.2/32 dev dummy1
+ fib4_trap_check $ns "table main 192.0.2.2/32 dev dummy1" false
+ check_err $? "Main table route not in hardware when should"
+
+ ip -n $ns route add table local 192.0.2.2/32 dev dummy1
+ fib4_trap_check $ns "table local 192.0.2.2/32 dev dummy1" false
+ check_err $? "Local table route did not replace route in main table when should"
+
+ fib4_trap_check $ns "table main 192.0.2.2/32 dev dummy1" true
+ check_err $? "Main table route was not replaced when should"
+
+ log_test "IPv4 local table route replacement"
+
+ # Deleting the dummy device also removes the routes that use it
+ ip -n $ns link del dev dummy1
+}
+
+# Run the shared fib_ipv6_add_test in namespace "testns1".
+ipv6_add()
+{
+ fib_ipv6_add_test "testns1"
+}
+
+# Run the shared fib_ipv6_metric_test in namespace "testns1".
+ipv6_metric()
+{
+ fib_ipv6_metric_test "testns1"
+}
+
+# Run the shared fib_ipv6_append_single_test in namespace "testns1".
+ipv6_append_single()
+{
+ fib_ipv6_append_single_test "testns1"
+}
+
+# Run the shared fib_ipv6_replace_single_test in namespace "testns1".
+ipv6_replace_single()
+{
+ fib_ipv6_replace_single_test "testns1"
+}
+
+# Run the shared fib_ipv6_metric_multipath_test in namespace "testns1".
+ipv6_metric_multipath()
+{
+ fib_ipv6_metric_multipath_test "testns1"
+}
+
+# Run the shared fib_ipv6_append_multipath_test in namespace "testns1".
+ipv6_append_multipath()
+{
+ fib_ipv6_append_multipath_test "testns1"
+}
+
+# Run the shared fib_ipv6_replace_multipath_test in namespace "testns1".
+ipv6_replace_multipath()
+{
+ fib_ipv6_replace_multipath_test "testns1"
+}
+
+# Run the shared fib_ipv6_append_multipath_to_single_test in namespace
+# "testns1".
+ipv6_append_multipath_to_single()
+{
+ fib_ipv6_append_multipath_to_single_test "testns1"
+}
+
+# Run the shared fib_ipv6_delete_single_test in namespace "testns1".
+ipv6_delete_single()
+{
+ fib_ipv6_delete_single_test "testns1"
+}
+
+# Run the shared fib_ipv6_delete_multipath_test in namespace "testns1".
+ipv6_delete_multipath()
+{
+ fib_ipv6_delete_multipath_test "testns1"
+}
+
+# Run the shared fib_ipv6_replay_single_test in namespace "testns1" against
+# the devlink device $DEVLINK_DEV.
+ipv6_replay_single()
+{
+ fib_ipv6_replay_single_test "testns1" "$DEVLINK_DEV"
+}
+
+# Run the shared fib_ipv6_replay_multipath_test in namespace "testns1"
+# against the devlink device $DEVLINK_DEV.
+ipv6_replay_multipath()
+{
+ fib_ipv6_replay_multipath_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv6_local_replace()
+{
+ local ns="testns1"
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add table local 2001:db8:1::1/128 dev dummy1
+ fib6_trap_check $ns "table local 2001:db8:1::1/128 dev dummy1" false
+ check_err $? "Local table route not in hardware when should"
+
+ ip -n $ns route add table main 2001:db8:1::1/128 dev dummy1
+ fib6_trap_check $ns "table main 2001:db8:1::1/128 dev dummy1" true
+ check_err $? "Main table route in hardware when should not"
+
+ fib6_trap_check $ns "table local 2001:db8:1::1/128 dev dummy1" false
+ check_err $? "Local table route was replaced when should not"
+
+ # Test that local routes can replace routes in main table.
+ ip -n $ns route add table main 2001:db8:1::2/128 dev dummy1
+ fib6_trap_check $ns "table main 2001:db8:1::2/128 dev dummy1" false
+ check_err $? "Main table route not in hardware when should"
+
+ ip -n $ns route add table local 2001:db8:1::2/128 dev dummy1
+ fib6_trap_check $ns "table local 2001:db8:1::2/128 dev dummy1" false
+ check_err $? "Local route route did not replace route in main table when should"
+
+ fib6_trap_check $ns "table main 2001:db8:1::2/128 dev dummy1" true
+ check_err $? "Main table route was not replaced when should"
+
+ log_test "IPv6 local table route replacement"
+
+ ip -n $ns link del dev dummy1
+}
+
+# Create network namespace "testns1" and reload the devlink device into it.
+# The script exits with status 1 as soon as either step fails.
+setup_prepare()
+{
+ if ! ip netns add testns1; then
+     echo "Failed to add netns \"testns1\""
+     exit 1
+ fi
+
+ if ! devlink dev reload $DEVLINK_DEV netns testns1; then
+     echo "Failed to reload into netns \"testns1\""
+     exit 1
+ fi
+}
+
+# EXIT handler: reload the devlink device from "testns1" back into the
+# network namespace of this shell (identified by its PID, $$), then delete
+# "testns1".
+cleanup()
+{
+ pre_cleanup
+ devlink -N testns1 dev reload $DEVLINK_DEV netns $$
+ ip netns del testns1
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/fib_offload.sh b/tools/testing/selftests/drivers/net/mlxsw/fib_offload.sh
new file mode 100755
index 000000000..e99ae500f
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/fib_offload.sh
@@ -0,0 +1,349 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test unicast FIB offload indication.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ ipv6_route_add
+ ipv6_route_replace
+ ipv6_route_nexthop_group_share
+ ipv6_route_rate
+"
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Initialize $tor1_p1 with two /128 addresses from 2001:db8:1::/64. The tests
+# in this file use these addresses as nexthop gateways via $spine_p1.
+tor1_create()
+{
+ simple_if_init $tor1_p1 2001:db8:1::2/128 2001:db8:1::3/128
+}
+
+# Counterpart of tor1_create(): tear down $tor1_p1 with the same addresses.
+tor1_destroy()
+{
+ simple_if_fini $tor1_p1 2001:db8:1::2/128 2001:db8:1::3/128
+}
+
+# Initialize $tor2_p1 with two /128 addresses from 2001:db8:2::/64. The tests
+# in this file use these addresses as nexthop gateways via $spine_p2.
+tor2_create()
+{
+ simple_if_init $tor2_p1 2001:db8:2::2/128 2001:db8:2::3/128
+}
+
+# Counterpart of tor2_create(): tear down $tor2_p1 with the same addresses.
+tor2_destroy()
+{
+ simple_if_fini $tor2_p1 2001:db8:2::2/128 2001:db8:2::3/128
+}
+
+# Bring up both spine ports and give each a /64 address in the subnet of the
+# ToR it faces ($spine_p1: 2001:db8:1::/64, $spine_p2: 2001:db8:2::/64).
+spine_create()
+{
+ ip link set dev $spine_p1 up
+ ip link set dev $spine_p2 up
+
+ __addr_add_del $spine_p1 add 2001:db8:1::1/64
+ __addr_add_del $spine_p2 add 2001:db8:2::1/64
+}
+
+# Counterpart of spine_create(): remove the spine addresses and bring the
+# ports down, in the reverse order of creation.
+spine_destroy()
+{
+ __addr_add_del $spine_p2 del 2001:db8:2::1/64
+ __addr_add_del $spine_p1 del 2001:db8:1::1/64
+
+ ip link set dev $spine_p2 down
+ ip link set dev $spine_p1 down
+}
+
+# Count the lines of "ip -6 route show match <selector>" that contain
+# "offload" and compare the count against an expectation.
+#
+# $1 - route selector handed to "ip -6 route show match"
+# $2 - expected number of lines containing "offload"
+#
+# Returns 0 when the count matches, 1 otherwise.
+ipv6_offload_check()
+{
+	local selector="$1"; shift
+	local want=$1; shift
+	local offloaded
+
+	# Try to avoid races with route offload
+	sleep .1
+
+	offloaded=$(ip -6 route show match ${selector} | grep -c "offload")
+
+	[ $offloaded -eq $want ] || return 1
+	return 0
+}
+
+ipv6_route_add_prefix()
+{
+ RET=0
+
+ # Add a prefix route and check that it is offloaded.
+ ip -6 route add 2001:db8:3::/64 dev $spine_p1 metric 100
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 100" 1
+ check_err $? "prefix route not offloaded"
+
+ # Append an identical prefix route with an higher metric and check that
+ # offload indication did not change.
+ ip -6 route append 2001:db8:3::/64 dev $spine_p1 metric 200
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 100" 1
+ check_err $? "lowest metric not offloaded after append"
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 200" 0
+ check_err $? "highest metric offloaded when should not"
+
+ # Prepend an identical prefix route with lower metric and check that
+ # it is offloaded and the others are not.
+ ip -6 route append 2001:db8:3::/64 dev $spine_p1 metric 10
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 10" 1
+ check_err $? "lowest metric not offloaded after prepend"
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 100" 0
+ check_err $? "mid metric offloaded when should not"
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p1 metric 200" 0
+ check_err $? "highest metric offloaded when should not"
+
+ # Delete the routes and add the same route with a different nexthop
+ # device. Check that it is offloaded.
+ ip -6 route flush 2001:db8:3::/64 dev $spine_p1
+ ip -6 route add 2001:db8:3::/64 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 dev $spine_p2" 1
+
+ log_test "IPv6 prefix route add"
+
+ ip -6 route flush 2001:db8:3::/64
+}
+
+# Check the offload indication of IPv6 multipath routes for 2001:db8:3::/64.
+# The expected count passed to ipv6_offload_check is the number of nexthops
+# expected to be flagged as offloaded. Logs one test result and flushes the
+# prefix before returning.
+ipv6_route_add_mpath()
+{
+ RET=0
+
+ # Add a multipath route and check that it is offloaded.
+ ip -6 route add 2001:db8:3::/64 metric 100 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded when should"
+
+ # Append another nexthop and check that it is offloaded as well.
+ ip -6 route append 2001:db8:3::/64 metric 100 \
+ nexthop via 2001:db8:1::3 dev $spine_p1
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 3
+ check_err $? "appended nexthop not offloaded when should"
+
+ # Mimic route replace by removing the route and adding it back with
+ # only two nexthops.
+ ip -6 route del 2001:db8:3::/64
+ ip -6 route add 2001:db8:3::/64 metric 100 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded after delete & add"
+
+ # Append a nexthop with a higher metric and check that the offload
+ # indication did not change.
+ ip -6 route append 2001:db8:3::/64 metric 200 \
+ nexthop via 2001:db8:1::3 dev $spine_p1
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "lowest metric not offloaded after append"
+ ipv6_offload_check "2001:db8:3::/64 metric 200" 0
+ check_err $? "highest metric offloaded when should not"
+
+ # Prepend a nexthop with a lower metric and check that it is offloaded
+ # and the others are not.
+ ip -6 route append 2001:db8:3::/64 metric 10 \
+ nexthop via 2001:db8:1::3 dev $spine_p1
+ ipv6_offload_check "2001:db8:3::/64 metric 10" 1
+ check_err $? "lowest metric not offloaded after prepend"
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 0
+ check_err $? "mid metric offloaded when should not"
+ ipv6_offload_check "2001:db8:3::/64 metric 200" 0
+ check_err $? "highest metric offloaded when should not"
+
+ log_test "IPv6 multipath route add"
+
+ ip -6 route flush 2001:db8:3::/64
+}
+
+# ALL_TESTS entry point: run the prefix-route and multipath-route add tests.
+# Each sub-test resets RET and logs its own result.
+ipv6_route_add()
+{
+ ipv6_route_add_prefix
+ ipv6_route_add_mpath
+}
+
+# Check the offload indication of 2001:db8:3::/64 across "ip route replace"
+# operations between prefix and multipath routes. Logs one test result and
+# flushes the prefix before returning.
+ipv6_route_replace()
+{
+ RET=0
+
+ # Replace prefix route with prefix route.
+ ip -6 route add 2001:db8:3::/64 metric 100 dev $spine_p1
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 1
+ check_err $? "prefix route not offloaded when should"
+ ip -6 route replace 2001:db8:3::/64 metric 100 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 1
+ check_err $? "prefix route not offloaded after replace"
+
+ # Replace prefix route with multipath route.
+ ip -6 route replace 2001:db8:3::/64 metric 100 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded after replace"
+
+ # Replace multipath route with prefix route. A prefix route cannot
+ # replace a multipath route, so it is appended.
+ ip -6 route replace 2001:db8:3::/64 metric 100 dev $spine_p1
+ ipv6_offload_check "2001:db8:3::/64 metric 100 dev $spine_p1" 0
+ check_err $? "prefix route offloaded after 'replacing' multipath route"
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded after being 'replaced' by prefix route"
+
+ # Replace multipath route with multipath route.
+ ip -6 route replace 2001:db8:3::/64 metric 100 \
+ nexthop via 2001:db8:1::3 dev $spine_p1 \
+ nexthop via 2001:db8:2::3 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded after replacing multipath route"
+
+ # Replace a non-existing multipath route with a multipath route and
+ # check that it is appended and not offloaded.
+ ip -6 route replace 2001:db8:3::/64 metric 200 \
+ nexthop via 2001:db8:1::3 dev $spine_p1 \
+ nexthop via 2001:db8:2::3 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64 metric 100" 2
+ check_err $? "multipath route not offloaded after non-existing route was 'replaced'"
+ ipv6_offload_check "2001:db8:3::/64 metric 200" 0
+ check_err $? "multipath route offloaded after 'replacing' non-existing route"
+
+ log_test "IPv6 route replace"
+
+ ip -6 route flush 2001:db8:3::/64
+}
+
+# Check that two multipath routes (2001:db8:3::/64 and 2001:db8:4::/64) with
+# identical nexthops keep their offload indication when one of them is deleted
+# or loses a nexthop. Logs one test result and flushes both prefixes.
+ipv6_route_nexthop_group_share()
+{
+ RET=0
+
+ # The driver consolidates identical nexthop groups in order to reduce
+ # the resource usage in its adjacency table. Check that the deletion
+ # of one multipath route using the group does not affect the other.
+ ip -6 route add 2001:db8:3::/64 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ ip -6 route add 2001:db8:4::/64 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ ipv6_offload_check "2001:db8:3::/64" 2
+ check_err $? "multipath route not offloaded when should"
+ ipv6_offload_check "2001:db8:4::/64" 2
+ check_err $? "multipath route not offloaded when should"
+ ip -6 route del 2001:db8:3::/64
+ ipv6_offload_check "2001:db8:4::/64" 2
+ check_err $? "multipath route not offloaded after deletion of route sharing the nexthop group"
+
+ # Check that after unsharing a nexthop group the routes are still
+ # marked as offloaded.
+ ip -6 route add 2001:db8:3::/64 \
+ nexthop via 2001:db8:1::2 dev $spine_p1 \
+ nexthop via 2001:db8:2::2 dev $spine_p2
+ # Removing a single nexthop turns 2001:db8:4::/64 into a one-nexthop
+ # route, so the two routes no longer have identical nexthops.
+ ip -6 route del 2001:db8:4::/64 \
+ nexthop via 2001:db8:1::2 dev $spine_p1
+ ipv6_offload_check "2001:db8:4::/64" 1
+ check_err $? "singlepath route not offloaded after unsharing the nexthop group"
+ ipv6_offload_check "2001:db8:3::/64" 2
+ check_err $? "multipath route not offloaded after unsharing the nexthop group"
+
+ log_test "IPv6 nexthop group sharing"
+
+ ip -6 route flush 2001:db8:3::/64
+ ip -6 route flush 2001:db8:4::/64
+}
+
+# Measure how long it takes to insert 40K /64 multipath routes with 16
+# nexthops each and to see all of their nexthops flagged as offloaded. The
+# test fails when this takes more than 60 seconds. The routes and the helper
+# addresses on $tor1_p1 are removed again before returning.
+ipv6_route_rate()
+{
+	local batch_dir=$(mktemp -d)
+	local num_rts=$((40 * 1024))
+	local num_nhs=16
+	local nexthops=""
+	local deadline
+	local total
+	local count
+	local start
+	local diff
+	local end
+	local i
+
+	RET=0
+
+	# Prepare 40K /64 multipath routes with 16 nexthops each and check how
+	# long it takes to add them. A limit of 60 seconds is set. It is much
+	# higher than insertion should take and meant to flag a serious
+	# regression.
+	# Every nexthop of every route is expected to carry the "offload"
+	# flag, hence the product.
+	total=$((num_nhs * num_rts))
+
+	for i in $(seq 1 $num_nhs); do
+		ip -6 address add 2001:db8:1::10:$i/128 dev $tor1_p1
+		nexthops+=" nexthop via 2001:db8:1::10:$i dev $spine_p1"
+	done
+
+	for i in $(seq 1 $num_rts); do
+		echo "route add 2001:db8:8:$(printf "%x" $i)::/64$nexthops" \
+			>> $batch_dir/add.batch
+		echo "route del 2001:db8:8:$(printf "%x" $i)::/64$nexthops" \
+			>> $batch_dir/del.batch
+	done
+
+	start=$(date +%s.%N)
+	# SECONDS has one-second granularity; 61 ticks guarantee that more
+	# than 60 seconds really elapsed when the wait below gives up, so the
+	# "took too long" check then fails instead of the test hanging.
+	deadline=$((SECONDS + 61))
+
+	ip -batch $batch_dir/add.batch
+	count=$(ip -6 route show | grep offload | wc -l)
+	while [ $count -lt $total ] && [ $SECONDS -lt $deadline ]; do
+		sleep .01
+		count=$(ip -6 route show | grep offload | wc -l)
+	done
+
+	end=$(date +%s.%N)
+
+	diff=$(echo "$end - $start" | bc -l)
+	test "$(echo "$diff > 60" | bc -l)" -eq 0
+	check_err $? "route insertion took too long"
+	log_info "inserted $num_rts routes in $diff seconds"
+
+	log_test "IPv6 routes insertion rate"
+
+	ip -batch $batch_dir/del.batch
+	for i in $(seq 1 $num_nhs); do
+		ip -6 address del 2001:db8:1::10:$i/128 dev $tor1_p1
+	done
+	rm -rf $batch_dir
+}
+
+# Map the four test interfaces from NETIFS onto their topology roles
+# (p1/p3 are spine ports, p2/p4 are the ToR ports) and build the topology.
+setup_prepare()
+{
+ spine_p1=${NETIFS[p1]}
+ tor1_p1=${NETIFS[p2]}
+
+ spine_p2=${NETIFS[p3]}
+ tor2_p1=${NETIFS[p4]}
+
+ vrf_prepare
+ forwarding_enable
+
+ tor1_create
+ tor2_create
+ spine_create
+}
+
+# Tear the topology down in the reverse order of setup_prepare().
+cleanup()
+{
+ pre_cleanup
+
+ spine_destroy
+ tor2_destroy
+ tor1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh
new file mode 100755
index 000000000..76f1ab489
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre.sh
@@ -0,0 +1,217 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# ../../../net/forwarding/mirror_gre_topo_lib.sh for more details.
+#
+# Test various features of offloading gretap mirrors that are specific to
+# mlxsw.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=6
+source $lib_dir/lib.sh
+source $lib_dir/mirror_lib.sh
+source $lib_dir/mirror_gre_lib.sh
+source $lib_dir/mirror_gre_topo_lib.sh
+
+# Create a pair of ip6gretap tunnels that use GRE key 1234 (gt6-key and its
+# peer h3-gt6-key inside VRF v$h3), attach a matchall sink to the peer, and
+# add the 2001:db8:3::/64 underlay addresses on $swp3 and $h3.
+setup_keyful()
+{
+ tunnel_create gt6-key ip6gretap 2001:db8:3::1 2001:db8:3::2 \
+ ttl 100 tos inherit allow-localremote \
+ key 1234
+
+ tunnel_create h3-gt6-key ip6gretap 2001:db8:3::2 2001:db8:3::1 \
+ key 1234
+ ip link set h3-gt6-key vrf v$h3
+ matchall_sink_create h3-gt6-key
+
+ ip address add dev $swp3 2001:db8:3::1/64
+ ip address add dev $h3 2001:db8:3::2/64
+}
+
+# Counterpart of setup_keyful(): remove the underlay addresses and destroy
+# both keyed tunnels, in the reverse order of creation.
+cleanup_keyful()
+{
+ ip address del dev $h3 2001:db8:3::2/64
+ ip address del dev $swp3 2001:db8:3::1/64
+
+ tunnel_destroy h3-gt6-key
+ tunnel_destroy gt6-key
+}
+
+# Create the gt6-soft / h3-gt6-soft ip6gretap pair whose underlay
+# (2001:db8:4::/64) runs over a veth pair v1/v2 instead of a switch port;
+# v2 and the peer tunnel are placed in VRF v$h3.
+setup_soft()
+{
+ # Set up a topology for testing underlay routes that point at an
+ # unsupported soft device.
+
+ tunnel_create gt6-soft ip6gretap 2001:db8:4::1 2001:db8:4::2 \
+ ttl 100 tos inherit allow-localremote
+
+ tunnel_create h3-gt6-soft ip6gretap 2001:db8:4::2 2001:db8:4::1
+ ip link set h3-gt6-soft vrf v$h3
+ matchall_sink_create h3-gt6-soft
+
+ ip link add name v1 type veth peer name v2
+ ip link set dev v1 up
+ ip address add dev v1 2001:db8:4::1/64
+
+ ip link set dev v2 vrf v$h3
+ ip link set dev v2 up
+ ip address add dev v2 2001:db8:4::2/64
+}
+
+# Counterpart of setup_soft(). Only v1 is deleted explicitly; v2 is its veth
+# peer, created by the same "ip link add" in setup_soft().
+cleanup_soft()
+{
+ ip link del dev v1
+
+ tunnel_destroy h3-gt6-soft
+ tunnel_destroy gt6-soft
+}
+
+# Map the six test interfaces from NETIFS onto their roles, create the common
+# mirror-to-gretap topology, add IPv6 and IPv4 underlay addresses on
+# $swp3/$h3, and set up the keyed and soft-underlay tunnels.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip address add dev $swp3 2001:db8:2::1/64
+ ip address add dev $h3 2001:db8:2::2/64
+
+ ip address add dev $swp3 192.0.2.129/28
+ ip address add dev $h3 192.0.2.130/28
+
+ setup_keyful
+ setup_soft
+}
+
+# Undo setup_prepare(): remove the extra tunnels, the underlay addresses and
+# finally the common topology and VRFs.
+cleanup()
+{
+ pre_cleanup
+
+ cleanup_soft
+ cleanup_keyful
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $swp3 2001:db8:2::1/64
+
+ ip address del dev $h3 192.0.2.130/28
+ ip address del dev $swp3 192.0.2.129/28
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+# Check that a mirror to a tunnel configured with "ttl inherit" does not work
+# (fail_test_span_gre_dir), and works again once the TTL is set back to 100.
+#
+# $1 - tunnel device
+# $2 - tunnel link type (e.g. gretap, ip6gretap)
+# $3 - description used in the test log
+#
+# Uses the global $tcflags for the matchall filter.
+test_span_gre_ttl_inherit()
+{
+ local tundev=$1; shift
+ local type=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip link set dev $tundev type $type ttl inherit
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+
+ ip link set dev $tundev type $type ttl 100
+
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: no offload on TTL of inherit ($tcflags)"
+}
+
+# Check that a mirror to a tunnel configured with a fixed TOS (0x10) does not
+# work (fail_test_span_gre_dir), and works again once TOS is set back to
+# "inherit".
+#
+# $1 - tunnel device
+# $2 - tunnel link type (e.g. gretap, ip6gretap)
+# $3 - description used in the test log
+#
+# Uses the global $tcflags for the matchall filter.
+test_span_gre_tos_fixed()
+{
+ local tundev=$1; shift
+ local type=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip link set dev $tundev type $type tos 0x10
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+
+ ip link set dev $tundev type $type tos inherit
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: no offload on a fixed TOS ($tcflags)"
+}
+
+# Install an ingress mirror from $swp1 to a tunnel and check that mirroring
+# either fails or works, depending on the expectation.
+#
+# $1 - non-zero when mirroring is expected to fail
+# $2 - tunnel device
+# $3 - description used in the test log
+test_span_failable()
+{
+ local should_fail=$1; shift
+ local tundev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ if ((should_fail)); then
+ fail_test_span_gre_dir $tundev ingress
+ else
+ quick_test_span_gre_dir $tundev ingress
+ fi
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: should_fail=$should_fail ($tcflags)"
+}
+
+# Run test_span_failable against the keyed tunnel and the soft-underlay
+# tunnel.
+#
+# $1 - non-zero when mirroring is expected to fail
+test_failable()
+{
+ local should_fail=$1; shift
+
+ test_span_failable $should_fail gt6-key "mirror to keyful gretap"
+ test_span_failable $should_fail gt6-soft "mirror to gretap w/ soft underlay"
+}
+
+# Software pass: with slow-path traps installed on $swp1, mirroring to the
+# keyed and soft-underlay tunnels is expected to work (should_fail=0).
+test_sw()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ test_failable 0
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+# Hardware pass: mirroring to the keyed and soft-underlay tunnels is expected
+# to fail (should_fail=1), as is mirroring to gt4/gt6 while they use a fixed
+# TOS or an inherited TTL.
+test_hw()
+{
+ test_failable 1
+
+ test_span_gre_tos_fixed gt4 gretap "mirror to gretap"
+ test_span_gre_tos_fixed gt6 ip6gretap "mirror to ip6gretap"
+
+ test_span_gre_ttl_inherit gt4 gretap "mirror to gretap"
+ test_span_gre_ttl_inherit gt6 ip6gretap "mirror to ip6gretap"
+}
+
+# Run cleanup() whenever the script exits.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# Without tc offload support nothing here can be tested: log a failed result
+# and stop. NOTE(review): the bare "exit" returns the status of log_test, not
+# $EXIT_STATUS -- confirm this is intended.
+if ! tc_offload_check; then
+ check_err 1 "Could not test offloaded functionality"
+ log_test "mlxsw-specific tests for mirror to gretap"
+ exit
+fi
+
+# First pass: filters installed in software only.
+tcflags="skip_hw"
+test_sw
+
+# Second pass: filters installed in hardware only.
+tcflags="skip_sw"
+test_hw
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
new file mode 100644
index 000000000..e00435753
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mirror_gre_scale.sh
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Test offloading a number of mirrors-to-gretap. The test creates a number of
+# tunnels. Then it adds one flower mirror for each of the tunnels, matching a
+# given host IP. Then it generates traffic at each of the host IPs and checks
+# that the traffic has been mirrored at the appropriate tunnel.
+#
+# +--------------------------+ +--------------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 2001:db8:1:X::1/64 | | 2001:db8:1:X::2/64 | |
+# +-----|--------------------+ +--------------------|-----+
+# | |
+# +-----|-------------------------------------------------------------|-----+
+# | SW o--> mirrors | |
+# | +---|-------------------------------------------------------------|---+ |
+# | | + $swp1 BR $swp2 + | |
+# | +---------------------------------------------------------------------+ |
+# | |
+# | + $swp3 + gt6-<X> (ip6gretap) |
+# | | 2001:db8:2:X::1/64 : loc=2001:db8:2:X::1 |
+# | | : rem=2001:db8:2:X::2 |
+# | | : ttl=100 |
+# | | : tos=inherit |
+# | | : |
+# +-----|--------------------------------:----------------------------------+
+# | :
+# +-----|--------------------------------:----------------------------------+
+# | H3 + $h3 + h3-gt6-<X> (ip6gretap) |
+# | 2001:db8:2:X::2/64 loc=2001:db8:2:X::2 |
+# | rem=2001:db8:2:X::1 |
+# | ttl=100 |
+# | tos=inherit |
+# | |
+# +-------------------------------------------------------------------------+
+
+source ../../../../net/forwarding/mirror_lib.sh
+
+MIRROR_NUM_NETIFS=6
+
+# Print the "2001:db8:<net>:<num>" prefix for a subnet, with both numbers
+# rendered in hexadecimal and no trailing newline. Callers append the
+# interface identifier, e.g. "::1/64".
+#
+# $1 - network number
+# $2 - tunnel index
+mirror_gre_ipv6_addr()
+{
+	local subnet=$1; shift
+	local index=$1; shift
+
+	printf "2001:db8:%x:%x" $subnet $index
+}
+
+# Create $1 mirror tunnels and install one flower mirror rule per tunnel.
+#
+# For each index i this adds host addresses on $h1/$h2, underlay addresses on
+# $swp3/$h3, the gt6-<i> tunnel and its h3-gt6-<i> peer (in VRF v$h3, with a
+# matchall sink), and appends a "filter add" line to a tc batch file. The
+# batch is then applied with a single "tc -b".
+#
+# $1 - number of tunnels to create
+# $2 - non-zero when the rule insertion is expected to fail
+#
+# Globals written: MIRROR_GRE_BATCH_FILE, mirror_gre_tunnels (incremented per
+# tunnel so that mirror_gre_cleanup knows how many to destroy), i.
+# NOTE(review): the temporary batch file is not removed in the code visible
+# here.
+mirror_gre_tunnels_create()
+{
+ local count=$1; shift
+ local should_fail=$1; shift
+
+ MIRROR_GRE_BATCH_FILE="$(mktemp)"
+ for ((i=0; i < count; ++i)); do
+ local match_dip=$(mirror_gre_ipv6_addr 1 $i)::2
+ local htun=h3-gt6-$i
+ local tun=gt6-$i
+
+ ((mirror_gre_tunnels++))
+
+ ip address add dev $h1 $(mirror_gre_ipv6_addr 1 $i)::1/64
+ ip address add dev $h2 $(mirror_gre_ipv6_addr 1 $i)::2/64
+
+ ip address add dev $swp3 $(mirror_gre_ipv6_addr 2 $i)::1/64
+ ip address add dev $h3 $(mirror_gre_ipv6_addr 2 $i)::2/64
+
+ tunnel_create $tun ip6gretap \
+ $(mirror_gre_ipv6_addr 2 $i)::1 \
+ $(mirror_gre_ipv6_addr 2 $i)::2 \
+ ttl 100 tos inherit allow-localremote
+
+ tunnel_create $htun ip6gretap \
+ $(mirror_gre_ipv6_addr 2 $i)::2 \
+ $(mirror_gre_ipv6_addr 2 $i)::1
+ ip link set $htun vrf v$h3
+ matchall_sink_create $htun
+
+ cat >> $MIRROR_GRE_BATCH_FILE <<-EOF
+ filter add dev $swp1 ingress pref 1000 \
+ protocol ipv6 \
+ flower $tcflags dst_ip $match_dip \
+ action mirred egress mirror dev $tun
+ EOF
+ done
+
+ tc -b $MIRROR_GRE_BATCH_FILE
+ check_err_fail $should_fail $? "Mirror rule insertion"
+}
+
+# Remove the addresses and tunnels that mirror_gre_tunnels_create() set up
+# for indices 0..$1-1. The tc filters are not removed here.
+#
+# $1 - number of tunnels to destroy
+mirror_gre_tunnels_destroy()
+{
+ local count=$1; shift
+
+ for ((i=0; i < count; ++i)); do
+ local htun=h3-gt6-$i
+ local tun=gt6-$i
+
+ ip address del dev $h3 $(mirror_gre_ipv6_addr 2 $i)::2/64
+ ip address del dev $swp3 $(mirror_gre_ipv6_addr 2 $i)::1/64
+
+ ip address del dev $h2 $(mirror_gre_ipv6_addr 1 $i)::2/64
+ ip address del dev $h1 $(mirror_gre_ipv6_addr 1 $i)::1/64
+
+ tunnel_destroy $htun
+ tunnel_destroy $tun
+ done
+}
+
+# Create the tunnels and mirror rules and, unless the insertion was expected
+# to fail, run mirror_test from each host address towards its peer with the
+# capture installed on the matching h3-gt6-<i> device.
+#
+# $1 - number of tunnels
+# $2 - non-zero when rule insertion is expected to fail; the traffic test is
+#      skipped in that case
+__mirror_gre_test()
+{
+ local count=$1; shift
+ local should_fail=$1; shift
+
+ mirror_gre_tunnels_create $count $should_fail
+ if ((should_fail)); then
+ return
+ fi
+
+ # NOTE(review): presumably gives the new addresses and tunnels time to
+ # settle before traffic is sent -- confirm.
+ sleep 5
+
+ for ((i = 0; i < count; ++i)); do
+ local sip=$(mirror_gre_ipv6_addr 1 $i)::1
+ local dip=$(mirror_gre_ipv6_addr 1 $i)::2
+ local htun=h3-gt6-$i
+ local message
+
+ icmp6_capture_install $htun
+ mirror_test v$h1 $sip $dip $htun 100 10
+ icmp6_capture_uninstall $htun
+ done
+}
+
+mirror_gre_test()
+{
+ local count=$1; shift
+ local should_fail=$1; shift
+
+ if ! tc_offload_check $TC_FLOWER_NUM_NETIFS; then
+ check_err 1 "Could not test offloaded functionality"
+ return
+ fi
+
+ tcflags="skip_sw"
+ __mirror_gre_test $count $should_fail
+}
+
+# Map the six test interfaces from NETIFS onto their roles, reset the tunnel
+# counter, initialize the three hosts, and bridge $swp1 and $swp2 in br1
+# (with a clsact qdisc on $swp1 for the mirror filters). $swp3 is only
+# brought up; it is not enslaved to the bridge.
+mirror_gre_setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ mirror_gre_tunnels=0
+
+ vrf_prepare
+
+ simple_if_init $h1
+ simple_if_init $h2
+ simple_if_init $h3
+
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ tc qdisc add dev $swp1 clsact
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ ip link set dev $swp3 up
+}
+
+# Undo mirror_gre_setup_prepare() and destroy every tunnel counted in
+# $mirror_gre_tunnels. Deleting the clsact qdisc on $swp1 is the only place
+# where the mirror filters are cleaned up in the code visible here.
+mirror_gre_cleanup()
+{
+ mirror_gre_tunnels_destroy $mirror_gre_tunnels
+
+ ip link set dev $swp3 down
+
+ ip link set dev $swp2 down
+
+ tc qdisc del dev $swp1 clsact
+ ip link set dev $swp1 down
+
+ ip link del dev br1
+
+ simple_if_fini $h3
+ simple_if_fini $h2
+ simple_if_fini $h1
+
+ vrf_cleanup
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh
new file mode 100644
index 000000000..cbe50f260
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/mlxsw_lib.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##############################################################################
+# Defines
+
+# Unless MLXSW_CHIP is already set, fill it from the "driver" field of
+# "devlink dev info" for $DEVLINK_DEV. An empty result aborts the script.
+if [[ ! -v MLXSW_CHIP ]]; then
+	MLXSW_CHIP=$(devlink -j dev info $DEVLINK_DEV | jq -r '.[][]["driver"]')
+	[[ -n "$MLXSW_CHIP" ]] || {
+		echo "SKIP: Device $DEVLINK_DEV doesn't support devlink info command"
+		exit 1
+	}
+fi
diff --git a/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh b/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh
new file mode 100755
index 000000000..f02d83e94
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/one_armed_router.sh
@@ -0,0 +1,259 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test a "one-armed router" [1] scenario. Packets forwarded between H1 and H2
+# should be forwarded by the ASIC, but also trapped so that ICMP redirect
+# packets could be potentially generated.
+#
+# 1. https://en.wikipedia.org/wiki/One-armed_router
+#
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | 2001:db8:1::1/64 |
+# | | |
+# | | default via 192.0.2.2 |
+# | | default via 2001:db8:1::2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | +--|--------------------------------------------------------------------+ |
+# | | + $swp1 BR0 (802.1d) | |
+# | | | |
+# | | 192.0.2.2/24 | |
+# | | 2001:db8:1::2/64 | |
+# | | 198.51.100.2/24 | |
+# | | 2001:db8:2::2/64 | |
+# | | | |
+# | | + $swp2 | |
+# | +--|--------------------------------------------------------------------+ |
+# | | |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|----------------------------+
+# | | default via 198.51.100.2 |
+# | | default via 2001:db8:2::2 |
+# | | |
+# | | 2001:db8:2::1/64 |
+# | | 198.51.100.1/24 |
+# | + $h2 |
+# | H2 (vrf) |
+# +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="ping_ipv4 ping_ipv6 fwd_mark_ipv4 fwd_mark_ipv6"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+# Initialize H1 with its IPv4/IPv6 addresses and add default routes in VRF
+# v$h1 through the bridge addresses 192.0.2.2 / 2001:db8:1::2.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add default vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+# Counterpart of h1_create(): remove the default routes, then the interface
+# configuration.
+h1_destroy()
+{
+ ip -6 route del default vrf v$h1 nexthop via 2001:db8:1::2
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Initialize H2 with its IPv4/IPv6 addresses and add default routes in VRF
+# v$h2 through the bridge addresses 198.51.100.2 / 2001:db8:2::2.
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/24 2001:db8:2::1/64
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+ ip -6 route add default vrf v$h2 nexthop via 2001:db8:2::2
+}
+
+# Counterpart of h2_create(): remove the default routes, then the interface
+# configuration.
+h2_destroy()
+{
+ ip -6 route del default vrf v$h2 nexthop via 2001:db8:2::2
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/24 2001:db8:2::1/64
+}
+
+# Create bridge br0 with both switch ports enslaved, add clsact qdiscs for
+# the tc filters used by the fwd_mark tests, and put the gateway addresses of
+# both host subnets on the single bridge interface (the "one arm").
+switch_create()
+{
+ ip link add name br0 type bridge mcast_snooping 0
+ ip link set dev br0 up
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br0
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+
+ __addr_add_del br0 add 192.0.2.2/24 2001:db8:1::2/64
+ __addr_add_del br0 add 198.51.100.2/24 2001:db8:2::2/64
+}
+
+# Counterpart of switch_create(): undo every step in reverse order and delete
+# br0.
+switch_destroy()
+{
+ __addr_add_del br0 del 198.51.100.2/24 2001:db8:2::2/64
+ __addr_add_del br0 del 192.0.2.2/24 2001:db8:1::2/64
+
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ ip link set dev br0 down
+ ip link del dev br0
+}
+
+# Ping H2's IPv4 address from H1; the path crosses the router on br0.
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.1 ": h1->h2"
+}
+
+# Ping H2's IPv6 address from H1; the path crosses the router on br0.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::1 ": h1->h2"
+}
+
+# Send 10 IPv4 UDP packets from H1 to H2 and verify, with three flower
+# filters acting as counters, that they are:
+#  - seen in software at $swp1 ingress (skip_hw filter 101 counts 10),
+#  - not forwarded in software out of $swp2 (skip_hw filter 101 counts 0),
+#  - forwarded in hardware out of $swp2 (skip_sw filter 102 counts 10).
+# Logs three separate test results and removes the filters afterwards.
+fwd_mark_ipv4()
+{
+ # Transmit packets from H1 to H2 and make sure they are trapped at
+ # swp1 due to loopback error, but only forwarded by the ASIC through
+ # swp2
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+ skip_hw dst_ip 198.51.100.1 ip_proto udp dst_port 52768 \
+ action pass
+
+ tc filter add dev $swp2 egress protocol ip pref 1 handle 101 flower \
+ skip_hw dst_ip 198.51.100.1 ip_proto udp dst_port 52768 \
+ action pass
+
+ tc filter add dev $swp2 egress protocol ip pref 2 handle 102 flower \
+ skip_sw dst_ip 198.51.100.1 ip_proto udp dst_port 52768 \
+ action pass
+
+ ip vrf exec v$h1 $MZ $h1 -c 10 -d 100msec -p 64 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q
+
+ RET=0
+
+ tc_check_packets "dev $swp1 ingress" 101 10
+ check_err $?
+
+ log_test "fwd mark: trapping IPv4 packets due to LBERROR"
+
+ RET=0
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_err $?
+
+ log_test "fwd mark: forwarding IPv4 packets in software"
+
+ RET=0
+
+ tc_check_packets "dev $swp2 egress" 102 10
+ check_err $?
+
+ log_test "fwd mark: forwarding IPv4 packets in hardware"
+
+ tc filter del dev $swp2 egress protocol ip pref 2 handle 102 flower
+ tc filter del dev $swp2 egress protocol ip pref 1 handle 101 flower
+ tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+}
+
+# IPv6 variant of fwd_mark_ipv4(): send 10 IPv6 UDP packets from H1 to H2 and
+# verify that they are seen in software at $swp1 ingress (filter 101 counts
+# 10), not forwarded in software out of $swp2 (filter 101 counts 0), and
+# forwarded in hardware out of $swp2 (skip_sw filter 102 counts 10).
+# Logs three separate test results and removes the filters afterwards.
+fwd_mark_ipv6()
+{
+ tc filter add dev $swp1 ingress protocol ipv6 pref 1 handle 101 flower \
+ skip_hw dst_ip 2001:db8:2::1 ip_proto udp dst_port 52768 \
+ action pass
+
+ tc filter add dev $swp2 egress protocol ipv6 pref 1 handle 101 flower \
+ skip_hw dst_ip 2001:db8:2::1 ip_proto udp dst_port 52768 \
+ action pass
+
+ tc filter add dev $swp2 egress protocol ipv6 pref 2 handle 102 flower \
+ skip_sw dst_ip 2001:db8:2::1 ip_proto udp dst_port 52768 \
+ action pass
+
+ ip vrf exec v$h1 $MZ $h1 -6 -c 10 -d 100msec -p 64 -A 2001:db8:1::1 \
+ -B 2001:db8:2::1 -t udp dp=52768,sp=42768 -q
+
+ RET=0
+
+ tc_check_packets "dev $swp1 ingress" 101 10
+ check_err $?
+
+ log_test "fwd mark: trapping IPv6 packets due to LBERROR"
+
+ RET=0
+
+ tc_check_packets "dev $swp2 egress" 101 0
+ check_err $?
+
+ log_test "fwd mark: forwarding IPv6 packets in software"
+
+ RET=0
+
+ tc_check_packets "dev $swp2 egress" 102 10
+ check_err $?
+
+ log_test "fwd mark: forwarding IPv6 packets in hardware"
+
+ tc filter del dev $swp2 egress protocol ipv6 pref 2 handle 102 flower
+ tc filter del dev $swp2 egress protocol ipv6 pref 1 handle 101 flower
+ tc filter del dev $swp1 ingress protocol ipv6 pref 1 handle 101 flower
+}
+
+# Map the four test interfaces from NETIFS onto their roles, enable
+# forwarding, set accept_redirects to 0 for IPv4 and IPv6, and build the
+# hosts and the switch.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+ forwarding_enable
+
+ sysctl_set net.ipv4.conf.all.accept_redirects 0
+ sysctl_set net.ipv6.conf.all.accept_redirects 0
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+# Undo setup_prepare() in reverse order, including the two accept_redirects
+# sysctls.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ sysctl_restore net.ipv6.conf.all.accept_redirects
+ sysctl_restore net.ipv4.conf.all.accept_redirects
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh
new file mode 100755
index 000000000..71066bc4b
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_defprio.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for port-default priority. Non-IP packets ingress $swp1 and are
+# prioritized according to the default priority specified at the port.
+# rx_frames_prio_* counters are used to verify the prioritization.
+#
+# +-----------------------+
+# | H1 |
+# | + $h1 |
+# | | 192.0.2.1/28 |
+# +----|------------------+
+# |
+# +----|------------------+
+# | SW | |
+# | + $swp1 |
+# | 192.0.2.2/28 |
+# | APP=<prio>,1,0 |
+# +-----------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_defprio
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=2
+: ${HIT_TIMEOUT:=1000} # ms
+source $lib_dir/lib.sh
+
+declare -a APP
+
+# Install an APP entry "app=<prio>,1,0" on a port through lldptool and
+# remember it in the APP array so that it can be removed later.
+#
+# $1 - network device
+# $2 - priority; also used as the index into APP
+defprio_install()
+{
+ local dev=$1; shift
+ local prio=$1; shift
+ local app="app=$prio,1,0"
+
+ lldptool -T -i $dev -V APP $app >/dev/null
+ lldpad_app_wait_set $dev
+ APP[$prio]=$app
+}
+
+# Remove the APP entry that defprio_install() recorded for a priority and
+# drop it from the APP array.
+#
+# $1 - network device
+# $2 - priority whose entry is removed
+defprio_uninstall()
+{
+ local dev=$1; shift
+ local prio=$1; shift
+ local app=${APP[$prio]}
+
+ lldptool -T -i $dev -V APP -d $app >/dev/null
+ lldpad_app_wait_del
+ unset APP[$prio]
+}
+
+# Remove every APP entry still recorded in the APP array from a port (a
+# single lldptool call, skipped when the array is empty) and empty the array.
+#
+# $1 - network device
+defprio_flush()
+{
+ local dev=$1; shift
+ local prio
+
+ if ((${#APP[@]})); then
+ lldptool -T -i $dev -V APP -d ${APP[@]} >/dev/null
+ fi
+ lldpad_app_wait_del
+ APP=()
+}
+
+# Initialize H1 with address 192.0.2.1/28.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+# Counterpart of h1_create().
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+# Bring $swp1 up and give it 192.0.2.2/28, the address pinged by ping_ipv4().
+switch_create()
+{
+ ip link set dev $swp1 up
+ ip addr add dev $swp1 192.0.2.2/28
+}
+
+# Counterpart of switch_create(). Also flushes any APP entries still recorded
+# for $swp1, e.g. when a test was interrupted before uninstalling them.
+switch_destroy()
+{
+ defprio_flush $swp1
+ ip addr del dev $swp1 192.0.2.2/28
+ ip link set dev $swp1 down
+}
+
+# Map the two test interfaces from NETIFS onto $h1/$swp1 and build the
+# topology.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ vrf_prepare
+
+ h1_create
+ switch_create
+}
+
+# Tear the topology down in the reverse order of setup_prepare().
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Ping the switch port address from H1.
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+# Install a default priority on $swp1, send 10 ARP replies from $h1 and check
+# that the rx_frames_prio_<observe> counter of $swp1 grows by at least 10
+# within $HIT_TIMEOUT ms. The priority is uninstalled again afterwards.
+#
+# $1 - priority to install
+# $2 - priority whose counter is expected to grow
+__test_defprio()
+{
+ local prio_install=$1; shift
+ local prio_observe=$1; shift
+ local key
+ local t1
+ local i
+
+ RET=0
+
+ defprio_install $swp1 $prio_install
+
+ local t0=$(ethtool_stats_get $swp1 rx_frames_prio_$prio_observe)
+ mausezahn -q $h1 -d 100m -c 10 -t arp reply
+ t1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((t0 + 10))" \
+ ethtool_stats_get $swp1 rx_frames_prio_$prio_observe)
+
+ # $? here is still the exit status of the busywait command substitution.
+ check_err $? "Default priority $prio_install/$prio_observe: Expected to capture 10 packets, got $((t1 - t0))."
+ log_test "Default priority $prio_install/$prio_observe"
+
+ defprio_uninstall $swp1 $prio_install
+}
+
+# Exercise the default priority in two rounds:
+#  1. Each priority 0..7 installed on its own is observed as itself.
+#  2. With priority 3 installed as well, additionally installed priorities
+#     0..2 are expected to be observed as 3, while 4..7 are observed as
+#     themselves.
+test_defprio()
+{
+	local prio
+
+	for ((prio = 0; prio <= 7; prio++)); do
+		__test_defprio $prio $prio
+	done
+
+	defprio_install $swp1 3
+	for prio in 0 1 2; do
+		__test_defprio $prio 3
+	done
+	for prio in 4 5 6 7; do
+		__test_defprio $prio $prio
+	done
+	defprio_uninstall $swp1 3
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
new file mode 100755
index 000000000..28a570006
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_bridge.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for DSCP prioritization and rewrite. Packets ingress $swp1 with a DSCP
+# tag and are prioritized according to the map at $swp1. They egress $swp2 and
+# the DSCP value is updated to match the map at that interface. The updated DSCP
+# tag is verified at $h2.
+#
+# ICMP responses are produced with the same DSCP tag that arrived at $h2. They
+# go through prioritization at $swp2 and DSCP retagging at $swp1. The tag is
+# verified at $h1--it should match the original tag.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | +-|----------------------------------------------------------------|-+ |
+# | | + $swp1 BR $swp2 + | |
+# | | APP=0,5,10 .. 7,5,17 APP=0,5,20 .. 7,5,27 | |
+# | +--------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_dscp
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+ tc qdisc add dev $h1 clsact
+ dscp_capture_install $h1 10
+}
+
+h1_destroy()
+{
+ dscp_capture_uninstall $h1 10
+ tc qdisc del dev $h1 clsact
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+ tc qdisc add dev $h2 clsact
+ dscp_capture_install $h2 20
+}
+
+h2_destroy()
+{
+ dscp_capture_uninstall $h2 20
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+dscp_map()
+{
+ local base=$1; shift
+ local prio
+
+ for prio in {0..7}; do
+ echo app=$prio,5,$((base + prio))
+ done
+}
+
+switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ lldptool -T -i $swp1 -V APP $(dscp_map 10) >/dev/null
+ lldptool -T -i $swp2 -V APP $(dscp_map 20) >/dev/null
+ lldpad_app_wait_set $swp1
+ lldpad_app_wait_set $swp2
+}
+
+switch_destroy()
+{
+ lldptool -T -i $swp2 -V APP -d $(dscp_map 20) >/dev/null
+ lldptool -T -i $swp1 -V APP -d $(dscp_map 10) >/dev/null
+ lldpad_app_wait_del
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+dscp_ping_test()
+{
+ local vrf_name=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local prio=$1; shift
+ local dev_10=$1; shift
+ local dev_20=$1; shift
+ local key
+
+ local dscp_10=$(((prio + 10) << 2))
+ local dscp_20=$(((prio + 20) << 2))
+
+ RET=0
+
+ local -A t0s
+ eval "t0s=($(dscp_fetch_stats $dev_10 10)
+ $(dscp_fetch_stats $dev_20 20))"
+
+ local ping_timeout=$((PING_TIMEOUT * 5))
+ ip vrf exec $vrf_name \
+ ${PING} -Q $dscp_10 ${sip:+-I $sip} $dip \
+ -c 10 -i 0.5 -w $ping_timeout &> /dev/null
+
+ local -A t1s
+ eval "t1s=($(dscp_fetch_stats $dev_10 10)
+ $(dscp_fetch_stats $dev_20 20))"
+
+ for key in ${!t0s[@]}; do
+ local expect
+ if ((key == prio+10 || key == prio+20)); then
+ expect=10
+ else
+ expect=0
+ fi
+
+ local delta=$((t1s[$key] - t0s[$key]))
+ ((expect == delta))
+ check_err $? "DSCP $key: Expected to capture $expect packets, got $delta."
+ done
+
+ log_test "DSCP rewrite: $dscp_10-(prio $prio)-$dscp_20"
+}
+
+test_dscp()
+{
+ local prio
+
+ for prio in {0..7}; do
+ dscp_ping_test v$h1 192.0.2.1 192.0.2.2 $prio $h1 $h2
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh
new file mode 100755
index 000000000..4cb2aa652
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_dscp_router.sh
@@ -0,0 +1,284 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for DSCP prioritization in the router.
+#
+# With ip_forward_update_priority disabled, the packets are expected to keep
+# their DSCP (which in this test uses only values 0..7) intact as they are
+# forwarded by the switch. That is verified at $h2. ICMP responses are formed
+# with the same DSCP as the requests, and likewise pass through the switch
+# intact, which is verified at $h1.
+#
+# With ip_forward_update_priority enabled, the router reprioritizes the packets
+# according to the table in reprioritize(). Thus, say, DSCP 7 maps to priority
+# 4, which on egress maps back to DSCP 4. The response packet then gets
+# reprioritized to 6, getting DSCP 6 on egress.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.18/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | + $swp1 $swp2 + |
+# | 192.0.2.2/28 192.0.2.17/28 |
+# | APP=0,5,0 .. 7,5,7 APP=0,5,0 .. 7,5,7 |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_update
+ test_no_update
+ test_pedit_norewrite
+ test_dscp_leftover
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=4
+source $lib_dir/lib.sh
+
+reprioritize()
+{
+ local in=$1; shift
+
+ # This is based on rt_tos2priority in include/net/route.h. Assuming 1:1
+ # mapping between priorities and TOS, it yields a new priority for a
+ # packet with ingress priority of $in.
+ local -a reprio=(0 0 2 2 6 6 4 4)
+
+ echo ${reprio[$in]}
+}
+
+zero()
+{
+ echo 0
+}
+
+three()
+{
+ echo 3
+}
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+ tc qdisc add dev $h1 clsact
+ dscp_capture_install $h1 0
+ ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2
+}
+
+h1_destroy()
+{
+ ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2
+ dscp_capture_uninstall $h1 0
+ tc qdisc del dev $h1 clsact
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.18/28
+ tc qdisc add dev $h2 clsact
+ dscp_capture_install $h2 0
+ ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17
+}
+
+h2_destroy()
+{
+ ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17
+ dscp_capture_uninstall $h2 0
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.18/28
+}
+
+dscp_map()
+{
+ local base=$1; shift
+ local prio
+
+ for prio in {0..7}; do
+ echo app=$prio,5,$((base + prio))
+ done
+}
+
+switch_create()
+{
+ simple_if_init $swp1 192.0.2.2/28
+ __simple_if_init $swp2 v$swp1 192.0.2.17/28
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+
+ lldptool -T -i $swp1 -V APP $(dscp_map 0) >/dev/null
+ lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null
+ lldpad_app_wait_set $swp1
+ lldpad_app_wait_set $swp2
+}
+
+switch_destroy()
+{
+ lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null
+ lldptool -T -i $swp1 -V APP -d $(dscp_map 0) >/dev/null
+ lldpad_app_wait_del
+
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ __simple_if_fini $swp2 192.0.2.17/28
+ simple_if_fini $swp1 192.0.2.2/28
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ sysctl_set net.ipv4.ip_forward_update_priority 1
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ sysctl_restore net.ipv4.ip_forward_update_priority
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.18
+}
+
+dscp_ping_test()
+{
+ local vrf_name=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local prio=$1; shift
+ local reprio=$1; shift
+ local dev1=$1; shift
+ local dev2=$1; shift
+ local i
+
+ local prio2=$($reprio $prio) # ICMP Request egress prio
+ local prio3=$($reprio $prio2) # ICMP Response egress prio
+
+ local dscp=$((prio << 2)) # ICMP Request ingress DSCP
+ local dscp2=$((prio2 << 2)) # ICMP Request egress DSCP
+ local dscp3=$((prio3 << 2)) # ICMP Response egress DSCP
+
+ RET=0
+
+ eval "local -A dev1_t0s=($(dscp_fetch_stats $dev1 0))"
+ eval "local -A dev2_t0s=($(dscp_fetch_stats $dev2 0))"
+
+ local ping_timeout=$((PING_TIMEOUT * 5))
+ ip vrf exec $vrf_name \
+ ${PING} -Q $dscp ${sip:+-I $sip} $dip \
+ -c 10 -i 0.5 -w $ping_timeout &> /dev/null
+
+ eval "local -A dev1_t1s=($(dscp_fetch_stats $dev1 0))"
+ eval "local -A dev2_t1s=($(dscp_fetch_stats $dev2 0))"
+
+ for i in {0..7}; do
+ local dscpi=$((i << 2))
+ local expect2=0
+ local expect3=0
+
+ if ((i == prio2)); then
+ expect2=10
+ fi
+ if ((i == prio3)); then
+ expect3=10
+ fi
+
+ local delta=$((dev2_t1s[$i] - dev2_t0s[$i]))
+ ((expect2 == delta))
+ check_err $? "DSCP $dscpi@$dev2: Expected to capture $expect2 packets, got $delta."
+
+ delta=$((dev1_t1s[$i] - dev1_t0s[$i]))
+ ((expect3 == delta))
+ check_err $? "DSCP $dscpi@$dev1: Expected to capture $expect3 packets, got $delta."
+ done
+
+ log_test "DSCP rewrite: $dscp-(prio $prio2)-$dscp2-(prio $prio3)-$dscp3"
+}
+
+__test_update()
+{
+ local update=$1; shift
+ local reprio=$1; shift
+ local prio
+
+ sysctl_restore net.ipv4.ip_forward_update_priority
+ sysctl_set net.ipv4.ip_forward_update_priority $update
+
+ for prio in {0..7}; do
+ dscp_ping_test v$h1 192.0.2.1 192.0.2.18 $prio $reprio $h1 $h2
+ done
+}
+
+test_update()
+{
+ echo "Test net.ipv4.ip_forward_update_priority=1"
+ __test_update 1 reprioritize
+}
+
+test_no_update()
+{
+ echo "Test net.ipv4.ip_forward_update_priority=0"
+ __test_update 0 echo
+}
+
+# Test that when DSCP is updated in pedit, the DSCP rewrite is turned off.
+test_pedit_norewrite()
+{
+ echo "Test no DSCP rewrite after DSCP is updated by pedit"
+
+ tc filter add dev $swp1 ingress handle 101 pref 1 prot ip flower \
+ action pedit ex munge ip dsfield set $((3 << 2)) retain 0xfc \
+ action skbedit priority 3
+
+ __test_update 0 three
+
+ tc filter del dev $swp1 ingress pref 1
+}
+
+# Test that when the last APP rule is removed, the prio->DSCP map is properly
+# set to zeroes, and that the last APP rule does not stay active in the ASIC.
+test_dscp_leftover()
+{
+ echo "Test that last removed DSCP rule is deconfigured correctly"
+
+ lldptool -T -i $swp2 -V APP -d $(dscp_map 0) >/dev/null
+ lldpad_app_wait_del
+
+ __test_update 0 zero
+
+ lldptool -T -i $swp2 -V APP $(dscp_map 0) >/dev/null
+ lldpad_app_wait_set $swp2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh
new file mode 100755
index 000000000..e9f8718af
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_ets_strict.sh
@@ -0,0 +1,320 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# A test for strict prioritization of traffic in the switch. Run two streams of
+# traffic, each through a different ingress port, one tagged with PCP of 1, the
+# other with PCP of 2. Both streams converge at one egress port, where they are
+# assigned TC of, respectively, 1 and 2, with strict priority configured between
+# them. In H3, we expect to see (almost) exclusively the high-priority traffic.
+#
+# Please see qos_mc_aware.sh for an explanation of why we use mausezahn and
+# counters instead of just running iperf3.
+#
+# +---------------------------+ +-----------------------------+
+# | H1 | | H2 |
+# | $h1.111 + | | + $h2.222 |
+# | 192.0.2.33/28 | | | | 192.0.2.65/28 |
+# | e-qos-map 0:1 | | | | e-qos-map 0:2 |
+# | | | | | |
+# | $h1 + | | + $h2 |
+# +-----------------|---------+ +---------|-------------------+
+# | |
+# +-----------------|-------------------------------------|-------------------+
+# | $swp1 + + $swp2 |
+# | >1Gbps | | >1Gbps |
+# | +---------------|-----------+ +----------|----------------+ |
+# | | $swp1.111 + | | + $swp2.222 | |
+# | | BR111 | SW | BR222 | |
+# | | $swp3.111 + | | + $swp3.222 | |
+# | +---------------|-----------+ +----------|----------------+ |
+# | \_____________________________________/ |
+# | | |
+# | + $swp3 |
+# | | 1Gbps bottleneck |
+# | | ETS: (up n->tc n for n in 0..7) |
+# | | strict priority |
+# +------------------------------------|--------------------------------------+
+# |
+# +--------------------|--------------------+
+# | + $h3 H3 |
+# | / \ |
+# | / \ |
+# | $h3.111 + + $h3.222 |
+# | 192.0.2.34/28 192.0.2.66/28 |
+# +-----------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_ets_strict
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=6
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+ mtu_set $h1 10000
+
+ vlan_create $h1 111 v$h1 192.0.2.33/28
+ ip link set dev $h1.111 type vlan egress-qos-map 0:1
+}
+
+h1_destroy()
+{
+ vlan_destroy $h1 111
+
+ mtu_restore $h1
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+ mtu_set $h2 10000
+
+ vlan_create $h2 222 v$h2 192.0.2.65/28
+ ip link set dev $h2.222 type vlan egress-qos-map 0:2
+}
+
+h2_destroy()
+{
+ vlan_destroy $h2 222
+
+ mtu_restore $h2
+ simple_if_fini $h2
+}
+
+h3_create()
+{
+ simple_if_init $h3
+ mtu_set $h3 10000
+
+ vlan_create $h3 111 v$h3 192.0.2.34/28
+ vlan_create $h3 222 v$h3 192.0.2.66/28
+}
+
+h3_destroy()
+{
+ vlan_destroy $h3 222
+ vlan_destroy $h3 111
+
+ mtu_restore $h3
+ simple_if_fini $h3
+}
+
+switch_create()
+{
+ ip link set dev $swp1 up
+ mtu_set $swp1 10000
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 10000
+
+ # prio n -> TC n, strict scheduling
+ lldptool -T -i $swp3 -V ETS-CFG up2tc=0:0,1:1,2:2,3:3,4:4,5:5,6:6,7:7
+ lldptool -T -i $swp3 -V ETS-CFG tsa=$(
+ )"0:strict,"$(
+ )"1:strict,"$(
+ )"2:strict,"$(
+ )"3:strict,"$(
+ )"4:strict,"$(
+ )"5:strict,"$(
+ )"6:strict,"$(
+ )"7:strict"
+ sleep 1
+
+ ip link set dev $swp3 up
+ mtu_set $swp3 10000
+ ethtool -s $swp3 speed 1000 autoneg off
+
+ vlan_create $swp1 111
+ vlan_create $swp2 222
+ vlan_create $swp3 111
+ vlan_create $swp3 222
+
+ ip link add name br111 up type bridge vlan_filtering 0
+ ip link set dev $swp1.111 master br111
+ ip link set dev $swp3.111 master br111
+
+ ip link add name br222 up type bridge vlan_filtering 0
+ ip link set dev $swp2.222 master br222
+ ip link set dev $swp3.222 master br222
+
+ # Make sure that ingress quotas are smaller than egress so that there is
+ # room for both streams of traffic to be admitted to shared buffer.
+ devlink_pool_size_thtype_save 0
+ devlink_pool_size_thtype_set 0 dynamic 10000000
+ devlink_pool_size_thtype_save 4
+ devlink_pool_size_thtype_set 4 dynamic 10000000
+
+ devlink_port_pool_th_save $swp1 0
+ devlink_port_pool_th_set $swp1 0 6
+ devlink_tc_bind_pool_th_save $swp1 1 ingress
+ devlink_tc_bind_pool_th_set $swp1 1 ingress 0 6
+
+ devlink_port_pool_th_save $swp2 0
+ devlink_port_pool_th_set $swp2 0 6
+ devlink_tc_bind_pool_th_save $swp2 2 ingress
+ devlink_tc_bind_pool_th_set $swp2 2 ingress 0 6
+
+ devlink_tc_bind_pool_th_save $swp3 1 egress
+ devlink_tc_bind_pool_th_set $swp3 1 egress 4 7
+ devlink_tc_bind_pool_th_save $swp3 2 egress
+ devlink_tc_bind_pool_th_set $swp3 2 egress 4 7
+ devlink_port_pool_th_save $swp3 4
+ devlink_port_pool_th_set $swp3 4 7
+}
+
+switch_destroy()
+{
+ devlink_port_pool_th_restore $swp3 4
+ devlink_tc_bind_pool_th_restore $swp3 2 egress
+ devlink_tc_bind_pool_th_restore $swp3 1 egress
+
+ devlink_tc_bind_pool_th_restore $swp2 2 ingress
+ devlink_port_pool_th_restore $swp2 0
+
+ devlink_tc_bind_pool_th_restore $swp1 1 ingress
+ devlink_port_pool_th_restore $swp1 0
+
+ devlink_pool_size_thtype_restore 4
+ devlink_pool_size_thtype_restore 0
+
+ ip link del dev br222
+ ip link del dev br111
+
+ vlan_destroy $swp3 222
+ vlan_destroy $swp3 111
+ vlan_destroy $swp2 222
+ vlan_destroy $swp1 111
+
+ ethtool -s $swp3 autoneg on
+ mtu_restore $swp3
+ ip link set dev $swp3 down
+ lldptool -T -i $swp3 -V ETS-CFG up2tc=0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0
+
+ mtu_restore $swp2
+ ip link set dev $swp2 down
+
+ mtu_restore $swp1
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ h3mac=$(mac_get $h3)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.34 " from H1"
+ ping_test $h2 192.0.2.66 " from H2"
+}
+
+rel()
+{
+ local old=$1; shift
+ local new=$1; shift
+
+ bc <<< "
+ scale=2
+ ret = 100 * $new / $old
+ if (ret > 0) { ret } else { 0 }
+ "
+}
+
+# Verify strict prioritization on $swp3: measure the prio-2 stream alone, the
+# prio-1 stream alone, and then the prio-2 stream while prio-1 keeps running.
+# The two streams on their own must reach about the same egress rate (within
+# 95..105%), and prio-2 must keep at least 95% of its solo egress rate when
+# prio-1 is also present. Logs one test result followed by a rate summary.
+test_ets_strict()
+{
+ RET=0
+
+ # Run high-prio traffic on its own.
+ start_traffic $h2.222 192.0.2.65 192.0.2.66 $h3mac
+ local -a rate_2
+ rate_2=($(measure_rate $swp2 $h3 rx_octets_prio_2 "prio 2"))
+ check_err $? "Could not get high enough prio-2 ingress rate"
+ local rate_2_in=${rate_2[0]}
+ local rate_2_eg=${rate_2[1]}
+ stop_traffic # $h2.222
+
+ # Start low-prio stream.
+ start_traffic $h1.111 192.0.2.33 192.0.2.34 $h3mac
+
+ local -a rate_1
+ rate_1=($(measure_rate $swp1 $h3 rx_octets_prio_1 "prio 1"))
+ check_err $? "Could not get high enough prio-1 ingress rate"
+ local rate_1_in=${rate_1[0]}
+ local rate_1_eg=${rate_1[1]}
+
+ # High-prio and low-prio on their own should have about the same
+ # throughput.
+ local rel21=$(rel $rate_1_eg $rate_2_eg)
+ check_err $(bc <<< "$rel21 < 95")
+ check_err $(bc <<< "$rel21 > 105")
+
+ # Start the high-prio stream--now both streams run.
+ start_traffic $h2.222 192.0.2.65 192.0.2.66 $h3mac
+ # Declared local like rate_1 and rate_2 so the array does not leak into
+ # the global scope of the script.
+ local -a rate_3
+ rate_3=($(measure_rate $swp2 $h3 rx_octets_prio_2 "prio 2 w/ 1"))
+ check_err $? "Could not get high enough prio-2 ingress rate with prio-1"
+ local rate_3_in=${rate_3[0]}
+ local rate_3_eg=${rate_3[1]}
+ stop_traffic # $h2.222
+
+ stop_traffic # $h1.111
+
+ # High-prio should have about the same throughput whether or not
+ # low-prio is in the system.
+ local rel32=$(rel $rate_2_eg $rate_3_eg)
+ check_err $(bc <<< "$rel32 < 95")
+
+ log_test "strict priority"
+ echo "Ingress to switch:"
+ echo " p1 in rate $(humanize $rate_1_in)"
+ echo " p2 in rate $(humanize $rate_2_in)"
+ echo " p2 in rate w/ p1 $(humanize $rate_3_in)"
+ echo "Egress from switch:"
+ echo " p1 eg rate $(humanize $rate_1_eg)"
+ echo " p2 eg rate $(humanize $rate_2_eg) ($rel21% of p1)"
+ echo " p2 eg rate w/ p1 $(humanize $rate_3_eg) ($rel32% of p2)"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh
new file mode 100755
index 000000000..27de3d9ed
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_headroom.sh
@@ -0,0 +1,379 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ test_defaults
+ test_dcb_ets
+ test_mtu
+ test_pfc
+ test_int_buf
+ test_tc_priomap
+ test_tc_mtu
+ test_tc_sizes
+ test_tc_int_buf
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+
+swp=$NETIF_NO_CABLE
+
+cleanup()
+{
+ pre_cleanup
+}
+
+get_prio_pg()
+{
+ __mlnx_qos -i $swp | sed -n '/^PFC/,/^[^[:space:]]/p' |
+ grep buffer | sed 's/ \+/ /g' | cut -d' ' -f 2-
+}
+
+get_prio_pfc()
+{
+ __mlnx_qos -i $swp | sed -n '/^PFC/,/^[^[:space:]]/p' |
+ grep enabled | sed 's/ \+/ /g' | cut -d' ' -f 2-
+}
+
+# Print the traffic class assigned to each priority, in ascending priority
+# order, as numbers each followed by a space (the form that the check_prio_tc
+# callers compare against, e.g. "0 2 4 6 1 3 5 7 "). The mapping is parsed
+# from the part of the mlnx_qos output that starts at the first "tc" line:
+# each "priority:" line is attributed to the most recent "tc" line.
+get_prio_tc()
+{
+ __mlnx_qos -i $swp | sed -n '/^tc/,$p' |
+ awk '/^tc/ { TC = $2 }
+ /priority:/ { PRIO[$2]=TC; if ($2 + 0 > max) max = $2 + 0 }
+ END {
+ # The order of a "for (i in PRIO)" walk is unspecified
+ # in awk, so visit the priorities explicitly from 0 up
+ # to the highest one seen.
+ for (i = 0; i <= max; i++)
+ if (i in PRIO)
+ printf("%d ", PRIO[i])
+ }'
+}
+
+get_buf_size()
+{
+ local idx=$1; shift
+
+ __mlnx_qos -i $swp | grep Receive | sed 's/.*: //' | cut -d, -f $((idx + 1))
+}
+
+get_tot_size()
+{
+ __mlnx_qos -i $swp | grep Receive | sed 's/.*total_size=//'
+}
+
+check_prio_pg()
+{
+ local expect=$1; shift
+
+ local current=$(get_prio_pg)
+ test "$current" = "$expect"
+ check_err $? "prio2buffer is '$current', expected '$expect'"
+}
+
+check_prio_pfc()
+{
+ local expect=$1; shift
+
+ local current=$(get_prio_pfc)
+ test "$current" = "$expect"
+ check_err $? "prio PFC is '$current', expected '$expect'"
+}
+
+check_prio_tc()
+{
+ local expect=$1; shift
+
+ local current=$(get_prio_tc)
+ test "$current" = "$expect"
+ check_err $? "prio_tc is '$current', expected '$expect'"
+}
+
+__check_buf_size()
+{
+ local idx=$1; shift
+ local expr=$1; shift
+ local what=$1; shift
+
+ local current=$(get_buf_size $idx)
+ ((current $expr))
+ check_err $? "${what}buffer $idx size is '$current', expected '$expr'"
+ echo $current
+}
+
+check_buf_size()
+{
+ __check_buf_size "$@" > /dev/null
+}
+
+test_defaults()
+{
+ RET=0
+
+ check_prio_pg "0 0 0 0 0 0 0 0 "
+ check_prio_tc "0 0 0 0 0 0 0 0 "
+ check_prio_pfc "0 0 0 0 0 0 0 0 "
+
+ log_test "Default headroom configuration"
+}
+
+test_dcb_ets()
+{
+ RET=0
+
+ __mlnx_qos -i $swp --prio_tc=0,2,4,6,1,3,5,7 > /dev/null
+
+ check_prio_pg "0 2 4 6 1 3 5 7 "
+ check_prio_tc "0 2 4 6 1 3 5 7 "
+ check_prio_pfc "0 0 0 0 0 0 0 0 "
+
+ __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null
+
+ check_prio_pg "0 0 0 0 0 0 0 0 "
+ check_prio_tc "0 0 0 0 0 0 0 0 "
+
+ __mlnx_qos -i $swp --prio2buffer=1,3,5,7,0,2,4,6 &> /dev/null
+ check_fail $? "prio2buffer accepted in DCB mode"
+
+ log_test "Configuring headroom through ETS"
+}
+
+test_mtu()
+{
+ local what=$1; shift
+ local buf0size_2
+ local buf0size
+
+ RET=0
+ buf0size=$(__check_buf_size 0 "> 0")
+
+ mtu_set $swp 3000
+ buf0size_2=$(__check_buf_size 0 "> $buf0size" "MTU 3000: ")
+ mtu_restore $swp
+
+ mtu_set $swp 6000
+ check_buf_size 0 "> $buf0size_2" "MTU 6000: "
+ mtu_restore $swp
+
+ check_buf_size 0 "== $buf0size"
+
+ log_test "${what}MTU impacts buffer size"
+}
+
+test_tc_mtu()
+{
+ # In TC mode, MTU still impacts the threshold below which a buffer is
+ # not permitted to go.
+
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M
+ test_mtu "TC: "
+ tc qdisc delete dev $swp root
+}
+
+test_pfc()
+{
+ RET=0
+
+ __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,1,2,3 > /dev/null
+
+ local buf0size=$(get_buf_size 0)
+ local buf1size=$(get_buf_size 1)
+ local buf2size=$(get_buf_size 2)
+ local buf3size=$(get_buf_size 3)
+ check_buf_size 0 "> 0"
+ check_buf_size 1 "> 0"
+ check_buf_size 2 "> 0"
+ check_buf_size 3 "> 0"
+ check_buf_size 4 "== 0"
+ check_buf_size 5 "== 0"
+ check_buf_size 6 "== 0"
+ check_buf_size 7 "== 0"
+
+ log_test "Buffer size sans PFC"
+
+ RET=0
+
+ __mlnx_qos -i $swp --pfc=0,0,0,0,0,1,1,1 --cable_len=0 > /dev/null
+
+ check_prio_pg "0 0 0 0 0 1 2 3 "
+ check_prio_pfc "0 0 0 0 0 1 1 1 "
+ check_buf_size 0 "== $buf0size"
+ check_buf_size 1 "> $buf1size"
+ check_buf_size 2 "> $buf2size"
+ check_buf_size 3 "> $buf3size"
+
+ local buf1size=$(get_buf_size 1)
+ check_buf_size 2 "== $buf1size"
+ check_buf_size 3 "== $buf1size"
+
+ log_test "PFC: Cable length 0"
+
+ RET=0
+
+ __mlnx_qos -i $swp --pfc=0,0,0,0,0,1,1,1 --cable_len=1000 > /dev/null
+
+ check_buf_size 0 "== $buf0size"
+ check_buf_size 1 "> $buf1size"
+ check_buf_size 2 "> $buf1size"
+ check_buf_size 3 "> $buf1size"
+
+ log_test "PFC: Cable length 1000"
+
+ RET=0
+
+ __mlnx_qos -i $swp --pfc=0,0,0,0,0,0,0,0 --cable_len=0 > /dev/null
+ __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null
+
+ check_prio_pg "0 0 0 0 0 0 0 0 "
+ check_prio_tc "0 0 0 0 0 0 0 0 "
+ check_buf_size 0 "> 0"
+ check_buf_size 1 "== 0"
+ check_buf_size 2 "== 0"
+ check_buf_size 3 "== 0"
+ check_buf_size 4 "== 0"
+ check_buf_size 5 "== 0"
+ check_buf_size 6 "== 0"
+ check_buf_size 7 "== 0"
+
+ log_test "PFC: Restore defaults"
+}
+
+test_tc_priomap()
+{
+ RET=0
+
+ __mlnx_qos -i $swp --prio_tc=0,1,2,3,4,5,6,7 > /dev/null
+ check_prio_pg "0 1 2 3 4 5 6 7 "
+
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M
+ check_prio_pg "0 0 0 0 0 0 0 0 "
+
+ __mlnx_qos -i $swp --prio2buffer=1,3,5,7,0,2,4,6 > /dev/null
+ check_prio_pg "1 3 5 7 0 2 4 6 "
+
+ tc qdisc delete dev $swp root
+ check_prio_pg "0 1 2 3 4 5 6 7 "
+
+ # Clean up.
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M
+ __mlnx_qos -i $swp --prio2buffer=0,0,0,0,0,0,0,0 > /dev/null
+ tc qdisc delete dev $swp root
+ __mlnx_qos -i $swp --prio_tc=0,0,0,0,0,0,0,0 > /dev/null
+
+ log_test "TC: priomap"
+}
+
+test_tc_sizes()
+{
+ local cell_size=$(devlink_cell_size_get)
+ local size=$((cell_size * 1000))
+
+ RET=0
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 &> /dev/null
+ check_fail $? "buffer_size should fail before qdisc is added"
+
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null
+ check_err $? "buffer_size should pass after qdisc is added"
+ check_buf_size 0 "== $size" "set size: "
+
+ mtu_set $swp 6000
+ check_buf_size 0 "== $size" "set MTU: "
+ mtu_restore $swp
+
+ __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null
+
+ # After replacing the qdisc for the same kind, buffer_size still has to
+ # work.
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1M
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null
+ check_buf_size 0 "== $size" "post replace, set size: "
+
+ __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null
+
+ # Likewise after replacing for a different kind.
+ tc qdisc replace dev $swp root handle 2: prio bands 8
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null
+ check_buf_size 0 "== $size" "post replace different kind, set size: "
+
+ tc qdisc delete dev $swp root
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 &> /dev/null
+ check_fail $? "buffer_size should fail after qdisc is deleted"
+
+ log_test "TC: buffer size"
+}
+
+test_int_buf()
+{
+ local what=$1; shift
+
+ RET=0
+
+ local buf0size=$(get_buf_size 0)
+ local tot_size=$(get_tot_size)
+
+ # Size of internal buffer and buffer 9.
+ local dsize=$((tot_size - buf0size))
+
+ tc qdisc add dev $swp clsact
+ tc filter add dev $swp egress matchall skip_sw action mirred egress mirror dev $swp
+
+ local buf0size_2=$(get_buf_size 0)
+ local tot_size_2=$(get_tot_size)
+ local dsize_2=$((tot_size_2 - buf0size_2))
+
+ # Egress SPAN should have added to the "invisible" buffer configuration.
+ ((dsize_2 > dsize))
+ check_err $? "Invisible buffers account for '$dsize_2', expected '> $dsize'"
+
+ mtu_set $swp 3000
+
+ local buf0size_3=$(get_buf_size 0)
+ local tot_size_3=$(get_tot_size)
+ local dsize_3=$((tot_size_3 - buf0size_3))
+
+ # MTU change might change buffer 0, which will show at total, but the
+ # hidden buffers should stay the same size.
+ ((dsize_3 == dsize_2))
+ check_err $? "MTU change: Invisible buffers account for '$dsize_3', expected '== $dsize_2'"
+
+ mtu_restore $swp
+ tc qdisc del dev $swp clsact
+
+ # After SPAN removal, hidden buffers should be back to the original sizes.
+ local buf0size_4=$(get_buf_size 0)
+ local tot_size_4=$(get_tot_size)
+ local dsize_4=$((tot_size_4 - buf0size_4))
+ ((dsize_4 == dsize))
+ check_err $? "SPAN removed: Invisible buffers account for '$dsize_4', expected '== $dsize'"
+
+ log_test "${what}internal buffer size"
+}
+
+test_tc_int_buf()
+{
+ local cell_size=$(devlink_cell_size_get)
+ local size=$((cell_size * 1000))
+
+ tc qdisc replace dev $swp root handle 1: bfifo limit 1.5M
+ test_int_buf "TC: "
+
+ __mlnx_qos -i $swp --buffer_size=$size,0,0,0,0,0,0,0 > /dev/null
+ test_int_buf "TC+buffsize: "
+
+ __mlnx_qos -i $swp --buffer_size=0,0,0,0,0,0,0,0 > /dev/null
+ tc qdisc delete dev $swp root
+}
+
+trap cleanup EXIT
+
+bail_on_lldpad
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh
new file mode 100644
index 000000000..0bf76f13c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_lib.sh
@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Check that a measured rate exceeds a required minimum.
+# Arguments: $1 - measured rate, $2 - minimum (exclusive), $3 - label that
+# prefixes the diagnostic.
+# Returns 0 if rate > min. Otherwise prints "<label> <rate> < <min>" to stderr,
+# with both numbers passed through humanize, and returns 1.
+check_rate()
+{
+ local rate=$1; shift
+ local min=$1; shift
+ local what=$1; shift
+
+ if ((rate > min)); then
+ return 0
+ fi
+
+ # Report the rate that was actually passed in, instead of relying on a
+ # variable named "ir" happening to be visible from the caller's scope.
+ echo "$what $(humanize $rate) < $(humanize $min)" > /dev/stderr
+ return 1
+}
+
+# Measure the ingress and egress rate of a traffic stream, retrying until the
+# ingress rate is high enough.
+# Arguments:
+# $1 - interface where the traffic ingresses the switch
+# $2 - interface where the traffic ingresses another host
+# $3 - name of the ethtool counter sampled on both interfaces
+# $4 - label used in the diagnostics printed by check_rate
+# Outputs: the last measured "<ingress rate> <egress rate>" pair on stdout.
+# Returns: 0 if one of up to six measurements (each $interval seconds long)
+# exceeded the minimum ingress rate, 1 if none did.
+measure_rate()
+{
+ local sw_in=$1; shift # Where the traffic ingresses the switch
+ local host_in=$1; shift # Where it ingresses another host
+ local counter=$1; shift # Counter to use for measurement
+ local what=$1; shift
+
+ local interval=10
+ local i
+ local ret=0
+
+ # Dips in performance might cause momentary ingress rate to drop below
+ # 1Gbps. That wouldn't saturate egress and MC would thus get through,
+ # seemingly winning bandwidth on account of UC. Demand at least 2Gbps
+ # average ingress rate to somewhat mitigate this.
+ local min_ingress=2147483648
+
+ for i in {5..0}; do
+ # Sample the counter on both interfaces before and after the
+ # measurement interval.
+ local t0=$(ethtool_stats_get $host_in $counter)
+ local u0=$(ethtool_stats_get $sw_in $counter)
+ sleep $interval
+ local t1=$(ethtool_stats_get $host_in $counter)
+ local u1=$(ethtool_stats_get $sw_in $counter)
+
+ # ir: rate seen at the switch ingress; er: rate seen at the host.
+ local ir=$(rate $u0 $u1 $interval)
+ local er=$(rate $t0 $t1 $interval)
+
+ if check_rate $ir $min_ingress "$what ingress rate"; then
+ break
+ fi
+
+ # Fail the test if we can't get the throughput.
+ if ((i == 0)); then
+ ret=1
+ fi
+ done
+
+ echo $ir $er
+ return $ret
+}
+
+bail_on_lldpad()
+{
+ if systemctl is-active --quiet lldpad; then
+
+ cat >/dev/stderr <<-EOF
+ WARNING: lldpad is running
+
+ lldpad will likely configure DCB, and this test will
+ configure Qdiscs. mlxsw does not support both at the
+ same time, one of them is arbitrarily going to overwrite
+ the other. That will cause spurious failures (or,
+ unlikely, passes) of this test.
+ EOF
+
+ if [[ -z $ALLOW_LLDPAD ]]; then
+ cat >/dev/stderr <<-EOF
+
+ If you want to run the test anyway, please set
+ an environment variable ALLOW_LLDPAD to a
+ non-empty string.
+ EOF
+ exit 1
+ else
+ return
+ fi
+ fi
+}
+
+# Run mlnx_qos with the given arguments, discarding whatever it writes to
+# stderr. If it exits with a non-zero status, print a message naming that
+# status and the arguments to stderr.
+# Returns the exit status of mlnx_qos.
+__mlnx_qos()
+{
+ local err
+
+ mlnx_qos "$@" 2>/dev/null
+ # Save the status right away; the commands below would overwrite $?.
+ err=$?
+
+ if ((err)); then
+ echo "Error ($err) in mlnx_qos $@" >/dev/stderr
+ fi
+
+ return $err
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh
new file mode 100755
index 000000000..8f164c80e
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_mc_aware.sh
@@ -0,0 +1,341 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# A test for switch behavior under MC overload. An issue in Spectrum chips
+# causes throughput of UC traffic to drop severely when a switch is under heavy
+# MC load. This issue can be overcome by putting the switch to MC-aware mode.
+# This test verifies that UC performance stays intact even as the switch is
+# under MC flood, and therefore that the MC-aware mode is enabled and correctly
+# configured.
+#
+# Because mlxsw throttles CPU port, the traffic can't actually reach userspace
+# at full speed. That makes it impossible to use iperf3 to simply measure the
+# throughput, because many packets (that reach $h3) don't get to the kernel at
+# all even in UDP mode (the situation is even worse in TCP mode, where one can't
+# hope to see more than a couple Mbps).
+#
+# So instead we send traffic with mausezahn and use RX ethtool counters at $h3.
+# Multicast traffic is untagged, unicast traffic is tagged with PCP 1. Therefore
+# each gets a different priority and we can use per-prio ethtool counters to
+# measure the throughput. In order to avoid prioritizing unicast traffic, prio
+# qdisc is installed on $swp3 and maps all priorities to the same band #7 (and
+# thus TC 0).
+#
+# Mausezahn can't actually saturate the links unless it's using large frames.
+# Thus we set MTU to 10K on all involved interfaces. Then both unicast and
+# multicast traffic uses 8K frames.
+#
+# +---------------------------+ +----------------------------------+
+# | H1 | | H2 |
+# | | | unicast --> + $h2.111 |
+# | multicast | | traffic | 192.0.2.129/28 |
+# | traffic | | | e-qos-map 0:1 |
+# | $h1 + <----- | | | |
+# | 192.0.2.65/28 | | | + $h2 |
+# +---------------|-----------+ +--------------|-------------------+
+# | |
+# +---------------|---------------------------------------|-------------------+
+# | $swp1 + + $swp2 |
+# | >1Gbps | | >1Gbps |
+# | +-------------|------+ +----------|----------------+ |
+# | | $swp1.1 + | | + $swp2.111 | |
+# | | BR1 | SW | BR111 | |
+# | | $swp3.1 + | | + $swp3.111 | |
+# | +-------------|------+ +----------|----------------+ |
+# | \_______________________________________/ |
+# | | |
+# | + $swp3 |
+# | | 1Gbps bottleneck |
+# | | prio qdisc: {0..7} -> 7 |
+# +------------------------------------|--------------------------------------+
+# |
+# +--|-----------------+
+# | + $h3 H3 |
+# | | 192.0.2.66/28 |
+# | | |
+# | + $h3.111 |
+# | 192.0.2.130/28 |
+# +--------------------+
+
+# Test cases, listed in execution order.
+ALL_TESTS="
+ ping_ipv4
+ test_mc_aware
+ test_uc_aware
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=6
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+# Locate qos_lib.sh relative to this script, like the libraries above,
+# instead of depending on the current working directory.
+source $(dirname $0)/qos_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.65/28
+ mtu_set $h1 10000
+}
+
+# Reverse of h1_create: put the MTU back, then remove the interface setup.
+h1_destroy()
+{
+	local h1_addr=192.0.2.65/28
+
+	mtu_restore $h1
+	simple_if_fini $h1 $h1_addr
+}
+
+h2_create()
+{
+ simple_if_init $h2
+ mtu_set $h2 10000
+
+ vlan_create $h2 111 v$h2 192.0.2.129/28
+ ip link set dev $h2.111 type vlan egress-qos-map 0:1
+}
+
+# Reverse of h2_create: drop the VLAN upper, restore the MTU, release $h2.
+h2_destroy()
+{
+	local vid=111
+
+	vlan_destroy $h2 $vid
+
+	mtu_restore $h2
+	simple_if_fini $h2
+}
+
+h3_create()
+{
+ simple_if_init $h3 192.0.2.66/28
+ mtu_set $h3 10000
+
+ vlan_create $h3 111 v$h3 192.0.2.130/28
+}
+
+# Reverse of h3_create: drop the VLAN upper, restore the MTU, release $h3.
+h3_destroy()
+{
+	local vid=111
+
+	vlan_destroy $h3 $vid
+
+	mtu_restore $h3
+	simple_if_fini $h3 192.0.2.66/28
+}
+
+switch_create()
+{
+ ip link set dev $swp1 up
+ mtu_set $swp1 10000
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 10000
+
+ ip link set dev $swp3 up
+ mtu_set $swp3 10000
+
+ vlan_create $swp2 111
+ vlan_create $swp3 111
+
+ ethtool -s $swp3 speed 1000 autoneg off
+ tc qdisc replace dev $swp3 root handle 3: \
+ prio bands 8 priomap 7 7 7 7 7 7 7 7
+
+ ip link add name br1 type bridge vlan_filtering 0
+ ip link set dev br1 up
+ ip link set dev $swp1 master br1
+ ip link set dev $swp3 master br1
+
+ ip link add name br111 type bridge vlan_filtering 0
+ ip link set dev br111 up
+ ip link set dev $swp2.111 master br111
+ ip link set dev $swp3.111 master br111
+
+ # Make sure that ingress quotas are smaller than egress so that there is
+ # room for both streams of traffic to be admitted to shared buffer.
+ devlink_port_pool_th_save $swp1 0
+ devlink_port_pool_th_set $swp1 0 5
+ devlink_tc_bind_pool_th_save $swp1 0 ingress
+ devlink_tc_bind_pool_th_set $swp1 0 ingress 0 5
+
+ devlink_port_pool_th_save $swp2 0
+ devlink_port_pool_th_set $swp2 0 5
+ devlink_tc_bind_pool_th_save $swp2 1 ingress
+ devlink_tc_bind_pool_th_set $swp2 1 ingress 0 5
+
+ devlink_port_pool_th_save $swp3 4
+ devlink_port_pool_th_set $swp3 4 12
+}
+
+# Undo switch_create in reverse order: restore buffer thresholds, delete the
+# bridges, remove the qdisc and re-enable autonegotiation on $swp3, drop the
+# VLAN uppers, then restore MTUs and bring the ports down.
+switch_destroy()
+{
+	local swp
+	local br
+
+	devlink_port_pool_th_restore $swp3 4
+
+	devlink_tc_bind_pool_th_restore $swp2 1 ingress
+	devlink_port_pool_th_restore $swp2 0
+
+	devlink_tc_bind_pool_th_restore $swp1 0 ingress
+	devlink_port_pool_th_restore $swp1 0
+
+	for br in br111 br1; do
+		ip link del dev $br
+	done
+
+	tc qdisc del dev $swp3 root handle 3:
+	ethtool -s $swp3 autoneg on
+
+	for swp in $swp3 $swp2; do
+		vlan_destroy $swp 111
+	done
+
+	for swp in $swp3 $swp2 $swp1; do
+		mtu_restore $swp
+		ip link set dev $swp down
+	done
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ h3mac=$(mac_get $h3)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+# EXIT handler: tear down switch and hosts in the reverse order of
+# setup_prepare.
+cleanup()
+{
+	local teardown
+
+	pre_cleanup
+
+	for teardown in switch_destroy h3_destroy h2_destroy h1_destroy; do
+		$teardown
+	done
+
+	vrf_cleanup
+}
+
+# Connectivity check: ping the VLAN 111 address of $h3 from $h2.
+ping_ipv4()
+{
+	local h3_vlan_addr=192.0.2.130
+
+	ping_test $h2 $h3_vlan_addr
+}
+
+# Compare UC egress throughput measured at $h3 without and with a concurrent
+# broadcast flood from $h1, and require the degradation to be between 15 and
+# 25 percent. A throughput report is printed after the test result.
+test_mc_aware()
+{
+ RET=0
+
+ # Baseline: UC stream alone. Element 0 of the result is the ingress rate
+ # at $swp2, element 1 the rate seen at $h3.
+ local -a uc_rate
+ start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
+ uc_rate=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC-only"))
+ check_err $? "Could not get high enough UC-only ingress rate"
+ stop_traffic
+ local ucth1=${uc_rate[1]}
+
+ # Start the flood ("bc bc" as destination IP and MAC) and keep it running
+ # while UC is measured again.
+ start_traffic $h1 192.0.2.65 bc bc
+
+ # Snapshot time and priority-0 counters to compute the MC rates later.
+ local d0=$(date +%s)
+ local t0=$(ethtool_stats_get $h3 rx_octets_prio_0)
+ local u0=$(ethtool_stats_get $swp1 rx_octets_prio_0)
+
+ local -a uc_rate_2
+ start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
+ uc_rate_2=($(measure_rate $swp2 $h3 rx_octets_prio_1 "UC+MC"))
+ check_err $? "Could not get high enough UC+MC ingress rate"
+ stop_traffic
+ local ucth2=${uc_rate_2[1]}
+
+ local d1=$(date +%s)
+ local t1=$(ethtool_stats_get $h3 rx_octets_prio_0)
+ local u1=$(ethtool_stats_get $swp1 rx_octets_prio_0)
+
+ # Degradation in percent, clamped at 0 when UC+MC was not slower.
+ local deg=$(bc <<< "
+ scale=2
+ ret = 100 * ($ucth1 - $ucth2) / $ucth1
+ if (ret > 0) { ret } else { 0 }
+ ")
+
+ # Minimum shaper of 200Mbps on MC TCs should cause about 20% of
+ # degradation on 1Gbps link.
+ # bc prints 1 when the comparison holds, which check_err treats as an
+ # error.
+ check_err $(bc <<< "$deg < 15") "Minimum shaper not in effect"
+ check_err $(bc <<< "$deg > 25") "MC traffic degrades UC performance too much"
+
+ local interval=$((d1 - d0))
+ local mc_ir=$(rate $u0 $u1 $interval)
+ local mc_er=$(rate $t0 $t1 $interval)
+
+ # Stops the flood started above.
+ stop_traffic
+
+ log_test "UC performance under MC overload"
+
+ echo "UC-only throughput $(humanize $ucth1)"
+ echo "UC+MC throughput $(humanize $ucth2)"
+ echo "Degradation $deg %"
+ echo
+ echo "Full report:"
+ echo " UC only:"
+ echo " ingress UC throughput $(humanize ${uc_rate[0]})"
+ echo " egress UC throughput $(humanize ${uc_rate[1]})"
+ echo " UC+MC:"
+ echo " ingress UC throughput $(humanize ${uc_rate_2[0]})"
+ echo " egress UC throughput $(humanize ${uc_rate_2[1]})"
+ echo " ingress MC throughput $(humanize $mc_ir)"
+ echo " egress MC throughput $(humanize $mc_er)"
+ echo
+}
+
+# While a UC stream from $h2.111 is running, send 50 broadcast ARP requests
+# from $h1 for the address of $h3 and require every one of them to be
+# answered. UC ingress/egress throughput over the test is reported as well.
+test_uc_aware()
+{
+ RET=0
+
+ start_traffic $h2.111 192.0.2.129 192.0.2.130 $h3mac
+
+ # Snapshot time and priority-1 counters for the throughput report.
+ local d0=$(date +%s)
+ local t0=$(ethtool_stats_get $h3 rx_octets_prio_1)
+ local u0=$(ethtool_stats_get $swp2 rx_octets_prio_1)
+ sleep 1
+
+ local attempts=50
+ local passes=0
+ local i
+
+ # One ARP request per iteration, each with a 1 second deadline.
+ for ((i = 0; i < attempts; ++i)); do
+ if $ARPING -c 1 -I $h1 -b 192.0.2.66 -q -w 1; then
+ ((passes++))
+ fi
+
+ sleep 0.1
+ done
+
+ local d1=$(date +%s)
+ local t1=$(ethtool_stats_get $h3 rx_octets_prio_1)
+ local u1=$(ethtool_stats_get $swp2 rx_octets_prio_1)
+
+ local interval=$((d1 - d0))
+ local uc_ir=$(rate $u0 $u1 $interval)
+ local uc_er=$(rate $t0 $t1 $interval)
+
+ # The test fails unless every request got a response.
+ ((attempts == passes))
+ check_err $?
+
+ stop_traffic
+
+ log_test "MC performance under UC overload"
+ echo " ingress UC throughput $(humanize ${uc_ir})"
+ echo " egress UC throughput $(humanize ${uc_er})"
+ echo " sent $attempts BC ARPs, got $passes responses"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh
new file mode 100755
index 000000000..56761de1c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/qos_pfc.sh
@@ -0,0 +1,419 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test injects a 10-MB burst of traffic with VLAN tag and 802.1p priority
+# of 1. This stream is consistently prioritized as priority 1, is put to PG
+# buffer 1, and scheduled at TC 1.
+#
+# - the stream first ingresses through $swp1, where it is forwarded to $swp3
+#
+# - then it ingresses through $swp4. Here it is put to a lossless buffer and put
+# to a small pool ("PFC pool"). The traffic is forwarded to $swp2, which is
+# shaped, and thus the PFC pool eventually fills, therefore the headroom
+# fills, and $swp3 is paused.
+#
+# - since $swp3 now can't send traffic, the traffic ingressing $swp1 is kept at
+# a pool ("overflow pool"). The overflow pool needs to be large enough to
+# contain the whole burst.
+#
+# - eventually the PFC pool gets some traffic out, headroom therefore gets some
+# traffic to the pool, and $swp3 is unpaused again. This way the traffic is
+# gradually forwarded from the overflow pool, through the PFC pool, out of
+# $swp2, and eventually to $h2.
+#
+# - if PFC works, all lossless flow packets that ingress through $swp1 should
+# also be seen ingressing $h2. If it doesn't, there will be drops due to
+# discrepancy between the speeds of $swp1 and $h2.
+#
+# - it should all play out relatively quickly, so that SLL and HLL will not
+# cause drops.
+#
+# +-----------------------+
+# | H1 |
+# | + $h1.111 |
+# | | 192.0.2.33/28 |
+# | | |
+# | + $h1 |
+# +---|-------------------+ +--------------------+
+# | | |
+# +---|----------------------|--------------------|---------------------------+
+# | + $swp1 $swp3 + + $swp4 |
+# | | iPOOL1 iPOOL0 | | iPOOL2 |
+# | | ePOOL4 ePOOL5 | | ePOOL4 |
+# | | 1Gbps | | 1Gbps |
+# | | PFC:enabled=1 | | PFC:enabled=1 |
+# | +-|----------------------|-+ +-|------------------------+ |
+# | | + $swp1.111 $swp3.111 + | | + $swp4.111 | |
+# | | | | | |
+# | | BR1 | | BR2 | |
+# | | | | | |
+# | | | | + $swp2.111 | |
+# | +--------------------------+ +---------|----------------+ |
+# | | |
+# | iPOOL0: 500KB dynamic | |
+# | iPOOL1: 10MB static | |
+# | iPOOL2: 1MB static + $swp2 |
+# | ePOOL4: 500KB dynamic | iPOOL0 |
+# | ePOOL5: 10MB static | ePOOL6 |
+# | ePOOL6: "infinite" static | 200Mbps shaper |
+# +-------------------------------------------------------|-------------------+
+# |
+# +---|-------------------+
+# | + $h2 H2 |
+# | | |
+# | + $h2.111 |
+# | 192.0.2.34/28 |
+# +-----------------------+
+#
+# iPOOL0+ePOOL4 is a helper pool for control traffic etc.
+# iPOOL1+ePOOL5 are overflow pools.
+# iPOOL2+ePOOL6 are PFC pools.
+
+ALL_TESTS="
+ ping_ipv4
+ test_qos_pfc
+"
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+NUM_NETIFS=6
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+
+_1KB=1000
+_100KB=$((100 * _1KB))
+_500KB=$((500 * _1KB))
+_1MB=$((1000 * _1KB))
+_10MB=$((10 * _1MB))
+
+h1_create()
+{
+ simple_if_init $h1
+ mtu_set $h1 10000
+
+ vlan_create $h1 111 v$h1 192.0.2.33/28
+}
+
+# Reverse of h1_create: drop the VLAN upper, restore the MTU, release $h1.
+h1_destroy()
+{
+	local vid=111
+
+	vlan_destroy $h1 $vid
+
+	mtu_restore $h1
+	simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+ mtu_set $h2 10000
+
+ vlan_create $h2 111 v$h2 192.0.2.34/28
+}
+
+# Reverse of h2_create: drop the VLAN upper, restore the MTU, release $h2.
+h2_destroy()
+{
+	local vid=111
+
+	vlan_destroy $h2 $vid
+
+	mtu_restore $h2
+	simple_if_fini $h2
+}
+
+# Configure the switch side of the topology: save and then resize the shared
+# buffer pools, set up the four ports (MTU, VLAN 111 upper, thresholds, ETS
+# qdisc, DCB buffer / PFC settings), and create bridges br1 and br2.
+switch_create()
+{
+ local lanes_swp4
+ local pg1_size
+
+ # pools
+ # -----
+
+ # Remember the current settings of every pool, port quota and TC binding
+ # touched below so that they can be restored later.
+ devlink_pool_size_thtype_save 0
+ devlink_pool_size_thtype_save 4
+ devlink_pool_size_thtype_save 1
+ devlink_pool_size_thtype_save 5
+ devlink_pool_size_thtype_save 2
+ devlink_pool_size_thtype_save 6
+
+ devlink_port_pool_th_save $swp1 1
+ devlink_port_pool_th_save $swp2 6
+ devlink_port_pool_th_save $swp3 5
+ devlink_port_pool_th_save $swp4 2
+
+ devlink_tc_bind_pool_th_save $swp1 1 ingress
+ devlink_tc_bind_pool_th_save $swp2 1 egress
+ devlink_tc_bind_pool_th_save $swp3 1 egress
+ devlink_tc_bind_pool_th_save $swp4 1 ingress
+
+ # Control traffic pools. Just reduce the size. Keep them dynamic so that
+ # we don't need to change all the uninteresting quotas.
+ devlink_pool_size_thtype_set 0 dynamic $_500KB
+ devlink_pool_size_thtype_set 4 dynamic $_500KB
+
+ # Overflow pools.
+ devlink_pool_size_thtype_set 1 static $_10MB
+ devlink_pool_size_thtype_set 5 static $_10MB
+
+ # PFC pools. As per the writ, the size of egress PFC pool should be
+ # infinite, but actually it just needs to be large enough to not matter
+ # in practice, so reuse the 10MB limit.
+ devlink_pool_size_thtype_set 2 static $_1MB
+ devlink_pool_size_thtype_set 6 static $_10MB
+
+ # $swp1
+ # -----
+
+ ip link set dev $swp1 up
+ mtu_set $swp1 10000
+ vlan_create $swp1 111
+ ip link set dev $swp1.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp1 1 $_10MB
+ devlink_tc_bind_pool_th_set $swp1 1 ingress 1 $_10MB
+
+ # Configure qdisc so that we can configure PG and therefore pool
+ # assignment.
+ tc qdisc replace dev $swp1 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ dcb buffer set dev $swp1 prio-buffer all:0 1:1
+
+ # $swp2
+ # -----
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 10000
+ vlan_create $swp2 111
+ ip link set dev $swp2.111 type vlan egress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp2 6 $_10MB
+ devlink_tc_bind_pool_th_set $swp2 1 egress 6 $_10MB
+
+ # prio 0->TC0 (band 7), 1->TC1 (band 6). TC1 is shaped.
+ tc qdisc replace dev $swp2 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ tc qdisc replace dev $swp2 parent 1:7 handle 17: \
+ tbf rate 200Mbit burst 131072 limit 1M
+
+ # $swp3
+ # -----
+
+ ip link set dev $swp3 up
+ mtu_set $swp3 10000
+ vlan_create $swp3 111
+ ip link set dev $swp3.111 type vlan egress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp3 5 $_10MB
+ devlink_tc_bind_pool_th_set $swp3 1 egress 5 $_10MB
+
+ # prio 0->TC0 (band 7), 1->TC1 (band 6)
+ tc qdisc replace dev $swp3 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+
+ # Need to enable PFC so that PAUSE takes effect. Therefore need to put
+ # the lossless prio into a buffer of its own. Don't bother with buffer
+ # sizes though, there is not going to be any pressure in the "backward"
+ # direction.
+ dcb buffer set dev $swp3 prio-buffer all:0 1:1
+ dcb pfc set dev $swp3 prio-pfc all:off 1:on
+
+ # $swp4
+ # -----
+
+ ip link set dev $swp4 up
+ mtu_set $swp4 10000
+ vlan_create $swp4 111
+ ip link set dev $swp4.111 type vlan ingress-qos-map 0:0 1:1
+
+ devlink_port_pool_th_set $swp4 2 $_1MB
+ devlink_tc_bind_pool_th_set $swp4 1 ingress 2 $_1MB
+
+ # Configure qdisc so that we can hand-tune headroom.
+ tc qdisc replace dev $swp4 root handle 1: \
+ ets bands 8 strict 8 priomap 7 6
+ dcb buffer set dev $swp4 prio-buffer all:0 1:1
+ dcb pfc set dev $swp4 prio-pfc all:off 1:on
+ # PG0 will get autoconfigured to Xoff, give PG1 arbitrarily 100K, which
+ # is (-2*MTU) about 80K of delay provision.
+ pg1_size=$_100KB
+
+ setup_wait_dev_with_timeout $swp4
+
+ # Extract the lane count from the "Lanes:" line of the ethtool output.
+ lanes_swp4=$(ethtool $swp4 | grep 'Lanes:')
+ lanes_swp4=${lanes_swp4#*"Lanes: "}
+
+ # 8-lane ports use two buffers among which the configured buffer
+ # is split, so double the size to get twice (20K + 80K).
+ if [[ $lanes_swp4 -eq 8 ]]; then
+ pg1_size=$((pg1_size * 2))
+ fi
+
+ dcb buffer set dev $swp4 buffer-size all:0 1:$pg1_size
+
+ # bridges
+ # -------
+
+ ip link add name br1 type bridge vlan_filtering 0
+ ip link set dev $swp1.111 master br1
+ ip link set dev $swp3.111 master br1
+ ip link set dev br1 up
+
+ ip link add name br2 type bridge vlan_filtering 0
+ ip link set dev $swp2.111 master br2
+ ip link set dev $swp4.111 master br2
+ ip link set dev br2 up
+}
+
+# Undo switch_create: restore the pool settings, remove the bridges, and for
+# each port (from $swp4 down to $swp1) reset DCB settings, delete the qdiscs,
+# restore thresholds, and remove the VLAN upper before bringing the port down.
+switch_destroy()
+{
+ # Do this first so that we can reset the limits to values that are only
+ # valid for the original static / dynamic setting.
+ devlink_pool_size_thtype_restore 6
+ devlink_pool_size_thtype_restore 5
+ devlink_pool_size_thtype_restore 4
+ devlink_pool_size_thtype_restore 2
+ devlink_pool_size_thtype_restore 1
+ devlink_pool_size_thtype_restore 0
+
+ # bridges
+ # -------
+
+ ip link set dev br2 down
+ ip link set dev $swp4.111 nomaster
+ ip link set dev $swp2.111 nomaster
+ ip link del dev br2
+
+ ip link set dev br1 down
+ ip link set dev $swp3.111 nomaster
+ ip link set dev $swp1.111 nomaster
+ ip link del dev br1
+
+ # $swp4
+ # -----
+
+ dcb buffer set dev $swp4 buffer-size all:0
+ dcb pfc set dev $swp4 prio-pfc all:off
+ dcb buffer set dev $swp4 prio-buffer all:0
+ tc qdisc del dev $swp4 root
+
+ devlink_tc_bind_pool_th_restore $swp4 1 ingress
+ devlink_port_pool_th_restore $swp4 2
+
+ vlan_destroy $swp4 111
+ mtu_restore $swp4
+ ip link set dev $swp4 down
+
+ # $swp3
+ # -----
+
+ dcb pfc set dev $swp3 prio-pfc all:off
+ dcb buffer set dev $swp3 prio-buffer all:0
+ tc qdisc del dev $swp3 root
+
+ devlink_tc_bind_pool_th_restore $swp3 1 egress
+ devlink_port_pool_th_restore $swp3 5
+
+ vlan_destroy $swp3 111
+ mtu_restore $swp3
+ ip link set dev $swp3 down
+
+ # $swp2
+ # -----
+
+ # Remove the TBF shaper under class 1:7 before the root ETS qdisc.
+ tc qdisc del dev $swp2 parent 1:7
+ tc qdisc del dev $swp2 root
+
+ devlink_tc_bind_pool_th_restore $swp2 1 egress
+ devlink_port_pool_th_restore $swp2 6
+
+ vlan_destroy $swp2 111
+ mtu_restore $swp2
+ ip link set dev $swp2 down
+
+ # $swp1
+ # -----
+
+ dcb buffer set dev $swp1 prio-buffer all:0
+ tc qdisc del dev $swp1 root
+
+ devlink_tc_bind_pool_th_restore $swp1 1 ingress
+ devlink_port_pool_th_restore $swp1 1
+
+ vlan_destroy $swp1 111
+ mtu_restore $swp1
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ swp4=${NETIFS[p6]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+# EXIT handler: tear down switch and hosts in the reverse order of
+# setup_prepare.
+cleanup()
+{
+	local teardown
+
+	pre_cleanup
+
+	for teardown in switch_destroy h2_destroy h1_destroy; do
+		$teardown
+	done
+
+	vrf_cleanup
+}
+
+# Connectivity check: ping the VLAN 111 address of $h2 from $h1.
+ping_ipv4()
+{
+	local h2_vlan_addr=192.0.2.34
+
+	ping_test $h1 $h2_vlan_addr
+}
+
+# Send a burst of roughly 10MB of priority-1 traffic from $h1 to $h2, then
+# verify from the per-priority counters that the whole burst ingressed $swp1
+# and that exactly as many bytes egressed $swp2.
+test_qos_pfc()
+{
+	# Bytes accounted per packet: 8K of payload + headers.
+	local frame=8050
+	local pkts size
+	local rx_before tx_before rx_after tx_after
+
+	RET=0
+
+	# 10M pool: number of whole packets that fit, and their total size.
+	pkts=$((_10MB / frame))
+	size=$((pkts * frame))
+
+	rx_before=$(ethtool_stats_get $swp1 rx_octets_prio_1)
+	tx_before=$(ethtool_stats_get $swp2 tx_octets_prio_1)
+
+	$MZ $h1 -p 8000 -Q 1:111 -A 192.0.2.33 -B 192.0.2.34 \
+		-a own -b $h2mac -c $pkts -t udp -q
+	sleep 2
+
+	rx_after=$(ethtool_stats_get $swp1 rx_octets_prio_1)
+	tx_after=$(ethtool_stats_get $swp2 tx_octets_prio_1)
+
+	local din=$((rx_after - rx_before))
+	local dout=$((tx_after - tx_before))
+	local pct_in=$((din * 100 / size))
+
+	# Ingress has to be within 5 % of the size of the injected burst.
+	((pct_in > 95 && pct_in < 105))
+	check_err $? "Relative ingress out of expected bounds, $pct_in% should be 100%"
+
+	# Lossless: every ingressed byte must have egressed.
+	((dout == din))
+	check_err $? "$((din - dout)) bytes out of $din ingressed got lost"
+
+	log_test "PFC"
+}
+
+trap cleanup EXIT
+
+bail_on_lldpad
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
new file mode 100644
index 000000000..e93878d42
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/router_scale.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ROUTER_NUM_NETIFS=4
+: ${TIMEOUT:=20000} # ms
+
+# Set up host interface $h1 with its address in 192.0.1.0/24.
+router_h1_create()
+{
+	local h1_addr=192.0.1.1/24
+
+	simple_if_init $h1 $h1_addr
+}
+
+# Reverse of router_h1_create.
+router_h1_destroy()
+{
+	local h1_addr=192.0.1.1/24
+
+	simple_if_fini $h1 $h1_addr
+}
+
+# Set up host interface $h2 with its address in 192.0.2.0/24 and attach an
+# ingress qdisc to it.
+router_h2_create()
+{
+	local h2_addr=192.0.2.1/24
+
+	simple_if_init $h2 $h2_addr
+	tc qdisc add dev $h2 handle ffff: ingress
+}
+
+# Reverse of router_h2_create: remove the ingress qdisc, then release $h2.
+router_h2_destroy()
+{
+	local h2_addr=192.0.2.1/24
+
+	tc qdisc del dev $h2 handle ffff: ingress
+	simple_if_fini $h2 $h2_addr
+}
+
+# Bring both router ports up and give each one an address in the subnet of
+# the host it faces.
+router_create()
+{
+	local rp
+
+	for rp in $rp1 $rp2; do
+		ip link set dev $rp up
+	done
+
+	ip address add 192.0.1.2/24 dev $rp1
+	ip address add 192.0.2.2/24 dev $rp2
+}
+
+# Reverse of router_create: remove the addresses, then bring the router
+# ports down, both in reverse order.
+router_destroy()
+{
+	local rp
+
+	ip address del 192.0.2.2/24 dev $rp2
+	ip address del 192.0.1.2/24 dev $rp1
+
+	for rp in $rp2 $rp1; do
+		ip link set dev $rp down
+	done
+}
+
+# Resolve the four interfaces from NETIFS, record the MAC addresses of $h1
+# and $rp1, and build hosts and router.
+router_setup_prepare()
+{
+	local step
+
+	# Host interfaces.
+	h1=${NETIFS[p1]}
+	h2=${NETIFS[p4]}
+
+	# Router ports.
+	rp1=${NETIFS[p2]}
+	rp2=${NETIFS[p3]}
+
+	h1mac=$(mac_get $h1)
+	rp1mac=$(mac_get $rp1)
+
+	vrf_prepare
+
+	for step in router_h1_create router_h2_create router_create; do
+		$step
+	done
+}
+
+# Predicate used with busywait: print how many more 'offload' marks "ip route"
+# shows than the baseline, and succeed once that number reaches the target.
+#
+# Arguments: $1 - baseline count of 'offload' occurrences
+#            $2 - number of additional offloaded routes to wait for
+# Outputs:   the current difference on stdout
+wait_for_routes()
+{
+	local baseline=$1; shift
+	local wanted=$1; shift
+	local current
+	local offloaded
+
+	current=$(ip route | grep -o 'offload' | wc -l)
+	offloaded=$((current - baseline))
+
+	echo $offloaded
+	[[ $offloaded -ge $wanted ]]
+}
+
+router_routes_create()
+{
+ local route_count=$1
+ local count=0
+
+ ROUTE_FILE="$(mktemp)"
+
+ for i in {0..255}
+ do
+ for j in {0..255}
+ do
+ for k in {0..255}
+ do
+ if [[ $count -eq $route_count ]]; then
+ break 3
+ fi
+
+ echo route add 193.${i}.${j}.${k}/32 dev $rp2 \
+ >> $ROUTE_FILE
+ ((count++))
+ done
+ done
+ done
+
+ ip -b $ROUTE_FILE &> /dev/null
+}
+
+router_routes_destroy()
+{
+ if [[ -v ROUTE_FILE ]]; then
+ rm -f $ROUTE_FILE
+ fi
+}
+
+# Install $1 routes and wait up to $TIMEOUT for that many additional routes
+# to be marked 'offload'.
+#
+# Arguments: $1 - number of routes to install
+#            $2 - should_fail flag passed to check_err_fail
+router_test()
+{
+ local route_count=$1
+ local should_fail=$2
+ local delta
+
+ RET=0
+
+ # Baseline number of 'offload' marks before adding anything.
+ local t0=$(ip route | grep -o 'offload' | wc -l)
+ router_routes_create $route_count
+ # delta receives the last count printed by wait_for_routes.
+ delta=$(busywait "$TIMEOUT" wait_for_routes $t0 $route_count)
+
+ check_err_fail $should_fail $? "Offload routes: Expected $route_count, got $delta."
+ # On error, or when failure was the expected outcome, return without
+ # removing the batch file.
+ if [[ $RET -ne 0 ]] || [[ $should_fail -eq 1 ]]; then
+ return
+ fi
+
+ router_routes_destroy
+}
+
+# Tear everything down in the reverse order of router_setup_prepare,
+# removing the route batch file first.
+router_cleanup()
+{
+	local teardown
+
+	pre_cleanup
+
+	for teardown in router_routes_destroy router_destroy \
+			router_h2_destroy router_h1_destroy; do
+		$teardown
+	done
+
+	vrf_cleanup
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh
new file mode 100755
index 000000000..f4031002d
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/rtnetlink.sh
@@ -0,0 +1,698 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test various interface configuration scenarios. Observe that configurations
+# deemed valid by mlxsw succeed, invalid configurations fail and that no traces
+# are produced. To prevent the test from passing in case traces are produced,
+# the user can set the 'kernel.panic_on_warn' and 'kernel.panic_on_oops'
+# sysctls in their environment.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ rif_set_addr_test
+ rif_vrf_set_addr_test
+ rif_inherit_bridge_addr_test
+ rif_non_inherit_bridge_addr_test
+ vlan_interface_deletion_test
+ bridge_deletion_test
+ bridge_vlan_flags_test
+ vlan_1_test
+ lag_bridge_upper_test
+ duplicate_vlans_test
+ vlan_rif_refcount_test
+ subport_rif_refcount_test
+ vlan_dev_deletion_test
+ lag_unlink_slaves_test
+ lag_dev_deletion_test
+ vlan_interface_uppers_test
+ bridge_extern_learn_test
+ neigh_offload_test
+ nexthop_offload_test
+ devlink_reload_test
+"
+NUM_NETIFS=2
+: ${TIMEOUT:=20000} # ms
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Resolve the two switch ports from NETIFS and bring them up.
+setup_prepare()
+{
+	local swp
+
+	swp1=${NETIFS[p1]}
+	swp2=${NETIFS[p2]}
+
+	for swp in $swp1 $swp2; do
+		ip link set dev $swp up
+	done
+}
+
+# Bring both switch ports down again, in reverse order.
+cleanup()
+{
+	local swp
+
+	pre_cleanup
+
+	for swp in $swp2 $swp1; do
+		ip link set dev $swp down
+	done
+}
+
+# Check that an IP address on a port, and a MAC change on an existing RIF, are
+# rejected with an error mentioning mlxsw_spectrum when the MAC prefix differs
+# from the other RIFs, and accepted once it matches. The original MAC
+# addresses of both ports are restored at the end.
+rif_set_addr_test()
+{
+ local swp1_mac=$(mac_get $swp1)
+ local swp2_mac=$(mac_get $swp2)
+
+ RET=0
+
+ # $swp1 and $swp2 likely got their IPv6 local addresses already, but
+ # here we need to test the transition to RIF.
+ ip addr flush dev $swp1
+ ip addr flush dev $swp2
+ sleep .1
+
+ ip addr add dev $swp1 192.0.2.1/28
+ check_err $?
+
+ ip link set dev $swp1 addr 00:11:22:33:44:55
+ check_err $?
+
+ # IP address enablement should be rejected if the MAC address prefix
+ # doesn't match other RIFs.
+ ip addr add dev $swp2 192.0.2.2/28 &>/dev/null
+ check_fail $? "IP address addition passed for a device with a wrong MAC"
+ # Repeat the command, this time piping only its stderr into grep.
+ ip addr add dev $swp2 192.0.2.2/28 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "no extack for IP address addition"
+
+ ip link set dev $swp2 addr 00:11:22:33:44:66
+ check_err $?
+ ip addr add dev $swp2 192.0.2.2/28 &>/dev/null
+ check_err $?
+
+ # Change of MAC address of a RIF should be forbidden if the new MAC
+ # doesn't share the prefix with other MAC addresses.
+ ip link set dev $swp2 addr 00:11:22:33:00:66 &>/dev/null
+ check_fail $? "change of MAC address passed for a wrong MAC"
+ ip link set dev $swp2 addr 00:11:22:33:00:66 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "no extack for MAC address change"
+
+ log_test "RIF - bad MAC change"
+
+ ip addr del dev $swp2 192.0.2.2/28
+ ip addr del dev $swp1 192.0.2.1/28
+
+ ip link set dev $swp2 addr $swp2_mac
+ ip link set dev $swp1 addr $swp1_mac
+}
+
+# Enslave $swp1 to a newly created VRF device and check that both an IPv4 and
+# an IPv6 address can be added to the VRF.
+rif_vrf_set_addr_test()
+{
+ # Test that it is possible to set an IP address on a VRF upper despite
+ # its random MAC address.
+ RET=0
+
+ ip link add name vrf-test type vrf table 10
+ ip link set dev $swp1 master vrf-test
+
+ ip -4 address add 192.0.2.1/24 dev vrf-test
+ check_err $? "failed to set IPv4 address on VRF"
+ ip -6 address add 2001:db8:1::1/64 dev vrf-test
+ check_err $? "failed to set IPv6 address on VRF"
+
+ log_test "RIF - setting IP address on VRF"
+
+ ip link del dev vrf-test
+}
+
+# With a RIF on $swp1 and another on bridge br1 (which has no explicitly set
+# MAC), check that attaching a dummy device with MAC 00:11:22:33:44:55 to the
+# bridge, and changing the MAC of bridge port $swp2 to that value, are both
+# rejected with an error mentioning mlxsw_spectrum.
+rif_inherit_bridge_addr_test()
+{
+ RET=0
+
+ # Create first RIF
+ ip addr add dev $swp1 192.0.2.1/28
+ check_err $?
+
+ # Create a FID RIF
+ ip link add name br1 up type bridge vlan_filtering 0
+ ip link set dev $swp2 master br1
+ ip addr add dev br1 192.0.2.17/28
+ check_err $?
+
+ # Prepare a device with a low MAC address
+ ip link add name d up type dummy
+ ip link set dev d addr 00:11:22:33:44:55
+
+ # Attach the device to br1. That prompts bridge address change, which
+ # should be vetoed, thus preventing the attachment.
+ ip link set dev d master br1 &>/dev/null
+ check_fail $? "Device with low MAC was permitted to attach a bridge with RIF"
+ # Repeat the command, this time piping only its stderr into grep.
+ ip link set dev d master br1 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "no extack for bridge attach rejection"
+
+ ip link set dev $swp2 addr 00:11:22:33:44:55 &>/dev/null
+ check_fail $? "Changing swp2's MAC address permitted"
+ ip link set dev $swp2 addr 00:11:22:33:44:55 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "no extack for bridge port MAC address change rejection"
+
+ log_test "RIF - attach port with bad MAC to bridge"
+
+ ip link del dev d
+ ip link del dev br1
+ ip addr del dev $swp1 192.0.2.1/28
+}
+
+# Counterpart of rif_inherit_bridge_addr_test: once the MAC address of br1
+# has been set explicitly, attaching a device with MAC 00:11:22:33:44:55 and
+# changing the MAC of bridge port $swp2 must both succeed. The original MAC
+# of $swp2 is restored at the end.
+rif_non_inherit_bridge_addr_test()
+{
+ local swp2_mac=$(mac_get $swp2)
+
+ RET=0
+
+ # Create first RIF
+ ip addr add dev $swp1 192.0.2.1/28
+ check_err $?
+
+ # Create a FID RIF
+ ip link add name br1 up type bridge vlan_filtering 0
+ ip link set dev br1 addr $swp2_mac
+ ip link set dev $swp2 master br1
+ ip addr add dev br1 192.0.2.17/28
+ check_err $?
+
+ # Prepare a device with a low MAC address
+ ip link add name d up type dummy
+ ip link set dev d addr 00:11:22:33:44:55
+
+ # Attach the device to br1. Since the bridge address was set, it should
+ # work.
+ ip link set dev d master br1 &>/dev/null
+ check_err $? "Could not attach a device with low MAC to a bridge with RIF"
+
+ # Port MAC address change should be allowed for a bridge with set MAC.
+ ip link set dev $swp2 addr 00:11:22:33:44:55
+ check_err $? "Changing swp2's MAC address not permitted"
+
+ log_test "RIF - attach port with bad MAC to bridge with set MAC"
+
+ ip link set dev $swp2 addr $swp2_mac
+ ip link del dev d
+ ip link del dev br1
+ ip addr del dev $swp1 192.0.2.1/28
+}
+
+# Create and delete two VLAN uppers of a VLAN-aware bridge in turn, each with
+# the same IPv6 address. The function performs no check_err calls; a problem
+# is expected to show up as a kernel trace.
+vlan_interface_deletion_test()
+{
+ # Test that when a VLAN interface is deleted, its associated router
+ # interface (RIF) is correctly deleted and not leaked. See commit
+ # c360867ec46a ("mlxsw: spectrum: Delete RIF when VLAN device is
+ # removed") for more details
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+
+ ip link add link br0 name br0.10 type vlan id 10
+ ip -6 address add 2001:db8:1::1/64 dev br0.10
+ ip link del dev br0.10
+
+ # If we leaked the previous RIF, then this should produce a trace
+ ip link add link br0 name br0.20 type vlan id 20
+ ip -6 address add 2001:db8:1::1/64 dev br0.20
+ ip link del dev br0.20
+
+ log_test "vlan interface deletion"
+
+ ip link del dev br0
+}
+
+# Delete a VLAN-aware bridge that has addresses on itself and on two VLAN
+# uppers, then add and remove an address on $swp1. The function performs no
+# check_err calls; a problem is expected to show up as a kernel trace.
+bridge_deletion_test()
+{
+ # Test that when a bridge with VLAN interfaces is deleted, we correctly
+ # delete the associated RIFs. See commit 602b74eda813 ("mlxsw:
+ # spectrum_switchdev: Do not leak RIFs when removing bridge") for more
+ # details
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+ ip -6 address add 2001:db8::1/64 dev br0
+
+ ip link add link br0 name br0.10 type vlan id 10
+ ip -6 address add 2001:db8:1::1/64 dev br0.10
+
+ ip link add link br0 name br0.20 type vlan id 20
+ ip -6 address add 2001:db8:2::1/64 dev br0.20
+
+ ip link del dev br0
+
+ # If we leaked previous RIFs, then this should produce a trace
+ ip -6 address add 2001:db8:1::1/64 dev $swp1
+ ip -6 address del 2001:db8:1::1/64 dev $swp1
+
+ log_test "bridge deletion"
+}
+
+# Re-add VLAN 10 on $swp1 with each combination of the pvid / untagged flags,
+# delete the bridge and reload the devlink device. The function performs no
+# check_err calls; a problem is expected to show up as a kernel trace.
+bridge_vlan_flags_test()
+{
+ # Test that when bridge VLAN flags are toggled, we do not take
+ # unnecessary references on related structs. See commit 9e25826ffc94
+ # ("mlxsw: spectrum_switchdev: Fix port_vlan refcounting") for more
+ # details
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+
+ bridge vlan add vid 10 dev $swp1 pvid untagged
+ bridge vlan add vid 10 dev $swp1 untagged
+ bridge vlan add vid 10 dev $swp1 pvid
+ bridge vlan add vid 10 dev $swp1
+ ip link del dev br0
+
+ # If we did not handle references correctly, then this should produce a
+ # trace
+ devlink dev reload "$DEVLINK_DEV"
+
+ # Allow netdevices to be re-created following the reload
+ sleep 20
+
+ log_test "bridge vlan flags"
+}
+
+# Check that a VLAN device with VLAN ID 1 can be created on top of $swp1.
+vlan_1_test()
+{
+ # Test that VLAN 1 can be configured over mlxsw ports. In the past it
+ # was used internally for untagged traffic. See commit 47bf9df2e820
+ # ("mlxsw: spectrum: Forbid creation of VLAN 1 over port/LAG") for more
+ # details
+ RET=0
+
+ ip link add link $swp1 name $swp1.1 type vlan id 1
+ check_err $? "did not manage to create vlan 1 when should"
+
+ log_test "vlan 1"
+
+ ip link del dev $swp1.1
+}
+
+# Check that enslaving $swp1 to a bond that is already a bridge port is
+# rejected, then add and remove an address on $swp1 to exercise the state
+# left behind by the failed enslavement.
+# NOTE(review): $swp1 is set down here and not brought back up by this
+# function.
+lag_bridge_upper_test()
+{
+ # Test that ports cannot be enslaved to LAG devices that have uppers
+ # and that failure is handled gracefully. See commit b3529af6bb0d
+ # ("spectrum: Reference count VLAN entries") for more details
+ RET=0
+
+ ip link add name bond1 type bond mode 802.3ad
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev bond1 master br0
+
+ ip link set dev $swp1 down
+ ip link set dev $swp1 master bond1 &> /dev/null
+ check_fail $? "managed to enslave port to lag when should not"
+
+ # This might generate a trace, if we did not handle the failure
+ # correctly
+ ip -6 address add 2001:db8:1::1/64 dev $swp1
+ ip -6 address del 2001:db8:1::1/64 dev $swp1
+
+ log_test "lag with bridge upper"
+
+ ip link del dev br0
+ ip link del dev bond1
+}
+
+# Check both directions of the exclusion: with VLAN 10 configured as a bridge
+# VLAN on $swp1 a VLAN device for it cannot be created, and with the VLAN
+# device in place the bridge VLAN cannot be added.
+duplicate_vlans_test()
+{
+ # Test that on a given port a VLAN is only used once. Either as VLAN
+ # in a VLAN-aware bridge or as a VLAN device
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+ bridge vlan add vid 10 dev $swp1
+
+ ip link add link $swp1 name $swp1.10 type vlan id 10 &> /dev/null
+ check_fail $? "managed to create vlan device when should not"
+
+ bridge vlan del vid 10 dev $swp1
+ ip link add link $swp1 name $swp1.10 type vlan id 10
+ check_err $? "did not manage to create vlan device when should"
+ bridge vlan add vid 10 dev $swp1 &> /dev/null
+ check_fail $? "managed to add bridge vlan when should not"
+
+ log_test "duplicate vlans"
+
+ ip link del dev $swp1.10
+ ip link del dev br0
+}
+
+vlan_rif_refcount_test()
+{
+ # Test that RIFs representing VLAN interfaces are not affected by
+ # ports that are members of the VLAN. We use the offload indication on
+ # routes configured on the RIF to understand if it was created / destroyed
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+
+ ip link set dev $swp1 up
+ ip link set dev br0 up
+
+ ip link add link br0 name br0.10 up type vlan id 10
+ ip -6 address add 2001:db8:1::1/64 dev br0.10
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev br0.10
+ check_err $? "vlan rif was not created before adding port to vlan"
+
+ bridge vlan add vid 10 dev $swp1
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev br0.10
+ check_err $? "vlan rif was destroyed after adding port to vlan"
+
+ bridge vlan del vid 10 dev $swp1
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev br0.10
+ check_err $? "vlan rif was destroyed after removing port from vlan"
+
+ ip link set dev $swp1 nomaster
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev br0.10
+ check_err $? "vlan rif was not destroyed after unlinking port from bridge"
+
+ log_test "vlan rif refcount"
+
+ ip link del dev br0.10
+ ip link set dev $swp1 down
+ ip link del dev br0
+}
+
+subport_rif_refcount_test()
+{
+ # Test that RIFs representing upper devices of physical ports are
+ # reference counted correctly and destroyed when should. We use the
+ # offload indication on routes configured on the RIF to understand if
+ # it was created / destroyed
+ RET=0
+
+ ip link add name bond1 type bond mode 802.3ad
+ ip link set dev $swp1 down
+ ip link set dev $swp2 down
+ ip link set dev $swp1 master bond1
+ ip link set dev $swp2 master bond1
+
+ ip link set dev bond1 up
+ ip link add link bond1 name bond1.10 up type vlan id 10
+ ip -6 address add 2001:db8:1::1/64 dev bond1
+ ip -6 address add 2001:db8:2::1/64 dev bond1.10
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev bond1
+ check_err $? "subport rif was not created on lag device"
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10
+ check_err $? "subport rif was not created on vlan device"
+
+ ip link set dev $swp1 nomaster
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev bond1
+ check_err $? "subport rif of lag device was destroyed when should not"
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10
+ check_err $? "subport rif of vlan device was destroyed when should not"
+
+ ip link set dev $swp2 nomaster
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:1::2 dev bond1
+ check_err $? "subport rif of lag device was not destroyed when should"
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip -6 route get fibmatch 2001:db8:2::2 dev bond1.10
+ check_err $? "subport rif of vlan device was not destroyed when should"
+
+ log_test "subport rif refcount"
+
+ ip link del dev bond1.10
+ ip link del dev bond1
+}
+
+vlan_dev_deletion_test()
+{
+ # Test that VLAN devices are correctly deleted / unlinked when enslaved
+ # to bridge
+ RET=0
+
+ ip link add name br10 type bridge
+ ip link add name br20 type bridge
+ ip link add name br30 type bridge
+ ip link add link $swp1 name $swp1.10 type vlan id 10
+ ip link add link $swp1 name $swp1.20 type vlan id 20
+ ip link add link $swp1 name $swp1.30 type vlan id 30
+ ip link set dev $swp1.10 master br10
+ ip link set dev $swp1.20 master br20
+ ip link set dev $swp1.30 master br30
+
+ # If we did not handle the situation correctly, then these operations
+ # might produce a trace
+ ip link set dev $swp1.30 nomaster
+ ip link del dev $swp1.20
+ # Deletion via ioctl uses different code paths from netlink. If vconfig
+ # fails (e.g. not installed), still delete the device so it cannot leak
+ vconfig rem $swp1.10 &> /dev/null || ip link del dev $swp1.10
+ log_test "vlan device deletion"
+
+ ip link del dev $swp1.30
+ ip link del dev br30
+ ip link del dev br20
+ ip link del dev br10
+}
+
+lag_create()
+{
+ ip link add name bond1 type bond mode 802.3ad
+ ip link set dev $swp1 down
+ ip link set dev $swp2 down
+ ip link set dev $swp1 master bond1
+ ip link set dev $swp2 master bond1
+
+ ip link add link bond1 name bond1.10 type vlan id 10
+ ip link add link bond1 name bond1.20 type vlan id 20
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev bond1 master br0
+
+ ip link add name br10 type bridge
+ ip link set dev bond1.10 master br10
+
+ ip link add name br20 type bridge
+ ip link set dev bond1.20 master br20
+}
+
+lag_unlink_slaves_test()
+{
+ # Test that ports are correctly unlinked from their LAG master, when
+ # the LAG and its VLAN uppers are enslaved to bridges
+ RET=0
+
+ lag_create
+
+ ip link set dev $swp1 nomaster
+ check_err $? "lag slave $swp1 was not unlinked from master"
+ ip link set dev $swp2 nomaster
+ check_err $? "lag slave $swp2 was not unlinked from master"
+
+ # Try to configure corresponding VLANs as router interfaces
+ ip -6 address add 2001:db8:1::1/64 dev $swp1
+ check_err $? "failed to configure ip address on $swp1"
+
+ ip link add link $swp1 name $swp1.10 type vlan id 10
+ ip -6 address add 2001:db8:10::1/64 dev $swp1.10
+ check_err $? "failed to configure ip address on $swp1.10"
+
+ ip link add link $swp1 name $swp1.20 type vlan id 20
+ ip -6 address add 2001:db8:20::1/64 dev $swp1.20
+ check_err $? "failed to configure ip address on $swp1.20"
+
+ log_test "lag slaves unlinking"
+
+ ip link del dev $swp1.20
+ ip link del dev $swp1.10
+ ip address flush dev $swp1
+
+ ip link del dev br20
+ ip link del dev br10
+ ip link del dev br0
+ ip link del dev bond1
+}
+
+lag_dev_deletion_test()
+{
+ # Test that LAG device is correctly deleted, when the LAG and its VLAN
+ # uppers are enslaved to bridges
+ RET=0
+
+ lag_create
+
+ ip link del dev bond1
+
+ log_test "lag device deletion"
+
+ ip link del dev br20
+ ip link del dev br10
+ ip link del dev br0
+}
+
+vlan_interface_uppers_test()
+{
+ # Test that uppers of a VLAN interface are correctly sanitized
+ RET=0
+
+ ip link add name br0 type bridge vlan_filtering 1
+ ip link set dev $swp1 master br0
+
+ ip link add link br0 name br0.10 type vlan id 10
+ ip link add link br0.10 name macvlan0 \
+ type macvlan mode private &> /dev/null
+ check_fail $? "managed to create a macvlan when should not"
+
+ ip -6 address add 2001:db8:1::1/64 dev br0.10
+ ip link add link br0.10 name macvlan0 type macvlan mode private
+ check_err $? "did not manage to create a macvlan when should"
+
+ ip link del dev macvlan0
+
+ ip link add name vrf-test type vrf table 10
+ ip link set dev br0.10 master vrf-test
+ check_err $? "did not manage to enslave vlan interface to vrf"
+ ip link del dev vrf-test
+
+ ip link add name br-test type bridge
+ ip link set dev br0.10 master br-test &> /dev/null
+ check_fail $? "managed to enslave vlan interface to bridge when should not"
+ ip link del dev br-test
+
+ log_test "vlan interface uppers"
+
+ ip link del dev br0
+}
+
+bridge_extern_learn_test()
+{
+ # Test that externally learned entries added from user space are
+ # marked as offloaded
+ RET=0
+
+ ip link add name br0 type bridge
+ ip link set dev $swp1 master br0
+
+ bridge fdb add de:ad:be:ef:13:37 dev $swp1 master extern_learn
+
+ busywait "$TIMEOUT" wait_for_offload \
+ bridge fdb show brport $swp1 de:ad:be:ef:13:37
+ check_err $? "fdb entry not marked as offloaded when should"
+
+ log_test "externally learned fdb entry"
+
+ ip link del dev br0
+}
+
+neigh_offload_test()
+{
+ # Test that IPv4 and IPv6 neighbour entries are marked as offloaded
+ RET=0
+
+ ip -4 address add 192.0.2.1/24 dev $swp1
+ ip -6 address add 2001:db8:1::1/64 dev $swp1
+
+ ip -4 neigh add 192.0.2.2 lladdr de:ad:be:ef:13:37 nud perm dev $swp1
+ ip -6 neigh add 2001:db8:1::2 lladdr de:ad:be:ef:13:37 nud perm \
+ dev $swp1
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -4 neigh show dev $swp1 192.0.2.2
+ check_err $? "ipv4 neigh entry not marked as offloaded when should"
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 neigh show dev $swp1 2001:db8:1::2
+ check_err $? "ipv6 neigh entry not marked as offloaded when should"
+
+ log_test "neighbour offload indication"
+
+ ip -6 neigh del 2001:db8:1::2 dev $swp1
+ ip -4 neigh del 192.0.2.2 dev $swp1
+ ip -6 address del 2001:db8:1::1/64 dev $swp1
+ ip -4 address del 192.0.2.1/24 dev $swp1
+}
+
+nexthop_offload_test()
+{
+ # Test that IPv4 and IPv6 nexthops are marked as offloaded
+ RET=0
+
+ sysctl_set net.ipv6.conf.$swp2.keep_addr_on_down 1
+ simple_if_init $swp1 192.0.2.1/24 2001:db8:1::1/64
+ simple_if_init $swp2 192.0.2.2/24 2001:db8:1::2/64
+ setup_wait
+
+ ip -4 route add 198.51.100.0/24 vrf v$swp1 \
+ nexthop via 192.0.2.2 dev $swp1
+ ip -6 route add 2001:db8:2::/64 vrf v$swp1 \
+ nexthop via 2001:db8:1::2 dev $swp1
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -4 route show 198.51.100.0/24 vrf v$swp1
+ check_err $? "ipv4 nexthop not marked as offloaded when should"
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route show 2001:db8:2::/64 vrf v$swp1
+ check_err $? "ipv6 nexthop not marked as offloaded when should"
+
+ ip link set dev $swp2 down
+ sleep 1
+
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip -4 route show 198.51.100.0/24 vrf v$swp1
+ check_err $? "ipv4 nexthop marked as offloaded when should not"
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip -6 route show 2001:db8:2::/64 vrf v$swp1
+ check_err $? "ipv6 nexthop marked as offloaded when should not"
+
+ ip link set dev $swp2 up
+ setup_wait
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -4 route show 198.51.100.0/24 vrf v$swp1
+ check_err $? "ipv4 nexthop not marked as offloaded after neigh add"
+ busywait "$TIMEOUT" wait_for_offload \
+ ip -6 route show 2001:db8:2::/64 vrf v$swp1
+ check_err $? "ipv6 nexthop not marked as offloaded after neigh add"
+
+ log_test "nexthop offload indication"
+
+ ip -6 route del 2001:db8:2::/64 vrf v$swp1
+ ip -4 route del 198.51.100.0/24 vrf v$swp1
+
+ simple_if_fini $swp2 192.0.2.2/24 2001:db8:1::2/64
+ simple_if_fini $swp1 192.0.2.1/24 2001:db8:1::1/64
+ sysctl_restore net.ipv6.conf.$swp2.keep_addr_on_down
+}
+
+devlink_reload_test()
+{
+ # Test that after executing all the above configuration tests, a
+ # devlink reload can be performed without errors
+ RET=0
+
+ devlink dev reload "$DEVLINK_DEV"
+ check_err $? "devlink reload failed"
+
+ log_test "devlink reload - last test"
+
+ sleep 20
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh
new file mode 100755
index 000000000..af64bc9ea
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_ets.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A driver for the ETS selftest that implements testing in offloaded datapath.
+lib_dir=$(dirname $0)/../../../net/forwarding
+source $lib_dir/sch_ets_core.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+
+ALL_TESTS="
+ ping_ipv4
+ priomap_mode
+ ets_test_strict
+ ets_test_mixed
+ ets_test_dwrr
+"
+
+switch_create()
+{
+ ets_switch_create
+
+ # Create a bottleneck so that the DWRR process can kick in.
+ ethtool -s $h2 speed 1000 autoneg off
+ ethtool -s $swp2 speed 1000 autoneg off
+
+ # Set the ingress quota high and use the three egress TCs to limit the
+ # amount of traffic that is admitted to the shared buffers. This makes
+ # sure that there is always enough traffic of all types to select from
+ # for the DWRR process.
+ devlink_port_pool_th_save $swp1 0
+ devlink_port_pool_th_set $swp1 0 12
+ devlink_tc_bind_pool_th_save $swp1 0 ingress
+ devlink_tc_bind_pool_th_set $swp1 0 ingress 0 12
+ devlink_port_pool_th_save $swp2 4
+ devlink_port_pool_th_set $swp2 4 12
+ devlink_tc_bind_pool_th_save $swp2 7 egress
+ devlink_tc_bind_pool_th_set $swp2 7 egress 4 5
+ devlink_tc_bind_pool_th_save $swp2 6 egress
+ devlink_tc_bind_pool_th_set $swp2 6 egress 4 5
+ devlink_tc_bind_pool_th_save $swp2 5 egress
+ devlink_tc_bind_pool_th_set $swp2 5 egress 4 5
+
+ # Note: sch_ets_core.sh uses VLAN ingress-qos-map to assign packet
+ # priorities at $swp1 based on their 802.1p headers. ingress-qos-map is
+ # not offloaded by mlxsw as of this writing, but the mapping used is
+ # 1:1, which is the mapping currently hard-coded by the driver.
+}
+
+switch_destroy()
+{
+ devlink_tc_bind_pool_th_restore $swp2 5 egress
+ devlink_tc_bind_pool_th_restore $swp2 6 egress
+ devlink_tc_bind_pool_th_restore $swp2 7 egress
+ devlink_port_pool_th_restore $swp2 4
+ devlink_tc_bind_pool_th_restore $swp1 0 ingress
+ devlink_port_pool_th_restore $swp1 0
+
+ ethtool -s $swp2 autoneg on
+ ethtool -s $h2 autoneg on
+
+ ets_switch_destroy
+}
+
+# Callback from sch_ets_tests.sh. Queries .bytes under 10:(stream + 1) per stream.
+collect_stats()
+{
+ local -a streams=("$@")
+ local stream
+
+ # Wait for qdisc counter update so that we don't get it mid-way through.
+ busywait_for_counter 1000 +1 \
+ qdisc_parent_stats_get $swp2 10:$((${streams[0]} + 1)) .bytes \
+ > /dev/null
+
+ for stream in "${streams[@]}"; do
+ qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes
+ done
+}
+
+bail_on_lldpad
+ets_run
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
new file mode 100644
index 000000000..33ddd0168
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh
@@ -0,0 +1,657 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends a >1Gbps stream of traffic from H1, to the switch, which
+# forwards it to a 1Gbps port. This 1Gbps stream is then looped back to the
+# switch and forwarded to the port under test $swp3, which is also 1Gbps.
+#
+# This way, $swp3 should be 100% filled with traffic without any of it spilling
+# to the backlog. Any extra packets sent should almost 1:1 go to backlog. That
+# is what H2 is used for--it sends the extra traffic to create backlog.
+#
+# A RED Qdisc is installed on $swp3. The configuration is such that the minimum
+# and maximum size are 1 byte apart, so there is a very clear border under which
+# no marking or dropping takes place, and above which everything is marked or
+# dropped.
+#
+# The test uses the buffer build-up behavior to test the installed RED.
+#
+# In order to test WRED, $swp3 actually contains RED under PRIO, with two
+# different configurations. Traffic is prioritized using 802.1p and relies on
+# the implicit mlxsw configuration, where packet priority is taken 1:1 from the
+# 802.1p marking.
+#
+# +--------------------------+ +--------------------------+
+# | H1 | | H2 |
+# | + $h1.10 | | + $h2.10 |
+# | | 192.0.2.1/28 | | | 192.0.2.2/28 |
+# | | | | | |
+# | | $h1.11 + | | | $h2.11 + |
+# | | 192.0.2.17/28 | | | | 192.0.2.18/28 | |
+# | | | | | | | |
+# | \______ ______/ | | \______ ______/ |
+# | \ / | | \ / |
+# | + $h1 | | + $h2 |
+# +-------------|------------+ +-------------|------------+
+# | >1Gbps |
+# +-------------|------------------------------------------------|------------+
+# | SW + $swp1 + $swp2 |
+# | _______/ \___________ ___________/ \_______ |
+# | / \ / \ |
+# | +-|-----------------+ | +-|-----------------+ | |
+# | | + $swp1.10 | | | + $swp2.10 | | |
+# | | | | .-------------+ $swp5.10 | | |
+# | | BR1_10 | | | | | | |
+# | | | | | | BR2_10 | | |
+# | | + $swp2.10 | | | | | | |
+# | +-|-----------------+ | | | + $swp3.10 | | |
+# | | | | +-|-----------------+ | |
+# | | +-----------------|-+ | | +-----------------|-+ |
+# | | | $swp1.11 + | | | | $swp2.11 + | |
+# | | | | | .-----------------+ $swp5.11 | |
+# | | | BR1_11 | | | | | | |
+# | | | | | | | | BR2_11 | |
+# | | | $swp2.11 + | | | | | | |
+# | | +-----------------|-+ | | | | $swp3.11 + | |
+# | | | | | | +-----------------|-+ |
+# | \_______ ___________/ | | \___________ _______/ |
+# | \ / \ / \ / |
+# | + $swp4 + $swp5 + $swp3 |
+# +-------------|----------------------|-------------------------|------------+
+# | | | 1Gbps
+# \________1Gbps_________/ |
+# +----------------------------|------------+
+# | H3 + $h3 |
+# | _____________________/ \_______ |
+# | / \ |
+# | | | |
+# | + $h3.10 $h3.11 + |
+# | 192.0.2.3/28 192.0.2.19/28 |
+# +-----------------------------------------+
+
+NUM_NETIFS=8
+CHECK_TC="yes"
+lib_dir=$(dirname $0)/../../../net/forwarding
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source qos_lib.sh
+
+ipaddr()
+{
+ local host=$1; shift
+ local vlan=$1; shift
+
+ echo 192.0.2.$((16 * (vlan - 10) + host))
+}
+
+host_create()
+{
+ local dev=$1; shift
+ local host=$1; shift
+
+ simple_if_init $dev
+ mtu_set $dev 10000
+
+ vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+ ip link set dev $dev.10 type vlan egress 0:0
+
+ vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+ ip link set dev $dev.11 type vlan egress 0:1
+}
+
+host_destroy()
+{
+ local dev=$1; shift
+
+ vlan_destroy $dev 11
+ vlan_destroy $dev 10
+ mtu_restore $dev
+ simple_if_fini $dev
+}
+
+h1_create()
+{
+ host_create $h1 1
+}
+
+h1_destroy()
+{
+ host_destroy $h1
+}
+
+h2_create()
+{
+ host_create $h2 2
+ tc qdisc add dev $h2 clsact
+
+ # Some of the tests in this suite use multicast traffic. As this traffic
+ # enters BR2_10 resp. BR2_11, it is flooded to all other ports. Thus
+ # e.g. traffic ingressing through $swp2 is flooded to $swp3 (the
+ # intended destination) and $swp5 (which is intended as ingress for
+ # another stream of traffic).
+ #
+ # This is generally not a problem, but if the $swp5 throughput is lower
+ # than $swp2 throughput, there will be a build-up at $swp5. That may
+ # cause packets to fail to queue up at $swp3 due to shared buffer
+ # quotas, and the test to spuriously fail.
+ #
+ # Prevent this by setting the speed of $h2 to 1Gbps.
+
+ ethtool -s $h2 speed 1000 autoneg off
+}
+
+h2_destroy()
+{
+ ethtool -s $h2 autoneg on
+ tc qdisc del dev $h2 clsact
+ host_destroy $h2
+}
+
+h3_create()
+{
+ host_create $h3 3
+ ethtool -s $h3 speed 1000 autoneg off
+}
+
+h3_destroy()
+{
+ ethtool -s $h3 autoneg on
+ host_destroy $h3
+}
+
+switch_create()
+{
+ local intf
+ local vlan
+
+ ip link add dev br1_10 type bridge
+ ip link add dev br1_11 type bridge
+
+ ip link add dev br2_10 type bridge
+ ip link add dev br2_11 type bridge
+
+ for intf in $swp1 $swp2 $swp3 $swp4 $swp5; do
+ ip link set dev $intf up
+ mtu_set $intf 10000
+ done
+
+ for intf in $swp1 $swp4; do
+ for vlan in 10 11; do
+ vlan_create $intf $vlan
+ ip link set dev $intf.$vlan master br1_$vlan
+ ip link set dev $intf.$vlan up
+ done
+ done
+
+ for intf in $swp2 $swp3 $swp5; do
+ for vlan in 10 11; do
+ vlan_create $intf $vlan
+ ip link set dev $intf.$vlan master br2_$vlan
+ ip link set dev $intf.$vlan up
+ done
+ done
+
+ ip link set dev $swp4.10 type vlan egress 0:0
+ ip link set dev $swp4.11 type vlan egress 0:1
+ for intf in $swp1 $swp2 $swp5; do
+ for vlan in 10 11; do
+ ip link set dev $intf.$vlan type vlan ingress 0:0 1:1
+ done
+ done
+
+ for intf in $swp2 $swp3 $swp4 $swp5; do
+ ethtool -s $intf speed 1000 autoneg off
+ done
+
+ ip link set dev br1_10 up
+ ip link set dev br1_11 up
+ ip link set dev br2_10 up
+ ip link set dev br2_11 up
+
+ local size=$(devlink_pool_size_thtype 0 | cut -d' ' -f 1)
+ devlink_port_pool_th_save $swp3 8
+ devlink_port_pool_th_set $swp3 8 $size
+}
+
+switch_destroy()
+{
+ local intf
+ local vlan
+
+ devlink_port_pool_th_restore $swp3 8
+
+ tc qdisc del dev $swp3 root 2>/dev/null
+
+ ip link set dev br2_11 down
+ ip link set dev br2_10 down
+ ip link set dev br1_11 down
+ ip link set dev br1_10 down
+
+ for intf in $swp5 $swp4 $swp3 $swp2; do
+ ethtool -s $intf autoneg on
+ done
+
+ for intf in $swp5 $swp3 $swp2 $swp4 $swp1; do
+ for vlan in 11 10; do
+ ip link set dev $intf.$vlan down
+ ip link set dev $intf.$vlan nomaster
+ vlan_destroy $intf $vlan
+ done
+
+ mtu_restore $intf
+ ip link set dev $intf down
+ done
+
+ ip link del dev br2_11
+ ip link del dev br2_10
+ ip link del dev br1_11
+ ip link del dev br1_10
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ swp4=${NETIFS[p7]}
+ swp5=${NETIFS[p8]}
+
+ h3_mac=$(mac_get $h3)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 $(ipaddr 3 10) " from host 1, vlan 10"
+ ping_test $h1.11 $(ipaddr 3 11) " from host 1, vlan 11"
+ ping_test $h2.10 $(ipaddr 3 10) " from host 2, vlan 10"
+ ping_test $h2.11 $(ipaddr 3 11) " from host 2, vlan 11"
+}
+
+get_tc()
+{
+ local vlan=$1; shift
+
+ echo $((vlan - 10))
+}
+
+get_qdisc_handle()
+{
+ local vlan=$1; shift
+
+ local tc=$(get_tc $vlan)
+ local band=$((8 - tc))
+
+ # Handle is 107: for TC1, 108: for TC0.
+ echo "10$band:"
+}
+
+get_qdisc_backlog()
+{
+ local vlan=$1; shift
+
+ qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .backlog
+}
+
+get_mc_transmit_queue()
+{
+ local vlan=$1; shift
+
+ local tc=$(($(get_tc $vlan) + 8))
+ ethtool_stats_get $swp3 tc_transmit_queue_tc_$tc
+}
+
+get_nmarked()
+{
+ local vlan=$1; shift
+
+ ethtool_stats_get $swp3 ecn_marked
+}
+
+get_qdisc_npackets()
+{
+ local vlan=$1; shift
+
+ busywait_for_counter 1100 +1 \
+ qdisc_stats_get $swp3 $(get_qdisc_handle $vlan) .packets
+}
+
+send_packets()
+{
+ local vlan=$1; shift
+ local proto=$1; shift
+ local pkts=$1; shift
+
+ $MZ $h2.$vlan -p 8000 -a own -b $h3_mac \
+ -A $(ipaddr 2 $vlan) -B $(ipaddr 3 $vlan) \
+ -t $proto -q -c $pkts "$@"
+}
+
+# This sends traffic in an attempt to build a backlog of $size. Returns 0 on
+# success. After 11 failed attempts it bails out and returns 1. It dumps the
+# backlog size to stdout.
+build_backlog()
+{
+ local vlan=$1; shift
+ local size=$1; shift
+ local proto=$1; shift
+
+ local cur=-1
+ local diff
+ local pkts
+ local i=0
+
+ while :; do
+ cur=$(busywait 1100 until_counter_is "> $cur" \
+ get_qdisc_backlog $vlan)
+ diff=$((size - cur))
+ pkts=$(((diff + 7999) / 8000)) # ceil(diff / 8000)
+
+ if ((cur >= size)); then
+ echo $cur
+ return 0
+ elif ((i++ > 10)); then
+ echo $cur
+ return 1
+ fi
+
+ send_packets $vlan $proto $pkts "$@"
+ done
+}
+
+check_marking()
+{
+ local vlan=$1; shift
+ local cond=$1; shift
+
+ local npackets_0=$(get_qdisc_npackets $vlan)
+ local nmarked_0=$(get_nmarked $vlan)
+ sleep 5
+ local npackets_1=$(get_qdisc_npackets $vlan)
+ local nmarked_1=$(get_nmarked $vlan)
+
+ local nmarked_d=$((nmarked_1 - nmarked_0))
+ local npackets_d=$((npackets_1 - npackets_0))
+ local pct=$((100 * nmarked_d / npackets_d))
+
+ echo $pct
+ ((pct $cond))
+}
+
+ecn_test_common()
+{
+ local name=$1; shift
+ local vlan=$1; shift
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Build the below-the-limit backlog using UDP. We could use TCP just
+ # fine, but this way we get a proof that UDP is accepted when queue
+ # length is below the limit. The main stream is using TCP, and if the
+ # limit is misconfigured, we would see this traffic being ECN marked.
+ RET=0
+ backlog=$(build_backlog $vlan $((2 * limit / 3)) udp)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking $vlan "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "TC $((vlan - 10)): $name backlog < limit"
+
+ # Now push TCP, because non-TCP traffic would be early-dropped after the
+ # backlog crosses the limit, and we want to make sure that the backlog
+ # is above the limit.
+ RET=0
+ backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking $vlan ">= 95")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95."
+ log_test "TC $((vlan - 10)): $name backlog > limit"
+}
+
+do_ecn_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local name=ECN
+
+ start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
+ $h3_mac tos=0x01
+ sleep 1
+
+ ecn_test_common "$name" $vlan $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, it should all get dropped, and backlog
+ # building should fail.
+ RET=0
+ build_backlog $vlan $((2 * limit)) udp >/dev/null
+ check_fail $? "UDP traffic went into backlog instead of being early-dropped"
+ log_test "TC $((vlan - 10)): $name backlog > limit: UDP early-dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_ecn_nodrop_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local name="ECN nodrop"
+
+ start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
+ $h3_mac tos=0x01
+ sleep 1
+
+ ecn_test_common "$name" $vlan $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, in nodrop mode, make sure it goes to
+ # backlog as well.
+ RET=0
+ build_backlog $vlan $((2 * limit)) udp >/dev/null
+ check_err $? "UDP traffic was early-dropped instead of getting into backlog"
+ log_test "TC $((vlan - 10)): $name backlog > limit: UDP not dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_red_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Use ECN-capable TCP to verify there's no marking even though the queue
+ # is above limit.
+ start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \
+ $h3_mac tos=0x01
+
+ # Pushing below the queue limit should work.
+ RET=0
+ backlog=$(build_backlog $vlan $((2 * limit / 3)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking $vlan "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "TC $((vlan - 10)): RED backlog < limit"
+
+ # Pushing above should not.
+ RET=0
+ backlog=$(build_backlog $vlan $((3 * limit / 2)) tcp tos=0x01)
+ check_fail $? "Traffic went into backlog instead of being early-dropped"
+ pct=$(check_marking $vlan "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ local diff=$((limit - backlog))
+ pct=$((100 * diff / limit))
+ ((0 <= pct && pct <= 10))
+ check_err $? "backlog $backlog / $limit expected <= 10% distance"
+ log_test "TC $((vlan - 10)): RED backlog > limit"
+
+ stop_traffic
+ sleep 1
+}
+
+do_mc_backlog_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local qbl
+ local ebl
+
+ RET=0
+
+ start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) bc
+ start_tcp_traffic $h2.$vlan $(ipaddr 2 $vlan) $(ipaddr 3 $vlan) bc
+
+ qbl=$(busywait 5000 until_counter_is ">= 500000" \
+ get_qdisc_backlog $vlan)
+ check_err $? "Could not build MC backlog"
+
+ # Verify that we actually see the backlog on BUM TC. Do a busywait as
+ # well, performance blips might cause false fail.
+ ebl=$(busywait 5000 until_counter_is ">= 500000" \
+ get_mc_transmit_queue $vlan)
+ check_err $? "MC backlog reported by qdisc not visible in ethtool"
+
+ # Two streams were started above; presumably one stop_traffic per stream.
+ stop_traffic
+ stop_traffic
+
+ log_test "TC $((vlan - 10)): Qdisc reports MC backlog"
+}
+
+do_drop_test()
+{
+ # $trigger only appears in the log message; $subtest selects the
+ # qevent_rule_*_$subtest helpers; $fetch_counter is run unquoted as a command.
+ local vlan=$1; shift
+ local limit=$1; shift
+ local trigger=$1; shift
+ local subtest=$1; shift
+ local fetch_counter=$1; shift
+ local base
+ local now
+
+ RET=0
+
+ start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) $h3_mac
+
+ # Create a bit of a backlog and observe no mirroring due to drops.
+ qevent_rule_install_$subtest
+ base=$($fetch_counter)
+
+ build_backlog $vlan $((2 * limit / 3)) udp >/dev/null
+
+ busywait 1100 until_counter_is ">= $((base + 1))" $fetch_counter >/dev/null
+ check_fail $? "Spurious packets observed without buffer pressure"
+
+ # Push to the queue until it's at the limit. The configured limit is
+ # rounded by the qdisc and then by the driver, so this is the best we
+ # can do to get to the real limit of the system.
+ build_backlog $vlan $((3 * limit / 2)) udp >/dev/null
+
+ base=$($fetch_counter)
+ send_packets $vlan udp 11
+
+ now=$(busywait 1100 until_counter_is ">= $((base + 10))" $fetch_counter)
+ check_err $? "Dropped packets not observed: 11 expected, $((now - base)) seen"
+
+ # When no extra traffic is injected, there should be no mirroring.
+ busywait 1100 until_counter_is ">= $((base + 20))" $fetch_counter >/dev/null
+ check_fail $? "Spurious packets observed"
+
+ # When the rule is uninstalled, there should be no mirroring.
+ qevent_rule_uninstall_$subtest
+ send_packets $vlan udp 11
+ busywait 1100 until_counter_is ">= $((base + 20))" $fetch_counter >/dev/null
+ check_fail $? "Spurious packets observed after uninstall"
+
+ log_test "TC $((vlan - 10)): ${trigger}ped packets $subtest'd"
+
+ stop_traffic
+ sleep 1
+}
+
+qevent_rule_install_mirror()
+{
+ tc filter add block 10 pref 1234 handle 102 matchall skip_sw \
+ action mirred egress mirror dev $swp2 hw_stats disabled
+}
+
+qevent_rule_uninstall_mirror()
+{
+ tc filter del block 10 pref 1234 handle 102 matchall
+}
+
+qevent_counter_fetch_mirror()
+{
+ tc_rule_handle_stats_get "dev $h2 ingress" 101
+}
+
+do_drop_mirror_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local qevent_name=$1; shift
+
+ tc filter add dev $h2 ingress pref 1 handle 101 prot ip \
+ flower skip_sw ip_proto udp \
+ action drop
+
+ do_drop_test "$vlan" "$limit" "$qevent_name" mirror \
+ qevent_counter_fetch_mirror
+
+ tc filter del dev $h2 ingress pref 1 handle 101 flower
+}
+
+qevent_rule_install_trap()
+{
+ tc filter add block 10 pref 1234 handle 102 matchall skip_sw \
+ action trap hw_stats disabled
+}
+
+qevent_rule_uninstall_trap()
+{
+ tc filter del block 10 pref 1234 handle 102 matchall
+}
+
+qevent_counter_fetch_trap()
+{
+ local trap_name=$1; shift
+
+ devlink_trap_rx_packets_get "$trap_name"
+}
+
+do_drop_trap_test()
+{
+ local vlan=$1; shift
+ local limit=$1; shift
+ local trap_name=$1; shift
+
+ do_drop_test "$vlan" "$limit" "$trap_name" trap \
+ "qevent_counter_fetch_trap $trap_name"
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
new file mode 100755
index 000000000..3f007c5f8
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ ecn_test
+ ecn_nodrop_test
+ red_test
+ mc_backlog_test
+ red_mirror_test
+ red_trap_test
+"
+: ${QDISC:=ets}
+source sch_red_core.sh
+
+# do_ecn_test first builds 2/3 of the requested backlog and expects no marking,
+# and then builds 3/2 of it and does expect marking. The values of $BACKLOG1 and
+# $BACKLOG2 are far enough not to overlap, so that we can assume that if we do
+# see (do not see) marking, it is actually due to the configuration of that one
+# TC, and not due to configuration of the other TC leaking over.
+BACKLOG1=200000
+BACKLOG2=500000
+
+install_qdisc()
+{
+ local -a args=("$@")
+
+ tc qdisc add dev $swp3 root handle 10: $QDISC \
+ bands 8 priomap 7 6 5 4 3 2 1 0
+ tc qdisc add dev $swp3 parent 10:8 handle 108: red \
+ limit 1000000 min $BACKLOG1 max $((BACKLOG1 + 1)) \
+ probability 1.0 avpkt 8000 burst 38 "${args[@]}"
+ tc qdisc add dev $swp3 parent 10:7 handle 107: red \
+ limit 1000000 min $BACKLOG2 max $((BACKLOG2 + 1)) \
+ probability 1.0 avpkt 8000 burst 63 "${args[@]}"
+ sleep 1
+}
+
+uninstall_qdisc()
+{
+ tc qdisc del dev $swp3 parent 10:7
+ tc qdisc del dev $swp3 parent 10:8
+ tc qdisc del dev $swp3 root
+}
+
+ecn_test()
+{
+ install_qdisc ecn
+
+ do_ecn_test 10 $BACKLOG1
+ do_ecn_test 11 $BACKLOG2
+
+ uninstall_qdisc
+}
+
+ecn_nodrop_test()
+{
+ install_qdisc ecn nodrop
+
+ do_ecn_nodrop_test 10 $BACKLOG1
+ do_ecn_nodrop_test 11 $BACKLOG2
+
+ uninstall_qdisc
+}
+
+red_test()
+{
+ install_qdisc
+
+ do_red_test 10 $BACKLOG1
+ do_red_test 11 $BACKLOG2
+
+ uninstall_qdisc
+}
+
+mc_backlog_test()
+{
+ install_qdisc
+
+ # Note that the backlog numbers here do not correspond to RED
+ # configuration, but are arbitrary.
+ do_mc_backlog_test 10 $BACKLOG1
+ do_mc_backlog_test 11 $BACKLOG2
+
+ uninstall_qdisc
+}
+
+red_mirror_test()
+{
+ install_qdisc qevent early_drop block 10
+
+ do_drop_mirror_test 10 $BACKLOG1 early_drop
+ do_drop_mirror_test 11 $BACKLOG2 early_drop
+
+ uninstall_qdisc
+}
+
+red_trap_test()
+{
+ install_qdisc qevent early_drop block 10
+
+ do_drop_trap_test 10 $BACKLOG1 early_drop
+ do_drop_trap_test 11 $BACKLOG2 early_drop
+
+ uninstall_qdisc
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+bail_on_lldpad
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh
new file mode 100755
index 000000000..76820a0e9
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_prio.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+QDISC=prio # root qdisc kind; sch_red_ets.sh uses $QDISC when it creates the root qdisc
+source sch_red_ets.sh # the whole test is the ETS script, run with the prio qdisc instead
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh
new file mode 100755
index 000000000..ede9c38d3
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ ecn_test
+ ecn_nodrop_test
+ red_test
+ mc_backlog_test
+ red_mirror_test
+" # NOTE(review): presumably the list that tests_run iterates over - confirm
+source sch_red_core.sh # NOTE(review): presumably provides the do_*_test helpers, which are not defined in this file
+
+BACKLOG=300000 # RED min threshold used by install_qdisc; max is BACKLOG + 1
+
+install_qdisc() # Install a single RED qdisc as the root of $swp3; all arguments are appended to the tc command.
+{
+ local -a args=("$@")
+
+ tc qdisc add dev $swp3 root handle 108: red \
+ limit 1000000 min $BACKLOG max $((BACKLOG + 1)) \
+ probability 1.0 avpkt 8000 burst 38 "${args[@]}"
+ sleep 1 # NOTE(review): presumably lets the new configuration settle - confirm
+}
+
+uninstall_qdisc() # Remove the root qdisc that install_qdisc put on $swp3.
+{
+ tc qdisc del dev $swp3 root
+}
+
+ecn_test() # Root RED qdisc created with "ecn"; run do_ecn_test against BACKLOG, then uninstall.
+{
+ install_qdisc ecn
+ do_ecn_test 10 $BACKLOG
+ uninstall_qdisc
+}
+
+ecn_nodrop_test() # Root RED qdisc created with "ecn nodrop"; run do_ecn_nodrop_test against BACKLOG.
+{
+ install_qdisc ecn nodrop
+ do_ecn_nodrop_test 10 $BACKLOG
+ uninstall_qdisc
+}
+
+red_test() # Root RED qdisc without extra options; run do_red_test against BACKLOG.
+{
+ install_qdisc
+ do_red_test 10 $BACKLOG
+ uninstall_qdisc
+}
+
+mc_backlog_test() # Root RED qdisc without extra options; run do_mc_backlog_test with BACKLOG.
+{
+ install_qdisc
+ # Note that the backlog value here does not correspond to RED
+ # configuration, but is arbitrary.
+ do_mc_backlog_test 10 $BACKLOG
+ uninstall_qdisc
+}
+
+red_mirror_test() # Root RED qdisc with the early_drop qevent on block 10; run the drop-mirror test.
+{
+	install_qdisc qevent early_drop block 10
+	do_drop_mirror_test 10 $BACKLOG early_drop # pass the qevent name, as sch_red_ets.sh does for the same helper
+	uninstall_qdisc
+}
+
+trap cleanup EXIT # run cleanup on every exit path from here on
+
+setup_prepare
+setup_wait
+
+bail_on_lldpad
+tests_run
+
+exit $EXIT_STATUS # NOTE(review): EXIT_STATUS is not set in this file; presumably maintained by the sourced library - confirm
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh
new file mode 100755
index 000000000..c6ce0b448
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_ets.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source qos_lib.sh
+bail_on_lldpad
+
+lib_dir=$(dirname "$0")/../../../net/forwarding # $0 is quoted so a script path with spaces stays one word
+TCFLAGS=skip_sw
+source "$lib_dir/sch_tbf_ets.sh" # this wrapper only sets TCFLAGS; the test itself lives in net/forwarding
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh
new file mode 100755
index 000000000..8d245f331
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_prio.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source qos_lib.sh
+bail_on_lldpad
+
+lib_dir=$(dirname "$0")/../../../net/forwarding # $0 is quoted so a script path with spaces stays one word
+TCFLAGS=skip_sw
+source "$lib_dir/sch_tbf_prio.sh" # this wrapper only sets TCFLAGS; the test itself lives in net/forwarding
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh
new file mode 100755
index 000000000..013886061
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sch_tbf_root.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source qos_lib.sh
+bail_on_lldpad
+
+lib_dir=$(dirname "$0")/../../../net/forwarding # $0 is quoted so a script path with spaces stays one word
+TCFLAGS=skip_sw
+source "$lib_dir/sch_tbf_root.sh" # this wrapper only sets TCFLAGS; the test itself lives in net/forwarding
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh
new file mode 100755
index 000000000..7d9e73a43
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer.sh
@@ -0,0 +1,222 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ port_pool_test
+ port_tc_ip_test
+ port_tc_arp_test
+" # NOTE(review): presumably the list that tests_run iterates over - confirm
+
+NUM_NETIFS=2
+source ../../../net/forwarding/lib.sh
+source ../../../net/forwarding/devlink_lib.sh
+source mlxsw_lib.sh
+
+SB_POOL_ING=0 # pool checked on the physical ports in port_pool_test (reported as iPool)
+SB_POOL_EGR_CPU=10 # pool checked on the CPU port in port_pool_test (reported as ePool)
+
+SB_ITC_CPU_IP=2 # passed to sb_occ_etc_check, i.e. used as an egress TC of the CPU port, for the IP packet
+SB_ITC_CPU_ARP=2 # same, for the ARP packet
+SB_ITC=0 # ingress TC checked on the physical port
+
+h1_create() # Initialise $h1 through simple_if_init with address 192.0.1.1/24.
+{
+ simple_if_init $h1 192.0.1.1/24
+}
+
+h1_destroy() # Counterpart of h1_create: simple_if_fini with the same interface and address.
+{
+ simple_if_fini $h1 192.0.1.1/24
+}
+
+h2_create() # Initialise $h2 through simple_if_init with address 192.0.1.2/24.
+{
+ simple_if_init $h2 192.0.1.2/24
+}
+
+h2_destroy() # Counterpart of h2_create: simple_if_fini with the same interface and address.
+{
+ simple_if_fini $h2 192.0.1.2/24
+}
+
+sb_occ_pool_check()
+{
+	# Print the "max" occupancy that devlink reports for pool $2 on devlink
+	# port $1. Exit status: 1 if it differs from the expected value $3,
+	# 0 otherwise.
+	local port=$1 pool_id=$2 want=$3
+	local got
+	local rc=0
+
+	got=$(devlink sb -j occupancy show $port \
+		| jq -e ".[][][\"pool\"][\"$pool_id\"][\"max\"]")
+
+	# -ne compares the two values as integers.
+	[[ "$got" -ne "$want" ]] && rc=1
+
+	echo $got
+	return $rc
+}
+
+sb_occ_itc_check()
+{
+	# Print the "max" occupancy that devlink reports for "itc" entry $2 on
+	# devlink port $1. Exit status: 1 if it differs from the expected
+	# value $3, 0 otherwise.
+	local port=$1 tc_id=$2 want=$3
+	local got
+	local rc=0
+
+	got=$(devlink sb -j occupancy show $port \
+		| jq -e ".[][][\"itc\"][\"$tc_id\"][\"max\"]")
+
+	# -ne compares the two values as integers.
+	[[ "$got" -ne "$want" ]] && rc=1
+
+	echo $got
+	return $rc
+}
+
+sb_occ_etc_check()
+{
+	# Print the "max" occupancy that devlink reports for "etc" entry $2 on
+	# devlink port $1. Exit status: 1 if it differs from the expected
+	# value $3, 0 otherwise.
+	local port=$1 tc_id=$2 want=$3
+	local got
+	local rc=0
+
+	got=$(devlink sb -j occupancy show $port \
+		| jq -e ".[][][\"etc\"][\"$tc_id\"][\"max\"]")
+
+	# -ne compares the two values as integers.
+	[[ "$got" -ne "$want" ]] && rc=1
+
+	echo $got
+	return $rc
+}
+
+port_pool_test() # Send one IP packet from $h1 and check the max pool occupancy on both ports and the CPU port.
+{
+ local exp_max_occ=288
+ local max_occ
+
+ devlink sb occupancy clearmax $DEVLINK_DEV # clear the recorded maxima before generating traffic
+
+ $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
+ -t ip -q
+
+ devlink sb occupancy snapshot $DEVLINK_DEV # take the snapshot that the checks below read
+
+ RET=0 # each of the three checks is reported as its own test
+ max_occ=$(sb_occ_pool_check $dl_port1 $SB_POOL_ING $exp_max_occ)
+ check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "physical port's($h1) ingress pool"
+
+ RET=0
+ max_occ=$(sb_occ_pool_check $dl_port2 $SB_POOL_ING $exp_max_occ)
+ check_err $? "Expected iPool($SB_POOL_ING) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "physical port's($h2) ingress pool"
+
+ RET=0
+ max_occ=$(sb_occ_pool_check $cpu_dl_port $SB_POOL_EGR_CPU $exp_max_occ)
+ check_err $? "Expected ePool($SB_POOL_EGR_CPU) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "CPU port's egress pool"
+}
+
+port_tc_ip_test() # Send one IP packet from $h1 and check the max TC occupancy on the port and the CPU port.
+{
+ local exp_max_occ=288
+ local max_occ
+
+ devlink sb occupancy clearmax $DEVLINK_DEV # clear the recorded maxima before generating traffic
+
+ $MZ $h1 -c 1 -p 160 -a $h1mac -b $h2mac -A 192.0.1.1 -B 192.0.1.2 \
+ -t ip -q
+
+ devlink sb occupancy snapshot $DEVLINK_DEV # take the snapshot that the checks below read
+
+ RET=0
+ max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) # NOTE(review): queries $dl_port2 although the test below is reported for $h1 - confirm intended port
+ check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "physical port's($h1) ingress TC - IP packet"
+
+ RET=0
+ max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ)
+ check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "physical port's($h2) ingress TC - IP packet"
+
+ RET=0
+ max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_IP $exp_max_occ)
+ check_err $? "Expected egress TC($SB_ITC_CPU_IP) max occupancy to be $exp_max_occ, but got $max_occ"
+ log_test "CPU port's egress TC - IP packet"
+}
+
+port_tc_arp_test() # Send one ARP packet from $h1 and check the max TC occupancy on the port and the CPU port.
+{
+	local exp_max_occ=96 # expected value when MLXSW_CHIP is "mlxsw_spectrum"
+	local max_occ
+
+	if [[ $MLXSW_CHIP != "mlxsw_spectrum" ]]; then
+		exp_max_occ=144 # every other chip value
+	fi
+
+	devlink sb occupancy clearmax $DEVLINK_DEV # clear the recorded maxima before generating traffic
+
+	$MZ $h1 -c 1 -p 160 -a $h1mac -A 192.0.1.1 -t arp -q
+
+	devlink sb occupancy snapshot $DEVLINK_DEV # take the snapshot that the checks below read
+
+	RET=0
+	max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ) # NOTE(review): queries $dl_port2 although the test below is reported for $h1 - confirm intended port
+	check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ"
+	log_test "physical port's($h1) ingress TC - ARP packet"
+
+	RET=0
+	max_occ=$(sb_occ_itc_check $dl_port2 $SB_ITC $exp_max_occ)
+	check_err $? "Expected ingress TC($SB_ITC) max occupancy to be $exp_max_occ, but got $max_occ"
+	log_test "physical port's($h2) ingress TC - ARP packet"
+
+	RET=0
+	max_occ=$(sb_occ_etc_check $cpu_dl_port $SB_ITC_CPU_ARP $exp_max_occ)
+	check_err $? "Expected egress TC($SB_ITC_CPU_ARP) max occupancy to be $exp_max_occ, but got $max_occ"
+	log_test "CPU port's egress TC - ARP packet"
+}
+
+setup_prepare() # Resolve interfaces, MAC addresses and devlink ports into globals, then configure both hosts.
+{
+ h1=${NETIFS[p1]}
+ h2=${NETIFS[p2]}
+
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ dl_port1=$(devlink_port_by_netdev $h1) # devlink port handles used by the sb_occ_*_check helpers
+ dl_port2=$(devlink_port_by_netdev $h2)
+
+ cpu_dl_port=$(devlink_cpu_port_get)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+}
+
+cleanup() # Undo setup_prepare in reverse order: h2, h1, then the VRF setup.
+{
+ pre_cleanup
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+trap cleanup EXIT # run cleanup on every exit path from here on
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS # NOTE(review): EXIT_STATUS is not set in this file; presumably maintained by the sourced library - confirm
diff --git a/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py
new file mode 100755
index 000000000..2223337ee
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/sharedbuffer_configuration.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python
+# SPDX-License-Identifier: GPL-2.0
+
+import subprocess
+import json as j
+import random
+
+
+class SkipTest(Exception):
+ pass
+
+
+class RandomValuePicker:
+ """
+ Class for storing shared buffer configuration. Can handle 3 different
+ objects, pool, tcbind and portpool. Provide an interface to get random
+ values for a specific object type as the follow:
+ 1. Pool:
+ - random size
+
+ 2. TcBind:
+ - random pool number
+ - random threshold
+
+ 3. PortPool:
+ - random threshold
+ """
+ def __init__(self, pools):
+ self._pools = []
+ for pool in pools:
+ self._pools.append(pool)
+
+ def _cell_size(self):
+ return self._pools[0]["cell_size"]
+
+ def _get_static_size(self, th):
+ # For threshold of 16, this works out to be about 12MB on Spectrum-1,
+ # and about 17MB on Spectrum-2.
+ return th * 8000 * self._cell_size()
+
+ def _get_size(self):
+ return self._get_static_size(16)
+
+ def _get_thtype(self):
+ return "static"
+
+ def _get_th(self, pool):
+ # Threshold value could be any integer between 3 to 16
+ th = random.randint(3, 16)
+ if pool["thtype"] == "dynamic":
+ return th
+ else:
+ return self._get_static_size(th)
+
+ def _get_pool(self, direction):
+ ing_pools = []
+ egr_pools = []
+ for pool in self._pools:
+ if pool["type"] == "ingress":
+ ing_pools.append(pool)
+ else:
+ egr_pools.append(pool)
+ if direction == "ingress":
+ arr = ing_pools
+ else:
+ arr = egr_pools
+ return arr[random.randint(0, len(arr) - 1)]
+
+ def get_value(self, objid):
+ if isinstance(objid, Pool):
+ if objid["pool"] in [4, 8, 9, 10]:
+ # The threshold type of pools 4, 8, 9 and 10 cannot be changed
+ raise SkipTest()
+ else:
+ return (self._get_size(), self._get_thtype())
+ if isinstance(objid, TcBind):
+ if objid["tc"] >= 8:
+ # Multicast TCs cannot be changed
+ raise SkipTest()
+ else:
+ pool = self._get_pool(objid["type"])
+ th = self._get_th(pool)
+ pool_n = pool["pool"]
+ return (pool_n, th)
+ if isinstance(objid, PortPool):
+ pool_n = objid["pool"]
+ pool = self._pools[pool_n]
+ assert pool["pool"] == pool_n
+ th = self._get_th(pool)
+ return (th,)
+
+
+class RecordValuePickerException(Exception):
+ pass
+
+
+class RecordValuePicker:
+ """
+ Class for storing shared buffer configuration. Can handle 2 different
+ objects, pool and tcbind. Provide an interface to get the stored values per
+ object type.
+ """
+ def __init__(self, objlist):
+ self._recs = []
+ for item in objlist:
+ self._recs.append({"objid": item, "value": item.var_tuple()})
+
+ def get_value(self, objid):
+ if isinstance(objid, Pool) and objid["pool"] in [4, 8, 9, 10]:
+ # The threshold type of pools 4, 8, 9 and 10 cannot be changed
+ raise SkipTest()
+ if isinstance(objid, TcBind) and objid["tc"] >= 8:
+ # Multicast TCs cannot be changed
+ raise SkipTest()
+ for rec in self._recs:
+ if rec["objid"].weak_eq(objid):
+ return rec["value"]
+ raise RecordValuePickerException()
+
+
+def run_cmd(cmd, json=False):
+ out = subprocess.check_output(cmd, shell=True)
+ if json:
+ return j.loads(out)
+ return out
+
+
+def run_json_cmd(cmd):
+ return run_cmd(cmd, json=True)
+
+
+def log_test(test_name, err_msg=None):
+ if err_msg:
+ print("\t%s" % err_msg)
+ print("TEST: %-80s [FAIL]" % test_name)
+ else:
+ print("TEST: %-80s [ OK ]" % test_name)
+
+
+class CommonItem(dict):
+ varitems = []
+
+ def var_tuple(self):
+ ret = []
+ self.varitems.sort()
+ for key in self.varitems:
+ ret.append(self[key])
+ return tuple(ret)
+
+ def weak_eq(self, other):
+ for key in self:
+ if key in self.varitems:
+ continue
+ if self[key] != other[key]:
+ return False
+ return True
+
+
+class CommonList(list):
+ def get_by(self, by_obj):
+ for item in self:
+ if item.weak_eq(by_obj):
+ return item
+ return None
+
+ def del_by(self, by_obj):
+ for item in self:
+ if item.weak_eq(by_obj):
+ self.remove(item)
+
+
+class Pool(CommonItem):
+ varitems = ["size", "thtype"]
+
+ def dl_set(self, dlname, size, thtype):
+ run_cmd("devlink sb pool set {} sb {} pool {} size {} thtype {}".format(dlname, self["sb"],
+ self["pool"],
+ size, thtype))
+
+
+class PoolList(CommonList):
+ pass
+
+
+def get_pools(dlname, direction=None):
+ d = run_json_cmd("devlink sb pool show -j")
+ pools = PoolList()
+ for pooldict in d["pool"][dlname]:
+ if not direction or direction == pooldict["type"]:
+ pools.append(Pool(pooldict))
+ return pools
+
+
+def do_check_pools(dlname, pools, vp):
+ for pool in pools:
+ pre_pools = get_pools(dlname)
+ try:
+ (size, thtype) = vp.get_value(pool)
+ except SkipTest:
+ continue
+ pool.dl_set(dlname, size, thtype)
+ post_pools = get_pools(dlname)
+ pool = post_pools.get_by(pool)
+
+ err_msg = None
+ if pool["size"] != size:
+ err_msg = "Incorrect pool size (got {}, expected {})".format(pool["size"], size)
+ if pool["thtype"] != thtype:
+ err_msg = "Incorrect pool threshold type (got {}, expected {})".format(pool["thtype"], thtype)
+
+ pre_pools.del_by(pool)
+ post_pools.del_by(pool)
+ if pre_pools != post_pools:
+ err_msg = "Other pool setup changed as well"
+ log_test("pool {} of sb {} set verification".format(pool["pool"],
+ pool["sb"]), err_msg)
+
+
+def check_pools(dlname, pools):
+ # Save defaults
+ record_vp = RecordValuePicker(pools)
+
+ # For each pool, set random size and static threshold type
+ do_check_pools(dlname, pools, RandomValuePicker(pools))
+
+ # Restore defaults
+ do_check_pools(dlname, pools, record_vp)
+
+
+class TcBind(CommonItem):
+ varitems = ["pool", "threshold"]
+
+ def __init__(self, port, d):
+ super(TcBind, self).__init__(d)
+ self["dlportname"] = port.name
+
+ def dl_set(self, pool, th):
+ run_cmd("devlink sb tc bind set {} sb {} tc {} type {} pool {} th {}".format(self["dlportname"],
+ self["sb"],
+ self["tc"],
+ self["type"],
+ pool, th))
+
+
+class TcBindList(CommonList):
+ pass
+
+
+def get_tcbinds(ports, verify_existence=False):
+ d = run_json_cmd("devlink sb tc bind show -j -n")
+ tcbinds = TcBindList()
+ for port in ports:
+ err_msg = None
+ if port.name not in d["tc_bind"] or len(d["tc_bind"][port.name]) == 0:
+ err_msg = "No tc bind for port"
+ else:
+ for tcbinddict in d["tc_bind"][port.name]:
+ tcbinds.append(TcBind(port, tcbinddict))
+ if verify_existence:
+ log_test("tc bind existence for port {} verification".format(port.name), err_msg)
+ return tcbinds
+
+
+def do_check_tcbind(ports, tcbinds, vp):
+ for tcbind in tcbinds:
+ pre_tcbinds = get_tcbinds(ports)
+ try:
+ (pool, th) = vp.get_value(tcbind)
+ except SkipTest:
+ continue
+ tcbind.dl_set(pool, th)
+ post_tcbinds = get_tcbinds(ports)
+ tcbind = post_tcbinds.get_by(tcbind)
+
+ err_msg = None
+ if tcbind["pool"] != pool:
+ err_msg = "Incorrect pool (got {}, expected {})".format(tcbind["pool"], pool)
+ if tcbind["threshold"] != th:
+ err_msg = "Incorrect threshold (got {}, expected {})".format(tcbind["threshold"], th)
+
+ pre_tcbinds.del_by(tcbind)
+ post_tcbinds.del_by(tcbind)
+ if pre_tcbinds != post_tcbinds:
+ err_msg = "Other tc bind setup changed as well"
+ log_test("tc bind {}-{} of sb {} set verification".format(tcbind["dlportname"],
+ tcbind["tc"],
+ tcbind["sb"]), err_msg)
+
+
+def check_tcbind(dlname, ports, pools):
+ tcbinds = get_tcbinds(ports, verify_existence=True)
+
+ # Save defaults
+ record_vp = RecordValuePicker(tcbinds)
+
+ # Bind each port and unicast TC (TCs < 8) to a random pool and a random
+ # threshold
+ do_check_tcbind(ports, tcbinds, RandomValuePicker(pools))
+
+ # Restore defaults
+ do_check_tcbind(ports, tcbinds, record_vp)
+
+
+class PortPool(CommonItem):
+ varitems = ["threshold"]
+
+ def __init__(self, port, d):
+ super(PortPool, self).__init__(d)
+ self["dlportname"] = port.name
+
+ def dl_set(self, th):
+ run_cmd("devlink sb port pool set {} sb {} pool {} th {}".format(self["dlportname"],
+ self["sb"],
+ self["pool"], th))
+
+
+class PortPoolList(CommonList):
+ pass
+
+
+def get_portpools(ports, verify_existence=False):
+ d = run_json_cmd("devlink sb port pool -j -n")
+ portpools = PortPoolList()
+ for port in ports:
+ err_msg = None
+ if port.name not in d["port_pool"] or len(d["port_pool"][port.name]) == 0:
+ err_msg = "No port pool for port"
+ else:
+ for portpooldict in d["port_pool"][port.name]:
+ portpools.append(PortPool(port, portpooldict))
+ if verify_existence:
+ log_test("port pool existence for port {} verification".format(port.name), err_msg)
+ return portpools
+
+
+def do_check_portpool(ports, portpools, vp):
+ for portpool in portpools:
+ pre_portpools = get_portpools(ports)
+ (th,) = vp.get_value(portpool)
+ portpool.dl_set(th)
+ post_portpools = get_portpools(ports)
+ portpool = post_portpools.get_by(portpool)
+
+ err_msg = None
+ if portpool["threshold"] != th:
+ err_msg = "Incorrect threshold (got {}, expected {})".format(portpool["threshold"], th)
+
+ pre_portpools.del_by(portpool)
+ post_portpools.del_by(portpool)
+ if pre_portpools != post_portpools:
+ err_msg = "Other port pool setup changed as well"
+ log_test("port pool {}-{} of sb {} set verification".format(portpool["dlportname"],
+ portpool["pool"],
+ portpool["sb"]), err_msg)
+
+
+def check_portpool(dlname, ports, pools):
+ portpools = get_portpools(ports, verify_existence=True)
+
+ # Save defaults
+ record_vp = RecordValuePicker(portpools)
+
+ # For each port pool, set a random threshold
+ do_check_portpool(ports, portpools, RandomValuePicker(pools))
+
+ # Restore defaults
+ do_check_portpool(ports, portpools, record_vp)
+
+
+class Port:
+ def __init__(self, name):
+ self.name = name
+
+
+class PortList(list):
+ pass
+
+
+def get_ports(dlname):
+ d = run_json_cmd("devlink port show -j")
+ ports = PortList()
+ for name in d["port"]:
+ if name.find(dlname) == 0 and d["port"][name]["flavour"] == "physical":
+ ports.append(Port(name))
+ return ports
+
+
+def get_device():
+ devices_info = run_json_cmd("devlink -j dev info")["info"]
+ for d in devices_info:
+ if "mlxsw_spectrum" in devices_info[d]["driver"]:
+ return d
+ return None
+
+
+class UnavailableDevlinkNameException(Exception):
+ pass
+
+
+def test_sb_configuration():
+ # Use static seed
+ random.seed(0)
+
+ dlname = get_device()
+ if not dlname:
+ raise UnavailableDevlinkNameException()
+
+ ports = get_ports(dlname)
+ pools = get_pools(dlname)
+
+ check_pools(dlname, pools)
+ check_tcbind(dlname, ports, pools)
+ check_portpool(dlname, ports, pools)
+
+
+test_sb_configuration()
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/mirror_gre_scale.sh
new file mode 100644
index 000000000..f7c168dec
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/mirror_gre_scale.sh
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../mirror_gre_scale.sh
+
+mirror_gre_get_target()
+{
+	# Print the scale target: the size of the span_agents devlink resource,
+	# or one more than that when $1 (should_fail) is non-zero.
+	local should_fail=$1; shift
+	local agents
+	agents=$(devlink_resource_size_get span_agents)
+	if ((! should_fail)); then
+		echo $agents
+	else
+		echo $((agents + 1))
+	fi
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh
new file mode 100755
index 000000000..d7cf33a3f
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/resource_scale.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../../net/forwarding # libraries are located relative to this script
+
+NUM_NETIFS=6 # default interface count; the main loop lets a test override it via <TEST>_NUM_NETIFS
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+if [[ "$DEVLINK_VIDDID" != "15b3:cf6c" && \
+ "$DEVLINK_VIDDID" != "15b3:cf70" ]]; then
+ echo "SKIP: test is tailored for Mellanox Spectrum-2 and Spectrum-3"
+ exit 1 # NOTE(review): a skip is reported with exit status 1 - confirm this is the intended code
+fi
+
+current_test="" # name of the test in progress; cleanup() uses it to call <test>_cleanup
+
+cleanup() # EXIT handler: clean up the test that was in progress, if any, then reload the device.
+{
+ pre_cleanup
+ if [ ! -z $current_test ]; then # non-empty only while the main loop is running a test
+ ${current_test}_cleanup
+ fi
+ # Need to reload in order to avoid router abort.
+ devlink_reload
+}
+
+trap cleanup EXIT # run cleanup on every exit path from here on
+
+ALL_TESTS="router tc_flower mirror_gre tc_police"
+for current_test in ${TESTS:-$ALL_TESTS}; do # TESTS, when set, overrides the default list
+	source ${current_test}_scale.sh # defines the ${current_test}_* functions called below
+
+	num_netifs_var=${current_test^^}_NUM_NETIFS # e.g. ROUTER_NUM_NETIFS
+	num_netifs=${!num_netifs_var:-$NUM_NETIFS} # per-test override, else the global default
+
+	for should_fail in 0 1; do # 0: target expected to fit, 1: target expected to overflow
+		RET=0
+		target=$(${current_test}_get_target "$should_fail")
+		${current_test}_setup_prepare
+		setup_wait $num_netifs
+		${current_test}_test "$target" "$should_fail"
+		${current_test}_cleanup
+		devlink_reload
+		if [[ "$should_fail" -eq 0 ]]; then
+			log_test "'$current_test' $target"
+		else
+			log_test "'$current_test' overflow $target"
+		fi
+	done
+done
+current_test="" # nothing left for the EXIT handler to clean up
+
+exit "${EXIT_STATUS:-$RET}" # RET is reset every round and covers only the last one; prefer the library's EXIT_STATUS when it is set
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/router_scale.sh
new file mode 100644
index 000000000..1897e163e
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/router_scale.sh
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../router_scale.sh
+
+router_get_target()
+{
+	# Print the scale target derived from the size of the kvd devlink
+	# resource: 85% of it when $1 (should_fail) is 0, size + 1 otherwise.
+	local should_fail=$1
+	local kvd_size
+
+	kvd_size=$(devlink_resource_size_get kvd)
+	# Integer arithmetic: the 85% figure is rounded down.
+	if [[ $should_fail -eq 0 ]]; then
+		echo $((kvd_size * 85 / 100))
+	else
+		echo $((kvd_size + 1))
+	fi
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh
new file mode 100755
index 000000000..616d35814
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower.sh
@@ -0,0 +1,1129 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking the A-TCAM and C-TCAM operation in Spectrum-2.
+# It tries to exercise as many code paths in the eRP state machine as
+# possible.
+
+lib_dir=$(dirname $0)/../../../../net/forwarding # libraries are located relative to this script
+
+ALL_TESTS="single_mask_test identical_filters_test two_masks_test \
+ multiple_masks_test ctcam_edge_cases_test delta_simple_test \
+ delta_two_masks_one_key_test delta_simple_rehash_test \
+ bloom_simple_test bloom_complex_test bloom_delta_test \
+ max_erp_entries_test max_group_size_test"
+NUM_NETIFS=2
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source $lib_dir/devlink_lib.sh
+
+tcflags="skip_hw" # NOTE(review): multiple_masks_test and delta_simple_test return early unless this is "skip_sw" - presumably reassigned later in the file, confirm
+
+h1_create() # Initialise $h1 through simple_if_init with one address from each of the two test subnets.
+{
+ simple_if_init $h1 192.0.2.1/24 198.51.100.1/24
+}
+
+h1_destroy() # Counterpart of h1_create: simple_if_fini with the same interface and addresses.
+{
+ simple_if_fini $h1 192.0.2.1/24 198.51.100.1/24
+}
+
+h2_create() # Initialise $h2 and add the clsact qdisc that the tests' ingress filters are attached to.
+{
+ simple_if_init $h2 192.0.2.2/24 198.51.100.2/24
+ tc qdisc add dev $h2 clsact
+}
+
+h2_destroy() # Counterpart of h2_create, in reverse order: remove clsact, then the interface setup.
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/24 198.51.100.2/24
+}
+
+tp_record()
+{
+	# Run the command string $2 under "perf record" for tracepoint $1 and
+	# return perf's exit status. $2 is expanded unquoted on purpose, so
+	# that it is split into the command and its arguments.
+	local event=$1 cmd_line=$2
+	perf record -q -e $event $cmd_line
+}
+
+tp_record_all()
+{
+	# Record tracepoint $1 with "perf record -a" for as long as
+	# "sleep $2" runs, and return perf's exit status.
+	local event=$1
+	local duration=$2
+	perf record -a -q -e $event sleep $duration
+}
+
+__tp_hit_count() # Reports through its EXIT STATUS (not stdout) whether the last perf recording contains tracepoint $1.
+{
+ local tracepoint=$1
+
+ local perf_output=`perf script -F trace:event,trace`
+ return `echo $perf_output | grep "$tracepoint:" | wc -l` # unquoted echo joins the output into one line, so the result is 0 or 1
+}
+
+tp_check_hits()
+{
+	# Succeed (0) when the hit count that __tp_hit_count reports for
+	# tracepoint $1 equals $2, fail (1) otherwise.
+	local tracepoint=$1
+	local count=$2
+
+	__tp_hit_count $tracepoint
+	[[ "$?" -ne "$count" ]] && return 1
+	return 0
+}
+
+tp_check_hits_any()
+{
+	# Succeed (0) when __tp_hit_count reports at least one hit for
+	# tracepoint $1, fail (1) when it reports none.
+	local tracepoint=$1
+
+	__tp_hit_count $tracepoint
+	[[ "$?" -eq "0" ]] && return 1
+	return 0
+}
+
+single_mask_test()
+{
+ # When only a single mask is required, the device uses the master
+ # mask and not the eRP table. Verify that under this mode the right
+ # filter is matched
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # NOTE(review): presumably <filter handle> <expected packet count> - confirm in tc_common.sh
+ check_err $? "Single filter - did not match"
+
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 198.51.100.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 2 # second packet to 192.0.2.2 must again hit handle 101
+ check_err $? "Two filters - did not match highest priority"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 198.51.100.1 -B 198.51.100.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # first packet to 198.51.100.2 must hit handle 102
+ check_err $? "Two filters - did not match lowest priority"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 198.51.100.1 -B 198.51.100.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 2 # handle 102 keeps matching after handle 101 is gone
+ check_err $? "Single filter - did not match after delete"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "single mask test ($tcflags)"
+}
+
+identical_filters_test()
+{
+ # When two filters that only differ in their priority are used,
+ # one needs to be inserted into the C-TCAM. This test verifies
+ # that filters are correctly spilled to C-TCAM and that the right
+ # filter is matched
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # with both filters installed, handle 101 (pref 1) must win
+ check_err $? "Did not match A-TCAM filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # handle 102 takes over once handle 101 is deleted
+ check_err $? "Did not match C-TCAM filter after A-TCAM delete"
+
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 2 # handle 102 (pref 2) must still win over the new pref 3 filter
+ check_err $? "Did not match C-TCAM filter after A-TCAM add"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1 # only handle 103 is left
+ check_err $? "Did not match A-TCAM filter after C-TCAM delete"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+
+ log_test "identical filters test ($tcflags)"
+}
+
+two_masks_test()
+{
+ # When more than one mask is required, the eRP table is used. This
+ # test verifies that the eRP table is correctly allocated and used
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.0.0/8 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # the exact-match filter (pref 1) must win over the /8 filter
+ check_err $? "Two filters - did not match highest priority"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1 # only the /8 filter is left
+ check_err $? "Single filter - did not match"
+
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.0/24 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # the new /24 filter (pref 2) must win over the /8 filter (pref 3)
+ check_err $? "Two filters - did not match highest priority after add"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "two masks test ($tcflags)"
+}
+
+multiple_masks_test()
+{
+	# The number of masks in a region is limited. Once the maximum
+	# number of masks has been reached filters that require new
+	# masks are spilled to the C-TCAM. This test verifies that
+	# spillage is performed correctly and that the right filter is
+	# matched
+
+	if [[ "$tcflags" != "skip_sw" ]]; then
+		return 0;
+	fi
+
+	local index i exp_hits err_msg # loop state stays local instead of leaking into the caller
+
+	RET=0
+
+	NUM_MASKS=32
+	NUM_ERPS=16
+	BASE_INDEX=100
+
+	for ((i = 1; i <= NUM_MASKS; i++)); do # filter i uses prefix length /i, so each one needs a new mask
+		index=$((BASE_INDEX - i)) # later filters get a lower pref, i.e. a higher priority
+
+		if ((i > NUM_ERPS)); then
+			exp_hits=1
+			err_msg="$i filters - C-TCAM spill did not happen when it was expected"
+		else
+			exp_hits=0
+			err_msg="$i filters - C-TCAM spill happened when it should not"
+		fi
+
+		tp_record "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" \
+			"tc filter add dev $h2 ingress protocol ip pref $index \
+				handle $index \
+				flower $tcflags \
+				dst_ip 192.0.2.2/${i} src_ip 192.0.2.1/${i} \
+				action drop"
+		tp_check_hits "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" \
+			$exp_hits
+		check_err $? "$err_msg"
+
+		$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 \
+			-B 192.0.2.2 -t ip -q
+
+		tc_check_packets "dev $h2 ingress" $index 1
+		check_err $? "$i filters - did not match highest priority (add)"
+	done
+
+	for ((i = NUM_MASKS; i >= 1; i--)); do # tear down, always deleting the highest-priority filter
+		index=$((BASE_INDEX - i))
+
+		$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 \
+			-B 192.0.2.2 -t ip -q
+
+		tc_check_packets "dev $h2 ingress" $index 2
+		check_err $? "$i filters - did not match highest priority (del)"
+
+		tc filter del dev $h2 ingress protocol ip pref $index \
+			handle $index flower
+	done
+
+	log_test "multiple masks test ($tcflags)"
+}
+
+ctcam_two_atcam_masks_test()
+{
+ RET=0
+
+ # First case: C-TCAM is disabled when there are two A-TCAM masks.
+ # We push a filter into the C-TCAM by using two identical filters
+ # as in identical_filters_test()
+
+ # Filter goes into A-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ # Filter goes into C-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ # Filter goes into A-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.0.0/16 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # with all three installed, handle 101 (pref 1) must win
+ check_err $? "Did not match A-TCAM filter"
+
+ # Delete both A-TCAM and C-TCAM filters and make sure the remaining
+ # A-TCAM filter still works
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1 # only the /16 filter is left
+ check_err $? "Did not match A-TCAM filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+
+ log_test "ctcam with two atcam masks test ($tcflags)"
+}
+
+ctcam_one_atcam_mask_test()
+{
+ RET=0
+
+ # Second case: C-TCAM is disabled when there is one A-TCAM mask.
+ # The test is similar to identical_filters_test()
+
+ # Filter goes into A-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ # Filter goes into C-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # handle 101 was added second but has the lower pref, so it must win
+ check_err $? "Did not match C-TCAM filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # only handle 102 is left
+ check_err $? "Did not match A-TCAM filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "ctcam with one atcam mask test ($tcflags)"
+}
+
+ctcam_no_atcam_masks_test()
+{
+ RET=0
+
+ # Third case: C-TCAM is disabled when there are no A-TCAM masks
+ # This test exercises the code path that transitions the eRP table
+ # to its initial state after deleting the last C-TCAM mask
+
+ # Filter goes into A-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ # Filter goes into C-TCAM
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower # the first-added filter is deleted first
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "ctcam with no atcam masks test ($tcflags)" # no traffic and no check_err here: only the add/delete sequence is exercised
+}
+
+# Run the three C-TCAM edge-case scenarios back to back, in the order
+# two / one / no A-TCAM masks.
+ctcam_edge_cases_test()
+{
+	local scenario
+
+	# When the C-TCAM is disabled after deleting the last C-TCAM
+	# mask, we want to make sure the eRP state machine is put in
+	# the correct state
+	for scenario in \
+		ctcam_two_atcam_masks_test \
+		ctcam_one_atcam_mask_test \
+		ctcam_no_atcam_masks_test
+	do
+		$scenario
+	done
+}
+
+delta_simple_test()
+{
+ # The first filter will create an eRP, the second filter will fit into
+ # the first eRP with a delta. Then remove the first rule and check that
+ # the eRP stays (it is still referenced by the second filter).
+
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0; # runs only in the skip_sw pass; nothing is logged otherwise
+ fi
+
+ tp_record "objagg:*" "tc filter add dev $h2 ingress protocol ip \
+ pref 1 handle 101 flower $tcflags dst_ip 192.0.0.0/24 \
+ action drop"
+ tp_check_hits "objagg:objagg_obj_root_create" 1 # one root creation expected
+ check_err $? "eRP was not created"
+
+ tp_record "objagg:*" "tc filter add dev $h2 ingress protocol ip \
+ pref 2 handle 102 flower $tcflags dst_ip 192.0.2.2 \
+ action drop"
+ tp_check_hits "objagg:objagg_obj_root_create" 0 # no second root expected
+ check_err $? "eRP was incorrectly created"
+ tp_check_hits "objagg:objagg_obj_parent_assign" 1
+ check_err $? "delta was not created"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter" # 192.0.2.2 is outside 192.0.0.0/24
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ tp_record "objagg:*" "tc filter del dev $h2 ingress protocol ip \
+ pref 1 handle 101 flower"
+ tp_check_hits "objagg:objagg_obj_root_destroy" 0 # root must survive this delete
+ check_err $? "eRP was incorrectly destroyed"
+ tp_check_hits "objagg:objagg_obj_parent_unassign" 0
+ check_err $? "delta was incorrectly destroyed"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 2 # second packet on handle 102
+ check_err $? "Did not match on correct filter after the first was removed"
+
+ tp_record "objagg:*" "tc filter del dev $h2 ingress protocol ip \
+ pref 2 handle 102 flower"
+ tp_check_hits "objagg:objagg_obj_parent_unassign" 1 # last user gone: delta and root released
+ check_err $? "delta was not destroyed"
+ tp_check_hits "objagg:objagg_obj_root_destroy" 1
+ check_err $? "eRP was not destroyed"
+
+ log_test "delta simple test ($tcflags)"
+}
+
+delta_two_masks_one_key_test()
+{
+ # If 2 keys are the same and only differ in mask in a way that
+ # they belong under the same eRP (second is delta of the first),
+ # there should be no C-TCAM spill.
+
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0; # runs only in the skip_sw pass; nothing is logged otherwise
+ fi
+
+ tp_record "mlxsw:*" "tc filter add dev $h2 ingress protocol ip \
+ pref 1 handle 101 flower $tcflags dst_ip 192.0.2.0/24 \
+ action drop"
+ tp_check_hits "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" 0 # zero spill events expected
+ check_err $? "incorrect C-TCAM spill while inserting the first rule"
+
+ tp_record "mlxsw:*" "tc filter add dev $h2 ingress protocol ip \
+ pref 2 handle 102 flower $tcflags dst_ip 192.0.2.2 \
+ action drop"
+ tp_check_hits "mlxsw:mlxsw_sp_acl_atcam_entry_add_ctcam_spill" 0
+ check_err $? "incorrect C-TCAM spill while inserting the second rule"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # pref 1 (the /24) is checked while both filters exist
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # with 101 gone the exact-match filter is checked
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "delta two masks one key test ($tcflags)"
+}
+
+delta_simple_rehash_test()
+{
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0; # runs only in the skip_sw pass; nothing is logged otherwise
+ fi
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 0
+ check_err $? "Failed to set ACL region rehash interval"
+
+ tp_record_all mlxsw:mlxsw_sp_acl_tcam_vregion_rehash 7 # interval 0: no rehash trace expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_fail $? "Rehash trace was hit even when rehash should be disabled"
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 3000
+ check_err $? "Failed to set ACL region rehash interval"
+
+ sleep 1 # NOTE(review): presumably lets the new interval take effect - confirm
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.1.0/25 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.3.0/24 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1 # only the exact-match filter covers 192.0.2.2
+ check_err $? "Did not match on correct filter"
+
+ tp_record_all mlxsw:* 3 # first window: rehash with migration expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_err $? "Rehash trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate
+ check_err $? "Migrate trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate_end
+ check_err $? "Migrate end trace was not hit"
+ tp_record_all mlxsw:* 3 # second window: rehash again, but no migration expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_err $? "Rehash trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate
+ check_fail $? "Migrate trace was hit when no migration should happen"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate_end
+ check_fail $? "Migrate end trace was hit when no migration should happen"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 102 2 # second packet on handle 102
+ check_err $? "Did not match on correct filter after rehash"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "delta simple rehash test ($tcflags)"
+}
+
+delta_simple_ipv6_rehash_test()
+{
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0; # runs only in the skip_sw pass; nothing is logged otherwise
+ fi
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 0
+ check_err $? "Failed to set ACL region rehash interval"
+
+ tp_record_all mlxsw:mlxsw_sp_acl_tcam_vregion_rehash 7 # interval 0: no rehash trace expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_fail $? "Rehash trace was hit even when rehash should be disabled"
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 3000
+ check_err $? "Failed to set ACL region rehash interval"
+
+ sleep 1 # NOTE(review): presumably lets the new interval take effect - confirm
+
+ tc filter add dev $h2 ingress protocol ipv6 pref 1 handle 101 flower \
+ $tcflags dst_ip 2001:db8:1::0/121 action drop
+ tc filter add dev $h2 ingress protocol ipv6 pref 2 handle 102 flower \
+ $tcflags dst_ip 2001:db8:2::2 action drop
+ tc filter add dev $h2 ingress protocol ipv6 pref 3 handle 103 flower \
+ $tcflags dst_ip 2001:db8:3::0/120 action drop
+
+ $MZ $h1 -6 -c 1 -p 64 -a $h1mac -b $h2mac \
+ -A 2001:db8:2::1 -B 2001:db8:2::2 -t udp -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1 # only the exact-match filter covers 2001:db8:2::2
+ check_err $? "Did not match on correct filter"
+
+ tp_record_all mlxsw:* 3 # first window: rehash with migration expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_err $? "Rehash trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate
+ check_err $? "Migrate trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate_end
+ check_err $? "Migrate end trace was not hit"
+ tp_record_all mlxsw:* 3 # second window: rehash again, but no migration expected
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_err $? "Rehash trace was not hit"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate
+ check_fail $? "Migrate trace was hit when no migration should happen"
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_migrate_end
+ check_fail $? "Migrate end trace was hit when no migration should happen"
+
+ $MZ $h1 -6 -c 1 -p 64 -a $h1mac -b $h2mac \
+ -A 2001:db8:2::1 -B 2001:db8:2::2 -t udp -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 102 2 # second packet on handle 102
+ check_err $? "Did not match on correct filter after rehash"
+
+ tc filter del dev $h2 ingress protocol ipv6 pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 1 handle 101 flower
+
+ log_test "delta simple IPv6 rehash test ($tcflags)"
+}
+
+TEST_RULE_BASE=256 # added to a rule index to form its tc handle / address suffix
+declare -a test_rules_inserted # per index: "true" while an add for that rule has been queued
+
+test_rule_add()
+{
+ local iface=$1
+ local tcflags=$2
+ local index=$3
+
+ if ! [ ${test_rules_inserted[$index]} ] ; then
+ test_rules_inserted[$index]=false
+ fi
+ if ${test_rules_inserted[$index]} ; then
+ return
+ fi
+
+ local number=$(( $index + $TEST_RULE_BASE ))
+ printf -v hexnumber '%x' $number
+
+ batch="${batch}filter add dev $iface ingress protocol ipv6 pref 1 \
+ handle $number flower $tcflags \
+ src_ip 2001:db8:1::$hexnumber action drop\n"
+ test_rules_inserted[$index]=true
+}
+
+test_rule_del()
+{
+ local iface=$1
+ local index=$2
+
+ if ! [ ${test_rules_inserted[$index]} ] ; then
+ test_rules_inserted[$index]=false
+ fi
+ if ! ${test_rules_inserted[$index]} ; then
+ return
+ fi
+
+ local number=$(( $index + $TEST_RULE_BASE ))
+ printf -v hexnumber '%x' $number
+
+ batch="${batch}filter del dev $iface ingress protocol ipv6 pref 1 \
+ handle $number flower\n"
+ test_rules_inserted[$index]=false
+}
+
+# Toggle rule <index>: queue a delete if it is marked as inserted,
+# otherwise queue an add.
+#
+# Arguments: $1 - interface, $2 - tc flags, $3 - rule index
+test_rule_add_or_remove()
+{
+	local iface=$1
+	local tcflags=$2
+	local index=$3
+
+	case "${test_rules_inserted[$index]}" in
+	true)
+		test_rule_del $iface $index
+		;;
+	*)
+		# Covers both "false" and an index that was never touched.
+		test_rule_add $iface $tcflags $index
+		;;
+	esac
+}
+
+# Walk indexes 1..total_count and toggle pseudo-random runs of rules.
+#
+# The walk alternates between skipping a run of 1..MAXSKIP indexes and
+# toggling a run of 1..MAXCOUNT indexes; both lengths come from $RANDOM,
+# so seeding RANDOM beforehand makes the selection reproducible.
+#
+# Arguments: $1 - interface, $2 - tc flags, $3 - number of indexes to walk
+test_rule_add_or_remove_random_batch()
+{
+	local iface=$1
+	local tcflags=$2
+	local total_count=$3
+	local skip=0
+	local count=0
+	local MAXSKIP=20
+	local MAXCOUNT=20
+	# Keep the counter local so the caller's own "i" is not clobbered.
+	local i
+
+	for ((i = 1; i <= total_count; i++)); do
+		if ((skip == 0 && count == 0)); then
+			# skip is drawn before count; the order fixes the sequence.
+			((skip = RANDOM % MAXSKIP + 1))
+			((count = RANDOM % MAXCOUNT + 1))
+		fi
+		if ((skip != 0)); then
+			((skip -= 1))
+		else
+			((count -= 1))
+			test_rule_add_or_remove $iface $tcflags $i
+		fi
+	done
+}
+
+delta_massive_ipv6_rehash_test()
+{
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0;
+ fi
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 0
+ check_err $? "Failed to set ACL region rehash interval"
+
+ tp_record_all mlxsw:mlxsw_sp_acl_tcam_vregion_rehash 7
+ tp_check_hits_any mlxsw:mlxsw_sp_acl_tcam_vregion_rehash
+ check_fail $? "Rehash trace was hit even when rehash should be disabled"
+
+ RANDOM=4432897
+ declare batch=""
+ test_rule_add_or_remove_random_batch $h2 $tcflags 5000
+
+ echo -n -e $batch | tc -b -
+
+ declare batch=""
+ test_rule_add_or_remove_random_batch $h2 $tcflags 5000
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 3000
+ check_err $? "Failed to set ACL region rehash interval"
+
+ sleep 1
+
+ tc filter add dev $h2 ingress protocol ipv6 pref 1 handle 101 flower \
+ $tcflags dst_ip 2001:db8:1::0/121 action drop
+ tc filter add dev $h2 ingress protocol ipv6 pref 2 handle 102 flower \
+ $tcflags dst_ip 2001:db8:2::2 action drop
+ tc filter add dev $h2 ingress protocol ipv6 pref 3 handle 103 flower \
+ $tcflags dst_ip 2001:db8:3::0/120 action drop
+
+ $MZ $h1 -6 -c 1 -p 64 -a $h1mac -b $h2mac \
+ -A 2001:db8:2::1 -B 2001:db8:2::2 -t udp -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ echo -n -e $batch | tc -b -
+
+ devlink dev param set $DEVLINK_DEV \
+ name acl_region_rehash_interval cmode runtime value 0
+ check_err $? "Failed to set ACL region rehash interval"
+
+ $MZ $h1 -6 -c 1 -p 64 -a $h1mac -b $h2mac \
+ -A 2001:db8:2::1 -B 2001:db8:2::2 -t udp -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_fail $? "Matched a wrong filter after rehash"
+
+ tc_check_packets "dev $h2 ingress" 102 2
+ check_err $? "Did not match on correct filter after rehash"
+
+ tc filter del dev $h2 ingress protocol ipv6 pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 1 handle 101 flower
+
+ declare batch=""
+ for i in {1..5000}; do
+ test_rule_del $h2 $tcflags $i
+ done
+ echo -e $batch | tc -b -
+
+ log_test "delta massive IPv6 rehash test ($tcflags)"
+}
+
+bloom_simple_test()
+{
+ # Bloom filter requires that the eRP table is used. This test
+ # verifies that Bloom filter is not harming correctness of ACLs.
+ # First, make sure that eRP table is used and then set rule patterns
+ # which are distant enough and will result in skipping a lookup after
+ # consulting the Bloom filter. Although some eRP lookups are skipped,
+ # the correct filter should be hit.
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 5 handle 104 flower \
+ $tcflags dst_ip 198.51.100.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.0.0/8 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1 # 101 and 103 both cover 192.0.2.2; pref 1 expected
+ check_err $? "Two filters - did not match highest priority"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 198.51.100.1 -B 198.51.100.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 104 1 # only handle 104 covers 198.51.100.2
+ check_err $? "Single filter - did not match"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1 # with 101 gone the /8 filter is expected
+ check_err $? "Low prio filter - did not match"
+
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 198.0.0.0/8 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 198.51.100.1 -B 198.51.100.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # new pref 2 filter outranks pref 5 (handle 104)
+ check_err $? "Two filters - did not match highest priority after add"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 5 handle 104 flower
+
+ log_test "bloom simple test ($tcflags)"
+}
+
+# Send one packet through a series of chains whose templates differ in
+# key size, and check that it matches the "goto"/drop rule in each one.
+#
+# Chains 1..NUM_CHAINS are created here and removed at the end; chain 0
+# is used as-is. Every chain also gets one extra pref-2 rule.
+#
+# Globals: reads h1, h2, h1mac, h2mac, MZ, tcflags; writes RET,
+#          NUM_CHAINS and BASE_INDEX.
+bloom_complex_test()
+{
+	# Bloom filter index computation is affected by the region ID, the
+	# eRP ID and the region key size. In order to exercise those parts
+	# of the Bloom filter code, use a series of regions, each with a
+	# different key size, and send a packet that should hit all of them.
+	local index
+	local i
+
+	RET=0
+	NUM_CHAINS=4
+	BASE_INDEX=100
+
+	# Create chain with up to 2 key blocks (ip_proto only)
+	tc chain add dev $h2 ingress chain 1 protocol ip flower \
+		ip_proto tcp &> /dev/null
+	# Create chain with 2-4 key blocks (ip_proto, src MAC)
+	tc chain add dev $h2 ingress chain 2 protocol ip flower \
+		ip_proto tcp \
+		src_mac 00:00:00:00:00:00/FF:FF:FF:FF:FF:FF &> /dev/null
+	# Create chain with 4-8 key blocks (ip_proto, src & dst MAC, IPv4 dest)
+	tc chain add dev $h2 ingress chain 3 protocol ip flower \
+		ip_proto tcp \
+		dst_mac 00:00:00:00:00:00/FF:FF:FF:FF:FF:FF \
+		src_mac 00:00:00:00:00:00/FF:FF:FF:FF:FF:FF \
+		dst_ip 0.0.0.0/32 &> /dev/null
+	# Default chain contains all fields and therefore is 8-12 key blocks
+	tc chain add dev $h2 ingress chain 4
+
+	# We need at least 2 rules in every region to have eRP table active
+	# so create a dummy rule per chain using a different pattern.
+	# Dummy handles count down from BASE_INDEX - 1 (99 for chain 0).
+	for ((i = 0; i <= NUM_CHAINS; i++)); do
+		index=$((BASE_INDEX - 1 - i))
+		tc filter add dev $h2 ingress chain $i protocol ip \
+			pref 2 handle $index flower \
+			$tcflags ip_proto tcp action drop
+	done
+
+	# Add rules to test Bloom filter, each in a different chain.
+	# Handles count up from BASE_INDEX + 1 (101 for chain 0).
+	index=$BASE_INDEX
+	tc filter add dev $h2 ingress protocol ip \
+		pref 1 handle $((++index)) flower \
+		$tcflags dst_ip 192.0.0.0/16 action goto chain 1
+	tc filter add dev $h2 ingress chain 1 protocol ip \
+		pref 1 handle $((++index)) flower \
+		$tcflags action goto chain 2
+	tc filter add dev $h2 ingress chain 2 protocol ip \
+		pref 1 handle $((++index)) flower \
+		$tcflags src_mac $h1mac action goto chain 3
+	tc filter add dev $h2 ingress chain 3 protocol ip \
+		pref 1 handle $((++index)) flower \
+		$tcflags dst_ip 192.0.0.0/8 action goto chain 4
+	tc filter add dev $h2 ingress chain 4 protocol ip \
+		pref 1 handle $((++index)) flower \
+		$tcflags src_ip 192.0.2.0/24 action drop
+
+	# Send a packet that is supposed to hit all chains
+	$MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip -q
+
+	for ((i = 0; i <= NUM_CHAINS; i++)); do
+		index=$((BASE_INDEX + i + 1))
+		tc_check_packets "dev $h2 ingress" $index 1
+		check_err $? "Did not match chain $i"
+	done
+
+	# Rules cleanup, highest chain first
+	for ((i = NUM_CHAINS; i >= 0; i--)); do
+		index=$((BASE_INDEX - i - 1))
+		tc filter del dev $h2 ingress chain $i \
+			pref 2 handle $index flower
+		index=$((BASE_INDEX + i + 1))
+		tc filter del dev $h2 ingress chain $i \
+			pref 1 handle $index flower
+	done
+
+	# Chains cleanup (chain 0 was not created here, so it is left alone)
+	for ((i = NUM_CHAINS; i >= 1; i--)); do
+		tc chain del dev $h2 ingress chain $i
+	done
+
+	log_test "bloom complex test ($tcflags)"
+}
+
+
+bloom_delta_test()
+{
+ # When multiple masks are used, the eRP table is activated. When
+ # masks are close enough (delta) the masks reside on the same
+ # eRP table. This test verifies that the eRP table is correctly
+ # allocated and used in delta condition and that Bloom filter is
+ # still functional with delta.
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.1.0.0/16 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.1.2.1 -B 192.1.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1 # only the /16 filter is installed at this point
+ check_err $? "Single filter - did not match"
+
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.2.1.0/24 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.2.1.1 -B 192.2.1.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1 # 192.2.1.2 is covered by the /24 filter only
+ check_err $? "Delta filters - did not match second filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "bloom delta test ($tcflags)"
+}
+
+# Keep adding filters with distinct prefix lengths, chain by chain, until
+# an insertion fails, then remove everything that was added.
+#
+# The test fails if no insertion is ever rejected. Runs only when
+# $tcflags is "skip_sw".
+#
+# Globals: reads h2, tcflags; writes RET.
+max_erp_entries_test()
+{
+	# The number of eRP entries is limited. Once the maximum number of eRPs
+	# has been reached, filters cannot be added. This test verifies that
+	# when this limit is reached, insertion fails without crashing.
+
+	RET=0
+
+	local num_masks=32
+	local num_regions=15
+	local chain_failed
+	local mask_failed
+	local ret
+	# Loop counters stay local so they do not leak into other tests.
+	local i
+	local j
+
+	if [[ "$tcflags" != "skip_sw" ]]; then
+		return 0;
+	fi
+
+	# One chain per region; inside it one filter per prefix length,
+	# from /32 down to /0. The prefix length doubles as the handle.
+	for ((i = 1; i < num_regions; i++)); do
+		for ((j = num_masks; j >= 0; j--)); do
+			tc filter add dev $h2 ingress chain $i protocol ip \
+				pref $i handle $j flower $tcflags \
+				dst_ip 192.1.0.0/$j &> /dev/null
+			ret=$?
+
+			if [ $ret -ne 0 ]; then
+				chain_failed=$i
+				mask_failed=$j
+				break 2
+			fi
+		done
+	done
+
+	# We expect to exceed the maximum number of eRP entries, so that
+	# insertion eventually fails. Otherwise, the test should be adjusted to
+	# add more filters.
+	check_fail $ret "expected to exceed number of eRP entries"
+
+	# "i" still holds the chain where the walk stopped. Deleting filters
+	# that were never added fails quietly (output is discarded).
+	for ((; i >= 1; i--)); do
+		for ((j = 0; j <= num_masks; j++)); do
+			tc filter del dev $h2 ingress chain $i protocol ip \
+				pref $i handle $j flower &> /dev/null
+		done
+	done
+
+	log_test "max eRP entries test ($tcflags). " \
+		"max chain $chain_failed, mask $mask_failed"
+}
+
+# Add filters at increasing preferences, alternating IPv4 and IPv6, until
+# an insertion fails, then remove them again.
+#
+# The test fails if no insertion is ever rejected. Runs only when
+# $tcflags is "skip_sw".
+#
+# Globals: reads h2, tcflags; writes RET.
+max_group_size_test()
+{
+	# The number of ACLs in an ACL group is limited. Once the maximum
+	# number of ACLs has been reached, filters cannot be added. This test
+	# verifies that when this limit is reached, insertion fails without
+	# crashing.
+
+	RET=0
+
+	local num_acls=32
+	local max_size
+	local ret
+	# Loop counter stays local so it does not leak into other tests.
+	local i
+
+	if [[ "$tcflags" != "skip_sw" ]]; then
+		return 0;
+	fi
+
+	# Odd preferences carry an IPv4 filter, even ones an IPv6 filter.
+	for ((i = 1; i < num_acls; i++)); do
+		if ((i % 2 == 1)); then
+			tc filter add dev $h2 ingress pref $i proto ipv4 \
+				flower $tcflags dst_ip 198.51.100.1/32 \
+				ip_proto tcp tcp_flags 0x01/0x01 \
+				action drop &> /dev/null
+		else
+			tc filter add dev $h2 ingress pref $i proto ipv6 \
+				flower $tcflags dst_ip 2001:db8:1::1/128 \
+				action drop &> /dev/null
+		fi
+
+		ret=$?
+		if ((ret != 0)); then
+			# The last preference that was accepted.
+			max_size=$((i - 1))
+			break
+		fi
+	done
+
+	# We expect to exceed the maximum number of ACLs in a group, so that
+	# insertion eventually fails. Otherwise, the test should be adjusted to
+	# add more filters.
+	check_fail $ret "expected to exceed number of ACLs in a group"
+
+	# "i" still holds the preference where the walk stopped.
+	for ((; i >= 1; i--)); do
+		if ((i % 2 == 1)); then
+			tc filter del dev $h2 ingress pref $i proto ipv4 \
+				flower $tcflags dst_ip 198.51.100.1/32 \
+				ip_proto tcp tcp_flags 0x01/0x01 \
+				action drop &> /dev/null
+		else
+			tc filter del dev $h2 ingress pref $i proto ipv6 \
+				flower $tcflags dst_ip 2001:db8:1::1/128 \
+				action drop &> /dev/null
+		fi
+	done
+
+	log_test "max ACL group size test ($tcflags). max size $max_size"
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]} # first test port; the tests send traffic from it
+ h2=${NETIFS[p2]} # second test port; the tests attach their ingress filters here
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ h2_destroy # teardown mirrors setup_prepare in reverse order
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Tear the topology down on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: run the tests with tcflags as set earlier in the script.
+tests_run
+
+# The second pass needs offload; without it, log a failure and stop.
+if ! tc_offload_check; then
+	check_err 1 "Could not test offloaded functionality"
+	log_test "mlxsw-specific tests for tc flower"
+	exit
+fi
+
+# Second pass: same tests, this time with skip_sw.
+tcflags="skip_sw"
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh
new file mode 100644
index 000000000..efd798a85
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_flower_scale.sh
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_flower_scale.sh
+
+# Print the number of tc flower filters to request.
+# Arguments: $1 - non-zero when the run is expected to fail
+# Outputs:   the target on stdout; one more than the limit when $1 is set
+tc_flower_get_target()
+{
+	local should_fail=$1
+	shift
+
+	# The driver associates a counter with each tc filter, which means the
+	# number of supported filters is bounded by the number of available
+	# counters.
+	# Currently, the driver supports 30K (30,720) flow counters and six of
+	# these are used for multicast routing.
+	local target=30714
+
+	if ((should_fail)); then
+		target=$((target + 1))
+	fi
+	echo $target
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_police_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_police_scale.sh
new file mode 100644
index 000000000..e79ac0dad
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum-2/tc_police_scale.sh
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_police_scale.sh
+
+# Print the number of policers to request, based on the size of the
+# global_policers/single_rate_policers devlink resource.
+# Arguments: $1 - non-zero when the run is expected to fail
+# Outputs:   the resource size on stdout, plus one when $1 is set
+tc_police_get_target()
+{
+	local should_fail=$1
+	shift
+	local target
+
+	target=$(devlink_resource_size_get global_policers single_rate_policers)
+
+	if ((should_fail)); then
+		target=$((target + 1))
+	fi
+	echo $target
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
new file mode 100644
index 000000000..73035e250
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_lib_spectrum.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source "../../../../net/forwarding/devlink_lib.sh"
+
+if [ "$DEVLINK_VIDDID" != "15b3:cb84" ]; then # only this vendor:device ID is accepted
+ echo "SKIP: test is tailored for Mellanox Spectrum"
+ exit 1 # NOTE(review): a skip exits with status 1 here - confirm callers expect that
+fi
+
+# Snapshot of the KVD resources, needed for returning to default
+declare -A KVD_DEFAULTS
+
+KVD_CHILDREN="linear hash_single hash_double" # resources directly under "kvd"
+KVDL_CHILDREN="singles chunks large_chunks" # resources under "kvd linear"
+
+# Set every KVD partition, and every sub-partition of "kvd linear", to
+# the "size_min" value reported for it. No reload is done here.
+devlink_sp_resource_minimize()
+{
+	local min_size
+	local child
+
+	for child in $KVD_CHILDREN; do
+		min_size=$(devlink_resource_get kvd "$child" | jq '.["size_min"]')
+		devlink_resource_size_set "$min_size" kvd "$child"
+	done
+
+	for child in $KVDL_CHILDREN; do
+		min_size=$(devlink_resource_get kvd linear "$child" |
+			jq '.["size_min"]')
+		devlink_resource_size_set "$min_size" kvd linear "$child"
+	done
+}
+
+# Restore every KVD partition to the size recorded in KVD_DEFAULTS.
+#
+# Only partitions whose current size differs are written, and a single
+# reload is issued at the end if anything was changed.
+#
+# Globals: reads KVD_DEFAULTS, KVD_CHILDREN, KVDL_CHILDREN
+devlink_sp_size_kvd_to_default()
+{
+	local need_reload=0
+	# Declared once up front: "current_size" used to leak into the global
+	# scope, and "local size=$(...)" hid the substitution's exit status.
+	local current_size
+	local size
+	local i
+
+	for i in $KVD_CHILDREN; do
+		size=$(echo "${KVD_DEFAULTS[kvd_$i]}" | jq '.["size"]')
+		current_size=$(devlink_resource_size_get kvd "$i")
+
+		if [ "$size" -ne "$current_size" ]; then
+			devlink_resource_size_set "$size" kvd "$i"
+			need_reload=1
+		fi
+	done
+
+	for i in $KVDL_CHILDREN; do
+		size=$(echo "${KVD_DEFAULTS[kvd_linear_$i]}" | \
+			jq '.["size"]')
+		current_size=$(devlink_resource_size_get kvd linear "$i")
+
+		if [ "$size" -ne "$current_size" ]; then
+			devlink_resource_size_set "$size" kvd linear "$i"
+			need_reload=1
+		fi
+	done
+
+	if [ "$need_reload" -ne "0" ]; then
+		devlink_reload
+	fi
+}
+
+# Record the current devlink description of "kvd" and of each of its
+# partitions in KVD_DEFAULTS, under the keys "kvd", "kvd_<child>" and
+# "kvd_linear_<child>".
+devlink_sp_read_kvd_defaults()
+{
+	local child
+
+	KVD_DEFAULTS[kvd]=$(devlink_resource_get "kvd")
+
+	for child in $KVD_CHILDREN; do
+		KVD_DEFAULTS[kvd_$child]=$(devlink_resource_get kvd "$child")
+	done
+
+	for child in $KVDL_CHILDREN; do
+		KVD_DEFAULTS[kvd_linear_$child]=$(devlink_resource_get kvd linear "$child")
+	done
+}
+
+KVD_PROFILES="default scale ipv4_max" # names accepted by devlink_sp_resource_kvd_profile_set
+
+# Apply one of the named KVD partitioning profiles and reload.
+# Arguments: $1 - profile name: "scale", "ipv4_max" or "default"
+# An unknown name is reported through check_err and nothing is changed.
+devlink_sp_resource_kvd_profile_set()
+{
+	local profile=$1
+	local sz_linear sz_singles sz_chunks sz_large_chunks
+	local sz_hash_single sz_hash_double
+
+	# "scale" and "ipv4_max" share the linear layout and differ only in
+	# how the hash area is split.
+	case "$profile" in
+	scale)
+		sz_linear=64000
+		sz_singles=15616
+		sz_chunks=32000
+		sz_large_chunks=16384
+		sz_hash_single=128000
+		sz_hash_double=48000
+		;;
+	ipv4_max)
+		sz_linear=64000
+		sz_singles=15616
+		sz_chunks=32000
+		sz_large_chunks=16384
+		sz_hash_single=144000
+		sz_hash_double=32768
+		;;
+	default)
+		sz_linear=98304
+		sz_singles=16384
+		sz_chunks=49152
+		sz_large_chunks=32768
+		sz_hash_single=87040
+		sz_hash_double=60416
+		;;
+	*)
+		check_err 1 "Unknown profile $profile"
+		return
+		;;
+	esac
+
+	devlink_resource_size_set "$sz_linear" kvd linear
+	devlink_resource_size_set "$sz_singles" kvd linear singles
+	devlink_resource_size_set "$sz_chunks" kvd linear chunks
+	devlink_resource_size_set "$sz_large_chunks" kvd linear large_chunks
+	devlink_resource_size_set "$sz_hash_single" kvd hash_single
+	devlink_resource_size_set "$sz_hash_double" kvd hash_double
+	devlink_reload
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
new file mode 100755
index 000000000..6f2683cbc
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/devlink_resources.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../../net/forwarding # resolved relative to this script's location
+
+NUM_NETIFS=1 # assigned before lib.sh is sourced
+source $lib_dir/lib.sh
+source devlink_lib_spectrum.sh # relative to the current directory, not to lib_dir
+
+setup_prepare()
+{
+ devlink_sp_read_kvd_defaults # fills KVD_DEFAULTS, which cleanup() restores from
+}
+
+cleanup()
+{
+ pre_cleanup
+ devlink_sp_size_kvd_to_default # put the KVD sizes back to the recorded defaults
+}
+
+trap cleanup EXIT # cleanup runs on every exit path from here on
+
+setup_prepare
+
+# Apply each profile in KVD_PROFILES and log one result per profile.
+profiles_test()
+{
+	local profile
+
+	log_info "Running profile tests"
+
+	# Default is explicitly tested once more at the end to ensure it's
+	# actually applied after the other profiles.
+	for profile in $KVD_PROFILES "default"; do
+		RET=0
+		devlink_sp_resource_kvd_profile_set $profile
+		log_test "'$profile' profile"
+	done
+}
+
+# For each KVD partition: shrink it to its reported minimum, reload,
+# restore the defaults and log the result.
+resources_min_test()
+{
+	local size
+	local part
+	local sub
+
+	log_info "Running KVD-minimum tests"
+
+	for part in $KVD_CHILDREN; do
+		RET=0
+		size=$(devlink_resource_get kvd "$part" | jq '.["size_min"]')
+		devlink_resource_size_set "$size" kvd "$part"
+
+		# In case of linear, need to minimize sub-resources as well
+		case "$part" in
+		linear)
+			for sub in $KVDL_CHILDREN; do
+				devlink_resource_size_set 0 kvd linear "$sub"
+			done
+			;;
+		esac
+
+		devlink_reload
+		devlink_sp_size_kvd_to_default
+		log_test "'$part' minimize [$size]"
+	done
+}
+
+resources_max_test()
+{
+ local min_size
+ local size
+ local i
+ local j
+
+ log_info "Running KVD-maximum tests"
+ for i in $KVD_CHILDREN; do
+ RET=0
+ devlink_sp_resource_minimize
+
+ # Calculate the maximum possible size for the given partition
+ size=$(devlink_resource_size_get kvd)
+ for j in $KVD_CHILDREN; do
+ if [ "$i" != "$j" ]; then
+ min_size=$(devlink_resource_get kvd "$j" | \
+ jq '.["size_min"]')
+ size=$((size - min_size))
+ fi
+ done
+
+ # Test almost maximum size
+ devlink_resource_size_set "$((size - 128))" kvd "$i"
+ devlink_reload
+ log_test "'$i' almost maximize [$((size - 128))]"
+
+ # Test above maximum size
+ devlink resource set "$DEVLINK_DEV" \
+ path "kvd/$i" size $((size + 128)) &> /dev/null
+ check_fail $? "Set kvd/$i to size $((size + 128)) should fail"
+ log_test "'$i' Overflow rejection [$((size + 128))]"
+
+ # Test maximum size
+ if [ "$i" == "hash_single" ] || [ "$i" == "hash_double" ]; then
+ echo "SKIP: Observed problem with exact max $i"
+ continue
+ fi
+
+ devlink_resource_size_set "$size" kvd "$i"
+ devlink_reload
+ log_test "'$i' maximize [$size]"
+
+ devlink_sp_size_kvd_to_default
+ done
+}
+
+profiles_test
+resources_min_test
+resources_max_test
+
+exit "$RET"
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
new file mode 100644
index 000000000..f7c168dec
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/mirror_gre_scale.sh
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../mirror_gre_scale.sh
+
+# Print the number of mirror sessions to request, based on the size of
+# the span_agents devlink resource.
+# Arguments: $1 - non-zero when the run is expected to fail
+# Outputs:   the resource size on stdout, plus one when $1 is set
+mirror_gre_get_target()
+{
+	local should_fail=$1
+	shift
+	local target
+
+	target=$(devlink_resource_size_get span_agents)
+
+	if ((should_fail)); then
+		target=$((target + 1))
+	fi
+	echo $target
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
new file mode 100755
index 000000000..43f662401
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/resource_scale.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../../net/forwarding # resolved relative to this script's location
+
+NUM_NETIFS=6 # default; a test can override it through <TEST>_NUM_NETIFS below
+source $lib_dir/lib.sh
+source $lib_dir/tc_common.sh
+source devlink_lib_spectrum.sh
+
+current_test="" # name of the scale test in progress; read by cleanup()
+
+cleanup()
+{
+ pre_cleanup
+ if [ ! -z $current_test ]; then
+ ${current_test}_cleanup
+ fi
+ devlink_sp_size_kvd_to_default
+}
+
+# Record the KVD defaults first so the EXIT handler can restore them.
+devlink_sp_read_kvd_defaults
+trap cleanup EXIT
+
+ALL_TESTS="router tc_flower mirror_gre tc_police"
+for current_test in ${TESTS:-$ALL_TESTS}; do
+	# Each <test>_scale.sh supplies the <test>_get_target,
+	# <test>_setup_prepare, <test>_test and <test>_cleanup hooks
+	# called below.
+	source ${current_test}_scale.sh
+
+	# A test may override the interface count via <TEST>_NUM_NETIFS.
+	num_netifs_var=${current_test^^}_NUM_NETIFS
+	num_netifs=${!num_netifs_var:-$NUM_NETIFS}
+
+	for profile in $KVD_PROFILES; do
+		RET=0
+		devlink_sp_resource_kvd_profile_set $profile
+		if [[ $RET -gt 0 ]]; then
+			log_test "'$current_test' [$profile] setting"
+			continue
+		fi
+
+		# Run once at the target (should pass) and once past it
+		# (should fail).
+		for should_fail in 0 1; do
+			RET=0
+			target=$(${current_test}_get_target "$should_fail")
+			${current_test}_setup_prepare
+			setup_wait $num_netifs
+			${current_test}_test "$target" "$should_fail"
+			${current_test}_cleanup
+			if [[ "$should_fail" -eq 0 ]]; then
+				log_test "'$current_test' [$profile] $target"
+			else
+				log_test "'$current_test' [$profile] overflow $target"
+			fi
+		done
+	done
+done
+# Cleared so the EXIT handler does not run a per-test cleanup again.
+current_test=""
+
+# RET is reset to 0 before every sub-test above, so on its own it only
+# reflects the last one. Prefer the accumulated EXIT_STATUS when the
+# sourced library provides it (NOTE(review): expected to be maintained by
+# log_test in lib.sh - confirm); fall back to RET otherwise.
+exit "${EXIT_STATUS:-$RET}"
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
new file mode 100644
index 000000000..21c4697d5
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/router_scale.sh
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../router_scale.sh
+
+# Print the number of routes to test with, derived from the size of the
+# "kvd hash_single" devlink resource.
+# Arguments: $1 - 0 for the passing case (85% of the size), anything else for
+#                 the overflow case (size + 1)
+router_get_target()
+{
+	local should_fail=$1
+	local hash_single_size
+
+	hash_single_size=$(devlink_resource_size_get kvd hash_single)
+
+	if [[ $should_fail -eq 0 ]]; then
+		echo $((hash_single_size * 85 / 100))
+	else
+		echo $((hash_single_size + 1))
+	fi
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
new file mode 100644
index 000000000..f9bfd8937
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_flower_scale.sh
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_flower_scale.sh
+
+# Print the number of flower rules to test with.
+# Arguments: $1 - 0 for the passing case (5631), non-zero for the overflow
+#                 case (5632)
+tc_flower_get_target()
+{
+	local should_fail=$1; shift
+
+	# 6144 (6x1024) is the theoretical maximum.
+	# One bank of 512 rules is taken by the 18-byte MC router rule.
+	# One rule is the ACL catch-all.
+	# 6144 - 512 - 1 = 5631
+	local target=5631
+
+	# Go one past the limit unless the passing case was requested.
+	((! should_fail)) || target=$((target + 1))
+	echo $target
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_police_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_police_scale.sh
new file mode 100644
index 000000000..e79ac0dad
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/spectrum/tc_police_scale.sh
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+source ../tc_police_scale.sh
+
+# Print the number of policers to test with, based on the size of the
+# "global_policers single_rate_policers" devlink resource.
+# Arguments: $1 - 0 for the passing case (the size itself), non-zero for the
+#                 overflow case (size + 1)
+tc_police_get_target()
+{
+	local should_fail=$1; shift
+	local target
+
+	target=$(devlink_resource_size_get global_policers single_rate_policers)
+
+	# Go one past the limit unless the passing case was requested.
+	((! should_fail)) || target=$((target + 1))
+	echo $target
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh
new file mode 100755
index 000000000..20ed98fe5
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_action_hw_stats.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ default_hw_stats_test
+ immediate_hw_stats_test
+ delayed_hw_stats_test
+ disabled_hw_stats_test
+"
+NUM_NETIFS=2
+
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Configure $h1 through simple_if_init, passing it 192.0.2.1/24.
+h1_create()
+{
+	local h1_addr=192.0.2.1/24
+
+	simple_if_init $h1 $h1_addr
+}
+
+# Counterpart of h1_create: hand $h1 and 192.0.2.1/24 to simple_if_fini.
+h1_destroy()
+{
+	local h1_addr=192.0.2.1/24
+
+	simple_if_fini $h1 $h1_addr
+}
+
+# Configure $swp1 through simple_if_init with 192.0.2.2/24 and attach a
+# clsact qdisc to it so that filters can be added.
+switch_create()
+{
+	local swp1_addr=192.0.2.2/24
+
+	simple_if_init $swp1 $swp1_addr
+	tc qdisc add dev $swp1 clsact
+}
+
+# Counterpart of switch_create: remove the clsact qdisc from $swp1, then hand
+# the port and 192.0.2.2/24 to simple_if_fini.
+switch_destroy()
+{
+	local swp1_addr=192.0.2.2/24
+
+	tc qdisc del dev $swp1 clsact
+	simple_if_fini $swp1 $swp1_addr
+}
+
+# Common body of the hw_stats tests. Installs one skip_sw flower drop rule on
+# $swp1 ingress with the requested hw_stats option, compares the "occ" field
+# of the "counters"/"flow" devlink resource before and after, runs $MZ from
+# $h1 with -c 1 and checks the rule's packet count, then removes the rule.
+# Arguments:
+#   $1 - name used in the log and error messages
+#   $2 - hw_stats option appended to the action (may be empty)
+#   $3 - expected change of the occupancy
+#   $4 - packet count handed to tc_check_packets
+hw_stats_test()
+{
+	RET=0
+
+	local name=$1
+	local action_hw_stats=$2
+	local occ_delta=$3
+	local expected_packet_count=$4
+	local occ_before occ_after occ_wanted
+
+	occ_before=$(devlink_resource_get "counters" "flow" | jq '.["occ"]')
+
+	tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 action drop $action_hw_stats
+	check_err $? "Failed to add rule with $name hw_stats"
+
+	occ_after=$(devlink_resource_get "counters" "flow" | jq '.["occ"]')
+	occ_wanted=$((occ_before + occ_delta))
+	[ "$occ_after" == "$occ_wanted" ]
+	check_err $? "Expected occupancy of $occ_wanted, got $occ_after"
+
+	$MZ $h1 -c 1 -p 64 -a $h1mac -b $swp1mac -A 192.0.2.1 -B 192.0.2.2 \
+		-t ip -q
+
+	tc_check_packets "dev $swp1 ingress" 101 $expected_packet_count
+	check_err $? "Did not match incoming packet"
+
+	tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+
+	log_test "$name hw_stats"
+}
+
+# Rule without an explicit hw_stats option: expect the occupancy to grow by 2
+# and one packet to be counted.
+default_hw_stats_test()
+{
+	local hw_stats_opt=""
+
+	hw_stats_test "default" "$hw_stats_opt" 2 1
+}
+
+# Rule with "hw_stats immediate": expect the occupancy to grow by 2 and one
+# packet to be counted.
+immediate_hw_stats_test()
+{
+	local hw_stats_opt="hw_stats immediate"
+
+	hw_stats_test "immediate" "$hw_stats_opt" 2 1
+}
+
+# Adding a rule with "hw_stats delayed" is expected to be rejected; the test
+# fails if the rule is accepted.
+delayed_hw_stats_test()
+{
+	local -a delayed_rule=(dev $swp1 ingress protocol ip pref 1 handle 101
+			       flower skip_sw dst_ip 192.0.2.2
+			       action drop hw_stats delayed)
+
+	RET=0
+
+	tc filter add "${delayed_rule[@]}"
+	check_fail $? "Unexpected success in adding rule with delayed hw_stats"
+
+	log_test "delayed hw_stats"
+}
+
+# Rule with "hw_stats disabled": expect no change of the occupancy and a
+# packet count of 0.
+disabled_hw_stats_test()
+{
+	local hw_stats_opt="hw_stats disabled"
+
+	hw_stats_test "disabled" "$hw_stats_opt" 0 0
+}
+
+# Resolve the two test interfaces and their MAC addresses, then build the
+# topology.
+# Globals: NETIFS (read); h1, h1mac, swp1, swp1mac (written)
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	h1mac=$(mac_get $h1)
+
+	swp1=${NETIFS[p2]}
+	swp1mac=$(mac_get $swp1)
+
+	vrf_prepare
+
+	h1_create
+	switch_create
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order.
+cleanup()
+{
+	local teardown_step
+
+	for teardown_step in pre_cleanup switch_destroy h1_destroy vrf_cleanup; do
+		$teardown_step
+	done
+}
+
+# Prerequisite check provided by the sourced libraries.
+# NOTE(review): presumably bails out when tc lacks hw_stats support - confirm.
+check_tc_action_hw_stats_support
+
+# Make sure cleanup() runs on every exit path from here on.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# NOTE(review): presumably runs the functions listed in ALL_TESTS - confirm
+# against lib.sh.
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
new file mode 100644
index 000000000..aa74be9f4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_flower_scale.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for resource limit of offloaded flower rules. The test adds a given
+# number of flower matches for different IPv6 addresses, then checks the offload
+# indication for all of the tc flower rules. This file contains functions to set
+# up a testing topology and run the test, and is meant to be sourced from a test
+# script that calls the testing routine with a given number of rules.
+
+TC_FLOWER_NUM_NETIFS=2
+
+# Configure $h1 through simple_if_init and attach a clsact qdisc to it.
+tc_flower_h1_create()
+{
+	local host_if=$h1
+
+	simple_if_init $host_if
+	tc qdisc add dev $host_if clsact
+}
+
+# Counterpart of tc_flower_h1_create: drop the clsact qdisc from $h1, then
+# hand the interface to simple_if_fini.
+tc_flower_h1_destroy()
+{
+	local host_if=$h1
+
+	tc qdisc del dev $host_if clsact
+	simple_if_fini $host_if
+}
+
+# Configure $h2 through simple_if_init and attach a clsact qdisc to it.
+tc_flower_h2_create()
+{
+	local host_if=$h2
+
+	simple_if_init $host_if
+	tc qdisc add dev $host_if clsact
+}
+
+# Counterpart of tc_flower_h2_create: drop the clsact qdisc from $h2, then
+# hand the interface to simple_if_fini.
+tc_flower_h2_destroy()
+{
+	local host_if=$h2
+
+	tc qdisc del dev $host_if clsact
+	simple_if_fini $host_if
+}
+
+# Resolve the two test interfaces and build the flower scale topology.
+# Globals: NETIFS (read); h1, h2 (written)
+tc_flower_setup_prepare()
+{
+	local setup_step
+
+	h1=${NETIFS[p1]}
+	h2=${NETIFS[p2]}
+
+	for setup_step in vrf_prepare tc_flower_h1_create tc_flower_h2_create; do
+		$setup_step
+	done
+}
+
+# Undo tc_flower_setup_prepare in reverse order and remove the tc batch file
+# if tc_flower_rules_create has created one.
+# Globals: TC_FLOWER_BATCH_FILE (read)
+tc_flower_cleanup()
+{
+	pre_cleanup
+
+	tc_flower_h2_destroy
+	tc_flower_h1_destroy
+
+	vrf_cleanup
+
+	# No batch file was ever created: nothing left to do.
+	[[ -v TC_FLOWER_BATCH_FILE ]] || return 0
+
+	rm -f $TC_FLOWER_BATCH_FILE
+}
+
+# Print the IPv6 address 2001:db8:1::<n>, with <n> being $1 in hexadecimal.
+tc_flower_addr()
+{
+	local rule_idx=$1; shift
+
+	printf "2001:db8:1::%x" $rule_idx
+}
+
+# Generate a tc batch file with $1 flower drop rules on $h2 ingress, one per
+# destination address from tc_flower_addr, and load it with "tc -b".
+# Globals:
+#   h2, tcflags (read)
+#   TC_FLOWER_BATCH_FILE (written) - path of the generated batch file
+# Arguments:
+#   $1 - number of rules to create
+#   $2 - passed to check_err_fail as the should_fail flag
+tc_flower_rules_create()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+	# Keep the loop counter out of the global namespace.
+	local i
+
+	TC_FLOWER_BATCH_FILE="$(mktemp)"
+
+	# One "filter add" line per rule. The file is opened once for the whole
+	# loop, and printf is used instead of a tab-sensitive <<- here-document.
+	for ((i = 0; i < count; ++i)); do
+		printf 'filter add dev %s ingress prot ipv6 pref 1000 flower %s dst_ip %s action drop\n' \
+			"$h2" "$tcflags" "$(tc_flower_addr $i)"
+	done >> $TC_FLOWER_BATCH_FILE
+
+	tc -b $TC_FLOWER_BATCH_FILE
+	check_err_fail $should_fail $? "Rule insertion"
+}
+
+# Create $1 flower rules and compare the number of in_hw entries reported for
+# flower filters on $h2 ingress against $1.
+# Arguments:
+#   $1 - number of rules to create
+#   $2 - passed to check_err_fail as the should_fail flag
+__tc_flower_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+	# Local, so the counter does not leak into the sourcing script.
+	local offload_count
+
+	tc_flower_rules_create $count $should_fail
+
+	# The comparison below subtracts one from the number of lines counted.
+	# NOTE(review): presumably one extra flower entry without rule options
+	# is listed - confirm against the tc JSON output.
+	offload_count=$(tc -j -s filter show dev $h2 ingress |
+			jq -r '[ .[] | select(.kind == "flower") | .options | .in_hw ]' |
+			jq .[] | wc -l)
+	[[ $((offload_count - 1)) -eq $count ]]
+	check_err_fail $should_fail $? "Attempt to offload $count rules (actual result $((offload_count - 1)))"
+}
+
+# Entry point of the flower scale test.
+# Arguments:
+#   $1 - number of rules to offload (at most 65536)
+#   $2 - should_fail flag forwarded to __tc_flower_test
+# Globals: tcflags (written, set to "skip_sw")
+tc_flower_test()
+{
+	local num_rules=$1; shift
+	local should_fail=$1; shift
+
+	# We use lower 16 bits of IPv6 address for match. Also there are only 16
+	# bits of rule priority space.
+	if ((num_rules > 65536)); then
+		check_err 1 "Invalid count of $num_rules. At most 65536 rules supported"
+		return
+	fi
+
+	tc_offload_check $TC_FLOWER_NUM_NETIFS || {
+		check_err 1 "Could not test offloaded functionality"
+		return
+	}
+
+	tcflags="skip_sw"
+	__tc_flower_test $num_rules $should_fail
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_police_occ.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_police_occ.sh
new file mode 100755
index 000000000..448b75c15
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_police_occ.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that policers shared by different tc filters are correctly reference
+# counted by observing policers' occupancy via devlink-resource.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ tc_police_occ_test
+"
+NUM_NETIFS=2
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Configure $h1 through simple_if_init (no addresses are passed).
+h1_create()
+{
+	local host_if=$h1
+
+	simple_if_init $host_if
+}
+
+# Counterpart of h1_create: hand $h1 to simple_if_fini.
+h1_destroy()
+{
+	local host_if=$h1
+
+	simple_if_fini $host_if
+}
+
+# Configure $swp1 through simple_if_init and attach a clsact qdisc to it.
+switch_create()
+{
+	local port=$swp1
+
+	simple_if_init $port
+	tc qdisc add dev $port clsact
+}
+
+# Counterpart of switch_create: drop the clsact qdisc from $swp1, then hand
+# the port to simple_if_fini.
+switch_destroy()
+{
+	local port=$swp1
+
+	tc qdisc del dev $port clsact
+	simple_if_fini $port
+}
+
+# Resolve the two test interfaces and build the topology.
+# Globals: NETIFS (read); h1, swp1 (written)
+setup_prepare()
+{
+	local setup_step
+
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	for setup_step in vrf_prepare h1_create switch_create; do
+		$setup_step
+	done
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order.
+cleanup()
+{
+	local teardown_step
+
+	for teardown_step in pre_cleanup switch_destroy h1_destroy vrf_cleanup; do
+		$teardown_step
+	done
+}
+
+# Print the occupancy of the "global_policers single_rate_policers" devlink
+# resource, as reported by devlink_resource_occ_get.
+tc_police_occ_get()
+{
+	local -a policer_res=(global_policers single_rate_policers)
+
+	devlink_resource_occ_get "${policer_res[@]}"
+}
+
+# Check the policer occupancy while filters with police actions come and go:
+#   - one filter with its own policer adds one to the occupancy, and removing
+#     the filter gives it back;
+#   - two filters sharing policer index 10 account for a single policer, which
+#     stays in use until the last of the two filters is removed.
+tc_police_occ_test()
+{
+	local -a police_act=(action police rate 100mbit burst 100k
+			     conform-exceed drop/ok)
+	local occ_base
+
+	RET=0
+
+	occ_base=$(tc_police_occ_get)
+
+	tc filter add dev $swp1 ingress pref 1 handle 101 proto ip \
+		flower skip_sw "${police_act[@]}"
+	(( occ_base + 1 == $(tc_police_occ_get) ))
+	check_err $? "Got occupancy $(tc_police_occ_get), expected $((occ_base + 1))"
+
+	tc filter del dev $swp1 ingress pref 1 handle 101 flower
+	(( occ_base == $(tc_police_occ_get) ))
+	check_err $? "Got occupancy $(tc_police_occ_get), expected $occ_base"
+
+	# Two filters referring to the same policer (index 10).
+	tc filter add dev $swp1 ingress pref 1 handle 101 proto ip \
+		flower skip_sw "${police_act[@]}" index 10
+	tc filter add dev $swp1 ingress pref 2 handle 102 proto ip \
+		flower skip_sw action police index 10
+
+	(( occ_base + 1 == $(tc_police_occ_get) ))
+	check_err $? "Got occupancy $(tc_police_occ_get), expected $((occ_base + 1))"
+
+	tc filter del dev $swp1 ingress pref 2 handle 102 flower
+	(( occ_base + 1 == $(tc_police_occ_get) ))
+	check_err $? "Got occupancy $(tc_police_occ_get), expected $((occ_base + 1))"
+
+	tc filter del dev $swp1 ingress pref 1 handle 101 flower
+	(( occ_base == $(tc_police_occ_get) ))
+	check_err $? "Got occupancy $(tc_police_occ_get), expected $occ_base"
+
+	log_test "tc police occupancy"
+}
+
+# Make sure cleanup() runs on every exit path from here on.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# NOTE(review): presumably runs the functions listed in ALL_TESTS - confirm
+# against lib.sh.
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh
new file mode 100644
index 000000000..86e787895
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_police_scale.sh
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TC_POLICE_NUM_NETIFS=2
+
+# Configure $h1 through simple_if_init (no addresses are passed).
+tc_police_h1_create()
+{
+	local host_if=$h1
+
+	simple_if_init $host_if
+}
+
+# Counterpart of tc_police_h1_create: hand $h1 to simple_if_fini.
+tc_police_h1_destroy()
+{
+	local host_if=$h1
+
+	simple_if_fini $host_if
+}
+
+# Configure $swp1 through simple_if_init and attach a clsact qdisc to it.
+tc_police_switch_create()
+{
+	local port=$swp1
+
+	simple_if_init $port
+	tc qdisc add dev $port clsact
+}
+
+# Counterpart of tc_police_switch_create: drop the clsact qdisc from $swp1,
+# then hand the port to simple_if_fini.
+tc_police_switch_destroy()
+{
+	local port=$swp1
+
+	tc qdisc del dev $port clsact
+	simple_if_fini $port
+}
+
+# Print the IPv6 address 2001:db8:1::<n>, with <n> being $1 in hexadecimal.
+tc_police_addr()
+{
+	local rule_idx=$1; shift
+
+	printf "2001:db8:1::%x" $rule_idx
+}
+
+# Generate a tc batch file with $1 skip_sw flower rules on $swp1 ingress, each
+# carrying its own police action and matching one destination address from
+# tc_police_addr, and load it with "tc -b".
+# Globals:
+#   swp1 (read)
+#   TC_POLICE_BATCH_FILE (written) - path of the generated batch file
+# Arguments:
+#   $1 - number of rules to create
+#   $2 - passed to check_err_fail as the should_fail flag
+tc_police_rules_create()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+	# Keep the loop counter out of the global namespace.
+	local i
+
+	TC_POLICE_BATCH_FILE="$(mktemp)"
+
+	# One "filter add" line per rule. The file is opened once for the whole
+	# loop, and printf is used instead of a tab-sensitive <<- here-document.
+	for ((i = 0; i < count; ++i)); do
+		printf 'filter add dev %s ingress prot ipv6 pref 1000 flower skip_sw dst_ip %s action police rate 10mbit burst 100k conform-exceed drop/ok\n' \
+			"$swp1" "$(tc_police_addr $i)"
+	done >> $TC_POLICE_BATCH_FILE
+
+	tc -b $TC_POLICE_BATCH_FILE
+	check_err_fail $should_fail $? "Rule insertion"
+}
+
+# Create $1 policing rules and compare the number of filters on $swp1 ingress
+# whose options report in_hw == true against $1.
+# Arguments:
+#   $1 - number of rules to create
+#   $2 - passed to check_err_fail as the should_fail flag
+__tc_police_test()
+{
+	local count=$1; shift
+	local should_fail=$1; shift
+	# Local, so the counter does not leak into the sourcing script.
+	local offload_count
+
+	tc_police_rules_create $count $should_fail
+
+	offload_count=$(tc -j filter show dev $swp1 ingress |
+			jq "[.[] | select(.options.in_hw == true)] | length")
+	((offload_count == count))
+	check_err_fail $should_fail $? "tc police offload count"
+}
+
+# Entry point of the policer scale test.
+# Arguments:
+#   $1 - number of policing rules to offload
+#   $2 - should_fail flag forwarded to __tc_police_test
+tc_police_test()
+{
+	local num_rules=$1; shift
+	local should_fail=$1; shift
+
+	tc_offload_check $TC_POLICE_NUM_NETIFS || {
+		check_err 1 "Could not test offloaded functionality"
+		return
+	}
+
+	__tc_police_test $num_rules $should_fail
+}
+
+# Resolve the two test interfaces and build the policer scale topology.
+# Globals: NETIFS (read); h1, swp1 (written)
+tc_police_setup_prepare()
+{
+	local setup_step
+
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	for setup_step in vrf_prepare tc_police_h1_create tc_police_switch_create; do
+		$setup_step
+	done
+}
+
+# Undo tc_police_setup_prepare in reverse order and remove the tc batch file
+# if tc_police_rules_create has created one.
+# Globals: TC_POLICE_BATCH_FILE (read)
+tc_police_cleanup()
+{
+	pre_cleanup
+
+	tc_police_switch_destroy
+	tc_police_h1_destroy
+
+	vrf_cleanup
+
+	# The batch file comes from mktemp; without this every run leaves a
+	# temporary file behind (tc_flower_cleanup does the same for its file).
+	if [[ -v TC_POLICE_BATCH_FILE ]]; then
+		rm -f $TC_POLICE_BATCH_FILE
+	fi
+}
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh
new file mode 100755
index 000000000..553cb9fad
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_restrictions.sh
@@ -0,0 +1,394 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ shared_block_drop_test
+ egress_redirect_test
+ multi_mirror_test
+ matchall_sample_egress_test
+ matchall_mirror_behind_flower_ingress_test
+ matchall_sample_behind_flower_ingress_test
+ matchall_mirror_behind_flower_egress_test
+ police_limits_test
+ multi_police_test
+"
+NUM_NETIFS=2
+
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+# Configure both switch ports through simple_if_init: $swp1 with 192.0.2.1/24
+# and $swp2 with 192.0.2.2/24.
+switch_create()
+{
+	local swp1_addr=192.0.2.1/24
+	local swp2_addr=192.0.2.2/24
+
+	simple_if_init $swp1 $swp1_addr
+	simple_if_init $swp2 $swp2_addr
+}
+
+# Counterpart of switch_create: hand both ports to simple_if_fini, in reverse
+# order.
+switch_destroy()
+{
+	local swp1_addr=192.0.2.1/24
+	local swp2_addr=192.0.2.2/24
+
+	simple_if_fini $swp2 $swp2_addr
+	simple_if_fini $swp1 $swp1_addr
+}
+
+# Exercise shared block 22 with a drop rule: the rule is accepted while the
+# block is bound to ingress only or to egress only, and rule and binding are
+# mutually exclusive once ingress and egress bindings would be mixed.
+shared_block_drop_test()
+{
+	# The filter as identified for deletion, and the full rule to add.
+	local -a blk_flt=(block 22 protocol ip pref 1 handle 101 flower)
+	local -a blk_drop=("${blk_flt[@]}" skip_sw dst_ip 192.0.2.2 action drop)
+
+	RET=0
+
+	# It is forbidden in mlxsw driver to have mixed-bound
+	# shared block with a drop rule.
+
+	tc qdisc add dev $swp1 ingress_block 22 clsact
+	check_err $? "Failed to create clsact with ingress block"
+
+	tc filter add "${blk_drop[@]}"
+	check_err $? "Failed to add drop rule to ingress bound block"
+
+	tc qdisc add dev $swp2 ingress_block 22 clsact
+	check_err $? "Failed to create another clsact with ingress shared block"
+
+	tc qdisc del dev $swp2 clsact
+
+	tc qdisc add dev $swp2 egress_block 22 clsact
+	check_fail $? "Incorrect success to create another clsact with egress shared block"
+
+	tc filter del "${blk_flt[@]}"
+
+	tc qdisc add dev $swp2 egress_block 22 clsact
+	check_err $? "Failed to create another clsact with egress shared block after blocker drop rule removed"
+
+	tc filter add "${blk_drop[@]}"
+	check_fail $? "Incorrect success to add drop rule to mixed bound block"
+
+	tc qdisc del dev $swp1 clsact
+
+	tc qdisc add dev $swp1 egress_block 22 clsact
+	check_err $? "Failed to create another clsact with egress shared block"
+
+	tc filter add "${blk_drop[@]}"
+	check_err $? "Failed to add drop rule to egress bound shared block"
+
+	tc filter del "${blk_flt[@]}"
+
+	tc qdisc del dev $swp2 clsact
+	tc qdisc del dev $swp1 clsact
+
+	log_test "shared block drop"
+}
+
+# Exercise shared block 22 with a mirred redirect rule: the rule is accepted
+# on an ingress-bound block only, and is rejected as soon as the block is (or
+# would become) bound to egress.
+egress_redirect_test()
+{
+	RET=0
+
+	# It is forbidden in mlxsw driver to have mirred redirect on
+	# egress-bound block.
+
+	tc qdisc add dev $swp1 ingress_block 22 clsact
+	check_err $? "Failed to create clsact with ingress block"
+
+	tc filter add block 22 protocol ip pref 1 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 \
+		action mirred egress redirect dev $swp2
+	check_err $? "Failed to add redirect rule to ingress bound block"
+
+	tc qdisc add dev $swp2 ingress_block 22 clsact
+	check_err $? "Failed to create another clsact with ingress shared block"
+
+	tc qdisc del dev $swp2 clsact
+
+	tc qdisc add dev $swp2 egress_block 22 clsact
+	check_fail $? "Incorrect success to create another clsact with egress shared block"
+
+	tc filter del block 22 protocol ip pref 1 handle 101 flower
+
+	tc qdisc add dev $swp2 egress_block 22 clsact
+	check_err $? "Failed to create another clsact with egress shared block after blocker redirect rule removed"
+
+	tc filter add block 22 protocol ip pref 1 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 \
+		action mirred egress redirect dev $swp2
+	check_fail $? "Incorrect success to add redirect rule to mixed bound block"
+
+	tc qdisc del dev $swp1 clsact
+
+	tc qdisc add dev $swp1 egress_block 22 clsact
+	check_err $? "Failed to create another clsact with egress shared block"
+
+	tc filter add block 22 protocol ip pref 1 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 \
+		action mirred egress redirect dev $swp2
+	check_fail $? "Incorrect success to add redirect rule to egress bound shared block"
+
+	tc qdisc del dev $swp2 clsact
+
+	tc filter add block 22 protocol ip pref 1 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 \
+		action mirred egress redirect dev $swp2
+	check_fail $? "Incorrect success to add redirect rule to egress bound block"
+
+	tc qdisc del dev $swp1 clsact
+
+	# Report under this test's own name; the result used to be logged as
+	# "shared block drop", the name of the test above.
+	log_test "egress redirect"
+}
+
+# A rule with a single mirror action must be accepted, a rule with two mirror
+# actions must be rejected.
+multi_mirror_test()
+{
+	local -a mirror_flt=(dev $swp1 ingress protocol ip pref 1 handle 101 flower)
+	local -a mirror_match=(skip_sw dst_ip 192.0.2.2)
+
+	RET=0
+
+	# It is forbidden in mlxsw driver to have multiple mirror
+	# actions in a single rule.
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add "${mirror_flt[@]}" "${mirror_match[@]}" \
+		action mirred egress mirror dev $swp2
+	check_err $? "Failed to add rule with single mirror action"
+
+	tc filter del "${mirror_flt[@]}"
+
+	tc filter add "${mirror_flt[@]}" "${mirror_match[@]}" \
+		action mirred egress mirror dev $swp2 \
+		action mirred egress mirror dev $swp1
+	check_fail $? "Incorrect success to add rule with two mirror actions"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "multi mirror"
+}
+
+# A matchall rule with a sample action must be accepted on ingress and
+# rejected on egress.
+matchall_sample_egress_test()
+{
+	local -a sample_flt=(protocol all pref 1 handle 101 matchall)
+	local -a sample_act=(skip_sw action sample rate 100 group 1)
+
+	RET=0
+
+	# It is forbidden in mlxsw driver to have matchall with sample action
+	# bound on egress
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add dev $swp1 ingress "${sample_flt[@]}" "${sample_act[@]}"
+	check_err $? "Failed to add rule with sample action on ingress"
+
+	tc filter del dev $swp1 ingress "${sample_flt[@]}"
+
+	tc filter add dev $swp1 egress "${sample_flt[@]}" "${sample_act[@]}"
+	check_fail $? "Incorrect success to add rule with sample action on egress"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "matchall sample egress"
+}
+
+# Check the ordering restriction between matchall and flower rules on $swp1
+# ingress, for the matchall action given by the caller.
+# Arguments:
+#   $1 - action name used in the log message
+#   $2 - action arguments appended after "action" in the matchall rule
+matchall_behind_flower_ingress_test()
+{
+	local action=$1
+	local action_args=$2
+	local -a in_flower=(dev $swp1 ingress protocol ip)
+	local -a in_mall=(dev $swp1 ingress protocol all)
+	local -a fl_drop=(skip_sw dst_ip 192.0.2.2 action drop)
+	local -a mall_act=(matchall skip_sw action $action_args)
+
+	RET=0
+
+	# On ingress, all matchall-mirror and matchall-sample
+	# rules have to be in front of the flower rules
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add "${in_flower[@]}" pref 10 handle 101 flower "${fl_drop[@]}"
+
+	tc filter add "${in_mall[@]}" pref 9 handle 102 "${mall_act[@]}"
+	check_err $? "Failed to add matchall rule in front of a flower rule"
+
+	tc filter del "${in_mall[@]}" pref 9 handle 102 matchall
+
+	tc filter add "${in_mall[@]}" pref 11 handle 102 "${mall_act[@]}"
+	check_fail $? "Incorrect success to add matchall rule behind a flower rule"
+
+	tc filter del "${in_flower[@]}" pref 10 handle 101 flower
+
+	tc filter add "${in_mall[@]}" pref 9 handle 102 "${mall_act[@]}"
+
+	tc filter add "${in_flower[@]}" pref 10 handle 101 flower "${fl_drop[@]}"
+	check_err $? "Failed to add flower rule behind a matchall rule"
+
+	tc filter del "${in_flower[@]}" pref 10 handle 101 flower
+
+	tc filter add "${in_flower[@]}" pref 8 handle 101 flower "${fl_drop[@]}"
+	check_fail $? "Incorrect success to add flower rule in front of a matchall rule"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "matchall $action flower ingress"
+}
+
+# Ingress ordering test with a matchall mirror action towards $swp2.
+matchall_mirror_behind_flower_ingress_test()
+{
+	local mirror_args="mirred egress mirror dev $swp2"
+
+	matchall_behind_flower_ingress_test "mirror" "$mirror_args"
+}
+
+# Ingress ordering test with a matchall sample action.
+matchall_sample_behind_flower_ingress_test()
+{
+	local sample_args="sample rate 100 group 1"
+
+	matchall_behind_flower_ingress_test "sample" "$sample_args"
+}
+
+# Check the ordering restriction between matchall and flower rules on $swp1
+# egress, for the matchall action given by the caller. The flower rule uses
+# pref 10; a matchall rule with pref 11 is behind it, one with pref 9 is in
+# front of it.
+# Arguments:
+#   $1 - action name used in the log message
+#   $2 - action arguments appended after "action" in the matchall rule
+matchall_behind_flower_egress_test()
+{
+	local action=$1
+	local action_args=$2
+
+	RET=0
+
+	# On egress, all matchall-mirror rules have to be behind the flower rules
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 action drop
+
+	# The diagnostics below describe the actual rule order; they used to be
+	# verbatim copies of the ingress test, where the order is reversed.
+	tc filter add dev $swp1 egress protocol all pref 11 handle 102 \
+		matchall skip_sw action $action_args
+	check_err $? "Failed to add matchall rule behind a flower rule"
+
+	tc filter del dev $swp1 egress protocol all pref 11 handle 102 matchall
+
+	tc filter add dev $swp1 egress protocol all pref 9 handle 102 \
+		matchall skip_sw action $action_args
+	check_fail $? "Incorrect success to add matchall rule in front of a flower rule"
+
+	tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower
+
+	tc filter add dev $swp1 egress protocol all pref 11 handle 102 \
+		matchall skip_sw action $action_args
+
+	tc filter add dev $swp1 egress protocol ip pref 10 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 action drop
+	check_err $? "Failed to add flower rule in front of a matchall rule"
+
+	tc filter del dev $swp1 egress protocol ip pref 10 handle 101 flower
+
+	tc filter add dev $swp1 egress protocol ip pref 12 handle 101 flower \
+		skip_sw dst_ip 192.0.2.2 action drop
+	check_fail $? "Incorrect success to add flower rule behind a matchall rule"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "matchall $action flower egress"
+}
+
+# Egress ordering test with a matchall mirror action towards $swp2.
+matchall_mirror_behind_flower_egress_test()
+{
+	local mirror_args="mirred egress mirror dev $swp2"
+
+	matchall_behind_flower_egress_test "mirror" "$mirror_args"
+}
+
+# Probe the accepted range of police rate and burst on $swp1 ingress: rates of
+# 0.5kbit and 2.5tbit and a burst of 512b must be rejected, rates of 1.5kbit
+# and 1.9tbit and a burst of 2k must be accepted.
+police_limits_test()
+{
+	# Everything up to the police parameters, and the filter as identified
+	# for deletion.
+	local -a pol_add=(dev $swp1 ingress pref 1 proto ip handle 101
+			  flower skip_sw action police)
+	local -a pol_del=(dev $swp1 ingress protocol ip pref 1 handle 101 flower)
+
+	RET=0
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add "${pol_add[@]}" \
+		rate 0.5kbit burst 1m conform-exceed drop/ok
+	check_fail $? "Incorrect success to add police action with too low rate"
+
+	tc filter add "${pol_add[@]}" \
+		rate 2.5tbit burst 1g conform-exceed drop/ok
+	check_fail $? "Incorrect success to add police action with too high rate"
+
+	tc filter add "${pol_add[@]}" \
+		rate 1.5kbit burst 1m conform-exceed drop/ok
+	check_err $? "Failed to add police action with low rate"
+
+	tc filter del "${pol_del[@]}"
+
+	tc filter add "${pol_add[@]}" \
+		rate 1.9tbit burst 1g conform-exceed drop/ok
+	check_err $? "Failed to add police action with high rate"
+
+	tc filter del "${pol_del[@]}"
+
+	tc filter add "${pol_add[@]}" \
+		rate 1.5kbit burst 512b conform-exceed drop/ok
+	check_fail $? "Incorrect success to add police action with too low burst size"
+
+	tc filter add "${pol_add[@]}" \
+		rate 1.5kbit burst 2k conform-exceed drop/ok
+	check_err $? "Failed to add police action with low burst size"
+
+	tc filter del "${pol_del[@]}"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "police rate and burst limits"
+}
+
+# A rule with a single police action must be accepted, a rule with two police
+# actions must be rejected.
+multi_police_test()
+{
+	local -a pol_flt=(dev $swp1 ingress protocol ip pref 1 handle 101)
+
+	RET=0
+
+	# It is forbidden in mlxsw driver to have multiple police
+	# actions in a single rule.
+
+	tc qdisc add dev $swp1 clsact
+
+	tc filter add "${pol_flt[@]}" flower skip_sw \
+		action police rate 100mbit burst 100k conform-exceed drop/ok
+	check_err $? "Failed to add rule with single police action"
+
+	tc filter del "${pol_flt[@]}" flower
+
+	tc filter add "${pol_flt[@]}" flower skip_sw \
+		action police rate 100mbit burst 100k conform-exceed drop/pipe \
+		action police rate 200mbit burst 200k conform-exceed drop/ok
+	check_fail $? "Incorrect success to add rule with two police actions"
+
+	tc qdisc del dev $swp1 clsact
+
+	log_test "multi police"
+}
+
+# Resolve the two switch ports and build the topology.
+# Globals: NETIFS (read); swp1, swp2 (written)
+setup_prepare()
+{
+	local setup_step
+
+	swp1=${NETIFS[p1]}
+	swp2=${NETIFS[p2]}
+
+	for setup_step in vrf_prepare switch_create; do
+		$setup_step
+	done
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order.
+cleanup()
+{
+	local teardown_step
+
+	for teardown_step in pre_cleanup switch_destroy vrf_cleanup; do
+		$teardown_step
+	done
+}
+
+# Prerequisite check provided by the sourced libraries.
+# NOTE(review): presumably bails out when tc lacks shared block support -
+# confirm.
+check_tc_shblock_support
+
+# Make sure cleanup() runs on every exit path from here on.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# NOTE(review): presumably runs the functions listed in ALL_TESTS - confirm
+# against lib.sh.
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
new file mode 100755
index 000000000..729a86cc4
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan.sh
@@ -0,0 +1,1156 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test various aspects of VxLAN offloading which are specific to mlxsw, such
+# as sanitization of invalid configurations and offload indication.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="sanitization_test offload_indication_test \
+ sanitization_vlan_aware_test offload_indication_vlan_aware_test"
+NUM_NETIFS=2
+: ${TIMEOUT:=20000} # ms
+source $lib_dir/lib.sh
+
+# Resolve the two switch ports and set both links up.
+# Globals: NETIFS (read); swp1, swp2 (written)
+setup_prepare()
+{
+	local port
+
+	swp1=${NETIFS[p1]}
+	swp2=${NETIFS[p2]}
+
+	for port in "$swp1" "$swp2"; do
+		ip link set dev $port up
+	done
+}
+
+# EXIT trap handler: run pre_cleanup, then set both switch ports down in
+# reverse order.
+cleanup()
+{
+	local port
+
+	pre_cleanup
+
+	for port in "$swp2" "$swp1"; do
+		ip link set dev $port down
+	done
+}
+
+# Expect a supported configuration: enslaving $swp1 and vxlan0 to br0 must
+# succeed, and so must re-enslaving $swp1 after it was released while vxlan0
+# stayed in the bridge.
+sanitization_single_dev_test_pass()
+{
+	local member
+
+	for member in "$swp1" vxlan0; do
+		ip link set dev $member master br0
+		check_err $?
+	done
+
+	ip link set dev $swp1 nomaster
+
+	ip link set dev $swp1 master br0
+	check_err $?
+}
+
+# Expect an unsupported configuration: whichever of $swp1 and vxlan0 is
+# enslaved to br0 second must be rejected, in both orders.
+sanitization_single_dev_test_fail()
+{
+	local bridge=br0
+	local vxlan=vxlan0
+
+	# Port first, VxLAN device second.
+	ip link set dev $swp1 master $bridge
+	check_err $?
+	ip link set dev $vxlan master $bridge &> /dev/null
+	check_fail $?
+
+	ip link set dev $swp1 nomaster
+
+	# VxLAN device first, port second.
+	ip link set dev $vxlan master $bridge
+	check_err $?
+	ip link set dev $swp1 master $bridge &> /dev/null
+	check_fail $?
+}
+
+# Baseline: a VxLAN device with the reference options on a bridge with
+# multicast snooping disabled; enslavement is expected to succeed.
+sanitization_single_dev_valid_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_pass
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device - valid configuration"
+}
+
+# Reference VxLAN device on a bridge created with vlan_filtering 1;
+# enslavement is expected to succeed.
+sanitization_single_dev_vlan_aware_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0 vlan_filtering 1
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_pass
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with a vlan-aware bridge"
+}
+
+# Reference VxLAN device on a bridge created without "mcast_snooping 0";
+# enslavement is expected to be rejected.
+sanitization_single_dev_mcast_enabled_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with a multicast enabled bridge"
+}
+
+# VxLAN device bound to dummy1 with multicast group 239.0.0.1; enslavement is
+# expected to be rejected.
+sanitization_single_dev_mcast_group_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789
+			  dev dummy1 group 239.0.0.1)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name dummy1 up type dummy
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev dummy1
+	ip link del dev br0
+
+	log_test "vxlan device with a multicast group"
+}
+
+# VxLAN device created without a "local" address; enslavement is expected to
+# be rejected.
+sanitization_single_dev_no_local_ip_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with no local ip"
+}
+
+# VxLAN device with an IPv6 local address (2001:db8::1); enslavement is
+# expected to be rejected.
+sanitization_single_dev_local_ipv6_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 2001:db8::1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with local ipv6 address"
+}
+
+# VxLAN device created with "learning" instead of "nolearning"; enslavement
+# is expected to succeed.
+sanitization_single_dev_learning_enabled_test()
+{
+	local -a vx_opts=(id 10 learning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_pass
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with learning enabled"
+}
+
+# VxLAN device bound to a local interface (dev dummy1); enslavement is
+# expected to be rejected.
+sanitization_single_dev_local_interface_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789 dev dummy1)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name dummy1 up type dummy
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev dummy1
+	ip link del dev br0
+
+	log_test "vxlan device with local interface"
+}
+
+# VxLAN device with an explicit UDP source port range (srcport 4000 5000);
+# enslavement is expected to be rejected.
+sanitization_single_dev_port_range_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789
+			  srcport 4000 5000)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with udp source port range"
+}
+
+# VxLAN device with a static TOS (tos 20) instead of "tos inherit";
+# enslavement is expected to be rejected.
+sanitization_single_dev_tos_static_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl 20 tos 20
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with static tos"
+}
+
+# VxLAN device with "ttl inherit" instead of a fixed TTL; enslavement is
+# expected to be rejected.
+sanitization_single_dev_ttl_inherit_test()
+{
+	local -a vx_opts=(id 10 nolearning noudpcsum ttl inherit tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with inherit ttl"
+}
+
+# VxLAN device created with "udpcsum" instead of "noudpcsum"; enslavement is
+# expected to be rejected.
+sanitization_single_dev_udp_checksum_test()
+{
+	local -a vx_opts=(id 10 nolearning udpcsum ttl 20 tos inherit
+			  local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add name vxlan0 up type vxlan "${vx_opts[@]}"
+
+	sanitization_single_dev_test_fail
+
+	ip link del dev vxlan0
+	ip link del dev br0
+
+	log_test "vxlan device with udp checksum"
+}
+
+# Run every single-device sanitization test, in a fixed order.
+sanitization_single_dev_test()
+{
+	local variant
+
+	# These tests make sure that we correctly sanitize VxLAN device
+	# configurations we do not support
+	for variant in valid vlan_aware mcast_enabled mcast_group \
+		       no_local_ip local_ipv6 learning_enabled \
+		       local_interface port_range tos_static ttl_inherit \
+		       udp_checksum; do
+		sanitization_single_dev_${variant}_test
+	done
+}
+
+# Expect a supported configuration with two bridges: $swp1 and vxlan0 go into
+# br0, $swp2 and vxlan1 into br1, and every enslavement must succeed, also
+# when the ports are released and enslaved again.
+sanitization_multi_devs_test_pass()
+{
+	local first_br=br0
+	local second_br=br1
+
+	ip link set dev $swp1 master $first_br
+	check_err $?
+	ip link set dev vxlan0 master $first_br
+	check_err $?
+	ip link set dev $swp2 master $second_br
+	check_err $?
+	ip link set dev vxlan1 master $second_br
+	check_err $?
+
+	ip link set dev $swp2 nomaster
+	ip link set dev $swp1 nomaster
+
+	ip link set dev $swp1 master $first_br
+	check_err $?
+	ip link set dev $swp2 master $second_br
+	check_err $?
+}
+
+# Expect an unsupported configuration with two bridges: the br0 side ($swp1
+# and vxlan0) is accepted, while on the br1 side whichever of $swp2 and vxlan1
+# is enslaved second must be rejected.
+sanitization_multi_devs_test_fail()
+{
+	local first_br=br0
+	local second_br=br1
+
+	ip link set dev $swp1 master $first_br
+	check_err $?
+	ip link set dev vxlan0 master $first_br
+	check_err $?
+	ip link set dev $swp2 master $second_br
+	check_err $?
+	ip link set dev vxlan1 master $second_br &> /dev/null
+	check_fail $?
+
+	ip link set dev $swp2 nomaster
+	ip link set dev $swp1 nomaster
+
+	# Same check with the VxLAN device enslaved before the port.
+	ip link set dev vxlan1 master $second_br
+	check_err $?
+	ip link set dev $swp1 master $first_br
+	check_err $?
+	ip link set dev $swp2 master $second_br &> /dev/null
+	check_fail $?
+}
+
+# Two VxLAN devices that differ only in their VNI (10 and 20); the
+# configuration is expected to be accepted.
+sanitization_multi_devs_valid_test()
+{
+	local -a vx_shared=(nolearning noudpcsum ttl 20 tos inherit
+			    local 198.51.100.1 dstport 4789)
+
+	RET=0
+
+	ip link add dev br0 type bridge mcast_snooping 0
+	ip link add dev br1 type bridge mcast_snooping 0
+
+	ip link add name vxlan0 up type vxlan id 10 "${vx_shared[@]}"
+	ip link add name vxlan1 up type vxlan id 20 "${vx_shared[@]}"
+
+	sanitization_multi_devs_test_pass
+
+	ip link del dev vxlan1
+	ip link del dev vxlan0
+	ip link del dev br1
+	ip link del dev br0
+
+	log_test "multiple vxlan devices - valid configuration"
+}
+
+sanitization_multi_devs_ttl_test()
+{
+ RET=0
+
+ ip link add dev br0 type bridge mcast_snooping 0
+ ip link add dev br1 type bridge mcast_snooping 0
+
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
+ ttl 40 tos inherit local 198.51.100.1 dstport 4789
+
+ sanitization_multi_devs_test_fail
+
+ ip link del dev vxlan1
+ ip link del dev vxlan0
+ ip link del dev br1
+ ip link del dev br0
+
+ log_test "multiple vxlan devices with different ttl"
+}
+
+sanitization_multi_devs_udp_dstport_test()
+{
+ RET=0
+
+ ip link add dev br0 type bridge mcast_snooping 0
+ ip link add dev br1 type bridge mcast_snooping 0
+
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 5789
+
+ sanitization_multi_devs_test_fail
+
+ ip link del dev vxlan1
+ ip link del dev vxlan0
+ ip link del dev br1
+ ip link del dev br0
+
+ log_test "multiple vxlan devices with different udp destination port"
+}
+
+sanitization_multi_devs_local_ip_test()
+{
+ RET=0
+
+ ip link add dev br0 type bridge mcast_snooping 0
+ ip link add dev br1 type bridge mcast_snooping 0
+
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan1 up type vxlan id 20 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.2 dstport 4789
+
+ sanitization_multi_devs_test_fail
+
+ ip link del dev vxlan1
+ ip link del dev vxlan0
+ ip link del dev br1
+ ip link del dev br0
+
+ log_test "multiple vxlan devices with different local ip"
+}
+
+sanitization_multi_devs_test()
+{
+ # The device has a single VTEP, which means all the VxLAN devices
+ # we offload must share certain properties such as source IP and
+ # UDP destination port. These tests make sure that we forbid
+ # configurations that violate this limitation
+ sanitization_multi_devs_valid_test
+ sanitization_multi_devs_ttl_test
+ sanitization_multi_devs_udp_dstport_test
+ sanitization_multi_devs_local_ip_test
+}
+
+sanitization_test()
+{
+ sanitization_single_dev_test
+ sanitization_multi_devs_test
+}
+
+offload_indication_setup_create()
+{
+ # Create a simple setup with two bridges, each with a VxLAN device
+ # and one local port
+ ip link add name br0 up type bridge mcast_snooping 0
+ ip link add name br1 up type bridge mcast_snooping 0
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br1
+
+ ip address add 198.51.100.1/32 dev lo
+
+ ip link add name vxlan0 up master br0 type vxlan id 10 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan1 up master br1 type vxlan id 20 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+}
+
+offload_indication_setup_destroy()
+{
+ ip link del dev vxlan1
+ ip link del dev vxlan0
+
+ ip address del 198.51.100.1/32 dev lo
+
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 nomaster
+
+ ip link del dev br1
+ ip link del dev br0
+}
+
+# Append an all-zeros MAC FDB entry pointing at 198.51.100.2 to vxlan0 and
+# check, within $TIMEOUT, that wait_for_offload succeeds for it. The entry
+# is removed again before the result is logged.
+# NOTE(review): busywait, wait_for_offload and grep_bridge_fdb are defined
+# outside this file section; presumably they poll the "bridge fdb show"
+# output for the offload flag - confirm against lib.sh.
+offload_indication_fdb_flood_test()
+{
+ RET=0
+
+ bridge fdb append 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.2
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb 00:00:00:00:00:00 \
+ bridge fdb show brport vxlan0
+ check_err $?
+
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self
+
+ log_test "vxlan flood entry offload indication"
+}
+
+offload_indication_fdb_bridge_test()
+{
+ RET=0
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self master static \
+ dst 198.51.100.2
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0
+ check_err $?
+
+ log_test "vxlan entry offload indication - initial state"
+
+ # Remove FDB entry from the bridge driver and check that corresponding
+ # entry in the VxLAN driver is not marked as offloaded
+ RET=0
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan0 master
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
+ check_err $?
+
+ log_test "vxlan entry offload indication - after removal from bridge"
+
+ # Add the FDB entry back to the bridge driver and make sure it is
+ # marked as offloaded in both drivers
+ RET=0
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan0 master static
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0
+ check_err $?
+
+ log_test "vxlan entry offload indication - after re-add to bridge"
+
+ # Remove FDB entry from the VxLAN driver and check that corresponding
+ # entry in the bridge driver is not marked as offloaded
+ RET=0
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan0 self
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0
+ check_err $?
+
+ log_test "vxlan entry offload indication - after removal from vxlan"
+
+ # Add the FDB entry back to the VxLAN driver and make sure it is
+ # marked as offloaded in both drivers
+ RET=0
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan0 self dst 198.51.100.2
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan0
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan0
+ check_err $?
+
+ log_test "vxlan entry offload indication - after re-add to vxlan"
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan0 self master
+}
+
+offload_indication_fdb_test()
+{
+ offload_indication_fdb_flood_test
+ offload_indication_fdb_bridge_test
+}
+
+offload_indication_decap_route_test()
+{
+ RET=0
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link set dev vxlan0 down
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link set dev vxlan1 down
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - vxlan device down"
+
+ RET=0
+
+ ip link set dev vxlan1 up
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link set dev vxlan0 up
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - vxlan device up"
+
+ RET=0
+
+ ip address delete 198.51.100.1/32 dev lo
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip address add 198.51.100.1/32 dev lo
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - add local route"
+
+ RET=0
+
+ ip link set dev $swp1 nomaster
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link set dev $swp2 nomaster
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br1
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - local ports enslavement"
+
+ RET=0
+
+ ip link del dev br0
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link del dev br1
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - bridge device deletion"
+
+ RET=0
+
+ ip link add name br0 up type bridge mcast_snooping 0
+ ip link add name br1 up type bridge mcast_snooping 0
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br1
+ ip link set dev vxlan0 master br0
+ ip link set dev vxlan1 master br1
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link del dev vxlan0
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ ip link del dev vxlan1
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - vxlan device deletion"
+
+ ip link add name vxlan0 up master br0 type vxlan id 10 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan1 up master br1 type vxlan id 20 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+}
+
+# Record an error (check_err) unless, within $TIMEOUT, wait_for_offload
+# succeeds for all of the following entries of vxlan0:
+#  - 00:11:22:33:44:55 matched with "self",
+#  - 00:11:22:33:44:55 matched with "master",
+#  - 00:00:00:00:00:00 matched with "self".
+# Does not touch RET or log anything itself; the caller does that.
+check_fdb_offloaded()
+{
+ local mac=00:11:22:33:44:55
+ local zmac=00:00:00:00:00:00
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $mac master \
+ bridge fdb show dev vxlan0
+ check_err $?
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+}
+
+# For both 00:11:22:33:44:55 and 00:00:00:00:00:00 on vxlan0, check that a
+# "self" entry is listed by "bridge fdb show" and that the negated
+# wait_for_offload check succeeds for it within $TIMEOUT. Failures are
+# recorded with check_err; the caller resets RET and logs.
+check_vxlan_fdb_not_offloaded()
+{
+ local mac=00:11:22:33:44:55
+ local zmac=00:00:00:00:00:00
+
+ # The entry must exist, otherwise "not offloaded" would pass trivially
+ bridge fdb show dev vxlan0 | grep $mac | grep -q self
+ check_err $?
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+
+ bridge fdb show dev vxlan0 | grep $zmac | grep -q self
+ check_err $?
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+}
+
+# Check that a "master" entry for 00:11:22:33:44:55 is listed for vxlan0 by
+# "bridge fdb show" and that the negated wait_for_offload check succeeds for
+# it within $TIMEOUT. Failures are recorded with check_err; the caller
+# resets RET and logs the result.
+check_bridge_fdb_not_offloaded()
+{
+	local mac=00:11:22:33:44:55
+
+	# The entry must exist, otherwise "not offloaded" would pass trivially
+	bridge fdb show dev vxlan0 | grep $mac | grep -q master
+	check_err $?
+	busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $mac master \
+		bridge fdb show dev vxlan0
+	check_err $?
+}
+
+# Offload indication checks for the case where vxlan0 joins br0 before the
+# local port $swp1 does. The caller creates br0 and vxlan0 and deletes them
+# afterwards.
+#
+# $1 - optional VLAN ID. When non-empty, the VLAN sub-tests (delete VLAN,
+#      add tagged VLAN, add pvid/untagged VLAN) are run on vxlan0 as well.
+#
+# Each stage resets RET and reports through its own log_test.
+__offload_indication_join_vxlan_first()
+{
+ local vid=$1; shift
+
+ local mac=00:11:22:33:44:55
+ local zmac=00:00:00:00:00:00
+
+ bridge fdb append $zmac dev vxlan0 self dst 198.51.100.2
+
+ ip link set dev vxlan0 master br0
+ bridge fdb add dev vxlan0 $mac self master static dst 198.51.100.2
+
+ # No local port in the bridge yet, so nothing is expected to be
+ # offloaded until $swp1 is enslaved
+ RET=0
+ check_vxlan_fdb_not_offloaded
+ ip link set dev $swp1 master br0
+ sleep .1
+ check_fdb_offloaded
+ log_test "offload indication - attach vxlan first"
+
+ RET=0
+ ip link set dev vxlan0 down
+ check_vxlan_fdb_not_offloaded
+ check_bridge_fdb_not_offloaded
+ log_test "offload indication - set vxlan down"
+
+ RET=0
+ ip link set dev vxlan0 up
+ sleep .1
+ check_fdb_offloaded
+ log_test "offload indication - set vxlan up"
+
+ # VLAN sub-tests only run when the caller passed a VLAN ID
+ if [[ ! -z $vid ]]; then
+ RET=0
+ bridge vlan del dev vxlan0 vid $vid
+ check_vxlan_fdb_not_offloaded
+ check_bridge_fdb_not_offloaded
+ log_test "offload indication - delete VLAN"
+
+ RET=0
+ bridge vlan add dev vxlan0 vid $vid
+ check_vxlan_fdb_not_offloaded
+ check_bridge_fdb_not_offloaded
+ log_test "offload indication - add tagged VLAN"
+
+ RET=0
+ bridge vlan add dev vxlan0 vid $vid pvid untagged
+ sleep .1
+ check_fdb_offloaded
+ log_test "offload indication - add pvid/untagged VLAN"
+ fi
+
+ RET=0
+ ip link set dev $swp1 nomaster
+ check_vxlan_fdb_not_offloaded
+ log_test "offload indication - detach port"
+}
+
+offload_indication_join_vxlan_first()
+{
+ ip link add dev br0 up type bridge mcast_snooping 0
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ __offload_indication_join_vxlan_first
+
+ ip link del dev vxlan0
+ ip link del dev br0
+}
+
+# Offload indication check for the case where the local port $swp1 joins
+# br0 before vxlan0 does. The all-zeros FDB entry on vxlan0 is expected to
+# be not offloaded while only the port is enslaved, and offloaded once
+# vxlan0 is enslaved too. The caller creates br0 and vxlan0 and deletes
+# them afterwards.
+__offload_indication_join_vxlan_last()
+{
+ local zmac=00:00:00:00:00:00
+
+ RET=0
+
+ bridge fdb append $zmac dev vxlan0 self dst 198.51.100.2
+
+ ip link set dev $swp1 master br0
+
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+
+ ip link set dev vxlan0 master br0
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show dev vxlan0
+ check_err $?
+
+ log_test "offload indication - attach vxlan last"
+}
+
+offload_indication_join_vxlan_last()
+{
+ ip link add dev br0 up type bridge mcast_snooping 0
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ __offload_indication_join_vxlan_last
+
+ ip link del dev vxlan0
+ ip link del dev br0
+}
+
+offload_indication_test()
+{
+ offload_indication_setup_create
+ offload_indication_fdb_test
+ offload_indication_decap_route_test
+ offload_indication_setup_destroy
+
+ log_info "offload indication - replay & cleanup"
+ offload_indication_join_vxlan_first
+ offload_indication_join_vxlan_last
+}
+
+sanitization_vlan_aware_test()
+{
+ RET=0
+
+ ip link add dev br0 type bridge mcast_snooping 0 vlan_filtering 1
+
+ ip link add name vxlan10 up master br0 type vxlan id 10 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ ip link add name vxlan20 up master br0 type vxlan id 20 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ # Test that when each VNI is mapped to a different VLAN we can enslave
+ # a port to the bridge
+ bridge vlan add vid 10 dev vxlan10 pvid untagged
+ bridge vlan add vid 20 dev vxlan20 pvid untagged
+
+ ip link set dev $swp1 master br0
+ check_err $?
+
+ log_test "vlan-aware - enslavement to vlan-aware bridge"
+
+ # Try to map both VNIs to the same VLAN and make sure configuration
+ # fails
+ RET=0
+
+ bridge vlan add vid 10 dev vxlan20 pvid untagged &> /dev/null
+ check_fail $?
+
+ log_test "vlan-aware - two vnis mapped to the same vlan"
+
+ # Test that enslavement of a port to a bridge fails when two VNIs
+ # are mapped to the same VLAN
+ RET=0
+
+ ip link set dev $swp1 nomaster
+
+ bridge vlan del vid 20 dev vxlan20 pvid untagged
+ bridge vlan add vid 10 dev vxlan20 pvid untagged
+
+ ip link set dev $swp1 master br0 &> /dev/null
+ check_fail $?
+
+ log_test "vlan-aware - failed enslavement to vlan-aware bridge"
+
+ bridge vlan del vid 10 dev vxlan20
+ bridge vlan add vid 20 dev vxlan20 pvid untagged
+
+ # Test that when two VXLAN tunnels with conflicting configurations
+ # (i.e., different TTL) are enslaved to the same VLAN-aware bridge,
+ # then the enslavement of a port to the bridge is denied.
+
+ # Use the offload indication of the local route to ensure the VXLAN
+ # configuration was correctly rollbacked.
+ ip address add 198.51.100.1/32 dev lo
+
+ ip link set dev vxlan10 type vxlan ttl 10
+ ip link set dev $swp1 master br0 &> /dev/null
+ check_fail $?
+
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vlan-aware - failed enslavement to bridge due to conflict"
+
+ ip link set dev vxlan10 type vxlan ttl 20
+ ip address del 198.51.100.1/32 dev lo
+
+ ip link del dev vxlan20
+ ip link del dev vxlan10
+ ip link del dev br0
+}
+
+offload_indication_vlan_aware_setup_create()
+{
+ # Create a simple setup with two VxLAN devices and a single VLAN-aware
+ # bridge
+ ip link add name br0 up type bridge mcast_snooping 0 vlan_filtering 1 \
+ vlan_default_pvid 0
+
+ ip link set dev $swp1 master br0
+
+ bridge vlan add vid 10 dev $swp1
+ bridge vlan add vid 20 dev $swp1
+
+ ip address add 198.51.100.1/32 dev lo
+
+ ip link add name vxlan10 up master br0 type vxlan id 10 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link add name vxlan20 up master br0 type vxlan id 20 nolearning \
+ noudpcsum ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ bridge vlan add vid 10 dev vxlan10 pvid untagged
+ bridge vlan add vid 20 dev vxlan20 pvid untagged
+}
+
+offload_indication_vlan_aware_setup_destroy()
+{
+ bridge vlan del vid 20 dev vxlan20
+ bridge vlan del vid 10 dev vxlan10
+
+ ip link del dev vxlan20
+ ip link del dev vxlan10
+
+ ip address del 198.51.100.1/32 dev lo
+
+ bridge vlan del vid 20 dev $swp1
+ bridge vlan del vid 10 dev $swp1
+
+ ip link set dev $swp1 nomaster
+
+ ip link del dev br0
+}
+
+offload_indication_vlan_aware_fdb_test()
+{
+ RET=0
+
+ log_info "vxlan entry offload indication - vlan-aware"
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self master static \
+ dst 198.51.100.2 vlan 10
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10
+ check_err $?
+
+ log_test "vxlan entry offload indication - initial state"
+
+ # Remove FDB entry from the bridge driver and check that corresponding
+ # entry in the VxLAN driver is not marked as offloaded
+ RET=0
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan10 master vlan 10
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
+ check_err $?
+
+ log_test "vxlan entry offload indication - after removal from bridge"
+
+ # Add the FDB entry back to the bridge driver and make sure it is
+ # marked as offloaded in both drivers
+ RET=0
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan10 master static vlan 10
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10
+ check_err $?
+
+ log_test "vxlan entry offload indication - after re-add to bridge"
+
+ # Remove FDB entry from the VxLAN driver and check that corresponding
+ # entry in the bridge driver is not marked as offloaded
+ RET=0
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan10 self
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10
+ check_err $?
+
+ log_test "vxlan entry offload indication - after removal from vxlan"
+
+ # Add the FDB entry back to the VxLAN driver and make sure it is
+ # marked as offloaded in both drivers
+ RET=0
+
+ bridge fdb add de:ad:be:ef:13:37 dev vxlan10 self dst 198.51.100.2
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self bridge fdb show brport vxlan10
+ check_err $?
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb \
+ de:ad:be:ef:13:37 self -v bridge fdb show brport vxlan10
+ check_err $?
+
+ log_test "vxlan entry offload indication - after re-add to vxlan"
+
+ bridge fdb del de:ad:be:ef:13:37 dev vxlan10 self master vlan 10
+}
+
+offload_indication_vlan_aware_decap_route_test()
+{
+ RET=0
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ # Toggle PVID flag on one VxLAN device and make sure route is still
+ # marked as offloaded
+ bridge vlan add vid 10 dev vxlan10 untagged
+
+ busywait "$TIMEOUT" wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ # Toggle PVID flag on second VxLAN device and make sure route is no
+ # longer marked as offloaded
+ bridge vlan add vid 20 dev vxlan20 untagged
+
+ busywait "$TIMEOUT" not wait_for_offload \
+ ip route show table local 198.51.100.1
+ check_err $?
+
+ # Toggle PVID flag back and make sure route is marked as offloaded
+ bridge vlan add vid 10 dev vxlan10 pvid untagged
+ bridge vlan add vid 20 dev vxlan20 pvid untagged
+
+ busywait "$TIMEOUT" wait_for_offload ip route show table local 198.51.100.1
+ check_err $?
+
+ log_test "vxlan decap route - vni map/unmap"
+}
+
+offload_indication_vlan_aware_join_vxlan_first()
+{
+ ip link add dev br0 up type bridge mcast_snooping 0 \
+ vlan_filtering 1 vlan_default_pvid 1
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ __offload_indication_join_vxlan_first 1
+
+ ip link del dev vxlan0
+ ip link del dev br0
+}
+
+offload_indication_vlan_aware_join_vxlan_last()
+{
+ ip link add dev br0 up type bridge mcast_snooping 0 \
+ vlan_filtering 1 vlan_default_pvid 1
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ __offload_indication_join_vxlan_last
+
+ ip link del dev vxlan0
+ ip link del dev br0
+}
+
+offload_indication_vlan_aware_l3vni_test()
+{
+ local zmac=00:00:00:00:00:00
+
+ RET=0
+
+ sysctl_set net.ipv6.conf.default.disable_ipv6 1
+ ip link add dev br0 up type bridge mcast_snooping 0 \
+ vlan_filtering 1 vlan_default_pvid 0
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ ip link set dev $swp1 master br0
+
+ # The test will use the offload indication on the FDB entry to
+ # understand if the tunnel is offloaded or not
+ bridge fdb append $zmac dev vxlan0 self dst 192.0.2.1
+
+ ip link set dev vxlan0 master br0
+ bridge vlan add dev vxlan0 vid 10 pvid untagged
+
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show brport vxlan0
+ check_err $? "vxlan tunnel not offloaded when should"
+
+ # Configure a VLAN interface and make sure tunnel is offloaded
+ ip link add link br0 name br10 up type vlan id 10
+ sysctl_set net.ipv6.conf.br10.disable_ipv6 0
+ ip -6 address add 2001:db8:1::1/64 dev br10
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show brport vxlan0
+ check_err $? "vxlan tunnel not offloaded when should"
+
+ # Unlink the VXLAN device, make sure tunnel is no longer offloaded,
+ # then add it back to the bridge and make sure it is offloaded
+ ip link set dev vxlan0 nomaster
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show brport vxlan0
+ check_err $? "vxlan tunnel offloaded after unlinked from bridge"
+
+ ip link set dev vxlan0 master br0
+ busywait "$TIMEOUT" not wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show brport vxlan0
+ check_err $? "vxlan tunnel offloaded despite no matching vid"
+
+ bridge vlan add dev vxlan0 vid 10 pvid untagged
+ busywait "$TIMEOUT" wait_for_offload grep_bridge_fdb $zmac self \
+ bridge fdb show brport vxlan0
+ check_err $? "vxlan tunnel not offloaded after adding vid"
+
+ log_test "vxlan - l3 vni"
+
+ ip link del dev vxlan0
+ ip link del dev br0
+ sysctl_restore net.ipv6.conf.default.disable_ipv6
+}
+
+offload_indication_vlan_aware_test()
+{
+ offload_indication_vlan_aware_setup_create
+ offload_indication_vlan_aware_fdb_test
+ offload_indication_vlan_aware_decap_route_test
+ offload_indication_vlan_aware_setup_destroy
+
+ log_info "offload indication - replay & cleanup - vlan aware"
+ offload_indication_vlan_aware_join_vxlan_first
+ offload_indication_vlan_aware_join_vxlan_last
+ offload_indication_vlan_aware_l3vni_test
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh
new file mode 100755
index 000000000..749ba3cfd
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_fdb_veto.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test vetoing of FDB entries that mlxsw can not offload. This exercises several
+# different veto vectors to test various rollback scenarios in the vxlan driver.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ fdb_create_veto_test
+ fdb_replace_veto_test
+ fdb_append_veto_test
+ fdb_changelink_veto_test
+"
+NUM_NETIFS=2
+source $lib_dir/lib.sh
+
+setup_prepare()
+{
+ swp1=${NETIFS[p1]}
+ swp2=${NETIFS[p2]}
+
+ ip link add dev br0 type bridge mcast_snooping 0
+
+ ip link set dev $swp1 up
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 up
+
+ ip link add name vxlan0 up type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+ ip link set dev vxlan0 master br0
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip link set dev vxlan0 nomaster
+ ip link del dev vxlan0
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 nomaster
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+# Veto on FDB creation: adding an entry with MAC 01:02:03:04:05:06 (treated
+# as a multicast MAC by the failure messages below) to vxlan0 must fail, and
+# the error output must mention mlxsw_spectrum.
+fdb_create_veto_test()
+{
+ RET=0
+
+ bridge fdb add 01:02:03:04:05:06 dev vxlan0 self static \
+ dst 198.51.100.2 2>/dev/null
+ check_fail $? "multicast MAC not rejected"
+
+ # "2>&1 >/dev/null" sends stderr down the pipe and discards stdout, so
+ # grep only sees the error text
+ bridge fdb add 01:02:03:04:05:06 dev vxlan0 self static \
+ dst 198.51.100.2 2>&1 >/dev/null | grep -q mlxsw_spectrum
+ check_err $? "multicast MAC rejected without extack"
+
+ log_test "vxlan FDB veto - create"
+}
+
+# Veto on FDB replace: a plain entry for 00:01:02:03:04:05 must be accepted,
+# while replacing it with one that carries an explicit port (1234) must fail
+# and the error output must mention mlxsw_spectrum.
+# The entry added here is not deleted by this function.
+fdb_replace_veto_test()
+{
+ RET=0
+
+ bridge fdb add 00:01:02:03:04:05 dev vxlan0 self static \
+ dst 198.51.100.2
+ check_err $? "valid FDB rejected"
+
+ bridge fdb replace 00:01:02:03:04:05 dev vxlan0 self static \
+ dst 198.51.100.2 port 1234 2>/dev/null
+ check_fail $? "FDB with an explicit port not rejected"
+
+ # stderr goes down the pipe, stdout is discarded
+ bridge fdb replace 00:01:02:03:04:05 dev vxlan0 self static \
+ dst 198.51.100.2 port 1234 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "FDB with an explicit port rejected without extack"
+
+ log_test "vxlan FDB veto - replace"
+}
+
+# Veto on FDB append: an all-zeros MAC entry towards 198.51.100.2 must be
+# accepted, while appending a second destination (198.51.100.3) with an
+# explicit port (1234) must fail and the error output must mention
+# mlxsw_spectrum.
+# The entry added here is not deleted by this function.
+fdb_append_veto_test()
+{
+ RET=0
+
+ bridge fdb add 00:00:00:00:00:00 dev vxlan0 self static \
+ dst 198.51.100.2
+ check_err $? "valid FDB rejected"
+
+ bridge fdb append 00:00:00:00:00:00 dev vxlan0 self static \
+ dst 198.51.100.3 port 1234 2>/dev/null
+ check_fail $? "FDB with an explicit port not rejected"
+
+ # stderr goes down the pipe, stdout is discarded
+ bridge fdb append 00:00:00:00:00:00 dev vxlan0 self static \
+ dst 198.51.100.3 port 1234 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "FDB with an explicit port rejected without extack"
+
+ log_test "vxlan FDB veto - append"
+}
+
+# Veto on changelink: changing vxlan0 to use group 224.0.0.1 on lo must
+# fail, and the error output must mention mlxsw_spectrum.
+fdb_changelink_veto_test()
+{
+ RET=0
+
+ ip link set dev vxlan0 type vxlan \
+ group 224.0.0.1 dev lo 2>/dev/null
+ check_fail $? "FDB with a multicast IP not rejected"
+
+ # stderr goes down the pipe, stdout is discarded
+ ip link set dev vxlan0 type vxlan \
+ group 224.0.0.1 dev lo 2>&1 >/dev/null \
+ | grep -q mlxsw_spectrum
+ check_err $? "FDB with a multicast IP rejected without extack"
+
+ log_test "vxlan FDB veto - changelink"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh
new file mode 100755
index 000000000..af5ea50ed
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/vxlan_flooding.sh
@@ -0,0 +1,326 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test VxLAN flooding. The device stores flood records in a singly linked list
+# where each record stores up to three IPv4 addresses of remote VTEPs. The test
+# verifies that packets are correctly flooded in various cases such as deletion
+# of a record in the middle of the list.
+#
+# +--------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 203.0.113.1/24|
+# +----|---------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | +--|--------------------------------------------------------------------+ |
+# | | + $swp1 BR0 (802.1d) | |
+# | | | |
+# | | + vxlan0 (vxlan) | |
+# | | local 198.51.100.1 | |
+# | | remote 198.51.100.{2..13} | |
+# | | id 10 dstport 4789 | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | 198.51.100.0/24 via 192.0.2.2 |
+# | |
+# | + $rp1 |
+# | | 192.0.2.1/24 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | R2 (vrf) |
+# | + $rp2 |
+# | 192.0.2.2/24 |
+# | |
+# +-------------------------------------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="flooding_test"
+NUM_NETIFS=4
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 203.0.113.1/24
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 203.0.113.1/24
+}
+
+switch_create()
+{
+ # Make sure the bridge uses the MAC address of the local port and
+ # not that of the VxLAN's device
+ ip link add dev br0 type bridge mcast_snooping 0
+ ip link set dev br0 address $(mac_get $swp1)
+
+ ip link add name vxlan0 type vxlan id 10 nolearning noudpcsum \
+ ttl 20 tos inherit local 198.51.100.1 dstport 4789
+
+ ip address add 198.51.100.1/32 dev lo
+
+ ip link set dev $swp1 master br0
+ ip link set dev vxlan0 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev vxlan0 up
+}
+
+switch_destroy()
+{
+ ip link set dev vxlan0 down
+ ip link set dev $swp1 down
+ ip link set dev br0 down
+
+ ip link set dev vxlan0 nomaster
+ ip link set dev $swp1 nomaster
+
+ ip address del 198.51.100.1/32 dev lo
+
+ ip link del dev vxlan0
+
+ ip link del dev br0
+}
+
+router1_create()
+{
+ # This router is in the default VRF, where the VxLAN device is
+ # performing the L3 lookup
+ ip link set dev $rp1 up
+ ip address add 192.0.2.1/24 dev $rp1
+ ip route add 198.51.100.0/24 via 192.0.2.2
+}
+
+router1_destroy()
+{
+ ip route del 198.51.100.0/24 via 192.0.2.2
+ ip address del 192.0.2.1/24 dev $rp1
+ ip link set dev $rp1 down
+}
+
+router2_create()
+{
+ # This router is not in the default VRF, so use simple_if_init()
+ simple_if_init $rp2 192.0.2.2/24
+}
+
+router2_destroy()
+{
+ simple_if_fini $rp2 192.0.2.2/24
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ rp1=${NETIFS[p3]}
+ rp2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+
+ switch_create
+
+ router1_create
+ router2_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router2_destroy
+ router1_destroy
+
+ switch_destroy
+
+ h1_destroy
+
+ vrf_cleanup
+}
+
+flooding_remotes_add()
+{
+ local num_remotes=$1
+ local lsb
+ local i
+
+ for i in $(eval echo {1..$num_remotes}); do
+ lsb=$((i + 1))
+
+ bridge fdb append 00:00:00:00:00:00 dev vxlan0 self \
+ dst 198.51.100.$lsb
+ done
+}
+
+flooding_filters_add()
+{
+ local num_remotes=$1
+ local lsb
+ local i
+
+ # Prevent unwanted packets from entering the bridge and interfering
+ # with the test.
+ tc qdisc add dev br0 clsact
+ tc filter add dev br0 egress protocol all pref 1 handle 1 \
+ matchall skip_hw action drop
+ tc qdisc add dev $h1 clsact
+ tc filter add dev $h1 egress protocol all pref 1 handle 1 \
+ flower skip_hw dst_mac de:ad:be:ef:13:37 action pass
+ tc filter add dev $h1 egress protocol all pref 2 handle 2 \
+ matchall skip_hw action drop
+
+ tc qdisc add dev $rp2 clsact
+
+ for i in $(eval echo {1..$num_remotes}); do
+ lsb=$((i + 1))
+
+ tc filter add dev $rp2 ingress protocol ip pref $i handle $i \
+ flower ip_proto udp dst_ip 198.51.100.$lsb \
+ dst_port 4789 skip_sw action drop
+ done
+}
+
+flooding_filters_del()
+{
+ local num_remotes=$1
+ local i
+
+ for i in $(eval echo {1..$num_remotes}); do
+ tc filter del dev $rp2 ingress protocol ip pref $i \
+ handle $i flower
+ done
+
+ tc qdisc del dev $rp2 clsact
+
+ tc filter del dev $h1 egress protocol all pref 2 handle 2 matchall
+ tc filter del dev $h1 egress protocol all pref 1 handle 1 flower
+ tc qdisc del dev $h1 clsact
+ tc filter del dev br0 egress protocol all pref 1 handle 1 matchall
+ tc qdisc del dev br0 clsact
+}
+
+# Compare the per-remote filter counters on the ingress of $rp2 with the
+# expected values.
+#
+# $@ - expected packet counts; argument i is checked against the filter with
+#      handle i. A mismatch is recorded with check_err.
+flooding_check_packets()
+{
+	local packets=("$@")
+	local num_remotes=${#packets[@]}
+	local i
+
+	# Arithmetic loop instead of "eval echo {1..N}": no eval is needed and
+	# an empty list yields no iterations instead of indexing packets[0]
+	# and packets[-1]
+	for ((i = 1; i <= num_remotes; i++)); do
+		tc_check_packets "dev $rp2 ingress" $i ${packets[i - 1]}
+		check_err $? "remote $i - did not get expected number of packets"
+	done
+}
+
+flooding_test()
+{
+ # Use 12 remote VTEPs that will be stored in 4 records. The array
+ # 'packets' will store how many packets are expected to be received
+ # by each remote VTEP at each stage of the test
+ declare -a packets=(1 1 1 1 1 1 1 1 1 1 1 1)
+ local num_remotes=12
+
+ RET=0
+
+ # Add FDB entries for remote VTEPs and corresponding tc filters on the
+ # ingress of the nexthop router. These filters will count how many
+ # packets were flooded to each remote VTEP
+ flooding_remotes_add $num_remotes
+ flooding_filters_add $num_remotes
+
+ # Send one packet and make sure it is flooded to all the remote VTEPs
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 1 packet"
+
+ # Delete the third record which corresponds to VTEPs with LSB 8..10
+ # and check that packet is flooded correctly when we remove a record
+ # from the middle of the list
+ RET=0
+
+ packets=(2 2 2 2 2 2 1 1 1 2 2 2)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.8
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.9
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.10
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 2 packets"
+
+ # Delete the first record and make sure the packet is flooded correctly
+ RET=0
+
+ packets=(2 2 2 3 3 3 1 1 1 3 3 3)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.2
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.3
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.4
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 3 packets"
+
+ # Delete the last record and make sure the packet is flooded correctly
+ RET=0
+
+ packets=(2 2 2 4 4 4 1 1 1 3 3 3)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.11
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.12
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.13
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 4 packets"
+
+ # Delete the last remaining record (VTEPs with LSB 5..7), one entry at
+ # a time, and make sure single entries are correctly removed
+ RET=0
+
+ packets=(2 2 2 4 5 5 1 1 1 3 3 3)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.5
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 5 packets"
+
+ RET=0
+
+ packets=(2 2 2 4 5 6 1 1 1 3 3 3)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.6
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 6 packets"
+
+ RET=0
+
+ packets=(2 2 2 4 5 6 1 1 1 3 3 3)
+ bridge fdb del 00:00:00:00:00:00 dev vxlan0 self dst 198.51.100.7
+
+ $MZ $h1 -q -p 64 -b de:ad:be:ef:13:37 -t ip -c 1
+ flooding_check_packets "${packets[@]}"
+ log_test "flood after 7 packets"
+
+ flooding_filters_del $num_remotes
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
new file mode 100755
index 000000000..2c81e01c3
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
@@ -0,0 +1,548 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="fw_flash_test params_test regions_test reload_test \
+ netns_reload_test resource_test dev_info_test \
+ empty_reporter_test dummy_reporter_test"
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+
+BUS_ADDR=10
+PORT_COUNT=4
+DEV_NAME=netdevsim$BUS_ADDR
+SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV_NAME/net/
+DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV_NAME/
+DL_HANDLE=netdevsim/$DEV_NAME
+
+# Run the command given as arguments and succeed only if its output
+# mentions the devlink handle under test.
+wait_for_devlink()
+{
+	"$@" | grep -q "$DL_HANDLE"
+}
+
+# Poll "devlink dev" through busywait until the handle under test shows up.
+# $1 - timeout, handed to busywait unchanged.
+devlink_wait()
+{
+	local limit=$1
+
+	busywait "$limit" wait_for_devlink devlink dev
+}
+
+# Exercise "devlink dev flash": plain and per-component flashing, the
+# overwrite mask exposed through debugfs, and flashing with status updates
+# switched off.
+fw_flash_test()
+{
+	RET=0
+
+	devlink dev flash $DL_HANDLE file dummy
+	check_err $? "Failed to flash with status updates on"
+
+	devlink dev flash $DL_HANDLE file dummy component fw.mgmt
+	check_err $? "Failed to flash with component attribute"
+
+	# Expected to fail until the overwrite mask is changed below.
+	devlink dev flash $DL_HANDLE file dummy overwrite settings
+	check_fail $? "Flash with overwrite settings should be rejected"
+
+	# Mask value 1: "settings" is accepted, "identifiers" still is not.
+	echo "1"> $DEBUGFS_DIR/fw_update_overwrite_mask
+	check_err $? "Failed to change allowed overwrite mask"
+
+	devlink dev flash $DL_HANDLE file dummy overwrite settings
+	check_err $? "Failed to flash with settings overwrite enabled"
+
+	devlink dev flash $DL_HANDLE file dummy overwrite identifiers
+	check_fail $? "Flash with overwrite identifiers should be rejected"
+
+	# Mask value 3: both overwrite kinds are accepted together.
+	echo "3"> $DEBUGFS_DIR/fw_update_overwrite_mask
+	check_err $? "Failed to change allowed overwrite mask"
+
+	devlink dev flash $DL_HANDLE file dummy overwrite identifiers overwrite settings
+	check_err $? "Failed to flash with settings and identifiers overwrite enabled"
+
+	echo "n"> $DEBUGFS_DIR/fw_update_status
+	check_err $? "Failed to disable status updates"
+
+	devlink dev flash $DL_HANDLE file dummy
+	check_err $? "Failed to flash with status updates off"
+
+	log_test "fw flash test"
+}
+
+# Print the driverinit-cmode value of a devlink device parameter.
+# $1 - parameter name.
+param_get()
+{
+	local param=$1
+	local filter='.[][][].values[] | select(.cmode == "driverinit").value'
+
+	cmd_jq "devlink dev param show $DL_HANDLE name $param -j" "$filter"
+}
+
+# Set the driverinit-cmode value of a devlink device parameter.
+# $1 - parameter name, $2 - new value.
+param_set()
+{
+	local param=$1 new_value=$2
+
+	devlink dev param set $DL_HANDLE name $param \
+		cmode driverinit value $new_value
+}
+
+check_value()
+{
+ local name=$1
+ local phase_name=$2
+ local expected_param_value=$3
+ local expected_debugfs_value=$4
+ local value
+
+ value=$(param_get $name)
+ check_err $? "Failed to get $name param value"
+ [ "$value" == "$expected_param_value" ]
+ check_err $? "Unexpected $phase_name $name param value"
+ value=$(<$DEBUGFS_DIR/$name)
+ check_err $? "Failed to get $name debugfs value"
+ [ "$value" == "$expected_debugfs_value" ]
+ check_err $? "Unexpected $phase_name $name debugfs value"
+}
+
+# Verify driverinit parameters: devlink reports a new value right after
+# "param set", while the debugfs value is only expected to follow after a
+# reload.
+params_test()
+{
+	RET=0
+
+	check_value max_macs initial 32 32
+	check_value test1 initial true Y
+
+	param_set max_macs 16
+	check_err $? "Failed to set max_macs param value"
+	param_set test1 false
+	check_err $? "Failed to set test1 param value"
+
+	# devlink shows the new values, debugfs still shows the initial ones.
+	check_value max_macs post-set 16 32
+	check_value test1 post-set false Y
+
+	# A failed reload would otherwise only surface as confusing
+	# post-reload value mismatches, so report it explicitly.
+	devlink dev reload $DL_HANDLE
+	check_err $? "Failed to reload"
+
+	check_value max_macs post-reload 16 16
+	check_value test1 post-reload false N
+
+	log_test "params test"
+}
+
+# Check that the named devlink region reports a size of 32768.
+# $1 - region name.
+check_region_size()
+{
+	local region=$1
+	local bytes
+
+	bytes=$(devlink region show $DL_HANDLE/$region -j | jq -e -r '.[][].size')
+	check_err $? "Failed to get $region region size"
+	test $bytes -eq 32768
+	check_err $? "Invalid $region region size"
+}
+
+# Check how many snapshots the named devlink region currently holds.
+# $1 - region name
+# $2 - phase label used in the failure message
+# $3 - expected number of snapshots
+check_region_snapshot_count()
+{
+	local region=$1
+	local phase=$2
+	local want=$3
+	local have
+
+	have=$(devlink region show $DL_HANDLE/$region -j | jq -e -r '.[][].snapshot | length')
+	test $have -eq $want
+	check_err $? "Unexpected $phase snapshot count"
+}
+
+# Exercise the "dummy" devlink region: take snapshots through debugfs,
+# create, dump, read and delete snapshots through devlink, both with a
+# caller-chosen id and with an id returned by "region new".
+regions_test()
+{
+	RET=0
+
+	# Snapshot id printed by "devlink region new" when none is requested.
+	local sid
+
+	check_region_size dummy
+	check_region_snapshot_count dummy initial 0
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take first dummy region snapshot"
+	check_region_snapshot_count dummy post-first-snapshot 1
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take second dummy region snapshot"
+	check_region_snapshot_count dummy post-second-snapshot 2
+
+	echo ""> $DEBUGFS_DIR/take_snapshot
+	check_err $? "Failed to take third dummy region snapshot"
+	check_region_snapshot_count dummy post-third-snapshot 3
+
+	devlink region del $DL_HANDLE/dummy snapshot 1
+	check_err $? "Failed to delete first dummy region snapshot"
+
+	check_region_snapshot_count dummy post-first-delete 2
+
+	devlink region new $DL_HANDLE/dummy snapshot 25
+	check_err $? "Failed to create a new snapshot with id 25"
+
+	check_region_snapshot_count dummy post-first-request 3
+
+	devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null
+	check_err $? "Failed to dump snapshot with id 25"
+
+	devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null
+	check_err $? "Failed to read snapshot with id 25 (1 byte)"
+
+	devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null
+	check_err $? "Failed to read snapshot with id 25 (128 bytes)"
+
+	# An oversized length is expected to succeed, a bad address to fail.
+	devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null
+	check_err $? "Failed to read snapshot with id 25 (oversized)"
+
+	devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1
+	check_fail $? "Bad read of snapshot with id 25 did not fail"
+
+	devlink region del $DL_HANDLE/dummy snapshot 25
+	check_err $? "Failed to delete snapshot with id 25"
+
+	check_region_snapshot_count dummy post-second-delete 2
+
+	sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]')
+	check_err $? "Failed to create a new snapshot with id allocated by the kernel"
+
+	# Distinct phase labels so a failure identifies the failing check.
+	check_region_snapshot_count dummy post-second-request 3
+
+	devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null
+	check_err $? "Failed to dump a snapshot with id allocated by the kernel"
+
+	devlink region del $DL_HANDLE/dummy snapshot $sid
+	check_err $? "Failed to delete snapshot with id allocated by the kernel"
+
+	check_region_snapshot_count dummy post-third-delete 2
+
+	log_test "regions test"
+}
+
+# Exercise "devlink dev reload": a plain reload, a reload made to fail via
+# the fail_reload debugfs knob, and a reload refused via dont_allow_reload.
+# After each knob is switched back, a reload is expected to succeed again.
+reload_test()
+{
+ RET=0
+
+ devlink dev reload $DL_HANDLE
+ check_err $? "Failed to reload"
+
+ echo "y"> $DEBUGFS_DIR/fail_reload
+ check_err $? "Failed to setup devlink reload to fail"
+
+ devlink dev reload $DL_HANDLE
+ check_fail $? "Unexpected success of devlink reload"
+
+ echo "n"> $DEBUGFS_DIR/fail_reload
+ check_err $? "Failed to setup devlink reload not to fail"
+
+ devlink dev reload $DL_HANDLE
+ check_err $? "Failed to reload after set not to fail"
+
+ echo "y"> $DEBUGFS_DIR/dont_allow_reload
+ check_err $? "Failed to forbid devlink reload"
+
+ devlink dev reload $DL_HANDLE
+ check_fail $? "Unexpected success of devlink reload"
+
+ echo "n"> $DEBUGFS_DIR/dont_allow_reload
+ check_err $? "Failed to re-enable devlink reload"
+
+ devlink dev reload $DL_HANDLE
+ check_err $? "Failed to reload after re-enable"
+
+ log_test "reload test"
+}
+
+# Reload the devlink instance into network namespace testns1, then from
+# testns1 into testns2, and finally delete both namespaces.
+netns_reload_test()
+{
+ RET=0
+
+ ip netns add testns1
+ check_err $? "Failed add netns \"testns1\""
+ ip netns add testns2
+ check_err $? "Failed add netns \"testns2\""
+
+ devlink dev reload $DL_HANDLE netns testns1
+ check_err $? "Failed to reload into netns \"testns1\""
+
+ devlink -N testns1 dev reload $DL_HANDLE netns testns2
+ check_err $? "Failed to reload from netns \"testns1\" into netns \"testns2\""
+
+ ip netns del testns2
+ ip netns del testns1
+
+ # Wait until netns async cleanup is done.
+ # devlink_wait polls "devlink dev" until the handle is listed again.
+ devlink_wait 2000
+
+ log_test "netns reload test"
+}
+
+DUMMYDEV="dummytest"
+
+# Print one field of a nested devlink resource, queried inside a namespace.
+# $1 - network namespace passed to "devlink -N"
+# $2 - name of the parent resource
+# $3 - name of the child resource under that parent
+# $4 - field of the child resource to print (callers use occ, size, size_new)
+res_val_get()
+{
+ local netns=$1
+ local parentname=$2
+ local name=$3
+ local type=$4
+
+ cmd_jq "devlink -N $netns resource show $DL_HANDLE -j" \
+ ".[][][] | select(.name == \"$parentname\").resources[] \
+ | select(.name == \"$name\").$type"
+}
+
+# Verify that the IPv4/fib resource size limit is enforced: limit the
+# instance to one more route than it currently holds, check that only one
+# extra route can be added, and check that a reload into a namespace that
+# already holds more routes is refused.
+resource_test()
+{
+ RET=0
+
+ ip netns add testns1
+ check_err $? "Failed add netns \"testns1\""
+ ip netns add testns2
+ check_err $? "Failed add netns \"testns2\""
+
+ devlink dev reload $DL_HANDLE netns testns1
+ check_err $? "Failed to reload into netns \"testns1\""
+
+ # Create dummy dev to add the address and routes on.
+
+ ip -n testns1 link add name $DUMMYDEV type dummy
+ check_err $? "Failed create dummy device"
+ ip -n testns1 link set $DUMMYDEV up
+ check_err $? "Failed bring up dummy device"
+ ip -n testns1 a a 192.0.1.1/24 dev $DUMMYDEV
+ check_err $? "Failed add an IP address to dummy device"
+
+ # Current occupancy plus one becomes the new size limit.
+ local occ=$(res_val_get testns1 IPv4 fib occ)
+ local limit=$((occ+1))
+
+ # Set fib size limit to handle one another route only.
+
+ devlink -N testns1 resource set $DL_HANDLE path IPv4/fib size $limit
+ check_err $? "Failed to set IPv4/fib resource size"
+ local size_new=$(res_val_get testns1 IPv4 fib size_new)
+ [ "$size_new" -eq "$limit" ]
+ check_err $? "Unexpected \"size_new\" value (got $size_new, expected $limit)"
+
+ # After the reload the "size" field is expected to equal the limit.
+ devlink -N testns1 dev reload $DL_HANDLE
+ check_err $? "Failed to reload"
+ local size=$(res_val_get testns1 IPv4 fib size)
+ [ "$size" -eq "$limit" ]
+ check_err $? "Unexpected \"size\" value (got $size, expected $limit)"
+
+ # Insert 2 routes, the first is going to be inserted,
+ # the second is expected to fail to be inserted.
+
+ ip -n testns1 r a 192.0.2.0/24 via 192.0.1.2
+ check_err $? "Failed to add route"
+
+ ip -n testns1 r a 192.0.3.0/24 via 192.0.1.2
+ check_fail $? "Unexpected successful route add over limit"
+
+ # Now create another dummy in second network namespace and
+ # insert two routes. That is over the limit of the netdevsim
+ # instance in the first namespace. Move the netdevsim instance
+ # into the second namespace and expect it to fail.
+
+ ip -n testns2 link add name $DUMMYDEV type dummy
+ check_err $? "Failed create dummy device"
+ ip -n testns2 link set $DUMMYDEV up
+ check_err $? "Failed bring up dummy device"
+ ip -n testns2 a a 192.0.1.1/24 dev $DUMMYDEV
+ check_err $? "Failed add an IP address to dummy device"
+ ip -n testns2 r a 192.0.2.0/24 via 192.0.1.2
+ check_err $? "Failed to add route"
+ ip -n testns2 r a 192.0.3.0/24 via 192.0.1.2
+ check_err $? "Failed to add route"
+
+ devlink -N testns1 dev reload $DL_HANDLE netns testns2
+ check_fail $? "Unexpected successful reload from netns \"testns1\" into netns \"testns2\""
+
+ # NOTE(review): the instance is addressed through testns2 here even
+ # though the reload into testns2 above is expected to fail - confirm
+ # which namespace holds the instance at this point.
+ devlink -N testns2 resource set $DL_HANDLE path IPv4/fib size ' -1'
+ check_err $? "Failed to reset IPv4/fib resource size"
+
+ devlink -N testns2 dev reload $DL_HANDLE netns 1
+ check_err $? "Failed to reload devlink back"
+
+ ip netns del testns2
+ ip netns del testns1
+
+ # Wait until netns async cleanup is done.
+ devlink_wait 2000
+
+ log_test "resource test"
+}
+
+# Print one field of "devlink dev info" for the handle under test.
+# $1 - field name.
+info_get()
+{
+	local key=$1
+	local filter=".[][][\"$key\"]"
+
+	cmd_jq "devlink dev info $DL_HANDLE -j" "$filter" "-e"
+}
+
+# Check that "devlink dev info" reports "netdevsim" as the driver name.
+dev_info_test()
+{
+	RET=0
+
+	driver=$(info_get "driver")
+	check_err $? "Failed to get driver name"
+
+	[[ "$driver" == "netdevsim" ]]
+	check_err $? "Unexpected driver name $driver"
+
+	log_test "dev_info test"
+}
+
+# Check that show, dump show, diagnose and recover all succeed on the
+# health reporter named "empty".
+empty_reporter_test()
+{
+ RET=0
+
+ devlink health show $DL_HANDLE reporter empty >/dev/null
+ check_err $? "Failed show empty reporter"
+
+ devlink health dump show $DL_HANDLE reporter empty >/dev/null
+ check_err $? "Failed show dump of empty reporter"
+
+ devlink health diagnose $DL_HANDLE reporter empty >/dev/null
+ check_err $? "Failed diagnose empty reporter"
+
+ devlink health recover $DL_HANDLE reporter empty
+ check_err $? "Failed recover empty reporter"
+
+ log_test "empty reporter test"
+}
+
+# Compare the fields reported by "devlink health show" for a reporter with
+# the expected values, recording every mismatch through check_err.
+# $1 - reporter name
+# $2 - expected "state"
+# $3 - expected "error" value
+# $4 - expected "recover" value
+# $5 - expected "grace_period"
+# $6 - expected "auto_recover"
+check_reporter_info()
+{
+	local name=$1
+	local expected_state=$2
+	local expected_error=$3
+	local expected_recover=$4
+	local expected_grace_period=$5
+	local expected_auto_recover=$6
+
+	# Declared separately from the assignments: "local v=$(cmd)" returns
+	# the status of "local" (always 0), which would hide a failing command
+	# from the check_err that follows it.
+	local show state error recover grace_period auto_recover
+
+	show=$(devlink health show $DL_HANDLE reporter $name -j | jq -e -r ".[][][]")
+	check_err $? "Failed show $name reporter"
+
+	state=$(echo "$show" | jq -r ".state")
+	[ "$state" == "$expected_state" ]
+	check_err $? "Unexpected \"state\" value (got $state, expected $expected_state)"
+
+	error=$(echo "$show" | jq -r ".error")
+	[ "$error" == "$expected_error" ]
+	check_err $? "Unexpected \"error\" value (got $error, expected $expected_error)"
+
+	recover=$(echo "$show" | jq -r ".recover")
+	[ "$recover" == "$expected_recover" ]
+	check_err $? "Unexpected \"recover\" value (got $recover, expected $expected_recover)"
+
+	grace_period=$(echo "$show" | jq -r ".grace_period")
+	check_err $? "Failed get $name reporter grace_period"
+	[ "$grace_period" == "$expected_grace_period" ]
+	check_err $? "Unexpected \"grace_period\" value (got $grace_period, expected $expected_grace_period)"
+
+	auto_recover=$(echo "$show" | jq -r ".auto_recover")
+	[ "$auto_recover" == "$expected_auto_recover" ]
+	check_err $? "Unexpected \"auto_recover\" value (got $auto_recover, expected $expected_auto_recover)"
+}
+
+# Exercise the "dummy" health reporter: toggle auto_recover, break the
+# reporter through debugfs, inspect and clear its dump, recover it, set a
+# grace period, make recovery fail, and finally dump with a binary payload
+# length of 8192.
+dummy_reporter_test()
+{
+	# Declared separately from the assignments so that the check_err
+	# after each command substitution sees the command's status rather
+	# than the always-zero status of "local".
+	local BREAK_MSG="foo bar"
+	local dump dump_break_msg
+	local diagnose rcvrd_break_msg
+
+	RET=0
+
+	check_reporter_info dummy healthy 0 0 0 true
+
+	devlink health set $DL_HANDLE reporter dummy auto_recover false
+	check_err $? "Failed to dummy reporter auto_recover option"
+
+	check_reporter_info dummy healthy 0 0 0 false
+
+	# With auto_recover off the reporter is expected to stay in error.
+	echo "$BREAK_MSG"> $DEBUGFS_DIR/health/break_health
+	check_err $? "Failed to break dummy reporter"
+
+	check_reporter_info dummy error 1 0 0 false
+
+	dump=$(devlink health dump show $DL_HANDLE reporter dummy -j)
+	check_err $? "Failed show dump of dummy reporter"
+
+	dump_break_msg=$(echo "$dump" | jq -r ".break_message")
+	[ "$dump_break_msg" == "$BREAK_MSG" ]
+	check_err $? "Unexpected dump break message value (got $dump_break_msg, expected $BREAK_MSG)"
+
+	devlink health dump clear $DL_HANDLE reporter dummy
+	check_err $? "Failed clear dump of dummy reporter"
+
+	devlink health recover $DL_HANDLE reporter dummy
+	check_err $? "Failed recover dummy reporter"
+
+	check_reporter_info dummy healthy 1 1 0 false
+
+	devlink health set $DL_HANDLE reporter dummy auto_recover true
+	check_err $? "Failed to dummy reporter auto_recover option"
+
+	check_reporter_info dummy healthy 1 1 0 true
+
+	# With auto_recover on the reporter is expected to be healthy again
+	# right after the break.
+	echo "$BREAK_MSG"> $DEBUGFS_DIR/health/break_health
+	check_err $? "Failed to break dummy reporter"
+
+	check_reporter_info dummy healthy 2 2 0 true
+
+	diagnose=$(devlink health diagnose $DL_HANDLE reporter dummy -j -p)
+	check_err $? "Failed show diagnose of dummy reporter"
+
+	rcvrd_break_msg=$(echo "$diagnose" | jq -r ".recovered_break_message")
+	[ "$rcvrd_break_msg" == "$BREAK_MSG" ]
+	check_err $? "Unexpected recovered break message value (got $rcvrd_break_msg, expected $BREAK_MSG)"
+
+	devlink health set $DL_HANDLE reporter dummy grace_period 10
+	check_err $? "Failed to dummy reporter grace_period option"
+
+	check_reporter_info dummy healthy 2 2 10 true
+
+	# Once recovery is set to fail, the break write itself is expected
+	# to fail and leave the reporter in error state.
+	echo "Y"> $DEBUGFS_DIR/health/fail_recover
+	check_err $? "Failed set dummy reporter recovery to fail"
+
+	echo "$BREAK_MSG"> $DEBUGFS_DIR/health/break_health
+	check_fail $? "Unexpected success of dummy reporter break"
+
+	check_reporter_info dummy error 3 2 10 true
+
+	devlink health recover $DL_HANDLE reporter dummy
+	check_fail $? "Unexpected success of dummy reporter recover"
+
+	echo "N"> $DEBUGFS_DIR/health/fail_recover
+	check_err $? "Failed set dummy reporter recovery to be successful"
+
+	devlink health recover $DL_HANDLE reporter dummy
+	check_err $? "Failed recover dummy reporter"
+
+	check_reporter_info dummy healthy 3 3 10 true
+
+	echo 8192 > $DEBUGFS_DIR/health/binary_len
+	check_err $? "Failed set dummy reporter binary len to 8192"
+
+	dump=$(devlink health dump show $DL_HANDLE reporter dummy -j)
+	check_err $? "Failed show dump of dummy reporter"
+
+	devlink health dump clear $DL_HANDLE reporter dummy
+	check_err $? "Failed clear dump of dummy reporter"
+
+	log_test "dummy reporter test"
+}
+
+# Load netdevsim, create the device under test and spin until its net/
+# directory shows up in sysfs.
+setup_prepare()
+{
+	modprobe netdevsim
+	echo "$BUS_ADDR $PORT_COUNT" > /sys/bus/netdevsim/new_device
+	until [ -d $SYSFS_NET_DIR ]; do
+		:
+	done
+}
+
+# Delete the netdevsim device created by setup_prepare and unload the
+# module. Installed as the EXIT trap below.
+cleanup()
+{
+ pre_cleanup
+ echo "$BUS_ADDR" > /sys/bus/netdevsim/del_device
+ modprobe -r netdevsim
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_in_netns.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_in_netns.sh
new file mode 100755
index 000000000..7effd3536
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_in_netns.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="check_devlink_test check_ports_test"
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+
+BUS_ADDR=10
+PORT_COUNT=4
+DEV_NAME=netdevsim$BUS_ADDR
+SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV_NAME/net/
+DL_HANDLE=netdevsim/$DEV_NAME
+NETNS_NAME=testns1
+
+# Print the netdev name of a devlink port, queried inside $NETNS_NAME.
+# $1 - port index under $DL_HANDLE.
+port_netdev_get()
+{
+	local idx=$1
+	local filter=".[][\"$DL_HANDLE/$idx\"].netdev"
+
+	cmd_jq "devlink -N $NETNS_NAME port show -j" "$filter" "-e"
+}
+
+check_ports_test()
+{
+ RET=0
+
+ for i in $(seq 0 $(expr $PORT_COUNT - 1)); do
+ netdev_name=$(port_netdev_get $i)
+ check_err $? "Failed to get netdev name for port $DL_HANDLE/$i"
+ ip -n $NETNS_NAME link show $netdev_name &> /dev/null
+ check_err $? "Failed to find netdev $netdev_name"
+ done
+
+ log_test "check ports test"
+}
+
+# Check that "devlink dev show" finds the instance when run with
+# -N $NETNS_NAME.
+check_devlink_test()
+{
+ RET=0
+
+ devlink -N $NETNS_NAME dev show $DL_HANDLE &> /dev/null
+ check_err $? "Failed to show devlink instance"
+
+ log_test "check devlink test"
+}
+
+# Load netdevsim, create the namespace and create the device, then spin
+# until the device's net/ directory shows up in sysfs.
+setup_prepare()
+{
+ modprobe netdevsim
+ ip netns add $NETNS_NAME
+ # The redirection is opened by this shell, but echo - and therefore the
+ # write itself - runs under "ip netns exec"; presumably that is what
+ # places the new device in $NETNS_NAME - verify against the driver.
+ ip netns exec $NETNS_NAME \
+ echo "$BUS_ADDR $PORT_COUNT" > /sys/bus/netdevsim/new_device
+ while [ ! -d $SYSFS_NET_DIR ] ; do :; done
+}
+
+# Undo setup_prepare: delete the device, the namespace and unload the
+# module. Installed as the EXIT trap below.
+cleanup()
+{
+ pre_cleanup
+ echo "$BUS_ADDR" > /sys/bus/netdevsim/del_device
+ ip netns del $NETNS_NAME
+ modprobe -r netdevsim
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh
new file mode 100755
index 000000000..da49ad276
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/devlink_trap.sh
@@ -0,0 +1,489 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test is for checking devlink-trap functionality. It makes use of
+# netdevsim which implements the required callbacks.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ init_test
+ trap_action_test
+ trap_metadata_test
+ bad_trap_test
+ bad_trap_action_test
+ trap_stats_test
+ trap_group_action_test
+ bad_trap_group_test
+ trap_group_stats_test
+ trap_policer_test
+ trap_policer_bind_test
+ port_del_test
+ dev_del_test
+"
+NETDEVSIM_PATH=/sys/bus/netdevsim/
+DEV_ADDR=1337
+DEV=netdevsim${DEV_ADDR}
+DEVLINK_DEV=netdevsim/${DEV}
+DEBUGFS_DIR=/sys/kernel/debug/netdevsim/$DEV/
+SLEEP_TIME=1
+NETDEV=""
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+require_command udevadm
+
+# Both checks below report a skipped run, so they exit with 4, the
+# kselftest "skip" status, instead of 1, which is reported as a failure.
+modprobe netdevsim &> /dev/null
+if [ ! -d "$NETDEVSIM_PATH" ]; then
+	echo "SKIP: No netdevsim support"
+	exit 4
+fi
+
+# Refuse to reuse a device this script did not create.
+if [ -d "${NETDEVSIM_PATH}/devices/netdevsim${DEV_ADDR}" ]; then
+	echo "SKIP: Device netdevsim${DEV_ADDR} already exists"
+	exit 4
+fi
+
+# Check that the device registered at least one trap.
+init_test()
+{
+	RET=0
+
+	[ $(devlink_traps_num_get) -ne 0 ]
+	check_err $? "No traps were registered"
+
+	log_test "Initialization"
+}
+
+# For every trap, try to switch its action to "trap" and back to "drop".
+# Traps of type "drop" must follow the change; every other trap must keep
+# its original action.
+trap_action_test()
+{
+	local orig_action
+	local trap_name
+	local action
+
+	RET=0
+
+	# Operands are quoted throughout: with an empty, unquoted value "["
+	# fails with a syntax error, the "if" is treated as false and the
+	# mismatch would go unrecorded.
+	for trap_name in $(devlink_traps_get); do
+		# The action of non-drop traps cannot be changed.
+		if [ "$(devlink_trap_type_get $trap_name)" = "drop" ]; then
+			devlink_trap_action_set $trap_name "trap"
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "trap" ]; then
+				check_err 1 "Trap $trap_name did not change action to trap"
+			fi
+
+			devlink_trap_action_set $trap_name "drop"
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "drop" ]; then
+				check_err 1 "Trap $trap_name did not change action to drop"
+			fi
+		else
+			orig_action=$(devlink_trap_action_get $trap_name)
+
+			devlink_trap_action_set $trap_name "trap"
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "$orig_action" ]; then
+				check_err 1 "Trap $trap_name changed action when should not"
+			fi
+
+			devlink_trap_action_set $trap_name "drop"
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "$orig_action" ]; then
+				check_err 1 "Trap $trap_name changed action when should not"
+			fi
+		fi
+	done
+
+	log_test "Trap action"
+}
+
+# Check that every trap reports "input_port" metadata, and that the two
+# flow-action drop traps additionally report "flow_action_cookie".
+trap_metadata_test()
+{
+	local trap_name
+
+	RET=0
+
+	for trap_name in $(devlink_traps_get); do
+		devlink_trap_metadata_test $trap_name "input_port"
+		check_err $? "Input port not reported as metadata of trap $trap_name"
+
+		case $trap_name in
+		ingress_flow_action_drop|egress_flow_action_drop)
+			devlink_trap_metadata_test $trap_name "flow_action_cookie"
+			check_err $? "Flow action cookie not reported as metadata of trap $trap_name"
+			;;
+		esac
+	done
+
+	log_test "Trap metadata"
+}
+
+# Setting the action of a trap name that does not exist must fail.
+bad_trap_test()
+{
+ RET=0
+
+ devlink_trap_action_set "made_up_trap" "drop"
+ check_fail $? "Did not get an error for non-existing trap"
+
+ log_test "Non-existing trap"
+}
+
+# Setting an unknown action on an existing trap must fail.
+bad_trap_action_test()
+{
+ local traps_arr
+ local trap_name
+
+ RET=0
+
+ # Pick first trap.
+ traps_arr=($(devlink_traps_get))
+ trap_name=${traps_arr[0]}
+
+ devlink_trap_action_set $trap_name "made_up_action"
+ check_fail $? "Did not get an error for non-existing trap action"
+
+ log_test "Non-existing trap action"
+}
+
+# For every trap, check when its statistics are idle: always while the
+# netdev is down; with the netdev up, a "drop"-type trap must be idle with
+# action "drop" and not idle with action "trap", and any other trap must
+# not be idle.
+trap_stats_test()
+{
+ local trap_name
+
+ RET=0
+
+ for trap_name in $(devlink_traps_get); do
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Stats of trap $trap_name not idle when netdev down"
+
+ ip link set dev $NETDEV up
+
+ if [ $(devlink_trap_type_get $trap_name) = "drop" ]; then
+ devlink_trap_action_set $trap_name "trap"
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Stats of trap $trap_name idle when action is trap"
+
+ devlink_trap_action_set $trap_name "drop"
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Stats of trap $trap_name not idle when action is drop"
+ else
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Stats of non-drop trap $trap_name idle when should not"
+ fi
+
+ # Bring the netdev back down before the next trap is checked.
+ ip link set dev $NETDEV down
+ done
+
+ log_test "Trap statistics"
+}
+
+# For every trap group, set the group action to "trap" and then to "drop",
+# and check after each change that all "drop"-type traps belonging to the
+# group report the new action.
+trap_group_action_test()
+{
+	local curr_group group_name
+	local trap_name
+	local trap_type
+	local action
+
+	RET=0
+
+	# Operands are quoted throughout: with an empty, unquoted value "["
+	# fails with a syntax error, the "if" is treated as false, and either
+	# the group/type filter is bypassed or a mismatch goes unrecorded.
+	for group_name in $(devlink_trap_groups_get); do
+		devlink_trap_group_action_set $group_name "trap"
+
+		for trap_name in $(devlink_traps_get); do
+			curr_group=$(devlink_trap_group_get $trap_name)
+			if [ "$curr_group" != "$group_name" ]; then
+				continue
+			fi
+
+			trap_type=$(devlink_trap_type_get $trap_name)
+			if [ "$trap_type" != "drop" ]; then
+				continue
+			fi
+
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "trap" ]; then
+				check_err 1 "Trap $trap_name did not change action to trap"
+			fi
+		done
+
+		devlink_trap_group_action_set $group_name "drop"
+
+		for trap_name in $(devlink_traps_get); do
+			curr_group=$(devlink_trap_group_get $trap_name)
+			if [ "$curr_group" != "$group_name" ]; then
+				continue
+			fi
+
+			trap_type=$(devlink_trap_type_get $trap_name)
+			if [ "$trap_type" != "drop" ]; then
+				continue
+			fi
+
+			action=$(devlink_trap_action_get $trap_name)
+			if [ "$action" != "drop" ]; then
+				check_err 1 "Trap $trap_name did not change action to drop"
+			fi
+		done
+	done
+
+	log_test "Trap group action"
+}
+
+# Setting the action of a trap group that does not exist must fail.
+bad_trap_group_test()
+{
+ RET=0
+
+ devlink_trap_group_action_set "made_up_trap_group" "drop"
+ check_fail $? "Did not get an error for non-existing trap group"
+
+ log_test "Non-existing trap group"
+}
+
+# For every trap group, check that its statistics are idle while the netdev
+# is down and not idle once the netdev is up with the group action "trap".
+trap_group_stats_test()
+{
+ local group_name
+
+ RET=0
+
+ for group_name in $(devlink_trap_groups_get); do
+ devlink_trap_group_stats_idle_test $group_name
+ check_err $? "Stats of trap group $group_name not idle when netdev down"
+
+ ip link set dev $NETDEV up
+
+ devlink_trap_group_action_set $group_name "trap"
+ devlink_trap_group_stats_idle_test $group_name
+ check_fail $? "Stats of trap group $group_name idle when action is trap"
+
+ # Restore the "drop" action and the down state for the next group.
+ devlink_trap_group_action_set $group_name "drop"
+ ip link set dev $NETDEV down
+ done
+
+ log_test "Trap group statistics"
+}
+
+# Exercise trap policers: non-existing policer ids are rejected, valid rate
+# and burst values are applied, out-of-range values and driver-injected
+# failures leave the configuration untouched, and the drop counter advances
+# and can be made unreadable through debugfs.
+trap_policer_test()
+{
+	local packets_t0
+	local packets_t1
+
+	RET=0
+
+	if [ $(devlink_trap_policers_num_get) -eq 0 ]; then
+		check_err 1 "Failed to dump policers"
+	fi
+
+	devlink trap policer set $DEVLINK_DEV policer 1337 &> /dev/null
+	check_fail $? "Did not get an error for setting a non-existing policer"
+	devlink trap policer show $DEVLINK_DEV policer 1337 &> /dev/null
+	check_fail $? "Did not get an error for getting a non-existing policer"
+
+	devlink trap policer set $DEVLINK_DEV policer 1 rate 2000 burst 16
+	check_err $? "Failed to set valid parameters for a valid policer"
+	if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then
+		check_err 1 "Policer rate was not changed"
+	fi
+	if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then
+		check_err 1 "Policer burst size was not changed"
+	fi
+
+	# Each of the four requests below is expected to be rejected as out
+	# of range.
+	devlink trap policer set $DEVLINK_DEV policer 1 rate 0 &> /dev/null
+	check_fail $? "Policer rate was changed to rate lower than limit"
+	devlink trap policer set $DEVLINK_DEV policer 1 rate 9000 &> /dev/null
+	check_fail $? "Policer rate was changed to rate higher than limit"
+	devlink trap policer set $DEVLINK_DEV policer 1 burst 2 &> /dev/null
+	check_fail $? "Policer burst size was changed to burst size lower than limit"
+	# This must set "burst", not "rate": the check is about the upper
+	# burst limit, which a rate value never exercises.
+	devlink trap policer set $DEVLINK_DEV policer 1 burst 65537 &> /dev/null
+	check_fail $? "Policer burst size was changed to burst size higher than limit"
+	echo "y" > $DEBUGFS_DIR/fail_trap_policer_set
+	devlink trap policer set $DEVLINK_DEV policer 1 rate 3000 &> /dev/null
+	check_fail $? "Managed to set policer rate when should not"
+	echo "n" > $DEBUGFS_DIR/fail_trap_policer_set
+	# None of the rejected requests may have altered the policer.
+	if [ $(devlink_trap_policer_rate_get 1) -ne 2000 ]; then
+		check_err 1 "Policer rate was changed to an invalid value"
+	fi
+	if [ $(devlink_trap_policer_burst_get 1) -ne 16 ]; then
+		check_err 1 "Policer burst size was changed to an invalid value"
+	fi
+
+	# The drop counter is expected to grow between the two samples.
+	packets_t0=$(devlink_trap_policer_rx_dropped_get 1)
+	sleep .5
+	packets_t1=$(devlink_trap_policer_rx_dropped_get 1)
+	if [ ! $packets_t1 -gt $packets_t0 ]; then
+		check_err 1 "Policer drop counter was not incremented"
+	fi
+
+	echo "y"> $DEBUGFS_DIR/fail_trap_policer_counter_get
+	devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null
+	check_fail $? "Managed to read policer drop counter when should not"
+	echo "n"> $DEBUGFS_DIR/fail_trap_policer_counter_get
+	devlink -s trap policer show $DEVLINK_DEV policer 1 &> /dev/null
+	check_err $? "Did not manage to read policer drop counter when should"
+
+	log_test "Trap policer"
+}
+
+# Return the status of "jq -e" looking up the "policer" key of a trap
+# group; callers treat a non-zero status as "no policer bound".
+# $1 - trap group name.
+trap_group_check_policer()
+{
+ local group_name=$1; shift
+
+ devlink -j -p trap group show $DEVLINK_DEV group $group_name \
+ | jq -e '.[][][]["policer"]' &> /dev/null
+}
+
+# Exercise binding policers to the l2_drops trap group: bind a valid
+# policer, reject a non-existing one, unbind with ID 0 and with the
+# "nopolicer" keyword, honour the fail_trap_group_set debugfs knob, and
+# reject a request that combines a valid action with an invalid policer.
+trap_policer_bind_test()
+{
+ RET=0
+
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 1
+ check_err $? "Failed to bind a valid policer"
+ if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then
+ check_err 1 "Bound policer was not changed"
+ fi
+
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 1337 \
+ &> /dev/null
+ check_fail $? "Did not get an error for binding a non-existing policer"
+ # The failed request must leave policer 1 bound.
+ if [ $(devlink_trap_group_policer_get "l2_drops") -ne 1 ]; then
+ check_err 1 "Bound policer was changed when should not"
+ fi
+
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 0
+ check_err $? "Failed to unbind a policer when using ID 0"
+ trap_group_check_policer "l2_drops"
+ check_fail $? "Trap group has a policer after unbinding with ID 0"
+
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 1
+ check_err $? "Failed to bind a valid policer"
+
+ devlink trap group set $DEVLINK_DEV group l2_drops nopolicer
+ check_err $? "Failed to unbind a policer when using 'nopolicer' keyword"
+ trap_group_check_policer "l2_drops"
+ check_fail $? "Trap group has a policer after unbinding with 'nopolicer' keyword"
+
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 1
+ check_err $? "Failed to bind a valid policer"
+
+ echo "y"> $DEBUGFS_DIR/fail_trap_group_set
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 2 \
+ &> /dev/null
+ check_fail $? "Managed to bind a policer when should not"
+ echo "n"> $DEBUGFS_DIR/fail_trap_group_set
+ devlink trap group set $DEVLINK_DEV group l2_drops policer 2
+ check_err $? "Did not manage to bind a policer when should"
+
+ devlink trap group set $DEVLINK_DEV group l2_drops action drop \
+ policer 1337 &> /dev/null
+ check_fail $? "Did not get an error for partially modified trap group"
+
+ log_test "Trap policer binding"
+}
+
+# Ten times over: bring the netdev up, wait SLEEP_TIME, then destroy and
+# re-create the port while all traps are enabled.
+port_del_test()
+{
+	local group_name
+	local i
+
+	# The test never fails. It is meant to exercise different code paths
+	# and make sure we properly dismantle a port while packets are
+	# in-flight.
+	RET=0
+
+	devlink_traps_enable_all
+
+	for ((i = 1; i <= 10; i++)); do
+		ip link set dev $NETDEV up
+
+		sleep $SLEEP_TIME
+
+		netdevsim_port_destroy
+		netdevsim_port_create
+		udevadm settle
+	done
+
+	devlink_traps_disable_all
+
+	log_test "Port delete"
+}
+
+# Ten times over: bring the netdev up, wait SLEEP_TIME, then tear the whole
+# device down and set it up again while all traps are enabled.
+dev_del_test()
+{
+	local group_name
+	local i
+
+	# The test never fails. It is meant to exercise different code paths
+	# and make sure we properly unregister traps while packets are
+	# in-flight.
+	RET=0
+
+	devlink_traps_enable_all
+
+	for ((i = 1; i <= 10; i++)); do
+		ip link set dev $NETDEV up
+
+		sleep $SLEEP_TIME
+
+		cleanup
+		setup_prepare
+	done
+
+	devlink_traps_disable_all
+
+	log_test "Device delete"
+}
+
+# Request device $DEV_ADDR by writing "<address> 0" to new_device; the
+# second field is presumably the initial port count - verify.
+netdevsim_dev_create()
+{
+ echo "$DEV_ADDR 0" > ${NETDEVSIM_PATH}/new_device
+}
+
+# Request removal of device $DEV_ADDR by writing its address to del_device.
+netdevsim_dev_destroy()
+{
+ echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+}
+
+# Write 1 to the device's new_port file; presumably 1 is the index of the
+# port to create - verify.
+netdevsim_port_create()
+{
+ echo 1 > ${NETDEVSIM_PATH}/devices/${DEV}/new_port
+}
+
+# Write 1 to the device's del_port file, the counterpart of
+# netdevsim_port_create.
+netdevsim_port_destroy()
+{
+ echo 1 > ${NETDEVSIM_PATH}/devices/${DEV}/del_port
+}
+
+# Create the netdevsim device and one port, exiting with status 1 if either
+# does not appear in sysfs, then record the resulting netdev name in NETDEV.
+setup_prepare()
+{
+ local netdev
+
+ netdevsim_dev_create
+
+ if [ ! -d "${NETDEVSIM_PATH}/devices/${DEV}" ]; then
+ echo "Failed to create netdevsim device"
+ exit 1
+ fi
+
+ netdevsim_port_create
+
+ if [ ! -d "${NETDEVSIM_PATH}/devices/${DEV}/net/" ]; then
+ echo "Failed to create netdevsim port"
+ exit 1
+ fi
+
+ # Wait for udev to rename newly created netdev.
+ udevadm settle
+
+ # The net/ directory is listed to learn the netdev name; a single
+ # entry is assumed.
+ NETDEV=$(ls ${NETDEVSIM_PATH}/devices/${DEV}/net/)
+}
+
+# Destroy the port and then the device. Installed as the EXIT trap below
+# and also called by dev_del_test on every iteration.
+cleanup()
+{
+ pre_cleanup
+ netdevsim_port_destroy
+ netdevsim_dev_destroy
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh
new file mode 100755
index 000000000..25c896b9e
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-pause.sh
@@ -0,0 +1,108 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+NSIM_ID=$((RANDOM % 1024))
+NSIM_DEV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_ID
+NSIM_DEV_DFS=/sys/kernel/debug/netdevsim/netdevsim$NSIM_ID/ports/0
+NSIM_NETDEV=
+num_passes=0
+num_errors=0
+
+# Delete the netdevsim device, but only if its sysfs entry exists.
+function cleanup_nsim {
+ if [ -e $NSIM_DEV_SYS ]; then
+ echo $NSIM_ID > /sys/bus/netdevsim/del_device
+ fi
+}
+
+function cleanup {
+ cleanup_nsim
+}
+
+trap cleanup EXIT
+
+# Print the first interface listed in /sys/class/net that is absent from a
+# previously captured list.
+# $1 - NAME of the variable holding the old, whitespace-separated list
+#      (bound through a nameref, not passed by value).
+# Note: new, netdev and check are not declared local and stay set in the
+# caller's scope.
+function get_netdev_name {
+ local -n old=$1
+
+ new=$(ls /sys/class/net)
+
+ for netdev in $new; do
+ for check in $old; do
+ [ $netdev == $check ] && break
+ done
+
+ # check still equals netdev only if the inner loop broke on a match;
+ # otherwise it holds the last old name, so netdev is a new interface.
+ if [ $netdev != $check ]; then
+ echo $netdev
+ break
+ fi
+ done
+}
+
+# Record the outcome of one check in num_errors / num_passes.
+# $1 - exit status of the command under test; non-zero counts as an error
+# $2 - string the command produced
+# $3 - string that was expected; a mismatch is printed and counts as an error
+function check {
+	local code=$1
+	local str=$2
+	local exp_str=$3
+
+	if [ $code -ne 0 ]; then
+		((num_errors++))
+	elif [ "$str" != "$exp_str" ]; then
+		echo -e "Expected: '$exp_str', got '$str'"
+		((num_errors++))
+	else
+		((num_passes++))
+	fi
+}
+
+# Bail if ethtool is too old
+# Redirections apply left to right, so stdout has to be pointed at
+# /dev/null before stderr is duplicated onto it; the reverse order leaves
+# stderr attached to the terminal.
+if ! ethtool -h | grep include-stat >/dev/null 2>&1; then
+	echo "SKIP: No --include-statistics support in ethtool"
+	exit 4
+fi
+
+# Make a netdevsim
+# The interface list is captured first so that get_netdev_name can tell
+# which interface the new device added.
+old_netdevs=$(ls /sys/class/net)
+
+modprobe netdevsim
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+
+# The variable NAME is passed; get_netdev_name binds it through a nameref.
+NSIM_NETDEV=`get_netdev_name old_netdevs`
+
+# From here on a failing ethtool makes the whole "ethtool | jq" pipeline
+# fail, so the status handed to check reflects it.
+set -o pipefail
+
+echo n > $NSIM_DEV_DFS/ethtool/pause/report_stats_tx
+echo n > $NSIM_DEV_DFS/ethtool/pause/report_stats_rx
+
+# Without -I no statistics object is expected at all; with -I and both
+# report_stats knobs off it is expected to be empty.
+s=$(ethtool --json -a $NSIM_NETDEV | jq '.[].statistics')
+check $? "$s" "null"
+
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics')
+check $? "$s" "{}"
+
+echo y > $NSIM_DEV_DFS/ethtool/pause/report_stats_tx
+
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics | length')
+check $? "$s" "1"
+
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics.tx_pause_frames')
+check $? "$s" "2"
+
+echo y > $NSIM_DEV_DFS/ethtool/pause/report_stats_rx
+
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics | length')
+check $? "$s" "2"
+
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics.rx_pause_frames')
+check $? "$s" "1"
+s=$(ethtool -I --json -a $NSIM_NETDEV | jq '.[].statistics.tx_pause_frames')
+check $? "$s" "2"
+
+# Summarize and exit 0 only when no check failed.
+if [ $num_errors -eq 0 ]; then
+ echo "PASSED all $((num_passes)) checks"
+ exit 0
+else
+ echo "FAILED $num_errors/$((num_errors+num_passes)) checks"
+ exit 1
+fi
diff --git a/tools/testing/selftests/drivers/net/netdevsim/fib.sh b/tools/testing/selftests/drivers/net/netdevsim/fib.sh
new file mode 100755
index 000000000..2f87c3be7
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/fib.sh
@@ -0,0 +1,341 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test is for checking the FIB offload API. It makes use of netdevsim
+# which registers a listener to the FIB notification chain.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ ipv4_identical_routes
+ ipv4_tos
+ ipv4_metric
+ ipv4_replace
+ ipv4_delete
+ ipv4_plen
+ ipv4_replay
+ ipv4_flush
+ ipv4_error_path
+ ipv6_add
+ ipv6_metric
+ ipv6_append_single
+ ipv6_replace_single
+ ipv6_metric_multipath
+ ipv6_append_multipath
+ ipv6_replace_multipath
+ ipv6_append_multipath_to_single
+ ipv6_delete_single
+ ipv6_delete_multipath
+ ipv6_replay_single
+ ipv6_replay_multipath
+ ipv6_error_path
+"
+NETDEVSIM_PATH=/sys/bus/netdevsim/
+DEV_ADDR=1337
+DEV=netdevsim${DEV_ADDR}
+DEVLINK_DEV=netdevsim/${DEV}
+SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+source $lib_dir/fib_offload_lib.sh
+
+ipv4_identical_routes()
+{
+ fib_ipv4_identical_routes_test "testns1"
+}
+
+ipv4_tos()
+{
+ fib_ipv4_tos_test "testns1"
+}
+
+ipv4_metric()
+{
+ fib_ipv4_metric_test "testns1"
+}
+
+ipv4_replace()
+{
+ fib_ipv4_replace_test "testns1"
+}
+
+ipv4_delete()
+{
+ fib_ipv4_delete_test "testns1"
+}
+
+ipv4_plen()
+{
+ fib_ipv4_plen_test "testns1"
+}
+
+ipv4_replay_metric()
+{
+ fib_ipv4_replay_metric_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv4_replay_tos()
+{
+ fib_ipv4_replay_tos_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv4_replay_plen()
+{
+ fib_ipv4_replay_plen_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv4_replay()
+{
+ ipv4_replay_metric
+ ipv4_replay_tos
+ ipv4_replay_plen
+}
+
+ipv4_flush()
+{
+ fib_ipv4_flush_test "testns1"
+}
+
+ipv4_error_path_add()
+{
+ local lsb
+ # Set the IPv4/fib resource size to 10, then attempt 20 host-route additions.
+ RET=0
+
+ ip -n testns1 link add name dummy1 type dummy
+ ip -n testns1 link set dev dummy1 up
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 10
+ devlink -N testns1 dev reload $DEVLINK_DEV
+ # Output of the route adds is discarded and their exit status is not checked.
+ for lsb in $(seq 1 20); do
+ ip -n testns1 route add 192.0.2.${lsb}/32 dev dummy1 \
+ &> /dev/null
+ done
+ # NOTE(review): RET is never changed above; presumably passing only means the failed adds were survived - confirm.
+ log_test "IPv4 error path - add"
+
+ ip -n testns1 link del dev dummy1
+}
+
+ipv4_error_path_replay()
+{
+ local lsb
+ # Add 20 routes with the IPv4/fib size at 100, then reload with the size cut to 10.
+ RET=0
+
+ ip -n testns1 link add name dummy1 type dummy
+ ip -n testns1 link set dev dummy1 up
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 100
+ devlink -N testns1 dev reload $DEVLINK_DEV
+
+ for lsb in $(seq 1 20); do
+ ip -n testns1 route add 192.0.2.${lsb}/32 dev dummy1
+ done
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 10
+ devlink -N testns1 dev reload $DEVLINK_DEV &> /dev/null # output hidden; status not checked
+
+ log_test "IPv4 error path - replay"
+
+ ip -n testns1 link del dev dummy1
+
+ # Successfully reload after deleting all the routes.
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv4/fib size 100
+ devlink -N testns1 dev reload $DEVLINK_DEV
+}
+
+ipv4_error_path()
+{
+ # Test the different error paths of the notifiers by limiting the size
+ # of the "IPv4/fib" resource.
+ ipv4_error_path_add
+ ipv4_error_path_replay
+}
+
+ipv6_add()
+{
+ fib_ipv6_add_test "testns1"
+}
+
+ipv6_metric()
+{
+ fib_ipv6_metric_test "testns1"
+}
+
+ipv6_append_single()
+{
+ fib_ipv6_append_single_test "testns1"
+}
+
+ipv6_replace_single()
+{
+ fib_ipv6_replace_single_test "testns1"
+}
+
+ipv6_metric_multipath()
+{
+ fib_ipv6_metric_multipath_test "testns1"
+}
+
+ipv6_append_multipath()
+{
+ fib_ipv6_append_multipath_test "testns1"
+}
+
+ipv6_replace_multipath()
+{
+ fib_ipv6_replace_multipath_test "testns1"
+}
+
+ipv6_append_multipath_to_single()
+{
+ fib_ipv6_append_multipath_to_single_test "testns1"
+}
+
+ipv6_delete_single()
+{
+ fib_ipv6_delete_single_test "testns1"
+}
+
+ipv6_delete_multipath()
+{
+ fib_ipv6_delete_multipath_test "testns1"
+}
+
+ipv6_replay_single()
+{
+ fib_ipv6_replay_single_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv6_replay_multipath()
+{
+ fib_ipv6_replay_multipath_test "testns1" "$DEVLINK_DEV"
+}
+
+ipv6_error_path_add_single()
+{
+ local lsb
+ # Set the IPv6/fib resource size to 10, then attempt 20 single-nexthop /128 route additions.
+ RET=0
+
+ ip -n testns1 link add name dummy1 type dummy
+ ip -n testns1 link set dev dummy1 up
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10
+ devlink -N testns1 dev reload $DEVLINK_DEV
+ # Output of the route adds is discarded and their exit status is not checked.
+ for lsb in $(seq 1 20); do
+ ip -n testns1 route add 2001:db8:1::${lsb}/128 dev dummy1 \
+ &> /dev/null
+ done
+ # NOTE(review): RET is never changed above; presumably passing only means the failed adds were survived - confirm.
+ log_test "IPv6 error path - add single"
+
+ ip -n testns1 link del dev dummy1
+}
+
+ipv6_error_path_add_multipath()
+{
+ local lsb i
+ # Set the IPv6/fib resource size to 10, then attempt 20 two-nexthop /128 route additions.
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n testns1 link add name dummy$i type dummy
+ ip -n testns1 link set dev dummy$i up
+ ip -n testns1 address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10
+ devlink -N testns1 dev reload $DEVLINK_DEV
+ # Output of the route adds is discarded and their exit status is not checked.
+ for lsb in $(seq 1 20); do
+ ip -n testns1 route add 2001:db8:10::${lsb}/128 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2 &> /dev/null
+ done
+
+ log_test "IPv6 error path - add multipath"
+ # Remove both dummy devices created at the top.
+ for i in $(seq 1 2); do
+ ip -n testns1 link del dev dummy$i
+ done
+}
+
+ipv6_error_path_replay()
+{
+ local lsb
+ # Add 20 routes with the IPv6/fib size at 100, then reload with the size cut to 10.
+ RET=0
+
+ ip -n testns1 link add name dummy1 type dummy
+ ip -n testns1 link set dev dummy1 up
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 100
+ devlink -N testns1 dev reload $DEVLINK_DEV
+
+ for lsb in $(seq 1 20); do
+ ip -n testns1 route add 2001:db8:1::${lsb}/128 dev dummy1
+ done
+
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 10
+ devlink -N testns1 dev reload $DEVLINK_DEV &> /dev/null # output hidden; status not checked
+
+ log_test "IPv6 error path - replay"
+
+ ip -n testns1 link del dev dummy1
+
+ # Successfully reload after deleting all the routes.
+ devlink -N testns1 resource set $DEVLINK_DEV path IPv6/fib size 100
+ devlink -N testns1 dev reload $DEVLINK_DEV
+}
+
+ipv6_error_path()
+{
+ # Test the different error paths of the notifiers by limiting the size
+ # of the "IPv6/fib" resource.
+ ipv6_error_path_add_single
+ ipv6_error_path_add_multipath
+ ipv6_error_path_replay
+}
+
+setup_prepare()
+{
+ # Create the netdevsim device and reload it into the "testns1" netns.
+ # Exits with status 1 if the netns cannot be added or the reload fails.
+
+ modprobe netdevsim &> /dev/null
+
+ echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+ # Spin until the device's net/ directory appears in sysfs.
+ while [ ! -d $SYSFS_NET_DIR ] ; do :; done
+
+ if ! ip netns add testns1; then
+ echo "Failed to add netns \"testns1\""
+ exit 1
+ fi
+
+ if ! devlink dev reload $DEVLINK_DEV netns testns1; then
+ echo "Failed to reload into netns \"testns1\""
+ exit 1
+ fi
+}
+
+cleanup()
+{
+ pre_cleanup
+ ip netns del testns1
+ echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+ modprobe -r netdevsim &> /dev/null
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
new file mode 100755
index 000000000..1b08e042c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
@@ -0,0 +1,953 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-only
+
+VNI_GEN=$RANDOM
+NSIM_ID=$((RANDOM % 1024))
+NSIM_DEV_SYS=/sys/bus/netdevsim/devices/netdevsim$NSIM_ID
+NSIM_DEV_DFS=/sys/kernel/debug/netdevsim/netdevsim$NSIM_ID
+NSIM_NETDEV=
+HAS_ETHTOOL=
+STATIC_ENTRIES=
+EXIT_STATUS=0
+num_cases=0
+num_errors=0
+
+clean_up_devs=( )
+
+function err_cnt {
+ echo "ERROR:" "$@" # quoted so the message is not word-split or glob-expanded
+ EXIT_STATUS=1 # the script's final exit status becomes failure
+ ((num_errors++))
+ ((num_cases++)) # a failed check still counts as an executed case
+}
+
+function pass_cnt {
+ ((num_cases++))
+}
+
+function cleanup_tuns {
+ for dev in "${clean_up_devs[@]}"; do
+ [ -e /sys/class/net/$dev ] && ip link del dev $dev
+ done
+ clean_up_devs=( )
+}
+
+function cleanup_nsim {
+ if [ -e $NSIM_DEV_SYS ]; then
+ echo $NSIM_ID > /sys/bus/netdevsim/del_device
+ fi
+}
+
+function cleanup {
+ cleanup_tuns
+ cleanup_nsim
+}
+
+trap cleanup EXIT
+
+function new_vxlan {
+ local dev=$1 # name of the VxLAN netdev to create
+ local dstport=$2 # value passed to "dstport"
+ local lower=$3 # netdev passed to "dev" (the lower device)
+ local ipver=$4 # '6' selects IPv6; any other value (or none) selects IPv4
+ local flags=$5 # extra "ip link add" arguments, may be empty
+
+ local group ipfl
+
+ [ "$ipver" != '6' ] && group=239.1.1.1 || group=fff1::1
+ [ "$ipver" != '6' ] || ipfl="-6"
+ # Unless the flags mention "external", append a fresh id taken from VNI_GEN.
+ [[ ! "$flags" =~ "external" ]] && flags="$flags id $((VNI_GEN++))"
+
+ ip $ipfl link add $dev type vxlan \
+ group $group \
+ dev $lower \
+ dstport $dstport \
+ $flags
+
+ ip link set dev $dev up
+
+ clean_up_devs=("${clean_up_devs[@]}" $dev) # remembered so cleanup_tuns can delete it
+ # Verify the port tables against the caller's current exp0/exp1.
+ check_tables
+}
+
+function new_geneve {
+ local dev=$1 # name of the GENEVE netdev to create
+ local dstport=$2 # value passed to "dstport"
+ local ipver=$3 # '6' selects IPv6; any other value (or none) selects IPv4
+ local flags=$4 # extra "ip link add" arguments, may be empty
+
+ local remote ipfl
+ # Pick the remote address for the requested IP version (was assigned to "group" for v6).
+ [ "$ipver" != '6' ] && remote=1.1.1.2 || remote=::2
+ [ "$ipver" != '6' ] || ipfl="-6"
+ # Unless the flags mention "external", append a fresh vni taken from VNI_GEN.
+ [[ ! "$flags" =~ "external" ]] && flags="$flags vni $((VNI_GEN++))"
+
+ ip $ipfl link add $dev type geneve \
+ remote $remote \
+ dstport $dstport \
+ $flags
+
+ ip link set dev $dev up
+
+ clean_up_devs=("${clean_up_devs[@]}" $dev) # remembered so cleanup_tuns can delete it
+ # Verify the port tables against the caller's current exp0/exp1.
+ check_tables
+}
+
+function del_dev {
+ local dev=$1
+
+ ip link del dev $dev
+ check_tables
+}
+
+# Helpers for netdevsim port/type encoding
+function mke {
+ local port=$1
+ local type=$2
+
+ echo $((port << 16 | type))
+}
+
+function pre {
+ local val=$1
+
+ echo -e "port: $((val >> 16))\ttype: $((val & 0xffff))"
+}
+
+function pre_ethtool {
+ local val=$1 # encoded entry in the form produced by mke: port << 16 | type
+ local port=$((val >> 16))
+ local type=$((val & 0xffff))
+ local type_name
+ case $type in
+ 1)
+ type_name="vxlan"
+ ;;
+ 2)
+ type_name="geneve"
+ ;;
+ 4)
+ type_name="vxlan-gpe"
+ ;;
+ *)
+ type_name="bit X"
+ ;;
+ esac
+ # Print the entry in the form that callers grep for in "ethtool --show-tunnels" output.
+ echo "port $port, $type_name"
+}
+
+function check_table {
+ local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1 # $1: table index; $port is a global
+ local -n expected=$2 # nameref: $2 is the NAME of the array of expected entries
+ local last=$3 # non-zero: final attempt, report mismatches instead of returning 1
+ # Compare one debugfs port table with the expected entries; returns 1 only to request a retry.
+ read -a have < $path
+
+ if [ ${#expected[@]} -ne ${#have[@]} ]; then
+ echo "check_table: BAD NUMBER OF ITEMS"
+ return 0 # neither err_cnt nor pass_cnt is called on this path
+ fi
+
+ for i in "${!expected[@]}"; do
+ if [ -n "$HAS_ETHTOOL" -a ${expected[i]} -ne 0 ]; then
+ pp_expected=`pre_ethtool ${expected[i]}`
+ ethtool --show-tunnels $NSIM_NETDEV | grep "$pp_expected" >/dev/null
+ if [ $? -ne 0 -a $last -ne 0 ]; then # a miss before the final attempt is ignored here
+ err_cnt "ethtool table $1 on port $port: $pfx - $msg"
+ echo " check_table: ethtool does not contain '$pp_expected'"
+ ethtool --show-tunnels $NSIM_NETDEV
+ return 0
+
+ fi
+ fi
+
+ if [ ${expected[i]} != ${have[i]} ]; then
+ if [ $last -ne 0 ]; then
+ err_cnt "table $1 on port $port: $pfx - $msg"
+ echo " check_table: wrong entry $i"
+ echo " expected: `pre ${expected[i]}`"
+ echo " have: `pre ${have[i]}`"
+ return 0 # error already counted; stop the caller's retry loop
+ fi
+ return 1 # not the final attempt: let the caller retry
+ fi
+ done
+
+ pass_cnt
+ return 0
+}
+
+function check_tables {
+ # Need retries in case a workqueue is still applying the changes
+ local retries=10
+ # Both loops share one retry budget; the third argument turns 1 once it reaches 0.
+ while ! check_table 0 exp0 $((retries == 0)); do
+ sleep 0.02
+ ((retries--))
+ done
+ while ! check_table 1 exp1 $((retries == 0)); do
+ sleep 0.02
+ ((retries--))
+ done
+ # With ethtool support, each static entry must be listed exactly once.
+ if [ -n "$HAS_ETHTOOL" -a -n "${STATIC_ENTRIES[0]}" ]; then
+ fail=0
+ for i in "${!STATIC_ENTRIES[@]}"; do
+ pp_expected=`pre_ethtool ${STATIC_ENTRIES[i]}`
+ cnt=$(ethtool --show-tunnels $NSIM_NETDEV | grep -c "$pp_expected")
+ if [ $cnt -ne 1 ]; then
+ err_cnt "ethtool static entry: $pfx - $msg"
+ echo " check_table: ethtool does not contain '$pp_expected'"
+ ethtool --show-tunnels $NSIM_NETDEV
+ fail=1
+ fi
+ done
+ [ $fail == 0 ] && pass_cnt # one pass for the whole static-entry check
+ fi
+}
+
+function print_table {
+ local path=$NSIM_DEV_DFS/ports/$port/udp_ports_table$1
+ read -a have < $path
+
+ tree $NSIM_DEV_DFS/
+
+ echo "Port $port table $1:"
+
+ for i in "${!have[@]}"; do
+ echo " `pre ${have[i]}`"
+ done
+
+}
+
+function print_tables {
+ print_table 0
+ print_table 1
+}
+
+function get_netdev_name {
+ local -n old=$1 # nameref: $1 is the NAME of the variable holding the earlier listing
+ # Print the first entry of /sys/class/net that is absent from the earlier listing.
+ new=$(ls /sys/class/net)
+
+ for netdev in $new; do
+ for check in $old; do
+ [ $netdev == $check ] && break
+ done
+ # After the inner loop, check equals netdev only if a match was found.
+ if [ $netdev != $check ]; then
+ echo $netdev
+ break
+ fi
+ done
+}
+
+###
+### Code start
+###
+
+# Probe ethtool support: HAS_ETHTOOL=y when the help text mentions show-tunnels
+ethtool -h | grep show-tunnels >/dev/null 2>&1 && HAS_ETHTOOL=y
+
+modprobe netdevsim
+
+# Basic test
+pfx="basic"
+
+for port in 0 1; do
+ old_netdevs=$(ls /sys/class/net)
+ if [ $port -eq 0 ]; then
+ echo $NSIM_ID > /sys/bus/netdevsim/new_device
+ else
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ echo 1 > $NSIM_DEV_SYS/new_port
+ fi
+ NSIM_NETDEV=`get_netdev_name old_netdevs`
+
+ msg="new NIC device created"
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+ check_tables
+
+ msg="VxLAN v4 devices"
+ exp0=( `mke 4789 1` 0 0 0 )
+ new_vxlan vxlan0 4789 $NSIM_NETDEV
+ new_vxlan vxlan1 4789 $NSIM_NETDEV
+
+ msg="VxLAN v4 devices go down"
+ exp0=( 0 0 0 0 )
+ ifconfig vxlan1 down
+ ifconfig vxlan0 down
+ check_tables
+
+ msg="VxLAN v6 devices"
+ exp0=( `mke 4789 1` 0 0 0 )
+ new_vxlan vxlanA 4789 $NSIM_NETDEV 6
+
+ for ifc in vxlan0 vxlan1; do
+ ifconfig $ifc up
+ done
+
+ new_vxlan vxlanB 4789 $NSIM_NETDEV 6
+
+ msg="another VxLAN v6 devices"
+ exp0=( `mke 4789 1` `mke 4790 1` 0 0 )
+ new_vxlan vxlanC 4790 $NSIM_NETDEV 6
+
+ msg="Geneve device"
+ exp1=( `mke 6081 2` 0 0 0 )
+ new_geneve gnv0 6081
+
+ msg="NIC device goes down"
+ ifconfig $NSIM_NETDEV down
+ if [ $port -eq 1 ]; then
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+ fi
+ check_tables
+ msg="NIC device goes up again"
+ ifconfig $NSIM_NETDEV up
+ exp0=( `mke 4789 1` `mke 4790 1` 0 0 )
+ exp1=( `mke 6081 2` 0 0 0 )
+ check_tables
+
+ cleanup_tuns
+
+ msg="tunnels destroyed"
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+ check_tables
+
+ modprobe -r geneve
+ modprobe -r vxlan
+ modprobe -r udp_tunnel
+
+ check_tables
+done
+
+modprobe -r netdevsim
+
+# Module tests
+pfx="module tests"
+
+if modinfo netdevsim | grep udp_tunnel >/dev/null; then
+ err_cnt "netdevsim depends on udp_tunnel"
+else
+ pass_cnt
+fi
+
+modprobe netdevsim
+
+old_netdevs=$(ls /sys/class/net)
+port=0
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+echo 1000 > $NSIM_DEV_DFS/udp_ports_sleep
+echo 0 > $NSIM_DEV_SYS/new_port
+NSIM_NETDEV=`get_netdev_name old_netdevs`
+
+msg="create VxLANs"
+exp0=( 0 0 0 0 ) # sleep is longer than our wait
+new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+modprobe -r vxlan
+modprobe -r udp_tunnel
+
+msg="remove tunnels"
+exp0=( 0 0 0 0 )
+check_tables
+
+msg="create VxLANs"
+exp0=( 0 0 0 0 ) # sleep is longer than our wait
+new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+exp0=( 0 0 0 0 )
+
+modprobe -r netdevsim
+modprobe netdevsim
+
+# Overflow the table
+
+function overflow_table0 {
+ local pfx=$1 # shadows the global pfx; check_table reads it when reporting errors
+ # Add five VxLAN ports one by one; msg and exp0 are globals consumed by check_tables.
+ msg="create VxLANs 1/5"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ msg="create VxLANs 2/5"
+ exp0=( `mke 10000 1` `mke 10001 1` 0 0 )
+ new_vxlan vxlan1 10001 $NSIM_NETDEV
+
+ msg="create VxLANs 3/5"
+ exp0=( `mke 10000 1` `mke 10001 1` `mke 10002 1` 0 )
+ new_vxlan vxlan2 10002 $NSIM_NETDEV
+
+ msg="create VxLANs 4/5"
+ exp0=( `mke 10000 1` `mke 10001 1` `mke 10002 1` `mke 10003 1` )
+ new_vxlan vxlan3 10003 $NSIM_NETDEV
+ # exp0 is left unchanged: the fifth port is expected not to alter the full table.
+ msg="create VxLANs 5/5"
+ new_vxlan vxlan4 10004 $NSIM_NETDEV
+}
+
+function overflow_table1 {
+ local pfx=$1 # shadows the global pfx; check_table reads it when reporting errors
+ # Add five GENEVE ports one by one; msg and exp1 are globals consumed by check_tables.
+ msg="create GENEVE 1/5"
+ exp1=( `mke 20000 2` 0 0 0 )
+ new_geneve gnv0 20000
+
+ msg="create GENEVE 2/5"
+ exp1=( `mke 20000 2` `mke 20001 2` 0 0 )
+ new_geneve gnv1 20001
+
+ msg="create GENEVE 3/5"
+ exp1=( `mke 20000 2` `mke 20001 2` `mke 20002 2` 0 )
+ new_geneve gnv2 20002
+
+ msg="create GENEVE 4/5"
+ exp1=( `mke 20000 2` `mke 20001 2` `mke 20002 2` `mke 20003 2` )
+ new_geneve gnv3 20003
+ # exp1 is left unchanged: the fifth port is expected not to alter the full table.
+ msg="create GENEVE 5/5"
+ new_geneve gnv4 20004
+}
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+ overflow_table1 "overflow NIC table"
+
+ msg="replace VxLAN in overflow table"
+ exp0=( `mke 10000 1` `mke 10004 1` `mke 10002 1` `mke 10003 1` )
+ del_dev vxlan1
+
+ msg="vacate VxLAN in overflow table"
+ exp0=( `mke 10000 1` `mke 10004 1` 0 `mke 10003 1` )
+ del_dev vxlan2
+
+ msg="replace GENEVE in overflow table"
+ exp1=( `mke 20000 2` `mke 20004 2` `mke 20002 2` `mke 20003 2` )
+ del_dev gnv1
+
+ msg="vacate GENEVE in overflow table"
+ exp1=( `mke 20000 2` `mke 20004 2` 0 `mke 20003 2` )
+ del_dev gnv2
+
+ msg="table sharing - share"
+ exp1=( `mke 20000 2` `mke 20004 2` `mke 30001 4` `mke 20003 2` )
+ new_vxlan vxlanG0 30001 $NSIM_NETDEV 4 "gpe external"
+
+ msg="table sharing - overflow"
+ new_vxlan vxlanG1 30002 $NSIM_NETDEV 4 "gpe external"
+ msg="table sharing - overflow v6"
+ new_vxlan vxlanG2 30002 $NSIM_NETDEV 6 "gpe external"
+
+ exp1=( `mke 20000 2` `mke 30002 4` `mke 30001 4` `mke 20003 2` )
+ del_dev gnv4
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# Sync all
+pfx="sync all"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+echo 1 > $NSIM_DEV_DFS/udp_ports_sync_all
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+ overflow_table1 "overflow NIC table"
+
+ msg="replace VxLAN in overflow table"
+ exp0=( `mke 10000 1` `mke 10004 1` `mke 10002 1` `mke 10003 1` )
+ del_dev vxlan1
+
+ msg="vacate VxLAN in overflow table"
+ exp0=( `mke 10000 1` `mke 10004 1` 0 `mke 10003 1` )
+ del_dev vxlan2
+
+ msg="replace GENEVE in overflow table"
+ exp1=( `mke 20000 2` `mke 20004 2` `mke 20002 2` `mke 20003 2` )
+ del_dev gnv1
+
+ msg="vacate GENEVE in overflow table"
+ exp1=( `mke 20000 2` `mke 20004 2` 0 `mke 20003 2` )
+ del_dev gnv2
+
+ msg="table sharing - share"
+ exp1=( `mke 20000 2` `mke 20004 2` `mke 30001 4` `mke 20003 2` )
+ new_vxlan vxlanG0 30001 $NSIM_NETDEV 4 "gpe external"
+
+ msg="table sharing - overflow"
+ new_vxlan vxlanG1 30002 $NSIM_NETDEV 4 "gpe external"
+ msg="table sharing - overflow v6"
+ new_vxlan vxlanG2 30002 $NSIM_NETDEV 6 "gpe external"
+
+ exp1=( `mke 20000 2` `mke 30002 4` `mke 30001 4` `mke 20003 2` )
+ del_dev gnv4
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# Destroy full NIC
+pfx="destroy full"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "destroy NIC"
+ overflow_table1 "destroy NIC"
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# IPv4 only
+pfx="IPv4 only"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+echo 1 > $NSIM_DEV_DFS/udp_ports_ipv4_only
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+ new_vxlan vxlanA0 10000 $NSIM_NETDEV 6
+
+ msg="create VxLANs v6"
+ new_vxlan vxlanA1 10000 $NSIM_NETDEV 6
+
+ ip link set dev vxlanA0 down
+ ip link set dev vxlanA0 up
+ check_tables
+
+ msg="create VxLANs v4"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ msg="down VxLANs v4"
+ exp0=( 0 0 0 0 )
+ ip link set dev vxlan0 down
+ check_tables
+
+ msg="up VxLANs v4"
+ exp0=( `mke 10000 1` 0 0 0 )
+ ip link set dev vxlan0 up
+ check_tables
+
+ msg="destroy VxLANs v4"
+ exp0=( 0 0 0 0 )
+ del_dev vxlan0
+
+ msg="recreate VxLANs v4"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ del_dev vxlanA0
+ del_dev vxlanA1
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# Failures
+pfx="error injection"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error
+
+ msg="1 - create VxLANs v6"
+ exp0=( 0 0 0 0 )
+ new_vxlan vxlanA0 10000 $NSIM_NETDEV 6
+
+ msg="1 - create VxLANs v4"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ msg="1 - remove VxLANs v4"
+ del_dev vxlan0
+
+ msg="1 - remove VxLANs v6"
+ exp0=( 0 0 0 0 )
+ del_dev vxlanA0
+
+ msg="2 - create GENEVE"
+ exp1=( `mke 20000 2` 0 0 0 )
+ new_geneve gnv0 20000
+
+ msg="2 - destroy GENEVE"
+ echo 2 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error
+ exp1=( `mke 20000 2` 0 0 0 )
+ del_dev gnv0
+
+ msg="2 - create second GENEVE"
+ exp1=( 0 `mke 20001 2` 0 0 )
+ new_geneve gnv0 20001
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# netdev flags
+pfx="netdev flags"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlanA0 10000 $NSIM_NETDEV 6
+
+ msg="create VxLANs v4"
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ msg="turn off"
+ exp0=( 0 0 0 0 )
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload off
+ check_tables
+
+ msg="turn on"
+ exp0=( `mke 10000 1` 0 0 0 )
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload on
+ check_tables
+
+ msg="remove both"
+ del_dev vxlanA0
+ exp0=( 0 0 0 0 )
+ del_dev vxlan0
+ check_tables
+
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload off
+
+ msg="create VxLANs v4 - off"
+ exp0=( 0 0 0 0 )
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ msg="created off - turn on"
+ exp0=( `mke 10000 1` 0 0 0 )
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload on
+ check_tables
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# device initiated reset
+pfx="reset notification"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+for port in 0 1; do
+ if [ $port -ne 0 ]; then
+ echo 1 > $NSIM_DEV_DFS/udp_ports_open_only
+ echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+ exp0=( `mke 10000 1` 0 0 0 )
+ new_vxlan vxlanA0 10000 $NSIM_NETDEV 6
+
+ msg="create VxLANs v4"
+ new_vxlan vxlan0 10000 $NSIM_NETDEV
+
+ echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset
+ check_tables
+
+ msg="NIC device goes down"
+ ifconfig $NSIM_NETDEV down
+ if [ $port -eq 1 ]; then
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+ fi
+ check_tables
+
+ echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset
+ check_tables
+
+ msg="NIC device goes up again"
+ ifconfig $NSIM_NETDEV up
+ exp0=( `mke 10000 1` 0 0 0 )
+ check_tables
+
+ msg="remove both"
+ del_dev vxlanA0
+ exp0=( 0 0 0 0 )
+ del_dev vxlan0
+ check_tables
+
+ echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset
+ check_tables
+
+ msg="destroy NIC"
+ echo $port > $NSIM_DEV_SYS/del_port
+
+ cleanup_tuns
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+done
+
+cleanup_nsim
+
+# shared port tables
+pfx="table sharing"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+echo 0 > $NSIM_DEV_DFS/udp_ports_open_only
+echo 1 > $NSIM_DEV_DFS/udp_ports_sleep
+echo 1 > $NSIM_DEV_DFS/udp_ports_shared
+
+old_netdevs=$(ls /sys/class/net)
+echo 1 > $NSIM_DEV_SYS/new_port
+NSIM_NETDEV=`get_netdev_name old_netdevs`
+old_netdevs=$(ls /sys/class/net)
+echo 2 > $NSIM_DEV_SYS/new_port
+NSIM_NETDEV2=`get_netdev_name old_netdevs`
+
+msg="VxLAN v4 devices"
+exp0=( `mke 4789 1` 0 0 0 )
+exp1=( 0 0 0 0 )
+new_vxlan vxlan0 4789 $NSIM_NETDEV
+new_vxlan vxlan1 4789 $NSIM_NETDEV2
+
+msg="VxLAN v4 devices go down"
+exp0=( 0 0 0 0 )
+ifconfig vxlan1 down
+ifconfig vxlan0 down
+check_tables
+
+for ifc in vxlan0 vxlan1; do
+ ifconfig $ifc up
+done
+
+msg="VxLAN v6 device"
+exp0=( `mke 4789 1` `mke 4790 1` 0 0 )
+new_vxlan vxlanC 4790 $NSIM_NETDEV 6
+
+msg="Geneve device"
+exp1=( `mke 6081 2` 0 0 0 )
+new_geneve gnv0 6081
+
+msg="NIC device goes down"
+ifconfig $NSIM_NETDEV down
+check_tables
+
+msg="NIC device goes up again"
+ifconfig $NSIM_NETDEV up
+check_tables
+
+for i in `seq 2`; do
+ msg="turn feature off - 1, rep $i"
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload off
+ check_tables
+
+ msg="turn feature off - 2, rep $i"
+ exp0=( 0 0 0 0 )
+ exp1=( 0 0 0 0 )
+ ethtool -K $NSIM_NETDEV2 rx-udp_tunnel-port-offload off
+ check_tables
+
+ msg="turn feature on - 1, rep $i"
+ exp0=( `mke 4789 1` `mke 4790 1` 0 0 )
+ exp1=( `mke 6081 2` 0 0 0 )
+ ethtool -K $NSIM_NETDEV rx-udp_tunnel-port-offload on
+ check_tables
+
+ msg="turn feature on - 2, rep $i"
+ ethtool -K $NSIM_NETDEV2 rx-udp_tunnel-port-offload on
+ check_tables
+done
+
+msg="tunnels destroyed 1"
+cleanup_tuns
+exp0=( 0 0 0 0 )
+exp1=( 0 0 0 0 )
+check_tables
+
+overflow_table0 "overflow NIC table"
+
+msg="re-add a port"
+
+echo 2 > $NSIM_DEV_SYS/del_port
+echo 2 > $NSIM_DEV_SYS/new_port
+check_tables
+
+msg="replace VxLAN in overflow table"
+exp0=( `mke 10000 1` `mke 10004 1` `mke 10002 1` `mke 10003 1` )
+del_dev vxlan1
+
+msg="vacate VxLAN in overflow table"
+exp0=( `mke 10000 1` `mke 10004 1` 0 `mke 10003 1` )
+del_dev vxlan2
+
+echo 1 > $NSIM_DEV_DFS/ports/$port/udp_ports_reset
+check_tables
+
+msg="tunnels destroyed 2"
+cleanup_tuns
+exp0=( 0 0 0 0 )
+exp1=( 0 0 0 0 )
+check_tables
+
+echo 1 > $NSIM_DEV_SYS/del_port
+echo 2 > $NSIM_DEV_SYS/del_port
+
+cleanup_nsim
+
+# Static IANA port
+pfx="static IANA vxlan"
+
+echo $NSIM_ID > /sys/bus/netdevsim/new_device
+echo 0 > $NSIM_DEV_SYS/del_port
+
+echo 1 > $NSIM_DEV_DFS/udp_ports_static_iana_vxlan
+STATIC_ENTRIES=( `mke 4789 1` )
+
+port=1
+old_netdevs=$(ls /sys/class/net)
+echo $port > $NSIM_DEV_SYS/new_port
+NSIM_NETDEV=`get_netdev_name old_netdevs`
+
+msg="check empty"
+exp0=( 0 0 0 0 )
+exp1=( 0 0 0 0 )
+check_tables
+
+msg="add on static port"
+new_vxlan vxlan0 4789 $NSIM_NETDEV
+new_vxlan vxlan1 4789 $NSIM_NETDEV
+
+msg="add on different port"
+exp0=( `mke 4790 1` 0 0 0 )
+new_vxlan vxlan2 4790 $NSIM_NETDEV
+
+cleanup_tuns
+
+msg="tunnels destroyed"
+exp0=( 0 0 0 0 )
+exp1=( 0 0 0 0 )
+check_tables
+
+msg="different type"
+new_geneve gnv0 4789
+
+cleanup_tuns
+cleanup_nsim
+
+# END
+
+modprobe -r netdevsim
+
+if [ $num_errors -eq 0 ]; then
+ echo "PASSED all $num_cases checks"
+else
+ echo "FAILED $num_errors/$num_cases checks"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
new file mode 100755
index 000000000..11189f309
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/ocelot/tc_flower_chains.sh
@@ -0,0 +1,316 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright 2020 NXP Semiconductors
+
+WAIT_TIME=1
+NUM_NETIFS=4
+lib_dir=$(dirname $0)/../../../net/forwarding
+source $lib_dir/tc_common.sh
+source $lib_dir/lib.sh
+
+require_command tcpdump
+
+#
+# +---------------------------------------------+
+# | DUT ports Generator ports |
+# | +--------+ +--------+ +--------+ +--------+ |
+# | | | | | | | | | |
+# | | eth0 | | eth1 | | eth2 | | eth3 | |
+# | | | | | | | | | |
+# +-+--------+-+--------+-+--------+-+--------+-+
+# | | | |
+# | | | |
+# | +-----------+ |
+# | |
+# +--------------------------------+
+
+eth0=${NETIFS[p1]}
+eth1=${NETIFS[p2]}
+eth2=${NETIFS[p3]}
+eth3=${NETIFS[p4]}
+
+eth0_mac="de:ad:be:ef:00:00"
+eth1_mac="de:ad:be:ef:00:01"
+eth2_mac="de:ad:be:ef:00:02"
+eth3_mac="de:ad:be:ef:00:03"
+
+# Helpers to map a VCAP IS1 and VCAP IS2 lookup and policy to a chain number
+# used by the kernel driver. The numbers are:
+# VCAP IS1 lookup 0: 10000
+# VCAP IS1 lookup 1: 11000
+# VCAP IS1 lookup 2: 12000
+# VCAP IS2 lookup 0 policy 0: 20000
+# VCAP IS2 lookup 0 policy 1: 20001
+# VCAP IS2 lookup 0 policy 255: 20255
+# VCAP IS2 lookup 1 policy 0: 21000
+# VCAP IS2 lookup 1 policy 1: 21001
+# VCAP IS2 lookup 1 policy 255: 21255
+IS1()
+{
+	# Print the chain number of VCAP IS1 lookup $1: 10000 plus 1000 per lookup.
+	local idx=$1
+	local chain=$((1000 * idx + 10000))
+
+	echo $chain
+}
+
+IS2()
+{
+	# Print the chain number of VCAP IS2 lookup $1 for policy (PAG) $2:
+	# 20000 plus 1000 per lookup plus the policy number.
+	local idx=$1
+	local policy=$2
+	local chain=$((policy + 1000 * idx + 20000))
+
+	echo $chain
+}
+
+ES0()
+{
+	# The egress rule in this script is installed in chain 0.
+	printf "0\n"
+}
+
+# The Ocelot switches have a fixed ingress pipeline composed of:
+#
+# +----------------------------------------------+ +-----------------------------------------+
+# | VCAP IS1 | | VCAP IS2 |
+# | | | |
+# | +----------+ +----------+ +----------+ | | +----------+ +----------+ |
+# | | Lookup 0 | | Lookup 1 | | Lookup 2 | | --+------> PAG 0: | Lookup 0 | -> | Lookup 1 | |
+# | +----------+ -> +----------+ -> +----------+ | | | +----------+ +----------+ |
+# | |key&action| |key&action| |key&action| | | | |key&action| |key&action| |
+# | |key&action| |key&action| |key&action| | | | | .. | | .. | |
+# | | .. | | .. | | .. | | | | +----------+ +----------+ |
+# | +----------+ +----------+ +----------+ | | | |
+# | selects PAG | | | +----------+ +----------+ |
+# +----------------------------------------------+ +------> PAG 1: | Lookup 0 | -> | Lookup 1 | |
+# | | +----------+ +----------+ |
+# | | |key&action| |key&action| |
+# | | | .. | | .. | |
+# | | +----------+ +----------+ |
+# | | ... |
+# | | |
+# | | +----------+ +----------+ |
+# +----> PAG 254: | Lookup 0 | -> | Lookup 1 | |
+# | | +----------+ +----------+ |
+# | | |key&action| |key&action| |
+# | | | .. | | .. | |
+# | | +----------+ +----------+ |
+# | | |
+# | | +----------+ +----------+ |
+# +----> PAG 255: | Lookup 0 | -> | Lookup 1 | |
+# | +----------+ +----------+ |
+# | |key&action| |key&action| |
+# | | .. | | .. | |
+# | +----------+ +----------+ |
+# +-----------------------------------------+
+#
+# Both the VCAP IS1 (Ingress Stage 1) and IS2 (Ingress Stage 2) are indexed
+# (looked up) multiple times: IS1 3 times, and IS2 2 times. Each filter
+# (key and action pair) can be configured to only match during the first, or
+# second, etc, lookup.
+#
+# During one TCAM lookup, the filter processing stops at the first entry that
+# matches, then the pipeline jumps to the next lookup.
+# The driver maps each individual lookup of each individual ingress TCAM to a
+# separate chain number. For correct rule offloading, it is mandatory that each
+# filter installed in one TCAM is terminated by a non-optional GOTO action to
+# the next lookup from the fixed pipeline.
+#
+# A chain can only be used if there is a GOTO action correctly set up from the
+# prior lookup in the processing pipeline. Setting up all chains is not
+# mandatory.
+
+# NOTE: VCAP IS1 currently uses only S1_NORMAL half keys and VCAP IS2
+# dynamically chooses between MAC_ETYPE, ARP, IP4_TCP_UDP, IP4_OTHER, which are
+# all half keys as well.
+
+create_tcam_skeleton()
+{
+ local eth=$1
+
+ tc qdisc add dev $eth clsact
+
+ # VCAP IS1 is the Ingress Classification TCAM and can offload the
+ # following actions:
+ # - skbedit priority
+ # - vlan pop
+ # - vlan modify
+ # - goto (only in lookup 2, the last IS1 lookup)
+ tc filter add dev $eth ingress chain 0 pref 49152 flower \
+ skip_sw action goto chain $(IS1 0)
+ tc filter add dev $eth ingress chain $(IS1 0) pref 49152 \
+ flower skip_sw action goto chain $(IS1 1)
+ tc filter add dev $eth ingress chain $(IS1 1) pref 49152 \
+ flower skip_sw action goto chain $(IS1 2)
+ tc filter add dev $eth ingress chain $(IS1 2) pref 49152 \
+ flower skip_sw action goto chain $(IS2 0 0)
+
+ # VCAP IS2 is the Security Enforcement ingress TCAM and can offload the
+ # following actions:
+ # - trap
+ # - drop
+ # - police
+ # The two VCAP IS2 lookups can be segmented into up to 256 groups of
+ # rules, called Policies. A Policy is selected through the Policy
+ # Association Group (PAG) action of VCAP IS1 (which is the
+ # GOTO offload).
+ tc filter add dev $eth ingress chain $(IS2 0 0) pref 49152 \
+ flower skip_sw action goto chain $(IS2 1 0)
+}
+
+# Build the test configuration: goto skeleton on $eth0, a bridge over $eth0 and
+# $eth1, VLAN uppers on $eth3 and the offloaded rules exercised by the tests.
+setup_prepare()
+{
+ create_tcam_skeleton $eth0
+
+ # Bridge the two DUT ports together.
+ ip link add br0 type bridge
+ ip link set $eth0 master br0
+ ip link set $eth1 master br0
+ ip link set br0 up
+
+ # 802.1Q uppers on $eth3 for VLAN 100 and VLAN 200.
+ ip link add link $eth3 name $eth3.100 type vlan id 100
+ ip link set $eth3.100 up
+
+ ip link add link $eth3 name $eth3.200 type vlan id 200
+ ip link set $eth3.200 up
+
+ # IS1 lookup 1: pop the tag of VLAN 100 frames, then continue with lookup 2.
+ tc filter add dev $eth0 ingress chain $(IS1 1) pref 1 \
+ protocol 802.1Q flower skip_sw vlan_id 100 \
+ action vlan pop \
+ action goto chain $(IS1 2)
+
+ # Egress on $eth0: push VLAN 100 onto frames whose input device is $eth1.
+ tc filter add dev $eth0 egress chain $(ES0) pref 1 \
+ flower skip_sw indev $eth1 \
+ action vlan push protocol 802.1Q id 100
+
+ # IS1 lookup 0: give IPv4 frames from 10.1.1.2 priority 7.
+ tc filter add dev $eth0 ingress chain $(IS1 0) pref 2 \
+ protocol ipv4 flower skip_sw src_ip 10.1.1.2 \
+ action skbedit priority 7 \
+ action goto chain $(IS1 1)
+
+ # IS2 lookup 0, policy 0: police UDP traffic to destination port 5201.
+ tc filter add dev $eth0 ingress chain $(IS2 0 0) pref 1 \
+ protocol ipv4 flower skip_sw ip_proto udp dst_port 5201 \
+ action police rate 50mbit burst 64k conform-exceed drop/pipe \
+ action goto chain $(IS2 1 0)
+}
+
+# Undo setup_prepare: delete the VLAN uppers, the clsact qdisc on $eth0 and
+# the bridge.
+cleanup()
+{
+ ip link del $eth3.200
+ ip link del $eth3.100
+ tc qdisc del dev $eth0 clsact
+ ip link del br0
+}
+
+# Send one frame through the VLAN 100 upper of $eth3 and expect it to show up
+# on $eth2 with an IPv4 ethertype directly after the MAC addresses.
+# Prints OK or FAIL; on FAIL also sets EXIT_STATUS so the script exits non-zero.
+test_vlan_pop()
+{
+	printf "Testing VLAN pop.. "
+
+	tcpdump_start $eth2
+
+	# Work around Mausezahn VLAN builder bug
+	# (https://github.com/netsniff-ng/netsniff-ng/issues/225) by using
+	# an 8021q upper
+	$MZ $eth3.100 -q -c 1 -p 64 -a $eth3_mac -b $eth2_mac -t ip
+
+	sleep 1
+
+	tcpdump_stop
+
+	if tcpdump_show | grep -q "$eth3_mac > $eth2_mac, ethertype IPv4"; then
+		echo "OK"
+	else
+		echo "FAIL"
+		# The script ends with "exit $EXIT_STATUS"; without this a
+		# failed check would still be reported as success.
+		EXIT_STATUS=1
+	fi
+
+	tcpdump_cleanup
+}
+
+# Send one frame from $eth2 and expect it to be captured on the VLAN 100 upper
+# of $eth3.
+# Prints OK or FAIL; on FAIL also sets EXIT_STATUS so the script exits non-zero.
+test_vlan_push()
+{
+	printf "Testing VLAN push.. "
+
+	tcpdump_start $eth3.100
+
+	$MZ $eth2 -q -c 1 -p 64 -a $eth2_mac -b $eth3_mac -t ip
+
+	sleep 1
+
+	tcpdump_stop
+
+	if tcpdump_show | grep -q "$eth2_mac > $eth3_mac"; then
+		echo "OK"
+	else
+		echo "FAIL"
+		# The script ends with "exit $EXIT_STATUS"; without this a
+		# failed check would still be reported as success.
+		EXIT_STATUS=1
+	fi
+
+	tcpdump_cleanup
+}
+
+# Temporarily enable VLAN filtering, install an IS1 lookup 2 rule rewriting
+# VLAN 200 to VLAN 300, send one frame through the VLAN 200 upper of $eth3 and
+# expect it on $eth2 tagged with VLAN 300.  The rule and the bridge VLAN
+# configuration are removed again before returning.
+# Prints OK or FAIL; on FAIL also sets EXIT_STATUS so the script exits non-zero.
+test_vlan_modify()
+{
+	printf "Testing VLAN modification.. "
+
+	ip link set br0 type bridge vlan_filtering 1
+	bridge vlan add dev $eth0 vid 200
+	bridge vlan add dev $eth0 vid 300
+	bridge vlan add dev $eth1 vid 300
+
+	tc filter add dev $eth0 ingress chain $(IS1 2) pref 3 \
+		protocol 802.1Q flower skip_sw vlan_id 200 \
+		action vlan modify id 300 \
+		action goto chain $(IS2 0 0)
+
+	tcpdump_start $eth2
+
+	$MZ $eth3.200 -q -c 1 -p 64 -a $eth3_mac -b $eth2_mac -t ip
+
+	sleep 1
+
+	tcpdump_stop
+
+	if tcpdump_show | grep -q "$eth3_mac > $eth2_mac, .* vlan 300"; then
+		echo "OK"
+	else
+		echo "FAIL"
+		# The script ends with "exit $EXIT_STATUS"; without this a
+		# failed check would still be reported as success.
+		EXIT_STATUS=1
+	fi
+
+	tcpdump_cleanup
+
+	tc filter del dev $eth0 ingress chain $(IS1 2) pref 3
+
+	bridge vlan del dev $eth0 vid 200
+	bridge vlan del dev $eth0 vid 300
+	bridge vlan del dev $eth1 vid 300
+	ip link set br0 type bridge vlan_filtering 0
+}
+
+# Send num_pkts frames with source IP 10.1.1.2 from $eth3 and expect the
+# 'rx_green_prio_7' ethtool counter of $eth0 to grow by exactly that amount.
+# Prints OK or FAIL; on FAIL also sets EXIT_STATUS so the script exits non-zero.
+test_skbedit_priority()
+{
+	local num_pkts=100
+	# Counter samples are kept local so they do not leak into the globals.
+	local before after
+
+	printf "Testing frame prioritization.. "
+
+	before=$(ethtool_stats_get $eth0 'rx_green_prio_7')
+
+	$MZ $eth3 -q -c $num_pkts -p 64 -a $eth3_mac -b $eth2_mac -t ip -A 10.1.1.2
+
+	after=$(ethtool_stats_get $eth0 'rx_green_prio_7')
+
+	if [ $((after - before)) = $num_pkts ]; then
+		echo "OK"
+	else
+		echo "FAIL"
+		# The script ends with "exit $EXIT_STATUS"; without this a
+		# failed check would still be reported as success.
+		EXIT_STATUS=1
+	fi
+}
+
+trap cleanup EXIT
+
+ALL_TESTS="
+ test_vlan_pop
+ test_vlan_push
+ test_vlan_modify
+ test_skbedit_priority
+"
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/usb/usbip/usbip_test.sh b/tools/testing/selftests/drivers/usb/usbip/usbip_test.sh
new file mode 100755
index 000000000..128f0ab24
--- /dev/null
+++ b/tools/testing/selftests/drivers/usb/usbip/usbip_test.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+usage() { echo "usbip_test.sh -b <busid> -p <usbip tools path>"; exit 1; }
+
+while getopts "h:b:p:" arg; do
+ case "${arg}" in
+ h)
+ usage
+ ;;
+ b)
+ busid=${OPTARG}
+ ;;
+ p)
+ tools_path=${OPTARG}
+ ;;
+ *)
+ usage
+ ;;
+ esac
+done
+shift $((OPTIND-1))
+
+if [ -z "${busid}" ]; then
+ usage
+fi
+
+# Closing banner, printed on every early exit below as well.
+test_end_msg="End of USB over IP Testing on $busid"
+
+echo "Running USB over IP Testing on $busid";
+
+# Only root may run the test; anyone else gets a SKIP.
+[ $UID = 0 ] || {
+	echo "Please run usbip_test as root [SKIP]"
+	echo $test_end_msg
+	exit $ksft_skip
+}
+
+echo "Load usbip_host module"
+# Dry run first: a module that cannot be found is a SKIP, not a failure.
+/sbin/modprobe -q -n usbip_host || {
+	echo "usbip_test: module usbip_host is not found [SKIP]"
+	echo $test_end_msg
+	exit $ksft_skip
+}
+
+if ! /sbin/modprobe -q usbip_host; then
+	echo "usbip_test: module usbip_host failed to load [FAIL]"
+	echo $test_end_msg
+	exit 1
+fi
+echo "usbip_test: module usbip_host is loaded [OK]"
+
+echo "Load vhci_hcd module"
+if ! /sbin/modprobe -q vhci_hcd; then
+	echo "usbip_test: module vhci_hcd failed to load [FAIL]"
+	echo $test_end_msg
+	exit 1
+fi
+echo "usbip_test: module vhci_hcd is loaded [OK]"
+echo "=============================================================="
+
+cd $tools_path;
+
+if [ ! -f src/usbip ]; then
+ echo "Please build usbip tools"
+ echo $test_end_msg
+ exit $ksft_skip
+fi
+
+echo "Expect to see export-able devices";
+src/usbip list -l;
+echo "=============================================================="
+
+echo "Run lsusb to see all usb devices"
+lsusb -t;
+echo "=============================================================="
+
+src/usbipd -D;
+
+echo "Get exported devices from localhost - expect to see none";
+src/usbip list -r localhost;
+echo "=============================================================="
+
+echo "bind devices";
+src/usbip bind -b $busid;
+echo "=============================================================="
+
+echo "Run lsusb - bound devices should be under usbip_host control"
+lsusb -t;
+echo "=============================================================="
+
+echo "bind devices - expect already bound messages"
+src/usbip bind -b $busid;
+echo "=============================================================="
+
+echo "Get exported devices from localhost - expect to see exported devices";
+src/usbip list -r localhost;
+echo "=============================================================="
+
+echo "unbind devices";
+src/usbip unbind -b $busid;
+echo "=============================================================="
+
+echo "Run lsusb - bound devices should be rebound to original drivers"
+lsusb -t;
+echo "=============================================================="
+
+echo "unbind devices - expect no devices bound message";
+src/usbip unbind -b $busid;
+echo "=============================================================="
+
+echo "Get exported devices from localhost - expect to see none";
+src/usbip list -r localhost;
+echo "=============================================================="
+
+echo "List imported devices - expect to see none";
+src/usbip port;
+echo "=============================================================="
+
+echo "Import devices from localhost - should fail with no devices"
+src/usbip attach -r localhost -b $busid;
+echo "=============================================================="
+
+echo "bind devices";
+src/usbip bind -b $busid;
+echo "=============================================================="
+
+echo "List imported devices - expect to see exported devices";
+src/usbip list -r localhost;
+echo "=============================================================="
+
+echo "List imported devices - expect to see none";
+src/usbip port;
+echo "=============================================================="
+
+echo "Import devices from localhost - should work"
+src/usbip attach -r localhost -b $busid;
+echo "=============================================================="
+
+# Wait for sysfs file to be updated. Without this sleep, usbip port
+# shows no imported devices.
+sleep 3;
+
+echo "List imported devices - expect to see imported devices";
+src/usbip port;
+echo "=============================================================="
+
+echo "Import devices from localhost - expect already imported messages"
+src/usbip attach -r localhost -b $busid;
+echo "=============================================================="
+
+echo "Un-import devices";
+src/usbip detach -p 00;
+src/usbip detach -p 01;
+echo "=============================================================="
+
+echo "List imported devices - expect to see none";
+src/usbip port;
+echo "=============================================================="
+
+echo "Un-import devices - expect no devices to detach messages";
+src/usbip detach -p 00;
+src/usbip detach -p 01;
+echo "=============================================================="
+
+echo "Detach invalid port tests - expect invalid port error message";
+src/usbip detach -p 100;
+echo "=============================================================="
+
+echo "Expect to see export-able devices";
+src/usbip list -l;
+echo "=============================================================="
+
+echo "Remove usbip_host module";
+rmmod usbip_host;
+
+echo "Run lsusb - bound devices should be rebound to original drivers"
+lsusb -t;
+echo "=============================================================="
+
+echo "Run bind without usbip_host - expect fail"
+src/usbip bind -b $busid;
+echo "=============================================================="
+
+echo "Run lsusb - devices that failed to bind aren't bound to any driver"
+lsusb -t;
+echo "=============================================================="
+
+echo "modprobe usbip_host - does it work?"
+/sbin/modprobe usbip_host
+echo "Should see -busid- is not in match_busid table... skip! dmesg"
+echo "=============================================================="
+dmesg | grep "is not in match_busid table"
+echo "=============================================================="
+
+echo $test_end_msg
diff --git a/tools/testing/selftests/efivarfs/.gitignore b/tools/testing/selftests/efivarfs/.gitignore
new file mode 100644
index 000000000..807407f7f
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+create-read
+open-unlink
diff --git a/tools/testing/selftests/efivarfs/Makefile b/tools/testing/selftests/efivarfs/Makefile
new file mode 100644
index 000000000..e3181338b
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS = -Wall
+
+TEST_GEN_FILES := open-unlink create-read
+TEST_PROGS := efivarfs.sh
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/efivarfs/config b/tools/testing/selftests/efivarfs/config
new file mode 100644
index 000000000..4e151f100
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/config
@@ -0,0 +1 @@
+CONFIG_EFIVAR_FS=y
diff --git a/tools/testing/selftests/efivarfs/create-read.c b/tools/testing/selftests/efivarfs/create-read.c
new file mode 100644
index 000000000..7bc7af4eb
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/create-read.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+
+/*
+ * Create the file named by argv[1] and check that reading from the freshly
+ * created file returns EOF (a read of 0 bytes).
+ *
+ * Returns EXIT_SUCCESS if so; EXIT_FAILURE on a usage error, if the file
+ * cannot be opened/created, if read() fails, or if read() returns data.
+ */
+int main(int argc, char **argv)
+{
+	const char *path;
+	char buf[4];
+	int fd, rc;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s <path>\n", argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	path = argv[1];
+
+	/* create a test variable */
+	fd = open(path, O_RDWR | O_CREAT, 0600);
+	if (fd < 0) {
+		/* label matches the flags actually used above */
+		perror("open(O_RDWR)");
+		return EXIT_FAILURE;
+	}
+
+	rc = read(fd, buf, sizeof(buf));
+	if (rc < 0) {
+		/* a failing read is an error of its own, not unexpected data */
+		perror("read");
+		close(fd);
+		return EXIT_FAILURE;
+	}
+	if (rc != 0) {
+		fprintf(stderr, "Reading a new var should return EOF\n");
+		close(fd);
+		return EXIT_FAILURE;
+	}
+
+	close(fd);
+	return EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/efivarfs/efivarfs.sh b/tools/testing/selftests/efivarfs/efivarfs.sh
new file mode 100755
index 000000000..d374878cc
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/efivarfs.sh
@@ -0,0 +1,218 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+efivarfs_mount=/sys/firmware/efi/efivars
+test_guid=210be57c-9849-4fc7-a635-e6382d1aec27
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Clear the immutable attribute of file $1, then remove it.
+file_cleanup()
+{
+	local victim=$1
+
+	chattr -i $victim
+	rm -f $victim
+}
+
+# Exit with the kselftest SKIP code unless the caller is root and efivarfs is
+# mounted on $efivarfs_mount.
+check_prereqs()
+{
+	local msg="skip all tests:"
+
+	[ $UID = 0 ] || {
+		echo $msg must be run as root >&2
+		exit $ksft_skip
+	}
+
+	grep -q "^\S\+ $efivarfs_mount efivarfs" /proc/mounts || {
+		echo $msg efivarfs is not mounted on $efivarfs_mount >&2
+		exit $ksft_skip
+	}
+}
+
+# Run test $1, either a shell function of this script or an executable in the
+# current directory, print [PASS] or [FAIL] and set the global rc to 1 on
+# failure.
+run_test()
+{
+	local test="$1"
+	local status
+
+	echo "--------------------"
+	echo "running $test"
+	echo "--------------------"
+
+	# Both kinds run in a subshell, so an "exit" inside a test function
+	# ends only that test.
+	case "$(type -t $test)" in
+	'function')
+		( $test )
+		;;
+	*)
+		( ./$test )
+		;;
+	esac
+	status=$?
+
+	if [ $status -eq 0 ]; then
+		echo " [PASS]"
+	else
+		echo " [FAIL]"
+		rc=1
+	fi
+}
+
+# Write four attribute bytes plus one data byte to a new variable and check
+# that the file exists and has a size of 5.
+test_create()
+{
+	local file=$efivarfs_mount/$FUNCNAME-$test_guid
+	local attrs='\x07\x00\x00\x00'
+
+	printf "$attrs\x00" > $file
+
+	[ -e $file ] || {
+		echo "$file couldn't be created" >&2
+		exit 1
+	}
+
+	if [ $(stat -c %s $file) -ne 5 ]; then
+		echo "$file has invalid size" >&2
+		file_cleanup $file
+		exit 1
+	fi
+
+	file_cleanup $file
+}
+
+# Create a variable through a redirection that writes nothing and check that
+# the file exists afterwards.
+test_create_empty()
+{
+	local file=$efivarfs_mount/$FUNCNAME-$test_guid
+
+	: > $file
+
+	[ -e $file ] || {
+		echo "$file can not be created without writing" >&2
+		exit 1
+	}
+
+	file_cleanup $file
+}
+
+# Run the create-read helper on a new variable; the file is cleaned up
+# whether the helper succeeds or not.
+test_create_read()
+{
+	local file=$efivarfs_mount/$FUNCNAME-$test_guid
+
+	if ! ./create-read $file; then
+		echo "create and read $file failed"
+		file_cleanup $file
+		exit 1
+	fi
+
+	file_cleanup $file
+}
+
+# Create a variable, remove it with file_cleanup and check that it is gone.
+test_delete()
+{
+	local file=$efivarfs_mount/$FUNCNAME-$test_guid
+	local attrs='\x07\x00\x00\x00'
+
+	printf "$attrs\x00" > $file
+
+	[ -e $file ] || {
+		echo "$file couldn't be created" >&2
+		exit 1
+	}
+
+	file_cleanup $file
+
+	if [ -e $file ]; then
+		echo "$file couldn't be deleted" >&2
+		exit 1
+	fi
+
+}
+
+# test that we can remove a variable by issuing a write with only
+# attributes specified
+test_zero_size_delete()
+{
+	local attrs='\x07\x00\x00\x00'
+	local file=$efivarfs_mount/$FUNCNAME-$test_guid
+
+	printf "$attrs\x00" > $file
+
+	if [ ! -e $file ]; then
+		echo "$file does not exist" >&2
+		exit 1
+	fi
+
+	# Clear the immutable attribute, then write the attributes only.
+	chattr -i $file
+	printf "$attrs" > $file
+
+	if [ -e $file ]; then
+		echo "$file should have been deleted" >&2
+		# The variable is still there: remove it the regular way so
+		# that a failed run does not leave the test variable behind.
+		file_cleanup $file
+		exit 1
+	fi
+}
+
+test_open_unlink()
+{
+ local file=$efivarfs_mount/$FUNCNAME-$test_guid
+ ./open-unlink $file
+}
+
+# test that we can create a range of filenames
+test_valid_filenames()
+{
+	local attrs='\x07\x00\x00\x00'
+	local ret=0
+	local file
+
+	# Each name gets "-$test_guid" appended and must be creatable.
+	for f in abc dump-type0-11-1-1362436005 1234 -; do
+		file=$efivarfs_mount/$f-$test_guid
+
+		printf "$attrs\x00" > $file
+
+		if [ -e $file ]; then
+			file_cleanup $file
+		else
+			echo "$file could not be created" >&2
+			ret=1
+		fi
+	done
+
+	exit $ret
+}
+
+# Check that none of the names below can be created under $efivarfs_mount.
+# A name that does get created is reported and removed again; the function
+# exits with 1 if that happened for any name, otherwise with 0.
+test_invalid_filenames()
+{
+	local attrs='\x07\x00\x00\x00'
+	local ret=0
+	local file
+	local -a bad_names=(
+		-1234-1234-1234-123456789abc
+		foo
+		foo-bar
+		-foo-
+		foo-barbazba-foob-foob-foob-foobarbazfoo
+		foo-------------------------------------
+		-12345678-1234-1234-1234-123456789abc
+		a-12345678=1234-1234-1234-123456789abc
+		a-12345678-1234=1234-1234-123456789abc
+		a-12345678-1234-1234=1234-123456789abc
+		a-12345678-1234-1234-1234=123456789abc
+		1112345678-1234-1234-1234-123456789abc
+	)
+
+	for f in "${bad_names[@]}"; do
+		file=$efivarfs_mount/$f
+
+		printf "$attrs\x00" 2>/dev/null > $file
+
+		if [ -e $file ]; then
+			echo "Creating $file should have failed" >&2
+			file_cleanup $file
+			ret=1
+		fi
+	done
+
+	exit $ret
+}
+
+check_prereqs
+
+# Overall result; run_test sets it to 1 when a test fails.
+rc=0
+
+for t in \
+	test_create \
+	test_create_empty \
+	test_create_read \
+	test_delete \
+	test_zero_size_delete \
+	test_open_unlink \
+	test_valid_filenames \
+	test_invalid_filenames
+do
+	run_test $t
+done
+
+exit $rc
diff --git a/tools/testing/selftests/efivarfs/open-unlink.c b/tools/testing/selftests/efivarfs/open-unlink.c
new file mode 100644
index 000000000..562742d44
--- /dev/null
+++ b/tools/testing/selftests/efivarfs/open-unlink.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <linux/fs.h>
+
+/*
+ * Set (immutable != 0) or clear (immutable == 0) FS_IMMUTABLE_FL on @path.
+ *
+ * Returns the result of the failing open()/ioctl() on error, otherwise the
+ * result of the FS_IOC_SETFLAGS ioctl.  errno as left by the last ioctl is
+ * preserved across the close().
+ */
+static int set_immutable(const char *path, int immutable)
+{
+	unsigned int attr_flags;
+	int saved_errno;
+	int ret;
+	int fd = open(path, O_RDONLY);
+
+	if (fd < 0)
+		return fd;
+
+	ret = ioctl(fd, FS_IOC_GETFLAGS, &attr_flags);
+	if (ret >= 0) {
+		attr_flags = immutable ? (attr_flags | FS_IMMUTABLE_FL)
+				       : (attr_flags & ~FS_IMMUTABLE_FL);
+		ret = ioctl(fd, FS_IOC_SETFLAGS, &attr_flags);
+	}
+
+	/* close() may modify errno; hand the ioctl's value to the caller */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	return ret;
+}
+
+/*
+ * Query FS_IMMUTABLE_FL of @path.
+ *
+ * Returns 1 if the flag is set, 0 if it is clear, or the negative result of
+ * the failing open()/ioctl(); after a failed ioctl errno is restored to the
+ * value the ioctl left.
+ */
+static int get_immutable(const char *path)
+{
+	unsigned int attr_flags;
+	int saved_errno;
+	int ret;
+	int fd = open(path, O_RDONLY);
+
+	if (fd < 0)
+		return fd;
+
+	ret = ioctl(fd, FS_IOC_GETFLAGS, &attr_flags);
+	saved_errno = errno;
+	close(fd);
+
+	if (ret < 0) {
+		errno = saved_errno;
+		return ret;
+	}
+
+	return (attr_flags & FS_IMMUTABLE_FL) ? 1 : 0;
+}
+
+/*
+ * Create the variable named by argv[1] (attributes 0x7 plus one zero data
+ * byte), clear its immutable flag if set, open it, unlink it, and check that
+ * a read through the still-open descriptor returns no data.
+ *
+ * Returns EXIT_SUCCESS if so, EXIT_FAILURE on a usage error or when any step
+ * fails.
+ */
+int main(int argc, char **argv)
+{
+	/* attributes: EFI_VARIABLE_NON_VOLATILE |
+	 *             EFI_VARIABLE_BOOTSERVICE_ACCESS |
+	 *             EFI_VARIABLE_RUNTIME_ACCESS
+	 */
+	const uint32_t attrs = 0x7;
+	const char *path;
+	char buf[5];
+	int fd, rc;
+
+	if (argc < 2) {
+		fprintf(stderr, "usage: %s <path>\n", argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	path = argv[1];
+
+	/*
+	 * Copy the attribute word bytewise: buf is a char array, so storing
+	 * through a uint32_t pointer is neither alignment- nor aliasing-safe.
+	 */
+	memcpy(buf, &attrs, sizeof(attrs));
+	buf[4] = 0;
+
+	/* create a test variable */
+	fd = open(path, O_WRONLY | O_CREAT, 0600);
+	if (fd < 0) {
+		perror("open(O_WRONLY)");
+		return EXIT_FAILURE;
+	}
+
+	rc = write(fd, buf, sizeof(buf));
+	if (rc != (int)sizeof(buf)) {
+		/* errno is only meaningful when write() itself failed */
+		if (rc < 0)
+			perror("write");
+		else
+			fprintf(stderr, "write: short write (%d of %zu bytes)\n",
+				rc, sizeof(buf));
+		close(fd);
+		return EXIT_FAILURE;
+	}
+
+	close(fd);
+
+	rc = get_immutable(path);
+	if (rc < 0) {
+		perror("ioctl(FS_IOC_GETFLAGS)");
+		return EXIT_FAILURE;
+	} else if (rc) {
+		rc = set_immutable(path, 0);
+		if (rc < 0) {
+			perror("ioctl(FS_IOC_SETFLAGS)");
+			return EXIT_FAILURE;
+		}
+	}
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		perror("open");
+		return EXIT_FAILURE;
+	}
+
+	if (unlink(path) < 0) {
+		perror("unlink");
+		close(fd);
+		return EXIT_FAILURE;
+	}
+
+	rc = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (rc > 0) {
+		fprintf(stderr, "reading from an unlinked variable "
+			"shouldn't be possible\n");
+		return EXIT_FAILURE;
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/exec/.gitignore b/tools/testing/selftests/exec/.gitignore
new file mode 100644
index 000000000..9e2f00343
--- /dev/null
+++ b/tools/testing/selftests/exec/.gitignore
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+subdir*
+script*
+execveat
+execveat.symlink
+execveat.moved
+execveat.path.ephemeral
+execveat.ephemeral
+execveat.denatured
+/load_address_*
+/recursion-depth
+xxxxxxxx*
+pipe
+S_I*.test
diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile
new file mode 100644
index 000000000..2d7fca446
--- /dev/null
+++ b/tools/testing/selftests/exec/Makefile
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS = -Wall
+CFLAGS += -Wno-nonnull
+CFLAGS += -D_GNU_SOURCE
+
+TEST_PROGS := binfmt_script
+TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 non-regular
+TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir
+# Makefile is a run-time dependency, since it's accessed by the execveat test
+TEST_FILES := Makefile
+
+TEST_GEN_PROGS += recursion-depth
+
+EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \
+ $(OUTPUT)/S_I*.test
+
+include ../lib.mk
+
+$(OUTPUT)/subdir:
+ mkdir -p $@
+$(OUTPUT)/script:
+ echo '#!/bin/sh' > $@
+ echo 'exit $$*' >> $@
+ chmod +x $@
+$(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat
+ cd $(OUTPUT) && ln -s -f $(shell basename $<) $(shell basename $@)
+$(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat
+ cp $< $@
+ chmod -x $@
+$(OUTPUT)/load_address_4096: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie -static $< -o $@
+$(OUTPUT)/load_address_2097152: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie -static $< -o $@
+$(OUTPUT)/load_address_16777216: load_address.c
+ $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie -static $< -o $@
diff --git a/tools/testing/selftests/exec/binfmt_script b/tools/testing/selftests/exec/binfmt_script
new file mode 100755
index 000000000..05f94a741
--- /dev/null
+++ b/tools/testing/selftests/exec/binfmt_script
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that truncation of bprm->buf doesn't cause unexpected exec paths, along
+# with various other pathological cases.
+import os, subprocess
+
+# Relevant commits
+#
+# b5372fe5dc84 ("exec: load_script: Do not exec truncated interpreter path")
+# 6eb3c3d0a52d ("exec: increase BINPRM_BUF_SIZE to 256")
+
+# BINPRM_BUF_SIZE
+SIZE=256
+
+# Longest file name allowed in the current directory, as reported by getconf.
+NAME_MAX=int(subprocess.check_output(["getconf", "NAME_MAX", "."]))
+
+# Number of test() calls made so far.
+test_num=0
+
+# Perl source written to every generated interpreter file; test() looks for
+# its "Executed interpreter" output to decide whether the exec went through.
+code='''#!/usr/bin/perl
+print "Executed interpreter! Args:\n";
+print "0 : '$0'\n";
+$counter = 1;
+foreach my $a (@ARGV) {
+ print "$counter : '$a'\n";
+ $counter++;
+}
+'''
+
+##
+# test - produce a binfmt_script hashbang line for testing
+#
+# @size: bytes for bprm->buf line, including hashbang but not newline
+# @good: whether this script is expected to execute correctly
+# @hashbang: the special 2 bytes for running binfmt_script
+# @leading: any leading whitespace before the executable path
+# @root: start of executable pathname
+# @target: end of executable pathname
+# @arg: bytes following the executable pathname
+# @fill: character to fill between @root and @target to reach @size bytes
+# @newline: character to use as newline, not counted towards @size
+# ...
+def test(name, size, good=True, leading="", root="./", target="/perl",
+ fill="A", arg="", newline="\n", hashbang="#!"):
+ global test_num, tests, NAME_MAX
+ test_num += 1
+ if test_num > tests:
+ raise ValueError("more binfmt_script tests than expected! (want %d, expected %d)"
+ % (test_num, tests))
+
+ middle = ""
+ remaining = size - len(hashbang) - len(leading) - len(root) - len(target) - len(arg)
+ # The middle of the pathname must not exceed NAME_MAX
+ while remaining >= NAME_MAX:
+ middle += fill * (NAME_MAX - 1)
+ middle += '/'
+ remaining -= NAME_MAX
+ middle += fill * remaining
+
+ dirpath = root + middle
+ binary = dirpath + target
+ if len(target):
+ os.makedirs(dirpath, mode=0o755, exist_ok=True)
+ open(binary, "w").write(code)
+ os.chmod(binary, 0o755)
+
+ buf=hashbang + leading + root + middle + target + arg + newline
+ if len(newline) > 0:
+ buf += 'echo this is not really perl\n'
+
+ script = "binfmt_script-%s" % (name)
+ open(script, "w").write(buf)
+ os.chmod(script, 0o755)
+
+ proc = subprocess.Popen(["./%s" % (script)], shell=True,
+ stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+ stdout = proc.communicate()[0]
+
+ if proc.returncode == 0 and b'Executed interpreter' in stdout:
+ if good:
+ print("ok %d - binfmt_script %s (successful good exec)"
+ % (test_num, name))
+ else:
+ print("not ok %d - binfmt_script %s succeeded when it should have failed"
+ % (test_num, name))
+ else:
+ if good:
+ print("not ok %d - binfmt_script %s failed when it should have succeeded (rc:%d)"
+ % (test_num, name, proc.returncode))
+ else:
+ print("ok %d - binfmt_script %s (correctly failed bad exec)"
+ % (test_num, name))
+
+ # Clean up crazy binaries
+ os.unlink(script)
+ if len(target):
+ elements = binary.split('/')
+ os.unlink(binary)
+ elements.pop()
+ while len(elements) > 1:
+ os.rmdir("/".join(elements))
+ elements.pop()
+
+tests=27
+print("TAP version 1.3")
+print("1..%d" % (tests))
+
+### FAIL (8 tests)
+
+# Entire path is well past the BINFMT_BUF_SIZE.
+test(name="too-big", size=SIZE+80, good=False)
+# Path is right at max size, making it impossible to tell if it was truncated.
+test(name="exact", size=SIZE, good=False)
+# Same as above, but with leading whitespace.
+test(name="exact-space", size=SIZE, good=False, leading=" ")
+# Huge buffer of only whitespace.
+test(name="whitespace-too-big", size=SIZE+71, good=False, root="",
+ fill=" ", target="")
+# A good path, but it gets truncated due to leading whitespace.
+test(name="truncated", size=SIZE+17, good=False, leading=" " * 19)
+# Entirely empty except for #!
+test(name="empty", size=2, good=False, root="",
+ fill="", target="", newline="")
+# Within size, but entirely spaces
+test(name="spaces", size=SIZE-1, good=False, root="", fill=" ",
+ target="", newline="")
+# Newline before binary.
+test(name="newline-prefix", size=SIZE-1, good=False, leading="\n",
+ root="", fill=" ", target="")
+
+### ok (19 tests)
+
+# The original test case that was broken by commit:
+# 8099b047ecc4 ("exec: load_script: don't blindly truncate shebang string")
+test(name="test.pl", size=439, leading=" ",
+ root="./nix/store/bwav8kz8b3y471wjsybgzw84mrh4js9-perl-5.28.1/bin",
+ arg=" -I/nix/store/x6yyav38jgr924nkna62q3pkp0dgmzlx-perl5.28.1-File-Slurp-9999.25/lib/perl5/site_perl -I/nix/store/ha8v67sl8dac92r9z07vzr4gv1y9nwqz-perl5.28.1-Net-DBus-1.1.0/lib/perl5/site_perl -I/nix/store/dcrkvnjmwh69ljsvpbdjjdnqgwx90a9d-perl5.28.1-XML-Parser-2.44/lib/perl5/site_perl -I/nix/store/rmji88k2zz7h4zg97385bygcydrf2q8h-perl5.28.1-XML-Twig-3.52/lib/perl5/site_perl")
+# One byte under size, leaving newline visible.
+test(name="one-under", size=SIZE-1)
+# Two bytes under size, leaving newline visible.
+test(name="two-under", size=SIZE-2)
+# Exact size, but trailing whitespace visible instead of newline
+test(name="exact-trunc-whitespace", size=SIZE, arg=" ")
+# Exact size, but trailing space and first arg char visible instead of newline.
+test(name="exact-trunc-arg", size=SIZE, arg=" f")
+# One bute under, with confirmed non-truncated arg since newline now visible.
+test(name="one-under-full-arg", size=SIZE-1, arg=" f")
+# Short read buffer by one byte.
+test(name="one-under-no-nl", size=SIZE-1, newline="")
+# Short read buffer by half buffer size.
+test(name="half-under-no-nl", size=int(SIZE/2), newline="")
+# One byte under with whitespace arg. leaving wenline visible.
+test(name="one-under-trunc-arg", size=SIZE-1, arg=" ")
+# One byte under with whitespace leading. leaving wenline visible.
+test(name="one-under-leading", size=SIZE-1, leading=" ")
+# One byte under with whitespace leading and as arg. leaving newline visible.
+test(name="one-under-leading-trunc-arg", size=SIZE-1, leading=" ", arg=" ")
+# Same as above, but with 2 bytes under
+test(name="two-under-no-nl", size=SIZE-2, newline="")
+test(name="two-under-trunc-arg", size=SIZE-2, arg=" ")
+test(name="two-under-leading", size=SIZE-2, leading=" ")
+test(name="two-under-leading-trunc-arg", size=SIZE-2, leading=" ", arg=" ")
+# Same as above, but with buffer half filled
+test(name="two-under-no-nl", size=int(SIZE/2), newline="")
+test(name="two-under-trunc-arg", size=int(SIZE/2), arg=" ")
+test(name="two-under-leading", size=int(SIZE/2), leading=" ")
+test(name="two-under-lead-trunc-arg", size=int(SIZE/2), leading=" ", arg=" ")
+
+if test_num != tests:
+ raise ValueError("fewer binfmt_script tests than expected! (ran %d, expected %d"
+ % (test_num, tests))
diff --git a/tools/testing/selftests/exec/execveat.c b/tools/testing/selftests/exec/execveat.c
new file mode 100644
index 000000000..67bf7254a
--- /dev/null
+++ b/tools/testing/selftests/exec/execveat.c
@@ -0,0 +1,432 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2014 Google, Inc.
+ *
+ * Selftests for execveat(2).
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE /* to get O_PATH, AT_EMPTY_PATH */
+#endif
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+static char longpath[2 * PATH_MAX] = "";
+static char *envp[] = { "IN_TEST=yes", NULL, NULL };
+static char *argv[] = { "execveat", "99", NULL };
+
+/*
+ * Thin wrapper that issues the execveat(2) system call directly through
+ * syscall(2). When the build headers do not define __NR_execveat it sets
+ * errno to ENOSYS and returns -1 instead.
+ */
+static int execveat_(int fd, const char *path, char **argv, char **envp,
+ int flags)
+{
+#ifdef __NR_execveat
+ return syscall(__NR_execveat, fd, path, argv, envp, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+/*
+ * Call execveat(2) with the given arguments and verify that it fails with
+ * expected_errno. The wrapper macro stringifies the errno constant so that
+ * its symbolic name can be shown in the progress line.
+ *
+ * Returns 0 when the call failed as expected, 1 otherwise.
+ */
+#define check_execveat_fail(fd, path, flags, errno)	\
+	_check_execveat_fail(fd, path, flags, errno, #errno)
+static int _check_execveat_fail(int fd, const char *path, int flags,
+				int expected_errno, const char *errno_str)
+{
+	int rc;
+
+	errno = 0;
+	printf("Check failure of execveat(%d, '%s', %d) with %s... ",
+	       fd, path?:"(null)", flags, errno_str);
+	rc = execveat_(fd, path, argv, envp, flags);
+
+	/* A failing call returns -1; any other value was not a failure. */
+	if (rc >= 0) {
+		printf("[FAIL] (unexpected success from execveat(2))\n");
+		return 1;
+	}
+	if (errno != expected_errno) {
+		printf("[FAIL] (expected errno %d (%s) not %d (%s))\n",
+		       expected_errno, strerror(expected_errno),
+		       errno, strerror(errno));
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+/*
+ * Fork a child that calls execveat(2) with the given arguments, then check
+ * in the parent that the child exited with either expected_rc or
+ * expected_rc2.
+ *
+ * Returns 0 on success, 1 on any failure (fork, wait, abnormal termination
+ * or an unexpected exit status).
+ */
+static int check_execveat_invoked_rc(int fd, const char *path, int flags,
+				     int expected_rc, int expected_rc2)
+{
+	int status;
+	int rc;
+	pid_t child;
+	int pathlen = path ? strlen(path) : 0;
+
+	/* Abbreviate very long paths to their first and last 20 characters. */
+	if (pathlen > 40)
+		printf("Check success of execveat(%d, '%.20s...%s', %d)... ",
+		       fd, path, (path + pathlen - 20), flags);
+	else
+		printf("Check success of execveat(%d, '%s', %d)... ",
+		       fd, path?:"(null)", flags);
+	/*
+	 * Flush pending output before forking so the child does not inherit
+	 * a copy of the parent's stdio buffer and re-emit it when it exits
+	 * after a failed exec.
+	 */
+	fflush(stdout);
+	child = fork();
+	if (child < 0) {
+		printf("[FAIL] (fork() failed)\n");
+		return 1;
+	}
+	if (child == 0) {
+		/* Child: do execveat(). */
+		rc = execveat_(fd, path, argv, envp, flags);
+		printf("[FAIL]: execveat() failed, rc=%d errno=%d (%s)\n",
+		       rc, errno, strerror(errno));
+		exit(1);  /* should not reach here */
+	}
+	/* Parent: wait for & check child's exit status. */
+	rc = waitpid(child, &status, 0);
+	if (rc != child) {
+		printf("[FAIL] (waitpid(%d,...) returned %d)\n", child, rc);
+		return 1;
+	}
+	if (!WIFEXITED(status)) {
+		printf("[FAIL] (child %d did not exit cleanly, status=%08x)\n",
+		       child, status);
+		return 1;
+	}
+	if ((WEXITSTATUS(status) != expected_rc) &&
+	    (WEXITSTATUS(status) != expected_rc2)) {
+		printf("[FAIL] (child %d exited with %d not %d nor %d)\n",
+		       child, WEXITSTATUS(status), expected_rc, expected_rc2);
+		return 1;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+/*
+ * Check that execveat(2) with the given arguments runs a program that
+ * exits with status 99.
+ */
+static int check_execveat(int fd, const char *path, int flags)
+{
+	const int want_rc = 99;
+
+	return check_execveat_invoked_rc(fd, path, flags, want_rc, want_rc);
+}
+
+/*
+ * Return a newly allocated string holding left followed by right; the
+ * caller owns the result.
+ *
+ * Exits the test program if either input is NULL or the allocation fails,
+ * instead of dereferencing a NULL pointer.
+ */
+static char *concat(const char *left, const char *right)
+{
+	size_t left_len, right_len;
+	char *result;
+
+	if (!left || !right) {
+		printf("concat() given a NULL string; "
+		       "check prerequisites are available\n");
+		exit(1);
+	}
+
+	left_len = strlen(left);
+	right_len = strlen(right);
+	result = malloc(left_len + right_len + 1);
+	if (!result) {
+		printf("Failed to allocate %zu bytes\n",
+		       left_len + right_len + 1);
+		exit(1);
+	}
+
+	memcpy(result, left, left_len);
+	/* Copy right including its terminating NUL. */
+	memcpy(result + left_len, right, right_len + 1);
+	return result;
+}
+
+/*
+ * Open filename with the given flags and return the descriptor. If the
+ * open fails, print a hint about missing prerequisites and exit(1).
+ */
+static int open_or_die(const char *filename, int flags)
+{
+	int fd = open(filename, flags);
+
+	if (fd >= 0)
+		return fd;
+
+	printf("Failed to open '%s'; "
+	       "check prerequisites are available\n", filename);
+	exit(1);
+}
+
+/*
+ * Copy the file src to dest, creating dest with mode 0755 and truncating
+ * any existing content.
+ *
+ * A src that cannot be opened is fatal (see open_or_die()). Any other
+ * failure is reported and the copy is abandoned; callers notice the
+ * problem when they later try to open or execute dest.
+ */
+static void exe_cp(const char *src, const char *dest)
+{
+	int in_fd = open_or_die(src, O_RDONLY);
+	int out_fd = open(dest, O_RDWR|O_CREAT|O_TRUNC, 0755);
+	struct stat info;
+	off_t remaining;
+
+	if (out_fd < 0) {
+		printf("Failed to create copy of '%s', errno=%d (%s)\n",
+		       src, errno, strerror(errno));
+		close(in_fd);
+		return;
+	}
+
+	if (fstat(in_fd, &info) < 0) {
+		printf("Failed to stat '%s', errno=%d (%s)\n",
+		       src, errno, strerror(errno));
+		goto out;
+	}
+
+	/* sendfile() may transfer less than requested; loop until done. */
+	remaining = info.st_size;
+	while (remaining > 0) {
+		ssize_t sent = sendfile(out_fd, in_fd, NULL, remaining);
+
+		if (sent < 0) {
+			if (errno == EINTR)
+				continue;
+			printf("Failed to copy '%s', errno=%d (%s)\n",
+			       src, errno, strerror(errno));
+			break;
+		}
+		if (sent == 0) {
+			printf("Short copy of '%s'\n", src);
+			break;
+		}
+		remaining -= sent;
+	}
+out:
+	close(in_fd);
+	close(out_fd);
+}
+
+#define XX_DIR_LEN 200
+/*
+ * Copy src to a file whose absolute pathname is close to PATH_MAX bytes
+ * long, then try to execute the copy both through an open file descriptor
+ * and through its long pathname relative to root_dfd.
+ *
+ * The long pathname is built once, in the file-scope longpath buffer, and
+ * reused by later calls. is_script selects the expected outcome of the
+ * by-pathname execution.
+ *
+ * Returns the number of failed checks, or 2 if getcwd() fails.
+ */
+static int check_execveat_pathmax(int root_dfd, const char *src, int is_script)
+{
+	int fail = 0;
+	int ii, count, len;
+	char longname[XX_DIR_LEN + 1];
+	int fd;
+
+	if (*longpath == '\0') {
+		/* Create a filename close to PATH_MAX in length */
+		char *cwd = getcwd(NULL, 0);
+
+		if (!cwd) {
+			printf("Failed to getcwd(), errno=%d (%s)\n",
+			       errno, strerror(errno));
+			return 2;
+		}
+		strcpy(longpath, cwd);
+		strcat(longpath, "/");
+		/* One path component: XX_DIR_LEN - 1 'x' characters plus '/'. */
+		memset(longname, 'x', XX_DIR_LEN - 1);
+		longname[XX_DIR_LEN - 1] = '/';
+		longname[XX_DIR_LEN] = '\0';
+		count = (PATH_MAX - 3 - strlen(cwd)) / XX_DIR_LEN;
+		for (ii = 0; ii < count; ii++) {
+			strcat(longpath, longname);
+			mkdir(longpath, 0755);
+		}
+		/* Pad the final component with 'y' to reach the target length. */
+		len = (PATH_MAX - 3 - strlen(cwd)) - (count * XX_DIR_LEN);
+		if (len <= 0)
+			len = 1;
+		memset(longname, 'y', len);
+		longname[len] = '\0';
+		strcat(longpath, longname);
+		free(cwd);
+	}
+	exe_cp(src, longpath);
+
+	/*
+	 * Execute as a pre-opened file descriptor, which works whether this is
+	 * a script or not (because the interpreter sees a filename like
+	 * "/dev/fd/20").
+	 */
+	fd = open(longpath, O_RDONLY);
+	if (fd >= 0) {
+		printf("Invoke copy of '%s' via filename of length %zu:\n",
+		       src, strlen(longpath));
+		fail += check_execveat(fd, "", AT_EMPTY_PATH);
+		/* The check has forked and waited; the descriptor is done with. */
+		close(fd);
+	} else {
+		printf("Failed to open length %zu filename, errno=%d (%s)\n",
+		       strlen(longpath), errno, strerror(errno));
+		fail++;
+	}
+
+	/*
+	 * Execute as a long pathname relative to "/". If this is a script,
+	 * the interpreter will launch but fail to open the script because its
+	 * name ("/dev/fd/5/xxx....") is bigger than PATH_MAX.
+	 *
+	 * The failure code is usually 127 (POSIX: "If a command is not found,
+	 * the exit status shall be 127."), but some systems give 126 (POSIX:
+	 * "If the command name is found, but it is not an executable utility,
+	 * the exit status shall be 126."), so allow either.
+	 */
+	if (is_script)
+		fail += check_execveat_invoked_rc(root_dfd, longpath + 1, 0,
+						  127, 126);
+	else
+		fail += check_execveat(root_dfd, longpath + 1, 0);
+
+	return fail;
+}
+
+/*
+ * Run the whole execveat(2) test sequence and return the number of failed
+ * checks.
+ *
+ * All files and directories used by the checks are opened up front, then
+ * the function probes for kernel support (skipping the test via
+ * ksft_exit_skip() on ENOSYS) before running the individual checks. Several
+ * checks rename or unlink files created by prerequisites(), so the order of
+ * the statements below matters.
+ *
+ * NOTE(review): the realpath() results are not checked for NULL before
+ * being used.
+ */
+static int run_tests(void)
+{
+ int fail = 0;
+ char *fullname = realpath("execveat", NULL);
+ char *fullname_script = realpath("script", NULL);
+ char *fullname_symlink = concat(fullname, ".symlink");
+ int subdir_dfd = open_or_die("subdir", O_DIRECTORY|O_RDONLY);
+ int subdir_dfd_ephemeral = open_or_die("subdir.ephemeral",
+ O_DIRECTORY|O_RDONLY);
+ int dot_dfd = open_or_die(".", O_DIRECTORY|O_RDONLY);
+ int root_dfd = open_or_die("/", O_DIRECTORY|O_RDONLY);
+ int dot_dfd_path = open_or_die(".", O_DIRECTORY|O_RDONLY|O_PATH);
+ int dot_dfd_cloexec = open_or_die(".", O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+ int fd = open_or_die("execveat", O_RDONLY);
+ int fd_path = open_or_die("execveat", O_RDONLY|O_PATH);
+ int fd_symlink = open_or_die("execveat.symlink", O_RDONLY);
+ int fd_denatured = open_or_die("execveat.denatured", O_RDONLY);
+ int fd_denatured_path = open_or_die("execveat.denatured",
+ O_RDONLY|O_PATH);
+ int fd_script = open_or_die("script", O_RDONLY);
+ int fd_ephemeral = open_or_die("execveat.ephemeral", O_RDONLY);
+ int fd_ephemeral_path = open_or_die("execveat.path.ephemeral",
+ O_RDONLY|O_PATH);
+ int fd_script_ephemeral = open_or_die("script.ephemeral", O_RDONLY);
+ int fd_cloexec = open_or_die("execveat", O_RDONLY|O_CLOEXEC);
+ int fd_script_cloexec = open_or_die("script", O_RDONLY|O_CLOEXEC);
+
+ /* Check if we have execveat at all, and bail early if not */
+ errno = 0;
+ execveat_(-1, NULL, NULL, NULL, 0);
+ if (errno == ENOSYS) {
+ ksft_exit_skip(
+ "ENOSYS calling execveat - no kernel support?\n");
+ }
+
+ /* Change file position to confirm it doesn't affect anything */
+ lseek(fd, 10, SEEK_SET);
+
+ /* Normal executable file: */
+ /* dfd + path */
+ fail += check_execveat(subdir_dfd, "../execveat", 0);
+ fail += check_execveat(dot_dfd, "execveat", 0);
+ fail += check_execveat(dot_dfd_path, "execveat", 0);
+ /* absolute path */
+ fail += check_execveat(AT_FDCWD, fullname, 0);
+ /* absolute path with nonsense dfd */
+ fail += check_execveat(99, fullname, 0);
+ /* fd + no path */
+ fail += check_execveat(fd, "", AT_EMPTY_PATH);
+ /* O_CLOEXEC fd + no path */
+ fail += check_execveat(fd_cloexec, "", AT_EMPTY_PATH);
+ /* O_PATH fd */
+ fail += check_execveat(fd_path, "", AT_EMPTY_PATH);
+
+ /* Mess with executable file that's already open: */
+ /* fd + no path to a file that's been renamed */
+ rename("execveat.ephemeral", "execveat.moved");
+ fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+ /* fd + no path to a file that's been deleted */
+ unlink("execveat.moved"); /* remove the file now fd open */
+ fail += check_execveat(fd_ephemeral, "", AT_EMPTY_PATH);
+
+ /* Mess with executable file that's already open with O_PATH */
+ /* fd + no path to a file that's been deleted */
+ unlink("execveat.path.ephemeral");
+ fail += check_execveat(fd_ephemeral_path, "", AT_EMPTY_PATH);
+
+ /* Invalid argument failures */
+ fail += check_execveat_fail(fd, "", 0, ENOENT);
+ fail += check_execveat_fail(fd, NULL, AT_EMPTY_PATH, EFAULT);
+
+ /* Symlink to executable file: */
+ /* dfd + path */
+ fail += check_execveat(dot_dfd, "execveat.symlink", 0);
+ fail += check_execveat(dot_dfd_path, "execveat.symlink", 0);
+ /* absolute path */
+ fail += check_execveat(AT_FDCWD, fullname_symlink, 0);
+ /* fd + no path, even with AT_SYMLINK_NOFOLLOW (already followed) */
+ fail += check_execveat(fd_symlink, "", AT_EMPTY_PATH);
+ fail += check_execveat(fd_symlink, "",
+ AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+
+ /* Symlink fails when AT_SYMLINK_NOFOLLOW set: */
+ /* dfd + path */
+ fail += check_execveat_fail(dot_dfd, "execveat.symlink",
+ AT_SYMLINK_NOFOLLOW, ELOOP);
+ fail += check_execveat_fail(dot_dfd_path, "execveat.symlink",
+ AT_SYMLINK_NOFOLLOW, ELOOP);
+ /* absolute path */
+ fail += check_execveat_fail(AT_FDCWD, fullname_symlink,
+ AT_SYMLINK_NOFOLLOW, ELOOP);
+
+ /* Non-regular file failure */
+ fail += check_execveat_fail(dot_dfd, "pipe", 0, EACCES);
+ unlink("pipe");
+
+ /* Shell script wrapping executable file: */
+ /* dfd + path */
+ fail += check_execveat(subdir_dfd, "../script", 0);
+ fail += check_execveat(dot_dfd, "script", 0);
+ fail += check_execveat(dot_dfd_path, "script", 0);
+ /* absolute path */
+ fail += check_execveat(AT_FDCWD, fullname_script, 0);
+ /* fd + no path */
+ fail += check_execveat(fd_script, "", AT_EMPTY_PATH);
+ fail += check_execveat(fd_script, "",
+ AT_EMPTY_PATH|AT_SYMLINK_NOFOLLOW);
+ /* O_CLOEXEC fd fails for a script (as script file inaccessible) */
+ fail += check_execveat_fail(fd_script_cloexec, "", AT_EMPTY_PATH,
+ ENOENT);
+ fail += check_execveat_fail(dot_dfd_cloexec, "script", 0, ENOENT);
+
+ /* Mess with script file that's already open: */
+ /* fd + no path to a file that's been renamed */
+ rename("script.ephemeral", "script.moved");
+ fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+ /* fd + no path to a file that's been deleted */
+ unlink("script.moved"); /* remove the file while fd open */
+ fail += check_execveat(fd_script_ephemeral, "", AT_EMPTY_PATH);
+
+ /* Rename a subdirectory in the path: */
+ rename("subdir.ephemeral", "subdir.moved");
+ fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+ fail += check_execveat(subdir_dfd_ephemeral, "script", 0);
+ /* Remove the subdir and its contents */
+ unlink("subdir.moved/script");
+ unlink("subdir.moved");
+ /* Shell loads via deleted subdir OK because name starts with .. */
+ fail += check_execveat(subdir_dfd_ephemeral, "../script", 0);
+ fail += check_execveat_fail(subdir_dfd_ephemeral, "script", 0, ENOENT);
+
+ /* Flag values other than AT_SYMLINK_NOFOLLOW => EINVAL */
+ fail += check_execveat_fail(dot_dfd, "execveat", 0xFFFF, EINVAL);
+ /* Invalid path => ENOENT */
+ fail += check_execveat_fail(dot_dfd, "no-such-file", 0, ENOENT);
+ fail += check_execveat_fail(dot_dfd_path, "no-such-file", 0, ENOENT);
+ fail += check_execveat_fail(AT_FDCWD, "no-such-file", 0, ENOENT);
+ /* Attempt to execute directory => EACCES */
+ fail += check_execveat_fail(dot_dfd, "", AT_EMPTY_PATH, EACCES);
+ /* Attempt to execute non-executable => EACCES */
+ fail += check_execveat_fail(dot_dfd, "Makefile", 0, EACCES);
+ fail += check_execveat_fail(fd_denatured, "", AT_EMPTY_PATH, EACCES);
+ fail += check_execveat_fail(fd_denatured_path, "", AT_EMPTY_PATH,
+ EACCES);
+ /* Attempt to execute nonsense FD => EBADF */
+ fail += check_execveat_fail(99, "", AT_EMPTY_PATH, EBADF);
+ fail += check_execveat_fail(99, "execveat", 0, EBADF);
+ /* Attempt to execute relative to non-directory => ENOTDIR */
+ fail += check_execveat_fail(fd, "execveat", 0, ENOTDIR);
+
+ fail += check_execveat_pathmax(root_dfd, "execveat", 0);
+ fail += check_execveat_pathmax(root_dfd, "script", 1);
+ return fail;
+}
+
+/*
+ * Create the scratch files that the tests rename or delete while running:
+ * copies of the test binary and of "script", a "subdir.ephemeral"
+ * directory holding a small shell script, and a FIFO named "pipe".
+ *
+ * Failures to create the helper script are reported but not fatal; the
+ * checks that depend on it will fail later.
+ */
+static void prerequisites(void)
+{
+	int fd;
+	const char *script = "#!/bin/sh\nexit $*\n";
+	size_t script_len = strlen(script);
+
+	/* Create ephemeral copies of files */
+	exe_cp("execveat", "execveat.ephemeral");
+	exe_cp("execveat", "execveat.path.ephemeral");
+	exe_cp("script", "script.ephemeral");
+	mkdir("subdir.ephemeral", 0755);
+
+	fd = open("subdir.ephemeral/script", O_RDWR|O_CREAT|O_TRUNC, 0755);
+	if (fd < 0) {
+		printf("Failed to create 'subdir.ephemeral/script', errno=%d (%s)\n",
+		       errno, strerror(errno));
+	} else {
+		if (write(fd, script, script_len) != (ssize_t)script_len)
+			printf("Failed to write 'subdir.ephemeral/script'\n");
+		close(fd);
+	}
+
+	mkfifo("pipe", 0755);
+}
+
+/*
+ * Entry point with two modes:
+ *  - no arguments: create the prerequisites, run every test and return the
+ *    number of failures;
+ *  - with arguments (as when re-executed by a test): verify that
+ *    IN_TEST=yes is in the environment and exit with the value of the last
+ *    argument.
+ */
+int main(int argc, char **argv)
+{
+	const char *verbose = getenv("VERBOSE");
+	const char *in_test;
+	int idx;
+	int status;
+
+	if (argc < 2) {
+		/* Top-level invocation: run the test suite. */
+		prerequisites();
+		if (verbose)
+			envp[1] = "VERBOSE=1";
+		status = run_tests();
+		if (status > 0)
+			printf("%d tests failed\n", status);
+		return status;
+	}
+
+	/* If we are invoked with an argument, don't run tests. */
+	in_test = getenv("IN_TEST");
+
+	if (verbose) {
+		printf(" invoked with:");
+		for (idx = 0; idx < argc; idx++)
+			printf(" [%d]='%s'", idx, argv[idx]);
+		printf("\n");
+	}
+
+	/* Check expected environment transferred. */
+	if (!in_test || strcmp(in_test, "yes") != 0) {
+		printf("[FAIL] (no IN_TEST=yes in env)\n");
+		return 1;
+	}
+
+	/* Use the final argument as an exit code. */
+	status = atoi(argv[argc - 1]);
+	fflush(stdout);
+	return status;
+}
diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c
new file mode 100644
index 000000000..d487c2f6a
--- /dev/null
+++ b/tools/testing/selftests/exec/load_address.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct Statistics {
+ unsigned long long load_address;
+ unsigned long long alignment;
+};
+
+/*
+ * dl_iterate_phdr() callback: record the load address and the largest
+ * PT_LOAD segment alignment of the executable (the object reported with an
+ * empty name) into the struct Statistics passed through data.
+ *
+ * Returns 0 for any named object so that iteration continues, and 1 once
+ * the executable has been processed, which stops the iteration and becomes
+ * the return value of dl_iterate_phdr().
+ */
+int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data)
+{
+	struct Statistics *stats = (struct Statistics *) data;
+	int i;
+
+	if (info->dlpi_name != NULL && info->dlpi_name[0] != '\0') {
+		/*
+		 * Ignore headers from other than the executable. A non-zero
+		 * return would end the walk, so return 0 to keep going.
+		 */
+		return 0;
+	}
+
+	stats->load_address = (unsigned long long) info->dlpi_addr;
+	stats->alignment = 0;
+
+	for (i = 0; i < info->dlpi_phnum; i++) {
+		if (info->dlpi_phdr[i].p_type != PT_LOAD)
+			continue;
+
+		if (info->dlpi_phdr[i].p_align > stats->alignment)
+			stats->alignment = info->dlpi_phdr[i].p_align;
+	}
+
+	return 1;  // Terminate dl_iterate_phdr.
+}
+
+/*
+ * Check that the executable's load address is a multiple of the largest
+ * PT_LOAD alignment found in its program headers. Prints PASS or a
+ * failure reason on stderr; returns 0 on success and 1 on failure.
+ */
+int main(int argc, char **argv)
+{
+	struct Statistics stats;
+	unsigned long long align_mask;
+
+	/* The callback yields 1 only after it has filled in the statistics. */
+	if (dl_iterate_phdr(ExtractStatistics, &stats) != 1) {
+		fprintf(stderr, "FAILED\n");
+		return 1;
+	}
+
+	if (stats.alignment == 0) {
+		fprintf(stderr, "No alignment found\n");
+		return 1;
+	}
+
+	align_mask = stats.alignment - 1;
+	if (stats.alignment & align_mask) {
+		fprintf(stderr, "Alignment is not a power of 2\n");
+		return 1;
+	}
+
+	if (stats.load_address & align_mask) {
+		printf("alignment = %llu, load_address = %llu\n",
+		       stats.alignment, stats.load_address);
+		fprintf(stderr, "FAILED\n");
+		return 1;
+	}
+
+	fprintf(stderr, "PASS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/exec/non-regular.c b/tools/testing/selftests/exec/non-regular.c
new file mode 100644
index 000000000..cd3a34aca
--- /dev/null
+++ b/tools/testing/selftests/exec/non-regular.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+
+#include "../kselftest_harness.h"
+
+/*
+ * Remove a file, ignoring the result if it didn't exist.
+ *
+ * Uses rmdir() when is_dir is non-zero and unlink() otherwise. A failure
+ * with any errno other than ENOENT trips an ASSERT on the given test
+ * metadata.
+ */
+void rm(struct __test_metadata *_metadata, const char *pathname,
+ int is_dir)
+{
+ int rc;
+
+ if (is_dir)
+ rc = rmdir(pathname);
+ else
+ rc = unlink(pathname);
+
+ if (rc < 0) {
+ /* Only "did not exist" is an acceptable failure. */
+ ASSERT_EQ(errno, ENOENT) {
+ TH_LOG("Not ENOENT: %s", pathname);
+ }
+ } else {
+ ASSERT_EQ(rc, 0) {
+ TH_LOG("Failed to remove: %s", pathname);
+ }
+ }
+}
+
+FIXTURE(file) {
+ char *pathname;
+ int is_dir;
+};
+
+FIXTURE_VARIANT(file)
+{
+ const char *name;
+ int expected;
+ int is_dir;
+ void (*setup)(struct __test_metadata *_metadata,
+ FIXTURE_DATA(file) *self,
+ const FIXTURE_VARIANT(file) *variant);
+ int major, minor, mode; /* for mknod() */
+};
+
+/*
+ * Variant setup: make self->pathname a symlink to the first candidate
+ * 'true' binary that access() reports as executable. Asserts if the
+ * symlink cannot be created or no candidate is usable.
+ */
+void setup_link(struct __test_metadata *_metadata,
+ FIXTURE_DATA(file) *self,
+ const FIXTURE_VARIANT(file) *variant)
+{
+ const char * const paths[] = {
+ "/bin/true",
+ "/usr/bin/true",
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(paths); i++) {
+ if (access(paths[i], X_OK) == 0) {
+ ASSERT_EQ(symlink(paths[i], self->pathname), 0);
+ return;
+ }
+ }
+ /* No candidate was executable: force an assertion failure. */
+ ASSERT_EQ(1, 0) {
+ TH_LOG("Could not find viable 'true' binary");
+ }
+}
+
+FIXTURE_VARIANT_ADD(file, S_IFLNK)
+{
+ .name = "S_IFLNK",
+ .expected = ELOOP,
+ .setup = setup_link,
+};
+
+void setup_dir(struct __test_metadata *_metadata,
+ FIXTURE_DATA(file) *self,
+ const FIXTURE_VARIANT(file) *variant)
+{
+ ASSERT_EQ(mkdir(self->pathname, 0755), 0);
+}
+
+FIXTURE_VARIANT_ADD(file, S_IFDIR)
+{
+ .name = "S_IFDIR",
+ .is_dir = 1,
+ .expected = EACCES,
+ .setup = setup_dir,
+};
+
+/*
+ * Variant setup: create a device node at self->pathname with mknod(),
+ * using the variant's major/minor numbers and file-type mode bits (with
+ * permissions 0755). If mknod() fails with EPERM the test is skipped;
+ * any other failure trips the assertion.
+ */
+void setup_node(struct __test_metadata *_metadata,
+ FIXTURE_DATA(file) *self,
+ const FIXTURE_VARIANT(file) *variant)
+{
+ dev_t dev;
+ int rc;
+
+ dev = makedev(variant->major, variant->minor);
+ rc = mknod(self->pathname, 0755 | variant->mode, dev);
+ ASSERT_EQ(rc, 0) {
+ if (errno == EPERM)
+ SKIP(return, "Please run as root; cannot mknod(%s)",
+ variant->name);
+ }
+}
+
+FIXTURE_VARIANT_ADD(file, S_IFBLK)
+{
+ .name = "S_IFBLK",
+ .expected = EACCES,
+ .setup = setup_node,
+ /* /dev/loop0 */
+ .major = 7,
+ .minor = 0,
+ .mode = S_IFBLK,
+};
+
+FIXTURE_VARIANT_ADD(file, S_IFCHR)
+{
+ .name = "S_IFCHR",
+ .expected = EACCES,
+ .setup = setup_node,
+ /* /dev/zero */
+ .major = 1,
+ .minor = 5,
+ .mode = S_IFCHR,
+};
+
+void setup_fifo(struct __test_metadata *_metadata,
+ FIXTURE_DATA(file) *self,
+ const FIXTURE_VARIANT(file) *variant)
+{
+ ASSERT_EQ(mkfifo(self->pathname, 0755), 0);
+}
+
+FIXTURE_VARIANT_ADD(file, S_IFIFO)
+{
+ .name = "S_IFIFO",
+ .expected = EACCES,
+ .setup = setup_fifo,
+};
+
+/*
+ * Build the per-variant pathname "<variant name>.test", remove any
+ * leftover object at that path, then let the variant's setup callback
+ * create the non-regular file under test.
+ */
+FIXTURE_SETUP(file)
+{
+ /* asprintf() returns the formatted length; it must exceed 6. */
+ ASSERT_GT(asprintf(&self->pathname, "%s.test", variant->name), 6);
+ self->is_dir = variant->is_dir;
+
+ rm(_metadata, self->pathname, variant->is_dir);
+ variant->setup(_metadata, self, variant);
+}
+
+FIXTURE_TEARDOWN(file)
+{
+ rm(_metadata, self->pathname, self->is_dir);
+}
+
+/*
+ * execv() of the fixture's non-regular file must fail, and errno must
+ * match the value the variant expects.
+ */
+TEST_F(file, exec_errno)
+{
+ char * const argv[2] = { (char * const)self->pathname, NULL };
+
+ EXPECT_LT(execv(argv[0], argv), 0);
+ EXPECT_EQ(errno, variant->expected);
+}
+
+/* S_IFSOCK */
+FIXTURE(sock)
+{
+ int fd;
+};
+
+FIXTURE_SETUP(sock)
+{
+ self->fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_GE(self->fd, 0);
+}
+
+FIXTURE_TEARDOWN(sock)
+{
+ if (self->fd >= 0)
+ ASSERT_EQ(close(self->fd), 0);
+}
+
+/*
+ * fexecve() on the fixture's socket descriptor must fail with EACCES.
+ */
+TEST_F(sock, exec_errno)
+{
+ char * const argv[2] = { " magic socket ", NULL };
+ char * const envp[1] = { NULL };
+
+ EXPECT_LT(fexecve(self->fd, argv, envp), 0);
+ EXPECT_EQ(errno, EACCES);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/exec/recursion-depth.c b/tools/testing/selftests/exec/recursion-depth.c
new file mode 100644
index 000000000..2dbd5bc45
--- /dev/null
+++ b/tools/testing/selftests/exec/recursion-depth.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that pointing #! script interpreter to self doesn't recurse. */
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+/*
+ * In a private mount namespace, mount a ramfs on /tmp, create an
+ * executable script whose "#!" interpreter is the script itself, and
+ * check that executing it fails with ELOOP.
+ *
+ * Returns 0 on success, 4 when unshare() fails with ENOSYS or EPERM, and
+ * 1 on any other error.
+ */
+int main(void)
+{
+#define FILENAME "/tmp/1"
+#define S "#!" FILENAME "\n"
+	int fd;
+	int rv;
+
+	if (unshare(CLONE_NEWNS) == -1) {
+		int err = errno;
+
+		fprintf(stderr, "error: unshare, errno %d\n", err);
+		return (err == ENOSYS || err == EPERM) ? 4 : 1;
+	}
+	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+		fprintf(stderr, "error: mount '/', errno %d\n", errno);
+		return 1;
+	}
+	/* Require "exec" filesystem. */
+	if (mount(NULL, "/tmp", "ramfs", 0, NULL) == -1) {
+		fprintf(stderr, "error: mount ramfs, errno %d\n", errno);
+		return 1;
+	}
+
+	fd = creat(FILENAME, 0700);
+	if (fd == -1) {
+		fprintf(stderr, "error: creat, errno %d\n", errno);
+		return 1;
+	}
+	/* The script names itself as its own interpreter. */
+	if (write(fd, S, strlen(S)) != strlen(S)) {
+		fprintf(stderr, "error: write, errno %d\n", errno);
+		return 1;
+	}
+	close(fd);
+
+	rv = execve(FILENAME, NULL, NULL);
+	if (rv != -1 || errno != ELOOP) {
+		fprintf(stderr, "error: execve, rv %d, errno %d\n", rv, errno);
+		return 1;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
new file mode 100644
index 000000000..f0c0ff20d
--- /dev/null
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+dnotify_test
+devpts_pts
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
new file mode 100644
index 000000000..129880fb4
--- /dev/null
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I../../../../usr/include/
+TEST_GEN_PROGS := devpts_pts
+TEST_GEN_PROGS_EXTENDED := dnotify_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/binderfs/.gitignore b/tools/testing/selftests/filesystems/binderfs/.gitignore
new file mode 100644
index 000000000..8e5cf9084
--- /dev/null
+++ b/tools/testing/selftests/filesystems/binderfs/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+binderfs_test
diff --git a/tools/testing/selftests/filesystems/binderfs/Makefile b/tools/testing/selftests/filesystems/binderfs/Makefile
new file mode 100644
index 000000000..8af25ae96
--- /dev/null
+++ b/tools/testing/selftests/filesystems/binderfs/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I../../../../../usr/include/ -pthread
+TEST_GEN_PROGS := binderfs_test
+
+binderfs_test: binderfs_test.c ../../kselftest.h ../../kselftest_harness.h
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/binderfs/binderfs_test.c b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
new file mode 100644
index 000000000..477cbb042
--- /dev/null
+++ b/tools/testing/selftests/filesystems/binderfs/binderfs_test.c
@@ -0,0 +1,521 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/fsuid.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/android/binder.h>
+#include <linux/android/binderfs.h>
+
+#include "../../kselftest_harness.h"
+
+#define DEFAULT_THREADS 4
+
+#define PTR_TO_INT(p) ((int)((intptr_t)(p)))
+#define INT_TO_PTR(u) ((void *)((intptr_t)(u)))
+
+#define close_prot_errno_disarm(fd) \
+ if (fd >= 0) { \
+ int _e_ = errno; \
+ close(fd); \
+ errno = _e_; \
+ fd = -EBADF; \
+ }
+
+/*
+ * Move the calling process into a new mount namespace and remount "/"
+ * recursively as private. Either step failing trips an ASSERT.
+ */
+static void change_mountns(struct __test_metadata *_metadata)
+{
+ int ret;
+
+ ret = unshare(CLONE_NEWNS);
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("%s - Failed to unshare mount namespace",
+ strerror(errno));
+ }
+
+ ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("%s - Failed to mount / as private",
+ strerror(errno));
+ }
+}
+
+/*
+ * Exercise the basic binderfs operations inside a private mount namespace:
+ * mount binderfs on a fresh temporary directory, allocate a "my-binder"
+ * device through binder-control, query its version, delete it, and verify
+ * that binder-control itself cannot be deleted (EPERM).
+ *
+ * The test is skipped when mounting fails with ENODEV. The temporary mount
+ * point is removed on every path that created it, including the skip path.
+ *
+ * Returns 0 when every step succeeded and 1 otherwise.
+ */
+static int __do_binderfs_test(struct __test_metadata *_metadata)
+{
+	int fd, ret, saved_errno, result = 1;
+	struct binderfs_device device = { 0 };
+	struct binder_version version = { 0 };
+	char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX",
+		device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME];
+
+	change_mountns(_metadata);
+
+	EXPECT_NE(mkdtemp(binderfs_mntpt), NULL) {
+		TH_LOG("%s - Failed to create binderfs mountpoint",
+			strerror(errno));
+		goto out;
+	}
+
+	ret = mount(NULL, binderfs_mntpt, "binder", 0, 0);
+	EXPECT_EQ(ret, 0) {
+		/* Still remove the mount point created above when skipping. */
+		if (errno == ENODEV)
+			SKIP(goto rmdir, "binderfs missing");
+		TH_LOG("%s - Failed to mount binderfs", strerror(errno));
+		goto rmdir;
+	}
+
+	/* success: binderfs mounted */
+
+	/* device is zero-initialised, so the name stays NUL-terminated. */
+	memcpy(device.name, "my-binder", strlen("my-binder"));
+
+	snprintf(device_path, sizeof(device_path), "%s/binder-control", binderfs_mntpt);
+	fd = open(device_path, O_RDONLY | O_CLOEXEC);
+	EXPECT_GE(fd, 0) {
+		TH_LOG("%s - Failed to open binder-control device",
+			strerror(errno));
+		goto umount;
+	}
+
+	ret = ioctl(fd, BINDER_CTL_ADD, &device);
+	/* Preserve the ioctl's errno across close() for the log below. */
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	EXPECT_GE(ret, 0) {
+		TH_LOG("%s - Failed to allocate new binder device",
+			strerror(errno));
+		goto umount;
+	}
+
+	TH_LOG("Allocated new binder device with major %d, minor %d, and name %s",
+		device.major, device.minor, device.name);
+
+	/* success: binder device allocation */
+
+	snprintf(device_path, sizeof(device_path), "%s/my-binder", binderfs_mntpt);
+	fd = open(device_path, O_CLOEXEC | O_RDONLY);
+	EXPECT_GE(fd, 0) {
+		TH_LOG("%s - Failed to open my-binder device",
+			strerror(errno));
+		goto umount;
+	}
+
+	ret = ioctl(fd, BINDER_VERSION, &version);
+	saved_errno = errno;
+	close(fd);
+	errno = saved_errno;
+	EXPECT_GE(ret, 0) {
+		TH_LOG("%s - Failed to open perform BINDER_VERSION request",
+			strerror(errno));
+		goto umount;
+	}
+
+	TH_LOG("Detected binder version: %d", version.protocol_version);
+
+	/* success: binder transaction with binderfs binder device */
+
+	ret = unlink(device_path);
+	EXPECT_EQ(ret, 0) {
+		TH_LOG("%s - Failed to delete binder device",
+			strerror(errno));
+		goto umount;
+	}
+
+	/* success: binder device removal */
+
+	snprintf(device_path, sizeof(device_path), "%s/binder-control", binderfs_mntpt);
+	ret = unlink(device_path);
+	EXPECT_NE(ret, 0) {
+		TH_LOG("Managed to delete binder-control device");
+		goto umount;
+	}
+	EXPECT_EQ(errno, EPERM) {
+		TH_LOG("%s - Failed to delete binder-control device but exited with unexpected error code",
+			strerror(errno));
+		goto umount;
+	}
+
+	/* success: binder-control device removal failed as expected */
+	result = 0;
+
+umount:
+	ret = umount2(binderfs_mntpt, MNT_DETACH);
+	EXPECT_EQ(ret, 0) {
+		TH_LOG("%s - Failed to unmount binderfs", strerror(errno));
+	}
+rmdir:
+	ret = rmdir(binderfs_mntpt);
+	EXPECT_EQ(ret, 0) {
+		TH_LOG("%s - Failed to rmdir binderfs mount", strerror(errno));
+	}
+out:
+	return result;
+}
+
+/*
+ * Wait for the child pid to terminate, retrying when waitpid() is
+ * interrupted by a signal.
+ *
+ * Returns the child's exit status if it exited normally, or -1 if
+ * waitpid() failed or the child did not exit normally.
+ */
+static int wait_for_pid(pid_t pid)
+{
+	int status;
+
+	while (waitpid(pid, &status, 0) == -1) {
+		if (errno != EINTR)
+			return -1;
+	}
+
+	return WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+}
+
+/*
+ * Switch the caller's uid and gid to 0, then set the filesystem uid and
+ * gid to 0 as well. Returns 0 on success, or -1 if setuid() or setgid()
+ * fails.
+ */
+static int setid_userns_root(void)
+{
+	/* setgid() is only attempted once setuid() has succeeded. */
+	if (setuid(0) || setgid(0))
+		return -1;
+
+	setfsuid(0);
+	setfsgid(0);
+
+	return 0;
+}
+
+enum idmap_type {
+ UID_MAP,
+ GID_MAP,
+};
+
+/*
+ * read(2) wrapper that restarts the call whenever it fails with EINTR.
+ * Returns whatever the final read() returned.
+ */
+static ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t nread;
+
+	do {
+		nread = read(fd, buf, count);
+	} while (nread < 0 && errno == EINTR);
+
+	return nread;
+}
+
+/*
+ * write(2) wrapper that restarts the call whenever it fails with EINTR.
+ * Returns whatever the final write() returned.
+ */
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t nwritten;
+
+	do {
+		nwritten = write(fd, buf, count);
+	} while (nwritten < 0 && errno == EINTR);
+
+	return nwritten;
+}
+
+/*
+ * Write the mapping text in buf (buf_size bytes) to /proc/<pid>/uid_map or
+ * /proc/<pid>/gid_map, selected by type.
+ *
+ * For GID_MAP, "deny" is first written to /proc/<pid>/setgroups when that
+ * file can be opened; a missing file (ENOENT) is tolerated.
+ *
+ * Returns 0 on success, -E2BIG if the proc path could not be formatted
+ * into the local buffer, and -1 on any other failure.
+ */
+static int write_id_mapping(enum idmap_type type, pid_t pid, const char *buf,
+ size_t buf_size)
+{
+ int fd;
+ int ret;
+ char path[4096];
+
+ if (type == GID_MAP) {
+ int setgroups_fd;
+
+ snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
+ setgroups_fd = open(path, O_WRONLY | O_CLOEXEC | O_NOFOLLOW);
+ if (setgroups_fd < 0 && errno != ENOENT)
+ return -1;
+
+ if (setgroups_fd >= 0) {
+ ret = write_nointr(setgroups_fd, "deny", sizeof("deny") - 1);
+ close_prot_errno_disarm(setgroups_fd);
+ if (ret != sizeof("deny") - 1)
+ return -1;
+ }
+ }
+
+ switch (type) {
+ case UID_MAP:
+ ret = snprintf(path, sizeof(path), "/proc/%d/uid_map", pid);
+ break;
+ case GID_MAP:
+ ret = snprintf(path, sizeof(path), "/proc/%d/gid_map", pid);
+ break;
+ default:
+ return -1;
+ }
+ /* Reject a formatting error or a truncated path. */
+ if (ret < 0 || ret >= sizeof(path))
+ return -E2BIG;
+
+ fd = open(path, O_WRONLY | O_CLOEXEC | O_NOFOLLOW);
+ if (fd < 0)
+ return -1;
+
+ ret = write_nointr(fd, buf, buf_size);
+ close_prot_errno_disarm(fd);
+ /* A short write counts as a failure. */
+ if (ret != buf_size)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Enter a new user namespace and become uid/gid 0 inside it.
+ *
+ * Uses syncfds[0] for a one-byte handshake: after unshare(CLONE_NEWUSER)
+ * a byte is written, then a byte is read back before calling
+ * setid_userns_root(). syncfds[1] is closed up front.
+ * NOTE(review): presumably the peer on syncfds[1] installs the id
+ * mappings between the two handshake bytes -- confirm against the caller.
+ */
+static void change_userns(struct __test_metadata *_metadata, int syncfds[2])
+{
+ int ret;
+ char buf;
+
+ close_prot_errno_disarm(syncfds[1]);
+
+ ret = unshare(CLONE_NEWUSER);
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("%s - Failed to unshare user namespace",
+ strerror(errno));
+ }
+
+ ret = write_nointr(syncfds[0], "1", 1);
+ ASSERT_EQ(ret, 1) {
+ TH_LOG("write_nointr() failed");
+ }
+
+ ret = read_nointr(syncfds[0], &buf, 1);
+ ASSERT_EQ(ret, 1) {
+ TH_LOG("read_nointr() failed");
+ }
+
+ close_prot_errno_disarm(syncfds[0]);
+
+ ASSERT_EQ(setid_userns_root(), 0) {
+ TH_LOG("setid_userns_root() failed");
+ }
+}
+
+/*
+ * Install uid and gid mappings for process pid that map id 0 to the
+ * caller's own uid and gid (one id each).
+ *
+ * Uses syncfds[1] for a one-byte handshake: a byte is read before the
+ * mappings are written and a byte is written afterwards. syncfds[0] is
+ * closed up front.
+ */
+static void change_idmaps(struct __test_metadata *_metadata, int syncfds[2], pid_t pid)
+{
+ int ret;
+ char buf;
+ char id_map[4096];
+
+ close_prot_errno_disarm(syncfds[0]);
+
+ ret = read_nointr(syncfds[1], &buf, 1);
+ ASSERT_EQ(ret, 1) {
+ TH_LOG("read_nointr() failed");
+ }
+
+ snprintf(id_map, sizeof(id_map), "0 %d 1\n", getuid());
+ ret = write_id_mapping(UID_MAP, pid, id_map, strlen(id_map));
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("write_id_mapping(UID_MAP) failed");
+ }
+
+ snprintf(id_map, sizeof(id_map), "0 %d 1\n", getgid());
+ ret = write_id_mapping(GID_MAP, pid, id_map, strlen(id_map));
+ ASSERT_EQ(ret, 0) {
+ TH_LOG("write_id_mapping(GID_MAP) failed");
+ }
+
+ ret = write_nointr(syncfds[1], "1", 1);
+ ASSERT_EQ(ret, 1) {
+ TH_LOG("write_nointr() failed");
+ }
+
+ close_prot_errno_disarm(syncfds[1]);
+}
+
+/* Test metadata made available to binder_version_thread() for logging. */
+struct __test_metadata *_thread_metadata;
+/*
+ * Thread body: issue a BINDER_VERSION ioctl on the file descriptor encoded
+ * in data, log a message if it fails, and exit the thread with data as its
+ * result.
+ */
+static void *binder_version_thread(void *data)
+{
+ struct __test_metadata *_metadata = _thread_metadata;
+ int fd = PTR_TO_INT(data);
+ struct binder_version version = { 0 };
+ int ret;
+
+ ret = ioctl(fd, BINDER_VERSION, &version);
+ if (ret < 0)
+ TH_LOG("%s - Failed to open perform BINDER_VERSION request\n",
+ strerror(errno));
+
+ pthread_exit(data);
+}
+
+/*
+ * Regression test:
+ * 2669b8b0c798 ("binder: prevent UAF for binderfs devices")
+ * f0fe2c0f050d ("binder: prevent UAF for binderfs devices II")
+ * 211b64e4b5b6 ("binderfs: use refcount for binder control devices too")
+ *
+ * A forked child calls change_userns() and change_mountns(), mounts
+ * binderfs on a temporary directory, allocates ARRAY_SIZE(fds) binder
+ * devices and keeps each of them open. It then lazily unmounts binderfs
+ * and issues BINDER_VERSION ioctls on the still-open descriptors from up to
+ * DEFAULT_THREADS threads per descriptor. The parent writes the child's id
+ * maps and expects the child to exit successfully.
+ */
+TEST(binderfs_stress)
+{
+	int fds[1000];
+	int syncfds[2];
+	pid_t pid;
+	int fd, ret;
+	struct binderfs_device device = { 0 };
+	char binderfs_mntpt[] = P_tmpdir "/binderfs_XXXXXX",
+		device_path[sizeof(P_tmpdir "/binderfs_XXXXXX/") + BINDERFS_MAX_NAME];
+
+	ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, syncfds);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("%s - Failed to create socket pair", strerror(errno));
+	}
+
+	pid = fork();
+	ASSERT_GE(pid, 0) {
+		TH_LOG("%s - Failed to fork", strerror(errno));
+		close_prot_errno_disarm(syncfds[0]);
+		close_prot_errno_disarm(syncfds[1]);
+	}
+
+	if (pid == 0) {
+		int i, j, k, nthreads;
+		pthread_attr_t attr;
+		pthread_t threads[DEFAULT_THREADS];
+
+		change_userns(_metadata, syncfds);
+		change_mountns(_metadata);
+
+		ASSERT_NE(mkdtemp(binderfs_mntpt), NULL) {
+			TH_LOG("%s - Failed to create binderfs mountpoint",
+			       strerror(errno));
+		}
+
+		ret = mount(NULL, binderfs_mntpt, "binder", 0, 0);
+		ASSERT_EQ(ret, 0) {
+			TH_LOG("%s - Failed to mount binderfs", strerror(errno));
+		}
+
+		/* Allocate one device per slot and keep it open. */
+		for (i = 0; i < ARRAY_SIZE(fds); i++) {
+			snprintf(device_path, sizeof(device_path),
+				 "%s/binder-control", binderfs_mntpt);
+			fd = open(device_path, O_RDONLY | O_CLOEXEC);
+			ASSERT_GE(fd, 0) {
+				TH_LOG("%s - Failed to open binder-control device",
+				       strerror(errno));
+			}
+
+			memset(&device, 0, sizeof(device));
+			snprintf(device.name, sizeof(device.name), "%d", i);
+			ret = ioctl(fd, BINDER_CTL_ADD, &device);
+			close_prot_errno_disarm(fd);
+			ASSERT_EQ(ret, 0) {
+				TH_LOG("%s - Failed to allocate new binder device",
+				       strerror(errno));
+			}
+
+			snprintf(device_path, sizeof(device_path), "%s/%d",
+				 binderfs_mntpt, i);
+			fds[i] = open(device_path, O_RDONLY | O_CLOEXEC);
+			ASSERT_GE(fds[i], 0) {
+				TH_LOG("%s - Failed to open binder device", strerror(errno));
+			}
+		}
+
+		/* Detach the filesystem while every device fd is still open. */
+		ret = umount2(binderfs_mntpt, MNT_DETACH);
+		ASSERT_EQ(ret, 0) {
+			TH_LOG("%s - Failed to unmount binderfs", strerror(errno));
+			rmdir(binderfs_mntpt);
+		}
+
+		/* Best effort: do not leave the temporary mountpoint behind. */
+		rmdir(binderfs_mntpt);
+
+		nthreads = get_nprocs_conf();
+		if (nthreads > DEFAULT_THREADS)
+			nthreads = DEFAULT_THREADS;
+
+		_thread_metadata = _metadata;
+		pthread_attr_init(&attr);
+		for (k = 0; k < ARRAY_SIZE(fds); k++) {
+			for (i = 0; i < nthreads; i++) {
+				ret = pthread_create(&threads[i], &attr, binder_version_thread, INT_TO_PTR(fds[k]));
+				if (ret) {
+					/* pthread_*() return the error; errno is not set. */
+					TH_LOG("%s - Failed to create thread %d",
+					       strerror(ret), i);
+					break;
+				}
+			}
+
+			/* Only join the threads that were actually started. */
+			for (j = 0; j < i; j++) {
+				void *fdptr = NULL;
+
+				ret = pthread_join(threads[j], &fdptr);
+				if (ret)
+					TH_LOG("%s - Failed to join thread %d for fd %d",
+					       strerror(ret), j, PTR_TO_INT(fdptr));
+			}
+		}
+		pthread_attr_destroy(&attr);
+
+		for (k = 0; k < ARRAY_SIZE(fds); k++)
+			close(fds[k]);
+
+		exit(EXIT_SUCCESS);
+	}
+
+	change_idmaps(_metadata, syncfds, pid);
+
+	ret = wait_for_pid(pid);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("wait_for_pid() failed");
+	}
+}
+
+/*
+ * Run the binderfs checks in the current process. Skipped unless the
+ * effective uid is 0, and also when __do_binderfs_test() returns non-zero.
+ */
+TEST(binderfs_test_privileged)
+{
+	int unavailable;
+
+	if (geteuid() != 0)
+		SKIP(return, "Tests are not run as root. Skipping privileged tests");
+
+	unavailable = __do_binderfs_test(_metadata);
+	if (unavailable)
+		SKIP(return, "The Android binderfs filesystem is not available");
+}
+
+/*
+ * Run the binderfs checks from a forked child after change_userns(). The
+ * parent writes the child's id maps and then reaps it; a child exit status
+ * of 2 is reported as a skip.
+ */
+TEST(binderfs_test_unprivileged)
+{
+	int syncfds[2];
+	pid_t pid;
+	int ret;
+
+	ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, syncfds);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("%s - Failed to create socket pair", strerror(errno));
+	}
+
+	pid = fork();
+	ASSERT_GE(pid, 0) {
+		close_prot_errno_disarm(syncfds[0]);
+		close_prot_errno_disarm(syncfds[1]);
+		TH_LOG("%s - Failed to fork", strerror(errno));
+	}
+
+	if (pid == 0) {
+		/* Child: 2 means "binderfs unavailable" to the parent. */
+		change_userns(_metadata, syncfds);
+		exit(__do_binderfs_test(_metadata) ? 2 : EXIT_SUCCESS);
+	}
+
+	change_idmaps(_metadata, syncfds, pid);
+
+	ret = wait_for_pid(pid);
+	if (ret == 2)
+		SKIP(return, "The Android binderfs filesystem is not available");
+
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("wait_for_pid() failed");
+	}
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/filesystems/binderfs/config b/tools/testing/selftests/filesystems/binderfs/config
new file mode 100644
index 000000000..02dd6cc9c
--- /dev/null
+++ b/tools/testing/selftests/filesystems/binderfs/config
@@ -0,0 +1,3 @@
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDERFS=y
+CONFIG_ANDROID_BINDER_IPC=y
diff --git a/tools/testing/selftests/filesystems/devpts_pts.c b/tools/testing/selftests/filesystems/devpts_pts.c
new file mode 100644
index 000000000..b1fc9b916
--- /dev/null
+++ b/tools/testing/selftests/filesystems/devpts_pts.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <asm/ioctls.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include "../kselftest.h"
+
+/* dup2() wrapper: true when @duplicate was installed as @original. */
+static bool terminal_dup2(int duplicate, int original)
+{
+	return dup2(duplicate, original) >= 0;
+}
+
+/*
+ * Duplicate @fd onto stdin, stdout and stderr, in that order.
+ * A negative @fd is a no-op. Returns 0 on success, -1 if any dup2() fails.
+ */
+static int terminal_set_stdfds(int fd)
+{
+	static const int std_fds[] = {
+		STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO,
+	};
+	size_t idx;
+
+	if (fd < 0)
+		return 0;
+
+	for (idx = 0; idx < sizeof(std_fds) / sizeof(std_fds[0]); idx++) {
+		if (!terminal_dup2(fd, std_fds[idx]))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Start a new session, issue TIOCSCTTY on @fd and duplicate it onto the
+ * three standard descriptors. @fd itself is closed afterwards unless it is
+ * one of them. Returns 0 on success, -1 on failure.
+ */
+static int login_pty(int fd)
+{
+	/* The result of setsid() is deliberately not checked. */
+	setsid();
+
+	if (ioctl(fd, TIOCSCTTY, NULL) < 0)
+		return -1;
+
+	if (terminal_set_stdfds(fd) < 0)
+		return -1;
+
+	if (fd > STDERR_FILENO)
+		close(fd);
+
+	return 0;
+}
+
+/*
+ * Reap child @pid, retrying when waitpid() is interrupted.
+ * Returns 0 only if the child exited normally with status 0, otherwise -1.
+ */
+static int wait_for_pid(pid_t pid)
+{
+	int status;
+
+	for (;;) {
+		int reaped = waitpid(pid, &status, 0);
+
+		if (reaped == -1) {
+			if (errno == EINTR)
+				continue;
+			return -1;
+		}
+
+		/* Keep waiting until the requested child is the one reaped. */
+		if (reaped == pid)
+			break;
+	}
+
+	return (WIFEXITED(status) && WEXITSTATUS(status) == 0) ? 0 : -1;
+}
+
+/*
+ * Read the target of /proc/self/fd/<fd> into @buf and NUL-terminate it.
+ * Returns 0 on success; -1 if the path cannot be built, readlink() fails or
+ * the target does not fit into @buflen bytes including the terminator.
+ */
+static int resolve_procfd_symlink(int fd, char *buf, size_t buflen)
+{
+	char link_path[4096];
+	int len;
+
+	len = snprintf(link_path, sizeof(link_path), "/proc/self/fd/%d", fd);
+	if (len < 0 || len >= (int)sizeof(link_path))
+		return -1;
+
+	len = readlink(link_path, buf, buflen);
+	if (len < 0 || (size_t)len >= buflen)
+		return -1;
+
+	/* readlink() does not terminate the string. */
+	buf[len] = '\0';
+
+	return 0;
+}
+
+/*
+ * Open the pty master at @ptmx, obtain its peer with the TIOCGPTPEER ioctl
+ * and, in a forked child that adopts the peer as its terminal, check that
+ * the /proc/self/fd/<stdin> symlink starts with @expected_procfd_contents.
+ *
+ * Returns EXIT_SUCCESS when the child confirmed the path, KSFT_SKIP when the
+ * ioctl fails with EINVAL (or TIOCGPTPEER is unknown at build time),
+ * EXIT_FAILURE for any other ioctl error, and -1 for every other failure.
+ * NOTE(review): the callers in this file only treat a negative value as an
+ * error, so EXIT_FAILURE and KSFT_SKIP are not reported by them - confirm
+ * that this is intended.
+ */
+static int do_tiocgptpeer(char *ptmx, char *expected_procfd_contents)
+{
+	int ret;
+	int master = -1, slave = -1, fret = -1;
+	pid_t pid;
+
+	master = open(ptmx, O_RDWR | O_NOCTTY | O_CLOEXEC);
+	if (master < 0) {
+		fprintf(stderr, "Failed to open \"%s\": %s\n", ptmx,
+			strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * grantpt() makes assumptions about /dev/pts/ so ignore it. It's also
+	 * not really needed.
+	 */
+	ret = unlockpt(master);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to unlock terminal\n");
+		goto do_cleanup;
+	}
+
+#ifdef TIOCGPTPEER
+	slave = ioctl(master, TIOCGPTPEER, O_RDWR | O_NOCTTY | O_CLOEXEC);
+#else
+	/* No ioctl was issued: make the check below take the skip path. */
+	errno = EINVAL;
+#endif
+	if (slave < 0) {
+		if (errno == EINVAL) {
+			fprintf(stderr, "TIOCGPTPEER is not supported. "
+					"Skipping test.\n");
+			fret = KSFT_SKIP;
+		} else {
+			fprintf(stderr,
+				"Failed to perform TIOCGPTPEER ioctl\n");
+			fret = EXIT_FAILURE;
+		}
+		goto do_cleanup;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "Failed to fork: %s\n", strerror(errno));
+		goto do_cleanup;
+	}
+
+	if (pid == 0) {
+		char buf[4096];
+
+		ret = login_pty(slave);
+		if (ret < 0) {
+			fprintf(stderr, "Failed to setup terminal\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		ret = resolve_procfd_symlink(STDIN_FILENO, buf, sizeof(buf));
+		if (ret < 0) {
+			fprintf(stderr, "Failed to retrieve pathname of pts "
+					"slave file descriptor\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		/* Only the leading part of the link target is compared. */
+		if (strncmp(expected_procfd_contents, buf,
+			    strlen(expected_procfd_contents)) != 0) {
+			fprintf(stderr, "Received invalid contents for "
+					"\"/proc/<pid>/fd/%d\" symlink: %s\n",
+					STDIN_FILENO, buf);
+			_exit(-1);
+		}
+
+		fprintf(stderr, "Contents of \"/proc/<pid>/fd/%d\" "
+				"symlink are valid: %s\n", STDIN_FILENO, buf);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	ret = wait_for_pid(pid);
+	if (ret < 0)
+		goto do_cleanup;
+
+	fret = EXIT_SUCCESS;
+
+do_cleanup:
+	if (master >= 0)
+		close(master);
+	if (slave >= 0)
+		close(slave);
+
+	return fret;
+}
+
+/*
+ * Unmount /dev/pts (and /dev/ptmx, best effort), mount a fresh devpts
+ * instance on a temporary directory and run do_tiocgptpeer() against its
+ * ptmx node, expecting the peer path to start with that directory.
+ * The temporary mount and directory are removed again before returning.
+ * Returns 0 unless a step fails or do_tiocgptpeer() returns a negative
+ * value, in which case -1 is returned.
+ */
+static int verify_non_standard_devpts_mount(void)
+{
+	char *mntpoint;
+	int ret;
+	char devpts[] = P_tmpdir "/devpts_fs_XXXXXX";
+	char ptmx[] = P_tmpdir "/devpts_fs_XXXXXX/ptmx";
+
+	ret = umount("/dev/pts");
+	if (ret < 0) {
+		fprintf(stderr, "Failed to unmount \"/dev/pts\": %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	(void)umount("/dev/ptmx");
+
+	mntpoint = mkdtemp(devpts);
+	if (!mntpoint) {
+		fprintf(stderr, "Failed to create temporary mountpoint: %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	ret = mount("devpts", mntpoint, "devpts", MS_NOSUID | MS_NOEXEC,
+		    "newinstance,ptmxmode=0666,mode=0620,gid=5");
+	if (ret < 0) {
+		fprintf(stderr, "Failed to mount devpts fs to \"%s\" in new "
+				"mount namespace: %s\n", mntpoint,
+				strerror(errno));
+		/* mkdtemp() created a directory, so rmdir() is the right call. */
+		rmdir(mntpoint);
+		return -1;
+	}
+
+	ret = snprintf(ptmx, sizeof(ptmx), "%s/ptmx", devpts);
+	if (ret < 0 || (size_t)ret >= sizeof(ptmx))
+		ret = -1;
+	else
+		ret = do_tiocgptpeer(ptmx, mntpoint);
+
+	/* The directory can only be removed once nothing is mounted on it. */
+	(void)umount2(mntpoint, MNT_DETACH);
+	rmdir(mntpoint);
+
+	if (ret < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Bind mount /dev/pts/ptmx over /dev/ptmx and run do_tiocgptpeer() on it,
+ * expecting the peer path to start with "/dev/pts/".
+ * Returns -1 if the mount fails or do_tiocgptpeer() returns a negative
+ * value, otherwise 0.
+ */
+static int verify_ptmx_bind_mount(void)
+{
+	if (mount("/dev/pts/ptmx", "/dev/ptmx", NULL, MS_BIND, NULL) < 0) {
+		fprintf(stderr, "Failed to bind mount \"/dev/pts/ptmx\" to "
+				"\"/dev/ptmx\" mount namespace\n");
+		return -1;
+	}
+
+	return do_tiocgptpeer("/dev/ptmx", "/dev/pts/") < 0 ? -1 : 0;
+}
+
+/*
+ * Bind mount /dev/pts/ptmx onto a temporary regular file and run
+ * do_tiocgptpeer() on that path. This is the negative case: a return value
+ * of 0 from do_tiocgptpeer() makes this function fail.
+ * The bind mount and the temporary file are removed before returning.
+ * Returns 0 when do_tiocgptpeer() returned non-zero, otherwise -1.
+ */
+static int verify_invalid_ptmx_bind_mount(void)
+{
+	int ret;
+	int mntpoint_fd;	/* mkstemp() returns an int, not a char */
+	char ptmx[] = P_tmpdir "/devpts_ptmx_XXXXXX";
+
+	mntpoint_fd = mkstemp(ptmx);
+	if (mntpoint_fd < 0) {
+		fprintf(stderr, "Failed to create temporary file: %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	ret = mount("/dev/pts/ptmx", ptmx, NULL, MS_BIND, NULL);
+	close(mntpoint_fd);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to bind mount \"/dev/pts/ptmx\" to "
+				"\"%s\" mount namespace\n", ptmx);
+		unlink(ptmx);
+		return -1;
+	}
+
+	ret = do_tiocgptpeer(ptmx, "/dev/pts/");
+
+	/* Best effort cleanup of the bind mount and its backing file. */
+	(void)umount2(ptmx, MNT_DETACH);
+	unlink(ptmx);
+
+	if (ret == 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Entry point: skip unless stdin is a terminal, unshare the mount namespace
+ * and remount "/" with MS_PRIVATE | MS_REC, then run the three devpts checks
+ * in order. The first check that returns a negative value ends the program
+ * with EXIT_FAILURE.
+ */
+int main(int argc, char *argv[])
+{
+	static int (*const checks[])(void) = {
+		verify_ptmx_bind_mount,
+		verify_invalid_ptmx_bind_mount,
+		verify_non_standard_devpts_mount,
+	};
+	size_t idx;
+
+	if (!isatty(STDIN_FILENO)) {
+		fprintf(stderr, "Standard input file descriptor is not attached "
+				"to a terminal. Skipping test\n");
+		exit(KSFT_SKIP);
+	}
+
+	if (unshare(CLONE_NEWNS) < 0) {
+		fprintf(stderr, "Failed to unshare mount namespace\n");
+		exit(EXIT_FAILURE);
+	}
+
+	if (mount("", "/", NULL, MS_PRIVATE | MS_REC, 0) < 0) {
+		fprintf(stderr, "Failed to make \"/\" MS_PRIVATE in new mount "
+				"namespace\n");
+		exit(EXIT_FAILURE);
+	}
+
+	for (idx = 0; idx < sizeof(checks) / sizeof(checks[0]); idx++) {
+		if (checks[idx]() < 0)
+			exit(EXIT_FAILURE);
+	}
+
+	exit(EXIT_SUCCESS);
+}
diff --git a/tools/testing/selftests/filesystems/dnotify_test.c b/tools/testing/selftests/filesystems/dnotify_test.c
new file mode 100644
index 000000000..c0a9b2d33
--- /dev/null
+++ b/tools/testing/selftests/filesystems/dnotify_test.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* needed to get the defines */
+#include <fcntl.h> /* in glibc 2.2 this has the needed
+ values defined */
+#include <signal.h>
+#include <stdio.h>
+#include <unistd.h>
+
+static volatile int event_fd;
+
+/* Signal handler: record the file descriptor carried in the siginfo. */
+static void handler(int sig, siginfo_t *si, void *data)
+{
+	(void)sig;
+	(void)data;
+
+	event_fd = si->si_fd;
+}
+
+/*
+ * Install handler() for SIGRTMIN + 1, open the current directory and ask
+ * for dnotify events (DN_MODIFY, DN_CREATE, DN_MULTISHOT) to be delivered
+ * with that signal, then print the reported fd after every signal.
+ * Returns 1 if any setup step fails; otherwise it loops forever.
+ */
+int main(void)
+{
+	struct sigaction act = { 0 };
+	int fd;
+
+	act.sa_sigaction = handler;
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGRTMIN + 1, &act, NULL) < 0) {
+		perror("sigaction");
+		return 1;
+	}
+
+	fd = open(".", O_RDONLY);
+	if (fd < 0) {
+		perror("open");
+		return 1;
+	}
+
+	if (fcntl(fd, F_SETSIG, SIGRTMIN + 1) < 0) {
+		perror("fcntl(F_SETSIG)");
+		return 1;
+	}
+
+	if (fcntl(fd, F_NOTIFY, DN_MODIFY|DN_CREATE|DN_MULTISHOT) < 0) {
+		perror("fcntl(F_NOTIFY)");
+		return 1;
+	}
+
+	/* we will now be notified if any of the files
+	   in "." is modified or new files are created */
+	while (1) {
+		pause();
+		printf("Got event on fd=%d\n", event_fd);
+	}
+}
diff --git a/tools/testing/selftests/filesystems/epoll/.gitignore b/tools/testing/selftests/filesystems/epoll/.gitignore
new file mode 100644
index 000000000..909015725
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+epoll_wakeup_test
diff --git a/tools/testing/selftests/filesystems/epoll/Makefile b/tools/testing/selftests/filesystems/epoll/Makefile
new file mode 100644
index 000000000..78ae4aaf7
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+
+CFLAGS += -I../../../../../usr/include/
+LDLIBS += -lpthread
+TEST_GEN_PROGS := epoll_wakeup_test
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
new file mode 100644
index 000000000..8f82f99f7
--- /dev/null
+++ b/tools/testing/selftests/filesystems/epoll/epoll_wakeup_test.c
@@ -0,0 +1,3380 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <poll.h>
+#include <unistd.h>
+#include <assert.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sys/epoll.h>
+#include <sys/socket.h>
+#include <sys/eventfd.h>
+#include "../../kselftest_harness.h"
+
+struct epoll_mtcontext
+{
+ int efd[3]; /* epoll file descriptors; the waiter threads block on efd[0] */
+ int sfd[4]; /* socket fds; the emitter threads write to sfd[1] and sfd[3] */
+ volatile int count; /* updated with __sync_* builtins by threads whose wait reported events */
+
+ pthread_t main; /* signalled with SIGUSR1 by kill_timeout() */
+ pthread_t waiter; /* signalled with SIGUSR1 by kill_timeout() */
+};
+
+/* Empty handler; the tests install it for SIGUSR1 via signal(). */
+static void signal_handler(int signum)
+{
+	(void)signum;
+}
+
+/*
+ * Sleep for one second, then send SIGUSR1 to the main thread and to the
+ * waiter thread recorded in @ctx.
+ */
+static void kill_timeout(struct epoll_mtcontext *ctx)
+{
+	usleep(1000 * 1000);
+
+	pthread_kill(ctx->main, SIGUSR1);
+	pthread_kill(ctx->waiter, SIGUSR1);
+}
+
+/*
+ * Waiter thread: block in epoll_wait() on efd[0] for a single event and add
+ * 1 to ctx->count if at least one event was reported.
+ */
+static void *waiter_entry1a(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct epoll_event ev;
+
+	if (epoll_wait(ctx->efd[0], &ev, 1, -1) <= 0)
+		return NULL;
+
+	__sync_fetch_and_add(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Waiter thread: block in poll() on efd[0]; once it reports readiness, do a
+ * zero-timeout epoll_wait() for a single event and add 1 to ctx->count if
+ * one was returned.
+ */
+static void *waiter_entry1ap(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct pollfd pfd = { .fd = ctx->efd[0], .events = POLLIN };
+	struct epoll_event ev;
+
+	if (poll(&pfd, 1, -1) <= 0)
+		return NULL;
+
+	if (epoll_wait(ctx->efd[0], &ev, 1, 0) > 0)
+		__sync_fetch_and_add(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Waiter thread: block in epoll_wait() on efd[0] for a single event and OR
+ * 1 into ctx->count if at least one event was reported.
+ */
+static void *waiter_entry1o(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct epoll_event ev;
+
+	if (epoll_wait(ctx->efd[0], &ev, 1, -1) <= 0)
+		return NULL;
+
+	__sync_fetch_and_or(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Waiter thread: block in poll() on efd[0]; once it reports readiness, do a
+ * zero-timeout epoll_wait() for a single event and OR 1 into ctx->count if
+ * one was returned.
+ */
+static void *waiter_entry1op(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct pollfd pfd = { .fd = ctx->efd[0], .events = POLLIN };
+	struct epoll_event ev;
+
+	if (poll(&pfd, 1, -1) <= 0)
+		return NULL;
+
+	if (epoll_wait(ctx->efd[0], &ev, 1, 0) > 0)
+		__sync_fetch_and_or(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Waiter thread: block in epoll_wait() on efd[0] with room for two events
+ * and add 1 to ctx->count if at least one event was reported.
+ */
+static void *waiter_entry2a(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct epoll_event evs[2];
+
+	if (epoll_wait(ctx->efd[0], evs, 2, -1) <= 0)
+		return NULL;
+
+	__sync_fetch_and_add(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Waiter thread: block in poll() on efd[0]; once it reports readiness, do a
+ * zero-timeout epoll_wait() with room for two events and add 1 to
+ * ctx->count if any were returned.
+ */
+static void *waiter_entry2ap(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	struct pollfd pfd = { .fd = ctx->efd[0], .events = POLLIN };
+	struct epoll_event evs[2];
+
+	if (poll(&pfd, 1, -1) <= 0)
+		return NULL;
+
+	if (epoll_wait(ctx->efd[0], evs, 2, 0) > 0)
+		__sync_fetch_and_add(&ctx->count, 1);
+
+	return NULL;
+}
+
+/*
+ * Emitter thread: after 100 ms write one byte to sfd[1], then run
+ * kill_timeout() on the same context.
+ */
+static void *emitter_entry1(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	const char byte = 'w';
+
+	usleep(100000);
+	/* The result of write() is not checked. */
+	write(ctx->sfd[1], &byte, 1);
+
+	kill_timeout(ctx);
+
+	return NULL;
+}
+
+/*
+ * Emitter thread: after 100 ms write one byte to sfd[1] and one to sfd[3],
+ * then run kill_timeout() on the same context.
+ */
+static void *emitter_entry2(void *data)
+{
+	struct epoll_mtcontext *ctx = data;
+	const char byte = 'w';
+
+	usleep(100000);
+	/* The results of write() are not checked. */
+	write(ctx->sfd[1], &byte, 1);
+	write(ctx->sfd[3], &byte, 1);
+
+	kill_timeout(ctx);
+
+	return NULL;
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0 = socket; (ew) = the thread
+ * uses epoll_wait(), (lt) = the fd is added with EPOLLIN only.
+ *
+ * One byte is written to the peer socket and never read back; both
+ * zero-timeout epoll_wait() calls are expected to return 1.
+ */
+TEST(epoll1)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0 = socket; (ew) = the thread
+ * uses epoll_wait(), (et) = the fd is added with EPOLLIN | EPOLLET.
+ *
+ * One byte is written to the peer socket and never read back; the first
+ * zero-timeout epoll_wait() is expected to return 1 and the second 0.
+ */
+TEST(epoll2)
+{
+ int efd;
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, &e, 1, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0/s2 = sockets; (ew) = the
+ * thread uses epoll_wait(), (lt) = the fd is added with EPOLLIN only.
+ *
+ * One byte is written to the peer of each socket; both zero-timeout
+ * epoll_wait() calls (room for two events) are expected to return 2.
+ */
+TEST(epoll3)
+{
+ int efd;
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0/s2 = sockets; (ew) = the
+ * thread uses epoll_wait(), (et) = the fd is added with EPOLLIN | EPOLLET.
+ *
+ * One byte is written to the peer of each socket; the first zero-timeout
+ * epoll_wait() is expected to return 2 and the second 0.
+ */
+TEST(epoll4)
+{
+ int efd;
+ int sfd[4];
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0 = socket; (p) = the thread
+ * first uses poll() on the epoll fd, (lt) = the fd is added with EPOLLIN
+ * only.
+ *
+ * One byte is written and never read back; in both rounds poll() on the
+ * epoll fd and the following zero-timeout epoll_wait() must each return 1.
+ */
+TEST(epoll5)
+{
+ int efd;
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0 = socket; (p) = the thread
+ * first uses poll() on the epoll fd, (et) = the fd is added with
+ * EPOLLIN | EPOLLET.
+ *
+ * One byte is written and never read back; the first poll()/epoll_wait()
+ * round must return 1 and 1, the second round 0 and 0.
+ */
+TEST(epoll6)
+{
+ int efd;
+ int sfd[2];
+ struct pollfd pfd;
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 1);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ ASSERT_EQ(poll(&pfd, 1, 0), 0);
+ ASSERT_EQ(epoll_wait(efd, &e, 1, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0/s2 = sockets; (p) = the
+ * thread first uses poll() on the epoll fd, (lt) = the fd is added with
+ * EPOLLIN only.
+ *
+ * One byte is written to the peer of each socket; in both rounds poll() is
+ * expected to return 1 and the zero-timeout epoll_wait() 2.
+ */
+
+TEST(epoll7)
+{
+ int efd;
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ *
+ * Legend: t0 = calling thread, e0 = epoll fd, s0/s2 = sockets; (p) = the
+ * thread first uses poll() on the epoll fd, (et) = the fd is added with
+ * EPOLLIN | EPOLLET.
+ *
+ * One byte is written to the peer of each socket; the first round expects
+ * poll() == 1 and epoll_wait() == 2, the second round expects 0 and 0.
+ */
+TEST(epoll8)
+{
+ int efd;
+ int sfd[4];
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd = epoll_create(1);
+ ASSERT_GE(efd, 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd, EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+ ASSERT_EQ(write(sfd[3], "w", 1), 1);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 2);
+
+ pfd.fd = efd;
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0);
+ EXPECT_EQ(epoll_wait(efd, events, 2, 0), 0);
+
+ close(efd);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1a), e0 = epoll
+ * fd, s0 = socket; (ew) = the thread blocks in epoll_wait(), (lt) = the fd
+ * is added with EPOLLIN only.
+ *
+ * Both threads wait on the same epoll fd while emitter_entry1 writes one
+ * byte; ctx.count is expected to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll9)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1a), e0 = epoll
+ * fd, s0 = socket; (ew) = the thread blocks in epoll_wait(), (et) = the fd
+ * is added with EPOLLIN | EPOLLET.
+ *
+ * Both threads wait on the same epoll fd while emitter_entry1 writes one
+ * byte; ctx.count is expected to end up at 1. The thread that saw no event
+ * presumably leaves epoll_wait() when kill_timeout() sends SIGUSR1.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll10)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry2a), e0 = epoll
+ * fd, s0/s2 = sockets; (ew) = the thread blocks in epoll_wait(), (lt) = the
+ * fd is added with EPOLLIN only.
+ *
+ * Both threads wait (room for two events each) while emitter_entry2 writes
+ * one byte to each socket peer; ctx.count is expected to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll11)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry2a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 2, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1a), e0 = epoll
+ * fd, s0/s2 = sockets; (ew) = the thread blocks in epoll_wait(), (et) = the
+ * fd is added with EPOLLIN | EPOLLET.
+ *
+ * Each thread asks for at most one event while emitter_entry2 writes one
+ * byte to each socket peer; ctx.count is expected to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll12)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1ap), e0 = epoll
+ * fd, s0 = socket; (ew) = the thread blocks in epoll_wait(), (p) = the
+ * thread blocks in poll() and then calls epoll_wait() with a zero timeout,
+ * (lt) = the fd is added with EPOLLIN only.
+ *
+ * emitter_entry1 writes one byte; ctx.count is expected to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll13)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1ap), e0 = epoll
+ * fd, s0 = socket; (ew) = the thread blocks in epoll_wait(), (p) = the
+ * thread blocks in poll() and then calls epoll_wait() with a zero timeout,
+ * (et) = the fd is added with EPOLLIN | EPOLLET.
+ *
+ * emitter_entry1 writes one byte; ctx.count is expected to end up at 1.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll14)
+{
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 1);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (lt) / \ (lt)
+ * s0 s2
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry2ap), e0 = epoll
+ * fd, s0/s2 = sockets; (ew) = the thread blocks in epoll_wait(), (p) = the
+ * thread blocks in poll() and then calls epoll_wait() with a zero timeout,
+ * (lt) = the fd is added with EPOLLIN only.
+ *
+ * emitter_entry2 writes one byte to each socket peer; ctx.count is expected
+ * to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll15)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry2ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 2, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (et) / \ (et)
+ * s0 s2
+ *
+ * Legend: t0 = main thread, t1 = waiter thread (waiter_entry1ap), e0 = epoll
+ * fd, s0/s2 = sockets; (ew) = the thread blocks in epoll_wait(), (p) = the
+ * thread blocks in poll() and then calls epoll_wait() with a zero timeout,
+ * (et) = the fd is added with EPOLLIN | EPOLLET.
+ *
+ * Each thread asks for at most one event while emitter_entry2 writes one
+ * byte to each socket peer; ctx.count is expected to end up at 2.
+ *
+ * NOTE(review): pthread_tryjoin_np() is documented to return a positive
+ * error number on failure, so the "< 0" check below looks like it can never
+ * be true - confirm.
+ */
+TEST(epoll16)
+{
+ pthread_t emitter;
+ struct epoll_event events[2];
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler);
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[2], events), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], events, 1, -1) > 0)
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2);
+
+ if (pthread_tryjoin_np(emitter, NULL) < 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL);
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0/e1 = epoll fds (e1 is registered inside
+ * e0), s0 = socket; (ew) = the thread uses epoll_wait(), (lt) = the fd is
+ * added with EPOLLIN only.
+ *
+ * One byte is written and never read back; both zero-timeout epoll_wait()
+ * calls on the outer epoll fd are expected to return 1.
+ */
+TEST(epoll17)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0/e1 = epoll fds (e1 is registered inside
+ * e0), s0 = socket; (ew) = the thread uses epoll_wait(), (lt) = added with
+ * EPOLLIN only, (et) = added with EPOLLIN | EPOLLET.
+ *
+ * The socket is edge-triggered in the inner epoll fd, the inner epoll fd is
+ * level-triggered in the outer one; both zero-timeout epoll_wait() calls on
+ * the outer epoll fd are expected to return 1.
+ */
+TEST(epoll18)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0/e1 = epoll fds (e1 is registered inside
+ * e0), s0 = socket; (ew) = the thread uses epoll_wait(), (lt) = added with
+ * EPOLLIN only, (et) = added with EPOLLIN | EPOLLET.
+ *
+ * The socket is level-triggered in the inner epoll fd, the inner epoll fd is
+ * edge-triggered in the outer one; the first zero-timeout epoll_wait() on
+ * the outer epoll fd is expected to return 1 and the second 0.
+ */
+TEST(epoll19)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ *
+ * Legend: t0 = calling thread, e0/e1 = epoll fds (e1 is registered inside
+ * e0), s0 = socket; (ew) = the thread uses epoll_wait(), (et) = added with
+ * EPOLLIN | EPOLLET.
+ *
+ * Both registrations are edge-triggered; the first zero-timeout
+ * epoll_wait() on the outer epoll fd is expected to return 1 and the
+ * second 0.
+ */
+TEST(epoll20)
+{
+ int efd[2];
+ int sfd[2];
+ struct epoll_event e;
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sfd), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], &e), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1);
+
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 1);
+ EXPECT_EQ(epoll_wait(efd[0], &e, 1, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(sfd[0]);
+ close(sfd[1]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll21)
+{
+	struct epoll_event ev;
+	struct pollfd pd;
+	int sock[2];
+	int ep[2];
+	int i;
+
+	/*
+	 * Single-threaded check of the chain drawn above with both links
+	 * level-triggered; ep[0] is probed with poll() and then epoll_wait().
+	 */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sock), 0);
+
+	for (i = 0; i < 2; i++) {
+		ep[i] = epoll_create(1);
+		ASSERT_GE(ep[i], 0);
+	}
+
+	ev.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ep[1], EPOLL_CTL_ADD, sock[0], &ev), 0);
+
+	ev.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ep[0], EPOLL_CTL_ADD, ep[1], &ev), 0);
+
+	/* One byte written to the peer end makes sock[0] readable. */
+	ASSERT_EQ(write(sock[1], "w", 1), 1);
+
+	/* Both rounds must see ep[0] readable and fetch one event from it. */
+	for (i = 0; i < 2; i++) {
+		pd.fd = ep[0];
+		pd.events = POLLIN;
+		EXPECT_EQ(poll(&pd, 1, 0), 1);
+		EXPECT_EQ(epoll_wait(ep[0], &ev, 1, 0), 1);
+	}
+
+	for (i = 0; i < 2; i++)
+		close(ep[i]);
+	for (i = 0; i < 2; i++)
+		close(sock[i]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll22)
+{
+	struct epoll_event ev;
+	struct pollfd pd;
+	int sock[2];
+	int ep[2];
+	int i;
+
+	/*
+	 * Single-threaded check of the chain drawn above: ep[1] watches
+	 * sock[0] edge-triggered, ep[0] watches ep[1] level-triggered, and
+	 * ep[0] is probed with poll() and then epoll_wait().
+	 */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sock), 0);
+
+	for (i = 0; i < 2; i++) {
+		ep[i] = epoll_create(1);
+		ASSERT_GE(ep[i], 0);
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ep[1], EPOLL_CTL_ADD, sock[0], &ev), 0);
+
+	ev.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ep[0], EPOLL_CTL_ADD, ep[1], &ev), 0);
+
+	/* One byte written to the peer end makes sock[0] readable. */
+	ASSERT_EQ(write(sock[1], "w", 1), 1);
+
+	/* Both rounds must see ep[0] readable and fetch one event from it. */
+	for (i = 0; i < 2; i++) {
+		pd.fd = ep[0];
+		pd.events = POLLIN;
+		EXPECT_EQ(poll(&pd, 1, 0), 1);
+		EXPECT_EQ(epoll_wait(ep[0], &ev, 1, 0), 1);
+	}
+
+	for (i = 0; i < 2; i++)
+		close(ep[i]);
+	for (i = 0; i < 2; i++)
+		close(sock[i]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll23)
+{
+	/* Expected result of poll() and epoll_wait() in round one and two. */
+	const int want[2] = { 1, 0 };
+	struct epoll_event ev;
+	struct pollfd pd;
+	int sock[2];
+	int ep[2];
+	int i;
+
+	/*
+	 * Single-threaded check of the chain drawn above: ep[1] watches
+	 * sock[0] level-triggered, ep[0] watches ep[1] edge-triggered, and
+	 * ep[0] is probed with poll() and then epoll_wait().
+	 */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sock), 0);
+
+	for (i = 0; i < 2; i++) {
+		ep[i] = epoll_create(1);
+		ASSERT_GE(ep[i], 0);
+	}
+
+	ev.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ep[1], EPOLL_CTL_ADD, sock[0], &ev), 0);
+
+	ev.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ep[0], EPOLL_CTL_ADD, ep[1], &ev), 0);
+
+	/* One byte written to the peer end makes sock[0] readable. */
+	ASSERT_EQ(write(sock[1], "w", 1), 1);
+
+	/* Round one sees the event; round two must see nothing. */
+	for (i = 0; i < 2; i++) {
+		pd.fd = ep[0];
+		pd.events = POLLIN;
+		EXPECT_EQ(poll(&pd, 1, 0), want[i]);
+		EXPECT_EQ(epoll_wait(ep[0], &ev, 1, 0), want[i]);
+	}
+
+	for (i = 0; i < 2; i++)
+		close(ep[i]);
+	for (i = 0; i < 2; i++)
+		close(sock[i]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll24)
+{
+	/* Expected result of poll() and epoll_wait() in round one and two. */
+	const int want[2] = { 1, 0 };
+	struct epoll_event ev;
+	struct pollfd pd;
+	int sock[2];
+	int ep[2];
+	int i;
+
+	/*
+	 * Single-threaded check of the chain drawn above with both links
+	 * edge-triggered; ep[0] is probed with poll() and then epoll_wait().
+	 */
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sock), 0);
+
+	for (i = 0; i < 2; i++) {
+		ep[i] = epoll_create(1);
+		ASSERT_GE(ep[i], 0);
+	}
+
+	ev.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ep[1], EPOLL_CTL_ADD, sock[0], &ev), 0);
+
+	ev.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ep[0], EPOLL_CTL_ADD, ep[1], &ev), 0);
+
+	/* One byte written to the peer end makes sock[0] readable. */
+	ASSERT_EQ(write(sock[1], "w", 1), 1);
+
+	/* Round one sees the event; round two must see nothing. */
+	for (i = 0; i < 2; i++) {
+		pd.fd = ep[0];
+		pd.events = POLLIN;
+		EXPECT_EQ(poll(&pd, 1, 0), want[i]);
+		EXPECT_EQ(epoll_wait(ep[0], &ev, 1, 0), want[i]);
+	}
+
+	for (i = 0; i < 2; i++)
+		close(ep[i]);
+	for (i = 0; i < 2; i++)
+		close(sock[i]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll25)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as another
+	 * epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll26)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as another
+	 * epoll_wait() on e0). e1 watches s0 edge-triggered and e0 watches e1
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll27)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as another
+	 * epoll_wait() on e0). e1 watches s0 level-triggered and e0 watches e1
+	 * edge-triggered. A total count of exactly 1 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* Exactly one increment in total, from this thread or the waiter. */
+	EXPECT_EQ(ctx.count, 1);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll28)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as another
+	 * epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. A total count of exactly 1 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* Exactly one increment in total, from this thread or the waiter. */
+	EXPECT_EQ(ctx.count, 1);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll29)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	/* Close every descriptor opened above, including the inner epoll fd. */
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll30)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e1 watches s0 edge-triggered and e0 watches e1
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	/* Close every descriptor opened above, including the inner epoll fd. */
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll31)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e1 watches s0 level-triggered and e0 watches e1
+	 * edge-triggered. A total count of exactly 1 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* Exactly one increment in total, from this thread or the waiter. */
+	EXPECT_EQ(ctx.count, 1);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	/* Close every descriptor opened above, including the inner epoll fd. */
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * | (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll32)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() on e0 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. A total count of exactly 1 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e0 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* Exactly one increment in total, from this thread or the waiter. */
+	EXPECT_EQ(ctx.count, 1);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	/* Close every descriptor opened above, including the inner epoll fd. */
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll33)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1a (defined outside this block; drawn above
+	 * as an epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll34)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1o (defined outside this block; drawn above
+	 * as an epoll_wait() on e0). e1 watches s0 edge-triggered and e0
+	 * watches e1 level-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and set bit 1 of the count if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_or(&ctx.count, 2);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll35)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1a (defined outside this block; drawn above
+	 * as an epoll_wait() on e0). e1 watches s0 level-triggered and e0
+	 * watches e1 edge-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll36)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1o (defined outside this block; drawn above
+	 * as an epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and set bit 1 of the count if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_or(&ctx.count, 2);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll37)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as an
+	 * epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_add(&ctx.count, 1);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll38)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1o (defined outside this block; drawn above as an
+	 * epoll_wait() on e0). e1 watches s0 edge-triggered and e0 watches e1
+	 * level-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_or(&ctx.count, 2);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll39)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1a (defined outside this block; drawn above as an
+	 * epoll_wait() on e0). e1 watches s0 level-triggered and e0 watches
+	 * e1 edge-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_add(&ctx.count, 1);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (ew)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll40)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1o (defined outside this block; drawn above as an
+	 * epoll_wait() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1o, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_or(&ctx.count, 2);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (p)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll41)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1ap (defined outside this block; drawn
+	 * above as a poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (p)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll42)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1op (defined outside this block; drawn
+	 * above as a poll() on e0). e1 watches s0 edge-triggered and e0
+	 * watches e1 level-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and set bit 1 of the count if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_or(&ctx.count, 2);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (p)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll43)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1ap (defined outside this block; drawn
+	 * above as a poll() on e0). e1 watches s0 level-triggered and e0
+	 * watches e1 edge-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and count this thread's wakeup if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_add(&ctx.count, 1);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (ew) | | (p)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll44)
+{
+	pthread_t emitter;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in epoll_wait() directly on e1 while a second
+	 * thread runs waiter_entry1op (defined outside this block; drawn
+	 * above as a poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Block on e1 and set bit 1 of the count if an event arrived. */
+	if (epoll_wait(ctx.efd[1], &e, 1, -1) > 0)
+		__sync_fetch_and_or(&ctx.count, 2);
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (p)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll45)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * level-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_add(&ctx.count, 1);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (p)
+ * | e0
+ * \ / (lt)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll46)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1op (defined outside this block; drawn above as a
+	 * poll() on e0). e1 watches s0 edge-triggered and e0 watches e1
+	 * level-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/*
+	 * The diagram above has t0 using poll() on e1, so wait with poll()
+	 * and then collect the event with a non-blocking epoll_wait(), the
+	 * same way the other poll-based tests in this file do. A plain
+	 * blocking epoll_wait() here would merely repeat epoll42.
+	 */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_or(&ctx.count, 2);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (p)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (lt)
+ * s0
+ */
+TEST(epoll47)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1ap (defined outside this block; drawn above as a
+	 * poll() on e0). e1 watches s0 level-triggered and e0 watches e1
+	 * edge-triggered. A total count of 2 is expected.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, level-triggered. */
+	e.events = EPOLLIN;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/* Wait for e1 to become readable, then collect the event from it. */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_add(&ctx.count, 1);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/* One increment from above, the other presumably from the waiter. */
+	EXPECT_EQ(ctx.count, 2);
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0 t1
+ * (p) | | (p)
+ * | e0
+ * \ / (et)
+ * e1
+ * | (et)
+ * s0
+ */
+TEST(epoll48)
+{
+	pthread_t emitter;
+	struct pollfd pfd;
+	struct epoll_event e;
+	struct epoll_mtcontext ctx = { 0 };
+
+	/*
+	 * This thread blocks in poll() on e1 while a second thread runs
+	 * waiter_entry1op (defined outside this block; drawn above as a
+	 * poll() on e0). e0 watches e1 and e1 watches s0, both
+	 * edge-triggered. This thread must see the event.
+	 */
+
+	/* SIGUSR1 is what gets sent to the emitter at the end if needed. */
+	signal(SIGUSR1, signal_handler);
+
+	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, ctx.sfd), 0);
+
+	ctx.efd[0] = epoll_create(1);
+	ASSERT_GE(ctx.efd[0], 0);
+
+	ctx.efd[1] = epoll_create(1);
+	ASSERT_GE(ctx.efd[1], 0);
+
+	/* e1 watches s0, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+	/* e0 watches e1, edge-triggered. */
+	e.events = EPOLLIN | EPOLLET;
+	ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+	/*
+	 * Record this thread's id in ctx, then start the helper threads.
+	 * NOTE(review): emitter_entry1 presumably makes s0 readable; its body
+	 * is not part of this block.
+	 */
+	ctx.main = pthread_self();
+	ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1op, &ctx), 0);
+	ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry1, &ctx), 0);
+
+	/*
+	 * The diagram above has t0 using poll() on e1, so wait with poll()
+	 * and then collect the event with a non-blocking epoll_wait(), the
+	 * same way the other poll-based tests in this file do. A plain
+	 * blocking epoll_wait() here would merely repeat epoll44.
+	 */
+	pfd.fd = ctx.efd[1];
+	pfd.events = POLLIN;
+	if (poll(&pfd, 1, -1) > 0) {
+		if (epoll_wait(ctx.efd[1], &e, 1, 0) > 0)
+			__sync_fetch_and_or(&ctx.count, 2);
+	}
+
+	ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+	/*
+	 * Bit 1 (set above) must be present; bit 0 is optional and is
+	 * presumably set by the waiter, whose body is not shown here.
+	 */
+	EXPECT_TRUE((ctx.count == 2) || (ctx.count == 3));
+
+	/*
+	 * pthread_tryjoin_np() returns 0 on success or a positive error
+	 * number (EBUSY while the thread is still running), never a negative
+	 * value, so compare against 0 to actually interrupt and reap an
+	 * emitter that has not finished yet.
+	 */
+	if (pthread_tryjoin_np(emitter, NULL) != 0) {
+		pthread_kill(emitter, SIGUSR1);
+		pthread_join(emitter, NULL);
+	}
+
+	close(ctx.efd[0]);
+	close(ctx.efd[1]);
+	close(ctx.sfd[0]);
+	close(ctx.sfd[1]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (lt) / \ (lt)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll49)
+{ /* Single thread: e0 watches e1 and e2 level-triggered; each inner epoll watches one socket. */
+ int efd[3];
+ int sfd[4]; /* two socket pairs: sfd[0]/sfd[1] and sfd[2]/sfd[3] */
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN; /* e1 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN; /* e2 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1); /* makes sfd[0] readable */
+ ASSERT_EQ(write(sfd[3], "w", 1), 1); /* makes sfd[2] readable */
+
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2); /* nothing was read, so both are expected to be reported again */
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (ew)
+ * e0
+ * (et) / \ (et)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll50)
+{ /* Single thread: e0 watches e1 and e2 edge-triggered; each inner epoll watches one socket level-triggered. */
+ int efd[3];
+ int sfd[4]; /* two socket pairs: sfd[0]/sfd[1] and sfd[2]/sfd[3] */
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET; /* e1 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET; /* e2 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1); /* makes sfd[0] readable */
+ ASSERT_EQ(write(sfd[3], "w", 1), 1); /* makes sfd[2] readable */
+
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 0); /* no new write since the first call, so nothing is expected */
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (lt) / \ (lt)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll51)
+{ /* Like the level-triggered two-branch layout, but e0 is first checked with poll() before epoll_wait(). */
+ int efd[3];
+ int sfd[4]; /* two socket pairs: sfd[0]/sfd[1] and sfd[2]/sfd[3] */
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN; /* e1 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN; /* e2 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1); /* makes sfd[0] readable */
+ ASSERT_EQ(write(sfd[3], "w", 1), 1); /* makes sfd[2] readable */
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1); /* timeout 0: e0 must already be readable */
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1); /* second pass: still readable, nothing was consumed */
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0
+ * | (p)
+ * e0
+ * (et) / \ (et)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll52)
+{ /* Edge-triggered two-branch layout, with e0 checked through poll() before each epoll_wait(). */
+ int efd[3];
+ int sfd[4]; /* two socket pairs: sfd[0]/sfd[1] and sfd[2]/sfd[3] */
+ struct pollfd pfd;
+ struct epoll_event events[2];
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
+
+ efd[0] = epoll_create(1);
+ ASSERT_GE(efd[0], 0);
+
+ efd[1] = epoll_create(1);
+ ASSERT_GE(efd[1], 0);
+
+ efd[2] = epoll_create(1);
+ ASSERT_GE(efd[2], 0);
+
+ events[0].events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(efd[1], EPOLL_CTL_ADD, sfd[0], events), 0);
+
+ events[0].events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(efd[2], EPOLL_CTL_ADD, sfd[2], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET; /* e1 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[1], events), 0);
+
+ events[0].events = EPOLLIN | EPOLLET; /* e2 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(efd[0], EPOLL_CTL_ADD, efd[2], events), 0);
+
+ ASSERT_EQ(write(sfd[1], "w", 1), 1); /* makes sfd[0] readable */
+ ASSERT_EQ(write(sfd[3], "w", 1), 1); /* makes sfd[2] readable */
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 1); /* timeout 0: e0 must already be readable */
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 2);
+
+ pfd.fd = efd[0];
+ pfd.events = POLLIN;
+ EXPECT_EQ(poll(&pfd, 1, 0), 0); /* second pass: both events were harvested above, none expected */
+ EXPECT_EQ(epoll_wait(efd[0], events, 2, 0), 0);
+
+ close(efd[0]);
+ close(efd[1]);
+ close(efd[2]);
+ close(sfd[0]);
+ close(sfd[1]);
+ close(sfd[2]);
+ close(sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (lt) / \ (lt)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll53)
+{ /* e0 watches e1 and e2 (level-triggered), each watching one socket; this thread epoll_wait()s on e0. */
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN; /* e1 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN; /* e2 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0) /* blocks until e0 reports an event */
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (ew)
+ * e0
+ * (et) / \ (et)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll54)
+{ /* e0 watches e1 and e2 (edge-triggered), each watching one socket; this thread epoll_wait()s on e0. */
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e1 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e2 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1a, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0) /* blocks until e0 reports an event */
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (lt) / \ (lt)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll55)
+{ /* Level-triggered two-branch layout; this thread epoll_wait()s on e0 while the waiter runs waiter_entry1ap. */
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN; /* e1 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN; /* e2 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0) /* blocks until e0 reports an event */
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (ew) \ / (p)
+ * e0
+ * (et) / \ (et)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll56)
+{ /* Edge-triggered two-branch layout; this thread epoll_wait()s on e0 while the waiter runs waiter_entry1ap. */
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e1 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e2 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ if (epoll_wait(ctx.efd[0], &e, 1, -1) > 0) /* blocks until e0 reports an event */
+ __sync_fetch_and_add(&ctx.count, 1);
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (p) \ / (p)
+ * e0
+ * (lt) / \ (lt)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll57)
+{ /* Level-triggered two-branch layout; this thread poll()s e0 and then harvests it with epoll_wait(). */
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN; /* e1 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN; /* e2 in e0, level-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ pfd.fd = ctx.efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) { /* block until e0 reports readiness */
+ if (epoll_wait(ctx.efd[0], &e, 1, 0) > 0) /* timeout 0: only harvest what is already queued */
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+/*
+ * t0 t1
+ * (p) \ / (p)
+ * e0
+ * (et) / \ (et)
+ * e1 e2
+ * (lt) | | (lt)
+ * s0 s2
+ */
+TEST(epoll58)
+{ /* Edge-triggered two-branch layout; this thread poll()s e0 and then harvests it with epoll_wait(). */
+ pthread_t emitter;
+ struct pollfd pfd;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running emitter during cleanup */
+
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[0]), 0);
+ ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &ctx.sfd[2]), 0);
+
+ ctx.efd[0] = epoll_create(1);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.efd[1] = epoll_create(1);
+ ASSERT_GE(ctx.efd[1], 0);
+
+ ctx.efd[2] = epoll_create(1);
+ ASSERT_GE(ctx.efd[2], 0);
+
+ e.events = EPOLLIN; /* s0 in e1 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[1], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ e.events = EPOLLIN; /* s2 in e2 */
+ ASSERT_EQ(epoll_ctl(ctx.efd[2], EPOLL_CTL_ADD, ctx.sfd[2], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e1 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[1], &e), 0);
+
+ e.events = EPOLLIN | EPOLLET; /* e2 in e0, edge-triggered */
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.efd[2], &e), 0);
+
+ ctx.main = pthread_self();
+ ASSERT_EQ(pthread_create(&ctx.waiter, NULL, waiter_entry1ap, &ctx), 0);
+ ASSERT_EQ(pthread_create(&emitter, NULL, emitter_entry2, &ctx), 0);
+
+ pfd.fd = ctx.efd[0];
+ pfd.events = POLLIN;
+ if (poll(&pfd, 1, -1) > 0) { /* block until e0 reports readiness */
+ if (epoll_wait(ctx.efd[0], &e, 1, 0) > 0) /* timeout 0: only harvest what is already queued */
+ __sync_fetch_and_add(&ctx.count, 1);
+ }
+
+ ASSERT_EQ(pthread_join(ctx.waiter, NULL), 0);
+ EXPECT_EQ(ctx.count, 2); /* this thread adds 1; the rest presumably comes from the waiter (defined outside this chunk) */
+ /* pthread_tryjoin_np() returns 0 or a positive errno (e.g. EBUSY), never a negative value. */
+ if (pthread_tryjoin_np(emitter, NULL) != 0) {
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the emitter runs */
+ }
+
+ close(ctx.efd[0]);
+ close(ctx.efd[1]);
+ close(ctx.efd[2]);
+ close(ctx.sfd[0]);
+ close(ctx.sfd[1]);
+ close(ctx.sfd[2]);
+ close(ctx.sfd[3]);
+}
+
+static void *epoll59_thread(void *ctx_)
+{ /* Helper thread for epoll59: re-arms ctx->sfd[0] in ctx->efd[0] each time ctx->count becomes non-zero. */
+ struct epoll_mtcontext *ctx = ctx_;
+ struct epoll_event e;
+ int i;
+
+ for (i = 0; i < 100000; i++) { /* same iteration count as the loop in TEST(epoll59) */
+ while (ctx->count == 0) /* spin until the main thread sets count to 1 */
+ ;
+
+ e.events = EPOLLIN | EPOLLERR | EPOLLET;
+ epoll_ctl(ctx->efd[0], EPOLL_CTL_MOD, ctx->sfd[0], &e); /* return value is not checked */
+ ctx->count = 0; /* hand the turn back to the main thread */
+ }
+
+ return NULL;
+}
+
+/*
+ * t0
+ * (ew) \
+ * e0
+ * (et) /
+ * s0
+ *
+ * s0 is an eventfd; epoll59_thread re-arms it in e0 with EPOLL_CTL_MOD.
+ * Based on https://bugzilla.kernel.org/show_bug.cgi?id=205933
+ */
+TEST(epoll59)
+{ /* Each epoll_wait() on e0 must report the eventfd again after the helper thread's EPOLL_CTL_MOD. */
+ pthread_t emitter;
+ struct epoll_event e;
+ struct epoll_mtcontext ctx = { 0 };
+ int i, ret;
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is sent to a still-running helper during cleanup */
+
+ ctx.efd[0] = epoll_create1(0);
+ ASSERT_GE(ctx.efd[0], 0);
+
+ ctx.sfd[0] = eventfd(1, 0); /* initial counter 1 and never read, so the fd stays readable */
+ ASSERT_GE(ctx.sfd[0], 0);
+
+ e.events = EPOLLIN | EPOLLERR | EPOLLET;
+ ASSERT_EQ(epoll_ctl(ctx.efd[0], EPOLL_CTL_ADD, ctx.sfd[0], &e), 0);
+
+ ASSERT_EQ(pthread_create(&emitter, NULL, epoll59_thread, &ctx), 0);
+
+ for (i = 0; i < 100000; i++) {
+ ret = epoll_wait(ctx.efd[0], &e, 1, 1000); /* a timeout (ret == 0) means a wakeup was lost */
+ ASSERT_GT(ret, 0);
+
+ while (ctx.count != 0) /* wait until the helper finished its previous EPOLL_CTL_MOD */
+ ;
+ ctx.count = 1; /* ask the helper for the next EPOLL_CTL_MOD */
+ }
+ if (pthread_tryjoin_np(emitter, NULL) != 0) { /* returns 0 or a positive errno, never negative */
+ pthread_kill(emitter, SIGUSR1);
+ pthread_join(emitter, NULL); /* ctx lives on this stack frame: do not return while the helper runs */
+ }
+ close(ctx.efd[0]);
+ close(ctx.sfd[0]);
+}
+
+enum {
+ EPOLL60_EVENTS_NR = 10, /* sizes epoll60_ctx.evfd; TEST(epoll60) starts one waiter thread per entry */
+};
+
+struct epoll60_ctx {
+ volatile int stopped; /* set to 1 by TEST(epoll60) before it signals the waiters */
+ int ready; /* waiters that reached the start barrier; accessed with __atomic builtins */
+ int waiters; /* waiters currently between the barrier and a completed read() */
+ int epfd; /* epoll instance shared by all waiters */
+ int evfd[EPOLL60_EVENTS_NR]; /* eventfds registered in epfd */
+};
+
+static void *epoll60_wait_thread(void *ctx_)
+{ /* Waiter for epoll60: per round, pass the start barrier, epoll_pwait() for one event and read it. */
+ struct epoll60_ctx *ctx = ctx_;
+ struct epoll_event e;
+ sigset_t sigmask;
+ uint64_t v;
+ int ret;
+
+ /* Block SIGUSR1 */
+ sigemptyset(&sigmask);
+ sigaddset(&sigmask, SIGUSR1);
+ pthread_sigmask(SIG_SETMASK, &sigmask, NULL); /* sigprocmask() is unspecified in multithreaded processes */
+
+ /* Prepare empty mask for epoll_pwait() */
+ sigemptyset(&sigmask);
+
+ while (!ctx->stopped) {
+ /* Mark we are ready */
+ __atomic_fetch_add(&ctx->ready, 1, __ATOMIC_ACQUIRE);
+
+ /* Start when all are ready */
+ while (__atomic_load_n(&ctx->ready, __ATOMIC_ACQUIRE) &&
+ !ctx->stopped);
+
+ /* Account this waiter */
+ __atomic_fetch_add(&ctx->waiters, 1, __ATOMIC_ACQUIRE);
+
+ ret = epoll_pwait(ctx->epfd, &e, 1, 2000, &sigmask); /* SIGUSR1 is deliverable only inside this call */
+ if (ret != 1) {
+ /* We expect only signal delivery on stop */
+ assert(ret < 0 && errno == EINTR && "Lost wakeup!\n");
+ assert(ctx->stopped);
+ break;
+ }
+
+ ret = read(e.data.fd, &v, sizeof(v)); /* e.data.fd was set to the eventfd when it was registered */
+ /* Since we are on ET mode, thus each thread gets its own fd. */
+ assert(ret == sizeof(v));
+
+ __atomic_fetch_sub(&ctx->waiters, 1, __ATOMIC_RELEASE);
+ }
+
+ return NULL;
+}
+
+static inline unsigned long long msecs(void)
+{ /* Milliseconds on the monotonic clock; only differences between two calls are meaningful. */
+ struct timespec ts;
+ unsigned long long msecs;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts); /* unlike CLOCK_REALTIME, not affected by wall-clock adjustments */
+ msecs = ts.tv_sec * 1000ull;
+ msecs += ts.tv_nsec / 1000000ull;
+
+ return msecs;
+}
+
+static inline int count_waiters(struct epoll60_ctx *ctx)
+{ /* Atomic snapshot of ctx->waiters. */
+ return __atomic_load_n(&ctx->waiters, __ATOMIC_ACQUIRE);
+}
+
+TEST(epoll60)
+{ /* Signals all edge-triggered eventfds at once and checks that every waiter thread wakes up within 1s. */
+ struct epoll60_ctx ctx = { 0 };
+ pthread_t waiters[ARRAY_SIZE(ctx.evfd)]; /* one waiter thread per eventfd */
+ struct epoll_event e;
+ int i, n, ret;
+
+ signal(SIGUSR1, signal_handler); /* SIGUSR1 is used below to stop the waiters */
+
+ ctx.epfd = epoll_create1(0);
+ ASSERT_GE(ctx.epfd, 0);
+
+ /* Create event fds */
+ for (i = 0; i < ARRAY_SIZE(ctx.evfd); i++) {
+ ctx.evfd[i] = eventfd(0, EFD_NONBLOCK);
+ ASSERT_GE(ctx.evfd[i], 0);
+
+ e.events = EPOLLIN | EPOLLET;
+ e.data.fd = ctx.evfd[i]; /* waiters read() from the fd reported here */
+ ASSERT_EQ(epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd[i], &e), 0);
+ }
+
+ /* Create waiter threads */
+ for (i = 0; i < ARRAY_SIZE(waiters); i++)
+ ASSERT_EQ(pthread_create(&waiters[i], NULL,
+ epoll60_wait_thread, &ctx), 0);
+
+ for (i = 0; i < 300; i++) { /* 300 rounds of "wake everybody at once" */
+ uint64_t v = 1, ms;
+
+ /* Wait for all to be ready */
+ while (__atomic_load_n(&ctx.ready, __ATOMIC_ACQUIRE) !=
+ ARRAY_SIZE(ctx.evfd))
+ ;
+
+ /* Steady, go */
+ __atomic_fetch_sub(&ctx.ready, ARRAY_SIZE(ctx.evfd),
+ __ATOMIC_ACQUIRE);
+
+ /* Wait all have gone to kernel */
+ while (count_waiters(&ctx) != ARRAY_SIZE(ctx.evfd))
+ ;
+
+ /* 1ms should be enough to schedule away */
+ usleep(1000);
+
+ /* Quickly signal all handles at once */
+ for (n = 0; n < ARRAY_SIZE(ctx.evfd); n++) {
+ ret = write(ctx.evfd[n], &v, sizeof(v));
+ ASSERT_EQ(ret, sizeof(v));
+ }
+
+ /* Busy loop for 1s and wait for all waiters to wake up */
+ ms = msecs();
+ while (count_waiters(&ctx) && msecs() < ms + 1000)
+ ;
+
+ ASSERT_EQ(count_waiters(&ctx), 0); /* a non-zero count means some waiter never woke up */
+ }
+ ctx.stopped = 1; /* must be set before signalling: waiters assert on it after EINTR */
+ /* Stop waiters */
+ for (i = 0; i < ARRAY_SIZE(waiters); i++)
+ ret = pthread_kill(waiters[i], SIGUSR1); /* ret is not checked */
+ for (i = 0; i < ARRAY_SIZE(waiters); i++)
+ pthread_join(waiters[i], NULL);
+
+ for (i = 0; i < ARRAY_SIZE(waiters); i++) /* waiters[] and ctx.evfd[] have the same length */
+ close(ctx.evfd[i]);
+ close(ctx.epfd);
+}
+
+struct epoll61_ctx {
+ int epfd; /* epoll instance that both epoll61 poller threads wait on */
+ int evfd; /* eventfd registered in epfd */
+};
+
+static void *epoll61_write_eventfd(void *ctx_)
+{ /* Thread 1 of epoll61: sleep, then write to the eventfd once. */
+ struct epoll61_ctx *ctx = ctx_;
+ int64_t l = 1;
+
+ usleep(10950); /* 10.95ms, just under the 11ms timeout used by epoll61_epoll_with_timeout() */
+ write(ctx->evfd, &l, sizeof(l)); /* return value is not checked */
+ return NULL;
+}
+
+static void *epoll61_epoll_with_timeout(void *ctx_)
+{ /* Thread 2 of epoll61: epoll_wait() with an 11ms timeout; passes an event on if it receives one. */
+ struct epoll61_ctx *ctx = ctx_;
+ struct epoll_event events[1];
+ int n;
+
+ n = epoll_wait(ctx->epfd, events, 1, 11); /* timeout is in milliseconds */
+ /*
+ * If epoll returned the eventfd, write on the eventfd to wake up the
+ * blocking poller.
+ */
+ if (n == 1) {
+ int64_t l = 1;
+
+ write(ctx->evfd, &l, sizeof(l)); /* return value is not checked */
+ }
+ return NULL;
+}
+
+static void *epoll61_blocking_epoll(void *ctx_)
+{ /* Thread 3 of epoll61: epoll_wait() without a timeout; returns once epoll_wait() does. */
+ struct epoll61_ctx *ctx = ctx_;
+ struct epoll_event events[1];
+
+ epoll_wait(ctx->epfd, events, 1, -1); /* -1: block indefinitely; result is not inspected */
+ return NULL;
+}
+
+TEST(epoll61)
+{ /* Races an eventfd write against an expiring epoll_wait() timeout; a lost event leaves Thread 3 blocked. */
+ struct epoll61_ctx ctx;
+ struct epoll_event ev;
+ int i, r;
+
+ ctx.epfd = epoll_create1(0);
+ ASSERT_GE(ctx.epfd, 0);
+ ctx.evfd = eventfd(0, EFD_NONBLOCK);
+ ASSERT_GE(ctx.evfd, 0);
+
+ ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLHUP; /* edge-triggered: one report per write */
+ ev.data.ptr = NULL;
+ r = epoll_ctl(ctx.epfd, EPOLL_CTL_ADD, ctx.evfd, &ev);
+ ASSERT_EQ(r, 0);
+
+ /*
+ * We are testing a race. Repeat the test case 1000 times to make it
+ * more likely to fail in case of a bug.
+ */
+ for (i = 0; i < 1000; i++) {
+ pthread_t threads[3];
+ int n;
+
+ /*
+ * Start 3 threads:
+ * Thread 1 sleeps for 10.9ms and writes to the evenfd.
+ * Thread 2 calls epoll with a timeout of 11ms.
+ * Thread 3 calls epoll with a timeout of -1.
+ *
+ * The eventfd write by Thread 1 should either wakeup Thread 2
+ * or Thread 3. If it wakes up Thread 2, Thread 2 writes on the
+ * eventfd to wake up Thread 3.
+ *
+ * If no events are missed, all three threads should eventually
+ * be joinable.
+ */
+ ASSERT_EQ(pthread_create(&threads[0], NULL,
+ epoll61_write_eventfd, &ctx), 0);
+ ASSERT_EQ(pthread_create(&threads[1], NULL,
+ epoll61_epoll_with_timeout, &ctx), 0);
+ ASSERT_EQ(pthread_create(&threads[2], NULL,
+ epoll61_blocking_epoll, &ctx), 0);
+
+ for (n = 0; n < ARRAY_SIZE(threads); ++n)
+ ASSERT_EQ(pthread_join(threads[n], NULL), 0); /* blocks for as long as Thread 3 stays in epoll_wait() */
+ }
+
+ close(ctx.epfd);
+ close(ctx.evfd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/firmware/.gitignore b/tools/testing/selftests/firmware/.gitignore
new file mode 100644
index 000000000..62abc92a9
--- /dev/null
+++ b/tools/testing/selftests/firmware/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+fw_namespace
diff --git a/tools/testing/selftests/firmware/Makefile b/tools/testing/selftests/firmware/Makefile
new file mode 100644
index 000000000..40211cd8f
--- /dev/null
+++ b/tools/testing/selftests/firmware/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for firmware loading selftests
+CFLAGS = -Wall \
+ -O2
+
+TEST_PROGS := fw_run_tests.sh
+TEST_FILES := fw_fallback.sh fw_filesystem.sh fw_lib.sh
+TEST_GEN_FILES := fw_namespace
+
+include ../lib.mk
diff --git a/tools/testing/selftests/firmware/config b/tools/testing/selftests/firmware/config
new file mode 100644
index 000000000..bf634dda0
--- /dev/null
+++ b/tools/testing/selftests/firmware/config
@@ -0,0 +1,5 @@
+CONFIG_TEST_FIRMWARE=y
+CONFIG_FW_LOADER=y
+CONFIG_FW_LOADER_USER_HELPER=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
diff --git a/tools/testing/selftests/firmware/fw_fallback.sh b/tools/testing/selftests/firmware/fw_fallback.sh
new file mode 100755
index 000000000..70d18be46
--- /dev/null
+++ b/tools/testing/selftests/firmware/fw_fallback.sh
@@ -0,0 +1,283 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# This validates that the kernel will fall back to using the fallback mechanism
+# to load firmware it can't find on disk itself. We must request a firmware
+# that the kernel won't find, and any installed helper (e.g. udev) also
+# won't find so that we can do the load ourself manually.
+set -e
+
+TEST_REQS_FW_SYSFS_FALLBACK="yes" # set before fw_lib.sh is sourced
+TEST_REQS_FW_SET_CUSTOM_PATH="no"
+TEST_DIR=$(dirname "$0") # quoted so a script path containing spaces still resolves
+source "$TEST_DIR"/fw_lib.sh
+
+check_mods # this and the next three helpers are not defined in this script (presumably fw_lib.sh)
+check_setup
+verify_reqs
+setup_tmp_file
+
+trap "test_finish" EXIT # test_finish runs on every exit path, including the exit 1 failures below
+
+load_fw()
+{ # Answer one fallback request by hand: $1 = firmware name to request, $2 = file copied into the data node.
+ local name="$1"
+ local file="$2"
+
+ # This will block until our load (below) has finished.
+ echo -n "$name" >"$DIR"/trigger_request &
+
+ # Give kernel a chance to react.
+ local timeout=10 # 10 polls with a 0.1s sleep each, i.e. about one second
+ while [ ! -e "$DIR"/"$name"/loading ]; do
+ sleep 0.1
+ timeout=$(( $timeout - 1 ))
+ if [ "$timeout" -eq 0 ]; then
+ echo "$0: firmware interface never appeared" >&2
+ exit 1
+ fi
+ done
+
+ echo 1 >"$DIR"/"$name"/loading # sequence: write 1, copy the data, then write 0
+ cat "$file" >"$DIR"/"$name"/data
+ echo 0 >"$DIR"/"$name"/loading
+
+ # Wait for request to finish.
+ wait
+}
+
+load_fw_cancel()
+{ # Trigger a fallback request for $1 and cancel it by writing -1 to its loading node.
+ local name="$1"
+ local file="$2" # accepted for symmetry with load_fw; not used in this function
+
+ # This will block until our load (below) has finished.
+ echo -n "$name" >"$DIR"/trigger_request 2>/dev/null &
+
+ # Give kernel a chance to react.
+ local timeout=10 # 10 polls with a 0.1s sleep each, i.e. about one second
+ while [ ! -e "$DIR"/"$name"/loading ]; do
+ sleep 0.1
+ timeout=$(( $timeout - 1 ))
+ if [ "$timeout" -eq 0 ]; then
+ echo "$0: firmware interface never appeared" >&2
+ exit 1
+ fi
+ done
+
+ echo -1 >"$DIR"/"$name"/loading # -1 instead of the 1/data/0 sequence used by load_fw
+
+ # Wait for request to finish.
+ wait
+}
+
+load_fw_custom()
+{ # Same hand-driven load as load_fw, but triggered through $DIR/trigger_custom_fallback.
+ if [ ! -e "$DIR"/trigger_custom_fallback ]; then
+ echo "$0: custom fallback trigger not present, ignoring test" >&2
+ exit $ksft_skip # exits the whole script; ksft_skip is not set in this file (presumably fw_lib.sh)
+ fi
+
+ local name="$1"
+ local file="$2"
+
+ echo -n "$name" >"$DIR"/trigger_custom_fallback 2>/dev/null &
+
+ # Give kernel a chance to react.
+ local timeout=10 # 10 polls with a 0.1s sleep each, i.e. about one second
+ while [ ! -e "$DIR"/"$name"/loading ]; do
+ sleep 0.1
+ timeout=$(( $timeout - 1 ))
+ if [ "$timeout" -eq 0 ]; then
+ echo "$0: firmware interface never appeared" >&2
+ exit 1
+ fi
+ done
+
+ echo 1 >"$DIR"/"$name"/loading # sequence: write 1, copy the data, then write 0
+ cat "$file" >"$DIR"/"$name"/data
+ echo 0 >"$DIR"/"$name"/loading
+
+ # Wait for request to finish.
+ wait
+ return 0 # callers use this function as an `if` condition
+}
+
+
+load_fw_custom_cancel()
+{ # Trigger a request through $DIR/trigger_custom_fallback for $1 and cancel it with -1.
+ if [ ! -e "$DIR"/trigger_custom_fallback ]; then
+ echo "$0: canceling custom fallback trigger not present, ignoring test" >&2
+ exit $ksft_skip # exits the whole script; ksft_skip is not set in this file (presumably fw_lib.sh)
+ fi
+
+ local name="$1"
+ local file="$2" # not used in this function
+
+ echo -n "$name" >"$DIR"/trigger_custom_fallback 2>/dev/null &
+
+ # Give kernel a chance to react.
+ local timeout=10 # 10 polls with a 0.1s sleep each, i.e. about one second
+ while [ ! -e "$DIR"/"$name"/loading ]; do
+ sleep 0.1
+ timeout=$(( $timeout - 1 ))
+ if [ "$timeout" -eq 0 ]; then
+ echo "$0: firmware interface never appeared" >&2
+ exit 1
+ fi
+ done
+
+ echo -1 >"$DIR"/"$name"/loading # -1 instead of the 1/data/0 sequence used by load_fw_custom
+
+ # Wait for request to finish.
+ wait
+ return 0 # callers use this function as an `if` condition
+}
+
+load_fw_fallback_with_child()
+{ # Returns 0 if a synchronous request for $1 took at least 4 seconds even though a child exited meanwhile.
+ local name="$1"
+ local file="$2" # not used in this function
+
+ # This is the value already set but we want to be explicit
+ echo 4 >/sys/class/firmware/timeout
+
+ sleep 1 & # background child that exits while the request below is still pending
+ SECONDS_BEFORE=$(date +%s)
+ echo -n "$name" >"$DIR"/trigger_request 2>/dev/null # foreground write: returns when the request ends
+ SECONDS_AFTER=$(date +%s)
+ SECONDS_DELTA=$(($SECONDS_AFTER - $SECONDS_BEFORE))
+ if [ "$SECONDS_DELTA" -lt 4 ]; then # finished before the 4s timeout: request ended early
+ RET=1
+ else
+ RET=0
+ fi
+ wait
+ return $RET
+}
+
+test_syfs_timeout()
+{ # Leave a request for a missing firmware unanswered; its loading node must appear and $FW must not load.
+ DEVPATH="$DIR"/"nope-$NAME"/loading
+
+ # Test failure when doing nothing (timeout works).
+ echo -n 2 >/sys/class/firmware/timeout
+ echo -n "nope-$NAME" >"$DIR"/trigger_request 2>/dev/null &
+
+ # Give the kernel some time to load the loading file, must be less
+ # than the timeout above.
+ sleep 1
+ if [ ! -f "$DEVPATH" ]; then # quoted: an unquoted path would be word-split by the shell
+ echo "$0: fallback mechanism immediately cancelled"
+ echo ""
+ echo "The file never appeared: $DEVPATH"
+ echo ""
+ echo "This might be a distribution udev rule setup by your distribution"
+ echo "to immediately cancel all fallback requests, this must be"
+ echo "removed before running these tests. To confirm look for"
+ echo "a firmware rule like /lib/udev/rules.d/50-firmware.rules"
+ echo "and see if you have something like this:"
+ echo ""
+ echo "SUBSYSTEM==\"firmware\", ACTION==\"add\", ATTR{loading}=\"-1\""
+ echo ""
+ echo "If you do remove this file or comment out this line before"
+ echo "proceeding with these tests."
+ exit 1
+ fi
+
+ if diff -q "$FW" /dev/test_firmware >/dev/null ; then # a match would mean $FW was loaded after all
+ echo "$0: firmware was not expected to match" >&2
+ exit 1
+ else
+ echo "$0: timeout works"
+ fi
+}
+
+run_sysfs_main_tests()
+{ # Runs the trigger_request based checks: timeout, wrong data, proper load, cancel, SIGCHLD handling.
+ test_syfs_timeout
+ # Put timeout high enough for us to do work but not so long that failures
+ # slow down this test too much.
+ echo 4 >/sys/class/firmware/timeout
+
+ # Load this script instead of the desired firmware.
+ load_fw "$NAME" "$0"
+ if diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not expected to match" >&2
+ exit 1
+ else
+ echo "$0: firmware comparison works"
+ fi
+
+ # Do a proper load, which should work correctly.
+ load_fw "$NAME" "$FW"
+ if ! diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not loaded" >&2
+ exit 1
+ else
+ echo "$0: fallback mechanism works"
+ fi
+
+ load_fw_cancel "nope-$NAME" "$FW"
+ if diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was expected to be cancelled" >&2
+ exit 1
+ else
+ echo "$0: cancelling fallback mechanism works"
+ fi
+
+ set +e # the helper's non-zero return must reach the $? check instead of aborting the script
+ load_fw_fallback_with_child "nope-signal-$NAME" "$FW"
+ if [ "$?" -eq 0 ]; then
+ echo "$0: SIGCHLD on sync ignored as expected" >&2
+ else
+ echo "$0: error - sync firmware request cancelled due to SIGCHLD" >&2
+ exit 1
+ fi
+ set -e
+}
+
+run_sysfs_custom_load_tests()
+{ # Exercises the custom fallback trigger: two hand-driven loads, then one cancelled request.
+ RANDOM_FILE_PATH=$(setup_random_file) # setup_random_file is not defined in this script
+ RANDOM_FILE="$(basename "$RANDOM_FILE_PATH")" # inner quotes keep a path with spaces as one argument
+ if load_fw_custom "$RANDOM_FILE" "$RANDOM_FILE_PATH" ; then
+ if ! diff -q "$RANDOM_FILE_PATH" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not loaded" >&2
+ exit 1
+ else
+ echo "$0: custom fallback loading mechanism works"
+ fi
+ fi
+
+ RANDOM_FILE_PATH=$(setup_random_file) # second run of the same check with a fresh file
+ RANDOM_FILE="$(basename "$RANDOM_FILE_PATH")"
+ if load_fw_custom "$RANDOM_FILE" "$RANDOM_FILE_PATH" ; then
+ if ! diff -q "$RANDOM_FILE_PATH" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not loaded" >&2
+ exit 1
+ else
+ echo "$0: custom fallback loading mechanism works"
+ fi
+ fi
+
+ RANDOM_FILE_REAL="$RANDOM_FILE_PATH"
+ FAKE_RANDOM_FILE_PATH=$(setup_random_file_fake) # setup_random_file_fake is not defined in this script
+ FAKE_RANDOM_FILE="$(basename "$FAKE_RANDOM_FILE_PATH")"
+
+ if load_fw_custom_cancel "$FAKE_RANDOM_FILE" "$RANDOM_FILE_REAL" ; then
+ if diff -q "$RANDOM_FILE_PATH" /dev/test_firmware >/dev/null ; then # compares against the last file loaded above
+ echo "$0: firmware was expected to be cancelled" >&2
+ exit 1
+ else
+ echo "$0: cancelling custom fallback mechanism works"
+ fi
+ fi
+}
+
+if [ "$HAS_FW_LOADER_USER_HELPER_FALLBACK" = "yes" ]; then # variable is not set in this script (presumably fw_lib.sh)
+ run_sysfs_main_tests
+fi
+
+run_sysfs_custom_load_tests # runs regardless of the check above
+
+exit 0
diff --git a/tools/testing/selftests/firmware/fw_filesystem.sh b/tools/testing/selftests/firmware/fw_filesystem.sh
new file mode 100755
index 000000000..c2a2a1001
--- /dev/null
+++ b/tools/testing/selftests/firmware/fw_filesystem.sh
@@ -0,0 +1,552 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# This validates that the kernel will load firmware out of its list of
+# firmware locations on disk. Since the user helper does similar work,
+# we reset the custom load directory to a location the user helper doesn't
+# know so we can be sure we're not accidentally testing the user helper.
+set -e
+
+TEST_REQS_FW_SYSFS_FALLBACK="no"
+TEST_REQS_FW_SET_CUSTOM_PATH="yes"
+TEST_DIR=$(dirname $0)
+source $TEST_DIR/fw_lib.sh
+
+check_mods
+check_setup
+verify_reqs
+setup_tmp_file
+
+trap "test_finish" EXIT
+
+if [ "$HAS_FW_LOADER_USER_HELPER" = "yes" ]; then
+ # Turn down the timeout so failures don't take so long.
+ echo 1 >/sys/class/firmware/timeout
+fi
+
+if printf '\000' >"$DIR"/trigger_request 2> /dev/null; then
+ echo "$0: empty filename should not succeed" >&2
+ exit 1
+fi
+
+if [ ! -e "$DIR"/trigger_async_request ]; then
+ echo "$0: empty filename: async trigger not present, ignoring test" >&2
+ exit $ksft_skip
+else
+ if printf '\000' >"$DIR"/trigger_async_request 2> /dev/null; then
+ echo "$0: empty filename should not succeed (async)" >&2
+ exit 1
+ fi
+fi
+
+# Request a firmware that doesn't exist, it should fail.
+if echo -n "nope-$NAME" >"$DIR"/trigger_request 2> /dev/null; then
+ echo "$0: firmware shouldn't have loaded" >&2
+ exit 1
+fi
+if diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not expected to match" >&2
+ exit 1
+else
+ if [ "$HAS_FW_LOADER_USER_HELPER" = "yes" ]; then
+ echo "$0: timeout works"
+ fi
+fi
+
+# This should succeed via kernel load or will fail after 1 second after
+# being handed over to the user helper, which won't find the fw either.
+if ! echo -n "$NAME" >"$DIR"/trigger_request ; then
+ echo "$0: could not trigger request" >&2
+ exit 1
+fi
+
+# Verify the contents are what we expect.
+if ! diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not loaded" >&2
+ exit 1
+else
+ echo "$0: filesystem loading works"
+fi
+
+# Try the asynchronous version too
+if [ ! -e "$DIR"/trigger_async_request ]; then
+ echo "$0: firmware loading: async trigger not present, ignoring test" >&2
+ exit $ksft_skip
+else
+ if ! echo -n "$NAME" >"$DIR"/trigger_async_request ; then
+ echo "$0: could not trigger async request" >&2
+ exit 1
+ fi
+
+ # Verify the contents are what we expect.
+ if ! diff -q "$FW" /dev/test_firmware >/dev/null ; then
+ echo "$0: firmware was not loaded (async)" >&2
+ exit 1
+ else
+ echo "$0: async filesystem loading works"
+ fi
+fi
+
+# Try platform (EFI embedded fw) loading too
+if [ ! -e "$DIR"/trigger_request_platform ]; then
+ echo "$0: firmware loading: platform trigger not present, ignoring test" >&2
+else
+ if printf '\000' >"$DIR"/trigger_request_platform 2> /dev/null; then
+ echo "$0: empty filename should not succeed (platform)" >&2
+ exit 1
+ fi
+
+ # Note we echo a non-existing name, since files on the file-system
+ # are preferred over firmware embedded inside the platform's firmware
+ # The test adds a fake entry with the requested name to the platform's
+ # fw list, so the name does not matter as long as it does not exist
+ if ! echo -n "nope-$NAME" >"$DIR"/trigger_request_platform ; then
+ echo "$0: could not trigger request platform" >&2
+ exit 1
+ fi
+
+ # The test verifies itself that the loaded firmware contents matches
+ # the contents for the fake platform fw entry it added.
+ echo "$0: platform loading works"
+fi
+
+### Batched requests tests
+test_config_present()
+{
+ if [ ! -f $DIR/reset ]; then
+ echo "Configuration triggers not present, ignoring test"
+ exit $ksft_skip
+ fi
+}
+
+# Defaults :
+#
+# send_uevent: 1
+# sync_direct: 0
+# name: test-firmware.bin
+# num_requests: 4
+config_reset()
+{
+ echo 1 > $DIR/reset
+}
+
+release_all_firmware()
+{
+ echo 1 > $DIR/release_all_firmware
+}
+
+config_set_name()
+{
+ echo -n $1 > $DIR/config_name
+}
+
+config_set_into_buf()
+{
+ echo 1 > $DIR/config_into_buf
+}
+
+config_unset_into_buf()
+{
+ echo 0 > $DIR/config_into_buf
+}
+
+config_set_buf_size()
+{
+ echo $1 > $DIR/config_buf_size
+}
+
+config_set_file_offset()
+{
+ echo $1 > $DIR/config_file_offset
+}
+
+config_set_partial()
+{
+ echo 1 > $DIR/config_partial
+}
+
+config_unset_partial()
+{
+ echo 0 > $DIR/config_partial
+}
+
+config_set_sync_direct()
+{
+ echo 1 > $DIR/config_sync_direct
+}
+
+config_unset_sync_direct()
+{
+ echo 0 > $DIR/config_sync_direct
+}
+
+config_set_uevent()
+{
+ echo 1 > $DIR/config_send_uevent
+}
+
+config_unset_uevent()
+{
+ echo 0 > $DIR/config_send_uevent
+}
+
+config_trigger_sync()
+{
+ echo -n 1 > $DIR/trigger_batched_requests 2>/dev/null
+}
+
+config_trigger_async()
+{
+ echo -n 1 > $DIR/trigger_batched_requests_async 2> /dev/null
+}
+
+config_set_read_fw_idx()
+{
+ echo -n $1 > $DIR/config_read_fw_idx 2> /dev/null
+}
+
+# Check that request slots 0..3 of the last batched request all hold the
+# expected firmware image.
+#   $1 - test mode; with "xzonly" the reference file is "<file>-orig".
+# The reference is $FW_INTO_BUF when config_into_buf reads "1", else $FW.
+# Exits the script with status 1 on the first mismatch.
+read_firmwares()
+{
+	# Keep the working variables local so the callers' own "i" loop
+	# counters are never overwritten.
+	local fwfile i
+
+	if [ "$(cat "$DIR"/config_into_buf)" = "1" ]; then
+		fwfile="$FW_INTO_BUF"
+	else
+		fwfile="$FW"
+	fi
+	if [ "$1" = "xzonly" ]; then
+		fwfile="${fwfile}-orig"
+	fi
+	for i in $(seq 0 3); do
+		config_set_read_fw_idx "$i"
+		# Verify the contents are what we expect.
+		# -Z required for now -- check for yourself, md5sum
+		# on $FW and DIR/read_firmware will yield the same. Even
+		# cmp agrees, so something is off.
+		if ! diff -q -Z "$fwfile" "$DIR"/read_firmware 2>/dev/null ; then
+			echo "request #$i: firmware was not loaded" >&2
+			exit 1
+		fi
+	done
+}
+
+# Check that request slots 0..3 hold the expected slice of the firmware.
+#   $1 - test mode; with "xzonly" the reference file is "<file>-orig".
+#   $2 - offset into the reference file
+#   $3 - number of bytes expected
+# Exits the script with status 1 on the first mismatch.
+read_partial_firmwares()
+{
+	local fwfile partial_data read_firmware i
+
+	if [ "$(cat "$DIR"/config_into_buf)" = "1" ]; then
+		fwfile="${FW_INTO_BUF}"
+	else
+		fwfile="${FW}"
+	fi
+
+	if [ "$1" = "xzonly" ]; then
+		fwfile="${fwfile}-orig"
+	fi
+
+	# Strip fwfile down to match partial offset and length
+	partial_data="$(cat "$fwfile")"
+	partial_data="${partial_data:$2:$3}"
+
+	for i in $(seq 0 3); do
+		config_set_read_fw_idx "$i"
+
+		read_firmware="$(cat "$DIR"/read_firmware)"
+
+		# Verify the contents are what we expect.  Both sides must be
+		# quoted: unquoted, an empty read makes "[" fail with a syntax
+		# error, which the "if" would treat as "contents match".
+		if [ "$read_firmware" != "$partial_data" ]; then
+			echo "request #$i: partial firmware was not loaded" >&2
+			exit 1
+		fi
+	done
+}
+
+# Check that none of request slots 0..3 matches $FW; used after requesting
+# a firmware name that does not exist.
+# Exits the script with status 1 if any slot matches.
+read_firmwares_expect_nofile()
+{
+	# Local so the callers' "i" loop counters are not overwritten.
+	local i
+
+	for i in $(seq 0 3); do
+		config_set_read_fw_idx "$i"
+		# Ensures contents differ
+		if diff -q -Z "$FW" "$DIR"/read_firmware 2>/dev/null ; then
+			echo "request $i: file was not expected to match" >&2
+			exit 1
+		fi
+	done
+}
+
+test_batched_request_firmware_nofile()
+{
+ echo -n "Batched request_firmware() nofile try #$1: "
+ config_reset
+ config_set_name nope-test-firmware.bin
+ config_trigger_sync
+ read_firmwares_expect_nofile
+ release_all_firmware
+ echo "OK"
+}
+
+test_batched_request_firmware_into_buf_nofile()
+{
+ echo -n "Batched request_firmware_into_buf() nofile try #$1: "
+ config_reset
+ config_set_name nope-test-firmware.bin
+ config_set_into_buf
+ config_trigger_sync
+ read_firmwares_expect_nofile
+ release_all_firmware
+ echo "OK"
+}
+
+test_request_partial_firmware_into_buf_nofile()
+{
+ echo -n "Test request_partial_firmware_into_buf() off=$1 size=$2 nofile: "
+ config_reset
+ config_set_name nope-test-firmware.bin
+ config_set_into_buf
+ config_set_partial
+ config_set_buf_size $2
+ config_set_file_offset $1
+ config_trigger_sync
+ read_firmwares_expect_nofile
+ release_all_firmware
+ echo "OK"
+}
+
+test_batched_request_firmware_direct_nofile()
+{
+ echo -n "Batched request_firmware_direct() nofile try #$1: "
+ config_reset
+ config_set_name nope-test-firmware.bin
+ config_set_sync_direct
+ config_trigger_sync
+ release_all_firmware
+ echo "OK"
+}
+
+test_request_firmware_nowait_uevent_nofile()
+{
+ echo -n "Batched request_firmware_nowait(uevent=true) nofile try #$1: "
+ config_reset
+ config_set_name nope-test-firmware.bin
+ config_trigger_async
+ release_all_firmware
+ echo "OK"
+}
+
+# Wait for the sysfs fallback "loading" file of firmware $1 to appear under
+# $DIR, then cancel the load by writing -1 to it.
+#   $1 - firmware name whose fallback interface is awaited
+# Does nothing when the user helper is not enabled.  Polls every 0.1s and
+# exits the script with status 1 after 10 polls without the file appearing.
+test_wait_and_cancel_custom_load()
+{
+	if [ "$HAS_FW_LOADER_USER_HELPER" != "yes" ]; then
+		return
+	fi
+	local timeout=10
+	# Local so the name does not leak into the rest of the script.
+	local name=$1
+	while [ ! -e "$DIR"/"$name"/loading ]; do
+		sleep 0.1
+		timeout=$(( timeout - 1 ))
+		if [ "$timeout" -eq 0 ]; then
+			echo "firmware interface never appeared:" >&2
+			echo "$DIR/$name/loading" >&2
+			exit 1
+		fi
+	done
+	echo -1 >"$DIR"/"$name"/loading
+}
+
+# Batched request_firmware_nowait() with uevents disabled, asking for a
+# file name that does not exist on disk.
+#   $1 - try number, only used in the progress message
+test_request_firmware_nowait_custom_nofile()
+{
+ echo -n "Batched request_firmware_nowait(uevent=false) nofile try #$1: "
+ config_reset
+ config_unset_uevent
+ # setup_random_file_fake yields a path whose file has been removed again
+ RANDOM_FILE_PATH=$(setup_random_file_fake)
+ RANDOM_FILE="$(basename $RANDOM_FILE_PATH)"
+ config_set_name $RANDOM_FILE
+ # The trigger runs in the background so that this shell can cancel the
+ # pending load in the meantime; "wait" then reaps the trigger.
+ config_trigger_async &
+ test_wait_and_cancel_custom_load $RANDOM_FILE
+ wait
+ release_all_firmware
+ echo "OK"
+}
+
+test_batched_request_firmware()
+{
+ echo -n "Batched request_firmware() $2 try #$1: "
+ config_reset
+ config_trigger_sync
+ read_firmwares $2
+ release_all_firmware
+ echo "OK"
+}
+
+test_batched_request_firmware_into_buf()
+{
+ echo -n "Batched request_firmware_into_buf() $2 try #$1: "
+ config_reset
+ config_set_name $TEST_FIRMWARE_INTO_BUF_FILENAME
+ config_set_into_buf
+ config_trigger_sync
+ read_firmwares $2
+ release_all_firmware
+ echo "OK"
+}
+
+test_batched_request_firmware_direct()
+{
+ echo -n "Batched request_firmware_direct() $2 try #$1: "
+ config_reset
+ config_set_sync_direct
+ config_trigger_sync
+ release_all_firmware
+ echo "OK"
+}
+
+test_request_firmware_nowait_uevent()
+{
+ echo -n "Batched request_firmware_nowait(uevent=true) $2 try #$1: "
+ config_reset
+ config_trigger_async
+ release_all_firmware
+ echo "OK"
+}
+
+test_request_firmware_nowait_custom()
+{
+ echo -n "Batched request_firmware_nowait(uevent=false) $2 try #$1: "
+ config_reset
+ config_unset_uevent
+ RANDOM_FILE_PATH=$(setup_random_file)
+ RANDOM_FILE="$(basename $RANDOM_FILE_PATH)"
+ if [ "$2" = "both" ]; then
+ xz -9 -C crc32 -k $RANDOM_FILE_PATH
+ elif [ "$2" = "xzonly" ]; then
+ xz -9 -C crc32 $RANDOM_FILE_PATH
+ fi
+ config_set_name $RANDOM_FILE
+ config_trigger_async
+ release_all_firmware
+ echo "OK"
+}
+
+test_request_partial_firmware_into_buf()
+{
+ echo -n "Test request_partial_firmware_into_buf() off=$1 size=$2: "
+ config_reset
+ config_set_name $TEST_FIRMWARE_INTO_BUF_FILENAME
+ config_set_into_buf
+ config_set_partial
+ config_set_buf_size $2
+ config_set_file_offset $1
+ config_trigger_sync
+ read_partial_firmwares normal $1 $2
+ release_all_firmware
+ echo "OK"
+}
+
+# Only continue if batched request triggers are present on the
+# test-firmware driver
+test_config_present
+
+# test with the file present
+echo
+echo "Testing with the file present..."
+for i in $(seq 1 5); do
+ test_batched_request_firmware $i normal
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_into_buf $i normal
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_direct $i normal
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_uevent $i normal
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_custom $i normal
+done
+
+# Partial loads cannot use fallback, so do not repeat tests.
+test_request_partial_firmware_into_buf 0 10
+test_request_partial_firmware_into_buf 0 5
+test_request_partial_firmware_into_buf 1 6
+test_request_partial_firmware_into_buf 2 10
+
+# Test for file not found, errors are expected, the failure would be
+# a hung task, which would require a hard reset.
+echo
+echo "Testing with the file missing..."
+for i in $(seq 1 5); do
+ test_batched_request_firmware_nofile $i
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_into_buf_nofile $i
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_direct_nofile $i
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_uevent_nofile $i
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_custom_nofile $i
+done
+
+# Partial loads cannot use fallback, so do not repeat tests.
+test_request_partial_firmware_into_buf_nofile 0 10
+test_request_partial_firmware_into_buf_nofile 0 5
+test_request_partial_firmware_into_buf_nofile 1 6
+test_request_partial_firmware_into_buf_nofile 2 10
+
+test "$HAS_FW_LOADER_COMPRESS" != "yes" && exit 0
+
+# test with both files present
+xz -9 -C crc32 -k $FW
+config_set_name $NAME
+echo
+echo "Testing with both plain and xz files present..."
+for i in $(seq 1 5); do
+ test_batched_request_firmware $i both
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_into_buf $i both
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_direct $i both
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_uevent $i both
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_custom $i both
+done
+
+# test with only xz file present
+mv "$FW" "${FW}-orig"
+echo
+echo "Testing with only xz file present..."
+for i in $(seq 1 5); do
+ test_batched_request_firmware $i xzonly
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_into_buf $i xzonly
+done
+
+for i in $(seq 1 5); do
+ test_batched_request_firmware_direct $i xzonly
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_uevent $i xzonly
+done
+
+for i in $(seq 1 5); do
+ test_request_firmware_nowait_custom $i xzonly
+done
+
+exit 0
diff --git a/tools/testing/selftests/firmware/fw_lib.sh b/tools/testing/selftests/firmware/fw_lib.sh
new file mode 100755
index 000000000..5b8c0fede
--- /dev/null
+++ b/tools/testing/selftests/firmware/fw_lib.sh
@@ -0,0 +1,223 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Library of helpers for test scripts.
+set -e
+
+DIR=/sys/devices/virtual/misc/test_firmware
+
+PROC_CONFIG="/proc/config.gz"
+TEST_DIR=$(dirname $0)
+
+# request_firmware_into_buf has to be tested with a different file.
+# The suspected cause is that loading the same filename both cached and
+# non-cached is mishandled.
+# To reproduce, rename this to test-firmware.bin
+TEST_FIRMWARE_INTO_BUF_FILENAME=test-firmware-into-buf.bin
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+print_reqs_exit()
+{
+ echo "You must have the following enabled in your kernel:" >&2
+ cat $TEST_DIR/config >&2
+ exit $ksft_skip
+}
+
+test_modprobe()
+{
+ if [ ! -d $DIR ]; then
+ print_reqs_exit
+ fi
+}
+
+check_mods()
+{
+ local uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "skip all tests: must be run as root" >&2
+ exit $ksft_skip
+ fi
+
+ trap "test_modprobe" EXIT
+ if [ ! -d $DIR ]; then
+ modprobe test_firmware
+ fi
+ if [ ! -f $PROC_CONFIG ]; then
+ if modprobe configs 2>/dev/null; then
+ echo "Loaded configs module"
+ if [ ! -f $PROC_CONFIG ]; then
+ echo "You must have the following enabled in your kernel:" >&2
+ cat $TEST_DIR/config >&2
+ echo "Resorting to old heuristics" >&2
+ fi
+ else
+ echo "Failed to load configs module, using old heuristics" >&2
+ fi
+ fi
+}
+
+check_setup()
+{
+ HAS_FW_LOADER_USER_HELPER="$(kconfig_has CONFIG_FW_LOADER_USER_HELPER=y)"
+ HAS_FW_LOADER_USER_HELPER_FALLBACK="$(kconfig_has CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y)"
+ HAS_FW_LOADER_COMPRESS="$(kconfig_has CONFIG_FW_LOADER_COMPRESS=y)"
+ PROC_FW_IGNORE_SYSFS_FALLBACK="0"
+ PROC_FW_FORCE_SYSFS_FALLBACK="0"
+
+ if [ -z $PROC_SYS_DIR ]; then
+ PROC_SYS_DIR="/proc/sys/kernel"
+ fi
+
+ FW_PROC="${PROC_SYS_DIR}/firmware_config"
+ FW_FORCE_SYSFS_FALLBACK="$FW_PROC/force_sysfs_fallback"
+ FW_IGNORE_SYSFS_FALLBACK="$FW_PROC/ignore_sysfs_fallback"
+
+ if [ -f $FW_FORCE_SYSFS_FALLBACK ]; then
+ PROC_FW_FORCE_SYSFS_FALLBACK="$(cat $FW_FORCE_SYSFS_FALLBACK)"
+ fi
+
+ if [ -f $FW_IGNORE_SYSFS_FALLBACK ]; then
+ PROC_FW_IGNORE_SYSFS_FALLBACK="$(cat $FW_IGNORE_SYSFS_FALLBACK)"
+ fi
+
+ if [ "$PROC_FW_FORCE_SYSFS_FALLBACK" = "1" ]; then
+ HAS_FW_LOADER_USER_HELPER="yes"
+ HAS_FW_LOADER_USER_HELPER_FALLBACK="yes"
+ fi
+
+ if [ "$PROC_FW_IGNORE_SYSFS_FALLBACK" = "1" ]; then
+ HAS_FW_LOADER_USER_HELPER_FALLBACK="no"
+ HAS_FW_LOADER_USER_HELPER="no"
+ fi
+
+ if [ "$HAS_FW_LOADER_USER_HELPER" = "yes" ]; then
+ OLD_TIMEOUT="$(cat /sys/class/firmware/timeout)"
+ fi
+
+ OLD_FWPATH="$(cat /sys/module/firmware_class/parameters/path)"
+
+ if [ "$HAS_FW_LOADER_COMPRESS" = "yes" ]; then
+ if ! which xz 2> /dev/null > /dev/null; then
+ HAS_FW_LOADER_COMPRESS=""
+ fi
+ fi
+}
+
+verify_reqs()
+{
+ if [ "$TEST_REQS_FW_SYSFS_FALLBACK" = "yes" ]; then
+ if [ ! "$HAS_FW_LOADER_USER_HELPER" = "yes" ]; then
+ echo "usermode helper disabled so ignoring test"
+ exit 0
+ fi
+ fi
+}
+
+setup_tmp_file()
+{
+ FWPATH=$(mktemp -d)
+ FW="$FWPATH/test-firmware.bin"
+ echo "ABCD0123" >"$FW"
+ FW_INTO_BUF="$FWPATH/$TEST_FIRMWARE_INTO_BUF_FILENAME"
+ echo "EFGH4567" >"$FW_INTO_BUF"
+ NAME=$(basename "$FW")
+ if [ "$TEST_REQS_FW_SET_CUSTOM_PATH" = "yes" ]; then
+ echo -n "$FWPATH" >/sys/module/firmware_class/parameters/path
+ fi
+}
+
+# Create a uniquely named file under $FWPATH and print its path.
+#   $1 - optional; with "fake" the file is removed again, so the printed
+#        path names a file that does not exist.  Otherwise the file is
+#        filled with the test pattern "ABCD0123".
+# Returns non-zero without printing anything when no file could be created.
+__setup_random_file()
+{
+	# An empty directory would make "mktemp -p" fall back to the default
+	# temporary directory, so refuse that explicitly.
+	[ -n "$FWPATH" ] || return 1
+	RANDOM_FILE_PATH="$(mktemp -p "$FWPATH")" || return 1
+	# mktemp says dry-run -n is unsafe, so...
+	if [[ "$1" = "fake" ]]; then
+		rm -rf -- "$RANDOM_FILE_PATH"
+		sync
+	else
+		echo "ABCD0123" >"$RANDOM_FILE_PATH"
+	fi
+	echo "$RANDOM_FILE_PATH"
+}
+
+setup_random_file()
+{
+ echo $(__setup_random_file)
+}
+
+setup_random_file_fake()
+{
+ echo $(__setup_random_file fake)
+}
+
+# Write $1 to the force_sysfs_fallback sysctl, if the kernel provides it,
+# and re-run check_setup so the HAS_* variables reflect the new setting.
+proc_set_force_sysfs_fallback()
+{
+	[ -f $FW_FORCE_SYSFS_FALLBACK ] || return 0
+	echo -n $1 > $FW_FORCE_SYSFS_FALLBACK
+	check_setup
+}
+
+# Write $1 to the ignore_sysfs_fallback sysctl, if the kernel provides it,
+# and re-run check_setup so the HAS_* variables reflect the new setting.
+proc_set_ignore_sysfs_fallback()
+{
+	[ -f $FW_IGNORE_SYSFS_FALLBACK ] || return 0
+	echo -n $1 > $FW_IGNORE_SYSFS_FALLBACK
+	check_setup
+}
+
+proc_restore_defaults()
+{
+ proc_set_force_sysfs_fallback 0
+ proc_set_ignore_sysfs_fallback 0
+}
+
+# Cleanup handler: restore the firmware timeout and search path recorded by
+# check_setup, remove the temporary firmware files and directory created by
+# setup_tmp_file, and reset the sysfs fallback sysctls.
+test_finish()
+{
+	if [ "$HAS_FW_LOADER_USER_HELPER" = "yes" ]; then
+		echo "$OLD_TIMEOUT" >/sys/class/firmware/timeout
+	fi
+	if [ "$TEST_REQS_FW_SET_CUSTOM_PATH" = "yes" ]; then
+		if [ "$OLD_FWPATH" = "" ]; then
+			# A zero-length write won't work; write a null byte
+			printf '\000' >/sys/module/firmware_class/parameters/path
+		else
+			echo -n "$OLD_FWPATH" >/sys/module/firmware_class/parameters/path
+		fi
+	fi
+	# The paths are quoted so an unset variable makes the test false
+	# (unquoted, "[ -f ]" is true) and paths with spaces stay one word.
+	if [ -f "$FW" ]; then
+		rm -f "$FW"
+	fi
+	if [ -f "$FW_INTO_BUF" ]; then
+		rm -f "$FW_INTO_BUF"
+	fi
+	if [ -d "$FWPATH" ]; then
+		rm -rf "$FWPATH"
+	fi
+	proc_restore_defaults
+}
+
+kconfig_has()
+{
+ if [ -f $PROC_CONFIG ]; then
+ if zgrep -q $1 $PROC_CONFIG 2>/dev/null; then
+ echo "yes"
+ else
+ echo "no"
+ fi
+ else
+ # We currently don't have easy heuristics to infer this
+ # so best we can do is just try to use the kernel assuming
+ # you had enabled it. This matches the old behaviour.
+ if [ "$1" = "CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y" ]; then
+ echo "yes"
+ elif [ "$1" = "CONFIG_FW_LOADER_USER_HELPER=y" ]; then
+ if [ -d /sys/class/firmware/ ]; then
+ echo yes
+ else
+ echo no
+ fi
+ fi
+ fi
+}
diff --git a/tools/testing/selftests/firmware/fw_namespace.c b/tools/testing/selftests/firmware/fw_namespace.c
new file mode 100644
index 000000000..817b2f1e8
--- /dev/null
+++ b/tools/testing/selftests/firmware/fw_namespace.c
@@ -0,0 +1,152 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test triggering of loading of firmware from different mount
+ * namespaces. Expect firmware to be always loaded from the mount
+ * namespace of PID 1. */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#ifndef CLONE_NEWNS
+# define CLONE_NEWNS 0x00020000
+#endif
+
+static char *fw_path = NULL;
+
+/*
+ * Print a printf-style message to stderr, clean up and terminate with
+ * EXIT_FAILURE. Cleanup removes the firmware file (when fw_path has been
+ * set) and unmounts /lib/firmware; the results of both calls are ignored.
+ */
+static void die(char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ if (fw_path)
+ unlink(fw_path);
+ umount("/lib/firmware");
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * Request firmware fw_name by writing the name to the trigger file
+ * sys_path. A failed open goes through die(); a failed or short write
+ * terminates the process with EXIT_FAILURE without further cleanup.
+ */
+static void trigger_fw(const char *fw_name, const char *sys_path)
+{
+	const size_t name_len = strlen(fw_name);
+	ssize_t written;
+	int fd = open(sys_path, O_WRONLY);
+
+	if (fd < 0)
+		die("open failed: %s\n", strerror(errno));
+
+	written = write(fd, fw_name, name_len);
+	if (written < 0 || (size_t)written != name_len)
+		exit(EXIT_FAILURE);
+
+	close(fd);
+}
+
+/*
+ * Create the firmware file fw_path (mode 0600) containing the eight bytes
+ * "ABCD0123" without a terminating NUL. Any failure goes through die().
+ */
+static void setup_fw(const char *fw_path)
+{
+	const char fw[] = "ABCD0123";
+	const size_t fw_len = sizeof(fw) - 1;	/* drop the trailing NUL */
+	ssize_t written;
+	int fd = open(fw_path, O_WRONLY | O_CREAT, 0600);
+
+	if (fd < 0)
+		die("open failed: %s\n", strerror(errno));
+
+	written = write(fd, fw, fw_len);
+	if (written < 0 || (size_t)written != fw_len)
+		die("write failed: %s\n", strerror(errno));
+
+	close(fd);
+}
+
+/*
+ * Trigger a firmware load from a child process that lives in its own
+ * mount namespace and report whether the trigger write succeeded.
+ *
+ * fw_name: firmware name written to the trigger file
+ * sys_path: path of the trigger file
+ * block_fw_in_parent_ns: when true, an empty read-only tmpfs is mounted
+ * over /lib/firmware in the calling namespace before forking and the
+ * child unmounts it in its own namespace; when false, the child mounts
+ * the empty read-only tmpfs in its own namespace instead.
+ *
+ * Returns true when the child exited with EXIT_SUCCESS, false otherwise.
+ * Setup errors terminate the process through die().
+ */
+static bool test_fw_in_ns(const char *fw_name, const char *sys_path, bool block_fw_in_parent_ns)
+{
+ pid_t child;
+
+ if (block_fw_in_parent_ns)
+ if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1)
+ die("blocking firmware in parent ns failed\n");
+
+ child = fork();
+ if (child == -1) {
+ die("fork failed: %s\n",
+ strerror(errno));
+ }
+ if (child != 0) { /* parent */
+ pid_t pid;
+ int status;
+
+ pid = waitpid(child, &status, 0);
+ if (pid == -1) {
+ die("waitpid failed: %s\n",
+ strerror(errno));
+ }
+ if (pid != child) {
+ die("waited for %d got %d\n",
+ child, pid);
+ }
+ if (!WIFEXITED(status)) {
+ die("child did not terminate cleanly\n");
+ }
+ /* undo the blocking mount made before the fork */
+ if (block_fw_in_parent_ns)
+ umount("/lib/firmware");
+ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+ }
+
+ /* child: everything below runs in a new mount namespace */
+ if (unshare(CLONE_NEWNS) != 0) {
+ die("unshare(CLONE_NEWNS) failed: %s\n",
+ strerror(errno));
+ }
+ if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) == -1)
+ die("remount root in child ns failed\n");
+
+ if (!block_fw_in_parent_ns) {
+ if (mount("test", "/lib/firmware", "tmpfs", MS_RDONLY, NULL) == -1)
+ die("blocking firmware in child ns failed\n");
+ } else
+ umount("/lib/firmware");
+
+ /* trigger_fw() exits with EXIT_FAILURE itself when the write fails */
+ trigger_fw(fw_name, sys_path);
+
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Usage: <program> sys_path
+ *
+ * sys_path is the trigger file the firmware name is written to. The
+ * program mounts a tmpfs on /lib/firmware, creates test-firmware.bin in
+ * it and runs one positive and one negative namespace test. It exits
+ * with EXIT_SUCCESS when both behave as expected; every failure goes
+ * through die().
+ */
+int main(int argc, char **argv)
+{
+ const char *fw_name = "test-firmware.bin";
+ char *sys_path;
+ if (argc != 2)
+ die("usage: %s sys_path\n", argv[0]);
+
+ /* Mount tmpfs to /lib/firmware so we don't have to assume
+ that it is writable for us.*/
+ if (mount("test", "/lib/firmware", "tmpfs", 0, NULL) == -1)
+ die("mounting tmpfs to /lib/firmware failed\n");
+
+ sys_path = argv[1];
+ /* fw_path is the file-scope pointer that die() uses for cleanup */
+ if (asprintf(&fw_path, "/lib/firmware/%s", fw_name) < 0)
+ die("error: failed to build full fw_path\n");
+
+ setup_fw(fw_path);
+
+ /* unbuffered stdout */
+ setvbuf(stdout, NULL, _IONBF, 0);
+ /* Positive case: firmware in PID1 mount namespace */
+ printf("Testing with firmware in parent namespace (assumed to be same file system as PID1)\n");
+ if (!test_fw_in_ns(fw_name, sys_path, false))
+ die("error: failed to access firmware\n");
+
+ /* Negative case: firmware in child mount namespace, expected to fail */
+ printf("Testing with firmware in child namespace\n");
+ if (test_fw_in_ns(fw_name, sys_path, true))
+ die("error: firmware access did not fail\n");
+
+ unlink(fw_path);
+ free(fw_path);
+ umount("/lib/firmware");
+ exit(EXIT_SUCCESS);
+}
diff --git a/tools/testing/selftests/firmware/fw_run_tests.sh b/tools/testing/selftests/firmware/fw_run_tests.sh
new file mode 100755
index 000000000..777377078
--- /dev/null
+++ b/tools/testing/selftests/firmware/fw_run_tests.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This runs all known tests across all known possible configurations we could
+# emulate in one run.
+
+set -e
+
+TEST_DIR=$(dirname $0)
+source $TEST_DIR/fw_lib.sh
+
+export HAS_FW_LOADER_USER_HELPER=""
+export HAS_FW_LOADER_USER_HELPER_FALLBACK=""
+export HAS_FW_LOADER_COMPRESS=""
+
+# Run the filesystem and fallback test scripts, in that order, re-applying
+# the requested sysctl settings before each one.
+#   $1 - value for force_sysfs_fallback
+#   $2 - value for ignore_sysfs_fallback
+run_tests()
+{
+	local script
+
+	for script in fw_filesystem.sh fw_fallback.sh; do
+		proc_set_force_sysfs_fallback $1
+		proc_set_ignore_sysfs_fallback $2
+		$TEST_DIR/$script
+	done
+}
+
+run_test_config_0001()
+{
+ echo "-----------------------------------------------------"
+ echo "Running kernel configuration test 1 -- rare"
+ echo "Emulates:"
+ echo "CONFIG_FW_LOADER=y"
+ echo "CONFIG_FW_LOADER_USER_HELPER=n"
+ echo "CONFIG_FW_LOADER_USER_HELPER_FALLBACK=n"
+ run_tests 0 1
+}
+
+run_test_config_0002()
+{
+ echo "-----------------------------------------------------"
+ echo "Running kernel configuration test 2 -- distro"
+ echo "Emulates:"
+ echo "CONFIG_FW_LOADER=y"
+ echo "CONFIG_FW_LOADER_USER_HELPER=y"
+ echo "CONFIG_FW_LOADER_USER_HELPER_FALLBACK=n"
+ proc_set_ignore_sysfs_fallback 0
+ run_tests 0 0
+}
+
+run_test_config_0003()
+{
+ echo "-----------------------------------------------------"
+ echo "Running kernel configuration test 3 -- android"
+ echo "Emulates:"
+ echo "CONFIG_FW_LOADER=y"
+ echo "CONFIG_FW_LOADER_USER_HELPER=y"
+ echo "CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y"
+ run_tests 1 0
+}
+
+check_mods
+check_setup
+
+echo "Running namespace test: "
+$TEST_DIR/fw_namespace $DIR/trigger_request
+echo "OK"
+
+if [ -f $FW_FORCE_SYSFS_FALLBACK ]; then
+ run_test_config_0001
+ run_test_config_0002
+ run_test_config_0003
+else
+ echo "Running basic kernel configuration, working with your config"
+ run_tests
+fi
diff --git a/tools/testing/selftests/firmware/settings b/tools/testing/selftests/firmware/settings
new file mode 100644
index 000000000..085e664ee
--- /dev/null
+++ b/tools/testing/selftests/firmware/settings
@@ -0,0 +1,8 @@
+# The async firmware timeout is set to 1 second (but ends up being effectively
+# 2 seconds). There are 3 test configs, each done with and without firmware
+# present, each with 2 "nowait" functions tested 5 times. Expected time for a
+# normal execution should be 2 * 3 * 2 * 2 * 5 = 120 seconds for those alone.
+# Additionally, fw_fallback may take 5 seconds for internal timeouts in each
+# of the 3 configs, so at least another 15 seconds are needed. Add another
+# 10 seconds for each testing config: 120 + 15 + 30
+timeout=165
diff --git a/tools/testing/selftests/fpu/.gitignore b/tools/testing/selftests/fpu/.gitignore
new file mode 100644
index 000000000..d6d12ac1d
--- /dev/null
+++ b/tools/testing/selftests/fpu/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0+
+test_fpu
diff --git a/tools/testing/selftests/fpu/Makefile b/tools/testing/selftests/fpu/Makefile
new file mode 100644
index 000000000..ea62c176e
--- /dev/null
+++ b/tools/testing/selftests/fpu/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+LDLIBS := -lm
+
+TEST_GEN_PROGS := test_fpu
+
+TEST_PROGS := run_test_fpu.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/fpu/run_test_fpu.sh b/tools/testing/selftests/fpu/run_test_fpu.sh
new file mode 100755
index 000000000..d77be93ec
--- /dev/null
+++ b/tools/testing/selftests/fpu/run_test_fpu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Load kernel module for FPU tests
+
+uid=$(id -u)
+if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit 1
+fi
+
+if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit 4
+fi
+
+if ! modinfo test_fpu > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_FPU=m"
+ exit 4
+fi
+
+NR_CPUS=$(getconf _NPROCESSORS_ONLN)
+if [ ! $NR_CPUS ]; then
+ NR_CPUS=1
+fi
+
+modprobe test_fpu
+
+if [ ! -e /sys/kernel/debug/selftest_helpers/test_fpu ]; then
+ mount -t debugfs none /sys/kernel/debug
+
+ if [ ! -e /sys/kernel/debug/selftest_helpers/test_fpu ]; then
+ echo "$0: Error mounting debugfs"
+ exit 4
+ fi
+fi
+
+echo "Running 1000 iterations on all CPUs... "
+for i in $(seq 1 1000); do
+ for c in $(seq 1 $NR_CPUS); do
+ ./test_fpu &
+ done
+done
+
+rmmod test_fpu
diff --git a/tools/testing/selftests/fpu/test_fpu.c b/tools/testing/selftests/fpu/test_fpu.c
new file mode 100644
index 000000000..200238522
--- /dev/null
+++ b/tools/testing/selftests/fpu/test_fpu.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* This testcase operates with the test_fpu kernel driver.
+ * It modifies the FPU control register in user mode and calls the kernel
+ * module to perform floating point operations in the kernel. The control
+ * register value should be independent between kernel and user mode.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fenv.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+const char *test_fpu_path = "/sys/kernel/debug/selftest_helpers/test_fpu";
+
+/*
+ * Read from the test_fpu debugfs file three times: with the default
+ * rounding mode, with FE_DOWNWARD rounding, and with all FPU exceptions
+ * unmasked. After the last two reads, check that the user-mode setting is
+ * still in place.
+ *
+ * Returns 0 on success and also when the debugfs file cannot be opened
+ * (reported as [SKIP]); returns 1-5 to identify the failing step.
+ */
+int main(void)
+{
+ char dummy[1];
+ int fd = open(test_fpu_path, O_RDONLY);
+
+ if (fd < 0) {
+ printf("[SKIP]\tcan't access %s: %s\n",
+ test_fpu_path, strerror(errno));
+ return 0;
+ }
+
+ /* step 1: default rounding mode */
+ if (read(fd, dummy, 1) < 0) {
+ printf("[FAIL]\taccess with default rounding mode failed\n");
+ return 1;
+ }
+
+ /* steps 2-3: downward rounding must survive the read */
+ fesetround(FE_DOWNWARD);
+ if (read(fd, dummy, 1) < 0) {
+ printf("[FAIL]\taccess with downward rounding mode failed\n");
+ return 2;
+ }
+ if (fegetround() != FE_DOWNWARD) {
+ printf("[FAIL]\tusermode rounding mode clobbered\n");
+ return 3;
+ }
+
+ /* Note: the tests up to this point are quite safe and will only return
+ * an error. But the exception mask setting can cause misbehaving kernel
+ * to crash.
+ */
+ /* steps 4-5: the enabled-exception set must survive the read */
+ feclearexcept(FE_ALL_EXCEPT);
+ feenableexcept(FE_ALL_EXCEPT);
+ if (read(fd, dummy, 1) < 0) {
+ printf("[FAIL]\taccess with fpu exceptions unmasked failed\n");
+ return 4;
+ }
+ if (fegetexcept() != FE_ALL_EXCEPT) {
+ printf("[FAIL]\tusermode fpu exception mask clobbered\n");
+ return 5;
+ }
+
+ printf("[OK]\ttest_fpu\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/ftrace/.gitignore b/tools/testing/selftests/ftrace/.gitignore
new file mode 100644
index 000000000..2659417cb
--- /dev/null
+++ b/tools/testing/selftests/ftrace/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+logs
diff --git a/tools/testing/selftests/ftrace/Makefile b/tools/testing/selftests/ftrace/Makefile
new file mode 100644
index 000000000..d6e106fbc
--- /dev/null
+++ b/tools/testing/selftests/ftrace/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+TEST_PROGS := ftracetest
+TEST_FILES := test.d settings
+EXTRA_CLEAN := $(OUTPUT)/logs/*
+
+include ../lib.mk
diff --git a/tools/testing/selftests/ftrace/README b/tools/testing/selftests/ftrace/README
new file mode 100644
index 000000000..182e76fa4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/README
@@ -0,0 +1,82 @@
+Linux Ftrace Testcases
+
+This is a collection of testcases for the ftrace tracing feature in the Linux
+kernel. Since ftrace exports its interfaces via debugfs, we just need
+shell scripts for testing. Feel free to add new test cases.
+
+Running the ftrace testcases
+============================
+
+First, you need to be the root user to run this script.
+To run all testcases:
+
+ $ sudo ./ftracetest
+
+To run specific testcases:
+
+ # ./ftracetest test.d/basic3.tc
+
+Or you can also run testcases under given directory:
+
+ # ./ftracetest test.d/kprobe/
+
+Contributing new testcases
+==========================
+
+Copy test.d/template to your testcase (whose filename must have *.tc
+extension) and rewrite the test description line.
+
+ * The working directory of the script is <debugfs>/tracing/.
+
+ * Take care with side effects as the tests are run with root privilege.
+
+ * The tests should not run for a long period of time (more than 1 min.)
+ These are to be unit tests.
+
+ * You can add a directory for your testcases under test.d/ if needed.
+
+ * The test cases should run on dash (busybox shell) for testing on
+ minimal cross-build environments.
+
+ * Note that the tests are run with "set -e" (errexit) option. If any
+ command fails, the test will be terminated immediately.
+
+ * The tests can return some result codes instead of pass or fail by
+ using exit_unresolved, exit_untested, exit_unsupported and exit_xfail.
+
+Result code
+===========
+
+Ftracetest supports following result codes.
+
+ * PASS: The test succeeded as expected. The test which exits with 0 is
+ counted as passed test.
+
+ * FAIL: The test failed, but was expected to succeed. The test which exits
+ with !0 is counted as failed test.
+
+ * UNRESOLVED: The test produced unclear or intermediate results.
+ for example, the test was interrupted
+ or the test depends on a previous test, which failed.
+ or the test was set up incorrectly
+ A test in any of the above situations must call exit_unresolved.
+
+ * UNTESTED: The test was not run, currently just a placeholder.
+ In this case, the test must call exit_untested.
+
+ * UNSUPPORTED: The test failed because of lack of feature.
+ In this case, the test must call exit_unsupported.
+
+ * XFAIL: The test failed, and was expected to fail.
+ To return XFAIL, call exit_xfail from the test.
+
+There are some sample test scripts for result code under samples/.
+You can also run samples as below:
+
+ # ./ftracetest samples/
+
+TODO
+====
+
+ * Fancy colored output :)
+
diff --git a/tools/testing/selftests/ftrace/config b/tools/testing/selftests/ftrace/config
new file mode 100644
index 000000000..e59d985ee
--- /dev/null
+++ b/tools/testing/selftests/ftrace/config
@@ -0,0 +1,16 @@
+CONFIG_KPROBES=y
+CONFIG_FTRACE=y
+CONFIG_FUNCTION_PROFILER=y
+CONFIG_TRACER_SNAPSHOT=y
+CONFIG_STACK_TRACER=y
+CONFIG_HIST_TRIGGERS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_PREEMPT_TRACER=y
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_PREEMPTIRQ_DELAY_TEST=m
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_SAMPLES=y
+CONFIG_SAMPLE_FTRACE_DIRECT=m
+CONFIG_SAMPLE_TRACE_PRINTK=m
+CONFIG_KALLSYMS_ALL=y
diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest
new file mode 100755
index 000000000..55314cd19
--- /dev/null
+++ b/tools/testing/selftests/ftrace/ftracetest
@@ -0,0 +1,453 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# ftracetest - Ftrace test shell scripts
+#
+# Copyright (C) Hitachi Ltd., 2014
+# Written by Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
+#
+
+usage() { # errno [message] -- print the optional message and the help text, then exit with errno
+  [ -n "$2" ] && echo "$2" # quoted: the message may contain spaces or glob characters
+  echo "Usage: ftracetest [options] [testcase(s)] [testcase-directory(s)]"
+  echo " Options:"
+  echo " -h|--help Show help message"
+  echo " -k|--keep Keep passed test logs"
+  echo " -v|--verbose Increase verbosity of test messages"
+  echo " -vv Alias of -v -v (Show all results in stdout)"
+  echo " -vvv Alias of -v -v -v (Show all commands immediately)"
+  echo " --fail-unsupported Treat UNSUPPORTED as a failure"
+  echo " --fail-unresolved Treat UNRESOLVED as a failure"
+  echo " -d|--debug Debug mode (trace all shell commands)"
+  echo " -l|--logdir <dir> Save logs on the <dir>"
+  echo " If <dir> is -, all logs output in console only"
+  exit "$1"
+}
+
+# default error
+err_ret=1
+
+# kselftest skip code is 4
+err_skip=4
+
+# umount required
+UMOUNT_DIR=""
+
+# cgroup RT scheduling prevents chrt commands from succeeding, which
+# induces failures in test wakeup tests. Disable for the duration of
+# the tests.
+
+readonly sched_rt_runtime=/proc/sys/kernel/sched_rt_runtime_us
+
+sched_rt_runtime_orig=$(cat $sched_rt_runtime)
+
+setup() {
+ echo -1 > $sched_rt_runtime
+}
+
+cleanup() {
+ echo $sched_rt_runtime_orig > $sched_rt_runtime
+ if [ -n "${UMOUNT_DIR}" ]; then
+ umount ${UMOUNT_DIR} ||:
+ fi
+}
+
+errexit() { # message
+ echo "Error: $1" 1>&2
+ cleanup
+ exit $err_ret
+}
+
+# Ensuring user privilege
+if [ `id -u` -ne 0 ]; then
+ errexit "this must be run by root user"
+fi
+
+setup
+
+# Utilities
+absdir() { # file_path
+ (cd `dirname $1`; pwd)
+}
+
+abspath() {
+ echo `absdir $1`/`basename $1`
+}
+
+find_testcases() { #directory
+ echo `find $1 -name \*.tc | sort`
+}
+
+parse_opts() { # opts -- walk the command-line words and set the global option variables
+  local OPT_TEST_CASES=
+  local OPT_TEST_DIR=
+
+  while [ ! -z "$1" ]; do
+    case "$1" in
+    --help|-h)
+      usage 0
+    ;;
+    --keep|-k)
+      KEEP_LOG=1
+      shift 1
+    ;;
+    --verbose|-v|-vv|-vvv)
+      if [ $VERBOSE -eq -1 ]; then
+        usage 1 "--console can not use with --verbose" # usage takes errno first, then the message
+      fi
+      VERBOSE=$((VERBOSE + 1))
+      [ "$1" = '-vv' ] && VERBOSE=$((VERBOSE + 1))
+      [ "$1" = '-vvv' ] && VERBOSE=$((VERBOSE + 2))
+      shift 1
+    ;;
+    --console)
+      if [ $VERBOSE -ne 0 ]; then
+        usage 1 "--console can not use with --verbose" # usage takes errno first, then the message
+      fi
+      VERBOSE=-1 # -1 marks console mode; checked by the --verbose branch and by run_test
+      shift 1
+    ;;
+    --debug|-d)
+      DEBUG=1
+      shift 1
+    ;;
+    --stop-fail)
+      STOP_FAILURE=1
+      shift 1
+    ;;
+    --fail-unsupported)
+      UNSUPPORTED_RESULT=1
+      shift 1
+    ;;
+    --fail-unresolved)
+      UNRESOLVED_RESULT=1
+      shift 1
+    ;;
+    --logdir|-l)
+      [ -n "$2" ] || usage 1 "$1 requires a <dir> argument" # otherwise "shift 2" runs past the last word
+      LOG_DIR=$2; shift 2
+    ;;
+    *.tc)
+      if [ -f "$1" ]; then
+        OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`"
+        shift 1
+      else
+        usage 1 "$1 is not a testcase"
+      fi
+    ;;
+    *)
+      if [ -d "$1" ]; then
+        OPT_TEST_DIR=`abspath $1`
+        OPT_TEST_CASES="$OPT_TEST_CASES `find_testcases $OPT_TEST_DIR`"
+        shift 1
+      else
+        usage 1 "Invalid option ($1)"
+      fi
+    ;;
+    esac
+  done
+  if [ ! -z "$OPT_TEST_CASES" ]; then # explicit testcases/directories override the default list
+    TEST_CASES=$OPT_TEST_CASES
+  fi
+}
+
+# Parameters
+TRACING_DIR=`grep tracefs /proc/mounts | cut -f2 -d' ' | head -1`
+if [ -z "$TRACING_DIR" ]; then
+ DEBUGFS_DIR=`grep debugfs /proc/mounts | cut -f2 -d' ' | head -1`
+ if [ -z "$DEBUGFS_DIR" ]; then
+ # If tracefs exists, then so does /sys/kernel/tracing
+ if [ -d "/sys/kernel/tracing" ]; then
+ mount -t tracefs nodev /sys/kernel/tracing ||
+ errexit "Failed to mount /sys/kernel/tracing"
+ TRACING_DIR="/sys/kernel/tracing"
+ UMOUNT_DIR=${TRACING_DIR}
+ # If debugfs exists, then so does /sys/kernel/debug
+ elif [ -d "/sys/kernel/debug" ]; then
+ mount -t debugfs nodev /sys/kernel/debug ||
+ errexit "Failed to mount /sys/kernel/debug"
+ TRACING_DIR="/sys/kernel/debug/tracing"
+ UMOUNT_DIR=${TRACING_DIR}
+ else
+ err_ret=$err_skip
+ errexit "debugfs and tracefs are not configured in this kernel"
+ fi
+ else
+ TRACING_DIR="$DEBUGFS_DIR/tracing"
+ fi
+fi
+if [ ! -d "$TRACING_DIR" ]; then
+ err_ret=$err_skip
+ errexit "ftrace is not configured in this kernel"
+fi
+
+TOP_DIR=`absdir $0`
+TEST_DIR=$TOP_DIR/test.d
+TEST_CASES=`find_testcases $TEST_DIR`
+LOG_DIR=$TOP_DIR/logs/`date +%Y%m%d-%H%M%S`/
+KEEP_LOG=0
+DEBUG=0
+VERBOSE=0
+UNSUPPORTED_RESULT=0
+UNRESOLVED_RESULT=0
+STOP_FAILURE=0
+# Parse command-line options
+parse_opts $*
+
+[ $DEBUG -ne 0 ] && set -x
+
+# Verify parameters
+if [ -z "$TRACING_DIR" -o ! -d "$TRACING_DIR" ]; then
+ errexit "No ftrace directory found"
+fi
+
+# Preparing logs
+if [ "x$LOG_DIR" = "x-" ]; then
+ LOG_FILE=
+ date
+else
+ LOG_FILE=$LOG_DIR/ftracetest.log
+ mkdir -p $LOG_DIR || errexit "Failed to make a log directory: $LOG_DIR"
+ date > $LOG_FILE
+fi
+
+# Define text colors
+# Check available colors on the terminal, if any
+ncolors=`tput colors 2>/dev/null || echo 0`
+color_reset=
+color_red=
+color_green=
+color_blue=
+# If stdout exists and number of colors is eight or more, use them
+if [ -t 1 -a "$ncolors" -ge 8 ]; then
+ color_reset="\033[0m"
+ color_red="\033[31m"
+ color_green="\033[32m"
+ color_blue="\033[34m"
+fi
+
+strip_esc() {
+ # busybox sed implementation doesn't accept "\x1B", so use [:cntrl:] instead.
+ sed -E "s/[[:cntrl:]]\[([0-9]{1,2}(;[0-9]{1,2})?)?[m|K]//g"
+}
+
+prlog() { # messages
+ newline="\n"
+ if [ "$1" = "-n" ] ; then
+ newline=
+ shift
+ fi
+ printf "$*$newline"
+ [ "$LOG_FILE" ] && printf "$*$newline" | strip_esc >> $LOG_FILE
+}
+catlog() { #file
+ cat $1
+ [ "$LOG_FILE" ] && cat $1 | strip_esc >> $LOG_FILE
+}
+prlog "=== Ftrace unit tests ==="
+
+
+# Testcase management
+# Test result codes - Dejagnu extended code
+PASS=0 # The test succeeded.
+FAIL=1 # The test failed, but was expected to succeed.
+UNRESOLVED=2 # The test produced indeterminate results. (e.g. interrupted)
+UNTESTED=3 # The test was not run, currently just a placeholder.
+UNSUPPORTED=4 # The test failed because of lack of feature.
+XFAIL=5 # The test failed, and was expected to fail.
+
+# Accumulations
+PASSED_CASES=
+FAILED_CASES=
+UNRESOLVED_CASES=
+UNTESTED_CASES=
+UNSUPPORTED_CASES=
+XFAILED_CASES=
+UNDEFINED_CASES=
+TOTAL_RESULT=0
+
+INSTANCE=
+CASENO=0
+
+testcase() { # testfile
+ CASENO=$((CASENO+1))
+ desc=`grep "^#[ \t]*description:" $1 | cut -f2- -d:`
+ prlog -n "[$CASENO]$INSTANCE$desc"
+}
+
+checkreq() { # testfile
+ requires=`grep "^#[ \t]*requires:" $1 | cut -f2- -d:`
+ # Use eval to pass quoted-patterns correctly.
+ eval check_requires "$requires"
+}
+
+test_on_instance() { # testfile
+ grep -q "^#[ \t]*flags:.*instance" $1
+}
+
+eval_result() { # sigval
+ case $1 in
+ $PASS)
+ prlog " [${color_green}PASS${color_reset}]"
+ PASSED_CASES="$PASSED_CASES $CASENO"
+ return 0
+ ;;
+ $FAIL)
+ prlog " [${color_red}FAIL${color_reset}]"
+ FAILED_CASES="$FAILED_CASES $CASENO"
+ return 1 # this is a bug.
+ ;;
+ $UNRESOLVED)
+ prlog " [${color_blue}UNRESOLVED${color_reset}]"
+ UNRESOLVED_CASES="$UNRESOLVED_CASES $CASENO"
+ return $UNRESOLVED_RESULT # depends on use case
+ ;;
+ $UNTESTED)
+ prlog " [${color_blue}UNTESTED${color_reset}]"
+ UNTESTED_CASES="$UNTESTED_CASES $CASENO"
+ return 0
+ ;;
+ $UNSUPPORTED)
+ prlog " [${color_blue}UNSUPPORTED${color_reset}]"
+ UNSUPPORTED_CASES="$UNSUPPORTED_CASES $CASENO"
+ return $UNSUPPORTED_RESULT # depends on use case
+ ;;
+ $XFAIL)
+ prlog " [${color_green}XFAIL${color_reset}]"
+ XFAILED_CASES="$XFAILED_CASES $CASENO"
+ return 0
+ ;;
+ *)
+ prlog " [${color_blue}UNDEFINED${color_reset}]"
+ UNDEFINED_CASES="$UNDEFINED_CASES $CASENO"
+ return 1 # this must be a test bug
+ ;;
+ esac
+}
+
+# Signal handling for result codes
+SIG_RESULT=
+SIG_BASE=36 # Use realtime signals
+SIG_PID=$$
+
+exit_pass () {
+ exit 0
+}
+
+SIG_FAIL=$((SIG_BASE + FAIL))
+exit_fail () {
+ exit 1
+}
+trap 'SIG_RESULT=$FAIL' $SIG_FAIL
+
+SIG_UNRESOLVED=$((SIG_BASE + UNRESOLVED))
+exit_unresolved () {
+ kill -s $SIG_UNRESOLVED $SIG_PID
+ exit 0
+}
+trap 'SIG_RESULT=$UNRESOLVED' $SIG_UNRESOLVED
+
+SIG_UNTESTED=$((SIG_BASE + UNTESTED))
+exit_untested () {
+ kill -s $SIG_UNTESTED $SIG_PID
+ exit 0
+}
+trap 'SIG_RESULT=$UNTESTED' $SIG_UNTESTED
+
+SIG_UNSUPPORTED=$((SIG_BASE + UNSUPPORTED))
+exit_unsupported () {
+ kill -s $SIG_UNSUPPORTED $SIG_PID
+ exit 0
+}
+trap 'SIG_RESULT=$UNSUPPORTED' $SIG_UNSUPPORTED
+
+SIG_XFAIL=$((SIG_BASE + XFAIL))
+exit_xfail () {
+ kill -s $SIG_XFAIL $SIG_PID
+ exit 0
+}
+trap 'SIG_RESULT=$XFAIL' $SIG_XFAIL
+
+__run_test() { # testfile
+ # setup PID and PPID, $$ is not updated.
+ (cd $TRACING_DIR; read PID _ < /proc/self/stat; set -e; set -x;
+ checkreq $1; initialize_ftrace; . $1)
+ [ $? -ne 0 ] && kill -s $SIG_FAIL $SIG_PID
+}
+
+# Run one test case
+run_test() { # testfile
+ local testname=`basename $1`
+ testcase $1
+ if [ ! -z "$LOG_FILE" ] ; then
+ local testlog=`mktemp $LOG_DIR/${CASENO}-${testname}-log.XXXXXX`
+ else
+ local testlog=/proc/self/fd/1
+ fi
+ export TMPDIR=`mktemp -d /tmp/ftracetest-dir.XXXXXX`
+ export FTRACETEST_ROOT=$TOP_DIR
+ echo "execute$INSTANCE: "$1 > $testlog
+ SIG_RESULT=0
+ if [ $VERBOSE -eq -1 ]; then
+ __run_test $1
+ elif [ -z "$LOG_FILE" ]; then
+ __run_test $1 2>&1
+ elif [ $VERBOSE -ge 3 ]; then
+ __run_test $1 | tee -a $testlog 2>&1
+ elif [ $VERBOSE -eq 2 ]; then
+ __run_test $1 2>> $testlog | tee -a $testlog
+ else
+ __run_test $1 >> $testlog 2>&1
+ fi
+ eval_result $SIG_RESULT
+ if [ $? -eq 0 ]; then
+ # Remove test log if the test was done as it was expected.
+ [ $KEEP_LOG -eq 0 -a ! -z "$LOG_FILE" ] && rm $testlog
+ else
+ [ $VERBOSE -eq 1 -o $VERBOSE -eq 2 ] && catlog $testlog
+ TOTAL_RESULT=1
+ fi
+ rm -rf $TMPDIR
+}
+
+# load in the helper functions
+. $TEST_DIR/functions
+
+# Main loop: run every selected testcase against the top-level tracing directory
+for t in $TEST_CASES; do
+  run_test $t
+  if [ $STOP_FAILURE -ne 0 -a $TOTAL_RESULT -ne 0 ]; then
+    echo "A failure detected. Stop test."
+    cleanup; exit 1 # restore sched_rt_runtime_us and undo our mount before bailing out
+  fi
+done
+
+# Test on instance loop: rerun testcases flagged "instance" inside a temporary tracing instance
+INSTANCE=" (instance) "
+for t in $TEST_CASES; do
+  test_on_instance $t || continue
+  SAVED_TRACING_DIR=$TRACING_DIR
+  export TRACING_DIR=`mktemp -d $TRACING_DIR/instances/ftracetest.XXXXXX`
+  run_test $t
+  rmdir $TRACING_DIR
+  TRACING_DIR=$SAVED_TRACING_DIR
+  if [ $STOP_FAILURE -ne 0 -a $TOTAL_RESULT -ne 0 ]; then
+    echo "A failure detected. Stop test."
+    cleanup; exit 1 # the instance is already removed above; restore the global state too
+  fi
+done
+(cd $TRACING_DIR; initialize_ftrace) # for cleanup
+
+prlog ""
+prlog "# of passed: " `echo $PASSED_CASES | wc -w`
+prlog "# of failed: " `echo $FAILED_CASES | wc -w`
+prlog "# of unresolved: " `echo $UNRESOLVED_CASES | wc -w`
+prlog "# of untested: " `echo $UNTESTED_CASES | wc -w`
+prlog "# of unsupported: " `echo $UNSUPPORTED_CASES | wc -w`
+prlog "# of xfailed: " `echo $XFAILED_CASES | wc -w`
+prlog "# of undefined(test bug): " `echo $UNDEFINED_CASES | wc -w`
+
+cleanup
+
+# if no error, return 0
+exit $TOTAL_RESULT
diff --git a/tools/testing/selftests/ftrace/samples/fail.tc b/tools/testing/selftests/ftrace/samples/fail.tc
new file mode 100644
index 000000000..15e35b956
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/fail.tc
@@ -0,0 +1,4 @@
+#!/bin/sh
+# description: failure-case example
+cat non-exist-file
+echo "this is not executed"
diff --git a/tools/testing/selftests/ftrace/samples/pass.tc b/tools/testing/selftests/ftrace/samples/pass.tc
new file mode 100644
index 000000000..d01549370
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/pass.tc
@@ -0,0 +1,3 @@
+#!/bin/sh
+# description: pass-case example
+return 0
diff --git a/tools/testing/selftests/ftrace/samples/unresolved.tc b/tools/testing/selftests/ftrace/samples/unresolved.tc
new file mode 100644
index 000000000..41e99d335
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/unresolved.tc
@@ -0,0 +1,4 @@
+#!/bin/sh
+# description: unresolved-case example
+trap exit_unresolved INT
+kill -INT $PID
diff --git a/tools/testing/selftests/ftrace/samples/unsupported.tc b/tools/testing/selftests/ftrace/samples/unsupported.tc
new file mode 100644
index 000000000..45910ff13
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/unsupported.tc
@@ -0,0 +1,3 @@
+#!/bin/sh
+# description: unsupported-case example
+exit_unsupported
diff --git a/tools/testing/selftests/ftrace/samples/untested.tc b/tools/testing/selftests/ftrace/samples/untested.tc
new file mode 100644
index 000000000..35a45946e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/untested.tc
@@ -0,0 +1,3 @@
+#!/bin/sh
+# description: untested-case example
+exit_untested
diff --git a/tools/testing/selftests/ftrace/samples/xfail.tc b/tools/testing/selftests/ftrace/samples/xfail.tc
new file mode 100644
index 000000000..9dd395323
--- /dev/null
+++ b/tools/testing/selftests/ftrace/samples/xfail.tc
@@ -0,0 +1,3 @@
+#!/bin/sh
+# description: xfail-case example
+cat non-exist-file || exit_xfail
diff --git a/tools/testing/selftests/ftrace/settings b/tools/testing/selftests/ftrace/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/ftrace/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic1.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic1.tc
new file mode 100644
index 000000000..9980ff14a
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/basic1.tc
@@ -0,0 +1,3 @@
+#!/bin/sh
+# description: Basic trace file check
+test -f README -a -f trace -a -f tracing_on -a -f trace_pipe
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc
new file mode 100644
index 000000000..531e47236
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/basic2.tc
@@ -0,0 +1,9 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic test for tracers
+# flags: instance
+test -f available_tracers
+for t in `cat available_tracers`; do
+ echo $t > current_tracer
+done
+echo nop > current_tracer
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc
new file mode 100644
index 000000000..58a2506f7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/basic3.tc
@@ -0,0 +1,10 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic trace clock test
+# flags: instance
+test -f trace_clock
+for c in `cat trace_clock | tr -d \[\]`; do
+ echo $c > trace_clock
+ grep '\['$c'\]' trace_clock
+done
+echo local > trace_clock
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/basic4.tc b/tools/testing/selftests/ftrace/test.d/00basic/basic4.tc
new file mode 100644
index 000000000..0696098d6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/basic4.tc
@@ -0,0 +1,5 @@
+#!/bin/sh
+# description: Basic event tracing check
+test -f available_events -a -f set_event -a -d events
+# check scheduler events are available
+grep -q sched available_events && exit_pass || exit_fail
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_size.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_size.tc
new file mode 100644
index 000000000..ab70f0077
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_size.tc
@@ -0,0 +1,22 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Change the ringbuffer size
+# flags: instance
+
+rb_size_test() {
+ORIG=`cat buffer_size_kb`
+
+expr $ORIG / 2 > buffer_size_kb
+
+expr $ORIG \* 2 > buffer_size_kb
+
+echo $ORIG > buffer_size_kb
+}
+
+rb_size_test
+
+: "If per-cpu buffer is supported, imbalance it"
+if [ -d per_cpu/cpu0 ]; then
+ cd per_cpu/cpu0
+ rb_size_test
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/snapshot.tc b/tools/testing/selftests/ftrace/test.d/00basic/snapshot.tc
new file mode 100644
index 000000000..13b4dabcf
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/snapshot.tc
@@ -0,0 +1,27 @@
+#!/bin/sh
+# description: Snapshot and tracing setting
+# requires: snapshot
+# flags: instance
+
+echo "Set tracing off"
+echo 0 > tracing_on
+
+echo "Allocate and take a snapshot"
+echo 1 > snapshot
+
+# Since trace buffer is empty, snapshot is also empty, but allocated
+grep -q "Snapshot is allocated" snapshot
+
+echo "Ensure keep tracing off"
+test `cat tracing_on` -eq 0
+
+echo "Set tracing on"
+echo 1 > tracing_on
+
+echo "Take a snapshot again"
+echo 1 > snapshot
+
+echo "Ensure keep tracing on"
+test `cat tracing_on` -eq 1
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_pipe.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_pipe.tc
new file mode 100644
index 000000000..435d07b13
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_pipe.tc
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: trace_pipe and trace_marker
+# requires: trace_marker
+# flags: instance
+
+echo "test input 1" > trace_marker
+
+: "trace interface never consume the ring buffer"
+grep -q "test input 1" trace
+grep -q "test input 1" trace
+
+: "trace_pipe interface must consume the ring buffer"
+head -n 1 trace_pipe | grep -q "test input 1"
+! grep -q "test input 1" trace
diff --git a/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc
new file mode 100644
index 000000000..d75a8695b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/direct/ftrace-direct.tc
@@ -0,0 +1,69 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test ftrace direct functions against tracers
+
+rmmod ftrace-direct ||:
+if ! modprobe ftrace-direct ; then
+ echo "No ftrace-direct sample module - please make CONFIG_SAMPLE_FTRACE_DIRECT=m"
+ exit_unresolved;
+fi
+
+echo "Let the module run a little"
+sleep 1
+
+grep -q "my_direct_func: waking up" trace
+
+rmmod ftrace-direct
+
+test_tracer() { # tracer -- load/unload ftrace-direct around the given tracer in all four orderings
+ tracer=$1
+
+ # tracer -> direct -> no direct -> no tracer
+ echo $tracer > current_tracer
+ modprobe ftrace-direct
+ rmmod ftrace-direct
+ echo nop > current_tracer
+
+ # tracer -> direct -> no tracer -> no direct
+ echo $tracer > current_tracer
+ modprobe ftrace-direct
+ echo nop > current_tracer
+ rmmod ftrace-direct
+
+ # direct -> tracer -> no tracer -> no direct
+ modprobe ftrace-direct
+ echo $tracer > current_tracer
+ echo nop > current_tracer
+ rmmod ftrace-direct
+
+ # direct -> tracer -> no direct -> no tracer
+ modprobe ftrace-direct
+ echo $tracer > current_tracer
+ rmmod ftrace-direct
+ echo nop > current_tracer
+}
+
+for t in `cat available_tracers`; do
+ if [ "$t" != "nop" ]; then
+ test_tracer $t
+ fi
+done
+
+echo nop > current_tracer
+rmmod ftrace-direct ||:
+
+# Now do the same thing with another direct function registered
+echo "Running with another ftrace direct function"
+
+rmmod ftrace-direct-too ||:
+modprobe ftrace-direct-too
+
+for t in `cat available_tracers`; do
+ if [ "$t" != "nop" ]; then
+ test_tracer $t
+ fi
+done
+
+echo nop > current_tracer
+rmmod ftrace-direct ||:
+rmmod ftrace-direct-too ||:
diff --git a/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc
new file mode 100644
index 000000000..e52e470a1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/direct/kprobe-direct.tc
@@ -0,0 +1,80 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test ftrace direct functions against kprobes
+# requires: kprobe_events
+
+rmmod ftrace-direct ||:
+if ! modprobe ftrace-direct ; then
+ echo "No ftrace-direct sample module - please build with CONFIG_SAMPLE_FTRACE_DIRECT=m"
+ exit_unresolved;
+fi
+
+echo "Let the module run a little"
+sleep 1
+
+grep -q "my_direct_func: waking up" trace
+
+rmmod ftrace-direct
+
+echo 'p:kwake wake_up_process task=$arg1' > kprobe_events
+
+start_direct() {
+ echo > trace
+ modprobe ftrace-direct
+ sleep 1
+ grep -q "my_direct_func: waking up" trace
+}
+
+stop_direct() {
+ rmmod ftrace-direct
+}
+
+enable_probe() {
+ echo > trace
+ echo 1 > events/kprobes/kwake/enable
+ sleep 1
+ grep -q "kwake:" trace
+}
+
+disable_probe() {
+ echo 0 > events/kprobes/kwake/enable
+}
+
+test_kprobes() { # enable the kwake probe and load ftrace-direct in all four orderings
+ # probe -> direct -> no direct -> no probe
+ enable_probe
+ start_direct
+ stop_direct
+ disable_probe
+
+ # probe -> direct -> no probe -> no direct
+ enable_probe
+ start_direct
+ disable_probe
+ stop_direct
+
+ # direct -> probe -> no probe -> no direct
+ start_direct
+ enable_probe
+ disable_probe
+ stop_direct
+
+ # direct -> probe -> no direct -> no probe
+ start_direct
+ enable_probe
+ stop_direct
+ disable_probe
+}
+
+test_kprobes
+
+# Now do this with a second registered direct function
+echo "Running with another ftrace direct function"
+
+modprobe ftrace-direct-too
+
+test_kprobes
+
+rmmod ftrace-direct-too
+
+echo > kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc
new file mode 100644
index 000000000..b4da41d12
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_kprobe.tc
@@ -0,0 +1,26 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - add/remove kprobe events
+# requires: dynamic_events "place: [<module>:]<symbol>":README "place (kretprobe): [<module>:]<symbol>":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+PLACE=$FUNCTION_FORK
+
+echo "p:myevent1 $PLACE" >> dynamic_events
+echo "r:myevent2 $PLACE" >> dynamic_events
+
+grep -q myevent1 dynamic_events
+grep -q myevent2 dynamic_events
+test -d events/kprobes/myevent1
+test -d events/kprobes/myevent2
+
+echo "-:myevent2" >> dynamic_events
+
+grep -q myevent1 dynamic_events
+! grep -q myevent2 dynamic_events
+
+echo > dynamic_events
+
+clear_trace
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_synth.tc b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_synth.tc
new file mode 100644
index 000000000..2b94611e1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/add_remove_synth.tc
@@ -0,0 +1,24 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - add/remove synthetic events
+# requires: dynamic_events "s:[synthetic/]":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+echo "s:latency1 u64 lat; pid_t pid;" >> dynamic_events
+echo "s:latency2 u64 lat; pid_t pid;" >> dynamic_events
+
+grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+test -d events/synthetic/latency1
+test -d events/synthetic/latency2
+
+echo "-:synthetic/latency2" >> dynamic_events
+
+grep -q latency1 dynamic_events
+! grep -q latency2 dynamic_events
+
+echo > dynamic_events
+
+clear_trace
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc
new file mode 100644
index 000000000..3a0e2885f
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/clear_select_events.tc
@@ -0,0 +1,41 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - selective clear (compatibility)
+# requires: dynamic_events kprobe_events synthetic_events "place: [<module>:]<symbol>":README "place (kretprobe): [<module>:]<symbol>":README "s:[synthetic/]":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+PLACE=$FUNCTION_FORK
+
+setup_events() {
+echo "p:myevent1 $PLACE" >> dynamic_events
+echo "s:latency1 u64 lat; pid_t pid;" >> dynamic_events
+echo "r:myevent2 $PLACE" >> dynamic_events
+echo "s:latency2 u64 lat; pid_t pid;" >> dynamic_events
+
+grep -q myevent1 dynamic_events
+grep -q myevent2 dynamic_events
+grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+}
+
+setup_events
+echo > synthetic_events
+
+grep -q myevent1 dynamic_events
+grep -q myevent2 dynamic_events
+! grep -q latency1 dynamic_events
+! grep -q latency2 dynamic_events
+
+echo > dynamic_events
+
+setup_events
+echo > kprobe_events
+
+! grep -q myevent1 dynamic_events
+! grep -q myevent2 dynamic_events
+grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+
+echo > dynamic_events
diff --git a/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc
new file mode 100644
index 000000000..d3e138e83
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/dynevent/generic_clear_event.tc
@@ -0,0 +1,43 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Generic dynamic event - generic clear event
+# requires: dynamic_events "place: [<module>:]<symbol>":README "place (kretprobe): [<module>:]<symbol>":README "s:[synthetic/]":README
+
+echo 0 > events/enable
+echo > dynamic_events
+
+PLACE=$FUNCTION_FORK
+
+setup_events() {
+echo "p:myevent1 $PLACE" >> dynamic_events
+echo "s:latency1 u64 lat; pid_t pid;" >> dynamic_events
+echo "r:myevent2 $PLACE" >> dynamic_events
+echo "s:latency2 u64 lat; pid_t pid;" >> dynamic_events
+
+grep -q myevent1 dynamic_events
+grep -q myevent2 dynamic_events
+grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+}
+
+setup_events
+
+echo "!p:myevent1 $PLACE" >> dynamic_events
+! grep -q myevent1 dynamic_events
+grep -q myevent2 dynamic_events
+grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+
+echo "!s:latency1 u64 lat; pid_t pid;" >> dynamic_events
+grep -q myevent2 dynamic_events
+! grep -q latency1 dynamic_events
+grep -q latency2 dynamic_events
+
+echo "!r:myevent2 $PLACE" >> dynamic_events
+! grep -q myevent2 dynamic_events
+grep -q latency2 dynamic_events
+
+echo "!s:latency2 u64 lat; pid_t pid;" >> dynamic_events
+! grep -q latency2 dynamic_events
+
+echo > dynamic_events
diff --git a/tools/testing/selftests/ftrace/test.d/event/event-enable.tc b/tools/testing/selftests/ftrace/test.d/event/event-enable.tc
new file mode 100644
index 000000000..cfe5bd2d4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/event-enable.tc
@@ -0,0 +1,48 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - enable/disable with event level files
+# requires: set_event events/sched
+# flags: instance
+
+do_reset() {
+ echo > set_event
+ clear_trace
+}
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo 'sched:sched_switch' > set_event
+
+yield
+
+count=`cat trace | grep sched_switch | wc -l`
+if [ $count -eq 0 ]; then
+ fail "sched_switch events are not recorded"
+fi
+
+do_reset
+
+echo 1 > events/sched/sched_switch/enable
+
+yield
+
+count=`cat trace | grep sched_switch | wc -l`
+if [ $count -eq 0 ]; then
+ fail "sched_switch events are not recorded"
+fi
+
+do_reset
+
+echo 0 > events/sched/sched_switch/enable
+
+yield
+
+count=`cat trace | grep sched_switch | wc -l`
+if [ $count -ne 0 ]; then
+ fail "sched_switch events should not be recorded"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc
new file mode 100644
index 000000000..9933ed24f
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/event-no-pid.tc
@@ -0,0 +1,123 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - restricts events based on pid notrace filtering
+# requires: set_event events/sched set_event_pid set_event_notrace_pid
+# flags: instance
+
+do_reset() {
+ echo > set_event
+ echo > set_event_pid
+ echo > set_event_notrace_pid
+ echo 0 > options/event-fork
+ echo 0 > events/enable
+ clear_trace
+ echo 1 > tracing_on
+}
+
+fail() { #msg
+ cat trace
+ do_reset
+ echo $1
+ exit_fail
+}
+
+count_pid() { # pid -- print the number of trace entries whose task pid is exactly pid
+    pid=$1
+    # sed reduces each entry to its pid field; -x stops e.g. 123 from also matching 1234
+    grep -v '^#' trace | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep -x "$pid" | wc -l
+}
+
+count_no_pid() { # pid -- print the number of trace entries whose task pid is anything but pid
+    pid=$1
+    # sed reduces each entry to its pid field; -x stops 123 from hiding unrelated pids like 1234
+    grep -v '^#' trace | sed -e 's/[^-]*-\([0-9]*\).*/\1/' | grep -v -x "$pid" | wc -l
+}
+
+enable_system() {
+ system=$1
+
+ if [ -d events/$system ]; then
+ echo 1 > events/$system/enable
+ fi
+}
+
+enable_events() {
+ echo 0 > tracing_on
+ # Enable common groups of events, as all events can allow for
+ # events to be traced via scheduling that we don't care to test.
+ enable_system syscalls
+ enable_system rcu
+ enable_system block
+ enable_system exceptions
+ enable_system irq
+ enable_system net
+ enable_system power
+ enable_system signal
+ enable_system sock
+ enable_system timer
+ enable_system thermal
+ echo 1 > tracing_on
+}
+
+other_task() {
+ sleep .001 || usleep 1 || sleep 1
+}
+
+echo 0 > options/event-fork
+
+do_reset
+
+read mypid rest < /proc/self/stat
+
+echo $mypid > set_event_notrace_pid
+grep -q $mypid set_event_notrace_pid
+
+enable_events
+
+yield
+
+echo 0 > tracing_on
+
+cnt=`count_pid $mypid`
+if [ $cnt -ne 0 ]; then
+ fail "Filtered out task has events"
+fi
+
+cnt=`count_no_pid $mypid`
+if [ $cnt -eq 0 ]; then
+ fail "No other events were recorded"
+fi
+
+do_reset
+
+echo $mypid > set_event_notrace_pid
+echo 1 > options/event-fork
+
+enable_events
+
+yield &
+child=$!
+echo "child = $child"
+wait $child
+
+# Be sure some other events will happen for small systems (e.g. 1 core)
+other_task
+
+echo 0 > tracing_on
+
+cnt=`count_pid $mypid`
+if [ $cnt -ne 0 ]; then
+ fail "Filtered out task has events"
+fi
+
+cnt=`count_pid $child`
+if [ $cnt -ne 0 ]; then
+ fail "Child of filtered out taskhas events"
+fi
+
+cnt=`count_no_pid $mypid`
+if [ $cnt -eq 0 ]; then
+ fail "No other events were recorded"
+fi
+
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/event/event-pid.tc b/tools/testing/selftests/ftrace/test.d/event/event-pid.tc
new file mode 100644
index 000000000..7f5f97dff
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/event-pid.tc
@@ -0,0 +1,61 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - restricts events based on pid
+# requires: set_event set_event_pid events/sched
+# flags: instance
+
+do_reset() {
+ echo > set_event
+ echo > set_event_pid
+ echo 0 > options/event-fork
+ clear_trace
+}
+
+fail() { #msg
+ do_reset
+ echo $1
+ exit_fail
+}
+
+echo 0 > options/event-fork
+
+echo 1 > events/sched/sched_switch/enable
+
+yield
+
+count=`cat trace | grep sched_switch | wc -l`
+if [ $count -eq 0 ]; then
+ fail "sched_switch events are not recorded"
+fi
+
+do_reset
+
+read mypid rest < /proc/self/stat
+
+echo $mypid > set_event_pid
+grep -q $mypid set_event_pid
+echo 'sched:sched_switch' > set_event
+
+yield
+
+count=`cat trace | grep sched_switch | grep -v "pid=$mypid" | wc -l`
+if [ $count -ne 0 ]; then
+ fail "sched_switch events from other task are recorded"
+fi
+
+do_reset
+
+echo $mypid > set_event_pid
+echo 1 > options/event-fork
+echo 1 > events/sched/sched_switch/enable
+
+yield
+
+count=`cat trace | grep sched_switch | grep -v "pid=$mypid" | wc -l`
+if [ $count -eq 0 ]; then
+ fail "sched_switch events from other task are not recorded"
+fi
+
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
new file mode 100644
index 000000000..b1ede6249
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/subsystem-enable.tc
@@ -0,0 +1,48 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - enable/disable with subsystem level files
+# requires: set_event events/sched/enable
+# flags: instance
+
+do_reset() {
+ echo > set_event
+ clear_trace
+}
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo 'sched:*' > set_event
+
+yield
+
+count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+if [ $count -lt 3 ]; then
+ fail "at least fork, exec and exit events should be recorded"
+fi
+
+do_reset
+
+echo 1 > events/sched/enable
+
+yield
+
+count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+if [ $count -lt 3 ]; then
+ fail "at least fork, exec and exit events should be recorded"
+fi
+
+do_reset
+
+echo 0 > events/sched/enable
+
+yield
+
+count=`cat trace | grep -v ^# | awk '{ print $5 }' | sort -u | wc -l`
+if [ $count -ne 0 ]; then
+ fail "any of scheduler events should not be recorded"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
new file mode 100644
index 000000000..93c10ea42
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/toplevel-enable.tc
@@ -0,0 +1,51 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event tracing - enable/disable with top level files
+# requires: available_events set_event events/enable
+
+do_reset() {
+ echo > set_event
+ clear_trace
+}
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo '*:*' > set_event
+
+yield
+
+echo 0 > tracing_on
+
+count=`head -n 128 trace | grep -v ^# | wc -l`
+if [ $count -eq 0 ]; then
+ fail "none of events are recorded"
+fi
+
+do_reset
+
+echo 1 > events/enable
+echo 1 > tracing_on
+
+yield
+
+echo 0 > tracing_on
+count=`head -n 128 trace | grep -v ^# | wc -l`
+if [ $count -eq 0 ]; then
+ fail "none of events are recorded"
+fi
+
+do_reset
+
+echo 0 > events/enable
+echo 1 > tracing_on # recording was stopped above; turn it back on so the check below can actually fail
+yield
+
+count=`cat trace | grep -v ^# | wc -l`
+if [ $count -ne 0 ]; then
+ fail "any of events should not be recorded"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/event/trace_printk.tc b/tools/testing/selftests/ftrace/test.d/event/trace_printk.tc
new file mode 100644
index 000000000..b02550b42
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/event/trace_printk.tc
@@ -0,0 +1,27 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test trace_printk from module
+
+rmmod trace-printk ||:
+if ! modprobe trace-printk ; then
+ echo "No trace-printk sample module - please make CONFIG_SAMPLE_TRACE_PRINTK=m"
+ exit_unresolved;
+fi
+
+echo "Waiting for irq work"
+sleep 1
+
+grep -q ": This .* trace_bputs" trace
+grep -q ": This .* trace_puts" trace
+grep -q ": This .* trace_bprintk" trace
+grep -q ": This .* trace_printk" trace
+
+grep -q ": (irq) .* trace_bputs" trace
+grep -q ": (irq) .* trace_puts" trace
+grep -q ": (irq) .* trace_bprintk" trace
+grep -q ": (irq) .* trace_printk" trace
+
+grep -q "This is a %s that will use trace_bprintk" printk_formats
+grep -q "(irq) This is a static string that will use trace_bputs" printk_formats
+
+rmmod trace-printk ||:
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc
new file mode 100644
index 000000000..cf3ea42b1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter-stack.tc
@@ -0,0 +1,73 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function graph filters with stack tracer
+# requires: stack_trace set_ftrace_filter function_graph:tracer
+
+# Make sure that function graph filtering works, and is not
+# affected by other tracers enabled (like stack tracer)
+
+do_reset() { # write 0 to the stack_tracer_enabled sysctl, if that file exists
+ if [ -e /proc/sys/kernel/stack_tracer_enabled ]; then
+ echo 0 > /proc/sys/kernel/stack_tracer_enabled
+ fi
+}
+
+fail() { # msg
+ do_reset
+ echo $1
+ exit_fail
+}
+
+disable_tracing
+clear_trace;
+
+# filter something, schedule is always good
+if ! echo "schedule" > set_ftrace_filter; then
+ # test for powerpc 64
+ if ! echo ".schedule" > set_ftrace_filter; then
+ fail "can not enable schedule filter"
+ fi
+fi
+
+echo function_graph > current_tracer
+
+echo "Now testing with stack tracer"
+
+echo 1 > /proc/sys/kernel/stack_tracer_enabled
+
+disable_tracing
+clear_trace
+enable_tracing
+sleep 1
+
+count=`cat trace | grep '()' | grep -v schedule | wc -l`
+
+if [ $count -ne 0 ]; then
+ fail "Graph filtering not working with stack tracer?"
+fi
+
+# Make sure we did find something
+count=`cat trace | grep 'schedule()' | wc -l`
+if [ $count -eq 0 ]; then
+ fail "No schedule traces found?"
+fi
+
+echo 0 > /proc/sys/kernel/stack_tracer_enabled
+clear_trace
+sleep 1
+
+
+count=`cat trace | grep '()' | grep -v schedule | wc -l`
+
+if [ $count -ne 0 ]; then
+ fail "Graph filtering not working after stack tracer disabled?"
+fi
+
+count=`cat trace | grep 'schedule()' | wc -l`
+if [ $count -eq 0 ]; then
+ fail "No schedule traces found?"
+fi
+
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc
new file mode 100644
index 000000000..b3ccdaec2
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-filter.tc
@@ -0,0 +1,40 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function graph filters
+# requires: set_ftrace_filter function_graph:tracer
+
+# Make sure that function graph filtering works
+
+fail() { # msg
+ echo $1
+ exit_fail
+}
+
+disable_tracing
+clear_trace
+
+# filter something, schedule is always good
+if ! echo "schedule" > set_ftrace_filter; then
+ # test for powerpc 64
+ if ! echo ".schedule" > set_ftrace_filter; then
+ fail "can not enable schedule filter"
+ fi
+fi
+
+echo function_graph > current_tracer
+enable_tracing
+sleep 1
+# search for functions (has "()" on the line), and make sure
+# that only the schedule function was found
+count=`cat trace | grep '()' | grep -v schedule | wc -l`
+if [ $count -ne 0 ]; then
+ fail "Graph filtering not working by itself?"
+fi
+
+# Make sure we did find something
+count=`cat trace | grep 'schedule()' | wc -l`
+if [ $count -eq 0 ]; then
+ fail "No schedule traces found?"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc
new file mode 100644
index 000000000..4b994b6df
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-glob.tc
@@ -0,0 +1,58 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function glob filters
+# requires: set_ftrace_filter function:tracer
+
+# Make sure that function glob matching filter works.
+
+disable_tracing
+clear_trace
+
+ftrace_filter_check() { # glob grep: write <glob> to set_ftrace_filter and compare it with available functions matching regex <grep>
+ echo "$1" > set_ftrace_filter
+ cut -f1 -d" " set_ftrace_filter > $TMPDIR/actual # keep only the first space-separated field of each line
+ cut -f1 -d" " available_filter_functions | grep "$2" > $TMPDIR/expected
+ DIFF=`diff $TMPDIR/actual $TMPDIR/expected`
+ test -z "$DIFF" # status is 0 only when diff printed nothing
+}
+
+# filter by *, front match
+ftrace_filter_check '*schedule' '^.*schedule$'
+
+# filter by *, middle match
+ftrace_filter_check '*schedule*' '^.*schedule.*$'
+
+# filter by *, end match
+ftrace_filter_check 'schedule*' '^schedule.*$'
+
+# filter by *mid*end
+ftrace_filter_check '*pin*lock' '.*pin.*lock$'
+
+# filter by start*mid*
+ftrace_filter_check 'mutex*try*' '^mutex.*try.*'
+
+# Advanced full-glob matching feature is recently supported.
+# Skip the tests if we are sure the kernel does not support it.
+if grep -q 'accepts: .* glob-matching-pattern' README ; then
+
+# filter by *, both side match
+ftrace_filter_check 'sch*ule' '^sch.*ule$'
+
+# filter by char class.
+ftrace_filter_check '[Ss]y[Ss]_*' '^[Ss]y[Ss]_.*$'
+
+# filter by ?, schedule is always good
+if ! echo "sch?dule" > set_ftrace_filter; then
+ # test for powerpc 64
+ if ! echo ".sch?dule" > set_ftrace_filter; then
+ echo "can not enable schedule filter"; exit_fail # this test file defines no fail() helper
+ fi
+ cat set_ftrace_filter | grep '^\.schedule$'
+else
+ cat set_ftrace_filter | grep '^schedule$'
+fi
+
+fi
+
+echo > set_ftrace_filter
+enable_tracing
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc
new file mode 100644
index 000000000..80541964b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-notrace-pid.tc
@@ -0,0 +1,94 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function pid notrace filters
+# requires: set_ftrace_notrace_pid set_ftrace_filter function:tracer
+# flags: instance
+
+# Make sure that function pid matching filter with notrace works.
+
+do_function_fork=1
+
+if [ ! -f options/function-fork ]; then
+ do_function_fork=0
+ echo "no option for function-fork found. Option will not be tested."
+fi
+
+read PID _ < /proc/self/stat
+
+if [ $do_function_fork -eq 1 ]; then
+ # default value of function-fork option
+ orig_value=`grep function-fork trace_options`
+fi
+
+do_reset() { # clear the PID filters written by do_test and restore the saved function-fork option
+ echo > set_ftrace_notrace_pid # always written by do_test, so always clear it
+ if [ $do_function_fork -eq 0 ]; then
+ return # without the function-fork option do_test changed nothing else
+ fi
+ echo > set_ftrace_pid # only written in the function-fork half of do_test
+ echo $orig_value > trace_options
+}
+
+fail() { # msg
+ do_reset
+ echo $1
+ exit_fail
+}
+
+do_test() {
+ disable_tracing
+
+ echo do_execve* > set_ftrace_filter
+ echo $FUNCTION_FORK >> set_ftrace_filter
+
+ echo $PID > set_ftrace_notrace_pid
+ echo function > current_tracer
+
+ if [ $do_function_fork -eq 1 ]; then
+ # don't allow children to be traced
+ echo nofunction-fork > trace_options
+ fi
+
+ enable_tracing
+ yield
+
+ count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+ count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+ # count_pid should be 0
+ if [ $count_pid -ne 0 -o $count_other -eq 0 ]; then
+ fail "PID filtering not working? traced task = $count_pid; other tasks = $count_other "
+ fi
+
+ disable_tracing
+ clear_trace
+
+ if [ $do_function_fork -eq 0 ]; then
+ return
+ fi
+
+ # allow children to be traced
+ echo function-fork > trace_options
+
+ # With pid in both set_ftrace_pid and set_ftrace_notrace_pid
+ # there should not be any tasks traced.
+
+ echo $PID > set_ftrace_pid
+
+ enable_tracing
+ yield
+
+ count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+ count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+ # both should be zero
+ if [ $count_pid -ne 0 -o $count_other -ne 0 ]; then
+ fail "PID filtering not following fork? traced task = $count_pid; other tasks = $count_other "
+ fi
+}
+
+do_test
+
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
new file mode 100644
index 000000000..2f7211254
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-pid.tc
@@ -0,0 +1,88 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function pid filters
+# requires: set_ftrace_pid set_ftrace_filter function:tracer
+# flags: instance
+
+# Make sure that function pid matching filter works.
+# Also test it on an instance directory
+
+do_function_fork=1
+
+if [ ! -f options/function-fork ]; then
+ do_function_fork=0
+ echo "no option for function-fork found. Option will not be tested."
+fi
+
+read PID _ < /proc/self/stat
+
+if [ $do_function_fork -eq 1 ]; then
+ # default value of function-fork option
+ orig_value=`grep function-fork trace_options`
+fi
+
+do_reset() { # clear the PID filter written by do_test and restore the saved function-fork option
+ echo > set_ftrace_pid # always written by do_test, so always clear it
+ if [ $do_function_fork -eq 0 ]; then
+ return # orig_value was never read, so there is no option to restore
+ fi
+ echo $orig_value > trace_options
+}
+
+fail() { # msg
+ do_reset
+ echo $1
+ exit_fail
+}
+
+do_test() {
+ disable_tracing
+
+ echo do_execve* > set_ftrace_filter
+ echo $FUNCTION_FORK >> set_ftrace_filter
+
+ echo $PID > set_ftrace_pid
+ echo function > current_tracer
+
+ if [ $do_function_fork -eq 1 ]; then
+ # don't allow children to be traced
+ echo nofunction-fork > trace_options
+ fi
+
+ enable_tracing
+ yield
+
+ count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+ count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+ # count_other should be 0
+ if [ $count_pid -eq 0 -o $count_other -ne 0 ]; then
+ fail "PID filtering not working?"
+ fi
+
+ disable_tracing
+ clear_trace
+
+ if [ $do_function_fork -eq 0 ]; then
+ return
+ fi
+
+ # allow children to be traced
+ echo function-fork > trace_options
+
+ enable_tracing
+ yield
+
+ count_pid=`cat trace | grep -v ^# | grep $PID | wc -l`
+ count_other=`cat trace | grep -v ^# | grep -v $PID | wc -l`
+
+ # count_other should NOT be 0
+ if [ $count_pid -eq 0 -o $count_other -eq 0 ]; then
+ fail "PID filtering not following fork?"
+ fi
+}
+
+do_test
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc
new file mode 100644
index 000000000..191d116b7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func-filter-stacktrace.tc
@@ -0,0 +1,13 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - stacktrace filter command
+# requires: set_ftrace_filter
+# flags: instance
+
+echo $FUNCTION_FORK:stacktrace >> set_ftrace_filter
+
+grep -q "$FUNCTION_FORK:stacktrace:unlimited" set_ftrace_filter
+
+(echo "forked"; sleep 1)
+
+grep -q "<stack trace>" trace
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_cpumask.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_cpumask.tc
new file mode 100644
index 000000000..0c6cf7725
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_cpumask.tc
@@ -0,0 +1,43 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function trace with cpumask
+# requires: function:tracer
+
+if ! which nproc ; then
+ nproc() {
+ ls -d /sys/devices/system/cpu/cpu[0-9]* | wc -l
+ }
+fi
+
+NP=`nproc`
+
+if [ $NP -eq 1 ] ;then
+ echo "We can not test cpumask on UP environment"
+ exit_unresolved
+fi
+
+ORIG_CPUMASK=`cat tracing_cpumask`
+
+do_reset() {
+ echo $ORIG_CPUMASK > tracing_cpumask
+}
+
+echo 0 > tracing_on
+echo > trace
+: "Bitmask only record on CPU1"
+echo 2 > tracing_cpumask
+MASK=0x`cat tracing_cpumask`
+test `printf "%d" $MASK` -eq 2 || do_reset
+
+echo function > current_tracer
+echo 1 > tracing_on
+(echo "forked")
+echo 0 > tracing_on
+
+: "Check CPU1 events are recorded"
+grep -q -e "\[001\]" trace || do_reset
+
+: "There should be No other cpu events"
+! grep -qv -e "\[001\]" -e "^#" trace || do_reset
+
+do_reset
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
new file mode 100644
index 000000000..d9b812795
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_event_triggers.tc
@@ -0,0 +1,123 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - test for function event triggers
+# flags: instance
+#
+# The triggers are set within the set_ftrace_filter file
+# requires: set_ftrace_filter
+#
+# Ftrace allows to add triggers to functions, such as enabling or disabling
+# tracing, enabling or disabling trace events, or recording a stack trace
+# within the ring buffer.
+#
+# This test is designed to test event triggers
+
+do_reset() { # run the filter, tracer and event reset helpers, clear the trace, turn tracing on
+ reset_ftrace_filter
+ reset_tracer
+ disable_events
+ clear_trace
+ enable_tracing
+}
+
+fail() { # mesg
+ echo $1
+ exit_fail
+}
+
+SLEEP_TIME=".1"
+
+echo "Testing function probes with events:"
+
+EVENT="sched:sched_switch"
+EVENT_ENABLE="events/sched/sched_switch/enable"
+
+cnt_trace() { # print the number of lines in trace that do not start with '#'
+ grep -v '^#' trace | wc -l
+}
+
+test_event_enabled() { # val: poll $EVENT_ENABLE until it reads val; call fail once the retries run out
+ val=$1
+ check_times=10 # wait for 10 * SLEEP_TIME at most
+
+ while [ $check_times -ne 0 ]; do
+ e=`cat $EVENT_ENABLE`
+ if [ "$e" = "$val" ]; then # quoted: values such as "1*" must not be pathname-expanded
+ return 0
+ fi
+ sleep $SLEEP_TIME
+ check_times=$((check_times - 1))
+ done
+
+ fail "Expected $val but found $e"
+}
+
+run_enable_disable() { # exercise the schedule:<enable>_event trigger, first unlimited and then with a count of 3
+ enable=$1 # enable
+ Enable=$2 # Enable
+ check_disable=$3 # 0
+ check_enable_star=$4 # 1*
+ check_disable_star=$5 # 0*
+
+ cnt=`cnt_trace`
+ if [ $cnt -ne 0 ]; then
+ fail "Found junk in trace file"
+ fi
+
+ echo "$Enable event all the time"
+
+ echo "$check_disable" > $EVENT_ENABLE
+ sleep $SLEEP_TIME
+
+ test_event_enabled "$check_disable"
+
+ echo "schedule:${enable}_event:$EVENT" > set_ftrace_filter
+ if [ -d ../../instances ]; then # Check instances
+ cur=`cat set_ftrace_filter`
+ top=`cat ../../set_ftrace_filter`
+ if [ "$cur" = "$top" ]; then
+ echo "This kernel is too old to support per instance filter"
+ reset_ftrace_filter
+ exit_unsupported
+ fi
+ fi
+
+ echo " make sure it works 5 times"
+
+ for i in `seq 5`; do
+ sleep $SLEEP_TIME
+ echo " test $i"
+ test_event_enabled "$check_enable_star" # quoted: "1*"/"0*" must reach the helper unexpanded
+
+ echo "$check_disable" > $EVENT_ENABLE
+ done
+ sleep $SLEEP_TIME
+ echo " make sure it's still works"
+ test_event_enabled "$check_enable_star"
+
+ reset_ftrace_filter
+
+ echo " make sure it only works 3 times"
+
+ echo "$check_disable" > $EVENT_ENABLE
+ sleep $SLEEP_TIME
+
+ echo "schedule:${enable}_event:$EVENT:3" > set_ftrace_filter
+
+ for i in `seq 3`; do
+ sleep $SLEEP_TIME
+ echo " test $i"
+ test_event_enabled "$check_enable_star"
+
+ echo "$check_disable" > $EVENT_ENABLE
+ done
+
+ sleep $SLEEP_TIME
+ echo " make sure it stop working"
+ test_event_enabled "$check_disable_star" # the count of 3 is used up, so the event must stay as written
+
+ do_reset
+}
+
+run_enable_disable enable Enable 0 "1*" "0*"
+run_enable_disable disable Disable 1 "0*" "1*"
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc
new file mode 100644
index 000000000..37c8feb90
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_mod_trace.tc
@@ -0,0 +1,23 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function trace on module
+# requires: set_ftrace_filter
+
+: "mod: allows to filter a non exist function"
+echo 'non_exist_func:mod:non_exist_module' > set_ftrace_filter
+grep -q "non_exist_func" set_ftrace_filter
+
+: "mod: on exist module"
+echo '*:mod:trace_printk' > set_ftrace_filter
+if ! modprobe trace-printk ; then
+ echo "No trace-printk sample module - please make" \
+ "CONFIG_SAMPLE_TRACE_PRINTK=m"
+ exit_unresolved;
+fi
+
+: "Wildcard should be resolved after loading module"
+grep -q "trace_printk_irq_work" set_ftrace_filter
+
+: "After removing the filter becomes empty"
+rmmod trace_printk
+test `cat set_ftrace_filter | wc -l` -eq 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_profile_stat.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_profile_stat.tc
new file mode 100644
index 000000000..4daeffb02
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_profile_stat.tc
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function profiling
+# requires: function_profile_enabled
+
+: "Enable function profile"
+echo 1 > function_profile_enabled
+
+: "Profile must be updated"
+cp trace_stat/function0 $TMPDIR/
+( echo "forked"; sleep 1 )
+: "diff returns 0 if there is no difference"
+! diff trace_stat/function0 $TMPDIR/function0
+
+echo 0 > function_profile_enabled
+
+: "Profile must NOT be updated"
+cp trace_stat/function0 $TMPDIR/
+( echo "forked"; sleep 1 )
+: "diff returns 0 if there is no difference"
+diff trace_stat/function0 $TMPDIR/function0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc
new file mode 100644
index 000000000..1dbd766c0
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_profiler.tc
@@ -0,0 +1,64 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - function profiler with function tracing
+# requires: function_profile_enabled set_ftrace_filter function_graph:tracer
+
+# There was a bug after a rewrite of the ftrace infrastructure that
+# caused the function_profiler not to be able to run with the function
+# tracer, because the function_profiler used the function_graph tracer
+# and it was assumed the two could not run simultaneously.
+#
+# There was another related bug where the solution to the first bug
+# broke the way filtering of the function tracer worked.
+#
+# This test triggers those bugs on those kernels.
+#
+# We need function_graph and profiling to run this test
+
+fail() { # mesg
+ echo $1
+ exit_fail
+}
+
+echo "Testing function tracer with profiler:"
+echo "enable function tracer"
+echo function > current_tracer
+echo "enable profiler"
+echo 1 > function_profile_enabled
+
+sleep 1
+
+echo "Now filter on just schedule"
+echo '*schedule' > set_ftrace_filter
+clear_trace
+
+echo "Now disable function profiler"
+echo 0 > function_profile_enabled
+
+sleep 1
+
+# make sure only schedule functions exist
+
+echo "testing if only schedule is being traced"
+if grep -v -e '^#' -e 'schedule' trace; then
+ fail "more than schedule was found"
+fi
+
+echo "Make sure schedule was traced"
+if ! grep -e 'schedule' trace > /dev/null; then
+ cat trace
+ fail "can not find schedule in trace"
+fi
+
+echo > set_ftrace_filter
+clear_trace
+
+sleep 1
+
+echo "make sure something other than scheduler is being traced"
+if ! grep -v -e '^#' -e 'schedule' trace > /dev/null; then
+ cat trace
+ fail "no other functions besides schedule was found"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
new file mode 100644
index 000000000..e96e279e0
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_set_ftrace_file.tc
@@ -0,0 +1,154 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - test reading of set_ftrace_filter
+#
+# The triggers are set within the set_ftrace_filter file
+# requires: set_ftrace_filter
+#
+# The set_ftrace_filter file of ftrace is used to list functions as well as
+# triggers (probes) attached to functions. The code to read this file is not
+# straight forward and has had various bugs in the past. This test is designed
+# to add functions and triggers to that file in various ways and read that
+# file in various ways (cat vs dd).
+#
+
+fail() { # mesg
+ echo $1
+ exit_fail
+}
+
+FILTER=set_ftrace_filter
+FUNC1="schedule"
+FUNC2="do_softirq"
+
+ALL_FUNCS="#### all functions enabled ####"
+
+test_func() { # text line: return 0 if <line> is ABSENT from <text>; else print <text> without it and return 1
+ if ! echo "$1" | grep -q "^$2\$"; then
+ return 0
+ fi
+ echo "$1" | grep -v "^$2\$"
+ return 1
+}
+
+check_set_ftrace_filter() { # expected-lines...: return 0 on a MISMATCH, 1 when all three reads hold exactly the expected lines
+ cat=`cat $FILTER`
+ dd1=`dd if=$FILTER bs=1 | grep -v -e 'records in' -e 'records out' -e 'bytes copied'` # same file read with 1-byte blocks
+ dd100=`dd if=$FILTER bs=100 | grep -v -e 'records in' -e 'records out' -e 'bytes copied'` # and with 100-byte blocks
+
+ echo "Testing '$@'"
+
+ while [ $# -gt 0 ]; do
+ echo "test $1"
+ if cat=`test_func "$cat" "$1"`; then # test_func status 0 means the expected line is missing
+ return 0
+ fi
+ if dd1=`test_func "$dd1" "$1"`; then # otherwise the variable now holds the lines not yet accounted for
+ return 0
+ fi
+ if dd100=`test_func "$dd100" "$1"`; then
+ return 0
+ fi
+ shift
+ done
+
+ if [ -n "$cat" ]; then # anything still left was not in the expected list
+ return 0
+ fi
+ if [ -n "$dd1" ]; then
+ return 0
+ fi
+ if [ -n "$dd100" ]; then
+ return 0
+ fi
+ return 1;
+}
+
+if check_set_ftrace_filter "$ALL_FUNCS"; then
+ fail "Expected only $ALL_FUNCS"
+fi
+
+echo "$FUNC1:traceoff" > set_ftrace_filter
+if check_set_ftrace_filter "$ALL_FUNCS" "$FUNC1:traceoff:unlimited"; then
+ fail "Expected $ALL_FUNCS and $FUNC1:traceoff:unlimited"
+fi
+
+echo "$FUNC1" > set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" "$FUNC1:traceoff:unlimited"; then
+ fail "Expected $FUNC1 and $FUNC1:traceoff:unlimited"
+fi
+
+echo "$FUNC2" >> set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" "$FUNC2" "$FUNC1:traceoff:unlimited"; then
+ fail "Expected $FUNC1 $FUNC2 and $FUNC1:traceoff:unlimited"
+fi
+
+echo "$FUNC2:traceoff" >> set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" "$FUNC2" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then
+ fail "Expected $FUNC1 $FUNC2 $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited"
+fi
+
+echo "$FUNC1" > set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then
+ fail "Expected $FUNC1 $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited"
+fi
+
+echo > set_ftrace_filter
+if check_set_ftrace_filter "$ALL_FUNCS" "$FUNC1:traceoff:unlimited" "$FUNC2:traceoff:unlimited"; then
+ fail "Expected $ALL_FUNCS $FUNC1:traceoff:unlimited and $FUNC2:traceoff:unlimited"
+fi
+
+reset_ftrace_filter
+
+if check_set_ftrace_filter "$ALL_FUNCS"; then
+ fail "Expected $ALL_FUNCS"
+fi
+
+echo "$FUNC1" > set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" ; then
+ fail "Expected $FUNC1"
+fi
+
+echo "$FUNC2" >> set_ftrace_filter
+if check_set_ftrace_filter "$FUNC1" "$FUNC2" ; then
+ fail "Expected $FUNC1 and $FUNC2"
+fi
+
+test_actual() { # Compares $TMPDIR/expected with set_ftrace_filter
+ cat set_ftrace_filter | grep -v '#' | cut -d' ' -f1 | cut -d':' -f1 | sort -u > $TMPDIR/actual # unique names: text before the first space and first ':'
+ DIFF=`diff $TMPDIR/actual $TMPDIR/expected`
+ test -z "$DIFF" # status is 0 only when diff printed nothing
+}
+
+# Set traceoff trigger for all functions with "lock" in their name
+cat available_filter_functions | cut -d' ' -f1 | grep 'lock' | sort -u > $TMPDIR/expected
+echo '*lock*:traceoff' > set_ftrace_filter
+test_actual
+
+# now remove all with 'try' in it, and end with lock
+grep -v 'try.*lock$' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!*try*lock:traceoff' >> set_ftrace_filter
+test_actual
+
+# remove all that start with "m" and end with "lock"
+grep -v '^m.*lock$' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!m*lock:traceoff' >> set_ftrace_filter
+test_actual
+
+# remove all that start with "c" and have "unlock"
+grep -v '^c.*unlock' $TMPDIR/expected > $TMPDIR/expected2
+mv $TMPDIR/expected2 $TMPDIR/expected
+echo '!c*unlock*:traceoff' >> set_ftrace_filter
+test_actual
+
+# clear all the rest
+> $TMPDIR/expected
+echo '!*:traceoff' >> set_ftrace_filter
+test_actual
+
+rm $TMPDIR/expected
+rm $TMPDIR/actual
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc
new file mode 100644
index 000000000..61264e422
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_stack_tracer.tc
@@ -0,0 +1,35 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - Max stack tracer
+# requires: stack_trace stack_trace_filter
+# Test the basic function of max-stack usage tracing
+
+echo > stack_trace_filter
+echo 0 > stack_max_size
+echo 1 > /proc/sys/kernel/stack_tracer_enabled
+
+: "Fork and wait for the first entry become !lock"
+timeout=10
+while [ $timeout -ne 0 ]; do
+ ( echo "forked" )
+ FL=`grep " 0)" stack_trace`
+ echo $FL | grep -q "lock" || break;
+ timeout=$((timeout - 1))
+done
+echo 0 > /proc/sys/kernel/stack_tracer_enabled
+
+echo '*lock*' > stack_trace_filter
+test `cat stack_trace_filter | wc -l` -eq `grep lock stack_trace_filter | wc -l`
+
+echo 0 > stack_max_size
+echo 1 > /proc/sys/kernel/stack_tracer_enabled
+
+: "Fork and always the first entry including lock"
+timeout=10
+while [ $timeout -ne 0 ]; do
+ ( echo "forked" )
+ FL=`grep " 0)" stack_trace`
+ echo $FL | grep -q "lock"
+ timeout=$((timeout - 1))
+done
+echo 0 > /proc/sys/kernel/stack_tracer_enabled
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
new file mode 100644
index 000000000..aee222895
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/func_traceonoff_triggers.tc
@@ -0,0 +1,172 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - test for function traceon/off triggers
+# flags: instance
+#
+# The triggers are set within the set_ftrace_filter file
+# requires: set_ftrace_filter
+#
+# Ftrace allows to add triggers to functions, such as enabling or disabling
+# tracing, enabling or disabling trace events, or recording a stack trace
+# within the ring buffer.
+#
+# This test is designed to test enabling and disabling tracing triggers
+#
+
+fail() { # mesg
+ echo $1
+ exit_fail
+}
+
+SLEEP_TIME=".1"
+
+echo "Testing function probes with enabling disabling tracing:"
+
+cnt_trace() { # print the number of lines in trace that do not start with '#'
+ grep -v '^#' trace | wc -l
+}
+
+echo '** DISABLE TRACING'
+disable_tracing
+clear_trace
+
+cnt=`cnt_trace`
+if [ $cnt -ne 0 ]; then
+ fail "Found junk in trace"
+fi
+
+
+echo '** ENABLE EVENTS'
+
+echo 1 > events/sched/enable
+
+echo '** ENABLE TRACING'
+enable_tracing
+
+cnt=`cnt_trace`
+if [ $cnt -eq 0 ]; then
+ fail "Nothing found in trace"
+fi
+
+# powerpc uses .schedule
+func="schedule"
+available_file=available_filter_functions # switched to the top-level list when the instance checks below match
+if [ -d ../../instances -a -f ../../available_filter_functions ]; then
+ available_file=../../available_filter_functions
+fi
+x=`grep '^\.schedule$' $available_file | wc -l`
+if [ "$x" -eq 1 ]; then
+ func=".schedule"
+fi
+
+echo '** SET TRACEOFF'
+
+echo "$func:traceoff" > set_ftrace_filter
+if [ -d ../../instances ]; then # Check instances
+ cur=`cat set_ftrace_filter`
+ top=`cat ../../set_ftrace_filter`
+ if [ "$cur" = "$top" ]; then
+ echo "This kernel is too old to support per instance filter"
+ reset_ftrace_filter
+ exit_unsupported
+ fi
+fi
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 1 ]; then
+ fail "Did not find traceoff trigger"
+fi
+
+cnt=`cnt_trace`
+sleep $SLEEP_TIME
+cnt2=`cnt_trace`
+
+if [ $cnt -ne $cnt2 ]; then
+ fail "Tracing is not stopped"
+fi
+
+on=`cat tracing_on`
+if [ $on != "0" ]; then
+ fail "Tracing is not off"
+fi
+
+csum1=`md5sum trace`
+sleep $SLEEP_TIME
+csum2=`md5sum trace`
+
+if [ "$csum1" != "$csum2" ]; then
+ fail "Tracing file is still changing"
+fi
+
+clear_trace
+
+cnt=`cnt_trace`
+if [ $cnt -ne 0 ]; then
+ fail "Tracing is still happening"
+fi
+
+echo "!$func:traceoff" >> set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 0 ]; then
+ fail "traceoff trigger still exists"
+fi
+
+on=`cat tracing_on`
+if [ $on != "0" ]; then
+ fail "Tracing is started again"
+fi
+
+echo "$func:traceon" > set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 1 ]; then
+ fail "traceon trigger not found"
+fi
+
+cnt=`cnt_trace`
+if [ $cnt -eq 0 ]; then
+ fail "Tracing did not start"
+fi
+
+on=`cat tracing_on`
+if [ $on != "1" ]; then
+ fail "Tracing was not enabled"
+fi
+
+
+echo "!$func:traceon" >> set_ftrace_filter
+
+cnt=`grep schedule set_ftrace_filter | wc -l`
+if [ $cnt -ne 0 ]; then
+ fail "traceon trigger still exists"
+fi
+
+check_sleep() { # val: sleep SLEEP_TIME, then call fail unless tracing_on reads back as val
+ val=$1
+ sleep $SLEEP_TIME
+ cat set_ftrace_filter # print the current set_ftrace_filter contents
+ on=`cat tracing_on`
+ if [ $on != "$val" ]; then
+ fail "Expected tracing_on to be $val, but it was $on"
+ fi
+}
+
+
+echo "$func:traceoff:3" > set_ftrace_filter
+check_sleep "0"
+echo 1 > tracing_on
+check_sleep "0"
+echo 1 > tracing_on
+check_sleep "0"
+echo 1 > tracing_on
+check_sleep "1"
+echo "!$func:traceoff:0" > set_ftrace_filter
+
+if grep -e traceon -e traceoff set_ftrace_filter; then
+ fail "Tracing on and off triggers still exist"
+fi
+
+disable_events
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc
new file mode 100644
index 000000000..6c190620d
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/ftrace/tracing-error-log.tc
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: ftrace - test tracing error log support
+# event tracing is currently the only ftrace tracer that uses the
+# tracing error_log, hence this check
+# requires: set_event error_log
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+ftrace_errlog_check 'event filter parse error' '((sig >= 10 && sig < 15) || dsig ^== 17) && comm != bash' 'events/signal/signal_generate/filter'
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions
new file mode 100644
index 000000000..0cee6b067
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/functions
@@ -0,0 +1,155 @@
+clear_trace() { # reset trace output
+ echo > trace
+}
+
+disable_tracing() { # stop trace recording
+ echo 0 > tracing_on
+}
+
+enable_tracing() { # start trace recording
+ echo 1 > tracing_on
+}
+
+reset_tracer() { # reset the current tracer
+ echo nop > current_tracer
+}
+
+reset_trigger_file() {
+ # remove action triggers first
+ grep -H ':on[^:]*(' $@ |
+ while read line; do
+ cmd=`echo $line | cut -f2- -d: | cut -f1 -d"["`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" >> $file
+ done
+ grep -Hv ^# $@ |
+ while read line; do
+ cmd=`echo $line | cut -f2- -d: | cut -f1 -d"["`
+ file=`echo $line | cut -f1 -d:`
+ echo "!$cmd" > $file
+ done
+}
+
+reset_trigger() { # reset all current setting triggers
+ if [ -d events/synthetic ]; then
+ reset_trigger_file events/synthetic/*/trigger
+ fi
+ reset_trigger_file events/*/*/trigger
+}
+
+reset_events_filter() { # reset all current setting filters
+ grep -v ^none events/*/*/filter |
+ while read line; do
+ echo 0 > `echo $line | cut -f1 -d:`
+ done
+}
+
+reset_ftrace_filter() { # reset all triggers in set_ftrace_filter
+ if [ ! -f set_ftrace_filter ]; then
+ return 0
+ fi
+ echo > set_ftrace_filter
+ grep -v '^#' set_ftrace_filter | while read t; do
+ tr=`echo $t | cut -d: -f2`
+ if [ "$tr" = "" ]; then
+ continue
+ fi
+ if ! grep -q "$t" set_ftrace_filter; then
+ continue;
+ fi
+ name=`echo $t | cut -d: -f1 | cut -d' ' -f1`
+ if [ $tr = "enable_event" -o $tr = "disable_event" ]; then
+ tr=`echo $t | cut -d: -f2-4`
+ limit=`echo $t | cut -d: -f5`
+ else
+ tr=`echo $t | cut -d: -f2`
+ limit=`echo $t | cut -d: -f3`
+ fi
+ if [ "$limit" != "unlimited" ]; then
+ tr="$tr:$limit"
+ fi
+ echo "!$name:$tr" > set_ftrace_filter
+ done
+}
+
+disable_events() {
+ echo 0 > events/enable
+}
+
+clear_synthetic_events() { # reset all current synthetic events
+ grep -v ^# synthetic_events |
+ while read line; do
+ echo "!$line" >> synthetic_events
+ done
+}
+
+initialize_ftrace() { # Reset ftrace to initial-state
+# As the initial state, ftrace will be set to nop tracer,
+# no events, no triggers, no filters, no function filters,
+# no probes, and tracing on.
+ disable_tracing
+ reset_tracer
+ reset_trigger
+ reset_events_filter
+ reset_ftrace_filter
+ disable_events
+ [ -f set_event_pid ] && echo > set_event_pid
+ [ -f set_ftrace_pid ] && echo > set_ftrace_pid
+ [ -f set_ftrace_notrace ] && echo > set_ftrace_notrace
+ [ -f set_graph_function ] && echo | tee set_graph_*
+ [ -f stack_trace_filter ] && echo > stack_trace_filter
+ [ -f kprobe_events ] && echo > kprobe_events
+ [ -f uprobe_events ] && echo > uprobe_events
+ [ -f synthetic_events ] && echo > synthetic_events
+ [ -f snapshot ] && echo 0 > snapshot
+ clear_trace
+ enable_tracing
+}
+
+check_requires() { # Check required files and tracers
+ for i in "$@" ; do
+ r=${i%:README} # differs from $i only for "<pattern>:README" entries
+ t=${i%:tracer} # differs from $i only for "<name>:tracer" entries
+ if [ "$t" != "$i" ]; then
+ if ! grep -wq "$t" available_tracers ; then
+ echo "Required tracer $t is not configured."
+ exit_unsupported
+ fi
+ elif [ "$r" != "$i" ]; then
+ if ! grep -Fq "$r" README ; then
+ echo "Required feature pattern \"$r\" is not in README."
+ exit_unsupported
+ fi
+ elif [ ! -e "$i" ]; then
+ echo "Required feature interface $i doesn't exist."
+ exit_unsupported
+ fi
+ done
+}
+
+LOCALHOST=127.0.0.1
+
+yield() {
+ ping $LOCALHOST -c 1 || sleep .001 || usleep 1 || sleep 1
+}
+
+# The fork function in the kernel was renamed from "_do_fork" to
+# "kernel_clone". As older tests should still work with older kernels
+# as well as newer kernels, check which version of fork is used on this
+# kernel so that the tests can use the fork function for the running kernel.
+FUNCTION_FORK=`(if grep '\bkernel_clone\b' /proc/kallsyms > /dev/null; then
+ echo kernel_clone; else echo '_do_fork'; fi)`
+
+# Since probe event command may include backslash, explicitly use printf "%s"
+# to NOT interpret it.
+ftrace_errlog_check() { # err-prefix command-with-error-pos-by-^ command-file
+ pos=$(printf "%s" "${2%^*}" | wc -c) # error position
+ command=$(printf "%s" "$2" | tr -d ^)
+ echo "Test command: $command"
+ echo > error_log
+ (! printf "%s" "$command" >> "$3" ) 2> /dev/null
+ grep "$1: error:" -A 3 error_log
+ N=$(tail -n 1 error_log | wc -c)
+ # " Command: " and "^\n" => 13
+ test $(expr 13 + $pos) -eq $N
+}
diff --git a/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc
new file mode 100644
index 000000000..42422e425
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/instances/instance-event.tc
@@ -0,0 +1,142 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test creation and deletion of trace instances while setting an event
+# requires: instances
+
+fail() { # mesg
+ rmdir foo 2>/dev/null
+ echo $1
+ set -e
+ exit_fail
+}
+
+cd instances
+
+# we don't want to fail on error
+set +e
+
+mkdir x
+rmdir x
+result=$?
+
+if [ $result -ne 0 ]; then
+ echo "instance rmdir not supported"
+ exit_unsupported
+fi
+
+instance_slam() {
+ while :; do
+ mkdir foo 2> /dev/null
+ rmdir foo 2> /dev/null
+ done
+}
+
+instance_read() {
+ while :; do
+ cat foo/trace 1> /dev/null 2>&1
+ done
+}
+
+instance_set() {
+ while :; do
+ echo 1 > foo/events/sched/sched_switch/enable
+ done 2> /dev/null
+}
+
+instance_slam &
+p1=$!
+echo $p1
+
+instance_set &
+p2=$!
+echo $p2
+
+instance_read &
+p3=$!
+echo $p3
+
+sleep 1
+
+kill -1 $p3
+kill -1 $p2
+kill -1 $p1
+
+echo "Wait for processes to finish"
+wait $p1 $p2 $p3
+echo "all processes finished, wait for cleanup"
+sleep 1
+
+mkdir foo
+ls foo > /dev/null
+rmdir foo
+if [ -d foo ]; then
+ fail "foo still exists"
+fi
+
+mkdir foo
+echo "schedule:enable_event:sched:sched_switch" > foo/set_ftrace_filter
+rmdir foo
+if [ -d foo ]; then
+ fail "foo still exists"
+fi
+if grep -q "schedule:enable_event:sched:sched_switch" ../set_ftrace_filter; then
+ echo "Older kernel detected. Cleanup filter"
+ echo '!schedule:enable_event:sched:sched_switch' > ../set_ftrace_filter
+fi
+
+instance_slam() {
+ while :; do
+ mkdir x
+ mkdir y
+ mkdir z
+ rmdir x
+ rmdir y
+ rmdir z
+ done 2>/dev/null
+}
+
+instance_slam &
+p1=$!
+echo $p1
+
+instance_slam &
+p2=$!
+echo $p2
+
+instance_slam &
+p3=$!
+echo $p3
+
+instance_slam &
+p4=$!
+echo $p4
+
+instance_slam &
+p5=$!
+echo $p5
+
+ls -lR >/dev/null
+sleep 1
+
+kill -1 $p1
+kill -1 $p2
+kill -1 $p3
+kill -1 $p4
+kill -1 $p5
+
+echo "Wait for processes to finish"
+wait $p1 $p2 $p3 $p4 $p5
+echo "all processes finished, wait for cleanup"
+
+mkdir x y z
+ls x y z
+rmdir x y z
+for d in x y z; do
+ if [ -d $d ]; then
+ fail "instance $d still exists"
+ fi
+done
+
+set -e
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/instances/instance.tc b/tools/testing/selftests/ftrace/test.d/instances/instance.tc
new file mode 100644
index 000000000..607521d25
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/instances/instance.tc
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test creation and deletion of trace instances
+# requires: instances
+
+fail() { # mesg
+ rmdir x y z 2>/dev/null
+ echo $1
+ set -e
+ exit_fail
+}
+
+cd instances
+
+# we don't want to fail on error
+set +e
+
+mkdir x
+rmdir x
+result=$?
+
+if [ $result -ne 0 ]; then
+ echo "instance rmdir not supported"
+ exit_unsupported
+fi
+
+instance_slam() {
+ while :; do
+ mkdir x
+ mkdir y
+ mkdir z
+ rmdir x
+ rmdir y
+ rmdir z
+ done 2>/dev/null
+}
+
+instance_slam &
+p1=$!
+echo $p1
+
+instance_slam &
+p2=$!
+echo $p2
+
+instance_slam &
+p3=$!
+echo $p3
+
+instance_slam &
+p4=$!
+echo $p4
+
+instance_slam &
+p5=$!
+echo $p5
+
+ls -lR >/dev/null
+sleep 1
+
+kill -1 $p1
+kill -1 $p2
+kill -1 $p3
+kill -1 $p4
+kill -1 $p5
+
+echo "Wait for processes to finish"
+wait $p1 $p2 $p3 $p4 $p5
+echo "all processes finished, wait for cleanup"
+
+mkdir x y z
+ls x y z
+rmdir x y z
+for d in x y z; do
+ if [ -d $d ]; then
+ fail "instance $d still exists"
+ fi
+done
+
+set -e
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc
new file mode 100644
index 000000000..2428a3ed7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/add_and_remove.tc
@@ -0,0 +1,9 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe dynamic event - adding and removing
+# requires: kprobe_events
+
+echo p:myevent $FUNCTION_FORK > kprobe_events
+grep myevent kprobe_events
+test -d events/kprobes/myevent
+echo > kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc
new file mode 100644
index 000000000..010a8b1d6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/busy_check.tc
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe dynamic event - busy event check
+# requires: kprobe_events
+
+echo p:myevent $FUNCTION_FORK > kprobe_events
+test -d events/kprobes/myevent
+echo 1 > events/kprobes/myevent/enable
+echo > kprobe_events && exit_fail # this must fail
+echo 0 > events/kprobes/myevent/enable
+echo > kprobe_events # this must succeed
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc
new file mode 100644
index 000000000..a96a1dc70
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe dynamic event with arguments
+# requires: kprobe_events
+
+echo "p:testprobe $FUNCTION_FORK \$stack \$stack0 +0(\$stack)" > kprobe_events
+grep testprobe kprobe_events | grep -q 'arg1=\$stack arg2=\$stack0 arg3=+0(\$stack)'
+test -d events/kprobes/testprobe
+
+echo 1 > events/kprobes/testprobe/enable
+( echo "forked")
+grep testprobe trace | grep "$FUNCTION_FORK" | \
+ grep -q 'arg1=0x[[:xdigit:]]* arg2=0x[[:xdigit:]]* arg3=0x[[:xdigit:]]*$'
+
+echo 0 > events/kprobes/testprobe/enable
+echo "-:testprobe" >> kprobe_events
+clear_trace
+test -d events/kprobes/testprobe && exit_fail || exit_pass
+
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc
new file mode 100644
index 000000000..a053ee2e7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_comm.tc
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event with comm arguments
+# requires: kprobe_events
+
+grep -A1 "fetcharg:" README | grep -q "\$comm" || exit_unsupported # this is too old
+
+echo "p:testprobe $FUNCTION_FORK comm=\$comm " > kprobe_events
+grep testprobe kprobe_events | grep -q 'comm=$comm'
+test -d events/kprobes/testprobe
+
+echo 1 > events/kprobes/testprobe/enable
+( echo "forked")
+grep testprobe trace | grep -q 'comm=".*"'
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
new file mode 100644
index 000000000..84285a6f6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_string.tc
@@ -0,0 +1,42 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event string type argument
+# requires: kprobe_events
+
+case `uname -m` in
+x86_64)
+ ARG1=%di
+;;
+i[3456]86)
+ ARG1=%ax
+;;
+aarch64)
+ ARG1=%x0
+;;
+arm*)
+ ARG1=%r0
+;;
+ppc64*)
+ ARG1=%r3
+;;
+ppc*)
+ ARG1=%r3
+;;
+*)
+ echo "Please implement other architecture here"
+ exit_untested
+esac
+
+: "Test get argument (1)"
+echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+echo "p:test $FUNCTION_FORK" >> kprobe_events
+grep -qe "testprobe.* arg1=\"test\"" trace
+
+echo 0 > events/kprobes/testprobe/enable
+: "Test get argument (2)"
+echo "p:testprobe tracefs_create_dir arg1=+0(${ARG1}):string arg2=+0(${ARG1}):string" > kprobe_events
+echo 1 > events/kprobes/testprobe/enable
+echo "p:test $FUNCTION_FORK" >> kprobe_events
+grep -qe "testprobe.* arg1=\"test\" arg2=\"test\"" trace
+
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc
new file mode 100644
index 000000000..717130ed4
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_symbol.tc
@@ -0,0 +1,38 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event symbol argument
+# requires: kprobe_events
+
+SYMBOL="linux_proc_banner"
+
+if [ ! -f /proc/kallsyms ]; then
+ echo "Can not check the target symbol - please enable CONFIG_KALLSYMS"
+ exit_unresolved
+elif ! grep "$SYMBOL\$" /proc/kallsyms; then
+ echo "Linux banner is not exported - please enable CONFIG_KALLSYMS_ALL"
+ exit_unresolved
+fi
+
+: "Test get basic types symbol argument"
+echo "p:testprobe_u $FUNCTION_FORK arg1=@linux_proc_banner:u64 arg2=@linux_proc_banner:u32 arg3=@linux_proc_banner:u16 arg4=@linux_proc_banner:u8" > kprobe_events
+echo "p:testprobe_s $FUNCTION_FORK arg1=@linux_proc_banner:s64 arg2=@linux_proc_banner:s32 arg3=@linux_proc_banner:s16 arg4=@linux_proc_banner:s8" >> kprobe_events
+if grep -q "x8/16/32/64" README; then
+ echo "p:testprobe_x $FUNCTION_FORK arg1=@linux_proc_banner:x64 arg2=@linux_proc_banner:x32 arg3=@linux_proc_banner:x16 arg4=@linux_proc_banner:x8" >> kprobe_events
+fi
+echo "p:testprobe_bf $FUNCTION_FORK arg1=@linux_proc_banner:b8@4/32" >> kprobe_events
+echo 1 > events/kprobes/enable
+(echo "forked")
+echo 0 > events/kprobes/enable
+grep "testprobe_[usx]:.* arg1=.* arg2=.* arg3=.* arg4=.*" trace
+grep "testprobe_bf:.* arg1=.*" trace
+
+: "Test get string symbol argument"
+echo "p:testprobe_str $FUNCTION_FORK arg1=@linux_proc_banner:string" > kprobe_events
+echo 1 > events/kprobes/enable
+(echo "forked")
+echo 0 > events/kprobes/enable
+RESULT=`grep "testprobe_str" trace | sed -e 's/.* arg1=\(.*\)/\1/'`
+
+RESULT=`echo $RESULT | sed -e 's/.* \((.*)\) \((.*)\) .*/\1 \2/'`
+ORIG=`cat /proc/version | sed -e 's/.* \((.*)\) \((.*)\) .*/\1 \2/'`
+test "$RESULT" = "$ORIG"
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
new file mode 100644
index 000000000..474ca1a9a
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_syntax.tc
@@ -0,0 +1,99 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event argument syntax
+# requires: kprobe_events "x8/16/32/64":README
+
+PROBEFUNC="vfs_read"
+GOODREG=
+BADREG=
+GOODSYM="_sdata"
+if ! grep -qw ${GOODSYM} /proc/kallsyms ; then
+ GOODSYM=$PROBEFUNC
+fi
+BADSYM="deaqswdefr"
+SYMADDR=0x`grep -w ${GOODSYM} /proc/kallsyms | cut -f 1 -d " "`
+GOODTYPE="x16"
+BADTYPE="y16"
+
+case `uname -m` in
+x86_64|i[3456]86)
+ GOODREG=%ax
+ BADREG=%ex
+;;
+aarch64)
+ GOODREG=%x0
+ BADREG=%ax
+;;
+arm*)
+ GOODREG=%r0
+ BADREG=%ax
+;;
+ppc*)
+ GOODREG=%r3
+ BADREG=%msr
+;;
+*)
+ echo "Please implement other architecture here"
+ exit_untested
+esac
+
+test_goodarg() # Good-args
+{
+ while [ "$1" ]; do
+ echo "p ${PROBEFUNC} $1" > kprobe_events
+ shift 1
+ done;
+}
+
+test_badarg() # Bad-args: each argument must be rejected by kprobe_events
+{
+ while [ "$1" ]; do
+ if echo "p ${PROBEFUNC} $1" > kprobe_events; then exit_fail; fi # a "!" prefix is ignored by "set -e"
+ shift 1
+ done;
+}
+
+echo > kprobe_events
+
+: "Register access"
+test_goodarg ${GOODREG}
+test_badarg ${BADREG}
+
+: "Symbol access"
+test_goodarg "@${GOODSYM}" "@${SYMADDR}" "@${GOODSYM}+10" "@${GOODSYM}-10"
+test_badarg "@" "@${BADSYM}" "@${GOODSYM}*10" "@${GOODSYM}/10" \
+ "@${GOODSYM}%10" "@${GOODSYM}&10" "@${GOODSYM}|10"
+
+: "Stack access"
+test_goodarg "\$stack" "\$stack0" "\$stack1"
+test_badarg "\$stackp" "\$stack0+10" "\$stack1-10"
+
+: "Retval access"
+echo "r ${PROBEFUNC} \$retval" > kprobe_events
+! echo "p ${PROBEFUNC} \$retval" > kprobe_events
+
+# $comm was introduced in 4.8, older kernels reject it.
+if grep -A1 "fetcharg:" README | grep -q '\$comm' ; then
+: "Comm access"
+test_goodarg "\$comm"
+fi
+
+: "Indirect memory access"
+test_goodarg "+0(${GOODREG})" "-0(${GOODREG})" "+10(\$stack)" \
+ "+0(\$stack1)" "+10(@${GOODSYM}-10)" "+0(+10(+20(\$stack)))"
+test_badarg "+(${GOODREG})" "(${GOODREG}+10)" "-(${GOODREG})" "(${GOODREG})" \
+ "+10(\$comm)" "+0(${GOODREG})+10"
+
+: "Name assignment"
+test_goodarg "varname=${GOODREG}"
+test_badarg "varname=varname2=${GOODREG}"
+
+: "Type syntax"
+test_goodarg "${GOODREG}:${GOODTYPE}"
+test_badarg "${GOODREG}::${GOODTYPE}" "${GOODREG}:${BADTYPE}" \
+ "${GOODTYPE}:${GOODREG}"
+
+: "Combination check"
+
+test_goodarg "\$comm:string" "+0(\$stack):string"
+test_badarg "\$comm:x64" "\$stack:string" "${GOODREG}:string"
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc
new file mode 100644
index 000000000..25b7708eb
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_type.tc
@@ -0,0 +1,47 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobes event arguments with types
+# requires: kprobe_events "x8/16/32/64":README
+
+gen_event() { # Bitsize
+ echo "p:testprobe $FUNCTION_FORK \$stack0:s$1 \$stack0:u$1 \$stack0:x$1 \$stack0:b4@4/$1"
+}
+
+check_types() { # s-type u-type x-type bf-type width
+ test $# -eq 5
+ CW=$5
+ CW=$((CW / 4))
+ X1=`printf "%x" $1 | tail -c ${CW}`
+ X2=`printf "%x" $2`
+ X3=`printf "%x" $3`
+ test $X1 = $X2
+ test $X2 = $X3
+ test 0x$X3 = $3
+
+ B4=`printf "%1x" $4`
+ B3=`printf "%03x" 0x$X3 | tail -c 2 | head -c 1`
+ test $B3 = $B4
+}
+
+for width in 64 32 16 8; do
+ : "Add new event with basic types"
+ gen_event $width > kprobe_events
+ grep testprobe kprobe_events
+ test -d events/kprobes/testprobe
+
+ : "Trace the event"
+ echo 1 > events/kprobes/testprobe/enable
+ ( echo "forked")
+ echo 0 > events/kprobes/testprobe/enable
+
+ : "Confirm the arguments is recorded in given types correctly"
+ ARGS=`grep "testprobe" trace | head -n 1 | sed -e 's/.* arg1=\(.*\) arg2=\(.*\) arg3=\(.*\) arg4=\(.*\)/\1 \2 \3 \4/'`
+ check_types $ARGS $width
+
+ : "Clear event for next loop"
+ echo "-:testprobe" >> kprobe_events
+ clear_trace
+
+done
+
+exit_pass
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc
new file mode 100644
index 000000000..d25d01a19
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_args_user.tc
@@ -0,0 +1,34 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event user-memory access
+# requires: kprobe_events '$arg<N>':README
+
+grep -A10 "fetcharg:" README | grep -q 'ustring' || exit_unsupported
+grep -A10 "fetcharg:" README | grep -q '\[u\]<offset>' || exit_unsupported
+
+:;: "user-memory access syntax and ustring working on user memory";:
+echo 'p:myevent do_sys_open path=+0($arg2):ustring path2=+u0($arg2):string' \
+ > kprobe_events
+echo 'p:myevent2 do_sys_openat2 path=+0($arg2):ustring path2=+u0($arg2):string' \
+ >> kprobe_events
+
+grep myevent kprobe_events | \
+ grep -q 'path=+0($arg2):ustring path2=+u0($arg2):string'
+echo 1 > events/kprobes/myevent/enable
+echo 1 > events/kprobes/myevent2/enable
+echo > /dev/null
+echo 0 > events/kprobes/myevent/enable
+echo 0 > events/kprobes/myevent2/enable
+
+grep myevent trace | grep -q 'path="/dev/null" path2="/dev/null"'
+
+:;: "user-memory access syntax and ustring not working with kernel memory";:
+echo 'p:myevent vfs_symlink path=+0($arg3):ustring path2=+u0($arg3):string' \
+ > kprobe_events
+echo 1 > events/kprobes/myevent/enable
+ln -s foo $TMPDIR/bar
+echo 0 > events/kprobes/myevent/enable
+
+grep myevent trace | grep -q 'path=(fault) path2=(fault)'
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc
new file mode 100644
index 000000000..1f6981ef7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_eventname.tc
@@ -0,0 +1,45 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event auto/manual naming
+# requires: kprobe_events
+
+:;: "Add an event on function without name" ;:
+
+FUNC=`grep " [tT] .*vfs_read$" /proc/kallsyms | tail -n 1 | cut -f 3 -d " "`
+[ "x" != "x$FUNC" ] || exit_unresolved
+echo "p $FUNC" > kprobe_events
+PROBE_NAME=`echo $FUNC | tr ".:" "_"`
+test -d events/kprobes/p_${PROBE_NAME}_0 || exit_fail
+
+:;: "Add an event on function with new name" ;:
+
+echo "p:event1 $FUNC" > kprobe_events
+test -d events/kprobes/event1 || exit_fail
+
+:;: "Add an event on function with new name and group" ;:
+
+echo "p:kprobes2/event2 $FUNC" > kprobe_events
+test -d events/kprobes2/event2 || exit_fail
+
+:;: "Add an event on dot function without name" ;:
+
+find_dot_func() { # print text symbol(s) whose name contains ".isra."
+ if [ ! -f available_filter_functions ]; then
+ grep -m 10 " [tT] .*\.isra\..*$" /proc/kallsyms | tail -n 1 | cut -f 3 -d " "
+ return;
+ fi
+
+ grep " [tT] .*\.isra\..*" /proc/kallsyms | cut -f 3 -d " " | while read f; do
+ if grep -s $f available_filter_functions; then
+ echo $f
+ break
+ fi
+ done
+}
+
+FUNC=`find_dot_func | tail -n 1`
+[ "x" != "x$FUNC" ] || exit_unresolved
+echo "p $FUNC" > kprobe_events
+EVENT=`grep $FUNC kprobe_events | cut -f 1 -d " " | cut -f 2 -d:`
+[ "x" != "x$EVENT" ] || exit_fail
+test -d events/$EVENT || exit_fail
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc
new file mode 100644
index 000000000..555629260
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_ftrace.tc
@@ -0,0 +1,45 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe dynamic event with function tracer
+# requires: kprobe_events stack_trace_filter function:tracer
+
+# prepare
+echo nop > current_tracer
+echo $FUNCTION_FORK > set_ftrace_filter
+echo "p:testprobe $FUNCTION_FORK" > kprobe_events
+
+# kprobe on / ftrace off
+echo 1 > events/kprobes/testprobe/enable
+echo > trace
+( echo "forked")
+grep testprobe trace
+if grep "$FUNCTION_FORK <-" trace; then exit_fail; fi # explicit: "set -e" ignores "! cmd"
+
+# kprobe on / ftrace on
+echo function > current_tracer
+echo > trace
+( echo "forked")
+grep testprobe trace
+grep "$FUNCTION_FORK <-" trace
+
+# kprobe off / ftrace on
+echo 0 > events/kprobes/testprobe/enable
+echo > trace
+( echo "forked")
+if grep testprobe trace; then exit_fail; fi # explicit: "set -e" ignores "! cmd"
+grep "$FUNCTION_FORK <-" trace
+
+# kprobe on / ftrace on
+echo 1 > events/kprobes/testprobe/enable
+echo function > current_tracer
+echo > trace
+( echo "forked")
+grep testprobe trace
+grep "$FUNCTION_FORK <-" trace
+
+# kprobe on / ftrace off
+echo nop > current_tracer
+echo > trace
+( echo "forked")
+grep testprobe trace
+if grep "$FUNCTION_FORK <-" trace; then exit_fail; fi
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_module.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_module.tc
new file mode 100644
index 000000000..7e74ee11e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_module.tc
@@ -0,0 +1,52 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe dynamic event - probing module
+# requires: kprobe_events
+
+rmmod trace-printk ||:
+if ! modprobe trace-printk ; then
+ echo "No trace-printk sample module - please make CONFIG_SAMPLE_TRACE_PRINTK=
+m"
+ exit_unresolved;
+fi
+
+MOD=trace_printk
+FUNC=trace_printk_irq_work
+
+:;: "Add an event on a module function without specifying event name" ;:
+
+echo "p $MOD:$FUNC" > kprobe_events
+PROBE_NAME=`echo $MOD:$FUNC | tr ".:" "_"`
+test -d events/kprobes/p_${PROBE_NAME}_0 || exit_fail
+
+:;: "Add an event on a module function with new event name" ;:
+
+echo "p:event1 $MOD:$FUNC" > kprobe_events
+test -d events/kprobes/event1 || exit_fail
+
+:;: "Add an event on a module function with new event and group name" ;:
+
+echo "p:kprobes1/event1 $MOD:$FUNC" > kprobe_events
+test -d events/kprobes1/event1 || exit_fail
+
+:;: "Remove target module, but event still be there" ;:
+if ! rmmod trace-printk ; then
+ echo "Failed to unload module - please enable CONFIG_MODULE_UNLOAD"
+ exit_unresolved;
+fi
+test -d events/kprobes1/event1
+
+:;: "Check posibility to defining events on unloaded module";:
+echo "p:event2 $MOD:$FUNC" >> kprobe_events
+
+:;: "Target is gone, but we can prepare for next time";:
+echo 1 > events/kprobes1/event1/enable
+
+:;: "Load module again, which means the event1 should be recorded";:
+modprobe trace-printk
+grep "event1:" trace
+
+:;: "Remove the module again and check the event is not locked"
+rmmod trace-printk
+echo 0 > events/kprobes1/event1/enable
+echo "-:kprobes1/event1" >> kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc
new file mode 100644
index 000000000..f0d5b7777
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_multiprobe.tc
@@ -0,0 +1,32 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Create/delete multiprobe on kprobe event
+# requires: kprobe_events "Create/append/":README
+
+# Choose 2 symbols for target
+SYM1=$FUNCTION_FORK
+SYM2=do_exit
+EVENT_NAME=kprobes/testevent
+
+DEF1="p:$EVENT_NAME $SYM1"
+DEF2="p:$EVENT_NAME $SYM2"
+
+:;: "Define an event which has 2 probes" ;:
+echo $DEF1 >> kprobe_events
+echo $DEF2 >> kprobe_events
+cat kprobe_events | grep "$DEF1"
+cat kprobe_events | grep "$DEF2"
+
+:;: "Remove the event by name (should remove both)" ;:
+echo "-:$EVENT_NAME" >> kprobe_events
+test `cat kprobe_events | wc -l` -eq 0
+
+:;: "Remove just 1 event" ;:
+echo $DEF1 >> kprobe_events
+echo $DEF2 >> kprobe_events
+echo "-:$EVENT_NAME $SYM1" >> kprobe_events
+if cat kprobe_events | grep "$DEF1"; then exit_fail; fi # explicit: "set -e" ignores "! cmd"
+cat kprobe_events | grep "$DEF2"
+
+:;: "Appending different type must fail" ;:
+if echo "$DEF1 \$stack" >> kprobe_events; then exit_fail; fi
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_non_uniq_symbol.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_non_uniq_symbol.tc
new file mode 100644
index 000000000..bc9514428
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_non_uniq_symbol.tc
@@ -0,0 +1,13 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test failure of registering kprobe on non unique symbol
+# requires: kprobe_events
+
+SYMBOL='name_show'
+
+# We skip this test on kernel where SYMBOL is unique or does not exist.
+if [ "$(grep -c -E "[[:alnum:]]+ t ${SYMBOL}" /proc/kallsyms)" -le '1' ]; then
+ exit_unsupported
+fi
+
+! echo "p:test_non_unique ${SYMBOL}" > kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
new file mode 100644
index 000000000..7c02509c7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kprobe_syntax_errors.tc
@@ -0,0 +1,105 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe event parser error log check
+# requires: kprobe_events error_log
+
+check_error() { # command-with-error-pos-by-^
+ ftrace_errlog_check 'trace_kprobe' "$1" 'kprobe_events'
+}
+
+if grep -q 'r\[maxactive\]' README; then
+check_error 'p^100 vfs_read' # MAXACT_NO_KPROBE
+check_error 'r^1a111 vfs_read' # BAD_MAXACT
+check_error 'r^100000 vfs_read' # MAXACT_TOO_BIG
+fi
+
+check_error 'p ^non_exist_func' # BAD_PROBE_ADDR (enoent)
+check_error 'p ^hoge-fuga' # BAD_PROBE_ADDR (bad syntax)
+check_error 'p ^hoge+1000-1000' # BAD_PROBE_ADDR (bad syntax)
+check_error 'r ^vfs_read+10' # BAD_RETPROBE
+check_error 'p:^/bar vfs_read' # NO_GROUP_NAME
+check_error 'p:^12345678901234567890123456789012345678901234567890123456789012345/bar vfs_read' # GROUP_TOO_LONG
+
+check_error 'p:^foo.1/bar vfs_read' # BAD_GROUP_NAME
+check_error 'p:foo/^12345678901234567890123456789012345678901234567890123456789012345 vfs_read' # EVENT_TOO_LONG
+check_error 'p:foo/^bar.1 vfs_read' # BAD_EVENT_NAME
+
+check_error 'p vfs_read ^$retval' # RETVAL_ON_PROBE
+check_error 'p vfs_read ^$stack10000' # BAD_STACK_NUM
+
+if grep -q '$arg<N>' README; then
+check_error 'p vfs_read ^$arg10000' # BAD_ARG_NUM
+fi
+
+check_error 'p vfs_read ^$none_var' # BAD_VAR
+
+check_error 'p vfs_read ^%none_reg' # BAD_REG_NAME
+check_error 'p vfs_read ^@12345678abcde' # BAD_MEM_ADDR
+check_error 'p vfs_read ^@+10' # FILE_ON_KPROBE
+
+grep -q "imm-value" README && \
+check_error 'p vfs_read arg1=\^x' # BAD_IMM
+grep -q "imm-string" README && \
+check_error 'p vfs_read arg1=\"abcd^' # IMMSTR_NO_CLOSE
+
+check_error 'p vfs_read ^+0@0)' # DEREF_NEED_BRACE
+check_error 'p vfs_read ^+0ab1(@0)' # BAD_DEREF_OFFS
+check_error 'p vfs_read +0(+0(@0^)' # DEREF_OPEN_BRACE
+
+if grep -A1 "fetcharg:" README | grep -q '\$comm' ; then
+check_error 'p vfs_read +0(^$comm)' # COMM_CANT_DEREF
+fi
+
+check_error 'p vfs_read ^&1' # BAD_FETCH_ARG
+
+
+# We've introduced this limitation with array support
+if grep -q ' <type>\\\[<array-size>\\\]' README; then
+check_error 'p vfs_read +0(^+0(+0(+0(+0(+0(+0(+0(+0(+0(+0(+0(+0(+0(@0))))))))))))))' # TOO_MANY_OPS?
+check_error 'p vfs_read +0(@11):u8[10^' # ARRAY_NO_CLOSE
+check_error 'p vfs_read +0(@11):u8[10]^a' # BAD_ARRAY_SUFFIX
+check_error 'p vfs_read +0(@11):u8[^10a]' # BAD_ARRAY_NUM
+check_error 'p vfs_read +0(@11):u8[^256]' # ARRAY_TOO_BIG
+fi
+
+check_error 'p vfs_read @11:^unknown_type' # BAD_TYPE
+check_error 'p vfs_read $stack0:^string' # BAD_STRING
+check_error 'p vfs_read @11:^b10@a/16' # BAD_BITFIELD
+
+check_error 'p vfs_read ^arg123456789012345678901234567890=@11' # ARG_NAME_TOO_LOG
+check_error 'p vfs_read ^=@11' # NO_ARG_NAME
+check_error 'p vfs_read ^var.1=@11' # BAD_ARG_NAME
+check_error 'p vfs_read var1=@11 ^var1=@12' # USED_ARG_NAME
+check_error 'p vfs_read ^+1234567(+1234567(+1234567(+1234567(+1234567(+1234567(@1234))))))' # ARG_TOO_LONG
+check_error 'p vfs_read arg1=^' # NO_ARG_BODY
+
+# instruction boundary check is valid on x86 (at this moment)
+case $(uname -m) in
+ x86_64|i[3456]86)
+ echo 'p vfs_read' > kprobe_events
+ if grep -q FTRACE ../kprobes/list ; then
+ check_error 'p ^vfs_read+3' # BAD_INSN_BNDRY (only if function-tracer is enabled)
+ fi
+ ;;
+esac
+
+# multiprobe errors
+if grep -q "Create/append/" README && grep -q "imm-value" README; then
+echo "p:kprobes/testevent $FUNCTION_FORK" > kprobe_events
+check_error '^r:kprobes/testevent do_exit' # DIFF_PROBE_TYPE
+
+# Explicitly use printf "%s" to not interpret \1
+printf "%s" "p:kprobes/testevent $FUNCTION_FORK abcd=\\1" > kprobe_events
+check_error "p:kprobes/testevent $FUNCTION_FORK ^bcd=\\1" # DIFF_ARG_TYPE
+check_error "p:kprobes/testevent $FUNCTION_FORK ^abcd=\\1:u8" # DIFF_ARG_TYPE
+check_error "p:kprobes/testevent $FUNCTION_FORK ^abcd=\\\"foo\"" # DIFF_ARG_TYPE
+check_error "^p:kprobes/testevent $FUNCTION_FORK abcd=\\1" # SAME_PROBE
+fi
+
+# %return suffix errors
+if grep -q "place (kretprobe): .*%return.*" README; then
+check_error 'p vfs_read^%hoge' # BAD_ADDR_SUFFIX
+check_error 'p ^vfs_read+10%return' # BAD_RETPROBE
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc
new file mode 100644
index 000000000..197cc2afd
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_args.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kretprobe dynamic event with arguments
+# requires: kprobe_events
+
+# Add new kretprobe event
+echo "r:testprobe2 $FUNCTION_FORK \$retval" > kprobe_events
+grep testprobe2 kprobe_events | grep -q 'arg1=\$retval'
+test -d events/kprobes/testprobe2
+
+echo 1 > events/kprobes/testprobe2/enable
+( echo "forked")
+
+cat trace | grep testprobe2 | grep -q "<- $FUNCTION_FORK"
+
+echo 0 > events/kprobes/testprobe2/enable
+echo '-:testprobe2' >> kprobe_events
+clear_trace
+test -d events/kprobes/testprobe2 && exit_fail || exit_pass
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
new file mode 100644
index 000000000..4f0b268c1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_maxactive.tc
@@ -0,0 +1,35 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kretprobe dynamic event with maxactive
+# requires: kprobe_events 'r[maxactive]':README
+
+# Test if we successfully reject unknown messages
+if echo 'a:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test if we successfully reject too big maxactive
+if echo 'r1000000:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test if we successfully reject unparsable numbers for maxactive
+if echo 'r10fuzz:myprobeaccept inet_csk_accept' > kprobe_events; then false; else true; fi
+
+# Test for kretprobe with event name without maxactive
+echo 'r:myprobeaccept inet_csk_accept' > kprobe_events
+grep myprobeaccept kprobe_events
+test -d events/kprobes/myprobeaccept
+echo '-:myprobeaccept' >> kprobe_events
+
+# Test for kretprobe with event name with a small maxactive
+echo 'r10:myprobeaccept inet_csk_accept' > kprobe_events
+grep myprobeaccept kprobe_events
+test -d events/kprobes/myprobeaccept
+echo '-:myprobeaccept' >> kprobe_events
+
+# Test for kretprobe without event name without maxactive
+echo 'r inet_csk_accept' > kprobe_events
+grep inet_csk_accept kprobe_events
+echo > kprobe_events
+
+# Test for kretprobe without event name with a small maxactive
+echo 'r10 inet_csk_accept' > kprobe_events
+grep inet_csk_accept kprobe_events
+echo > kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_return_suffix.tc b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_return_suffix.tc
new file mode 100644
index 000000000..f07bd15cc
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/kretprobe_return_suffix.tc
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kretprobe %%return suffix test
+# requires: kprobe_events '<symbol>[+<offset>]%return':README
+
+# Test for kretprobe by "r"
+echo 'r:myprobeaccept vfs_read' > kprobe_events
+RESULT1=`cat kprobe_events`
+
+# Test for kretprobe by "%return"
+echo 'p:myprobeaccept vfs_read%return' > kprobe_events
+RESULT2=`cat kprobe_events`
+
+if [ "$RESULT1" != "$RESULT2" ]; then
+ echo "Error: %return suffix didn't make a return probe."
+ echo "r-command: $RESULT1"
+ echo "%return: $RESULT2"
+ exit_fail
+fi
+
+echo > kprobe_events
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc
new file mode 100644
index 000000000..312d23780
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/multiple_kprobes.tc
@@ -0,0 +1,33 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Register/unregister many kprobe events
+# requires: kprobe_events
+
+# ftrace fentry skip size depends on the machine architecture.
+# Currently HAVE_KPROBES_ON_FTRACE is defined on x86 and powerpc64le
+case `uname -m` in
+ x86_64|i[3456]86) OFFS=5;;
+ ppc64le) OFFS=8;;
+ *) OFFS=0;;
+esac
+
+N=0
+echo "Setup up kprobes on first available 256 text symbols"
+grep -i " t " /proc/kallsyms | cut -f3 -d" " | grep -v .*\\..* | \
+while read i; do
+ echo p ${i}+${OFFS} >> kprobe_events && N=$((N+1)) ||:
+ test $N -eq 256 && break
+done
+
+L=`cat kprobe_events | wc -l`
+if [ $L -ne 256 ]; then
+ echo "The number of kprobes events ($L) is not 256"
+ exit_fail
+fi
+
+echo 1 > events/kprobes/enable
+echo 0 > events/kprobes/enable
+echo > kprobe_events
+echo "Waiting for unoptimizing & freeing"
+sleep 5
+echo "Done"
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc b/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc
new file mode 100644
index 000000000..624269c8d
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/probepoint.tc
@@ -0,0 +1,38 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe events - probe points
+# requires: kprobe_events
+
+TARGET_FUNC=tracefs_create_dir
+
+dec_addr() { # hexaddr
+ printf "%d" "0x"`echo $1 | tail -c 8`
+}
+
+set_offs() { # prev target next
+ A1=`dec_addr $1`
+ A2=`dec_addr $2`
+ A3=`dec_addr $3`
+ TARGET="0x$2" # an address
+ PREV=`expr $A1 - $A2` # offset to previous symbol
+ NEXT=+`expr $A3 - $A2` # offset to next symbol
+ OVERFLOW=+`printf "0x%x" ${PREV}` # overflow offset to previous symbol
+}
+
+# We have to decode symbol addresses to get correct offsets.
+# If the offset is not on an instruction boundary, it causes -EILSEQ.
+set_offs `grep -A1 -B1 ${TARGET_FUNC} /proc/kallsyms | cut -f 1 -d " " | xargs`
+
+UINT_TEST=no
+# printf "%x" -1 returns (unsigned long)-1.
+if [ `printf "%x" -1 | wc -c` != 9 ]; then
+ UINT_TEST=yes
+fi
+
+echo "p:testprobe ${TARGET_FUNC}" > kprobe_events
+echo "p:testprobe ${TARGET}" > kprobe_events
+echo "p:testprobe ${TARGET_FUNC}${NEXT}" > kprobe_events
+! echo "p:testprobe ${TARGET_FUNC}${PREV}" > kprobe_events
+if [ "${UINT_TEST}" = yes ]; then
+! echo "p:testprobe ${TARGET_FUNC}${OVERFLOW}" > kprobe_events
+fi
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc
new file mode 100644
index 000000000..34fb89b0c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/profile.tc
@@ -0,0 +1,14 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Kprobe profile
+# requires: kprobe_events
+
+! grep -q 'myevent' kprobe_profile
+echo "p:myevent $FUNCTION_FORK" > kprobe_events
+grep -q 'myevent[[:space:]]*0[[:space:]]*0$' kprobe_profile
+echo 1 > events/kprobes/myevent/enable
+( echo "forked" )
+grep -q 'myevent[[:space:]]*[[:digit:]]*[[:space:]]*0$' kprobe_profile
+echo 0 > events/kprobes/myevent/enable
+echo > kprobe_events
+! grep -q 'myevent' kprobe_profile
diff --git a/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
new file mode 100644
index 000000000..f5e3f9e4a
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/kprobe/uprobe_syntax_errors.tc
@@ -0,0 +1,26 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Uprobe event parser error log check
+# requires: uprobe_events error_log
+
+check_error() { # command-with-error-pos-by-^
+ ftrace_errlog_check 'trace_uprobe' "$1" 'uprobe_events'
+}
+
+check_error 'p ^/non_exist_file:100' # FILE_NOT_FOUND
+check_error 'p ^/sys:100' # NO_REGULAR_FILE
+check_error 'p /bin/sh:^10a' # BAD_UPROBE_OFFS
+check_error 'p /bin/sh:10(^1a)' # BAD_REFCNT
+check_error 'p /bin/sh:10(10^' # REFCNT_OPEN_BRACE
+check_error 'p /bin/sh:10(10)^a' # BAD_REFCNT_SUFFIX
+
+check_error 'p /bin/sh:10 ^@+ab' # BAD_FILE_OFFS
+check_error 'p /bin/sh:10 ^@symbol' # SYM_ON_UPROBE
+
+# %return suffix error
+if grep -q "place (uprobe): .*%return.*" README; then
+check_error 'p /bin/sh:10^%hoge' # BAD_ADDR_SUFFIX
+check_error 'p /bin/sh:10(10)^%return' # BAD_REFCNT_SUFFIX
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc b/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc
new file mode 100644
index 000000000..22bff122b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/preemptirq/irqsoff_tracer.tc
@@ -0,0 +1,78 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: test for the preemptirqsoff tracer
+# requires: preemptoff:tracer irqsoff:tracer
+
+MOD=preemptirq_delay_test
+
+fail() {
+ reset_tracer
+ rmmod $MOD || true
+ exit_fail
+}
+
+unsup() { #msg
+ reset_tracer
+ rmmod $MOD || true
+ echo $1
+ exit_unsupported
+}
+
+unres() { #msg
+ reset_tracer
+ rmmod $MOD || true
+ echo $1
+ exit_unresolved
+}
+
+modprobe $MOD || unres "$MOD module not available"
+rmmod $MOD
+
+reset_tracer
+
+# Simulate preemptoff section for half a second a couple of times
+echo preemptoff > current_tracer
+sleep 1
+modprobe $MOD test_mode=preempt delay=500000 || fail
+rmmod $MOD || fail
+modprobe $MOD test_mode=preempt delay=500000 || fail
+rmmod $MOD || fail
+modprobe $MOD test_mode=preempt delay=500000 || fail
+rmmod $MOD || fail
+
+cat trace
+
+# Confirm which tracer
+grep -q "tracer: preemptoff" trace || fail
+
+# Check the end of the section
+grep -E -q "5.....us : <stack trace>" trace || fail # grep -E: egrep is obsolescent
+
+# Check for 500ms of latency
+grep -E -q "latency: 5..... us" trace || fail
+
+reset_tracer
+
+# Simulate irqsoff section for half a second a couple of times
+echo irqsoff > current_tracer
+sleep 1
+modprobe $MOD test_mode=irq delay=500000 || fail
+rmmod $MOD || fail
+modprobe $MOD test_mode=irq delay=500000 || fail
+rmmod $MOD || fail
+modprobe $MOD test_mode=irq delay=500000 || fail
+rmmod $MOD || fail
+
+cat trace
+
+# Confirm which tracer
+grep -q "tracer: irqsoff" trace || fail
+
+# Check the end of the section
+grep -E -q "5.....us : <stack trace>" trace || fail # grep -E: egrep is obsolescent
+
+# Check for 500ms of latency
+grep -E -q "latency: 5..... us" trace || fail
+
+reset_tracer
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/selftest/bashisms.tc b/tools/testing/selftests/ftrace/test.d/selftest/bashisms.tc
new file mode 100644
index 000000000..1b081e910
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/selftest/bashisms.tc
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Meta-selftest: Checkbashisms
+
+if [ ! -f $FTRACETEST_ROOT/ftracetest ]; then
+ echo "Hmm, we can not find ftracetest"
+ exit_unresolved
+fi
+
+if ! command -v checkbashisms > /dev/null 2>&1 ; then # POSIX lookup; 'which' may not be installed
+ echo "No checkbashisms found. skipped."
+ exit_unresolved
+fi
+
+checkbashisms $FTRACETEST_ROOT/ftracetest
+checkbashisms $FTRACETEST_ROOT/test.d/functions
+for t in $(find $FTRACETEST_ROOT/test.d -name \*.tc); do
+ checkbashisms $t
+done
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/template b/tools/testing/selftests/ftrace/test.d/template
new file mode 100644
index 000000000..2cd8947ed
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/template
@@ -0,0 +1,15 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: %HERE DESCRIBE WHAT THIS DOES%
+# requires: %HERE LIST THE REQUIRED FILES, TRACERS OR README-STRINGS%
+# The required tracer needs :tracer suffix, e.g. function:tracer
+# The required README string needs :README suffix, e.g. "x8/16/32/64":README
+# and the README string is treated as a fixed-string instead of regexp pattern.
+# you have to add the ".tc" extension to your testcase file
+# Note that all tests are run with "errexit" option.
+
+exit 0 # Return 0 if the test is passed, otherwise return !0
+# Or you can call exit_pass for passed test, and exit_fail for failed test.
+# If the test could not run because of lack of feature, call exit_unsupported
+# If the test returned unclear results, call exit_unresolved
+# If the test is a dummy, or a placeholder, call exit_untested
diff --git a/tools/testing/selftests/ftrace/test.d/tracer/wakeup.tc b/tools/testing/selftests/ftrace/test.d/tracer/wakeup.tc
new file mode 100644
index 000000000..11be10e1b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/tracer/wakeup.tc
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test wakeup tracer
+# requires: wakeup:tracer
+
+if ! command -v chrt ; then # chrt is used below to run the higher-priority task
+ echo "chrt is not found. This test requires chrt command."
+ exit_unresolved
+fi
+
+echo wakeup > current_tracer
+echo 1 > tracing_on
+echo 0 > tracing_max_latency
+
+: "Wakeup higher priority task"
+chrt -f 5 sleep 1
+
+echo 0 > tracing_on
+grep '+ \[[[:digit:]]*\]' trace
+grep '==> \[[[:digit:]]*\]' trace
+
diff --git a/tools/testing/selftests/ftrace/test.d/tracer/wakeup_rt.tc b/tools/testing/selftests/ftrace/test.d/tracer/wakeup_rt.tc
new file mode 100644
index 000000000..3a77198b3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/tracer/wakeup_rt.tc
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Test wakeup RT tracer
+# requires: wakeup_rt:tracer
+
+if ! command -v chrt ; then # chrt is used below to run the realtime task
+ echo "chrt is not found. This test requires chrt command."
+ exit_unresolved
+fi
+
+echo wakeup_rt > current_tracer
+echo 1 > tracing_on
+echo 0 > tracing_max_latency
+
+: "Wakeup a realtime task"
+chrt -f 5 sleep 1
+
+echo 0 > tracing_on
+grep "+ \[[[:digit:]]*\]" trace
+grep "==> \[[[:digit:]]*\]" trace
+
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-action-hist-xfail.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-action-hist-xfail.tc
new file mode 100644
index 000000000..1590d6bfb
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-action-hist-xfail.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger expected fail actions
+# requires: set_event snapshot "snapshot()":README
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test expected snapshot action failure"
+
+echo 'hist:keys=comm:onmatch(sched.sched_wakeup).snapshot()' >> events/sched/sched_waking/trigger && exit_fail
+
+echo "Test expected save action failure"
+
+echo 'hist:keys=comm:onmatch(sched.sched_wakeup).save(comm,prio)' >> events/sched/sched_waking/trigger && exit_fail
+
+exit_xfail
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
new file mode 100644
index 000000000..41119e044
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-field-variable-support.tc
@@ -0,0 +1,33 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test field variable support
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test field variable support"
+
+echo 'wakeup_latency u64 lat; pid_t pid; int prio; char comm[16]' > synthetic_events
+echo 'hist:keys=comm:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
+echo 'hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
+echo 'hist:keys=pid,prio,comm:vals=lat:sort=pid,prio' > events/synthetic/wakeup_latency/trigger
+
+ping $LOCALHOST -c 3
+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
+ fail "Failed to create inter-event histogram"
+fi
+
+if ! grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
+ fail "Failed to create histogram with field variable"
+fi
+
+echo '!hist:keys=next_comm:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).wakeup_latency($wakeup_lat,next_pid,sched.sched_waking.prio,next_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
+
+if grep -q "synthetic_prio=prio" events/sched/sched_waking/hist; then
+ fail "Failed to remove histogram with field variable"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
new file mode 100644
index 000000000..9098f1e74
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-inter-event-combined-hist.tc
@@ -0,0 +1,37 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event combined histogram trigger
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'waking_latency u64 lat pid_t pid' > synthetic_events
+if [ ! -d events/synthetic/waking_latency ]; then
+ fail "Failed to create waking_latency synthetic event"
+fi
+
+echo "Test combined histogram"
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_waking/trigger
+echo 'hist:keys=pid:waking_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).waking_latency($waking_lat,pid) if comm=="ping"' > events/sched/sched_wakeup/trigger
+echo 'hist:keys=pid,lat:sort=pid,lat' > events/synthetic/waking_latency/trigger
+
+echo 'wakeup_latency u64 lat pid_t pid' >> synthetic_events
+echo 'hist:keys=pid:ts1=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_wakeup/trigger
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts1:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid) if next_comm=="ping"' > events/sched/sched_switch/trigger
+
+echo 'waking_plus_wakeup_latency u64 lat; pid_t pid' >> synthetic_events
+echo 'hist:keys=pid,lat:sort=pid,lat:ww_lat=$waking_lat+$wakeup_lat:onmatch(synthetic.wakeup_latency).waking_plus_wakeup_latency($ww_lat,pid)' >> events/synthetic/wakeup_latency/trigger
+echo 'hist:keys=pid,lat:sort=pid,lat' >> events/synthetic/waking_plus_wakeup_latency/trigger
+
+ping $LOCALHOST -c 3
+if ! grep -q "pid:" events/synthetic/waking_plus_wakeup_latency/hist; then
+ fail "Failed to create combined histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc
new file mode 100644
index 000000000..3ad6e3fd8
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-multi-actions-accept.tc
@@ -0,0 +1,22 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test multiple actions on hist trigger
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test multiple actions on hist trigger"
+echo 'wakeup_latency u64 lat; pid_t pid' >> synthetic_events
+TRIGGER1=events/sched/sched_wakeup/trigger
+TRIGGER2=events/sched/sched_switch/trigger
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="cyclictest"' > $TRIGGER1
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0 if next_comm=="cyclictest"' >> $TRIGGER2
+echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,next_pid) if next_comm=="cyclictest"' >> $TRIGGER2
+echo 'hist:keys=next_pid:onmatch(sched.sched_wakeup).wakeup_latency(sched.sched_switch.$wakeup_lat,prev_pid) if next_comm=="cyclictest"' >> $TRIGGER2
+echo 'hist:keys=next_pid if next_comm=="cyclictest"' >> $TRIGGER2
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onchange-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onchange-action-hist.tc
new file mode 100644
index 000000000..adaabb873
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onchange-action-hist.tc
@@ -0,0 +1,22 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger onchange action
+# requires: set_event "onchange(var)":README
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test onchange action"
+
+echo 'hist:keys=comm:newprio=prio:onchange($newprio).save(comm,prio) if comm=="ping"' >> events/sched/sched_waking/trigger
+
+ping $LOCALHOST -c 3
+nice -n 1 ping $LOCALHOST -c 3
+
+if ! grep -q "changed:" events/sched/sched_waking/hist; then
+ fail "Failed to create onchange action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
new file mode 100644
index 000000000..20e394710
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-action-hist.tc
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger onmatch action
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
+if [ ! -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to create wakeup_latency synthetic event"
+fi
+
+echo "Test create histogram for synthetic event"
+echo "Test histogram variables,simple expression support and onmatch action"
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
+
+ping $LOCALHOST -c 5
+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
+ fail "Failed to create onmatch action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
new file mode 100644
index 000000000..f4b03ab7c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmatch-onmax-action-hist.tc
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger onmatch-onmax action
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
+if [ ! -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to create wakeup_latency synthetic event"
+fi
+
+echo "Test create histogram for synthetic event"
+echo "Test histogram variables,simple expression support and onmatch-onmax action"
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).wakeup_latency($wakeup_lat,next_pid,next_comm):onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
+
+ping $LOCALHOST -c 5
+if ! grep -q "ping" events/synthetic/wakeup_latency/hist || ! grep -q "max:" events/sched/sched_switch/hist; then
+ fail "Failed to create onmatch-onmax action inter-event histogram"
+fi # both the synthetic-event hist entry and the onmax "max:" entry must be present
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
new file mode 100644
index 000000000..71c9b5911
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-onmax-action-hist.tc
@@ -0,0 +1,28 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger onmax action
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
+if [ ! -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to create wakeup_latency synthetic event"
+fi
+
+echo "Test onmax action"
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' >> events/sched/sched_waking/trigger
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmax($wakeup_lat).save(next_comm,prev_pid,prev_prio,prev_comm) if next_comm=="ping"' >> events/sched/sched_switch/trigger
+
+ping $LOCALHOST -c 3
+if ! grep -q "max:" events/sched/sched_switch/hist; then
+ fail "Failed to create onmax action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc
new file mode 100644
index 000000000..67fa328b8
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-snapshot-action-hist.tc
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger snapshot action
+# requires: set_event snapshot events/sched/sched_process_fork/hist "onchange(var)":README "snapshot()":README
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test snapshot action"
+
+echo 1 > events/sched/enable
+
+echo 'hist:keys=comm:newprio=prio:onchange($newprio).save(comm,prio):onchange($newprio).snapshot() if comm=="ping"' >> events/sched/sched_waking/trigger
+
+ping $LOCALHOST -c 3
+nice -n 1 ping $LOCALHOST -c 3
+
+echo 0 > tracing_on
+
+if ! grep -q "changed:" events/sched/sched_waking/hist; then
+ fail "Failed to create onchange action inter-event histogram"
+fi
+
+if ! grep -q "comm=ping" snapshot; then
+ fail "Failed to create snapshot action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
new file mode 100644
index 000000000..a152b558b
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-createremove.tc
@@ -0,0 +1,34 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test synthetic event create remove
+# requires: set_event synthetic_events
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
+if [ ! -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to create wakeup_latency synthetic event"
+fi
+
+reset_trigger
+
+echo "Test remove synthetic event"
+echo '!wakeup_latency u64 lat pid_t pid char comm[16]' >> synthetic_events
+if [ -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to delete wakeup_latency synthetic event"
+fi
+
+reset_trigger
+
+echo "Test create synthetic event with an error"
+! echo 'wakeup_latency u64 lat pid_t pid char' > synthetic_events 2> /dev/null # write may be rejected; '!' keeps errexit from aborting
+if [ -d events/synthetic/wakeup_latency ]; then
+ fail "Created wakeup_latency synthetic event with an invalid format"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc
new file mode 100644
index 000000000..3d65c856e
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-dynstring.tc
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger trace action with dynamic string param
+# requires: set_event synthetic_events events/sched/sched_process_exec/hist "char name[]' >> synthetic_events":README
+
+fail() { #msg
+ echo $1
+ exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'ping_test_latency u64 lat; char filename[]' > synthetic_events
+if [ ! -d events/synthetic/ping_test_latency ]; then
+ fail "Failed to create ping_test_latency synthetic event"
+fi
+
+echo "Test create histogram for synthetic event using trace action and dynamic strings"
+echo "Test histogram dynamic string variables,simple expression support and trace action"
+
+echo 'hist:key=pid:filenamevar=filename:ts0=common_timestamp.usecs' > events/sched/sched_process_exec/trigger
+echo 'hist:key=pid:lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_process_exec).ping_test_latency($lat,$filenamevar) if comm == "ping"' > events/sched/sched_process_exit/trigger
+echo 'hist:keys=filename,lat:sort=filename,lat' > events/synthetic/ping_test_latency/trigger
+
+ping $LOCALHOST -c 5
+
+if ! grep -q "ping" events/synthetic/ping_test_latency/hist; then
+ fail "Failed to create dynamic string trace action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
new file mode 100644
index 000000000..59216f3cf
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic-event-syntax.tc
@@ -0,0 +1,71 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test synthetic_events syntax parser
+# requires: set_event synthetic_events
+
+do_reset() {
+ reset_trigger
+ echo > set_event
+ clear_trace
+}
+
+fail() { #msg
+ do_reset
+ echo $1
+ exit_fail
+}
+
+reset_tracer
+do_reset
+
+echo "Test synthetic_events syntax parser"
+
+echo > synthetic_events
+
+# synthetic event must have a field
+! echo "myevent" >> synthetic_events
+echo "myevent u64 var1" >> synthetic_events
+
+# synthetic event must be found in synthetic_events
+grep "myevent[[:space:]]u64 var1" synthetic_events
+
+# it is not possible to add same name event
+! echo "myevent u64 var2" >> synthetic_events
+
+# Non-append open will cleanup all events and add new one
+echo "myevent u64 var2" > synthetic_events
+
+# multiple fields with different spaces
+echo "myevent u64 var1; u64 var2;" > synthetic_events
+grep "myevent[[:space:]]u64 var1; u64 var2" synthetic_events
+echo "myevent u64 var1 ; u64 var2 ;" > synthetic_events
+grep "myevent[[:space:]]u64 var1; u64 var2" synthetic_events
+echo "myevent u64 var1 ;u64 var2" > synthetic_events
+grep "myevent[[:space:]]u64 var1; u64 var2" synthetic_events
+
+# test field types
+echo "myevent u32 var" > synthetic_events
+echo "myevent u16 var" > synthetic_events
+echo "myevent u8 var" > synthetic_events
+echo "myevent s64 var" > synthetic_events
+echo "myevent s32 var" > synthetic_events
+echo "myevent s16 var" > synthetic_events
+echo "myevent s8 var" > synthetic_events
+
+echo "myevent char var" > synthetic_events
+echo "myevent int var" > synthetic_events
+echo "myevent long var" > synthetic_events
+echo "myevent pid_t var" > synthetic_events
+
+echo "myevent unsigned char var" > synthetic_events
+echo "myevent unsigned int var" > synthetic_events
+echo "myevent unsigned long var" > synthetic_events
+grep "myevent[[:space:]]unsigned long var" synthetic_events
+
+# test string type
+echo "myevent char var[10]" > synthetic_events
+grep "myevent[[:space:]]char\[10\] var" synthetic_events
+
+do_reset
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc
new file mode 100644
index 000000000..ada594fe1
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-synthetic_event_syntax_errors.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test synthetic_events syntax parser errors
+# requires: synthetic_events error_log
+
+check_error() { # command-with-error-pos-by-^
+ ftrace_errlog_check 'synthetic_events' "$1" 'synthetic_events'
+}
+
+check_error 'myevent ^chr arg' # INVALID_TYPE
+check_error 'myevent ^char str[];; int v' # INVALID_TYPE
+check_error 'myevent char ^str]; int v' # INVALID_NAME
+check_error 'myevent char ^str;[]' # INVALID_NAME
+check_error 'myevent ^char str[; int v' # INVALID_TYPE
+check_error '^mye;vent char str[]' # BAD_NAME
+check_error 'myevent char str[]; ^int' # INVALID_FIELD
+check_error '^myevent' # INCOMPLETE_CMD
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc
new file mode 100644
index 000000000..c126d2350
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/inter-event/trigger-trace-action-hist.tc
@@ -0,0 +1,31 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test inter-event histogram trigger trace action
+# requires: set_event synthetic_events events/sched/sched_process_fork/hist "trace(<synthetic_event>":README
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test create synthetic event"
+
+echo 'wakeup_latency u64 lat pid_t pid char comm[16]' > synthetic_events
+if [ ! -d events/synthetic/wakeup_latency ]; then
+ fail "Failed to create wakeup_latency synthetic event"
+fi
+
+echo "Test create histogram for synthetic event using trace action"
+echo "Test histogram variables,simple expression support and trace action"
+
+echo 'hist:keys=pid:ts0=common_timestamp.usecs if comm=="ping"' > events/sched/sched_wakeup/trigger
+echo 'hist:keys=next_pid:wakeup_lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_wakeup).trace(wakeup_latency,$wakeup_lat,next_pid,next_comm) if next_comm=="ping"' > events/sched/sched_switch/trigger
+echo 'hist:keys=comm,pid,lat:wakeup_lat=lat:sort=lat' > events/synthetic/wakeup_latency/trigger
+
+ping $LOCALHOST -c 5
+
+if ! grep -q "ping" events/synthetic/wakeup_latency/hist; then
+ fail "Failed to create trace action inter-event histogram"
+fi
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc
new file mode 100644
index 000000000..c226acee7
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-eventonoff.tc
@@ -0,0 +1,45 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test event enable/disable trigger
+# requires: set_event events/sched/sched_process_fork/trigger
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+FEATURE=`grep enable_event events/sched/sched_process_fork/trigger`
+if [ -z "$FEATURE" ]; then
+ echo "event enable/disable trigger is not supported"
+ exit_unsupported
+fi
+
+echo "Test enable_event trigger"
+echo 0 > events/sched/sched_switch/enable # start with sched_switch disabled
+echo 'enable_event:sched:sched_switch' > events/sched/sched_process_fork/trigger
+( echo "forked") # subshell: forks, which should fire the sched_process_fork trigger
+if [ "$(cat events/sched/sched_switch/enable)" != '1*' ]; then # quoted: a bare 1* would be glob-expanded in the cwd
+    fail "enable_event trigger on sched_process_fork did not work"
+fi
+
+reset_trigger
+
+echo "Test disable_event trigger"
+echo 1 > events/sched/sched_switch/enable # start with sched_switch enabled
+echo 'disable_event:sched:sched_switch' > events/sched/sched_process_fork/trigger
+( echo "forked") # subshell: forks, which should fire the sched_process_fork trigger
+if [ "$(cat events/sched/sched_switch/enable)" != '0*' ]; then # quoted: a bare 0* would be glob-expanded in the cwd
+    fail "disable_event trigger on sched_process_fork did not work"
+fi
+
+reset_trigger
+
+echo "Test semantic error for event enable/disable trigger"
+! echo 'enable_event:nogroup:noevent' > events/sched/sched_process_fork/trigger
+! echo 'disable_event+1' > events/sched/sched_process_fork/trigger
+echo 'enable_event:sched:sched_switch' > events/sched/sched_process_fork/trigger
+! echo 'enable_event:sched:sched_switch' > events/sched/sched_process_fork/trigger
+! echo 'disable_event:sched:sched_switch' > events/sched/sched_process_fork/trigger
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc
new file mode 100644
index 000000000..d9a198cb0
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-filter.tc
@@ -0,0 +1,38 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test trigger filter
+# requires: set_event events/sched/sched_process_fork/trigger
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test trigger filter"
+echo 1 > tracing_on
+echo 'traceoff if child_pid == 0' > events/sched/sched_process_fork/trigger
+( echo "forked")
+if [ `cat tracing_on` -ne 1 ]; then
+ fail "traceoff trigger on sched_process_fork did not work"
+fi
+
+reset_trigger
+
+echo "Test semantic error for trigger filter"
+! echo 'traceoff if a' > events/sched/sched_process_fork/trigger
+! echo 'traceoff if common_pid=0' > events/sched/sched_process_fork/trigger
+! echo 'traceoff if common_pid==b' > events/sched/sched_process_fork/trigger
+echo 'traceoff if common_pid == 0' > events/sched/sched_process_fork/trigger
+echo '!traceoff' > events/sched/sched_process_fork/trigger
+! echo 'traceoff if common_pid == child_pid' > events/sched/sched_process_fork/trigger
+echo 'traceoff if common_pid <= 0' > events/sched/sched_process_fork/trigger
+echo '!traceoff' > events/sched/sched_process_fork/trigger
+echo 'traceoff if common_pid >= 0' > events/sched/sched_process_fork/trigger
+echo '!traceoff' > events/sched/sched_process_fork/trigger
+echo 'traceoff if parent_pid >= 0 && child_pid >= 0' > events/sched/sched_process_fork/trigger
+echo '!traceoff' > events/sched/sched_process_fork/trigger
+echo 'traceoff if parent_pid >= 0 || child_pid >= 0' > events/sched/sched_process_fork/trigger
+echo '!traceoff' > events/sched/sched_process_fork/trigger
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc
new file mode 100644
index 000000000..4562e13cb
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-mod.tc
@@ -0,0 +1,50 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test histogram modifiers
+# requires: set_event events/sched/sched_process_fork/trigger events/sched/sched_process_fork/hist
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram with execname modifier"
+
+echo 'hist:keys=common_pid.execname' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+COMM=`cat /proc/$$/comm`
+grep "common_pid: $COMM" events/sched/sched_process_fork/hist > /dev/null || \
+ fail "execname modifier on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with hex modifier"
+
+echo 'hist:keys=parent_pid.hex' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+# Note that $$ is the parent pid. $PID is current PID.
+HEX=`printf %x $PID`
+grep "parent_pid: $HEX" events/sched/sched_process_fork/hist > /dev/null || \
+ fail "hex modifier on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with syscall modifier"
+
+echo 'hist:keys=id.syscall' > events/raw_syscalls/sys_exit/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+grep "id: \(unknown_\|sys_\)" events/raw_syscalls/sys_exit/hist > /dev/null || \
+ fail "syscall modifier on raw_syscalls/sys_exit did not work"
+
+
+reset_trigger
+
+echo "Test histogram with log2 modifier"
+
+echo 'hist:keys=bytes_req.log2' > events/kmem/kmalloc/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done # fork a few subshells; presumably this provokes kmalloc events - confirm
+grep 'bytes_req: ~ 2^[0-9]*' events/kmem/kmalloc/hist > /dev/null || \
+    fail "log2 modifier on kmem/kmalloc did not work"
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-syntax-errors.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-syntax-errors.tc
new file mode 100644
index 000000000..52cfe7828
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist-syntax-errors.tc
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test histogram parser errors
+# requires: set_event events/kmem/kmalloc/trigger events/kmem/kmalloc/hist error_log
+
+check_error() { # command-with-error-pos-by-^ ($1: hist trigger command, ^ marks the expected error position)
+ ftrace_errlog_check 'hist:kmem:kmalloc' "$1" 'events/kmem/kmalloc/trigger' # helper is not defined in this file; presumably the last argument is the file the command is written to - confirm
+}
+
+check_error 'hist:keys=common_pid:vals=bytes_req:sort=common_pid,^junk' # INVALID_SORT_FIELD
+check_error 'hist:keys=common_pid:vals=bytes_req:^sort=' # EMPTY_ASSIGNMENT
+check_error 'hist:keys=common_pid:vals=bytes_req:^sort=common_pid,' # EMPTY_SORT_FIELD
+check_error 'hist:keys=common_pid:vals=bytes_req:sort=common_pid.^junk' # INVALID_SORT_MODIFIER
+check_error 'hist:keys=common_pid:vals=bytes_req,bytes_alloc:^sort=common_pid,bytes_req,bytes_alloc' # TOO_MANY_SORT_FIELDS
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc
new file mode 100644
index 000000000..2950bfbc6
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-hist.tc
@@ -0,0 +1,58 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test histogram trigger
+# requires: set_event events/sched/sched_process_fork/trigger events/sched/sched_process_fork/hist
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram basic trigger"
+
+echo 'hist:keys=parent_pid:vals=child_pid' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+grep parent_pid events/sched/sched_process_fork/hist > /dev/null || \
+ fail "hist trigger on sched_process_fork did not work"
+grep child events/sched/sched_process_fork/hist > /dev/null || \
+ fail "hist trigger on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with compound keys"
+
+echo 'hist:keys=parent_pid,child_pid' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+grep '^{ parent_pid:.*, child_pid:.*}' events/sched/sched_process_fork/hist > /dev/null || \
+ fail "compound keys on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with string key"
+
+echo 'hist:keys=parent_comm' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+COMM=`cat /proc/$$/comm`
+grep "parent_comm: $COMM" events/sched/sched_process_fork/hist > /dev/null || \
+ fail "string key on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with sort key"
+
+echo 'hist:keys=parent_pid,child_pid:sort=child_pid.ascending' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+
+check_inc() { # numbers...: return 1 as soon as an argument is greater than its successor, else 0
+    until [ $# -le 1 ]; do
+        if [ $1 -gt $2 ]; then return 1; fi
+        shift # drop the left value; the next pair becomes $1/$2
+    done
+    return 0
+}
+check_inc `grep -o "child_pid:[[:space:]]*[[:digit:]]*" \
+ events/sched/sched_process_fork/hist | cut -d: -f2 ` ||
+ fail "sort param on sched_process_fork did not work"
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc
new file mode 100644
index 000000000..7129b52da
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-multihist.tc
@@ -0,0 +1,44 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test multiple histogram triggers
+# requires: set_event events/sched/sched_process_fork/trigger events/sched/sched_process_fork/hist
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram multiple triggers"
+
+echo 'hist:keys=parent_pid:vals=child_pid' > events/sched/sched_process_fork/trigger
+echo 'hist:keys=parent_comm:vals=child_pid' >> events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+grep parent_pid events/sched/sched_process_fork/hist > /dev/null || \
+ fail "hist trigger on sched_process_fork did not work"
+grep child events/sched/sched_process_fork/hist > /dev/null || \
+ fail "hist trigger on sched_process_fork did not work"
+COMM=`cat /proc/$$/comm`
+grep "parent_comm: $COMM" events/sched/sched_process_fork/hist > /dev/null || \
+ fail "string key on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test histogram with its name"
+
+echo 'hist:name=test_hist:keys=common_pid' > events/sched/sched_process_fork/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done
+grep test_hist events/sched/sched_process_fork/hist > /dev/null || \
+ fail "named event on sched_process_fork did not work"
+
+echo "Test same named histogram on different events"
+
+echo 'hist:name=test_hist:keys=common_pid' > events/sched/sched_process_exit/trigger
+for i in `seq 1 10` ; do ( echo "forked" > /dev/null); done # each subshell forks and then exits
+grep test_hist events/sched/sched_process_exit/hist > /dev/null || \
+    fail "named event on sched_process_exit did not work"
+
+diffs=`diff events/sched/sched_process_exit/hist events/sched/sched_process_fork/hist | wc -l`
+test $diffs -eq 0 || fail "Same name histograms are not same"
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc
new file mode 100644
index 000000000..33f5bdee3
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-snapshot.tc
@@ -0,0 +1,36 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test snapshot-trigger
+# requires: set_event events/sched/sched_process_fork/trigger snapshot
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+FEATURE=`grep snapshot events/sched/sched_process_fork/trigger`
+if [ -z "$FEATURE" ]; then
+ echo "snapshot trigger is not supported"
+ exit_unsupported
+fi
+
+echo "Test snapshot trigger"
+echo 0 > snapshot
+echo 1 > events/sched/sched_process_fork/enable
+( echo "forked")
+echo 'snapshot:1' > events/sched/sched_process_fork/trigger
+( echo "forked")
+grep sched_process_fork snapshot > /dev/null || \
+ fail "snapshot trigger on sched_process_fork did not work"
+
+reset_trigger
+echo 0 > snapshot
+echo 0 > events/sched/sched_process_fork/enable
+
+echo "Test snapshot semantic errors"
+
+! echo "snapshot+1" > events/sched/sched_process_fork/trigger
+echo "snapshot" > events/sched/sched_process_fork/trigger
+! echo "snapshot" > events/sched/sched_process_fork/trigger
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-stacktrace.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-stacktrace.tc
new file mode 100644
index 000000000..320ea9b3c
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-stacktrace.tc
@@ -0,0 +1,33 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test stacktrace-trigger
+# requires: set_event events/sched/sched_process_fork/trigger
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+FEATURE=`grep stacktrace events/sched/sched_process_fork/trigger`
+if [ -z "$FEATURE" ]; then
+ echo "stacktrace trigger is not supported"
+ exit_unsupported
+fi
+
+echo "Test stacktrace trigger"
+echo 0 > trace
+echo 0 > options/stacktrace
+echo 'stacktrace' > events/sched/sched_process_fork/trigger
+( echo "forked")
+grep "<stack trace>" trace > /dev/null || \
+ fail "stacktrace trigger on sched_process_fork did not work"
+
+reset_trigger
+
+echo "Test stacktrace semantic errors"
+
+! echo "stacktrace:foo" > events/sched/sched_process_fork/trigger
+echo "stacktrace" > events/sched/sched_process_fork/trigger
+! echo "stacktrace" > events/sched/sched_process_fork/trigger
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-hist.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-hist.tc
new file mode 100644
index 000000000..68f3af9d9
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-hist.tc
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: trace_marker trigger - test histogram trigger
+# requires: set_event events/ftrace/print/trigger events/ftrace/print/hist
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram trace_marker trigger"
+
+echo 'hist:keys=common_pid' > events/ftrace/print/trigger
+for i in `seq 1 10` ; do echo "hello" > trace_marker; done
+grep 'hitcount: *10$' events/ftrace/print/hist > /dev/null || \
+ fail "hist trigger did not trigger correct times on trace_marker"
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-snapshot.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-snapshot.tc
new file mode 100644
index 000000000..27da2dba9
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-snapshot.tc
@@ -0,0 +1,43 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: trace_marker trigger - test snapshot trigger
+# requires: set_event snapshot events/ftrace/print/trigger
+# flags: instance
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted: test_trace passes raw trace lines, which contain blank runs and [..] glob patterns
+    exit_fail
+}
+
+test_trace() {
+ file=$1
+ x=$2
+
+ cat $file | while read line; do
+ comment=`echo $line | sed -e 's/^#//'`
+ if [ "$line" != "$comment" ]; then
+ continue
+ fi
+ echo "testing $line for >$x<"
+ match=`echo $line | sed -e "s/>$x<//"`
+ if [ "$line" = "$match" ]; then
+ fail "$line does not have >$x< in it"
+ fi
+ x=$((x+2))
+ done
+}
+
+echo "Test snapshot trace_marker trigger"
+
+echo 'snapshot' > events/ftrace/print/trigger
+
+# make sure the snapshot is allocated
+
+grep -q 'Snapshot is allocated' snapshot
+
+for i in `seq 1 10` ; do echo "hello >$i<" > trace_marker; done
+
+test_trace trace 1
+test_trace snapshot 2
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic-kernel.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic-kernel.tc
new file mode 100644
index 000000000..531139f41
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic-kernel.tc
@@ -0,0 +1,27 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: trace_marker trigger - test histogram with synthetic event against kernel event
+# requires: set_event synthetic_events events/sched/sched_waking events/ftrace/print/trigger events/ftrace/print/hist
+# flags:
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram kernel event to trace_marker latency histogram trigger"
+
+echo 'latency u64 lat' > synthetic_events
+echo 'hist:keys=pid:ts0=common_timestamp.usecs' > events/sched/sched_waking/trigger
+echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(sched.sched_waking).latency($lat)' > events/ftrace/print/trigger
+echo 'hist:keys=common_pid,lat:sort=lat' > events/synthetic/latency/trigger
+sleep 1
+echo "hello" > trace_marker
+
+grep 'hitcount: *1$' events/ftrace/print/hist > /dev/null || \
+ fail "hist trigger did not trigger correct times on trace_marker"
+
+grep 'hitcount: *1$' events/synthetic/latency/hist > /dev/null || \
+ fail "hist trigger did not trigger "
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic.tc
new file mode 100644
index 000000000..cc99cbb06
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-trace-marker-synthetic.tc
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: trace_marker trigger - test histogram with synthetic event
+# requires: set_event synthetic_events events/ftrace/print/trigger events/ftrace/print/hist
+# flags:
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test histogram trace_marker to trace_marker latency histogram trigger"
+
+echo 'latency u64 lat' > synthetic_events
+echo 'hist:keys=common_pid:ts0=common_timestamp.usecs if buf == "start"' > events/ftrace/print/trigger
+echo 'hist:keys=common_pid:lat=common_timestamp.usecs-$ts0:onmatch(ftrace.print).latency($lat) if buf == "end"' >> events/ftrace/print/trigger
+echo 'hist:keys=common_pid,lat:sort=lat' > events/synthetic/latency/trigger
+echo -n "start" > trace_marker
+echo -n "end" > trace_marker
+
+cnt=`grep 'hitcount: *1$' events/ftrace/print/hist | wc -l`
+
+if [ $cnt -ne 2 ]; then
+ fail "hist trace_marker trigger did not trigger correctly"
+fi
+
+grep 'hitcount: *1$' events/synthetic/latency/hist > /dev/null || \
+ fail "hist trigger did not trigger "
+
+exit 0
diff --git a/tools/testing/selftests/ftrace/test.d/trigger/trigger-traceonoff.tc b/tools/testing/selftests/ftrace/test.d/trigger/trigger-traceonoff.tc
new file mode 100644
index 000000000..9ca04678f
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/trigger/trigger-traceonoff.tc
@@ -0,0 +1,38 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: event trigger - test traceon/off trigger
+# requires: set_event events/sched/sched_process_fork/trigger
+
+fail() { # msg: print the failure reason, then call exit_fail (not defined in this file)
+    echo "$1" # quoted so blanks and glob characters in the message are printed verbatim
+    exit_fail
+}
+
+echo "Test traceoff trigger"
+echo 1 > tracing_on
+echo 'traceoff' > events/sched/sched_process_fork/trigger
+( echo "forked")
+if [ `cat tracing_on` -ne 0 ]; then
+ fail "traceoff trigger on sched_process_fork did not work"
+fi
+
+reset_trigger
+
+echo "Test traceon trigger"
+echo 0 > tracing_on # start with tracing switched off
+echo 'traceon' > events/sched/sched_process_fork/trigger
+( echo "forked") # subshell: forks, which should fire the sched_process_fork trigger
+if [ "$(cat tracing_on)" -ne 1 ]; then # quoted so an empty read cannot drop an operand of [
+    fail "traceon trigger on sched_process_fork did not work"
+fi
+
+reset_trigger
+
+echo "Test semantic error for traceoff/on trigger"
+! echo 'traceoff:badparam' > events/sched/sched_process_fork/trigger
+! echo 'traceoff+0' > events/sched/sched_process_fork/trigger
+echo 'traceon' > events/sched/sched_process_fork/trigger
+! echo 'traceon' > events/sched/sched_process_fork/trigger
+! echo 'traceoff' > events/sched/sched_process_fork/trigger
+
+exit 0
diff --git a/tools/testing/selftests/futex/Makefile b/tools/testing/selftests/futex/Makefile
new file mode 100644
index 000000000..11e157d75
--- /dev/null
+++ b/tools/testing/selftests/futex/Makefile
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+SUBDIRS := functional
+
+TEST_PROGS := run.sh
+
+.PHONY: all clean
+
+include ../lib.mk
+
+all:
+ @for DIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$DIR; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+ if [ -e $$DIR/$(TEST_PROGS) ]; then \
+ rsync -a $$DIR/$(TEST_PROGS) $$BUILD_TARGET/; \
+ fi \
+ done
+
+override define INSTALL_RULE
+ mkdir -p $(INSTALL_PATH)
+ install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+
+ @for SUBDIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \
+ done;
+endef
+
+override define CLEAN
+ @for DIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$DIR; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+ done
+endef
diff --git a/tools/testing/selftests/futex/README b/tools/testing/selftests/futex/README
new file mode 100644
index 000000000..f3926c33e
--- /dev/null
+++ b/tools/testing/selftests/futex/README
@@ -0,0 +1,62 @@
+Futex Test
+==========
+Futex Test is intended to thoroughly test the Linux kernel futex system call
+API.
+
+Functional tests shall test the documented behavior of the futex operation
+code under test. This includes checking for proper behavior under normal use,
+odd corner cases, regression tests, and abject abuse and misuse.
+
+Futextest will also provide example implementations of mutual exclusion
+primitives. These can be used as is in user applications or can serve as
+examples for system libraries. These will likely be added to either a new lib/
+directory or purely as header files under include/, I'm leaning toward the
+latter.
+
+Quick Start
+-----------
+# make
+# ./run.sh
+
+Design and Implementation Goals
+-------------------------------
+o Tests should be as self contained as is practical so as to facilitate sharing
+ the individual tests on mailing list discussions and bug reports.
+o The build system shall remain as simple as possible, avoiding any archive or
+ shared object building and linking.
+o Where possible, any helper functions or other package-wide code shall be
+ implemented in header files, avoiding the need to compile intermediate object
+ files.
+o External dependencies shall remain as minimal as possible. Currently gcc
+ and glibc are the only dependencies.
+o Tests return 0 for success and < 0 for failure.
+
+Output Formatting
+-----------------
+Test output shall be easily parsable by both human and machine. Title and
+results are printed to stdout, while intermediate ERROR or FAIL messages are
+sent to stderr. Tests shall support the -c option to print PASS, FAIL, and
+ERROR strings in color for easy visual parsing. Output shall conform to the
+following format:
+
+test_name: Description of the test
+ Arguments: arg1=val1 #units specified for clarity where appropriate
+ ERROR: Description of unexpected error
+ FAIL: Reason for test failure
+ # FIXME: Perhaps an " INFO: informational message" option would be
+ # useful here. Using -v to toggle them on and off, as with -c.
+ # there may be multiple ERROR or FAIL messages
+Result: (PASS|FAIL|ERROR)
+
+Naming
+------
+o FIXME: decide on a sane test naming scheme. Currently the tests are named
+ based on the primary futex operation they test. Eventually this will become a
+ problem as we intend to write multiple tests which collide in this namespace.
+ Perhaps something like "wait-wake-1" "wait-wake-2" is adequate, leaving the
+ detailed description in the test source and the output.
+
+Coding Style
+------------
+o The Futex Test project adheres to the coding standards set forth by Linux
+ kernel as defined in the Linux source Documentation/process/coding-style.rst.
diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore
new file mode 100644
index 000000000..0efcd494d
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+futex_requeue_pi
+futex_requeue_pi_mismatched_ops
+futex_requeue_pi_signal_restart
+futex_wait_private_mapped_file
+futex_wait_timeout
+futex_wait_uninitialized_heap
+futex_wait_wouldblock
diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile
new file mode 100644
index 000000000..6a0ed2e78
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/Makefile
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0
+INCLUDES := -I../include -I../../
+CFLAGS := $(CFLAGS) -g -O2 -Wall -D_GNU_SOURCE -pthread $(INCLUDES)
+LDLIBS := -lpthread -lrt
+
+LOCAL_HDRS := \
+ ../include/futextest.h \
+ ../include/atomic.h \
+ ../include/logging.h
+TEST_GEN_PROGS := \
+ futex_wait_timeout \
+ futex_wait_wouldblock \
+ futex_requeue_pi \
+ futex_requeue_pi_signal_restart \
+ futex_requeue_pi_mismatched_ops \
+ futex_wait_uninitialized_heap \
+ futex_wait_private_mapped_file
+
+TEST_PROGS := run.sh
+
+top_srcdir = ../../../../..
+KSFT_KHDR_INSTALL := 1
+include ../../lib.mk
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi.c b/tools/testing/selftests/futex/functional/futex_requeue_pi.c
new file mode 100644
index 000000000..1ee5518ee
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2006-2008
+ *
+ * DESCRIPTION
+ * This test exercises the futex syscall op codes needed for requeuing
+ * priority inheritance aware POSIX condition variables and mutexes.
+ *
+ * AUTHORS
+ * Sripathi Kodi <sripathik@in.ibm.com>
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2008-Jan-13: Initial version by Sripathi Kodi <sripathik@in.ibm.com>
+ * 2009-Nov-6: futex test adaptation by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include "atomic.h"
+#include "futextest.h"
+#include "logging.h"
+
+#define TEST_NAME "futex-requeue-pi"
+#define MAX_WAKE_ITERS 1000
+#define THREAD_MAX 10
+#define SIGNAL_PERIOD_US 100
+
+atomic_t waiters_blocked = ATOMIC_INITIALIZER;
+atomic_t waiters_woken = ATOMIC_INITIALIZER;
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+futex_t wake_complete = FUTEX_INITIALIZER;
+
+/* Test option defaults */
+static long timeout_ns;
+static int broadcast;
+static int owner;
+static int locked;
+
+struct thread_arg {
+ long id;
+ struct timespec *timeout;
+ int lock;
+ int ret;
+};
+#define THREAD_ARG_INITIALIZER { 0, NULL, 0, 0 }
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -b Broadcast wakeup (all waiters)\n");
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -l Lock the pi futex across requeue\n");
+ printf(" -o Use a third party pi futex owner during requeue (cancels -l)\n");
+ printf(" -t N Timeout in nanoseconds (default: 0)\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
+ int policy, int prio)
+{
+ int ret;
+ struct sched_param schedp;
+ pthread_attr_t attr;
+
+ pthread_attr_init(&attr);
+ memset(&schedp, 0, sizeof(schedp));
+
+ ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+ if (ret) {
+ error("pthread_attr_setinheritsched\n", ret);
+ return -1;
+ }
+
+ ret = pthread_attr_setschedpolicy(&attr, policy);
+ if (ret) {
+ error("pthread_attr_setschedpolicy\n", ret);
+ return -1;
+ }
+
+ schedp.sched_priority = prio;
+ ret = pthread_attr_setschedparam(&attr, &schedp);
+ if (ret) {
+ error("pthread_attr_setschedparam\n", ret);
+ return -1;
+ }
+
+ ret = pthread_create(pth, &attr, func, arg);
+ if (ret) {
+ error("pthread_create\n", ret);
+ return -1;
+ }
+ return 0;
+}
+
+
+void *waiterfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ futex_t old_val;
+
+ info("Waiter %ld: running\n", args->id);
+ /* Each thread sleeps for a different amount of time
+ * This is to avoid races, because we don't lock the
+ * external mutex here */
+ usleep(1000 * (long)args->id);
+
+ old_val = f1;
+ atomic_inc(&waiters_blocked);
+ info("Calling futex_wait_requeue_pi: %p (%u) -> %p\n",
+ &f1, f1, &f2);
+ args->ret = futex_wait_requeue_pi(&f1, old_val, &f2, args->timeout,
+ FUTEX_PRIVATE_FLAG);
+
+ info("waiter %ld woke with %d %s\n", args->id, args->ret,
+ args->ret < 0 ? strerror(errno) : "");
+ atomic_inc(&waiters_woken);
+ if (args->ret < 0) {
+ if (args->timeout && errno == ETIMEDOUT)
+ args->ret = 0;
+ else {
+ args->ret = RET_ERROR;
+ error("futex_wait_requeue_pi\n", errno);
+ }
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ info("Waiter %ld: exiting with %d\n", args->id, args->ret);
+ pthread_exit((void *)&args->ret);
+}
+
+void *broadcast_wakerfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ int nr_requeue = INT_MAX;
+ int task_count = 0;
+ futex_t old_val;
+ int nr_wake = 1;
+ int i = 0;
+
+ info("Waker: waiting for waiters to block\n");
+ while (waiters_blocked.val < THREAD_MAX)
+ usleep(1000);
+ usleep(1000);
+
+ info("Waker: Calling broadcast\n");
+ if (args->lock) {
+ info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n", f2, &f2);
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ continue_requeue:
+ old_val = f1;
+ args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2, nr_wake, nr_requeue,
+ FUTEX_PRIVATE_FLAG);
+ if (args->ret < 0) {
+ args->ret = RET_ERROR;
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ } else if (++i < MAX_WAKE_ITERS) {
+ task_count += args->ret;
+ if (task_count < THREAD_MAX - waiters_woken.val)
+ goto continue_requeue;
+ } else {
+ error("max broadcast iterations (%d) reached with %d/%d tasks woken or requeued\n",
+ 0, MAX_WAKE_ITERS, task_count, THREAD_MAX);
+ args->ret = RET_ERROR;
+ }
+
+ futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG);
+
+ if (args->lock)
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ if (args->ret > 0)
+ args->ret = task_count;
+
+ info("Waker: exiting with %d\n", args->ret);
+ pthread_exit((void *)&args->ret);
+}
+
+void *signal_wakerfn(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ unsigned int old_val;
+ int nr_requeue = 0;
+ int task_count = 0;
+ int nr_wake = 1;
+ int i = 0;
+
+ info("Waker: waiting for waiters to block\n");
+ while (waiters_blocked.val < THREAD_MAX)
+ usleep(1000);
+ usleep(1000);
+
+ while (task_count < THREAD_MAX && waiters_woken.val < THREAD_MAX) {
+ info("task_count: %d, waiters_woken: %d\n",
+ task_count, waiters_woken.val);
+ if (args->lock) {
+ info("Calling FUTEX_LOCK_PI on mutex=%x @ %p\n",
+ f2, &f2);
+ futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ }
+ info("Waker: Calling signal\n");
+ /* cond_signal */
+ old_val = f1;
+ args->ret = futex_cmp_requeue_pi(&f1, old_val, &f2,
+ nr_wake, nr_requeue,
+ FUTEX_PRIVATE_FLAG);
+ if (args->ret < 0)
+ args->ret = -errno;
+ info("futex: %x\n", f2);
+ if (args->lock) {
+ info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n",
+ f2, &f2);
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ }
+ info("futex: %x\n", f2);
+ if (args->ret < 0) {
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ args->ret = RET_ERROR;
+ break;
+ }
+
+ task_count += args->ret;
+ usleep(SIGNAL_PERIOD_US);
+ i++;
+ /* we have to loop at least THREAD_MAX times */
+ if (i > MAX_WAKE_ITERS + THREAD_MAX) {
+ error("max signaling iterations (%d) reached, giving up on pending waiters.\n",
+ 0, MAX_WAKE_ITERS + THREAD_MAX);
+ args->ret = RET_ERROR;
+ break;
+ }
+ }
+
+ futex_wake(&wake_complete, 1, FUTEX_PRIVATE_FLAG);
+
+ if (args->ret >= 0)
+ args->ret = task_count;
+
+ info("Waker: exiting with %d\n", args->ret);
+ info("Waker: waiters_woken: %d\n", waiters_woken.val);
+ pthread_exit((void *)&args->ret);
+}
+
+void *third_party_blocker(void *arg)
+{
+ struct thread_arg *args = (struct thread_arg *)arg;
+ int ret2 = 0;
+
+ args->ret = futex_lock_pi(&f2, NULL, 0, FUTEX_PRIVATE_FLAG);
+ if (args->ret)
+ goto out;
+ args->ret = futex_wait(&wake_complete, wake_complete, NULL,
+ FUTEX_PRIVATE_FLAG);
+ ret2 = futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+
+ out:
+ if (args->ret || ret2) {
+ error("third_party_blocker() futex error", 0);
+ args->ret = RET_ERROR;
+ }
+
+ pthread_exit((void *)&args->ret);
+}
+
+int unit_test(int broadcast, long lock, int third_party_owner, long timeout_ns)
+{
+ void *(*wakerfn)(void *) = signal_wakerfn;
+ struct thread_arg blocker_arg = THREAD_ARG_INITIALIZER;
+ struct thread_arg waker_arg = THREAD_ARG_INITIALIZER;
+ pthread_t waiter[THREAD_MAX], waker, blocker;
+ struct timespec ts, *tsp = NULL;
+ struct thread_arg args[THREAD_MAX];
+ int *waiter_ret;
+ int i, ret = RET_PASS;
+
+ if (timeout_ns) {
+ time_t secs;
+
+ info("timeout_ns = %ld\n", timeout_ns);
+ ret = clock_gettime(CLOCK_MONOTONIC, &ts);
+ secs = (ts.tv_nsec + timeout_ns) / 1000000000;
+ ts.tv_nsec = ((int64_t)ts.tv_nsec + timeout_ns) % 1000000000;
+ ts.tv_sec += secs;
+ info("ts.tv_sec = %ld\n", ts.tv_sec);
+ info("ts.tv_nsec = %ld\n", ts.tv_nsec);
+ tsp = &ts;
+ }
+
+ if (broadcast)
+ wakerfn = broadcast_wakerfn;
+
+ if (third_party_owner) {
+ if (create_rt_thread(&blocker, third_party_blocker,
+ (void *)&blocker_arg, SCHED_FIFO, 1)) {
+ error("Creating third party blocker thread failed\n",
+ errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ }
+
+ atomic_set(&waiters_woken, 0);
+ for (i = 0; i < THREAD_MAX; i++) {
+ args[i].id = i;
+ args[i].timeout = tsp;
+ info("Starting thread %d\n", i);
+ if (create_rt_thread(&waiter[i], waiterfn, (void *)&args[i],
+ SCHED_FIFO, 1)) {
+ error("Creating waiting thread failed\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ }
+ waker_arg.lock = lock;
+ if (create_rt_thread(&waker, wakerfn, (void *)&waker_arg,
+ SCHED_FIFO, 1)) {
+ error("Creating waker thread failed\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ /* Wait for threads to finish */
+ /* Store the first error or failure encountered in waiter_ret */
+ waiter_ret = &args[0].ret;
+ for (i = 0; i < THREAD_MAX; i++)
+ pthread_join(waiter[i],
+ *waiter_ret ? NULL : (void **)&waiter_ret);
+
+ if (third_party_owner)
+ pthread_join(blocker, NULL);
+ pthread_join(waker, NULL);
+
+out:
+ if (!ret) {
+ if (*waiter_ret)
+ ret = *waiter_ret;
+ else if (waker_arg.ret < 0)
+ ret = waker_arg.ret;
+ else if (blocker_arg.ret)
+ ret = blocker_arg.ret;
+ }
+
+ return ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int c, ret;
+
+ while ((c = getopt(argc, argv, "bchlot:v:")) != -1) {
+ switch (c) {
+ case 'b':
+ broadcast = 1;
+ break;
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'l':
+ locked = 1;
+ break;
+ case 'o':
+ owner = 1;
+ locked = 0;
+ break;
+ case 't':
+ timeout_ns = atoi(optarg);
+ break;
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Test requeue functionality\n", basename(argv[0]));
+ ksft_print_msg(
+ "\tArguments: broadcast=%d locked=%d owner=%d timeout=%ldns\n",
+ broadcast, locked, owner, timeout_ns);
+
+ /*
+ * FIXME: unit_test is obsolete now that we parse options and the
+ * various style of runs are done by run.sh - simplify the code and move
+ * unit_test into main()
+ */
+ ret = unit_test(broadcast, locked, owner, timeout_ns);
+
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c
new file mode 100644
index 000000000..d0a4d332e
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_mismatched_ops.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * 1. Block a thread using FUTEX_WAIT
+ * 2. Attempt to use FUTEX_CMP_REQUEUE_PI on the futex from 1.
+ * 3. The kernel must detect the mismatch and return -EINVAL.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+#define TEST_NAME "futex-requeue-pi-mismatched-ops"
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+int child_ret = 0;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *blocking_child(void *arg)
+{
+ child_ret = futex_wait(&f1, f1, NULL, FUTEX_PRIVATE_FLAG);
+ if (child_ret < 0) {
+ child_ret = -errno;
+ error("futex_wait\n", errno);
+ }
+ return (void *)&child_ret;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret = RET_PASS;
+ pthread_t child;
+ int c;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Detect mismatched requeue_pi operations\n",
+ basename(argv[0]));
+
+ if (pthread_create(&child, NULL, blocking_child, NULL)) {
+ error("pthread_create\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+ /* Allow the child to block in the kernel. */
+ sleep(1);
+
+ /*
+ * The kernel should detect the waiter did not setup the
+ * q->requeue_pi_key and return -EINVAL. If it does not,
+ * it likely gave the lock to the child, which is now hung
+ * in the kernel.
+ */
+ ret = futex_cmp_requeue_pi(&f1, f1, &f2, 1, 0, FUTEX_PRIVATE_FLAG);
+ if (ret < 0) {
+ if (errno == EINVAL) {
+ /*
+ * The kernel correctly detected the mismatched
+ * requeue_pi target and aborted. Wake the child with
+ * FUTEX_WAKE.
+ */
+ ret = futex_wake(&f1, 1, FUTEX_PRIVATE_FLAG);
+ if (ret == 1) {
+ ret = RET_PASS;
+ } else if (ret < 0) {
+ error("futex_wake\n", errno);
+ ret = RET_ERROR;
+ } else {
+ error("futex_wake did not wake the child\n", 0);
+ ret = RET_ERROR;
+ }
+ } else {
+ error("futex_cmp_requeue_pi\n", errno);
+ ret = RET_ERROR;
+ }
+ } else if (ret > 0) {
+ fail("futex_cmp_requeue_pi failed to detect the mismatch\n");
+ ret = RET_FAIL;
+ } else {
+ error("futex_cmp_requeue_pi found no waiters\n", 0);
+ ret = RET_ERROR;
+ }
+
+ pthread_join(child, NULL);
+
+ if (!ret)
+ ret = child_ret;
+
+ out:
+ /* If the kernel crashes, we shouldn't return at all. */
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
new file mode 100644
index 000000000..f8c43ce8f
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_requeue_pi_signal_restart.c
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2006-2008
+ *
+ * DESCRIPTION
+ * This test exercises the futex_wait_requeue_pi() signal handling both
+ * before and after the requeue. The first should be restarted by the
+ * kernel. The latter should return EWOULDBLOCK to the waiter.
+ *
+ * AUTHORS
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2008-May-5: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "atomic.h"
+#include "futextest.h"
+#include "logging.h"
+
+#define TEST_NAME "futex-requeue-pi-signal-restart"
+#define DELAY_US 100
+
+futex_t f1 = FUTEX_INITIALIZER;
+futex_t f2 = FUTEX_INITIALIZER;
+atomic_t requeued = ATOMIC_INITIALIZER;
+
+int waiter_ret = 0;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int create_rt_thread(pthread_t *pth, void*(*func)(void *), void *arg,
+ int policy, int prio)
+{
+ struct sched_param schedp;
+ pthread_attr_t attr;
+ int ret;
+
+ pthread_attr_init(&attr);
+ memset(&schedp, 0, sizeof(schedp));
+
+ ret = pthread_attr_setinheritsched(&attr, PTHREAD_EXPLICIT_SCHED);
+ if (ret) {
+ error("pthread_attr_setinheritsched\n", ret);
+ return -1;
+ }
+
+ ret = pthread_attr_setschedpolicy(&attr, policy);
+ if (ret) {
+ error("pthread_attr_setschedpolicy\n", ret);
+ return -1;
+ }
+
+ schedp.sched_priority = prio;
+ ret = pthread_attr_setschedparam(&attr, &schedp);
+ if (ret) {
+ error("pthread_attr_setschedparam\n", ret);
+ return -1;
+ }
+
+ ret = pthread_create(pth, &attr, func, arg);
+ if (ret) {
+ error("pthread_create\n", ret);
+ return -1;
+ }
+ return 0;
+}
+
+void handle_signal(int signo)
+{
+ info("signal received %s requeue\n",
+ requeued.val ? "after" : "prior to");
+}
+
+void *waiterfn(void *arg)
+{
+ unsigned int old_val;
+ int res;
+
+ waiter_ret = RET_PASS;
+
+ info("Waiter running\n");
+ info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2);
+ old_val = f1;
+ res = futex_wait_requeue_pi(&f1, old_val, &(f2), NULL,
+ FUTEX_PRIVATE_FLAG);
+ if (!requeued.val || errno != EWOULDBLOCK) {
+ fail("unexpected return from futex_wait_requeue_pi: %d (%s)\n",
+ res, strerror(errno));
+ info("w2:futex: %x\n", f2);
+ if (!res)
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ waiter_ret = RET_FAIL;
+ }
+
+ info("Waiter exiting with %d\n", waiter_ret);
+ pthread_exit(NULL);
+}
+
+
+int main(int argc, char *argv[])
+{
+ unsigned int old_val;
+ struct sigaction sa;
+ pthread_t waiter;
+ int c, res, ret = RET_PASS;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Test signal handling during requeue_pi\n",
+ basename(argv[0]));
+ ksft_print_msg("\tArguments: <none>\n");
+
+ sa.sa_handler = handle_signal;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = 0;
+ if (sigaction(SIGUSR1, &sa, NULL)) {
+ error("sigaction\n", errno);
+ exit(1);
+ }
+
+ info("m1:f2: %x\n", f2);
+ info("Creating waiter\n");
+ res = create_rt_thread(&waiter, waiterfn, NULL, SCHED_FIFO, 1);
+ if (res) {
+ error("Creating waiting thread failed", res);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("Calling FUTEX_LOCK_PI on f2=%x @ %p\n", f2, &f2);
+ info("m2:f2: %x\n", f2);
+ futex_lock_pi(&f2, 0, 0, FUTEX_PRIVATE_FLAG);
+ info("m3:f2: %x\n", f2);
+
+ while (1) {
+ /*
+ * signal the waiter before requeue, waiter should automatically
+ * restart futex_wait_requeue_pi() in the kernel. Wait for the
+ * waiter to block on f1 again.
+ */
+ info("Issuing SIGUSR1 to waiter\n");
+ pthread_kill(waiter, SIGUSR1);
+ usleep(DELAY_US);
+
+ info("Requeueing waiter via FUTEX_CMP_REQUEUE_PI\n");
+ old_val = f1;
+ res = futex_cmp_requeue_pi(&f1, old_val, &(f2), 1, 0,
+ FUTEX_PRIVATE_FLAG);
+ /*
+ * If res is non-zero, we either requeued the waiter or hit an
+ * error, break out and handle it. If it is zero, then the
+ * signal may have hit before the waiter was blocked on f1.
+ * Try again.
+ */
+ if (res > 0) {
+ atomic_set(&requeued, 1);
+ break;
+ } else if (res < 0) {
+ error("FUTEX_CMP_REQUEUE_PI failed\n", errno);
+ ret = RET_ERROR;
+ break;
+ }
+ }
+ info("m4:f2: %x\n", f2);
+
+ /*
+ * Signal the waiter after requeue, waiter should return from
+ * futex_wait_requeue_pi() with EWOULDBLOCK. Join the thread here so the
+ * futex_unlock_pi() can't happen before the signal wakeup is detected
+ * in the kernel.
+ */
+ info("Issuing SIGUSR1 to waiter\n");
+ pthread_kill(waiter, SIGUSR1);
+ info("Waiting for waiter to return\n");
+ pthread_join(waiter, NULL);
+
+ info("Calling FUTEX_UNLOCK_PI on mutex=%x @ %p\n", f2, &f2);
+ futex_unlock_pi(&f2, FUTEX_PRIVATE_FLAG);
+ info("m5:f2: %x\n", f2);
+
+ out:
+ if (ret == RET_PASS && waiter_ret)
+ ret = waiter_ret;
+
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c
new file mode 100644
index 000000000..fb4148f23
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_private_mapped_file.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * DESCRIPTION
+ * Internally, futex has two handling modes, anon and file. The private file
+ * mapping is special: at first it behaves as file, but after anything is
+ * written to it, it behaves as anon. This test is intended to test that case.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <libgen.h>
+#include <signal.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+#define TEST_NAME "futex-wait-private-mapped-file"
+#define PAGE_SZ 4096
+
+char pad[PAGE_SZ] = {1};
+futex_t val = 1;
+char pad2[PAGE_SZ] = {1};
+
+#define WAKE_WAIT_US 3000000
+struct timespec wait_timeout = { .tv_sec = 5, .tv_nsec = 0};
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *thr_futex_wait(void *arg)
+{
+ int ret;
+
+ info("futex wait\n");
+ ret = futex_wait(&val, 1, &wait_timeout, 0);
+ if (ret && errno != EWOULDBLOCK && errno != ETIMEDOUT) {
+ error("futex error.\n", errno);
+ print_result(TEST_NAME, RET_ERROR);
+ exit(RET_ERROR);
+ }
+
+ if (ret && errno == ETIMEDOUT)
+ fail("waiter timedout\n");
+
+ info("futex_wait: ret = %d, errno = %d\n", ret, errno);
+
+ return NULL;
+}
+
+int main(int argc, char **argv)
+{
+ pthread_t thr;
+ int ret = RET_PASS;
+ int res;
+ int c;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg(
+ "%s: Test the futex value of private file mappings in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ ret = pthread_create(&thr, NULL, thr_futex_wait, NULL);
+ if (ret) { /* pthread_create() returns a positive error number, never < 0 */
+ fprintf(stderr, "pthread_create error\n");
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("wait a while\n");
+ usleep(WAKE_WAIT_US);
+ val = 2;
+ res = futex_wake(&val, 1, 0);
+ info("futex_wake %d\n", res);
+ if (res != 1) {
+ fail("FUTEX_WAKE didn't find the waiting thread.\n");
+ ret = RET_FAIL;
+ }
+
+ info("join\n");
+ pthread_join(thr, NULL);
+
+ out:
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
new file mode 100644
index 000000000..ee55e6d38
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * Block on a futex and wait for timeout.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+#define TEST_NAME "futex-wait-timeout"
+
+static long timeout_ns = 100000; /* 100us default timeout */
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -t N Timeout in nanoseconds (default: 100,000)\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int main(int argc, char *argv[])
+{
+ futex_t f1 = FUTEX_INITIALIZER;
+ struct timespec to;
+ int res, ret = RET_PASS;
+ int c;
+
+ while ((c = getopt(argc, argv, "cht:v:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 't':
+ timeout_ns = atoi(optarg);
+ break;
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Block on a futex and wait for timeout\n",
+ basename(argv[0]));
+ ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns);
+
+ /* split the timeout so tv_nsec stays below one second for large -t values */
+ to.tv_sec = timeout_ns / 1000000000;
+ to.tv_nsec = timeout_ns % 1000000000;
+
+ info("Calling futex_wait on f1: %u @ %p\n", f1, &f1);
+ res = futex_wait(&f1, f1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != ETIMEDOUT) {
+ fail("futex_wait returned %d\n", res < 0 ? errno : res);
+ ret = RET_FAIL;
+ }
+
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c
new file mode 100644
index 000000000..ed9cd07e3
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_uninitialized_heap.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * DESCRIPTION
+ * Wait on uninitialized heap. It should be zero and FUTEX_WAIT should
+ * return immediately. This test is intended to test zero page handling in
+ * futex.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <libgen.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+#define TEST_NAME "futex-wait-uninitialized-heap"
+#define WAIT_US 5000000
+
+static int child_blocked = 1;
+static int child_ret;
+void *buf;
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+void *wait_thread(void *arg)
+{
+ int res;
+
+ child_ret = RET_PASS;
+ res = futex_wait(buf, 1, NULL, 0);
+ child_blocked = 0;
+
+ if (res != 0 && errno != EWOULDBLOCK) {
+ error("futex failure\n", errno);
+ child_ret = RET_ERROR;
+ }
+ pthread_exit(NULL);
+}
+
+int main(int argc, char **argv)
+{
+ int c, ret = RET_PASS;
+ long page_size;
+ pthread_t thr;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (buf == (void *)-1) {
+ error("mmap\n", errno);
+ exit(1);
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Test the uninitialized futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+
+ ret = pthread_create(&thr, NULL, wait_thread, NULL);
+ if (ret) {
+ error("pthread_create\n", errno);
+ ret = RET_ERROR;
+ goto out;
+ }
+
+ info("waiting %dus for child to return\n", WAIT_US);
+ usleep(WAIT_US);
+
+ ret = child_ret;
+ if (child_blocked) {
+ fail("child blocked in kernel\n");
+ ret = RET_FAIL;
+ }
+
+ out:
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
new file mode 100644
index 000000000..0ae390ff8
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * Test if FUTEX_WAIT op returns -EWOULDBLOCK if the futex value differs
+ * from the expected one.
+ *
+ * AUTHOR
+ * Gowrishankar <gowrishankar.m@in.ibm.com>
+ *
+ * HISTORY
+ * 2009-Nov-14: Initial version by Gowrishankar <gowrishankar.m@in.ibm.com>
+ *
+ *****************************************************************************/
+
+#include <errno.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "futextest.h"
+#include "logging.h"
+
+#define TEST_NAME "futex-wait-wouldblock"
+#define timeout_ns 100000
+
+void usage(char *prog)
+{
+ printf("Usage: %s\n", prog);
+ printf(" -c Use color\n");
+ printf(" -h Display this help message\n");
+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n",
+ VQUIET, VCRITICAL, VINFO);
+}
+
+int main(int argc, char *argv[])
+{
+ struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns};
+ futex_t f1 = FUTEX_INITIALIZER;
+ int res, ret = RET_PASS;
+ int c;
+
+ while ((c = getopt(argc, argv, "chv:")) != -1) {
+ switch (c) {
+ case 'c':
+ log_color(1);
+ break;
+ case 'h':
+ usage(basename(argv[0]));
+ exit(0);
+ case 'v':
+ log_verbosity(atoi(optarg));
+ break;
+ default:
+ usage(basename(argv[0]));
+ exit(1);
+ }
+ }
+
+ ksft_print_header();
+ ksft_set_plan(1);
+ ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ info("Calling futex_wait on f1: %u @ %p with val=%u\n", f1, &f1, f1+1);
+ res = futex_wait(&f1, f1+1, &to, FUTEX_PRIVATE_FLAG);
+ if (!res || errno != EWOULDBLOCK) {
+ fail("futex_wait returned: %d %s\n",
+ res ? errno : res, res ? strerror(errno) : "");
+ ret = RET_FAIL;
+ }
+
+ print_result(TEST_NAME, ret);
+ return ret;
+}
diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh
new file mode 100755
index 000000000..1acb6ace1
--- /dev/null
+++ b/tools/testing/selftests/futex/functional/run.sh
@@ -0,0 +1,75 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+###############################################################################
+#
+# Copyright © International Business Machines Corp., 2009
+#
+# DESCRIPTION
+# Run tests in the current directory.
+#
+# AUTHOR
+# Darren Hart <dvhart@linux.intel.com>
+#
+# HISTORY
+# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+# 2010-Jan-6: Add futex_wait_uninitialized_heap and futex_wait_private_mapped_file
+# by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+#
+###############################################################################
+
+# Test for a color capable console; USE_COLOR stays empty when tput fails
+if [ -z "$USE_COLOR" ]; then
+ tput setf 7 || tput setaf 7
+ if [ $? -eq 0 ]; then
+ USE_COLOR=1
+ tput sgr0
+ fi
+fi
+if [ "${USE_COLOR:-0}" -eq 1 ]; then
+ COLOR="-c"
+fi
+
+
+echo
+# requeue pi testing
+# without timeouts
+./futex_requeue_pi $COLOR
+./futex_requeue_pi $COLOR -b
+./futex_requeue_pi $COLOR -b -l
+./futex_requeue_pi $COLOR -b -o
+./futex_requeue_pi $COLOR -l
+./futex_requeue_pi $COLOR -o
+# with timeouts
+./futex_requeue_pi $COLOR -b -l -t 5000
+./futex_requeue_pi $COLOR -l -t 5000
+./futex_requeue_pi $COLOR -b -l -t 500000
+./futex_requeue_pi $COLOR -l -t 500000
+./futex_requeue_pi $COLOR -b -t 5000
+./futex_requeue_pi $COLOR -t 5000
+./futex_requeue_pi $COLOR -b -t 500000
+./futex_requeue_pi $COLOR -t 500000
+./futex_requeue_pi $COLOR -b -o -t 5000
+./futex_requeue_pi $COLOR -l -t 5000
+./futex_requeue_pi $COLOR -b -o -t 500000
+./futex_requeue_pi $COLOR -l -t 500000
+# with long timeout
+./futex_requeue_pi $COLOR -b -l -t 2000000000
+./futex_requeue_pi $COLOR -l -t 2000000000
+
+
+echo
+./futex_requeue_pi_mismatched_ops $COLOR
+
+echo
+./futex_requeue_pi_signal_restart $COLOR
+
+echo
+./futex_wait_timeout $COLOR
+
+echo
+./futex_wait_wouldblock $COLOR
+
+echo
+./futex_wait_uninitialized_heap $COLOR
+./futex_wait_private_mapped_file $COLOR
diff --git a/tools/testing/selftests/futex/include/atomic.h b/tools/testing/selftests/futex/include/atomic.h
new file mode 100644
index 000000000..428bcd921
--- /dev/null
+++ b/tools/testing/selftests/futex/include/atomic.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * GCC atomic builtin wrappers
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-17: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _ATOMIC_H
+#define _ATOMIC_H
+
+typedef struct {
+ volatile int val;
+} atomic_t;
+
+#define ATOMIC_INITIALIZER { 0 }
+
+/**
+ * atomic_cmpxchg() - Atomic compare and exchange
+ * @addr: The address of the atomic_t to be modified
+ * @oldval: The expected value of addr->val
+ * @newval: The new value to try and assign to addr->val
+ *
+ * Return the old value of addr->val.
+ */
+static inline int
+atomic_cmpxchg(atomic_t *addr, int oldval, int newval)
+{
+ return __sync_val_compare_and_swap(&addr->val, oldval, newval);
+}
+
+/**
+ * atomic_inc() - Atomic increment
+ * @addr: Address of the variable to increment
+ *
+ * Return the new value of addr->val.
+ */
+static inline int
+atomic_inc(atomic_t *addr)
+{
+ return __sync_add_and_fetch(&addr->val, 1);
+}
+
+/**
+ * atomic_dec() - Atomic decrement
+ * @addr: Address of the variable to decrement
+ *
+ * Return the new value of addr->val.
+ */
+static inline int
+atomic_dec(atomic_t *addr)
+{
+ return __sync_sub_and_fetch(&addr->val, 1);
+}
+
+/**
+ * atomic_set() - Set the value of an atomic_t
+ * @addr: Address of the variable to set
+ * @newval: New value for the atomic_t
+ *
+ * The value is written with a plain assignment to the volatile member; no
+ * __sync builtin is used here.
+ *
+ * Return newval, i.e. the value just stored in addr->val.
+ */
+static inline int
+atomic_set(atomic_t *addr, int newval)
+{
+ addr->val = newval;
+ return newval;
+}
+
+#endif
diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h
new file mode 100644
index 000000000..ddbcfc9b7
--- /dev/null
+++ b/tools/testing/selftests/futex/include/futextest.h
@@ -0,0 +1,262 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * Glibc independent futex library for testing kernel functionality.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _FUTEXTEST_H
+#define _FUTEXTEST_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/futex.h>
+
+typedef volatile u_int32_t futex_t;
+#define FUTEX_INITIALIZER 0
+
+/*
+ * Define the newer op codes if the system header file is not up to date.
+ * Every fallback is guarded by the very macro it defines, so a definition
+ * supplied by <linux/futex.h> always takes precedence.
+ */
+#ifndef FUTEX_WAIT_BITSET
+#define FUTEX_WAIT_BITSET		9
+#endif
+#ifndef FUTEX_WAKE_BITSET
+#define FUTEX_WAKE_BITSET		10
+#endif
+#ifndef FUTEX_WAIT_REQUEUE_PI
+#define FUTEX_WAIT_REQUEUE_PI		11
+#endif
+#ifndef FUTEX_CMP_REQUEUE_PI
+#define FUTEX_CMP_REQUEUE_PI		12
+#endif
+#ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE	(FUTEX_WAIT_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
+#endif
+#ifndef FUTEX_CMP_REQUEUE_PI_PRIVATE
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE	(FUTEX_CMP_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
+#endif
+
+/**
+ * futex() - SYS_futex syscall wrapper
+ * @uaddr:	address of first futex
+ * @op:		futex op code
+ * @val:	typically expected value of uaddr, but varies by op
+ * @timeout:	typically an absolute struct timespec (except where noted
+ *		otherwise). Overloaded by some ops
+ * @uaddr2:	address of second futex for some ops
+ * @val3:	varies by op
+ * @opflags:	flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG
+ *
+ * futex() is used by all the following futex op wrappers. It can also be
+ * used for misuse and abuse testing. Generally, the specific op wrappers
+ * should be used instead. It is a macro instead of a static inline function
+ * as some of the types are overloaded (timeout is used for nr_requeue for
+ * example).
+ *
+ * @op and @opflags are parenthesized in the expansion so that an argument
+ * which is itself an expression is OR'd as a whole.
+ *
+ * These argument descriptions are the defaults for all
+ * like-named arguments in the following wrappers except where noted below.
+ */
+#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \
+	syscall(SYS_futex, uaddr, (op) | (opflags), val, timeout, uaddr2, val3)
+
+/**
+ * futex_wait() - block on uaddr with optional timeout
+ * @timeout: relative timeout
+ */
+static inline int
+futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake() - wake one or more tasks blocked on uaddr
+ * @nr_wake: wake up to this many tasks
+ */
+static inline int
+futex_wake(futex_t *uaddr, int nr_wake, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_wait_bitset() - block on uaddr with bitset
+ * @bitset: bitset to be used with futex_wake_bitset
+ */
+static inline int
+futex_wait_bitset(futex_t *uaddr, futex_t val, struct timespec *timeout,
+ u_int32_t bitset, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT_BITSET, val, timeout, NULL, bitset,
+ opflags);
+}
+
+/**
+ * futex_wake_bitset() - wake one or more tasks blocked on uaddr with bitset
+ * @bitset: bitset to compare with that used in futex_wait_bitset
+ */
+static inline int
+futex_wake_bitset(futex_t *uaddr, int nr_wake, u_int32_t bitset, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE_BITSET, nr_wake, NULL, NULL, bitset,
+ opflags);
+}
+
+/**
+ * futex_lock_pi() - block on uaddr as a PI mutex
+ * @detect: whether (1) or not (0) to perform deadlock detection
+ */
+static inline int
+futex_lock_pi(futex_t *uaddr, struct timespec *timeout, int detect,
+ int opflags)
+{
+ return futex(uaddr, FUTEX_LOCK_PI, detect, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_unlock_pi() - release uaddr as a PI mutex, waking the top waiter
+ */
+static inline int
+futex_unlock_pi(futex_t *uaddr, int opflags)
+{
+ return futex(uaddr, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake_op() - issue FUTEX_WAKE_OP on uaddr and uaddr2
+ * @nr_wake: passed as the syscall val argument
+ * @nr_wake2: passed in the timeout argument slot
+ * @wake_op: passed as the syscall val3 argument
+ *
+ * NOTE(review): presumably nr_wake and nr_wake2 limit the number of waiters
+ * woken on uaddr and uaddr2, and wake_op is the encoded operation applied to
+ * uaddr2 - confirm against futex(2).
+ */
+static inline int
+futex_wake_op(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_wake2,
+ int wake_op, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE_OP, nr_wake, nr_wake2, uaddr2, wake_op,
+ opflags);
+}
+
+/**
+ * futex_requeue() - requeue without expected value comparison, deprecated
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ *
+ * Due to its inherently racy implementation, futex_requeue() is deprecated in
+ * favor of futex_cmp_requeue().
+ */
+static inline int
+futex_requeue(futex_t *uaddr, futex_t *uaddr2, int nr_wake, int nr_requeue,
+ int opflags)
+{
+ return futex(uaddr, FUTEX_REQUEUE, nr_wake, nr_requeue, uaddr2, 0,
+ opflags);
+}
+
+/**
+ * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ */
+static inline int
+futex_cmp_requeue(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
+ int nr_requeue, int opflags)
+{
+ return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2,
+ val, opflags);
+}
+
+/**
+ * futex_wait_requeue_pi() - block on uaddr and prepare to requeue to uaddr2
+ * @uaddr: non-PI futex source
+ * @uaddr2: PI futex target
+ *
+ * This is the first half of the requeue_pi mechanism. It shall always be
+ * paired with futex_cmp_requeue_pi().
+ */
+static inline int
+futex_wait_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2,
+ struct timespec *timeout, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT_REQUEUE_PI, val, timeout, uaddr2, 0,
+ opflags);
+}
+
+/**
+ * futex_cmp_requeue_pi() - requeue tasks from uaddr to uaddr2 (PI aware)
+ * @uaddr: non-PI futex source
+ * @uaddr2: PI futex target
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ */
+static inline int
+futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
+ int nr_requeue, int opflags)
+{
+ return futex(uaddr, FUTEX_CMP_REQUEUE_PI, nr_wake, nr_requeue, uaddr2,
+ val, opflags);
+}
+
+/**
+ * futex_cmpxchg() - atomic compare and exchange
+ * @uaddr: The address of the futex to be modified
+ * @oldval: The expected value of the futex
+ * @newval: The new value to try and assign the futex
+ *
+ * Implement cmpxchg using gcc atomic builtins.
+ * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *
+ * Return the old futex value.
+ */
+static inline u_int32_t
+futex_cmpxchg(futex_t *uaddr, u_int32_t oldval, u_int32_t newval)
+{
+ return __sync_val_compare_and_swap(uaddr, oldval, newval);
+}
+
+/**
+ * futex_dec() - atomic decrement of the futex value
+ * @uaddr: The address of the futex to be modified
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_dec(futex_t *uaddr)
+{
+ return __sync_sub_and_fetch(uaddr, 1);
+}
+
+/**
+ * futex_inc() - atomic increment of the futex value
+ * @uaddr: the address of the futex to be modified
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_inc(futex_t *uaddr)
+{
+ return __sync_add_and_fetch(uaddr, 1);
+}
+
+/**
+ * futex_set() - set the futex value
+ * @uaddr: the address of the futex to be modified
+ * @newval: New value for the futex
+ *
+ * The store is a plain assignment through the volatile futex_t pointer; no
+ * __sync builtin is used here.
+ *
+ * Return the new futex value.
+ */
+static inline u_int32_t
+futex_set(futex_t *uaddr, u_int32_t newval)
+{
+ *uaddr = newval;
+ return newval;
+}
+
+#endif
diff --git a/tools/testing/selftests/futex/include/logging.h b/tools/testing/selftests/futex/include/logging.h
new file mode 100644
index 000000000..874c69ce5
--- /dev/null
+++ b/tools/testing/selftests/futex/include/logging.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/******************************************************************************
+ *
+ * Copyright © International Business Machines Corp., 2009
+ *
+ * DESCRIPTION
+ * Glibc independent futex library for testing kernel functionality.
+ *
+ * AUTHOR
+ * Darren Hart <dvhart@linux.intel.com>
+ *
+ * HISTORY
+ * 2009-Nov-6: Initial version by Darren Hart <dvhart@linux.intel.com>
+ *
+ *****************************************************************************/
+
+#ifndef _LOGGING_H
+#define _LOGGING_H
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <linux/futex.h>
+#include "kselftest.h"
+
+/*
+ * Define PASS, ERROR, and FAIL strings with and without color escape
+ * sequences, default to no color.
+ */
+#define ESC 0x1B, '['
+#define BRIGHT '1'
+#define GREEN '3', '2'
+#define YELLOW '3', '3'
+#define RED '3', '1'
+#define ESCEND 'm'
+#define BRIGHT_GREEN ESC, BRIGHT, ';', GREEN, ESCEND
+#define BRIGHT_YELLOW ESC, BRIGHT, ';', YELLOW, ESCEND
+#define BRIGHT_RED ESC, BRIGHT, ';', RED, ESCEND
+#define RESET_COLOR ESC, '0', 'm'
+static const char PASS_COLOR[] = {BRIGHT_GREEN, ' ', 'P', 'A', 'S', 'S',
+ RESET_COLOR, 0};
+static const char ERROR_COLOR[] = {BRIGHT_YELLOW, 'E', 'R', 'R', 'O', 'R',
+ RESET_COLOR, 0};
+static const char FAIL_COLOR[] = {BRIGHT_RED, ' ', 'F', 'A', 'I', 'L',
+ RESET_COLOR, 0};
+static const char INFO_NORMAL[] = " INFO";
+static const char PASS_NORMAL[] = " PASS";
+static const char ERROR_NORMAL[] = "ERROR";
+static const char FAIL_NORMAL[] = " FAIL";
+const char *INFO = INFO_NORMAL;
+const char *PASS = PASS_NORMAL;
+const char *ERROR = ERROR_NORMAL;
+const char *FAIL = FAIL_NORMAL;
+
+/* Verbosity setting for INFO messages */
+#define VQUIET 0
+#define VCRITICAL 1
+#define VINFO 2
+#define VMAX VINFO
+int _verbose = VCRITICAL;
+
+/* Functional test return codes */
+#define RET_PASS 0
+#define RET_ERROR -1
+#define RET_FAIL -2
+
+/**
+ * log_color() - Use colored output for PASS, ERROR, and FAIL strings
+ * @use_color: use color (1) or not (0)
+ */
+void log_color(int use_color)
+{
+ if (use_color) {
+ PASS = PASS_COLOR;
+ ERROR = ERROR_COLOR;
+ FAIL = FAIL_COLOR;
+ } else {
+ PASS = PASS_NORMAL;
+ ERROR = ERROR_NORMAL;
+ FAIL = FAIL_NORMAL;
+ }
+}
+
+/**
+ * log_verbosity() - Set verbosity of test output
+ * @level: requested verbosity: VQUIET, VCRITICAL or VINFO
+ *
+ * Values above VMAX are clamped to VMAX and negative values to 0. The
+ * info() macro prints only at VINFO; the error() and fail() macros print at
+ * VCRITICAL and above, so VQUIET silences them as well.
+ */
+void log_verbosity(int level)
+{
+ if (level > VMAX)
+ level = VMAX;
+ else if (level < 0)
+ level = 0;
+ _verbose = level;
+}
+
+/**
+ * print_result() - Print standard PASS | ERROR | FAIL results
+ * @test_name: name passed on to the kselftest result helper
+ * @ret: the return value to be considered: 0 | RET_ERROR | RET_FAIL
+ *
+ * print_result() is primarily intended for functional tests. It calls the
+ * ksft_test_result_*() helper matching @ret followed by ksft_print_cnts();
+ * any other value of @ret prints nothing.
+ */
+void print_result(const char *test_name, int ret)
+{
+ switch (ret) {
+ case RET_PASS:
+ ksft_test_result_pass("%s\n", test_name);
+ ksft_print_cnts();
+ return;
+ case RET_ERROR:
+ ksft_test_result_error("%s\n", test_name);
+ ksft_print_cnts();
+ return;
+ case RET_FAIL:
+ ksft_test_result_fail("%s\n", test_name);
+ ksft_print_cnts();
+ return;
+ }
+}
+
+/* log level macros */
+#define info(message, vargs...) \
+do { \
+ if (_verbose >= VINFO) \
+ fprintf(stderr, "\t%s: "message, INFO, ##vargs); \
+} while (0)
+
+#define error(message, err, args...) \
+do { \
+ if (_verbose >= VCRITICAL) {\
+ if (err) \
+ fprintf(stderr, "\t%s: %s: "message, \
+ ERROR, strerror(err), ##args); \
+ else \
+ fprintf(stderr, "\t%s: "message, ERROR, ##args); \
+ } \
+} while (0)
+
+#define fail(message, args...) \
+do { \
+ if (_verbose >= VCRITICAL) \
+ fprintf(stderr, "\t%s: "message, FAIL, ##args); \
+} while (0)
+
+#endif
diff --git a/tools/testing/selftests/futex/run.sh b/tools/testing/selftests/futex/run.sh
new file mode 100755
index 000000000..5e76ea18f
--- /dev/null
+++ b/tools/testing/selftests/futex/run.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+###############################################################################
+#
+# Copyright © International Business Machines Corp., 2009
+#
+# DESCRIPTION
+# Run all tests under the functional, performance, and stress directories.
+# Format and summarize the results.
+#
+# AUTHOR
+# Darren Hart <dvhart@linux.intel.com>
+#
+# HISTORY
+# 2009-Nov-9: Initial version by Darren Hart <dvhart@linux.intel.com>
+#
+###############################################################################
+
+# Test for a color capable shell and pass the result to the subdir scripts
+USE_COLOR=0
+if tput setf 7 || tput setaf 7; then
+	USE_COLOR=1
+	tput sgr0
+fi
+export USE_COLOR
+
+# Start the functional tests only when the directory change succeeded;
+# otherwise "./run.sh" would pick up whatever run.sh lives in the current
+# directory, which can be this very script.
+(cd functional && ./run.sh)
diff --git a/tools/testing/selftests/gen_kselftest_tar.sh b/tools/testing/selftests/gen_kselftest_tar.sh
new file mode 100755
index 000000000..4a974bc03
--- /dev/null
+++ b/tools/testing/selftests/gen_kselftest_tar.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# SPDX-License-Identifier: GPL-2.0
+# gen_kselftest_tar
+# Generate kselftest tarball
+# Author: Shuah Khan <shuahkh@osg.samsung.com>
+# Copyright (C) 2015 Samsung Electronics Co., Ltd.
+
+# main
+main()
+{
+ if [ "$#" -eq 0 ]; then
+ echo "$0: Generating default compression gzip"
+ copts="cvzf"
+ ext=".tar.gz"
+ else
+ case "$1" in
+ tar)
+ copts="cvf"
+ ext=".tar"
+ ;;
+ targz)
+ copts="cvzf"
+ ext=".tar.gz"
+ ;;
+ tarbz2)
+ copts="cvjf"
+ ext=".tar.bz2"
+ ;;
+ tarxz)
+ copts="cvJf"
+ ext=".tar.xz"
+ ;;
+ *)
+ echo "Unknown tarball format $1"
+ exit 1
+ ;;
+ esac
+ fi
+
+ # Create working directory.
+ dest=`pwd`
+ install_work="$dest"/kselftest_install
+ install_name=kselftest
+ install_dir="$install_work"/"$install_name"
+ mkdir -p "$install_dir"
+
+ # Run install using INSTALL_KSFT_PATH override to generate install
+ # directory
+ ./kselftest_install.sh "$install_dir"
+ (cd "$install_work"; tar $copts "$dest"/kselftest${ext} $install_name)
+
+ # Don't put the message at the actual end as people may be parsing the
+ # "archive created" line in their scripts.
+ echo -e "\nConsider using 'make gen_tar' instead of this script\n"
+
+ echo "Kselftest archive kselftest${ext} created!"
+
+ # clean up top-level install work directory
+ rm -rf "$install_work"
+}
+
+main "$@"
diff --git a/tools/testing/selftests/gpio/.gitignore b/tools/testing/selftests/gpio/.gitignore
new file mode 100644
index 000000000..4c69408f3
--- /dev/null
+++ b/tools/testing/selftests/gpio/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+gpio-mockup-chardev
diff --git a/tools/testing/selftests/gpio/Makefile b/tools/testing/selftests/gpio/Makefile
new file mode 100644
index 000000000..acf4088a9
--- /dev/null
+++ b/tools/testing/selftests/gpio/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+
+VAR_CFLAGS := $(shell pkg-config --cflags mount 2>/dev/null)
+VAR_LDLIBS := $(shell pkg-config --libs mount 2>/dev/null)
+# Fall back to the conventional libmount locations when pkg-config reports
+# nothing. The include path is a compiler flag, so it is added to VAR_CFLAGS
+# and only the library itself goes into VAR_LDLIBS.
+ifeq ($(VAR_LDLIBS),)
+VAR_CFLAGS += -I/usr/include/libmount
+VAR_LDLIBS := -lmount
+endif
+
+CFLAGS += -O2 -g -std=gnu99 -Wall -I../../../../usr/include/ $(VAR_CFLAGS)
+LDLIBS += $(VAR_LDLIBS)
+
+TEST_PROGS := gpio-mockup.sh
+TEST_FILES := gpio-mockup-sysfs.sh
+TEST_GEN_PROGS_EXTENDED := gpio-mockup-chardev
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+GPIODIR := $(realpath ../../../gpio)
+GPIOOUT := $(OUTPUT)/tools-gpio/
+GPIOOBJ := $(GPIOOUT)/gpio-utils.o
+
+override define CLEAN
+ $(RM) $(TEST_GEN_PROGS_EXTENDED)
+ $(RM) -rf $(GPIOOUT)
+endef
+
+$(TEST_GEN_PROGS_EXTENDED): $(GPIOOBJ)
+
+$(GPIOOUT):
+ mkdir -p $@
+
+$(GPIOOBJ): $(GPIOOUT)
+ $(MAKE) OUTPUT=$(GPIOOUT) -C $(GPIODIR)
diff --git a/tools/testing/selftests/gpio/config b/tools/testing/selftests/gpio/config
new file mode 100644
index 000000000..abaa6902b
--- /dev/null
+++ b/tools/testing/selftests/gpio/config
@@ -0,0 +1,2 @@
+CONFIG_GPIOLIB=y
+CONFIG_GPIO_MOCKUP=m
diff --git a/tools/testing/selftests/gpio/gpio-mockup-chardev.c b/tools/testing/selftests/gpio/gpio-mockup-chardev.c
new file mode 100644
index 000000000..73ead8828
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-mockup-chardev.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GPIO chardev test helper
+ *
+ * Copyright (C) 2016 Bamvor Jian Zhang
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+#include <libmount.h>
+#include <err.h>
+#include <dirent.h>
+#include <linux/gpio.h>
+#include "../../../gpio/gpio-utils.h"
+
+#define CONSUMER "gpio-selftest"
+#define GC_NUM 10
+enum direction {
+ OUT,
+ IN
+};
+
+/**
+ * get_debugfs() - locate the gpio file below the mounted debugfs
+ * @path: out: on success set to a malloc'ed "<debugfs mount point>/gpio"
+ *        string that the caller must free(); left untouched when no
+ *        debugfs mount is found
+ *
+ * Walks the mount table through libmount and stops at the first entry whose
+ * filesystem type is "debugfs". Allocation and libmount failures terminate
+ * the program through err().
+ *
+ * Return: 0 when a debugfs mount was found, -1 otherwise.
+ */
+static int get_debugfs(char **path)
+{
+	struct libmnt_context *cxt;
+	struct libmnt_table *tb;
+	struct libmnt_iter *itr = NULL;
+	struct libmnt_fs *fs;
+	int found = 0, ret;
+
+	cxt = mnt_new_context();
+	if (!cxt)
+		err(EXIT_FAILURE, "libmount context allocation failed");
+
+	itr = mnt_new_iter(MNT_ITER_FORWARD);
+	if (!itr)
+		err(EXIT_FAILURE, "failed to initialize libmount iterator");
+
+	if (mnt_context_get_mtab(cxt, &tb))
+		err(EXIT_FAILURE, "failed to read mtab");
+
+	while (mnt_table_next_fs(tb, itr, &fs) == 0) {
+		const char *type = mnt_fs_get_fstype(fs);
+
+		/* An entry without a type string cannot be debugfs. */
+		if (type && !strcmp(type, "debugfs")) {
+			found = 1;
+			break;
+		}
+	}
+	if (found) {
+		ret = asprintf(path, "%s/gpio", mnt_fs_get_target(fs));
+		if (ret < 0)
+			err(EXIT_FAILURE, "failed to format string");
+	}
+
+	mnt_free_iter(itr);
+	mnt_free_context(cxt);
+
+	if (!found)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * gpio_debugfs_get() - read direction and value of a consumer's line from
+ * the gpio debugfs file.
+ * @consumer: substring identifying the wanted line
+ * @dir: out: set to OUT or IN when the direction text is recognized
+ * @value: out: set to 1 ("hi") or 0 ("lo") when the value text is recognized
+ *
+ * Only the first line that contains @consumer and a ')' is examined. @dir
+ * and @value keep their previous contents when the text after ')' is not
+ * recognized, yet the line still counts as found. A missing debugfs mount or
+ * a failed fopen() terminates the program through err().
+ *
+ * Returns 0 when such a line was found, -1 otherwise.
+ */
+static int gpio_debugfs_get(const char *consumer, int *dir, int *value)
+{
+ char *debugfs;
+ FILE *f;
+ char *line = NULL;
+ size_t len = 0;
+ char *cur;
+ int found = 0;
+
+ if (get_debugfs(&debugfs) != 0)
+ err(EXIT_FAILURE, "debugfs is not mounted");
+
+ f = fopen(debugfs, "r");
+ if (!f)
+ err(EXIT_FAILURE, "read from gpio debugfs failed");
+
+ /*
+ * Expected line layout (whitespace abbreviated):
+ * gpio-2 ( |gpio-selftest ) in lo
+ */
+ while (getline(&line, &len, f) != -1) {
+ cur = strstr(line, consumer);
+ if (cur == NULL)
+ continue;
+
+ cur = strchr(line, ')');
+ if (!cur)
+ continue;
+
+ /* Skip the ')' and the character that follows it. */
+ cur += 2;
+ /*
+ * Both branches advance by four characters.
+ * NOTE(review): presumably the direction field is padded to a fixed
+ * width so the value follows at this offset - confirm against the
+ * debugfs output format.
+ */
+ if (!strncmp(cur, "out", 3)) {
+ *dir = OUT;
+ cur += 4;
+ } else if (!strncmp(cur, "in", 2)) {
+ *dir = IN;
+ cur += 4;
+ }
+
+ if (!strncmp(cur, "hi", 2))
+ *value = 1;
+ else if (!strncmp(cur, "lo", 2))
+ *value = 0;
+
+ found = 1;
+ break;
+ }
+ free(debugfs);
+ fclose(f);
+ free(line);
+
+ if (!found)
+ return -1;
+
+ return 0;
+}
+
+/**
+ * list_gpiochip() - collect chip info for every matching /dev/gpiochip*
+ * @gpiochip_name: label, or label prefix, of the chips to collect
+ * @ret: out: number of matching chips (0 when none matched)
+ *
+ * Opens every /dev entry whose name starts with "gpiochip", queries it with
+ * GPIO_GET_CHIPINFO_IOCTL and keeps the entries whose label equals or starts
+ * with @gpiochip_name. Any failure terminates the program through err().
+ *
+ * Return: malloc'ed array of *@ret entries that the caller must free(), or
+ * NULL when no chip matched.
+ */
+static struct gpiochip_info *list_gpiochip(const char *gpiochip_name, int *ret)
+{
+	/* Number of entries the array can hold; scanning stops when full. */
+	const int capacity = 4 * (GC_NUM + 1);
+	struct gpiochip_info *cinfo;
+	struct gpiochip_info *current;
+	struct gpiochip_info *shrunk;
+	const struct dirent *ent;
+	DIR *dp;
+	char *chrdev_name;
+	int fd;
+	int i = 0;
+
+	cinfo = calloc(capacity, sizeof(struct gpiochip_info));
+	if (!cinfo)
+		err(EXIT_FAILURE, "gpiochip_info allocation failed");
+
+	current = cinfo;
+	dp = opendir("/dev");
+	if (!dp) {
+		*ret = -errno;
+		goto error_out;
+	}
+	*ret = 0;
+
+	while (i < capacity && (ent = readdir(dp)) != NULL) {
+		if (!check_prefix(ent->d_name, "gpiochip"))
+			continue;
+
+		if (asprintf(&chrdev_name, "/dev/%s", ent->d_name) < 0) {
+			*ret = -ENOMEM;
+			goto error_close_dir;
+		}
+
+		fd = open(chrdev_name, 0);
+		if (fd == -1) {
+			/* Save errno before fprintf() can change it. */
+			*ret = -errno;
+			fprintf(stderr, "Failed to open %s\n", chrdev_name);
+			free(chrdev_name);
+			goto error_close_dir;
+		}
+		/* The path is only needed for open(). */
+		free(chrdev_name);
+
+		if (ioctl(fd, GPIO_GET_CHIPINFO_IOCTL, current) == -1) {
+			*ret = -errno;
+			perror("Failed to issue CHIPINFO IOCTL\n");
+			close(fd);
+			goto error_close_dir;
+		}
+		close(fd);
+
+		/* Advance only on a match so other chips get overwritten. */
+		if (strcmp(current->label, gpiochip_name) == 0
+		    || check_prefix(current->label, gpiochip_name)) {
+			current++;
+			i++;
+		}
+	}
+
+	if (i == 0) {
+		free(cinfo);
+		cinfo = NULL;
+	} else {
+		/* Shrinking is best effort; keep the block if it fails. */
+		shrunk = realloc(cinfo, sizeof(struct gpiochip_info) * i);
+		if (shrunk)
+			cinfo = shrunk;
+		*ret = i;
+	}
+
+error_close_dir:
+	closedir(dp);
+error_out:
+	if (*ret < 0) {
+		/* err() appends strerror(errno); *ret holds -errno. */
+		errno = -*ret;
+		err(EXIT_FAILURE, "list gpiochip failed");
+	}
+
+	return cinfo;
+}
+
+/**
+ * gpio_pin_test() - request one line and cross-check it against debugfs
+ * @cinfo: chip the line belongs to (cinfo->name is used for the request)
+ * @line: line offset on that chip
+ * @flag: GPIOHANDLE_REQUEST_* flags for the request
+ * @value: value written to the line handle data before the request
+ *
+ * For an input request the debugfs direction must read IN. For an output
+ * request the direction must read OUT and the debugfs value, inverted when
+ * GPIOHANDLE_REQUEST_ACTIVE_LOW is set, must equal @value.
+ *
+ * Return: 0 on success. Every failure terminates the program through err().
+ */
+int gpio_pin_test(struct gpiochip_info *cinfo, int line, int flag, int value)
+{
+	struct gpiohandle_data data;
+	unsigned int lines[] = {line};
+	int fd;
+	int debugfs_dir = IN;
+	int debugfs_value = 0;
+	int ret;
+
+	data.values[0] = value;
+	ret = gpiotools_request_linehandle(cinfo->name, lines, 1, flag, &data,
+					   CONSUMER);
+	if (ret < 0)
+		goto fail_out;
+	fd = ret;
+
+	ret = gpio_debugfs_get(CONSUMER, &debugfs_dir, &debugfs_value);
+	if (ret) {
+		ret = -EINVAL;
+		goto release_out;
+	}
+
+	if (flag & GPIOHANDLE_REQUEST_INPUT) {
+		if (debugfs_dir != IN)
+			ret = -EINVAL;
+	} else if (flag & GPIOHANDLE_REQUEST_OUTPUT) {
+		if (flag & GPIOHANDLE_REQUEST_ACTIVE_LOW)
+			debugfs_value = !debugfs_value;
+
+		if (!(debugfs_dir == OUT && value == debugfs_value))
+			ret = -EINVAL;
+	}
+
+release_out:
+	/* The handle is released on the debugfs failure path as well. */
+	gpiotools_release_linehandle(fd);
+	/*
+	 * err() below reports errno, so give it the positive error code.
+	 * It is set after the release, which may itself touch errno.
+	 */
+	if (ret)
+		errno = EINVAL;
+
+fail_out:
+	if (ret)
+		err(EXIT_FAILURE, "gpio<%s> line<%d> test flag<0x%x> value<%d>",
+		    cinfo->name, line, flag, value);
+
+	return ret;
+}
+
+/**
+ * gpio_pin_tests() - run every request variant on one line
+ * @cinfo: chip the line belongs to
+ * @line: line offset on that chip
+ *
+ * Runs gpio_pin_test() for output low/high, active-low output low/high and
+ * input, printing a progress dot after each step.
+ */
+void gpio_pin_tests(struct gpiochip_info *cinfo, unsigned int line)
+{
+	/* line is unsigned, so print it with %u. */
+	printf("line<%u>", line);
+	gpio_pin_test(cinfo, line, GPIOHANDLE_REQUEST_OUTPUT, 0);
+	printf(".");
+	gpio_pin_test(cinfo, line, GPIOHANDLE_REQUEST_OUTPUT, 1);
+	printf(".");
+	gpio_pin_test(cinfo, line,
+		      GPIOHANDLE_REQUEST_OUTPUT | GPIOHANDLE_REQUEST_ACTIVE_LOW,
+		      0);
+	printf(".");
+	gpio_pin_test(cinfo, line,
+		      GPIOHANDLE_REQUEST_OUTPUT | GPIOHANDLE_REQUEST_ACTIVE_LOW,
+		      1);
+	printf(".");
+	gpio_pin_test(cinfo, line, GPIOHANDLE_REQUEST_INPUT, 0);
+	printf(".");
+}
+
+/*
+ * ./gpio-mockup-chardev gpio_chip_name_prefix is_valid_gpio_chip
+ * Return 0 if successful or exit with EXIT_FAILURE if test failed.
+ * gpio_chip_name_prefix: The prefix of gpiochip you want to test. E.g.
+ *			  gpio-mockup
+ * is_valid_gpio_chip:	  Whether the gpio_chip is valid. The string "true"
+ *			  means valid; anything else means invalid, i.e. the
+ *			  chip is expected not to be found by list_gpiochip.
+ */
+int main(int argc, char *argv[])
+{
+	char *prefix;
+	int valid;
+	struct gpiochip_info *cinfo;
+	struct gpiochip_info *current;
+	int nchips;
+	int i;
+	int ret;
+
+	if (argc < 3) {
+		printf("Usage: %s prefix is_valid\n", argv[0]);
+		exit(EXIT_FAILURE);
+	}
+
+	prefix = argv[1];
+	valid = strcmp(argv[2], "true") == 0 ? 1 : 0;
+
+	printf("Test gpiochip %s: ", prefix);
+	cinfo = list_gpiochip(prefix, &ret);
+	if (!cinfo) {
+		if (!valid && ret == 0) {
+			printf("Invalid test successful\n");
+			ret = 0;
+			goto out;
+		} else {
+			ret = -EINVAL;
+			goto out;
+		}
+	} else if (cinfo && !valid) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* On success list_gpiochip() stores the number of chips in ret. */
+	nchips = ret;
+	current = cinfo;
+	for (i = 0; i < nchips; i++) {
+		/* Without lines, "lines - 1" wraps and "% lines" traps. */
+		if (current->lines == 0) {
+			fprintf(stderr, "gpiochip %s reports no lines\n",
+				current->name);
+			ret = -EINVAL;
+			goto out;
+		}
+		gpio_pin_tests(current, 0);
+		gpio_pin_tests(current, current->lines - 1);
+		gpio_pin_tests(current, random() % current->lines);
+		current++;
+	}
+	ret = 0;
+	printf("successful\n");
+
+out:
+	if (ret)
+		fprintf(stderr, "gpio<%s> test failed\n", prefix);
+
+	if (cinfo)
+		free(cinfo);
+
+	if (ret)
+		exit(EXIT_FAILURE);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/gpio/gpio-mockup-sysfs.sh b/tools/testing/selftests/gpio/gpio-mockup-sysfs.sh
new file mode 100755
index 000000000..dd269d877
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-mockup-sysfs.sh
@@ -0,0 +1,135 @@
+
+# SPDX-License-Identifier: GPL-2.0
+# Check that sysfs and debugfs agree on the direction and value of the GPIO
+# selected by the global $nr.
+# Globals read: nr, GPIO_SYSFS, GPIO_DEBUGFS
+# Prints "." on success; otherwise prints a failure message and calls die,
+# which is provided by the script that sources this file.
+is_consistent()
+{
+	val=
+
+	active_low_sysfs=$(cat "$GPIO_SYSFS/gpio$nr/active_low")
+	val_sysfs=$(cat "$GPIO_SYSFS/gpio$nr/value")
+	dir_sysfs=$(cat "$GPIO_SYSFS/gpio$nr/direction")
+
+	# -w keeps "gpio-1" from also matching "gpio-10", "gpio-11", ...
+	gpio_this_debugfs=$(grep -w "gpio-$nr" "$GPIO_DEBUGFS" | sed "s/(.*)//g")
+	# Left unquoted on purpose: the fields are split on any whitespace.
+	dir_debugfs=$(echo $gpio_this_debugfs | awk '{print $2}')
+	val_debugfs=$(echo $gpio_this_debugfs | awk '{print $3}')
+	if [ "$val_debugfs" = "lo" ]; then
+		val=0
+	elif [ "$val_debugfs" = "hi" ]; then
+		val=1
+	else
+		# No usable debugfs entry: fail instead of comparing empty
+		# strings below.
+		echo "test fail, exit"
+		die
+	fi
+
+	# sysfs reports the logical value, so invert for active-low pins.
+	if [ "$active_low_sysfs" = "1" ]; then
+		if [ "$val" = "0" ]; then
+			val="1"
+		else
+			val="0"
+		fi
+	fi
+
+	if [ "$val_sysfs" = "$val" ] && [ "$dir_sysfs" = "$dir_debugfs" ]; then
+		echo -n "."
+	else
+		echo "test fail, exit"
+		die
+	fi
+}
+
+# Configure one exported GPIO through sysfs and verify it with is_consistent.
+#   $1 - GPIO number
+#   $2 - text written to the direction file
+#   $3 - text written to the active_low file
+#   $4 - text written to the value file, only when $2 is "out"
+test_pin_logic()
+{
+	nr=$1
+	direction=$2
+	active_low=$3
+	value=$4
+
+	echo "$direction" > "$GPIO_SYSFS/gpio$nr/direction"
+	echo "$active_low" > "$GPIO_SYSFS/gpio$nr/active_low"
+	if [ "$direction" = "out" ]; then
+		echo "$value" > "$GPIO_SYSFS/gpio$nr/value"
+	fi
+	is_consistent "$nr"
+}
+
+# Export GPIO $1 through sysfs, run the consistency, active_low and direction
+# checks on it, then unexport it again. Calls die when the export fails.
+test_one_pin()
+{
+	nr=$1
+
+	echo -n "test pin<$nr>"
+
+	echo "$nr" > "$GPIO_SYSFS/export" 2>/dev/null
+
+	if [ X$? != X0 ]; then
+		echo "test GPIO pin $nr failed"
+		die
+	fi
+
+	#"Checking if the sysfs is consistent with debugfs: "
+	is_consistent "$nr"
+
+	#"Checking the logic of active_low: "
+	test_pin_logic "$nr" out 1 1
+	test_pin_logic "$nr" out 1 0
+	test_pin_logic "$nr" out 0 1
+	test_pin_logic "$nr" out 0 0
+
+	#"Checking the logic of direction: "
+	# NOTE(review): "low" and "high" presumably select an output with
+	# that initial level - confirm against the GPIO sysfs ABI.
+	test_pin_logic "$nr" in 1 1
+	test_pin_logic "$nr" out 1 0
+	test_pin_logic "$nr" low 0 1
+	test_pin_logic "$nr" high 0 0
+
+	echo "$nr" > "$GPIO_SYSFS/unexport"
+
+	echo "successful"
+}
+
+# Negative test: exporting GPIO $1 is expected to fail. When the export
+# succeeds anyway, the pin is unexported again and die is called.
+test_one_pin_fail()
+{
+	nr=$1
+
+	echo "$nr" > "$GPIO_SYSFS/export" 2>/dev/null
+
+	if [ X$? != X0 ]; then
+		echo "test invalid pin $nr successful"
+	else
+		echo "test invalid pin $nr failed"
+		echo "$nr" > "$GPIO_SYSFS/unexport" 2>/dev/null
+		die
+	fi
+}
+
+# Print the gpiochip* entries found under $GPIO_DRV_SYSFS on a single line,
+# or an empty line when there are none (ls errors are discarded). Arguments
+# passed by callers are not used.
+list_chip()
+{
+ echo `ls -d $GPIO_DRV_SYSFS/gpiochip* 2>/dev/null`
+}
+
+# Exercise the first, the last and one random pin of a gpiochip via sysfs.
+#   $1 - sysfs directory of the chip; its "base" and "ngpio" files are read
+# Calls die when the chip reports zero GPIOs.
+test_chip()
+{
+	chip=$1
+	name=$(basename "$chip")
+	base=$(cat "$chip/base")
+	ngpio=$(cat "$chip/ngpio")
+	printf "%-10s %-5s %-5s\n" "$name" "$base" "$ngpio"
+	if [ "$ngpio" = "0" ]; then
+		echo "number of gpio is zero is not allowed."
+		# Stop here: the pin arithmetic below would test pin
+		# "base - 1" and then divide by zero.
+		die
+	fi
+	test_one_pin "$base"
+	test_one_pin $((base + ngpio - 1))
+	test_one_pin $(((RANDOM % ngpio) + base))
+}
+
+# Run test_chip on every gpiochip reported by list_chip. When no chip is
+# found the outcome depends on the global $valid: "false" counts as success,
+# any other value prints "fail" and calls die.
+test_chips_sysfs()
+{
+ gpiochip=`list_chip $module`
+ if [ X"$gpiochip" = X ]; then
+ if [ X"$valid" = Xfalse ]; then
+ echo "successful"
+ else
+ echo "fail"
+ die
+ fi
+ else
+ for chip in $gpiochip; do
+ test_chip $chip
+ done
+ fi
+}
+
diff --git a/tools/testing/selftests/gpio/gpio-mockup.sh b/tools/testing/selftests/gpio/gpio-mockup.sh
new file mode 100755
index 000000000..7f35b9880
--- /dev/null
+++ b/tools/testing/selftests/gpio/gpio-mockup.sh
@@ -0,0 +1,206 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+#exit status
+#1: Internal error
+#2: sysfs/debugfs not mount
+#3: insert module fail when gpio-mockup is a module.
+#4: Skip test including run as non-root user.
+#5: other reason.
+
+SYSFS=
+GPIO_SYSFS=
+GPIO_DRV_SYSFS=
+DEBUGFS=
+GPIO_DEBUGFS=
+dev_type=
+module=
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+usage()
+{
+ echo "Usage:"
+ echo "$0 [-f] [-m name] [-t type]"
+ echo "-f: full test. It maybe conflict with existence gpio device."
+ echo "-m: module name, default name is gpio-mockup. It could also test"
+ echo " other gpio device."
+ echo "-t: interface type: chardev(char device) and sysfs(being"
+ echo " deprecated). The first one is default"
+ echo ""
+ echo "$0 -h"
+ echo "This usage"
+}
+
+# Verify the test environment and initialise the sysfs/debugfs path globals.
+# Globals read:    UID, module, BASE, ksft_skip
+# Globals written: msg, SYSFS, GPIO_SYSFS, GPIO_DRV_SYSFS, DEBUGFS,
+#                  GPIO_DEBUGFS
+# Exits with $ksft_skip when not run as root and with 2 when sysfs or debugfs
+# is not mounted.
+prerequisite()
+{
+	msg="skip all tests:"
+	if [ "$UID" != 0 ]; then
+		echo "$msg must be run as root" >&2
+		exit $ksft_skip
+	fi
+	SYSFS=$(mount -t sysfs | head -1 | awk '{ print $3 }')
+	if [ ! -d "$SYSFS" ]; then
+		echo "$msg sysfs is not mounted" >&2
+		exit 2
+	fi
+	GPIO_SYSFS="$SYSFS/class/gpio"
+	GPIO_DRV_SYSFS="$SYSFS/devices/platform/$module/gpio"
+	DEBUGFS=$(mount -t debugfs | head -1 | awk '{ print $3 }')
+	if [ ! -d "$DEBUGFS" ]; then
+		echo "$msg debugfs is not mounted" >&2
+		exit 2
+	fi
+	GPIO_DEBUGFS="$DEBUGFS/gpio"
+	# Load the sysfs helpers from the directory of this script, like the
+	# chardev helper, so the test does not depend on the working
+	# directory.
+	source "$BASE/gpio-mockup-sysfs.sh"
+}
+
+try_insert_module()
+{
+ if [ -d "$GPIO_DRV_SYSFS" ]; then
+ echo "$GPIO_DRV_SYSFS exist. Skip insert module"
+ else
+ modprobe -q $module $1
+ if [ X$? != X0 ]; then
+ echo $msg insmod $module failed >&2
+ exit 3
+ fi
+ fi
+}
+
+remove_module()
+{
+ modprobe -r -q $module
+}
+
+die()
+{
+ remove_module
+ exit 5
+}
+
+test_chips()
+{
+ if [ X$dev_type = Xsysfs ]; then
+ echo "WARNING: sysfs ABI of gpio is going to deprecated."
+ test_chips_sysfs $*
+ else
+ $BASE/gpio-mockup-chardev $*
+ fi
+}
+
+gpio_test()
+{
+ param=$1
+ valid=$2
+
+ if [ X"$param" = X ]; then
+ die
+ fi
+ try_insert_module "gpio_mockup_ranges=$param"
+ echo -n "GPIO $module test with ranges: <"
+ echo "$param>: "
+ printf "%-10s %s\n" $param
+ test_chips $module $valid
+ remove_module
+}
+
+BASE=`dirname $0`
+
+dev_type=
+TEMP=`getopt -o fhm:t: -n '$0' -- "$@"`
+
+if [ "$?" != "0" ]; then
+ echo "Parameter process failed, Terminating..." >&2
+ exit 1
+fi
+
+# Note the quotes around `$TEMP': they are essential!
+eval set -- "$TEMP"
+
+while true; do
+ case $1 in
+ -f)
+ full_test=true
+ shift
+ ;;
+ -h)
+ usage
+ exit
+ ;;
+ -m)
+ module=$2
+ shift 2
+ ;;
+ -t)
+ dev_type=$2
+ shift 2
+ ;;
+ --)
+ shift
+ break
+ ;;
+ *)
+ echo "Internal error!"
+ exit 1
+ ;;
+ esac
+done
+
+if [ X"$module" = X ]; then
+ module="gpio-mockup"
+fi
+
+if [ X$dev_type != Xsysfs ]; then
+ dev_type="chardev"
+fi
+
+prerequisite
+
+echo "1. Test dynamic allocation of gpio successful means insert gpiochip and"
+echo " manipulate gpio pin successful"
+gpio_test "-1,32" true
+gpio_test "-1,32,-1,32" true
+gpio_test "-1,32,-1,32,-1,32" true
+if [ X$full_test = Xtrue ]; then
+ gpio_test "-1,32,32,64" true
+ gpio_test "-1,32,40,64,-1,5" true
+ gpio_test "-1,32,32,64,-1,32" true
+ gpio_test "0,32,32,64,-1,32,-1,32" true
+ gpio_test "-1,32,-1,32,0,32,32,64" true
+ echo "2. Do basic test: successful means insert gpiochip and"
+ echo " manipulate gpio pin successful"
+ gpio_test "0,32" true
+ gpio_test "0,32,32,64" true
+ gpio_test "0,32,40,64,64,96" true
+fi
+echo "3. Error test: successful means insert gpiochip failed"
+echo "3.1 Test number of gpio overflow"
+#Currently: The max number of gpio(1024) is defined in arm architecture.
+gpio_test "-1,32,-1,1024" false
+if [ X$full_test = Xtrue ]; then
+ echo "3.2 Test zero line of gpio"
+ gpio_test "0,0" false
+ echo "3.3 Test range overlap"
+ echo "3.3.1 Test corner case"
+ gpio_test "0,32,0,1" false
+ gpio_test "0,32,32,64,32,40" false
+ gpio_test "0,32,35,64,35,45" false
+ gpio_test "0,32,31,32" false
+ gpio_test "0,32,32,64,36,37" false
+ gpio_test "0,32,35,64,34,36" false
+ echo "3.3.2 Test inserting invalid second gpiochip"
+ gpio_test "0,32,30,35" false
+ gpio_test "0,32,1,5" false
+ gpio_test "10,32,9,14" false
+ gpio_test "10,32,30,35" false
+ echo "3.3.3 Test others"
+ gpio_test "0,32,40,56,39,45" false
+ gpio_test "0,32,40,56,30,33" false
+ gpio_test "0,32,40,56,30,41" false
+ gpio_test "0,32,40,56,20,21" false
+fi
+
+echo GPIO test PASS
+
diff --git a/tools/testing/selftests/ia64/.gitignore b/tools/testing/selftests/ia64/.gitignore
new file mode 100644
index 000000000..e962fb2a0
--- /dev/null
+++ b/tools/testing/selftests/ia64/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+aliasing-test
diff --git a/tools/testing/selftests/ia64/Makefile b/tools/testing/selftests/ia64/Makefile
new file mode 100644
index 000000000..4bce1a84b
--- /dev/null
+++ b/tools/testing/selftests/ia64/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_PROGS := aliasing-test
+
+all: $(TEST_PROGS)
+
+include ../lib.mk
+
+clean:
+ rm -fr $(TEST_PROGS)
diff --git a/tools/testing/selftests/ia64/aliasing-test.c b/tools/testing/selftests/ia64/aliasing-test.c
new file mode 100644
index 000000000..1ad6896f1
--- /dev/null
+++ b/tools/testing/selftests/ia64/aliasing-test.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Exercise /dev/mem mmap cases that have been troublesome in the past
+ *
+ * (c) Copyright 2007 Hewlett-Packard Development Company, L.P.
+ * Bjorn Helgaas <bjorn.helgaas@hp.com>
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/pci.h>
+
+int sum;
+
+/*
+ * Map @length bytes of @path starting at @offset and optionally read them.
+ *
+ * For paths below /proc/bus/pci the PCIIOC_MMAP_IS_MEM ioctl is issued
+ * first; a failure of that ioctl is reported but is not fatal.  When @touch
+ * is non-zero every int in the mapping is read and added to the global
+ * 'sum'.
+ *
+ * Returns 0 on success, 1 if mmap() failed and -1 if open() or munmap()
+ * failed.  The file descriptor is closed on every path.
+ */
+static int map_mem(char *path, off_t offset, size_t length, int touch)
+{
+	int fd, rc;
+	void *addr;
+	int *c, *end;
+
+	fd = open(path, O_RDWR);
+	if (fd == -1) {
+		perror(path);
+		return -1;
+	}
+
+	if (fnmatch("/proc/bus/pci/*", path, 0) == 0) {
+		rc = ioctl(fd, PCIIOC_MMAP_IS_MEM);
+		if (rc == -1)
+			perror("PCIIOC_MMAP_IS_MEM ioctl");
+	}
+
+	addr = mmap(NULL, length, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
+	if (addr == MAP_FAILED) {
+		/* Not an error for the callers, but do not leak the fd. */
+		close(fd);
+		return 1;
+	}
+
+	if (touch) {
+		c = (int *) addr;
+		/* Go through char * so no arithmetic is done on void *. */
+		end = (int *) ((char *) addr + length);
+		while (c < end)
+			sum += *c++;
+	}
+
+	rc = munmap(addr, length);
+	if (rc == -1) {
+		perror("munmap");
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/*
+ * Walk the directory tree rooted at @path.  Every entry whose name matches
+ * the fnmatch() pattern @file is handed to map_mem() with @offset, @length
+ * and @touch; every other entry that lstat() reports as a directory is
+ * scanned recursively.
+ *
+ * Returns the OR of all non-negative results, or a negative value once
+ * scandir() fails, memory cannot be allocated, map_mem() reports an
+ * inaccessible range or a nested scan fails.  After such a fatal error no
+ * further entries are examined, but all of them are still released.
+ */
+static int scan_tree(char *path, char *file, off_t offset, size_t length, int touch)
+{
+	struct dirent **namelist;
+	char *name, *path2;
+	int i, n, r, rc = 0, result = 0, fatal = 0;
+	/* Printed with %lx, so convert explicitly instead of relying on ABI. */
+	unsigned long start = (unsigned long) offset;
+	unsigned long end = (unsigned long) (offset + length);
+	struct stat buf;
+
+	n = scandir(path, &namelist, 0, alphasort);
+	if (n < 0) {
+		perror("scandir");
+		return -1;
+	}
+
+	for (i = 0; i < n; i++) {
+		name = namelist[i]->d_name;
+
+		/* After a fatal error only release the remaining entries. */
+		if (fatal)
+			goto skip;
+		if (fnmatch(".", name, 0) == 0)
+			goto skip;
+		if (fnmatch("..", name, 0) == 0)
+			goto skip;
+
+		path2 = malloc(strlen(path) + strlen(name) + 3);
+		if (!path2) {
+			perror("malloc");
+			result = -1;
+			fatal = 1;
+			goto skip;
+		}
+		strcpy(path2, path);
+		strcat(path2, "/");
+		strcat(path2, name);
+
+		if (fnmatch(file, name, 0) == 0) {
+			rc = map_mem(path2, offset, length, touch);
+			if (rc == 0)
+				fprintf(stderr, "PASS: %s 0x%lx-0x%lx is %s\n", path2, start, end, touch ? "readable" : "mappable");
+			else if (rc > 0)
+				fprintf(stderr, "PASS: %s 0x%lx-0x%lx not mappable\n", path2, start, end);
+			else {
+				fprintf(stderr, "FAIL: %s 0x%lx-0x%lx not accessible\n", path2, start, end);
+				fatal = 1;
+			}
+		} else {
+			r = lstat(path2, &buf);
+			if (r == 0 && S_ISDIR(buf.st_mode)) {
+				rc = scan_tree(path2, file, offset, length, touch);
+				if (rc < 0)
+					fatal = 1;
+			}
+		}
+
+		/* A fatal status replaces the accumulated result. */
+		if (fatal)
+			result = rc;
+		else
+			result |= rc;
+		free(path2);
+
+skip:
+		free(namelist[i]);
+	}
+	free(namelist);
+	return result;
+}
+
+char buf[1024];
+
+/*
+ * Write the two bytes "1\0" to @path and then read the file to the end into
+ * the global scratch buffer 'buf', counting the bytes.
+ * NOTE(review): presumably the write enables the sysfs ROM attribute before
+ * it is read -- confirm against the PCI sysfs documentation.
+ *
+ * Returns the number of bytes read (0 if nothing could be read) or -1 if
+ * the file cannot be opened or written.
+ */
+static int read_rom(char *path)
+{
+	int fd, rc;
+	size_t size = 0;
+
+	fd = open(path, O_RDWR);
+	if (fd == -1) {
+		perror(path);
+		return -1;
+	}
+
+	rc = write(fd, "1", 2);
+	if (rc <= 0) {
+		/* Report first: close() may overwrite errno. */
+		perror("write");
+		close(fd);
+		return -1;
+	}
+
+	do {
+		rc = read(fd, buf, sizeof(buf));
+		if (rc > 0)
+			size += rc;
+	} while (rc > 0);
+
+	close(fd);
+	return size;
+}
+
+/*
+ * Walk the directory tree rooted at @path.  Every entry whose name matches
+ * the fnmatch() pattern @file is read with read_rom(); every other entry
+ * that lstat() reports as a directory is scanned recursively.
+ *
+ * As before, the scan of a directory ends at the first ROM that yields no
+ * data (its read_rom() result becomes the return value) or at the first
+ * nested scan that returns a negative value.  Otherwise the OR of all
+ * results is returned.  All directory entries are released on every path.
+ */
+static int scan_rom(char *path, char *file)
+{
+	struct dirent **namelist;
+	char *name, *path2;
+	int i, n, r, rc = 0, result = 0, stop = 0;
+	struct stat buf;
+
+	n = scandir(path, &namelist, 0, alphasort);
+	if (n < 0) {
+		perror("scandir");
+		return -1;
+	}
+
+	for (i = 0; i < n; i++) {
+		name = namelist[i]->d_name;
+
+		/* Once the scan has ended only release the remaining entries. */
+		if (stop)
+			goto skip;
+		if (fnmatch(".", name, 0) == 0)
+			goto skip;
+		if (fnmatch("..", name, 0) == 0)
+			goto skip;
+
+		path2 = malloc(strlen(path) + strlen(name) + 3);
+		if (!path2) {
+			perror("malloc");
+			result = -1;
+			stop = 1;
+			goto skip;
+		}
+		strcpy(path2, path);
+		strcat(path2, "/");
+		strcat(path2, name);
+
+		if (fnmatch(file, name, 0) == 0) {
+			rc = read_rom(path2);
+
+			/*
+			 * It's OK if the ROM is unreadable.  Maybe there
+			 * is no ROM, or some other error occurred.  The
+			 * important thing is that no MCA happened.
+			 */
+			if (rc > 0)
+				fprintf(stderr, "PASS: %s read %d bytes\n", path2, rc);
+			else {
+				fprintf(stderr, "PASS: %s not readable\n", path2);
+				stop = 1;
+			}
+		} else {
+			r = lstat(path2, &buf);
+			if (r == 0 && S_ISDIR(buf.st_mode)) {
+				rc = scan_rom(path2, file);
+				if (rc < 0)
+					stop = 1;
+			}
+		}
+
+		/* The status that ended the scan replaces the accumulated one. */
+		if (stop)
+			result = rc;
+		else
+			result |= rc;
+		free(path2);
+
+skip:
+		free(namelist[i]);
+	}
+	free(namelist);
+	return result;
+}
+
+/*
+ * Run the /dev/mem, sysfs legacy_mem, sysfs ROM and /proc/bus/pci mapping
+ * checks in turn and print a PASS/FAIL line for each.
+ *
+ * Returns 1 if any check printed FAIL or any tree scan returned a negative
+ * value, otherwise 0.  An unmappable 0x0-0x100000 range is a PASS and no
+ * longer leaks its positive map_mem() status into the exit code.
+ */
+int main(void)
+{
+	int rc, failed = 0;
+
+	if (map_mem("/dev/mem", 0, 0xA0000, 1) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0xa0000 is readable\n");
+	else {
+		fprintf(stderr, "FAIL: /dev/mem 0x0-0xa0000 not accessible\n");
+		failed = 1;
+	}
+
+	/*
+	 * It's not safe to blindly read the VGA frame buffer.  If you know
+	 * how to poke the card the right way, it should respond, but it's
+	 * not safe in general.  Many machines, e.g., Intel chipsets, cover
+	 * up a non-responding card by just returning -1, but others will
+	 * report the failure as a machine check.
+	 */
+	if (map_mem("/dev/mem", 0xA0000, 0x20000, 0) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0xa0000-0xc0000 is mappable\n");
+	else {
+		fprintf(stderr, "FAIL: /dev/mem 0xa0000-0xc0000 not accessible\n");
+		failed = 1;
+	}
+
+	if (map_mem("/dev/mem", 0xC0000, 0x40000, 1) == 0)
+		fprintf(stderr, "PASS: /dev/mem 0xc0000-0x100000 is readable\n");
+	else {
+		fprintf(stderr, "FAIL: /dev/mem 0xc0000-0x100000 not accessible\n");
+		failed = 1;
+	}
+
+	/*
+	 * Often you can map all the individual pieces above (0-0xA0000,
+	 * 0xA0000-0xC0000, and 0xC0000-0x100000), but can't map the whole
+	 * thing at once.  This is because the individual pieces use different
+	 * attributes, and there's no single attribute supported over the
+	 * whole region.
+	 */
+	rc = map_mem("/dev/mem", 0, 1024*1024, 0);
+	if (rc == 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 is mappable\n");
+	else if (rc > 0)
+		fprintf(stderr, "PASS: /dev/mem 0x0-0x100000 not mappable\n");
+	else {
+		fprintf(stderr, "FAIL: /dev/mem 0x0-0x100000 not accessible\n");
+		failed = 1;
+	}
+
+	/* scan_tree() only returns a negative value after a failure. */
+	failed |= scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 0xA0000, 1) < 0;
+	failed |= scan_tree("/sys/class/pci_bus", "legacy_mem", 0xA0000, 0x20000, 0) < 0;
+	failed |= scan_tree("/sys/class/pci_bus", "legacy_mem", 0xC0000, 0x40000, 1) < 0;
+	failed |= scan_tree("/sys/class/pci_bus", "legacy_mem", 0, 1024*1024, 0) < 0;
+
+	/* An unreadable ROM is acceptable, so this result is not counted. */
+	scan_rom("/sys/devices", "rom");
+
+	failed |= scan_tree("/proc/bus/pci", "??.?", 0, 0xA0000, 1) < 0;
+	failed |= scan_tree("/proc/bus/pci", "??.?", 0xA0000, 0x20000, 0) < 0;
+	failed |= scan_tree("/proc/bus/pci", "??.?", 0xC0000, 0x40000, 1) < 0;
+	failed |= scan_tree("/proc/bus/pci", "??.?", 0, 1024*1024, 0) < 0;
+
+	return failed ? 1 : 0;
+}
diff --git a/tools/testing/selftests/intel_pstate/.gitignore b/tools/testing/selftests/intel_pstate/.gitignore
new file mode 100644
index 000000000..862de222a
--- /dev/null
+++ b/tools/testing/selftests/intel_pstate/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+aperf
+msr
diff --git a/tools/testing/selftests/intel_pstate/Makefile b/tools/testing/selftests/intel_pstate/Makefile
new file mode 100644
index 000000000..05d66ef50
--- /dev/null
+++ b/tools/testing/selftests/intel_pstate/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS := $(CFLAGS) -Wall -D_GNU_SOURCE
+LDLIBS += -lm
+
+ARCH ?= $(shell uname -m 2>/dev/null || echo not)
+ARCH_PROCESSED := $(shell echo $(ARCH) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+ifeq (x86,$(ARCH_PROCESSED))
+TEST_GEN_FILES := msr aperf
+endif
+
+TEST_PROGS := run.sh
+
+include ../lib.mk
+
+$(TEST_GEN_FILES): $(HEADERS)
diff --git a/tools/testing/selftests/intel_pstate/aperf.c b/tools/testing/selftests/intel_pstate/aperf.c
new file mode 100644
index 000000000..a8acf3996
--- /dev/null
+++ b/tools/testing/selftests/intel_pstate/aperf.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <math.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/timeb.h>
+#include <sched.h>
+#include <errno.h>
+#include <string.h>
+#include <time.h>
+#include "../kselftest.h"
+
+#define MSEC_PER_SEC 1000L
+#define NSEC_PER_MSEC 1000000L
+
+/* Print the one-line command synopsis for program @name on stdout. */
+void usage(char *name)
+{
+	fprintf(stdout, "Usage: %s cpunum\n", name);
+}
+
+/*
+ * Pin the process to the CPU named by argv[1], run a fixed busy loop and
+ * print the elapsed time plus a frequency figure derived from the deltas of
+ * the MSRs at offsets 0x10 (tsc), 0xe7 (mperf) and 0xe8 (aperf) of
+ * /dev/cpu/<cpu>/msr, sampled before and after the loop.
+ *
+ * Returns 0 on success, KSFT_SKIP if the msr device cannot be opened and 1
+ * for a bad argument or any failing system call.
+ */
+int main(int argc, char **argv) {
+	unsigned int i;
+	int cpu, fd;
+	long cpu_arg;
+	char *end;
+	char msr_file_name[64];
+	long long tsc, old_tsc, new_tsc;
+	long long aperf, old_aperf, new_aperf;
+	long long mperf, old_mperf, new_mperf;
+	struct timespec before, after;
+	long long int start, finish, total;
+	cpu_set_t cpuset;
+
+	if (argc != 2) {
+		usage(argv[0]);
+		return 1;
+	}
+
+	/* Accept only a complete decimal number that fits into a cpu_set_t. */
+	errno = 0;
+	cpu_arg = strtol(argv[1], &end, 10);
+
+	if (errno || end == argv[1] || *end != '\0' ||
+	    cpu_arg < 0 || cpu_arg >= CPU_SETSIZE) {
+		usage(argv[0]);
+		return 1;
+	}
+	cpu = cpu_arg;
+
+	sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu);
+	fd = open(msr_file_name, O_RDONLY);
+
+	if (fd == -1) {
+		printf("/dev/cpu/%d/msr: %s\n", cpu, strerror(errno));
+		return KSFT_SKIP;
+	}
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	if (sched_setaffinity(0, sizeof(cpu_set_t), &cpuset)) {
+		perror("Failed to set cpu affinity");
+		close(fd);
+		return 1;
+	}
+
+	if (clock_gettime(CLOCK_MONOTONIC, &before) < 0) {
+		perror("clock_gettime");
+		close(fd);
+		return 1;
+	}
+	/* A short or failed read would leave the counters uninitialised. */
+	if (pread(fd, &old_tsc, sizeof(old_tsc), 0x10) != (ssize_t) sizeof(old_tsc) ||
+	    pread(fd, &old_aperf, sizeof(old_aperf), 0xe7) != (ssize_t) sizeof(old_aperf) ||
+	    pread(fd, &old_mperf, sizeof(old_mperf), 0xe8) != (ssize_t) sizeof(old_mperf)) {
+		perror("pread");
+		close(fd);
+		return 1;
+	}
+
+	/* Busy loop that generates the load between the two samples. */
+	for (i=0; i<0x8fffffff; i++) {
+		sqrt(i);
+	}
+
+	if (clock_gettime(CLOCK_MONOTONIC, &after) < 0) {
+		perror("clock_gettime");
+		close(fd);
+		return 1;
+	}
+	if (pread(fd, &new_tsc, sizeof(new_tsc), 0x10) != (ssize_t) sizeof(new_tsc) ||
+	    pread(fd, &new_aperf, sizeof(new_aperf), 0xe7) != (ssize_t) sizeof(new_aperf) ||
+	    pread(fd, &new_mperf, sizeof(new_mperf), 0xe8) != (ssize_t) sizeof(new_mperf)) {
+		perror("pread");
+		close(fd);
+		return 1;
+	}
+	close(fd);
+
+	tsc = new_tsc-old_tsc;
+	aperf = new_aperf-old_aperf;
+	mperf = new_mperf-old_mperf;
+
+	/* Both timestamps in milliseconds. */
+	start = before.tv_sec*MSEC_PER_SEC + before.tv_nsec/NSEC_PER_MSEC;
+	finish = after.tv_sec*MSEC_PER_SEC + after.tv_nsec/NSEC_PER_MSEC;
+	total = finish - start;
+
+	printf("runTime: %4.2f\n", 1.0*total/MSEC_PER_SEC);
+	printf("freq: %7.0f\n", tsc / (1.0*aperf / (1.0 * mperf)) / total);
+	return 0;
+}
diff --git a/tools/testing/selftests/intel_pstate/msr.c b/tools/testing/selftests/intel_pstate/msr.c
new file mode 100644
index 000000000..88fdd2a4b
--- /dev/null
+++ b/tools/testing/selftests/intel_pstate/msr.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <math.h>
+#include <limits.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/timeb.h>
+#include <sched.h>
+#include <errno.h>
+
+
+/*
+ * Print the 64-bit value found at offset 0x199 of /dev/cpu/<argv[1]>/msr.
+ *
+ * Returns 0 on success and 1 for a missing or malformed CPU number, or if
+ * the device cannot be opened or read.
+ */
+int main(int argc, char **argv) {
+	int cpu, fd;
+	long cpu_arg;
+	char *end;
+	long long msr;
+	char msr_file_name[64];
+
+	if (argc != 2)
+		return 1;
+
+	/* Accept only a complete, non-negative decimal number. */
+	errno = 0;
+	cpu_arg = strtol(argv[1], &end, 10);
+
+	if (errno || end == argv[1] || *end != '\0' ||
+	    cpu_arg < 0 || cpu_arg > INT_MAX)
+		return 1;
+	cpu = cpu_arg;
+
+	sprintf(msr_file_name, "/dev/cpu/%d/msr", cpu);
+	fd = open(msr_file_name, O_RDONLY);
+
+	if (fd == -1) {
+		perror("Failed to open");
+		return 1;
+	}
+
+	/* Without this check a failed read would print an uninitialised value. */
+	if (pread(fd, &msr, sizeof(msr), 0x199) != (ssize_t) sizeof(msr)) {
+		perror("Failed to read");
+		close(fd);
+		return 1;
+	}
+	close(fd);
+
+	printf("msr 0x199: 0x%llx\n", msr);
+	return 0;
+}
diff --git a/tools/testing/selftests/intel_pstate/run.sh b/tools/testing/selftests/intel_pstate/run.sh
new file mode 100755
index 000000000..e7008f614
--- /dev/null
+++ b/tools/testing/selftests/intel_pstate/run.sh
@@ -0,0 +1,128 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test runs on Intel x86 based hardware that supports the intel_pstate
+# driver. The test checks the frequency settings from the maximum turbo
+# state to the minimum supported frequency, in decrements of 100MHz. The
+# test runs the aperf.c program to put load on each processor.
+#
+# The results are displayed in a table which indicate the "Target" state,
+# or the requested frequency in MHz, the Actual frequency, as read from
+# /proc/cpuinfo, the difference between the Target and Actual frequencies,
+# and the value of MSR 0x199 (MSR_IA32_PERF_CTL) which indicates what
+# pstate the cpu is in, and the value of
+# /sys/devices/system/cpu/intel_pstate/max_perf_pct X maximum turbo state
+#
+# Notes: In some cases several frequency values may be placed in the
+# /tmp/result.X files. This is done on purpose in order to catch cases
+# where the pstate driver may not be working at all. There is the case
+# where, for example, several "similar" frequencies are in the file:
+#
+#
+#/tmp/result.3100:1:cpu MHz : 2899.980
+#/tmp/result.3100:2:cpu MHz : 2900.000
+#/tmp/result.3100:3:msr 0x199: 0x1e00
+#/tmp/result.3100:4:max_perf_pct 94
+#
+# and the test will error out in those cases. The result.X file can be checked
+# for consistency and modified to remove the extra MHz values. The result.X
+# files can be re-evaluated by setting EVALUATE_ONLY to 1 below.
+
+EVALUATE_ONLY=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+if ! uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ | grep -q x86; then
+ echo "$0 # Skipped: Test can only run on x86 architectures."
+ exit $ksft_skip
+fi
+
+msg="skip all tests:"
+if [ $UID != 0 ] && [ $EVALUATE_ONLY == 0 ]; then
+ echo $msg please run this as root >&2
+ exit $ksft_skip
+fi
+
+max_cpus=$(($(nproc)-1))
+
+# run_test FREQ
+#
+# Start ./aperf on CPUs 0..$max_cpus in the background, wait five seconds,
+# then write to /tmp/result.FREQ:
+#   - the last of the distinct "MHz" lines of /proc/cpuinfo,
+#   - the output of "./msr 0",
+#   - the current intel_pstate max_perf_pct value.
+# Finally wait for all background jobs to finish.
+#
+# Globals:   max_cpus (read)
+# Arguments: $1 - suffix of the result file (the target frequency in MHz)
+function run_test () {
+	local result_file=/tmp/result.$1
+	local cpu job num_freqs max_perf_pct
+
+	for cpu in $(seq 0 "$max_cpus")
+	do
+		echo "launching aperf load on $cpu"
+		./aperf "$cpu" &
+	done
+
+	echo "sleeping for 5 seconds"
+	sleep 5
+	grep MHz /proc/cpuinfo | sort -u > /tmp/result.freqs
+	num_freqs=$(wc -l < /tmp/result.freqs)
+	if [ "$num_freqs" -ge 2 ]; then
+		tail -n 1 /tmp/result.freqs > "$result_file"
+	else
+		cp /tmp/result.freqs "$result_file"
+	fi
+	./msr 0 >> "$result_file"
+
+	max_perf_pct=$(cat /sys/devices/system/cpu/intel_pstate/max_perf_pct)
+	echo "max_perf_pct $max_perf_pct" >> "$result_file"
+
+	for job in $(jobs -p)
+	do
+		echo "waiting for job id $job"
+		wait "$job"
+	done
+}
+
+#
+# MAIN (ALL UNITS IN MHZ)
+#
+
+# Get the marketing frequency
+_mkt_freq=$(cat /proc/cpuinfo | grep -m 1 "model name" | awk '{print $NF}')
+_mkt_freq=$(echo $_mkt_freq | tr -d [:alpha:][:punct:])
+mkt_freq=${_mkt_freq}0
+
+# Get the ranges from cpupower
+_min_freq=$(cpupower frequency-info -l | tail -1 | awk ' { print $1 } ')
+min_freq=$(($_min_freq / 1000))
+_max_freq=$(cpupower frequency-info -l | tail -1 | awk ' { print $2 } ')
+max_freq=$(($_max_freq / 1000))
+
+
+[ $EVALUATE_ONLY -eq 0 ] && for freq in `seq $max_freq -100 $min_freq`
+do
+ echo "Setting maximum frequency to $freq"
+ cpupower frequency-set -g powersave --max=${freq}MHz >& /dev/null
+ run_test $freq
+done
+
+[ $EVALUATE_ONLY -eq 0 ] && cpupower frequency-set -g powersave --max=${max_freq}MHz >& /dev/null
+
+echo "========================================================================"
+echo "The marketing frequency of the cpu is $mkt_freq MHz"
+echo "The maximum frequency of the cpu is $max_freq MHz"
+echo "The minimum frequency of the cpu is $min_freq MHz"
+
+# make a pretty table
+echo "Target Actual Difference MSR(0x199) max_perf_pct" | tr " " "\n" > /tmp/result.tab
+for freq in `seq $max_freq -100 $min_freq`
+do
+ result_freq=$(cat /tmp/result.${freq} | grep "cpu MHz" | awk ' { print $4 } ' | awk -F "." ' { print $1 } ')
+ msr=$(cat /tmp/result.${freq} | grep "msr" | awk ' { print $3 } ')
+ max_perf_pct=$(cat /tmp/result.${freq} | grep "max_perf_pct" | awk ' { print $2 } ' )
+ cat >> /tmp/result.tab << EOF
+$freq
+$result_freq
+$((result_freq - freq))
+$msr
+$((max_perf_pct * max_freq))
+EOF
+done
+
+# print the table
+pr -aTt -5 < /tmp/result.tab
+
+exit 0
diff --git a/tools/testing/selftests/ipc/.gitignore b/tools/testing/selftests/ipc/.gitignore
new file mode 100644
index 000000000..9ed280e4c
--- /dev/null
+++ b/tools/testing/selftests/ipc/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+msgque_test
+msgque
diff --git a/tools/testing/selftests/ipc/Makefile b/tools/testing/selftests/ipc/Makefile
new file mode 100644
index 000000000..1c4448a84
--- /dev/null
+++ b/tools/testing/selftests/ipc/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+ ARCH := x86
+ CFLAGS := -DCONFIG_X86_32 -D__i386__
+endif
+ifeq ($(ARCH),x86_64)
+ ARCH := x86
+ CFLAGS := -DCONFIG_X86_64 -D__x86_64__
+endif
+
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := msgque
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/ipc/config b/tools/testing/selftests/ipc/config
new file mode 100644
index 000000000..070244710
--- /dev/null
+++ b/tools/testing/selftests/ipc/config
@@ -0,0 +1,2 @@
+CONFIG_EXPERT=y
+CONFIG_CHECKPOINT_RESTORE=y
diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c
new file mode 100644
index 000000000..5ec4d9e18
--- /dev/null
+++ b/tools/testing/selftests/ipc/msgque.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/msg.h>
+#include <fcntl.h>
+
+#include "../kselftest.h"
+
+#define MAX_MSG_SIZE 32
+
+/*
+ * One saved message.  The functions in this file pass &mtype to msgsnd()
+ * and msgrcv(), so mtype followed by mtext is the buffer those calls see;
+ * msize is bookkeeping kept in front of it and holds the payload length
+ * returned by msgrcv().
+ */
+struct msg1 {
+ int msize;
+ long mtype;
+ char mtext[MAX_MSG_SIZE];
+};
+
+#define TEST_STRING "Test sysv5 msg"
+#define MSG_TYPE 1
+
+#define ANOTHER_TEST_STRING "Yet another test sysv5 msg"
+#define ANOTHER_MSG_TYPE 26538
+
+/* Snapshot of a message queue, filled in by main() and dump_queue(). */
+struct msgque_data {
+ key_t key;			/* key made by ftok() in main() */
+ int msq_id;			/* id returned by msgget() */
+ int qbytes;			/* msg_qbytes reported by msgctl(MSG_STAT) */
+ int qnum;			/* number of entries in messages[] */
+ int mode;			/* msg_perm.mode reported by msgctl(MSG_STAT) */
+ struct msg1 *messages;		/* array allocated by dump_queue() */
+};
+
+/*
+ * Recreate the queue described by @msgque with the same id and refill it.
+ *
+ * The wanted id is written to /proc/sys/kernel/msg_next_id, the queue is
+ * created with msgget() and every saved message is sent again.
+ *
+ * Returns 0 on success or a negative errno value.  If the queue was created
+ * but has the wrong id or cannot be refilled, it is removed again before
+ * returning.
+ */
+int restore_queue(struct msgque_data *msgque)
+{
+	int fd, ret, id, i;
+	char buf[32];
+
+	fd = open("/proc/sys/kernel/msg_next_id", O_WRONLY);
+	if (fd == -1) {
+		/* Save errno before printf() gets a chance to change it. */
+		ret = -errno;
+		printf("Failed to open /proc/sys/kernel/msg_next_id\n");
+		return ret;
+	}
+	sprintf(buf, "%d", msgque->msq_id);
+
+	ret = write(fd, buf, strlen(buf));
+	if (ret != (int)strlen(buf)) {
+		/* A short write leaves errno unset; still report a failure. */
+		ret = ret < 0 ? -errno : -EIO;
+		printf("Failed to write to /proc/sys/kernel/msg_next_id\n");
+		close(fd);
+		return ret;
+	}
+	close(fd);
+
+	id = msgget(msgque->key, msgque->mode | IPC_CREAT | IPC_EXCL);
+	if (id == -1) {
+		ret = -errno;
+		printf("Failed to create queue\n");
+		return ret;
+	}
+
+	if (id != msgque->msq_id) {
+		printf("Restored queue has wrong id (%d instead of %d)\n",
+		       id, msgque->msq_id);
+		ret = -EFAULT;
+		goto destroy;
+	}
+
+	for (i = 0; i < msgque->qnum; i++) {
+		if (msgsnd(msgque->msq_id, &msgque->messages[i].mtype,
+			   msgque->messages[i].msize, IPC_NOWAIT) != 0) {
+			ret = -errno;
+			printf("msgsnd failed (%m)\n");
+			goto destroy;
+		}
+	}
+	return 0;
+
+destroy:
+	if (msgctl(id, IPC_RMID, NULL))
+		printf("Failed to destroy queue: %d\n", -errno);
+	return ret;
+}
+
+/*
+ * Drain the queue and compare every message (size, type and content) with
+ * the copies saved in @msgque, in order.  The queue is removed afterwards,
+ * whether or not the comparison succeeded.
+ *
+ * Returns 0 if the queue held exactly the saved messages, otherwise a
+ * negative errno value.
+ */
+int check_and_destroy_queue(struct msgque_data *msgque)
+{
+	struct msg1 message;
+	int cnt = 0, ret;
+
+	while (1) {
+		ret = msgrcv(msgque->msq_id, &message.mtype, MAX_MSG_SIZE,
+			     0, IPC_NOWAIT);
+		if (ret < 0) {
+			if (errno == ENOMSG)
+				break;
+			ret = -errno;
+			printf("Failed to read IPC message: %m\n");
+			goto err;
+		}
+		/* More messages than were saved: do not index past the array. */
+		if (cnt >= msgque->qnum) {
+			printf("Wrong message number\n");
+			ret = -EINVAL;
+			goto err;
+		}
+		if (ret != msgque->messages[cnt].msize) {
+			printf("Wrong message size: %d (expected %d)\n", ret,
+			       msgque->messages[cnt].msize);
+			ret = -EINVAL;
+			goto err;
+		}
+		if (message.mtype != msgque->messages[cnt].mtype) {
+			printf("Wrong message type\n");
+			ret = -EINVAL;
+			goto err;
+		}
+		if (memcmp(message.mtext, msgque->messages[cnt].mtext, ret)) {
+			printf("Wrong message content\n");
+			ret = -EINVAL;
+			goto err;
+		}
+		cnt++;
+	}
+
+	if (cnt != msgque->qnum) {
+		printf("Wrong message number\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
+	ret = 0;
+err:
+	if (msgctl(msgque->msq_id, IPC_RMID, NULL)) {
+		ret = -errno;
+		printf("Failed to destroy queue: %d\n", ret);
+		return ret;
+	}
+	return ret;
+}
+
+/*
+ * Save the state of the queue identified by @msgque->msq_id.
+ *
+ * The first 256 kernel indexes are probed with MSG_STAT until one resolves
+ * to the wanted id; its message count, mode and byte limit are stored and
+ * every message is copied (not removed) with MSG_COPY into a newly
+ * allocated msgque->messages array.
+ *
+ * Returns 0 on success or a negative errno value (-ENOENT if the queue was
+ * not found among the probed indexes).
+ */
+int dump_queue(struct msgque_data *msgque)
+{
+	struct msqid_ds ds;
+	int kern_id;
+	int i, ret;
+
+	for (kern_id = 0; kern_id < 256; kern_id++) {
+		ret = msgctl(kern_id, MSG_STAT, &ds);
+		if (ret < 0) {
+			/* EINVAL: no queue at this index, try the next one. */
+			if (errno == EINVAL)
+				continue;
+			ret = -errno;
+			printf("Failed to get stats for IPC queue with id %d\n",
+			       kern_id);
+			return ret;
+		}
+
+		if (ret == msgque->msq_id)
+			break;
+	}
+
+	/* Without a match 'ds' holds no data for our queue. */
+	if (kern_id == 256) {
+		printf("Failed to find IPC queue with id %d\n",
+		       msgque->msq_id);
+		return -ENOENT;
+	}
+
+	msgque->messages = malloc(sizeof(struct msg1) * ds.msg_qnum);
+	if (msgque->messages == NULL) {
+		printf("Failed to allocate memory for IPC queue messages\n");
+		return -ENOMEM;
+	}
+
+	msgque->qnum = ds.msg_qnum;
+	msgque->mode = ds.msg_perm.mode;
+	msgque->qbytes = ds.msg_qbytes;
+
+	for (i = 0; i < msgque->qnum; i++) {
+		/* With MSG_COPY the fourth argument is the message position. */
+		ret = msgrcv(msgque->msq_id, &msgque->messages[i].mtype,
+			     MAX_MSG_SIZE, i, IPC_NOWAIT | MSG_COPY);
+		if (ret < 0) {
+			ret = -errno;
+			printf("Failed to copy IPC message: %m (%d)\n", errno);
+			return ret;
+		}
+		msgque->messages[i].msize = ret;
+	}
+	return 0;
+}
+
+/*
+ * Put the two fixed test messages into the queue of @msgque:
+ * TEST_STRING with type MSG_TYPE, then ANOTHER_TEST_STRING with type
+ * ANOTHER_MSG_TYPE.  Both are sent including their terminating NUL.
+ *
+ * Returns 0 on success or -errno of the failing msgsnd().
+ */
+int fill_msgque(struct msgque_data *msgque)
+{
+	struct msg1 out;
+	int rc;
+
+	out.mtype = MSG_TYPE;
+	memcpy(out.mtext, TEST_STRING, sizeof(TEST_STRING));
+	rc = msgsnd(msgque->msq_id, &out.mtype, sizeof(TEST_STRING),
+		    IPC_NOWAIT);
+	if (rc != 0) {
+		printf("First message send failed (%m)\n");
+		return -errno;
+	}
+
+	out.mtype = ANOTHER_MSG_TYPE;
+	memcpy(out.mtext, ANOTHER_TEST_STRING, sizeof(ANOTHER_TEST_STRING));
+	rc = msgsnd(msgque->msq_id, &out.mtype, sizeof(ANOTHER_TEST_STRING),
+		    IPC_NOWAIT);
+	if (rc != 0) {
+		printf("Second message send failed (%m)\n");
+		return -errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Create a queue, fill it, dump it, verify and destroy it, then restore it
+ * from the dump and verify it once more.  Must run as root; otherwise the
+ * test is skipped.
+ */
+int main(int argc, char **argv)
+{
+	int err;
+	struct msgque_data msgque;
+
+	if (getuid() != 0)
+		return ksft_exit_skip(
+				"Please run the test as root - Exiting.\n");
+
+	msgque.key = ftok(argv[0], 822155650);
+	if (msgque.key == -1) {
+		printf("Can't make key: %d\n", -errno);
+		return ksft_exit_fail();
+	}
+
+	msgque.msq_id = msgget(msgque.key, IPC_CREAT | IPC_EXCL | 0666);
+	if (msgque.msq_id == -1) {
+		err = -errno;
+		printf("Can't create queue: %d\n", err);
+		goto err_out;
+	}
+
+	err = fill_msgque(&msgque);
+	if (err) {
+		printf("Failed to fill queue: %d\n", err);
+		goto err_destroy;
+	}
+
+	err = dump_queue(&msgque);
+	if (err) {
+		printf("Failed to dump queue: %d\n", err);
+		goto err_destroy;
+	}
+
+	/* This call removes the queue itself, even when the check fails. */
+	err = check_and_destroy_queue(&msgque);
+	if (err) {
+		printf("Failed to check and destroy queue: %d\n", err);
+		goto err_out;
+	}
+
+	/*
+	 * restore_queue() either never creates the queue or removes it again
+	 * before reporting an error, so nothing is left to destroy here.
+	 */
+	err = restore_queue(&msgque);
+	if (err) {
+		printf("Failed to restore queue: %d\n", err);
+		goto err_out;
+	}
+
+	err = check_and_destroy_queue(&msgque);
+	if (err) {
+		printf("Failed to test queue: %d\n", err);
+		goto err_out;
+	}
+	return ksft_exit_pass();
+
+err_destroy:
+	if (msgctl(msgque.msq_id, IPC_RMID, NULL)) {
+		printf("Failed to destroy queue: %d\n", -errno);
+		return ksft_exit_fail();
+	}
+err_out:
+	return ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/ir/.gitignore b/tools/testing/selftests/ir/.gitignore
new file mode 100644
index 000000000..0bbada8c1
--- /dev/null
+++ b/tools/testing/selftests/ir/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ir_loopback
diff --git a/tools/testing/selftests/ir/Makefile b/tools/testing/selftests/ir/Makefile
new file mode 100644
index 000000000..ad06489c2
--- /dev/null
+++ b/tools/testing/selftests/ir/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_PROGS := ir_loopback.sh
+TEST_GEN_PROGS_EXTENDED := ir_loopback
+APIDIR := ../../../include/uapi
+CFLAGS += -Wall -O2 -I$(APIDIR)
+
+include ../lib.mk
diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c
new file mode 100644
index 000000000..af7f9c7d5
--- /dev/null
+++ b/tools/testing/selftests/ir/ir_loopback.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+// test ir decoder
+//
+// Copyright (C) 2018 Sean Young <sean@mess.org>
+
+// When sending LIRC_MODE_SCANCODE, the IR will be encoded. rc-loopback
+// will send this IR to the receiver side, where we try to read the decoded
+// IR. Decoding happens in a separate kernel thread, so we will need to
+// wait until that is scheduled, hence we use poll to check for read
+// readiness.
+
+#include <linux/lirc.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "../kselftest.h"
+
+#define TEST_SCANCODES 10
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define SYSFS_PATH_MAX 256
+#define DNAME_PATH_MAX 256
+
+// Protocols exercised by main(), one entry per protocol variant:
+//   proto   - value sent in (and expected back in) lirc_scancode.rc_proto
+//   name    - variant name printed in the progress output
+//   mask    - ANDed with rand() to form the scancodes that are sent
+//   decoder - string written to the receiver's sysfs "protocols" file
+static const struct {
+ enum rc_proto proto;
+ const char *name;
+ unsigned int mask;
+ const char *decoder;
+} protocols[] = {
+ { RC_PROTO_RC5, "rc-5", 0x1f7f, "rc-5" },
+ { RC_PROTO_RC5X_20, "rc-5x-20", 0x1f7f3f, "rc-5" },
+ { RC_PROTO_RC5_SZ, "rc-5-sz", 0x2fff, "rc-5-sz" },
+ { RC_PROTO_JVC, "jvc", 0xffff, "jvc" },
+ { RC_PROTO_SONY12, "sony-12", 0x1f007f, "sony" },
+ { RC_PROTO_SONY15, "sony-15", 0xff007f, "sony" },
+ { RC_PROTO_SONY20, "sony-20", 0x1fff7f, "sony" },
+ { RC_PROTO_NEC, "nec", 0xffff, "nec" },
+ { RC_PROTO_NECX, "nec-x", 0xffffff, "nec" },
+ { RC_PROTO_NEC32, "nec-32", 0xffffffff, "nec" },
+ { RC_PROTO_SANYO, "sanyo", 0x1fffff, "sanyo" },
+ { RC_PROTO_RC6_0, "rc-6-0", 0xffff, "rc-6" },
+ { RC_PROTO_RC6_6A_20, "rc-6-6a-20", 0xfffff, "rc-6" },
+ { RC_PROTO_RC6_6A_24, "rc-6-6a-24", 0xffffff, "rc-6" },
+ { RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" },
+ { RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" },
+ { RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" },
+ { RC_PROTO_IMON, "imon", 0x7fffffff, "imon" },
+ { RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rc-mm" },
+ { RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rc-mm" },
+ { RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rc-mm" },
+};
+
+// Open the lirc character device that belongs to rc device @rc.
+//
+// /sys/class/rc/<rc> is searched for the first entry whose name starts with
+// "lirc"; the node of that name under /dev is opened read/write and
+// non-blocking.  The test exits with a failure if the sysfs directory or
+// the device node cannot be opened, and with a skip if no lirc entry exists.
+//
+// Returns the open file descriptor.
+int lirc_open(const char *rc)
+{
+	char path[SYSFS_PATH_MAX + DNAME_PATH_MAX];
+	struct dirent *entry;
+	DIR *dir;
+	int fd;
+
+	snprintf(path, sizeof(path), "/sys/class/rc/%s", rc);
+
+	dir = opendir(path);
+	if (!dir)
+		ksft_exit_fail_msg("cannot open %s: %m\n", path);
+
+	// Advance to the first "lirc*" entry, or to the end of the directory.
+	do {
+		entry = readdir(dir);
+	} while (entry && strncmp(entry->d_name, "lirc", 4) != 0);
+
+	if (entry)
+		snprintf(path, sizeof(path), "/dev/%s", entry->d_name);
+	else
+		ksft_exit_skip("cannot find lirc device for %s\n", rc);
+
+	closedir(dir);
+
+	fd = open(path, O_RDWR | O_NONBLOCK);
+	if (fd == -1)
+		ksft_exit_fail_msg("cannot open: %s: %m\n", path);
+
+	return fd;
+}
+
+// Usage: ir_loopback <write rcN> <read rcN>
+//
+// For every entry of protocols[], select its decoder on the receiving rc
+// device, send up to TEST_SCANCODES random scancodes through the sending
+// lirc device and check that the same protocol and scancode are read back
+// from the receiving lirc device.  Exits through ksft_exit_fail() if any
+// mismatch or read error was recorded, otherwise through ksft_exit_pass().
+int main(int argc, char **argv)
+{
+ unsigned int mode;
+ char buf[100];
+ int rlircfd, wlircfd, protocolfd, i, n;
+
+ srand(time(NULL));
+
+ if (argc != 3)
+ ksft_exit_fail_msg("Usage: %s <write rcN> <read rcN>\n",
+ argv[0]);
+
+ // Receiver: argv[2], switched to scancode mode.
+ rlircfd = lirc_open(argv[2]);
+ mode = LIRC_MODE_SCANCODE;
+ if (ioctl(rlircfd, LIRC_SET_REC_MODE, &mode))
+ ksft_exit_fail_msg("failed to set scancode rec mode %s: %m\n",
+ argv[2]);
+
+ // Sender: argv[1], also in scancode mode.
+ wlircfd = lirc_open(argv[1]);
+ if (ioctl(wlircfd, LIRC_SET_SEND_MODE, &mode))
+ ksft_exit_fail_msg("failed to set scancode send mode %s: %m\n",
+ argv[1]);
+
+ // The decoder is selected by writing its name to this sysfs file.
+ snprintf(buf, sizeof(buf), "/sys/class/rc/%s/protocols", argv[2]);
+ protocolfd = open(buf, O_WRONLY);
+ if (protocolfd == -1)
+ ksft_exit_fail_msg("failed to open %s: %m\n", buf);
+
+ printf("Sending IR on %s and receiving IR on %s.\n", argv[1], argv[2]);
+
+ for (i = 0; i < ARRAY_SIZE(protocols); i++) {
+ if (write(protocolfd, protocols[i].decoder,
+ strlen(protocols[i].decoder)) == -1)
+ ksft_exit_fail_msg("failed to set write decoder\n");
+
+ printf("Testing protocol %s for decoder %s (%d/%d)...\n",
+ protocols[i].name, protocols[i].decoder,
+ i + 1, (int)ARRAY_SIZE(protocols));
+
+ for (n = 0; n < TEST_SCANCODES; n++) {
+ unsigned int scancode = rand() & protocols[i].mask;
+ unsigned int rc_proto = protocols[i].proto;
+
+ if (rc_proto == RC_PROTO_RC6_MCE)
+ scancode |= 0x800f0000;
+
+ // The three checks below drop a random value instead of
+ // sending it, so fewer than TEST_SCANCODES codes may be tested.
+ // NOTE(review): presumably these values would be encoded or
+ // decoded as a different protocol variant -- confirm.
+ if (rc_proto == RC_PROTO_NECX &&
+ (((scancode >> 16) ^ ~(scancode >> 8)) & 0xff) == 0)
+ continue;
+
+ if (rc_proto == RC_PROTO_NEC32 &&
+ (((scancode >> 8) ^ ~scancode) & 0xff) == 0)
+ continue;
+
+ if (rc_proto == RC_PROTO_RCMM32 &&
+ (scancode & 0x000c0000) != 0x000c0000 &&
+ scancode & 0x00008000)
+ continue;
+
+ struct lirc_scancode lsc = {
+ .rc_proto = rc_proto,
+ .scancode = scancode
+ };
+
+ printf("Testing scancode:%x\n", scancode);
+
+ // Retry the write when interrupted; any other error is fatal.
+ while (write(wlircfd, &lsc, sizeof(lsc)) < 0) {
+ if (errno == EINTR)
+ continue;
+
+ ksft_exit_fail_msg("failed to send ir: %m\n");
+ }
+
+ struct pollfd pfd = { .fd = rlircfd, .events = POLLIN };
+ struct lirc_scancode lsc2;
+
+ // Wait up to one second for the receiver to become readable; the
+ // poll() result is not checked, the non-blocking read() below
+ // reports an error if nothing arrived.
+ poll(&pfd, 1, 1000);
+
+ bool decoded = true;
+
+ // Retry the read when interrupted; any other error is recorded
+ // as a test error and this scancode is abandoned.
+ while (read(rlircfd, &lsc2, sizeof(lsc2)) < 0) {
+ if (errno == EINTR)
+ continue;
+
+ ksft_test_result_error("no scancode decoded: %m\n");
+ decoded = false;
+ break;
+ }
+
+ if (!decoded)
+ continue;
+
+ if (lsc.rc_proto != lsc2.rc_proto)
+ ksft_test_result_error("decoded protocol is different: %d\n",
+ lsc2.rc_proto);
+
+ else if (lsc.scancode != lsc2.scancode)
+ ksft_test_result_error("decoded scancode is different: %llx\n",
+ lsc2.scancode);
+ else
+ ksft_inc_pass_cnt();
+ }
+
+ printf("OK\n");
+ }
+
+ close(rlircfd);
+ close(wlircfd);
+ close(protocolfd);
+
+ if (ksft_get_fail_cnt() > 0)
+ ksft_exit_fail();
+ else
+ ksft_exit_pass();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/ir/ir_loopback.sh b/tools/testing/selftests/ir/ir_loopback.sh
new file mode 100755
index 000000000..b90dc9939
--- /dev/null
+++ b/tools/testing/selftests/ir/ir_loopback.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+if [ "$UID" != 0 ]; then
+	echo "Please run ir_loopback test as root [SKIP]"
+	exit $ksft_skip
+fi
+
+# Dry run: skip when the module is not available at all.
+if ! /sbin/modprobe -q -n rc-loopback; then
+	echo "ir_loopback: module rc-loopback is not found [SKIP]"
+	exit $ksft_skip
+fi
+
+# A bare "exit" after "[ $? -ne 0 ]" would return the status of that test
+# (0), so a failed load has to be reported with an explicit status.
+if ! /sbin/modprobe rc-loopback; then
+	echo "ir_loopback: failed to load module rc-loopback" >&2
+	exit 1
+fi
+
+# Name (rcN) of the first rc device driven by rc-loopback.
+RCDEV=$(grep -l DRV_NAME=rc-loopback /sys/class/rc/rc*/uevent | grep -o 'rc[0-9]\+' | head -n 1)
+if [ -z "$RCDEV" ]; then
+	echo "ir_loopback: no rc-loopback device found in /sys/class/rc" >&2
+	exit 1
+fi
+
+# The same device is used for sending and for receiving.
+./ir_loopback "$RCDEV" "$RCDEV"
+exit
diff --git a/tools/testing/selftests/kcmp/.gitignore b/tools/testing/selftests/kcmp/.gitignore
new file mode 100644
index 000000000..38ccdfe80
--- /dev/null
+++ b/tools/testing/selftests/kcmp/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+kcmp_test
+kcmp-test-file
diff --git a/tools/testing/selftests/kcmp/Makefile b/tools/testing/selftests/kcmp/Makefile
new file mode 100644
index 000000000..b4d39f6b5
--- /dev/null
+++ b/tools/testing/selftests/kcmp/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := kcmp_test
+
+EXTRA_CLEAN := $(OUTPUT)/kcmp-test-file
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/kcmp/kcmp_test.c b/tools/testing/selftests/kcmp/kcmp_test.c
new file mode 100644
index 000000000..6ea7b9f37
--- /dev/null
+++ b/tools/testing/selftests/kcmp/kcmp_test.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <linux/unistd.h>
+#include <linux/kcmp.h>
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/epoll.h>
+
+#include "../kselftest.h"
+
+static long sys_kcmp(int pid1, int pid2, int type, unsigned long fd1, unsigned long fd2)
+{
+ return syscall(__NR_kcmp, pid1, pid2, type, fd1, fd2);
+}
+
+static const unsigned int duped_num = 64;
+
+/*
+ * Exercise the kcmp() syscall.
+ *
+ * The parent creates a regular file, a pipe and an epoll instance that
+ * watches the pipe's read end and a dup2()'ed copy of its write end, then
+ * forks.  The child prints a sample of every comparison type and runs
+ * three checks that are expected to return 0; it exits with failure if any
+ * of them failed.  The parent reports the child's result as its own.
+ */
+int main(int argc, char **argv)
+{
+	const char kpath[] = "kcmp-test-file";
+	struct kcmp_epoll_slot epoll_slot;
+	struct epoll_event ev;
+	int pid1, pid2;
+	int pipefd[2];
+	int fd1, fd2;
+	int epollfd;
+	int status;
+	int fddup;
+
+	fd1 = open(kpath, O_RDWR | O_CREAT | O_TRUNC, 0644);
+	pid1 = getpid();
+
+	if (fd1 < 0) {
+		perror("Can't create file");
+		ksft_exit_fail();
+	}
+
+	if (pipe(pipefd)) {
+		perror("Can't create pipe");
+		ksft_exit_fail();
+	}
+
+	epollfd = epoll_create1(0);
+	if (epollfd < 0) {
+		perror("epoll_create1 failed");
+		ksft_exit_fail();
+	}
+
+	memset(&ev, 0xff, sizeof(ev));
+	ev.events = EPOLLIN | EPOLLOUT;
+
+	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipefd[0], &ev)) {
+		perror("epoll_ctl failed");
+		ksft_exit_fail();
+	}
+
+	/* Register the pipe's write end under a second, fixed fd number. */
+	fddup = dup2(pipefd[1], duped_num);
+	if (fddup < 0) {
+		perror("dup2 failed");
+		ksft_exit_fail();
+	}
+
+	if (epoll_ctl(epollfd, EPOLL_CTL_ADD, fddup, &ev)) {
+		perror("epoll_ctl failed");
+		ksft_exit_fail();
+	}
+	close(fddup);
+
+	pid2 = fork();
+	if (pid2 < 0) {
+		perror("fork failed");
+		ksft_exit_fail();
+	}
+
+	if (!pid2) {
+		int ret;
+
+		/* In the child, pid2 names the child itself. */
+		pid2 = getpid();
+
+		fd2 = open(kpath, O_RDWR, 0644);
+		if (fd2 < 0) {
+			perror("Can't open file");
+			ksft_exit_fail();
+		}
+
+		/* An example of output and arguments */
+		printf("pid1: %6d pid2: %6d FD: %2ld FILES: %2ld VM: %2ld "
+		       "FS: %2ld SIGHAND: %2ld IO: %2ld SYSVSEM: %2ld "
+		       "INV: %2ld\n",
+		       pid1, pid2,
+		       sys_kcmp(pid1, pid2, KCMP_FILE,		fd1, fd2),
+		       sys_kcmp(pid1, pid2, KCMP_FILES,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_VM,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_FS,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SIGHAND,	0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_IO,		0, 0),
+		       sys_kcmp(pid1, pid2, KCMP_SYSVSEM,	0, 0),
+
+			/* This one should fail */
+		       sys_kcmp(pid1, pid2, KCMP_TYPES + 1,	0, 0));
+
+		/* This one should return same fd */
+		ret = sys_kcmp(pid1, pid2, KCMP_FILE, fd1, fd1);
+		if (ret) {
+			printf("FAIL: 0 expected but %d returned (%s)\n",
+				ret, strerror(errno));
+			ksft_inc_fail_cnt();
+		} else {
+			printf("PASS: 0 returned as expected\n");
+			ksft_inc_pass_cnt();
+		}
+
+		/* Compare with self */
+		ret = sys_kcmp(pid1, pid1, KCMP_VM, 0, 0);
+		if (ret) {
+			printf("FAIL: 0 expected but %d returned (%s)\n",
+				ret, strerror(errno));
+			ksft_inc_fail_cnt();
+		} else {
+			printf("PASS: 0 returned as expected\n");
+			ksft_inc_pass_cnt();
+		}
+
+		/* Compare epoll target */
+		epoll_slot = (struct kcmp_epoll_slot) {
+			.efd	= epollfd,
+			.tfd	= duped_num,
+			.toff	= 0,
+		};
+		ret = sys_kcmp(pid1, pid1, KCMP_EPOLL_TFD, pipefd[1],
+			       (unsigned long)(void *)&epoll_slot);
+		if (ret) {
+			printf("FAIL: 0 expected but %d returned (%s)\n",
+				ret, strerror(errno));
+			ksft_inc_fail_cnt();
+		} else {
+			printf("PASS: 0 returned as expected\n");
+			ksft_inc_pass_cnt();
+		}
+
+		ksft_print_cnts();
+
+		/*
+		 * Decide on the failure counter: "ret" only holds the result
+		 * of the last check and would hide earlier failures.
+		 */
+		if (ksft_get_fail_cnt() > 0)
+			ksft_exit_fail();
+		else
+			ksft_exit_pass();
+	}
+
+	/* P_ALL is a waitid() id type, not a waitpid() option; pass 0. */
+	if (waitpid(pid2, &status, 0) < 0) {
+		perror("waitpid failed");
+		ksft_exit_fail();
+	}
+
+	/* A failing or abnormally terminated child fails the whole test. */
+	if (!WIFEXITED(status) || WEXITSTATUS(status))
+		ksft_exit_fail();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/kexec/Makefile b/tools/testing/selftests/kexec/Makefile
new file mode 100644
index 000000000..aa91d2063
--- /dev/null
+++ b/tools/testing/selftests/kexec/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for kexec tests
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+ifeq ($(ARCH),x86)
+TEST_PROGS := test_kexec_load.sh test_kexec_file_load.sh
+TEST_FILES := kexec_common_lib.sh
+
+include ../lib.mk
+
+endif
diff --git a/tools/testing/selftests/kexec/config b/tools/testing/selftests/kexec/config
new file mode 100644
index 000000000..8962e862b
--- /dev/null
+++ b/tools/testing/selftests/kexec/config
@@ -0,0 +1,3 @@
+CONFIG_IMA_APPRAISE=y
+CONFIG_IMA_ARCH_POLICY=y
+CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/kexec/kexec_common_lib.sh b/tools/testing/selftests/kexec/kexec_common_lib.sh
new file mode 100755
index 000000000..43017cfe8
--- /dev/null
+++ b/tools/testing/selftests/kexec/kexec_common_lib.sh
@@ -0,0 +1,220 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Kselftest framework defines: ksft_pass=0, ksft_fail=1, ksft_skip=4
+
+VERBOSE="${VERBOSE:-1}"
+IKCONFIG="/tmp/config-`uname -r`"
+KERNEL_IMAGE="/boot/vmlinuz-`uname -r`"
+SECURITYFS=$(grep "securityfs" /proc/mounts | awk '{print $2}')
+
+log_info()
+{
+ [ $VERBOSE -ne 0 ] && echo "[INFO] $1"
+}
+
+# The kselftest framework requires an exit status of 0 for PASS.
+log_pass()
+{
+ [ $VERBOSE -ne 0 ] && echo "$1 [PASS]"
+ exit 0
+}
+
+# The kselftest framework requires an exit status of 1 for FAIL.
+log_fail()
+{
+ [ $VERBOSE -ne 0 ] && echo "$1 [FAIL]"
+ exit 1
+}
+
+# The kselftest framework requires an exit status of 4 for SKIP.
+log_skip()
+{
+ [ $VERBOSE -ne 0 ] && echo "$1"
+ exit 4
+}
+
+# Check efivar SecureBoot-$(the UUID) and SetupMode-$(the UUID).
+# (Based on kdump-lib.sh)
+get_efivarfs_secureboot_mode()
+{
+ local efivarfs="/sys/firmware/efi/efivars"
+ local secure_boot_file=""
+ local setup_mode_file=""
+ local secureboot_mode=0
+ local setup_mode=0
+
+ # Make sure that efivar_fs is mounted in the normal location
+ if ! grep -q "^\S\+ $efivarfs efivarfs" /proc/mounts; then
+ log_info "efivars is not mounted on $efivarfs"
+ return 0;
+ fi
+ secure_boot_file=$(find "$efivarfs" -name SecureBoot-* 2>/dev/null)
+ setup_mode_file=$(find "$efivarfs" -name SetupMode-* 2>/dev/null)
+ if [ -f "$secure_boot_file" ] && [ -f "$setup_mode_file" ]; then
+ secureboot_mode=$(hexdump -v -e '/1 "%d\ "' \
+ "$secure_boot_file"|cut -d' ' -f 5)
+ setup_mode=$(hexdump -v -e '/1 "%d\ "' \
+ "$setup_mode_file"|cut -d' ' -f 5)
+
+ if [ $secureboot_mode -eq 1 ] && [ $setup_mode -eq 0 ]; then
+ log_info "secure boot mode enabled (CONFIG_EFIVAR_FS)"
+ return 1;
+ fi
+ fi
+ return 0;
+}
+
+get_efi_var_secureboot_mode()
+{
+ local efi_vars
+ local secure_boot_file
+ local setup_mode_file
+ local secureboot_mode
+ local setup_mode
+
+ if [ ! -d "$efi_vars" ]; then
+ log_skip "efi_vars is not enabled\n"
+ fi
+ secure_boot_file=$(find "$efi_vars" -name SecureBoot-* 2>/dev/null)
+ setup_mode_file=$(find "$efi_vars" -name SetupMode-* 2>/dev/null)
+ if [ -f "$secure_boot_file/data" ] && \
+ [ -f "$setup_mode_file/data" ]; then
+ secureboot_mode=`od -An -t u1 "$secure_boot_file/data"`
+ setup_mode=`od -An -t u1 "$setup_mode_file/data"`
+
+ if [ $secureboot_mode -eq 1 ] && [ $setup_mode -eq 0 ]; then
+ log_info "secure boot mode enabled (CONFIG_EFI_VARS)"
+ return 1;
+ fi
+ fi
+ return 0;
+}
+
+# Check efivar SecureBoot-$(the UUID) and SetupMode-$(the UUID).
+# The secure boot mode can be accessed either as the last integer
+# of "od -An -t u1 /sys/firmware/efi/efivars/SecureBoot-*" or from
+# "od -An -t u1 /sys/firmware/efi/vars/SecureBoot-*/data". The efi
+# SetupMode can be similarly accessed.
+# Return 1 for SecureBoot mode enabled and SetupMode mode disabled.
+get_secureboot_mode()
+{
+ local secureboot_mode=0
+
+ get_efivarfs_secureboot_mode
+ secureboot_mode=$?
+
+ # fallback to using the efi_var files
+ if [ $secureboot_mode -eq 0 ]; then
+ get_efi_var_secureboot_mode
+ secureboot_mode=$?
+ fi
+
+ if [ $secureboot_mode -eq 0 ]; then
+ log_info "secure boot mode not enabled"
+ fi
+ return $secureboot_mode;
+}
+
+# Skip the test unless the real user id is 0.
+require_root_privileges()
+{
+	[ $(id -ru) -eq 0 ] || log_skip "requires root privileges"
+}
+
+# Look for config option in Kconfig file.
+#   $1 - extended regular expression to look for, e.g. "CONFIG_FOO=y"
+#   $2 - message passed to log_info when the option is found
+# Return 1 for found and 0 for not found.
+kconfig_enabled()
+{
+	local config="$1"
+	local msg="$2"
+
+	# Quote both operands so that neither is word-split or globbed, and
+	# use "--" so the pattern can never be taken for an option.
+	if grep -E -q -- "$config" "$IKCONFIG"; then
+		log_info "$msg"
+		return 1
+	fi
+	return 0
+}
+
+# Attempt to get the kernel config first via proc, and then by
+# extracting it from the kernel image or the configs.ko using
+# scripts/extract-ikconfig.
+# Return 1 for found.
+get_kconfig()
+{
+ local proc_config="/proc/config.gz"
+ local module_dir="/lib/modules/`uname -r`"
+ local configs_module="$module_dir/kernel/kernel/configs.ko"
+
+ if [ ! -f $proc_config ]; then
+ modprobe configs > /dev/null 2>&1
+ fi
+ if [ -f $proc_config ]; then
+ cat $proc_config | gunzip > $IKCONFIG 2>/dev/null
+ if [ $? -eq 0 ]; then
+ return 1
+ fi
+ fi
+
+ local extract_ikconfig="$module_dir/source/scripts/extract-ikconfig"
+ if [ ! -f $extract_ikconfig ]; then
+ log_skip "extract-ikconfig not found"
+ fi
+
+ $extract_ikconfig $KERNEL_IMAGE > $IKCONFIG 2>/dev/null
+ if [ $? -eq 1 ]; then
+ if [ ! -f $configs_module ]; then
+ log_skip "CONFIG_IKCONFIG not enabled"
+ fi
+ $extract_ikconfig $configs_module > $IKCONFIG
+ if [ $? -eq 1 ]; then
+ log_skip "CONFIG_IKCONFIG not enabled"
+ fi
+ fi
+ return 1
+}
+
+# Make sure that securityfs is mounted.  When SECURITYFS is empty, try to
+# mount securityfs on /sys/kernel/security; fail the test if the resulting
+# path is not a directory.
+mount_securityfs()
+{
+	# Quoted: with an empty value an unquoted "[ -z $SECURITYFS ]"
+	# degenerates into the one-argument test "[ -z ]", which is true
+	# only by accident.
+	if [ -z "$SECURITYFS" ]; then
+		SECURITYFS=/sys/kernel/security
+		mount -t securityfs security "$SECURITYFS"
+	fi
+
+	if [ ! -d "$SECURITYFS" ]; then
+		log_fail "$SECURITYFS :securityfs is not mounted"
+	fi
+}
+
+# The policy rule format is an "action" followed by key-value pairs.  This
+# function supports up to two key-value pairs, in any order.
+# For example: action func=<keyword> [appraise_type=<type>]
+#   $1 - action the rule must start with
+#   $2 - first key-value pair
+#   $3 - optional second key-value pair
+# Return 1 for found and 0 for not found.
+check_ima_policy()
+{
+	local action="$1"
+	local keypair1="$2"
+	local keypair2="$3"
+	local ret=0
+
+	mount_securityfs
+
+	local ima_policy="$SECURITYFS/ima/policy"
+	if [ ! -e "$ima_policy" ]; then
+		log_fail "$ima_policy not found"
+	fi
+
+	# Quoted: an unquoted "[ -n $keypair2 ]" is always true when the
+	# variable is empty, because it collapses to the one-argument test
+	# "[ -n ]".
+	if [ -n "$keypair2" ]; then
+		grep -e "^$action.*$keypair1" "$ima_policy" | \
+			grep -q -e "$keypair2"
+	else
+		grep -q -e "^$action.*$keypair1" "$ima_policy"
+	fi
+
+	# invert "grep -q" result, returning 1 for found.
+	[ $? -eq 0 ] && ret=1
+	return $ret
+}
diff --git a/tools/testing/selftests/kexec/test_kexec_file_load.sh b/tools/testing/selftests/kexec/test_kexec_file_load.sh
new file mode 100755
index 000000000..2ff600388
--- /dev/null
+++ b/tools/testing/selftests/kexec/test_kexec_file_load.sh
@@ -0,0 +1,238 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Loading a kernel image via the kexec_file_load syscall can verify either
+# the IMA signature stored in the security.ima xattr or the PE signature,
+# both signatures depending on the IMA policy, or none.
+#
+# To determine whether the kernel image is signed, this test depends
+# on pesign and getfattr. This test also requires the kernel to be
+# built with CONFIG_IKCONFIG enabled and either CONFIG_IKCONFIG_PROC
+# enabled or access to the extract-ikconfig script.
+
+TEST="KEXEC_FILE_LOAD"
+. ./kexec_common_lib.sh
+
+trap "{ rm -f $IKCONFIG ; }" EXIT
+
+# Some of the IMA builtin policies may require the kexec kernel image to
+# be signed, but these policy rules may be replaced with a custom
+# policy. Only CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS persists after
+# loading a custom policy. Check if it is enabled, before reading the
+# IMA runtime sysfs policy file.
+# Return 1 for IMA signature required and 0 for not required.
+is_ima_sig_required()
+{
+ local ret=0
+
+ kconfig_enabled "CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS=y" \
+ "IMA kernel image signature required"
+ if [ $? -eq 1 ]; then
+ log_info "IMA signature required"
+ return 1
+ fi
+
+ # The architecture specific or a custom policy may require the
+ # kexec kernel image be signed. Policy rules are walked
+ # sequentially. As a result, a policy rule may be defined, but
+ # might not necessarily be used. This test assumes if a policy
+ # rule is specified, that is the intent.
+
+ # First check for appended signature (modsig), then xattr
+ if [ $ima_read_policy -eq 1 ]; then
+ check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \
+ "appraise_type=imasig|modsig"
+ ret=$?
+ if [ $ret -eq 1 ]; then
+ log_info "IMA or appended(modsig) signature required"
+ else
+ check_ima_policy "appraise" "func=KEXEC_KERNEL_CHECK" \
+ "appraise_type=imasig"
+ ret=$?
+ [ $ret -eq 1 ] && log_info "IMA signature required";
+ fi
+ fi
+ return $ret
+}
+
+# The kexec_file_load_test() is complicated enough, require pesign.
+# Return 1 for PE signature found and 0 for not found.
+check_for_pesig()
+{
+ which pesign > /dev/null 2>&1 || log_skip "pesign not found"
+
+ pesign -i $KERNEL_IMAGE --show-signature | grep -q "No signatures"
+ local ret=$?
+ if [ $ret -eq 1 ]; then
+ log_info "kexec kernel image PE signed"
+ else
+ log_info "kexec kernel image not PE signed"
+ fi
+ return $ret
+}
+
+# The kexec_file_load_test() is complicated enough, require getfattr.
+# Return 1 for IMA signature found and 0 for not found.
+check_for_imasig()
+{
+ local ret=0
+
+ which getfattr > /dev/null 2>&1
+ if [ $? -eq 1 ]; then
+ log_skip "getfattr not found"
+ fi
+
+ line=$(getfattr -n security.ima -e hex --absolute-names $KERNEL_IMAGE 2>&1)
+ echo $line | grep -q "security.ima=0x03"
+ if [ $? -eq 0 ]; then
+ ret=1
+ log_info "kexec kernel image IMA signed"
+ else
+ log_info "kexec kernel image not IMA signed"
+ fi
+ return $ret
+}
+
+# Compare the tail of the kernel image with the module signature marker.
+# Return 1 for appended signature (modsig) found and 0 for not found.
+check_for_modsig()
+{
+	local module_sig_string="~Module signature appended~"
+	# Read one byte more than the marker; command substitution strips a
+	# trailing newline (presumably the marker is followed by one - TODO
+	# confirm).
+	local sig="$(tail --bytes $((${#module_sig_string} + 1)) "$KERNEL_IMAGE")"
+	local ret=0
+
+	# "=" is the portable string comparison; "==" is a bashism that a
+	# strict /bin/sh such as dash rejects, so the marker was never found.
+	if [ "$sig" = "$module_sig_string" ]; then
+		ret=1
+		log_info "kexec kernel image modsig signed"
+	else
+		log_info "kexec kernel image not modsig signed"
+	fi
+	return $ret
+}
+
+kexec_file_load_test()
+{
+ local succeed_msg="kexec_file_load succeeded"
+ local failed_msg="kexec_file_load failed"
+ local key_msg="try enabling the CONFIG_INTEGRITY_PLATFORM_KEYRING"
+
+ line=$(kexec --load --kexec-file-syscall $KERNEL_IMAGE 2>&1)
+
+ if [ $? -eq 0 ]; then
+ kexec --unload --kexec-file-syscall
+
+ # In secureboot mode with an architecture specific
+ # policy, make sure either an IMA or PE signature exists.
+ if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ] && \
+ [ $ima_signed -eq 0 ] && [ $pe_signed -eq 0 ] \
+ && [ $ima_modsig -eq 0 ]; then
+ log_fail "$succeed_msg (missing sig)"
+ fi
+
+ if [ $kexec_sig_required -eq 1 -o $pe_sig_required -eq 1 ] \
+ && [ $pe_signed -eq 0 ]; then
+ log_fail "$succeed_msg (missing PE sig)"
+ fi
+
+ if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ] \
+ && [ $ima_modsig -eq 0 ]; then
+ log_fail "$succeed_msg (missing IMA sig)"
+ fi
+
+ if [ $pe_sig_required -eq 0 ] && [ $ima_appraise -eq 1 ] \
+ && [ $ima_sig_required -eq 0 ] && [ $ima_signed -eq 0 ] \
+ && [ $ima_read_policy -eq 0 ]; then
+ log_fail "$succeed_msg (possibly missing IMA sig)"
+ fi
+
+ if [ $pe_sig_required -eq 0 ] && [ $ima_appraise -eq 0 ]; then
+ log_info "No signature verification required"
+ elif [ $pe_sig_required -eq 0 ] && [ $ima_appraise -eq 1 ] \
+ && [ $ima_sig_required -eq 0 ] && [ $ima_signed -eq 0 ] \
+ && [ $ima_read_policy -eq 1 ]; then
+ log_info "No signature verification required"
+ fi
+
+ log_pass "$succeed_msg"
+ fi
+
+ # Check the reason for the kexec_file_load failure
+ echo $line | grep -q "Required key not available"
+ if [ $? -eq 0 ]; then
+ if [ $platform_keyring -eq 0 ]; then
+ log_pass "$failed_msg (-ENOKEY), $key_msg"
+ else
+ log_pass "$failed_msg (-ENOKEY)"
+ fi
+ fi
+
+ if [ $kexec_sig_required -eq 1 -o $pe_sig_required -eq 1 ] \
+ && [ $pe_signed -eq 0 ]; then
+ log_pass "$failed_msg (missing PE sig)"
+ fi
+
+ if [ $ima_sig_required -eq 1 ] && [ $ima_signed -eq 0 ]; then
+ log_pass "$failed_msg (missing IMA sig)"
+ fi
+
+ if [ $pe_sig_required -eq 0 ] && [ $ima_appraise -eq 1 ] \
+ && [ $ima_sig_required -eq 0 ] && [ $ima_read_policy -eq 0 ] \
+ && [ $ima_signed -eq 0 ]; then
+ log_pass "$failed_msg (possibly missing IMA sig)"
+ fi
+
+ log_pass "$failed_msg"
+ return 0
+}
+
+# kexec requires root privileges
+require_root_privileges
+
+# get the kernel config
+get_kconfig
+
+kconfig_enabled "CONFIG_KEXEC_FILE=y" "kexec_file_load is enabled"
+if [ $? -eq 0 ]; then
+ log_skip "kexec_file_load is not enabled"
+fi
+
+# Determine which kernel config options are enabled
+kconfig_enabled "CONFIG_IMA_APPRAISE=y" "IMA enabled"
+ima_appraise=$?
+
+kconfig_enabled "CONFIG_IMA_ARCH_POLICY=y" \
+ "architecture specific policy enabled"
+arch_policy=$?
+
+kconfig_enabled "CONFIG_INTEGRITY_PLATFORM_KEYRING=y" \
+ "platform keyring enabled"
+platform_keyring=$?
+
+kconfig_enabled "CONFIG_IMA_READ_POLICY=y" "reading IMA policy permitted"
+ima_read_policy=$?
+
+kconfig_enabled "CONFIG_KEXEC_SIG_FORCE=y" \
+ "kexec signed kernel image required"
+kexec_sig_required=$?
+
+kconfig_enabled "CONFIG_KEXEC_BZIMAGE_VERIFY_SIG=y" \
+ "PE signed kernel image required"
+pe_sig_required=$?
+
+is_ima_sig_required
+ima_sig_required=$?
+
+get_secureboot_mode
+secureboot=$?
+
+# Are there pe and ima signatures
+check_for_pesig
+pe_signed=$?
+
+check_for_imasig
+ima_signed=$?
+
+check_for_modsig
+ima_modsig=$?
+
+# Test loading the kernel image via kexec_file_load syscall
+kexec_file_load_test
diff --git a/tools/testing/selftests/kexec/test_kexec_load.sh b/tools/testing/selftests/kexec/test_kexec_load.sh
new file mode 100755
index 000000000..49c6aa929
--- /dev/null
+++ b/tools/testing/selftests/kexec/test_kexec_load.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Prevent loading a kernel image via the kexec_load syscall when
+# signatures are required. (Dependent on CONFIG_IMA_ARCH_POLICY.)
+
+TEST="$0"
+. ./kexec_common_lib.sh
+
+# kexec requires root privileges
+require_root_privileges
+
+# get the kernel config
+get_kconfig
+
+kconfig_enabled "CONFIG_KEXEC=y" "kexec_load is enabled"
+if [ $? -eq 0 ]; then
+ log_skip "kexec_load is not enabled"
+fi
+
+kconfig_enabled "CONFIG_IMA_APPRAISE=y" "IMA enabled"
+ima_appraise=$?
+
+kconfig_enabled "CONFIG_IMA_ARCH_POLICY=y" \
+ "IMA architecture specific policy enabled"
+arch_policy=$?
+
+get_secureboot_mode
+secureboot=$?
+
+# kexec_load should fail in secure boot mode and CONFIG_IMA_ARCH_POLICY enabled
+kexec --load $KERNEL_IMAGE > /dev/null 2>&1
+if [ $? -eq 0 ]; then
+ kexec --unload
+ if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ]; then
+ log_fail "kexec_load succeeded"
+ elif [ $ima_appraise -eq 0 -o $arch_policy -eq 0 ]; then
+ log_info "Either IMA or the IMA arch policy is not enabled"
+ fi
+ log_pass "kexec_load succeeded"
+else
+ if [ $secureboot -eq 1 ] && [ $arch_policy -eq 1 ] ; then
+ log_pass "kexec_load failed"
+ else
+ log_fail "kexec_load failed"
+ fi
+fi
diff --git a/tools/testing/selftests/kmod/Makefile b/tools/testing/selftests/kmod/Makefile
new file mode 100644
index 000000000..5b3e746a0
--- /dev/null
+++ b/tools/testing/selftests/kmod/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for kmod loading selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := kmod.sh
+
+include ../lib.mk
+
+# Nothing to clean up.
+clean:
diff --git a/tools/testing/selftests/kmod/config b/tools/testing/selftests/kmod/config
new file mode 100644
index 000000000..259f4fd6b
--- /dev/null
+++ b/tools/testing/selftests/kmod/config
@@ -0,0 +1,7 @@
+CONFIG_TEST_KMOD=m
+CONFIG_TEST_LKM=m
+CONFIG_XFS_FS=m
+
+# For the module parameter force_init_test is used
+CONFIG_TUN=m
+CONFIG_BTRFS_FS=m
diff --git a/tools/testing/selftests/kmod/kmod.sh b/tools/testing/selftests/kmod/kmod.sh
new file mode 100755
index 000000000..afd42387e
--- /dev/null
+++ b/tools/testing/selftests/kmod/kmod.sh
@@ -0,0 +1,689 @@
+#!/bin/bash
+#
+# Copyright (C) 2017 Luis R. Rodriguez <mcgrof@kernel.org>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or at your option any
+# later version; or, when distributed separately from the Linux kernel or
+# when incorporated into other software packages, subject to the following
+# license:
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of copyleft-next (version 0.3.1 or later) as published
+# at http://copyleft-next.org/.
+
+# This is a stress test script for kmod, the kernel module loader. It uses
+# test_kmod which exposes a series of knobs for the API for us so we can
+# tweak each test in userspace rather than in kernelspace.
+#
+# The way kmod works is it uses the kernel's usermode helper API to eventually
+# call /sbin/modprobe. It has a limit of the number of concurrent calls
+# possible. The kernel interface to load modules is request_module(), however
+# mount uses get_fs_type(). Both behave slightly differently, but the
+# differences are important enough to test each call separately. For this
+# reason test_kmod starts by providing tests for both calls.
+#
+# The test driver test_kmod assumes a series of defaults which you can
+# override by exporting to your environment prior running this script.
+# For instance this script assumes you do not have xfs loaded upon boot.
+# If this is false, export DEFAULT_KMOD_FS="ext4" prior to running this
+# script if the filesystem module you don't have loaded upon bootup
+# is ext4 instead. Refer to allow_user_defaults() for a list of user
+# override variables possible.
+#
+# You'll want at least 4 GiB of RAM to expect to run these tests
+# without running out of memory on them. For other requirements refer
+# to test_reqs()
+
+set -e
+
+TEST_NAME="kmod"
+TEST_DRIVER="test_${TEST_NAME}"
+TEST_DIR=$(dirname $0)
+
+# This represents
+#
+# TEST_ID:TEST_COUNT:ENABLED
+#
+# TEST_ID: is the test id number
+# TEST_COUNT: number of times we should run the test
+# ENABLED: 1 if enabled, 0 otherwise
+#
+# Once these are enabled please leave them as-is. Write your own test,
+# we have tons of space.
+ALL_TESTS="0001:3:1"
+ALL_TESTS="$ALL_TESTS 0002:3:1"
+ALL_TESTS="$ALL_TESTS 0003:1:1"
+ALL_TESTS="$ALL_TESTS 0004:1:1"
+ALL_TESTS="$ALL_TESTS 0005:10:1"
+ALL_TESTS="$ALL_TESTS 0006:10:1"
+ALL_TESTS="$ALL_TESTS 0007:5:1"
+ALL_TESTS="$ALL_TESTS 0008:150:1"
+ALL_TESTS="$ALL_TESTS 0009:150:1"
+ALL_TESTS="$ALL_TESTS 0010:1:1"
+ALL_TESTS="$ALL_TESTS 0011:1:1"
+ALL_TESTS="$ALL_TESTS 0012:1:1"
+ALL_TESTS="$ALL_TESTS 0013:1:1"
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+test_modprobe()
+{
+ if [ ! -d $DIR ]; then
+ echo "$0: $DIR not present" >&2
+ echo "You must have the following enabled in your kernel:" >&2
+ cat $TEST_DIR/config >&2
+ exit $ksft_skip
+ fi
+}
+
+# Give every user-overridable setting its default value unless the
+# environment already provides a non-empty one, then derive the path of
+# the kmod-limit file from PROC_DIR.
+function allow_user_defaults()
+{
+	# ${VAR:=default} assigns only when VAR is unset or empty.  Unlike an
+	# unquoted "[ -z $VAR ]" it does not break on values with blanks.
+	: "${DEFAULT_KMOD_DRIVER:=test_module}"
+	: "${DEFAULT_KMOD_FS:=xfs}"
+	: "${PROC_DIR:=/proc/sys/kernel/}"
+	: "${MODPROBE_LIMIT:=50}"
+	: "${DIR:=/sys/devices/virtual/misc/${TEST_DRIVER}0/}"
+	: "${DEFAULT_NUM_TESTS:=150}"
+
+	MODPROBE_LIMIT_FILE="${PROC_DIR}/kmod-limit"
+}
+
+# Check the userspace requirements: modprobe and kmod must be installed,
+# kmod must be newer than version 19 and the script must run as root.
+# Exits with the kselftest SKIP code when a requirement is not met.
+test_reqs()
+{
+	if ! command -v modprobe 2> /dev/null > /dev/null; then
+		echo "$0: You need modprobe installed" >&2
+		exit $ksft_skip
+	fi
+
+	if ! command -v kmod 2> /dev/null > /dev/null; then
+		echo "$0: You need kmod installed" >&2
+		exit $ksft_skip
+	fi
+
+	# kmod 19 has a bad bug where it returns 0 when modprobe
+	# gets called *even* if the module was not loaded due to
+	# some bad heuristics. The link printed below points at the
+	# details.
+	#
+	# A workaround is possible in-kernel but it's rather
+	# complex.
+	KMOD_VERSION=$(kmod --version | awk '{print $3}')
+	if [[ $KMOD_VERSION -le 19 ]]; then
+		echo "$0: You need at least kmod 20" >&2
+		echo "kmod <= 19 is buggy, for details see:" >&2
+		echo "https://git.kernel.org/cgit/utils/kernel/kmod/kmod.git/commit/libkmod/libkmod-module.c?id=fd44a98ae2eb5eb32161088954ab21e58e19dfc4" >&2
+		exit $ksft_skip
+	fi
+
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		# $msg was never defined; name the script like the other
+		# messages in this function do.
+		echo "$0: must be run as root" >&2
+		exit $ksft_skip
+	fi
+}
+
+function load_req_mod()
+{
+ trap "test_modprobe" EXIT
+
+ if [ ! -d $DIR ]; then
+ # Alanis: "Oh isn't it ironic?"
+ modprobe $TEST_DRIVER
+ fi
+}
+
+test_finish()
+{
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
+ echo "Test completed"
+}
+
+errno_name_to_val()
+{
+ case "$1" in
+ # kmod calls modprobe and upon of a module not found
+ # modprobe returns just 1... However in the kernel we
+ # *sometimes* see 256...
+ MODULE_NOT_FOUND)
+ echo 256;;
+ SUCCESS)
+ echo 0;;
+ -EPERM)
+ echo -1;;
+ -ENOENT)
+ echo -2;;
+ -EINVAL)
+ echo -22;;
+ -ERR_ANY)
+ echo -123456;;
+ *)
+ echo invalid;;
+ esac
+}
+
+errno_val_to_name()
+ case "$1" in
+ 256)
+ echo MODULE_NOT_FOUND;;
+ 0)
+ echo SUCCESS;;
+ -1)
+ echo -EPERM;;
+ -2)
+ echo -ENOENT;;
+ -22)
+ echo -EINVAL;;
+ -123456)
+ echo -ERR_ANY;;
+ *)
+ echo invalid;;
+ esac
+
+config_set_test_case_driver()
+{
+ if ! echo -n 1 >$DIR/config_test_case; then
+ echo "$0: Unable to set to test case to driver" >&2
+ exit 1
+ fi
+}
+
+config_set_test_case_fs()
+{
+ if ! echo -n 2 >$DIR/config_test_case; then
+ echo "$0: Unable to set to test case to fs" >&2
+ exit 1
+ fi
+}
+
+config_num_threads()
+{
+ if ! echo -n $1 >$DIR/config_num_threads; then
+ echo "$0: Unable to set to number of threads" >&2
+ exit 1
+ fi
+}
+
+config_get_modprobe_limit()
+{
+ if [[ -f ${MODPROBE_LIMIT_FILE} ]] ; then
+ MODPROBE_LIMIT=$(cat $MODPROBE_LIMIT_FILE)
+ fi
+ echo $MODPROBE_LIMIT
+}
+
+# Set the number of threads to the current modprobe limit plus $1.
+config_num_thread_limit_extra()
+{
+	MODPROBE_LIMIT=$(config_get_modprobe_limit)
+	# Arithmetic expansion instead of "let": "let" returns non-zero when
+	# the result is 0, which aborts the script under "set -e".
+	EXTRA_LIMIT=$((MODPROBE_LIMIT + $1))
+	config_num_threads $EXTRA_LIMIT
+}
+
+# For special characters use printf directly,
+# refer to kmod_test_0001
+config_set_driver()
+{
+ if ! echo -n $1 >$DIR/config_test_driver; then
+ echo "$0: Unable to set driver" >&2
+ exit 1
+ fi
+}
+
+# Write the filesystem name $1 to the config_test_fs file under $DIR.
+# Exits with status 1 when the write fails.
+config_set_fs()
+{
+	if ! echo -n "$1" >"$DIR"/config_test_fs; then
+		# The message used to say "driver", copied from
+		# config_set_driver.
+		echo "$0: Unable to set fs" >&2
+		exit 1
+	fi
+}
+
+config_get_driver()
+{
+ cat $DIR/config_test_driver
+}
+
+config_get_test_result()
+{
+ cat $DIR/test_result
+}
+
+config_reset()
+{
+ if ! echo -n "1" >"$DIR"/reset; then
+ echo "$0: reset should have worked" >&2
+ exit 1
+ fi
+}
+
+config_show_config()
+{
+ echo "----------------------------------------------------"
+ cat "$DIR"/config
+ echo "----------------------------------------------------"
+}
+
+config_trigger()
+{
+ if ! echo -n "1" >"$DIR"/trigger_config 2>/dev/null; then
+ echo "$1: FAIL - loading should have worked"
+ config_show_config
+ exit 1
+ fi
+ echo "$1: OK! - loading kmod test"
+}
+
+config_trigger_want_fail()
+{
+ if echo "1" > $DIR/trigger_config 2>/dev/null; then
+ echo "$1: FAIL - test case was expected to fail"
+ config_show_config
+ exit 1
+ fi
+ echo "$1: OK! - kmod test case failed as expected"
+}
+
+config_expect_result()
+{
+ RC=$(config_get_test_result)
+ RC_NAME=$(errno_val_to_name $RC)
+
+ ERRNO_NAME=$2
+ ERRNO=$(errno_name_to_val $ERRNO_NAME)
+
+ if [[ $ERRNO_NAME = "-ERR_ANY" ]]; then
+ if [[ $RC -ge 0 ]]; then
+ echo "$1: FAIL, test expects $ERRNO_NAME - got $RC_NAME ($RC)" >&2
+ config_show_config
+ exit 1
+ fi
+ elif [[ $RC != $ERRNO ]]; then
+ echo "$1: FAIL, test expects $ERRNO_NAME ($ERRNO) - got $RC_NAME ($RC)" >&2
+ config_show_config
+ exit 1
+ fi
+ echo "$1: OK! - Return value: $RC ($RC_NAME), expected $ERRNO_NAME"
+}
+
+kmod_defaults_driver()
+{
+ config_reset
+ modprobe -r $DEFAULT_KMOD_DRIVER
+ config_set_driver $DEFAULT_KMOD_DRIVER
+}
+
+kmod_defaults_fs()
+{
+ config_reset
+ modprobe -r $DEFAULT_KMOD_FS
+ config_set_fs $DEFAULT_KMOD_FS
+ config_set_test_case_fs
+}
+
+kmod_test_0001_driver()
+{
+ NAME='\000'
+
+ kmod_defaults_driver
+ config_num_threads 1
+ printf $NAME >"$DIR"/config_test_driver
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND
+}
+
+kmod_test_0001_fs()
+{
+ NAME='\000'
+
+ kmod_defaults_fs
+ config_num_threads 1
+ printf $NAME >"$DIR"/config_test_fs
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -EINVAL
+}
+
+kmod_test_0001()
+{
+ kmod_test_0001_driver
+ kmod_test_0001_fs
+}
+
+kmod_test_0002_driver()
+{
+ NAME="nope-$DEFAULT_KMOD_DRIVER"
+
+ kmod_defaults_driver
+ config_set_driver $NAME
+ config_num_threads 1
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} MODULE_NOT_FOUND
+}
+
+kmod_test_0002_fs()
+{
+ NAME="nope-$DEFAULT_KMOD_FS"
+
+ kmod_defaults_fs
+ config_set_fs $NAME
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -EINVAL
+}
+
+kmod_test_0002()
+{
+ kmod_test_0002_driver
+ kmod_test_0002_fs
+}
+
+kmod_test_0003()
+{
+ kmod_defaults_fs
+ config_num_threads 1
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+kmod_test_0004()
+{
+ kmod_defaults_fs
+ config_num_threads 2
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+kmod_test_0005()
+{
+ kmod_defaults_driver
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+kmod_test_0006()
+{
+ kmod_defaults_fs
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+kmod_test_0007()
+{
+ kmod_test_0005
+ kmod_test_0006
+}
+
+# Driver test with more threads than the modprobe limit: the thread count
+# is the limit plus one sixth of it.  Expects SUCCESS.
+kmod_test_0008()
+{
+	kmod_defaults_driver
+	MODPROBE_LIMIT=$(config_get_modprobe_limit)
+	# Arithmetic expansion instead of "let": "let" returns non-zero when
+	# the result is 0 (limit below 6), which aborts under "set -e".
+	EXTRA=$((MODPROBE_LIMIT / 6))
+	config_num_thread_limit_extra $EXTRA
+	config_trigger ${FUNCNAME[0]}
+	config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+# Filesystem test with more threads than the modprobe limit: the thread
+# count is the limit plus one quarter of it.  Expects SUCCESS.
+kmod_test_0009()
+{
+	kmod_defaults_fs
+	MODPROBE_LIMIT=$(config_get_modprobe_limit)
+	# Arithmetic expansion instead of "let": "let" returns non-zero when
+	# the result is 0 (limit below 4), which aborts under "set -e".
+	EXTRA=$((MODPROBE_LIMIT / 4))
+	config_num_thread_limit_extra $EXTRA
+	config_trigger ${FUNCNAME[0]}
+	config_expect_result ${FUNCNAME[0]} SUCCESS
+}
+
+kmod_test_0010()
+{
+ kmod_defaults_driver
+ config_num_threads 1
+ echo "/KMOD_TEST_NONEXISTENT" > /proc/sys/kernel/modprobe
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -ENOENT
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
+}
+
+kmod_test_0011()
+{
+ kmod_defaults_driver
+ config_num_threads 1
+ # This causes the kernel to not even try executing modprobe. The error
+ # code is still -ENOENT like when modprobe doesn't exist, so we can't
+ # easily test for the exact difference. But this still is a useful test
+ # since there was a bug where request_module() returned 0 in this case.
+ echo > /proc/sys/kernel/modprobe
+ config_trigger ${FUNCNAME[0]}
+ config_expect_result ${FUNCNAME[0]} -ENOENT
+ echo "$MODPROBE" > /proc/sys/kernel/modprobe
+}
+
+kmod_check_visibility()
+{
+ local name="$1"
+ local cmd="$2"
+
+ modprobe $DEFAULT_KMOD_DRIVER
+
+ local priv=$(eval $cmd)
+ local unpriv=$(capsh --drop=CAP_SYSLOG -- -c "$cmd")
+
+ if [ "$priv" = "$unpriv" ] || \
+ [ "${priv:0:3}" = "0x0" ] || \
+ [ "${unpriv:0:3}" != "0x0" ] ; then
+ echo "${FUNCNAME[0]}: FAIL, $name visible to unpriv: '$priv' vs '$unpriv'" >&2
+ exit 1
+ else
+ echo "${FUNCNAME[0]}: OK!"
+ fi
+}
+
+kmod_test_0012()
+{
+ kmod_check_visibility /proc/modules \
+ "grep '^${DEFAULT_KMOD_DRIVER}\b' /proc/modules | awk '{print \$NF}'"
+}
+
+kmod_test_0013()
+{
+ kmod_check_visibility '/sys/module/*/sections/*' \
+ "cat /sys/module/${DEFAULT_KMOD_DRIVER}/sections/.*text | head -n1"
+}
+
+list_tests()
+{
+ echo "Test ID list:"
+ echo
+ echo "TEST_ID x NUM_TEST"
+ echo "TEST_ID: Test ID"
+ echo "NUM_TESTS: Number of recommended times to run the test"
+ echo
+ echo "0001 x $(get_test_count 0001) - Simple test - 1 thread for empty string"
+ echo "0002 x $(get_test_count 0002) - Simple test - 1 thread for modules/filesystems that do not exist"
+ echo "0003 x $(get_test_count 0003) - Simple test - 1 thread for get_fs_type() only"
+ echo "0004 x $(get_test_count 0004) - Simple test - 2 threads for get_fs_type() only"
+ echo "0005 x $(get_test_count 0005) - multithreaded tests with default setup - request_module() only"
+ echo "0006 x $(get_test_count 0006) - multithreaded tests with default setup - get_fs_type() only"
+ echo "0007 x $(get_test_count 0007) - multithreaded tests with default setup test request_module() and get_fs_type()"
+ echo "0008 x $(get_test_count 0008) - multithreaded - push kmod_concurrent over max_modprobes for request_module()"
+ echo "0009 x $(get_test_count 0009) - multithreaded - push kmod_concurrent over max_modprobes for get_fs_type()"
+ echo "0010 x $(get_test_count 0010) - test nonexistent modprobe path"
+ echo "0011 x $(get_test_count 0011) - test completely disabling module autoloading"
+ echo "0012 x $(get_test_count 0012) - test /proc/modules address visibility under CAP_SYSLOG"
+ echo "0013 x $(get_test_count 0013) - test /sys/module/*/sections/* visibility under CAP_SYSLOG"
+}
+
+# Print the command-line help followed by the test list, then exit with
+# status 1. Also used as the error path for malformed arguments.
+usage()
+{
+	# One word per test in $ALL_TESTS; the highest valid ID is the count.
+	local -a known_tests=($ALL_TESTS)
+	local max_test
+
+	max_test=$(printf "%04d" ${#known_tests[@]})
+	echo "Usage: $0 [ -t <4-number-digit> ] | [ -w <4-number-digit> ] |"
+	echo "		 [ -s <4-number-digit> ] | [ -c <4-number-digit> <test-count> ] |"
+	echo "		 [ all ] [ -h | --help ] [ -l ]"
+	echo ""
+	echo "Valid tests: 0001-$max_test"
+	echo ""
+	echo "    all     Runs all tests (default)"
+	echo "    -t      Run test ID the recommended number of times"
+	echo "    -w      Watch test ID run until it runs into an error"
+	echo "    -s      Run test ID once"
+	echo "    -c      Run test ID x test-count number of times"
+	echo "    -l      List all test ID list"
+	echo " -h|--help  Help"
+	echo
+	echo "If an error ever occurs execution will immediately terminate."
+	echo "If you are adding a new test try using -w <test-ID> first to"
+	echo "make sure the test passes a series of tests."
+	echo
+	echo Example uses:
+	echo
+	echo "${TEST_NAME}.sh		-- executes all tests"
+	echo "${TEST_NAME}.sh -t 0008	-- Executes test ID 0008 the recommended number of times"
+	echo "${TEST_NAME}.sh -w 0008	-- Watch test ID 0008 run until an error occurs"
+	echo "${TEST_NAME}.sh -s 0008	-- Run test ID 0008 once"
+	echo "${TEST_NAME}.sh -c 0008 3	-- Run test ID 0008 three times"
+	echo
+	list_tests
+	exit 1
+}
+
+# Accept $1 only if it is a non-empty string of decimal digits; anything
+# else falls through to usage, which prints the help and exits.
+function test_num()
+{
+	local digits_only='^[0-9]+$'
+
+	[[ $1 =~ $digits_only ]] || usage
+}
+
+function get_test_data()
+{
+ test_num $1
+ local field_num=$(echo $1 | sed 's/^0*//')
+ echo $ALL_TESTS | awk '{print $'$field_num'}'
+}
+
+# Print the middle field of the colon-separated $ALL_TESTS entry for test
+# ID $1, i.e. the text between the first and the last ':'.
+function get_test_count()
+{
+	local entry
+	entry=$(get_test_data $1)
+
+	# Drop everything up to and including the first ':' ...
+	local after_id=${entry#*:*}
+	# ... then drop the last ':' and whatever follows it.
+	echo ${after_id%:*}
+}
+
+# Print the part of the $ALL_TESTS entry for test ID $1 that follows its
+# second ':'.
+function get_test_enabled()
+{
+	local entry
+	entry=$(get_test_data $1)
+
+	echo ${entry#*:*:}
+}
+
+# Walk $ALL_TESTS and run every entry whose enabled field equals 1 the
+# number of times given by its count field.
+function run_all_tests()
+{
+	# Keep the loop variable local: test_case counts with a variable of the
+	# same name, and without this declaration both would overwrite the "i"
+	# of whoever called us (watch_case uses it as its run counter).
+	local i
+
+	for i in $ALL_TESTS ; do
+		TEST_ID=${i%:*:*}
+		ENABLED=$(get_test_enabled $TEST_ID)
+		TEST_COUNT=$(get_test_count $TEST_ID)
+		if [[ $ENABLED -eq "1" ]]; then
+			test_case $TEST_ID $TEST_COUNT
+		fi
+	done
+}
+
+function watch_log()
+{
+ if [ $# -ne 3 ]; then
+ clear
+ fi
+ date
+ echo "Running test: $2 - run #$1"
+}
+
+# watch_case [<test-id>]
+#
+# Run one test (exactly one argument) or the whole suite (any other
+# argument count) in an endless loop, logging the run number each time.
+# The loop itself never terminates; it ends only when something called
+# from it exits the script.
+function watch_case()
+{
+	# Use a dedicated local counter: run_all_tests and test_case both
+	# assign to "i", which used to corrupt the run number shown here.
+	local watch_run=0
+
+	while true; do
+
+		if [ $# -eq 1 ]; then
+			test_num $1
+			watch_log $watch_run ${TEST_NAME}_test_$1
+			${TEST_NAME}_test_$1
+		else
+			watch_log $watch_run all
+			run_all_tests
+		fi
+		watch_run=$((watch_run + 1))
+	done
+}
+
+function test_case()
+{
+ NUM_TESTS=$DEFAULT_NUM_TESTS
+ if [ $# -eq 2 ]; then
+ NUM_TESTS=$2
+ fi
+
+ i=0
+ while [ $i -lt $NUM_TESTS ]; do
+ test_num $1
+ watch_log $i ${TEST_NAME}_test_$1 noclear
+ RUN_TEST=${TEST_NAME}_test_$1
+ $RUN_TEST
+ let i=$i+1
+ done
+}
+
+# Dispatch on the first command-line word. No arguments or "all" run the
+# whole suite; -w, -t, -c and -s select a single test in different modes;
+# -l lists the tests; everything else (including -h/--help) prints usage.
+function parse_args()
+{
+	if [ $# -eq 0 ]; then
+		run_all_tests
+		return
+	fi
+
+	case "$1" in
+	all)
+		run_all_tests
+		;;
+	-w)
+		shift
+		watch_case $@
+		;;
+	-t)
+		shift
+		test_num $1
+		test_case $1 $(get_test_count $1)
+		;;
+	-c)
+		shift
+		test_num $1
+		test_num $2
+		test_case $1 $2
+		;;
+	-s)
+		shift
+		test_case $1 1
+		;;
+	-l)
+		list_tests
+		;;
+	-h | --help | *)
+		usage
+		;;
+	esac
+}
+
+test_reqs
+allow_user_defaults
+load_req_mod
+
+MODPROBE=$(</proc/sys/kernel/modprobe)
+trap "test_finish" EXIT
+
+parse_args $@
+
+exit 0
diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h
new file mode 100644
index 000000000..8d50483fe
--- /dev/null
+++ b/tools/testing/selftests/kselftest.h
@@ -0,0 +1,290 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * kselftest.h: low-level kselftest framework to include from
+ * selftest programs. When possible, please use
+ * kselftest_harness.h instead.
+ *
+ * Copyright (c) 2014 Shuah Khan <shuahkh@osg.samsung.com>
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ *
+ * Using this API consists of first counting how many tests your code
+ * has to run, and then starting up the reporting:
+ *
+ * ksft_print_header();
+ * ksft_set_plan(total_number_of_tests);
+ *
+ * For each test, report any progress, debugging, etc with:
+ *
+ * ksft_print_msg(fmt, ...);
+ *
+ * and finally report the pass/fail/skip/xfail state of the test with one of:
+ *
+ * ksft_test_result(condition, fmt, ...);
+ * ksft_test_result_pass(fmt, ...);
+ * ksft_test_result_fail(fmt, ...);
+ * ksft_test_result_skip(fmt, ...);
+ * ksft_test_result_xfail(fmt, ...);
+ * ksft_test_result_error(fmt, ...);
+ *
+ * When all tests are finished, clean up and exit the program with one of:
+ *
+ * ksft_exit(condition);
+ * ksft_exit_pass();
+ * ksft_exit_fail();
+ *
+ * If the program wants to report details on why the entire program has
+ * failed, it can instead exit with a message (this is usually done when
+ * the program is aborting before finishing all tests):
+ *
+ * ksft_exit_fail_msg(fmt, ...);
+ *
+ */
+#ifndef __KSELFTEST_H
+#define __KSELFTEST_H
+
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+/* define kselftest exit codes */
+#define KSFT_PASS 0
+#define KSFT_FAIL 1
+#define KSFT_XFAIL 2
+#define KSFT_XPASS 3
+#define KSFT_SKIP 4
+
+/* counters */
+/*
+ * One counter per reported result kind. Each ksft_test_result_*() helper
+ * bumps its matching field, and ksft_test_num() is the sum of all of them.
+ */
+struct ksft_count {
+ unsigned int ksft_pass;
+ unsigned int ksft_fail;
+ unsigned int ksft_xfail;
+ unsigned int ksft_xpass;
+ /* incremented by ksft_test_result_skip() and ksft_exit_skip() */
+ unsigned int ksft_xskip;
+ unsigned int ksft_error;
+};
+
+/* Result counters for the program that includes this header. */
+static struct ksft_count ksft_cnt;
+/* Planned number of tests, recorded by ksft_set_plan(). */
+static unsigned int ksft_plan;
+
+/* Number of results reported so far: the sum of all six counters. */
+static inline unsigned int ksft_test_num(void)
+{
+	const struct ksft_count *c = &ksft_cnt;
+	unsigned int total = 0;
+
+	total += c->ksft_pass;
+	total += c->ksft_fail;
+	total += c->ksft_xfail;
+	total += c->ksft_xpass;
+	total += c->ksft_xskip;
+	total += c->ksft_error;
+
+	return total;
+}
+
+/* Increment a single result counter without printing anything. */
+static inline void ksft_inc_pass_cnt(void) { ksft_cnt.ksft_pass++; }
+static inline void ksft_inc_fail_cnt(void) { ksft_cnt.ksft_fail++; }
+static inline void ksft_inc_xfail_cnt(void) { ksft_cnt.ksft_xfail++; }
+static inline void ksft_inc_xpass_cnt(void) { ksft_cnt.ksft_xpass++; }
+static inline void ksft_inc_xskip_cnt(void) { ksft_cnt.ksft_xskip++; }
+static inline void ksft_inc_error_cnt(void) { ksft_cnt.ksft_error++; }
+
+/* Read a single result counter; the unsigned field is returned as int. */
+static inline int ksft_get_pass_cnt(void) { return ksft_cnt.ksft_pass; }
+static inline int ksft_get_fail_cnt(void) { return ksft_cnt.ksft_fail; }
+static inline int ksft_get_xfail_cnt(void) { return ksft_cnt.ksft_xfail; }
+static inline int ksft_get_xpass_cnt(void) { return ksft_cnt.ksft_xpass; }
+static inline int ksft_get_xskip_cnt(void) { return ksft_cnt.ksft_xskip; }
+static inline int ksft_get_error_cnt(void) { return ksft_cnt.ksft_error; }
+
+/*
+ * Print the "TAP version 13" line, unless the KSFT_TAP_LEVEL environment
+ * variable is set, in which case nothing is printed.
+ */
+static inline void ksft_print_header(void)
+{
+	if (getenv("KSFT_TAP_LEVEL"))
+		return;
+
+	printf("TAP version 13\n");
+}
+
+/**
+ * ksft_set_plan() - Record and print the planned number of tests
+ *
+ * @plan: number of test results the program intends to report.
+ *
+ * Stores @plan in ksft_plan and prints the "1..N" plan line.
+ */
+static inline void ksft_set_plan(unsigned int plan)
+{
+	ksft_plan = plan;
+	/* ksft_plan is unsigned, so print it with %u rather than %d. */
+	printf("1..%u\n", ksft_plan);
+}
+
+/*
+ * Print the totals line with every counter. If the number of reported
+ * results differs from the plan, a "# Planned tests != run tests" line is
+ * printed first.
+ */
+static inline void ksft_print_cnts(void)
+{
+	if (ksft_plan != ksft_test_num())
+		printf("# Planned tests != run tests (%u != %u)\n",
+			ksft_plan, ksft_test_num());
+	/* All counters are unsigned int, hence %u for each of them. */
+	printf("# Totals: pass:%u fail:%u xfail:%u xpass:%u skip:%u error:%u\n",
+		ksft_cnt.ksft_pass, ksft_cnt.ksft_fail,
+		ksft_cnt.ksft_xfail, ksft_cnt.ksft_xpass,
+		ksft_cnt.ksft_xskip, ksft_cnt.ksft_error);
+}
+
+/*
+ * Print a "# "-prefixed message built from a printf-style format.
+ * errno is set back to its value on entry before the caller's format is
+ * processed, because the preceding printf() may have changed it.
+ */
+static inline void ksft_print_msg(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list ap;
+
+	printf("# ");
+
+	va_start(ap, msg);
+	errno = saved_errno;
+	vprintf(msg, ap);
+	va_end(ap);
+}
+
+/*
+ * Count one passing test and print "ok <n> " followed by the formatted
+ * message. <n> is the running result number after the increment. errno is
+ * set back to its value on entry before the caller's format is processed.
+ */
+static inline void ksft_test_result_pass(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	ksft_cnt.ksft_pass++;
+
+	va_start(args, msg);
+	/* ksft_test_num() returns unsigned int, so use %u. */
+	printf("ok %u ", ksft_test_num());
+	errno = saved_errno;
+	vprintf(msg, args);
+	va_end(args);
+}
+
+/*
+ * Count one failing test and print "not ok <n> " followed by the formatted
+ * message. <n> is the running result number after the increment. errno is
+ * set back to its value on entry before the caller's format is processed.
+ */
+static inline void ksft_test_result_fail(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	ksft_cnt.ksft_fail++;
+
+	va_start(args, msg);
+	/* ksft_test_num() returns unsigned int, so use %u. */
+	printf("not ok %u ", ksft_test_num());
+	errno = saved_errno;
+	vprintf(msg, args);
+	va_end(args);
+}
+
+/**
+ * ksft_test_result() - Report test success based on truth of condition
+ *
+ * @condition: if true, report test success, otherwise failure.
+ * @fmt: printf-style format, passed on together with any further arguments
+ *       to ksft_test_result_pass() or ksft_test_result_fail().
+ */
+#define ksft_test_result(condition, fmt, ...) do { \
+ if (!!(condition)) \
+ ksft_test_result_pass(fmt, ##__VA_ARGS__);\
+ else \
+ ksft_test_result_fail(fmt, ##__VA_ARGS__);\
+ } while (0)
+
+/*
+ * Count one expected failure and print "ok <n> # XFAIL " followed by the
+ * formatted message. <n> is the running result number after the increment.
+ */
+static inline void ksft_test_result_xfail(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	ksft_cnt.ksft_xfail++;
+
+	va_start(args, msg);
+	/* ksft_test_num() returns unsigned int, so use %u. */
+	printf("ok %u # XFAIL ", ksft_test_num());
+	errno = saved_errno;
+	vprintf(msg, args);
+	va_end(args);
+}
+
+/*
+ * Count one skipped test and print "ok <n> # SKIP " followed by the
+ * formatted message. <n> is the running result number after the increment.
+ */
+static inline void ksft_test_result_skip(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	ksft_cnt.ksft_xskip++;
+
+	va_start(args, msg);
+	/* ksft_test_num() returns unsigned int, so use %u. */
+	printf("ok %u # SKIP ", ksft_test_num());
+	errno = saved_errno;
+	vprintf(msg, args);
+	va_end(args);
+}
+
+/* TODO: how does "error" differ from "fail" or "skip"? */
+/*
+ * Count one error and print "not ok <n> # error " followed by the
+ * formatted message. <n> is the running result number after the increment.
+ */
+static inline void ksft_test_result_error(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	ksft_cnt.ksft_error++;
+
+	va_start(args, msg);
+	/* ksft_test_num() returns unsigned int, so use %u. */
+	printf("not ok %u # error ", ksft_test_num());
+	errno = saved_errno;
+	vprintf(msg, args);
+	va_end(args);
+}
+
+/*
+ * Print the totals and terminate the program with KSFT_PASS.
+ * Declared int, but exit() means the function does not return.
+ */
+static inline int ksft_exit_pass(void)
+{
+ ksft_print_cnts();
+ exit(KSFT_PASS);
+}
+
+/*
+ * Print the totals and terminate the program with KSFT_FAIL.
+ * Declared int, but exit() means the function does not return.
+ */
+static inline int ksft_exit_fail(void)
+{
+ ksft_print_cnts();
+ exit(KSFT_FAIL);
+}
+
+/**
+ * ksft_exit() - Exit selftest based on truth of condition
+ *
+ * @condition: if true, exit self test with success, otherwise fail.
+ *
+ * Expands to a call to ksft_exit_pass() or ksft_exit_fail(); both print
+ * the totals before exiting.
+ */
+#define ksft_exit(condition) do { \
+ if (!!(condition)) \
+ ksft_exit_pass(); \
+ else \
+ ksft_exit_fail(); \
+ } while (0)
+
+/*
+ * Print "Bail out! " followed by the formatted message, then the totals,
+ * and terminate the program with KSFT_FAIL. errno is set back to its value
+ * on entry before the caller's format is processed.
+ */
+static inline int ksft_exit_fail_msg(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list ap;
+
+	printf("Bail out! ");
+
+	va_start(ap, msg);
+	errno = saved_errno;
+	vprintf(msg, ap);
+	va_end(ap);
+
+	ksft_print_cnts();
+	exit(KSFT_FAIL);
+}
+
+/*
+ * Print the totals and terminate the program with KSFT_XFAIL.
+ * Declared int, but exit() means the function does not return.
+ */
+static inline int ksft_exit_xfail(void)
+{
+ ksft_print_cnts();
+ exit(KSFT_XFAIL);
+}
+
+/*
+ * Print the totals and terminate the program with KSFT_XPASS.
+ * Declared int, but exit() means the function does not return.
+ */
+static inline int ksft_exit_xpass(void)
+{
+ ksft_print_cnts();
+ exit(KSFT_XPASS);
+}
+
+/*
+ * Report a skip and terminate the program with KSFT_SKIP.
+ *
+ * With no plan and no results so far this prints "1..0 # SKIP "; otherwise
+ * the skip is counted and printed as an "ok <n> # SKIP " result line.
+ * @msg may be NULL, in which case no message text is printed. The totals
+ * are printed whenever at least one result has been counted.
+ */
+static inline int ksft_exit_skip(const char *msg, ...)
+{
+	int saved_errno = errno;
+	va_list args;
+
+	va_start(args, msg);
+
+	/*
+	 * FIXME: several tests misuse ksft_exit_skip so produce
+	 * something sensible if some tests have already been run
+	 * or a plan has been printed. Those tests should use
+	 * ksft_test_result_skip or ksft_exit_fail_msg instead.
+	 */
+	if (ksft_plan || ksft_test_num()) {
+		ksft_cnt.ksft_xskip++;
+		/*
+		 * NOTE(review): the counter was already incremented above, so
+		 * this prints one more than ksft_test_result_skip() would for
+		 * the same state - confirm whether that is intended.
+		 */
+		printf("ok %u # SKIP ", 1 + ksft_test_num());
+	} else {
+		printf("1..0 # SKIP ");
+	}
+	if (msg) {
+		errno = saved_errno;
+		vprintf(msg, args);
+	}
+	/* Always pair va_start() with va_end(), also when msg is NULL. */
+	va_end(args);
+	if (ksft_test_num())
+		ksft_print_cnts();
+	exit(KSFT_SKIP);
+}
+
+#endif /* __KSELFTEST_H */
diff --git a/tools/testing/selftests/kselftest/module.sh b/tools/testing/selftests/kselftest/module.sh
new file mode 100755
index 000000000..fb4733faf
--- /dev/null
+++ b/tools/testing/selftests/kselftest/module.sh
@@ -0,0 +1,84 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+
+#
+# Runs an individual test module.
+#
+# kselftest expects a separate executable for each test, this can be
+# created by adding a script like this:
+#
+# #!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+# $(dirname $0)/../kselftest/module.sh "description" module_name
+#
+# Example: tools/testing/selftests/lib/printf.sh
+
+desc="" # Output prefix.
+module="" # Filename (without the .ko).
+args="" # modprobe arguments.
+
+modprobe="/sbin/modprobe"
+
+# Entry point: parse the arguments, skip unless /dev is writable and the
+# module can be resolved by modprobe, then load and unload the module.
+# Each assert_* helper exits the script itself when its check fails.
+main() {
+ parse_args "$@"
+ assert_root
+ assert_have_module
+ run_module
+}
+
+parse_args() {
+ script=${0##*/}
+
+ if [ $# -lt 2 ]; then
+ echo "Usage: $script <description> <module_name> [FAIL]"
+ exit 1
+ fi
+
+ desc="$1"
+ shift || true
+ module="$1"
+ shift || true
+ args="$@"
+}
+
+assert_root() {
+ if [ ! -w /dev ]; then
+ skip "please run as root"
+ fi
+}
+
+assert_have_module() {
+ if ! $modprobe -q -n $module; then
+ skip "module $module is not found"
+ fi
+}
+
+# Load $module with $args. A failing modprobe reports FAIL and exits;
+# otherwise the module is removed again and "ok" is reported.
+run_module() {
+	if ! $modprobe -q $module $args; then
+		fail ""
+	fi
+
+	$modprobe -q -r $module
+	say "ok"
+}
+
+# Print "$desc: <message>" on stdout.
+say() {
+ echo "$desc: $1"
+}
+
+
+# Report "<message> [FAIL]" on stderr and exit with status 1.
+fail() {
+ say "$1 [FAIL]" >&2
+ exit 1
+}
+
+# Report "<message> [SKIP]" on stderr and exit with the kselftest skip code.
+skip() {
+ say "$1 [SKIP]" >&2
+ # Kselftest framework requirement - SKIP code is 4.
+ exit 4
+}
+
+#
+# Main script
+#
+main "$@"
diff --git a/tools/testing/selftests/kselftest/prefix.pl b/tools/testing/selftests/kselftest/prefix.pl
new file mode 100755
index 000000000..12a7f4ca2
--- /dev/null
+++ b/tools/testing/selftests/kselftest/prefix.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0
+# Prefix all lines with "# ", unbuffered. Command being piped in may need
+# to have unbuffering forced with "stdbuf -i0 -o0 -e0 $cmd".
+use strict;
+use IO::Handle;
+
+binmode STDIN;
+binmode STDOUT;
+
+STDOUT->autoflush(1);
+
+# True whenever the next byte written starts a new output line.
+my $at_line_start = 1;
+
+# Copy STDIN to STDOUT one byte at a time, writing "# " in front of the
+# first byte of every line. A sysread() result of 0 or undef ends the loop.
+while (sysread(STDIN, my $byte, 1)) {
+	print "# " if $at_line_start;
+	print $byte;
+	$at_line_start = ($byte eq "\n");
+}
+exit 0;
diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh
new file mode 100644
index 000000000..83616f077
--- /dev/null
+++ b/tools/testing/selftests/kselftest/runner.sh
@@ -0,0 +1,120 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Runs a set of tests in a given subdirectory.
+export skip_rc=4
+export timeout_rc=124
+export logfile=/dev/stdout
+export per_test_logging=
+
+# Defaults for "settings" file fields:
+# "timeout" how many seconds to let each test run before failing.
+export kselftest_default_timeout=45
+
+# There isn't a shell-agnostic way to find the path of a sourced file,
+# so we must rely on BASE_DIR being set to find other tools.
+if [ -z "$BASE_DIR" ]; then
+ echo "Error: BASE_DIR must be set before sourcing." >&2
+ exit 1
+fi
+
+# If Perl is unavailable, we must fall back to line-at-a-time prefixing
+# with sed instead of unbuffered output.
+# Filter stdin to stdout, prefixing lines with "# ". Uses the prefix.pl
+# helper when /usr/bin/perl is executable and sed otherwise.
+tap_prefix()
+{
+	if [ -x /usr/bin/perl ]; then
+		"$BASE_DIR"/kselftest/prefix.pl
+	else
+		sed -e 's/^/# /'
+	fi
+}
+
+tap_timeout()
+{
+ # Make sure tests will time out if utility is available.
+ if [ -x /usr/bin/timeout ] ; then
+ /usr/bin/timeout --foreground "$kselftest_timeout" \
+ /usr/bin/timeout "$kselftest_timeout" $1
+ else
+ $1
+ fi
+}
+
+run_one()
+{
+ DIR="$1"
+ TEST="$2"
+ NUM="$3"
+
+ BASENAME_TEST=$(basename $TEST)
+
+ # Reset any "settings"-file variables.
+ export kselftest_timeout="$kselftest_default_timeout"
+ # Load per-test-directory kselftest "settings" file.
+ settings="$BASE_DIR/$DIR/settings"
+ if [ -r "$settings" ] ; then
+ while read line ; do
+ # Skip comments.
+ if echo "$line" | grep -q '^#'; then
+ continue
+ fi
+ field=$(echo "$line" | cut -d= -f1)
+ value=$(echo "$line" | cut -d= -f2-)
+ eval "kselftest_$field"="$value"
+ done < "$settings"
+ fi
+
+ TEST_HDR_MSG="selftests: $DIR: $BASENAME_TEST"
+ echo "# $TEST_HDR_MSG"
+ if [ ! -e "$TEST" ]; then
+ echo "# Warning: file $TEST is missing!"
+ echo "not ok $test_num $TEST_HDR_MSG"
+ else
+ cmd="./$BASENAME_TEST"
+ if [ ! -x "$TEST" ]; then
+ echo "# Warning: file $TEST is not executable"
+
+ if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" ]
+ then
+ interpreter=$(head -n 1 "$TEST" | cut -c 3-)
+ cmd="$interpreter ./$BASENAME_TEST"
+ else
+ echo "not ok $test_num $TEST_HDR_MSG"
+ return
+ fi
+ fi
+ cd `dirname $TEST` > /dev/null
+ ((((( tap_timeout "$cmd" 2>&1; echo $? >&3) |
+ tap_prefix >&4) 3>&1) |
+ (read xs; exit $xs)) 4>>"$logfile" &&
+ echo "ok $test_num $TEST_HDR_MSG") ||
+ (rc=$?; \
+ if [ $rc -eq $skip_rc ]; then \
+ echo "ok $test_num $TEST_HDR_MSG # SKIP"
+ elif [ $rc -eq $timeout_rc ]; then \
+ echo "#"
+ echo "not ok $test_num $TEST_HDR_MSG # TIMEOUT $kselftest_timeout seconds"
+ else
+ echo "not ok $test_num $TEST_HDR_MSG # exit=$rc"
+ fi)
+ cd - >/dev/null
+ fi
+}
+
+# run_many <test>...
+#
+# Print the TAP header and plan, then run every argument through run_one.
+# The directory label is $PWD with the "$BASE_DIR/" prefix removed. When
+# $per_test_logging is non-empty each test gets its own, freshly truncated
+# log file /tmp/<test basename>.
+run_many()
+{
+	echo "TAP version 13"
+	DIR="${PWD#${BASE_DIR}/}"
+	test_num=0
+	# The plan must match the number of loop iterations below, which is the
+	# argument count and not the number of whitespace-separated words.
+	total=$#
+	echo "1..$total"
+	for TEST in "$@"; do
+		BASENAME_TEST=$(basename $TEST)
+		test_num=$(( test_num + 1 ))
+		if [ -n "$per_test_logging" ]; then
+			logfile="/tmp/$BASENAME_TEST"
+			cat /dev/null > "$logfile"
+		fi
+		run_one "$DIR" "$TEST" "$test_num"
+	done
+}
diff --git a/tools/testing/selftests/kselftest_deps.sh b/tools/testing/selftests/kselftest_deps.sh
new file mode 100755
index 000000000..e6010de67
--- /dev/null
+++ b/tools/testing/selftests/kselftest_deps.sh
@@ -0,0 +1,325 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# kselftest_deps.sh
+#
+# Checks for kselftest build dependencies on the build system.
+# Copyright (c) 2020 Shuah Khan <skhan@linuxfoundation.org>
+#
+#
+
+# Print the help text and exit with status 1.
+usage()
+{
+	local example
+
+	echo -e "Usage: $0 -[p] <compiler> [test_name]\n"
+	for example in "gcc" "gcc vm" "aarch64-linux-gnu-gcc"; do
+		echo -e "\tkselftest_deps.sh [-p] $example"
+	done
+	echo -e "\tkselftest_deps.sh [-p] aarch64-linux-gnu-gcc vm\n"
+
+	cat << "EOF"
+- Should be run in selftests directory in the kernel repo.
+- Checks if Kselftests can be built/cross-built on a system.
+- Parses all test/sub-test Makefile to find library dependencies.
+- Runs compile test on a trivial C file with LDLIBS specified
+ in the test Makefiles to identify missing library dependencies.
+- Prints suggested target list for a system filtering out tests
+ failed the build dependency check from the TARGETS in Selftests
+ main Makefile when optional -p is specified.
+- Prints pass/fail dependency check for each tests/sub-test.
+- Prints pass/fail targets and libraries.
+- Default: runs dependency checks on all tests.
+- Optional test name can be specified to check dependencies for it.
+EOF
+	exit 1
+}
+
+# Start main()
+main()
+{
+
+base_dir=`pwd`
+# Make sure we're in the selftests top-level directory.
+if [ $(basename "$base_dir") != "selftests" ]; then
+ echo -e "\tPlease run $0 in"
+ echo -e "\ttools/testing/selftests directory ..."
+ exit 1
+fi
+
+print_targets=0
+
+while getopts "p" arg; do
+ case $arg in
+ p)
+ print_targets=1
+ shift;;
+ esac
+done
+
+if [ $# -eq 0 ]
+then
+ usage
+fi
+
+# Compiler
+CC=$1
+
+tmp_file=$(mktemp).c
+trap "rm -f $tmp_file.o $tmp_file $tmp_file.bin" EXIT
+#echo $tmp_file
+
+pass=$(mktemp).out
+trap "rm -f $pass" EXIT
+#echo $pass
+
+fail=$(mktemp).out
+trap "rm -f $fail" EXIT
+#echo $fail
+
+# Generate tmp source fire for compile test
+cat << "EOF" > $tmp_file
+int main()
+{
+}
+EOF
+
+# Save results
+total_cnt=0
+fail_trgts=()
+fail_libs=()
+fail_cnt=0
+pass_trgts=()
+pass_libs=()
+pass_cnt=0
+
+# Get all TARGETS from selftests Makefile
+targets=$(egrep "^TARGETS +|^TARGETS =" Makefile | cut -d "=" -f2)
+
+# Initially, in LDLIBS related lines, the dep checker needs
+# to ignore lines containing the following strings:
+filter="\$(VAR_LDLIBS)\|pkg-config\|PKG_CONFIG\|IOURING_EXTRA_LIBS"
+
+# Single test case
+if [ $# -eq 2 ]
+then
+ test=$2/Makefile
+
+ l1_test $test
+ l2_test $test
+ l3_test $test
+ l4_test $test
+ l5_test $test
+
+ print_results $1 $2
+ exit $?
+fi
+
+# Level 1: LDLIBS set static.
+#
+# Find all LDLIBS set statically for all executables built by a Makefile
+# and filter out VAR_LDLIBS to discard the following:
+# gpio/Makefile:LDLIBS += $(VAR_LDLIBS)
+# Append space at the end of the list to append more tests.
+
+l1_tests=$(grep -r --include=Makefile "^LDLIBS" | \
+ grep -v "$filter" | awk -F: '{print $1}' | uniq)
+
+# Level 2: LDLIBS set dynamically.
+#
+# Level 2
+# Some tests have multiple valid LDLIBS lines for individual sub-tests
+# that need dependency checks. Find them and append them to the tests
+# e.g: vm/Makefile:$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+# Filter out VAR_LDLIBS to discard the following:
+# memfd/Makefile:$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
+# Append space at the end of the list to append more tests.
+
+l2_tests=$(grep -r --include=Makefile ": LDLIBS" | \
+ grep -v "$filter" | awk -F: '{print $1}' | uniq)
+
+# Level 3
+# gpio, memfd and others use pkg-config to find mount and fuse libs
+# respectively and save it in VAR_LDLIBS. If pkg-config doesn't find
+# any, VAR_LDLIBS set to default.
+# Use the default value and filter out pkg-config for dependency check.
+# e.g:
+# gpio/Makefile
+# VAR_LDLIBS := $(shell pkg-config --libs mount) 2>/dev/null)
+# memfd/Makefile
+# VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null)
+
+l3_tests=$(grep -r --include=Makefile "^VAR_LDLIBS" | \
+ grep -v "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq)
+
+# Level 4
+# some tests may fall back to default using `|| echo -l<libname>`
+# if pkg-config doesn't find the libs, instead of using VAR_LDLIBS
+# as per level 3 checks.
+# e.g:
+# netfilter/Makefile
+# LDLIBS += $(shell $(HOSTPKG_CONFIG) --libs libmnl 2>/dev/null || echo -lmnl)
+l4_tests=$(grep -r --include=Makefile "^LDLIBS" | \
+ grep "pkg-config\|PKG_CONFIG" | awk -F: '{print $1}' | uniq)
+
+# Level 5
+# some tests may use IOURING_EXTRA_LIBS to add extra libs to LDLIBS,
+# which in turn may be defined in a sub-Makefile
+# e.g.:
+# mm/Makefile
+# $(OUTPUT)/gup_longterm: LDLIBS += $(IOURING_EXTRA_LIBS)
+l5_tests=$(grep -r --include=Makefile "LDLIBS +=.*\$(IOURING_EXTRA_LIBS)" | \
+ awk -F: '{print $1}' | uniq)
+
+#echo l1_tests $l1_tests
+#echo l2_tests $l2_tests
+#echo l3_tests $l3_tests
+#echo l4_tests $l4_tests
+#echo l5_tests $l5_tests
+
+all_tests
+print_results $1 $2
+
+exit $?
+}
+# end main()
+
+# Run the level 1..5 checks, in that order, over the Makefile lists
+# l1_tests..l5_tests. The loop variable "test" is left global on purpose:
+# the l*_test helpers and check_libs read it.
+all_tests()
+{
+	local level list_name
+
+	for level in 1 2 3 4 5; do
+		list_name="l${level}_tests"
+		for test in ${!list_name}; do
+			l${level}_test $test
+		done
+	done
+}
+
+# Level 1: pull the libraries out of the "^LDLIBS" lines of $test, using
+# the same filter as for l1_tests, and hand them to check_libs.
+l1_test()
+{
+	test_libs=$(grep --include=Makefile "^LDLIBS" $test |
+			grep -v "$filter" |
+			sed -e 's/\:/ /' -e 's/+/ /' |
+			cut -d "=" -f 2)
+
+	check_libs $test $test_libs
+}
+
+# Level 2: pull the libraries out of the ": LDLIBS" lines of $test, using
+# the same filter as for l2_tests, and hand them to check_libs.
+l2_test()
+{
+	test_libs=$(grep --include=Makefile ": LDLIBS" $test |
+			grep -v "$filter" |
+			sed -e 's/\:/ /' -e 's/+/ /' |
+			cut -d "=" -f 2)
+
+	check_libs $test $test_libs
+}
+
+# Level 3: pull the default libraries out of the "^VAR_LDLIBS" lines of
+# $test and hand them to check_libs. Lines that invoke pkg-config, either
+# literally or through a *PKG_CONFIG variable, are ignored, matching the
+# filter used to build l3_tests.
+l3_test()
+{
+	test_libs=$(grep --include=Makefile "^VAR_LDLIBS" $test | \
+			grep -v "pkg-config\|PKG_CONFIG" | sed -e 's/\:/ /' |
+			sed -e 's/+/ /' | cut -d "=" -f 2)
+
+	check_libs $test $test_libs
+}
+
+l4_test()
+{
+ test_libs=$(grep --include=Makefile "^VAR_LDLIBS\|^LDLIBS" $test | \
+ grep "\(pkg-config\|PKG_CONFIG\).*|| echo " | \
+ sed -e 's/.*|| echo //' | sed -e 's/)$//')
+
+ check_libs $test $test_libs
+}
+
+# Level 5: look for IOURING_EXTRA_LIBS assignments in the *.mk files below
+# the directory of $test and hand the libraries found to check_libs.
+l5_test()
+{
+	tests=$(find $(dirname "$test") -type f -name "*.mk")
+	# Without any *.mk file grep would get no file operand and block
+	# reading standard input, so there is nothing to check here.
+	if [[ -z "${tests// }" ]]; then
+		test_libs=""
+		return
+	fi
+
+	test_libs=$(grep "^IOURING_EXTRA_LIBS +\?=" $tests | \
+			cut -d "=" -f 2)
+
+	check_libs $test $test_libs
+}
+
+# Compile-test every library flag in the global $test_libs against the
+# trivial source $tmp_file with $CC, and record the outcome for $test:
+# a PASS/FAIL line in the $pass/$fail file, the counters, and the lists of
+# libraries and targets. A failing target is also removed from the
+# suggested $targets list. The positional arguments are not used.
+check_libs()
+{
+	local lib fail_target
+
+	# Nothing to do when the library list is empty or only blanks.
+	if [[ -z "${test_libs// }" ]]; then
+		return
+	fi
+
+	#echo $test_libs
+
+	for lib in $test_libs; do
+
+		total_cnt=$((total_cnt + 1))
+		if $CC -o $tmp_file.bin $lib $tmp_file > /dev/null 2>&1; then
+			echo "PASS: $test dependency check passed $lib" >> $pass
+			pass_cnt=$((pass_cnt + 1))
+			pass_libs+="$lib "
+			pass_trgts+="${test%%/*} "
+		else
+			echo "FAIL: $test dependency check: $lib" >> $fail
+			fail_cnt=$((fail_cnt + 1))
+			fail_libs+="$lib "
+			# The target is the first path component of the Makefile.
+			fail_target=${test%%/*}
+			fail_trgts+="$fail_target "
+			# Drop the failed target and its sub-targets only. A plain
+			# substring match would also drop unrelated targets whose
+			# names merely contain it (e.g. "net" vs "netfilter").
+			targets=$(echo "$targets" | \
+				grep -v -E "^[[:space:]]*${fail_target}(/|[[:space:]]|\$)")
+		fi
+
+	done
+}
+
+print_results()
+{
+ echo -e "========================================================";
+ echo -e "Kselftest Dependency Check for [$0 $1 $2] results..."
+
+ if [ $print_targets -ne 0 ]
+ then
+ echo -e "Suggested Selftest Targets for your configuration:"
+ echo -e "$targets";
+ fi
+
+ echo -e "========================================================";
+ echo -e "Checked tests defining LDLIBS dependencies"
+ echo -e "--------------------------------------------------------";
+ echo -e "Total tests with Dependencies:"
+ echo -e "$total_cnt Pass: $pass_cnt Fail: $fail_cnt";
+
+ if [ $pass_cnt -ne 0 ]; then
+ echo -e "--------------------------------------------------------";
+ cat $pass
+ echo -e "--------------------------------------------------------";
+ echo -e "Targets passed build dependency check on system:"
+ echo -e "$(echo "$pass_trgts" | xargs -n1 | sort -u | xargs)"
+ fi
+
+ if [ $fail_cnt -ne 0 ]; then
+ echo -e "--------------------------------------------------------";
+ cat $fail
+ echo -e "--------------------------------------------------------";
+ echo -e "Targets failed build dependency check on system:"
+ echo -e "$(echo "$fail_trgts" | xargs -n1 | sort -u | xargs)"
+ echo -e "--------------------------------------------------------";
+ echo -e "Missing libraries system"
+ echo -e "$(echo "$fail_libs" | xargs -n1 | sort -u | xargs)"
+ fi
+
+ echo -e "--------------------------------------------------------";
+ echo -e "========================================================";
+}
+
+main "$@"
diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h
new file mode 100644
index 000000000..2fadc99d9
--- /dev/null
+++ b/tools/testing/selftests/kselftest_harness.h
@@ -0,0 +1,1065 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ *
+ * kselftest_harness.h: simple C unit test helper.
+ *
+ * See documentation in Documentation/dev-tools/kselftest.rst
+ *
+ * API inspired by code.google.com/p/googletest
+ */
+
+/**
+ * DOC: example
+ *
+ * .. code-block:: c
+ *
+ * #include "../kselftest_harness.h"
+ *
+ * TEST(standalone_test) {
+ * do_some_stuff;
+ * EXPECT_GT(10, stuff) {
+ * stuff_state_t state;
+ * enumerate_stuff_state(&state);
+ * TH_LOG("expectation failed with state: %s", state.msg);
+ * }
+ * more_stuff;
+ * ASSERT_NE(some_stuff, NULL) TH_LOG("how did it happen?!");
+ * last_stuff;
+ * EXPECT_EQ(0, last_stuff);
+ * }
+ *
+ * FIXTURE(my_fixture) {
+ * mytype_t *data;
+ * int awesomeness_level;
+ * };
+ * FIXTURE_SETUP(my_fixture) {
+ * self->data = mytype_new();
+ * ASSERT_NE(NULL, self->data);
+ * }
+ * FIXTURE_TEARDOWN(my_fixture) {
+ * mytype_free(self->data);
+ * }
+ * TEST_F(my_fixture, data_is_good) {
+ * EXPECT_EQ(1, is_my_data_good(self->data));
+ * }
+ *
+ * TEST_HARNESS_MAIN
+ */
+
+#ifndef __KSELFTEST_HARNESS_H
+#define __KSELFTEST_HARNESS_H
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <asm/types.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "kselftest.h"
+
+#define TEST_TIMEOUT_DEFAULT 30
+
+/* Utilities exposed to the test definitions */
+#ifndef TH_LOG_STREAM
+# define TH_LOG_STREAM stderr
+#endif
+
+#ifndef TH_LOG_ENABLED
+# define TH_LOG_ENABLED 1
+#endif
+
+/**
+ * TH_LOG(fmt, ...)
+ *
+ * @fmt: format string
+ * @...: optional arguments
+ *
+ * .. code-block:: c
+ *
+ * TH_LOG(format, ...)
+ *
+ * Optional debug logging function available for use in tests.
+ * Logging may be enabled or disabled by defining TH_LOG_ENABLED.
+ * E.g., #define TH_LOG_ENABLED 1
+ *
+ * If no definition is provided, logging is enabled by default.
+ *
+ * If there is no way to print an error message for the process running the
+ * test (e.g. not allowed to write to stderr), it is still possible to get the
+ * ASSERT_* number for which the test failed. This behavior can be enabled by
+ * writing `_metadata->no_print = true;` before the check sequence that is
+ * unable to print. When an error occurs, instead of printing an error message
+ * and calling `abort(3)`, the test process calls `_exit(2)` with the assert
+ * number as argument, which is then printed by the parent process.
+ */
+#define TH_LOG(fmt, ...) do { \
+ if (TH_LOG_ENABLED) \
+ __TH_LOG(fmt, ##__VA_ARGS__); \
+} while (0)
+
+/* Unconditional logger for internal use. */
+#define __TH_LOG(fmt, ...) \
+ fprintf(TH_LOG_STREAM, "# %s:%d:%s:" fmt "\n", \
+ __FILE__, __LINE__, _metadata->name, ##__VA_ARGS__)
+
+/**
+ * SKIP(statement, fmt, ...)
+ *
+ * @statement: statement to run after reporting SKIP
+ * @fmt: format string
+ * @...: optional arguments
+ *
+ * This forces a "pass" after reporting why something is being skipped
+ * and runs "statement", which is usually "return" or "goto skip".
+ */
+#define SKIP(statement, fmt, ...) do { \
+ snprintf(_metadata->results->reason, \
+ sizeof(_metadata->results->reason), fmt, ##__VA_ARGS__); \
+ if (TH_LOG_ENABLED) { \
+ fprintf(TH_LOG_STREAM, "# SKIP %s\n", \
+ _metadata->results->reason); \
+ } \
+ _metadata->passed = 1; \
+ _metadata->skip = 1; \
+ _metadata->trigger = 0; \
+ statement; \
+} while (0)
+
+/**
+ * TEST(test_name) - Defines the test function and creates the registration
+ * stub
+ *
+ * @test_name: test name
+ *
+ * .. code-block:: c
+ *
+ * TEST(name) { implementation }
+ *
+ * Defines a test by name.
+ * Names must be unique and tests must not be run in parallel. The
+ * implementation containing block is a function and scoping should be treated
+ * as such. Returning early may be performed with a bare "return;" statement.
+ *
+ * EXPECT_* and ASSERT_* are valid in a TEST() { } context.
+ */
+#define TEST(test_name) __TEST_IMPL(test_name, -1)
+
+/**
+ * TEST_SIGNAL(test_name, signal)
+ *
+ * @test_name: test name
+ * @signal: signal number
+ *
+ * .. code-block:: c
+ *
+ * TEST_SIGNAL(name, signal) { implementation }
+ *
+ * Defines a test by name and the expected term signal.
+ * Names must be unique and tests must not be run in parallel. The
+ * implementation containing block is a function and scoping should be treated
+ * as such. Returning early may be performed with a bare "return;" statement.
+ *
+ * EXPECT_* and ASSERT_* are valid in a TEST() { } context.
+ */
+#define TEST_SIGNAL(test_name, signal) __TEST_IMPL(test_name, signal)
+
+#define __TEST_IMPL(test_name, _signal) \
+ static void test_name(struct __test_metadata *_metadata); \
+ static inline void wrapper_##test_name( \
+ struct __test_metadata *_metadata, \
+ struct __fixture_variant_metadata *variant) \
+ { \
+ test_name(_metadata); \
+ } \
+ static struct __test_metadata _##test_name##_object = \
+ { .name = #test_name, \
+ .fn = &wrapper_##test_name, \
+ .fixture = &_fixture_global, \
+ .termsig = _signal, \
+ .timeout = TEST_TIMEOUT_DEFAULT, }; \
+ static void __attribute__((constructor)) _register_##test_name(void) \
+ { \
+ __register_test(&_##test_name##_object); \
+ } \
+ static void test_name( \
+ struct __test_metadata __attribute__((unused)) *_metadata)
+
+/**
+ * FIXTURE_DATA(datatype_name) - Wraps the struct name so we have one less
+ * argument to pass around
+ *
+ * @datatype_name: datatype name
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE_DATA(datatype_name)
+ *
+ * Almost always, you want just FIXTURE() instead (see below).
+ * This call may be used when the type of the fixture data
+ * is needed. In general, this should not be needed unless
+ * the *self* is being passed to a helper directly.
+ */
+#define FIXTURE_DATA(datatype_name) struct _test_data_##datatype_name
+
+/**
+ * FIXTURE(fixture_name) - Called once per fixture to setup the data and
+ * register
+ *
+ * @fixture_name: fixture name
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE(fixture_name) {
+ * type property1;
+ * ...
+ * };
+ *
+ * Defines the data provided to TEST_F()-defined tests as *self*. It should be
+ * populated and cleaned up using FIXTURE_SETUP() and FIXTURE_TEARDOWN().
+ */
+#define FIXTURE(fixture_name) \
+ FIXTURE_VARIANT(fixture_name); \
+ static struct __fixture_metadata _##fixture_name##_fixture_object = \
+ { .name = #fixture_name, }; \
+ static void __attribute__((constructor)) \
+ _register_##fixture_name##_data(void) \
+ { \
+ __register_fixture(&_##fixture_name##_fixture_object); \
+ } \
+ FIXTURE_DATA(fixture_name)
+
+/**
+ * FIXTURE_SETUP(fixture_name) - Prepares the setup function for the fixture.
+ * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly.
+ *
+ * @fixture_name: fixture name
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE_SETUP(fixture_name) { implementation }
+ *
+ * Populates the required "setup" function for a fixture. An instance of the
+ * datatype defined with FIXTURE_DATA() will be exposed as *self* for the
+ * implementation.
+ *
+ * ASSERT_* are valid for use in this context and will preempt the execution
+ * of any dependent fixture tests.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_SETUP(fixture_name) \
+ void fixture_name##_setup( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \
+ const FIXTURE_VARIANT(fixture_name) \
+ __attribute__((unused)) *variant)
+
+/**
+ * FIXTURE_TEARDOWN(fixture_name)
+ * *_metadata* is included so that EXPECT_* and ASSERT_* work correctly.
+ *
+ * @fixture_name: fixture name
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE_TEARDOWN(fixture_name) { implementation }
+ *
+ * Populates the required "teardown" function for a fixture. An instance of the
+ * datatype defined with FIXTURE_DATA() will be exposed as *self* for the
+ * implementation to clean up.
+ *
+ * A bare "return;" statement may be used to return early.
+ */
+#define FIXTURE_TEARDOWN(fixture_name) \
+ void fixture_name##_teardown( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self)
+
+/**
+ * FIXTURE_VARIANT(fixture_name) - Optionally called once per fixture
+ * to declare fixture variant
+ *
+ * @fixture_name: fixture name
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE_VARIANT(fixture_name) {
+ * type property1;
+ * ...
+ * };
+ *
+ * Defines type of constant parameters provided to FIXTURE_SETUP() and TEST_F()
+ * as *variant*. Variants allow the same tests to be run with different
+ * arguments.
+ */
+#define FIXTURE_VARIANT(fixture_name) struct _fixture_variant_##fixture_name
+
+/**
+ * FIXTURE_VARIANT_ADD(fixture_name, variant_name) - Called once per fixture
+ * variant to setup and register the data
+ *
+ * @fixture_name: fixture name
+ * @variant_name: name of the parameter set
+ *
+ * .. code-block:: c
+ *
+ * FIXTURE_VARIANT_ADD(fixture_name, variant_name) {
+ * .property1 = val1,
+ * ...
+ * };
+ *
+ * Defines a variant of the test fixture, provided to FIXTURE_SETUP() and
+ * TEST_F() as *variant*. Tests of each fixture will be run once for each
+ * variant.
+ */
+#define FIXTURE_VARIANT_ADD(fixture_name, variant_name) \
+ extern FIXTURE_VARIANT(fixture_name) \
+ _##fixture_name##_##variant_name##_variant; \
+ static struct __fixture_variant_metadata \
+ _##fixture_name##_##variant_name##_object = \
+ { .name = #variant_name, \
+ .data = &_##fixture_name##_##variant_name##_variant}; \
+ static void __attribute__((constructor)) \
+ _register_##fixture_name##_##variant_name(void) \
+ { \
+ __register_fixture_variant(&_##fixture_name##_fixture_object, \
+ &_##fixture_name##_##variant_name##_object); \
+ } \
+ FIXTURE_VARIANT(fixture_name) \
+ _##fixture_name##_##variant_name##_variant =
+
+/**
+ * TEST_F(fixture_name, test_name) - Emits test registration and helpers for
+ * fixture-based test cases
+ *
+ * @fixture_name: fixture name
+ * @test_name: test name
+ *
+ * .. code-block:: c
+ *
+ * TEST_F(fixture, name) { implementation }
+ *
+ * Defines a test that depends on a fixture (e.g., is part of a test case).
+ * Very similar to TEST() except that *self* is the setup instance of fixture's
+ * datatype exposed for use by the implementation.
+ *
+ * Warning: use of ASSERT_* here will skip TEARDOWN.
+ */
+/* TODO(wad) register fixtures on dedicated test lists. */
+#define TEST_F(fixture_name, test_name) \
+ __TEST_F_IMPL(fixture_name, test_name, -1, TEST_TIMEOUT_DEFAULT)
+
+#define TEST_F_SIGNAL(fixture_name, test_name, signal) \
+ __TEST_F_IMPL(fixture_name, test_name, signal, TEST_TIMEOUT_DEFAULT)
+
+#define TEST_F_TIMEOUT(fixture_name, test_name, timeout) \
+ __TEST_F_IMPL(fixture_name, test_name, -1, timeout)
+
+#define __TEST_F_IMPL(fixture_name, test_name, signal, tmout) \
+ static void fixture_name##_##test_name( \
+ struct __test_metadata *_metadata, \
+ FIXTURE_DATA(fixture_name) *self, \
+ const FIXTURE_VARIANT(fixture_name) *variant); \
+ static inline void wrapper_##fixture_name##_##test_name( \
+ struct __test_metadata *_metadata, \
+ struct __fixture_variant_metadata *variant) \
+ { \
+ /* fixture data is alloced, setup, and torn down per call. */ \
+ FIXTURE_DATA(fixture_name) self; \
+ memset(&self, 0, sizeof(FIXTURE_DATA(fixture_name))); \
+ fixture_name##_setup(_metadata, &self, variant->data); \
+ /* Let setup failure terminate early. */ \
+ if (!_metadata->passed) \
+ return; \
+ fixture_name##_##test_name(_metadata, &self, variant->data); \
+ fixture_name##_teardown(_metadata, &self); \
+ } \
+ static struct __test_metadata \
+ _##fixture_name##_##test_name##_object = { \
+ .name = #test_name, \
+ .fn = &wrapper_##fixture_name##_##test_name, \
+ .fixture = &_##fixture_name##_fixture_object, \
+ .termsig = signal, \
+ .timeout = tmout, \
+ }; \
+ static void __attribute__((constructor)) \
+ _register_##fixture_name##_##test_name(void) \
+ { \
+ __register_test(&_##fixture_name##_##test_name##_object); \
+ } \
+ static void fixture_name##_##test_name( \
+ struct __test_metadata __attribute__((unused)) *_metadata, \
+ FIXTURE_DATA(fixture_name) __attribute__((unused)) *self, \
+ const FIXTURE_VARIANT(fixture_name) \
+ __attribute__((unused)) *variant)
+
+/**
+ * TEST_HARNESS_MAIN - Simple wrapper to run the test harness
+ *
+ * .. code-block:: c
+ *
+ * TEST_HARNESS_MAIN
+ *
+ * Use once to append a main() to the test file.
+ */
+#define TEST_HARNESS_MAIN \
+ static void __attribute__((constructor)) \
+ __constructor_order_last(void) \
+ { \
+ if (!__constructor_order) \
+ __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD; \
+ } \
+ int main(int argc, char **argv) { \
+ return test_harness_run(argc, argv); \
+ }
+
+/**
+ * DOC: operators
+ *
+ * Operators for use in TEST() and TEST_F().
+ * ASSERT_* calls will stop test execution immediately.
+ * EXPECT_* calls will emit a failure warning, note it, and continue.
+ */
+
+/**
+ * ASSERT_EQ()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_EQ(expected, measured): expected == measured
+ */
+#define ASSERT_EQ(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, ==, 1)
+
+/**
+ * ASSERT_NE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_NE(expected, measured): expected != measured
+ */
+#define ASSERT_NE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, !=, 1)
+
+/**
+ * ASSERT_LT()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_LT(expected, measured): expected < measured
+ */
+#define ASSERT_LT(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, <, 1)
+
+/**
+ * ASSERT_LE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_LE(expected, measured): expected <= measured
+ */
+#define ASSERT_LE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, <=, 1)
+
+/**
+ * ASSERT_GT()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_GT(expected, measured): expected > measured
+ */
+#define ASSERT_GT(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, >, 1)
+
+/**
+ * ASSERT_GE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_GE(expected, measured): expected >= measured
+ */
+#define ASSERT_GE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, >=, 1)
+
+/**
+ * ASSERT_NULL()
+ *
+ * @seen: measured value
+ *
+ * ASSERT_NULL(measured): NULL == measured
+ */
+#define ASSERT_NULL(seen) \
+ __EXPECT(NULL, "NULL", seen, #seen, ==, 1)
+
+/**
+ * ASSERT_TRUE()
+ *
+ * @seen: measured value
+ *
+ * ASSERT_TRUE(measured): measured != 0
+ */
+#define ASSERT_TRUE(seen) \
+ __EXPECT(0, "0", seen, #seen, !=, 1)
+
+/**
+ * ASSERT_FALSE()
+ *
+ * @seen: measured value
+ *
+ * ASSERT_FALSE(measured): measured == 0
+ */
+#define ASSERT_FALSE(seen) \
+ __EXPECT(0, "0", seen, #seen, ==, 1)
+
+/**
+ * ASSERT_STREQ()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_STREQ(expected, measured): !strcmp(expected, measured)
+ */
+#define ASSERT_STREQ(expected, seen) \
+ __EXPECT_STR(expected, seen, ==, 1)
+
+/**
+ * ASSERT_STRNE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * ASSERT_STRNE(expected, measured): strcmp(expected, measured)
+ */
+#define ASSERT_STRNE(expected, seen) \
+ __EXPECT_STR(expected, seen, !=, 1)
+
+/**
+ * EXPECT_EQ()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_EQ(expected, measured): expected == measured
+ */
+#define EXPECT_EQ(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, ==, 0)
+
+/**
+ * EXPECT_NE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_NE(expected, measured): expected != measured
+ */
+#define EXPECT_NE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, !=, 0)
+
+/**
+ * EXPECT_LT()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_LT(expected, measured): expected < measured
+ */
+#define EXPECT_LT(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, <, 0)
+
+/**
+ * EXPECT_LE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_LE(expected, measured): expected <= measured
+ */
+#define EXPECT_LE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, <=, 0)
+
+/**
+ * EXPECT_GT()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_GT(expected, measured): expected > measured
+ */
+#define EXPECT_GT(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, >, 0)
+
+/**
+ * EXPECT_GE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_GE(expected, measured): expected >= measured
+ */
+#define EXPECT_GE(expected, seen) \
+ __EXPECT(expected, #expected, seen, #seen, >=, 0)
+
+/**
+ * EXPECT_NULL()
+ *
+ * @seen: measured value
+ *
+ * EXPECT_NULL(measured): NULL == measured
+ */
+#define EXPECT_NULL(seen) \
+ __EXPECT(NULL, "NULL", seen, #seen, ==, 0)
+
+/**
+ * EXPECT_TRUE()
+ *
+ * @seen: measured value
+ *
+ * EXPECT_TRUE(measured): 0 != measured
+ */
+#define EXPECT_TRUE(seen) \
+ __EXPECT(0, "0", seen, #seen, !=, 0)
+
+/**
+ * EXPECT_FALSE()
+ *
+ * @seen: measured value
+ *
+ * EXPECT_FALSE(measured): 0 == measured
+ */
+#define EXPECT_FALSE(seen) \
+ __EXPECT(0, "0", seen, #seen, ==, 0)
+
+/**
+ * EXPECT_STREQ()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_STREQ(expected, measured): !strcmp(expected, measured)
+ */
+#define EXPECT_STREQ(expected, seen) \
+ __EXPECT_STR(expected, seen, ==, 0)
+
+/**
+ * EXPECT_STRNE()
+ *
+ * @expected: expected value
+ * @seen: measured value
+ *
+ * EXPECT_STRNE(expected, measured): strcmp(expected, measured)
+ */
+#define EXPECT_STRNE(expected, seen) \
+ __EXPECT_STR(expected, seen, !=, 0)
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+
+/* Support an optional handler after an ASSERT_* or EXPECT_*. The approach is
+ * not thread-safe, but it should be fine in most sane test scenarios.
+ *
+ * Using __bail(), which optionally abort()s, is the easiest way to early
+ * return while still providing an optional block to the API consumer.
+ */
+#define OPTIONAL_HANDLER(_assert) \
+ for (; _metadata->trigger; _metadata->trigger = \
+ __bail(_assert, _metadata->no_print, _metadata->step))
+
+#define __INC_STEP(_metadata) \
+ /* Keep "step" below 255 (which is used for "SKIP" reporting). */ \
+ if (_metadata->passed && _metadata->step < 253) \
+ _metadata->step++;
+
+#define is_signed_type(var) (!!(((__typeof__(var))(-1)) < (__typeof__(var))1))
+
+#define __EXPECT(_expected, _expected_str, _seen, _seen_str, _t, _assert) do { \
+ /* Avoid multiple evaluation of the cases */ \
+ __typeof__(_expected) __exp = (_expected); \
+ __typeof__(_seen) __seen = (_seen); \
+ if (_assert) __INC_STEP(_metadata); \
+ if (!(__exp _t __seen)) { \
+ /* Report with actual signedness to avoid weird output. */ \
+ switch (is_signed_type(__exp) * 2 + is_signed_type(__seen)) { \
+ case 0: { \
+ unsigned long long __exp_print = (uintptr_t)__exp; \
+ unsigned long long __seen_print = (uintptr_t)__seen; \
+ __TH_LOG("Expected %s (%llu) %s %s (%llu)", \
+ _expected_str, __exp_print, #_t, \
+ _seen_str, __seen_print); \
+ break; \
+ } \
+ case 1: { \
+ unsigned long long __exp_print = (uintptr_t)__exp; \
+ long long __seen_print = (intptr_t)__seen; \
+ __TH_LOG("Expected %s (%llu) %s %s (%lld)", \
+ _expected_str, __exp_print, #_t, \
+ _seen_str, __seen_print); \
+ break; \
+ } \
+ case 2: { \
+ long long __exp_print = (intptr_t)__exp; \
+ unsigned long long __seen_print = (uintptr_t)__seen; \
+ __TH_LOG("Expected %s (%lld) %s %s (%llu)", \
+ _expected_str, __exp_print, #_t, \
+ _seen_str, __seen_print); \
+ break; \
+ } \
+ case 3: { \
+ long long __exp_print = (intptr_t)__exp; \
+ long long __seen_print = (intptr_t)__seen; \
+ __TH_LOG("Expected %s (%lld) %s %s (%lld)", \
+ _expected_str, __exp_print, #_t, \
+ _seen_str, __seen_print); \
+ break; \
+ } \
+ } \
+ _metadata->passed = 0; \
+ /* Ensure the optional handler is triggered */ \
+ _metadata->trigger = 1; \
+ } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+#define __EXPECT_STR(_expected, _seen, _t, _assert) do { \
+ const char *__exp = (_expected); \
+ const char *__seen = (_seen); \
+ if (_assert) __INC_STEP(_metadata); \
+ if (!(strcmp(__exp, __seen) _t 0)) { \
+ __TH_LOG("Expected '%s' %s '%s'.", __exp, #_t, __seen); \
+ _metadata->passed = 0; \
+ _metadata->trigger = 1; \
+ } \
+} while (0); OPTIONAL_HANDLER(_assert)
+
+/* List helpers */
+#define __LIST_APPEND(head, item) \
+{ \
+ /* Circular linked list where only prev is circular. */ \
+ if (head == NULL) { \
+ head = item; \
+ item->next = NULL; \
+ item->prev = item; \
+ return; \
+ } \
+ if (__constructor_order == _CONSTRUCTOR_ORDER_FORWARD) { \
+ item->next = NULL; \
+ item->prev = head->prev; \
+ item->prev->next = item; \
+ head->prev = item; \
+ } else { \
+ item->next = head; \
+ item->next->prev = item; \
+ item->prev = item; \
+ head = item; \
+ } \
+}
+
+struct __test_results {
+ char reason[1024]; /* Reason for test result */
+};
+
+struct __test_metadata;
+struct __fixture_variant_metadata;
+
+/* Contains all the information about a fixture. */
+struct __fixture_metadata {
+ const char *name;
+ struct __test_metadata *tests;
+ struct __fixture_variant_metadata *variant;
+ struct __fixture_metadata *prev, *next;
+} _fixture_global __attribute__((unused)) = {
+ .name = "global",
+ .prev = &_fixture_global,
+};
+
+static struct __fixture_metadata *__fixture_list = &_fixture_global;
+static int __constructor_order;
+
+#define _CONSTRUCTOR_ORDER_FORWARD 1
+#define _CONSTRUCTOR_ORDER_BACKWARD -1
+
+static inline void __register_fixture(struct __fixture_metadata *f)
+{
+ __LIST_APPEND(__fixture_list, f);
+}
+
+struct __fixture_variant_metadata {
+ const char *name;
+ const void *data;
+ struct __fixture_variant_metadata *prev, *next;
+};
+
+static inline void
+__register_fixture_variant(struct __fixture_metadata *f,
+ struct __fixture_variant_metadata *variant)
+{
+ __LIST_APPEND(f->variant, variant);
+}
+
+/* Contains all the information for test execution and status checking. */
+struct __test_metadata {
+ const char *name;
+ void (*fn)(struct __test_metadata *,
+ struct __fixture_variant_metadata *);
+ pid_t pid; /* pid of test when being run */
+ struct __fixture_metadata *fixture;
+ int termsig;
+ int passed;
+ int skip; /* did SKIP get used? */
+ int trigger; /* extra handler after the evaluation */
+ int timeout; /* seconds to wait for test timeout */
+ bool timed_out; /* did this test timeout instead of exiting? */
+ __u8 step;
+ bool no_print; /* manual trigger when TH_LOG_STREAM is not available */
+ struct __test_results *results;
+ struct __test_metadata *prev, *next;
+};
+
+/*
+ * Since constructors are called in reverse order, reverse the test
+ * list so tests are run in source declaration order.
+ * https://gcc.gnu.org/onlinedocs/gccint/Initialization.html
+ * However, it seems not all toolchains do this correctly, so use
+ * __constructor_order to detect which direction is called first
+ * and adjust list building logic to get things running in the right
+ * direction.
+ */
+static inline void __register_test(struct __test_metadata *t)
+{
+ __LIST_APPEND(t->fixture->tests, t);
+}
+
+/*
+ * Terminate the calling process when for_realz is non-zero: with
+ * _exit(step) if no_print is set, otherwise with abort().
+ * Returns 0 when for_realz is zero; OPTIONAL_HANDLER() stores that value
+ * back into _metadata->trigger, which ends its for loop.
+ */
+static inline int __bail(int for_realz, bool no_print, __u8 step)
+{
+ if (for_realz) {
+ if (no_print)
+ _exit(step);
+ abort();
+ }
+ return 0;
+}
+
+struct __test_metadata *__active_test;
+/*
+ * SIGALRM handler installed by __wait_for_test().
+ *
+ * Marks the test recorded in __active_test as timed out and sends SIGKILL
+ * to the process group whose id is t->pid. Aborts when no test is active
+ * or when the delivered signal is not SIGALRM.
+ */
+static void __timeout_handler(int sig, siginfo_t *info, void *ucontext)
+{
+ struct __test_metadata *t = __active_test;
+
+ /* Sanity check handler execution environment. */
+ if (!t) {
+ fprintf(TH_LOG_STREAM,
+ "# no active test in SIGALRM handler!?\n");
+ abort();
+ }
+ if (sig != SIGALRM || sig != info->si_signo) {
+ fprintf(TH_LOG_STREAM,
+ "# %s: SIGALRM handler caught signal %d!?\n",
+ t->name, sig != SIGALRM ? sig : info->si_signo);
+ abort();
+ }
+
+ t->timed_out = true;
+ /* A negative pid makes kill() signal the whole process group. */
+ kill(-(t->pid), SIGKILL);
+}
+
+/*
+ * Wait for the forked test child t->pid and translate the way it ended
+ * into t->passed and t->skip.
+ *
+ * A SIGALRM handler is installed for the duration of the wait and an alarm
+ * of t->timeout seconds is armed; when it fires the handler sets
+ * t->timed_out and SIGKILLs the child's process group.
+ *
+ * Result mapping:
+ *  - timed out                          -> failed
+ *  - exit status 255                    -> passed and skipped
+ *  - normal exit while a signal was expected (t->termsig != -1) -> failed
+ *  - exit status 0                      -> passed
+ *  - any other exit status              -> failed, reported as the step
+ *  - killed by SIGABRT                  -> failed (assertion)
+ *  - killed by t->termsig               -> passed
+ *  - anything else                      -> failed
+ */
+void __wait_for_test(struct __test_metadata *t)
+{
+	struct sigaction action = {
+		.sa_sigaction = __timeout_handler,
+		.sa_flags = SA_SIGINFO,
+	};
+	struct sigaction saved_action;
+	int status = 0;
+	int wait_errno;
+	pid_t waited;
+
+	if (sigaction(SIGALRM, &action, &saved_action)) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"# %s: unable to install SIGALRM handler\n",
+			t->name);
+		return;
+	}
+	__active_test = t;
+	t->timed_out = false;
+	alarm(t->timeout);
+	/*
+	 * The handler is installed without SA_RESTART, so a caught signal
+	 * makes waitpid() fail with EINTR and leaves status untouched.
+	 * Retry unless the interruption was the timeout itself.
+	 */
+	do {
+		waited = waitpid(t->pid, &status, 0);
+	} while (waited < 0 && errno == EINTR && !t->timed_out);
+	wait_errno = errno;
+	alarm(0);
+	/* No alarm is pending any more, so the handler cannot run again. */
+	__active_test = NULL;
+
+	if (t->timed_out && waited < 0) {
+		/*
+		 * The handler SIGKILLed the child's process group, but the
+		 * interrupted waitpid() did not collect the child. Kill it
+		 * directly as well (covers a child that left the group) and
+		 * reap it so it does not linger as a zombie.
+		 */
+		kill(t->pid, SIGKILL);
+		while (waitpid(t->pid, &status, 0) < 0 && errno == EINTR)
+			;
+	}
+
+	if (sigaction(SIGALRM, &saved_action, NULL)) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"# %s: unable to uninstall SIGALRM handler\n",
+			t->name);
+		return;
+	}
+
+	if (t->timed_out) {
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"# %s: Test terminated by timeout\n", t->name);
+	} else if (waited < 0) {
+		/* status is meaningless when waitpid() itself failed. */
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"# %s: unable to wait for test: %s\n",
+			t->name, strerror(wait_errno));
+	} else if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) == 255) {
+			/* SKIP */
+			t->passed = 1;
+			t->skip = 1;
+		} else if (t->termsig != -1) {
+			t->passed = 0;
+			fprintf(TH_LOG_STREAM,
+				"# %s: Test exited normally instead of by signal (code: %d)\n",
+				t->name,
+				WEXITSTATUS(status));
+		} else {
+			switch (WEXITSTATUS(status)) {
+			/* Success */
+			case 0:
+				t->passed = 1;
+				break;
+			/* Other failure, assume step report. */
+			default:
+				t->passed = 0;
+				fprintf(TH_LOG_STREAM,
+					"# %s: Test failed at step #%d\n",
+					t->name,
+					WEXITSTATUS(status));
+			}
+		}
+	} else if (WIFSIGNALED(status)) {
+		t->passed = 0;
+		if (WTERMSIG(status) == SIGABRT) {
+			fprintf(TH_LOG_STREAM,
+				"# %s: Test terminated by assertion\n",
+				t->name);
+		} else if (WTERMSIG(status) == t->termsig) {
+			t->passed = 1;
+		} else {
+			fprintf(TH_LOG_STREAM,
+				"# %s: Test terminated unexpectedly by signal %d\n",
+				t->name,
+				WTERMSIG(status));
+		}
+	} else {
+		/* Neither exited nor signaled: never report this as a pass. */
+		t->passed = 0;
+		fprintf(TH_LOG_STREAM,
+			"# %s: Test ended in some other way [%u]\n",
+			t->name,
+			status);
+	}
+}
+
+/*
+ * Run one test in a forked child and report its result.
+ *
+ * @f:       fixture the test belongs to (only its name is used here)
+ * @variant: fixture variant handed to the test function
+ * @t:       test to run; t->results must already point at the result
+ *           buffer, since its reason string is cleared and read here
+ *
+ * The child exits with 255 when the test was skipped, 0 when it passed and
+ * t->step otherwise; the parent decodes that in __wait_for_test().
+ */
+void __run_test(struct __fixture_metadata *f,
+ struct __fixture_variant_metadata *variant,
+ struct __test_metadata *t)
+{
+ /* reset test struct */
+ t->passed = 1;
+ t->skip = 0;
+ t->trigger = 0;
+ t->step = 1;
+ t->no_print = 0;
+ memset(t->results->reason, 0, sizeof(t->results->reason));
+
+ ksft_print_msg(" RUN %s%s%s.%s ...\n",
+ f->name, variant->name[0] ? "." : "", variant->name, t->name);
+
+ /* Make sure output buffers are flushed before fork */
+ fflush(stdout);
+ fflush(stderr);
+
+ t->pid = fork();
+ if (t->pid < 0) {
+ ksft_print_msg("ERROR SPAWNING TEST CHILD\n");
+ t->passed = 0;
+ } else if (t->pid == 0) {
+ /*
+ * Child: the timeout handler signals the process group -(t->pid),
+ * hence the setpgrp() before running the test.
+ */
+ setpgrp();
+ t->fn(t, variant);
+ if (t->skip)
+ _exit(255);
+ /* Pass is exit 0 */
+ if (t->passed)
+ _exit(0);
+ /* Something else happened, report the step. */
+ _exit(t->step);
+ } else {
+ /* Parent: collect the child and fill in t->passed / t->skip. */
+ __wait_for_test(t);
+ }
+ ksft_print_msg(" %4s %s%s%s.%s\n", t->passed ? "OK" : "FAIL",
+ f->name, variant->name[0] ? "." : "", variant->name, t->name);
+
+ /* A skipped test reports its skip reason instead of its name. */
+ if (t->skip)
+ ksft_test_result_skip("%s\n", t->results->reason[0] ?
+ t->results->reason : "unknown");
+ else
+ ksft_test_result(t->passed, "%s%s%s.%s\n",
+ f->name, variant->name[0] ? "." : "", variant->name, t->name);
+}
+
+/*
+ * Run every registered test once per variant of its fixture (or once with
+ * an unnamed variant when the fixture has none) and report the totals.
+ *
+ * @argc, @argv: unused.
+ *
+ * Finishes through ksft_exit(), passing whether all tests passed. Returns
+ * KSFT_FAIL only when the shared result buffer cannot be mapped.
+ */
+static int test_harness_run(int __attribute__((unused)) argc,
+			    char __attribute__((unused)) **argv)
+{
+	struct __fixture_variant_metadata no_variant = { .name = "", };
+	struct __fixture_variant_metadata *v;
+	struct __fixture_metadata *f;
+	struct __test_results *results;
+	struct __test_metadata *t;
+	int ret = 0;
+	unsigned int case_count = 0, test_count = 0;
+	unsigned int count = 0;
+	unsigned int pass_count = 0;
+
+	/* First pass: count cases (fixture x variant) and tests for the plan. */
+	for (f = __fixture_list; f; f = f->next) {
+		for (v = f->variant ?: &no_variant; v; v = v->next) {
+			case_count++;
+			for (t = f->tests; t; t = t->next)
+				test_count++;
+		}
+	}
+
+	/*
+	 * The result buffer is a shared mapping: the forked test child
+	 * writes the SKIP reason into it and the parent reads it back.
+	 */
+	results = mmap(NULL, sizeof(*results), PROT_READ | PROT_WRITE,
+		       MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+	if (results == MAP_FAILED) {
+		/* Never hand MAP_FAILED to the tests as a valid buffer. */
+		ksft_print_msg("unable to map shared test results: %s\n",
+			       strerror(errno));
+		return KSFT_FAIL;
+	}
+
+	ksft_print_header();
+	ksft_set_plan(test_count);
+	ksft_print_msg("Starting %u tests from %u test cases.\n",
+		       test_count, case_count);
+	for (f = __fixture_list; f; f = f->next) {
+		for (v = f->variant ?: &no_variant; v; v = v->next) {
+			for (t = f->tests; t; t = t->next) {
+				count++;
+				t->results = results;
+				__run_test(f, v, t);
+				t->results = NULL;
+				if (t->passed)
+					pass_count++;
+				else
+					ret = 1;
+			}
+		}
+	}
+	munmap(results, sizeof(*results));
+
+	ksft_print_msg("%s: %u / %u tests passed.\n", ret ? "FAILED" : "PASSED",
+		       pass_count, count);
+	ksft_exit(ret == 0);
+
+	/* unreachable */
+	return KSFT_FAIL;
+}
+
+static void __attribute__((constructor)) __constructor_order_first(void)
+{
+ if (!__constructor_order)
+ __constructor_order = _CONSTRUCTOR_ORDER_FORWARD;
+}
+
+#endif /* __KSELFTEST_HARNESS_H */
diff --git a/tools/testing/selftests/kselftest_install.sh b/tools/testing/selftests/kselftest_install.sh
new file mode 100755
index 000000000..407af7da7
--- /dev/null
+++ b/tools/testing/selftests/kselftest_install.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Kselftest Install
+# Install kselftest tests
+# Author: Shuah Khan <shuahkh@osg.samsung.com>
+# Copyright (C) 2015 Samsung Electronics Co., Ltd.
+
+# Install the kselftest tests by running "make install".
+#
+# Must be run from a directory whose basename is "selftests".
+# Arguments: $1 - optional existing directory to install into; without it
+#                 the tests go to ./kselftest_install under the current
+#                 directory.
+# Exits 1 when run from the wrong directory or when $1 is not a directory;
+# otherwise the status is that of "make install".
+main()
+{
+	base_dir=$(pwd)
+	install_dir="$base_dir"/kselftest_install
+
+	# Make sure we're in the selftests top-level directory.
+	# The substitution is quoted: unquoted, a directory name containing
+	# whitespace made "[" fail and the check was silently skipped.
+	if [ "$(basename "$base_dir")" != "selftests" ]; then
+		echo "$0: Please run it in selftests directory ..."
+		exit 1;
+	fi
+
+	# Only allow installation into an existing location.
+	if [ "$#" -eq 0 ]; then
+		echo "$0: Installing in default location - $install_dir ..."
+	elif [ ! -d "$1" ]; then
+		echo "$0: $1 doesn't exist!!"
+		exit 1;
+	else
+		install_dir="$1"
+		echo "$0: Installing in specified location - $install_dir ..."
+	fi
+
+	# Build tests
+	KSFT_INSTALL_PATH="$install_dir" make install
+}
+
+main "$@"
diff --git a/tools/testing/selftests/kselftest_module.h b/tools/testing/selftests/kselftest_module.h
new file mode 100644
index 000000000..e8eafaf09
--- /dev/null
+++ b/tools/testing/selftests/kselftest_module.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef __KSELFTEST_MODULE_H
+#define __KSELFTEST_MODULE_H
+
+#include <linux/module.h>
+
+/*
+ * Test framework for writing test modules to be loaded by kselftest.
+ * See Documentation/dev-tools/kselftest.rst for an example test module.
+ */
+
+#define KSTM_MODULE_GLOBALS() \
+static unsigned int total_tests __initdata; \
+static unsigned int failed_tests __initdata
+
+#define KSTM_CHECK_ZERO(x) do { \
+ total_tests++; \
+ if (x) { \
+ pr_warn("TC failed at %s:%d\n", __func__, __LINE__); \
+ failed_tests++; \
+ } \
+} while (0)
+
+/*
+ * Log the test totals and turn them into a return code.
+ *
+ * @total_tests:  number of checks that were run
+ * @failed_tests: number of checks that failed
+ *
+ * Returns 0 when nothing failed, -EINVAL otherwise.
+ */
+static inline int kstm_report(unsigned int total_tests, unsigned int failed_tests)
+{
+ if (failed_tests == 0)
+ pr_info("all %u tests passed\n", total_tests);
+ else
+ pr_warn("failed %u out of %u tests\n", failed_tests, total_tests);
+
+ return failed_tests ? -EINVAL : 0;
+}
+
+#define KSTM_MODULE_LOADERS(__module) \
+static int __init __module##_init(void) \
+{ \
+ pr_info("loaded.\n"); \
+ selftest(); \
+ return kstm_report(total_tests, failed_tests); \
+} \
+static void __exit __module##_exit(void) \
+{ \
+ pr_info("unloaded.\n"); \
+} \
+module_init(__module##_init); \
+module_exit(__module##_exit)
+
+#endif /* __KSELFTEST_MODULE_H */
diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore
new file mode 100644
index 000000000..7a2c242b7
--- /dev/null
+++ b/tools/testing/selftests/kvm/.gitignore
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/aarch64/get-reg-list
+/aarch64/get-reg-list-sve
+/s390x/memop
+/s390x/resets
+/s390x/sync_regs_test
+/x86_64/cr4_cpuid_sync_test
+/x86_64/debug_regs
+/x86_64/evmcs_test
+/x86_64/kvm_pv_test
+/x86_64/hyperv_cpuid
+/x86_64/mmio_warning_test
+/x86_64/platform_info_test
+/x86_64/set_sregs_test
+/x86_64/smm_test
+/x86_64/state_test
+/x86_64/user_msr_test
+/x86_64/vmx_preemption_timer_test
+/x86_64/svm_vmcall_test
+/x86_64/sync_regs_test
+/x86_64/vmx_apic_access_test
+/x86_64/vmx_close_while_nested_test
+/x86_64/vmx_dirty_log_test
+/x86_64/vmx_set_nested_state_test
+/x86_64/vmx_tsc_adjust_test
+/x86_64/xss_msr_test
+/clear_dirty_log_test
+/demand_paging_test
+/dirty_log_test
+/dirty_log_perf_test
+/kvm_create_max_vcpus
+/set_memory_region_test
+/steal_time
diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
new file mode 100644
index 000000000..3d14ef777
--- /dev/null
+++ b/tools/testing/selftests/kvm/Makefile
@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: GPL-2.0-only
+include ../../../../scripts/Kbuild.include
+
+all:
+
+top_srcdir = ../../../..
+KSFT_KHDR_INSTALL := 1
+
+# For cross-builds to work, UNAME_M has to map to ARCH and arch specific
+# directories and targets in this Makefile. "uname -m" doesn't map to
+# arch specific sub-directory names.
+#
+# UNAME_M variable is used to run the compiles pointing to the right arch
+# directories and build the right targets for these supported architectures.
+#
+# TEST_GEN_PROGS and LIBKVM are set using UNAME_M variable.
+# LINUX_TOOL_ARCH_INCLUDE is set using ARCH variable.
+#
+# x86_64 targets are named to include x86_64 as a suffix and directories
+# for includes are in x86_64 sub-directory. s390x and aarch64 follow the
+# same convention. "uname -m" doesn't result in the correct mapping for
+# s390x and aarch64.
+#
+# No change necessary for x86_64
+UNAME_M := $(shell uname -m)
+
+# Set UNAME_M for arm64 compile/install to work
+ifeq ($(ARCH),arm64)
+ UNAME_M := aarch64
+endif
+# Set UNAME_M for s390x compile/install to work
+ifeq ($(ARCH),s390)
+ UNAME_M := s390x
+endif
+
+LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/sparsebit.c lib/test_util.c
+LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c lib/x86_64/svm.c lib/x86_64/ucall.c lib/x86_64/handlers.S
+LIBKVM_aarch64 = lib/aarch64/processor.c lib/aarch64/ucall.c
+LIBKVM_s390x = lib/s390x/processor.c lib/s390x/ucall.c
+
+TEST_GEN_PROGS_x86_64 = x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test
+TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid
+TEST_GEN_PROGS_x86_64 += x86_64/kvm_pv_test
+TEST_GEN_PROGS_x86_64 += x86_64/mmio_warning_test
+TEST_GEN_PROGS_x86_64 += x86_64/platform_info_test
+TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test
+TEST_GEN_PROGS_x86_64 += x86_64/smm_test
+TEST_GEN_PROGS_x86_64 += x86_64/state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_preemption_timer_test
+TEST_GEN_PROGS_x86_64 += x86_64/svm_vmcall_test
+TEST_GEN_PROGS_x86_64 += x86_64/sync_regs_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_apic_access_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_dirty_log_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
+TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += x86_64/xss_msr_test
+TEST_GEN_PROGS_x86_64 += x86_64/debug_regs
+TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test
+TEST_GEN_PROGS_x86_64 += x86_64/user_msr_test
+TEST_GEN_PROGS_x86_64 += demand_paging_test
+TEST_GEN_PROGS_x86_64 += dirty_log_test
+TEST_GEN_PROGS_x86_64 += dirty_log_perf_test
+TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_x86_64 += set_memory_region_test
+TEST_GEN_PROGS_x86_64 += steal_time
+
+TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list
+TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve
+TEST_GEN_PROGS_aarch64 += demand_paging_test
+TEST_GEN_PROGS_aarch64 += dirty_log_test
+TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus
+TEST_GEN_PROGS_aarch64 += set_memory_region_test
+TEST_GEN_PROGS_aarch64 += steal_time
+
+TEST_GEN_PROGS_s390x = s390x/memop
+TEST_GEN_PROGS_s390x += s390x/resets
+TEST_GEN_PROGS_s390x += s390x/sync_regs_test
+TEST_GEN_PROGS_s390x += demand_paging_test
+TEST_GEN_PROGS_s390x += dirty_log_test
+TEST_GEN_PROGS_s390x += kvm_create_max_vcpus
+TEST_GEN_PROGS_s390x += set_memory_region_test
+
+TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
+LIBKVM += $(LIBKVM_$(UNAME_M))
+
+INSTALL_HDR_PATH = $(top_srcdir)/usr
+LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/
+LINUX_TOOL_INCLUDE = $(top_srcdir)/tools/include
+ifeq ($(ARCH),x86_64)
+LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/x86/include
+else
+LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include
+endif
+CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \
+ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \
+ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \
+ -I$(<D) -Iinclude/$(UNAME_M) -I..
+
+no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
+ $(CC) -Werror -no-pie -x c - -o "$$TMP", -no-pie)
+
+# On s390, build the testcases KVM-enabled
+pgste-option = $(call try-run, echo 'int main() { return 0; }' | \
+ $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste)
+
+
+LDFLAGS += -pthread $(no-pie-option) $(pgste-option)
+
+# After inclusion, $(OUTPUT) is defined and
+# $(TEST_GEN_PROGS) starts with $(OUTPUT)/
+include ../lib.mk
+
+# Build the shared KVM test library: split the LIBKVM sources by suffix and
+# map each source to an object file under $(OUTPUT).
+STATIC_LIBS := $(OUTPUT)/libkvm.a
+LIBKVM_C := $(filter %.c,$(LIBKVM))
+LIBKVM_S := $(filter %.S,$(LIBKVM))
+LIBKVM_C_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_C))
+LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S))
+# Have the clean target also remove the library objects, the archive and
+# the files generated by the cscope target.
+EXTRA_CLEAN += $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(STATIC_LIBS) cscope.*
+
+# ':=' runs the mkdir when the Makefile is parsed, so the object directories
+# exist before any rule runs; the value assigned to 'x' is not used.
+x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ))))
+# Static pattern rule: compile each library C source into its object.
+$(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+# Same for the assembly sources; they go through $(CC) with the same flags.
+$(LIBKVM_S_OBJ): $(OUTPUT)/%.o: %.S
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@
+
+# Archive all library objects into libkvm.a.
+LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)
+$(OUTPUT)/libkvm.a: $(LIBKVM_OBJS)
+ $(AR) crs $@ $^
+
+# Create the output directories of the test programs at parse time as well.
+x := $(shell mkdir -p $(sort $(dir $(TEST_GEN_PROGS))))
+# Every test program depends on the static library.
+# NOTE(review): the link recipe itself presumably comes from ../lib.mk - confirm.
+all: $(STATIC_LIBS)
+$(TEST_GEN_PROGS): $(STATIC_LIBS)
+
+# cscope: regenerate the cscope file list and database for the KVM selftests.
+# include_paths is a target-specific variable listing where headers are
+# searched. The recipe removes stale cscope.* files, collects every '*.h'
+# under include_paths and every '*.c' under the current directory, rewrites
+# the paths with 'realpath --relative-base=$(PWD)', de-duplicates them with
+# 'sort -u' into cscope.files, and finally runs 'cscope -b'.
+cscope: include_paths = $(LINUX_TOOL_INCLUDE) $(LINUX_HDR_PATH) include lib ..
+cscope:
+ $(RM) cscope.*
+ (find $(include_paths) -name '*.h' \
+ -exec realpath --relative-base=$(PWD) {} \;; \
+ find . -name '*.c' \
+ -exec realpath --relative-base=$(PWD) {} \;) | sort -u > cscope.files
+ cscope -b
diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
new file mode 100644
index 000000000..efba76682
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list-sve.c
@@ -0,0 +1,3 @@
+// SPDX-License-Identifier: GPL-2.0
+#define REG_LIST_SVE
+#include "get-reg-list.c"
diff --git a/tools/testing/selftests/kvm/aarch64/get-reg-list.c b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
new file mode 100644
index 000000000..33218a395
--- /dev/null
+++ b/tools/testing/selftests/kvm/aarch64/get-reg-list.c
@@ -0,0 +1,841 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Check for KVM_GET_REG_LIST regressions.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ *
+ * When attempting to migrate from a host with an older kernel to a host
+ * with a newer kernel we allow the newer kernel on the destination to
+ * list new registers with get-reg-list. We assume they'll be unused, at
+ * least until the guest reboots, and so they're relatively harmless.
+ * However, if the destination host with the newer kernel is missing
+ * registers which the source host with the older kernel has, then that's
+ * a regression in get-reg-list. This test checks for that regression by
+ * checking the current list against a blessed list. We should never have
+ * missing registers, but if new ones appear then they can probably be
+ * added to the blessed list. A completely new blessed list can be created
+ * by running the test with the --list command line argument.
+ *
+ * Note, the blessed list should be created from the oldest possible
+ * kernel. We can't go older than v4.15, though, because that's the first
+ * release to expose the ID system registers in KVM_GET_REG_LIST, see
+ * commit 93390c0a1b20 ("arm64: KVM: Hide unsupported AArch64 CPU features
+ * from guests"). Also, one must use the --core-reg-fixup command line
+ * option when running on an older kernel that doesn't include commit
+ * df205b5c6328 ("KVM: arm64: Filter out invalid core register IDs in
+ * KVM_GET_REG_LIST").
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "test_util.h"
+#include "processor.h"
+
+/*
+ * Compile-time switch between the two flavours of this test. The SVE
+ * flavour is built by defining REG_LIST_SVE before this file is compiled.
+ */
+#ifdef REG_LIST_SVE
+#define reg_list_sve() (true)
+#else
+#define reg_list_sve() (false)
+#endif
+
+/*
+ * The architecture, size and coprocessor fields of a register id. Masking
+ * them off leaves the coprocessor-specific part of the id.
+ */
+#define REG_MASK (KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_COPROC_MASK)
+
+/* Iterate @i over every index of the file-scope reg_list. */
+#define for_each_reg(i) \
+ for ((i) = 0; (i) < reg_list->n; ++(i))
+
+/*
+ * Iterate @i over the indices of blessed_reg whose register is absent from
+ * reg_list. Expands to "for ... if ...", so the statement that follows runs
+ * only for the missing registers.
+ */
+#define for_each_missing_reg(i) \
+ for ((i) = 0; (i) < blessed_n; ++(i)) \
+ if (!find_reg(reg_list->reg, reg_list->n, blessed_reg[i]))
+
+/*
+ * Iterate @i over the indices of reg_list whose register is absent from
+ * blessed_reg. Same "for ... if ..." shape as for_each_missing_reg().
+ */
+#define for_each_new_reg(i) \
+ for ((i) = 0; (i) < reg_list->n; ++(i)) \
+ if (!find_reg(blessed_reg, blessed_n, reg_list->reg[i]))
+
+
+static struct kvm_reg_list *reg_list;
+
+static __u64 base_regs[], vregs[], sve_regs[], rejects_set[];
+static __u64 base_regs_n, vregs_n, sve_regs_n, rejects_set_n;
+static __u64 *blessed_reg, blessed_n;
+
+/*
+ * Linear search for @reg in the first @nr_regs entries of @regs.
+ *
+ * Returns true when an entry equal to @reg exists, false otherwise
+ * (including when @nr_regs is zero).
+ */
+static bool find_reg(__u64 regs[], __u64 nr_regs, __u64 reg)
+{
+	/*
+	 * Index with the same unsigned 64-bit type as the count, so the loop
+	 * condition is not a signed/unsigned comparison and the index cannot
+	 * overflow before reaching nr_regs.
+	 */
+	__u64 i;
+
+	for (i = 0; i < nr_regs; ++i)
+		if (reg == regs[i])
+			return true;
+	return false;
+}
+
+/*
+ * Return a copy of @template in which the first "##" marker is replaced by
+ * the decimal representation of @index.
+ *
+ * The result is heap allocated, except that @template itself is returned
+ * when it contains no "##" marker. Callers in this file only print the
+ * string and never free it. Allocation failure terminates the program.
+ */
+static const char *str_with_index(const char *template, __u64 index)
+{
+	const char *marker = strstr(template, "##");
+	size_t size;
+	char *str;
+
+	/* Nothing to substitute. */
+	if (!marker)
+		return template;
+
+	/*
+	 * Size the buffer for the worst case rather than relying on the
+	 * index fitting into the two characters of the marker: "##" is
+	 * replaced by at most 20 decimal digits of a 64-bit value, plus
+	 * the terminating NUL.
+	 */
+	size = strlen(template) - 2 + 20 + 1;
+	str = malloc(size);
+	if (!str) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	/* prefix before the marker + index + remainder after the marker */
+	snprintf(str, size, "%.*s%llu%s", (int)(marker - template), template,
+		 (unsigned long long)index, marker + 2);
+
+	return (const char *)str;
+}
+
+/*
+ * Distance, in core-register offset units, between consecutive elements of
+ * the regs.regs[], spsr[] and fp_regs.vregs[] arrays. Used to turn an offset
+ * within one of those arrays back into an element index.
+ * NOTE(review): presumably the unit is a 32-bit word, so a 64-bit register
+ * spans 2 and a 128-bit one spans 4 - confirm against KVM_REG_ARM_CORE_REG().
+ */
+#define CORE_REGS_XX_NR_WORDS 2
+#define CORE_SPSR_XX_NR_WORDS 2
+#define CORE_FPREGS_XX_NR_WORDS 4
+
+/*
+ * Translate the offset part of a core register @id into the source-level
+ * KVM_REG_ARM_CORE_REG(...) expression naming it.
+ *
+ * Array members are rendered through str_with_index(); all other members
+ * return a string literal. An offset that matches no known member of
+ * struct kvm_regs is reported with TEST_FAIL().
+ */
+static const char *core_id_to_str(__u64 id)
+{
+ __u64 core_off = id & ~REG_MASK, idx;
+
+ /*
+ * core_off is the offset into struct kvm_regs
+ */
+ switch (core_off) {
+ case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
+ KVM_REG_ARM_CORE_REG(regs.regs[30]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(regs.regs[0])) / CORE_REGS_XX_NR_WORDS;
+ TEST_ASSERT(idx < 31, "Unexpected regs.regs index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(regs.regs[##])", idx);
+ case KVM_REG_ARM_CORE_REG(regs.sp):
+ return "KVM_REG_ARM_CORE_REG(regs.sp)";
+ case KVM_REG_ARM_CORE_REG(regs.pc):
+ return "KVM_REG_ARM_CORE_REG(regs.pc)";
+ case KVM_REG_ARM_CORE_REG(regs.pstate):
+ return "KVM_REG_ARM_CORE_REG(regs.pstate)";
+ case KVM_REG_ARM_CORE_REG(sp_el1):
+ return "KVM_REG_ARM_CORE_REG(sp_el1)";
+ case KVM_REG_ARM_CORE_REG(elr_el1):
+ return "KVM_REG_ARM_CORE_REG(elr_el1)";
+ case KVM_REG_ARM_CORE_REG(spsr[0]) ...
+ KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(spsr[0])) / CORE_SPSR_XX_NR_WORDS;
+ TEST_ASSERT(idx < KVM_NR_SPSR, "Unexpected spsr index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(spsr[##])", idx);
+ case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
+ KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
+ idx = (core_off - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / CORE_FPREGS_XX_NR_WORDS;
+ TEST_ASSERT(idx < 32, "Unexpected fp_regs.vregs index: %lld", idx);
+ return str_with_index("KVM_REG_ARM_CORE_REG(fp_regs.vregs[##])", idx);
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
+ return "KVM_REG_ARM_CORE_REG(fp_regs.fpsr)";
+ case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
+ return "KVM_REG_ARM_CORE_REG(fp_regs.fpcr)";
+ }
+
+ /* No known member of struct kvm_regs matched the offset. */
+ TEST_FAIL("Unknown core reg id: 0x%llx", id);
+ return NULL;
+}
+
+/*
+ * Translate an SVE register @id into the source-level macro expression
+ * naming it: KVM_REG_ARM64_SVE_VLS, KVM_REG_ARM64_SVE_ZREG(n, 0),
+ * KVM_REG_ARM64_SVE_PREG(n, 0) or KVM_REG_ARM64_SVE_FFR(0).
+ *
+ * Only slice 0 is expected. Ids with unexpected bits set, or ids that fall
+ * outside every known SVE register range, are reported as test failures
+ * instead of being handed back to the caller as NULL.
+ */
+static const char *sve_id_to_str(__u64 id)
+{
+	__u64 sve_off, n, i;
+
+	if (id == KVM_REG_ARM64_SVE_VLS)
+		return "KVM_REG_ARM64_SVE_VLS";
+
+	/* Drop the generic id fields and the low 5 bits holding the slice. */
+	sve_off = id & ~(REG_MASK | ((1ULL << 5) - 1));
+	i = id & (KVM_ARM64_SVE_MAX_SLICES - 1);
+
+	TEST_ASSERT(i == 0, "Currently we don't expect slice > 0, reg id 0x%llx", id);
+
+	switch (sve_off) {
+	case KVM_REG_ARM64_SVE_ZREG_BASE ...
+	     KVM_REG_ARM64_SVE_ZREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_ZREGS - 1:
+		n = (id >> 5) & (KVM_ARM64_SVE_NUM_ZREGS - 1);
+		TEST_ASSERT(id == KVM_REG_ARM64_SVE_ZREG(n, 0),
+			    "Unexpected bits set in SVE ZREG id: 0x%llx", id);
+		return str_with_index("KVM_REG_ARM64_SVE_ZREG(##, 0)", n);
+	case KVM_REG_ARM64_SVE_PREG_BASE ...
+	     KVM_REG_ARM64_SVE_PREG_BASE + (1ULL << 5) * KVM_ARM64_SVE_NUM_PREGS - 1:
+		n = (id >> 5) & (KVM_ARM64_SVE_NUM_PREGS - 1);
+		TEST_ASSERT(id == KVM_REG_ARM64_SVE_PREG(n, 0),
+			    "Unexpected bits set in SVE PREG id: 0x%llx", id);
+		return str_with_index("KVM_REG_ARM64_SVE_PREG(##, 0)", n);
+	case KVM_REG_ARM64_SVE_FFR_BASE:
+		TEST_ASSERT(id == KVM_REG_ARM64_SVE_FFR(0),
+			    "Unexpected bits set in SVE FFR id: 0x%llx", id);
+		return "KVM_REG_ARM64_SVE_FFR(0)";
+	}
+
+	/*
+	 * Fail loudly, as core_id_to_str() does: the caller feeds the result
+	 * straight to printf("%s"), so a silent NULL must not reach it.
+	 */
+	TEST_FAIL("Unknown SVE reg id: 0x%llx", id);
+	return NULL;
+}
+
+/*
+ * Print register @id to stdout as one tab-indented, comma-terminated line of
+ * C source, in the form used by the blessed register lists of this file.
+ *
+ * The id must carry KVM_REG_ARM64. Its size field selects the
+ * KVM_REG_SIZE_* name and its coprocessor field selects the output format.
+ * Unknown sizes or coprocessor types, and ids with unexpected bits set, are
+ * reported through TEST_ASSERT()/TEST_FAIL().
+ */
+static void print_reg(__u64 id)
+{
+ unsigned op0, op1, crn, crm, op2;
+ const char *reg_size = NULL;
+
+ TEST_ASSERT((id & KVM_REG_ARCH_MASK) == KVM_REG_ARM64,
+ "KVM_REG_ARM64 missing in reg id: 0x%llx", id);
+
+ /* Map the size field to the name of its KVM_REG_SIZE_* constant. */
+ switch (id & KVM_REG_SIZE_MASK) {
+ case KVM_REG_SIZE_U8:
+ reg_size = "KVM_REG_SIZE_U8";
+ break;
+ case KVM_REG_SIZE_U16:
+ reg_size = "KVM_REG_SIZE_U16";
+ break;
+ case KVM_REG_SIZE_U32:
+ reg_size = "KVM_REG_SIZE_U32";
+ break;
+ case KVM_REG_SIZE_U64:
+ reg_size = "KVM_REG_SIZE_U64";
+ break;
+ case KVM_REG_SIZE_U128:
+ reg_size = "KVM_REG_SIZE_U128";
+ break;
+ case KVM_REG_SIZE_U256:
+ reg_size = "KVM_REG_SIZE_U256";
+ break;
+ case KVM_REG_SIZE_U512:
+ reg_size = "KVM_REG_SIZE_U512";
+ break;
+ case KVM_REG_SIZE_U1024:
+ reg_size = "KVM_REG_SIZE_U1024";
+ break;
+ case KVM_REG_SIZE_U2048:
+ reg_size = "KVM_REG_SIZE_U2048";
+ break;
+ default:
+ TEST_FAIL("Unexpected reg size: 0x%llx in reg id: 0x%llx",
+ (id & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT, id);
+ }
+
+ /* The coprocessor field decides how the rest of the id is rendered. */
+ switch (id & KVM_REG_ARM_COPROC_MASK) {
+ case KVM_REG_ARM_CORE:
+ printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_CORE | %s,\n", reg_size, core_id_to_str(id));
+ break;
+ case KVM_REG_ARM_DEMUX:
+ TEST_ASSERT(!(id & ~(REG_MASK | KVM_REG_ARM_DEMUX_ID_MASK | KVM_REG_ARM_DEMUX_VAL_MASK)),
+ "Unexpected bits set in DEMUX reg id: 0x%llx", id);
+ printf("\tKVM_REG_ARM64 | %s | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | %lld,\n",
+ reg_size, id & KVM_REG_ARM_DEMUX_VAL_MASK);
+ break;
+ case KVM_REG_ARM64_SYSREG:
+ /* Split the id into its encoding fields, then check they rebuild it exactly. */
+ op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >> KVM_REG_ARM64_SYSREG_OP0_SHIFT;
+ op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >> KVM_REG_ARM64_SYSREG_OP1_SHIFT;
+ crn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >> KVM_REG_ARM64_SYSREG_CRN_SHIFT;
+ crm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >> KVM_REG_ARM64_SYSREG_CRM_SHIFT;
+ op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >> KVM_REG_ARM64_SYSREG_OP2_SHIFT;
+ TEST_ASSERT(id == ARM64_SYS_REG(op0, op1, crn, crm, op2),
+ "Unexpected bits set in SYSREG reg id: 0x%llx", id);
+ printf("\tARM64_SYS_REG(%d, %d, %d, %d, %d),\n", op0, op1, crn, crm, op2);
+ break;
+ case KVM_REG_ARM_FW:
+ TEST_ASSERT(id == KVM_REG_ARM_FW_REG(id & 0xffff),
+ "Unexpected bits set in FW reg id: 0x%llx", id);
+ printf("\tKVM_REG_ARM_FW_REG(%lld),\n", id & 0xffff);
+ break;
+ case KVM_REG_ARM64_SVE:
+ /* SVE registers are only legitimate in the SVE build of this test. */
+ if (reg_list_sve())
+ printf("\t%s,\n", sve_id_to_str(id));
+ else
+ TEST_FAIL("KVM_REG_ARM64_SVE is an unexpected coproc type in reg id: 0x%llx", id);
+ break;
+ default:
+ TEST_FAIL("Unexpected coproc type: 0x%llx in reg id: 0x%llx",
+ (id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT, id);
+ }
+}
+
+/*
+ * Older kernels listed each 32-bit word of CORE registers separately.
+ * For 64 and 128-bit registers we need to ignore the extra words. We
+ * also need to fixup the sizes, because the older kernels stated all
+ * registers were 64-bit, even when they weren't.
+ *
+ * Builds a filtered copy of the file-scope reg_list, frees the old list
+ * and makes reg_list point at the copy. Ids that are not CORE registers
+ * are copied unchanged.
+ */
+static void core_reg_fixup(void)
+{
+	struct kvm_reg_list *tmp;
+	__u64 id, core_off;
+	int i;
+
+	/* The fixed-up list can never be longer than the original one. */
+	tmp = calloc(1, sizeof(*tmp) + reg_list->n * sizeof(__u64));
+	TEST_ASSERT(tmp, "Failed to allocate the fixed-up register list");
+
+	for (i = 0; i < reg_list->n; ++i) {
+		id = reg_list->reg[i];
+
+		if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_CORE) {
+			tmp->reg[tmp->n++] = id;
+			continue;
+		}
+
+		core_off = id & ~REG_MASK;
+
+		switch (core_off) {
+		case 0x52: case 0xd2: case 0xd6:
+			/*
+			 * These offsets are pointing at padding.
+			 * We need to ignore them too.
+			 */
+			continue;
+		case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
+		     KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
+			/* Keep only the first of the four words of each vreg. */
+			if (core_off & 3)
+				continue;
+			id &= ~KVM_REG_SIZE_MASK;
+			id |= KVM_REG_SIZE_U128;
+			tmp->reg[tmp->n++] = id;
+			continue;
+		case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
+		case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
+			id &= ~KVM_REG_SIZE_MASK;
+			id |= KVM_REG_SIZE_U32;
+			tmp->reg[tmp->n++] = id;
+			continue;
+		default:
+			/* Keep only the first of the two words of the rest. */
+			if (core_off & 1)
+				continue;
+			tmp->reg[tmp->n++] = id;
+			break;
+		}
+	}
+
+	free(reg_list);
+	reg_list = tmp;
+}
+
+/*
+ * Request the SVE vcpu feature in @init when this is the SVE build of the
+ * test; otherwise leave @init untouched.
+ */
+static void prepare_vcpu_init(struct kvm_vcpu_init *init)
+{
+	if (!reg_list_sve())
+		return;
+
+	init->features[0] |= 1 << KVM_ARM_VCPU_SVE;
+}
+
+/*
+ * Issue KVM_ARM_VCPU_FINALIZE for the SVE feature on vcpu @vcpuid of @vm
+ * when this is the SVE build of the test; otherwise do nothing.
+ */
+static void finalize_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	int sve_feature = KVM_ARM_VCPU_SVE;
+
+	if (!reg_list_sve())
+		return;
+
+	vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_FINALIZE, &sve_feature);
+}
+
+/*
+ * Exit with KSFT_SKIP when this is the SVE build of the test and
+ * KVM_CAP_ARM_SVE is not reported. The capability is only queried in the
+ * SVE build.
+ */
+static void check_supported(void)
+{
+	if (!reg_list_sve())
+		return;
+	if (kvm_check_cap(KVM_CAP_ARM_SVE))
+		return;
+
+	fprintf(stderr, "SVE not available, skipping tests\n");
+	exit(KSFT_SKIP);
+}
+
+/*
+ * Test entry point.
+ *
+ * Command line options:
+ *   --core-reg-fixup  post-process the register list with core_reg_fixup()
+ *   --list            print the register list as C source and exit
+ * Unknown options are reported on stderr and ignored.
+ *
+ * Without --list, every listed register is read with KVM_GET_ONE_REG and
+ * written back with KVM_SET_ONE_REG (registers in rejects_set are instead
+ * expected to fail the write with EPERM). The list is then compared with
+ * the blessed list: new registers are only reported, while missing
+ * registers or any failed get/set/reject fail the final assertion.
+ */
+int main(int ac, char **av)
+{
+	struct kvm_vcpu_init init = { .target = -1, };
+	int new_regs = 0, missing_regs = 0, i;
+	int failed_get = 0, failed_set = 0, failed_reject = 0;
+	bool print_list = false, fixup_core_regs = false;
+	struct kvm_vm *vm;
+	__u64 *vec_regs;
+
+	check_supported();
+
+	for (i = 1; i < ac; ++i) {
+		if (strcmp(av[i], "--core-reg-fixup") == 0)
+			fixup_core_regs = true;
+		else if (strcmp(av[i], "--list") == 0)
+			print_list = true;
+		else
+			fprintf(stderr, "Ignoring unknown option: %s\n", av[i]);
+	}
+
+	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+	prepare_vcpu_init(&init);
+	aarch64_vcpu_add_default(vm, 0, &init, NULL);
+	finalize_vcpu(vm, 0);
+
+	reg_list = vcpu_get_reg_list(vm, 0);
+
+	if (fixup_core_regs)
+		core_reg_fixup();
+
+	if (print_list) {
+		putchar('\n');
+		for_each_reg(i)
+			print_reg(reg_list->reg[i]);
+		putchar('\n');
+		return 0;
+	}
+
+	/*
+	 * We only test that we can get the register and then write back the
+	 * same value. Some registers may allow other values to be written
+	 * back, but others only allow some bits to be changed, and at least
+	 * for ID registers set will fail if the value does not exactly match
+	 * what was returned by get. If registers that allow other values to
+	 * be written need to have the other values tested, then we should
+	 * create a new set of tests for those in a new independent test
+	 * executable.
+	 */
+	for_each_reg(i) {
+		/* Large enough for the biggest register size (2048 bits). */
+		uint8_t addr[2048 / 8];
+		struct kvm_one_reg reg = {
+			.id = reg_list->reg[i],
+			.addr = (__u64)&addr,
+		};
+		int ret;
+
+		ret = _vcpu_ioctl(vm, 0, KVM_GET_ONE_REG, &reg);
+		if (ret) {
+			/*
+			 * printf(), not puts(): puts() appends a newline,
+			 * which would separate the message from the register
+			 * that print_reg() prints right after it.
+			 */
+			printf("Failed to get ");
+			print_reg(reg.id);
+			putchar('\n');
+			++failed_get;
+		}
+
+		/* rejects_set registers are rejected after KVM_ARM_VCPU_FINALIZE */
+		if (find_reg(rejects_set, rejects_set_n, reg.id)) {
+			ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
+			if (ret != -1 || errno != EPERM) {
+				printf("Failed to reject (ret=%d, errno=%d) ", ret, errno);
+				print_reg(reg.id);
+				putchar('\n');
+				++failed_reject;
+			}
+			continue;
+		}
+
+		ret = _vcpu_ioctl(vm, 0, KVM_SET_ONE_REG, &reg);
+		if (ret) {
+			/* Same as above: keep message and register on one line. */
+			printf("Failed to set ");
+			print_reg(reg.id);
+			putchar('\n');
+			++failed_set;
+		}
+	}
+
+	/* The blessed list is base_regs followed by the SVE or the vreg list. */
+	if (reg_list_sve()) {
+		blessed_n = base_regs_n + sve_regs_n;
+		vec_regs = sve_regs;
+	} else {
+		blessed_n = base_regs_n + vregs_n;
+		vec_regs = vregs;
+	}
+
+	blessed_reg = calloc(blessed_n, sizeof(__u64));
+	for (i = 0; i < base_regs_n; ++i)
+		blessed_reg[i] = base_regs[i];
+	for (i = 0; i < blessed_n - base_regs_n; ++i)
+		blessed_reg[base_regs_n + i] = vec_regs[i];
+
+	for_each_new_reg(i)
+		++new_regs;
+
+	for_each_missing_reg(i)
+		++missing_regs;
+
+	if (new_regs || missing_regs) {
+		printf("Number blessed registers: %5lld\n", blessed_n);
+		printf("Number registers: %5lld\n", reg_list->n);
+	}
+
+	if (new_regs) {
+		printf("\nThere are %d new registers.\n"
+		       "Consider adding them to the blessed reg "
+		       "list with the following lines:\n\n", new_regs);
+		for_each_new_reg(i)
+			print_reg(reg_list->reg[i]);
+		putchar('\n');
+	}
+
+	if (missing_regs) {
+		printf("\nThere are %d missing registers.\n"
+		       "The following lines are missing registers:\n\n", missing_regs);
+		for_each_missing_reg(i)
+			print_reg(blessed_reg[i]);
+		putchar('\n');
+	}
+
+	/* New registers alone do not fail the test; everything else does. */
+	TEST_ASSERT(!missing_regs && !failed_get && !failed_set && !failed_reject,
+		    "There are %d missing registers; "
+		    "%d registers failed get; %d registers failed set; %d registers failed reject",
+		    missing_regs, failed_get, failed_set, failed_reject);
+
+	return 0;
+}
+
+/*
+ * The current blessed list was primed with the output of kernel version
+ * v4.15 with --core-reg-fixup and then later updated with new registers.
+ */
+static __u64 base_regs[] = {
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[5]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[6]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[7]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[8]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[9]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[10]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[11]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[12]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[13]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[14]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[15]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[16]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[17]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[18]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[19]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[20]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[21]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[22]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[23]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[24]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[25]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[26]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[27]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[28]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[29]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.regs[30]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.sp),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pc),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(regs.pstate),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(sp_el1),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(elr_el1),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U64 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(spsr[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpsr),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.fpcr),
+ KVM_REG_ARM_FW_REG(0),
+ KVM_REG_ARM_FW_REG(1),
+ KVM_REG_ARM_FW_REG(2),
+ ARM64_SYS_REG(3, 3, 14, 3, 1), /* CNTV_CTL_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 3, 2), /* CNTV_CVAL_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 0, 2),
+ ARM64_SYS_REG(3, 0, 0, 0, 0), /* MIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 0, 6), /* REVIDR_EL1 */
+ ARM64_SYS_REG(3, 1, 0, 0, 1), /* CLIDR_EL1 */
+ ARM64_SYS_REG(3, 1, 0, 0, 7), /* AIDR_EL1 */
+ ARM64_SYS_REG(3, 3, 0, 0, 1), /* CTR_EL0 */
+ ARM64_SYS_REG(2, 0, 0, 0, 4),
+ ARM64_SYS_REG(2, 0, 0, 0, 5),
+ ARM64_SYS_REG(2, 0, 0, 0, 6),
+ ARM64_SYS_REG(2, 0, 0, 0, 7),
+ ARM64_SYS_REG(2, 0, 0, 1, 4),
+ ARM64_SYS_REG(2, 0, 0, 1, 5),
+ ARM64_SYS_REG(2, 0, 0, 1, 6),
+ ARM64_SYS_REG(2, 0, 0, 1, 7),
+ ARM64_SYS_REG(2, 0, 0, 2, 0), /* MDCCINT_EL1 */
+ ARM64_SYS_REG(2, 0, 0, 2, 2), /* MDSCR_EL1 */
+ ARM64_SYS_REG(2, 0, 0, 2, 4),
+ ARM64_SYS_REG(2, 0, 0, 2, 5),
+ ARM64_SYS_REG(2, 0, 0, 2, 6),
+ ARM64_SYS_REG(2, 0, 0, 2, 7),
+ ARM64_SYS_REG(2, 0, 0, 3, 4),
+ ARM64_SYS_REG(2, 0, 0, 3, 5),
+ ARM64_SYS_REG(2, 0, 0, 3, 6),
+ ARM64_SYS_REG(2, 0, 0, 3, 7),
+ ARM64_SYS_REG(2, 0, 0, 4, 4),
+ ARM64_SYS_REG(2, 0, 0, 4, 5),
+ ARM64_SYS_REG(2, 0, 0, 4, 6),
+ ARM64_SYS_REG(2, 0, 0, 4, 7),
+ ARM64_SYS_REG(2, 0, 0, 5, 4),
+ ARM64_SYS_REG(2, 0, 0, 5, 5),
+ ARM64_SYS_REG(2, 0, 0, 5, 6),
+ ARM64_SYS_REG(2, 0, 0, 5, 7),
+ ARM64_SYS_REG(2, 0, 0, 6, 4),
+ ARM64_SYS_REG(2, 0, 0, 6, 5),
+ ARM64_SYS_REG(2, 0, 0, 6, 6),
+ ARM64_SYS_REG(2, 0, 0, 6, 7),
+ ARM64_SYS_REG(2, 0, 0, 7, 4),
+ ARM64_SYS_REG(2, 0, 0, 7, 5),
+ ARM64_SYS_REG(2, 0, 0, 7, 6),
+ ARM64_SYS_REG(2, 0, 0, 7, 7),
+ ARM64_SYS_REG(2, 0, 0, 8, 4),
+ ARM64_SYS_REG(2, 0, 0, 8, 5),
+ ARM64_SYS_REG(2, 0, 0, 8, 6),
+ ARM64_SYS_REG(2, 0, 0, 8, 7),
+ ARM64_SYS_REG(2, 0, 0, 9, 4),
+ ARM64_SYS_REG(2, 0, 0, 9, 5),
+ ARM64_SYS_REG(2, 0, 0, 9, 6),
+ ARM64_SYS_REG(2, 0, 0, 9, 7),
+ ARM64_SYS_REG(2, 0, 0, 10, 4),
+ ARM64_SYS_REG(2, 0, 0, 10, 5),
+ ARM64_SYS_REG(2, 0, 0, 10, 6),
+ ARM64_SYS_REG(2, 0, 0, 10, 7),
+ ARM64_SYS_REG(2, 0, 0, 11, 4),
+ ARM64_SYS_REG(2, 0, 0, 11, 5),
+ ARM64_SYS_REG(2, 0, 0, 11, 6),
+ ARM64_SYS_REG(2, 0, 0, 11, 7),
+ ARM64_SYS_REG(2, 0, 0, 12, 4),
+ ARM64_SYS_REG(2, 0, 0, 12, 5),
+ ARM64_SYS_REG(2, 0, 0, 12, 6),
+ ARM64_SYS_REG(2, 0, 0, 12, 7),
+ ARM64_SYS_REG(2, 0, 0, 13, 4),
+ ARM64_SYS_REG(2, 0, 0, 13, 5),
+ ARM64_SYS_REG(2, 0, 0, 13, 6),
+ ARM64_SYS_REG(2, 0, 0, 13, 7),
+ ARM64_SYS_REG(2, 0, 0, 14, 4),
+ ARM64_SYS_REG(2, 0, 0, 14, 5),
+ ARM64_SYS_REG(2, 0, 0, 14, 6),
+ ARM64_SYS_REG(2, 0, 0, 14, 7),
+ ARM64_SYS_REG(2, 0, 0, 15, 4),
+ ARM64_SYS_REG(2, 0, 0, 15, 5),
+ ARM64_SYS_REG(2, 0, 0, 15, 6),
+ ARM64_SYS_REG(2, 0, 0, 15, 7),
+ ARM64_SYS_REG(2, 4, 0, 7, 0), /* DBGVCR32_EL2 */
+ ARM64_SYS_REG(3, 0, 0, 0, 5), /* MPIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 0), /* ID_PFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 1), /* ID_PFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 2), /* ID_DFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 3), /* ID_AFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 4), /* ID_MMFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 5), /* ID_MMFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 6), /* ID_MMFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 1, 7), /* ID_MMFR3_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 0), /* ID_ISAR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 1), /* ID_ISAR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 2), /* ID_ISAR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 3), /* ID_ISAR3_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 4), /* ID_ISAR4_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 5), /* ID_ISAR5_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 6), /* ID_MMFR4_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 2, 7), /* ID_ISAR6_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 0), /* MVFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 1), /* MVFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 2), /* MVFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 3),
+ ARM64_SYS_REG(3, 0, 0, 3, 4), /* ID_PFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 5), /* ID_DFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 6), /* ID_MMFR5_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 3, 7),
+ ARM64_SYS_REG(3, 0, 0, 4, 0), /* ID_AA64PFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 1), /* ID_AA64PFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 2),
+ ARM64_SYS_REG(3, 0, 0, 4, 3),
+ ARM64_SYS_REG(3, 0, 0, 4, 4), /* ID_AA64ZFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 4, 5),
+ ARM64_SYS_REG(3, 0, 0, 4, 6),
+ ARM64_SYS_REG(3, 0, 0, 4, 7),
+ ARM64_SYS_REG(3, 0, 0, 5, 0), /* ID_AA64DFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 1), /* ID_AA64DFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 2),
+ ARM64_SYS_REG(3, 0, 0, 5, 3),
+ ARM64_SYS_REG(3, 0, 0, 5, 4), /* ID_AA64AFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 5), /* ID_AA64AFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 5, 6),
+ ARM64_SYS_REG(3, 0, 0, 5, 7),
+ ARM64_SYS_REG(3, 0, 0, 6, 0), /* ID_AA64ISAR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 6, 1), /* ID_AA64ISAR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 6, 2),
+ ARM64_SYS_REG(3, 0, 0, 6, 3),
+ ARM64_SYS_REG(3, 0, 0, 6, 4),
+ ARM64_SYS_REG(3, 0, 0, 6, 5),
+ ARM64_SYS_REG(3, 0, 0, 6, 6),
+ ARM64_SYS_REG(3, 0, 0, 6, 7),
+ ARM64_SYS_REG(3, 0, 0, 7, 0), /* ID_AA64MMFR0_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 1), /* ID_AA64MMFR1_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 2), /* ID_AA64MMFR2_EL1 */
+ ARM64_SYS_REG(3, 0, 0, 7, 3),
+ ARM64_SYS_REG(3, 0, 0, 7, 4),
+ ARM64_SYS_REG(3, 0, 0, 7, 5),
+ ARM64_SYS_REG(3, 0, 0, 7, 6),
+ ARM64_SYS_REG(3, 0, 0, 7, 7),
+ ARM64_SYS_REG(3, 0, 1, 0, 0), /* SCTLR_EL1 */
+ ARM64_SYS_REG(3, 0, 1, 0, 1), /* ACTLR_EL1 */
+ ARM64_SYS_REG(3, 0, 1, 0, 2), /* CPACR_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 0), /* TTBR0_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 1), /* TTBR1_EL1 */
+ ARM64_SYS_REG(3, 0, 2, 0, 2), /* TCR_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 1, 0), /* AFSR0_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 1, 1), /* AFSR1_EL1 */
+ ARM64_SYS_REG(3, 0, 5, 2, 0), /* ESR_EL1 */
+ ARM64_SYS_REG(3, 0, 6, 0, 0), /* FAR_EL1 */
+ ARM64_SYS_REG(3, 0, 7, 4, 0), /* PAR_EL1 */
+ ARM64_SYS_REG(3, 0, 9, 14, 1), /* PMINTENSET_EL1 */
+ ARM64_SYS_REG(3, 0, 9, 14, 2), /* PMINTENCLR_EL1 */
+ ARM64_SYS_REG(3, 0, 10, 2, 0), /* MAIR_EL1 */
+ ARM64_SYS_REG(3, 0, 10, 3, 0), /* AMAIR_EL1 */
+ ARM64_SYS_REG(3, 0, 12, 0, 0), /* VBAR_EL1 */
+ ARM64_SYS_REG(3, 0, 12, 1, 1), /* DISR_EL1 */
+ ARM64_SYS_REG(3, 0, 13, 0, 1), /* CONTEXTIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 13, 0, 4), /* TPIDR_EL1 */
+ ARM64_SYS_REG(3, 0, 14, 1, 0), /* CNTKCTL_EL1 */
+ ARM64_SYS_REG(3, 2, 0, 0, 0), /* CSSELR_EL1 */
+ ARM64_SYS_REG(3, 3, 9, 12, 0), /* PMCR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 1), /* PMCNTENSET_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 2), /* PMCNTENCLR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 3), /* PMOVSCLR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 4), /* PMSWINC_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 12, 5), /* PMSELR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 13, 0), /* PMCCNTR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 14, 0), /* PMUSERENR_EL0 */
+ ARM64_SYS_REG(3, 3, 9, 14, 3), /* PMOVSSET_EL0 */
+ ARM64_SYS_REG(3, 3, 13, 0, 2), /* TPIDR_EL0 */
+ ARM64_SYS_REG(3, 3, 13, 0, 3), /* TPIDRRO_EL0 */
+ ARM64_SYS_REG(3, 3, 14, 8, 0),
+ ARM64_SYS_REG(3, 3, 14, 8, 1),
+ ARM64_SYS_REG(3, 3, 14, 8, 2),
+ ARM64_SYS_REG(3, 3, 14, 8, 3),
+ ARM64_SYS_REG(3, 3, 14, 8, 4),
+ ARM64_SYS_REG(3, 3, 14, 8, 5),
+ ARM64_SYS_REG(3, 3, 14, 8, 6),
+ ARM64_SYS_REG(3, 3, 14, 8, 7),
+ ARM64_SYS_REG(3, 3, 14, 9, 0),
+ ARM64_SYS_REG(3, 3, 14, 9, 1),
+ ARM64_SYS_REG(3, 3, 14, 9, 2),
+ ARM64_SYS_REG(3, 3, 14, 9, 3),
+ ARM64_SYS_REG(3, 3, 14, 9, 4),
+ ARM64_SYS_REG(3, 3, 14, 9, 5),
+ ARM64_SYS_REG(3, 3, 14, 9, 6),
+ ARM64_SYS_REG(3, 3, 14, 9, 7),
+ ARM64_SYS_REG(3, 3, 14, 10, 0),
+ ARM64_SYS_REG(3, 3, 14, 10, 1),
+ ARM64_SYS_REG(3, 3, 14, 10, 2),
+ ARM64_SYS_REG(3, 3, 14, 10, 3),
+ ARM64_SYS_REG(3, 3, 14, 10, 4),
+ ARM64_SYS_REG(3, 3, 14, 10, 5),
+ ARM64_SYS_REG(3, 3, 14, 10, 6),
+ ARM64_SYS_REG(3, 3, 14, 10, 7),
+ ARM64_SYS_REG(3, 3, 14, 11, 0),
+ ARM64_SYS_REG(3, 3, 14, 11, 1),
+ ARM64_SYS_REG(3, 3, 14, 11, 2),
+ ARM64_SYS_REG(3, 3, 14, 11, 3),
+ ARM64_SYS_REG(3, 3, 14, 11, 4),
+ ARM64_SYS_REG(3, 3, 14, 11, 5),
+ ARM64_SYS_REG(3, 3, 14, 11, 6),
+ ARM64_SYS_REG(3, 3, 14, 12, 0),
+ ARM64_SYS_REG(3, 3, 14, 12, 1),
+ ARM64_SYS_REG(3, 3, 14, 12, 2),
+ ARM64_SYS_REG(3, 3, 14, 12, 3),
+ ARM64_SYS_REG(3, 3, 14, 12, 4),
+ ARM64_SYS_REG(3, 3, 14, 12, 5),
+ ARM64_SYS_REG(3, 3, 14, 12, 6),
+ ARM64_SYS_REG(3, 3, 14, 12, 7),
+ ARM64_SYS_REG(3, 3, 14, 13, 0),
+ ARM64_SYS_REG(3, 3, 14, 13, 1),
+ ARM64_SYS_REG(3, 3, 14, 13, 2),
+ ARM64_SYS_REG(3, 3, 14, 13, 3),
+ ARM64_SYS_REG(3, 3, 14, 13, 4),
+ ARM64_SYS_REG(3, 3, 14, 13, 5),
+ ARM64_SYS_REG(3, 3, 14, 13, 6),
+ ARM64_SYS_REG(3, 3, 14, 13, 7),
+ ARM64_SYS_REG(3, 3, 14, 14, 0),
+ ARM64_SYS_REG(3, 3, 14, 14, 1),
+ ARM64_SYS_REG(3, 3, 14, 14, 2),
+ ARM64_SYS_REG(3, 3, 14, 14, 3),
+ ARM64_SYS_REG(3, 3, 14, 14, 4),
+ ARM64_SYS_REG(3, 3, 14, 14, 5),
+ ARM64_SYS_REG(3, 3, 14, 14, 6),
+ ARM64_SYS_REG(3, 3, 14, 14, 7),
+ ARM64_SYS_REG(3, 3, 14, 15, 0),
+ ARM64_SYS_REG(3, 3, 14, 15, 1),
+ ARM64_SYS_REG(3, 3, 14, 15, 2),
+ ARM64_SYS_REG(3, 3, 14, 15, 3),
+ ARM64_SYS_REG(3, 3, 14, 15, 4),
+ ARM64_SYS_REG(3, 3, 14, 15, 5),
+ ARM64_SYS_REG(3, 3, 14, 15, 6),
+ ARM64_SYS_REG(3, 3, 14, 15, 7), /* PMCCFILTR_EL0 */
+ ARM64_SYS_REG(3, 4, 3, 0, 0), /* DACR32_EL2 */
+ ARM64_SYS_REG(3, 4, 5, 0, 1), /* IFSR32_EL2 */
+ ARM64_SYS_REG(3, 4, 5, 3, 0), /* FPEXC32_EL2 */
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 0,
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 1,
+ KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX | KVM_REG_ARM_DEMUX_ID_CCSIDR | 2,
+};
+static __u64 base_regs_n = ARRAY_SIZE(base_regs);
+
+/*
+ * KVM register IDs for the 32 core registers fp_regs.vregs[0] through
+ * fp_regs.vregs[31], each encoded as a 128-bit (KVM_REG_SIZE_U128) register.
+ * vregs_n holds the number of entries.
+ */
+static __u64 vregs[] = {
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[1]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[2]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[3]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[4]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[5]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[6]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[7]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[8]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[9]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[10]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[11]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[12]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[13]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[14]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[15]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[16]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[17]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[18]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[19]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[20]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[21]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[22]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[23]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[24]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[25]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[26]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[27]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[28]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[29]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[30]),
+ KVM_REG_ARM64 | KVM_REG_SIZE_U128 | KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]),
+};
+static __u64 vregs_n = ARRAY_SIZE(vregs);
+
+/*
+ * SVE-related register IDs: KVM_REG_ARM64_SVE_VLS, the vector registers
+ * Z0-Z31, the predicate registers P0-P15, FFR, and the ZCR_EL1 system
+ * register. sve_regs_n holds the number of entries.
+ *
+ * NOTE(review): the trailing 0 passed to the ZREG/PREG/FFR macros is
+ * presumably the slice index - confirm against the macro definitions.
+ */
+static __u64 sve_regs[] = {
+ KVM_REG_ARM64_SVE_VLS,
+ KVM_REG_ARM64_SVE_ZREG(0, 0),
+ KVM_REG_ARM64_SVE_ZREG(1, 0),
+ KVM_REG_ARM64_SVE_ZREG(2, 0),
+ KVM_REG_ARM64_SVE_ZREG(3, 0),
+ KVM_REG_ARM64_SVE_ZREG(4, 0),
+ KVM_REG_ARM64_SVE_ZREG(5, 0),
+ KVM_REG_ARM64_SVE_ZREG(6, 0),
+ KVM_REG_ARM64_SVE_ZREG(7, 0),
+ KVM_REG_ARM64_SVE_ZREG(8, 0),
+ KVM_REG_ARM64_SVE_ZREG(9, 0),
+ KVM_REG_ARM64_SVE_ZREG(10, 0),
+ KVM_REG_ARM64_SVE_ZREG(11, 0),
+ KVM_REG_ARM64_SVE_ZREG(12, 0),
+ KVM_REG_ARM64_SVE_ZREG(13, 0),
+ KVM_REG_ARM64_SVE_ZREG(14, 0),
+ KVM_REG_ARM64_SVE_ZREG(15, 0),
+ KVM_REG_ARM64_SVE_ZREG(16, 0),
+ KVM_REG_ARM64_SVE_ZREG(17, 0),
+ KVM_REG_ARM64_SVE_ZREG(18, 0),
+ KVM_REG_ARM64_SVE_ZREG(19, 0),
+ KVM_REG_ARM64_SVE_ZREG(20, 0),
+ KVM_REG_ARM64_SVE_ZREG(21, 0),
+ KVM_REG_ARM64_SVE_ZREG(22, 0),
+ KVM_REG_ARM64_SVE_ZREG(23, 0),
+ KVM_REG_ARM64_SVE_ZREG(24, 0),
+ KVM_REG_ARM64_SVE_ZREG(25, 0),
+ KVM_REG_ARM64_SVE_ZREG(26, 0),
+ KVM_REG_ARM64_SVE_ZREG(27, 0),
+ KVM_REG_ARM64_SVE_ZREG(28, 0),
+ KVM_REG_ARM64_SVE_ZREG(29, 0),
+ KVM_REG_ARM64_SVE_ZREG(30, 0),
+ KVM_REG_ARM64_SVE_ZREG(31, 0),
+ KVM_REG_ARM64_SVE_PREG(0, 0),
+ KVM_REG_ARM64_SVE_PREG(1, 0),
+ KVM_REG_ARM64_SVE_PREG(2, 0),
+ KVM_REG_ARM64_SVE_PREG(3, 0),
+ KVM_REG_ARM64_SVE_PREG(4, 0),
+ KVM_REG_ARM64_SVE_PREG(5, 0),
+ KVM_REG_ARM64_SVE_PREG(6, 0),
+ KVM_REG_ARM64_SVE_PREG(7, 0),
+ KVM_REG_ARM64_SVE_PREG(8, 0),
+ KVM_REG_ARM64_SVE_PREG(9, 0),
+ KVM_REG_ARM64_SVE_PREG(10, 0),
+ KVM_REG_ARM64_SVE_PREG(11, 0),
+ KVM_REG_ARM64_SVE_PREG(12, 0),
+ KVM_REG_ARM64_SVE_PREG(13, 0),
+ KVM_REG_ARM64_SVE_PREG(14, 0),
+ KVM_REG_ARM64_SVE_PREG(15, 0),
+ KVM_REG_ARM64_SVE_FFR(0),
+ ARM64_SYS_REG(3, 0, 1, 2, 0), /* ZCR_EL1 */
+};
+static __u64 sve_regs_n = ARRAY_SIZE(sve_regs);
+
+/*
+ * Register IDs in the "rejects set" list. The only entry is
+ * KVM_REG_ARM64_SVE_VLS, and only when built with REG_LIST_SVE defined;
+ * otherwise the array is empty and rejects_set_n is 0.
+ *
+ * NOTE(review): presumably these are registers for which a set operation is
+ * expected to fail - confirm against the code that consumes rejects_set.
+ */
+static __u64 rejects_set[] = {
+#ifdef REG_LIST_SVE
+ KVM_REG_ARM64_SVE_VLS,
+#endif
+};
+static __u64 rejects_set_n = ARRAY_SIZE(rejects_set);
diff --git a/tools/testing/selftests/kvm/config b/tools/testing/selftests/kvm/config
new file mode 100644
index 000000000..63ed533f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/config
@@ -0,0 +1,3 @@
+CONFIG_KVM=y
+CONFIG_KVM_INTEL=y
+CONFIG_KVM_AMD=y
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
new file mode 100644
index 000000000..3d96a7bfa
--- /dev/null
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -0,0 +1,498 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM demand paging test
+ * Adapted from dirty_log_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2019, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <time.h>
+#include <poll.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/userfaultfd.h>
+
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+#ifdef __NR_userfaultfd
+
+/*
+ * Per-page debug output. Expands to printf() only when the test is built
+ * with PRINT_PER_PAGE_UPDATES defined; otherwise it expands to _no_printf().
+ */
+#ifdef PRINT_PER_PAGE_UPDATES
+#define PER_PAGE_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_PAGE_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+/*
+ * Per-vCPU debug output. Expands to printf() only when the test is built
+ * with PRINT_PER_VCPU_UPDATES defined; otherwise it expands to _no_printf().
+ */
+#ifdef PRINT_PER_VCPU_UPDATES
+#define PER_VCPU_DEBUG(...) printf(__VA_ARGS__)
+#else
+#define PER_VCPU_DEBUG(...) _no_printf(__VA_ARGS__)
+#endif
+
+/*
+ * Host buffer used as the source page of every UFFDIO_COPY. It is allocated,
+ * filled with 0xAB and freed by run_test().
+ */
+static char *guest_data_prototype;
+
+/*
+ * Thread entry point for a single vCPU.
+ *
+ * @data: pointer to this vCPU's struct vcpu_args.
+ *
+ * Runs the vCPU once, asserts that the run succeeded and ended in a
+ * UCALL_SYNC, and reports the elapsed time through PER_VCPU_DEBUG().
+ * Always returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+	struct vcpu_args *args = data;
+	struct kvm_vm *vm = perf_test_args.vm;
+	int vcpu_id = args->vcpu_id;
+	struct timespec begin, elapsed;
+	struct kvm_run *run;
+	int ret;
+
+	vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+	run = vcpu_state(vm, vcpu_id);
+
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+
+	/* Let the guest access its memory */
+	ret = _vcpu_run(vm, vcpu_id);
+	TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+
+	/* Anything other than a sync ucall from the guest is a test failure. */
+	if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC)
+		TEST_ASSERT(false,
+			    "Invalid guest sync status: exit_reason=%s\n",
+			    exit_reason_str(run->exit_reason));
+
+	elapsed = timespec_diff_now(begin);
+	PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
+		       elapsed.tv_sec, elapsed.tv_nsec);
+
+	return NULL;
+}
+
+/*
+ * Resolve one userfaultfd page fault by copying the prototype page in.
+ *
+ * @uffd: userfaultfd descriptor the fault was read from.
+ * @addr: destination address handed to UFFDIO_COPY.
+ *
+ * Copies perf_test_args.host_page_size bytes from guest_data_prototype.
+ * Returns 0 on success, or -1 (after logging errno) if the ioctl fails.
+ */
+static int handle_uffd_page_request(int uffd, uint64_t addr)
+{
+	pid_t tid = syscall(__NR_gettid);
+	struct uffdio_copy copy = {
+		.src = (uint64_t)guest_data_prototype,
+		.dst = addr,
+		.len = perf_test_args.host_page_size,
+		.mode = 0,
+	};
+	struct timespec begin, elapsed;
+
+	clock_gettime(CLOCK_MONOTONIC, &begin);
+
+	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1) {
+		pr_info("Failed Paged in 0x%lx from thread %d with errno: %d\n",
+			addr, tid, errno);
+		return -1;
+	}
+
+	elapsed = timespec_diff_now(begin);
+
+	PER_PAGE_DEBUG("UFFDIO_COPY %d \t%ld ns\n", tid,
+		       timespec_to_ns(elapsed));
+	PER_PAGE_DEBUG("Paged in %ld bytes at 0x%lx from thread %d\n",
+		       perf_test_args.host_page_size, addr, tid);
+
+	return 0;
+}
+
+/*
+ * Loop condition of uffd_handler_thread_fn().
+ * NOTE(review): nothing in this file ever sets it to true; run_test() stops
+ * the handler threads by writing to their pipe instead.
+ */
+bool quit_uffd_thread;
+
+/* Arguments handed to one uffd_handler_thread_fn() thread. */
+struct uffd_handler_args {
+ int uffd; /* userfaultfd descriptor to service */
+ int pipefd; /* read end of the pipe; readable means "quit" */
+ useconds_t delay; /* usleep() applied before each page is copied in */
+};
+
+/*
+ * Body of a userfaultfd handler thread.
+ *
+ * @arg: pointer to a struct uffd_handler_args describing the userfaultfd to
+ *       service, the pipe used to request termination, and an optional
+ *       per-fault delay in microseconds.
+ *
+ * Polls the userfaultfd and the pipe. A readable pipe terminates the thread;
+ * a page-fault message on the userfaultfd is resolved through
+ * handle_uffd_page_request(). Any unexpected poll/read result is logged and
+ * also terminates the thread. Always returns NULL.
+ */
+static void *uffd_handler_thread_fn(void *arg)
+{
+	struct uffd_handler_args *uffd_args = (struct uffd_handler_args *)arg;
+	int uffd = uffd_args->uffd;
+	int pipefd = uffd_args->pipefd;
+	useconds_t delay = uffd_args->delay;
+	int64_t pages = 0;
+	struct timespec start;
+	struct timespec ts_diff;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	while (!quit_uffd_thread) {
+		struct uffd_msg msg;
+		struct pollfd pollfd[2];
+		char tmp_chr;
+		int r;
+		uint64_t addr;
+
+		pollfd[0].fd = uffd;
+		pollfd[0].events = POLLIN;
+		pollfd[1].fd = pipefd;
+		pollfd[1].events = POLLIN;
+
+		r = poll(pollfd, 2, -1);
+		switch (r) {
+		case -1:
+			pr_info("poll err");
+			continue;
+		case 0:
+			continue;
+		case 1:
+			break;
+		default:
+			pr_info("Polling uffd returned %d", r);
+			return NULL;
+		}
+
+		if (pollfd[0].revents & POLLERR) {
+			pr_info("uffd revents has POLLERR");
+			return NULL;
+		}
+
+		/* A byte on the pipe is the request to shut down. */
+		if (pollfd[1].revents & POLLIN) {
+			r = read(pollfd[1].fd, &tmp_chr, 1);
+			TEST_ASSERT(r == 1,
+				    "Error reading pipefd in UFFD thread\n");
+			return NULL;
+		}
+
+		/*
+		 * Parenthesised on purpose: '!' binds tighter than '&', so
+		 * "!revents & POLLIN" would test (!revents) & POLLIN.
+		 */
+		if (!(pollfd[0].revents & POLLIN))
+			continue;
+
+		r = read(uffd, &msg, sizeof(msg));
+		if (r == -1) {
+			if (errno == EAGAIN)
+				continue;
+			pr_info("Read of uffd got errno %d", errno);
+			return NULL;
+		}
+
+		if (r != sizeof(msg)) {
+			pr_info("Read on uffd returned unexpected size: %d bytes", r);
+			return NULL;
+		}
+
+		if (!(msg.event & UFFD_EVENT_PAGEFAULT))
+			continue;
+
+		if (delay)
+			usleep(delay);
+		addr = msg.arg.pagefault.address;
+		r = handle_uffd_page_request(uffd, addr);
+		if (r < 0)
+			return NULL;
+		pages++;
+	}
+
+	ts_diff = timespec_diff_now(start);
+	/* tv_nsec is in nanoseconds: 1e9 of them per second. */
+	PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n",
+		       pages, ts_diff.tv_sec, ts_diff.tv_nsec,
+		       pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+	return NULL;
+}
+
+/*
+ * Create a userfaultfd covering [hva, hva + len) and start its handler.
+ *
+ * @vm:                  VM under test (not used by this function).
+ * @uffd_handler_thread: receives the handle of the created handler thread.
+ * @pipefd:              pipe read end the handler polls for the quit request.
+ * @uffd_delay:          per-fault delay, in microseconds, for the handler.
+ * @uffd_args:           storage for the handler's arguments; must stay valid
+ *                       for the lifetime of the thread.
+ * @hva, @len:           host virtual range registered for missing-page faults.
+ *
+ * Returns 0 on success. On failure a message is logged, the userfaultfd (if
+ * one was created) is closed again, and -1 is returned.
+ */
+static int setup_demand_paging(struct kvm_vm *vm,
+			       pthread_t *uffd_handler_thread, int pipefd,
+			       useconds_t uffd_delay,
+			       struct uffd_handler_args *uffd_args,
+			       void *hva, uint64_t len)
+{
+	int uffd;
+	int r;
+	struct uffdio_api uffdio_api;
+	struct uffdio_register uffdio_register;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd == -1) {
+		pr_info("uffd creation failed\n");
+		return -1;
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+		pr_info("ioctl uffdio_api failed\n");
+		goto err_close;
+	}
+
+	uffdio_register.range.start = (uint64_t)hva;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+		pr_info("ioctl uffdio_register failed\n");
+		goto err_close;
+	}
+
+	if ((uffdio_register.ioctls & UFFD_API_RANGE_IOCTLS) !=
+	    UFFD_API_RANGE_IOCTLS) {
+		pr_info("unexpected userfaultfd ioctl set\n");
+		goto err_close;
+	}
+
+	uffd_args->uffd = uffd;
+	uffd_args->pipefd = pipefd;
+	uffd_args->delay = uffd_delay;
+
+	/* pthread_create() returns an error number, not -1/errno. */
+	r = pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
+			   uffd_args);
+	if (r) {
+		pr_info("uffd handler thread creation failed: %d\n", r);
+		goto err_close;
+	}
+
+	PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+		       hva, hva + len);
+
+	return 0;
+
+err_close:
+	/* Do not leak the descriptor when setup is abandoned half-way. */
+	close(uffd);
+	return -1;
+}
+
+/*
+ * Run the demand paging test once in the given guest mode.
+ *
+ * @mode:       guest mode passed to create_vm().
+ * @use_uffd:   when true, register each vCPU's memory with userfaultfd and
+ *              start one handler thread per vCPU before running the guest.
+ * @uffd_delay: per-fault delay in microseconds given to the handler threads.
+ *
+ * Creates the VM and nr_vcpus vCPUs, runs every vCPU in its own thread, waits
+ * for them, stops the userfaultfd handlers (if any), prints the total guest
+ * execution time and paging rate, and releases everything it allocated.
+ * Exits the process if userfaultfd setup fails.
+ */
+static void run_test(enum vm_guest_mode mode, bool use_uffd,
+		     useconds_t uffd_delay)
+{
+	pthread_t *vcpu_threads;
+	pthread_t *uffd_handler_threads = NULL;
+	struct uffd_handler_args *uffd_args = NULL;
+	struct timespec start;
+	struct timespec ts_diff;
+	int *pipefds = NULL;
+	struct kvm_vm *vm;
+	int vcpu_id;
+	int r;
+
+	vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+
+	perf_test_args.wr_fract = 1;
+
+	guest_data_prototype = malloc(perf_test_args.host_page_size);
+	TEST_ASSERT(guest_data_prototype,
+		    "Failed to allocate buffer for guest data pattern");
+	memset(guest_data_prototype, 0xAB, perf_test_args.host_page_size);
+
+	vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+	TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+	add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+
+	if (use_uffd) {
+		uffd_handler_threads =
+			malloc(nr_vcpus * sizeof(*uffd_handler_threads));
+		TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
+
+		uffd_args = malloc(nr_vcpus * sizeof(*uffd_args));
+		TEST_ASSERT(uffd_args, "Memory allocation failed");
+
+		/* Two descriptors (read end, write end) per vCPU. */
+		pipefds = malloc(sizeof(int) * nr_vcpus * 2);
+		TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+
+		for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+			vm_paddr_t vcpu_gpa;
+			void *vcpu_hva;
+
+			vcpu_gpa = guest_test_phys_mem + (vcpu_id * guest_percpu_mem_size);
+			PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+				       vcpu_id, vcpu_gpa, vcpu_gpa + guest_percpu_mem_size);
+
+			/* Cache the HVA pointer of the region */
+			vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
+
+			/*
+			 * Set up user fault fd to handle demand paging
+			 * requests.
+			 */
+			r = pipe2(&pipefds[vcpu_id * 2],
+				  O_CLOEXEC | O_NONBLOCK);
+			TEST_ASSERT(!r, "Failed to set up pipefd");
+
+			r = setup_demand_paging(vm,
+						&uffd_handler_threads[vcpu_id],
+						pipefds[vcpu_id * 2],
+						uffd_delay, &uffd_args[vcpu_id],
+						vcpu_hva, guest_percpu_mem_size);
+			if (r < 0)
+				exit(-r);
+		}
+	}
+
+	/* Export the shared variables to the guest */
+	sync_global_to_guest(vm, perf_test_args);
+
+	pr_info("Finished creating vCPUs and starting uffd threads\n");
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+		pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+			       &perf_test_args.vcpu_args[vcpu_id]);
+	}
+
+	pr_info("Started all vCPUs\n");
+
+	/* Wait for the vcpu threads to quit */
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+		pthread_join(vcpu_threads[vcpu_id], NULL);
+		PER_VCPU_DEBUG("Joined thread for vCPU %d\n", vcpu_id);
+	}
+
+	ts_diff = timespec_diff_now(start);
+
+	pr_info("All vCPU threads joined\n");
+
+	if (use_uffd) {
+		/* The handler ignores the value; send a defined byte anyway. */
+		char c = 0;
+
+		/* Tell the user fault fd handler threads to quit */
+		for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+			r = write(pipefds[vcpu_id * 2 + 1], &c, 1);
+			TEST_ASSERT(r == 1, "Unable to write to pipefd");
+
+			pthread_join(uffd_handler_threads[vcpu_id], NULL);
+
+			/*
+			 * The handler is gone; release its descriptors so
+			 * that running several guest modes does not leak
+			 * three fds per vCPU per mode.
+			 */
+			close(uffd_args[vcpu_id].uffd);
+			close(pipefds[vcpu_id * 2]);
+			close(pipefds[vcpu_id * 2 + 1]);
+		}
+	}
+
+	pr_info("Total guest execution time: %ld.%.9lds\n",
+		ts_diff.tv_sec, ts_diff.tv_nsec);
+	/* tv_nsec is in nanoseconds: 1e9 of them per second. */
+	pr_info("Overall demand paging rate: %f pgs/sec\n",
+		perf_test_args.vcpu_args[0].pages * nr_vcpus /
+		((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 1000000000.0));
+
+	ucall_uninit(vm);
+	kvm_vm_free(vm);
+
+	free(guest_data_prototype);
+	free(vcpu_threads);
+	if (use_uffd) {
+		free(uffd_handler_threads);
+		free(uffd_args);
+		free(pipefds);
+	}
+}
+
+/* Per-guest-mode flags, indexed by mode ID in guest_modes[]. */
+struct guest_mode {
+ bool supported; /* main() refuses to run an enabled mode that lacks this */
+ bool enabled; /* main() runs the test only for modes with this set */
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+/* Set both flags of guest_modes[mode] in one statement. */
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+/*
+ * Print the command-line usage, including every guest mode ID and whether
+ * guest_modes[] marks it as supported, then exit with status 0.
+ *
+ * @name: program name shown in the usage line.
+ */
+static void help(char *name)
+{
+ int i;
+
+ puts("");
+ printf("usage: %s [-h] [-m mode] [-u] [-d uffd_delay_usec]\n"
+ " [-b memory] [-v vcpus]\n", name);
+ printf(" -m: specify the guest mode ID to test\n"
+ " (default: test all supported modes)\n"
+ " This option may be used multiple times.\n"
+ " Guest mode IDs:\n");
+ for (i = 0; i < NUM_VM_MODES; ++i) {
+ printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+ guest_modes[i].supported ? " (supported)" : "");
+ }
+ printf(" -u: use User Fault FD to handle vCPU page\n"
+ " faults.\n");
+ printf(" -d: add a delay in usec to the User Fault\n"
+ " FD handler to simulate demand paging\n"
+ " overheads. Ignored without -u.\n");
+ printf(" -b: specify the size of the memory region which should be\n"
+ " demand paged by each vCPU. e.g. 10M or 3G.\n"
+ " Default: 1G\n");
+ printf(" -v: specify the number of vCPUs to run.\n");
+ puts("");
+ /* Never returns to the caller. */
+ exit(0);
+}
+
+/*
+ * Entry point of the demand paging test.
+ *
+ * Marks the guest modes available on the build architecture as supported and
+ * enabled, parses the command line (see help()), then calls run_test() once
+ * for every enabled guest mode. Returns 0 when all runs complete; failures
+ * are reported through TEST_ASSERT().
+ */
+int main(int argc, char *argv[])
+{
+	int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	bool mode_selected = false;
+	unsigned int mode;
+	int opt, i;
+	bool use_uffd = false;
+	useconds_t uffd_delay = 0;
+	long delay_arg;
+
+#ifdef __x86_64__
+	guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+	guest_mode_init(VM_MODE_P40V48_64K, true, true);
+	{
+		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+		if (limit >= 52)
+			guest_mode_init(VM_MODE_P52V48_64K, true, true);
+		if (limit >= 48) {
+			guest_mode_init(VM_MODE_P48V48_4K, true, true);
+			guest_mode_init(VM_MODE_P48V48_64K, true, true);
+		}
+	}
+#endif
+#ifdef __s390x__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+	while ((opt = getopt(argc, argv, "hm:ud:b:v:")) != -1) {
+		switch (opt) {
+		case 'm':
+			/* The first -m replaces the default "all modes" set. */
+			if (!mode_selected) {
+				for (i = 0; i < NUM_VM_MODES; ++i)
+					guest_modes[i].enabled = false;
+				mode_selected = true;
+			}
+			mode = strtoul(optarg, NULL, 10);
+			TEST_ASSERT(mode < NUM_VM_MODES,
+				    "Guest mode ID %d too big", mode);
+			guest_modes[mode].enabled = true;
+			break;
+		case 'u':
+			use_uffd = true;
+			break;
+		case 'd':
+			/*
+			 * Parse as signed first: useconds_t is unsigned, so
+			 * checking it for ">= 0" after the assignment could
+			 * never fail and a negative value would wrap.
+			 */
+			delay_arg = strtol(optarg, NULL, 0);
+			TEST_ASSERT(delay_arg >= 0,
+				    "A negative UFFD delay is not supported.");
+			uffd_delay = delay_arg;
+			break;
+		case 'b':
+			guest_percpu_mem_size = parse_size(optarg);
+			break;
+		case 'v':
+			nr_vcpus = atoi(optarg);
+			TEST_ASSERT(nr_vcpus > 0 && nr_vcpus <= max_vcpus,
+				    "Invalid number of vcpus, must be between 1 and %d", max_vcpus);
+			break;
+		case 'h':
+		default:
+			help(argv[0]);
+			break;
+		}
+	}
+
+	for (i = 0; i < NUM_VM_MODES; ++i) {
+		if (!guest_modes[i].enabled)
+			continue;
+		TEST_ASSERT(guest_modes[i].supported,
+			    "Guest mode ID %d (%s) not supported.",
+			    i, vm_guest_mode_string(i));
+		run_test(i, use_uffd, uffd_delay);
+	}
+
+	return 0;
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+/*
+ * Fallback entry point used when __NR_userfaultfd is not defined at build
+ * time: report the test as skipped and return KSFT_SKIP.
+ */
+int main(void)
+{
+ print_skip("__NR_userfaultfd must be present for userfaultfd test");
+ return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c
new file mode 100644
index 000000000..85c9b8f73
--- /dev/null
+++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging performance test
+ *
+ * Based on dirty_log_test.c
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ * Copyright (C) 2020, Google, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "kvm_util.h"
+#include "perf_test_util.h"
+#include "processor.h"
+#include "test_util.h"
+
+/* How many host loops to run by default (one KVM_GET_DIRTY_LOG for each loop) */
+#define TEST_HOST_LOOP_N 2UL
+
+/* Host variables */
+/* Set by run_test() to make every vcpu_worker() leave its loop. */
+static bool host_quit;
+/* Current pass number; run_test() increments it to start the next pass. */
+static uint64_t iteration;
+/* Per vCPU: the pass number that vCPU's worker most recently finished. */
+static uint64_t vcpu_last_completed_iteration[MAX_VCPUS];
+
+/*
+ * Thread entry point for a single vCPU.
+ *
+ * @data: pointer to this vCPU's struct vcpu_args.
+ *
+ * Runs the vCPU once per value of the host-side 'iteration' counter until
+ * host_quit is set. After each run it asserts that the guest ended in a
+ * UCALL_SYNC, publishes the finished pass number in
+ * vcpu_last_completed_iteration[], and then spins until the host moves on.
+ * Time spent in _vcpu_run() is accumulated for every pass except pass 0.
+ * Always returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct kvm_vm *vm = perf_test_args.vm;
+ uint64_t pages_count = 0;
+ struct kvm_run *run;
+ struct timespec start;
+ struct timespec ts_diff;
+ struct timespec total = (struct timespec){0};
+ struct timespec avg;
+ struct vcpu_args *vcpu_args = (struct vcpu_args *)data;
+ int vcpu_id = vcpu_args->vcpu_id;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ while (!READ_ONCE(host_quit)) {
+ /* Snapshot taken before the run; this is the value published below. */
+ uint64_t current_iteration = READ_ONCE(iteration);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ ret = _vcpu_run(vm, vcpu_id);
+ ts_diff = timespec_diff_now(start);
+
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ TEST_ASSERT(get_ucall(vm, vcpu_id, NULL) == UCALL_SYNC,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+
+ pr_debug("Got sync event from vCPU %d\n", vcpu_id);
+ vcpu_last_completed_iteration[vcpu_id] = current_iteration;
+ pr_debug("vCPU %d updated last completed iteration to %lu\n",
+ vcpu_id, vcpu_last_completed_iteration[vcpu_id]);
+
+ /* Pass 0 is logged as "populate" and is left out of the totals. */
+ if (current_iteration) {
+ pages_count += vcpu_args->pages;
+ total = timespec_add(total, ts_diff);
+ pr_debug("vCPU %d iteration %lu dirty memory time: %ld.%.9lds\n",
+ vcpu_id, current_iteration, ts_diff.tv_sec,
+ ts_diff.tv_nsec);
+ } else {
+ pr_debug("vCPU %d iteration %lu populate memory time: %ld.%.9lds\n",
+ vcpu_id, current_iteration, ts_diff.tv_sec,
+ ts_diff.tv_nsec);
+ }
+
+ /* Busy-wait until the host advances 'iteration' or asks us to quit. */
+ while (current_iteration == READ_ONCE(iteration) &&
+ !READ_ONCE(host_quit)) {}
+ }
+
+ /* Average over the number of the last pass this vCPU completed. */
+ avg = timespec_div(total, vcpu_last_completed_iteration[vcpu_id]);
+ pr_debug("\nvCPU %d dirtied 0x%lx pages over %lu iterations in %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+ vcpu_id, pages_count, vcpu_last_completed_iteration[vcpu_id],
+ total.tv_sec, total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+
+ return NULL;
+}
+
+#ifdef USE_CLEAR_DIRTY_LOG
+/* Capability bits passed to KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2; set in main(). */
+static u64 dirty_log_manual_caps;
+#endif
+
+/*
+ * Run the dirty logging performance test once in the given guest mode.
+ *
+ * @mode:        guest mode passed to create_vm().
+ * @iterations:  number of dirty-memory passes to run after the populate pass.
+ * @phys_offset: accepted for the -p option; not used by this function.
+ * @wr_fract:    stored in perf_test_args.wr_fract for the guest.
+ *
+ * Starts one vcpu_worker() thread per vCPU, waits for all of them to finish
+ * the populate pass (iteration 0), enables dirty logging on the test memory
+ * slot, and then for each pass times the vCPUs, KVM_GET_DIRTY_LOG and, when
+ * built with USE_CLEAR_DIRTY_LOG, the clear operation. Finally stops the
+ * workers, disables dirty logging and prints the averages.
+ */
+static void run_test(enum vm_guest_mode mode, unsigned long iterations,
+		     uint64_t phys_offset, int wr_fract)
+{
+	pthread_t *vcpu_threads;
+	struct kvm_vm *vm;
+	unsigned long *bmap;
+	uint64_t guest_num_pages;
+	uint64_t host_num_pages;
+	int vcpu_id;
+	struct timespec start;
+	struct timespec ts_diff;
+	struct timespec get_dirty_log_total = (struct timespec){0};
+	struct timespec vcpu_dirty_total = (struct timespec){0};
+	struct timespec avg;
+#ifdef USE_CLEAR_DIRTY_LOG
+	struct kvm_enable_cap cap = {};
+	struct timespec clear_dirty_log_total = (struct timespec){0};
+#endif
+
+	vm = create_vm(mode, nr_vcpus, guest_percpu_mem_size);
+
+	perf_test_args.wr_fract = wr_fract;
+
+	guest_num_pages = (nr_vcpus * guest_percpu_mem_size) >> vm_get_page_shift(vm);
+	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+	bmap = bitmap_alloc(host_num_pages);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+	cap.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2;
+	cap.args[0] = dirty_log_manual_caps;
+	vm_enable_cap(vm, &cap);
+#endif
+
+	vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
+	TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+
+	add_vcpus(vm, nr_vcpus, guest_percpu_mem_size);
+
+	sync_global_to_guest(vm, perf_test_args);
+
+	/* Start the iterations */
+	iteration = 0;
+	host_quit = false;
+
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+		pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
+			       &perf_test_args.vcpu_args[vcpu_id]);
+	}
+
+	/*
+	 * Allow the vCPUs to populate memory. Every vCPU must be waited for
+	 * individually: after the loop above vcpu_id equals nr_vcpus, so
+	 * using it directly would index one past the last vCPU's slot.
+	 */
+	pr_debug("Starting iteration %lu - Populating\n", iteration);
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+		while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
+			pr_debug("Waiting for vcpu_last_completed_iteration == %lu\n",
+				 iteration);
+	}
+
+	ts_diff = timespec_diff_now(start);
+	pr_info("Populate memory time: %ld.%.9lds\n",
+		ts_diff.tv_sec, ts_diff.tv_nsec);
+
+	/* Enable dirty logging */
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX,
+				KVM_MEM_LOG_DIRTY_PAGES);
+	ts_diff = timespec_diff_now(start);
+	pr_info("Enabling dirty logging time: %ld.%.9lds\n\n",
+		ts_diff.tv_sec, ts_diff.tv_nsec);
+
+	while (iteration < iterations) {
+		/*
+		 * Incrementing the iteration number will start the vCPUs
+		 * dirtying memory again.
+		 */
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		iteration++;
+
+		pr_debug("Starting iteration %lu\n", iteration);
+		for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+			while (READ_ONCE(vcpu_last_completed_iteration[vcpu_id]) != iteration)
+				pr_debug("Waiting for vCPU %d vcpu_last_completed_iteration == %lu\n",
+					 vcpu_id, iteration);
+		}
+
+		ts_diff = timespec_diff_now(start);
+		vcpu_dirty_total = timespec_add(vcpu_dirty_total, ts_diff);
+		pr_info("Iteration %lu dirty memory time: %ld.%.9lds\n",
+			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+
+		ts_diff = timespec_diff_now(start);
+		get_dirty_log_total = timespec_add(get_dirty_log_total,
+						   ts_diff);
+		pr_info("Iteration %lu get dirty log time: %ld.%.9lds\n",
+			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+		clock_gettime(CLOCK_MONOTONIC, &start);
+		kvm_vm_clear_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap, 0,
+				       host_num_pages);
+
+		ts_diff = timespec_diff_now(start);
+		clear_dirty_log_total = timespec_add(clear_dirty_log_total,
+						     ts_diff);
+		pr_info("Iteration %lu clear dirty log time: %ld.%.9lds\n",
+			iteration, ts_diff.tv_sec, ts_diff.tv_nsec);
+#endif
+	}
+
+	/* Tell the vcpu thread to quit */
+	host_quit = true;
+	for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++)
+		pthread_join(vcpu_threads[vcpu_id], NULL);
+
+	/* Disable dirty logging */
+	clock_gettime(CLOCK_MONOTONIC, &start);
+	vm_mem_region_set_flags(vm, TEST_MEM_SLOT_INDEX, 0);
+	ts_diff = timespec_diff_now(start);
+	pr_info("Disabling dirty logging time: %ld.%.9lds\n",
+		ts_diff.tv_sec, ts_diff.tv_nsec);
+
+	avg = timespec_div(get_dirty_log_total, iterations);
+	pr_info("Get dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+		iterations, get_dirty_log_total.tv_sec,
+		get_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+
+#ifdef USE_CLEAR_DIRTY_LOG
+	avg = timespec_div(clear_dirty_log_total, iterations);
+	pr_info("Clear dirty log over %lu iterations took %ld.%.9lds. (Avg %ld.%.9lds/iteration)\n",
+		iterations, clear_dirty_log_total.tv_sec,
+		clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec);
+#endif
+
+	free(bmap);
+	free(vcpu_threads);
+	ucall_uninit(vm);
+	kvm_vm_free(vm);
+}
+
+/* Per-guest-mode flags, indexed by mode ID in guest_modes[]. */
+struct guest_mode {
+ bool supported; /* main() refuses to run an enabled mode that lacks this */
+ bool enabled; /* main() runs the test only for modes with this set */
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+/* Set both flags of guest_modes[mode] in one statement. */
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+/*
+ * Print the command-line usage, including every guest mode ID and whether
+ * guest_modes[] marks it as supported, then exit with status 0.
+ *
+ * @name: program name shown in the usage line.
+ */
+static void help(char *name)
+{
+	int i;
+
+	puts("");
+	/* Keep this line in sync with the getopt() string in main(). */
+	printf("usage: %s [-h] [-i iterations] [-p offset] "
+	       "[-m mode] [-b vcpu bytes] [-f fraction] [-v vcpus]\n", name);
+	puts("");
+	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
+	       TEST_HOST_LOOP_N);
+	printf(" -p: specify guest physical test memory offset\n"
+	       " Warning: a low offset can conflict with the loaded test code.\n");
+	printf(" -m: specify the guest mode ID to test "
+	       "(default: test all supported modes)\n"
+	       " This option may be used multiple times.\n"
+	       " Guest mode IDs:\n");
+	for (i = 0; i < NUM_VM_MODES; ++i) {
+		printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+		       guest_modes[i].supported ? " (supported)" : "");
+	}
+	printf(" -b: specify the size of the memory region which should be\n"
+	       " dirtied by each vCPU. e.g. 10M or 3G.\n"
+	       " (default: 1G)\n");
+	printf(" -f: specify the fraction of pages which should be written to\n"
+	       " as opposed to simply read, in the form\n"
+	       " 1/<fraction of pages to write>.\n"
+	       " (default: 1 i.e. all pages are written to.)\n");
+	printf(" -v: specify the number of vCPUs to run.\n");
+	puts("");
+	/* Never returns to the caller. */
+	exit(0);
+}
+
+/*
+ * Entry point of the dirty logging performance test.
+ *
+ * When built with USE_CLEAR_DIRTY_LOG, skips the test unless
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is available. Marks the guest modes
+ * available on the build architecture as supported and enabled, parses the
+ * command line (see help()), then calls run_test() once for every enabled
+ * guest mode. Returns 0 when all runs complete; failures are reported
+ * through TEST_ASSERT().
+ */
+int main(int argc, char *argv[])
+{
+	unsigned long iterations = TEST_HOST_LOOP_N;
+	bool mode_selected = false;
+	uint64_t phys_offset = 0;
+	unsigned int mode;
+	int opt, i;
+	int wr_fract = 1;
+	long iter_arg;
+
+#ifdef USE_CLEAR_DIRTY_LOG
+	dirty_log_manual_caps =
+		kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+	if (!dirty_log_manual_caps) {
+		print_skip("KVM_CLEAR_DIRTY_LOG not available");
+		exit(KSFT_SKIP);
+	}
+	dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+				  KVM_DIRTY_LOG_INITIALLY_SET);
+#endif
+
+#ifdef __x86_64__
+	guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+	guest_mode_init(VM_MODE_P40V48_64K, true, true);
+
+	{
+		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+		if (limit >= 52)
+			guest_mode_init(VM_MODE_P52V48_64K, true, true);
+		if (limit >= 48) {
+			guest_mode_init(VM_MODE_P48V48_4K, true, true);
+			guest_mode_init(VM_MODE_P48V48_64K, true, true);
+		}
+	}
+#endif
+#ifdef __s390x__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+	while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:")) != -1) {
+		switch (opt) {
+		case 'i':
+			/*
+			 * Check the sign before storing into the unsigned
+			 * counter; otherwise "-i -1" wraps to a huge value
+			 * that passes the ">= 2" check below.
+			 */
+			iter_arg = strtol(optarg, NULL, 10);
+			TEST_ASSERT(iter_arg >= 0,
+				    "Iteration count cannot be negative");
+			iterations = iter_arg;
+			break;
+		case 'p':
+			phys_offset = strtoull(optarg, NULL, 0);
+			break;
+		case 'm':
+			/* The first -m replaces the default "all modes" set. */
+			if (!mode_selected) {
+				for (i = 0; i < NUM_VM_MODES; ++i)
+					guest_modes[i].enabled = false;
+				mode_selected = true;
+			}
+			mode = strtoul(optarg, NULL, 10);
+			TEST_ASSERT(mode < NUM_VM_MODES,
+				    "Guest mode ID %d too big", mode);
+			guest_modes[mode].enabled = true;
+			break;
+		case 'b':
+			guest_percpu_mem_size = parse_size(optarg);
+			break;
+		case 'f':
+			wr_fract = atoi(optarg);
+			TEST_ASSERT(wr_fract >= 1,
+				    "Write fraction cannot be less than one");
+			break;
+		case 'v':
+			nr_vcpus = atoi(optarg);
+			TEST_ASSERT(nr_vcpus > 0,
+				    "Must have a positive number of vCPUs");
+			TEST_ASSERT(nr_vcpus <= MAX_VCPUS,
+				    "This test does not currently support\n"
+				    "more than %d vCPUs.", MAX_VCPUS);
+			break;
+		case 'h':
+		default:
+			help(argv[0]);
+			break;
+		}
+	}
+
+	TEST_ASSERT(iterations >= 2, "The test should have at least two iterations");
+
+	/* iterations is an unsigned long, so print it with %lu. */
+	pr_info("Test iterations: %lu\n", iterations);
+
+	for (i = 0; i < NUM_VM_MODES; ++i) {
+		if (!guest_modes[i].enabled)
+			continue;
+		TEST_ASSERT(guest_modes[i].supported,
+			    "Guest mode ID %d (%s) not supported.",
+			    i, vm_guest_mode_string(i));
+		run_test(i, iterations, phys_offset, wr_fract);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
new file mode 100644
index 000000000..54da9cc20
--- /dev/null
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -0,0 +1,639 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging test
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 1
+
+/* The memory slot index to track dirty pages */
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+/* How many pages to dirty for each guest loop */
+#define TEST_PAGES_PER_LOOP 1024
+
+/* How many host loops to run (one KVM_GET_DIRTY_LOG for each loop) */
+#define TEST_HOST_LOOP_N 32UL
+
+/* Interval for each host loop (ms) */
+#define TEST_HOST_LOOP_INTERVAL 10UL
+
+/* Dirty bitmaps are always little endian, so we need to swap on big endian */
+#if defined(__s390x__)
+/*
+ * BITOP_LE_SWIZZLE has the low three bits clear, so XOR-ing a bit number
+ * with it changes only the byte-index part of the position inside a long
+ * and leaves the bit-within-byte part untouched. The *_le() helpers below
+ * apply that translation before calling the native bit operation.
+ */
+# define BITOP_LE_SWIZZLE ((BITS_PER_LONG-1) & ~0x7)
+# define test_bit_le(nr, addr) \
+ test_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define set_bit_le(nr, addr) \
+ set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define clear_bit_le(nr, addr) \
+ clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define test_and_set_bit_le(nr, addr) \
+ test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+# define test_and_clear_bit_le(nr, addr) \
+ test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, addr)
+#else
+/* Every other build: the *_le() names are plain aliases of the native ops */
+# define test_bit_le test_bit
+# define set_bit_le set_bit
+# define clear_bit_le clear_bit
+# define test_and_set_bit_le test_and_set_bit
+# define test_and_clear_bit_le test_and_clear_bit
+#endif
+
+/*
+ * Guest/Host shared variables. Ensure addr_gva2hva() and/or
+ * sync_global_to/from_guest() are used when accessing from
+ * the host. READ/WRITE_ONCE() should also be used with anything
+ * that may change.
+ */
+static uint64_t host_page_size;
+static uint64_t guest_page_size;
+static uint64_t guest_num_pages;
+static uint64_t random_array[TEST_PAGES_PER_LOOP];
+static uint64_t iteration;
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+
+/*
+ * Continuously write to the first 8 bytes of random pages within
+ * the testing memory region.
+ */
+static void guest_code(void)
+{
+	uint64_t target;
+	int idx;
+
+	/*
+	 * s390x flags every page of a 1M segment as dirty the first time
+	 * any page in that segment is written. Touch each page once up
+	 * front so that this quirk is absorbed by the first iteration.
+	 */
+	for (idx = 0; idx < guest_num_pages; idx++) {
+		target = guest_test_virt_mem + idx * guest_page_size;
+		*(uint64_t *)target = READ_ONCE(iteration);
+	}
+
+	for (;;) {
+		idx = 0;
+		while (idx < TEST_PAGES_PER_LOOP) {
+			/* Pick a guest page, then round down to a host page */
+			target = READ_ONCE(random_array[idx]) % guest_num_pages;
+			target *= guest_page_size;
+			target += guest_test_virt_mem;
+			target &= ~(host_page_size - 1);
+			*(uint64_t *)target = READ_ONCE(iteration);
+			idx++;
+		}
+
+		/* Ask the host for a fresh batch of random numbers */
+		GUEST_SYNC(1);
+	}
+}
+
+/* Host variables */
+static bool host_quit;
+
+/* Points to the test VM memory region on which we track dirty logs */
+static void *host_test_mem;
+static uint64_t host_num_pages;
+
+/* For statistics only */
+static uint64_t host_dirty_count;
+static uint64_t host_clear_count;
+static uint64_t host_track_next_count;
+
+/*
+ * Dirty-logging strategies the host side can exercise. The values below
+ * LOG_MODE_NUM are used directly as indexes into log_modes[].
+ */
+enum log_mode_t {
+ /* Only use KVM_GET_DIRTY_LOG for logging */
+ LOG_MODE_DIRTY_LOG = 0,
+
+ /* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
+ LOG_MODE_CLEAR_LOG = 1,
+
+ /* Number of real modes; also the size of log_modes[] */
+ LOG_MODE_NUM,
+
+ /*
+ * Run all supported modes. Shares its value with LOG_MODE_NUM, so it
+ * is a selector only and must never be used to index log_modes[].
+ */
+ LOG_MODE_ALL = LOG_MODE_NUM,
+};
+
+/* Mode of logging to test. Default is to run all supported modes */
+static enum log_mode_t host_log_mode_option = LOG_MODE_ALL;
+/* Logging mode for current run */
+static enum log_mode_t host_log_mode;
+
+/* "supported" hook for clear-log: true when the capability check is non-zero. */
+static bool clear_log_supported(void)
+{
+	bool has_cap = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+
+	return has_cap;
+}
+
+/*
+ * "create_vm_done" hook for clear-log: enable
+ * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 on @vm, passing whichever of the
+ * MANUAL_PROTECT_ENABLE / INITIALLY_SET flags the capability check reports.
+ */
+static void clear_log_create_vm_done(struct kvm_vm *vm)
+{
+	const u64 wanted = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
+			   KVM_DIRTY_LOG_INITIALLY_SET;
+	u64 manual_caps = kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2,
+	};
+
+	TEST_ASSERT(manual_caps, "MANUAL_CAPS is zero!");
+
+	/* Request only the flag bits this test knows about */
+	cap.args[0] = manual_caps & wanted;
+	vm_enable_cap(vm, &cap);
+}
+
+/*
+ * "collect_dirty_pages" hook for dirty-log: fetch the dirty bitmap of @slot
+ * into @bitmap. @num_pages exists only to match the hook signature.
+ */
+static void dirty_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
+					  void *bitmap, uint32_t num_pages)
+{
+	(void)num_pages;
+
+	kvm_vm_get_dirty_log(vm, slot, bitmap);
+}
+
+/*
+ * "collect_dirty_pages" hook for clear-log: fetch the dirty bitmap of @slot
+ * into @bitmap, then clear the log for pages [0, @num_pages) using that
+ * same bitmap.
+ */
+static void clear_log_collect_dirty_pages(struct kvm_vm *vm, int slot,
+					  void *bitmap, uint32_t num_pages)
+{
+	const uint64_t first_page = 0;
+
+	kvm_vm_get_dirty_log(vm, slot, bitmap);
+	kvm_vm_clear_dirty_log(vm, slot, bitmap, first_page, num_pages);
+}
+
+/*
+ * Description of one logging mode. Only collect_dirty_pages is mandatory;
+ * the other hooks may be left NULL and are then skipped by the log_mode_*()
+ * wrappers.
+ */
+struct log_mode {
+	/* Name accepted on the command line and printed in messages */
+	const char *name;
+	/* Optional: returns true if the mode can run, false otherwise */
+	bool (*supported)(void);
+	/* Optional: called once the VM exists, before the vcpu is added */
+	void (*create_vm_done)(struct kvm_vm *vm);
+	/* Required: gather the dirty pages into the supplied bitmap */
+	void (*collect_dirty_pages) (struct kvm_vm *vm, int slot,
+				     void *bitmap, uint32_t num_pages);
+};
+
+/* One entry per enum log_mode_t value below LOG_MODE_NUM */
+struct log_mode log_modes[LOG_MODE_NUM] = {
+	[LOG_MODE_DIRTY_LOG] = {
+		.name = "dirty-log",
+		.collect_dirty_pages = dirty_log_collect_dirty_pages,
+	},
+	[LOG_MODE_CLEAR_LOG] = {
+		.name = "clear-log",
+		.supported = clear_log_supported,
+		.create_vm_done = clear_log_create_vm_done,
+		.collect_dirty_pages = clear_log_collect_dirty_pages,
+	},
+};
+
+/*
+ * We use this bitmap to track pages that should have their dirty
+ * bit set in the _next_ iteration. For example, if we detect that a
+ * page's value has changed to the current iteration while its bit is
+ * clear in the latest bitmap, then the system must report that write
+ * in the next get-dirty-log call.
+ */
+static unsigned long *host_bmap_track;
+
+/* Print "all" followed by every log mode name, comma separated, on one line. */
+static void log_modes_dump(void)
+{
+	const struct log_mode *entry = log_modes;
+	const struct log_mode *end = log_modes + LOG_MODE_NUM;
+
+	printf("all");
+	while (entry < end) {
+		printf(", %s", entry->name);
+		entry++;
+	}
+	printf("\n");
+}
+
+/*
+ * Ask the current log mode (host_log_mode) whether it can run. A mode
+ * without a "supported" hook is treated as always supported.
+ */
+static bool log_mode_supported(void)
+{
+	bool (*probe)(void) = log_modes[host_log_mode].supported;
+
+	return !probe || probe();
+}
+
+/* Invoke the current log mode's create_vm_done hook on @vm, if it has one. */
+static void log_mode_create_vm_done(struct kvm_vm *vm)
+{
+	void (*hook)(struct kvm_vm *) = log_modes[host_log_mode].create_vm_done;
+
+	if (!hook)
+		return;
+
+	hook(vm);
+}
+
+/*
+ * Collect the dirty pages of @slot into @bitmap through the current log
+ * mode's hook. Unlike the other hooks, this one is mandatory.
+ */
+static void log_mode_collect_dirty_pages(struct kvm_vm *vm, int slot,
+					 void *bitmap, uint32_t num_pages)
+{
+	struct log_mode *mode = log_modes + host_log_mode;
+
+	TEST_ASSERT(mode->collect_dirty_pages != NULL,
+		    "collect_dirty_pages() is required for any log mode!");
+
+	(*mode->collect_dirty_pages)(vm, slot, bitmap, num_pages);
+}
+
+/* Fill the first @size entries of @guest_array with values from random(). */
+static void generate_random_array(uint64_t *guest_array, uint64_t size)
+{
+	uint64_t *slot = guest_array;
+	uint64_t *end = guest_array + size;
+
+	while (slot != end)
+		*slot++ = random();
+}
+
+/*
+ * Thread body driving the vcpu. @data is the struct kvm_vm to run.
+ *
+ * Keeps re-entering the guest until host_quit is observed. Every time the
+ * guest comes back with UCALL_SYNC, the guest-visible random_array is
+ * refilled so the next pass dirties a different set of pages. Always
+ * returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+	struct kvm_vm *vm = data;
+	struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+	uint64_t *guest_array = addr_gva2hva(vm, (vm_vaddr_t)random_array);
+	uint64_t pages_count = 0;
+	int ret;
+
+	generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
+
+	for (;;) {
+		if (READ_ONCE(host_quit))
+			break;
+
+		/* Let the guest dirty the random pages */
+		ret = _vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+
+		if (get_ucall(vm, VCPU_ID, NULL) != UCALL_SYNC) {
+			TEST_FAIL("Invalid guest sync status: "
+				  "exit_reason=%s\n",
+				  exit_reason_str(run->exit_reason));
+			continue;
+		}
+
+		pages_count += TEST_PAGES_PER_LOOP;
+		generate_random_array(guest_array, TEST_PAGES_PER_LOOP);
+	}
+
+	pr_info("Dirtied %"PRIu64" pages\n", pages_count);
+
+	return NULL;
+}
+
+/*
+ * Check the dirty bitmap @bmap collected for the current iteration against
+ * the values the guest has written into the test memory.
+ *
+ * Pages are visited in strides of vm_num_host_pages(mode, 1). Every bit
+ * that is examined is cleared from @bmap (and from host_bmap_track) as a
+ * side effect, and the host_*_count statistics are updated.
+ *
+ * NOTE(review): *value_ptr is read several times below while the guest may
+ * still be writing the same page, so the number and order of those reads
+ * should not be changed casually.
+ */
+static void vm_dirty_log_verify(enum vm_guest_mode mode, unsigned long *bmap)
+{
+ uint64_t step = vm_num_host_pages(mode, 1);
+ uint64_t page;
+ uint64_t *value_ptr;
+
+ for (page = 0; page < host_num_pages; page += step) {
+ value_ptr = host_test_mem + page * host_page_size;
+
+ /*
+ * If the previous pass flagged this page as "must be dirty
+ * next time", the bit has to be present in this bitmap.
+ */
+ if (test_and_clear_bit_le(page, host_bmap_track)) {
+ host_track_next_count++;
+ TEST_ASSERT(test_bit_le(page, bmap),
+ "Page %"PRIu64" should have its dirty bit "
+ "set in this iteration but it is missing",
+ page);
+ }
+
+ if (test_and_clear_bit_le(page, bmap)) {
+ host_dirty_count++;
+ /*
+ * If the bit is set, the value written onto
+ * the corresponding page should be either the
+ * previous iteration number or the current one.
+ */
+ TEST_ASSERT(*value_ptr == iteration ||
+ *value_ptr == iteration - 1,
+ "Set page %"PRIu64" value %"PRIu64
+ " incorrect (iteration=%"PRIu64")",
+ page, *value_ptr, iteration);
+ } else {
+ host_clear_count++;
+ /*
+ * If the bit is clear, the value written can be
+ * any value smaller than or equal to the
+ * iteration number. Note that the value can be
+ * exactly (iteration-1) if the write happened
+ * like this:
+ *
+ * (1) increase loop count to "iteration-1"
+ * (2) write to page P happens (with value
+ * "iteration-1")
+ * (3) get dirty log for "iteration-1"; we'll
+ * see that page P bit is set (dirtied),
+ * and not set the bit in host_bmap_track
+ * (4) increase loop count to "iteration"
+ * (which is current iteration)
+ * (5) get dirty log for current iteration,
+ * we'll see that page P is cleared, with
+ * value "iteration-1".
+ */
+ TEST_ASSERT(*value_ptr <= iteration,
+ "Clear page %"PRIu64" value %"PRIu64
+ " incorrect (iteration=%"PRIu64")",
+ page, *value_ptr, iteration);
+ if (*value_ptr == iteration) {
+ /*
+ * This page was modified _just_ now, after
+ * the bitmap was collected; it must report
+ * its dirtiness in the next run.
+ */
+ set_bit_le(page, host_bmap_track);
+ }
+ }
+ }
+}
+
+/*
+ * Build a VM in guest mode @mode, load this test binary into it, run the
+ * current log mode's create_vm_done hook and add vcpu @vcpuid starting at
+ * @guest_code.
+ *
+ * @extra_mem_pages is the amount of additional guest memory the caller
+ * intends to map; it is only used to size the extra pages requested from
+ * vm_create(). Returns the new VM.
+ */
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, uint32_t vcpuid,
+				uint64_t extra_mem_pages, void *guest_code)
+{
+	uint64_t extra_pg_pages;
+	struct kvm_vm *vm;
+
+	/* Two extra pages per 512 pages of additional memory */
+	extra_pg_pages = (extra_mem_pages / 512) * 2;
+
+	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+	vm = vm_create(mode, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR);
+	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+#ifdef __x86_64__
+	vm_create_irqchip(vm);
+#endif
+	/* Must run before the vcpu is added */
+	log_mode_create_vm_done(vm);
+	vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+	return vm;
+}
+
+#define DIRTY_MEM_BITS 30 /* 1G */
+#define PAGE_SHIFT_4K 12
+
+/*
+ * Run one complete dirty-log test for guest mode @mode using the log mode
+ * currently selected in host_log_mode.
+ *
+ * @iterations:  the host loop runs while the global iteration counter
+ *               (starting at 1) is below this value
+ * @interval:    time in ms the guest is given to dirty pages per iteration
+ * @phys_offset: guest physical address of the test slot; 0 places the slot
+ *               at the top of the guest physical address space
+ *
+ * The run is skipped (with a message) if the log mode is not supported.
+ */
+static void run_test(enum vm_guest_mode mode, unsigned long iterations,
+		     unsigned long interval, uint64_t phys_offset)
+{
+	pthread_t vcpu_thread;
+	struct kvm_vm *vm;
+	unsigned long *bmap;
+	int ret;
+
+	if (!log_mode_supported()) {
+		print_skip("Log mode '%s' not supported",
+			   log_modes[host_log_mode].name);
+		return;
+	}
+
+	/*
+	 * We reserve page table for 2 times of extra dirty mem which
+	 * will definitely cover the original (1G+) test range. Here
+	 * we do the calculation with 4K page size which is the
+	 * smallest so the page number will be enough for all archs
+	 * (e.g., 64K page size guest will need even less memory for
+	 * page tables).
+	 */
+	vm = create_vm(mode, VCPU_ID,
+		       2ul << (DIRTY_MEM_BITS - PAGE_SHIFT_4K),
+		       guest_code);
+
+	guest_page_size = vm_get_page_size(vm);
+	/*
+	 * A little more than 1G of guest page sized pages. Cover the
+	 * case where the size is not aligned to 64 pages.
+	 */
+	guest_num_pages = (1ul << (DIRTY_MEM_BITS -
+				   vm_get_page_shift(vm))) + 3;
+	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
+	host_page_size = getpagesize();
+	host_num_pages = vm_num_host_pages(mode, guest_num_pages);
+
+	if (!phys_offset) {
+		/* Default: topmost guest physical range, host-page aligned */
+		guest_test_phys_mem = (vm_get_max_gfn(vm) -
+				       guest_num_pages) * guest_page_size;
+		guest_test_phys_mem &= ~(host_page_size - 1);
+	} else {
+		guest_test_phys_mem = phys_offset;
+	}
+
+#ifdef __s390x__
+	/* Align to 1M (segment size) */
+	guest_test_phys_mem &= ~((1 << 20) - 1);
+#endif
+
+	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+
+	bmap = bitmap_alloc(host_num_pages);
+	host_bmap_track = bitmap_alloc(host_num_pages);
+
+	/* Add an extra memory slot for testing dirty logging */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    guest_test_phys_mem,
+				    TEST_MEM_SLOT_INDEX,
+				    guest_num_pages,
+				    KVM_MEM_LOG_DIRTY_PAGES);
+
+	/* Do mapping for the dirty track memory slot */
+	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+	/* Cache the HVA pointer of the region */
+	host_test_mem = addr_gpa2hva(vm, (vm_paddr_t)guest_test_phys_mem);
+
+#ifdef __x86_64__
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+#endif
+	ucall_init(vm, NULL);
+
+	/* Export the shared variables to the guest */
+	sync_global_to_guest(vm, host_page_size);
+	sync_global_to_guest(vm, guest_page_size);
+	sync_global_to_guest(vm, guest_test_virt_mem);
+	sync_global_to_guest(vm, guest_num_pages);
+
+	/* Start the iterations */
+	iteration = 1;
+	sync_global_to_guest(vm, iteration);
+	host_quit = false;
+	host_dirty_count = 0;
+	host_clear_count = 0;
+	host_track_next_count = 0;
+
+	/*
+	 * Without a worker thread the join below would operate on an
+	 * uninitialised pthread_t, so fail loudly right here instead.
+	 */
+	ret = pthread_create(&vcpu_thread, NULL, vcpu_worker, vm);
+	TEST_ASSERT(ret == 0, "pthread_create failed: %d", ret);
+
+	while (iteration < iterations) {
+		/* Give the vcpu thread some time to dirty some pages */
+		usleep(interval * 1000);
+		log_mode_collect_dirty_pages(vm, TEST_MEM_SLOT_INDEX,
+					     bmap, host_num_pages);
+		vm_dirty_log_verify(mode, bmap);
+		iteration++;
+		sync_global_to_guest(vm, iteration);
+	}
+
+	/*
+	 * Tell the vcpu thread to quit. The worker polls this flag with
+	 * READ_ONCE(), so pair it with WRITE_ONCE() on the store side.
+	 */
+	WRITE_ONCE(host_quit, true);
+	ret = pthread_join(vcpu_thread, NULL);
+	TEST_ASSERT(ret == 0, "pthread_join failed: %d", ret);
+
+	pr_info("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
+		"track_next (%"PRIu64")\n", host_dirty_count, host_clear_count,
+		host_track_next_count);
+
+	free(bmap);
+	free(host_bmap_track);
+	ucall_uninit(vm);
+	kvm_vm_free(vm);
+}
+
+/*
+ * Per guest-mode bookkeeping. guest_modes[] is indexed by the VM_MODE_*
+ * value and has NUM_VM_MODES entries.
+ */
+struct guest_mode {
+ /* Set for modes this build/host can run; asserted before a mode is run */
+ bool supported;
+ /* Set for modes that main() should actually run */
+ bool enabled;
+};
+static struct guest_mode guest_modes[NUM_VM_MODES];
+
+/* Overwrite the guest_modes[] entry for @mode with the two given flags */
+#define guest_mode_init(mode, supported, enabled) ({ \
+ guest_modes[mode] = (struct guest_mode){ supported, enabled }; \
+})
+
+/*
+ * Print the command-line usage for program @name to stdout and exit with
+ * status 0. Does not return.
+ */
+static void help(char *name)
+{
+	int i;
+
+	puts("");
+	/* The synopsis lists every option main() accepts, including -M */
+	printf("usage: %s [-h] [-i iterations] [-I interval] "
+	       "[-p offset] [-M log-mode] [-m mode]\n", name);
+	puts("");
+	/* The defaults are unsigned long constants; cast to match PRIu64 */
+	printf(" -i: specify iteration counts (default: %"PRIu64")\n",
+	       (uint64_t)TEST_HOST_LOOP_N);
+	printf(" -I: specify interval in ms (default: %"PRIu64" ms)\n",
+	       (uint64_t)TEST_HOST_LOOP_INTERVAL);
+	printf(" -p: specify guest physical test memory offset\n"
+	       " Warning: a low offset can conflict with the loaded test code.\n");
+	printf(" -M: specify the host logging mode "
+	       "(default: run all log modes). Supported modes: \n\t");
+	log_modes_dump();
+	printf(" -m: specify the guest mode ID to test "
+	       "(default: test all supported modes)\n"
+	       " This option may be used multiple times.\n"
+	       " Guest mode IDs:\n");
+	for (i = 0; i < NUM_VM_MODES; ++i) {
+		printf(" %d: %s%s\n", i, vm_guest_mode_string(i),
+		       guest_modes[i].supported ? " (supported)" : "");
+	}
+	puts("");
+	exit(0);
+}
+
+/*
+ * Entry point: register the guest modes available for this architecture,
+ * parse the command line, then run the test once for every enabled guest
+ * mode and every selected log mode.
+ *
+ * Options: -i iterations, -I interval (ms), -p physical offset,
+ * -m guest mode ID (repeatable), -M log mode name or "all", -h help.
+ * Returns 0 when every run completes.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned long iterations = TEST_HOST_LOOP_N;
+	unsigned long interval = TEST_HOST_LOOP_INTERVAL;
+	bool mode_selected = false;
+	uint64_t phys_offset = 0;
+	unsigned int mode;
+	int opt, i, j;
+
+#ifdef __x86_64__
+	guest_mode_init(VM_MODE_PXXV48_4K, true, true);
+#endif
+#ifdef __aarch64__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+	guest_mode_init(VM_MODE_P40V48_64K, true, true);
+
+	{
+		unsigned int limit = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
+
+		/* Larger physical address modes depend on the reported limit */
+		if (limit >= 52)
+			guest_mode_init(VM_MODE_P52V48_64K, true, true);
+		if (limit >= 48) {
+			guest_mode_init(VM_MODE_P48V48_4K, true, true);
+			guest_mode_init(VM_MODE_P48V48_64K, true, true);
+		}
+	}
+#endif
+#ifdef __s390x__
+	guest_mode_init(VM_MODE_P40V48_4K, true, true);
+#endif
+
+	while ((opt = getopt(argc, argv, "hi:I:p:m:M:")) != -1) {
+		switch (opt) {
+		case 'i':
+			iterations = strtol(optarg, NULL, 10);
+			break;
+		case 'I':
+			interval = strtol(optarg, NULL, 10);
+			break;
+		case 'p':
+			phys_offset = strtoull(optarg, NULL, 0);
+			break;
+		case 'm':
+			/* The first -m replaces the default "all enabled" set */
+			if (!mode_selected) {
+				for (i = 0; i < NUM_VM_MODES; ++i)
+					guest_modes[i].enabled = false;
+				mode_selected = true;
+			}
+			mode = strtoul(optarg, NULL, 10);
+			/* mode is unsigned, so print it with %u */
+			TEST_ASSERT(mode < NUM_VM_MODES,
+				    "Guest mode ID %u too big", mode);
+			guest_modes[mode].enabled = true;
+			break;
+		case 'M':
+			if (!strcmp(optarg, "all")) {
+				host_log_mode_option = LOG_MODE_ALL;
+				break;
+			}
+			for (i = 0; i < LOG_MODE_NUM; i++) {
+				if (!strcmp(optarg, log_modes[i].name)) {
+					pr_info("Setting log mode to: '%s'\n",
+						optarg);
+					host_log_mode_option = i;
+					break;
+				}
+			}
+			/* The loop ran to completion: no name matched */
+			if (i == LOG_MODE_NUM) {
+				printf("Log mode '%s' invalid. Please choose "
+				       "from: ", optarg);
+				log_modes_dump();
+				exit(1);
+			}
+			break;
+		case 'h':
+		default:
+			help(argv[0]);
+			break;
+		}
+	}
+
+	TEST_ASSERT(iterations > 2, "Iterations must be greater than two");
+	TEST_ASSERT(interval > 0, "Interval must be greater than zero");
+
+	/* Both values are unsigned long; cast to match PRIu64 */
+	pr_info("Test iterations: %"PRIu64", interval: %"PRIu64" (ms)\n",
+		(uint64_t)iterations, (uint64_t)interval);
+
+	srandom(time(0));
+
+	for (i = 0; i < NUM_VM_MODES; ++i) {
+		if (!guest_modes[i].enabled)
+			continue;
+		TEST_ASSERT(guest_modes[i].supported,
+			    "Guest mode ID %d (%s) not supported.",
+			    i, vm_guest_mode_string(i));
+		if (host_log_mode_option == LOG_MODE_ALL) {
+			/* Run each log mode */
+			for (j = 0; j < LOG_MODE_NUM; j++) {
+				pr_info("Testing Log Mode '%s'\n",
+					log_modes[j].name);
+				host_log_mode = j;
+				run_test(i, iterations, interval, phys_offset);
+			}
+		} else {
+			host_log_mode = host_log_mode_option;
+			run_test(i, iterations, interval, phys_offset);
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h
new file mode 100644
index 000000000..b7fa0c855
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/aarch64/processor.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * AArch64 processor specific defines
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include "kvm_util.h"
+
+
+#define ARM64_CORE_REG(x) (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
+ KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
+
+#define CPACR_EL1 3, 0, 1, 0, 2
+#define TCR_EL1 3, 0, 2, 0, 2
+#define MAIR_EL1 3, 0, 10, 2, 0
+#define TTBR0_EL1 3, 0, 2, 0, 0
+#define SCTLR_EL1 3, 0, 1, 0, 0
+
+/*
+ * Default MAIR
+ * index attribute
+ * DEVICE_nGnRnE 0 0000:0000
+ * DEVICE_nGnRE 1 0000:0100
+ * DEVICE_GRE 2 0000:1100
+ * NORMAL_NC 3 0100:0100
+ * NORMAL 4 1111:1111
+ * NORMAL_WT 5 1011:1011
+ */
+#define DEFAULT_MAIR_EL1 ((0x00ul << (0 * 8)) | \
+ (0x04ul << (1 * 8)) | \
+ (0x0cul << (2 * 8)) | \
+ (0x44ul << (3 * 8)) | \
+ (0xfful << (4 * 8)) | \
+ (0xbbul << (5 * 8)))
+
+/*
+ * Issue KVM_GET_ONE_REG on vcpu @vcpuid of @vm for register @id, with @addr
+ * as the address the ioctl is given for the value.
+ */
+static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr)
+{
+	struct kvm_one_reg reg = {
+		.id = id,
+		.addr = (uint64_t)addr,
+	};
+
+	vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, &reg);
+}
+
+/*
+ * Issue KVM_SET_ONE_REG on vcpu @vcpuid of @vm for register @id, passing
+ * the address of the local copy of @val as the value location.
+ */
+static inline void set_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t val)
+{
+	struct kvm_one_reg reg = {
+		.id = id,
+		.addr = (uint64_t)&val,
+	};
+
+	vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, &reg);
+}
+
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init);
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_init *init, void *guest_code);
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/evmcs.h b/tools/testing/selftests/kvm/include/evmcs.h
new file mode 100644
index 000000000..a034438b6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/evmcs.h
@@ -0,0 +1,1102 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * tools/testing/selftests/kvm/include/evmcs.h
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ */
+
+#ifndef SELFTEST_KVM_EVMCS_H
+#define SELFTEST_KVM_EVMCS_H
+
+#include <stdint.h>
+#include "vmx.h"
+
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#define EVMCS_VERSION 1
+
+extern bool enable_evmcs;
+
+/*
+ * Page reached through current_vp_assist. The helpers in this header only
+ * touch enlighten_vmentry and current_nested_vmcs.
+ *
+ * NOTE(review): the field order presumably mirrors a layout defined outside
+ * this file; do not reorder or resize members without checking it.
+ */
+struct hv_vp_assist_page {
+ __u32 apic_assist;
+ __u32 reserved;
+ __u64 vtl_control[2];
+ __u64 nested_enlightenments_control[2];
+ /* Set to 1 by evmcs_vmptrld() */
+ __u32 enlighten_vmentry;
+ /* Physical address stored by evmcs_vmptrld(), read by evmcs_vmptrst() */
+ __u64 current_nested_vmcs;
+};
+
+/*
+ * In-memory enlightened VMCS reached through current_evmcs. evmcs_vmread()
+ * maps VMCS field encodings onto the members of this structure.
+ *
+ * NOTE(review): member order and the explicit padding* members presumably
+ * mirror a layout defined outside this file; do not reorder, resize or drop
+ * members without checking it.
+ */
+struct hv_enlightened_vmcs {
+ u32 revision_id;
+ u32 abort;
+
+ u16 host_es_selector;
+ u16 host_cs_selector;
+ u16 host_ss_selector;
+ u16 host_ds_selector;
+ u16 host_fs_selector;
+ u16 host_gs_selector;
+ u16 host_tr_selector;
+
+ u64 host_ia32_pat;
+ u64 host_ia32_efer;
+
+ u64 host_cr0;
+ u64 host_cr3;
+ u64 host_cr4;
+
+ u64 host_ia32_sysenter_esp;
+ u64 host_ia32_sysenter_eip;
+ u64 host_rip;
+ u32 host_ia32_sysenter_cs;
+
+ u32 pin_based_vm_exec_control;
+ u32 vm_exit_controls;
+ u32 secondary_vm_exec_control;
+
+ u64 io_bitmap_a;
+ u64 io_bitmap_b;
+ u64 msr_bitmap;
+
+ u16 guest_es_selector;
+ u16 guest_cs_selector;
+ u16 guest_ss_selector;
+ u16 guest_ds_selector;
+ u16 guest_fs_selector;
+ u16 guest_gs_selector;
+ u16 guest_ldtr_selector;
+ u16 guest_tr_selector;
+
+ u32 guest_es_limit;
+ u32 guest_cs_limit;
+ u32 guest_ss_limit;
+ u32 guest_ds_limit;
+ u32 guest_fs_limit;
+ u32 guest_gs_limit;
+ u32 guest_ldtr_limit;
+ u32 guest_tr_limit;
+ u32 guest_gdtr_limit;
+ u32 guest_idtr_limit;
+
+ u32 guest_es_ar_bytes;
+ u32 guest_cs_ar_bytes;
+ u32 guest_ss_ar_bytes;
+ u32 guest_ds_ar_bytes;
+ u32 guest_fs_ar_bytes;
+ u32 guest_gs_ar_bytes;
+ u32 guest_ldtr_ar_bytes;
+ u32 guest_tr_ar_bytes;
+
+ u64 guest_es_base;
+ u64 guest_cs_base;
+ u64 guest_ss_base;
+ u64 guest_ds_base;
+ u64 guest_fs_base;
+ u64 guest_gs_base;
+ u64 guest_ldtr_base;
+ u64 guest_tr_base;
+ u64 guest_gdtr_base;
+ u64 guest_idtr_base;
+
+ u64 padding64_1[3];
+
+ u64 vm_exit_msr_store_addr;
+ u64 vm_exit_msr_load_addr;
+ u64 vm_entry_msr_load_addr;
+
+ u64 cr3_target_value0;
+ u64 cr3_target_value1;
+ u64 cr3_target_value2;
+ u64 cr3_target_value3;
+
+ u32 page_fault_error_code_mask;
+ u32 page_fault_error_code_match;
+
+ u32 cr3_target_count;
+ u32 vm_exit_msr_store_count;
+ u32 vm_exit_msr_load_count;
+ u32 vm_entry_msr_load_count;
+
+ u64 tsc_offset;
+ u64 virtual_apic_page_addr;
+ u64 vmcs_link_pointer;
+
+ u64 guest_ia32_debugctl;
+ u64 guest_ia32_pat;
+ u64 guest_ia32_efer;
+
+ u64 guest_pdptr0;
+ u64 guest_pdptr1;
+ u64 guest_pdptr2;
+ u64 guest_pdptr3;
+
+ u64 guest_pending_dbg_exceptions;
+ u64 guest_sysenter_esp;
+ u64 guest_sysenter_eip;
+
+ u32 guest_activity_state;
+ u32 guest_sysenter_cs;
+
+ u64 cr0_guest_host_mask;
+ u64 cr4_guest_host_mask;
+ u64 cr0_read_shadow;
+ u64 cr4_read_shadow;
+ u64 guest_cr0;
+ u64 guest_cr3;
+ u64 guest_cr4;
+ u64 guest_dr7;
+
+ u64 host_fs_base;
+ u64 host_gs_base;
+ u64 host_tr_base;
+ u64 host_gdtr_base;
+ u64 host_idtr_base;
+ u64 host_rsp;
+
+ u64 ept_pointer;
+
+ u16 virtual_processor_id;
+ u16 padding16[3];
+
+ u64 padding64_2[5];
+ u64 guest_physical_address;
+
+ u32 vm_instruction_error;
+ u32 vm_exit_reason;
+ u32 vm_exit_intr_info;
+ u32 vm_exit_intr_error_code;
+ u32 idt_vectoring_info_field;
+ u32 idt_vectoring_error_code;
+ u32 vm_exit_instruction_len;
+ u32 vmx_instruction_info;
+
+ u64 exit_qualification;
+ u64 exit_io_instruction_ecx;
+ u64 exit_io_instruction_esi;
+ u64 exit_io_instruction_edi;
+ u64 exit_io_instruction_eip;
+
+ u64 guest_linear_address;
+ u64 guest_rsp;
+ u64 guest_rflags;
+
+ u32 guest_interruptibility_info;
+ u32 cpu_based_vm_exec_control;
+ u32 exception_bitmap;
+ u32 vm_entry_controls;
+ u32 vm_entry_intr_info_field;
+ u32 vm_entry_exception_error_code;
+ u32 vm_entry_instruction_len;
+ u32 tpr_threshold;
+
+ u64 guest_rip;
+
+ u32 hv_clean_fields;
+ u32 hv_padding_32;
+ u32 hv_synthetic_controls;
+ /* Three bitfields declared on u32, 1 + 1 + 30 bits wide */
+ struct {
+ u32 nested_flush_hypercall:1;
+ u32 msr_bitmap:1;
+ u32 reserved:30;
+ } hv_enlightenments_control;
+ u32 hv_vp_id;
+
+ u64 hv_vm_id;
+ u64 partition_assist_page;
+ u64 padding64_4[4];
+ u64 guest_bndcfgs;
+ u64 padding64_5[7];
+ u64 xss_exit_bitmap;
+ u64 padding64_6[7];
+};
+
+#define HV_X64_MSR_VP_ASSIST_PAGE 0x40000073
+#define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12
+#define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \
+ (~((1ull << HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
+
+extern struct hv_enlightened_vmcs *current_evmcs;
+extern struct hv_vp_assist_page *current_vp_assist;
+
+int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id);
+
+/*
+ * Write HV_X64_MSR_VP_ASSIST_PAGE with the masked @vp_assist_pa and the
+ * enable bit, remember @vp_assist in current_vp_assist and set
+ * enable_evmcs. Always returns 0.
+ */
+static inline int enable_vp_assist(uint64_t vp_assist_pa, void *vp_assist)
+{
+	u64 msr_val = vp_assist_pa & HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK;
+
+	msr_val |= HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+	wrmsr(HV_X64_MSR_VP_ASSIST_PAGE, msr_val);
+
+	current_vp_assist = vp_assist;
+	enable_evmcs = true;
+
+	return 0;
+}
+
+/*
+ * Make @vmcs the current enlightened VMCS: store @vmcs_pa in the VP assist
+ * page, set its enlighten_vmentry field to 1 and point current_evmcs at
+ * @vmcs. Always returns 0.
+ */
+static inline int evmcs_vmptrld(uint64_t vmcs_pa, void *vmcs)
+{
+	struct hv_vp_assist_page *assist = current_vp_assist;
+
+	assist->current_nested_vmcs = vmcs_pa;
+	assist->enlighten_vmentry = 1;
+	current_evmcs = vmcs;
+
+	return 0;
+}
+
+/*
+ * Store into *@value the address held in the VP assist page's
+ * current_nested_vmcs field, with the HV_X64_MSR_VP_ASSIST_PAGE_ENABLE bit
+ * cleared. Always returns 0.
+ */
+static inline int evmcs_vmptrst(uint64_t *value)
+{
+	uint64_t vmcs_pa = current_vp_assist->current_nested_vmcs;
+
+	vmcs_pa &= ~HV_X64_MSR_VP_ASSIST_PAGE_ENABLE;
+	*value = vmcs_pa;
+
+	return 0;
+}
+
+static inline int evmcs_vmread(uint64_t encoding, uint64_t *value)
+{
+ switch (encoding) {
+ case GUEST_RIP:
+ *value = current_evmcs->guest_rip;
+ break;
+ case GUEST_RSP:
+ *value = current_evmcs->guest_rsp;
+ break;
+ case GUEST_RFLAGS:
+ *value = current_evmcs->guest_rflags;
+ break;
+ case HOST_IA32_PAT:
+ *value = current_evmcs->host_ia32_pat;
+ break;
+ case HOST_IA32_EFER:
+ *value = current_evmcs->host_ia32_efer;
+ break;
+ case HOST_CR0:
+ *value = current_evmcs->host_cr0;
+ break;
+ case HOST_CR3:
+ *value = current_evmcs->host_cr3;
+ break;
+ case HOST_CR4:
+ *value = current_evmcs->host_cr4;
+ break;
+ case HOST_IA32_SYSENTER_ESP:
+ *value = current_evmcs->host_ia32_sysenter_esp;
+ break;
+ case HOST_IA32_SYSENTER_EIP:
+ *value = current_evmcs->host_ia32_sysenter_eip;
+ break;
+ case HOST_RIP:
+ *value = current_evmcs->host_rip;
+ break;
+ case IO_BITMAP_A:
+ *value = current_evmcs->io_bitmap_a;
+ break;
+ case IO_BITMAP_B:
+ *value = current_evmcs->io_bitmap_b;
+ break;
+ case MSR_BITMAP:
+ *value = current_evmcs->msr_bitmap;
+ break;
+ case GUEST_ES_BASE:
+ *value = current_evmcs->guest_es_base;
+ break;
+ case GUEST_CS_BASE:
+ *value = current_evmcs->guest_cs_base;
+ break;
+ case GUEST_SS_BASE:
+ *value = current_evmcs->guest_ss_base;
+ break;
+ case GUEST_DS_BASE:
+ *value = current_evmcs->guest_ds_base;
+ break;
+ case GUEST_FS_BASE:
+ *value = current_evmcs->guest_fs_base;
+ break;
+ case GUEST_GS_BASE:
+ *value = current_evmcs->guest_gs_base;
+ break;
+ case GUEST_LDTR_BASE:
+ *value = current_evmcs->guest_ldtr_base;
+ break;
+ case GUEST_TR_BASE:
+ *value = current_evmcs->guest_tr_base;
+ break;
+ case GUEST_GDTR_BASE:
+ *value = current_evmcs->guest_gdtr_base;
+ break;
+ case GUEST_IDTR_BASE:
+ *value = current_evmcs->guest_idtr_base;
+ break;
+ case TSC_OFFSET:
+ *value = current_evmcs->tsc_offset;
+ break;
+ case VIRTUAL_APIC_PAGE_ADDR:
+ *value = current_evmcs->virtual_apic_page_addr;
+ break;
+ case VMCS_LINK_POINTER:
+ *value = current_evmcs->vmcs_link_pointer;
+ break;
+ case GUEST_IA32_DEBUGCTL:
+ *value = current_evmcs->guest_ia32_debugctl;
+ break;
+ case GUEST_IA32_PAT:
+ *value = current_evmcs->guest_ia32_pat;
+ break;
+ case GUEST_IA32_EFER:
+ *value = current_evmcs->guest_ia32_efer;
+ break;
+ case GUEST_PDPTR0:
+ *value = current_evmcs->guest_pdptr0;
+ break;
+ case GUEST_PDPTR1:
+ *value = current_evmcs->guest_pdptr1;
+ break;
+ case GUEST_PDPTR2:
+ *value = current_evmcs->guest_pdptr2;
+ break;
+ case GUEST_PDPTR3:
+ *value = current_evmcs->guest_pdptr3;
+ break;
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ *value = current_evmcs->guest_pending_dbg_exceptions;
+ break;
+ case GUEST_SYSENTER_ESP:
+ *value = current_evmcs->guest_sysenter_esp;
+ break;
+ case GUEST_SYSENTER_EIP:
+ *value = current_evmcs->guest_sysenter_eip;
+ break;
+ case CR0_GUEST_HOST_MASK:
+ *value = current_evmcs->cr0_guest_host_mask;
+ break;
+ case CR4_GUEST_HOST_MASK:
+ *value = current_evmcs->cr4_guest_host_mask;
+ break;
+ case CR0_READ_SHADOW:
+ *value = current_evmcs->cr0_read_shadow;
+ break;
+ case CR4_READ_SHADOW:
+ *value = current_evmcs->cr4_read_shadow;
+ break;
+ case GUEST_CR0:
+ *value = current_evmcs->guest_cr0;
+ break;
+ case GUEST_CR3:
+ *value = current_evmcs->guest_cr3;
+ break;
+ case GUEST_CR4:
+ *value = current_evmcs->guest_cr4;
+ break;
+ case GUEST_DR7:
+ *value = current_evmcs->guest_dr7;
+ break;
+ case HOST_FS_BASE:
+ *value = current_evmcs->host_fs_base;
+ break;
+ case HOST_GS_BASE:
+ *value = current_evmcs->host_gs_base;
+ break;
+ case HOST_TR_BASE:
+ *value = current_evmcs->host_tr_base;
+ break;
+ case HOST_GDTR_BASE:
+ *value = current_evmcs->host_gdtr_base;
+ break;
+ case HOST_IDTR_BASE:
+ *value = current_evmcs->host_idtr_base;
+ break;
+ case HOST_RSP:
+ *value = current_evmcs->host_rsp;
+ break;
+ case EPT_POINTER:
+ *value = current_evmcs->ept_pointer;
+ break;
+ case GUEST_BNDCFGS:
+ *value = current_evmcs->guest_bndcfgs;
+ break;
+ case XSS_EXIT_BITMAP:
+ *value = current_evmcs->xss_exit_bitmap;
+ break;
+ case GUEST_PHYSICAL_ADDRESS:
+ *value = current_evmcs->guest_physical_address;
+ break;
+ case EXIT_QUALIFICATION:
+ *value = current_evmcs->exit_qualification;
+ break;
+ case GUEST_LINEAR_ADDRESS:
+ *value = current_evmcs->guest_linear_address;
+ break;
+ case VM_EXIT_MSR_STORE_ADDR:
+ *value = current_evmcs->vm_exit_msr_store_addr;
+ break;
+ case VM_EXIT_MSR_LOAD_ADDR:
+ *value = current_evmcs->vm_exit_msr_load_addr;
+ break;
+ case VM_ENTRY_MSR_LOAD_ADDR:
+ *value = current_evmcs->vm_entry_msr_load_addr;
+ break;
+ case CR3_TARGET_VALUE0:
+ *value = current_evmcs->cr3_target_value0;
+ break;
+ case CR3_TARGET_VALUE1:
+ *value = current_evmcs->cr3_target_value1;
+ break;
+ case CR3_TARGET_VALUE2:
+ *value = current_evmcs->cr3_target_value2;
+ break;
+ case CR3_TARGET_VALUE3:
+ *value = current_evmcs->cr3_target_value3;
+ break;
+ case TPR_THRESHOLD:
+ *value = current_evmcs->tpr_threshold;
+ break;
+ case GUEST_INTERRUPTIBILITY_INFO:
+ *value = current_evmcs->guest_interruptibility_info;
+ break;
+ case CPU_BASED_VM_EXEC_CONTROL:
+ *value = current_evmcs->cpu_based_vm_exec_control;
+ break;
+ case EXCEPTION_BITMAP:
+ *value = current_evmcs->exception_bitmap;
+ break;
+ case VM_ENTRY_CONTROLS:
+ *value = current_evmcs->vm_entry_controls;
+ break;
+ case VM_ENTRY_INTR_INFO_FIELD:
+ *value = current_evmcs->vm_entry_intr_info_field;
+ break;
+ case VM_ENTRY_EXCEPTION_ERROR_CODE:
+ *value = current_evmcs->vm_entry_exception_error_code;
+ break;
+ case VM_ENTRY_INSTRUCTION_LEN:
+ *value = current_evmcs->vm_entry_instruction_len;
+ break;
+ case HOST_IA32_SYSENTER_CS:
+ *value = current_evmcs->host_ia32_sysenter_cs;
+ break;
+ case PIN_BASED_VM_EXEC_CONTROL:
+ *value = current_evmcs->pin_based_vm_exec_control;
+ break;
+ case VM_EXIT_CONTROLS:
+ *value = current_evmcs->vm_exit_controls;
+ break;
+ case SECONDARY_VM_EXEC_CONTROL:
+ *value = current_evmcs->secondary_vm_exec_control;
+ break;
+ case GUEST_ES_LIMIT:
+ *value = current_evmcs->guest_es_limit;
+ break;
+ case GUEST_CS_LIMIT:
+ *value = current_evmcs->guest_cs_limit;
+ break;
+ case GUEST_SS_LIMIT:
+ *value = current_evmcs->guest_ss_limit;
+ break;
+ case GUEST_DS_LIMIT:
+ *value = current_evmcs->guest_ds_limit;
+ break;
+ case GUEST_FS_LIMIT:
+ *value = current_evmcs->guest_fs_limit;
+ break;
+ case GUEST_GS_LIMIT:
+ *value = current_evmcs->guest_gs_limit;
+ break;
+ case GUEST_LDTR_LIMIT:
+ *value = current_evmcs->guest_ldtr_limit;
+ break;
+ case GUEST_TR_LIMIT:
+ *value = current_evmcs->guest_tr_limit;
+ break;
+ case GUEST_GDTR_LIMIT:
+ *value = current_evmcs->guest_gdtr_limit;
+ break;
+ case GUEST_IDTR_LIMIT:
+ *value = current_evmcs->guest_idtr_limit;
+ break;
+ case GUEST_ES_AR_BYTES:
+ *value = current_evmcs->guest_es_ar_bytes;
+ break;
+ case GUEST_CS_AR_BYTES:
+ *value = current_evmcs->guest_cs_ar_bytes;
+ break;
+ case GUEST_SS_AR_BYTES:
+ *value = current_evmcs->guest_ss_ar_bytes;
+ break;
+ case GUEST_DS_AR_BYTES:
+ *value = current_evmcs->guest_ds_ar_bytes;
+ break;
+ case GUEST_FS_AR_BYTES:
+ *value = current_evmcs->guest_fs_ar_bytes;
+ break;
+ case GUEST_GS_AR_BYTES:
+ *value = current_evmcs->guest_gs_ar_bytes;
+ break;
+ case GUEST_LDTR_AR_BYTES:
+ *value = current_evmcs->guest_ldtr_ar_bytes;
+ break;
+ case GUEST_TR_AR_BYTES:
+ *value = current_evmcs->guest_tr_ar_bytes;
+ break;
+ case GUEST_ACTIVITY_STATE:
+ *value = current_evmcs->guest_activity_state;
+ break;
+ case GUEST_SYSENTER_CS:
+ *value = current_evmcs->guest_sysenter_cs;
+ break;
+ case VM_INSTRUCTION_ERROR:
+ *value = current_evmcs->vm_instruction_error;
+ break;
+ case VM_EXIT_REASON:
+ *value = current_evmcs->vm_exit_reason;
+ break;
+ case VM_EXIT_INTR_INFO:
+ *value = current_evmcs->vm_exit_intr_info;
+ break;
+ case VM_EXIT_INTR_ERROR_CODE:
+ *value = current_evmcs->vm_exit_intr_error_code;
+ break;
+ case IDT_VECTORING_INFO_FIELD:
+ *value = current_evmcs->idt_vectoring_info_field;
+ break;
+ case IDT_VECTORING_ERROR_CODE:
+ *value = current_evmcs->idt_vectoring_error_code;
+ break;
+ case VM_EXIT_INSTRUCTION_LEN:
+ *value = current_evmcs->vm_exit_instruction_len;
+ break;
+ case VMX_INSTRUCTION_INFO:
+ *value = current_evmcs->vmx_instruction_info;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MASK:
+ *value = current_evmcs->page_fault_error_code_mask;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MATCH:
+ *value = current_evmcs->page_fault_error_code_match;
+ break;
+ case CR3_TARGET_COUNT:
+ *value = current_evmcs->cr3_target_count;
+ break;
+ case VM_EXIT_MSR_STORE_COUNT:
+ *value = current_evmcs->vm_exit_msr_store_count;
+ break;
+ case VM_EXIT_MSR_LOAD_COUNT:
+ *value = current_evmcs->vm_exit_msr_load_count;
+ break;
+ case VM_ENTRY_MSR_LOAD_COUNT:
+ *value = current_evmcs->vm_entry_msr_load_count;
+ break;
+ case HOST_ES_SELECTOR:
+ *value = current_evmcs->host_es_selector;
+ break;
+ case HOST_CS_SELECTOR:
+ *value = current_evmcs->host_cs_selector;
+ break;
+ case HOST_SS_SELECTOR:
+ *value = current_evmcs->host_ss_selector;
+ break;
+ case HOST_DS_SELECTOR:
+ *value = current_evmcs->host_ds_selector;
+ break;
+ case HOST_FS_SELECTOR:
+ *value = current_evmcs->host_fs_selector;
+ break;
+ case HOST_GS_SELECTOR:
+ *value = current_evmcs->host_gs_selector;
+ break;
+ case HOST_TR_SELECTOR:
+ *value = current_evmcs->host_tr_selector;
+ break;
+ case GUEST_ES_SELECTOR:
+ *value = current_evmcs->guest_es_selector;
+ break;
+ case GUEST_CS_SELECTOR:
+ *value = current_evmcs->guest_cs_selector;
+ break;
+ case GUEST_SS_SELECTOR:
+ *value = current_evmcs->guest_ss_selector;
+ break;
+ case GUEST_DS_SELECTOR:
+ *value = current_evmcs->guest_ds_selector;
+ break;
+ case GUEST_FS_SELECTOR:
+ *value = current_evmcs->guest_fs_selector;
+ break;
+ case GUEST_GS_SELECTOR:
+ *value = current_evmcs->guest_gs_selector;
+ break;
+ case GUEST_LDTR_SELECTOR:
+ *value = current_evmcs->guest_ldtr_selector;
+ break;
+ case GUEST_TR_SELECTOR:
+ *value = current_evmcs->guest_tr_selector;
+ break;
+ case VIRTUAL_PROCESSOR_ID:
+ *value = current_evmcs->virtual_processor_id;
+ break;
+ default: return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Enlightened VMCS field write
+ *
+ * Input Args:
+ *   encoding - VMCS field encoding selecting the field to write
+ *   value - value to store
+ *
+ * Return:
+ *   0 when @encoding is handled, 1 when it has no case below.
+ *
+ * Stores @value into the member of the structure pointed to by
+ * current_evmcs that corresponds to @encoding. The assignment converts
+ * the 64-bit @value to the type of the destination member, so for members
+ * narrower than 64 bits the upper bits are dropped (member widths are
+ * declared elsewhere - confirm against the structure definition).
+ *
+ * This function does not touch current_evmcs->hv_clean_fields; in this
+ * file that member is reset to 0 by evmcs_vmlaunch()/evmcs_vmresume()
+ * right before entering the guest.
+ */
+static inline int evmcs_vmwrite(uint64_t encoding, uint64_t value)
+{
+ switch (encoding) {
+ case GUEST_RIP:
+ current_evmcs->guest_rip = value;
+ break;
+ case GUEST_RSP:
+ current_evmcs->guest_rsp = value;
+ break;
+ case GUEST_RFLAGS:
+ current_evmcs->guest_rflags = value;
+ break;
+ case HOST_IA32_PAT:
+ current_evmcs->host_ia32_pat = value;
+ break;
+ case HOST_IA32_EFER:
+ current_evmcs->host_ia32_efer = value;
+ break;
+ case HOST_CR0:
+ current_evmcs->host_cr0 = value;
+ break;
+ case HOST_CR3:
+ current_evmcs->host_cr3 = value;
+ break;
+ case HOST_CR4:
+ current_evmcs->host_cr4 = value;
+ break;
+ case HOST_IA32_SYSENTER_ESP:
+ current_evmcs->host_ia32_sysenter_esp = value;
+ break;
+ case HOST_IA32_SYSENTER_EIP:
+ current_evmcs->host_ia32_sysenter_eip = value;
+ break;
+ case HOST_RIP:
+ current_evmcs->host_rip = value;
+ break;
+ case IO_BITMAP_A:
+ current_evmcs->io_bitmap_a = value;
+ break;
+ case IO_BITMAP_B:
+ current_evmcs->io_bitmap_b = value;
+ break;
+ case MSR_BITMAP:
+ current_evmcs->msr_bitmap = value;
+ break;
+ case GUEST_ES_BASE:
+ current_evmcs->guest_es_base = value;
+ break;
+ case GUEST_CS_BASE:
+ current_evmcs->guest_cs_base = value;
+ break;
+ case GUEST_SS_BASE:
+ current_evmcs->guest_ss_base = value;
+ break;
+ case GUEST_DS_BASE:
+ current_evmcs->guest_ds_base = value;
+ break;
+ case GUEST_FS_BASE:
+ current_evmcs->guest_fs_base = value;
+ break;
+ case GUEST_GS_BASE:
+ current_evmcs->guest_gs_base = value;
+ break;
+ case GUEST_LDTR_BASE:
+ current_evmcs->guest_ldtr_base = value;
+ break;
+ case GUEST_TR_BASE:
+ current_evmcs->guest_tr_base = value;
+ break;
+ case GUEST_GDTR_BASE:
+ current_evmcs->guest_gdtr_base = value;
+ break;
+ case GUEST_IDTR_BASE:
+ current_evmcs->guest_idtr_base = value;
+ break;
+ case TSC_OFFSET:
+ current_evmcs->tsc_offset = value;
+ break;
+ case VIRTUAL_APIC_PAGE_ADDR:
+ current_evmcs->virtual_apic_page_addr = value;
+ break;
+ case VMCS_LINK_POINTER:
+ current_evmcs->vmcs_link_pointer = value;
+ break;
+ case GUEST_IA32_DEBUGCTL:
+ current_evmcs->guest_ia32_debugctl = value;
+ break;
+ case GUEST_IA32_PAT:
+ current_evmcs->guest_ia32_pat = value;
+ break;
+ case GUEST_IA32_EFER:
+ current_evmcs->guest_ia32_efer = value;
+ break;
+ case GUEST_PDPTR0:
+ current_evmcs->guest_pdptr0 = value;
+ break;
+ case GUEST_PDPTR1:
+ current_evmcs->guest_pdptr1 = value;
+ break;
+ case GUEST_PDPTR2:
+ current_evmcs->guest_pdptr2 = value;
+ break;
+ case GUEST_PDPTR3:
+ current_evmcs->guest_pdptr3 = value;
+ break;
+ case GUEST_PENDING_DBG_EXCEPTIONS:
+ current_evmcs->guest_pending_dbg_exceptions = value;
+ break;
+ case GUEST_SYSENTER_ESP:
+ current_evmcs->guest_sysenter_esp = value;
+ break;
+ case GUEST_SYSENTER_EIP:
+ current_evmcs->guest_sysenter_eip = value;
+ break;
+ case CR0_GUEST_HOST_MASK:
+ current_evmcs->cr0_guest_host_mask = value;
+ break;
+ case CR4_GUEST_HOST_MASK:
+ current_evmcs->cr4_guest_host_mask = value;
+ break;
+ case CR0_READ_SHADOW:
+ current_evmcs->cr0_read_shadow = value;
+ break;
+ case CR4_READ_SHADOW:
+ current_evmcs->cr4_read_shadow = value;
+ break;
+ case GUEST_CR0:
+ current_evmcs->guest_cr0 = value;
+ break;
+ case GUEST_CR3:
+ current_evmcs->guest_cr3 = value;
+ break;
+ case GUEST_CR4:
+ current_evmcs->guest_cr4 = value;
+ break;
+ case GUEST_DR7:
+ current_evmcs->guest_dr7 = value;
+ break;
+ case HOST_FS_BASE:
+ current_evmcs->host_fs_base = value;
+ break;
+ case HOST_GS_BASE:
+ current_evmcs->host_gs_base = value;
+ break;
+ case HOST_TR_BASE:
+ current_evmcs->host_tr_base = value;
+ break;
+ case HOST_GDTR_BASE:
+ current_evmcs->host_gdtr_base = value;
+ break;
+ case HOST_IDTR_BASE:
+ current_evmcs->host_idtr_base = value;
+ break;
+ case HOST_RSP:
+ current_evmcs->host_rsp = value;
+ break;
+ case EPT_POINTER:
+ current_evmcs->ept_pointer = value;
+ break;
+ case GUEST_BNDCFGS:
+ current_evmcs->guest_bndcfgs = value;
+ break;
+ case XSS_EXIT_BITMAP:
+ current_evmcs->xss_exit_bitmap = value;
+ break;
+ case GUEST_PHYSICAL_ADDRESS:
+ current_evmcs->guest_physical_address = value;
+ break;
+ case EXIT_QUALIFICATION:
+ current_evmcs->exit_qualification = value;
+ break;
+ case GUEST_LINEAR_ADDRESS:
+ current_evmcs->guest_linear_address = value;
+ break;
+ case VM_EXIT_MSR_STORE_ADDR:
+ current_evmcs->vm_exit_msr_store_addr = value;
+ break;
+ case VM_EXIT_MSR_LOAD_ADDR:
+ current_evmcs->vm_exit_msr_load_addr = value;
+ break;
+ case VM_ENTRY_MSR_LOAD_ADDR:
+ current_evmcs->vm_entry_msr_load_addr = value;
+ break;
+ case CR3_TARGET_VALUE0:
+ current_evmcs->cr3_target_value0 = value;
+ break;
+ case CR3_TARGET_VALUE1:
+ current_evmcs->cr3_target_value1 = value;
+ break;
+ case CR3_TARGET_VALUE2:
+ current_evmcs->cr3_target_value2 = value;
+ break;
+ case CR3_TARGET_VALUE3:
+ current_evmcs->cr3_target_value3 = value;
+ break;
+ case TPR_THRESHOLD:
+ current_evmcs->tpr_threshold = value;
+ break;
+ case GUEST_INTERRUPTIBILITY_INFO:
+ current_evmcs->guest_interruptibility_info = value;
+ break;
+ case CPU_BASED_VM_EXEC_CONTROL:
+ current_evmcs->cpu_based_vm_exec_control = value;
+ break;
+ case EXCEPTION_BITMAP:
+ current_evmcs->exception_bitmap = value;
+ break;
+ case VM_ENTRY_CONTROLS:
+ current_evmcs->vm_entry_controls = value;
+ break;
+ case VM_ENTRY_INTR_INFO_FIELD:
+ current_evmcs->vm_entry_intr_info_field = value;
+ break;
+ case VM_ENTRY_EXCEPTION_ERROR_CODE:
+ current_evmcs->vm_entry_exception_error_code = value;
+ break;
+ case VM_ENTRY_INSTRUCTION_LEN:
+ current_evmcs->vm_entry_instruction_len = value;
+ break;
+ case HOST_IA32_SYSENTER_CS:
+ current_evmcs->host_ia32_sysenter_cs = value;
+ break;
+ case PIN_BASED_VM_EXEC_CONTROL:
+ current_evmcs->pin_based_vm_exec_control = value;
+ break;
+ case VM_EXIT_CONTROLS:
+ current_evmcs->vm_exit_controls = value;
+ break;
+ case SECONDARY_VM_EXEC_CONTROL:
+ current_evmcs->secondary_vm_exec_control = value;
+ break;
+ case GUEST_ES_LIMIT:
+ current_evmcs->guest_es_limit = value;
+ break;
+ case GUEST_CS_LIMIT:
+ current_evmcs->guest_cs_limit = value;
+ break;
+ case GUEST_SS_LIMIT:
+ current_evmcs->guest_ss_limit = value;
+ break;
+ case GUEST_DS_LIMIT:
+ current_evmcs->guest_ds_limit = value;
+ break;
+ case GUEST_FS_LIMIT:
+ current_evmcs->guest_fs_limit = value;
+ break;
+ case GUEST_GS_LIMIT:
+ current_evmcs->guest_gs_limit = value;
+ break;
+ case GUEST_LDTR_LIMIT:
+ current_evmcs->guest_ldtr_limit = value;
+ break;
+ case GUEST_TR_LIMIT:
+ current_evmcs->guest_tr_limit = value;
+ break;
+ case GUEST_GDTR_LIMIT:
+ current_evmcs->guest_gdtr_limit = value;
+ break;
+ case GUEST_IDTR_LIMIT:
+ current_evmcs->guest_idtr_limit = value;
+ break;
+ case GUEST_ES_AR_BYTES:
+ current_evmcs->guest_es_ar_bytes = value;
+ break;
+ case GUEST_CS_AR_BYTES:
+ current_evmcs->guest_cs_ar_bytes = value;
+ break;
+ case GUEST_SS_AR_BYTES:
+ current_evmcs->guest_ss_ar_bytes = value;
+ break;
+ case GUEST_DS_AR_BYTES:
+ current_evmcs->guest_ds_ar_bytes = value;
+ break;
+ case GUEST_FS_AR_BYTES:
+ current_evmcs->guest_fs_ar_bytes = value;
+ break;
+ case GUEST_GS_AR_BYTES:
+ current_evmcs->guest_gs_ar_bytes = value;
+ break;
+ case GUEST_LDTR_AR_BYTES:
+ current_evmcs->guest_ldtr_ar_bytes = value;
+ break;
+ case GUEST_TR_AR_BYTES:
+ current_evmcs->guest_tr_ar_bytes = value;
+ break;
+ case GUEST_ACTIVITY_STATE:
+ current_evmcs->guest_activity_state = value;
+ break;
+ case GUEST_SYSENTER_CS:
+ current_evmcs->guest_sysenter_cs = value;
+ break;
+ case VM_INSTRUCTION_ERROR:
+ current_evmcs->vm_instruction_error = value;
+ break;
+ case VM_EXIT_REASON:
+ current_evmcs->vm_exit_reason = value;
+ break;
+ case VM_EXIT_INTR_INFO:
+ current_evmcs->vm_exit_intr_info = value;
+ break;
+ case VM_EXIT_INTR_ERROR_CODE:
+ current_evmcs->vm_exit_intr_error_code = value;
+ break;
+ case IDT_VECTORING_INFO_FIELD:
+ current_evmcs->idt_vectoring_info_field = value;
+ break;
+ case IDT_VECTORING_ERROR_CODE:
+ current_evmcs->idt_vectoring_error_code = value;
+ break;
+ case VM_EXIT_INSTRUCTION_LEN:
+ current_evmcs->vm_exit_instruction_len = value;
+ break;
+ case VMX_INSTRUCTION_INFO:
+ current_evmcs->vmx_instruction_info = value;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MASK:
+ current_evmcs->page_fault_error_code_mask = value;
+ break;
+ case PAGE_FAULT_ERROR_CODE_MATCH:
+ current_evmcs->page_fault_error_code_match = value;
+ break;
+ case CR3_TARGET_COUNT:
+ current_evmcs->cr3_target_count = value;
+ break;
+ case VM_EXIT_MSR_STORE_COUNT:
+ current_evmcs->vm_exit_msr_store_count = value;
+ break;
+ case VM_EXIT_MSR_LOAD_COUNT:
+ current_evmcs->vm_exit_msr_load_count = value;
+ break;
+ case VM_ENTRY_MSR_LOAD_COUNT:
+ current_evmcs->vm_entry_msr_load_count = value;
+ break;
+ case HOST_ES_SELECTOR:
+ current_evmcs->host_es_selector = value;
+ break;
+ case HOST_CS_SELECTOR:
+ current_evmcs->host_cs_selector = value;
+ break;
+ case HOST_SS_SELECTOR:
+ current_evmcs->host_ss_selector = value;
+ break;
+ case HOST_DS_SELECTOR:
+ current_evmcs->host_ds_selector = value;
+ break;
+ case HOST_FS_SELECTOR:
+ current_evmcs->host_fs_selector = value;
+ break;
+ case HOST_GS_SELECTOR:
+ current_evmcs->host_gs_selector = value;
+ break;
+ case HOST_TR_SELECTOR:
+ current_evmcs->host_tr_selector = value;
+ break;
+ case GUEST_ES_SELECTOR:
+ current_evmcs->guest_es_selector = value;
+ break;
+ case GUEST_CS_SELECTOR:
+ current_evmcs->guest_cs_selector = value;
+ break;
+ case GUEST_SS_SELECTOR:
+ current_evmcs->guest_ss_selector = value;
+ break;
+ case GUEST_DS_SELECTOR:
+ current_evmcs->guest_ds_selector = value;
+ break;
+ case GUEST_FS_SELECTOR:
+ current_evmcs->guest_fs_selector = value;
+ break;
+ case GUEST_GS_SELECTOR:
+ current_evmcs->guest_gs_selector = value;
+ break;
+ case GUEST_LDTR_SELECTOR:
+ current_evmcs->guest_ldtr_selector = value;
+ break;
+ case GUEST_TR_SELECTOR:
+ current_evmcs->guest_tr_selector = value;
+ break;
+ case VIRTUAL_PROCESSOR_ID:
+ current_evmcs->virtual_processor_id = value;
+ break;
+ default: return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Enter the guest with VMLAUNCH using the current enlightened VMCS.
+ *
+ * Input Args: None
+ *
+ * Return:
+ *   0 when execution reaches label 1 without running the instruction that
+ *   follows vmlaunch (presumably a VM exit resuming at host_rip - confirm
+ *   against the VMX host-state semantics), 1 when vmlaunch falls through
+ *   to the next instruction.
+ *
+ * Stack protocol of the asm block:
+ *   - rbp, rcx, rdx, rsi and rdi are saved by hand; rbx and r8-r15 are
+ *     declared as clobbers instead.
+ *   - A literal 0 is pushed last; this slot becomes the return value.
+ *   - The resulting rsp is stored in current_evmcs->host_rsp and the
+ *     address of label 1 in current_evmcs->host_rip.
+ *   - If vmlaunch falls through, "incq (%rsp)" turns the pushed 0 into 1.
+ *   - At label 1 the slot is popped into rax, which is bound to @ret, and
+ *     the saved registers are restored in reverse order.
+ */
+static inline int evmcs_vmlaunch(void)
+{
+ int ret;
+
+ /* NOTE(review): 0 presumably marks every eVMCS field as modified - confirm */
+ current_evmcs->hv_clean_fields = 0;
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "mov %%rsp, (%[host_rsp]);"
+ "lea 1f(%%rip), %%rax;"
+ "mov %%rax, (%[host_rip]);"
+ "vmlaunch;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"
+ ((uint64_t)&current_evmcs->host_rsp),
+ [host_rip]"r"
+ ((uint64_t)&current_evmcs->host_rip)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+/*
+ * Re-enter the guest with VMRESUME using the current enlightened VMCS.
+ *
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ *
+ * Input Args: None
+ *
+ * Return:
+ *   0 when execution reaches label 1 without running the instruction that
+ *   follows vmresume (presumably a VM exit resuming at host_rip - confirm
+ *   against the VMX host-state semantics), 1 when vmresume falls through
+ *   to the next instruction.
+ *
+ * The asm block saves rbp, rcx, rdx, rsi and rdi, pushes a 0 that serves
+ * as the return value, records rsp and the address of label 1 in
+ * current_evmcs->host_rsp / host_rip, and executes vmresume. A
+ * fall-through increments the pushed 0 to 1; label 1 pops that slot into
+ * rax (bound to @ret) and restores the saved registers.
+ */
+static inline int evmcs_vmresume(void)
+{
+ int ret;
+
+ /* NOTE(review): 0 presumably marks every eVMCS field as modified - confirm */
+ current_evmcs->hv_clean_fields = 0;
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "mov %%rsp, (%[host_rsp]);"
+ "lea 1f(%%rip), %%rax;"
+ "mov %%rax, (%[host_rip]);"
+ "vmresume;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"
+ ((uint64_t)&current_evmcs->host_rsp),
+ [host_rip]"r"
+ ((uint64_t)&current_evmcs->host_rip)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+#endif /* !SELFTEST_KVM_EVMCS_H */
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
new file mode 100644
index 000000000..7d29aa786
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -0,0 +1,348 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/kvm_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+#ifndef SELFTEST_KVM_UTIL_H
+#define SELFTEST_KVM_UTIL_H
+
+#include "test_util.h"
+
+#include "asm/kvm.h"
+#include "linux/list.h"
+#include "linux/kvm.h"
+#include <sys/ioctl.h>
+
+#include "sparsebit.h"
+
+
+/*
+ * Callers of kvm_util only have an incomplete/opaque description of the
+ * structure kvm_util is using to maintain the state of a VM.
+ */
+struct kvm_vm;
+
+typedef uint64_t vm_paddr_t; /* Virtual Machine (Guest) physical address */
+typedef uint64_t vm_vaddr_t; /* Virtual Machine (Guest) virtual address */
+
+/* Minimum allocated guest virtual and physical addresses */
+#define KVM_UTIL_MIN_VADDR 0x2000
+
+#define DEFAULT_GUEST_PHY_PAGES 512
+#define DEFAULT_GUEST_STACK_VADDR_MIN 0xab6000
+#define DEFAULT_STACK_PGS 5
+
+enum vm_guest_mode {
+ VM_MODE_P52V48_4K,
+ VM_MODE_P52V48_64K,
+ VM_MODE_P48V48_4K,
+ VM_MODE_P48V48_64K,
+ VM_MODE_P40V48_4K,
+ VM_MODE_P40V48_64K,
+ VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */
+ NUM_VM_MODES,
+};
+
+#if defined(__aarch64__)
+#define VM_MODE_DEFAULT VM_MODE_P40V48_4K
+#elif defined(__x86_64__)
+#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K
+#else
+#define VM_MODE_DEFAULT VM_MODE_P52V48_4K
+#endif
+
+#define vm_guest_mode_string(m) vm_guest_mode_string[m]
+extern const char * const vm_guest_mode_string[];
+
+enum vm_mem_backing_src_type {
+ VM_MEM_SRC_ANONYMOUS,
+ VM_MEM_SRC_ANONYMOUS_THP,
+ VM_MEM_SRC_ANONYMOUS_HUGETLB,
+};
+
+int kvm_check_cap(long cap);
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
+int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
+ struct kvm_enable_cap *cap);
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);
+
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
+void kvm_vm_free(struct kvm_vm *vmp);
+void kvm_vm_restart(struct kvm_vm *vmp, int perm);
+void kvm_vm_release(struct kvm_vm *vmp);
+void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+ uint64_t first_page, uint32_t num_pages);
+
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
+ size_t len);
+
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * VM VCPU Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VCPU specified by @vcpuid, within the VM
+ * given by @vm, to the FILE stream given by @stream.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid,
+ uint8_t indent);
+
+void vm_create_irqchip(struct kvm_vm *vm);
+
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+ enum vm_mem_backing_src_type src_type,
+ uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+ uint32_t flags);
+
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+ void *arg);
+int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid, unsigned long ioctl,
+ void *arg);
+void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa);
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+ uint32_t data_memslot, uint32_t pgd_memslot);
+void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ unsigned int npages, uint32_t pgd_memslot);
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva);
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
+
+/*
+ * Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Equivalent VM physical address
+ *
+ * Returns the VM physical address of the translated VM virtual
+ * address given by @gva.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);
+
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_guest_debug *debug);
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_mp_state *mp_state);
+struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs);
+
+/*
+ * VM VCPU Args Set
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * num - number of arguments
+ * ... - arguments, each of type uint64_t
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the first @num function input registers of the VCPU with @vcpuid,
+ * per the C calling convention of the architecture, to the values given
+ * as variable args. Each of the variable args is expected to be of type
+ * uint64_t. The maximum @num can be is specific to the architecture.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...);
+
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_sregs *sregs);
+void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_fpu *fpu);
+void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_fpu *fpu);
+void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg);
+#ifdef __KVM_HAVE_VCPU_EVENTS
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_vcpu_events *events);
+#endif
+#ifdef __x86_64__
+void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state);
+int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_nested_state *state, bool ignore_error);
+#endif
+
+const char *exit_reason_str(unsigned int exit_reason);
+
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
+
+/*
+ * VM Virtual Page Map
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vaddr - VM Virtual Address
+ * paddr - VM Physical Address
+ * memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within @vm, creates a virtual translation for the page starting
+ * at @vaddr to the page starting at @paddr.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t memslot);
+
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+ uint32_t memslot);
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+ vm_paddr_t paddr_min, uint32_t memslot);
+
+/*
+ * Create a VM with reasonable defaults
+ *
+ * Input Args:
+ * vcpuid - The id of the single VCPU to add to the VM.
+ * extra_mem_pages - The number of extra pages to add (this will
+ * decide how much extra space we will need to
+ * setup the page tables using memslot 0)
+ * guest_code - The vCPU's entry point
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+ void *guest_code);
+
+/*
+ * Adds a vCPU with reasonable defaults (e.g. a stack)
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - The id of the VCPU to add to the VM.
+ * guest_code - The vCPU's entry point
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code);
+
+bool vm_is_unrestricted_guest(struct kvm_vm *vm);
+
+unsigned int vm_get_page_size(struct kvm_vm *vm);
+unsigned int vm_get_page_shift(struct kvm_vm *vm);
+unsigned int vm_get_max_gfn(struct kvm_vm *vm);
+int vm_get_fd(struct kvm_vm *vm);
+
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size);
+unsigned int vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages);
+unsigned int vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages);
+/*
+ * Adjust a guest page count
+ *
+ * Input Args:
+ *   mode - guest mode the page count applies to
+ *   num_guest_pages - requested number of guest pages
+ *
+ * Return:
+ *   @num_guest_pages converted to host pages and back to guest pages; on
+ *   s390x the result is additionally rounded up to a multiple of 256.
+ */
+static inline unsigned int
+vm_adjust_num_guest_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+	unsigned int host_pages = vm_num_host_pages(mode, num_guest_pages);
+	unsigned int adjusted = vm_num_guest_pages(mode, host_pages);
+
+#ifdef __s390x__
+	/* s390 requires 1M aligned guest sizes */
+	adjusted += 255;
+	adjusted &= ~255;
+#endif
+	return adjusted;
+}
+
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end);
+
+struct kvm_dirty_log *
+allocate_kvm_dirty_log(struct kvm_userspace_memory_region *region);
+
+int vm_create_device(struct kvm_vm *vm, struct kvm_create_device *cd);
+
+/*
+ * Copy the host's value of the global @g into guest memory.
+ *
+ * The host address of @g is reinterpreted as a guest virtual address and
+ * translated with addr_gva2hva(); sizeof(g) bytes are then copied from the
+ * host object to that location. This presumably relies on the guest image
+ * placing @g at the same virtual address as in the host binary - confirm
+ * against how the guest is loaded.
+ */
+#define sync_global_to_guest(vm, g) ({ \
+ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
+ memcpy(_p, &(g), sizeof(g)); \
+})
+
+/*
+ * Copy the guest's value of the global @g back into the host object.
+ *
+ * Same address translation as sync_global_to_guest(), with the memcpy()
+ * direction reversed.
+ */
+#define sync_global_from_guest(vm, g) ({ \
+ typeof(g) *_p = addr_gva2hva(vm, (vm_vaddr_t)&(g)); \
+ memcpy(&(g), _p, sizeof(g)); \
+})
+
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid);
+
+/* Common ucalls */
+enum {
+ UCALL_NONE,
+ UCALL_SYNC,
+ UCALL_ABORT,
+ UCALL_DONE,
+};
+
+#define UCALL_MAX_ARGS 6
+
+struct ucall {
+ uint64_t cmd;
+ uint64_t args[UCALL_MAX_ARGS];
+};
+
+void ucall_init(struct kvm_vm *vm, void *arg);
+void ucall_uninit(struct kvm_vm *vm);
+void ucall(uint64_t cmd, int nargs, ...);
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc);
+
+/*
+ * Guest-side helpers built on ucall().
+ *
+ * GUEST_SYNC_ARGS - issue UCALL_SYNC with six arguments: the literal
+ *   "hello", @stage, then @arg1..@arg4.
+ * GUEST_SYNC - issue UCALL_SYNC with two arguments: "hello" and @stage.
+ * GUEST_DONE - issue UCALL_DONE with no arguments.
+ */
+#define GUEST_SYNC_ARGS(stage, arg1, arg2, arg3, arg4) \
+ ucall(UCALL_SYNC, 6, "hello", stage, arg1, arg2, arg3, arg4)
+#define GUEST_SYNC(stage) ucall(UCALL_SYNC, 2, "hello", stage)
+#define GUEST_DONE() ucall(UCALL_DONE, 0)
+/*
+ * When @_condition is false, issue UCALL_ABORT with 2 + @_nargs arguments:
+ * the string "Failed guest assert: " followed by the stringified
+ * condition, the current __LINE__, and then @_args. @_args must not be
+ * empty, otherwise the expansion ends in a dangling comma.
+ */
+#define __GUEST_ASSERT(_condition, _nargs, _args...) do { \
+ if (!(_condition)) \
+ ucall(UCALL_ABORT, 2 + _nargs, \
+ "Failed guest assert: " \
+ #_condition, __LINE__, _args); \
+} while (0)
+
+/*
+ * Assert with no extra values. A dummy 0 is passed as @_args because
+ * __GUEST_ASSERT() needs at least one variadic argument; it is not counted
+ * in @_nargs.
+ */
+#define GUEST_ASSERT(_condition) \
+ __GUEST_ASSERT((_condition), 0, 0)
+
+/* Assert variants that forward one to four extra values with the abort. */
+#define GUEST_ASSERT_1(_condition, arg1) \
+ __GUEST_ASSERT((_condition), 1, (arg1))
+
+#define GUEST_ASSERT_2(_condition, arg1, arg2) \
+ __GUEST_ASSERT((_condition), 2, (arg1), (arg2))
+
+#define GUEST_ASSERT_3(_condition, arg1, arg2, arg3) \
+ __GUEST_ASSERT((_condition), 3, (arg1), (arg2), (arg3))
+
+#define GUEST_ASSERT_4(_condition, arg1, arg2, arg3, arg4) \
+ __GUEST_ASSERT((_condition), 4, (arg1), (arg2), (arg3), (arg4))
+
+#endif /* SELFTEST_KVM_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h
new file mode 100644
index 000000000..261805205
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/perf_test_util.h
@@ -0,0 +1,198 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * tools/testing/selftests/kvm/include/perf_test_util.h
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_PERF_TEST_UTIL_H
+#define SELFTEST_KVM_PERF_TEST_UTIL_H
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define MAX_VCPUS 512
+
+#define PAGE_SHIFT_4K 12
+#define PTES_PER_4K_PT 512
+
+#define TEST_MEM_SLOT_INDEX 1
+
+/* Default guest test virtual memory offset */
+#define DEFAULT_GUEST_TEST_MEM 0xc0000000
+
+#define DEFAULT_PER_VCPU_MEM_SIZE (1 << 30) /* 1G */
+
+/*
+ * Guest physical memory offset of the testing memory slot.
+ * This will be set to the topmost valid physical address minus
+ * the test memory size.
+ */
+static uint64_t guest_test_phys_mem;
+
+/*
+ * Guest virtual memory offset of the testing memory slot.
+ * Must not conflict with identity mapped test code.
+ */
+static uint64_t guest_test_virt_mem = DEFAULT_GUEST_TEST_MEM;
+static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
+
+/* Number of VCPUs for the test */
+static int nr_vcpus = 1;
+
+/* Per-vCPU test parameters, filled in by add_vcpus(). */
+struct vcpu_args {
+ /* Guest virtual address where this vCPU's slice of test memory starts */
+ uint64_t gva;
+ /* Number of guest pages in this vCPU's slice */
+ uint64_t pages;
+
+ /*
+  * Only used by the host userspace part of the vCPU thread, apart from
+  * the sanity check at the top of guest_code().
+  */
+ int vcpu_id;
+};
+
+/* Test-wide parameters shared between the host and guest_code(). */
+struct perf_test_args {
+ /* VM under test; set by create_vm() */
+ struct kvm_vm *vm;
+ /* Host page size as reported by getpagesize(); set by create_vm() */
+ uint64_t host_page_size;
+ /* Guest page size as reported by vm_get_page_size(); set by create_vm() */
+ uint64_t guest_page_size;
+ /*
+  * guest_code() writes page i when i % wr_fract == 0 and only reads it
+  * otherwise. Not assigned anywhere in this header, so the test is
+  * expected to set a non-zero value before running the guest.
+  */
+ int wr_fract;
+
+ /* One entry per vCPU, indexed by vCPU id */
+ struct vcpu_args vcpu_args[MAX_VCPUS];
+};
+
+static struct perf_test_args perf_test_args;
+
+/*
+ * Guest entry point for every vCPU.
+ *
+ * Input Args:
+ *   vcpu_id - index of this vCPU's entry in perf_test_args.vcpu_args
+ *
+ * Return: never returns
+ *
+ * Loops forever over this vCPU's slice of test memory, touching the first
+ * 8 bytes of each page: page i is written when i is a multiple of
+ * perf_test_args.wr_fract and only read otherwise. After each full pass
+ * the guest issues GUEST_SYNC(1). wr_fract must be positive; zero would
+ * divide by zero in the modulo below.
+ */
+static void guest_code(uint32_t vcpu_id)
+{
+	struct vcpu_args *vcpu_args = &perf_test_args.vcpu_args[vcpu_id];
+	uint64_t gva;
+	uint64_t pages;
+	/*
+	 * Same width as @pages: an int index would be compared against a
+	 * 64-bit count and overflow for slices larger than INT_MAX pages.
+	 */
+	uint64_t i;
+
+	/* Make sure vCPU args data structure is not corrupt. */
+	GUEST_ASSERT(vcpu_args->vcpu_id == vcpu_id);
+
+	gva = vcpu_args->gva;
+	pages = vcpu_args->pages;
+
+	while (true) {
+		for (i = 0; i < pages; i++) {
+			uint64_t addr = gva + (i * perf_test_args.guest_page_size);
+
+			if (i % perf_test_args.wr_fract == 0)
+				*(uint64_t *)addr = 0x0123456789ABCDEF;
+			else
+				READ_ONCE(*(uint64_t *)addr);
+		}
+
+		GUEST_SYNC(1);
+	}
+}
+
+/*
+ * Create the VM used by the perf tests
+ *
+ * Input Args:
+ *   mode - guest mode to create the VM in
+ *   vcpus - number of vCPUs the test will add
+ *   vcpu_memory_bytes - size of each vCPU's slice of test memory; must be
+ *                       a multiple of both the guest and host page size
+ *
+ * Return:
+ *   Pointer to the created VM.
+ *
+ * Sizes memslot 0 for code, stacks and page tables, loads the test binary
+ * into the guest, then adds a separate memslot (TEST_MEM_SLOT_INDEX) of
+ * vcpus * vcpu_memory_bytes placed at the top of the guest physical
+ * address space and maps it at guest_test_virt_mem. Also fills in
+ * perf_test_args.vm, .guest_page_size and .host_page_size and sets
+ * guest_test_phys_mem.
+ */
+static struct kvm_vm *create_vm(enum vm_guest_mode mode, int vcpus,
+				uint64_t vcpu_memory_bytes)
+{
+	struct kvm_vm *vm;
+	uint64_t pages = DEFAULT_GUEST_PHY_PAGES;
+	uint64_t guest_num_pages;
+
+	/* Account for a few pages per-vCPU for stacks */
+	pages += DEFAULT_STACK_PGS * vcpus;
+
+	/*
+	 * Reserve twice the amount of memory needed to map the test region and
+	 * the page table / stacks region, at 4k, for page tables. Do the
+	 * calculation with 4K page size: the smallest of all archs. (e.g., 64K
+	 * page size guest will need even less memory for page tables).
+	 */
+	pages += (2 * pages) / PTES_PER_4K_PT;
+	pages += ((2 * vcpus * vcpu_memory_bytes) >> PAGE_SHIFT_4K) /
+		 PTES_PER_4K_PT;
+	pages = vm_adjust_num_guest_pages(mode, pages);
+
+	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode));
+
+	vm = vm_create(mode, pages, O_RDWR);
+	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+#ifdef __x86_64__
+	vm_create_irqchip(vm);
+#endif
+
+	perf_test_args.vm = vm;
+	perf_test_args.guest_page_size = vm_get_page_size(vm);
+	perf_test_args.host_page_size = getpagesize();
+
+	TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
+		    "Guest memory size is not guest page size aligned.");
+
+	guest_num_pages = (vcpus * vcpu_memory_bytes) /
+			  perf_test_args.guest_page_size;
+	guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages);
+
+	/*
+	 * If there should be more memory in the guest test region than there
+	 * can be pages in the guest, it will definitely cause problems.
+	 */
+	TEST_ASSERT(guest_num_pages < vm_get_max_gfn(vm),
+		    "Requested more guest memory than address space allows.\n"
+		    " guest pages: %lx max gfn: %x vcpus: %d wss: %lx\n",
+		    guest_num_pages, vm_get_max_gfn(vm), vcpus,
+		    vcpu_memory_bytes);
+
+	TEST_ASSERT(vcpu_memory_bytes % perf_test_args.host_page_size == 0,
+		    "Guest memory size is not host page size aligned.");
+
+	/* Place the test region at the top of guest physical memory */
+	guest_test_phys_mem = (vm_get_max_gfn(vm) - guest_num_pages) *
+			      perf_test_args.guest_page_size;
+	guest_test_phys_mem &= ~(perf_test_args.host_page_size - 1);
+
+#ifdef __s390x__
+	/* Align to 1M (segment size) */
+	guest_test_phys_mem &= ~((1 << 20) - 1);
+#endif
+
+	pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem);
+
+	/* Add an extra memory slot for testing */
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    guest_test_phys_mem,
+				    TEST_MEM_SLOT_INDEX,
+				    guest_num_pages, 0);
+
+	/* Map the test memory slot into the guest virtual address space */
+	virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages, 0);
+
+	ucall_init(vm, NULL);
+
+	return vm;
+}
+
+/*
+ * Add the test vCPUs to the VM
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpus - number of vCPUs to add, with ids 0..vcpus-1
+ *   vcpu_memory_bytes - size of each vCPU's slice of test memory
+ *
+ * Return: None
+ *
+ * Adds each vCPU with guest_code() as its entry point and records its id,
+ * the guest virtual start of its slice and the slice's page count in
+ * perf_test_args.vcpu_args[].
+ */
+static void add_vcpus(struct kvm_vm *vm, int vcpus, uint64_t vcpu_memory_bytes)
+{
+	int id;
+
+	for (id = 0; id < vcpus; id++) {
+		struct vcpu_args *args = &perf_test_args.vcpu_args[id];
+		/* Byte offset of this vCPU's slice within the test region */
+		uint64_t slice_off = id * vcpu_memory_bytes;
+		vm_paddr_t gpa_start;
+
+		vm_vcpu_add_default(vm, id, guest_code);
+
+#ifdef __x86_64__
+		vcpu_set_cpuid(vm, id, kvm_get_supported_cpuid());
+#endif
+
+		args->vcpu_id = id;
+		args->gva = guest_test_virt_mem + slice_off;
+		args->pages = vcpu_memory_bytes /
+			      perf_test_args.guest_page_size;
+
+		gpa_start = guest_test_phys_mem + slice_off;
+		pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n",
+			 id, gpa_start, gpa_start + vcpu_memory_bytes);
+	}
+}
+
+#endif /* SELFTEST_KVM_PERF_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h
new file mode 100644
index 000000000..e0e96a5f6
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/s390x/processor.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * s390x processor specific defines
+ */
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+/* Bits in the region/segment table entry */
+#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */
+#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */
+#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */
+#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
+#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */
+#define REGION_ENTRY_LENGTH 0x03 /* region third length */
+
+/* Bits in the page table entry */
+#define PAGE_INVALID 0x400 /* HW invalid bit */
+#define PAGE_PROTECT 0x200 /* HW read-only bit */
+#define PAGE_NOEXEC 0x100 /* HW no-execute bit */
+
+#endif
diff --git a/tools/testing/selftests/kvm/include/sparsebit.h b/tools/testing/selftests/kvm/include/sparsebit.h
new file mode 100644
index 000000000..12a9a4b9c
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/sparsebit.h
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/sparsebit.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Header file that describes API to the sparsebit library.
+ * This library provides a memory efficient means of storing
+ * the settings of bits indexed via a uint64_t. Memory usage
+ * is reasonable, significantly less than (2^64 / 8) bytes, as
+ * long as bits that are mostly set or mostly cleared are close
+ * to each other. This library is efficient in memory usage
+ * even in the case where most bits are set.
+ */
+
+#ifndef SELFTEST_KVM_SPARSEBIT_H
+#define SELFTEST_KVM_SPARSEBIT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct sparsebit;
+typedef uint64_t sparsebit_idx_t;
+typedef uint64_t sparsebit_num_t;
+
+struct sparsebit *sparsebit_alloc(void);
+void sparsebit_free(struct sparsebit **sbitp);
+void sparsebit_copy(struct sparsebit *dstp, struct sparsebit *src);
+
+bool sparsebit_is_set(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+bool sparsebit_is_clear(struct sparsebit *sbit, sparsebit_idx_t idx);
+bool sparsebit_is_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t idx, sparsebit_num_t num);
+sparsebit_num_t sparsebit_num_set(struct sparsebit *sbit);
+bool sparsebit_any_set(struct sparsebit *sbit);
+bool sparsebit_any_clear(struct sparsebit *sbit);
+bool sparsebit_all_set(struct sparsebit *sbit);
+bool sparsebit_all_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *sbit);
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *sbit, sparsebit_idx_t prev);
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *sbit,
+ sparsebit_idx_t start, sparsebit_num_t num);
+
+void sparsebit_set(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_set_num(struct sparsebit *sbitp, sparsebit_idx_t start,
+ sparsebit_num_t num);
+void sparsebit_set_all(struct sparsebit *sbitp);
+
+void sparsebit_clear(struct sparsebit *sbitp, sparsebit_idx_t idx);
+void sparsebit_clear_num(struct sparsebit *sbitp,
+ sparsebit_idx_t start, sparsebit_num_t num);
+void sparsebit_clear_all(struct sparsebit *sbitp);
+
+void sparsebit_dump(FILE *stream, struct sparsebit *sbit,
+ unsigned int indent);
+void sparsebit_validate_internal(struct sparsebit *sbit);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* SELFTEST_KVM_SPARSEBIT_H */
diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
new file mode 100644
index 000000000..ffffa5604
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/test_util.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/test_util.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_TEST_UTIL_H
+#define SELFTEST_KVM_TEST_UTIL_H
+
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include "kselftest.h"
+
+static inline int _no_printf(const char *format, ...) { return 0; } /* prints nothing, always returns 0 */
+ /* pr_debug() prints only when DEBUG is defined; pr_info() prints unless QUIET is defined. */
+#ifdef DEBUG
+#define pr_debug(...) printf(__VA_ARGS__)
+#else /* !DEBUG */
+#define pr_debug(...) _no_printf(__VA_ARGS__)
+#endif /* DEBUG */
+#ifndef QUIET
+#define pr_info(...) printf(__VA_ARGS__)
+#else /* QUIET */
+#define pr_info(...) _no_printf(__VA_ARGS__)
+#endif /* !QUIET */
+
+void print_skip(const char *fmt, ...) __attribute__((format(printf, 1, 2)));
+
+ssize_t test_write(int fd, const void *buf, size_t count);
+ssize_t test_read(int fd, void *buf, size_t count);
+int test_seq_read(const char *path, char **bufp, size_t *sizep);
+
+void test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...)
+ __attribute__((format(printf, 5, 6)));
+
+#define TEST_ASSERT(e, fmt, ...) \
+ test_assert((e), #e, __FILE__, __LINE__, fmt, ##__VA_ARGS__)
+
+#define ASSERT_EQ(a, b) do { /* fail via TEST_ASSERT() unless a == b */ \
+ typeof(a) __a = (a); /* evaluate each argument exactly once */ \
+ typeof(b) __b = (b); \
+ TEST_ASSERT(__a == __b, \
+ "ASSERT_EQ(%s, %s) failed.\n" \
+ "\t%s is %#lx\n" \
+ "\t%s is %#lx", \
+ #a, #b, #a, (unsigned long) __a, #b, (unsigned long) __b); /* both values are reported as unsigned long */ \
+} while (0)
+
+#define TEST_FAIL(fmt, ...) \
+ TEST_ASSERT(false, fmt, ##__VA_ARGS__)
+
+size_t parse_size(const char *size);
+
+int64_t timespec_to_ns(struct timespec ts);
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns);
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2);
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2);
+struct timespec timespec_diff_now(struct timespec start);
+struct timespec timespec_div(struct timespec ts, int divisor);
+
+#endif /* SELFTEST_KVM_TEST_UTIL_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h
new file mode 100644
index 000000000..8e61340b3
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/processor.h
@@ -0,0 +1,422 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/processor.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_PROCESSOR_H
+#define SELFTEST_KVM_PROCESSOR_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <asm/msr-index.h>
+
+#define X86_EFLAGS_FIXED (1u << 1)
+
+#define X86_CR4_VME (1ul << 0)
+#define X86_CR4_PVI (1ul << 1)
+#define X86_CR4_TSD (1ul << 2)
+#define X86_CR4_DE (1ul << 3)
+#define X86_CR4_PSE (1ul << 4)
+#define X86_CR4_PAE (1ul << 5)
+#define X86_CR4_MCE (1ul << 6)
+#define X86_CR4_PGE (1ul << 7)
+#define X86_CR4_PCE (1ul << 8)
+#define X86_CR4_OSFXSR (1ul << 9)
+#define X86_CR4_OSXMMEXCPT (1ul << 10)
+#define X86_CR4_UMIP (1ul << 11)
+#define X86_CR4_VMXE (1ul << 13)
+#define X86_CR4_SMXE (1ul << 14)
+#define X86_CR4_FSGSBASE (1ul << 16)
+#define X86_CR4_PCIDE (1ul << 17)
+#define X86_CR4_OSXSAVE (1ul << 18)
+#define X86_CR4_SMEP (1ul << 20)
+#define X86_CR4_SMAP (1ul << 21)
+#define X86_CR4_PKE (1ul << 22)
+
+#define UNEXPECTED_VECTOR_PORT 0xfff0u
+
+/* General Registers in 64-Bit Mode; uses <stdint.h> types like the rest of this header */
+struct gpr64_regs {
+ uint64_t rax;
+ uint64_t rcx;
+ uint64_t rdx;
+ uint64_t rbx;
+ uint64_t rsp;
+ uint64_t rbp;
+ uint64_t rsi;
+ uint64_t rdi;
+ uint64_t r8;
+ uint64_t r9;
+ uint64_t r10;
+ uint64_t r11;
+ uint64_t r12;
+ uint64_t r13;
+ uint64_t r14;
+ uint64_t r15;
+};
+
+struct desc64 {
+ uint16_t limit0;
+ uint16_t base0;
+ unsigned base1:8, type:4, s:1, dpl:2, p:1;
+ unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8;
+ uint32_t base3;
+ uint32_t zero1;
+} __attribute__((packed));
+
+struct desc_ptr { /* memory image that get_gdt()/get_idt() fill via sgdt/sidt */
+ uint16_t size;
+ uint64_t address;
+} __attribute__((packed)); /* packed: no padding between size and address */
+
+static inline uint64_t get_desc64_base(const struct desc64 *desc)
+{
+ return ((uint64_t)desc->base3 << 32) | ((uint64_t)desc->base2 << 24) |
+ ((uint64_t)desc->base1 << 16) | desc->base0; /* widened first: no int sign-extension */
+}
+
+static inline uint64_t rdtsc(void)
+{
+ uint32_t lo, hi;
+
+ /*
+ * RDTSC is bracketed by LFENCEs: the leading one waits (on Intel
+ * CPUs) for all earlier instructions to finish before the counter is
+ * sampled, and the trailing one makes sure RDTSC itself has executed
+ * before any later instruction starts.
+ */
+ __asm__ __volatile__("lfence; rdtsc; lfence" : "=a"(lo), "=d"(hi));
+
+ return ((uint64_t)hi << 32) | lo;
+}
+
+static inline uint64_t rdtscp(uint32_t *aux)
+{
+ uint32_t lo, hi;
+ /* EDX:EAX forms the return value; ECX is stored through @aux. */
+ __asm__ __volatile__("rdtscp" : "=a"(lo), "=d"(hi), "=c"(*aux));
+ return ((uint64_t)hi << 32) | lo;
+}
+
+static inline uint64_t rdmsr(uint32_t msr)
+{
+ uint32_t lo, hi;
+
+ /* ECX selects the MSR; the value comes back split across EDX:EAX. */
+ __asm__ __volatile__("rdmsr" : "=a"(lo), "=d"(hi) : "c"(msr) : "memory");
+ return ((uint64_t)hi << 32) | lo;
+}
+
+static inline void wrmsr(uint32_t msr, uint64_t value)
+{
+ /* The MSR index is passed in ECX and the value split as EDX:EAX. */
+ const uint32_t lo = (uint32_t)value;
+ const uint32_t hi = (uint32_t)(value >> 32);
+ __asm__ __volatile__("wrmsr" :: "a"(lo), "d"(hi), "c"(msr) : "memory");
+}
+
+
+static inline uint16_t inw(uint16_t port)
+{
+ uint16_t data;
+
+ /* 16-bit port read: port number is passed in DX, result lands in AX. */
+ __asm__ __volatile__("in %%dx, %%ax"
+ : "=a"(data)
+ : "d"(port));
+ return data;
+}
+
+static inline uint16_t get_es(void)
+{
+ uint16_t sel;
+
+ /* Copy the ES selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%es, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_cs(void)
+{
+ uint16_t sel;
+
+ /* Copy the CS selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%cs, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_ss(void)
+{
+ uint16_t sel;
+
+ /* Copy the SS selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%ss, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_ds(void)
+{
+ uint16_t sel;
+
+ /* Copy the DS selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%ds, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_fs(void)
+{
+ uint16_t sel;
+
+ /* Copy the FS selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%fs, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_gs(void)
+{
+ uint16_t sel;
+
+ /* Copy the GS selector into a register or memory operand. */
+ __asm__ __volatile__("mov %%gs, %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint16_t get_tr(void)
+{
+ uint16_t sel;
+
+ /* STR writes the task register selector to a register or memory. */
+ __asm__ __volatile__("str %0" : "=rm"(sel));
+ return sel;
+}
+
+static inline uint64_t get_cr0(void)
+{
+ uint64_t val;
+
+ /* Read CR0 into a general-purpose register. */
+ __asm__ __volatile__("mov %%cr0, %0" : "=r"(val));
+ return val;
+}
+
+static inline uint64_t get_cr3(void)
+{
+ uint64_t val;
+
+ /* Read CR3 into a general-purpose register. */
+ __asm__ __volatile__("mov %%cr3, %0" : "=r"(val));
+ return val;
+}
+
+static inline uint64_t get_cr4(void)
+{
+ uint64_t val;
+
+ /* Read CR4 into a general-purpose register. */
+ __asm__ __volatile__("mov %%cr4, %0" : "=r"(val));
+ return val;
+}
+
+static inline void set_cr4(uint64_t val)
+{
+ __asm__ __volatile__("mov %[val], %%cr4" : : [val]"r"(val) : "memory");
+}
+
+static inline struct desc_ptr get_gdt(void)
+{
+ struct desc_ptr dt;
+ /* SGDT stores its result directly into the memory operand. */
+ __asm__ __volatile__("sgdt %0" : "=m"(dt));
+ return dt;
+}
+
+static inline struct desc_ptr get_idt(void)
+{
+ struct desc_ptr dt;
+ /* SIDT stores its result directly into the memory operand. */
+ __asm__ __volatile__("sidt %0" : "=m"(dt));
+ return dt;
+}
+
+static inline void outl(uint16_t port, uint32_t value)
+{
+ __asm__ __volatile__("outl %%eax, %%dx" : : "a"(value), "d"(port));
+}
+
+#define SET_XMM(__var, __xmm) \
+ asm volatile("movq %0, %%"#__xmm : : "r"(__var) : #__xmm)
+
+static inline void set_xmm(int n, unsigned long val)
+{ /* Load val into register xmm<n> using the SET_XMM() macro. */
+ switch (n) { /* no default: an n outside 0..7 matches no case and does nothing */
+ case 0:
+ SET_XMM(val, xmm0);
+ break;
+ case 1:
+ SET_XMM(val, xmm1);
+ break;
+ case 2:
+ SET_XMM(val, xmm2);
+ break;
+ case 3:
+ SET_XMM(val, xmm3);
+ break;
+ case 4:
+ SET_XMM(val, xmm4);
+ break;
+ case 5:
+ SET_XMM(val, xmm5);
+ break;
+ case 6:
+ SET_XMM(val, xmm6);
+ break;
+ case 7:
+ SET_XMM(val, xmm7);
+ break;
+ }
+}
+
+typedef unsigned long v1di __attribute__ ((vector_size (8)));
+static inline unsigned long get_xmm(int n)
+{
+ assert(n >= 0 && n <= 7);
+ /* Explicit register variables: each local below is bound to one XMM register. */
+ register v1di xmm0 __asm__("%xmm0");
+ register v1di xmm1 __asm__("%xmm1");
+ register v1di xmm2 __asm__("%xmm2");
+ register v1di xmm3 __asm__("%xmm3");
+ register v1di xmm4 __asm__("%xmm4");
+ register v1di xmm5 __asm__("%xmm5");
+ register v1di xmm6 __asm__("%xmm6");
+ register v1di xmm7 __asm__("%xmm7");
+ switch (n) { /* NOTE(review): the locals are read without ever being assigned - confirm the compiler reads the live register */
+ case 0:
+ return (unsigned long)xmm0;
+ case 1:
+ return (unsigned long)xmm1;
+ case 2:
+ return (unsigned long)xmm2;
+ case 3:
+ return (unsigned long)xmm3;
+ case 4:
+ return (unsigned long)xmm4;
+ case 5:
+ return (unsigned long)xmm5;
+ case 6:
+ return (unsigned long)xmm6;
+ case 7:
+ return (unsigned long)xmm7;
+ }
+ return 0; /* only reachable when the assert above is compiled out */
+}
+
+bool is_intel_cpu(void);
+
+struct kvm_x86_state;
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_x86_state *state);
+
+struct kvm_msr_list *kvm_get_msr_index_list(void);
+
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void);
+void vcpu_set_cpuid(struct kvm_vm *vm, uint32_t vcpuid,
+ struct kvm_cpuid2 *cpuid);
+
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index);
+
+static inline struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_entry(uint32_t function)
+{ /* Convenience wrapper around kvm_get_supported_cpuid_index(). */
+ return kvm_get_supported_cpuid_index(function, 0); /* index is always 0 */
+}
+
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index);
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value);
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+ uint64_t msr_value);
+
+uint32_t kvm_get_cpuid_max_basic(void);
+uint32_t kvm_get_cpuid_max_extended(void);
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits);
+
+struct ex_regs {
+ uint64_t rax, rcx, rdx, rbx;
+ uint64_t rbp, rsi, rdi;
+ uint64_t r8, r9, r10, r11;
+ uint64_t r12, r13, r14, r15;
+ uint64_t vector;
+ uint64_t error_code;
+ uint64_t rip;
+ uint64_t cs;
+ uint64_t rflags;
+};
+
+void vm_init_descriptor_tables(struct kvm_vm *vm);
+void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_handle_exception(struct kvm_vm *vm, int vector,
+ void (*handler)(struct ex_regs *));
+
+/*
+ * set_cpuid() - overwrites a matching cpuid entry with the provided value.
+ * matches based on ent->function && ent->index. returns true
+ * if a match was found and successfully overwritten.
+ * @cpuid: the kvm cpuid list to modify.
+ * @ent: cpuid entry to insert
+ */
+bool set_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 *ent);
+
+uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
+ uint64_t a3);
+
+/*
+ * Basic CPU control in CR0
+ */
+#define X86_CR0_PE (1UL<<0) /* Protection Enable */
+#define X86_CR0_MP (1UL<<1) /* Monitor Coprocessor */
+#define X86_CR0_EM (1UL<<2) /* Emulation */
+#define X86_CR0_TS (1UL<<3) /* Task Switched */
+#define X86_CR0_ET (1UL<<4) /* Extension Type */
+#define X86_CR0_NE (1UL<<5) /* Numeric Error */
+#define X86_CR0_WP (1UL<<16) /* Write Protect */
+#define X86_CR0_AM (1UL<<18) /* Alignment Mask */
+#define X86_CR0_NW (1UL<<29) /* Not Write-through */
+#define X86_CR0_CD (1UL<<30) /* Cache Disable */
+#define X86_CR0_PG (1UL<<31) /* Paging */
+
+#define APIC_BASE_MSR 0x800
+#define X2APIC_ENABLE (1UL << 10)
+#define APIC_ICR 0x300
+#define APIC_DEST_SELF 0x40000
+#define APIC_DEST_ALLINC 0x80000
+#define APIC_DEST_ALLBUT 0xC0000
+#define APIC_ICR_RR_MASK 0x30000
+#define APIC_ICR_RR_INVALID 0x00000
+#define APIC_ICR_RR_INPROG 0x10000
+#define APIC_ICR_RR_VALID 0x20000
+#define APIC_INT_LEVELTRIG 0x08000
+#define APIC_INT_ASSERT 0x04000
+#define APIC_ICR_BUSY 0x01000
+#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x00000
+#define APIC_DM_FIXED 0x00000
+#define APIC_DM_FIXED_MASK 0x00700
+#define APIC_DM_LOWEST 0x00100
+#define APIC_DM_SMI 0x00200
+#define APIC_DM_REMRD 0x00300
+#define APIC_DM_NMI 0x00400
+#define APIC_DM_INIT 0x00500
+#define APIC_DM_STARTUP 0x00600
+#define APIC_DM_EXTINT 0x00700
+#define APIC_VECTOR_MASK 0x000FF
+#define APIC_ICR2 0x310
+
+/* VMX_EPT_VPID_CAP bits */
+#define VMX_EPT_VPID_CAP_AD_BITS (1ULL << 21)
+
+#endif /* SELFTEST_KVM_PROCESSOR_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm.h b/tools/testing/selftests/kvm/include/x86_64/svm.h
new file mode 100644
index 000000000..f4ea2355d
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/svm.h
@@ -0,0 +1,297 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/svm.h
+ * This is a copy of arch/x86/include/asm/svm.h
+ *
+ */
+
+#ifndef SELFTEST_KVM_SVM_H
+#define SELFTEST_KVM_SVM_H
+
+enum {
+ INTERCEPT_INTR,
+ INTERCEPT_NMI,
+ INTERCEPT_SMI,
+ INTERCEPT_INIT,
+ INTERCEPT_VINTR,
+ INTERCEPT_SELECTIVE_CR0,
+ INTERCEPT_STORE_IDTR,
+ INTERCEPT_STORE_GDTR,
+ INTERCEPT_STORE_LDTR,
+ INTERCEPT_STORE_TR,
+ INTERCEPT_LOAD_IDTR,
+ INTERCEPT_LOAD_GDTR,
+ INTERCEPT_LOAD_LDTR,
+ INTERCEPT_LOAD_TR,
+ INTERCEPT_RDTSC,
+ INTERCEPT_RDPMC,
+ INTERCEPT_PUSHF,
+ INTERCEPT_POPF,
+ INTERCEPT_CPUID,
+ INTERCEPT_RSM,
+ INTERCEPT_IRET,
+ INTERCEPT_INTn,
+ INTERCEPT_INVD,
+ INTERCEPT_PAUSE,
+ INTERCEPT_HLT,
+ INTERCEPT_INVLPG,
+ INTERCEPT_INVLPGA,
+ INTERCEPT_IOIO_PROT,
+ INTERCEPT_MSR_PROT,
+ INTERCEPT_TASK_SWITCH,
+ INTERCEPT_FERR_FREEZE,
+ INTERCEPT_SHUTDOWN,
+ INTERCEPT_VMRUN,
+ INTERCEPT_VMMCALL,
+ INTERCEPT_VMLOAD,
+ INTERCEPT_VMSAVE,
+ INTERCEPT_STGI,
+ INTERCEPT_CLGI,
+ INTERCEPT_SKINIT,
+ INTERCEPT_RDTSCP,
+ INTERCEPT_ICEBP,
+ INTERCEPT_WBINVD,
+ INTERCEPT_MONITOR,
+ INTERCEPT_MWAIT,
+ INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
+ INTERCEPT_RDPRU,
+};
+
+
+struct __attribute__ ((__packed__)) vmcb_control_area {
+ u32 intercept_cr;
+ u32 intercept_dr;
+ u32 intercept_exceptions;
+ u64 intercept;
+ u8 reserved_1[40];
+ u16 pause_filter_thresh;
+ u16 pause_filter_count;
+ u64 iopm_base_pa;
+ u64 msrpm_base_pa;
+ u64 tsc_offset;
+ u32 asid;
+ u8 tlb_ctl;
+ u8 reserved_2[3];
+ u32 int_ctl;
+ u32 int_vector;
+ u32 int_state;
+ u8 reserved_3[4];
+ u32 exit_code;
+ u32 exit_code_hi;
+ u64 exit_info_1;
+ u64 exit_info_2;
+ u32 exit_int_info;
+ u32 exit_int_info_err;
+ u64 nested_ctl;
+ u64 avic_vapic_bar;
+ u8 reserved_4[8];
+ u32 event_inj;
+ u32 event_inj_err;
+ u64 nested_cr3;
+ u64 virt_ext;
+ u32 clean;
+ u32 reserved_5;
+ u64 next_rip;
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u64 avic_backing_page; /* Offset 0xe0 */
+ u8 reserved_6[8]; /* Offset 0xe8 */
+ u64 avic_logical_id; /* Offset 0xf0 */
+ u64 avic_physical_id; /* Offset 0xf8 */
+ u8 reserved_7[768];
+};
+
+
+#define TLB_CONTROL_DO_NOTHING 0
+#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
+
+#define V_TPR_MASK 0x0f
+
+#define V_IRQ_SHIFT 8
+#define V_IRQ_MASK (1 << V_IRQ_SHIFT)
+
+#define V_GIF_SHIFT 9
+#define V_GIF_MASK (1 << V_GIF_SHIFT)
+
+#define V_INTR_PRIO_SHIFT 16
+#define V_INTR_PRIO_MASK (0x0f << V_INTR_PRIO_SHIFT)
+
+#define V_IGN_TPR_SHIFT 20
+#define V_IGN_TPR_MASK (1 << V_IGN_TPR_SHIFT)
+
+#define V_INTR_MASKING_SHIFT 24
+#define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT)
+
+#define V_GIF_ENABLE_SHIFT 25
+#define V_GIF_ENABLE_MASK (1 << V_GIF_ENABLE_SHIFT)
+
+#define AVIC_ENABLE_SHIFT 31
+#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT)
+
+#define LBR_CTL_ENABLE_MASK BIT_ULL(0)
+#define VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK BIT_ULL(1)
+
+#define SVM_INTERRUPT_SHADOW_MASK 1
+
+#define SVM_IOIO_STR_SHIFT 2
+#define SVM_IOIO_REP_SHIFT 3
+#define SVM_IOIO_SIZE_SHIFT 4
+#define SVM_IOIO_ASIZE_SHIFT 7
+
+#define SVM_IOIO_TYPE_MASK 1
+#define SVM_IOIO_STR_MASK (1 << SVM_IOIO_STR_SHIFT)
+#define SVM_IOIO_REP_MASK (1 << SVM_IOIO_REP_SHIFT)
+#define SVM_IOIO_SIZE_MASK (7 << SVM_IOIO_SIZE_SHIFT)
+#define SVM_IOIO_ASIZE_MASK (7 << SVM_IOIO_ASIZE_SHIFT)
+
+#define SVM_VM_CR_VALID_MASK 0x001fULL
+#define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL
+#define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL
+
+#define SVM_NESTED_CTL_NP_ENABLE BIT(0)
+#define SVM_NESTED_CTL_SEV_ENABLE BIT(1)
+
+struct __attribute__ ((__packed__)) vmcb_seg {
+ u16 selector;
+ u16 attrib;
+ u32 limit;
+ u64 base;
+};
+
+struct __attribute__ ((__packed__)) vmcb_save_area {
+ struct vmcb_seg es;
+ struct vmcb_seg cs;
+ struct vmcb_seg ss;
+ struct vmcb_seg ds;
+ struct vmcb_seg fs;
+ struct vmcb_seg gs;
+ struct vmcb_seg gdtr;
+ struct vmcb_seg ldtr;
+ struct vmcb_seg idtr;
+ struct vmcb_seg tr;
+ u8 reserved_1[43];
+ u8 cpl;
+ u8 reserved_2[4];
+ u64 efer;
+ u8 reserved_3[112];
+ u64 cr4;
+ u64 cr3;
+ u64 cr0;
+ u64 dr7;
+ u64 dr6;
+ u64 rflags;
+ u64 rip;
+ u8 reserved_4[88];
+ u64 rsp;
+ u8 reserved_5[24];
+ u64 rax;
+ u64 star;
+ u64 lstar;
+ u64 cstar;
+ u64 sfmask;
+ u64 kernel_gs_base;
+ u64 sysenter_cs;
+ u64 sysenter_esp;
+ u64 sysenter_eip;
+ u64 cr2;
+ u8 reserved_6[32];
+ u64 g_pat;
+ u64 dbgctl;
+ u64 br_from;
+ u64 br_to;
+ u64 last_excp_from;
+ u64 last_excp_to;
+};
+
+struct __attribute__ ((__packed__)) vmcb {
+ struct vmcb_control_area control;
+ struct vmcb_save_area save;
+};
+
+#define SVM_CPUID_FUNC 0x8000000a
+
+#define SVM_VM_CR_SVM_DISABLE 4
+
+#define SVM_SELECTOR_S_SHIFT 4
+#define SVM_SELECTOR_DPL_SHIFT 5
+#define SVM_SELECTOR_P_SHIFT 7
+#define SVM_SELECTOR_AVL_SHIFT 8
+#define SVM_SELECTOR_L_SHIFT 9
+#define SVM_SELECTOR_DB_SHIFT 10
+#define SVM_SELECTOR_G_SHIFT 11
+
+#define SVM_SELECTOR_TYPE_MASK (0xf)
+#define SVM_SELECTOR_S_MASK (1 << SVM_SELECTOR_S_SHIFT)
+#define SVM_SELECTOR_DPL_MASK (3 << SVM_SELECTOR_DPL_SHIFT)
+#define SVM_SELECTOR_P_MASK (1 << SVM_SELECTOR_P_SHIFT)
+#define SVM_SELECTOR_AVL_MASK (1 << SVM_SELECTOR_AVL_SHIFT)
+#define SVM_SELECTOR_L_MASK (1 << SVM_SELECTOR_L_SHIFT)
+#define SVM_SELECTOR_DB_MASK (1 << SVM_SELECTOR_DB_SHIFT)
+#define SVM_SELECTOR_G_MASK (1 << SVM_SELECTOR_G_SHIFT)
+
+#define SVM_SELECTOR_WRITE_MASK (1 << 1)
+#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
+#define SVM_SELECTOR_CODE_MASK (1 << 3)
+
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
+
+#define SVM_EVTINJ_VEC_MASK 0xff
+
+#define SVM_EVTINJ_TYPE_SHIFT 8
+#define SVM_EVTINJ_TYPE_MASK (7 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_TYPE_INTR (0 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_NMI (2 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_EXEPT (3 << SVM_EVTINJ_TYPE_SHIFT)
+#define SVM_EVTINJ_TYPE_SOFT (4 << SVM_EVTINJ_TYPE_SHIFT)
+
+#define SVM_EVTINJ_VALID (1 << 31)
+#define SVM_EVTINJ_VALID_ERR (1 << 11)
+
+#define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
+
+#define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
+#define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
+#define SVM_EXITINTINFO_TYPE_EXEPT SVM_EVTINJ_TYPE_EXEPT
+#define SVM_EXITINTINFO_TYPE_SOFT SVM_EVTINJ_TYPE_SOFT
+
+#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
+#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+
+#define SVM_EXITINFO_REG_MASK 0x0F
+
+#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
+
+#endif /* SELFTEST_KVM_SVM_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
new file mode 100644
index 000000000..b7531c83b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/svm_util.h
+ * Header for nested SVM testing
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+
+#ifndef SELFTEST_KVM_SVM_UTILS_H
+#define SELFTEST_KVM_SVM_UTILS_H
+
+#include <stdint.h>
+#include "svm.h"
+#include "processor.h"
+
+#define CPUID_SVM_BIT 2
+#define CPUID_SVM BIT_ULL(CPUID_SVM_BIT)
+
+#define SVM_EXIT_VMMCALL 0x081
+
+struct svm_test_data {
+ /* VMCB */
+ struct vmcb *vmcb; /* gva */
+ void *vmcb_hva;
+ uint64_t vmcb_gpa;
+
+ /* host state-save area */
+ struct vmcb_save_area *save_area; /* gva */
+ void *save_area_hva;
+ uint64_t save_area_gpa;
+};
+
+struct svm_test_data *vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva);
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp);
+void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa);
+bool nested_svm_supported(void);
+void nested_svm_check_supported(void);
+
+static inline bool cpu_has_svm(void)
+{
+ u32 leaf = 0x80000001, ecx;
+
+ /* Run CPUID on leaf 0x80000001 and test the CPUID_SVM bit of ECX. */
+ asm("cpuid" : "=a"(leaf), "=c"(ecx) : "0"(leaf) : "ebx", "edx");
+
+ return (ecx & CPUID_SVM) != 0;
+}
+
+#endif /* SELFTEST_KVM_SVM_UTILS_H */
diff --git a/tools/testing/selftests/kvm/include/x86_64/vmx.h b/tools/testing/selftests/kvm/include/x86_64/vmx.h
new file mode 100644
index 000000000..e78d7e26b
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/x86_64/vmx.h
@@ -0,0 +1,625 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/include/x86_64/vmx.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_VMX_H
+#define SELFTEST_KVM_VMX_H
+
+#include <stdint.h>
+#include "processor.h"
+
+/* Bit position of the VMX feature flag; CPUID_VMX is the matching mask. */
+#define CPUID_VMX_BIT 5
+
+#define CPUID_VMX (1 << CPUID_VMX_BIT)
+
+/*
+ * Definitions of Primary Processor-Based VM-Execution Controls.
+ * Every CPU_BASED_* value up to ACTIVATE_SECONDARY_CONTROLS is a single-bit
+ * mask.
+ */
+#define CPU_BASED_INTR_WINDOW_EXITING 0x00000004
+#define CPU_BASED_USE_TSC_OFFSETTING 0x00000008
+#define CPU_BASED_HLT_EXITING 0x00000080
+#define CPU_BASED_INVLPG_EXITING 0x00000200
+#define CPU_BASED_MWAIT_EXITING 0x00000400
+#define CPU_BASED_RDPMC_EXITING 0x00000800
+#define CPU_BASED_RDTSC_EXITING 0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING 0x00008000
+#define CPU_BASED_CR3_STORE_EXITING 0x00010000
+#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
+#define CPU_BASED_CR8_STORE_EXITING 0x00100000
+#define CPU_BASED_TPR_SHADOW 0x00200000
+#define CPU_BASED_NMI_WINDOW_EXITING 0x00400000
+#define CPU_BASED_MOV_DR_EXITING 0x00800000
+#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
+#define CPU_BASED_USE_IO_BITMAPS 0x02000000
+#define CPU_BASED_MONITOR_TRAP 0x08000000
+#define CPU_BASED_USE_MSR_BITMAPS 0x10000000
+#define CPU_BASED_MONITOR_EXITING 0x20000000
+#define CPU_BASED_PAUSE_EXITING 0x40000000
+#define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000
+
+/*
+ * Multi-bit mask. NOTE(review): presumably the control bits that must be set
+ * when the "true" capability MSRs are not consulted - confirm against the
+ * Intel SDM.
+ */
+#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172
+
+/*
+ * Definitions of Secondary Processor-Based VM-Execution Controls.
+ * Every value below is a single-bit mask.
+ */
+#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
+#define SECONDARY_EXEC_DESC 0x00000004
+#define SECONDARY_EXEC_ENABLE_RDTSCP 0x00000008
+#define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
+#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
+#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
+#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
+#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
+#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
+#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
+#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
+#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
+#define SECONDARY_EXEC_ENABLE_PML 0x00020000
+#define SECONDARY_EPT_VE 0x00040000
+#define SECONDARY_ENABLE_XSAV_RESTORE 0x00100000
+#define SECONDARY_EXEC_TSC_SCALING 0x02000000
+
+/*
+ * Pin-based VM-execution control bits (single-bit masks), followed by a
+ * multi-bit "always on" mask.
+ */
+#define PIN_BASED_EXT_INTR_MASK 0x00000001
+#define PIN_BASED_NMI_EXITING 0x00000008
+#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define PIN_BASED_VMX_PREEMPTION_TIMER 0x00000040
+#define PIN_BASED_POSTED_INTR 0x00000080
+
+#define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016
+
+/*
+ * VM-exit control bits (single-bit masks), followed by a multi-bit
+ * "always on" mask.
+ */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004
+#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
+#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+
+#define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff
+
+/*
+ * VM-entry control bits (single-bit masks), followed by a multi-bit
+ * "always on" mask.
+ */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004
+#define VM_ENTRY_IA32E_MODE 0x00000200
+#define VM_ENTRY_SMM 0x00000400
+#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
+
+#define VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR 0x000011ff
+
+/*
+ * A 5-bit field mask (bits 4:0) and a single flag (bit 5).
+ * NOTE(review): presumably fields of the IA32_VMX_MISC MSR - confirm.
+ */
+#define VMX_MISC_PREEMPTION_TIMER_RATE_MASK 0x0000001f
+#define VMX_MISC_SAVE_EFER_LMA 0x00000020
+
+/*
+ * VM-exit reason numbers. EXIT_REASON_FAILED_VMENTRY is a flag in bit 31
+ * rather than a reason number; LAST_EXIT_REASON equals the highest number
+ * defined here (EXIT_REASON_XRSTORS).
+ */
+#define EXIT_REASON_FAILED_VMENTRY 0x80000000
+#define EXIT_REASON_EXCEPTION_NMI 0
+#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INTERRUPT_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMOFF 26
+#define EXIT_REASON_VMON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_IO_INSTRUCTION 30
+#define EXIT_REASON_MSR_READ 31
+#define EXIT_REASON_MSR_WRITE 32
+#define EXIT_REASON_INVALID_STATE 33
+#define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MONITOR_INSTRUCTION 39
+#define EXIT_REASON_PAUSE_INSTRUCTION 40
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
+#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_EOI_INDUCED 45
+#define EXIT_REASON_EPT_VIOLATION 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_PREEMPTION_TIMER 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_PML_FULL 62
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
+#define LAST_EXIT_REASON 64
+
+/*
+ * VMCS field encodings, i.e. the values passed as the "encoding" argument of
+ * vmread()/vmwrite() (vmlaunch() and vmresume() use HOST_RSP and HOST_RIP
+ * this way). Every *_HIGH enumerator is the encoding of the field listed
+ * just before it, plus one.
+ */
+enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
+ POSTED_INTR_NV = 0x00000002,
+ GUEST_ES_SELECTOR = 0x00000800,
+ GUEST_CS_SELECTOR = 0x00000802,
+ GUEST_SS_SELECTOR = 0x00000804,
+ GUEST_DS_SELECTOR = 0x00000806,
+ GUEST_FS_SELECTOR = 0x00000808,
+ GUEST_GS_SELECTOR = 0x0000080a,
+ GUEST_LDTR_SELECTOR = 0x0000080c,
+ GUEST_TR_SELECTOR = 0x0000080e,
+ GUEST_INTR_STATUS = 0x00000810,
+ GUEST_PML_INDEX = 0x00000812,
+ HOST_ES_SELECTOR = 0x00000c00,
+ HOST_CS_SELECTOR = 0x00000c02,
+ HOST_SS_SELECTOR = 0x00000c04,
+ HOST_DS_SELECTOR = 0x00000c06,
+ HOST_FS_SELECTOR = 0x00000c08,
+ HOST_GS_SELECTOR = 0x00000c0a,
+ HOST_TR_SELECTOR = 0x00000c0c,
+ IO_BITMAP_A = 0x00002000,
+ IO_BITMAP_A_HIGH = 0x00002001,
+ IO_BITMAP_B = 0x00002002,
+ IO_BITMAP_B_HIGH = 0x00002003,
+ MSR_BITMAP = 0x00002004,
+ MSR_BITMAP_HIGH = 0x00002005,
+ VM_EXIT_MSR_STORE_ADDR = 0x00002006,
+ VM_EXIT_MSR_STORE_ADDR_HIGH = 0x00002007,
+ VM_EXIT_MSR_LOAD_ADDR = 0x00002008,
+ VM_EXIT_MSR_LOAD_ADDR_HIGH = 0x00002009,
+ VM_ENTRY_MSR_LOAD_ADDR = 0x0000200a,
+ VM_ENTRY_MSR_LOAD_ADDR_HIGH = 0x0000200b,
+ PML_ADDRESS = 0x0000200e,
+ PML_ADDRESS_HIGH = 0x0000200f,
+ TSC_OFFSET = 0x00002010,
+ TSC_OFFSET_HIGH = 0x00002011,
+ VIRTUAL_APIC_PAGE_ADDR = 0x00002012,
+ VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013,
+ APIC_ACCESS_ADDR = 0x00002014,
+ APIC_ACCESS_ADDR_HIGH = 0x00002015,
+ POSTED_INTR_DESC_ADDR = 0x00002016,
+ POSTED_INTR_DESC_ADDR_HIGH = 0x00002017,
+ EPT_POINTER = 0x0000201a,
+ EPT_POINTER_HIGH = 0x0000201b,
+ EOI_EXIT_BITMAP0 = 0x0000201c,
+ EOI_EXIT_BITMAP0_HIGH = 0x0000201d,
+ EOI_EXIT_BITMAP1 = 0x0000201e,
+ EOI_EXIT_BITMAP1_HIGH = 0x0000201f,
+ EOI_EXIT_BITMAP2 = 0x00002020,
+ EOI_EXIT_BITMAP2_HIGH = 0x00002021,
+ EOI_EXIT_BITMAP3 = 0x00002022,
+ EOI_EXIT_BITMAP3_HIGH = 0x00002023,
+ VMREAD_BITMAP = 0x00002026,
+ VMREAD_BITMAP_HIGH = 0x00002027,
+ VMWRITE_BITMAP = 0x00002028,
+ VMWRITE_BITMAP_HIGH = 0x00002029,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
+ TSC_MULTIPLIER = 0x00002032,
+ TSC_MULTIPLIER_HIGH = 0x00002033,
+ GUEST_PHYSICAL_ADDRESS = 0x00002400,
+ GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
+ VMCS_LINK_POINTER = 0x00002800,
+ VMCS_LINK_POINTER_HIGH = 0x00002801,
+ GUEST_IA32_DEBUGCTL = 0x00002802,
+ GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
+ GUEST_IA32_EFER = 0x00002806,
+ GUEST_IA32_EFER_HIGH = 0x00002807,
+ GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
+ GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809,
+ GUEST_PDPTR0 = 0x0000280a,
+ GUEST_PDPTR0_HIGH = 0x0000280b,
+ GUEST_PDPTR1 = 0x0000280c,
+ GUEST_PDPTR1_HIGH = 0x0000280d,
+ GUEST_PDPTR2 = 0x0000280e,
+ GUEST_PDPTR2_HIGH = 0x0000280f,
+ GUEST_PDPTR3 = 0x00002810,
+ GUEST_PDPTR3_HIGH = 0x00002811,
+ GUEST_BNDCFGS = 0x00002812,
+ GUEST_BNDCFGS_HIGH = 0x00002813,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
+ HOST_IA32_EFER = 0x00002c02,
+ HOST_IA32_EFER_HIGH = 0x00002c03,
+ HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
+ HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05,
+ PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
+ CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
+ EXCEPTION_BITMAP = 0x00004004,
+ PAGE_FAULT_ERROR_CODE_MASK = 0x00004006,
+ PAGE_FAULT_ERROR_CODE_MATCH = 0x00004008,
+ CR3_TARGET_COUNT = 0x0000400a,
+ VM_EXIT_CONTROLS = 0x0000400c,
+ VM_EXIT_MSR_STORE_COUNT = 0x0000400e,
+ VM_EXIT_MSR_LOAD_COUNT = 0x00004010,
+ VM_ENTRY_CONTROLS = 0x00004012,
+ VM_ENTRY_MSR_LOAD_COUNT = 0x00004014,
+ VM_ENTRY_INTR_INFO_FIELD = 0x00004016,
+ VM_ENTRY_EXCEPTION_ERROR_CODE = 0x00004018,
+ VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
+ TPR_THRESHOLD = 0x0000401c,
+ SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
+ PLE_GAP = 0x00004020,
+ PLE_WINDOW = 0x00004022,
+ VM_INSTRUCTION_ERROR = 0x00004400,
+ VM_EXIT_REASON = 0x00004402,
+ VM_EXIT_INTR_INFO = 0x00004404,
+ VM_EXIT_INTR_ERROR_CODE = 0x00004406,
+ IDT_VECTORING_INFO_FIELD = 0x00004408,
+ IDT_VECTORING_ERROR_CODE = 0x0000440a,
+ VM_EXIT_INSTRUCTION_LEN = 0x0000440c,
+ VMX_INSTRUCTION_INFO = 0x0000440e,
+ GUEST_ES_LIMIT = 0x00004800,
+ GUEST_CS_LIMIT = 0x00004802,
+ GUEST_SS_LIMIT = 0x00004804,
+ GUEST_DS_LIMIT = 0x00004806,
+ GUEST_FS_LIMIT = 0x00004808,
+ GUEST_GS_LIMIT = 0x0000480a,
+ GUEST_LDTR_LIMIT = 0x0000480c,
+ GUEST_TR_LIMIT = 0x0000480e,
+ GUEST_GDTR_LIMIT = 0x00004810,
+ GUEST_IDTR_LIMIT = 0x00004812,
+ GUEST_ES_AR_BYTES = 0x00004814,
+ GUEST_CS_AR_BYTES = 0x00004816,
+ GUEST_SS_AR_BYTES = 0x00004818,
+ GUEST_DS_AR_BYTES = 0x0000481a,
+ GUEST_FS_AR_BYTES = 0x0000481c,
+ GUEST_GS_AR_BYTES = 0x0000481e,
+ GUEST_LDTR_AR_BYTES = 0x00004820,
+ GUEST_TR_AR_BYTES = 0x00004822,
+ GUEST_INTERRUPTIBILITY_INFO = 0x00004824,
+ GUEST_ACTIVITY_STATE = 0X00004826,
+ GUEST_SYSENTER_CS = 0x0000482A,
+ VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
+ HOST_IA32_SYSENTER_CS = 0x00004c00,
+ CR0_GUEST_HOST_MASK = 0x00006000,
+ CR4_GUEST_HOST_MASK = 0x00006002,
+ CR0_READ_SHADOW = 0x00006004,
+ CR4_READ_SHADOW = 0x00006006,
+ CR3_TARGET_VALUE0 = 0x00006008,
+ CR3_TARGET_VALUE1 = 0x0000600a,
+ CR3_TARGET_VALUE2 = 0x0000600c,
+ CR3_TARGET_VALUE3 = 0x0000600e,
+ EXIT_QUALIFICATION = 0x00006400,
+ GUEST_LINEAR_ADDRESS = 0x0000640a,
+ GUEST_CR0 = 0x00006800,
+ GUEST_CR3 = 0x00006802,
+ GUEST_CR4 = 0x00006804,
+ GUEST_ES_BASE = 0x00006806,
+ GUEST_CS_BASE = 0x00006808,
+ GUEST_SS_BASE = 0x0000680a,
+ GUEST_DS_BASE = 0x0000680c,
+ GUEST_FS_BASE = 0x0000680e,
+ GUEST_GS_BASE = 0x00006810,
+ GUEST_LDTR_BASE = 0x00006812,
+ GUEST_TR_BASE = 0x00006814,
+ GUEST_GDTR_BASE = 0x00006816,
+ GUEST_IDTR_BASE = 0x00006818,
+ GUEST_DR7 = 0x0000681a,
+ GUEST_RSP = 0x0000681c,
+ GUEST_RIP = 0x0000681e,
+ GUEST_RFLAGS = 0x00006820,
+ GUEST_PENDING_DBG_EXCEPTIONS = 0x00006822,
+ GUEST_SYSENTER_ESP = 0x00006824,
+ GUEST_SYSENTER_EIP = 0x00006826,
+ HOST_CR0 = 0x00006c00,
+ HOST_CR3 = 0x00006c02,
+ HOST_CR4 = 0x00006c04,
+ HOST_FS_BASE = 0x00006c06,
+ HOST_GS_BASE = 0x00006c08,
+ HOST_TR_BASE = 0x00006c0a,
+ HOST_GDTR_BASE = 0x00006c0c,
+ HOST_IDTR_BASE = 0x00006c0e,
+ HOST_IA32_SYSENTER_ESP = 0x00006c10,
+ HOST_IA32_SYSENTER_EIP = 0x00006c12,
+ HOST_RSP = 0x00006c14,
+ HOST_RIP = 0x00006c16,
+};
+
+/*
+ * One 16-byte, 16-byte-aligned record holding an MSR index and a 64-bit
+ * value. NOTE(review): presumably the entry format of the VM-entry/VM-exit
+ * MSR load/store areas - confirm against the Intel SDM.
+ */
+struct vmx_msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t value;
+} __attribute__ ((aligned(16)));
+
+#include "evmcs.h"
+
+/*
+ * Execute VMXON with the 64-bit value @phys as its memory operand.
+ *
+ * setna stores 1 when the instruction leaves CF or ZF set, which is how VMX
+ * instructions signal failure, so the function returns 0 on success and 1
+ * on failure.
+ */
+static inline int vmxon(uint64_t phys)
+{
+ uint8_t ret;
+
+ __asm__ __volatile__ ("vmxon %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(phys)
+ : "cc", "memory");
+
+ return ret;
+}
+
+/* Execute VMXOFF. No status is captured, so a failure is not reported. */
+static inline void vmxoff(void)
+{
+ __asm__ __volatile__("vmxoff");
+}
+
+/*
+ * Execute VMCLEAR with the 64-bit value @vmcs_pa as its memory operand.
+ * Returns 0 on success and 1 when the instruction leaves CF or ZF set
+ * (captured with setna).
+ */
+static inline int vmclear(uint64_t vmcs_pa)
+{
+ uint8_t ret;
+
+ __asm__ __volatile__ ("vmclear %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(vmcs_pa)
+ : "cc", "memory");
+
+ return ret;
+}
+
+/*
+ * Execute VMPTRLD with the 64-bit value @vmcs_pa as its memory operand.
+ *
+ * When enable_evmcs is set the instruction is not executed at all and -1 is
+ * returned. Otherwise returns 0 on success and 1 when the instruction leaves
+ * CF or ZF set (captured with setna).
+ */
+static inline int vmptrld(uint64_t vmcs_pa)
+{
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return -1;
+
+ __asm__ __volatile__ ("vmptrld %[pa]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [pa]"m"(vmcs_pa)
+ : "cc", "memory");
+
+ return ret;
+}
+
+/*
+ * Store the current-VMCS pointer into *@value.
+ *
+ * When enable_evmcs is set the request is forwarded to evmcs_vmptrst().
+ * Otherwise VMPTRST writes into a local, which is then copied to *@value;
+ * the return value is 0 on success and 1 when the instruction leaves CF or
+ * ZF set (captured with setna).
+ *
+ * NOTE(review): *@value is overwritten with the local even when the status
+ * is non-zero.
+ */
+static inline int vmptrst(uint64_t *value)
+{
+ uint64_t tmp;
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return evmcs_vmptrst(value);
+
+ __asm__ __volatile__("vmptrst %[value]; setna %[ret]"
+ : [value]"=m"(tmp), [ret]"=rm"(ret)
+ : : "cc", "memory");
+
+ *value = tmp;
+ return ret;
+}
+
+/*
+ * Convenience form of vmptrst(): the status code is discarded and the
+ * caller simply receives the result variable, which starts out as zero.
+ */
+static inline uint64_t vmptrstz(void)
+{
+	uint64_t vmcs_ptr = 0;
+
+	(void)vmptrst(&vmcs_ptr);
+
+	return vmcs_ptr;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmlaunch.
+ *
+ * With enable_evmcs set the call is forwarded to evmcs_vmlaunch(). Otherwise
+ * the asm saves rbp/rcx/rdx/rsi/rdi on the stack, pushes a zero status word,
+ * writes the resulting stack pointer and the address of label 1 into the
+ * HOST_RSP and HOST_RIP VMCS fields, and executes VMLAUNCH.
+ *
+ * If VMLAUNCH falls through to the next instruction, the incq turns the
+ * status word into 1. If control instead arrives at label 1 through
+ * HOST_RIP, the word is still 0. The word is popped into rax and returned,
+ * so the result is 0 after a VM-exit and 1 when the launch itself failed.
+ * The remaining GPRs are declared clobbered rather than saved.
+ */
+static inline int vmlaunch(void)
+{
+ int ret;
+
+ if (enable_evmcs)
+ return evmcs_vmlaunch();
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "vmwrite %%rsp, %[host_rsp];"
+ "lea 1f(%%rip), %%rax;"
+ "vmwrite %%rax, %[host_rip];"
+ "vmlaunch;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"((uint64_t)HOST_RSP),
+ [host_rip]"r"((uint64_t)HOST_RIP)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+/*
+ * No guest state (e.g. GPRs) is established by this vmresume.
+ *
+ * With enable_evmcs set the call is forwarded to evmcs_vmresume(). Otherwise
+ * the asm saves rbp/rcx/rdx/rsi/rdi on the stack, pushes a zero status word,
+ * writes the resulting stack pointer and the address of label 1 into the
+ * HOST_RSP and HOST_RIP VMCS fields, and executes VMRESUME.
+ *
+ * If VMRESUME falls through to the next instruction, the incq turns the
+ * status word into 1. If control instead arrives at label 1 through
+ * HOST_RIP, the word is still 0. The word is popped into rax and returned,
+ * so the result is 0 after a VM-exit and 1 when the resume itself failed.
+ * The remaining GPRs are declared clobbered rather than saved.
+ */
+static inline int vmresume(void)
+{
+ int ret;
+
+ if (enable_evmcs)
+ return evmcs_vmresume();
+
+ __asm__ __volatile__("push %%rbp;"
+ "push %%rcx;"
+ "push %%rdx;"
+ "push %%rsi;"
+ "push %%rdi;"
+ "push $0;"
+ "vmwrite %%rsp, %[host_rsp];"
+ "lea 1f(%%rip), %%rax;"
+ "vmwrite %%rax, %[host_rip];"
+ "vmresume;"
+ "incq (%%rsp);"
+ "1: pop %%rax;"
+ "pop %%rdi;"
+ "pop %%rsi;"
+ "pop %%rdx;"
+ "pop %%rcx;"
+ "pop %%rbp;"
+ : [ret]"=&a"(ret)
+ : [host_rsp]"r"((uint64_t)HOST_RSP),
+ [host_rip]"r"((uint64_t)HOST_RIP)
+ : "memory", "cc", "rbx", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15");
+ return ret;
+}
+
+/*
+ * Execute VMCALL. rbp is saved and restored around the instruction by hand;
+ * every other general-purpose register except rsp is listed as clobbered so
+ * the compiler does not keep live values in them across the call.
+ */
+static inline void vmcall(void)
+{
+ /* Currently, L1 destroys our GPRs during vmexits. */
+ __asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : :
+ "rax", "rbx", "rcx", "rdx",
+ "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
+ "r13", "r14", "r15");
+}
+
+/*
+ * Read the VMCS field selected by @encoding into *@value.
+ *
+ * With enable_evmcs set the request is forwarded to evmcs_vmread().
+ * Otherwise VMREAD is executed and its status captured with setna, which
+ * yields 1 when the instruction leaves CF or ZF set (the VMX failure
+ * indication) and 0 otherwise; that status is the return value.
+ *
+ * *@value is written only when VMREAD succeeds. A failing VMREAD produces no
+ * result, so copying the scratch variable out unconditionally would hand the
+ * caller an indeterminate value. Leaving *@value untouched lets wrappers
+ * such as vmreadz() keep the default they initialised it with.
+ */
+static inline int vmread(uint64_t encoding, uint64_t *value)
+{
+	uint64_t tmp;
+	uint8_t ret;
+
+	if (enable_evmcs)
+		return evmcs_vmread(encoding, value);
+
+	__asm__ __volatile__("vmread %[encoding], %[value]; setna %[ret]"
+		: [value]"=rm"(tmp), [ret]"=rm"(ret)
+		: [encoding]"r"(encoding)
+		: "cc", "memory");
+
+	/* Only publish the result if the instruction actually produced one. */
+	if (!ret)
+		*value = tmp;
+
+	return ret;
+}
+
+/*
+ * Convenience form of vmread(): the status code is discarded and the caller
+ * simply receives the result variable, which starts out as zero.
+ */
+static inline uint64_t vmreadz(uint64_t encoding)
+{
+	uint64_t field = 0;
+
+	(void)vmread(encoding, &field);
+
+	return field;
+}
+
+/*
+ * Write @value to the VMCS field selected by @encoding.
+ *
+ * With enable_evmcs set the request is forwarded to evmcs_vmwrite().
+ * Otherwise VMWRITE is executed; the return value is 0 on success and 1 when
+ * the instruction leaves CF or ZF set (captured with setna).
+ */
+static inline int vmwrite(uint64_t encoding, uint64_t value)
+{
+ uint8_t ret;
+
+ if (enable_evmcs)
+ return evmcs_vmwrite(encoding, value);
+
+ __asm__ __volatile__ ("vmwrite %[value], %[encoding]; setna %[ret]"
+ : [ret]"=rm"(ret)
+ : [value]"rm"(value), [encoding]"r"(encoding)
+ : "cc", "memory");
+
+ return ret;
+}
+
+/*
+ * Read MSR_IA32_VMX_BASIC and return it converted to uint32_t, i.e. only
+ * what fits in the 32-bit return type.
+ */
+static inline uint32_t vmcs_revision(void)
+{
+	uint32_t revision = rdmsr(MSR_IA32_VMX_BASIC);
+
+	return revision;
+}
+
+/*
+ * Addresses of the memory regions used by the nested VMX helpers. Every
+ * region is recorded as a triple: <name>_hva, <name>_gpa and <name>.
+ * NOTE(review): presumably host virtual address, guest physical address and
+ * guest virtual address respectively, as in struct svm_test_data - confirm
+ * against vcpu_alloc_vmx().
+ */
+struct vmx_pages {
+ void *vmxon_hva;
+ uint64_t vmxon_gpa;
+ void *vmxon;
+
+ void *vmcs_hva;
+ uint64_t vmcs_gpa;
+ void *vmcs;
+
+ void *msr_hva;
+ uint64_t msr_gpa;
+ void *msr;
+
+ void *shadow_vmcs_hva;
+ uint64_t shadow_vmcs_gpa;
+ void *shadow_vmcs;
+
+ void *vmread_hva;
+ uint64_t vmread_gpa;
+ void *vmread;
+
+ void *vmwrite_hva;
+ uint64_t vmwrite_gpa;
+ void *vmwrite;
+
+ void *vp_assist_hva;
+ uint64_t vp_assist_gpa;
+ void *vp_assist;
+
+ void *enlightened_vmcs_hva;
+ uint64_t enlightened_vmcs_gpa;
+ void *enlightened_vmcs;
+
+ void *eptp_hva;
+ uint64_t eptp_gpa;
+ void *eptp;
+
+ void *apic_access_hva;
+ uint64_t apic_access_gpa;
+ void *apic_access;
+};
+
+/*
+ * Overlay of a 64-bit value with named fields: a 32-bit revision followed by
+ * a second 32-bit word split into bit-fields.
+ * NOTE(review): presumably the layout of the IA32_VMX_BASIC MSR (the MSR
+ * read by vmcs_revision()) - confirm the field meanings against the SDM.
+ */
+union vmx_basic {
+ u64 val;
+ struct {
+ u32 revision;
+ u32 size:13,
+ reserved1:3,
+ width:1,
+ dual:1,
+ type:4,
+ insouts:1,
+ ctrl:1,
+ vm_entry_exception_ctrl:1,
+ reserved2:7;
+ };
+};
+
+/*
+ * Overlay of a 64-bit value with two 32-bit members, set (declared first)
+ * and clr (declared second).
+ * NOTE(review): presumably the allowed-0/allowed-1 halves of a VMX control
+ * capability MSR - confirm.
+ */
+union vmx_ctrl_msr {
+ u64 val;
+ struct {
+ u32 set, clr;
+ };
+};
+
+struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
+bool prepare_for_vmx_operation(struct vmx_pages *vmx);
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
+bool load_vmcs(struct vmx_pages *vmx);
+
+bool nested_vmx_supported(void);
+void nested_vmx_check_supported(void);
+
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot);
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+ uint32_t eptp_memslot);
+void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t memslot, uint32_t eptp_memslot);
+void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot);
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot);
+
+#endif /* SELFTEST_KVM_VMX_H */
diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
new file mode 100644
index 000000000..aa3795cd7
--- /dev/null
+++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * kvm_create_max_vcpus
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "asm/kvm.h"
+#include "linux/kvm.h"
+
+/*
+ * Create a VM, add @num_vcpus vCPUs with consecutive IDs starting at
+ * @first_vcpu_id, then free the VM again.
+ */
+void test_vcpu_creation(int first_vcpu_id, int num_vcpus)
+{
+	int last_vcpu_id = first_vcpu_id + num_vcpus - 1;
+	struct kvm_vm *vm;
+	int vcpu_id;
+
+	pr_info("Testing creating %d vCPUs, with IDs %d...%d.\n",
+		num_vcpus, first_vcpu_id, last_vcpu_id);
+
+	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);
+
+	/* vm_vcpu_add() asserts that the vCPU was created. */
+	for (vcpu_id = first_vcpu_id; vcpu_id <= last_vcpu_id; vcpu_id++)
+		vm_vcpu_add(vm, vcpu_id);
+
+	kvm_vm_free(vm);
+}
+
+/*
+ * Query KVM_CAP_MAX_VCPU_ID and KVM_CAP_MAX_VCPUS, make sure enough file
+ * descriptors may be opened (skipping the test when the hard limit cannot
+ * be raised), then create the maximum number of vCPUs using the lowest and,
+ * if different, the highest range of IDs.
+ */
+int main(int argc, char *argv[])
+{
+	int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
+	int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
+	/*
+	 * Number of file descriptors required: KVM_CAP_MAX_VCPUS for vCPU fds
+	 * plus an arbitrary number for everything else.
+	 */
+	int nr_fds_wanted = kvm_max_vcpus + 100;
+	struct rlimit rl;
+
+	pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
+	pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
+
+	/*
+	 * Make sure nr_fds_wanted descriptors can be opened, raising the
+	 * limits when they are too low.
+	 */
+	TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
+
+	if (rl.rlim_cur < nr_fds_wanted) {
+		rl.rlim_cur = nr_fds_wanted;
+
+		if (rl.rlim_max >= nr_fds_wanted) {
+			/* Hard limit is sufficient; only the soft limit moves. */
+			TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
+		} else {
+			int prev_hard_limit = rl.rlim_max;
+
+			/* Raising the hard limit may be refused; skip if so. */
+			rl.rlim_max = nr_fds_wanted;
+			if (setrlimit(RLIMIT_NOFILE, &rl) < 0) {
+				printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
+				       prev_hard_limit, nr_fds_wanted);
+				exit(KSFT_SKIP);
+			}
+		}
+	}
+
+	/*
+	 * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
+	 * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID
+	 * in this case.
+	 */
+	if (!kvm_max_vcpu_id)
+		kvm_max_vcpu_id = kvm_max_vcpus;
+
+	TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus,
+		    "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).",
+		    kvm_max_vcpu_id, kvm_max_vcpus);
+
+	/* Lowest ID range first ... */
+	test_vcpu_creation(0, kvm_max_vcpus);
+
+	/* ... then the range that ends just below the maximum ID. */
+	if (kvm_max_vcpu_id > kvm_max_vcpus)
+		test_vcpu_creation(kvm_max_vcpu_id - kvm_max_vcpus,
+				   kvm_max_vcpus);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c
new file mode 100644
index 000000000..d6c32c328
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * AArch64 code
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <linux/compiler.h>
+
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+/* Minimum guest physical address passed to the page-table page allocations. */
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+/* Minimum guest virtual address passed to the vCPU stack allocation. */
+#define DEFAULT_ARM64_GUEST_STACK_VADDR_MIN 0xac0000
+
+/*
+ * Round @v up to the next multiple of vm->page_size.
+ *
+ * A value that is already page aligned is returned unchanged: the bias is
+ * page_size - 1, not page_size, so aligned input is not pushed a whole page
+ * further. The mask is built in 64 bits so that inverting it cannot drop the
+ * upper half of @v. vm->page_size must be a power of two for the mask
+ * arithmetic to be valid.
+ */
+static uint64_t page_align(struct kvm_vm *vm, uint64_t v)
+{
+	uint64_t page_mask = (uint64_t)vm->page_size - 1;
+
+	return (v + page_mask) & ~page_mask;
+}
+
+/*
+ * Index of @gva within the top-level table: the address bits from the top
+ * level's shift up to vm->va_bits.
+ */
+static uint64_t pgd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+	unsigned int pgd_shift = (vm->pgtable_levels - 1) * bits_per_level + vm->page_shift;
+	uint64_t nr_entries = 1UL << (vm->va_bits - pgd_shift);
+
+	return (gva >> pgd_shift) & (nr_entries - 1);
+}
+
+/*
+ * Index of @gva within the table two levels above the last one. Asserts
+ * that the VM uses exactly four page table levels.
+ */
+static uint64_t pud_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+	unsigned int pud_shift = 2 * bits_per_level + vm->page_shift;
+	uint64_t idx_mask = (1UL << bits_per_level) - 1;
+
+	TEST_ASSERT(vm->pgtable_levels == 4,
+		"Mode %d does not have 4 page table levels", vm->mode);
+
+	return (gva >> pud_shift) & idx_mask;
+}
+
+/*
+ * Index of @gva within the table one level above the last one. Asserts
+ * that the VM uses at least three page table levels.
+ */
+static uint64_t pmd_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+	unsigned int pmd_shift = bits_per_level + vm->page_shift;
+	uint64_t idx_mask = (1UL << bits_per_level) - 1;
+
+	TEST_ASSERT(vm->pgtable_levels >= 3,
+		"Mode %d does not have >= 3 page table levels", vm->mode);
+
+	return (gva >> pmd_shift) & idx_mask;
+}
+
+/* Index of @gva within the last-level table. */
+static uint64_t pte_index(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+	uint64_t idx_mask = (1UL << bits_per_level) - 1;
+
+	return (gva >> vm->page_shift) & idx_mask;
+}
+
+/*
+ * Extract the address bits of a table entry: bits page_shift up to (but not
+ * including) va_bits are kept, everything else is cleared.
+ * NOTE(review): the field width comes from vm->va_bits rather than
+ * vm->pa_bits - confirm that this is intended.
+ */
+static uint64_t pte_addr(struct kvm_vm *vm, uint64_t entry)
+{
+	uint64_t field = (1UL << (vm->va_bits - vm->page_shift)) - 1;
+	uint64_t addr_mask = field << vm->page_shift;
+
+	return entry & addr_mask;
+}
+
+/* Number of entries in the top-level table. */
+static uint64_t ptrs_per_pgd(struct kvm_vm *vm)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+	unsigned int pgd_shift = (vm->pgtable_levels - 1) * bits_per_level + vm->page_shift;
+
+	return 1 << (vm->va_bits - pgd_shift);
+}
+
+/* Number of 8-byte entries that fit in one page. */
+static uint64_t __maybe_unused ptrs_per_pte(struct kvm_vm *vm)
+{
+	unsigned int bits_per_level = vm->page_shift - 3;
+
+	return 1 << bits_per_level;
+}
+
+/*
+ * Allocate the top-level page table for @vm from @pgd_memslot and record it
+ * in vm->pgd. Does nothing if the table has already been created.
+ */
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+	uint64_t pgd_bytes, pgd_pages;
+
+	if (vm->pgd_created)
+		return;
+
+	/* Eight bytes per entry, rounded to whole pages by page_align(). */
+	pgd_bytes = ptrs_per_pgd(vm) * 8;
+	pgd_pages = page_align(vm, pgd_bytes) / vm->page_size;
+
+	vm->pgd = vm_phy_pages_alloc(vm, pgd_pages,
+				     KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+	vm->pgd_created = true;
+}
+
+/*
+ * Map guest virtual page @vaddr to guest physical page @paddr.
+ *
+ * @pgd_memslot is the memslot from which missing intermediate tables are
+ * allocated. The low three bits of @flags are written into bits 4:2 of the
+ * final entry.
+ *
+ * Both addresses must be page aligned, @vaddr must be a valid guest virtual
+ * page and @paddr must not exceed vm->max_gfn; each condition is asserted.
+ *
+ * The walk starts at the top-level table and descends through as many
+ * levels as vm->pgtable_levels dictates, allocating any table that does not
+ * exist yet. Table entries get the low two bits set (value 3); the final
+ * entry additionally gets the attribute index and bit 10.
+ */
+void _virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+		  uint32_t pgd_memslot, uint64_t flags)
+{
+	uint8_t attr_idx = flags & 7;
+	uint64_t *ptep;
+
+	TEST_ASSERT((vaddr % vm->page_size) == 0,
+		"Virtual address not on page boundary,\n"
+		" vaddr: 0x%lx vm->page_size: 0x%x", vaddr, vm->page_size);
+	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+		(vaddr >> vm->page_shift)),
+		"Invalid virtual address, vaddr: 0x%lx", vaddr);
+	TEST_ASSERT((paddr % vm->page_size) == 0,
+		"Physical address not on page boundary,\n"
+		" paddr: 0x%lx vm->page_size: 0x%x", paddr, vm->page_size);
+	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+		"Physical address beyond maximum supported,\n"
+		" paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		paddr, vm->max_gfn, vm->page_size);
+
+	/* Top level: always present in every supported configuration. */
+	ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, vaddr) * 8;
+	if (!*ptep) {
+		*ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+		*ptep |= 3;
+	}
+
+	/* Intermediate levels: each case falls through to the next one down. */
+	switch (vm->pgtable_levels) {
+	case 4:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, vaddr) * 8;
+		if (!*ptep) {
+			*ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+			*ptep |= 3;
+		}
+		/* fall through */
+	case 3:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, vaddr) * 8;
+		if (!*ptep) {
+			*ptep = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
+			*ptep |= 3;
+		}
+		/* fall through */
+	case 2:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, vaddr) * 8;
+		break;
+	default:
+		TEST_FAIL("Page table levels must be 2, 3, or 4");
+	}
+
+	/* Final entry: target page plus attribute index and access flag. */
+	*ptep = paddr | 3;
+	*ptep |= (attr_idx << 2) | (1 << 10) /* Access Flag */;
+}
+
+/*
+ * Map @vaddr to @paddr using memory attribute index 4, which is the NORMAL
+ * entry (see DEFAULT_MAIR_EL1).
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+		 uint32_t pgd_memslot)
+{
+	const uint64_t normal_mem_attr = 4; /* NORMAL (See DEFAULT_MAIR_EL1) */
+
+	_virt_pg_map(vm, vaddr, paddr, pgd_memslot, normal_mem_attr);
+}
+
+/*
+ * Translate guest virtual address @gva to a guest physical address by
+ * walking the guest page tables in software.
+ *
+ * At every level the *entry* is checked: an entry of zero means nothing is
+ * mapped there and the lookup fails. (Testing the entry pointer instead
+ * would never fail, because the pointer is a host address derived from
+ * addr_gpa2hva() plus an offset.)
+ *
+ * Returns the physical address of the page plus the offset of @gva within
+ * the page. Fails the test if the top-level table does not exist or any
+ * level has no entry for @gva.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	uint64_t *ptep;
+
+	if (!vm->pgd_created)
+		goto unmapped_gva;
+
+	ptep = addr_gpa2hva(vm, vm->pgd) + pgd_index(vm, gva) * 8;
+	if (!*ptep)
+		goto unmapped_gva;
+
+	/* Each case falls through to the next level down. */
+	switch (vm->pgtable_levels) {
+	case 4:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pud_index(vm, gva) * 8;
+		if (!*ptep)
+			goto unmapped_gva;
+		/* fall through */
+	case 3:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pmd_index(vm, gva) * 8;
+		if (!*ptep)
+			goto unmapped_gva;
+		/* fall through */
+	case 2:
+		ptep = addr_gpa2hva(vm, pte_addr(vm, *ptep)) + pte_index(vm, gva) * 8;
+		if (!*ptep)
+			goto unmapped_gva;
+		break;
+	default:
+		TEST_FAIL("Page table levels must be 2, 3, or 4");
+	}
+
+	return pte_addr(vm, *ptep) + (gva & (vm->page_size - 1));
+
+unmapped_gva:
+	TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
+	exit(1);
+}
+
+/*
+ * Recursively print the non-zero entries of the table at guest physical
+ * address @page, labelling them according to @level, and descend into each
+ * one until level 4 is reached. Compiled to an empty function unless DEBUG
+ * is defined.
+ */
+static void pte_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent, uint64_t page, int level)
+{
+#ifdef DEBUG
+	static const char * const level_name[] = { "", "pud", "pmd", "pte" };
+	uint64_t entry_gpa, table_end;
+
+	if (level == 4)
+		return;
+
+	table_end = page + ptrs_per_pte(vm) * 8;
+
+	for (entry_gpa = page; entry_gpa < table_end; entry_gpa += 8) {
+		uint64_t *entry = addr_gpa2hva(vm, entry_gpa);
+
+		if (*entry) {
+			fprintf(stream, "%*s%s: %lx: %lx at %p\n", indent, "",
+				level_name[level], entry_gpa, *entry, entry);
+			pte_dump(stream, vm, indent + 1, pte_addr(vm, *entry), level + 1);
+		}
+	}
+#endif
+}
+
+/*
+ * Print every non-zero top-level table entry of @vm to @stream and hand the
+ * table it refers to over to pte_dump(). Prints nothing when the top-level
+ * table has not been created.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+	int first_level = 4 - (vm->pgtable_levels - 1);
+	uint64_t entry_gpa, pgd_end;
+
+	if (!vm->pgd_created)
+		return;
+
+	pgd_end = vm->pgd + ptrs_per_pgd(vm) * 8;
+
+	for (entry_gpa = vm->pgd; entry_gpa < pgd_end; entry_gpa += 8) {
+		uint64_t *entry = addr_gpa2hva(vm, entry_gpa);
+
+		if (*entry) {
+			fprintf(stream, "%*spgd: %lx: %lx at %p\n", indent, "",
+				entry_gpa, *entry, entry);
+			pte_dump(stream, vm, indent + 1, pte_addr(vm, *entry), first_level);
+		}
+	}
+}
+
+/*
+ * Create a VM in the default mode, load the running test binary into it as
+ * the guest image and add one default vCPU @vcpuid that starts executing at
+ * @guest_code. Two extra pages are added to the default guest memory for
+ * every 512 pages of @extra_mem_pages.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+				 void *guest_code)
+{
+	const uint64_t ptes_per_4k_table = 512;
+	uint64_t extra_table_pages = (extra_mem_pages / ptes_per_4k_table) * 2;
+	uint64_t total_pages = DEFAULT_GUEST_PHY_PAGES + extra_table_pages;
+	struct kvm_vm *vm = vm_create(VM_MODE_DEFAULT, total_pages, O_RDWR);
+
+	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+	vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+	return vm;
+}
+
+/*
+ * Initialise vCPU @vcpuid and program the system registers the guest needs
+ * to run with its MMU enabled.
+ *
+ * @init may be NULL, in which case a local default is used. A target of -1
+ * is replaced by the target reported by KVM_ARM_PREFERRED_TARGET.
+ */
+void aarch64_vcpu_setup(struct kvm_vm *vm, int vcpuid, struct kvm_vcpu_init *init)
+{
+	struct kvm_vcpu_init default_init = { .target = -1, };
+	uint64_t sctlr_el1, tcr_el1;
+	uint64_t tg0 = 0, ips = 0;
+
+	if (!init)
+		init = &default_init;
+
+	if (init->target == -1) {
+		struct kvm_vcpu_init preferred;
+
+		vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &preferred);
+		init->target = preferred.target;
+	}
+
+	vcpu_ioctl(vm, vcpuid, KVM_ARM_VCPU_INIT, init);
+
+	/*
+	 * Enable FP/ASIMD to avoid trapping when accessing Q0-Q15
+	 * registers, which the variable argument list macros do.
+	 */
+	set_reg(vm, vcpuid, ARM64_SYS_REG(CPACR_EL1), 3 << 20);
+
+	get_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), &sctlr_el1);
+	get_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), &tcr_el1);
+
+	/* Pick the granule (TG0) and physical size (IPS) codes for the mode. */
+	switch (vm->mode) {
+	case VM_MODE_P52V48_4K:
+		TEST_FAIL("AArch64 does not support 4K sized pages "
+			  "with 52-bit physical address ranges");
+	case VM_MODE_PXXV48_4K:
+		TEST_FAIL("AArch64 does not support 4K sized pages "
+			  "with ANY-bit physical address ranges");
+	case VM_MODE_P52V48_64K:
+		tg0 = 1;	/* TG0 = 64KB */
+		ips = 6;	/* IPS = 52 bits */
+		break;
+	case VM_MODE_P48V48_4K:
+		tg0 = 0;	/* TG0 = 4KB */
+		ips = 5;	/* IPS = 48 bits */
+		break;
+	case VM_MODE_P48V48_64K:
+		tg0 = 1;	/* TG0 = 64KB */
+		ips = 5;	/* IPS = 48 bits */
+		break;
+	case VM_MODE_P40V48_4K:
+		tg0 = 0;	/* TG0 = 4KB */
+		ips = 2;	/* IPS = 40 bits */
+		break;
+	case VM_MODE_P40V48_64K:
+		tg0 = 1;	/* TG0 = 64KB */
+		ips = 2;	/* IPS = 40 bits */
+		break;
+	default:
+		TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+	}
+
+	tcr_el1 |= tg0 << 14;
+	tcr_el1 |= ips << 32;
+
+	/* M | C | I */
+	sctlr_el1 |= (1 << 0) | (1 << 2) | (1 << 12);
+	/* IRGN0:WBWA | ORGN0:WBWA | SH0:Inner-Shareable */
+	tcr_el1 |= (1 << 8) | (1 << 10) | (3 << 12);
+	/* T0SZ */
+	tcr_el1 |= (64 - vm->va_bits);
+
+	set_reg(vm, vcpuid, ARM64_SYS_REG(SCTLR_EL1), sctlr_el1);
+	set_reg(vm, vcpuid, ARM64_SYS_REG(TCR_EL1), tcr_el1);
+	set_reg(vm, vcpuid, ARM64_SYS_REG(MAIR_EL1), DEFAULT_MAIR_EL1);
+	set_reg(vm, vcpuid, ARM64_SYS_REG(TTBR0_EL1), vm->pgd);
+}
+
+/* Print the pstate and pc core registers of vCPU @vcpuid to @stream. */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+	uint64_t pstate_val;
+	uint64_t pc_val;
+
+	get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pstate), &pstate_val);
+	get_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), &pc_val);
+
+	fprintf(stream, "%*spstate: 0x%.16lx pc: 0x%.16lx\n",
+		indent, "", pstate_val, pc_val);
+}
+
+/*
+ * Add vCPU @vcpuid to @vm, set it up with @init (may be NULL) and point it
+ * at @guest_code with a freshly allocated stack. The stack is
+ * DEFAULT_STACK_PGS pages when the page size is 4096 bytes and a single page
+ * otherwise; sp_el1 is set to the end of the allocation.
+ */
+void aarch64_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid,
+			      struct kvm_vcpu_init *init, void *guest_code)
+{
+	size_t stack_size = vm->page_size;
+	uint64_t stack_base;
+
+	if (vm->page_size == 4096)
+		stack_size = DEFAULT_STACK_PGS * vm->page_size;
+
+	stack_base = vm_vaddr_alloc(vm, stack_size,
+				    DEFAULT_ARM64_GUEST_STACK_VADDR_MIN, 0, 0);
+
+	vm_vcpu_add(vm, vcpuid);
+	aarch64_vcpu_setup(vm, vcpuid, init);
+
+	set_reg(vm, vcpuid, ARM64_CORE_REG(sp_el1), stack_base + stack_size);
+	set_reg(vm, vcpuid, ARM64_CORE_REG(regs.pc), (uint64_t)guest_code);
+}
+
+/* Add vCPU @vcpuid running @guest_code, without a caller-supplied init. */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+	struct kvm_vcpu_init *no_init = NULL;
+
+	aarch64_vcpu_add_default(vm, vcpuid, no_init, guest_code);
+}
+
+/*
+ * Store @num variadic uint64_t arguments into core registers regs[0]
+ * through regs[@num - 1] of vCPU @vcpuid. @num must be between 1 and 8.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+	va_list args;
+	unsigned int reg = 0;
+
+	TEST_ASSERT(num >= 1 && num <= 8, "Unsupported number of args,\n"
+		    " num: %u\n", num);
+
+	va_start(args, num);
+
+	while (reg < num) {
+		set_reg(vm, vcpuid, ARM64_CORE_REG(regs.regs[reg]),
+			va_arg(args, uint64_t));
+		reg++;
+	}
+
+	va_end(args);
+}
+
+/* No-op in this AArch64 implementation: the body is empty. */
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+}
diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
new file mode 100644
index 000000000..f600311fd
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+
+static vm_vaddr_t *ucall_exit_mmio_addr;
+
+static bool ucall_mmio_init(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+ if (kvm_userspace_memory_region_find(vm, gpa, gpa + 1))
+ return false;
+
+ virt_pg_map(vm, gpa, gpa, 0);
+
+ ucall_exit_mmio_addr = (vm_vaddr_t *)gpa;
+ sync_global_to_guest(vm, ucall_exit_mmio_addr);
+
+ return true;
+}
+
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+ vm_paddr_t gpa, start, end, step, offset;
+ unsigned int bits;
+ bool ret;
+
+ if (arg) {
+ gpa = (vm_paddr_t)arg;
+ ret = ucall_mmio_init(vm, gpa);
+ TEST_ASSERT(ret, "Can't set ucall mmio address to %lx", gpa);
+ return;
+ }
+
+ /*
+ * Find an address within the allowed physical and virtual address
+ * spaces, that does _not_ have a KVM memory region associated with
+ * it. Identity mapping an address like this allows the guest to
+ * access it, but as KVM doesn't know what to do with it, it
+ * will assume it's something userspace handles and exit with
+ * KVM_EXIT_MMIO. Well, at least that's how it works for AArch64.
+ * Here we start with a guess that the addresses around 5/8th
+ * of the allowed space are unmapped and then work both down and
+ * up from there in 1/16th allowed space sized steps.
+ *
+ * Note, we need to use VA-bits - 1 when calculating the allowed
+ * virtual address space for an identity mapping because the upper
+ * half of the virtual address space is the two's complement of the
+ * lower and won't match physical addresses.
+ */
+ bits = vm->va_bits - 1;
+ bits = vm->pa_bits < bits ? vm->pa_bits : bits;
+ end = 1ul << bits;
+ start = end * 5 / 8;
+ step = end / 16;
+ for (offset = 0; offset < end - start; offset += step) {
+ if (ucall_mmio_init(vm, start - offset))
+ return;
+ if (ucall_mmio_init(vm, start + offset))
+ return;
+ }
+ TEST_FAIL("Can't find a ucall mmio address");
+}
+
+void ucall_uninit(struct kvm_vm *vm)
+{
+ ucall_exit_mmio_addr = 0;
+ sync_global_to_guest(vm, ucall_exit_mmio_addr);
+}
+
+void ucall(uint64_t cmd, int nargs, ...)
+{
+ struct ucall uc = {};
+ va_list va;
+ int i;
+
+ WRITE_ONCE(uc.cmd, cmd);
+ nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS;
+
+ va_start(va, nargs);
+ for (i = 0; i < nargs; ++i)
+ WRITE_ONCE(uc.args[i], va_arg(va, uint64_t));
+ va_end(va);
+
+ WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc);
+}
+
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+ struct kvm_run *run = vcpu_state(vm, vcpu_id);
+ struct ucall ucall = {};
+
+ if (uc)
+ memset(uc, 0, sizeof(*uc));
+
+ if (run->exit_reason == KVM_EXIT_MMIO &&
+ run->mmio.phys_addr == (uint64_t)ucall_exit_mmio_addr) {
+ vm_vaddr_t gva;
+
+ TEST_ASSERT(run->mmio.is_write && run->mmio.len == 8,
+ "Unexpected ucall exit mmio address access");
+ memcpy(&gva, run->mmio.data, sizeof(gva));
+ memcpy(&ucall, addr_gva2hva(vm, gva), sizeof(ucall));
+
+ vcpu_run_complete_io(vm, vcpu_id);
+ if (uc)
+ memcpy(uc, &ucall, sizeof(ucall));
+ }
+
+ return ucall.cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/assert.c b/tools/testing/selftests/kvm/lib/assert.c
new file mode 100644
index 000000000..5ebbd0d6b
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/assert.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/assert.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for getline(3) and strchrnul(3)*/
+
+#include "test_util.h"
+
+#include <execinfo.h>
+#include <sys/syscall.h>
+
+#include "kselftest.h"
+
+/* Dumps the current stack trace to stderr. */
+static void __attribute__((noinline)) test_dump_stack(void);
+static void test_dump_stack(void)
+{
+ /*
+ * Build and run this command:
+ *
+ * addr2line -s -e /proc/$PPID/exe -fpai {backtrace addresses} | \
+ * cat -n 1>&2
+ *
+ * Note that the spacing is different and there's no newline. The
+ * command is assembled from the addr2line prefix, one hex address per
+ * backtrace entry, and the pipeline suffix, then handed to system(3).
+ */
+ size_t i;
+ size_t n = 20;
+ void *stack[n];
+ const char *addr2line = "addr2line -s -e /proc/$PPID/exe -fpai";
+ const char *pipeline = "|cat -n 1>&2";
+ char cmd[strlen(addr2line) + strlen(pipeline) +
+ /* N bytes per addr * 2 digits per byte + 1 space per addr: */
+ n * (((sizeof(void *)) * 2) + 1) +
+ /* Null terminator: */
+ 1];
+ char *c;
+
+ n = backtrace(stack, n);
+ c = &cmd[0];
+ c += sprintf(c, "%s", addr2line);
+ /*
+ * Skip the first 2 entries (the loop starts at index 2), presumably
+ * the frames of test_dump_stack and test_assert, both of which are
+ * declared noinline. Each address has 1 subtracted before printing,
+ * presumably so that addr2line resolves the call site rather than the
+ * return address.
+ * NOTE(review): an earlier version of this comment said 3 frames were
+ * skipped (counting backtrace itself); confirm which frames
+ * backtrace() actually records first.
+ */
+ for (i = 2; i < n; i++)
+ c += sprintf(c, " %lx", ((unsigned long) stack[i]) - 1);
+ c += sprintf(c, "%s", pipeline);
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-result"
+ system(cmd);
+#pragma GCC diagnostic pop
+}
+
+static pid_t _gettid(void)
+{
+ return syscall(SYS_gettid);
+}
+
+void __attribute__((noinline))
+test_assert(bool exp, const char *exp_str,
+ const char *file, unsigned int line, const char *fmt, ...)
+{
+ va_list ap;
+
+ if (!(exp)) {
+ va_start(ap, fmt);
+
+ fprintf(stderr, "==== Test Assertion Failure ====\n"
+ " %s:%u: %s\n"
+ " pid=%d tid=%d - %s\n",
+ file, line, exp_str, getpid(), _gettid(),
+ strerror(errno));
+ test_dump_stack();
+ if (fmt) {
+ fputs(" ", stderr);
+ vfprintf(stderr, fmt, ap);
+ fputs("\n", stderr);
+ }
+ va_end(ap);
+
+ if (errno == EACCES) {
+ print_skip("Access denied - Exiting");
+ exit(KSFT_SKIP);
+ }
+ exit(254);
+ }
+
+ return;
+}
diff --git a/tools/testing/selftests/kvm/lib/elf.c b/tools/testing/selftests/kvm/lib/elf.c
new file mode 100644
index 000000000..bc75a91e0
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/elf.c
@@ -0,0 +1,196 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/elf.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+
+#include <bits/endian.h>
+#include <linux/elf.h>
+
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+
+/*
+ * Reads the ELF header of the file named by filename into *hdrp after
+ * validating the identification bytes (magic, class, endianness, version)
+ * and the program/section header entry sizes. Every error condition
+ * produces a TEST_ASSERT failure. The file descriptor opened here is
+ * closed again before returning.
+ */
+static void elfhdr_get(const char *filename, Elf64_Ehdr *hdrp)
+{
+ off_t offset_rv;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in and validate ELF Identification Record.
+ * The ELF Identification record is the first 16 (EI_NIDENT) bytes
+ * of the ELF header, which is at the beginning of the ELF file.
+ * For now it is only safe to read the first EI_NIDENT bytes. Once
+ * read and validated, the value of e_ehsize can be used to determine
+ * the real size of the ELF header.
+ */
+ unsigned char ident[EI_NIDENT];
+ test_read(fd, ident, sizeof(ident));
+ TEST_ASSERT((ident[EI_MAG0] == ELFMAG0) && (ident[EI_MAG1] == ELFMAG1)
+ && (ident[EI_MAG2] == ELFMAG2) && (ident[EI_MAG3] == ELFMAG3),
+ "ELF MAGIC Mismatch,\n"
+ " filename: %s\n"
+ " ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n"
+ " Expected: %02x %02x %02x %02x",
+ filename,
+ ident[EI_MAG0], ident[EI_MAG1], ident[EI_MAG2], ident[EI_MAG3],
+ ELFMAG0, ELFMAG1, ELFMAG2, ELFMAG3);
+ TEST_ASSERT(ident[EI_CLASS] == ELFCLASS64,
+ "Current implementation only able to handle ELFCLASS64,\n"
+ " filename: %s\n"
+ " ident[EI_CLASS]: %02x\n"
+ " expected: %02x",
+ filename,
+ ident[EI_CLASS], ELFCLASS64);
+ TEST_ASSERT(((BYTE_ORDER == LITTLE_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2LSB))
+ || ((BYTE_ORDER == BIG_ENDIAN)
+ && (ident[EI_DATA] == ELFDATA2MSB)), "Current "
+ "implementation only able to handle\n"
+ "cases where the host and ELF file endianness\n"
+ "is the same:\n"
+ " host BYTE_ORDER: %u\n"
+ " host LITTLE_ENDIAN: %u\n"
+ " host BIG_ENDIAN: %u\n"
+ " ident[EI_DATA]: %u\n"
+ " ELFDATA2LSB: %u\n"
+ " ELFDATA2MSB: %u",
+ BYTE_ORDER, LITTLE_ENDIAN, BIG_ENDIAN,
+ ident[EI_DATA], ELFDATA2LSB, ELFDATA2MSB);
+ TEST_ASSERT(ident[EI_VERSION] == EV_CURRENT,
+ "Current implementation only able to handle current "
+ "ELF version,\n"
+ " filename: %s\n"
+ " ident[EI_VERSION]: %02x\n"
+ " expected: %02x",
+ filename, ident[EI_VERSION], EV_CURRENT);
+
+ /* Read in the ELF header.
+ * With the ELF Identification portion of the ELF header
+ * validated, especially that the value at EI_VERSION is
+ * as expected, it is now safe to read the entire ELF header.
+ */
+ offset_rv = lseek(fd, 0, SEEK_SET);
+ TEST_ASSERT(offset_rv == 0, "Seek to ELF header failed,\n"
+ " rv: %zi expected: %i", offset_rv, 0);
+ test_read(fd, hdrp, sizeof(*hdrp));
+ TEST_ASSERT(hdrp->e_phentsize == sizeof(Elf64_Phdr),
+ "Unexpected physical header size,\n"
+ " hdrp->e_phentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_phentsize, sizeof(Elf64_Phdr));
+ TEST_ASSERT(hdrp->e_shentsize == sizeof(Elf64_Shdr),
+ "Unexpected section header size,\n"
+ " hdrp->e_shentsize: %x\n"
+ " expected: %zx",
+ hdrp->e_shentsize, sizeof(Elf64_Shdr));
+
+ /* The header has been copied out; don't leak the descriptor. */
+ close(fd);
+}
+
+/* VM ELF Load
+ *
+ * Input Args:
+ * filename - Path to ELF file
+ *
+ * Output Args: None
+ *
+ * Input/Output Args:
+ * vm - Pointer to opaque type that describes the VM.
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Loads the program image of the ELF file specified by filename,
+ * into the virtual address space of the VM pointed to by vm. On entry
+ * the VM needs to not be using any of the virtual address space used
+ * by the image and it needs to have sufficient available physical pages, to
+ * back the virtual pages used to load the image.
+ */
+void kvm_vm_elf_load(struct kvm_vm *vm, const char *filename,
+ uint32_t data_memslot, uint32_t pgd_memslot)
+{
+ off_t offset, offset_rv;
+ Elf64_Ehdr hdr;
+
+ /* Open the ELF file. */
+ int fd;
+ fd = open(filename, O_RDONLY);
+ TEST_ASSERT(fd >= 0, "Failed to open ELF file,\n"
+ " filename: %s\n"
+ " rv: %i errno: %i", filename, fd, errno);
+
+ /* Read in the ELF header. */
+ elfhdr_get(filename, &hdr);
+
+ /* For each program header.
+ * The following ELF header members specify the location
+ * and size of the program headers:
+ *
+ * e_phoff - File offset to start of program headers
+ * e_phentsize - Size of each program header
+ * e_phnum - Number of program header entries
+ */
+ for (unsigned int n1 = 0; n1 < hdr.e_phnum; n1++) {
+ /* Seek to the beginning of the program header. */
+ offset = hdr.e_phoff + (n1 * hdr.e_phentsize);
+ offset_rv = lseek(fd, offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == offset,
+ "Failed to seek to begining of program header %u,\n"
+ " filename: %s\n"
+ " rv: %jd errno: %i",
+ n1, filename, (intmax_t) offset_rv, errno);
+
+ /* Read in the program header. */
+ Elf64_Phdr phdr;
+ test_read(fd, &phdr, sizeof(phdr));
+
+ /* Skip if this header doesn't describe a loadable segment. */
+ if (phdr.p_type != PT_LOAD)
+ continue;
+
+ /* Allocate memory for this segment within the VM. */
+ TEST_ASSERT(phdr.p_memsz > 0, "Unexpected loadable segment "
+ "memsize of 0,\n"
+ " phdr index: %u p_memsz: 0x%" PRIx64,
+ n1, (uint64_t) phdr.p_memsz);
+ vm_vaddr_t seg_vstart = phdr.p_vaddr;
+ seg_vstart &= ~(vm_vaddr_t)(vm->page_size - 1);
+ vm_vaddr_t seg_vend = phdr.p_vaddr + phdr.p_memsz - 1;
+ seg_vend |= vm->page_size - 1;
+ size_t seg_size = seg_vend - seg_vstart + 1;
+
+ vm_vaddr_t vaddr = vm_vaddr_alloc(vm, seg_size, seg_vstart,
+ data_memslot, pgd_memslot);
+ TEST_ASSERT(vaddr == seg_vstart, "Unable to allocate "
+ "virtual memory for segment at requested min addr,\n"
+ " segment idx: %u\n"
+ " seg_vstart: 0x%lx\n"
+ " vaddr: 0x%lx",
+ n1, seg_vstart, vaddr);
+ memset(addr_gva2hva(vm, vaddr), 0, seg_size);
+ /* TODO(lhuemill): Set permissions of each memory segment
+ * based on the least-significant 3 bits of phdr.p_flags.
+ */
+
+ /* Load portion of initial state that is contained within
+ * the ELF file.
+ */
+ if (phdr.p_filesz) {
+ offset_rv = lseek(fd, phdr.p_offset, SEEK_SET);
+ TEST_ASSERT(offset_rv == phdr.p_offset,
+ "Seek to program segment offset failed,\n"
+ " program header idx: %u errno: %i\n"
+ " offset_rv: 0x%jx\n"
+ " expected: 0x%jx\n",
+ n1, errno, (intmax_t) offset_rv,
+ (intmax_t) phdr.p_offset);
+ test_read(fd, addr_gva2hva(vm, phdr.p_vaddr),
+ phdr.p_filesz);
+ }
+ }
+
+ /* Every program header has been processed; release the ELF file. */
+ close(fd);
+}
diff --git a/tools/testing/selftests/kvm/lib/io.c b/tools/testing/selftests/kvm/lib/io.c
new file mode 100644
index 000000000..fedb2a741
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/io.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/io.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+
+/* Test Write
+ *
+ * A wrapper for write(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Write of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional write is performed to automatically
+ * continue writing the requested data.
+ * There are also many cases where write(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure.
+ *
+ * Note, for function signature compatibility with write(2), this function
+ * returns the number of bytes written, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * write(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ * fd - Opened file descriptor to file to be written.
+ * buf - Starting address of data to be written.
+ * count - Number of bytes to write.
+ *
+ * Output: None
+ *
+ * Return:
+ * On success, number of bytes written.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_write(int fd, const void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_written = 0;
+ size_t num_left = count;
+ const char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "RETURN VALUE" portion of
+ * write(2) manpage for details.
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = write(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected write failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ continue;
+
+ case 0:
+ TEST_FAIL("Unexpected EOF,\n"
+ " rc: %zi num_written: %zi num_left: %zu",
+ rc, num_written, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc >= 0, "Unexpected ret from write,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_written += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_written < count);
+
+ return num_written;
+}
+
+/* Test Read
+ *
+ * A wrapper for read(2), that automatically handles the following
+ * special conditions:
+ *
+ * + Interrupted system call (EINTR)
+ * + Read of less than requested amount
+ * + Non-block return (EAGAIN)
+ *
+ * For each of the above, an additional read is performed to automatically
+ * continue reading the requested data.
+ * There are also many cases where read(2) can return an unexpected
+ * error (e.g. EIO). Such errors cause a TEST_ASSERT failure. Note,
+ * it is expected that the file opened by fd at the current file position
+ * contains at least the number of requested bytes to be read. A TEST_ASSERT
+ * failure is produced if an End-Of-File condition occurs, before all the
+ * data is read. It is the caller's responsibility to ensure that sufficient
+ * data exists.
+ *
+ * Note, for function signature compatibility with read(2), this function
+ * returns the number of bytes read, but that value will always be equal
+ * to the number of requested bytes. All other conditions in this and
+ * future enhancements to this function either automatically issue another
+ * read(2) or cause a TEST_ASSERT failure.
+ *
+ * Args:
+ * fd - Opened file descriptor to file to be read.
+ * count - Number of bytes to read.
+ *
+ * Output:
+ * buf - Starting address of where to write the bytes read.
+ *
+ * Return:
+ * On success, number of bytes read.
+ * On failure, a TEST_ASSERT failure is caused.
+ */
+ssize_t test_read(int fd, void *buf, size_t count)
+{
+ ssize_t rc;
+ ssize_t num_read = 0;
+ size_t num_left = count;
+ char *ptr = buf;
+
+ /* Note: Count of zero is allowed (see "If count is zero" portion of
+ * read(2) manpage for details.
+ */
+ TEST_ASSERT(count >= 0, "Unexpected count, count: %li", count);
+
+ do {
+ rc = read(fd, ptr, num_left);
+
+ switch (rc) {
+ case -1:
+ TEST_ASSERT(errno == EAGAIN || errno == EINTR,
+ "Unexpected read failure,\n"
+ " rc: %zi errno: %i", rc, errno);
+ break;
+
+ case 0:
+ TEST_FAIL("Unexpected EOF,\n"
+ " rc: %zi num_read: %zi num_left: %zu",
+ rc, num_read, num_left);
+ break;
+
+ default:
+ TEST_ASSERT(rc > 0, "Unexpected ret from read,\n"
+ " rc: %zi errno: %i", rc, errno);
+ num_read += rc;
+ num_left -= rc;
+ ptr += rc;
+ break;
+ }
+ } while (num_read < count);
+
+ return num_read;
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
new file mode 100644
index 000000000..49805fd16
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -0,0 +1,1865 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "kvm_util_internal.h"
+#include "processor.h"
+
+#include <assert.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <linux/kernel.h>
+
+#define KVM_UTIL_PGS_PER_HUGEPG 512
+#define KVM_UTIL_MIN_PFN 2
+
+/* Aligns x up to the next multiple of size. Size must be a power of 2. */
+static void *align(void *x, size_t size)
+{
+ size_t mask = size - 1;
+ TEST_ASSERT(size != 0 && !(size & (size - 1)),
+ "size not a power of 2: %lu", size);
+ return (void *) (((size_t) x + mask) & ~mask);
+}
+
+/*
+ * Capability
+ *
+ * Input Args:
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return:
+ * On success, the Value corresponding to the capability (KVM_CAP_*)
+ * specified by the value of cap. On failure a TEST_ASSERT failure
+ * is produced.
+ *
+ * Looks up and returns the value corresponding to the capability
+ * (KVM_CAP_*) given by cap.
+ */
+int kvm_check_cap(long cap)
+{
+ int ret;
+ int kvm_fd;
+
+ kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+ if (kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, cap);
+ TEST_ASSERT(ret >= 0, "KVM_CHECK_EXTENSION IOCTL failed,\n"
+ " rc: %i errno: %i", ret, errno);
+
+ close(kvm_fd);
+
+ return ret;
+}
+
+/* VM Enable Capability
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return: On success, 0. On failure a TEST_ASSERT failure is produced.
+ *
+ * Enables a capability (KVM_CAP_*) on the VM.
+ */
+int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
+{
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_ENABLE_CAP, cap);
+ TEST_ASSERT(ret == 0, "KVM_ENABLE_CAP IOCTL failed,\n"
+ " rc: %i errno: %i", ret, errno);
+
+ return ret;
+}
+
+/* VCPU Enable Capability
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpu_id - VCPU
+ * cap - Capability
+ *
+ * Output Args: None
+ *
+ * Return: On success, 0. On failure a TEST_ASSERT failure is produced.
+ *
+ * Enables a capability (KVM_CAP_*) on the VCPU.
+ */
+int vcpu_enable_cap(struct kvm_vm *vm, uint32_t vcpu_id,
+ struct kvm_enable_cap *cap)
+{
+ struct vcpu *vcpu = vcpu_find(vm, vcpu_id);
+ int r;
+
+ TEST_ASSERT(vcpu, "cannot find vcpu %d", vcpu_id);
+
+ r = ioctl(vcpu->fd, KVM_ENABLE_CAP, cap);
+ TEST_ASSERT(!r, "KVM_ENABLE_CAP vCPU ioctl failed,\n"
+ " rc: %i, errno: %i", r, errno);
+
+ return r;
+}
+
+static void vm_open(struct kvm_vm *vm, int perm)
+{
+ vm->kvm_fd = open(KVM_DEV_PATH, perm);
+ if (vm->kvm_fd < 0)
+ exit(KSFT_SKIP);
+
+ if (!kvm_check_cap(KVM_CAP_IMMEDIATE_EXIT)) {
+ print_skip("immediate_exit not available");
+ exit(KSFT_SKIP);
+ }
+
+ vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, vm->type);
+ TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
+ "rc: %i errno: %i", vm->fd, errno);
+}
+
+const char * const vm_guest_mode_string[] = {
+ "PA-bits:52, VA-bits:48, 4K pages",
+ "PA-bits:52, VA-bits:48, 64K pages",
+ "PA-bits:48, VA-bits:48, 4K pages",
+ "PA-bits:48, VA-bits:48, 64K pages",
+ "PA-bits:40, VA-bits:48, 4K pages",
+ "PA-bits:40, VA-bits:48, 64K pages",
+ "PA-bits:ANY, VA-bits:48, 4K pages",
+};
+_Static_assert(sizeof(vm_guest_mode_string)/sizeof(char *) == NUM_VM_MODES,
+ "Missing new mode strings?");
+
+struct vm_guest_mode_params {
+ unsigned int pa_bits;
+ unsigned int va_bits;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+static const struct vm_guest_mode_params vm_guest_mode_params[] = {
+ { 52, 48, 0x1000, 12 },
+ { 52, 48, 0x10000, 16 },
+ { 48, 48, 0x1000, 12 },
+ { 48, 48, 0x10000, 16 },
+ { 40, 48, 0x1000, 12 },
+ { 40, 48, 0x10000, 16 },
+ { 0, 0, 0x1000, 12 },
+};
+_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
+ "Missing new mode params?");
+
+/*
+ * VM Create
+ *
+ * Input Args:
+ * mode - VM Mode (e.g. VM_MODE_P52V48_4K)
+ * phy_pages - Physical memory pages
+ * perm - permission
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to opaque structure that describes the created VM.
+ *
+ * Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K).
+ * When phy_pages is non-zero, a memory region of phy_pages physical pages
+ * is created and mapped starting at guest physical address 0. The file
+ * descriptor to control the created VM is created with the permissions
+ * given by perm (e.g. O_RDWR).
+ */
+struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
+{
+ struct kvm_vm *vm;
+
+ pr_debug("%s: mode='%s' pages='%ld' perm='%d'\n", __func__,
+ vm_guest_mode_string(mode), phy_pages, perm);
+
+ vm = calloc(1, sizeof(*vm));
+ TEST_ASSERT(vm != NULL, "Insufficient Memory");
+
+ INIT_LIST_HEAD(&vm->vcpus);
+ INIT_LIST_HEAD(&vm->userspace_mem_regions);
+
+ vm->mode = mode;
+ vm->type = 0;
+
+ vm->pa_bits = vm_guest_mode_params[mode].pa_bits;
+ vm->va_bits = vm_guest_mode_params[mode].va_bits;
+ vm->page_size = vm_guest_mode_params[mode].page_size;
+ vm->page_shift = vm_guest_mode_params[mode].page_shift;
+
+ /* Setup mode specific traits. */
+ switch (vm->mode) {
+ case VM_MODE_P52V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P52V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_P48V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P48V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_P40V48_4K:
+ vm->pgtable_levels = 4;
+ break;
+ case VM_MODE_P40V48_64K:
+ vm->pgtable_levels = 3;
+ break;
+ case VM_MODE_PXXV48_4K:
+#ifdef __x86_64__
+ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits);
+ /*
+ * Ignore KVM support for 5-level paging (vm->va_bits == 57),
+ * it doesn't take effect unless a CR4.LA57 is set, which it
+ * isn't for this VM_MODE.
+ */
+ TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57,
+ "Linear address width (%d bits) not supported",
+ vm->va_bits);
+ pr_debug("Guest physical address width detected: %d\n",
+ vm->pa_bits);
+ vm->pgtable_levels = 4;
+ vm->va_bits = 48;
+#else
+ TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
+#endif
+ break;
+ default:
+ TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
+ }
+
+#ifdef __aarch64__
+ if (vm->pa_bits != 40)
+ vm->type = KVM_VM_TYPE_ARM_IPA_SIZE(vm->pa_bits);
+#endif
+
+ vm_open(vm, perm);
+
+ /* Limit to VA-bit canonical virtual addresses. */
+ vm->vpages_valid = sparsebit_alloc();
+ sparsebit_set_num(vm->vpages_valid,
+ 0, (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+ sparsebit_set_num(vm->vpages_valid,
+ (~((1ULL << (vm->va_bits - 1)) - 1)) >> vm->page_shift,
+ (1ULL << (vm->va_bits - 1)) >> vm->page_shift);
+
+ /* Limit physical addresses to PA-bits. */
+ vm->max_gfn = ((1ULL << vm->pa_bits) >> vm->page_shift) - 1;
+
+ /* Allocate and setup memory for guest. */
+ vm->vpages_mapped = sparsebit_alloc();
+ if (phy_pages != 0)
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ 0, 0, phy_pages, 0);
+
+ return vm;
+}
+
+/*
+ * VM Restart
+ *
+ * Input Args:
+ * vm - VM that has been released before
+ * perm - permission
+ *
+ * Output Args: None
+ *
+ * Reopens the file descriptors associated to the VM and reinstates the
+ * global state, such as the irqchip and the memory regions that are mapped
+ * into the guest.
+ */
+void kvm_vm_restart(struct kvm_vm *vmp, int perm)
+{
+ struct userspace_mem_region *region;
+
+ vm_open(vmp, perm);
+ if (vmp->has_irqchip)
+ vm_create_irqchip(vmp);
+
+ list_for_each_entry(region, &vmp->userspace_mem_regions, list) {
+ int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+ " rc: %i errno: %i\n"
+ " slot: %u flags: 0x%x\n"
+ " guest_phys_addr: 0x%llx size: 0x%llx",
+ ret, errno, region->region.slot,
+ region->region.flags,
+ region->region.guest_phys_addr,
+ region->region.memory_size);
+ }
+}
+
+/*
+ * Issues KVM_GET_DIRTY_LOG on the VM for memory slot 'slot', passing 'log'
+ * as the dirty bitmap buffer. A failing ioctl produces a TEST_ASSERT
+ * failure.
+ */
+void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log)
+{
+ struct kvm_dirty_log args = { .dirty_bitmap = log, .slot = slot };
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_GET_DIRTY_LOG, &args);
+ /* ioctl() returns -1 on error; the cause is reported via errno. */
+ TEST_ASSERT(ret == 0, "%s: KVM_GET_DIRTY_LOG failed: %s",
+ __func__, strerror(errno));
+}
+
+/*
+ * Issues KVM_CLEAR_DIRTY_LOG on the VM for memory slot 'slot', covering
+ * 'num_pages' pages starting at 'first_page', with 'log' as the bitmap.
+ * A failing ioctl produces a TEST_ASSERT failure.
+ */
+void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
+ uint64_t first_page, uint32_t num_pages)
+{
+ struct kvm_clear_dirty_log args = { .dirty_bitmap = log, .slot = slot,
+ .first_page = first_page,
+ .num_pages = num_pages };
+ int ret;
+
+ ret = ioctl(vm->fd, KVM_CLEAR_DIRTY_LOG, &args);
+ /* ioctl() returns -1 on error; the cause is reported via errno. */
+ TEST_ASSERT(ret == 0, "%s: KVM_CLEAR_DIRTY_LOG failed: %s",
+ __func__, strerror(errno));
+}
+
+/*
+ * Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Searches for a region with any physical memory that overlaps with
+ * any portion of the guest physical addresses from start to end
+ * inclusive. If multiple overlapping regions exist, a pointer to any
+ * of the regions is returned. Null is returned only when no overlapping
+ * region exists.
+ */
+static struct userspace_mem_region *
+userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+ uint64_t existing_start = region->region.guest_phys_addr;
+ uint64_t existing_end = region->region.guest_phys_addr
+ + region->region.memory_size - 1;
+ if (start <= existing_end && end >= existing_start)
+ return region;
+ }
+
+ return NULL;
+}
+
+/*
+ * KVM Userspace Memory Region Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * start - Starting VM physical address
+ * end - Ending VM physical address, inclusive.
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to overlapping region, NULL if no such region.
+ *
+ * Public interface to userspace_mem_region_find. Allows tests to look up
+ * the memslot datastructure for a given range of guest physical memory.
+ */
+struct kvm_userspace_memory_region *
+kvm_userspace_memory_region_find(struct kvm_vm *vm, uint64_t start,
+ uint64_t end)
+{
+ struct userspace_mem_region *region;
+
+ region = userspace_mem_region_find(vm, start, end);
+ if (!region)
+ return NULL;
+
+ return &region->region;
+}
+
+/*
+ * VCPU Find
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Pointer to VCPU structure
+ *
+ * Locates a vcpu structure that describes the VCPU specified by vcpuid and
+ * returns a pointer to it. Returns NULL if the VM doesn't contain a VCPU
+ * for the specified vcpuid.
+ */
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct vcpu *vcpu;
+
+ list_for_each_entry(vcpu, &vm->vcpus, list) {
+ if (vcpu->id == vcpuid)
+ return vcpu;
+ }
+
+ return NULL;
+}
+
+/*
+ * VM VCPU Remove
+ *
+ * Input Args:
+ * vcpu - VCPU to remove
+ *
+ * Output Args: None
+ *
+ * Return: None, TEST_ASSERT failures for all error conditions
+ *
+ * Removes a vCPU from a VM and frees its resources.
+ */
+static void vm_vcpu_rm(struct vcpu *vcpu)
+{
+ int ret;
+
+ /* Unmap the vCPU state mapping before closing its file descriptor. */
+ ret = munmap(vcpu->state, sizeof(*vcpu->state));
+ TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
+ "errno: %i", ret, errno);
+ /* Capture close()'s result so the assert checks this call. */
+ ret = close(vcpu->fd);
+ TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
+ "errno: %i", ret, errno);
+
+ /* Unlink from the VM's vCPU list, then free the descriptor. */
+ list_del(&vcpu->list);
+ free(vcpu);
+}
+
+/*
+ * Removes every vCPU of the VM pointed to by vmp and closes the VM and
+ * /dev/kvm file descriptors. The kvm_vm structure itself is not freed
+ * here. Any failing close produces a TEST_ASSERT failure.
+ */
+void kvm_vm_release(struct kvm_vm *vmp)
+{
+ struct vcpu *vcpu, *tmp;
+ int ret;
+
+ /* The _safe iterator is needed because vm_vcpu_rm() unlinks entries. */
+ list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list)
+ vm_vcpu_rm(vcpu);
+
+ ret = close(vmp->fd);
+ TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
+ " vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
+
+ /* Capture close()'s result so the assert checks this call. */
+ ret = close(vmp->kvm_fd);
+ TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n"
+ " vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
+}
+
+static void __vm_mem_region_delete(struct kvm_vm *vm,
+ struct userspace_mem_region *region)
+{
+ int ret;
+
+ list_del(&region->list);
+
+ region->region.memory_size = 0;
+ ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+ TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed, "
+ "rc: %i errno: %i", ret, errno);
+
+ sparsebit_free(&region->unused_phy_pages);
+ ret = munmap(region->mmap_start, region->mmap_size);
+ TEST_ASSERT(ret == 0, "munmap failed, rc: %i errno: %i", ret, errno);
+
+ free(region);
+}
+
+/*
+ * Destroys and frees the VM pointed to by vmp.
+ */
+void kvm_vm_free(struct kvm_vm *vmp)
+{
+ struct userspace_mem_region *region, *tmp;
+
+ if (vmp == NULL)
+ return;
+
+ /* Free userspace_mem_regions. */
+ list_for_each_entry_safe(region, tmp, &vmp->userspace_mem_regions, list)
+ __vm_mem_region_delete(vmp, region);
+
+ /* Free sparsebit arrays. */
+ sparsebit_free(&vmp->vpages_valid);
+ sparsebit_free(&vmp->vpages_mapped);
+
+ kvm_vm_release(vmp);
+
+ /* Free the structure describing the VM. */
+ free(vmp);
+}
+
+/*
+ * Memory Compare, host virtual to guest virtual
+ *
+ * Input Args:
+ * hva - Starting host virtual address
+ * vm - Virtual Machine
+ * gva - Starting guest virtual address
+ * len - number of bytes to compare
+ *
+ * Output Args: None
+ *
+ * Input/Output Args: None
+ *
+ * Return:
+ * Returns 0 if the bytes starting at hva for a length of len
+ * are equal the guest virtual bytes starting at gva. Returns
+ * a value < 0, if bytes at hva are less than those at gva.
+ * Otherwise a value > 0 is returned.
+ *
+ * Compares the bytes starting at the host virtual address hva, for
+ * a length of len, to the guest bytes starting at the guest virtual
+ * address given by gva.
+ */
+int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, vm_vaddr_t gva, size_t len)
+{
+ size_t amt;
+
+ /*
+ * Compare a batch of bytes at a time until either a mismatch is
+ * found or all the bytes have been compared.
+ */
+ for (uintptr_t offset = 0; offset < len; offset += amt) {
+ uintptr_t ptr1 = (uintptr_t)hva + offset;
+
+ /*
+ * Determine host address for guest virtual address
+ * at offset.
+ */
+ uintptr_t ptr2 = (uintptr_t)addr_gva2hva(vm, gva + offset);
+
+ /*
+ * Determine amount to compare on this pass.
+ * Don't allow the comparison to cross a page boundary
+ * on either the host side (ptr1) or the guest side (ptr2).
+ */
+ amt = len - offset;
+ if ((ptr1 >> vm->page_shift) != ((ptr1 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr1 % vm->page_size);
+ if ((ptr2 >> vm->page_shift) != ((ptr2 + amt) >> vm->page_shift))
+ amt = vm->page_size - (ptr2 % vm->page_size);
+
+ assert((ptr1 >> vm->page_shift) == ((ptr1 + amt - 1) >> vm->page_shift));
+ assert((ptr2 >> vm->page_shift) == ((ptr2 + amt - 1) >> vm->page_shift));
+
+ /*
+ * Perform the comparison. If there is a difference
+ * return that result to the caller, otherwise need
+ * to continue on looking for a mismatch.
+ */
+ int ret = memcmp((void *)ptr1, (void *)ptr2, amt);
+ if (ret != 0)
+ return ret;
+ }
+
+ /*
+ * No mismatch found. Let the caller know the two memory
+ * areas are equal.
+ */
+ return 0;
+}
+
+/*
+ * VM Userspace Memory Region Add
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   src_type - Kind of host memory backing the region
+ *              (VM_MEM_SRC_ANONYMOUS, VM_MEM_SRC_ANONYMOUS_THP or
+ *              VM_MEM_SRC_ANONYMOUS_HUGETLB)
+ *   guest_paddr - Starting guest physical address
+ *   slot - KVM region slot
+ *   npages - Number of physical pages
+ *   flags - KVM memory region flags (e.g. KVM_MEM_LOG_DIRTY_PAGES)
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Allocates a memory area of the number of pages specified by npages
+ * and maps it to the VM specified by vm, at a starting physical address
+ * given by guest_paddr. The region is created with a KVM region slot
+ * given by slot, which must be unique and < KVM_MEM_SLOTS_NUM. The
+ * region is created with the flags given by flags.
+ *
+ * A TEST_ASSERT failure occurs if the request is invalid (unaligned or
+ * out-of-range address, overlap with an existing region, slot already
+ * in use) or if any of the host operations fails.
+ */
+void vm_userspace_mem_region_add(struct kvm_vm *vm,
+	enum vm_mem_backing_src_type src_type,
+	uint64_t guest_paddr, uint32_t slot, uint64_t npages,
+	uint32_t flags)
+{
+	int ret;
+	struct userspace_mem_region *region;
+	size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size;
+	size_t alignment;
+
+	TEST_ASSERT(vm_adjust_num_guest_pages(vm->mode, npages) == npages,
+		"Number of guest pages is not compatible with the host. "
+		"Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages));
+
+	TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical "
+		"address not on a page boundary.\n"
+		" guest_paddr: 0x%lx vm->page_size: 0x%x",
+		guest_paddr, vm->page_size);
+	TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1)
+		<= vm->max_gfn, "Physical range beyond maximum "
+		"supported physical address,\n"
+		" guest_paddr: 0x%lx npages: 0x%lx\n"
+		" vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		guest_paddr, npages, vm->max_gfn, vm->page_size);
+
+	/*
+	 * Confirm a mem region with an overlapping address doesn't
+	 * already exist.
+	 */
+	region = (struct userspace_mem_region *) userspace_mem_region_find(
+		vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1);
+	if (region != NULL)
+		TEST_FAIL("overlapping userspace_mem_region already "
+			"exists\n"
+			" requested guest_paddr: 0x%lx npages: 0x%lx "
+			"page_size: 0x%x\n"
+			" existing guest_paddr: 0x%lx size: 0x%lx",
+			guest_paddr, npages, vm->page_size,
+			(uint64_t) region->region.guest_phys_addr,
+			(uint64_t) region->region.memory_size);
+
+	/* Confirm no region with the requested slot already exists. */
+	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+		if (region->region.slot != slot)
+			continue;
+
+		TEST_FAIL("A mem region with the requested slot "
+			"already exists.\n"
+			" requested slot: %u paddr: 0x%lx npages: 0x%lx\n"
+			" existing slot: %u paddr: 0x%lx size: 0x%lx",
+			slot, guest_paddr, npages,
+			region->region.slot,
+			(uint64_t) region->region.guest_phys_addr,
+			(uint64_t) region->region.memory_size);
+	}
+
+	/* Allocate and initialize new mem region structure. */
+	region = calloc(1, sizeof(*region));
+	TEST_ASSERT(region != NULL, "Insufficient Memory");
+	region->mmap_size = npages * vm->page_size;
+
+#ifdef __s390x__
+	/* On s390x, the host address must be aligned to 1M (due to PGSTEs) */
+	alignment = 0x100000;
+#else
+	alignment = 1;
+#endif
+
+	/* THP backing additionally needs the host address huge-page aligned. */
+	if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
+		alignment = max(huge_page_size, alignment);
+
+	/* Add enough memory to align up if necessary */
+	if (alignment > 1)
+		region->mmap_size += alignment;
+
+	region->mmap_start = mmap(NULL, region->mmap_size,
+				  PROT_READ | PROT_WRITE,
+				  MAP_PRIVATE | MAP_ANONYMOUS
+				  | (src_type == VM_MEM_SRC_ANONYMOUS_HUGETLB ? MAP_HUGETLB : 0),
+				  -1, 0);
+	/* The failing call here is mmap(), so the message names it. */
+	TEST_ASSERT(region->mmap_start != MAP_FAILED,
+		    "mmap failed, mmap_start: %p errno: %i",
+		    region->mmap_start, errno);
+
+	/* Align host address */
+	region->host_mem = align(region->mmap_start, alignment);
+
+	/* As needed perform madvise */
+	if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) {
+		struct stat statbuf;
+
+		/* The sysfs directory only exists when the host has THP. */
+		ret = stat("/sys/kernel/mm/transparent_hugepage", &statbuf);
+		TEST_ASSERT(ret == 0 || (ret == -1 && errno == ENOENT),
+			    "stat /sys/kernel/mm/transparent_hugepage");
+
+		TEST_ASSERT(ret == 0 || src_type != VM_MEM_SRC_ANONYMOUS_THP,
+			    "VM_MEM_SRC_ANONYMOUS_THP requires THP to be configured in the host kernel");
+
+		if (ret == 0) {
+			/* Plain anonymous memory explicitly opts out of THP. */
+			ret = madvise(region->host_mem, npages * vm->page_size,
+				      src_type == VM_MEM_SRC_ANONYMOUS ? MADV_NOHUGEPAGE : MADV_HUGEPAGE);
+			TEST_ASSERT(ret == 0, "madvise failed, addr: %p length: 0x%lx src_type: %x",
+				    region->host_mem, npages * vm->page_size, src_type);
+		}
+	}
+
+	/* Every guest page frame of the new region starts out unused. */
+	region->unused_phy_pages = sparsebit_alloc();
+	sparsebit_set_num(region->unused_phy_pages,
+		guest_paddr >> vm->page_shift, npages);
+	region->region.slot = slot;
+	region->region.flags = flags;
+	region->region.guest_phys_addr = guest_paddr;
+	region->region.memory_size = npages * vm->page_size;
+	region->region.userspace_addr = (uintptr_t) region->host_mem;
+	ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+		" rc: %i errno: %i\n"
+		" slot: %u flags: 0x%x\n"
+		" guest_phys_addr: 0x%lx size: 0x%lx",
+		ret, errno, slot, flags,
+		guest_paddr, (uint64_t) region->region.memory_size);
+
+	/* Add to linked-list of memory regions. */
+	list_add(&region->list, &vm->userspace_mem_regions);
+}
+
+/*
+ * Memslot to region
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   memslot - KVM memory slot ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to the memory region structure whose KVM memory slot ID is
+ *   memslot. If no region uses that slot, the VM is dumped to stderr
+ *   and the test fails via TEST_FAIL.
+ */
+struct userspace_mem_region *
+memslot2region(struct kvm_vm *vm, uint32_t memslot)
+{
+	struct userspace_mem_region *cur;
+
+	list_for_each_entry(cur, &vm->userspace_mem_regions, list) {
+		if (cur->region.slot != memslot)
+			continue;
+
+		return cur;
+	}
+
+	/* Not found: give as much context as possible before failing. */
+	fprintf(stderr, "No mem region with the requested slot found,\n"
+		" requested slot: %u\n", memslot);
+	fputs("---- vm dump ----\n", stderr);
+	vm_dump(stderr, vm, 2);
+	TEST_FAIL("Mem region not found");
+	return NULL;
+}
+
+/*
+ * VM Memory Region Flags Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   slot - Slot of the memory region to update
+ *   flags - New KVM memory region flags
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the flags of the memory region specified by the value of slot,
+ * to the values given by flags, and pushes the updated region to KVM.
+ */
+void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags)
+{
+	struct userspace_mem_region *region = memslot2region(vm, slot);
+	int ret;
+
+	/* Update the cached copy first, then hand the whole region to KVM. */
+	region->region.flags = flags;
+	ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+
+	TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+		" rc: %i errno: %i slot: %u flags: 0x%x",
+		ret, errno, slot, flags);
+}
+
+/*
+ * VM Memory Region Move
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   slot - Slot of the memory region to move
+ *   new_gpa - New starting guest physical address
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Changes the guest physical address of the memory region in the given
+ * slot and pushes the updated region to KVM.
+ */
+void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa)
+{
+	struct userspace_mem_region *region = memslot2region(vm, slot);
+	int ret;
+
+	region->region.guest_phys_addr = new_gpa;
+	ret = ioctl(vm->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+
+	TEST_ASSERT(!ret, "KVM_SET_USER_MEMORY_REGION failed\n"
+		    "ret: %i errno: %i slot: %u new_gpa: 0x%lx",
+		    ret, errno, slot, new_gpa);
+}
+
+/*
+ * VM Memory Region Delete
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   slot - Slot of the memory region to delete
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Looks up the memory region using the given slot and deletes it.
+ */
+void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot)
+{
+	struct userspace_mem_region *victim = memslot2region(vm, slot);
+
+	__vm_mem_region_delete(vm, victim);
+}
+
+/*
+ * VCPU mmap Size
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Size of VCPU state
+ *
+ * Returns the size reported by the KVM_GET_VCPU_MMAP_SIZE ioctl on
+ * KVM_DEV_PATH. The test is skipped (exit with KSFT_SKIP) if the device
+ * cannot be opened. A TEST_ASSERT failure occurs if the ioctl fails or
+ * reports a size smaller than struct kvm_run.
+ */
+static int vcpu_mmap_sz(void)
+{
+	int dev_fd, ret;
+
+	dev_fd = open(KVM_DEV_PATH, O_RDONLY);
+	if (dev_fd < 0)
+		exit(KSFT_SKIP);
+
+	ret = ioctl(dev_fd, KVM_GET_VCPU_MMAP_SIZE, NULL);
+	/*
+	 * Check the sign before comparing against sizeof(): comparing the
+	 * int directly would convert a -1 error return to a huge unsigned
+	 * value and let the failure slip through.
+	 */
+	TEST_ASSERT(ret >= 0 && (size_t)ret >= sizeof(struct kvm_run),
+		"%s KVM_GET_VCPU_MMAP_SIZE ioctl failed, rc: %i errno: %i",
+		__func__, ret, errno);
+
+	close(dev_fd);
+
+	return ret;
+}
+
+/*
+ * VM VCPU Add
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Adds a virtual CPU to the VM specified by vm with the ID given by vcpuid.
+ * No additional VCPU setup is done. A TEST_ASSERT failure occurs if a
+ * VCPU with that ID already exists or if creating or mapping it fails.
+ */
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu;
+	int mmap_sz;
+
+	/* Confirm a vcpu with the specified id doesn't already exist. */
+	vcpu = vcpu_find(vm, vcpuid);
+	if (vcpu != NULL)
+		TEST_FAIL("vcpu with the specified id "
+			"already exists,\n"
+			" requested vcpuid: %u\n"
+			" existing vcpuid: %u state: %p",
+			vcpuid, vcpu->id, vcpu->state);
+
+	/* Allocate and initialize new vcpu structure. */
+	vcpu = calloc(1, sizeof(*vcpu));
+	TEST_ASSERT(vcpu != NULL, "Insufficient Memory");
+	vcpu->id = vcpuid;
+	vcpu->fd = ioctl(vm->fd, KVM_CREATE_VCPU, vcpuid);
+	TEST_ASSERT(vcpu->fd >= 0, "KVM_CREATE_VCPU failed, rc: %i errno: %i",
+		vcpu->fd, errno);
+
+	/*
+	 * Query the size once (each query opens KVM_DEV_PATH) and check the
+	 * sign before the unsigned comparison against sizeof().
+	 */
+	mmap_sz = vcpu_mmap_sz();
+	TEST_ASSERT(mmap_sz >= 0 && (size_t)mmap_sz >= sizeof(*vcpu->state),
+		"vcpu mmap size "
+		"smaller than expected, vcpu_mmap_sz: %i expected_min: %zu",
+		mmap_sz, sizeof(*vcpu->state));
+	vcpu->state = (struct kvm_run *) mmap(NULL, sizeof(*vcpu->state),
+		PROT_READ | PROT_WRITE, MAP_SHARED, vcpu->fd, 0);
+	TEST_ASSERT(vcpu->state != MAP_FAILED, "mmap vcpu_state failed, "
+		"vcpu id: %u errno: %i", vcpuid, errno);
+
+	/* Add to linked-list of VCPUs. */
+	list_add(&vcpu->list, &vm->vcpus);
+}
+
+/*
+ * VM Virtual Address Unused Gap
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * sz - Size (bytes)
+ * vaddr_min - Minimum Virtual Address
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Lowest virtual address at or above vaddr_min, with at least
+ * sz unused bytes. TEST_ASSERT failure if no area of at least
+ * size sz is available.
+ *
+ * Within the VM specified by vm, locates the lowest starting virtual
+ * address >= vaddr_min, that has at least sz unallocated bytes. A
+ * TEST_ASSERT failure occurs for invalid input or no area of at least
+ * sz unallocated bytes >= vaddr_min is available.
+ *
+ * The search works on page indexes: a candidate range must be entirely
+ * set in vm->vpages_valid and entirely clear in vm->vpages_mapped.
+ */
+static vm_vaddr_t vm_vaddr_unused_gap(struct kvm_vm *vm, size_t sz,
+ vm_vaddr_t vaddr_min)
+{
+ /* Number of pages needed to hold sz bytes, rounded up. */
+ uint64_t pages = (sz + vm->page_size - 1) >> vm->page_shift;
+
+ /* Determine lowest permitted virtual page index. */
+ uint64_t pgidx_start = (vaddr_min + vm->page_size - 1) >> vm->page_shift;
+ /* Rounding vaddr_min up wrapped around: nothing at or above it. */
+ if ((pgidx_start * vm->page_size) < vaddr_min)
+ goto no_va_found;
+
+ /* Loop over section with enough valid virtual page indexes. */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages))
+ pgidx_start = sparsebit_next_set_num(vm->vpages_valid,
+ pgidx_start, pages);
+ do {
+ /*
+ * Are there enough unused virtual pages available at
+ * the currently proposed starting virtual page index.
+ * If not, adjust proposed starting index to next
+ * possible.
+ */
+ if (sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages))
+ goto va_found;
+ pgidx_start = sparsebit_next_clear_num(vm->vpages_mapped,
+ pgidx_start, pages);
+ /* An index of 0 is treated here as "no further range". */
+ if (pgidx_start == 0)
+ goto no_va_found;
+
+ /*
+ * If needed, adjust proposed starting virtual address,
+ * to next range of valid virtual addresses.
+ */
+ if (!sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages)) {
+ pgidx_start = sparsebit_next_set_num(
+ vm->vpages_valid, pgidx_start, pages);
+ if (pgidx_start == 0)
+ goto no_va_found;
+ }
+ } while (pgidx_start != 0);
+
+no_va_found:
+ TEST_FAIL("No vaddr of specified pages available, pages: 0x%lx", pages);
+
+ /* NOT REACHED */
+ return -1;
+
+va_found:
+ /* Re-check both conditions before handing the range out. */
+ TEST_ASSERT(sparsebit_is_set_num(vm->vpages_valid,
+ pgidx_start, pages),
+ "Unexpected, invalid virtual page index range,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+ TEST_ASSERT(sparsebit_is_clear_num(vm->vpages_mapped,
+ pgidx_start, pages),
+ "Unexpected, pages already mapped,\n"
+ " pgidx_start: 0x%lx\n"
+ " pages: 0x%lx",
+ pgidx_start, pages);
+
+ /* Convert the page index back into a virtual address. */
+ return pgidx_start * vm->page_size;
+}
+
+/*
+ * VM Virtual Address Allocate
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   sz - Size in bytes
+ *   vaddr_min - Minimum starting virtual address
+ *   data_memslot - Memory region slot for data pages
+ *   pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Starting guest virtual address
+ *
+ * Allocates at least sz bytes within the virtual address space of the
+ * vm given by vm. The allocated bytes are mapped to a virtual address
+ * >= the address given by vaddr_min. Each allocation uses its own set
+ * of pages, so sz is effectively rounded up to whole pages.
+ */
+vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
+			  uint32_t data_memslot, uint32_t pgd_memslot)
+{
+	/* Whole pages in sz, plus one more for any partial page. */
+	uint64_t remaining = (sz >> vm->page_shift) + ((sz % vm->page_size) != 0);
+	vm_vaddr_t vaddr_start, vaddr;
+
+	virt_pgd_alloc(vm, pgd_memslot);
+
+	/* Find an unused, valid virtual range large enough for sz. */
+	vaddr_start = vm_vaddr_unused_gap(vm, sz, vaddr_min);
+
+	/* Back every virtual page with a freshly allocated physical page. */
+	vaddr = vaddr_start;
+	while (remaining > 0) {
+		vm_paddr_t paddr = vm_phy_page_alloc(vm,
+			KVM_UTIL_MIN_PFN * vm->page_size, data_memslot);
+
+		virt_pg_map(vm, vaddr, paddr, pgd_memslot);
+
+		/* Record the page as mapped so later searches skip it. */
+		sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift);
+
+		remaining--;
+		vaddr += vm->page_size;
+	}
+
+	return vaddr_start;
+}
+
+/*
+ * Map a range of VM virtual address to the VM's physical address
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vaddr - Virtual address to map
+ *   paddr - VM Physical Address
+ *   npages - The number of pages to map
+ *   pgd_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within the VM given by @vm, creates a virtual translation for
+ * @npages starting at @vaddr to the page range starting at @paddr.
+ * A TEST_ASSERT failure occurs if either range does not end above its
+ * start.
+ */
+void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+	      unsigned int npages, uint32_t pgd_memslot)
+{
+	size_t page_size = vm->page_size;
+	size_t size = npages * page_size;
+	unsigned int i;
+
+	TEST_ASSERT(vaddr + size > vaddr, "Vaddr overflow");
+	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
+
+	/* Map one page at a time, advancing both addresses in lockstep. */
+	for (i = 0; i < npages; i++) {
+		virt_pg_map(vm, vaddr, paddr, pgd_memslot);
+		vaddr += page_size;
+		paddr += page_size;
+	}
+}
+
+/*
+ * Address VM Physical to Host Virtual
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gpa - VM physical address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent host virtual address
+ *
+ * Locates the memory region containing the VM physical address given
+ * by gpa, within the VM given by vm. When found, the host virtual
+ * address backing that physical address is returned. A TEST_ASSERT
+ * failure occurs if no region containing gpa exists.
+ */
+void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa)
+{
+	struct userspace_mem_region *region;
+
+	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+		/* Inclusive guest physical bounds of this region. */
+		uint64_t first = region->region.guest_phys_addr;
+		uint64_t last = first + region->region.memory_size - 1;
+
+		if (gpa < first || gpa > last)
+			continue;
+
+		/* Same offset into the region, applied to the host base. */
+		return (void *) ((uintptr_t) region->host_mem + (gpa - first));
+	}
+
+	TEST_FAIL("No vm physical memory at 0x%lx", gpa);
+	return NULL;
+}
+
+/*
+ * Address Host Virtual to VM Physical
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   hva - Host virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent VM physical address
+ *
+ * Locates the memory region containing the host virtual address given
+ * by hva, within the VM given by vm. When found, the equivalent
+ * VM physical address is returned. A TEST_ASSERT failure occurs if no
+ * region containing hva exists.
+ */
+vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva)
+{
+	struct userspace_mem_region *region;
+	/*
+	 * Do all range checks and offset math on integers: arithmetic on
+	 * void pointers is a GNU extension, and adding a pointer to the
+	 * guest physical address only worked by accident of the final cast.
+	 */
+	uintptr_t addr = (uintptr_t) hva;
+
+	list_for_each_entry(region, &vm->userspace_mem_regions, list) {
+		uintptr_t host_start = (uintptr_t) region->host_mem;
+		uintptr_t host_last = host_start
+			+ region->region.memory_size - 1;
+
+		if (addr < host_start || addr > host_last)
+			continue;
+
+		/* Keep the guest address at full 64-bit width. */
+		return (vm_paddr_t) region->region.guest_phys_addr
+			+ (addr - host_start);
+	}
+
+	TEST_FAIL("No mapping to a guest physical address, hva: %p", hva);
+	return -1;
+}
+
+/*
+ * VM Create IRQ Chip
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Creates an interrupt controller chip for the VM specified by vm and
+ * records that fact in vm->has_irqchip.
+ */
+void vm_create_irqchip(struct kvm_vm *vm)
+{
+	int ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0);
+
+	TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, "
+		"rc: %i errno: %i", ret, errno);
+
+	vm->has_irqchip = true;
+}
+
+/*
+ * VM VCPU State
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to structure that describes the state of the VCPU.
+ *
+ * Locates the VCPU with the given vcpuid and returns its kvm_run
+ * state pointer. A TEST_ASSERT failure occurs if no such VCPU exists.
+ */
+struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	return vcpu->state;
+}
+
+/*
+ * VM VCPU Run
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Switch to executing the code for the VCPU given by vcpuid, within the
+ * VM given by vm. A TEST_ASSERT failure occurs if _vcpu_run() reports
+ * an error.
+ */
+void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	int ret;
+
+	ret = _vcpu_run(vm, vcpuid);
+	TEST_ASSERT(ret == 0, "KVM_RUN IOCTL failed, "
+		"rc: %i errno: %i", ret, errno);
+}
+
+/*
+ * Issue KVM_RUN on the VCPU given by vcpuid and return the raw ioctl
+ * result. The ioctl is retried for as long as it fails with EINTR.
+ * A TEST_ASSERT failure occurs if the VCPU does not exist.
+ */
+int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu;
+	int rc;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	for (;;) {
+		rc = ioctl(vcpu->fd, KVM_RUN, NULL);
+		/* Only an interrupted call is worth repeating. */
+		if (rc != -1 || errno != EINTR)
+			break;
+	}
+
+	assert_on_unhandled_exception(vm, vcpuid);
+
+	return rc;
+}
+
+/*
+ * Run the VCPU given by vcpuid with kvm_run.immediate_exit set, so that
+ * KVM_RUN returns right away. The call is expected to fail with EINTR;
+ * anything else is a TEST_ASSERT failure.
+ */
+void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	/* The flag is only raised for the duration of this one call. */
+	vcpu->state->immediate_exit = 1;
+	ret = ioctl(vcpu->fd, KVM_RUN, NULL);
+	vcpu->state->immediate_exit = 0;
+
+	TEST_ASSERT(ret == -1 && errno == EINTR,
+		    "KVM_RUN IOCTL didn't exit immediately, rc: %i, errno: %i",
+		    ret, errno);
+}
+
+/*
+ * Apply the guest debug settings given by debug to the VCPU given by
+ * vcpuid, using the KVM_SET_GUEST_DEBUG ioctl. A TEST_ASSERT failure
+ * occurs if the VCPU does not exist or the ioctl fails.
+ */
+void vcpu_set_guest_debug(struct kvm_vm *vm, uint32_t vcpuid,
+			  struct kvm_guest_debug *debug)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int ret;
+
+	/* Fail cleanly instead of dereferencing NULL for an unknown id. */
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_GUEST_DEBUG, debug);
+	TEST_ASSERT(ret == 0, "KVM_SET_GUEST_DEBUG failed: %d", ret);
+}
+
+/*
+ * VM VCPU Set MP State
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   mp_state - mp_state to be set
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the MP state of the VCPU given by vcpuid, to the state given
+ * by mp_state. A TEST_ASSERT failure occurs if the VCPU does not exist
+ * or the ioctl fails.
+ */
+void vcpu_set_mp_state(struct kvm_vm *vm, uint32_t vcpuid,
+		       struct kvm_mp_state *mp_state)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_MP_STATE, mp_state);
+	TEST_ASSERT(ret == 0, "KVM_SET_MP_STATE IOCTL failed, "
+		"rc: %i errno: %i", ret, errno);
+}
+
+/*
+ * VM VCPU Get Reg List
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args:
+ *   None
+ *
+ * Return:
+ *   A pointer to an allocated struct kvm_reg_list
+ *
+ * Get the list of guest registers which are supported for
+ * KVM_GET_ONE_REG/KVM_SET_ONE_REG calls. The list is allocated with
+ * calloc() and is returned to the caller as-is.
+ */
+struct kvm_reg_list *vcpu_get_reg_list(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct kvm_reg_list reg_list_n = { .n = 0 }, *reg_list;
+	int ret;
+
+	/*
+	 * First call with room for zero registers: it is expected to fail
+	 * with E2BIG, after which reg_list_n.n is used as the required
+	 * count.
+	 */
+	ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, &reg_list_n);
+	TEST_ASSERT(ret == -1 && errno == E2BIG, "KVM_GET_REG_LIST n=0");
+
+	reg_list = calloc(1, sizeof(*reg_list) + reg_list_n.n * sizeof(__u64));
+	/* Same out-of-memory handling as the other allocations here. */
+	TEST_ASSERT(reg_list != NULL, "Insufficient Memory");
+	reg_list->n = reg_list_n.n;
+
+	/* Second call fills in the register ids. */
+	vcpu_ioctl(vm, vcpuid, KVM_GET_REG_LIST, reg_list);
+	return reg_list;
+}
+
+/*
+ * VM VCPU Regs Get
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args:
+ *   regs - current state of VCPU regs
+ *
+ * Return: None
+ *
+ * Obtains the current register state for the VCPU specified by vcpuid
+ * and stores it at the location given by regs. A TEST_ASSERT failure
+ * occurs if the VCPU does not exist or the ioctl fails.
+ */
+void vcpu_regs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_GET_REGS, regs);
+	TEST_ASSERT(ret == 0, "KVM_GET_REGS failed, rc: %i errno: %i",
+		ret, errno);
+}
+
+/*
+ * VM VCPU Regs Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   regs - Values to set VCPU regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the regs of the VCPU specified by vcpuid to the values
+ * given by regs. A TEST_ASSERT failure occurs if the VCPU does not
+ * exist or the ioctl fails.
+ */
+void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_REGS, regs);
+	TEST_ASSERT(ret == 0, "KVM_SET_REGS failed, rc: %i errno: %i",
+		ret, errno);
+}
+
+#ifdef __KVM_HAVE_VCPU_EVENTS
+/*
+ * Read the VCPU events of the VCPU given by vcpuid into events, using
+ * the KVM_GET_VCPU_EVENTS ioctl. A TEST_ASSERT failure occurs if the
+ * VCPU does not exist or the ioctl fails.
+ */
+void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_vcpu_events *events)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, events);
+	TEST_ASSERT(ret == 0, "KVM_GET_VCPU_EVENTS, failed, rc: %i errno: %i",
+		ret, errno);
+}
+
+/*
+ * Write the VCPU events given by events to the VCPU given by vcpuid,
+ * using the KVM_SET_VCPU_EVENTS ioctl. A TEST_ASSERT failure occurs if
+ * the VCPU does not exist or the ioctl fails.
+ */
+void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid,
+		     struct kvm_vcpu_events *events)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, events);
+	TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i",
+		ret, errno);
+}
+#endif
+
+#ifdef __x86_64__
+/*
+ * Read the nested virtualization state of the VCPU given by vcpuid
+ * into state, using the KVM_GET_NESTED_STATE ioctl. A TEST_ASSERT
+ * failure occurs if the VCPU does not exist or the ioctl fails.
+ */
+void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
+			   struct kvm_nested_state *state)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int ret;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, state);
+	/* Name the ioctl that was actually issued (GET, not SET). */
+	TEST_ASSERT(ret == 0,
+		"KVM_GET_NESTED_STATE failed, ret: %i errno: %i",
+		ret, errno);
+}
+
+/*
+ * Write the nested virtualization state given by state to the VCPU
+ * given by vcpuid, using the KVM_SET_NESTED_STATE ioctl, and return the
+ * raw ioctl result. Unless ignore_error is true, a failing ioctl is a
+ * TEST_ASSERT failure. A missing VCPU always is.
+ */
+int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
+			  struct kvm_nested_state *state, bool ignore_error)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, state);
+	if (ignore_error)
+		return ret;
+
+	TEST_ASSERT(ret == 0,
+		"KVM_SET_NESTED_STATE failed, ret: %i errno: %i",
+		ret, errno);
+
+	return ret;
+}
+#endif
+
+/*
+ * VM VCPU System Regs Get
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args:
+ *   sregs - current state of VCPU system regs
+ *
+ * Return: None
+ *
+ * Obtains the current system register state for the VCPU specified by
+ * vcpuid and stores it at the location given by sregs. A TEST_ASSERT
+ * failure occurs if the VCPU does not exist or the ioctl fails.
+ */
+void vcpu_sregs_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+	struct vcpu *vcpu;
+	int ret;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_GET_SREGS, sregs);
+	TEST_ASSERT(ret == 0, "KVM_GET_SREGS failed, rc: %i errno: %i",
+		ret, errno);
+}
+
+/*
+ * VM VCPU System Regs Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   sregs - Values to set VCPU system regs to
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Sets the system regs of the VCPU specified by vcpuid to the values
+ * given by sregs. A TEST_ASSERT failure occurs if _vcpu_sregs_set()
+ * reports an error.
+ */
+void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+	int ret = _vcpu_sregs_set(vm, vcpuid, sregs);
+
+	/* The ioctl behind _vcpu_sregs_set() is KVM_SET_SREGS, not KVM_RUN. */
+	TEST_ASSERT(ret == 0, "KVM_SET_SREGS IOCTL failed, "
+		"rc: %i errno: %i", ret, errno);
+}
+
+/*
+ * Issue KVM_SET_SREGS with sregs on the VCPU given by vcpuid and return
+ * the raw ioctl result. A TEST_ASSERT failure occurs if the VCPU does
+ * not exist.
+ */
+int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs)
+{
+	struct vcpu *vcpu;
+	int rc;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	rc = ioctl(vcpu->fd, KVM_SET_SREGS, sregs);
+	return rc;
+}
+
+/*
+ * Read the FPU state of the VCPU given by vcpuid into fpu (KVM_GET_FPU).
+ * A TEST_ASSERT failure occurs if the ioctl fails.
+ */
+void vcpu_fpu_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu)
+{
+	int ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_FPU, fpu);
+
+	TEST_ASSERT(ret == 0, "KVM_GET_FPU failed, rc: %i errno: %i (%s)",
+		    ret, errno, strerror(errno));
+}
+
+/*
+ * Write the FPU state given by fpu to the VCPU given by vcpuid
+ * (KVM_SET_FPU). A TEST_ASSERT failure occurs if the ioctl fails.
+ */
+void vcpu_fpu_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_fpu *fpu)
+{
+	int ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_FPU, fpu);
+
+	TEST_ASSERT(ret == 0, "KVM_SET_FPU failed, rc: %i errno: %i (%s)",
+		    ret, errno, strerror(errno));
+}
+
+/*
+ * Read the single register described by reg from the VCPU given by
+ * vcpuid (KVM_GET_ONE_REG). A TEST_ASSERT failure occurs if the ioctl
+ * fails.
+ */
+void vcpu_get_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg)
+{
+	int ret = _vcpu_ioctl(vm, vcpuid, KVM_GET_ONE_REG, reg);
+
+	TEST_ASSERT(ret == 0, "KVM_GET_ONE_REG failed, rc: %i errno: %i (%s)",
+		    ret, errno, strerror(errno));
+}
+
+/*
+ * Write the single register described by reg to the VCPU given by
+ * vcpuid (KVM_SET_ONE_REG). A TEST_ASSERT failure occurs if the ioctl
+ * fails.
+ */
+void vcpu_set_reg(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_one_reg *reg)
+{
+	int ret = _vcpu_ioctl(vm, vcpuid, KVM_SET_ONE_REG, reg);
+
+	TEST_ASSERT(ret == 0, "KVM_SET_ONE_REG failed, rc: %i errno: %i (%s)",
+		    ret, errno, strerror(errno));
+}
+
+/*
+ * VCPU Ioctl
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   cmd - Ioctl number
+ *   arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VCPU fd. A TEST_ASSERT failure occurs
+ * if the ioctl does not return 0.
+ */
+void vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
+		unsigned long cmd, void *arg)
+{
+	int ret = _vcpu_ioctl(vm, vcpuid, cmd, arg);
+
+	TEST_ASSERT(ret == 0, "vcpu ioctl %lu failed, rc: %i errno: %i (%s)",
+		cmd, ret, errno, strerror(errno));
+}
+
+/*
+ * Issue the ioctl cmd with argument arg on the fd of the VCPU given by
+ * vcpuid and return the raw ioctl result. A TEST_ASSERT failure occurs
+ * if the VCPU does not exist.
+ */
+int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
+		unsigned long cmd, void *arg)
+{
+	struct vcpu *vcpu;
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	return ioctl(vcpu->fd, cmd, arg);
+}
+
+/*
+ * VM Ioctl
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   cmd - Ioctl number
+ *   arg - Argument to pass to the ioctl
+ *
+ * Return: None
+ *
+ * Issues an arbitrary ioctl on a VM fd. A TEST_ASSERT failure occurs if
+ * the ioctl does not return 0.
+ */
+void vm_ioctl(struct kvm_vm *vm, unsigned long cmd, void *arg)
+{
+	int ret = ioctl(vm->fd, cmd, arg);
+
+	TEST_ASSERT(ret == 0, "vm ioctl %lu failed, rc: %i errno: %i (%s)",
+		cmd, ret, errno, strerror(errno));
+}
+
+/*
+ * VM Dump
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   indent - Left margin indent amount
+ *
+ * Output Args:
+ *   stream - Output FILE stream
+ *
+ * Return: None
+ *
+ * Dumps the current state of the VM given by vm, to the FILE stream
+ * given by stream: mode, fd, page size, memory regions, mapped virtual
+ * pages, translation tables (once created) and all VCPUs.
+ */
+void vm_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+	/* Margins for nested sections, kept as int like the "%*s" width. */
+	const int sub_indent = indent + 2;
+	const int table_indent = indent + 4;
+	struct userspace_mem_region *mr;
+	struct vcpu *cpu;
+
+	fprintf(stream, "%*smode: 0x%x\n", indent, "", vm->mode);
+	fprintf(stream, "%*sfd: %i\n", indent, "", vm->fd);
+	fprintf(stream, "%*spage_size: 0x%x\n", indent, "", vm->page_size);
+
+	fprintf(stream, "%*sMem Regions:\n", indent, "");
+	list_for_each_entry(mr, &vm->userspace_mem_regions, list) {
+		fprintf(stream, "%*sguest_phys: 0x%lx size: 0x%lx "
+			"host_virt: %p\n", sub_indent, "",
+			(uint64_t) mr->region.guest_phys_addr,
+			(uint64_t) mr->region.memory_size,
+			mr->host_mem);
+		fprintf(stream, "%*sunused_phy_pages: ", sub_indent, "");
+		sparsebit_dump(stream, mr->unused_phy_pages, 0);
+	}
+
+	fprintf(stream, "%*sMapped Virtual Pages:\n", indent, "");
+	sparsebit_dump(stream, vm->vpages_mapped, sub_indent);
+
+	fprintf(stream, "%*spgd_created: %u\n", indent, "",
+		vm->pgd_created);
+	/* Translation tables can only be walked once they exist. */
+	if (vm->pgd_created) {
+		fprintf(stream, "%*sVirtual Translation Tables:\n",
+			sub_indent, "");
+		virt_dump(stream, vm, table_indent);
+	}
+
+	fprintf(stream, "%*sVCPUs:\n", indent, "");
+	list_for_each_entry(cpu, &vm->vcpus, list)
+		vcpu_dump(stream, vm, cpu->id, sub_indent);
+}
+
+/*
+ * Known KVM exit reasons
+ *
+ * Maps each KVM_EXIT_* value to a printable name. The table is searched
+ * linearly by exit_reason_str(); reasons not listed here are reported
+ * by that function as "Unknown".
+ */
+static struct exit_reason {
+ unsigned int reason;
+ const char *name;
+} exit_reasons_known[] = {
+ {KVM_EXIT_UNKNOWN, "UNKNOWN"},
+ {KVM_EXIT_EXCEPTION, "EXCEPTION"},
+ {KVM_EXIT_IO, "IO"},
+ {KVM_EXIT_HYPERCALL, "HYPERCALL"},
+ {KVM_EXIT_DEBUG, "DEBUG"},
+ {KVM_EXIT_HLT, "HLT"},
+ {KVM_EXIT_MMIO, "MMIO"},
+ {KVM_EXIT_IRQ_WINDOW_OPEN, "IRQ_WINDOW_OPEN"},
+ {KVM_EXIT_SHUTDOWN, "SHUTDOWN"},
+ {KVM_EXIT_FAIL_ENTRY, "FAIL_ENTRY"},
+ {KVM_EXIT_INTR, "INTR"},
+ {KVM_EXIT_SET_TPR, "SET_TPR"},
+ {KVM_EXIT_TPR_ACCESS, "TPR_ACCESS"},
+ {KVM_EXIT_S390_SIEIC, "S390_SIEIC"},
+ {KVM_EXIT_S390_RESET, "S390_RESET"},
+ {KVM_EXIT_DCR, "DCR"},
+ {KVM_EXIT_NMI, "NMI"},
+ {KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
+ {KVM_EXIT_OSI, "OSI"},
+ {KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
+ /* Only listed when the KVM headers in use define this exit reason. */
+ {KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
+#endif
+};
+
+/*
+ * Exit Reason String
+ *
+ * Input Args:
+ *   exit_reason - Exit reason
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Constant string pointer describing the exit reason.
+ *
+ * Looks exit_reason up in exit_reasons_known and returns the matching
+ * name. If the reason is not in the table, the constant string
+ * "Unknown" is returned.
+ */
+const char *exit_reason_str(unsigned int exit_reason)
+{
+	const struct exit_reason *entry = exit_reasons_known;
+	const struct exit_reason *end =
+		exit_reasons_known + ARRAY_SIZE(exit_reasons_known);
+
+	for (; entry < end; entry++) {
+		if (entry->reason == exit_reason)
+			return entry->name;
+	}
+
+	return "Unknown";
+}
+
+/*
+ * Physical Contiguous Page Allocator
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * num - number of pages
+ * paddr_min - Physical address minimum
+ * memslot - Memory region to allocate page from
+ *
+ * Output Args: None
+ *
+ * Return:
+ * Starting physical address
+ *
+ * Within the VM specified by vm, locates a range of available physical
+ * pages at or above paddr_min. If found, the pages are marked as in use
+ * and their base address is returned. A TEST_ASSERT failure occurs for
+ * invalid input (num == 0 or unaligned paddr_min). If not enough pages
+ * are available at or above paddr_min, the VM is dumped to stderr and
+ * the process aborts.
+ */
+vm_paddr_t vm_phy_pages_alloc(struct kvm_vm *vm, size_t num,
+ vm_paddr_t paddr_min, uint32_t memslot)
+{
+ struct userspace_mem_region *region;
+ sparsebit_idx_t pg, base;
+
+ TEST_ASSERT(num > 0, "Must allocate at least one page");
+
+ TEST_ASSERT((paddr_min % vm->page_size) == 0, "Min physical address "
+ "not divisible by page size.\n"
+ " paddr_min: 0x%lx page_size: 0x%x",
+ paddr_min, vm->page_size);
+
+ region = memslot2region(vm, memslot);
+ /* base is the candidate start page, pg the page being examined. */
+ base = pg = paddr_min >> vm->page_shift;
+
+ /*
+ * Scan num pages from base. On the first page that is not unused,
+ * restart the scan from the next unused page. The loop ends when a
+ * full run was found (pg == base + num) or when pg becomes 0, which
+ * the code below treats as "nothing available".
+ */
+ do {
+ for (; pg < base + num; ++pg) {
+ if (!sparsebit_is_set(region->unused_phy_pages, pg)) {
+ base = pg = sparsebit_next_set(region->unused_phy_pages, pg);
+ break;
+ }
+ }
+ } while (pg && pg != base + num);
+
+ if (pg == 0) {
+ fprintf(stderr, "No guest physical page available, "
+ "paddr_min: 0x%lx page_size: 0x%x memslot: %u\n",
+ paddr_min, vm->page_size, memslot);
+ fputs("---- vm dump ----\n", stderr);
+ vm_dump(stderr, vm, 2);
+ abort();
+ }
+
+ /* Mark the whole run as in use. */
+ for (pg = base; pg < base + num; ++pg)
+ sparsebit_clear(region->unused_phy_pages, pg);
+
+ return base * vm->page_size;
+}
+
+/*
+ * Allocate a single guest physical page at or above paddr_min from the
+ * memory region in memslot. Thin wrapper around vm_phy_pages_alloc().
+ */
+vm_paddr_t vm_phy_page_alloc(struct kvm_vm *vm, vm_paddr_t paddr_min,
+			     uint32_t memslot)
+{
+	vm_paddr_t paddr = vm_phy_pages_alloc(vm, 1, paddr_min, memslot);
+
+	return paddr;
+}
+
+/*
+ * Address Guest Virtual to Host Virtual
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Equivalent host virtual address
+ *
+ * Translates in two steps: guest virtual to guest physical, then guest
+ * physical to host virtual.
+ */
+void *addr_gva2hva(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	vm_paddr_t gpa = addr_gva2gpa(vm, gva);
+
+	return addr_gpa2hva(vm, gpa);
+}
+
+/*
+ * Is Unrestricted Guest
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return: True if the unrestricted guest is set to 'Y', otherwise return false.
+ *
+ * Check if the unrestricted guest flag is enabled by reading the
+ * kvm_intel module parameter from sysfs. If the parameter file cannot
+ * be opened, false is returned.
+ */
+bool vm_is_unrestricted_guest(struct kvm_vm *vm)
+{
+	char val = 'N';
+	size_t count;
+	FILE *f;
+
+	if (vm == NULL) {
+		/* Ensure that the KVM vendor-specific module is loaded. */
+		f = fopen(KVM_DEV_PATH, "r");
+		TEST_ASSERT(f != NULL, "Error in opening KVM dev file: %d",
+			    errno);
+		fclose(f);
+	}
+
+	f = fopen("/sys/module/kvm_intel/parameters/unrestricted_guest", "r");
+	/* No parameter file: val keeps its default of 'N'. */
+	if (!f)
+		return val == 'Y';
+
+	count = fread(&val, sizeof(char), 1, f);
+	TEST_ASSERT(count == 1, "Unable to read from param file.");
+	fclose(f);
+
+	return val == 'Y';
+}
+
+/* Return the page size recorded for the VM given by vm (vm->page_size). */
+unsigned int vm_get_page_size(struct kvm_vm *vm)
+{
+ return vm->page_size;
+}
+
+/* Return the page shift recorded for the VM given by vm (vm->page_shift). */
+unsigned int vm_get_page_shift(struct kvm_vm *vm)
+{
+ return vm->page_shift;
+}
+
+/*
+ * Return the maximum guest frame number of the VM given by vm.
+ *
+ * NOTE(review): vm->max_gfn is a uint64_t but the return type is
+ * unsigned int, so values that do not fit in unsigned int are
+ * truncated. Widening the return type also needs the declaration of
+ * this function to change.
+ */
+unsigned int vm_get_max_gfn(struct kvm_vm *vm)
+{
+ return vm->max_gfn;
+}
+
+/* Return the file descriptor used for ioctls on the VM given by vm. */
+int vm_get_fd(struct kvm_vm *vm)
+{
+ return vm->fd;
+}
+
+/*
+ * Convert a page count between two page sizes.
+ *
+ * Input Args:
+ *   num_pages - Number of pages of size (1 << page_shift)
+ *   page_shift - Page shift of the pages being counted
+ *   new_page_shift - Page shift of the pages to convert to
+ *   ceil - When converting to larger pages, round a partial page up
+ *          (true) or drop it (false)
+ *
+ * Return:
+ *   Number of pages of size (1 << new_page_shift) covering num_pages.
+ */
+static unsigned int vm_calc_num_pages(unsigned int num_pages,
+				      unsigned int page_shift,
+				      unsigned int new_page_shift,
+				      bool ceil)
+{
+	unsigned int n;
+
+	/* Same or smaller target pages: an exact multiplication. */
+	if (page_shift >= new_page_shift)
+		return num_pages * (1U << (page_shift - new_page_shift));
+
+	/*
+	 * Compute the ratio only on this path: with page_shift above
+	 * new_page_shift the shift count would wrap around to a huge
+	 * value, which is undefined behaviour.
+	 */
+	n = 1U << (new_page_shift - page_shift);
+
+	return num_pages / n + !!(ceil && num_pages % n);
+}
+
+/*
+ * Return the host page shift, computed as the index of the lowest set
+ * bit of getpagesize(). This equals log2 of the page size provided the
+ * page size is a power of two (assumed here, not checked).
+ */
+static inline int getpageshift(void)
+{
+ return __builtin_ffs(getpagesize()) - 1;
+}
+
+/*
+ * Convert num_guest_pages pages of the guest mode's page size into a
+ * number of host pages, rounding a partial host page up.
+ */
+unsigned int
+vm_num_host_pages(enum vm_guest_mode mode, unsigned int num_guest_pages)
+{
+	unsigned int guest_shift = vm_guest_mode_params[mode].page_shift;
+	unsigned int host_shift = getpageshift();
+
+	return vm_calc_num_pages(num_guest_pages, guest_shift, host_shift,
+				 true);
+}
+
+/*
+ * Convert num_host_pages host pages into a number of pages of the guest
+ * mode's page size, dropping a partial guest page.
+ */
+unsigned int
+vm_num_guest_pages(enum vm_guest_mode mode, unsigned int num_host_pages)
+{
+	unsigned int host_shift = getpageshift();
+	unsigned int guest_shift = vm_guest_mode_params[mode].page_shift;
+
+	return vm_calc_num_pages(num_host_pages, host_shift, guest_shift,
+				 false);
+}
+
+/*
+ * Return the number of guest pages needed to hold size bytes in the
+ * given guest mode: size is rounded up to whole guest pages and the
+ * result is then passed through vm_adjust_num_guest_pages().
+ */
+unsigned int vm_calc_num_guest_pages(enum vm_guest_mode mode, size_t size)
+{
+	unsigned int n = DIV_ROUND_UP(size, vm_guest_mode_params[mode].page_size);
+
+	return vm_adjust_num_guest_pages(mode, n);
+}
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
new file mode 100644
index 000000000..f07d383d0
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * tools/testing/selftests/kvm/lib/kvm_util_internal.h
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#ifndef SELFTEST_KVM_UTIL_INTERNAL_H
+#define SELFTEST_KVM_UTIL_INTERNAL_H
+
+#include "sparsebit.h"
+
+#define KVM_DEV_PATH "/dev/kvm"
+
+/*
+ * Bookkeeping for one region of guest memory. Linked into a list through
+ * the list member. The per-field notes below are inferred from names and
+ * types only -- confirm against the code that fills the structure in.
+ */
+struct userspace_mem_region {
+ /* presumably the region description handed to KVM -- TODO confirm */
+ struct kvm_userspace_memory_region region;
+ /* presumably tracks guest physical pages still unallocated -- TODO confirm */
+ struct sparsebit *unused_phy_pages;
+ /* presumably a backing file descriptor and offset, if any -- TODO confirm */
+ int fd;
+ off_t offset;
+ /* presumably the host address backing the region -- TODO confirm */
+ void *host_mem;
+ /* presumably the start and length of the host mapping -- TODO confirm */
+ void *mmap_start;
+ size_t mmap_size;
+ /* List linkage. */
+ struct list_head list;
+};
+
+/*
+ * Bookkeeping for one virtual CPU. Instances are looked up by id through
+ * vcpu_find().
+ */
+struct vcpu {
+ /* List linkage. */
+ struct list_head list;
+ /* Identifier used to look the vCPU up. */
+ uint32_t id;
+ /* presumably the vCPU file descriptor -- TODO confirm */
+ int fd;
+ /* Pointer to the vCPU's struct kvm_run. */
+ struct kvm_run *state;
+};
+
+/*
+ * State of one virtual machine under test. Notes on members that are not
+ * exercised by code near this header are marked as assumptions.
+ */
+struct kvm_vm {
+ /* presumably the guest mode the VM was created with -- TODO confirm */
+ int mode;
+ unsigned long type;
+ /* presumably the descriptor of the KVM device node -- TODO confirm */
+ int kvm_fd;
+ /* Descriptor returned by vm_get_fd(). */
+ int fd;
+ unsigned int pgtable_levels;
+ /* Guest page size and its log2, returned by vm_get_page_size/shift(). */
+ unsigned int page_size;
+ unsigned int page_shift;
+ unsigned int pa_bits;
+ unsigned int va_bits;
+ /* Highest guest frame number; page-mapping code checks gpa against it. */
+ uint64_t max_gfn;
+ /* List heads; presumably of struct vcpu and struct userspace_mem_region. */
+ struct list_head vcpus;
+ struct list_head userspace_mem_regions;
+ /* Guest virtual page indices that are valid to map. */
+ struct sparsebit *vpages_valid;
+ /* presumably guest virtual page indices already mapped -- TODO confirm */
+ struct sparsebit *vpages_mapped;
+ bool has_irqchip;
+ /* Set once the top-level translation table has been allocated. */
+ bool pgd_created;
+ /* Guest physical address of the top-level translation table. */
+ vm_paddr_t pgd;
+ /* presumably x86-specific descriptor-table addresses -- TODO confirm */
+ vm_vaddr_t gdt;
+ vm_vaddr_t tss;
+ vm_vaddr_t idt;
+ vm_vaddr_t handlers;
+};
+
+struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
+
+/*
+ * Virtual Translation Tables Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps to the FILE stream given by @stream, the contents of all the
+ * virtual translation tables for the VM given by @vm.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
+
+/*
+ * Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * regs - Registers
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the registers given by @regs, to the FILE stream
+ * given by @stream.
+ */
+void regs_dump(FILE *stream, struct kvm_regs *regs, uint8_t indent);
+
+/*
+ * System Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * sregs - System registers
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the system registers given by @sregs, to the FILE stream
+ * given by @stream.
+ */
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs, uint8_t indent);
+
+struct userspace_mem_region *
+memslot2region(struct kvm_vm *vm, uint32_t memslot);
+
+#endif /* SELFTEST_KVM_UTIL_INTERNAL_H */
diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c
new file mode 100644
index 000000000..7349bb2e1
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/s390x/processor.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM selftest s390x library code - CPU-related functions (page tables...)
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "processor.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+
+#define PAGES_PER_REGION 4
+
+/*
+ * Allocate the top-level translation table of @vm from @memslot, unless it
+ * already exists. The table spans PAGES_PER_REGION pages and is filled with
+ * 0xff bytes, which leaves the invalid bit set in every entry.
+ */
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot)
+{
+	vm_paddr_t table_gpa;
+	void *table_hva;
+
+	TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+		    vm->page_size);
+
+	/* Only the first call allocates; later calls are no-ops. */
+	if (!vm->pgd_created) {
+		table_gpa = vm_phy_pages_alloc(vm, PAGES_PER_REGION,
+					       KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+					       memslot);
+		table_hva = addr_gpa2hva(vm, table_gpa);
+		memset(table_hva, 0xff, PAGES_PER_REGION * vm->page_size);
+
+		vm->pgd = table_gpa;
+		vm->pgd_created = true;
+	}
+}
+
+/*
+ * Allocate 4 pages for a region/segment table (ri < 4), or one page for
+ * a page table (ri == 4), and mark every entry of the new table invalid
+ * by filling it with 0xff bytes. Returns a suitable region/segment table
+ * entry which points to the freshly allocated pages.
+ *
+ * Only the pages that were actually allocated are initialized; clearing
+ * PAGES_PER_REGION pages unconditionally would write three pages past a
+ * one-page page-table allocation.
+ */
+static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot)
+{
+	size_t nr_pages = ri < 4 ? PAGES_PER_REGION : 1;
+	uint64_t taddr;
+
+	taddr = vm_phy_pages_alloc(vm, nr_pages,
+				   KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot);
+	memset(addr_gpa2hva(vm, taddr), 0xff, nr_pages * vm->page_size);
+
+	return (taddr & REGION_ENTRY_ORIGIN)
+		| (((4 - ri) << 2) & REGION_ENTRY_TYPE)
+		| ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH);
+}
+
+/*
+ * Map the guest virtual page at @gva to the guest physical page at @gpa.
+ *
+ * Input Args:
+ *   vm      - Virtual Machine
+ *   gva     - Page-aligned guest virtual address
+ *   gpa     - Page-aligned guest physical address
+ *   memslot - Memory slot from which missing translation tables are
+ *             allocated
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Walks the four region/segment table levels starting at vm->pgd,
+ * allocating any table whose entry is still invalid, and stores @gpa in
+ * the final page table entry. A warning is printed to stderr if that
+ * entry was already valid.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa,
+		 uint32_t memslot)
+{
+	int ri, idx;
+	uint64_t *entry;
+
+	/* The index arithmetic below is hard-coded for 4K pages. */
+	TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+		    vm->page_size);
+
+	TEST_ASSERT((gva % vm->page_size) == 0,
+		"Virtual address not on page boundary,\n"
+		"  vaddr: 0x%lx vm->page_size: 0x%x",
+		gva, vm->page_size);
+	TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+		(gva >> vm->page_shift)),
+		"Invalid virtual address, vaddr: 0x%lx",
+		gva);
+	/* The two physical-address checks report gpa, the value they test. */
+	TEST_ASSERT((gpa % vm->page_size) == 0,
+		"Physical address not on page boundary,\n"
+		"  paddr: 0x%lx vm->page_size: 0x%x",
+		gpa, vm->page_size);
+	TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn,
+		"Physical address beyond maximum supported,\n"
+		"  paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		gpa, vm->max_gfn, vm->page_size);
+
+	/* Walk through region and segment tables */
+	entry = addr_gpa2hva(vm, vm->pgd);
+	for (ri = 1; ri <= 4; ri++) {
+		/* Each level consumes 11 bits of gva, highest bits first. */
+		idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
+		if (entry[idx] & REGION_ENTRY_INVALID)
+			entry[idx] = virt_alloc_region(vm, ri, memslot);
+		entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN);
+	}
+
+	/* Fill in page table entry */
+	idx = (gva >> 12) & 0x0ffu;		/* page index */
+	if (!(entry[idx] & PAGE_INVALID))
+		fprintf(stderr,
+			"WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa);
+	entry[idx] = gpa;
+}
+
+/*
+ * Translate the guest virtual address @gva to a guest physical address by
+ * walking the translation tables of @vm. Asserts if any level of the walk
+ * hits an entry marked invalid.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+	uint64_t *table;
+	uint64_t pte;
+	int level, slot;
+
+	TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+		    vm->page_size);
+
+	/* Descend through the four region/segment table levels. */
+	table = addr_gpa2hva(vm, vm->pgd);
+	level = 1;
+	while (level <= 4) {
+		slot = (gva >> (64 - 11 * level)) & 0x7ffu;
+		TEST_ASSERT(!(table[slot] & REGION_ENTRY_INVALID),
+			    "No region mapping for vm virtual address 0x%lx",
+			    gva);
+		table = addr_gpa2hva(vm, table[slot] & REGION_ENTRY_ORIGIN);
+		level++;
+	}
+
+	/* The final table is indexed by bits 12..19 of gva. */
+	slot = (gva >> 12) & 0x0ffu;
+	pte = table[slot];
+
+	TEST_ASSERT(!(pte & PAGE_INVALID),
+		    "No page mapping for vm virtual address 0x%lx", gva);
+
+	/* Page frame from the entry, byte offset from gva. */
+	return (pte & ~0xffful) + (gva & 0xffful);
+}
+
+/*
+ * Print every valid entry of the 0x100-entry page table located at guest
+ * physical address @ptea_start to @stream, indented by @indent columns.
+ */
+static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent,
+			   uint64_t ptea_start)
+{
+	uint64_t ptea;
+
+	for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) {
+		uint64_t *pte = addr_gpa2hva(vm, ptea);
+
+		/* Entries marked invalid are not printed. */
+		if (!(*pte & PAGE_INVALID))
+			fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n",
+				indent, "", ptea, *pte);
+	}
+}
+
+/*
+ * Recursively print every valid entry of the region/segment table located
+ * at guest physical address @reg_tab_addr, and of the tables it points to,
+ * to @stream. Each nesting level is indented by two more columns.
+ *
+ * Region and segment tables are indexed with an 11-bit index when pages
+ * are mapped, so a table holds 0x800 eight-byte entries; all of them are
+ * visited here so mappings in the upper half of a table are dumped too.
+ */
+static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent,
+			     uint64_t reg_tab_addr)
+{
+	uint64_t addr, *entry;
+
+	for (addr = reg_tab_addr; addr < reg_tab_addr + 0x800 * 8; addr += 8) {
+		entry = addr_gpa2hva(vm, addr);
+		if (*entry & REGION_ENTRY_INVALID)
+			continue;
+		fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n",
+			indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2),
+			addr, *entry);
+		/* A non-zero type field points at another region/segment table. */
+		if (*entry & REGION_ENTRY_TYPE) {
+			virt_dump_region(stream, vm, indent + 2,
+					 *entry & REGION_ENTRY_ORIGIN);
+		} else {
+			virt_dump_ptes(stream, vm, indent + 2,
+				       *entry & REGION_ENTRY_ORIGIN);
+		}
+	}
+}
+
+/*
+ * Dump the translation tables of @vm to @stream, starting at the top-level
+ * table. Nothing is printed if that table has not been created yet.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+	if (vm->pgd_created)
+		virt_dump_region(stream, vm, indent, vm->pgd);
+}
+
+/*
+ * Create a VM in the default mode, load the running test binary into it
+ * and add one vCPU with id @vcpuid that starts executing at @guest_code.
+ *
+ * NOTE(review): only the page-table estimate derived from
+ * @extra_mem_pages is added to DEFAULT_GUEST_PHY_PAGES; @extra_mem_pages
+ * itself is not -- confirm that this is intended.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+				 void *guest_code)
+{
+	struct kvm_vm *vm;
+	uint64_t pt_pages;
+
+	/*
+	 * The additional amount of pages required for the page tables is:
+	 * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ...
+	 * which is definitely smaller than (n / 256) * 2.
+	 */
+	pt_pages = extra_mem_pages / 256 * 2;
+
+	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + pt_pages,
+		       O_RDWR);
+
+	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+	vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+	return vm;
+}
+
+/*
+ * Add vCPU @vcpuid to @vm with a freshly allocated guest stack and set it
+ * up to start executing at @guest_code with address translation enabled.
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+	size_t stack_bytes = DEFAULT_STACK_PGS * getpagesize();
+	uint64_t stack_base;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
+	struct kvm_run *run;
+
+	TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x",
+		    vm->page_size);
+
+	stack_base = vm_vaddr_alloc(vm, stack_bytes,
+				    DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
+
+	vm_vcpu_add(vm, vcpuid);
+
+	/*
+	 * Setup guest registers: gpr 15 points 160 bytes below the top of
+	 * the stack (presumably the ABI register save area -- TODO confirm).
+	 */
+	vcpu_regs_get(vm, vcpuid, &regs);
+	regs.gprs[15] = stack_base + stack_bytes - 160;
+	vcpu_regs_set(vm, vcpuid, &regs);
+
+	vcpu_sregs_get(vm, vcpuid, &sregs);
+	sregs.crs[0] |= 0x00040000;		/* Enable floating point regs */
+	sregs.crs[1] = vm->pgd | 0xf;		/* Primary region table */
+	vcpu_sregs_set(vm, vcpuid, &sregs);
+
+	run = vcpu_state(vm, vcpuid);
+	run->psw_mask = 0x0400000180000000ULL;  /* DAT enabled + 64 bit mode */
+	run->psw_addr = (uintptr_t)guest_code;
+}
+
+/*
+ * Store @num (1 to 5) uint64_t variadic arguments into general purpose
+ * registers 2 .. @num + 1 of vCPU @vcpuid.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+	struct kvm_regs regs;
+	unsigned int argno;
+	va_list ap;
+
+	TEST_ASSERT(num >= 1 && num <= 5, "Unsupported number of args,\n"
+		    "  num: %u\n",
+		    num);
+
+	va_start(ap, num);
+	vcpu_regs_get(vm, vcpuid, &regs);
+
+	/* The first argument lands in gpr 2, the next in gpr 3, and so on. */
+	argno = 0;
+	while (argno < num) {
+		regs.gprs[argno + 2] = va_arg(ap, uint64_t);
+		argno++;
+	}
+
+	vcpu_regs_set(vm, vcpuid, &regs);
+	va_end(ap);
+}
+
+/*
+ * Print the PSW mask and address of vCPU @vcpuid to @stream, indented by
+ * @indent columns. Prints nothing if no vCPU with that id exists.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+
+	if (vcpu)
+		fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n",
+			indent, "", vcpu->state->psw_mask,
+			vcpu->state->psw_addr);
+}
+
+/*
+ * The s390x implementation has an empty body: no check is performed and
+ * both arguments are unused.
+ */
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+}
diff --git a/tools/testing/selftests/kvm/lib/s390x/ucall.c b/tools/testing/selftests/kvm/lib/s390x/ucall.c
new file mode 100644
index 000000000..9d3b0f152
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/s390x/ucall.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ */
+#include "kvm_util.h"
+
+/*
+ * The s390x implementation has an empty body: no setup is performed and
+ * both arguments are unused.
+ */
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+}
+
+/*
+ * The s390x implementation has an empty body: no teardown is performed and
+ * the argument is unused.
+ */
+void ucall_uninit(struct kvm_vm *vm)
+{
+}
+
+/*
+ * Issue a ucall: collect @cmd and up to UCALL_MAX_ARGS uint64_t variadic
+ * arguments in a struct ucall, then execute DIAGNOSE 0x501 with the
+ * address of that structure in a register. Arguments beyond
+ * UCALL_MAX_ARGS are ignored.
+ */
+void ucall(uint64_t cmd, int nargs, ...)
+{
+	struct ucall uc = {
+		.cmd = cmd,
+	};
+	va_list va;
+	int i;
+
+	/* Never copy more arguments than the structure can hold. */
+	if (nargs > UCALL_MAX_ARGS)
+		nargs = UCALL_MAX_ARGS;
+
+	va_start(va, nargs);
+	i = 0;
+	while (i < nargs) {
+		uc.args[i] = va_arg(va, uint64_t);
+		++i;
+	}
+	va_end(va);
+
+	/* Exit via DIAGNOSE 0x501 (normally used for breakpoints) */
+	asm volatile ("diag 0,%0,0x501" : : "a"(&uc) : "memory");
+}
+
+/*
+ * Fetch the ucall, if any, that caused vCPU @vcpu_id to exit.
+ *
+ * When the exit is an intercepted DIAGNOSE 0x501, the struct ucall the
+ * guest passed is copied out of guest memory, the pending I/O is completed
+ * and the command is returned. Otherwise 0 is returned. When @uc is not
+ * NULL it receives the copied structure, or all zeroes if there was no
+ * ucall.
+ */
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+	struct kvm_run *run = vcpu_state(vm, vcpu_id);
+	struct ucall ucall = {};
+	int reg;
+
+	if (uc)
+		memset(uc, 0, sizeof(*uc));
+
+	/* Anything other than an intercepted DIAGNOSE 0x501 is not a ucall. */
+	if (run->exit_reason != KVM_EXIT_S390_SIEIC ||
+	    run->s390_sieic.icptcode != 4 ||
+	    (run->s390_sieic.ipa >> 8) != 0x83 ||	/* 0x83 means DIAGNOSE */
+	    (run->s390_sieic.ipb >> 16) != 0x501)
+		return ucall.cmd;
+
+	/* The low four bits of ipa select the register holding the address. */
+	reg = run->s390_sieic.ipa & 0xf;
+
+	memcpy(&ucall, addr_gva2hva(vm, run->s.regs.gprs[reg]),
+	       sizeof(ucall));
+
+	vcpu_run_complete_io(vm, vcpu_id);
+	if (uc)
+		memcpy(uc, &ucall, sizeof(ucall));
+
+	return ucall.cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/sparsebit.c b/tools/testing/selftests/kvm/lib/sparsebit.c
new file mode 100644
index 000000000..031ba3c93
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/sparsebit.c
@@ -0,0 +1,2086 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Sparse bit array
+ *
+ * Copyright (C) 2018, Google LLC.
+ * Copyright (C) 2018, Red Hat, Inc. (code style cleanup and fuzzing driver)
+ *
+ * This library provides functions to support a memory efficient bit array,
+ * with an index size of 2^64. A sparsebit array is allocated through
+ * the use of sparsebit_alloc() and free'd via sparsebit_free(),
+ * such as in the following:
+ *
+ * struct sparsebit *s;
+ * s = sparsebit_alloc();
+ * sparsebit_free(&s);
+ *
+ * The struct sparsebit type resolves down to a struct sparsebit.
+ * Note that, sparsebit_free() takes a pointer to the sparsebit
+ * structure. This is so that sparsebit_free() is able to poison
+ * the pointer (e.g. set it to NULL) to the struct sparsebit before
+ * returning to the caller.
+ *
+ * Between the return of sparsebit_alloc() and the call of
+ * sparsebit_free(), there are multiple query and modifying operations
+ * that can be performed on the allocated sparsebit array. All of
+ * these operations take as a parameter the value returned from
+ * sparsebit_alloc() and most also take a bit index. Frequently
+ * used routines include:
+ *
+ * ---- Query Operations
+ * sparsebit_is_set(s, idx)
+ * sparsebit_is_clear(s, idx)
+ * sparsebit_any_set(s)
+ * sparsebit_first_set(s)
+ * sparsebit_next_set(s, prev_idx)
+ *
+ * ---- Modifying Operations
+ * sparsebit_set(s, idx)
+ * sparsebit_clear(s, idx)
+ * sparsebit_set_num(s, idx, num);
+ * sparsebit_clear_num(s, idx, num);
+ *
+ * A common operation is to iterate over all the bits set in a test
+ * sparsebit array. This can be done via code with the following structure:
+ *
+ * sparsebit_idx_t idx;
+ * if (sparsebit_any_set(s)) {
+ * idx = sparsebit_first_set(s);
+ * do {
+ * ...
+ * idx = sparsebit_next_set(s, idx);
+ * } while (idx != 0);
+ * }
+ *
+ * The index of the first bit set needs to be obtained via
+ * sparsebit_first_set(), because sparsebit_next_set(), needs
+ * the index of the previously set. The sparsebit_idx_t type is
+ * unsigned, so there is no previous index before 0 that is available.
+ * Also, the call to sparsebit_first_set() is not made unless there
+ * is at least 1 bit in the array set. This is because sparsebit_first_set()
+ * aborts if it is called with no bits set.
+ * It is the callers responsibility to assure that the
+ * sparsebit array has at least a single bit set before calling
+ * sparsebit_first_set().
+ *
+ * ==== Implementation Overview ====
+ * For the most part the internal implementation of sparsebit is
+ * opaque to the caller. One important implementation detail that the
+ * caller may need to be aware of is the spatial complexity of the
+ * implementation. This implementation of a sparsebit array is not
+ * only sparse, in that it uses memory proportional to the number of bits
+ * set. It is also efficient in memory usage when most of the bits are
+ * set.
+ *
+ * At a high-level the state of the bit settings are maintained through
+ * the use of a binary-search tree, where each node contains at least
+ * the following members:
+ *
+ * typedef uint64_t sparsebit_idx_t;
+ * typedef uint64_t sparsebit_num_t;
+ *
+ * sparsebit_idx_t idx;
+ * uint32_t mask;
+ * sparsebit_num_t num_after;
+ *
+ * The idx member contains the bit index of the first bit described by this
+ * node, while the mask member stores the setting of the first 32-bits.
+ * The setting of the bit at idx + n, where 0 <= n < 32, is located in the
+ * mask member at 1 << n.
+ *
+ * Nodes are sorted by idx and the bits described by two nodes will never
+ * overlap. The idx member is always aligned to the mask size, i.e. a
+ * multiple of 32.
+ *
+ * Beyond a typical implementation, the nodes in this implementation also
+ * contains a member named num_after. The num_after member holds the
+ * number of bits immediately after the mask bits that are contiguously set.
+ * The use of the num_after member allows this implementation to efficiently
+ * represent cases where most bits are set. For example, the case of all
+ * but the last two bits set, is represented by the following two nodes:
+ *
+ * node 0 - idx: 0x0 mask: 0xffffffff num_after: 0xffffffffffffffc0
+ * node 1 - idx: 0xffffffffffffffe0 mask: 0x3fffffff num_after: 0
+ *
+ * ==== Invariants ====
+ * This implementation uses the following invariants:
+ *
+ * + Nodes are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + Sum of bits set in all the nodes is equal to the value of
+ * the struct sparsebit_pvt num_set member.
+ *
+ * + The setting of at least one bit is always described in a nodes
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ * described by the previous node is not equal to this nodes
+ * starting index - 1. All such occurrences of this condition are
+ * avoided by moving the setting of the nodes mask bits into
+ * the previous nodes num_after setting.
+ *
+ * + Node starting index is evenly divisible by the number of bits
+ * within a nodes mask member.
+ *
+ * + Nodes never represent a range of bits that wrap around the
+ * highest supported index.
+ *
+ * (idx + MASK_BITS + num_after - 1) <= (((sparsebit_idx_t) 0) - 1)
+ *
+ * As a consequence of the above, the num_after member of a node
+ * will always be <=:
+ *
+ * maximum_index - nodes_starting_index - number_of_mask_bits
+ *
+ * + Nodes within the binary search tree are sorted based on each
+ * nodes starting index.
+ *
+ * + The range of bits described by any two nodes do not overlap. The
+ * range of bits described by a single node is:
+ *
+ * start: node->idx
+ * end (inclusive): node->idx + MASK_BITS + node->num_after - 1;
+ *
+ * Note, at times these invariants are temporarily violated for a
+ * specific portion of the code. For example, when setting a mask
+ * bit, there is a small delay between when the mask bit is set and the
+ * value in the struct sparsebit_pvt num_set member is updated. Other
+ * temporary violations occur when node_split() is called with a specified
+ * index and assures that a node where its mask represents the bit
+ * at the specified index exists. At times to do this node_split()
+ * must split an existing node into two nodes or create a node that
+ * has no bits set. Such temporary violations must be corrected before
+ * returning to the caller. These corrections are typically performed
+ * by the local function node_reduce().
+ */
+
+#include "test_util.h"
+#include "sparsebit.h"
+#include <limits.h>
+#include <assert.h>
+
+#define DUMP_LINE_MAX 100 /* Does not include indent amount */
+
+typedef uint32_t mask_t;
+#define MASK_BITS (sizeof(mask_t) * CHAR_BIT)
+
+/*
+ * One node of the binary search tree that stores the bit settings. A node
+ * describes the MASK_BITS bits starting at idx through mask, followed by
+ * num_after contiguously set bits.
+ */
+struct node {
+ /* Tree linkage; parent is NULL for the root node. */
+ struct node *parent;
+ struct node *left;
+ struct node *right;
+ sparsebit_idx_t idx; /* index of least-significant bit in mask */
+ sparsebit_num_t num_after; /* num contiguously set after mask */
+ /* Bit n of mask holds the setting of the bit at index idx + n. */
+ mask_t mask;
+};
+
+/*
+ * A sparsebit array: the root of the node tree plus a running count of
+ * the bits that are set.
+ */
+struct sparsebit {
+ /*
+ * Points to root node of the binary search
+ * tree. Equal to NULL when no bits are set in
+ * the entire sparsebit array.
+ */
+ struct node *root;
+
+ /*
+ * A redundant count of the total number of bits set. Used for
+ * diagnostic purposes and to change the time complexity of
+ * sparsebit_num_set() from O(n) to O(1).
+ * Note: Due to overflow, a value of 0 means none or all set.
+ */
+ sparsebit_num_t num_set;
+};
+
+/* Returns how many set bits the node pointed to by nodep describes:
+ * the bits set in its mask plus its num_after bits.
+ */
+static sparsebit_num_t node_num_set(struct node *nodep)
+{
+	sparsebit_num_t mask_bits_set = __builtin_popcount(nodep->mask);
+
+	return mask_bits_set + nodep->num_after;
+}
+
+/* Returns a pointer to the left-most node of the tree, which is the
+ * node describing the lowest bit index. Returns NULL for an empty tree.
+ */
+static struct node *node_first(struct sparsebit *s)
+{
+	struct node *nodep = s->root;
+
+	if (!nodep)
+		return NULL;
+
+	while (nodep->left)
+		nodep = nodep->left;
+
+	return nodep;
+}
+
+/* Returns a pointer to the in-order successor of the node pointed to
+ * by np, i.e. the node with the lowest index above np's index.
+ * Returns NULL if np is the last node.
+ */
+static struct node *node_next(struct sparsebit *s, struct node *np)
+{
+	struct node *cur, *up;
+
+	/* With a right child, the successor is that subtree's left-most node. */
+	if (np->right) {
+		cur = np->right;
+		while (cur->left)
+			cur = cur->left;
+		return cur;
+	}
+
+	/*
+	 * Otherwise climb until the step up leaves a left child; the
+	 * parent reached by that step is the successor. Running out of
+	 * parents means there is no successor.
+	 */
+	cur = np;
+	for (;;) {
+		up = cur->parent;
+		if (!up || cur != up->right)
+			return up;
+		cur = up;
+	}
+}
+
+/* Returns a pointer to the in-order predecessor of the node pointed to
+ * by np, i.e. the node with the highest index below np's index.
+ * Returns NULL if np is the first node.
+ */
+static struct node *node_prev(struct sparsebit *s, struct node *np)
+{
+	struct node *cur, *up;
+
+	/* With a left child, the predecessor is that subtree's right-most node. */
+	if (np->left) {
+		cur = np->left;
+		while (cur->right)
+			cur = cur->right;
+		return cur;
+	}
+
+	/*
+	 * Otherwise climb until the step up leaves a right child; the
+	 * parent reached by that step is the predecessor. Running out of
+	 * parents means there is no predecessor.
+	 */
+	cur = np;
+	for (;;) {
+		up = cur->parent;
+		if (!up || cur != up->left)
+			return up;
+		cur = up;
+	}
+}
+
+
+/* Recursively duplicates the node sub-tree pointed to by subtree.
+ * Every node of the copy is newly allocated and carries the same idx,
+ * mask and num_after as its original. Returns the root of the copy,
+ * whose parent pointer is NULL. Aborts if an allocation fails.
+ */
+static struct node *node_copy_subtree(struct node *subtree)
+{
+	struct node *copy, *child;
+
+	copy = calloc(1, sizeof(*copy));
+	if (copy == NULL) {
+		perror("calloc");
+		abort();
+	}
+
+	/* Duplicate the bit settings of this node. */
+	copy->idx = subtree->idx;
+	copy->mask = subtree->mask;
+	copy->num_after = subtree->num_after;
+
+	/* Duplicate whichever children exist and hook them to the copy. */
+	if (subtree->left) {
+		child = node_copy_subtree(subtree->left);
+		copy->left = child;
+		child->parent = copy;
+	}
+
+	if (subtree->right) {
+		child = node_copy_subtree(subtree->right);
+		copy->right = child;
+		child->parent = copy;
+	}
+
+	return copy;
+}
+
+/* Returns a pointer to the node whose range of described bits (its mask
+ * bits followed by its num_after bits) contains the bit at idx.
+ * Returns NULL if no node describes that bit.
+ */
+static struct node *node_find(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep = s->root;
+
+	while (nodep) {
+		/* Bits below this node's start can only be in the left subtree. */
+		if (idx < nodep->idx) {
+			nodep = nodep->left;
+			continue;
+		}
+
+		/* idx is at or past the start; is it within the node's range? */
+		if (idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+			break;
+
+		nodep = nodep->right;
+	}
+
+	return nodep;
+}
+
+/* Entry Requirements:
+ *   + A node that describes the setting of idx is not already present.
+ *
+ * Adds a new node to describe the setting of the bit at the index given
+ * by idx. The new node starts at idx rounded down to a multiple of
+ * MASK_BITS and is inserted with no bits set, except for bits taken over
+ * from the num_after range of the previous node when that range reaches
+ * into the new node's mask. Returns a pointer to the newly added node.
+ * Aborts if the allocation fails.
+ *
+ * TODO(lhuemill): Degenerate cases causes the tree to get unbalanced.
+ */
+static struct node *node_add(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep, *parentp, *prev;
+
+	/* Allocate and initialize the new node. */
+	nodep = calloc(1, sizeof(*nodep));
+	if (!nodep) {
+		perror("calloc");
+		abort();
+	}
+
+	nodep->idx = idx & -MASK_BITS;
+
+	/* If no nodes, set it up as the root node. */
+	if (!s->root) {
+		s->root = nodep;
+		return nodep;
+	}
+
+	/*
+	 * Find the parent where the new node should be attached
+	 * and add the node there.
+	 */
+	parentp = s->root;
+	while (true) {
+		if (idx < parentp->idx) {
+			if (!parentp->left) {
+				parentp->left = nodep;
+				nodep->parent = parentp;
+				break;
+			}
+			parentp = parentp->left;
+		} else {
+			assert(idx > parentp->idx + MASK_BITS + parentp->num_after - 1);
+			if (!parentp->right) {
+				parentp->right = nodep;
+				nodep->parent = parentp;
+				break;
+			}
+			parentp = parentp->right;
+		}
+	}
+
+	/*
+	 * Does num_after bits of previous node overlap with the mask
+	 * of the new node?  If so set the bits in the new nodes mask
+	 * and reduce the previous nodes num_after.
+	 */
+	prev = node_prev(s, nodep);
+	while (prev && prev->idx + MASK_BITS + prev->num_after - 1 >= nodep->idx) {
+		unsigned int n1 = (prev->idx + MASK_BITS + prev->num_after - 1)
+			- nodep->idx;
+		assert(prev->num_after > 0);
+		assert(n1 < MASK_BITS);
+		/*
+		 * n1 may be MASK_BITS - 1, so shift a mask_t-typed one: a
+		 * plain int 1 shifted into the sign bit is undefined.
+		 */
+		assert(!(nodep->mask & ((mask_t) 1 << n1)));
+		nodep->mask |= ((mask_t) 1 << n1);
+		prev->num_after--;
+	}
+
+	return nodep;
+}
+
+/* Returns true when every bit of the sparsebit array is set. */
+bool sparsebit_all_set(struct sparsebit *s)
+{
+	/* An empty tree means no bit is set at all. */
+	if (!s->root)
+		return false;
+
+	/*
+	 * At least one bit is set. The count of set bits can then only
+	 * read 0 because it wrapped around, which happens when all bits
+	 * are set.
+	 */
+	return s->num_set == 0;
+}
+
+/* Removes the node pointed to by nodep from the tree and frees it. The
+ * bits the node described are subtracted from the array's num_set count,
+ * which clears them.
+ */
+static void node_rm(struct sparsebit *s, struct node *nodep)
+{
+	struct node *successor, *child, *parent;
+	sparsebit_num_t num_set;
+
+	num_set = node_num_set(nodep);
+	assert(s->num_set >= num_set || sparsebit_all_set(s));
+	s->num_set -= num_set;
+
+	/*
+	 * With two children, hang the left subtree below the left-most
+	 * node of the right subtree. Afterwards the node has at most a
+	 * right child.
+	 */
+	if (nodep->left && nodep->right) {
+		successor = nodep->right;
+		while (successor->left)
+			successor = successor->left;
+
+		successor->left = nodep->left;
+		successor->left->parent = successor;
+		nodep->left = NULL;
+	}
+
+	/* The single remaining child, if any, takes the node's place. */
+	child = nodep->left ? nodep->left : nodep->right;
+	parent = nodep->parent;
+
+	if (child)
+		child->parent = parent;
+
+	if (!parent) {
+		s->root = child;
+	} else if (parent->left == nodep) {
+		parent->left = child;
+	} else {
+		assert(nodep == parent->right);
+		parent->right = child;
+	}
+
+	nodep->parent = nodep->left = nodep->right = NULL;
+	free(nodep);
+}
+
+/* Splits the node containing the bit at idx so that there is a node
+ * that starts at the specified index.  If no such node exists, a new
+ * node at the specified index is created.  Returns the node that starts
+ * at idx.
+ *
+ * idx must be on a mask boundary, i.e. a multiple of MASK_BITS.
+ */
+static struct node *node_split(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep1, *nodep2;
+	sparsebit_idx_t offset;
+	sparsebit_num_t orig_num_after;
+
+	assert(!(idx % MASK_BITS));
+
+	/*
+	 * Is there a node that describes the setting of idx?
+	 * If not, add it.
+	 */
+	nodep1 = node_find(s, idx);
+	if (!nodep1)
+		return node_add(s, idx);
+
+	/*
+	 * All done if the starting index of the node is where the
+	 * split should occur.
+	 */
+	if (nodep1->idx == idx)
+		return nodep1;
+
+	/*
+	 * Split point not at start of mask, so it must be part of
+	 * bits described by num_after.
+	 */
+
+	/*
+	 * Calculate offset within num_after for where the split is
+	 * to occur.
+	 */
+	offset = idx - (nodep1->idx + MASK_BITS);
+	orig_num_after = nodep1->num_after;
+
+	/*
+	 * Add a new node to describe the bits starting at
+	 * the split point. The first node is shortened beforehand so
+	 * that it no longer describes the bit at idx.
+	 */
+	nodep1->num_after = offset;
+	nodep2 = node_add(s, idx);
+
+	/* Move bits after the split point into the new node */
+	nodep2->num_after = orig_num_after - offset;
+	if (nodep2->num_after >= MASK_BITS) {
+		nodep2->mask = ~(mask_t) 0;
+		nodep2->num_after -= MASK_BITS;
+	} else {
+		/*
+		 * num_after may be MASK_BITS - 1 here, so shift a
+		 * mask_t-typed one: a plain int 1 shifted into the sign
+		 * bit is undefined.
+		 */
+		nodep2->mask = ((mask_t) 1 << nodep2->num_after) - 1;
+		nodep2->num_after = 0;
+	}
+
+	return nodep2;
+}
+
+/* Iteratively reduces the node pointed to by nodep and its adjacent
+ * nodes into a more compact form. For example, a node with a mask with
+ * all bits set adjacent to a previous node, will get combined into a
+ * single node with an increased num_after setting.
+ *
+ * After each reduction, a further check is made to see if additional
+ * reductions are possible with the new previous and next nodes. Note,
+ * a search for a reduction is only done across the nodes nearest nodep
+ * and those that became part of a reduction. Reductions beyond nodep
+ * and the adjacent nodes that are reduced are not discovered. It is the
+ * responsibility of the caller to pass a nodep that is within one node
+ * of each possible reduction.
+ *
+ * This function does not fix the temporary violation of all invariants.
+ * For example it does not fix the case where the bit settings described
+ * by two or more nodes overlap. Such a violation introduces the potential
+ * complication of a bit setting for a specific index having different settings
+ * in different nodes. This would then introduce the further complication
+ * of which node has the correct setting of the bit and thus such conditions
+ * are not allowed.
+ *
+ * This function is designed to fix invariant violations that are introduced
+ * by node_split() and by changes to the nodes mask or num_after members.
+ * For example, when setting a bit within a nodes mask, the function that
+ * sets the bit doesn't have to worry about whether the setting of that
+ * bit caused the mask to have leading only or trailing only bits set.
+ * Instead, the function can call node_reduce(), with nodep equal to the
+ * node address that it set a mask bit in, and node_reduce() will notice
+ * the cases of leading or trailing only bits and that there is an
+ * adjacent node that the bit settings could be merged into.
+ *
+ * This implementation specifically detects and corrects violation of the
+ * following invariants:
+ *
+ * + Nodes are only used to represent bits that are set.
+ * Nodes with a mask of 0 and num_after of 0 are not allowed.
+ *
+ * + The setting of at least one bit is always described in a nodes
+ * mask (mask >= 1).
+ *
+ * + A node with all mask bits set only occurs when the last bit
+ * described by the previous node is not equal to this nodes
+ * starting index - 1. All such occurrences of this condition are
+ * avoided by moving the setting of the nodes mask bits into
+ * the previous nodes num_after setting.
+ */
+static void node_reduce(struct sparsebit *s, struct node *nodep)
+{
+ bool reduction_performed;
+
+ /*
+ * Keep making passes until one completes without performing a
+ * reduction, or until no node is left to examine (nodep is NULL).
+ */
+ do {
+ reduction_performed = false;
+ struct node *prev, *next, *tmp;
+
+ /* 1) Potential reductions within the current node. */
+
+ /* Nodes with all bits cleared may be removed. */
+ if (nodep->mask == 0 && nodep->num_after == 0) {
+ /*
+ * About to remove the node pointed to by
+ * nodep, which normally would cause a problem
+ * for the next pass through the reduction loop,
+ * because the node at the starting point no longer
+ * exists. This potential problem is handled
+ * by first remembering the location of the next
+ * or previous nodes. Doesn't matter which, because
+ * once the node at nodep is removed, there will be
+ * no other nodes between prev and next.
+ *
+ * Note, the checks performed on nodep against
+ * both prev and next both check for an adjacent
+ * node that can be reduced into a single node. As
+ * such, after removing the node at nodep, doesn't
+ * matter whether the nodep for the next pass
+ * through the loop is equal to the previous pass
+ * prev or next node. Either way, on the next pass
+ * the one not selected will become either the
+ * prev or next node.
+ */
+ tmp = node_next(s, nodep);
+ if (!tmp)
+ tmp = node_prev(s, nodep);
+
+ node_rm(s, nodep);
+ nodep = NULL;
+
+ nodep = tmp;
+ reduction_performed = true;
+ /*
+ * continue re-tests the loop condition, so the loop
+ * ends here when neither a next nor a previous node
+ * existed (tmp, and therefore nodep, is NULL).
+ */
+ continue;
+ }
+
+ /*
+ * When the mask is 0, can reduce the amount of num_after
+ * bits by moving the initial num_after bits into the mask.
+ */
+ if (nodep->mask == 0) {
+ assert(nodep->num_after != 0);
+ assert(nodep->idx + MASK_BITS > nodep->idx);
+
+ nodep->idx += MASK_BITS;
+
+ if (nodep->num_after >= MASK_BITS) {
+ nodep->mask = ~0;
+ nodep->num_after -= MASK_BITS;
+ } else {
+ nodep->mask = (1u << nodep->num_after) - 1;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * 2) Potential reductions between the current and
+ * previous nodes.
+ */
+ prev = node_prev(s, nodep);
+ if (prev) {
+ sparsebit_idx_t prev_highest_bit;
+
+ /* Nodes with no bits set can be removed. */
+ if (prev->mask == 0 && prev->num_after == 0) {
+ node_rm(s, prev);
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * All mask bits set and previous node has
+ * adjacent index.
+ *
+ * The whole of nodep (mask bits plus num_after bits)
+ * is folded into prev->num_after. nodep is left with
+ * no bits, so the next pass removes it.
+ */
+ if (nodep->mask + 1 == 0 &&
+ prev->idx + MASK_BITS == nodep->idx) {
+ prev->num_after += MASK_BITS + nodep->num_after;
+ nodep->mask = 0;
+ nodep->num_after = 0;
+
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is node adjacent to previous node and the node
+ * contains a single contiguous range of bits
+ * starting from the beginning of the mask?
+ *
+ * The mask test below holds only when every set bit
+ * above bit 0 also has the next lower bit set.
+ */
+ prev_highest_bit = prev->idx + MASK_BITS - 1 + prev->num_after;
+ if (prev_highest_bit + 1 == nodep->idx &&
+ (nodep->mask | (nodep->mask >> 1)) == nodep->mask) {
+ /*
+ * How many contiguous bits are there?
+ * Is equal to the total number of set
+ * bits, due to an earlier check that
+ * there is a single contiguous range of
+ * set bits.
+ */
+ unsigned int num_contiguous
+ = __builtin_popcount(nodep->mask);
+ assert((num_contiguous > 0) &&
+ ((1ULL << num_contiguous) - 1) == nodep->mask);
+
+ prev->num_after += num_contiguous;
+ nodep->mask = 0;
+
+ /*
+ * For predictable performance, handle special
+ * case where all mask bits are set and there
+ * is a non-zero num_after setting. This code
+ * is functionally correct without the following
+ * conditionalized statements, but without them
+ * the value of num_after is only reduced by
+ * the number of mask bits per pass. There are
+ * cases where num_after can be close to 2^64.
+ * Without this code it could take nearly
+ * (2^64) / 32 passes to perform the full
+ * reduction.
+ */
+ if (num_contiguous == MASK_BITS) {
+ prev->num_after += nodep->num_after;
+ nodep->num_after = 0;
+ }
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+
+ /*
+ * 3) Potential reductions between the current and
+ * next nodes.
+ */
+ next = node_next(s, nodep);
+ if (next) {
+ /* Nodes with no bits set can be removed. */
+ if (next->mask == 0 && next->num_after == 0) {
+ node_rm(s, next);
+ reduction_performed = true;
+ continue;
+ }
+
+ /*
+ * Is next node index adjacent to current node
+ * and has a mask with all bits set?
+ *
+ * If so, all bits of next are added to
+ * nodep->num_after and next is removed.
+ */
+ if (next->idx == nodep->idx + MASK_BITS + nodep->num_after &&
+ next->mask == ~(mask_t) 0) {
+ nodep->num_after += MASK_BITS;
+ next->mask = 0;
+ nodep->num_after += next->num_after;
+ next->num_after = 0;
+
+ node_rm(s, next);
+ next = NULL;
+
+ reduction_performed = true;
+ continue;
+ }
+ }
+ } while (nodep && reduction_performed);
+}
+
+/* Returns whether the bit at the index given by idx, within the
+ * sparsebit array pointed to by s, is set.
+ *
+ * The tree rooted at s->root is searched for the node whose range
+ * (MASK_BITS mask bits followed by num_after bits) contains idx.
+ * When no node describes idx, the bit is cleared and false is returned.
+ */
+bool sparsebit_is_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep;
+
+	/* Find the node that describes the setting of the bit at idx */
+	for (nodep = s->root; nodep;
+	     nodep = nodep->idx > idx ? nodep->left : nodep->right) {
+		if (idx >= nodep->idx &&
+		    idx <= nodep->idx + MASK_BITS + nodep->num_after - 1)
+			break;
+	}
+
+	/* No node describes idx, so the bit is cleared */
+	if (!nodep)
+		return false;
+
+	/* Bit is set if it is any of the bits described by num_after */
+	if (nodep->num_after && idx >= nodep->idx + MASK_BITS)
+		return true;
+
+	/*
+	 * Is the corresponding mask bit set?  The constant is cast to
+	 * mask_t, as node_first_set() does, so that selecting the highest
+	 * mask bit does not shift a signed int into its sign bit.
+	 */
+	assert(idx >= nodep->idx && idx - nodep->idx < MASK_BITS);
+	return !!(nodep->mask & ((mask_t) 1 << (idx - nodep->idx)));
+}
+
+/* Within the sparsebit array pointed to by s, sets the bit
+ * at the index given by idx.  Does nothing when the bit is already set.
+ * Otherwise s->num_set is incremented and node_reduce() is run on the
+ * node that was modified.
+ */
+static void bit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep;
+	mask_t bit;
+
+	/* Skip bits that are already set */
+	if (sparsebit_is_set(s, idx))
+		return;
+
+	/*
+	 * Get a node where the bit at idx is described by the mask.
+	 * The node_split will also create a node, if there isn't
+	 * already a node that describes the setting of bit.
+	 */
+	nodep = node_split(s, idx & -MASK_BITS);
+
+	/*
+	 * Set the bit within the node's mask.  The single-bit value is
+	 * built as a mask_t, only after the range assert, so the shift
+	 * never reaches the sign bit of an int.
+	 */
+	assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+	bit = (mask_t) 1 << (idx - nodep->idx);
+	assert(!(nodep->mask & bit));
+	nodep->mask |= bit;
+	s->num_set++;
+
+	node_reduce(s, nodep);
+}
+
+/* Within the sparsebit array pointed to by s, clears the bit
+ * at the index given by idx.  Does nothing when the bit is already
+ * cleared.  Otherwise s->num_set is decremented and node_reduce() is
+ * run on the node that was modified.
+ */
+static void bit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+	struct node *nodep;
+	mask_t bit;
+
+	/* Skip bits that are already cleared */
+	if (!sparsebit_is_set(s, idx))
+		return;
+
+	/* Is there a node that describes the setting of this bit? */
+	nodep = node_find(s, idx);
+	if (!nodep)
+		return;
+
+	/*
+	 * If a num_after bit, split the node, so that the bit is
+	 * part of a node mask.
+	 */
+	if (idx >= nodep->idx + MASK_BITS)
+		nodep = node_split(s, idx & -MASK_BITS);
+
+	/*
+	 * After node_split above, bit at idx should be within the mask.
+	 * Clear that bit.  The single-bit value is built as a mask_t so
+	 * the shift never reaches the sign bit of an int.
+	 */
+	assert(idx >= nodep->idx && idx <= nodep->idx + MASK_BITS - 1);
+	bit = (mask_t) 1 << (idx - nodep->idx);
+	assert(nodep->mask & bit);
+	nodep->mask &= ~bit;
+	assert(s->num_set > 0 || sparsebit_all_set(s));
+	s->num_set--;
+
+	node_reduce(s, nodep);
+}
+
+/* Writes to stream a description of nodep followed, recursively, by the
+ * descriptions of its left and then its right sub-trees.  Every line is
+ * preceded by indent spaces; each level of recursion adds 2 to the
+ * indent, so deeper nodes of the binary search tree appear further to
+ * the right.
+ */
+static void dump_nodes(FILE *stream, struct node *nodep,
+	unsigned int indent)
+{
+	struct node *children[2];
+	char *position = "root";
+	unsigned int i;
+
+	/* Work out how this node hangs off of its parent */
+	if (nodep->parent) {
+		if (nodep->parent->left == nodep) {
+			position = "left";
+		} else {
+			assert(nodep == nodep->parent->right);
+			position = "right";
+		}
+	}
+
+	/* Describe the node itself */
+	fprintf(stream, "%*s---- %s nodep: %p\n", indent, "", position, nodep);
+	fprintf(stream, "%*s parent: %p left: %p right: %p\n", indent, "",
+		nodep->parent, nodep->left, nodep->right);
+	fprintf(stream, "%*s idx: 0x%lx mask: 0x%x num_after: 0x%lx\n",
+		indent, "", nodep->idx, nodep->mask, nodep->num_after);
+
+	/* Then whichever children exist, left before right */
+	children[0] = nodep->left;
+	children[1] = nodep->right;
+	for (i = 0; i < 2; i++) {
+		if (children[i])
+			dump_nodes(stream, children[i], indent + 2);
+	}
+}
+
+/* Returns the index of the lowest set mask bit of nodep that is at or
+ * above bit position start.  At least one such bit has to be set, as
+ * the result of __builtin_ctz() is undefined for an argument of 0.
+ */
+static inline sparsebit_idx_t node_first_set(struct node *nodep, int start)
+{
+	/* Negating the single bit at start keeps bit start and all above */
+	mask_t at_or_above = -((mask_t)1 << start);
+
+	return nodep->idx + __builtin_ctz(nodep->mask & at_or_above);
+}
+
+/* Returns the index of the lowest cleared mask bit of nodep that is at
+ * or above bit position start.  At least one such bit has to be
+ * cleared, as the result of __builtin_ctz() is undefined for an
+ * argument of 0.
+ */
+static inline sparsebit_idx_t node_first_clear(struct node *nodep, int start)
+{
+	/* Negating the single bit at start keeps bit start and all above */
+	mask_t at_or_above = -((mask_t)1 << start);
+
+	return nodep->idx + __builtin_ctz(~nodep->mask & at_or_above);
+}
+
+/* Writes the internal state of s to stream: the root pointer, the
+ * num_set counter and then, through dump_nodes(), every node of the
+ * tree.  Each line is preceded by indent spaces.  The format is tied to
+ * the implementation and may change, so the output is meant only for
+ * diagnostics, e.g. for a test case to capture after it has detected an
+ * unexpected condition.
+ */
+static void sparsebit_dump_internal(FILE *stream, struct sparsebit *s,
+	unsigned int indent)
+{
+	/* Top level members first */
+	fprintf(stream, "%*sroot: %p\n", indent, "", s->root);
+	fprintf(stream, "%*snum_set: 0x%lx\n", indent, "", s->num_set);
+
+	/* Nothing more to show for an empty tree */
+	if (!s->root)
+		return;
+
+	dump_nodes(stream, s->root, indent);
+}
+
+/* Allocates and returns a new sparsebit array.  The structure comes
+ * zero-filled from calloc(), which is the state of an array with all
+ * bits cleared.  An allocation failure is reported with perror() and
+ * the process is aborted; NULL is never returned.
+ */
+struct sparsebit *sparsebit_alloc(void)
+{
+	struct sparsebit *sbit = calloc(1, sizeof(*sbit));
+
+	if (sbit)
+		return sbit;
+
+	/* Out of memory: report and give up */
+	perror("calloc");
+	abort();
+}
+
+/* Releases the sparsebit array that *sbitp points to and then sets
+ * *sbitp to NULL so that the stale pointer can't be used again.
+ * A *sbitp that is already NULL is left alone.
+ */
+void sparsebit_free(struct sparsebit **sbitp)
+{
+	struct sparsebit *doomed;
+
+	if (*sbitp == NULL)
+		return;
+	doomed = *sbitp;
+
+	/* Clearing every bit first leaves only the top level structure */
+	sparsebit_clear_all(doomed);
+	free(doomed);
+
+	*sbitp = NULL;
+}
+
+/* Copies the bit settings of the sparsebit array s into the sparsebit
+ * array d.  d must already have been allocated via sparsebit_alloc().
+ * Any bits that are set in d beforehand are cleared before the copy.
+ */
+void sparsebit_copy(struct sparsebit *d, struct sparsebit *s)
+{
+	/* Start from an empty destination */
+	sparsebit_clear_all(d);
+
+	/* An empty source leaves the destination empty */
+	if (!s->root)
+		return;
+
+	d->root = node_copy_subtree(s->root);
+	d->num_set = s->num_set;
+}
+
+/* Returns whether the num consecutive bits that start at idx are all
+ * set.  num must be at least 1 and idx + num - 1 must not wrap.
+ */
+bool sparsebit_is_set_num(struct sparsebit *s,
+	sparsebit_idx_t idx, sparsebit_num_t num)
+{
+	sparsebit_idx_t first_clear_after;
+
+	assert(num > 0);
+	assert(idx + num - 1 >= idx);
+
+	/* A cleared first bit settles the question right away */
+	if (!sparsebit_is_set(s, idx))
+		return false;
+
+	first_clear_after = sparsebit_next_clear(s, idx);
+
+	/*
+	 * 0 means that no bit beyond idx is cleared.  As idx + num does
+	 * not wrap, at least num set bits then follow idx.
+	 */
+	if (first_clear_after == 0)
+		return true;
+
+	/* Otherwise the run of set bits has to reach at least num bits */
+	return first_clear_after - idx >= num;
+}
+
+/* Returns whether the bit at the index given by idx, within the
+ * sparsebit array pointed to by s, is cleared. This is the logical
+ * negation of sparsebit_is_set().
+ */
+bool sparsebit_is_clear(struct sparsebit *s,
+ sparsebit_idx_t idx)
+{
+ return !sparsebit_is_set(s, idx);
+}
+
+/* Returns whether the num consecutive bits that start at idx are all
+ * cleared.  num must be at least 1 and idx + num - 1 must not wrap.
+ */
+bool sparsebit_is_clear_num(struct sparsebit *s,
+	sparsebit_idx_t idx, sparsebit_num_t num)
+{
+	sparsebit_idx_t first_set_after;
+
+	assert(num > 0);
+	assert(idx + num - 1 >= idx);
+
+	/* A set first bit settles the question right away */
+	if (!sparsebit_is_clear(s, idx))
+		return false;
+
+	first_set_after = sparsebit_next_set(s, idx);
+
+	/*
+	 * 0 means that no bit beyond idx is set.  As idx + num does not
+	 * wrap, at least num cleared bits then follow idx.
+	 */
+	if (first_set_after == 0)
+		return true;
+
+	/* Otherwise the run of cleared bits has to reach at least num bits */
+	return first_set_after - idx >= num;
+}
+
+/* Returns the total number of bits set. Note: 0 is also returned for
+ * the case of all bits set. This is because with all bits set, there
+ * is 1 additional bit set beyond what can be represented in the return
+ * value. Use sparsebit_any_set(), instead of sparsebit_num_set() > 0,
+ * to determine if the sparsebit array has any bits set.
+ *
+ * The value is read directly from the num_set member of s.
+ */
+sparsebit_num_t sparsebit_num_set(struct sparsebit *s)
+{
+ return s->num_set;
+}
+
+/* Returns whether at least one bit of the sparsebit array is set. */
+bool sparsebit_any_set(struct sparsebit *s)
+{
+	/*
+	 * Only set bits are described by nodes, so a tree without a
+	 * root has no bit set.
+	 */
+	if (s->root == NULL)
+		return false;
+
+	/*
+	 * A node is present, so a bit is set.  As a quick sanity check,
+	 * make sure that the root node has a non-zero mask (every node
+	 * should) and that num_set is non-zero, unless the root node
+	 * describes all bits as set, in which case num_set wraps to 0.
+	 */
+	assert(s->root->mask != 0);
+	assert(s->num_set > 0 ||
+	       (s->root->num_after == ((sparsebit_num_t) 0) - MASK_BITS &&
+		s->root->mask == ~(mask_t) 0));
+
+	return true;
+}
+
+/* Returns whether all the bits in the sparsebit array are cleared,
+ * which is the logical negation of sparsebit_any_set().
+ */
+bool sparsebit_all_clear(struct sparsebit *s)
+{
+ return !sparsebit_any_set(s);
+}
+
+/* Returns whether any bit in the sparsebit array is cleared, which is
+ * the logical negation of sparsebit_all_set().
+ */
+bool sparsebit_any_clear(struct sparsebit *s)
+{
+ return !sparsebit_all_set(s);
+}
+
+/* Returns the index of the lowest set bit.  At least one bit has to be
+ * set; this is asserted.
+ */
+sparsebit_idx_t sparsebit_first_set(struct sparsebit *s)
+{
+	/* Validate at least 1 bit is set */
+	assert(sparsebit_any_set(s));
+
+	/* The lowest set bit is the lowest mask bit of the first node */
+	return node_first_set(node_first(s), 0);
+}
+
+/* Returns the index of the lowest cleared bit.  At least one bit has to
+ * be cleared; this is asserted.
+ */
+sparsebit_idx_t sparsebit_first_clear(struct sparsebit *s)
+{
+	struct node *nodep1, *nodep2;
+	sparsebit_idx_t after_first;
+
+	/* Validate at least 1 bit is cleared. */
+	assert(sparsebit_any_clear(s));
+
+	/* With no nodes, or a first node that starts above 0, bit 0 is clear */
+	nodep1 = node_first(s);
+	if (!nodep1 || nodep1->idx > 0)
+		return 0;
+
+	/* A hole in the mask of the first node is the answer */
+	if (nodep1->mask != ~(mask_t) 0)
+		return node_first_clear(nodep1, 0);
+
+	/* First index past every bit that the first node describes */
+	after_first = nodep1->idx + MASK_BITS + nodep1->num_after;
+
+	nodep2 = node_next(s, nodep1);
+	if (!nodep2) {
+		/*
+		 * No second node.  First cleared bit is first bit beyond
+		 * bits described by first node.
+		 */
+		assert(nodep1->mask == ~(mask_t) 0);
+		assert(nodep1->idx + MASK_BITS + nodep1->num_after != (sparsebit_idx_t) 0);
+		return after_first;
+	}
+
+	/*
+	 * A second node that does not start right after the first one
+	 * leaves a gap of cleared bits; the gap starts at after_first.
+	 */
+	if (nodep2->idx != after_first)
+		return after_first;
+
+	/*
+	 * The second node is adjacent.  Its mask should be non-zero, and
+	 * if all of its mask bits were set they should have been moved
+	 * into the num_after of the first node.  So its mask should
+	 * hold a cleared bit.
+	 */
+	return node_first_clear(nodep2, 0);
+}
+
+/* Returns index of next bit set within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are set.
+ */
+sparsebit_idx_t sparsebit_next_set(struct sparsebit *s,
+	sparsebit_idx_t prev)
+{
+	sparsebit_idx_t lowest_possible = prev + 1;
+	sparsebit_idx_t start;
+	struct node *nodep;
+
+	/*
+	 * The leftmost node that overlaps or lies to the right of
+	 * lowest_possible, and whether lowest_possible is within it.
+	 */
+	struct node *candidate = NULL;
+	bool contains = false;
+
+	/* A bit after the highest index can't be set. */
+	if (lowest_possible == 0)
+		return 0;
+
+	/*
+	 * Find node that describes setting of bit at lowest_possible.
+	 * If such a node doesn't exist, find the node with the lowest
+	 * starting index that is > lowest_possible.
+	 */
+	for (nodep = s->root; nodep;) {
+		if ((nodep->idx + MASK_BITS + nodep->num_after - 1)
+		    >= lowest_possible) {
+			candidate = nodep;
+			if (candidate->idx <= lowest_possible) {
+				contains = true;
+				break;
+			}
+			nodep = nodep->left;
+		} else {
+			nodep = nodep->right;
+		}
+	}
+	if (!candidate)
+		return 0;
+
+	assert(candidate->mask != 0);
+
+	/* Does the candidate node describe the setting of lowest_possible? */
+	if (!contains) {
+		/*
+		 * Candidate doesn't describe setting of bit at lowest_possible.
+		 * Candidate points to the first node with a starting index
+		 * > lowest_possible.
+		 */
+		assert(candidate->idx > lowest_possible);
+
+		return node_first_set(candidate, 0);
+	}
+
+	/*
+	 * Candidate describes setting of bit at lowest_possible.
+	 * Note: although the node describes the setting of the bit
+	 * at lowest_possible, it's possible that its setting and the
+	 * setting of all later bits described by this node are 0.
+	 * For now, just handle the cases where this node describes
+	 * a bit at or after an index of lowest_possible that is set.
+	 */
+	start = lowest_possible - candidate->idx;
+
+	/*
+	 * The mask is >= the single bit at start exactly when a mask bit
+	 * at or above start is set.  The single bit is built as a mask_t
+	 * so that start == MASK_BITS - 1 does not shift a signed int
+	 * into its sign bit.
+	 */
+	if (start < MASK_BITS && candidate->mask >= ((mask_t) 1 << start))
+		return node_first_set(candidate, start);
+
+	if (candidate->num_after) {
+		sparsebit_idx_t first_num_after_idx = candidate->idx + MASK_BITS;
+
+		return lowest_possible < first_num_after_idx
+			? first_num_after_idx : lowest_possible;
+	}
+
+	/*
+	 * Although candidate node describes setting of bit at
+	 * the index of lowest_possible, all bits at that index and
+	 * later that are described by candidate are cleared.  With
+	 * this, the next bit is the first bit in the next node, if
+	 * such a node exists.  If a next node doesn't exist, then
+	 * there is no next set bit.
+	 */
+	candidate = node_next(s, candidate);
+	if (!candidate)
+		return 0;
+
+	return node_first_set(candidate, 0);
+}
+
+/* Returns index of next bit cleared within s after the index given by prev.
+ * Returns 0 if there are no bits after prev that are cleared.
+ */
+sparsebit_idx_t sparsebit_next_clear(struct sparsebit *s,
+	sparsebit_idx_t prev)
+{
+	sparsebit_idx_t lowest_possible = prev + 1;
+	sparsebit_idx_t idx;
+	struct node *nodep1, *nodep2;
+
+	/* prev is the highest index, so there is no bit after it. */
+	if (lowest_possible == 0)
+		return 0;
+
+	/*
+	 * Does a node describing the setting of lowest_possible exist?
+	 * If not, the bit at lowest_possible is cleared.
+	 */
+	nodep1 = node_find(s, lowest_possible);
+	if (!nodep1)
+		return lowest_possible;
+
+	/*
+	 * Does a mask bit in node 1 describe the next cleared bit?
+	 * The loop body is skipped when lowest_possible is one of the
+	 * num_after bits.  The tested bit is built as a mask_t so that
+	 * idx == MASK_BITS - 1 does not shift a signed int into its
+	 * sign bit.
+	 */
+	for (idx = lowest_possible - nodep1->idx; idx < MASK_BITS; idx++)
+		if (!(nodep1->mask & ((mask_t) 1 << idx)))
+			return nodep1->idx + idx;
+
+	/*
+	 * Next cleared bit is not described by node 1.  If there
+	 * isn't a next node, then next cleared bit is described
+	 * by bit after the bits described by the first node.
+	 */
+	nodep2 = node_next(s, nodep1);
+	if (!nodep2)
+		return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+	/*
+	 * There is a second node.
+	 * If it is not adjacent to the first node, then there is a gap
+	 * of cleared bits between the nodes, and the next cleared bit
+	 * is the first bit within the gap.
+	 */
+	if (nodep1->idx + MASK_BITS + nodep1->num_after != nodep2->idx)
+		return nodep1->idx + MASK_BITS + nodep1->num_after;
+
+	/*
+	 * Second node is adjacent to the first node.
+	 * Because it is adjacent, its mask should be non-zero.  If all
+	 * its mask bits are set, then with it being adjacent, it should
+	 * have had the mask bits moved into the num_after setting of the
+	 * previous node.
+	 */
+	return node_first_clear(nodep2, 0);
+}
+
+/* Searches, from the index 1 greater than start, for the first run of
+ * num consecutive set bits and returns the index at which that run
+ * begins.  Returns 0 if no such run exists.
+ */
+sparsebit_idx_t sparsebit_next_set_num(struct sparsebit *s,
+	sparsebit_idx_t start, sparsebit_num_t num)
+{
+	sparsebit_idx_t idx;
+
+	assert(num >= 1);
+
+	/* Visit the start of each run of set bits, while num bits still fit */
+	idx = sparsebit_next_set(s, start);
+	while (idx != 0 && idx + num - 1 >= idx) {
+		assert(sparsebit_is_set(s, idx));
+
+		/* Is the run that starts here long enough? */
+		if (sparsebit_is_set_num(s, idx, num))
+			return idx;
+
+		/* Too short: step over the remainder of this run */
+		idx = sparsebit_next_clear(s, idx);
+		if (idx == 0)
+			break;
+
+		idx = sparsebit_next_set(s, idx);
+	}
+
+	return 0;
+}
+
+/* Searches, from the index 1 greater than start, for the first run of
+ * num consecutive cleared bits and returns the index at which that run
+ * begins.  Returns 0 if no such run exists.
+ */
+sparsebit_idx_t sparsebit_next_clear_num(struct sparsebit *s,
+	sparsebit_idx_t start, sparsebit_num_t num)
+{
+	sparsebit_idx_t idx;
+
+	assert(num >= 1);
+
+	/* Visit the start of each run of cleared bits, while num bits still fit */
+	idx = sparsebit_next_clear(s, start);
+	while (idx != 0 && idx + num - 1 >= idx) {
+		assert(sparsebit_is_clear(s, idx));
+
+		/* Is the run that starts here long enough? */
+		if (sparsebit_is_clear_num(s, idx, num))
+			return idx;
+
+		/* Too short: step over the remainder of this run */
+		idx = sparsebit_next_set(s, idx);
+		if (idx == 0)
+			break;
+
+		idx = sparsebit_next_clear(s, idx);
+	}
+
+	return 0;
+}
+
+/* Sets the bits in the inclusive range start through start + num - 1.
+ * num must be at least 1 and the range must not wrap.
+ *
+ * The range is handled in three parts: leading bits up to the first
+ * mask boundary, a middle part that spans whole masks, and trailing
+ * bits after the last whole mask.
+ */
+void sparsebit_set_num(struct sparsebit *s,
+	sparsebit_idx_t start, sparsebit_num_t num)
+{
+	struct node *nodep, *next;
+	unsigned int n1;
+	sparsebit_idx_t idx;
+	sparsebit_num_t n;
+	sparsebit_idx_t middle_start, middle_end;
+
+	assert(num > 0);
+	assert(start + num - 1 >= start);
+
+	/*
+	 * Leading - bits before first mask boundary.
+	 *
+	 * TODO(lhuemill): With some effort it may be possible to
+	 * replace the following loop with a sequential sequence
+	 * of statements.  High level sequence would be:
+	 *
+	 *   1. Use node_split() to force node that describes setting
+	 *      of idx to be within the mask portion of a node.
+	 *   2. Form mask of bits to be set.
+	 *   3. Determine number of mask bits already set in the node
+	 *      and store in a local variable named num_already_set.
+	 *   4. Set the appropriate mask bits within the node.
+	 *   5. Increment struct sparsebit_pvt num_set member
+	 *      by the number of bits that were actually set.
+	 *      Exclude from the counts bits that were already set.
+	 *   6. Before returning to the caller, use node_reduce() to
+	 *      handle the multiple corner cases that this method
+	 *      introduces.
+	 */
+	for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+		bit_set(s, idx);
+
+	/*
+	 * Middle - bits spanning one or more entire mask.  When fewer
+	 * than MASK_BITS bits remain, middle_end is middle_start - 1 and
+	 * the adjustments after the if statement change nothing.
+	 */
+	middle_start = idx;
+	middle_end = middle_start + (n & -MASK_BITS) - 1;
+	if (n >= MASK_BITS) {
+		nodep = node_split(s, middle_start);
+
+		/*
+		 * As needed, split just after end of middle bits.
+		 * No split needed if end of middle bits is at highest
+		 * supported bit index.
+		 */
+		if (middle_end + 1 > middle_end)
+			(void) node_split(s, middle_end + 1);
+
+		/* Delete nodes that only describe bits within the middle. */
+		for (next = node_next(s, nodep);
+		     next && (next->idx < middle_end);
+		     next = node_next(s, nodep)) {
+			assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+			node_rm(s, next);
+		}
+
+		/*
+		 * As needed set each of the mask bits.  The tested bit is
+		 * built as a mask_t so that n1 == MASK_BITS - 1 does not
+		 * shift a signed int into its sign bit.
+		 */
+		for (n1 = 0; n1 < MASK_BITS; n1++) {
+			mask_t bit = (mask_t) 1 << n1;
+
+			if (!(nodep->mask & bit)) {
+				nodep->mask |= bit;
+				s->num_set++;
+			}
+		}
+
+		/* Rest of the middle is described by num_after */
+		s->num_set -= nodep->num_after;
+		nodep->num_after = middle_end - middle_start + 1 - MASK_BITS;
+		s->num_set += nodep->num_after;
+
+		node_reduce(s, nodep);
+	}
+	idx = middle_end + 1;
+	n -= middle_end - middle_start + 1;
+
+	/* Trailing - bits at and beyond last mask boundary */
+	assert(n < MASK_BITS);
+	for (; n > 0; idx++, n--)
+		bit_set(s, idx);
+}
+
+/* Clears the bits in the inclusive range start through start + num - 1.
+ * num must be at least 1 and the range must not wrap.
+ *
+ * The range is handled in three parts: leading bits up to the first
+ * mask boundary, a middle part that spans whole masks, and trailing
+ * bits after the last whole mask.
+ */
+void sparsebit_clear_num(struct sparsebit *s,
+	sparsebit_idx_t start, sparsebit_num_t num)
+{
+	struct node *nodep, *next;
+	unsigned int n1;
+	sparsebit_idx_t idx;
+	sparsebit_num_t n;
+	sparsebit_idx_t middle_start, middle_end;
+
+	assert(num > 0);
+	assert(start + num - 1 >= start);
+
+	/* Leading - bits before first mask boundary */
+	for (idx = start, n = num; n > 0 && idx % MASK_BITS != 0; idx++, n--)
+		bit_clear(s, idx);
+
+	/*
+	 * Middle - bits spanning one or more entire mask.  When fewer
+	 * than MASK_BITS bits remain, middle_end is middle_start - 1 and
+	 * the adjustments after the if statement change nothing.
+	 */
+	middle_start = idx;
+	middle_end = middle_start + (n & -MASK_BITS) - 1;
+	if (n >= MASK_BITS) {
+		nodep = node_split(s, middle_start);
+
+		/*
+		 * As needed, split just after end of middle bits.
+		 * No split needed if end of middle bits is at highest
+		 * supported bit index.
+		 */
+		if (middle_end + 1 > middle_end)
+			(void) node_split(s, middle_end + 1);
+
+		/* Delete nodes that only describe bits within the middle. */
+		for (next = node_next(s, nodep);
+		     next && (next->idx < middle_end);
+		     next = node_next(s, nodep)) {
+			assert(next->idx + MASK_BITS + next->num_after - 1 <= middle_end);
+			node_rm(s, next);
+		}
+
+		/*
+		 * As needed clear each of the mask bits.  The tested bit
+		 * is built as a mask_t so that n1 == MASK_BITS - 1 does
+		 * not shift a signed int into its sign bit.
+		 */
+		for (n1 = 0; n1 < MASK_BITS; n1++) {
+			mask_t bit = (mask_t) 1 << n1;
+
+			if (nodep->mask & bit) {
+				nodep->mask &= ~bit;
+				s->num_set--;
+			}
+		}
+
+		/* Clear any bits described by num_after */
+		s->num_set -= nodep->num_after;
+		nodep->num_after = 0;
+
+		/*
+		 * Delete the node that describes the beginning of
+		 * the middle bits and perform any allowed reductions
+		 * with the nodes prev or next of nodep.
+		 */
+		node_reduce(s, nodep);
+	}
+	idx = middle_end + 1;
+	n -= middle_end - middle_start + 1;
+
+	/* Trailing - bits at and beyond last mask boundary */
+	assert(n < MASK_BITS);
+	for (; n > 0; idx++, n--)
+		bit_clear(s, idx);
+}
+
+/* Sets the bit at the index given by idx. Implemented as a
+ * sparsebit_set_num() call with a count of 1.
+ */
+void sparsebit_set(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_set_num(s, idx, 1);
+}
+
+/* Clears the bit at the index given by idx. Implemented as a
+ * sparsebit_clear_num() call with a count of 1.
+ */
+void sparsebit_clear(struct sparsebit *s, sparsebit_idx_t idx)
+{
+ sparsebit_clear_num(s, idx, 1);
+}
+
+/* Sets the bits in the entire addressable range of the sparsebit array.
+ *
+ * Bit 0 is set on its own, then the indices from 1 upward are set with
+ * a single sparsebit_set_num() call whose count is ~(sparsebit_idx_t) 0.
+ * NOTE(review): presumably the work is split this way because the total
+ * number of bits is one more than a count can express - confirm against
+ * the sparsebit_num_t definition.
+ */
+void sparsebit_set_all(struct sparsebit *s)
+{
+ sparsebit_set(s, 0);
+ sparsebit_set_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(sparsebit_all_set(s));
+}
+
+/* Clears the bits in the entire addressable range of the sparsebit array.
+ *
+ * Bit 0 is cleared on its own, then the indices from 1 upward are cleared
+ * with a single sparsebit_clear_num() call whose count is
+ * ~(sparsebit_idx_t) 0.
+ * NOTE(review): presumably the work is split this way because the total
+ * number of bits is one more than a count can express - confirm against
+ * the sparsebit_num_t definition.
+ */
+void sparsebit_clear_all(struct sparsebit *s)
+{
+ sparsebit_clear(s, 0);
+ sparsebit_clear_num(s, 1, ~(sparsebit_idx_t) 0);
+ assert(!sparsebit_any_set(s));
+}
+
+/* Formats the range low through high as "0xLOW:0xHIGH", or as just
+ * "0xLOW" when low equals high, preceded by ", " when
+ * prepend_comma_space is true.  With a NULL stream nothing is written
+ * and only the length that the text would have is returned; otherwise
+ * the text is written to stream and the fprintf() result is returned.
+ */
+static size_t display_range(FILE *stream, sparsebit_idx_t low,
+	sparsebit_idx_t high, bool prepend_comma_space)
+{
+	char *fmt_str;
+
+	/* Pick the printf format string */
+	if (prepend_comma_space) {
+		if (low == high)
+			fmt_str = ", 0x%lx";
+		else
+			fmt_str = ", 0x%lx:0x%lx";
+	} else {
+		if (low == high)
+			fmt_str = "0x%lx";
+		else
+			fmt_str = "0x%lx:0x%lx";
+	}
+
+	/* Print for real, or only measure when there is no stream */
+	if (stream)
+		return fprintf(stream, fmt_str, low, high);
+
+	return snprintf(NULL, 0, fmt_str, low, high);
+}
+
+
+/* Writes the range low through high to stream via display_range().
+ * When the range would make the current line longer than DUMP_LINE_MAX,
+ * a newline and the indent of the next line are written first.
+ * *line_len holds the length of the current line, not counting the
+ * indent, and is updated to include what was written.
+ */
+static void dump_range_wrapped(FILE *stream, sparsebit_idx_t low,
+	sparsebit_idx_t high, unsigned int indent, size_t *line_len)
+{
+	size_t sz;
+
+	/* How much room will it take to display this range. */
+	sz = display_range(NULL, low, high, *line_len != 0);
+
+	/* Not enough room: continue on a fresh, indented line. */
+	if (*line_len + sz > DUMP_LINE_MAX) {
+		fputs("\n", stream);
+		fprintf(stream, "%*s", indent, "");
+		*line_len = 0;
+	}
+
+	/* Display the range */
+	sz = display_range(stream, low, high, *line_len != 0);
+	*line_len += sz;
+}
+
+/* Dumps to the FILE stream given by stream, the bit settings
+ * of s.  Each line of output is prefixed with the number of
+ * spaces given by indent.  The length of each line is implementation
+ * dependent and does not depend on the indent amount.  The following
+ * is an example output of a sparsebit array that has bits:
+ *
+ *   0x5, 0x8, 0xa:0xe, 0x12
+ *
+ * This corresponds to a sparsebit whose bits 5, 8, 10, 11, 12, 13, 14, 18
+ * are set.  Note that a ':', instead of a '-' is used to specify a range of
+ * contiguous bits.  This is done because '-' is used to specify command-line
+ * options, and sometimes ranges are specified as command-line arguments.
+ *
+ * Nothing is written when no bit is set.
+ */
+void sparsebit_dump(FILE *stream, struct sparsebit *s,
+	unsigned int indent)
+{
+	size_t current_line_len = 0;
+	struct node *nodep;
+
+	if (!sparsebit_any_set(s))
+		return;
+
+	/* Display initial indent */
+	fprintf(stream, "%*s", indent, "");
+
+	/* For each node */
+	for (nodep = node_first(s); nodep; nodep = node_next(s, nodep)) {
+		unsigned int n1;
+		sparsebit_idx_t low, high;
+
+		/*
+		 * For each group of bits in the mask.  Single bits are
+		 * built as a mask_t so that bit MASK_BITS - 1 does not
+		 * shift a signed int into its sign bit.
+		 */
+		for (n1 = 0; n1 < MASK_BITS; n1++) {
+			if (!(nodep->mask & ((mask_t) 1 << n1)))
+				continue;
+
+			low = high = nodep->idx + n1;
+
+			/* Extend the group over the set bits that follow */
+			for (; n1 < MASK_BITS; n1++) {
+				if (nodep->mask & ((mask_t) 1 << n1))
+					high = nodep->idx + n1;
+				else
+					break;
+			}
+
+			/*
+			 * A group that reaches the top of the mask
+			 * continues into the num_after bits.
+			 */
+			if ((n1 == MASK_BITS) && nodep->num_after)
+				high += nodep->num_after;
+
+			dump_range_wrapped(stream, low, high, indent,
+					   &current_line_len);
+		}
+
+		/*
+		 * If num_after and most significant-bit of mask is not
+		 * set, then still need to display a range for the bits
+		 * described by num_after.
+		 */
+		if (!(nodep->mask & ((mask_t) 1 << (MASK_BITS - 1))) &&
+		    nodep->num_after) {
+			low = nodep->idx + MASK_BITS;
+			high = nodep->idx + MASK_BITS + nodep->num_after - 1;
+
+			dump_range_wrapped(stream, low, high, indent,
+					   &current_line_len);
+		}
+	}
+	fputs("\n", stream);
+}
+
+/* Validates the internal state of the sparsebit array given by
+ * s. On error, diagnostic information is printed to stderr and
+ * abort is called.
+ */
+void sparsebit_validate_internal(struct sparsebit *s)
+{
+ bool error_detected = false;
+ struct node *nodep, *prev = NULL;
+ sparsebit_num_t total_bits_set = 0;
+ unsigned int n1;
+
+ /* For each node */
+ for (nodep = node_first(s); nodep;
+ prev = nodep, nodep = node_next(s, nodep)) {
+
+ /*
+ * Increase total bits set by the number of bits set
+ * in this node.
+ */
+ for (n1 = 0; n1 < MASK_BITS; n1++)
+ if (nodep->mask & (1 << n1))
+ total_bits_set++;
+
+ total_bits_set += nodep->num_after;
+
+ /*
+ * Arbitrary choice as to whether a mask of 0 is allowed
+ * or not. For diagnostic purposes it is beneficial to
+ * have only one valid means to represent a set of bits.
+ * To support this an arbitrary choice has been made
+ * to not allow a mask of zero.
+ */
+ if (nodep->mask == 0) {
+ fprintf(stderr, "Node mask of zero, "
+ "nodep: %p nodep->mask: 0x%x",
+ nodep, nodep->mask);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate num_after is not greater than the max index
+ * - the number of mask bits. The num_after member
+ * uses 0-based indexing and thus has no value that
+ * represents all bits set. This limitation is handled
+ * by requiring a non-zero mask. With a non-zero mask,
+ * MASK_BITS worth of bits are described by the mask,
+ * which makes the largest needed num_after equal to:
+ *
+ * (~(sparsebit_num_t) 0) - MASK_BITS + 1
+ */
+ if (nodep->num_after
+ > (~(sparsebit_num_t) 0) - MASK_BITS + 1) {
+ fprintf(stderr, "num_after too large, "
+ "nodep: %p nodep->num_after: 0x%lx",
+ nodep, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Validate node index is divisible by the mask size */
+ if (nodep->idx % MASK_BITS) {
+ fprintf(stderr, "Node index not divisible by "
+ "mask size,\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "MASK_BITS: %lu\n",
+ nodep, nodep->idx, MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Validate bits described by node don't wrap beyond the
+ * highest supported index.
+ */
+ if ((nodep->idx + MASK_BITS + nodep->num_after - 1) < nodep->idx) {
+ fprintf(stderr, "Bits described by node wrap "
+ "beyond highest supported index,\n"
+ " nodep: %p nodep->idx: 0x%lx\n"
+ " MASK_BITS: %lu nodep->num_after: 0x%lx",
+ nodep, nodep->idx, MASK_BITS, nodep->num_after);
+ error_detected = true;
+ break;
+ }
+
+ /* Check parent pointers. */
+ if (nodep->left) {
+ if (nodep->left->parent != nodep) {
+ fprintf(stderr, "Left child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->left: %p "
+ "nodep->left->parent: %p",
+ nodep, nodep->left,
+ nodep->left->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (nodep->right) {
+ if (nodep->right->parent != nodep) {
+ fprintf(stderr, "Right child parent pointer "
+ "doesn't point to this node,\n"
+ " nodep: %p nodep->right: %p "
+ "nodep->right->parent: %p",
+ nodep, nodep->right,
+ nodep->right->parent);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (!nodep->parent) {
+ if (s->root != nodep) {
+ fprintf(stderr, "Unexpected root node, "
+ "s->root: %p nodep: %p",
+ s->root, nodep);
+ error_detected = true;
+ break;
+ }
+ }
+
+ if (prev) {
+ /*
+ * Is index of previous node before index of
+ * current node?
+ */
+ if (prev->idx >= nodep->idx) {
+ fprintf(stderr, "Previous node index "
+ ">= current node index,\n"
+ " prev: %p prev->idx: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx",
+ prev, prev->idx, nodep, nodep->idx);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * Nodes occur in ascending order, based on each
+ * node's starting index.
+ */
+ if ((prev->idx + MASK_BITS + prev->num_after - 1)
+ >= nodep->idx) {
+ fprintf(stderr, "Previous node bit range "
+ "overlap with current node bit range,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+ error_detected = true;
+ break;
+ }
+
+ /*
+ * When the node has all mask bits set, it shouldn't
+ * be adjacent to the last bit described by the
+ * previous node.
+ */
+ if (nodep->mask == ~(mask_t) 0 &&
+ prev->idx + MASK_BITS + prev->num_after == nodep->idx) {
+ fprintf(stderr, "Current node has mask with "
+ "all bits set and is adjacent to the "
+ "previous node,\n"
+ " prev: %p prev->idx: 0x%lx "
+ "prev->num_after: 0x%lx\n"
+ " nodep: %p nodep->idx: 0x%lx "
+ "nodep->num_after: 0x%lx\n"
+ " MASK_BITS: %lu",
+ prev, prev->idx, prev->num_after,
+ nodep, nodep->idx, nodep->num_after,
+ MASK_BITS);
+
+ error_detected = true;
+ break;
+ }
+ }
+ }
+
+ if (!error_detected) {
+ /*
+ * Is sum of bits set in each node equal to the count
+ * of total bits set.
+ */
+ if (s->num_set != total_bits_set) {
+ fprintf(stderr, "Number of bits set missmatch,\n"
+ " s->num_set: 0x%lx total_bits_set: 0x%lx",
+ s->num_set, total_bits_set);
+
+ error_detected = true;
+ }
+ }
+
+ if (error_detected) {
+ fputs(" dump_internal:\n", stderr);
+ sparsebit_dump_internal(stderr, s, 4);
+ abort();
+ }
+}
+
+
+#ifdef FUZZ
+/* A simple but effective fuzzing driver. Look for bugs with the help
+ * of some invariants and of a trivial representation of sparsebit.
+ * Just use 512 bytes of /dev/zero and /dev/urandom as inputs, and let
+ * afl-fuzz do the magic. :)
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+
+/*
+ * One entry of the fuzzer's reference model: the inclusive index range
+ * [first, last] was recorded as set (set == true) or cleared (set == false).
+ */
+struct range {
+ sparsebit_idx_t first, last;
+ bool set;
+};
+
+/* Sparsebit under test; allocated in main(). */
+struct sparsebit *s;
+/* Recorded operations; get_value() lets later entries override earlier ones. */
+struct range ranges[1000];
+/* Number of valid entries at the front of ranges[]. */
+int num_ranges;
+
+/*
+ * Reference lookup: report whether the model considers bit @idx set.
+ * The most recently recorded range that covers @idx decides the answer;
+ * an index covered by no recorded range is clear.
+ */
+static bool get_value(sparsebit_idx_t idx)
+{
+	int pos = num_ranges;
+
+	while (pos-- > 0) {
+		const struct range *r = &ranges[pos];
+
+		if (idx >= r->first && idx <= r->last)
+			return r->set;
+	}
+
+	return false;
+}
+
+/*
+ * Apply one fuzz operation, selected by @code, to the sparsebit under test
+ * and check the result against the ranges[] reference model.
+ *
+ * @first and @last may arrive in either order; they are normalized so that
+ * first <= last, and num is the number of indices in [first, last].
+ * Single-bit operations use @first only.  The process exits with status 0
+ * once ranges[] is full (1000 entries).
+ */
+static void operate(int code, sparsebit_idx_t first, sparsebit_idx_t last)
+{
+ sparsebit_num_t num;
+ sparsebit_idx_t next;
+
+ /* Normalize to first <= last and compute the inclusive length. */
+ if (first < last) {
+ num = last - first + 1;
+ } else {
+ num = first - last + 1;
+ first = last;
+ last = first + num - 1;
+ }
+
+ switch (code) {
+ /* Set a single bit; record it unless the model already has it set. */
+ case 0:
+ sparsebit_set(s, first);
+ assert(sparsebit_is_set(s, first));
+ assert(!sparsebit_is_clear(s, first));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = true };
+ break;
+ /* Clear a single bit; record it unless the model already has it clear. */
+ case 1:
+ sparsebit_clear(s, first);
+ assert(!sparsebit_is_set(s, first));
+ assert(sparsebit_is_clear(s, first));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (!get_value(first))
+ return;
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = first, .set = false };
+ break;
+ /* Query a single bit and compare with the model. */
+ case 2:
+ assert(sparsebit_is_set(s, first) == get_value(first));
+ assert(sparsebit_is_clear(s, first) == !get_value(first));
+ break;
+ /* Check first-set/first-clear, then set every bit and reset the model. */
+ case 3:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_set_all(s);
+ assert(!sparsebit_any_clear(s));
+ assert(sparsebit_all_set(s));
+ num_ranges = 0;
+ ranges[num_ranges++] = (struct range)
+ { .first = 0, .last = ~(sparsebit_idx_t)0, .set = true };
+ break;
+ /* Check first-set/first-clear, then clear every bit and empty the model. */
+ case 4:
+ if (sparsebit_any_set(s))
+ assert(get_value(sparsebit_first_set(s)));
+ if (sparsebit_any_clear(s))
+ assert(!get_value(sparsebit_first_clear(s)));
+ sparsebit_clear_all(s);
+ assert(!sparsebit_any_set(s));
+ assert(sparsebit_all_clear(s));
+ num_ranges = 0;
+ break;
+ /* Next set bit after first: a result of 0 is accepted as "none found". */
+ case 5:
+ next = sparsebit_next_set(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || get_value(next));
+ break;
+ /* Next clear bit after first: a result of 0 is accepted as "none found". */
+ case 6:
+ next = sparsebit_next_clear(s, first);
+ assert(next == 0 || next > first);
+ assert(next == 0 || !get_value(next));
+ break;
+ /* Cross-check sparsebit_is_set_num() against the next-clear/next-set searches. */
+ case 7:
+ next = sparsebit_next_clear(s, first);
+ if (sparsebit_is_set_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_set(s, first - 1);
+ else if (sparsebit_any_set(s))
+ next = sparsebit_first_set(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_clear(s, first) || next <= last);
+ }
+ break;
+ /* Cross-check sparsebit_is_clear_num() against the next-set/next-clear searches. */
+ case 8:
+ next = sparsebit_next_set(s, first);
+ if (sparsebit_is_clear_num(s, first, num)) {
+ assert(next == 0 || next > last);
+ if (first)
+ next = sparsebit_next_clear(s, first - 1);
+ else if (sparsebit_any_clear(s))
+ next = sparsebit_first_clear(s);
+ else
+ return;
+ assert(next == first);
+ } else {
+ assert(sparsebit_is_set(s, first) || next <= last);
+ }
+ break;
+ /* Set the whole range [first, last] and record it in the model. */
+ case 9:
+ sparsebit_set_num(s, first, num);
+ assert(sparsebit_is_set_num(s, first, num));
+ assert(!sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_set(s));
+ assert(!sparsebit_all_clear(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = true };
+ break;
+ /* Clear the whole range [first, last] and record it in the model. */
+ case 10:
+ sparsebit_clear_num(s, first, num);
+ assert(!sparsebit_is_set_num(s, first, num));
+ assert(sparsebit_is_clear_num(s, first, num));
+ assert(sparsebit_any_clear(s));
+ assert(!sparsebit_all_set(s));
+ if (num_ranges == 1000)
+ exit(0);
+ ranges[num_ranges++] = (struct range)
+ { .first = first, .last = last, .set = false };
+ break;
+ /* Run the internal consistency checker. */
+ case 11:
+ sparsebit_validate_internal(s);
+ break;
+ /* Opcodes 12-15 (main() masks with 0xf) are no-ops. */
+ default:
+ break;
+ }
+}
+
+/*
+ * Fetch the next input byte from stdin.  When the input is exhausted
+ * (getchar() returns EOF) the process exits with status 0.
+ */
+unsigned char get8(void)
+{
+	int ch = getchar();
+
+	if (ch != EOF)
+		return ch;
+
+	exit(0);
+}
+
+/*
+ * Build a 64-bit value from the next eight input bytes, most significant
+ * byte first.
+ */
+uint64_t get64(void)
+{
+	uint64_t value = 0;
+	int i;
+
+	for (i = 0; i < 8; i++)
+		value = (value << 8) | get8();
+
+	return value;
+}
+
+/*
+ * Fuzz driver entry point: allocate the sparsebit, then repeatedly decode
+ * a 17-byte record (opcode byte masked to 4 bits, two 64-bit operands)
+ * from stdin and apply it.  The loop has no exit of its own; get8()
+ * terminates the process when the input ends.
+ */
+int main(void)
+{
+	s = sparsebit_alloc();
+
+	while (1) {
+		uint8_t code = get8() & 0xf;
+		uint64_t arg1 = get64();
+		uint64_t arg2 = get64();
+
+		operate(code, arg1, arg2);
+	}
+}
+#endif
diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
new file mode 100644
index 000000000..8e04c0b16
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/test_util.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/test_util.c
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "test_util.h"
+
+/*
+ * Parse a size string: a number followed by an optional, case-insensitive
+ * scale letter.
+ *
+ * The number is converted by strtoull() with base 0, so a "0x" prefix
+ * selects hexadecimal and a leading "0" selects octal.  The scale letter
+ * is one of 'b' (bytes, same as no letter), 'k', 'm', 'g' or 't', which
+ * shift the number left by 0, 10, 20, 30 and 40 bits respectively.
+ * Characters after the scale letter are not examined.
+ *
+ * Returns the size in bytes.  A TEST_ASSERT fires if @size is NULL, does
+ * not start with a digit, has an unknown scale letter, or if parsing or
+ * scaling overflows.
+ */
+size_t parse_size(const char *size)
+{
+	size_t base;
+	char *scale;
+	int shift = 0;
+
+	/*
+	 * The <ctype.h> classifiers require a value representable as
+	 * unsigned char (or EOF); passing a negative plain char is undefined
+	 * behaviour, hence the casts here and on tolower() below.
+	 */
+	TEST_ASSERT(size && isdigit((unsigned char)size[0]),
+		    "Need at least one digit in '%s'", size);
+
+	base = strtoull(size, &scale, 0);
+
+	/* strtoull() reports an out-of-range value as ULLONG_MAX. */
+	TEST_ASSERT(base != ULLONG_MAX, "Overflow parsing size!");
+
+	switch (tolower((unsigned char)*scale)) {
+	case 't':
+		shift = 40;
+		break;
+	case 'g':
+		shift = 30;
+		break;
+	case 'm':
+		shift = 20;
+		break;
+	case 'k':
+		shift = 10;
+		break;
+	case 'b':
+	case '\0':
+		shift = 0;
+		break;
+	default:
+		TEST_ASSERT(false, "Unknown size letter %c", *scale);
+	}
+
+	/* Shifting left and back must be lossless, else high bits were dropped. */
+	TEST_ASSERT((base << shift) >> shift == base, "Overflow scaling size!");
+
+	return base << shift;
+}
+
+/* Convert @ts to a single signed count of nanoseconds. */
+int64_t timespec_to_ns(struct timespec ts)
+{
+	int64_t sec_as_ns = (int64_t)ts.tv_sec * 1000000000LL;
+
+	return sec_as_ns + (int64_t)ts.tv_nsec;
+}
+
+/*
+ * Return @ts advanced by @ns nanoseconds; @ns may be negative.
+ *
+ * The result is normalized so that tv_nsec lies in [0, 999999999] and
+ * tv_sec carries the sign.  timespec_to_ns() of the result equals
+ * timespec_to_ns(@ts) + @ns.
+ */
+struct timespec timespec_add_ns(struct timespec ts, int64_t ns)
+{
+	struct timespec res;
+	int64_t total_nsec = (int64_t)ts.tv_nsec + ns;
+
+	res.tv_sec = ts.tv_sec + total_nsec / 1000000000LL;
+	res.tv_nsec = total_nsec % 1000000000LL;
+
+	/*
+	 * C division truncates toward zero, so a negative total leaves a
+	 * negative remainder.  Borrow one second to bring tv_nsec back into
+	 * the valid range.
+	 */
+	if (res.tv_nsec < 0) {
+		res.tv_nsec += 1000000000LL;
+		res.tv_sec--;
+	}
+
+	return res;
+}
+
+/* Return @ts1 + @ts2, computed via their nanosecond totals. */
+struct timespec timespec_add(struct timespec ts1, struct timespec ts2)
+{
+	struct timespec zero = {0};
+	int64_t total_ns = timespec_to_ns(ts1) + timespec_to_ns(ts2);
+
+	return timespec_add_ns(zero, total_ns);
+}
+
+/* Return @ts1 - @ts2, computed via their nanosecond totals. */
+struct timespec timespec_sub(struct timespec ts1, struct timespec ts2)
+{
+	struct timespec zero = {0};
+	int64_t delta_ns = timespec_to_ns(ts1) - timespec_to_ns(ts2);
+
+	return timespec_add_ns(zero, delta_ns);
+}
+
+/*
+ * Return the time elapsed since @start, which is compared against the
+ * current CLOCK_MONOTONIC reading.
+ */
+struct timespec timespec_diff_now(struct timespec start)
+{
+	struct timespec now;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	return timespec_sub(now, start);
+}
+
+/* Return @ts divided by @divisor, using integer division on nanoseconds. */
+struct timespec timespec_div(struct timespec ts, int divisor)
+{
+	struct timespec zero = {0};
+	int64_t total_ns = timespec_to_ns(ts);
+
+	return timespec_add_ns(zero, total_ns / divisor);
+}
+
+/*
+ * Print a printf-style message to stdout followed by ", skipping test"
+ * and a newline.  @fmt must not be NULL.
+ */
+void print_skip(const char *fmt, ...)
+{
+	va_list args;
+
+	assert(fmt);
+
+	va_start(args, fmt);
+	vprintf(fmt, args);
+	va_end(args);
+
+	puts(", skipping test");
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/handlers.S b/tools/testing/selftests/kvm/lib/x86_64/handlers.S
new file mode 100644
index 000000000..aaf7bc7d2
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/handlers.S
@@ -0,0 +1,81 @@
+/*
+ * Common exception entry, reached by a jmp from the per-vector stubs that
+ * the HANDLERS macro generates.  Each stub leaves the vector number on top
+ * of the stack with an error code beneath it (a zero pushed by the stub
+ * for has_error=0 vectors; presumably the CPU-supplied code otherwise).
+ *
+ * Saves the general-purpose registers except %rsp, calls route_exception
+ * with %rdi pointing at the saved registers, restores them, drops the
+ * vector and error code, and returns with iretq.
+ */
+handle_exception:
+ push %r15
+ push %r14
+ push %r13
+ push %r12
+ push %r11
+ push %r10
+ push %r9
+ push %r8
+
+ push %rdi
+ push %rsi
+ push %rbp
+ push %rbx
+ push %rdx
+ push %rcx
+ push %rax
+ /* %rsp now addresses the saved %rax; hand that address to the C handler. */
+ mov %rsp, %rdi
+
+ call route_exception
+
+ /* Restore in exactly the reverse order of the pushes above. */
+ pop %rax
+ pop %rcx
+ pop %rdx
+ pop %rbx
+ pop %rbp
+ pop %rsi
+ pop %rdi
+ pop %r8
+ pop %r9
+ pop %r10
+ pop %r11
+ pop %r12
+ pop %r13
+ pop %r14
+ pop %r15
+
+ /* Discard vector and error code. */
+ add $16, %rsp
+ iretq
+
+/*
+ * Build the handle_exception wrappers which push the vector/error code on the
+ * stack and an array of pointers to those wrappers.
+ *
+ * idt_handlers labels the start of that pointer array in .rodata; every
+ * expansion of HANDLERS appends one .quad per generated wrapper.
+ */
+.pushsection .rodata
+.globl idt_handlers
+idt_handlers:
+.popsection
+
+/*
+ * HANDLERS has_error from to
+ *
+ * Emit one 8-byte-aligned wrapper for each vector in [from, to].
+ * has_error=0 makes the wrapper push a zero in the error-code slot so
+ * that handle_exception always finds the same stack layout; has_error=1
+ * skips that push.
+ */
+.macro HANDLERS has_error from to
+ vector = \from
+ .rept \to - \from + 1
+ .align 8
+
+ /* Fetch current address and append it to idt_handlers. */
+ current_handler = .
+.pushsection .rodata
+.quad current_handler
+.popsection
+
+ .if ! \has_error
+ pushq $0
+ .endif
+ pushq $vector
+ jmp handle_exception
+ vector = vector + 1
+ .endr
+.endm
+
+/*
+ * Wrapper code for all 256 vectors, emitted in vector order so that the
+ * idt_handlers array can be indexed by vector number.  Vectors 8, 10-14
+ * and 17 are generated with has_error=1; every other vector gets a zero
+ * pushed in the error-code slot.
+ */
+.global idt_handler_code
+idt_handler_code:
+ HANDLERS has_error=0 from=0 to=7
+ HANDLERS has_error=1 from=8 to=8
+ HANDLERS has_error=0 from=9 to=9
+ HANDLERS has_error=1 from=10 to=14
+ HANDLERS has_error=0 from=15 to=16
+ HANDLERS has_error=1 from=17 to=17
+ HANDLERS has_error=0 from=18 to=255
+
+.section .note.GNU-stack, "", %progbits
diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c
new file mode 100644
index 000000000..f5d2d27be
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/processor.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+
+#ifndef NUM_INTERRUPTS
+#define NUM_INTERRUPTS 256
+#endif
+
+#define DEFAULT_CODE_SELECTOR 0x8
+#define DEFAULT_DATA_SELECTOR 0x10
+
+/* Minimum physical address used for virtual translation tables. */
+#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000
+
+vm_vaddr_t exception_handlers;
+
+/* Virtual translation table structure declarations */
+/*
+ * One 64-bit entry of the top-level (PML4) table.  The bit-field widths
+ * add up to 64; with the usual x86-64 layout the first member is bit 0.
+ * "address" holds a page frame number: users of this struct store
+ * (paddr >> vm->page_shift) and multiply by vm->page_size to read it back.
+ */
+struct pageMapL4Entry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+/*
+ * One 64-bit entry of the second-level (page directory pointer) table.
+ * Same field layout as struct pageMapL4Entry; "address" is the page frame
+ * number of the page directory this entry points to.
+ */
+struct pageDirectoryPointerEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+/*
+ * One 64-bit entry of the third-level (page directory) table.  Same field
+ * layout as struct pageMapL4Entry; "address" is the page frame number of
+ * the page table this entry points to.
+ */
+struct pageDirectoryEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t ignored_06:1;
+ uint64_t page_size:1;
+ uint64_t ignored_11_08:4;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+/*
+ * One 64-bit entry of the lowest-level page table, describing a single
+ * 4K page.  Differs from the upper-level entries by carrying dirty and
+ * global flags; "address" is the page frame number of the mapped page.
+ */
+struct pageTableEntry {
+ uint64_t present:1;
+ uint64_t writable:1;
+ uint64_t user:1;
+ uint64_t write_through:1;
+ uint64_t cache_disable:1;
+ uint64_t accessed:1;
+ uint64_t dirty:1;
+ uint64_t reserved_07:1;
+ uint64_t global:1;
+ uint64_t ignored_11_09:3;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t execute_disable:1;
+};
+
+/*
+ * Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * regs - KVM general-purpose register state
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Prints the sixteen general-purpose registers, rip and rflags from @regs
+ * to @stream as five lines, each preceded by @indent spaces.
+ */
+void regs_dump(FILE *stream, struct kvm_regs *regs,
+ uint8_t indent)
+{
+ fprintf(stream, "%*srax: 0x%.16llx rbx: 0x%.16llx "
+ "rcx: 0x%.16llx rdx: 0x%.16llx\n",
+ indent, "",
+ regs->rax, regs->rbx, regs->rcx, regs->rdx);
+ fprintf(stream, "%*srsi: 0x%.16llx rdi: 0x%.16llx "
+ "rsp: 0x%.16llx rbp: 0x%.16llx\n",
+ indent, "",
+ regs->rsi, regs->rdi, regs->rsp, regs->rbp);
+ fprintf(stream, "%*sr8: 0x%.16llx r9: 0x%.16llx "
+ "r10: 0x%.16llx r11: 0x%.16llx\n",
+ indent, "",
+ regs->r8, regs->r9, regs->r10, regs->r11);
+ fprintf(stream, "%*sr12: 0x%.16llx r13: 0x%.16llx "
+ "r14: 0x%.16llx r15: 0x%.16llx\n",
+ indent, "",
+ regs->r12, regs->r13, regs->r14, regs->r15);
+ fprintf(stream, "%*srip: 0x%.16llx rfl: 0x%.16llx\n",
+ indent, "",
+ regs->rip, regs->rflags);
+}
+
+/*
+ * Segment Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * segment - KVM segment
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM segment given by @segment, to the FILE stream
+ * given by @stream.  Every field of the segment is printed, spread over
+ * three lines that are each preceded by @indent spaces.
+ */
+static void segment_dump(FILE *stream, struct kvm_segment *segment,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.8x "
+ "selector: 0x%.4x type: 0x%.2x\n",
+ indent, "", segment->base, segment->limit,
+ segment->selector, segment->type);
+ fprintf(stream, "%*spresent: 0x%.2x dpl: 0x%.2x "
+ "db: 0x%.2x s: 0x%.2x l: 0x%.2x\n",
+ indent, "", segment->present, segment->dpl,
+ segment->db, segment->s, segment->l);
+ fprintf(stream, "%*sg: 0x%.2x avl: 0x%.2x "
+ "unusable: 0x%.2x padding: 0x%.2x\n",
+ indent, "", segment->g, segment->avl,
+ segment->unusable, segment->padding);
+}
+
+/*
+ * dtable Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * dtable - KVM dtable
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Dumps the state of the KVM dtable given by @dtable, to the FILE stream
+ * given by @stream.  Base, limit and the three padding words are printed
+ * on a single line preceded by @indent spaces.
+ */
+static void dtable_dump(FILE *stream, struct kvm_dtable *dtable,
+ uint8_t indent)
+{
+ fprintf(stream, "%*sbase: 0x%.16llx limit: 0x%.4x "
+ "padding: 0x%.4x 0x%.4x 0x%.4x\n",
+ indent, "", dtable->base, dtable->limit,
+ dtable->padding[0], dtable->padding[1], dtable->padding[2]);
+}
+
+/*
+ * Special Register Dump
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * sregs - KVM special register state
+ * indent - Left margin indent amount
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Prints, in order: the eight segment registers (cs, ds, es, fs, gs, ss,
+ * tr, ldt), the gdt and idt descriptor tables, the control registers with
+ * efer and apic_base, and the pending-interrupt bitmap.  Nested items are
+ * indented two columns further than @indent.
+ */
+void sregs_dump(FILE *stream, struct kvm_sregs *sregs,
+ uint8_t indent)
+{
+ unsigned int i;
+
+ fprintf(stream, "%*scs:\n", indent, "");
+ segment_dump(stream, &sregs->cs, indent + 2);
+ fprintf(stream, "%*sds:\n", indent, "");
+ segment_dump(stream, &sregs->ds, indent + 2);
+ fprintf(stream, "%*ses:\n", indent, "");
+ segment_dump(stream, &sregs->es, indent + 2);
+ fprintf(stream, "%*sfs:\n", indent, "");
+ segment_dump(stream, &sregs->fs, indent + 2);
+ fprintf(stream, "%*sgs:\n", indent, "");
+ segment_dump(stream, &sregs->gs, indent + 2);
+ fprintf(stream, "%*sss:\n", indent, "");
+ segment_dump(stream, &sregs->ss, indent + 2);
+ fprintf(stream, "%*str:\n", indent, "");
+ segment_dump(stream, &sregs->tr, indent + 2);
+ fprintf(stream, "%*sldt:\n", indent, "");
+ segment_dump(stream, &sregs->ldt, indent + 2);
+
+ fprintf(stream, "%*sgdt:\n", indent, "");
+ dtable_dump(stream, &sregs->gdt, indent + 2);
+ fprintf(stream, "%*sidt:\n", indent, "");
+ dtable_dump(stream, &sregs->idt, indent + 2);
+
+ fprintf(stream, "%*scr0: 0x%.16llx cr2: 0x%.16llx "
+ "cr3: 0x%.16llx cr4: 0x%.16llx\n",
+ indent, "",
+ sregs->cr0, sregs->cr2, sregs->cr3, sregs->cr4);
+ fprintf(stream, "%*scr8: 0x%.16llx efer: 0x%.16llx "
+ "apic_base: 0x%.16llx\n",
+ indent, "",
+ sregs->cr8, sregs->efer, sregs->apic_base);
+
+ fprintf(stream, "%*sinterrupt_bitmap:\n", indent, "");
+ /* One 64-bit word per line; the count is KVM_NR_INTERRUPTS rounded up to words. */
+ for (i = 0; i < (KVM_NR_INTERRUPTS + 63) / 64; i++) {
+ fprintf(stream, "%*s%.16llx\n", indent + 2, "",
+ sregs->interrupt_bitmap[i]);
+ }
+}
+
+/*
+ * Allocate the guest's top-level page table (PML4) on first use.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   pgd_memslot - Memory slot the table page is allocated from
+ *
+ * Asserts that the VM uses VM_MODE_PXXV48_4K.  Calls after the first one
+ * leave vm->pgd untouched.
+ */
+void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
+{
+	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+	/* Nothing to do once the top-level table exists. */
+	if (vm->pgd_created)
+		return;
+
+	vm->pgd = vm_phy_page_alloc(vm, KVM_GUEST_PAGE_TABLE_MIN_PADDR,
+				    pgd_memslot);
+	vm->pgd_created = true;
+}
+
+/*
+ * Map one 4K guest virtual page to a guest physical page.
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vaddr - Guest virtual address, must be page aligned and valid
+ * paddr - Guest physical address, must be page aligned and within max_gfn
+ * pgd_memslot - Memory slot used for any page-table pages allocated here
+ *
+ * Walks the four table levels starting at vm->pgd, allocating and marking
+ * present+writable any missing intermediate table, then writes the final
+ * page table entry.  The top-level table itself is not allocated here.
+ */
+void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr,
+ uint32_t pgd_memslot)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ TEST_ASSERT((vaddr % vm->page_size) == 0,
+ "Virtual address not on page boundary,\n"
+ " vaddr: 0x%lx vm->page_size: 0x%x",
+ vaddr, vm->page_size);
+ TEST_ASSERT(sparsebit_is_set(vm->vpages_valid,
+ (vaddr >> vm->page_shift)),
+ "Invalid virtual address, vaddr: 0x%lx",
+ vaddr);
+ TEST_ASSERT((paddr % vm->page_size) == 0,
+ "Physical address not on page boundary,\n"
+ " paddr: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->page_size);
+ TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+ "Physical address beyond beyond maximum supported,\n"
+ " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+ paddr, vm->max_gfn, vm->page_size);
+
+ /*
+ * Split vaddr into four 9-bit table indices: index[0] selects the
+ * page table entry, index[3] the top-level (PML4) entry.
+ */
+ index[0] = (vaddr >> 12) & 0x1ffu;
+ index[1] = (vaddr >> 21) & 0x1ffu;
+ index[2] = (vaddr >> 30) & 0x1ffu;
+ index[3] = (vaddr >> 39) & 0x1ffu;
+
+ /* Allocate page directory pointer table if not present. */
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present) {
+ pml4e[index[3]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pml4e[index[3]].writable = true;
+ pml4e[index[3]].present = true;
+ }
+
+ /* Allocate page directory table if not present. */
+ struct pageDirectoryPointerEntry *pdpe;
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present) {
+ pdpe[index[2]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pdpe[index[2]].writable = true;
+ pdpe[index[2]].present = true;
+ }
+
+ /* Allocate page table if not present. */
+ struct pageDirectoryEntry *pde;
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present) {
+ pde[index[1]].address = vm_phy_page_alloc(vm,
+ KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot)
+ >> vm->page_shift;
+ pde[index[1]].writable = true;
+ pde[index[1]].present = true;
+ }
+
+ /* Fill in page table entry. */
+ struct pageTableEntry *pte;
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ /* An existing mapping for vaddr is overwritten without any check. */
+ pte[index[0]].address = paddr >> vm->page_shift;
+ pte[index[0]].writable = true;
+ pte[index[0]].present = 1;
+}
+
+/*
+ * Dump the guest page tables.
+ *
+ * Input Args:
+ * stream - Output FILE stream
+ * vm - Virtual Machine
+ * indent - Left margin indent amount
+ *
+ * Prints a two-line column header, then one line for every present entry
+ * at each of the four table levels, visiting the tables depth-first.
+ * Prints nothing if the top-level table has not been created.
+ */
+void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent)
+{
+ struct pageMapL4Entry *pml4e, *pml4e_start;
+ struct pageDirectoryPointerEntry *pdpe, *pdpe_start;
+ struct pageDirectoryEntry *pde, *pde_start;
+ struct pageTableEntry *pte, *pte_start;
+
+ if (!vm->pgd_created)
+ return;
+
+ fprintf(stream, "%*s "
+ " no\n", indent, "");
+ fprintf(stream, "%*s index hvaddr gpaddr "
+ "addr w exec dirty\n",
+ indent, "");
+ pml4e_start = (struct pageMapL4Entry *) addr_gpa2hva(vm,
+ vm->pgd);
+ /* Each table holds 512 entries (indices 0..0x1ff). */
+ for (uint16_t n1 = 0; n1 <= 0x1ffu; n1++) {
+ pml4e = &pml4e_start[n1];
+ if (!pml4e->present)
+ continue;
+ fprintf(stream, "%*spml4e 0x%-3zx %p 0x%-12lx 0x%-10lx %u "
+ " %u\n",
+ indent, "",
+ pml4e - pml4e_start, pml4e,
+ addr_hva2gpa(vm, pml4e), (uint64_t) pml4e->address,
+ pml4e->writable, pml4e->execute_disable);
+
+ pdpe_start = addr_gpa2hva(vm, pml4e->address
+ * vm->page_size);
+ for (uint16_t n2 = 0; n2 <= 0x1ffu; n2++) {
+ pdpe = &pdpe_start[n2];
+ if (!pdpe->present)
+ continue;
+ fprintf(stream, "%*spdpe 0x%-3zx %p 0x%-12lx 0x%-10lx "
+ "%u %u\n",
+ indent, "",
+ pdpe - pdpe_start, pdpe,
+ addr_hva2gpa(vm, pdpe),
+ (uint64_t) pdpe->address, pdpe->writable,
+ pdpe->execute_disable);
+
+ pde_start = addr_gpa2hva(vm,
+ pdpe->address * vm->page_size);
+ for (uint16_t n3 = 0; n3 <= 0x1ffu; n3++) {
+ pde = &pde_start[n3];
+ if (!pde->present)
+ continue;
+ fprintf(stream, "%*spde 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u\n",
+ indent, "", pde - pde_start, pde,
+ addr_hva2gpa(vm, pde),
+ (uint64_t) pde->address, pde->writable,
+ pde->execute_disable);
+
+ pte_start = addr_gpa2hva(vm,
+ pde->address * vm->page_size);
+ for (uint16_t n4 = 0; n4 <= 0x1ffu; n4++) {
+ pte = &pte_start[n4];
+ if (!pte->present)
+ continue;
+ /*
+ * The last column is the virtual page
+ * number rebuilt from the four 9-bit
+ * table indices.
+ */
+ fprintf(stream, "%*spte 0x%-3zx %p "
+ "0x%-12lx 0x%-10lx %u %u "
+ " %u 0x%-10lx\n",
+ indent, "",
+ pte - pte_start, pte,
+ addr_hva2gpa(vm, pte),
+ (uint64_t) pte->address,
+ pte->writable,
+ pte->execute_disable,
+ pte->dirty,
+ ((uint64_t) n1 << 27)
+ | ((uint64_t) n2 << 18)
+ | ((uint64_t) n3 << 9)
+ | ((uint64_t) n4));
+ }
+ }
+ }
+ }
+}
+
+/*
+ * Set Unusable Segment
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ *   segp - Pointer to segment register
+ *
+ * Return: None
+ *
+ * Zeroes the segment register pointed to by @segp and then flags it as
+ * unusable.
+ */
+static void kvm_seg_set_unusable(struct kvm_segment *segp)
+{
+	memset(segp, 0, sizeof(struct kvm_segment));
+
+	segp->unusable = 1;
+}
+
+/*
+ * Write the GDT descriptor that corresponds to a KVM segment.
+ *
+ * Input Args:
+ * vm - Virtual Machine whose GDT (vm->gdt) is written
+ * segp - Segment state to encode; segp->selector picks the GDT slot
+ *
+ * Return: None
+ *
+ * The descriptor sits at slot (selector >> 3), with slots 8 bytes apart.
+ * The segment's base and limit are split across the descriptor's
+ * base0..base3 and limit0/limit1 fields.
+ */
+static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
+{
+ void *gdt = addr_gva2hva(vm, vm->gdt);
+ struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
+
+ desc->limit0 = segp->limit & 0xFFFF;
+ desc->base0 = segp->base & 0xFFFF;
+ desc->base1 = segp->base >> 16;
+ desc->type = segp->type;
+ desc->s = segp->s;
+ desc->dpl = segp->dpl;
+ desc->p = segp->present;
+ desc->limit1 = segp->limit >> 16;
+ desc->avl = segp->avl;
+ desc->l = segp->l;
+ desc->db = segp->db;
+ desc->g = segp->g;
+ desc->base2 = segp->base >> 24;
+ /*
+ * The upper 32 base bits are stored only when s == 0 (presumably
+ * because only system descriptors have the wider format - confirm
+ * against struct desc64).
+ */
+ if (!segp->s)
+ desc->base3 = segp->base >> 32;
+}
+
+
+/*
+ * Set Long Mode Flat Kernel Code Segment
+ *
+ * Input Args:
+ *   vm - VM whose GDT is being filled, or NULL to only write segp
+ *   selector - selector value
+ *
+ * Output Args:
+ *   segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Initializes @segp as a present, 4G-limit, long-mode code segment with
+ * the given @selector and, when @vm is non-NULL, mirrors it into the GDT.
+ */
+static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
+	struct kvm_segment *segp)
+{
+	/* kFlagCode | kFlagCodeAccessed | kFlagCodeReadable */
+	const uint8_t code_type = 0x08 | 0x01 | 0x02;
+
+	memset(segp, 0, sizeof(*segp));
+
+	segp->present = 1;
+	segp->s = 0x1; /* kTypeCodeData */
+	segp->type = code_type;
+	segp->l = true;
+	segp->g = true;
+	segp->limit = 0xFFFFFFFFu;
+	segp->selector = selector;
+
+	if (!vm)
+		return;
+
+	kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+/*
+ * Set Long Mode Flat Kernel Data Segment
+ *
+ * Input Args:
+ *   vm - VM whose GDT is being filled, or NULL to only write segp
+ *   selector - selector value
+ *
+ * Output Args:
+ *   segp - Pointer to KVM segment
+ *
+ * Return: None
+ *
+ * Initializes @segp as a present, 4G-limit data segment with the given
+ * @selector and, when @vm is non-NULL, mirrors it into the GDT.
+ */
+static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
+	struct kvm_segment *segp)
+{
+	/* kFlagData | kFlagDataAccessed | kFlagDataWritable */
+	const uint8_t data_type = 0x00 | 0x01 | 0x02;
+
+	memset(segp, 0, sizeof(*segp));
+
+	segp->present = true;
+	segp->s = 0x1; /* kTypeCodeData */
+	segp->type = data_type;
+	segp->g = true;
+	segp->limit = 0xFFFFFFFFu;
+	segp->selector = selector;
+
+	if (!vm)
+		return;
+
+	kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+/*
+ * Address Guest Virtual to Guest Physical
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * gva - VM virtual address
+ *
+ * Output Args: None
+ *
+ * Return: Equivalent VM physical address
+ *
+ * Translates @gva by walking the four page-table levels from vm->pgd.
+ * If the top-level table does not exist or any level has no present
+ * entry for @gva, the test fails via TEST_FAIL.
+ */
+vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva)
+{
+ uint16_t index[4];
+ struct pageMapL4Entry *pml4e;
+ struct pageDirectoryPointerEntry *pdpe;
+ struct pageDirectoryEntry *pde;
+ struct pageTableEntry *pte;
+
+ TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+ "unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+ /* Four 9-bit table indices; index[3] is the top level. */
+ index[0] = (gva >> 12) & 0x1ffu;
+ index[1] = (gva >> 21) & 0x1ffu;
+ index[2] = (gva >> 30) & 0x1ffu;
+ index[3] = (gva >> 39) & 0x1ffu;
+
+ if (!vm->pgd_created)
+ goto unmapped_gva;
+ pml4e = addr_gpa2hva(vm, vm->pgd);
+ if (!pml4e[index[3]].present)
+ goto unmapped_gva;
+
+ pdpe = addr_gpa2hva(vm, pml4e[index[3]].address * vm->page_size);
+ if (!pdpe[index[2]].present)
+ goto unmapped_gva;
+
+ pde = addr_gpa2hva(vm, pdpe[index[2]].address * vm->page_size);
+ if (!pde[index[1]].present)
+ goto unmapped_gva;
+
+ pte = addr_gpa2hva(vm, pde[index[1]].address * vm->page_size);
+ if (!pte[index[0]].present)
+ goto unmapped_gva;
+
+ /* Page frame base plus the 12-bit offset within the page. */
+ return (pte[index[0]].address * vm->page_size) + (gva & 0xfffu);
+
+unmapped_gva:
+ TEST_FAIL("No mapping for vm virtual address, gva: 0x%lx", gva);
+ /* Also exit explicitly, so no path falls off the end without a return value. */
+ exit(EXIT_FAILURE);
+}
+
+/*
+ * Allocate the guest GDT on first use and describe it in @dt.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   gdt_memslot - Memory slot used to back the GDT page
+ *   pgd_memslot - Memory slot used for page tables mapping the GDT
+ *
+ * Output Args:
+ *   dt - Descriptor-table register image (base and limit) for the GDT
+ *
+ * Return: None
+ *
+ * The GDT occupies one host-page-sized allocation that is shared by all
+ * vCPUs of @vm (vm->gdt is allocated only while it is still zero).
+ */
+static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot,
+			  int pgd_memslot)
+{
+	if (!vm->gdt)
+		vm->gdt = vm_vaddr_alloc(vm, getpagesize(),
+			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+	dt->base = vm->gdt;
+	/*
+	 * A descriptor-table limit is the offset of the last valid byte, not
+	 * the table size, so a one-page table has limit page_size - 1.
+	 */
+	dt->limit = getpagesize() - 1;
+}
+
+/*
+ * Allocate the guest TSS on first use and set up the segment that refers
+ * to it.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   selector - Selector value for the TSS descriptor
+ *   gdt_memslot - Memory slot used to back the TSS page
+ *   pgd_memslot - Memory slot used for page tables mapping the TSS
+ *
+ * Output Args:
+ *   segp - Pointer to KVM segment, filled in as the task register
+ *
+ * Return: None
+ *
+ * The segment is also written into the GDT of @vm.
+ */
+static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
+				int selector, int gdt_memslot,
+				int pgd_memslot)
+{
+	if (!vm->tss)
+		vm->tss = vm_vaddr_alloc(vm, getpagesize(),
+			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+	memset(segp, 0, sizeof(*segp));
+
+	segp->selector = selector;
+	segp->present = 1;
+	segp->type = 0xb;
+	segp->limit = 0x67;
+	segp->base = vm->tss;
+
+	kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+/*
+ * Program a vCPU's special registers for 64-bit flat-mode execution.
+ *
+ * Input Args:
+ * vm - Virtual Machine
+ * vcpuid - VCPU ID
+ * pgd_memslot - Memory slot for page tables created along the way
+ * gdt_memslot - Memory slot for the GDT and TSS pages
+ *
+ * Return: None
+ *
+ * Reads the current sregs, installs the GDT, enables paging and long
+ * mode, fills cs/ds/es/tr, points cr3 at the VM's top-level page table
+ * and writes the result back.  Only VM_MODE_PXXV48_4K is supported.
+ */
+static void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
+{
+ struct kvm_sregs sregs;
+
+ /* Set mode specific system register values. */
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+
+ sregs.idt.limit = 0;
+
+ kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
+
+ switch (vm->mode) {
+ case VM_MODE_PXXV48_4K:
+ /* cr0 is replaced outright; cr4 and efer keep their existing bits. */
+ sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
+ sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR;
+ sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
+
+ kvm_seg_set_unusable(&sregs.ldt);
+ kvm_seg_set_kernel_code_64bit(vm, DEFAULT_CODE_SELECTOR, &sregs.cs);
+ kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.ds);
+ kvm_seg_set_kernel_data_64bit(vm, DEFAULT_DATA_SELECTOR, &sregs.es);
+ /* 0x18 is the GDT slot right after the code (0x8) and data (0x10) slots. */
+ kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot);
+ break;
+
+ default:
+ TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode);
+ }
+
+ sregs.cr3 = vm->pgd;
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+}
+
+/*
+ * Add a vCPU with default settings.
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID to create
+ *   guest_code - Guest entry point, loaded into rip
+ *
+ * Return: None
+ *
+ * Allocates a guest stack of DEFAULT_STACK_PGS pages, creates and sets up
+ * the vCPU, points rsp at the top of the stack and rip at @guest_code,
+ * and sets the MP state to 0.
+ */
+void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
+{
+	const size_t stack_bytes = DEFAULT_STACK_PGS * getpagesize();
+	struct kvm_mp_state mp_state;
+	struct kvm_regs regs;
+	vm_vaddr_t stack_base;
+
+	stack_base = vm_vaddr_alloc(vm, stack_bytes,
+				    DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
+
+	/* Create VCPU */
+	vm_vcpu_add(vm, vcpuid);
+	vcpu_setup(vm, vcpuid, 0, 0);
+
+	/* Setup guest general purpose registers */
+	vcpu_regs_get(vm, vcpuid, &regs);
+	regs.rflags |= 0x2;
+	regs.rsp = stack_base + stack_bytes;
+	regs.rip = (unsigned long) guest_code;
+	vcpu_regs_set(vm, vcpuid, &regs);
+
+	/* Setup the MP state */
+	mp_state.mp_state = 0;
+	vcpu_set_mp_state(vm, vcpuid, &mp_state);
+}
+
+/*
+ * Allocate an instance of struct kvm_cpuid2
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the allocated struct. The caller is responsible
+ * for freeing this struct.
+ *
+ * struct kvm_cpuid2 ends in a variable-length array of entries, so the
+ * allocation is the header plus room for a fixed default of 100 entries.
+ * Only the nent field is initialized.  Aborts if the allocation fails.
+ */
+static struct kvm_cpuid2 *allocate_kvm_cpuid2(void)
+{
+	const int nent = 100;
+	struct kvm_cpuid2 *cpuid;
+
+	cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
+	if (cpuid == NULL) {
+		perror("malloc");
+		abort();
+	}
+
+	cpuid->nent = nent;
+
+	return cpuid;
+}
+
+/*
+ * KVM Supported CPUID Get
+ *
+ * Input Args: None
+ *
+ * Output Args:
+ *
+ * Return: The supported KVM CPUID
+ *
+ * Get the guest CPUID supported by KVM.  The result is fetched once with
+ * KVM_GET_SUPPORTED_CPUID and kept in a function-static pointer; later
+ * calls return that same pointer.  Exits with KSFT_SKIP if KVM_DEV_PATH
+ * cannot be opened.
+ */
+struct kvm_cpuid2 *kvm_get_supported_cpuid(void)
+{
+	static struct kvm_cpuid2 *cpuid;
+	int fd, rc;
+
+	if (!cpuid) {
+		cpuid = allocate_kvm_cpuid2();
+
+		fd = open(KVM_DEV_PATH, O_RDONLY);
+		if (fd < 0)
+			exit(KSFT_SKIP);
+
+		rc = ioctl(fd, KVM_GET_SUPPORTED_CPUID, cpuid);
+		TEST_ASSERT(rc == 0, "KVM_GET_SUPPORTED_CPUID failed %d %d\n",
+			    rc, errno);
+
+		close(fd);
+	}
+
+	return cpuid;
+}
+
+/*
+ * Locate a cpuid entry.
+ *
+ * Input Args:
+ *   function: The function of the cpuid entry to find.
+ *   index: The index of the cpuid entry.
+ *
+ * Output Args: None
+ *
+ * Return: A pointer to the first matching entry in the supported-CPUID
+ * table.  Never returns NULL; a missing entry trips a TEST_ASSERT.
+ */
+struct kvm_cpuid_entry2 *
+kvm_get_supported_cpuid_index(uint32_t function, uint32_t index)
+{
+	struct kvm_cpuid2 *cpuid = kvm_get_supported_cpuid();
+	struct kvm_cpuid_entry2 *found = NULL;
+	struct kvm_cpuid_entry2 *cur;
+	int i;
+
+	for (i = 0; i < cpuid->nent && !found; i++) {
+		cur = &cpuid->entries[i];
+		if (cur->function == function && cur->index == index)
+			found = cur;
+	}
+
+	TEST_ASSERT(found, "Guest CPUID entry not found: (EAX=%x, ECX=%x).",
+		    function, index);
+	return found;
+}
+
+/*
+ * VM VCPU CPUID Set
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU id
+ *   cpuid - The CPUID values to set.
+ *
+ * Output Args: None
+ *
+ * Return: void
+ *
+ * Set the VCPU's CPUID with KVM_SET_CPUID2.  Asserts that the vCPU exists
+ * and that the ioctl succeeds.
+ */
+void vcpu_set_cpuid(struct kvm_vm *vm,
+		uint32_t vcpuid, struct kvm_cpuid2 *cpuid)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int ret;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	ret = ioctl(vcpu->fd, KVM_SET_CPUID2, cpuid);
+	TEST_ASSERT(ret == 0, "KVM_SET_CPUID2 failed, rc: %i errno: %i",
+		    ret, errno);
+}
+
+/*
+ * Create a VM with default settings and one vCPU.
+ *
+ * Input Args:
+ *   vcpuid - VCPU ID of the first vCPU
+ *   extra_mem_pages - Pages of guest memory the caller will add later;
+ *                     used only to size extra room for page tables
+ *   guest_code - Guest entry point for the vCPU
+ *
+ * Return: Pointer to the created VM.
+ *
+ * Loads the running test binary into the guest, creates the in-kernel
+ * irqchip and adds the first vCPU.
+ */
+struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
+				 void *guest_code)
+{
+	/*
+	 * Page tables are largest when only 4K pages are used.  Mapping N
+	 * extra pages then needs N/512 + N/512^2 + N/512^3 + ... table
+	 * pages, which is always below 2 * N/512.
+	 */
+	uint64_t pt_pages = extra_mem_pages / 512 * 2;
+	struct kvm_vm *vm;
+
+	/* Create VM */
+	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + pt_pages,
+		       O_RDWR);
+
+	/* Setup guest code */
+	kvm_vm_elf_load(vm, program_invocation_name, 0, 0);
+
+	/* Setup IRQ Chip */
+	vm_create_irqchip(vm);
+
+	/* Add the first vCPU. */
+	vm_vcpu_add_default(vm, vcpuid, guest_code);
+
+	return vm;
+}
+
+/*
+ * VCPU Get MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *
+ * Output Args: None
+ *
+ * Return: On success, value of the MSR. On failure a TEST_ASSERT is produced.
+ *
+ * Reads a single MSR of the vCPU with KVM_GET_MSRS; the ioctl must report
+ * exactly one MSR processed.
+ */
+uint64_t vcpu_get_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	/* Header immediately followed by its one entry, as the ioctl expects. */
+	struct {
+		struct kvm_msrs header;
+		struct kvm_msr_entry entry;
+	} req = {
+		.header.nmsrs = 1,
+		.entry.index = msr_index,
+	};
+	int rc;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	rc = ioctl(vcpu->fd, KVM_GET_MSRS, &req.header);
+	TEST_ASSERT(rc == 1, "KVM_GET_MSRS IOCTL failed,\n"
+		" rc: %i errno: %i", rc, errno);
+
+	return req.entry.data;
+}
+
+/*
+ * _VCPU Set MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *   msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: The result of KVM_SET_MSRS.
+ *
+ * Writes a single MSR of the vCPU.  Unlike vcpu_set_msr(), the ioctl
+ * result is handed back to the caller unchecked; only a missing vCPU
+ * trips an assert.
+ */
+int _vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+		  uint64_t msr_value)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	/* Header immediately followed by its one entry, as the ioctl expects. */
+	struct {
+		struct kvm_msrs header;
+		struct kvm_msr_entry entry;
+	} req = {};
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	memset(&req, 0, sizeof(req));
+	req.header.nmsrs = 1;
+	req.entry.index = msr_index;
+	req.entry.data = msr_value;
+
+	return ioctl(vcpu->fd, KVM_SET_MSRS, &req.header);
+}
+
+/*
+ * VCPU Set MSR
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   msr_index - Index of MSR
+ *   msr_value - New value of MSR
+ *
+ * Output Args: None
+ *
+ * Return: Nothing on success. On failure a TEST_ASSERT is produced.
+ *
+ * Asserting wrapper around _vcpu_set_msr(): the call must report that
+ * exactly one MSR was set.
+ */
+void vcpu_set_msr(struct kvm_vm *vm, uint32_t vcpuid, uint64_t msr_index,
+		  uint64_t msr_value)
+{
+	int r = _vcpu_set_msr(vm, vcpuid, msr_index, msr_value);
+
+	TEST_ASSERT(r == 1, "KVM_SET_MSRS IOCTL failed,\n"
+		" rc: %i errno: %i", r, errno);
+}
+
+/*
+ * Store up to six uint64_t variadic arguments into the VCPU's rdi, rsi,
+ * rdx, rcx, r8 and r9 registers, in that order. num must be in 1..6,
+ * otherwise a TEST_ASSERT is produced.
+ */
+void vcpu_args_set(struct kvm_vm *vm, uint32_t vcpuid, unsigned int num, ...)
+{
+	struct kvm_regs regs;
+	unsigned int i;
+	va_list ap;
+
+	TEST_ASSERT(num >= 1 && num <= 6, "Unsupported number of args,\n"
+		" num: %u\n",
+		num);
+
+	va_start(ap, num);
+	vcpu_regs_get(vm, vcpuid, &regs);
+
+	/* Consume the arguments in call order, one register per argument. */
+	for (i = 0; i < num && i < 6; i++) {
+		uint64_t arg = va_arg(ap, uint64_t);
+
+		switch (i) {
+		case 0:
+			regs.rdi = arg;
+			break;
+		case 1:
+			regs.rsi = arg;
+			break;
+		case 2:
+			regs.rdx = arg;
+			break;
+		case 3:
+			regs.rcx = arg;
+			break;
+		case 4:
+			regs.r8 = arg;
+			break;
+		default:
+			regs.r9 = arg;
+			break;
+		}
+	}
+
+	vcpu_regs_set(vm, vcpuid, &regs);
+	va_end(ap);
+}
+
+/*
+ * Print the VCPU id followed by its general and special registers to
+ * stream. Section titles are indented two columns past indent and the
+ * register dumps four columns past it.
+ */
+void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent)
+{
+	const int title_indent = indent + 2;
+	const int body_indent = indent + 4;
+	struct kvm_sregs sregs;
+	struct kvm_regs regs;
+
+	fprintf(stream, "%*scpuid: %u\n", indent, "", vcpuid);
+
+	/* General purpose registers. */
+	fprintf(stream, "%*sregs:\n", title_indent, "");
+	vcpu_regs_get(vm, vcpuid, &regs);
+	regs_dump(stream, &regs, body_indent);
+
+	/* Special registers. */
+	fprintf(stream, "%*ssregs:\n", title_indent, "");
+	vcpu_sregs_get(vm, vcpuid, &sregs);
+	sregs_dump(stream, &sregs, body_indent);
+}
+
+/*
+ * Container for the VCPU state that vcpu_save_state() collects and
+ * vcpu_load_state() writes back.
+ */
+struct kvm_x86_state {
+ struct kvm_vcpu_events events;
+ struct kvm_mp_state mp_state;
+ struct kvm_regs regs;
+ struct kvm_xsave xsave;
+ struct kvm_xcrs xcrs;
+ struct kvm_sregs sregs;
+ struct kvm_debugregs debugregs;
+ /* nested_ reserves 16384 bytes of backing storage for the nested state. */
+ union {
+ struct kvm_nested_state nested;
+ char nested_[16384];
+ };
+ /* Kept last: vcpu_save_state() allocates the MSR entries right behind it. */
+ struct kvm_msrs msrs;
+};
+
+/*
+ * Probe how many MSR indices KVM_GET_MSR_INDEX_LIST would report.
+ *
+ * The ioctl is issued with room for zero indices; it is asserted to fail
+ * with E2BIG, after which nmsrs is returned as the count.
+ */
+static int kvm_get_num_msrs_fd(int kvm_fd)
+{
+	struct kvm_msr_list nmsrs = { .nmsrs = 0 };
+	int r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
+
+	TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
+		r);
+
+	return nmsrs.nmsrs;
+}
+
+/* Same probe as kvm_get_num_msrs_fd(), using the KVM fd stored in vm. */
+static int kvm_get_num_msrs(struct kvm_vm *vm)
+{
+	int kvm_fd = vm->kvm_fd;
+
+	return kvm_get_num_msrs_fd(kvm_fd);
+}
+
+/*
+ * KVM Get MSR Index List
+ *
+ * Input Args: None
+ *
+ * Output Args: None
+ *
+ * Return: A malloc()ed list of the MSR indices reported by
+ * KVM_GET_MSR_INDEX_LIST; the caller owns it. The process exits with
+ * KSFT_SKIP when KVM_DEV_PATH cannot be opened. On any other failure a
+ * TEST_ASSERT is produced.
+ */
+struct kvm_msr_list *kvm_get_msr_index_list(void)
+{
+	struct kvm_msr_list *list;
+	int nmsrs, r, kvm_fd;
+
+	kvm_fd = open(KVM_DEV_PATH, O_RDONLY);
+	if (kvm_fd < 0)
+		exit(KSFT_SKIP);
+
+	/* Size the buffer from the probe, then fetch the real list. */
+	nmsrs = kvm_get_num_msrs_fd(kvm_fd);
+	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+	/* Fail loudly instead of dereferencing a NULL allocation below. */
+	TEST_ASSERT(list != NULL, "Insufficient memory for %i MSR indices",
+		    nmsrs);
+	list->nmsrs = nmsrs;
+	r = ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+	close(kvm_fd);
+
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+		r);
+
+	return list;
+}
+
+/*
+ * VCPU Save State
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return: A malloc()ed snapshot of the VCPU state, suitable for
+ * vcpu_load_state(). On failure a TEST_ASSERT is produced.
+ *
+ * Collects the VCPU's events, MP state, registers, XSAVE area, XCRs
+ * (when KVM_CAP_XCRS is reported), special registers, nested state
+ * (when KVM_CAP_NESTED_STATE is reported), MSRs and debug registers
+ * through the matching KVM_GET_* ioctls.
+ */
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	struct kvm_msr_list *list;
+	struct kvm_x86_state *state;
+	int nmsrs, r, i;
+	/* Cached across calls; -1 means the capability was not queried yet. */
+	static int nested_size = -1;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	if (nested_size == -1) {
+		nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
+		TEST_ASSERT(nested_size <= sizeof(state->nested_),
+			    "Nested state size too big, %i > %zi",
+			    nested_size, sizeof(state->nested_));
+	}
+
+	/*
+	 * When KVM exits to userspace with KVM_EXIT_IO, KVM guarantees
+	 * guest state is consistent only after userspace re-enters the
+	 * kernel with KVM_RUN. Complete IO prior to migrating state
+	 * to a new VM.
+	 */
+	vcpu_run_complete_io(vm, vcpuid);
+
+	/* Fetch the MSR index list; it drives the KVM_GET_MSRS request. */
+	nmsrs = kvm_get_num_msrs(vm);
+	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+	TEST_ASSERT(list != NULL, "Insufficient memory for %i MSR indices",
+		    nmsrs);
+	list->nmsrs = nmsrs;
+	r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+		r);
+
+	/* The MSR entries are stored directly behind the trailing msrs header. */
+	state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
+	TEST_ASSERT(state != NULL, "Insufficient memory for VCPU state");
+
+	r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
+		r);
+
+	if (kvm_check_cap(KVM_CAP_XCRS)) {
+		r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i",
+			    r);
+	}
+
+	r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
+		r);
+
+	if (nested_size) {
+		state->nested.size = sizeof(state->nested_);
+		r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
+			    r);
+		TEST_ASSERT(state->nested.size <= nested_size,
+			    "Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
+			    state->nested.size, nested_size);
+	} else
+		state->nested.size = 0;
+
+	state->msrs.nmsrs = nmsrs;
+	for (i = 0; i < nmsrs; i++)
+		state->msrs.entries[i].index = list->indices[i];
+	r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
+	/*
+	 * Only index the list when r names a valid entry; on an ioctl
+	 * error (r < 0) there is no failed MSR to report.
+	 */
+	TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed MSR was 0x%x)",
+		r, (r >= 0 && r < nmsrs) ? list->indices[r] : -1);
+
+	r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
+		r);
+
+	free(list);
+	return state;
+}
+
+/*
+ * VCPU Load State
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *   state - State previously captured with vcpu_save_state()
+ *
+ * Output Args: None
+ *
+ * Return: Nothing on success. On failure a TEST_ASSERT is produced.
+ *
+ * Writes the saved state into the VCPU through the matching KVM_SET_*
+ * ioctls. XCRs are restored only when KVM_CAP_XCRS is reported and the
+ * nested state only when the snapshot contains one (size != 0).
+ */
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int r;
+
+	TEST_ASSERT(vcpu != NULL, "vcpu not found, vcpuid: %u", vcpuid);
+
+	r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
+		r);
+
+	if (kvm_check_cap(KVM_CAP_XCRS)) {
+		r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
+			    r);
+	}
+
+	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
+	/*
+	 * Only index the entries when r names a valid one; on an ioctl
+	 * error (r < 0) there is no failed MSR to report.
+	 */
+	TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
+		r, (r >= 0 && r < (int)state->msrs.nmsrs) ?
+			state->msrs.entries[r].index : -1);
+
+	r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
+		r);
+
+	r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
+	TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
+		r);
+
+	if (state->nested.size) {
+		r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
+			    r);
+	}
+}
+
+/*
+ * Report whether CPUID leaf 0 returns the vendor string "GenuineIntel".
+ *
+ * The twelve vendor characters are compared against EBX, EDX and ECX,
+ * in that order.
+ */
+bool is_intel_cpu(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+	uint32_t chunk[3];
+	const uint32_t leaf = 0;
+
+	__asm__ __volatile__(
+		"cpuid"
+		: /* output */ "=a"(eax), "=b"(ebx),
+		  "=c"(ecx), "=d"(edx)
+		: /* input */ "0"(leaf), "2"(0));
+
+	/*
+	 * Copy the string into properly typed storage instead of reading
+	 * the literal through a uint32_t pointer, which is not guaranteed
+	 * to be suitably aligned and breaks the aliasing rules.
+	 */
+	memcpy(chunk, "GenuineIntel", sizeof(chunk));
+	return (ebx == chunk[0] && edx == chunk[1] && ecx == chunk[2]);
+}
+
+/* Return EAX of the supported CPUID entry for function 0. */
+uint32_t kvm_get_cpuid_max_basic(void)
+{
+	struct kvm_cpuid_entry2 *leaf = kvm_get_supported_cpuid_entry(0);
+
+	return leaf->eax;
+}
+
+/* Return EAX of the supported CPUID entry for function 0x80000000. */
+uint32_t kvm_get_cpuid_max_extended(void)
+{
+	struct kvm_cpuid_entry2 *leaf =
+		kvm_get_supported_cpuid_entry(0x80000000);
+
+	return leaf->eax;
+}
+
+/*
+ * Report the physical and virtual address widths, in bits.
+ *
+ * When CPUID function 0x80000008 is available its EAX supplies both
+ * values (bits 7:0 and 15:8). Otherwise va_bits is 32 and pa_bits is 36
+ * or 32 depending on bit 6 of CPUID.1:EDX.
+ */
+void kvm_get_cpu_address_width(unsigned int *pa_bits, unsigned int *va_bits)
+{
+	struct kvm_cpuid_entry2 *entry;
+	bool pae;
+
+	/* SDM 4.1.4 */
+	if (kvm_get_cpuid_max_extended() >= 0x80000008) {
+		entry = kvm_get_supported_cpuid_entry(0x80000008);
+		*pa_bits = entry->eax & 0xff;
+		*va_bits = (entry->eax >> 8) & 0xff;
+		return;
+	}
+
+	pae = kvm_get_supported_cpuid_entry(1)->edx & (1 << 6);
+	*pa_bits = pae ? 36 : 32;
+	*va_bits = 32;
+}
+
+/*
+ * One entry of the table that set_idt_entry() fills in. The handler
+ * address is split over offset0 (bits 15:0), offset1 (bits 31:16) and
+ * offset2 (bits 63:32).
+ */
+struct idt_entry {
+ uint16_t offset0;
+ uint16_t selector;
+ uint16_t ist : 3;
+ uint16_t : 5;
+ uint16_t type : 4;
+ uint16_t : 1;
+ uint16_t dpl : 2;
+ uint16_t p : 1;
+ uint16_t offset1;
+ uint32_t offset2; uint32_t reserved;
+};
+
+/*
+ * Fill in entry number vector of the VM's IDT so that it points at addr.
+ *
+ * The entry is cleared first, then marked present with type 14, IST 0
+ * and the given DPL and code selector.
+ */
+static void set_idt_entry(struct kvm_vm *vm, int vector, unsigned long addr,
+			  int dpl, unsigned short selector)
+{
+	struct idt_entry *e;
+
+	/* The IDT lives in guest memory; edit it through its host mapping. */
+	e = (struct idt_entry *)addr_gva2hva(vm, vm->idt) + vector;
+	memset(e, 0, sizeof(*e));
+
+	/* Handler address, split across the three offset fields. */
+	e->offset0 = addr;
+	e->offset1 = addr >> 16;
+	e->offset2 = addr >> 32;
+
+	/* Gate attributes. */
+	e->selector = selector;
+	e->ist = 0;
+	e->type = 14;
+	e->dpl = dpl;
+	e->p = 1;
+}
+
+/*
+ * Report value with a 32-bit port write to UNEXPECTED_VECTOR_PORT;
+ * assert_on_unhandled_exception() looks for exactly this access.
+ */
+void kvm_exit_unexpected_vector(uint32_t value)
+{
+ outl(UNEXPECTED_VECTOR_PORT, value);
+}
+
+/*
+ * Dispatch an exception to the handler registered for regs->vector in
+ * exception_handlers. Without a handler table or a handler for the
+ * vector, the vector is reported via kvm_exit_unexpected_vector().
+ */
+void route_exception(struct ex_regs *regs)
+{
+	typedef void(*handler)(struct ex_regs *);
+	handler *handlers = (handler *)exception_handlers;
+
+	if (!handlers || !handlers[regs->vector]) {
+		kvm_exit_unexpected_vector(regs->vector);
+		return;
+	}
+
+	handlers[regs->vector](regs);
+}
+
+/*
+ * Allocate the IDT page and the exception handler table in guest memory,
+ * then point each of the NUM_INTERRUPTS IDT entries at the matching
+ * entry of idt_handlers (DPL 0, DEFAULT_CODE_SELECTOR).
+ */
+void vm_init_descriptor_tables(struct kvm_vm *vm)
+{
+ extern void *idt_handlers;
+ int i;
+
+ vm->idt = vm_vaddr_alloc(vm, getpagesize(), 0x2000, 0, 0);
+ /* NOTE(review): 256 presumably equals NUM_INTERRUPTS - confirm. */
+ vm->handlers = vm_vaddr_alloc(vm, 256 * sizeof(void *), 0x2000, 0, 0);
+ /* Handlers have the same address in both address spaces.*/
+ for (i = 0; i < NUM_INTERRUPTS; i++)
+ set_idt_entry(vm, i, (unsigned long)(&idt_handlers)[i], 0,
+ DEFAULT_CODE_SELECTOR);
+}
+
+/*
+ * Point the VCPU's IDTR and GDTR at the VM's tables, reload GS as a
+ * kernel data segment and publish the handler table address to the
+ * guest through the exception_handlers variable.
+ */
+void vcpu_init_descriptor_tables(struct kvm_vm *vm, uint32_t vcpuid)
+{
+ struct kvm_sregs sregs;
+
+ vcpu_sregs_get(vm, vcpuid, &sregs);
+ sregs.idt.base = vm->idt;
+ sregs.idt.limit = NUM_INTERRUPTS * sizeof(struct idt_entry) - 1;
+ sregs.gdt.base = vm->gdt;
+ sregs.gdt.limit = getpagesize() - 1;
+ kvm_seg_set_kernel_data_64bit(NULL, DEFAULT_DATA_SELECTOR, &sregs.gs);
+ vcpu_sregs_set(vm, vcpuid, &sregs);
+ /* Write the guest copy of exception_handlers through its host mapping. */
+ *(vm_vaddr_t *)addr_gva2hva(vm, (vm_vaddr_t)(&exception_handlers)) = vm->handlers;
+}
+
+/*
+ * Register handler for the given vector by storing its address in the
+ * VM's handler table (vm->handlers), accessed through its host mapping.
+ */
+void vm_handle_exception(struct kvm_vm *vm, int vector,
+			 void (*handler)(struct ex_regs *))
+{
+	vm_vaddr_t *table;
+
+	table = (vm_vaddr_t *)addr_gva2hva(vm, vm->handlers);
+	table[vector] = (vm_vaddr_t)handler;
+}
+
+/*
+ * Fail the test if the VCPU's last exit was the 4-byte port access on
+ * UNEXPECTED_VECTOR_PORT that kvm_exit_unexpected_vector() performs; the
+ * value written is reported as the vector.
+ */
+void assert_on_unhandled_exception(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct kvm_run *run = vcpu_state(vm, vcpuid);
+	uint32_t *data;
+
+	if (run->exit_reason != KVM_EXIT_IO)
+		return;
+	if (run->io.port != UNEXPECTED_VECTOR_PORT || run->io.size != 4)
+		return;
+
+	/* Grab pointer to io data */
+	data = (void *)run + run->io.data_offset;
+
+	TEST_ASSERT(false,
+		    "Unexpected vectored event in guest (vector:0x%x)",
+		    *data);
+}
+
+/*
+ * Overwrite the entry of cpuid whose function and index match ent.
+ *
+ * Return: true when a matching entry was found and replaced, false when
+ * cpuid has no such entry (cpuid is left untouched).
+ */
+bool set_cpuid(struct kvm_cpuid2 *cpuid,
+	       struct kvm_cpuid_entry2 *ent)
+{
+	struct kvm_cpuid_entry2 *cur;
+	int i;
+
+	for (i = 0; i < cpuid->nent; i++) {
+		cur = &cpuid->entries[i];
+
+		if (cur->function == ent->function &&
+		    cur->index == ent->index) {
+			memcpy(cur, ent, sizeof(struct kvm_cpuid_entry2));
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Execute VMCALL with nr in RAX and a0..a3 in RBX, RCX, RDX and RSI.
+ *
+ * Return: The value left in RAX after the instruction.
+ */
+uint64_t kvm_hypercall(uint64_t nr, uint64_t a0, uint64_t a1, uint64_t a2,
+ uint64_t a3)
+{
+ uint64_t r;
+
+ asm volatile("vmcall"
+ : "=a"(r)
+ : "a"(nr), "b"(a0), "c"(a1), "d"(a2), "S"(a3));
+ return r;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c
new file mode 100644
index 000000000..a58507a7b
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c
@@ -0,0 +1,177 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/svm.c
+ * Helpers used for nested SVM testing
+ * Largely inspired from KVM unit test svm.c
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "svm_util.h"
+
+struct gpr64_regs guest_regs;
+u64 rflags;
+
+/* Allocate memory regions for nested SVM tests.
+ *
+ * Input Args:
+ *   vm - The VM to allocate guest-virtual addresses in.
+ *
+ * Output Args:
+ *   p_svm_gva - The guest virtual address for the struct svm_test_data.
+ *
+ * Return:
+ *   Host pointer to the struct svm_test_data, which records the guest
+ *   virtual, host virtual and guest physical addresses of the VMCB and
+ *   of the save area.
+ */
+struct svm_test_data *
+vcpu_alloc_svm(struct kvm_vm *vm, vm_vaddr_t *p_svm_gva)
+{
+	struct svm_test_data *svm;
+	vm_vaddr_t svm_gva, area_gva;
+
+	/* The descriptor itself lives in guest memory as well. */
+	svm_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	svm = addr_gva2hva(vm, svm_gva);
+
+	/* One page for the VMCB. */
+	area_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	svm->vmcb = (void *)area_gva;
+	svm->vmcb_hva = addr_gva2hva(vm, (uintptr_t)svm->vmcb);
+	svm->vmcb_gpa = addr_gva2gpa(vm, (uintptr_t)svm->vmcb);
+
+	/* One page for the save area. */
+	area_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	svm->save_area = (void *)area_gva;
+	svm->save_area_hva = addr_gva2hva(vm, (uintptr_t)svm->save_area);
+	svm->save_area_gpa = addr_gva2gpa(vm, (uintptr_t)svm->save_area);
+
+	*p_svm_gva = svm_gva;
+	return svm;
+}
+
+/* Fill in all four fields of a VMCB segment descriptor. */
+static void vmcb_set_seg(struct vmcb_seg *seg, u16 selector,
+			 u64 base, u32 limit, u32 attr)
+{
+	seg->base = base;
+	seg->limit = limit;
+	seg->attrib = attr;
+	seg->selector = selector;
+}
+
+/*
+ * Avoid using memset to clear the vmcb, since libc may not be
+ * available in L1 (and, even if it is, features that libc memset may
+ * want to use, like AVX, may not be enabled).
+ *
+ * The structure is zeroed with "rep stosl", i.e. sizeof(*vmcb) / 4
+ * 32-bit stores of zero starting at vmcb.
+ */
+static void clear_vmcb(struct vmcb *vmcb)
+{
+ int n = sizeof(*vmcb) / sizeof(u32);
+
+ asm volatile ("rep stosl" : "+c"(n), "+D"(vmcb) : "a"(0) : "memory");
+}
+
+/*
+ * Enable SVM on the current CPU and prepare svm->vmcb for a guest that
+ * starts at guest_rip with stack pointer guest_rsp.
+ *
+ * The guest's segment, control register, debug register and MSR state
+ * is copied from the current CPU. VMRUN and VMMCALL are intercepted.
+ */
+void generic_svm_setup(struct svm_test_data *svm, void *guest_rip, void *guest_rsp)
+{
+ struct vmcb *vmcb = svm->vmcb;
+ uint64_t vmcb_gpa = svm->vmcb_gpa;
+ struct vmcb_save_area *save = &vmcb->save;
+ struct vmcb_control_area *ctrl = &vmcb->control;
+ u32 data_seg_attr = 3 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK
+ | SVM_SELECTOR_DB_MASK | SVM_SELECTOR_G_MASK;
+ u32 code_seg_attr = 9 | SVM_SELECTOR_S_MASK | SVM_SELECTOR_P_MASK
+ | SVM_SELECTOR_L_MASK | SVM_SELECTOR_G_MASK;
+ uint64_t efer;
+
+ /* Set EFER.SVME and register the save area via MSR_VM_HSAVE_PA. */
+ efer = rdmsr(MSR_EFER);
+ wrmsr(MSR_EFER, efer | EFER_SVME);
+ wrmsr(MSR_VM_HSAVE_PA, svm->save_area_gpa);
+
+ /* Start from a zeroed VMCB, then let VMSAVE store state into it. */
+ clear_vmcb(vmcb);
+ asm volatile ("vmsave %0\n\t" : : "a" (vmcb_gpa) : "memory");
+ vmcb_set_seg(&save->es, get_es(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->cs, get_cs(), 0, -1U, code_seg_attr);
+ vmcb_set_seg(&save->ss, get_ss(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->ds, get_ds(), 0, -1U, data_seg_attr);
+ vmcb_set_seg(&save->gdtr, 0, get_gdt().address, get_gdt().size, 0);
+ vmcb_set_seg(&save->idtr, 0, get_idt().address, get_idt().size, 0);
+
+ ctrl->asid = 1;
+ save->cpl = 0;
+ save->efer = rdmsr(MSR_EFER);
+ asm volatile ("mov %%cr4, %0" : "=r"(save->cr4) : : "memory");
+ asm volatile ("mov %%cr3, %0" : "=r"(save->cr3) : : "memory");
+ asm volatile ("mov %%cr0, %0" : "=r"(save->cr0) : : "memory");
+ asm volatile ("mov %%dr7, %0" : "=r"(save->dr7) : : "memory");
+ asm volatile ("mov %%dr6, %0" : "=r"(save->dr6) : : "memory");
+ asm volatile ("mov %%cr2, %0" : "=r"(save->cr2) : : "memory");
+ save->g_pat = rdmsr(MSR_IA32_CR_PAT);
+ save->dbgctl = rdmsr(MSR_IA32_DEBUGCTLMSR);
+ ctrl->intercept = (1ULL << INTERCEPT_VMRUN) |
+ (1ULL << INTERCEPT_VMMCALL);
+
+ vmcb->save.rip = (u64)guest_rip;
+ vmcb->save.rsp = (u64)guest_rsp;
+ /* Presumably hands svm to the guest as its first argument - confirm. */
+ guest_regs.rdi = (u64)svm;
+}
+
+/*
+ * save/restore 64-bit general registers except rax, rip, rsp
+ * which are directly handed through the VMCB guest processor state
+ *
+ * Every line exchanges a register with its slot in guest_regs, so the
+ * same sequence both loads and saves; LOAD_GPR_C is therefore an alias
+ * of SAVE_GPR_C.
+ */
+#define SAVE_GPR_C \
+ "xchg %%rbx, guest_regs+0x20\n\t" \
+ "xchg %%rcx, guest_regs+0x10\n\t" \
+ "xchg %%rdx, guest_regs+0x18\n\t" \
+ "xchg %%rbp, guest_regs+0x30\n\t" \
+ "xchg %%rsi, guest_regs+0x38\n\t" \
+ "xchg %%rdi, guest_regs+0x40\n\t" \
+ "xchg %%r8, guest_regs+0x48\n\t" \
+ "xchg %%r9, guest_regs+0x50\n\t" \
+ "xchg %%r10, guest_regs+0x58\n\t" \
+ "xchg %%r11, guest_regs+0x60\n\t" \
+ "xchg %%r12, guest_regs+0x68\n\t" \
+ "xchg %%r13, guest_regs+0x70\n\t" \
+ "xchg %%r14, guest_regs+0x78\n\t" \
+ "xchg %%r15, guest_regs+0x80\n\t"
+
+#define LOAD_GPR_C SAVE_GPR_C
+
+/*
+ * selftests do not use interrupts so we dropped clgi/sti/cli/stgi
+ * for now. registers involved in LOAD/SAVE_GPR_C are eventually
+ * unmodified so they do not need to be in the clobber list.
+ *
+ * Before VMRUN the rflags variable and the first guest_regs slot (rax)
+ * are copied into the VMCB at offsets 0x170 and 0x1f8; after VMRUN they
+ * are copied back. r15 is used as scratch for these copies.
+ */
+void run_guest(struct vmcb *vmcb, uint64_t vmcb_gpa)
+{
+ asm volatile (
+ "vmload %[vmcb_gpa]\n\t"
+ "mov rflags, %%r15\n\t" // rflags
+ "mov %%r15, 0x170(%[vmcb])\n\t"
+ "mov guest_regs, %%r15\n\t" // rax
+ "mov %%r15, 0x1f8(%[vmcb])\n\t"
+ LOAD_GPR_C
+ "vmrun %[vmcb_gpa]\n\t"
+ SAVE_GPR_C
+ "mov 0x170(%[vmcb]), %%r15\n\t" // rflags
+ "mov %%r15, rflags\n\t"
+ "mov 0x1f8(%[vmcb]), %%r15\n\t" // rax
+ "mov %%r15, guest_regs\n\t"
+ "vmsave %[vmcb_gpa]\n\t"
+ : : [vmcb] "r" (vmcb), [vmcb_gpa] "a" (vmcb_gpa)
+ : "r15", "memory");
+}
+
+/* True when the supported CPUID entry 0x80000001 has CPUID_SVM set in ECX. */
+bool nested_svm_supported(void)
+{
+	return kvm_get_supported_cpuid_entry(0x80000001)->ecx & CPUID_SVM;
+}
+
+/* Skip the test (exit with KSFT_SKIP) unless nested SVM is supported. */
+void nested_svm_check_supported(void)
+{
+	if (nested_svm_supported())
+		return;
+
+	print_skip("nested SVM not enabled");
+	exit(KSFT_SKIP);
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/ucall.c b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
new file mode 100644
index 000000000..a3489973e
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/ucall.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ucall support. A ucall is a "hypercall to userspace".
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+#include "kvm_util.h"
+
+#define UCALL_PIO_PORT ((uint16_t)0x1000)
+
+/* Nothing to set up in this implementation; vm and arg are unused. */
+void ucall_init(struct kvm_vm *vm, void *arg)
+{
+}
+
+/* Nothing to tear down in this implementation; vm is unused. */
+void ucall_uninit(struct kvm_vm *vm)
+{
+}
+
+/*
+ * Issue a ucall: pack cmd and up to UCALL_MAX_ARGS uint64_t arguments
+ * into a struct ucall and perform a port read on UCALL_PIO_PORT with the
+ * address of that struct in RDI, where get_ucall() picks it up.
+ * Arguments beyond UCALL_MAX_ARGS are ignored.
+ */
+void ucall(uint64_t cmd, int nargs, ...)
+{
+	struct ucall uc = {
+		.cmd = cmd,
+	};
+	va_list va;
+	int i = 0;
+
+	if (nargs > UCALL_MAX_ARGS)
+		nargs = UCALL_MAX_ARGS;
+
+	va_start(va, nargs);
+	while (i < nargs) {
+		uc.args[i] = va_arg(va, uint64_t);
+		++i;
+	}
+	va_end(va);
+
+	asm volatile("in %[port], %%al"
+		: : [port] "d" (UCALL_PIO_PORT), "D" (&uc) : "rax", "memory");
+}
+
+/*
+ * Fetch the ucall, if any, behind the VCPU's last exit.
+ *
+ * uc, when non-NULL, is always cleared first. If the exit was an I/O
+ * access on UCALL_PIO_PORT, the struct ucall is read from the guest
+ * address held in RDI, the pending I/O is completed and the ucall is
+ * copied to uc.
+ *
+ * Return: The ucall's cmd, or 0 when the exit was not a ucall.
+ */
+uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc)
+{
+	struct kvm_run *run = vcpu_state(vm, vcpu_id);
+	struct ucall ucall = {};
+	struct kvm_regs regs;
+
+	if (uc)
+		memset(uc, 0, sizeof(*uc));
+
+	if (run->exit_reason != KVM_EXIT_IO || run->io.port != UCALL_PIO_PORT)
+		return ucall.cmd;
+
+	vcpu_regs_get(vm, vcpu_id, &regs);
+	memcpy(&ucall, addr_gva2hva(vm, (vm_vaddr_t)regs.rdi),
+	       sizeof(ucall));
+
+	vcpu_run_complete_io(vm, vcpu_id);
+	if (uc)
+		memcpy(uc, &ucall, sizeof(ucall));
+
+	return ucall.cmd;
+}
diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
new file mode 100644
index 000000000..2448b30e8
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tools/testing/selftests/kvm/lib/x86_64/vmx.c
+ *
+ * Copyright (C) 2018, Google LLC.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "../kvm_util_internal.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define PAGE_SHIFT_4K 12
+
+#define KVM_EPT_PAGE_TABLE_MIN_PADDR 0x1c0000
+
+bool enable_evmcs;
+
+struct hv_enlightened_vmcs *current_evmcs;
+struct hv_vp_assist_page *current_vp_assist;
+
+/*
+ * Bit-field view of one 64-bit EPT page table entry: the fields below
+ * add up to exactly 64 bits.
+ */
+struct eptPageTableEntry {
+ uint64_t readable:1;
+ uint64_t writable:1;
+ uint64_t executable:1;
+ uint64_t memory_type:3;
+ uint64_t ignore_pat:1;
+ uint64_t page_size:1;
+ uint64_t accessed:1;
+ uint64_t dirty:1;
+ uint64_t ignored_11_10:2;
+ uint64_t address:40;
+ uint64_t ignored_62_52:11;
+ uint64_t suppress_ve:1;
+};
+
+/*
+ * Bit-field view of the 64-bit value that init_vmcs_control_fields()
+ * writes to the EPT_POINTER field.
+ */
+struct eptPageTablePointer {
+ uint64_t memory_type:3;
+ uint64_t page_walk_length:3;
+ uint64_t ad_enabled:1;
+ uint64_t reserved_11_07:5;
+ uint64_t address:40;
+ uint64_t reserved_63_52:12;
+};
+/*
+ * Enable KVM_CAP_HYPERV_ENLIGHTENED_VMCS on the VCPU.
+ *
+ * Return: The 16-bit version value stored by KVM; its low byte must be
+ * non-zero and not greater than its high byte, else a TEST_ASSERT is
+ * produced.
+ */
+int vcpu_enable_evmcs(struct kvm_vm *vm, int vcpu_id)
+{
+ uint16_t evmcs_ver;
+
+ /* args[0] carries the address KVM writes the version range to. */
+ struct kvm_enable_cap enable_evmcs_cap = {
+ .cap = KVM_CAP_HYPERV_ENLIGHTENED_VMCS,
+ .args[0] = (unsigned long)&evmcs_ver
+ };
+
+ vcpu_ioctl(vm, vcpu_id, KVM_ENABLE_CAP, &enable_evmcs_cap);
+
+ /* KVM should return supported EVMCS version range */
+ TEST_ASSERT(((evmcs_ver >> 8) >= (evmcs_ver & 0xff)) &&
+ (evmcs_ver & 0xff) > 0,
+ "Incorrect EVMCS version range: %x:%x\n",
+ evmcs_ver & 0xff, evmcs_ver >> 8);
+
+ return evmcs_ver;
+}
+
+/* Allocate memory regions for nested VMX tests.
+ *
+ * Input Args:
+ * vm - The VM to allocate guest-virtual addresses in.
+ *
+ * Output Args:
+ * p_vmx_gva - The guest virtual address for the struct vmx_pages.
+ *
+ * Return:
+ * Pointer to structure with the addresses of the VMX areas.
+ *
+ * Every area is one page (getpagesize()). For each of them the guest
+ * virtual address, the host virtual address (_hva) and the guest
+ * physical address (_gpa) are recorded. The MSR, VMREAD and VMWRITE
+ * bitmap pages are additionally zeroed.
+ */
+struct vmx_pages *
+vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
+{
+ vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva);
+
+ /* Setup of a region of guest memory for the vmxon region. */
+ vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon);
+ vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon);
+
+ /* Setup of a region of guest memory for a vmcs. */
+ vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs);
+ vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs);
+
+ /* Setup of a region of guest memory for the MSR bitmap. */
+ vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr);
+ vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr);
+ memset(vmx->msr_hva, 0, getpagesize());
+
+ /* Setup of a region of guest memory for the shadow VMCS. */
+ vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs);
+ vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs);
+
+ /* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */
+ vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread);
+ vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread);
+ memset(vmx->vmread_hva, 0, getpagesize());
+
+ vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite);
+ vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
+ memset(vmx->vmwrite_hva, 0, getpagesize());
+
+ /* Setup of a region of guest memory for the VP Assist page. */
+ vmx->vp_assist = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->vp_assist_hva = addr_gva2hva(vm, (uintptr_t)vmx->vp_assist);
+ vmx->vp_assist_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vp_assist);
+
+ /* Setup of a region of guest memory for the enlightened VMCS. */
+ vmx->enlightened_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->enlightened_vmcs_hva =
+ addr_gva2hva(vm, (uintptr_t)vmx->enlightened_vmcs);
+ vmx->enlightened_vmcs_gpa =
+ addr_gva2gpa(vm, (uintptr_t)vmx->enlightened_vmcs);
+
+ *p_vmx_gva = vmx_gva;
+ return vmx;
+}
+
+/*
+ * Adjust CR0, CR4 and IA32_FEATURE_CONTROL as needed for VMX operation
+ * and execute VMXON on the vmxon region of vmx.
+ *
+ * Return: true on success, false when vmxon() reports a failure.
+ */
+bool prepare_for_vmx_operation(struct vmx_pages *vmx)
+{
+ uint64_t feature_control;
+ uint64_t required;
+ unsigned long cr0;
+ unsigned long cr4;
+
+ /*
+ * Ensure bits in CR0 and CR4 are valid in VMX operation:
+ * - Bit X is 1 in _FIXED0: bit X is fixed to 1 in CRx.
+ * - Bit X is 0 in _FIXED1: bit X is fixed to 0 in CRx.
+ */
+ __asm__ __volatile__("mov %%cr0, %0" : "=r"(cr0) : : "memory");
+ cr0 &= rdmsr(MSR_IA32_VMX_CR0_FIXED1);
+ cr0 |= rdmsr(MSR_IA32_VMX_CR0_FIXED0);
+ __asm__ __volatile__("mov %0, %%cr0" : : "r"(cr0) : "memory");
+
+ __asm__ __volatile__("mov %%cr4, %0" : "=r"(cr4) : : "memory");
+ cr4 &= rdmsr(MSR_IA32_VMX_CR4_FIXED1);
+ cr4 |= rdmsr(MSR_IA32_VMX_CR4_FIXED0);
+ /* Enable VMX operation */
+ cr4 |= X86_CR4_VMXE;
+ __asm__ __volatile__("mov %0, %%cr4" : : "r"(cr4) : "memory");
+
+ /*
+ * Configure IA32_FEATURE_CONTROL MSR to allow VMXON:
+ * Bit 0: Lock bit. If clear, VMXON causes a #GP.
+ * Bit 2: Enables VMXON outside of SMX operation. If clear, VMXON
+ * outside of SMX causes a #GP.
+ */
+ required = FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
+ required |= FEAT_CTL_LOCKED;
+ feature_control = rdmsr(MSR_IA32_FEAT_CTL);
+ /* The MSR is written only when a required bit is still missing. */
+ if ((feature_control & required) != required)
+ wrmsr(MSR_IA32_FEAT_CTL, feature_control | required);
+
+ /* Enter VMX root operation. */
+ *(uint32_t *)(vmx->vmxon) = vmcs_revision();
+ if (vmxon(vmx->vmxon_gpa))
+ return false;
+
+ return true;
+}
+
+/*
+ * Make a VMCS current.
+ *
+ * With enable_evmcs set, the enlightened VMCS is loaded and stamped with
+ * EVMCS_VERSION. Otherwise the regular VMCS is cleared and loaded, and
+ * the shadow VMCS is initialised and cleared but not loaded.
+ *
+ * Return: true on success, false as soon as one of the steps fails.
+ */
+bool load_vmcs(struct vmx_pages *vmx)
+{
+	if (enable_evmcs) {
+		if (evmcs_vmptrld(vmx->enlightened_vmcs_gpa,
+				  vmx->enlightened_vmcs))
+			return false;
+		current_evmcs->revision_id = EVMCS_VERSION;
+		return true;
+	}
+
+	/* Load a VMCS. */
+	*(uint32_t *)(vmx->vmcs) = vmcs_revision();
+	if (vmclear(vmx->vmcs_gpa) || vmptrld(vmx->vmcs_gpa))
+		return false;
+
+	/* Setup shadow VMCS, do not load it yet. */
+	*(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul;
+
+	return !vmclear(vmx->shadow_vmcs_gpa);
+}
+
+/*
+ * Initialize the control fields to the most basic settings possible.
+ *
+ * EPT is enabled only when vmx->eptp_gpa is non-zero. The secondary
+ * execution controls are activated only if writing them succeeds; if
+ * the write fails, no secondary control may have been requested.
+ */
+static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
+{
+ uint32_t sec_exec_ctl = 0;
+
+ vmwrite(VIRTUAL_PROCESSOR_ID, 0);
+ vmwrite(POSTED_INTR_NV, 0);
+
+ vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS));
+
+ if (vmx->eptp_gpa) {
+ uint64_t ept_paddr;
+ struct eptPageTablePointer eptp = {
+ .memory_type = VMX_BASIC_MEM_TYPE_WB,
+ .page_walk_length = 3, /* + 1 */
+ .ad_enabled = !!(rdmsr(MSR_IA32_VMX_EPT_VPID_CAP) & VMX_EPT_VPID_CAP_AD_BITS),
+ .address = vmx->eptp_gpa >> PAGE_SHIFT_4K,
+ };
+
+ /* Reinterpret the bit-field struct as the raw 64-bit field value. */
+ memcpy(&ept_paddr, &eptp, sizeof(ept_paddr));
+ vmwrite(EPT_POINTER, ept_paddr);
+ sec_exec_ctl |= SECONDARY_EXEC_ENABLE_EPT;
+ }
+
+ if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, sec_exec_ctl))
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL,
+ rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+ else {
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS));
+ GUEST_ASSERT(!sec_exec_ctl);
+ }
+
+ vmwrite(EXCEPTION_BITMAP, 0);
+ vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
+ vmwrite(CR3_TARGET_COUNT, 0);
+ vmwrite(VM_EXIT_CONTROLS, rdmsr(MSR_IA32_VMX_EXIT_CTLS) |
+ VM_EXIT_HOST_ADDR_SPACE_SIZE); /* 64-bit host */
+ vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
+ vmwrite(VM_ENTRY_CONTROLS, rdmsr(MSR_IA32_VMX_ENTRY_CTLS) |
+ VM_ENTRY_IA32E_MODE); /* 64-bit guest */
+ vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
+ vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
+ vmwrite(TPR_THRESHOLD, 0);
+
+ vmwrite(CR0_GUEST_HOST_MASK, 0);
+ vmwrite(CR4_GUEST_HOST_MASK, 0);
+ vmwrite(CR0_READ_SHADOW, get_cr0());
+ vmwrite(CR4_READ_SHADOW, get_cr4());
+
+ vmwrite(MSR_BITMAP, vmx->msr_gpa);
+ vmwrite(VMREAD_BITMAP, vmx->vmread_gpa);
+ vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa);
+}
+
+/*
+ * Initialize the host state fields based on the current host state, with
+ * the exception of HOST_RSP and HOST_RIP, which should be set by vmlaunch
+ * or vmresume.
+ *
+ * The PAT, EFER and PERF_GLOBAL_CTRL host fields are written only when
+ * the matching "load" bit is set in the VM-exit controls already stored
+ * in the VMCS.
+ */
+static inline void init_vmcs_host_state(void)
+{
+ uint32_t exit_controls = vmreadz(VM_EXIT_CONTROLS);
+
+ vmwrite(HOST_ES_SELECTOR, get_es());
+ vmwrite(HOST_CS_SELECTOR, get_cs());
+ vmwrite(HOST_SS_SELECTOR, get_ss());
+ vmwrite(HOST_DS_SELECTOR, get_ds());
+ vmwrite(HOST_FS_SELECTOR, get_fs());
+ vmwrite(HOST_GS_SELECTOR, get_gs());
+ vmwrite(HOST_TR_SELECTOR, get_tr());
+
+ if (exit_controls & VM_EXIT_LOAD_IA32_PAT)
+ vmwrite(HOST_IA32_PAT, rdmsr(MSR_IA32_CR_PAT));
+ if (exit_controls & VM_EXIT_LOAD_IA32_EFER)
+ vmwrite(HOST_IA32_EFER, rdmsr(MSR_EFER));
+ if (exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ vmwrite(HOST_IA32_PERF_GLOBAL_CTRL,
+ rdmsr(MSR_CORE_PERF_GLOBAL_CTRL));
+
+ vmwrite(HOST_IA32_SYSENTER_CS, rdmsr(MSR_IA32_SYSENTER_CS));
+
+ vmwrite(HOST_CR0, get_cr0());
+ vmwrite(HOST_CR3, get_cr3());
+ vmwrite(HOST_CR4, get_cr4());
+ vmwrite(HOST_FS_BASE, rdmsr(MSR_FS_BASE));
+ vmwrite(HOST_GS_BASE, rdmsr(MSR_GS_BASE));
+ /* The TR base is read from the descriptor at GDT base + TR selector. */
+ vmwrite(HOST_TR_BASE,
+ get_desc64_base((struct desc64 *)(get_gdt().address + get_tr())));
+ vmwrite(HOST_GDTR_BASE, get_gdt().address);
+ vmwrite(HOST_IDTR_BASE, get_idt().address);
+ vmwrite(HOST_IA32_SYSENTER_ESP, rdmsr(MSR_IA32_SYSENTER_ESP));
+ vmwrite(HOST_IA32_SYSENTER_EIP, rdmsr(MSR_IA32_SYSENTER_EIP));
+}
+
+/*
+ * Initialize the guest state fields essentially as a clone of
+ * the host state fields. Some host state fields have fixed
+ * values, and we set the corresponding guest state fields accordingly.
+ *
+ * The host fields are read back from the VMCS, so they must have been
+ * written before this runs (prepare_vmcs() calls init_vmcs_host_state()
+ * first). rip and rsp become the guest's initial RIP and RSP.
+ */
+static inline void init_vmcs_guest_state(void *rip, void *rsp)
+{
+ vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
+ vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
+ vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
+ vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
+ vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
+ vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
+ vmwrite(GUEST_LDTR_SELECTOR, 0);
+ vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
+ vmwrite(GUEST_INTR_STATUS, 0);
+ vmwrite(GUEST_PML_INDEX, 0);
+
+ vmwrite(VMCS_LINK_POINTER, -1ll);
+ vmwrite(GUEST_IA32_DEBUGCTL, 0);
+ vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
+ vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
+ vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
+ vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ vmwrite(GUEST_ES_LIMIT, -1);
+ vmwrite(GUEST_CS_LIMIT, -1);
+ vmwrite(GUEST_SS_LIMIT, -1);
+ vmwrite(GUEST_DS_LIMIT, -1);
+ vmwrite(GUEST_FS_LIMIT, -1);
+ vmwrite(GUEST_GS_LIMIT, -1);
+ vmwrite(GUEST_LDTR_LIMIT, -1);
+ vmwrite(GUEST_TR_LIMIT, 0x67);
+ vmwrite(GUEST_GDTR_LIMIT, 0xffff);
+ vmwrite(GUEST_IDTR_LIMIT, 0xffff);
+ /* A null ES/DS/FS/GS selector gets access rights 0x10000, else 0xc093. */
+ vmwrite(GUEST_ES_AR_BYTES,
+ vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
+ vmwrite(GUEST_SS_AR_BYTES, 0xc093);
+ vmwrite(GUEST_DS_AR_BYTES,
+ vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_FS_AR_BYTES,
+ vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_GS_AR_BYTES,
+ vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
+ vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
+ vmwrite(GUEST_TR_AR_BYTES, 0x8b);
+ vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
+ vmwrite(GUEST_ACTIVITY_STATE, 0);
+ vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
+ vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
+
+ vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
+ vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
+ vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
+ vmwrite(GUEST_ES_BASE, 0);
+ vmwrite(GUEST_CS_BASE, 0);
+ vmwrite(GUEST_SS_BASE, 0);
+ vmwrite(GUEST_DS_BASE, 0);
+ vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
+ vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
+ vmwrite(GUEST_LDTR_BASE, 0);
+ vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
+ vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
+ vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
+ vmwrite(GUEST_DR7, 0x400);
+ vmwrite(GUEST_RSP, (uint64_t)rsp);
+ vmwrite(GUEST_RIP, (uint64_t)rip);
+ vmwrite(GUEST_RFLAGS, 2);
+ vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
+ vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
+}
+
+/*
+ * Fill in the current VMCS: control fields first, then host state, then
+ * guest state. The order matters because the host state reads the exit
+ * controls and the guest state is cloned from the host fields.
+ */
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
+{
+ init_vmcs_control_fields(vmx);
+ init_vmcs_host_state();
+ init_vmcs_guest_state(guest_rip, guest_rsp);
+}
+
+/*
+ * Tell whether the VMX feature bit is set in ECX of the KVM-supported
+ * CPUID leaf 1.
+ */
+bool nested_vmx_supported(void)
+{
+	return (kvm_get_supported_cpuid_entry(1)->ecx & CPUID_VMX) != 0;
+}
+
+/*
+ * Terminate the calling test with KSFT_SKIP, after printing a skip
+ * message, when nested VMX is not supported. Returns normally otherwise.
+ */
+void nested_vmx_check_supported(void)
+{
+	if (nested_vmx_supported())
+		return;
+
+	print_skip("nested VMX not enabled");
+	exit(KSFT_SKIP);
+}
+
+/*
+ * Return the host virtual address of the EPT table referenced by @entry.
+ *
+ * If @entry is not yet marked readable, a page for the next-level table is
+ * first allocated from @eptp_memslot and the entry is made
+ * readable/writable/executable.
+ */
+static struct eptPageTableEntry *nested_ept_next_table(struct kvm_vm *vm,
+	struct eptPageTableEntry *entry, uint32_t eptp_memslot)
+{
+	if (!entry->readable) {
+		entry->address = vm_phy_page_alloc(vm,
+			KVM_EPT_PAGE_TABLE_MIN_PADDR, eptp_memslot)
+			>> vm->page_shift;
+		entry->writable = true;
+		entry->readable = true;
+		entry->executable = true;
+	}
+
+	return addr_gpa2hva(vm, entry->address * vm->page_size);
+}
+
+/*
+ * Map one nested guest physical page to a VM physical page in the EPT
+ *
+ * Input Args:
+ *   vmx - VMX pages; vmx->eptp_hva is the host address of the EPT root
+ *   vm - Virtual Machine
+ *   nested_paddr - Nested guest physical address to map (page aligned)
+ *   paddr - VM physical address to map it to (page aligned)
+ *   eptp_memslot - Memory region slot for newly allocated EPT tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Walks the four EPT levels for nested_paddr, allocating any missing
+ * intermediate table, and installs a read/write/execute leaf entry that
+ * points at paddr. Asserts on unsupported VM modes and on unaligned or
+ * out-of-range addresses.
+ */
+void nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+	uint64_t nested_paddr, uint64_t paddr, uint32_t eptp_memslot)
+{
+	uint16_t index[4];
+	struct eptPageTableEntry *pml4e;
+	struct eptPageTableEntry *pdpe;
+	struct eptPageTableEntry *pde;
+	struct eptPageTableEntry *pte;
+
+	TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use "
+		"unknown or unsupported guest mode, mode: 0x%x", vm->mode);
+
+	TEST_ASSERT((nested_paddr % vm->page_size) == 0,
+		"Nested physical address not on page boundary,\n"
+		" nested_paddr: 0x%lx vm->page_size: 0x%x",
+		nested_paddr, vm->page_size);
+	/* Report nested_paddr here; the original printed paddr by mistake. */
+	TEST_ASSERT((nested_paddr >> vm->page_shift) <= vm->max_gfn,
+		"Nested physical address beyond maximum supported,\n"
+		" nested_paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		nested_paddr, vm->max_gfn, vm->page_size);
+	TEST_ASSERT((paddr % vm->page_size) == 0,
+		"Physical address not on page boundary,\n"
+		" paddr: 0x%lx vm->page_size: 0x%x",
+		paddr, vm->page_size);
+	TEST_ASSERT((paddr >> vm->page_shift) <= vm->max_gfn,
+		"Physical address beyond maximum supported,\n"
+		" paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x",
+		paddr, vm->max_gfn, vm->page_size);
+
+	/* 9-bit table index for each of the four levels (4K pages). */
+	index[0] = (nested_paddr >> 12) & 0x1ffu;
+	index[1] = (nested_paddr >> 21) & 0x1ffu;
+	index[2] = (nested_paddr >> 30) & 0x1ffu;
+	index[3] = (nested_paddr >> 39) & 0x1ffu;
+
+	/* Walk down, allocating each lower-level table if not present. */
+	pml4e = vmx->eptp_hva;
+	pdpe = nested_ept_next_table(vm, &pml4e[index[3]], eptp_memslot);
+	pde = nested_ept_next_table(vm, &pdpe[index[2]], eptp_memslot);
+	pte = nested_ept_next_table(vm, &pde[index[1]], eptp_memslot);
+
+	/* Fill in page table entry. */
+	pte[index[0]].address = paddr >> vm->page_shift;
+	pte[index[0]].writable = true;
+	pte[index[0]].readable = true;
+	pte[index[0]].executable = true;
+
+	/*
+	 * For now mark these as accessed and dirty because the only
+	 * testcase we have needs that. Can be reconsidered later.
+	 */
+	pte[index[0]].accessed = true;
+	pte[index[0]].dirty = true;
+}
+
+/*
+ * Map a range of EPT guest physical addresses to the VM's physical address
+ *
+ * Input Args:
+ *   vmx - VMX pages holding the EPT that receives the mappings
+ *   vm - Virtual Machine
+ *   nested_paddr - Nested guest physical address to map
+ *   paddr - VM Physical Address
+ *   size - The size of the range to map, in bytes
+ *   eptp_memslot - Memory region slot for new virtual translation tables
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Within the VM given by vm, creates a nested guest translation for the
+ * page range starting at nested_paddr to the page range starting at paddr.
+ * Only whole pages are mapped: size is divided by the VM page size and any
+ * remainder is ignored.
+ */
+void nested_map(struct vmx_pages *vmx, struct kvm_vm *vm,
+	uint64_t nested_paddr, uint64_t paddr, uint64_t size,
+	uint32_t eptp_memslot)
+{
+	size_t step = vm->page_size;
+	size_t remaining = size / step;
+
+	TEST_ASSERT(nested_paddr + size > nested_paddr, "Vaddr overflow");
+	TEST_ASSERT(paddr + size > paddr, "Paddr overflow");
+
+	for (; remaining; remaining--) {
+		nested_pg_map(vmx, vm, nested_paddr, paddr, eptp_memslot);
+		nested_paddr += step;
+		paddr += step;
+	}
+}
+
+/*
+ * Prepare an identity extended page table that maps all the
+ * physical pages in VM.
+ *
+ * Looks up the region for memslot, then walks the clear bits of its
+ * unused_phy_pages sparsebit inside the region's page-frame range and
+ * maps each such page onto itself, one page per nested_map() call.
+ * New EPT tables are allocated from eptp_memslot.
+ */
+void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t memslot, uint32_t eptp_memslot)
+{
+ sparsebit_idx_t i, last;
+ struct userspace_mem_region *region =
+ memslot2region(vm, memslot);
+
+ /*
+  * Start one below the region's first page frame; the search below is
+  * handed the previous index (presumably exclusive - confirm against
+  * sparsebit_next_clear()).
+  * NOTE(review): if guest_phys_addr is 0 this subtraction wraps around;
+  * verify that case behaves as intended.
+  */
+ i = (region->region.guest_phys_addr >> vm->page_shift) - 1;
+ last = i + (region->region.memory_size >> vm->page_shift);
+ for (;;) {
+ i = sparsebit_next_clear(region->unused_phy_pages, i);
+ if (i > last)
+ break;
+
+ /* Identity map: nested guest physical == VM physical. */
+ nested_map(vmx, vm,
+ (uint64_t)i << vm->page_shift,
+ (uint64_t)i << vm->page_shift,
+ 1 << vm->page_shift,
+ eptp_memslot);
+ }
+}
+
+/*
+ * Allocate one page of guest virtual memory for the EPT root and record
+ * its guest virtual, host virtual and guest physical addresses in vmx.
+ * The eptp_memslot argument is not used by this function.
+ */
+void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot)
+{
+ vmx->eptp = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+ vmx->eptp_hva = addr_gva2hva(vm, (uintptr_t)vmx->eptp);
+ vmx->eptp_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->eptp);
+}
+
+/*
+ * Allocate one page of guest virtual memory for the APIC-access page and
+ * record its guest virtual, host virtual and guest physical addresses in
+ * vmx. The eptp_memslot argument is not used by this function.
+ */
+void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm,
+ uint32_t eptp_memslot)
+{
+ vmx->apic_access = (void *)vm_vaddr_alloc(vm, getpagesize(),
+ 0x10000, 0, 0);
+ vmx->apic_access_hva = addr_gva2hva(vm, (uintptr_t)vmx->apic_access);
+ vmx->apic_access_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->apic_access);
+}
diff --git a/tools/testing/selftests/kvm/s390x/memop.c b/tools/testing/selftests/kvm/s390x/memop.c
new file mode 100644
index 000000000..9f49ead38
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/memop.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test for s390x KVM_S390_MEM_OP
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 1
+
+static uint8_t mem1[65536];
+static uint8_t mem2[65536];
+
+/*
+ * Guest payload: forever copy mem1 into mem2 one byte at a time and
+ * signal the host with GUEST_SYNC(0) after every complete pass.
+ */
+static void guest_code(void)
+{
+	int pos;
+
+	while (1) {
+		for (pos = 0; pos < sizeof(mem2); pos++)
+			mem2[pos] = mem1[pos];
+		GUEST_SYNC(0);
+	}
+}
+
+/*
+ * Exercise KVM_S390_MEM_OP: write a pattern into guest memory, let the
+ * guest copy it, read the copy back and compare, then verify that the
+ * ioctl rejects a series of invalid requests.
+ *
+ * Exits with KSFT_SKIP if KVM_CAP_S390_MEM_OP is not available;
+ * returns 0 when all checks pass (failed checks abort via TEST_ASSERT).
+ */
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	struct kvm_run *run;
+	/* Zeroed so that no uninitialized reserved bytes reach the kernel. */
+	struct kvm_s390_mem_op ksmo = {};
+	int rv, i, maxsize;
+
+	setbuf(stdout, NULL);	/* Tell stdout not to buffer its content */
+
+	/* The capability value is used as the maximum transfer size. */
+	maxsize = kvm_check_cap(KVM_CAP_S390_MEM_OP);
+	if (!maxsize) {
+		print_skip("CAP_S390_MEM_OP not supported");
+		exit(KSFT_SKIP);
+	}
+	if (maxsize > sizeof(mem1))
+		maxsize = sizeof(mem1);
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	run = vcpu_state(vm, VCPU_ID);
+
+	/*
+	 * Fill the host copy with a pattern. Compute in unsigned arithmetic:
+	 * i * i exceeds INT_MAX for the larger indices, and signed overflow
+	 * is undefined behavior. Only the low byte is stored either way.
+	 */
+	for (i = 0; i < sizeof(mem1); i++) {
+		unsigned int v = i;
+
+		mem1[i] = v * v + v;
+	}
+
+	/*
+	 * Set the first array.
+	 * NOTE(review): unlike the accesses after the first vcpu_run(), this
+	 * one passes a translated guest physical address as gaddr - presumably
+	 * because the vCPU state has not been synced yet; confirm before
+	 * changing it.
+	 */
+	ksmo.gaddr = addr_gva2gpa(vm, (uintptr_t)mem1);
+	ksmo.flags = 0;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+
+	/* Let the guest code copy the first array to the second */
+	vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+		    "Unexpected exit reason: %u (%s)\n",
+		    run->exit_reason,
+		    exit_reason_str(run->exit_reason));
+
+	/* Poison the host copy so a successful read is distinguishable. */
+	memset(mem2, 0xaa, sizeof(mem2));
+
+	/* Get the second array */
+	ksmo.gaddr = (uintptr_t)mem2;
+	ksmo.flags = 0;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_READ;
+	ksmo.buf = (uintptr_t)mem2;
+	ksmo.ar = 0;
+	vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+
+	TEST_ASSERT(!memcmp(mem1, mem2, maxsize),
+		    "Memory contents do not match!");
+
+	/* Check error conditions - first bad size: */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = 0;
+	ksmo.size = -1;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && errno == E2BIG, "ioctl allows insane sizes");
+
+	/* Zero size: */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = 0;
+	ksmo.size = 0;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && (errno == EINVAL || errno == ENOMEM),
+		    "ioctl allows 0 as size");
+
+	/* Bad flags: */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = -1;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows all flags");
+
+	/* Bad operation: */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = 0;
+	ksmo.size = maxsize;
+	ksmo.op = -1;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows bad operations");
+
+	/* Bad guest address (a positive return value is expected here): */
+	ksmo.gaddr = ~0xfffUL;
+	ksmo.flags = KVM_S390_MEMOP_F_CHECK_ONLY;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv > 0, "ioctl does not report bad guest memory access");
+
+	/* Bad host address: */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = 0;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = 0;
+	ksmo.ar = 0;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && errno == EFAULT,
+		    "ioctl does not report bad host memory address");
+
+	/* Bad access register: */
+	run->psw_mask &= ~(3UL << (63 - 17));
+	run->psw_mask |= 1UL << (63 - 17);	/* Enable AR mode */
+	vcpu_run(vm, VCPU_ID);	/* To sync new state to SIE block */
+	ksmo.gaddr = (uintptr_t)mem1;
+	ksmo.flags = 0;
+	ksmo.size = maxsize;
+	ksmo.op = KVM_S390_MEMOP_LOGICAL_WRITE;
+	ksmo.buf = (uintptr_t)mem1;
+	ksmo.ar = 17;
+	rv = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_MEM_OP, &ksmo);
+	TEST_ASSERT(rv == -1 && errno == EINVAL, "ioctl allows ARs > 15");
+	run->psw_mask &= ~(3UL << (63 - 17));	/* Disable AR mode */
+	vcpu_run(vm, VCPU_ID);	/* Run to sync new state */
+
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/s390x/resets.c b/tools/testing/selftests/kvm/s390x/resets.c
new file mode 100644
index 000000000..b143db6d8
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/resets.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test for s390x CPU resets
+ *
+ * Copyright (C) 2020, IBM
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 3
+#define LOCAL_IRQS 32
+
+struct kvm_s390_irq buf[VCPU_ID + LOCAL_IRQS];
+
+struct kvm_vm *vm;
+struct kvm_run *run;
+struct kvm_sync_regs *sync_regs;
+static uint8_t regs_null[512];
+
+/*
+ * Guest payload that dirties register state so the host can later tell
+ * which registers a reset cleared: it sets the FPC, loads control
+ * registers 2, 8, 10 and 11, writes GPRs 0-3, FPR 0 and access register
+ * 9, and then issues diag 0x501 to hand control back to the host.
+ */
+static void guest_code_initial(void)
+{
+ /* set several CRs to "safe" value */
+ unsigned long cr2_59 = 0x10; /* enable guarded storage */
+ unsigned long cr8_63 = 0x1; /* monitor mask = 1 */
+ unsigned long cr10 = 1; /* PER START */
+ unsigned long cr11 = -1; /* PER END */
+
+
+ /* Dirty registers */
+ asm volatile (
+ " lghi 2,0x11\n" /* Round toward 0 */
+ " sfpc 2\n" /* set fpc to !=0 */
+ " lctlg 2,2,%0\n"
+ " lctlg 8,8,%1\n"
+ " lctlg 10,10,%2\n"
+ " lctlg 11,11,%3\n"
+ /* now clobber some general purpose regs */
+ " llihh 0,0xffff\n"
+ " llihl 1,0x5555\n"
+ " llilh 2,0xaaaa\n"
+ " llill 3,0x0000\n"
+ /* now clobber a floating point reg */
+ " lghi 4,0x1\n"
+ " cdgbr 0,4\n"
+ /* now clobber an access reg */
+ " sar 9,4\n"
+ /* We embed diag 501 here to control register content */
+ " diag 0,0,0x501\n"
+ :
+ : "m" (cr2_59), "m" (cr8_63), "m" (cr10), "m" (cr11)
+ /* no clobber list as this should not return */
+ );
+}
+
+/*
+ * Fetch the register identified by @id from the test vCPU through
+ * vcpu_get_reg() and assert that it holds @value.
+ */
+static void test_one_reg(uint64_t id, uint64_t value)
+{
+	uint64_t eval_reg;
+	struct kvm_one_reg reg = {
+		.id = id,
+		.addr = (uintptr_t)&eval_reg,
+	};
+
+	vcpu_get_reg(vm, VCPU_ID, &reg);
+	TEST_ASSERT(eval_reg == value, "value == 0x%lx", value);
+}
+
+/*
+ * Assert that the test vCPU has no pending interrupts by retrieving its
+ * IRQ state into the global buf and checking that nothing was returned.
+ */
+static void assert_noirq(void)
+{
+	/*
+	 * Designated initializer so that the fields not set here are zero
+	 * instead of uninitialized stack contents.
+	 */
+	struct kvm_s390_irq_state irq_state = {
+		.buf = (unsigned long)buf,
+		.len = sizeof(buf),
+	};
+	int irqs;
+
+	irqs = _vcpu_ioctl(vm, VCPU_ID, KVM_S390_GET_IRQ_STATE, &irq_state);
+	/*
+	 * irqs contains the number of retrieved interrupts. Any interrupt
+	 * (notably, the emergency call interrupt we have injected) should
+	 * be cleared by the resets, so this should be 0.
+	 */
+	TEST_ASSERT(irqs >= 0, "Could not fetch IRQs: errno %d\n", errno);
+	TEST_ASSERT(!irqs, "IRQ pending");
+}
+
+/*
+ * Check the registers that must be zero after a clear reset: the GPRs,
+ * access registers and FPRs as read with vcpu_regs_get(),
+ * vcpu_sregs_get() and vcpu_fpu_get(), plus the gprs, acrs and vrs arrays
+ * in the shared sync_regs area. Each is compared against the all-zero
+ * regs_null buffer.
+ */
+static void assert_clear(void)
+{
+ struct kvm_sregs sregs;
+ struct kvm_regs regs;
+ struct kvm_fpu fpu;
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(!memcmp(&regs.gprs, regs_null, sizeof(regs.gprs)), "grs == 0");
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(!memcmp(&sregs.acrs, regs_null, sizeof(sregs.acrs)), "acrs == 0");
+
+ vcpu_fpu_get(vm, VCPU_ID, &fpu);
+ TEST_ASSERT(!memcmp(&fpu.fprs, regs_null, sizeof(fpu.fprs)), "fprs == 0");
+
+ /* sync regs */
+ TEST_ASSERT(!memcmp(sync_regs->gprs, regs_null, sizeof(sync_regs->gprs)),
+ "gprs0-15 == 0 (sync_regs)");
+
+ TEST_ASSERT(!memcmp(sync_regs->acrs, regs_null, sizeof(sync_regs->acrs)),
+ "acrs0-15 == 0 (sync_regs)");
+
+ TEST_ASSERT(!memcmp(sync_regs->vrs, regs_null, sizeof(sync_regs->vrs)),
+ "vrs0-15 == 0 (sync_regs)");
+}
+
+/*
+ * Check, via sync_regs, that GPRs 0-3, FPR 0 and access register 9 still
+ * hold the values written by guest_code_initial(), i.e. that the reset
+ * under test did not clear them.
+ */
+static void assert_initial_noclear(void)
+{
+ TEST_ASSERT(sync_regs->gprs[0] == 0xffff000000000000UL,
+ "gpr0 == 0xffff000000000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[1] == 0x0000555500000000UL,
+ "gpr1 == 0x0000555500000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[2] == 0x00000000aaaa0000UL,
+ "gpr2 == 0x00000000aaaa0000 (sync_regs)");
+ TEST_ASSERT(sync_regs->gprs[3] == 0x0000000000000000UL,
+ "gpr3 == 0x0000000000000000 (sync_regs)");
+ TEST_ASSERT(sync_regs->fprs[0] == 0x3ff0000000000000UL,
+ "fpr0 == 0f1 (sync_regs)");
+ TEST_ASSERT(sync_regs->acrs[9] == 1, "ar9 == 1 (sync_regs)");
+}
+
+/*
+ * Check the state expected after an initial (or clear) reset: control
+ * registers (cr0 == 0xE0, cr14 == 0xC2000000, all others zero), FPC,
+ * TOD programmable register, CPU timer, clock comparator, program
+ * parameter, breaking-event address and PSW. Values are checked through
+ * KVM_GET_SREGS, the shared sync_regs area, kvm_run, the FPU state and
+ * the ONE_REG interface.
+ */
+static void assert_initial(void)
+{
+	struct kvm_sregs sregs;
+	struct kvm_fpu fpu;
+
+	/* KVM_GET_SREGS */
+	vcpu_sregs_get(vm, VCPU_ID, &sregs);
+	TEST_ASSERT(sregs.crs[0] == 0xE0UL, "cr0 == 0xE0 (KVM_GET_SREGS)");
+	TEST_ASSERT(sregs.crs[14] == 0xC2000000UL,
+		    "cr14 == 0xC2000000 (KVM_GET_SREGS)");
+	/* cr1 through cr13 is 13 registers; the original only covered 12. */
+	TEST_ASSERT(!memcmp(&sregs.crs[1], regs_null, sizeof(sregs.crs[1]) * 13),
+		    "cr1-13 == 0 (KVM_GET_SREGS)");
+	TEST_ASSERT(sregs.crs[15] == 0, "cr15 == 0 (KVM_GET_SREGS)");
+
+	/* sync regs */
+	TEST_ASSERT(sync_regs->crs[0] == 0xE0UL, "cr0 == 0xE0 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[14] == 0xC2000000UL,
+		    "cr14 == 0xC2000000 (sync_regs)");
+	TEST_ASSERT(!memcmp(&sync_regs->crs[1], regs_null,
+			    sizeof(sync_regs->crs[1]) * 13),
+		    "cr1-13 == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[15] == 0, "cr15 == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->fpc == 0, "fpc == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->todpr == 0, "todpr == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->cputm == 0, "cputm == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->ckc == 0, "ckc == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->pp == 0, "pp == 0 (sync_regs)");
+	TEST_ASSERT(sync_regs->gbea == 1, "gbea == 1 (sync_regs)");
+
+	/* kvm_run */
+	TEST_ASSERT(run->psw_addr == 0, "psw_addr == 0 (kvm_run)");
+	TEST_ASSERT(run->psw_mask == 0, "psw_mask == 0 (kvm_run)");
+
+	vcpu_fpu_get(vm, VCPU_ID, &fpu);
+	TEST_ASSERT(!fpu.fpc, "fpc == 0");
+
+	test_one_reg(KVM_REG_S390_GBEA, 1);
+	test_one_reg(KVM_REG_S390_PP, 0);
+	test_one_reg(KVM_REG_S390_TODPR, 0);
+	test_one_reg(KVM_REG_S390_CPU_TIMER, 0);
+	test_one_reg(KVM_REG_S390_CLOCK_COMP, 0);
+}
+
+/*
+ * Check, via sync_regs, that control registers 2, 8, 10 and 11 still
+ * hold the values loaded by guest_code_initial(), i.e. that a normal
+ * reset left them untouched.
+ */
+static void assert_normal_noclear(void)
+{
+	TEST_ASSERT(sync_regs->crs[2] == 0x10, "cr2 == 0x10 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[8] == 1, "cr8 == 1 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[10] == 1, "cr10 == 1 (sync_regs)");
+	TEST_ASSERT(sync_regs->crs[11] == -1, "cr11 == -1 (sync_regs)");
+}
+
+/*
+ * Check the state every reset type tested here must establish: the
+ * pfault token reads as KVM_S390_PFAULT_TOKEN_INVALID (both through
+ * ONE_REG and sync_regs) and no interrupt is pending.
+ */
+static void assert_normal(void)
+{
+ test_one_reg(KVM_REG_S390_PFTOKEN, KVM_S390_PFAULT_TOKEN_INVALID);
+ TEST_ASSERT(sync_regs->pft == KVM_S390_PFAULT_TOKEN_INVALID,
+ "pft == 0xff..... (sync_regs)");
+ assert_noirq();
+}
+
+/*
+ * Inject one emergency-signal interrupt into vCPU @cpu_id using
+ * KVM_S390_SET_IRQ_STATE. @cpu_id is also used as the interrupt's
+ * emergency code. The interrupt is staged in the first element of the
+ * global buf.
+ */
+static void inject_irq(int cpu_id)
+{
+	/*
+	 * Designated initializer so that the fields not set here are zero
+	 * instead of uninitialized stack contents.
+	 */
+	struct kvm_s390_irq_state irq_state = {
+		.buf = (unsigned long)buf,
+		.len = sizeof(struct kvm_s390_irq),
+	};
+	struct kvm_s390_irq *irq = &buf[0];
+	int irqs;
+
+	/* Inject IRQ */
+	irq->type = KVM_S390_INT_EMERGENCY;
+	irq->u.emerg.code = cpu_id;
+	irqs = _vcpu_ioctl(vm, cpu_id, KVM_S390_SET_IRQ_STATE, &irq_state);
+	TEST_ASSERT(irqs >= 0, "Error injecting EMERGENCY IRQ errno %d\n", errno);
+}
+
+/*
+ * Normal reset test: run the register-dirtying guest once, inject an
+ * interrupt, issue KVM_S390_NORMAL_RESET, then verify what must have been
+ * cleared and what must have been preserved.
+ */
+static void test_normal(void)
+{
+ pr_info("Testing normal reset\n");
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_NORMAL_RESET, 0);
+
+ /* must clears */
+ assert_normal();
+ /* must not clears */
+ assert_normal_noclear();
+ assert_initial_noclear();
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Initial reset test: run the register-dirtying guest once, inject an
+ * interrupt, issue KVM_S390_INITIAL_RESET, then verify what must have
+ * been cleared and what must have been preserved.
+ */
+static void test_initial(void)
+{
+ pr_info("Testing initial reset\n");
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_INITIAL_RESET, 0);
+
+ /* must clears */
+ assert_normal();
+ assert_initial();
+ /* must not clears */
+ assert_initial_noclear();
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Clear reset test: run the register-dirtying guest once, inject an
+ * interrupt, issue KVM_S390_CLEAR_RESET, then verify that everything the
+ * normal, initial and clear checks cover has been reset.
+ */
+static void test_clear(void)
+{
+ pr_info("Testing clear reset\n");
+ vm = vm_create_default(VCPU_ID, 0, guest_code_initial);
+ run = vcpu_state(vm, VCPU_ID);
+ sync_regs = &run->s.regs;
+
+ vcpu_run(vm, VCPU_ID);
+
+ inject_irq(VCPU_ID);
+
+ vcpu_ioctl(vm, VCPU_ID, KVM_S390_CLEAR_RESET, 0);
+
+ /* must clears */
+ assert_normal();
+ assert_initial();
+ assert_clear();
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Always run the initial reset test; run the normal and clear reset tests
+ * only when KVM reports KVM_CAP_S390_VCPU_RESETS.
+ */
+int main(int argc, char *argv[])
+{
+ setbuf(stdout, NULL); /* Tell stdout not to buffer its content */
+
+ test_initial();
+ if (kvm_check_cap(KVM_CAP_S390_VCPU_RESETS)) {
+ test_normal();
+ test_clear();
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
new file mode 100644
index 000000000..5731ccf34
--- /dev/null
+++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for s390x KVM_CAP_SYNC_REGS
+ *
+ * Based on the same test for x86:
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Adaptions for s390x:
+ * Copyright (C) 2019, Red Hat, Inc.
+ *
+ * Test expected behavior of the KVM_CAP_SYNC_REGS functionality.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+
+#define VCPU_ID 5
+
+/*
+ * Guest payload: loop forever issuing diag 0x501 and then adding 1 to
+ * r11, so the host sees r11 grow by one for each additional diag exit.
+ */
+static void guest_code(void)
+{
+ /*
+ * We embed diag 501 here instead of doing a ucall to avoid that
+ * the compiler has messed with r11 at the time of the ucall.
+ */
+ asm volatile (
+ "0: diag 0,0,0x501\n"
+ " ahi 11,1\n"
+ " j 0b\n"
+ );
+}
+
+/*
+ * Assert that field @reg has the same value in *left and *right, which
+ * must be in scope at the point of use. REG_COMPARE prints the values as
+ * 64-bit quantities, REG_COMPARE32 as 32-bit ones.
+ */
+#define REG_COMPARE(reg) \
+	TEST_ASSERT(left->reg == right->reg, \
+		    "Register " #reg \
+		    " values did not match: 0x%llx, 0x%llx\n", \
+		    left->reg, right->reg)
+
+#define REG_COMPARE32(reg) \
+	TEST_ASSERT(left->reg == right->reg, \
+		    "Register " #reg \
+		    " values did not match: 0x%x, 0x%x\n", \
+		    left->reg, right->reg)
+
+
+/* Assert that all 16 GPRs from KVM_GET_REGS match those in sync_regs. */
+static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right)
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		REG_COMPARE(gprs[i]);
+}
+
+/*
+ * Assert that all 16 access registers and all 16 control registers from
+ * KVM_GET_SREGS match those in sync_regs.
+ */
+static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right)
+{
+	int i;
+
+	for (i = 0; i < 16; i++)
+		REG_COMPARE32(acrs[i]);
+
+	for (i = 0; i < 16; i++)
+		REG_COMPARE(crs[i]);
+}
+
+/* Both helper macros are local to the comparison functions above. */
+#undef REG_COMPARE
+#undef REG_COMPARE32
+
+#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS)
+#define INVALID_SYNC_FIELD 0x80000000
+
+/*
+ * Test KVM_CAP_SYNC_REGS on s390x:
+ *  - invalid bits in kvm_valid_regs / kvm_dirty_regs make KVM_RUN fail
+ *    with EINVAL;
+ *  - with valid bits, the register values in kvm_run match those returned
+ *    by KVM_GET_REGS / KVM_GET_SREGS;
+ *  - values written to kvm_run and flagged dirty reach the guest;
+ *  - values written to kvm_run but not flagged dirty are overwritten.
+ *
+ * Exits with KSFT_SKIP if the capability is missing; returns 0 when all
+ * checks pass.
+ */
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ int rv, cap;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ if (!cap) {
+ print_skip("CAP_SYNC_REGS not supported");
+ exit(KSFT_SKIP);
+ }
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Request reading invalid register set from VCPU. */
+ run->kvm_valid_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ /* Clear the bad request again before the next attempt. */
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ /* Request setting invalid register set into VCPU. */
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ /* Request and verify all valid register sets. */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ /* The exit must be the interception of the guest's diag 0x501. */
+ TEST_ASSERT(run->s390_sieic.icptcode == 4 &&
+ (run->s390_sieic.ipa >> 8) == 0x83 &&
+ (run->s390_sieic.ipb >> 16) == 0x501,
+ "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n",
+ run->s390_sieic.icptcode, run->s390_sieic.ipa,
+ run->s390_sieic.ipb);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs);
+
+ /* Set and verify various register values */
+ run->s.regs.gprs[11] = 0xBAD1DEA;
+ run->s.regs.acrs[0] = 1 << 11;
+
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ /* The guest adds 1 to r11 between two diag exits. */
+ TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.gprs[11]);
+ TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11,
+ "acr0 sync regs value incorrect 0x%x.",
+ run->s.regs.acrs[0]);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs);
+
+ /* Clear kvm_dirty_regs bits, verify new s.regs values are
+ * overwritten with existing guest values.
+ */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.gprs[11] = 0xDEADBEEF;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC,
+ "Unexpected exit reason: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF,
+ "r11 sync regs value incorrect 0x%llx.",
+ run->s.regs.gprs[11]);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/set_memory_region_test.c b/tools/testing/selftests/kvm/set_memory_region_test.c
new file mode 100644
index 000000000..6f441dd9f
--- /dev/null
+++ b/tools/testing/selftests/kvm/set_memory_region_test.c
@@ -0,0 +1,417 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <pthread.h>
+#include <sched.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/compiler.h>
+
+#include <test_util.h>
+#include <kvm_util.h>
+#include <processor.h>
+
+#define VCPU_ID 0
+
+/*
+ * s390x needs at least 1MB alignment, and the x86_64 MOVE/DELETE tests need a
+ * 2MB sized and aligned region so that the initial region corresponds to
+ * exactly one large page.
+ */
+#define MEM_REGION_SIZE 0x200000
+
+#ifdef __x86_64__
+/*
+ * Somewhat arbitrary location and slot, intended to not overlap anything.
+ */
+#define MEM_REGION_GPA 0xc0000000
+#define MEM_REGION_SLOT 10
+
+static const uint64_t MMIO_VAL = 0xbeefull;
+
+extern const uint64_t final_rip_start;
+extern const uint64_t final_rip_end;
+
+static sem_t vcpu_ready;
+
+/*
+ * Guest helper: poll the 64-bit word at MEM_REGION_GPA until it differs
+ * from @spin_val, then signal the host with GUEST_SYNC(0) and return the
+ * value that ended the wait.
+ */
+static inline uint64_t guest_spin_on_val(uint64_t spin_val)
+{
+	uint64_t seen;
+
+	for (;;) {
+		seen = READ_ONCE(*((uint64_t *)MEM_REGION_GPA));
+		if (seen != spin_val)
+			break;
+	}
+
+	GUEST_SYNC(0);
+	return seen;
+}
+
+/*
+ * vCPU thread body. @data is the struct kvm_vm to run.
+ *
+ * Runs the vCPU in a loop: a UCALL_SYNC posts the vcpu_ready semaphore,
+ * an MMIO read of MEM_REGION_GPA is answered with MMIO_VAL, and any other
+ * exit ends the loop. Fails the test if the guest reported UCALL_ABORT.
+ * Always returns NULL.
+ */
+static void *vcpu_worker(void *data)
+{
+ struct kvm_vm *vm = data;
+ struct kvm_run *run;
+ struct ucall uc;
+ uint64_t cmd;
+
+ /*
+ * Loop until the guest is done. Re-enter the guest on all MMIO exits,
+ * which will occur if the guest attempts to access a memslot after it
+ * has been deleted or while it is being moved.
+ */
+ run = vcpu_state(vm, VCPU_ID);
+
+ while (1) {
+ vcpu_run(vm, VCPU_ID);
+
+ if (run->exit_reason == KVM_EXIT_IO) {
+ cmd = get_ucall(vm, VCPU_ID, &uc);
+ if (cmd != UCALL_SYNC)
+ break;
+
+ sem_post(&vcpu_ready);
+ continue;
+ }
+
+ if (run->exit_reason != KVM_EXIT_MMIO)
+ break;
+
+ TEST_ASSERT(!run->mmio.is_write, "Unexpected exit mmio write");
+ TEST_ASSERT(run->mmio.len == 8,
+ "Unexpected exit mmio size = %u", run->mmio.len);
+
+ TEST_ASSERT(run->mmio.phys_addr == MEM_REGION_GPA,
+ "Unexpected exit mmio address = 0x%llx",
+ run->mmio.phys_addr);
+ /* Complete the guest's read with the marker value. */
+ memcpy(run->mmio.data, &MMIO_VAL, 8);
+ }
+
+ /* cmd is only read when the last exit was a ucall, i.e. when it was set. */
+ if (run->exit_reason == KVM_EXIT_IO && cmd == UCALL_ABORT)
+ TEST_FAIL("%s at %s:%ld, val = %lu", (const char *)uc.args[0],
+ __FILE__, uc.args[1], uc.args[2]);
+
+ return NULL;
+}
+
+/*
+ * Wait up to two seconds for the vCPU thread to post vcpu_ready (it does
+ * so on each guest sync), failing the test on timeout, then sleep 100ms
+ * to give the thread time to get back into the guest.
+ */
+static void wait_for_vcpu(void)
+{
+ struct timespec ts;
+
+ TEST_ASSERT(!clock_gettime(CLOCK_REALTIME, &ts),
+ "clock_gettime() failed: %d\n", errno);
+
+ /* sem_timedwait() takes an absolute CLOCK_REALTIME deadline. */
+ ts.tv_sec += 2;
+ TEST_ASSERT(!sem_timedwait(&vcpu_ready, &ts),
+ "sem_timedwait() failed: %d\n", errno);
+
+ /* Wait for the vCPU thread to reenter the guest. */
+ usleep(100000);
+}
+
+/*
+ * Create a VM running @guest_code, add the test memory region at
+ * MEM_REGION_GPA in slot MEM_REGION_SLOT, map and zero its first two
+ * pages, and start the vCPU thread.
+ *
+ * @vcpu_thread receives the handle of the created thread, which the
+ * caller must join. Returns the new VM once the guest has performed its
+ * first sync.
+ */
+static struct kvm_vm *spawn_vm(pthread_t *vcpu_thread, void *guest_code)
+{
+	struct kvm_vm *vm;
+	uint64_t *hva;
+	uint64_t gpa;
+	int ret;
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+				    MEM_REGION_GPA, MEM_REGION_SLOT,
+				    MEM_REGION_SIZE / getpagesize(), 0);
+
+	/*
+	 * Allocate and map two pages so that the GPA accessed by guest_code()
+	 * stays valid across the memslot move.
+	 */
+	gpa = vm_phy_pages_alloc(vm, 2, MEM_REGION_GPA, MEM_REGION_SLOT);
+	TEST_ASSERT(gpa == MEM_REGION_GPA, "Failed vm_phy_pages_alloc\n");
+
+	virt_map(vm, MEM_REGION_GPA, MEM_REGION_GPA, 2, 0);
+
+	/* Ditto for the host mapping so that both pages can be zeroed. */
+	hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+	memset(hva, 0, 2 * 4096);
+
+	/*
+	 * pthread_create() returns an error number rather than setting
+	 * errno. Fail here instead of timing out on the semaphore and later
+	 * joining a thread that was never created.
+	 */
+	ret = pthread_create(vcpu_thread, NULL, vcpu_worker, vm);
+	TEST_ASSERT(!ret, "pthread_create() failed: %d\n", ret);
+
+	/* Ensure the guest thread is spun up. */
+	wait_for_vcpu();
+
+	return vm;
+}
+
+
+/*
+ * Guest side of the memslot MOVE test. After an initial sync it watches
+ * the word at MEM_REGION_GPA through four phases (via
+ * guest_spin_on_val(), which syncs with the host after each one) and
+ * asserts that every observed value is one that phase allows.
+ */
+static void guest_code_move_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /*
+ * Spin until the memory region starts getting moved to a
+ * misaligned address.
+ * Every region move may or may not trigger MMIO, as the
+ * window where the memslot is invalid is usually quite small.
+ */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+ /* Spin until the misaligning memory region move completes. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 1 || val == 0, val);
+
+ /* Spin until the memory region starts to get re-aligned. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == 1 || val == MMIO_VAL, val);
+
+ /* Spin until the re-aligning memory region move completes. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 1, val);
+
+ GUEST_DONE();
+}
+
+/*
+ * Host side of the memslot MOVE test: move the in-use region down by one
+ * page, write values the guest must or must not observe, move the region
+ * back, and wait for the guest's syncs before joining the vCPU thread.
+ */
+static void test_move_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_vm *vm;
+ uint64_t *hva;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_move_memory_region);
+
+ hva = addr_gpa2hva(vm, MEM_REGION_GPA);
+
+ /*
+ * Shift the region's base GPA. The guest should not see "2" as the
+ * hva->gpa translation is misaligned, i.e. the guest is accessing a
+ * different host pfn.
+ */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA - 4096);
+ WRITE_ONCE(*hva, 2);
+
+ /*
+ * The guest _might_ see an invalid memslot and trigger MMIO, but it's
+ * a tiny window. Spin and defer the sync until the memslot is
+ * restored and guest behavior is once again deterministic.
+ */
+ usleep(100000);
+
+ /*
+ * Note, value in memory needs to be changed *before* restoring the
+ * memslot, else the guest could race the update and see "2".
+ */
+ WRITE_ONCE(*hva, 1);
+
+ /* Restore the original base, the guest should see "1". */
+ vm_mem_region_move(vm, MEM_REGION_SLOT, MEM_REGION_GPA);
+ wait_for_vcpu();
+ /* Deferred sync from when the memslot was misaligned (above). */
+ wait_for_vcpu();
+
+ pthread_join(vcpu_thread, NULL);
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Guest side of the memslot DELETE test. After an initial sync it follows
+ * the word at MEM_REGION_GPA through delete, recreate and delete again,
+ * then spins in a final loop that is bracketed by the final_rip_start and
+ * final_rip_end markers emitted into .rodata. The trailing assertion is
+ * only reached if that final spin returns.
+ */
+static void guest_code_delete_memory_region(void)
+{
+ uint64_t val;
+
+ GUEST_SYNC(0);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ /* Spin until the memory region is recreated. */
+ val = guest_spin_on_val(MMIO_VAL);
+ GUEST_ASSERT_1(val == 0, val);
+
+ /* Spin until the memory region is deleted. */
+ val = guest_spin_on_val(0);
+ GUEST_ASSERT_1(val == MMIO_VAL, val);
+
+ /* Record the code address at this point as final_rip_start. */
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_start\n\t"
+ "final_rip_start: .quad 1b\n\t"
+ ".popsection");
+
+ /* Spin indefinitely (until the code memslot is deleted). */
+ guest_spin_on_val(MMIO_VAL);
+
+ /* Record the code address at this point as final_rip_end. */
+ asm("1:\n\t"
+ ".pushsection .rodata\n\t"
+ ".global final_rip_end\n\t"
+ "final_rip_end: .quad 1b\n\t"
+ ".popsection");
+
+ GUEST_ASSERT_1(0, 0);
+}
+
+/*
+ * Host side of the memslot DELETE test: delete, recreate and delete the
+ * test region again while the guest is accessing it, then delete memslot
+ * 0 and check that the vCPU stops with a shutdown or internal error. For
+ * an internal error, the guest RIP must lie between final_rip_start and
+ * final_rip_end.
+ */
+static void test_delete_memory_region(void)
+{
+ pthread_t vcpu_thread;
+ struct kvm_regs regs;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ vm = spawn_vm(&vcpu_thread, guest_code_delete_memory_region);
+
+ /* Delete the memory region, the guest should not die. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /* Recreate the memory region. The guest should see "0". */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS_THP,
+ MEM_REGION_GPA, MEM_REGION_SLOT,
+ MEM_REGION_SIZE / getpagesize(), 0);
+ wait_for_vcpu();
+
+ /* Delete the region again so that there's only one memslot left. */
+ vm_mem_region_delete(vm, MEM_REGION_SLOT);
+ wait_for_vcpu();
+
+ /*
+ * Delete the primary memslot. This should cause an emulation error or
+ * shutdown due to the page tables getting nuked.
+ */
+ vm_mem_region_delete(vm, 0);
+
+ /* The worker leaves its loop on any exit that is not a sync or MMIO. */
+ pthread_join(vcpu_thread, NULL);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN ||
+ run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit reason = %d", run->exit_reason);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+
+ /*
+ * On AMD, after KVM_EXIT_SHUTDOWN the VMCB has been reinitialized already,
+ * so the instruction pointer would point to the reset vector.
+ */
+ if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR)
+ TEST_ASSERT(regs.rip >= final_rip_start &&
+ regs.rip < final_rip_end,
+ "Bad rip, expected 0x%lx - 0x%lx, got 0x%llx\n",
+ final_rip_start, final_rip_end, regs.rip);
+
+ kvm_vm_free(vm);
+}
+
+/*
+ * Run a vCPU in a VM to which this test adds no memory regions and check
+ * that KVM_RUN ends with KVM_EXIT_INTERNAL_ERROR.
+ */
+static void test_zero_memory_regions(void)
+{
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+
+ pr_info("Testing KVM_RUN with zero added memory regions\n");
+
+ vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+
+ TEST_ASSERT(!ioctl(vm_get_fd(vm), KVM_SET_NR_MMU_PAGES, 64),
+ "KVM_SET_NR_MMU_PAGES failed, errno = %d\n", errno);
+ vcpu_run(vm, VCPU_ID);
+
+ run = vcpu_state(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_INTERNAL_ERROR,
+ "Unexpected exit_reason = %u\n", run->exit_reason);
+
+ kvm_vm_free(vm);
+}
+#endif /* __x86_64__ */
+
+/*
+ * Test that memory slots can be added up to the limit reported by
+ * KVM_CAP_NR_MEMSLOTS, and that an attempt to add one more slot fails
+ * with EINVAL.
+ */
+static void test_add_max_memory_regions(void)
+{
+	int ret;
+	struct kvm_vm *vm;
+	uint32_t max_mem_slots;
+	uint32_t slot;
+	uint64_t guest_addr = 0x0;
+	uint64_t mem_reg_npages;
+	void *mem;
+
+	max_mem_slots = kvm_check_cap(KVM_CAP_NR_MEMSLOTS);
+	TEST_ASSERT(max_mem_slots > 0,
+		    "KVM_CAP_NR_MEMSLOTS should be greater than 0");
+	/* max_mem_slots is unsigned, so print it with %u. */
+	pr_info("Allowed number of memory slots: %u\n", max_mem_slots);
+
+	vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
+
+	mem_reg_npages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, MEM_REGION_SIZE);
+
+	/* Check that slots can be added up to the maximum allowed */
+	pr_info("Adding slots 0..%u, each memory region with %dK size\n",
+		(max_mem_slots - 1), MEM_REGION_SIZE >> 10);
+	for (slot = 0; slot < max_mem_slots; slot++) {
+		vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+					    guest_addr, slot, mem_reg_npages,
+					    0);
+		guest_addr += MEM_REGION_SIZE;
+	}
+
+	/*
+	 * Check that no slot can be added beyond the limit. After the loop,
+	 * slot == max_mem_slots and guest_addr is the first address past the
+	 * last region, so only the slot number is out of range.
+	 */
+	mem = mmap(NULL, MEM_REGION_SIZE, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	TEST_ASSERT(mem != MAP_FAILED, "Failed to mmap() host");
+
+	ret = ioctl(vm_get_fd(vm), KVM_SET_USER_MEMORY_REGION,
+		    &(struct kvm_userspace_memory_region) {
+			.slot = slot,
+			.flags = 0,
+			.guest_phys_addr = guest_addr,
+			.memory_size = MEM_REGION_SIZE,
+			.userspace_addr = (uint64_t) mem,
+		    });
+	TEST_ASSERT(ret == -1 && errno == EINVAL,
+		    "Adding one more memory slot should fail with EINVAL");
+
+	munmap(mem, MEM_REGION_SIZE);
+	kvm_vm_free(vm);
+}
+
+/*
+ * Run the memory region tests. The max-memslots test runs on every
+ * architecture; the zero-memslot, MOVE and DELETE tests are built for
+ * x86_64 only. On x86_64 an optional first argument gives the number of
+ * MOVE/DELETE iterations (default 10); it is parsed with atoi(), so it is
+ * not validated.
+ */
+int main(int argc, char *argv[])
+{
+#ifdef __x86_64__
+ int i, loops;
+#endif
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+#ifdef __x86_64__
+ /*
+ * FIXME: the zero-memslot test fails on aarch64 and s390x because
+ * KVM_RUN fails with ENOEXEC or EFAULT.
+ */
+ test_zero_memory_regions();
+#endif
+
+ test_add_max_memory_regions();
+
+#ifdef __x86_64__
+ if (argc > 1)
+ loops = atoi(argv[1]);
+ else
+ loops = 10;
+
+ pr_info("Testing MOVE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_move_memory_region();
+
+ pr_info("Testing DELETE of in-use region, %d loops\n", loops);
+ for (i = 0; i < loops; i++)
+ test_delete_memory_region();
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/steal_time.c b/tools/testing/selftests/kvm/steal_time.c
new file mode 100644
index 000000000..7daedee3e
--- /dev/null
+++ b/tools/testing/selftests/kvm/steal_time.c
@@ -0,0 +1,352 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * steal/stolen time test
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+#include <linux/kernel.h>
+#include <sys/syscall.h>
+#include <asm/kvm.h>
+#include <asm/kvm_para.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define NR_VCPUS 4
+#define ST_GPA_BASE (1 << 30)
+#define MIN_RUN_DELAY_NS 200000UL
+
+static void *st_gva[NR_VCPUS];
+static uint64_t guest_stolen_time[NR_VCPUS];
+
+#if defined(__x86_64__)
+
+/* steal_time must have 64-byte alignment */
+#define STEAL_TIME_SIZE ((sizeof(struct kvm_steal_time) + 63) & ~63)
+
+/*
+ * Guest-side sanity check of a steal-time record: asserts that the low bit of
+ * 'version' is clear and that both 'flags' and 'preempted' read as zero.
+ */
+static void check_status(struct kvm_steal_time *st)
+{
+ GUEST_ASSERT(!(READ_ONCE(st->version) & 1));
+ GUEST_ASSERT(READ_ONCE(st->flags) == 0);
+ GUEST_ASSERT(READ_ONCE(st->preempted) == 0);
+}
+
+/*
+ * Guest entry point; 'cpu' indexes st_gva[] and guest_stolen_time[].
+ *
+ * Checks that MSR_KVM_STEAL_TIME holds this VCPU's record address with the
+ * enable bit set, zeroes the record, and then publishes st->steal into
+ * guest_stolen_time[cpu] twice: once before GUEST_SYNC(1) and once before
+ * GUEST_DONE().
+ */
+static void guest_code(int cpu)
+{
+ struct kvm_steal_time *st = st_gva[cpu];
+ uint32_t version;
+
+ GUEST_ASSERT(rdmsr(MSR_KVM_STEAL_TIME) == ((uint64_t)st_gva[cpu] | KVM_MSR_ENABLED));
+
+ memset(st, 0, sizeof(*st));
+ GUEST_SYNC(0);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+ /* Remember the version so the next stage can check that it advanced. */
+ version = READ_ONCE(st->version);
+ check_status(st);
+ GUEST_SYNC(1);
+
+ check_status(st);
+ GUEST_ASSERT(version < READ_ONCE(st->version));
+ WRITE_ONCE(guest_stolen_time[cpu], st->steal);
+ check_status(st);
+ GUEST_DONE();
+}
+
+/*
+ * Host-side setup of steal time for all NR_VCPUS VCPUs.
+ *
+ * Skips the test (KSFT_SKIP) when KVM_FEATURE_STEAL_TIME is not set in the
+ * supported KVM_CPUID_FEATURES leaf.  For each VCPU it assigns a record
+ * address inside the ST_GPA_BASE region, syncs that address to the guest,
+ * checks that an MSR value with reserved bits set is not accepted, and then
+ * enables steal time at the valid address.
+ */
+static void steal_time_init(struct kvm_vm *vm)
+{
+ int i;
+
+ if (!(kvm_get_supported_cpuid_entry(KVM_CPUID_FEATURES)->eax &
+ KVM_FEATURE_STEAL_TIME)) {
+ print_skip("steal-time not supported");
+ exit(KSFT_SKIP);
+ }
+
+ for (i = 0; i < NR_VCPUS; ++i) {
+ int ret;
+
+ vcpu_set_cpuid(vm, i, kvm_get_supported_cpuid());
+
+ /* ST_GPA_BASE is identity mapped */
+ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+ sync_global_to_guest(vm, st_gva[i]);
+
+ /*
+ * NOTE(review): presumably _vcpu_set_msr() returns the number of MSRs
+ * written, so 0 means the bad value was rejected - confirm.
+ */
+ ret = _vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_STEAL_RESERVED_MASK);
+ TEST_ASSERT(ret == 0, "Bad GPA didn't fail");
+
+ vcpu_set_msr(vm, i, MSR_KVM_STEAL_TIME, (ulong)st_gva[i] | KVM_MSR_ENABLED);
+ }
+}
+
+/*
+ * Print every field of the steal-time record belonging to 'vcpuid', read
+ * through the host mapping of the record's guest address.
+ */
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct kvm_steal_time *rec;
+	int idx;
+
+	rec = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+
+	pr_info("VCPU%d:\n", vcpuid);
+	pr_info(" steal: %lld\n", rec->steal);
+	pr_info(" version: %d\n", rec->version);
+	pr_info(" flags: %d\n", rec->flags);
+	pr_info(" preempted: %d\n", rec->preempted);
+
+	/* The two padding arrays are printed element by element on one line. */
+	pr_info(" u8_pad: ");
+	idx = 0;
+	while (idx < 3) {
+		pr_info("%d", rec->u8_pad[idx]);
+		idx++;
+	}
+
+	pr_info("\n pad: ");
+	idx = 0;
+	while (idx < 11) {
+		pr_info("%d", rec->pad[idx]);
+		idx++;
+	}
+	pr_info("\n");
+}
+
+#elif defined(__aarch64__)
+
+/* PV_TIME_ST must have 64-byte alignment */
+#define STEAL_TIME_SIZE ((sizeof(struct st_time) + 63) & ~63)
+
+#define SMCCC_ARCH_FEATURES 0x80000001
+#define PV_TIME_FEATURES 0xc5000020
+#define PV_TIME_ST 0xc5000021
+
+/*
+ * Layout of the per-VCPU stolen-time record that the guest reads at the
+ * address returned by the PV_TIME_ST call.
+ */
+struct st_time {
+ uint32_t rev; /* asserted to be 0 by check_status() */
+ uint32_t attr; /* asserted to be 0 by check_status() */
+ uint64_t st_time; /* value the guest copies to guest_stolen_time[] */
+};
+
+/*
+ * Issue "hvc #0" with 'func' in w0 and 'arg' in x1 and return the value left
+ * in x0.  x0-x3 are declared as clobbered by the call.
+ */
+static int64_t smccc(uint32_t func, uint64_t arg)
+{
+ unsigned long ret;
+
+ asm volatile(
+ "mov w0, %w1\n"
+ "mov x1, %2\n"
+ "hvc #0\n"
+ "mov %0, x0\n"
+ : "=r" (ret) : "r" (func), "r" (arg) :
+ "x0", "x1", "x2", "x3");
+
+ return ret;
+}
+
+/* Guest-side sanity check: both 'rev' and 'attr' of the record must be zero. */
+static void check_status(struct st_time *st)
+{
+ GUEST_ASSERT(READ_ONCE(st->rev) == 0);
+ GUEST_ASSERT(READ_ONCE(st->attr) == 0);
+}
+
+/*
+ * Guest entry point; 'cpu' indexes st_gva[] and guest_stolen_time[].
+ *
+ * Queries the PV time features through smccc(), obtains the record address
+ * with PV_TIME_ST and checks it equals st_gva[cpu], then publishes
+ * st->st_time into guest_stolen_time[cpu] before GUEST_SYNC(1) and again
+ * before GUEST_DONE().
+ */
+static void guest_code(int cpu)
+{
+ struct st_time *st;
+ int64_t status;
+
+ /* Each feature query is expected to return 0. */
+ status = smccc(SMCCC_ARCH_FEATURES, PV_TIME_FEATURES);
+ GUEST_ASSERT(status == 0);
+ status = smccc(PV_TIME_FEATURES, PV_TIME_FEATURES);
+ GUEST_ASSERT(status == 0);
+ status = smccc(PV_TIME_FEATURES, PV_TIME_ST);
+ GUEST_ASSERT(status == 0);
+
+ /* PV_TIME_ST must return the address the host configured for this VCPU. */
+ status = smccc(PV_TIME_ST, 0);
+ GUEST_ASSERT(status != -1);
+ GUEST_ASSERT(status == (ulong)st_gva[cpu]);
+
+ st = (struct st_time *)status;
+ GUEST_SYNC(0);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+ GUEST_SYNC(1);
+
+ check_status(st);
+ WRITE_ONCE(guest_stolen_time[cpu], st->st_time);
+ GUEST_DONE();
+}
+
+/*
+ * Host-side setup of stolen time for all NR_VCPUS VCPUs.
+ *
+ * Skips the test (KSFT_SKIP) when KVM_HAS_DEVICE_ATTR on VCPU 0 fails with
+ * ENXIO for the PVTIME IPA attribute.  For each VCPU it assigns a record
+ * address inside the ST_GPA_BASE region, syncs it to the guest, and then
+ * checks three KVM_SET_DEVICE_ATTR calls: an address with bit 0 set must fail
+ * with EINVAL, the valid address must succeed, and setting it a second time
+ * must fail with EEXIST.
+ */
+static void steal_time_init(struct kvm_vm *vm)
+{
+ struct kvm_device_attr dev = {
+ .group = KVM_ARM_VCPU_PVTIME_CTRL,
+ .attr = KVM_ARM_VCPU_PVTIME_IPA,
+ };
+ int i, ret;
+
+ ret = _vcpu_ioctl(vm, 0, KVM_HAS_DEVICE_ATTR, &dev);
+ if (ret != 0 && errno == ENXIO) {
+ print_skip("steal-time not supported");
+ exit(KSFT_SKIP);
+ }
+
+ for (i = 0; i < NR_VCPUS; ++i) {
+ uint64_t st_ipa;
+
+ vcpu_ioctl(vm, i, KVM_HAS_DEVICE_ATTR, &dev);
+
+ /* The attribute value is passed by pointer; st_ipa is updated below. */
+ dev.addr = (uint64_t)&st_ipa;
+
+ /* ST_GPA_BASE is identity mapped */
+ st_gva[i] = (void *)(ST_GPA_BASE + i * STEAL_TIME_SIZE);
+ sync_global_to_guest(vm, st_gva[i]);
+
+ st_ipa = (ulong)st_gva[i] | 1;
+ ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+ TEST_ASSERT(ret == -1 && errno == EINVAL, "Bad IPA didn't report EINVAL");
+
+ st_ipa = (ulong)st_gva[i];
+ vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+
+ ret = _vcpu_ioctl(vm, i, KVM_SET_DEVICE_ATTR, &dev);
+ TEST_ASSERT(ret == -1 && errno == EEXIST, "Set IPA twice without EEXIST");
+
+ }
+}
+
+/*
+ * Print the three fields of the stolen-time record belonging to 'vcpuid',
+ * read through the host mapping of the record's guest address.
+ */
+static void steal_time_dump(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct st_time *rec;
+
+	rec = addr_gva2hva(vm, (ulong)st_gva[vcpuid]);
+
+	pr_info("VCPU%d:\n", vcpuid);
+	pr_info(" rev: %d\n", rec->rev);
+	pr_info(" attr: %d\n", rec->attr);
+	pr_info(" st_time: %ld\n", rec->st_time);
+}
+
+#endif
+
+/*
+ * Return the second field of /proc/<tid>/schedstat for the calling thread,
+ * which the callers use as the thread's accumulated run delay.
+ *
+ * Fails the test if the file cannot be opened or does not start with two
+ * integers, rather than dereferencing a NULL stream or returning an
+ * uninitialised value.
+ */
+static long get_run_delay(void)
+{
+	char path[64];
+	long val[2];
+	FILE *fp;
+	int nread;
+
+	snprintf(path, sizeof(path), "/proc/%ld/schedstat", syscall(SYS_gettid));
+
+	fp = fopen(path, "r");
+	TEST_ASSERT(fp != NULL, "Failed to open %s", path);
+
+	nread = fscanf(fp, "%ld %ld ", &val[0], &val[1]);
+	fclose(fp);
+	TEST_ASSERT(nread == 2, "Failed to parse two fields from %s", path);
+
+	return val[1];
+}
+
+/*
+ * Thread body: spin on CLOCK_MONOTONIC until MIN_RUN_DELAY_NS nanoseconds have
+ * elapsed since entry.  'arg' is unused; always returns NULL.
+ */
+static void *do_steal_time(void *arg)
+{
+	struct timespec now, deadline;
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+	deadline = timespec_add_ns(now, MIN_RUN_DELAY_NS);
+
+	/* Re-read the clock until it is no longer before the deadline. */
+	do {
+		clock_gettime(CLOCK_MONOTONIC, &now);
+	} while (timespec_to_ns(timespec_sub(now, deadline)) < 0);
+
+	return NULL;
+}
+
+/*
+ * Pass 'vcpuid' to the guest as its single argument, run the VCPU once and
+ * check the resulting ucall: UCALL_SYNC and UCALL_DONE are accepted, while
+ * UCALL_ABORT and anything else fail the test.
+ */
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct ucall uc;
+	uint64_t cmd;
+
+	vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+	vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+	cmd = get_ucall(vm, vcpuid, &uc);
+	if (cmd == UCALL_SYNC || cmd == UCALL_DONE)
+		return;
+
+	/* A guest assertion fired: report its message and line. */
+	if (cmd == UCALL_ABORT)
+		TEST_ASSERT(false, "%s at %s:%ld", (const char *)uc.args[0],
+			    __FILE__, uc.args[1]);
+
+	TEST_ASSERT(false, "Unexpected exit: %s",
+		    exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+}
+
+/*
+ * Test entry point.  Pass "-v" or "--verbose" as the first argument to print
+ * per-VCPU results and dump each steal-time record.
+ *
+ * The main thread and the time-stealing thread are both pinned to CPU 0.  For
+ * every VCPU the test runs the guest twice, checks that the stolen time seen
+ * by the guest does not exceed this thread's run delay, lets a busy thread
+ * steal at least MIN_RUN_DELAY_NS, and then runs the guest a third time to
+ * check that the guest-visible stolen time grew by at least the measured run
+ * delay.
+ */
+int main(int ac, char **av)
+{
+	struct kvm_vm *vm;
+	pthread_attr_t attr;
+	pthread_t thread;
+	cpu_set_t cpuset;
+	unsigned int gpages;
+	long stolen_time;
+	long run_delay;
+	bool verbose;
+	int i, r;
+
+	verbose = ac > 1 && (!strncmp(av[1], "-v", 3) || !strncmp(av[1], "--verbose", 10));
+
+	/* Set CPU affinity so we can force preemption of the VCPU */
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	pthread_attr_init(&attr);
+	pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+	pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset);
+
+	/* Create a one VCPU guest and an identity mapped memslot for the steal time structure */
+	vm = vm_create_default(0, 0, guest_code);
+	gpages = vm_calc_num_guest_pages(VM_MODE_DEFAULT, STEAL_TIME_SIZE * NR_VCPUS);
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, ST_GPA_BASE, 1, gpages, 0);
+	virt_map(vm, ST_GPA_BASE, ST_GPA_BASE, gpages, 0);
+	ucall_init(vm, NULL);
+
+	/* Add the rest of the VCPUs */
+	for (i = 1; i < NR_VCPUS; ++i)
+		vm_vcpu_add_default(vm, i, guest_code);
+
+	steal_time_init(vm);
+
+	/* Run test on each VCPU */
+	for (i = 0; i < NR_VCPUS; ++i) {
+		/* First VCPU run initializes steal-time */
+		run_vcpu(vm, i);
+
+		/* Second VCPU run, expect guest stolen time to be <= run_delay */
+		run_vcpu(vm, i);
+		sync_global_from_guest(vm, guest_stolen_time[i]);
+		stolen_time = guest_stolen_time[i];
+		run_delay = get_run_delay();
+		TEST_ASSERT(stolen_time <= run_delay,
+			    "Expected stolen time <= %ld, got %ld",
+			    run_delay, stolen_time);
+
+		/* Steal time from the VCPU. The steal time thread has the same CPU affinity as the VCPUs. */
+		run_delay = get_run_delay();
+		r = pthread_create(&thread, &attr, do_steal_time, NULL);
+		TEST_ASSERT(r == 0, "pthread_create failed: %d", r);
+		/* sched_yield() replaces the deprecated pthread_yield(). */
+		do
+			sched_yield();
+		while (get_run_delay() - run_delay < MIN_RUN_DELAY_NS);
+		pthread_join(thread, NULL);
+		run_delay = get_run_delay() - run_delay;
+		TEST_ASSERT(run_delay >= MIN_RUN_DELAY_NS,
+			    "Expected run_delay >= %ld, got %ld",
+			    MIN_RUN_DELAY_NS, run_delay);
+
+		/* Run VCPU again to confirm stolen time is consistent with run_delay */
+		run_vcpu(vm, i);
+		sync_global_from_guest(vm, guest_stolen_time[i]);
+		stolen_time = guest_stolen_time[i] - stolen_time;
+		TEST_ASSERT(stolen_time >= run_delay,
+			    "Expected stolen time >= %ld, got %ld",
+			    run_delay, stolen_time);
+
+		if (verbose) {
+			pr_info("VCPU%d: total-stolen-time=%ld test-stolen-time=%ld", i,
+				guest_stolen_time[i], stolen_time);
+			if (stolen_time == run_delay)
+				pr_info(" (BONUS: guest test-stolen-time even exactly matches test-run_delay)");
+			pr_info("\n");
+			steal_time_dump(vm, i);
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
new file mode 100644
index 000000000..140e91901
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/cr4_cpuid_sync_test.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CR4 and CPUID sync test
+ *
+ * Copyright 2018, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define X86_FEATURE_XSAVE (1<<26)
+#define X86_FEATURE_OSXSAVE (1<<27)
+#define VCPU_ID 1
+
+/*
+ * Guest helper: execute CPUID leaf 1 (subleaf 0) and report whether the
+ * OSXSAVE bit in ECX agrees with the OSXSAVE bit currently set in CR4.
+ */
+static inline bool cr4_cpuid_is_sync(void)
+{
+	uint32_t eax, ebx, ecx, edx;
+	int leaf = 0x1;
+	int subleaf = 0x0;
+	bool cpuid_osxsave, cr4_osxsave;
+
+	__asm__ __volatile__("cpuid"
+			     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+			     : "a"(leaf), "c"(subleaf));
+
+	cpuid_osxsave = !!(ecx & X86_FEATURE_OSXSAVE);
+	cr4_osxsave = !!(get_cr4() & X86_CR4_OSXSAVE);
+
+	return cpuid_osxsave == cr4_osxsave;
+}
+
+/*
+ * Guest entry point: set CR4.OSXSAVE, check that CPUID agrees, hand control
+ * to the host with GUEST_SYNC(0), and check the agreement again once resumed.
+ */
+static void guest_code(void)
+{
+ uint64_t cr4;
+
+ /* turn on CR4.OSXSAVE */
+ cr4 = get_cr4();
+ cr4 |= X86_CR4_OSXSAVE;
+ set_cr4(cr4);
+
+ /* verify CR4.OSXSAVE == CPUID.OSXSAVE */
+ GUEST_ASSERT(cr4_cpuid_is_sync());
+
+ /* notify hypervisor to change CR4 */
+ GUEST_SYNC(0);
+
+ /* check again */
+ GUEST_ASSERT(cr4_cpuid_is_sync());
+
+ GUEST_DONE();
+}
+
+/*
+ * Test entry point.  Skips (KSFT_SKIP) when XSAVE is not reported in the
+ * supported CPUID leaf 1.  Otherwise runs the guest in a loop; on each
+ * UCALL_SYNC the host clears CR4.OSXSAVE through the sregs interface so the
+ * guest can re-check that CPUID follows.  The VM is freed once the guest
+ * reports UCALL_DONE.
+ */
+int main(int argc, char *argv[])
+{
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct kvm_sregs sregs;
+	struct kvm_cpuid_entry2 *entry;
+	struct ucall uc;
+	int rc;
+
+	entry = kvm_get_supported_cpuid_entry(1);
+	if (!(entry->ecx & X86_FEATURE_XSAVE)) {
+		print_skip("XSAVE feature not supported");
+		/* Report a skip, not a pass, as the other tests do. */
+		exit(KSFT_SKIP);
+	}
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+	run = vcpu_state(vm, VCPU_ID);
+
+	while (1) {
+		rc = _vcpu_run(vm, VCPU_ID);
+
+		TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Unexpected exit reason: %u (%s),\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_SYNC:
+			/* emulate hypervisor clearing CR4.OSXSAVE */
+			vcpu_sregs_get(vm, VCPU_ID, &sregs);
+			sregs.cr4 &= ~X86_CR4_OSXSAVE;
+			vcpu_sregs_set(vm, VCPU_ID, &sregs);
+			break;
+		case UCALL_ABORT:
+			TEST_FAIL("Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+done:
+	/* The loop only exits through 'done', so the VM must be freed here. */
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c
new file mode 100644
index 000000000..2fc6b3af8
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM guest debug register tests
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+#define DR6_BD (1 << 13)
+#define DR7_GD (1 << 13)
+
+/* For testing data access debug BP */
+uint32_t guest_value;
+
+extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start;
+
+/*
+ * Guest entry point.  Each asm statement defines a label (sw_bp, hw_bp,
+ * write_data, ss_start, bd_start) that the host references through the
+ * matching 'extern unsigned char' declarations to compute expected RIP values.
+ */
+static void guest_code(void)
+{
+ /*
+ * Software BP tests.
+ *
+ * NOTE: sw_bp need to be before the cmd here, because int3 is an
+ * exception rather than a normal trap for KVM_SET_GUEST_DEBUG (we
+ * capture it using the vcpu exception bitmap).
+ */
+ asm volatile("sw_bp: int3");
+
+ /* Hardware instruction BP test */
+ asm volatile("hw_bp: nop");
+
+ /* Hardware data BP test */
+ asm volatile("mov $1234,%%rax;\n\t"
+ "mov %%rax,%0;\n\t write_data:"
+ : "=m" (guest_value) : : "rax");
+
+ /* Single step test, covers 2 basic instructions and 2 emulated */
+ asm volatile("ss_start: "
+ "xor %%eax,%%eax\n\t"
+ "cpuid\n\t"
+ "movl $0x1a0,%%ecx\n\t"
+ "rdmsr\n\t"
+ : : : "eax", "ebx", "ecx", "edx");
+
+ /* DR6.BD test */
+ asm volatile("bd_start: mov %%dr0, %%rax" : : : "rax");
+ GUEST_DONE();
+}
+
+/*
+ * Helpers used by main(); they rely on its locals 'debug', 'regs' and 'vm'.
+ *
+ * CLEAR_DEBUG()  - zero the kvm_guest_debug structure
+ * APPLY_DEBUG()  - install 'debug' on the test VCPU
+ * CAST_TO_RIP(v) - address of symbol 'v' as an integer
+ * SET_RIP(v)     - read the VCPU registers, set RIP to 'v', write them back
+ * MOVE_RIP(v)    - adjust RIP by 'v' bytes relative to its current value
+ */
+#define CLEAR_DEBUG() memset(&debug, 0, sizeof(debug))
+#define APPLY_DEBUG() vcpu_set_guest_debug(vm, VCPU_ID, &debug)
+#define CAST_TO_RIP(v) ((unsigned long long)&(v))
+#define SET_RIP(v) do { \
+		vcpu_regs_get(vm, VCPU_ID, &regs); \
+		regs.rip = (v); \
+		vcpu_regs_set(vm, VCPU_ID, &regs); \
+	} while (0)
+/* No trailing semicolon: the caller supplies it, as with SET_RIP(). */
+#define MOVE_RIP(v) SET_RIP(regs.rip + (v))
+
+/*
+ * Test entry point.  Skips (KSFT_SKIP) when KVM_CAP_SET_GUEST_DEBUG is not
+ * available.  Otherwise it drives the guest through five stages and checks the
+ * KVM_EXIT_DEBUG information reported for each:
+ *   1. software breakpoint (int3),
+ *   2. instruction hardware breakpoint using each of DR0-DR3,
+ *   3. data-write hardware breakpoint using each of DR0-DR3,
+ *   4. single-stepping over the four instructions at ss_start,
+ *   5. DR7.GD (debug register access detection).
+ * Finally all debug controls are cleared and the guest must finish with
+ * UCALL_DONE.
+ */
+int main(void)
+{
+	struct kvm_guest_debug debug;
+	unsigned long long target_dr6, target_rip;
+	struct kvm_regs regs;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct ucall uc;
+	uint64_t cmd;
+	int i;
+	/* Instruction lengths starting at ss_start */
+	int ss_size[4] = {
+		2,		/* xor */
+		2,		/* cpuid */
+		5,		/* mov */
+		2,		/* rdmsr */
+	};
+
+	if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) {
+		print_skip("KVM_CAP_SET_GUEST_DEBUG not supported");
+		/* Report a skip, not a pass. */
+		return KSFT_SKIP;
+	}
+
+	vm = vm_create_default(VCPU_ID, 0, guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+	run = vcpu_state(vm, VCPU_ID);
+
+	/* Test software BPs - int3 */
+	CLEAR_DEBUG();
+	debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP;
+	APPLY_DEBUG();
+	vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+		    run->debug.arch.exception == BP_VECTOR &&
+		    run->debug.arch.pc == CAST_TO_RIP(sw_bp),
+		    "INT3: exit %d exception %d rip 0x%llx (should be 0x%llx)",
+		    run->exit_reason, run->debug.arch.exception,
+		    run->debug.arch.pc, CAST_TO_RIP(sw_bp));
+	/* Skip the one-byte "int3" */
+	MOVE_RIP(1);
+
+	/* Test instruction HW BP over DR[0-3] */
+	for (i = 0; i < 4; i++) {
+		CLEAR_DEBUG();
+		debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+		debug.arch.debugreg[i] = CAST_TO_RIP(hw_bp);
+		debug.arch.debugreg[7] = 0x400 | (1UL << (2*i+1));
+		APPLY_DEBUG();
+		vcpu_run(vm, VCPU_ID);
+		target_dr6 = 0xffff0ff0 | (1UL << i);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+			    run->debug.arch.exception == DB_VECTOR &&
+			    run->debug.arch.pc == CAST_TO_RIP(hw_bp) &&
+			    run->debug.arch.dr6 == target_dr6,
+			    "INS_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+			    "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+			    i, run->exit_reason, run->debug.arch.exception,
+			    run->debug.arch.pc, CAST_TO_RIP(hw_bp),
+			    run->debug.arch.dr6, target_dr6);
+	}
+	/* Skip "nop" */
+	MOVE_RIP(1);
+
+	/* Test data access HW BP over DR[0-3] */
+	for (i = 0; i < 4; i++) {
+		CLEAR_DEBUG();
+		debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+		debug.arch.debugreg[i] = CAST_TO_RIP(guest_value);
+		debug.arch.debugreg[7] = 0x00000400 | (1UL << (2*i+1)) |
+		    (0x000d0000UL << (4*i));
+		APPLY_DEBUG();
+		vcpu_run(vm, VCPU_ID);
+		target_dr6 = 0xffff0ff0 | (1UL << i);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+			    run->debug.arch.exception == DB_VECTOR &&
+			    run->debug.arch.pc == CAST_TO_RIP(write_data) &&
+			    run->debug.arch.dr6 == target_dr6,
+			    "DATA_HW_BP (DR%d): exit %d exception %d rip 0x%llx "
+			    "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+			    i, run->exit_reason, run->debug.arch.exception,
+			    run->debug.arch.pc, CAST_TO_RIP(write_data),
+			    run->debug.arch.dr6, target_dr6);
+		/*
+		 * Roll RIP back over the store to guest_value so the next
+		 * iteration executes it again (the code assumes that "mov"
+		 * encodes to 7 bytes).
+		 */
+		MOVE_RIP(-7);
+	}
+	/* Skip the 7-byte store "mov" again now that all DRs were tested */
+	MOVE_RIP(7);
+
+	/* Test single step */
+	target_rip = CAST_TO_RIP(ss_start);
+	target_dr6 = 0xffff4ff0ULL;
+	vcpu_regs_get(vm, VCPU_ID, &regs);
+	for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) {
+		target_rip += ss_size[i];
+		CLEAR_DEBUG();
+		debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
+		debug.arch.debugreg[7] = 0x00000400;
+		APPLY_DEBUG();
+		vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+			    run->debug.arch.exception == DB_VECTOR &&
+			    run->debug.arch.pc == target_rip &&
+			    run->debug.arch.dr6 == target_dr6,
+			    "SINGLE_STEP[%d]: exit %d exception %d rip 0x%llx "
+			    "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+			    i, run->exit_reason, run->debug.arch.exception,
+			    run->debug.arch.pc, target_rip, run->debug.arch.dr6,
+			    target_dr6);
+	}
+
+	/* Finally test global disable */
+	CLEAR_DEBUG();
+	debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP;
+	debug.arch.debugreg[7] = 0x400 | DR7_GD;
+	APPLY_DEBUG();
+	vcpu_run(vm, VCPU_ID);
+	target_dr6 = 0xffff0ff0 | DR6_BD;
+	/* Report the expected RIP actually compared against (bd_start). */
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_DEBUG &&
+		    run->debug.arch.exception == DB_VECTOR &&
+		    run->debug.arch.pc == CAST_TO_RIP(bd_start) &&
+		    run->debug.arch.dr6 == target_dr6,
+		    "DR7.GD: exit %d exception %d rip 0x%llx "
+		    "(should be 0x%llx) dr6 0x%llx (should be 0x%llx)",
+		    run->exit_reason, run->debug.arch.exception,
+		    run->debug.arch.pc, CAST_TO_RIP(bd_start),
+		    run->debug.arch.dr6, target_dr6);
+
+	/* Disable all debug controls, run to the end */
+	CLEAR_DEBUG();
+	APPLY_DEBUG();
+
+	vcpu_run(vm, VCPU_ID);
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO, "KVM_EXIT_IO");
+	cmd = get_ucall(vm, VCPU_ID, &uc);
+	TEST_ASSERT(cmd == UCALL_DONE, "UCALL_DONE");
+
+	kvm_vm_free(vm);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/evmcs_test.c b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
new file mode 100644
index 000000000..757928199
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/evmcs_test.c
@@ -0,0 +1,166 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for Enlightened VMCS, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "vmx.h"
+
+#define VCPU_ID 5
+
+/*
+ * Nested (L2) guest body: reports stages 7 and 8 to the host and then exits
+ * to L1 with vmcall().
+ */
+void l2_guest_code(void)
+{
+ GUEST_SYNC(7);
+
+ GUEST_SYNC(8);
+
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+/*
+ * L1 guest body.  Enables the VP assist page, enters VMX operation, loads the
+ * VMCS and launches l2_guest_code() on a local stack, reporting stages 3-6, 9
+ * and 10 to the host along the way.  Between stages it re-checks that the
+ * current VMCS pointer still equals the enlightened VMCS GPA.
+ */
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ enable_vp_assist(vmx_pages->vp_assist_gpa, vmx_pages->vp_assist);
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_SYNC(3);
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+ GUEST_SYNC(4);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(5);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ /* vmlaunch is expected to fail while the revision id is bogus. */
+ current_evmcs->revision_id = -1u;
+ GUEST_ASSERT(vmlaunch());
+ current_evmcs->revision_id = EVMCS_VERSION;
+ GUEST_SYNC(6);
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmptrstz() == vmx_pages->enlightened_vmcs_gpa);
+ GUEST_SYNC(9);
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_SYNC(10);
+}
+
+/*
+ * Guest entry point: reports stages 1 and 2, runs l1_guest_code() when
+ * 'vmx_pages' is non-NULL, and signals GUEST_DONE().  The host runs the VCPU
+ * once more after UCALL_DONE, which executes the trailing vmptrld with an
+ * invalid GPA.
+ */
+void guest_code(struct vmx_pages *vmx_pages)
+{
+ GUEST_SYNC(1);
+ GUEST_SYNC(2);
+
+ if (vmx_pages)
+ l1_guest_code(vmx_pages);
+
+ GUEST_DONE();
+
+ /* Try enlightened vmptrld with an incorrect GPA */
+ /* NOTE(review): vmx_pages is dereferenced here without the NULL check used above. */
+ evmcs_vmptrld(0xdeadbeef, vmx_pages->enlightened_vmcs);
+ GUEST_ASSERT(vmlaunch());
+}
+
+/*
+ * Test entry point.  Skips (KSFT_SKIP) unless nested VMX, KVM_CAP_NESTED_STATE
+ * and KVM_CAP_HYPERV_ENLIGHTENED_VMCS are all available.
+ *
+ * After every UCALL_SYNC stage the VCPU state is saved, the VM is released and
+ * restarted, the state is loaded into a fresh VCPU, and the general-purpose
+ * registers are compared before and after.  Once the guest reports UCALL_DONE
+ * the VCPU is run one more time and must exit with KVM_EXIT_SHUTDOWN.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ if (!nested_vmx_supported() ||
+ !kvm_check_cap(KVM_CAP_NESTED_STATE) ||
+ !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+ print_skip("Enlightened VMCS is unsupported");
+ exit(KSFT_SKIP);
+ }
+
+ vcpu_enable_evmcs(vm, VCPU_ID);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ /* The guest receives the GVA of the VMX pages as its only argument. */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto part1_done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ /* Snapshot the VCPU state and registers before tearing the VM down. */
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_enable_evmcs(vm, VCPU_ID);
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+part1_done:
+ /* Resume past GUEST_DONE(): the guest now tries the invalid eVMCS pointer. */
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+ "Unexpected successful VMEnter with invalid eVMCS pointer!");
+
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
new file mode 100644
index 000000000..745b708c2
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/hyperv_cpuid.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_CAP_HYPERV_CPUID
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 0
+
+/* Empty guest body: main() in this file only issues host-side ioctls and never runs the VCPU. */
+static void guest_code(void)
+{
+}
+
+/*
+ * Report whether SMT may be in use on the host.
+ *
+ * Returns false only when /sys/devices/system/cpu/smt/control can be read and
+ * its content starts with "forceoff" or "notsupported"; returns true in every
+ * other case, including when the file cannot be opened.
+ */
+static bool smt_possible(void)
+{
+	char state[16];
+	FILE *ctl;
+	bool possible = true;
+
+	ctl = fopen("/sys/devices/system/cpu/smt/control", "r");
+	if (!ctl)
+		return possible;
+
+	if (fread(state, sizeof(*state), sizeof(state), ctl) > 0 &&
+	    (!strncmp(state, "forceoff", 8) ||
+	     !strncmp(state, "notsupported", 12)))
+		possible = false;
+
+	fclose(ctl);
+
+	return possible;
+}
+
+/*
+ * Validate the entries returned by KVM_GET_SUPPORTED_HV_CPUID.
+ *
+ * @hv_cpuid_entries: result buffer to check.
+ * @evmcs_enabled:    whether enlightened VMCS was enabled on the VCPU; one
+ *                    extra entry (0x4000000A) is expected when it was.
+ *
+ * Checks the entry count, that every function lies in
+ * [0x40000000, 0x40000082], that index, flags and padding are zero, and the
+ * leaf-specific expectations for 0x40000000 and 0x40000004.
+ */
+static void test_hv_cpuid(struct kvm_cpuid2 *hv_cpuid_entries,
+ bool evmcs_enabled)
+{
+ int i;
+ int nent = 9;
+ u32 test_val;
+
+ if (evmcs_enabled)
+ nent += 1; /* 0x4000000A */
+
+ TEST_ASSERT(hv_cpuid_entries->nent == nent,
+ "KVM_GET_SUPPORTED_HV_CPUID should return %d entries"
+ " with evmcs=%d (returned %d)",
+ nent, evmcs_enabled, hv_cpuid_entries->nent);
+
+ for (i = 0; i < hv_cpuid_entries->nent; i++) {
+ struct kvm_cpuid_entry2 *entry = &hv_cpuid_entries->entries[i];
+
+ TEST_ASSERT((entry->function >= 0x40000000) &&
+ (entry->function <= 0x40000082),
+ "function %x is our of supported range",
+ entry->function);
+
+ TEST_ASSERT(evmcs_enabled || (entry->function != 0x4000000A),
+ "0x4000000A leaf should not be reported");
+
+ TEST_ASSERT(entry->index == 0,
+ ".index field should be zero");
+
+ TEST_ASSERT(entry->flags == 0,
+ ".flags field should be zero");
+
+ TEST_ASSERT(!entry->padding[0] && !entry->padding[1] &&
+ !entry->padding[2], "padding should be zero");
+
+ switch (entry->function) {
+ case 0x40000000:
+ /* EAX of the base leaf must report the maximum leaf. */
+ test_val = 0x40000082;
+
+ TEST_ASSERT(entry->eax == test_val,
+ "Wrong max leaf report in 0x40000000.EAX: %x"
+ " (evmcs=%d)",
+ entry->eax, evmcs_enabled
+ );
+ break;
+ case 0x40000004:
+ /* Bit 18 must be set exactly when SMT is not possible. */
+ test_val = entry->eax & (1UL << 18);
+
+ TEST_ASSERT(!!test_val == !smt_possible(),
+ "NoNonArchitecturalCoreSharing bit"
+ " doesn't reflect SMT setting");
+ break;
+ }
+
+ /*
+ * If needed for debug:
+ * fprintf(stdout,
+ * "CPUID%lx EAX=0x%lx EBX=0x%lx ECX=0x%lx EDX=0x%lx\n",
+ * entry->function, entry->eax, entry->ebx, entry->ecx,
+ * entry->edx);
+ */
+ }
+
+}
+
+/*
+ * Check that KVM_GET_SUPPORTED_HV_CPUID fails with E2BIG when called with a
+ * buffer that has room for zero entries.
+ */
+void test_hv_cpuid_e2big(struct kvm_vm *vm)
+{
+ static struct kvm_cpuid2 cpuid = {.nent = 0};
+ int ret;
+
+ ret = _vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, &cpuid);
+
+ TEST_ASSERT(ret == -1 && errno == E2BIG,
+ "KVM_GET_SUPPORTED_HV_CPUID didn't fail with -E2BIG when"
+ " it should have: %d %d", ret, errno);
+}
+
+
+/*
+ * Allocate a kvm_cpuid2 buffer with room for 20 entries and fill it with
+ * KVM_GET_SUPPORTED_HV_CPUID on the test VCPU.
+ *
+ * Returns the heap-allocated buffer; the caller owns it and must free() it.
+ * Aborts the process if the allocation fails.
+ */
+struct kvm_cpuid2 *kvm_get_supported_hv_cpuid(struct kvm_vm *vm)
+{
+	int nent = 20; /* should be enough */
+	/*
+	 * Plain local: the buffer is handed to (and freed by) the caller, so
+	 * keeping the pointer in static storage would only leave it dangling.
+	 */
+	struct kvm_cpuid2 *cpuid;
+
+	cpuid = malloc(sizeof(*cpuid) + nent * sizeof(struct kvm_cpuid_entry2));
+
+	if (!cpuid) {
+		perror("malloc");
+		abort();
+	}
+
+	cpuid->nent = nent;
+
+	vcpu_ioctl(vm, VCPU_ID, KVM_GET_SUPPORTED_HV_CPUID, cpuid);
+
+	return cpuid;
+}
+
+
+/*
+ * Test entry point.  Skips (KSFT_SKIP) when KVM_CAP_HYPERV_CPUID is missing.
+ *
+ * Runs three stages, each on a freshly created VM:
+ *   0 - the E2BIG check with an empty buffer,
+ *   1 - validation of the reported entries without enlightened VMCS,
+ *   2 - the same validation with enlightened VMCS enabled (skipped when
+ *       nested VMX or KVM_CAP_HYPERV_ENLIGHTENED_VMCS is unavailable).
+ * The VM is freed at the end of every stage, including those that finish
+ * early.
+ */
+int main(int argc, char *argv[])
+{
+	struct kvm_vm *vm;
+	int rv, stage;
+	struct kvm_cpuid2 *hv_cpuid_entries;
+	bool evmcs_enabled;
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	rv = kvm_check_cap(KVM_CAP_HYPERV_CPUID);
+	if (!rv) {
+		print_skip("KVM_CAP_HYPERV_CPUID not supported");
+		exit(KSFT_SKIP);
+	}
+
+	for (stage = 0; stage < 3; stage++) {
+		evmcs_enabled = false;
+
+		vm = vm_create_default(VCPU_ID, 0, guest_code);
+		switch (stage) {
+		case 0:
+			test_hv_cpuid_e2big(vm);
+			goto free_vm;
+		case 1:
+			break;
+		case 2:
+			if (!nested_vmx_supported() ||
+			    !kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) {
+				print_skip("Enlightened VMCS is unsupported");
+				goto free_vm;
+			}
+			vcpu_enable_evmcs(vm, VCPU_ID);
+			evmcs_enabled = true;
+			break;
+		}
+
+		hv_cpuid_entries = kvm_get_supported_hv_cpuid(vm);
+		test_hv_cpuid(hv_cpuid_entries, evmcs_enabled);
+		free(hv_cpuid_entries);
+
+free_vm:
+		/* Reached by every stage so no VM outlives its iteration. */
+		kvm_vm_free(vm);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
new file mode 100644
index 000000000..b10a27485
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/kvm_pv_test.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020, Google LLC.
+ *
+ * Tests for KVM paravirtual feature disablement
+ */
+#include <asm/kvm_para.h>
+#include <linux/kvm_para.h>
+#include <stdint.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+extern unsigned char rdmsr_start;
+extern unsigned char rdmsr_end;
+
+/*
+ * Read MSR 'idx' in the guest and return its 64-bit value.
+ *
+ * The instruction is bracketed by the rdmsr_start/rdmsr_end labels so that
+ * guest_gp_handler() can recognise a fault on it and resume after it.
+ *
+ * RDMSR takes the index in ECX and returns the value in EDX:EAX, so the high
+ * half is bound to EDX ("=d"); binding it to ECX would return the MSR index
+ * as the high half and leave EDX clobbered behind the compiler's back.
+ */
+static u64 do_rdmsr(u32 idx)
+{
+	u32 lo, hi;
+
+	asm volatile("rdmsr_start: rdmsr;"
+		     "rdmsr_end:"
+		     : "=a"(lo), "=d"(hi)
+		     : "c"(idx));
+
+	return (((u64) hi) << 32) | lo;
+}
+
+extern unsigned char wrmsr_start;
+extern unsigned char wrmsr_end;
+
+/*
+ * Write 'val' to MSR 'idx' in the guest.  The instruction is bracketed by the
+ * wrmsr_start/wrmsr_end labels so guest_gp_handler() can recognise a fault on
+ * it and resume after it.
+ */
+static void do_wrmsr(u32 idx, u64 val)
+{
+	/* Split the value into the EAX (low) and EDX (high) operands. */
+	u32 low = val;
+	u32 high = val >> 32;
+
+	asm volatile("wrmsr_start: wrmsr;"
+		     "wrmsr_end:"
+		     : : "a"(low), "c"(idx), "d"(high));
+}
+
+static int nr_gp;
+
+/*
+ * Guest #GP handler (installed for GP_VECTOR by main()).  Asserts that the
+ * fault happened on the rdmsr or wrmsr instruction of
+ * do_rdmsr()/do_wrmsr(), counts it in nr_gp, and moves RIP to the matching
+ * *_end label so execution resumes after the faulting instruction.
+ */
+static void guest_gp_handler(struct ex_regs *regs)
+{
+ unsigned char *rip = (unsigned char *)regs->rip;
+ bool r, w;
+
+ r = rip == &rdmsr_start;
+ w = rip == &wrmsr_start;
+ GUEST_ASSERT(r || w);
+
+ nr_gp++;
+
+ if (r)
+ regs->rip = (uint64_t)&rdmsr_end;
+ else
+ regs->rip = (uint64_t)&wrmsr_end;
+}
+
+struct msr_data {
+ uint32_t idx;
+ const char *name;
+};
+
+#define TEST_MSR(msr) { .idx = msr, .name = #msr }
+#define UCALL_PR_MSR 0xdeadbeef
+#define PR_MSR(msr) ucall(UCALL_PR_MSR, 1, msr)
+
+/*
+ * KVM paravirtual msrs to test. Expect a #GP if any of these msrs are read or
+ * written, as the KVM_CPUID_FEATURES leaf is cleared.
+ */
+static struct msr_data msrs_to_test[] = {
+ TEST_MSR(MSR_KVM_SYSTEM_TIME),
+ TEST_MSR(MSR_KVM_SYSTEM_TIME_NEW),
+ TEST_MSR(MSR_KVM_WALL_CLOCK),
+ TEST_MSR(MSR_KVM_WALL_CLOCK_NEW),
+ TEST_MSR(MSR_KVM_ASYNC_PF_EN),
+ TEST_MSR(MSR_KVM_STEAL_TIME),
+ TEST_MSR(MSR_KVM_PV_EOI_EN),
+ TEST_MSR(MSR_KVM_POLL_CONTROL),
+ TEST_MSR(MSR_KVM_ASYNC_PF_INT),
+ TEST_MSR(MSR_KVM_ASYNC_PF_ACK),
+};
+
+/*
+ * Guest-side check for one MSR: announce it to the host, then read and write
+ * it, asserting that each access raised exactly one #GP (counted in nr_gp by
+ * guest_gp_handler()).
+ */
+static void test_msr(struct msr_data *msr)
+{
+ PR_MSR(msr);
+ do_rdmsr(msr->idx);
+ GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
+
+ /* Reset the counter between the read and the write check, and on exit. */
+ nr_gp = 0;
+ do_wrmsr(msr->idx, 0);
+ GUEST_ASSERT(READ_ONCE(nr_gp) == 1);
+ nr_gp = 0;
+}
+
+struct hcall_data {
+ uint64_t nr;
+ const char *name;
+};
+
+#define TEST_HCALL(hc) { .nr = hc, .name = #hc }
+#define UCALL_PR_HCALL 0xdeadc0de
+#define PR_HCALL(hc) ucall(UCALL_PR_HCALL, 1, hc)
+
+/*
+ * KVM hypercalls to test. Expect -KVM_ENOSYS when called, as the corresponding
+ * features have been cleared in KVM_CPUID_FEATURES.
+ */
+static struct hcall_data hcalls_to_test[] = {
+ TEST_HCALL(KVM_HC_KICK_CPU),
+ TEST_HCALL(KVM_HC_SEND_IPI),
+ TEST_HCALL(KVM_HC_SCHED_YIELD),
+};
+
+/*
+ * Guest-side check for one hypercall: announce it to the host, issue it with
+ * all-zero arguments, and assert that it returns -KVM_ENOSYS.
+ */
+static void test_hcall(struct hcall_data *hc)
+{
+ uint64_t r;
+
+ PR_HCALL(hc);
+ r = kvm_hypercall(hc->nr, 0, 0, 0, 0);
+ GUEST_ASSERT(r == -KVM_ENOSYS);
+}
+
+/*
+ * Guest entry point: run test_msr() over every entry of msrs_to_test, then
+ * test_hcall() over every entry of hcalls_to_test, and signal completion.
+ */
+static void guest_main(void)
+{
+	struct msr_data *msr;
+	struct hcall_data *hc;
+
+	for (msr = msrs_to_test;
+	     msr < msrs_to_test + ARRAY_SIZE(msrs_to_test); msr++)
+		test_msr(msr);
+
+	for (hc = hcalls_to_test;
+	     hc < hcalls_to_test + ARRAY_SIZE(hcalls_to_test); hc++)
+		test_hcall(hc);
+
+	GUEST_DONE();
+}
+
+/*
+ * Replace the KVM_CPUID_FEATURES leaf in 'cpuid' with an all-zero entry;
+ * fails the test if set_cpuid() reports failure.
+ */
+static void clear_kvm_cpuid_features(struct kvm_cpuid2 *cpuid)
+{
+	/* Every field other than .function stays zero. */
+	struct kvm_cpuid_entry2 ent = {
+		.function = KVM_CPUID_FEATURES,
+	};
+
+	TEST_ASSERT(set_cpuid(cpuid, &ent),
+		    "failed to clear KVM_CPUID_FEATURES leaf");
+}
+
+/*
+ * Host-side handler for UCALL_PR_MSR: the first ucall argument is the
+ * msr_data pointer passed by the guest; log its name and index.
+ */
+static void pr_msr(struct ucall *uc)
+{
+	struct msr_data *info;
+
+	info = (struct msr_data *)uc->args[0];
+	pr_info("testing msr: %s (%#x)\n", info->name, info->idx);
+}
+
+/*
+ * Host-side handler for UCALL_PR_HCALL: the first ucall argument is the
+ * hcall_data pointer passed by the guest; log its name and number.
+ */
+static void pr_hcall(struct ucall *uc)
+{
+	struct hcall_data *info;
+
+	info = (struct hcall_data *)uc->args[0];
+	pr_info("testing hcall: %s (%lu)\n", info->name, info->nr);
+}
+
+/*
+ * Host-side handler for UCALL_ABORT: fail the test, reporting args[0] as the
+ * message string and args[1] as the line number.
+ */
+static void handle_abort(struct ucall *uc)
+{
+ TEST_FAIL("%s at %s:%ld", (const char *)uc->args[0],
+ __FILE__, uc->args[1]);
+}
+
+#define VCPU_ID 0
+
+/*
+ * Run the test VCPU until the guest finishes.
+ *
+ * Every exit must be KVM_EXIT_IO carrying a ucall: the two print ucalls are
+ * logged and the guest is resumed, UCALL_ABORT fails the test, and UCALL_DONE
+ * ends the loop.  Any other ucall fails the test instead of being silently
+ * ignored.
+ */
+static void enter_guest(struct kvm_vm *vm)
+{
+	struct kvm_run *run;
+	struct ucall uc;
+	int r;
+
+	run = vcpu_state(vm, VCPU_ID);
+
+	while (true) {
+		r = _vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(!r, "vcpu_run failed: %d\n", r);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "unexpected exit reason: %u (%s)",
+			    run->exit_reason, exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_PR_MSR:
+			pr_msr(&uc);
+			break;
+		case UCALL_PR_HCALL:
+			pr_hcall(&uc);
+			break;
+		case UCALL_ABORT:
+			handle_abort(&uc);
+			return;
+		case UCALL_DONE:
+			return;
+		default:
+			/* The guest only issues the four ucalls handled above. */
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+}
+
+/*
+ * Test entry point.  Returns early when KVM_CAP_ENFORCE_PV_FEATURE_CPUID is
+ * not available.  Otherwise it enables that capability on the VCPU, installs
+ * a CPUID table whose KVM_CPUID_FEATURES leaf has been cleared, registers
+ * guest_gp_handler() for #GP, and runs the guest to completion.
+ */
+int main(void)
+{
+ struct kvm_enable_cap cap = {0};
+ struct kvm_cpuid2 *best;
+ struct kvm_vm *vm;
+
+ /* NOTE(review): unlike other tests in this series, this path returns 0 rather than KSFT_SKIP. */
+ if (!kvm_check_cap(KVM_CAP_ENFORCE_PV_FEATURE_CPUID)) {
+ pr_info("will skip kvm paravirt restriction tests.\n");
+ return 0;
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, guest_main);
+
+ cap.cap = KVM_CAP_ENFORCE_PV_FEATURE_CPUID;
+ cap.args[0] = 1;
+ vcpu_enable_cap(vm, VCPU_ID, &cap);
+
+ best = kvm_get_supported_cpuid();
+ clear_kvm_cpuid_features(best);
+ vcpu_set_cpuid(vm, VCPU_ID, best);
+
+ /* Exception handling must be set up before the guest can take a #GP. */
+ vm_init_descriptor_tables(vm);
+ vcpu_init_descriptor_tables(vm, VCPU_ID);
+ vm_handle_exception(vm, GP_VECTOR, guest_gp_handler);
+
+ enter_guest(vm);
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
new file mode 100644
index 000000000..9f55ccd16
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/mmio_warning_test.c
@@ -0,0 +1,127 @@
+/*
+ * mmio_warning_test
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Test that we don't get a kernel warning when we call KVM_RUN after a
+ * triple fault occurs. To get the triple fault to occur we call KVM_RUN
+ * on a VCPU that hasn't been properly setup.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <kvm_util.h>
+#include <linux/kvm.h>
+#include <processor.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <test_util.h>
+#include <unistd.h>
+
+#define NTHREAD 4
+#define NPROCESS 5
+
+/* Arguments shared by all thr() worker threads; filled in by test(). */
+struct thread_context {
+ /* vCPU file descriptor returned by KVM_CREATE_VCPU. */
+ int kvmcpu;
+ /* mmap()ed kvm_run area of that vCPU. */
+ struct kvm_run *run;
+};
+
+/*
+ * Thread body: issue a single KVM_RUN on the vCPU described by @arg
+ * (a struct thread_context) and log the ioctl result together with the
+ * exit reason and internal suberror found in the shared kvm_run area.
+ */
+void *thr(void *arg)
+{
+	struct thread_context *ctx = arg;
+	struct kvm_run *run = ctx->run;
+	int res = ioctl(ctx->kvmcpu, KVM_RUN, 0);
+
+	pr_info("ret1=%d exit_reason=%d suberror=%d\n",
+		res, run->exit_reason, run->internal.suberror);
+
+	return NULL;
+}
+
+/*
+ * Create a VM with one vCPU that is deliberately left un-initialised, map
+ * its kvm_run area and let NTHREAD threads call KVM_RUN on it, started at
+ * random intervals of up to 10ms.  All threads are joined before returning.
+ *
+ * The file descriptors and the mapping are not released here; the caller
+ * runs this in a child process that exits right afterwards.
+ */
+void test(void)
+{
+	int i, ret, kvm, kvmvm, kvmcpu;
+	pthread_t th[NTHREAD];
+	struct kvm_run *run;
+	struct thread_context tc;
+
+	kvm = open("/dev/kvm", O_RDWR);
+	TEST_ASSERT(kvm != -1, "failed to open /dev/kvm");
+	kvmvm = ioctl(kvm, KVM_CREATE_VM, 0);
+	TEST_ASSERT(kvmvm != -1, "KVM_CREATE_VM failed");
+	kvmcpu = ioctl(kvmvm, KVM_CREATE_VCPU, 0);
+	TEST_ASSERT(kvmcpu != -1, "KVM_CREATE_VCPU failed");
+	run = (struct kvm_run *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED,
+				    kvmcpu, 0);
+	/* thr() dereferences run, so a failed mapping must not go unnoticed. */
+	TEST_ASSERT(run != MAP_FAILED, "mmap of kvm_run failed");
+	tc.kvmcpu = kvmcpu;
+	tc.run = run;
+	srand(getpid());
+	for (i = 0; i < NTHREAD; i++) {
+		ret = pthread_create(&th[i], NULL, thr, (void *)(uintptr_t)&tc);
+		/* Joining a pthread_t that was never initialised is undefined. */
+		TEST_ASSERT(ret == 0, "pthread_create failed: %d", ret);
+		usleep(rand() % 10000);
+	}
+	for (i = 0; i < NTHREAD; i++)
+		pthread_join(th[i], NULL);
+}
+
+/*
+ * Return the number of lines containing "WARNING:" in the dmesg output.
+ *
+ * The count is produced by a shell pipeline; if its output cannot be parsed
+ * as an integer, 0 is returned.  Failure to start the pipeline fails the
+ * test instead of passing a NULL stream to fscanf()/pclose().
+ */
+int get_warnings_count(void)
+{
+	int warnings;
+	FILE *f;
+
+	f = popen("dmesg | grep \"WARNING:\" | wc -l", "r");
+	TEST_ASSERT(f, "popen of the dmesg warning counter failed");
+	if (fscanf(f, "%d", &warnings) < 1)
+		warnings = 0;
+	pclose(f);
+
+	return warnings;
+}
+
+/*
+ * Run test() in NPROCESS child processes, one after another, and fail if
+ * the number of "WARNING:" lines in dmesg changed in the meantime.
+ *
+ * Skipped unless running on an Intel CPU with unrestricted guest disabled.
+ */
+int main(void)
+{
+	int warnings_before, warnings_after;
+	int i;
+
+	if (!is_intel_cpu()) {
+		print_skip("Must be run on an Intel CPU");
+		exit(KSFT_SKIP);
+	}
+
+	if (vm_is_unrestricted_guest(NULL)) {
+		print_skip("Unrestricted guest must be disabled");
+		exit(KSFT_SKIP);
+	}
+
+	warnings_before = get_warnings_count();
+
+	for (i = 0; i < NPROCESS; ++i) {
+		int status;
+		int pid = fork();
+
+		if (pid == 0) {
+			/* Child: run one round and leave. */
+			test();
+			exit(0);
+		}
+		if (pid < 0)
+			exit(1);
+
+		/* Parent: keep waiting until this particular child is reaped. */
+		for (;;) {
+			if (waitpid(pid, &status, __WALL) == pid)
+				break;
+		}
+	}
+
+	warnings_after = get_warnings_count();
+	TEST_ASSERT(warnings_before == warnings_after,
+		    "Warnings found in kernel.  Run 'dmesg' to inspect them.");
+
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/platform_info_test.c b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
new file mode 100644
index 000000000..1e89688cb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/platform_info_test.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for x86 KVM_CAP_MSR_PLATFORM_INFO
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Verifies expected behavior of controlling guest access to
+ * MSR_PLATFORM_INFO.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+#define MSR_PLATFORM_INFO_MAX_TURBO_RATIO 0xff00
+
+/*
+ * Guest loop: read MSR_PLATFORM_INFO, hand the value to the host through
+ * GUEST_SYNC, increment r11, and start over.  Never returns.
+ */
+static void guest_code(void)
+{
+	uint64_t msr_platform_info;
+
+	while (1) {
+		msr_platform_info = rdmsr(MSR_PLATFORM_INFO);
+		GUEST_SYNC(msr_platform_info);
+		asm volatile ("inc %r11");
+	}
+}
+
+/* Set KVM_CAP_MSR_PLATFORM_INFO on @vm to @enable (args[0] = 0 or 1). */
+static void set_msr_platform_info_enabled(struct kvm_vm *vm, bool enable)
+{
+	struct kvm_enable_cap cap = {
+		.cap = KVM_CAP_MSR_PLATFORM_INFO,
+		.flags = 0,
+		.args = { (int)enable },
+	};
+
+	vm_enable_cap(vm, &cap);
+}
+
+/*
+ * With the capability enabled, one guest run must end in a UCALL_SYNC
+ * (KVM_EXIT_IO) whose second argument -- the MSR value read by the guest --
+ * has all MSR_PLATFORM_INFO_MAX_TURBO_RATIO bits set.
+ */
+static void test_msr_platform_info_enabled(struct kvm_vm *vm)
+{
+	struct ucall uc;
+	struct kvm_run *run;
+
+	run = vcpu_state(vm, VCPU_ID);
+
+	set_msr_platform_info_enabled(vm, true);
+	vcpu_run(vm, VCPU_ID);
+
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+		    "Exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+
+	get_ucall(vm, VCPU_ID, &uc);
+	TEST_ASSERT(uc.cmd == UCALL_SYNC,
+		    "Received ucall other than UCALL_SYNC: %lu\n", uc.cmd);
+	TEST_ASSERT((uc.args[1] & MSR_PLATFORM_INFO_MAX_TURBO_RATIO) ==
+		    MSR_PLATFORM_INFO_MAX_TURBO_RATIO,
+		    "Expected MSR_PLATFORM_INFO to have max turbo ratio mask: %i.",
+		    MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+}
+
+/*
+ * With the capability disabled, the next guest run is expected to end in
+ * KVM_EXIT_SHUTDOWN.
+ */
+static void test_msr_platform_info_disabled(struct kvm_vm *vm)
+{
+	struct kvm_run *run;
+
+	run = vcpu_state(vm, VCPU_ID);
+
+	set_msr_platform_info_enabled(vm, false);
+	vcpu_run(vm, VCPU_ID);
+
+	TEST_ASSERT(run->exit_reason == KVM_EXIT_SHUTDOWN,
+		    "Exit_reason other than KVM_EXIT_SHUTDOWN: %u (%s)\n",
+		    run->exit_reason, exit_reason_str(run->exit_reason));
+}
+
+/*
+ * Test entry point: skip without KVM_CAP_MSR_PLATFORM_INFO, otherwise set
+ * the max-turbo-ratio bits in the vCPU's MSR_PLATFORM_INFO, run the
+ * enabled and disabled sub-tests in that order, and restore the MSR.
+ */
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ int rv;
+ uint64_t msr_platform_info;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ rv = kvm_check_cap(KVM_CAP_MSR_PLATFORM_INFO);
+ if (!rv) {
+ print_skip("KVM_CAP_MSR_PLATFORM_INFO not supported");
+ exit(KSFT_SKIP);
+ }
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ /* Remember the original value so it can be written back below. */
+ msr_platform_info = vcpu_get_msr(vm, VCPU_ID, MSR_PLATFORM_INFO);
+ vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO,
+ msr_platform_info | MSR_PLATFORM_INFO_MAX_TURBO_RATIO);
+ test_msr_platform_info_enabled(vm);
+ test_msr_platform_info_disabled(vm);
+ vcpu_set_msr(vm, VCPU_ID, MSR_PLATFORM_INFO, msr_platform_info);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/set_sregs_test.c b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
new file mode 100644
index 000000000..9f7656184
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/set_sregs_test.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM_SET_SREGS tests
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * This is a regression test for the bug fixed by the following commit:
+ * d3802286fa0f ("kvm: x86: Disallow illegal IA32_APIC_BASE MSR values")
+ *
+ * That bug allowed a user-mode program that called the KVM_SET_SREGS
+ * ioctl to put a VCPU's local APIC into an invalid state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+/*
+ * Test entry point: KVM_SET_SREGS must reject apic_base = 1 << 10 and
+ * accept apic_base = 1 << 11.  The guest is never run (no guest code).
+ */
+int main(int argc, char *argv[])
+{
+ struct kvm_sregs sregs;
+ struct kvm_vm *vm;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, NULL);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ /* Expected to fail: rc must be non-zero. */
+ sregs.apic_base = 1 << 10;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(rc, "Set IA32_APIC_BASE to %llx (invalid)",
+ sregs.apic_base);
+ /* Expected to succeed: rc must be zero. */
+ sregs.apic_base = 1 << 11;
+ rc = _vcpu_sregs_set(vm, VCPU_ID, &sregs);
+ TEST_ASSERT(!rc, "Couldn't set IA32_APIC_BASE to %llx (valid)",
+ sregs.apic_base);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/smm_test.c b/tools/testing/selftests/kvm/x86_64/smm_test.c
new file mode 100644
index 000000000..ae39a2206
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/smm_test.c
@@ -0,0 +1,164 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for SMM.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+
+#include "vmx.h"
+#include "svm_util.h"
+
+#define VCPU_ID 1
+
+#define PAGE_SIZE 4096
+
+#define SMRAM_SIZE 65536
+#define SMRAM_MEMSLOT ((1 << 16) | 1)
+#define SMRAM_PAGES (SMRAM_SIZE / PAGE_SIZE)
+#define SMRAM_GPA 0x1000000
+#define SMRAM_STAGE 0xfe
+
+#define STR(x) #x
+#define XSTR(s) STR(s)
+
+#define SYNC_PORT 0xe
+#define DONE 0xff
+
+/*
+ * This is compiled as normal 64-bit code; however, the SMI handler is
+ * executed in real-address mode. To keep things simple we limit ourselves
+ * to a mode-independent subset of asm here.
+ * The SMI handler always reports back the fixed stage SMRAM_STAGE.
+ */
+uint8_t smi_handler[] = {
+ 0xb0, SMRAM_STAGE, /* mov $SMRAM_STAGE, %al */
+ 0xe4, SYNC_PORT, /* in $SYNC_PORT, %al */
+ 0x0f, 0xaa, /* rsm */
+};
+
+/*
+ * Report @phase to the host: load it into RAX ("+a" constraint) and execute
+ * an IN from SYNC_PORT.  The host loop in main() expects a KVM_EXIT_IO and
+ * reads the low byte of RAX as the reported stage.
+ */
+static inline void sync_with_host(uint64_t phase)
+{
+ asm volatile("in $" XSTR(SYNC_PORT)", %%al \n"
+ : "+a" (phase));
+}
+
+/*
+ * Write APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI to the MSR at
+ * APIC_BASE_MSR + (APIC_ICR >> 4).
+ * NOTE(review): judging by the constant names this sends an SMI to the
+ * current CPU through the x2APIC ICR -- confirm against the APIC headers.
+ */
+void self_smi(void)
+{
+ wrmsr(APIC_BASE_MSR + (APIC_ICR >> 4),
+ APIC_DEST_SELF | APIC_INT_ASSERT | APIC_DM_SMI);
+}
+
+/*
+ * Guest entry point.  Reports stages 1, 2 and 4 around the first
+ * self_smi(); when @arg is non-NULL it additionally sets up SVM or VMX
+ * with @arg and reports stages 5 and 7 around a second self_smi().  The
+ * stage numbers skipped here (3 and 6) line up with the host iterations in
+ * which smi_handler reports SMRAM_STAGE instead.  Ends by reporting DONE.
+ */
+void guest_code(void *arg)
+{
+ uint64_t apicbase = rdmsr(MSR_IA32_APICBASE);
+
+ sync_with_host(1);
+
+ wrmsr(MSR_IA32_APICBASE, apicbase | X2APIC_ENABLE);
+
+ sync_with_host(2);
+
+ self_smi();
+
+ sync_with_host(4);
+
+ if (arg) {
+ if (cpu_has_svm())
+ generic_svm_setup(arg, NULL, NULL);
+ else
+ GUEST_ASSERT(prepare_for_vmx_operation(arg));
+
+ sync_with_host(5);
+
+ self_smi();
+
+ sync_with_host(7);
+ }
+
+ sync_with_host(DONE);
+}
+
+/*
+ * Test entry point.
+ *
+ * Backs SMRAM_GPA with SMRAM_SIZE bytes of zeroed guest memory, copies
+ * smi_handler to offset 0x8000 of it and points MSR_IA32_SMBASE at
+ * SMRAM_GPA.  Then runs the guest one stage at a time; after every stage
+ * the vCPU state is saved, the VM is released and restarted, and the state
+ * is loaded back, so each stage runs after a save/restore round trip.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0;
+
+ struct kvm_regs regs;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ int stage, stage_reported;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, SMRAM_GPA,
+ SMRAM_MEMSLOT, SMRAM_PAGES, 0);
+ TEST_ASSERT(vm_phy_pages_alloc(vm, SMRAM_PAGES, SMRAM_GPA, SMRAM_MEMSLOT)
+ == SMRAM_GPA, "could not allocate guest physical addresses?");
+
+ /* NOTE(review): 0x8000 is presumably the SMI entry offset from SMBASE. */
+ memset(addr_gpa2hva(vm, SMRAM_GPA), 0x0, SMRAM_SIZE);
+ memcpy(addr_gpa2hva(vm, SMRAM_GPA) + 0x8000, smi_handler,
+ sizeof(smi_handler));
+
+ vcpu_set_msr(vm, VCPU_ID, MSR_IA32_SMBASE, SMRAM_GPA);
+
+ /* nested_gva stays 0 when neither nested SVM nor nested VMX is usable. */
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ if (nested_svm_supported())
+ vcpu_alloc_svm(vm, &nested_gva);
+ else if (nested_vmx_supported())
+ vcpu_alloc_vmx(vm, &nested_gva);
+ }
+
+ if (!nested_gva)
+ pr_info("will skip SMM test with VMX enabled\n");
+
+ /* nested_gva becomes guest_code()'s arg; 0 disables the nested part. */
+ vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ memset(&regs, 0, sizeof(regs));
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+
+ /* The guest and the SMI handler report the stage in AL. */
+ stage_reported = regs.rax & 0xff;
+
+ if (stage_reported == DONE)
+ goto done;
+
+ TEST_ASSERT(stage_reported == stage ||
+ stage_reported == SMRAM_STAGE,
+ "Unexpected stage: #%x, got %x",
+ stage, stage_reported);
+
+ /* Save, tear down, recreate and restore before the next stage. */
+ state = vcpu_save_state(vm, VCPU_ID);
+ kvm_vm_release(vm);
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/state_test.c b/tools/testing/selftests/kvm/x86_64/state_test.c
new file mode 100644
index 000000000..f6c8b9042
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/state_test.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KVM_GET/SET_* tests
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * Tests for vCPU state save/restore, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+#include "svm_util.h"
+
+#define VCPU_ID 5
+#define L2_GUEST_STACK_SIZE 256
+
+/*
+ * L2 guest for the SVM variant: reports stages 4 and 6, executing vmcall()
+ * after each one.
+ */
+void svm_l2_guest_code(void)
+{
+ GUEST_SYNC(4);
+ /* Exit to L1 */
+ vmcall();
+ GUEST_SYNC(6);
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+/*
+ * L1 guest for the SVM variant: sets up @svm to run svm_l2_guest_code on a
+ * local stack, runs L2 twice and reports stages 3, 5 and 7.  Both L2 exits
+ * must be SVM_EXIT_VMMCALL.
+ */
+static void svm_l1_guest_code(struct svm_test_data *svm)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ GUEST_ASSERT(svm->vmcb_gpa);
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, svm_l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(3);
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(5);
+ /* NOTE(review): presumably steps L2's RIP over the 3-byte vmcall -- confirm. */
+ vmcb->save.rip += 3;
+ run_guest(vmcb, svm->vmcb_gpa);
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_SYNC(7);
+}
+
+/*
+ * L2 guest for the VMX variant: reports stage 6, exits to L1, then checks
+ * and updates GUEST_RIP through vmreadz()/vmwrite() while reporting stages
+ * 10, 11 and 12.  The values read back must survive the host's
+ * save/restore cycle that follows every GUEST_SYNC.
+ */
+void vmx_l2_guest_code(void)
+{
+ GUEST_SYNC(6);
+
+ /* Exit to L1 */
+ vmcall();
+
+ /* L1 has now set up a shadow VMCS for us. */
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+ GUEST_SYNC(10);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee));
+ GUEST_SYNC(11);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee);
+ GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee));
+ GUEST_SYNC(12);
+
+ /* Done, exit to L1 and never come back. */
+ vmcall();
+}
+
+/*
+ * L1 guest for the VMX variant.  Enters VMX operation, loads the VMCS,
+ * launches vmx_l2_guest_code and then switches between the regular and the
+ * shadow VMCS, reporting stages 3, 4, 5, 7, 8, 9 and 13 along the way
+ * (L2 reports 6 and 10-12).
+ *
+ * Calls written as GUEST_ASSERT(!vmlaunch()) / GUEST_ASSERT(!vmresume())
+ * require a zero return; calls written without the '!' require a non-zero
+ * return.  NOTE(review): non-zero presumably means the instruction
+ * failed -- confirm against the vmx.h helpers.
+ */
+static void vmx_l1_guest_code(struct vmx_pages *vmx_pages)
+{
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_SYNC(3);
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ /* The current-VMCS pointer must be unchanged after each sync. */
+ GUEST_SYNC(4);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, vmx_l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ GUEST_SYNC(5);
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ /* Check that the launched state is preserved. */
+ GUEST_ASSERT(vmlaunch());
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_SYNC(7);
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ /* NOTE(review): presumably advances L2 past its 3-byte vmcall -- confirm. */
+ vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
+
+ /* Enable the shadow VMCS and link it to the current VMCS. */
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa);
+
+ /* With the shadow VMCS current, launch and resume must both return non-zero. */
+ GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+ GUEST_ASSERT(vmlaunch());
+ GUEST_SYNC(8);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+
+ /* Seed the value that vmx_l2_guest_code reads back as GUEST_RIP. */
+ vmwrite(GUEST_RIP, 0xc0ffee);
+ GUEST_SYNC(9);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+
+ /* Switch back to the regular VMCS to run L2 (stages 10-12). */
+ GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa));
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ /* The shadow VMCS must now hold the last value written by L2. */
+ GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+ GUEST_SYNC(13);
+ GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+ GUEST_ASSERT(vmlaunch());
+ GUEST_ASSERT(vmresume());
+}
+
+/*
+ * Guest entry point: reports stages 1 and 2, then, when @arg is non-NULL,
+ * continues in the SVM or VMX L1 code depending on cpu_has_svm(), and
+ * finally signals GUEST_DONE().
+ */
+static void __attribute__((__flatten__)) guest_code(void *arg)
+{
+ GUEST_SYNC(1);
+ GUEST_SYNC(2);
+
+ if (arg) {
+ if (cpu_has_svm())
+ svm_l1_guest_code(arg);
+ else
+ vmx_l1_guest_code(arg);
+ }
+
+ GUEST_DONE();
+}
+
+/*
+ * Test entry point.
+ *
+ * Runs the guest one GUEST_SYNC stage at a time.  After every stage the
+ * vCPU state and registers are saved, the VM is released and restarted,
+ * the state is loaded into the new vCPU, and the registers read back from
+ * it must be identical to the saved ones.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t nested_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ /* nested_gva stays 0 when neither nested SVM nor nested VMX is usable. */
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ if (nested_svm_supported())
+ vcpu_alloc_svm(vm, &nested_gva);
+ else if (nested_vmx_supported())
+ vcpu_alloc_vmx(vm, &nested_gva);
+ }
+
+ if (!nested_gva)
+ pr_info("will skip nested state checks\n");
+
+ /* nested_gva becomes guest_code()'s arg; 0 disables the nested part. */
+ vcpu_args_set(vm, VCPU_ID, 1, nested_gva);
+
+ for (stage = 1;; stage++) {
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+
+ /* Snapshot the state and registers of the current vCPU. */
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID);
+ free(state);
+
+ /* The restored vCPU must report exactly the saved registers. */
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2);
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
new file mode 100644
index 000000000..0e1adb4e3
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/svm_vmcall_test.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * svm_vmcall_test
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ *
+ * Nested SVM testing: VMCALL
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "svm_util.h"
+
+#define VCPU_ID 5
+
+static struct kvm_vm *vm;
+
+/* L2 guest: execute a single vmcall instruction; @svm is unused. */
+static void l2_guest_code(struct svm_test_data *svm)
+{
+ __asm__ __volatile__("vmcall");
+}
+
+/*
+ * L1 guest: set up @svm to run l2_guest_code on a local stack, run it once
+ * and require the resulting exit code to be SVM_EXIT_VMMCALL.
+ */
+static void l1_guest_code(struct svm_test_data *svm)
+{
+ #define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ struct vmcb *vmcb = svm->vmcb;
+
+ /* Prepare for L2 execution. */
+ generic_svm_setup(svm, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+ run_guest(vmcb, svm->vmcb_gpa);
+
+ GUEST_ASSERT(vmcb->control.exit_code == SVM_EXIT_VMMCALL);
+ GUEST_DONE();
+}
+
+/*
+ * Test entry point: create a VM running l1_guest_code with an SVM test
+ * area as its argument and keep running the vCPU until the guest reports
+ * UCALL_DONE.  UCALL_ABORT and unknown ucalls fail the test.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t svm_gva;
+
+ nested_svm_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /* svm_gva is passed to the guest as l1_guest_code()'s argument. */
+ vcpu_alloc_svm(vm, &svm_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, svm_gva);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall 0x%lx.", uc.cmd);
+ }
+ }
+done:
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/sync_regs_test.c b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
new file mode 100644
index 000000000..d672f0a47
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/sync_regs_test.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test for x86 KVM_CAP_SYNC_REGS
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * Verifies expected behavior of x86 KVM_CAP_SYNC_REGS functionality,
+ * including requesting an invalid register set, updates to/from values
+ * in kvm_run.s.regs when kvm_valid_regs and kvm_dirty_regs are toggled.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+#define UCALL_PIO_PORT ((uint16_t)0x1000)
+
+/*
+ * ucall is embedded here to protect against compiler reshuffling registers
+ * before calling a function. In this test we only need to get KVM_EXIT_IO
+ * vmexit and preserve RBX, no additional information is needed.
+ */
+/*
+ * Guest loop: IN from UCALL_PIO_PORT (port number passed in DX), then add
+ * 1 to RBX and repeat.  The host-side checks of the form
+ * "rbx == <value written> + 1" in main() rely on this increment.
+ */
+void guest_code(void)
+{
+ asm volatile("1: in %[port], %%al\n"
+ "add $0x1, %%rbx\n"
+ "jmp 1b"
+ : : [port] "d" (UCALL_PIO_PORT) : "rax", "rbx");
+}
+
+/*
+ * Assert that every general-purpose register, rip and rflags have the same
+ * value in @left and @right; the failure message names the register that
+ * differs and prints both values.
+ */
+static void compare_regs(struct kvm_regs *left, struct kvm_regs *right)
+{
+#define REG_COMPARE(reg) \
+ TEST_ASSERT(left->reg == right->reg, \
+ "Register " #reg \
+ " values did not match: 0x%llx, 0x%llx\n", \
+ left->reg, right->reg)
+ REG_COMPARE(rax);
+ REG_COMPARE(rbx);
+ REG_COMPARE(rcx);
+ REG_COMPARE(rdx);
+ REG_COMPARE(rsi);
+ REG_COMPARE(rdi);
+ REG_COMPARE(rsp);
+ REG_COMPARE(rbp);
+ REG_COMPARE(r8);
+ REG_COMPARE(r9);
+ REG_COMPARE(r10);
+ REG_COMPARE(r11);
+ REG_COMPARE(r12);
+ REG_COMPARE(r13);
+ REG_COMPARE(r14);
+ REG_COMPARE(r15);
+ REG_COMPARE(rip);
+ REG_COMPARE(rflags);
+#undef REG_COMPARE
+}
+
+/* Placeholder: currently performs no comparison of @left and @right. */
+static void compare_sregs(struct kvm_sregs *left, struct kvm_sregs *right)
+{
+}
+
+/* Placeholder: currently performs no comparison of @left and @right. */
+static void compare_vcpu_events(struct kvm_vcpu_events *left,
+ struct kvm_vcpu_events *right)
+{
+}
+
+#define TEST_SYNC_FIELDS (KVM_SYNC_X86_REGS|KVM_SYNC_X86_SREGS|KVM_SYNC_X86_EVENTS)
+#define INVALID_SYNC_FIELD 0x80000000
+
+/*
+ * Test entry point.
+ *
+ * Skips unless KVM_CAP_SYNC_REGS reports all of TEST_SYNC_FIELDS and does
+ * not report INVALID_SYNC_FIELD.  Then checks, in order:
+ *  - KVM_RUN fails with EINVAL for invalid kvm_valid_regs/kvm_dirty_regs;
+ *  - s.regs matches KVM_GET_REGS after a run with all fields valid;
+ *  - dirty s.regs values reach the guest (rbx comes back incremented);
+ *  - the four combinations of valid/dirty bits for rbx.
+ */
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_regs regs;
+ struct kvm_sregs sregs;
+ struct kvm_vcpu_events events;
+ int rv, cap;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ cap = kvm_check_cap(KVM_CAP_SYNC_REGS);
+ if ((cap & TEST_SYNC_FIELDS) != TEST_SYNC_FIELDS) {
+ print_skip("KVM_CAP_SYNC_REGS not supported");
+ exit(KSFT_SKIP);
+ }
+ if ((cap & INVALID_SYNC_FIELD) != 0) {
+ print_skip("The \"invalid\" field is not invalid");
+ exit(KSFT_SKIP);
+ }
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Request reading invalid register set from VCPU. */
+ run->kvm_valid_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ run->kvm_valid_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_valid_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_valid_regs = 0;
+
+ /* Request setting invalid register set into VCPU. */
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ run->kvm_dirty_regs = INVALID_SYNC_FIELD | TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(rv < 0 && errno == EINVAL,
+ "Invalid kvm_dirty_regs did not cause expected KVM_RUN error: %d\n",
+ rv);
+ vcpu_state(vm, VCPU_ID)->kvm_dirty_regs = 0;
+
+ /* Request and verify all valid register sets. */
+ /* TODO: BUILD TIME CHECK: TEST_ASSERT(KVM_SYNC_X86_NUM_FIELDS != 3); */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ /* Only the general-purpose register comparison checks anything today. */
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Set and verify various register values. */
+ run->s.regs.regs.rbx = 0xBAD1DEA;
+ run->s.regs.sregs.apic_base = 1 << 11;
+ /* TODO run->s.regs.events.XYZ = ABC; */
+
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = KVM_SYNC_X86_REGS | KVM_SYNC_X86_SREGS;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ /* The guest adds 1 to rbx on every loop iteration. */
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xBAD1DEA + 1,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ TEST_ASSERT(run->s.regs.sregs.apic_base == 1 << 11,
+ "apic_base sync regs value incorrect 0x%llx.",
+ run->s.regs.sregs.apic_base);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ compare_regs(&regs, &run->s.regs.regs);
+
+ vcpu_sregs_get(vm, VCPU_ID, &sregs);
+ compare_sregs(&sregs, &run->s.regs.sregs);
+
+ vcpu_events_get(vm, VCPU_ID, &events);
+ compare_vcpu_events(&events, &run->s.regs.events);
+
+ /* Clear kvm_dirty_regs bits, verify new s.regs values are
+ * overwritten with existing guest values.
+ */
+ run->kvm_valid_regs = TEST_SYNC_FIELDS;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.rbx = 0xDEADBEEF;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx != 0xDEADBEEF,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+
+ /* Clear kvm_valid_regs bits and kvm_dirty_bits.
+ * Verify s.regs values are not overwritten with existing guest values
+ * and that guest values are not overwritten with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = 0;
+ run->s.regs.regs.rbx = 0xAAAA;
+ regs.rbx = 0xBAC0;
+ vcpu_regs_set(vm, VCPU_ID, &regs);
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xAAAA,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(regs.rbx == 0xBAC0 + 1,
+ "rbx guest value incorrect 0x%llx.",
+ regs.rbx);
+
+ /* Clear kvm_valid_regs bits. Verify s.regs values are not overwritten
+ * with existing guest values but that guest values are overwritten
+ * with kvm_sync_regs values.
+ */
+ run->kvm_valid_regs = 0;
+ run->kvm_dirty_regs = TEST_SYNC_FIELDS;
+ run->s.regs.regs.rbx = 0xBBBB;
+ rv = _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->s.regs.regs.rbx == 0xBBBB,
+ "rbx sync regs value incorrect 0x%llx.",
+ run->s.regs.regs.rbx);
+ vcpu_regs_get(vm, VCPU_ID, &regs);
+ TEST_ASSERT(regs.rbx == 0xBBBB + 1,
+ "rbx guest value incorrect 0x%llx.",
+ regs.rbx);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
new file mode 100644
index 000000000..f8e761149
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/tsc_msrs_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for MSR_IA32_TSC and MSR_IA32_TSC_ADJUST.
+ *
+ * Copyright (C) 2020, Red Hat, Inc.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 0
+
+/* Granularity used when comparing TSC values: 2^30 cycles. */
+#define UNITY                  (1ull << 30)
+#define HOST_ADJUST            (UNITY * 64)
+#define GUEST_STEP             (UNITY * 4)
+/*
+ * Round x to the nearest multiple of UNITY.  The argument is parenthesized
+ * so that compound expressions expand with the intended precedence.
+ */
+#define ROUND(x)               (((x) + UNITY / 2) & -UNITY)
+/* Guest side: read an MSR and round it. */
+#define rounded_rdmsr(x)       ROUND(rdmsr(x))
+/* Host side: read an MSR of vCPU 0 and round it; uses `vm` from the caller's scope. */
+#define rounded_host_rdmsr(x)  ROUND(vcpu_get_msr(vm, 0, x))
+
+/*
+ * Guest-side equality assertion.  Evaluates a and b once each; if they
+ * differ, issues UCALL_ABORT with four arguments: the message string, the
+ * line number and both values (printed by run_vcpu() on the host).
+ */
+#define GUEST_ASSERT_EQ(a, b) do { \
+ __typeof(a) _a = (a); \
+ __typeof(b) _b = (b); \
+ if (_a != _b) \
+ ucall(UCALL_ABORT, 4, \
+ "Failed guest assert: " \
+ #a " == " #b, __LINE__, _a, _b); \
+ } while(0)
+
+/*
+ * Guest side of the test.  Each section between GUEST_SYNC(n) calls is
+ * paired with a run_vcpu() call in main(); the values checked here are the
+ * rounded MSR_IA32_TSC and MSR_IA32_TSC_ADJUST as seen by the guest.
+ */
+static void guest_code(void)
+{
+ u64 val = 0;
+
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ val = 1ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ GUEST_SYNC(2);
+ val = 2ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Host: setting the TSC offset. */
+ GUEST_SYNC(3);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ GUEST_SYNC(4);
+ val = 3ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC_ADJUST, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ GUEST_SYNC(5);
+ val = 4ull * GUEST_STEP;
+ wrmsr(MSR_IA32_TSC, val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC), val);
+ GUEST_ASSERT_EQ(rounded_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ GUEST_DONE();
+}
+
+/*
+ * Run @vcpuid once and interpret the resulting ucall.
+ *
+ * UCALL_DONE returns silently.  UCALL_SYNC must carry the string "hello"
+ * and the value @stage + 1.  UCALL_ABORT fails the test with the guest's
+ * message, line number and the two compared values; anything else fails
+ * the test with the vCPU's exit reason.
+ */
+static void run_vcpu(struct kvm_vm *vm, uint32_t vcpuid, int stage)
+{
+	struct ucall uc;
+	uint64_t cmd;
+
+	vcpu_args_set(vm, vcpuid, 1, vcpuid);
+
+	vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL);
+
+	cmd = get_ucall(vm, vcpuid, &uc);
+
+	if (cmd == UCALL_DONE)
+		return;
+
+	if (cmd == UCALL_SYNC) {
+		TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+			    uc.args[1] == stage + 1, "Stage %d: Unexpected register values vmexit, got %lx",
+			    stage + 1, (ulong)uc.args[1]);
+		return;
+	}
+
+	if (cmd == UCALL_ABORT)
+		TEST_ASSERT(false, "%s at %s:%ld\n"
+			    "\tvalues: %#lx, %#lx", (const char *)uc.args[0],
+			    __FILE__, uc.args[1], uc.args[2], uc.args[3]);
+
+	TEST_ASSERT(false, "Unexpected exit: %s",
+		    exit_reason_str(vcpu_state(vm, vcpuid)->exit_reason));
+}
+
+/*
+ * Host side of the test.  Alternates between running the guest for one
+ * stage (run_vcpu) and checking or modifying MSR_IA32_TSC and
+ * MSR_IA32_TSC_ADJUST from the host; all comparisons except one use values
+ * rounded to UNITY.
+ */
+int main(void)
+{
+ struct kvm_vm *vm;
+ uint64_t val;
+
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ val = 0;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 1);
+ val = 1ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /* Guest: writes to MSR_IA32_TSC_ADJUST affect both MSRs. */
+ run_vcpu(vm, VCPU_ID, 2);
+ val = 2ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Host: writes to MSR_IA32_TSC set the host-side offset
+ * and therefore do not change MSR_IA32_TSC_ADJUST.
+ */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC, HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+ run_vcpu(vm, VCPU_ID, 3);
+
+ /* Host: writes to MSR_IA32_TSC_ADJUST do not modify the TSC. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, UNITY * 123456);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ /* Exact (unrounded) comparison: the value was just written by the host. */
+ ASSERT_EQ(vcpu_get_msr(vm, 0, MSR_IA32_TSC_ADJUST), UNITY * 123456);
+
+ /* Restore previous value. */
+ vcpu_set_msr(vm, 0, MSR_IA32_TSC_ADJUST, val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC_ADJUST do not destroy the
+ * host-side offset and affect both MSRs.
+ */
+ run_vcpu(vm, VCPU_ID, 4);
+ val = 3ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), HOST_ADJUST + val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val);
+
+ /*
+ * Guest: writes to MSR_IA32_TSC affect both MSRs, so the host-side
+ * offset is now visible in MSR_IA32_TSC_ADJUST.
+ */
+ run_vcpu(vm, VCPU_ID, 5);
+ val = 4ull * GUEST_STEP;
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC), val);
+ ASSERT_EQ(rounded_host_rdmsr(MSR_IA32_TSC_ADJUST), val - HOST_ADJUST);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/user_msr_test.c b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
new file mode 100644
index 000000000..cbe1b0889
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/user_msr_test.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * tests for KVM_CAP_X86_USER_SPACE_MSR and KVM_X86_SET_MSR_FILTER
+ *
+ * Copyright (C) 2020, Amazon Inc.
+ *
+ * This is a functional test to verify that we can deflect MSR events
+ * into user space.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define VCPU_ID 5
+
+static u32 msr_reads, msr_writes;
+
+static u8 bitmap_00000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_00000000_write[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_40000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_c0000000_read[KVM_MSR_FILTER_MAX_BITMAP_SIZE];
+static u8 bitmap_deadbeef[1] = { 0x1 };
+
+/*
+ * Clear the bit that corresponds to @msr in @bitmap. The bit position is
+ * @msr masked with (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1).
+ */
+static void deny_msr(uint8_t *bitmap, u32 msr)
+{
+ const u32 bit = msr & (KVM_MSR_FILTER_MAX_BITMAP_SIZE - 1);
+ uint8_t *byte = &bitmap[bit / 8];
+
+ *byte = *byte & ~(1 << (bit % 8));
+}
+
+/*
+ * Fill every full-size filter bitmap with ones, then clear the bits of the
+ * MSRs this test wants to see denied.
+ */
+static void prepare_bitmaps(void)
+{
+ u8 *const full_bitmaps[] = {
+ bitmap_00000000,
+ bitmap_00000000_write,
+ bitmap_40000000,
+ bitmap_c0000000,
+ bitmap_c0000000_read,
+ };
+ unsigned int i;
+
+ /* Each of these arrays is KVM_MSR_FILTER_MAX_BITMAP_SIZE bytes long. */
+ for (i = 0; i < sizeof(full_bitmaps) / sizeof(full_bitmaps[0]); i++)
+ memset(full_bitmaps[i], 0xff, KVM_MSR_FILTER_MAX_BITMAP_SIZE);
+
+ deny_msr(bitmap_00000000_write, MSR_IA32_POWER_CTL);
+ deny_msr(bitmap_c0000000_read, MSR_SYSCALL_MASK);
+ deny_msr(bitmap_c0000000_read, MSR_GS_BASE);
+}
+
+struct kvm_msr_filter filter = { /* installed by main() before the guest first runs */
+ .flags = KVM_MSR_FILTER_DEFAULT_DENY,
+ .ranges = {
+ {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000, /* all bits set by prepare_bitmaps() */
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0x00000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_00000000_write, /* MSR_IA32_POWER_CTL bit cleared by prepare_bitmaps() */
+ }, {
+ .flags = KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE,
+ .base = 0x40000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_40000000, /* all bits set by prepare_bitmaps() */
+ }, {
+ .flags = KVM_MSR_FILTER_READ,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000_read, /* MSR_SYSCALL_MASK and MSR_GS_BASE bits cleared by prepare_bitmaps() */
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE,
+ .base = 0xc0000000,
+ .nmsrs = KVM_MSR_FILTER_MAX_BITMAP_SIZE * BITS_PER_BYTE,
+ .bitmap = bitmap_c0000000, /* all bits set by prepare_bitmaps() */
+ }, {
+ .flags = KVM_MSR_FILTER_WRITE | KVM_MSR_FILTER_READ,
+ .base = 0xdeadbeef,
+ .nmsrs = 1,
+ .bitmap = bitmap_deadbeef, /* one-byte bitmap, statically initialised to 0x1 */
+ },
+ },
+};
+
+struct kvm_msr_filter no_filter = { /* installed by handle_ucall() when the guest issues its sync */
+ .flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+};
+
+static void guest_msr_calls(bool trapped) /* trapped: true for the round run while "filter" is installed */
+{
+ /* This goes into the in-kernel emulation */
+ wrmsr(MSR_SYSCALL_MASK, 0);
+
+ if (trapped) {
+ /*
+ * This goes into user space emulation, where handle_rdmsr()
+ * hands back the MSR index as the MSR value.
+ */
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) == MSR_SYSCALL_MASK);
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) == MSR_GS_BASE);
+ } else {
+ GUEST_ASSERT(rdmsr(MSR_SYSCALL_MASK) != MSR_SYSCALL_MASK); /* value must not come from handle_rdmsr() */
+ GUEST_ASSERT(rdmsr(MSR_GS_BASE) != MSR_GS_BASE);
+ }
+
+ /* If trapped == true, this goes into user space emulation */
+ wrmsr(MSR_IA32_POWER_CTL, 0x1234);
+
+ /* This goes into the in-kernel emulation */
+ rdmsr(MSR_IA32_POWER_CTL);
+
+ /* Invalid MSR, should always be handled by user space exit */
+ GUEST_ASSERT(rdmsr(0xdeadbeef) == 0xdeadbeef);
+ wrmsr(0xdeadbeef, 0x1234);
+}
+
+static void guest_code(void) /* guest entry point passed to vm_create_default() in main() */
+{
+ guest_msr_calls(true);
+
+ /*
+ * Disable msr filtering, so that the kernel
+ * handles everything in the next round
+ * (the host answers this sync by installing no_filter).
+ */
+ GUEST_SYNC(0);
+
+ guest_msr_calls(false);
+
+ GUEST_DONE();
+}
+
+/*
+ * Process the ucall the guest issued on its last KVM_EXIT_IO.
+ *
+ * Returns 1 once the guest reports UCALL_DONE and 0 when the vCPU should be
+ * resumed. A guest assertion failure or an unknown ucall fails the test.
+ */
+static int handle_ucall(struct kvm_vm *vm)
+{
+ struct ucall uc;
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ /*
+ * Report which guest assertion fired, in the same format the
+ * other x86_64 tests use, instead of a fixed message.
+ */
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ break;
+ case UCALL_SYNC:
+ /* The guest finished its filtered round: drop the MSR filter. */
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &no_filter);
+ break;
+ case UCALL_DONE:
+ return 1;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ return 0;
+}
+
+/*
+ * Service a KVM_EXIT_X86_RDMSR exit: answer with the MSR index as the value,
+ * count the read, and verify the exit reason reported for the MSRs this test
+ * knows about.
+ */
+static void handle_rdmsr(struct kvm_run *run)
+{
+ msr_reads++;
+ run->msr.data = run->msr.index;
+
+ switch (run->msr.index) {
+ case MSR_SYSCALL_MASK:
+ case MSR_GS_BASE:
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR read trap w/o access fault");
+ break;
+ case 0xdeadbeef:
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "MSR deadbeef read trap w/o inval fault");
+ break;
+ }
+}
+
+/*
+ * Service a KVM_EXIT_X86_WRMSR exit: the written value is dropped, the write
+ * is counted, and for the MSRs this test knows about the written data and
+ * the reported exit reason are verified.
+ */
+static void handle_wrmsr(struct kvm_run *run)
+{
+ msr_writes++;
+
+ switch (run->msr.index) {
+ case MSR_IA32_POWER_CTL:
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for MSR_IA32_POWER_CTL incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_FILTER,
+ "MSR_IA32_POWER_CTL trap w/o access fault");
+ break;
+ case 0xdeadbeef:
+ TEST_ASSERT(run->msr.data == 0x1234,
+ "MSR data for deadbeef incorrect");
+ TEST_ASSERT(run->msr.reason == KVM_MSR_EXIT_REASON_UNKNOWN,
+ "deadbeef trap w/o inval fault");
+ break;
+ }
+}
+
+/*
+ * Host side of the test: enable user space MSR exits, install the MSR
+ * filter, run the guest while servicing its MSR and ucall exits, and finally
+ * check how many MSR accesses were deflected to user space.
+ */
+int main(int argc, char *argv[])
+{
+ struct kvm_enable_cap cap = {
+ .cap = KVM_CAP_X86_USER_SPACE_MSR,
+ .args[0] = KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER,
+ };
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ int rc;
+
+ /* Tell stdout not to buffer its content */
+ setbuf(stdout, NULL);
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ rc = kvm_check_cap(KVM_CAP_X86_USER_SPACE_MSR);
+ TEST_ASSERT(rc, "KVM_CAP_X86_USER_SPACE_MSR is available");
+ vm_enable_cap(vm, &cap);
+
+ rc = kvm_check_cap(KVM_CAP_X86_MSR_FILTER);
+ TEST_ASSERT(rc, "KVM_CAP_X86_MSR_FILTER is available");
+
+ prepare_bitmaps();
+ vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+
+ while (1) {
+ rc = _vcpu_run(vm, VCPU_ID);
+
+ TEST_ASSERT(rc == 0, "vcpu_run failed: %d\n", rc);
+
+ switch (run->exit_reason) {
+ case KVM_EXIT_X86_RDMSR:
+ handle_rdmsr(run);
+ break;
+ case KVM_EXIT_X86_WRMSR:
+ handle_wrmsr(run);
+ break;
+ case KVM_EXIT_IO:
+ if (handle_ucall(vm))
+ goto done;
+ break;
+ default:
+ /*
+ * Any other exit is not part of this test; fail instead
+ * of silently re-entering the guest.
+ */
+ TEST_FAIL("Unexpected exit reason: %u (%s)",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ }
+
+ }
+
+done:
+ /*
+ * Filtered round: reads of MSR_SYSCALL_MASK, MSR_GS_BASE and 0xdeadbeef,
+ * writes of MSR_IA32_POWER_CTL and 0xdeadbeef. Unfiltered round: one
+ * read and one write of 0xdeadbeef.
+ */
+ TEST_ASSERT(msr_reads == 4,
+ "Expected 4 rdmsr handled in user space, got %u", msr_reads);
+ TEST_ASSERT(msr_writes == 3,
+ "Expected 3 wrmsr handled in user space, got %u", msr_writes);
+
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
new file mode 100644
index 000000000..1f65342d6
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_apic_access_test.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_apic_access_test
+ *
+ * Copyright (C) 2020, Google LLC.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * The first subtest simply checks to see that an L2 guest can be
+ * launched with a valid APIC-access address that is backed by a
+ * page of L1 physical memory.
+ *
+ * The second subtest sets the APIC-access address to a (valid) L1
+ * physical address that is not backed by memory. KVM can't handle
+ * this situation, so resuming L2 should result in a KVM exit for
+ * internal error (emulation). This is not an architectural
+ * requirement. It is just a shortcoming of KVM. The internal error
+ * is unfortunate, but it's better than what used to happen!
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID 0
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void) /* L2 body: a single VMCALL, nothing else */
+{
+ /* Exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages, unsigned long high_gpa) /* high_gpa: address main() picked as not backed by memory */
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]); /* one past the last stack slot */
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+ control = vmreadz(SECONDARY_VM_EXEC_CONTROL);
+ control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ vmwrite(SECONDARY_VM_EXEC_CONTROL, control);
+ vmwrite(APIC_ACCESS_ADDR, vmx_pages->apic_access_gpa);
+
+ /*
+ * Try to launch L2 with the memory-backed APIC-access address.
+ * The sync reports the programmed address to the host first.
+ */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ vmwrite(APIC_ACCESS_ADDR, high_gpa);
+
+ /*
+ * Try to resume L2 with the unbacked APIC-access address.
+ * Once the host has seen high_gpa in the sync it expects the next
+ * run to end in KVM_EXIT_INTERNAL_ERROR and stops the test there.
+ */
+ GUEST_SYNC(vmreadz(APIC_ACCESS_ADDR));
+ GUEST_ASSERT(!vmresume());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ unsigned long apic_access_addr = ~0ul; /* address from L1's latest sync; ~0ul until the first one */
+ unsigned int paddr_width;
+ unsigned int vaddr_width;
+ vm_vaddr_t vmx_pages_gva;
+ unsigned long high_gpa;
+ struct vmx_pages *vmx;
+ bool done = false;
+
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ kvm_get_cpu_address_width(&paddr_width, &vaddr_width);
+ high_gpa = (1ul << paddr_width) - getpagesize(); /* last page below 2^paddr_width */
+ if ((unsigned long)DEFAULT_GUEST_PHY_PAGES * getpagesize() > high_gpa) {
+ print_skip("No unbacked physical page available");
+ exit(KSFT_SKIP);
+ }
+
+ vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ prepare_virtualize_apic_accesses(vmx, vm, 0);
+ vcpu_args_set(vm, VCPU_ID, 2, vmx_pages_gva, high_gpa); /* the two arguments of l1_guest_code() */
+
+ while (!done) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ if (apic_access_addr == high_gpa) { /* this run followed L1's sync of the unbacked address */
+ TEST_ASSERT(run->exit_reason ==
+ KVM_EXIT_INTERNAL_ERROR,
+ "Got exit reason other than KVM_EXIT_INTERNAL_ERROR: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+ TEST_ASSERT(run->internal.suberror ==
+ KVM_INTERNAL_ERROR_EMULATION,
+ "Got internal suberror other than KVM_INTERNAL_ERROR_EMULATION: %u\n",
+ run->internal.suberror);
+ break; /* second subtest passed; the guest is not resumed */
+ }
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ apic_access_addr = uc.args[1]; /* APIC_ACCESS_ADDR value L1 passed to GUEST_SYNC() */
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ default:
+ TEST_ASSERT(false, "Unknown ucall %lu", uc.cmd);
+ }
+ }
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
new file mode 100644
index 000000000..fe40ade06
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_close_while_nested_test.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_close_while_nested
+ *
+ * Copyright (C) 2019, Red Hat, Inc.
+ *
+ * Verify that nothing bad happens if a KVM user exits with open
+ * file descriptors while executing a nested guest.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#define VCPU_ID 5
+
+enum {
+ PORT_L0_EXIT = 0x2000,
+};
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+static void l2_guest_code(void)
+{
+ /* Exit to L0: main() leaves its run loop when it sees I/O on PORT_L0_EXIT */
+ asm volatile("inb %%dx, %%al"
+ : : [port] "d" (PORT_L0_EXIT) : "rax");
+}
+
+static void l1_guest_code(struct vmx_pages *vmx_pages) /* L1 body: set up VMX and launch L2 */
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]); /* one past the last stack slot */
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(0); /* L2 is not expected to exit back to L1 in this test */
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva;
+
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /* Allocate VMX pages and shared descriptors (vmx_pages). */
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+ for (;;) {
+ volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+ struct ucall uc;
+
+ vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ if (run->io.port == PORT_L0_EXIT)
+ break; /* L2 reached L0; main() returns without freeing the VM, which is the scenario under test */
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s", (const char *)uc.args[0]);
+ /* NOT REACHED */
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
new file mode 100644
index 000000000..e894a638a
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_dirty_log_test.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KVM dirty page logging test
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_name */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 1
+
+/* The memory slot index to track dirty pages */
+#define TEST_MEM_SLOT_INDEX 1
+#define TEST_MEM_PAGES 3
+
+/* L1 guest test virtual memory offset */
+#define GUEST_TEST_MEM 0xc0000000
+
+/* L2 guest test virtual memory offset */
+#define NESTED_TEST_MEM1 0xc0001000
+#define NESTED_TEST_MEM2 0xc0002000
+
+static void l2_guest_code(void) /* GUEST_SYNC(true/false) tells the host whether page 0 must be dirty */
+{
+ *(volatile uint64_t *)NESTED_TEST_MEM1; /* read only */
+ *(volatile uint64_t *)NESTED_TEST_MEM1 = 1; /* write */
+ GUEST_SYNC(true);
+ GUEST_SYNC(false); /* nothing written since the previous sync */
+
+ *(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+ GUEST_SYNC(true);
+ *(volatile uint64_t *)NESTED_TEST_MEM2 = 1;
+ GUEST_SYNC(true);
+ GUEST_SYNC(false);
+
+ /* Exit to L1 and never come back. */
+ vmcall();
+}
+
+void l1_guest_code(struct vmx_pages *vmx) /* L1 body: launch L2 and report completion */
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+ GUEST_ASSERT(vmx->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx));
+ GUEST_ASSERT(load_vmcs(vmx));
+
+ prepare_vmcs(vmx, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]); /* one past the last stack slot */
+
+ GUEST_SYNC(false); /* host checks that no test page is dirty before L2 runs */
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_SYNC(false); /* and again after L2 has exited */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ GUEST_DONE();
+}
+
+/*
+ * Host side of the test: map the test region for L1 and L2, run the guest
+ * and, at every sync, compare the dirty log and the memory contents of the
+ * test memslot with what the guest says it has just done.
+ */
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+ struct vmx_pages *vmx;
+ unsigned long *bmap;
+ uint64_t *host_test_mem;
+
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct ucall uc;
+ bool done = false;
+
+ nested_vmx_check_supported();
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, l1_guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vmx = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ run = vcpu_state(vm, VCPU_ID);
+
+ /* Add an extra memory slot for testing dirty logging */
+ vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+ GUEST_TEST_MEM,
+ TEST_MEM_SLOT_INDEX,
+ TEST_MEM_PAGES,
+ KVM_MEM_LOG_DIRTY_PAGES);
+
+ /*
+ * Add an identity map for the test region at GVA 0xc0000000
+ * (TEST_MEM_PAGES pages). This affects both L1 and L2. However...
+ */
+ virt_map(vm, GUEST_TEST_MEM, GUEST_TEST_MEM, TEST_MEM_PAGES, 0);
+
+ /*
+ * ... pages in the L2 GPA range [0xc0001000, 0xc0003000) will map to
+ * 0xc0000000.
+ *
+ * Note that prepare_eptp should be called only after L1's GPA map is
+ * done, meaning after the last call to virt_map.
+ */
+ prepare_eptp(vmx, vm, 0);
+ nested_map_memslot(vmx, vm, 0, 0);
+ nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096, 0);
+ nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096, 0);
+
+ bmap = bitmap_alloc(TEST_MEM_PAGES);
+ host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM);
+
+ while (!done) {
+ /* Reset the region so a guest write since the last sync is visible. */
+ memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096);
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Unexpected exit reason: %u (%s),\n",
+ run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ /*
+ * The nested guest wrote at offset 0x1000 in the memslot, but the
+ * dirty bitmap must be filled in according to L1 GPA, not L2.
+ */
+ kvm_vm_get_dirty_log(vm, TEST_MEM_SLOT_INDEX, bmap);
+ /* uc.args[1] is the guest's GUEST_SYNC() argument: page 0 written or not. */
+ if (uc.args[1]) {
+ TEST_ASSERT(test_bit(0, bmap), "Page 0 incorrectly reported clean\n");
+ TEST_ASSERT(host_test_mem[0] == 1, "Page 0 not written by guest\n");
+ } else {
+ TEST_ASSERT(!test_bit(0, bmap), "Page 0 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[0] == 0xaaaaaaaaaaaaaaaaULL, "Page 0 written by guest\n");
+ }
+
+ /* Pages 1 and 2 of the memslot must never be touched. */
+ TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest\n");
+ TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty\n");
+ TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest\n");
+ break;
+ case UCALL_DONE:
+ done = true;
+ break;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+ }
+
+ /* Release what the test allocated, as the sibling tests do. */
+ free(bmap);
+ kvm_vm_free(vm);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
new file mode 100644
index 000000000..a7737af12
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_preemption_timer_test.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * VMX-preemption timer test
+ *
+ * Copyright (C) 2020, Google, LLC.
+ *
+ * Test to ensure that VM-Enter after migration doesn't
+ * incorrectly restart the timer with the full timer
+ * value instead of the partially decayed timer value
+ *
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#define VCPU_ID 5
+#define PREEMPTION_TIMER_VALUE 100000000ull
+#define PREEMPTION_TIMER_VALUE_THRESHOLD1 80000000ull
+
+u32 vmx_pt_rate;
+bool l2_save_restore_done;
+static u64 l2_vmx_pt_start;
+volatile u64 l2_vmx_pt_finish;
+
+union vmx_basic basic;
+union vmx_ctrl_msr ctrl_pin_rev;
+union vmx_ctrl_msr ctrl_exit_rev;
+
+void l2_guest_code(void) /* L2 body: burn time past the first threshold, sync, then spin until preempted */
+{
+ u64 vmx_pt_delta;
+
+ vmcall();
+ l2_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate; /* clear the low vmx_pt_rate bits of the TSC */
+
+ /*
+ * Wait until the 1st threshold has passed
+ */
+ do {
+ l2_vmx_pt_finish = rdtsc();
+ vmx_pt_delta = (l2_vmx_pt_finish - l2_vmx_pt_start) >>
+ vmx_pt_rate;
+ } while (vmx_pt_delta < PREEMPTION_TIMER_VALUE_THRESHOLD1);
+
+ /*
+ * Force L2 through Save and Restore cycle
+ */
+ GUEST_SYNC(1);
+
+ l2_save_restore_done = 1;
+
+ /*
+ * Now wait for the preemption timer to fire and
+ * exit to L1
+ */
+ while ((l2_vmx_pt_finish = rdtsc())) /* keeps recording the latest TSC seen by L2 */
+ ;
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages) /* L1 body: arm the preemption timer for L2 and report timings */
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ u64 l1_vmx_pt_start;
+ u64 l1_vmx_pt_finish;
+ u64 l1_tsc_deadline, l2_tsc_deadline;
+
+ GUEST_ASSERT(vmx_pages->vmcs_gpa);
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+ GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]); /* one past the last stack slot */
+
+ /*
+ * Check for Preemption timer support
+ */
+ basic.val = rdmsr(MSR_IA32_VMX_BASIC);
+ ctrl_pin_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_PINBASED_CTLS
+ : MSR_IA32_VMX_PINBASED_CTLS);
+ ctrl_exit_rev.val = rdmsr(basic.ctrl ? MSR_IA32_VMX_TRUE_EXIT_CTLS
+ : MSR_IA32_VMX_EXIT_CTLS);
+
+ if (!(ctrl_pin_rev.clr & PIN_BASED_VMX_PREEMPTION_TIMER) ||
+ !(ctrl_exit_rev.clr & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER))
+ return; /* either control missing: nothing is checked, the caller then reports GUEST_DONE() */
+
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+ vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + vmreadz(VM_EXIT_INSTRUCTION_LEN)); /* step L2 past the exiting instruction */
+
+ /*
+ * Turn on PIN control and resume the guest
+ */
+ GUEST_ASSERT(!vmwrite(PIN_BASED_VM_EXEC_CONTROL,
+ vmreadz(PIN_BASED_VM_EXEC_CONTROL) |
+ PIN_BASED_VMX_PREEMPTION_TIMER));
+
+ GUEST_ASSERT(!vmwrite(VMX_PREEMPTION_TIMER_VALUE,
+ PREEMPTION_TIMER_VALUE));
+
+ vmx_pt_rate = rdmsr(MSR_IA32_VMX_MISC) & 0x1F; /* low five bits, used below as a TSC shift count */
+
+ l2_save_restore_done = 0;
+
+ l1_vmx_pt_start = (rdtsc() >> vmx_pt_rate) << vmx_pt_rate;
+
+ GUEST_ASSERT(!vmresume());
+
+ l1_vmx_pt_finish = rdtsc();
+
+ /*
+ * Ensure exit from L2 happens after L2 goes through
+ * save and restore
+ */
+ GUEST_ASSERT(l2_save_restore_done);
+
+ /*
+ * Ensure the exit from L2 is due to preemption timer expiry
+ */
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_PREEMPTION_TIMER);
+
+ l1_tsc_deadline = l1_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ l2_tsc_deadline = l2_vmx_pt_start +
+ (PREEMPTION_TIMER_VALUE << vmx_pt_rate);
+
+ /*
+ * Sync with the host and pass the l1|l2 pt_expiry_finish times and
+ * tsc deadlines so that host can verify they are as expected
+ */
+ GUEST_SYNC_ARGS(2, l1_vmx_pt_finish, l1_tsc_deadline,
+ l2_vmx_pt_finish, l2_tsc_deadline);
+}
+
+void guest_code(struct vmx_pages *vmx_pages) /* guest entry point; vmx_pages is the argument set by main() */
+{
+ if (vmx_pages)
+ l1_guest_code(vmx_pages);
+
+ GUEST_DONE();
+}
+
+int main(int argc, char *argv[])
+{
+ vm_vaddr_t vmx_pages_gva = 0;
+
+ struct kvm_regs regs1, regs2;
+ struct kvm_vm *vm;
+ struct kvm_run *run;
+ struct kvm_x86_state *state;
+ struct ucall uc;
+ int stage;
+
+ /*
+ * AMD currently does not implement any VMX features, so for now we
+ * just early out.
+ */
+ nested_vmx_check_supported();
+
+ /* Create VM */
+ vm = vm_create_default(VCPU_ID, 0, guest_code);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ run = vcpu_state(vm, VCPU_ID);
+
+ vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+ if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ vcpu_alloc_vmx(vm, &vmx_pages_gva);
+ vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+ } else {
+ pr_info("will skip vmx preemption timer checks\n");
+ goto done;
+ }
+
+ for (stage = 1;; stage++) { /* one iteration per guest sync; left via UCALL_DONE */
+ _vcpu_run(vm, VCPU_ID);
+ TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+ "Stage %d: unexpected exit reason: %u (%s),\n",
+ stage, run->exit_reason,
+ exit_reason_str(run->exit_reason));
+
+ switch (get_ucall(vm, VCPU_ID, &uc)) {
+ case UCALL_ABORT:
+ TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
+ __FILE__, uc.args[1]);
+ /* NOT REACHED */
+ case UCALL_SYNC:
+ break;
+ case UCALL_DONE:
+ goto done;
+ default:
+ TEST_FAIL("Unknown ucall %lu", uc.cmd);
+ }
+
+ /* UCALL_SYNC is handled here. */
+ TEST_ASSERT(!strcmp((const char *)uc.args[0], "hello") &&
+ uc.args[1] == stage, "Stage %d: Unexpected register values vmexit, got %lx",
+ stage, (ulong)uc.args[1]);
+ /*
+ * If this is stage 2 then we should verify the vmx pt expiry
+ * is as expected.
+ * From L1's perspective verify Preemption timer hasn't
+ * expired too early.
+ * From L2's perspective verify Preemption timer hasn't
+ * expired too late.
+ */
+ if (stage == 2) {
+
+ pr_info("Stage %d: L1 PT expiry TSC (%lu) , L1 TSC deadline (%lu)\n",
+ stage, uc.args[2], uc.args[3]);
+
+ pr_info("Stage %d: L2 PT expiry TSC (%lu) , L2 TSC deadline (%lu)\n",
+ stage, uc.args[4], uc.args[5]);
+
+ TEST_ASSERT(uc.args[2] >= uc.args[3],
+ "Stage %d: L1 PT expiry TSC (%lu) < L1 TSC deadline (%lu)",
+ stage, uc.args[2], uc.args[3]);
+
+ TEST_ASSERT(uc.args[4] < uc.args[5],
+ "Stage %d: L2 PT expiry TSC (%lu) > L2 TSC deadline (%lu)",
+ stage, uc.args[4], uc.args[5]);
+ }
+
+ state = vcpu_save_state(vm, VCPU_ID);
+ memset(&regs1, 0, sizeof(regs1));
+ vcpu_regs_get(vm, VCPU_ID, &regs1); /* registers before the save/restore cycle */
+
+ kvm_vm_release(vm);
+
+ /* Restore state in a new VM. */
+ kvm_vm_restart(vm, O_RDWR);
+ vm_vcpu_add(vm, VCPU_ID);
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+ vcpu_load_state(vm, VCPU_ID, state);
+ run = vcpu_state(vm, VCPU_ID); /* refreshed for the re-added vCPU */
+ free(state);
+
+ memset(&regs2, 0, sizeof(regs2));
+ vcpu_regs_get(vm, VCPU_ID, &regs2); /* registers after the restore; must match regs1 */
+ TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+ "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+ (ulong) regs2.rdi, (ulong) regs2.rsi);
+ }
+
+done:
+ kvm_vm_free(vm);
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
new file mode 100644
index 000000000..d59f3eb67
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_set_nested_state_test.c
@@ -0,0 +1,298 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_set_nested_state_test
+ *
+ * Copyright (C) 2019, Google LLC.
+ *
+ * This test verifies the integrity of calling the ioctl KVM_SET_NESTED_STATE.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <errno.h>
+#include <linux/kvm.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+/*
+ * Mirror of VMCS12_REVISION in arch/x86/kvm/vmx/vmcs12.h. If that value
+ * changes this should be updated.
+ */
+#define VMCS12_REVISION 0x11e57ed0
+#define VCPU_ID 5
+
+bool have_evmcs;
+
+void test_nested_state(struct kvm_vm *vm, struct kvm_nested_state *state) /* set @state on VCPU_ID where success is expected */
+{
+ vcpu_nested_state_set(vm, VCPU_ID, state, false); /* false: presumably the helper asserts on failure itself - TODO confirm */
+}
+
+void test_nested_state_expect_errno(struct kvm_vm *vm,
+ struct kvm_nested_state *state,
+ int expected_errno) /* errno the call must fail with */
+{
+ int rv;
+
+ rv = vcpu_nested_state_set(vm, VCPU_ID, state, true); /* true: the failure is returned here so it can be checked */
+ TEST_ASSERT(rv == -1 && errno == expected_errno,
+ "Expected %s (%d) from vcpu_nested_state_set but got rv: %i errno: %s (%d)",
+ strerror(expected_errno), expected_errno, rv, strerror(errno),
+ errno);
+}
+
+void test_nested_state_expect_einval(struct kvm_vm *vm,
+ struct kvm_nested_state *state) /* setting @state must fail with EINVAL */
+{
+ test_nested_state_expect_errno(vm, state, EINVAL);
+}
+
+void test_nested_state_expect_efault(struct kvm_vm *vm,
+ struct kvm_nested_state *state) /* setting @state must fail with EFAULT */
+{
+ test_nested_state_expect_errno(vm, state, EFAULT);
+}
+
+void set_revision_id_for_vmcs12(struct kvm_nested_state *state,
+ u32 vmcs12_revision)
+{
+ /* Set revision_id in vmcs12 to vmcs12_revision (the first sizeof(u32) bytes of state->data). */
+ memcpy(&state->data, &vmcs12_revision, sizeof(u32));
+}
+
+/*
+ * Reset @state to the baseline used by the generic checks in main(): all
+ * zeroes except for the header-only size and the GUEST_MODE and RUN_PENDING
+ * flags.
+ */
+void set_default_state(struct kvm_nested_state *state)
+{
+ memset(state, 0, sizeof(*state));
+
+ state->size = sizeof(*state);
+ state->format = 0;
+ state->flags = KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING;
+}
+
+/*
+ * Reset the first @size bytes of @state to the VMX baseline used by
+ * test_vmx_nested_state(): format 0, vmxon_pa 0x1000, vmcs12_pa 0x2000, no
+ * SMM flags, the eVMCS flag only when have_evmcs is set, and a vmcs12
+ * revision id of VMCS12_REVISION.
+ */
+void set_default_vmx_state(struct kvm_nested_state *state, int size)
+{
+ memset(state, 0, size);
+
+ state->size = size;
+ state->format = 0;
+ state->flags = have_evmcs ? KVM_STATE_NESTED_EVMCS : 0;
+
+ state->hdr.vmx.smm.flags = 0;
+ state->hdr.vmx.vmcs12_pa = 0x2000;
+ state->hdr.vmx.vmxon_pa = 0x1000;
+
+ set_revision_id_for_vmcs12(state, VMCS12_REVISION);
+}
+
+/*
+ * Exercise KVM_SET_NESTED_STATE with a series of VMX nested states built
+ * from set_default_vmx_state(), checking for each one whether it is accepted
+ * or rejected with EINVAL, and finally read the state back.
+ */
+void test_vmx_nested_state(struct kvm_vm *vm)
+{
+ /* Add a page for VMCS12. */
+ const int state_sz = sizeof(struct kvm_nested_state) + getpagesize();
+ struct kvm_nested_state *state =
+ (struct kvm_nested_state *)malloc(state_sz);
+
+ /* Every step below writes through the pointer, so fail cleanly on OOM. */
+ TEST_ASSERT(state != NULL,
+ "Failed to allocate %d bytes for the nested state", state_sz);
+
+ /* The format must be set to 0. 0 for VMX, 1 for SVM. */
+ set_default_vmx_state(state, state_sz);
+ state->format = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * We cannot virtualize anything if the guest does not have VMX
+ * enabled.
+ */
+ set_default_vmx_state(state, state_sz);
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * We cannot virtualize anything if the guest does not have VMX
+ * enabled. We expect KVM_SET_NESTED_STATE to return 0 if vmxon_pa
+ * is set to -1ull, but the flags must be zero.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ test_nested_state_expect_einval(vm, state);
+
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ state->flags = KVM_STATE_NESTED_EVMCS;
+ test_nested_state_expect_einval(vm, state);
+
+ state->flags = 0;
+ test_nested_state(vm, state);
+
+ /* Enable VMX in the guest CPUID. */
+ vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+ /*
+ * Setting vmxon_pa == -1ull and vmcs_pa == -1ull exits early without
+ * setting the nested state but flags other than eVMCS must be clear.
+ * The eVMCS flag can be set if the enlightened VMCS capability has
+ * been enabled.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ test_nested_state_expect_einval(vm, state);
+
+ state->flags &= KVM_STATE_NESTED_EVMCS;
+ if (have_evmcs) {
+ test_nested_state_expect_einval(vm, state);
+ vcpu_enable_evmcs(vm, VCPU_ID);
+ }
+ test_nested_state(vm, state);
+
+ /* It is invalid to have vmxon_pa == -1ull and SMM flags non-zero. */
+ state->hdr.vmx.smm.flags = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /* Invalid flags are rejected. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* It is invalid to have vmxon_pa == -1ull and vmcs_pa != -1ull. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->flags = 0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* It is invalid to have vmxon_pa set to a non-page aligned address. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = 1;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * It is invalid to have KVM_STATE_NESTED_SMM_GUEST_MODE and
+ * KVM_STATE_NESTED_GUEST_MODE set together.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->flags = KVM_STATE_NESTED_GUEST_MODE |
+ KVM_STATE_NESTED_RUN_PENDING;
+ state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * It is invalid to have any of the SMM flags set besides:
+ * KVM_STATE_NESTED_SMM_GUEST_MODE
+ * KVM_STATE_NESTED_SMM_VMXON
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.smm.flags = ~(KVM_STATE_NESTED_SMM_GUEST_MODE |
+ KVM_STATE_NESTED_SMM_VMXON);
+ test_nested_state_expect_einval(vm, state);
+
+ /* Outside SMM, SMM flags must be zero. */
+ set_default_vmx_state(state, state_sz);
+ state->flags = 0;
+ state->hdr.vmx.smm.flags = KVM_STATE_NESTED_SMM_GUEST_MODE;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * Size must be large enough to fit kvm_nested_state and vmcs12
+ * if VMCS12 physical address is set
+ */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ test_nested_state_expect_einval(vm, state);
+
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ test_nested_state(vm, state);
+
+ /*
+ * KVM_SET_NESTED_STATE succeeds with invalid VMCS
+ * contents but L2 not running.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->flags = 0;
+ test_nested_state(vm, state);
+
+ /* Invalid flags are rejected, even if no VMCS loaded. */
+ set_default_vmx_state(state, state_sz);
+ state->size = sizeof(*state);
+ state->flags = 0;
+ state->hdr.vmx.vmcs12_pa = -1;
+ state->hdr.vmx.flags = ~0;
+ test_nested_state_expect_einval(vm, state);
+
+ /* vmxon_pa cannot be the same address as vmcs_pa. */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = 0;
+ state->hdr.vmx.vmcs12_pa = 0;
+ test_nested_state_expect_einval(vm, state);
+
+ /*
+ * Test that if we leave nesting the state reflects that when we get
+ * it again.
+ */
+ set_default_vmx_state(state, state_sz);
+ state->hdr.vmx.vmxon_pa = -1ull;
+ state->hdr.vmx.vmcs12_pa = -1ull;
+ state->flags = 0;
+ test_nested_state(vm, state);
+ vcpu_nested_state_get(vm, VCPU_ID, state);
+ /* sizeof() yields a size_t and state->size is unsigned: use matching conversions. */
+ TEST_ASSERT(state->size >= sizeof(*state) && state->size <= state_sz,
+ "Size must be between %zu and %d. The size returned was %u.",
+ sizeof(*state), state_sz, state->size);
+ TEST_ASSERT(state->hdr.vmx.vmxon_pa == -1ull, "vmxon_pa must be -1ull.");
+ TEST_ASSERT(state->hdr.vmx.vmcs12_pa == -1ull, "vmcs_pa must be -1ull.");
+
+ free(state);
+}
+
+int main(int argc, char *argv[])
+{
+ struct kvm_vm *vm;
+ struct kvm_nested_state state;
+
+ have_evmcs = kvm_check_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS); /* read by set_default_vmx_state() and test_vmx_nested_state() */
+
+ if (!kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+ print_skip("KVM_CAP_NESTED_STATE not available");
+ exit(KSFT_SKIP);
+ }
+
+ /*
+ * AMD currently does not implement set_nested_state, so for now we
+ * just early out.
+ */
+ nested_vmx_check_supported();
+
+ vm = vm_create_default(VCPU_ID, 0, 0); /* no guest code is passed; the vCPU is never run in this file */
+
+ /* Passing a NULL kvm_nested_state causes a EFAULT. */
+ test_nested_state_expect_efault(vm, NULL);
+
+ /* 'size' cannot be smaller than sizeof(kvm_nested_state). */
+ set_default_state(&state);
+ state.size = 0;
+ test_nested_state_expect_einval(vm, &state);
+
+ /*
+ * Setting the flags 0xf fails the flags check. The only flags that
+ * can be used are:
+ * KVM_STATE_NESTED_GUEST_MODE
+ * KVM_STATE_NESTED_RUN_PENDING
+ * KVM_STATE_NESTED_EVMCS
+ */
+ set_default_state(&state);
+ state.flags = 0xf;
+ test_nested_state_expect_einval(vm, &state);
+
+ /*
+ * If KVM_STATE_NESTED_RUN_PENDING is set then
+ * KVM_STATE_NESTED_GUEST_MODE has to be set as well.
+ */
+ set_default_state(&state);
+ state.flags = KVM_STATE_NESTED_RUN_PENDING;
+ test_nested_state_expect_einval(vm, &state);
+
+ test_vmx_nested_state(vm); /* VMX-specific cases */
+
+ kvm_vm_free(vm);
+ return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
new file mode 100644
index 000000000..fbe8417cb
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/vmx_tsc_adjust_test.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vmx_tsc_adjust_test
+ *
+ * Copyright (C) 2018, Google LLC.
+ *
+ * IA32_TSC_ADJUST test
+ *
+ * According to the SDM, "if an execution of WRMSR to the
+ * IA32_TIME_STAMP_COUNTER MSR adds (or subtracts) value X from the TSC,
+ * the logical processor also adds (or subtracts) value X from the
+ * IA32_TSC_ADJUST MSR."
+ *
+ * Note that when L1 doesn't intercept writes to IA32_TSC, a
+ * WRMSR(IA32_TSC) from L2 sets L1's TSC value, not L2's perceived TSC
+ * value.
+ *
+ * This test verifies that this unusual case is handled correctly.
+ */
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "processor.h"
+#include "vmx.h"
+
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "kselftest.h"
+
+#ifndef MSR_IA32_TSC_ADJUST
+#define MSR_IA32_TSC_ADJUST 0x3b
+#endif
+
+#define PAGE_SIZE 4096
+#define VCPU_ID 5
+
+#define TSC_ADJUST_VALUE (1ll << 32)
+#define TSC_OFFSET_VALUE -(1ll << 48)
+
+enum {
+ PORT_ABORT = 0x1000,
+ PORT_REPORT,
+ PORT_DONE,
+};
+
+enum {
+ VMXON_PAGE = 0,
+ VMCS_PAGE,
+ MSR_BITMAP_PAGE,
+
+ NUM_VMX_PAGES,
+};
+
+struct kvm_single_msr {
+ struct kvm_msrs header;
+ struct kvm_msr_entry entry;
+} __attribute__((packed));
+
+/* The virtual machine object. */
+static struct kvm_vm *vm;
+
+/*
+ * Guest helper: read IA32_TSC_ADJUST, pass the value to the host through
+ * GUEST_SYNC, and assert that it is no greater than @max.
+ */
+static void check_ia32_tsc_adjust(int64_t max)
+{
+ int64_t adjust;
+
+ adjust = rdmsr(MSR_IA32_TSC_ADJUST);
+ /* The host side prints every synced value via report(). */
+ GUEST_SYNC(adjust);
+ GUEST_ASSERT(adjust <= max);
+}
+
+/*
+ * L2 guest body: move the TSC back by a further TSC_ADJUST_VALUE with
+ * WRMSR(IA32_TSC), check IA32_TSC_ADJUST, then exit to L1 with VMCALL.
+ */
+static void l2_guest_code(void)
+{
+ /* Subtract the TSC offset L1 programmed to recover L1's TSC value. */
+ uint64_t l1_tsc = rdtsc() - TSC_OFFSET_VALUE;
+
+ wrmsr(MSR_IA32_TSC, l1_tsc - TSC_ADJUST_VALUE);
+ /* L1 already subtracted one TSC_ADJUST_VALUE, so expect at most -2x. */
+ check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+ /* Exit to L1 */
+ __asm__ __volatile__("vmcall");
+}
+
+/*
+ * L1 guest body: adjust the TSC once, enter VMX operation, launch L2 with
+ * MSR bitmaps and TSC offsetting enabled, and check IA32_TSC_ADJUST after a
+ * deliberately failed VM-entry and after L2's own TSC write.
+ *
+ * @vmx_pages: VMX pages/descriptors allocated by the host for this vCPU.
+ */
+static void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+ unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+ uint32_t control;
+ uintptr_t save_cr3;
+
+ /* Move the TSC back by TSC_ADJUST_VALUE and check the MSR follows. */
+ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE);
+ wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
+ check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+
+ GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+ GUEST_ASSERT(load_vmcs(vmx_pages));
+
+ /* Prepare the VMCS for L2 execution. */
+ prepare_vmcs(vmx_pages, l2_guest_code,
+ &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+ control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
+ control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING;
+ vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
+ vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE);
+
+ /* Jump into L2. First, test failure to load guest CR3. */
+ save_cr3 = vmreadz(GUEST_CR3);
+ vmwrite(GUEST_CR3, -1ull);
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) ==
+ (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE));
+ /* L2 never ran, so the adjustment must still be L1's own. */
+ check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
+ vmwrite(GUEST_CR3, save_cr3);
+
+ /* Launch L2 for real; it returns to L1 through VMCALL. */
+ GUEST_ASSERT(!vmlaunch());
+ GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+ check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE);
+
+ GUEST_DONE();
+}
+
+/*
+ * Print an IA32_TSC_ADJUST value together with its decomposition into a
+ * multiple of TSC_ADJUST_VALUE plus a remainder.
+ */
+static void report(int64_t val)
+{
+	const long long multiple = val / TSC_ADJUST_VALUE;
+	const long long remainder = val % TSC_ADJUST_VALUE;
+
+	pr_info("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
+		val, multiple, remainder);
+}
+
+/*
+ * Host side of the test: create a VM running l1_guest_code, hand it the VMX
+ * pages, and service ucalls until the guest reports UCALL_DONE.  Every
+ * UCALL_SYNC value is printed via report(); UCALL_ABORT fails the test.
+ */
+int main(int argc, char *argv[])
+{
+	vm_vaddr_t vmx_pages_gva;
+
+	nested_vmx_check_supported();
+
+	vm = vm_create_default(VCPU_ID, 0, (void *) l1_guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+
+	/* Allocate VMX pages and shared descriptors (vmx_pages). */
+	vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+
+	for (;;) {
+		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
+		struct ucall uc;
+
+		vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Got exit_reason other than KVM_EXIT_IO: %u (%s)\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		switch (get_ucall(vm, VCPU_ID, &uc)) {
+		case UCALL_ABORT:
+			TEST_FAIL("%s", (const char *)uc.args[0]);
+			/* NOT REACHED */
+		case UCALL_SYNC:
+			report(uc.args[1]);
+			break;
+		case UCALL_DONE:
+			goto done;
+		default:
+			TEST_FAIL("Unknown ucall %lu", uc.cmd);
+		}
+	}
+
+	/*
+	 * The loop above only exits through "goto done", so the VM must be
+	 * freed after the label; freeing it before the label is dead code.
+	 */
+done:
+	kvm_vm_free(vm);
+	return 0;
+}
diff --git a/tools/testing/selftests/kvm/x86_64/xss_msr_test.c b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
new file mode 100644
index 000000000..352937674
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/xss_msr_test.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019, Google LLC.
+ *
+ * Tests for the IA32_XSS MSR.
+ */
+
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+#include "kvm_util.h"
+#include "vmx.h"
+
+#define VCPU_ID 1
+#define MSR_BITS 64
+
+#define X86_FEATURE_XSAVES (1<<3)
+
+/*
+ * Return true when @msr_index is present in the list obtained from
+ * kvm_get_msr_index_list().  The list is freed before returning.
+ */
+bool is_supported_msr(u32 msr_index)
+{
+	struct kvm_msr_list *list = kvm_get_msr_index_list();
+	bool found = false;
+	int i = 0;
+
+	/* Stop at the first match or at the end of the list. */
+	while (!found && i < list->nmsrs)
+		found = (list->indices[i++] == msr_index);
+
+	free(list);
+	return found;
+}
+
+/*
+ * Check the IA32_XSS MSR on a default VM: it must read as zero initially,
+ * and any single-bit value that can be written must come with IA32_XSS
+ * being listed in KVM_GET_MSR_INDEX_LIST.  Skips when XSAVES is not
+ * reported for the vCPU.
+ */
+int main(int argc, char *argv[])
+{
+	struct kvm_cpuid_entry2 *entry;
+	bool xss_supported = false;
+	struct kvm_vm *vm;
+	uint64_t xss_val;
+	int i, r;
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, 0, 0);
+
+	/* XSAVES is looked up in EAX of CPUID leaf 0xd, index 1. */
+	if (kvm_get_cpuid_max_basic() >= 0xd) {
+		entry = kvm_get_supported_cpuid_index(0xd, 1);
+		xss_supported = entry && !!(entry->eax & X86_FEATURE_XSAVES);
+	}
+	if (!xss_supported) {
+		print_skip("IA32_XSS is not supported by the vCPU");
+		exit(KSFT_SKIP);
+	}
+
+	xss_val = vcpu_get_msr(vm, VCPU_ID, MSR_IA32_XSS);
+	TEST_ASSERT(xss_val == 0,
+		    "MSR_IA32_XSS should be initialized to zero\n");
+
+	/* Writing back the value just read must be accepted. */
+	vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, xss_val);
+	/*
+	 * At present, KVM only supports a guest IA32_XSS value of 0. Verify
+	 * that trying to set the guest IA32_XSS to an unsupported value fails.
+	 * Also, in the future when a non-zero value succeeds check that
+	 * IA32_XSS is in the KVM_GET_MSR_INDEX_LIST.
+	 */
+	for (i = 0; i < MSR_BITS; ++i) {
+		r = _vcpu_set_msr(vm, VCPU_ID, MSR_IA32_XSS, 1ull << i);
+		TEST_ASSERT(r == 0 || is_supported_msr(MSR_IA32_XSS),
+			    "IA32_XSS was able to be set, but was not found in KVM_GET_MSR_INDEX_LIST.\n");
+	}
+
+	kvm_vm_free(vm);
+	/* Return explicitly, as the other selftests' main() functions do. */
+	return 0;
+}
diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk
new file mode 100644
index 000000000..56e360e01
--- /dev/null
+++ b/tools/testing/selftests/lib.mk
@@ -0,0 +1,159 @@
+# This mimics the top-level Makefile. We do it explicitly here so that this
+# Makefile can operate with or without the kbuild infrastructure.
+ifneq ($(LLVM),)
+CC := clang
+else
+CC := $(CROSS_COMPILE)gcc
+endif
+
+ifeq (0,$(MAKELEVEL))
+ ifeq ($(OUTPUT),)
+ OUTPUT := $(shell pwd)
+ DEFAULT_INSTALL_HDR_PATH := 1
+ endif
+endif
+selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST))))
+
+# The following are built by the lib.mk common compile rules.
+# TEST_CUSTOM_PROGS should be used by tests that require a
+# custom build rule and must not use the common build rule.
+# TEST_PROGS are for test shell scripts.
+# TEST_CUSTOM_PROGS and TEST_PROGS will be run by the common run_tests
+# and install targets. The common clean target doesn't touch them.
+TEST_GEN_PROGS := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS))
+TEST_GEN_PROGS_EXTENDED := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_PROGS_EXTENDED))
+TEST_GEN_FILES := $(patsubst %,$(OUTPUT)/%,$(TEST_GEN_FILES))
+
+ifdef KSFT_KHDR_INSTALL
+top_srcdir ?= ../../../..
+include $(top_srcdir)/scripts/subarch.include
+ARCH ?= $(SUBARCH)
+
+# set default goal to all, so make without a target runs all, even when
+# all isn't the first target in the file.
+.DEFAULT_GOAL := all
+
+# Invoke headers install with --no-builtin-rules to avoid a circular
+# dependency in the "make kselftest" case. In this case, the second-level
+# make inherits the builtin rules, which try to use a rule to generate
+# Makefile.o and run into
+# "Circular Makefile.o <- prepare dependency dropped."
+# so headers_install fails and the test compile fails.
+# O= KBUILD_OUTPUT cases don't run into this error, since main Makefile
+# invokes them as sub-makes and --no-builtin-rules is not necessary,
+# but doesn't cause any failures. Keep it simple and use the same
+# flags in both cases.
+# Note that the support to install headers from lib.mk is necessary
+# when test Makefile is run directly with "make -C".
+# When local build is done, headers are installed in the default
+# INSTALL_HDR_PATH usr/include.
+.PHONY: khdr
+.NOTPARALLEL:
+khdr:
+ifndef KSFT_KHDR_INSTALL_DONE
+ifeq (1,$(DEFAULT_INSTALL_HDR_PATH))
+ $(MAKE) --no-builtin-rules ARCH=$(ARCH) -C $(top_srcdir) headers_install
+else
+ $(MAKE) --no-builtin-rules INSTALL_HDR_PATH=$$OUTPUT/usr \
+ ARCH=$(ARCH) -C $(top_srcdir) headers_install
+endif
+endif
+
+all: khdr $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
+else
+all: $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES)
+endif
+
+define RUN_TESTS
+ BASE_DIR="$(selfdir)"; \
+ . $(selfdir)/kselftest/runner.sh; \
+ if [ "X$(summary)" != "X" ]; then \
+ per_test_logging=1; \
+ fi; \
+ run_many $(1)
+endef
+
+run_tests: all
+ifdef building_out_of_srctree
+ @if [ "X$(TEST_PROGS)$(TEST_PROGS_EXTENDED)$(TEST_FILES)" != "X" ]; then \
+ rsync -aq $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES) $(OUTPUT); \
+ fi
+ @if [ "X$(TEST_PROGS)" != "X" ]; then \
+ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) \
+ $(addprefix $(OUTPUT)/,$(TEST_PROGS))) ; \
+ else \
+ $(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS)); \
+ fi
+else
+ @$(call RUN_TESTS, $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS))
+endif
+
+define INSTALL_SINGLE_RULE
+ $(if $(INSTALL_LIST),@mkdir -p $(INSTALL_PATH))
+ $(if $(INSTALL_LIST),rsync -a $(INSTALL_LIST) $(INSTALL_PATH)/)
+endef
+
+define INSTALL_RULE
+ $(eval INSTALL_LIST = $(TEST_PROGS)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_FILES)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_GEN_PROGS)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_CUSTOM_PROGS)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_GEN_PROGS_EXTENDED)) $(INSTALL_SINGLE_RULE)
+ $(eval INSTALL_LIST = $(TEST_GEN_FILES)) $(INSTALL_SINGLE_RULE)
+endef
+
+install: all
+ifdef INSTALL_PATH
+ $(INSTALL_RULE)
+else
+ $(error Error: set INSTALL_PATH to use install)
+endif
+
+emit_tests:
+ for TEST in $(TEST_GEN_PROGS) $(TEST_CUSTOM_PROGS) $(TEST_PROGS); do \
+ BASENAME_TEST=`basename $$TEST`; \
+ echo "$(COLLECTION):$$BASENAME_TEST"; \
+ done
+
+# Define RM if it isn't already. It is undefined in the make O= case.
+ifeq ($(RM),)
+RM := rm -f
+endif
+
+define CLEAN
+ $(RM) -r $(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED) $(TEST_GEN_FILES) $(EXTRA_CLEAN)
+endef
+
+clean:
+ $(CLEAN)
+
+# Allows CFLAGS and LDFLAGS to be extended from the command line, e.g.
+# make USERCFLAGS=-Werror USERLDFLAGS=-static
+CFLAGS += $(USERCFLAGS)
+LDFLAGS += $(USERLDFLAGS)
+
+# When make O= with kselftest target from main level
+# the following aren't defined.
+#
+ifdef building_out_of_srctree
+LINK.c = $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH)
+COMPILE.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c
+LINK.S = $(CC) $(ASFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH)
+endif
+
+# Selftest makefiles can override those targets by setting
+# OVERRIDE_TARGETS = 1.
+ifeq ($(OVERRIDE_TARGETS),)
+LOCAL_HDRS := $(selfdir)/kselftest_harness.h $(selfdir)/kselftest.h
+$(OUTPUT)/%:%.c $(LOCAL_HDRS)
+ $(LINK.c) $(filter-out $(LOCAL_HDRS),$^) $(LDLIBS) -o $@
+
+$(OUTPUT)/%.o:%.S
+ $(COMPILE.S) $^ -o $@
+
+$(OUTPUT)/%:%.S
+ $(LINK.S) $^ $(LDLIBS) -o $@
+endif
+
+.PHONY: run_tests all clean install emit_tests
diff --git a/tools/testing/selftests/lib/Makefile b/tools/testing/selftests/lib/Makefile
new file mode 100644
index 000000000..a105f0946
--- /dev/null
+++ b/tools/testing/selftests/lib/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for lib/ function selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := printf.sh bitmap.sh prime_numbers.sh strscpy.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/lib/bitmap.sh b/tools/testing/selftests/lib/bitmap.sh
new file mode 100755
index 000000000..00a416fbc
--- /dev/null
+++ b/tools/testing/selftests/lib/bitmap.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs ../kselftest/module.sh for the test_bitmap kernel module.
+# The dirname expansion is quoted so a path containing whitespace still works.
+"$(dirname "$0")"/../kselftest/module.sh "bitmap" test_bitmap
diff --git a/tools/testing/selftests/lib/config b/tools/testing/selftests/lib/config
new file mode 100644
index 000000000..b80ee3f6e
--- /dev/null
+++ b/tools/testing/selftests/lib/config
@@ -0,0 +1,5 @@
+CONFIG_TEST_PRINTF=m
+CONFIG_TEST_BITMAP=m
+CONFIG_PRIME_NUMBERS=m
+CONFIG_TEST_STRSCPY=m
+CONFIG_TEST_BITOPS=m
diff --git a/tools/testing/selftests/lib/prime_numbers.sh b/tools/testing/selftests/lib/prime_numbers.sh
new file mode 100755
index 000000000..370b79a9c
--- /dev/null
+++ b/tools/testing/selftests/lib/prime_numbers.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Checks fast/slow prime_number generation for inconsistencies
+# The dirname expansion is quoted so a path containing whitespace still works.
+"$(dirname "$0")"/../kselftest/module.sh "prime numbers" prime_numbers selftest=65536
diff --git a/tools/testing/selftests/lib/printf.sh b/tools/testing/selftests/lib/printf.sh
new file mode 100755
index 000000000..05f4544e8
--- /dev/null
+++ b/tools/testing/selftests/lib/printf.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Tests the printf infrastructure using test_printf kernel module.
+# The dirname expansion is quoted so a path containing whitespace still works.
+"$(dirname "$0")"/../kselftest/module.sh "printf" test_printf
diff --git a/tools/testing/selftests/lib/strscpy.sh b/tools/testing/selftests/lib/strscpy.sh
new file mode 100755
index 000000000..be60ef6e1
--- /dev/null
+++ b/tools/testing/selftests/lib/strscpy.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+# Runs ../kselftest/module.sh for the test_strscpy kernel module.
+# The dirname expansion is quoted so a path containing whitespace still works.
+"$(dirname "$0")"/../kselftest/module.sh "strscpy*" test_strscpy
diff --git a/tools/testing/selftests/livepatch/Makefile b/tools/testing/selftests/livepatch/Makefile
new file mode 100644
index 000000000..1acc9e1fa
--- /dev/null
+++ b/tools/testing/selftests/livepatch/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+
+TEST_PROGS_EXTENDED := functions.sh
+TEST_PROGS := \
+ test-livepatch.sh \
+ test-callbacks.sh \
+ test-shadow-vars.sh \
+ test-state.sh \
+ test-ftrace.sh
+
+TEST_FILES := settings
+
+include ../lib.mk
diff --git a/tools/testing/selftests/livepatch/README b/tools/testing/selftests/livepatch/README
new file mode 100644
index 000000000..0942dd582
--- /dev/null
+++ b/tools/testing/selftests/livepatch/README
@@ -0,0 +1,43 @@
+====================
+Livepatch Self Tests
+====================
+
+This is a small set of sanity tests for kernel livepatching.
+
+The test suite loads and unloads several test kernel modules to verify
+livepatch behavior. Debug information is logged to the kernel's message
+buffer and parsed for expected messages. (Note: the tests will compare
+the message buffer for only the duration of each individual test.)
+
+
+Config
+------
+
+Set these config options and their prerequisites:
+
+CONFIG_LIVEPATCH=y
+CONFIG_TEST_LIVEPATCH=m
+
+
+Running the tests
+-----------------
+
+Test kernel modules are built as part of lib/ (make modules) and need to
+be installed (make modules_install) as the test scripts will modprobe
+them.
+
+To run the livepatch selftests, from the top of the kernel source tree:
+
+ % make -C tools/testing/selftests TARGETS=livepatch run_tests
+
+
+Adding tests
+------------
+
+See the common functions.sh file for the existing collection of utility
+functions, most importantly setup_config(), start_test() and
+check_result(). The latter function greps the kernel's ring buffer for
+"livepatch:" and "test_klp" strings, so tests must be sure to include one
+of those strings for result comparison. Other utility functions include
+general module loading and livepatch loading helpers (waiting for patch
+transitions, sysfs entries, etc.)
diff --git a/tools/testing/selftests/livepatch/config b/tools/testing/selftests/livepatch/config
new file mode 100644
index 000000000..ad23100cb
--- /dev/null
+++ b/tools/testing/selftests/livepatch/config
@@ -0,0 +1,3 @@
+CONFIG_LIVEPATCH=y
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_TEST_LIVEPATCH=m
diff --git a/tools/testing/selftests/livepatch/functions.sh b/tools/testing/selftests/livepatch/functions.sh
new file mode 100644
index 000000000..846c7ed71
--- /dev/null
+++ b/tools/testing/selftests/livepatch/functions.sh
@@ -0,0 +1,294 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 Joe Lawrence <joe.lawrence@redhat.com>
+
+# Shell functions for the rest of the scripts.
+
+MAX_RETRIES=600
+RETRY_INTERVAL=".1" # seconds
+
+# Kselftest framework requirement - SKIP code is 4
+ksft_skip=4
+
+# log(msg) - append a message to the kernel log through /dev/kmsg
+#	msg - text to record
+function log() {
+	local msg="$1"
+
+	echo "$msg" > /dev/kmsg
+}
+
+# skip(msg) - record why testing can't proceed (kernel log and stderr), then
+#	      exit with the kselftest SKIP code
+#	msg - explanation
+function skip() {
+	local reason="SKIP: $1"
+
+	log "$reason"
+	echo "$reason" >&2
+	exit $ksft_skip
+}
+
+# is_root() - exit with the kselftest SKIP code unless running as root.
+#	A failing or empty "id -u" is treated as "not root" rather than
+#	silently passing the check.
+function is_root() {
+	local uid
+
+	uid=$(id -u)
+	if [[ "$uid" != "0" ]]; then
+		echo "skip all tests: must be run as root" >&2
+		exit $ksft_skip
+	fi
+}
+
+# die(msg) - game over, man
+# msg - dying words
+function die() {
+ log "ERROR: $1"
+ echo "ERROR: $1" >&2
+ exit 1
+}
+
+# save_dmesg() - snapshot the current dmesg into a temporary file so that
+#		 new content can be detected later.  The file name is stored
+#		 in the global SAVED_DMESG.
+function save_dmesg() {
+	# Fail early: with an empty file name the later redirect and
+	# comparison would fail with unrelated errors.
+	SAVED_DMESG=$(mktemp --tmpdir -t klp-dmesg-XXXXXX) ||
+		die "failed to create temporary dmesg file"
+	dmesg > "$SAVED_DMESG"
+}
+
+# cleanup_dmesg_file() - remove the temporary dmesg snapshot that
+#			 save_dmesg() stored in $SAVED_DMESG.  "rm -f" keeps
+#			 this quiet when the file is already gone.
+function cleanup_dmesg_file() {
+ rm -f "$SAVED_DMESG"
+}
+
+# push_config() - remember the current configuration in globals:
+#	DYNAMIC_DEBUG  - kernel/livepatch entries of the dynamic debug control
+#			 file, rewritten as "file <f> line <n> <flags>" queries
+#	FTRACE_ENABLED - current value of the kernel.ftrace_enabled sysctl
+function push_config() {
+	local control=/sys/kernel/debug/dynamic_debug/control
+
+	DYNAMIC_DEBUG=$(grep '^kernel/livepatch' "$control" | \
+			awk -F'[: ]' '{print "file " $1 " line " $2 " " $4}')
+	FTRACE_ENABLED=$(sysctl --values kernel.ftrace_enabled)
+}
+
+# pop_config() - restore whatever push_config() saved; settings that were
+#		 not captured (empty variables) are left untouched.
+function pop_config() {
+	[[ -z "$DYNAMIC_DEBUG" ]] ||
+		echo -n "$DYNAMIC_DEBUG" > /sys/kernel/debug/dynamic_debug/control
+	[[ -z "$FTRACE_ENABLED" ]] ||
+		sysctl kernel.ftrace_enabled="$FTRACE_ENABLED" &> /dev/null
+}
+
+# set_dynamic_debug() - write two queries to the dynamic debug control file:
+#			"+p" for everything under kernel/livepatch/ and
+#			"-p" for the klp_try_switch_task function.
+#			The here-document uses "<<-", so its body and the
+#			closing EOF must stay tab-indented.
+function set_dynamic_debug() {
+ cat <<-EOF > /sys/kernel/debug/dynamic_debug/control
+ file kernel/livepatch/* +p
+ func klp_try_switch_task -p
+ EOF
+}
+
+# set_ftrace_enabled(value) - set the kernel.ftrace_enabled sysctl and write
+#			      the outcome to the kernel log
+#	value - new sysctl value
+function set_ftrace_enabled() {
+	# Keep the captured output local so it cannot clobber a caller's
+	# "result" variable.
+	local result
+
+	# On success this holds the re-read sysctl line; on failure it holds
+	# the error output of the failing sysctl call.
+	result=$(sysctl -q kernel.ftrace_enabled="$1" 2>&1 && \
+		 sysctl kernel.ftrace_enabled 2>&1)
+	echo "livepatch: $result" > /dev/kmsg
+}
+
+# cleanup() - restore the saved configuration and delete the temporary dmesg
+#	      file.  Installed as the exit/signal trap by setup_config().
+function cleanup() {
+ pop_config
+ cleanup_dmesg_file
+}
+
+# setup_config - save the current config and set a script exit trap that
+# restores the original config. Set up dynamic debug
+# for verbose livepatching output and turn on
+# the ftrace_enabled sysctl. Exits with the SKIP code (via
+# is_root) when not run as root.
+function setup_config() {
+ is_root
+ # Save first, so cleanup() restores the pre-test values.
+ push_config
+ set_dynamic_debug
+ set_ftrace_enabled 1
+ trap cleanup EXIT INT TERM HUP
+}
+
+# loop_until(cmd) - evaluate a command repeatedly until it succeeds, giving
+#		    up after $MAX_RETRIES retries and sleeping
+#		    $RETRY_INTERVAL between attempts
+#	cmd - command and its arguments to run (passed through eval)
+# Returns 0 as soon as cmd succeeds, 1 once the retries are exhausted.
+function loop_until() {
+	local cmd="$*"
+	local i
+
+	for ((i = 0; ; i++)); do
+		eval "$cmd" && return 0
+		[[ $i -eq $MAX_RETRIES ]] && return 1
+		sleep $RETRY_INTERVAL
+	done
+}
+
+# assert_mod(modname) - succeed if "modprobe --dry-run" accepts the module;
+#			all modprobe output is discarded
+#	modname - module name to check
+function assert_mod() {
+	modprobe --dry-run "$1" &>/dev/null
+}
+
+# is_livepatch_mod(modname) - succeed if modinfo reports "livepatch: Y"
+#	modname - module name to check
+function is_livepatch_mod() {
+	local mod="$1"
+	local flag
+
+	flag=$(modinfo "$mod" | awk '/^livepatch:/{print $NF}')
+	[[ "$flag" == "Y" ]]
+}
+
+# __load_mod(modname, params) - modprobe a module and wait for it to show up
+#				 under /sys/module; dies on any modprobe output
+#				 or if the module never appears
+#	modname - module name to load
+#	params  - module parameters to pass to modprobe
+function __load_mod() {
+	local mod="$1"; shift
+	# Keep modprobe's output local so a caller's "ret" is not clobbered.
+	local ret
+
+	local msg="% modprobe $mod $*"
+	# Drop the trailing space left when no parameters were given.
+	log "${msg%% }"
+	ret=$(modprobe "$mod" "$@" 2>&1)
+	if [[ "$ret" != "" ]]; then
+		die "$ret"
+	fi
+
+	# Wait for module in sysfs ...
+	loop_until '[[ -e "/sys/module/$mod" ]]' ||
+		die "failed to load module $mod"
+}
+
+
+# load_mod(modname, params) - load a regular (non-livepatch) kernel module
+#	modname - module name to load
+#	params  - module parameters to pass to modprobe
+# Skips the test when the module cannot be loaded and dies when the module
+# turns out to be a livepatch.
+function load_mod() {
+	local mod="$1"; shift
+
+	if ! assert_mod "$mod"; then
+		skip "unable to load module ${mod}, verify CONFIG_TEST_LIVEPATCH=m and run self-tests as root"
+	fi
+
+	if is_livepatch_mod "$mod"; then
+		die "use load_lp() to load the livepatch module $mod"
+	fi
+
+	__load_mod "$mod" "$@"
+}
+
+# load_lp_nowait(modname, params) - load a kernel module with a livepatch
+#			but do not wait until the transition finishes
+#	modname - module name to load
+#	params  - module parameters to pass to modprobe
+function load_lp_nowait() {
+	local mod="$1"; shift
+
+	if ! assert_mod "$mod"; then
+		skip "unable to load module ${mod}, verify CONFIG_TEST_LIVEPATCH=m and run self-tests as root"
+	fi
+
+	if ! is_livepatch_mod "$mod"; then
+		die "module $mod is not a livepatch"
+	fi
+
+	__load_mod "$mod" "$@"
+
+	# Wait for livepatch in sysfs ...
+	if ! loop_until '[[ -e "/sys/kernel/livepatch/$mod" ]]'; then
+		die "failed to load module $mod (sysfs)"
+	fi
+}
+
+# load_lp(modname, params) - load a kernel module with a livepatch and wait
+#			      until its patching transition has finished
+#	modname - module name to load
+#	params  - module parameters to pass to modprobe
+function load_lp() {
+	local mod="$1"; shift
+
+	load_lp_nowait "$mod" "$@"
+
+	# Wait until the transition finishes ...
+	# The command is a single single-quoted string so that loop_until's
+	# eval sees the pattern ^0$ intact.  Closing and reopening the single
+	# quotes around ^0$ would make bash read $'...' as ANSI-C quoting and
+	# silently drop the "$" anchor from the pattern.
+	loop_until 'grep -q "^0\$" "/sys/kernel/livepatch/$mod/transition"' ||
+		die "failed to complete transition"
+}
+
+# load_failing_mod(modname, params) - load a kernel module, expect to fail
+#	modname - module name to load
+#	params  - module parameters to pass to modprobe
+# A failure is recognised by modprobe producing output; that output is
+# written to the kernel log.
+function load_failing_mod() {
+	local mod="$1"; shift
+	# Keep modprobe's output local so a caller's "ret" is not clobbered.
+	local ret
+
+	local msg="% modprobe $mod $*"
+	# Drop the trailing space left when no parameters were given.
+	log "${msg%% }"
+	ret=$(modprobe "$mod" "$@" 2>&1)
+	if [[ "$ret" == "" ]]; then
+		die "$mod unexpectedly loaded"
+	fi
+	log "$ret"
+}
+
+# unload_mod(modname) - unload a kernel module
+#	modname - module name to unload
+# Waits for the reference count to reach zero, runs rmmod (dying on any
+# output), then waits for the module to leave /sys/module.
+function unload_mod() {
+	local mod="$1"
+	# Keep rmmod's output local so a caller's "ret" is not clobbered.
+	local ret
+
+	# Wait for module reference count to clear ...
+	loop_until '[[ $(cat "/sys/module/$mod/refcnt") == "0" ]]' ||
+		die "failed to unload module $mod (refcnt)"
+
+	log "% rmmod $mod"
+	ret=$(rmmod "$mod" 2>&1)
+	if [[ "$ret" != "" ]]; then
+		die "$ret"
+	fi
+
+	# Wait for module in sysfs ...
+	loop_until '[[ ! -e "/sys/module/$mod" ]]' ||
+		die "failed to unload module $mod (/sys/module)"
+}
+
+# unload_lp(modname) - unload a kernel module with a livepatch; this simply
+#		       forwards to unload_mod()
+#	modname - module name to unload
+function unload_lp() {
+	local mod="$1"
+
+	unload_mod "$mod"
+}
+
+# disable_lp(modname) - disable a livepatch and wait for it to disappear
+#			from sysfs
+#	modname - name of the livepatch module to disable
+function disable_lp() {
+	local mod="$1"
+	local enabled="/sys/kernel/livepatch/$mod/enabled"
+
+	log "% echo 0 > $enabled"
+	echo 0 > "$enabled"
+
+	# Wait until the transition finishes and the livepatch gets
+	# removed from sysfs...
+	loop_until '[[ ! -e "/sys/kernel/livepatch/$mod" ]]' ||
+		die "failed to disable livepatch $mod"
+}
+
+# set_pre_patch_ret(modname, pre_patch_ret) - write a module's pre_patch_ret
+#					      parameter and wait until it
+#					      reads back with the new value
+#	modname       - module name to set
+#	pre_patch_ret - new pre_patch_ret value
+function set_pre_patch_ret {
+	# "mod" and "ret" are referenced by name from the loop_until command
+	# below, so these two names must not change.
+	local mod="$1"
+	local ret="$2"
+	local param="/sys/module/$mod/parameters/pre_patch_ret"
+
+	log "% echo $ret > $param"
+	echo "$ret" > "$param"
+
+	# Wait for sysfs value to hold ...
+	loop_until '[[ $(cat "/sys/module/$mod/parameters/pre_patch_ret") == "$ret" ]]' ||
+		die "failed to set pre_patch_ret parameter for $mod module"
+}
+
+function start_test {
+ local test="$1"
+
+ save_dmesg
+ echo -n "TEST: $test ... "
+ log "===== TEST: $test ====="
+}
+
+# check_result() - verify dmesg output
+# All arguments together form the expected text. Prints "ok" on a match;
+# otherwise prints "not ok" with a diff and calls die().
+# TODO - better filter, out of order msgs, etc?
+function check_result {
+ local expect="$*"
+ local result
+
+ # Note: when comparing dmesg output, the kernel log timestamps
+ # help differentiate repeated testing runs. Remove them with a
+ # post-comparison sed filter.
+
+ # comm -13 keeps only lines that are in the current dmesg but not in
+ # the snapshot taken by save_dmesg(). The greps then keep livepatch
+ # and test_klp lines and drop the kernel tainting messages.
+ result=$(dmesg | comm --nocheck-order -13 "$SAVED_DMESG" - | \
+ grep -e 'livepatch:' -e 'test_klp' | \
+ grep -v '\(tainting\|taints\) kernel' | \
+ sed 's/^\[[ 0-9.]*\] //')
+
+ if [[ "$expect" == "$result" ]] ; then
+ echo "ok"
+ else
+ echo -e "not ok\n\n$(diff -upr --label expected --label result <(echo "$expect") <(echo "$result"))\n"
+ die "livepatch kselftest(s) failed"
+ fi
+
+ # Only reached on success; die() exits the script on failure.
+ cleanup_dmesg_file
+}
diff --git a/tools/testing/selftests/livepatch/settings b/tools/testing/selftests/livepatch/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/livepatch/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/livepatch/test-callbacks.sh b/tools/testing/selftests/livepatch/test-callbacks.sh
new file mode 100755
index 000000000..90b26dbb2
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-callbacks.sh
@@ -0,0 +1,553 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 Joe Lawrence <joe.lawrence@redhat.com>
+
+# Source the shared helpers located in the same directory as this script.
+# The expansions are quoted so a path containing spaces still resolves.
+. "$(dirname "$0")"/functions.sh
+
+# Names of the test modules exercised by this script.
+MOD_LIVEPATCH=test_klp_callbacks_demo
+MOD_LIVEPATCH2=test_klp_callbacks_demo2
+MOD_TARGET=test_klp_callbacks_mod
+MOD_TARGET_BUSY=test_klp_callbacks_busy
+
+setup_config
+
+
+# Test a combination of loading a kernel module and a livepatch that
+# patches a function in the first module. Load the target module
+# before the livepatch module. Unload them in the reverse order.
+#
+# - On livepatch enable, before the livepatch transition starts,
+# pre-patch callbacks are executed for vmlinux and $MOD_TARGET (those
+# klp_objects currently loaded). After klp_objects are patched
+# according to the klp_patch, their post-patch callbacks run and the
+# transition completes.
+#
+# - Similarly, on livepatch disable, pre-unpatch callbacks run before the
+# unpatching transition starts. klp_objects are reverted, post-unpatch
+# callbacks execute and the transition completes.
+
+start_test "target module before livepatch"
+
+# Target first, then the livepatch; tear down livepatch first, target last.
+load_mod "$MOD_TARGET"
+load_lp "$MOD_LIVEPATCH"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+unload_mod "$MOD_TARGET"
+
+check_result "% modprobe $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_init
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit"
+
+
+# This test is similar to the previous test, but (un)load the livepatch
+# module before the target kernel module. This tests the livepatch
+# core's module_coming handler.
+#
+# - On livepatch enable, only pre/post-patch callbacks are executed for
+# currently loaded klp_objects, in this case, vmlinux.
+#
+# - When a targeted module is subsequently loaded, only its
+# pre/post-patch callbacks are executed.
+#
+# - On livepatch disable, all currently loaded klp_objects' (vmlinux and
+# $MOD_TARGET) pre/post-unpatch callbacks are executed.
+
+start_test "module_coming notifier"
+
+# The livepatch is already loaded when the target module is loaded.
+load_lp "$MOD_LIVEPATCH"
+load_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+unload_mod "$MOD_TARGET"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_TARGET
+livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_TARGET: ${MOD_TARGET}_init
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit"
+
+
+# Test loading the livepatch after a targeted kernel module, then unload
+# the kernel module before disabling the livepatch. This tests the
+# livepatch core's module_going handler.
+#
+# - First load a target module, then the livepatch.
+#
+# - When a target module is unloaded, the livepatch is only reverted
+# from that klp_object ($MOD_TARGET). As such, only its pre and
+# post-unpatch callbacks are executed when this occurs.
+#
+# - When the livepatch is disabled, pre and post-unpatch callbacks are
+# run for the remaining klp_object, vmlinux.
+
+start_test "module_going notifier"
+
+# The target module is unloaded before the livepatch is disabled.
+load_mod "$MOD_TARGET"
+load_lp "$MOD_LIVEPATCH"
+unload_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_init
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': patching complete
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# This test is similar to the previous test, however the livepatch is
+# loaded first. This tests the livepatch core's module_coming and
+# module_going handlers.
+#
+# - First load the livepatch.
+#
+# - When a targeted kernel module is subsequently loaded, only its
+# pre/post-patch callbacks are executed.
+#
+# - When the target module is unloaded, the livepatch is only reverted
+# from the $MOD_TARGET klp_object. As such, only pre and
+# post-unpatch callbacks are executed when this occurs.
+
+start_test "module_coming and module_going notifiers"
+
+# The target module is loaded and unloaded between load_lp and disable_lp.
+load_lp "$MOD_LIVEPATCH"
+load_mod "$MOD_TARGET"
+unload_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_TARGET
+livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_TARGET: ${MOD_TARGET}_init
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# A simple test of loading a livepatch without one of its patch target
+# klp_objects ever loaded ($MOD_TARGET).
+#
+# - Load the livepatch.
+#
+# - As expected, only pre/post-(un)patch handlers are executed for
+# vmlinux.
+
+start_test "target module not present"
+
+# $MOD_TARGET is never loaded here; only the livepatch is cycled.
+load_lp "$MOD_LIVEPATCH"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# Test a scenario where a vmlinux pre-patch callback returns a non-zero
+# status (ie, failure).
+#
+# - First load a target module.
+#
+# - Load the livepatch module, setting its 'pre_patch_ret' value to -19
+# (-ENODEV). When its vmlinux pre-patch callback executes, this
+# status code will propagate back to the module-loading subsystem.
+# The result is that the modprobe command refuses to load the livepatch
+# module.
+
+start_test "pre-patch callback -ENODEV"
+
+load_mod $MOD_TARGET
+load_failing_mod $MOD_LIVEPATCH pre_patch_ret=-19
+unload_mod $MOD_TARGET
+
+check_result "% modprobe $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_init
+% modprobe $MOD_LIVEPATCH pre_patch_ret=-19
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+test_klp_callbacks_demo: pre_patch_callback: vmlinux
+livepatch: pre-patch callback failed for object 'vmlinux'
+livepatch: failed to enable patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+modprobe: ERROR: could not insert '$MOD_LIVEPATCH': No such device
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit"
+
+
+# Similar to the previous test, setup a livepatch such that its vmlinux
+# pre-patch callback returns success. However, when a targeted kernel
+# module is later loaded, have the livepatch return a failing status
+# code.
+#
+# - Load the livepatch, vmlinux pre-patch callback succeeds.
+#
+# - Set a trap so subsequent pre-patch callbacks to this livepatch will
+# return -ENODEV.
+#
+# - The livepatch pre-patch callback for subsequently loaded target
+# modules will return failure, so the module loader refuses to load
+# the kernel module. No post-patch or pre/post-unpatch callbacks are
+# executed for this klp_object.
+#
+# - Pre/post-unpatch callbacks are run for the vmlinux klp_object.
+
+start_test "module_coming + pre-patch callback -ENODEV"
+
+# pre_patch_ret is set to -19 only after the livepatch itself has loaded,
+# so the target module is the one loaded with load_failing_mod.
+load_lp "$MOD_LIVEPATCH"
+set_pre_patch_ret "$MOD_LIVEPATCH" -19
+load_failing_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo -19 > /sys/module/$MOD_LIVEPATCH/parameters/pre_patch_ret
+% modprobe $MOD_TARGET
+livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+livepatch: pre-patch callback failed for object '$MOD_TARGET'
+livepatch: patch '$MOD_LIVEPATCH' failed for module '$MOD_TARGET', refusing to load module '$MOD_TARGET'
+modprobe: ERROR: could not insert '$MOD_TARGET': No such device
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# Test loading multiple targeted kernel modules. This test-case is
+# mainly for comparing with the next test-case.
+#
+# - Load a target "busy" kernel module which kicks off a worker function
+# that immediately exits.
+#
+# - Proceed with loading the livepatch and another ordinary target
+# module. Post-patch callbacks are executed and the transition
+# completes quickly.
+
+start_test "multiple target modules"
+
+# The busy module is loaded with block_transition=N for this test.
+load_mod "$MOD_TARGET_BUSY" block_transition=N
+load_lp "$MOD_LIVEPATCH"
+load_mod "$MOD_TARGET"
+unload_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+unload_mod "$MOD_TARGET_BUSY"
+
+check_result "% modprobe $MOD_TARGET_BUSY block_transition=N
+$MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_init
+$MOD_TARGET_BUSY: busymod_work_func enter
+$MOD_TARGET_BUSY: busymod_work_func exit
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_TARGET
+livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_LIVEPATCH: post_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_TARGET: ${MOD_TARGET}_init
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: pre_unpatch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH
+% rmmod $MOD_TARGET_BUSY
+$MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_exit"
+
+
+# A similar test as the previous one, but force the "busy" kernel module
+# to block the livepatch transition.
+#
+# The livepatching core will refuse to patch a task that is currently
+# executing a to-be-patched function -- the consistency model stalls the
+# current patch transition until this safety-check is met. Test a
+# scenario where one of a livepatch's target klp_objects sits on such a
+# function for a long time. Meanwhile, load and unload other target
+# kernel modules while the livepatch transition is in progress.
+#
+# - Load the "busy" kernel module, this time make its work function loop.
+#
+# - Meanwhile, the livepatch is loaded. Notice that the patch
+# transition does not complete as the targeted "busy" module is
+# sitting on a to-be-patched function.
+#
+# - Load a second target module (this one is an ordinary idle kernel
+# module). Note that *no* post-patch callbacks will be executed while
+# the livepatch is still in transition.
+#
+# - Request an unload of the simple kernel module. The patch is still
+# transitioning, so its pre-unpatch callbacks are skipped.
+#
+# - Finally the livepatch is disabled. Since none of the patch's
+# klp_objects' post-patch callbacks executed, the remaining
+# klp_objects' pre-unpatch callbacks are skipped.
+
+start_test "busy target module"
+
+load_mod "$MOD_TARGET_BUSY" block_transition=Y
+load_lp_nowait "$MOD_LIVEPATCH"
+
+# Wait until the livepatch reports in-transition state, i.e. that it's
+# stalled on $MOD_TARGET_BUSY::busymod_work_func()
+#
+# The command is handed to loop_until as one single-quoted string. The
+# regex uses double quotes and an escaped '$': nesting single quotes would
+# end the outer string early and drop the end-of-line anchor from the
+# pattern.
+loop_until 'grep -q "^1\$" /sys/kernel/livepatch/$MOD_LIVEPATCH/transition' ||
+ die "failed to stall transition"
+
+load_mod "$MOD_TARGET"
+unload_mod "$MOD_TARGET"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH"
+unload_mod "$MOD_TARGET_BUSY"
+
+check_result "% modprobe $MOD_TARGET_BUSY block_transition=Y
+$MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_init
+$MOD_TARGET_BUSY: busymod_work_func enter
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+% modprobe $MOD_TARGET
+livepatch: applying patch '$MOD_LIVEPATCH' to loading module '$MOD_TARGET'
+$MOD_LIVEPATCH: pre_patch_callback: $MOD_TARGET -> [MODULE_STATE_COMING] Full formed, running module_init
+$MOD_TARGET: ${MOD_TARGET}_init
+% rmmod $MOD_TARGET
+$MOD_TARGET: ${MOD_TARGET}_exit
+livepatch: reverting patch '$MOD_LIVEPATCH' on unloading module '$MOD_TARGET'
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET -> [MODULE_STATE_GOING] Going away
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': reversing transition from patching to unpatching
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: post_unpatch_callback: $MOD_TARGET_BUSY -> [MODULE_STATE_LIVE] Normal state
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH
+% rmmod $MOD_TARGET_BUSY
+$MOD_TARGET_BUSY: busymod_work_func exit
+$MOD_TARGET_BUSY: ${MOD_TARGET_BUSY}_exit"
+
+
+# Test loading multiple livepatches. This test-case is mainly for comparing
+# with the next test-case.
+#
+# - Load and unload two livepatches, pre and post (un)patch callbacks
+# execute as each patch progresses through its (un)patching
+# transition.
+
+start_test "multiple livepatches"
+
+# Two livepatches are loaded, then disabled in the opposite order.
+load_lp "$MOD_LIVEPATCH"
+load_lp "$MOD_LIVEPATCH2"
+disable_lp "$MOD_LIVEPATCH2"
+disable_lp "$MOD_LIVEPATCH"
+unload_lp "$MOD_LIVEPATCH2"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_LIVEPATCH2
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
+$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
+$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': unpatching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH2
+% rmmod $MOD_LIVEPATCH"
+
+
+# Load multiple livepatches, but the second as an 'atomic-replace'
+# patch. When the latter loads, the original livepatch should be
+# disabled and *none* of its pre/post-unpatch callbacks executed. On
+# the other hand, when the atomic-replace livepatch is disabled, its
+# pre/post-unpatch callbacks *should* be executed.
+#
+# - Load and unload two livepatches, the second of which has its
+# .replace flag set true.
+#
+# - Pre and post patch callbacks are executed for both livepatches.
+#
+# - Once the atomic replace module is loaded, only its pre and post
+# unpatch callbacks are executed.
+
+start_test "atomic replace"
+
+# The second livepatch is loaded with replace=1; only it is disabled
+# explicitly before both modules are unloaded.
+load_lp "$MOD_LIVEPATCH"
+load_lp "$MOD_LIVEPATCH2" replace=1
+disable_lp "$MOD_LIVEPATCH2"
+unload_lp "$MOD_LIVEPATCH2"
+unload_lp "$MOD_LIVEPATCH"
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_LIVEPATCH2 replace=1
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
+$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
+$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
+livepatch: '$MOD_LIVEPATCH2': unpatching complete
+% rmmod $MOD_LIVEPATCH2
+% rmmod $MOD_LIVEPATCH"
+
+
+exit 0
diff --git a/tools/testing/selftests/livepatch/test-ftrace.sh b/tools/testing/selftests/livepatch/test-ftrace.sh
new file mode 100755
index 000000000..552e16551
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-ftrace.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2019 Joe Lawrence <joe.lawrence@redhat.com>
+
+# Source the shared helpers located in the same directory as this script.
+# The expansions are quoted so a path containing spaces still resolves.
+. "$(dirname "$0")"/functions.sh
+
+# Name of the livepatch test module exercised by this script.
+MOD_LIVEPATCH=test_klp_livepatch
+
+setup_config
+
+
+# - turn ftrace_enabled OFF and verify livepatches can't load
+# - turn ftrace_enabled ON and verify livepatch can load
+# - verify that ftrace_enabled can't be turned OFF while a livepatch is loaded
+
+start_test "livepatch interaction with ftrace_enabled sysctl"
+
+set_ftrace_enabled 0
+load_failing_mod $MOD_LIVEPATCH
+
+set_ftrace_enabled 1
+load_lp $MOD_LIVEPATCH
+if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+ echo -e "FAIL\n\n"
+ die "livepatch kselftest(s) failed"
+fi
+
+set_ftrace_enabled 0
+if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+ echo -e "FAIL\n\n"
+ die "livepatch kselftest(s) failed"
+fi
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+check_result "livepatch: kernel.ftrace_enabled = 0
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: failed to register ftrace handler for function 'cmdline_proc_show' (-16)
+livepatch: failed to patch object 'vmlinux'
+livepatch: failed to enable patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': canceling patching transition, going to unpatch
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+modprobe: ERROR: could not insert '$MOD_LIVEPATCH': Device or resource busy
+livepatch: kernel.ftrace_enabled = 1
+% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+livepatch: sysctl: setting key \"kernel.ftrace_enabled\": Device or resource busy
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+exit 0
diff --git a/tools/testing/selftests/livepatch/test-livepatch.sh b/tools/testing/selftests/livepatch/test-livepatch.sh
new file mode 100755
index 000000000..5fe79ac34
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-livepatch.sh
@@ -0,0 +1,162 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 Joe Lawrence <joe.lawrence@redhat.com>
+
+# Source the shared helpers located in the same directory as this script.
+# The expansions are quoted so a path containing spaces still resolves.
+. "$(dirname "$0")"/functions.sh
+
+# Names of the livepatch test modules exercised by this script.
+MOD_LIVEPATCH=test_klp_livepatch
+MOD_REPLACE=test_klp_atomic_replace
+
+setup_config
+
+
+# - load a livepatch that modifies the output from /proc/cmdline and
+# verify correct behavior
+# - unload the livepatch and make sure the patch was removed
+
+start_test "basic function patching"
+
+load_lp $MOD_LIVEPATCH
+
+if [[ "$(cat /proc/cmdline)" != "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+ echo -e "FAIL\n\n"
+ die "livepatch kselftest(s) failed"
+fi
+
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+if [[ "$(cat /proc/cmdline)" == "$MOD_LIVEPATCH: this has been live patched" ]] ; then
+ echo -e "FAIL\n\n"
+ die "livepatch kselftest(s) failed"
+fi
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# - load a livepatch that modifies the output from /proc/cmdline and
+# verify correct behavior
+# - load another livepatch and verify that both livepatches are active
+# - unload the second livepatch and verify that the first is still active
+# - unload the first livepatch and verify none are active
+
+start_test "multiple livepatches"
+
+load_lp $MOD_LIVEPATCH
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+load_lp $MOD_REPLACE replace=0
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+disable_lp $MOD_REPLACE
+unload_lp $MOD_REPLACE
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+$MOD_LIVEPATCH: this has been live patched
+% modprobe $MOD_REPLACE replace=0
+livepatch: enabling patch '$MOD_REPLACE'
+livepatch: '$MOD_REPLACE': initializing patching transition
+livepatch: '$MOD_REPLACE': starting patching transition
+livepatch: '$MOD_REPLACE': completing patching transition
+livepatch: '$MOD_REPLACE': patching complete
+$MOD_LIVEPATCH: this has been live patched
+$MOD_REPLACE: this has been live patched
+% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
+livepatch: '$MOD_REPLACE': initializing unpatching transition
+livepatch: '$MOD_REPLACE': starting unpatching transition
+livepatch: '$MOD_REPLACE': completing unpatching transition
+livepatch: '$MOD_REPLACE': unpatching complete
+% rmmod $MOD_REPLACE
+$MOD_LIVEPATCH: this has been live patched
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# - load a livepatch that modifies the output from /proc/cmdline and
+# verify correct behavior
+# - load an atomic replace livepatch and verify that only the second is active
+# - remove the first livepatch and verify that the atomic replace livepatch
+# is still active
+# - remove the atomic replace livepatch and verify that none are active
+
+start_test "atomic replace livepatch"
+
+load_lp $MOD_LIVEPATCH
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+load_lp $MOD_REPLACE replace=1
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+unload_lp $MOD_LIVEPATCH
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+disable_lp $MOD_REPLACE
+unload_lp $MOD_REPLACE
+
+grep 'live patched' /proc/cmdline > /dev/kmsg
+grep 'live patched' /proc/meminfo > /dev/kmsg
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+livepatch: '$MOD_LIVEPATCH': patching complete
+$MOD_LIVEPATCH: this has been live patched
+% modprobe $MOD_REPLACE replace=1
+livepatch: enabling patch '$MOD_REPLACE'
+livepatch: '$MOD_REPLACE': initializing patching transition
+livepatch: '$MOD_REPLACE': starting patching transition
+livepatch: '$MOD_REPLACE': completing patching transition
+livepatch: '$MOD_REPLACE': patching complete
+$MOD_REPLACE: this has been live patched
+% rmmod $MOD_LIVEPATCH
+$MOD_REPLACE: this has been live patched
+% echo 0 > /sys/kernel/livepatch/$MOD_REPLACE/enabled
+livepatch: '$MOD_REPLACE': initializing unpatching transition
+livepatch: '$MOD_REPLACE': starting unpatching transition
+livepatch: '$MOD_REPLACE': completing unpatching transition
+livepatch: '$MOD_REPLACE': unpatching complete
+% rmmod $MOD_REPLACE"
+
+
+exit 0
diff --git a/tools/testing/selftests/livepatch/test-shadow-vars.sh b/tools/testing/selftests/livepatch/test-shadow-vars.sh
new file mode 100755
index 000000000..e04cb354f
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-shadow-vars.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 Joe Lawrence <joe.lawrence@redhat.com>
+
+. $(dirname $0)/functions.sh
+
+MOD_TEST=test_klp_shadow_vars
+
+setup_config
+
+
+# - load a module that exercises the shadow variable API
+
+start_test "basic shadow variable API"
+
+load_mod $MOD_TEST
+unload_mod $MOD_TEST
+
+check_result "% modprobe $MOD_TEST
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1234) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: shadow_ctor: PTR3 -> PTR2
+$MOD_TEST: klp_shadow_get_or_alloc(obj=PTR1, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR2 = PTR3
+$MOD_TEST: shadow_ctor: PTR6 -> PTR5
+$MOD_TEST: klp_shadow_alloc(obj=PTR1, id=0x1235, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR5 = PTR6
+$MOD_TEST: shadow_ctor: PTR8 -> PTR7
+$MOD_TEST: klp_shadow_alloc(obj=PTR9, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR7 = PTR8
+$MOD_TEST: shadow_ctor: PTR11 -> PTR10
+$MOD_TEST: klp_shadow_alloc(obj=PTR9, id=0x1235, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR10 = PTR11
+$MOD_TEST: shadow_ctor: PTR13 -> PTR12
+$MOD_TEST: klp_shadow_get_or_alloc(obj=PTR14, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR12 = PTR13
+$MOD_TEST: shadow_ctor: PTR16 -> PTR15
+$MOD_TEST: klp_shadow_alloc(obj=PTR14, id=0x1235, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR15 = PTR16
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1234) = PTR3
+$MOD_TEST: got expected PTR3 -> PTR2 result
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1235) = PTR6
+$MOD_TEST: got expected PTR6 -> PTR5 result
+$MOD_TEST: klp_shadow_get(obj=PTR9, id=0x1234) = PTR8
+$MOD_TEST: got expected PTR8 -> PTR7 result
+$MOD_TEST: klp_shadow_get(obj=PTR9, id=0x1235) = PTR11
+$MOD_TEST: got expected PTR11 -> PTR10 result
+$MOD_TEST: klp_shadow_get(obj=PTR14, id=0x1234) = PTR13
+$MOD_TEST: got expected PTR13 -> PTR12 result
+$MOD_TEST: klp_shadow_get(obj=PTR14, id=0x1235) = PTR16
+$MOD_TEST: got expected PTR16 -> PTR15 result
+$MOD_TEST: klp_shadow_get_or_alloc(obj=PTR1, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR2 = PTR3
+$MOD_TEST: got expected PTR3 -> PTR2 result
+$MOD_TEST: klp_shadow_get_or_alloc(obj=PTR9, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR7 = PTR8
+$MOD_TEST: got expected PTR8 -> PTR7 result
+$MOD_TEST: klp_shadow_get_or_alloc(obj=PTR14, id=0x1234, size=8, gfp_flags=GFP_KERNEL), ctor=PTR4, ctor_data=PTR12 = PTR13
+$MOD_TEST: got expected PTR13 -> PTR12 result
+$MOD_TEST: shadow_dtor(obj=PTR1, shadow_data=PTR3)
+$MOD_TEST: klp_shadow_free(obj=PTR1, id=0x1234, dtor=PTR17)
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1234) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: shadow_dtor(obj=PTR9, shadow_data=PTR8)
+$MOD_TEST: klp_shadow_free(obj=PTR9, id=0x1234, dtor=PTR17)
+$MOD_TEST: klp_shadow_get(obj=PTR9, id=0x1234) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: shadow_dtor(obj=PTR14, shadow_data=PTR13)
+$MOD_TEST: klp_shadow_free(obj=PTR14, id=0x1234, dtor=PTR17)
+$MOD_TEST: klp_shadow_get(obj=PTR14, id=0x1234) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1235) = PTR6
+$MOD_TEST: got expected PTR6 -> PTR5 result
+$MOD_TEST: klp_shadow_get(obj=PTR9, id=0x1235) = PTR11
+$MOD_TEST: got expected PTR11 -> PTR10 result
+$MOD_TEST: klp_shadow_get(obj=PTR14, id=0x1235) = PTR16
+$MOD_TEST: got expected PTR16 -> PTR15 result
+$MOD_TEST: klp_shadow_free_all(id=0x1235, dtor=PTR0)
+$MOD_TEST: klp_shadow_get(obj=PTR1, id=0x1235) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: klp_shadow_get(obj=PTR9, id=0x1235) = PTR0
+$MOD_TEST: got expected NULL result
+$MOD_TEST: klp_shadow_get(obj=PTR14, id=0x1235) = PTR0
+$MOD_TEST: got expected NULL result
+% rmmod $MOD_TEST"
+
+exit 0
diff --git a/tools/testing/selftests/livepatch/test-state.sh b/tools/testing/selftests/livepatch/test-state.sh
new file mode 100755
index 000000000..38656721c
--- /dev/null
+++ b/tools/testing/selftests/livepatch/test-state.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2019 SUSE
+
+. $(dirname $0)/functions.sh
+
+MOD_LIVEPATCH=test_klp_state
+MOD_LIVEPATCH2=test_klp_state2
+MOD_LIVEPATCH3=test_klp_state3
+
+setup_config
+
+
+# Load and remove a module that modifies the system state
+
+start_test "system state modification"
+
+load_lp $MOD_LIVEPATCH
+disable_lp $MOD_LIVEPATCH
+unload_lp $MOD_LIVEPATCH
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: allocate_loglevel_state: allocating space to store console_loglevel
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+$MOD_LIVEPATCH: fix_console_loglevel: fixing console_loglevel
+livepatch: '$MOD_LIVEPATCH': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH/enabled
+livepatch: '$MOD_LIVEPATCH': initializing unpatching transition
+$MOD_LIVEPATCH: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: restore_console_loglevel: restoring console_loglevel
+livepatch: '$MOD_LIVEPATCH': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH': completing unpatching transition
+$MOD_LIVEPATCH: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH: free_loglevel_state: freeing space for the stored console_loglevel
+livepatch: '$MOD_LIVEPATCH': unpatching complete
+% rmmod $MOD_LIVEPATCH"
+
+
+# Take over system state change by a cumulative patch
+
+start_test "taking over system state modification"
+
+load_lp $MOD_LIVEPATCH
+load_lp $MOD_LIVEPATCH2
+unload_lp $MOD_LIVEPATCH
+disable_lp $MOD_LIVEPATCH2
+unload_lp $MOD_LIVEPATCH2
+
+check_result "% modprobe $MOD_LIVEPATCH
+livepatch: enabling patch '$MOD_LIVEPATCH'
+livepatch: '$MOD_LIVEPATCH': initializing patching transition
+$MOD_LIVEPATCH: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH: allocate_loglevel_state: allocating space to store console_loglevel
+livepatch: '$MOD_LIVEPATCH': starting patching transition
+livepatch: '$MOD_LIVEPATCH': completing patching transition
+$MOD_LIVEPATCH: post_patch_callback: vmlinux
+$MOD_LIVEPATCH: fix_console_loglevel: fixing console_loglevel
+livepatch: '$MOD_LIVEPATCH': patching complete
+% modprobe $MOD_LIVEPATCH2
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH2: allocate_loglevel_state: space to store console_loglevel already allocated
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+$MOD_LIVEPATCH2: fix_console_loglevel: taking over the console_loglevel change
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% rmmod $MOD_LIVEPATCH
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
+$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
+livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
+$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: free_loglevel_state: freeing space for the stored console_loglevel
+livepatch: '$MOD_LIVEPATCH2': unpatching complete
+% rmmod $MOD_LIVEPATCH2"
+
+
+# Replace a cumulative livepatch with a compatible one, in both directions
+
+start_test "compatible cumulative livepatches"
+
+load_lp $MOD_LIVEPATCH2
+load_lp $MOD_LIVEPATCH3
+unload_lp $MOD_LIVEPATCH2
+load_lp $MOD_LIVEPATCH2
+disable_lp $MOD_LIVEPATCH2
+unload_lp $MOD_LIVEPATCH2
+unload_lp $MOD_LIVEPATCH3
+
+check_result "% modprobe $MOD_LIVEPATCH2
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH2: allocate_loglevel_state: allocating space to store console_loglevel
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+$MOD_LIVEPATCH2: fix_console_loglevel: fixing console_loglevel
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% modprobe $MOD_LIVEPATCH3
+livepatch: enabling patch '$MOD_LIVEPATCH3'
+livepatch: '$MOD_LIVEPATCH3': initializing patching transition
+$MOD_LIVEPATCH3: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH3: allocate_loglevel_state: space to store console_loglevel already allocated
+livepatch: '$MOD_LIVEPATCH3': starting patching transition
+livepatch: '$MOD_LIVEPATCH3': completing patching transition
+$MOD_LIVEPATCH3: post_patch_callback: vmlinux
+$MOD_LIVEPATCH3: fix_console_loglevel: taking over the console_loglevel change
+livepatch: '$MOD_LIVEPATCH3': patching complete
+% rmmod $MOD_LIVEPATCH2
+% modprobe $MOD_LIVEPATCH2
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH2: allocate_loglevel_state: space to store console_loglevel already allocated
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+$MOD_LIVEPATCH2: fix_console_loglevel: taking over the console_loglevel change
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
+$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
+livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
+$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: free_loglevel_state: freeing space for the stored console_loglevel
+livepatch: '$MOD_LIVEPATCH2': unpatching complete
+% rmmod $MOD_LIVEPATCH2
+% rmmod $MOD_LIVEPATCH3"
+
+
+# Failure caused by incompatible cumulative livepatches
+
+start_test "incompatible cumulative livepatches"
+
+load_lp $MOD_LIVEPATCH2
+load_failing_mod $MOD_LIVEPATCH
+disable_lp $MOD_LIVEPATCH2
+unload_lp $MOD_LIVEPATCH2
+
+check_result "% modprobe $MOD_LIVEPATCH2
+livepatch: enabling patch '$MOD_LIVEPATCH2'
+livepatch: '$MOD_LIVEPATCH2': initializing patching transition
+$MOD_LIVEPATCH2: pre_patch_callback: vmlinux
+$MOD_LIVEPATCH2: allocate_loglevel_state: allocating space to store console_loglevel
+livepatch: '$MOD_LIVEPATCH2': starting patching transition
+livepatch: '$MOD_LIVEPATCH2': completing patching transition
+$MOD_LIVEPATCH2: post_patch_callback: vmlinux
+$MOD_LIVEPATCH2: fix_console_loglevel: fixing console_loglevel
+livepatch: '$MOD_LIVEPATCH2': patching complete
+% modprobe $MOD_LIVEPATCH
+livepatch: Livepatch patch ($MOD_LIVEPATCH) is not compatible with the already installed livepatches.
+modprobe: ERROR: could not insert '$MOD_LIVEPATCH': Invalid argument
+% echo 0 > /sys/kernel/livepatch/$MOD_LIVEPATCH2/enabled
+livepatch: '$MOD_LIVEPATCH2': initializing unpatching transition
+$MOD_LIVEPATCH2: pre_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: restore_console_loglevel: restoring console_loglevel
+livepatch: '$MOD_LIVEPATCH2': starting unpatching transition
+livepatch: '$MOD_LIVEPATCH2': completing unpatching transition
+$MOD_LIVEPATCH2: post_unpatch_callback: vmlinux
+$MOD_LIVEPATCH2: free_loglevel_state: freeing space for the stored console_loglevel
+livepatch: '$MOD_LIVEPATCH2': unpatching complete
+% rmmod $MOD_LIVEPATCH2"
+
+exit 0
diff --git a/tools/testing/selftests/lkdtm/.gitignore b/tools/testing/selftests/lkdtm/.gitignore
new file mode 100644
index 000000000..f26212605
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/.gitignore
@@ -0,0 +1,2 @@
+*.sh
+!run.sh
diff --git a/tools/testing/selftests/lkdtm/Makefile b/tools/testing/selftests/lkdtm/Makefile
new file mode 100644
index 000000000..1bcc9ee99
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for LKDTM regression tests
+
+include ../lib.mk
+
+# NOTE: $(OUTPUT) won't get default value if used before lib.mk
+TEST_FILES := tests.txt
+TEST_GEN_PROGS = $(patsubst %,$(OUTPUT)/%.sh,$(shell awk '{print $$1}' tests.txt | sed -e 's/\#//'))
+all: $(TEST_GEN_PROGS)
+
+$(OUTPUT)/%: run.sh tests.txt
+ install -m 0744 run.sh $@
diff --git a/tools/testing/selftests/lkdtm/config b/tools/testing/selftests/lkdtm/config
new file mode 100644
index 000000000..d874990e4
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/config
@@ -0,0 +1 @@
+CONFIG_LKDTM=y
diff --git a/tools/testing/selftests/lkdtm/run.sh b/tools/testing/selftests/lkdtm/run.sh
new file mode 100755
index 000000000..e95e79bd3
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/run.sh
@@ -0,0 +1,104 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# This reads tests.txt for the list of LKDTM tests to invoke. Any marked
+# with a leading "#" are skipped. The rest of the line after the
+# test name is either the text to look for in dmesg for a "success",
+# or the rationale for why a test is marked to be skipped.
+#
+set -e
+TRIGGER=/sys/kernel/debug/provoke-crash/DIRECT
+CLEAR_ONCE=/sys/kernel/debug/clear_warn_once
+KSELFTEST_SKIP_TEST=4
+
+# Verify we have LKDTM available in the kernel.
+if [ ! -r $TRIGGER ] ; then
+ /sbin/modprobe -q lkdtm || true
+ if [ ! -r $TRIGGER ] ; then
+ echo "Cannot find $TRIGGER (missing CONFIG_LKDTM?)"
+ else
+ echo "Cannot write $TRIGGER (need to run as root?)"
+ fi
+ # Skip this test
+ exit $KSELFTEST_SKIP_TEST
+fi
+
+# Figure out which test to run from our script name.
+test=$(basename $0 .sh)
+# Look up details about the test from master list of LKDTM tests.
+line=$(grep -E '^#?'"$test"'\b' tests.txt)
+if [ -z "$line" ]; then
+ echo "Skipped: missing test '$test' in tests.txt"
+ exit $KSELFTEST_SKIP_TEST
+fi
+# Check that the test is known to LKDTM.
+if ! grep -E -q '^'"$test"'$' "$TRIGGER" ; then
+ echo "Skipped: test '$test' missing in $TRIGGER!"
+ exit $KSELFTEST_SKIP_TEST
+fi
+
+# Extract notes/expected output from test list.
+test=$(echo "$line" | cut -d" " -f1)
+if echo "$line" | grep -q ' ' ; then
+ expect=$(echo "$line" | cut -d" " -f2-)
+else
+ expect=""
+fi
+
+# If the test is commented out, report a skip
+if echo "$test" | grep -q '^#' ; then
+ test=$(echo "$test" | cut -c2-)
+ if [ -z "$expect" ]; then
+ expect="crashes entire system"
+ fi
+ echo "Skipping $test: $expect"
+ exit $KSELFTEST_SKIP_TEST
+fi
+
+# If no expected output given, assume an Oops with back trace is success.
+if [ -z "$expect" ]; then
+ expect="call trace:"
+fi
+
+# Prepare log for report checking
+LOG=$(mktemp --tmpdir -t lkdtm-log-XXXXXX)
+DMESG=$(mktemp --tmpdir -t lkdtm-dmesg-XXXXXX)
+cleanup() {
+ rm -f "$LOG" "$DMESG"
+}
+trap cleanup EXIT
+
+# Reset WARN_ONCE counters so we trip it each time this runs.
+if [ -w $CLEAR_ONCE ] ; then
+ echo 1 > $CLEAR_ONCE
+fi
+
+# Save existing dmesg so we can detect new content below
+dmesg > "$DMESG"
+
+# Since the kernel is likely killing the process writing to the trigger
+# file, it must not be the script's shell itself. i.e. we cannot do:
+# echo "$test" >"$TRIGGER"
+# Instead, use "cat" to take the signal. Since the shell will yell about
+# the signal that killed the subprocess, we must ignore the failure and
+# continue. However we don't silence stderr since there might be other
+# useful details reported there in the case of other unexpected conditions.
+echo "$test" | cat >"$TRIGGER" || true
+
+# Record and dump the results
+dmesg | comm --nocheck-order -13 "$DMESG" - > "$LOG" || true
+
+cat "$LOG"
+# Check for expected output
+if grep -E -qi "$expect" "$LOG" ; then
+ echo "$test: saw '$expect': ok"
+ exit 0
+else
+ if grep -E -qi XFAIL: "$LOG" ; then
+ echo "$test: saw 'XFAIL': [SKIP]"
+ exit $KSELFTEST_SKIP_TEST
+ else
+ echo "$test: missing '$expect': [FAIL]"
+ exit 1
+ fi
+fi
diff --git a/tools/testing/selftests/lkdtm/tests.txt b/tools/testing/selftests/lkdtm/tests.txt
new file mode 100644
index 000000000..9b84cba9e
--- /dev/null
+++ b/tools/testing/selftests/lkdtm/tests.txt
@@ -0,0 +1,70 @@
+#PANIC
+BUG kernel BUG at
+WARNING WARNING:
+WARNING_MESSAGE message trigger
+EXCEPTION
+#LOOP Hangs the system
+#EXHAUST_STACK Corrupts memory on failure
+#CORRUPT_STACK Crashes entire system on success
+#CORRUPT_STACK_STRONG Crashes entire system on success
+CORRUPT_LIST_ADD list_add corruption
+CORRUPT_LIST_DEL list_del corruption
+STACK_GUARD_PAGE_LEADING
+STACK_GUARD_PAGE_TRAILING
+UNSET_SMEP pinned CR4 bits changed:
+DOUBLE_FAULT
+CORRUPT_PAC
+UNALIGNED_LOAD_STORE_WRITE
+#OVERWRITE_ALLOCATION Corrupts memory on failure
+#WRITE_AFTER_FREE Corrupts memory on failure
+READ_AFTER_FREE
+#WRITE_BUDDY_AFTER_FREE Corrupts memory on failure
+READ_BUDDY_AFTER_FREE
+SLAB_FREE_DOUBLE
+SLAB_FREE_CROSS
+SLAB_FREE_PAGE
+#SOFTLOCKUP Hangs the system
+#HARDLOCKUP Hangs the system
+#SPINLOCKUP Hangs the system
+#HUNG_TASK Hangs the system
+EXEC_DATA
+EXEC_STACK
+EXEC_KMALLOC
+EXEC_VMALLOC
+EXEC_RODATA
+EXEC_USERSPACE
+EXEC_NULL
+ACCESS_USERSPACE
+ACCESS_NULL
+WRITE_RO
+WRITE_RO_AFTER_INIT
+WRITE_KERN
+REFCOUNT_INC_OVERFLOW
+REFCOUNT_ADD_OVERFLOW
+REFCOUNT_INC_NOT_ZERO_OVERFLOW
+REFCOUNT_ADD_NOT_ZERO_OVERFLOW
+REFCOUNT_DEC_ZERO
+REFCOUNT_DEC_NEGATIVE Negative detected: saturated
+REFCOUNT_DEC_AND_TEST_NEGATIVE Negative detected: saturated
+REFCOUNT_SUB_AND_TEST_NEGATIVE Negative detected: saturated
+REFCOUNT_INC_ZERO
+REFCOUNT_ADD_ZERO
+REFCOUNT_INC_SATURATED Saturation detected: still saturated
+REFCOUNT_DEC_SATURATED Saturation detected: still saturated
+REFCOUNT_ADD_SATURATED Saturation detected: still saturated
+REFCOUNT_INC_NOT_ZERO_SATURATED
+REFCOUNT_ADD_NOT_ZERO_SATURATED
+REFCOUNT_DEC_AND_TEST_SATURATED Saturation detected: still saturated
+REFCOUNT_SUB_AND_TEST_SATURATED Saturation detected: still saturated
+#REFCOUNT_TIMING timing only
+#ATOMIC_TIMING timing only
+USERCOPY_HEAP_SIZE_TO
+USERCOPY_HEAP_SIZE_FROM
+USERCOPY_HEAP_WHITELIST_TO
+USERCOPY_HEAP_WHITELIST_FROM
+USERCOPY_STACK_FRAME_TO
+USERCOPY_STACK_FRAME_FROM
+USERCOPY_STACK_BEYOND
+USERCOPY_KERNEL
+STACKLEAK_ERASING OK: the rest of the thread stack is properly erased
+CFI_FORWARD_PROTO
diff --git a/tools/testing/selftests/locking/Makefile b/tools/testing/selftests/locking/Makefile
new file mode 100644
index 000000000..6e7761ab3
--- /dev/null
+++ b/tools/testing/selftests/locking/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for locking/ww_mutex selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := ww_mutex.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/locking/ww_mutex.sh b/tools/testing/selftests/locking/ww_mutex.sh
new file mode 100755
index 000000000..91e4ac756
--- /dev/null
+++ b/tools/testing/selftests/locking/ww_mutex.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Runs API tests for struct ww_mutex (Wait/Wound mutexes)
+if ! /sbin/modprobe -q -n test-ww_mutex; then
+ echo "ww_mutex: module test-ww_mutex is not found [SKIP]"
+ exit $ksft_skip
+fi
+
+if /sbin/modprobe -q test-ww_mutex; then
+ /sbin/modprobe -q -r test-ww_mutex
+ echo "locking/ww_mutex: ok"
+else
+ echo "locking/ww_mutex: [FAIL]"
+ exit 1
+fi
diff --git a/tools/testing/selftests/media_tests/.gitignore b/tools/testing/selftests/media_tests/.gitignore
new file mode 100644
index 000000000..da438e780
--- /dev/null
+++ b/tools/testing/selftests/media_tests/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+media_device_test
+media_device_open
+video_device_test
diff --git a/tools/testing/selftests/media_tests/Makefile b/tools/testing/selftests/media_tests/Makefile
new file mode 100644
index 000000000..60826d7d3
--- /dev/null
+++ b/tools/testing/selftests/media_tests/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+CFLAGS += -I../ -I../../../../usr/include/
+TEST_GEN_PROGS := media_device_test media_device_open video_device_test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/media_tests/bind_unbind_sample.sh b/tools/testing/selftests/media_tests/bind_unbind_sample.sh
new file mode 100755
index 000000000..0101c1ec4
--- /dev/null
+++ b/tools/testing/selftests/media_tests/bind_unbind_sample.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Find device number in /sys/bus/usb/drivers/drivername
+# Edit this file to update the driver number and name
+# Example test for uvcvideo driver
+#i=0
+# while :; do
+# i=$((i+1))
+# echo 1-5:1.0 > /sys/bus/usb/drivers/uvcvideo/unbind;
+# echo 1-5:1.0 > /sys/bus/usb/drivers/uvcvideo/bind;
+# clear
+# echo $i
+#done
diff --git a/tools/testing/selftests/media_tests/media_dev_allocator.sh b/tools/testing/selftests/media_tests/media_dev_allocator.sh
new file mode 100755
index 000000000..ffe00c59a
--- /dev/null
+++ b/tools/testing/selftests/media_tests/media_dev_allocator.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Media Device Allocator API test script
+# Copyright (c) 2019 Shuah Khan <shuah@kernel.org>
+
+echo "Media Device Allocator testing: unbind and bind"
+echo "media driver $1 audio driver $2"
+
+MDRIVER=/sys/bus/usb/drivers/$1
+cd $MDRIVER
+MDEV=$(ls -d *\-*)
+
+ADRIVER=/sys/bus/usb/drivers/$2
+cd $ADRIVER
+ADEV=$(ls -d *\-*.1)
+
+echo "=================================="
+echo "Test unbind both devices - start"
+echo "Running unbind of $MDEV from $MDRIVER"
+echo $MDEV > $MDRIVER/unbind;
+
+echo "Media device should still be present!"
+ls -l /dev/media*
+
+echo "sound driver is at: $ADRIVER"
+echo "Device is: $ADEV"
+
+echo "Running unbind of $ADEV from $ADRIVER"
+echo $ADEV > $ADRIVER/unbind;
+
+echo "Media device should have been deleted!"
+ls -l /dev/media*
+echo "Test unbind both devices - end"
+
+echo "=================================="
+
+echo "Test bind both devices - start"
+echo "Running bind of $MDEV from $MDRIVER"
+echo $MDEV > $MDRIVER/bind;
+
+echo "Media device should be present!"
+ls -l /dev/media*
+
+echo "Running bind of $ADEV from $ADRIVER"
+echo $ADEV > $ADRIVER/bind;
+
+echo "Media device should be there!"
+ls -l /dev/media*
+
+echo "Test bind both devices - end"
+
+echo "=================================="
+
+echo "Test unbind $MDEV - bind $MDEV - unbind $ADEV - bind $ADEV start"
+
+echo "Running unbind of $MDEV from $MDRIVER"
+echo $MDEV > $MDRIVER/unbind;
+
+echo "Media device should be there!"
+ls -l /dev/media*
+
+sleep 1
+
+echo "Running bind of $MDEV from $MDRIVER"
+echo $MDEV > $MDRIVER/bind;
+
+echo "Media device should be there!"
+ls -l /dev/media*
+
+echo "Running unbind of $ADEV from $ADRIVER"
+echo $ADEV > $ADRIVER/unbind;
+
+echo "Media device should be there!"
+ls -l /dev/media*
+
+sleep 1
+
+echo "Running bind of $ADEV from $ADRIVER"
+echo $ADEV > $ADRIVER/bind;
+
+echo "Media device should be there!"
+ls -l /dev/media*
+
+echo "Test unbind $MDEV - bind $MDEV - unbind $ADEV - bind $ADEV end"
+echo "=================================="
diff --git a/tools/testing/selftests/media_tests/media_device_open.c b/tools/testing/selftests/media_tests/media_device_open.c
new file mode 100644
index 000000000..93183a37b
--- /dev/null
+++ b/tools/testing/selftests/media_tests/media_device_open.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * media_device_open.c - Media Controller Device Open Test
+ *
+ * Copyright (c) 2016 Shuah Khan <shuahkh@osg.samsung.com>
+ * Copyright (c) 2016 Samsung Electronics Co., Ltd.
+ *
+ */
+
+/*
+ * This file adds a test for Media Controller API.
+ * This test should be run as root and should not be
+ * included in the Kselftest run. This test should be
+ * run when hardware and a driver that make use of the Media
+ * Controller API are present in the system.
+ *
+ * This test opens user specified Media Device and calls
+ * MEDIA_IOC_DEVICE_INFO ioctl, closes the file, and exits.
+ *
+ * Usage:
+ * sudo ./media_device_open -d /dev/mediaX
+ *
+ * Run this test in a loop and run bind/unbind on the driver.
+*/
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <linux/media.h>
+
+#include "../kselftest.h"
+
+int main(int argc, char **argv)
+{
+ int opt;
+ char media_device[256];
+ int count = 0;
+ struct media_device_info mdi;
+ int ret;
+ int fd;
+
+ if (argc < 2) {
+ printf("Usage: %s [-d </dev/mediaX>]\n", argv[0]);
+ exit(-1);
+ }
+
+ /* Process arguments */
+ while ((opt = getopt(argc, argv, "d:")) != -1) {
+ switch (opt) {
+ case 'd':
+ strncpy(media_device, optarg, sizeof(media_device) - 1);
+ media_device[sizeof(media_device)-1] = '\0';
+ break;
+ default:
+ printf("Usage: %s [-d </dev/mediaX>]\n", argv[0]);
+ exit(-1);
+ }
+ }
+
+ if (getuid() != 0)
+ ksft_exit_skip("Please run the test as root - Exiting.\n");
+
+ /* Open Media device and keep it open */
+ fd = open(media_device, O_RDWR);
+ if (fd == -1) {
+ printf("Media Device open errno %s\n", strerror(errno));
+ exit(-1);
+ }
+
+ ret = ioctl(fd, MEDIA_IOC_DEVICE_INFO, &mdi);
+ if (ret < 0)
+ printf("Media Device Info errno %s\n", strerror(errno));
+ else
+ printf("Media device model %s driver %s\n",
+ mdi.model, mdi.driver);
+}
diff --git a/tools/testing/selftests/media_tests/media_device_test.c b/tools/testing/selftests/media_tests/media_device_test.c
new file mode 100644
index 000000000..4b9953359
--- /dev/null
+++ b/tools/testing/selftests/media_tests/media_device_test.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * media_device_test.c - Media Controller Device ioctl loop Test
+ *
+ * Copyright (c) 2016 Shuah Khan <shuahkh@osg.samsung.com>
+ * Copyright (c) 2016 Samsung Electronics Co., Ltd.
+ *
+ */
+
+/*
+ * This file adds a test for Media Controller API.
+ * This test should be run as root and should not be
+ * included in the Kselftest run. This test should be
+ * run when hardware and a driver that make use of the Media
+ * Controller API are present in the system.
+ *
+ * This test opens user specified Media Device and calls
+ * MEDIA_IOC_DEVICE_INFO ioctl in a loop once every 10
+ * seconds.
+ *
+ * Usage:
+ * sudo ./media_device_test -d /dev/mediaX
+ *
+ * While test is running, remove the device and
+ * ensure there are no use after free errors and
+ * other Oops in the dmesg. Enable KaSan kernel
+ * config option for use-after-free error detection.
+*/
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <linux/media.h>
+
+#include "../kselftest.h"
+
+int main(int argc, char **argv)
+{
+ int opt;
+ char media_device[256];
+ int count;
+ struct media_device_info mdi;
+ int ret;
+ int fd;
+
+ if (argc < 2) {
+ printf("Usage: %s [-d </dev/mediaX>]\n", argv[0]);
+ exit(-1);
+ }
+
+ /* Process arguments */
+ while ((opt = getopt(argc, argv, "d:")) != -1) {
+ switch (opt) {
+ case 'd':
+ strncpy(media_device, optarg, sizeof(media_device) - 1);
+ media_device[sizeof(media_device)-1] = '\0';
+ break;
+ default:
+ printf("Usage: %s [-d </dev/mediaX>]\n", argv[0]);
+ exit(-1);
+ }
+ }
+
+ if (getuid() != 0)
+ ksft_exit_skip("Please run the test as root - Exiting.\n");
+
+ /* Generate random number of iterations */
+ srand((unsigned int) time(NULL));
+ count = rand();
+
+ /* Open Media device and keep it open */
+ fd = open(media_device, O_RDWR);
+ if (fd == -1) {
+ printf("Media Device open errno %s\n", strerror(errno));
+ exit(-1);
+ }
+
+ printf("\nNote:\n"
+ "While test is running, remove the device and\n"
+ "ensure there are no use after free errors and\n"
+ "other Oops in the dmesg. Enable KaSan kernel\n"
+ "config option for use-after-free error detection.\n\n");
+
+ printf("Running test for %d iterations\n", count);
+
+ while (count > 0) {
+ ret = ioctl(fd, MEDIA_IOC_DEVICE_INFO, &mdi);
+ if (ret < 0)
+ printf("Media Device Info errno %s\n", strerror(errno));
+ else
+ printf("Media device model %s driver %s - count %d\n",
+ mdi.model, mdi.driver, count);
+ sleep(10);
+ count--;
+ }
+}
diff --git a/tools/testing/selftests/media_tests/open_loop_test.sh b/tools/testing/selftests/media_tests/open_loop_test.sh
new file mode 100755
index 000000000..d4c0179bb
--- /dev/null
+++ b/tools/testing/selftests/media_tests/open_loop_test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+ i=0
+file=/dev/media$1
+ while :; do
+ echo $file
+ i=$((i+1))
+ R=$(./media_device_open -d $file);
+ # clear
+ echo -e "Loop $i\n$R"
+ done
diff --git a/tools/testing/selftests/media_tests/regression_test.txt b/tools/testing/selftests/media_tests/regression_test.txt
new file mode 100644
index 000000000..262736768
--- /dev/null
+++ b/tools/testing/selftests/media_tests/regression_test.txt
@@ -0,0 +1,43 @@
+Testing for regressions in Media Controller API register, ioctl, syscall,
+and unregister paths. There have been a few problems that result in
+use-after-free on media_device, media_devnode, and cdev pointers when the
+driver is unbound while an ioctl is in progress.
+
+Test Procedure:
+
+Run bind/unbind loop while ioctls are in progress.
+Run rmmod and modprobe.
+Disconnect the device.
+
+Setup:
+
+Build media_device_test
+cd tools/testing/selftests/media_tests
+make
+
+Regression test for cdev use-after-free error on /dev/mediaX when driver
+is unbound:
+
+Start media_device_test to regression test media devnode dynamic alloc
+and cdev use-after-free fixes. This opens media dev files and sits in
+a loop running media ioctl MEDIA_IOC_DEVICE_INFO command once every 10
+seconds. The idea is when device file goes away, media devnode and cdev
+should stick around until this test exits.
+
+The test runs for a random number of iterations, or until the user kills
+it, with a sleep of 10 seconds in between the ioctl calls.
+
+sudo ./media_device_test -d /dev/mediaX
+
+Regression test for media_devnode unregister race with ioctl_syscall:
+
+Start 6 open_loop_test.sh tests with different /dev/mediaX files. When
+device file goes away after unbind, device file name changes. Start the
+test with possible device names. If we start with /dev/media0 for example,
+after unbind, /dev/media1 or /dev/media2 could get created. The idea is
+keep ioctls going while bind/unbind runs.
+
+Copy bind_unbind_sample.sh to bind_unbind.sh and edit it to specify the
+driver name and device number to bind and unbind. Start bind_unbind.sh.
+
+Run dmesg looking for any use-after-free errors or mutex lock errors.
diff --git a/tools/testing/selftests/media_tests/video_device_test.c b/tools/testing/selftests/media_tests/video_device_test.c
new file mode 100644
index 000000000..0f6aef2e2
--- /dev/null
+++ b/tools/testing/selftests/media_tests/video_device_test.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * video_device_test - Video Device Test
+ *
+ * Copyright (c) 2016 Shuah Khan <shuahkh@osg.samsung.com>
+ * Copyright (c) 2016 Samsung Electronics Co., Ltd.
+ *
+ */
+
+/*
+ * This file adds a test for Video Device. This test should not be included
+ * in the Kselftest run. This test should be run when hardware and driver
+ * that makes use of V4L2 API is present.
+ *
+ * This test opens user specified Video Device and calls video ioctls in a
+ * loop once every 10 seconds.
+ *
+ * Usage:
+ * sudo ./video_device_test -d /dev/videoX
+ *
+ * While test is running, remove the device or unbind the driver and
+ * ensure there are no use after free errors and other Oops in the
+ * dmesg.
+ * When possible, enable KaSan kernel config option for use-after-free
+ * error detection.
+*/
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <linux/videodev2.h>
+
+int main(int argc, char **argv)
+{
+	int opt;
+	char video_dev[256] = "";	/* stays empty until -d is parsed */
+	int count;
+	struct v4l2_tuner vtuner = { 0 };	/* .index is an ioctl input */
+	struct v4l2_capability vcap;
+	int fd, ret;
+
+	/* Process arguments */
+	while ((opt = getopt(argc, argv, "d:")) != -1) {
+		switch (opt) {
+		case 'd':
+			snprintf(video_dev, sizeof(video_dev), "%s", optarg);
+			break;
+		default:
+			printf("Usage: %s [-d </dev/videoX>]\n", argv[0]);
+			exit(-1);
+		}
+	}
+
+	if (video_dev[0] == '\0') {	/* -d missing: nothing to open */
+		printf("Usage: %s [-d </dev/videoX>]\n", argv[0]);
+		exit(-1);
+	}
+
+	/* Generate random number of iterations */
+	srand((unsigned int) time(NULL));
+	count = rand();
+
+	/* Open Video device and keep it open */
+	fd = open(video_dev, O_RDWR);
+	if (fd == -1) {
+		printf("Video Device open errno %s\n", strerror(errno));
+		exit(-1);
+	}
+
+	printf("\nNote:\n"
+	       "While test is running, remove the device or unbind\n"
+	       "driver and ensure there are no use after free errors\n"
+	       "and other Oops in the dmesg. When possible, enable KaSan\n"
+	       "kernel config option for use-after-free error detection.\n\n");
+
+	while (count > 0) {
+		ret = ioctl(fd, VIDIOC_QUERYCAP, &vcap);
+		if (ret < 0)
+			printf("VIDIOC_QUERYCAP errno %s\n", strerror(errno));
+		else
+			printf("Video device driver %s\n", vcap.driver);
+
+		ret = ioctl(fd, VIDIOC_G_TUNER, &vtuner);
+		if (ret < 0)
+			printf("VIDIOC_G_TUNER, errno %s\n", strerror(errno));
+		else
+			printf("type %u rangelow %u rangehigh %u\n",
+				vtuner.type, vtuner.rangelow, vtuner.rangehigh);
+		sleep(10);
+		count--;
+	}
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore
new file mode 100644
index 000000000..f2fbba178
--- /dev/null
+++ b/tools/testing/selftests/membarrier/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+membarrier_test_multi_thread
+membarrier_test_single_thread
diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
new file mode 100644
index 000000000..34d1c81a2
--- /dev/null
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/
+LDLIBS += -lpthread
+
+TEST_GEN_PROGS := membarrier_test_single_thread \
+ membarrier_test_multi_thread
+
+include ../lib.mk
diff --git a/tools/testing/selftests/membarrier/membarrier_test_impl.h b/tools/testing/selftests/membarrier/membarrier_test_impl.h
new file mode 100644
index 000000000..186be69f0
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h
@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+
+static int sys_membarrier(int cmd, int flags)
+{
+ return syscall(__NR_membarrier, cmd, flags);
+}
+
+static int test_membarrier_cmd_fail(void)
+{
+ int cmd = -1, flags = 0;
+ const char *test_name = "sys membarrier invalid command";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: command = %d, flags = %d. Should fail, but passed\n",
+ test_name, cmd, flags);
+ }
+ if (errno != EINVAL) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EINVAL, strerror(EINVAL),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+ test_name, cmd, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_flags_fail(void)
+{
+ int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EINVAL) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EINVAL, strerror(EINVAL),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d, errno = %d. Failed as expected\n",
+ test_name, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_global_success(void)
+{
+ int cmd = MEMBARRIER_CMD_GLOBAL, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n", test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EPERM) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EPERM, strerror(EPERM),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_private_expedited_sync_core_fail(void)
+{
+ int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE not registered failure";
+
+ if (sys_membarrier(cmd, flags) != -1) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should fail, but passed\n",
+ test_name, flags);
+ }
+ if (errno != EPERM) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+ test_name, flags, EPERM, strerror(EPERM),
+ errno, strerror(errno));
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ return 0;
+}
+
+static int test_membarrier_register_private_expedited_sync_core_success(void)
+{
+ int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+/* Issue the SYNC_CORE private expedited barrier; exit the run if it errors. */
+static int test_membarrier_private_expedited_sync_core_success(void)
+{
+	/* Must be the _SYNC_CORE command, matching test_name below. */
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg("%s test: flags = %d, errno = %d\n",
+				   test_name, flags, errno);
+	}
+
+	ksft_test_result_pass("%s test: flags = %d\n",
+			      test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_register_global_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_global_expedited_success(void)
+{
+ int cmd = MEMBARRIER_CMD_GLOBAL_EXPEDITED, flags = 0;
+ const char *test_name = "sys membarrier MEMBARRIER_CMD_GLOBAL_EXPEDITED";
+
+ if (sys_membarrier(cmd, flags) != 0) {
+ ksft_exit_fail_msg(
+ "%s test: flags = %d, errno = %d\n",
+ test_name, flags, errno);
+ }
+
+ ksft_test_result_pass(
+ "%s test: flags = %d\n",
+ test_name, flags);
+ return 0;
+}
+
+static int test_membarrier_fail(void)
+{
+ int status;
+
+ status = test_membarrier_cmd_fail();
+ if (status)
+ return status;
+ status = test_membarrier_flags_fail();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_fail();
+ if (status)
+ return status;
+ status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+ if (status < 0) {
+ ksft_test_result_fail("sys_membarrier() failed\n");
+ return status;
+ }
+ if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+ status = test_membarrier_private_expedited_sync_core_fail();
+ if (status)
+ return status;
+ }
+ return 0;
+}
+
+static int test_membarrier_success(void)
+{
+ int status;
+
+ status = test_membarrier_global_success();
+ if (status)
+ return status;
+ status = test_membarrier_register_private_expedited_success();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_success();
+ if (status)
+ return status;
+ status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);
+ if (status < 0) {
+ ksft_test_result_fail("sys_membarrier() failed\n");
+ return status;
+ }
+ if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) {
+ status = test_membarrier_register_private_expedited_sync_core_success();
+ if (status)
+ return status;
+ status = test_membarrier_private_expedited_sync_core_success();
+ if (status)
+ return status;
+ }
+ /*
+ * It is valid to send a global membarrier from a non-registered
+ * process.
+ */
+ status = test_membarrier_global_expedited_success();
+ if (status)
+ return status;
+ status = test_membarrier_register_global_expedited_success();
+ if (status)
+ return status;
+ status = test_membarrier_global_expedited_success();
+ if (status)
+ return status;
+ return 0;
+}
+
+static int test_membarrier_query(void)
+{
+ int flags = 0, ret;
+
+ ret = sys_membarrier(MEMBARRIER_CMD_QUERY, flags);
+ if (ret < 0) {
+ if (errno == ENOSYS) {
+ /*
+ * It is valid to build a kernel with
+ * CONFIG_MEMBARRIER=n. However, this skips the tests.
+ */
+ ksft_exit_skip(
+ "sys membarrier (CONFIG_MEMBARRIER) is disabled.\n");
+ }
+ ksft_exit_fail_msg("sys_membarrier() failed\n");
+ }
+ if (!(ret & MEMBARRIER_CMD_GLOBAL))
+ ksft_exit_skip(
+ "sys_membarrier unsupported: CMD_GLOBAL not found.\n");
+
+ ksft_test_result_pass("sys_membarrier available\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
new file mode 100644
index 000000000..ac5613e5b
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+static int thread_ready, thread_quit;
+static pthread_mutex_t test_membarrier_thread_mutex =
+ PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t test_membarrier_thread_cond =
+ PTHREAD_COND_INITIALIZER;
+
+void *test_membarrier_thread(void *arg)
+{
+ pthread_mutex_lock(&test_membarrier_thread_mutex);
+ thread_ready = 1;
+ pthread_cond_broadcast(&test_membarrier_thread_cond);
+ pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+ pthread_mutex_lock(&test_membarrier_thread_mutex);
+ while (!thread_quit)
+ pthread_cond_wait(&test_membarrier_thread_cond,
+ &test_membarrier_thread_mutex);
+ pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+ return NULL;
+}
+
+/* Run the fail and success suites while a second, idle thread is alive. */
+static int test_mt_membarrier(void)
+{
+	pthread_t test_thread;
+
+	/* A failed create would otherwise block forever on thread_ready. */
+	if (pthread_create(&test_thread, NULL, test_membarrier_thread, NULL))
+		ksft_exit_fail_msg("pthread_create() failed\n");
+
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	while (!thread_ready)
+		pthread_cond_wait(&test_membarrier_thread_cond,
+				  &test_membarrier_thread_mutex);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+
+	test_membarrier_fail();
+	test_membarrier_success();
+
+	/* Release the helper thread and reap it. */
+	pthread_mutex_lock(&test_membarrier_thread_mutex);
+	thread_quit = 1;
+	pthread_cond_broadcast(&test_membarrier_thread_cond);
+	pthread_mutex_unlock(&test_membarrier_thread_mutex);
+	pthread_join(test_thread, NULL);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(13);
+
+ test_membarrier_query();
+
+ /* Multi-threaded */
+ test_mt_membarrier();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
new file mode 100644
index 000000000..c1c963902
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <linux/membarrier.h>
+#include <syscall.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "membarrier_test_impl.h"
+
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(13);
+
+ test_membarrier_query();
+
+ test_membarrier_fail();
+
+ test_membarrier_success();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/memfd/.gitignore b/tools/testing/selftests/memfd/.gitignore
new file mode 100644
index 000000000..dd9a051f6
--- /dev/null
+++ b/tools/testing/selftests/memfd/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+fuse_mnt
+fuse_test
+memfd_test
+memfd-test-file
diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile
new file mode 100644
index 000000000..4da8b565f
--- /dev/null
+++ b/tools/testing/selftests/memfd/Makefile
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -D_FILE_OFFSET_BITS=64
+CFLAGS += -I../../../../include/uapi/
+CFLAGS += -I../../../../include/
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := memfd_test
+TEST_PROGS := run_fuse_test.sh run_hugetlbfs_test.sh
+TEST_GEN_FILES := fuse_test fuse_mnt
+
+VAR_CFLAGS := $(shell pkg-config fuse --cflags 2>/dev/null)
+ifeq ($(VAR_CFLAGS),)
+VAR_CFLAGS := -D_FILE_OFFSET_BITS=64 -I/usr/include/fuse
+endif
+
+VAR_LDLIBS := $(shell pkg-config fuse --libs 2>/dev/null)
+ifeq ($(VAR_LDLIBS),)
+VAR_LDLIBS := -lfuse -pthread
+endif
+
+fuse_mnt.o: CFLAGS += $(VAR_CFLAGS)
+
+include ../lib.mk
+
+$(OUTPUT)/fuse_mnt: LDLIBS += $(VAR_LDLIBS)
+
+$(OUTPUT)/memfd_test: memfd_test.c common.c
+$(OUTPUT)/fuse_test: fuse_test.c common.c
+
+EXTRA_CLEAN = $(OUTPUT)/common.o
diff --git a/tools/testing/selftests/memfd/common.c b/tools/testing/selftests/memfd/common.c
new file mode 100644
index 000000000..8eb3d75f6
--- /dev/null
+++ b/tools/testing/selftests/memfd/common.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "common.h"
+
+int hugetlbfs_test = 0;
+
+/*
+ * Copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+ unsigned long hps = 0;
+ char *line = NULL;
+ size_t linelen = 0;
+ FILE *f = fopen("/proc/meminfo", "r");
+
+ if (!f)
+ return 0;
+ while (getline(&line, &linelen, f) > 0) {
+ if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+ hps <<= 10;
+ break;
+ }
+ }
+
+ free(line);
+ fclose(f);
+ return hps;
+}
+
+int sys_memfd_create(const char *name, unsigned int flags)
+{
+ if (hugetlbfs_test)
+ flags |= MFD_HUGETLB;
+
+ return syscall(__NR_memfd_create, name, flags);
+}
diff --git a/tools/testing/selftests/memfd/common.h b/tools/testing/selftests/memfd/common.h
new file mode 100644
index 000000000..522d2c630
--- /dev/null
+++ b/tools/testing/selftests/memfd/common.h
@@ -0,0 +1,9 @@
+#ifndef COMMON_H_
+#define COMMON_H_
+
+extern int hugetlbfs_test;
+
+unsigned long default_huge_page_size(void);
+int sys_memfd_create(const char *name, unsigned int flags);
+
+#endif
diff --git a/tools/testing/selftests/memfd/config b/tools/testing/selftests/memfd/config
new file mode 100644
index 000000000..835c7f4da
--- /dev/null
+++ b/tools/testing/selftests/memfd/config
@@ -0,0 +1 @@
+CONFIG_FUSE_FS=m
diff --git a/tools/testing/selftests/memfd/fuse_mnt.c b/tools/testing/selftests/memfd/fuse_mnt.c
new file mode 100644
index 000000000..6936f2a00
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_mnt.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * memfd test file-system
+ * This file uses FUSE to create a dummy file-system with only one file /memfd.
+ * This file is read-only and takes 1s per read.
+ *
+ * This file-system is used by the memfd test-cases to force the kernel to pin
+ * pages during reads(). Due to the 1s delay of this file-system, this is a
+ * nice way to test race-conditions against get_user_pages() in the kernel.
+ *
+ * We use direct_io==1 to force the kernel to use direct-IO for this
+ * file-system.
+ */
+
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+static const char memfd_content[] = "memfd-example-content";
+static const char memfd_path[] = "/memfd";
+
+static int memfd_getattr(const char *path, struct stat *st)
+{
+ memset(st, 0, sizeof(*st));
+
+ if (!strcmp(path, "/")) {
+ st->st_mode = S_IFDIR | 0755;
+ st->st_nlink = 2;
+ } else if (!strcmp(path, memfd_path)) {
+ st->st_mode = S_IFREG | 0444;
+ st->st_nlink = 1;
+ st->st_size = strlen(memfd_content);
+ } else {
+ return -ENOENT;
+ }
+
+ return 0;
+}
+
+static int memfd_readdir(const char *path,
+ void *buf,
+ fuse_fill_dir_t filler,
+ off_t offset,
+ struct fuse_file_info *fi)
+{
+ if (strcmp(path, "/"))
+ return -ENOENT;
+
+ filler(buf, ".", NULL, 0);
+ filler(buf, "..", NULL, 0);
+ filler(buf, memfd_path + 1, NULL, 0);
+
+ return 0;
+}
+
+/* FUSE open handler: only read-only opens of memfd_path are accepted. */
+static int memfd_open(const char *path, struct fuse_file_info *fi)
+{
+	if (strcmp(path, memfd_path))
+		return -ENOENT;
+
+	/* O_ACCMODE is the named mask for the access-mode bits. */
+	if ((fi->flags & O_ACCMODE) != O_RDONLY)
+		return -EACCES;
+
+	fi->direct_io = 1;	/* force direct-IO */
+	return 0;
+}
+
+static int memfd_read(const char *path,
+ char *buf,
+ size_t size,
+ off_t offset,
+ struct fuse_file_info *fi)
+{
+ size_t len;
+
+ if (strcmp(path, memfd_path) != 0)
+ return -ENOENT;
+
+ sleep(1);
+
+ len = strlen(memfd_content);
+ if (offset < len) {
+ if (offset + size > len)
+ size = len - offset;
+
+ memcpy(buf, memfd_content + offset, size);
+ } else {
+ size = 0;
+ }
+
+ return size;
+}
+
+static struct fuse_operations memfd_ops = {
+ .getattr = memfd_getattr,
+ .readdir = memfd_readdir,
+ .open = memfd_open,
+ .read = memfd_read,
+};
+
+int main(int argc, char *argv[])
+{
+ return fuse_main(argc, argv, &memfd_ops, NULL);
+}
diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c
new file mode 100644
index 000000000..cda63164d
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_test.c
@@ -0,0 +1,331 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * memfd GUP test-case
+ * This tests memfd interactions with get_user_pages(). We require the
+ * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This
+ * file-system delays _all_ reads by 1s and forces direct-IO. This means, any
+ * read() on files in that file-system will pin the receive-buffer pages for at
+ * least 1s via get_user_pages().
+ *
+ * We use this trick to race ADD_SEALS against a write on a memfd object. The
+ * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use
+ * the read() syscall with our memory-mapped memfd object as receive buffer to
+ * force the kernel to write into our memfd object.
+ */
+
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65536
+
+static size_t mfd_def_size = MFD_DEF_SIZE;
+
+static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+{
+ int r, fd;
+
+ fd = sys_memfd_create(name, flags);
+ if (fd < 0) {
+ printf("memfd_create(\"%s\", %u) failed: %m\n",
+ name, flags);
+ abort();
+ }
+
+ r = ftruncate(fd, sz);
+ if (r < 0) {
+ printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+ abort();
+ }
+
+ return fd;
+}
+
+static __u64 mfd_assert_get_seals(int fd)
+{
+ long r;
+
+ r = fcntl(fd, F_GET_SEALS);
+ if (r < 0) {
+ printf("GET_SEALS(%d) failed: %m\n", fd);
+ abort();
+ }
+
+ return r;
+}
+
+static void mfd_assert_has_seals(int fd, __u64 seals)
+{
+ __u64 s;
+
+ s = mfd_assert_get_seals(fd);
+ if (s != seals) {
+ printf("%llu != %llu = GET_SEALS(%d)\n",
+ (unsigned long long)seals, (unsigned long long)s, fd);
+ abort();
+ }
+}
+
+static void mfd_assert_add_seals(int fd, __u64 seals)
+{
+ long r;
+ __u64 s;
+
+ s = mfd_assert_get_seals(fd);
+ r = fcntl(fd, F_ADD_SEALS, seals);
+ if (r < 0) {
+ printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
+ fd, (unsigned long long)s, (unsigned long long)seals);
+ abort();
+ }
+}
+
+static int mfd_busy_add_seals(int fd, __u64 seals)
+{
+ long r;
+ __u64 s;
+
+ r = fcntl(fd, F_GET_SEALS);
+ if (r < 0)
+ s = 0;
+ else
+ s = r;
+
+ r = fcntl(fd, F_ADD_SEALS, seals);
+ if (r < 0 && errno != EBUSY) {
+ printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
+ fd, (unsigned long long)s, (unsigned long long)seals);
+ abort();
+ }
+
+ return r;
+}
+
+static void *mfd_assert_mmap_shared(int fd)
+{
+ void *p;
+
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ return p;
+}
+
+static void *mfd_assert_mmap_private(int fd)
+{
+ void *p;
+
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ return p;
+}
+
+static int global_mfd = -1;
+static void *global_p = NULL;
+
+static int sealing_thread_fn(void *arg)
+{
+ int sig, r;
+
+ /*
+ * This thread first waits 200ms so any pending operation in the parent
+ * is correctly started. After that, it tries to seal @global_mfd as
+ * SEAL_WRITE. This _must_ fail as the parent thread has a read() into
+ * that memory mapped object still ongoing.
+ * We then wait one more second and try sealing again. This time it
+ * must succeed as there shouldn't be anyone else pinning the pages.
+ */
+
+ /* wait 200ms for FUSE-request to be active */
+ usleep(200000);
+
+ /* unmap the mapping before sealing to avoid i_mmap_writable failures */
+ munmap(global_p, mfd_def_size);
+
+ /* Try sealing the global file; expect EBUSY or success. Current
+ * kernels will never succeed, but in the future, kernels might
+ * implement page-replacements or other fancy ways to avoid racing
+ * writes. */
+ r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE);
+ if (r >= 0) {
+ printf("HURRAY! This kernel fixed GUP races!\n");
+ } else {
+ /* wait 1s more so the FUSE-request is done */
+ sleep(1);
+
+ /* try sealing the global file again */
+ mfd_assert_add_seals(global_mfd, F_SEAL_WRITE);
+ }
+
+ return 0;
+}
+
+static pid_t spawn_sealing_thread(void)
+{
+ uint8_t *stack;
+ pid_t pid;
+
+ stack = malloc(STACK_SIZE);
+ if (!stack) {
+ printf("malloc(STACK_SIZE) failed: %m\n");
+ abort();
+ }
+
+ pid = clone(sealing_thread_fn,
+ stack + STACK_SIZE,
+ SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM,
+ NULL);
+ if (pid < 0) {
+ printf("clone() failed: %m\n");
+ abort();
+ }
+
+ return pid;
+}
+
+static void join_sealing_thread(pid_t pid)
+{
+ waitpid(pid, NULL, 0);
+}
+
+int main(int argc, char **argv)
+{
+ char *zero;
+ int fd, mfd, r;
+ void *p;
+ int was_sealed;
+ pid_t pid;
+
+ if (argc < 2) {
+ printf("error: please pass path to file in fuse_mnt mount-point\n");
+ abort();
+ }
+
+ if (argc >= 3) {
+ if (!strcmp(argv[2], "hugetlbfs")) {
+ unsigned long hpage_size = default_huge_page_size();
+
+ if (!hpage_size) {
+ printf("Unable to determine huge page size\n");
+ abort();
+ }
+
+ hugetlbfs_test = 1;
+ mfd_def_size = hpage_size * 2;
+ } else {
+ printf("Unknown option: %s\n", argv[2]);
+ abort();
+ }
+ }
+
+ zero = calloc(sizeof(*zero), mfd_def_size);
+
+ /* open FUSE memfd file for GUP testing */
+ printf("opening: %s\n", argv[1]);
+ fd = open(argv[1], O_RDONLY | O_CLOEXEC);
+ if (fd < 0) {
+ printf("cannot open(\"%s\"): %m\n", argv[1]);
+ abort();
+ }
+
+ /* create new memfd-object */
+ mfd = mfd_assert_new("kern_memfd_fuse",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ /* mmap memfd-object for writing */
+ p = mfd_assert_mmap_shared(mfd);
+
+ /* pass mfd+mapping to a separate sealing-thread which tries to seal
+ * the memfd objects with SEAL_WRITE while we write into it */
+ global_mfd = mfd;
+ global_p = p;
+ pid = spawn_sealing_thread();
+
+ /* Use read() on the FUSE file to read into our memory-mapped memfd
+ * object. This races the other thread which tries to seal the
+ * memfd-object.
+ * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s.
+ * This guarantees that the receive-buffer is pinned for 1s until the
+ * data is written into it. The racing ADD_SEALS should thus fail as
+ * the pages are still pinned. */
+ r = read(fd, p, mfd_def_size);
+ if (r < 0) {
+ printf("read() failed: %m\n");
+ abort();
+ } else if (!r) {
+ printf("unexpected EOF on read()\n");
+ abort();
+ }
+
+ was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE;
+
+ /* Wait for sealing-thread to finish and verify that it
+ * successfully sealed the file after the second try. */
+ join_sealing_thread(pid);
+ mfd_assert_has_seals(mfd, F_SEAL_WRITE);
+
+ /* *IF* the memfd-object was sealed at the time our read() returned,
+ * then the kernel did a page-replacement or canceled the read() (or
+ * whatever magic it did..). In that case, the memfd object is still
+ * all zero.
+ * In case the memfd-object was *not* sealed, the read() was successful
+ * and the memfd object must *not* be all zero.
+ * Note that in real scenarios, there might be a mixture of both, but
+ * in this test-cases, we have explicit 200ms delays which should be
+ * enough to avoid any in-flight writes. */
+
+ p = mfd_assert_mmap_private(mfd);
+ if (was_sealed && memcmp(p, zero, mfd_def_size)) {
+ printf("memfd sealed during read() but data not discarded\n");
+ abort();
+ } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) {
+ printf("memfd sealed after read() but data discarded\n");
+ abort();
+ }
+
+ close(mfd);
+ close(fd);
+
+ printf("fuse: DONE\n");
+ free(zero);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c
new file mode 100644
index 000000000..fba322d1c
--- /dev/null
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -0,0 +1,1080 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "common.h"
+
+#define MEMFD_STR "memfd:"
+#define MEMFD_HUGE_STR "memfd-hugetlb:"
+#define SHARED_FT_STR "(shared file-table)"
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65536
+
+/*
+ * Default is not to test hugetlbfs
+ */
+static size_t mfd_def_size = MFD_DEF_SIZE;
+static const char *memfd_str = MEMFD_STR;
+
+ /*
+ * Create a memfd called @name with @flags and resize it to @sz bytes.
+ * Aborts the test on any failure; returns the new file descriptor.
+ */
+ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+ {
+ int memfd = sys_memfd_create(name, flags);
+
+ if (memfd < 0) {
+ printf("memfd_create(\"%s\", %u) failed: %m\n",
+ name, flags);
+ abort();
+ }
+
+ if (ftruncate(memfd, sz) < 0) {
+ printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+ abort();
+ }
+
+ return memfd;
+ }
+
+ /*
+ * Re-open the file behind @fd_in read-write through /proc/self/fd/<fd>.
+ * Aborts the test if the open fails; returns the new file descriptor.
+ */
+ static int mfd_assert_reopen_fd(int fd_in)
+ {
+ int fd;
+ char path[100];
+
+ /* bounded formatting; the path always fits, but never overrun @path */
+ snprintf(path, sizeof(path), "/proc/self/fd/%d", fd_in);
+
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ /* include errno text like every other helper in this file */
+ printf("re-open of existing fd %d failed: %m\n", fd_in);
+ abort();
+ }
+
+ return fd;
+ }
+
+ /* Assert that memfd_create(@name, @flags) is rejected; abort if it succeeds. */
+ static void mfd_fail_new(const char *name, unsigned int flags)
+ {
+ int fd = sys_memfd_create(name, flags);
+
+ if (fd < 0)
+ return;
+
+ printf("memfd_create(\"%s\", %u) succeeded, but failure expected\n",
+ name, flags);
+ close(fd);
+ abort();
+ }
+
+ /* Return the seals currently set on @fd; abort if F_GET_SEALS fails. */
+ static unsigned int mfd_assert_get_seals(int fd)
+ {
+ int seals = fcntl(fd, F_GET_SEALS);
+
+ if (seals >= 0)
+ return (unsigned int)seals;
+
+ printf("GET_SEALS(%d) failed: %m\n", fd);
+ abort();
+ }
+
+ /* Abort unless the seals on @fd are exactly @seals (no more, no less). */
+ static void mfd_assert_has_seals(int fd, unsigned int seals)
+ {
+ unsigned int actual = mfd_assert_get_seals(fd);
+
+ if (actual == seals)
+ return;
+
+ printf("%u != %u = GET_SEALS(%d)\n", seals, actual, fd);
+ abort();
+ }
+
+ /*
+ * Add @seals to @fd and abort if that fails. The seals present beforehand
+ * are fetched only so the failure message can show them.
+ */
+ static void mfd_assert_add_seals(int fd, unsigned int seals)
+ {
+ unsigned int before = mfd_assert_get_seals(fd);
+
+ if (fcntl(fd, F_ADD_SEALS, seals) < 0) {
+ printf("ADD_SEALS(%d, %u -> %u) failed: %m\n", fd, before, seals);
+ abort();
+ }
+ }
+
+ /* Assert that adding @seals to @fd is refused; abort if it succeeds. */
+ static void mfd_fail_add_seals(int fd, unsigned int seals)
+ {
+ int r = fcntl(fd, F_GET_SEALS);
+ /* only used for the diagnostic; a failed query is reported as 0 */
+ unsigned int before = r < 0 ? 0 : (unsigned int)r;
+
+ if (fcntl(fd, F_ADD_SEALS, seals) >= 0) {
+ printf("ADD_SEALS(%d, %u -> %u) didn't fail as expected\n",
+ fd, before, seals);
+ abort();
+ }
+ }
+
+ /* Abort unless fstat() reports that @fd is exactly @size bytes long. */
+ static void mfd_assert_size(int fd, size_t size)
+ {
+ struct stat st;
+
+ if (fstat(fd, &st) < 0) {
+ printf("fstat(%d) failed: %m\n", fd);
+ abort();
+ }
+
+ if (st.st_size != size) {
+ printf("wrong file size %lld, but expected %lld\n",
+ (long long)st.st_size, (long long)size);
+ abort();
+ }
+ }
+
+ /* Duplicate @fd with dup(); abort on failure, else return the new fd. */
+ static int mfd_assert_dup(int fd)
+ {
+ int copy = dup(fd);
+
+ if (copy >= 0)
+ return copy;
+
+ printf("dup(%d) failed: %m\n", fd);
+ abort();
+ }
+
+ /*
+ * Map the first mfd_def_size bytes of @fd as a shared, read-write mapping.
+ * Aborts if the mapping cannot be established; returns its address.
+ */
+ static void *mfd_assert_mmap_shared(int fd)
+ {
+ void *map = mmap(NULL, mfd_def_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+
+ if (map != MAP_FAILED)
+ return map;
+
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ /*
+ * Map the first mfd_def_size bytes of @fd as a private, read-only mapping.
+ * Aborts if the mapping cannot be established; returns its address.
+ */
+ static void *mfd_assert_mmap_private(int fd)
+ {
+ void *map = mmap(NULL, mfd_def_size, PROT_READ,
+ MAP_PRIVATE, fd, 0);
+
+ if (map != MAP_FAILED)
+ return map;
+
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ /*
+ * Open /proc/self/fd/<fd> with the given @flags and @mode.
+ * Aborts on failure; returns the newly opened descriptor.
+ */
+ static int mfd_assert_open(int fd, int flags, mode_t mode)
+ {
+ char path[512];
+ int opened;
+
+ sprintf(path, "/proc/self/fd/%d", fd);
+
+ opened = open(path, flags, mode);
+ if (opened >= 0)
+ return opened;
+
+ printf("open(%s) failed: %m\n", path);
+ abort();
+ }
+
+ /* Assert that opening /proc/self/fd/<fd> with @flags/@mode is refused. */
+ static void mfd_fail_open(int fd, int flags, mode_t mode)
+ {
+ char path[512];
+
+ sprintf(path, "/proc/self/fd/%d", fd);
+
+ if (open(path, flags, mode) < 0)
+ return;
+
+ printf("open(%s) didn't fail as expected\n", path);
+ abort();
+ }
+
+ /*
+ * Assert that @fd is readable: a 16-byte read() must succeed in full, and
+ * both read-only and writable MAP_PRIVATE mappings must be allowed.
+ */
+ static void mfd_assert_read(int fd)
+ {
+ char buf[16];
+ ssize_t n = read(fd, buf, sizeof(buf));
+ void *map;
+
+ if (n != sizeof(buf)) {
+ printf("read() failed: %m\n");
+ abort();
+ }
+
+ /* verify PROT_READ *is* allowed */
+ map = mmap(NULL, mfd_def_size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (map == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+ munmap(map, mfd_def_size);
+
+ /* verify MAP_PRIVATE is *always* allowed (even writable) */
+ map = mmap(NULL, mfd_def_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE, fd, 0);
+ if (map == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+ munmap(map, mfd_def_size);
+ }
+
+ /* Assert that a read-only MAP_SHARED mapping of @fd can be created. */
+ static void mfd_assert_read_shared(int fd)
+ {
+ /* verify PROT_READ and MAP_SHARED *is* allowed */
+ void *map = mmap(NULL, mfd_def_size, PROT_READ, MAP_SHARED, fd, 0);
+
+ if (map == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ munmap(map, mfd_def_size);
+ }
+
+ /*
+ * Assert that a writable MAP_PRIVATE mapping of @fd is not shared with a
+ * forked child: the parent stores 22, the child overwrites its view with 33,
+ * and the parent must still read 22 afterwards. Aborts on any failure.
+ */
+ static void mfd_assert_fork_private_write(int fd)
+ {
+ int *p;
+ pid_t pid;
+
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ p[0] = 22;
+
+ pid = fork();
+ if (pid < 0) {
+ /* without a child the check below would pass vacuously */
+ printf("fork() failed: %m\n");
+ abort();
+ } else if (pid == 0) {
+ p[0] = 33;
+ exit(0);
+ } else {
+ waitpid(pid, NULL, 0);
+
+ if (p[0] != 22) {
+ printf("MAP_PRIVATE copy-on-write failed: %m\n");
+ abort();
+ }
+ }
+
+ munmap(p, mfd_def_size);
+ }
+
+ /*
+ * Assert that @fd is fully writable: write() (skipped for hugetlbfs),
+ * shared writable mappings, mprotect() from read-only to read-write on a
+ * shared mapping, and hole punching must all succeed. Aborts otherwise.
+ */
+ static void mfd_assert_write(int fd)
+ {
+ ssize_t l;
+ void *p;
+ int r;
+
+ /*
+ * hugetlbfs does not support write, but we want to
+ * verify everything else here.
+ */
+ if (!hugetlbfs_test) {
+ /* verify write() succeeds */
+ l = write(fd, "\0\0\0\0", 4);
+ if (l != 4) {
+ printf("write() failed: %m\n");
+ abort();
+ }
+ }
+
+ /* verify PROT_READ | PROT_WRITE is allowed */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+ /* actually store through the mapping, not just create it */
+ *(char *)p = 0;
+ munmap(p, mfd_def_size);
+
+ /* verify PROT_WRITE is allowed */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_WRITE,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+ *(char *)p = 0;
+ munmap(p, mfd_def_size);
+
+ /* verify PROT_READ with MAP_SHARED is allowed and a following
+ * mprotect(PROT_WRITE) allows writing */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p == MAP_FAILED) {
+ printf("mmap() failed: %m\n");
+ abort();
+ }
+
+ r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
+ if (r < 0) {
+ printf("mprotect() failed: %m\n");
+ abort();
+ }
+
+ *(char *)p = 0;
+ munmap(p, mfd_def_size);
+
+ /* verify PUNCH_HOLE works */
+ r = fallocate(fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ 0,
+ mfd_def_size);
+ if (r < 0) {
+ printf("fallocate(PUNCH_HOLE) failed: %m\n");
+ abort();
+ }
+ }
+
+ /*
+ * Assert that @fd cannot be modified: write(), shared writable mappings,
+ * mprotect() to writable on a shared read-only mapping, and hole punching
+ * must all be refused. Aborts if any of them succeeds.
+ */
+ static void mfd_fail_write(int fd)
+ {
+ ssize_t l;
+ void *p;
+ int r;
+
+ /*
+ * verify write() fails
+ *
+ * write() reports errors as -1 with the reason in errno; comparing the
+ * return value against -EPERM only worked because EPERM happens to be 1
+ * and therefore accepted any failure at all.
+ * NOTE(review): hugetlbfs does not support write() (see
+ * mfd_assert_write()), so it presumably fails with a different errno
+ * there; only the failure itself is required in that mode - confirm.
+ */
+ errno = 0;
+ l = write(fd, "data", 4);
+ if (l != -1 || (!hugetlbfs_test && errno != EPERM)) {
+ printf("expected EPERM on write(), but got %d: %m\n", (int)l);
+ abort();
+ }
+
+ /* verify PROT_READ | PROT_WRITE is not allowed */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p != MAP_FAILED) {
+ printf("mmap() didn't fail as expected\n");
+ abort();
+ }
+
+ /* verify PROT_WRITE is not allowed */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_WRITE,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p != MAP_FAILED) {
+ printf("mmap() didn't fail as expected\n");
+ abort();
+ }
+
+ /* Verify PROT_READ with MAP_SHARED with a following mprotect is not
+ * allowed. Note that for r/w the kernel already prevents the mmap. */
+ p = mmap(NULL,
+ mfd_def_size,
+ PROT_READ,
+ MAP_SHARED,
+ fd,
+ 0);
+ if (p != MAP_FAILED) {
+ r = mprotect(p, mfd_def_size, PROT_READ | PROT_WRITE);
+ if (r >= 0) {
+ printf("mmap()+mprotect() didn't fail as expected\n");
+ abort();
+ }
+ munmap(p, mfd_def_size);
+ }
+
+ /* verify PUNCH_HOLE fails */
+ r = fallocate(fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ 0,
+ mfd_def_size);
+ if (r >= 0) {
+ printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
+ abort();
+ }
+ }
+
+ /*
+ * Assert that @fd can be shrunk: first to half of mfd_def_size with
+ * ftruncate(), then to zero by re-opening it through /proc with O_TRUNC.
+ * The resulting size is verified after each step.
+ */
+ static void mfd_assert_shrink(int fd)
+ {
+ int r, fd2;
+
+ r = ftruncate(fd, mfd_def_size / 2);
+ if (r < 0) {
+ printf("ftruncate(SHRINK) failed: %m\n");
+ abort();
+ }
+
+ mfd_assert_size(fd, mfd_def_size / 2);
+
+ /* O_TRUNC on a fresh open is a second way to shrink the file */
+ fd2 = mfd_assert_open(fd,
+ O_RDWR | O_CREAT | O_TRUNC,
+ S_IRUSR | S_IWUSR);
+ close(fd2);
+
+ mfd_assert_size(fd, 0);
+ }
+
+ /*
+ * Assert that @fd cannot be shrunk, neither with ftruncate() nor by
+ * re-opening it through /proc with O_TRUNC.
+ */
+ static void mfd_fail_shrink(int fd)
+ {
+ if (ftruncate(fd, mfd_def_size / 2) >= 0) {
+ printf("ftruncate(SHRINK) didn't fail as expected\n");
+ abort();
+ }
+
+ mfd_fail_open(fd, O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
+ }
+
+ /*
+ * Assert that @fd can grow: to twice mfd_def_size with ftruncate(), then
+ * to four times mfd_def_size with fallocate(). The size is verified after
+ * each step.
+ */
+ static void mfd_assert_grow(int fd)
+ {
+ int r;
+
+ r = ftruncate(fd, mfd_def_size * 2);
+ if (r < 0) {
+ printf("ftruncate(GROW) failed: %m\n");
+ abort();
+ }
+
+ mfd_assert_size(fd, mfd_def_size * 2);
+
+ /* mode 0: plain allocation of [0, 4 * mfd_def_size) */
+ r = fallocate(fd,
+ 0,
+ 0,
+ mfd_def_size * 4);
+ if (r < 0) {
+ printf("fallocate(ALLOC) failed: %m\n");
+ abort();
+ }
+
+ mfd_assert_size(fd, mfd_def_size * 4);
+ }
+
+ /*
+ * Assert that @fd cannot grow, neither via ftruncate() nor via
+ * fallocate(). Aborts if either call succeeds.
+ */
+ static void mfd_fail_grow(int fd)
+ {
+ if (ftruncate(fd, mfd_def_size * 2) >= 0) {
+ printf("ftruncate(GROW) didn't fail as expected\n");
+ abort();
+ }
+
+ if (fallocate(fd, 0, 0, mfd_def_size * 4) >= 0) {
+ printf("fallocate(ALLOC) didn't fail as expected\n");
+ abort();
+ }
+ }
+
+ /*
+ * Assert that @fd can be grown by writing past its end: pwrite() of
+ * 8 * mfd_def_size bytes at offset 0 must succeed in full and the file must
+ * have that size afterwards. Does nothing in hugetlbfs mode.
+ */
+ static void mfd_assert_grow_write(int fd)
+ {
+ char *buf;
+ ssize_t l;
+
+ /* hugetlbfs does not support write */
+ if (hugetlbfs_test)
+ return;
+
+ /* the buffer contents are irrelevant, only the length matters */
+ buf = malloc(mfd_def_size * 8);
+ if (!buf) {
+ printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
+ abort();
+ }
+
+ l = pwrite(fd, buf, mfd_def_size * 8, 0);
+ if (l != (mfd_def_size * 8)) {
+ printf("pwrite() failed: %m\n");
+ abort();
+ }
+
+ /* release the scratch buffer; it used to be leaked on every call */
+ free(buf);
+
+ mfd_assert_size(fd, mfd_def_size * 8);
+ }
+
+ /*
+ * Assert that @fd cannot be grown by writing: a pwrite() of
+ * 8 * mfd_def_size bytes at offset 0 must not complete in full.
+ * Does nothing in hugetlbfs mode.
+ */
+ static void mfd_fail_grow_write(int fd)
+ {
+ char *buf;
+ ssize_t l;
+
+ /* hugetlbfs does not support write */
+ if (hugetlbfs_test)
+ return;
+
+ /* the buffer contents are irrelevant, only the length matters */
+ buf = malloc(mfd_def_size * 8);
+ if (!buf) {
+ printf("malloc(%zu) failed: %m\n", mfd_def_size * 8);
+ abort();
+ }
+
+ l = pwrite(fd, buf, mfd_def_size * 8, 0);
+ if (l == (mfd_def_size * 8)) {
+ printf("pwrite() didn't fail as expected\n");
+ abort();
+ }
+
+ /* release the scratch buffer; it used to be leaked on every call */
+ free(buf);
+ }
+
+ /*
+ * Entry point handed to clone() by spawn_idle_thread(): waits for SIGTERM
+ * with sigwait() and then returns 0. @arg is unused.
+ */
+ static int idle_thread_fn(void *arg)
+ {
+ sigset_t set;
+ int sig;
+
+ /* dummy waiter; SIGTERM terminates us anyway */
+ sigemptyset(&set);
+ sigaddset(&set, SIGTERM);
+ sigwait(&set, &sig);
+
+ return 0;
+ }
+
+ /*
+ * Start idle_thread_fn() in a new task created with clone(). @flags are
+ * OR-ed with SIGCHLD, so the caller selects what is shared (main() passes
+ * CLONE_FILES | CLONE_FS | CLONE_VM; test_share_fork() passes 0).
+ * Aborts on failure; returns the id of the new task, to be handed to
+ * join_idle_thread().
+ */
+ static pid_t spawn_idle_thread(unsigned int flags)
+ {
+ uint8_t *stack;
+ pid_t pid;
+
+ /* NOTE: this buffer is not freed anywhere in this function */
+ stack = malloc(STACK_SIZE);
+ if (!stack) {
+ printf("malloc(STACK_SIZE) failed: %m\n");
+ abort();
+ }
+
+ /* the end of the buffer is passed as the child's stack argument */
+ pid = clone(idle_thread_fn,
+ stack + STACK_SIZE,
+ SIGCHLD | flags,
+ NULL);
+ if (pid < 0) {
+ printf("clone() failed: %m\n");
+ abort();
+ }
+
+ return pid;
+ }
+
+ /*
+ * Stop a task started by spawn_idle_thread(): send it SIGTERM and reap it
+ * with waitpid(). The exit status is not inspected.
+ */
+ static void join_idle_thread(pid_t pid)
+ {
+ kill(pid, SIGTERM);
+ waitpid(pid, NULL, 0);
+ }
+
+/*
+ * Test memfd_create() syscall
+ * Verify syscall-argument validation, including name checks, flag validation
+ * and more.
+ */
+static void test_create(void)
+{
+ char buf[2048];
+ int fd;
+
+ printf("%s CREATE\n", memfd_str);
+
+ /* test NULL name */
+ mfd_fail_new(NULL, 0);
+
+ /* test over-long name (not zero-terminated) */
+ memset(buf, 0xff, sizeof(buf));
+ mfd_fail_new(buf, 0);
+
+ /* test over-long zero-terminated name */
+ memset(buf, 0xff, sizeof(buf));
+ buf[sizeof(buf) - 1] = 0;
+ mfd_fail_new(buf, 0);
+
+ /* verify "" is a valid name */
+ fd = mfd_assert_new("", 0, 0);
+ close(fd);
+
+ /* verify invalid O_* open flags */
+ mfd_fail_new("", 0x0100);
+ mfd_fail_new("", ~MFD_CLOEXEC);
+ mfd_fail_new("", ~MFD_ALLOW_SEALING);
+ mfd_fail_new("", ~0);
+ mfd_fail_new("", 0x80000000U);
+
+ /* verify MFD_CLOEXEC is allowed */
+ fd = mfd_assert_new("", 0, MFD_CLOEXEC);
+ close(fd);
+
+ /* verify MFD_ALLOW_SEALING is allowed */
+ fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
+ close(fd);
+
+ /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
+ fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+ close(fd);
+}
+
+/*
+ * Test basic sealing
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_basic(void)
+{
+ int fd;
+
+ printf("%s BASIC\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_basic",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ /* add basic seals */
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_WRITE);
+
+ /* add them again */
+ mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_WRITE);
+
+ /* add more seals and seal against sealing */
+ mfd_assert_add_seals(fd, F_SEAL_GROW | F_SEAL_SEAL);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_GROW |
+ F_SEAL_WRITE |
+ F_SEAL_SEAL);
+
+ /* verify that sealing no longer works */
+ mfd_fail_add_seals(fd, F_SEAL_GROW);
+ mfd_fail_add_seals(fd, 0);
+
+ close(fd);
+
+ /* verify sealing does not work without MFD_ALLOW_SEALING */
+ fd = mfd_assert_new("kern_memfd_basic",
+ mfd_def_size,
+ MFD_CLOEXEC);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+ mfd_fail_add_seals(fd, F_SEAL_SHRINK |
+ F_SEAL_GROW |
+ F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+ close(fd);
+}
+
+/*
+ * Test SEAL_WRITE
+ * Test whether SEAL_WRITE actually prevents modifications.
+ */
+static void test_seal_write(void)
+{
+ int fd;
+
+ printf("%s SEAL-WRITE\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_write",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE);
+
+ mfd_assert_read(fd);
+ mfd_fail_write(fd);
+ mfd_assert_shrink(fd);
+ mfd_assert_grow(fd);
+ mfd_fail_grow_write(fd);
+
+ close(fd);
+}
+
+/*
+ * Test SEAL_FUTURE_WRITE
+ * Test whether SEAL_FUTURE_WRITE actually prevents modifications.
+ */
+static void test_seal_future_write(void)
+{
+ int fd, fd2;
+ void *p;
+
+ printf("%s SEAL-FUTURE-WRITE\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_future_write",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+ p = mfd_assert_mmap_shared(fd);
+
+ mfd_assert_has_seals(fd, 0);
+
+ mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE);
+
+ /* read should pass, writes should fail */
+ mfd_assert_read(fd);
+ mfd_assert_read_shared(fd);
+ mfd_fail_write(fd);
+
+ fd2 = mfd_assert_reopen_fd(fd);
+ /* read should pass, writes should still fail */
+ mfd_assert_read(fd2);
+ mfd_assert_read_shared(fd2);
+ mfd_fail_write(fd2);
+
+ mfd_assert_fork_private_write(fd);
+
+ munmap(p, mfd_def_size);
+ close(fd2);
+ close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK
+ * Test whether SEAL_SHRINK actually prevents shrinking
+ */
+static void test_seal_shrink(void)
+{
+ int fd;
+
+ printf("%s SEAL-SHRINK\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_shrink",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+
+ mfd_assert_read(fd);
+ mfd_assert_write(fd);
+ mfd_fail_shrink(fd);
+ mfd_assert_grow(fd);
+ mfd_assert_grow_write(fd);
+
+ close(fd);
+}
+
+/*
+ * Test SEAL_GROW
+ * Test whether SEAL_GROW actually prevents growing
+ */
+static void test_seal_grow(void)
+{
+ int fd;
+
+ printf("%s SEAL-GROW\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_grow",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_GROW);
+ mfd_assert_has_seals(fd, F_SEAL_GROW);
+
+ mfd_assert_read(fd);
+ mfd_assert_write(fd);
+ mfd_assert_shrink(fd);
+ mfd_fail_grow(fd);
+ mfd_fail_grow_write(fd);
+
+ close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK | SEAL_GROW
+ * Test whether SEAL_SHRINK | SEAL_GROW actually prevents resizing
+ */
+static void test_seal_resize(void)
+{
+ int fd;
+
+ printf("%s SEAL-RESIZE\n", memfd_str);
+
+ fd = mfd_assert_new("kern_memfd_seal_resize",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+
+ mfd_assert_read(fd);
+ mfd_assert_write(fd);
+ mfd_fail_shrink(fd);
+ mfd_fail_grow(fd);
+ mfd_fail_grow_write(fd);
+
+ close(fd);
+}
+
+/*
+ * Test sharing via dup()
+ * Test that seals are shared between dupped FDs and they're all equal.
+ */
+static void test_share_dup(char *banner, char *b_suffix)
+{
+ int fd, fd2;
+
+ printf("%s %s %s\n", memfd_str, banner, b_suffix);
+
+ fd = mfd_assert_new("kern_memfd_share_dup",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+
+ fd2 = mfd_assert_dup(fd);
+ mfd_assert_has_seals(fd2, 0);
+
+ mfd_assert_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+ mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+ mfd_assert_add_seals(fd, F_SEAL_SEAL);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+ mfd_fail_add_seals(fd, F_SEAL_GROW);
+ mfd_fail_add_seals(fd2, F_SEAL_GROW);
+ mfd_fail_add_seals(fd, F_SEAL_SEAL);
+ mfd_fail_add_seals(fd2, F_SEAL_SEAL);
+
+ close(fd2);
+
+ mfd_fail_add_seals(fd, F_SEAL_GROW);
+ close(fd);
+}
+
+/*
+ * Test sealing with active mmap()s
+ * Modifying seals is only allowed if no other mmap() refs exist.
+ */
+static void test_share_mmap(char *banner, char *b_suffix)
+{
+ int fd;
+ void *p;
+
+ printf("%s %s %s\n", memfd_str, banner, b_suffix);
+
+ fd = mfd_assert_new("kern_memfd_share_mmap",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+
+ /* shared/writable ref prevents sealing WRITE, but allows others */
+ p = mfd_assert_mmap_shared(fd);
+ mfd_fail_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, 0);
+ mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+ munmap(p, mfd_def_size);
+
+ /* readable ref allows sealing */
+ p = mfd_assert_mmap_private(fd);
+ mfd_assert_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+ munmap(p, mfd_def_size);
+
+ close(fd);
+}
+
+/*
+ * Test sealing with open(/proc/self/fd/%d)
+ * Via /proc we can get access to a separate file-context for the same memfd.
+ * This is *not* like dup(), but like a real separate open(). Make sure the
+ * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
+ */
+static void test_share_open(char *banner, char *b_suffix)
+{
+ int fd, fd2;
+
+ printf("%s %s %s\n", memfd_str, banner, b_suffix);
+
+ fd = mfd_assert_new("kern_memfd_share_open",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+
+ fd2 = mfd_assert_open(fd, O_RDWR, 0);
+ mfd_assert_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+ mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+ close(fd);
+ fd = mfd_assert_open(fd2, O_RDONLY, 0);
+
+ mfd_fail_add_seals(fd, F_SEAL_SEAL);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+ close(fd2);
+ fd2 = mfd_assert_open(fd, O_RDWR, 0);
+
+ mfd_assert_add_seals(fd2, F_SEAL_SEAL);
+ mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+ mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+ close(fd2);
+ close(fd);
+}
+
+/*
+ * Test sharing via fork()
+ * Test whether seal-modifications work as expected with forked childs.
+ */
+static void test_share_fork(char *banner, char *b_suffix)
+{
+ int fd;
+ pid_t pid;
+
+ printf("%s %s %s\n", memfd_str, banner, b_suffix);
+
+ fd = mfd_assert_new("kern_memfd_share_fork",
+ mfd_def_size,
+ MFD_CLOEXEC | MFD_ALLOW_SEALING);
+ mfd_assert_has_seals(fd, 0);
+
+ pid = spawn_idle_thread(0);
+ mfd_assert_add_seals(fd, F_SEAL_SEAL);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+ mfd_fail_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+ join_idle_thread(pid);
+
+ mfd_fail_add_seals(fd, F_SEAL_WRITE);
+ mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+ close(fd);
+}
+
+ /*
+ * Run all memfd tests. With the single argument "hugetlbfs" the tests run
+ * in hugetlbfs mode: hugetlbfs_test is set, the banner prefix changes and
+ * mfd_def_size becomes two default huge pages. Any other single argument
+ * aborts. Returns 0 when every test passed (failures abort the process).
+ */
+ int main(int argc, char **argv)
+ {
+ pid_t pid;
+
+ if (argc == 2) {
+ if (!strcmp(argv[1], "hugetlbfs")) {
+ unsigned long hpage_size = default_huge_page_size();
+
+ if (!hpage_size) {
+ printf("Unable to determine huge page size\n");
+ abort();
+ }
+
+ hugetlbfs_test = 1;
+ memfd_str = MEMFD_HUGE_STR;
+ mfd_def_size = hpage_size * 2;
+ } else {
+ printf("Unknown option: %s\n", argv[1]);
+ abort();
+ }
+ }
+
+ test_create();
+ test_basic();
+
+ test_seal_write();
+ test_seal_future_write();
+ test_seal_shrink();
+ test_seal_grow();
+ test_seal_resize();
+
+ test_share_dup("SHARE-DUP", "");
+ test_share_mmap("SHARE-MMAP", "");
+ test_share_open("SHARE-OPEN", "");
+ test_share_fork("SHARE-FORK", "");
+
+ /* Run test-suite in a multi-threaded environment with a shared
+ * file-table. */
+ pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
+ test_share_dup("SHARE-DUP", SHARED_FT_STR);
+ test_share_mmap("SHARE-MMAP", SHARED_FT_STR);
+ test_share_open("SHARE-OPEN", SHARED_FT_STR);
+ test_share_fork("SHARE-FORK", SHARED_FT_STR);
+ join_idle_thread(pid);
+
+ printf("memfd: DONE\n");
+
+ return 0;
+ }
diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh
new file mode 100755
index 000000000..22e572e2d
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_fuse_test.sh
@@ -0,0 +1,15 @@
+ #!/bin/sh
+ # SPDX-License-Identifier: GPL-2.0
+ #
+ # Mount the test FUSE filesystem on ./mnt, run fuse_test against the file
+ # ./mnt/memfd with all script arguments forwarded unchanged, then unmount
+ # and remove the mount point again.
+
+ # Remove a mount point left behind by an earlier run (errors are ignored
+ # here on purpose: "set -e" is only enabled afterwards).
+ if test -d "./mnt" ; then
+ fusermount -u ./mnt
+ rmdir ./mnt
+ fi
+
+ set -e
+
+ mkdir mnt
+ ./fuse_mnt ./mnt
+ # "$@" keeps every argument a separate, unmodified word
+ ./fuse_test ./mnt/memfd "$@"
+ fusermount -u ./mnt
+ rmdir ./mnt
diff --git a/tools/testing/selftests/memfd/run_hugetlbfs_test.sh b/tools/testing/selftests/memfd/run_hugetlbfs_test.sh
new file mode 100755
index 000000000..fb633eeb0
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_hugetlbfs_test.sh
@@ -0,0 +1,68 @@
+ #!/bin/bash
+ # please run as root
+ #
+ # Run the memfd tests in hugetlbfs mode. Makes sure hpages_test huge pages
+ # are free (raising nr_hugepages if needed and restoring it afterwards).
+ # Exits with ksft_skip when that is not possible, otherwise with the status
+ # of the last failing test program (0 when both passed).
+
+ # Kselftest framework requirement - SKIP code is 4.
+ ksft_skip=4
+
+ #
+ # To test memfd_create with hugetlbfs, there needs to be hpages_test
+ # huge pages free. Attempt to allocate enough pages to test.
+ #
+ hpages_test=8
+
+ #
+ # Get count of free huge pages from /proc/meminfo
+ #
+ while read -r name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ done < /proc/meminfo
+
+ #
+ # No HugePages_Free line at all: the numeric comparisons below would be
+ # malformed, so skip instead of running into them.
+ #
+ if [ -z "$freepgs" ]; then
+ echo "Unable to determine the number of free huge pages"
+ exit $ksft_skip
+ fi
+
+ #
+ # If not enough free huge pages for test, attempt to increase
+ #
+ if [ "$freepgs" -lt "$hpages_test" ]; then
+ nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
+ hpages_needed=$(( hpages_test - freepgs ))
+
+ if [ "$UID" != 0 ]; then
+ echo "Please run memfd with hugetlbfs test as root"
+ exit $ksft_skip
+ fi
+
+ echo 3 > /proc/sys/vm/drop_caches
+ echo $(( hpages_needed + nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+ while read -r name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ done < /proc/meminfo
+ fi
+
+ #
+ # If still not enough huge pages available, exit. But, give back any huge
+ # pages potentially allocated above.
+ #
+ if [ "$freepgs" -lt "$hpages_test" ]; then
+ # nr_hugepgs is set only if we attempted to increase
+ if [ -n "$nr_hugepgs" ]; then
+ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+ fi
+ # report the required count (hpages_test); the previously used
+ # variable was never assigned
+ printf "Not enough huge pages available (%d < %d)\n" \
+ "$freepgs" "$hpages_test"
+ exit $ksft_skip
+ fi
+
+ #
+ # Run the hugetlbfs test; remember a failure but still run the second
+ # program and restore nr_hugepages afterwards.
+ #
+ retval=0
+ ./memfd_test hugetlbfs || retval=$?
+ ./run_fuse_test.sh hugetlbfs || retval=$?
+
+ #
+ # Give back any huge pages allocated for the test
+ #
+ if [ -n "$nr_hugepgs" ]; then
+ echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
+ fi
+
+ exit $retval
diff --git a/tools/testing/selftests/memory-hotplug/Makefile b/tools/testing/selftests/memory-hotplug/Makefile
new file mode 100644
index 000000000..e0a625e34
--- /dev/null
+++ b/tools/testing/selftests/memory-hotplug/Makefile
@@ -0,0 +1,11 @@
+ # SPDX-License-Identifier: GPL-2.0
+ # Declared before the include so that "all" stays the default goal.
+ all:
+
+ include ../lib.mk
+
+ # Test script of this directory (presumably consumed by ../lib.mk - confirm).
+ TEST_PROGS := mem-on-off-test.sh
+
+ # Run the test with "-r 10" and print a PASS/FAIL line based on its exit status.
+ run_full_test:
+ @/bin/bash ./mem-on-off-test.sh -r 10 && echo "memory-hotplug selftests: [PASS]" || echo "memory-hotplug selftests: [FAIL]"
+
+ clean:
diff --git a/tools/testing/selftests/memory-hotplug/config b/tools/testing/selftests/memory-hotplug/config
new file mode 100644
index 000000000..a7e8cd5bb
--- /dev/null
+++ b/tools/testing/selftests/memory-hotplug/config
@@ -0,0 +1,5 @@
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTPLUG_SPARSE=y
+CONFIG_NOTIFIER_ERROR_INJECTION=y
+CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
+CONFIG_MEMORY_HOTREMOVE=y
diff --git a/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh
new file mode 100755
index 000000000..b37585e6a
--- /dev/null
+++ b/tools/testing/selftests/memory-hotplug/mem-on-off-test.sh
@@ -0,0 +1,291 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+SYSFS=
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+prerequisite()
+{
+ msg="skip all tests:"
+
+ if [ $UID != 0 ]; then
+ echo $msg must be run as root >&2
+ exit $ksft_skip
+ fi
+
+ SYSFS=`mount -t sysfs | head -1 | awk '{ print $3 }'`
+
+ if [ ! -d "$SYSFS" ]; then
+ echo $msg sysfs is not mounted >&2
+ exit $ksft_skip
+ fi
+
+ if ! ls $SYSFS/devices/system/memory/memory* > /dev/null 2>&1; then
+ echo $msg memory hotplug is not supported >&2
+ exit $ksft_skip
+ fi
+
+ if ! grep -q 1 $SYSFS/devices/system/memory/memory*/removable; then
+ echo $msg no hot-pluggable memory >&2
+ exit $ksft_skip
+ fi
+}
+
+#
+# list all hot-pluggable memory
+#
+hotpluggable_memory()
+{
+ local state=${1:-.\*}
+
+ for memory in $SYSFS/devices/system/memory/memory*; do
+ if grep -q 1 $memory/removable &&
+ grep -q $state $memory/state; then
+ echo ${memory##/*/memory}
+ fi
+ done
+}
+
+ # List the removable memory blocks whose state matches "offline".
+ hotpluggable_offline_memory()
+ {
+ hotpluggable_memory offline
+ }
+
+ # List the removable memory blocks whose state matches "online".
+ hotpluggable_online_memory()
+ {
+ hotpluggable_memory online
+ }
+
+memory_is_online()
+{
+ grep -q online $SYSFS/devices/system/memory/memory$1/state
+}
+
+memory_is_offline()
+{
+ grep -q offline $SYSFS/devices/system/memory/memory$1/state
+}
+
+online_memory()
+{
+ echo online > $SYSFS/devices/system/memory/memory$1/state
+}
+
+offline_memory()
+{
+ echo offline > $SYSFS/devices/system/memory/memory$1/state
+}
+
+online_memory_expect_success()
+{
+ local memory=$1
+
+ if ! online_memory $memory; then
+ echo $FUNCNAME $memory: unexpected fail >&2
+ return 1
+ elif ! memory_is_online $memory; then
+ echo $FUNCNAME $memory: unexpected offline >&2
+ return 1
+ fi
+ return 0
+}
+
+online_memory_expect_fail()
+{
+ local memory=$1
+
+ if online_memory $memory 2> /dev/null; then
+ echo $FUNCNAME $memory: unexpected success >&2
+ return 1
+ elif ! memory_is_offline $memory; then
+ echo $FUNCNAME $memory: unexpected online >&2
+ return 1
+ fi
+ return 0
+}
+
+offline_memory_expect_success()
+{
+ local memory=$1
+
+ if ! offline_memory $memory; then
+ echo $FUNCNAME $memory: unexpected fail >&2
+ return 1
+ elif ! memory_is_offline $memory; then
+ echo $FUNCNAME $memory: unexpected offline >&2
+ return 1
+ fi
+ return 0
+}
+
+offline_memory_expect_fail()
+{
+ local memory=$1
+
+ if offline_memory $memory 2> /dev/null; then
+ echo $FUNCNAME $memory: unexpected success >&2
+ return 1
+ elif ! memory_is_online $memory; then
+ echo $FUNCNAME $memory: unexpected offline >&2
+ return 1
+ fi
+ return 0
+}
+
+ # Defaults, overridable on the command line:
+ # error - errno value written to the notifier error injection files
+ # priority - priority passed to the memory-notifier-error-inject module
+ # ratio - percentage of hot-pluggable memory to offline
+ # retval - exit status of the script, set to 1 on the first failure
+ error=-12
+ priority=0
+ # Run with default of ratio=2 for Kselftest run
+ ratio=2
+ retval=0
+
+ while getopts e:hp:r: opt; do
+ case $opt in
+ e)
+ error=$OPTARG
+ ;;
+ h)
+ echo "Usage $0 [ -e errno ] [ -p notifier-priority ] [ -r percent-of-memory-to-offline ]"
+ exit
+ ;;
+ p)
+ priority=$OPTARG
+ ;;
+ r)
+ ratio=$OPTARG
+ # reject anything that is not a plain non-negative integer
+ # before it reaches the numeric comparison below
+ case $ratio in
+ ''|*[!0-9]*)
+ echo "The percentage should be an integer within 0~100 range"
+ exit 1
+ ;;
+ esac
+ if [ "$ratio" -gt 100 ] || [ "$ratio" -lt 0 ]; then
+ echo "The percentage should be an integer within 0~100 range"
+ exit 1
+ fi
+ ;;
+ *)
+ # getopts has already printed its own diagnostic
+ echo "Usage $0 [ -e errno ] [ -p notifier-priority ] [ -r percent-of-memory-to-offline ]" >&2
+ exit 1
+ ;;
+ esac
+ done
+
+ if ! { [ "$error" -ge -4095 ] && [ "$error" -lt 0 ]; }; then
+ echo "error code must be -4095 <= errno < 0" >&2
+ exit 1
+ fi
+
+ prerequisite
+
+ echo "Test scope: $ratio% hotplug memory"
+
+ #
+ # Online all hot-pluggable memory
+ #
+ hotpluggable_num=$(hotpluggable_offline_memory | wc -l)
+ echo -e "\t online all hot-pluggable memory in offline state:"
+ if [ "$hotpluggable_num" -gt 0 ]; then
+ for memory in $(hotpluggable_offline_memory); do
+ echo "offline->online memory$memory"
+ if ! online_memory_expect_success "$memory"; then
+ retval=1
+ fi
+ done
+ else
+ echo -e "\t\t SKIPPED - no hot-pluggable memory in offline state"
+ fi
+
+ #
+ # Offline $ratio percent of hot-pluggable memory
+ #
+ hotpluggable_num=$(hotpluggable_online_memory | wc -l)
+ # ceil(hotpluggable_num * ratio / 100), computed without needing bc
+ target=$(( (hotpluggable_num * ratio + 99) / 100 ))
+ echo -e "\t offline $ratio% hot-pluggable memory in online state"
+ echo -e "\t trying to offline $target out of $hotpluggable_num memory block(s):"
+ for memory in $(hotpluggable_online_memory); do
+ if [ "$target" -gt 0 ]; then
+ echo "online->offline memory$memory"
+ # a block that refuses to go offline is skipped; the next one
+ # is tried until $target blocks have been offlined
+ if offline_memory_expect_success "$memory"; then
+ target=$((target - 1))
+ fi
+ fi
+ done
+ if [ "$target" -gt 0 ]; then
+ retval=1
+ echo -e "\t\t FAILED - unable to offline some memory blocks, device busy?"
+ fi
+
+ #
+ # Online all hot-pluggable memory again
+ #
+ hotpluggable_num=$(hotpluggable_offline_memory | wc -l)
+ echo -e "\t online all hot-pluggable memory in offline state:"
+ if [ "$hotpluggable_num" -gt 0 ]; then
+ for memory in $(hotpluggable_offline_memory); do
+ echo "offline->online memory$memory"
+ if ! online_memory_expect_success "$memory"; then
+ retval=1
+ fi
+ done
+ else
+ echo -e "\t\t SKIPPED - no hot-pluggable memory in offline state"
+ fi
+
+#
+# Test with memory notifier error injection
+#
+
+ # Mount point (third field) of the first debugfs mount, and the memory
+ # notifier error injection directory below it.
+ DEBUGFS=`mount -t debugfs | head -1 | awk '{ print $3 }'`
+ NOTIFIER_ERR_INJECT_DIR=$DEBUGFS/notifier-error-inject/memory
+
+ #
+ # Reload the memory-notifier-error-inject module with the requested
+ # priority and check that debugfs and the injection directory exist.
+ # If not, the extra tests are skipped by exiting with the current $retval,
+ # i.e. with the result of the tests that already ran.
+ #
+ prerequisite_extra()
+ {
+ msg="skip extra tests:"
+
+ # unload first so that the priority parameter takes effect on load
+ /sbin/modprobe -q -r memory-notifier-error-inject
+ /sbin/modprobe -q memory-notifier-error-inject priority=$priority
+
+ if [ ! -d "$DEBUGFS" ]; then
+ echo $msg debugfs is not mounted >&2
+ exit $retval
+ fi
+
+ if [ ! -d $NOTIFIER_ERR_INJECT_DIR ]; then
+ echo $msg memory-notifier-error-inject module is not available >&2
+ exit $retval
+ fi
+ }
+
+ echo -e "\t Test with memory notifier error injection"
+ prerequisite_extra
+
+ #
+ # Offline $ratio percent of hot-pluggable memory
+ #
+ echo 0 > "$NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error"
+ for memory in $(hotpluggable_online_memory); do
+ if [ $((RANDOM % 100)) -lt "$ratio" ]; then
+ # best effort: this only prepares offline blocks for the
+ # hot-add test below, so a failure here is not counted
+ offline_memory_expect_success "$memory"
+ fi
+ done
+
+ #
+ # Test memory hot-add error handling (offline => online)
+ #
+ echo "$error" > "$NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error"
+ for memory in $(hotpluggable_offline_memory); do
+ # with the error injected, onlining must be refused
+ if ! online_memory_expect_fail "$memory"; then
+ retval=1
+ fi
+ done
+
+ #
+ # Online all hot-pluggable memory
+ #
+ echo 0 > "$NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_ONLINE/error"
+ for memory in $(hotpluggable_offline_memory); do
+ if ! online_memory_expect_success "$memory"; then
+ retval=1
+ fi
+ done
+
+ #
+ # Test memory hot-remove error handling (online => offline)
+ #
+ echo "$error" > "$NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error"
+ for memory in $(hotpluggable_online_memory); do
+ # with the error injected, offlining must be refused
+ if ! offline_memory_expect_fail "$memory"; then
+ retval=1
+ fi
+ done
+
+ # clear the injected error and unload the module again
+ echo 0 > "$NOTIFIER_ERR_INJECT_DIR/actions/MEM_GOING_OFFLINE/error"
+ /sbin/modprobe -q -r memory-notifier-error-inject
+
+ exit $retval
diff --git a/tools/testing/selftests/mincore/.gitignore b/tools/testing/selftests/mincore/.gitignore
new file mode 100644
index 000000000..15c4dfc2d
--- /dev/null
+++ b/tools/testing/selftests/mincore/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0+
+mincore_selftest
diff --git a/tools/testing/selftests/mincore/Makefile b/tools/testing/selftests/mincore/Makefile
new file mode 100644
index 000000000..38c7db1e8
--- /dev/null
+++ b/tools/testing/selftests/mincore/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+CFLAGS += -Wall
+
+TEST_GEN_PROGS := mincore_selftest
+include ../lib.mk
diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c
new file mode 100644
index 000000000..2cf6f2f27
--- /dev/null
+++ b/tools/testing/selftests/mincore/mincore_selftest.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * kselftest suite for mincore().
+ *
+ * Copyright (C) 2020 Collabora, Ltd.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "../kselftest_harness.h"
+
+/* Default test file size: 4MB */
+#define MB (1UL << 20)
+#define FILE_SIZE (4 * MB)
+
+
+/*
+ * Tests the user interface. This test triggers most of the documented
+ * error conditions in mincore().
+ */
+TEST(basic_interface)
+{
+ int retval;
+ int page_size;
+ /* Residency vector; every query below covers at most one page */
+ unsigned char vec[1];
+ char *addr;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Query a 0 byte sized range */
+ retval = mincore(0, 0, vec);
+ EXPECT_EQ(0, retval);
+
+ /* Addresses in the specified range are invalid or unmapped */
+ errno = 0;
+ retval = mincore(NULL, page_size, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(ENOMEM, errno);
+
+ /*
+ * Map one shared anonymous page so the remaining checks have a valid,
+ * page-aligned address to start from. Nothing below can run without it,
+ * hence ASSERT rather than EXPECT.
+ */
+ errno = 0;
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+
+ /* <addr> argument is not page-aligned */
+ errno = 0;
+ retval = mincore(addr + 1, page_size, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(EINVAL, errno);
+
+ /* <length> argument is too large (-1 converts to the maximum size_t) */
+ errno = 0;
+ retval = mincore(addr, -1, vec);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(ENOMEM, errno);
+
+ /* <vec> argument points to an illegal address */
+ errno = 0;
+ retval = mincore(addr, page_size, NULL);
+ EXPECT_EQ(-1, retval);
+ EXPECT_EQ(EFAULT, errno);
+ munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a private anonymous page mapping.
+ * Check that the page is not loaded into memory right after the mapping
+ * but after accessing it (on-demand allocation).
+ * Then free the page and check that it's not memory-resident.
+ */
+TEST(check_anonymous_locked_pages)
+{
+ /* Residency vector for the single page under test */
+ unsigned char vec[1];
+ char *addr;
+ int retval;
+ int page_size;
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ /* Map one page and check it's not memory-resident */
+ errno = 0;
+ addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(0, vec[0]) {
+ TH_LOG("Page found in memory before use");
+ }
+
+ /* Touch the page and check again. It should now be in memory */
+ addr[0] = 1;
+ /* The mlock() result is not checked; residency is asserted below */
+ mlock(addr, page_size);
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[0]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ /*
+ * It shouldn't be memory-resident after unlocking it and
+ * marking it as unneeded.
+ */
+ munlock(addr, page_size);
+ madvise(addr, page_size, MADV_DONTNEED);
+ retval = mincore(addr, page_size, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(0, vec[0]) {
+ TH_LOG("Page in memory after being zapped");
+ }
+ munmap(addr, page_size);
+}
+
+
+/*
+ * Check mincore() behavior on huge pages.
+ * This test will be skipped if the mapping fails (i.e. if there are no
+ * huge pages available).
+ *
+ * Make sure the system has at least one free huge page, check
+ * "HugePages_Free" in /proc/meminfo.
+ * Increment /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages if
+ * needed.
+ */
+TEST(check_huge_pages)
+{
+	unsigned char vec[1];
+	char *addr;
+	int retval;
+	int page_size;
+
+	page_size = sysconf(_SC_PAGESIZE);
+
+	errno = 0;
+	addr = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+		    -1, 0);
+	/* ENOMEM is taken to mean that no huge page is available: skip */
+	if (addr == MAP_FAILED && errno == ENOMEM)
+		SKIP(return, "No huge pages available.");
+	/*
+	 * Any other mmap() failure is fatal. Merely logging it would let
+	 * the test go on to query and dereference MAP_FAILED.
+	 */
+	ASSERT_NE(MAP_FAILED, addr) {
+		TH_LOG("mmap error: %s", strerror(errno));
+	}
+
+	/* Nothing has touched the mapping yet: expect it non-resident */
+	retval = mincore(addr, page_size, vec);
+	ASSERT_EQ(0, retval);
+	ASSERT_EQ(0, vec[0]) {
+		TH_LOG("Page found in memory before use");
+	}
+
+	/* Touch and lock the page; it must now be reported resident */
+	addr[0] = 1;
+	mlock(addr, page_size);
+	retval = mincore(addr, page_size, vec);
+	ASSERT_EQ(0, retval);
+	ASSERT_EQ(1, vec[0]) {
+		TH_LOG("Page not found in memory after use");
+	}
+
+	munlock(addr, page_size);
+	/*
+	 * NOTE(review): mmap(2) says munmap() of a huge page mapping needs
+	 * a length that is a multiple of the huge page size, so this call
+	 * may fail with EINVAL; its result is not checked - confirm.
+	 */
+	munmap(addr, page_size);
+}
+
+
+/*
+ * Test mincore() behavior on a file-backed page.
+ * No pages should be loaded into memory right after the mapping. Then,
+ * accessing any address in the mapping range should load the page
+ * containing the address and a number of subsequent pages (readahead).
+ *
+ * The actual readahead settings depend on the test environment, so we
+ * can't make a lot of assumptions about that. This test covers the most
+ * general cases.
+ */
+TEST(check_file_mmap)
+{
+ unsigned char *vec;
+ int vec_size;
+ char *addr;
+ int retval;
+ int page_size;
+ int fd;
+ int i;
+ int ra_pages = 0;
+
+ /* One residency byte per page of the file, rounding up */
+ page_size = sysconf(_SC_PAGESIZE);
+ vec_size = FILE_SIZE / page_size;
+ if (FILE_SIZE % page_size)
+ vec_size++;
+
+ vec = calloc(vec_size, sizeof(unsigned char));
+ ASSERT_NE(NULL, vec) {
+ TH_LOG("Can't allocate array");
+ }
+
+ /*
+ * Back the mapping with an unnamed temporary file created in the
+ * current directory. EOPNOTSUPP from open() or fallocate() skips the
+ * test; any other errno fails it.
+ */
+ errno = 0;
+ fd = open(".", O_TMPFILE | O_RDWR, 0600);
+ if (fd < 0) {
+ ASSERT_EQ(errno, EOPNOTSUPP) {
+ TH_LOG("Can't create temporary file: %s",
+ strerror(errno));
+ }
+ SKIP(goto out_free, "O_TMPFILE not supported by filesystem.");
+ }
+ errno = 0;
+ retval = fallocate(fd, 0, 0, FILE_SIZE);
+ if (retval) {
+ ASSERT_EQ(errno, EOPNOTSUPP) {
+ TH_LOG("Error allocating space for the temporary file: %s",
+ strerror(errno));
+ }
+ SKIP(goto out_close, "fallocate not supported by filesystem.");
+ }
+
+ /*
+ * Map the whole file, the pages shouldn't be fetched yet.
+ */
+ errno = 0;
+ addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ for (i = 0; i < vec_size; i++) {
+ ASSERT_EQ(0, vec[i]) {
+ TH_LOG("Unexpected page in memory");
+ }
+ }
+
+ /*
+ * Touch a page in the middle of the mapping. We expect the next
+ * few pages (the readahead window) to be populated too.
+ */
+ addr[FILE_SIZE / 2] = 1;
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ /* Count the consecutive resident pages that follow the touched one */
+ i = FILE_SIZE / 2 / page_size + 1;
+ while (i < vec_size && vec[i]) {
+ ra_pages++;
+ i++;
+ }
+ EXPECT_GT(ra_pages, 0) {
+ TH_LOG("No read-ahead pages found in memory");
+ }
+
+ EXPECT_LT(i, vec_size) {
+ TH_LOG("Read-ahead pages reached the end of the file");
+ }
+ /*
+ * End of the readahead window. The rest of the pages shouldn't
+ * be in memory.
+ */
+ if (i < vec_size) {
+ /* Skip the non-resident run; stopping early means a resident page */
+ while (i < vec_size && !vec[i])
+ i++;
+ EXPECT_EQ(vec_size, i) {
+ TH_LOG("Unexpected page in memory beyond readahead window");
+ }
+ }
+
+ munmap(addr, FILE_SIZE);
+ /* The skip paths above jump here so fd and vec are still released */
+out_close:
+ close(fd);
+out_free:
+ free(vec);
+}
+
+
+/*
+ * Test mincore() behavior on a page backed by a tmpfs file. This test
+ * performs the same steps as the previous one. However, we don't expect
+ * any readahead in this case.
+ */
+TEST(check_tmpfs_mmap)
+{
+ unsigned char *vec;
+ int vec_size;
+ char *addr;
+ int retval;
+ int page_size;
+ int fd;
+ int i;
+ int ra_pages = 0;
+
+ /* One residency byte per page of the file, rounding up */
+ page_size = sysconf(_SC_PAGESIZE);
+ vec_size = FILE_SIZE / page_size;
+ if (FILE_SIZE % page_size)
+ vec_size++;
+
+ vec = calloc(vec_size, sizeof(unsigned char));
+ ASSERT_NE(NULL, vec) {
+ TH_LOG("Can't allocate array");
+ }
+
+ /*
+ * Unnamed temporary file under /dev/shm (presumably a tmpfs mount -
+ * the test does not verify it). Unlike the plain file test, failures
+ * here are asserted, never skipped.
+ */
+ errno = 0;
+ fd = open("/dev/shm", O_TMPFILE | O_RDWR, 0600);
+ ASSERT_NE(-1, fd) {
+ TH_LOG("Can't create temporary file: %s",
+ strerror(errno));
+ }
+ errno = 0;
+ retval = fallocate(fd, 0, 0, FILE_SIZE);
+ ASSERT_EQ(0, retval) {
+ TH_LOG("Error allocating space for the temporary file: %s",
+ strerror(errno));
+ }
+
+ /*
+ * Map the whole file, the pages shouldn't be fetched yet.
+ */
+ errno = 0;
+ addr = mmap(NULL, FILE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ ASSERT_NE(MAP_FAILED, addr) {
+ TH_LOG("mmap error: %s", strerror(errno));
+ }
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ for (i = 0; i < vec_size; i++) {
+ ASSERT_EQ(0, vec[i]) {
+ TH_LOG("Unexpected page in memory");
+ }
+ }
+
+ /*
+ * Touch a page in the middle of the mapping. We expect only
+ * that page to be fetched into memory.
+ */
+ addr[FILE_SIZE / 2] = 1;
+ retval = mincore(addr, FILE_SIZE, vec);
+ ASSERT_EQ(0, retval);
+ ASSERT_EQ(1, vec[FILE_SIZE / 2 / page_size]) {
+ TH_LOG("Page not found in memory after use");
+ }
+
+ /* Count resident pages right after the touched one; none is expected */
+ i = FILE_SIZE / 2 / page_size + 1;
+ while (i < vec_size && vec[i]) {
+ ra_pages++;
+ i++;
+ }
+ ASSERT_EQ(ra_pages, 0) {
+ TH_LOG("Read-ahead pages found in memory");
+ }
+
+ munmap(addr, FILE_SIZE);
+ close(fd);
+ free(vec);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/mount/.gitignore b/tools/testing/selftests/mount/.gitignore
new file mode 100644
index 000000000..17f2d8415
--- /dev/null
+++ b/tools/testing/selftests/mount/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+unprivileged-remount-test
+nosymfollow-test
diff --git a/tools/testing/selftests/mount/Makefile b/tools/testing/selftests/mount/Makefile
new file mode 100644
index 000000000..2d9454841
--- /dev/null
+++ b/tools/testing/selftests/mount/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for mount selftests.
+CFLAGS = -Wall \
+ -O2
+
+TEST_PROGS := run_unprivileged_remount.sh run_nosymfollow.sh
+TEST_GEN_FILES := unprivileged-remount-test nosymfollow-test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mount/config b/tools/testing/selftests/mount/config
new file mode 100644
index 000000000..416bd53ce
--- /dev/null
+++ b/tools/testing/selftests/mount/config
@@ -0,0 +1 @@
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/mount/nosymfollow-test.c b/tools/testing/selftests/mount/nosymfollow-test.c
new file mode 100644
index 000000000..650d6d80a
--- /dev/null
+++ b/tools/testing/selftests/mount/nosymfollow-test.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+
+#ifndef MS_NOSYMFOLLOW
+# define MS_NOSYMFOLLOW 256 /* Do not follow symlinks */
+#endif
+
+#ifndef ST_NOSYMFOLLOW
+# define ST_NOSYMFOLLOW 0x2000 /* Do not follow symlinks */
+#endif
+
+#define DATA "/tmp/data"
+#define LINK "/tmp/symlink"
+#define TMP "/tmp"
+
+/* Print a printf-style message on stderr, then exit with EXIT_FAILURE. */
+static void die(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Format a message and write it to an existing file with one write().
+ *
+ * enoent_ok: when true, ENOENT from open() is ignored and the function
+ *            returns without writing anything.
+ * filename:  file opened write-only; it is never created here.
+ * fmt, ap:   printf-style format and arguments; the formatted text must
+ *            fit the 4096-byte buffer together with its NUL terminator.
+ *
+ * Every other failure ends the program through die().
+ */
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt,
+			      va_list ap)
+{
+	char text[4096];
+	ssize_t nwritten;
+	int len;
+	int fd;
+
+	len = vsnprintf(text, sizeof(text), fmt, ap);
+	if (len < 0)
+		die("vsnprintf failed: %s\n", strerror(errno));
+	if (len >= sizeof(text))
+		die("vsnprintf output truncated\n");
+
+	fd = open(filename, O_WRONLY);
+	if (fd < 0) {
+		if (enoent_ok && errno == ENOENT)
+			return;
+		die("open of %s failed: %s\n", filename, strerror(errno));
+	}
+
+	nwritten = write(fd, text, len);
+	if (nwritten < 0)
+		die("write to %s failed: %s\n",
+		    filename, strerror(errno));
+	/* len is non-negative here, so any other mismatch is a short write */
+	if (nwritten != len)
+		die("short write to %s\n", filename);
+
+	if (close(fd))
+		die("close of %s failed: %s\n", filename, strerror(errno));
+}
+
+/*
+ * Write a formatted message to filename; a file that does not exist is
+ * silently skipped (enoent_ok is passed as true).
+ */
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vmaybe_write_file(true, filename, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Write a formatted message to filename; unlike maybe_write_file(), a
+ * missing file is treated as an error (enoent_ok is passed as false).
+ */
+static void write_file(char *filename, char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vmaybe_write_file(false, filename, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Enter a new user namespace in which the caller's uid and gid are
+ * mapped to 0, switch to gid 0 and uid 0, then unshare the mount
+ * namespace as well. Any failing step ends the program through die().
+ */
+static void create_and_enter_ns(void)
+{
+	const uid_t outer_uid = getuid();
+	const gid_t outer_gid = getgid();
+
+	if (unshare(CLONE_NEWUSER))
+		die("unshare(CLONE_NEWUSER) failed: %s\n", strerror(errno));
+
+	/* A missing /proc/self/setgroups is tolerated; the maps are not */
+	maybe_write_file("/proc/self/setgroups", "deny");
+	write_file("/proc/self/uid_map", "0 %d 1", outer_uid);
+	write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
+
+	if (setgid(0))
+		die("setgid(0) failed %s\n", strerror(errno));
+	if (setuid(0))
+		die("setuid(0) failed %s\n", strerror(errno));
+
+	if (unshare(CLONE_NEWNS))
+		die("unshare(CLONE_NEWNS) failed: %s\n", strerror(errno));
+}
+
+/*
+ * Create the regular file DATA and the symbolic link LINK that points
+ * at it. Any failure ends the program through die().
+ */
+static void setup_symlink(void)
+{
+	int data, err;
+
+	/*
+	 * The second argument of creat() is the permission mode, not open
+	 * flags: passing O_RDWR there asked for mode 0002. Request an
+	 * owner read/write file instead.
+	 */
+	data = creat(DATA, S_IRUSR | S_IWUSR);
+	if (data < 0)
+		die("creat failed: %s\n", strerror(errno));
+
+	err = symlink(DATA, LINK);
+	if (err < 0)
+		die("symlink failed: %s\n", strerror(errno));
+
+	if (close(data) != 0)
+		die("close of %s failed: %s\n", DATA, strerror(errno));
+}
+
+/*
+ * Open LINK and check the outcome against the mount mode.
+ *
+ * nosymfollow: true when the filesystem is mounted with MS_NOSYMFOLLOW;
+ *              open() must then fail with ELOOP. Otherwise the open must
+ *              succeed.
+ *
+ * Any unexpected outcome ends the program through die().
+ */
+static void test_link_traversal(bool nosymfollow)
+{
+	int fd;
+
+	/*
+	 * open() takes its flags second; a third argument is a mode that is
+	 * only consulted when a file gets created. The former call passed 0
+	 * as flags and O_RDWR as that unused mode, i.e. it opened the link
+	 * read-only, which is what is spelled out here.
+	 */
+	errno = 0; /* keeps the diagnostic meaningful if open() succeeds */
+	fd = open(LINK, O_RDONLY);
+	if (nosymfollow) {
+		if (fd != -1 || errno != ELOOP) {
+			die("link traversal unexpected result: %d, %s\n",
+			    fd, strerror(errno));
+		}
+	} else {
+		if (fd < 0)
+			die("link traversal failed: %s\n", strerror(errno));
+
+		if (close(fd) != 0)
+			die("close of link failed: %s\n", strerror(errno));
+	}
+}
+
+/*
+ * Check that readlink() on LINK yields DATA. This check does not depend
+ * on the mount mode. Dies on failure or mismatch.
+ */
+static void test_readlink(void)
+{
+	/*
+	 * readlink() does not append a NUL; the zero-filled buffer provides
+	 * the terminator that strcmp() relies on.
+	 */
+	char target[4096] = { 0 };
+	ssize_t len;
+
+	len = readlink(LINK, target, sizeof(target));
+	if (len < 0)
+		die("readlink failed: %s\n", strerror(errno));
+	if (strcmp(target, DATA))
+		die("readlink strcmp failed: '%s' '%s'\n", target, DATA);
+}
+
+/*
+ * Check that realpath() resolves LINK to DATA. Dies when the call fails
+ * or resolves to anything else.
+ */
+static void test_realpath(void)
+{
+	char *resolved;
+
+	/* With a NULL buffer realpath() allocates the result itself */
+	resolved = realpath(LINK, NULL);
+	if (resolved == NULL)
+		die("realpath failed: %s\n", strerror(errno));
+	if (strcmp(resolved, DATA))
+		die("realpath strcmp failed\n");
+
+	free(resolved);
+}
+
+/*
+ * Check that statfs() on TMP reports ST_NOSYMFOLLOW exactly when the
+ * filesystem is expected to be mounted nosymfollow.
+ *
+ * nosymfollow: expected state of the ST_NOSYMFOLLOW flag.
+ */
+static void test_statfs(bool nosymfollow)
+{
+	struct statfs fs;
+	bool flagged;
+
+	if (statfs(TMP, &fs))
+		die("statfs failed: %s\n", strerror(errno));
+
+	flagged = (fs.f_flags & ST_NOSYMFOLLOW) != 0;
+
+	if (nosymfollow && !flagged)
+		die("ST_NOSYMFOLLOW not set on %s\n", TMP);
+	if (!nosymfollow && flagged)
+		die("ST_NOSYMFOLLOW set on %s\n", TMP);
+}
+
+/*
+ * Run every check once. nosymfollow says whether TMP is currently
+ * mounted with MS_NOSYMFOLLOW; only the link traversal and statfs
+ * checks take it, readlink and realpath are called without it.
+ */
+static void run_tests(bool nosymfollow)
+{
+ test_link_traversal(nosymfollow);
+ test_readlink();
+ test_realpath();
+ test_statfs(nosymfollow);
+}
+
+/*
+ * Mount a ramfs on TMP inside fresh user and mount namespaces, run the
+ * checks, remount with MS_NOSYMFOLLOW and run them again. Returns
+ * EXIT_SUCCESS when nothing died on the way.
+ */
+int main(int argc, char **argv)
+{
+	create_and_enter_ns();
+
+	/* First pass: plain mount, symlinks are followed */
+	if (mount("testing", TMP, "ramfs", 0, NULL))
+		die("mount failed: %s\n", strerror(errno));
+
+	setup_symlink();
+	run_tests(false);
+
+	/* Second pass: same mount, now with nosymfollow */
+	if (mount("testing", TMP, "ramfs", MS_REMOUNT | MS_NOSYMFOLLOW, NULL))
+		die("remount failed: %s\n", strerror(errno));
+
+	run_tests(true);
+
+	return EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/mount/run_nosymfollow.sh b/tools/testing/selftests/mount/run_nosymfollow.sh
new file mode 100755
index 000000000..5fbbf0304
--- /dev/null
+++ b/tools/testing/selftests/mount/run_nosymfollow.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+./nosymfollow-test
diff --git a/tools/testing/selftests/mount/run_unprivileged_remount.sh b/tools/testing/selftests/mount/run_unprivileged_remount.sh
new file mode 100755
index 000000000..4ab8f507d
--- /dev/null
+++ b/tools/testing/selftests/mount/run_unprivileged_remount.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# The test writes /proc/self/uid_map; skip when the file is not there.
+if [ ! -f /proc/self/uid_map ]; then
+	echo "WARN: No /proc/self/uid_map exist, test skipped."
+	exit $ksft_skip
+fi
+
+# Run mount selftests; the script exits with the test's status.
+./unprivileged-remount-test
diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
new file mode 100644
index 000000000..584dc6bc3
--- /dev/null
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -0,0 +1,371 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <sys/vfs.h>
+#include <sys/statvfs.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifndef CLONE_NEWNS
+# define CLONE_NEWNS 0x00020000
+#endif
+#ifndef CLONE_NEWUTS
+# define CLONE_NEWUTS 0x04000000
+#endif
+#ifndef CLONE_NEWIPC
+# define CLONE_NEWIPC 0x08000000
+#endif
+#ifndef CLONE_NEWNET
+# define CLONE_NEWNET 0x40000000
+#endif
+#ifndef CLONE_NEWUSER
+# define CLONE_NEWUSER 0x10000000
+#endif
+#ifndef CLONE_NEWPID
+# define CLONE_NEWPID 0x20000000
+#endif
+
+#ifndef MS_REC
+# define MS_REC 16384
+#endif
+#ifndef MS_RELATIME
+# define MS_RELATIME (1 << 21)
+#endif
+#ifndef MS_STRICTATIME
+# define MS_STRICTATIME (1 << 24)
+#endif
+
+/* Report a printf-style message on stderr and exit with EXIT_FAILURE. */
+static void die(char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Format a message and write it to an existing file with one write().
+ *
+ * enoent_ok: when true, ENOENT from open() is ignored and nothing is
+ *            written.
+ * filename:  file opened write-only; it is never created here.
+ * fmt, ap:   printf-style format and arguments; the formatted text must
+ *            fit the 4096-byte buffer together with its NUL terminator.
+ *
+ * Every other failure ends the program through die().
+ */
+static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+{
+	char text[4096];
+	ssize_t nwritten;
+	int len;
+	int fd;
+
+	len = vsnprintf(text, sizeof(text), fmt, ap);
+	if (len < 0)
+		die("vsnprintf failed: %s\n",
+		    strerror(errno));
+	if (len >= sizeof(text))
+		die("vsnprintf output truncated\n");
+
+	fd = open(filename, O_WRONLY);
+	if (fd < 0) {
+		if (enoent_ok && errno == ENOENT)
+			return;
+		die("open of %s failed: %s\n",
+		    filename, strerror(errno));
+	}
+
+	nwritten = write(fd, text, len);
+	if (nwritten < 0)
+		die("write to %s failed: %s\n",
+		    filename, strerror(errno));
+	/* len is non-negative here, so any other mismatch is a short write */
+	if (nwritten != len)
+		die("short write to %s\n", filename);
+
+	if (close(fd))
+		die("close of %s failed: %s\n",
+		    filename, strerror(errno));
+}
+
+/*
+ * Write a formatted message to filename; a file that does not exist is
+ * silently skipped (enoent_ok is passed as true).
+ */
+static void maybe_write_file(char *filename, char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vmaybe_write_file(true, filename, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Write a formatted message to filename; a missing file is an error
+ * here (enoent_ok is passed as false).
+ */
+static void write_file(char *filename, char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vmaybe_write_file(false, filename, fmt, args);
+	va_end(args);
+}
+
+/*
+ * Return the mount flags of the filesystem containing path, expressed
+ * as MS_* values suitable for mount().
+ *
+ * The flags are read with statvfs() and translated from their ST_*
+ * form. The program dies when statvfs() fails or when it reports a flag
+ * that is not in the translation table below.
+ */
+static int read_mnt_flags(const char *path)
+{
+	/*
+	 * Single source of truth for both the set of recognized flags and
+	 * their translation. The old if-chain added ST_MANDLOCK instead of
+	 * MS_MANDLOCK to the result for the last entry.
+	 */
+	static const struct {
+		unsigned long st_flag;
+		int ms_flag;
+	} flag_map[] = {
+		{ ST_RDONLY,		MS_RDONLY },
+		{ ST_NOSUID,		MS_NOSUID },
+		{ ST_NODEV,		MS_NODEV },
+		{ ST_NOEXEC,		MS_NOEXEC },
+		{ ST_NOATIME,		MS_NOATIME },
+		{ ST_NODIRATIME,	MS_NODIRATIME },
+		{ ST_RELATIME,		MS_RELATIME },
+		{ ST_SYNCHRONOUS,	MS_SYNCHRONOUS },
+		{ ST_MANDLOCK,		MS_MANDLOCK },
+	};
+	struct statvfs vfs;
+	unsigned long known = 0;
+	int mnt_flags = 0;
+	size_t i;
+
+	if (statvfs(path, &vfs) != 0) {
+		die("statvfs of %s failed: %s\n",
+		    path, strerror(errno));
+	}
+
+	for (i = 0; i < sizeof(flag_map) / sizeof(flag_map[0]); i++) {
+		known |= flag_map[i].st_flag;
+		if (vfs.f_flag & flag_map[i].st_flag)
+			mnt_flags |= flag_map[i].ms_flag;
+	}
+
+	if (vfs.f_flag & ~known)
+		die("Unrecognized mount flags\n");
+
+	return mnt_flags;
+}
+
+/*
+ * Enter a new user namespace in which the caller's current uid and gid
+ * are mapped to 0, then switch to gid 0 and uid 0 inside it. Any
+ * failing step ends the program through die().
+ */
+static void create_and_enter_userns(void)
+{
+	const uid_t outer_uid = getuid();
+	const gid_t outer_gid = getgid();
+
+	if (unshare(CLONE_NEWUSER))
+		die("unshare(CLONE_NEWUSER) failed: %s\n",
+		    strerror(errno));
+
+	/* A missing /proc/self/setgroups is tolerated; the maps are not */
+	maybe_write_file("/proc/self/setgroups", "deny");
+	write_file("/proc/self/uid_map", "0 %d 1", outer_uid);
+	write_file("/proc/self/gid_map", "0 %d 1", outer_gid);
+
+	if (setgid(0))
+		die ("setgid(0) failed %s\n",
+		     strerror(errno));
+	if (setuid(0))
+		die("setuid(0) failed %s\n",
+		    strerror(errno));
+}
+
+/*
+ * Run one remount scenario in a forked child and report its outcome.
+ *
+ * fstype, mount_options: filesystem type and data string used to mount
+ *                        "testing" on /tmp.
+ * mount_flags:           flags for that initial mount.
+ * remount_flags:         flags for a bind remount that must succeed.
+ * invalid_flags:         flags for a bind remount that must be refused.
+ *
+ * Returns true when the child exited normally with EXIT_SUCCESS, false
+ * when it exited with another status. The calling process dies when
+ * fork() or waitpid() fails or when the child did not exit normally.
+ */
+static
+bool test_unpriv_remount(const char *fstype, const char *mount_options,
+ int mount_flags, int remount_flags, int invalid_flags)
+{
+ pid_t child;
+
+ child = fork();
+ if (child == -1) {
+ die("fork failed: %s\n",
+ strerror(errno));
+ }
+ if (child != 0) { /* parent */
+ pid_t pid;
+ int status;
+ pid = waitpid(child, &status, 0);
+ if (pid == -1) {
+ die("waitpid failed: %s\n",
+ strerror(errno));
+ }
+ if (pid != child) {
+ die("waited for %d got %d\n",
+ child, pid);
+ }
+ if (!WIFEXITED(status)) {
+ die("child did not terminate cleanly\n");
+ }
+ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+ }
+
+ /* Child from here on; it never returns, it only exits */
+
+ /* First user + mount namespace: the one that performs the mount */
+ create_and_enter_userns();
+ if (unshare(CLONE_NEWNS) != 0) {
+ die("unshare(CLONE_NEWNS) failed: %s\n",
+ strerror(errno));
+ }
+
+ if (mount("testing", "/tmp", fstype, mount_flags, mount_options) != 0) {
+ die("mount of %s with options '%s' on /tmp failed: %s\n",
+ fstype,
+ mount_options? mount_options : "",
+ strerror(errno));
+ }
+
+ /*
+ * Second, nested user + mount namespace: the remounts below are
+ * attempted from here. NOTE(review): presumably so that they come
+ * from a namespace that did not create the mount - confirm.
+ */
+ create_and_enter_userns();
+
+ if (unshare(CLONE_NEWNS) != 0) {
+ die("unshare(CLONE_NEWNS) failed: %s\n",
+ strerror(errno));
+ }
+
+ /* Remounting with remount_flags has to be accepted */
+ if (mount("/tmp", "/tmp", "none",
+ MS_REMOUNT | MS_BIND | remount_flags, NULL) != 0) {
+ /* system("cat /proc/self/mounts"); */
+ die("remount of /tmp failed: %s\n",
+ strerror(errno));
+ }
+
+ /* Remounting with invalid_flags has to be refused */
+ if (mount("/tmp", "/tmp", "none",
+ MS_REMOUNT | MS_BIND | invalid_flags, NULL) == 0) {
+ /* system("cat /proc/self/mounts"); */
+ die("remount of /tmp with invalid flags "
+ "succeeded unexpectedly\n");
+ }
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * test_unpriv_remount() on a ramfs with mount_flags used for both the
+ * initial mount and the accepted remount. invalid_flags is 0, so the
+ * remount that must be refused is the one passing none of the flags.
+ */
+static bool test_unpriv_remount_simple(int mount_flags)
+{
+ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, 0);
+}
+
+/*
+ * test_unpriv_remount() on a ramfs with mount_flags used for both the
+ * initial mount and the accepted remount, and invalid_flags used for
+ * the remount that must be refused.
+ */
+static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags)
+{
+ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags,
+ invalid_flags);
+}
+
+/*
+ * In a forked child, bind mount /dev onto /tmp from a new user and
+ * mount namespace, remount it with the flags /dev had beforehand, and
+ * check that the flags read back afterwards are unchanged.
+ *
+ * Returns true when the child exited normally with EXIT_SUCCESS, false
+ * when it exited with another status. The calling process dies when
+ * fork() or waitpid() fails or when the child did not exit normally.
+ */
+static bool test_priv_mount_unpriv_remount(void)
+{
+ pid_t child;
+ int ret;
+ const char *orig_path = "/dev";
+ const char *dest_path = "/tmp";
+ int orig_mnt_flags, remount_mnt_flags;
+
+ child = fork();
+ if (child == -1) {
+ die("fork failed: %s\n",
+ strerror(errno));
+ }
+ if (child != 0) { /* parent */
+ pid_t pid;
+ int status;
+ pid = waitpid(child, &status, 0);
+ if (pid == -1) {
+ die("waitpid failed: %s\n",
+ strerror(errno));
+ }
+ if (pid != child) {
+ die("waited for %d got %d\n",
+ child, pid);
+ }
+ if (!WIFEXITED(status)) {
+ die("child did not terminate cleanly\n");
+ }
+ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
+ }
+
+ /* Child from here on; it never returns, it only exits */
+
+ /* Record the flags of the source mount before changing namespaces */
+ orig_mnt_flags = read_mnt_flags(orig_path);
+
+ create_and_enter_userns();
+ ret = unshare(CLONE_NEWNS);
+ if (ret != 0) {
+ die("unshare(CLONE_NEWNS) failed: %s\n",
+ strerror(errno));
+ }
+
+ ret = mount(orig_path, dest_path, "bind", MS_BIND | MS_REC, NULL);
+ if (ret != 0) {
+ die("recursive bind mount of %s onto %s failed: %s\n",
+ orig_path, dest_path, strerror(errno));
+ }
+
+ /* Remount the bind mount passing exactly the flags recorded above */
+ ret = mount(dest_path, dest_path, "none",
+ MS_REMOUNT | MS_BIND | orig_mnt_flags , NULL);
+ if (ret != 0) {
+ /* system("cat /proc/self/mounts"); */
+ die("remount of /tmp failed: %s\n",
+ strerror(errno));
+ }
+
+ /* The remount must not have altered any of the recognized flags */
+ remount_mnt_flags = read_mnt_flags(dest_path);
+ if (orig_mnt_flags != remount_mnt_flags) {
+ die("Mount flags unexpectedly changed during remount of %s originally mounted on %s\n",
+ dest_path, orig_path);
+ }
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Run every remount scenario in turn. The first scenario that fails
+ * ends the program through die() with a message naming the flag
+ * combination; EXIT_SUCCESS is returned when all of them pass.
+ */
+int main(int argc, char **argv)
+{
+	if (!test_unpriv_remount_simple(MS_RDONLY)) {
+		die("MS_RDONLY malfunctions\n");
+	}
+	if (!test_unpriv_remount("devpts", "newinstance", MS_NODEV, MS_NODEV, 0)) {
+		die("MS_NODEV malfunctions\n");
+	}
+	if (!test_unpriv_remount_simple(MS_NOSUID)) {
+		die("MS_NOSUID malfunctions\n");
+	}
+	if (!test_unpriv_remount_simple(MS_NOEXEC)) {
+		die("MS_NOEXEC malfunctions\n");
+	}
+
+	/* atime modes: (flags to mount and keep, flags that must be refused) */
+	if (!test_unpriv_remount_atime(MS_RELATIME, MS_NOATIME)) {
+		die("MS_RELATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_STRICTATIME, MS_NOATIME)) {
+		die("MS_STRICTATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_NOATIME, MS_STRICTATIME)) {
+		die("MS_NOATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME, MS_NOATIME)) {
+		die("MS_RELATIME|MS_NODIRATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME, MS_NOATIME)) {
+		die("MS_STRICTATIME|MS_NODIRATIME malfunctions\n");
+	}
+	if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME, MS_STRICTATIME)) {
+		/* The message used to name a non-existent MS_DIRATIME flag */
+		die("MS_NOATIME|MS_NODIRATIME malfunctions\n");
+	}
+
+	/* Mounted strictatime, remounted with no atime flag at all */
+	if (!test_unpriv_remount("ramfs", NULL, MS_STRICTATIME, 0, MS_NOATIME)) {
+		die("Default atime malfunctions\n");
+	}
+	if (!test_priv_mount_unpriv_remount()) {
+		die("Mount flags unexpectedly changed after remount\n");
+	}
+	return EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/mqueue/.gitignore b/tools/testing/selftests/mqueue/.gitignore
new file mode 100644
index 000000000..72ad8ca69
--- /dev/null
+++ b/tools/testing/selftests/mqueue/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mq_open_tests
+mq_perf_tests
diff --git a/tools/testing/selftests/mqueue/Makefile b/tools/testing/selftests/mqueue/Makefile
new file mode 100644
index 000000000..8a58055fc
--- /dev/null
+++ b/tools/testing/selftests/mqueue/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -O2
+LDLIBS = -lrt -lpthread -lpopt
+
+TEST_GEN_PROGS := mq_open_tests mq_perf_tests
+
+include ../lib.mk
diff --git a/tools/testing/selftests/mqueue/mq_open_tests.c b/tools/testing/selftests/mqueue/mq_open_tests.c
new file mode 100644
index 000000000..9403ac01b
--- /dev/null
+++ b/tools/testing/selftests/mqueue/mq_open_tests.c
@@ -0,0 +1,502 @@
+/*
+ * This application is Copyright 2012 Red Hat, Inc.
+ * Doug Ledford <dledford@redhat.com>
+ *
+ * mq_open_tests is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3.
+ *
+ * mq_open_tests is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * For the full text of the license, see <http://www.gnu.org/licenses/>.
+ *
+ * mq_open_tests.c
+ * Tests the various situations that should either succeed or fail to
+ * open a posix message queue and then reports whether or not they
+ * did as they were supposed to.
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <mqueue.h>
+#include <error.h>
+
+#include "../kselftest.h"
+
+/*
+ * Usage text containing a single %s placeholder (presumably for the program
+ * name).
+ * NOTE(review): nothing in this file references `usage`; main() silently
+ * falls back to default_queue_path when no path is given -- confirm whether
+ * this text should be printed somewhere or dropped.
+ */
+static char *usage =
+"Usage:\n"
+" %s path\n"
+"\n"
+" path Path name of the message queue to create\n"
+"\n"
+" Note: this program must be run as root in order to enable all tests\n"
+"\n";
+
+/* Sysctl files holding the mqueue defaults and ceilings. */
+char *DEF_MSGS = "/proc/sys/fs/mqueue/msg_default";
+char *DEF_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_default";
+char *MAX_MSGS = "/proc/sys/fs/mqueue/msg_max";
+char *MAX_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_max";
+
+/* Streams main() opens on the four sysctl files above. */
+FILE *def_msgs;
+FILE *def_msgsize;
+FILE *max_msgs;
+FILE *max_msgsize;
+
+/* Set by main() when the msg_default/msgsize_default files could be opened. */
+int default_settings;
+
+/* RLIMIT_MSGQUEUE as read at startup, and the copy the tests modify. */
+struct rlimit saved_limits;
+struct rlimit cur_limits;
+
+/* Sysctl values read at startup; shutdown() writes non-zero ones back. */
+int saved_def_msgs;
+int saved_def_msgsize;
+int saved_max_msgs;
+int saved_max_msgsize;
+
+/* Sysctl values as the tests currently believe them to be. */
+int cur_def_msgs;
+int cur_def_msgsize;
+int cur_max_msgs;
+int cur_max_msgsize;
+
+/* Name of the queue under test, its fallback, and the open descriptor. */
+char *default_queue_path = "/test1";
+char *queue_path;
+mqd_t queue = -1;
+
+/*
+ * Forward declarations of the helpers defined further down in this file.
+ * shutdown() is declared early because the get/set/getr/setr wrappers call
+ * it to bail out on failure.
+ */
+static inline void __set(FILE *stream, int value, char *err_msg);
+void shutdown(int exit_val, char *err_cause, int line_no);
+static inline int get(FILE *stream);
+static inline void set(FILE *stream, int value);
+static inline void getr(int type, struct rlimit *rlim);
+static inline void setr(int type, struct rlimit *rlim);
+void validate_current_settings();
+static inline void test_queue(struct mq_attr *attr, struct mq_attr *result);
+static inline int test_queue_fail(struct mq_attr *attr, struct mq_attr *result);
+
+/*
+ * __set - best-effort write of @value to an already open stream.
+ * @stream: stream to rewind and write to
+ * @value: integer written in decimal
+ * @err_msg: prefix handed to perror() if the write fails
+ *
+ * Failure is only reported, never fatal, because this is used while shutting
+ * down. stdio buffers the fprintf(), so the underlying file only receives
+ * (and can only reject) the value when the stream is flushed; flush here so
+ * that a failed restore is actually noticed and reported.
+ */
+static inline void __set(FILE *stream, int value, char *err_msg)
+{
+	rewind(stream);
+	if (fprintf(stream, "%d", value) < 0 || fflush(stream) == EOF)
+		perror(err_msg);
+}
+
+
+/*
+ * shutdown - undo the system changes made by the tests and terminate.
+ * @exit_val: process exit status; when non-zero an error report is printed
+ * @err_cause: description of what failed (used when @exit_val is non-zero)
+ * @line_no: source line reported together with @err_cause
+ *
+ * Switches the effective uid back to 0, closes and unlinks the test queue
+ * and writes the saved sysctl values back. Does not return, except when it
+ * is re-entered while a shutdown is already in progress.
+ */
+void shutdown(int exit_val, char *err_cause, int line_no)
+{
+	static int in_shutdown = 0;
+	/*
+	 * Latch errno first: the cleanup calls below may overwrite it before
+	 * error() gets to report why we are shutting down.
+	 */
+	int errno_at_shutdown = errno;
+
+	/* Guard against re-entry while the cleanup below is running */
+	if (in_shutdown++)
+		return;
+
+	if (seteuid(0) == -1)
+		perror("seteuid() failed");
+
+	if (queue != -1)
+		if (mq_close(queue))
+			perror("mq_close() during shutdown");
+	if (queue_path)
+		/*
+		 * Be silent if this fails, if we cleaned up already it's
+		 * expected to fail
+		 */
+		mq_unlink(queue_path);
+	if (default_settings) {
+		if (saved_def_msgs)
+			__set(def_msgs, saved_def_msgs,
+			      "failed to restore saved_def_msgs");
+		if (saved_def_msgsize)
+			__set(def_msgsize, saved_def_msgsize,
+			      "failed to restore saved_def_msgsize");
+	}
+	if (saved_max_msgs)
+		__set(max_msgs, saved_max_msgs,
+		      "failed to restore saved_max_msgs");
+	if (saved_max_msgsize)
+		__set(max_msgsize, saved_max_msgsize,
+		      "failed to restore saved_max_msgsize");
+	if (exit_val)
+		error(exit_val, errno_at_shutdown, "%s at %d", err_cause,
+		      line_no);
+	exit(0);
+}
+
+/*
+ * get - read the integer currently stored in an open /proc stream.
+ * @stream: stream to rewind and parse
+ *
+ * Calls shutdown() with exit status 4 if no integer can be parsed.
+ */
+static inline int get(FILE *stream)
+{
+	int value;
+
+	rewind(stream);
+	if (fscanf(stream, "%d", &value) == 1)
+		return value;
+	/* Report the line of the fscanf() call above */
+	shutdown(4, "Error reading /proc entry", __LINE__ - 3);
+	return value;
+}
+
+/*
+ * set - write @value to a /proc stream and verify it by reading it back.
+ * @stream: stream to write to
+ * @value: integer to store
+ *
+ * Calls shutdown() with exit status 5 if the write fails or the value read
+ * back differs from @value.
+ */
+static inline void set(FILE *stream, int value)
+{
+	int readback;
+
+	rewind(stream);
+	if (fprintf(stream, "%d", value) < 0) {
+		shutdown(5, "Failed writing to /proc file",
+			 __LINE__ - 1);
+		return;
+	}
+
+	readback = get(stream);
+	if (readback != value) {
+		shutdown(5, "We didn't get what we wrote to /proc back",
+			 __LINE__ - 1);
+		return;
+	}
+}
+
+/*
+ * getr - getrlimit() wrapper; calls shutdown() with exit status 6 on failure.
+ */
+static inline void getr(int type, struct rlimit *rlim)
+{
+	if (getrlimit(type, rlim) == 0)
+		return;
+	/* Report the line of the getrlimit() call above */
+	shutdown(6, "getrlimit()", __LINE__ - 3);
+}
+
+/*
+ * setr - setrlimit() wrapper; calls shutdown() with exit status 7 on failure.
+ */
+static inline void setr(int type, struct rlimit *rlim)
+{
+	if (setrlimit(type, rlim) == 0)
+		return;
+	/* Report the line of the setrlimit() call above */
+	shutdown(7, "setrlimit()", __LINE__ - 3);
+}
+
+/*
+ * validate_current_settings - make the rlimit and sysctl values workable.
+ *
+ * Raises RLIMIT_MSGQUEUE when the soft limit is below 4096 bytes, then
+ * lowers either the default or the maximum queue parameters (depending on
+ * default_settings) to 10 messages of 128 bytes when the estimated queue
+ * size would not fit in the current soft limit.
+ */
+void validate_current_settings()
+{
+	/*
+	 * Byte estimate for one queue. Kept in rlim_t so that large sysctl
+	 * values are neither truncated nor turned negative before being
+	 * compared with the (unsigned) rlimit.
+	 */
+	rlim_t rlim_needed;
+
+	if (cur_limits.rlim_cur < 4096) {
+		printf("Current rlimit value for POSIX message queue bytes is "
+		       "unreasonably low,\nincreasing.\n\n");
+		cur_limits.rlim_cur = 8192;
+		cur_limits.rlim_max = 16384;
+		setr(RLIMIT_MSGQUEUE, &cur_limits);
+	}
+
+	if (default_settings) {
+		rlim_needed = (rlim_t)(cur_def_msgs + 1) *
+			(cur_def_msgsize + 1 + 2 * sizeof(void *));
+		if (rlim_needed > cur_limits.rlim_cur) {
+			printf("Temporarily lowering default queue parameters "
+			       "to something that will work\n"
+			       "with the current rlimit values.\n\n");
+			set(def_msgs, 10);
+			cur_def_msgs = 10;
+			set(def_msgsize, 128);
+			cur_def_msgsize = 128;
+		}
+	} else {
+		rlim_needed = (rlim_t)(cur_max_msgs + 1) *
+			(cur_max_msgsize + 1 + 2 * sizeof(void *));
+		if (rlim_needed > cur_limits.rlim_cur) {
+			printf("Temporarily lowering maximum queue parameters "
+			       "to something that will work\n"
+			       "with the current rlimit values in case this is "
+			       "a kernel that ties the default\n"
+			       "queue parameters to the maximum queue "
+			       "parameters.\n\n");
+			set(max_msgs, 10);
+			cur_max_msgs = 10;
+			set(max_msgsize, 128);
+			cur_max_msgsize = 128;
+		}
+	}
+}
+
+/*
+ * test_queue - create the test queue, fetch its attributes, remove it again.
+ * @attr: attributes handed to mq_open(); may be NULL
+ * @result: filled in by mq_getattr() with the attributes of the new queue
+ *
+ * For use where creation is expected to succeed: any failing step ends the
+ * program through shutdown() with exit status 1.
+ */
+static inline void test_queue(struct mq_attr *attr, struct mq_attr *result)
+{
+	const int open_flags = O_RDWR | O_EXCL | O_CREAT;
+	const int mode = DEFFILEMODE;
+
+	queue = mq_open(queue_path, open_flags, mode, attr);
+	if (queue == -1)
+		shutdown(1, "mq_open()", __LINE__);
+	if (mq_getattr(queue, result) != 0)
+		shutdown(1, "mq_getattr()", __LINE__);
+	if (mq_close(queue) != 0)
+		shutdown(1, "mq_close()", __LINE__);
+	/* Descriptor is gone; keep shutdown() from closing it a second time */
+	queue = -1;
+	if (mq_unlink(queue_path) != 0)
+		shutdown(1, "mq_unlink()", __LINE__);
+}
+
+/*
+ * test_queue_fail - like test_queue(), but a failing mq_open() is tolerated.
+ * @attr: attributes handed to mq_open(); may be NULL
+ * @result: filled in by mq_getattr() when the queue was created
+ *
+ * Failures after a successful mq_open() still end the program through
+ * shutdown() with exit status 1.
+ *
+ * Returns:
+ *   0 - the queue could not be created
+ *   1 - the queue was created, its attributes are in *result
+ */
+static inline int test_queue_fail(struct mq_attr *attr, struct mq_attr *result)
+{
+	const int open_flags = O_RDWR | O_EXCL | O_CREAT;
+	const int mode = DEFFILEMODE;
+
+	queue = mq_open(queue_path, open_flags, mode, attr);
+	if (queue == -1)
+		return 0;
+
+	if (mq_getattr(queue, result) != 0)
+		shutdown(1, "mq_getattr()", __LINE__);
+	if (mq_close(queue) != 0)
+		shutdown(1, "mq_close()", __LINE__);
+	/* Descriptor is gone; keep shutdown() from closing it a second time */
+	queue = -1;
+	if (mq_unlink(queue_path) != 0)
+		shutdown(1, "mq_unlink()", __LINE__);
+	return 1;
+}
+
+/*
+ * main - run the mq_open() test series.
+ *
+ * Takes an optional queue name as the only argument (default_queue_path is
+ * used otherwise), records the current rlimit and sysctl values, adjusts
+ * them as needed, and prints a PASS/FAIL line per check. Series 1 opens
+ * queues without an attr struct, series 2 with one, first with euid 0 and
+ * then with euid 99. All exits after the sysctl files have been opened go
+ * through shutdown() so the saved values are written back.
+ */
+int main(int argc, char *argv[])
+{
+	struct mq_attr attr, result;
+
+	if (argc != 2) {
+		printf("Using Default queue path - %s\n", default_queue_path);
+		queue_path = default_queue_path;
+	} else {
+		/*
+		 * Although we can create a msg queue with a non-absolute path
+		 * name, unlink will fail. So, if the name doesn't start with
+		 * a /, add one when we save it.
+		 */
+		if (*argv[1] == '/') {
+			queue_path = strdup(argv[1]);
+			if (!queue_path) {
+				perror("strdup()");
+				exit(1);
+			}
+		} else {
+			queue_path = malloc(strlen(argv[1]) + 2);
+			if (!queue_path) {
+				perror("malloc()");
+				exit(1);
+			}
+			queue_path[0] = '/';
+			queue_path[1] = 0;
+			strcat(queue_path, argv[1]);
+		}
+	}
+
+	if (getuid() != 0)
+		ksft_exit_skip("Not running as root, but almost all tests "
+			"require root in order to modify\nsystem settings. "
+			"Exiting.\n");
+
+	/* Find out what files there are for us to make tweaks in */
+	def_msgs = fopen(DEF_MSGS, "r+");
+	def_msgsize = fopen(DEF_MSGSIZE, "r+");
+	max_msgs = fopen(MAX_MSGS, "r+");
+	max_msgsize = fopen(MAX_MSGSIZE, "r+");
+
+	if (!max_msgs)
+		shutdown(2, "Failed to open msg_max", __LINE__);
+	if (!max_msgsize)
+		shutdown(2, "Failed to open msgsize_max", __LINE__);
+	/*
+	 * Both default files are read and written below, so only enable the
+	 * default-settings tests when both streams are usable.
+	 */
+	if (def_msgs && def_msgsize)
+		default_settings = 1;
+
+	/* Load up the current system values for everything we can */
+	getr(RLIMIT_MSGQUEUE, &saved_limits);
+	cur_limits = saved_limits;
+	if (default_settings) {
+		saved_def_msgs = cur_def_msgs = get(def_msgs);
+		saved_def_msgsize = cur_def_msgsize = get(def_msgsize);
+	}
+	saved_max_msgs = cur_max_msgs = get(max_msgs);
+	saved_max_msgsize = cur_max_msgsize = get(max_msgsize);
+
+	/* Tell the user our initial state */
+	printf("\nInitial system state:\n");
+	printf("\tUsing queue path:\t\t%s\n", queue_path);
+	printf("\tRLIMIT_MSGQUEUE(soft):\t\t%ld\n",
+		(long) saved_limits.rlim_cur);
+	printf("\tRLIMIT_MSGQUEUE(hard):\t\t%ld\n",
+		(long) saved_limits.rlim_max);
+	printf("\tMaximum Message Size:\t\t%d\n", saved_max_msgsize);
+	printf("\tMaximum Queue Size:\t\t%d\n", saved_max_msgs);
+	if (default_settings) {
+		printf("\tDefault Message Size:\t\t%d\n", saved_def_msgsize);
+		printf("\tDefault Queue Size:\t\t%d\n", saved_def_msgs);
+	} else {
+		printf("\tDefault Message Size:\t\tNot Supported\n");
+		printf("\tDefault Queue Size:\t\tNot Supported\n");
+	}
+	printf("\n");
+
+	validate_current_settings();
+
+	printf("Adjusted system state for testing:\n");
+	printf("\tRLIMIT_MSGQUEUE(soft):\t\t%ld\n", (long) cur_limits.rlim_cur);
+	printf("\tRLIMIT_MSGQUEUE(hard):\t\t%ld\n", (long) cur_limits.rlim_max);
+	printf("\tMaximum Message Size:\t\t%d\n", cur_max_msgsize);
+	printf("\tMaximum Queue Size:\t\t%d\n", cur_max_msgs);
+	if (default_settings) {
+		printf("\tDefault Message Size:\t\t%d\n", cur_def_msgsize);
+		printf("\tDefault Queue Size:\t\t%d\n", cur_def_msgs);
+	}
+
+	printf("\n\nTest series 1, behavior when no attr struct "
+	       "passed to mq_open:\n");
+	if (!default_settings) {
+		test_queue(NULL, &result);
+		printf("Given sane system settings, mq_open without an attr "
+		       "struct succeeds:\tPASS\n");
+		if (result.mq_maxmsg != cur_max_msgs ||
+		    result.mq_msgsize != cur_max_msgsize) {
+			printf("Kernel does not support setting the default "
+			       "mq attributes,\nbut also doesn't tie the "
+			       "defaults to the maximums:\t\t\tPASS\n");
+		} else {
+			set(max_msgs, ++cur_max_msgs);
+			set(max_msgsize, ++cur_max_msgsize);
+			test_queue(NULL, &result);
+			if (result.mq_maxmsg == cur_max_msgs &&
+			    result.mq_msgsize == cur_max_msgsize)
+				printf("Kernel does not support setting the "
+				       "default mq attributes and\n"
+				       "also ties system wide defaults to "
+				       "the system wide maximums:\t\t"
+				       "FAIL\n");
+			else
+				printf("Kernel does not support setting the "
+				       "default mq attributes,\n"
+				       "but also doesn't tie the defaults to "
+				       "the maximums:\t\t\tPASS\n");
+		}
+	} else {
+		printf("Kernel supports setting defaults separately from "
+		       "maximums:\t\tPASS\n");
+		/*
+		 * While we are here, go ahead and test that the kernel
+		 * properly follows the default settings
+		 */
+		test_queue(NULL, &result);
+		printf("Given sane values, mq_open without an attr struct "
+		       "succeeds:\t\tPASS\n");
+		if (result.mq_maxmsg != cur_def_msgs ||
+		    result.mq_msgsize != cur_def_msgsize)
+			printf("Kernel supports setting defaults, but does "
+			       "not actually honor them:\tFAIL\n\n");
+		else {
+			set(def_msgs, ++cur_def_msgs);
+			set(def_msgsize, ++cur_def_msgsize);
+			/* In case max was the same as the default */
+			set(max_msgs, ++cur_max_msgs);
+			set(max_msgsize, ++cur_max_msgsize);
+			test_queue(NULL, &result);
+			if (result.mq_maxmsg != cur_def_msgs ||
+			    result.mq_msgsize != cur_def_msgsize)
+				printf("Kernel supports setting defaults, but "
+				       "does not actually honor them:\t"
+				       "FAIL\n");
+			else
+				printf("Kernel properly honors default setting "
+				       "knobs:\t\t\t\tPASS\n");
+		}
+		set(def_msgs, cur_max_msgs + 1);
+		cur_def_msgs = cur_max_msgs + 1;
+		set(def_msgsize, cur_max_msgsize + 1);
+		cur_def_msgsize = cur_max_msgsize + 1;
+		if (cur_def_msgs * (cur_def_msgsize + 2 * sizeof(void *)) >=
+		    cur_limits.rlim_cur) {
+			cur_limits.rlim_cur = (cur_def_msgs + 2) *
+				(cur_def_msgsize + 2 * sizeof(void *));
+			cur_limits.rlim_max = 2 * cur_limits.rlim_cur;
+			setr(RLIMIT_MSGQUEUE, &cur_limits);
+		}
+		if (test_queue_fail(NULL, &result)) {
+			if (result.mq_maxmsg == cur_max_msgs &&
+			    result.mq_msgsize == cur_max_msgsize)
+				printf("Kernel properly limits default values "
+				       "to lesser of default/max:\t\tPASS\n");
+			else
+				printf("Kernel does not properly set default "
+				       "queue parameters when\ndefaults > "
+				       "max:\t\t\t\t\t\t\t\tFAIL\n");
+		} else
+			printf("Kernel fails to open mq because defaults are "
+			       "greater than maximums:\tFAIL\n");
+		set(def_msgs, --cur_def_msgs);
+		set(def_msgsize, --cur_def_msgsize);
+		cur_limits.rlim_cur = cur_limits.rlim_max = cur_def_msgs *
+			cur_def_msgsize;
+		setr(RLIMIT_MSGQUEUE, &cur_limits);
+		if (test_queue_fail(NULL, &result))
+			printf("Kernel creates queue even though defaults "
+			       "would exceed\nrlimit setting:"
+			       "\t\t\t\t\t\t\t\tFAIL\n");
+		else
+			printf("Kernel properly fails to create queue when "
+			       "defaults would\nexceed rlimit:"
+			       "\t\t\t\t\t\t\t\tPASS\n");
+	}
+
+	/*
+	 * Test #2 - open with an attr struct that exceeds rlimit
+	 */
+	printf("\n\nTest series 2, behavior when attr struct is "
+	       "passed to mq_open:\n");
+	cur_max_msgs = 32;
+	cur_max_msgsize = cur_limits.rlim_max >> 4;
+	set(max_msgs, cur_max_msgs);
+	set(max_msgsize, cur_max_msgsize);
+	attr.mq_maxmsg = cur_max_msgs;
+	attr.mq_msgsize = cur_max_msgsize;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open in excess of rlimit max when euid = 0 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open in excess of rlimit max when euid = 0 "
+		       "failed:\t\tPASS\n");
+	attr.mq_maxmsg = cur_max_msgs + 1;
+	attr.mq_msgsize = 10;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with mq_maxmsg > limit when euid = 0 "
+		       "succeeded:\t\tPASS\n");
+	else
+		printf("Queue open with mq_maxmsg > limit when euid = 0 "
+		       "failed:\t\tFAIL\n");
+	attr.mq_maxmsg = 1;
+	attr.mq_msgsize = cur_max_msgsize + 1;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with mq_msgsize > limit when euid = 0 "
+		       "succeeded:\t\tPASS\n");
+	else
+		printf("Queue open with mq_msgsize > limit when euid = 0 "
+		       "failed:\t\tFAIL\n");
+	attr.mq_maxmsg = 65536;
+	attr.mq_msgsize = 65536;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with total size > 2GB when euid = 0 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open with total size > 2GB when euid = 0 "
+		       "failed:\t\t\tPASS\n");
+
+	/*
+	 * The sysctls have been modified by now, so leave through shutdown()
+	 * rather than exit() to get the saved values written back.
+	 */
+	if (seteuid(99) == -1)
+		shutdown(1, "seteuid() failed", __LINE__);
+
+	attr.mq_maxmsg = cur_max_msgs;
+	attr.mq_msgsize = cur_max_msgsize;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open in excess of rlimit max when euid = 99 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open in excess of rlimit max when euid = 99 "
+		       "failed:\t\tPASS\n");
+	attr.mq_maxmsg = cur_max_msgs + 1;
+	attr.mq_msgsize = 10;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with mq_maxmsg > limit when euid = 99 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open with mq_maxmsg > limit when euid = 99 "
+		       "failed:\t\tPASS\n");
+	attr.mq_maxmsg = 1;
+	attr.mq_msgsize = cur_max_msgsize + 1;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with mq_msgsize > limit when euid = 99 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open with mq_msgsize > limit when euid = 99 "
+		       "failed:\t\tPASS\n");
+	attr.mq_maxmsg = 65536;
+	attr.mq_msgsize = 65536;
+	if (test_queue_fail(&attr, &result))
+		printf("Queue open with total size > 2GB when euid = 99 "
+		       "succeeded:\t\tFAIL\n");
+	else
+		printf("Queue open with total size > 2GB when euid = 99 "
+		       "failed:\t\t\tPASS\n");
+
+	shutdown(0,"",0);
+}
diff --git a/tools/testing/selftests/mqueue/mq_perf_tests.c b/tools/testing/selftests/mqueue/mq_perf_tests.c
new file mode 100644
index 000000000..84fda3b49
--- /dev/null
+++ b/tools/testing/selftests/mqueue/mq_perf_tests.c
@@ -0,0 +1,752 @@
+/*
+ * This application is Copyright 2012 Red Hat, Inc.
+ * Doug Ledford <dledford@redhat.com>
+ *
+ * mq_perf_tests is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3.
+ *
+ * mq_perf_tests is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * For the full text of the license, see <http://www.gnu.org/licenses/>.
+ *
+ * mq_perf_tests.c
+ * Tests various types of message queue workloads, concentrating on those
+ * situations that involve large message sizes, large message queue depths,
+ * or both, and reports back useful metrics about kernel message queue
+ * performance.
+ *
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <signal.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <mqueue.h>
+#include <popt.h>
+#include <error.h>
+
+#include "../kselftest.h"
+
+/*
+ * Usage text containing a single %s placeholder (presumably for the program
+ * name).
+ * NOTE(review): no code visible in this part of the file references `usage`,
+ * and the option table registers POPT_AUTOHELP plus a -p/--path option that
+ * this text does not mention -- confirm whether it is still printed anywhere.
+ */
+static char *usage =
+"Usage:\n"
+" %s [-c #[,#..] -f] path\n"
+"\n"
+" -c # Skip most tests and go straight to a high queue depth test\n"
+" and then run that test continuously (useful for running at\n"
+" the same time as some other workload to see how much the\n"
+" cache thrashing caused by adding messages to a very deep\n"
+" queue impacts the performance of other programs). The number\n"
+" indicates which CPU core we should bind the process to during\n"
+" the run. If you have more than one physical CPU, then you\n"
+" will need one copy per physical CPU package, and you should\n"
+" specify the CPU cores to pin ourself to via a comma separated\n"
+" list of CPU values.\n"
+" -f Only usable with continuous mode. Pin ourself to the CPUs\n"
+" as requested, then instead of looping doing a high mq\n"
+" workload, just busy loop. This will allow us to lock up a\n"
+" single CPU just like we normally would, but without actually\n"
+" thrashing the CPU cache. This is to make it easier to get\n"
+" comparable numbers from some other workload running on the\n"
+" other CPUs. One set of numbers with # CPUs locked up running\n"
+" an mq workload, and another set of numbers with those same\n"
+" CPUs locked away from the test workload, but not doing\n"
+" anything to trash the cache like the mq workload might.\n"
+" path Path name of the message queue to create\n"
+"\n"
+" Note: this program must be run as root in order to enable all tests\n"
+"\n";
+
+/* Compile-time knobs */
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define MAX_CPUS 64		/* capacity of cpus_to_pin[] and cpu_threads[] */
+#define MSG_SIZE 16		/* bytes per test message */
+#define TEST1_LOOPS 10000000	/* iterations of Test #1 */
+#define TEST2_LOOPS 100000	/* iterations of each Test #2 variant */
+
+/* Sysctl files holding the mqueue ceilings, and the streams opened on them. */
+char *MAX_MSGS = "/proc/sys/fs/mqueue/msg_max";
+char *MAX_MSGSIZE = "/proc/sys/fs/mqueue/msgsize_max";
+FILE *max_msgs;
+FILE *max_msgsize;
+
+/* CPU selection taken from the -c option, and the threads created for it. */
+char *cpu_option_string;
+int cpus_to_pin[MAX_CPUS];
+int num_cpus_to_pin;
+pthread_t cpu_threads[MAX_CPUS];
+pthread_t main_thread;
+cpu_set_t *cpu_set;
+int cpu_set_size;
+int cpus_online;
+
+/* Mode flags set while parsing the command line. */
+int continuous_mode;
+int continuous_mode_fake;
+
+/* RLIMIT_MSGQUEUE and sysctl values: as saved, and as currently set. */
+struct rlimit saved_limits;
+struct rlimit cur_limits;
+int saved_max_msgs;
+int saved_max_msgsize;
+int cur_max_msgs;
+int cur_max_msgsize;
+int cur_nice;
+
+/* The queue under test and the attributes read back after opening it. */
+char *queue_path = "/mq_perf_tests";
+mqd_t queue = -1;
+struct mq_attr result;
+int mq_prio_max;
+
+/*
+ * Command line options handed to popt:
+ *   -c/--continuous  CPU list, stored in cpu_option_string
+ *   -f/--fake        flag, stored in continuous_mode_fake
+ *   -p/--path        queue name, stored in queue_path
+ * plus the automatic help entries from POPT_AUTOHELP.
+ */
+const struct poptOption options[] = {
+	{
+		.longName = "continuous",
+		.shortName = 'c',
+		.argInfo = POPT_ARG_STRING,
+		.arg = &cpu_option_string,
+		.val = 'c',
+		.descrip = "Run continuous tests at a high queue depth in "
+			"order to test the effects of cache thrashing on "
+			"other tasks on the system. This test is intended "
+			"to be run on one core of each physical CPU while "
+			"some other CPU intensive task is run on all the other "
+			"cores of that same physical CPU and the other task "
+			"is timed. It is assumed that the process of adding "
+			"messages to the message queue in a tight loop will "
+			"impact that other task to some degree. Once the "
+			"tests are performed in this way, you should then "
+			"re-run the tests using fake mode in order to check "
+			"the difference in time required to perform the CPU "
+			"intensive task",
+		.argDescrip = "cpu[,cpu]",
+	},
+	{
+		.longName = "fake",
+		.shortName = 'f',
+		.argInfo = POPT_ARG_NONE,
+		.arg = &continuous_mode_fake,
+		.val = 0,
+		/* Each fragment ends in a space so the joined text reads right */
+		.descrip = "Tie up the CPUs that we would normally tie up in "
+			"continuous mode, but don't actually do any mq stuff, "
+			"just keep the CPU busy so it can't be used to process "
+			"system level tasks as this would free up resources on "
+			"the other CPU cores and skew the comparison between "
+			"the no-mqueue work and mqueue work tests",
+		.argDescrip = NULL,
+	},
+	{
+		.longName = "path",
+		.shortName = 'p',
+		.argInfo = POPT_ARG_STRING | POPT_ARGFLAG_SHOW_DEFAULT,
+		.arg = &queue_path,
+		.val = 'p',
+		.descrip = "The name of the path to use in the mqueue "
+			"filesystem for our tests",
+		.argDescrip = "pathname",
+	},
+	POPT_AUTOHELP
+	POPT_TABLEEND
+};
+
+/*
+ * Forward declarations of the helpers and signal handlers defined further
+ * down in this file.
+ */
+static inline void __set(FILE *stream, int value, char *err_msg);
+void shutdown(int exit_val, char *err_cause, int line_no);
+void sig_action_SIGUSR1(int signum, siginfo_t *info, void *context);
+void sig_action(int signum, siginfo_t *info, void *context);
+static inline int get(FILE *stream);
+static inline void set(FILE *stream, int value);
+static inline int try_set(FILE *stream, int value);
+static inline void getr(int type, struct rlimit *rlim);
+static inline void setr(int type, struct rlimit *rlim);
+static inline void open_queue(struct mq_attr *attr);
+void increase_limits(void);
+
+/*
+ * __set - best-effort write of @value to an already open stream.
+ * @stream: stream to rewind and write to
+ * @value: integer written in decimal
+ * @err_msg: prefix handed to perror() if the write fails
+ *
+ * Failure is only reported, never fatal, because this is used while shutting
+ * down. stdio buffers the fprintf(), so the underlying file only receives
+ * (and can only reject) the value when the stream is flushed; flush here so
+ * that a failed restore is actually noticed and reported.
+ */
+static inline void __set(FILE *stream, int value, char *err_msg)
+{
+	rewind(stream);
+	if (fprintf(stream, "%d", value) < 0 || fflush(stream) == EOF)
+		perror(err_msg);
+}
+
+
+/*
+ * shutdown - stop the worker threads, undo system changes and terminate.
+ * @exit_val: process exit status; when non-zero an error report is printed
+ * @err_cause: description of what failed (used when @exit_val is non-zero)
+ * @line_no: source line reported together with @err_cause
+ *
+ * Does not return, except when it is entered again while a shutdown is
+ * already in progress.
+ */
+void shutdown(int exit_val, char *err_cause, int line_no)
+{
+	static int in_shutdown = 0;
+	/* Latched before the cleanup calls below can overwrite errno */
+	int errno_at_shutdown = errno;
+	int i;
+
+	/* In case we get called by multiple threads or from an sighandler */
+	if (in_shutdown++)
+		return;
+
+	/* Free the cpu_set allocated using CPU_ALLOC in main function */
+	CPU_FREE(cpu_set);
+
+	for (i = 0; i < num_cpus_to_pin; i++) {
+		if (!cpu_threads[i])
+			continue;
+		/*
+		 * Worker threads call shutdown() too. Never signal or join
+		 * the calling thread itself: sig_action_SIGUSR1() ends a
+		 * non-main thread with pthread_exit(), which would abandon
+		 * the cleanup below half way through.
+		 */
+		if (pthread_equal(cpu_threads[i], pthread_self()))
+			continue;
+		pthread_kill(cpu_threads[i], SIGUSR1);
+		pthread_join(cpu_threads[i], NULL);
+	}
+
+	if (queue != -1)
+		if (mq_close(queue))
+			perror("mq_close() during shutdown");
+	if (queue_path)
+		/*
+		 * Be silent if this fails, if we cleaned up already it's
+		 * expected to fail
+		 */
+		mq_unlink(queue_path);
+	if (saved_max_msgs)
+		__set(max_msgs, saved_max_msgs,
+		      "failed to restore saved_max_msgs");
+	if (saved_max_msgsize)
+		__set(max_msgsize, saved_max_msgsize,
+		      "failed to restore saved_max_msgsize");
+	if (exit_val)
+		error(exit_val, errno_at_shutdown, "%s at %d",
+		      err_cause, line_no);
+	exit(0);
+}
+
+/*
+ * sig_action_SIGUSR1 - three-argument signal handler.
+ *
+ * In the main thread it announces the signal and runs shutdown(); in any
+ * other thread it simply ends that thread.
+ */
+void sig_action_SIGUSR1(int signum, siginfo_t *info, void *context)
+{
+	if (pthread_self() == main_thread) {
+		fprintf(stderr,
+			"Caught signal %d in SIGUSR1 handler, exiting\n",
+			signum);
+		shutdown(0, "", 0);
+		/* Only reached if shutdown() was already in progress */
+		fprintf(stderr, "\n\nReturned from shutdown?!?!\n\n");
+		exit(0);
+	}
+	pthread_exit(0);
+}
+
+/*
+ * sig_action - three-argument signal handler.
+ *
+ * A thread other than the main thread forwards the signal to the main
+ * thread; the main thread announces it and runs shutdown().
+ */
+void sig_action(int signum, siginfo_t *info, void *context)
+{
+	if (pthread_self() != main_thread) {
+		pthread_kill(main_thread, signum);
+		return;
+	}
+
+	fprintf(stderr, "Caught signal %d, exiting\n", signum);
+	shutdown(0, "", 0);
+	/* Only reached if shutdown() was already in progress */
+	fprintf(stderr, "\n\nReturned from shutdown?!?!\n\n");
+	exit(0);
+}
+
+/*
+ * get - read the integer currently stored in an open /proc stream.
+ * @stream: stream to rewind and parse
+ *
+ * Calls shutdown() with exit status 4 if no integer can be parsed.
+ */
+static inline int get(FILE *stream)
+{
+	int value;
+
+	rewind(stream);
+	if (fscanf(stream, "%d", &value) == 1)
+		return value;
+	shutdown(4, "Error reading /proc entry", __LINE__);
+	return value;
+}
+
+/*
+ * set - write @value to a /proc stream and verify it by reading it back.
+ * @stream: stream to write to
+ * @value: integer to store
+ *
+ * Calls shutdown() with exit status 5 if the write fails or the value read
+ * back differs from @value.
+ */
+static inline void set(FILE *stream, int value)
+{
+	int readback;
+
+	rewind(stream);
+	if (fprintf(stream, "%d", value) < 0) {
+		shutdown(5, "Failed writing to /proc file", __LINE__);
+		return;
+	}
+
+	readback = get(stream);
+	if (readback != value) {
+		shutdown(5, "We didn't get what we wrote to /proc back",
+			 __LINE__);
+		return;
+	}
+}
+
+/*
+ * try_set - attempt to store @value in a /proc stream.
+ * @stream: stream to write to
+ * @value: integer to store
+ *
+ * The result of the write itself is not examined; success is judged solely
+ * by reading the value back. Returns 1 if the stream now holds @value,
+ * 0 otherwise.
+ */
+static inline int try_set(FILE *stream, int value)
+{
+	rewind(stream);
+	fprintf(stream, "%d", value);
+	return get(stream) == value;
+}
+
+/*
+ * getr - getrlimit() wrapper; calls shutdown() with exit status 6 on failure.
+ */
+static inline void getr(int type, struct rlimit *rlim)
+{
+	if (getrlimit(type, rlim) == 0)
+		return;
+	shutdown(6, "getrlimit()", __LINE__);
+}
+
+/*
+ * setr - setrlimit() wrapper; calls shutdown() with exit status 7 on failure.
+ */
+static inline void setr(int type, struct rlimit *rlim)
+{
+	if (setrlimit(type, rlim) == 0)
+		return;
+	shutdown(7, "setrlimit()", __LINE__);
+}
+
+/**
+ * open_queue - open the global queue for testing
+ * @attr - An attr struct specifying the desired queue traits
+ *
+ * This open is not allowed to fail, failure will result in an orderly
+ * shutdown of the program. The global queue_path is used to set what
+ * queue to open, the queue descriptor is saved in the global queue
+ * variable, and the traits the queue actually has are stored in the
+ * global result struct and printed.
+ */
+static inline void open_queue(struct mq_attr *attr)
+{
+	int flags = O_RDWR | O_EXCL | O_CREAT | O_NONBLOCK;
+	int perms = DEFFILEMODE;
+
+	queue = mq_open(queue_path, flags, perms, attr);
+	if (queue == -1)
+		shutdown(1, "mq_open()", __LINE__);
+	if (mq_getattr(queue, &result))
+		shutdown(1, "mq_getattr()", __LINE__);
+	printf("\n\tQueue %s created:\n", queue_path);
+	printf("\t\tmq_flags:\t\t\t%s\n", result.mq_flags & O_NONBLOCK ?
+	       "O_NONBLOCK" : "(null)");
+	/* The mq_attr members are signed; print them as such */
+	printf("\t\tmq_maxmsg:\t\t\t%ld\n", (long)result.mq_maxmsg);
+	printf("\t\tmq_msgsize:\t\t\t%ld\n", (long)result.mq_msgsize);
+	printf("\t\tmq_curmsgs:\t\t\t%ld\n", (long)result.mq_curmsgs);
+}
+
+/*
+ * fake_cont_thread - thread body for fake continuous mode.
+ * @arg: unused
+ *
+ * Looks up its own slot in cpu_threads[], announces itself, then spins
+ * forever without touching the message queue.
+ */
+void *fake_cont_thread(void *arg)
+{
+	int slot = 0;
+
+	while (slot < num_cpus_to_pin && cpu_threads[slot] != pthread_self())
+		slot++;
+	printf("\tStarted fake continuous mode thread %d on CPU %d\n", slot,
+	       cpus_to_pin[slot]);
+	for (;;)
+		;
+}
+
+/*
+ * cont_thread - thread body for continuous mode.
+ * @arg: unused
+ *
+ * Looks up its own slot in cpu_threads[], announces itself, then loops
+ * forever: send until mq_send() fails, receive one message, repeat.
+ */
+void *cont_thread(void *arg)
+{
+	/* Zeroed so that we never send uninitialised stack contents */
+	char buff[MSG_SIZE] = { 0 };
+	/* mq_receive() stores the priority through an unsigned int pointer */
+	unsigned int priority;
+	int i;
+
+	for (i = 0; i < num_cpus_to_pin; i++)
+		if (cpu_threads[i] == pthread_self())
+			break;
+	printf("\tStarted continuous mode thread %d on CPU %d\n", i,
+	       cpus_to_pin[i]);
+	while (1) {
+		while (mq_send(queue, buff, sizeof(buff), 0) == 0)
+			;
+		mq_receive(queue, buff, sizeof(buff), &priority);
+	}
+}
+
+/*
+ * Helper macros for the timing loops. They expand in the caller's scope and
+ * rely on these locals being present there: buff, prio_in, prio_out, clock,
+ * start, middle, end, nsec, send_total and recv_total.
+ */
+
+/*
+ * Receive until mq_receive() no longer returns a full MSG_SIZE message.
+ * Wrapped in do/while(0) so the macro is a complete statement on its own
+ * instead of a loop header that swallows whatever follows it.
+ */
+#define drain_queue() \
+	do { \
+		while (mq_receive(queue, buff, MSG_SIZE, &prio_in) == MSG_SIZE) \
+			; \
+	} while (0)
+
+/* Send one message without timing it; a failed send ends the program. */
+#define do_untimed_send() \
+	do { \
+		if (mq_send(queue, buff, MSG_SIZE, prio_out)) \
+			shutdown(3, "Test send failure", __LINE__); \
+	} while (0)
+
+/*
+ * Send and receive one message, adding the time each half took to
+ * send_total and recv_total. The totals are normalised with a division so
+ * that tv_nsec stays below one second even if a single sample is longer
+ * than a second.
+ */
+#define do_send_recv() \
+	do { \
+		clock_gettime(clock, &start); \
+		if (mq_send(queue, buff, MSG_SIZE, prio_out)) \
+			shutdown(3, "Test send failure", __LINE__); \
+		clock_gettime(clock, &middle); \
+		if (mq_receive(queue, buff, MSG_SIZE, &prio_in) != MSG_SIZE) \
+			shutdown(3, "Test receive failure", __LINE__); \
+		clock_gettime(clock, &end); \
+		nsec = ((unsigned long long)(middle.tv_sec - start.tv_sec) * \
+			1000000000) + (middle.tv_nsec - start.tv_nsec); \
+		nsec += send_total.tv_nsec; \
+		send_total.tv_sec += nsec / 1000000000; \
+		send_total.tv_nsec = nsec % 1000000000; \
+		nsec = ((unsigned long long)(end.tv_sec - middle.tv_sec) * \
+			1000000000) + (end.tv_nsec - middle.tv_nsec); \
+		nsec += recv_total.tv_nsec; \
+		recv_total.tv_sec += nsec / 1000000000; \
+		recv_total.tv_nsec = nsec % 1000000000; \
+	} while (0)
+
+/* One entry of a test table: a banner to print and a priority updater. */
+struct test {
+ char *desc; /* text printed before the test runs; NULL ends a table */
+ void (*func)(int *); /* called with a pointer to the send priority */
+};
+
+/* const_prio - priority updater that leaves *prio untouched. */
+void const_prio(int *prio)
+{
+	(void)prio;
+}
+
+/* inc_prio - step *prio up by one, wrapping to 0 on reaching mq_prio_max. */
+void inc_prio(int *prio)
+{
+	int next = *prio + 1;
+
+	*prio = (next == mq_prio_max) ? 0 : next;
+}
+
+/* dec_prio - step *prio down by one, wrapping to mq_prio_max - 1 below 0. */
+void dec_prio(int *prio)
+{
+	int prev = *prio - 1;
+
+	*prio = (prev < 0) ? mq_prio_max - 1 : prev;
+}
+
+/* random_prio - set *prio to random() reduced modulo mq_prio_max. */
+void random_prio(int *prio)
+{
+	long draw = random();
+
+	*prio = draw % mq_prio_max;
+}
+
+/*
+ * The Test #2 variants, terminated by a NULL entry. The banners carry no
+ * trailing newline: perf_test_thread() prints them with "%s:\n", and a
+ * newline here would push that colon onto a line of its own.
+ */
+struct test test2[] = {
+	{"\n\tTest #2a: Time send/recv message, queue full, constant prio",
+	const_prio},
+	{"\n\tTest #2b: Time send/recv message, queue full, increasing prio",
+	inc_prio},
+	{"\n\tTest #2c: Time send/recv message, queue full, decreasing prio",
+	dec_prio},
+	{"\n\tTest #2d: Time send/recv message, queue full, random prio",
+	random_prio},
+	{NULL, NULL}
+};
+
+/* Print an accumulated time and the resulting average over @loops messages. */
+static void perf_print_total(const char *label, const struct timespec *total,
+			     int loops)
+{
+	unsigned long long avg;
+
+	/* %09ld: the fraction is nanoseconds and needs its leading zeroes */
+	printf("\t\t%s msg:\t\t\t%ld.%09lds total time\n", label,
+	       (long)total->tv_sec, (long)total->tv_nsec);
+	avg = ((unsigned long long)total->tv_sec * 1000000000 +
+	       total->tv_nsec) / loops;
+	printf("\t\t\t\t\t\t%llu nsec/msg\n", avg);
+}
+
+/* Nanoseconds elapsed between two readings of the same clock. */
+static unsigned long long perf_elapsed_nsec(const struct timespec *from,
+					    const struct timespec *to)
+{
+	return ((unsigned long long)(to->tv_sec - from->tv_sec) *
+		1000000000) + (to->tv_nsec - from->tv_nsec);
+}
+
+/**
+ * Tests performed (all done with MSG_SIZE messages):
+ *
+ * 1) Time to add/remove message with 0 messages on queue
+ * 1a) with constant prio
+ * 2) Time to add/remove message when queue close to capacity:
+ * 2a) with constant prio
+ * 2b) with increasing prio
+ * 2c) with decreasing prio
+ * 2d) with random prio
+ *
+ * The value of _SC_MQ_PRIO_MAX is printed, but the priority limit itself is
+ * not exercised here.
+ *
+ * Times are taken from the CPU-time clock of cpu_threads[0]. Any failure
+ * ends the program through shutdown(). Returns 0.
+ */
+void *perf_test_thread(void *arg)
+{
+	char buff[MSG_SIZE];
+	int prio_out;
+	/* mq_receive() stores the priority through an unsigned int pointer */
+	unsigned int prio_in;
+	int i;
+	clockid_t clock;
+	struct timespec res, start, middle, end, send_total, recv_total;
+	unsigned long long nsec;
+	struct test *cur_test;
+
+	printf("\n\tStarted mqueue performance test thread on CPU %d\n",
+	       cpus_to_pin[0]);
+	mq_prio_max = sysconf(_SC_MQ_PRIO_MAX);
+	if (mq_prio_max == -1)
+		shutdown(2, "sysconf(_SC_MQ_PRIO_MAX)", __LINE__);
+	if (pthread_getcpuclockid(cpu_threads[0], &clock) != 0)
+		shutdown(2, "pthread_getcpuclockid", __LINE__);
+
+	if (clock_getres(clock, &res))
+		shutdown(2, "clock_getres()", __LINE__);
+
+	printf("\t\tMax priorities:\t\t\t%d\n", mq_prio_max);
+	printf("\t\tClock resolution:\t\t%ld nsec%s\n", (long)res.tv_nsec,
+	       res.tv_nsec > 1 ? "s" : "");
+
+
+
+	printf("\n\tTest #1: Time send/recv message, queue empty\n");
+	printf("\t\t(%d iterations)\n", TEST1_LOOPS);
+	prio_out = 0;
+	send_total.tv_sec = 0;
+	send_total.tv_nsec = 0;
+	recv_total.tv_sec = 0;
+	recv_total.tv_nsec = 0;
+	for (i = 0; i < TEST1_LOOPS; i++)
+		do_send_recv();
+	perf_print_total("Send", &send_total, TEST1_LOOPS);
+	perf_print_total("Recv", &recv_total, TEST1_LOOPS);
+
+
+	for (cur_test = test2; cur_test->desc != NULL; cur_test++) {
+		printf("%s:\n", cur_test->desc);
+		printf("\t\t(%d iterations)\n", TEST2_LOOPS);
+		prio_out = 0;
+		send_total.tv_sec = 0;
+		send_total.tv_nsec = 0;
+		recv_total.tv_sec = 0;
+		recv_total.tv_nsec = 0;
+		printf("\t\tFilling queue...");
+		fflush(stdout);
+		clock_gettime(clock, &start);
+		/* Leave one free slot for the timed send below */
+		for (i = 0; i < result.mq_maxmsg - 1; i++) {
+			do_untimed_send();
+			cur_test->func(&prio_out);
+		}
+		clock_gettime(clock, &end);
+		nsec = perf_elapsed_nsec(&start, &end);
+		printf("done.\t\t%llu.%09llus\n", nsec / 1000000000,
+		       nsec % 1000000000);
+		printf("\t\tTesting...");
+		fflush(stdout);
+		for (i = 0; i < TEST2_LOOPS; i++) {
+			do_send_recv();
+			cur_test->func(&prio_out);
+		}
+		printf("done.\n");
+		perf_print_total("Send", &send_total, TEST2_LOOPS);
+		perf_print_total("Recv", &recv_total, TEST2_LOOPS);
+		printf("\t\tDraining queue...");
+		fflush(stdout);
+		clock_gettime(clock, &start);
+		drain_queue();
+		clock_gettime(clock, &end);
+		nsec = perf_elapsed_nsec(&start, &end);
+		printf("done.\t\t%llu.%09llus\n", nsec / 1000000000,
+		       nsec % 1000000000);
+	}
+	return 0;
+}
+
+/*
+ * increase_limits - lift the resource limits as far as the system allows.
+ *
+ * Sets RLIMIT_MSGQUEUE to unlimited, raises msg_max in steps of 10 and
+ * msgsize_max in steps of 1024 until a step is no longer accepted (then
+ * re-reads the value actually in effect), and finally sets the process
+ * nice value to -20. A failing setrlimit() or setpriority() ends the
+ * program through shutdown().
+ */
+void increase_limits(void)
+{
+	cur_limits.rlim_cur = RLIM_INFINITY;
+	cur_limits.rlim_max = RLIM_INFINITY;
+	setr(RLIMIT_MSGQUEUE, &cur_limits);
+
+	do {
+		cur_max_msgs += 10;
+	} while (try_set(max_msgs, cur_max_msgs));
+	cur_max_msgs = get(max_msgs);
+
+	do {
+		cur_max_msgsize += 1024;
+	} while (try_set(max_msgsize, cur_max_msgsize));
+	cur_max_msgsize = get(max_msgsize);
+
+	if (setpriority(PRIO_PROCESS, 0, -20) != 0)
+		shutdown(2, "setpriority()", __LINE__);
+	cur_nice = -20;
+}
+
+/*
+ * Entry point of the message queue performance test.
+ *
+ * Parses the command line with popt, records the current queue limits and
+ * nice value, raises them through increase_limits(), installs the signal
+ * handlers and then starts one thread per entry in cpus_to_pin[], each with
+ * its affinity restricted to that CPU.
+ *
+ * Without the 'c' option a single perf_test_thread is pinned to the highest
+ * numbered online CPU and its return value is handed to shutdown().  With
+ * the 'c' option one thread per listed CPU is started and the main thread
+ * sleeps in an endless loop (termination is presumably left to the signal
+ * handlers installed here; they are not part of this function).
+ *
+ * Exits with status 1 through the err_code label on option errors.
+ */
+int main(int argc, char *argv[])
+{
+	struct mq_attr attr;
+	char *option, *next_option;
+	int i, cpu, rc;
+	struct sigaction sa;
+	poptContext popt_context;
+	void *retval;
+
+	main_thread = pthread_self();
+	num_cpus_to_pin = 0;
+
+	if (sysconf(_SC_NPROCESSORS_ONLN) == -1) {
+		perror("sysconf(_SC_NPROCESSORS_ONLN)");
+		exit(1);
+	}
+
+	if (getuid() != 0)
+		ksft_exit_skip("Not running as root, but almost all tests "
+			"require root in order to modify\nsystem settings. "
+			"Exiting.\n");
+
+	/* The CPU set is sized for the online CPUs, capped at MAX_CPUS. */
+	cpus_online = min(MAX_CPUS, sysconf(_SC_NPROCESSORS_ONLN));
+	cpu_set = CPU_ALLOC(cpus_online);
+	if (cpu_set == NULL) {
+		perror("CPU_ALLOC()");
+		exit(1);
+	}
+	cpu_set_size = CPU_ALLOC_SIZE(cpus_online);
+	CPU_ZERO_S(cpu_set_size, cpu_set);
+
+	popt_context = poptGetContext(NULL, argc, (const char **)argv,
+				      options, 0);
+
+	while ((rc = poptGetNextOpt(popt_context)) > 0) {
+		switch (rc) {
+		case 'c':
+			continuous_mode = 1;
+			option = cpu_option_string;
+			/* Walk the comma separated CPU list in place. */
+			do {
+				next_option = strchr(option, ',');
+				if (next_option)
+					*next_option = '\0';
+				cpu = atoi(option);
+				/*
+				 * Negative numbers must be rejected as well:
+				 * they would be used as an index into cpu_set
+				 * by CPU_ISSET_S()/CPU_SET_S() below.
+				 */
+				if (cpu < 0)
+					fprintf(stderr, "CPU %d is not a valid "
+						"CPU number, ignoring.\n",
+						cpu);
+				else if (cpu >= cpus_online)
+					fprintf(stderr, "CPU %d exceeds "
+						"cpus online, ignoring.\n",
+						cpu);
+				else
+					cpus_to_pin[num_cpus_to_pin++] = cpu;
+				if (next_option)
+					option = ++next_option;
+			} while (next_option && num_cpus_to_pin < MAX_CPUS);
+			/* Double check that they didn't give us the same CPU
+			 * more than once */
+			for (cpu = 0; cpu < num_cpus_to_pin; cpu++) {
+				if (CPU_ISSET_S(cpus_to_pin[cpu], cpu_set_size,
+						cpu_set)) {
+					fprintf(stderr, "Any given CPU may "
+						"only be given once.\n");
+					goto err_code;
+				} else
+					CPU_SET_S(cpus_to_pin[cpu],
+						  cpu_set_size, cpu_set);
+			}
+			break;
+		case 'p':
+			/*
+			 * Although we can create a msg queue with a
+			 * non-absolute path name, unlink will fail. So,
+			 * if the name doesn't start with a /, add one
+			 * when we save it.
+			 */
+			option = queue_path;
+			if (*option != '/') {
+				/* +2: room for the leading '/' and the NUL. */
+				queue_path = malloc(strlen(option) + 2);
+				if (!queue_path) {
+					perror("malloc()");
+					goto err_code;
+				}
+				queue_path[0] = '/';
+				queue_path[1] = 0;
+				strcat(queue_path, option);
+				free(option);
+			}
+			break;
+		}
+	}
+
+	if (continuous_mode && num_cpus_to_pin == 0) {
+		fprintf(stderr, "Must pass at least one CPU to continuous "
+			"mode.\n");
+		poptPrintUsage(popt_context, stderr, 0);
+		goto err_code;
+	} else if (!continuous_mode) {
+		/* Default: one thread on the highest numbered online CPU. */
+		num_cpus_to_pin = 1;
+		cpus_to_pin[0] = cpus_online - 1;
+	}
+
+	max_msgs = fopen(MAX_MSGS, "r+");
+	max_msgsize = fopen(MAX_MSGSIZE, "r+");
+	if (!max_msgs)
+		shutdown(2, "Failed to open msg_max", __LINE__);
+	if (!max_msgsize)
+		shutdown(2, "Failed to open msgsize_max", __LINE__);
+
+	/* Load up the current system values for everything we can */
+	getr(RLIMIT_MSGQUEUE, &saved_limits);
+	cur_limits = saved_limits;
+	saved_max_msgs = cur_max_msgs = get(max_msgs);
+	saved_max_msgsize = cur_max_msgsize = get(max_msgsize);
+	/* getpriority() may legitimately return -1, so errno is the only
+	 * reliable error indicator. */
+	errno = 0;
+	cur_nice = getpriority(PRIO_PROCESS, 0);
+	if (errno)
+		shutdown(2, "getpriority()", __LINE__);
+
+	/* Tell the user our initial state */
+	printf("\nInitial system state:\n");
+	printf("\tUsing queue path:\t\t\t%s\n", queue_path);
+	printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t%ld\n",
+		(long) saved_limits.rlim_cur);
+	printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t%ld\n",
+		(long) saved_limits.rlim_max);
+	printf("\tMaximum Message Size:\t\t\t%d\n", saved_max_msgsize);
+	printf("\tMaximum Queue Size:\t\t\t%d\n", saved_max_msgs);
+	printf("\tNice value:\t\t\t\t%d\n", cur_nice);
+	printf("\n");
+
+	increase_limits();
+
+	printf("Adjusted system state for testing:\n");
+	if (cur_limits.rlim_cur == RLIM_INFINITY) {
+		printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t(unlimited)\n");
+		printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t(unlimited)\n");
+	} else {
+		printf("\tRLIMIT_MSGQUEUE(soft):\t\t\t%ld\n",
+		       (long) cur_limits.rlim_cur);
+		printf("\tRLIMIT_MSGQUEUE(hard):\t\t\t%ld\n",
+		       (long) cur_limits.rlim_max);
+	}
+	printf("\tMaximum Message Size:\t\t\t%d\n", cur_max_msgsize);
+	printf("\tMaximum Queue Size:\t\t\t%d\n", cur_max_msgs);
+	printf("\tNice value:\t\t\t\t%d\n", cur_nice);
+	printf("\tContinuous mode:\t\t\t(%s)\n", continuous_mode ?
+	       (continuous_mode_fake ? "fake mode" : "enabled") :
+	       "disabled");
+	printf("\tCPUs to pin:\t\t\t\t%d", cpus_to_pin[0]);
+	for (cpu = 1; cpu < num_cpus_to_pin; cpu++)
+		printf(",%d", cpus_to_pin[cpu]);
+	printf("\n");
+
+	/*
+	 * SIGUSR1 gets its own handler; SIGHUP, SIGINT, SIGQUIT and SIGTERM
+	 * share sig_action.  Those four signals are blocked while any of the
+	 * handlers runs.
+	 */
+	sa.sa_sigaction = sig_action_SIGUSR1;
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGHUP);
+	sigaddset(&sa.sa_mask, SIGINT);
+	sigaddset(&sa.sa_mask, SIGQUIT);
+	sigaddset(&sa.sa_mask, SIGTERM);
+	sa.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGUSR1, &sa, NULL) == -1)
+		shutdown(1, "sigaction(SIGUSR1)", __LINE__);
+	sa.sa_sigaction = sig_action;
+	if (sigaction(SIGHUP, &sa, NULL) == -1)
+		shutdown(1, "sigaction(SIGHUP)", __LINE__);
+	if (sigaction(SIGINT, &sa, NULL) == -1)
+		shutdown(1, "sigaction(SIGINT)", __LINE__);
+	if (sigaction(SIGQUIT, &sa, NULL) == -1)
+		shutdown(1, "sigaction(SIGQUIT)", __LINE__);
+	if (sigaction(SIGTERM, &sa, NULL) == -1)
+		shutdown(1, "sigaction(SIGTERM)", __LINE__);
+
+	/* Fake continuous mode does not open the queue at all. */
+	if (!continuous_mode_fake) {
+		attr.mq_flags = O_NONBLOCK;
+		attr.mq_maxmsg = cur_max_msgs;
+		attr.mq_msgsize = MSG_SIZE;
+		open_queue(&attr);
+	}
+	for (i = 0; i < num_cpus_to_pin; i++) {
+		pthread_attr_t thread_attr;
+		void *thread_func;
+
+		if (continuous_mode_fake)
+			thread_func = &fake_cont_thread;
+		else if (continuous_mode)
+			thread_func = &cont_thread;
+		else
+			thread_func = &perf_test_thread;
+
+		/* cpu_set is reused: it holds exactly one CPU per thread. */
+		CPU_ZERO_S(cpu_set_size, cpu_set);
+		CPU_SET_S(cpus_to_pin[i], cpu_set_size, cpu_set);
+		pthread_attr_init(&thread_attr);
+		pthread_attr_setaffinity_np(&thread_attr, cpu_set_size,
+					    cpu_set);
+		if (pthread_create(&cpu_threads[i], &thread_attr, thread_func,
+				   NULL))
+			shutdown(1, "pthread_create()", __LINE__);
+		pthread_attr_destroy(&thread_attr);
+	}
+
+	if (!continuous_mode) {
+		pthread_join(cpu_threads[0], &retval);
+		shutdown((long)retval, "perf_test_thread()", __LINE__);
+	} else {
+		while (1)
+			sleep(1);
+	}
+	shutdown(0, "", 0);
+
+err_code:
+	CPU_FREE(cpu_set);
+	exit(1);
+
+}
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
new file mode 100644
index 000000000..61ae899cf
--- /dev/null
+++ b/tools/testing/selftests/net/.gitignore
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ipsec
+msg_zerocopy
+socket
+psock_fanout
+psock_snd
+psock_tpacket
+reuseport_addr_any
+reuseport_bpf
+reuseport_bpf_cpu
+reuseport_bpf_numa
+reuseport_dualstack
+reuseaddr_conflict
+tcp_mmap
+udpgso
+udpgso_bench_rx
+udpgso_bench_tx
+tcp_inq
+tls
+txring_overwrite
+ip_defrag
+ipv6_flowlabel
+ipv6_flowlabel_mgr
+so_txtime
+tcp_fastopen_backup_key
+nettest
+fin_ack_lat
+reuseaddr_ports_exhausted
+hwtstamp_config
+rxtimestamp
+timestamping
+txtimestamp
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
new file mode 100644
index 000000000..ef352477c
--- /dev/null
+++ b/tools/testing/selftests/net/Makefile
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for net selftests
+
+CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
+# Extra include path, four directory levels up (usr/include/).
+CFLAGS += -I../../../../usr/include/
+
+# Test scripts; the TEST_* variables are presumably consumed by ../lib.mk,
+# which is included below.
+TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh \
+ rtnetlink.sh xfrm_policy.sh test_blackhole_dev.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh ip_defrag.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh msg_zerocopy.sh psock_snd.sh
+TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_any.sh
+TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh
+TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh
+TEST_PROGS += fin_ack_lat.sh fib_nexthop_multiprefix.sh fib_nexthops.sh
+TEST_PROGS += altnames.sh icmp_redirect.sh ip6_gre_headroom.sh
+TEST_PROGS += route_localnet.sh
+TEST_PROGS += reuseaddr_ports_exhausted.sh
+TEST_PROGS += txtimestamp.sh
+TEST_PROGS += vrf-xfrm-tests.sh
+TEST_PROGS += rxtimestamp.sh
+TEST_PROGS += devlink_port_split.py
+TEST_PROGS += drop_monitor_tests.sh
+TEST_PROGS += vrf_route_leaking.sh
+TEST_PROGS_EXTENDED := in_netns.sh
+# Binaries built from C sources in this directory.
+TEST_GEN_FILES = socket nettest
+TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
+TEST_GEN_FILES += tcp_mmap tcp_inq psock_snd txring_overwrite
+TEST_GEN_FILES += udpgso udpgso_bench_tx udpgso_bench_rx ip_defrag
+TEST_GEN_FILES += so_txtime ipv6_flowlabel ipv6_flowlabel_mgr
+TEST_GEN_FILES += tcp_fastopen_backup_key
+TEST_GEN_FILES += fin_ack_lat
+TEST_GEN_FILES += reuseaddr_ports_exhausted
+TEST_GEN_FILES += hwtstamp_config rxtimestamp timestamping txtimestamp
+TEST_GEN_FILES += ipsec
+TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
+TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+# Per-target libraries: only these binaries link against libnuma/libpthread.
+$(OUTPUT)/reuseport_bpf_numa: LDLIBS += -lnuma
+$(OUTPUT)/tcp_mmap: LDLIBS += -lpthread
+$(OUTPUT)/tcp_inq: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/net/altnames.sh b/tools/testing/selftests/net/altnames.sh
new file mode 100755
index 000000000..1ef9e4159
--- /dev/null
+++ b/tools/testing/selftests/net/altnames.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+lib_dir=$(dirname $0)/forwarding
+
+ALL_TESTS="altnames_test"
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+
+DUMMY_DEV=dummytest
+SHORT_NAME=shortname
+LONG_NAME=someveryveryveryveryveryverylongname
+
+# Add a short and a long alternative name to $DUMMY_DEV, verify that
+# "ip link show" resolves both and reports them in its JSON output, then
+# delete the short one and verify that it no longer resolves.
+altnames_test()
+{
+	local link_json altname
+
+	RET=0
+
+	ip link property add $DUMMY_DEV altname $SHORT_NAME
+	check_err $? "Failed to add short alternative name"
+
+	link_json=$(ip -j -p link show $SHORT_NAME)
+	check_err $? "Failed to do link show with short alternative name"
+
+	altname=$(echo $link_json | jq -e -r ".[0].altnames[0]")
+	check_err $? "Failed to get short alternative name from link show JSON"
+
+	[[ "$altname" == "$SHORT_NAME" ]]
+	check_err $? "Got unexpected short alternative name from link show JSON"
+
+	ip -j -p link show $DUMMY_DEV > /dev/null 2>&1
+	check_err $? "Failed to do link show with original name"
+
+	ip link property add $DUMMY_DEV altname $LONG_NAME
+	check_err $? "Failed to add long alternative name"
+
+	link_json=$(ip -j -p link show $LONG_NAME)
+	check_err $? "Failed to do link show with long alternative name"
+
+	# The long name was added second, hence index 1.
+	altname=$(echo $link_json | jq -e -r ".[0].altnames[1]")
+	check_err $? "Failed to get long alternative name from link show JSON"
+
+	[[ "$altname" == "$LONG_NAME" ]]
+	check_err $? "Got unexpected long alternative name from link show JSON"
+
+	ip link property del $DUMMY_DEV altname $SHORT_NAME
+	check_err $? "Failed to delete short alternative name"
+
+	ip -j -p link show $SHORT_NAME > /dev/null 2>&1
+	check_fail $? "Unexpected success while trying to do link show with deleted short alternative name"
+
+	# long name is left there on purpose to be removed alongside the device
+
+	log_test "altnames test"
+}
+
+setup_prepare()
+{
+ ip link add name $DUMMY_DEV type dummy
+}
+
+cleanup()
+{
+ pre_cleanup
+ ip link del name $DUMMY_DEV
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
new file mode 100644
index 000000000..4d5df8e1e
--- /dev/null
+++ b/tools/testing/selftests/net/config
@@ -0,0 +1,36 @@
+CONFIG_USER_NS=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_TEST_BPF=m
+CONFIG_NUMA=y
+CONFIG_NET_VRF=y
+CONFIG_NET_L3_MASTER_DEV=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_VETH=y
+CONFIG_NET_IPVTI=y
+CONFIG_IPV6_VTI=y
+CONFIG_DUMMY=y
+CONFIG_BRIDGE=y
+CONFIG_VLAN_8021Q=y
+CONFIG_IFB=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_NAT=m
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP6_NF_NAT=m
+CONFIG_IP_NF_NAT=m
+CONFIG_NF_TABLES=m
+CONFIG_NF_TABLES_IPV6=y
+CONFIG_NF_TABLES_IPV4=y
+CONFIG_NFT_NAT=m
+CONFIG_NET_SCH_FQ=m
+CONFIG_NET_SCH_ETF=m
+CONFIG_NET_SCH_NETEM=y
+CONFIG_TEST_BLACKHOLE_DEV=m
+CONFIG_KALLSYMS=y
+CONFIG_TRACEPOINTS=y
+CONFIG_NET_DROP_MONITOR=m
+CONFIG_NETDEVSIM=m
+CONFIG_NET_FOU=m
diff --git a/tools/testing/selftests/net/devlink_port_split.py b/tools/testing/selftests/net/devlink_port_split.py
new file mode 100755
index 000000000..f0fbd7367
--- /dev/null
+++ b/tools/testing/selftests/net/devlink_port_split.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+from subprocess import PIPE, Popen
+import json
+import time
+import argparse
+import collections
+import sys
+
+#
+# Test port split configuration using devlink-port lanes attribute.
+# The test is skipped in case the attribute is not available.
+#
+# First, check that all the ports with 1 lane fail to split.
+# Second, check that all the ports with more than 1 lane can be split
+# to all valid configurations (e.g., split to 2, split to 4 etc.)
+#
+
+
+Port = collections.namedtuple('Port', 'bus_info name')
+
+
+def run_command(cmd, should_fail=False):
+ """
+ Run a command in subprocess.
+ Return: Tuple of (stdout, stderr).
+ """
+
+ p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+ stdout, stderr = p.communicate()
+ stdout, stderr = stdout.decode(), stderr.decode()
+
+ if stderr != "" and not should_fail:
+ print("Error sending command: %s" % cmd)
+ print(stdout)
+ print(stderr)
+ return stdout, stderr
+
+
+class devlink_ports(object):
+ """
+ Class that holds information on the devlink ports, required to the tests;
+ if_names: A list of interfaces in the devlink ports.
+ """
+
+ def get_if_names(dev):
+ """
+ Get a list of physical devlink ports.
+ Return: Array of tuples (bus_info/port, if_name).
+ """
+
+ arr = []
+
+ cmd = "devlink -j port show"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ ports = json.loads(stdout)['port']
+
+ validate_devlink_output(ports, 'flavour')
+
+ for port in ports:
+ if dev in port:
+ if ports[port]['flavour'] == 'physical':
+ arr.append(Port(bus_info=port, name=ports[port]['netdev']))
+
+ return arr
+
+ def __init__(self, dev):
+ self.if_names = devlink_ports.get_if_names(dev)
+
+
+def get_max_lanes(port):
+ """
+ Get the $port's maximum number of lanes.
+ Return: number of lanes, e.g. 1, 2, 4 and 8.
+ """
+
+ cmd = "devlink -j port show %s" % port
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ values = list(json.loads(stdout)['port'].values())[0]
+
+ if 'lanes' in values:
+ lanes = values['lanes']
+ else:
+ lanes = 0
+ return lanes
+
+
+def get_split_ability(port):
+ """
+ Get the $port split ability.
+ Return: split ability, true or false.
+ """
+
+ cmd = "devlink -j port show %s" % port.name
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+ values = list(json.loads(stdout)['port'].values())[0]
+
+ return values['splittable']
+
+
+def split(k, port, should_fail=False):
+ """
+ Split $port into $k ports.
+ If should_fail == True, the split should fail. Otherwise, should pass.
+ Return: Array of sub ports after splitting.
+ If the $port wasn't split, the array will be empty.
+ """
+
+ cmd = "devlink port split %s count %s" % (port.bus_info, k)
+ stdout, stderr = run_command(cmd, should_fail=should_fail)
+
+ if should_fail:
+ if not test(stderr != "", "%s is unsplittable" % port.name):
+ print("split an unsplittable port %s" % port.name)
+ return create_split_group(port, k)
+ else:
+ if stderr == "":
+ return create_split_group(port, k)
+ print("didn't split a splittable port %s" % port.name)
+
+ return []
+
+
+def unsplit(port):
+ """
+ Unsplit $port.
+ """
+
+ cmd = "devlink port unsplit %s" % port
+ stdout, stderr = run_command(cmd)
+ test(stderr == "", "Unsplit port %s" % port)
+
+
+def exists(port, dev):
+ """
+ Check if $port exists in the devlink ports.
+ Return: True is so, False otherwise.
+ """
+
+ return any(dev_port.name == port
+ for dev_port in devlink_ports.get_if_names(dev))
+
+
+def exists_and_lanes(ports, lanes, dev):
+ """
+ Check if every port in the list $ports exists in the devlink ports and has
+ $lanes number of lanes after splitting.
+ Return: True if both are True, False otherwise.
+ """
+
+ for port in ports:
+ max_lanes = get_max_lanes(port)
+ if not exists(port, dev):
+ print("port %s doesn't exist in devlink ports" % port)
+ return False
+ if max_lanes != lanes:
+ print("port %s has %d lanes, but %s were expected"
+ % (port, lanes, max_lanes))
+ return False
+ return True
+
+
+def test(cond, msg):
+ """
+ Check $cond and print a message accordingly.
+ Return: True is pass, False otherwise.
+ """
+
+ if cond:
+ print("TEST: %-60s [ OK ]" % msg)
+ else:
+ print("TEST: %-60s [FAIL]" % msg)
+
+ return cond
+
+
+def create_split_group(port, k):
+ """
+ Create the split group for $port.
+ Return: Array with $k elements, which are the split port group.
+ """
+
+ return list(port.name + "s" + str(i) for i in range(k))
+
+
+def split_unsplittable_port(port, k):
+ """
+ Test that splitting of unsplittable port fails.
+ """
+
+ # split to max
+ new_split_group = split(k, port, should_fail=True)
+
+ if new_split_group != []:
+ unsplit(port.bus_info)
+
+
+def split_splittable_port(port, k, lanes, dev):
+ """
+ Test that splitting of splittable port passes correctly.
+
+ Splits $port into $k sub ports, waits for udev to settle, and checks
+ that every sub port exists and reports $lanes / $k lanes.
+ """
+
+ new_split_group = split(k, port)
+
+ # Once the split command ends, it takes some time to the sub ifaces'
+ # to get their names. Use udevadm to continue only when all current udev
+ # events are handled.
+ cmd = "udevadm settle"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+
+ # An empty group means split() reported that the port was not split.
+ if new_split_group != []:
+ # NOTE(review): lanes/k is true division, so a float is passed on.
+ test(exists_and_lanes(new_split_group, lanes/k, dev),
+ "split port %s into %s" % (port.name, k))
+
+ # NOTE(review): indentation is lost in this view; confirm whether the
+ # unsplit below is unconditional or belongs to the if above.
+ unsplit(port.bus_info)
+
+
+def validate_devlink_output(devlink_data, target_property=None):
+ """
+ Determine if test should be skipped by checking:
+ 1. devlink_data contains values
+ 2. The target_property exist in devlink_data
+ """
+ skip_reason = None
+ if any(devlink_data.values()):
+ if target_property:
+ skip_reason = "{} not found in devlink output, test skipped".format(target_property)
+ for key in devlink_data:
+ if target_property in devlink_data[key]:
+ skip_reason = None
+ else:
+ skip_reason = 'devlink output is empty, test skipped'
+
+ if skip_reason:
+ print(skip_reason)
+ sys.exit(KSFT_SKIP)
+
+
+def make_parser():
+ parser = argparse.ArgumentParser(description='A test for port splitting.')
+ parser.add_argument('--dev',
+ help='The devlink handle of the device under test. ' +
+ 'The default is the first registered devlink ' +
+ 'handle.')
+
+ return parser
+
+
+def main(cmdline=None):
+ """
+ Run the port split tests on the device given with --dev, or on the
+ first device listed by "devlink -j dev show" when --dev is absent.
+ """
+ parser = make_parser()
+ args = parser.parse_args(cmdline)
+
+ dev = args.dev
+ if not dev:
+ cmd = "devlink -j dev show"
+ stdout, stderr = run_command(cmd)
+ assert stderr == ""
+
+ # Exits with the skip code when the devlink output is empty.
+ validate_devlink_output(json.loads(stdout))
+ devs = json.loads(stdout)['dev']
+ dev = list(devs.keys())[0]
+
+ # Any stderr output is taken to mean that the device does not exist.
+ cmd = "devlink dev show %s" % dev
+ stdout, stderr = run_command(cmd)
+ if stderr != "":
+ print("devlink device %s can not be found" % dev)
+ sys.exit(1)
+
+ ports = devlink_ports(dev)
+
+ found_max_lanes = False
+ for port in ports.if_names:
+ max_lanes = get_max_lanes(port.name)
+
+ # If max lanes is 0, do not test port splitting at all
+ if max_lanes == 0:
+ continue
+
+ # If 1 lane, shouldn't be able to split
+ elif max_lanes == 1:
+ test(not get_split_ability(port),
+ "%s should not be able to split" % port.name)
+ split_unsplittable_port(port, max_lanes)
+
+ # Else, splitting should pass and all the split ports should exist.
+ else:
+ lane = max_lanes
+ test(get_split_ability(port),
+ "%s should be able to split" % port.name)
+ # Try every split count from max_lanes down to 2, halving each time.
+ while lane > 1:
+ split_splittable_port(port, lane, max_lanes, dev)
+
+ lane //= 2
+ found_max_lanes = True
+
+ # No port reported a lane count: nothing was tested, so skip.
+ if not found_max_lanes:
+ print(f"Test not started, no port of device {dev} reports max_lanes")
+ sys.exit(KSFT_SKIP)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh
new file mode 100755
index 000000000..b7650e30d
--- /dev/null
+++ b/tools/testing/selftests/net/drop_monitor_tests.sh
@@ -0,0 +1,215 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking drop monitor functionality.
+
+ret=0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# all tests in this script. Can be overridden with -t option
+TESTS="
+ sw_drops
+ hw_drops
+"
+
+IP="ip -netns ns1"
+TC="tc -netns ns1"
+DEVLINK="devlink -N ns1"
+NS_EXEC="ip netns exec ns1"
+NETDEVSIM_PATH=/sys/bus/netdevsim/
+DEV_ADDR=1337
+DEV=netdevsim${DEV_ADDR}
+DEVLINK_DEV=netdevsim/${DEV}
+
+# Print one result line and update the pass/fail counters.
+#   $1 - exit status that was observed
+#   $2 - exit status that was expected
+#   $3 - test description
+# A mismatch also sets the global ret to 1.
+log_test()
+{
+	local actual=$1
+	local want=$2
+	local desc="$3"
+
+	if ! [ ${actual} -eq ${want} ]; then
+		ret=1
+		nfail=$((nfail+1))
+		printf " TEST: %-60s [FAIL]\n" "${desc}"
+		return
+	fi
+
+	printf " TEST: %-60s [ OK ]\n" "${desc}"
+	nsuccess=$((nsuccess+1))
+}
+
+# Create namespace ns1 with a dummy device and a netdevsim device, and
+# bring the netdevsim netdev up.  Runs under "set -e", so any failing
+# step aborts the script.
+setup()
+{
+	local netdev
+
+	modprobe netdevsim &> /dev/null
+
+	set -e
+	ip netns add ns1
+	$IP link add dummy10 up type dummy
+
+	# The redirection is performed by this shell; only echo runs in ns1.
+	$NS_EXEC echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+	udevadm settle
+	# Assigned separately from "local": "local var=$(cmd)" always
+	# succeeds and would hide a failing ls from "set -e".
+	netdev=$($NS_EXEC ls ${NETDEVSIM_PATH}/devices/${DEV}/net/)
+	$IP link set dev $netdev up
+
+	set +e
+}
+
+cleanup()
+{
+ $NS_EXEC echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+ ip netns del ns1
+}
+
+# Check that dwdump captures software drops while traffic is being dropped
+# and captures none once the traffic generator has been stopped.
+sw_drops_test()
+{
+ echo
+ echo "Software drops test"
+
+ setup
+
+ local dir=$(mktemp -d)
+
+ # Drop every IPv4 packet to 192.0.2.10 leaving dummy10.
+ $TC qdisc add dev dummy10 clsact
+ $TC filter add dev dummy10 egress pref 1 handle 101 proto ip \
+ flower dst_ip 192.0.2.10 action drop
+
+ # Background traffic generator sending UDP to the dropped address.
+ $NS_EXEC mausezahn dummy10 -a 00:11:22:33:44:55 -b 00:aa:bb:cc:dd:ee \
+ -A 192.0.2.1 -B 192.0.2.10 -t udp sp=12345,dp=54321 -c 0 -q \
+ -d 100msec &
+ # Capture for at most 5 seconds; (( ... )) returns 0 when at least
+ # one captured packet matches the filter.
+ timeout 5 dwdump -o sw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) != 0))
+ log_test $? 0 "Capturing active software drops"
+
+ rm ${dir}/packets.pcap
+
+ # Stop the background job (%% is the current job) and capture again;
+ # this time no matching packet is expected.
+ { kill %% && wait %%; } 2>/dev/null
+ timeout 5 dwdump -o sw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0))
+ log_test $? 0 "Capturing inactive software drops"
+
+ rm -r $dir
+
+ cleanup
+}
+
+# Check that dwdump captures blackhole_route hardware drops while the
+# devlink trap action is "trap" and none after it is set to "drop".
+hw_drops_test()
+{
+ echo
+ echo "Hardware drops test"
+
+ setup
+
+ local dir=$(mktemp -d)
+
+ # Capture for at most 5 seconds; (( ... )) returns 0 when at least
+ # one captured packet carries the blackhole_route trap name.
+ $DEVLINK trap set $DEVLINK_DEV trap blackhole_route action trap
+ timeout 5 dwdump -o hw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
+ | wc -l) != 0))
+ log_test $? 0 "Capturing active hardware drops"
+
+ rm ${dir}/packets.pcap
+
+ # With the action set to "drop" no matching packet is expected.
+ $DEVLINK trap set $DEVLINK_DEV trap blackhole_route action drop
+ timeout 5 dwdump -o hw -w ${dir}/packets.pcap
+ (( $(tshark -r ${dir}/packets.pcap \
+ -Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
+ | wc -l) == 0))
+ log_test $? 0 "Capturing inactive hardware drops"
+
+ rm -r $dir
+
+ cleanup
+}
+
+################################################################################
+# usage
+
+# Print the command line help; the script name is taken from $0 without
+# its directory part and the test list from the current value of TESTS.
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -t <test> Test(s) to run (default: all)
+ (options: $TESTS)
+EOF
+}
+
+################################################################################
+# main
+
+# Command line: -t selects the tests, -h prints the help, anything else
+# is an error.
+while getopts ":t:h" opt; do
+	case $opt in
+	h)
+		usage
+		exit 0
+		;;
+	t)
+		TESTS=$OPTARG
+		;;
+	*)
+		usage
+		exit 1
+		;;
+	esac
+done
+
+# Prerequisites: root privileges and every external tool the tests call.
+# A missing prerequisite skips the whole script.
+if [ "$(id -u)" -ne 0 ]; then
+	echo "SKIP: Need root privileges"
+	exit $ksft_skip
+fi
+
+for tool in ip devlink tshark dwdump udevadm timeout mausezahn; do
+	if [ ! -x "$(command -v ${tool})" ]; then
+		echo "SKIP: Could not run test without ${tool} tool"
+		exit $ksft_skip
+	fi
+done
+
+# tshark must list net_dm fields, otherwise the display filters used by
+# the tests cannot work.
+if ! tshark -G fields 2> /dev/null | grep -q net_dm; then
+	echo "SKIP: tshark too old, missing net_dm dissector"
+	exit $ksft_skip
+fi
+
+# start clean
+cleanup &> /dev/null
+
+for t in $TESTS; do
+	case $t in
+	sw_drops|sw)
+		sw_drops_test
+		;;
+	hw_drops|hw)
+		hw_drops_test
+		;;
+	help)
+		echo "Test names: $TESTS"
+		exit 0
+		;;
+	esac
+done
+
+if [ "$TESTS" != "none" ]; then
+	printf "\nTests passed: %3d\n" ${nsuccess}
+	printf "Tests failed: %3d\n" ${nfail}
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/fcnal-test.sh b/tools/testing/selftests/net/fcnal-test.sh
new file mode 100755
index 000000000..e13b0fb63
--- /dev/null
+++ b/tools/testing/selftests/net/fcnal-test.sh
@@ -0,0 +1,4034 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 David Ahern <dsahern@gmail.com>. All rights reserved.
+#
+# IPv4 and IPv6 functional tests focusing on VRF and routing lookups
+# for various permutations:
+# 1. icmp, tcp, udp and netfilter
+# 2. client, server, no-server
+# 3. global address on interface
+# 4. global address on 'lo'
+# 5. remote and local traffic
+# 6. VRF and non-VRF permutations
+#
+# Setup:
+# ns-A | ns-B
+# No VRF case:
+# [ lo ] [ eth1 ]---|---[ eth1 ] [ lo ]
+# remote address
+# VRF case:
+# [ red ]---[ eth1 ]---|---[ eth1 ] [ lo ]
+#
+# ns-A:
+# eth1: 172.16.1.1/24, 2001:db8:1::1/64
+# lo: 127.0.0.1/8, ::1/128
+# 172.16.2.1/32, 2001:db8:2::1/128
+# red: 127.0.0.1/8, ::1/128
+# 172.16.3.1/32, 2001:db8:3::1/128
+#
+# ns-B:
+# eth1: 172.16.1.2/24, 2001:db8:1::2/64
+# lo: 127.0.0.1/8, ::1/128
+# 172.16.2.2/32, 2001:db8:2::2/128
+#
+# ns-A to ns-C connection - only for VRF and same config
+# as ns-A to ns-B
+#
+# server / client nomenclature relative to ns-A
+
+# Set to 1 for command echo and extra output from the log helpers.
+VERBOSE=0
+
+# Interface names inside the namespaces, and the VRF used in ns-A.
+NSA_DEV=eth1
+NSA_DEV2=eth2
+NSB_DEV=eth1
+NSC_DEV=eth2
+VRF=red
+VRF_TABLE=1101
+
+# IPv4 config
+NSA_IP=172.16.1.1
+NSB_IP=172.16.1.2
+VRF_IP=172.16.3.1
+NS_NET=172.16.1.0/24
+
+# IPv6 config
+NSA_IP6=2001:db8:1::1
+NSB_IP6=2001:db8:1::2
+VRF_IP6=2001:db8:3::1
+NS_NET6=2001:db8:1::/120
+
+# Additional addresses on the loopback devices
+NSA_LO_IP=172.16.2.1
+NSB_LO_IP=172.16.2.2
+NSA_LO_IP6=2001:db8:2::1
+NSB_LO_IP6=2001:db8:2::2
+
+MD5_PW=abc123
+MD5_WRONG_PW=abc1234
+
+MCAST=ff02::1
+# set after namespace create
+NSA_LINKIP6=
+NSB_LINKIP6=
+
+NSA=ns-A
+NSB=ns-B
+NSC=ns-C
+
+# Command prefixes that run a program inside the respective namespace.
+NSA_CMD="ip netns exec ${NSA}"
+NSB_CMD="ip netns exec ${NSB}"
+NSC_CMD="ip netns exec ${NSC}"
+
+# Use ping6 when it is installed and fall back to ping otherwise.
+# "command -v" is the POSIX way to locate a program; "which" is an
+# external tool that is not guaranteed to be installed.
+if command -v ping6 > /dev/null 2>&1; then
+	ping6=$(command -v ping6)
+else
+	ping6=$(command -v ping)
+fi
+
+# Check if FIPS mode is enabled
+if [ -f /proc/sys/crypto/fips_enabled ]; then
+	fips_enabled=$(cat /proc/sys/crypto/fips_enabled)
+else
+	fips_enabled=0
+fi
+
+################################################################################
+# utilities
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ [ "${VERBOSE}" = "1" ] && echo
+
+ if [ ${rc} -eq ${expected} ]; then
+ nsuccess=$((nsuccess+1))
+ printf "TEST: %-70s [ OK ]\n" "${msg}"
+ else
+ nfail=$((nfail+1))
+ printf "TEST: %-70s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+
+ if [ "${PAUSE}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+
+ kill_procs
+}
+
+# Like log_test, with the description extended by a readable name for
+# the address under test.
+#   $1 - address, $2 - observed status, $3 - expected status,
+#   $4 - test description
+log_test_addr()
+{
+	local addr=$1 rc=$2 expected=$3
+	local msg="$4"
+	local label
+
+	label=$(addr2str ${addr})
+	log_test $rc $expected "$msg - ${label}"
+}
+
+# Print the arguments as a section title framed by two rulers.
+log_section()
+{
+	local ruler="###########################################################################"
+
+	echo
+	echo "${ruler}"
+	echo "$*"
+	echo "${ruler}"
+	echo
+}
+
+# Print the arguments as a subsection title below a single ruler.
+log_subsection()
+{
+	local ruler="#################################################################"
+
+	echo
+	echo "${ruler}"
+	echo "$*"
+	echo
+}
+
+# Start of a test case: stop leftover test programs and, in verbose
+# mode, print a separator.
+log_start()
+{
+	# make sure we have no test instances running
+	kill_procs
+
+	[ "${VERBOSE}" = "1" ] || return 0
+
+	echo
+	echo "#######################################################"
+}
+
+# Print the arguments surrounded by blank lines, in verbose mode only.
+log_debug()
+{
+	[ "${VERBOSE}" = "1" ] || return 0
+
+	echo
+	echo "$*"
+	echo
+}
+
+# Print the arguments as a "HINT:" line, in verbose mode only.
+show_hint()
+{
+	[ "${VERBOSE}" = "1" ] || return 0
+
+	echo "HINT: $*"
+	echo
+}
+
+kill_procs()
+{
+ killall nettest ping ping6 >/dev/null 2>&1
+ sleep 1
+}
+
+do_run_cmd()
+{
+ local cmd="$*"
+ local out
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: ${cmd}"
+ fi
+
+ out=$($cmd 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+ echo "$out"
+ fi
+
+ return $rc
+}
+
+run_cmd()
+{
+ do_run_cmd ${NSA_CMD} $*
+}
+
+run_cmd_nsb()
+{
+ do_run_cmd ${NSB_CMD} $*
+}
+
+run_cmd_nsc()
+{
+ do_run_cmd ${NSC_CMD} $*
+}
+
+setup_cmd()
+{
+ local cmd="$*"
+ local rc
+
+ run_cmd ${cmd}
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # show user the command if not done so already
+ if [ "$VERBOSE" = "0" ]; then
+ echo "setup command: $cmd"
+ fi
+ echo "failed. stopping tests"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue"
+ read a
+ fi
+ exit $rc
+ fi
+}
+
+setup_cmd_nsb()
+{
+ local cmd="$*"
+ local rc
+
+ run_cmd_nsb ${cmd}
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # show user the command if not done so already
+ if [ "$VERBOSE" = "0" ]; then
+ echo "setup command: $cmd"
+ fi
+ echo "failed. stopping tests"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue"
+ read a
+ fi
+ exit $rc
+ fi
+}
+
+setup_cmd_nsc()
+{
+ local cmd="$*"
+ local rc
+
+ run_cmd_nsc ${cmd}
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ # show user the command if not done so already
+ if [ "$VERBOSE" = "0" ]; then
+ echo "setup command: $cmd"
+ fi
+ echo "failed. stopping tests"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue"
+ read a
+ fi
+ exit $rc
+ fi
+}
+
+# set sysctl values in NS-A
+set_sysctl()
+{
+ echo "SYSCTL: $*"
+ echo
+ run_cmd sysctl -q -w $*
+}
+
+################################################################################
+# Setup for tests
+
+# Translate an address used by the tests into a readable label.
+#   $1 - address; link-local and multicast addresses may carry a
+#        "%<device>" suffix
+# Prints the label, or "unknown" for any other address.
+addr2str()
+{
+	local label
+
+	case "$1" in
+	127.0.0.1)	label="loopback";;
+	::1)		label="IPv6 loopback";;
+
+	${NSA_IP})	label="ns-A IP";;
+	${NSA_IP6})	label="ns-A IPv6";;
+	${NSA_LO_IP})	label="ns-A loopback IP";;
+	${NSA_LO_IP6})	label="ns-A loopback IPv6";;
+	${NSA_LINKIP6}|${NSA_LINKIP6}%*)
+			label="ns-A IPv6 LLA";;
+
+	${NSB_IP})	label="ns-B IP";;
+	${NSB_IP6})	label="ns-B IPv6";;
+	${NSB_LO_IP})	label="ns-B loopback IP";;
+	${NSB_LO_IP6})	label="ns-B loopback IPv6";;
+	${NSB_LINKIP6}|${NSB_LINKIP6}%*)
+			label="ns-B IPv6 LLA";;
+
+	${VRF_IP})	label="VRF IP";;
+	${VRF_IP6})	label="VRF IPv6";;
+
+	${MCAST}%*)	label="multicast IP";;
+
+	*)		label="unknown";;
+	esac
+
+	echo "${label}"
+}
+
+# Print the IPv6 link-local address of a device, without prefix length.
+#   $1 - namespace, $2 - device
+# Returns 1 without output when no fe80 address is found.
+get_linklocal()
+{
+	local ns=$1 dev=$2
+	local lla
+
+	# Brief output: addresses start at the third field.
+	lla=$(ip -netns ${ns} -6 -br addr show dev ${dev} |
+		awk '{
+			for (i = 3; i <= NF; ++i) {
+				if ($i ~ /^fe80/)
+					print $i
+			}
+		}')
+	# Strip everything from the first "/" onwards.
+	lla=${lla/\/*}
+
+	if [ -z "$lla" ]; then
+		return 1
+	fi
+
+	echo $lla
+
+	return 0
+}
+
+################################################################################
+# create namespaces and vrf
+
+# Create a VRF device in a namespace and move the "local" rule behind it.
+#   $1 - namespace, $2 - VRF name, $3 - routing table id,
+#   $4 - IPv4 address for the VRF device, or "-" for none
+#   $5 - IPv6 address for the VRF device, or "-" for none
+create_vrf()
+{
+	local ns=$1 vrf=$2 table=$3
+	local addr=$4 addr6=$5
+	local ipns="ip -netns ${ns}"
+
+	${ipns} link add ${vrf} type vrf table ${table}
+	${ipns} link set ${vrf} up
+	${ipns} route add vrf ${vrf} unreachable default metric 8192
+	${ipns} -6 route add vrf ${vrf} unreachable default metric 8192
+
+	${ipns} addr add 127.0.0.1/8 dev ${vrf}
+	${ipns} -6 addr add ::1 dev ${vrf} nodad
+	[ "${addr}" = "-" ] || ${ipns} addr add dev ${vrf} ${addr}
+	[ "${addr6}" = "-" ] || ${ipns} -6 addr add dev ${vrf} ${addr6}
+
+	# Replace the pref 0 "local" rule by one at pref 32765, for both
+	# address families.
+	${ipns} ru del pref 0
+	${ipns} ru add pref 32765 from all lookup local
+	${ipns} -6 ru del pref 0
+	${ipns} -6 ru add pref 32765 from all lookup local
+}
+
+# Create a namespace with loopback up, optional extra loopback
+# addresses, unreachable default routes and forwarding enabled.
+#   $1 - namespace
+#   $2 - IPv4 address for lo, or "-" for none
+#   $3 - IPv6 address for lo, or "-" for none
+create_ns()
+{
+	local ns=$1 addr=$2 addr6=$3
+	local ipns="ip -netns ${ns}"
+	local key
+
+	ip netns add ${ns}
+
+	${ipns} link set lo up
+	if [ "${addr}" != "-" ]; then
+		${ipns} addr add dev lo ${addr}
+	fi
+	if [ "${addr6}" != "-" ]; then
+		${ipns} -6 addr add dev lo ${addr6}
+	fi
+
+	${ipns} ro add unreachable default metric 8192
+	${ipns} -6 ro add unreachable default metric 8192
+
+	for key in net.ipv4.ip_forward \
+		   net.ipv6.conf.all.keep_addr_on_down \
+		   net.ipv6.conf.all.forwarding \
+		   net.ipv6.conf.default.forwarding; do
+		ip netns exec ${ns} sysctl -qw ${key}=1
+	done
+}
+
+# Create a veth pair to connect two namespaces and apply addresses.
+# $1 - first namespace          $5 - second namespace
+# $2 - device name in $1        $6 - device name in $5
+# $3 - IPv4 address for $2      $7 - IPv4 address for $6
+# $4 - IPv6 address for $2      $8 - IPv6 address for $6
+# Any address may be "-" to leave it unconfigured. Each address has its own
+# guard, so a "-" on one side never reaches "ip addr add" (which would fail
+# and abort a caller running under "set -e").
+connect_ns()
+{
+	local ns1=$1
+	local ns1_dev=$2
+	local ns1_addr=$3
+	local ns1_addr6=$4
+	local ns2=$5
+	local ns2_dev=$6
+	local ns2_addr=$7
+	local ns2_addr6=$8
+
+	# the peer is created in ns1 under the temporary name "tmp", then
+	# moved into ns2 and renamed in a single step
+	ip -netns ${ns1} li add ${ns1_dev} type veth peer name tmp
+	ip -netns ${ns1} li set ${ns1_dev} up
+	ip -netns ${ns1} li set tmp netns ${ns2} name ${ns2_dev}
+	ip -netns ${ns2} li set ${ns2_dev} up
+
+	if [ "${ns1_addr}" != "-" ]; then
+		ip -netns ${ns1} addr add dev ${ns1_dev} ${ns1_addr}
+	fi
+	if [ "${ns2_addr}" != "-" ]; then
+		ip -netns ${ns2} addr add dev ${ns2_dev} ${ns2_addr}
+	fi
+
+	if [ "${ns1_addr6}" != "-" ]; then
+		ip -netns ${ns1} addr add dev ${ns1_dev} ${ns1_addr6}
+	fi
+	if [ "${ns2_addr6}" != "-" ]; then
+		ip -netns ${ns2} addr add dev ${ns2_dev} ${ns2_addr6}
+	fi
+}
+
+# Remove the test namespaces and, when ns-A exists, first tear down its
+# VRF and device step by step. Processes still running in a namespace are
+# killed before the namespace is deleted.
+cleanup()
+{
+	# explicit cleanups to check those code paths
+	if ip netns | grep -q ${NSA}; then
+		ip -netns ${NSA} link delete ${VRF}
+		ip -netns ${NSA} ro flush table ${VRF_TABLE}
+
+		ip -netns ${NSA} addr flush dev ${NSA_DEV}
+		ip -netns ${NSA} -6 addr flush dev ${NSA_DEV}
+		ip -netns ${NSA} link set dev ${NSA_DEV} down
+		ip -netns ${NSA} link del dev ${NSA_DEV}
+
+		ip netns pids ${NSA} | xargs kill 2>/dev/null
+		ip netns del ${NSA}
+	fi
+
+	# ns-B and ns-C are removed without checking whether they exist
+	ip netns pids ${NSB} | xargs kill 2>/dev/null
+	ip netns del ${NSB}
+
+	ip netns pids ${NSC} | xargs kill 2>/dev/null
+	ip netns del ${NSC} >/dev/null 2>&1
+}
+
+# Undo setup_vrf_dup: delete the second ns-A device and the ns-C namespace.
+# All errors are silenced so it is safe to call when nothing is set up.
+cleanup_vrf_dup()
+{
+	# NSA_DEV2 is created inside ${NSA} by connect_ns, so it must be
+	# deleted there; without -netns the delete is aimed at the namespace
+	# the script itself runs in
+	ip -netns ${NSA} link del ${NSA_DEV2} >/dev/null 2>&1
+	ip netns pids ${NSC} | xargs kill 2>/dev/null
+	ip netns del ${NSC} >/dev/null 2>&1
+}
+
+# Add ns-C for the VRF tests that need a duplicate peer: it is given the
+# same addresses as ns-B, but is reached through a second ns-A device
+# that is NOT in the VRF.
+setup_vrf_dup()
+{
+	local nsa_side="${NSA} ${NSA_DEV2} ${NSA_IP}/24 ${NSA_IP6}/64"
+	local nsc_side="${NSC} ${NSC_DEV} ${NSB_IP}/24 ${NSB_IP6}/64"
+
+	create_ns ${NSC} "-" "-"
+
+	# left unquoted on purpose: each side expands to four arguments
+	connect_ns ${nsa_side} ${nsc_side}
+}
+
+# Build the ns-A <-> ns-B topology used by most tests.
+# $1 - "yes" to put the ns-A device into a VRF; anything else (or nothing)
+#      configures plain routing.
+# Configuration commands run under "set -e", so a failing command aborts
+# the script; errexit is switched off again before returning.
+setup()
+{
+	local use_vrf=${1}
+
+	# make sure we are starting with a clean slate
+	kill_procs
+	cleanup 2>/dev/null
+
+	log_debug "Configuring network namespaces"
+	set -e
+
+	create_ns ${NSA} ${NSA_LO_IP}/32 ${NSA_LO_IP6}/128
+	create_ns ${NSB} ${NSB_LO_IP}/32 ${NSB_LO_IP6}/128
+	connect_ns ${NSA} ${NSA_DEV} ${NSA_IP}/24 ${NSA_IP6}/64 \
+		   ${NSB} ${NSB_DEV} ${NSB_IP}/24 ${NSB_IP6}/64
+
+	# remember the link-local addresses of both ends of the veth pair
+	NSA_LINKIP6=$(get_linklocal ${NSA} ${NSA_DEV})
+	NSB_LINKIP6=$(get_linklocal ${NSB} ${NSB_DEV})
+
+	# tell ns-A how to get to remote addresses of ns-B
+	case "${use_vrf}" in
+	yes)
+		create_vrf ${NSA} ${VRF} ${VRF_TABLE} ${VRF_IP} ${VRF_IP6}
+
+		ip -netns ${NSA} link set dev ${NSA_DEV} vrf ${VRF}
+		ip -netns ${NSA} ro add vrf ${VRF} ${NSB_LO_IP}/32 via ${NSB_IP} dev ${NSA_DEV}
+		ip -netns ${NSA} -6 ro add vrf ${VRF} ${NSB_LO_IP6}/128 via ${NSB_IP6} dev ${NSA_DEV}
+
+		# ns-B also needs routes back to the VRF addresses
+		ip -netns ${NSB} ro add ${VRF_IP}/32 via ${NSA_IP} dev ${NSB_DEV}
+		ip -netns ${NSB} -6 ro add ${VRF_IP6}/128 via ${NSA_IP6} dev ${NSB_DEV}
+		;;
+	*)
+		ip -netns ${NSA} ro add ${NSB_LO_IP}/32 via ${NSB_IP} dev ${NSA_DEV}
+		ip -netns ${NSA} ro add ${NSB_LO_IP6}/128 via ${NSB_IP6} dev ${NSA_DEV}
+		;;
+	esac
+
+	# tell ns-B how to get to remote addresses of ns-A
+	ip -netns ${NSB} ro add ${NSA_LO_IP}/32 via ${NSA_IP} dev ${NSB_DEV}
+	ip -netns ${NSB} ro add ${NSA_LO_IP6}/128 via ${NSA_IP6} dev ${NSB_DEV}
+
+	set +e
+
+	sleep 1
+}
+
+# Build a topology with no configured addresses at all: ns-A is connected
+# to ns-B and ns-C, only the link-local addresses are recorded, and both
+# ns-A devices are placed in the VRF.
+# Configuration commands run under "set -e"; errexit is switched off again
+# before returning.
+setup_lla_only()
+{
+	local ns
+	local dev
+
+	# make sure we are starting with a clean slate
+	kill_procs
+	cleanup 2>/dev/null
+
+	log_debug "Configuring network namespaces"
+	set -e
+
+	for ns in ${NSA} ${NSB} ${NSC}; do
+		create_ns ${ns} "-" "-"
+	done
+
+	connect_ns ${NSA} ${NSA_DEV} "-" "-" ${NSB} ${NSB_DEV} "-" "-"
+	connect_ns ${NSA} ${NSA_DEV2} "-" "-" ${NSC} ${NSC_DEV} "-" "-"
+
+	NSA_LINKIP6=$(get_linklocal ${NSA} ${NSA_DEV})
+	NSB_LINKIP6=$(get_linklocal ${NSB} ${NSB_DEV})
+	NSC_LINKIP6=$(get_linklocal ${NSC} ${NSC_DEV})
+
+	# VRF without addresses; enslave both ns-A devices to it
+	create_vrf ${NSA} ${VRF} ${VRF_TABLE} "-" "-"
+	for dev in ${NSA_DEV} ${NSA_DEV2}; do
+		ip -netns ${NSA} link set dev ${dev} vrf ${VRF}
+	done
+
+	set +e
+
+	sleep 1
+}
+
+################################################################################
+# IPv4
+
+# IPv4 ping tests for the topology without a VRF.
+# Sections, in order: ping out to ns-B addresses, ping in from ns-B, ping to
+# ns-A's own addresses (unbound and device-bound), and negative cases where
+# an ip rule, an unreachable route, or a removed route blocks the address.
+# Every case is recorded with
+#   log_test_addr <address> <exit status of the command> <expected status> <name>
+# so the command whose status is tested must directly precede log_test_addr.
+# NOTE(review): run_cmd, run_cmd_nsb and setup_cmd are defined elsewhere;
+# presumably they run the command in ns-A / ns-B / ns-A - confirm.
+# The routing changes made near the end are not undone here; ipv4_ping calls
+# setup again before each run.
+ipv4_ping_novrf()
+{
+ local a
+
+ #
+ # out
+ #
+ for a in ${NSB_IP} ${NSB_LO_IP}
+ do
+ log_start
+ run_cmd ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping out"
+
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping out, device bind"
+
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_LO_IP} ${a}
+ log_test_addr ${a} $? 0 "ping out, address bind"
+ done
+
+ #
+ # in
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP}
+ do
+ log_start
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping in"
+ done
+
+ #
+ # local traffic
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping local"
+ done
+
+ #
+ # local traffic, socket bound to device
+ #
+ # address on device
+ a=${NSA_IP}
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping local, device bind"
+
+ # loopback addresses not reachable from device bind
+ # fails in a really weird way though because ipv4 special cases
+ # route lookups with oif set.
+ for a in ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Fails since address on loopback device is out of device scope"
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 1 "ping local, device bind"
+ done
+
+ #
+ # ip rule blocks reachability to remote address
+ #
+ log_start
+ # move the "lookup local" rule from pref 0 to pref 32765 so that the
+ # prohibit rules at pref 50/51 come before it
+ setup_cmd ip rule add pref 32765 from all lookup local
+ setup_cmd ip rule del pref 0 from all lookup local
+ setup_cmd ip rule add pref 50 to ${NSB_LO_IP} prohibit
+ setup_cmd ip rule add pref 51 from ${NSB_IP} prohibit
+
+ a=${NSB_LO_IP}
+ run_cmd ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, blocked by rule"
+
+ # NOTE: ipv4 actually allows the lookup to fail and yet still create
+ # a viable rtable if the oif (e.g., bind to device) is set, so this
+ # case succeeds despite the rule
+ # run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Response generates ICMP (or arp request is ignored) due to ip rule"
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by rule"
+
+ # undo the rule changes made for this section
+ [ "$VERBOSE" = "1" ] && echo
+ setup_cmd ip rule del pref 32765 from all lookup local
+ setup_cmd ip rule add pref 0 from all lookup local
+ setup_cmd ip rule del pref 50 to ${NSB_LO_IP} prohibit
+ setup_cmd ip rule del pref 51 from ${NSB_IP} prohibit
+
+ #
+ # route blocks reachability to remote address
+ #
+ log_start
+ setup_cmd ip route replace unreachable ${NSB_LO_IP}
+ setup_cmd ip route replace unreachable ${NSB_IP}
+
+ a=${NSB_LO_IP}
+ run_cmd ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, blocked by route"
+
+ # NOTE: ipv4 actually allows the lookup to fail and yet still create
+ # a viable rtable if the oif (e.g., bind to device) is set, so this
+ # case succeeds despite not having a route for the address
+ # run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Response is dropped (or arp request is ignored) due to ip route"
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by route"
+
+ #
+ # remove 'remote' routes; fallback to default
+ #
+ log_start
+ setup_cmd ip ro del ${NSB_LO_IP}
+
+ a=${NSB_LO_IP}
+ run_cmd ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, unreachable default route"
+
+ # NOTE: ipv4 actually allows the lookup to fail and yet still create
+ # a viable rtable if the oif (e.g., bind to device) is set, so this
+ # case succeeds despite not having a route for the address
+ # run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+}
+
+# IPv4 ping tests for the topology where the ns-A device is in a VRF.
+# Sections, in order: ping out (bound to the VRF, the device, or run under
+# "ip vrf exec" with a source address), ping in from ns-B, ping to local
+# addresses, and negative cases where an ip rule or a removed route blocks
+# the address.
+# Every case is recorded with
+#   log_test_addr <address> <exit status of the command> <expected status> <name>
+# so the command whose status is tested must directly precede log_test_addr.
+# NOTE(review): run_cmd, run_cmd_nsb, setup_cmd and set_sysctl are defined
+# elsewhere; presumably run_cmd/setup_cmd act in ns-A and run_cmd_nsb in
+# ns-B - confirm.
+# The route deleted in the last section is not restored here.
+ipv4_ping_vrf()
+{
+ local a
+
+ # should default on; does not exist on older kernels
+ set_sysctl net.ipv4.raw_l3mdev_accept=1 2>/dev/null
+
+ #
+ # out
+ #
+ for a in ${NSB_IP} ${NSB_LO_IP}
+ do
+ log_start
+ run_cmd ping -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 0 "ping out, VRF bind"
+
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping out, device bind"
+
+ log_start
+ run_cmd ip vrf exec ${VRF} ping -c1 -w1 -I ${NSA_IP} ${a}
+ log_test_addr ${a} $? 0 "ping out, vrf device + dev address bind"
+
+ log_start
+ run_cmd ip vrf exec ${VRF} ping -c1 -w1 -I ${VRF_IP} ${a}
+ log_test_addr ${a} $? 0 "ping out, vrf device + vrf address bind"
+ done
+
+ #
+ # in
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping in"
+ done
+
+ #
+ # local traffic, local address
+ #
+ for a in ${NSA_IP} ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Source address should be ${a}"
+ run_cmd ping -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 0 "ping local, VRF bind"
+ done
+
+ #
+ # local traffic, socket bound to device
+ #
+ # address on device
+ a=${NSA_IP}
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping local, device bind"
+
+ # vrf device is out of scope
+ for a in ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Fails since address on vrf device is out of device scope"
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 1 "ping local, device bind"
+ done
+
+ #
+ # ip rule blocks address
+ #
+ log_start
+ setup_cmd ip rule add pref 50 to ${NSB_LO_IP} prohibit
+ setup_cmd ip rule add pref 51 from ${NSB_IP} prohibit
+
+ a=${NSB_LO_IP}
+ run_cmd ping -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 2 "ping out, vrf bind, blocked by rule"
+
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, blocked by rule"
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Response lost due to ip rule"
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by rule"
+
+ # remove the prohibit rules added for this section
+ [ "$VERBOSE" = "1" ] && echo
+ setup_cmd ip rule del pref 50 to ${NSB_LO_IP} prohibit
+ setup_cmd ip rule del pref 51 from ${NSB_IP} prohibit
+
+ #
+ # remove 'remote' routes; fallback to default
+ #
+ log_start
+ setup_cmd ip ro del vrf ${VRF} ${NSB_LO_IP}
+
+ a=${NSB_LO_IP}
+ run_cmd ping -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 2 "ping out, vrf bind, unreachable route"
+
+ log_start
+ run_cmd ping -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, unreachable route"
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Response lost by unreachable route"
+ run_cmd_nsb ping -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, unreachable route"
+}
+
+# Entry point for the IPv4 ping tests.
+# The no-VRF tests run three times, each on a fresh setup with one sysctl
+# applied first; the VRF tests run twice, the second time with
+# ping_group_range set. Errors from set_sysctl are discarded.
+ipv4_ping()
+{
+	local knob
+
+	log_section "IPv4 ping"
+
+	log_subsection "No VRF"
+	for knob in "net.ipv4.raw_l3mdev_accept=0" \
+		    "net.ipv4.raw_l3mdev_accept=1" \
+		    "net.ipv4.ping_group_range=0 2147483647"
+	do
+		setup
+		# quoted so the range value stays a single argument
+		set_sysctl "${knob}" 2>/dev/null
+		ipv4_ping_novrf
+	done
+
+	log_subsection "With VRF"
+	setup "yes"
+	ipv4_ping_vrf
+
+	setup "yes"
+	set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null
+	ipv4_ping_vrf
+}
+
+################################################################################
+# IPv4 TCP
+
+#
+# MD5 tests without VRF
+#
+# Each case starts a nettest server in the background via run_cmd, waits one
+# second, then runs a nettest client via run_cmd_nsb and records the
+# client's exit status with log_test <status> <expected> <name>.
+# Expected status 0 means the connection works; 2 is used for the cases
+# whose hint says the client should time out.
+# NOTE(review): nettest options are not defined in this file. As used here,
+# -s starts a server, -M gives the MD5 password, -r the peer address, -m a
+# prefix and -l the client's local address - confirm against nettest.
+#
+ipv4_tcp_md5_novrf()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s -M ${MD5_PW} -r ${NSB_LO_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -s -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -l ${NSB_LO_IP} -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: Prefix config, client address not in configured prefix"
+}
+
+#
+# MD5 tests with VRF
+#
+# Same structure as the no-VRF MD5 tests, but the server is bound to the VRF
+# with -d ${VRF}. Each case starts one or two nettest servers in the
+# background via run_cmd, waits one second, runs a client via run_cmd_nsb or
+# run_cmd_nsc, and records the client's exit status with
+# log_test <status> <expected> <name>.
+# The "duplicate config" cases run a VRF-bound server using ${MD5_PW} next to
+# an unbound server using ${MD5_WRONG_PW} for the same peer address or prefix.
+# The caller (ipv4_tcp_vrf) runs setup_vrf_dup first, which adds ns-C with
+# the same addresses as ns-B behind a device that is not in the VRF.
+# NOTE(review): run_cmd_nsc is defined elsewhere; presumably it runs the
+# command in ns-C - confirm.
+#
+ipv4_tcp_md5()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout since server does not have MD5 auth"
+ run_cmd nettest -s -d ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout since server config differs from client"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_LO_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout since client address is outside of prefix"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -l ${NSB_LO_IP} -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client address not in configured prefix"
+
+ #
+ # duplicate config between default VRF and a VRF
+ #
+
+ # ns-B client, password of the VRF server
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF"
+
+ # ns-C client, password of the unbound server
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -r ${NSB_IP} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ # the same four combinations, configured by prefix instead of address
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsc nettest -r ${NSA_IP} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET} &
+ run_cmd nettest -s -M ${MD5_WRONG_PW} -m ${NS_NET} &
+ sleep 1
+ run_cmd_nsb nettest -r ${NSA_IP} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ #
+ # negative tests
+ #
+ # the server is run in the foreground here: its own exit status is
+ # what gets tested (expected 1 when -d names a non-VRF device)
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -M ${MD5_PW} -r ${NSB_IP}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - single address"
+
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - prefix"
+
+}
+
+# IPv4 TCP tests for the topology without a VRF.
+# Sections, in order: servers in ns-A reached from ns-B, clients in ns-A
+# reaching servers in ns-B, and connections to ns-A's own addresses with the
+# server and/or client bound to the device. Finishes with the MD5 tests
+# unless fips_enabled is "1".
+# Cases with a server start it in the background, wait one second, run the
+# client, and record the client's exit status with
+#   log_test_addr <address> <status> <expected> <name>
+# Expected status 1 is used for the cases whose hint says the connection
+# should be refused or have no route.
+# NOTE(review): the nettest options -0, -1 and -2 are not defined in this
+# file; they look like expected local address / remote address / device
+# checks - confirm against nettest.
+ipv4_tcp_novrf()
+{
+ local a
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP}
+ do
+ log_start
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+
+ # verify TCP reset sent and received
+ for a in ${NSA_IP} ${NSA_LO_IP}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since there is no server"
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ #
+ # client
+ #
+ for a in ${NSB_IP} ${NSB_LO_IP}
+ do
+ log_start
+ run_cmd_nsb nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -0 ${NSA_IP}
+ log_test_addr ${a} $? 0 "Client"
+
+ log_start
+ run_cmd_nsb nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 0 "Client, device bind"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -r ${a}
+ log_test_addr ${a} $? 1 "No server, unbound client"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "No server, device client"
+ done
+
+ #
+ # local address tests
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -0 ${a} -1 ${a}
+ log_test_addr ${a} $? 0 "Global server, local connection"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -r ${a} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, unbound client, local connection"
+
+ for a in ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since addresses on loopback are out of device scope"
+ run_cmd nettest -s -d ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -r ${a}
+ log_test_addr ${a} $? 1 "Device server, unbound client, local connection"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -0 ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 0 "Global server, device client, local connection"
+
+ for a in ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Should fail 'No route to host' since addresses on loopback are out of device scope"
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "Global server, device client, local connection"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -d ${NSA_DEV} -r ${a} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, device client, local connection"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 1 "No server, device client, local conn"
+
+ # MD5 tests are skipped only when fips_enabled is exactly "1"
+ [ "$fips_enabled" = "1" ] || ipv4_tcp_md5_novrf
+}
+
+# IPv4 TCP tests for the topology where the ns-A device is in a VRF.
+# Runs in two phases selected by net.ipv4.tcp_l3mdev_accept:
+#   0 - an unbound ("global") server is expected to refuse connections that
+#       arrive through the VRF; the MD5 tests also run in this phase
+#   1 - an unbound server is expected to accept them
+# followed by client tests and connections to ns-A's own addresses.
+# Cases with a server start it in the background, wait one second, run the
+# client, and record the client's exit status with
+#   log_test_addr <address> <status> <expected> <name>
+# NOTE(review): the nettest options -0 and -2 are not defined in this file;
+# they look like expected-address / expected-device checks - confirm
+# against nettest.
+ipv4_tcp_vrf()
+{
+ local a
+
+ # disable global server
+ log_subsection "Global server disabled"
+
+ set_sysctl net.ipv4.tcp_l3mdev_accept=0
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since global server with VRF is disabled"
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 1 "Global server"
+
+ log_start
+ run_cmd nettest -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+
+ # verify TCP reset received
+ log_start
+ show_hint "Should fail 'Connection refused' since there is no server"
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ # local address tests
+ # (${VRF_IP} and 127.0.0.1 both timeout)
+ a=${NSA_IP}
+ log_start
+ show_hint "Should fail 'Connection refused' since global server with VRF is disabled"
+ run_cmd nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "Global server, local connection"
+
+ # run MD5 tests
+ # (only when fips_enabled is exactly "0"; ns-C is added for them and
+ # removed again afterwards)
+ if [ "$fips_enabled" = "0" ]; then
+ setup_vrf_dup
+ ipv4_tcp_md5
+ cleanup_vrf_dup
+ fi
+
+ #
+ # enable VRF global server
+ #
+ log_subsection "VRF Global server enabled"
+ set_sysctl net.ipv4.tcp_l3mdev_accept=1
+
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ show_hint "client socket should be bound to VRF"
+ run_cmd nettest -s -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+
+ log_start
+ show_hint "client socket should be bound to VRF"
+ run_cmd nettest -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ # verify TCP reset received
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ a=${NSA_IP}
+ log_start
+ show_hint "client socket should be bound to device"
+ run_cmd nettest -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+
+ # local address tests
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since client is not bound to VRF"
+ run_cmd nettest -s -d ${VRF} &
+ sleep 1
+ run_cmd nettest -r ${a}
+ log_test_addr ${a} $? 1 "Global server, local connection"
+ done
+
+ #
+ # client
+ #
+ for a in ${NSB_IP} ${NSB_LO_IP}
+ do
+ log_start
+ run_cmd_nsb nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${VRF}
+ log_test_addr ${a} $? 0 "Client, VRF bind"
+
+ log_start
+ run_cmd_nsb nettest -s &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 0 "Client, device bind"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -r ${a} -d ${VRF}
+ log_test_addr ${a} $? 1 "No server, VRF client"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "No server, device client"
+ done
+
+ # local connections with server and client both bound to the VRF
+ for a in ${NSA_IP} ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd nettest -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${VRF} -0 ${a}
+ log_test_addr ${a} $? 0 "VRF server, VRF client, local connection"
+ done
+
+ # remaining server/client binding combinations, ns-A device address only
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV} -0 ${a}
+ log_test_addr ${a} $? 0 "VRF server, device client, local connection"
+
+ log_start
+ show_hint "Should fail 'No route to host' since client is out of VRF scope"
+ run_cmd nettest -s -d ${VRF} &
+ sleep 1
+ run_cmd nettest -r ${a}
+ log_test_addr ${a} $? 1 "VRF server, unbound client, local connection"
+
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${VRF} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, VRF client, local connection"
+
+ log_start
+ run_cmd nettest -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -r ${a} -d ${NSA_DEV} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, device client, local connection"
+}
+
+# Entry point for the IPv4 TCP tests.
+# The no-VRF tests run twice on a single setup, once per value of
+# tcp_l3mdev_accept; the VRF tests then run on a fresh VRF setup.
+ipv4_tcp()
+{
+	local mode
+
+	log_section "IPv4/TCP"
+	log_subsection "No VRF"
+	setup
+
+	# tcp_l3mdev_accept should have no effect without VRF;
+	# run tests with it enabled and disabled to verify
+	# (each entry is "<sysctl value> <word used in the subsection title>")
+	for mode in "0 disabled" "1 enabled"; do
+		log_subsection "tcp_l3mdev_accept ${mode#* }"
+		set_sysctl net.ipv4.tcp_l3mdev_accept=${mode%% *}
+		ipv4_tcp_novrf
+	done
+
+	log_subsection "With VRF"
+	setup "yes"
+	ipv4_tcp_vrf
+}
+
+################################################################################
+# IPv4 UDP
+
+# IPv4 UDP tests for the topology without a VRF (every nettest call uses -D).
+# Sections, in order: servers in ns-A reached from ns-B, clients in ns-A
+# reaching servers in ns-B (unbound, device bind, and the -C / -S variants),
+# and connections to ns-A's own addresses.
+# Cases with a server start it in the background, wait one second, run the
+# client, and record the client's exit status with
+#   log_test_addr <address> <status> <expected> <name>
+# Expected failures use status 1 or 2 depending on the case.
+# NOTE(review): nettest options are not defined in this file. Going by the
+# test names, -C sends via cmsg and -S uses IP_UNICAST_IF; -0, -1 and -2
+# look like expected address / device checks - confirm against nettest.
+ipv4_udp_novrf()
+{
+ local a
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP}
+ do
+ log_start
+ run_cmd nettest -D -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+
+ log_start
+ show_hint "Should fail 'Connection refused' since there is no server"
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+
+ #
+ # client
+ #
+ for a in ${NSB_IP} ${NSB_LO_IP}
+ do
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -0 ${NSA_IP}
+ log_test_addr ${a} $? 0 "Client"
+
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV} -0 ${NSA_IP}
+ log_test_addr ${a} $? 0 "Client, device bind"
+
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV} -C -0 ${NSA_IP}
+ log_test_addr ${a} $? 0 "Client, device send via cmsg"
+
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S -0 ${NSA_IP}
+ log_test_addr ${a} $? 0 "Client, device bind via IP_UNICAST_IF"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "No server, unbound client"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "No server, device client"
+ done
+
+ #
+ # local address tests
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -0 ${a} -1 ${a}
+ log_test_addr ${a} $? 0 "Global server, local connection"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Device server, unbound client, local connection"
+
+ for a in ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since address is out of device scope"
+ run_cmd nettest -s -D -d ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "Device server, unbound client, local connection"
+ done
+
+ # unbound server, client tied to the device in three different ways
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -D &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "Global server, device client, local connection"
+
+ log_start
+ run_cmd nettest -s -D &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -C -r ${a}
+ log_test_addr ${a} $? 0 "Global server, device send via cmsg, local connection"
+
+ log_start
+ run_cmd nettest -s -D &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -S -r ${a}
+ log_test_addr ${a} $? 0 "Global server, device client via IP_UNICAST_IF, local connection"
+
+ # IPv4 with device bind has really weird behavior - it overrides the
+ # fib lookup, generates an rtable and tries to send the packet. This
+ # causes failures for local traffic at different places
+ # (hence expected status 2 for the plain device bind and 1 for the
+ # -C and -S variants below)
+ for a in ${NSA_LO_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Should fail since addresses on loopback are out of device scope"
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 2 "Global server, device client, local connection"
+
+ log_start
+ show_hint "Should fail since addresses on loopback are out of device scope"
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV} -C
+ log_test_addr ${a} $? 1 "Global server, device send via cmsg, local connection"
+
+ log_start
+ show_hint "Should fail since addresses on loopback are out of device scope"
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -r ${a} -d ${NSA_DEV} -S
+ log_test_addr ${a} $? 1 "Global server, device client via IP_UNICAST_IF, local connection"
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -D -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, device client, local conn"
+
+ # no server is started for this last case; expected status is 2
+ log_start
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 2 "No server, device client, local conn"
+}
+
+# IPv4 UDP tests for the configuration that includes a VRF.
+#
+# The tests run in two phases: first with net.ipv4.udp_l3mdev_accept=0
+# ("Global server disabled"), then with it set to 1 ("Global server
+# enabled"). A server, when one is needed, is started in the background and
+# given 1 second before the client runs; the client's exit status ($?) is
+# handed to log_test / log_test_addr together with the expected status.
+#
+# NOTE(review): run_cmd, run_cmd_nsb, log_start, log_test, log_test_addr,
+# show_hint and set_sysctl are defined outside this section; presumably
+# log_test_addr takes (address, actual rc, expected rc, description) --
+# confirm against their definitions.
+ipv4_udp_vrf()
+{
+ local a
+
+ # disable global server
+ log_subsection "Global server disabled"
+ set_sysctl net.ipv4.udp_l3mdev_accept=0
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ show_hint "Fails because ingress is in a VRF and global server is disabled"
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "Global server"
+
+ log_start
+ run_cmd nettest -D -d ${VRF} -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ log_start
+ run_cmd nettest -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server"
+
+ log_start
+ show_hint "Should fail 'Connection refused' since there is no server"
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+
+ log_start
+ show_hint "Should fail 'Connection refused' since global server is out of scope"
+ run_cmd nettest -D -s &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 1 "Global server, VRF client, local connection"
+ done
+
+ # local connections: server and client are both started with run_cmd
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -D -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "VRF server, enslaved device client, local connection"
+
+ # a already holds ${NSA_IP} here; the assignment is repeated for readability
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server, VRF client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server, device client, local conn"
+
+ # enable global server
+ log_subsection "Global server enabled"
+ set_sysctl net.ipv4.udp_l3mdev_accept=1
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest -D -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+
+ log_start
+ run_cmd nettest -D -d ${VRF} -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ log_start
+ run_cmd nettest -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd_nsb nettest -D -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ #
+ # client tests
+ #
+ # here the server is started with run_cmd_nsb and the client with run_cmd
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -d ${VRF} -D -r ${NSB_IP} -1 ${NSA_IP}
+ log_test $? 0 "VRF client"
+
+ log_start
+ run_cmd_nsb nettest -D -s &
+ sleep 1
+ run_cmd nettest -d ${NSA_DEV} -D -r ${NSB_IP} -1 ${NSA_IP}
+ log_test $? 0 "Enslaved device client"
+
+ # negative test - should fail
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -D -d ${VRF} -r ${NSB_IP}
+ log_test $? 1 "No server, VRF client"
+
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -D -d ${NSA_DEV} -r ${NSB_IP}
+ log_test $? 1 "No server, enslaved device client"
+
+ #
+ # local address tests
+ #
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -D -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "Global server, VRF client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "VRF server, device client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server, VRF client, local conn"
+
+ log_start
+ run_cmd nettest -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -D -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "Enslaved device server, device client, local conn"
+
+ # for the VRF address and 127.0.0.1 the server is given "-2 ${VRF}"
+ # instead of "-2 ${NSA_DEV}"
+ for a in ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd nettest -D -s -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "Global server, VRF client, local conn"
+ done
+
+ for a in ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ run_cmd nettest -s -D -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+ done
+
+ # negative test - should fail
+ # verifies ECONNREFUSED
+ for a in ${NSA_IP} ${VRF_IP} 127.0.0.1
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -D -d ${VRF} -r ${a}
+ log_test_addr ${a} $? 1 "No server, VRF client, local conn"
+ done
+}
+
+ipv4_udp()
+{
+ log_section "IPv4/UDP"
+ log_subsection "No VRF"
+
+ setup
+
+ # udp_l3mdev_accept should have no affect without VRF;
+ # run tests with it enabled and disabled to verify
+ log_subsection "udp_l3mdev_accept disabled"
+ set_sysctl net.ipv4.udp_l3mdev_accept=0
+ ipv4_udp_novrf
+ log_subsection "udp_l3mdev_accept enabled"
+ set_sysctl net.ipv4.udp_l3mdev_accept=1
+ ipv4_udp_novrf
+
+ log_subsection "With VRF"
+ setup "yes"
+ ipv4_udp_vrf
+}
+
+################################################################################
+# IPv4 address bind
+#
+# verifies ability or inability to bind to an address / device
+
+# Address bind tests without a VRF: checks that raw and TCP sockets can bind
+# to a local address, with and without first binding to ${NSA_DEV}.
+# Every test here expects exit status 0 from nettest.
+ipv4_addr_bind_novrf()
+{
+ # keep the loop variable out of the global scope, as the other test
+ # functions in this file do
+ local a
+
+ #
+ # raw socket
+ #
+ for a in ${NSA_IP} ${NSA_LO_IP}
+ do
+ log_start
+ run_cmd nettest -s -R -P icmp -l ${a} -b
+ log_test_addr ${a} $? 0 "Raw socket bind to local address"
+
+ log_start
+ run_cmd nettest -s -R -P icmp -l ${a} -d ${NSA_DEV} -b
+ log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
+ done
+
+ #
+ # tcp sockets
+ #
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest -l ${a} -r ${NSB_IP} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to local address"
+
+ log_start
+ run_cmd nettest -l ${a} -r ${NSB_IP} -d ${NSA_DEV} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to local address after device bind"
+
+ # Sadly, the kernel allows binding a socket to a device and then
+ # binding to an address not on the device. The only restriction
+ # is that the address is valid in the L3 domain. So this test
+ # passes when it really should not
+ #a=${NSA_LO_IP}
+ #log_start
+ #show_hint "Should fail with 'Cannot assign requested address'"
+ #run_cmd nettest -s -l ${a} -d ${NSA_DEV} -t1 -b
+ #log_test_addr ${a} $? 1 "TCP socket bind to out of scope local address"
+}
+
+ipv4_addr_bind_vrf()
+{
+ #
+ # raw socket
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ show_hint "Socket not bound to VRF, but address is in VRF"
+ run_cmd nettest -s -R -P icmp -l ${a} -b
+ log_test_addr ${a} $? 1 "Raw socket bind to local address"
+
+ log_start
+ run_cmd nettest -s -R -P icmp -l ${a} -d ${NSA_DEV} -b
+ log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
+ log_start
+ run_cmd nettest -s -R -P icmp -l ${a} -d ${VRF} -b
+ log_test_addr ${a} $? 0 "Raw socket bind to local address after VRF bind"
+ done
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Address on loopback is out of VRF scope"
+ run_cmd nettest -s -R -P icmp -l ${a} -d ${VRF} -b
+ log_test_addr ${a} $? 1 "Raw socket bind to out of scope address after VRF bind"
+
+ #
+ # tcp sockets
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest -s -l ${a} -d ${VRF} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to local address"
+
+ log_start
+ run_cmd nettest -s -l ${a} -d ${NSA_DEV} -t1 -b
+ log_test_addr ${a} $? 0 "TCP socket bind to local address after device bind"
+ done
+
+ a=${NSA_LO_IP}
+ log_start
+ show_hint "Address on loopback out of scope for VRF"
+ run_cmd nettest -s -l ${a} -d ${VRF} -t1 -b
+ log_test_addr ${a} $? 1 "TCP socket bind to invalid local address for VRF"
+
+ log_start
+ show_hint "Address on loopback out of scope for device in VRF"
+ run_cmd nettest -s -l ${a} -d ${NSA_DEV} -t1 -b
+ log_test_addr ${a} $? 1 "TCP socket bind to invalid local address for device bind"
+}
+
+# Entry point for the IPv4 address bind tests: runs the no-VRF variant on a
+# plain setup, then the VRF variant on a setup created with "yes".
+ipv4_addr_bind()
+{
+	local bind_variant
+
+	log_section "IPv4 address binds"
+
+	for bind_variant in novrf vrf
+	do
+		case "${bind_variant}" in
+		novrf)
+			log_subsection "No VRF"
+			setup
+			;;
+		vrf)
+			log_subsection "With VRF"
+			setup "yes"
+			;;
+		esac
+
+		# dispatches to ipv4_addr_bind_novrf / ipv4_addr_bind_vrf
+		ipv4_addr_bind_${bind_variant}
+	done
+}
+
+################################################################################
+# IPv4 runtime tests
+
+# Run-time tests: delete the VRF device while a nettest server/client pair
+# is running in the background.
+#
+# Arguments:
+#   $1 - description prefix used in every test name
+#   $2 - extra nettest arguments; expanded unquoted on purpose so that a
+#        value such as "-n -1" is split into separate words
+#
+# Every test is logged with "0 0" as its result/expectation pair, i.e. the
+# exit status of the commands is not evaluated here. After each test except
+# the last one the environment is rebuilt with setup ${with_vrf}, since the
+# VRF was just deleted.
+ipv4_rt()
+{
+ local desc="$1"
+ local varg="$2"
+ local with_vrf="yes"
+ local a
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest ${varg} -s &
+ sleep 1
+ run_cmd_nsb nettest ${varg} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, global server"
+
+ setup ${with_vrf}
+ done
+
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest ${varg} -s -d ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest ${varg} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, VRF server"
+
+ setup ${with_vrf}
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest ${varg} -s -d ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest ${varg} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, enslaved device server"
+
+ setup ${with_vrf}
+
+ #
+ # client test
+ #
+ # the clients connect to ${NSB_IP}; log the result against that address
+ # rather than the ${NSA_IP} left over from the server tests
+ a=${NSB_IP}
+ log_start
+ run_cmd_nsb nettest ${varg} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${VRF} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, VRF client"
+
+ setup ${with_vrf}
+
+ log_start
+ run_cmd_nsb nettest ${varg} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, enslaved device client"
+
+ setup ${with_vrf}
+
+ #
+ # local address tests
+ #
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest ${varg} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${VRF} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, global server, VRF client, local"
+
+ setup ${with_vrf}
+ done
+
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd nettest ${varg} -d ${VRF} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${VRF} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, VRF server and client, local"
+
+ setup ${with_vrf}
+ done
+
+ a=${NSA_IP}
+ log_start
+ run_cmd nettest ${varg} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, global server, enslaved device client, local"
+
+ setup ${with_vrf}
+
+ log_start
+ run_cmd nettest ${varg} -d ${VRF} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, VRF server, enslaved device client, local"
+
+ setup ${with_vrf}
+
+ log_start
+ run_cmd nettest ${varg} -d ${NSA_DEV} -s &
+ sleep 1
+ run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "${desc}, enslaved device server and client, local"
+}
+
+# Run-time test: delete the VRF device while a flood ping (ping -f) is
+# running in the background, first for pings started with run_cmd_nsb
+# ("ping in") and then for a ping bound to the VRF with -I ("ping out").
+# Results are logged with "0 0", i.e. no exit status is evaluated here.
+ipv4_ping_rt()
+{
+ local with_vrf="yes"
+ local a
+
+ for a in ${NSA_IP} ${VRF_IP}
+ do
+ log_start
+ run_cmd_nsb ping -f ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "Device delete with active traffic - ping in"
+
+ # the VRF was deleted above; rebuild the environment for the next pass
+ setup ${with_vrf}
+ done
+
+ a=${NSB_IP}
+ log_start
+ run_cmd ping -f -I ${VRF} ${a} &
+ sleep 3
+ run_cmd ip link del ${VRF}
+ sleep 1
+ log_test_addr ${a} 0 0 "Device delete with active traffic - ping out"
+}
+
+# Entry point for the IPv4 run-time tests. Each sub-test deletes the VRF, so
+# a fresh setup "yes" precedes every one of them.
+ipv4_runtime()
+{
+	# "<description>|<nettest args>" pairs handed to ipv4_rt
+	local rt_variant
+
+	log_section "Run time tests - ipv4"
+
+	setup "yes"
+	ipv4_ping_rt
+
+	for rt_variant in "TCP active socket|-n -1" "TCP passive socket|-i"
+	do
+		setup "yes"
+		ipv4_rt "${rt_variant%%|*}" "${rt_variant#*|}"
+	done
+}
+
+################################################################################
+# IPv6
+
+# IPv6 ping tests without a VRF.
+#
+# Covers pings out, pings in (started with run_cmd_nsb), pings to local
+# addresses, and pings that are expected to fail because of an ip rule or an
+# unreachable route. The ping exit status is compared by log_test_addr with
+# the expected value given as its third argument (0, 1 or 2 below).
+ipv6_ping_novrf()
+{
+ local a
+
+ # should not have an impact, but make a known state
+ set_sysctl net.ipv4.raw_l3mdev_accept=0 2>/dev/null
+
+ #
+ # out
+ #
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping out"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping out, device bind"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_LO_IP6} ${a}
+ log_test_addr ${a} $? 0 "ping out, loopback address bind"
+ done
+
+ #
+ # in
+ #
+ for a in ${NSA_IP6} ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSB_DEV} ${MCAST}%${NSB_DEV}
+ do
+ log_start
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping in"
+ done
+
+ #
+ # local traffic, local address
+ #
+ for a in ${NSA_IP6} ${NSA_LO_IP6} ::1 ${NSA_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping local, no bind"
+ done
+
+ for a in ${NSA_IP6} ${NSA_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping local, device bind"
+ done
+
+ for a in ${NSA_LO_IP6} ::1
+ do
+ log_start
+ show_hint "Fails since address on loopback is out of device scope"
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping local, device bind"
+ done
+
+ #
+ # ip rule blocks address
+ #
+ # the local-table rule is moved from pref 0 to pref 32765 so that the
+ # prohibit rules at pref 50/51 come before it; this is undone below
+ log_start
+ setup_cmd ip -6 rule add pref 32765 from all lookup local
+ setup_cmd ip -6 rule del pref 0 from all lookup local
+ setup_cmd ip -6 rule add pref 50 to ${NSB_LO_IP6} prohibit
+ setup_cmd ip -6 rule add pref 51 from ${NSB_IP6} prohibit
+
+ a=${NSB_LO_IP6}
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, blocked by rule"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, blocked by rule"
+
+ a=${NSA_LO_IP6}
+ log_start
+ show_hint "Response lost due to ip rule"
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by rule"
+
+ # restore the rule configuration changed at the start of this section
+ setup_cmd ip -6 rule add pref 0 from all lookup local
+ setup_cmd ip -6 rule del pref 32765 from all lookup local
+ setup_cmd ip -6 rule del pref 50 to ${NSB_LO_IP6} prohibit
+ setup_cmd ip -6 rule del pref 51 from ${NSB_IP6} prohibit
+
+ #
+ # route blocks reachability to remote address
+ #
+ log_start
+ setup_cmd ip -6 route del ${NSB_LO_IP6}
+ setup_cmd ip -6 route add unreachable ${NSB_LO_IP6} metric 10
+ setup_cmd ip -6 route add unreachable ${NSB_IP6} metric 10
+
+ a=${NSB_LO_IP6}
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, blocked by route"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, blocked by route"
+
+ a=${NSA_LO_IP6}
+ log_start
+ show_hint "Response lost due to ip route"
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by route"
+
+
+ #
+ # remove 'remote' routes; fallback to default
+ #
+ # the route to ${NSB_LO_IP6} deleted in the previous section is not
+ # re-added, so pings to it are still expected to fail with status 2
+ log_start
+ setup_cmd ip -6 ro del unreachable ${NSB_LO_IP6}
+ setup_cmd ip -6 ro del unreachable ${NSB_IP6}
+
+ a=${NSB_LO_IP6}
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, unreachable route"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, unreachable route"
+}
+
+# IPv6 ping tests with a VRF.
+#
+# Covers pings out (VRF bind, device bind, "ip vrf exec"), pings in (started
+# with run_cmd_nsb), pings to local addresses, a link-local-to-global case
+# where the global addresses are temporarily removed via setup_cmd_nsb, and
+# pings expected to fail because of an ip rule or a removed route.
+ipv6_ping_vrf()
+{
+ local a
+
+ # should default on; does not exist on older kernels
+ set_sysctl net.ipv4.raw_l3mdev_accept=1 2>/dev/null
+
+ #
+ # out
+ #
+ for a in ${NSB_IP6} ${NSB_LO_IP6}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 0 "ping out, VRF bind"
+ done
+
+ # here the VRF is selected through the %${VRF} scope suffix of the
+ # address rather than with -I
+ for a in ${NSB_LINKIP6}%${VRF} ${MCAST}%${VRF}
+ do
+ log_start
+ show_hint "Fails since VRF device does not support linklocal or multicast"
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping out, VRF bind"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping out, device bind"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ip vrf exec ${VRF} ${ping6} -c1 -w1 -I ${VRF_IP6} ${a}
+ log_test_addr ${a} $? 0 "ping out, vrf device+address bind"
+ done
+
+ #
+ # in
+ #
+ for a in ${NSA_IP6} ${VRF_IP6} ${NSA_LINKIP6}%${NSB_DEV} ${MCAST}%${NSB_DEV}
+ do
+ log_start
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 0 "ping in"
+ done
+
+ a=${NSA_LO_IP6}
+ log_start
+ show_hint "Fails since loopback address is out of VRF scope"
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in"
+
+ #
+ # local traffic, local address
+ #
+ for a in ${NSA_IP6} ${VRF_IP6} ::1
+ do
+ log_start
+ show_hint "Source address should be ${a}"
+ run_cmd ${ping6} -c1 -w1 -I ${VRF} ${a}
+ log_test_addr ${a} $? 0 "ping local, VRF bind"
+ done
+
+ for a in ${NSA_IP6} ${NSA_LINKIP6}%${NSA_DEV} ${MCAST}%${NSA_DEV}
+ do
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 0 "ping local, device bind"
+ done
+
+ # LLA to GUA - remove ipv6 global addresses from ns-B
+ setup_cmd_nsb ip -6 addr del ${NSB_IP6}/64 dev ${NSB_DEV}
+ setup_cmd_nsb ip -6 addr del ${NSB_LO_IP6}/128 dev lo
+ setup_cmd_nsb ip -6 ro add ${NSA_IP6}/128 via ${NSA_LINKIP6} dev ${NSB_DEV}
+
+ # NOTE(review): the ping target below is always ${NSA_IP6}; the loop
+ # variable only changes the address reported by log_test_addr, so
+ # ${VRF_IP6} is never actually pinged here. Confirm whether ${a} was
+ # intended (only a route for ${NSA_IP6}/128 is added above).
+ for a in ${NSA_IP6} ${VRF_IP6}
+ do
+ log_start
+ run_cmd_nsb ${ping6} -c1 -w1 ${NSA_IP6}
+ log_test_addr ${a} $? 0 "ping in, LLA to GUA"
+ done
+
+ # undo the three changes made for the LLA to GUA test
+ setup_cmd_nsb ip -6 ro del ${NSA_IP6}/128 via ${NSA_LINKIP6} dev ${NSB_DEV}
+ setup_cmd_nsb ip -6 addr add ${NSB_IP6}/64 dev ${NSB_DEV}
+ setup_cmd_nsb ip -6 addr add ${NSB_LO_IP6}/128 dev lo
+
+ #
+ # ip rule blocks address
+ #
+ log_start
+ setup_cmd ip -6 rule add pref 50 to ${NSB_LO_IP6} prohibit
+ setup_cmd ip -6 rule add pref 51 from ${NSB_IP6} prohibit
+
+ a=${NSB_LO_IP6}
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, blocked by rule"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, blocked by rule"
+
+ a=${NSA_LO_IP6}
+ log_start
+ show_hint "Response lost due to ip rule"
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 1 "ping in, blocked by rule"
+
+ # NOTE(review): this log_start has no matching log_test before the next
+ # log_start below -- it looks redundant; confirm.
+ log_start
+ setup_cmd ip -6 rule del pref 50 to ${NSB_LO_IP6} prohibit
+ setup_cmd ip -6 rule del pref 51 from ${NSB_IP6} prohibit
+
+ #
+ # remove 'remote' routes; fallback to default
+ #
+ log_start
+ setup_cmd ip -6 ro del ${NSB_LO_IP6} vrf ${VRF}
+
+ a=${NSB_LO_IP6}
+ run_cmd ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping out, unreachable route"
+
+ log_start
+ run_cmd ${ping6} -c1 -w1 -I ${NSA_DEV} ${a}
+ log_test_addr ${a} $? 2 "ping out, device bind, unreachable route"
+
+ # invoked directly rather than through setup_cmd_nsb as the other
+ # ns-B changes in this function are
+ ip -netns ${NSB} -6 ro del ${NSA_LO_IP6}
+ a=${NSA_LO_IP6}
+ log_start
+ run_cmd_nsb ${ping6} -c1 -w1 ${a}
+ log_test_addr ${a} $? 2 "ping in, unreachable route"
+}
+
+ipv6_ping()
+{
+ log_section "IPv6 ping"
+
+ log_subsection "No VRF"
+ setup
+ ipv6_ping_novrf
+ setup
+ set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null
+ ipv6_ping_novrf
+
+ log_subsection "With VRF"
+ setup "yes"
+ ipv6_ping_vrf
+ setup "yes"
+ set_sysctl net.ipv4.ping_group_range='0 2147483647' 2>/dev/null
+ ipv6_ping_vrf
+}
+
+################################################################################
+# IPv6 TCP
+
+#
+# MD5 tests without VRF
+#
+# Each test starts a nettest server in the background (-M gives the MD5
+# password; -r or -m presumably select the peer address or prefix the
+# password applies to -- confirm against nettest), then runs a client with
+# run_cmd_nsb. The client is expected to exit 0 when password and address
+# match and 2 otherwise; the show_hint texts describe those failures as
+# timeouts.
+#
+ipv6_tcp_md5_novrf()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s -M ${MD5_PW} -r ${NSB_LO_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ # (the client is given -l ${NSB_LO_IP6} for this case)
+ log_start
+ show_hint "Should timeout due to MD5 mismatch"
+ run_cmd nettest -6 -s -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -l ${NSB_LO_IP6} -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: Prefix config, client address not in configured prefix"
+}
+
+#
+# MD5 tests with VRF
+#
+# Same cases as the no-VRF MD5 tests but with the server bound to the VRF
+# (-d ${VRF}), followed by "duplicate config" cases in which two servers run
+# at once: one in the VRF using ${MD5_PW} and one without -d using
+# ${MD5_WRONG_PW}. Clients are started with run_cmd_nsb or run_cmd_nsc; the
+# test names describe the run_cmd_nsc clients as connecting in the default
+# VRF. The final two tests check that configuring MD5 with -d ${NSA_DEV}
+# is rejected (exit status 1).
+#
+ipv6_tcp_md5()
+{
+ #
+ # single address
+ #
+
+ # basic use case
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config"
+
+ # client sends MD5, server not configured
+ log_start
+ show_hint "Should timeout since server does not have MD5 auth"
+ run_cmd nettest -6 -s -d ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Server no config, client uses password"
+
+ # wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Client uses wrong password"
+
+ # client from different address
+ log_start
+ show_hint "Should timeout since server config differs from client"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_LO_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Client address does not match address configured with password"
+
+ #
+ # MD5 extension - prefix length
+ #
+
+ # client in prefix
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config"
+
+ # client in prefix, wrong password
+ log_start
+ show_hint "Should timeout since client uses wrong password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client uses wrong password"
+
+ # client outside of prefix
+ log_start
+ show_hint "Should timeout since client address is outside of prefix"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -l ${NSB_LO_IP6} -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config, client address not in configured prefix"
+
+ #
+ # duplicate config between default VRF and a VRF
+ #
+ # both servers are started before the single 1 second sleep
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -r ${NSB_IP6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -r ${NSB_IP6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Single address config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 0 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF"
+
+ log_start
+ show_hint "Should timeout since client in default VRF uses VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsc nettest -6 -r ${NSA_IP6} -M ${MD5_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in default VRF with VRF pw"
+
+ log_start
+ show_hint "Should timeout since client in VRF uses default VRF password"
+ run_cmd nettest -6 -s -d ${VRF} -M ${MD5_PW} -m ${NS_NET6} &
+ run_cmd nettest -6 -s -M ${MD5_WRONG_PW} -m ${NS_NET6} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${NSA_IP6} -M ${MD5_WRONG_PW}
+ log_test $? 2 "MD5: VRF: Prefix config in default VRF and VRF, conn in VRF with default VRF pw"
+
+ #
+ # negative tests
+ #
+ # these run in the foreground (no client); the server's own exit status
+ # is what gets checked
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -M ${MD5_PW} -r ${NSB_IP6}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - single address"
+
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -M ${MD5_PW} -m ${NS_NET6}
+ log_test $? 1 "MD5: VRF: Device must be a VRF - prefix"
+
+}
+
+# IPv6 TCP tests without a VRF.
+#
+# Covers server tests (client started with run_cmd_nsb), client tests
+# (server started with run_cmd_nsb), and local connections where both ends
+# are started with run_cmd, with and without a device bind (-d ${NSA_DEV}).
+# Tests without a reachable server expect exit status 1. The MD5 tests are
+# run at the end unless fips_enabled is "1".
+ipv6_tcp_novrf()
+{
+ local a
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP6} ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+ done
+
+ # verify TCP reset received
+ for a in ${NSA_IP6} ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ #
+ # client
+ #
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV}
+ do
+ log_start
+ run_cmd_nsb nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Client"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV}
+ do
+ log_start
+ run_cmd_nsb nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 0 "Client, device bind"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "No server, device client"
+ done
+
+ #
+ # local address tests
+ #
+ for a in ${NSA_IP6} ${NSA_LO_IP6} ::1
+ do
+ log_start
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Global server, local connection"
+ done
+
+ a=${NSA_IP6}
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, unbound client, local connection"
+
+ for a in ${NSA_LO_IP6} ::1
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since addresses on loopback are out of device scope"
+ run_cmd nettest -6 -s -d ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "Device server, unbound client, local connection"
+ done
+
+ a=${NSA_IP6}
+ log_start
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV} -0 ${a}
+ log_test_addr ${a} $? 0 "Global server, device client, local connection"
+
+ for a in ${NSA_LO_IP6} ::1
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since addresses on loopback are out of device scope"
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "Global server, device client, local connection"
+ done
+
+ # the link-local address is used without a %dev suffix here; the
+ # client is bound to the device with -d instead
+ for a in ${NSA_IP6} ${NSA_LINKIP6}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -6 -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 0 "Device server, device client, local conn"
+ done
+
+ for a in ${NSA_IP6} ${NSA_LINKIP6}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -6 -d ${NSA_DEV} -r ${a}
+ log_test_addr ${a} $? 1 "No server, device client, local conn"
+ done
+
+ # skip the MD5 tests only when fips_enabled is exactly "1"
+ [ "$fips_enabled" = "1" ] || ipv6_tcp_md5_novrf
+}
+
+# IPv6 TCP tests with a VRF.
+#
+# Phase 1 runs with net.ipv4.tcp_l3mdev_accept=0 ("Global server
+# disabled") and, when fips_enabled is "0", the MD5 tests between
+# setup_vrf_dup and cleanup_vrf_dup. Phase 2 sets tcp_l3mdev_accept=1 and
+# repeats the server tests, then runs the client and local connection
+# tests. Servers run in the background with a 1 second head start; the
+# client's exit status is compared with the expected value by log_test_addr.
+ipv6_tcp_vrf()
+{
+ local a
+
+ # disable global server
+ log_subsection "Global server disabled"
+
+ set_sysctl net.ipv4.tcp_l3mdev_accept=0
+
+ #
+ # server tests
+ #
+ for a in ${NSA_IP6} ${VRF_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused' since global server with VRF is disabled"
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "Global server"
+ done
+
+ for a in ${NSA_IP6} ${VRF_IP6}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+ done
+
+ # link local is always bound to ingress device
+ a=${NSA_LINKIP6}%${NSB_DEV}
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ for a in ${NSA_IP6} ${VRF_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+ done
+
+ # verify TCP reset received
+ for a in ${NSA_IP6} ${VRF_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ # local address tests
+ a=${NSA_IP6}
+ log_start
+ show_hint "Should fail 'Connection refused' since global server with VRF is disabled"
+ run_cmd nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "Global server, local connection"
+
+ # run MD5 tests
+ # NOTE(review): this requires fips_enabled to be exactly "0", whereas
+ # ipv6_tcp_novrf skips its MD5 tests only when it is "1"; the two differ
+ # if the variable is unset or empty -- confirm how it is initialised.
+ if [ "$fips_enabled" = "0" ]; then
+ setup_vrf_dup
+ ipv6_tcp_md5
+ cleanup_vrf_dup
+ fi
+
+ #
+ # enable VRF global server
+ #
+ log_subsection "VRF Global server enabled"
+ set_sysctl net.ipv4.tcp_l3mdev_accept=1
+
+ for a in ${NSA_IP6} ${VRF_IP6}
+ do
+ log_start
+ run_cmd nettest -6 -s -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+ done
+
+ for a in ${NSA_IP6} ${VRF_IP6}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+ done
+
+ # For LLA, child socket is bound to device
+ a=${NSA_LINKIP6}%${NSB_DEV}
+ log_start
+ run_cmd nettest -6 -s -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Global server"
+
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "VRF server"
+
+ for a in ${NSA_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 0 "Device server"
+ done
+
+ # verify TCP reset received
+ for a in ${NSA_IP6} ${VRF_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd_nsb nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "No server"
+ done
+
+ # local address tests
+ # NOTE(review): the server below is started with -d ${VRF} although the
+ # test is logged as "Global server" -- confirm which was intended.
+ for a in ${NSA_IP6} ${VRF_IP6}
+ do
+ log_start
+ show_hint "Fails 'Connection refused' since client is not in VRF"
+ run_cmd nettest -6 -s -d ${VRF} &
+ sleep 1
+ run_cmd nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "Global server, local connection"
+ done
+
+
+ #
+ # client
+ #
+ for a in ${NSB_IP6} ${NSB_LO_IP6}
+ do
+ log_start
+ run_cmd_nsb nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${VRF}
+ log_test_addr ${a} $? 0 "Client, VRF bind"
+ done
+
+ a=${NSB_LINKIP6}
+ log_start
+ show_hint "Fails since VRF device does not allow linklocal addresses"
+ run_cmd_nsb nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${VRF}
+ log_test_addr ${a} $? 1 "Client, VRF bind"
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}
+ do
+ log_start
+ run_cmd_nsb nettest -6 -s &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 0 "Client, device bind"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -6 -r ${a} -d ${VRF}
+ log_test_addr ${a} $? 1 "No server, VRF client"
+ done
+
+ for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}
+ do
+ log_start
+ show_hint "Should fail 'Connection refused'"
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV}
+ log_test_addr ${a} $? 1 "No server, device client"
+ done
+
+ # local connections: server and client are both started with run_cmd
+ for a in ${NSA_IP6} ${VRF_IP6} ::1
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${VRF} -0 ${a}
+ log_test_addr ${a} $? 0 "VRF server, VRF client, local connection"
+ done
+
+ a=${NSA_IP6}
+ log_start
+ run_cmd nettest -6 -s -d ${VRF} -2 ${VRF} &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV} -0 ${a}
+ log_test_addr ${a} $? 0 "VRF server, device client, local connection"
+
+ a=${NSA_IP6}
+ log_start
+ show_hint "Should fail since unbound client is out of VRF scope"
+ run_cmd nettest -6 -s -d ${VRF} &
+ sleep 1
+ run_cmd nettest -6 -r ${a}
+ log_test_addr ${a} $? 1 "VRF server, unbound client, local connection"
+
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${VRF} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, VRF client, local connection"
+
+ for a in ${NSA_IP6} ${NSA_LINKIP6}
+ do
+ log_start
+ run_cmd nettest -6 -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+ sleep 1
+ run_cmd nettest -6 -r ${a} -d ${NSA_DEV} -0 ${a}
+ log_test_addr ${a} $? 0 "Device server, device client, local connection"
+ done
+}
+
+# Driver for the IPv6 TCP tests: runs the non-VRF suite once per setting of
+# tcp_l3mdev_accept, then re-creates the topology with a VRF and runs the
+# VRF suite.
+ipv6_tcp()
+{
+	local l3mdev_mode
+
+	log_section "IPv6/TCP"
+	log_subsection "No VRF"
+	setup
+
+	# tcp_l3mdev_accept should have no effect without a VRF; run the same
+	# tests with it disabled and then enabled to verify that.
+	for l3mdev_mode in "disabled=0" "enabled=1"; do
+		log_subsection "tcp_l3mdev_accept ${l3mdev_mode%=*}"
+		set_sysctl net.ipv4.tcp_l3mdev_accept=${l3mdev_mode#*=}
+		ipv6_tcp_novrf
+	done
+
+	log_subsection "With VRF"
+	setup "yes"
+	ipv6_tcp_vrf
+}
+
+################################################################################
+# IPv6 UDP
+
+# IPv6 UDP tests for the topology without a VRF.
+#
+# Each case starts a nettest server in the background, gives it one second
+# to come up, runs a nettest client and hands the client's exit status to
+# log_test_addr together with the expected status (0 = connection works,
+# 1 = connection must fail).
+ipv6_udp_novrf()
+{
+	local a
+
+	#
+	# server tests
+	#
+	for a in ${NSA_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+	do
+		log_start
+		run_cmd nettest -6 -D -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "Global server"
+
+		log_start
+		run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "Device server"
+	done
+
+	a=${NSA_LO_IP6}
+	log_start
+	run_cmd nettest -6 -D -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd_nsb nettest -6 -D -r ${a}
+	log_test_addr ${a} $? 0 "Global server"
+
+	# should fail since loopback address is out of scope for a device
+	# bound server, but it does not - hence this is more documenting
+	# behavior.
+	#log_start
+	#show_hint "Should fail since loopback address is out of scope"
+	#run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+	#sleep 1
+	#run_cmd_nsb nettest -6 -D -r ${a}
+	#log_test_addr ${a} $? 1 "Device server"
+
+	# negative test - should fail
+	for a in ${NSA_IP6} ${NSA_LO_IP6} ${NSA_LINKIP6}%${NSB_DEV}
+	do
+		log_start
+		show_hint "Should fail 'Connection refused' since there is no server"
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "No server"
+	done
+
+	#
+	# client
+	#
+	for a in ${NSB_IP6} ${NSB_LO_IP6} ${NSB_LINKIP6}%${NSA_DEV}
+	do
+		log_start
+		run_cmd_nsb nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -0 ${NSA_IP6}
+		log_test_addr ${a} $? 0 "Client"
+
+		log_start
+		run_cmd_nsb nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -0 ${NSA_IP6}
+		log_test_addr ${a} $? 0 "Client, device bind"
+
+		log_start
+		run_cmd_nsb nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -C -0 ${NSA_IP6}
+		log_test_addr ${a} $? 0 "Client, device send via cmsg"
+
+		log_start
+		run_cmd_nsb nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -S -0 ${NSA_IP6}
+		log_test_addr ${a} $? 0 "Client, device bind via IPV6_UNICAST_IF"
+
+		log_start
+		show_hint "Should fail 'Connection refused'"
+		run_cmd nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "No server, unbound client"
+
+		log_start
+		show_hint "Should fail 'Connection refused'"
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV}
+		log_test_addr ${a} $? 1 "No server, device client"
+	done
+
+	#
+	# local address tests
+	#
+	for a in ${NSA_IP6} ${NSA_LO_IP6} ::1
+	do
+		log_start
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -0 ${a} -1 ${a}
+		log_test_addr ${a} $? 0 "Global server, local connection"
+	done
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -s -D -d ${NSA_DEV} -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -r ${a}
+	log_test_addr ${a} $? 0 "Device server, unbound client, local connection"
+
+	for a in ${NSA_LO_IP6} ::1
+	do
+		log_start
+		show_hint "Should fail 'Connection refused' since address is out of device scope"
+		run_cmd nettest -6 -s -D -d ${NSA_DEV} &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "Device server, local connection"
+	done
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -s -D &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "Global server, device client, local connection"
+
+	log_start
+	run_cmd nettest -6 -s -D &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -C -r ${a}
+	log_test_addr ${a} $? 0 "Global server, device send via cmsg, local connection"
+
+	log_start
+	run_cmd nettest -6 -s -D &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -S -r ${a}
+	log_test_addr ${a} $? 0 "Global server, device client via IPV6_UNICAST_IF, local connection"
+
+	for a in ${NSA_LO_IP6} ::1
+	do
+		log_start
+		show_hint "Should fail 'No route to host' since addresses on loopback are out of device scope"
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV}
+		log_test_addr ${a} $? 1 "Global server, device client, local connection"
+
+		log_start
+		show_hint "Should fail 'No route to host' since addresses on loopback are out of device scope"
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -C
+		log_test_addr ${a} $? 1 "Global server, device send via cmsg, local connection"
+
+		log_start
+		show_hint "Should fail 'No route to host' since addresses on loopback are out of device scope"
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -r ${a} -d ${NSA_DEV} -S
+		# Same -6 ... -S client as the IPV6_UNICAST_IF cases above, so
+		# the description uses the IPv6 option name as well.
+		log_test_addr ${a} $? 1 "Global server, device client via IPV6_UNICAST_IF, local connection"
+	done
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -D -s -d ${NSA_DEV} -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a} -0 ${a}
+	log_test_addr ${a} $? 0 "Device server, device client, local conn"
+
+	log_start
+	show_hint "Should fail 'Connection refused'"
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 1 "No server, device client, local conn"
+
+	# LLA to GUA: temporarily drop the global address from the ns-B side
+	# and add a host route to ns-A's global address instead; the original
+	# configuration is restored once the test has been logged.
+	run_cmd_nsb ip -6 addr del ${NSB_IP6}/64 dev ${NSB_DEV}
+	run_cmd_nsb ip -6 ro add ${NSA_IP6}/128 dev ${NSB_DEV}
+	log_start
+	run_cmd nettest -6 -s -D &
+	sleep 1
+	run_cmd_nsb nettest -6 -D -r ${NSA_IP6}
+	log_test $? 0 "UDP in - LLA to GUA"
+
+	run_cmd_nsb ip -6 ro del ${NSA_IP6}/128 dev ${NSB_DEV}
+	run_cmd_nsb ip -6 addr add ${NSB_IP6}/64 dev ${NSB_DEV} nodad
+}
+
+# IPv6 UDP tests for the topology with a VRF.
+#
+# The suite is split in two halves: first with udp_l3mdev_accept cleared
+# ("Global server disabled"), then with it set ("Global server enabled").
+# Each case starts a nettest server in the background, waits one second,
+# runs a client and passes the client's exit status plus the expected
+# status (0 = works, 1 = must fail) to log_test / log_test_addr.
+ipv6_udp_vrf()
+{
+	local a
+
+	# disable global server
+	log_subsection "Global server disabled"
+	set_sysctl net.ipv4.udp_l3mdev_accept=0
+
+	#
+	# server tests
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		show_hint "Should fail 'Connection refused' since global server is disabled"
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "Global server"
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${VRF} -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "VRF server"
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "Enslaved device server"
+	done
+
+	# negative test - should fail
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		show_hint "Should fail 'Connection refused' since there is no server"
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "No server"
+	done
+
+	#
+	# local address tests
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		show_hint "Should fail 'Connection refused' since global server is disabled"
+		run_cmd nettest -6 -D -s &
+		sleep 1
+		run_cmd nettest -6 -D -d ${VRF} -r ${a}
+		log_test_addr ${a} $? 1 "Global server, VRF client, local conn"
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${VRF} -s &
+		sleep 1
+		run_cmd nettest -6 -D -d ${VRF} -r ${a}
+		log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+	done
+
+	a=${NSA_IP6}
+	log_start
+	show_hint "Should fail 'Connection refused' since global server is disabled"
+	run_cmd nettest -6 -D -s &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 1 "Global server, device client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${VRF} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "VRF server, device client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "Enslaved device server, VRF client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "Enslaved device server, device client, local conn"
+
+	# enable global server
+	log_subsection "Global server enabled"
+	set_sysctl net.ipv4.udp_l3mdev_accept=1
+
+	#
+	# server tests
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "Global server"
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${VRF} -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "VRF server"
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+		sleep 1
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 0 "Enslaved device server"
+	done
+
+	# negative test - should fail
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd_nsb nettest -6 -D -r ${a}
+		log_test_addr ${a} $? 1 "No server"
+	done
+
+	#
+	# client tests
+	#
+	log_start
+	run_cmd_nsb nettest -6 -D -s &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${NSB_IP6}
+	log_test $? 0 "VRF client"
+
+	# negative test - should fail
+	log_start
+	run_cmd nettest -6 -D -d ${VRF} -r ${NSB_IP6}
+	log_test $? 1 "No server, VRF client"
+
+	log_start
+	run_cmd_nsb nettest -6 -D -s &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSB_IP6}
+	log_test $? 0 "Enslaved device client"
+
+	# negative test - should fail
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSB_IP6}
+	log_test $? 1 "No server, enslaved device client"
+
+	#
+	# local address tests
+	#
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -D -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "Global server, VRF client, local conn"
+
+	# This case previously had its log_start commented out; every other
+	# case in this function begins with log_start, so it is restored here.
+	log_start
+	run_cmd nettest -6 -D -d ${VRF} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+
+
+	a=${VRF_IP6}
+	log_start
+	run_cmd nettest -6 -D -s -2 ${VRF} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "Global server, VRF client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${VRF} -s -2 ${VRF} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "VRF server, VRF client, local conn"
+
+	# negative test - should fail
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -D -d ${VRF} -r ${a}
+		log_test_addr ${a} $? 1 "No server, VRF client, local conn"
+	done
+
+	# device to global IP
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -D -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "Global server, device client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${VRF} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "VRF server, device client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${VRF} -r ${a}
+	log_test_addr ${a} $? 0 "Device server, VRF client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -s -2 ${NSA_DEV} &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 0 "Device server, device client, local conn"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${a}
+	log_test_addr ${a} $? 1 "No server, device client, local conn"
+
+
+	# link local addresses
+	log_start
+	run_cmd nettest -6 -D -s &
+	sleep 1
+	run_cmd_nsb nettest -6 -D -d ${NSB_DEV} -r ${NSA_LINKIP6}
+	log_test $? 0 "Global server, linklocal IP"
+
+	log_start
+	run_cmd_nsb nettest -6 -D -d ${NSB_DEV} -r ${NSA_LINKIP6}
+	log_test $? 1 "No server, linklocal IP"
+
+
+	log_start
+	run_cmd_nsb nettest -6 -D -s &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSB_LINKIP6}
+	log_test $? 0 "Enslaved device client, linklocal IP"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSB_LINKIP6}
+	log_test $? 1 "No server, device client, peer linklocal IP"
+
+
+	log_start
+	run_cmd nettest -6 -D -s &
+	sleep 1
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSA_LINKIP6}
+	log_test $? 0 "Enslaved device client, local conn - linklocal IP"
+
+	log_start
+	run_cmd nettest -6 -D -d ${NSA_DEV} -r ${NSA_LINKIP6}
+	log_test $? 1 "No server, device client, local conn - linklocal IP"
+
+	# LLA to GUA: temporarily drop the global address from the ns-B side
+	# and add a host route to ns-A's global address instead; the original
+	# configuration is restored once the test has been logged.
+	run_cmd_nsb ip -6 addr del ${NSB_IP6}/64 dev ${NSB_DEV}
+	run_cmd_nsb ip -6 ro add ${NSA_IP6}/128 dev ${NSB_DEV}
+	log_start
+	run_cmd nettest -6 -s -D &
+	sleep 1
+	run_cmd_nsb nettest -6 -D -r ${NSA_IP6}
+	log_test $? 0 "UDP in - LLA to GUA"
+
+	run_cmd_nsb ip -6 ro del ${NSA_IP6}/128 dev ${NSB_DEV}
+	run_cmd_nsb ip -6 addr add ${NSB_IP6}/64 dev ${NSB_DEV} nodad
+}
+
+# Driver for the IPv6 UDP tests: runs the non-VRF suite once per setting of
+# udp_l3mdev_accept, then re-creates the topology with a VRF and runs the
+# VRF suite.
+ipv6_udp()
+{
+	local l3mdev_mode
+
+	# should not matter, but set to known state
+	set_sysctl net.ipv4.udp_early_demux=1
+
+	log_section "IPv6/UDP"
+	log_subsection "No VRF"
+	setup
+
+	# udp_l3mdev_accept should have no effect without a VRF; run the same
+	# tests with it disabled and then enabled to verify that.
+	for l3mdev_mode in "disabled=0" "enabled=1"; do
+		log_subsection "udp_l3mdev_accept ${l3mdev_mode%=*}"
+		set_sysctl net.ipv4.udp_l3mdev_accept=${l3mdev_mode#*=}
+		ipv6_udp_novrf
+	done
+
+	log_subsection "With VRF"
+	setup "yes"
+	ipv6_udp_vrf
+}
+
+################################################################################
+# IPv6 address bind
+
+# IPv6 address bind tests without a VRF: raw and TCP sockets are bound to
+# local addresses, with and without a prior device bind. Every case
+# expects nettest to exit 0.
+ipv6_addr_bind_novrf()
+{
+	# keep the loop/address variable out of the global scope, as the
+	# other test functions in this file do
+	local a
+
+	#
+	# raw socket
+	#
+	for a in ${NSA_IP6} ${NSA_LO_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -b
+		log_test_addr ${a} $? 0 "Raw socket bind to local address"
+
+		log_start
+		run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -d ${NSA_DEV} -b
+		log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
+	done
+
+	#
+	# tcp sockets
+	#
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -s -l ${a} -t1 -b
+	log_test_addr ${a} $? 0 "TCP socket bind to local address"
+
+	log_start
+	run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
+	log_test_addr ${a} $? 0 "TCP socket bind to local address after device bind"
+
+	# Sadly, the kernel allows binding a socket to a device and then
+	# binding to an address not on the device. So this test passes
+	# when it really should not
+	a=${NSA_LO_IP6}
+	log_start
+	show_hint "Tecnically should fail since address is not on device but kernel allows"
+	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
+	log_test_addr ${a} $? 0 "TCP socket bind to out of scope local address"
+}
+
+# IPv6 address bind tests with a VRF: raw and TCP sockets are bound to
+# local addresses after a VRF or enslaved-device bind. Addresses that are
+# on the loopback are expected to be rejected (exit status 1).
+ipv6_addr_bind_vrf()
+{
+	# keep the loop/address variable out of the global scope, as the
+	# other test functions in this file do
+	local a
+
+	#
+	# raw socket
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -d ${VRF} -b
+		log_test_addr ${a} $? 0 "Raw socket bind to local address after vrf bind"
+
+		log_start
+		run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -d ${NSA_DEV} -b
+		log_test_addr ${a} $? 0 "Raw socket bind to local address after device bind"
+	done
+
+	a=${NSA_LO_IP6}
+	log_start
+	show_hint "Address on loopback is out of VRF scope"
+	run_cmd nettest -6 -s -R -P ipv6-icmp -l ${a} -d ${VRF} -b
+	log_test_addr ${a} $? 1 "Raw socket bind to invalid local address after vrf bind"
+
+	#
+	# tcp sockets
+	#
+	# address on enslaved device is valid for the VRF or device in a VRF
+	for a in ${NSA_IP6} ${VRF_IP6}
+	do
+		log_start
+		run_cmd nettest -6 -s -l ${a} -d ${VRF} -t1 -b
+		log_test_addr ${a} $? 0 "TCP socket bind to local address with VRF bind"
+	done
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
+	log_test_addr ${a} $? 0 "TCP socket bind to local address with device bind"
+
+	# Sadly, the kernel allows binding a socket to a device and then
+	# binding to an address not on the device. The only restriction
+	# is that the address is valid in the L3 domain. So this test
+	# passes when it really should not
+	a=${VRF_IP6}
+	log_start
+	show_hint "Tecnically should fail since address is not on device but kernel allows"
+	run_cmd nettest -6 -s -l ${a} -I ${NSA_DEV} -t1 -b
+	log_test_addr ${a} $? 0 "TCP socket bind to VRF address with device bind"
+
+	a=${NSA_LO_IP6}
+	log_start
+	show_hint "Address on loopback out of scope for VRF"
+	run_cmd nettest -6 -s -l ${a} -d ${VRF} -t1 -b
+	log_test_addr ${a} $? 1 "TCP socket bind to invalid local address for VRF"
+
+	log_start
+	show_hint "Address on loopback out of scope for device in VRF"
+	run_cmd nettest -6 -s -l ${a} -d ${NSA_DEV} -t1 -b
+	log_test_addr ${a} $? 1 "TCP socket bind to invalid local address for device bind"
+
+}
+
+# Driver for the IPv6 address bind tests: one pass on the plain topology,
+# one pass on the topology re-created with a VRF.
+ipv6_addr_bind()
+{
+	log_section "IPv6 address binds"
+
+	# pass 1: setup is called without arguments
+	log_subsection "No VRF"
+	setup
+	ipv6_addr_bind_novrf
+
+	# pass 2: setup is called with "yes"
+	log_subsection "With VRF"
+	setup "yes"
+	ipv6_addr_bind_vrf
+}
+
+################################################################################
+# IPv6 runtime tests
+
+# Run-time test: delete the VRF device while nettest traffic is active.
+#
+#   $1 - description prefix used in the test names
+#   $2 - extra nettest arguments (appended after "-6")
+#
+# Every case starts a server and a client in the background, waits, deletes
+# ${VRF} and logs a fixed "0 0" result; the client's exit status is not
+# inspected. Each case except the last one re-runs setup to restore the
+# VRF for the next case.
+ipv6_rt()
+{
+	local desc="$1"
+	local varg="-6 $2"
+	local with_vrf="yes"
+	local a
+
+	#
+	# server tests
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+		run_cmd nettest ${varg} -s &
+		sleep 1
+		run_cmd_nsb nettest ${varg} -r ${a} &
+		sleep 3
+		run_cmd ip link del ${VRF}
+		sleep 1
+		log_test_addr ${a} 0 0 "${desc}, global server"
+
+		setup ${with_vrf}
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+		run_cmd nettest ${varg} -d ${VRF} -s &
+		sleep 1
+		run_cmd_nsb nettest ${varg} -r ${a} &
+		sleep 3
+		run_cmd ip link del ${VRF}
+		sleep 1
+		log_test_addr ${a} 0 0 "${desc}, VRF server"
+
+		setup ${with_vrf}
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+		run_cmd nettest ${varg} -d ${NSA_DEV} -s &
+		sleep 1
+		run_cmd_nsb nettest ${varg} -r ${a} &
+		sleep 3
+		run_cmd ip link del ${VRF}
+		sleep 1
+		log_test_addr ${a} 0 0 "${desc}, enslaved device server"
+
+		setup ${with_vrf}
+	done
+
+	#
+	# client test
+	#
+	log_start
+	run_cmd_nsb nettest ${varg} -s &
+	sleep 1
+	run_cmd nettest ${varg} -d ${VRF} -r ${NSB_IP6} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test 0 0 "${desc}, VRF client"
+
+	setup ${with_vrf}
+
+	log_start
+	run_cmd_nsb nettest ${varg} -s &
+	sleep 1
+	run_cmd nettest ${varg} -d ${NSA_DEV} -r ${NSB_IP6} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test 0 0 "${desc}, enslaved device client"
+
+	setup ${with_vrf}
+
+
+	#
+	# local address tests
+	#
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+		run_cmd nettest ${varg} -s &
+		sleep 1
+		run_cmd nettest ${varg} -d ${VRF} -r ${a} &
+		sleep 3
+		run_cmd ip link del ${VRF}
+		sleep 1
+		log_test_addr ${a} 0 0 "${desc}, global server, VRF client"
+
+		setup ${with_vrf}
+	done
+
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+		run_cmd nettest ${varg} -d ${VRF} -s &
+		sleep 1
+		run_cmd nettest ${varg} -d ${VRF} -r ${a} &
+		sleep 3
+		run_cmd ip link del ${VRF}
+		sleep 1
+		log_test_addr ${a} 0 0 "${desc}, VRF server and client"
+
+		setup ${with_vrf}
+	done
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd nettest ${varg} -s &
+	sleep 1
+	run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test_addr ${a} 0 0 "${desc}, global server, device client"
+
+	setup ${with_vrf}
+
+	log_start
+	run_cmd nettest ${varg} -d ${VRF} -s &
+	sleep 1
+	run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test_addr ${a} 0 0 "${desc}, VRF server, device client"
+
+	setup ${with_vrf}
+
+	# last case: no setup afterwards
+	log_start
+	run_cmd nettest ${varg} -d ${NSA_DEV} -s &
+	sleep 1
+	run_cmd nettest ${varg} -d ${NSA_DEV} -r ${a} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test_addr ${a} 0 0 "${desc}, device server, device client"
+}
+
+# Run-time test: delete the VRF device while a flood ping is active, once
+# with the ping coming in (started via run_cmd_nsb) and once with the ping
+# going out through ${VRF}. Both cases log a fixed "0 0" result; the ping
+# exit status is not inspected.
+ipv6_ping_rt()
+{
+	local with_vrf="yes"
+	local a
+
+	a=${NSA_IP6}
+	log_start
+	run_cmd_nsb ${ping6} -f ${a} &
+	sleep 3
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test_addr ${a} 0 0 "Device delete with active traffic - ping in"
+
+	setup ${with_vrf}
+
+	# The outbound ping targets ${NSB_IP6}; log the result against that
+	# address rather than the ${NSA_IP6} left over from the previous case.
+	a=${NSB_IP6}
+	log_start
+	run_cmd ${ping6} -f ${a} -I ${VRF} &
+	sleep 1
+	run_cmd ip link del ${VRF}
+	sleep 1
+	log_test_addr ${a} 0 0 "Device delete with active traffic - ping out"
+}
+
+# Driver for the IPv6 run-time tests. The VRF topology is rebuilt with
+# setup "yes" before each sub-test.
+ipv6_runtime()
+{
+	local rt_case
+
+	log_section "Run time tests - ipv6"
+
+	setup "yes"
+	ipv6_ping_rt
+
+	# each entry is "<description>|<nettest arguments>"
+	for rt_case in \
+		"TCP active socket|-n -1" \
+		"TCP passive socket|-i" \
+		"UDP active socket|-D -n -1"
+	do
+		setup "yes"
+		ipv6_rt "${rt_case%%|*}" "${rt_case#*|}"
+	done
+}
+
+################################################################################
+# netfilter blocking connections
+
+# For each of ${NSA_IP} and ${VRF_IP}: start a TCP nettest server, connect
+# to it via run_cmd_nsb and expect the client to fail (exit status 1).
+netfilter_tcp_reset()
+{
+	local a
+
+	for a in ${NSA_IP} ${VRF_IP}; do
+		log_start
+
+		# server in the background, one second to come up
+		run_cmd nettest -s &
+		sleep 1
+
+		run_cmd_nsb nettest -r ${a}
+		log_test_addr ${a} $? 1 "Global server, reject with TCP-reset on Rx"
+	done
+}
+
+# For each of ${NSA_IP} and ${VRF_IP}: start a nettest server, connect to
+# it via run_cmd_nsb and expect the client to fail (exit status 1).
+#
+#   $1 - socket type used in the test name; "UDP" adds -D to the nettest
+#        arguments, any other value adds nothing
+netfilter_icmp()
+{
+	local stype="$1"
+	local arg
+	local a
+
+	if [ "${stype}" = "UDP" ]; then
+		arg="-D"
+	fi
+
+	for a in ${NSA_IP} ${VRF_IP}; do
+		log_start
+
+		# server in the background, one second to come up
+		run_cmd nettest ${arg} -s &
+		sleep 1
+
+		run_cmd_nsb nettest ${arg} -r ${a}
+		log_test_addr ${a} $? 1 "Global ${stype} server, Rx reject icmp-port-unreach"
+	done
+}
+
+# IPv4 netfilter tests: install REJECT rules for port 12345 on INPUT and
+# verify that connections to a global server fail, first with tcp-reset
+# and then with icmp-port-unreachable (TCP and UDP).
+ipv4_netfilter()
+{
+	log_section "IPv4 Netfilter"
+	log_subsection "TCP reset"
+
+	setup "yes"
+	run_cmd iptables -A INPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset
+
+	netfilter_tcp_reset
+
+	log_start
+	log_subsection "ICMP unreachable"
+
+	log_start
+	run_cmd iptables -F
+	run_cmd iptables -A INPUT -p tcp --dport 12345 -j REJECT --reject-with icmp-port-unreachable
+	run_cmd iptables -A INPUT -p udp --dport 12345 -j REJECT --reject-with icmp-port-unreachable
+
+	netfilter_icmp "TCP"
+	netfilter_icmp "UDP"
+
+	log_start
+	# Flush through run_cmd like every other iptables call above, so the
+	# cleanup runs in the same context as the commands that added the rules.
+	run_cmd iptables -F
+}
+
+# For each of ${NSA_IP6} and ${VRF_IP6}: start an IPv6 TCP nettest server,
+# connect to it via run_cmd_nsb and expect the client to fail (exit
+# status 1).
+netfilter_tcp6_reset()
+{
+	local a
+
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+
+		# server in the background, one second to come up
+		run_cmd nettest -6 -s &
+		sleep 1
+
+		run_cmd_nsb nettest -6 -r ${a}
+		log_test_addr ${a} $? 1 "Global server, reject with TCP-reset on Rx"
+	done
+}
+
+# For each of ${NSA_IP6} and ${VRF_IP6}: start an IPv6 nettest server,
+# connect to it via run_cmd_nsb and expect the client to fail (exit
+# status 1).
+#
+#   $1 - socket type used in the test name; "UDP" adds -D to the nettest
+#        arguments, any other value adds nothing
+netfilter_icmp6()
+{
+	local stype="$1"
+	local arg
+	local a
+
+	if [ "${stype}" = "UDP" ]; then
+		arg="$arg -D"
+	fi
+
+	for a in ${NSA_IP6} ${VRF_IP6}; do
+		log_start
+
+		# server in the background, one second to come up
+		run_cmd nettest -6 -s ${arg} &
+		sleep 1
+
+		run_cmd_nsb nettest -6 ${arg} -r ${a}
+		log_test_addr ${a} $? 1 "Global ${stype} server, Rx reject icmp-port-unreach"
+	done
+}
+
+# IPv6 netfilter tests: install REJECT rules for port 12345 on INPUT and
+# verify that connections to a global server fail, first with tcp-reset
+# and then with icmp6-port-unreachable (TCP and UDP).
+ipv6_netfilter()
+{
+	log_section "IPv6 Netfilter"
+	log_subsection "TCP reset"
+
+	setup "yes"
+	run_cmd ip6tables -A INPUT -p tcp --dport 12345 -j REJECT --reject-with tcp-reset
+
+	netfilter_tcp6_reset
+
+	log_subsection "ICMP unreachable"
+
+	log_start
+	run_cmd ip6tables -F
+	run_cmd ip6tables -A INPUT -p tcp --dport 12345 -j REJECT --reject-with icmp6-port-unreachable
+	run_cmd ip6tables -A INPUT -p udp --dport 12345 -j REJECT --reject-with icmp6-port-unreachable
+
+	netfilter_icmp6 "TCP"
+	netfilter_icmp6 "UDP"
+
+	log_start
+	# Flush through run_cmd like every other ip6tables call above, so the
+	# cleanup runs in the same context as the commands that added the rules.
+	run_cmd ip6tables -F
+}
+
+################################################################################
+# specific use cases
+
+# VRF only.
+# ns-A device enslaved to bridge. Verify traffic with and without
+# br_netfilter module loaded. Repeat with SVI on bridge.
+# Use case: ${NSA_DEV} is enslaved to bridge br0 and br0 is put into the
+# VRF. IPv4/IPv6 pings are checked in both directions, first with
+# br_netfilter unloaded and then (if the module can be loaded) with it
+# loaded. The same checks are then repeated with a vlan device (br0.100)
+# on top of the bridge placed in the VRF instead of the bridge itself.
+use_case_br()
+{
+	setup "yes"
+
+	# move the addresses from ${NSA_DEV} onto a new bridge
+	setup_cmd ip link set ${NSA_DEV} down
+	setup_cmd ip addr del dev ${NSA_DEV} ${NSA_IP}/24
+	setup_cmd ip -6 addr del dev ${NSA_DEV} ${NSA_IP6}/64
+
+	setup_cmd ip link add br0 type bridge
+	setup_cmd ip addr add dev br0 ${NSA_IP}/24
+	setup_cmd ip -6 addr add dev br0 ${NSA_IP6}/64 nodad
+
+	setup_cmd ip li set ${NSA_DEV} master br0
+	setup_cmd ip li set ${NSA_DEV} up
+	setup_cmd ip li set br0 up
+	setup_cmd ip li set br0 vrf ${VRF}
+
+	rmmod br_netfilter 2>/dev/null
+	sleep 5 # DAD
+
+	run_cmd ip neigh flush all
+	run_cmd ping -c1 -w1 -I br0 ${NSB_IP}
+	log_test $? 0 "Bridge into VRF - IPv4 ping out"
+
+	run_cmd ip neigh flush all
+	run_cmd ${ping6} -c1 -w1 -I br0 ${NSB_IP6}
+	log_test $? 0 "Bridge into VRF - IPv6 ping out"
+
+	run_cmd ip neigh flush all
+	run_cmd_nsb ping -c1 -w1 ${NSA_IP}
+	log_test $? 0 "Bridge into VRF - IPv4 ping in"
+
+	run_cmd ip neigh flush all
+	run_cmd_nsb ${ping6} -c1 -w1 ${NSA_IP6}
+	log_test $? 0 "Bridge into VRF - IPv6 ping in"
+
+	# repeat with br_netfilter loaded; skipped when the module is not
+	# available
+	if modprobe br_netfilter; then
+		run_cmd ip neigh flush all
+		run_cmd ping -c1 -w1 -I br0 ${NSB_IP}
+		log_test $? 0 "Bridge into VRF with br_netfilter - IPv4 ping out"
+
+		run_cmd ip neigh flush all
+		run_cmd ${ping6} -c1 -w1 -I br0 ${NSB_IP6}
+		log_test $? 0 "Bridge into VRF with br_netfilter - IPv6 ping out"
+
+		run_cmd ip neigh flush all
+		run_cmd_nsb ping -c1 -w1 ${NSA_IP}
+		log_test $? 0 "Bridge into VRF with br_netfilter - IPv4 ping in"
+
+		run_cmd ip neigh flush all
+		run_cmd_nsb ${ping6} -c1 -w1 ${NSA_IP6}
+		log_test $? 0 "Bridge into VRF with br_netfilter - IPv6 ping in"
+	fi
+
+	# take the bridge out of the VRF and put a vlan on top of it into
+	# the VRF instead; ns-B gets the matching vlan device
+	setup_cmd ip li set br0 nomaster
+	setup_cmd ip li add br0.100 link br0 type vlan id 100
+	setup_cmd ip li set br0.100 vrf ${VRF} up
+	setup_cmd ip addr add dev br0.100 172.16.101.1/24
+	setup_cmd ip -6 addr add dev br0.100 2001:db8:101::1/64 nodad
+
+	setup_cmd_nsb ip li add vlan100 link ${NSB_DEV} type vlan id 100
+	setup_cmd_nsb ip addr add dev vlan100 172.16.101.2/24
+	setup_cmd_nsb ip -6 addr add dev vlan100 2001:db8:101::2/64 nodad
+	setup_cmd_nsb ip li set vlan100 up
+	sleep 1
+
+	rmmod br_netfilter 2>/dev/null
+
+	run_cmd ip neigh flush all
+	run_cmd ping -c1 -w1 -I br0.100 172.16.101.2
+	log_test $? 0 "Bridge vlan into VRF - IPv4 ping out"
+
+	run_cmd ip neigh flush all
+	run_cmd ${ping6} -c1 -w1 -I br0.100 2001:db8:101::2
+	log_test $? 0 "Bridge vlan into VRF - IPv6 ping out"
+
+	run_cmd ip neigh flush all
+	run_cmd_nsb ping -c1 -w1 172.16.101.1
+	log_test $? 0 "Bridge vlan into VRF - IPv4 ping in"
+
+	run_cmd ip neigh flush all
+	run_cmd_nsb ${ping6} -c1 -w1 2001:db8:101::1
+	log_test $? 0 "Bridge vlan into VRF - IPv6 ping in"
+
+	if modprobe br_netfilter; then
+		run_cmd ip neigh flush all
+		run_cmd ping -c1 -w1 -I br0.100 172.16.101.2
+		log_test $? 0 "Bridge vlan into VRF with br_netfilter - IPv4 ping out"
+
+		run_cmd ip neigh flush all
+		run_cmd ${ping6} -c1 -w1 -I br0.100 2001:db8:101::2
+		log_test $? 0 "Bridge vlan into VRF with br_netfilter - IPv6 ping out"
+
+		# The two "ping in" cases below used to reuse the names of the
+		# cases run without br_netfilter, which made the results
+		# indistinguishable in the log.
+		run_cmd ip neigh flush all
+		run_cmd_nsb ping -c1 -w1 172.16.101.1
+		log_test $? 0 "Bridge vlan into VRF with br_netfilter - IPv4 ping in"
+
+		run_cmd ip neigh flush all
+		run_cmd_nsb ${ping6} -c1 -w1 2001:db8:101::1
+		log_test $? 0 "Bridge vlan into VRF with br_netfilter - IPv6 ping in"
+	fi
+
+	setup_cmd ip li del br0 2>/dev/null
+	setup_cmd_nsb ip li del vlan100 2>/dev/null
+}
+
+# VRF only.
+# ns-A device is connected to both ns-B and ns-C on a single VRF but only has
+# LLA on the interfaces
+# Use case: ping the ${MCAST} address from ns-B and ns-C before and after
+# flapping each of the two ns-A interfaces; every ping is expected to
+# succeed.
+#
+# NOTE(review): ${MCAST} is assumed to be an IPv6 multicast address (the
+# targets carry a %<dev> scope and the sysctl set below is an IPv6 one), so
+# the pings use ${ping6} like the other IPv6 pings in this file - confirm
+# against the definition of MCAST.
+use_case_ping_lla_multi()
+{
+	setup_lla_only
+	# only want reply from ns-A
+	setup_cmd_nsb sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1
+	setup_cmd_nsc sysctl -qw net.ipv6.icmp.echo_ignore_multicast=1
+
+	log_start
+	run_cmd_nsb ${ping6} -c1 -w1 ${MCAST}%${NSB_DEV}
+	log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Pre cycle, ping out ns-B"
+
+	run_cmd_nsc ${ping6} -c1 -w1 ${MCAST}%${NSC_DEV}
+	log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Pre cycle, ping out ns-C"
+
+	# cycle/flap the first ns-A interface
+	setup_cmd ip link set ${NSA_DEV} down
+	setup_cmd ip link set ${NSA_DEV} up
+	sleep 1
+
+	log_start
+	run_cmd_nsb ${ping6} -c1 -w1 ${MCAST}%${NSB_DEV}
+	log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-B"
+	run_cmd_nsc ${ping6} -c1 -w1 ${MCAST}%${NSC_DEV}
+	log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV}, ping out ns-C"
+
+	# cycle/flap the second ns-A interface
+	setup_cmd ip link set ${NSA_DEV2} down
+	setup_cmd ip link set ${NSA_DEV2} up
+	sleep 1
+
+	log_start
+	run_cmd_nsb ${ping6} -c1 -w1 ${MCAST}%${NSB_DEV}
+	log_test_addr ${MCAST}%${NSB_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-B"
+	run_cmd_nsc ${ping6} -c1 -w1 ${MCAST}%${NSC_DEV}
+	log_test_addr ${MCAST}%${NSC_DEV} $? 0 "Post cycle ${NSA} ${NSA_DEV2}, ping out ns-C"
+}
+
+# Driver for the "specific use case" tests; each use case gets its own
+# subsection header in the log.
+use_cases()
+{
+	log_section "Use cases"
+
+	# bridge (and vlan on bridge) in a VRF
+	log_subsection "Device enslaved to bridge"
+	use_case_br
+
+	# multicast ping with several interfaces
+	log_subsection "Ping LLA with multiple interfaces"
+	use_case_ping_lla_multi
+}
+
+################################################################################
+# usage
+
+# Print the command-line help (script name followed by the option summary)
+# to stdout. The here-document delimiter is unquoted, so ${0##*/} expands to
+# the script's base name.
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -4 IPv4 tests only
+ -6 IPv6 tests only
+ -t <test> Test name/set to run
+ -p Pause on fail
+ -P Pause after each test
+ -v Be verbose
+EOF
+}
+
+################################################################################
+# main
+
+# Test sets selectable with -4 / -6 / -t; with no selection all three
+# lists are run.
+TESTS_IPV4="ipv4_ping ipv4_tcp ipv4_udp ipv4_bind ipv4_runtime ipv4_netfilter"
+TESTS_IPV6="ipv6_ping ipv6_tcp ipv6_udp ipv6_bind ipv6_runtime ipv6_netfilter"
+TESTS_OTHER="use_cases"
+
+PAUSE_ON_FAIL=no
+PAUSE=no
+
+while getopts :46t:pPvh o
+do
+	case $o in
+		4) TESTS=ipv4;;
+		6) TESTS=ipv6;;
+		t) TESTS=$OPTARG;;
+		p) PAUSE_ON_FAIL=yes;;
+		P) PAUSE=yes;;
+		v) VERBOSE=1;;
+		h) usage; exit 0;;
+		*) usage; exit 1;;
+	esac
+done
+
+# make sure we don't pause twice
+[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
+
+#
+# show user test config
+#
+if [ -z "$TESTS" ]; then
+	TESTS="$TESTS_IPV4 $TESTS_IPV6 $TESTS_OTHER"
+elif [ "$TESTS" = "ipv4" ]; then
+	TESTS="$TESTS_IPV4"
+elif [ "$TESTS" = "ipv6" ]; then
+	TESTS="$TESTS_IPV6"
+fi
+
+# A missing prerequisite is a skip, not a pass: report it with the
+# kselftest SKIP code (4) so the harness does not count the run as
+# successful. ksft_skip is honoured if the script already defines it.
+if ! command -v nettest >/dev/null; then
+	echo "'nettest' command not found; skipping tests"
+	exit ${ksft_skip:-4}
+fi
+
+declare -i nfail=0
+declare -i nsuccess=0
+
+for t in $TESTS
+do
+	case $t in
+	ipv4_ping|ping)  ipv4_ping;;
+	ipv4_tcp|tcp)    ipv4_tcp;;
+	ipv4_udp|udp)    ipv4_udp;;
+	ipv4_bind|bind)  ipv4_addr_bind;;
+	ipv4_runtime)    ipv4_runtime;;
+	ipv4_netfilter)  ipv4_netfilter;;
+
+	ipv6_ping|ping6) ipv6_ping;;
+	ipv6_tcp|tcp6)   ipv6_tcp;;
+	ipv6_udp|udp6)   ipv6_udp;;
+	ipv6_bind|bind6) ipv6_addr_bind;;
+	ipv6_runtime)    ipv6_runtime;;
+	ipv6_netfilter)  ipv6_netfilter;;
+
+	use_cases)       use_cases;;
+
+	# setup namespaces and config, but do not run any tests
+	setup)		 setup; exit 0;;
+	vrf_setup)	 setup "yes"; exit 0;;
+
+	# $TESTS holds the user's own selection at this point (it contains
+	# "help"), so list the known test names from the three sets instead
+	help)            echo "Test names: $TESTS_IPV4 $TESTS_IPV6 $TESTS_OTHER"; exit 0;;
+	esac
+done
+
+cleanup 2>/dev/null
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+# Propagate failures through the exit status so callers (e.g. the
+# kselftest runner) can detect them without parsing the output.
+if [ ${nfail} -ne 0 ]; then
+	exit 1
+fi
+
+exit 0
diff --git a/tools/testing/selftests/net/fib-onlink-tests.sh b/tools/testing/selftests/net/fib-onlink-tests.sh
new file mode 100755
index 000000000..c287b90b8
--- /dev/null
+++ b/tools/testing/selftests/net/fib-onlink-tests.sh
@@ -0,0 +1,505 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# IPv4 and IPv6 onlink tests
+
+# PAUSE_ON_FAIL may be preset from the environment; it defaults to "no"
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:-no}
+VERBOSE=0
+
+# Network interfaces
+# - odd in current namespace; even in peer ns
+declare -A NETIFS
+# default VRF
+NETIFS[p1]=veth1
+NETIFS[p2]=veth2
+NETIFS[p3]=veth3
+NETIFS[p4]=veth4
+# VRF
+NETIFS[p5]=veth5
+NETIFS[p6]=veth6
+NETIFS[p7]=veth7
+NETIFS[p8]=veth8
+
+# /24 network
+declare -A V4ADDRS
+V4ADDRS[p1]=169.254.1.1
+V4ADDRS[p2]=169.254.1.2
+V4ADDRS[p3]=169.254.3.1
+V4ADDRS[p4]=169.254.3.2
+V4ADDRS[p5]=169.254.5.1
+V4ADDRS[p6]=169.254.5.2
+V4ADDRS[p7]=169.254.7.1
+V4ADDRS[p8]=169.254.7.2
+
+# /64 network
+declare -A V6ADDRS
+V6ADDRS[p1]=2001:db8:101::1
+V6ADDRS[p2]=2001:db8:101::2
+V6ADDRS[p3]=2001:db8:301::1
+V6ADDRS[p4]=2001:db8:301::2
+V6ADDRS[p5]=2001:db8:501::1
+V6ADDRS[p6]=2001:db8:501::2
+V6ADDRS[p7]=2001:db8:701::1
+V6ADDRS[p8]=2001:db8:701::2
+
+# Test networks:
+# [1] = default table
+# [2] = VRF
+#
+# /32 host routes
+declare -A TEST_NET4
+TEST_NET4[1]=169.254.101
+TEST_NET4[2]=169.254.102
+# /128 host routes
+declare -A TEST_NET6
+TEST_NET6[1]=2001:db8:101
+TEST_NET6[2]=2001:db8:102
+
+# connected gateway
+declare -a CONGW
+CONGW[1]=169.254.1.254
+CONGW[2]=169.254.3.254
+CONGW[3]=169.254.5.254
+
+# recursive gateway
+declare -a RECGW4
+declare -a RECGW6
+RECGW4[1]=169.254.11.254
+RECGW4[2]=169.254.12.254
+RECGW6[1]=2001:db8:11::64
+RECGW6[2]=2001:db8:12::64
+
+# for v4 mapped to v6
+# (the declaration must name the array that is actually filled in below)
+declare -A TEST_NET4IN6
+TEST_NET4IN6[1]=10.1.1.254
+TEST_NET4IN6[2]=10.2.1.254
+
+# mcast address
+MCAST6=ff02::1
+
+
+PEER_NS=bart
+PEER_CMD="ip netns exec ${PEER_NS}"
+VRF=lisa
+VRF_TABLE=1101
+PBR_TABLE=101
+
+################################################################################
+# utilities
+
+# Record one test result.
+#   $1 - return code that was observed
+#   $2 - return code that was expected
+#   $3 - description printed in the result line
+# Bumps nsuccess or nfail; on failure optionally waits for the user when
+# PAUSE_ON_FAIL is "yes" ('q' quits the script with status 1).
+log_test()
+{
+	local got=$1 want=$2
+	local desc="$3"
+
+	if [ ${got} -eq ${want} ]; then
+		nsuccess=$((nsuccess+1))
+		printf " TEST: %-50s [ OK ]\n" "${desc}"
+		return
+	fi
+
+	nfail=$((nfail+1))
+	printf " TEST: %-50s [FAIL]\n" "${desc}"
+
+	[ "${PAUSE_ON_FAIL}" = "yes" ] || return 0
+
+	echo
+	echo "hit enter to continue, 'q' to quit"
+	read a
+	[ "$a" = "q" ] && exit 1
+}
+
+# Print a banner that opens a new test section; all arguments form the title.
+log_section()
+{
+	local rule="######################################################################"
+
+	echo
+	echo "${rule}"
+	echo "TEST SECTION: $*"
+	echo "${rule}"
+}
+
+# Print a smaller banner for a group of tests inside a section.
+log_subsection()
+{
+	printf "\n"
+	printf "%s\n" "#########################################"
+	echo "TEST SUBSECTION: $*"
+}
+
+# Run a command line and return its exit status.
+# All arguments are joined into one string and evaluated; stdout and stderr
+# are captured and only shown when VERBOSE is "1".
+run_cmd()
+{
+	local cmd="$*"
+	local out
+	local rc
+
+	if [ "$VERBOSE" = "1" ]; then
+		# print the command as data: a '%' or '\' in it must not be
+		# interpreted as a printf directive
+		printf " COMMAND: %s\n" "$cmd"
+	fi
+
+	out=$(eval $cmd 2>&1)
+	rc=$?
+	if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+		echo " $out"
+	fi
+
+	[ "$VERBOSE" = "1" ] && echo
+
+	return $rc
+}
+
+# Print the link-local (fe80...) IPv6 address of a device, without the
+# prefix length.
+#   $1 - device name
+# Returns 1 (and prints nothing) when the device has no such address.
+get_linklocal()
+{
+	local dev=$1
+	local addr
+
+	# brief output is "<dev> <state> <addr>/<len> ..."; addresses start
+	# at field 3
+	addr=$(ip -6 -br addr show dev ${dev} | \
+	awk '{
+		for (i = 3; i <= NF; ++i) {
+			if ($i ~ /^fe80/)
+				print $i
+		}
+	}'
+	)
+	# drop everything from the first '/' on (prefix length and any
+	# further matches)
+	addr=${addr/\/*}
+
+	[ -z "$addr" ] && return 1
+
+	echo $addr
+
+	return 0
+}
+
+################################################################################
+#
+
+# Build the test topology: peer namespace, VRF device, four veth pairs
+# (odd end local, even end in the peer namespace) and the IPv6 default
+# routes. Runs under 'set -e', so the first failing command ends the script.
+setup()
+{
+	local peer="ip -netns ${PEER_NS}"
+
+	echo
+	echo "########################################"
+	echo "Configuring interfaces"
+
+	set -e
+
+	# peer namespace with loopback up
+	ip netns add ${PEER_NS}
+	${peer} li set lo up
+
+	# VRF device and unreachable defaults in its table
+	ip li add ${VRF} type vrf table ${VRF_TABLE}
+	ip li set ${VRF} up
+	ip ro add table ${VRF_TABLE} unreachable default metric 8192
+	ip -6 ro add table ${VRF_TABLE} unreachable default metric 8192
+
+	# veth pairs p1/p2, p3/p4, p5/p6, p7/p8
+	for n in 1 3 5 7; do
+		ip li add ${NETIFS[p${n}]} type veth peer name ${NETIFS[p$((n + 1))]}
+	done
+
+	# p5 and p7 belong to the VRF
+	for n in 5 7; do
+		ip li set ${NETIFS[p${n}]} vrf ${VRF}
+	done
+
+	# local ends: up plus addresses
+	for n in 1 3 5 7; do
+		ip li set ${NETIFS[p${n}]} up
+		ip addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
+		ip addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
+	done
+
+	# peer ends: move into the namespace, then address them there
+	for n in 2 4 6 8; do
+		ip li set ${NETIFS[p${n}]} netns ${PEER_NS} up
+		${peer} addr add ${V4ADDRS[p${n}]}/24 dev ${NETIFS[p${n}]}
+		${peer} addr add ${V6ADDRS[p${n}]}/64 dev ${NETIFS[p${n}]} nodad
+	done
+
+	# IPv6 defaults via the ::64 address on the p3 and p7 networks
+	ip -6 ro add default via ${V6ADDRS[p3]/::[0-9]/::64}
+	ip -6 ro add table ${VRF_TABLE} default via ${V6ADDRS[p7]/::[0-9]/::64}
+
+	set +e
+}
+
+# Remove everything setup() creates. Deleting objects that do not exist is
+# expected, so those errors are discarded.
+cleanup()
+{
+	local victim
+
+	# make sure we start from a clean slate
+	ip netns del ${PEER_NS} 2>/dev/null
+
+	for n in 1 3 5 7
+	do
+		victim=${NETIFS[p${n}]}
+		ip link del ${victim} 2>/dev/null
+	done
+
+	ip link del ${VRF} 2>/dev/null
+
+	ip ro flush table ${VRF_TABLE}
+	ip -6 ro flush table ${VRF_TABLE}
+}
+
+################################################################################
+# IPv4 tests
+#
+
+# Add one IPv4 /32 onlink route and log whether the result was expected.
+#   $1 table  $2 prefix (without /32)  $3 gateway  $4 device (may be empty)
+#   $5 expected exit code of 'ip'      $6 test description
+run_ip()
+{
+	local table="$1" prefix="$2" gw="$3" dev="$4"
+	local exp_rc="$5" desc="$6"
+	local devspec=""
+
+	# dev arg may be empty
+	if [ -n "${dev}" ]; then
+		devspec="dev ${dev}"
+	fi
+
+	run_cmd ip ro add table "${table}" "${prefix}/32" via "${gw}" "${devspec}" onlink
+	log_test $? ${exp_rc} "${desc}"
+}
+
+# Add one IPv4 /32 multipath route with two nexthops and log the result.
+#   $1 table  $2 prefix (without /32)
+#   $3, $4    nexthop specs placed after "nexthop via" (gateway, device and
+#             optional "onlink"); they are word-split on purpose
+#   $5 expected exit code of 'ip'   $6 test description
+run_ip_mpath()
+{
+	local table="$1"
+	local prefix="$2"
+	local nh1="$3"
+	local nh2="$4"
+	local exp_rc="$5"
+	local desc="$6"
+
+	# There is no device parameter here: the device is part of each
+	# nexthop spec, so no "dev" handling is needed (and none must touch a
+	# caller's $dev).
+
+	run_cmd ip ro add table "${table}" "${prefix}"/32 \
+		nexthop via ${nh1} nexthop via ${nh2}
+	log_test $? ${exp_rc} "${desc}"
+}
+
+# IPv4 onlink route additions that are expected to succeed (exit code 0).
+# Each run_ip call passes: table, prefix, gateway, device, expected rc,
+# description. Each run_ip_mpath call passes: table, prefix, two nexthop
+# specs, expected rc, description.
+valid_onlink_ipv4()
+{
+ # - unicast connected, unicast recursive
+ #
+ log_subsection "default VRF - main table"
+
+ run_ip 254 ${TEST_NET4[1]}.1 ${CONGW[1]} ${NETIFS[p1]} 0 "unicast connected"
+ run_ip 254 ${TEST_NET4[1]}.2 ${RECGW4[1]} ${NETIFS[p1]} 0 "unicast recursive"
+
+ log_subsection "VRF ${VRF}"
+
+ run_ip ${VRF_TABLE} ${TEST_NET4[2]}.1 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected"
+ run_ip ${VRF_TABLE} ${TEST_NET4[2]}.2 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive"
+
+ log_subsection "VRF device, PBR table"
+
+ run_ip ${PBR_TABLE} ${TEST_NET4[2]}.3 ${CONGW[3]} ${NETIFS[p5]} 0 "unicast connected"
+ run_ip ${PBR_TABLE} ${TEST_NET4[2]}.4 ${RECGW4[2]} ${NETIFS[p5]} 0 "unicast recursive"
+
+ # multipath version
+ #
+ log_subsection "default VRF - main table - multipath"
+
+ # onlink on both nexthops
+ run_ip_mpath 254 ${TEST_NET4[1]}.5 \
+ "${CONGW[1]} dev ${NETIFS[p1]} onlink" \
+ "${CONGW[2]} dev ${NETIFS[p3]} onlink" \
+ 0 "unicast connected - multipath"
+
+ run_ip_mpath 254 ${TEST_NET4[1]}.6 \
+ "${RECGW4[1]} dev ${NETIFS[p1]} onlink" \
+ "${RECGW4[2]} dev ${NETIFS[p3]} onlink" \
+ 0 "unicast recursive - multipath"
+
+ # NOTE(review): in the next two calls the nexthop carrying "onlink" is
+ # the opposite of what the description says ("first only" has it on the
+ # second nexthop and vice versa) - confirm which one is intended
+ run_ip_mpath 254 ${TEST_NET4[1]}.7 \
+ "${CONGW[1]} dev ${NETIFS[p1]}" \
+ "${CONGW[2]} dev ${NETIFS[p3]} onlink" \
+ 0 "unicast connected - multipath onlink first only"
+
+ run_ip_mpath 254 ${TEST_NET4[1]}.8 \
+ "${CONGW[1]} dev ${NETIFS[p1]} onlink" \
+ "${CONGW[2]} dev ${NETIFS[p3]}" \
+ 0 "unicast connected - multipath onlink second only"
+}
+
+# IPv4 onlink route additions that are expected to be rejected; every call
+# expects exit code 2 from 'ip'.
+invalid_onlink_ipv4()
+{
+ # gateway is one of our own addresses
+ run_ip 254 ${TEST_NET4[1]}.11 ${V4ADDRS[p1]} ${NETIFS[p1]} 2 \
+ "Invalid gw - local unicast address"
+
+ run_ip ${VRF_TABLE} ${TEST_NET4[2]}.11 ${V4ADDRS[p5]} ${NETIFS[p5]} 2 \
+ "Invalid gw - local unicast address, VRF"
+
+ # empty device argument: run_ip then passes no "dev ..." to ip
+ run_ip 254 ${TEST_NET4[1]}.101 ${V4ADDRS[p1]} "" 2 "No nexthop device given"
+
+ # gateway address is configured on a different device than the one given
+ run_ip 254 ${TEST_NET4[1]}.102 ${V4ADDRS[p3]} ${NETIFS[p1]} 2 \
+ "Gateway resolves to wrong nexthop device"
+
+ run_ip ${VRF_TABLE} ${TEST_NET4[2]}.103 ${V4ADDRS[p7]} ${NETIFS[p5]} 2 \
+ "Gateway resolves to wrong nexthop device - VRF"
+}
+
+################################################################################
+# IPv6 tests
+#
+
+# Add one IPv6 /128 onlink route and log whether the result was expected.
+#   $1 table  $2 prefix (without /128)  $3 gateway  $4 device (may be empty)
+#   $5 expected exit code of 'ip'       $6 test description
+run_ip6()
+{
+	local table="$1" prefix="$2" gw="$3" dev="$4"
+	local exp_rc="$5" desc="$6"
+	local devspec=""
+
+	# dev arg may be empty
+	if [ -n "${dev}" ]; then
+		devspec="dev ${dev}"
+	fi
+
+	run_cmd ip -6 ro add table "${table}" "${prefix}/128" via "${gw}" "${devspec}" onlink
+	log_test $? ${exp_rc} "${desc}"
+}
+
+# Add one IPv6 /128 multipath route with two nexthops and log the result.
+#   $1 table  $2 prefix (without /128)
+#   $3        route level options (e.g. "onlink"), may be empty
+#   $4, $5    nexthop specs placed after "nexthop via"; word-split on purpose
+#   $6 expected exit code of 'ip'   $7 test description
+run_ip6_mpath()
+{
+	local table="$1" prefix="$2" opts="$3"
+	local first_hop="$4" second_hop="$5"
+	local exp_rc="$6" desc="$7"
+
+	run_cmd ip -6 ro add table "${table}" "${prefix}/128" "${opts}" \
+		nexthop via ${first_hop} \
+		nexthop via ${second_hop}
+	log_test $? ${exp_rc} "${desc}"
+}
+
+# IPv6 onlink route additions that are expected to succeed (exit code 0).
+# run_ip6 arguments: table, prefix, gateway, device, expected rc, description.
+# run_ip6_mpath arguments: table, prefix, route options, two nexthop specs,
+# expected rc, description.
+# ${V6ADDRS[pN]/::*}::64 replaces the host part of the interface address
+# with ::64, giving a gateway on that interface's network.
+valid_onlink_ipv6()
+{
+ # - unicast connected, unicast recursive, v4-mapped
+ #
+ log_subsection "default VRF - main table"
+
+ run_ip6 254 ${TEST_NET6[1]}::1 ${V6ADDRS[p1]/::*}::64 ${NETIFS[p1]} 0 "unicast connected"
+ run_ip6 254 ${TEST_NET6[1]}::2 ${RECGW6[1]} ${NETIFS[p1]} 0 "unicast recursive"
+ run_ip6 254 ${TEST_NET6[1]}::3 ::ffff:${TEST_NET4IN6[1]} ${NETIFS[p1]} 0 "v4-mapped"
+
+ log_subsection "VRF ${VRF}"
+
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::1 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected"
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::2 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive"
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::3 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped"
+
+ log_subsection "VRF device, PBR table"
+
+ run_ip6 ${PBR_TABLE} ${TEST_NET6[2]}::4 ${V6ADDRS[p5]/::*}::64 ${NETIFS[p5]} 0 "unicast connected"
+ run_ip6 ${PBR_TABLE} ${TEST_NET6[2]}::5 ${RECGW6[2]} ${NETIFS[p5]} 0 "unicast recursive"
+ run_ip6 ${PBR_TABLE} ${TEST_NET6[2]}::6 ::ffff:${TEST_NET4IN6[2]} ${NETIFS[p5]} 0 "v4-mapped"
+
+ # multipath version
+ #
+ log_subsection "default VRF - main table - multipath"
+
+ # "onlink" given once as a route level option
+ run_ip6_mpath 254 ${TEST_NET6[1]}::4 "onlink" \
+ "${V6ADDRS[p1]/::*}::64 dev ${NETIFS[p1]}" \
+ "${V6ADDRS[p3]/::*}::64 dev ${NETIFS[p3]}" \
+ 0 "unicast connected - multipath onlink"
+
+ run_ip6_mpath 254 ${TEST_NET6[1]}::5 "onlink" \
+ "${RECGW6[1]} dev ${NETIFS[p1]}" \
+ "${RECGW6[2]} dev ${NETIFS[p3]}" \
+ 0 "unicast recursive - multipath onlink"
+
+ run_ip6_mpath 254 ${TEST_NET6[1]}::6 "onlink" \
+ "::ffff:${TEST_NET4IN6[1]} dev ${NETIFS[p1]}" \
+ "::ffff:${TEST_NET4IN6[2]} dev ${NETIFS[p3]}" \
+ 0 "v4-mapped - multipath onlink"
+
+ # "onlink" given per nexthop instead of as a route option
+ run_ip6_mpath 254 ${TEST_NET6[1]}::7 "" \
+ "${V6ADDRS[p1]/::*}::64 dev ${NETIFS[p1]} onlink" \
+ "${V6ADDRS[p3]/::*}::64 dev ${NETIFS[p3]} onlink" \
+ 0 "unicast connected - multipath onlink both nexthops"
+
+ run_ip6_mpath 254 ${TEST_NET6[1]}::8 "" \
+ "${V6ADDRS[p1]/::*}::64 dev ${NETIFS[p1]} onlink" \
+ "${V6ADDRS[p3]/::*}::64 dev ${NETIFS[p3]}" \
+ 0 "unicast connected - multipath onlink first only"
+
+ run_ip6_mpath 254 ${TEST_NET6[1]}::9 "" \
+ "${V6ADDRS[p1]/::*}::64 dev ${NETIFS[p1]}" \
+ "${V6ADDRS[p3]/::*}::64 dev ${NETIFS[p3]} onlink" \
+ 0 "unicast connected - multipath onlink second only"
+}
+
+# IPv6 onlink route additions that are expected to be rejected; every call
+# expects exit code 2 from 'ip'.
+# Returns 1 early, without logging any result, when a link-local address
+# cannot be determined for p1 or p5.
+invalid_onlink_ipv6()
+{
+ local lladdr
+
+ lladdr=$(get_linklocal ${NETIFS[p1]}) || return 1
+
+ run_ip6 254 ${TEST_NET6[1]}::11 ${V6ADDRS[p1]} ${NETIFS[p1]} 2 \
+ "Invalid gw - local unicast address"
+ run_ip6 254 ${TEST_NET6[1]}::12 ${lladdr} ${NETIFS[p1]} 2 \
+ "Invalid gw - local linklocal address"
+ # same ::12 prefix as the previous call; both additions are expected to fail
+ run_ip6 254 ${TEST_NET6[1]}::12 ${MCAST6} ${NETIFS[p1]} 2 \
+ "Invalid gw - multicast address"
+
+ # same three checks against the VRF device and table
+ lladdr=$(get_linklocal ${NETIFS[p5]}) || return 1
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::11 ${V6ADDRS[p5]} ${NETIFS[p5]} 2 \
+ "Invalid gw - local unicast address, VRF"
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::12 ${lladdr} ${NETIFS[p5]} 2 \
+ "Invalid gw - local linklocal address, VRF"
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::12 ${MCAST6} ${NETIFS[p5]} 2 \
+ "Invalid gw - multicast address, VRF"
+
+ # empty device argument: run_ip6 then passes no "dev ..." to ip
+ run_ip6 254 ${TEST_NET6[1]}::101 ${V6ADDRS[p1]} "" 2 \
+ "No nexthop device given"
+
+ # default VRF validation is done against LOCAL table
+ # run_ip6 254 ${TEST_NET6[1]}::102 ${V6ADDRS[p3]/::[0-9]/::64} ${NETIFS[p1]} 2 \
+ # "Gateway resolves to wrong nexthop device"
+
+ run_ip6 ${VRF_TABLE} ${TEST_NET6[2]}::103 ${V6ADDRS[p7]/::[0-9]/::64} ${NETIFS[p5]} 2 \
+ "Gateway resolves to wrong nexthop device - VRF"
+}
+
+# Run the valid and invalid onlink tests, first for IPv4 and then for IPv6.
+run_onlink_tests()
+{
+	local fam
+
+	for fam in 4 6
+	do
+		log_section "IPv${fam} onlink"
+		log_subsection "Valid onlink commands"
+		valid_onlink_ipv${fam}
+		log_subsection "Invalid onlink commands"
+		invalid_onlink_ipv${fam}
+	done
+}
+
+################################################################################
+# usage
+
+# Print the command line help to stdout.
+usage()
+{
+	printf '%s\n' \
+		"usage: ${0##*/} OPTS" \
+		"" \
+		" -p Pause on fail" \
+		" -v verbose mode (show commands and output)"
+}
+
+################################################################################
+# main
+
+nsuccess=0
+nfail=0
+
+# Only -p, -v and -h are implemented; anything else prints the usage text
+# and exits with status 1.
+while getopts :pvh o
+do
+	case $o in
+		p) PAUSE_ON_FAIL=yes;;
+		# the helpers test for VERBOSE = "1", so repeating -v must not
+		# count past 1
+		v) VERBOSE=1;;
+		h) usage; exit 0;;
+		*) usage; exit 1;;
+	esac
+done
+
+# start from a clean slate, run everything, then tear down again
+cleanup
+setup
+run_onlink_tests
+cleanup
+
+# TESTS is not set anywhere in this script; the summary is only suppressed
+# when the caller's environment provides TESTS=none
+if [ "$TESTS" != "none" ]; then
+	printf "\nTests passed: %3d\n" ${nsuccess}
+	printf "Tests failed: %3d\n" ${nfail}
+fi
diff --git a/tools/testing/selftests/net/fib_nexthop_multiprefix.sh b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh
new file mode 100755
index 000000000..b52d59547
--- /dev/null
+++ b/tools/testing/selftests/net/fib_nexthop_multiprefix.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Validate cached routes in a fib{6}_nh that is used by multiple prefixes.
+# Validate that a different exception is generated in h0 for each remote host.
+#
+# h1
+# /
+# h0 - r1 - h2
+# \
+# h3
+#
+# routing in h0 to hN is done with nexthop objects.
+
+PAUSE_ON_FAIL=no
+VERBOSE=0
+
+# Use a dedicated ping6 binary when there is one, plain ping otherwise.
+# 'command -v' is a shell builtin, so this does not depend on 'which'
+# being installed.
+if command -v ping6 > /dev/null 2>&1; then
+	ping6=$(command -v ping6)
+else
+	ping6=$(command -v ping)
+fi
+
+################################################################################
+# helpers
+
+# Record one test result.
+#   $1 - observed return code
+#   $2 - expected return code
+#   $3 - description printed in the result line
+# Updates nsuccess / nfail, sets ret=1 on failure and honours PAUSE_ON_FAIL.
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+	local verdict="[FAIL]"
+
+	if [ ${rc} -eq ${expected} ]; then
+		verdict="[ OK ]"
+		nsuccess=$((nsuccess+1))
+	else
+		ret=1
+		nfail=$((nfail+1))
+	fi
+
+	printf "TEST: %-60s %s\n" "${msg}" "${verdict}"
+
+	# interactive stop after a failure; 'q' ends the script
+	if [ "${verdict}" = "[FAIL]" ] && [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+		echo
+		echo "hit enter to continue, 'q' to quit"
+		read a
+		[ "$a" = "q" ] && exit 1
+	fi
+
+	[ "$VERBOSE" = "1" ] && echo
+}
+
+# Evaluate the given command line and return its exit status.
+# The command and its combined stdout/stderr are only shown when VERBOSE
+# is "1".
+run_cmd()
+{
+	local cmd="$*"
+	local output status
+	local chatty=no
+
+	[ "$VERBOSE" = "1" ] && chatty=yes
+
+	[ "${chatty}" = "yes" ] && echo "COMMAND: $cmd"
+
+	output=$(eval $cmd 2>&1)
+	status=$?
+
+	if [ "${chatty}" = "yes" ]; then
+		[ -n "$output" ] && echo "$output"
+		echo
+	fi
+
+	return $status
+}
+
+################################################################################
+# config
+
+# (Re)create a network namespace with loopback configured.
+#   $1 - namespace name; names starting with "h" get IPv6 forwarding
+#        disabled, names starting with "r" get IPv4 and IPv6 forwarding
+#        enabled.
+create_ns()
+{
+	local ns=${1}
+	local in_ns="ip netns exec ${ns}"
+
+	# drop a stale namespace from an earlier run, if any
+	ip netns del ${ns} 2>/dev/null
+
+	ip netns add ${ns}
+	ip -netns ${ns} addr add 127.0.0.1/8 dev lo
+	ip -netns ${ns} link set lo up
+
+	${in_ns} sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+
+	if [[ ${ns} == h* ]]; then
+		${in_ns} sysctl -q -w net.ipv6.conf.all.forwarding=0
+	elif [[ ${ns} == r* ]]; then
+		${in_ns} sysctl -q -w net.ipv4.ip_forward=1
+		${in_ns} sysctl -q -w net.ipv6.conf.all.forwarding=1
+	fi
+}
+
+# Build the topology from the file header: hosts h0..h3, each connected to
+# router r1 by a veth pair, with routes from h0 to h1-h3 going through
+# nexthop objects (id 4 for IPv4, id 6 for IPv6).
+setup()
+{
+ local ns
+ local i
+
+ #set -e
+
+ for ns in h0 r1 h1 h2 h3
+ do
+ create_ns ${ns}
+ done
+
+ #
+ # create interconnects
+ #
+
+ for i in 0 1 2 3
+ do
+ # veth pair is created inside h${i}; the r1h${i} end is then moved
+ # to r1 and renamed eth${i}
+ ip -netns h${i} li add eth0 type veth peer name r1h${i}
+ ip -netns h${i} li set eth0 up
+ ip -netns h${i} li set r1h${i} netns r1 name eth${i} up
+
+ # host side gets .1 / ::1, router side .254 / ::64
+ ip -netns h${i} addr add dev eth0 172.16.10${i}.1/24
+ ip -netns h${i} -6 addr add dev eth0 2001:db8:10${i}::1/64
+ ip -netns r1 addr add dev eth${i} 172.16.10${i}.254/24
+ ip -netns r1 -6 addr add dev eth${i} 2001:db8:10${i}::64/64
+ done
+
+ # nexthop objects in h0 pointing at r1
+ ip -netns h0 nexthop add id 4 via 172.16.100.254 dev eth0
+ ip -netns h0 nexthop add id 6 via 2001:db8:100::64 dev eth0
+
+ # routing from h0 to h1-h3 and back
+ for i in 1 2 3
+ do
+ ip -netns h0 ro add 172.16.10${i}.0/24 nhid 4
+ ip -netns h${i} ro add 172.16.100.0/24 via 172.16.10${i}.254
+
+ ip -netns h0 -6 ro add 2001:db8:10${i}::/64 nhid 6
+ ip -netns h${i} -6 ro add 2001:db8:100::/64 via 2001:db8:10${i}::64
+ done
+
+ # NOTE(review): the label says "host 1" but the commands dump h0
+ if [ "$VERBOSE" = "1" ]; then
+ echo
+ echo "host 1 config"
+ ip -netns h0 li sh
+ ip -netns h0 ro sh
+ ip -netns h0 -6 ro sh
+ fi
+
+ #set +e
+}
+
+# Delete every namespace used by this test; errors for namespaces that do
+# not exist are discarded.
+cleanup()
+{
+	for n in h0 r1 h{1..3}; do
+		ip netns del ${n} 2>/dev/null
+	done
+}
+
+# Set the MTU on both ends of the link between a host and the router.
+#   $1 - host id (N of hN)
+#   $2 - new MTU
+change_mtu()
+{
+	local hostid=$1 mtu=$2
+	local end
+
+	# "<namespace> <device>" for the host end, then the router end
+	for end in "h${hostid} eth0" "r1 eth${hostid}"
+	do
+		run_cmd ip -netns ${end% *} li set ${end#* } mtu ${mtu}
+	done
+}
+
+################################################################################
+# validate exceptions
+
+# Check that h0 holds a cached IPv4 route (exception) to host N with the
+# expected MTU.
+#   $1 - host id N (destination is 172.16.10N.1)
+#   $2 - MTU expected in the cached route
+#   $3 - ping payload size sent first to trigger the exception; "0" skips
+#        the ping and only checks the existing route
+validate_v4_exception()
+{
+	local i=$1
+	local mtu=$2
+	local ping_sz=$3
+	local dst="172.16.10${i}.1"
+	local rc
+
+	if [ "${ping_sz}" != "0" ]; then
+		run_cmd ip netns exec h0 ping -s ${ping_sz} -c5 -w5 ${dst}
+	fi
+
+	if [ "$VERBOSE" = "1" ]; then
+		echo "Route get"
+		ip -netns h0 ro get ${dst}
+		echo "Searching for:"
+		echo " cache .* mtu ${mtu}"
+		echo
+	fi
+
+	ip -netns h0 ro get ${dst} | \
+	grep -q "cache .* mtu ${mtu}"
+	rc=$?
+
+	log_test $rc 0 "IPv4: host 0 to host ${i}, mtu ${mtu}"
+}
+
+# Check that h0 holds an IPv6 route exception to host N with the expected
+# MTU, via r1 and sourced from h0's address.
+#   $1 - host id N (destination is 2001:db8:10N::1)
+#   $2 - MTU expected in the route
+#   $3 - ping payload size sent first to trigger the exception; "0" skips
+#        the ping
+validate_v6_exception()
+{
+	local i=$1
+	local mtu=$2
+	local ping_sz=$3
+	local dst="2001:db8:10${i}::1"
+	local h0=2001:db8:100::1
+	local r1=2001:db8:100::64
+	local wanted="${dst}.* via ${r1} dev eth0 src ${h0} .* mtu ${mtu}"
+	local rc
+
+	if [ ${ping_sz} != "0" ]; then
+		run_cmd ip netns exec h0 ${ping6} -s ${ping_sz} -c5 -w5 ${dst}
+	fi
+
+	if [ "$VERBOSE" = "1" ]; then
+		echo "Route get"
+		ip -netns h0 -6 ro get ${dst}
+		echo "Searching for:"
+		echo " ${wanted}"
+		echo
+	fi
+
+	ip -netns h0 -6 ro get ${dst} | grep -q "${wanted}"
+	rc=$?
+
+	log_test $rc 0 "IPv6: host 0 to host ${i}, mtu ${mtu}"
+}
+
+################################################################################
+# main
+
+while getopts :pv o
+do
+	case $o in
+		p) PAUSE_ON_FAIL=yes;;
+		v) VERBOSE=1;;
+	esac
+done
+
+cleanup
+setup
+sleep 2
+
+# Expand the online CPU list into individual ids. The sysfs file can hold a
+# single id ("0"), a range ("0-3") or a comma separated mix ("0-1,4-7");
+# each element is expanded on its own so that all three forms work.
+cpus=""
+for cpu_range in $(tr ',' ' ' < /sys/devices/system/cpu/online); do
+	cpus="${cpus} $(seq ${cpu_range%-*} ${cpu_range#*-})"
+done
+ret=0
+for i in 1 2 3
+do
+	# generate a cached route per-cpu
+	for c in ${cpus}; do
+		run_cmd taskset -c ${c} ip netns exec h0 ping -c1 -w1 172.16.10${i}.1
+		[ $? -ne 0 ] && printf "\nERROR: ping to h${i} failed\n" && ret=1
+
+		run_cmd taskset -c ${c} ip netns exec h0 ${ping6} -c1 -w1 2001:db8:10${i}::1
+		[ $? -ne 0 ] && printf "\nERROR: ping6 to h${i} failed\n" && ret=1
+
+		[ $ret -ne 0 ] && break
+	done
+	[ $ret -ne 0 ] && break
+done
+
+if [ $ret -eq 0 ]; then
+	# generate different exceptions in h0 for h1, h2 and h3
+	change_mtu 1 1300
+	validate_v4_exception 1 1300 1350
+	validate_v6_exception 1 1300 1350
+	echo
+
+	change_mtu 2 1350
+	validate_v4_exception 2 1350 1400
+	validate_v6_exception 2 1350 1400
+	echo
+
+	change_mtu 3 1400
+	validate_v4_exception 3 1400 1450
+	validate_v6_exception 3 1400 1450
+	echo
+
+	# re-check all three without new traffic (ping size 0): the earlier
+	# exceptions must still be present
+	validate_v4_exception 1 1300 0
+	validate_v6_exception 1 1300 0
+	echo
+
+	validate_v4_exception 2 1350 0
+	validate_v6_exception 2 1350 0
+	echo
+
+	validate_v4_exception 3 1400 0
+	validate_v6_exception 3 1400 0
+
+	# targeted deletes to trigger cleanup paths in kernel
+	ip -netns h0 ro del 172.16.102.0/24 nhid 4
+	ip -netns h0 -6 ro del 2001:db8:102::/64 nhid 6
+
+	ip -netns h0 nexthop del id 4
+	ip -netns h0 nexthop del id 6
+fi
+
+cleanup
+
+# log_test and the ping loop record failures in ret; report that to the
+# caller instead of the exit status of cleanup
+exit ${ret}
diff --git a/tools/testing/selftests/net/fib_nexthops.sh b/tools/testing/selftests/net/fib_nexthops.sh
new file mode 100755
index 000000000..7ece4131d
--- /dev/null
+++ b/tools/testing/selftests/net/fib_nexthops.sh
@@ -0,0 +1,1696 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# ns: me | ns: peer | ns: remote
+# 2001:db8:91::1 | 2001:db8:91::2 |
+# 172.16.1.1 | 172.16.1.2 |
+# veth1 <---|---> veth2 |
+# | veth5 <--|--> veth6 172.16.101.1
+# veth3 <---|---> veth4 | 2001:db8:101::1
+# 172.16.2.1 | 172.16.2.2 |
+# 2001:db8:92::1 | 2001:db8:92::2 |
+#
+# This test is for checking IPv4 and IPv6 FIB behavior with nexthop
+# objects. Device reference counts and network namespace cleanup tested
+# by use of network namespace for peer.
+
+# overall result of the script; log_test sets it to 1 on any failure
+ret=0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# all tests in this script. Can be overridden with -t option
+IPV4_TESTS="ipv4_fcnal ipv4_grp_fcnal ipv4_withv6_fcnal ipv4_fcnal_runtime"
+IPV4_TESTS="${IPV4_TESTS} ipv4_large_grp ipv4_compat_mode ipv4_fdb_grp_fcnal"
+IPV4_TESTS="${IPV4_TESTS} ipv4_torture"
+
+IPV6_TESTS="ipv6_fcnal ipv6_grp_fcnal ipv6_fcnal_runtime ipv6_large_grp"
+IPV6_TESTS="${IPV6_TESTS} ipv6_compat_mode ipv6_fdb_grp_fcnal ipv6_torture"
+
+ALL_TESTS="basic ${IPV4_TESTS} ${IPV6_TESTS}"
+TESTS="${ALL_TESTS}"
+
+# run-time switches
+VERBOSE=0
+PAUSE_ON_FAIL=no
+PAUSE=no
+
+# first id handed out by create_ns to a new namespace
+nsid=100
+
+################################################################################
+# utilities
+
+# Record one test result.
+#   $1 - observed return code
+#   $2 - expected return code
+#   $3 - description printed in the result line
+# Updates nsuccess / nfail, sets ret=1 on failure, and can stop for the
+# user after a failure (PAUSE_ON_FAIL) and after every test (PAUSE).
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+	local failed=yes
+	local gate
+
+	[ ${rc} -eq ${expected} ] && failed=no
+
+	if [ "${failed}" = "no" ]; then
+		printf "TEST: %-60s [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "TEST: %-60s [FAIL]\n" "${msg}"
+		[ "$VERBOSE" = "1" ] && echo " rc=$rc, expected $expected"
+	fi
+
+	# two independent stops, in this order: "failed and PAUSE_ON_FAIL",
+	# then "PAUSE"; each one prompts when both halves are "yes"
+	for gate in "${failed}:${PAUSE_ON_FAIL}" "yes:${PAUSE}"
+	do
+		[ "${gate}" = "yes:yes" ] || continue
+
+		echo
+		echo "hit enter to continue, 'q' to quit"
+		read a
+		[ "$a" = "q" ] && exit 1
+	done
+
+	[ "$VERBOSE" = "1" ] && echo
+}
+
+# Evaluate a command line and return its exit status.
+#   $1 - the complete command as one string
+# stdout is captured and only shown when VERBOSE is "1"; stderr is
+# discarded unless VERBOSE is "1".
+# The exit status is also left in the variable rc, which is not declared
+# local here; that is kept as is.
+run_cmd()
+{
+	local cmd="$1"
+	local out
+	local stderr="2>/dev/null"
+
+	if [ "$VERBOSE" = "1" ]; then
+		# print the command as data so that a '%' or '\' in it is not
+		# taken as a printf directive
+		printf "COMMAND: %s\n" "$cmd"
+		stderr=
+	fi
+
+	out=$(eval $cmd $stderr)
+	rc=$?
+	if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+		echo " $out"
+	fi
+
+	return $rc
+}
+
+# Print the link-local (fe80...) IPv6 address of a device, without the
+# prefix length.
+#   $1 - device name
+#   $2 - optional network namespace the device lives in
+# Returns 1 (and prints nothing) when no such address is found.
+get_linklocal()
+{
+	local dev=$1
+	local ns
+	local addr
+
+	if [ -n "$2" ]; then
+		ns="-netns $2"
+	fi
+
+	# brief output: addresses start at field 3
+	addr=$(ip $ns -6 -br addr show dev ${dev} | \
+	awk '{
+		for (i = 3; i <= NF; ++i) {
+			if ($i ~ /^fe80/)
+				print $i
+		}
+	}'
+	)
+	# cut at the first '/', i.e. drop the prefix length and what follows
+	addr=${addr%%/*}
+
+	if [ -z "$addr" ]; then
+		return 1
+	fi
+
+	echo $addr
+	return 0
+}
+
+# (Re)create a network namespace, assign it the next id from nsid, bring up
+# loopback and apply the sysctl settings the tests rely on.
+#   $1 - namespace name
+# Everything after the initial delete runs under 'set -e'.
+create_ns()
+{
+	local n=${1}
+	local knob
+
+	# drop a stale namespace from an earlier run, if any
+	ip netns del ${n} 2>/dev/null
+
+	set -e
+	ip netns add ${n}
+	ip netns set ${n} $((nsid++))
+	ip -netns ${n} addr add 127.0.0.1/8 dev lo
+	ip -netns ${n} link set lo up
+
+	for knob in \
+		net.ipv4.ip_forward=1 \
+		net.ipv4.fib_multipath_use_neigh=1 \
+		net.ipv4.conf.default.ignore_routes_with_linkdown=1 \
+		net.ipv6.conf.all.keep_addr_on_down=1 \
+		net.ipv6.conf.all.forwarding=1 \
+		net.ipv6.conf.default.forwarding=1 \
+		net.ipv6.conf.default.ignore_routes_with_linkdown=1 \
+		net.ipv6.conf.all.accept_dad=0 \
+		net.ipv6.conf.default.accept_dad=0
+	do
+		ip netns exec ${n} sysctl -qw ${knob}
+	done
+
+	set +e
+}
+
+# Build the topology from the file header: namespaces me, peer and remote,
+# veth1/veth3 in "me" paired with veth2/veth4 in "peer", and veth5 in
+# "remote" paired with veth6 in "peer".
+# Also sets the IP and BRIDGE command prefixes used by the tests.
+setup()
+{
+ cleanup
+
+ create_ns me
+ create_ns peer
+ create_ns remote
+
+ # all test commands run inside namespace "me"
+ IP="ip -netns me"
+ BRIDGE="bridge -netns me"
+ # from here on a failing command aborts the script
+ set -e
+ $IP li add veth1 type veth peer name veth2
+ $IP li set veth1 up
+ $IP addr add 172.16.1.1/24 dev veth1
+ $IP -6 addr add 2001:db8:91::1/64 dev veth1 nodad
+
+ $IP li add veth3 type veth peer name veth4
+ $IP li set veth3 up
+ $IP addr add 172.16.2.1/24 dev veth3
+ $IP -6 addr add 2001:db8:92::1/64 dev veth3 nodad
+
+ # peer ends of the two links
+ $IP li set veth2 netns peer up
+ ip -netns peer addr add 172.16.1.2/24 dev veth2
+ ip -netns peer -6 addr add 2001:db8:91::2/64 dev veth2 nodad
+
+ $IP li set veth4 netns peer up
+ ip -netns peer addr add 172.16.2.2/24 dev veth4
+ ip -netns peer -6 addr add 2001:db8:92::2/64 dev veth4 nodad
+
+ # remote namespace: veth5 here, veth6 moved to peer; routes back to the
+ # "me" networks go via peer (.2 / ::2)
+ ip -netns remote li add veth5 type veth peer name veth6
+ ip -netns remote li set veth5 up
+ ip -netns remote addr add dev veth5 172.16.101.1/24
+ ip -netns remote -6 addr add dev veth5 2001:db8:101::1/64 nodad
+ ip -netns remote ro add 172.16.0.0/22 via 172.16.101.2
+ ip -netns remote -6 ro add 2001:db8:90::/40 via 2001:db8:101::2
+
+ ip -netns remote li set veth6 netns peer up
+ ip -netns peer addr add dev veth6 172.16.101.2/24
+ ip -netns peer -6 addr add dev veth6 2001:db8:101::2/64 nodad
+ set +e
+}
+
+# Delete the three test namespaces; errors for missing ones are discarded.
+cleanup()
+{
+	local ns
+	local -a all_ns=(me peer remote)
+
+	for ns in "${all_ns[@]}"
+	do
+		ip netns del ${ns} 2>/dev/null
+	done
+}
+
+# Compare command output with the expected text.
+#   $1 - output that was produced
+#   $2 - output that was expected
+# Returns 0 on a match (either exact, or after collapsing all whitespace in
+# the produced output to single spaces) and 1 otherwise. Details of a
+# mismatch are printed only when VERBOSE is "1".
+check_output()
+{
+	local out="$1"
+	local expected="$2"
+	local rc=0
+
+	[ "${out}" = "${expected}" ] && return 0
+
+	# The texts below are always printed through %s: they come from ip
+	# output and test strings and must not be interpreted as printf
+	# format directives.
+	if [ -z "${out}" ]; then
+		if [ "$VERBOSE" = "1" ]; then
+			printf "\nNo entry found\n"
+			printf "Expected:\n"
+			printf " %s\n" "${expected}"
+		fi
+		return 1
+	fi
+
+	# collapse newlines and runs of blanks before the second comparison
+	out=$(echo ${out})
+	if [ "${out}" != "${expected}" ]; then
+		rc=1
+		if [ "${VERBOSE}" = "1" ]; then
+			printf " Unexpected entry. Have:\n"
+			printf " %s\n" "${out}"
+			printf " Expected:\n"
+			printf " %s\n\n" "${expected}"
+		else
+			echo " WARNING: Unexpected route entry"
+		fi
+	fi
+
+	return $rc
+}
+
+# Compare the output of "$IP nexthop ls <args>" with the expected text.
+#   $1 - arguments for 'nexthop ls' (word-split on purpose)
+#   $2 - expected output; empty means no entry is expected
+# Returns the result of check_output.
+check_nexthop()
+{
+	local nharg="$1"
+	local expected="$2"
+
+	check_output "$($IP nexthop ls ${nharg} 2>/dev/null)" "${expected}"
+}
+
+# Compare the output of "$IP route ls match <prefix>" with the expected
+# text.
+#   $1 - prefix to look up
+#   $2 - expected output
+# Returns the result of check_output.
+check_route()
+{
+	local pfx="$1"
+	local expected="$2"
+
+	check_output "$($IP route ls match ${pfx} 2>/dev/null)" "${expected}"
+}
+
+# Compare the output of "$IP -6 route ls match <prefix>" with the expected
+# text. The first "pref medium" on each line is removed before comparing.
+#   $1 - prefix to look up
+#   $2 - expected output
+# Returns the result of check_output.
+check_route6()
+{
+	local pfx="$1"
+	local expected="$2"
+	local listing
+
+	listing=$($IP -6 route ls match ${pfx} 2>/dev/null |
+		sed -e 's/pref medium//')
+
+	check_output "${listing}" "${expected}"
+}
+
+# Create many identical large ECMP nexthop groups and check that they can
+# be dumped.
+#   $1 - IP version: 4 selects 172.16.1.x gateways, anything else
+#        2001:db8:91::x
+#   $2 - number of nexthops per group
+check_large_grp()
+{
+ local ipv=$1
+ local ecmp=$2
+ local grpnum=100
+ local nhidstart=100
+ local grpidstart=1000
+ local iter=0
+ local nhidstr=""
+ local grpidstr=""
+ local grpstr=""
+ local ipstr=""
+
+ if [ $ipv -eq 4 ]; then
+ ipstr="172.16.1."
+ else
+ ipstr="2001:db8:91::"
+ fi
+
+ #
+ # Create $grpnum groups with specified $ecmp and dump them
+ #
+
+ # create nexthops with different gateways
+ # iter runs from 2 to $ecmp + 1, giving $ecmp nexthops with ids
+ # $nhidstart + iter and gateways $ipstr$iter
+ iter=2
+ while [ $iter -le $(($ecmp + 1)) ]
+ do
+ nhidstr="$(($nhidstart + $iter))"
+ run_cmd "$IP nexthop add id $nhidstr via $ipstr$iter dev veth1"
+ check_nexthop "id $nhidstr" "id $nhidstr via $ipstr$iter dev veth1 scope link"
+
+ # build the "id/id/.../id" member list; no trailing slash after the
+ # last nexthop
+ if [ $iter -le $ecmp ]; then
+ grpstr+="$nhidstr/"
+ else
+ grpstr+="$nhidstr"
+ fi
+ ((iter++))
+ done
+
+ # create duplicate large ecmp groups
+ # NOTE(review): iter runs from 0 through $grpnum inclusive, so this
+ # creates $grpnum + 1 groups - confirm whether that is intended
+ iter=0
+ while [ $iter -le $grpnum ]
+ do
+ grpidstr="$(($grpidstart + $iter))"
+ run_cmd "$IP nexthop add id $grpidstr group $grpstr"
+ check_nexthop "id $grpidstr" "id $grpidstr group $grpstr"
+ ((iter++))
+ done
+
+ # dump large groups
+ run_cmd "$IP nexthop list"
+ log_test $? 0 "Dump large (x$ecmp) ecmp groups"
+}
+
+# Start "$IP monitor <type>" in the background, writing to a temp file.
+#   $1 - monitor type
+# Prints "<pid> <tmpfile>" for use with stop_ip_monitor.
+start_ip_monitor()
+{
+	local mtype=$1
+
+	# start the monitor in the background
+	tmpfile=$(mktemp /var/run/nexthoptestXXX)
+	mpid=$( ($IP monitor $mtype > $tmpfile & echo $!) 2>/dev/null )
+
+	# short pause before the caller goes on to trigger events
+	sleep 0.2
+
+	echo "$mpid $tmpfile"
+}
+
+# Stop a monitor started by start_ip_monitor and check how much it logged.
+#   $1 - pid of the monitor
+#   $2 - file the monitor wrote to (removed afterwards)
+#   $3 - expected number of lines in that file
+# Returns 0 when the line count matches, non-zero otherwise.
+stop_ip_monitor()
+{
+	local mpid=$1 tmpfile=$2 el=$3
+
+	# check the monitor results
+	kill $mpid
+	lines=$(wc -l < $tmpfile)
+	[ $lines -eq $el ]
+	rc=$?
+	rm -rf $tmpfile
+
+	return $rc
+}
+
+# Check whether the installed 'ip' knows about fdb nexthops by looking for
+# "fdb" in the output of "$IP nexthop help".
+# Returns 0 when it does; otherwise prints a SKIP message and returns
+# $ksft_skip.
+check_nexthop_fdb_support()
+{
+	if $IP nexthop help 2>&1 | grep -q fdb; then
+		return 0
+	fi
+
+	echo "SKIP: iproute2 too old, missing fdb nexthop support"
+	return $ksft_skip
+}
+
+# Functional tests for IPv6 fdb nexthops and fdb nexthop groups, including
+# their use from a vxlan fdb entry.
+# Returns $ksft_skip when the installed 'ip' has no fdb nexthop support.
+# The numeric values passed to log_test are the exit codes expected from
+# the ip / bridge command that ran just before.
+ipv6_fdb_grp_fcnal()
+{
+	echo
+	echo "IPv6 fdb groups functional"
+	echo "--------------------------"
+
+	check_nexthop_fdb_support
+	if [ $? -eq $ksft_skip ]; then
+		return $ksft_skip
+	fi
+
+	# create group with multiple nexthops
+	run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 fdb"
+	run_cmd "$IP nexthop add id 62 via 2001:db8:91::3 fdb"
+	run_cmd "$IP nexthop add id 102 group 61/62 fdb"
+	check_nexthop "id 102" "id 102 group 61/62 fdb"
+	log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+	## get nexthop group
+	run_cmd "$IP nexthop get id 102"
+	check_nexthop "id 102" "id 102 group 61/62 fdb"
+	log_test $? 0 "Get Fdb nexthop group by id"
+
+	# fdb nexthop group can only contain fdb nexthops
+	run_cmd "$IP nexthop add id 63 via 2001:db8:91::4"
+	run_cmd "$IP nexthop add id 64 via 2001:db8:91::5"
+	run_cmd "$IP nexthop add id 103 group 63/64 fdb"
+	log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+	# Non fdb nexthop group can not contain fdb nexthops
+	run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 fdb"
+	run_cmd "$IP nexthop add id 66 via 2001:db8:91::6 fdb"
+	run_cmd "$IP nexthop add id 104 group 65/66"
+	log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+	# fdb nexthop cannot have blackhole
+	run_cmd "$IP nexthop add id 67 blackhole fdb"
+	log_test $? 2 "Fdb Nexthop with blackhole"
+
+	# fdb nexthop with oif
+	run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 dev veth1 fdb"
+	log_test $? 2 "Fdb Nexthop with oif"
+
+	# fdb nexthop with onlink
+	run_cmd "$IP nexthop add id 68 via 2001:db8:91::7 onlink fdb"
+	log_test $? 2 "Fdb Nexthop with onlink"
+
+	# fdb nexthop with encap
+	run_cmd "$IP nexthop add id 69 encap mpls 101 via 2001:db8:91::8 dev veth1 fdb"
+	log_test $? 2 "Fdb Nexthop with encap"
+
+	run_cmd "$IP link add name vx10 type vxlan id 1010 local 2001:db8:91::9 remote 2001:db8:91::10 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+	run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+	log_test $? 0 "Fdb mac add with nexthop group"
+
+	## fdb nexthops can only reference nexthop groups and not nexthops
+	run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 61 self"
+	log_test $? 255 "Fdb mac add with nexthop"
+
+	run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 66"
+	log_test $? 2 "Route add with fdb nexthop"
+
+	# 102 is the fdb group created above; id 103 was expected to fail at
+	# creation, so a route using it would not exercise the fdb group
+	# rejection
+	run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 102"
+	log_test $? 2 "Route add with fdb nexthop group"
+
+	run_cmd "$IP nexthop del id 61"
+	run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+	log_test $? 0 "Fdb entry after deleting a single nexthop"
+
+	run_cmd "$IP nexthop del id 102"
+	log_test $? 0 "Fdb nexthop delete"
+
+	run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+	log_test $? 254 "Fdb entry after deleting a nexthop group"
+
+	$IP link del dev vx10
+}
+
+# Functional tests for IPv4 fdb nexthops and fdb nexthop groups, including
+# their use from a vxlan fdb entry.
+# Returns $ksft_skip when the installed 'ip' has no fdb nexthop support.
+# The numeric values passed to log_test are the exit codes expected from
+# the ip / bridge command that ran just before.
+ipv4_fdb_grp_fcnal()
+{
+	echo
+	echo "IPv4 fdb groups functional"
+	echo "--------------------------"
+
+	check_nexthop_fdb_support
+	if [ $? -eq $ksft_skip ]; then
+		return $ksft_skip
+	fi
+
+	# create group with multiple nexthops
+	run_cmd "$IP nexthop add id 12 via 172.16.1.2 fdb"
+	run_cmd "$IP nexthop add id 13 via 172.16.1.3 fdb"
+	run_cmd "$IP nexthop add id 102 group 12/13 fdb"
+	check_nexthop "id 102" "id 102 group 12/13 fdb"
+	log_test $? 0 "Fdb Nexthop group with multiple nexthops"
+
+	# get nexthop group
+	run_cmd "$IP nexthop get id 102"
+	check_nexthop "id 102" "id 102 group 12/13 fdb"
+	log_test $? 0 "Get Fdb nexthop group by id"
+
+	# fdb nexthop group can only contain fdb nexthops
+	run_cmd "$IP nexthop add id 14 via 172.16.1.2"
+	run_cmd "$IP nexthop add id 15 via 172.16.1.3"
+	run_cmd "$IP nexthop add id 103 group 14/15 fdb"
+	log_test $? 2 "Fdb Nexthop group with non-fdb nexthops"
+
+	# Non fdb nexthop group can not contain fdb nexthops
+	# (the group must name the fdb nexthops created right here, as the
+	# IPv6 variant of this test does with 65/66)
+	run_cmd "$IP nexthop add id 16 via 172.16.1.2 fdb"
+	run_cmd "$IP nexthop add id 17 via 172.16.1.3 fdb"
+	run_cmd "$IP nexthop add id 104 group 16/17"
+	log_test $? 2 "Non-Fdb Nexthop group with fdb nexthops"
+
+	# fdb nexthop cannot have blackhole
+	run_cmd "$IP nexthop add id 18 blackhole fdb"
+	log_test $? 2 "Fdb Nexthop with blackhole"
+
+	# fdb nexthop with oif
+	# NOTE(review): ids 16 and 17 are reused in the next three tests
+	# although they were added above; the IPv6 variant uses fresh ids
+	# (68, 69) - confirm the failures come from the attribute under test
+	run_cmd "$IP nexthop add id 16 via 172.16.1.2 dev veth1 fdb"
+	log_test $? 2 "Fdb Nexthop with oif"
+
+	# fdb nexthop with onlink
+	run_cmd "$IP nexthop add id 16 via 172.16.1.2 onlink fdb"
+	log_test $? 2 "Fdb Nexthop with onlink"
+
+	# fdb nexthop with encap
+	run_cmd "$IP nexthop add id 17 encap mpls 101 via 172.16.1.2 dev veth1 fdb"
+	log_test $? 2 "Fdb Nexthop with encap"
+
+	run_cmd "$IP link add name vx10 type vxlan id 1010 local 10.0.0.1 remote 10.0.0.2 dstport 4789 nolearning noudpcsum tos inherit ttl 100"
+	run_cmd "$BRIDGE fdb add 02:02:00:00:00:13 dev vx10 nhid 102 self"
+	log_test $? 0 "Fdb mac add with nexthop group"
+
+	# fdb nexthops can only reference nexthop groups and not nexthops
+	run_cmd "$BRIDGE fdb add 02:02:00:00:00:14 dev vx10 nhid 12 self"
+	log_test $? 255 "Fdb mac add with nexthop"
+
+	# 16 is an fdb nexthop (id 15 was requested without "fdb")
+	run_cmd "$IP ro add 172.16.0.0/22 nhid 16"
+	log_test $? 2 "Route add with fdb nexthop"
+
+	# 102 is the fdb group created above; id 103 was expected to fail at
+	# creation
+	run_cmd "$IP ro add 172.16.0.0/22 nhid 102"
+	log_test $? 2 "Route add with fdb nexthop group"
+
+	run_cmd "$IP nexthop del id 12"
+	run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+	log_test $? 0 "Fdb entry after deleting a single nexthop"
+
+	run_cmd "$IP nexthop del id 102"
+	log_test $? 0 "Fdb nexthop delete"
+
+	run_cmd "$BRIDGE fdb get to 02:02:00:00:00:13 dev vx10 self"
+	log_test $? 254 "Fdb entry after deleting a nexthop group"
+
+	$IP link del dev vx10
+}
+
+################################################################################
+# basic operations (add, delete, replace) on nexthops and nexthop groups
+#
+# IPv6
+
+# Functional checks for standalone IPv6 nexthops: create/get/delete by id,
+# gateway and device validation, the onlink override, and removal of
+# nexthops when their device is set admin down.
+# Returns 1 early if the very first nexthop can not be created; each other
+# result is handed to log_test as "<actual status> <expected status> <name>".
+ipv6_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv6"
+ echo "----------------------"
+
+ run_cmd "$IP nexthop add id 52 via 2001:db8:91::2 dev veth1"
+ rc=$?
+ log_test $rc 0 "Create nexthop with id, gw, dev"
+ if [ $rc -ne 0 ]; then
+ echo "Basic IPv6 create fails; can not continue"
+ return 1
+ fi
+
+ run_cmd "$IP nexthop get id 52"
+ log_test $? 0 "Get nexthop by id"
+ check_nexthop "id 52" "id 52 via 2001:db8:91::2 dev veth1 scope link"
+
+ run_cmd "$IP nexthop del id 52"
+ log_test $? 0 "Delete nexthop by id"
+ check_nexthop "id 52" ""
+
+ #
+ # gw, device spec
+ #
+ # gw validation, no device - fails since dev required
+ run_cmd "$IP nexthop add id 52 via 2001:db8:92::3"
+ log_test $? 2 "Create nexthop - gw only"
+
+ # gw is not reachable through the given dev
+ run_cmd "$IP nexthop add id 53 via 2001:db8:3::3 dev veth1"
+ log_test $? 2 "Create nexthop - invalid gw+dev combination"
+
+ # onlink arg overrides gw+dev lookup
+ run_cmd "$IP nexthop add id 53 via 2001:db8:3::3 dev veth1 onlink"
+ log_test $? 0 "Create nexthop - gw+dev and onlink"
+
+ # admin down should delete nexthops
+ # The adds and the link-down below are setup, not test cases; errexit is
+ # enabled only around them (presumably so a failing setup step aborts the
+ # run - depends on run_cmd propagating the command status).
+ set -e
+ run_cmd "$IP -6 nexthop add id 55 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 56 via 2001:db8:91::4 dev veth1"
+ run_cmd "$IP nexthop add id 57 via 2001:db8:91::5 dev veth1"
+ run_cmd "$IP li set dev veth1 down"
+ set +e
+ check_nexthop "dev veth1" ""
+ log_test $? 0 "Nexthops removed on admin down"
+}
+
+# Functional checks for IPv6 nexthop groups: single and multi-member
+# groups, weighted groups, automatic group update when a member is deleted
+# or its device goes admin down, groups spanning two devices, and the
+# restrictions on nesting groups and on blackhole members.
+# Returns 1 if flushing the groups half-way through fails.
+ipv6_grp_fcnal()
+{
+ echo
+ echo "IPv6 groups functional"
+ echo "----------------------"
+
+ # basic functionality: create a nexthop group, default weight
+ run_cmd "$IP nexthop add id 61 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 101 group 61"
+ log_test $? 0 "Create nexthop group with single nexthop"
+
+ # get nexthop group
+ run_cmd "$IP nexthop get id 101"
+ log_test $? 0 "Get nexthop group by id"
+ check_nexthop "id 101" "id 101 group 61"
+
+ # delete nexthop group
+ run_cmd "$IP nexthop del id 101"
+ log_test $? 0 "Delete nexthop group by id"
+ check_nexthop "id 101" ""
+
+ $IP nexthop flush >/dev/null 2>&1
+ check_nexthop "id 101" ""
+
+ #
+ # create group with multiple nexthops - mix of gw and dev only
+ #
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ run_cmd "$IP nexthop add id 65 dev veth1"
+ run_cmd "$IP nexthop add id 102 group 62/63/64/65"
+ log_test $? 0 "Nexthop group with multiple nexthops"
+ check_nexthop "id 102" "id 102 group 62/63/64/65"
+
+ # Delete nexthop in a group and group is updated
+ run_cmd "$IP nexthop del id 63"
+ check_nexthop "id 102" "id 102 group 62/64/65"
+ log_test $? 0 "Nexthop group updated when entry is deleted"
+
+ # create group with multiple weighted nexthops
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 103 group 62/63,2/64,3/65,4"
+ log_test $? 0 "Nexthop group with weighted nexthops"
+ check_nexthop "id 103" "id 103 group 62/63,2/64,3/65,4"
+
+ # Delete nexthop in a weighted group and group is updated
+ run_cmd "$IP nexthop del id 63"
+ check_nexthop "id 103" "id 103 group 62/64,3/65,4"
+ log_test $? 0 "Weighted nexthop group updated when entry is deleted"
+
+ # admin down - nexthop is removed from group
+ run_cmd "$IP li set dev veth1 down"
+ check_nexthop "dev veth1" ""
+ log_test $? 0 "Nexthops in groups removed on admin down"
+
+ # expect groups to have been deleted as well
+ check_nexthop "" ""
+
+ run_cmd "$IP li set dev veth1 up"
+
+ $IP nexthop flush >/dev/null 2>&1
+
+ # group with nexthops using different devices
+ # (setup only, hence errexit around it)
+ set -e
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ run_cmd "$IP nexthop add id 65 via 2001:db8:91::5 dev veth1"
+
+ run_cmd "$IP nexthop add id 72 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 73 via 2001:db8:92::3 dev veth3"
+ run_cmd "$IP nexthop add id 74 via 2001:db8:92::4 dev veth3"
+ run_cmd "$IP nexthop add id 75 via 2001:db8:92::5 dev veth3"
+ set +e
+
+ # multiple groups with same nexthop
+ run_cmd "$IP nexthop add id 104 group 62"
+ run_cmd "$IP nexthop add id 105 group 62"
+ check_nexthop "group" "id 104 group 62 id 105 group 62"
+ log_test $? 0 "Multiple groups with same nexthop"
+
+ run_cmd "$IP nexthop flush groups"
+ [ $? -ne 0 ] && return 1
+
+ # on admin down of veth1, it should be removed from the group
+ run_cmd "$IP nexthop add id 105 group 62/63/72/73/64"
+ run_cmd "$IP li set veth1 down"
+ check_nexthop "id 105" "id 105 group 72/73"
+ log_test $? 0 "Nexthops in group removed on admin down - mixed group"
+
+ run_cmd "$IP nexthop add id 106 group 105/74"
+ log_test $? 2 "Nexthop group can not have a group as an entry"
+
+ # a group can have a blackhole entry only if it is the only
+ # nexthop in the group. Needed for atomic replace with an
+ # actual nexthop group
+ run_cmd "$IP -6 nexthop add id 31 blackhole"
+ run_cmd "$IP nexthop add id 107 group 31"
+ log_test $? 0 "Nexthop group with a blackhole entry"
+
+ # The second member must be a nexthop that exists (74, on veth3);
+ # otherwise the add is rejected for the missing id and the
+ # blackhole restriction is never exercised.
+ run_cmd "$IP nexthop add id 108 group 31/74"
+ log_test $? 2 "Nexthop group can not have a blackhole and another nexthop"
+}
+
+# Runtime checks for IPv6 routes that reference nexthop objects: route
+# add/delete by nhid, ping through single and multipath nexthops, blackhole
+# nexthops (standalone and as group member), device-only nexthops,
+# rejection of IPv4 gateways for IPv6 routes, and default routes combined
+# with an ip6tables rpfilter rule.
+# The steps build on the state left by earlier steps, so order matters.
+ipv6_fcnal_runtime()
+{
+ local rc
+
+ echo
+ echo "IPv6 functional runtime"
+ echo "-----------------------"
+
+ #
+ # IPv6 - the basics
+ #
+ run_cmd "$IP nexthop add id 81 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP ro add 2001:db8:101::1/128 nhid 81"
+ log_test $? 0 "Route add"
+
+ run_cmd "$IP ro delete 2001:db8:101::1/128 nhid 81"
+ log_test $? 0 "Route delete"
+
+ run_cmd "$IP ro add 2001:db8:101::1/128 nhid 81"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Ping with nexthop"
+
+ run_cmd "$IP nexthop add id 82 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 122 group 81/82"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 122"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Ping - multipath"
+
+ #
+ # IPv6 with blackhole nexthops
+ #
+ run_cmd "$IP -6 nexthop add id 83 blackhole"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 83"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 2 "Ping - blackhole"
+
+ run_cmd "$IP nexthop replace id 83 via 2001:db8:91::2 dev veth1"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Ping - blackhole replaced with gateway"
+
+ run_cmd "$IP -6 nexthop replace id 83 blackhole"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 2 "Ping - gateway replaced by blackhole"
+
+ # The group/blackhole checks only make sense if the multipath group
+ # itself forwards traffic, so verify that first and log a failure
+ # otherwise.
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 122"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ if [ $? -eq 0 ]; then
+ run_cmd "$IP nexthop replace id 122 group 83"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 2 "Ping - group with blackhole"
+
+ run_cmd "$IP nexthop replace id 122 group 81/82"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Ping - group blackhole replaced with gateways"
+ else
+ log_test 2 0 "Ping - multipath failed"
+ fi
+
+ #
+ # device only and gw + dev only mix
+ #
+ run_cmd "$IP -6 nexthop add id 85 dev veth1"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 85"
+ log_test $? 0 "IPv6 route with device only nexthop"
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 85 dev veth1 metric 1024"
+
+ run_cmd "$IP nexthop add id 123 group 81/85"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 123"
+ log_test $? 0 "IPv6 multipath route with nexthop mix - dev only + gw"
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 123 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop dev veth1 weight 1"
+
+ #
+ # IPv6 route with v4 nexthop - not allowed
+ #
+ run_cmd "$IP ro delete 2001:db8:101::1/128"
+ run_cmd "$IP nexthop add id 84 via 172.16.1.1 dev veth1"
+ run_cmd "$IP ro add 2001:db8:101::1/128 nhid 84"
+ log_test $? 2 "IPv6 route can not have a v4 gateway"
+
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 81"
+ run_cmd "$IP nexthop replace id 81 via 172.16.1.1 dev veth1"
+ log_test $? 2 "Nexthop replace - v6 route, v4 nexthop"
+
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 122"
+ run_cmd "$IP nexthop replace id 81 via 172.16.1.1 dev veth1"
+ log_test $? 2 "Nexthop replace of group entry - v6 route, v4 nexthop"
+
+ # Group 124 starts with one v6 and two v4 members; the route must be
+ # rejected until both v4 members are gone.
+ run_cmd "$IP nexthop add id 86 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 87 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 88 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 124 group 86/87/88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop del id 88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop del id 87"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 0 "IPv6 route using a group after removing v4 gateways"
+
+ # Same scenario again, but the v4 members are replaced by v6 ones
+ # instead of being deleted.
+ run_cmd "$IP ro delete 2001:db8:101::1/128"
+ run_cmd "$IP nexthop add id 87 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop add id 88 via 172.16.1.1 dev veth1"
+ run_cmd "$IP nexthop replace id 124 group 86/87/88"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop replace id 88 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 2 "IPv6 route can not have a group with v4 and v6 gateways"
+
+ run_cmd "$IP nexthop replace id 87 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP ro replace 2001:db8:101::1/128 nhid 124"
+ log_test $? 0 "IPv6 route using a group after replacing v4 gateways"
+
+ $IP nexthop flush >/dev/null 2>&1
+
+ #
+ # weird IPv6 cases
+ #
+ # NOTE(review): nexthop 81 was removed by the flush just above, so the
+ # route add below presumably fails; its status is not checked and the
+ # state is flushed again right after - confirm whether this is intended.
+ run_cmd "$IP nexthop add id 86 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP ro add 2001:db8:101::1/128 nhid 81"
+
+ # rpfilter and default route
+ $IP nexthop flush >/dev/null 2>&1
+ run_cmd "ip netns exec me ip6tables -t mangle -I PREROUTING 1 -m rpfilter --invert -j DROP"
+ run_cmd "$IP nexthop add id 91 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 92 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 93 group 91/92"
+ run_cmd "$IP -6 ro add default nhid 91"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Nexthop with default route and rpfilter"
+ run_cmd "$IP -6 ro replace default nhid 93"
+ run_cmd "ip netns exec me ping -c1 -w1 2001:db8:101::1"
+ log_test $? 0 "Nexthop with multipath default route and rpfilter"
+
+ # TO-DO:
+ # existing route with old nexthop; append route with new nexthop
+ # existing route with old nexthop; replace route with new
+ # existing route with new nexthop; replace route with old
+ # route with src address and using nexthop - not allowed
+}
+
+# Print the section banner, run the shared large-group check for address
+# family 6 with a 32-member group, then flush all nexthops.
+ipv6_large_grp()
+{
+ local ecmp=32
+
+ # banner: blank line, title, underline
+ printf '\n%s\n%s\n' "IPv6 large groups (x$ecmp)" "---------------------"
+
+ check_large_grp 6 $ecmp
+
+ # best-effort cleanup; output and errors are discarded
+ $IP nexthop flush 2>/dev/null 1>&2
+}
+
+# Endlessly delete and re-create IPv6 nexthop id 100 with all output
+# discarded. Never returns; meant to be run in the background and killed
+# by the caller.
+ipv6_del_add_loop1()
+{
+ {
+   while true; do
+     $IP nexthop del id 100
+     $IP nexthop add id 100 via 2001:db8:91::2 dev veth1
+   done
+ } >/dev/null 2>&1
+}
+
+# Endlessly replace nexthop group 102 with members 100/101, discarding all
+# output. Never returns; meant to be run in the background and killed by
+# the caller.
+ipv6_grp_replace_loop()
+{
+ {
+   while true; do
+     $IP nexthop replace id 102 group 100/101
+   done
+ } >/dev/null 2>&1
+}
+
+ipv6_torture()
+{
+ local pid1
+ local pid2
+ local pid3
+ local pid4
+ local pid5
+
+ echo
+ echo "IPv6 runtime torture"
+ echo "--------------------"
+ if [ ! -x "$(command -v mausezahn)" ]; then
+ echo "SKIP: Could not run test; need mausezahn tool"
+ return
+ fi
+
+ run_cmd "$IP nexthop add id 100 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 101 via 2001:db8:92::2 dev veth3"
+ run_cmd "$IP nexthop add id 102 group 100/101"
+ run_cmd "$IP route add 2001:db8:101::1 nhid 102"
+ run_cmd "$IP route add 2001:db8:101::2 nhid 102"
+
+ ipv6_del_add_loop1 &
+ pid1=$!
+ ipv6_grp_replace_loop &
+ pid2=$!
+ ip netns exec me ping -f 2001:db8:101::1 >/dev/null 2>&1 &
+ pid3=$!
+ ip netns exec me ping -f 2001:db8:101::2 >/dev/null 2>&1 &
+ pid4=$!
+ ip netns exec me mausezahn -6 veth1 -B 2001:db8:101::2 -A 2001:db8:91::1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 &
+ pid5=$!
+
+ sleep 300
+ kill -9 $pid1 $pid2 $pid3 $pid4 $pid5
+
+ # if we did not crash, success
+ log_test 0 0 "IPv6 torture test"
+}
+
+
+ipv4_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv4 functional"
+ echo "----------------------"
+
+ #
+ # basic IPv4 ops - add, get, delete
+ #
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1"
+ rc=$?
+ log_test $rc 0 "Create nexthop with id, gw, dev"
+ if [ $rc -ne 0 ]; then
+ echo "Basic IPv4 create fails; can not continue"
+ return 1
+ fi
+
+ run_cmd "$IP nexthop get id 12"
+ log_test $? 0 "Get nexthop by id"
+ check_nexthop "id 12" "id 12 via 172.16.1.2 dev veth1 scope link"
+
+ run_cmd "$IP nexthop del id 12"
+ log_test $? 0 "Delete nexthop by id"
+ check_nexthop "id 52" ""
+
+ #
+ # gw, device spec
+ #
+ # gw validation, no device - fails since dev is required
+ run_cmd "$IP nexthop add id 12 via 172.16.2.3"
+ log_test $? 2 "Create nexthop - gw only"
+
+ # gw not reachable through given dev
+ run_cmd "$IP nexthop add id 13 via 172.16.3.2 dev veth1"
+ log_test $? 2 "Create nexthop - invalid gw+dev combination"
+
+ # onlink flag overrides gw+dev lookup
+ run_cmd "$IP nexthop add id 13 via 172.16.3.2 dev veth1 onlink"
+ log_test $? 0 "Create nexthop - gw+dev and onlink"
+
+ # admin down should delete nexthops
+ set -e
+ run_cmd "$IP nexthop add id 15 via 172.16.1.3 dev veth1"
+ run_cmd "$IP nexthop add id 16 via 172.16.1.4 dev veth1"
+ run_cmd "$IP nexthop add id 17 via 172.16.1.5 dev veth1"
+ run_cmd "$IP li set dev veth1 down"
+ set +e
+ check_nexthop "dev veth1" ""
+ log_test $? 0 "Nexthops removed on admin down"
+
+ # nexthop route delete warning: route add with nhid and delete
+ # using device
+ run_cmd "$IP li set dev veth1 up"
+ run_cmd "$IP nexthop add id 12 via 172.16.1.3 dev veth1"
+ out1=`dmesg | grep "WARNING:.*fib_nh_match.*" | wc -l`
+ run_cmd "$IP route add 172.16.101.1/32 nhid 12"
+ run_cmd "$IP route delete 172.16.101.1/32 dev veth1"
+ out2=`dmesg | grep "WARNING:.*fib_nh_match.*" | wc -l`
+ [ $out1 -eq $out2 ]
+ rc=$?
+ log_test $rc 0 "Delete nexthop route warning"
+ run_cmd "$IP route delete 172.16.101.1/32 nhid 12"
+ run_cmd "$IP nexthop del id 12"
+
+ run_cmd "$IP nexthop add id 21 via 172.16.1.6 dev veth1"
+ run_cmd "$IP ro add 172.16.101.0/24 nhid 21"
+ run_cmd "$IP ro del 172.16.101.0/24 nexthop via 172.16.1.7 dev veth1 nexthop via 172.16.1.8 dev veth1"
+ log_test $? 2 "Delete multipath route with only nh id based entry"
+
+ run_cmd "$IP nexthop add id 22 via 172.16.1.6 dev veth1"
+ run_cmd "$IP ro add 172.16.102.0/24 nhid 22"
+ run_cmd "$IP ro del 172.16.102.0/24 dev veth1"
+ log_test $? 2 "Delete route when specifying only nexthop device"
+
+ run_cmd "$IP ro del 172.16.102.0/24 via 172.16.1.6"
+ log_test $? 2 "Delete route when specifying only gateway"
+
+ run_cmd "$IP ro del 172.16.102.0/24"
+ log_test $? 0 "Delete route when not specifying nexthop attributes"
+}
+
+# Functional checks for IPv4 nexthop groups: single and multi-member
+# groups, weighted groups, automatic group update when a member is deleted
+# or its device goes admin down, groups spanning two devices, and the
+# restrictions on nesting groups and on blackhole members.
+# Returns 1 if flushing the groups half-way through fails.
+ipv4_grp_fcnal()
+{
+ local rc
+
+ echo
+ echo "IPv4 groups functional"
+ echo "----------------------"
+
+ # basic functionality: create a nexthop group, default weight
+ run_cmd "$IP nexthop add id 11 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 101 group 11"
+ log_test $? 0 "Create nexthop group with single nexthop"
+
+ # get nexthop group
+ run_cmd "$IP nexthop get id 101"
+ log_test $? 0 "Get nexthop group by id"
+ check_nexthop "id 101" "id 101 group 11"
+
+ # delete nexthop group
+ run_cmd "$IP nexthop del id 101"
+ log_test $? 0 "Delete nexthop group by id"
+ check_nexthop "id 101" ""
+
+ $IP nexthop flush >/dev/null 2>&1
+
+ #
+ # create group with multiple nexthops
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1"
+ run_cmd "$IP nexthop add id 14 via 172.16.1.4 dev veth1"
+ run_cmd "$IP nexthop add id 15 via 172.16.1.5 dev veth1"
+ run_cmd "$IP nexthop add id 102 group 12/13/14/15"
+ log_test $? 0 "Nexthop group with multiple nexthops"
+ check_nexthop "id 102" "id 102 group 12/13/14/15"
+
+ # Delete nexthop in a group and group is updated
+ run_cmd "$IP nexthop del id 13"
+ check_nexthop "id 102" "id 102 group 12/14/15"
+ log_test $? 0 "Nexthop group updated when entry is deleted"
+
+ # create group with multiple weighted nexthops
+ run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1"
+ run_cmd "$IP nexthop add id 103 group 12/13,2/14,3/15,4"
+ log_test $? 0 "Nexthop group with weighted nexthops"
+ check_nexthop "id 103" "id 103 group 12/13,2/14,3/15,4"
+
+ # Delete nexthop in a weighted group and group is updated
+ run_cmd "$IP nexthop del id 13"
+ check_nexthop "id 103" "id 103 group 12/14,3/15,4"
+ log_test $? 0 "Weighted nexthop group updated when entry is deleted"
+
+ # admin down - nexthop is removed from group
+ run_cmd "$IP li set dev veth1 down"
+ check_nexthop "dev veth1" ""
+ log_test $? 0 "Nexthops in groups removed on admin down"
+
+ # expect groups to have been deleted as well
+ check_nexthop "" ""
+
+ run_cmd "$IP li set dev veth1 up"
+
+ $IP nexthop flush >/dev/null 2>&1
+
+ # group with nexthops using different devices
+ # (setup only, hence errexit around it)
+ set -e
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 13 via 172.16.1.3 dev veth1"
+ run_cmd "$IP nexthop add id 14 via 172.16.1.4 dev veth1"
+ run_cmd "$IP nexthop add id 15 via 172.16.1.5 dev veth1"
+
+ run_cmd "$IP nexthop add id 22 via 172.16.2.2 dev veth3"
+ run_cmd "$IP nexthop add id 23 via 172.16.2.3 dev veth3"
+ run_cmd "$IP nexthop add id 24 via 172.16.2.4 dev veth3"
+ run_cmd "$IP nexthop add id 25 via 172.16.2.5 dev veth3"
+ set +e
+
+ # multiple groups with same nexthop
+ run_cmd "$IP nexthop add id 104 group 12"
+ run_cmd "$IP nexthop add id 105 group 12"
+ check_nexthop "group" "id 104 group 12 id 105 group 12"
+ log_test $? 0 "Multiple groups with same nexthop"
+
+ run_cmd "$IP nexthop flush groups"
+ [ $? -ne 0 ] && return 1
+
+ # on admin down of veth1, it should be removed from the group
+ run_cmd "$IP nexthop add id 105 group 12/13/22/23/14"
+ run_cmd "$IP li set veth1 down"
+ check_nexthop "id 105" "id 105 group 22/23"
+ log_test $? 0 "Nexthops in group removed on admin down - mixed group"
+
+ run_cmd "$IP nexthop add id 106 group 105/24"
+ log_test $? 2 "Nexthop group can not have a group as an entry"
+
+ # a group can have a blackhole entry only if it is the only
+ # nexthop in the group. Needed for atomic replace with an
+ # actual nexthop group
+ run_cmd "$IP nexthop add id 31 blackhole"
+ run_cmd "$IP nexthop add id 107 group 31"
+ log_test $? 0 "Nexthop group with a blackhole entry"
+
+ # id 24 is the veth3 nexthop created above, so this is rejected
+ # because of the blackhole member rather than a missing id
+ run_cmd "$IP nexthop add id 108 group 31/24"
+ log_test $? 2 "Nexthop group can not have a blackhole and another nexthop"
+}
+
+# Checks IPv4 routes whose gateway is an IPv6 address: via a nexthop
+# object, via a mixed v4/v6 nexthop group, and via a plain
+# "via inet6 <addr>" route; a gateway that is not link-local is expected
+# to be rejected.
+ipv4_withv6_fcnal()
+{
+ local lladdr
+
+ # setup: the link-local address comes from get_linklocal (called with
+ # "veth2 peer"); errexit makes a failure here abort instead of running
+ # the tests with an empty address
+ set -e
+ lladdr=$(get_linklocal veth2 peer)
+ run_cmd "$IP nexthop add id 11 via ${lladdr} dev veth1"
+ set +e
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 11"
+ log_test $? 0 "IPv6 nexthop with IPv4 route"
+ check_route "172.16.101.1" "172.16.101.1 nhid 11 via inet6 ${lladdr} dev veth1"
+
+ set -e
+ run_cmd "$IP nexthop add id 12 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 101 group 11/12"
+ set +e
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 101"
+ # NOTE(review): same description as the single-nexthop case above even
+ # though this one covers the mixed v4/v6 group
+ log_test $? 0 "IPv6 nexthop with IPv4 route"
+
+ check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via inet6 ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+
+ run_cmd "$IP ro replace 172.16.101.1/32 via inet6 ${lladdr} dev veth1"
+ log_test $? 0 "IPv4 route with IPv6 gateway"
+ check_route "172.16.101.1" "172.16.101.1 via inet6 ${lladdr} dev veth1"
+
+ run_cmd "$IP ro replace 172.16.101.1/32 via inet6 2001:db8:50::1 dev veth1"
+ log_test $? 2 "IPv4 route with invalid IPv6 gateway"
+}
+
+ipv4_fcnal_runtime()
+{
+ local lladdr
+ local rc
+
+ echo
+ echo "IPv4 functional runtime"
+ echo "-----------------------"
+
+ run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1"
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 21"
+ log_test $? 0 "Route add"
+ check_route "172.16.101.1" "172.16.101.1 nhid 21 via 172.16.1.2 dev veth1"
+
+ run_cmd "$IP ro delete 172.16.101.1/32 nhid 21"
+ log_test $? 0 "Route delete"
+
+ #
+ # scope mismatch
+ #
+ run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1"
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 22 scope host"
+ log_test $? 2 "Route add - scope conflict with nexthop"
+
+ run_cmd "$IP nexthop replace id 22 dev veth3"
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 22 scope host"
+ run_cmd "$IP nexthop replace id 22 via 172.16.2.2 dev veth3"
+ log_test $? 2 "Nexthop replace with invalid scope for existing route"
+
+ #
+ # add route with nexthop and check traffic
+ #
+ run_cmd "$IP nexthop replace id 21 via 172.16.1.2 dev veth1"
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 21"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Basic ping"
+
+ run_cmd "$IP nexthop replace id 22 via 172.16.2.2 dev veth3"
+ run_cmd "$IP nexthop add id 122 group 21/22"
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 122"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Ping - multipath"
+
+ run_cmd "$IP ro delete 172.16.101.1/32 nhid 122"
+
+ #
+ # multiple default routes
+ # - tests fib_select_default
+ run_cmd "$IP nexthop add id 501 via 172.16.1.2 dev veth1"
+ run_cmd "$IP ro add default nhid 501"
+ run_cmd "$IP ro add default via 172.16.1.3 dev veth1 metric 20"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Ping - multiple default routes, nh first"
+
+ # flip the order
+ run_cmd "$IP ro del default nhid 501"
+ run_cmd "$IP ro del default via 172.16.1.3 dev veth1 metric 20"
+ run_cmd "$IP ro add default via 172.16.1.2 dev veth1 metric 20"
+ run_cmd "$IP nexthop replace id 501 via 172.16.1.3 dev veth1"
+ run_cmd "$IP ro add default nhid 501 metric 20"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Ping - multiple default routes, nh second"
+
+ run_cmd "$IP nexthop delete nhid 501"
+ run_cmd "$IP ro del default"
+
+ #
+ # IPv4 with blackhole nexthops
+ #
+ run_cmd "$IP nexthop add id 23 blackhole"
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 23"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 2 "Ping - blackhole"
+
+ run_cmd "$IP nexthop replace id 23 via 172.16.1.2 dev veth1"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Ping - blackhole replaced with gateway"
+
+ run_cmd "$IP nexthop replace id 23 blackhole"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 2 "Ping - gateway replaced by blackhole"
+
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 122"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ if [ $? -eq 0 ]; then
+ run_cmd "$IP nexthop replace id 122 group 23"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 2 "Ping - group with blackhole"
+
+ run_cmd "$IP nexthop replace id 122 group 21/22"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "Ping - group blackhole replaced with gateways"
+ else
+ log_test 2 0 "Ping - multipath failed"
+ fi
+
+ #
+ # device only and gw + dev only mix
+ #
+ run_cmd "$IP nexthop add id 85 dev veth1"
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 85"
+ log_test $? 0 "IPv4 route with device only nexthop"
+ check_route "172.16.101.1" "172.16.101.1 nhid 85 dev veth1"
+
+ run_cmd "$IP nexthop add id 123 group 21/85"
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 123"
+ log_test $? 0 "IPv4 multipath route with nexthop mix - dev only + gw"
+ check_route "172.16.101.1" "172.16.101.1 nhid 123 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop dev veth1 weight 1"
+
+ #
+ # IPv4 with IPv6
+ #
+ set -e
+ lladdr=$(get_linklocal veth2 peer)
+ run_cmd "$IP nexthop add id 24 via ${lladdr} dev veth1"
+ set +e
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 24"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "IPv6 nexthop with IPv4 route"
+
+ $IP neigh sh | grep -q "${lladdr} dev veth1"
+ if [ $? -eq 1 ]; then
+ echo " WARNING: Neigh entry missing for ${lladdr}"
+ $IP neigh sh | grep 'dev veth1'
+ fi
+
+ $IP neigh sh | grep -q "172.16.101.1 dev eth1"
+ if [ $? -eq 0 ]; then
+ echo " WARNING: Neigh entry exists for 172.16.101.1"
+ $IP neigh sh | grep 'dev veth1'
+ fi
+
+ set -e
+ run_cmd "$IP nexthop add id 25 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 101 group 24/25"
+ set +e
+ run_cmd "$IP ro replace 172.16.101.1/32 nhid 101"
+ log_test $? 0 "IPv4 route with mixed v4-v6 multipath route"
+
+ check_route "172.16.101.1" "172.16.101.1 nhid 101 nexthop via inet6 ${lladdr} dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "IPv6 nexthop with IPv4 route"
+
+ run_cmd "$IP ro replace 172.16.101.1/32 via inet6 ${lladdr} dev veth1"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "IPv4 route with IPv6 gateway"
+
+ $IP neigh sh | grep -q "${lladdr} dev veth1"
+ if [ $? -eq 1 ]; then
+ echo " WARNING: Neigh entry missing for ${lladdr}"
+ $IP neigh sh | grep 'dev veth1'
+ fi
+
+ $IP neigh sh | grep -q "172.16.101.1 dev eth1"
+ if [ $? -eq 0 ]; then
+ echo " WARNING: Neigh entry exists for 172.16.101.1"
+ $IP neigh sh | grep 'dev veth1'
+ fi
+
+ run_cmd "$IP ro del 172.16.101.1/32 via inet6 ${lladdr} dev veth1"
+ run_cmd "$IP -4 ro add default via inet6 ${lladdr} dev veth1"
+ run_cmd "ip netns exec me ping -c1 -w1 172.16.101.1"
+ log_test $? 0 "IPv4 default route with IPv6 gateway"
+
+ #
+ # MPLS as an example of LWT encap
+ #
+ run_cmd "$IP nexthop add id 51 encap mpls 101 via 172.16.1.2 dev veth1"
+ log_test $? 0 "IPv4 route with MPLS encap"
+ check_nexthop "id 51" "id 51 encap mpls 101 via 172.16.1.2 dev veth1 scope link"
+ log_test $? 0 "IPv4 route with MPLS encap - check"
+
+ run_cmd "$IP nexthop add id 52 encap mpls 102 via inet6 2001:db8:91::2 dev veth1"
+ log_test $? 0 "IPv4 route with MPLS encap and v6 gateway"
+ check_nexthop "id 52" "id 52 encap mpls 102 via 2001:db8:91::2 dev veth1 scope link"
+ log_test $? 0 "IPv4 route with MPLS encap, v6 gw - check"
+}
+
+# Print the section banner, run the shared large-group check for address
+# family 4 with a 32-member group, then flush all nexthops.
+ipv4_large_grp()
+{
+ local ecmp=32
+
+ # banner: blank line, title, underline
+ printf '\n%s\n%s\n' "IPv4 large groups (x$ecmp)" "---------------------"
+
+ check_large_grp 4 $ecmp
+
+ # best-effort cleanup; output and errors are discarded
+ $IP nexthop flush 2>/dev/null 1>&2
+}
+
+sysctl_nexthop_compat_mode_check()
+{
+ local sysctlname="net.ipv4.nexthop_compat_mode"
+ local lprefix=$1
+
+ IPE="ip netns exec me"
+
+ $IPE sysctl -q $sysctlname 2>&1 >/dev/null
+ if [ $? -ne 0 ]; then
+ echo "SKIP: kernel lacks nexthop compat mode sysctl control"
+ return $ksft_skip
+ fi
+
+ out=$($IPE sysctl $sysctlname 2>/dev/null)
+ log_test $? 0 "$lprefix default nexthop compat mode check"
+ check_output "${out}" "$sysctlname = 1"
+}
+
+sysctl_nexthop_compat_mode_set()
+{
+ local sysctlname="net.ipv4.nexthop_compat_mode"
+ local mode=$1
+ local lprefix=$2
+
+ IPE="ip netns exec me"
+
+ out=$($IPE sysctl -w $sysctlname=$mode)
+ log_test $? 0 "$lprefix set compat mode - $mode"
+ check_output "${out}" "net.ipv4.nexthop_compat_mode = $mode"
+}
+
+# Checks how the nexthop compat mode sysctl affects IPv6 route
+# notifications and dumps for a route using nexthop group 122: with compat
+# mode on the route is reported with expanded nexthops and group changes
+# are notified; with compat mode off neither is expected.
+# Returns $ksft_skip when the sysctl is not available. Compat mode is set
+# back to 1 at the end.
+ipv6_compat_mode()
+{
+ local rc
+
+ echo
+ echo "IPv6 nexthop api compat mode test"
+ echo "--------------------------------"
+
+ sysctl_nexthop_compat_mode_check "IPv6"
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 62/63"
+ # start_ip_monitor's output is kept and handed to stop_ip_monitor
+ # together with a number (presumably the expected count of
+ # notification lines - confirm against the helper)
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
+ # route add notification should contain expanded nexthops
+ stop_ip_monitor $ipmout 3
+ log_test $? 0 "IPv6 compat mode on - route add notification"
+
+ # route dump should contain expanded nexthops
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024 nexthop via 2001:db8:91::2 dev veth1 weight 1 nexthop via 2001:db8:91::3 dev veth1 weight 1"
+ log_test $? 0 "IPv6 compat mode on - route dump"
+
+ # change in nexthop group should generate route notification
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 62/64"
+ stop_ip_monitor $ipmout 3
+
+ log_test $? 0 "IPv6 compat mode on - nexthop change"
+
+ # set compat mode off
+ sysctl_nexthop_compat_mode_set 0 "IPv6"
+
+ run_cmd "$IP -6 ro del 2001:db8:101::1/128 nhid 122"
+
+ # NOTE(review): ids 62, 63 and 122 were created above and are not
+ # deleted in between, so these adds presumably fail; their status is
+ # not checked
+ run_cmd "$IP nexthop add id 62 via 2001:db8:91::2 dev veth1"
+ run_cmd "$IP nexthop add id 63 via 2001:db8:91::3 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 62/63"
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP -6 ro add 2001:db8:101::1/128 nhid 122"
+ # route add notification should not contain expanded nexthops
+ stop_ip_monitor $ipmout 1
+ log_test $? 0 "IPv6 compat mode off - route add notification"
+
+ # route dump should not contain expanded nexthops
+ check_route6 "2001:db8:101::1" "2001:db8:101::1 nhid 122 metric 1024"
+ log_test $? 0 "IPv6 compat mode off - route dump"
+
+ # change in nexthop group should not generate route notification
+ run_cmd "$IP nexthop add id 64 via 2001:db8:91::4 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 62/64"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv6 compat mode off - nexthop change"
+
+ # nexthop delete should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop del id 122"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv6 compat mode off - nexthop delete"
+
+ # set compat mode back on
+ sysctl_nexthop_compat_mode_set 1 "IPv6"
+}
+
+# Checks how the nexthop compat mode sysctl affects IPv4 route
+# notifications and dumps for a route using nexthop group 122: with compat
+# mode on the route is reported with expanded nexthops and group changes
+# are notified; with compat mode off neither is expected.
+# Returns $ksft_skip when the sysctl is not available. Compat mode is set
+# back to 1 at the end.
+ipv4_compat_mode()
+{
+ local rc
+
+ echo
+ echo "IPv4 nexthop api compat mode"
+ echo "----------------------------"
+
+ sysctl_nexthop_compat_mode_check "IPv4"
+ if [ $? -eq $ksft_skip ]; then
+ return $ksft_skip
+ fi
+
+ # both members use the same gateway, which is why the expected route
+ # dump below lists 172.16.1.2 twice
+ run_cmd "$IP nexthop add id 21 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 22 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 122 group 21/22"
+ # start_ip_monitor's output is kept and handed to stop_ip_monitor
+ # together with a number (presumably the expected count of
+ # notification lines - confirm against the helper)
+ ipmout=$(start_ip_monitor route)
+
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
+ stop_ip_monitor $ipmout 3
+
+ # route add notification should contain expanded nexthops
+ log_test $? 0 "IPv4 compat mode on - route add notification"
+
+ # route dump should contain expanded nexthops
+ check_route "172.16.101.1" "172.16.101.1 nhid 122 nexthop via 172.16.1.2 dev veth1 weight 1 nexthop via 172.16.1.2 dev veth1 weight 1"
+ log_test $? 0 "IPv4 compat mode on - route dump"
+
+ # change in nexthop group should generate route notification
+ run_cmd "$IP nexthop add id 23 via 172.16.1.3 dev veth1"
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 21/23"
+ stop_ip_monitor $ipmout 3
+ log_test $? 0 "IPv4 compat mode on - nexthop change"
+
+ sysctl_nexthop_compat_mode_set 0 "IPv4"
+
+ # cleanup
+ run_cmd "$IP ro del 172.16.101.1/32 nhid 122"
+
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP ro add 172.16.101.1/32 nhid 122"
+ stop_ip_monitor $ipmout 1
+ # route add notification should not contain expanded nexthops
+ log_test $? 0 "IPv4 compat mode off - route add notification"
+
+ # route dump should not contain expanded nexthops
+ check_route "172.16.101.1" "172.16.101.1 nhid 122"
+ log_test $? 0 "IPv4 compat mode off - route dump"
+
+ # change in nexthop group should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop replace id 122 group 21/22"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv4 compat mode off - nexthop change"
+
+ # nexthop delete should not generate route notification
+ ipmout=$(start_ip_monitor route)
+ run_cmd "$IP nexthop del id 122"
+ stop_ip_monitor $ipmout 0
+ log_test $? 0 "IPv4 compat mode off - nexthop delete"
+
+ sysctl_nexthop_compat_mode_set 1 "IPv4"
+}
+
+ipv4_del_add_loop1()
+{
+ while :; do
+ $IP nexthop del id 100
+ $IP nexthop add id 100 via 172.16.1.2 dev veth1
+ done >/dev/null 2>&1
+}
+
+# Endlessly replace nexthop group 102 with the members 100/101, discarding
+# all output. The loop never terminates on its own.
+ipv4_grp_replace_loop()
+{
+    {
+        while true; do
+            $IP nexthop replace id 102 group 100/101
+        done
+    } >/dev/null 2>&1
+}
+
+ipv4_torture()
+{
+ local pid1
+ local pid2
+ local pid3
+ local pid4
+ local pid5
+
+ echo
+ echo "IPv4 runtime torture"
+ echo "--------------------"
+ if [ ! -x "$(command -v mausezahn)" ]; then
+ echo "SKIP: Could not run test; need mausezahn tool"
+ return
+ fi
+
+ run_cmd "$IP nexthop add id 100 via 172.16.1.2 dev veth1"
+ run_cmd "$IP nexthop add id 101 via 172.16.2.2 dev veth3"
+ run_cmd "$IP nexthop add id 102 group 100/101"
+ run_cmd "$IP route add 172.16.101.1 nhid 102"
+ run_cmd "$IP route add 172.16.101.2 nhid 102"
+
+ ipv4_del_add_loop1 &
+ pid1=$!
+ ipv4_grp_replace_loop &
+ pid2=$!
+ ip netns exec me ping -f 172.16.101.1 >/dev/null 2>&1 &
+ pid3=$!
+ ip netns exec me ping -f 172.16.101.2 >/dev/null 2>&1 &
+ pid4=$!
+ ip netns exec me mausezahn veth1 -B 172.16.101.2 -A 172.16.1.1 -c 0 -t tcp "dp=1-1023, flags=syn" >/dev/null 2>&1 &
+ pid5=$!
+
+ sleep 300
+ kill -9 $pid1 $pid2 $pid3 $pid4 $pid5
+
+ # if we did not crash, success
+ log_test 0 0 "IPv4 torture test"
+}
+
+# Basic functional tests of the nexthop API: listing and lookup, creation
+# of single nexthops (device only, blackhole), error cases that must be
+# rejected, nexthop groups and flushing by protocol. Each step is run via
+# run_cmd and its exit status compared with the expected one by log_test
+# (0 = command accepted, 2 = command rejected). All nexthops are flushed at
+# the end.
+basic()
+{
+ echo
+ echo "Basic functional tests"
+ echo "----------------------"
+ run_cmd "$IP nexthop ls"
+ log_test $? 0 "List with nothing defined"
+
+ run_cmd "$IP nexthop get id 1"
+ log_test $? 2 "Nexthop get on non-existent id"
+
+ # attempt to create nh without a device or gw - fails
+ run_cmd "$IP nexthop add id 1"
+ log_test $? 2 "Nexthop with no device or gateway"
+
+ # attempt to create nh with down device - fails
+ $IP li set veth1 down
+ run_cmd "$IP nexthop add id 1 dev veth1"
+ log_test $? 2 "Nexthop with down device"
+
+ # create nh with linkdown device - fails
+ $IP li set veth1 up
+ ip -netns peer li set veth2 down
+ run_cmd "$IP nexthop add id 1 dev veth1"
+ log_test $? 2 "Nexthop with device that is linkdown"
+ ip -netns peer li set veth2 up
+
+ # device only
+ run_cmd "$IP nexthop add id 1 dev veth1"
+ log_test $? 0 "Nexthop with device only"
+
+ # create nh with duplicate id
+ run_cmd "$IP nexthop add id 1 dev veth3"
+ log_test $? 2 "Nexthop with duplicate id"
+
+ # blackhole nexthop
+ run_cmd "$IP nexthop add id 2 blackhole"
+ log_test $? 0 "Blackhole nexthop"
+
+ # blackhole nexthop can not have other specs
+ run_cmd "$IP nexthop replace id 2 blackhole dev veth1"
+ log_test $? 2 "Blackhole nexthop with other attributes"
+
+ #
+ # groups
+ #
+
+ run_cmd "$IP nexthop add id 101 group 1"
+ log_test $? 0 "Create group"
+
+ run_cmd "$IP nexthop add id 102 group 2"
+ log_test $? 0 "Create group with blackhole nexthop"
+
+ # multipath group can not have a blackhole as 1 path
+ run_cmd "$IP nexthop add id 103 group 1/2"
+ log_test $? 2 "Create multipath group where 1 path is a blackhole"
+
+ # multipath group can not have a member replaced by a blackhole
+ # (only the status of the last of the three commands is checked)
+ run_cmd "$IP nexthop replace id 2 dev veth3"
+ run_cmd "$IP nexthop replace id 102 group 1/2"
+ run_cmd "$IP nexthop replace id 2 blackhole"
+ log_test $? 2 "Multipath group can not have a member replaced by blackhole"
+
+ # attempt to create group with non-existent nexthop
+ run_cmd "$IP nexthop add id 103 group 12"
+ log_test $? 2 "Create group with non-existent nexthop"
+
+ # attempt to create group with same nexthop
+ run_cmd "$IP nexthop add id 103 group 1/1"
+ log_test $? 2 "Create group with same nexthop multiple times"
+
+ # replace nexthop with a group - fails
+ run_cmd "$IP nexthop replace id 2 group 1"
+ log_test $? 2 "Replace nexthop with nexthop group"
+
+ # replace nexthop group with a nexthop - fails
+ run_cmd "$IP nexthop replace id 101 dev veth1"
+ log_test $? 2 "Replace nexthop group with nexthop"
+
+ # nexthop group with other attributes fail
+ run_cmd "$IP nexthop add id 104 group 1 dev veth1"
+ log_test $? 2 "Nexthop group and device"
+
+ # Tests to ensure that flushing works as expected.
+ # NOTE(review): the check_nexthop results below are not passed to
+ # log_test; only the exit status of the final flush command is logged.
+ run_cmd "$IP nexthop add id 105 blackhole proto 99"
+ run_cmd "$IP nexthop add id 106 blackhole proto 100"
+ run_cmd "$IP nexthop add id 107 blackhole proto 99"
+ run_cmd "$IP nexthop flush proto 99"
+ check_nexthop "id 105" ""
+ check_nexthop "id 106" "id 106 blackhole proto 100"
+ check_nexthop "id 107" ""
+ run_cmd "$IP nexthop flush proto 100"
+ check_nexthop "id 106" ""
+
+ run_cmd "$IP nexthop flush proto 100"
+ log_test $? 0 "Test proto flush"
+
+ run_cmd "$IP nexthop add id 104 group 1 blackhole"
+ log_test $? 2 "Nexthop group and blackhole"
+
+ # leave no nexthops behind
+ $IP nexthop flush >/dev/null 2>&1
+}
+
+################################################################################
+# usage
+
+# Print the command line help to stdout. Only options that the getopts
+# loop of this script accepts are listed.
+usage()
+{
+    cat <<EOF
+usage: ${0##*/} OPTS
+
+ -t <test> Test(s) to run (default: all)
+ (options: $ALL_TESTS)
+ -4 IPv4 tests only
+ -6 IPv6 tests only
+ -p Pause on fail
+ -P Pause after each test before cleanup
+ -v verbose mode (show commands and output)
+ -h Show this help
+EOF
+}
+
+################################################################################
+# main
+
+while getopts :t:pP46hv o
+do
+ case $o in
+ t) TESTS=$OPTARG;;
+ 4) TESTS=${IPV4_TESTS};;
+ 6) TESTS=${IPV6_TESTS};;
+ p) PAUSE_ON_FAIL=yes;;
+ P) PAUSE=yes;;
+ v) VERBOSE=$(($VERBOSE + 1));;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+# make sure we don't pause twice
+[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
+
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip;
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+ip help 2>&1 | grep -q nexthop
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing nexthop command"
+ exit $ksft_skip
+fi
+
+out=$(ip nexthop ls 2>&1 | grep -q "Operation not supported")
+if [ $? -eq 0 ]; then
+ echo "SKIP: kernel lacks nexthop support"
+ exit $ksft_skip
+fi
+
+for t in $TESTS
+do
+ case $t in
+ none) IP="ip -netns peer"; setup; exit 0;;
+ *) setup; $t; cleanup;;
+ esac
+done
+
+if [ "$TESTS" != "none" ]; then
+ printf "\nTests passed: %3d\n" ${nsuccess}
+ printf "Tests failed: %3d\n" ${nfail}
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100755
index 000000000..a93e6b690
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,260 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+IP="ip -netns testns"
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV_ADDR6=2001:db8:1::1
+DEV=dummy0
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ nsuccess=$((nsuccess+1))
+ printf "\n TEST: %-50s [ OK ]\n" "${msg}"
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "\n TEST: %-50s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+# Print a banner that introduces a group of tests; all arguments form the
+# section title.
+log_section()
+{
+    local rule="######################################################################"
+
+    printf '\n%s\nTEST SECTION: %s\n%s\n' "${rule}" "$*" "${rule}"
+}
+
+setup()
+{
+ set -e
+ ip netns add testns
+ $IP link set dev lo up
+
+ $IP link add dummy0 type dummy
+ $IP link set dev dummy0 up
+ $IP address add $DEV_ADDR/24 dev dummy0
+ $IP -6 address add $DEV_ADDR6/64 dev dummy0
+
+ set +e
+}
+
+cleanup()
+{
+ $IP link del dev dummy0 &> /dev/null
+ ip netns del testns
+}
+
+# Check that the installed iproute2 knows the selectors a test needs.
+#   $1 - keyword that must appear in the "ip rule help" output
+#   $2 - keyword that must appear in the "ip route get help" output
+# Prints a SKIP message and returns 1 if either keyword is missing,
+# returns 0 otherwise.
+fib_check_iproute_support()
+{
+    local rule_kw="$1"
+    local route_kw="$2"
+
+    # patterns are quoted so that they reach grep as a single argument
+    if ! ip rule help 2>&1 | grep -q "${rule_kw}"; then
+        echo "SKIP: iproute2 iprule too old, missing ${rule_kw} match"
+        return 1
+    fi
+
+    if ! ip route get help 2>&1 | grep -q "${route_kw}"; then
+        echo "SKIP: iproute2 get route too old, missing ${route_kw} match"
+        return 1
+    fi
+
+    return 0
+}
+
+fib_rule6_del()
+{
+ $IP -6 rule del $1
+ log_test $? 0 "rule6 del $1"
+}
+
+# Delete an IPv6 rule by its preference.
+#   $1 - selector text of the rule as shown by "ip -6 rule show"
+# The preference is taken from the listing line that contains the selector
+# followed by "lookup $RTABLE". Returns the status of the delete command.
+fib_rule6_del_by_pref()
+{
+    local pref
+
+    # match against RTABLE, the table the rules of this script point to
+    pref=$($IP -6 rule show | grep "$1 lookup $RTABLE" | cut -d ":" -f 1)
+    $IP -6 rule del pref $pref
+}
+
+# Add an IPv6 rule that sends matching lookups to table $RTABLE, verify
+# that a route lookup for $GW_IP6 is answered from that table, then delete
+# the rule by preference.
+#   $1 - rule selector passed to "ip -6 rule add"
+#   $2 - matching selector passed to "ip -6 route get"
+# Further arguments are accepted but not used.
+fib_rule6_test_match_n_redirect()
+{
+    local rule_sel="$1"
+    local lookup_sel="$2"
+
+    $IP -6 rule add ${rule_sel} table $RTABLE
+    $IP -6 route get $GW_IP6 ${lookup_sel} | grep -q "table $RTABLE"
+    log_test $? 0 "rule6 check: ${rule_sel}"
+
+    fib_rule6_del_by_pref "${rule_sel}"
+    log_test $? 0 "rule6 del by pref: ${rule_sel}"
+}
+
+# IPv6 FIB rule tests: install a default route in table $RTABLE, then run
+# the add/check/delete cycle for each rule selector. Selectors that need a
+# recent iproute2 are only tested when fib_check_iproute_support succeeds.
+fib_rule6_test()
+{
+    # setup the fib rule redirect route
+    $IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+
+    match="oif $DEV"
+    fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+    match="from $SRC_IP6 iif $DEV"
+    fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+    match="tos 0x10"
+    fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+    # the rule keyword (fwmark) differs from the route get keyword (mark)
+    match="fwmark 0x64"
+    getmatch="mark 0x64"
+    fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+    if fib_check_iproute_support "uidrange" "uid"; then
+        match="uidrange 100-100"
+        getmatch="uid 100"
+        fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+    fi
+
+    if fib_check_iproute_support "sport" "sport"; then
+        match="sport 666 dport 777"
+        fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+    fi
+
+    if fib_check_iproute_support "ipproto" "ipproto"; then
+        match="ipproto tcp"
+        fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+    fi
+
+    if fib_check_iproute_support "ipproto" "ipproto"; then
+        match="ipproto ipv6-icmp"
+        fib_rule6_test_match_n_redirect "$match" "$match" "ipproto ipv6-icmp match"
+    fi
+}
+
+fib_rule4_del()
+{
+ $IP rule del $1
+ log_test $? 0 "del $1"
+}
+
+# Delete an IPv4 rule by its preference.
+#   $1 - selector text of the rule as shown by "ip rule show"
+# The preference is taken from the listing line that contains the selector
+# followed by "lookup $RTABLE". Returns the status of the delete command.
+fib_rule4_del_by_pref()
+{
+    local pref
+
+    # match against RTABLE, the table the rules of this script point to
+    pref=$($IP rule show | grep "$1 lookup $RTABLE" | cut -d ":" -f 1)
+    $IP rule del pref $pref
+}
+
+# Add an IPv4 rule that sends matching lookups to table $RTABLE, verify
+# that a route lookup for $GW_IP4 is answered from that table, then delete
+# the rule by preference.
+#   $1 - rule selector passed to "ip rule add"
+#   $2 - matching selector passed to "ip route get"
+# Further arguments are accepted but not used.
+fib_rule4_test_match_n_redirect()
+{
+    local rule_sel="$1"
+    local lookup_sel="$2"
+
+    $IP rule add ${rule_sel} table $RTABLE
+    $IP route get $GW_IP4 ${lookup_sel} | grep -q "table $RTABLE"
+    log_test $? 0 "rule4 check: ${rule_sel}"
+
+    fib_rule4_del_by_pref "${rule_sel}"
+    log_test $? 0 "rule4 del by pref: ${rule_sel}"
+}
+
+# IPv4 FIB rule tests: install a default route in table $RTABLE, then run
+# the add/check/delete cycle for each rule selector. Selectors that need a
+# recent iproute2 are only tested when fib_check_iproute_support succeeds.
+fib_rule4_test()
+{
+    # setup the fib rule redirect route
+    $IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+    match="oif $DEV"
+    fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+    # need enable forwarding and disable rp_filter temporarily as all the
+    # addresses are in the same subnet and egress device == ingress device.
+    ip netns exec testns sysctl -w net.ipv4.ip_forward=1
+    ip netns exec testns sysctl -w net.ipv4.conf.$DEV.rp_filter=0
+    match="from $SRC_IP iif $DEV"
+    fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+    ip netns exec testns sysctl -w net.ipv4.ip_forward=0
+
+    match="tos 0x10"
+    fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+    # the rule keyword (fwmark) differs from the route get keyword (mark)
+    match="fwmark 0x64"
+    getmatch="mark 0x64"
+    fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+    if fib_check_iproute_support "uidrange" "uid"; then
+        match="uidrange 100-100"
+        getmatch="uid 100"
+        fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+    fi
+
+    if fib_check_iproute_support "sport" "sport"; then
+        match="sport 666 dport 777"
+        fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+    fi
+
+    if fib_check_iproute_support "ipproto" "ipproto"; then
+        match="ipproto tcp"
+        fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+    fi
+
+    if fib_check_iproute_support "ipproto" "ipproto"; then
+        match="ipproto icmp"
+        fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+    fi
+}
+
+# Run the IPv4 rule tests followed by the IPv6 rule tests, each under its
+# own section banner.
+run_fibrule_tests()
+{
+    local _fibrule_family
+
+    for _fibrule_family in 4 6; do
+        log_section "IPv${_fibrule_family} fib rule"
+        fib_rule${_fibrule_family}_test
+    done
+}
+
+# Preconditions: root privileges and the ip tool. A missing precondition is
+# reported with exit code 4, the kselftest SKIP code, so that the run is
+# counted as skipped instead of passed.
+if [ "$(id -u)" -ne 0 ];then
+    echo "SKIP: Need root privileges"
+    exit 4
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+    echo "SKIP: Could not run test without ip tool"
+    exit 4
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+run_fibrule_tests
+cleanup
+
+if [ "$TESTS" != "none" ]; then
+    printf "\nTests passed: %3d\n" ${nsuccess}
+    printf "Tests failed: %3d\n" ${nfail}
+fi
+
+# 0 when every test passed, 1 when log_test recorded a failure
+exit $ret
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
new file mode 100755
index 000000000..168101637
--- /dev/null
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -0,0 +1,1841 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB behavior in response to
+# different events.
+
+# overall script status; set to 1 by log_test on the first failure
+ret=0
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# all tests in this script. Can be overridden with -t option
+TESTS="unregister down carrier nexthop suppress ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr"
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+PAUSE=no
+# command prefixes for running ip / arbitrary commands inside ns1
+IP="ip -netns ns1"
+NS_EXEC="ip netns exec ns1"
+
+# path of the IPv6 ping binary: ping6 when installed, plain ping otherwise
+ping6=$(command -v ping6) || ping6=$(command -v ping)
+
+# Record the outcome of one test.
+#   $1 - exit status that was observed
+#   $2 - exit status that was expected
+#   $3 - description printed in the result line
+# Updates the nsuccess/nfail counters and sets ret=1 on failure. With
+# PAUSE_ON_FAIL=yes the user is prompted after a failure, with PAUSE=yes
+# after every test; answering 'q' quits the script.
+log_test()
+{
+    local actual=$1
+    local wanted=$2
+    local desc="$3"
+
+    if [ ${actual} -eq ${wanted} ]; then
+        printf " TEST: %-60s [ OK ]\n" "${desc}"
+        nsuccess=$((nsuccess+1))
+    else
+        ret=1
+        nfail=$((nfail+1))
+        printf " TEST: %-60s [FAIL]\n" "${desc}"
+
+        # optional stop so the failed state can be inspected
+        if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+            echo
+            echo "hit enter to continue, 'q' to quit"
+            read a
+            [ "$a" = "q" ] && exit 1
+        fi
+    fi
+
+    # optional stop after every test, passed or failed
+    if [ "${PAUSE}" = "yes" ]; then
+        echo
+        echo "hit enter to continue, 'q' to quit"
+        read a
+        [ "$a" = "q" ] && exit 1
+    fi
+}
+
+setup()
+{
+ set -e
+ ip netns add ns1
+ ip netns set ns1 auto
+ $IP link set dev lo up
+ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ns1 sysctl -qw net.ipv6.conf.all.forwarding=1
+
+ $IP link add dummy0 type dummy
+ $IP link set dev dummy0 up
+ $IP address add 198.51.100.1/24 dev dummy0
+ $IP -6 address add 2001:db8:1::1/64 dev dummy0
+ set +e
+
+}
+
+# Remove dummy0 and the namespaces ns1 and ns2. Errors are ignored and
+# silenced, so the function can be called whatever state a test left.
+cleanup()
+{
+    local netns
+
+    ${IP} link del dev dummy0 &> /dev/null
+    for netns in ns1 ns2; do
+        ip netns del ${netns} &> /dev/null
+    done
+}
+
+get_linklocal()
+{
+ local dev=$1
+ local addr
+
+ addr=$($IP -6 -br addr show dev ${dev} | \
+ awk '{
+ for (i = 3; i <= NF; ++i) {
+ if ($i ~ /^fe80/)
+ print $i
+ }
+ }'
+ )
+ addr=${addr/\/*}
+
+ [ -z "$addr" ] && return 1
+
+ echo $addr
+
+ return 0
+}
+
+# Single path routes and device removal: the connected routes of dummy0
+# must be found before the device is deleted, and the lookups must fail
+# (exit status 2) afterwards.
+fib_unreg_unicast_test()
+{
+ echo
+ echo "Single path route test"
+
+ setup
+
+ echo " Start point"
+ $IP route get fibmatch 198.51.100.2 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ # a failing delete aborts the script instead of producing bogus results
+ set -e
+ $IP link del dev dummy0
+ set +e
+
+ echo " Nexthop device deleted"
+ $IP route get fibmatch 198.51.100.2 &> /dev/null
+ log_test $? 2 "IPv4 fibmatch - no route"
+ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+ log_test $? 2 "IPv6 fibmatch - no route"
+
+ cleanup
+}
+
+# Multipath routes and device removal: a two-nexthop route over dummy0 and
+# dummy1 is installed per family, then the devices are deleted one after
+# the other. The IPv4 route is expected to vanish with the first device,
+# the IPv6 route only with the second one.
+fib_unreg_multipath_test()
+{
+
+ echo
+ echo "Multipath route test"
+
+ setup
+
+ # second device plus one multipath route per address family
+ set -e
+ $IP link add dummy1 type dummy
+ $IP link set dev dummy1 up
+ $IP address add 192.0.2.1/24 dev dummy1
+ $IP -6 address add 2001:db8:2::1/64 dev dummy1
+
+ $IP route add 203.0.113.0/24 \
+ nexthop via 198.51.100.2 dev dummy0 \
+ nexthop via 192.0.2.2 dev dummy1
+ $IP -6 route add 2001:db8:3::/64 \
+ nexthop via 2001:db8:1::2 dev dummy0 \
+ nexthop via 2001:db8:2::2 dev dummy1
+ set +e
+
+ echo " Start point"
+ $IP route get fibmatch 203.0.113.1 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ set -e
+ $IP link del dev dummy0
+ set +e
+
+ echo " One nexthop device deleted"
+ $IP route get fibmatch 203.0.113.1 &> /dev/null
+ log_test $? 2 "IPv4 - multipath route removed on delete"
+
+ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
+ # In IPv6 we do not flush the entire multipath route.
+ log_test $? 0 "IPv6 - multipath down to single path"
+
+ set -e
+ $IP link del dev dummy1
+ set +e
+
+ echo " Second nexthop device deleted"
+ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
+ log_test $? 2 "IPv6 - no route"
+
+ cleanup
+}
+
+# Device unregister tests: single path first, then multipath.
+fib_unreg_test()
+{
+    local _unreg_subtest
+
+    for _unreg_subtest in fib_unreg_unicast_test fib_unreg_multipath_test; do
+        ${_unreg_subtest}
+    done
+}
+
+# Single path routes and admin down: the connected routes of dummy0 must
+# be found while the device is up, and the lookups must fail (exit status
+# 2) once the device has been set down.
+fib_down_unicast_test()
+{
+ echo
+ echo "Single path, admin down"
+
+ setup
+
+ echo " Start point"
+ $IP route get fibmatch 198.51.100.2 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ set -e
+ $IP link set dev dummy0 down
+ set +e
+
+ echo " Route deleted on down"
+ $IP route get fibmatch 198.51.100.2 &> /dev/null
+ log_test $? 2 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+ log_test $? 2 "IPv6 fibmatch"
+
+ cleanup
+}
+
+# Checks shared by the admin down multipath test for the state "one device
+# down, the other one up".
+#   $1 - name of the device that is down
+#   $2 - name of the device that is up
+# Lookups restricted to the down device must fail, lookups restricted to
+# the up device must succeed, and only the nexthop on the down device may
+# carry the "dead linkdown" flags.
+fib_down_multipath_test_do()
+{
+ local down_dev=$1
+ local up_dev=$2
+
+ $IP route get fibmatch 203.0.113.1 \
+ oif $down_dev &> /dev/null
+ log_test $? 2 "IPv4 fibmatch on down device"
+ $IP -6 route get fibmatch 2001:db8:3::1 \
+ oif $down_dev &> /dev/null
+ log_test $? 2 "IPv6 fibmatch on down device"
+
+ $IP route get fibmatch 203.0.113.1 \
+ oif $up_dev &> /dev/null
+ log_test $? 0 "IPv4 fibmatch on up device"
+ $IP -6 route get fibmatch 2001:db8:3::1 \
+ oif $up_dev &> /dev/null
+ log_test $? 0 "IPv6 fibmatch on up device"
+
+ # the logged status is that of the last grep: 0 = flags present
+ $IP route get fibmatch 203.0.113.1 | \
+ grep $down_dev | grep -q "dead linkdown"
+ log_test $? 0 "IPv4 flags on down device"
+ $IP -6 route get fibmatch 2001:db8:3::1 | \
+ grep $down_dev | grep -q "dead linkdown"
+ log_test $? 0 "IPv6 flags on down device"
+
+ # expected status 1: grep must not find the flags on the up device
+ $IP route get fibmatch 203.0.113.1 | \
+ grep $up_dev | grep -q "dead linkdown"
+ log_test $? 1 "IPv4 flags on up device"
+ $IP -6 route get fibmatch 2001:db8:3::1 | \
+ grep $up_dev | grep -q "dead linkdown"
+ log_test $? 1 "IPv6 flags on up device"
+}
+
+# Multipath routes and admin down: a two-nexthop route over dummy0 and
+# dummy1 is installed per family. The state is checked with dummy0 down,
+# then with dummy1 down instead, and finally with both devices down, when
+# the lookups must fail (exit status 2).
+fib_down_multipath_test()
+{
+ echo
+ echo "Admin down multipath"
+
+ setup
+
+ # second device plus one multipath route per address family
+ set -e
+ $IP link add dummy1 type dummy
+ $IP link set dev dummy1 up
+
+ $IP address add 192.0.2.1/24 dev dummy1
+ $IP -6 address add 2001:db8:2::1/64 dev dummy1
+
+ $IP route add 203.0.113.0/24 \
+ nexthop via 198.51.100.2 dev dummy0 \
+ nexthop via 192.0.2.2 dev dummy1
+ $IP -6 route add 2001:db8:3::/64 \
+ nexthop via 2001:db8:1::2 dev dummy0 \
+ nexthop via 2001:db8:2::2 dev dummy1
+ set +e
+
+ echo " Verify start point"
+ $IP route get fibmatch 203.0.113.1 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+
+ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ set -e
+ $IP link set dev dummy0 down
+ set +e
+
+ echo " One device down, one up"
+ fib_down_multipath_test_do "dummy0" "dummy1"
+
+ # swap the roles of the two devices
+ set -e
+ $IP link set dev dummy0 up
+ $IP link set dev dummy1 down
+ set +e
+
+ echo " Other device down and up"
+ fib_down_multipath_test_do "dummy1" "dummy0"
+
+ set -e
+ $IP link set dev dummy0 down
+ set +e
+
+ echo " Both devices down"
+ $IP route get fibmatch 203.0.113.1 &> /dev/null
+ log_test $? 2 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:3::1 &> /dev/null
+ log_test $? 2 "IPv6 fibmatch"
+
+ $IP link del dev dummy1
+ cleanup
+}
+
+# Admin down tests: single path first, then multipath.
+fib_down_test()
+{
+    local _down_subtest
+
+    for _down_subtest in fib_down_unicast_test fib_down_multipath_test; do
+        ${_down_subtest}
+    done
+}
+
+# Local routes should not be affected when carrier changes.
+# The lookups target the addresses configured on dummy0 itself. They must
+# succeed with carrier on and off, and in every grep check below the
+# expected status is 1, i.e. "linkdown" must NOT appear in the output.
+# NOTE(review): several descriptions read "linkdown flag set" although
+# the expected status 1 asserts the opposite - wording only, the check
+# matches the purpose stated above.
+fib_carrier_local_test()
+{
+ echo
+ echo "Local carrier tests - single path"
+
+ setup
+
+ set -e
+ $IP link set dev dummy0 carrier on
+ set +e
+
+ echo " Start point"
+ $IP route get fibmatch 198.51.100.1 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ $IP route get fibmatch 198.51.100.1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv4 - no linkdown flag"
+ $IP -6 route get fibmatch 2001:db8:1::1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv6 - no linkdown flag"
+
+ set -e
+ $IP link set dev dummy0 carrier off
+ sleep 1
+ set +e
+
+ echo " Carrier off on nexthop"
+ $IP route get fibmatch 198.51.100.1 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:1::1 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ $IP route get fibmatch 198.51.100.1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv4 - linkdown flag set"
+ $IP -6 route get fibmatch 2001:db8:1::1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv6 - linkdown flag set"
+
+ # addresses added while the carrier is already off
+ set -e
+ $IP address add 192.0.2.1/24 dev dummy0
+ $IP -6 address add 2001:db8:2::1/64 dev dummy0
+ set +e
+
+ echo " Route to local address with carrier down"
+ $IP route get fibmatch 192.0.2.1 &> /dev/null
+ log_test $? 0 "IPv4 fibmatch"
+ $IP -6 route get fibmatch 2001:db8:2::1 &> /dev/null
+ log_test $? 0 "IPv6 fibmatch"
+
+ $IP route get fibmatch 192.0.2.1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv4 linkdown flag set"
+ $IP -6 route get fibmatch 2001:db8:2::1 | \
+ grep -q "linkdown"
+ log_test $? 1 "IPv6 linkdown flag set"
+
+ cleanup
+}
+
+# Single path routes and carrier changes: the connected routes of dummy0
+# must stay in place when the carrier goes off, but then have to carry the
+# "linkdown" flag. The same holds for the routes of a second address that
+# is added while the carrier is off.
+#
+# The global ret is deliberately left alone here: it accumulates the
+# failures of all tests of the script and must not be reset by one test.
+fib_carrier_unicast_test()
+{
+    echo
+    echo "Single path route carrier test"
+
+    setup
+
+    set -e
+    $IP link set dev dummy0 carrier on
+    set +e
+
+    echo " Start point"
+    $IP route get fibmatch 198.51.100.2 &> /dev/null
+    log_test $? 0 "IPv4 fibmatch"
+    $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+    log_test $? 0 "IPv6 fibmatch"
+
+    # expected status 1: grep must not find the flag while carrier is on
+    $IP route get fibmatch 198.51.100.2 | \
+        grep -q "linkdown"
+    log_test $? 1 "IPv4 no linkdown flag"
+    $IP -6 route get fibmatch 2001:db8:1::2 | \
+        grep -q "linkdown"
+    log_test $? 1 "IPv6 no linkdown flag"
+
+    set -e
+    $IP link set dev dummy0 carrier off
+    sleep 1
+    set +e
+
+    echo " Carrier down"
+    $IP route get fibmatch 198.51.100.2 &> /dev/null
+    log_test $? 0 "IPv4 fibmatch"
+    $IP -6 route get fibmatch 2001:db8:1::2 &> /dev/null
+    log_test $? 0 "IPv6 fibmatch"
+
+    $IP route get fibmatch 198.51.100.2 | \
+        grep -q "linkdown"
+    log_test $? 0 "IPv4 linkdown flag set"
+    $IP -6 route get fibmatch 2001:db8:1::2 | \
+        grep -q "linkdown"
+    log_test $? 0 "IPv6 linkdown flag set"
+
+    # addresses added while the carrier is already off
+    set -e
+    $IP address add 192.0.2.1/24 dev dummy0
+    $IP -6 address add 2001:db8:2::1/64 dev dummy0
+    set +e
+
+    echo " Second address added with carrier down"
+    $IP route get fibmatch 192.0.2.2 &> /dev/null
+    log_test $? 0 "IPv4 fibmatch"
+    $IP -6 route get fibmatch 2001:db8:2::2 &> /dev/null
+    log_test $? 0 "IPv6 fibmatch"
+
+    $IP route get fibmatch 192.0.2.2 | \
+        grep -q "linkdown"
+    log_test $? 0 "IPv4 linkdown flag set"
+    $IP -6 route get fibmatch 2001:db8:2::2 | \
+        grep -q "linkdown"
+    log_test $? 0 "IPv6 linkdown flag set"
+
+    cleanup
+}
+
+# Carrier tests: local routes first, then unicast routes.
+fib_carrier_test()
+{
+    local _carrier_subtest
+
+    for _carrier_subtest in fib_carrier_local_test fib_carrier_unicast_test; do
+        ${_carrier_subtest}
+    done
+}
+
+# rp_filter test: ns1 and ns2 are connected by a veth pair, both ends use
+# the same address (192.0.2.1/24) and the same MAC address as the loopback
+# devices. tc mirred filters redirect ARP and IP traffic between lo and
+# the veth device in both namespaces (steps 1-4 below). With rp_filter,
+# accept_local and route_localnet enabled, pings from ns2 to 192.0.2.1
+# and to 127.0.0.1 are expected to succeed.
+fib_rp_filter_test()
+{
+ echo
+ echo "IPv4 rp_filter tests"
+
+ setup
+
+ set -e
+ ip netns add ns2
+ ip netns set ns2 auto
+
+ ip -netns ns2 link set dev lo up
+
+ $IP link add name veth1 type veth peer name veth2
+ $IP link set dev veth2 netns ns2
+ $IP address add 192.0.2.1/24 dev veth1
+ ip -netns ns2 address add 192.0.2.1/24 dev veth2
+ $IP link set dev veth1 up
+ ip -netns ns2 link set dev veth2 up
+
+ # one MAC address for lo and the veth device in both namespaces
+ $IP link set dev lo address 52:54:00:6a:c7:5e
+ $IP link set dev veth1 address 52:54:00:6a:c7:5e
+ ip -netns ns2 link set dev lo address 52:54:00:6a:c7:5e
+ ip -netns ns2 link set dev veth2 address 52:54:00:6a:c7:5e
+
+ # 1. (ns2) redirect lo's egress to veth2's egress
+ ip netns exec ns2 tc qdisc add dev lo parent root handle 1: fq_codel
+ ip netns exec ns2 tc filter add dev lo parent 1: protocol arp basic \
+ action mirred egress redirect dev veth2
+ ip netns exec ns2 tc filter add dev lo parent 1: protocol ip basic \
+ action mirred egress redirect dev veth2
+
+ # 2. (ns1) redirect veth1's ingress to lo's ingress
+ $NS_EXEC tc qdisc add dev veth1 ingress
+ $NS_EXEC tc filter add dev veth1 ingress protocol arp basic \
+ action mirred ingress redirect dev lo
+ $NS_EXEC tc filter add dev veth1 ingress protocol ip basic \
+ action mirred ingress redirect dev lo
+
+ # 3. (ns1) redirect lo's egress to veth1's egress
+ $NS_EXEC tc qdisc add dev lo parent root handle 1: fq_codel
+ $NS_EXEC tc filter add dev lo parent 1: protocol arp basic \
+ action mirred egress redirect dev veth1
+ $NS_EXEC tc filter add dev lo parent 1: protocol ip basic \
+ action mirred egress redirect dev veth1
+
+ # 4. (ns2) redirect veth2's ingress to lo's ingress
+ ip netns exec ns2 tc qdisc add dev veth2 ingress
+ ip netns exec ns2 tc filter add dev veth2 ingress protocol arp basic \
+ action mirred ingress redirect dev lo
+ ip netns exec ns2 tc filter add dev veth2 ingress protocol ip basic \
+ action mirred ingress redirect dev lo
+
+ # same sysctl settings in both namespaces
+ $NS_EXEC sysctl -qw net.ipv4.conf.all.rp_filter=1
+ $NS_EXEC sysctl -qw net.ipv4.conf.all.accept_local=1
+ $NS_EXEC sysctl -qw net.ipv4.conf.all.route_localnet=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.accept_local=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.route_localnet=1
+ set +e
+
+ run_cmd "ip netns exec ns2 ping -w1 -c1 192.0.2.1"
+ log_test $? 0 "rp_filter passes local packets"
+
+ run_cmd "ip netns exec ns2 ping -w1 -c1 127.0.0.1"
+ log_test $? 0 "rp_filter passes loopback packets"
+
+ cleanup
+}
+
+################################################################################
+# Tests on nexthop spec
+
+# run 'ip route add' with given spec
+add_rt()
+{
+ local desc="$1"
+ local erc=$2
+ local vrf=$3
+ local pfx=$4
+ local gw=$5
+ local dev=$6
+ local cmd out rc
+
+ [ "$vrf" = "-" ] && vrf="default"
+ [ -n "$gw" ] && gw="via $gw"
+ [ -n "$dev" ] && dev="dev $dev"
+
+ cmd="$IP route add vrf $vrf $pfx $gw $dev"
+ if [ "$VERBOSE" = "1" ]; then
+ printf "\n COMMAND: $cmd\n"
+ fi
+
+ out=$(eval $cmd 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+ echo " $out"
+ fi
+ log_test $rc $erc "$desc"
+}
+
+# Placeholder for the IPv4 nexthop tests; only prints the section title
+# and a reminder that the tests still have to be written.
+fib4_nexthop()
+{
+    printf '\n%s\n' "IPv4 nexthop tests"
+    printf '\n%s\n' "<<< write me >>>" | tail -n 1
+}
+
+fib6_nexthop()
+{
+ local lldummy=$(get_linklocal dummy0)
+ local llv1=$(get_linklocal dummy0)
+
+ if [ -z "$lldummy" ]; then
+ echo "Failed to get linklocal address for dummy0"
+ return 1
+ fi
+ if [ -z "$llv1" ]; then
+ echo "Failed to get linklocal address for veth1"
+ return 1
+ fi
+
+ echo
+ echo "IPv6 nexthop tests"
+
+ add_rt "Directly connected nexthop, unicast address" 0 \
+ - 2001:db8:101::/64 2001:db8:1::2
+ add_rt "Directly connected nexthop, unicast address with device" 0 \
+ - 2001:db8:102::/64 2001:db8:1::2 "dummy0"
+ add_rt "Gateway is linklocal address" 0 \
+ - 2001:db8:103::1/64 $llv1 "veth0"
+
+ # fails because LL address requires a device
+ add_rt "Gateway is linklocal address, no device" 2 \
+ - 2001:db8:104::1/64 $llv1
+
+ # local address can not be a gateway
+ add_rt "Gateway can not be local unicast address" 2 \
+ - 2001:db8:105::/64 2001:db8:1::1
+ add_rt "Gateway can not be local unicast address, with device" 2 \
+ - 2001:db8:106::/64 2001:db8:1::1 "dummy0"
+ add_rt "Gateway can not be a local linklocal address" 2 \
+ - 2001:db8:107::1/64 $lldummy "dummy0"
+
+ # VRF tests
+ add_rt "Gateway can be local address in a VRF" 0 \
+ - 2001:db8:108::/64 2001:db8:51::2
+ add_rt "Gateway can be local address in a VRF, with device" 0 \
+ - 2001:db8:109::/64 2001:db8:51::2 "veth0"
+ add_rt "Gateway can be local linklocal address in a VRF" 0 \
+ - 2001:db8:110::1/64 $llv1 "veth0"
+
+ add_rt "Redirect to VRF lookup" 0 \
+ - 2001:db8:111::/64 "" "red"
+
+ add_rt "VRF route, gateway can be local address in default VRF" 0 \
+ red 2001:db8:112::/64 2001:db8:51::1
+
+ # local address in same VRF fails
+ add_rt "VRF route, gateway can not be a local address" 2 \
+ red 2001:db8:113::1/64 2001:db8:2::1
+ add_rt "VRF route, gateway can not be a local addr with device" 2 \
+ red 2001:db8:114::1/64 2001:db8:2::1 "dummy1"
+}
+
+# Default VRF:
+# dummy0 - 198.51.100.1/24 2001:db8:1::1/64
+# veth0 - 192.0.2.1/24 2001:db8:51::1/64
+#
+# VRF red:
+# dummy1 - 192.168.2.1/24 2001:db8:2::1/64
+# veth1 - 192.0.2.2/24 2001:db8:51::2/64
+#
+# [ dummy0 veth0 ]--[ veth1 dummy1 ]
+
+# Build the topology for the nexthop tests (VRF "red" with veth1 and
+# dummy1, veth0 in the default VRF), run the IPv4 and IPv6 nexthop tests
+# and tear everything down again.
+fib_nexthop_test()
+{
+ setup
+
+ set -e
+
+ # re-create the rule for the local table at preference 32765
+ $IP -4 rule add pref 32765 table local
+ $IP -4 rule del pref 0
+ $IP -6 rule add pref 32765 table local
+ $IP -6 rule del pref 0
+
+ # VRF red uses table 1 and gets unreachable default routes
+ $IP link add red type vrf table 1
+ $IP link set red up
+ $IP -4 route add vrf red unreachable default metric 4278198272
+ $IP -6 route add vrf red unreachable default metric 4278198272
+
+ $IP link add veth0 type veth peer name veth1
+ $IP link set dev veth0 up
+ $IP address add 192.0.2.1/24 dev veth0
+ $IP -6 address add 2001:db8:51::1/64 dev veth0
+
+ $IP link set dev veth1 vrf red up
+ $IP address add 192.0.2.2/24 dev veth1
+ $IP -6 address add 2001:db8:51::2/64 dev veth1
+
+ $IP link add dummy1 type dummy
+ $IP link set dev dummy1 vrf red up
+ $IP address add 192.168.2.1/24 dev dummy1
+ $IP -6 address add 2001:db8:2::1/64 dev dummy1
+ set +e
+
+ sleep 1
+ fib4_nexthop
+ fib6_nexthop
+
+ # remove the extra devices in a subshell with errors silenced
+ (
+ $IP link del dev dummy1
+ $IP link del veth0
+ $IP link del red
+ ) 2>/dev/null
+ cleanup
+}
+
+# Install an IPv6 rule with suppress_prefixlength 0 together with a
+# default route, send a flood ping and remove rule and device again. No
+# command result is evaluated: the test passes when the script gets to the
+# log_test call.
+fib_suppress_test()
+{
+ echo
+ echo "FIB rule with suppress_prefixlength"
+ setup
+
+ $IP link add dummy1 type dummy
+ $IP link set dummy1 up
+ $IP -6 route add default dev dummy1
+ $IP -6 rule add table main suppress_prefixlength 0
+ # NOTE(review): this ping has no $IP / $NS_EXEC prefix, unlike the
+ # commands around it - confirm which namespace is intended
+ ping -f -c 1000 -W 1 1234::1 >/dev/null 2>&1
+ $IP -6 rule del table main suppress_prefixlength 0
+ $IP link del dummy1
+
+ # If we got here without crashing, we're good.
+ log_test 0 0 "FIB rule suppress test"
+
+ cleanup
+}
+
+################################################################################
+# Tests on route add and replace
+
+# Run a command given as one string and return its exit status.
+#   $1 - command line, evaluated with eval
+# With VERBOSE=1 the command and its output are printed and stderr is
+# kept; otherwise stderr of the command is discarded.
+run_cmd()
+{
+    local cmd="$1"
+    local out
+    local stderr="2>/dev/null"
+
+    if [ "$VERBOSE" = "1" ]; then
+        # the command is data, never the printf format
+        printf ' COMMAND: %s\n' "$cmd"
+        stderr=
+    fi
+
+    out=$(eval $cmd $stderr)
+    # rc is kept global as before; code outside this function may read it
+    rc=$?
+    if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+        echo " $out"
+    fi
+
+    [ "$VERBOSE" = "1" ] && echo
+
+    return $rc
+}
+
+check_expected()
+{
+ local out="$1"
+ local expected="$2"
+ local rc=0
+
+ [ "${out}" = "${expected}" ] && return 0
+
+ if [ -z "${out}" ]; then
+ if [ "$VERBOSE" = "1" ]; then
+ printf "\nNo route entry found\n"
+ printf "Expected:\n"
+ printf " ${expected}\n"
+ fi
+ return 1
+ fi
+
+ # tricky way to convert output to 1-line without ip's
+ # messy '\'; this drops all extra white space
+ out=$(echo ${out})
+ if [ "${out}" != "${expected}" ]; then
+ rc=1
+ if [ "${VERBOSE}" = "1" ]; then
+ printf " Unexpected route entry. Have:\n"
+ printf " ${out}\n"
+ printf " Expected:\n"
+ printf " ${expected}\n\n"
+ fi
+ fi
+
+ return $rc
+}
+
+# add route for a prefix, flushing any existing routes first
+# expected to be the first step of a test
+#   $1 - IPv6 prefix
+#   $2 - nexthop specification appended to "ip -6 ro add <prefix>"
+# The script exits with status 1 when the flush fails, when routes for the
+# prefix are still listed after the flush, or when the add fails.
+add_route6()
+{
+    local pfx="$1"
+    local nh="$2"
+    local leftover
+
+    if [ "$VERBOSE" = "1" ]; then
+        echo
+        echo " ##################################################"
+        echo
+    fi
+
+    run_cmd "$IP -6 ro flush ${pfx}" || exit 1
+
+    leftover=$($IP -6 ro ls match ${pfx})
+    if [ -n "$leftover" ]; then
+        echo "Failed to flush routes for prefix used for tests."
+        exit 1
+    fi
+
+    if ! run_cmd "$IP -6 ro add ${pfx} ${nh}"; then
+        echo "Failed to add initial route for test."
+        exit 1
+    fi
+}
+
+# add initial route - used in replace route tests
+add_initial_route6()
+{
+ add_route6 "2001:db8:104::/64" "$1"
+}
+
+# Compare the IPv6 routes listed for a prefix with the expected entry.
+#   $1 - expected entry; its first word is the prefix that is looked up
+# " pref medium" is stripped from the listing before the comparison.
+# Returns the result of check_expected.
+check_route6()
+{
+    local expected="$1"
+    local -a fields
+    local pfx
+    local out
+
+    # word splitting of the expected entry yields the prefix as first word
+    fields=($expected)
+    pfx=${fields[0]}
+
+    out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
+    check_expected "${out}" "${expected}"
+}
+
+# Remove the links created by the route tests, ignoring errors for links
+# that do not exist, then run the common cleanup quietly.
+route_cleanup()
+{
+	local dev
+
+	for dev in red dummy1 veth1 veth3; do
+		$IP li del $dev 2>/dev/null
+	done
+
+	cleanup &> /dev/null
+}
+
+route_setup()
+{
+ route_cleanup
+ setup
+
+ [ "${VERBOSE}" = "1" ] && set -x
+ set -e
+
+ ip netns add ns2
+ ip netns set ns2 auto
+ ip -netns ns2 link set dev lo up
+ ip netns exec ns2 sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ns2 sysctl -qw net.ipv6.conf.all.forwarding=1
+
+ $IP li add veth1 type veth peer name veth2
+ $IP li add veth3 type veth peer name veth4
+
+ $IP li set veth1 up
+ $IP li set veth3 up
+ $IP li set veth2 netns ns2 up
+ $IP li set veth4 netns ns2 up
+ ip -netns ns2 li add dummy1 type dummy
+ ip -netns ns2 li set dummy1 up
+
+ $IP -6 addr add 2001:db8:101::1/64 dev veth1 nodad
+ $IP -6 addr add 2001:db8:103::1/64 dev veth3 nodad
+ $IP addr add 172.16.101.1/24 dev veth1
+ $IP addr add 172.16.103.1/24 dev veth3
+
+ ip -netns ns2 -6 addr add 2001:db8:101::2/64 dev veth2 nodad
+ ip -netns ns2 -6 addr add 2001:db8:103::2/64 dev veth4 nodad
+ ip -netns ns2 -6 addr add 2001:db8:104::1/64 dev dummy1 nodad
+
+ ip -netns ns2 addr add 172.16.101.2/24 dev veth2
+ ip -netns ns2 addr add 172.16.103.2/24 dev veth4
+ ip -netns ns2 addr add 172.16.104.1/24 dev dummy1
+
+ set +e
+}
+
+# assumption is that basic add of a single path route works
+# otherwise just adding an address on an interface is broken
+# Exercise "ip -6 route add" and "ip -6 route append" on the
+# 2001:db8:104::/64 test prefix. Each scenario re-creates the initial route
+# with add_route6 and reports one result through log_test (presumably
+# "actual status, expected status, description" - log_test is defined
+# outside this section).
+ipv6_rt_add()
+{
+ local rc
+
+ echo
+ echo "IPv6 route add / append tests"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2"
+ log_test $? 2 "Attempt to add duplicate route - gw"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro add 2001:db8:104::/64 dev veth3"
+ log_test $? 2 "Attempt to add duplicate route - dev only"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro add unreachable 2001:db8:104::/64"
+ log_test $? 2 "Attempt to add duplicate route - reject route"
+
+ # route append with same prefix adds a new route
+ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
+ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro append 2001:db8:104::/64 via 2001:db8:103::2"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ log_test $? 0 "Append nexthop to existing route - gw"
+
+ # insert mpath directly
+ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ log_test $? 0 "Add multipath route"
+
+ # adding the identical multipath route a second time must fail
+ add_route6 "2001:db8:104::/64" "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro add 2001:db8:104::/64 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ log_test $? 2 "Attempt to add duplicate multipath route"
+
+ # insert of a second route without append but different metric
+ add_route6 "2001:db8:104::/64" "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::2 metric 512"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ run_cmd "$IP -6 ro add 2001:db8:104::/64 via 2001:db8:103::3 metric 256"
+ rc=$?
+ fi
+ log_test $rc 0 "Route add with different metrics"
+
+ # relies on the three routes left behind by the previous scenario:
+ # removing metric 512 must leave the metric 256 and 1024 entries
+ run_cmd "$IP -6 ro del 2001:db8:104::/64 metric 512"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:104::/64 via 2001:db8:103::3 dev veth3 metric 256 2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
+ rc=$?
+ fi
+ log_test $rc 0 "Route delete with metric"
+}
+
+# "ip -6 route replace" scenarios that start from a single path route on
+# 2001:db8:104::/64; one log_test result per scenario.
+ipv6_rt_replace_single()
+{
+ # single path with single path
+ #
+ add_initial_route6 "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:103::2"
+ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
+ log_test $? 0 "Single path with single path"
+
+ # single path with multipath
+ #
+ add_initial_route6 "nexthop via 2001:db8:101::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::2"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ log_test $? 0 "Single path with multipath"
+
+ # single path with single path using MULTIPATH attribute
+ #
+ add_initial_route6 "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:103::2"
+ check_route6 "2001:db8:104::/64 via 2001:db8:103::2 dev veth3 metric 1024"
+ log_test $? 0 "Single path with single path via multipath attribute"
+
+ # route replace fails - invalid nexthop
+ add_initial_route6 "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:104::2"
+ if [ $? -eq 0 ]; then
+ # previous command is expected to fail so if it returns 0
+ # that means the test failed.
+ log_test 0 1 "Invalid nexthop"
+ else
+ # the failed replace must leave the original route untouched
+ check_route6 "2001:db8:104::/64 via 2001:db8:101::2 dev veth1 metric 1024"
+ log_test $? 0 "Invalid nexthop"
+ fi
+
+ # replace non-existent route
+ # - note use of change versus replace since ip adds NLM_F_CREATE
+ # for replace
+ add_initial_route6 "via 2001:db8:101::2"
+ run_cmd "$IP -6 ro change 2001:db8:105::/64 via 2001:db8:101::2"
+ log_test $? 2 "Single path - replace of non-existent route"
+}
+
+# "ip -6 route replace" scenarios that start from a two-nexthop multipath
+# route on 2001:db8:104::/64; one log_test result per scenario.
+ipv6_rt_replace_mpath()
+{
+ # multipath with multipath
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::3 dev veth1 weight 1 nexthop via 2001:db8:103::3 dev veth3 weight 1"
+ log_test $? 0 "Multipath with multipath"
+
+ # multipath with single
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 via 2001:db8:101::3"
+ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
+ log_test $? 0 "Multipath with single path"
+
+ # multipath with single, given as a one-entry nexthop list
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3"
+ check_route6 "2001:db8:104::/64 via 2001:db8:101::3 dev veth1 metric 1024"
+ log_test $? 0 "Multipath with single path via multipath attribute"
+
+ # multipath with dev-only
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 dev veth1"
+ check_route6 "2001:db8:104::/64 dev veth1 metric 1024"
+ log_test $? 0 "Multipath with dev-only"
+
+ # route replace fails - invalid nexthop 1
+ # (only the unchanged route is verified, not the status of the replace)
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:111::3 nexthop via 2001:db8:103::3"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ log_test $? 0 "Multipath - invalid first nexthop"
+
+ # route replace fails - invalid nexthop 2
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro replace 2001:db8:104::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:113::3"
+ check_route6 "2001:db8:104::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ log_test $? 0 "Multipath - invalid second nexthop"
+
+ # multipath non-existent route
+ add_initial_route6 "nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ run_cmd "$IP -6 ro change 2001:db8:105::/64 nexthop via 2001:db8:101::3 nexthop via 2001:db8:103::3"
+ log_test $? 2 "Multipath - replace of non-existent route"
+}
+
+# Print the section banner, then run the single path and the multipath
+# IPv6 replace scenarios.
+ipv6_rt_replace()
+{
+	printf '\n%s\n' "IPv6 route replace tests"
+
+	ipv6_rt_replace_single
+	ipv6_rt_replace_mpath
+}
+
+# Entry point for the IPv6 route add/append/replace tests: builds the
+# topology with route_setup, runs the test groups and removes the topology
+# again with route_cleanup.
+ipv6_route_test()
+{
+ route_setup
+
+ ipv6_rt_add
+ ipv6_rt_replace
+
+ route_cleanup
+}
+
+ip_addr_metric_check()
+{
+ ip addr help 2>&1 | grep -q metric
+ if [ $? -ne 0 ]; then
+ echo "iproute2 command does not support metric for addresses. Skipping test"
+ return 1
+ fi
+
+ return 0
+}
+
+# Verify the metric of IPv6 prefix routes created by address assignment:
+# default metric, user supplied metric, change/delete of the address, link
+# down/up, and addresses with a peer.
+#
+# Returns: 1 without running anything when ip lacks address metric support
+ipv6_addr_metric_test()
+{
+	local rc
+	local out
+
+	echo
+	echo "IPv6 prefix route tests"
+
+	ip_addr_metric_check || return 1
+
+	setup
+
+	# with set -e active a failing setup step terminates the script
+	set -e
+	$IP li add dummy1 type dummy
+	$IP li add dummy2 type dummy
+	$IP li set dummy1 up
+	$IP li set dummy2 up
+
+	# default entry is metric 256
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
+	log_test $? 0 "Default metric"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy1"
+	run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
+	log_test $? 0 "User specified metric on first device"
+
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy2"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
+	set +e
+
+	check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+	log_test $? 0 "User specified metric on second device"
+
+	run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+		rc=$?
+	fi
+	log_test $rc 0 "Delete of address on first device"
+
+	run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Modify metric of address"
+
+	# verify prefix route removed on down
+	# (the address itself is kept so the route can come back on link up)
+	run_cmd "ip netns exec ns1 sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
+	run_cmd "$IP li set dev dummy2 down"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		out=$($IP -6 ro ls match 2001:db8:104::/64)
+		check_expected "${out}" ""
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route removed on link down"
+
+	# verify prefix route re-inserted with assigned metric
+	run_cmd "$IP li set dev dummy2 up"
+	rc=$?
+	if [ $rc -eq 0 ]; then
+		check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+		rc=$?
+	fi
+	log_test $rc 0 "Prefix route with metric on link up"
+
+	# verify peer metric added correctly
+	set -e
+	run_cmd "$IP -6 addr flush dev dummy2"
+	run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::1 peer 2001:db8:104::2 metric 260"
+	set +e
+
+	check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 260"
+	log_test $? 0 "Set metric with peer route on local side"
+	check_route6 "2001:db8:104::2 dev dummy2 proto kernel metric 260"
+	log_test $? 0 "Set metric with peer route on peer side"
+
+	set -e
+	run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::1 peer 2001:db8:104::3 metric 261"
+	set +e
+
+	check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 261"
+	log_test $? 0 "Modify metric and peer address on local side"
+	check_route6 "2001:db8:104::3 dev dummy2 proto kernel metric 261"
+	log_test $? 0 "Modify metric and peer address on peer side"
+
+	$IP li del dummy1
+	$IP li del dummy2
+	cleanup
+}
+
+# Verify IPv6 routes that carry a route metric (mtu) for single path and
+# multipath routes, use such a route for a ping, and check that an invalid
+# metric value is rejected. Runs on the topology built by route_setup.
+ipv6_route_metrics_test()
+{
+ local rc
+
+ echo
+ echo "IPv6 routes with metrics"
+
+ route_setup
+
+ #
+ # single path with metrics
+ #
+ run_cmd "$IP -6 ro add 2001:db8:111::/64 via 2001:db8:101::2 mtu 1400"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:111::/64 via 2001:db8:101::2 dev veth1 metric 1024 mtu 1400"
+ rc=$?
+ fi
+ log_test $rc 0 "Single path route with mtu metric"
+
+
+ #
+ # multipath via separate routes with metrics
+ #
+ # note: only the status of the append is captured in rc
+ run_cmd "$IP -6 ro add 2001:db8:112::/64 via 2001:db8:101::2 mtu 1400"
+ run_cmd "$IP -6 ro append 2001:db8:112::/64 via 2001:db8:103::2"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:112::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ rc=$?
+ fi
+ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on first"
+
+ # second route is coalesced to first to make a multipath route.
+ # MTU of the second path is hidden from display!
+ run_cmd "$IP -6 ro add 2001:db8:113::/64 via 2001:db8:101::2"
+ run_cmd "$IP -6 ro append 2001:db8:113::/64 via 2001:db8:103::2 mtu 1400"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:113::/64 metric 1024 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ rc=$?
+ fi
+ log_test $rc 0 "Multipath route via 2 single routes with mtu metric on 2nd"
+
+ # removing the first leg makes the mtu of the remaining leg visible;
+ # no result is logged at all when the delete itself fails
+ run_cmd "$IP -6 ro del 2001:db8:113::/64 via 2001:db8:101::2"
+ if [ $? -eq 0 ]; then
+ check_route6 "2001:db8:113::/64 via 2001:db8:103::2 dev veth3 metric 1024 mtu 1400"
+ log_test $? 0 " MTU of second leg"
+ fi
+
+ #
+ # multipath with metrics
+ #
+ run_cmd "$IP -6 ro add 2001:db8:115::/64 mtu 1400 nexthop via 2001:db8:101::2 nexthop via 2001:db8:103::2"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:115::/64 metric 1024 mtu 1400 nexthop via 2001:db8:101::2 dev veth1 weight 1 nexthop via 2001:db8:103::2 dev veth3 weight 1"
+ rc=$?
+ fi
+ log_test $rc 0 "Multipath route with mtu metric"
+
+ # send a payload larger than the route mtu towards dummy1 in ns2
+ # (${ping6} is defined outside this section)
+ $IP -6 ro add 2001:db8:104::/64 via 2001:db8:101::2 mtu 1300
+ run_cmd "ip netns exec ns1 ${ping6} -w1 -c1 -s 1500 2001:db8:104::1"
+ log_test $? 0 "Using route with mtu metric"
+
+ run_cmd "$IP -6 ro add 2001:db8:114::/64 via 2001:db8:101::2 congctl lock foo"
+ log_test $? 2 "Invalid metric (fails metric_convert)"
+
+ route_cleanup
+}
+
+# add route for a prefix, flushing any existing routes first
+# expected to be the first step of a test
+# Arguments: $1 - IPv4 prefix, $2 - nexthop specification for "ip ro add"
+# Exits the script with status 1 when the prefix cannot be flushed or the
+# route cannot be added.
+add_route()
+{
+	local prefix="$1"
+	local nexthop="$2"
+	local leftover
+
+	if [ "$VERBOSE" = "1" ]; then
+		echo
+		echo " ##################################################"
+		echo
+	fi
+
+	run_cmd "$IP ro flush ${prefix}" || exit 1
+
+	# nothing may be left behind for the prefix after the flush
+	leftover=$($IP ro ls match ${prefix})
+	if [ -n "$leftover" ]; then
+		echo "Failed to flush routes for prefix used for tests."
+		exit 1
+	fi
+
+	if ! run_cmd "$IP ro add ${prefix} ${nexthop}"; then
+		echo "Failed to add initial route for test."
+		exit 1
+	fi
+}
+
+# add initial route - used in replace route tests
+# Arguments: $1 - nexthop specification for the 172.16.104.0/24 route
+add_initial_route()
+{
+	local nexthop="$1"
+
+	add_route "172.16.104.0/24" "${nexthop}"
+}
+
+check_route()
+{
+ local pfx
+ local expected="$1"
+ local out
+
+ set -- $expected
+ pfx=$1
+ [ "${pfx}" = "unreachable" ] && pfx=$2
+
+ out=$($IP ro ls match ${pfx})
+ check_expected "${out}" "${expected}"
+}
+
+# assumption is that basic add of a single path route works
+# otherwise just adding an address on an interface is broken
+# Exercise "ip route add", "prepend" and "append" on the 172.16.104.0/24
+# test prefix. Scenarios either re-create the initial route with add_route
+# or flush the prefix themselves; one log_test result per scenario.
+ipv4_rt_add()
+{
+ local rc
+
+ echo
+ echo "IPv4 route add / append tests"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2"
+ log_test $? 2 "Attempt to add duplicate route - gw"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro add 172.16.104.0/24 dev veth3"
+ log_test $? 2 "Attempt to add duplicate route - dev only"
+
+ # route add same prefix - fails with EEXIST b/c ip adds NLM_F_EXCL
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro add unreachable 172.16.104.0/24"
+ log_test $? 2 "Attempt to add duplicate route - reject route"
+
+ # iproute2 prepend only sets NLM_F_CREATE
+ # - adds a new route; does NOT convert existing route to ECMP
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro prepend 172.16.104.0/24 via 172.16.103.2"
+ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3 172.16.104.0/24 via 172.16.101.2 dev veth1"
+ log_test $? 0 "Add new nexthop for existing prefix"
+
+ # route append with same prefix adds a new route
+ # - iproute2 sets NLM_F_CREATE | NLM_F_APPEND
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
+ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.2 dev veth3"
+ log_test $? 0 "Append nexthop to existing route - gw"
+
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
+ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 dev veth3 scope link"
+ log_test $? 0 "Append nexthop to existing route - dev only"
+
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro append unreachable 172.16.104.0/24"
+ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 unreachable 172.16.104.0/24"
+ log_test $? 0 "Append nexthop to existing route - reject route"
+
+ # the next two scenarios start from a reject route, so the prefix is
+ # flushed and populated by hand instead of through add_route
+ run_cmd "$IP ro flush 172.16.104.0/24"
+ run_cmd "$IP ro add unreachable 172.16.104.0/24"
+ run_cmd "$IP ro append 172.16.104.0/24 via 172.16.103.2"
+ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 via 172.16.103.2 dev veth3"
+ log_test $? 0 "Append nexthop to existing reject route - gw"
+
+ run_cmd "$IP ro flush 172.16.104.0/24"
+ run_cmd "$IP ro add unreachable 172.16.104.0/24"
+ run_cmd "$IP ro append 172.16.104.0/24 dev veth3"
+ check_route "unreachable 172.16.104.0/24 172.16.104.0/24 dev veth3 scope link"
+ log_test $? 0 "Append nexthop to existing reject route - dev only"
+
+ # insert mpath directly
+ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ log_test $? 0 "add multipath route"
+
+ # adding the identical multipath route a second time must fail
+ add_route "172.16.104.0/24" "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ log_test $? 2 "Attempt to add duplicate multipath route"
+
+ # insert of a second route without append but different metric
+ add_route "172.16.104.0/24" "via 172.16.101.2"
+ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.2 metric 512"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ run_cmd "$IP ro add 172.16.104.0/24 via 172.16.103.3 metric 256"
+ rc=$?
+ fi
+ log_test $rc 0 "Route add with different metrics"
+
+ # relies on the three routes left behind by the previous scenario
+ run_cmd "$IP ro del 172.16.104.0/24 metric 512"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1 172.16.104.0/24 via 172.16.103.3 dev veth3 metric 256"
+ rc=$?
+ fi
+ log_test $rc 0 "Route delete with metric"
+}
+
+# "ip route replace" scenarios that start from a single path route on
+# 172.16.104.0/24; one log_test result per scenario.
+ipv4_rt_replace_single()
+{
+ # single path with single path
+ #
+ add_initial_route "via 172.16.101.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.103.2"
+ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
+ log_test $? 0 "Single path with single path"
+
+ # single path with multipath
+ #
+ add_initial_route "nexthop via 172.16.101.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.2"
+ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ log_test $? 0 "Single path with multipath"
+
+ # single path with reject
+ #
+ add_initial_route "nexthop via 172.16.101.2"
+ run_cmd "$IP ro replace unreachable 172.16.104.0/24"
+ check_route "unreachable 172.16.104.0/24"
+ log_test $? 0 "Single path with reject route"
+
+ # single path with single path using MULTIPATH attribute
+ #
+ add_initial_route "via 172.16.101.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.103.2"
+ check_route "172.16.104.0/24 via 172.16.103.2 dev veth3"
+ log_test $? 0 "Single path with single path via multipath attribute"
+
+ # route replace fails - invalid nexthop
+ add_initial_route "via 172.16.101.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 via 2001:db8:104::2"
+ if [ $? -eq 0 ]; then
+ # previous command is expected to fail so if it returns 0
+ # that means the test failed.
+ log_test 0 1 "Invalid nexthop"
+ else
+ # the failed replace must leave the original route untouched
+ check_route "172.16.104.0/24 via 172.16.101.2 dev veth1"
+ log_test $? 0 "Invalid nexthop"
+ fi
+
+ # replace non-existent route
+ # - note use of change versus replace since ip adds NLM_F_CREATE
+ # for replace
+ add_initial_route "via 172.16.101.2"
+ run_cmd "$IP ro change 172.16.105.0/24 via 172.16.101.2"
+ log_test $? 2 "Single path - replace of non-existent route"
+}
+
+# "ip route replace" scenarios that start from a two-nexthop multipath
+# route on 172.16.104.0/24; one log_test result per scenario.
+ipv4_rt_replace_mpath()
+{
+ # multipath with multipath
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
+ check_route "172.16.104.0/24 nexthop via 172.16.101.3 dev veth1 weight 1 nexthop via 172.16.103.3 dev veth3 weight 1"
+ log_test $? 0 "Multipath with multipath"
+
+ # multipath with single
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 via 172.16.101.3"
+ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1"
+ log_test $? 0 "Multipath with single path"
+
+ # multipath with single, given as a one-entry nexthop list
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3"
+ check_route "172.16.104.0/24 via 172.16.101.3 dev veth1"
+ log_test $? 0 "Multipath with single path via multipath attribute"
+
+ # multipath with reject
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace unreachable 172.16.104.0/24"
+ check_route "unreachable 172.16.104.0/24"
+ log_test $? 0 "Multipath with reject route"
+
+ # route replace fails - invalid nexthop 1
+ # (only the unchanged route is verified, not the status of the replace)
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.111.3 nexthop via 172.16.103.3"
+ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ log_test $? 0 "Multipath - invalid first nexthop"
+
+ # route replace fails - invalid nexthop 2
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro replace 172.16.104.0/24 nexthop via 172.16.101.3 nexthop via 172.16.113.3"
+ check_route "172.16.104.0/24 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ log_test $? 0 "Multipath - invalid second nexthop"
+
+ # multipath non-existent route
+ add_initial_route "nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ run_cmd "$IP ro change 172.16.105.0/24 nexthop via 172.16.101.3 nexthop via 172.16.103.3"
+ log_test $? 2 "Multipath - replace of non-existent route"
+}
+
+# Print the section banner, then run the single path and the multipath
+# IPv4 replace scenarios.
+ipv4_rt_replace()
+{
+	printf '\n%s\n' "IPv4 route replace tests"
+
+	ipv4_rt_replace_single
+	ipv4_rt_replace_mpath
+}
+
+# checks that cached input route on VRF port is deleted
+# when VRF is deleted
+ipv4_local_rt_cache()
+{
+ run_cmd "ip addr add 10.0.0.1/32 dev lo"
+ run_cmd "ip netns add test-ns"
+ run_cmd "ip link add veth-outside type veth peer name veth-inside"
+ run_cmd "ip link add vrf-100 type vrf table 1100"
+ run_cmd "ip link set veth-outside master vrf-100"
+ run_cmd "ip link set veth-inside netns test-ns"
+ run_cmd "ip link set veth-outside up"
+ run_cmd "ip link set vrf-100 up"
+ run_cmd "ip route add 10.1.1.1/32 dev veth-outside table 1100"
+ run_cmd "ip netns exec test-ns ip link set veth-inside up"
+ run_cmd "ip netns exec test-ns ip addr add 10.1.1.1/32 dev veth-inside"
+ run_cmd "ip netns exec test-ns ip route add 10.0.0.1/32 dev veth-inside"
+ run_cmd "ip netns exec test-ns ip route add default via 10.0.0.1"
+ run_cmd "ip netns exec test-ns ping 10.0.0.1 -c 1 -i 1"
+ run_cmd "ip link delete vrf-100"
+
+ # if we do not hang test is a success
+ log_test $? 0 "Cached route removed from VRF port device"
+}
+
+# Entry point for the IPv4 route add/append/replace tests and the VRF
+# cached-route test: builds the topology with route_setup, runs the test
+# groups and removes the topology again with route_cleanup.
+ipv4_route_test()
+{
+ route_setup
+
+ ipv4_rt_add
+ ipv4_rt_replace
+ ipv4_local_rt_cache
+
+ route_cleanup
+}
+
+ipv4_addr_metric_test()
+{
+ local rc
+
+ echo
+ echo "IPv4 prefix route tests"
+
+ ip_addr_metric_check || return 1
+
+ setup
+
+ set -e
+ $IP li add dummy1 type dummy
+ $IP li add dummy2 type dummy
+ $IP li set dummy1 up
+ $IP li set dummy2 up
+
+ # default entry is metric 256
+ run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
+ run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
+ log_test $? 0 "Default metric"
+
+ set -e
+ run_cmd "$IP addr flush dev dummy1"
+ run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
+ log_test $? 0 "User specified metric on first device"
+
+ set -e
+ run_cmd "$IP addr flush dev dummy2"
+ run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+ log_test $? 0 "User specified metric on second device"
+
+ run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+ rc=$?
+ fi
+ log_test $rc 0 "Delete of address on first device"
+
+ run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Modify metric of address"
+
+ # verify prefix route removed on down
+ run_cmd "$IP li set dev dummy2 down"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ out=$($IP ro ls match 172.16.104.0/24)
+ check_expected "${out}" ""
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route removed on link down"
+
+ # verify prefix route re-inserted with assigned metric
+ run_cmd "$IP li set dev dummy2 up"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route with metric on link up"
+
+ # explicitly check for metric changes on edge scenarios
+ run_cmd "$IP addr flush dev dummy2"
+ run_cmd "$IP addr add dev dummy2 172.16.104.0/24 metric 259"
+ run_cmd "$IP addr change dev dummy2 172.16.104.0/24 metric 260"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.0 metric 260"
+ rc=$?
+ fi
+ log_test $rc 0 "Modify metric of .0/24 address"
+
+ run_cmd "$IP addr flush dev dummy2"
+ run_cmd "$IP addr add dev dummy2 172.16.104.1/32 peer 172.16.104.2 metric 260"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.2 dev dummy2 proto kernel scope link src 172.16.104.1 metric 260"
+ rc=$?
+ fi
+ log_test $rc 0 "Set metric of address with peer route"
+
+ run_cmd "$IP addr change dev dummy2 172.16.104.1/32 peer 172.16.104.3 metric 261"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.3 dev dummy2 proto kernel scope link src 172.16.104.1 metric 261"
+ rc=$?
+ fi
+ log_test $rc 0 "Modify metric and peer address for peer route"
+
+ $IP li del dummy1
+ $IP li del dummy2
+ cleanup
+}
+
+ipv4_route_metrics_test()
+{
+ local rc
+
+ echo
+ echo "IPv4 route add / append tests"
+
+ route_setup
+
+ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 mtu 1400"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.111.0/24 via 172.16.101.2 dev veth1 mtu 1400"
+ rc=$?
+ fi
+ log_test $rc 0 "Single path route with mtu metric"
+
+
+ run_cmd "$IP ro add 172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 nexthop via 172.16.103.2"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.112.0/24 mtu 1400 nexthop via 172.16.101.2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ rc=$?
+ fi
+ log_test $rc 0 "Multipath route with mtu metric"
+
+ $IP ro add 172.16.104.0/24 via 172.16.101.2 mtu 1300
+ run_cmd "ip netns exec ns1 ping -w1 -c1 -s 1500 172.16.104.1"
+ log_test $? 0 "Using route with mtu metric"
+
+ run_cmd "$IP ro add 172.16.111.0/24 via 172.16.101.2 congctl lock foo"
+ log_test $? 2 "Invalid metric (fails metric_convert)"
+
+ route_cleanup
+}
+
+# Verify which routes disappear when their preferred source address is
+# deleted. dummy1 stays in the default VRF, dummy2 is enslaved to VRF "red"
+# (table 1111); both carry the same addresses and routes, so each address
+# delete must only affect the routes of the table the device belongs to.
+# The grep status is handed to log_test: 1 means "route is gone".
+ipv4_del_addr_test()
+{
+ echo
+ echo "IPv4 delete address route tests"
+
+ setup
+
+ # with set -e active a failing setup step terminates the script
+ set -e
+ $IP li add dummy1 type dummy
+ $IP li set dummy1 up
+ $IP li add dummy2 type dummy
+ $IP li set dummy2 up
+ $IP li add red type vrf table 1111
+ $IP li set red up
+ $IP ro add vrf red unreachable default
+ $IP li set dummy2 vrf red
+
+ $IP addr add dev dummy1 172.16.104.1/24
+ $IP addr add dev dummy1 172.16.104.11/24
+ $IP addr add dev dummy1 172.16.104.12/24
+ $IP addr add dev dummy1 172.16.104.13/24
+ $IP addr add dev dummy2 172.16.104.1/24
+ $IP addr add dev dummy2 172.16.104.11/24
+ $IP addr add dev dummy2 172.16.104.12/24
+ $IP route add 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+ $IP route add 172.16.106.0/24 dev lo src 172.16.104.12
+ $IP route add table 0 172.16.107.0/24 via 172.16.104.2 src 172.16.104.13
+ $IP route add vrf red 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+ $IP route add vrf red 172.16.106.0/24 dev lo src 172.16.104.12
+ set +e
+
+ # removing address from device in vrf should only remove route from vrf table
+ echo " Regular FIB info"
+
+ $IP addr del dev dummy2 172.16.104.11/24
+ $IP ro ls vrf red | grep -q 172.16.105.0/24
+ log_test $? 1 "Route removed from VRF when source address deleted"
+
+ $IP ro ls | grep -q 172.16.105.0/24
+ log_test $? 0 "Route in default VRF not removed"
+
+ # restore the VRF side before repeating the check from the default VRF
+ $IP addr add dev dummy2 172.16.104.11/24
+ $IP route add vrf red 172.16.105.0/24 via 172.16.104.2 src 172.16.104.11
+
+ $IP addr del dev dummy1 172.16.104.11/24
+ $IP ro ls | grep -q 172.16.105.0/24
+ log_test $? 1 "Route removed in default VRF when source address deleted"
+
+ $IP ro ls vrf red | grep -q 172.16.105.0/24
+ log_test $? 0 "Route in VRF is not removed by address delete"
+
+ # removing address from device in vrf should only remove route from vrf
+ # table even when the associated fib info only differs in table ID
+ echo " Identical FIB info with different table ID"
+
+ $IP addr del dev dummy2 172.16.104.12/24
+ $IP ro ls vrf red | grep -q 172.16.106.0/24
+ log_test $? 1 "Route removed from VRF when source address deleted"
+
+ $IP ro ls | grep -q 172.16.106.0/24
+ log_test $? 0 "Route in default VRF not removed"
+
+ # restore the VRF side before repeating the check from the default VRF
+ $IP addr add dev dummy2 172.16.104.12/24
+ $IP route add vrf red 172.16.106.0/24 dev lo src 172.16.104.12
+
+ $IP addr del dev dummy1 172.16.104.12/24
+ $IP ro ls | grep -q 172.16.106.0/24
+ log_test $? 1 "Route removed in default VRF when source address deleted"
+
+ $IP ro ls vrf red | grep -q 172.16.106.0/24
+ log_test $? 0 "Route in VRF is not removed by address delete"
+
+ # removing address from device in default vrf should remove route from
+ # the default vrf even when route was inserted with a table ID of 0.
+ echo " Table ID 0"
+
+ $IP addr del dev dummy1 172.16.104.13/24
+ $IP ro ls | grep -q 172.16.107.0/24
+ log_test $? 1 "Route removed in default VRF when source address deleted"
+
+ $IP li del dummy1
+ $IP li del dummy2
+ cleanup
+}
+
+
+ipv4_route_v6_gw_test()
+{
+ local rc
+
+ echo
+ echo "IPv4 route with IPv6 gateway tests"
+
+ route_setup
+ sleep 2
+
+ #
+ # single path route
+ #
+ run_cmd "$IP ro add 172.16.104.0/24 via inet6 2001:db8:101::2"
+ rc=$?
+ log_test $rc 0 "Single path route with IPv6 gateway"
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 via inet6 2001:db8:101::2 dev veth1"
+ fi
+
+ run_cmd "ip netns exec ns1 ping -w1 -c1 172.16.104.1"
+ log_test $rc 0 "Single path route with IPv6 gateway - ping"
+
+ run_cmd "$IP ro del 172.16.104.0/24 via inet6 2001:db8:101::2"
+ rc=$?
+ log_test $rc 0 "Single path route delete"
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.112.0/24"
+ fi
+
+ #
+ # multipath - v6 then v4
+ #
+ run_cmd "$IP ro add 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
+ rc=$?
+ log_test $rc 0 "Multipath route add - v6 nexthop then v4"
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1 nexthop via 172.16.103.2 dev veth3 weight 1"
+ fi
+
+ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
+ log_test $? 2 " Multipath route delete - nexthops in wrong order"
+
+ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
+ log_test $? 0 " Multipath route delete exact match"
+
+ #
+ # multipath - v4 then v6
+ #
+ run_cmd "$IP ro add 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
+ rc=$?
+ log_test $rc 0 "Multipath route add - v4 nexthop then v6"
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 weight 1 nexthop via inet6 2001:db8:101::2 dev veth1 weight 1"
+ fi
+
+ run_cmd "$IP ro del 172.16.104.0/24 nexthop via inet6 2001:db8:101::2 dev veth1 nexthop via 172.16.103.2 dev veth3"
+ log_test $? 2 " Multipath route delete - nexthops in wrong order"
+
+ run_cmd "$IP ro del 172.16.104.0/24 nexthop via 172.16.103.2 dev veth3 nexthop via inet6 2001:db8:101::2 dev veth1"
+ log_test $? 0 " Multipath route delete exact match"
+
+ route_cleanup
+}
+
+################################################################################
+# usage
+
+# Print the command line help to stdout. The here-document is unquoted, so
+# the script name and the current value of $TESTS are expanded into it.
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -t <test> Test(s) to run (default: all)
+ (options: $TESTS)
+ -p Pause on fail
+ -P Pause after each test before cleanup
+ -v verbose mode (show commands and output)
+EOF
+}
+
+################################################################################
+# main
+
+# run cleanup on every exit path, including exits caused by set -e
+trap cleanup EXIT
+
+# leading ':' in the option string selects silent error reporting; unknown
+# options and missing arguments end up in the '*' arm
+while getopts :t:pPhv o
+do
+ case $o in
+ t) TESTS=$OPTARG;;
+ p) PAUSE_ON_FAIL=yes;;
+ P) PAUSE=yes;;
+ # NOTE(review): every -v increments VERBOSE, while the tests in this
+ # section compare it against "1" only, so "-v -v" behaves like
+ # non-verbose - confirm whether higher levels are used elsewhere
+ v) VERBOSE=$(($VERBOSE + 1));;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+# NOTE(review): PEER_NS is not set anywhere in this section - confirm it is
+# defined earlier in the file
+PEER_CMD="ip netns exec ${PEER_NS}"
+
+# make sure we don't pause twice
+[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no
+
+# preconditions: root, an executable ip tool, and fibmatch support in it;
+# each failure exits with $ksft_skip (defined outside this section)
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip;
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+ip route help 2>&1 | grep -q fibmatch
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing fibmatch"
+ exit $ksft_skip
+fi
+
+# start clean
+cleanup &> /dev/null
+
+# $TESTS is a space separated list; names that match no arm are silently
+# ignored because the case has no default arm
+for t in $TESTS
+do
+ case $t in
+ fib_unreg_test|unregister) fib_unreg_test;;
+ fib_down_test|down) fib_down_test;;
+ fib_carrier_test|carrier) fib_carrier_test;;
+ fib_rp_filter_test|rp_filter) fib_rp_filter_test;;
+ fib_nexthop_test|nexthop) fib_nexthop_test;;
+ fib_suppress_test|suppress) fib_suppress_test;;
+ ipv6_route_test|ipv6_rt) ipv6_route_test;;
+ ipv4_route_test|ipv4_rt) ipv4_route_test;;
+ ipv6_addr_metric) ipv6_addr_metric_test;;
+ ipv4_addr_metric) ipv4_addr_metric_test;;
+ ipv4_del_addr) ipv4_del_addr_test;;
+ ipv6_route_metrics) ipv6_route_metrics_test;;
+ ipv4_route_metrics) ipv4_route_metrics_test;;
+ ipv4_route_v6_gw) ipv4_route_v6_gw_test;;
+
+ help) echo "Test names: $TESTS"; exit 0;;
+ esac
+done
+
+# the summary is skipped when the test list is the literal "none";
+# nsuccess, nfail and ret are maintained outside this section
+if [ "$TESTS" != "none" ]; then
+ printf "\nTests passed: %3d\n" ${nsuccess}
+ printf "Tests failed: %3d\n" ${nfail}
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/net/fin_ack_lat.c b/tools/testing/selftests/net/fin_ack_lat.c
new file mode 100644
index 000000000..70187494b
--- /dev/null
+++ b/tools/testing/selftests/net/fin_ack_lat.c
@@ -0,0 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+static int child_pid;
+
+/*
+ * Return the elapsed time from @s to @e in microseconds, or 0 when @e does
+ * not lie after @s.
+ */
+static unsigned long timediff(struct timeval s, struct timeval e)
+{
+	unsigned long begin = s.tv_sec * 1000000 + s.tv_usec;
+	unsigned long finish = e.tv_sec * 1000000 + e.tv_usec;
+
+	return begin > finish ? 0 : finish - begin;
+}
+
+/*
+ * Client loop: repeatedly connect to 127.0.0.1:@port, send one int, wait for
+ * the peer to answer or close, and measure the time of the whole exchange.
+ * Exchanges that take 100000 us or longer are printed to stdout, one line
+ * each; faster ones are only accumulated into the running average.
+ * Never returns; any socket error terminates the process via error().
+ */
+static void client(int port)
+{
+	int sock = 0;
+	/* Zero-initialised so no stack garbage is handed to connect(). */
+	struct sockaddr_in addr = { 0 }, laddr;
+	socklen_t len;
+	struct linger sl;
+	int flag = 1;
+	/* Payload content is irrelevant, but must not be uninitialised memory. */
+	int buffer = 0;
+	struct timeval start, end;
+	unsigned long lat, sum_lat = 0, nr_lat = 0;
+
+	while (1) {
+		gettimeofday(&start, NULL);
+
+		sock = socket(AF_INET, SOCK_STREAM, 0);
+		if (sock < 0)
+			error(-1, errno, "socket creation");
+
+		/* Linger enabled with a zero timeout. */
+		sl.l_onoff = 1;
+		sl.l_linger = 0;
+		if (setsockopt(sock, SOL_SOCKET, SO_LINGER, &sl, sizeof(sl)))
+			error(-1, errno, "setsockopt(linger)");
+
+		if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+			       &flag, sizeof(flag)))
+			error(-1, errno, "setsockopt(nodelay)");
+
+		addr.sin_family = AF_INET;
+		addr.sin_port = htons(port);
+
+		if (inet_pton(AF_INET, "127.0.0.1", &addr.sin_addr) <= 0)
+			error(-1, errno, "inet_pton");
+
+		if (connect(sock, (struct sockaddr *)&addr, sizeof(addr)) < 0)
+			error(-1, errno, "connect");
+
+		if (send(sock, &buffer, sizeof(buffer), 0) == -1)
+			error(-1, errno, "send");
+		if (read(sock, &buffer, sizeof(buffer)) == -1)
+			error(-1, errno, "waiting read");
+
+		gettimeofday(&end, NULL);
+		lat = timediff(start, end);
+		sum_lat += lat;
+		nr_lat++;
+		if (lat < 100000)
+			goto close;
+
+		/* getsockname() updates len, so reset it before every call. */
+		len = sizeof(laddr);
+		if (getsockname(sock, (struct sockaddr *)&laddr, &len) == -1)
+			error(-1, errno, "getsockname");
+		printf("port: %d, lat: %lu, avg: %lu, nr: %lu\n",
+		       ntohs(laddr.sin_port), lat,
+		       sum_lat / nr_lat, nr_lat);
+close:
+		fflush(stdout);
+		close(sock);
+	}
+}
+
+/*
+ * Server loop: accept connections on the listening socket @sock, read one
+ * int from each and close it. @address is only used as storage for the
+ * peer address filled in by accept().
+ * Never returns; accept/read errors terminate the process via error().
+ */
+static void server(int sock, struct sockaddr_in address)
+{
+	int accepted;
+	socklen_t addrlen;
+	int buffer;
+
+	while (1) {
+		/* accept() overwrites addrlen, so reset it on every pass. */
+		addrlen = sizeof(address);
+		accepted = accept(sock, (struct sockaddr *)&address,
+				  &addrlen);
+		if (accepted < 0)
+			error(-1, errno, "accept");
+
+		if (read(accepted, &buffer, sizeof(buffer)) == -1)
+			error(-1, errno, "read");
+		close(accepted);
+	}
+}
+
+/*
+ * SIGTERM handler: forward the termination request to the forked client and
+ * exit. kill() takes the pid first and the signal second. The guard matters
+ * because the handler is inherited by the child, where child_pid is 0:
+ * kill(0, SIGTERM) would signal the caller's whole process group.
+ */
+static void sig_handler(int signum)
+{
+	if (child_pid > 0)
+		kill(child_pid, SIGTERM);
+	exit(0);
+}
+
+/*
+ * Create a listening TCP socket on a kernel-chosen port, report the port on
+ * stderr, then fork: the child runs the client loop against that port and
+ * the parent runs the server loop. SIGTERM is handled by sig_handler().
+ */
+int main(int argc, char const *argv[])
+{
+	int sock;
+	int opt = 1;
+	/* Zero-initialised so bind() sees no stack garbage. */
+	struct sockaddr_in address = { 0 };
+	struct sockaddr_in laddr;
+	socklen_t len = sizeof(laddr);
+
+	if (signal(SIGTERM, sig_handler) == SIG_ERR)
+		error(-1, errno, "signal");
+
+	sock = socket(AF_INET, SOCK_STREAM, 0);
+	if (sock < 0)
+		error(-1, errno, "socket");
+
+	/*
+	 * Socket option names are plain numbers, not bit flags, so they
+	 * cannot be OR-ed together; each option needs its own call.
+	 */
+	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+		       &opt, sizeof(opt)) == -1)
+		error(-1, errno, "setsockopt(reuseaddr)");
+	if (setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
+		       &opt, sizeof(opt)) == -1)
+		error(-1, errno, "setsockopt(reuseport)");
+
+	address.sin_family = AF_INET;
+	address.sin_addr.s_addr = INADDR_ANY;
+	/* dynamically allocate unused port */
+	address.sin_port = 0;
+
+	if (bind(sock, (struct sockaddr *)&address, sizeof(address)) < 0)
+		error(-1, errno, "bind");
+
+	if (listen(sock, 3) < 0)
+		error(-1, errno, "listen");
+
+	/* Find out which port the kernel picked. */
+	if (getsockname(sock, (struct sockaddr *)&laddr, &len) == -1)
+		error(-1, errno, "getsockname");
+
+	fprintf(stderr, "server port: %d\n", ntohs(laddr.sin_port));
+	child_pid = fork();
+	/* Without this check a failed fork would leave child_pid at -1. */
+	if (child_pid < 0)
+		error(-1, errno, "fork");
+	if (!child_pid)
+		client(ntohs(laddr.sin_port));
+	else
+		server(sock, laddr);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/fin_ack_lat.sh b/tools/testing/selftests/net/fin_ack_lat.sh
new file mode 100755
index 000000000..a3ff6e0b2
--- /dev/null
+++ b/tools/testing/selftests/net/fin_ack_lat.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test latency spikes caused by FIN/ACK handling race.
+
+set +x
+set -e
+
+tmpfile=$(mktemp /tmp/fin_ack_latency.XXXX.log)
+
+cleanup() {
+ kill $(pidof fin_ack_lat)
+ rm -f $tmpfile
+}
+
+trap cleanup EXIT
+
+do_test() {
+ RUNTIME=$1
+
+ ./fin_ack_lat | tee $tmpfile &
+ PID=$!
+
+ sleep $RUNTIME
+ NR_SPIKES=$(wc -l $tmpfile | awk '{print $1}')
+ if [ $NR_SPIKES -gt 0 ]
+ then
+ echo "FAIL: $NR_SPIKES spikes detected"
+ return 1
+ fi
+ return 0
+}
+
+do_test "30"
+echo "test done"
diff --git a/tools/testing/selftests/net/forwarding/.gitignore b/tools/testing/selftests/net/forwarding/.gitignore
new file mode 100644
index 000000000..2dea317f1
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+forwarding.config
diff --git a/tools/testing/selftests/net/forwarding/Makefile b/tools/testing/selftests/net/forwarding/Makefile
new file mode 100644
index 000000000..881e680c2
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/Makefile
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0+ OR MIT
+
+TEST_PROGS = bridge_igmp.sh \
+ bridge_port_isolation.sh \
+ bridge_sticky_fdb.sh \
+ bridge_vlan_aware.sh \
+ bridge_vlan_unaware.sh \
+ ethtool.sh \
+ gre_inner_v4_multipath.sh \
+ gre_inner_v6_multipath.sh \
+ gre_multipath.sh \
+ ip6_forward_instats_vrf.sh \
+ ip6gre_inner_v4_multipath.sh \
+ ip6gre_inner_v6_multipath.sh \
+ ipip_flat_gre_key.sh \
+ ipip_flat_gre_keys.sh \
+ ipip_flat_gre.sh \
+ ipip_hier_gre_key.sh \
+ ipip_hier_gre_keys.sh \
+ ipip_hier_gre.sh \
+ loopback.sh \
+ mirror_gre_bound.sh \
+ mirror_gre_bridge_1d.sh \
+ mirror_gre_bridge_1d_vlan.sh \
+ mirror_gre_bridge_1q_lag.sh \
+ mirror_gre_bridge_1q.sh \
+ mirror_gre_changes.sh \
+ mirror_gre_flower.sh \
+ mirror_gre_lag_lacp.sh \
+ mirror_gre_neigh.sh \
+ mirror_gre_nh.sh \
+ mirror_gre.sh \
+ mirror_gre_vlan_bridge_1q.sh \
+ mirror_gre_vlan.sh \
+ mirror_vlan.sh \
+ router_bridge.sh \
+ router_bridge_vlan.sh \
+ router_broadcast.sh \
+ router_mpath_nh.sh \
+ router_multicast.sh \
+ router_multipath.sh \
+ router.sh \
+ router_vid_1.sh \
+ sch_ets.sh \
+ sch_tbf_ets.sh \
+ sch_tbf_prio.sh \
+ sch_tbf_root.sh \
+ tc_actions.sh \
+ tc_chains.sh \
+ tc_flower_router.sh \
+ tc_flower.sh \
+ tc_shblocks.sh \
+ tc_vlan_modify.sh \
+ vxlan_asymmetric.sh \
+ vxlan_bridge_1d_port_8472.sh \
+ vxlan_bridge_1d.sh \
+ vxlan_bridge_1q_port_8472.sh \
+ vxlan_bridge_1q.sh \
+ vxlan_symmetric.sh
+
+TEST_PROGS_EXTENDED := devlink_lib.sh \
+ ethtool_lib.sh \
+ fib_offload_lib.sh \
+ forwarding.config.sample \
+ ipip_lib.sh \
+ lib.sh \
+ mirror_gre_lib.sh \
+ mirror_gre_topo_lib.sh \
+ mirror_lib.sh \
+ mirror_topo_lib.sh \
+ sch_ets_core.sh \
+ sch_ets_tests.sh \
+ sch_tbf_core.sh \
+ sch_tbf_etsprio.sh \
+ tc_common.sh
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/forwarding/README b/tools/testing/selftests/net/forwarding/README
new file mode 100644
index 000000000..b8a2af8fc
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/README
@@ -0,0 +1,58 @@
+Motivation
+==========
+
+One of the nice things about network namespaces is that they allow one
+to easily create and test complex environments.
+
+Unfortunately, these namespaces can not be used with actual switching
+ASICs, as their ports can not be migrated to other network namespaces
+(NETIF_F_NETNS_LOCAL) and most of them probably do not support the
+L1-separation provided by namespaces.
+
+However, a similar kind of flexibility can be achieved by using VRFs and
+by looping the switch ports together. For example:
+
+ br0
+ +
+ vrf-h1 | vrf-h2
+ + +---+----+ +
+ | | | |
+ 192.0.2.1/24 + + + + 192.0.2.2/24
+ swp1 swp2 swp3 swp4
+ + + + +
+ | | | |
+ +--------+ +--------+
+
+The VRFs act as lightweight namespaces representing hosts connected to
+the switch.
+
+This approach for testing switch ASICs has several advantages over the
+traditional method that requires multiple physical machines, to name a
+few:
+
+1. Only the device under test (DUT) is being tested without noise from
+other systems.
+
+2. Ability to easily provision complex topologies. Testing bridging
+between 4-port LAGs or 8-way ECMP requires many physical links that are
+not always available. With the VRF-based approach one merely needs to
+loop back more ports.
+
+These tests are written with switch ASICs in mind, but they can be run
+on any Linux box using veth pairs to emulate physical loopbacks.
+
+Guidelines for Writing Tests
+============================
+
+o Where possible, reuse an existing topology for different tests instead
+ of recreating the same topology.
+o Tests that use anything but the most trivial topologies should include
+ an ASCII art showing the topology.
+o Where possible, IPv6 and IPv4 addresses shall conform to RFC 3849 and
+ RFC 5737, respectively.
+o Where possible, tests shall be written so that they can be reused by
+ multiple topologies and added to lib.sh.
+o Checks shall be added to lib.sh for any external dependencies.
+o Code shall be checked using ShellCheck [1] prior to submission.
+
+1. https://www.shellcheck.net/
diff --git a/tools/testing/selftests/net/forwarding/bridge_igmp.sh b/tools/testing/selftests/net/forwarding/bridge_igmp.sh
new file mode 100755
index 000000000..88d2472ba
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_igmp.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="reportleave_test"
+NUM_NETIFS=4
+CHECK_TC="yes"
+TEST_GROUP="239.10.10.10"
+TEST_GROUP_MAC="01:00:5e:0a:0a:0a"
+source lib.sh
+
+# Set up host interface $h1 with its IPv4 and IPv6 test addresses
+# (simple_if_init is not defined in this file; presumably from lib.sh).
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Set up host interface $h2 with its IPv4 and IPv6 test addresses.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Undo h2_create.
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Create bridge br0 with multicast snooping and querier enabled, enslave
+# $swp1 and $swp2 to it and bring all three devices up.
+switch_create()
+{
+ ip link add dev br0 type bridge mcast_snooping 1 mcast_querier 1
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+}
+
+# Bring the bridge ports down and delete br0.
+switch_destroy()
+{
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+# Map the NETIFS entries p1..p4 to h1/swp1/swp2/h2, then build the VRFs,
+# hosts and bridge.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+# EXIT handler: tear down the bridge, the hosts and the VRFs in reverse
+# order of creation.
+cleanup()
+{
+	pre_cleanup
+
+	switch_destroy
+
+	# Always cleanup the mcast group. reportleave_test deletes the
+	# address itself, so failure here is expected; silence both stdout
+	# and stderr. ("2>&1 1>/dev/null" points stderr at the old stdout
+	# and therefore does not hide the error message.)
+	ip address del dev $h2 $TEST_GROUP/32 &> /dev/null
+
+	h2_destroy
+	h1_destroy
+
+	vrf_cleanup
+}
+
+# Send one UDP packet to destination MAC $1 / IP $2 out of interface $3 and
+# check whether interface $4 received it.
+# Returns 0 if the packet wasn't seen on $4 or 1 if it was.
+mcast_packet_test()
+{
+	local dmac=$1
+	local dip=$2
+	local tx_if=$3
+	local rx_if=$4
+	local hit=0
+
+	# Temporary ingress drop filter on the receiving interface; its
+	# packet counter reveals whether the frame arrived.
+	tc qdisc add dev $rx_if ingress
+	tc filter add dev $rx_if ingress protocol ip pref 1 handle 101 \
+		flower dst_mac $dmac action drop
+
+	$MZ $tx_if -c 1 -p 64 -b $dmac -B $dip -t udp "dp=4096,sp=2048" -q
+	sleep 1
+
+	# jq -e succeeds only if filter 101 counted exactly one packet.
+	if tc -j -s filter show dev $rx_if ingress \
+		| jq -e ".[] | select(.options.handle == 101) \
+		| select(.options.actions[0].stats.packets == 1)" &> /dev/null
+	then
+		hit=1
+	fi
+
+	tc filter del dev $rx_if ingress protocol ip pref 1 handle 101 flower
+	tc qdisc del dev $rx_if ingress
+
+	return $hit
+}
+
+# Two-part IGMP test, each part reported through its own log_test call:
+#  1. report: joining $TEST_GROUP on $h2 must create an mdb entry on br0 and
+#     traffic to the group must reach $h2;
+#  2. leave: removing the address must delete the mdb entry and traffic must
+#     no longer reach $h2.
+reportleave_test()
+{
+ RET=0
+ ip address add dev $h2 $TEST_GROUP/32 autojoin
+ check_err $? "Could not join $TEST_GROUP"
+
+ sleep 5
+ bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null
+ check_err $? "Report didn't create mdb entry for $TEST_GROUP"
+
+ # mcast_packet_test returns 1 when the packet was seen, hence check_fail.
+ mcast_packet_test $TEST_GROUP_MAC $TEST_GROUP $h1 $h2
+ check_fail $? "Traffic to $TEST_GROUP wasn't forwarded"
+
+ log_test "IGMP report $TEST_GROUP"
+
+ RET=0
+ bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null
+ check_err $? "mdb entry for $TEST_GROUP is missing"
+
+ ip address del dev $h2 $TEST_GROUP/32
+ check_err $? "Could not leave $TEST_GROUP"
+
+ sleep 5
+ bridge mdb show dev br0 | grep $TEST_GROUP 1>/dev/null
+ check_fail $? "Leave didn't delete mdb entry for $TEST_GROUP"
+
+ # Return value 0 means the packet was not seen, which is expected now.
+ mcast_packet_test $TEST_GROUP_MAC $TEST_GROUP $h1 $h2
+ check_err $? "Traffic to $TEST_GROUP was forwarded without mdb entry"
+
+ log_test "IGMP leave $TEST_GROUP"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_port_isolation.sh b/tools/testing/selftests/net/forwarding/bridge_port_isolation.sh
new file mode 100755
index 000000000..a43b4645c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_port_isolation.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6 flooding"
+NUM_NETIFS=6
+CHECK_TC="yes"
+source lib.sh
+
+# Set up host interface $h1 with its IPv4 and IPv6 test addresses.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Set up host interface $h2 with its IPv4 and IPv6 test addresses.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Undo h2_create.
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Set up host interface $h3 with its IPv4 and IPv6 test addresses.
+h3_create()
+{
+ simple_if_init $h3 192.0.2.3/24 2001:db8:1::3/64
+}
+
+# Undo h3_create.
+h3_destroy()
+{
+ simple_if_fini $h3 192.0.2.3/24 2001:db8:1::3/64
+}
+
+# Create bridge br0 with three ports: $swp1 and $swp2 are marked isolated,
+# $swp3 is explicitly left non-isolated. Then bring everything up.
+switch_create()
+{
+ ip link add dev br0 type bridge
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+ ip link set dev $swp3 master br0
+
+ ip link set dev $swp1 type bridge_slave isolated on
+ check_err $? "Can't set isolation on port $swp1"
+ ip link set dev $swp2 type bridge_slave isolated on
+ check_err $? "Can't set isolation on port $swp2"
+ ip link set dev $swp3 type bridge_slave isolated off
+ check_err $? "Can't disable isolation on port $swp3"
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+ ip link set dev $swp3 up
+}
+
+# Bring the bridge ports down and delete br0.
+switch_destroy()
+{
+ ip link set dev $swp3 down
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+# Map the NETIFS entries p1..p6 to the three host/switch-port pairs, then
+# build the VRFs, hosts and bridge.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ switch_create
+}
+
+# EXIT handler: tear everything down in reverse order of creation.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ RET=0
+ ping_do $h1 192.0.2.2
+ check_fail $? "Ping worked when it should not have"
+
+ RET=0
+ ping_do $h3 192.0.2.2
+ check_err $? "Ping didn't work when it should have"
+
+ log_test "Isolated port ping"
+}
+
+# IPv6 counterpart of ping_ipv4: $h1 must not reach 2001:db8:1::2, while
+# $h3, behind the non-isolated port, must.
+ping_ipv6()
+{
+	# RET is reset once per log_test so that a failure of either check
+	# is reported. NOTE(review): assumes check_err/check_fail accumulate
+	# into RET as in lib.sh - confirm.
+	RET=0
+	ping6_do $h1 2001:db8:1::2
+	check_fail $? "Ping6 worked when it should not have"
+
+	ping6_do $h3 2001:db8:1::2
+	check_err $? "Ping6 didn't work when it should have"
+
+	log_test "Isolated port ping6"
+}
+
+flooding()
+{
+ local mac=de:ad:be:ef:13:37
+ local ip=192.0.2.100
+
+ RET=0
+ flood_test_do false $mac $ip $h1 $h2
+ check_err $? "Packet was flooded when it should not have been"
+
+ RET=0
+ flood_test_do true $mac $ip $h3 $h2
+ check_err $? "Packet was not flooded when it should have been"
+
+ log_test "Isolated port flooding"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_sticky_fdb.sh b/tools/testing/selftests/net/forwarding/bridge_sticky_fdb.sh
new file mode 100755
index 000000000..1f8ef0eff
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_sticky_fdb.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="sticky"
+NUM_NETIFS=4
+TEST_MAC=de:ad:be:ef:13:37
+source lib.sh
+
+# Create bridge br0 with ports $swp1 and $swp2 and bring the bridge, both
+# ports and both host interfaces up.
+switch_create()
+{
+ ip link add dev br0 type bridge
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $h1 up
+ ip link set dev $swp1 up
+ ip link set dev $h2 up
+ ip link set dev $swp2 up
+}
+
+# Bring all four interfaces down and delete br0.
+switch_destroy()
+{
+ ip link set dev $swp2 down
+ ip link set dev $h2 down
+ ip link set dev $swp1 down
+ ip link set dev $h1 down
+
+ ip link del dev br0
+}
+
+# Map the NETIFS entries p1..p4 to h1/swp1/h2/swp2 and build the bridge.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+ h2=${NETIFS[p3]}
+ swp2=${NETIFS[p4]}
+
+ switch_create
+}
+
+# EXIT handler.
+cleanup()
+{
+ pre_cleanup
+ switch_destroy
+}
+
+# Sticky FDB test: add a static sticky entry for $TEST_MAC on $swp1, send
+# an ARP request with that source MAC from $h2, and verify the entry is
+# still listed on $swp1 afterwards.
+sticky()
+{
+	# Start from a clean result, like the other tests in this directory.
+	RET=0
+
+	bridge fdb add $TEST_MAC dev $swp1 master static sticky
+	check_err $? "Could not add fdb entry"
+	bridge fdb del $TEST_MAC dev $swp1 vlan 1 master static sticky
+	$MZ $h2 -c 1 -a $TEST_MAC -t arp "request" -q
+	bridge -j fdb show br br0 brport $swp1\
+		| jq -e ".[] | select(.mac == \"$TEST_MAC\")" &> /dev/null
+	check_err $? "Did not find FDB record when should"
+
+	log_test "Sticky fdb entry"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
new file mode 100755
index 000000000..b90dff8d3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion extern_learn"
+NUM_NETIFS=4
+CHECK_TC="yes"
+source lib.sh
+
+# Set up host interface $h1 with its IPv4 and IPv6 test addresses.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Set up host interface $h2 with its IPv4 and IPv6 test addresses.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Undo h2_create.
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Create VLAN-filtering bridge br0 (multicast snooping off), enslave $swp1
+# and $swp2 and bring all three devices up.
+switch_create()
+{
+ # 10 Seconds ageing time.
+ ip link add dev br0 type bridge vlan_filtering 1 ageing_time 1000 \
+ mcast_snooping 0
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+}
+
+# Bring the bridge ports down and delete br0.
+switch_destroy()
+{
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+# Map the NETIFS entries p1..p4 to h1/swp1/swp2/h2, then build the VRFs,
+# hosts and bridge.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+# EXIT handler: tear everything down in reverse order of creation.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# IPv4 ping from $h1 to h2's address.
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+# IPv6 ping from $h1 to h2's address.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+# FDB learning test on br0 port $swp1 using hosts $h1 and $h2.
+learning()
+{
+ learning_test "br0" $swp1 $h1 $h2
+}
+
+# Flooding test towards $swp2 using hosts $h1 and $h2.
+flooding()
+{
+ flood_test $swp2 $h1 $h2
+}
+
+# Add and immediately delete VLAN 10 on $swp1, then rerun both ping tests.
+vlan_deletion()
+{
+ # Test that the deletion of a VLAN on a bridge port does not affect
+ # the PVID VLAN
+ log_info "Add and delete a VLAN on bridge port $swp1"
+
+ bridge vlan add vid 10 dev $swp1
+ bridge vlan del vid 10 dev $swp1
+
+ ping_ipv4
+ ping_ipv6
+}
+
+# Externally learned FDB entries: install one for $mac on $swp1, check that
+# it survives the bridge ageing time, then send traffic with that source
+# MAC from $h2 and check that the entry moved to $swp2.
+extern_learn()
+{
+	# Single definition of the test MAC; every command below uses it.
+	local mac=de:ad:be:ef:13:37
+	local ageing_time
+
+	# Test that externally learned FDB entries can roam, but not age out
+	RET=0
+
+	bridge fdb add $mac dev $swp1 master extern_learn vlan 1
+
+	bridge fdb show brport $swp1 | grep -q $mac
+	check_err $? "Did not find FDB entry when should"
+
+	# Wait for 10 seconds after the ageing time to make sure the FDB entry
+	# was not aged out
+	ageing_time=$(bridge_ageing_time_get br0)
+	sleep $((ageing_time + 10))
+
+	bridge fdb show brport $swp1 | grep -q $mac
+	check_err $? "FDB entry was aged out when should not"
+
+	$MZ $h2 -c 1 -p 64 -a $mac -t ip -q
+
+	bridge fdb show brport $swp2 | grep -q $mac
+	check_err $? "FDB entry did not roam when should"
+
+	log_test "Externally learned FDB entry - ageing & roaming"
+
+	# Best-effort removal from whichever port holds the entry now.
+	bridge fdb del $mac dev $swp2 master vlan 1 &> /dev/null
+	bridge fdb del $mac dev $swp1 master vlan 1 &> /dev/null
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
new file mode 100755
index 000000000..c15c6c85c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_unaware.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding"
+NUM_NETIFS=4
+source lib.sh
+
+# Set up host interface $h1 with its IPv4 and IPv6 test addresses.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24 2001:db8:1::1/64
+}
+
+# Set up host interface $h2 with its IPv4 and IPv6 test addresses.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Undo h2_create.
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/24 2001:db8:1::2/64
+}
+
+# Create bridge br0 without VLAN filtering (multicast snooping off), enslave
+# $swp1 and $swp2 and bring all three devices up.
+switch_create()
+{
+ # 10 Seconds ageing time.
+ ip link add dev br0 type bridge ageing_time 1000 mcast_snooping 0
+
+ ip link set dev $swp1 master br0
+ ip link set dev $swp2 master br0
+
+ ip link set dev br0 up
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+}
+
+# Bring the bridge ports down and delete br0.
+switch_destroy()
+{
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+
+ ip link del dev br0
+}
+
+# Map the NETIFS entries p1..p4 to h1/swp1/swp2/h2, then build the VRFs,
+# hosts and bridge.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ switch_create
+}
+
+# EXIT handler: tear everything down in reverse order of creation.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# IPv4 ping from $h1 to h2's address.
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+# IPv6 ping from $h1 to h2's address.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+# FDB learning test on br0 port $swp1 using hosts $h1 and $h2.
+learning()
+{
+ learning_test "br0" $swp1 $h1 $h2
+}
+
+# Flooding test towards $swp2 using hosts $h1 and $h2.
+flooding()
+{
+ flood_test $swp2 $h1 $h2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/config b/tools/testing/selftests/net/forwarding/config
new file mode 100644
index 000000000..da96eff72
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/config
@@ -0,0 +1,14 @@
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+CONFIG_BRIDGE_VLAN_FILTERING=y
+CONFIG_NET_L3_MASTER_DEV=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NET_VRF=m
+CONFIG_BPF_SYSCALL=y
+CONFIG_CGROUP_BPF=y
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_SCH_INGRESS=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_VETH=m
+CONFIG_NAMESPACES=y
+CONFIG_NET_NS=y
diff --git a/tools/testing/selftests/net/forwarding/devlink_lib.sh b/tools/testing/selftests/net/forwarding/devlink_lib.sh
new file mode 100644
index 000000000..9c12c4fd3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/devlink_lib.sh
@@ -0,0 +1,557 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##############################################################################
+# Defines
+
+# Derive DEVLINK_DEV from the first test interface unless the caller has
+# already set it ([[ -v ]] tests whether the variable is set).
+# NOTE(review): the failure paths below print "SKIP:" but exit with status 1.
+if [[ ! -v DEVLINK_DEV ]]; then
+ # Keep the first two '/'-separated fields of the devlink port key.
+ DEVLINK_DEV=$(devlink port show "${NETIFS[p1]:-$NETIF_NO_CABLE}" -j \
+ | jq -r '.port | keys[]' | cut -d/ -f-2)
+ if [ -z "$DEVLINK_DEV" ]; then
+ echo "SKIP: ${NETIFS[p1]} has no devlink device registered for it"
+ exit 1
+ fi
+ if [[ "$(echo $DEVLINK_DEV | grep -c pci)" -eq 0 ]]; then
+ echo "SKIP: devlink device's bus is not PCI"
+ exit 1
+ fi
+
+ # Third space-separated field of 'lspci -n' for the device's address.
+ DEVLINK_VIDDID=$(lspci -s $(echo $DEVLINK_DEV | cut -d"/" -f2) \
+ -n | cut -d" " -f3)
+fi
+
+##############################################################################
+# Sanity checks
+
+# Each check greps the tool's help output for a required sub-command.
+devlink help 2>&1 | grep resource &> /dev/null
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing devlink resource support"
+ exit 1
+fi
+
+devlink help 2>&1 | grep trap &> /dev/null
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing devlink trap support"
+ exit 1
+fi
+
+devlink dev help 2>&1 | grep info &> /dev/null
+if [ $? -ne 0 ]; then
+ echo "SKIP: iproute2 too old, missing devlink dev info support"
+ exit 1
+fi
+
+##############################################################################
+# Devlink helpers
+
+# Join all arguments with '/' and print the resulting resource path.
+# While the accumulated path is still empty, no separator is inserted.
+devlink_resource_names_to_path()
+{
+	local joined=""
+	local elem
+
+	for elem in "${@}"; do
+		joined="${joined:+${joined}/}$elem"
+	done
+
+	echo "$joined"
+}
+
+# Print the JSON object of a devlink resource of $DEVLINK_DEV.
+# $1 is the top-level resource name; each further argument selects a nested
+# entry under the previous one's "resources" array.
+devlink_resource_get()
+{
+ local name=$1
+ local resource_name=.[][\"$DEVLINK_DEV\"]
+
+ # Build the jq filter step by step, starting at the device's entry.
+ resource_name="$resource_name | .[] | select (.name == \"$name\")"
+
+ shift
+ for resource in "${@}"; do
+ resource_name="${resource_name} | .[\"resources\"][] | \
+ select (.name == \"$resource\")"
+ done
+
+ devlink -j resource show "$DEVLINK_DEV" | jq "$resource_name"
+}
+
+devlink_resource_size_get()
+{
+ local size=$(devlink_resource_get "$@" | jq '.["size_new"]')
+
+ if [ "$size" == "null" ]; then
+ devlink_resource_get "$@" | jq '.["size"]'
+ else
+ echo "$size"
+ fi
+}
+
+# Set the size of a devlink resource of $DEVLINK_DEV.
+# $1 is the new size; the remaining arguments are the resource names that
+# form the resource path. Failure is recorded through check_err.
+devlink_resource_size_set()
+{
+	local new_size=$1
+	local path
+
+	shift
+	path=$(devlink_resource_names_to_path "$@")
+	devlink resource set "$DEVLINK_DEV" path "$path" size "$new_size"
+	# Report the size that was actually requested; the variable is
+	# new_size, there is no "size" in this function.
+	check_err $? "Failed setting path $path to size $new_size"
+}
+
+# Print the "occ" field of the resource selected by the arguments (same
+# arguments as devlink_resource_get).
+devlink_resource_occ_get()
+{
+ devlink_resource_get "$@" | jq '.["occ"]'
+}
+
+# Reload $DEVLINK_DEV and verify that no resource still reports "size_new".
+devlink_reload()
+{
+ local still_pending
+
+ devlink dev reload "$DEVLINK_DEV" &> /dev/null
+ check_err $? "Failed reload"
+
+ # grep -c prints the number of matching lines; any non-zero count is
+ # passed to check_err as the error value.
+ still_pending=$(devlink resource show "$DEVLINK_DEV" | \
+ grep -c "size_new")
+ check_err $still_pending "Failed reload - There are still unset sizes"
+}
+
+# Saved original settings, keyed by strings built in the *_save helpers.
+declare -A DEVLINK_ORIG
+
+# Changing pool type from static to dynamic causes reinterpretation of threshold
+# values. They therefore need to be saved before pool type is changed, then the
+# pool type can be changed, and then the new values need to be set up. Therefore
+# instead of saving the current state implicitly in the _set call, provide
+# functions for all three primitives: save, set, and restore.
+
+# Print the threshold of pool $2 on port $1.
+devlink_port_pool_threshold()
+{
+ local port=$1; shift
+ local pool=$1; shift
+
+ devlink sb port pool show $port pool $pool -j \
+ | jq '.port_pool."'"$port"'"[].threshold'
+}
+
+# Remember the current threshold of pool $2 on port $1 in DEVLINK_ORIG.
+devlink_port_pool_th_save()
+{
+ local port=$1; shift
+ local pool=$1; shift
+ local key="port_pool($port,$pool).threshold"
+
+ DEVLINK_ORIG[$key]=$(devlink_port_pool_threshold $port $pool)
+}
+
+# Set the threshold of pool $2 on port $1 to $3.
+devlink_port_pool_th_set()
+{
+ local port=$1; shift
+ local pool=$1; shift
+ local th=$1; shift
+
+ devlink sb port pool set $port pool $pool th $th
+}
+
+# Restore the threshold saved by devlink_port_pool_th_save for port $1,
+# pool $2; warn if nothing was saved.
+devlink_port_pool_th_restore()
+{
+ local port=$1; shift
+ local pool=$1; shift
+ local key="port_pool($port,$pool).threshold"
+ local -a orig=(${DEVLINK_ORIG[$key]})
+
+ # $orig without a subscript expands to the first array element.
+ if [[ -z $orig ]]; then
+ echo "WARNING: Mismatched devlink_port_pool_th_restore"
+ else
+ devlink sb port pool set $port pool $pool th $orig
+ fi
+}
+
+# Print the size and the threshold type of pool $1, one per line.
+devlink_pool_size_thtype()
+{
+ local pool=$1; shift
+
+ devlink sb pool show "$DEVLINK_DEV" pool $pool -j \
+ | jq -r '.pool[][] | (.size, .thtype)'
+}
+
+# Remember the current size and threshold type of pool $1 in DEVLINK_ORIG.
+devlink_pool_size_thtype_save()
+{
+ local pool=$1; shift
+ local key="pool($pool).size_thtype"
+
+ DEVLINK_ORIG[$key]=$(devlink_pool_size_thtype $pool)
+}
+
+# Configure pool $1 with threshold type $2 and size $3.
+devlink_pool_size_thtype_set()
+{
+ local pool=$1; shift
+ local thtype=$1; shift
+ local size=$1; shift
+
+ devlink sb pool set "$DEVLINK_DEV" pool $pool size $size thtype $thtype
+}
+
+# Restore the settings saved by devlink_pool_size_thtype_save for pool $1;
+# warn if nothing was saved.
+devlink_pool_size_thtype_restore()
+{
+ local pool=$1; shift
+ local key="pool($pool).size_thtype"
+ # Word-splitting the saved value yields (size thtype).
+ local -a orig=(${DEVLINK_ORIG[$key]})
+
+ if [[ -z ${orig[0]} ]]; then
+ echo "WARNING: Mismatched devlink_pool_size_thtype_restore"
+ else
+ devlink sb pool set "$DEVLINK_DEV" pool $pool \
+ size ${orig[0]} thtype ${orig[1]}
+ fi
+}
+
+# Print the pool and the threshold bound to TC $2 of port $1 in direction
+# $3, one per line.
+devlink_tc_bind_pool_th()
+{
+ local port=$1; shift
+ local tc=$1; shift
+ local dir=$1; shift
+
+ devlink sb tc bind show $port tc $tc type $dir -j \
+ | jq -r '.tc_bind[][] | (.pool, .threshold)'
+}
+
+# Remember the current binding of port $1, TC $2, direction $3 in
+# DEVLINK_ORIG.
+devlink_tc_bind_pool_th_save()
+{
+ local port=$1; shift
+ local tc=$1; shift
+ local dir=$1; shift
+ local key="tc_bind($port,$dir,$tc).pool_th"
+
+ DEVLINK_ORIG[$key]=$(devlink_tc_bind_pool_th $port $tc $dir)
+}
+
+# Bind TC $2 of port $1 in direction $3 to pool $4 with threshold $5.
+devlink_tc_bind_pool_th_set()
+{
+ local port=$1; shift
+ local tc=$1; shift
+ local dir=$1; shift
+ local pool=$1; shift
+ local th=$1; shift
+
+ devlink sb tc bind set $port tc $tc type $dir pool $pool th $th
+}
+
+# Restore the binding saved by devlink_tc_bind_pool_th_save for port $1,
+# TC $2, direction $3; warn if nothing was saved.
+devlink_tc_bind_pool_th_restore()
+{
+ local port=$1; shift
+ local tc=$1; shift
+ local dir=$1; shift
+ local key="tc_bind($port,$dir,$tc).pool_th"
+ # Word-splitting the saved value yields (pool threshold).
+ local -a orig=(${DEVLINK_ORIG[$key]})
+
+ if [[ -z ${orig[0]} ]]; then
+ echo "WARNING: Mismatched devlink_tc_bind_pool_th_restore"
+ else
+ devlink sb tc bind set $port tc $tc type $dir \
+ pool ${orig[0]} th ${orig[1]}
+ fi
+}
+
+# Print the number of traps listed for $DEVLINK_DEV.
+devlink_traps_num_get()
+{
+ devlink -j trap | jq '.[]["'$DEVLINK_DEV'"] | length'
+}
+
+# Print the names of all traps of $DEVLINK_DEV, one per line.
+devlink_traps_get()
+{
+ devlink -j trap | jq -r '.[]["'$DEVLINK_DEV'"][].name'
+}
+
+# Print the "type" field of trap $1.
+devlink_trap_type_get()
+{
+ local trap_name=$1; shift
+
+ devlink -j trap show $DEVLINK_DEV trap $trap_name \
+ | jq -r '.[][][].type'
+}
+
+# Set the action of trap $1 to $2; all output is discarded.
+devlink_trap_action_set()
+{
+ local trap_name=$1; shift
+ local action=$1; shift
+
+ # Pipe output to /dev/null to avoid expected warnings.
+ devlink trap set $DEVLINK_DEV trap $trap_name \
+ action $action &> /dev/null
+}
+
+# Print the "action" field of trap $1.
+devlink_trap_action_get()
+{
+ local trap_name=$1; shift
+
+ devlink -j trap show $DEVLINK_DEV trap $trap_name \
+ | jq -r '.[][][].action'
+}
+
+# Print the name of the trap group that trap $1 belongs to.
+devlink_trap_group_get()
+{
+	# Take the trap name from the argument, like the sibling getters,
+	# rather than relying on a variable of the same name in the caller.
+	local trap_name=$1; shift
+
+	devlink -j trap show $DEVLINK_DEV trap $trap_name \
+		| jq -r '.[][][].group'
+}
+
+# Succeed (exit status 0) if the metadata list of trap $1 contains $2.
+devlink_trap_metadata_test()
+{
+ local trap_name=$1; shift
+ local metadata=$1; shift
+
+ devlink -jv trap show $DEVLINK_DEV trap $trap_name \
+ | jq -e '.[][][].metadata | contains(["'$metadata'"])' \
+ &> /dev/null
+}
+
+# Print the rx packet counter of trap $1.
+devlink_trap_rx_packets_get()
+{
+ local trap_name=$1; shift
+
+ devlink -js trap show $DEVLINK_DEV trap $trap_name \
+ | jq '.[][][]["stats"]["rx"]["packets"]'
+}
+
+# Print the rx byte counter of trap $1.
+devlink_trap_rx_bytes_get()
+{
+ local trap_name=$1; shift
+
+ devlink -js trap show $DEVLINK_DEV trap $trap_name \
+ | jq '.[][][]["stats"]["rx"]["bytes"]'
+}
+
+# Sample the rx counters of trap $1 twice, one second apart.
+# Returns 0 if both packets and bytes are unchanged (idle), 1 otherwise.
+devlink_trap_stats_idle_test()
+{
+ local trap_name=$1; shift
+ local t0_packets t0_bytes
+ local t1_packets t1_bytes
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t0_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ sleep 1
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+ t1_bytes=$(devlink_trap_rx_bytes_get $trap_name)
+
+ if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then
+ return 0
+ else
+ return 1
+ fi
+}
+
+# Set the action of every trap of $DEVLINK_DEV to "trap".
+devlink_traps_enable_all()
+{
+ local trap_name
+
+ for trap_name in $(devlink_traps_get); do
+ devlink_trap_action_set $trap_name "trap"
+ done
+}
+
+# Set the action of every trap of $DEVLINK_DEV to "drop".
+devlink_traps_disable_all()
+{
+	# Keep the loop variable local, as devlink_traps_enable_all does, so
+	# it cannot overwrite a trap_name belonging to the caller.
+	local trap_name
+
+	for trap_name in $(devlink_traps_get); do
+		devlink_trap_action_set $trap_name "drop"
+	done
+}
+
+# Print the names of all trap groups of $DEVLINK_DEV, one per line.
+devlink_trap_groups_get()
+{
+ devlink -j trap group | jq -r '.[]["'$DEVLINK_DEV'"][].name'
+}
+
+# Set the action of trap group $1 to $2; all output is discarded.
+devlink_trap_group_action_set()
+{
+ local group_name=$1; shift
+ local action=$1; shift
+
+ # Pipe output to /dev/null to avoid expected warnings.
+ devlink trap group set $DEVLINK_DEV group $group_name action $action \
+ &> /dev/null
+}
+
+# Print the rx packet counter of trap group $1.
+devlink_trap_group_rx_packets_get()
+{
+ local group_name=$1; shift
+
+ devlink -js trap group show $DEVLINK_DEV group $group_name \
+ | jq '.[][][]["stats"]["rx"]["packets"]'
+}
+
+# Print the rx byte counter of trap group $1.
+devlink_trap_group_rx_bytes_get()
+{
+ local group_name=$1; shift
+
+ devlink -js trap group show $DEVLINK_DEV group $group_name \
+ | jq '.[][][]["stats"]["rx"]["bytes"]'
+}
+
+# Sample the rx counters of trap group $1 twice, one second apart.
+# Returns 0 if both packets and bytes are unchanged (idle), 1 otherwise.
+devlink_trap_group_stats_idle_test()
+{
+ local group_name=$1; shift
+ local t0_packets t0_bytes
+ local t1_packets t1_bytes
+
+ t0_packets=$(devlink_trap_group_rx_packets_get $group_name)
+ t0_bytes=$(devlink_trap_group_rx_bytes_get $group_name)
+
+ sleep 1
+
+ t1_packets=$(devlink_trap_group_rx_packets_get $group_name)
+ t1_bytes=$(devlink_trap_group_rx_bytes_get $group_name)
+
+ if [[ $t0_packets -eq $t1_packets && $t0_bytes -eq $t1_bytes ]]; then
+ return 0
+ else
+ return 1
+ fi
+}
+
+# Check that both trap $1 and its trap group are currently counting
+# packets, i.e. that neither set of counters is idle.
+devlink_trap_exception_test()
+{
+ local trap_name=$1; shift
+ local group_name
+
+ group_name=$(devlink_trap_group_get $trap_name)
+
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Trap stats idle when packets should have been trapped"
+
+ devlink_trap_group_stats_idle_test $group_name
+ check_fail $? "Trap group idle when packets should have been trapped"
+}
+
+# Common body of the drop-trap tests for trap $1. $2 is the device and $3
+# the tc filter handle used for the final "no packet was forwarded" check.
+devlink_trap_drop_test()
+{
+ local trap_name=$1; shift
+ local dev=$1; shift
+ local handle=$1; shift
+ local group_name
+
+ group_name=$(devlink_trap_group_get $trap_name)
+
+ # This is the common part of all the tests. It checks that stats are
+ # initially idle, then non-idle after changing the trap action and
+ # finally idle again. It also makes sure the packets are dropped and
+ # never forwarded.
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle with initial drop action"
+ devlink_trap_group_stats_idle_test $group_name
+ check_err $? "Trap group stats not idle with initial drop action"
+
+ devlink_trap_action_set $trap_name "trap"
+ devlink_trap_stats_idle_test $trap_name
+ check_fail $? "Trap stats idle after setting action to trap"
+ devlink_trap_group_stats_idle_test $group_name
+ check_fail $? "Trap group stats idle after setting action to trap"
+
+ devlink_trap_action_set $trap_name "drop"
+
+ devlink_trap_stats_idle_test $trap_name
+ check_err $? "Trap stats not idle after setting action to drop"
+ devlink_trap_group_stats_idle_test $group_name
+ check_err $? "Trap group stats not idle after setting action to drop"
+
+ # The egress filter with this handle must have matched 0 packets.
+ tc_check_packets "dev $dev egress" $handle 0
+ check_err $? "Packets were not dropped"
+}
+
+# Stop the background process with pid $1 and delete the egress flower
+# filter on device $2 identified by protocol $3, preference $4, handle $5.
+devlink_trap_drop_cleanup()
+{
+ local mz_pid=$1; shift
+ local dev=$1; shift
+ local proto=$1; shift
+ local pref=$1; shift
+ local handle=$1; shift
+
+ kill $mz_pid && wait $mz_pid &> /dev/null
+ tc filter del dev $dev egress protocol $proto pref $pref handle $handle flower
+}
+
+# Run the command given after $1 (test name) and $2 (trap name) and check
+# that the trap's rx packet counter changed across it. The result is
+# logged under the test name.
+devlink_trap_stats_test()
+{
+ local test_name=$1; shift
+ local trap_name=$1; shift
+ # The remaining arguments are flattened into one string and word-split
+ # again when executed below.
+ local send_one="$@"
+ local t0_packets
+ local t1_packets
+
+ RET=0
+
+ t0_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ $send_one && sleep 1
+
+ t1_packets=$(devlink_trap_rx_packets_get $trap_name)
+
+ if [[ $t1_packets -eq $t0_packets ]]; then
+ check_err 1 "Trap stats did not increase"
+ fi
+
+ log_test "$test_name"
+}
+
+# Print the number of trap policers listed for $DEVLINK_DEV.
+devlink_trap_policers_num_get()
+{
+ devlink -j -p trap policer show | jq '.[]["'$DEVLINK_DEV'"] | length'
+}
+
+# Print the "rate" field of trap policer $1.
+devlink_trap_policer_rate_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["rate"]'
+}
+
+# Print the "burst" field of trap policer $1.
+devlink_trap_policer_burst_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["burst"]'
+}
+
+# Print the rx "dropped" counter of trap policer $1.
+devlink_trap_policer_rx_dropped_get()
+{
+ local policer_id=$1; shift
+
+ devlink -j -p -s trap policer show $DEVLINK_DEV policer $policer_id \
+ | jq '.[][][]["stats"]["rx"]["dropped"]'
+}
+
+# Print the "policer" field of trap group $1.
+devlink_trap_group_policer_get()
+{
+ local group_name=$1; shift
+
+ devlink -j -p trap group show $DEVLINK_DEV group $group_name \
+ | jq '.[][][]["policer"]'
+}
+
+# Print the "policer" field of every trap policer of $DEVLINK_DEV.
+devlink_trap_policer_ids_get()
+{
+ devlink -j -p trap policer show \
+ | jq '.[]["'$DEVLINK_DEV'"][]["policer"]'
+}
+
+# Print the devlink port key(s) reported for network device $1.
+devlink_port_by_netdev()
+{
+ local if_name=$1
+
+ devlink -j port show $if_name | jq -e '.[] | keys' | jq -r '.[]'
+}
+
+# Print "$DEVLINK_DEV/<port>" for the first 'devlink port list' line that
+# mentions both $DEVLINK_DEV and "cpu".
+devlink_cpu_port_get()
+{
+ local cpu_dl_port_num=$(devlink port list | grep "$DEVLINK_DEV" |
+ grep cpu | cut -d/ -f3 | cut -d: -f1 |
+ sed -n '1p')
+
+ echo "$DEVLINK_DEV/$cpu_dl_port_num"
+}
+
+# Print the "cell_size" field reported for pool 0 of $DEVLINK_DEV.
+devlink_cell_size_get()
+{
+ devlink sb pool show "$DEVLINK_DEV" pool 0 -j \
+ | jq '.pool[][].cell_size'
+}
diff --git a/tools/testing/selftests/net/forwarding/ethtool.sh b/tools/testing/selftests/net/forwarding/ethtool.sh
new file mode 100755
index 000000000..aa2eafb7b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool.sh
@@ -0,0 +1,301 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ same_speeds_autoneg_off
+ different_speeds_autoneg_off
+ combination_of_neg_on_and_off
+ advertise_subset_of_speeds
+ check_highest_speed_is_chosen
+ different_speeds_autoneg_on
+"
+NUM_NETIFS=2
+source lib.sh
+source ethtool_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+}
+
+# Undo h1_create: hand $h1 and 192.0.2.1/24 to simple_if_fini.
+h1_destroy()
+{
+	local addr=192.0.2.1/24
+
+	simple_if_fini $h1 $addr
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24
+}
+
+# Undo h2_create: hand $h2 and 192.0.2.2/24 to simple_if_fini.
+h2_destroy()
+{
+	local addr=192.0.2.2/24
+
+	simple_if_fini $h2 $addr
+}
+
+# Resolve the two test interfaces from NETIFS into the globals h1 and h2 and
+# configure each of them.
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	h1_create
+
+	h2=${NETIFS[p2]}
+	h2_create
+}
+
+# EXIT handler: run pre_cleanup, then tear the hosts down in the reverse
+# order of their creation in setup_prepare.
+cleanup()
+{
+	local teardown
+
+	pre_cleanup
+
+	for teardown in h2_destroy h1_destroy; do
+		$teardown
+	done
+}
+
+# For every speed returned by common_speeds_get for $h1 and $h2, force that
+# speed on both ends with autoneg off and check that ping to 192.0.2.2
+# passes. One result is logged per speed; autoneg is re-enabled on both
+# ports at the end.
+same_speeds_autoneg_off()
+{
+	# Check that when each of the reported speeds is forced, the links come
+	# up and are operational.
+	local -a speeds_arr=($(common_speeds_get $h1 $h2 0 0))
+	# Keep the loop variable out of the global namespace.
+	local speed
+
+	for speed in "${speeds_arr[@]}"; do
+		RET=0
+		ethtool_set $h1 speed $speed autoneg off
+		ethtool_set $h2 speed $speed autoneg off
+
+		setup_wait_dev_with_timeout $h1
+		setup_wait_dev_with_timeout $h2
+		ping_do $h1 192.0.2.2
+		check_err $? "speed $speed autoneg off"
+		log_test "force of same speed autoneg off"
+		log_info "speed = $speed"
+	done
+
+	ethtool -s $h2 autoneg on
+	ethtool -s $h1 autoneg on
+}
+
+different_speeds_autoneg_off()
+{
+ # Test that when we force different speeds, links are not up and ping
+ # fails.
+ RET=0
+
+ local -a speeds_arr=($(different_speeds_get $h1 $h2 0 0))
+ local speed1=${speeds_arr[0]}
+ local speed2=${speeds_arr[1]}
+
+ ethtool_set $h1 speed $speed1 autoneg off
+ ethtool_set $h2 speed $speed2 autoneg off
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_fail $? "ping with different speeds"
+
+ log_test "force of different speeds autoneg off"
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+# For every common advertised speed, force it on $h1 with autoneg off while
+# $h2 is left untouched, and check that ping to 192.0.2.2 passes. One result
+# is logged per speed; autoneg is re-enabled on $h1 at the end.
+combination_of_neg_on_and_off()
+{
+	# Test that when one device is forced to a speed supported by both
+	# endpoints and the other device is configured to autoneg on, the links
+	# are up and ping passes.
+	local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
+	# Keep the loop variable out of the global namespace.
+	local speed
+
+	for speed in "${speeds_arr[@]}"; do
+		RET=0
+		ethtool_set $h1 speed $speed autoneg off
+
+		setup_wait_dev_with_timeout $h1
+		setup_wait_dev_with_timeout $h2
+		ping_do $h1 192.0.2.2
+		check_err $? "h1-speed=$speed autoneg off, h2 autoneg on"
+		log_test "one side with autoneg off and another with autoneg on"
+		log_info "force speed = $speed"
+	done
+
+	ethtool -s $h1 autoneg on
+}
+
+hex_speed_value_get()
+{
+ local speed=$1; shift
+
+ local shift_size=${speed_values[$speed]}
+ speed=$((0x1 << $"shift_size"))
+ printf "%#x" "$speed"
+}
+
+# Print the hex mask obtained by OR-ing the bits of all link modes common to
+# both devices, except the modes of the first common speed.
+#
+# Arguments:
+#   $1 - first device
+#   $2 - second device
+#   $3 - passed to common_speeds_get as its "adver" argument
+subset_of_common_speeds_get()
+{
+	local dev1=$1; shift
+	local dev2=$1; shift
+	local adver=$1; shift
+
+	local -a speeds_arr=($(common_speeds_get $dev1 $dev2 0 $adver))
+	local -a speeds_mode_arr=($(common_speeds_get $dev1 $dev2 1 $adver))
+	# Modes of this speed all start with "<speed>base".
+	local speed_to_remove=${speeds_arr[0]}base
+	local speed_to_advertise=0
+	local speed
+	local speed_bit
+
+	for speed in "${speeds_mode_arr[@]}"; do
+		if [[ $speed != "$speed_to_remove"* ]]; then
+			speed_bit=$(hex_speed_value_get "$speed")
+			speed_to_advertise=$((speed_to_advertise | speed_bit))
+		fi
+	done
+
+	# Convert to hex.
+	printf "%#x" "$speed_to_advertise"
+}
+
+speed_to_advertise_get()
+{
+ # The function returns the hex number that is composed by OR-ing all
+ # the modes corresponding to the provided speed.
+ local speed_without_mode=$1; shift
+ local supported_speeds=("$@"); shift
+ local speed_to_advertise=0
+
+ speed_without_mode+='base'
+
+ for speed in ${supported_speeds[@]}; do
+ if [[ $speed == $speed_without_mode* ]]; then
+ speed=$(hex_speed_value_get $speed)
+ speed_to_advertise=$(($speed_to_advertise | \
+ $speed))
+ fi
+
+ done
+
+ # Convert to hex.
+ printf "%#x" "$speed_to_advertise"
+}
+
+advertise_subset_of_speeds()
+{
+ # Test that when one device advertises a subset of speeds and another
+ # advertises a specific speed (but all modes of this speed), the links
+ # are up and ping passes.
+ RET=0
+
+ local speed_1_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
+ ethtool_set $h1 advertise $speed_1_to_advertise
+
+ if [ $RET != 0 ]; then
+ log_test "advertise subset of speeds"
+ return
+ fi
+
+ local -a speeds_arr_without_mode=($(common_speeds_get $h1 $h2 0 1))
+ # Check only speeds that h1 advertised. Remove the first speed.
+ unset speeds_arr_without_mode[0]
+ local -a speeds_arr_with_mode=($(common_speeds_get $h1 $h2 1 1))
+
+ for speed_value in ${speeds_arr_without_mode[@]}; do
+ RET=0
+ local speed_2_to_advertise=$(speed_to_advertise_get $speed_value \
+ "${speeds_arr_with_mode[@]}")
+ ethtool_set $h2 advertise $speed_2_to_advertise
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ ping_do $h1 192.0.2.2
+ check_err $? "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise ($speed_value)"
+
+ log_test "advertise subset of speeds"
+ log_info "h1=$speed_1_to_advertise, h2=$speed_2_to_advertise"
+ done
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+check_highest_speed_is_chosen()
+{
+ # Test that when one device advertises a subset of speeds, the other
+ # chooses the highest speed. This test checks configuration without
+ # traffic.
+ RET=0
+
+ local max_speed
+ local chosen_speed
+ local speed_to_advertise=$(subset_of_common_speeds_get $h1 $h2 1)
+
+ ethtool_set $h1 advertise $speed_to_advertise
+
+ if [ $RET != 0 ]; then
+ log_test "check highest speed"
+ return
+ fi
+
+ local -a speeds_arr=($(common_speeds_get $h1 $h2 0 1))
+
+ max_speed=${speeds_arr[0]}
+ for current in ${speeds_arr[@]}; do
+ if [[ $current -gt $max_speed ]]; then
+ max_speed=$current
+ fi
+ done
+
+ setup_wait_dev_with_timeout $h1
+ setup_wait_dev_with_timeout $h2
+ chosen_speed=$(ethtool $h1 | grep 'Speed:')
+ chosen_speed=${chosen_speed%"Mb/s"*}
+ chosen_speed=${chosen_speed#*"Speed: "}
+ ((chosen_speed == max_speed))
+ check_err $? "h1 advertise $speed_to_advertise, h2 sync to speed $chosen_speed"
+
+ log_test "check highest speed"
+
+ ethtool -s $h2 autoneg on
+ ethtool -s $h1 autoneg on
+}
+
+# Test that when we configure links to advertise different speeds,
+# links are not up and ping fails.
+#
+# The two link modes come from different_speeds_get and are converted to
+# single-bit hex masks before being advertised. Autoneg is re-enabled on
+# both ports at the end.
+different_speeds_autoneg_on()
+{
+	RET=0
+
+	local -a speeds=($(different_speeds_get $h1 $h2 1 1))
+	local speed1=${speeds[0]}
+	local speed2=${speeds[1]}
+
+	speed1=$(hex_speed_value_get $speed1)
+	speed2=$(hex_speed_value_get $speed2)
+
+	ethtool_set $h1 advertise $speed1
+	ethtool_set $h2 advertise $speed2
+
+	# Only exercise the link when both configurations were accepted. The
+	# previous "(($RET))" was true on failure, so the ping check was
+	# skipped exactly when it was meaningful.
+	if ((RET == 0)); then
+		setup_wait_dev_with_timeout $h1
+		setup_wait_dev_with_timeout $h2
+		ping_do $h1 192.0.2.2
+		check_fail $? "ping with different speeds autoneg on"
+	fi
+
+	log_test "advertise different speeds autoneg on"
+
+	ethtool -s $h2 autoneg on
+	ethtool -s $h1 autoneg on
+}
+
+skip_on_veth
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+declare -gA speed_values
+eval "speed_values=($(speeds_arr_get))"
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
new file mode 100755
index 000000000..baf831da5
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool_extended_state.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ autoneg
+ autoneg_force_mode
+ no_cable
+"
+
+NUM_NETIFS=2
+source lib.sh
+source ethtool_lib.sh
+
+# Populate the globals swp1/swp2 from NETIFS and swp3 from $NETIF_NO_CABLE.
+setup_prepare()
+{
+	# Port used by the no_cable test.
+	swp3=$NETIF_NO_CABLE
+
+	# Ports used by the autoneg tests.
+	swp1=${NETIFS[p1]}
+	swp2=${NETIFS[p2]}
+}
+
+ethtool_extended_state_check()
+{
+ local dev=$1; shift
+ local expected_ext_state=$1; shift
+ local expected_ext_substate=${1:-""}; shift
+
+ local ext_state=$(ethtool $dev | grep "Link detected" \
+ | cut -d "(" -f2 | cut -d ")" -f1)
+ local ext_substate=$(echo $ext_state | cut -sd "," -f2 \
+ | sed -e 's/^[[:space:]]*//')
+ ext_state=$(echo $ext_state | cut -d "," -f1)
+
+ [[ $ext_state == $expected_ext_state ]]
+ check_err $? "Expected \"$expected_ext_state\", got \"$ext_state\""
+
+ [[ $ext_substate == $expected_ext_substate ]]
+ check_err $? "Expected \"$expected_ext_substate\", got \"$ext_substate\""
+}
+
+# Set $swp1 up (swp2 is left alone), wait, and expect ethtool to report
+# extended state "Autoneg" with substate "No partner detected". The port is
+# set down again afterwards.
+autoneg()
+{
+ RET=0
+
+ ip link set dev $swp1 up
+
+ # NOTE(review): fixed 4 s wait, presumably to let the link state settle.
+ sleep 4
+ ethtool_extended_state_check $swp1 "Autoneg" "No partner detected"
+
+ log_test "Autoneg, No partner detected"
+
+ ip link set dev $swp1 down
+}
+
+# Force two different speeds on $swp1 and $swp2 with autoneg off and expect
+# both ports to report extended state "Autoneg" with substate "No partner
+# detected during force mode". Autoneg is re-enabled and both ports are set
+# down afterwards.
+autoneg_force_mode()
+{
+ RET=0
+
+ ip link set dev $swp1 up
+ ip link set dev $swp2 up
+
+ local -a speeds_arr=($(different_speeds_get $swp1 $swp2 0 0))
+ local speed1=${speeds_arr[0]}
+ local speed2=${speeds_arr[1]}
+
+ ethtool_set $swp1 speed $speed1 autoneg off
+ ethtool_set $swp2 speed $speed2 autoneg off
+
+ # NOTE(review): fixed 4 s wait, presumably to let the link state settle.
+ sleep 4
+ ethtool_extended_state_check $swp1 "Autoneg" \
+ "No partner detected during force mode"
+
+ ethtool_extended_state_check $swp2 "Autoneg" \
+ "No partner detected during force mode"
+
+ log_test "Autoneg, No partner detected during force mode"
+
+ ethtool -s $swp2 autoneg on
+ ethtool -s $swp1 autoneg on
+
+ ip link set dev $swp2 down
+ ip link set dev $swp1 down
+}
+
+# Set $swp3 (taken from $NETIF_NO_CABLE) up and expect ethtool to report
+# extended state "No cable" with an empty substate. The port is set down
+# again afterwards.
+no_cable()
+{
+ RET=0
+
+ ip link set dev $swp3 up
+
+ sleep 1
+ # No third argument: the expected substate defaults to empty.
+ ethtool_extended_state_check $swp3 "No cable"
+
+ log_test "No cable"
+
+ ip link set dev $swp3 down
+}
+
+skip_on_veth
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ethtool_lib.sh b/tools/testing/selftests/net/forwarding/ethtool_lib.sh
new file mode 100644
index 000000000..9188e624d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ethtool_lib.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Print one "[<mode>]=<bit>" word per ETHTOOL_LINK_MODE_*_BIT definition in
+# /usr/include/linux/ethtool.h, in a form suitable for initialising an
+# associative array.
+#
+# For each matching line the awk program strips a trailing comma, the
+# ETHTOOL_LINK_MODE_ prefix and the _BIT suffix, and rewrites _Full/_Half to
+# /Full and /Half, so a definition of ETHTOOL_LINK_MODE_10baseT_Half_BIT
+# with value 0 becomes "[10baseT/Half]=0".
+speeds_arr_get()
+{
+ # Keep the awk program out of the global namespace.
+ local cmd='/ETHTOOL_LINK_MODE_[^[:space:]]*_BIT[[:space:]]+=[[:space:]]+/ \
+ {sub(/,$/, "") \
+ sub(/ETHTOOL_LINK_MODE_/,"") \
+ sub(/_BIT/,"") \
+ sub(/_Full/,"/Full") \
+ sub(/_Half/,"/Half");\
+ print "["$1"]="$3}'
+
+ awk "${cmd}" /usr/include/linux/ethtool.h
+}
+
+# Run "ethtool -s" with the given arguments and treat any output, on stdout
+# or stderr, as a configuration error: the number of output lines is passed
+# to check_err as the status.
+ethtool_set()
+{
+	local cmd="$@"
+	local nlines
+
+	nlines=$(ethtool -s $cmd 2>&1 | wc -l)
+
+	check_err $nlines "error in configuration. $cmd"
+}
+
+dev_speeds_get()
+{
+ local dev=$1; shift
+ local with_mode=$1; shift
+ local adver=$1; shift
+ local speeds_str
+
+ if (($adver)); then
+ mode="Advertised link modes"
+ else
+ mode="Supported link modes"
+ fi
+
+ speeds_str=$(ethtool "$dev" | \
+ # Snip everything before the link modes section.
+ sed -n '/'"$mode"':/,$p' | \
+ # Quit processing the rest at the start of the next section.
+ # When checking, skip the header of this section (hence the 2,).
+ sed -n '2,${/^[\t][^ \t]/q};p' | \
+ # Drop the section header of the current section.
+ cut -d':' -f2)
+
+ local -a speeds_arr=($speeds_str)
+ if [[ $with_mode -eq 0 ]]; then
+ for ((i=0; i<${#speeds_arr[@]}; i++)); do
+ speeds_arr[$i]=${speeds_arr[$i]%base*}
+ done
+ fi
+ echo ${speeds_arr[@]}
+}
+
+# Print, one per line and sorted as text, the speeds/modes that
+# dev_speeds_get reports for both devices.
+#
+# Arguments:
+#   $1 - first device
+#   $2 - second device
+#   $3 - with_mode, passed to dev_speeds_get
+#   $4 - adver, passed to dev_speeds_get
+common_speeds_get()
+{
+	# Declared local so that the caller's variables of the same name are
+	# not overwritten.
+	local dev1=$1; shift
+	local dev2=$1; shift
+	local with_mode=$1; shift
+	local adver=$1; shift
+
+	local -a dev1_speeds=($(dev_speeds_get $dev1 $with_mode $adver))
+	local -a dev2_speeds=($(dev_speeds_get $dev2 $with_mode $adver))
+
+	# comm -12 keeps only the lines present in both sorted inputs.
+	comm -12 \
+		<(printf '%s\n' "${dev1_speeds[@]}" | sort -u) \
+		<(printf '%s\n' "${dev2_speeds[@]}" | sort -u)
+}
+
+different_speeds_get()
+{
+ local dev1=$1; shift
+ local dev2=$1; shift
+ local with_mode=$1; shift
+ local adver=$1; shift
+
+ local -a speeds_arr
+
+ speeds_arr=($(common_speeds_get $dev1 $dev2 $with_mode $adver))
+ if [[ ${#speeds_arr[@]} < 2 ]]; then
+ check_err 1 "cannot check different speeds. There are not enough speeds"
+ fi
+
+ echo ${speeds_arr[0]} ${speeds_arr[1]}
+}
diff --git a/tools/testing/selftests/net/forwarding/fib_offload_lib.sh b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh
new file mode 100644
index 000000000..66496659b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/fib_offload_lib.sh
@@ -0,0 +1,873 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Various helpers and tests to verify FIB offload.
+
+__fib_trap_check()
+{
+ local ns=$1; shift
+ local family=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+ local ret
+
+ ip -n $ns -j -p -$family route show $route \
+ | jq -e '.[]["flags"] | contains(["trap"])' &> /dev/null
+ ret=$?
+ if [[ $should_fail == "true" ]]; then
+ if [[ $ret -ne 0 ]]; then
+ return 0
+ else
+ return 1
+ fi
+ fi
+
+ return $ret
+}
+
+fib_trap_check()
+{
+ local ns=$1; shift
+ local family=$1; shift
+ local route=$1; shift
+ local should_fail=$1; shift
+
+ busywait 5000 __fib_trap_check $ns $family "$route" $should_fail
+}
+
+# IPv4 flavour of fib_trap_check.
+#   $1 - network namespace, $2 - route specification, $3 - should_fail
+fib4_trap_check()
+{
+	local ns=$1
+	local route=$2
+	local should_fail=$3
+
+	fib_trap_check $ns 4 "$route" $should_fail
+}
+
+# IPv6 flavour of fib_trap_check.
+#   $1 - network namespace, $2 - route specification, $3 - should_fail
+fib6_trap_check()
+{
+	local ns=$1
+	local route=$2
+	local should_fail=$3
+
+	fib_trap_check $ns 6 "$route" $should_fail
+}
+
+# In namespace $1, install three routes for the same prefix, TOS and metric
+# over dummy1..dummy3 (add, append, prepend) and check after each step which
+# of them carries the "trap" flag. The dummy devices are removed at the end.
+fib_ipv4_identical_routes_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ # An appended route is expected not to carry the flag.
+ ip -n $ns route append 192.0.2.0/24 dev dummy2 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 tos 0 metric 1024" true
+ check_err $? "Appended route in hardware when should not"
+
+ # A prepended route is expected to take the flag over from the first one.
+ ip -n $ns route prepend 192.0.2.0/24 dev dummy3 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy3 tos 0 metric 1024" false
+ check_err $? "Prepended route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true
+ check_err $? "Route was not replaced in hardware by prepended one"
+
+ log_test "IPv4 identical routes"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+# In namespace $1, install routes for one prefix with TOS 0, 2 and 1 (in
+# that order) and check that only the highest-TOS route carries the "trap"
+# flag. dummy1 is removed at the end.
+fib_ipv4_tos_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 0 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 2 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 2 metric 1024" false
+ check_err $? "Highest TOS route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 0 metric 1024" true
+ check_err $? "Lowest TOS route still in hardware when should not"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 tos 1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 tos 1 metric 1024" true
+ check_err $? "Middle TOS route in hardware when should not"
+
+ log_test "IPv4 routes with TOS"
+
+ ip -n $ns link del dev dummy1
+}
+
+# In namespace $1, install routes for one prefix with metrics 1024, 1022 and
+# 1023 (in that order) and check that only the lowest-metric route carries
+# the "trap" flag. dummy1 is removed at the end.
+fib_ipv4_metric_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1022
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1023
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv4 routes with metric"
+
+ ip -n $ns link del dev dummy1
+}
+
+# In namespace $1, replace a route (dummy1 -> dummy2) and check that the
+# replacement carries the "trap" flag; then add and replace a higher-metric
+# route and check that the flag stays with the lower-metric one. Both dummy
+# devices are removed at the end.
+fib_ipv4_replace_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1024
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with a higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric 1025
+ ip -n $ns route replace 192.0.2.0/24 dev dummy2 metric 1025
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy2 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv4 route replace"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+# In namespace $1, install three routes for one prefix with metrics
+# 1024..1026 and check, while deleting some of them, that the lowest
+# remaining metric is the one carrying the "trap" flag. dummy1 is removed at
+# the end.
+fib_ipv4_delete_test()
+{
+	local ns=$1
+	local metric
+
+	RET=0
+
+	ip -n $ns link add name dummy1 type dummy
+	ip -n $ns link set dev dummy1 up
+
+	# Insert multiple routes with the same prefix and length and varying
+	# metrics. Make sure that throughout delete operations the lowest
+	# metric route is the one in hardware.
+	for ((metric = 1024; metric <= 1026; metric++)); do
+		ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric
+	done
+
+	fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware when should"
+
+	# Removing the lowest metric promotes the next one.
+	ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1024
+	fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false
+	check_err $? "Lowest metric route not in hardware when should"
+
+	# Removing the highest metric leaves 1025 as the only route.
+	ip -n $ns route del 192.0.2.0/24 dev dummy1 metric 1026
+	fib4_trap_check $ns "192.0.2.0/24 dev dummy1 metric 1025" false
+	check_err $? "Sole route not in hardware when should"
+
+	log_test "IPv4 route delete"
+
+	ip -n $ns link del dev dummy1
+}
+
+# In namespace $1, install a /24 and a /25 route for the same address and
+# check that both carry the "trap" flag. dummy1 is removed at the end.
+fib_ipv4_plen_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # Add two routes with the same key and different prefix length and
+ # make sure both are in hardware. It can be verified that both are
+ # sharing the same leaf by checking the /proc/net/fib_trie
+ ip -n $ns route add 192.0.2.0/24 dev dummy1
+ ip -n $ns route add 192.0.2.0/25 dev dummy1
+
+ fib4_trap_check $ns "192.0.2.0/24 dev dummy1" false
+ check_err $? "/24 not in hardware when should"
+
+ fib4_trap_check $ns "192.0.2.0/25 dev dummy1" false
+ check_err $? "/25 not in hardware when should"
+
+ log_test "IPv4 routes with different prefix length"
+
+ ip -n $ns link del dev dummy1
+}
+
+# Install two routes that differ only in metric, reload the devlink device
+# and check that only the lower-metric route carries the "trap" flag.
+#
+# Arguments:
+#   $1 - network namespace
+#   $2 - devlink device to reload
+fib_ipv4_replay_metric_test()
+{
+	local ns=$1
+	local devlink_dev=$2
+	local low="192.0.2.0/24 dev dummy1 metric 1024"
+	local high="192.0.2.0/24 dev dummy1 metric 1025"
+
+	RET=0
+
+	ip -n $ns link add name dummy1 type dummy
+	ip -n $ns link set dev dummy1 up
+
+	# Unquoted on purpose: each specification expands to separate words.
+	ip -n $ns route add $low
+	ip -n $ns route add $high
+
+	devlink -N $ns dev reload $devlink_dev
+
+	fib4_trap_check $ns "$low" false
+	check_err $? "Lowest metric route not in hardware when should"
+
+	fib4_trap_check $ns "$high" true
+	check_err $? "Highest metric route in hardware when should not"
+
+	log_test "IPv4 routes replay - metric"
+
+	ip -n $ns link del dev dummy1
+}
+
+# Install two routes that differ only in TOS, reload the devlink device and
+# check that only the higher-TOS route carries the "trap" flag.
+#
+# Arguments:
+#   $1 - network namespace
+#   $2 - devlink device to reload
+fib_ipv4_replay_tos_test()
+{
+	local ns=$1
+	local devlink_dev=$2
+	local tos0="192.0.2.0/24 dev dummy1 tos 0"
+	local tos1="192.0.2.0/24 dev dummy1 tos 1"
+
+	RET=0
+
+	ip -n $ns link add name dummy1 type dummy
+	ip -n $ns link set dev dummy1 up
+
+	# Unquoted on purpose: each specification expands to separate words.
+	ip -n $ns route add $tos0
+	ip -n $ns route add $tos1
+
+	devlink -N $ns dev reload $devlink_dev
+
+	fib4_trap_check $ns "$tos1" false
+	check_err $? "Highest TOS route not in hardware when should"
+
+	fib4_trap_check $ns "$tos0" true
+	check_err $? "Lowest TOS route in hardware when should not"
+
+	log_test "IPv4 routes replay - TOS"
+
+	ip -n $ns link del dev dummy1
+}
+
+# Install a /24 and a /25 route for the same address, reload the devlink
+# device and check that both still carry the "trap" flag.
+#
+# Arguments:
+#   $1 - network namespace
+#   $2 - devlink device to reload
+fib_ipv4_replay_plen_test()
+{
+	local ns=$1
+	local devlink_dev=$2
+	local plen24="192.0.2.0/24 dev dummy1"
+	local plen25="192.0.2.0/25 dev dummy1"
+
+	RET=0
+
+	ip -n $ns link add name dummy1 type dummy
+	ip -n $ns link set dev dummy1 up
+
+	# Unquoted on purpose: each specification expands to separate words.
+	ip -n $ns route add $plen24
+	ip -n $ns route add $plen25
+
+	devlink -N $ns dev reload $devlink_dev
+
+	fib4_trap_check $ns "$plen24" false
+	check_err $? "/24 not in hardware when should"
+
+	fib4_trap_check $ns "$plen25" false
+	check_err $? "/25 not in hardware when should"
+
+	log_test "IPv4 routes replay - prefix length"
+
+	ip -n $ns link del dev dummy1
+}
+
+fib_ipv4_flush_test()
+{
+ local ns=$1; shift
+ local metric
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ # Exercise the routes flushing code paths by inserting various
+ # prefix routes on a netdev and then deleting it.
+ for metric in $(seq 1 20); do
+ ip -n $ns route add 192.0.2.0/24 dev dummy1 metric $metric
+ done
+
+ ip -n $ns link del dev dummy1
+
+ log_test "IPv4 routes flushing"
+}
+
+# In namespace $1, add a route over dummy1, append one for the same prefix
+# and metric over dummy2, and check that the "trap" flag stays with the
+# first route only. Both dummy devices are removed at the end.
+fib_ipv6_add_test()
+{
+	local ns=$1; shift
+	# Keep the loop counter local, as the sibling tests do.
+	local i
+
+	RET=0
+
+	for i in $(seq 1 2); do
+		ip -n $ns link add name dummy$i type dummy
+		ip -n $ns link set dev dummy$i up
+	done
+
+	ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+	fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware when should"
+
+	ip -n $ns route append 2001:db8:1::/64 dev dummy2 metric 1024
+	fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" true
+	check_err $? "Route in hardware when should not"
+
+	fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware after appending route"
+
+	log_test "IPv6 single route add"
+
+	for i in $(seq 1 2); do
+		ip -n $ns link del dev dummy$i
+	done
+}
+
+# In namespace $1, install routes for one prefix with metrics 1024, 1022 and
+# 1023 (in that order) and check that only the lowest-metric route carries
+# the "trap" flag. dummy1 is removed at the end.
+fib_ipv6_metric_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ ip -n $ns link add name dummy1 type dummy
+ ip -n $ns link set dev dummy1 up
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1022
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1023
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv6 routes with metric"
+
+ ip -n $ns link del dev dummy1
+}
+
+fib_ipv6_append_single_test()
+{
+ local ns=$1; shift
+
+ # When an IPv6 multipath route is added without the 'nexthop' keyword,
+ # different code paths are taken compared to when the keyword is used.
+ # This test tries to verify the former.
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1024
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1024
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after appending"
+
+ ip -n $ns route add 2001:db8:10::/64 via 2001:db8:1::2 metric 1025
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not"
+
+ ip -n $ns route append 2001:db8:10::/64 via 2001:db8:2::2 metric 1025
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not after appending"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ log_test "IPv6 append single route without 'nexthop' keyword"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+# In namespace $1, replace a route (dummy1 -> dummy2) and check that the
+# replacement carries the "trap" flag; then add and replace a higher-metric
+# route and check that the flag stays with the lower-metric one. Both dummy
+# devices are removed at the end.
+fib_ipv6_replace_single_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ done
+
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy1 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1024
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with a higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 2001:db8:1::/64 dev dummy1 metric 1025
+ ip -n $ns route replace 2001:db8:1::/64 dev dummy2 metric 1025
+
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib6_trap_check $ns "2001:db8:1::/64 dev dummy2 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv6 single route replace"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_metric_multipath_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 2); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1022 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1022" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1023 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" true
+ check_err $? "Highest metric route still in hardware when should not"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1023" true
+ check_err $? "Middle metric route in hardware when should not"
+
+ log_test "IPv6 multipath routes with metric"
+
+ for i in $(seq 1 2); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+fib_ipv6_append_multipath_test()
+{
+ local ns=$1; shift
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route append 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:2::2 dev dummy2 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after appending"
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not"
+
+ ip -n $ns route append 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:2::2 dev dummy2 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Route in hardware when should not after appending"
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+
+ log_test "IPv6 append multipath route with 'nexthop' keyword"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+# In namespace $1, replace a two-nexthop route (dummy2 -> dummy3 as second
+# nexthop) and check that the replacement carries the "trap" flag; then add
+# and replace a higher-metric multipath route and check that the flag stays
+# with the lower-metric one. The three dummy devices are removed at the end.
+fib_ipv6_replace_multipath_test()
+{
+ local ns=$1; shift
+ local i
+
+ RET=0
+
+ for i in $(seq 1 3); do
+ ip -n $ns link add name dummy$i type dummy
+ ip -n $ns link set dev dummy$i up
+ ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware when should"
+
+ ip -n $ns route replace 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:3::2 dev dummy3
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Replacement route not in hardware when should"
+
+ # Add a route with a higher metric and make sure that replacing it
+ # does not affect the lower metric one.
+ ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $ns route replace 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:3::2 dev dummy3
+
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Lowest metric route not in hardware when should"
+ fib6_trap_check $ns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Highest metric route in hardware when should not"
+
+ log_test "IPv6 multipath route replace"
+
+ for i in $(seq 1 3); do
+ ip -n $ns link del dev dummy$i
+ done
+}
+
+# In namespace $1, add a plain route over dummy1, append a route for the
+# same prefix and metric using the 'nexthop' keyword over dummy2, and check
+# that the "trap" flag stays with the dummy1 route only. Both dummy devices
+# are removed at the end.
+fib_ipv6_append_multipath_to_single_test()
+{
+	local ns=$1; shift
+	# Keep the loop counter local, as the sibling tests do.
+	local i
+
+	# Test that when the first route in the leaf is not a multipath route
+	# and we try to append a multipath route with the same metric to it, it
+	# is not notified.
+	RET=0
+
+	for i in $(seq 1 2); do
+		ip -n $ns link add name dummy$i type dummy
+		ip -n $ns link set dev dummy$i up
+		ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+	done
+
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+	fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware when should"
+
+	ip -n $ns route append 2001:db8:10::/64 metric 1024 \
+		nexthop via 2001:db8:2::2 dev dummy2
+	fib6_trap_check $ns "2001:db8:10::/64 dev dummy2 metric 1024" true
+	check_err $? "Route in hardware when should not"
+
+	fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware after append"
+
+	log_test "IPv6 append multipath route to non-multipath route"
+
+	for i in $(seq 1 2); do
+		ip -n $ns link del dev dummy$i
+	done
+}
+
+# Run five deletion sub-tests in namespace $1, each logged separately with
+# its own RET: delete the only route; delete a higher-metric route; delete
+# the lowest-metric route when the next one is a single route; the same when
+# the next one is a multipath route; delete one nexthop of a multipath
+# route. Both dummy devices are removed at the end.
+fib_ipv6_delete_single_test()
+{
+	local ns=$1; shift
+	# Keep the loop counter local, as the sibling tests do.
+	local i
+
+	# Test various deletion scenarios, where only a single route is
+	# deleted from the FIB node.
+	for i in $(seq 1 2); do
+		ip -n $ns link add name dummy$i type dummy
+		ip -n $ns link set dev dummy$i up
+		ip -n $ns address add 2001:db8:$i::1/64 dev dummy$i
+	done
+
+	# Test deletion of a single route when it is the only route in the FIB
+	# node.
+	RET=0
+
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+	log_test "IPv6 delete sole single route"
+
+	# Test that deletion of last route does not affect the first one.
+	RET=0
+
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+	fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1024" false
+	check_err $? "Route not in hardware after deleting higher metric route"
+
+	log_test "IPv6 delete single route not in hardware"
+
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+	# Test that first route is replaced by next single route in the FIB
+	# node.
+	RET=0
+
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1025
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+	fib6_trap_check $ns "2001:db8:10::/64 dev dummy1 metric 1025" false
+	check_err $? "Route not in hardware after deleting lowest metric route"
+
+	log_test "IPv6 delete single route - replaced by single"
+
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+	# Test that first route is replaced by next multipath route in the FIB
+	# node.
+	RET=0
+
+	ip -n $ns route add 2001:db8:10::/64 dev dummy1 metric 1024
+	ip -n $ns route add 2001:db8:10::/64 metric 1025 \
+		nexthop via 2001:db8:1::2 dev dummy1 \
+		nexthop via 2001:db8:2::2 dev dummy2
+	ip -n $ns route del 2001:db8:10::/64 dev dummy1 metric 1024
+
+	fib6_trap_check $ns "2001:db8:10::/64 metric 1025" false
+	check_err $? "Route not in hardware after deleting lowest metric route"
+
+	log_test "IPv6 delete single route - replaced by multipath"
+
+	ip -n $ns route del 2001:db8:10::/64 metric 1025
+
+	# Test deletion of a single nexthop from a multipath route.
+	# Start from a clean result so that a failure of the previous sub-test
+	# is not reported against this one as well.
+	RET=0
+
+	ip -n $ns route add 2001:db8:10::/64 metric 1024 \
+		nexthop via 2001:db8:1::2 dev dummy1 \
+		nexthop via 2001:db8:2::2 dev dummy2
+	ip -n $ns route del 2001:db8:10::/64 metric 1024 \
+		nexthop via 2001:db8:1::2 dev dummy1
+
+	fib6_trap_check $ns "2001:db8:10::/64 metric 1024" false
+	check_err $? "Route not in hardware after deleting a single nexthop"
+
+	log_test "IPv6 delete single nexthop"
+
+	ip -n $ns route del 2001:db8:10::/64 metric 1024
+
+	for i in $(seq 1 2); do
+		ip -n $ns link del dev dummy$i
+	done
+}
+
+fib_ipv6_delete_multipath_test()
+{
+ # Arguments: $1 - network namespace in which the test runs
+ local netns=$1; shift
+
+ # Deletion scenarios in which an entire multipath route is removed
+ # from the FIB node. The nexthop gateways used below fall inside the
+ # /64 prefixes assigned to the two dummy devices here.
+ for i in 1 2; do
+ ip -n $netns link add name dummy$i type dummy
+ ip -n $netns link set dev dummy$i up
+ ip -n $netns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ # Scenario: the multipath route is the only route in the FIB node.
+ RET=0
+
+ ip -n $netns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route del 2001:db8:10::/64 metric 1024
+
+ log_test "IPv6 delete sole multipath route"
+
+ # Scenario: deleting the last (metric 1025) route must leave the first
+ # (metric 1024) route untouched.
+ RET=0
+
+ ip -n $netns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route del 2001:db8:10::/64 metric 1025
+
+ fib6_trap_check $netns "2001:db8:10::/64 metric 1024" false
+ check_err $? "Route not in hardware after deleting higher metric route"
+
+ log_test "IPv6 delete multipath route not in hardware"
+
+ ip -n $netns route del 2001:db8:10::/64 metric 1024
+
+ # Scenario: the first (multipath) route is deleted and the next single
+ # route in the FIB node takes its place.
+ RET=0
+
+ ip -n $netns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route add 2001:db8:10::/64 dev dummy1 metric 1025
+ ip -n $netns route del 2001:db8:10::/64 metric 1024
+
+ fib6_trap_check $netns "2001:db8:10::/64 dev dummy1 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete multipath route - replaced by single"
+
+ ip -n $netns route del 2001:db8:10::/64 dev dummy1 metric 1025
+
+ # Scenario: the first (multipath) route is deleted and the next
+ # multipath route in the FIB node takes its place.
+ RET=0
+
+ ip -n $netns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route del 2001:db8:10::/64 metric 1024
+
+ fib6_trap_check $netns "2001:db8:10::/64 metric 1025" false
+ check_err $? "Route not in hardware after deleting lowest metric route"
+
+ log_test "IPv6 delete multipath route - replaced by multipath"
+
+ ip -n $netns route del 2001:db8:10::/64 metric 1025
+
+ # Tear down the dummy devices.
+ for i in 1 2; do
+ ip -n $netns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replay_single_test()
+{
+ # Arguments: $1 - network namespace in which the test runs
+ #            $2 - devlink device to reload
+ local netns=$1; shift
+ local dl_dev=$1; shift
+
+ RET=0
+
+ for i in 1 2; do
+ ip -n $netns link add name dummy$i type dummy
+ ip -n $netns link set dev dummy$i up
+ done
+
+ # Two single-nexthop routes for the same prefix, installed before the
+ # reload so that they have to be replayed afterwards.
+ ip -n $netns route add 2001:db8:1::/64 dev dummy1
+ ip -n $netns route append 2001:db8:1::/64 dev dummy2
+
+ devlink -N $netns dev reload $dl_dev
+
+ fib6_trap_check $netns "2001:db8:1::/64 dev dummy1" false
+ check_err $? "First route not in hardware when should"
+
+ fib6_trap_check $netns "2001:db8:1::/64 dev dummy2" true
+ check_err $? "Second route in hardware when should not"
+
+ log_test "IPv6 routes replay - single route"
+
+ for i in 1 2; do
+ ip -n $netns link del dev dummy$i
+ done
+}
+
+fib_ipv6_replay_multipath_test()
+{
+ # Arguments: $1 - network namespace in which the test runs
+ #            $2 - devlink device to reload
+ local netns=$1; shift
+ local dl_dev=$1; shift
+
+ RET=0
+
+ for i in 1 2; do
+ ip -n $netns link add name dummy$i type dummy
+ ip -n $netns link set dev dummy$i up
+ ip -n $netns address add 2001:db8:$i::1/64 dev dummy$i
+ done
+
+ # Two multipath routes for the same prefix with different metrics,
+ # installed before the reload so that they have to be replayed.
+ ip -n $netns route add 2001:db8:10::/64 metric 1024 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+ ip -n $netns route add 2001:db8:10::/64 metric 1025 \
+ nexthop via 2001:db8:1::2 dev dummy1 \
+ nexthop via 2001:db8:2::2 dev dummy2
+
+ devlink -N $netns dev reload $dl_dev
+
+ fib6_trap_check $netns "2001:db8:10::/64 metric 1024" false
+ check_err $? "First route not in hardware when should"
+
+ fib6_trap_check $netns "2001:db8:10::/64 metric 1025" true
+ check_err $? "Second route in hardware when should not"
+
+ log_test "IPv6 routes replay - multipath route"
+
+ for i in 1 2; do
+ ip -n $netns link del dev dummy$i
+ done
+}
diff --git a/tools/testing/selftests/net/forwarding/forwarding.config.sample b/tools/testing/selftests/net/forwarding/forwarding.config.sample
new file mode 100644
index 000000000..e51def39f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/forwarding.config.sample
@@ -0,0 +1,45 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##############################################################################
+# Topology description. p1 looped back to p2, p3 to p4 and so on.
+# NETIFS maps the logical port names used by the tests (p1..p10) to the
+# network interface names to use for them.
+declare -A NETIFS
+
+NETIFS[p1]=veth0
+NETIFS[p2]=veth1
+NETIFS[p3]=veth2
+NETIFS[p4]=veth3
+NETIFS[p5]=veth4
+NETIFS[p6]=veth5
+NETIFS[p7]=veth6
+NETIFS[p8]=veth7
+NETIFS[p9]=veth8
+NETIFS[p10]=veth9
+
+# Port that does not have a cable connected.
+NETIF_NO_CABLE=eth8
+
+##############################################################################
+# Defines
+
+# IPv4 ping utility name
+PING=ping
+# IPv6 ping utility name. Some distributions use 'ping' for IPv6.
+PING6=ping6
+# Packet generator. Some distributions use 'mz'.
+MZ=mausezahn
+# Time (presumably in seconds) to wait after interfaces participating in the
+# test are all UP
+WAIT_TIME=5
+# Whether to pause on failure or not (yes/no).
+PAUSE_ON_FAIL=no
+# Whether to pause on cleanup or not (yes/no).
+PAUSE_ON_CLEANUP=no
+# Type of network interface to create
+NETIF_TYPE=veth
+# Whether to create virtual interfaces (veth) or not (yes/no)
+NETIF_CREATE=yes
+# Timeout (in seconds) before ping exits regardless of how many packets have
+# been sent or received
+PING_TIMEOUT=5
+# IPv6 traceroute utility name.
+TROUTE6=traceroute6
diff --git a/tools/testing/selftests/net/forwarding/gre_inner_v4_multipath.sh b/tools/testing/selftests/net/forwarding/gre_inner_v4_multipath.sh
new file mode 100755
index 000000000..e4009f658
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/gre_inner_v4_multipath.sh
@@ -0,0 +1,305 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test traffic distribution when there are multiple routes between an IPv4
+# GRE tunnel. The tunnel carries IPv4 traffic between multiple hosts.
+# Multiple routes are in the underlay network. With the default multipath
+# policy, SW2 will only look at the outer IP addresses, hence only a single
+# route would be used.
+#
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.3.{2-62}/24 | |
+# +-------------------|-----+
+# |
+# +-------------------|------------------------+
+# | SW1 | |
+# | $ol1 + |
+# | 192.0.3.1/24 |
+# | |
+# | + g1 (gre) |
+# | loc=192.0.2.65 |
+# | rem=192.0.2.66 --. |
+# | tos=inherit | |
+# | v |
+# | + $ul1 |
+# | | 192.0.2.129/28 |
+# +---------------------|----------------------+
+# |
+# +---------------------|----------------------+
+# | SW2 | |
+# | $ul21 + |
+# | 192.0.2.130/28 |
+# | | |
+# | ________________|_____ |
+# | / \ |
+# | | | |
+# | + $ul22.111 (vlan) + $ul22.222 (vlan) |
+# | | 192.0.2.145/28 | 192.0.2.161/28 |
+# | | | |
+# +--|----------------------|------------------+
+# | |
+# +--|----------------------|------------------+
+# | | | |
+# | + $ul32.111 (vlan) + $ul32.222 (vlan) |
+# | | 192.0.2.146/28 | 192.0.2.162/28 |
+# | | | |
+# | \______________________/ |
+# | | |
+# | | |
+# | $ul31 + |
+# | 192.0.2.177/28 | SW3 |
+# +---------------------|----------------------+
+# |
+# +---------------------|----------------------+
+# | + $ul4 |
+# | ^ 192.0.2.178/28 |
+# | | |
+# | + g2 (gre) | |
+# | loc=192.0.2.66 | |
+# | rem=192.0.2.65 --' |
+# | tos=inherit |
+# | |
+# | $ol4 + |
+# | 192.0.4.1/24 | SW4 |
+# +--------------------|-----------------------+
+# |
+# +--------------------|---------+
+# | | |
+# | $h2 + |
+# | 192.0.4.{2-62}/24 H2 |
+# +------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ multipath_ipv4
+"
+
+NUM_NETIFS=10
+source lib.sh
+
+# Set up host H1: address $h1 with 192.0.3.2/24 and route H2's subnet
+# (192.0.4.0/24) via SW1 (192.0.3.1) in VRF v$h1.
+h1_create()
+{
+ simple_if_init $h1 192.0.3.2/24
+ ip route add vrf v$h1 192.0.4.0/24 via 192.0.3.1
+}
+
+# Undo h1_create in reverse order: remove the route, then release $h1.
+h1_destroy()
+{
+ ip route del vrf v$h1 192.0.4.0/24 via 192.0.3.1
+ simple_if_fini $h1 192.0.3.2/24
+}
+
+# Set up SW1: overlay port $ol1 (192.0.3.1/24) and underlay port $ul1
+# (192.0.2.129/28) share VRF v$ol1. GRE tunnel g1 runs from 192.0.2.65 to
+# 192.0.2.66 with inherited TOS; the remote endpoint is reached via SW2
+# (192.0.2.130) and H2's subnet is routed into the tunnel.
+sw1_create()
+{
+ simple_if_init $ol1 192.0.3.1/24
+ __simple_if_init $ul1 v$ol1 192.0.2.129/28
+
+ tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1
+ __simple_if_init g1 v$ol1 192.0.2.65/32
+ ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130
+
+ ip route add vrf v$ol1 192.0.4.0/24 nexthop dev g1
+}
+
+# Undo sw1_create in reverse order: overlay route, tunnel endpoint route,
+# tunnel g1, then the underlay and overlay ports.
+sw1_destroy()
+{
+ ip route del vrf v$ol1 192.0.4.0/24
+
+ ip route del vrf v$ol1 192.0.2.66/32
+ __simple_if_fini g1 192.0.2.65/32
+ tunnel_destroy g1
+
+ __simple_if_fini $ul1 192.0.2.129/28
+ simple_if_fini $ol1 192.0.3.1/24
+}
+
+# Set up SW2: $ul21 (192.0.2.130/28) faces SW1, $ul22 carries VLANs 111 and
+# 222 towards SW3, all in VRF v$ul21. The route to the remote tunnel endpoint
+# 192.0.2.66 is a multipath route with one nexthop per VLAN.
+sw2_create()
+{
+ simple_if_init $ul21 192.0.2.130/28
+ __simple_if_init $ul22 v$ul21
+ vlan_create $ul22 111 v$ul21 192.0.2.145/28
+ vlan_create $ul22 222 v$ul21 192.0.2.161/28
+
+ ip route add vrf v$ul21 192.0.2.65/32 via 192.0.2.129
+ ip route add vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 \
+ nexthop via 192.0.2.162
+}
+
+# Undo sw2_create in reverse order: routes, VLANs, then the two ports.
+sw2_destroy()
+{
+ ip route del vrf v$ul21 192.0.2.66/32
+ ip route del vrf v$ul21 192.0.2.65/32
+
+ vlan_destroy $ul22 222
+ vlan_destroy $ul22 111
+ __simple_if_fini $ul22
+ simple_if_fini $ul21 192.0.2.130/28
+}
+
+# Set up SW3: $ul31 (192.0.2.177/28) faces SW4, $ul32 carries VLANs 111 and
+# 222 towards SW2, all in VRF v$ul31. Two flower filters on $ul32 ingress
+# (pref 111 and pref 222, one per VLAN ID) pass the traffic; multipath4_test
+# reads their statistics to see how traffic was split.
+sw3_create()
+{
+ simple_if_init $ul31 192.0.2.177/28
+ __simple_if_init $ul32 v$ul31
+ vlan_create $ul32 111 v$ul31 192.0.2.146/28
+ vlan_create $ul32 222 v$ul31 192.0.2.162/28
+
+ ip route add vrf v$ul31 192.0.2.66/32 via 192.0.2.178
+ ip route add vrf v$ul31 192.0.2.65/32 \
+ nexthop via 192.0.2.145 \
+ nexthop via 192.0.2.161
+
+ tc qdisc add dev $ul32 clsact
+ tc filter add dev $ul32 ingress pref 111 prot 802.1Q \
+ flower vlan_id 111 action pass
+ tc filter add dev $ul32 ingress pref 222 prot 802.1Q \
+ flower vlan_id 222 action pass
+}
+
+# Undo sw3_create: delete the clsact qdisc on $ul32, then the routes, the
+# VLANs and the two ports.
+sw3_destroy()
+{
+ tc qdisc del dev $ul32 clsact
+
+ ip route del vrf v$ul31 192.0.2.65/32
+ ip route del vrf v$ul31 192.0.2.66/32
+
+ vlan_destroy $ul32 222
+ vlan_destroy $ul32 111
+ __simple_if_fini $ul32
+ simple_if_fini $ul31 192.0.2.177/28
+}
+
+# Set up SW4, the mirror image of SW1: overlay port $ol4 (192.0.4.1/24) and
+# underlay port $ul4 (192.0.2.178/28) share VRF v$ol4. GRE tunnel g2 runs from
+# 192.0.2.66 to 192.0.2.65; the remote endpoint is reached via SW3
+# (192.0.2.177) and H1's subnet is routed into the tunnel.
+sw4_create()
+{
+ simple_if_init $ol4 192.0.4.1/24
+ __simple_if_init $ul4 v$ol4 192.0.2.178/28
+
+ tunnel_create g2 gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol4
+ __simple_if_init g2 v$ol4 192.0.2.66/32
+ ip route add vrf v$ol4 192.0.2.65/32 via 192.0.2.177
+
+ ip route add vrf v$ol4 192.0.3.0/24 nexthop dev g2
+}
+
+# Undo sw4_create in reverse order: overlay route, tunnel endpoint route,
+# tunnel g2, then the underlay and overlay ports.
+sw4_destroy()
+{
+ ip route del vrf v$ol4 192.0.3.0/24
+
+ ip route del vrf v$ol4 192.0.2.65/32
+ __simple_if_fini g2 192.0.2.66/32
+ tunnel_destroy g2
+
+ __simple_if_fini $ul4 192.0.2.178/28
+ simple_if_fini $ol4 192.0.4.1/24
+}
+
+# Set up host H2: address $h2 with 192.0.4.2/24 and route H1's subnet
+# (192.0.3.0/24) via SW4 (192.0.4.1) in VRF v$h2.
+h2_create()
+{
+ simple_if_init $h2 192.0.4.2/24
+ ip route add vrf v$h2 192.0.3.0/24 via 192.0.4.1
+}
+
+# Undo h2_create in reverse order: remove the route, then release $h2.
+h2_destroy()
+{
+ ip route del vrf v$h2 192.0.3.0/24 via 192.0.4.1
+ simple_if_fini $h2 192.0.4.2/24
+}
+
+# Bind the ten NETIFS ports to their roles in the topology (host, overlay and
+# underlay ports), build every node from H1 to H2 and enable forwarding.
+# The role variables are deliberately global: the *_create, *_destroy and test
+# functions read them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+
+ ol1=${NETIFS[p2]}
+ ul1=${NETIFS[p3]}
+
+ ul21=${NETIFS[p4]}
+ ul22=${NETIFS[p5]}
+
+ ul32=${NETIFS[p6]}
+ ul31=${NETIFS[p7]}
+
+ ul4=${NETIFS[p8]}
+ ol4=${NETIFS[p9]}
+
+ h2=${NETIFS[p10]}
+
+ vrf_prepare
+ h1_create
+ sw1_create
+ sw2_create
+ sw3_create
+ sw4_create
+ h2_create
+
+ forwarding_enable
+}
+
+# Tear the topology down in the reverse order of setup_prepare. Installed as
+# the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ sw4_destroy
+ sw3_destroy
+ sw2_destroy
+ sw1_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+multipath4_test()
+{
+ # Measure how traffic from H1 to H2 is split between the two VLAN
+ # paths on SW2 for a given pair of nexthop weights.
+ #
+ # Arguments: $1 - description handed to multipath_eval
+ #            $2 - weight of the nexthop via 192.0.2.146 (VLAN 111)
+ #            $3 - weight of the nexthop via 192.0.2.162 (VLAN 222)
+ local desc=$1; shift
+ local w1=$1; shift
+ local w2=$1; shift
+
+ # Hash policy 2 is in effect only for the duration of the test (see
+ # the kernel ip-sysctl documentation for the meaning of the value).
+ sysctl_set net.ipv4.fib_multipath_hash_policy 2
+ ip route replace vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 weight $w1 \
+ nexthop via 192.0.2.162 weight $w2
+
+ # Per-VLAN filter counters on SW3 before the traffic is sent.
+ local base_111=$(tc_rule_stats_get $ul32 111 ingress)
+ local base_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ ip vrf exec v$h1 \
+ $MZ $h1 -q -p 64 -A "192.0.3.2-192.0.3.62" -B "192.0.4.2-192.0.4.62" \
+ -d 1msec -c 50 -t udp "sp=1024,dp=1024"
+ sleep 1
+
+ # Counters after the traffic; the differences are the per-path packet
+ # counts that multipath_eval compares against the weights.
+ local now_111=$(tc_rule_stats_get $ul32 111 ingress)
+ local now_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ multipath_eval "$desc" $w1 $w2 \
+ $((now_111 - base_111)) $((now_222 - base_222))
+
+ # Put back the unweighted route and the previous hash policy.
+ ip route replace vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 \
+ nexthop via 192.0.2.162
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+# Connectivity check: ping H2 (192.0.4.2) from $h1 across the tunnel.
+ping_ipv4()
+{
+ ping_test $h1 192.0.4.2
+}
+
+# Run multipath4_test for equal weights and for two weighted splits
+# (2:1 and 11:45).
+multipath_ipv4()
+{
+ log_info "Running IPv4 over GRE over IPv4 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/gre_inner_v6_multipath.sh b/tools/testing/selftests/net/forwarding/gre_inner_v6_multipath.sh
new file mode 100755
index 000000000..e449475c4
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/gre_inner_v6_multipath.sh
@@ -0,0 +1,306 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test traffic distribution when there are multiple routes between an IPv4
+# GRE tunnel. The tunnel carries IPv6 traffic between multiple hosts.
+# Multiple routes are in the underlay network. With the default multipath
+# policy, SW2 will only look at the outer IP addresses, hence only a single
+# route would be used.
+#
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 2001:db8:1::2/64 | |
+# +-------------------|-----+
+# |
+# +-------------------|------------------------+
+# | SW1 | |
+# | $ol1 + |
+# | 2001:db8:1::1/64 |
+# | |
+# | + g1 (gre) |
+# | loc=192.0.2.65 |
+# | rem=192.0.2.66 --. |
+# | tos=inherit | |
+# | v |
+# | + $ul1 |
+# | | 192.0.2.129/28 |
+# +---------------------|----------------------+
+# |
+# +---------------------|----------------------+
+# | SW2 | |
+# | $ul21 + |
+# | 192.0.2.130/28 |
+# | | |
+# | ________________|_____ |
+# | / \ |
+# | | | |
+# | + $ul22.111 (vlan) + $ul22.222 (vlan) |
+# | | 192.0.2.145/28 | 192.0.2.161/28 |
+# | | | |
+# +--|----------------------|------------------+
+# | |
+# +--|----------------------|------------------+
+# | | | |
+# | + $ul32.111 (vlan) + $ul32.222 (vlan) |
+# | | 192.0.2.146/28 | 192.0.2.162/28 |
+# | | | |
+# | \______________________/ |
+# | | |
+# | | |
+# | $ul31 + |
+# | 192.0.2.177/28 | SW3 |
+# +---------------------|----------------------+
+# |
+# +---------------------|----------------------+
+# | + $ul4 |
+# | ^ 192.0.2.178/28 |
+# | | |
+# | + g2 (gre) | |
+# | loc=192.0.2.66 | |
+# | rem=192.0.2.65 --' |
+# | tos=inherit |
+# | |
+# | $ol4 + |
+# | 2001:db8:2::1/64 | SW4 |
+# +--------------------|-----------------------+
+# |
+# +--------------------|---------+
+# | | |
+# | $h2 + |
+# | 2001:db8:2::2/64 H2 |
+# +------------------------------+
+
+ALL_TESTS="
+ ping_ipv6
+ multipath_ipv6
+"
+
+NUM_NETIFS=10
+source lib.sh
+
+# Set up host H1: address $h1 with 2001:db8:1::2/64 and route H2's subnet
+# (2001:db8:2::/64) via SW1 (2001:db8:1::1) in VRF v$h1.
+h1_create()
+{
+ simple_if_init $h1 2001:db8:1::2/64
+ ip -6 route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::1
+}
+
+# Undo h1_create in reverse order: remove the route, then release $h1.
+h1_destroy()
+{
+ ip -6 route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::1
+ simple_if_fini $h1 2001:db8:1::2/64
+}
+
+# Set up SW1: IPv6 overlay port $ol1 (2001:db8:1::1/64) and IPv4 underlay
+# port $ul1 (192.0.2.129/28) share VRF v$ol1. GRE tunnel g1 runs from
+# 192.0.2.65 to 192.0.2.66 with inherited TOS; the remote endpoint is reached
+# via SW2 (192.0.2.130) and H2's IPv6 subnet is routed into the tunnel.
+sw1_create()
+{
+ simple_if_init $ol1 2001:db8:1::1/64
+ __simple_if_init $ul1 v$ol1 192.0.2.129/28
+
+ tunnel_create g1 gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1
+ __simple_if_init g1 v$ol1 192.0.2.65/32
+ ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130
+
+ ip -6 route add vrf v$ol1 2001:db8:2::/64 dev g1
+}
+
+# Undo sw1_create in reverse order: overlay route, tunnel endpoint route,
+# tunnel g1, then the underlay and overlay ports.
+sw1_destroy()
+{
+ ip -6 route del vrf v$ol1 2001:db8:2::/64
+
+ ip route del vrf v$ol1 192.0.2.66/32
+ __simple_if_fini g1 192.0.2.65/32
+ tunnel_destroy g1
+
+ __simple_if_fini $ul1 192.0.2.129/28
+ simple_if_fini $ol1 2001:db8:1::1/64
+}
+
+# Set up SW2: $ul21 (192.0.2.130/28) faces SW1, $ul22 carries VLANs 111 and
+# 222 towards SW3, all in VRF v$ul21. The route to the remote tunnel endpoint
+# 192.0.2.66 is a multipath route with one nexthop per VLAN.
+sw2_create()
+{
+ simple_if_init $ul21 192.0.2.130/28
+ __simple_if_init $ul22 v$ul21
+ vlan_create $ul22 111 v$ul21 192.0.2.145/28
+ vlan_create $ul22 222 v$ul21 192.0.2.161/28
+
+ ip route add vrf v$ul21 192.0.2.65/32 via 192.0.2.129
+ ip route add vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 \
+ nexthop via 192.0.2.162
+}
+
+# Undo sw2_create in reverse order: routes, VLANs, then the two ports.
+sw2_destroy()
+{
+ ip route del vrf v$ul21 192.0.2.66/32
+ ip route del vrf v$ul21 192.0.2.65/32
+
+ vlan_destroy $ul22 222
+ vlan_destroy $ul22 111
+ __simple_if_fini $ul22
+ simple_if_fini $ul21 192.0.2.130/28
+}
+
+# Set up SW3: $ul31 (192.0.2.177/28) faces SW4, $ul32 carries VLANs 111 and
+# 222 towards SW2, all in VRF v$ul31. Two flower filters on $ul32 ingress
+# (pref 111 and pref 222, one per VLAN ID) pass the traffic; multipath6_test
+# reads their statistics to see how traffic was split.
+sw3_create()
+{
+ simple_if_init $ul31 192.0.2.177/28
+ __simple_if_init $ul32 v$ul31
+ vlan_create $ul32 111 v$ul31 192.0.2.146/28
+ vlan_create $ul32 222 v$ul31 192.0.2.162/28
+
+ ip route add vrf v$ul31 192.0.2.66/32 via 192.0.2.178
+ ip route add vrf v$ul31 192.0.2.65/32 \
+ nexthop via 192.0.2.145 \
+ nexthop via 192.0.2.161
+
+ tc qdisc add dev $ul32 clsact
+ tc filter add dev $ul32 ingress pref 111 prot 802.1Q \
+ flower vlan_id 111 action pass
+ tc filter add dev $ul32 ingress pref 222 prot 802.1Q \
+ flower vlan_id 222 action pass
+}
+
+# Undo sw3_create: delete the clsact qdisc on $ul32, then the routes, the
+# VLANs and the two ports.
+sw3_destroy()
+{
+ tc qdisc del dev $ul32 clsact
+
+ ip route del vrf v$ul31 192.0.2.65/32
+ ip route del vrf v$ul31 192.0.2.66/32
+
+ vlan_destroy $ul32 222
+ vlan_destroy $ul32 111
+ __simple_if_fini $ul32
+ simple_if_fini $ul31 192.0.2.177/28
+}
+
+# Set up SW4, the mirror image of SW1: IPv6 overlay port $ol4
+# (2001:db8:2::1/64) and IPv4 underlay port $ul4 (192.0.2.178/28) share VRF
+# v$ol4. GRE tunnel g2 runs from 192.0.2.66 to 192.0.2.65; the remote endpoint
+# is reached via SW3 (192.0.2.177) and H1's IPv6 subnet is routed into the
+# tunnel.
+sw4_create()
+{
+ simple_if_init $ol4 2001:db8:2::1/64
+ __simple_if_init $ul4 v$ol4 192.0.2.178/28
+
+ tunnel_create g2 gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol4
+ __simple_if_init g2 v$ol4 192.0.2.66/32
+ ip route add vrf v$ol4 192.0.2.65/32 via 192.0.2.177
+
+ ip -6 route add vrf v$ol4 2001:db8:1::/64 dev g2
+}
+
+# Undo sw4_create in reverse order: overlay route, tunnel endpoint route,
+# tunnel g2, then the underlay and overlay ports.
+sw4_destroy()
+{
+ ip -6 route del vrf v$ol4 2001:db8:1::/64
+
+ ip route del vrf v$ol4 192.0.2.65/32
+ __simple_if_fini g2 192.0.2.66/32
+ tunnel_destroy g2
+
+ __simple_if_fini $ul4 192.0.2.178/28
+ simple_if_fini $ol4 2001:db8:2::1/64
+}
+
+# Set up host H2: address $h2 with 2001:db8:2::2/64 and route H1's subnet
+# (2001:db8:1::/64) via SW4 (2001:db8:2::1) in VRF v$h2.
+h2_create()
+{
+ simple_if_init $h2 2001:db8:2::2/64
+ ip -6 route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
+}
+
+# Undo h2_create in reverse order: remove the route, then release $h2.
+h2_destroy()
+{
+ ip -6 route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
+ simple_if_fini $h2 2001:db8:2::2/64
+}
+
+# Bind the ten NETIFS ports to their roles in the topology (host, overlay and
+# underlay ports), build every node from H1 to H2 and enable forwarding.
+# The role variables are deliberately global: the *_create, *_destroy and test
+# functions read them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+
+ ol1=${NETIFS[p2]}
+ ul1=${NETIFS[p3]}
+
+ ul21=${NETIFS[p4]}
+ ul22=${NETIFS[p5]}
+
+ ul32=${NETIFS[p6]}
+ ul31=${NETIFS[p7]}
+
+ ul4=${NETIFS[p8]}
+ ol4=${NETIFS[p9]}
+
+ h2=${NETIFS[p10]}
+
+ vrf_prepare
+ h1_create
+ sw1_create
+ sw2_create
+ sw3_create
+ sw4_create
+ h2_create
+
+ forwarding_enable
+}
+
+# Tear the topology down in the reverse order of setup_prepare. Installed as
+# the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ sw4_destroy
+ sw3_destroy
+ sw2_destroy
+ sw1_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+multipath6_test()
+{
+ # Measure how IPv6 traffic from H1 to H2, carried over the IPv4 GRE
+ # tunnel, is split between the two VLAN paths on SW2 for a given pair
+ # of nexthop weights.
+ #
+ # Arguments: $1 - description handed to multipath_eval
+ #            $2 - weight of the nexthop via 192.0.2.146 (VLAN 111)
+ #            $3 - weight of the nexthop via 192.0.2.162 (VLAN 222)
+ local desc=$1; shift
+ local w1=$1; shift
+ local w2=$1; shift
+
+ # The underlay is IPv4, hence the IPv4 hash policy sysctl. Policy 2 is
+ # in effect only for the duration of the test.
+ sysctl_set net.ipv4.fib_multipath_hash_policy 2
+ ip route replace vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 weight $w1 \
+ nexthop via 192.0.2.162 weight $w2
+
+ # Per-VLAN filter counters on SW3 before the traffic is sent.
+ local base_111=$(tc_rule_stats_get $ul32 111 ingress)
+ local base_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ ip vrf exec v$h1 \
+ $MZ $h1 -6 -q -p 64 -A "2001:db8:1::2-2001:db8:1::1e" \
+ -B "2001:db8:2::2-2001:db8:2::1e" \
+ -d 1msec -c 50 -t udp "sp=1024,dp=1024"
+ sleep 1
+
+ # Counters after the traffic; the differences are the per-path packet
+ # counts that multipath_eval compares against the weights.
+ local now_111=$(tc_rule_stats_get $ul32 111 ingress)
+ local now_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ multipath_eval "$desc" $w1 $w2 \
+ $((now_111 - base_111)) $((now_222 - base_222))
+
+ # Put back the unweighted route and the previous hash policy.
+ ip route replace vrf v$ul21 192.0.2.66/32 \
+ nexthop via 192.0.2.146 \
+ nexthop via 192.0.2.162
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+# Connectivity check: reach H2 (2001:db8:2::2) from $h1 across the tunnel.
+ping_ipv6()
+{
+ ping_test $h1 2001:db8:2::2
+}
+
+# Run multipath6_test for equal weights and for two weighted splits
+# (2:1 and 11:45).
+multipath_ipv6()
+{
+ log_info "Running IPv6 over GRE over IPv4 multipath tests"
+ multipath6_test "ECMP" 1 1
+ multipath6_test "Weighted MP 2:1" 2 1
+ multipath6_test "Weighted MP 11:45" 11 45
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/gre_multipath.sh b/tools/testing/selftests/net/forwarding/gre_multipath.sh
new file mode 100755
index 000000000..a8d8e8b3d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/gre_multipath.sh
@@ -0,0 +1,257 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test traffic distribution when a wECMP route forwards traffic to two GRE
+# tunnels.
+#
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.2.1/28 | |
+# +-------------------|-----+
+# |
+# +-------------------|------------------------+
+# | SW1 | |
+# | $ol1 + |
+# | 192.0.2.2/28 |
+# | |
+# | + g1a (gre) + g1b (gre) |
+# | loc=192.0.2.65 loc=192.0.2.81 |
+# | rem=192.0.2.66 --. rem=192.0.2.82 --. |
+# | tos=inherit | tos=inherit | |
+# | .------------------' | |
+# | | .------------------' |
+# | v v |
+# | + $ul1.111 (vlan) + $ul1.222 (vlan) |
+# | | 192.0.2.129/28 | 192.0.2.145/28 |
+# | \ / |
+# | \________________/ |
+# | | |
+# | + $ul1 |
+# +------------|-------------------------------+
+# |
+# +------------|-------------------------------+
+# | SW2 + $ul2 |
+# | _______|________ |
+# | / \ |
+# | / \ |
+# | + $ul2.111 (vlan) + $ul2.222 (vlan) |
+# | ^ 192.0.2.130/28 ^ 192.0.2.146/28 |
+# | | | |
+# | | '------------------. |
+# | '------------------. | |
+# | + g2a (gre) | + g2b (gre) | |
+# | loc=192.0.2.66 | loc=192.0.2.82 | |
+# | rem=192.0.2.65 --' rem=192.0.2.81 --' |
+# | tos=inherit tos=inherit |
+# | |
+# | $ol2 + |
+# | 192.0.2.17/28 | |
+# +-------------------|------------------------+
+# |
+# +-------------------|-----+
+# | H2 | |
+# | $h2 + |
+# | 192.0.2.18/28 |
+# +-------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ multipath_ipv4
+"
+
+NUM_NETIFS=6
+source lib.sh
+
+# Set up host H1: address $h1 with 192.0.2.1/28 (and 2001:db8:1::1/64, although
+# ALL_TESTS in this script only lists IPv4 tests) and route H2's subnet
+# (192.0.2.16/28) via SW1 (192.0.2.2) in VRF v$h1.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+ ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2
+}
+
+h1_destroy()
+{
+ # Undo h1_create in reverse order: drop the route to H2's subnet, then
+ # release the port. The address list mirrors the one h1_create hands
+ # to simple_if_init (IPv4 and IPv6), so that the IPv6 address is not
+ # left behind on $h1 once the test has finished.
+ ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# Set up SW1: overlay port $ol1 (192.0.2.2/28) and underlay port $ul1 with
+# VLANs 111 and 222, all in VRF v$ol1. Two GRE tunnels are created: g1a
+# (192.0.2.65 -> 192.0.2.66, remote reached via 192.0.2.130) and g1b
+# (192.0.2.81 -> 192.0.2.82, remote reached via 192.0.2.146). H2's subnet is
+# routed over both tunnels with a multipath route.
+sw1_create()
+{
+ simple_if_init $ol1 192.0.2.2/28
+ __simple_if_init $ul1 v$ol1
+ vlan_create $ul1 111 v$ol1 192.0.2.129/28
+ vlan_create $ul1 222 v$ol1 192.0.2.145/28
+
+ tunnel_create g1a gre 192.0.2.65 192.0.2.66 tos inherit dev v$ol1
+ __simple_if_init g1a v$ol1 192.0.2.65/32
+ ip route add vrf v$ol1 192.0.2.66/32 via 192.0.2.130
+
+ tunnel_create g1b gre 192.0.2.81 192.0.2.82 tos inherit dev v$ol1
+ __simple_if_init g1b v$ol1 192.0.2.81/32
+ ip route add vrf v$ol1 192.0.2.82/32 via 192.0.2.146
+
+ ip route add vrf v$ol1 192.0.2.16/28 \
+ nexthop dev g1a \
+ nexthop dev g1b
+}
+
+# Undo sw1_create in reverse order: multipath route, tunnel g1b, tunnel g1a,
+# VLANs, then the underlay and overlay ports.
+sw1_destroy()
+{
+ ip route del vrf v$ol1 192.0.2.16/28
+
+ ip route del vrf v$ol1 192.0.2.82/32 via 192.0.2.146
+ __simple_if_fini g1b 192.0.2.81/32
+ tunnel_destroy g1b
+
+ ip route del vrf v$ol1 192.0.2.66/32 via 192.0.2.130
+ __simple_if_fini g1a 192.0.2.65/32
+ tunnel_destroy g1a
+
+ vlan_destroy $ul1 222
+ vlan_destroy $ul1 111
+ __simple_if_fini $ul1
+ simple_if_fini $ol1 192.0.2.2/28
+}
+
+# Set up SW2, the mirror image of SW1: overlay port $ol2 (192.0.2.17/28) and
+# underlay port $ul2 with VLANs 111 and 222, all in VRF v$ol2, plus tunnels
+# g2a and g2b back to SW1 and a multipath route to H1's subnet over both.
+# Two flower filters on $ul2 ingress (pref 111 and pref 222, one per VLAN ID)
+# pass the traffic; multipath4_test reads their statistics to see how traffic
+# was split.
+sw2_create()
+{
+ simple_if_init $ol2 192.0.2.17/28
+ __simple_if_init $ul2 v$ol2
+ vlan_create $ul2 111 v$ol2 192.0.2.130/28
+ vlan_create $ul2 222 v$ol2 192.0.2.146/28
+
+ tunnel_create g2a gre 192.0.2.66 192.0.2.65 tos inherit dev v$ol2
+ __simple_if_init g2a v$ol2 192.0.2.66/32
+ ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129
+
+ tunnel_create g2b gre 192.0.2.82 192.0.2.81 tos inherit dev v$ol2
+ __simple_if_init g2b v$ol2 192.0.2.82/32
+ ip route add vrf v$ol2 192.0.2.81/32 via 192.0.2.145
+
+ ip route add vrf v$ol2 192.0.2.0/28 \
+ nexthop dev g2a \
+ nexthop dev g2b
+
+ tc qdisc add dev $ul2 clsact
+ tc filter add dev $ul2 ingress pref 111 prot 802.1Q \
+ flower vlan_id 111 action pass
+ tc filter add dev $ul2 ingress pref 222 prot 802.1Q \
+ flower vlan_id 222 action pass
+}
+
+# Undo sw2_create: delete the clsact qdisc on $ul2, then the multipath route,
+# tunnel g2b, tunnel g2a, the VLANs and the two ports.
+sw2_destroy()
+{
+ tc qdisc del dev $ul2 clsact
+
+ ip route del vrf v$ol2 192.0.2.0/28
+
+ ip route del vrf v$ol2 192.0.2.81/32 via 192.0.2.145
+ __simple_if_fini g2b 192.0.2.82/32
+ tunnel_destroy g2b
+
+ ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129
+ __simple_if_fini g2a 192.0.2.66/32
+ tunnel_destroy g2a
+
+ vlan_destroy $ul2 222
+ vlan_destroy $ul2 111
+ __simple_if_fini $ul2
+ simple_if_fini $ol2 192.0.2.17/28
+}
+
+# Set up host H2: address $h2 with 192.0.2.18/28 and route H1's subnet
+# (192.0.2.0/28) via SW2 (192.0.2.17) in VRF v$h2.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.18/28
+ ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17
+}
+
+# Undo h2_create in reverse order: remove the route, then release $h2.
+h2_destroy()
+{
+ ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17
+ simple_if_fini $h2 192.0.2.18/28
+}
+
+# Bind the six NETIFS ports to their roles in the topology, build H1, SW1, SW2
+# and H2, and enable forwarding. The role variables are deliberately global:
+# the *_create, *_destroy and test functions read them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ vrf_prepare
+ h1_create
+ sw1_create
+ sw2_create
+ h2_create
+
+ forwarding_enable
+}
+
+# Tear the topology down in the reverse order of setup_prepare. Installed as
+# the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ sw2_destroy
+ sw1_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+multipath4_test()
+{
+ # Measure how traffic from H1 to H2 is split between the two GRE
+ # tunnels on SW1 for a given pair of nexthop weights.
+ #
+ # Arguments: $1 - description handed to multipath_eval
+ #            $2 - weight of the nexthop through g1a (VLAN 111)
+ #            $3 - weight of the nexthop through g1b (VLAN 222)
+ local desc=$1; shift
+ local w1=$1; shift
+ local w2=$1; shift
+
+ # Hash policy 1 is in effect only for the duration of the test (see
+ # the kernel ip-sysctl documentation for the meaning of the value).
+ sysctl_set net.ipv4.fib_multipath_hash_policy 1
+ ip route replace vrf v$ol1 192.0.2.16/28 \
+ nexthop dev g1a weight $w1 \
+ nexthop dev g1b weight $w2
+
+ # Per-VLAN filter counters on SW2 before the traffic is sent.
+ local base_111=$(tc_rule_stats_get $ul2 111 ingress)
+ local base_222=$(tc_rule_stats_get $ul2 222 ingress)
+
+ # One flow per destination port in the given range.
+ ip vrf exec v$h1 \
+ $MZ $h1 -q -p 64 -A 192.0.2.1 -B 192.0.2.18 \
+ -d 1msec -t udp "sp=1024,dp=0-32768"
+
+ # Counters after the traffic; the differences are the per-path packet
+ # counts that multipath_eval compares against the weights.
+ local now_111=$(tc_rule_stats_get $ul2 111 ingress)
+ local now_222=$(tc_rule_stats_get $ul2 222 ingress)
+
+ multipath_eval "$desc" $w1 $w2 \
+ $((now_111 - base_111)) $((now_222 - base_222))
+
+ # Put back the unweighted route and the previous hash policy.
+ ip route replace vrf v$ol1 192.0.2.16/28 \
+ nexthop dev g1a \
+ nexthop dev g1b
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+# Connectivity check: ping H2 (192.0.2.18) from $h1 across the tunnels.
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.18
+}
+
+# Run multipath4_test for equal weights and for two weighted splits
+# (2:1 and 11:45).
+multipath_ipv4()
+{
+ log_info "Running IPv4 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh
new file mode 100755
index 000000000..9f5b3e2e5
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ip6_forward_instats_vrf.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test ipv6 stats on the incoming if when forwarding with VRF
+
+ALL_TESTS="
+ ipv6_ping
+ ipv6_in_too_big_err
+ ipv6_in_hdr_err
+ ipv6_in_addr_err
+ ipv6_in_discard
+"
+
+NUM_NETIFS=4
+source lib.sh
+
+# Set up host H1: address $h1 with 2001:1:1::2/64 and route H2's subnet
+# (2001:1:2::/64) via the router (2001:1:1::1) in VRF v$h1.
+h1_create()
+{
+ simple_if_init $h1 2001:1:1::2/64
+ ip -6 route add vrf v$h1 2001:1:2::/64 via 2001:1:1::1
+}
+
+# Undo h1_create in reverse order: remove the route, then release $h1.
+h1_destroy()
+{
+ ip -6 route del vrf v$h1 2001:1:2::/64 via 2001:1:1::1
+ simple_if_fini $h1 2001:1:1::2/64
+}
+
+# Set up the router: VRF "router" containing $rtr1 (2001:1:1::1/64, towards
+# H1) and $rtr2 (2001:1:2::1/64, towards H2). The MTU of $rtr2 is set to 1280,
+# which ipv6_in_too_big_err relies on.
+router_create()
+{
+ vrf_create router
+ __simple_if_init $rtr1 router 2001:1:1::1/64
+ __simple_if_init $rtr2 router 2001:1:2::1/64
+ mtu_set $rtr2 1280
+}
+
+# Undo router_create in reverse order: restore the MTU of $rtr2, release both
+# ports, then destroy the VRF.
+router_destroy()
+{
+ mtu_restore $rtr2
+ __simple_if_fini $rtr2 2001:1:2::1/64
+ __simple_if_fini $rtr1 2001:1:1::1/64
+ vrf_destroy router
+}
+
+# Set up host H2: address $h2 with 2001:1:2::2/64, route H1's subnet
+# (2001:1:1::/64) via the router (2001:1:2::1) in VRF v$h2, and set the MTU of
+# $h2 to 1280 like the router port it faces.
+h2_create()
+{
+ simple_if_init $h2 2001:1:2::2/64
+ ip -6 route add vrf v$h2 2001:1:1::/64 via 2001:1:2::1
+ mtu_set $h2 1280
+}
+
+# Undo h2_create in reverse order: restore the MTU, remove the route, then
+# release $h2.
+h2_destroy()
+{
+ mtu_restore $h2
+ ip -6 route del vrf v$h2 2001:1:1::/64 via 2001:1:2::1
+ simple_if_fini $h2 2001:1:2::2/64
+}
+
+# Bind the four NETIFS ports to their roles (H1, the two router ports, H2),
+# build the topology and enable forwarding. The role variables are
+# deliberately global: the *_create, *_destroy and test functions read them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rtr1=${NETIFS[p2]}
+
+ rtr2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+ h1_create
+ router_create
+ h2_create
+
+ forwarding_enable
+}
+
+# Tear the topology down in the reverse order of setup_prepare. Installed as
+# the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ router_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ipv6_in_too_big_err()
+{
+ # Check that Ip6InTooBigErrors on the router's ingress port ($rtr1)
+ # changes when a packet too big for the egress port is forwarded.
+ RET=0
+
+ local before=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors)
+ local vrf=$(master_name_get $h1)
+
+ # A 1300-byte payload exceeds the 1280-byte MTU that router_create
+ # sets on $rtr2.
+ ip vrf exec $vrf \
+ $PING6 -s 1300 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+
+ local after=$(ipv6_stats_get $rtr1 Ip6InTooBigErrors)
+ [ "$((after - before))" -ne 0 ]
+ check_err $?
+ log_test "Ip6InTooBigErrors"
+}
+
+ipv6_in_hdr_err()
+{
+ # Check that Ip6InHdrErrors on the router's ingress port ($rtr1)
+ # changes when packets with hop limit 1 have to be forwarded.
+ RET=0
+
+ local before=$(ipv6_stats_get $rtr1 Ip6InHdrErrors)
+ local vrf=$(master_name_get $h1)
+
+ # traceroute6 is the easiest way to emit hop limit 1 packets, since
+ # some ping6 implementations do not let the hop limit be specified.
+ ip vrf exec $vrf \
+ $TROUTE6 2001:1:2::2 &> /dev/null
+
+ local after=$(ipv6_stats_get $rtr1 Ip6InHdrErrors)
+ [ "$((after - before))" -ne 0 ]
+ check_err $?
+ log_test "Ip6InHdrErrors"
+}
+
+ipv6_in_addr_err()
+{
+ # Check that Ip6InAddrErrors on the router's ingress port ($rtr1)
+ # changes when a packet arrives while forwarding is switched off.
+ RET=0
+
+ local before=$(ipv6_stats_get $rtr1 Ip6InAddrErrors)
+ local vrf=$(master_name_get $h1)
+
+ # Forwarding is off only while the single ping is sent, and switched
+ # back on right afterwards.
+ sysctl -qw net.ipv6.conf.all.forwarding=0
+ ip vrf exec $vrf \
+ $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+ sysctl -qw net.ipv6.conf.all.forwarding=1
+
+ local after=$(ipv6_stats_get $rtr1 Ip6InAddrErrors)
+ [ "$((after - before))" -ne 0 ]
+ check_err $?
+ log_test "Ip6InAddrErrors"
+}
+
+ipv6_in_discard()
+{
+ # Check that Ip6InDiscards on the router's ingress port ($rtr1)
+ # changes when a forwarded packet hits a blocking xfrm policy.
+ RET=0
+
+ local before=$(ipv6_stats_get $rtr1 Ip6InDiscards)
+ local vrf=$(master_name_get $h1)
+
+ # The blocking forward policy for H2's address exists only while the
+ # single ping is sent.
+ ip xfrm policy add dst 2001:1:2::2/128 dir fwd action block
+ ip vrf exec $vrf \
+ $PING6 2001:1:2::2 -c 1 -w $PING_TIMEOUT &> /dev/null
+ ip xfrm policy del dst 2001:1:2::2/128 dir fwd
+
+ local after=$(ipv6_stats_get $rtr1 Ip6InDiscards)
+ [ "$((after - before))" -ne 0 ]
+ check_err $?
+ log_test "Ip6InDiscards"
+}
+# Connectivity check: reach H2 (2001:1:2::2) from $h1 through the router.
+ipv6_ping()
+{
+ RET=0
+
+ ping6_test $h1 2001:1:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_inner_v4_multipath.sh b/tools/testing/selftests/net/forwarding/ip6gre_inner_v4_multipath.sh
new file mode 100755
index 000000000..a257979d3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ip6gre_inner_v4_multipath.sh
@@ -0,0 +1,304 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test traffic distribution when there are multiple routes between an IPv6
+# GRE tunnel. The tunnel carries IPv4 traffic between multiple hosts.
+# Multiple routes are in the underlay network. With the default multipath
+# policy, SW2 will only look at the outer IP addresses, hence only a single
+# route would be used.
+#
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.3.{2-62}/24 | |
+# +-------------------|-----+
+# |
+# +-------------------|-------------------------+
+# | SW1 | |
+# | $ol1 + |
+# | 192.0.3.1/24 |
+# | |
+# | + g1 (gre) |
+# | loc=2001:db8:40::1 |
+# | rem=2001:db8:40::2 --. |
+# | tos=inherit | |
+# | v |
+# | + $ul1 |
+# | | 2001:db8:80::1/64 |
+# +-------------------------|-------------------+
+# |
+# +-------------------------|-------------------+
+# | SW2 | |
+# | $ul21 + |
+# | 2001:db8:80::2/64 |
+# | | |
+# | ________________|_____ |
+# | / \ |
+# | | | |
+# | + $ul22.111 (vlan) + $ul22.222 (vlan) |
+# | | 2001:db8:81::1/64 | 2001:db8:82::1/64 |
+# | | | |
+# +--|----------------------|-------------------+
+# | |
+# +--|----------------------|-------------------+
+# | | | |
+# | + $ul32.111 (vlan) + $ul32.222 (vlan) |
+# | | 2001:db8:81::2/64 | 2001:db8:82::2/64 |
+# | | | |
+# | \______________________/ |
+# | | |
+# | | |
+# | $ul31 + |
+# | 2001:db8:83::2/64 | SW3 |
+# +-------------------------|-------------------+
+# |
+# +-------------------------|-------------------+
+# | + $ul4 |
+# | ^ 2001:db8:83::1/64 |
+# | + g2 (gre) | |
+# | loc=2001:db8:40::2 | |
+# | rem=2001:db8:40::1 --' |
+# | tos=inherit |
+# | |
+# | $ol4 + |
+# | 192.0.4.1/24 | SW4 |
+# +--------------------|------------------------+
+# |
+# +--------------------|---------+
+# | | |
+# | $h2 + |
+# | 192.0.4.{2-62}/24 H2 |
+# +------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ multipath_ipv4
+"
+
+NUM_NETIFS=10
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.3.2/24
+ ip route add vrf v$h1 192.0.4.0/24 via 192.0.3.1
+}
+
+h1_destroy()
+{
+ ip route del vrf v$h1 192.0.4.0/24 via 192.0.3.1
+ simple_if_fini $h1 192.0.3.2/24
+}
+
+sw1_create()
+{
+ simple_if_init $ol1 192.0.3.1/24
+ __simple_if_init $ul1 v$ol1 2001:db8:80::1/64
+
+ tunnel_create g1 ip6gre 2001:db8:40::1 2001:db8:40::2 tos inherit dev v$ol1
+ __simple_if_init g1 v$ol1 2001:db8:40::1/128
+ ip -6 route add vrf v$ol1 2001:db8:40::2/128 via 2001:db8:80::2
+
+ ip route add vrf v$ol1 192.0.4.0/24 nexthop dev g1
+}
+
+sw1_destroy()
+{
+ ip route del vrf v$ol1 192.0.4.0/24
+
+ ip -6 route del vrf v$ol1 2001:db8:40::2/128
+ __simple_if_fini g1 2001:db8:40::1/128
+ tunnel_destroy g1
+
+ __simple_if_fini $ul1 2001:db8:80::1/64
+ simple_if_fini $ol1 192.0.3.1/24
+}
+
+sw2_create()
+{
+ simple_if_init $ul21 2001:db8:80::2/64
+ __simple_if_init $ul22 v$ul21
+ vlan_create $ul22 111 v$ul21 2001:db8:81::1/64
+ vlan_create $ul22 222 v$ul21 2001:db8:82::1/64
+
+ ip -6 route add vrf v$ul21 2001:db8:40::1/128 via 2001:db8:80::1
+ ip -6 route add vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 \
+ nexthop via 2001:db8:82::2
+}
+
+sw2_destroy()
+{
+ ip -6 route del vrf v$ul21 2001:db8:40::2/128
+ ip -6 route del vrf v$ul21 2001:db8:40::1/128
+
+ vlan_destroy $ul22 222
+ vlan_destroy $ul22 111
+ __simple_if_fini $ul22
+ simple_if_fini $ul21 2001:db8:80::2/64
+}
+
+sw3_create()
+{
+ simple_if_init $ul31 2001:db8:83::2/64
+ __simple_if_init $ul32 v$ul31
+ vlan_create $ul32 111 v$ul31 2001:db8:81::2/64
+ vlan_create $ul32 222 v$ul31 2001:db8:82::2/64
+
+ ip -6 route add vrf v$ul31 2001:db8:40::2/128 via 2001:db8:83::1
+ ip -6 route add vrf v$ul31 2001:db8:40::1/128 \
+ nexthop via 2001:db8:81::1 \
+ nexthop via 2001:db8:82::1
+
+ tc qdisc add dev $ul32 clsact
+ tc filter add dev $ul32 ingress pref 111 prot 802.1Q \
+ flower vlan_id 111 action pass
+ tc filter add dev $ul32 ingress pref 222 prot 802.1Q \
+ flower vlan_id 222 action pass
+}
+
+sw3_destroy()
+{
+ tc qdisc del dev $ul32 clsact # undo sw3_create in reverse order, starting with the qdisc
+
+ ip -6 route del vrf v$ul31 2001:db8:40::1/128
+ ip -6 route del vrf v$ul31 2001:db8:40::2/128
+
+ vlan_destroy $ul32 222
+ vlan_destroy $ul32 111
+ __simple_if_fini $ul32
+ simple_if_fini $ul31 2001:db8:83::2/64 # same address spelling as in sw3_create
+}
+
+sw4_create()
+{
+ simple_if_init $ol4 192.0.4.1/24
+ __simple_if_init $ul4 v$ol4 2001:db8:83::1/64
+
+ tunnel_create g2 ip6gre 2001:db8:40::2 2001:db8:40::1 tos inherit dev v$ol4
+ __simple_if_init g2 v$ol4 2001:db8:40::2/128
+ ip -6 route add vrf v$ol4 2001:db8:40::1/128 via 2001:db8:83::2
+
+ ip route add vrf v$ol4 192.0.3.0/24 nexthop dev g2
+}
+
+sw4_destroy()
+{
+ ip route del vrf v$ol4 192.0.3.0/24
+
+ ip -6 route del vrf v$ol4 2001:db8:40::1/128
+ __simple_if_fini g2 2001:db8:40::2/128
+ tunnel_destroy g2
+
+ __simple_if_fini $ul4 2001:db8:83::1/64
+ simple_if_fini $ol4 192.0.4.1/24
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.4.2/24
+ ip route add vrf v$h2 192.0.3.0/24 via 192.0.4.1
+}
+
+h2_destroy()
+{
+ ip route del vrf v$h2 192.0.3.0/24 via 192.0.4.1
+ simple_if_fini $h2 192.0.4.2/24
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+
+ ol1=${NETIFS[p2]}
+ ul1=${NETIFS[p3]}
+
+ ul21=${NETIFS[p4]}
+ ul22=${NETIFS[p5]}
+
+ ul32=${NETIFS[p6]}
+ ul31=${NETIFS[p7]}
+
+ ul4=${NETIFS[p8]}
+ ol4=${NETIFS[p9]}
+
+ h2=${NETIFS[p10]}
+
+ vrf_prepare
+ h1_create
+ sw1_create
+ sw2_create
+ sw3_create
+ sw4_create
+ h2_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ sw4_destroy
+ sw3_destroy
+ sw2_destroy
+ sw1_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+multipath4_test()
+{
+ local what=$1; shift # label handed to multipath_eval
+ local weight1=$1; shift # weight of the nexthop via 2001:db8:81::2
+ local weight2=$1; shift # weight of the nexthop via 2001:db8:82::2
+
+ sysctl_set net.ipv6.fib_multipath_hash_policy 2 # presumably selects hashing on inner headers - see ip-sysctl docs
+ ip route replace vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 weight $weight1 \
+ nexthop via 2001:db8:82::2 weight $weight2
+
+ local t0_111=$(tc_rule_stats_get $ul32 111 ingress) # pref 111 filter (vlan_id 111) before traffic
+ local t0_222=$(tc_rule_stats_get $ul32 222 ingress) # pref 222 filter (vlan_id 222) before traffic
+
+ ip vrf exec v$h1 \
+ $MZ $h1 -q -p 64 -A "192.0.3.2-192.0.3.62" -B "192.0.4.2-192.0.4.62" \
+ -d 1msec -c 50 -t udp "sp=1024,dp=1024"
+ sleep 1 # presumably lets the counters settle before they are read again
+
+ local t1_111=$(tc_rule_stats_get $ul32 111 ingress) # same counters after traffic
+ local t1_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ local d111=$((t1_111 - t0_111)) # packets seen on VLAN 111 during the run
+ local d222=$((t1_222 - t0_222)) # packets seen on VLAN 222 during the run
+ multipath_eval "$what" $weight1 $weight2 $d111 $d222
+
+ ip route replace vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 \
+ nexthop via 2001:db8:82::2
+ sysctl_restore net.ipv6.fib_multipath_hash_policy # pairs with sysctl_set above
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.4.2
+}
+
+multipath_ipv4()
+{
+ log_info "Running IPv4 over GRE over IPv6 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ip6gre_inner_v6_multipath.sh b/tools/testing/selftests/net/forwarding/ip6gre_inner_v6_multipath.sh
new file mode 100755
index 000000000..d208f5243
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ip6gre_inner_v6_multipath.sh
@@ -0,0 +1,305 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test traffic distribution when there are multiple routes between an IPv6
+# GRE tunnel. The tunnel carries IPv6 traffic between multiple hosts.
+# Multiple routes are in the underlay network. With the default multipath
+# policy, SW2 will only look at the outer IP addresses, hence only a single
+# route would be used.
+#
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 2001:db8:1::2/64 | |
+# +-------------------|-----+
+# |
+# +-------------------|-------------------------+
+# | SW1 | |
+# | $ol1 + |
+# | 2001:db8:1::1/64 |
+# | |
+# | + g1 (gre) |
+# | loc=2001:db8:40::1 |
+# | rem=2001:db8:40::2 --. |
+# | tos=inherit | |
+# | v |
+# | + $ul1 |
+# | | 2001:db8:80::1/64 |
+# +-------------------------|-------------------+
+# |
+# +-------------------------|-------------------+
+# | SW2 | |
+# | $ul21 + |
+# | 2001:db8:80::2/64 |
+# | | |
+# | ________________|_____ |
+# | / \ |
+# | | | |
+# | + $ul22.111 (vlan) + $ul22.222 (vlan) |
+# | | 2001:db8:81::1/64 | 2001:db8:82::1/64 |
+# | | | |
+# +--|----------------------|-------------------+
+# | |
+# +--|----------------------|-------------------+
+# | | | |
+# | + $ul32.111 (vlan) + $ul32.222 (vlan) |
+# | | 2001:db8:81::2/64 | 2001:db8:82::2/64 |
+# | | | |
+# | \______________________/ |
+# | | |
+# | | |
+# | $ul31 + |
+# | 2001:db8:83::2/64 | SW3 |
+# +-------------------------|-------------------+
+# |
+# +-------------------------|-------------------+
+# | + $ul4 |
+# | ^ 2001:db8:83::1/64 |
+# | + g2 (gre) | |
+# | loc=2001:db8:40::2 | |
+# | rem=2001:db8:40::1 --' |
+# | tos=inherit |
+# | |
+# | $ol4 + |
+# | 2001:db8:2::1/64 | SW4 |
+# +--------------------|------------------------+
+# |
+# +--------------------|---------+
+# | | |
+# | $h2 + |
+# | 2001:db8:2::2/64 H2 |
+# +------------------------------+
+
+ALL_TESTS="
+ ping_ipv6
+ multipath_ipv6
+"
+
+NUM_NETIFS=10
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 2001:db8:1::2/64
+ ip -6 route add vrf v$h1 2001:db8:2::/64 via 2001:db8:1::1
+}
+
+h1_destroy()
+{
+ ip -6 route del vrf v$h1 2001:db8:2::/64 via 2001:db8:1::1
+ simple_if_fini $h1 2001:db8:1::2/64
+}
+
+sw1_create()
+{
+ simple_if_init $ol1 2001:db8:1::1/64
+ __simple_if_init $ul1 v$ol1 2001:db8:80::1/64
+
+ tunnel_create g1 ip6gre 2001:db8:40::1 2001:db8:40::2 tos inherit dev v$ol1
+ __simple_if_init g1 v$ol1 2001:db8:40::1/128
+ ip -6 route add vrf v$ol1 2001:db8:40::2/128 via 2001:db8:80::2
+
+ ip -6 route add vrf v$ol1 2001:db8:2::/64 dev g1
+}
+
+sw1_destroy()
+{
+ ip -6 route del vrf v$ol1 2001:db8:2::/64
+
+ ip -6 route del vrf v$ol1 2001:db8:40::2/128
+ __simple_if_fini g1 2001:db8:40::1/128
+ tunnel_destroy g1
+
+ __simple_if_fini $ul1 2001:db8:80::1/64
+ simple_if_fini $ol1 2001:db8:1::1/64
+}
+
+sw2_create()
+{
+ simple_if_init $ul21 2001:db8:80::2/64
+ __simple_if_init $ul22 v$ul21
+ vlan_create $ul22 111 v$ul21 2001:db8:81::1/64
+ vlan_create $ul22 222 v$ul21 2001:db8:82::1/64
+
+ ip -6 route add vrf v$ul21 2001:db8:40::1/128 via 2001:db8:80::1
+ ip -6 route add vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 \
+ nexthop via 2001:db8:82::2
+}
+
+sw2_destroy()
+{
+ ip -6 route del vrf v$ul21 2001:db8:40::2/128
+ ip -6 route del vrf v$ul21 2001:db8:40::1/128
+
+ vlan_destroy $ul22 222
+ vlan_destroy $ul22 111
+ __simple_if_fini $ul22
+ simple_if_fini $ul21 2001:db8:80::2/64
+}
+
+sw3_create()
+{
+ simple_if_init $ul31 2001:db8:83::2/64
+ __simple_if_init $ul32 v$ul31
+ vlan_create $ul32 111 v$ul31 2001:db8:81::2/64
+ vlan_create $ul32 222 v$ul31 2001:db8:82::2/64
+
+ ip -6 route add vrf v$ul31 2001:db8:40::2/128 via 2001:db8:83::1
+ ip -6 route add vrf v$ul31 2001:db8:40::1/128 \
+ nexthop via 2001:db8:81::1 \
+ nexthop via 2001:db8:82::1
+
+ tc qdisc add dev $ul32 clsact
+ tc filter add dev $ul32 ingress pref 111 prot 802.1Q \
+ flower vlan_id 111 action pass
+ tc filter add dev $ul32 ingress pref 222 prot 802.1Q \
+ flower vlan_id 222 action pass
+}
+
+sw3_destroy()
+{
+ tc qdisc del dev $ul32 clsact # undo sw3_create in reverse order, starting with the qdisc
+
+ ip -6 route del vrf v$ul31 2001:db8:40::1/128
+ ip -6 route del vrf v$ul31 2001:db8:40::2/128
+
+ vlan_destroy $ul32 222
+ vlan_destroy $ul32 111
+ __simple_if_fini $ul32
+ simple_if_fini $ul31 2001:db8:83::2/64 # same address spelling as in sw3_create
+}
+
+sw4_create()
+{
+ simple_if_init $ol4 2001:db8:2::1/64
+ __simple_if_init $ul4 v$ol4 2001:db8:83::1/64
+
+ tunnel_create g2 ip6gre 2001:db8:40::2 2001:db8:40::1 tos inherit dev v$ol4
+ __simple_if_init g2 v$ol4 2001:db8:40::2/128
+ ip -6 route add vrf v$ol4 2001:db8:40::1/128 via 2001:db8:83::2
+
+ ip -6 route add vrf v$ol4 2001:db8:1::/64 dev g2
+}
+
+sw4_destroy()
+{
+ ip -6 route del vrf v$ol4 2001:db8:1::/64
+
+ ip -6 route del vrf v$ol4 2001:db8:40::1/128
+ __simple_if_fini g2 2001:db8:40::2/128
+ tunnel_destroy g2
+
+ __simple_if_fini $ul4 2001:db8:83::1/64
+ simple_if_fini $ol4 2001:db8:2::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 2001:db8:2::2/64
+ ip -6 route add vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
+}
+
+h2_destroy()
+{
+ ip -6 route del vrf v$h2 2001:db8:1::/64 via 2001:db8:2::1
+ simple_if_fini $h2 2001:db8:2::2/64
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+
+ ol1=${NETIFS[p2]}
+ ul1=${NETIFS[p3]}
+
+ ul21=${NETIFS[p4]}
+ ul22=${NETIFS[p5]}
+
+ ul32=${NETIFS[p6]}
+ ul31=${NETIFS[p7]}
+
+ ul4=${NETIFS[p8]}
+ ol4=${NETIFS[p9]}
+
+ h2=${NETIFS[p10]}
+
+ vrf_prepare
+ h1_create
+ sw1_create
+ sw2_create
+ sw3_create
+ sw4_create
+ h2_create
+
+ forwarding_enable
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ h2_destroy
+ sw4_destroy
+ sw3_destroy
+ sw2_destroy
+ sw1_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+multipath6_test()
+{
+ local what=$1; shift # label handed to multipath_eval
+ local weight1=$1; shift # weight of the nexthop via 2001:db8:81::2
+ local weight2=$1; shift # weight of the nexthop via 2001:db8:82::2
+
+ sysctl_set net.ipv6.fib_multipath_hash_policy 2 # presumably selects hashing on inner headers - see ip-sysctl docs
+ ip route replace vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 weight $weight1 \
+ nexthop via 2001:db8:82::2 weight $weight2
+
+ local t0_111=$(tc_rule_stats_get $ul32 111 ingress) # pref 111 filter (vlan_id 111) before traffic
+ local t0_222=$(tc_rule_stats_get $ul32 222 ingress) # pref 222 filter (vlan_id 222) before traffic
+
+ ip vrf exec v$h1 \
+ $MZ $h1 -6 -q -p 64 -A "2001:db8:1::2-2001:db8:1::1e" \
+ -B "2001:db8:2::2-2001:db8:2::1e" \
+ -d 1msec -c 50 -t udp "sp=1024,dp=1024"
+ sleep 1 # presumably lets the counters settle before they are read again
+
+ local t1_111=$(tc_rule_stats_get $ul32 111 ingress) # same counters after traffic
+ local t1_222=$(tc_rule_stats_get $ul32 222 ingress)
+
+ local d111=$((t1_111 - t0_111)) # packets seen on VLAN 111 during the run
+ local d222=$((t1_222 - t0_222)) # packets seen on VLAN 222 during the run
+ multipath_eval "$what" $weight1 $weight2 $d111 $d222
+
+ ip route replace vrf v$ul21 2001:db8:40::2/128 \
+ nexthop via 2001:db8:81::2 \
+ nexthop via 2001:db8:82::2
+ sysctl_restore net.ipv6.fib_multipath_hash_policy # pairs with sysctl_set above
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2 # IPv6 destination (H2), so use the IPv6 ping helper
+}
+
+multipath_ipv6()
+{
+ log_info "Running IPv6 over GRE over IPv6 multipath tests"
+ multipath6_test "ECMP" 1 1
+ multipath6_test "Weighted MP 2:1" 2 1
+ multipath6_test "Weighted MP 11:45" 11 45
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_flat_gre.sh b/tools/testing/selftests/net/forwarding/ipip_flat_gre.sh
new file mode 100755
index 000000000..abb694397
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_flat_gre.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnel without key.
+# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more
+# details.
+
+ALL_TESTS="gre_flat4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_flat_create gre $ol1 $ul1
+ sw2_flat_create gre $ol2 $ul2
+}
+
+gre_flat4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre flat"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_flat_destroy $ol2 $ul2
+ sw1_flat_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_flat_gre_key.sh b/tools/testing/selftests/net/forwarding/ipip_flat_gre_key.sh
new file mode 100755
index 000000000..c4f373337
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_flat_gre_key.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnel with key.
+# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more
+# details.
+
+ALL_TESTS="gre_flat4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_flat_create gre $ol1 $ul1 key 233
+ sw2_flat_create gre $ol2 $ul2 key 233
+}
+
+gre_flat4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre flat with key"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_flat_destroy $ol2 $ul2
+ sw1_flat_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_flat_gre_keys.sh b/tools/testing/selftests/net/forwarding/ipip_flat_gre_keys.sh
new file mode 100755
index 000000000..a811130c0
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_flat_gre_keys.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnel with different input and output keys (ikey/okey).
+# This test uses flat topology for IP tunneling tests. See ipip_lib.sh for more
+# details.
+
+ALL_TESTS="gre_flat4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_flat_create gre $ol1 $ul1 ikey 111 okey 222
+ sw2_flat_create gre $ol2 $ul2 ikey 222 okey 111
+}
+
+gre_flat4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre flat with ikey/okey"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_flat_destroy $ol2 $ul2
+ sw1_flat_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_hier_gre.sh b/tools/testing/selftests/net/forwarding/ipip_hier_gre.sh
new file mode 100755
index 000000000..05c5b3cf2
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_hier_gre.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnels without key.
+# This test uses hierarchical topology for IP tunneling tests. See
+# ipip_lib.sh for more details.
+
+ALL_TESTS="gre_hier4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_hierarchical_create gre $ol1 $ul1
+ sw2_hierarchical_create gre $ol2 $ul2
+}
+
+gre_hier4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre hierarchical"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_hierarchical_destroy $ol2 $ul2
+ sw1_hierarchical_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_hier_gre_key.sh b/tools/testing/selftests/net/forwarding/ipip_hier_gre_key.sh
new file mode 100755
index 000000000..9b105dbca
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_hier_gre_key.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnels with key.
+# This test uses hierarchical topology for IP tunneling tests. See
+# ipip_lib.sh for more details.
+
+ALL_TESTS="gre_hier4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_hierarchical_create gre $ol1 $ul1 key 22
+ sw2_hierarchical_create gre $ol2 $ul2 key 22
+}
+
+gre_hier4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre hierarchical with key"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_hierarchical_destroy $ol2 $ul2
+ sw1_hierarchical_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_hier_gre_keys.sh b/tools/testing/selftests/net/forwarding/ipip_hier_gre_keys.sh
new file mode 100755
index 000000000..e275d25bd
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_hier_gre_keys.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test IP-in-IP GRE tunnels with different input and output keys (ikey/okey).
+# This test uses hierarchical topology for IP tunneling tests. See
+# ipip_lib.sh for more details.
+
+ALL_TESTS="gre_hier4 gre_mtu_change"
+
+NUM_NETIFS=6
+source lib.sh
+source ipip_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ ol1=${NETIFS[p2]}
+
+ ul1=${NETIFS[p3]}
+ ul2=${NETIFS[p4]}
+
+ ol2=${NETIFS[p5]}
+ h2=${NETIFS[p6]}
+
+ forwarding_enable
+ vrf_prepare
+ h1_create
+ h2_create
+ sw1_hierarchical_create gre $ol1 $ul1 ikey 111 okey 222
+ sw2_hierarchical_create gre $ol2 $ul2 ikey 222 okey 111
+}
+
+gre_hier4()
+{
+ RET=0
+
+ ping_test $h1 192.0.2.18 " gre hierarchical with ikey/okey"
+}
+
+gre_mtu_change()
+{
+ test_mtu_change gre
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ sw2_hierarchical_destroy $ol2 $ul2
+ sw1_hierarchical_destroy $ol1 $ul1
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+ forwarding_restore
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/ipip_lib.sh b/tools/testing/selftests/net/forwarding/ipip_lib.sh
new file mode 100644
index 000000000..30f36a57b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/ipip_lib.sh
@@ -0,0 +1,349 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Handles creation and destruction of IP-in-IP or GRE tunnels over the given
+# topology. Supports both flat and hierarchical models.
+#
+# Flat Model:
+# Overlay and underlay share the same VRF.
+# SW1 uses default VRF so tunnel has no bound dev.
+# SW2 uses a non-default VRF, so the tunnel has a bound dev.
+# +-------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.2.1/28 | |
+# +-------------------|-----+
+# |
+# +-------------------|-----+
+# | SW1 | |
+# | $ol1 + |
+# | 192.0.2.2/28 |
+# | |
+# | + g1a (gre) |
+# | loc=192.0.2.65 |
+# | rem=192.0.2.66 --. |
+# | tos=inherit | |
+# | .------------------' |
+# | | |
+# | v |
+# | + $ul1.111 (vlan) |
+# | | 192.0.2.129/28 |
+# | \ |
+# | \_______ |
+# | | |
+# |VRF default + $ul1 |
+# +------------|------------+
+# |
+# +------------|------------+
+# | SW2 + $ul2 |
+# | _______| |
+# | / |
+# | / |
+# | + $ul2.111 (vlan) |
+# | ^ 192.0.2.130/28 |
+# | | |
+# | | |
+# | '------------------. |
+# | + g2a (gre) | |
+# | loc=192.0.2.66 | |
+# | rem=192.0.2.65 --' |
+# | tos=inherit |
+# | |
+# | $ol2 + |
+# | 192.0.2.17/28 | |
+# | VRF v$ol2 | |
+# +-------------------|-----+
+# |
+# +-------------------|-----+
+# | H2 | |
+# | $h2 + |
+# | 192.0.2.18/28 |
+# +-------------------------+
+#
+# Hierarchical model:
+# The tunnel is bound to a device in a different VRF
+#
+# +---------------------------+
+# | H1 |
+# | $h1 + |
+# | 192.0.2.1/28 | |
+# +-------------------|-------+
+# |
+# +-------------------|-------+
+# | SW1 | |
+# | +-----------------|-----+ |
+# | | $ol1 + | |
+# | | 192.0.2.2/28 | |
+# | | | |
+# | | + g1a (gre) | |
+# | | rem=192.0.2.66 | |
+# | | tos=inherit | |
+# | | loc=192.0.2.65 | |
+# | | ^ | |
+# | | VRF v$ol1 | | |
+# | +-----------|-----------+ |
+# | | |
+# | +-----------|-----------+ |
+# | | VRF v$ul1 | | |
+# | | | | |
+# | | | | |
+# | | v | |
+# | | dummy1 + | |
+# | | 192.0.2.65 | |
+# | | .-------' | |
+# | | | | |
+# | | v | |
+# | | + $ul1.111 (vlan) | |
+# | | | 192.0.2.129/28 | |
+# | | \ | |
+# | | \_____ | |
+# | | | | |
+# | | + $ul1 | |
+# | +----------|------------+ |
+# +------------|--------------+
+# |
+# +------------|--------------+
+# | SW2 | |
+# | +----------|------------+ |
+# | | + $ul2 | |
+# | | _____| | |
+# | | / | |
+# | | / | |
+# | | | $ul2.111 (vlan) | |
+# | | + 192.0.2.130/28 | |
+# | | ^ | |
+# | | | | |
+# | | '-------. | |
+# | | dummy2 + | |
+# | | 192.0.2.66 | |
+# | | ^ | |
+# | | | | |
+# | | | | |
+# | | VRF v$ul2 | | |
+# | +-----------|-----------+ |
+# | | |
+# | +-----------|-----------+ |
+# | | VRF v$ol2 | | |
+# | | | | |
+# | | v | |
+# | | g2a (gre)+ | |
+# | | loc=192.0.2.66 | |
+# | | rem=192.0.2.65 | |
+# | | tos=inherit | |
+# | | | |
+# | | $ol2 + | |
+# | | 192.0.2.17/28 | | |
+# | +-----------------|-----+ |
+# +-------------------|-------+
+# |
+# +-------------------|-------+
+# | H2 | |
+# | $h2 + |
+# | 192.0.2.18/28 |
+# +---------------------------+
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+ ip route add vrf v$h1 192.0.2.16/28 via 192.0.2.2
+}
+
+h1_destroy()
+{
+ ip route del vrf v$h1 192.0.2.16/28 via 192.0.2.2
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64 # remove both addresses that h1_create adds
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.18/28
+ ip route add vrf v$h2 192.0.2.0/28 via 192.0.2.17
+}
+
+h2_destroy()
+{
+ ip route del vrf v$h2 192.0.2.0/28 via 192.0.2.17
+ simple_if_fini $h2 192.0.2.18/28
+}
+
+sw1_flat_create()
+{
+ local type=$1; shift
+ local ol1=$1; shift
+ local ul1=$1; shift
+
+ ip link set dev $ol1 up
+ __addr_add_del $ol1 add "192.0.2.2/28"
+
+ ip link set dev $ul1 up
+ vlan_create $ul1 111 "" 192.0.2.129/28
+
+ tunnel_create g1a $type 192.0.2.65 192.0.2.66 tos inherit "$@"
+ ip link set dev g1a up
+ __addr_add_del g1a add "192.0.2.65/32"
+
+ ip route add 192.0.2.66/32 via 192.0.2.130
+
+ ip route add 192.0.2.16/28 nexthop dev g1a
+}
+
+sw1_flat_destroy()
+{
+ local ol1=$1; shift
+ local ul1=$1; shift
+
+ ip route del 192.0.2.16/28
+
+ ip route del 192.0.2.66/32 via 192.0.2.130
+ __simple_if_fini g1a 192.0.2.65/32
+ tunnel_destroy g1a
+
+ vlan_destroy $ul1 111
+ __simple_if_fini $ul1
+ __simple_if_fini $ol1 192.0.2.2/28
+}
+
+sw2_flat_create()
+{
+ local type=$1; shift
+ local ol2=$1; shift
+ local ul2=$1; shift
+
+ simple_if_init $ol2 192.0.2.17/28
+ __simple_if_init $ul2 v$ol2
+ vlan_create $ul2 111 v$ol2 192.0.2.130/28
+
+ tunnel_create g2a $type 192.0.2.66 192.0.2.65 tos inherit dev v$ol2 \
+ "$@"
+ __simple_if_init g2a v$ol2 192.0.2.66/32
+
+ ip route add vrf v$ol2 192.0.2.65/32 via 192.0.2.129
+ ip route add vrf v$ol2 192.0.2.0/28 nexthop dev g2a
+}
+
+sw2_flat_destroy()
+{
+ local ol2=$1; shift
+ local ul2=$1; shift
+
+ ip route del vrf v$ol2 192.0.2.0/28
+
+ ip route del vrf v$ol2 192.0.2.65/32 via 192.0.2.129
+ __simple_if_fini g2a 192.0.2.66/32
+ tunnel_destroy g2a
+
+ vlan_destroy $ul2 111
+ __simple_if_fini $ul2
+ simple_if_fini $ol2 192.0.2.17/28
+}
+
+sw1_hierarchical_create()
+{
+ local type=$1; shift
+ local ol1=$1; shift
+ local ul1=$1; shift
+
+ simple_if_init $ol1 192.0.2.2/28
+ simple_if_init $ul1
+ ip link add name dummy1 type dummy
+ __simple_if_init dummy1 v$ul1 192.0.2.65/32
+
+ vlan_create $ul1 111 v$ul1 192.0.2.129/28
+ tunnel_create g1a $type 192.0.2.65 192.0.2.66 tos inherit dev dummy1 \
+ "$@"
+ ip link set dev g1a master v$ol1
+
+ ip route add vrf v$ul1 192.0.2.66/32 via 192.0.2.130
+ ip route add vrf v$ol1 192.0.2.16/28 nexthop dev g1a
+}
+
+sw1_hierarchical_destroy()
+{
+ local ol1=$1; shift
+ local ul1=$1; shift
+
+ ip route del vrf v$ol1 192.0.2.16/28
+ ip route del vrf v$ul1 192.0.2.66/32
+
+ tunnel_destroy g1a
+ vlan_destroy $ul1 111
+
+ __simple_if_fini dummy1 192.0.2.65/32
+ ip link del dev dummy1
+
+ simple_if_fini $ul1
+ simple_if_fini $ol1 192.0.2.2/28
+}
+
+sw2_hierarchical_create()
+{
+ local type=$1; shift
+ local ol2=$1; shift
+ local ul2=$1; shift
+
+ simple_if_init $ol2 192.0.2.17/28
+ simple_if_init $ul2
+
+ ip link add name dummy2 type dummy
+ __simple_if_init dummy2 v$ul2 192.0.2.66/32
+
+ vlan_create $ul2 111 v$ul2 192.0.2.130/28
+ tunnel_create g2a $type 192.0.2.66 192.0.2.65 tos inherit dev dummy2 \
+ "$@"
+ ip link set dev g2a master v$ol2
+
+ ip route add vrf v$ul2 192.0.2.65/32 via 192.0.2.129
+ ip route add vrf v$ol2 192.0.2.0/28 nexthop dev g2a
+}
+
+sw2_hierarchical_destroy()
+{
+ local ol2=$1; shift
+ local ul2=$1; shift
+
+ ip route del vrf v$ol2 192.0.2.0/28
+ ip route del vrf v$ul2 192.0.2.65/32
+
+ tunnel_destroy g2a
+ vlan_destroy $ul2 111
+
+ __simple_if_fini dummy2 192.0.2.66/32
+ ip link del dev dummy2
+
+ simple_if_fini $ul2
+ simple_if_fini $ol2 192.0.2.17/28
+}
+
+topo_mtu_change()
+{
+ local mtu=$1 # MTU applied to every device listed below
+
+ ip link set mtu $mtu dev $h1 # device names come from the variables set by the calling test
+ ip link set mtu $mtu dev $ol1
+ ip link set mtu $mtu dev g1a
+ ip link set mtu $mtu dev $ul1
+ ip link set mtu $mtu dev $ul1.111
+ ip link set mtu $mtu dev $h2
+ ip link set mtu $mtu dev $ol2
+ ip link set mtu $mtu dev g2a
+ ip link set mtu $mtu dev $ul2
+ ip link set mtu $mtu dev $ul2.111
+}
+
+test_mtu_change()
+{
+ local encap=$1; shift # tunnel type name, used only in the messages below
+
+ RET=0
+
+ ping_do $h1 192.0.2.18 "-s 1800 -w 3"
+ check_fail $? "ping $encap should not pass with size 1800"
+
+ RET=0 # NOTE(review): presumably discards the check_fail outcome above, which never reaches log_test - confirm this is intended
+
+ topo_mtu_change 2000 # set MTU 2000 on the host, overlay, tunnel and underlay devices
+ ping_do $h1 192.0.2.18 "-s 1800 -w 3"
+ check_err $?
+ log_test "ping $encap packet size 1800 after MTU change"
+}
diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh
new file mode 100644
index 000000000..dfb41db7f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/lib.sh
@@ -0,0 +1,1300 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+##############################################################################
+# Defines
+
+# Defaults. A value already present in the environment is kept, and the
+# configuration file sourced below can override any of these.
+: "${PING:=ping}"
+: "${PING6:=ping6}"
+: "${MZ:=mausezahn}"
+: "${ARPING:=arping}"
+: "${TEAMD:=teamd}"
+: "${WAIT_TIME:=5}"
+: "${PAUSE_ON_FAIL:=no}"
+: "${PAUSE_ON_CLEANUP:=no}"
+: "${NETIF_TYPE:=veth}"
+: "${NETIF_CREATE:=yes}"
+: "${MCD:=smcrouted}"
+: "${MC_CLI:=smcroutectl}"
+: "${PING_TIMEOUT:=5}"
+: "${WAIT_TIMEOUT:=20}"
+: "${INTERFACE_TIMEOUT:=600}"
+
+relative_path="${BASH_SOURCE%/*}"
+if [[ "$relative_path" == "${BASH_SOURCE}" ]]; then
+ relative_path="."
+fi
+
+if [[ -f $relative_path/forwarding.config ]]; then
+ source "$relative_path/forwarding.config"
+fi
+
+##############################################################################
+# Sanity checks
+
+check_tc_version()
+{
+ tc -j &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing JSON support"
+ exit 1
+ fi
+}
+
+check_tc_shblock_support()
+{
+ tc filter help 2>&1 | grep block &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing shared block support"
+ exit 1
+ fi
+}
+
+check_tc_chain_support()
+{
+ tc help 2>&1|grep chain &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing chain support"
+ exit 1
+ fi
+}
+
+check_tc_action_hw_stats_support()
+{
+ tc actions help 2>&1 | grep -q hw_stats
+ if [[ $? -ne 0 ]]; then
+ echo "SKIP: iproute2 too old; tc is missing action hw_stats support"
+ exit 1
+ fi
+}
+
+skip_on_veth()
+{
+ local kind=$(ip -j -d link show dev ${NETIFS[p1]} |
+ jq -r '.[].linkinfo.info_kind')
+
+ if [[ $kind == veth ]]; then
+ echo "SKIP: Test cannot be run with veth pairs"
+ exit $ksft_skip
+ fi
+}
+
+# Everything below manipulates network devices, so root is required. Exit
+# with the kselftest skip code so that a skipped run is not counted as a
+# pass.
+if [[ "$(id -u)" -ne 0 ]]; then
+	echo "SKIP: need root privileges"
+	exit ${ksft_skip:-4}
+fi
+
+# Importers opt in to the tc JSON check by setting CHECK_TC=yes.
+if [[ "$CHECK_TC" = "yes" ]]; then
+	check_tc_version
+fi
+
+# Skip the test unless command $1 resolves to an executable.
+require_command()
+{
+	local cmd=$1; shift
+
+	if [[ ! -x "$(command -v "$cmd")" ]]; then
+		echo "SKIP: $cmd not installed"
+		# Report a skip, not a failure, to the kselftest runner.
+		exit ${ksft_skip:-4}
+	fi
+}
+
+# Tools every importer relies on.
+require_command jq
+require_command $MZ
+
+# The importer must say how many interfaces it needs before sourcing us.
+if [[ ! -v NUM_NETIFS ]]; then
+	echo "SKIP: importer does not define \"NUM_NETIFS\""
+	exit ${ksft_skip:-4}
+fi
+
+##############################################################################
+# Command line options handling
+
+count=0
+
+while [[ $# -gt 0 ]]; do
+ if [[ "$count" -eq "0" ]]; then
+ unset NETIFS
+ declare -A NETIFS
+ fi
+ count=$((count + 1))
+ NETIFS[p$count]="$1"
+ shift
+done
+
+##############################################################################
+# Network interfaces configuration
+
+# Create the veth pairs backing NETIFS[p1..pNUM_NETIFS] where they do not
+# exist yet. Skips the test when a needed name is missing and exits with an
+# error when a pair cannot be created.
+create_netif_veth()
+{
+	local i
+
+	# Ports come in pairs (p1/p2, p3/p4, ...): each pass handles index i
+	# together with i+1, and "i=$j" plus the loop's own ++i moves on to
+	# the next odd index.
+	for ((i = 1; i <= NUM_NETIFS; ++i)); do
+		local j=$((i+1))
+
+		if [[ -z "${NETIFS[p$i]}" ]]; then
+			echo "SKIP: Cannot create interface. Name not specified"
+			# ksft_skip may be unset here; fall back to 4.
+			exit ${ksft_skip:-4}
+		fi
+
+		if ! ip link show dev ${NETIFS[p$i]} &> /dev/null; then
+			if ! ip link add ${NETIFS[p$i]} type veth \
+				peer name ${NETIFS[p$j]}; then
+				echo "Failed to create netif"
+				exit 1
+			fi
+		fi
+		i=$j
+	done
+}
+
+# Create the test interfaces according to NETIF_TYPE. Only "veth" is
+# supported; any other type ends the run with an error.
+create_netif()
+{
+	case "$NETIF_TYPE" in
+	veth) create_netif_veth
+	      ;;
+	# Inside double quotes a backslash before ' is printed literally, so
+	# the quotes are written unescaped.
+	*) echo "Can not create interfaces of type '$NETIF_TYPE'"
+	   exit 1
+	   ;;
+	esac
+}
+
+if [[ "$NETIF_CREATE" = "yes" ]]; then
+	create_netif
+fi
+
+# Every interface the importer asked for must exist by now; otherwise the
+# test is skipped (kselftest skip code, not a failure).
+for ((i = 1; i <= NUM_NETIFS; ++i)); do
+	if ! ip link show dev ${NETIFS[p$i]} &> /dev/null; then
+		echo "SKIP: could not find all required interfaces"
+		exit ${ksft_skip:-4}
+	fi
+done
+
+##############################################################################
+# Helpers
+
+# Exit status to return at the end. Set in case one of the tests fails.
+EXIT_STATUS=0
+# Per-test return value. Clear at the beginning of each test.
+RET=0
+
+check_err()
+{
+ local err=$1
+ local msg=$2
+
+ if [[ $RET -eq 0 && $err -ne 0 ]]; then
+ RET=$err
+ retmsg=$msg
+ fi
+}
+
+check_fail()
+{
+ local err=$1
+ local msg=$2
+
+ if [[ $RET -eq 0 && $err -eq 0 ]]; then
+ RET=1
+ retmsg=$msg
+ fi
+}
+
+# Dispatch to check_fail or check_err.
+# $1 - non-zero when the operation was expected to fail.
+# $2 - exit status of the operation.
+# $3 - description of the operation, used to build the message.
+check_err_fail()
+{
+	local expect_failure=$1; shift
+	local status=$1; shift
+	local subject=$1; shift
+
+	if ((expect_failure)); then
+		check_fail $status "$subject succeeded, but should have failed"
+		return
+	fi
+
+	check_err $status "$subject failed"
+}
+
+log_test()
+{
+ local test_name=$1
+ local opt_str=$2
+
+ if [[ $# -eq 2 ]]; then
+ opt_str="($opt_str)"
+ fi
+
+ if [[ $RET -ne 0 ]]; then
+ EXIT_STATUS=1
+ printf "TEST: %-60s [FAIL]\n" "$test_name $opt_str"
+ if [[ ! -z "$retmsg" ]]; then
+ printf "\t%s\n" "$retmsg"
+ fi
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo "Hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ return 1
+ fi
+
+ printf "TEST: %-60s [ OK ]\n" "$test_name $opt_str"
+ return 0
+}
+
+# Print $1 as an informational line.
+log_info()
+{
+	echo "INFO: $1"
+}
+
+# Re-run a command until it succeeds or the timeout is exceeded.
+# $1 - timeout, compared against differences of "date +%s%3N" stamps.
+# Remaining arguments - the command to run.
+# The output of the last attempt is printed without a trailing newline.
+# Returns 0 when the command succeeded, 1 on timeout.
+busywait()
+{
+	local timeout=$1; shift
+	local began now captured
+
+	began=$(date -u +%s%3N)
+	while :; do
+		if captured=$("$@"); then
+			echo -n "$captured"
+			return 0
+		fi
+
+		now=$(date -u +%s%3N)
+		if ((now - began > timeout)); then
+			echo -n "$captured"
+			return 1
+		fi
+	done
+}
+
+# Run the given command and invert its exit status.
+not()
+{
+	! "$@"
+}
+
+# Run a command and filter its output.
+# $1 - address to grep for.
+# $2 - optional "self" or "master"; when given, the output is additionally
+#      filtered on that word, and an optional following "-v" is passed to
+#      that second grep.
+# Remaining arguments - the command whose output is filtered.
+grep_bridge_fdb()
+{
+ local addr=$1; shift
+ local word
+ local flag
+
+ if [ "$1" == "self" ] || [ "$1" == "master" ]; then
+ word=$1; shift
+ if [ "$1" == "-v" ]; then
+ flag=$1; shift
+ fi
+ fi
+
+ # $flag is left unquoted so that it disappears when it is empty.
+ $@ | grep $addr | grep $flag "$word"
+}
+
+# Succeed when the output of the given command contains "offload".
+wait_for_offload()
+{
+ "$@" | grep -q offload
+}
+
+# Print the counter obtained by running the command in $2... and succeed
+# when it satisfies the comparison in $1.
+# $1 - comparison appended to the counter value, e.g. ">= 10".
+until_counter_is()
+{
+ local expr=$1; shift
+ local current=$("$@")
+
+ echo $((current))
+ ((current $expr))
+}
+
+# Busy-wait until the counter read by the command in $3... has grown by at
+# least $2 over the value sampled on entry.
+# $1 - timeout handed to busywait.
+# $2 - required increase.
+busywait_for_counter()
+{
+ local timeout=$1; shift
+ local delta=$1; shift
+
+ local base=$("$@")
+ busywait "$timeout" until_counter_is ">= $((base + delta))" "$@"
+}
+
+setup_wait_dev()
+{
+ local dev=$1; shift
+ local wait_time=${1:-$WAIT_TIME}; shift
+
+ setup_wait_dev_with_timeout "$dev" $INTERFACE_TIMEOUT $wait_time
+
+ if (($?)); then
+ check_err 1
+ log_test setup_wait_dev ": Interface $dev does not come up."
+ exit 1
+ fi
+}
+
+setup_wait_dev_with_timeout()
+{
+ local dev=$1; shift
+ local max_iterations=${1:-$WAIT_TIMEOUT}; shift
+ local wait_time=${1:-$WAIT_TIME}; shift
+ local i
+
+ for ((i = 1; i <= $max_iterations; ++i)); do
+ ip link show dev $dev up \
+ | grep 'state UP' &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ sleep 1
+ else
+ sleep $wait_time
+ return 0
+ fi
+ done
+
+ return 1
+}
+
+setup_wait()
+{
+ local num_netifs=${1:-$NUM_NETIFS}
+ local i
+
+ for ((i = 1; i <= num_netifs; ++i)); do
+ setup_wait_dev ${NETIFS[p$i]} 0
+ done
+
+ # Make sure links are ready.
+ sleep $WAIT_TIME
+}
+
+cmd_jq()
+{
+ local cmd=$1
+ local jq_exp=$2
+ local jq_opts=$3
+ local ret
+ local output
+
+ output="$($cmd)"
+ # it the command fails, return error right away
+ ret=$?
+ if [[ $ret -ne 0 ]]; then
+ return $ret
+ fi
+ output=$(echo $output | jq -r $jq_opts "$jq_exp")
+ ret=$?
+ if [[ $ret -ne 0 ]]; then
+ return $ret
+ fi
+ echo $output
+ # return success only in case of non-empty output
+ [ ! -z "$output" ]
+}
+
+# Block until lldptool no longer reports "pending" or "unknown" APP entries
+# for device $1, polling every 5 seconds.
+lldpad_app_wait_set()
+{
+ local dev=$1; shift
+
+ while lldptool -t -i $dev -V APP -c app | grep -Eq "pending|unknown"; do
+ echo "$dev: waiting for lldpad to push pending APP updates"
+ sleep 5
+ done
+}
+
+# Fixed 5 second delay after APP entries were deleted; see below for why
+# this cannot be polled.
+lldpad_app_wait_del()
+{
+ # Give lldpad a chance to push down the changes. If the device is downed
+ # too soon, the updates will be left pending. However, they will have
+ # been struck off the lldpad's DB already, so we won't be able to tell
+ # they are pending. Then on next test iteration this would cause
+ # weirdness as newly-added APP rules conflict with the old ones,
+ # sometimes getting stuck in an "unknown" state.
+ sleep 5
+}
+
+# With PAUSE_ON_CLEANUP=yes, wait for the user before cleanup proceeds.
+pre_cleanup()
+{
+	[[ "${PAUSE_ON_CLEANUP}" == "yes" ]] || return 0
+
+	echo "Pausing before cleanup, hit any key to continue"
+	read
+}
+
+# Move the rule for the "local" table from preference 0 to preference 32765,
+# for both IPv4 and IPv6.
+vrf_prepare()
+{
+ ip -4 rule add pref 32765 table local
+ ip -4 rule del pref 0
+ ip -6 rule add pref 32765 table local
+ ip -6 rule del pref 0
+}
+
+# Undo vrf_prepare: put the "local" table rule back at preference 0.
+vrf_cleanup()
+{
+ ip -6 rule add pref 0 table local
+ ip -6 rule del pref 32765
+ ip -4 rule add pref 0 table local
+ ip -4 rule del pref 32765
+}
+
+# Routing table IDs are handed out sequentially and remembered per VRF name
+# in __TB_IDS.
+# NOTE(review): both helpers hand the ID back through the function's exit
+# status, which callers read from $?; presumably that caps usable IDs at
+# what an exit status can carry (0-255) - confirm.
+__last_tb_id=0
+declare -A __TB_IDS
+
+# Assign the next table ID to VRF $1 and return it as the exit status.
+__vrf_td_id_assign()
+{
+ local vrf_name=$1
+
+ __last_tb_id=$((__last_tb_id + 1))
+ __TB_IDS[$vrf_name]=$__last_tb_id
+ return $__last_tb_id
+}
+
+# Return the table ID previously assigned to VRF $1 as the exit status.
+__vrf_td_id_lookup()
+{
+ local vrf_name=$1
+
+ return ${__TB_IDS[$vrf_name]}
+}
+
+# Create VRF device $1 bound to a newly assigned table and install
+# unreachable IPv4 and IPv6 default routes in that table.
+vrf_create()
+{
+ local vrf_name=$1
+ local tb_id
+
+ # The table ID comes back as the helper's exit status.
+ __vrf_td_id_assign $vrf_name
+ tb_id=$?
+
+ ip link add dev $vrf_name type vrf table $tb_id
+ ip -4 route add table $tb_id unreachable default metric 4278198272
+ ip -6 route add table $tb_id unreachable default metric 4278198272
+}
+
+# Remove the default routes installed by vrf_create and delete VRF
+# device $1.
+vrf_destroy()
+{
+ local vrf_name=$1
+ local tb_id
+
+ # The table ID comes back as the helper's exit status.
+ __vrf_td_id_lookup $vrf_name
+ tb_id=$?
+
+ ip -6 route del table $tb_id unreachable default metric 4278198272
+ ip -4 route del table $tb_id unreachable default metric 4278198272
+ ip link del dev $vrf_name
+}
+
+# Add or delete a list of addresses on an interface.
+# $1 - interface name.
+# $2 - "add" or "del", passed to "ip address".
+# Remaining arguments - the addresses.
+__addr_add_del()
+{
+	local if_name=$1
+	local add_del=$2
+	# Loop variable; local so the caller's namespace is not touched.
+	local addrstr
+
+	shift
+	shift
+
+	for addrstr in "$@"; do
+		ip address $add_del $addrstr dev $if_name
+	done
+}
+
+# Enslave interface $1 to VRF $2, bring it up and add the addresses given
+# in the remaining arguments.
+__simple_if_init()
+{
+	local port=$1; shift
+	local vrf=$1; shift
+
+	ip link set dev $port master $vrf
+	ip link set dev $port up
+
+	__addr_add_del $port add "$@"
+}
+
+# Remove the addresses given after $1 from interface $1, bring it down and
+# release it from its master.
+__simple_if_fini()
+{
+	local port=$1; shift
+
+	__addr_add_del $port del "$@"
+
+	ip link set dev $port down
+	ip link set dev $port nomaster
+}
+
+# Create VRF "v$1", bring it up, and set up interface $1 inside it with the
+# addresses given in the remaining arguments.
+simple_if_init()
+{
+	local port=$1; shift
+	local vrf=v$port
+
+	vrf_create $vrf
+	ip link set dev $vrf up
+	__simple_if_init $port $vrf "$@"
+}
+
+simple_if_fini()
+{
+ local if_name=$1
+ local vrf_name
+ local array
+
+ shift
+ vrf_name=v$if_name
+ array=("${@}")
+
+ __simple_if_fini $if_name "${array[@]}"
+ vrf_destroy $vrf_name
+}
+
+# Create a tunnel device and bring it up.
+# $1 - device name, $2 - link type, $3 - local address, $4 - remote address.
+# Remaining arguments are appended to "ip link add".
+tunnel_create()
+{
+	local tun_name=$1; shift
+	local tun_type=$1; shift
+	local local_addr=$1; shift
+	local remote_addr=$1; shift
+
+	ip link add name $tun_name type $tun_type \
+		local $local_addr remote $remote_addr "$@"
+	ip link set dev $tun_name up
+}
+
+# Delete tunnel device $1.
+tunnel_destroy()
+{
+	local tun_name=$1; shift
+
+	ip link del dev $tun_name
+}
+
+# Create VLAN device "$1.$2" on top of interface $1.
+# $2 - VLAN ID.
+# $3 - VRF to enslave the new device to; may be empty.
+# Remaining arguments - addresses to add to the new device.
+vlan_create()
+{
+	local parent=$1; shift
+	local vid=$1; shift
+	local vrf=$1; shift
+	local vlan_if=$parent.$vid
+
+	ip link add name $vlan_if link $parent type vlan id $vid
+	if [[ -n "$vrf" ]]; then
+		ip link set dev $vlan_if master $vrf
+	fi
+	ip link set dev $vlan_if up
+	__addr_add_del $vlan_if add "$@"
+}
+
+# Delete VLAN device "$1.$2".
+vlan_destroy()
+{
+	local parent=$1; shift
+	local vid=$1; shift
+
+	ip link del dev $parent.$vid
+}
+
+# Create team device $1 with runner mode $2 via teamd and enslave the
+# interfaces given in the remaining arguments.
+team_create()
+{
+	local if_name=$1; shift
+	local mode=$1; shift
+	# Loop variable; local so the caller's namespace is not touched.
+	local slave
+
+	require_command $TEAMD
+	$TEAMD -t $if_name -d -c '{"runner": {"name": "'$mode'"}}'
+	for slave in "$@"; do
+		# The port is brought down before being enslaved and back up
+		# afterwards.
+		ip link set dev $slave down
+		ip link set dev $slave master $if_name
+		ip link set dev $slave up
+	done
+	ip link set dev $if_name up
+}
+
+# Kill the teamd instance managing team device $1.
+team_destroy()
+{
+	local if_name=$1; shift
+
+	$TEAMD -t $if_name -k
+}
+
+# Print the "master" field of interface $1 as reported by "ip -j link".
+master_name_get()
+{
+ local if_name=$1
+
+ ip -j link show dev $if_name | jq -r '.[]["master"]'
+}
+
+# Print one stats64 counter of an interface.
+# $1 - interface, $2 - direction key (e.g. rx, tx), $3 - counter name.
+link_stats_get()
+{
+ local if_name=$1; shift
+ local dir=$1; shift
+ local stat=$1; shift
+
+ ip -j -s link show dev $if_name \
+ | jq '.[]["stats64"]["'$dir'"]["'$stat'"]'
+}
+
+# Print the TX packet counter of interface $1.
+link_stats_tx_packets_get()
+{
+ link_stats_get $1 tx packets
+}
+
+# Print the RX error counter of interface $1.
+link_stats_rx_errors_get()
+{
+ link_stats_get $1 rx errors
+}
+
+# Print action statistics of the filter with preference $2 on device $1.
+# $3 - direction; "ingress" is used when empty.
+# $4 - jq selector appended to ".stats"; defaults to ".packets".
+tc_rule_stats_get()
+{
+ local dev=$1; shift
+ local pref=$1; shift
+ local dir=$1; shift
+ local selector=${1:-.packets}; shift
+
+ tc -j -s filter show dev $dev ${dir:-ingress} pref $pref \
+ | jq ".[1].options.actions[].stats$selector"
+}
+
+# Print first-action statistics of the filter with handle $2.
+# $1 - filter location, passed as-is to "tc filter show".
+# $3 - jq selector appended to ".stats"; defaults to ".packets".
+tc_rule_handle_stats_get()
+{
+ local id=$1; shift
+ local handle=$1; shift
+ local selector=${1:-.packets}; shift
+
+ tc -j -s filter show $id \
+ | jq ".[] | select(.options.handle == $handle) | \
+ .options.actions[0].stats$selector"
+}
+
+# Print the value of the first "ethtool -S" counter of device $1 whose name
+# is $2.
+ethtool_stats_get()
+{
+ local dev=$1; shift
+ local stat=$1; shift
+
+ ethtool -S $dev | grep "^ *$stat:" | head -n 1 | cut -d: -f2
+}
+
+# Apply jq selector $3 to the qdisc with handle $2 on device $1.
+qdisc_stats_get()
+{
+ local dev=$1; shift
+ local handle=$1; shift
+ local selector=$1; shift
+
+ tc -j -s qdisc show dev "$dev" \
+ | jq '.[] | select(.handle == "'"$handle"'") | '"$selector"
+}
+
+# Apply jq selector $3 to the qdisc with parent $2 on device $1; the listing
+# is requested with "invisible".
+qdisc_parent_stats_get()
+{
+ local dev=$1; shift
+ local parent=$1; shift
+ local selector=$1; shift
+
+ tc -j -s qdisc show dev "$dev" invisible \
+ | jq '.[] | select(.parent == "'"$parent"'") | '"$selector"
+}
+
+# Print the second field of the lines in /proc/net/dev_snmp6/$1 that start
+# with counter name $2.
+ipv6_stats_get()
+{
+	local dev=$1; shift
+	local stat=$1; shift
+
+	grep "^$stat" < /proc/net/dev_snmp6/$dev | cut -f2
+}
+
+humanize()
+{
+ local speed=$1; shift
+
+ for unit in bps Kbps Mbps Gbps; do
+ if (($(echo "$speed < 1024" | bc))); then
+ break
+ fi
+
+ speed=$(echo "scale=1; $speed / 1024" | bc)
+ done
+
+ echo "$speed${unit}"
+}
+
+rate()
+{
+ local t0=$1; shift
+ local t1=$1; shift
+ local interval=$1; shift
+
+ echo $((8 * (t1 - t0) / interval))
+}
+
+# Print the "address" field of interface $1 as reported by "ip -j link".
+mac_get()
+{
+ local if_name=$1
+
+ ip -j link show dev $if_name | jq -r '.[]["address"]'
+}
+
+# Print the ageing time of bridge $1 in seconds.
+bridge_ageing_time_get()
+{
+ local bridge=$1
+ local ageing_time
+
+ # Need to divide by 100 to convert to seconds.
+ ageing_time=$(ip -j -d link show dev $bridge \
+ | jq '.[]["linkinfo"]["info_data"]["ageing_time"]')
+ echo $((ageing_time / 100))
+}
+
+# Original sysctl values, keyed by sysctl name, saved by sysctl_set.
+declare -A SYSCTL_ORIG
+# Set sysctl $1 to $2 after saving its current value in SYSCTL_ORIG.
+sysctl_set()
+{
+ local key=$1; shift
+ local value=$1; shift
+
+ SYSCTL_ORIG[$key]=$(sysctl -n $key)
+ sysctl -qw $key="$value"
+}
+
+# Write the value saved in SYSCTL_ORIG back to sysctl $1.
+sysctl_restore()
+{
+ local key=$1; shift
+
+ sysctl -qw $key="${SYSCTL_ORIG[$key]}"
+}
+
+# Enable IPv4 and IPv6 forwarding, remembering the previous settings.
+forwarding_enable()
+{
+ sysctl_set net.ipv4.conf.all.forwarding 1
+ sysctl_set net.ipv6.conf.all.forwarding 1
+}
+
+# Restore the forwarding settings saved by forwarding_enable.
+forwarding_restore()
+{
+ sysctl_restore net.ipv6.conf.all.forwarding
+ sysctl_restore net.ipv4.conf.all.forwarding
+}
+
+# Original MTUs, keyed by device name, saved by mtu_set.
+declare -A MTU_ORIG
+# Set the MTU of device $1 to $2 after saving the current MTU in MTU_ORIG.
+mtu_set()
+{
+ local dev=$1; shift
+ local mtu=$1; shift
+
+ MTU_ORIG["$dev"]=$(ip -j link show dev $dev | jq -e '.[].mtu')
+ ip link set dev $dev mtu $mtu
+}
+
+# Set the MTU of device $1 back to the value saved in MTU_ORIG.
+mtu_restore()
+{
+ local dev=$1; shift
+
+ ip link set dev $dev mtu ${MTU_ORIG["$dev"]}
+}
+
+tc_offload_check()
+{
+ local num_netifs=${1:-$NUM_NETIFS}
+
+ for ((i = 1; i <= num_netifs; ++i)); do
+ ethtool -k ${NETIFS[p$i]} \
+ | grep "hw-tc-offload: on" &> /dev/null
+ if [[ $? -ne 0 ]]; then
+ return 1
+ fi
+ done
+
+ return 0
+}
+
+# Install a preference-1 flower filter with a trap action on device $1 in
+# direction $2, falling back to a "continue" action when that fails.
+trap_install()
+{
+ local dev=$1; shift
+ local direction=$1; shift
+
+ # Some devices may not support or need in-hardware trapping of traffic
+ # (e.g. the veth pairs that this library creates for non-existent
+ # loopbacks). Use continue instead, so that there is a filter in there
+ # (some tests check counters), and so that other filters are still
+ # processed.
+ tc filter add dev $dev $direction pref 1 \
+ flower skip_sw action trap 2>/dev/null \
+ || tc filter add dev $dev $direction pref 1 \
+ flower action continue
+}
+
+# Remove the preference-1 flower filter from device $1 in direction $2.
+trap_uninstall()
+{
+ local dev=$1; shift
+ local direction=$1; shift
+
+ tc filter del dev $dev $direction pref 1 flower
+}
+
+slow_path_trap_install()
+{
+ # For slow-path testing, we need to install a trap to get to
+ # slow path the packets that would otherwise be switched in HW.
+ if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
+ trap_install "$@"
+ fi
+}
+
+slow_path_trap_uninstall()
+{
+ if [ "${tcflags/skip_hw}" != "$tcflags" ]; then
+ trap_uninstall "$@"
+ fi
+}
+
+# Add or delete an ingress flower filter matching ICMP on a device.
+# $1 - "add" or "del".
+# $2 - filter preference.
+# $3 - protocol suffix: "" for IPv4/ICMP, "v6" for IPv6/ICMPv6.
+# $4 - device.
+# $5 - extra flower match keys (word-split on purpose).
+__icmp_capture_add_del()
+{
+ local add_del=$1; shift
+ local pref=$1; shift
+ local vsuf=$1; shift
+ local tundev=$1; shift
+ local filter=$1; shift
+
+ tc filter $add_del dev "$tundev" ingress \
+ proto ip$vsuf pref $pref \
+ flower ip_proto icmp$vsuf $filter \
+ action pass
+}
+
+# Install the ICMP capture filter (preference 100) on device $1.
+icmp_capture_install()
+{
+ __icmp_capture_add_del add 100 "" "$@"
+}
+
+# Remove the ICMP capture filter (preference 100) from device $1.
+icmp_capture_uninstall()
+{
+ __icmp_capture_add_del del 100 "" "$@"
+}
+
+# Install the ICMPv6 capture filter (preference 100) on device $1.
+icmp6_capture_install()
+{
+ __icmp_capture_add_del add 100 v6 "$@"
+}
+
+# Remove the ICMPv6 capture filter (preference 100) from device $1.
+icmp6_capture_uninstall()
+{
+ __icmp_capture_add_del del 100 v6 "$@"
+}
+
+# Add or delete an ingress flower filter for protocol 802.1q on a device.
+# $1 - "add" or "del".
+# $2 - filter preference.
+# $3 - device.
+# $4 - flower match keys (word-split on purpose).
+__vlan_capture_add_del()
+{
+ local add_del=$1; shift
+ local pref=$1; shift
+ local dev=$1; shift
+ local filter=$1; shift
+
+ tc filter $add_del dev "$dev" ingress \
+ proto 802.1q pref $pref \
+ flower $filter \
+ action pass
+}
+
+# Install the VLAN capture filter (preference 100) on device $1.
+vlan_capture_install()
+{
+ __vlan_capture_add_del add 100 "$@"
+}
+
+# Remove the VLAN capture filter (preference 100) from device $1.
+vlan_capture_uninstall()
+{
+ __vlan_capture_add_del del 100 "$@"
+}
+
+# Add or delete eight ICMP capture filters, one per DSCP value in
+# base..base+7. The filter for a DSCP value uses preference "value + 100"
+# and matches ip_tos "value << 2" with skip_hw.
+# $1 - "add" or "del", $2 - device, $3 - base DSCP value.
+__dscp_capture_add_del()
+{
+	local add_del=$1; shift
+	local dev=$1; shift
+	local base=$1; shift
+	local dscp
+	# Loop variable; local so the caller's namespace is not touched.
+	local prio
+
+	for prio in {0..7}; do
+		dscp=$((base + prio))
+		__icmp_capture_add_del $add_del $((dscp + 100)) "" $dev \
+				       "skip_hw ip_tos $((dscp << 2))"
+	done
+}
+
+# Install the DSCP capture filters for base value $2 on device $1.
+dscp_capture_install()
+{
+	local dev=$1; shift
+	local base=$1; shift
+
+	__dscp_capture_add_del add $dev $base
+}
+
+# Remove the DSCP capture filters for base value $2 from device $1.
+dscp_capture_uninstall()
+{
+	local dev=$1; shift
+	local base=$1; shift
+
+	__dscp_capture_add_del del $dev $base
+}
+
+# Print one "[dscp]=count " line for each of the eight capture filters
+# installed for base value $2 on device $1.
+dscp_fetch_stats()
+{
+	local dev=$1; shift
+	local base=$1; shift
+	local prio
+	local dscp
+	local t
+
+	for prio in {0..7}; do
+		dscp=$((base + prio))
+		t=$(tc_rule_stats_get $dev $((dscp + 100)))
+		echo "[$dscp]=$t "
+	done
+}
+
+# Add a clsact qdisc to device $1 and an ingress matchall filter at
+# preference 10000 that drops every packet.
+matchall_sink_create()
+{
+ local dev=$1; shift
+
+ tc qdisc add dev $dev clsact
+ tc filter add dev $dev ingress \
+ pref 10000 \
+ matchall \
+ action drop
+}
+
+# Run each test listed in TESTS, or in ALL_TESTS when TESTS is unset or
+# empty.
+tests_run()
+{
+ local current_test
+
+ for current_test in ${TESTS:-$ALL_TESTS}; do
+ $current_test
+ done
+}
+
+# Compare the measured packet ratio between two paths with the ratio of
+# their configured weights and log the verdict.
+# $1 - test description.
+# $2, $3 - weights of the two paths.
+# $4, $5 - packet counts observed on the two paths.
+# The test fails when either packet count is 0 or when the measured ratio
+# deviates from the expected one by more than 15% of the expected ratio.
+multipath_eval()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local packets_rp12=$4
+ local packets_rp13=$5
+ local weights_ratio packets_ratio diff
+
+ RET=0
+
+ # Always divide the larger weight by the smaller one.
+ if [[ "$weight_rp12" -gt "$weight_rp13" ]]; then
+ weights_ratio=$(echo "scale=2; $weight_rp12 / $weight_rp13" \
+ | bc -l)
+ else
+ weights_ratio=$(echo "scale=2; $weight_rp13 / $weight_rp12" \
+ | bc -l)
+ fi
+
+ # NOTE(review): this early-exit path logs under the fixed name
+ # "Multipath" rather than $desc.
+ if [[ "$packets_rp12" -eq "0" || "$packets_rp13" -eq "0" ]]; then
+ check_err 1 "Packet difference is 0"
+ log_test "Multipath"
+ log_info "Expected ratio $weights_ratio"
+ return
+ fi
+
+ # The packet ratio is taken in the same direction as the weight ratio.
+ if [[ "$weight_rp12" -gt "$weight_rp13" ]]; then
+ packets_ratio=$(echo "scale=2; $packets_rp12 / $packets_rp13" \
+ | bc -l)
+ else
+ packets_ratio=$(echo "scale=2; $packets_rp13 / $packets_rp12" \
+ | bc -l)
+ fi
+
+ diff=$(echo $weights_ratio - $packets_ratio | bc -l)
+ # Drop a leading minus sign to get the absolute difference.
+ diff=${diff#-}
+
+ test "$(echo "$diff / $weights_ratio > 0.15" | bc -l)" -eq 0
+ check_err $? "Too large discrepancy between expected and measured ratios"
+ log_test "$desc"
+ log_info "Expected ratio $weights_ratio Measured ratio $packets_ratio"
+}
+
+# Run a command inside network namespace $1. A fresh bash sources lib.sh
+# there (with NUM_NETIFS=0) and then executes the remaining arguments, which
+# are re-quoted with printf %q before being placed in the here-document.
+in_ns()
+{
+ local name=$1; shift
+
+ ip netns exec $name bash <<-EOF
+ NUM_NETIFS=0
+ source lib.sh
+ $(for a in "$@"; do printf "%q${IFS:0:1}" "$a"; done)
+ EOF
+}
+
+##############################################################################
+# Tests
+
+# Ping address $2 from the VRF that interface $1 is enslaved to.
+# $3 - optional extra ping arguments (word-split on purpose).
+# Output is discarded; the exit status is that of the ping run.
+ping_do()
+{
+ local if_name=$1
+ local dip=$2
+ local args=$3
+ local vrf_name
+
+ vrf_name=$(master_name_get $if_name)
+ ip vrf exec $vrf_name \
+ $PING $args $dip -c 10 -i 0.1 -w $PING_TIMEOUT &> /dev/null
+}
+
+# Ping $2 via interface $1 and log the result as "ping$3".
+ping_test()
+{
+ RET=0
+
+ ping_do $1 $2
+ check_err $?
+ log_test "ping$3"
+}
+
+# IPv6 counterpart of ping_do; uses $PING6.
+ping6_do()
+{
+ local if_name=$1
+ local dip=$2
+ local args=$3
+ local vrf_name
+
+ vrf_name=$(master_name_get $if_name)
+ ip vrf exec $vrf_name \
+ $PING6 $args $dip -c 10 -i 0.1 -w $PING_TIMEOUT &> /dev/null
+}
+
+# Ping $2 via interface $1 over IPv6 and log the result as "ping6$3".
+ping6_test()
+{
+ RET=0
+
+ ping6_do $1 $2
+ check_err $?
+ log_test "ping6$3"
+}
+
+# Exercise FDB learning on a bridge port.
+# $1 - bridge, $2 - bridge port connected to $3, $3 - first host interface,
+# $4 - second host interface.
+# Checks, in order: no FDB entry for the test MAC exists; a packet to that
+# MAC is not delivered to $3 while flooding is off; an entry is learned once
+# $3 sends from that MAC and traffic to it then reaches $3; the entry ages
+# out; and nothing is learned while learning is off on the port.
+learning_test()
+{
+ local bridge=$1
+ local br_port1=$2 # Connected to `host1_if`.
+ local host1_if=$3
+ local host2_if=$4
+ local mac=de:ad:be:ef:13:37
+ local ageing_time
+
+ RET=0
+
+ bridge -j fdb show br $bridge brport $br_port1 \
+ | jq -e ".[] | select(.mac == \"$mac\")" &> /dev/null
+ check_fail $? "Found FDB record when should not"
+
+ # Disable unknown unicast flooding on `br_port1` to make sure
+ # packets are only forwarded through the port after a matching
+ # FDB entry was installed.
+ bridge link set dev $br_port1 flood off
+
+ # Count packets destined to the test MAC as they arrive on `host1_if`.
+ ip link set $host1_if promisc on
+ tc qdisc add dev $host1_if ingress
+ tc filter add dev $host1_if ingress protocol ip pref 1 handle 101 \
+ flower dst_mac $mac action drop
+
+ $MZ $host2_if -c 1 -p 64 -b $mac -t ip -q
+ sleep 1
+
+ tc -j -s filter show dev $host1_if ingress \
+ | jq -e ".[] | select(.options.handle == 101) \
+ | select(.options.actions[0].stats.packets == 1)" &> /dev/null
+ check_fail $? "Packet reached first host when should not"
+
+ # Send from the test MAC so that the bridge can learn it.
+ $MZ $host1_if -c 1 -p 64 -a $mac -t ip -q
+ sleep 1
+
+ bridge -j fdb show br $bridge brport $br_port1 \
+ | jq -e ".[] | select(.mac == \"$mac\")" &> /dev/null
+ check_err $? "Did not find FDB record when should"
+
+ $MZ $host2_if -c 1 -p 64 -b $mac -t ip -q
+ sleep 1
+
+ # NOTE(review): the counter checked here is the filter on `host1_if`,
+ # while the message below speaks of the second host.
+ tc -j -s filter show dev $host1_if ingress \
+ | jq -e ".[] | select(.options.handle == 101) \
+ | select(.options.actions[0].stats.packets == 1)" &> /dev/null
+ check_err $? "Packet did not reach second host when should"
+
+ # Wait for 10 seconds after the ageing time to make sure FDB
+ # record was aged-out.
+ ageing_time=$(bridge_ageing_time_get $bridge)
+ sleep $((ageing_time + 10))
+
+ bridge -j fdb show br $bridge brport $br_port1 \
+ | jq -e ".[] | select(.mac == \"$mac\")" &> /dev/null
+ check_fail $? "Found FDB record when should not"
+
+ bridge link set dev $br_port1 learning off
+
+ $MZ $host1_if -c 1 -p 64 -a $mac -t ip -q
+ sleep 1
+
+ bridge -j fdb show br $bridge brport $br_port1 \
+ | jq -e ".[] | select(.mac == \"$mac\")" &> /dev/null
+ check_fail $? "Found FDB record when should not"
+
+ # Put the port and the host interface back the way they were.
+ bridge link set dev $br_port1 learning on
+
+ tc filter del dev $host1_if ingress protocol ip pref 1 handle 101 flower
+ tc qdisc del dev $host1_if ingress
+ ip link set $host1_if promisc off
+
+ bridge link set dev $br_port1 flood on
+
+ log_test "FDB learning"
+}
+
+# Send one packet from $4 and check whether it shows up on $5.
+# $1 - "true" when the packet is expected on $5, "false" when it is not.
+# $2 - destination MAC, $3 - destination IP.
+# $4 - sending host interface, $5 - receiving host interface.
+# Returns 1 when the observation contradicts $1, 0 otherwise.
+flood_test_do()
+{
+ local should_flood=$1
+ local mac=$2
+ local ip=$3
+ local host1_if=$4
+ local host2_if=$5
+ local err=0
+
+ # Add an ACL on `host2_if` which will tell us whether the packet
+ # was flooded to it or not.
+ ip link set $host2_if promisc on
+ tc qdisc add dev $host2_if ingress
+ tc filter add dev $host2_if ingress protocol ip pref 1 handle 101 \
+ flower dst_mac $mac action drop
+
+ $MZ $host1_if -c 1 -p 64 -b $mac -B $ip -t ip -q
+ sleep 1
+
+ tc -j -s filter show dev $host2_if ingress \
+ | jq -e ".[] | select(.options.handle == 101) \
+ | select(.options.actions[0].stats.packets == 1)" &> /dev/null
+ # Both uses of $? below refer to the pipeline above.
+ if [[ $? -ne 0 && $should_flood == "true" || \
+ $? -eq 0 && $should_flood == "false" ]]; then
+ err=1
+ fi
+
+ tc filter del dev $host2_if ingress protocol ip pref 1 handle 101 flower
+ tc qdisc del dev $host2_if ingress
+ ip link set $host2_if promisc off
+
+ return $err
+}
+
+# Check unknown unicast flooding on bridge port $1 with "flood" off, then on.
+# $2 - sending host interface, $3 - receiving host interface.
+flood_unicast_test()
+{
+ local br_port=$1
+ local host1_if=$2
+ local host2_if=$3
+ local mac=de:ad:be:ef:13:37
+ local ip=192.0.2.100
+
+ RET=0
+
+ bridge link set dev $br_port flood off
+
+ flood_test_do false $mac $ip $host1_if $host2_if
+ check_err $? "Packet flooded when should not"
+
+ bridge link set dev $br_port flood on
+
+ flood_test_do true $mac $ip $host1_if $host2_if
+ check_err $? "Packet was not flooded when should"
+
+ log_test "Unknown unicast flood"
+}
+
+# Check unregistered multicast flooding on bridge port $1 with "mcast_flood"
+# off, then on.
+# $2 - sending host interface, $3 - receiving host interface.
+flood_multicast_test()
+{
+ local br_port=$1
+ local host1_if=$2
+ local host2_if=$3
+ local mac=01:00:5e:00:00:01
+ local ip=239.0.0.1
+
+ RET=0
+
+ bridge link set dev $br_port mcast_flood off
+
+ flood_test_do false $mac $ip $host1_if $host2_if
+ check_err $? "Packet flooded when should not"
+
+ bridge link set dev $br_port mcast_flood on
+
+ flood_test_do true $mac $ip $host1_if $host2_if
+ check_err $? "Packet was not flooded when should"
+
+ log_test "Unregistered multicast flood"
+}
+
+# Run the unicast and multicast flood tests.
+flood_test()
+{
+ # `br_port` is connected to `host2_if`
+ local br_port=$1
+ local host1_if=$2
+ local host2_if=$3
+
+ flood_unicast_test $br_port $host1_if $host2_if
+ flood_multicast_test $br_port $host1_if $host2_if
+}
+
+# Start $MZ in the background sending a continuous stream (-c 0), then sleep
+# for one second.
+# $1 - packet type passed to -t (e.g. udp, tcp).
+# $2 - egress interface, $3 - source IP, $4 - destination IP,
+# $5 - destination MAC. Remaining arguments are appended to the $MZ command.
+__start_traffic()
+{
+ local proto=$1; shift
+ local h_in=$1; shift # Where the traffic egresses the host
+ local sip=$1; shift
+ local dip=$1; shift
+ local dmac=$1; shift
+
+ $MZ $h_in -p 8000 -A $sip -B $dip -c 0 \
+ -a own -b $dmac -t "$proto" -q "$@" &
+ sleep 1
+}
+
+# Start background UDP traffic; arguments as for __start_traffic minus $1.
+start_traffic()
+{
+ __start_traffic udp "$@"
+}
+
+# Start background TCP traffic; arguments as for __start_traffic minus $1.
+start_tcp_traffic()
+{
+ __start_traffic tcp "$@"
+}
+
+# Kill the current background job (%%) and wait for it to exit.
+stop_traffic()
+{
+ # Suppress noise from killing mausezahn.
+ { kill %% && wait %%; } 2>/dev/null
+}
+
+tcpdump_start()
+{
+ local if_name=$1; shift
+ local ns=$1; shift
+
+ capfile=$(mktemp)
+ capout=$(mktemp)
+
+ if [ -z $ns ]; then
+ ns_cmd=""
+ else
+ ns_cmd="ip netns exec ${ns}"
+ fi
+
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ $ns_cmd tcpdump -e -n -Q in -i $if_name \
+ -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+}
+
+# Kill the tcpdump started by tcpdump_start (PID in cappid), then sleep for
+# one second.
+tcpdump_stop()
+{
+ $ns_cmd kill $cappid
+ sleep 1
+}
+
+# Remove the temporary files created by tcpdump_start.
+tcpdump_cleanup()
+{
+ rm $capfile $capout
+}
+
+# Print the packets stored in the capture file.
+tcpdump_show()
+{
+ tcpdump -e -n -r $capfile 2>&1
+}
diff --git a/tools/testing/selftests/net/forwarding/loopback.sh b/tools/testing/selftests/net/forwarding/loopback.sh
new file mode 100755
index 000000000..8f4057310
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/loopback.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+ALL_TESTS="loopback_test"
+NUM_NETIFS=2
+source tc_common.sh
+source lib.sh
+
+# Set up $h1 in its own VRF with 192.0.2.1/24 and attach a clsact qdisc.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+ tc qdisc add dev $h1 clsact
+}
+
+# Undo h1_create.
+h1_destroy()
+{
+ tc qdisc del dev $h1 clsact
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+# Set up $h2 in its own VRF, without addresses.
+h2_create()
+{
+ simple_if_init $h2
+}
+
+# Undo h2_create.
+h2_destroy()
+{
+ simple_if_fini $h2
+}
+
+# Check the "loopback" ethtool feature on $h1: an ARP packet sent from $h1
+# must hit the ingress filter on $h1 only while loopback is enabled.
+loopback_test()
+{
+ RET=0
+
+ # Counts ARP replies for 192.0.2.1 arriving on $h1.
+ tc filter add dev $h1 ingress protocol arp pref 1 handle 101 flower \
+ skip_hw arp_op reply arp_tip 192.0.2.1 action drop
+
+ $MZ $h1 -c 1 -t arp -q
+
+ tc_check_packets "dev $h1 ingress" 101 1
+ check_fail $? "Matched on a filter without loopback setup"
+
+ ethtool -K $h1 loopback on
+ check_err $? "Failed to enable loopback"
+
+ setup_wait_dev $h1
+
+ $MZ $h1 -c 1 -t arp -q
+
+ tc_check_packets "dev $h1 ingress" 101 1
+ check_err $? "Did not match on filter with loopback"
+
+ ethtool -K $h1 loopback off
+ check_err $? "Failed to disable loopback"
+
+ $MZ $h1 -c 1 -t arp -q
+
+ # A second hit would mean the packet still looped back.
+ tc_check_packets "dev $h1 ingress" 101 2
+ check_fail $? "Matched on a filter after loopback was removed"
+
+ tc filter del dev $h1 ingress protocol arp pref 1 handle 101 flower
+
+ log_test "loopback"
+}
+
+# Bind h1/h2 to the first two interfaces, build the topology, and skip the
+# test when ethtool reports the loopback feature of $h1 as fixed.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ h2=${NETIFS[p2]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ if ethtool -k $h1 | grep loopback | grep -q fixed; then
+ log_test "SKIP: dev $h1 does not support loopback feature"
+ exit $ksft_skip
+ fi
+}
+
+# Tear the topology down in reverse order of setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Make sure cleanup runs on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre.sh b/tools/testing/selftests/net/forwarding/mirror_gre.sh
new file mode 100755
index 000000000..026644360
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" when the device to mirror to is a
+# gretap or ip6gretap netdevice. Expect that the packets come out encapsulated,
+# and another gretap / ip6gretap netdevice is then capable of decapsulating the
+# traffic. Test that the payload is what is expected (ICMP ping request or
+# reply, depending on test).
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+ test_gretap_mac
+ test_ip6gretap_mac
+ test_two_spans
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+# Bind the six interfaces to their roles, create the mirror_gre topology
+# and address the $swp3 / $h3 link for IPv4 and IPv6.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip address add dev $swp3 192.0.2.129/28
+ ip address add dev $h3 192.0.2.130/28
+
+ ip address add dev $swp3 2001:db8:2::1/64
+ ip address add dev $h3 2001:db8:2::2/64
+}
+
+# Undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $swp3 2001:db8:2::1/64
+
+ ip address del dev $h3 192.0.2.130/28
+ ip address del dev $swp3 192.0.2.129/28
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+# Mirror traffic on $swp1 to a tunnel device and check, on the matching
+# h3-<tundev> device, for ICMP packets carrying the expected source and
+# destination MAC addresses.
+# $1 - tunnel device to mirror to.
+# $2 - mirror direction: "ingress" or "egress".
+# $3 - description used in the test name.
+test_span_gre_mac()
+{
+ local tundev=$1; shift
+ local direction=$1; shift
+ local what=$1; shift
+
+ # The expected MACs are those of $h1 and $h2, swapped for egress.
+ case "$direction" in
+ ingress) local src_mac=$(mac_get $h1); local dst_mac=$(mac_get $h2)
+ ;;
+ egress) local src_mac=$(mac_get $h2); local dst_mac=$(mac_get $h1)
+ ;;
+ esac
+
+ RET=0
+
+ mirror_install $swp1 $direction $tundev "matchall $tcflags"
+ icmp_capture_install h3-${tundev} "src_mac $src_mac dst_mac $dst_mac"
+
+ mirror_test v$h1 192.0.2.1 192.0.2.2 h3-${tundev} 100 10
+
+ icmp_capture_uninstall h3-${tundev}
+ mirror_uninstall $swp1 $direction
+
+ log_test "$direction $what: envelope MAC ($tcflags)"
+}
+
+# Configure an ingress mirror to gt4 and an egress mirror to gt6 on $swp1 at
+# the same time, then remove and re-add them one at a time, checking after
+# each step which of the two still mirrors.
+test_two_spans()
+{
+ RET=0
+
+ mirror_install $swp1 ingress gt4 "matchall $tcflags"
+ mirror_install $swp1 egress gt6 "matchall $tcflags"
+ quick_test_span_gre_dir gt4 ingress
+ quick_test_span_gre_dir gt6 egress
+
+ # Only the egress mirror remains.
+ mirror_uninstall $swp1 ingress
+ fail_test_span_gre_dir gt4 ingress
+ quick_test_span_gre_dir gt6 egress
+
+ # Only the ingress mirror remains.
+ mirror_install $swp1 ingress gt4 "matchall $tcflags"
+ mirror_uninstall $swp1 egress
+ quick_test_span_gre_dir gt4 ingress
+ fail_test_span_gre_dir gt6 egress
+
+ mirror_uninstall $swp1 ingress
+ log_test "two simultaneously configured mirrors ($tcflags)"
+}
+
+# Full mirroring test to gt4 (gretap), ingress then egress.
+test_gretap()
+{
+ full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+ full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+# Full mirroring test to gt6 (ip6gretap), ingress then egress.
+test_ip6gretap()
+{
+ full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
+ full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
+}
+
+# Envelope MAC test for gt4 (gretap), ingress then egress.
+test_gretap_mac()
+{
+ test_span_gre_mac gt4 ingress "mirror to gretap"
+ test_span_gre_mac gt4 egress "mirror to gretap"
+}
+
+# Envelope MAC test for gt6 (ip6gretap), ingress then egress.
+test_ip6gretap_mac()
+{
+ test_span_gre_mac gt6 ingress "mirror to ip6gretap"
+ test_span_gre_mac gt6 egress "mirror to ip6gretap"
+}
+
+# Run all tests with slow-path traps installed on $swp1 in both directions
+# (the traps are only installed when tcflags contains skip_hw).
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+# Make sure cleanup runs on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: software datapath only.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when every interface has hw-tc-offload on.
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
new file mode 100755
index 000000000..360ca133b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bound.sh
@@ -0,0 +1,226 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +---------------------+ +---------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +-----|---------------+ +---------------|-----+
+# | |
+# +-----|-------------------------------------------------------------|-----+
+# | SW o--> mirror | |
+# | +---|-------------------------------------------------------------|---+ |
+# | | + $swp1 BR $swp2 + | |
+# | +---------------------------------------------------------------------+ |
+# | |
+# | +---------------------------------------------------------------------+ |
+# | | OL + gt6 (ip6gretap) + gt4 (gretap) | |
+# | | : loc=2001:db8:2::1 : loc=192.0.2.129 | |
+# | | : rem=2001:db8:2::2 : rem=192.0.2.130 | |
+# | | : ttl=100 : ttl=100 | |
+# | | : tos=inherit : tos=inherit | |
+# | +-------------------------:--|-------------------:--|-----------------+ |
+# | : | : | |
+# | +-------------------------:--|-------------------:--|-----------------+ |
+# | | UL : |,---------------------' | |
+# | | + $swp3 : || : | |
+# | | | 192.0.2.129/28 : vv : | |
+# | | | 2001:db8:2::1/64 : + ul (dummy) : | |
+# | +---|---------------------:----------------------:--------------------+ |
+# +-----|---------------------:----------------------:----------------------+
+# | : :
+# +-----|---------------------:----------------------:----------------------+
+# | H3 + $h3 + h3-gt6 (ip6gretap) + h3-gt4 (gretap) |
+# | 192.0.2.130/28 loc=2001:db8:2::2 loc=192.0.2.130 |
+# | 2001:db8:2::2/64 rem=2001:db8:2::1 rem=192.0.2.129 |
+# | ttl=100 ttl=100 |
+# | tos=inherit tos=inherit |
+# | |
+# +-------------------------------------------------------------------------+
+#
+# This tests mirroring to gretap and ip6gretap configured in an overlay /
+# underlay manner, i.e. with a bound dummy device that marks the underlay VRF
+# where the encapsulated packet should be routed.
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+}
+
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+h3_create()
+{
+ simple_if_init $h3 192.0.2.130/28 2001:db8:2::2/64
+
+ tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129
+ ip link set h3-gt4 vrf v$h3
+ matchall_sink_create h3-gt4
+
+ tunnel_create h3-gt6 ip6gretap 2001:db8:2::2 2001:db8:2::1
+ ip link set h3-gt6 vrf v$h3
+ matchall_sink_create h3-gt6
+}
+
+h3_destroy()
+{
+ tunnel_destroy h3-gt6
+ tunnel_destroy h3-gt4
+
+ simple_if_fini $h3 192.0.2.130/28 2001:db8:2::2/64
+}
+
+switch_create()
+{
+ # Bridge between H1 and H2.
+
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+
+ # Underlay.
+
+ simple_if_init $swp3 192.0.2.129/28 2001:db8:2::1/64
+
+ ip link add name ul type dummy
+ ip link set dev ul master v$swp3
+ ip link set dev ul up
+
+ # Overlay.
+
+ vrf_create vrf-ol
+ ip link set dev vrf-ol up
+
+ tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+ ttl 100 tos inherit dev ul
+ ip link set dev gt4 master vrf-ol
+ ip link set dev gt4 up
+
+ tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \
+ ttl 100 tos inherit dev ul allow-localremote
+ ip link set dev gt6 master vrf-ol
+ ip link set dev gt6 up
+}
+
+switch_destroy()
+{
+ vrf_destroy vrf-ol
+
+ tunnel_destroy gt6
+ tunnel_destroy gt4
+
+ simple_if_fini $swp3 192.0.2.129/28 2001:db8:2::1/64
+
+ ip link del dev ul
+
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp1 down
+ ip link set dev $swp2 down
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+test_gretap()
+{
+ full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap w/ UL"
+ full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap w/ UL"
+}
+
+test_ip6gretap()
+{
+ full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap w/ UL"
+ full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap w/ UL"
+}
+
+test_all()
+{
+ RET=0
+
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh
new file mode 100755
index 000000000..c5095da7f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# bridge device without vlan filtering (802.1d).
+#
+# This test uses standard topology for testing mirror-to-gretap. See
+# mirror_gre_topo_lib.sh for more details. The full topology is as follows:
+#
+# +---------------------+ +---------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +-----|---------------+ +---------------|-----+
+# | |
+# +-----|-------------------------------------------------------------|-----+
+# | SW o---> mirror | |
+# | +---|-------------------------------------------------------------|---+ |
+# | | + $swp1 + br1 (802.1q bridge) $swp2 + | |
+# | +---------------------------------------------------------------------+ |
+# | |
+# | +---------------------------------------------------------------------+ |
+# | | + br2 (802.1d bridge) | |
+# | | 192.0.2.129/28 | |
+# | | + $swp3 2001:db8:2::1/64 | |
+# | +---|-----------------------------------------------------------------+ |
+# | | ^ ^ |
+# | | + gt6 (ip6gretap) | + gt4 (gretap) | |
+# | | : loc=2001:db8:2::1 | : loc=192.0.2.129 | |
+# | | : rem=2001:db8:2::2 -+ : rem=192.0.2.130 -+ |
+# | | : ttl=100 : ttl=100 |
+# | | : tos=inherit : tos=inherit |
+# +-----|---------------------:----------------------:----------------------+
+# | : :
+# +-----|---------------------:----------------------:----------------------+
+# | H3 + $h3 + h3-gt6(ip6gretap) + h3-gt4 (gretap) |
+# | 192.0.2.130/28 loc=2001:db8:2::2 loc=192.0.2.130 |
+# | 2001:db8:2::2/64 rem=2001:db8:2::1 rem=192.0.2.129 |
+# | ttl=100 ttl=100 |
+# | tos=inherit tos=inherit |
+# +-------------------------------------------------------------------------+
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip link add name br2 type bridge vlan_filtering 0
+ ip link set dev br2 up
+
+ ip link set dev $swp3 master br2
+ ip route add 192.0.2.130/32 dev br2
+ ip -6 route add 2001:db8:2::2/128 dev br2
+
+ ip address add dev br2 192.0.2.129/28
+ ip address add dev br2 2001:db8:2::1/64
+
+ ip address add dev $h3 192.0.2.130/28
+ ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $h3 192.0.2.130/28
+ ip link del dev br2
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_gretap()
+{
+ full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+ full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
+ full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
+}
+
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
new file mode 100755
index 000000000..f8cda822c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1d_vlan.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# bridge device without vlan filtering (802.1d). The device attached to that
+# bridge is a VLAN.
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+ test_gretap_stp
+ test_ip6gretap_stp
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip link add name br2 type bridge vlan_filtering 0
+ ip link set dev br2 up
+
+ vlan_create $swp3 555
+
+ ip link set dev $swp3.555 master br2
+ ip route add 192.0.2.130/32 dev br2
+ ip -6 route add 2001:db8:2::2/128 dev br2
+
+ ip address add dev br2 192.0.2.129/32
+ ip address add dev br2 2001:db8:2::1/128
+
+ vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ vlan_destroy $h3 555
+ ip link del dev br2
+ vlan_destroy $swp3 555
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_vlan_match()
+{
+ local tundev=$1; shift
+ local vlan_match=$1; shift
+ local what=$1; shift
+
+ full_test_span_gre_dir_vlan $tundev ingress "$vlan_match" 8 0 "$what"
+ full_test_span_gre_dir_vlan $tundev egress "$vlan_match" 0 8 "$what"
+}
+
+test_gretap()
+{
+ test_vlan_match gt4 'skip_hw vlan_id 555 vlan_ethtype ip' \
+ "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ip' \
+ "mirror to ip6gretap"
+}
+
+test_gretap_stp()
+{
+ # Sometimes after mirror installation, the neighbor's state is not valid.
+ # The reason is that there is no SW datapath activity related to the
+ # neighbor for the remote GRE address. Therefore whether the corresponding
+ # neighbor will be valid is a matter of luck, and the test is thus racy.
+ # Set the neighbor's state to permanent, so that it is always valid.
+ ip neigh replace 192.0.2.130 lladdr $(mac_get $h3) \
+ nud permanent dev br2
+ full_test_span_gre_stp gt4 $swp3.555 "mirror to gretap"
+}
+
+test_ip6gretap_stp()
+{
+ ip neigh replace 2001:db8:2::2 lladdr $(mac_get $h3) \
+ nud permanent dev br2
+ full_test_span_gre_stp gt6 $swp3.555 "mirror to ip6gretap"
+}
+
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh
new file mode 100755
index 000000000..9ff22f280
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# bridge device with vlan filtering (802.1q).
+#
+# This test uses standard topology for testing mirror-to-gretap. See
+# mirror_gre_topo_lib.sh for more details. The full topology is as follows:
+#
+# +---------------------+ +---------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +-----|---------------+ +---------------|-----+
+# | |
+# +-----|---------------------------------------------------------------|-----+
+# | SW o---> mirror | |
+# | +---|---------------------------------------------------------------|---+ |
+# | | + $swp1 + br1 (802.1q bridge) $swp2 + | |
+# | | 192.0.2.129/28 | |
+# | | + $swp3 2001:db8:2::1/64 | |
+# | | | vid555 vid555[pvid,untagged] | |
+# | +---|-------------------------------------------------------------------+ |
+# | | ^ ^ |
+# | | + gt6 (ip6gretap) | + gt4 (gretap) | |
+# | | : loc=2001:db8:2::1 | : loc=192.0.2.129 | |
+# | | : rem=2001:db8:2::2 -+ : rem=192.0.2.130 -+ |
+# | | : ttl=100 : ttl=100 |
+# | | : tos=inherit : tos=inherit |
+# +-----|---------------------:------------------------:----------------------+
+# | : :
+# +-----|---------------------:------------------------:----------------------+
+# | H3 + $h3 + h3-gt6(ip6gretap) + h3-gt4 (gretap) |
+# | | loc=2001:db8:2::2 loc=192.0.2.130 |
+# | + $h3.555 rem=2001:db8:2::1 rem=192.0.2.129 |
+# | 192.0.2.130/28 ttl=100 ttl=100 |
+# | 2001:db8:2::2/64 tos=inherit tos=inherit |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+ # Avoid changing br1's PVID while it is operational as a L3 interface.
+ ip link set dev br1 down
+
+ ip link set dev $swp3 master br1
+ bridge vlan add dev br1 vid 555 pvid untagged self
+ ip link set dev br1 up
+ ip address add dev br1 192.0.2.129/28
+ ip address add dev br1 2001:db8:2::1/64
+
+ ip -4 route add 192.0.2.130/32 dev br1
+ ip -6 route add 2001:db8:2::2/128 dev br1
+
+ vlan_create $h3 555 v$h3 192.0.2.130/28 2001:db8:2::2/64
+ bridge vlan add dev $swp3 vid 555
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip link set dev $swp3 nomaster
+ vlan_destroy $h3 555
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_gretap()
+{
+ full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+ full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ full_test_span_gre_dir gt6 ingress 8 0 "mirror to ip6gretap"
+ full_test_span_gre_dir gt6 egress 0 8 "mirror to ip6gretap"
+}
+
+tests()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+tests
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh
new file mode 100755
index 000000000..28d568c48
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_bridge_1q_lag.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# bridge device with vlan filtering (802.1q), and the egress device is a team
+# device.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1.333 | | $h1.555 + |
+# | | 192.0.2.1/28 | | 192.0.2.18/28 | |
+# +-----|----------------+ +----------------|-----+
+# | $h1 |
+# +--------------------------------+------------------------------+
+# |
+# +--------------------------------------|------------------------------------+
+# | SW o---> mirror |
+# | | |
+# | +--------------------------------+------------------------------+ |
+# | | $swp1 | |
+# | + $swp1.333 $swp1.555 + |
+# | 192.0.2.2/28 192.0.2.17/28 |
+# | |
+# | +-----------------------------------------------------------------------+ |
+# | | BR1 (802.1q) | |
+# | | + lag (team) 192.0.2.129/28 | |
+# | | / \ 2001:db8:2::1/64 | |
+# | +---/---\---------------------------------------------------------------+ |
+# | / \ ^ |
+# | | \ + gt4 (gretap) | |
+# | | \ loc=192.0.2.129 | |
+# | | \ rem=192.0.2.130 -+ |
+# | | \ ttl=100 |
+# | | \ tos=inherit |
+# | | \ |
+# | | \_________________________________ |
+# | | \ |
+# | + $swp3 + $swp4 |
+# +---|------------------------------------------------|----------------------+
+# | |
+# +---|----------------------+ +---|----------------------+
+# | + $h3 H3 | | + $h4 H4 |
+# | 192.0.2.130/28 | | 192.0.2.130/28 |
+# | 2001:db8:2::2/64 | | 2001:db8:2::2/64 |
+# +--------------------------+ +--------------------------+
+
+ALL_TESTS="
+ test_mirror_gretap_first
+ test_mirror_gretap_second
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+
+require_command $ARPING
+
+vlan_host_create()
+{
+ local if_name=$1; shift
+ local vid=$1; shift
+ local vrf_name=$1; shift
+ local ips=("${@}")
+
+ vrf_create $vrf_name
+ ip link set dev $vrf_name up
+ vlan_create $if_name $vid $vrf_name "${ips[@]}"
+}
+
+vlan_host_destroy()
+{
+ local if_name=$1; shift
+ local vid=$1; shift
+ local vrf_name=$1; shift
+
+ vlan_destroy $if_name $vid
+ ip link set dev $vrf_name down
+ vrf_destroy $vrf_name
+}
+
+h1_create()
+{
+ vlan_host_create $h1 333 vrf-h1 192.0.2.1/28
+ ip -4 route add 192.0.2.16/28 vrf vrf-h1 nexthop via 192.0.2.2
+}
+
+h1_destroy()
+{
+ ip -4 route del 192.0.2.16/28 vrf vrf-h1
+ vlan_host_destroy $h1 333 vrf-h1
+}
+
+h2_create()
+{
+ vlan_host_create $h1 555 vrf-h2 192.0.2.18/28
+ ip -4 route add 192.0.2.0/28 vrf vrf-h2 nexthop via 192.0.2.17
+}
+
+h2_destroy()
+{
+ ip -4 route del 192.0.2.0/28 vrf vrf-h2
+ vlan_host_destroy $h1 555 vrf-h2
+}
+
+h3_create()
+{
+ simple_if_init $h3 192.0.2.130/28
+ tc qdisc add dev $h3 clsact
+}
+
+h3_destroy()
+{
+ tc qdisc del dev $h3 clsact
+ simple_if_fini $h3 192.0.2.130/28
+}
+
+h4_create()
+{
+ simple_if_init $h4 192.0.2.130/28
+ tc qdisc add dev $h4 clsact
+}
+
+h4_destroy()
+{
+ tc qdisc del dev $h4 clsact
+ simple_if_fini $h4 192.0.2.130/28
+}
+
+switch_create()
+{
+ ip link set dev $swp1 up
+ tc qdisc add dev $swp1 clsact
+ vlan_create $swp1 333 "" 192.0.2.2/28
+ vlan_create $swp1 555 "" 192.0.2.17/28
+
+ tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+ ttl 100 tos inherit
+
+ ip link set dev $swp3 up
+ ip link set dev $swp4 up
+
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+ __addr_add_del br1 add 192.0.2.129/32
+ ip -4 route add 192.0.2.130/32 dev br1
+
+ team_create lag loadbalance $swp3 $swp4
+ ip link set dev lag master br1
+}
+
+switch_destroy()
+{
+ ip link set dev lag nomaster
+ team_destroy lag
+
+ ip -4 route del 192.0.2.130/32 dev br1
+ __addr_add_del br1 del 192.0.2.129/32
+ ip link set dev br1 down
+ ip link del dev br1
+
+ ip link set dev $swp4 down
+ ip link set dev $swp3 down
+
+ tunnel_destroy gt4
+
+ vlan_destroy $swp1 555
+ vlan_destroy $swp1 333
+ tc qdisc del dev $swp1 clsact
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp3=${NETIFS[p3]}
+ h3=${NETIFS[p4]}
+
+ swp4=${NETIFS[p5]}
+ h4=${NETIFS[p6]}
+
+ vrf_prepare
+
+ ip link set dev $h1 up
+ h1_create
+ h2_create
+ h3_create
+ h4_create
+ switch_create
+
+ forwarding_enable
+
+ trap_install $h3 ingress
+ trap_install $h4 ingress
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ trap_uninstall $h4 ingress
+ trap_uninstall $h3 ingress
+
+ forwarding_restore
+
+ switch_destroy
+ h4_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+ ip link set dev $h1 down
+
+ vrf_cleanup
+}
+
+test_lag_slave()
+{
+ local host_dev=$1; shift
+ local up_dev=$1; shift
+ local down_dev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ tc filter add dev $swp1 ingress pref 999 \
+ proto 802.1q flower vlan_ethtype arp $tcflags \
+ action pass
+ mirror_install $swp1 ingress gt4 \
+ "proto 802.1q flower vlan_id 333 $tcflags"
+
+ # Test connectivity through $up_dev when $down_dev is set down.
+ ip link set dev $down_dev down
+ ip neigh flush dev br1
+ setup_wait_dev $up_dev
+ setup_wait_dev $host_dev
+ $ARPING -I br1 192.0.2.130 -qfc 1
+ sleep 2
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $host_dev 1 10
+
+ # Test lack of connectivity when both slaves are down.
+ ip link set dev $up_dev down
+ sleep 2
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h3 1 0
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h4 1 0
+
+ ip link set dev $up_dev up
+ ip link set dev $down_dev up
+ mirror_uninstall $swp1 ingress
+ tc filter del dev $swp1 ingress pref 999
+
+ log_test "$what ($tcflags)"
+}
+
+test_mirror_gretap_first()
+{
+ test_lag_slave $h3 $swp3 $swp4 "mirror to gretap: LAG first slave"
+}
+
+test_mirror_gretap_second()
+{
+ test_lag_slave $h4 $swp4 $swp3 "mirror to gretap: LAG second slave"
+}
+
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
new file mode 100755
index 000000000..b501b3663
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_changes.sh
@@ -0,0 +1,273 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test how mirrors to gretap and ip6gretap react to changes to relevant
+# configuration.
+
+ALL_TESTS="
+ test_ttl
+ test_tun_up
+ test_egress_up
+ test_remote_ip
+ test_tun_del
+ test_route_del
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ # This test downs $swp3, which deletes the configured IPv6 address
+ # unless this sysctl is set.
+ sysctl_set net.ipv6.conf.$swp3.keep_addr_on_down 1
+
+ ip address add dev $swp3 192.0.2.129/28
+ ip address add dev $h3 192.0.2.130/28
+
+ ip address add dev $swp3 2001:db8:2::1/64
+ ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $swp3 2001:db8:2::1/64
+
+ ip address del dev $h3 192.0.2.130/28
+ ip address del dev $swp3 192.0.2.129/28
+
+ sysctl_restore net.ipv6.conf.$swp3.keep_addr_on_down
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_span_gre_ttl()
+{
+ local tundev=$1; shift
+ local type=$1; shift
+ local prot=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev \
+ "prot ip flower $tcflags ip_prot icmp"
+ tc filter add dev $h3 ingress pref 77 prot $prot \
+ flower ip_ttl 50 action pass
+
+ mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 0
+
+ ip link set dev $tundev type $type ttl 50
+ sleep 2
+ mirror_test v$h1 192.0.2.1 192.0.2.2 $h3 77 10
+
+ ip link set dev $tundev type $type ttl 100
+ tc filter del dev $h3 ingress pref 77
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: TTL change ($tcflags)"
+}
+
+test_span_gre_tun_up()
+{
+ local tundev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip link set dev $tundev down
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+
+ ip link set dev $tundev up
+
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: tunnel down/up ($tcflags)"
+}
+
+test_span_gre_egress_up()
+{
+ local tundev=$1; shift
+ local remote_ip=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip link set dev $swp3 down
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+
+ # After setting the device up, wait for neighbor to get resolved so that
+ # we can expect mirroring to work.
+ ip link set dev $swp3 up
+ setup_wait_dev $swp3
+ ping -c 1 -I $swp3 $remote_ip &>/dev/null
+
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: egress down/up ($tcflags)"
+}
+
+test_span_gre_remote_ip()
+{
+ local tundev=$1; shift
+ local type=$1; shift
+ local correct_ip=$1; shift
+ local wrong_ip=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip link set dev $tundev type $type remote $wrong_ip
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+
+ ip link set dev $tundev type $type remote $correct_ip
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: remote address change ($tcflags)"
+}
+
+test_span_gre_tun_del()
+{
+ local tundev=$1; shift
+ local type=$1; shift
+ local flags=$1; shift
+ local local_ip=$1; shift
+ local remote_ip=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir $tundev ingress
+ ip link del dev $tundev
+ fail_test_span_gre_dir $tundev ingress
+
+ tunnel_create $tundev $type $local_ip $remote_ip \
+ ttl 100 tos inherit $flags
+
+ # Recreating the tunnel doesn't reestablish mirroring, so reinstall it
+ # and verify it works for the follow-up tests.
+ mirror_uninstall $swp1 ingress
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: tunnel deleted ($tcflags)"
+}
+
+test_span_gre_route_del()
+{
+ local tundev=$1; shift
+ local edev=$1; shift
+ local route=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir $tundev ingress
+
+ ip route del $route dev $edev
+ fail_test_span_gre_dir $tundev ingress
+
+ ip route add $route dev $edev
+ quick_test_span_gre_dir $tundev ingress
+
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: underlay route removal ($tcflags)"
+}
+
+test_ttl()
+{
+ test_span_gre_ttl gt4 gretap ip "mirror to gretap"
+ test_span_gre_ttl gt6 ip6gretap ipv6 "mirror to ip6gretap"
+}
+
+test_tun_up()
+{
+ test_span_gre_tun_up gt4 "mirror to gretap"
+ test_span_gre_tun_up gt6 "mirror to ip6gretap"
+}
+
+test_egress_up()
+{
+ test_span_gre_egress_up gt4 192.0.2.130 "mirror to gretap"
+ test_span_gre_egress_up gt6 2001:db8:2::2 "mirror to ip6gretap"
+}
+
+test_remote_ip()
+{
+ test_span_gre_remote_ip gt4 gretap 192.0.2.130 192.0.2.132 "mirror to gretap"
+ test_span_gre_remote_ip gt6 ip6gretap 2001:db8:2::2 2001:db8:2::4 "mirror to ip6gretap"
+}
+
+test_tun_del()
+{
+ test_span_gre_tun_del gt4 gretap "" \
+ 192.0.2.129 192.0.2.130 "mirror to gretap"
+ test_span_gre_tun_del gt6 ip6gretap allow-localremote \
+ 2001:db8:2::1 2001:db8:2::2 "mirror to ip6gretap"
+}
+
+test_route_del()
+{
+ test_span_gre_route_del gt4 $swp3 192.0.2.128/28 "mirror to gretap"
+ test_span_gre_route_del gt6 $swp3 2001:db8:2::/64 "mirror to ip6gretap"
+}
+
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
new file mode 100755
index 000000000..09389f3b9
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_flower.sh
@@ -0,0 +1,137 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# This tests flower-triggered mirroring to gretap and ip6gretap netdevices. The
+# interfaces on H1 and H2 have two addresses each. A flower match on one of the
+# addresses is configured with a mirror action. It is expected that when
+# pinging this address, mirroring takes place, whereas when pinging the other
+# one, there's no mirroring.
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip address add dev $swp3 192.0.2.129/28
+ ip address add dev $h3 192.0.2.130/28
+
+ ip address add dev $swp3 2001:db8:2::1/64
+ ip address add dev $h3 2001:db8:2::2/64
+
+ ip address add dev $h1 192.0.2.3/28
+ ip address add dev $h2 192.0.2.4/28
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip address del dev $h2 192.0.2.4/28
+ ip address del dev $h1 192.0.2.3/28
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $swp3 2001:db8:2::1/64
+
+ ip address del dev $h3 192.0.2.130/28
+ ip address del dev $swp3 192.0.2.129/28
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_span_gre_dir_acl()
+{
+ test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+}
+
+fail_test_span_gre_dir_acl()
+{
+ fail_test_span_gre_dir_ips "$@" 192.0.2.3 192.0.2.4
+}
+
+full_test_span_gre_dir_acl()
+{
+ local tundev=$1; shift
+ local direction=$1; shift
+ local forward_type=$1; shift
+ local backward_type=$1; shift
+ local match_dip=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 $direction $tundev \
+ "protocol ip flower $tcflags dst_ip $match_dip"
+ fail_test_span_gre_dir $tundev $direction
+ test_span_gre_dir_acl "$tundev" "$direction" \
+ "$forward_type" "$backward_type"
+ mirror_uninstall $swp1 $direction
+
+ # Test lack of mirroring after ACL mirror is uninstalled.
+ fail_test_span_gre_dir_acl "$tundev" "$direction"
+
+ log_test "$direction $what ($tcflags)"
+}
+
+test_gretap()
+{
+ full_test_span_gre_dir_acl gt4 ingress 8 0 192.0.2.4 "ACL mirror to gretap"
+ full_test_span_gre_dir_acl gt4 egress 0 8 192.0.2.3 "ACL mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ full_test_span_gre_dir_acl gt6 ingress 8 0 192.0.2.4 "ACL mirror to ip6gretap"
+ full_test_span_gre_dir_acl gt6 egress 0 8 192.0.2.3 "ACL mirror to ip6gretap"
+}
+
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+
+ tests_run
+
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tcflags="skip_hw"
+test_all
+
+if ! tc_offload_check; then
+ echo "WARN: Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ test_all
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh
new file mode 100755
index 000000000..9edf4cb10
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lag_lacp.sh
@@ -0,0 +1,285 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# team device.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1.333 | | $h1.555 + |
+# | | 192.0.2.1/28 | | 192.0.2.18/28 | |
+# +----|-----------------+ +----------------|-----+
+# | $h1 |
+# +---------------------------------+------------------------------+
+# |
+# +--------------------------------------|------------------------------------+
+# | SW o---> mirror |
+# | | |
+# | +----------------------------------+------------------------------+ |
+# | | $swp1 | |
+# | + $swp1.333 $swp1.555 + |
+# | 192.0.2.2/28 192.0.2.17/28 |
+# | |
+# | |
+# | + gt4 (gretap) ,-> + lag1 (team) |
+# | loc=192.0.2.129 | | 192.0.2.129/32 |
+# | rem=192.0.2.130 --' | |
+# | ttl=100 | |
+# | tos=inherit | |
+# | _____________________|______________________ |
+# | / \ |
+# | / \ |
+# | + $swp3 + $swp4 |
+# +---|------------------------------------------------|----------------------+
+# | |
+# +---|------------------------------------------------|----------------------+
+# | + $h3 + $h4 H3 |
+# | \ / |
+# | \____________________________________________/ |
+# | | |
+# | + lag2 (team) |
+# | 192.0.2.130/32 |
+# | |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ test_mirror_gretap_first
+ test_mirror_gretap_second
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+
+require_command $ARPING
+
+vlan_host_create()
+{
+ local if_name=$1; shift
+ local vid=$1; shift
+ local vrf_name=$1; shift
+ local ips=("${@}")
+
+ vrf_create $vrf_name
+ ip link set dev $vrf_name up
+ vlan_create $if_name $vid $vrf_name "${ips[@]}"
+}
+
+# Undo vlan_host_create: destroy VLAN $vid on $if_name, then bring the VRF
+# down and destroy it.
+#
+# Arguments: if_name vid vrf_name
+vlan_host_destroy()
+{
+	local if_name=$1; shift
+	local vid=$1; shift
+	local vrf_name=$1; shift
+
+	vlan_destroy "$if_name" "$vid"
+	ip link set dev "$vrf_name" down
+	vrf_destroy "$vrf_name"
+}
+
+# H1: VLAN 333 on $h1 in vrf-h1, with a route to 192.0.2.16/28 via 192.0.2.2.
+h1_create()
+{
+	local h1_vrf=vrf-h1
+
+	vlan_host_create "$h1" 333 "$h1_vrf" 192.0.2.1/28
+	ip -4 route add 192.0.2.16/28 vrf "$h1_vrf" nexthop via 192.0.2.2
+}
+
+# Undo h1_create: drop the route first, then the VLAN host.
+h1_destroy()
+{
+	local h1_vrf=vrf-h1
+
+	ip -4 route del 192.0.2.16/28 vrf "$h1_vrf"
+	vlan_host_destroy "$h1" 333 "$h1_vrf"
+}
+
+# H2: VLAN 555 on $h1 in vrf-h2, with a route to 192.0.2.0/28 via 192.0.2.17.
+h2_create()
+{
+	local h2_vrf=vrf-h2
+
+	vlan_host_create "$h1" 555 "$h2_vrf" 192.0.2.18/28
+	ip -4 route add 192.0.2.0/28 vrf "$h2_vrf" nexthop via 192.0.2.17
+}
+
+# Undo h2_create: drop the route first, then the VLAN host.
+h2_destroy()
+{
+	local h2_vrf=vrf-h2
+
+	ip -4 route del 192.0.2.0/28 vrf "$h2_vrf"
+	vlan_host_destroy "$h1" 555 "$h2_vrf"
+}
+
+h3_create_team()
+{
+ team_create lag2 lacp $h3 $h4
+ __simple_if_init lag2 vrf-h3 192.0.2.130/32
+ ip -4 route add vrf vrf-h3 192.0.2.129/32 dev lag2
+}
+
+# Undo h3_create_team and set both former team ports down.
+h3_destroy_team()
+{
+	local team_port
+
+	ip -4 route del vrf vrf-h3 192.0.2.129/32 dev lag2
+	__simple_if_fini lag2 192.0.2.130/32
+	team_destroy lag2
+
+	for team_port in "$h3" "$h4"; do
+		ip link set dev "$team_port" down
+	done
+}
+
+# H3: create and bring up vrf-h3, attach clsact qdiscs to $h3 and $h4, then
+# build the team on top of them.
+h3_create()
+{
+	local h3_port
+
+	vrf_create vrf-h3
+	ip link set dev vrf-h3 up
+	for h3_port in "$h3" "$h4"; do
+		tc qdisc add dev "$h3_port" clsact
+	done
+	h3_create_team
+}
+
+# Undo h3_create in reverse order.
+h3_destroy()
+{
+	local h3_port
+
+	h3_destroy_team
+	for h3_port in "$h4" "$h3"; do
+		tc qdisc del dev "$h3_port" clsact
+	done
+	ip link set dev vrf-h3 down
+	vrf_destroy vrf-h3
+}
+
+switch_create()
+{
+ ip link set dev $swp1 up
+ tc qdisc add dev $swp1 clsact
+ vlan_create $swp1 333 "" 192.0.2.2/28
+ vlan_create $swp1 555 "" 192.0.2.17/28
+
+ tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+ ttl 100 tos inherit
+
+ ip link set dev $swp3 up
+ ip link set dev $swp4 up
+ team_create lag1 lacp $swp3 $swp4
+ __addr_add_del lag1 add 192.0.2.129/32
+ ip -4 route add 192.0.2.130/32 dev lag1
+}
+
+# Undo switch_create in reverse order.
+switch_destroy()
+{
+	local lag_port
+	local vid
+
+	ip -4 route del 192.0.2.130/32 dev lag1
+	__addr_add_del lag1 del 192.0.2.129/32
+	team_destroy lag1
+
+	for lag_port in "$swp4" "$swp3"; do
+		ip link set dev "$lag_port" down
+	done
+
+	tunnel_destroy gt4
+
+	for vid in 555 333; do
+		vlan_destroy "$swp1" "$vid"
+	done
+	tc qdisc del dev "$swp1" clsact
+	ip link set dev "$swp1" down
+}
+
+# Map the six NETIFS ports to their roles, build hosts and switch, and
+# install ingress traps on both H3 ports.
+setup_prepare()
+{
+	local sink_port
+
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp3=${NETIFS[p3]}
+	h3=${NETIFS[p4]}
+
+	swp4=${NETIFS[p5]}
+	h4=${NETIFS[p6]}
+
+	vrf_prepare
+
+	ip link set dev "$h1" up
+	h1_create
+	h2_create
+	h3_create
+	switch_create
+
+	for sink_port in "$h3" "$h4"; do
+		trap_install "$sink_port" ingress
+	done
+}
+
+# Tear down everything setup_prepare built, in reverse order.
+cleanup()
+{
+	local sink_port
+
+	pre_cleanup
+
+	for sink_port in "$h4" "$h3"; do
+		trap_uninstall "$sink_port" ingress
+	done
+
+	switch_destroy
+	h3_destroy
+	h2_destroy
+	h1_destroy
+	ip link set dev "$h1" down
+
+	vrf_cleanup
+}
+
+# Check that traffic mirrored to gt4 follows whichever H3 team port is still
+# enslaved, and that nothing arrives once both ports have left the team.
+#
+# $1 (up_dev)   - port expected to see the mirrored packets after down_dev
+#                 is removed from its team
+# $2 (down_dev) - port removed from its team first
+# $3 (what)     - description passed to log_test
+test_lag_slave()
+{
+ local up_dev=$1; shift
+ local down_dev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ # Mirror VLAN 333 traffic entering $swp1 to gt4.
+ mirror_install $swp1 ingress gt4 \
+ "proto 802.1q flower vlan_id 333 $tcflags"
+
+ # Move $down_dev away from the team. That will prompt change in
+ # txability of the connected device, without changing its upness. The
+ # driver should notice the txability change and move the traffic to the
+ # other slave.
+ ip link set dev $down_dev nomaster
+ sleep 2
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $up_dev 1 10
+
+ # Test lack of connectivity when neither slave is txable.
+ ip link set dev $up_dev nomaster
+ sleep 2
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h3 1 0
+ mirror_test vrf-h1 192.0.2.1 192.0.2.18 $h4 1 0
+ mirror_uninstall $swp1 ingress
+
+ # Recreate H3's team device, because mlxsw, which this test is
+ # predominantly meant to test, requires a bottom-up construction and
+ # doesn't allow enslavement to a device that already has an upper.
+ h3_destroy_team
+ h3_create_team
+ # Wait for ${h,swp}{3,4}.
+ setup_wait
+
+ log_test "$what ($tcflags)"
+}
+
+# Keep $h3 in the team and remove $h4 first.
+test_mirror_gretap_first()
+{
+	local lag_descr="mirror to gretap: LAG first slave"
+
+	test_lag_slave "$h3" "$h4" "$lag_descr"
+}
+
+# Keep $h4 in the team and remove $h3 first.
+test_mirror_gretap_second()
+{
+	local lag_descr="mirror to gretap: LAG second slave"
+
+	test_lag_slave "$h4" "$h3" "$lag_descr"
+}
+
+# Run every selected test with slow-path traps installed on $swp1 in both
+# directions; the traps are removed in reverse order afterwards.
+test_all()
+{
+	local sp_trap_dir
+
+	for sp_trap_dir in ingress egress; do
+		slow_path_trap_install "$swp1" "$sp_trap_dir"
+	done
+
+	tests_run
+
+	for sp_trap_dir in egress ingress; do
+		slow_path_trap_uninstall "$swp1" "$sp_trap_dir"
+	done
+}
+
+# Always tear the topology down, whatever the exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+	tcflags="skip_sw"
+	test_all
+else
+	echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
new file mode 100644
index 000000000..fac486178
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_lib.sh
@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: GPL-2.0
+
+source "$relative_path/mirror_lib.sh"
+
+# Expect 10 mirrored packets on the H3 tunnel device h3-<tundev>.
+# Arguments: tundev, then whatever do_test_span_dir_ips takes after the device.
+quick_test_span_gre_dir_ips()
+{
+	local gre_sink=h3-$1; shift
+
+	do_test_span_dir_ips 10 "$gre_sink" "$@"
+}
+
+# Expect no mirrored packets on the H3 tunnel device h3-<tundev>.
+# Arguments: tundev, then whatever do_test_span_dir_ips takes after the device.
+fail_test_span_gre_dir_ips()
+{
+	local gre_sink=h3-$1; shift
+
+	do_test_span_dir_ips 0 "$gre_sink" "$@"
+}
+
+# Run test_span_dir_ips against the H3 tunnel device h3-<tundev>.
+# Arguments: tundev, then whatever test_span_dir_ips takes after the device.
+test_span_gre_dir_ips()
+{
+	local gre_sink=h3-$1; shift
+
+	test_span_dir_ips "$gre_sink" "$@"
+}
+
+# Install a matchall mirror from $swp1 to $tundev, run the directional
+# mirroring checks between $ip1 and $ip2, remove the mirror and log the result.
+#
+# Arguments: tundev direction forward_type backward_type what ip1 ip2
+full_test_span_gre_dir_ips()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local gre_sink="h3-$tundev"
+
+	RET=0
+
+	mirror_install "$swp1" "$direction" "$tundev" "matchall $tcflags"
+	test_span_dir_ips "$gre_sink" "$direction" \
+			  "$forward_type" "$backward_type" \
+			  "$ip1" "$ip2"
+	mirror_uninstall "$swp1" "$direction"
+
+	log_test "$direction $what ($tcflags)"
+}
+
+# Like full_test_span_gre_dir_ips, and additionally count frames on $h3
+# ingress with an 802.1q flower filter built from $vlan_match.
+#
+# Arguments: tundev direction vlan_match forward_type backward_type what
+#            ip1 ip2
+full_test_span_gre_dir_vlan_ips()
+{
+	local tundev=$1; shift
+	local direction=$1; shift
+	local vlan_match=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local what=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+	local gre_sink="h3-$tundev"
+
+	RET=0
+
+	mirror_install "$swp1" "$direction" "$tundev" "matchall $tcflags"
+
+	test_span_dir_ips "$gre_sink" "$direction" \
+			  "$forward_type" "$backward_type" \
+			  "$ip1" "$ip2"
+
+	# $vlan_match is deliberately unquoted: it holds several flower
+	# keywords that must be split into separate arguments.
+	tc filter add dev "$h3" ingress pref 77 prot 802.1q \
+		flower $vlan_match \
+		action pass
+	mirror_test "v$h1" "$ip1" "$ip2" "$h3" 77 10
+	tc filter del dev "$h3" ingress pref 77
+
+	mirror_uninstall "$swp1" "$direction"
+
+	log_test "$direction $what ($tcflags)"
+}
+
+# quick_test_span_gre_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+quick_test_span_gre_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	quick_test_span_gre_dir_ips "$@"
+}
+
+# fail_test_span_gre_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+fail_test_span_gre_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	fail_test_span_gre_dir_ips "$@"
+}
+
+# test_span_gre_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+test_span_gre_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	test_span_gre_dir_ips "$@"
+}
+
+# full_test_span_gre_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+full_test_span_gre_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	full_test_span_gre_dir_ips "$@"
+}
+
+# full_test_span_gre_dir_vlan_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+full_test_span_gre_dir_vlan()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	full_test_span_gre_dir_vlan_ips "$@"
+}
+
+full_test_span_gre_stp_ips()
+{
+ local tundev=$1; shift
+ local nbpdev=$1; shift
+ local what=$1; shift
+ local ip1=$1; shift
+ local ip2=$1; shift
+ local h3mac=$(mac_get $h3)
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+ bridge link set dev $nbpdev state disabled
+ sleep 1
+ fail_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+ bridge link set dev $nbpdev state forwarding
+ sleep 1
+ quick_test_span_gre_dir_ips $tundev ingress $ip1 $ip2
+
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: STP state ($tcflags)"
+}
+
+# full_test_span_gre_stp_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+full_test_span_gre_stp()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	full_test_span_gre_stp_ips "$@"
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
new file mode 100755
index 000000000..fc0508e40
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_neigh.sh
@@ -0,0 +1,115 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for mirroring to gretap and ip6gretap, such that the neighbor entry for
+# the tunnel remote address has invalid address at the time that the mirroring
+# is set up. Later on, the neighbor is deleted and it is expected to be
+# reinitialized using the usual ARP process, and the mirroring offload updated.
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_gre_topo_create
+
+ ip address add dev $swp3 192.0.2.129/28
+ ip address add dev $h3 192.0.2.130/28
+
+ ip address add dev $swp3 2001:db8:2::1/64
+ ip address add dev $h3 2001:db8:2::2/64
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip address del dev $h3 2001:db8:2::2/64
+ ip address del dev $swp3 2001:db8:2::1/64
+
+ ip address del dev $h3 192.0.2.130/28
+ ip address del dev $swp3 192.0.2.129/28
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_span_gre_neigh()
+{
+ local addr=$1; shift
+ local tundev=$1; shift
+ local direction=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ ip neigh replace dev $swp3 $addr lladdr 00:11:22:33:44:55
+ mirror_install $swp1 $direction $tundev "matchall $tcflags"
+ fail_test_span_gre_dir $tundev ingress
+ ip neigh del dev $swp3 $addr
+ quick_test_span_gre_dir $tundev ingress
+ mirror_uninstall $swp1 $direction
+
+ log_test "$direction $what: neighbor change ($tcflags)"
+}
+
+test_gretap()
+{
+ test_span_gre_neigh 192.0.2.130 gt4 ingress "mirror to gretap"
+ test_span_gre_neigh 192.0.2.130 gt4 egress "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ test_span_gre_neigh 2001:db8:2::2 gt6 ingress "mirror to ip6gretap"
+ test_span_gre_neigh 2001:db8:2::2 gt6 egress "mirror to ip6gretap"
+}
+
+# Run every selected test with slow-path traps installed on $swp1 in both
+# directions; the traps are removed in reverse order afterwards.
+test_all()
+{
+	local sp_trap_dir
+
+	for sp_trap_dir in ingress egress; do
+		slow_path_trap_install "$swp1" "$sp_trap_dir"
+	done
+
+	tests_run
+
+	for sp_trap_dir in egress ingress; do
+		slow_path_trap_uninstall "$swp1" "$sp_trap_dir"
+	done
+}
+
+# Always tear the topology down, whatever the exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+	tcflags="skip_sw"
+	test_all
+else
+	echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
new file mode 100755
index 000000000..6f9ef1820
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_nh.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test that gretap and ip6gretap mirroring works when the other tunnel endpoint
+# is reachable through a next-hop route (as opposed to directly-attached route).
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+# Map the six NETIFS ports, disable rp_filter where needed, build the standard
+# GRE mirroring topology and address it so that the tunnel remotes are only
+# reachable through a next hop.
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	sysctl_set net.ipv4.conf.all.rp_filter 0
+	sysctl_set "net.ipv4.conf.$h3.rp_filter" 0
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	# Set only after mirror_gre_topo_create, as in the original ordering.
+	sysctl_set "net.ipv4.conf.v$h3.rp_filter" 0
+
+	ip address add dev "$swp3" 192.0.2.161/28
+	ip address add dev "$h3" 192.0.2.162/28
+	ip address add dev gt4 192.0.2.129/32
+	ip address add dev h3-gt4 192.0.2.130/32
+
+	# IPv6 route can't be added after address. Such routes are rejected due
+	# to the gateway address having been configured on the local system. It
+	# works the other way around though.
+	ip address add dev "$swp3" 2001:db8:4::1/64
+	ip -6 route add 2001:db8:2::2/128 via 2001:db8:4::2
+	ip address add dev "$h3" 2001:db8:4::2/64
+	ip address add dev gt6 2001:db8:2::1
+	ip address add dev h3-gt6 2001:db8:2::2
+}
+
+# Undo setup_prepare: remove the IPv6 route and the underlay addresses,
+# restore the rp_filter sysctls and tear down the topology.
+#
+# The addresses put on gt4, gt6, h3-gt4 and h3-gt6 are not removed here;
+# mirror_gre_topo_destroy calls tunnel_destroy on those devices.
+cleanup()
+{
+	pre_cleanup
+
+	ip -6 route del 2001:db8:2::2/128 via 2001:db8:4::2
+	ip address del dev $h3 2001:db8:4::2/64
+	ip address del dev $swp3 2001:db8:4::1/64
+
+	ip address del dev $h3 192.0.2.162/28
+	ip address del dev $swp3 192.0.2.161/28
+
+	# Restore the v$h3 setting before the topology is destroyed. Like the
+	# other sysctl_restore calls below, only the key is passed.
+	sysctl_restore net.ipv4.conf.v$h3.rp_filter
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+
+	sysctl_restore net.ipv4.conf.$h3.rp_filter
+	sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+# gretap: mirroring must only work while the next-hop route to the tunnel
+# remote is installed.
+test_gretap()
+{
+	local remote_pfx=192.0.2.130/32
+	local remote_nh=192.0.2.162
+
+	RET=0
+	mirror_install "$swp1" ingress gt4 "matchall $tcflags"
+
+	# For IPv4, test that there's no mirroring without the route directing
+	# the traffic to tunnel remote address. Then add it and test that
+	# mirroring starts. For IPv6 we can't test this due to the limitation
+	# that routes for locally-specified IPv6 addresses can't be added.
+	fail_test_span_gre_dir gt4 ingress
+
+	ip route add "$remote_pfx" via "$remote_nh"
+	quick_test_span_gre_dir gt4 ingress
+	ip route del "$remote_pfx" via "$remote_nh"
+
+	mirror_uninstall "$swp1" ingress
+	log_test "mirror to gre with next-hop remote ($tcflags)"
+}
+
+# ip6gretap: only the positive case is checked; the IPv6 route is installed
+# by setup_prepare.
+test_ip6gretap()
+{
+	local tun=gt6
+
+	RET=0
+
+	mirror_install "$swp1" ingress "$tun" "matchall $tcflags"
+	quick_test_span_gre_dir "$tun" ingress
+	mirror_uninstall "$swp1" ingress
+
+	log_test "mirror to ip6gre with next-hop remote ($tcflags)"
+}
+
+# Run every selected test with slow-path traps installed on $swp1 in both
+# directions; the traps are removed in reverse order afterwards.
+test_all()
+{
+	local sp_trap_dir
+
+	for sp_trap_dir in ingress egress; do
+		slow_path_trap_install "$swp1" "$sp_trap_dir"
+	done
+
+	tests_run
+
+	for sp_trap_dir in egress ingress; do
+		slow_path_trap_uninstall "$swp1" "$sp_trap_dir"
+	done
+}
+
+# Always tear the topology down, whatever the exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+	tcflags="skip_sw"
+	test_all
+else
+	echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
new file mode 100644
index 000000000..39c03e286
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_topo_lib.sh
@@ -0,0 +1,94 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is the standard topology for testing mirroring to gretap and ip6gretap
+# netdevices. The tests that use it tweak it in one way or another--importantly,
+# $swp3 and $h3 need to have addresses set up.
+#
+# +---------------------+ +---------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +-----|---------------+ +---------------|-----+
+# | |
+# +-----|-------------------------------------------------------------|-----+
+# | SW o--> mirror | |
+# | +---|-------------------------------------------------------------|---+ |
+# | | + $swp1 BR $swp2 + | |
+# | +---------------------------------------------------------------------+ |
+# | |
+# | + $swp3 + gt6 (ip6gretap) + gt4 (gretap) |
+# | | : loc=2001:db8:2::1 : loc=192.0.2.129 |
+# | | : rem=2001:db8:2::2 : rem=192.0.2.130 |
+# | | : ttl=100 : ttl=100 |
+# | | : tos=inherit : tos=inherit |
+# | | : : |
+# +-----|---------------------:----------------------:----------------------+
+# | : :
+# +-----|---------------------:----------------------:----------------------+
+# | H3 + $h3 + h3-gt6 (ip6gretap) + h3-gt4 (gretap) |
+# | loc=2001:db8:2::2 loc=192.0.2.130 |
+# | rem=2001:db8:2::1 rem=192.0.2.129 |
+# | ttl=100 ttl=100 |
+# | tos=inherit tos=inherit |
+# | |
+# +-------------------------------------------------------------------------+
+
+source "$relative_path/mirror_topo_lib.sh"
+
+# H3: base host setup plus the two tunnel endpoints h3-gt4 and h3-gt6, each
+# moved into VRF v$h3 and given a matchall sink.
+mirror_gre_topo_h3_create()
+{
+	local h3_vrf="v$h3"
+
+	mirror_topo_h3_create
+
+	tunnel_create h3-gt4 gretap 192.0.2.130 192.0.2.129
+	ip link set h3-gt4 vrf "$h3_vrf"
+	matchall_sink_create h3-gt4
+
+	tunnel_create h3-gt6 ip6gretap 2001:db8:2::2 2001:db8:2::1
+	ip link set h3-gt6 vrf "$h3_vrf"
+	matchall_sink_create h3-gt6
+}
+
+mirror_gre_topo_h3_destroy()
+{
+ tunnel_destroy h3-gt6
+ tunnel_destroy h3-gt4
+
+ mirror_topo_h3_destroy
+}
+
+mirror_gre_topo_switch_create()
+{
+ mirror_topo_switch_create
+
+ tunnel_create gt4 gretap 192.0.2.129 192.0.2.130 \
+ ttl 100 tos inherit
+
+ tunnel_create gt6 ip6gretap 2001:db8:2::1 2001:db8:2::2 \
+ ttl 100 tos inherit allow-localremote
+}
+
+mirror_gre_topo_switch_destroy()
+{
+ tunnel_destroy gt6
+ tunnel_destroy gt4
+
+ mirror_topo_switch_destroy
+}
+
+mirror_gre_topo_create()
+{
+ mirror_topo_h1_create
+ mirror_topo_h2_create
+ mirror_gre_topo_h3_create
+
+ mirror_gre_topo_switch_create
+}
+
+# Tear down the GRE mirroring topology in the reverse of creation order.
+mirror_gre_topo_destroy()
+{
+	# Switch, including gt4 / gt6.
+	mirror_gre_topo_switch_destroy
+
+	# Hosts.
+	mirror_gre_topo_h3_destroy
+	mirror_topo_h2_destroy
+	mirror_topo_h1_destroy
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
new file mode 100755
index 000000000..88cecdb9a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing gretap. See
+# mirror_gre_topo_lib.sh for more details.
+#
+# Test for "tc action mirred egress mirror" that mirrors to a gretap netdevice
+# whose underlay route points at a vlan device.
+
+ALL_TESTS="
+ test_gretap
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+# Map the six NETIFS ports, build the standard GRE mirroring topology and
+# add VLAN 555 devices on $swp3 and $h3 that carry the tunnel underlay.
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	local sw_vlan_dev="$swp3.555"
+	local h3_vlan_dev="$h3.555"
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	# Switch side: tunnel local addresses and host routes to the remotes.
+	ip link add name "$sw_vlan_dev" link "$swp3" type vlan id 555
+	ip address add dev "$sw_vlan_dev" 192.0.2.129/32
+	ip address add dev "$sw_vlan_dev" 2001:db8:2::1/128
+	ip link set dev "$sw_vlan_dev" up
+
+	ip route add 192.0.2.130/32 dev "$sw_vlan_dev"
+	ip -6 route add 2001:db8:2::2/128 dev "$sw_vlan_dev"
+
+	# H3 side: VLAN device enslaved to v$h3 with the remote addresses.
+	ip link add name "$h3_vlan_dev" link "$h3" type vlan id 555
+	ip link set dev "$h3_vlan_dev" master "v$h3"
+	ip address add dev "$h3_vlan_dev" 192.0.2.130/28
+	ip address add dev "$h3_vlan_dev" 2001:db8:2::2/64
+	ip link set dev "$h3_vlan_dev" up
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ip link del dev $h3.555
+ ip link del dev $swp3.555
+
+ mirror_gre_topo_destroy
+ vrf_cleanup
+}
+
+test_gretap()
+{
+ full_test_span_gre_dir gt4 ingress 8 0 "mirror to gretap"
+ full_test_span_gre_dir gt4 egress 0 8 "mirror to gretap"
+}
+
+# Run every selected test with slow-path traps installed on $swp1 in both
+# directions; the traps are removed in reverse order afterwards.
+test_all()
+{
+	local sp_trap_dir
+
+	for sp_trap_dir in ingress egress; do
+		slow_path_trap_install "$swp1" "$sp_trap_dir"
+	done
+
+	tests_run
+
+	for sp_trap_dir in egress ingress; do
+		slow_path_trap_uninstall "$swp1" "$sp_trap_dir"
+	done
+}
+
+# Always tear the topology down, whatever the exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+	tcflags="skip_sw"
+	test_all
+else
+	echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
new file mode 100755
index 000000000..880e3ab9d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_gre_vlan_bridge_1q.sh
@@ -0,0 +1,347 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Test for "tc action mirred egress mirror" when the underlay route points at a
+# vlan device on top of a bridge device with vlan filtering (802.1q).
+#
+# +---------------------+ +---------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +-----|---------------+ +---------------|-----+
+# | |
+# +-----|-------------------------------------------------------------|-----+
+# | SW o--> mirred egress mirror dev {gt4,gt6} | |
+# | | | |
+# | +---|-------------------------------------------------------------|---+ |
+# | | + $swp1 br1 $swp2 + | |
+# | | | |
+# | | + $swp3 | |
+# | +---|-----------------------------------------------------------------+ |
+# | | | |
+# | | + br1.555 |
+# | | 192.0.2.129/32 |
+# | | 2001:db8:2::1/128 |
+# | | |
+# | | + gt6 (ip6gretap) + gt4 (gretap) |
+# | | : loc=2001:db8:2::1 : loc=192.0.2.129 |
+# | | : rem=2001:db8:2::2 : rem=192.0.2.130 |
+# | | : ttl=100 : ttl=100 |
+# | | : tos=inherit : tos=inherit |
+# | | : : |
+# +-----|---------------------:----------------------:----------------------+
+# | : :
+# +-----|---------------------:----------------------:----------------------+
+# | H3 + $h3 + h3-gt6 (ip6gretap) + h3-gt4 (gretap) |
+# | | loc=2001:db8:2::2 loc=192.0.2.130 |
+# | + $h3.555 rem=2001:db8:2::1 rem=192.0.2.129 |
+# | 192.0.2.130/28 ttl=100 ttl=100 |
+# | 2001:db8:2::2/64 tos=inherit tos=inherit |
+# | |
+# +-------------------------------------------------------------------------+
+
+ALL_TESTS="
+ test_gretap
+ test_ip6gretap
+ test_gretap_forbidden_cpu
+ test_ip6gretap_forbidden_cpu
+ test_gretap_forbidden_egress
+ test_ip6gretap_forbidden_egress
+ test_gretap_untagged_egress
+ test_ip6gretap_untagged_egress
+ test_gretap_fdb_roaming
+ test_ip6gretap_fdb_roaming
+ test_gretap_stp
+ test_ip6gretap_stp
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_gre_lib.sh
+source mirror_gre_topo_lib.sh
+
+require_command $ARPING
+
+h3_addr_add_del()
+{
+ local add_del=$1; shift
+ local dev=$1; shift
+
+ ip addr $add_del dev $dev 192.0.2.130/28
+ ip addr $add_del dev $dev 2001:db8:2::2/64
+}
+
+# Map the six NETIFS ports, build the standard GRE mirroring topology and
+# route the tunnel underlay through br1.555, with $swp3 enslaved to br1.
+setup_prepare()
+{
+	h1=${NETIFS[p1]}
+	swp1=${NETIFS[p2]}
+
+	swp2=${NETIFS[p3]}
+	h2=${NETIFS[p4]}
+
+	swp3=${NETIFS[p5]}
+	h3=${NETIFS[p6]}
+
+	# gt4's remote address is at $h3.555, not $h3. Thus the packets arriving
+	# directly to $h3 for test_gretap_untagged_egress() are rejected by
+	# rp_filter and the test spuriously fails.
+	sysctl_set net.ipv4.conf.all.rp_filter 0
+	sysctl_set "net.ipv4.conf.$h3.rp_filter" 0
+
+	vrf_prepare
+	mirror_gre_topo_create
+
+	# Switch side of the underlay: VLAN 555 on top of the bridge.
+	vlan_create br1 555 "" 192.0.2.129/32 2001:db8:2::1/128
+	bridge vlan add dev br1 vid 555 self
+	ip route rep 192.0.2.130/32 dev br1.555
+	ip -6 route rep 2001:db8:2::2/128 dev br1.555
+
+	# H3 side of the underlay.
+	vlan_create "$h3" 555 "v$h3"
+	h3_addr_add_del add "$h3.555"
+
+	ip link set dev "$swp3" master br1
+	bridge vlan add dev "$swp3" vid 555
+	bridge vlan add dev "$swp2" vid 555
+}
+
+# Undo setup_prepare: release the bridge ports, remove the VLAN 555 devices,
+# tear down the topology and restore the rp_filter sysctls.
+cleanup()
+{
+	local br_port
+
+	pre_cleanup
+
+	for br_port in "$swp2" "$swp3"; do
+		ip link set dev "$br_port" nomaster
+	done
+
+	h3_addr_add_del del "$h3.555"
+	vlan_destroy "$h3" 555
+	vlan_destroy br1 555
+
+	mirror_gre_topo_destroy
+	vrf_cleanup
+
+	sysctl_restore "net.ipv4.conf.$h3.rp_filter"
+	sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+# Run the VLAN-aware mirroring test in both directions.
+#
+# Arguments: tundev vlan_match what
+test_vlan_match()
+{
+	local tundev=$1; shift
+	local vlan_match=$1; shift
+	local what=$1; shift
+
+	full_test_span_gre_dir_vlan "$tundev" ingress "$vlan_match" 8 0 \
+				    "$what"
+	full_test_span_gre_dir_vlan "$tundev" egress "$vlan_match" 0 8 \
+				    "$what"
+}
+
+test_gretap()
+{
+ test_vlan_match gt4 'skip_hw vlan_id 555 vlan_ethtype ip' \
+ "mirror to gretap"
+}
+
+test_ip6gretap()
+{
+ test_vlan_match gt6 'skip_hw vlan_id 555 vlan_ethtype ip' \
+ "mirror to ip6gretap"
+}
+
+test_span_gre_forbidden_cpu()
+{
+ local tundev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ # Run the pass-test first, to prime neighbor table.
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir $tundev ingress
+
+ # Now forbid the VLAN at the bridge and see it fail.
+ bridge vlan del dev br1 vid 555 self
+ sleep 1
+ fail_test_span_gre_dir $tundev ingress
+
+ bridge vlan add dev br1 vid 555 self
+ sleep 1
+ quick_test_span_gre_dir $tundev ingress
+
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: vlan forbidden at a bridge ($tcflags)"
+}
+
+test_gretap_forbidden_cpu()
+{
+ test_span_gre_forbidden_cpu gt4 "mirror to gretap"
+}
+
+test_ip6gretap_forbidden_cpu()
+{
+ test_span_gre_forbidden_cpu gt6 "mirror to ip6gretap"
+}
+
+# Check that mirroring stops while VLAN 555 is removed from bridge port $swp3
+# and resumes once it is added back and the FDB is re-primed.
+#
+# Arguments: tundev what
+test_span_gre_forbidden_egress()
+{
+	local tundev=$1; shift
+	local what=$1; shift
+
+	RET=0
+
+	mirror_install "$swp1" ingress "$tundev" "matchall $tcflags"
+	quick_test_span_gre_dir "$tundev" ingress
+
+	# Forbid the VLAN on the egress port and expect no mirroring.
+	bridge vlan del dev "$swp3" vid 555
+	sleep 1
+	fail_test_span_gre_dir "$tundev" ingress
+
+	bridge vlan add dev "$swp3" vid 555
+	# Re-prime FDB
+	$ARPING -I br1.555 192.0.2.130 -fqc 1
+	sleep 1
+	quick_test_span_gre_dir "$tundev" ingress
+
+	mirror_uninstall "$swp1" ingress
+
+	log_test "$what: vlan forbidden at a bridge egress ($tcflags)"
+}
+
+test_gretap_forbidden_egress()
+{
+ test_span_gre_forbidden_egress gt4 "mirror to gretap"
+}
+
+test_ip6gretap_forbidden_egress()
+{
+ test_span_gre_forbidden_egress gt6 "mirror to ip6gretap"
+}
+
+# Check mirroring while VLAN 555 is configured as "pvid untagged" on $swp3:
+# the GRE-level capture must keep seeing the traffic, while the VLAN 555
+# capture on $h3 must see none. The original configuration is restored and
+# re-checked afterwards.
+#
+# $1 (tundev) - tunnel device the mirror points at
+# $2 (what)   - description used in the logged test name
+test_span_gre_untagged_egress()
+{
+ local tundev=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+
+ # Baseline: both the tunnel sink and the VLAN 555 capture see traffic.
+ quick_test_span_gre_dir $tundev ingress
+ quick_test_span_vlan_dir $h3 555 ingress
+
+ # Move H3's addresses from $h3.555 to $h3 and make VLAN 555 the
+ # untagged PVID on $swp3.
+ h3_addr_add_del del $h3.555
+ bridge vlan add dev $swp3 vid 555 pvid untagged
+ h3_addr_add_del add $h3
+ sleep 5
+
+ quick_test_span_gre_dir $tundev ingress
+ fail_test_span_vlan_dir $h3 555 ingress
+
+ # Restore: addresses back on $h3.555, VLAN 555 re-added on $swp3
+ # without the pvid/untagged flags.
+ h3_addr_add_del del $h3
+ bridge vlan add dev $swp3 vid 555
+ h3_addr_add_del add $h3.555
+ sleep 5
+
+ quick_test_span_gre_dir $tundev ingress
+ quick_test_span_vlan_dir $h3 555 ingress
+
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: vlan untagged at a bridge egress ($tcflags)"
+}
+
+test_gretap_untagged_egress()
+{
+ test_span_gre_untagged_egress gt4 "mirror to gretap"
+}
+
+test_ip6gretap_untagged_egress()
+{
+ test_span_gre_untagged_egress gt6 "mirror to ip6gretap"
+}
+
+# Check that mirroring stops when the FDB entry for $h3's MAC in VLAN 555 is
+# moved from $swp3 to $swp2, and resumes after the entry is removed and the
+# FDB is re-primed.
+#
+# $1 (tundev) - tunnel device the mirror points at
+# $2 (what)   - description used in the logged test name
+test_span_gre_fdb_roaming()
+{
+ local tundev=$1; shift
+ local what=$1; shift
+ local h3mac=$(mac_get $h3)
+
+ RET=0
+
+ mirror_install $swp1 ingress $tundev "matchall $tcflags"
+ quick_test_span_gre_dir $tundev ingress
+
+ # Retry the negative check for as long as the static entry on $swp2 got
+ # displaced while the check was running.
+ while ((RET == 0)); do
+ # Errors are silenced: the entry on $swp3 may not exist.
+ bridge fdb del dev $swp3 $h3mac vlan 555 master 2>/dev/null
+ bridge fdb add dev $swp2 $h3mac vlan 555 master static
+ sleep 1
+ fail_test_span_gre_dir $tundev ingress
+
+ if ! bridge fdb sh dev $swp2 vlan 555 master \
+ | grep -q $h3mac; then
+ printf "TEST: %-60s [RETRY]\n" \
+ "$what: MAC roaming ($tcflags)"
+ # ARP or ND probably reprimed the FDB while the test
+ # was running. We would get a spurious failure.
+ RET=0
+ continue
+ fi
+ break
+ done
+
+ bridge fdb del dev $swp2 $h3mac vlan 555 master 2>/dev/null
+ # Re-prime FDB
+ $ARPING -I br1.555 192.0.2.130 -fqc 1
+ sleep 1
+ quick_test_span_gre_dir $tundev ingress
+
+ mirror_uninstall $swp1 ingress
+
+ log_test "$what: MAC roaming ($tcflags)"
+}
+
+test_gretap_fdb_roaming()
+{
+ test_span_gre_fdb_roaming gt4 "mirror to gretap"
+}
+
+test_ip6gretap_fdb_roaming()
+{
+ test_span_gre_fdb_roaming gt6 "mirror to ip6gretap"
+}
+
+test_gretap_stp()
+{
+ full_test_span_gre_stp gt4 $swp3 "mirror to gretap"
+}
+
+test_ip6gretap_stp()
+{
+ full_test_span_gre_stp gt6 $swp3 "mirror to ip6gretap"
+}
+
+# Run every selected test with slow-path traps installed on $swp1 in both
+# directions; the traps are removed in reverse order afterwards.
+test_all()
+{
+	local sp_trap_dir
+
+	for sp_trap_dir in ingress egress; do
+		slow_path_trap_install "$swp1" "$sp_trap_dir"
+	done
+
+	tests_run
+
+	for sp_trap_dir in egress ingress; do
+		slow_path_trap_uninstall "$swp1" "$sp_trap_dir"
+	done
+}
+
+# Always tear the topology down, whatever the exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+	tcflags="skip_sw"
+	test_all
+else
+	echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/mirror_lib.sh b/tools/testing/selftests/net/forwarding/mirror_lib.sh
new file mode 100644
index 000000000..6406cd76a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_lib.sh
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: GPL-2.0
+
+mirror_install()
+{
+ local from_dev=$1; shift
+ local direction=$1; shift
+ local to_dev=$1; shift
+ local filter=$1; shift
+
+ tc filter add dev $from_dev $direction \
+ pref 1000 $filter \
+ action mirred egress mirror dev $to_dev
+}
+
+mirror_uninstall()
+{
+ local from_dev=$1; shift
+ local direction=$1; shift
+
+ tc filter del dev $swp1 $direction pref 1000
+}
+
+is_ipv6()
+{
+ local addr=$1; shift
+
+ [[ -z ${addr//[0-9a-fA-F:]/} ]]
+}
+
+# Send ICMP echo requests with $MZ and check how many packets the tc rule at
+# preference $pref on $dev counted.
+#
+# $1 (vrf_name) - passed to $MZ right after the optional protocol flag
+# $2 (sip)      - source address; when empty, no -A option is passed
+# $3 (dip)      - destination address; also selects IPv4 vs. IPv6 mode
+# $4 (dev)      - device whose tc rule statistics are sampled
+# $5 (pref)     - preference of the tc rule to sample
+# $6 (expect)   - expected number of captured packets
+#
+# A mismatch is reported through check_err.
+mirror_test()
+{
+ local vrf_name=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local dev=$1; shift
+ local pref=$1; shift
+ local expect=$1; shift
+
+ if is_ipv6 $dip; then
+ local proto=-6
+ local type="icmp6 type=128" # Echo request.
+ else
+ local proto=
+ local type="icmp echoreq"
+ fi
+
+ # Sample the rule counter before and after the burst.
+ local t0=$(tc_rule_stats_get $dev $pref)
+ $MZ $proto $vrf_name ${sip:+-A $sip} -B $dip -a own -b bc -q \
+ -c 10 -d 100msec -t $type
+ sleep 0.5
+ local t1=$(tc_rule_stats_get $dev $pref)
+ local delta=$((t1 - t0))
+ # Tolerate a couple stray extra packets.
+ ((expect <= delta && delta <= expect + 2))
+ # $? below is the status of the arithmetic check above; nothing may be
+ # inserted between the two lines.
+ check_err $? "Expected to capture $expect packets, got $delta."
+}
+
+do_test_span_dir_ips()
+{
+ local expect=$1; shift
+ local dev=$1; shift
+ local direction=$1; shift
+ local ip1=$1; shift
+ local ip2=$1; shift
+
+ icmp_capture_install $dev
+ mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+ mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+ icmp_capture_uninstall $dev
+}
+
+# do_test_span_dir_ips expecting 10 captured packets.
+quick_test_span_dir_ips()
+{
+	set -- 10 "$@"
+	do_test_span_dir_ips "$@"
+}
+
+# do_test_span_dir_ips expecting no captured packets.
+fail_test_span_dir_ips()
+{
+	set -- 0 "$@"
+	do_test_span_dir_ips "$@"
+}
+
+# Full directional check: first the quick bidirectional test, then one
+# capture restricted to ICMP "type $forward_type" for ip1 -> ip2 traffic and
+# one restricted to "type $backward_type" for ip2 -> ip1 traffic.
+#
+# Arguments: dev direction forward_type backward_type ip1 ip2
+test_span_dir_ips()
+{
+	local dev=$1; shift
+	local direction=$1; shift
+	local forward_type=$1; shift
+	local backward_type=$1; shift
+	local ip1=$1; shift
+	local ip2=$1; shift
+
+	quick_test_span_dir_ips "$dev" "$direction" "$ip1" "$ip2"
+
+	# Forward leg, sent from v$h1.
+	icmp_capture_install "$dev" "type $forward_type"
+	mirror_test "v$h1" "$ip1" "$ip2" "$dev" 100 10
+	icmp_capture_uninstall "$dev"
+
+	# Backward leg, sent from v$h2.
+	icmp_capture_install "$dev" "type $backward_type"
+	mirror_test "v$h2" "$ip2" "$ip1" "$dev" 100 10
+	icmp_capture_uninstall "$dev"
+}
+
+# fail_test_span_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+fail_test_span_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	fail_test_span_dir_ips "$@"
+}
+
+# test_span_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+test_span_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	test_span_dir_ips "$@"
+}
+
+do_test_span_vlan_dir_ips()
+{
+ local expect=$1; shift
+ local dev=$1; shift
+ local vid=$1; shift
+ local direction=$1; shift
+ local ip1=$1; shift
+ local ip2=$1; shift
+
+ # Install the capture as skip_hw to avoid double-counting of packets.
+ # The traffic is meant for local box anyway, so will be trapped to
+ # kernel.
+ vlan_capture_install $dev "skip_hw vlan_id $vid vlan_ethtype ip"
+ mirror_test v$h1 $ip1 $ip2 $dev 100 $expect
+ mirror_test v$h2 $ip2 $ip1 $dev 100 $expect
+ vlan_capture_uninstall $dev
+}
+
+# do_test_span_vlan_dir_ips expecting 10 captured packets.
+quick_test_span_vlan_dir_ips()
+{
+	set -- 10 "$@"
+	do_test_span_vlan_dir_ips "$@"
+}
+
+# do_test_span_vlan_dir_ips expecting no captured packets.
+fail_test_span_vlan_dir_ips()
+{
+	set -- 0 "$@"
+	do_test_span_vlan_dir_ips "$@"
+}
+
+# quick_test_span_vlan_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+quick_test_span_vlan_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	quick_test_span_vlan_dir_ips "$@"
+}
+
+# fail_test_span_vlan_dir_ips with the 192.0.2.1 / 192.0.2.2 address pair.
+fail_test_span_vlan_dir()
+{
+	set -- "$@" 192.0.2.1 192.0.2.2
+	fail_test_span_vlan_dir_ips "$@"
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
new file mode 100644
index 000000000..04979e596
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_topo_lib.sh
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is the standard topology for testing mirroring. The tests that use it
+# tweak it in one way or another--typically add more devices to the topology.
+#
+# +---------------------+                             +---------------------+
+# | H1                  |                             |                  H2 |
+# |     + $h1           |                             |           $h2 +     |
+# |     | 192.0.2.1/28  |                             |  192.0.2.2/28 |     |
+# +-----|---------------+                             +---------------|-----+
+#       |                                                             |
+# +-----|-------------------------------------------------------------|-----+
+# | SW  o--> mirror                                                   |     |
+# | +---|-------------------------------------------------------------|---+ |
+# | |   + $swp1                    BR                           $swp2 +   | |
+# | +---------------------------------------------------------------------+ |
+# |                                                                         |
+# |     + $swp3                                                             |
+# +-----|-------------------------------------------------------------------+
+#       |
+# +-----|-------------------------------------------------------------------+
+# | H3  + $h3                                                               |
+# |                                                                         |
+# +-------------------------------------------------------------------------+
+
+# H1: initialise $h1 with 192.0.2.1/28.
+mirror_topo_h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+# H1: undo mirror_topo_h1_create.
+mirror_topo_h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+# H2: initialise $h2 with 192.0.2.2/28.
+mirror_topo_h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+}
+
+# H2: undo mirror_topo_h2_create.
+mirror_topo_h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+# H3: initialise $h3 without addresses and attach a clsact qdisc so tc
+# filters can be installed on it.
+mirror_topo_h3_create()
+{
+ simple_if_init $h3
+ tc qdisc add dev $h3 clsact
+}
+
+# H3: remove the clsact qdisc, then undo the interface initialisation.
+mirror_topo_h3_destroy()
+{
+ tc qdisc del dev $h3 clsact
+ simple_if_fini $h3
+}
+
+# Switch side: bring up $swp3, create the VLAN-filtering bridge br1 with
+# $swp1 and $swp2 as ports, and attach a clsact qdisc to $swp1.
+mirror_topo_switch_create()
+{
+ local port
+
+ ip link set dev $swp3 up
+
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+
+ # Enslave and bring up the two bridge ports, $swp1 first.
+ for port in $swp1 $swp2; do
+  ip link set dev $port master br1
+  ip link set dev $port up
+ done
+
+ tc qdisc add dev $swp1 clsact
+}
+
+# Switch side teardown: drop the qdisc on $swp1, take the bridge ports
+# down, delete br1 and finally take $swp3 down.
+mirror_topo_switch_destroy()
+{
+ local port
+
+ tc qdisc del dev $swp1 clsact
+
+ for port in $swp1 $swp2; do
+  ip link set dev $port down
+ done
+ ip link del dev br1
+
+ ip link set dev $swp3 down
+}
+
+# Build the whole mirroring topology: hosts H1..H3 first, then the switch.
+mirror_topo_create()
+{
+ local host
+
+ for host in h1 h2 h3; do
+  mirror_topo_${host}_create
+ done
+
+ mirror_topo_switch_create
+}
+
+# Tear the topology down in the reverse order of mirror_topo_create:
+# switch first, then hosts H3..H1.
+mirror_topo_destroy()
+{
+ local host
+
+ mirror_topo_switch_destroy
+
+ for host in h3 h2 h1; do
+  mirror_topo_${host}_destroy
+ done
+}
diff --git a/tools/testing/selftests/net/forwarding/mirror_vlan.sh b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
new file mode 100755
index 000000000..9ab2ce77b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/mirror_vlan.sh
@@ -0,0 +1,131 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test uses standard topology for testing mirroring. See mirror_topo_lib.sh
+# for more details.
+#
+# Test for "tc action mirred egress mirror" that mirrors to a vlan device.
+
+ALL_TESTS="
+ test_vlan
+ test_tagged_vlan
+"
+
+NUM_NETIFS=6
+source lib.sh
+source mirror_lib.sh
+source mirror_topo_lib.sh
+
+# Map the six test ports onto the topology roles, build the standard
+# mirroring topology, and add the VLAN devices this test needs.
+setup_prepare()
+{
+ # Port pairs: p1-p2 (h1/swp1), p3-p4 (swp2/h2), p5-p6 (swp3/h3).
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+ mirror_topo_create
+
+ # VLAN 555 on $swp3 is the mirror target ($swp3.555) used by the tests.
+ vlan_create $swp3 555
+
+ # Matching VLAN device on H3, with a sink on it.
+ vlan_create $h3 555 v$h3
+ matchall_sink_create $h3.555
+
+ # VLAN 111 between H1 and H2, allowed on both bridge ports.
+ vlan_create $h1 111 v$h1 192.0.2.17/28
+ bridge vlan add dev $swp1 vid 111
+
+ vlan_create $h2 111 v$h2 192.0.2.18/28
+ bridge vlan add dev $swp2 vid 111
+}
+
+# EXIT handler: remove the VLAN devices added by setup_prepare, then the
+# topology and the VRF setup.
+cleanup()
+{
+ pre_cleanup
+
+ # VLAN devices go away in the reverse order of their creation.
+ vlan_destroy $h2 111
+ vlan_destroy $h1 111
+ vlan_destroy $h3 555
+ vlan_destroy $swp3 555
+
+ mirror_topo_destroy
+ vrf_cleanup
+}
+
+# Mirror $swp1 traffic in one direction to $swp3.555 and verify it on
+# $h3.555.
+#
+# $1 direction      ingress or egress
+# $2 forward_type   ICMP type passed on to test_span_dir
+# $3 backward_type  ICMP type passed on to test_span_dir
+test_vlan_dir()
+{
+ local direction=$1
+ local forward_type=$2
+ local backward_type=$3
+
+ RET=0
+
+ mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
+ test_span_dir "$h3.555" "$direction" "$forward_type" "$backward_type"
+ mirror_uninstall $swp1 $direction
+
+ log_test "$direction mirror to vlan ($tcflags)"
+}
+
+test_vlan()
+{
+ test_vlan_dir ingress 8 0
+ test_vlan_dir egress 0 8
+}
+
+test_tagged_vlan_dir()
+{
+ local direction=$1; shift
+ local forward_type=$1; shift
+ local backward_type=$1; shift
+
+ RET=0
+
+ mirror_install $swp1 $direction $swp3.555 "matchall $tcflags"
+ do_test_span_vlan_dir_ips 10 "$h3.555" 111 "$direction" \
+ 192.0.2.17 192.0.2.18
+ do_test_span_vlan_dir_ips 0 "$h3.555" 555 "$direction" \
+ 192.0.2.17 192.0.2.18
+ mirror_uninstall $swp1 $direction
+
+ log_test "$direction mirror tagged to vlan ($tcflags)"
+}
+
+# Run the tagged-VLAN mirror test for ingress and then for egress.
+test_tagged_vlan()
+{
+ test_tagged_vlan_dir ingress 8 0
+ test_tagged_vlan_dir egress 0 8
+}
+
+# Run every test in ALL_TESTS with the slow-path traps on $swp1 and the
+# trap on $h3 installed; the traps are removed in reverse order afterwards.
+test_all()
+{
+ slow_path_trap_install $swp1 ingress
+ slow_path_trap_install $swp1 egress
+ trap_install $h3 ingress
+
+ tests_run
+
+ trap_uninstall $h3 ingress
+ slow_path_trap_uninstall $swp1 egress
+ slow_path_trap_uninstall $swp1 ingress
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: filters installed with skip_hw.
+tcflags="skip_hw"
+test_all
+
+# Second pass with skip_sw, only when tc_offload_check succeeds.
+if tc_offload_check; then
+ tcflags="skip_sw"
+ test_all
+else
+ echo "WARN: Could not test offloaded functionality"
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/pedit_dsfield.sh b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
new file mode 100755
index 000000000..64fbd211d
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/pedit_dsfield.sh
@@ -0,0 +1,311 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on
+# egress of $swp2, the traffic is acted upon by a pedit action. An ingress
+# filter installed on $h2 verifies that the packet looks like expected.
+#
+# +----------------------+                             +----------------------+
+# | H1                   |                             |                   H2 |
+# |    + $h1             |                             |            $h2 +     |
+# |    | 192.0.2.1/28    |                             |   192.0.2.2/28 |     |
+# +----|-----------------+                             +----------------|-----+
+#      |                                                                |
+# +----|----------------------------------------------------------------|-----+
+# | SW |                                                                |     |
+# |  +-|----------------------------------------------------------------|-+   |
+# |  | + $swp1                       BR                           $swp2 + |   |
+# |  +--------------------------------------------------------------------+   |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ test_ip_dsfield
+ test_ip_dscp
+ test_ip_ecn
+ test_ip_dscp_ecn
+ test_ip6_dsfield
+ test_ip6_dscp
+ test_ip6_ecn
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+: ${HIT_TIMEOUT:=2000} # ms. NOTE(review): not referenced below; the busywait calls read TC_HIT_TIMEOUT -- confirm which name is intended.
+
+# H1: initialise $h1 with its IPv4 and IPv6 addresses.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# H1: undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# H2: initialise $h2 with its addresses and attach a clsact qdisc, which
+# the tests use for the ingress probe filter.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+ tc qdisc add dev $h2 clsact
+}
+
+# H2: remove the clsact qdisc, then undo the interface initialisation.
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+# EXIT handler: tear everything down in the reverse order of setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+# Send ten TCP packets from $h1 and check the handle-101 counters of both
+# the probe filter on $h2 ingress and the pedit rule. The caller must have
+# installed both filters beforehand.
+#
+# $1 pedit_locus   where the pedit rule lives, e.g. "dev $swp1 ingress"
+# $2 pedit_action  description of the action, used in the log line only
+# $3 mz_flags      extra flags for $MZ; left unquoted so they word-split
+do_test_pedit_dsfield_common()
+{
+ local pedit_locus=$1 pedit_action=$2 mz_flags=$3
+ local pkts
+
+ RET=0
+
+ # TOS 125: DSCP 31, ECN 1. Used for testing that the relevant part is
+ # overwritten when zero is selected.
+ $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -q -t tcp tos=0x7d,sp=54321,dp=12345
+
+ # Wait for the probe on $h2 to reach ten hits.
+ pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \
+ tc_rule_handle_stats_get "dev $h2 ingress" 101)
+ check_err $? "Expected to get 10 packets on test probe, but got $pkts."
+
+ # The pedit rule itself must have seen the packets as well.
+ pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101)
+ ((pkts >= 10))
+ check_err $? "Expected to get 10 packets on pedit rule, but got $pkts."
+
+ log_test "$pedit_locus pedit $pedit_action"
+}
+
+# Install a pedit rule at $pedit_locus plus a probe filter on $h2 ingress,
+# run the common traffic check, and remove both filters again.
+#
+# $1 pedit_locus   tc attachment point, e.g. "dev $swp1 ingress"
+# $2 pedit_action  arguments following "pedit ex munge"
+# $3 match_prot    protocol of the probe filter on $h2
+# $4 match_flower  flower match the rewritten packet has to satisfy
+# $5 mz_flags      extra flags handed to $MZ
+#
+# Callers in this script pass exactly these five arguments. The former
+# saddr/daddr locals were never supplied nor read and have been dropped;
+# any extra arguments are still ignored.
+do_test_pedit_dsfield()
+{
+ local pedit_locus=$1; shift
+ local pedit_action=$1; shift
+ local match_prot=$1; shift
+ local match_flower=$1; shift
+ local mz_flags=$1; shift
+
+ # $pedit_locus, $pedit_action and $match_flower are deliberately left
+ # unquoted: each holds several tc words.
+ tc filter add $pedit_locus handle 101 pref 1 \
+ flower action pedit ex munge $pedit_action
+ tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \
+ flower skip_hw $match_flower action pass
+
+ do_test_pedit_dsfield_common "$pedit_locus" "$pedit_action" "$mz_flags"
+
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $pedit_locus pref 1
+}
+
+# IPv4: overwrite the whole dsfield with each sample value and expect
+# exactly that value as ip_tos on $h2.
+#
+# $1 locus  tc attachment point of the pedit rule
+do_test_ip_dsfield()
+{
+ local locus=$1
+ local mz_flags="-A 192.0.2.1 -B 192.0.2.2"
+ local dsfield
+
+ for dsfield in 0 1 2 3 128 252 253 254 255; do
+  do_test_pedit_dsfield "$locus" \
+  "ip dsfield set $dsfield" \
+  ip "ip_tos $dsfield" \
+  "$mz_flags"
+ done
+}
+
+# Run the IPv4 dsfield test on $swp1 ingress and then on $swp2 egress.
+test_ip_dsfield()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip_dsfield "$locus"
+ done
+}
+
+do_test_ip_dscp()
+{
+ local locus=$1; shift
+ local dscp
+
+ for dscp in 0 1 2 3 32 61 62 63; do
+ do_test_pedit_dsfield "$locus" \
+ "ip dsfield set $((dscp << 2)) retain 0xfc" \
+ ip "ip_tos $(((dscp << 2) | 1))" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+# Run the IPv4 DSCP test on $swp1 ingress and then on $swp2 egress.
+test_ip_dscp()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip_dscp "$locus"
+ done
+}
+
+do_test_ip_ecn()
+{
+ local locus=$1; shift
+ local ecn
+
+ for ecn in 0 1 2 3; do
+ do_test_pedit_dsfield "$locus" \
+ "ip dsfield set $ecn retain 0x03" \
+ ip "ip_tos $((124 | $ecn))" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+# Run the IPv4 ECN test on $swp1 ingress and then on $swp2 egress.
+test_ip_ecn()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip_ecn "$locus"
+ done
+}
+
+# IPv4: chain two pedit actions in one rule, one setting the DSCP bits to
+# 124 and one setting the ECN bits to 1, and expect ip_tos 125 on $h2.
+#
+# $1 locus  tc attachment point of the pedit rule
+do_test_ip_dscp_ecn()
+{
+ local locus=$1
+
+ tc filter add $locus handle 101 pref 1 \
+ flower action pedit ex munge ip dsfield set 124 retain 0xfc \
+ action pedit ex munge ip dsfield set 1 retain 0x03
+ tc filter add dev $h2 ingress handle 101 pref 1 prot ip \
+ flower skip_hw ip_tos 125 action pass
+
+ do_test_pedit_dsfield_common "$locus" "set DSCP + set ECN" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+
+ # Remove the probe first, then the pedit rule.
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $locus pref 1
+}
+
+# Run the combined DSCP+ECN test on $swp1 ingress, then on $swp2 egress.
+test_ip_dscp_ecn()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip_dscp_ecn "$locus"
+ done
+}
+
+# IPv6: overwrite the whole traffic class with each sample value and
+# expect exactly that value as ip_tos on $h2.
+#
+# $1 locus  tc attachment point of the pedit rule
+do_test_ip6_dsfield()
+{
+ local locus=$1
+ local mz_flags="-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ local dsfield
+
+ for dsfield in 0 1 2 3 128 252 253 254 255; do
+  do_test_pedit_dsfield "$locus" \
+  "ip6 traffic_class set $dsfield" \
+  ipv6 "ip_tos $dsfield" \
+  "$mz_flags"
+ done
+}
+
+# Run the IPv6 traffic-class test on $swp1 ingress, then on $swp2 egress.
+test_ip6_dsfield()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip6_dsfield "$locus"
+ done
+}
+
+do_test_ip6_dscp()
+{
+ local locus=$1; shift
+ local dscp
+
+ for dscp in 0 1 2 3 32 61 62 63; do
+ do_test_pedit_dsfield "$locus" \
+ "ip6 traffic_class set $((dscp << 2)) retain 0xfc" \
+ ipv6 "ip_tos $(((dscp << 2) | 1))" \
+ "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ done
+}
+
+# Run the IPv6 DSCP test on $swp1 ingress and then on $swp2 egress.
+test_ip6_dscp()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip6_dscp "$locus"
+ done
+}
+
+do_test_ip6_ecn()
+{
+ local locus=$1; shift
+ local ecn
+
+ for ecn in 0 1 2 3; do
+ do_test_pedit_dsfield "$locus" \
+ "ip6 traffic_class set $ecn retain 0x3" \
+ ipv6 "ip_tos $((124 | $ecn))" \
+ "-6 -A 2001:db8:1::1 -B 2001:db8:1::2"
+ done
+}
+
+# Run the IPv6 ECN test on $swp1 ingress and then on $swp2 egress.
+test_ip6_ecn()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_ip6_ecn "$locus"
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/pedit_l4port.sh b/tools/testing/selftests/net/forwarding/pedit_l4port.sh
new file mode 100755
index 000000000..10e594c55
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/pedit_l4port.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on egress of $swp2, the
+# traffic is acted upon by a pedit action. An ingress filter installed on $h2 verifies that the
+# packet looks like expected.
+#
+# +----------------------+                             +----------------------+
+# | H1                   |                             |                   H2 |
+# |    + $h1             |                             |            $h2 +     |
+# |    | 192.0.2.1/28    |                             |   192.0.2.2/28 |     |
+# +----|-----------------+                             +----------------|-----+
+#      |                                                                |
+# +----|----------------------------------------------------------------|-----+
+# | SW |                                                                |     |
+# |  +-|----------------------------------------------------------------|-+   |
+# |  | + $swp1                       BR                           $swp2 + |   |
+# |  +--------------------------------------------------------------------+   |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_udp_sport
+ test_udp_dport
+ test_tcp_sport
+ test_tcp_dport
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+: ${HIT_TIMEOUT:=2000} # ms. NOTE(review): not referenced below; the busywait call reads TC_HIT_TIMEOUT -- confirm which name is intended.
+
+# H1: initialise $h1 with its IPv4 and IPv6 addresses.
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# H1: undo h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# H2: initialise $h2 with its addresses and attach a clsact qdisc, which
+# the tests use for the ingress probe filter.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28 2001:db8:1::2/64
+ tc qdisc add dev $h2 clsact
+}
+
+# H2: remove the clsact qdisc, then undo the interface initialisation.
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28 2001:db8:1::2/64
+}
+
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+}
+
+switch_destroy()
+{
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+# EXIT handler: tear everything down in the reverse order of setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+# IPv6 connectivity check from $h1 to H2's address. Note that this script's
+# ALL_TESTS list does not include it.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:1::2
+}
+
+do_test_pedit_l4port_one()
+{
+ local pedit_locus=$1; shift
+ local pedit_prot=$1; shift
+ local pedit_action=$1; shift
+ local match_prot=$1; shift
+ local match_flower=$1; shift
+ local mz_flags=$1; shift
+ local saddr=$1; shift
+ local daddr=$1; shift
+
+ tc filter add $pedit_locus handle 101 pref 1 \
+ flower action pedit ex munge $pedit_action
+ tc filter add dev $h2 ingress handle 101 pref 1 prot $match_prot \
+ flower skip_hw $match_flower action pass
+
+ RET=0
+
+ $MZ $mz_flags $h1 -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -q -t $pedit_prot sp=54321,dp=12345
+
+ local pkts
+ pkts=$(busywait "$TC_HIT_TIMEOUT" until_counter_is ">= 10" \
+ tc_rule_handle_stats_get "dev $h2 ingress" 101)
+ check_err $? "Expected to get 10 packets, but got $pkts."
+
+ pkts=$(tc_rule_handle_stats_get "$pedit_locus" 101)
+ ((pkts >= 10))
+ check_err $? "Expected to get 10 packets on pedit rule, but got $pkts."
+
+ log_test "$pedit_locus pedit $pedit_action"
+
+ tc filter del dev $h2 ingress pref 1
+ tc filter del $pedit_locus pref 1
+}
+
+do_test_pedit_l4port()
+{
+ local locus=$1; shift
+ local prot=$1; shift
+ local pedit_port=$1; shift
+ local flower_port=$1; shift
+ local port
+
+ for port in 1 11111 65535; do
+ do_test_pedit_l4port_one "$locus" "$prot" \
+ "$prot $pedit_port set $port" \
+ ip "ip_proto $prot $flower_port $port" \
+ "-A 192.0.2.1 -B 192.0.2.2"
+ done
+}
+
+# UDP source port rewrite on $swp1 ingress, then on $swp2 egress.
+test_udp_sport()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_pedit_l4port "$locus" udp sport src_port
+ done
+}
+
+# UDP destination port rewrite on $swp1 ingress, then on $swp2 egress.
+test_udp_dport()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_pedit_l4port "$locus" udp dport dst_port
+ done
+}
+
+# TCP source port rewrite on $swp1 ingress, then on $swp2 egress.
+test_tcp_sport()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_pedit_l4port "$locus" tcp sport src_port
+ done
+}
+
+# TCP destination port rewrite on $swp1 ingress, then on $swp2 egress.
+test_tcp_dport()
+{
+ local locus
+
+ for locus in "dev $swp1 ingress" "dev $swp2 egress"; do
+  do_test_pedit_l4port "$locus" tcp dport dst_port
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router.sh b/tools/testing/selftests/net/forwarding/router.sh
new file mode 100755
index 000000000..057f91b05
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router.sh
@@ -0,0 +1,322 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ sip_in_class_e
+ mc_mac_mismatch
+ ipv4_sip_equal_dip
+ ipv6_sip_equal_dip
+ ipv4_dip_link_local
+"
+
+NUM_NETIFS=4
+source lib.sh
+source tc_common.sh
+
+require_command $MCD
+require_command $MC_CLI
+table_name=selftests
+
+# H1: put $h1 into VRF "vrf-h1", assign 192.0.2.2/24 and 2001:db8:1::2/64,
+# and route the 198.51.100.0/24 and 2001:db8:2::/64 subnets via the .1/::1
+# gateway addresses.
+h1_create()
+{
+ local h1_vrf="vrf-h1"
+
+ vrf_create "$h1_vrf"
+ ip link set dev $h1 master $h1_vrf
+
+ ip link set dev $h1_vrf up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+ ip address add 2001:db8:1::2/64 dev $h1
+
+ ip route add 198.51.100.0/24 vrf $h1_vrf nexthop via 192.0.2.1
+ ip route add 2001:db8:2::/64 vrf $h1_vrf nexthop via 2001:db8:1::1
+}
+
+h1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-h1
+ ip route del 198.51.100.0/24 vrf vrf-h1
+
+ ip address del 2001:db8:1::2/64 dev $h1
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+# H2: put $h2 into VRF "vrf-h2", assign 198.51.100.2/24 and
+# 2001:db8:2::2/64, and route the 192.0.2.0/24 and 2001:db8:1::/64 subnets
+# via the .1/::1 gateway addresses.
+h2_create()
+{
+ local h2_vrf="vrf-h2"
+
+ vrf_create "$h2_vrf"
+ ip link set dev $h2 master $h2_vrf
+
+ ip link set dev $h2_vrf up
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+ ip address add 2001:db8:2::2/64 dev $h2
+
+ ip route add 192.0.2.0/24 vrf $h2_vrf nexthop via 198.51.100.1
+ ip route add 2001:db8:1::/64 vrf $h2_vrf nexthop via 2001:db8:2::1
+}
+
+h2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h2
+ ip route del 192.0.2.0/24 vrf vrf-h2
+
+ ip address del 2001:db8:2::2/64 dev $h2
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+ vrf_destroy "vrf-h2"
+}
+
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+
+ tc qdisc add dev $rp2 clsact
+
+ ip address add 192.0.2.1/24 dev $rp1
+ ip address add 2001:db8:1::1/64 dev $rp1
+
+ ip address add 198.51.100.1/24 dev $rp2
+ ip address add 2001:db8:2::1/64 dev $rp2
+}
+
+# Router: undo router_create in reverse order.
+router_destroy()
+{
+ # H2-facing side.
+ ip address del 2001:db8:2::1/64 dev $rp2
+ ip address del 198.51.100.1/24 dev $rp2
+
+ # H1-facing side.
+ ip address del 2001:db8:1::1/64 dev $rp1
+ ip address del 192.0.2.1/24 dev $rp1
+
+ tc qdisc del dev $rp2 clsact
+
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+start_mcd()
+{
+ SMCROUTEDIR="$(mktemp -d)"
+
+ for ((i = 1; i <= $NUM_NETIFS; ++i)); do
+ echo "phyint ${NETIFS[p$i]} enable" >> \
+ $SMCROUTEDIR/$table_name.conf
+ done
+
+ $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \
+ -P $SMCROUTEDIR/$table_name.pid
+}
+
+# Stop the processes matching $MCD and remove the scratch directory made
+# by start_mcd.
+kill_mcd()
+{
+ pkill $MCD
+
+ # cleanup runs from the EXIT trap and may fire before start_mcd has set
+ # SMCROUTEDIR; never hand rm an empty or word-split path.
+ if [[ -n "$SMCROUTEDIR" ]]; then
+  rm -rf -- "$SMCROUTEDIR"
+ fi
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp1mac=$(mac_get $rp1)
+
+ start_mcd
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT handler: undo setup_prepare in reverse order, stopping the
+# multicast daemon last.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+
+ kill_mcd
+}
+
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.2
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+}
+
+# Send five UDP packets with source IP 240.0.0.1 from $h1 towards H2 and
+# check that all five are counted by an egress filter on $rp2.
+sip_in_class_e()
+{
+ local sip=240.0.0.1
+
+ RET=0
+
+ # Disable rpfilter to prevent packets to be dropped because of it.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.$rp1.rp_filter 0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower src_ip $sip ip_proto udp action pass
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A $sip -b $rp1mac -B 198.51.100.2 -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP in class E"
+
+ # Remove the filter and put the rp_filter settings back.
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ sysctl_restore net.ipv4.conf.$rp1.rp_filter
+ sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+create_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name add $if_name $s_addr $mcast $dest_ifs
+}
+
+delete_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name remove $if_name $s_addr $mcast $dest_ifs
+}
+
+__mc_mac_mismatch()
+{
+ local desc=$1; shift
+ local proto=$1; shift
+ local sip=$1; shift
+ local dip=$1; shift
+ local flags=${1:-""}; shift
+ local dmac=01:02:03:04:05:06
+
+ RET=0
+
+ tc filter add dev $rp2 egress protocol $proto pref 1 handle 101 \
+ flower dst_ip $dip action pass
+
+ create_mcast_sg $rp1 $sip $dip $rp2
+
+ $MZ $flags $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $dmac \
+ -B $dip -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Multicast MAC mismatch: $desc"
+
+ delete_mcast_sg $rp1 $sip $dip $rp2
+ tc filter del dev $rp2 egress protocol $proto pref 1 handle 101 flower
+}
+
+# Run the multicast MAC mismatch check for IPv4 and then for IPv6.
+mc_mac_mismatch()
+{
+ __mc_mac_mismatch "IPv4" "ip" 192.0.2.2 225.1.2.3
+ __mc_mac_mismatch "IPv6" "ipv6" 2001:db8:1::2 ff0e::3 "-6"
+}
+
+# Send five IPv4 UDP packets whose source and destination are both
+# 198.51.100.2 and check that all five are counted by an egress filter on
+# $rp2.
+ipv4_sip_equal_dip()
+{
+ local addr=198.51.100.2
+
+ RET=0
+
+ # Disable rpfilter to prevent packets to be dropped because of it.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.$rp1.rp_filter 0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower src_ip $addr action pass
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A $addr -b $rp1mac -B $addr -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP is equal to destination IP: IPv4"
+
+ # Remove the filter and put the rp_filter settings back.
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ sysctl_restore net.ipv4.conf.$rp1.rp_filter
+ sysctl_restore net.ipv4.conf.all.rp_filter
+}
+
+# Send five IPv6 UDP packets whose source and destination are both
+# 2001:db8:2::2 and check that all five are counted by an egress filter
+# on $rp2.
+ipv6_sip_equal_dip()
+{
+ local addr=2001:db8:2::2
+
+ RET=0
+
+ tc filter add dev $rp2 egress protocol ipv6 pref 1 handle 101 \
+ flower src_ip $addr action pass
+
+ $MZ -6 $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec \
+ -A $addr -b $rp1mac -B $addr -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "Source IP is equal to destination IP: IPv6"
+
+ tc filter del dev $rp2 egress protocol ipv6 pref 1 handle 101 flower
+}
+
+# Send five UDP packets to the link-local destination 169.254.1.1, made
+# reachable through $rp2 by a static neighbour entry and a /24 route, and
+# check that all five are counted by an egress filter on $rp2.
+ipv4_dip_link_local()
+{
+ local dip=169.254.1.1
+ local lladdr=00:11:22:33:44:55
+
+ RET=0
+
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 \
+ flower dst_ip $dip action pass
+
+ ip neigh add $dip lladdr $lladdr dev $rp2
+ ip route add 169.254.1.0/24 dev $rp2
+
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 5 -d 1msec -b $rp1mac -B $dip -q
+
+ tc_check_packets "dev $rp2 egress" 101 5
+ check_err $? "Packets were dropped"
+
+ log_test "IPv4 destination IP is link-local"
+
+ # Teardown in the reverse order of the setup above.
+ ip route del 169.254.1.0/24 dev $rp2
+ ip neigh del $dip lladdr $lladdr dev $rp2
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_bridge.sh b/tools/testing/selftests/net/forwarding/router_bridge.sh
new file mode 100755
index 000000000..ebc596a27
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_bridge.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+"
+NUM_NETIFS=4
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28 2001:db8:1::1/64
+ ip -4 route add 192.0.2.128/28 vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+# H1: remove the routes, then undo the interface initialisation.
+h1_destroy()
+{
+ local vrf=v$h1
+
+ ip -6 route del 2001:db8:2::/64 vrf $vrf
+ ip -4 route del 192.0.2.128/28 vrf $vrf
+ simple_if_fini $h1 192.0.2.1/28 2001:db8:1::1/64
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.130/28 2001:db8:2::2/64
+ ip -4 route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.129
+ ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::1
+}
+
+# H2: remove the routes, then undo the interface initialisation.
+h2_destroy()
+{
+ local vrf=v$h2
+
+ ip -6 route del 2001:db8:1::/64 vrf $vrf
+ ip -4 route del 192.0.2.0/28 vrf $vrf
+ simple_if_fini $h2 192.0.2.130/28 2001:db8:2::2/64
+}
+
+router_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ __addr_add_del br1 add 192.0.2.2/28 2001:db8:1::2/64
+
+ ip link set dev $swp2 up
+ __addr_add_del $swp2 add 192.0.2.129/28 2001:db8:2::1/64
+}
+
+# Router: undo router_create in reverse order.
+router_destroy()
+{
+ # H2-facing side.
+ __addr_add_del $swp2 del 192.0.2.129/28 2001:db8:2::1/64
+ ip link set dev $swp2 down
+
+ # H1-facing side: addresses, port, then the bridge.
+ __addr_add_del br1 del 192.0.2.2/28 2001:db8:1::2/64
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ ip link del dev br1
+}
+
+# Map the four test ports onto their roles, build hosts and router, and
+# enable forwarding.
+setup_prepare()
+{
+ # Port pairs: p1-p2 (h1/swp1) and p3-p4 (swp2/h2).
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT handler: undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.130
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_bridge_vlan.sh b/tools/testing/selftests/net/forwarding/router_bridge_vlan.sh
new file mode 100755
index 000000000..fa6a88c50
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_bridge_vlan.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ ping_ipv6
+ vlan
+"
+NUM_NETIFS=4
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+ vlan_create $h1 555 v$h1 192.0.2.1/28 2001:db8:1::1/64
+ ip -4 route add 192.0.2.128/28 vrf v$h1 nexthop via 192.0.2.2
+ ip -6 route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::2
+}
+
+# H1: remove the routes, the VLAN 555 device and then the interface setup.
+h1_destroy()
+{
+ local vrf=v$h1
+
+ ip -6 route del 2001:db8:2::/64 vrf $vrf
+ ip -4 route del 192.0.2.128/28 vrf $vrf
+ vlan_destroy $h1 555
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.130/28 2001:db8:2::2/64
+ ip -4 route add 192.0.2.0/28 vrf v$h2 nexthop via 192.0.2.129
+ ip -6 route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::1
+}
+
+# H2: remove the routes, then undo the interface initialisation.
+h2_destroy()
+{
+ local vrf=v$h2
+
+ ip -6 route del 2001:db8:1::/64 vrf $vrf
+ ip -4 route del 192.0.2.0/28 vrf $vrf
+ simple_if_fini $h2 192.0.2.130/28 2001:db8:2::2/64
+}
+
+router_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1
+ ip link set dev br1 up
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+
+ bridge vlan add dev br1 vid 555 self pvid untagged
+ bridge vlan add dev $swp1 vid 555
+
+ __addr_add_del br1 add 192.0.2.2/28 2001:db8:1::2/64
+
+ ip link set dev $swp2 up
+ __addr_add_del $swp2 add 192.0.2.129/28 2001:db8:2::1/64
+}
+
+# Router: remove the addresses, release $swp1 and delete br1.
+router_destroy()
+{
+ # H2-facing side.
+ __addr_add_del $swp2 del 192.0.2.129/28 2001:db8:2::1/64
+ ip link set dev $swp2 down
+
+ # H1-facing side: addresses, port, then the bridge.
+ __addr_add_del br1 del 192.0.2.2/28 2001:db8:1::2/64
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ ip link del dev br1
+}
+
+# Map the four test ports onto their roles, build hosts and router, and
+# enable forwarding.
+setup_prepare()
+{
+ # Port pairs: p1-p2 (h1/swp1) and p3-p4 (swp2/h2).
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT handler: undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+vlan()
+{
+ RET=0
+
+ bridge vlan add dev br1 vid 333 self
+ check_err $? "Can't add a non-PVID VLAN"
+ bridge vlan del dev br1 vid 333 self
+ check_err $? "Can't remove a non-PVID VLAN"
+
+ log_test "vlan"
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.130
+}
+
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh b/tools/testing/selftests/net/forwarding/router_broadcast.sh
new file mode 100755
index 000000000..4eac0a06f
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh
@@ -0,0 +1,237 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4"
+NUM_NETIFS=6
+source lib.sh
+
+# H1: put $h1 into VRF "vrf-h1", assign 192.0.2.2/24 and route the two
+# other test subnets via 192.0.2.1.
+h1_create()
+{
+ local h1_vrf="vrf-h1"
+ local subnet
+
+ vrf_create "$h1_vrf"
+ ip link set dev $h1 master $h1_vrf
+
+ ip link set dev $h1_vrf up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+
+ for subnet in 198.51.100.0/24 198.51.200.0/24; do
+  ip route add $subnet vrf $h1_vrf nexthop via 192.0.2.1
+ done
+}
+
+h1_destroy()
+{
+ ip route del 198.51.200.0/24 vrf vrf-h1
+ ip route del 198.51.100.0/24 vrf vrf-h1
+
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+# H2: put $h2 into VRF "vrf-h2", assign 198.51.100.2/24 and route the two
+# other test subnets via 198.51.100.1.
+h2_create()
+{
+ local h2_vrf="vrf-h2"
+ local subnet
+
+ vrf_create "$h2_vrf"
+ ip link set dev $h2 master $h2_vrf
+
+ ip link set dev $h2_vrf up
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+
+ for subnet in 192.0.2.0/24 198.51.200.0/24; do
+  ip route add $subnet vrf $h2_vrf nexthop via 198.51.100.1
+ done
+}
+
+h2_destroy()
+{
+ ip route del 198.51.200.0/24 vrf vrf-h2
+ ip route del 192.0.2.0/24 vrf vrf-h2
+
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+ vrf_destroy "vrf-h2"
+}
+
+# H3: put $h3 into VRF "vrf-h3", assign 198.51.200.2/24 and route the two
+# other test subnets via 198.51.200.1.
+h3_create()
+{
+ local h3_vrf="vrf-h3"
+ local subnet
+
+ vrf_create "$h3_vrf"
+ ip link set dev $h3 master $h3_vrf
+
+ ip link set dev $h3_vrf up
+ ip link set dev $h3 up
+
+ ip address add 198.51.200.2/24 dev $h3
+
+ for subnet in 192.0.2.0/24 198.51.100.0/24; do
+  ip route add $subnet vrf $h3_vrf nexthop via 198.51.200.1
+ done
+}
+
+h3_destroy()
+{
+ ip route del 198.51.100.0/24 vrf vrf-h3
+ ip route del 192.0.2.0/24 vrf vrf-h3
+
+ ip address del 198.51.200.2/24 dev $h3
+
+ ip link set dev $h3 down
+ vrf_destroy "vrf-h3"
+}
+
+# Router: bring up the three router ports and give each the .1 address of
+# its subnet.
+router_create()
+{
+ local port
+
+ for port in $rp1 $rp2 $rp3; do
+  ip link set dev $port up
+ done
+
+ ip address add 192.0.2.1/24 dev $rp1
+
+ ip address add 198.51.100.1/24 dev $rp2
+ ip address add 198.51.200.1/24 dev $rp3
+}
+
+# Router: undo router_create in reverse order.
+router_destroy()
+{
+ local port
+
+ ip address del 198.51.200.1/24 dev $rp3
+ ip address del 198.51.100.1/24 dev $rp2
+
+ ip address del 192.0.2.1/24 dev $rp1
+
+ for port in $rp3 $rp2 $rp1; do
+  ip link set dev $port down
+ done
+}
+
+# Map the six test ports onto their roles, build the three hosts and the
+# router, and enable forwarding.
+setup_prepare()
+{
+ # Port pairs: p1-p2 (h1/rp1), p3-p4 (rp2/h2), p5-p6 (rp3/h3).
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT handler: undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Set bc_forwarding to 0 for "all", $rp1 and $rp2.
+bc_forwarding_disable()
+{
+ local conf
+
+ for conf in all $rp1 $rp2; do
+  sysctl_set net.ipv4.conf.$conf.bc_forwarding 0
+ done
+}
+
+# Set bc_forwarding to 1 for "all", $rp1 and $rp2.
+bc_forwarding_enable()
+{
+ local conf
+
+ for conf in all $rp1 $rp2; do
+  sysctl_set net.ipv4.conf.$conf.bc_forwarding 1
+ done
+}
+
+# Restore the bc_forwarding values saved by the last enable/disable call,
+# in the reverse order they were set.
+bc_forwarding_restore()
+{
+ local conf
+
+ for conf in $rp2 $rp1 all; do
+  sysctl_restore net.ipv4.conf.$conf.bc_forwarding
+ done
+}
+
+# Ping a broadcast address and check who answers.
+#
+# $1 oif   interface to ping from; the ping runs inside its VRF
+# $2 dip   destination address
+# $3 from  address a reply is looked for from
+# $4 fail  optional, defaults to 0; handed to check_err_fail together with
+#          the status of the grep for "bytes from $from"
+ping_test_from()
+{
+ local oif=$1 dip=$2 from=$3
+ local fail=${4:-0}
+
+ RET=0
+
+ log_info "ping $dip, expected reply from $from"
+ # The pipeline status is grep's: 0 only when a reply from $from showed up.
+ ip vrf exec $(master_name_get $oif) \
+ $PING -I $oif $dip -c 10 -i 0.1 -w $PING_TIMEOUT -b 2>&1 \
+ | grep "bytes from $from" > /dev/null
+ check_err_fail $fail $?
+}
+
+# Broadcast ping matrix from H1 and H2, first with bc_forwarding disabled
+# on the router and then with it enabled. Replies to broadcast echo
+# requests are enabled for the duration of the test.
+ping_ipv4()
+{
+ sysctl_set net.ipv4.icmp_echo_ignore_broadcasts 0
+
+ # Phase 1: bc_forwarding off; every reply is looked for from the router.
+ bc_forwarding_disable
+ log_info "bc_forwarding disabled on r1 =>"
+ ping_test_from $h1 198.51.100.255 192.0.2.1
+ log_test "h1 -> net2: reply from r1 (not forwarding)"
+ ping_test_from $h1 198.51.200.255 192.0.2.1
+ log_test "h1 -> net3: reply from r1 (not forwarding)"
+ ping_test_from $h1 192.0.2.255 192.0.2.1
+ log_test "h1 -> net1: reply from r1 (not dropping)"
+ ping_test_from $h1 255.255.255.255 192.0.2.1
+ log_test "h1 -> 255.255.255.255: reply from r1 (not forwarding)"
+
+ ping_test_from $h2 192.0.2.255 198.51.100.1
+ log_test "h2 -> net1: reply from r1 (not forwarding)"
+ ping_test_from $h2 198.51.200.255 198.51.100.1
+ log_test "h2 -> net3: reply from r1 (not forwarding)"
+ ping_test_from $h2 198.51.100.255 198.51.100.1
+ log_test "h2 -> net2: reply from r1 (not dropping)"
+ ping_test_from $h2 255.255.255.255 198.51.100.1
+ log_test "h2 -> 255.255.255.255: reply from r1 (not forwarding)"
+ bc_forwarding_restore
+
+ # Phase 2: bc_forwarding on; remote-subnet replies are looked for from
+ # the hosts behind the router, and the local-subnet cases pass fail=1.
+ bc_forwarding_enable
+ log_info "bc_forwarding enabled on r1 =>"
+ ping_test_from $h1 198.51.100.255 198.51.100.2
+ log_test "h1 -> net2: reply from h2 (forwarding)"
+ ping_test_from $h1 198.51.200.255 198.51.200.2
+ log_test "h1 -> net3: reply from h3 (forwarding)"
+ ping_test_from $h1 192.0.2.255 192.0.2.1 1
+ log_test "h1 -> net1: no reply (dropping)"
+ ping_test_from $h1 255.255.255.255 192.0.2.1
+ log_test "h1 -> 255.255.255.255: reply from r1 (not forwarding)"
+
+ ping_test_from $h2 192.0.2.255 192.0.2.2
+ log_test "h2 -> net1: reply from h1 (forwarding)"
+ ping_test_from $h2 198.51.200.255 198.51.200.2
+ log_test "h2 -> net3: reply from h3 (forwarding)"
+ ping_test_from $h2 198.51.100.255 198.51.100.1 1
+ log_test "h2 -> net2: no reply (dropping)"
+ ping_test_from $h2 255.255.255.255 198.51.100.1
+ log_test "h2 -> 255.255.255.255: reply from r1 (not forwarding)"
+ bc_forwarding_restore
+
+ sysctl_restore net.ipv4.icmp_echo_ignore_broadcasts
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_mpath_nh.sh b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
new file mode 100755
index 000000000..7fcc42bc0
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_mpath_nh.sh
@@ -0,0 +1,359 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test"
+NUM_NETIFS=8
+source lib.sh
+
+# Set up host H1: enslave $h1 to VRF vrf-h1, bring both up, assign
+# 192.0.2.2/24 and 2001:db8:1::2/64, and route the H2 subnets via the
+# router addresses 192.0.2.1 / 2001:db8:1::1.
+h1_create()
+{
+ vrf_create "vrf-h1"
+ ip link set dev $h1 master vrf-h1
+
+ ip link set dev vrf-h1 up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+ ip address add 2001:db8:1::2/64 dev $h1
+
+ ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+ ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1
+}
+
+# Undo h1_create in reverse order: routes, addresses, link state, VRF.
+h1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-h1
+ ip route del 198.51.100.0/24 vrf vrf-h1
+
+ ip address del 2001:db8:1::2/64 dev $h1
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+# Set up host H2: enslave $h2 to VRF vrf-h2, bring both up, assign
+# 198.51.100.2/24 and 2001:db8:2::2/64, and route the H1 subnets via the
+# router addresses 198.51.100.1 / 2001:db8:2::1.
+h2_create()
+{
+ vrf_create "vrf-h2"
+ ip link set dev $h2 master vrf-h2
+
+ ip link set dev vrf-h2 up
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+ ip address add 2001:db8:2::2/64 dev $h2
+
+ ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+ ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+}
+
+# Undo h2_create in reverse order: routes, addresses, link state, VRF.
+h2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h2
+ ip route del 192.0.2.0/24 vrf vrf-h2
+
+ ip address del 2001:db8:2::2/64 dev $h2
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+ vrf_destroy "vrf-h2"
+}
+
+# Set up router 1: enslave $rp11, $rp12 and $rp13 to VRF vrf-r1, bring
+# everything up and assign addresses. $rp11 faces H1's subnet; $rp12 and
+# $rp13 carry the two inter-router links. No routes are added here; the
+# routes towards H2 are installed by routing_nh_obj.
+router1_create()
+{
+ vrf_create "vrf-r1"
+ ip link set dev $rp11 master vrf-r1
+ ip link set dev $rp12 master vrf-r1
+ ip link set dev $rp13 master vrf-r1
+
+ ip link set dev vrf-r1 up
+ ip link set dev $rp11 up
+ ip link set dev $rp12 up
+ ip link set dev $rp13 up
+
+ ip address add 192.0.2.1/24 dev $rp11
+ ip address add 2001:db8:1::1/64 dev $rp11
+
+ ip address add 169.254.2.12/24 dev $rp12
+ ip address add fe80:2::12/64 dev $rp12
+
+ ip address add 169.254.3.13/24 dev $rp13
+ ip address add fe80:3::13/64 dev $rp13
+}
+
+# Tear down router 1: routes, addresses, nexthop objects, link state and
+# finally the VRF.
+router1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-r1
+ ip route del 198.51.100.0/24 vrf vrf-r1
+
+ ip address del fe80:3::13/64 dev $rp13
+ ip address del 169.254.3.13/24 dev $rp13
+
+ ip address del fe80:2::12/64 dev $rp12
+ ip address del 169.254.2.12/24 dev $rp12
+
+ ip address del 2001:db8:1::1/64 dev $rp11
+ ip address del 192.0.2.1/24 dev $rp11
+
+ # Each group (103, 106) is deleted before its member nexthops.
+ ip nexthop del id 103
+ ip nexthop del id 101
+ ip nexthop del id 102
+ ip nexthop del id 106
+ ip nexthop del id 104
+ ip nexthop del id 105
+
+ ip link set dev $rp13 down
+ ip link set dev $rp12 down
+ ip link set dev $rp11 down
+
+ vrf_destroy "vrf-r1"
+}
+
+# Set up router 2: enslave $rp21, $rp22 and $rp23 to VRF vrf-r2, bring
+# everything up and assign addresses. $rp21 faces H2's subnet; $rp22 and
+# $rp23 carry the two inter-router links. No routes are added here; the
+# routes towards H1 are installed by routing_nh_obj.
+router2_create()
+{
+ vrf_create "vrf-r2"
+ ip link set dev $rp21 master vrf-r2
+ ip link set dev $rp22 master vrf-r2
+ ip link set dev $rp23 master vrf-r2
+
+ ip link set dev vrf-r2 up
+ ip link set dev $rp21 up
+ ip link set dev $rp22 up
+ ip link set dev $rp23 up
+
+ ip address add 198.51.100.1/24 dev $rp21
+ ip address add 2001:db8:2::1/64 dev $rp21
+
+ ip address add 169.254.2.22/24 dev $rp22
+ ip address add fe80:2::22/64 dev $rp22
+
+ ip address add 169.254.3.23/24 dev $rp23
+ ip address add fe80:3::23/64 dev $rp23
+}
+
+# Tear down router 2: routes, addresses, nexthop objects, link state and
+# finally the VRF.
+router2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-r2
+ ip route del 192.0.2.0/24 vrf vrf-r2
+
+ ip address del fe80:3::23/64 dev $rp23
+ ip address del 169.254.3.23/24 dev $rp23
+
+ ip address del fe80:2::22/64 dev $rp22
+ ip address del 169.254.2.22/24 dev $rp22
+
+ ip address del 2001:db8:2::1/64 dev $rp21
+ ip address del 198.51.100.1/24 dev $rp21
+
+ # Delete each group (203, 206) explicitly before its members, the
+ # same way router1_destroy handles groups 103 and 106, instead of
+ # relying on the groups going away with their last member.
+ ip nexthop del id 203
+ ip nexthop del id 201
+ ip nexthop del id 202
+ ip nexthop del id 206
+ ip nexthop del id 204
+ ip nexthop del id 205
+
+ ip link set dev $rp23 down
+ ip link set dev $rp22 down
+ ip link set dev $rp21 down
+
+ vrf_destroy "vrf-r2"
+}
+
+# Create the nexthop objects and the routes that reference them.
+# For each router and address family: two gateway nexthops (one per
+# inter-router link), one group combining them, and a route to the remote
+# host subnet that points at the group via "nhid".
+routing_nh_obj()
+{
+ # Router 1, IPv4: group 103 = {101, 102}, used for H2's subnet.
+ ip nexthop add id 101 via 169.254.2.22 dev $rp12
+ ip nexthop add id 102 via 169.254.3.23 dev $rp13
+ ip nexthop add id 103 group 101/102
+ ip route add 198.51.100.0/24 vrf vrf-r1 nhid 103
+
+ # Router 1, IPv6: group 106 = {104, 105}.
+ ip nexthop add id 104 via fe80:2::22 dev $rp12
+ ip nexthop add id 105 via fe80:3::23 dev $rp13
+ ip nexthop add id 106 group 104/105
+ ip route add 2001:db8:2::/64 vrf vrf-r1 nhid 106
+
+ # Router 2, IPv4: group 203 = {201, 202}, used for H1's subnet.
+ ip nexthop add id 201 via 169.254.2.12 dev $rp22
+ ip nexthop add id 202 via 169.254.3.13 dev $rp23
+ ip nexthop add id 203 group 201/202
+ ip route add 192.0.2.0/24 vrf vrf-r2 nhid 203
+
+ # Router 2, IPv6: group 206 = {204, 205}.
+ ip nexthop add id 204 via fe80:2::12 dev $rp22
+ ip nexthop add id 205 via fe80:3::13 dev $rp23
+ ip nexthop add id 206 group 204/205
+ ip route add 2001:db8:1::/64 vrf vrf-r2 nhid 206
+}
+
+multipath4_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp12 and rp13)
+ # according to the configured weights.
+ sysctl_set net.ipv4.fib_multipath_hash_policy 1
+ ip nexthop replace id 103 group 101,$weight_rp12/102,$weight_rp13
+
+ t0_rp12=$(link_stats_tx_packets_get $rp12)
+ t0_rp13=$(link_stats_tx_packets_get $rp13)
+
+ ip vrf exec vrf-h1 $MZ $h1 -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
+ -d 1msec -t udp "sp=1024,dp=0-32768"
+
+ t1_rp12=$(link_stats_tx_packets_get $rp12)
+ t1_rp13=$(link_stats_tx_packets_get $rp13)
+
+ let "packets_rp12 = $t1_rp12 - $t0_rp12"
+ let "packets_rp13 = $t1_rp13 - $t0_rp13"
+ multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13
+
+ # Restore settings.
+ ip nexthop replace id 103 group 101/102
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+multipath6_l4_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp12 and rp13)
+ # according to the configured weights.
+ sysctl_set net.ipv6.fib_multipath_hash_policy 1
+
+ ip nexthop replace id 106 group 104,$weight_rp12/105,$weight_rp13
+
+ t0_rp12=$(link_stats_tx_packets_get $rp12)
+ t0_rp13=$(link_stats_tx_packets_get $rp13)
+
+ $MZ $h1 -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
+ -d 1msec -t udp "sp=1024,dp=0-32768"
+
+ t1_rp12=$(link_stats_tx_packets_get $rp12)
+ t1_rp13=$(link_stats_tx_packets_get $rp13)
+
+ let "packets_rp12 = $t1_rp12 - $t0_rp12"
+ let "packets_rp13 = $t1_rp13 - $t0_rp13"
+ multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13
+
+ ip nexthop replace id 106 group 104/105
+
+ sysctl_restore net.ipv6.fib_multipath_hash_policy
+}
+
+multipath6_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+
+ ip nexthop replace id 106 group 104,$weight_rp12/105,$weight_rp13
+
+ t0_rp12=$(link_stats_tx_packets_get $rp12)
+ t0_rp13=$(link_stats_tx_packets_get $rp13)
+
+ # Generate 16384 echo requests, each with a random flow label.
+ for _ in $(seq 1 16384); do
+ ip vrf exec vrf-h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q >/dev/null 2>&1
+ done
+
+ t1_rp12=$(link_stats_tx_packets_get $rp12)
+ t1_rp13=$(link_stats_tx_packets_get $rp13)
+
+ let "packets_rp12 = $t1_rp12 - $t0_rp12"
+ let "packets_rp13 = $t1_rp13 - $t0_rp13"
+ multipath_eval "$desc" $weight_rp12 $weight_rp13 $packets_rp12 $packets_rp13
+
+ ip nexthop replace id 106 group 104/105
+}
+
+# Run every multipath variant (IPv4, IPv6, IPv6 with L4 hashing) with
+# weight ratios 1:1, 2:1 and 11:45 for $rp12:$rp13.
+multipath_test()
+{
+ log_info "Running IPv4 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+
+ log_info "Running IPv6 multipath tests"
+ multipath6_test "ECMP" 1 1
+ multipath6_test "Weighted MP 2:1" 2 1
+ multipath6_test "Weighted MP 11:45" 11 45
+
+ log_info "Running IPv6 L4 hash multipath tests"
+ multipath6_l4_test "ECMP" 1 1
+ multipath6_l4_test "Weighted MP 2:1" 2 1
+ multipath6_l4_test "Weighted MP 11:45" 11 45
+}
+
+# Map the eight test ports to their roles and build the topology:
+#   $h1 -- $rp11 [router 1] $rp12 -- $rp22 [router 2] $rp21 -- $h2
+#                           $rp13 -- $rp23
+# Nexthop objects are not created here: the script calls routing_nh_obj
+# itself right after setup_wait, and running it a second time only
+# repeats "add" commands for ids and routes that already exist.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp11=${NETIFS[p2]}
+
+ rp12=${NETIFS[p3]}
+ rp22=${NETIFS[p4]}
+
+ rp13=${NETIFS[p5]}
+ rp23=${NETIFS[p6]}
+
+ rp21=${NETIFS[p7]}
+ h2=${NETIFS[p8]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router1_create
+ router2_create
+
+ forwarding_enable
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order (forwarding
+# setting, routers, hosts, VRF infrastructure).
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router2_destroy
+ router1_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Ping H2's IPv4 address (198.51.100.2) from $h1.
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.2
+}
+
+# Ping H2's IPv6 address (2001:db8:2::2) from $h1.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+}
+
+# Skip the whole script when "ip nexthop" is not usable on this system.
+if ! ip nexthop ls >/dev/null 2>&1; then
+ echo "Nexthop objects not supported; skipping tests"
+ # Report "skipped" rather than "passed": 4 is the kselftest skip
+ # code; ksft_skip is honoured if the sourced library defines it.
+ exit "${ksft_skip:-4}"
+fi
+
+# Run cleanup on every exit path from here on.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+# Install the nexthop objects and the routes that use them.
+routing_nh_obj
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_multicast.sh b/tools/testing/selftests/net/forwarding/router_multicast.sh
new file mode 100755
index 000000000..57e90c873
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_multicast.sh
@@ -0,0 +1,416 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +------------------+
+# | H1 (v$h1)        |
+# | 2001:db8:1::2/64 |
+# | 198.51.100.2/28  |
+# |         $h1 +    |
+# +-------------|----+
+#               |
+# +-------------|-------------------------------+
+# | SW1         |                               |
+# |        $rp1 +                               |
+# | 198.51.100.1/28                             |
+# | 2001:db8:1::1/64                            |
+# |                                             |
+# | 2001:db8:2::1/64           2001:db8:3::1/64 |
+# | 198.51.100.17/28           198.51.100.33/28 |
+# |         $rp2 +                     $rp3 +   |
+# +--------------|--------------------------|---+
+#                |                          |
+#                |                          |
+# +--------------|---+       +--------------|---+
+# | H2 (v$h2)    |   |       | H3 (v$h3)    |   |
+# |          $h2 +   |       |          $h3 +   |
+# | 198.51.100.18/28 |       | 198.51.100.34/28 |
+# | 2001:db8:2::2/64 |       | 2001:db8:3::2/64 |
+# +------------------+       +------------------+
+#
+
+ALL_TESTS="mcast_v4 mcast_v6 rpf_v4 rpf_v6"
+NUM_NETIFS=6
+source lib.sh
+source tc_common.sh
+
+require_command $MCD
+require_command $MC_CLI
+table_name=selftests
+
+# Set up host H1: initialise $h1 with its addresses, route the H2 and H3
+# subnets (in VRF v$h1) via the router, and add an ingress qdisc so the
+# tests can attach tc filters.
+h1_create()
+{
+ simple_if_init $h1 198.51.100.2/28 2001:db8:1::2/64
+
+ ip route add 198.51.100.16/28 vrf v$h1 nexthop via 198.51.100.1
+ ip route add 198.51.100.32/28 vrf v$h1 nexthop via 198.51.100.1
+
+ ip route add 2001:db8:2::/64 vrf v$h1 nexthop via 2001:db8:1::1
+ ip route add 2001:db8:3::/64 vrf v$h1 nexthop via 2001:db8:1::1
+
+ tc qdisc add dev $h1 ingress
+}
+
+# Undo h1_create in reverse order: ingress qdisc, routes, interface.
+h1_destroy()
+{
+ tc qdisc del dev $h1 ingress
+
+ ip route del 2001:db8:3::/64 vrf v$h1
+ ip route del 2001:db8:2::/64 vrf v$h1
+
+ ip route del 198.51.100.32/28 vrf v$h1
+ ip route del 198.51.100.16/28 vrf v$h1
+
+ simple_if_fini $h1 198.51.100.2/28 2001:db8:1::2/64
+}
+
+# Set up host H2: initialise $h2 with its addresses, route the H1 and H3
+# subnets (in VRF v$h2) via the router, and add an ingress qdisc so the
+# tests can attach tc filters.
+h2_create()
+{
+ simple_if_init $h2 198.51.100.18/28 2001:db8:2::2/64
+
+ ip route add 198.51.100.0/28 vrf v$h2 nexthop via 198.51.100.17
+ ip route add 198.51.100.32/28 vrf v$h2 nexthop via 198.51.100.17
+
+ ip route add 2001:db8:1::/64 vrf v$h2 nexthop via 2001:db8:2::1
+ ip route add 2001:db8:3::/64 vrf v$h2 nexthop via 2001:db8:2::1
+
+ tc qdisc add dev $h2 ingress
+}
+
+# Undo h2_create in reverse order: ingress qdisc, routes, interface.
+h2_destroy()
+{
+ tc qdisc del dev $h2 ingress
+
+ ip route del 2001:db8:3::/64 vrf v$h2
+ ip route del 2001:db8:1::/64 vrf v$h2
+
+ ip route del 198.51.100.32/28 vrf v$h2
+ ip route del 198.51.100.0/28 vrf v$h2
+
+ simple_if_fini $h2 198.51.100.18/28 2001:db8:2::2/64
+}
+
+# Set up host H3: initialise $h3 with its addresses, route the H1 and H2
+# subnets (in VRF v$h3) via the router, and add an ingress qdisc so the
+# tests can attach tc filters.
+h3_create()
+{
+ simple_if_init $h3 198.51.100.34/28 2001:db8:3::2/64
+
+ ip route add 198.51.100.0/28 vrf v$h3 nexthop via 198.51.100.33
+ ip route add 198.51.100.16/28 vrf v$h3 nexthop via 198.51.100.33
+
+ ip route add 2001:db8:1::/64 vrf v$h3 nexthop via 2001:db8:3::1
+ ip route add 2001:db8:2::/64 vrf v$h3 nexthop via 2001:db8:3::1
+
+ tc qdisc add dev $h3 ingress
+}
+
+# Undo h3_create in reverse order: ingress qdisc, routes, interface.
+h3_destroy()
+{
+ tc qdisc del dev $h3 ingress
+
+ ip route del 2001:db8:2::/64 vrf v$h3
+ ip route del 2001:db8:1::/64 vrf v$h3
+
+ ip route del 198.51.100.16/28 vrf v$h3
+ ip route del 198.51.100.0/28 vrf v$h3
+
+ simple_if_fini $h3 198.51.100.34/28 2001:db8:3::2/64
+}
+
+# Set up the router ports: bring $rp1..$rp3 up, assign one IPv4 and one
+# IPv6 address per port, and add an ingress qdisc on $rp3 (used by the
+# RPF tests to count packets arriving there).
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+ ip link set dev $rp3 up
+
+ ip address add 198.51.100.1/28 dev $rp1
+ ip address add 198.51.100.17/28 dev $rp2
+ ip address add 198.51.100.33/28 dev $rp3
+
+ ip address add 2001:db8:1::1/64 dev $rp1
+ ip address add 2001:db8:2::1/64 dev $rp2
+ ip address add 2001:db8:3::1/64 dev $rp3
+
+ tc qdisc add dev $rp3 ingress
+}
+
+# Undo router_create in reverse order: qdisc, addresses, link state.
+router_destroy()
+{
+ tc qdisc del dev $rp3 ingress
+
+ ip address del 2001:db8:3::1/64 dev $rp3
+ ip address del 2001:db8:2::1/64 dev $rp2
+ ip address del 2001:db8:1::1/64 dev $rp1
+
+ ip address del 198.51.100.33/28 dev $rp3
+ ip address del 198.51.100.17/28 dev $rp2
+ ip address del 198.51.100.1/28 dev $rp1
+
+ ip link set dev $rp3 down
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+start_mcd()
+{
+ SMCROUTEDIR="$(mktemp -d)"
+
+ for ((i = 1; i <= $NUM_NETIFS; ++i)); do
+ echo "phyint ${NETIFS[p$i]} enable" >> \
+ $SMCROUTEDIR/$table_name.conf
+ done
+
+ $MCD -N -I $table_name -f $SMCROUTEDIR/$table_name.conf \
+ -P $SMCROUTEDIR/$table_name.pid
+}
+
+kill_mcd()
+{
+ pkill $MCD
+ rm -rf $SMCROUTEDIR
+}
+
+# Map the six test ports to their roles ($h1/$rp1, $rp2/$h2, $rp3/$h3),
+# start the multicast routing daemon, then configure hosts and router
+# and enable forwarding.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ start_mcd
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order and finally stop
+# the multicast routing daemon.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+
+ kill_mcd
+}
+
+create_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name add $if_name $s_addr $mcast $dest_ifs
+}
+
+delete_mcast_sg()
+{
+ local if_name=$1; shift
+ local s_addr=$1; shift
+ local mcast=$1; shift
+ local dest_ifs=${@}
+
+ $MC_CLI -I $table_name remove $if_name $s_addr $mcast $dest_ifs
+}
+
+# IPv4 multicast forwarding test.
+mcast_v4()
+{
+ # Add two interfaces to an MC group, send a packet to the MC group and
+ # verify packets are received on both. Then delete the route and verify
+ # packets are no longer received.
+
+ RET=0
+
+ # Counting filters on the receivers: handle 122 on $h2, 133 on $h3.
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 122 flower \
+ dst_ip 225.1.2.3 action drop
+ tc filter add dev $h3 ingress protocol ip pref 1 handle 133 flower \
+ dst_ip 225.1.2.3 action drop
+
+ create_mcast_sg $rp1 198.51.100.2 225.1.2.3 $rp2 $rp3
+
+ # Send frames with the corresponding L2 destination address.
+ $MZ $h1 -c 5 -p 128 -t udp -a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+ -A 198.51.100.2 -B 225.1.2.3 -q
+
+ tc_check_packets "dev $h2 ingress" 122 5
+ check_err $? "Multicast not received on first host"
+ tc_check_packets "dev $h3 ingress" 133 5
+ check_err $? "Multicast not received on second host"
+
+ delete_mcast_sg $rp1 198.51.100.2 225.1.2.3 $rp2 $rp3
+
+ $MZ $h1 -c 5 -p 128 -t udp -a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+ -A 198.51.100.2 -B 225.1.2.3 -q
+
+ # The counters must still read 5: nothing from the second burst may
+ # have reached the hosts.
+ tc_check_packets "dev $h2 ingress" 122 5
+ check_err $? "Multicast received on host although deleted"
+ tc_check_packets "dev $h3 ingress" 133 5
+ check_err $? "Multicast received on second host although deleted"
+
+ tc filter del dev $h3 ingress protocol ip pref 1 handle 133 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 122 flower
+
+ log_test "mcast IPv4"
+}
+
+# IPv6 multicast forwarding test.
+mcast_v6()
+{
+ # Add two interfaces to an MC group, send a packet to the MC group and
+ # verify packets are received on both. Then delete the route and verify
+ # packets are no longer received.
+
+ RET=0
+
+ # Counting filters on the receivers: handle 122 on $h2, 133 on $h3.
+ tc filter add dev $h2 ingress protocol ipv6 pref 1 handle 122 flower \
+ dst_ip ff0e::3 action drop
+ tc filter add dev $h3 ingress protocol ipv6 pref 1 handle 133 flower \
+ dst_ip ff0e::3 action drop
+
+ create_mcast_sg $rp1 2001:db8:1::2 ff0e::3 $rp2 $rp3
+
+ # Send frames with the corresponding L2 destination address.
+ $MZ $h1 -6 -c 5 -p 128 -t udp -a 00:11:22:33:44:55 \
+ -b 33:33:00:00:00:03 -A 2001:db8:1::2 -B ff0e::3 -q
+
+ tc_check_packets "dev $h2 ingress" 122 5
+ check_err $? "Multicast not received on first host"
+ tc_check_packets "dev $h3 ingress" 133 5
+ check_err $? "Multicast not received on second host"
+
+ delete_mcast_sg $rp1 2001:db8:1::2 ff0e::3 $rp2 $rp3
+
+ $MZ $h1 -6 -c 5 -p 128 -t udp -a 00:11:22:33:44:55 \
+ -b 33:33:00:00:00:03 -A 2001:db8:1::2 -B ff0e::3 -q
+
+ # The counters must still read 5: nothing from the second burst may
+ # have reached the hosts.
+ tc_check_packets "dev $h2 ingress" 122 5
+ check_err $? "Multicast received on first host although deleted"
+ tc_check_packets "dev $h3 ingress" 133 5
+ check_err $? "Multicast received on second host although deleted"
+
+ tc filter del dev $h3 ingress protocol ipv6 pref 1 handle 133 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 1 handle 122 flower
+
+ log_test "mcast IPv6"
+}
+
+# IPv4 reverse-path-forwarding (RPF) check test.
+rpf_v4()
+{
+ # Add a multicast route from first router port to the other two. Send
+ # matching packets and test that both hosts receive them. Then, send
+ # the same packets via the third router port and test that they do not
+ # reach any host due to RPF check. A filter with 'skip_hw' is added to
+ # test that devices capable of multicast routing offload trap those
+ # packets. The filter is essentially a NOP in other scenarios.
+
+ RET=0
+
+ tc filter add dev $h1 ingress protocol ip pref 1 handle 1 flower \
+ dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 1 flower \
+ dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $h3 ingress protocol ip pref 1 handle 1 flower \
+ dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $rp3 ingress protocol ip pref 1 handle 1 flower \
+ skip_hw dst_ip 225.1.2.3 ip_proto udp dst_port 12345 action pass
+
+ create_mcast_sg $rp1 198.51.100.2 225.1.2.3 $rp2 $rp3
+
+ # First burst enters through $h1, i.e. on the route's input port.
+ $MZ $h1 -c 5 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+ -a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+ -A 198.51.100.2 -B 225.1.2.3 -q
+
+ tc_check_packets "dev $h2 ingress" 1 5
+ check_err $? "Multicast not received on first host"
+ tc_check_packets "dev $h3 ingress" 1 5
+ check_err $? "Multicast not received on second host"
+
+ # Second burst enters through $h3 with the same source address.
+ $MZ $h3 -c 5 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+ -a 00:11:22:33:44:55 -b 01:00:5e:01:02:03 \
+ -A 198.51.100.2 -B 225.1.2.3 -q
+
+ # $h1 must have seen nothing, $h2 must still be at 5 from the first
+ # burst, and the $rp3 filter must have counted the 5 new packets.
+ tc_check_packets "dev $h1 ingress" 1 0
+ check_err $? "Multicast received on first host when should not"
+ tc_check_packets "dev $h2 ingress" 1 5
+ check_err $? "Multicast received on second host when should not"
+ tc_check_packets "dev $rp3 ingress" 1 5
+ check_err $? "Packets not trapped due to RPF check"
+
+ delete_mcast_sg $rp1 198.51.100.2 225.1.2.3 $rp2 $rp3
+
+ tc filter del dev $rp3 ingress protocol ip pref 1 handle 1 flower
+ tc filter del dev $h3 ingress protocol ip pref 1 handle 1 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 1 flower
+ tc filter del dev $h1 ingress protocol ip pref 1 handle 1 flower
+
+ log_test "RPF IPv4"
+}
+
+# IPv6 reverse-path-forwarding (RPF) check test; same procedure as
+# rpf_v4 with group ff0e::3 and source 2001:db8:1::2.
+rpf_v6()
+{
+ RET=0
+
+ tc filter add dev $h1 ingress protocol ipv6 pref 1 handle 1 flower \
+ dst_ip ff0e::3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $h2 ingress protocol ipv6 pref 1 handle 1 flower \
+ dst_ip ff0e::3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $h3 ingress protocol ipv6 pref 1 handle 1 flower \
+ dst_ip ff0e::3 ip_proto udp dst_port 12345 action drop
+ tc filter add dev $rp3 ingress protocol ipv6 pref 1 handle 1 flower \
+ skip_hw dst_ip ff0e::3 ip_proto udp dst_port 12345 action pass
+
+ create_mcast_sg $rp1 2001:db8:1::2 ff0e::3 $rp2 $rp3
+
+ # First burst enters through $h1, i.e. on the route's input port.
+ $MZ $h1 -6 -c 5 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+ -a 00:11:22:33:44:55 -b 33:33:00:00:00:03 \
+ -A 2001:db8:1::2 -B ff0e::3 -q
+
+ tc_check_packets "dev $h2 ingress" 1 5
+ check_err $? "Multicast not received on first host"
+ tc_check_packets "dev $h3 ingress" 1 5
+ check_err $? "Multicast not received on second host"
+
+ # Second burst enters through $h3 with the same source address.
+ $MZ $h3 -6 -c 5 -p 128 -t udp "ttl=10,sp=54321,dp=12345" \
+ -a 00:11:22:33:44:55 -b 33:33:00:00:00:03 \
+ -A 2001:db8:1::2 -B ff0e::3 -q
+
+ # $h1 must have seen nothing, $h2 must still be at 5 from the first
+ # burst, and the $rp3 filter must have counted the 5 new packets.
+ tc_check_packets "dev $h1 ingress" 1 0
+ check_err $? "Multicast received on first host when should not"
+ tc_check_packets "dev $h2 ingress" 1 5
+ check_err $? "Multicast received on second host when should not"
+ tc_check_packets "dev $rp3 ingress" 1 5
+ check_err $? "Packets not trapped due to RPF check"
+
+ delete_mcast_sg $rp1 2001:db8:1::2 ff0e::3 $rp2 $rp3
+
+ tc filter del dev $rp3 ingress protocol ipv6 pref 1 handle 1 flower
+ tc filter del dev $h3 ingress protocol ipv6 pref 1 handle 1 flower
+ tc filter del dev $h2 ingress protocol ipv6 pref 1 handle 1 flower
+ tc filter del dev $h1 ingress protocol ipv6 pref 1 handle 1 flower
+
+ log_test "RPF IPv6"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_multipath.sh b/tools/testing/selftests/net/forwarding/router_multipath.sh
new file mode 100755
index 000000000..464821c58
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_multipath.sh
@@ -0,0 +1,342 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6 multipath_test"
+NUM_NETIFS=8
+source lib.sh
+
+# Set up host H1: enslave $h1 to VRF vrf-h1, bring both up, assign
+# 192.0.2.2/24 and 2001:db8:1::2/64, and route the H2 subnets via the
+# router addresses 192.0.2.1 / 2001:db8:1::1.
+h1_create()
+{
+ vrf_create "vrf-h1"
+ ip link set dev $h1 master vrf-h1
+
+ ip link set dev vrf-h1 up
+ ip link set dev $h1 up
+
+ ip address add 192.0.2.2/24 dev $h1
+ ip address add 2001:db8:1::2/64 dev $h1
+
+ ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+ ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1
+}
+
+# Undo h1_create in reverse order: routes, addresses, link state, VRF.
+h1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-h1
+ ip route del 198.51.100.0/24 vrf vrf-h1
+
+ ip address del 2001:db8:1::2/64 dev $h1
+ ip address del 192.0.2.2/24 dev $h1
+
+ ip link set dev $h1 down
+ vrf_destroy "vrf-h1"
+}
+
+# Set up host H2: enslave $h2 to VRF vrf-h2, bring both up, assign
+# 198.51.100.2/24 and 2001:db8:2::2/64, and route the H1 subnets via the
+# router addresses 198.51.100.1 / 2001:db8:2::1.
+h2_create()
+{
+ vrf_create "vrf-h2"
+ ip link set dev $h2 master vrf-h2
+
+ ip link set dev vrf-h2 up
+ ip link set dev $h2 up
+
+ ip address add 198.51.100.2/24 dev $h2
+ ip address add 2001:db8:2::2/64 dev $h2
+
+ ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+ ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+}
+
+# Undo h2_create in reverse order: routes, addresses, link state, VRF.
+h2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h2
+ ip route del 192.0.2.0/24 vrf vrf-h2
+
+ ip address del 2001:db8:2::2/64 dev $h2
+ ip address del 198.51.100.2/24 dev $h2
+
+ ip link set dev $h2 down
+ vrf_destroy "vrf-h2"
+}
+
+# Set up router 1: enslave $rp11, $rp12 and $rp13 to VRF vrf-r1, bring
+# everything up, assign addresses and install two-nexthop multipath
+# routes (via $rp12 and $rp13) towards H2's IPv4 and IPv6 subnets.
+router1_create()
+{
+ vrf_create "vrf-r1"
+ ip link set dev $rp11 master vrf-r1
+ ip link set dev $rp12 master vrf-r1
+ ip link set dev $rp13 master vrf-r1
+
+ ip link set dev vrf-r1 up
+ ip link set dev $rp11 up
+ ip link set dev $rp12 up
+ ip link set dev $rp13 up
+
+ ip address add 192.0.2.1/24 dev $rp11
+ ip address add 2001:db8:1::1/64 dev $rp11
+
+ ip address add 169.254.2.12/24 dev $rp12
+ ip address add fe80:2::12/64 dev $rp12
+
+ ip address add 169.254.3.13/24 dev $rp13
+ ip address add fe80:3::13/64 dev $rp13
+
+ ip route add 198.51.100.0/24 vrf vrf-r1 \
+ nexthop via 169.254.2.22 dev $rp12 \
+ nexthop via 169.254.3.23 dev $rp13
+ ip route add 2001:db8:2::/64 vrf vrf-r1 \
+ nexthop via fe80:2::22 dev $rp12 \
+ nexthop via fe80:3::23 dev $rp13
+}
+
+# Undo router1_create in reverse order: routes, addresses, link state,
+# VRF.
+router1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-r1
+ ip route del 198.51.100.0/24 vrf vrf-r1
+
+ ip address del fe80:3::13/64 dev $rp13
+ ip address del 169.254.3.13/24 dev $rp13
+
+ ip address del fe80:2::12/64 dev $rp12
+ ip address del 169.254.2.12/24 dev $rp12
+
+ ip address del 2001:db8:1::1/64 dev $rp11
+ ip address del 192.0.2.1/24 dev $rp11
+
+ ip link set dev $rp13 down
+ ip link set dev $rp12 down
+ ip link set dev $rp11 down
+
+ vrf_destroy "vrf-r1"
+}
+
+# Set up router 2: enslave $rp21, $rp22 and $rp23 to VRF vrf-r2, bring
+# everything up, assign addresses and install two-nexthop multipath
+# routes (via $rp22 and $rp23) towards H1's IPv4 and IPv6 subnets.
+router2_create()
+{
+ vrf_create "vrf-r2"
+ ip link set dev $rp21 master vrf-r2
+ ip link set dev $rp22 master vrf-r2
+ ip link set dev $rp23 master vrf-r2
+
+ ip link set dev vrf-r2 up
+ ip link set dev $rp21 up
+ ip link set dev $rp22 up
+ ip link set dev $rp23 up
+
+ ip address add 198.51.100.1/24 dev $rp21
+ ip address add 2001:db8:2::1/64 dev $rp21
+
+ ip address add 169.254.2.22/24 dev $rp22
+ ip address add fe80:2::22/64 dev $rp22
+
+ ip address add 169.254.3.23/24 dev $rp23
+ ip address add fe80:3::23/64 dev $rp23
+
+ ip route add 192.0.2.0/24 vrf vrf-r2 \
+ nexthop via 169.254.2.12 dev $rp22 \
+ nexthop via 169.254.3.13 dev $rp23
+ ip route add 2001:db8:1::/64 vrf vrf-r2 \
+ nexthop via fe80:2::12 dev $rp22 \
+ nexthop via fe80:3::13 dev $rp23
+}
+
+# Undo router2_create in reverse order: routes, addresses, link state,
+# VRF.
+router2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-r2
+ ip route del 192.0.2.0/24 vrf vrf-r2
+
+ ip address del fe80:3::23/64 dev $rp23
+ ip address del 169.254.3.23/24 dev $rp23
+
+ ip address del fe80:2::22/64 dev $rp22
+ ip address del 169.254.2.22/24 dev $rp22
+
+ ip address del 2001:db8:2::1/64 dev $rp21
+ ip address del 198.51.100.1/24 dev $rp21
+
+ ip link set dev $rp23 down
+ ip link set dev $rp22 down
+ ip link set dev $rp21 down
+
+ vrf_destroy "vrf-r2"
+}
+
+# Check how IPv4 flows from H1 to H2 are spread over $rp12 and $rp13.
+# Arguments: $1 - description handed to multipath_eval
+#            $2 - weight of the nexthop via $rp12
+#            $3 - weight of the nexthop via $rp13
+multipath4_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp12 and rp13)
+ # according to the configured weights.
+ sysctl_set net.ipv4.fib_multipath_hash_policy 1
+ ip route replace 198.51.100.0/24 vrf vrf-r1 \
+ nexthop via 169.254.2.22 dev "$rp12" weight "$weight_rp12" \
+ nexthop via 169.254.3.23 dev "$rp13" weight "$weight_rp13"
+
+ t0_rp12=$(link_stats_tx_packets_get "$rp12")
+ t0_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ ip vrf exec vrf-h1 $MZ "$h1" -q -p 64 -A 192.0.2.2 -B 198.51.100.2 \
+ -d 1msec -t udp "sp=1024,dp=0-32768"
+
+ t1_rp12=$(link_stats_tx_packets_get "$rp12")
+ t1_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ # Arithmetic expansion on the variable names: an empty counter counts
+ # as 0 instead of turning the expression into a syntax error, and the
+ # quoted arguments keep multipath_eval's positional parameters stable.
+ packets_rp12=$((t1_rp12 - t0_rp12))
+ packets_rp13=$((t1_rp13 - t0_rp13))
+ multipath_eval "$desc" "$weight_rp12" "$weight_rp13" \
+ "$packets_rp12" "$packets_rp13"
+
+ # Restore settings.
+ ip route replace 198.51.100.0/24 vrf vrf-r1 \
+ nexthop via 169.254.2.22 dev "$rp12" \
+ nexthop via 169.254.3.23 dev "$rp13"
+ sysctl_restore net.ipv4.fib_multipath_hash_policy
+}
+
+# Check how IPv6 UDP flows from H1 to H2 are spread over $rp12 and $rp13
+# with net.ipv6.fib_multipath_hash_policy set to 1.
+# Arguments: $1 - description handed to multipath_eval
+#            $2 - weight of the nexthop via $rp12
+#            $3 - weight of the nexthop via $rp13
+multipath6_l4_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+
+ # Transmit multiple flows from h1 to h2 and make sure they are
+ # distributed between both multipath links (rp12 and rp13)
+ # according to the configured weights.
+ sysctl_set net.ipv6.fib_multipath_hash_policy 1
+
+ ip route replace 2001:db8:2::/64 vrf vrf-r1 \
+ nexthop via fe80:2::22 dev "$rp12" weight "$weight_rp12" \
+ nexthop via fe80:3::23 dev "$rp13" weight "$weight_rp13"
+
+ t0_rp12=$(link_stats_tx_packets_get "$rp12")
+ t0_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ $MZ "$h1" -6 -q -p 64 -A 2001:db8:1::2 -B 2001:db8:2::2 \
+ -d 1msec -t udp "sp=1024,dp=0-32768"
+
+ t1_rp12=$(link_stats_tx_packets_get "$rp12")
+ t1_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ # Arithmetic expansion on the variable names: an empty counter counts
+ # as 0 instead of turning the expression into a syntax error, and the
+ # quoted arguments keep multipath_eval's positional parameters stable.
+ packets_rp12=$((t1_rp12 - t0_rp12))
+ packets_rp13=$((t1_rp13 - t0_rp13))
+ multipath_eval "$desc" "$weight_rp12" "$weight_rp13" \
+ "$packets_rp12" "$packets_rp13"
+
+ # Restore settings.
+ ip route replace 2001:db8:2::/64 vrf vrf-r1 \
+ nexthop via fe80:2::22 dev "$rp12" \
+ nexthop via fe80:3::23 dev "$rp13"
+
+ sysctl_restore net.ipv6.fib_multipath_hash_policy
+}
+
+# Check how IPv6 echo requests from H1 to H2 are spread over $rp12 and
+# $rp13.
+# Arguments: $1 - description handed to multipath_eval
+#            $2 - weight of the nexthop via $rp12
+#            $3 - weight of the nexthop via $rp13
+multipath6_test()
+{
+ local desc="$1"
+ local weight_rp12=$2
+ local weight_rp13=$3
+ local t0_rp12 t0_rp13 t1_rp12 t1_rp13
+ local packets_rp12 packets_rp13
+ local i
+
+ ip route replace 2001:db8:2::/64 vrf vrf-r1 \
+ nexthop via fe80:2::22 dev "$rp12" weight "$weight_rp12" \
+ nexthop via fe80:3::23 dev "$rp13" weight "$weight_rp13"
+
+ t0_rp12=$(link_stats_tx_packets_get "$rp12")
+ t0_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ # Generate 16384 echo requests, each with a random flow label.
+ for ((i = 0; i < 16384; i++)); do
+  ip vrf exec vrf-h1 $PING6 2001:db8:2::2 -F 0 -c 1 -q &> /dev/null
+ done
+
+ t1_rp12=$(link_stats_tx_packets_get "$rp12")
+ t1_rp13=$(link_stats_tx_packets_get "$rp13")
+
+ # Arithmetic expansion on the variable names: an empty counter counts
+ # as 0 instead of turning the expression into a syntax error, and the
+ # quoted arguments keep multipath_eval's positional parameters stable.
+ packets_rp12=$((t1_rp12 - t0_rp12))
+ packets_rp13=$((t1_rp13 - t0_rp13))
+ multipath_eval "$desc" "$weight_rp12" "$weight_rp13" \
+ "$packets_rp12" "$packets_rp13"
+
+ # Restore settings.
+ ip route replace 2001:db8:2::/64 vrf vrf-r1 \
+ nexthop via fe80:2::22 dev "$rp12" \
+ nexthop via fe80:3::23 dev "$rp13"
+}
+
+# Run every multipath variant (IPv4, IPv6, IPv6 with L4 hashing) with
+# weight ratios 1:1, 2:1 and 11:45 for $rp12:$rp13.
+multipath_test()
+{
+ log_info "Running IPv4 multipath tests"
+ multipath4_test "ECMP" 1 1
+ multipath4_test "Weighted MP 2:1" 2 1
+ multipath4_test "Weighted MP 11:45" 11 45
+
+ log_info "Running IPv6 multipath tests"
+ multipath6_test "ECMP" 1 1
+ multipath6_test "Weighted MP 2:1" 2 1
+ multipath6_test "Weighted MP 11:45" 11 45
+
+ log_info "Running IPv6 L4 hash multipath tests"
+ multipath6_l4_test "ECMP" 1 1
+ multipath6_l4_test "Weighted MP 2:1" 2 1
+ multipath6_l4_test "Weighted MP 11:45" 11 45
+}
+
+# Map the eight test ports to their roles and build the topology:
+#   $h1 -- $rp11 [router 1] $rp12 -- $rp22 [router 2] $rp21 -- $h2
+#                           $rp13 -- $rp23
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp11=${NETIFS[p2]}
+
+ rp12=${NETIFS[p3]}
+ rp22=${NETIFS[p4]}
+
+ rp13=${NETIFS[p5]}
+ rp23=${NETIFS[p6]}
+
+ rp21=${NETIFS[p7]}
+ h2=${NETIFS[p8]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router1_create
+ router2_create
+
+ forwarding_enable
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order (forwarding
+# setting, routers, hosts, VRF infrastructure).
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router2_destroy
+ router1_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Ping H2's IPv4 address (198.51.100.2) from $h1.
+ping_ipv4()
+{
+ ping_test $h1 198.51.100.2
+}
+
+# Ping H2's IPv6 address (2001:db8:2::2) from $h1.
+ping_ipv6()
+{
+ ping6_test $h1 2001:db8:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/router_vid_1.sh b/tools/testing/selftests/net/forwarding/router_vid_1.sh
new file mode 100755
index 000000000..a7306c7ac
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/router_vid_1.sh
@@ -0,0 +1,135 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="ping_ipv4 ping_ipv6"
+NUM_NETIFS=4
+source lib.sh
+
+# Set up host H1: create VRF vrf-h1, bring $h1 up, create VLAN 1 on top
+# of it (vlan_create, with vrf-h1 and the host addresses) and route the
+# H2 subnets via 192.0.2.1 / 2001:db8:1::1.
+h1_create()
+{
+ vrf_create "vrf-h1"
+ ip link set dev vrf-h1 up
+
+ ip link set dev $h1 up
+ vlan_create $h1 1 vrf-h1 192.0.2.2/24 2001:db8:1::2/64
+
+ ip route add 198.51.100.0/24 vrf vrf-h1 nexthop via 192.0.2.1
+ ip route add 2001:db8:2::/64 vrf vrf-h1 nexthop via 2001:db8:1::1
+}
+
+# Undo h1_create in reverse order: routes, VLAN 1, link state, VRF.
+h1_destroy()
+{
+ ip route del 2001:db8:2::/64 vrf vrf-h1
+ ip route del 198.51.100.0/24 vrf vrf-h1
+
+ vlan_destroy $h1 1
+ ip link set dev $h1 down
+
+ ip link set dev vrf-h1 down
+ vrf_destroy "vrf-h1"
+}
+
+# Set up host H2: create VRF vrf-h2, bring $h2 up, create VLAN 1 on top
+# of it (vlan_create, with vrf-h2 and the host addresses) and route the
+# H1 subnets via 198.51.100.1 / 2001:db8:2::1.
+h2_create()
+{
+ vrf_create "vrf-h2"
+ ip link set dev vrf-h2 up
+
+ ip link set dev $h2 up
+ vlan_create $h2 1 vrf-h2 198.51.100.2/24 2001:db8:2::2/64
+
+ ip route add 192.0.2.0/24 vrf vrf-h2 nexthop via 198.51.100.1
+ ip route add 2001:db8:1::/64 vrf vrf-h2 nexthop via 2001:db8:2::1
+}
+
+# Undo h2_create in reverse order: routes, VLAN 1, link state, VRF.
+h2_destroy()
+{
+ ip route del 2001:db8:1::/64 vrf vrf-h2
+ ip route del 192.0.2.0/24 vrf vrf-h2
+
+ vlan_destroy $h2 1
+ ip link set dev $h2 down
+
+ ip link set dev vrf-h2 down
+ vrf_destroy "vrf-h2"
+}
+
+# Set up the router: on each of $rp1 and $rp2 create a VLAN device with
+# VLAN ID 1 ($rp1.1, $rp2.1) and put the router addresses on those VLAN
+# devices rather than on the ports themselves.
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link add link $rp1 name $rp1.1 up type vlan id 1
+
+ ip address add 192.0.2.1/24 dev $rp1.1
+ ip address add 2001:db8:1::1/64 dev $rp1.1
+
+ ip link set dev $rp2 up
+ ip link add link $rp2 name $rp2.1 up type vlan id 1
+
+ ip address add 198.51.100.1/24 dev $rp2.1
+ ip address add 2001:db8:2::1/64 dev $rp2.1
+}
+
+# Undo router_create in reverse order: for $rp2 and then $rp1, remove
+# the addresses, delete the VLAN device and set the port down.
+router_destroy()
+{
+ ip address del 2001:db8:2::1/64 dev $rp2.1
+ ip address del 198.51.100.1/24 dev $rp2.1
+
+ ip link del dev $rp2.1
+ ip link set dev $rp2 down
+
+ ip address del 2001:db8:1::1/64 dev $rp1.1
+ ip address del 192.0.2.1/24 dev $rp1.1
+
+ ip link del dev $rp1.1
+ ip link set dev $rp1 down
+}
+
+# Map the four test ports to their roles ($h1/$rp1, $rp2/$h2), configure
+# hosts and router, and enable forwarding.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# EXIT trap handler: undo setup_prepare in reverse order (forwarding
+# setting, router, hosts, VRF infrastructure).
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Ping H2's IPv4 address (198.51.100.2) from H1's VLAN 1 device $h1.1.
+ping_ipv4()
+{
+ ping_test $h1.1 198.51.100.2
+}
+
+# Ping H2's IPv6 address (2001:db8:2::2) from H1's VLAN 1 device $h1.1.
+ping_ipv6()
+{
+ ping6_test $h1.1 2001:db8:2::2
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/sch_ets.sh b/tools/testing/selftests/net/forwarding/sch_ets.sh
new file mode 100755
index 000000000..e60c8b481
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A driver for the ETS selftest that implements testing in slowpath.
+lib_dir=.
+source sch_ets_core.sh
+
+ALL_TESTS="
+ ping_ipv4
+ priomap_mode
+ ets_test_strict
+ ets_test_mixed
+ ets_test_dwrr
+ classifier_mode
+ ets_test_strict
+ ets_test_mixed
+ ets_test_dwrr
+"
+
+switch_create()
+{
+ ets_switch_create
+
+ # Create a bottleneck so that the DWRR process can kick in.
+ tc qdisc add dev $swp2 root handle 1: tbf \
+ rate 1Gbit burst 1Mbit latency 100ms
+ PARENT="parent 1:"
+}
+
+switch_destroy()
+{
+ ets_switch_destroy
+ tc qdisc del dev $swp2 root
+}
+
+# Callback from sch_ets_tests.sh
+collect_stats()
+{
+ local -a streams=("$@")
+ local stream
+
+ for stream in ${streams[@]}; do
+ qdisc_parent_stats_get $swp2 10:$((stream + 1)) .bytes
+ done
+}
+
+ets_run
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_core.sh b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
new file mode 100644
index 000000000..f906fcc66
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets_core.sh
@@ -0,0 +1,300 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This is a template for ETS Qdisc test.
+#
+# This test sends from H1 several traffic streams with 802.1p-tagged packets.
+# The tags are used at $swp1 to prioritize the traffic. Each stream is then
+# queued at a different ETS band according to the assigned priority. After
+# running for a while, counters at H2 are consulted to determine whether the
+# traffic scheduling was according to the ETS configuration.
+#
+# This template is supposed to be embedded by a test driver, which implements
+# statistics collection, any HW-specific stuff, and prominently configures the
+# system to assure that there is overcommitment at $swp2. That is necessary so
+# that the ETS traffic selection algorithm kicks in and has to schedule some
+# traffic at the expense of other.
+#
+# A driver for veth-based testing is in sch_ets.sh, an example of a driver for
+# an offloaded data path is in selftests/drivers/net/mlxsw/sch_ets.sh.
+#
+# +---------------------------------------------------------------------+
+# | H1                                                                  |
+# |     + $h1.10              + $h1.11              + $h1.12            |
+# |     | 192.0.2.1/28        | 192.0.2.17/28       | 192.0.2.33/28     |
+# |     | egress-qos-map      | egress-qos-map      | egress-qos-map    |
+# |     | 0:0                 | 0:1                 | 0:2               |
+# |     \____________________ | ____________________/                   |
+# |                          \|/                                        |
+# |                           + $h1                                     |
+# +---------------------------|-----------------------------------------+
+#                             |
+# +---------------------------|-----------------------------------------+
+# | SW                        + $swp1                                   |
+# |                           | >1Gbps                                  |
+# |      ____________________/|\____________________                    |
+# |     /                     |                     \                   |
+# |  +--|----------------+ +--|----------------+ +--|----------------+  |
+# |  |  + $swp1.10       | |  + $swp1.11       | |  + $swp1.12       |  |
+# |  |    ingress-qos-map| |    ingress-qos-map| |    ingress-qos-map|  |
+# |  |    0:0 1:1 2:2    | |    0:0 1:1 2:2    | |    0:0 1:1 2:2    |  |
+# |  |                   | |                   | |                   |  |
+# |  |    BR10           | |    BR11           | |    BR12           |  |
+# |  |                   | |                   | |                   |  |
+# |  |  + $swp2.10       | |  + $swp2.11       | |  + $swp2.12       |  |
+# |  +--|----------------+ +--|----------------+ +--|----------------+  |
+# |     \____________________ | ____________________/                   |
+# |                          \|/                                        |
+# |                           + $swp2                                   |
+# |                           | 1Gbps (ethtool or HTB qdisc)            |
+# |                           | qdisc ets quanta $W0 $W1 $W2            |
+# |                           |           priomap 0 1 2                 |
+# +---------------------------|-----------------------------------------+
+#                             |
+# +---------------------------|-----------------------------------------+
+# | H2                        + $h2                                     |
+# |      ____________________/|\____________________                    |
+# |     /                     |                     \                   |
+# |     + $h2.10              + $h2.11              + $h2.12            |
+# |       192.0.2.2/28          192.0.2.18/28         192.0.2.34/28     |
+# +---------------------------------------------------------------------+
+
+NUM_NETIFS=4
+CHECK_TC=yes
+source $lib_dir/lib.sh
+source $lib_dir/sch_ets_tests.sh
+
+PARENT=root
+QDISC_DEV=
+
+sip()
+{
+ echo 192.0.2.$((16 * $1 + 1))
+}
+
+dip()
+{
+ echo 192.0.2.$((16 * $1 + 2))
+}
+
+# Callback from sch_ets_tests.sh
+ets_start_traffic()
+{
+ local dst_mac=$(mac_get $h2)
+ local i=$1; shift
+
+ start_traffic $h1.1$i $(sip $i) $(dip $i) $dst_mac
+}
+
+ETS_CHANGE_QDISC=
+
+priomap_mode()
+{
+ echo "Running in priomap mode"
+ ets_delete_qdisc
+ ETS_CHANGE_QDISC=ets_change_qdisc_priomap
+}
+
+classifier_mode()
+{
+ echo "Running in classifier mode"
+ ets_delete_qdisc
+ ETS_CHANGE_QDISC=ets_change_qdisc_classifier
+}
+
+ets_change_qdisc_priomap()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local priomap=$1; shift
+ local quanta=("${@}")
+
+ local op=$(if [[ -n $QDISC_DEV ]]; then echo change; else echo add; fi)
+
+ tc qdisc $op dev $dev $PARENT handle 10: ets \
+ $(if ((nstrict)); then echo strict $nstrict; fi) \
+ $(if ((${#quanta[@]})); then echo quanta ${quanta[@]}; fi) \
+ priomap $priomap
+ QDISC_DEV=$dev
+}
+
+ets_change_qdisc_classifier()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local priomap=$1; shift
+ local quanta=("${@}")
+
+ local op=$(if [[ -n $QDISC_DEV ]]; then echo change; else echo add; fi)
+
+ tc qdisc $op dev $dev $PARENT handle 10: ets \
+ $(if ((nstrict)); then echo strict $nstrict; fi) \
+ $(if ((${#quanta[@]})); then echo quanta ${quanta[@]}; fi)
+
+ if [[ $op == add ]]; then
+ local prio=0
+ local band
+
+ for band in $priomap; do
+ tc filter add dev $dev parent 10: basic \
+ match "meta(priority eq $prio)" \
+ flowid 10:$((band + 1))
+ ((prio++))
+ done
+ fi
+ QDISC_DEV=$dev
+}
+
+# Callback from sch_ets_tests.sh
+ets_change_qdisc()
+{
+ if [[ -z "$ETS_CHANGE_QDISC" ]]; then
+ exit 1
+ fi
+ $ETS_CHANGE_QDISC "$@"
+}
+
+ets_delete_qdisc()
+{
+ if [[ -n $QDISC_DEV ]]; then
+ tc qdisc del dev $QDISC_DEV $PARENT
+ QDISC_DEV=
+ fi
+}
+
+h1_create()
+{
+ local i;
+
+ simple_if_init $h1
+ mtu_set $h1 9900
+ for i in {0..2}; do
+ vlan_create $h1 1$i v$h1 $(sip $i)/28
+ ip link set dev $h1.1$i type vlan egress 0:$i
+ done
+}
+
+h1_destroy()
+{
+ local i
+
+ for i in {0..2}; do
+ vlan_destroy $h1 1$i
+ done
+ mtu_restore $h1
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ local i
+
+ simple_if_init $h2
+ mtu_set $h2 9900
+ for i in {0..2}; do
+ vlan_create $h2 1$i v$h2 $(dip $i)/28
+ done
+}
+
+h2_destroy()
+{
+ local i
+
+ for i in {0..2}; do
+ vlan_destroy $h2 1$i
+ done
+ mtu_restore $h2
+ simple_if_fini $h2
+}
+
+ets_switch_create()
+{
+ local i
+
+ ip link set dev $swp1 up
+ mtu_set $swp1 9900
+
+ ip link set dev $swp2 up
+ mtu_set $swp2 9900
+
+ for i in {0..2}; do
+ vlan_create $swp1 1$i
+ ip link set dev $swp1.1$i type vlan ingress 0:0 1:1 2:2
+
+ vlan_create $swp2 1$i
+
+ ip link add dev br1$i type bridge
+ ip link set dev $swp1.1$i master br1$i
+ ip link set dev $swp2.1$i master br1$i
+
+ ip link set dev br1$i up
+ ip link set dev $swp1.1$i up
+ ip link set dev $swp2.1$i up
+ done
+}
+
+ets_switch_destroy()
+{
+ local i
+
+ ets_delete_qdisc
+
+ for i in {0..2}; do
+ ip link del dev br1$i
+ vlan_destroy $swp2 1$i
+ vlan_destroy $swp1 1$i
+ done
+
+ mtu_restore $swp2
+ ip link set dev $swp2 down
+
+ mtu_restore $swp1
+ ip link set dev $swp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ put=$swp2
+ hut=$h2
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 $(dip 0) " vlan 10"
+ ping_test $h1.11 $(dip 1) " vlan 11"
+ ping_test $h1.12 $(dip 2) " vlan 12"
+}
+
+ets_run()
+{
+ trap cleanup EXIT
+
+ setup_prepare
+ setup_wait
+
+ tests_run
+
+ exit $EXIT_STATUS
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_ets_tests.sh b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
new file mode 100644
index 000000000..cdf689e99
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_ets_tests.sh
@@ -0,0 +1,223 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Global interface:
+# $put -- port under test (e.g. $swp2)
+# collect_stats($streams...) -- A function to get stats for individual streams
+# ets_start_traffic($band) -- Start traffic for this band
+# ets_change_qdisc($dev, $nstrict, $priomap, $quanta...) -- Add or change qdisc
+
+# WS describes the Qdisc configuration. It has one value per band (so the
+# number of array elements indicates the number of bands). If the value is
+# 0, it is a strict band, otherwise it's a DRR band and the value is
+# that band's quantum.
+declare -a WS
+
+qdisc_describe()
+{
+ local nbands=${#WS[@]}
+ local nstrict=0
+ local i
+
+ for ((i = 0; i < nbands; i++)); do
+ if ((!${WS[$i]})); then
+ : $((nstrict++))
+ fi
+ done
+
+ echo -n "ets bands $nbands"
+ if ((nstrict)); then
+ echo -n " strict $nstrict"
+ fi
+ if ((nstrict < nbands)); then
+ echo -n " quanta"
+ for ((i = nstrict; i < nbands; i++)); do
+ echo -n " ${WS[$i]}"
+ done
+ fi
+}
+
+__strict_eval()
+{
+ local desc=$1; shift
+ local d=$1; shift
+ local total=$1; shift
+ local above=$1; shift
+
+ RET=0
+
+ if ((! total)); then
+ check_err 1 "No traffic observed"
+ log_test "$desc"
+ return
+ fi
+
+ local ratio=$(echo "scale=2; 100 * $d / $total" | bc -l)
+ if ((above)); then
+ test $(echo "$ratio > 95.0" | bc -l) -eq 1
+ check_err $? "Not enough traffic"
+ log_test "$desc"
+ log_info "Expected ratio >95% Measured ratio $ratio"
+ else
+ test $(echo "$ratio < 5" | bc -l) -eq 1
+ check_err $? "Too much traffic"
+ log_test "$desc"
+ log_info "Expected ratio <5% Measured ratio $ratio"
+ fi
+}
+
+strict_eval()
+{
+ __strict_eval "$@" 1
+}
+
+notraf_eval()
+{
+ __strict_eval "$@" 0
+}
+
+__ets_dwrr_test()
+{
+ local -a streams=("$@")
+
+ local low_stream=${streams[0]}
+ local seen_strict=0
+ local -a t0 t1 d
+ local stream
+ local total
+ local i
+
+ echo "Testing $(qdisc_describe), streams ${streams[@]}"
+
+ for stream in ${streams[@]}; do
+ ets_start_traffic $stream
+ done
+
+ sleep 10
+
+ t0=($(collect_stats "${streams[@]}"))
+
+ sleep 10
+
+ t1=($(collect_stats "${streams[@]}"))
+ d=($(for ((i = 0; i < ${#streams[@]}; i++)); do
+ echo $((${t1[$i]} - ${t0[$i]}))
+ done))
+ total=$(echo ${d[@]} | sed 's/ /+/g' | bc)
+
+ for ((i = 0; i < ${#streams[@]}; i++)); do
+ local stream=${streams[$i]}
+ if ((seen_strict)); then
+ notraf_eval "band $stream" ${d[$i]} $total
+ elif ((${WS[$stream]} == 0)); then
+ strict_eval "band $stream" ${d[$i]} $total
+ seen_strict=1
+ elif ((stream == low_stream)); then
+ # Low stream is used as DWRR evaluation reference.
+ continue
+ else
+ multipath_eval "bands $low_stream:$stream" \
+ ${WS[$low_stream]} ${WS[$stream]} \
+ ${d[0]} ${d[$i]}
+ fi
+ done
+
+ for stream in ${streams[@]}; do
+ stop_traffic
+ done
+}
+
+ets_dwrr_test_012()
+{
+ __ets_dwrr_test 0 1 2
+}
+
+ets_dwrr_test_01()
+{
+ __ets_dwrr_test 0 1
+}
+
+ets_dwrr_test_12()
+{
+ __ets_dwrr_test 1 2
+}
+
+ets_qdisc_setup()
+{
+ local dev=$1; shift
+ local nstrict=$1; shift
+ local -a quanta=("$@")
+
+ local ndwrr=${#quanta[@]}
+ local nbands=$((nstrict + ndwrr))
+ local nstreams=$(if ((nbands > 3)); then echo 3; else echo $nbands; fi)
+ local priomap=$(seq 0 $((nstreams - 1)))
+ local i
+
+ WS=($(
+ for ((i = 0; i < nstrict; i++)); do
+ echo 0
+ done
+ for ((i = 0; i < ndwrr; i++)); do
+ echo ${quanta[$i]}
+ done
+ ))
+
+ ets_change_qdisc $dev $nstrict "$priomap" ${quanta[@]}
+}
+
+ets_set_dwrr_uniform()
+{
+ ets_qdisc_setup $put 0 3300 3300 3300
+}
+
+ets_set_dwrr_varying()
+{
+ ets_qdisc_setup $put 0 5000 3500 1500
+}
+
+ets_set_strict()
+{
+ ets_qdisc_setup $put 3
+}
+
+ets_set_mixed()
+{
+ ets_qdisc_setup $put 1 5000 2500 1500
+}
+
+ets_change_quantum()
+{
+ tc class change dev $put classid 10:2 ets quantum 8000
+ WS[1]=8000
+}
+
+ets_set_dwrr_two_bands()
+{
+ ets_qdisc_setup $put 0 5000 2500
+}
+
+ets_test_strict()
+{
+ ets_set_strict
+ ets_dwrr_test_01
+ ets_dwrr_test_12
+}
+
+ets_test_mixed()
+{
+ ets_set_mixed
+ ets_dwrr_test_01
+ ets_dwrr_test_12
+}
+
+ets_test_dwrr()
+{
+ ets_set_dwrr_uniform
+ ets_dwrr_test_012
+ ets_set_dwrr_varying
+ ets_dwrr_test_012
+ ets_change_quantum
+ ets_dwrr_test_012
+ ets_set_dwrr_two_bands
+ ets_dwrr_test_01
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh
new file mode 100755
index 000000000..81f31179a
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_red.sh
@@ -0,0 +1,493 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends one stream of traffic from H1 through a TBF shaper, to a RED
+# within TBF shaper on $swp3. The two shapers have the same configuration, and
+# thus the resulting stream should fill all available bandwidth on the latter
+# shaper. A second stream is sent from H2 also via $swp3, and used to inject
+# additional traffic. Since all available bandwidth is taken, this traffic has
+# to go to backlog.
+#
+# +--------------------------+                     +--------------------------+
+# | H1                       |                     | H2                       |
+# |     + $h1                |                     |     + $h2                |
+# |     | 192.0.2.1/28       |                     |     | 192.0.2.2/28       |
+# |     | TBF 10Mbps         |                     |     |                    |
+# +-----|--------------------+                     +-----|--------------------+
+#       |                                                |
+# +-----|------------------------------------------------|--------------------+
+# | SW  |                                                |                    |
+# |  +--|------------------------------------------------|----------------+   |
+# |  |  + $swp1                                          + $swp2          |   |
+# |  |                               BR                                   |   |
+# |  |                                                                    |   |
+# |  |                                + $swp3                             |   |
+# |  |                                | TBF 10Mbps / RED                  |   |
+# |  +--------------------------------|-----------------------------------+   |
+# |                                   |                                       |
+# +-----------------------------------|---------------------------------------+
+#                                     |
+#                               +-----|--------------------+
+#                               | H3  |                    |
+#                               |     + $h3                |
+#                               |       192.0.2.3/28       |
+#                               |                          |
+#                               +--------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ ecn_test
+ ecn_nodrop_test
+ red_test
+ red_qevent_test
+ ecn_qevent_test
+"
+
+NUM_NETIFS=6
+CHECK_TC="yes"
+source lib.sh
+
+BACKLOG=30000
+PKTSZ=1400
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+ mtu_set $h1 10000
+ tc qdisc replace dev $h1 root handle 1: tbf \
+ rate 10Mbit burst 10K limit 1M
+}
+
+h1_destroy()
+{
+ tc qdisc del dev $h1 root
+ mtu_restore $h1
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+ mtu_set $h2 10000
+}
+
+h2_destroy()
+{
+ mtu_restore $h2
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+h3_create()
+{
+ simple_if_init $h3 192.0.2.3/28
+ mtu_set $h3 10000
+}
+
+h3_destroy()
+{
+ mtu_restore $h3
+ simple_if_fini $h3 192.0.2.3/28
+}
+
+switch_create()
+{
+ ip link add dev br up type bridge
+ ip link set dev $swp1 up master br
+ ip link set dev $swp2 up master br
+ ip link set dev $swp3 up master br
+
+ mtu_set $swp1 10000
+ mtu_set $swp2 10000
+ mtu_set $swp3 10000
+
+ tc qdisc replace dev $swp3 root handle 1: tbf \
+ rate 10Mbit burst 10K limit 1M
+ ip link add name _drop_test up type dummy
+}
+
+switch_destroy()
+{
+ ip link del dev _drop_test
+ tc qdisc del dev $swp3 root
+ # Undo switch_create's mtu_set on the switch ports; hN_destroy restores $hN.
+ mtu_restore $swp3
+ mtu_restore $swp2
+ mtu_restore $swp1
+
+ ip link set dev $swp3 down nomaster
+ ip link set dev $swp2 down nomaster
+ ip link set dev $swp1 down nomaster
+ ip link del dev br
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ h2=${NETIFS[p3]}
+ swp2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ h3_mac=$(mac_get $h3)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.3 " from host 1"
+ ping_test $h2 192.0.2.3 " from host 2"
+}
+
+get_qdisc_backlog()
+{
+ qdisc_stats_get $swp3 11: .backlog
+}
+
+get_nmarked()
+{
+ qdisc_stats_get $swp3 11: .marked
+}
+
+get_qdisc_npackets()
+{
+ qdisc_stats_get $swp3 11: .packets
+}
+
+get_nmirrored()
+{
+ link_stats_get _drop_test tx packets
+}
+
+send_packets()
+{
+ local proto=$1; shift
+ local pkts=$1; shift
+
+ $MZ $h2 -p $PKTSZ -a own -b $h3_mac -A 192.0.2.2 -B 192.0.2.3 -t $proto -q -c $pkts "$@"
+}
+
+# This sends traffic in an attempt to build a backlog of $size. Returns 0 on
+# success. After 11 failed attempts it bails out and returns 1. In either case
+# it dumps the last observed backlog size to stdout.
+build_backlog()
+{
+ local size=$1; shift
+ local proto=$1; shift
+
+ local i=0
+
+ while :; do
+ local cur=$(get_qdisc_backlog)
+ local diff=$((size - cur))
+ local pkts=$(((diff + PKTSZ - 1) / PKTSZ))
+
+ if ((cur >= size)); then
+ echo $cur
+ return 0
+ elif ((i++ > 10)); then
+ echo $cur
+ return 1
+ fi
+
+ send_packets $proto $pkts "$@"
+ sleep 1
+ done
+}
+
+check_marking()
+{
+ local cond=$1; shift
+
+ local npackets_0=$(get_qdisc_npackets)
+ local nmarked_0=$(get_nmarked)
+ sleep 5
+ local npackets_1=$(get_qdisc_npackets)
+ local nmarked_1=$(get_nmarked)
+
+ local nmarked_d=$((nmarked_1 - nmarked_0))
+ local npackets_d=$((npackets_1 - npackets_0))
+ local pct=$((100 * nmarked_d / npackets_d))
+
+ echo $pct
+ ((pct $cond))
+}
+
+check_mirroring()
+{
+ local cond=$1; shift
+
+ local npackets_0=$(get_qdisc_npackets)
+ local nmirrored_0=$(get_nmirrored)
+ sleep 5
+ local npackets_1=$(get_qdisc_npackets)
+ local nmirrored_1=$(get_nmirrored)
+
+ local nmirrored_d=$((nmirrored_1 - nmirrored_0))
+ local npackets_d=$((npackets_1 - npackets_0))
+ local pct=$((100 * nmirrored_d / npackets_d))
+
+ echo $pct
+ ((pct $cond))
+}
+
+ecn_test_common()
+{
+ local name=$1; shift
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Build the below-the-limit backlog using UDP. We could use TCP just
+ # fine, but this way we get a proof that UDP is accepted when queue
+ # length is below the limit. The main stream is using TCP, and if the
+ # limit is misconfigured, we would see this traffic being ECN marked.
+ RET=0
+ backlog=$(build_backlog $((2 * limit / 3)) udp)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "$name backlog < limit"
+
+ # Now push TCP, because non-TCP traffic would be early-dropped after the
+ # backlog crosses the limit, and we want to make sure that the backlog
+ # is above the limit.
+ RET=0
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking ">= 95")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected >= 95."
+ log_test "$name backlog > limit"
+}
+
+do_ecn_test()
+{
+ local limit=$1; shift
+ local name=ECN
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+
+ ecn_test_common "$name" $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, it should all get dropped, and backlog
+ # building should fail.
+ RET=0
+ build_backlog $((2 * limit)) udp >/dev/null
+ check_fail $? "UDP traffic went into backlog instead of being early-dropped"
+ log_test "$name backlog > limit: UDP early-dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_ecn_nodrop_test()
+{
+ local limit=$1; shift
+ local name="ECN nodrop"
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+
+ ecn_test_common "$name" $limit
+
+ # Up there we saw that UDP gets accepted when backlog is below the
+ # limit. Now that it is above, in nodrop mode, make sure it goes to
+ # backlog as well.
+ RET=0
+ build_backlog $((2 * limit)) udp >/dev/null
+ check_err $? "UDP traffic was early-dropped instead of getting into backlog"
+ log_test "$name backlog > limit: UDP not dropped"
+
+ stop_traffic
+ sleep 1
+}
+
+do_red_test()
+{
+ local limit=$1; shift
+ local backlog
+ local pct
+
+ # Use ECN-capable TCP to verify there's no marking even though the queue
+ # is above limit.
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+
+ # Pushing below the queue limit should work.
+ RET=0
+ backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "RED backlog < limit"
+
+ # Pushing above should not.
+ RET=0
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_fail $? "Traffic went into backlog instead of being early-dropped"
+ pct=$(check_marking "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0."
+ log_test "RED backlog > limit"
+
+ stop_traffic
+ sleep 1
+}
+
+do_red_qevent_test()
+{
+ local limit=$1; shift
+ local backlog
+ local base
+ local now
+ local pct
+
+ RET=0
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t udp -q &
+ sleep 1
+
+ tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+ action mirred egress mirror dev _drop_test
+
+ # Push to the queue until it's at the limit. The configured limit is
+ # rounded by the qdisc, so this is the best we can do to get to the real
+ # limit.
+ build_backlog $((3 * limit / 2)) udp >/dev/null
+
+ base=$(get_nmirrored)
+ send_packets udp 100
+ sleep 1
+ now=$(get_nmirrored)
+ ((now >= base + 100))
+ check_err $? "Dropped packets not observed: 100 expected, $((now - base)) seen"
+
+ tc filter del block 10 pref 1234 handle 102 matchall
+
+ base=$(get_nmirrored)
+ send_packets udp 100
+ sleep 1
+ now=$(get_nmirrored)
+ ((now == base))
+ check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen"
+
+ log_test "RED early_dropped packets mirrored"
+
+ stop_traffic
+ sleep 1
+}
+
+do_ecn_qevent_test()
+{
+ local limit=$1; shift
+ local backlog pct
+
+ RET=0
+
+ $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \
+ -a own -b $h3_mac -t tcp -q tos=0x01 &
+ sleep 1
+ # Mirror whatever hits qevent block 10 to _drop_test, where it is counted.
+ tc filter add block 10 pref 1234 handle 102 matchall skip_hw \
+ action mirred egress mirror dev _drop_test
+
+ backlog=$(build_backlog $((2 * limit / 3)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_mirroring "== 0")
+ check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected == 0."
+
+ backlog=$(build_backlog $((3 * limit / 2)) tcp tos=0x01)
+ check_err $? "Could not build the requested backlog"
+ pct=$(check_mirroring ">= 95")
+ check_err $? "backlog $backlog / $limit Got $pct% mirrored packets, expected >= 95."
+
+ tc filter del block 10 pref 1234 handle 102 matchall
+
+ log_test "ECN marked packets mirrored"
+
+ stop_traffic
+ sleep 1
+}
+
+install_qdisc()
+{
+ local -a args=("$@")
+
+ tc qdisc replace dev $swp3 parent 1:1 handle 11: red \
+ limit 1M avpkt $PKTSZ probability 1 \
+ min $BACKLOG max $((BACKLOG + 1)) burst 38 "${args[@]}"
+ sleep 1
+}
+
+uninstall_qdisc()
+{
+ tc qdisc del dev $swp3 parent 1:1
+}
+
+ecn_test()
+{
+ install_qdisc ecn
+ do_ecn_test $BACKLOG
+ uninstall_qdisc
+}
+
+ecn_nodrop_test()
+{
+ install_qdisc ecn nodrop
+ do_ecn_nodrop_test $BACKLOG
+ uninstall_qdisc
+}
+
+red_test()
+{
+ install_qdisc
+ do_red_test $BACKLOG
+ uninstall_qdisc
+}
+
+red_qevent_test()
+{
+ install_qdisc qevent early_drop block 10
+ do_red_qevent_test $BACKLOG
+ uninstall_qdisc
+}
+
+ecn_qevent_test()
+{
+ install_qdisc ecn qevent mark block 10
+ do_ecn_qevent_test $BACKLOG
+ uninstall_qdisc
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
new file mode 100644
index 000000000..d1f26cb7c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh
@@ -0,0 +1,233 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends a stream of traffic from H1 through a switch, to H2. On the
+# egress port from the switch ($swp2), a shaper is installed. The test verifies
+# that the rates on the port match the configured shaper.
+#
+# In order to test per-class shaping, $swp2 actually contains TBF under PRIO or
+# ETS, with two different configurations. Traffic is prioritized using 802.1p.
+#
+# +-------------------------------------------+
+# | H1                                        |
+# |     + $h1.10                  $h1.11 +    |
+# |     | 192.0.2.1/28     192.0.2.17/28 |    |
+# |     |                                |    |
+# |     \______________    _____________/     |
+# |                    \ /                    |
+# |                     + $h1                 |
+# +---------------------|---------------------+
+#                       |
+# +---------------------|---------------------+
+# | SW                  + $swp1               |
+# |     _______________/ \_______________     |
+# |    /                                 \    |
+# |  +-|--------------+   +--------------|-+  |
+# |  | + $swp1.10     |   |     $swp1.11 + |  |
+# |  |                |   |                |  |
+# |  |     BR10       |   |       BR11     |  |
+# |  |                |   |                |  |
+# |  | + $swp2.10     |   |     $swp2.11 + |  |
+# |  +-|--------------+   +--------------|-+  |
+# |    \_______________   ______________/     |
+# |                    \ /                    |
+# |                     + $swp2               |
+# +---------------------|---------------------+
+#                       |
+# +---------------------|---------------------+
+# | H2                  + $h2                 |
+# |      ______________/ \______________      |
+# |     /                               \     |
+# |     |                               |     |
+# |     + $h2.10                 $h2.11 +     |
+# |       192.0.2.2/28    192.0.2.18/28       |
+# +-------------------------------------------+
+
+NUM_NETIFS=4
+CHECK_TC="yes"
+source $lib_dir/lib.sh
+
+ipaddr()
+{
+ local host=$1; shift
+ local vlan=$1; shift
+
+ echo 192.0.2.$((16 * (vlan - 10) + host))
+}
+
+host_create()
+{
+ local dev=$1; shift
+ local host=$1; shift
+
+ simple_if_init $dev
+ mtu_set $dev 10000
+
+ vlan_create $dev 10 v$dev $(ipaddr $host 10)/28
+ ip link set dev $dev.10 type vlan egress 0:0
+
+ vlan_create $dev 11 v$dev $(ipaddr $host 11)/28
+ ip link set dev $dev.11 type vlan egress 0:1
+}
+
+host_destroy()
+{
+ local dev=$1; shift
+
+ vlan_destroy $dev 11
+ vlan_destroy $dev 10
+ mtu_restore $dev
+ simple_if_fini $dev
+}
+
+h1_create()
+{
+ host_create $h1 1
+}
+
+h1_destroy()
+{
+ host_destroy $h1
+}
+
+h2_create()
+{
+ host_create $h2 2
+
+ tc qdisc add dev $h2 clsact
+ tc filter add dev $h2 ingress pref 1010 prot 802.1q \
+ flower $TCFLAGS vlan_id 10 action pass
+ tc filter add dev $h2 ingress pref 1011 prot 802.1q \
+ flower $TCFLAGS vlan_id 11 action pass
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ host_destroy $h2
+}
+
+switch_create()
+{
+ local intf
+ local vlan
+
+ ip link add dev br10 type bridge
+ ip link add dev br11 type bridge
+
+ for intf in $swp1 $swp2; do
+ ip link set dev $intf up
+ mtu_set $intf 10000
+
+ for vlan in 10 11; do
+ vlan_create $intf $vlan
+ ip link set dev $intf.$vlan master br$vlan
+ ip link set dev $intf.$vlan up
+ done
+ done
+
+ for vlan in 10 11; do
+ ip link set dev $swp1.$vlan type vlan ingress 0:0 1:1
+ done
+
+ ip link set dev br10 up
+ ip link set dev br11 up
+}
+
+switch_destroy()
+{
+ local intf
+ local vlan
+
+ # A test may have been interrupted mid-run, with Qdisc installed. Delete
+ # it here.
+ tc qdisc del dev $swp2 root 2>/dev/null
+
+ ip link set dev br11 down
+ ip link set dev br10 down
+
+ for intf in $swp2 $swp1; do
+ for vlan in 11 10; do
+ ip link set dev $intf.$vlan down
+ ip link set dev $intf.$vlan nomaster
+ vlan_destroy $intf $vlan
+ done
+
+ mtu_restore $intf
+ ip link set dev $intf down
+ done
+
+ ip link del dev br11
+ ip link del dev br10
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ swp3=${NETIFS[p5]}
+ h3=${NETIFS[p6]}
+
+ swp4=${NETIFS[p7]}
+ swp5=${NETIFS[p8]}
+
+ h2_mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 $(ipaddr 2 10) " vlan 10"
+ ping_test $h1.11 $(ipaddr 2 11) " vlan 11"
+}
+
+tbf_get_counter()
+{
+ local vlan=$1; shift
+
+ tc_rule_stats_get $h2 10$vlan ingress .bytes
+}
+
+do_tbf_test()
+{
+ local vlan=$1; shift
+ local mbit=$1; shift
+
+ start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac
+ sleep 5 # Wait for the burst to dwindle
+
+ local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan)
+ sleep 10
+ local t3=$(tbf_get_counter $vlan)
+ stop_traffic
+
+ RET=0
+
+ # Note: TBF uses 10^6 Mbits, not 2^20 ones.
+ local er=$((mbit * 1000 * 1000))
+ local nr=$(rate $t2 $t3 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-5 <= nr_pct && nr_pct <= 5))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-5%."
+
+ log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit"
+}
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh
new file mode 100755
index 000000000..84fb6cab8
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_ets.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+QDISC="ets strict"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_etsprio.sh
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
new file mode 100644
index 000000000..8bd85da19
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ tbf_test
+"
+source $lib_dir/sch_tbf_core.sh
+
+tbf_test_one()
+{
+ local bs=$1; shift
+
+ tc qdisc replace dev $swp2 parent 10:3 handle 103: tbf \
+ rate 400Mbit burst $bs limit 1M
+ tc qdisc replace dev $swp2 parent 10:2 handle 102: tbf \
+ rate 800Mbit burst $bs limit 1M
+
+ do_tbf_test 10 400 $bs
+ do_tbf_test 11 800 $bs
+}
+
+tbf_test()
+{
+ # This test is used for both ETS and PRIO. Even though we only need two
+ # bands, PRIO demands a minimum of three.
+ tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0
+ tbf_test_one 128K
+ tc qdisc del dev $swp2 root
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh
new file mode 100755
index 000000000..9c8cb1cb9
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_prio.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+QDISC="prio bands"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_etsprio.sh
diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
new file mode 100755
index 000000000..72aa21ba8
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ ping_ipv4
+ tbf_test
+"
+: ${lib_dir:=.}
+source $lib_dir/sch_tbf_core.sh
+
+tbf_test_one()
+{
+ local bs=$1; shift
+
+ tc qdisc replace dev $swp2 root handle 108: tbf \
+ rate 400Mbit burst $bs limit 1M
+ do_tbf_test 10 400 $bs
+}
+
+tbf_test()
+{
+ tbf_test_one 128K
+ tc qdisc del dev $swp2 root
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/settings b/tools/testing/selftests/net/forwarding/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/net/forwarding/skbedit_priority.sh b/tools/testing/selftests/net/forwarding/skbedit_priority.sh
new file mode 100755
index 000000000..bde11dc27
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/skbedit_priority.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test sends traffic from H1 to H2. Either on ingress of $swp1, or on
+# egress of $swp2, the traffic is acted upon by an action skbedit priority. The
+# new priority should be taken into account when classifying traffic on the PRIO
+# qdisc at $swp2. The test verifies that for different priority values, the
+# traffic ends up in the expected PRIO band.
+#
+# +----------------------+ +----------------------+
+# | H1 | | H2 |
+# | + $h1 | | $h2 + |
+# | | 192.0.2.1/28 | | 192.0.2.2/28 | |
+# +----|-----------------+ +----------------|-----+
+# | |
+# +----|----------------------------------------------------------------|-----+
+# | SW | | |
+# | +-|----------------------------------------------------------------|-+ |
+# | | + $swp1 BR $swp2 + | |
+# | | PRIO | |
+# | +--------------------------------------------------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ test_ingress
+ test_egress
+"
+
+NUM_NETIFS=4
+source lib.sh
+
+: ${HIT_TIMEOUT:=2000} # ms
+
+# Set up host interface $h1 with 192.0.2.1/28 through simple_if_init (lib.sh).
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+}
+
+# Counterpart of h1_create: hand the same interface/address to simple_if_fini.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+# Set up host interface $h2 with 192.0.2.2/28 through simple_if_init (lib.sh).
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+}
+
+# Counterpart of h2_create: hand the same interface/address to simple_if_fini.
+h2_destroy()
+{
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+# Build the switch side: a VLAN-filtering bridge br1 with $swp1 and $swp2 as
+# ports, clsact on both ports, and an 8-band PRIO qdisc at the root of $swp2.
+switch_create()
+{
+ ip link add name br1 up type bridge vlan_filtering 1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ # clsact provides the ingress/egress hooks the tests attach filters to.
+ tc qdisc add dev $swp1 clsact
+ tc qdisc add dev $swp2 clsact
+ # Reversed priomap: the tests expect priority N in class 10:(8 - N).
+ tc qdisc add dev $swp2 root handle 10: \
+ prio bands 8 priomap 7 6 5 4 3 2 1 0
+}
+
+# Undo switch_create in reverse order: remove the qdiscs, detach both ports
+# from the bridge and delete br1.
+switch_destroy()
+{
+ tc qdisc del dev $swp2 root
+ tc qdisc del dev $swp2 clsact
+ tc qdisc del dev $swp1 clsact
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+ ip link del dev br1
+}
+
+# Map the four NETIFS ports onto the topology (p1=h1, p2=swp1, p3=swp2,
+# p4=h2), record $h2's MAC for the traffic generator and build hosts and
+# switch. The variables are intentionally global; the tests read them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+ h1_create
+ h2_create
+ switch_create
+}
+
+# Tear down everything setup_prepare created, in reverse order. Installed as
+# the EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+ vrf_cleanup
+}
+
+# Connectivity check: ping_test (lib.sh) from $h1 towards H2's 192.0.2.2.
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2
+}
+
+# Install a flower filter with "action skbedit priority $prio" at $locus,
+# send 10 UDP packets from H1 to H2 and verify that both the PRIO class
+# $classid on $swp2 and the filter itself counted at least 10 more packets.
+#  $1 - locus: tc filter attachment point, e.g. "dev $swp1 ingress"
+#  $2 - prio: priority value written by skbedit
+#  $3 - classid: PRIO class expected to receive the traffic
+test_skbedit_priority_one()
+{
+ local locus=$1; shift
+ local prio=$1; shift
+ local classid=$1; shift
+
+ RET=0
+
+ tc filter add $locus handle 101 pref 1 \
+ flower action skbedit priority $prio
+
+ # Baselines taken before traffic: pkt0 = class counter, pkt2 = rule counter.
+ local pkt0=$(qdisc_parent_stats_get $swp2 $classid .packets)
+ local pkt2=$(tc_rule_handle_stats_get "$locus" 101)
+ $MZ $h1 -t udp "sp=54321,dp=12345" -c 10 -d 20msec -p 100 \
+ -a own -b $h2mac -A 192.0.2.1 -B 192.0.2.2 -q
+
+ # Declared separately so that $? below is busywait's status, not local's.
+ local pkt1
+ pkt1=$(busywait "$HIT_TIMEOUT" until_counter_is ">= $((pkt0 + 10))" \
+ qdisc_parent_stats_get $swp2 $classid .packets)
+ check_err $? "Expected to get 10 packets on class $classid, but got $((pkt1 - pkt0))."
+
+ local pkt3=$(tc_rule_handle_stats_get "$locus" 101)
+ ((pkt3 >= pkt2 + 10))
+ check_err $? "Expected to get 10 packets on skbedit rule but got $((pkt3 - pkt2))."
+
+ log_test "$locus skbedit priority $prio -> classid $classid"
+
+ tc filter del $locus pref 1
+}
+
+test_ingress()
+{
+ local prio
+
+ for prio in {0..7}; do
+ test_skbedit_priority_one "dev $swp1 ingress" \
+ $prio 10:$((8 - prio))
+ done
+}
+
+test_egress()
+{
+ local prio
+
+ for prio in {0..7}; do
+ test_skbedit_priority_one "dev $swp2 egress" \
+ $prio 10:$((8 - prio))
+ done
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
new file mode 100755
index 000000000..1e2703128
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
@@ -0,0 +1,269 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
+ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
+ gact_trap_test mirred_egress_to_ingress_tcp_test"
+NUM_NETIFS=4
+source tc_common.sh
+source lib.sh
+
+tcflags="skip_hw"
+
+# Set up host interface $h1 with 192.0.2.1/24 through simple_if_init (lib.sh).
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+}
+
+# Counterpart of h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+# Set up host interface $h2 with 192.0.2.2/24 and add a clsact qdisc so the
+# tests can attach ingress filters to it.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24
+ tc qdisc add dev $h2 clsact
+}
+
+# Counterpart of h2_create: remove clsact first, then the interface config.
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/24
+}
+
+# Configure the switch ports. Note the mirrored addressing: $swp1 carries
+# H2's address (192.0.2.2) and $swp2 carries H1's (192.0.2.1). Only $swp1
+# gets a clsact qdisc; that is where the tests install ingress filters.
+switch_create()
+{
+ simple_if_init $swp1 192.0.2.2/24
+ tc qdisc add dev $swp1 clsact
+
+ simple_if_init $swp2 192.0.2.1/24
+}
+
+# Undo switch_create in reverse order.
+switch_destroy()
+{
+ simple_if_fini $swp2 192.0.2.1/24
+
+ tc qdisc del dev $swp1 clsact
+ simple_if_fini $swp1 192.0.2.2/24
+}
+
+# Shared body for the mirred egress tests. A drop filter on $h2 ingress acts
+# as the packet counter: it must stay untouched before the mirred rule is
+# installed on $swp1 and must count one packet afterwards.
+#  $1 - action: mirred action keyword (callers pass "redirect" or "mirror")
+#  $2 - protocol: protocol for the filter on $swp1
+#  $3 - classifier: classifier kind (callers pass "flower" or "matchall")
+#  $4 - classifier_args: extra match arguments, may be empty
+mirred_egress_test()
+{
+ local action=$1
+ local protocol=$2
+ local classifier=$3
+ local classifier_args=$4
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ # First packet: no mirred rule yet, so nothing may reach $h2.
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched without redirect rule inserted"
+
+ # $classifier_args is deliberately unquoted so it splits into words.
+ tc filter add dev $swp1 ingress protocol $protocol pref 1 handle 101 \
+ $classifier $tcflags $classifier_args \
+ action mirred egress $action dev $swp2
+
+ # Second packet: must now be forwarded out of $swp2 and counted on $h2.
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_err $? "Did not match incoming $action packet"
+
+ tc filter del dev $swp1 ingress protocol $protocol pref 1 handle 101 \
+ $classifier
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "mirred egress $classifier $action ($tcflags)"
+}
+
+# Verify the gact "drop" and "ok" actions on $swp1 ingress: a drop rule
+# (pref 2) counts the first packet; after an "ok" rule is added at the
+# higher-priority pref 1, the second packet must hit only the "ok" rule.
+gact_drop_and_ok_test()
+{
+ RET=0
+
+ tc filter add dev $swp1 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $swp1 ingress" 102 1
+ check_err $? "Packet was not dropped"
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 action ok
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $swp1 ingress" 101 1
+ check_err $? "Did not see passed packet"
+
+ # The drop rule's counter must not have advanced to 2.
+ tc_check_packets "dev $swp1 ingress" 102 2
+ check_fail $? "Packet was dropped and it should not reach here"
+
+ tc filter del dev $swp1 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "gact drop and ok ($tcflags)"
+}
+
+# Verify the gact "trap" action. Only meaningful for the offloaded run: the
+# function returns immediately unless tcflags is "skip_sw".
+# A software-only (skip_hw) drop rule at pref 1 serves as the detector for
+# packets that reach the software path.
+gact_trap_test()
+{
+ RET=0
+
+ if [[ "$tcflags" != "skip_sw" ]]; then
+ return 0;
+ fi
+
+ tc filter add dev $swp1 ingress protocol ip pref 1 handle 101 flower \
+ skip_hw dst_ip 192.0.2.2 action drop
+ tc filter add dev $swp1 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.2.2 action mirred egress redirect \
+ dev $swp2
+
+ # Without a trap rule the software detector must not see the packet.
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $swp1 ingress" 101 1
+ check_fail $? "Saw packet without trap rule inserted"
+
+ tc filter add dev $swp1 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action trap
+
+ # With the trap rule, both the trap rule and the detector must count it.
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $swp1 ingress" 102 1
+ check_err $? "Packet was not trapped"
+
+ tc_check_packets "dev $swp1 ingress" 101 1
+ check_err $? "Did not see trapped packet"
+
+ tc filter del dev $swp1 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $swp1 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $swp1 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "trap ($tcflags)"
+}
+
+# Verify "mirred ingress redirect" from the egress hook of $h1 back into its
+# own ingress path:
+#  - TCP: a 2 MiB file is sent with nc to 192.0.2.2; egress rule 100 NATs the
+#    flow and redirects it to $h1 ingress, where a listening nc must receive
+#    an identical copy.
+#  - ICMP: 10 echo requests must hit egress redirect rule 101 and ingress
+#    drop rule 102, and rule 101 must report 10 overlimits.
+# Temp-file paths and the overlimits value are quoted so that a TMPDIR with
+# whitespace or an empty stats result cannot break the commands themselves.
+mirred_egress_to_ingress_tcp_test()
+{
+ local tmpfile=$(mktemp) tmpfile1=$(mktemp)
+
+ RET=0
+ dd conv=sparse status=none if=/dev/zero bs=1M count=2 of="$tmpfile"
+ tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
+ $tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
+ action ct commit nat src addr 192.0.2.2 pipe \
+ action ct clear pipe \
+ action ct commit nat dst addr 192.0.2.1 pipe \
+ action ct clear pipe \
+ action skbedit ptype host pipe \
+ action mirred ingress redirect dev $h1
+ tc filter add dev $h1 protocol ip pref 101 handle 101 egress flower \
+ $tcflags ip_proto icmp \
+ action mirred ingress redirect dev $h1
+ tc filter add dev $h1 protocol ip pref 102 handle 102 ingress flower \
+ ip_proto icmp \
+ action drop
+
+ # Receiver runs in the background; the sender's completion is followed by
+ # waiting for the receiver before the two files are compared.
+ ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o "$tmpfile1" &
+ local rpid=$!
+ ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <"$tmpfile"
+ wait -n $rpid
+ cmp -s "$tmpfile" "$tmpfile1"
+ check_err $? "server output check failed"
+
+ $MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
+ -t icmp "ping,id=42,seq=5" -q
+ tc_check_packets "dev $h1 egress" 101 10
+ check_err $? "didn't mirred redirect ICMP"
+ tc_check_packets "dev $h1 ingress" 102 10
+ check_err $? "didn't drop mirred ICMP"
+ local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits)
+ # Quoted: an empty result must compare unequal, not be a test(1) error.
+ test "${overlimits}" = 10
+ check_err $? "wrong overlimits, expected 10 got ${overlimits}"
+
+ tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
+ tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
+ tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
+
+ rm -f "$tmpfile" "$tmpfile1"
+ log_test "mirred_egress_to_ingress_tcp ($tcflags)"
+}
+
+# Map the NETIFS ports (p1=h1, p2=swp1, p3=swp2, p4=h2), record MACs and
+# build the topology. The switch ports temporarily take over the hosts' MAC
+# addresses ($swp1 <- h2mac, $swp2 <- h1mac); the originals are saved in
+# swp1origmac/swp2origmac so cleanup can restore them.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ swp1origmac=$(mac_get $swp1)
+ swp2origmac=$(mac_get $swp2)
+ ip link set $swp1 address $h2mac
+ ip link set $swp2 address $h1mac
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+# Tear down what setup_prepare built, in reverse order, and finally restore
+# the switch ports' original MAC addresses saved by setup_prepare.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+
+ ip link set $swp2 address $swp2origmac
+ ip link set $swp1 address $swp1origmac
+}
+
+# mirred egress "redirect", matched by flower on dst_ip 192.0.2.2.
+mirred_egress_redirect_test()
+{
+ mirred_egress_test redirect ip flower "dst_ip 192.0.2.2"
+}
+
+# mirred egress "mirror", matched by flower on dst_ip 192.0.2.2.
+mirred_egress_mirror_test()
+{
+ mirred_egress_test mirror ip flower "dst_ip 192.0.2.2"
+}
+
+# mirred egress "mirror" attached to a matchall classifier for protocol
+# "all"; the classifier takes no extra match arguments.
+matchall_mirred_egress_mirror_test()
+{
+ mirred_egress_test mirror all matchall ''
+}
+
+# Script driver. The EXIT trap makes cleanup run on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: tcflags is "skip_hw" (assigned near the top of this script).
+tests_run
+
+# Second pass with "skip_sw", only when tc_offload_check succeeds.
+tc_offload_check
+if [[ $? -ne 0 ]]; then
+ log_info "Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests_run
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_chains.sh b/tools/testing/selftests/net/forwarding/tc_chains.sh
new file mode 100755
index 000000000..2934fb5ed
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_chains.sh
@@ -0,0 +1,205 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="unreachable_chain_test gact_goto_chain_test create_destroy_chain \
+ template_filter_fits"
+NUM_NETIFS=2
+source tc_common.sh
+source lib.sh
+
+tcflags="skip_hw"
+
+# Set up host interface $h1 with 192.0.2.1/24 through simple_if_init (lib.sh).
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24
+}
+
+# Counterpart of h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24
+}
+
+# Set up host interface $h2 with 192.0.2.2/24 and add the clsact qdisc that
+# the chain tests attach their ingress filters and chains to.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24
+ tc qdisc add dev $h2 clsact
+}
+
+# Counterpart of h2_create: remove clsact first, then the interface config.
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/24
+}
+
+# A filter placed in chain 1 with nothing jumping to that chain must not see
+# traffic: send one packet and require that filter 1101 did not count it.
+unreachable_chain_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower $tcflags dst_mac $h2mac action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 1101 1
+ check_fail $? "matched on filter in unreachable chain"
+
+ tc filter del dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower
+
+ log_test "unreachable chain ($tcflags)"
+}
+
+# Verify "action goto chain": filter 101 (pref 1, default chain) jumps to
+# chain 1 where filter 1101 drops the packet. Filter 102 (pref 2, default
+# chain) would match the same packet and therefore must stay at zero.
+gact_goto_chain_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower $tcflags dst_mac $h2mac action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_mac $h2mac action drop
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_mac $h2mac action goto chain 1
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_err $? "Did not match on correct filter with goto chain action"
+
+ tc_check_packets "dev $h2 ingress" 1101 1
+ check_err $? "Did not match on correct filter in chain 1"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower
+
+ log_test "gact goto chain ($tcflags)"
+}
+
+# Exercise explicit chain management on $h2 ingress: create the default
+# chain and chain 1, read each back with "tc -j chain get", confirm both
+# appear in "tc -j chain show", then delete them.
+# The JSON held in $output is always expanded inside double quotes: it starts
+# with "[", so an unquoted expansion would be subject to pathname expansion
+# and word splitting before reaching jq.
+create_destroy_chain()
+{
+ RET=0
+
+ tc chain add dev $h2 ingress
+ check_err $? "Failed to create default chain"
+
+ output="$(tc -j chain get dev $h2 ingress)"
+ check_err $? "Failed to get default chain"
+
+ echo "$output" | jq -e ".[] | select(.chain == 0)" &> /dev/null
+ check_err $? "Unexpected output for default chain"
+
+ tc chain add dev $h2 ingress chain 1
+ check_err $? "Failed to create chain 1"
+
+ output="$(tc -j chain get dev $h2 ingress chain 1)"
+ check_err $? "Failed to get chain 1"
+
+ echo "$output" | jq -e ".[] | select(.chain == 1)" &> /dev/null
+ check_err $? "Unexpected output for chain 1"
+
+ output="$(tc -j chain show dev $h2 ingress)"
+ check_err $? "Failed to dump chains"
+
+ echo "$output" | jq -e ".[] | select(.chain == 0)" &> /dev/null
+ check_err $? "Can't find default chain in dump"
+
+ echo "$output" | jq -e ".[] | select(.chain == 1)" &> /dev/null
+ check_err $? "Can't find chain 1 in dump"
+
+ tc chain del dev $h2 ingress
+ check_err $? "Failed to destroy default chain"
+
+ tc chain del dev $h2 ingress chain 1
+ check_err $? "Failed to destroy chain 1"
+
+ log_test "create destroy chain"
+}
+
+# Verify chain templates: the default chain gets a flower template matching
+# on dst_mac, chain 1 one matching on src_mac. In each chain a filter that
+# fits the template must be accepted and one that matches on the other MAC
+# field must be rejected.
+template_filter_fits()
+{
+ RET=0
+
+ # Template creation output is discarded; its effect is checked through
+ # the filter insertions below.
+ tc chain add dev $h2 ingress protocol ip \
+ flower dst_mac 00:00:00:00:00:00/FF:FF:FF:FF:FF:FF &> /dev/null
+ tc chain add dev $h2 ingress chain 1 protocol ip \
+ flower src_mac 00:00:00:00:00:00/FF:FF:FF:FF:FF:FF &> /dev/null
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 1101 \
+ flower dst_mac $h2mac action drop
+ check_err $? "Failed to insert filter which fits template"
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 1102 \
+ flower src_mac $h2mac action drop &> /dev/null
+ check_fail $? "Incorrectly succeeded to insert filter which does not fit template"
+
+ tc filter add dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower src_mac $h2mac action drop
+ check_err $? "Failed to insert filter which fits template"
+
+ tc filter add dev $h2 ingress chain 1 protocol ip pref 1 handle 1102 \
+ flower dst_mac $h2mac action drop &> /dev/null
+ check_fail $? "Incorrectly succeeded to insert filter which does not fit template"
+
+ # Best-effort removal: the 1102 filters normally do not exist, so errors
+ # from these deletions are silenced.
+ tc filter del dev $h2 ingress chain 1 protocol ip pref 1 handle 1102 \
+ flower &> /dev/null
+ tc filter del dev $h2 ingress chain 1 protocol ip pref 1 handle 1101 \
+ flower &> /dev/null
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 1102 \
+ flower &> /dev/null
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 1101 \
+ flower &> /dev/null
+
+ tc chain del dev $h2 ingress chain 1
+ tc chain del dev $h2 ingress
+
+ log_test "template filter fits"
+}
+
+# Map the two NETIFS ports (p1=h1, p2=h2), record both MAC addresses for the
+# traffic generator and filters, and configure the hosts.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ h2=${NETIFS[p2]}
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+}
+
+# Tear down what setup_prepare built, in reverse order. Installed as the
+# EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Script driver. check_tc_chain_support (lib.sh, not visible here) runs
+# before anything is set up or the cleanup trap is installed.
+check_tc_chain_support
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: tcflags is "skip_hw" (assigned near the top of this script).
+tests_run
+
+# Second pass with "skip_sw", only when tc_offload_check succeeds.
+tc_offload_check
+if [[ $? -ne 0 ]]; then
+ log_info "Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests_run
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_common.sh b/tools/testing/selftests/net/forwarding/tc_common.sh
new file mode 100644
index 000000000..0e18e8be6
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_common.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+CHECK_TC="yes"
+
+# Can be overridden by the configuration file. See lib.sh
+TC_HIT_TIMEOUT=${TC_HIT_TIMEOUT:=1000} # ms
+
+# Succeed once the counter that tc_rule_handle_stats_get reports for a
+# filter equals the expected value. The wait is delegated to busywait with
+# $TC_HIT_TIMEOUT; its output is discarded and its status is returned.
+#  $1 - filter location handed to tc_rule_handle_stats_get
+#  $2 - filter handle
+#  $3 - expected counter value
+tc_check_packets()
+{
+ busywait "$TC_HIT_TIMEOUT" until_counter_is "== $3" \
+ tc_rule_handle_stats_get "$1" "$2" > /dev/null
+}
+
+# Succeed once the counter that tc_rule_handle_stats_get reports for a
+# filter is greater than zero. The wait is delegated to busywait with
+# $TC_HIT_TIMEOUT; its output is discarded and its status is returned.
+#  $1 - filter location handed to tc_rule_handle_stats_get
+#  $2 - filter handle
+tc_check_packets_hitting()
+{
+ busywait "$TC_HIT_TIMEOUT" until_counter_is "> 0" \
+ tc_rule_handle_stats_get "$1" "$2" > /dev/null
+}
diff --git a/tools/testing/selftests/net/forwarding/tc_flower.sh b/tools/testing/selftests/net/forwarding/tc_flower.sh
new file mode 100755
index 000000000..b7cdf75ef
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_flower.sh
@@ -0,0 +1,411 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="match_dst_mac_test match_src_mac_test match_dst_ip_test \
+ match_src_ip_test match_ip_flags_test match_pcp_test match_vlan_test \
+ match_ip_tos_test match_indev_test match_ip_ttl_test"
+NUM_NETIFS=2
+source tc_common.sh
+source lib.sh
+
+tcflags="skip_hw"
+
+# Set up host interface $h1 with two addresses, 192.0.2.1/24 and
+# 198.51.100.1/24, through simple_if_init (lib.sh).
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/24 198.51.100.1/24
+}
+
+# Counterpart of h1_create.
+h1_destroy()
+{
+ simple_if_fini $h1 192.0.2.1/24 198.51.100.1/24
+}
+
+# Set up host interface $h2 with 192.0.2.2/24 and 198.51.100.2/24 and add
+# the clsact qdisc that all flower match tests attach ingress filters to.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/24 198.51.100.2/24
+ tc qdisc add dev $h2 clsact
+}
+
+# Counterpart of h2_create: remove clsact first, then the interface config.
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/24 198.51.100.2/24
+}
+
+# flower dst_mac match: filter 101 matches a dummy MAC and must not count
+# the packet; filter 102 matches $h2mac and must have been hit.
+match_dst_mac_test()
+{
+ local dummy_mac=de:ad:be:ef:aa:aa
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_mac $dummy_mac action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_mac $h2mac action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter"
+
+ # Asserts that the counter is NOT exactly 0, i.e. at least one hit,
+ # rather than requiring exactly one.
+ tc_check_packets "dev $h2 ingress" 102 0
+ check_fail $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "dst_mac match ($tcflags)"
+}
+
+# flower src_mac match: filter 101 matches a dummy MAC and must not count
+# the packet; filter 102 matches $h1mac and must have been hit.
+match_src_mac_test()
+{
+ local dummy_mac=de:ad:be:ef:aa:aa
+
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags src_mac $dummy_mac action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags src_mac $h1mac action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter"
+
+ # Asserts that the counter is NOT exactly 0, i.e. at least one hit,
+ # rather than requiring exactly one.
+ tc_check_packets "dev $h2 ingress" 102 0
+ check_fail $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ log_test "src_mac match ($tcflags)"
+}
+
+# flower dst_ip match, exact and masked. Filter 101 matches an unrelated
+# address, 102 the exact destination, 103 the destination's /24. The first
+# packet must hit 102 only; after 102 is deleted the second packet must fall
+# through to the masked filter 103.
+match_dst_ip_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 198.51.100.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags dst_ip 192.0.2.0/24 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_err $? "Did not match on correct filter with mask"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+
+ log_test "dst_ip match ($tcflags)"
+}
+
+# flower src_ip match, exact and masked. Filter 101 matches an unrelated
+# address, 102 the exact source, 103 the source's /24. The first packet must
+# hit 102 only; after 102 is deleted the second packet must fall through to
+# the masked filter 103.
+match_src_ip_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags src_ip 198.51.100.1 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags src_ip 192.0.2.1 action drop
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags src_ip 192.0.2.0/24 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_err $? "Did not match on correct filter with mask"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+
+ log_test "src_ip match ($tcflags)"
+}
+
+# flower ip_flags match. Four filters are chained with "continue" so one
+# packet can be counted by several of them: 101 frag, 102 firstfrag,
+# 103 nofirstfrag, 104 nofrag (drop). Counters accumulate over three traffic
+# phases, so every expected value below is a running total:
+#  phase 1 - unfragmented packet:      101=0 102=0 103=1 104=1
+#  phase 2 - first fragment:           101=1 102=1 103=1 104=1
+#  phase 3 - two non-first fragments:  101=3 102=1 103=3 104=1
+match_ip_flags_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags ip_flags frag action continue
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags ip_flags firstfrag action continue
+ tc filter add dev $h2 ingress protocol ip pref 3 handle 103 flower \
+ $tcflags ip_flags nofirstfrag action continue
+ tc filter add dev $h2 ingress protocol ip pref 4 handle 104 flower \
+ $tcflags ip_flags nofrag action drop
+
+ # Phase 1: offset 0, no "more fragments" flag.
+ $MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "frag=0" -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on wrong frag filter (nofrag)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_fail $? "Matched on wrong firstfrag filter (nofrag)"
+
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_err $? "Did not match on nofirstfrag filter (nofrag) "
+
+ tc_check_packets "dev $h2 ingress" 104 1
+ check_err $? "Did not match on nofrag filter (nofrag)"
+
+ # Phase 2: offset 0 with "more fragments" set.
+ $MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "frag=0,mf" -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_err $? "Did not match on frag filter (1stfrag)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match fistfrag filter (1stfrag)"
+
+ # 103 and 104 must still be at their phase-1 value of 1.
+ tc_check_packets "dev $h2 ingress" 103 1
+ check_err $? "Matched on wrong nofirstfrag filter (1stfrag)"
+
+ tc_check_packets "dev $h2 ingress" 104 1
+ check_err $? "Match on wrong nofrag filter (1stfrag)"
+
+ # Phase 3: two packets at non-zero offset, with and without "mf".
+ $MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "frag=256,mf" -q
+ $MZ $h1 -c 1 -p 1000 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "frag=256" -q
+
+ tc_check_packets "dev $h2 ingress" 101 3
+ check_err $? "Did not match on frag filter (no1stfrag)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Matched on wrong firstfrag filter (no1stfrag)"
+
+ tc_check_packets "dev $h2 ingress" 103 3
+ check_err $? "Did not match on nofirstfrag filter (no1stfrag)"
+
+ tc_check_packets "dev $h2 ingress" 104 1
+ check_err $? "Matched on nofrag filter (no1stfrag)"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 3 handle 103 flower
+ tc filter del dev $h2 ingress protocol ip pref 4 handle 104 flower
+
+ log_test "ip_flags match ($tcflags)"
+}
+
+# flower vlan_prio (PCP) match on VLAN 85. Filter 101 matches PCP 6, filter
+# 102 PCP 7. Two tagged packets are sent with PCP 7 and PCP 0: 101 must stay
+# at 0 and 102 must count exactly one.
+match_pcp_test()
+{
+ RET=0
+
+ vlan_create $h2 85 v$h2 192.0.2.11/24
+
+ tc filter add dev $h2 ingress protocol 802.1q pref 1 handle 101 \
+ flower vlan_prio 6 $tcflags dst_mac $h2mac action drop
+ tc filter add dev $h2 ingress protocol 802.1q pref 2 handle 102 \
+ flower vlan_prio 7 $tcflags dst_mac $h2mac action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -B 192.0.2.11 -Q 7:85 -t ip -q
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -B 192.0.2.11 -Q 0:85 -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 0
+ check_err $? "Matched on specified PCP when should not"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on specified PCP"
+
+ tc filter del dev $h2 ingress protocol 802.1q pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol 802.1q pref 1 handle 101 flower
+
+ vlan_destroy $h2 85
+
+ log_test "PCP match ($tcflags)"
+}
+
+# flower vlan_id match. VLANs 85 and 75 are created on $h2; filter 101
+# matches VLAN 75, filter 102 VLAN 85. One packet tagged with VLAN 85 is
+# sent: 101 must stay at 0 and 102 must count exactly one.
+match_vlan_test()
+{
+ RET=0
+
+ vlan_create $h2 85 v$h2 192.0.2.11/24
+ vlan_create $h2 75 v$h2 192.0.2.10/24
+
+ tc filter add dev $h2 ingress protocol 802.1q pref 1 handle 101 \
+ flower vlan_id 75 $tcflags action drop
+ tc filter add dev $h2 ingress protocol 802.1q pref 2 handle 102 \
+ flower vlan_id 85 $tcflags action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -B 192.0.2.11 -Q 0:85 -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 0
+ check_err $? "Matched on specified VLAN when should not"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on specified VLAN"
+
+ tc filter del dev $h2 ingress protocol 802.1q pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol 802.1q pref 1 handle 101 flower
+
+ vlan_destroy $h2 75
+ vlan_destroy $h2 85
+
+ log_test "VLAN match ($tcflags)"
+}
+
+# flower ip_tos match. Filter 101 matches TOS 0x20, filter 102 TOS 0x18.
+# A packet sent with tos=18 must hit only 102; a following packet with
+# tos=20 must hit only 101 (so 102 must not reach 2).
+# NOTE(review): the generator arguments "tos=18"/"tos=20" are presumably
+# interpreted as hex by $MZ, matching the 0x18/0x20 filters - confirm.
+match_ip_tos_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 ip_tos 0x20 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 ip_tos 0x18 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip tos=18 -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter (0x18)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter (0x18)"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip tos=20 -q
+
+ tc_check_packets "dev $h2 ingress" 102 2
+ check_fail $? "Matched on a wrong filter (0x20)"
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_err $? "Did not match on correct filter (0x20)"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "ip_tos match ($tcflags)"
+}
+
+# flower ip_ttl match. Filter 101 (pref 1) matches dst_ip plus TTL 63;
+# filter 102 (pref 2) matches dst_ip only and acts as the fallback.
+# Two TTL-63 packets (one of them a fragment) must both hit 101 and leave
+# 102 untouched; a TTL-255 packet must then hit 102 and leave 101 at 2.
+match_ip_ttl_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags dst_ip 192.0.2.2 ip_ttl 63 action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags dst_ip 192.0.2.2 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=63" -q
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=63,mf,frag=256" -q
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_fail $? "Matched on the wrong filter (no check on ttl)"
+
+ tc_check_packets "dev $h2 ingress" 101 2
+ check_err $? "Did not match on correct filter (ttl=63)"
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip "ttl=255" -q
+
+ tc_check_packets "dev $h2 ingress" 101 3
+ check_fail $? "Matched on a wrong filter (ttl=63)"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter (no check on ttl)"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "ip_ttl match ($tcflags)"
+}
+
+# flower indev match on ingress of $h2. Filter 101 names $h1 as the input
+# device and must not match; filter 102 names $h2 and must count the packet.
+match_indev_test()
+{
+ RET=0
+
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ $tcflags indev $h1 dst_mac $h2mac action drop
+ tc filter add dev $h2 ingress protocol ip pref 2 handle 102 flower \
+ $tcflags indev $h2 dst_mac $h2mac action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $h2mac -A 192.0.2.1 -B 192.0.2.2 \
+ -t ip -q
+
+ tc_check_packets "dev $h2 ingress" 101 1
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $h2 ingress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $h2 ingress protocol ip pref 2 handle 102 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+
+ log_test "indev match ($tcflags)"
+}
+
+# Map the two NETIFS ports (p1=h1, p2=h2), record both MAC addresses for the
+# traffic generator and filters, and configure the hosts.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ h2=${NETIFS[p2]}
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+}
+
+# Tear down what setup_prepare built, in reverse order. Installed as the
+# EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Script driver. The EXIT trap makes cleanup run on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass: tcflags is "skip_hw" (assigned near the top of this script).
+tests_run
+
+# Second pass with "skip_sw", only when tc_offload_check succeeds.
+tc_offload_check
+if [[ $? -ne 0 ]]; then
+ log_info "Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests_run
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_flower_router.sh b/tools/testing/selftests/net/forwarding/tc_flower_router.sh
new file mode 100755
index 000000000..4aee9c9e6
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_flower_router.sh
@@ -0,0 +1,172 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="match_indev_egress_test"
+NUM_NETIFS=6
+source tc_common.sh
+source lib.sh
+
+# Set up H1 on 192.0.1.1/24 and route the two other host subnets
+# (192.0.2.0/24, 192.0.3.0/24) via the router address 192.0.1.2 in H1's VRF.
+h1_create()
+{
+ simple_if_init $h1 192.0.1.1/24
+
+ ip route add 192.0.2.0/24 vrf v$h1 nexthop via 192.0.1.2
+ ip route add 192.0.3.0/24 vrf v$h1 nexthop via 192.0.1.2
+}
+
+# Counterpart of h1_create: remove the routes, then the interface config.
+h1_destroy()
+{
+ ip route del 192.0.3.0/24 vrf v$h1
+ ip route del 192.0.2.0/24 vrf v$h1
+
+ simple_if_fini $h1 192.0.1.1/24
+}
+
+# Set up H2 on 192.0.2.1/24 and route the two other host subnets
+# (192.0.1.0/24, 192.0.3.0/24) via the router address 192.0.2.2 in H2's VRF.
+h2_create()
+{
+ simple_if_init $h2 192.0.2.1/24
+
+ ip route add 192.0.1.0/24 vrf v$h2 nexthop via 192.0.2.2
+ ip route add 192.0.3.0/24 vrf v$h2 nexthop via 192.0.2.2
+}
+
+# Counterpart of h2_create: remove the routes, then the interface config.
+h2_destroy()
+{
+ ip route del 192.0.3.0/24 vrf v$h2
+ ip route del 192.0.1.0/24 vrf v$h2
+
+ simple_if_fini $h2 192.0.2.1/24
+}
+
+# Set up H3 on 192.0.3.1/24 and route the two other host subnets
+# (192.0.1.0/24, 192.0.2.0/24) via the router address 192.0.3.2 in H3's VRF.
+h3_create()
+{
+ simple_if_init $h3 192.0.3.1/24
+
+ ip route add 192.0.1.0/24 vrf v$h3 nexthop via 192.0.3.2
+ ip route add 192.0.2.0/24 vrf v$h3 nexthop via 192.0.3.2
+}
+
+# Counterpart of h3_create: remove the routes, then the interface config.
+h3_destroy()
+{
+ ip route del 192.0.2.0/24 vrf v$h3
+ ip route del 192.0.1.0/24 vrf v$h3
+
+ simple_if_fini $h3 192.0.3.1/24
+}
+
+
+# Bring up the three router ports, add clsact on $rp3 (where the test
+# installs egress filters) and assign the .2 address of each host subnet.
+router_create()
+{
+ ip link set dev $rp1 up
+ ip link set dev $rp2 up
+ ip link set dev $rp3 up
+
+ tc qdisc add dev $rp3 clsact
+
+ ip address add 192.0.1.2/24 dev $rp1
+ ip address add 192.0.2.2/24 dev $rp2
+ ip address add 192.0.3.2/24 dev $rp3
+}
+
+# Undo router_create in reverse order: addresses, clsact, link state.
+router_destroy()
+{
+ ip address del 192.0.3.2/24 dev $rp3
+ ip address del 192.0.2.2/24 dev $rp2
+ ip address del 192.0.1.2/24 dev $rp1
+
+ tc qdisc del dev $rp3 clsact
+
+ ip link set dev $rp3 down
+ ip link set dev $rp2 down
+ ip link set dev $rp1 down
+}
+
+# flower indev match on the egress hook of $rp3 for routed traffic towards
+# H3 (192.0.3.1). Filter 101 names $rp1 as input device, filter 102 $rp2.
+# A packet from H1 (entering via $rp1) must hit only 101; a packet from H2
+# (entering via $rp2) must then hit only 102, leaving 101 at 1.
+match_indev_egress_test()
+{
+ RET=0
+
+ tc filter add dev $rp3 egress protocol ip pref 1 handle 101 flower \
+ $tcflags indev $rp1 dst_ip 192.0.3.1 action drop
+ tc filter add dev $rp3 egress protocol ip pref 2 handle 102 flower \
+ $tcflags indev $rp2 dst_ip 192.0.3.1 action drop
+
+ $MZ $h1 -c 1 -p 64 -a $h1mac -b $rp1mac -A 192.0.1.1 -B 192.0.3.1 \
+ -t ip -q
+
+ tc_check_packets "dev $rp3 egress" 102 1
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $rp3 egress" 101 1
+ check_err $? "Did not match on correct filter"
+
+ $MZ $h2 -c 1 -p 64 -a $h2mac -b $rp2mac -A 192.0.2.1 -B 192.0.3.1 \
+ -t ip -q
+
+ tc_check_packets "dev $rp3 egress" 101 2
+ check_fail $? "Matched on a wrong filter"
+
+ tc_check_packets "dev $rp3 egress" 102 1
+ check_err $? "Did not match on correct filter"
+
+ tc filter del dev $rp3 egress protocol ip pref 2 handle 102 flower
+ tc filter del dev $rp3 egress protocol ip pref 1 handle 101 flower
+
+ log_test "indev egress match ($tcflags)"
+}
+
+# Map the six NETIFS ports into three host/router-port pairs (p1/p2, p3/p4,
+# p5/p6), record the MAC addresses used by the traffic generator, build the
+# hosts and the router, and enable forwarding.
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+
+ h2=${NETIFS[p3]}
+ rp2=${NETIFS[p4]}
+
+ h3=${NETIFS[p5]}
+ rp3=${NETIFS[p6]}
+
+ h1mac=$(mac_get $h1)
+ rp1mac=$(mac_get $rp1)
+ h2mac=$(mac_get $h2)
+ rp2mac=$(mac_get $rp2)
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+
+ router_create
+
+ forwarding_enable
+}
+
+# Tear down what setup_prepare built, in reverse order. Installed as the
+# EXIT trap at the bottom of the script.
+cleanup()
+{
+ pre_cleanup
+
+ forwarding_restore
+
+ router_destroy
+
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+# Script driver. The EXIT trap makes cleanup run on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# Unlike the sibling tc_* scripts there is no unconditional tests_run: the
+# tests only run, with tcflags="skip_sw", when tc_offload_check succeeds.
+tc_offload_check
+if [[ $? -ne 0 ]]; then
+ log_info "Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests_run
+fi
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/tc_police.sh b/tools/testing/selftests/net/forwarding/tc_police.sh
new file mode 100755
index 000000000..eb09acdcb
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_police.sh
@@ -0,0 +1,385 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test tc-police action.
+#
+# +---------------------------------+
+# | H1 (vrf) |
+# | + $h1 |
+# | | 192.0.2.1/24 |
+# | | |
+# | | default via 192.0.2.2 |
+# +----|----------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | SW | |
+# | + $rp1 |
+# | 192.0.2.2/24 |
+# | |
+# | 198.51.100.2/24 203.0.113.2/24 |
+# | + $rp2 + $rp3 |
+# | | | |
+# +----|-----------------------------------------|----------------------------+
+# | |
+# +----|----------------------------+ +----|----------------------------+
+# | | default via 198.51.100.2 | | | default via 203.0.113.2 |
+# | | | | | |
+# | | 198.51.100.1/24 | | | 203.0.113.1/24 |
+# | + $h2 | | + $h3 |
+# | H2 (vrf) | | H3 (vrf) |
+# +---------------------------------+ +---------------------------------+
+
+ALL_TESTS="
+ police_rx_test
+ police_tx_test
+ police_shared_test
+ police_rx_mirror_test
+ police_tx_mirror_test
+ police_mtu_rx_test
+ police_mtu_tx_test
+"
+NUM_NETIFS=6
+source tc_common.sh
+source lib.sh
+
+# Host H1: 192.0.2.1/24 on $h1, default route in VRF v$h1 via the router.
+h1_create()
+{
+ simple_if_init ${h1} 192.0.2.1/24
+
+ ip -4 route add default vrf v${h1} nexthop via 192.0.2.2
+}
+
+# Reverse of h1_create: drop the default route, then deconfigure $h1.
+h1_destroy()
+{
+ ip -4 route del default vrf v${h1} nexthop via 192.0.2.2
+
+ simple_if_fini ${h1} 192.0.2.1/24
+}
+
+# Host H2: 198.51.100.1/24 on $h2, default route via 198.51.100.2, plus a
+# clsact qdisc so the tests can attach measuring filters to $h2.
+h2_create()
+{
+ simple_if_init ${h2} 198.51.100.1/24
+
+ ip -4 route add default vrf v${h2} nexthop via 198.51.100.2
+
+ tc qdisc add dev ${h2} clsact
+}
+
+# Reverse of h2_create: qdisc first, then the route, then the interface.
+h2_destroy()
+{
+ tc qdisc del dev ${h2} clsact
+
+ ip -4 route del default vrf v${h2} nexthop via 198.51.100.2
+
+ simple_if_fini ${h2} 198.51.100.1/24
+}
+
+# Host H3: 203.0.113.1/24 on $h3, default route via 203.0.113.2, plus a
+# clsact qdisc so the mirror tests can attach measuring filters to $h3.
+h3_create()
+{
+ simple_if_init ${h3} 203.0.113.1/24
+
+ ip -4 route add default vrf v${h3} nexthop via 203.0.113.2
+
+ tc qdisc add dev ${h3} clsact
+}
+
+# Reverse of h3_create: qdisc first, then the route, then the interface.
+h3_destroy()
+{
+ tc qdisc del dev ${h3} clsact
+
+ ip -4 route del default vrf v${h3} nexthop via 203.0.113.2
+
+ simple_if_fini ${h3} 203.0.113.1/24
+}
+
+# Bring up the three router ports, assign their addresses and attach a
+# clsact qdisc to $rp1 and $rp2, the ports the tests install filters on.
+router_create()
+{
+ ip link set dev ${rp1} up
+ ip link set dev ${rp2} up
+ ip link set dev ${rp3} up
+
+ __addr_add_del ${rp1} add 192.0.2.2/24
+ __addr_add_del ${rp2} add 198.51.100.2/24
+ __addr_add_del ${rp3} add 203.0.113.2/24
+
+ tc qdisc add dev ${rp1} clsact
+ tc qdisc add dev ${rp2} clsact
+}
+
+# Reverse of router_create: qdiscs, then addresses, then link state.
+router_destroy()
+{
+ tc qdisc del dev ${rp2} clsact
+ tc qdisc del dev ${rp1} clsact
+
+ __addr_add_del ${rp3} del 203.0.113.2/24
+ __addr_add_del ${rp2} del 198.51.100.2/24
+ __addr_add_del ${rp1} del 192.0.2.2/24
+
+ ip link set dev ${rp3} down
+ ip link set dev ${rp2} down
+ ip link set dev ${rp1} down
+}
+
+police_common_test()
+{
+ local test_name=$1; shift
+
+ RET=0
+
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1000 -c 0 -q &
+
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ { kill %% && wait %%; } 2>/dev/null
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+# Install an 80mbit policer on ingress of $rp1 and run the common rate check.
+police_rx_test()
+{
+ # Rule to police traffic destined to $h2 on ingress of $rp1
+ tc filter add dev ${rp1} ingress protocol ip pref 1 handle 101 \
+ flower dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok
+
+ police_common_test "police on rx"
+
+ tc filter del dev ${rp1} ingress protocol ip pref 1 handle 101 flower
+}
+
+# Install an 80mbit policer on egress of $rp2 and run the common rate check.
+police_tx_test()
+{
+ # Rule to police traffic destined to $h2 on egress of $rp2
+ tc filter add dev ${rp2} egress protocol ip pref 1 handle 101 \
+ flower dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok
+
+ police_common_test "police on tx"
+
+ tc filter del dev ${rp2} egress protocol ip pref 1 handle 101 flower
+}
+
+# Send a continuous UDP stream from $h1 to 198.51.100.1 on the given
+# destination port and check that the rate counted by the pref 1 ingress
+# rule on $h2 is within +-10% of the expected 80 * 1000 * 1000.
+# The measuring rule and the policers are installed by police_shared_test.
+#
+# Arguments:
+#   $1 - UDP destination port of the generated stream
+#   $2 - test name reported through log_test
+police_shared_common_test()
+{
+ local dport=$1; shift
+ local test_name=$1; shift
+
+ RET=0
+
+ # Honour an MZ override, as the sibling tests that invoke $MZ do; fall
+ # back to plain mausezahn when MZ is unset. Runs in the background until
+ # it is killed below.
+ ${MZ:-mausezahn} $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=$dport -p 1000 -c 0 -q &
+
+ # Sample the byte counter of the measuring rule 10 seconds apart.
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ # er: expected rate, nr: measured rate, nr_pct: deviation in percent.
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ # Stop the background traffic generator; job-control noise is discarded.
+ { kill %% && wait %%; } 2>/dev/null
+}
+
+police_shared_test()
+{
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp src_port 12345 \
+ action drop
+
+ # Rule to police traffic destined to $h2 on ingress of $rp1
+ tc filter add dev $rp1 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/ok \
+ index 10
+
+ # Rule to police a different flow destined to $h2 on egress of $rp2
+ # using same policer
+ tc filter add dev $rp2 egress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 22222 \
+ action police index 10
+
+ police_shared_common_test 54321 "police with shared policer - rx"
+
+ police_shared_common_test 22222 "police with shared policer - tx"
+
+ tc filter del dev $rp2 egress protocol ip pref 1 handle 101 flower
+ tc filter del dev $rp1 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+police_mirror_common_test()
+{
+ local pol_if=$1; shift
+ local dir=$1; shift
+ local test_name=$1; shift
+
+ RET=0
+
+ # Rule to measure bandwidth on ingress of $h2
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ # Rule to measure bandwidth of mirrored traffic on ingress of $h3
+ tc filter add dev $h3 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ # Rule to police traffic destined to $h2 and mirror to $h3
+ tc filter add dev $pol_if $dir protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police rate 80mbit burst 16k conform-exceed drop/pipe \
+ action mirred egress mirror dev $rp3
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1000 -c 0 -q &
+
+ local t0=$(tc_rule_stats_get $h2 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h2 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ local t0=$(tc_rule_stats_get $h3 1 ingress .bytes)
+ sleep 10
+ local t1=$(tc_rule_stats_get $h3 1 ingress .bytes)
+
+ local er=$((80 * 1000 * 1000))
+ local nr=$(rate $t0 $t1 10)
+ local nr_pct=$((100 * (nr - er) / er))
+ ((-10 <= nr_pct && nr_pct <= 10))
+ check_err $? "Expected rate $(humanize $er), got $(humanize $nr), which is $nr_pct% off. Required accuracy is +-10%."
+
+ log_test "$test_name"
+
+ { kill %% && wait %%; } 2>/dev/null
+ tc filter del dev $pol_if $dir protocol ip pref 1 handle 101 flower
+ tc filter del dev $h3 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+}
+
+# Police-and-mirror check with the filter on ingress of $rp1.
+police_rx_mirror_test()
+{
+ police_mirror_common_test ${rp1} ingress "police rx and mirror"
+}
+
+# Police-and-mirror check with the filter on egress of $rp2.
+police_tx_mirror_test()
+{
+ police_mirror_common_test ${rp2} egress "police tx and mirror"
+}
+
+police_mtu_common_test() {
+ RET=0
+
+ local test_name=$1; shift
+ local dev=$1; shift
+ local direction=$1; shift
+
+ tc filter add dev $dev $direction protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action police mtu 1042 conform-exceed drop/ok
+
+ # to count "conform" packets
+ tc filter add dev $h2 ingress protocol ip pref 1 handle 101 flower \
+ dst_ip 198.51.100.1 ip_proto udp dst_port 54321 \
+ action drop
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1001 -c 10 -q
+
+ mausezahn $h1 -a own -b $(mac_get $rp1) -A 192.0.2.1 -B 198.51.100.1 \
+ -t udp sp=12345,dp=54321 -p 1000 -c 3 -q
+
+ tc_check_packets "dev $dev $direction" 101 13
+ check_err $? "wrong packet counter"
+
+ # "exceed" packets
+ local overlimits_t0=$(tc_rule_stats_get ${dev} 1 ${direction} .overlimits)
+ test ${overlimits_t0} = 10
+ check_err $? "wrong overlimits, expected 10 got ${overlimits_t0}"
+
+ # "conform" packets
+ tc_check_packets "dev $h2 ingress" 101 3
+ check_err $? "forwarding error"
+
+ tc filter del dev $h2 ingress protocol ip pref 1 handle 101 flower
+ tc filter del dev $dev $direction protocol ip pref 1 handle 101 flower
+
+ log_test "$test_name"
+}
+
+# MTU policing check with the filter on ingress of $rp1.
+police_mtu_rx_test()
+{
+ police_mtu_common_test "police mtu (rx)" ${rp1} ingress
+}
+
+# MTU policing check with the filter on egress of $rp2.
+police_mtu_tx_test()
+{
+ police_mtu_common_test "police mtu (tx)" ${rp2} egress
+}
+
+# Map NETIFS[p1..p6] to role names, enable forwarding and build the
+# hosts and the router.
+setup_prepare()
+{
+ h1=${NETIFS[p1]};  rp1=${NETIFS[p2]}
+ rp2=${NETIFS[p3]}; h2=${NETIFS[p4]}
+ rp3=${NETIFS[p5]}; h3=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create; h2_create; h3_create
+ router_create
+}
+
+# Undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ router_destroy
+ h3_destroy; h2_destroy; h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+# Tear the topology down on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit ${EXIT_STATUS}
diff --git a/tools/testing/selftests/net/forwarding/tc_shblocks.sh b/tools/testing/selftests/net/forwarding/tc_shblocks.sh
new file mode 100755
index 000000000..772e00ac3
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_shblocks.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="shared_block_test match_indev_test"
+NUM_NETIFS=4
+source tc_common.sh
+source lib.sh
+
+tcflags="skip_hw"
+
+# Host H1: 192.0.2.1/24 on $h1.
+h1_create()
+{
+ simple_if_init ${h1} 192.0.2.1/24
+}
+
+# Reverse of h1_create.
+h1_destroy()
+{
+ simple_if_fini ${h1} 192.0.2.1/24
+}
+
+# Host H2: configured with the same 192.0.2.1/24 address as H1.
+h2_create()
+{
+ simple_if_init ${h2} 192.0.2.1/24
+}
+
+# Reverse of h2_create.
+h2_destroy()
+{
+ simple_if_fini ${h2} 192.0.2.1/24
+}
+
+# Configure both switch ports with 192.0.2.2/24 and attach clsact qdiscs
+# that share ingress block 22 and egress block 23.
+switch_create()
+{
+ simple_if_init ${swp1} 192.0.2.2/24
+ tc qdisc add dev ${swp1} ingress_block 22 egress_block 23 clsact
+
+ simple_if_init ${swp2} 192.0.2.2/24
+ tc qdisc add dev ${swp2} ingress_block 22 egress_block 23 clsact
+}
+
+# Reverse of switch_create, handling $swp2 first and then $swp1.
+switch_destroy()
+{
+ tc qdisc del dev ${swp2} clsact
+ simple_if_fini ${swp2} 192.0.2.2/24
+
+ tc qdisc del dev ${swp1} clsact
+ simple_if_fini ${swp1} 192.0.2.2/24
+}
+
+# A single filter installed on block 22 must count packets sent from both
+# $h1 and $h2: the counter is expected to read 1 and then 2.
+shared_block_test()
+{
+ RET=0
+
+ tc filter add block 22 protocol ip pref 1 handle 101 \
+ flower ${tcflags} dst_ip 192.0.2.2 action drop
+
+ # Packet injected on $h1.
+ ${MZ} ${h1} -c 1 -p 64 -a ${h1mac} -b ${swmac} \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip -q
+
+ tc_check_packets "block 22" 101 1
+ check_err $? "Did not match first incoming packet on a block"
+
+ # Packet injected on $h2.
+ ${MZ} ${h2} -c 1 -p 64 -a ${h2mac} -b ${swmac} \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip -q
+
+ tc_check_packets "block 22" 101 2
+ check_err $? "Did not match second incoming packet on a block"
+
+ tc filter del block 22 protocol ip pref 1 handle 101 flower
+
+ log_test "shared block ($tcflags)"
+}
+
+# Two filters on shared block 22 that differ only in "indev": handle 101
+# matches indev $swp1, handle 102 matches indev $swp2. Each is expected to
+# count exactly one packet.
+match_indev_test()
+{
+ RET=0
+
+ tc filter add block 22 protocol ip pref 1 handle 101 \
+ flower ${tcflags} indev ${swp1} dst_mac ${swmac} action drop
+ tc filter add block 22 protocol ip pref 2 handle 102 \
+ flower ${tcflags} indev ${swp2} dst_mac ${swmac} action drop
+
+ # Packet injected on $h1.
+ ${MZ} ${h1} -c 1 -p 64 -a ${h1mac} -b ${swmac} \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip -q
+
+ tc_check_packets "block 22" 101 1
+ check_err $? "Did not match first incoming packet on a block"
+
+ # Packet injected on $h2.
+ ${MZ} ${h2} -c 1 -p 64 -a ${h2mac} -b ${swmac} \
+ -A 192.0.2.1 -B 192.0.2.2 -t ip -q
+
+ tc_check_packets "block 22" 102 1
+ check_err $? "Did not match second incoming packet on a block"
+
+ tc filter del block 22 protocol ip pref 1 handle 101 flower
+ tc filter del block 22 protocol ip pref 2 handle 102 flower
+
+ log_test "indev match ($tcflags)"
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ h1mac=$(mac_get $h1)
+ h2mac=$(mac_get $h2)
+
+ swmac=$(mac_get $swp1)
+ swp2origmac=$(mac_get $swp2)
+ ip link set $swp2 address $swmac
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ switch_create
+}
+
+# Undo setup_prepare; the original MAC address of $swp2 is restored last.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+ h2_destroy; h1_destroy
+
+ vrf_cleanup
+
+ ip link set ${swp2} address ${swp2origmac}
+}
+
+check_tc_shblock_support
+
+# Tear the topology down on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+# First pass runs with the initial tcflags value.
+tests_run
+
+# Second pass with skip_sw, only if tc_offload_check succeeds.
+if ! tc_offload_check; then
+ log_info "Could not test offloaded functionality"
+else
+ tcflags="skip_sw"
+ tests_run
+fi
+
+exit ${EXIT_STATUS}
diff --git a/tools/testing/selftests/net/forwarding/tc_vlan_modify.sh b/tools/testing/selftests/net/forwarding/tc_vlan_modify.sh
new file mode 100755
index 000000000..45378905c
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/tc_vlan_modify.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+ALL_TESTS="
+ vlan_modify_ingress
+ vlan_modify_egress
+"
+
+NUM_NETIFS=4
+CHECK_TC="yes"
+source lib.sh
+
+# Host H1: addresses on $h1 itself plus a VLAN 85 interface with its own
+# IPv4 and IPv6 subnets.
+h1_create()
+{
+ simple_if_init ${h1} 192.0.2.1/28 2001:db8:1::1/64
+ vlan_create ${h1} 85 v${h1} 192.0.2.17/28 2001:db8:2::1/64
+}
+
+# Reverse of h1_create: VLAN 85 first, then the base interface.
+h1_destroy()
+{
+ vlan_destroy ${h1} 85
+ simple_if_fini ${h1} 192.0.2.1/28 2001:db8:1::1/64
+}
+
+# Host H2: addresses on $h2 itself plus a VLAN 65 interface with its own
+# IPv4 and IPv6 subnets.
+h2_create()
+{
+ simple_if_init ${h2} 192.0.2.2/28 2001:db8:1::2/64
+ vlan_create ${h2} 65 v${h2} 192.0.2.18/28 2001:db8:2::2/64
+}
+
+# Reverse of h2_create: VLAN 65 first, then the base interface.
+h2_destroy()
+{
+ vlan_destroy ${h2} 65
+ simple_if_fini ${h2} 192.0.2.2/28 2001:db8:1::2/64
+}
+
+# Create VLAN-filtering bridge br0 with both switch ports enslaved, allow
+# VLANs 85 and 65 on both ports and attach clsact qdiscs for the filters
+# installed by the tests.
+switch_create()
+{
+ ip link add dev br0 type bridge vlan_filtering 1 mcast_snooping 0
+
+ ip link set dev ${swp1} master br0
+ ip link set dev ${swp2} master br0
+
+ ip link set dev br0 up
+ ip link set dev ${swp1} up
+ ip link set dev ${swp2} up
+
+ bridge vlan add dev ${swp1} vid 85
+ bridge vlan add dev ${swp2} vid 65
+
+ bridge vlan add dev ${swp2} vid 85
+ bridge vlan add dev ${swp1} vid 65
+
+ tc qdisc add dev ${swp1} clsact
+ tc qdisc add dev ${swp2} clsact
+}
+
+# Reverse of switch_create: qdiscs, VLAN memberships, port state and
+# finally the bridge itself.
+switch_destroy()
+{
+ tc qdisc del dev ${swp2} clsact
+ tc qdisc del dev ${swp1} clsact
+
+ bridge vlan del vid 65 dev ${swp1}
+ bridge vlan del vid 85 dev ${swp2}
+
+ bridge vlan del vid 65 dev ${swp2}
+ bridge vlan del vid 85 dev ${swp1}
+
+ ip link set dev ${swp2} down
+ ip link set dev ${swp1} down
+
+ ip link del dev br0
+}
+
+# Map NETIFS[p1..p4] to role names and build the hosts and the switch.
+setup_prepare()
+{
+ h1=${NETIFS[p1]};   swp1=${NETIFS[p2]}
+ swp2=${NETIFS[p3]}; h2=${NETIFS[p4]}
+
+ vrf_prepare
+
+ h1_create; h2_create
+
+ switch_create
+}
+
+# Undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ switch_destroy
+
+ h2_destroy; h1_destroy
+
+ vrf_cleanup
+}
+
+vlan_modify_ingress()
+{
+ RET=0
+
+ ping_do $h1.85 192.0.2.18
+ check_fail $? "ping between two different vlans passed when should not"
+
+ ping6_do $h1.85 2001:db8:2::2
+ check_fail $? "ping6 between two different vlans passed when should not"
+
+ tc filter add dev $swp1 ingress protocol all pref 1 handle 1 \
+ flower action vlan modify id 65
+ tc filter add dev $swp2 ingress protocol all pref 1 handle 1 \
+ flower action vlan modify id 85
+
+ ping_do $h1.85 192.0.2.18
+ check_err $? "ping between two different vlans failed when should not"
+
+ ping6_do $h1.85 2001:db8:2::2
+ check_err $? "ping6 between two different vlans failed when should not"
+
+ log_test "VLAN modify at ingress"
+
+ tc filter del dev $swp2 ingress protocol all pref 1 handle 1 flower
+ tc filter del dev $swp1 ingress protocol all pref 1 handle 1 flower
+}
+
+vlan_modify_egress()
+{
+ RET=0
+
+ ping_do $h1.85 192.0.2.18
+ check_fail $? "ping between two different vlans passed when should not"
+
+ ping6_do $h1.85 2001:db8:2::2
+ check_fail $? "ping6 between two different vlans passed when should not"
+
+ tc filter add dev $swp1 egress protocol all pref 1 handle 1 \
+ flower action vlan modify id 85
+ tc filter add dev $swp2 egress protocol all pref 1 handle 1 \
+ flower action vlan modify id 65
+
+ ping_do $h1.85 192.0.2.18
+ check_err $? "ping between two different vlans failed when should not"
+
+ ping6_do $h1.85 2001:db8:2::2
+ check_err $? "ping6 between two different vlans failed when should not"
+
+ log_test "VLAN modify at egress"
+
+ tc filter del dev $swp2 egress protocol all pref 1 handle 1 flower
+ tc filter del dev $swp1 egress protocol all pref 1 handle 1 flower
+}
+
+# Tear the topology down on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit ${EXIT_STATUS}
diff --git a/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh b/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh
new file mode 100755
index 000000000..0727e2012
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_asymmetric.sh
@@ -0,0 +1,577 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +---------------------------+ +------------------------------+
+# | vrf-h1 | | vrf-h2 |
+# | + $h1 | | + $h2 |
+# | | 10.1.1.101/24 | | | 10.1.2.101/24 |
+# | | default via 10.1.1.1 | | | default via 10.1.2.1 |
+# +----|----------------------+ +----|-------------------------+
+# | |
+# +----|--------------------------------------------|-------------------------+
+# | SW | | |
+# | +--|--------------------------------------------|-----------------------+ |
+# | | + $swp1 br1 + $swp2 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | | + vx10 + vx20 | |
+# | | local 10.0.0.1 local 10.0.0.1 | |
+# | | remote 10.0.0.2 remote 10.0.0.2 | |
+# | | id 1000 id 2000 | |
+# | | dstport 4789 dstport 4789 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | +-----------------------------------+-----------------------------------+ |
+# | | |
+# | +-----------------------------------|-----------------------------------+ |
+# | | | | |
+# | | +--------------------------------+--------------------------------+ | |
+# | | | | | |
+# | | + vlan10 vlan20 + | |
+# | | | 10.1.1.11/24 10.1.2.11/24 | | |
+# | | | | | |
+# | | + vlan10-v (macvlan) vlan20-v (macvlan) + | |
+# | | 10.1.1.1/24 10.1.2.1/24 | |
+# | | 00:00:5e:00:01:01 00:00:5e:00:01:01 | |
+# | | vrf-green | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | + $rp1 +lo |
+# | | 192.0.2.1/24 10.0.0.1/32 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | vrf-spine |
+# | + $rp2 |
+# | 192.0.2.2/24 |
+# | | (maybe) HW
+# =============================================================================
+# | | (likely) SW
+# | |
+# | + v1 (veth) |
+# | | 192.0.3.2/24 |
+# +----|--------------------------------------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | + v2 (veth) +lo NS1 (netns) |
+# | 192.0.3.1/24 10.0.0.2/32 |
+# | |
+# | +-----------------------------------------------------------------------+ |
+# | | vrf-green | |
+# | | + vlan10-v (macvlan) vlan20-v (macvlan) + | |
+# | | | 10.1.1.1/24 10.1.2.1/24 | | |
+# | | | 00:00:5e:00:01:01 00:00:5e:00:01:01 | | |
+# | | | | | |
+# | | + vlan10 vlan20 + | |
+# | | | 10.1.1.12/24 10.1.2.12/24 | | |
+# | | | | | |
+# | | +--------------------------------+--------------------------------+ | |
+# | | | | |
+# | +-----------------------------------|-----------------------------------+ |
+# | | |
+# | +-----------------------------------+-----------------------------------+ |
+# | | | |
+# | | + vx10 + vx20 | |
+# | | local 10.0.0.2 local 10.0.0.2 | |
+# | | remote 10.0.0.1 remote 10.0.0.1 | |
+# | | id 1000 id 2000 | |
+# | | dstport 4789 dstport 4789 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | | + w1 (veth) + w3 (veth) | |
+# | | | vid 10 pvid untagged br1 | vid 20 pvid untagged | |
+# | +--|------------------------------------------|-------------------------+ |
+# | | | |
+# | | | |
+# | +--|----------------------+ +--|-------------------------+ |
+# | | | vrf-h1 | | | vrf-h2 | |
+# | | + w2 (veth) | | + w4 (veth) | |
+# | | 10.1.1.102/24 | | 10.1.2.102/24 | |
+# | | default via 10.1.1.1 | | default via 10.1.2.1 | |
+# | +-------------------------+ +----------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+ arp_decap
+ arp_suppression
+"
+NUM_NETIFS=6
+source lib.sh
+
+require_command $ARPING
+
+hx_create()
+{
+ local vrf_name=$1; shift
+ local if_name=$1; shift
+ local ip_addr=$1; shift
+ local gw_ip=$1; shift
+
+ vrf_create $vrf_name
+ ip link set dev $if_name master $vrf_name
+ ip link set dev $vrf_name up
+ ip link set dev $if_name up
+
+ ip address add $ip_addr/24 dev $if_name
+ ip neigh replace $gw_ip lladdr 00:00:5e:00:01:01 nud permanent \
+ dev $if_name
+ ip route add default vrf $vrf_name nexthop via $gw_ip
+}
+export -f hx_create
+
+hx_destroy()
+{
+ local vrf_name=$1; shift
+ local if_name=$1; shift
+ local ip_addr=$1; shift
+ local gw_ip=$1; shift
+
+ ip route del default vrf $vrf_name nexthop via $gw_ip
+ ip neigh del $gw_ip dev $if_name
+ ip address del $ip_addr/24 dev $if_name
+
+ ip link set dev $if_name down
+ vrf_destroy $vrf_name
+}
+
+# Local host H1: $h1 in vrf-h1, 10.1.1.101 with gateway 10.1.1.1.
+h1_create()
+{
+ hx_create "vrf-h1" ${h1} 10.1.1.101 10.1.1.1
+}
+
+# Reverse of h1_create.
+h1_destroy()
+{
+ hx_destroy "vrf-h1" ${h1} 10.1.1.101 10.1.1.1
+}
+
+# Local host H2: $h2 in vrf-h2, 10.1.2.101 with gateway 10.1.2.1.
+h2_create()
+{
+ hx_create "vrf-h2" ${h2} 10.1.2.101 10.1.2.1
+}
+
+# Reverse of h2_create.
+h2_destroy()
+{
+ hx_destroy "vrf-h2" ${h2} 10.1.2.101 10.1.2.1
+}
+
+# Build the local switch: bridge br1 with two VxLAN devices and the two
+# host-facing ports, the underlay on $rp1, and the SVIs in vrf-green.
+switch_create()
+{
+ # VLAN-filtering bridge with no default PVID and multicast snooping off.
+ ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+ mcast_snooping 0
+ # Make sure the bridge uses the MAC address of the local port and not
+ # that of the VxLAN's device.
+ ip link set dev br1 address $(mac_get $swp1)
+ ip link set dev br1 up
+
+ # Underlay: reach the remote VxLAN endpoint 10.0.0.2 through $rp1.
+ ip link set dev $rp1 up
+ ip address add dev $rp1 192.0.2.1/24
+ ip route add 10.0.0.2/32 nexthop via 192.0.2.2
+
+ # vx10: VNI 1000, bridged as VLAN 10.
+ ip link add name vx10 type vxlan id 1000 \
+ local 10.0.0.1 remote 10.0.0.2 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx10 up
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ # vx20: VNI 2000, bridged as VLAN 20.
+ ip link add name vx20 type vxlan id 2000 \
+ local 10.0.0.1 remote 10.0.0.2 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx20 up
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ # Host-facing ports: $swp1 in VLAN 10, $swp2 in VLAN 20.
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ bridge vlan add vid 10 dev $swp1 pvid untagged
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+ bridge vlan add vid 20 dev $swp2 pvid untagged
+
+ # Local VxLAN endpoint address.
+ ip address add 10.0.0.1/32 dev lo
+
+ # Create SVIs
+ vrf_create "vrf-green"
+ ip link set dev vrf-green up
+
+ # Each SVI gets a macvlan upper carrying the gateway address and the MAC
+ # 00:00:5e:00:01:01; ns_switch_create configures the same MAC and
+ # gateway addresses on the remote side.
+ ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+ ip address add 10.1.1.11/24 dev vlan10
+ ip link add link vlan10 name vlan10-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.1.1/24 dev vlan10-v
+
+ ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+ ip address add 10.1.2.11/24 dev vlan20
+ ip link add link vlan20 name vlan20-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.2.1/24 dev vlan20-v
+
+ # Make the bridge device itself a member of both VLANs.
+ bridge vlan add vid 10 dev br1 self
+ bridge vlan add vid 20 dev br1 self
+
+ # Local FDB entries for the gateway MAC in both VLANs.
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+ # Disable reverse-path filtering globally and on both macvlans.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0
+}
+
+# Tear down what switch_create built, in roughly reverse order.
+switch_destroy()
+{
+ # Only the global rp_filter value is restored here; the per-macvlan
+ # sysctls set by switch_create are not.
+ sysctl_restore net.ipv4.conf.all.rp_filter
+
+ bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20
+ bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10
+
+ bridge vlan del vid 20 dev br1 self
+ bridge vlan del vid 10 dev br1 self
+
+ # NOTE(review): vlan10-v / vlan20-v are not deleted explicitly;
+ # presumably they go away together with their lower devices - confirm.
+ ip link del dev vlan20
+
+ ip link del dev vlan10
+
+ vrf_destroy "vrf-green"
+
+ ip address del 10.0.0.1/32 dev lo
+
+ # Host-facing ports.
+ bridge vlan del vid 20 dev $swp2
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+
+ bridge vlan del vid 10 dev $swp1
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ # VxLAN devices.
+ bridge vlan del vid 20 dev vx20
+ ip link set dev vx20 nomaster
+
+ ip link set dev vx20 down
+ ip link del dev vx20
+
+ bridge vlan del vid 10 dev vx10
+ ip link set dev vx10 nomaster
+
+ ip link set dev vx10 down
+ ip link del dev vx10
+
+ # Underlay on $rp1.
+ ip route del 10.0.0.2/32 nexthop via 192.0.2.2
+ ip address del dev $rp1 192.0.2.1/24
+ ip link set dev $rp1 down
+
+ ip link set dev br1 down
+ ip link del dev br1
+}
+
+# Create the spine: VRF vrf-spine containing $rp2 and veth end v1, with
+# host routes to both VxLAN endpoint addresses.
+# v1 must already exist (setup_prepare creates the v1/v2 veth pair first).
+spine_create()
+{
+ vrf_create "vrf-spine"
+ ip link set dev $rp2 master vrf-spine
+ ip link set dev v1 master vrf-spine
+ ip link set dev vrf-spine up
+ ip link set dev $rp2 up
+ ip link set dev v1 up
+
+ ip address add 192.0.2.2/24 dev $rp2
+ ip address add 192.0.3.2/24 dev v1
+
+ # 10.0.0.1 is reached via 192.0.2.1, 10.0.0.2 via 192.0.3.1.
+ ip route add 10.0.0.1/32 vrf vrf-spine nexthop via 192.0.2.1
+ ip route add 10.0.0.2/32 vrf vrf-spine nexthop via 192.0.3.1
+}
+
+# Reverse of spine_create: routes, addresses, link state, then the VRF.
+spine_destroy()
+{
+ ip route del 10.0.0.2/32 vrf vrf-spine nexthop via 192.0.3.1
+ ip route del 10.0.0.1/32 vrf vrf-spine nexthop via 192.0.2.1
+
+ ip address del 192.0.3.2/24 dev v1
+ ip address del 192.0.2.2/24 dev $rp2
+
+ ip link set dev v1 down
+ ip link set dev $rp2 down
+ vrf_destroy "vrf-spine"
+}
+
+# Remote host H1: w2 in vrf-h1, 10.1.1.102 with gateway 10.1.1.1.
+# Called from ns_init.
+ns_h1_create()
+{
+ hx_create "vrf-h1" w2 10.1.1.102 10.1.1.1
+}
+export -f ns_h1_create
+
+# Remote host H2: w4 in vrf-h2, 10.1.2.102 with gateway 10.1.2.1.
+# Called from ns_init.
+ns_h2_create()
+{
+ hx_create "vrf-h2" w4 10.1.2.102 10.1.2.1
+}
+export -f ns_h2_create
+
+# Build the remote switch: counterpart of switch_create with the local and
+# remote VxLAN endpoint addresses swapped, veth ends w1/w3 as host-facing
+# ports and v2 as the underlay interface. Called from ns_init.
+ns_switch_create()
+{
+ # VLAN-filtering bridge with no default PVID and multicast snooping off.
+ ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+ mcast_snooping 0
+ ip link set dev br1 up
+
+ # Underlay: reach the other VxLAN endpoint 10.0.0.1 through v2.
+ ip link set dev v2 up
+ ip address add dev v2 192.0.3.1/24
+ ip route add 10.0.0.1/32 nexthop via 192.0.3.2
+
+ # vx10: VNI 1000, bridged as VLAN 10.
+ ip link add name vx10 type vxlan id 1000 \
+ local 10.0.0.2 remote 10.0.0.1 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx10 up
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ # vx20: VNI 2000, bridged as VLAN 20.
+ ip link add name vx20 type vxlan id 2000 \
+ local 10.0.0.2 remote 10.0.0.1 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx20 up
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ # Host-facing ports: w1 in VLAN 10, w3 in VLAN 20.
+ ip link set dev w1 master br1
+ ip link set dev w1 up
+ bridge vlan add vid 10 dev w1 pvid untagged
+
+ ip link set dev w3 master br1
+ ip link set dev w3 up
+ bridge vlan add vid 20 dev w3 pvid untagged
+
+ # VxLAN endpoint address of this side.
+ ip address add 10.0.0.2/32 dev lo
+
+ # Create SVIs
+ vrf_create "vrf-green"
+ ip link set dev vrf-green up
+
+ # Each SVI gets a macvlan upper carrying the gateway address and the MAC
+ # 00:00:5e:00:01:01, the same values switch_create uses.
+ ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+ ip address add 10.1.1.12/24 dev vlan10
+ ip link add link vlan10 name vlan10-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.1.1/24 dev vlan10-v
+
+ ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+ ip address add 10.1.2.12/24 dev vlan20
+ ip link add link vlan20 name vlan20-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.2.1/24 dev vlan20-v
+
+ # Make the bridge device itself a member of both VLANs.
+ bridge vlan add vid 10 dev br1 self
+ bridge vlan add vid 20 dev br1 self
+
+ # Local FDB entries for the gateway MAC in both VLANs.
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+ # Disable reverse-path filtering globally and on both macvlans.
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0
+}
+export -f ns_switch_create
+
+# Build the remote side: two veth pairs (w1/w2, w3/w4), loopback up, the
+# two hosts and the switch. ns1_create runs this through "in_ns ns1".
+# NOTE(review): the function and the helpers it calls are exported with
+# "export -f", presumably because in_ns runs them in a child shell -
+# confirm against lib.sh.
+ns_init()
+{
+ ip link add name w1 type veth peer name w2
+ ip link add name w3 type veth peer name w4
+
+ ip link set dev lo up
+
+ ns_h1_create
+ ns_h2_create
+ ns_switch_create
+}
+export -f ns_init
+
+# Create namespace ns1, move veth end v2 into it and run ns_init there.
+ns1_create()
+{
+ ip netns add ns1
+ ip link set dev v2 netns ns1
+ in_ns ns1 ns_init
+}
+
+# Move v2 out of ns1 (to the namespace of PID 1) and delete ns1.
+ns1_destroy()
+{
+ ip netns exec ns1 ip link set dev v2 netns 1
+ ip netns del ns1
+}
+
+# Install static entries describing two hosts behind the other VxLAN
+# endpoint: FDB entries on vx10/vx20 pointing at that endpoint, and
+# neighbour entries on the vlan10/vlan20 SVIs.
+#
+# Arguments:
+#   $1 - MAC of the host in VLAN 10
+#   $2 - MAC of the host in VLAN 20
+#   $3 - IPv4 address of the host in VLAN 10
+#   $4 - IPv4 address of the host in VLAN 20
+#   $5 - address of the VxLAN endpoint the hosts sit behind
+macs_populate()
+{
+ local mac1=$1
+ local mac2=$2
+ local ip1=$3
+ local ip2=$4
+ local dst=$5
+ shift 5
+
+ bridge fdb add ${mac1} dev vx10 self master extern_learn static \
+ dst ${dst} vlan 10
+ bridge fdb add ${mac2} dev vx20 self master extern_learn static \
+ dst ${dst} vlan 20
+
+ ip neigh add ${ip1} lladdr ${mac1} nud noarp dev vlan10 extern_learn
+ ip neigh add ${ip2} lladdr ${mac2} nud noarp dev vlan20 extern_learn
+}
+export -f macs_populate
+
+# Populate each side with the MAC and IP addresses of the hosts on the
+# other side: the default namespace learns about w2/w4 behind 10.0.0.2,
+# ns1 learns about $h1/$h2 behind 10.0.0.1.
+macs_initialize()
+{
+ local h1_ns_mac=$(in_ns ns1 mac_get w2)
+ local h2_ns_mac=$(in_ns ns1 mac_get w4)
+ local h1_mac=$(mac_get ${h1})
+ local h2_mac=$(mac_get ${h2})
+
+ macs_populate ${h1_ns_mac} ${h2_ns_mac} \
+ 10.1.1.102 10.1.2.102 10.0.0.2
+ in_ns ns1 macs_populate ${h1_mac} ${h2_mac} \
+ 10.1.1.101 10.1.2.101 10.0.0.1
+}
+
+# Map NETIFS[p1..p6] to role names and build the whole topology: local
+# hosts and switch, the v1/v2 veth pair, the spine, namespace ns1, and
+# finally the static FDB/neighbour entries.
+setup_prepare()
+{
+ h1=${NETIFS[p1]};   swp1=${NETIFS[p2]}
+ swp2=${NETIFS[p3]}; h2=${NETIFS[p4]}
+ rp1=${NETIFS[p5]};  rp2=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create; h2_create
+ switch_create
+
+ # v1 is used by spine_create, v2 is moved into ns1 by ns1_create.
+ ip link add name v1 type veth peer name v2
+ spine_create
+ ns1_create
+
+ macs_initialize
+}
+
+# Undo setup_prepare in reverse order.
+cleanup()
+{
+ pre_cleanup
+
+ ns1_destroy
+ spine_destroy
+ ip link del dev v1
+
+ switch_destroy
+ h2_destroy; h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+# Ping from the local hosts to the other local host and to both remote
+# hosts, within the same VLAN and across VLANs.
+ping_ipv4()
+{
+ ping_test ${h1} 10.1.2.101 ": local->local vid 10->vid 20"
+ ping_test ${h1} 10.1.1.102 ": local->remote vid 10->vid 10"
+ ping_test ${h2} 10.1.2.102 ": local->remote vid 20->vid 20"
+ ping_test ${h1} 10.1.2.102 ": local->remote vid 10->vid 20"
+ ping_test ${h2} 10.1.1.102 ": local->remote vid 20->vid 10"
+}
+
+# Re-run ping_ipv4 after removing the static neighbour entries for the
+# remote hosts from the local SVIs, then put the entries back.
+arp_decap()
+{
+ # Repeat the ping tests, but without populating the neighbours. This
+ # makes sure we correctly decapsulate ARP packets
+ log_info "deleting neighbours from vlan interfaces"
+
+ ip neigh del 10.1.1.102 dev vlan10
+ ip neigh del 10.1.2.102 dev vlan20
+
+ ping_ipv4
+
+ # Restore the entries with the current MACs of w2 and w4 in ns1.
+ ip neigh replace 10.1.1.102 lladdr $(in_ns ns1 mac_get w2) nud noarp \
+ dev vlan10 extern_learn
+ ip neigh replace 10.1.2.102 lladdr $(in_ns ns1 mac_get w4) nud noarp \
+ dev vlan20 extern_learn
+}
+
+# Compare the counter of the pref 1 ingress rule on vx10 inside ns1 with
+# the expected value and record an error on mismatch.
+#
+# Arguments:
+#   $1 - expected counter value
+arp_suppression_compare()
+{
+ local expect=$1
+ shift
+ local actual=$(in_ns ns1 tc_rule_stats_get vx10 1 ingress)
+
+ (( expect == actual ))
+ check_err $? "expected $expect arps got $actual"
+}
+
+# Check neighbour suppression on vx10. A filter on ingress of vx10 in ns1
+# matches broadcast ARP requests for 10.1.1.102; its counter is compared
+# against the expected cumulative total after each of four arping runs.
+arp_suppression()
+{
+ ip link set dev vx10 type bridge_slave neigh_suppress on
+
+ # Counting rule on the remote side; "action pass" lets the packet continue.
+ in_ns ns1 tc qdisc add dev vx10 clsact
+ in_ns ns1 tc filter add dev vx10 ingress proto arp pref 1 handle 101 \
+ flower dst_mac ff:ff:ff:ff:ff:ff arp_tip 10.1.1.102 arp_op \
+ request action pass
+
+ # The neighbour is configured on the SVI and ARP suppression is on, so
+ # the ARP request should be suppressed
+ RET=0
+
+ $ARPING -I $h1 -fqb -c 1 -w 1 10.1.1.102
+ check_err $? "arping failed"
+
+ arp_suppression_compare 0
+
+ log_test "neigh_suppress: on / neigh exists: yes"
+
+ # Delete the neighbour from the SVI. A single ARP request should be
+ # received by the remote VTEP
+ RET=0
+
+ ip neigh del 10.1.1.102 dev vlan10
+
+ $ARPING -I $h1 -fqb -c 1 -w 1 10.1.1.102
+ check_err $? "arping failed"
+
+ arp_suppression_compare 1
+
+ log_test "neigh_suppress: on / neigh exists: no"
+
+ # Turn off ARP suppression and make sure ARP is not suppressed,
+ # regardless of neighbour existence on the SVI
+ RET=0
+
+ # Output and errors of this delete are discarded.
+ ip neigh del 10.1.1.102 dev vlan10 &> /dev/null
+ ip link set dev vx10 type bridge_slave neigh_suppress off
+
+ $ARPING -I $h1 -fqb -c 1 -w 1 10.1.1.102
+ check_err $? "arping failed"
+
+ arp_suppression_compare 2
+
+ log_test "neigh_suppress: off / neigh exists: no"
+
+ RET=0
+
+ # Re-add the neighbour; with suppression off the request is still
+ # expected to be counted on the remote side.
+ ip neigh add 10.1.1.102 lladdr $(in_ns ns1 mac_get w2) nud noarp \
+ dev vlan10 extern_learn
+
+ $ARPING -I $h1 -fqb -c 1 -w 1 10.1.1.102
+ check_err $? "arping failed"
+
+ arp_suppression_compare 3
+
+ log_test "neigh_suppress: off / neigh exists: yes"
+
+ in_ns ns1 tc qdisc del dev vx10 clsact
+}
+
+# Tear the topology down on every exit path.
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit ${EXIT_STATUS}
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
new file mode 100755
index 000000000..0ccb1dda0
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
@@ -0,0 +1,786 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +--------------------+ +----------------------+
+# | H1 (vrf) | | H2 (vrf) |
+# | + $h1 | | + $h2 |
+# | | 192.0.2.1/28 | | | 192.0.2.2/28 |
+# +----|---------------+ +--|-------------------+
+# | |
+# +----|--------------------------------------------------|-------------------+
+# | SW | | |
+# | +--|--------------------------------------------------|-----------------+ |
+# | | + $swp1 BR1 (802.1d) + $swp2 | |
+# | | | |
+# | | + vx1 (vxlan) | |
+# | | local 192.0.2.17 | |
+# | | remote 192.0.2.34 192.0.2.50 | |
+# | | id 1000 dstport $VXPORT | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | 192.0.2.32/28 via 192.0.2.18 |
+# | 192.0.2.48/28 via 192.0.2.18 |
+# | |
+# | + $rp1 |
+# | | 192.0.2.17/28 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | VRP2 (vrf) |
+# | + $rp2 |
+# | 192.0.2.18/28 |
+# | | (maybe) HW
+# =============================================================================
+# | | (likely) SW
+# | + v1 (veth) + v3 (veth) |
+# | | 192.0.2.33/28 | 192.0.2.49/28 |
+# +----|---------------------------------------|----------------+
+# | |
+# +----|------------------------------+ +----|------------------------------+
+# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) |
+# | 192.0.2.34/28 | | 192.0.2.50/28 |
+# | | | |
+# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 |
+# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 |
+# | | | |
+# | +-------------------------------+ | | +-------------------------------+ |
+# | | BR2 (802.1d) | | | | BR2 (802.1d) | |
+# | | + vx2 (vxlan) | | | | + vx2 (vxlan) | |
+# | | local 192.0.2.34 | | | | local 192.0.2.50 | |
+# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | |
+# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | |
+# | | id 1000 dstport $VXPORT | | | | id 1000 dstport $VXPORT | |
+# | | | | | | | |
+# | | + w1 (veth) | | | | + w1 (veth) | |
+# | +--|----------------------------+ | | +--|----------------------------+ |
+# | | | | | |
+# | +--|----------------------------+ | | +--|----------------------------+ |
+# | | | VW2 (vrf) | | | | | VW2 (vrf) | |
+# | | + w2 (veth) | | | | + w2 (veth) | |
+# | | 192.0.2.3/28 | | | | 192.0.2.4/28 | |
+# | +-------------------------------+ | | +-------------------------------+ |
+# +-----------------------------------+ +-----------------------------------+
+
+: ${VXPORT:=4789}
+export VXPORT
+
+: ${ALL_TESTS:="
+ ping_ipv4
+ test_flood
+ test_unicast
+ test_ttl
+ test_tos
+ test_ecn_encap
+ test_ecn_decap
+ reapply_config
+ ping_ipv4
+ test_flood
+ test_unicast
+ test_learning
+ "}
+
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+ tc qdisc add dev $h1 clsact
+}
+
+h1_destroy()
+{
+ tc qdisc del dev $h1 clsact
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 192.0.2.2/28
+ tc qdisc add dev $h2 clsact
+}
+
+h2_destroy()
+{
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2 192.0.2.2/28
+}
+
+rp1_set_addr()
+{
+ ip address add dev $rp1 192.0.2.17/28
+
+ ip route add 192.0.2.32/28 nexthop via 192.0.2.18
+ ip route add 192.0.2.48/28 nexthop via 192.0.2.18
+}
+
+rp1_unset_addr()
+{
+ ip route del 192.0.2.48/28 nexthop via 192.0.2.18
+ ip route del 192.0.2.32/28 nexthop via 192.0.2.18
+
+ ip address del dev $rp1 192.0.2.17/28
+}
+
+switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 0 mcast_snooping 0
+ # Make sure the bridge uses the MAC address of the local port and not
+ # that of the VxLAN's device.
+ ip link set dev br1 address $(mac_get $swp1)
+ ip link set dev br1 up
+
+ ip link set dev $rp1 up
+ rp1_set_addr
+
+ ip link add name vx1 type vxlan id 1000 \
+ local 192.0.2.17 dstport "$VXPORT" \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx1 up
+
+ ip link set dev vx1 master br1
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+
+ bridge fdb append dev vx1 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx1 00:00:00:00:00:00 dst 192.0.2.50 self
+}
+
+switch_destroy()
+{
+ rp1_unset_addr
+ ip link set dev $rp1 down
+
+ bridge fdb del dev vx1 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx1 00:00:00:00:00:00 dst 192.0.2.34 self
+
+ ip link set dev vx1 nomaster
+ ip link set dev vx1 down
+ ip link del dev vx1
+
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ ip link set dev br1 down
+ ip link del dev br1
+}
+
+vrp2_create()
+{
+ simple_if_init $rp2 192.0.2.18/28
+ __simple_if_init v1 v$rp2 192.0.2.33/28
+ __simple_if_init v3 v$rp2 192.0.2.49/28
+ tc qdisc add dev v1 clsact
+}
+
+vrp2_destroy()
+{
+ tc qdisc del dev v1 clsact
+ __simple_if_fini v3 192.0.2.49/28
+ __simple_if_fini v1 192.0.2.33/28
+ simple_if_fini $rp2 192.0.2.18/28
+}
+
+ns_init_common()
+{
+ local in_if=$1; shift
+ local in_addr=$1; shift
+ local other_in_addr=$1; shift
+ local nh_addr=$1; shift
+ local host_addr=$1; shift
+
+ ip link set dev $in_if up
+ ip address add dev $in_if $in_addr/28
+ tc qdisc add dev $in_if clsact
+
+ ip link add name br2 type bridge vlan_filtering 0
+ ip link set dev br2 up
+
+ ip link add name w1 type veth peer name w2
+
+ ip link set dev w1 master br2
+ ip link set dev w1 up
+
+ ip link add name vx2 type vxlan id 1000 local $in_addr dstport "$VXPORT"
+ ip link set dev vx2 up
+ bridge fdb append dev vx2 00:00:00:00:00:00 dst 192.0.2.17 self
+ bridge fdb append dev vx2 00:00:00:00:00:00 dst $other_in_addr self
+
+ ip link set dev vx2 master br2
+ tc qdisc add dev vx2 clsact
+
+ simple_if_init w2 $host_addr/28
+
+ ip route add 192.0.2.16/28 nexthop via $nh_addr
+ ip route add $other_in_addr/32 nexthop via $nh_addr
+}
+export -f ns_init_common
+
+ns1_create()
+{
+ ip netns add ns1
+ ip link set dev v2 netns ns1
+ in_ns ns1 \
+ ns_init_common v2 192.0.2.34 192.0.2.50 192.0.2.33 192.0.2.3
+}
+
+ns1_destroy()
+{
+ ip netns exec ns1 ip link set dev v2 netns 1
+ ip netns del ns1
+}
+
+ns2_create()
+{
+ ip netns add ns2
+ ip link set dev v4 netns ns2
+ in_ns ns2 \
+ ns_init_common v4 192.0.2.50 192.0.2.34 192.0.2.49 192.0.2.4
+}
+
+ns2_destroy()
+{
+ ip netns exec ns2 ip link set dev v4 netns 1
+ ip netns del ns2
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp1=${NETIFS[p5]}
+ rp2=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ switch_create
+
+ ip link add name v1 type veth peer name v2
+ ip link add name v3 type veth peer name v4
+ vrp2_create
+ ns1_create
+ ns2_create
+
+ r1_mac=$(in_ns ns1 mac_get w2)
+ r2_mac=$(in_ns ns2 mac_get w2)
+ h2_mac=$(mac_get $h2)
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ns2_destroy
+ ns1_destroy
+ vrp2_destroy
+ ip link del dev v3
+ ip link del dev v1
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+# For the first round of tests, vx1 is the first device to get attached to the
+# bridge, at a point when the local IP is already configured. Now try the
+# other scenario: attach the device to an already-offloaded bridge, and only
+# then assign the local IP.
+reapply_config()
+{
+ echo "Reapplying configuration"
+
+ bridge fdb del dev vx1 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx1 00:00:00:00:00:00 dst 192.0.2.34 self
+ rp1_unset_addr
+ ip link set dev vx1 nomaster
+ sleep 5
+
+ ip link set dev vx1 master br1
+ bridge fdb append dev vx1 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx1 00:00:00:00:00:00 dst 192.0.2.50 self
+ sleep 1
+ rp1_set_addr
+ sleep 5
+}
+
+ping_ipv4()
+{
+ ping_test $h1 192.0.2.2 ": local->local"
+ ping_test $h1 192.0.2.3 ": local->remote 1"
+ ping_test $h1 192.0.2.4 ": local->remote 2"
+}
+
+maybe_in_ns()
+{
+ echo ${1:+in_ns} $1
+}
+
+__flood_counter_add_del()
+{
+ local add_del=$1; shift
+ local dev=$1; shift
+ local ns=$1; shift
+
+ # Putting the ICMP capture both to HW and to SW will end up
+ # double-counting the packets that are trapped to slow path, such as for
+ # the unicast test. Adding either skip_hw or skip_sw fixes this problem,
+ # but with skip_hw, the flooded packets are not counted at all, because
+ # those are dropped due to MAC address mismatch; and skip_sw is a no-go
+ # for veth-based topologies.
+ #
+ # So try to install with skip_sw and fall back to skip_hw if that fails.
+
+ $(maybe_in_ns $ns) __icmp_capture_add_del \
+ $add_del 100 "" $dev skip_sw 2>/dev/null || \
+ $(maybe_in_ns $ns) __icmp_capture_add_del \
+ $add_del 100 "" $dev skip_hw
+}
+
+flood_counter_install()
+{
+ __flood_counter_add_del add "$@"
+}
+
+flood_counter_uninstall()
+{
+ __flood_counter_add_del del "$@"
+}
+
+flood_fetch_stat()
+{
+ local dev=$1; shift
+ local ns=$1; shift
+
+ $(maybe_in_ns $ns) tc_rule_stats_get $dev 100 ingress
+}
+
+# Print one counter value per argument, in argument order. Each argument is a
+# counter spec of the form "<dev> [<netns>]"; it is expanded unquoted on
+# purpose so that it splits into the arguments of flood_fetch_stat.
+flood_fetch_stats()
+{
+	local spec
+
+	for spec; do
+		flood_fetch_stat $spec
+	done
+}
+
+vxlan_flood_test()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local -a expects=("${@}")
+
+ local -a counters=($h2 "vx2 ns1" "vx2 ns2")
+ local counter
+ local key
+
+ for counter in "${counters[@]}"; do
+ flood_counter_install $counter
+ done
+
+ local -a t0s=($(flood_fetch_stats "${counters[@]}"))
+ $MZ $h1 -c 10 -d 100msec -p 64 -b $mac -B $dst -t icmp -q
+ sleep 1
+ local -a t1s=($(flood_fetch_stats "${counters[@]}"))
+
+ for key in ${!t0s[@]}; do
+ local delta=$((t1s[$key] - t0s[$key]))
+ local expect=${expects[$key]}
+
+ ((expect == delta))
+ check_err $? "${counters[$key]}: Expected to capture $expect packets, got $delta."
+ done
+
+ for counter in "${counters[@]}"; do
+ flood_counter_uninstall $counter
+ done
+}
+
+__test_flood()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ vxlan_flood_test $mac $dst 10 10 10
+
+ log_test "VXLAN: $what"
+}
+
+test_flood()
+{
+ __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood"
+}
+
+vxlan_fdb_add_del()
+{
+ local add_del=$1; shift
+ local mac=$1; shift
+ local dev=$1; shift
+ local dst=$1; shift
+
+ bridge fdb $add_del dev $dev $mac self static permanent \
+ ${dst:+dst} $dst 2>/dev/null
+ bridge fdb $add_del dev $dev $mac master static 2>/dev/null
+}
+
+__test_unicast()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local hit_idx=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ local -a expects=(0 0 0)
+ expects[$hit_idx]=10
+
+ vxlan_flood_test $mac $dst "${expects[@]}"
+
+ log_test "VXLAN: $what"
+}
+
+test_unicast()
+{
+ local -a targets=("$h2_mac $h2"
+ "$r1_mac vx1 192.0.2.34"
+ "$r2_mac vx1 192.0.2.50")
+ local target
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del add $target
+ done
+
+ __test_unicast $h2_mac 192.0.2.2 0 "local MAC unicast"
+ __test_unicast $r1_mac 192.0.2.3 1 "remote MAC 1 unicast"
+ __test_unicast $r2_mac 192.0.2.4 2 "remote MAC 2 unicast"
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del del $target
+ done
+}
+
+vxlan_ping_test()
+{
+ local ping_dev=$1; shift
+ local ping_dip=$1; shift
+ local ping_args=$1; shift
+ local capture_dev=$1; shift
+ local capture_dir=$1; shift
+ local capture_pref=$1; shift
+ local expect=$1; shift
+
+ local t0=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+ ping_do $ping_dev $ping_dip "$ping_args"
+ local t1=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+ local delta=$((t1 - t0))
+
+ # Tolerate a couple stray extra packets.
+ ((expect <= delta && delta <= expect + 2))
+ check_err $? "$capture_dev: Expected to capture $expect packets, got $delta."
+}
+
+test_ttl()
+{
+ RET=0
+
+ tc filter add dev v1 egress pref 77 prot ip \
+ flower ip_ttl 99 action pass
+ vxlan_ping_test $h1 192.0.2.3 "" v1 egress 77 10
+ tc filter del dev v1 egress pref 77 prot ip
+
+ log_test "VXLAN: envelope TTL"
+}
+
+test_tos()
+{
+ RET=0
+
+ tc filter add dev v1 egress pref 77 prot ip \
+ flower ip_tos 0x14 action pass
+ vxlan_ping_test $h1 192.0.2.3 "-Q 0x14" v1 egress 77 10
+ vxlan_ping_test $h1 192.0.2.3 "-Q 0x18" v1 egress 77 0
+ tc filter del dev v1 egress pref 77 prot ip
+
+ log_test "VXLAN: envelope TOS inheritance"
+}
+
+__test_ecn_encap()
+{
+ local q=$1; shift
+ local tos=$1; shift
+
+ RET=0
+
+ tc filter add dev v1 egress pref 77 prot ip \
+ flower ip_tos $tos action pass
+ sleep 1
+ vxlan_ping_test $h1 192.0.2.3 "-Q $q" v1 egress 77 10
+ tc filter del dev v1 egress pref 77 prot ip
+
+ log_test "VXLAN: ECN encap: $q->$tos"
+}
+
+# Run the ECN encapsulation sub-test once per table entry below.
+test_ecn_encap()
+{
+	local pair
+
+	# In accordance with INET_ECN_encapsulate(). Each entry holds the two
+	# arguments of __test_ecn_encap ("<ping -Q value> <ip_tos matched on
+	# v1 egress>") and is expanded unquoted so that it splits in two.
+	for pair in "0x00 0x00" "0x01 0x01" "0x02 0x02" "0x03 0x02"; do
+		__test_ecn_encap $pair
+	done
+}
+
+vxlan_encapped_ping_do()
+{
+ local count=$1; shift
+ local dev=$1; shift
+ local next_hop_mac=$1; shift
+ local dest_ip=$1; shift
+ local dest_mac=$1; shift
+ local inner_tos=$1; shift
+ local outer_tos=$1; shift
+
+ $MZ $dev -c $count -d 100msec -q \
+ -b $next_hop_mac -B $dest_ip \
+ -t udp tos=$outer_tos,sp=23456,dp=$VXPORT,p=$(:
+ )"08:"$( : VXLAN flags
+ )"00:00:00:"$( : VXLAN reserved
+ )"00:03:e8:"$( : VXLAN VNI
+ )"00:"$( : VXLAN reserved
+ )"$dest_mac:"$( : ETH daddr
+ )"$(mac_get w2):"$( : ETH saddr
+ )"08:00:"$( : ETH type
+ )"45:"$( : IP version + IHL
+ )"$inner_tos:"$( : IP TOS
+ )"00:54:"$( : IP total length
+ )"99:83:"$( : IP identification
+ )"40:00:"$( : IP flags + frag off
+ )"40:"$( : IP TTL
+ )"01:"$( : IP proto
+ )"00:00:"$( : IP header csum
+ )"c0:00:02:03:"$( : IP saddr: 192.0.2.3
+ )"c0:00:02:01:"$( : IP daddr: 192.0.2.1
+ )"08:"$( : ICMP type
+ )"00:"$( : ICMP code
+ )"8b:f2:"$( : ICMP csum
+ )"1f:6a:"$( : ICMP request identifier
+ )"00:01:"$( : ICMP request sequence number
+ )"4f:ff:c5:5b:00:00:00:00:"$( : ICMP payload
+ )"6d:74:0b:00:00:00:00:00:"$( :
+ )"10:11:12:13:14:15:16:17:"$( :
+ )"18:19:1a:1b:1c:1d:1e:1f:"$( :
+ )"20:21:22:23:24:25:26:27:"$( :
+ )"28:29:2a:2b:2c:2d:2e:2f:"$( :
+ )"30:31:32:33:34:35:36:37"
+}
+export -f vxlan_encapped_ping_do
+
+# From within ns1, have vxlan_encapped_ping_do send 10 pre-encapsulated
+# packets out of device $1 towards IP $3, using the MAC of device $2 as next
+# hop, with inner TOS $4 and outer TOS $5. $6 is a command line that prints a
+# counter; it is sampled before and after, and the difference must lie
+# between $7 and $7 + 2, otherwise a failure is recorded via check_err.
+vxlan_encapped_ping_test()
+{
+	local ping_dev=$1 nh_dev=$2 ping_dip=$3
+	local inner_tos=$4 outer_tos=$5
+	local stat_get=$6 expect=$7
+	local before after delta
+
+	# $stat_get holds a whole command line and is split on purpose.
+	before=$($stat_get)
+
+	in_ns ns1 \
+		vxlan_encapped_ping_do 10 $ping_dev $(mac_get $nh_dev) \
+			$ping_dip $(mac_get $h1) \
+			$inner_tos $outer_tos
+
+	after=$($stat_get)
+	delta=$((after - before))
+
+	# Tolerate a couple stray extra packets.
+	((delta >= expect && delta - expect <= 2))
+	check_err $? "Expected to capture $expect packets, got $delta."
+}
+export -f vxlan_encapped_ping_test
+
+__test_ecn_decap()
+{
+ local orig_inner_tos=$1; shift
+ local orig_outer_tos=$1; shift
+ local decapped_tos=$1; shift
+
+ RET=0
+
+ tc filter add dev $h1 ingress pref 77 prot ip \
+ flower ip_tos $decapped_tos action drop
+ sleep 1
+ vxlan_encapped_ping_test v2 v1 192.0.2.17 \
+ $orig_inner_tos $orig_outer_tos \
+ "tc_rule_stats_get $h1 77 ingress" 10
+ tc filter del dev $h1 ingress pref 77
+
+ log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->$decapped_tos"
+}
+
+test_ecn_decap_error()
+{
+ local orig_inner_tos=00
+ local orig_outer_tos=03
+
+ RET=0
+
+ vxlan_encapped_ping_test v2 v1 192.0.2.17 \
+ $orig_inner_tos $orig_outer_tos \
+ "link_stats_rx_errors_get vx1" 10
+
+ log_test "VXLAN: ECN decap: $orig_outer_tos/$orig_inner_tos->error"
+}
+
+# Run the ECN decapsulation sub-test once per table entry below, followed by
+# the error-counter case.
+test_ecn_decap()
+{
+	local triplet
+
+	# In accordance with INET_ECN_decapsulate(). Each entry holds the
+	# three arguments of __test_ecn_decap ("<inner TOS> <outer TOS>
+	# <decapsulated TOS>") and is expanded unquoted so that it splits.
+	for triplet in "00 00 0x00" "01 01 0x01" "02 01 0x01" \
+		       "01 03 0x03" "02 03 0x03"; do
+		__test_ecn_decap $triplet
+	done
+	test_ecn_decap_error
+}
+
+# Exercise FDB learning on vx1 as a sequence of sub-tests, each of which
+# resets RET and reports through log_test: flooding before anything is
+# learned, appearance of a learned entry, forwarding through it, manual
+# deletion, ageing, and toggling of learning on the bridge port. Learning is
+# switched on at the start and the settings are put back at the end; both
+# transitions go through reapply_config.
+#
+# Every FDB check passes a diagnostic to check_err / check_fail so that a
+# failing sub-test says which lookup went wrong.
+test_learning()
+{
+	local mac=de:ad:be:ef:13:37
+	local dst=192.0.2.100
+
+	# Enable learning on the VxLAN device and set ageing time to 10 seconds
+	ip link set dev br1 type bridge ageing_time 1000
+	ip link set dev vx1 type vxlan ageing 10
+	ip link set dev vx1 type vxlan learning
+	reapply_config
+
+	# Check that flooding works
+	RET=0
+
+	vxlan_flood_test $mac $dst 10 10 10
+
+	log_test "VXLAN: flood before learning"
+
+	# Send a packet with source mac set to $mac from host w2 and check that
+	# a corresponding entry is created in VxLAN device vx1
+	RET=0
+
+	in_ns ns1 $MZ w2 -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff -B $dst \
+		-t icmp -q
+	sleep 1
+
+	bridge fdb show brport vx1 | grep $mac | grep -q self
+	check_err $? "No \"self\" FDB entry for $mac on vx1"
+	bridge fdb show brport vx1 | grep $mac | grep -q -v self
+	check_err $? "No non-\"self\" FDB entry for $mac on vx1"
+
+	log_test "VXLAN: show learned FDB entry"
+
+	# Repeat first test and check that packets only reach host w2 in ns1
+	RET=0
+
+	vxlan_flood_test $mac $dst 0 10 0
+
+	log_test "VXLAN: learned FDB entry"
+
+	# Delete the learned FDB entry from the VxLAN and bridge devices and
+	# check that packets are flooded
+	RET=0
+
+	bridge fdb del dev vx1 $mac master self
+	sleep 1
+
+	vxlan_flood_test $mac $dst 10 10 10
+
+	log_test "VXLAN: deletion of learned FDB entry"
+
+	# Re-learn the first FDB entry and check that it is correctly aged-out
+	RET=0
+
+	in_ns ns1 $MZ w2 -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff -B $dst \
+		-t icmp -q
+	sleep 1
+
+	bridge fdb show brport vx1 | grep $mac | grep -q self
+	check_err $? "No \"self\" FDB entry for $mac on vx1 after re-learning"
+	bridge fdb show brport vx1 | grep $mac | grep -q -v self
+	check_err $? "No non-\"self\" FDB entry for $mac on vx1 after re-learning"
+
+	vxlan_flood_test $mac $dst 0 10 0
+
+	sleep 20
+
+	bridge fdb show brport vx1 | grep $mac | grep -q self
+	check_fail $? "\"self\" FDB entry for $mac on vx1 was not aged out"
+	bridge fdb show brport vx1 | grep $mac | grep -q -v self
+	check_fail $? "Non-\"self\" FDB entry for $mac on vx1 was not aged out"
+
+	vxlan_flood_test $mac $dst 10 10 10
+
+	log_test "VXLAN: Ageing of learned FDB entry"
+
+	# Toggle learning on the bridge port and check that the bridge's FDB
+	# is populated only when it should
+	RET=0
+
+	ip link set dev vx1 type bridge_slave learning off
+
+	in_ns ns1 $MZ w2 -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff -B $dst \
+		-t icmp -q
+	sleep 1
+
+	bridge fdb show brport vx1 | grep $mac | grep -q -v self
+	check_fail $? "$mac learned on vx1 with bridge port learning off"
+
+	ip link set dev vx1 type bridge_slave learning on
+
+	in_ns ns1 $MZ w2 -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff -B $dst \
+		-t icmp -q
+	sleep 1
+
+	bridge fdb show brport vx1 | grep $mac | grep -q -v self
+	check_err $? "$mac not learned on vx1 with bridge port learning on"
+
+	log_test "VXLAN: learning toggling on bridge port"
+
+	# Restore previous settings
+	ip link set dev vx1 type vxlan nolearning
+	ip link set dev vx1 type vxlan ageing 300
+	ip link set dev br1 type bridge ageing_time 30000
+	reapply_config
+}
+
+test_all()
+{
+ echo "Running tests with UDP port $VXPORT"
+ tests_run
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+test_all
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472.sh
new file mode 100755
index 000000000..3bf3da691
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1d_port_8472.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN tests with an unusual port number.
+
+VXPORT=8472
+ALL_TESTS="
+ ping_ipv4
+"
+source vxlan_bridge_1d.sh
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
new file mode 100755
index 000000000..a5789721b
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
@@ -0,0 +1,860 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +-----------------------+ +------------------------+
+# | H1 (vrf) | | H2 (vrf) |
+# | + $h1.10 | | + $h2.10 |
+# | | 192.0.2.1/28 | | | 192.0.2.2/28 |
+# | | | | | |
+# | | + $h1.20 | | | + $h2.20 |
+# | \ | 198.51.100.1/24 | | \ | 198.51.100.2/24 |
+# | \| | | \| |
+# | + $h1 | | + $h2 |
+# +----|------------------+ +----|-------------------+
+# | |
+# +----|--------------------------------------------------|-------------------+
+# | SW | | |
+# | +--|--------------------------------------------------|-----------------+ |
+# | | + $swp1 BR1 (802.1q) + $swp2 | |
+# | | vid 10 vid 10 | |
+# | | vid 20 vid 20 | |
+# | | | |
+# | | + vx10 (vxlan) + vx20 (vxlan) | |
+# | | local 192.0.2.17 local 192.0.2.17 | |
+# | | remote 192.0.2.34 192.0.2.50 remote 192.0.2.34 192.0.2.50 | |
+# | | id 1000 dstport $VXPORT id 2000 dstport $VXPORT | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | 192.0.2.32/28 via 192.0.2.18 |
+# | 192.0.2.48/28 via 192.0.2.18 |
+# | |
+# | + $rp1 |
+# | | 192.0.2.17/28 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | VRP2 (vrf) |
+# | + $rp2 |
+# | 192.0.2.18/28 |
+# | | (maybe) HW
+# =============================================================================
+# | | (likely) SW
+# | + v1 (veth) + v3 (veth) |
+# | | 192.0.2.33/28 | 192.0.2.49/28 |
+# +----|---------------------------------------|----------------+
+# | |
+# +----|------------------------------+ +----|------------------------------+
+# | + v2 (veth) NS1 (netns) | | + v4 (veth) NS2 (netns) |
+# | 192.0.2.34/28 | | 192.0.2.50/28 |
+# | | | |
+# | 192.0.2.16/28 via 192.0.2.33 | | 192.0.2.16/28 via 192.0.2.49 |
+# | 192.0.2.50/32 via 192.0.2.33 | | 192.0.2.34/32 via 192.0.2.49 |
+# | | | |
+# | +-------------------------------+ | | +-------------------------------+ |
+# | | BR2 (802.1q) | | | | BR2 (802.1q) | |
+# | | + vx10 (vxlan) | | | | + vx10 (vxlan) | |
+# | | local 192.0.2.34 | | | | local 192.0.2.50 | |
+# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | |
+# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | |
+# | | id 1000 dstport $VXPORT | | | | id 1000 dstport $VXPORT | |
+# | | vid 10 pvid untagged | | | | vid 10 pvid untagged | |
+# | | | | | | | |
+# | | + vx20 (vxlan) | | | | + vx20 (vxlan) | |
+# | | local 192.0.2.34 | | | | local 192.0.2.50 | |
+# | | remote 192.0.2.17 | | | | remote 192.0.2.17 | |
+# | | remote 192.0.2.50 | | | | remote 192.0.2.34 | |
+# | | id 2000 dstport $VXPORT | | | | id 2000 dstport $VXPORT | |
+# | | vid 20 pvid untagged | | | | vid 20 pvid untagged | |
+# | | | | | | | |
+# | | + w1 (veth) | | | | + w1 (veth) | |
+# | | | vid 10 | | | | | vid 10 | |
+# | | | vid 20 | | | | | vid 20 | |
+# | +--|----------------------------+ | | +--|----------------------------+ |
+# | | | | | |
+# | +--|----------------------------+ | | +--|----------------------------+ |
+# | | + w2 (veth) VW2 (vrf) | | | | + w2 (veth) VW2 (vrf) | |
+# | | |\ | | | | |\ | |
+# | | | + w2.10 | | | | | + w2.10 | |
+# | | | 192.0.2.3/28 | | | | | 192.0.2.4/28 | |
+# | | | | | | | | | |
+# | | + w2.20 | | | | + w2.20 | |
+# | | 198.51.100.3/24 | | | | 198.51.100.4/24 | |
+# | +-------------------------------+ | | +-------------------------------+ |
+# +-----------------------------------+ +-----------------------------------+
+
+: ${VXPORT:=4789}
+export VXPORT
+
+: ${ALL_TESTS:="
+ ping_ipv4
+ test_flood
+ test_unicast
+ reapply_config
+ ping_ipv4
+ test_flood
+ test_unicast
+ test_learning
+ test_pvid
+ "}
+
+NUM_NETIFS=6
+source lib.sh
+
+h1_create()
+{
+ simple_if_init $h1
+ tc qdisc add dev $h1 clsact
+ vlan_create $h1 10 v$h1 192.0.2.1/28
+ vlan_create $h1 20 v$h1 198.51.100.1/24
+}
+
+h1_destroy()
+{
+ vlan_destroy $h1 20
+ vlan_destroy $h1 10
+ tc qdisc del dev $h1 clsact
+ simple_if_fini $h1
+}
+
+h2_create()
+{
+ simple_if_init $h2
+ tc qdisc add dev $h2 clsact
+ vlan_create $h2 10 v$h2 192.0.2.2/28
+ vlan_create $h2 20 v$h2 198.51.100.2/24
+}
+
+h2_destroy()
+{
+ vlan_destroy $h2 20
+ vlan_destroy $h2 10
+ tc qdisc del dev $h2 clsact
+ simple_if_fini $h2
+}
+
+rp1_set_addr()
+{
+ ip address add dev $rp1 192.0.2.17/28
+
+ ip route add 192.0.2.32/28 nexthop via 192.0.2.18
+ ip route add 192.0.2.48/28 nexthop via 192.0.2.18
+}
+
+rp1_unset_addr()
+{
+ ip route del 192.0.2.48/28 nexthop via 192.0.2.18
+ ip route del 192.0.2.32/28 nexthop via 192.0.2.18
+
+ ip address del dev $rp1 192.0.2.17/28
+}
+
+switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+ mcast_snooping 0
+ # Make sure the bridge uses the MAC address of the local port and not
+ # that of the VxLAN's device.
+ ip link set dev br1 address $(mac_get $swp1)
+ ip link set dev br1 up
+
+ ip link set dev $rp1 up
+ rp1_set_addr
+
+ ip link add name vx10 type vxlan id 1000 \
+ local 192.0.2.17 dstport "$VXPORT" \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx10 up
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ ip link add name vx20 type vxlan id 2000 \
+ local 192.0.2.17 dstport "$VXPORT" \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx20 up
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ bridge vlan add vid 10 dev $swp1
+ bridge vlan add vid 20 dev $swp1
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+ bridge vlan add vid 10 dev $swp2
+ bridge vlan add vid 20 dev $swp2
+
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst 192.0.2.50 self
+
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst 192.0.2.50 self
+}
+
+switch_destroy()
+{
+ bridge fdb del dev vx20 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx20 00:00:00:00:00:00 dst 192.0.2.34 self
+
+ bridge fdb del dev vx10 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx10 00:00:00:00:00:00 dst 192.0.2.34 self
+
+ bridge vlan del vid 20 dev $swp2
+ bridge vlan del vid 10 dev $swp2
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+
+ bridge vlan del vid 20 dev $swp1
+ bridge vlan del vid 10 dev $swp1
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ bridge vlan del vid 20 dev vx20
+ ip link set dev vx20 nomaster
+
+ ip link set dev vx20 down
+ ip link del dev vx20
+
+ bridge vlan del vid 10 dev vx10
+ ip link set dev vx10 nomaster
+
+ ip link set dev vx10 down
+ ip link del dev vx10
+
+ rp1_unset_addr
+ ip link set dev $rp1 down
+
+ ip link set dev br1 down
+ ip link del dev br1
+}
+
+vrp2_create()
+{
+ simple_if_init $rp2 192.0.2.18/28
+ __simple_if_init v1 v$rp2 192.0.2.33/28
+ __simple_if_init v3 v$rp2 192.0.2.49/28
+ tc qdisc add dev v1 clsact
+}
+
+vrp2_destroy()
+{
+ tc qdisc del dev v1 clsact
+ __simple_if_fini v3 192.0.2.49/28
+ __simple_if_fini v1 192.0.2.33/28
+ simple_if_fini $rp2 192.0.2.18/28
+}
+
+ns_init_common()
+{
+ local in_if=$1; shift
+ local in_addr=$1; shift
+ local other_in_addr=$1; shift
+ local nh_addr=$1; shift
+ local host_addr1=$1; shift
+ local host_addr2=$1; shift
+
+ ip link set dev $in_if up
+ ip address add dev $in_if $in_addr/28
+ tc qdisc add dev $in_if clsact
+
+ ip link add name br2 type bridge vlan_filtering 1 vlan_default_pvid 0
+ ip link set dev br2 up
+
+ ip link add name w1 type veth peer name w2
+
+ ip link set dev w1 master br2
+ ip link set dev w1 up
+
+ bridge vlan add vid 10 dev w1
+ bridge vlan add vid 20 dev w1
+
+ ip link add name vx10 type vxlan id 1000 local $in_addr \
+ dstport "$VXPORT"
+ ip link set dev vx10 up
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst 192.0.2.17 self
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst $other_in_addr self
+
+ ip link set dev vx10 master br2
+ tc qdisc add dev vx10 clsact
+
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ ip link add name vx20 type vxlan id 2000 local $in_addr \
+ dstport "$VXPORT"
+ ip link set dev vx20 up
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst 192.0.2.17 self
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst $other_in_addr self
+
+ ip link set dev vx20 master br2
+ tc qdisc add dev vx20 clsact
+
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ simple_if_init w2
+ vlan_create w2 10 vw2 $host_addr1/28
+ vlan_create w2 20 vw2 $host_addr2/24
+
+ ip route add 192.0.2.16/28 nexthop via $nh_addr
+ ip route add $other_in_addr/32 nexthop via $nh_addr
+}
+export -f ns_init_common
+
+ns1_create()
+{
+ ip netns add ns1
+ ip link set dev v2 netns ns1
+ in_ns ns1 \
+ ns_init_common v2 192.0.2.34 192.0.2.50 192.0.2.33 192.0.2.3 \
+ 198.51.100.3
+}
+
+ns1_destroy()
+{
+ ip netns exec ns1 ip link set dev v2 netns 1
+ ip netns del ns1
+}
+
+ns2_create()
+{
+ ip netns add ns2
+ ip link set dev v4 netns ns2
+ in_ns ns2 \
+ ns_init_common v4 192.0.2.50 192.0.2.34 192.0.2.49 192.0.2.4 \
+ 198.51.100.4
+}
+
+ns2_destroy()
+{
+ ip netns exec ns2 ip link set dev v4 netns 1
+ ip netns del ns2
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp1=${NETIFS[p5]}
+ rp2=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ switch_create
+
+ ip link add name v1 type veth peer name v2
+ ip link add name v3 type veth peer name v4
+ vrp2_create
+ ns1_create
+ ns2_create
+
+ r1_mac=$(in_ns ns1 mac_get w2)
+ r2_mac=$(in_ns ns2 mac_get w2)
+ h2_mac=$(mac_get $h2)
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ns2_destroy
+ ns1_destroy
+ vrp2_destroy
+ ip link del dev v3
+ ip link del dev v1
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+# For the first round of tests, vx10 and vx20 were the first devices to get
+# attached to the bridge, at a point when the local IP was already configured.
+# Try the other scenario of attaching these devices to a bridge that already
+# has local port members, and only then assign the local IP.
+reapply_config()
+{
+ log_info "Reapplying configuration"
+
+ bridge fdb del dev vx20 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx20 00:00:00:00:00:00 dst 192.0.2.34 self
+
+ bridge fdb del dev vx10 00:00:00:00:00:00 dst 192.0.2.50 self
+ bridge fdb del dev vx10 00:00:00:00:00:00 dst 192.0.2.34 self
+
+ ip link set dev vx20 nomaster
+ ip link set dev vx10 nomaster
+
+ rp1_unset_addr
+ sleep 5
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx10 00:00:00:00:00:00 dst 192.0.2.50 self
+
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst 192.0.2.34 self
+ bridge fdb append dev vx20 00:00:00:00:00:00 dst 192.0.2.50 self
+
+ rp1_set_addr
+ sleep 5
+}
+
+ping_ipv4()
+{
+ ping_test $h1.10 192.0.2.2 ": local->local vid 10"
+ ping_test $h1.20 198.51.100.2 ": local->local vid 20"
+ ping_test $h1.10 192.0.2.3 ": local->remote 1 vid 10"
+ ping_test $h1.10 192.0.2.4 ": local->remote 2 vid 10"
+ ping_test $h1.20 198.51.100.3 ": local->remote 1 vid 20"
+ ping_test $h1.20 198.51.100.4 ": local->remote 2 vid 20"
+}
+
+maybe_in_ns()
+{
+ echo ${1:+in_ns} $1
+}
+
+__flood_counter_add_del()
+{
+ local add_del=$1; shift
+ local dev=$1; shift
+ local ns=$1; shift
+
+ # Putting the ICMP capture both to HW and to SW will end up
+ # double-counting the packets that are trapped to slow path, such as for
+ # the unicast test. Adding either skip_hw or skip_sw fixes this problem,
+ # but with skip_hw, the flooded packets are not counted at all, because
+ # those are dropped due to MAC address mismatch; and skip_sw is a no-go
+ # for veth-based topologies.
+ #
+ # So try to install with skip_sw and fall back to skip_hw if that fails.
+
+ $(maybe_in_ns $ns) __icmp_capture_add_del \
+ $add_del 100 "" $dev skip_sw 2>/dev/null || \
+ $(maybe_in_ns $ns) __icmp_capture_add_del \
+ $add_del 100 "" $dev skip_hw
+}
+
+flood_counter_install()
+{
+ __flood_counter_add_del add "$@"
+}
+
+flood_counter_uninstall()
+{
+ __flood_counter_add_del del "$@"
+}
+
+flood_fetch_stat()
+{
+ local dev=$1; shift
+ local ns=$1; shift
+
+ $(maybe_in_ns $ns) tc_rule_stats_get $dev 100 ingress
+}
+
+flood_fetch_stats()
+{
+ local counters=("${@}")
+ local counter
+
+ for counter in "${counters[@]}"; do
+ flood_fetch_stat $counter
+ done
+}
+
+vxlan_flood_test()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local vid=$1; shift
+ local -a expects=("${@}")
+
+ local -a counters=($h2 "vx10 ns1" "vx20 ns1" "vx10 ns2" "vx20 ns2")
+ local counter
+ local key
+
+ # Packets reach the local host tagged whereas they reach the VxLAN
+ # devices untagged. In order to be able to use the same filter for
+ # all counters, make sure the packets also reach the local host
+ # untagged
+ bridge vlan add vid $vid dev $swp2 untagged
+ for counter in "${counters[@]}"; do
+ flood_counter_install $counter
+ done
+
+ local -a t0s=($(flood_fetch_stats "${counters[@]}"))
+ $MZ $h1 -Q $vid -c 10 -d 100msec -p 64 -b $mac -B $dst -t icmp -q
+ sleep 1
+ local -a t1s=($(flood_fetch_stats "${counters[@]}"))
+
+ for key in ${!t0s[@]}; do
+ local delta=$((t1s[$key] - t0s[$key]))
+ local expect=${expects[$key]}
+
+ ((expect == delta))
+ check_err $? "${counters[$key]}: Expected to capture $expect packets, got $delta."
+ done
+
+ for counter in "${counters[@]}"; do
+ flood_counter_uninstall $counter
+ done
+ bridge vlan add vid $vid dev $swp2
+}
+
+__test_flood()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local vid=$1; shift
+ local what=$1; shift
+ local -a expects=("${@}")
+
+ RET=0
+
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: $what"
+}
+
+test_flood()
+{
+ __test_flood de:ad:be:ef:13:37 192.0.2.100 10 "flood vlan 10" \
+ 10 10 0 10 0
+ __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 "flood vlan 20" \
+ 10 0 10 0 10
+}
+
+vxlan_fdb_add_del()
+{
+ local add_del=$1; shift
+ local vid=$1; shift
+ local mac=$1; shift
+ local dev=$1; shift
+ local dst=$1; shift
+
+ bridge fdb $add_del dev $dev $mac self static permanent \
+ ${dst:+dst} $dst 2>/dev/null
+ bridge fdb $add_del dev $dev $mac master static vlan $vid 2>/dev/null
+}
+
+__test_unicast()
+{
+ local mac=$1; shift
+ local dst=$1; shift
+ local hit_idx=$1; shift
+ local vid=$1; shift
+ local what=$1; shift
+
+ RET=0
+
+ local -a expects=(0 0 0 0 0)
+ expects[$hit_idx]=10
+
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: $what"
+}
+
+test_unicast()
+{
+ local -a targets=("$h2_mac $h2"
+ "$r1_mac vx10 192.0.2.34"
+ "$r2_mac vx10 192.0.2.50")
+ local target
+
+ log_info "unicast vlan 10"
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del add 10 $target
+ done
+
+ __test_unicast $h2_mac 192.0.2.2 0 10 "local MAC unicast"
+ __test_unicast $r1_mac 192.0.2.3 1 10 "remote MAC 1 unicast"
+ __test_unicast $r2_mac 192.0.2.4 3 10 "remote MAC 2 unicast"
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del del 10 $target
+ done
+
+ log_info "unicast vlan 20"
+
+ targets=("$h2_mac $h2" "$r1_mac vx20 192.0.2.34" \
+ "$r2_mac vx20 192.0.2.50")
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del add 20 $target
+ done
+
+ __test_unicast $h2_mac 198.51.100.2 0 20 "local MAC unicast"
+ __test_unicast $r1_mac 198.51.100.3 2 20 "remote MAC 1 unicast"
+ __test_unicast $r2_mac 198.51.100.4 4 20 "remote MAC 2 unicast"
+
+ for target in "${targets[@]}"; do
+ vxlan_fdb_add_del del 20 $target
+ done
+}
+
+test_pvid()
+{
+ local -a expects=(0 0 0 0 0)
+ local mac=de:ad:be:ef:13:37
+ local dst=192.0.2.100
+ local vid=10
+
+ # Check that flooding works
+ RET=0
+
+ expects[0]=10; expects[1]=10; expects[3]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood before pvid off"
+
+ # Toggle PVID off and test that flood to remote hosts does not work
+ RET=0
+
+ bridge vlan add vid 10 dev vx10
+
+ expects[0]=10; expects[1]=0; expects[3]=0
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood after pvid off"
+
+ # Toggle PVID on and test that flood to remote hosts does work
+ RET=0
+
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ expects[0]=10; expects[1]=10; expects[3]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood after pvid on"
+
+ # Add a new VLAN and test that it does not affect flooding
+ RET=0
+
+ bridge vlan add vid 30 dev vx10
+
+ expects[0]=10; expects[1]=10; expects[3]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ bridge vlan del vid 30 dev vx10
+
+ log_test "VXLAN: flood after vlan add"
+
+ # Remove currently mapped VLAN and test that flood to remote hosts does
+ # not work
+ RET=0
+
+ bridge vlan del vid 10 dev vx10
+
+ expects[0]=10; expects[1]=0; expects[3]=0
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood after vlan delete"
+
+ # Re-add the VLAN and test that flood to remote hosts does work
+ RET=0
+
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ expects[0]=10; expects[1]=10; expects[3]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood after vlan re-add"
+}
+
+vxlan_ping_test()
+{
+ local ping_dev=$1; shift
+ local ping_dip=$1; shift
+ local ping_args=$1; shift
+ local capture_dev=$1; shift
+ local capture_dir=$1; shift
+ local capture_pref=$1; shift
+ local expect=$1; shift
+
+ local t0=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+ ping_do $ping_dev $ping_dip "$ping_args"
+ local t1=$(tc_rule_stats_get $capture_dev $capture_pref $capture_dir)
+ local delta=$((t1 - t0))
+
+ # Tolerate a couple stray extra packets.
+ ((expect <= delta && delta <= expect + 2))
+ check_err $? "$capture_dev: Expected to capture $expect packets, got $delta."
+}
+
+__test_learning()
+{
+ local -a expects=(0 0 0 0 0)
+ local mac=$1; shift
+ local dst=$1; shift
+ local vid=$1; shift
+ local idx1=$1; shift
+ local idx2=$1; shift
+ local vx=vx$vid
+
+ # Check that flooding works
+ RET=0
+
+ expects[0]=10; expects[$idx1]=10; expects[$idx2]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: flood before learning"
+
+ # Send a packet with source mac set to $mac from host w2 and check that
+ # a corresponding entry is created in the VxLAN device
+ RET=0
+
+ in_ns ns1 $MZ w2 -Q $vid -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff \
+ -B $dst -t icmp -q
+ sleep 1
+
+ bridge fdb show brport $vx | grep $mac | grep -q self
+ check_err $?
+ bridge fdb show brport $vx | grep $mac | grep "vlan $vid" \
+ | grep -q -v self
+ check_err $?
+
+ log_test "VXLAN: show learned FDB entry"
+
+ # Repeat first test and check that packets only reach host w2 in ns1
+ RET=0
+
+ expects[0]=0; expects[$idx1]=10; expects[$idx2]=0
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: learned FDB entry"
+
+ # Delete the learned FDB entry from the VxLAN and bridge devices and
+ # check that packets are flooded
+ RET=0
+
+ bridge fdb del dev $vx $mac master self vlan $vid
+ sleep 1
+
+ expects[0]=10; expects[$idx1]=10; expects[$idx2]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: deletion of learned FDB entry"
+
+ # Re-learn the first FDB entry and check that it is correctly aged-out
+ RET=0
+
+ in_ns ns1 $MZ w2 -Q $vid -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff \
+ -B $dst -t icmp -q
+ sleep 1
+
+ bridge fdb show brport $vx | grep $mac | grep -q self
+ check_err $?
+ bridge fdb show brport $vx | grep $mac | grep "vlan $vid" \
+ | grep -q -v self
+ check_err $?
+
+ expects[0]=0; expects[$idx1]=10; expects[$idx2]=0
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ sleep 20
+
+ bridge fdb show brport $vx | grep $mac | grep -q self
+ check_fail $?
+ bridge fdb show brport $vx | grep $mac | grep "vlan $vid" \
+ | grep -q -v self
+ check_fail $?
+
+ expects[0]=10; expects[$idx1]=10; expects[$idx2]=10
+ vxlan_flood_test $mac $dst $vid "${expects[@]}"
+
+ log_test "VXLAN: Ageing of learned FDB entry"
+
+ # Toggle learning on the bridge port and check that the bridge's FDB
+ # is populated only when it should be
+ RET=0
+
+ ip link set dev $vx type bridge_slave learning off
+
+ in_ns ns1 $MZ w2 -Q $vid -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff \
+ -B $dst -t icmp -q
+ sleep 1
+
+ bridge fdb show brport $vx | grep $mac | grep "vlan $vid" \
+ | grep -q -v self
+ check_fail $?
+
+ ip link set dev $vx type bridge_slave learning on
+
+ in_ns ns1 $MZ w2 -Q $vid -c 1 -p 64 -a $mac -b ff:ff:ff:ff:ff:ff \
+ -B $dst -t icmp -q
+ sleep 1
+
+ bridge fdb show brport $vx | grep $mac | grep "vlan $vid" \
+ | grep -q -v self
+ check_err $?
+
+ log_test "VXLAN: learning toggling on bridge port"
+}
+
+test_learning()
+{
+ local mac=de:ad:be:ef:13:37
+ local dst=192.0.2.100
+ local vid=10
+
+ # Enable learning on the VxLAN devices and set ageing time to 10 seconds
+ ip link set dev br1 type bridge ageing_time 1000
+ ip link set dev vx10 type vxlan ageing 10
+ ip link set dev vx10 type vxlan learning
+ ip link set dev vx20 type vxlan ageing 10
+ ip link set dev vx20 type vxlan learning
+ reapply_config
+
+ log_info "learning vlan 10"
+
+ __test_learning $mac $dst $vid 1 3
+
+ log_info "learning vlan 20"
+
+ mac=ca:fe:be:ef:13:37
+ dst=198.51.100.100
+ vid=20
+
+ __test_learning $mac $dst $vid 2 4
+
+ # Restore previous settings
+ ip link set dev vx20 type vxlan nolearning
+ ip link set dev vx20 type vxlan ageing 300
+ ip link set dev vx10 type vxlan nolearning
+ ip link set dev vx10 type vxlan ageing 300
+ ip link set dev br1 type bridge ageing_time 30000
+ reapply_config
+}
+
+test_all()
+{
+ log_info "Running tests with UDP port $VXPORT"
+ tests_run
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+test_all
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472.sh b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472.sh
new file mode 100755
index 000000000..b1b2d1a31
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_bridge_1q_port_8472.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# A wrapper to run VXLAN tests with an unusual port number.
+
+VXPORT=8472
+ALL_TESTS="
+ ping_ipv4
+"
+source vxlan_bridge_1q.sh
diff --git a/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh b/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh
new file mode 100755
index 000000000..5d97fa347
--- /dev/null
+++ b/tools/testing/selftests/net/forwarding/vxlan_symmetric.sh
@@ -0,0 +1,561 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# +---------------------------+ +------------------------------+
+# | vrf-h1 | | vrf-h2 |
+# | + $h1 | | + $h2 |
+# | | 10.1.1.101/24 | | | 10.1.2.101/24 |
+# | | default via 10.1.1.1 | | | default via 10.1.2.1 |
+# +----|----------------------+ +----|-------------------------+
+# | |
+# +----|--------------------------------------------|-------------------------+
+# | SW | | |
+# | +--|--------------------------------------------|-----------------------+ |
+# | | + $swp1 br1 + $swp2 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | | + vx10 + vx20 | |
+# | | local 10.0.0.1 local 10.0.0.1 | |
+# | | remote 10.0.0.2 remote 10.0.0.2 | |
+# | | id 1010 id 1020 | |
+# | | dstport 4789 dstport 4789 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | | + vx4001 | |
+# | | local 10.0.0.1 | |
+# | | remote 10.0.0.2 | |
+# | | id 104001 | |
+# | | dstport 4789 | |
+# | | vid 4001 pvid untagged | |
+# | | | |
+# | +-----------------------------------+-----------------------------------+ |
+# | | |
+# | +-----------------------------------|-----------------------------------+ |
+# | | | | |
+# | | +--------------------------------+--------------------------------+ | |
+# | | | | | | |
+# | | + vlan10 | vlan20 + | |
+# | | | 10.1.1.11/24 | 10.1.2.11/24 | | |
+# | | | | | | |
+# | | + vlan10-v (macvlan) + vlan20-v (macvlan) + | |
+# | | 10.1.1.1/24 vlan4001 10.1.2.1/24 | |
+# | | 00:00:5e:00:01:01 00:00:5e:00:01:01 | |
+# | | vrf-green | |
+# | +-----------------------------------------------------------------------+ |
+# | |
+# | + $rp1 +lo |
+# | | 192.0.2.1/24 10.0.0.1/32 |
+# +----|----------------------------------------------------------------------+
+# |
+# +----|--------------------------------------------------------+
+# | | vrf-spine |
+# | + $rp2 |
+# | 192.0.2.2/24 |
+# | | (maybe) HW
+# =============================================================================
+# | | (likely) SW
+# | |
+# | + v1 (veth) |
+# | | 192.0.3.2/24 |
+# +----|--------------------------------------------------------+
+# |
+# +----|----------------------------------------------------------------------+
+# | + v2 (veth) +lo NS1 (netns) |
+# | 192.0.3.1/24 10.0.0.2/32 |
+# | |
+# | +-----------------------------------------------------------------------+ |
+# | | vrf-green | |
+# | | + vlan10-v (macvlan) vlan20-v (macvlan) + | |
+# | | | 10.1.1.1/24 10.1.2.1/24 | | |
+# | | | 00:00:5e:00:01:01 00:00:5e:00:01:01 | | |
+# | | | vlan4001 | | |
+# | | + vlan10 + vlan20 + | |
+# | | | 10.1.1.12/24 | 10.1.2.12/24 | | |
+# | | | | | | |
+# | | +--------------------------------+--------------------------------+ | |
+# | | | | |
+# | +-----------------------------------|-----------------------------------+ |
+# | | |
+# | +-----------------------------------+-----------------------------------+ |
+# | | | |
+# | | + vx10 + vx20 | |
+# | | local 10.0.0.2 local 10.0.0.2 | |
+# | | remote 10.0.0.1 remote 10.0.0.1 | |
+# | | id 1010 id 1020 | |
+# | | dstport 4789 dstport 4789 | |
+# | | vid 10 pvid untagged vid 20 pvid untagged | |
+# | | | |
+# | | + vx4001 | |
+# | | local 10.0.0.2 | |
+# | | remote 10.0.0.1 | |
+# | | id 104001 | |
+# | | dstport 4789 | |
+# | | vid 4001 pvid untagged | |
+# | | | |
+# | | + w1 (veth) + w3 (veth) | |
+# | | | vid 10 pvid untagged br1 | vid 20 pvid untagged | |
+# | +--|------------------------------------------|-------------------------+ |
+# | | | |
+# | | | |
+# | +--|----------------------+ +--|-------------------------+ |
+# | | | vrf-h1 | | | vrf-h2 | |
+# | | + w2 (veth) | | + w4 (veth) | |
+# | | 10.1.1.102/24 | | 10.1.2.102/24 | |
+# | | default via 10.1.1.1 | | default via 10.1.2.1 | |
+# | +-------------------------+ +----------------------------+ |
+# +---------------------------------------------------------------------------+
+
+ALL_TESTS="
+ ping_ipv4
+"
+NUM_NETIFS=6
+source lib.sh
+
+hx_create()
+{
+ local vrf_name=$1; shift
+ local if_name=$1; shift
+ local ip_addr=$1; shift
+ local gw_ip=$1; shift
+
+ vrf_create $vrf_name
+ ip link set dev $if_name master $vrf_name
+ ip link set dev $vrf_name up
+ ip link set dev $if_name up
+
+ ip address add $ip_addr/24 dev $if_name
+ ip neigh replace $gw_ip lladdr 00:00:5e:00:01:01 nud permanent \
+ dev $if_name
+ ip route add default vrf $vrf_name nexthop via $gw_ip
+}
+export -f hx_create
+
+hx_destroy()
+{
+ local vrf_name=$1; shift
+ local if_name=$1; shift
+ local ip_addr=$1; shift
+ local gw_ip=$1; shift
+
+ ip route del default vrf $vrf_name nexthop via $gw_ip
+ ip neigh del $gw_ip dev $if_name
+ ip address del $ip_addr/24 dev $if_name
+
+ ip link set dev $if_name down
+ vrf_destroy $vrf_name
+}
+
+h1_create()
+{
+ hx_create "vrf-h1" $h1 10.1.1.101 10.1.1.1
+}
+
+h1_destroy()
+{
+ hx_destroy "vrf-h1" $h1 10.1.1.101 10.1.1.1
+}
+
+h2_create()
+{
+ hx_create "vrf-h2" $h2 10.1.2.101 10.1.2.1
+}
+
+h2_destroy()
+{
+ hx_destroy "vrf-h2" $h2 10.1.2.101 10.1.2.1
+}
+
+switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+ mcast_snooping 0
+ # Make sure the bridge uses the MAC address of the local port and not
+ # that of the VxLAN's device.
+ ip link set dev br1 address $(mac_get $swp1)
+ ip link set dev br1 up
+
+ ip link set dev $rp1 up
+ ip address add dev $rp1 192.0.2.1/24
+ ip route add 10.0.0.2/32 nexthop via 192.0.2.2
+
+ ip link add name vx10 type vxlan id 1010 \
+ local 10.0.0.1 remote 10.0.0.2 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx10 up
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ ip link add name vx20 type vxlan id 1020 \
+ local 10.0.0.1 remote 10.0.0.2 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx20 up
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ ip link set dev $swp1 master br1
+ ip link set dev $swp1 up
+ bridge vlan add vid 10 dev $swp1 pvid untagged
+
+ ip link set dev $swp2 master br1
+ ip link set dev $swp2 up
+ bridge vlan add vid 20 dev $swp2 pvid untagged
+
+ ip link add name vx4001 type vxlan id 104001 \
+ local 10.0.0.1 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx4001 up
+
+ ip link set dev vx4001 master br1
+ bridge vlan add vid 4001 dev vx4001 pvid untagged
+
+ ip address add 10.0.0.1/32 dev lo
+
+ # Create SVIs
+ vrf_create "vrf-green"
+ ip link set dev vrf-green up
+
+ ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+ ip address add 10.1.1.11/24 dev vlan10
+ ip link add link vlan10 name vlan10-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.1.1/24 dev vlan10-v
+
+ ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+ ip address add 10.1.2.11/24 dev vlan20
+ ip link add link vlan20 name vlan20-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.2.1/24 dev vlan20-v
+
+ ip link add link br1 name vlan4001 up master vrf-green \
+ type vlan id 4001
+
+ bridge vlan add vid 10 dev br1 self
+ bridge vlan add vid 20 dev br1 self
+ bridge vlan add vid 4001 dev br1 self
+
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0
+}
+
+switch_destroy()
+{
+ sysctl_restore net.ipv4.conf.all.rp_filter
+
+ bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 20
+ bridge fdb del 00:00:5e:00:01:01 dev br1 self local vlan 10
+
+ bridge vlan del vid 4001 dev br1 self
+ bridge vlan del vid 20 dev br1 self
+ bridge vlan del vid 10 dev br1 self
+
+ ip link del dev vlan4001
+
+ ip link del dev vlan20
+
+ ip link del dev vlan10
+
+ vrf_destroy "vrf-green"
+
+ ip address del 10.0.0.1/32 dev lo
+
+ bridge vlan del vid 20 dev $swp2
+ ip link set dev $swp2 down
+ ip link set dev $swp2 nomaster
+
+ bridge vlan del vid 10 dev $swp1
+ ip link set dev $swp1 down
+ ip link set dev $swp1 nomaster
+
+ bridge vlan del vid 4001 dev vx4001
+ ip link set dev vx4001 nomaster
+
+ ip link set dev vx4001 down
+ ip link del dev vx4001
+
+ bridge vlan del vid 20 dev vx20
+ ip link set dev vx20 nomaster
+
+ ip link set dev vx20 down
+ ip link del dev vx20
+
+ bridge vlan del vid 10 dev vx10
+ ip link set dev vx10 nomaster
+
+ ip link set dev vx10 down
+ ip link del dev vx10
+
+ ip route del 10.0.0.2/32 nexthop via 192.0.2.2
+ ip address del dev $rp1 192.0.2.1/24
+ ip link set dev $rp1 down
+
+ ip link set dev br1 down
+ ip link del dev br1
+}
+
+spine_create()
+{
+ vrf_create "vrf-spine"
+ ip link set dev $rp2 master vrf-spine
+ ip link set dev v1 master vrf-spine
+ ip link set dev vrf-spine up
+ ip link set dev $rp2 up
+ ip link set dev v1 up
+
+ ip address add 192.0.2.2/24 dev $rp2
+ ip address add 192.0.3.2/24 dev v1
+
+ ip route add 10.0.0.1/32 vrf vrf-spine nexthop via 192.0.2.1
+ ip route add 10.0.0.2/32 vrf vrf-spine nexthop via 192.0.3.1
+}
+
+spine_destroy()
+{
+ ip route del 10.0.0.2/32 vrf vrf-spine nexthop via 192.0.3.1
+ ip route del 10.0.0.1/32 vrf vrf-spine nexthop via 192.0.2.1
+
+ ip address del 192.0.3.2/24 dev v1
+ ip address del 192.0.2.2/24 dev $rp2
+
+ ip link set dev v1 down
+ ip link set dev $rp2 down
+ vrf_destroy "vrf-spine"
+}
+
+ns_h1_create()
+{
+ hx_create "vrf-h1" w2 10.1.1.102 10.1.1.1
+}
+export -f ns_h1_create
+
+ns_h2_create()
+{
+ hx_create "vrf-h2" w4 10.1.2.102 10.1.2.1
+}
+export -f ns_h2_create
+
+ns_switch_create()
+{
+ ip link add name br1 type bridge vlan_filtering 1 vlan_default_pvid 0 \
+ mcast_snooping 0
+ ip link set dev br1 up
+
+ ip link set dev v2 up
+ ip address add dev v2 192.0.3.1/24
+ ip route add 10.0.0.1/32 nexthop via 192.0.3.2
+
+ ip link add name vx10 type vxlan id 1010 \
+ local 10.0.0.2 remote 10.0.0.1 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx10 up
+
+ ip link set dev vx10 master br1
+ bridge vlan add vid 10 dev vx10 pvid untagged
+
+ ip link add name vx20 type vxlan id 1020 \
+ local 10.0.0.2 remote 10.0.0.1 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx20 up
+
+ ip link set dev vx20 master br1
+ bridge vlan add vid 20 dev vx20 pvid untagged
+
+ ip link add name vx4001 type vxlan id 104001 \
+ local 10.0.0.2 dstport 4789 \
+ nolearning noudpcsum tos inherit ttl 100
+ ip link set dev vx4001 up
+
+ ip link set dev vx4001 master br1
+ bridge vlan add vid 4001 dev vx4001 pvid untagged
+
+ ip link set dev w1 master br1
+ ip link set dev w1 up
+ bridge vlan add vid 10 dev w1 pvid untagged
+
+ ip link set dev w3 master br1
+ ip link set dev w3 up
+ bridge vlan add vid 20 dev w3 pvid untagged
+
+ ip address add 10.0.0.2/32 dev lo
+
+ # Create SVIs
+ vrf_create "vrf-green"
+ ip link set dev vrf-green up
+
+ ip link add link br1 name vlan10 up master vrf-green type vlan id 10
+ ip address add 10.1.1.12/24 dev vlan10
+ ip link add link vlan10 name vlan10-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.1.1/24 dev vlan10-v
+
+ ip link add link br1 name vlan20 up master vrf-green type vlan id 20
+ ip address add 10.1.2.12/24 dev vlan20
+ ip link add link vlan20 name vlan20-v up master vrf-green \
+ address 00:00:5e:00:01:01 type macvlan mode private
+ ip address add 10.1.2.1/24 dev vlan20-v
+
+ ip link add link br1 name vlan4001 up master vrf-green \
+ type vlan id 4001
+
+ bridge vlan add vid 10 dev br1 self
+ bridge vlan add vid 20 dev br1 self
+ bridge vlan add vid 4001 dev br1 self
+
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 10
+ bridge fdb add 00:00:5e:00:01:01 dev br1 self local vlan 20
+
+ sysctl_set net.ipv4.conf.all.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan10-v.rp_filter 0
+ sysctl_set net.ipv4.conf.vlan20-v.rp_filter 0
+}
+export -f ns_switch_create
+
+ns_init()
+{
+ ip link add name w1 type veth peer name w2
+ ip link add name w3 type veth peer name w4
+
+ ip link set dev lo up
+
+ ns_h1_create
+ ns_h2_create
+ ns_switch_create
+}
+export -f ns_init
+
+ns1_create()
+{
+ ip netns add ns1
+ ip link set dev v2 netns ns1
+ in_ns ns1 ns_init
+}
+
+ns1_destroy()
+{
+ ip netns exec ns1 ip link set dev v2 netns 1
+ ip netns del ns1
+}
+
+__l2_vni_init()
+{
+ local mac1=$1; shift
+ local mac2=$1; shift
+ local ip1=$1; shift
+ local ip2=$1; shift
+ local dst=$1; shift
+
+ bridge fdb add $mac1 dev vx10 self master extern_learn static \
+ dst $dst vlan 10
+ bridge fdb add $mac2 dev vx20 self master extern_learn static \
+ dst $dst vlan 20
+
+ ip neigh add $ip1 lladdr $mac1 nud noarp dev vlan10 \
+ extern_learn
+ ip neigh add $ip2 lladdr $mac2 nud noarp dev vlan20 \
+ extern_learn
+}
+export -f __l2_vni_init
+
+l2_vni_init()
+{
+ local h1_ns_mac=$(in_ns ns1 mac_get w2)
+ local h2_ns_mac=$(in_ns ns1 mac_get w4)
+ local h1_mac=$(mac_get $h1)
+ local h2_mac=$(mac_get $h2)
+
+ __l2_vni_init $h1_ns_mac $h2_ns_mac 10.1.1.102 10.1.2.102 10.0.0.2
+ in_ns ns1 __l2_vni_init $h1_mac $h2_mac 10.1.1.101 10.1.2.101 10.0.0.1
+}
+
+__l3_vni_init()
+{
+ local mac=$1; shift
+ local vtep_ip=$1; shift
+ local host1_ip=$1; shift
+ local host2_ip=$1; shift
+
+ bridge fdb add $mac dev vx4001 self master extern_learn static \
+ dst $vtep_ip vlan 4001
+
+ ip neigh add $vtep_ip lladdr $mac nud noarp dev vlan4001 extern_learn
+
+ ip route add $host1_ip/32 vrf vrf-green nexthop via $vtep_ip \
+ dev vlan4001 onlink
+ ip route add $host2_ip/32 vrf vrf-green nexthop via $vtep_ip \
+ dev vlan4001 onlink
+}
+export -f __l3_vni_init
+
+l3_vni_init()
+{
+ local vlan4001_ns_mac=$(in_ns ns1 mac_get vlan4001)
+ local vlan4001_mac=$(mac_get vlan4001)
+
+ __l3_vni_init $vlan4001_ns_mac 10.0.0.2 10.1.1.102 10.1.2.102
+ in_ns ns1 __l3_vni_init $vlan4001_mac 10.0.0.1 10.1.1.101 10.1.2.101
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ swp1=${NETIFS[p2]}
+
+ swp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+
+ rp1=${NETIFS[p5]}
+ rp2=${NETIFS[p6]}
+
+ vrf_prepare
+ forwarding_enable
+
+ h1_create
+ h2_create
+ switch_create
+
+ ip link add name v1 type veth peer name v2
+ spine_create
+ ns1_create
+
+ l2_vni_init
+ l3_vni_init
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ ns1_destroy
+ spine_destroy
+ ip link del dev v1
+
+ switch_destroy
+ h2_destroy
+ h1_destroy
+
+ forwarding_restore
+ vrf_cleanup
+}
+
+ping_ipv4()
+{
+ ping_test $h1 10.1.2.101 ": local->local vid 10->vid 20"
+ ping_test $h1 10.1.1.102 ": local->remote vid 10->vid 10"
+ ping_test $h2 10.1.2.102 ": local->remote vid 20->vid 20"
+ ping_test $h1 10.1.2.102 ": local->remote vid 10->vid 20"
+ ping_test $h2 10.1.1.102 ": local->remote vid 20->vid 10"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/hwtstamp_config.c b/tools/testing/selftests/net/hwtstamp_config.c
new file mode 100644
index 000000000..e1fdee841
--- /dev/null
+++ b/tools/testing/selftests/net/hwtstamp_config.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test program for SIOC{G,S}HWTSTAMP
+ * Copyright 2013 Solarflare Communications
+ * Author: Ben Hutchings
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+
+#include <linux/if.h>
+#include <linux/net_tstamp.h>
+#include <linux/sockios.h>
+
+static int
+lookup_value(const char **names, int size, const char *name)
+{
+ int value;
+
+ for (value = 0; value < size; value++)
+ if (names[value] && strcasecmp(names[value], name) == 0)
+ return value;
+
+ return -1;
+}
+
+static const char *
+lookup_name(const char **names, int size, int value)
+{
+ return (value >= 0 && value < size) ? names[value] : NULL;
+}
+
+static void list_names(FILE *f, const char **names, int size)
+{
+ int value;
+
+ for (value = 0; value < size; value++)
+ if (names[value])
+ fprintf(f, " %s\n", names[value]);
+}
+
+static const char *tx_types[] = {
+#define TX_TYPE(name) [HWTSTAMP_TX_ ## name] = #name
+ TX_TYPE(OFF),
+ TX_TYPE(ON),
+ TX_TYPE(ONESTEP_SYNC)
+#undef TX_TYPE
+};
+#define N_TX_TYPES ((int)(sizeof(tx_types) / sizeof(tx_types[0])))
+
+static const char *rx_filters[] = {
+#define RX_FILTER(name) [HWTSTAMP_FILTER_ ## name] = #name
+ RX_FILTER(NONE),
+ RX_FILTER(ALL),
+ RX_FILTER(SOME),
+ RX_FILTER(PTP_V1_L4_EVENT),
+ RX_FILTER(PTP_V1_L4_SYNC),
+ RX_FILTER(PTP_V1_L4_DELAY_REQ),
+ RX_FILTER(PTP_V2_L4_EVENT),
+ RX_FILTER(PTP_V2_L4_SYNC),
+ RX_FILTER(PTP_V2_L4_DELAY_REQ),
+ RX_FILTER(PTP_V2_L2_EVENT),
+ RX_FILTER(PTP_V2_L2_SYNC),
+ RX_FILTER(PTP_V2_L2_DELAY_REQ),
+ RX_FILTER(PTP_V2_EVENT),
+ RX_FILTER(PTP_V2_SYNC),
+ RX_FILTER(PTP_V2_DELAY_REQ),
+#undef RX_FILTER
+};
+#define N_RX_FILTERS ((int)(sizeof(rx_filters) / sizeof(rx_filters[0])))
+
+static void usage(void)
+{
+ fputs("Usage: hwtstamp_config if_name [tx_type rx_filter]\n"
+ "tx_type is any of (case-insensitive):\n",
+ stderr);
+ list_names(stderr, tx_types, N_TX_TYPES);
+ fputs("rx_filter is any of (case-insensitive):\n", stderr);
+ list_names(stderr, rx_filters, N_RX_FILTERS);
+}
+
+int main(int argc, char **argv)
+{
+ struct ifreq ifr;
+ struct hwtstamp_config config;
+ const char *name;
+ int sock;
+
+ if ((argc != 2 && argc != 4) || (strlen(argv[1]) >= IFNAMSIZ)) {
+ usage();
+ return 2;
+ }
+
+ if (argc == 4) {
+ config.flags = 0;
+ config.tx_type = lookup_value(tx_types, N_TX_TYPES, argv[2]);
+ config.rx_filter = lookup_value(rx_filters, N_RX_FILTERS, argv[3]);
+ if (config.tx_type < 0 || config.rx_filter < 0) {
+ usage();
+ return 2;
+ }
+ }
+
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ perror("socket");
+ return 1;
+ }
+
+ strcpy(ifr.ifr_name, argv[1]);
+ ifr.ifr_data = (caddr_t)&config;
+
+ if (ioctl(sock, (argc == 2) ? SIOCGHWTSTAMP : SIOCSHWTSTAMP, &ifr)) {
+ perror("ioctl");
+ return 1;
+ }
+
+ printf("flags = %#x\n", config.flags);
+ name = lookup_name(tx_types, N_TX_TYPES, config.tx_type);
+ if (name)
+ printf("tx_type = %s\n", name);
+ else
+ printf("tx_type = %d\n", config.tx_type);
+ name = lookup_name(rx_filters, N_RX_FILTERS, config.rx_filter);
+ if (name)
+ printf("rx_filter = %s\n", name);
+ else
+ printf("rx_filter = %d\n", config.rx_filter);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/icmp_redirect.sh b/tools/testing/selftests/net/icmp_redirect.sh
new file mode 100755
index 000000000..104a7a5f1
--- /dev/null
+++ b/tools/testing/selftests/net/icmp_redirect.sh
@@ -0,0 +1,537 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# redirect test
+#
+# .253 +----+
+# +----| r1 |
+# | +----+
+# +----+ | |.1
+# | h1 |--------------+ | 10.1.1.0/30 2001:db8:1::0/126
+# +----+ .1 | |.2
+# 172.16.1/24 | +----+ +----+
+# 2001:db8:16:1/64 +----| r2 |-------------------| h2 |
+# .254 +----+ .254 .2 +----+
+# 172.16.2/24
+# 2001:db8:16:2/64
+#
+# Route from h1 to h2 goes through r1, eth1 - connection between r1 and r2.
+# Route on r1 changed to go to r2 via eth0. This causes a redirect to be sent
+# from r1 to h1 telling h1 to use r2 when talking to h2.
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+
+H1_N1_IP=172.16.1.1
+R1_N1_IP=172.16.1.253
+R2_N1_IP=172.16.1.254
+
+H1_N1_IP6=2001:db8:16:1::1
+R1_N1_IP6=2001:db8:16:1::253
+R2_N1_IP6=2001:db8:16:1::254
+
+R1_R2_N1_IP=10.1.1.1
+R2_R1_N1_IP=10.1.1.2
+
+R1_R2_N1_IP6=2001:db8:1::1
+R2_R1_N1_IP6=2001:db8:1::2
+
+H2_N2=172.16.2.0/24
+H2_N2_6=2001:db8:16:2::/64
+H2_N2_IP=172.16.2.2
+R2_N2_IP=172.16.2.254
+H2_N2_IP6=2001:db8:16:2::2
+R2_N2_IP6=2001:db8:16:2::254
+
+VRF=red
+VRF_TABLE=1111
+
+################################################################################
+# helpers
+
+log_section()
+{
+ echo
+ echo "###########################################################################"
+ echo "$*"
+ echo "###########################################################################"
+ echo
+}
+
+log_test()
+{
+ local rc=$1
+ local expected=$2
+ local msg="$3"
+
+ if [ ${rc} -eq ${expected} ]; then
+ printf "TEST: %-60s [ OK ]\n" "${msg}"
+ nsuccess=$((nsuccess+1))
+ else
+ ret=1
+ nfail=$((nfail+1))
+ printf "TEST: %-60s [FAIL]\n" "${msg}"
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "hit enter to continue, 'q' to quit"
+ read a
+ [ "$a" = "q" ] && exit 1
+ fi
+ fi
+}
+
+log_debug()
+{
+ if [ "$VERBOSE" = "1" ]; then
+ echo "$*"
+ fi
+}
+
+run_cmd()
+{
+ local cmd="$*"
+ local out
+ local rc
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: $cmd"
+ fi
+
+ out=$(eval $cmd 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+ echo "$out"
+ fi
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+get_linklocal()
+{
+ local ns=$1
+ local dev=$2
+ local addr
+
+ addr=$(ip -netns $ns -6 -br addr show dev ${dev} | \
+ awk '{
+ for (i = 3; i <= NF; ++i) {
+ if ($i ~ /^fe80/)
+ print $i
+ }
+ }'
+ )
+ addr=${addr/\/*}
+
+ [ -z "$addr" ] && return 1
+
+ echo $addr
+
+ return 0
+}
+
+################################################################################
+# setup and teardown
+
+cleanup()
+{
+ local ns
+
+ for ns in h1 h2 r1 r2; do
+ ip netns del $ns 2>/dev/null
+ done
+}
+
+create_vrf()
+{
+ local ns=$1
+
+ ip -netns ${ns} link add ${VRF} type vrf table ${VRF_TABLE}
+ ip -netns ${ns} link set ${VRF} up
+ ip -netns ${ns} route add vrf ${VRF} unreachable default metric 8192
+ ip -netns ${ns} -6 route add vrf ${VRF} unreachable default metric 8192
+
+ ip -netns ${ns} addr add 127.0.0.1/8 dev ${VRF}
+ ip -netns ${ns} -6 addr add ::1 dev ${VRF} nodad
+
+ ip -netns ${ns} ru del pref 0
+ ip -netns ${ns} ru add pref 32765 from all lookup local
+ ip -netns ${ns} -6 ru del pref 0
+ ip -netns ${ns} -6 ru add pref 32765 from all lookup local
+}
+
+setup()
+{
+ local ns
+
+ #
+ # create nodes as namespaces
+ #
+ for ns in h1 h2 r1 r2; do
+ ip netns add $ns
+ ip -netns $ns li set lo up
+
+ case "${ns}" in
+ h[12]) ip netns exec $ns sysctl -q -w net.ipv4.conf.all.accept_redirects=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.accept_redirects=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+ ;;
+ r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
+ ip netns exec $ns sysctl -q -w net.ipv4.conf.all.send_redirects=1
+ ip netns exec $ns sysctl -q -w net.ipv4.conf.default.rp_filter=0
+ ip netns exec $ns sysctl -q -w net.ipv4.conf.all.rp_filter=0
+
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
+ ip netns exec $ns sysctl -q -w net.ipv6.route.mtu_expires=10
+ esac
+ done
+
+ #
+ # create interconnects
+ #
+ ip -netns h1 li add eth0 type veth peer name r1h1
+ ip -netns h1 li set r1h1 netns r1 name eth0 up
+
+ ip -netns h1 li add eth1 type veth peer name r2h1
+ ip -netns h1 li set r2h1 netns r2 name eth0 up
+
+ ip -netns h2 li add eth0 type veth peer name r2h2
+ ip -netns h2 li set eth0 up
+ ip -netns h2 li set r2h2 netns r2 name eth2 up
+
+ ip -netns r1 li add eth1 type veth peer name r2r1
+ ip -netns r1 li set eth1 up
+ ip -netns r1 li set r2r1 netns r2 name eth1 up
+
+ #
+ # h1
+ #
+ if [ "${WITH_VRF}" = "yes" ]; then
+ create_vrf "h1"
+ H1_VRF_ARG="vrf ${VRF}"
+ H1_PING_ARG="-I ${VRF}"
+ else
+ H1_VRF_ARG=
+ H1_PING_ARG=
+ fi
+ ip -netns h1 li add br0 type bridge
+ if [ "${WITH_VRF}" = "yes" ]; then
+ ip -netns h1 li set br0 vrf ${VRF} up
+ else
+ ip -netns h1 li set br0 up
+ fi
+ ip -netns h1 addr add dev br0 ${H1_N1_IP}/24
+ ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad
+ ip -netns h1 li set eth0 master br0 up
+ ip -netns h1 li set eth1 master br0 up
+
+ #
+ # h2
+ #
+ ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24
+ ip -netns h2 ro add default via ${R2_N2_IP} dev eth0
+ ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad
+ ip -netns h2 -6 ro add default via ${R2_N2_IP6} dev eth0
+
+ #
+ # r1
+ #
+ ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
+ ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
+ ip -netns r1 addr add dev eth1 ${R1_R2_N1_IP}/30
+ ip -netns r1 -6 addr add dev eth1 ${R1_R2_N1_IP6}/126 nodad
+
+ #
+ # r2
+ #
+ ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24
+ ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad
+ ip -netns r2 addr add dev eth1 ${R2_R1_N1_IP}/30
+ ip -netns r2 -6 addr add dev eth1 ${R2_R1_N1_IP6}/126 nodad
+ ip -netns r2 addr add dev eth2 ${R2_N2_IP}/24
+ ip -netns r2 -6 addr add dev eth2 ${R2_N2_IP6}/64 nodad
+
+ sleep 2
+
+ R1_LLADDR=$(get_linklocal r1 eth0)
+ if [ $? -ne 0 ]; then
+ echo "Error: Failed to get link-local address of r1's eth0"
+ exit 1
+ fi
+ log_debug "initial gateway is R1's lladdr = ${R1_LLADDR}"
+
+ R2_LLADDR=$(get_linklocal r2 eth0)
+ if [ $? -ne 0 ]; then
+ echo "Error: Failed to get link-local address of r2's eth0"
+ exit 1
+ fi
+ log_debug "initial gateway is R2's lladdr = ${R2_LLADDR}"
+}
+
+change_h2_mtu()
+{
+ local mtu=$1
+
+ run_cmd ip -netns h2 li set eth0 mtu ${mtu}
+ run_cmd ip -netns r2 li set eth2 mtu ${mtu}
+}
+
+check_exception()
+{
+ local mtu="$1"
+ local with_redirect="$2"
+ local desc="$3"
+
+ # From 172.16.1.101: icmp_seq=1 Redirect Host(New nexthop: 172.16.1.102)
+ if [ "$VERBOSE" = "1" ]; then
+ echo "Commands to check for exception:"
+ run_cmd ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP}
+ run_cmd ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6}
+ fi
+
+ if [ -n "${mtu}" ]; then
+ mtu=" mtu ${mtu}"
+ fi
+ if [ "$with_redirect" = "yes" ]; then
+ ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
+ grep -q "cache <redirected> expires [0-9]*sec${mtu}"
+ elif [ -n "${mtu}" ]; then
+ ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
+ grep -q "cache expires [0-9]*sec${mtu}"
+ else
+ # want to verify that neither mtu nor redirected appears in
+ # the route get output. The -v will wipe out the cache line
+ # if either are set so the last grep -q will not find a match
+ ip -netns h1 ro get ${H1_VRF_ARG} ${H2_N2_IP} | \
+ grep -E -v 'mtu|redirected' | grep -q "cache"
+ fi
+ log_test $? 0 "IPv4: ${desc}"
+
+ # No PMTU info for test "redirect" and "mtu exception plus redirect"
+ if [ "$with_redirect" = "yes" ] && [ "$desc" != "redirect exception plus mtu" ]; then
+ ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
+ grep -v "mtu" | grep -q "${H2_N2_IP6} .*via ${R2_LLADDR} dev br0"
+ elif [ -n "${mtu}" ]; then
+ ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
+ grep -q "${mtu}"
+ else
+ # IPv6 is a bit harder. First strip out the match if it
+ # contains an mtu exception and then look for the first
+ # gateway - R1's lladdr
+ ip -netns h1 -6 ro get ${H1_VRF_ARG} ${H2_N2_IP6} | \
+ grep -v "mtu" | grep -q "${R1_LLADDR}"
+ fi
+ log_test $? 0 "IPv6: ${desc}"
+}
+
+run_ping()
+{
+ local sz=$1
+
+ run_cmd ip netns exec h1 ping -q -M want -i 0.5 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP}
+ run_cmd ip netns exec h1 ${ping6} -q -M want -i 0.5 -c 10 -w 2 -s ${sz} ${H1_PING_ARG} ${H2_N2_IP6}
+}
+
+replace_route_new()
+{
+ # r1 to h2 via r2 and eth0
+ run_cmd ip -netns r1 nexthop replace id 1 via ${R2_N1_IP} dev eth0
+ run_cmd ip -netns r1 nexthop replace id 2 via ${R2_LLADDR} dev eth0
+}
+
+reset_route_new()
+{
+ run_cmd ip -netns r1 nexthop flush
+ run_cmd ip -netns h1 nexthop flush
+
+ initial_route_new
+}
+
+initial_route_new()
+{
+ # r1 to h2 via r2 and eth1
+ run_cmd ip -netns r1 nexthop add id 1 via ${R2_R1_N1_IP} dev eth1
+ run_cmd ip -netns r1 ro add ${H2_N2} nhid 1
+
+ run_cmd ip -netns r1 nexthop add id 2 via ${R2_R1_N1_IP6} dev eth1
+ run_cmd ip -netns r1 -6 ro add ${H2_N2_6} nhid 2
+
+ # h1 to h2 via r1
+ run_cmd ip -netns h1 nexthop add id 1 via ${R1_N1_IP} dev br0
+ run_cmd ip -netns h1 ro add ${H1_VRF_ARG} ${H2_N2} nhid 1
+
+ run_cmd ip -netns h1 nexthop add id 2 via ${R1_LLADDR} dev br0
+ run_cmd ip -netns h1 -6 ro add ${H1_VRF_ARG} ${H2_N2_6} nhid 2
+}
+
+replace_route_legacy()
+{
+ # r1 to h2 via r2 and eth0
+ run_cmd ip -netns r1 ro replace ${H2_N2} via ${R2_N1_IP} dev eth0
+ run_cmd ip -netns r1 -6 ro replace ${H2_N2_6} via ${R2_LLADDR} dev eth0
+}
+
+reset_route_legacy()
+{
+ run_cmd ip -netns r1 ro del ${H2_N2}
+ run_cmd ip -netns r1 -6 ro del ${H2_N2_6}
+
+ run_cmd ip -netns h1 ro del ${H1_VRF_ARG} ${H2_N2}
+ run_cmd ip -netns h1 -6 ro del ${H1_VRF_ARG} ${H2_N2_6}
+
+ initial_route_legacy
+}
+
+initial_route_legacy()
+{
+ # r1 to h2 via r2 and eth1
+ run_cmd ip -netns r1 ro add ${H2_N2} via ${R2_R1_N1_IP} dev eth1
+ run_cmd ip -netns r1 -6 ro add ${H2_N2_6} via ${R2_R1_N1_IP6} dev eth1
+
+ # h1 to h2 via r1
+ # - IPv6 redirect only works if gateway is the LLA
+ run_cmd ip -netns h1 ro add ${H1_VRF_ARG} ${H2_N2} via ${R1_N1_IP} dev br0
+ run_cmd ip -netns h1 -6 ro add ${H1_VRF_ARG} ${H2_N2_6} via ${R1_LLADDR} dev br0
+}
+
+check_connectivity()
+{
+	local rc=0
+
+	# Ping h2 from h1 over IPv4 and IPv6; return non-zero if either fails.
+	# Capture each status with '||': testing $? first would overwrite it.
+	run_cmd ip netns exec h1 ping -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP} || rc=$?
+	run_cmd ip netns exec h1 ${ping6} -c1 -w1 ${H1_PING_ARG} ${H2_N2_IP6} || rc=$?
+
+	return $rc
+}
+
+do_test()
+{
+ local ttype="$1"
+
+ eval initial_route_${ttype}
+
+ # verify connectivity
+ check_connectivity
+ if [ $? -ne 0 ]; then
+ echo "Error: Basic connectivity is broken"
+ ret=1
+ return
+ fi
+
+ # redirect exception followed by mtu
+ eval replace_route_${ttype}
+ run_ping 64
+ check_exception "" "yes" "redirect exception"
+
+ check_connectivity
+ if [ $? -ne 0 ]; then
+ echo "Error: Basic connectivity is broken after redirect"
+ ret=1
+ return
+ fi
+
+ change_h2_mtu 1300
+ run_ping 1350
+ check_exception "1300" "yes" "redirect exception plus mtu"
+
+ # remove exceptions and restore routing
+ change_h2_mtu 1500
+ eval reset_route_${ttype}
+
+ check_connectivity
+ if [ $? -ne 0 ]; then
+ echo "Error: Basic connectivity is broken after reset"
+ ret=1
+ return
+ fi
+ check_exception "" "no" "routing reset"
+
+ # MTU exception followed by redirect
+ change_h2_mtu 1300
+ run_ping 1350
+ check_exception "1300" "no" "mtu exception"
+
+ eval replace_route_${ttype}
+ run_ping 64
+ check_exception "1300" "yes" "mtu exception plus redirect"
+
+ check_connectivity
+ if [ $? -ne 0 ]; then
+ echo "Error: Basic connectivity is broken after redirect"
+ ret=1
+ return
+ fi
+}
+
+################################################################################
+# usage
+
+usage()
+{
+ cat <<EOF
+usage: ${0##*/} OPTS
+
+ -p Pause on fail
+ -v verbose mode (show commands and output)
+EOF
+}
+
+################################################################################
+# main
+
+# Some systems don't have a ping6 binary anymore; fall back to plain ping
+ping6=$(command -v ping6 2>/dev/null || command -v ping)
+
+ret=0
+nsuccess=0
+nfail=0
+
+while getopts :pv o
+do
+ case $o in
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=$(($VERBOSE + 1));;
+ *) usage; exit 1;;
+ esac
+done
+
+trap cleanup EXIT
+
+cleanup
+WITH_VRF=no
+setup
+
+log_section "Legacy routing"
+do_test "legacy"
+
+cleanup
+log_section "Legacy routing with VRF"
+WITH_VRF=yes
+setup
+do_test "legacy"
+
+cleanup
+log_section "Routing with nexthop objects"
+ip nexthop ls >/dev/null 2>&1
+if [ $? -eq 0 ]; then
+ WITH_VRF=no
+ setup
+ do_test "new"
+
+ cleanup
+ log_section "Routing with nexthop objects and VRF"
+ WITH_VRF=yes
+ setup
+ do_test "new"
+else
+ echo "Nexthop objects not supported; skipping tests"
+fi
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+exit $ret
diff --git a/tools/testing/selftests/net/in_netns.sh b/tools/testing/selftests/net/in_netns.sh
new file mode 100755
index 000000000..88795b510
--- /dev/null
+++ b/tools/testing/selftests/net/in_netns.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Execute a subprocess in a network namespace
+
+set -e
+
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+}
+
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+trap cleanup EXIT
+setup
+
+ip netns exec "${NETNS}" "$@"
+exit "$?"
diff --git a/tools/testing/selftests/net/ip6_gre_headroom.sh b/tools/testing/selftests/net/ip6_gre_headroom.sh
new file mode 100755
index 000000000..5b41e8bb6
--- /dev/null
+++ b/tools/testing/selftests/net/ip6_gre_headroom.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that enough headroom is reserved for the first packet passing through an
+# IPv6 GRE-like netdevice.
+
+setup_prepare()
+{
+ ip link add h1 type veth peer name swp1
+ ip link add h3 type veth peer name swp3
+
+ ip link set dev h1 up
+ ip address add 192.0.2.1/28 dev h1
+
+ ip link add dev vh3 type vrf table 20
+ ip link set dev h3 master vh3
+ ip link set dev vh3 up
+ ip link set dev h3 up
+
+ ip link set dev swp3 up
+ ip address add dev swp3 2001:db8:2::1/64
+ ip address add dev swp3 2001:db8:2::3/64
+
+ ip link set dev swp1 up
+ tc qdisc add dev swp1 clsact
+
+ ip link add name er6 type ip6erspan \
+ local 2001:db8:2::1 remote 2001:db8:2::2 oseq okey 123
+ ip link set dev er6 up
+
+ ip link add name gt6 type ip6gretap \
+ local 2001:db8:2::3 remote 2001:db8:2::4
+ ip link set dev gt6 up
+
+ sleep 1
+}
+
+cleanup()
+{
+ ip link del dev gt6
+ ip link del dev er6
+ ip link del dev swp1
+ ip link del dev swp3
+ ip link del dev vh3
+}
+
+test_headroom()
+{
+ local type=$1; shift
+ local tundev=$1; shift
+
+ tc filter add dev swp1 ingress pref 1000 matchall skip_hw \
+ action mirred egress mirror dev $tundev
+ ping -I h1 192.0.2.2 -c 1 -w 2 &> /dev/null
+ tc filter del dev swp1 ingress pref 1000
+
+ # If it doesn't panic, it passes.
+ printf "TEST: %-60s [PASS]\n" "$type headroom"
+}
+
+trap cleanup EXIT
+
+setup_prepare
+test_headroom ip6gretap gt6
+test_headroom ip6erspan er6
diff --git a/tools/testing/selftests/net/ip_defrag.c b/tools/testing/selftests/net/ip_defrag.c
new file mode 100644
index 000000000..f9ed749fd
--- /dev/null
+++ b/tools/testing/selftests/net/ip_defrag.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+static bool cfg_do_ipv4;
+static bool cfg_do_ipv6;
+static bool cfg_verbose;
+static bool cfg_overlap;
+static bool cfg_permissive;
+static unsigned short cfg_port = 9000;
+
+const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
+const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
+
+#define IP4_HLEN (sizeof(struct iphdr))
+#define IP6_HLEN (sizeof(struct ip6_hdr))
+#define UDP_HLEN (sizeof(struct udphdr))
+
+/* IPv6 fragment header length. */
+#define FRAG_HLEN 8
+
+static int payload_len;
+static int max_frag_len;
+
+#define MSG_LEN_MAX 10000 /* Max UDP payload length. */
+
+#define IP4_MF (1u << 13) /* IPv4 MF flag. */
+#define IP6_MF (1) /* IPv6 MF flag. */
+
+#define CSUM_MANGLED_0 (0xffff)
+
+static uint8_t udp_payload[MSG_LEN_MAX];
+static uint8_t ip_frame[IP_MAXPACKET];
+static uint32_t ip_id = 0xabcd;
+static int msg_counter;
+static int frag_counter;
+static unsigned int seed;
+
+/* Receive a UDP packet. Validate it matches udp_payload. */
+static void recv_validate_udp(int fd_udp)
+{
+ ssize_t ret;
+ static uint8_t recv_buff[MSG_LEN_MAX];
+
+ ret = recv(fd_udp, recv_buff, payload_len, 0);
+ msg_counter++;
+
+ if (cfg_overlap) {
+ if (ret == -1 && (errno == ETIMEDOUT || errno == EAGAIN))
+ return; /* OK */
+ if (!cfg_permissive) {
+ if (ret != -1)
+ error(1, 0, "recv: expected timeout; got %d",
+ (int)ret);
+ error(1, errno, "recv: expected timeout: %d", errno);
+ }
+ }
+
+ if (ret == -1)
+ error(1, errno, "recv: payload_len = %d max_frag_len = %d",
+ payload_len, max_frag_len);
+ if (ret != payload_len)
+ error(1, 0, "recv: wrong size: %d vs %d", (int)ret, payload_len);
+ if (memcmp(udp_payload, recv_buff, payload_len))
+ error(1, 0, "recv: wrong data");
+}
+
+static uint32_t raw_checksum(uint8_t *buf, int len, uint32_t sum)
+{
+ int i;
+
+ for (i = 0; i < (len & ~1U); i += 2) {
+ sum += (u_int16_t)ntohs(*((u_int16_t *)(buf + i)));
+ if (sum > 0xffff)
+ sum -= 0xffff;
+ }
+
+ if (i < len) {
+ sum += buf[i] << 8;
+ if (sum > 0xffff)
+ sum -= 0xffff;
+ }
+
+ return sum;
+}
+
+static uint16_t udp_checksum(struct ip *iphdr, struct udphdr *udphdr)
+{
+ uint32_t sum = 0;
+ uint16_t res;
+
+ sum = raw_checksum((uint8_t *)&iphdr->ip_src, 2 * sizeof(iphdr->ip_src),
+ IPPROTO_UDP + (uint32_t)(UDP_HLEN + payload_len));
+ sum = raw_checksum((uint8_t *)udphdr, UDP_HLEN, sum);
+ sum = raw_checksum((uint8_t *)udp_payload, payload_len, sum);
+ res = 0xffff & ~sum;
+ if (res)
+ return htons(res);
+ else
+ return CSUM_MANGLED_0;
+}
+
+static uint16_t udp6_checksum(struct ip6_hdr *iphdr, struct udphdr *udphdr)
+{
+ uint32_t sum = 0;
+ uint16_t res;
+
+ sum = raw_checksum((uint8_t *)&iphdr->ip6_src, 2 * sizeof(iphdr->ip6_src),
+ IPPROTO_UDP);
+ sum = raw_checksum((uint8_t *)&udphdr->len, sizeof(udphdr->len), sum);
+ sum = raw_checksum((uint8_t *)udphdr, UDP_HLEN, sum);
+ sum = raw_checksum((uint8_t *)udp_payload, payload_len, sum);
+ res = 0xffff & ~sum;
+ if (res)
+ return htons(res);
+ else
+ return CSUM_MANGLED_0;
+}
+
+static void send_fragment(int fd_raw, struct sockaddr *addr, socklen_t alen,
+ int offset, bool ipv6)
+{
+ int frag_len;
+ int res;
+ int payload_offset = offset > 0 ? offset - UDP_HLEN : 0;
+ uint8_t *frag_start = ipv6 ? ip_frame + IP6_HLEN + FRAG_HLEN :
+ ip_frame + IP4_HLEN;
+
+ if (offset == 0) {
+ struct udphdr udphdr;
+ udphdr.source = htons(cfg_port + 1);
+ udphdr.dest = htons(cfg_port);
+ udphdr.len = htons(UDP_HLEN + payload_len);
+ udphdr.check = 0;
+ if (ipv6)
+ udphdr.check = udp6_checksum((struct ip6_hdr *)ip_frame, &udphdr);
+ else
+ udphdr.check = udp_checksum((struct ip *)ip_frame, &udphdr);
+ memcpy(frag_start, &udphdr, UDP_HLEN);
+ }
+
+ if (ipv6) {
+ struct ip6_hdr *ip6hdr = (struct ip6_hdr *)ip_frame;
+ struct ip6_frag *fraghdr = (struct ip6_frag *)(ip_frame + IP6_HLEN);
+ if (payload_len - payload_offset <= max_frag_len && offset > 0) {
+ /* This is the last fragment. */
+ frag_len = FRAG_HLEN + payload_len - payload_offset;
+ fraghdr->ip6f_offlg = htons(offset);
+ } else {
+ frag_len = FRAG_HLEN + max_frag_len;
+ fraghdr->ip6f_offlg = htons(offset | IP6_MF);
+ }
+ ip6hdr->ip6_plen = htons(frag_len);
+ if (offset == 0)
+ memcpy(frag_start + UDP_HLEN, udp_payload,
+ frag_len - FRAG_HLEN - UDP_HLEN);
+ else
+ memcpy(frag_start, udp_payload + payload_offset,
+ frag_len - FRAG_HLEN);
+ frag_len += IP6_HLEN;
+ } else {
+ struct ip *iphdr = (struct ip *)ip_frame;
+ if (payload_len - payload_offset <= max_frag_len && offset > 0) {
+ /* This is the last fragment. */
+ frag_len = IP4_HLEN + payload_len - payload_offset;
+ iphdr->ip_off = htons(offset / 8);
+ } else {
+ frag_len = IP4_HLEN + max_frag_len;
+ iphdr->ip_off = htons(offset / 8 | IP4_MF);
+ }
+ iphdr->ip_len = htons(frag_len);
+ if (offset == 0)
+ memcpy(frag_start + UDP_HLEN, udp_payload,
+ frag_len - IP4_HLEN - UDP_HLEN);
+ else
+ memcpy(frag_start, udp_payload + payload_offset,
+ frag_len - IP4_HLEN);
+ }
+
+ res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
+ if (res < 0 && errno != EPERM)
+ error(1, errno, "send_fragment");
+ if (res >= 0 && res != frag_len)
+ error(1, 0, "send_fragment: %d vs %d", res, frag_len);
+
+ frag_counter++;
+}
+
+static void send_udp_frags(int fd_raw, struct sockaddr *addr,
+ socklen_t alen, bool ipv6)
+{
+ struct ip *iphdr = (struct ip *)ip_frame;
+ struct ip6_hdr *ip6hdr = (struct ip6_hdr *)ip_frame;
+ int res;
+ int offset;
+ int frag_len;
+
+ /* Send the UDP datagram using raw IP fragments: the 0th fragment
+ * has the UDP header; other fragments are pieces of udp_payload
+ * split in chunks of frag_len size.
+ *
+ * Odd fragments (1st, 3rd, 5th, etc.) are sent out first, then
+ * even fragments (0th, 2nd, etc.) are sent out.
+ */
+ if (ipv6) {
+ struct ip6_frag *fraghdr = (struct ip6_frag *)(ip_frame + IP6_HLEN);
+ ((struct sockaddr_in6 *)addr)->sin6_port = 0;
+ memset(ip6hdr, 0, sizeof(*ip6hdr));
+ ip6hdr->ip6_flow = htonl(6<<28); /* Version. */
+ ip6hdr->ip6_nxt = IPPROTO_FRAGMENT;
+ ip6hdr->ip6_hops = 255;
+ ip6hdr->ip6_src = addr6;
+ ip6hdr->ip6_dst = addr6;
+ fraghdr->ip6f_nxt = IPPROTO_UDP;
+ fraghdr->ip6f_reserved = 0;
+ fraghdr->ip6f_ident = htonl(ip_id++);
+ } else {
+ memset(iphdr, 0, sizeof(*iphdr));
+ iphdr->ip_hl = 5;
+ iphdr->ip_v = 4;
+ iphdr->ip_tos = 0;
+ iphdr->ip_id = htons(ip_id++);
+ iphdr->ip_ttl = 0x40;
+ iphdr->ip_p = IPPROTO_UDP;
+ iphdr->ip_src.s_addr = htonl(INADDR_LOOPBACK);
+ iphdr->ip_dst = addr4;
+ iphdr->ip_sum = 0;
+ }
+
+ /* Occasionally test in-order fragments. */
+ if (!cfg_overlap && (rand() % 100 < 15)) {
+ offset = 0;
+ while (offset < (UDP_HLEN + payload_len)) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += max_frag_len;
+ }
+ return;
+ }
+
+ /* Occasionally test IPv4 "runs" (see net/ipv4/ip_fragment.c) */
+ if (!cfg_overlap && (rand() % 100 < 20) &&
+ (payload_len > 9 * max_frag_len)) {
+ offset = 6 * max_frag_len;
+ while (offset < (UDP_HLEN + payload_len)) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += max_frag_len;
+ }
+ offset = 3 * max_frag_len;
+ while (offset < 6 * max_frag_len) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += max_frag_len;
+ }
+ offset = 0;
+ while (offset < 3 * max_frag_len) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += max_frag_len;
+ }
+ return;
+ }
+
+ /* Odd fragments. */
+ offset = max_frag_len;
+ while (offset < (UDP_HLEN + payload_len)) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ /* IPv4 ignores duplicates, so randomly send a duplicate. */
+ if (rand() % 100 == 1)
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += 2 * max_frag_len;
+ }
+
+ if (cfg_overlap) {
+ /* Send an extra random fragment.
+ *
+ * Duplicates and some fragments completely inside
+ * previously sent fragments are dropped/ignored. So
+ * random offset and frag_len can result in a dropped
+ * fragment instead of a dropped queue/packet. Thus we
+ * hard-code offset and frag_len.
+ */
+ if (max_frag_len * 4 < payload_len || max_frag_len < 16) {
+ /* not enough payload for random offset and frag_len. */
+ offset = 8;
+ frag_len = UDP_HLEN + max_frag_len;
+ } else {
+ offset = rand() % (payload_len / 2);
+ frag_len = 2 * max_frag_len + 1 + rand() % 256;
+ }
+ if (ipv6) {
+ struct ip6_frag *fraghdr = (struct ip6_frag *)(ip_frame + IP6_HLEN);
+ /* sendto() returns EINVAL if offset + frag_len is too small. */
+ /* In IPv6 if !!(frag_len % 8), the fragment is dropped. */
+ frag_len &= ~0x7;
+ fraghdr->ip6f_offlg = htons(offset / 8 | IP6_MF);
+ ip6hdr->ip6_plen = htons(frag_len);
+ frag_len += IP6_HLEN;
+ } else {
+ frag_len += IP4_HLEN;
+ iphdr->ip_off = htons(offset / 8 | IP4_MF);
+ iphdr->ip_len = htons(frag_len);
+ }
+ res = sendto(fd_raw, ip_frame, frag_len, 0, addr, alen);
+ if (res < 0 && errno != EPERM)
+ error(1, errno, "sendto overlap: %d", frag_len);
+ if (res >= 0 && res != frag_len)
+ error(1, 0, "sendto overlap: %d vs %d", (int)res, frag_len);
+ frag_counter++;
+ }
+
+ /* Even fragments. */
+ offset = 0;
+ while (offset < (UDP_HLEN + payload_len)) {
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ /* IPv4 ignores duplicates, so randomly send a duplicate. */
+ if (rand() % 100 == 1)
+ send_fragment(fd_raw, addr, alen, offset, ipv6);
+ offset += 2 * max_frag_len;
+ }
+}
+
+static void run_test(struct sockaddr *addr, socklen_t alen, bool ipv6)
+{
+ int fd_tx_raw, fd_rx_udp;
+ /* Frag queue timeout is set to one second in the calling script;
+ * socket timeout should be just a bit longer to avoid tests interfering
+ * with each other.
+ */
+ struct timeval tv = { .tv_sec = 1, .tv_usec = 10 };
+ int idx;
+ int min_frag_len = 8;
+
+ /* Initialize the payload. */
+ for (idx = 0; idx < MSG_LEN_MAX; ++idx)
+ udp_payload[idx] = idx % 256;
+
+ /* Open sockets. */
+ fd_tx_raw = socket(addr->sa_family, SOCK_RAW, IPPROTO_RAW);
+ if (fd_tx_raw == -1)
+ error(1, errno, "socket tx_raw");
+
+ fd_rx_udp = socket(addr->sa_family, SOCK_DGRAM, 0);
+ if (fd_rx_udp == -1)
+ error(1, errno, "socket rx_udp");
+ if (bind(fd_rx_udp, addr, alen))
+ error(1, errno, "bind");
+ /* Fail fast. */
+ if (setsockopt(fd_rx_udp, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+ error(1, errno, "setsockopt rcv timeout");
+
+ for (payload_len = min_frag_len; payload_len < MSG_LEN_MAX;
+ payload_len += (rand() % 4096)) {
+ if (cfg_verbose)
+ printf("payload_len: %d\n", payload_len);
+
+ if (cfg_overlap) {
+ /* With overlaps, one send/receive pair below takes
+ * at least one second (== timeout) to run, so there
+ * is not enough test time to run a nested loop:
+ * the full overlap test takes 20-30 seconds.
+ */
+ max_frag_len = min_frag_len +
+ rand() % (1500 - FRAG_HLEN - min_frag_len);
+ send_udp_frags(fd_tx_raw, addr, alen, ipv6);
+ recv_validate_udp(fd_rx_udp);
+ } else {
+ /* Without overlaps, each packet reassembly (== one
+ * send/receive pair below) takes very little time to
+ * run, so we can easily afford more thorough testing
+ * with a nested loop: the full non-overlap test takes
+ * less than one second.
+ */
+ max_frag_len = min_frag_len;
+ do {
+ send_udp_frags(fd_tx_raw, addr, alen, ipv6);
+ recv_validate_udp(fd_rx_udp);
+ max_frag_len += 8 * (rand() % 8);
+ } while (max_frag_len < (1500 - FRAG_HLEN) &&
+ max_frag_len <= payload_len);
+ }
+ }
+
+ /* Cleanup. */
+ if (close(fd_tx_raw))
+ error(1, errno, "close tx_raw");
+ if (close(fd_rx_udp))
+ error(1, errno, "close rx_udp");
+
+ if (cfg_verbose)
+ printf("processed %d messages, %d fragments\n",
+ msg_counter, frag_counter);
+
+ fprintf(stderr, "PASS\n");
+}
+
+
+static void run_test_v4(void)
+{
+ struct sockaddr_in addr = {0};
+
+ addr.sin_family = AF_INET;
+ addr.sin_port = htons(cfg_port);
+ addr.sin_addr = addr4;
+
+ run_test((void *)&addr, sizeof(addr), false /* !ipv6 */);
+}
+
+static void run_test_v6(void)
+{
+ struct sockaddr_in6 addr = {0};
+
+ addr.sin6_family = AF_INET6;
+ addr.sin6_port = htons(cfg_port);
+ addr.sin6_addr = addr6;
+
+ run_test((void *)&addr, sizeof(addr), true /* ipv6 */);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "46opv")) != -1) {
+ switch (c) {
+ case '4':
+ cfg_do_ipv4 = true;
+ break;
+ case '6':
+ cfg_do_ipv6 = true;
+ break;
+ case 'o':
+ cfg_overlap = true;
+ break;
+ case 'p':
+ cfg_permissive = true;
+ break;
+ case 'v':
+ cfg_verbose = true;
+ break;
+ default:
+ error(1, 0, "%s: parse error", argv[0]);
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	seed = time(NULL);
+	srand(seed);
+	/* Print the seed (unsigned, hence %u) to track/reproduce failures. */
+	printf("seed = %u\n", seed);
+
+	if (cfg_do_ipv4)
+		run_test_v4();
+	if (cfg_do_ipv6)
+		run_test_v6();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/ip_defrag.sh b/tools/testing/selftests/net/ip_defrag.sh
new file mode 100755
index 000000000..ceb7ad4db
--- /dev/null
+++ b/tools/testing/selftests/net/ip_defrag.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a couple of IP defragmentation tests.
+
+set +x
+set -e
+
+modprobe -q nf_defrag_ipv6
+
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+
+ ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_high_thresh=9000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_low_thresh=7000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.ipv4.ipfrag_time=1 >/dev/null 2>&1
+
+ ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_high_thresh=9000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_low_thresh=7000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_time=1 >/dev/null 2>&1
+
+ ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_high_thresh=9000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_low_thresh=7000000 >/dev/null 2>&1
+ ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_timeout=1 >/dev/null 2>&1
+
+ # DST cache can get full with a lot of frags, with GC not keeping up with the test.
+ ip netns exec "${NETNS}" sysctl -w net.ipv6.route.max_size=65536 >/dev/null 2>&1
+}
+
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+trap cleanup EXIT
+setup
+
+echo "ipv4 defrag"
+ip netns exec "${NETNS}" ./ip_defrag -4
+
+echo "ipv4 defrag with overlaps"
+ip netns exec "${NETNS}" ./ip_defrag -4o
+
+echo "ipv6 defrag"
+ip netns exec "${NETNS}" ./ip_defrag -6
+
+echo "ipv6 defrag with overlaps"
+ip netns exec "${NETNS}" ./ip_defrag -6o
+
+# insert an nf_conntrack rule so that the codepath in nf_conntrack_reasm.c is taken
+ip netns exec "${NETNS}" ip6tables -A INPUT -m conntrack --ctstate INVALID -j ACCEPT
+
+echo "ipv6 nf_conntrack defrag"
+ip netns exec "${NETNS}" ./ip_defrag -6
+
+echo "ipv6 nf_conntrack defrag with overlaps"
+# netfilter will drop some invalid packets, so we run the test in
+# permissive mode: i.e. pass the test if the packet is correctly assembled
+# even if we sent an overlap
+ip netns exec "${NETNS}" ./ip_defrag -6op
+
+echo "all tests done"
diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c
new file mode 100644
index 000000000..03b048b66
--- /dev/null
+++ b/tools/testing/selftests/net/ipsec.c
@@ -0,0 +1,2195 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ipsec.c - Check xfrm on veth inside a net-ns.
+ * Copyright (c) 2018 Dmitry Safonov
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <asm/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/limits.h>
+#include <linux/netlink.h>
+#include <linux/random.h>
+#include <linux/rtnetlink.h>
+#include <linux/veth.h>
+#include <linux/xfrm.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define printk(fmt, ...) \
+ ksft_print_msg("%d[%u] " fmt "\n", getpid(), __LINE__, ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) printk(fmt ": %m", ##__VA_ARGS__)
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define IPV4_STR_SZ 16 /* xxx.xxx.xxx.xxx is longest + \0 */
+#define MAX_PAYLOAD 2048
+#define XFRM_ALGO_KEY_BUF_SIZE 512
+#define MAX_PROCESSES (1 << 14) /* /16 mask divided by /30 subnets */
+#define INADDR_A ((in_addr_t) 0x0a000000) /* 10.0.0.0 */
+#define INADDR_B ((in_addr_t) 0xc0a80000) /* 192.168.0.0 */
+
+/* /30 mask for one veth connection */
+#define PREFIX_LEN 30
+/*
+ * Host numbers inside the nr-th block of four addresses (one /30 subnet):
+ * offset 1 for the child, offset 2 for the grandchild.
+ * The argument is parenthesised so that expressions such as
+ * child_ip(a + b) expand correctly.
+ */
+#define child_ip(nr) (4 * (nr) + 1)
+#define grchild_ip(nr) (4 * (nr) + 2)
+
+#define VETH_FMT "ktst-%d"
+#define VETH_LEN 12
+
+static int nsfd_parent = -1;
+static int nsfd_childa = -1;
+static int nsfd_childb = -1;
+static long page_size;
+
+/*
+ * ksft_cnt is static in kselftest, so isn't shared with children.
+ * We have to send a test result back to parent and count there.
+ * results_fd is a pipe with test feedback from children.
+ */
+static int results_fd[2];
+
+/* Pause between two ping attempts; used as tv_nsec for nanosleep() in do_ping(). */
+const unsigned int ping_delay_nsec = 50 * 1000 * 1000;
+/* Receive timeout; udp_ping_init() stores it in tv_usec of SO_RCVTIMEO. */
+const unsigned int ping_timeout = 300;
+/* Number of ping attempts made by one do_ping() call. */
+const unsigned int ping_count = 100;
+/* Minimum number of successful attempts for do_ping() to report success. */
+const unsigned int ping_success = 80;
+
+/*
+ * Fill @buf with @buflen bytes taken from rand().
+ * Whole int-sized words are written directly; a trailing partial word
+ * is filled from one extra rand() value.
+ */
+static void randomize_buffer(void *buf, size_t buflen)
+{
+	const size_t full = buflen / sizeof(int);
+	const size_t tail = buflen % sizeof(int);
+	int *dst = (int *)buf;
+	size_t i;
+
+	for (i = 0; i < full; i++)
+		dst[i] = rand();
+
+	if (tail) {
+		int last = rand();
+
+		/* the partial word starts right after the full ones */
+		memcpy((char *)buf + full * sizeof(int), &last, tail);
+	}
+}
+
+/*
+ * Move the calling process into a fresh network namespace and return
+ * an fd referring to it (opened from /proc/self/ns/net).
+ * Returns -1 on failure.
+ */
+static int unshare_open(void)
+{
+	const char *netns_path = "/proc/self/ns/net";
+	int nsfd;
+
+	if (unshare(CLONE_NEWNET)) {
+		pr_err("unshare()");
+		return -1;
+	}
+
+	nsfd = open(netns_path, O_RDONLY);
+	if (nsfd > 0)
+		return nsfd;
+
+	pr_err("open(%s)", netns_path);
+	return -1;
+}
+
+static int switch_ns(int fd)
+{
+ if (setns(fd, CLONE_NEWNET)) {
+ pr_err("setns()");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Running the test inside a new parent net namespace to bother less
+ * about cleanup on error-path.
+ *
+ * Creates three network namespaces and stores an fd for each of them in
+ * nsfd_parent, nsfd_childa and nsfd_childb. unshare_open() leaves the
+ * process inside the namespace it has just created, so the process is
+ * switched back to the parent namespace after each child is created.
+ * Returns 0 on success, -1 on failure (fds opened so far are not closed).
+ */
+static int init_namespaces(void)
+{
+ nsfd_parent = unshare_open();
+ if (nsfd_parent <= 0)
+ return -1;
+
+ nsfd_childa = unshare_open();
+ if (nsfd_childa <= 0)
+ return -1;
+
+ /* go back to the parent namespace before creating the second child */
+ if (switch_ns(nsfd_parent))
+ return -1;
+
+ nsfd_childb = unshare_open();
+ if (nsfd_childb <= 0)
+ return -1;
+
+ /* finish in the parent namespace */
+ if (switch_ns(nsfd_parent))
+ return -1;
+ return 0;
+}
+
+/*
+ * Make sure *@sock is an open netlink socket for protocol @proto.
+ *
+ * If *@sock is already open, only the sequence number is advanced.
+ * Otherwise a new socket is created and *@seq_nr is seeded with a random
+ * value.
+ * Returns 0 on success, -1 if the socket could not be created.
+ */
+static int netlink_sock(int *sock, uint32_t *seq_nr, int proto)
+{
+	if (*sock > 0) {
+		/* bump the sequence number itself, not the local pointer */
+		(*seq_nr)++;
+		return 0;
+	}
+
+	*sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto);
+	if (*sock <= 0) {
+		pr_err("socket(AF_NETLINK)");
+		return -1;
+	}
+
+	randomize_buffer(seq_nr, sizeof(*seq_nr));
+
+	return 0;
+}
+
+/*
+ * Return the address right behind the current (aligned) end of the
+ * message @nh, i.e. where the next attribute is going to be placed.
+ */
+static inline struct rtattr *rtattr_hdr(struct nlmsghdr *nh)
+{
+ return (struct rtattr *)((char *)(nh) + RTA_ALIGN((nh)->nlmsg_len));
+}
+
+/*
+ * Append an attribute of type @rta_type with @size bytes of @payload to
+ * the end of the message @nh and grow nh->nlmsg_len to cover it.
+ * @req_sz is the size of the whole request buffer that starts at @nh.
+ * Returns 0 on success, -1 if the attribute does not fit into @req_sz.
+ */
+static int rtattr_pack(struct nlmsghdr *nh, size_t req_sz,
+ unsigned short rta_type, const void *payload, size_t size)
+{
+ /* NLMSG_ALIGNTO == RTA_ALIGNTO, nlmsg_len already aligned */
+ struct rtattr *attr = rtattr_hdr(nh);
+ size_t nl_size = RTA_ALIGN(nh->nlmsg_len) + RTA_LENGTH(size);
+
+ /* check before touching the buffer: attr may point past its end */
+ if (req_sz < nl_size) {
+ printk("req buf is too small: %zu < %zu", req_sz, nl_size);
+ return -1;
+ }
+ nh->nlmsg_len = nl_size;
+
+ attr->rta_len = RTA_LENGTH(size);
+ attr->rta_type = rta_type;
+ memcpy(RTA_DATA(attr), payload, size);
+
+ return 0;
+}
+
+/*
+ * Start a nested attribute of type @rta_type whose own payload is the
+ * @size bytes at @payload. Returns the attribute header, to be handed to
+ * rtattr_end() once all nested attributes are packed, or NULL if the
+ * attribute does not fit into the request buffer.
+ */
+static struct rtattr *_rtattr_begin(struct nlmsghdr *nh, size_t req_sz,
+		unsigned short rta_type, const void *payload, size_t size)
+{
+	/* remember the position before rtattr_pack() moves the message end */
+	struct rtattr *start = rtattr_hdr(nh);
+
+	return rtattr_pack(nh, req_sz, rta_type, payload, size) ? NULL : start;
+}
+
+/* Start a nested attribute of type @rta_type that has no payload of its own. */
+static inline struct rtattr *rtattr_begin(struct nlmsghdr *nh, size_t req_sz,
+		unsigned short rta_type)
+{
+	struct rtattr *nested = _rtattr_begin(nh, req_sz, rta_type, NULL, 0);
+
+	return nested;
+}
+
+/*
+ * Close a nested attribute opened with rtattr_begin()/_rtattr_begin():
+ * its length becomes the distance from @attr to the current message end.
+ */
+static inline void rtattr_end(struct nlmsghdr *nh, struct rtattr *attr)
+{
+	const char *start = (const char *)attr;
+	const char *end = (const char *)nh + nh->nlmsg_len;
+
+	attr->rta_len = end - start;
+}
+
+/*
+ * Pack the VETH_INFO_PEER attribute describing the second end of a veth
+ * pair: an ifinfomsg header followed by the nested IFLA_IFNAME (@peer,
+ * packed with strlen(), i.e. without the trailing NUL) and IFLA_NET_NS_FD
+ * (@ns) attributes.
+ * Returns 0 on success, -1 if the request buffer is too small.
+ */
+static int veth_pack_peerb(struct nlmsghdr *nh, size_t req_sz,
+ const char *peer, int ns)
+{
+ struct ifinfomsg pi;
+ struct rtattr *peer_attr;
+
+ memset(&pi, 0, sizeof(pi));
+ pi.ifi_family = AF_UNSPEC;
+ pi.ifi_change = 0xFFFFFFFF;
+
+ peer_attr = _rtattr_begin(nh, req_sz, VETH_INFO_PEER, &pi, sizeof(pi));
+ if (!peer_attr)
+ return -1;
+
+ if (rtattr_pack(nh, req_sz, IFLA_IFNAME, peer, strlen(peer)))
+ return -1;
+
+ if (rtattr_pack(nh, req_sz, IFLA_NET_NS_FD, &ns, sizeof(ns)))
+ return -1;
+
+ /* make VETH_INFO_PEER span the two attributes packed above */
+ rtattr_end(nh, peer_attr);
+
+ return 0;
+}
+
+/*
+ * Read one reply from @sock and expect it to be an NLMSG_ERROR message.
+ * Returns 0 when the reply carries error code 0 (an ACK), the error code
+ * from the reply when it is non-zero, and -1 when recv() fails or the
+ * reply has a different type.
+ */
+static int netlink_check_answer(int sock)
+{
+	struct {
+		struct nlmsghdr hdr;
+		int error;
+		struct nlmsghdr orig_msg;
+	} reply;
+
+	if (recv(sock, &reply, sizeof(reply), 0) < 0) {
+		pr_err("recv()");
+		return -1;
+	}
+
+	if (reply.hdr.nlmsg_type != NLMSG_ERROR) {
+		printk("expected NLMSG_ERROR, got %d", (int)reply.hdr.nlmsg_type);
+		return -1;
+	}
+
+	if (!reply.error)
+		return 0;
+
+	printk("NLMSG_ERROR: %d: %s",
+			reply.error, strerror(-reply.error));
+	return reply.error;
+}
+
+/*
+ * Create a veth pair with one RTM_NEWLINK request: the end named @peera
+ * is placed into the namespace given by the fd @ns_a, the end named
+ * @peerb into the one given by @ns_b.
+ * Returns 0 on success, non-zero on failure (see netlink_check_answer()).
+ */
+static int veth_add(int sock, uint32_t seq, const char *peera, int ns_a,
+ const char *peerb, int ns_b)
+{
+ uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+ struct {
+ struct nlmsghdr nh;
+ struct ifinfomsg info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ const char veth_type[] = "veth";
+ struct rtattr *link_info, *info_data;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = RTM_NEWLINK;
+ req.nh.nlmsg_flags = flags;
+ req.nh.nlmsg_seq = seq;
+ req.info.ifi_family = AF_UNSPEC;
+ req.info.ifi_change = 0xFFFFFFFF;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_IFNAME, peera, strlen(peera)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_NET_NS_FD, &ns_a, sizeof(ns_a)))
+ return -1;
+
+ /* IFLA_LINKINFO { IFLA_INFO_KIND "veth", IFLA_INFO_DATA { VETH_INFO_PEER } } */
+ link_info = rtattr_begin(&req.nh, sizeof(req), IFLA_LINKINFO);
+ if (!link_info)
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFLA_INFO_KIND, veth_type, sizeof(veth_type)))
+ return -1;
+
+ info_data = rtattr_begin(&req.nh, sizeof(req), IFLA_INFO_DATA);
+ if (!info_data)
+ return -1;
+
+ if (veth_pack_peerb(&req.nh, sizeof(req), peerb, ns_b))
+ return -1;
+
+ /* close the nested attributes, innermost first */
+ rtattr_end(&req.nh, info_data);
+ rtattr_end(&req.nh, link_info);
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+ return netlink_check_answer(sock);
+}
+
+/*
+ * Assign the IPv4 address @addr with prefix length @prefix to the
+ * interface @intf by sending an RTM_NEWADDR request.
+ * Returns 0 on success, non-zero on failure (see netlink_check_answer()).
+ */
+static int ip4_addr_set(int sock, uint32_t seq, const char *intf,
+ struct in_addr addr, uint8_t prefix)
+{
+ uint16_t flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE;
+ struct {
+ struct nlmsghdr nh;
+ struct ifaddrmsg info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = RTM_NEWADDR;
+ req.nh.nlmsg_flags = flags;
+ req.nh.nlmsg_seq = seq;
+ req.info.ifa_family = AF_INET;
+ req.info.ifa_prefixlen = prefix;
+ /* the result of if_nametoindex() is used as is, without a check for 0 */
+ req.info.ifa_index = if_nametoindex(intf);
+
+#ifdef DEBUG
+ {
+ char addr_str[IPV4_STR_SZ] = {};
+
+ strncpy(addr_str, inet_ntoa(addr), IPV4_STR_SZ - 1);
+
+ printk("ip addr set %s", addr_str);
+ }
+#endif
+
+ /* the same address is used for both IFA_LOCAL and IFA_ADDRESS */
+ if (rtattr_pack(&req.nh, sizeof(req), IFA_LOCAL, &addr, sizeof(addr)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), IFA_ADDRESS, &addr, sizeof(addr)))
+ return -1;
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+ return netlink_check_answer(sock);
+}
+
+/*
+ * Bring the interface @intf up by sending an RTM_NEWLINK request that
+ * sets the IFF_UP flag.
+ * Returns 0 on success, non-zero on failure (interface not found, send
+ * error or an error reported by the kernel).
+ */
+static int link_set_up(int sock, uint32_t seq, const char *intf)
+{
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+		char attrbuf[MAX_PAYLOAD];
+	} req;
+	unsigned int index = if_nametoindex(intf);
+
+	/* fail early with a clear message instead of sending index 0 */
+	if (!index) {
+		pr_err("if_nametoindex(%s)", intf);
+		return -1;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+	req.nh.nlmsg_type = RTM_NEWLINK;
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_seq = seq;
+	req.info.ifi_family = AF_UNSPEC;
+	req.info.ifi_index = index;
+	req.info.ifi_flags = IFF_UP;
+	/* only the IFF_UP bit is to be changed */
+	req.info.ifi_change = IFF_UP;
+
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		pr_err("send()");
+		return -1;
+	}
+	return netlink_check_answer(sock);
+}
+
+/*
+ * Add a host (/32) route to @dst through the interface @intf with @src
+ * as the preferred source address, using an RTM_NEWROUTE request.
+ * Returns 0 on success, non-zero on failure (see netlink_check_answer()).
+ */
+static int ip4_route_set(int sock, uint32_t seq, const char *intf,
+ struct in_addr src, struct in_addr dst)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct rtmsg rt;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ /* the result of if_nametoindex() is used as is, without a check for 0 */
+ unsigned int index = if_nametoindex(intf);
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.rt));
+ req.nh.nlmsg_type = RTM_NEWROUTE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE;
+ req.nh.nlmsg_seq = seq;
+ req.rt.rtm_family = AF_INET;
+ req.rt.rtm_dst_len = 32;
+ req.rt.rtm_table = RT_TABLE_MAIN;
+ req.rt.rtm_protocol = RTPROT_BOOT;
+ req.rt.rtm_scope = RT_SCOPE_LINK;
+ req.rt.rtm_type = RTN_UNICAST;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_DST, &dst, sizeof(dst)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_PREFSRC, &src, sizeof(src)))
+ return -1;
+
+ if (rtattr_pack(&req.nh, sizeof(req), RTA_OIF, &index, sizeof(index)))
+ return -1;
+
+ if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(sock);
+}
+
+/*
+ * Set up tunnel addressing: put @tunsrc on "lo" and route @tundst through
+ * @veth with @tunsrc as the preferred source. *@route_seq is advanced
+ * once per request that gets sent.
+ * Returns 0 on success, -1 on failure.
+ */
+static int tunnel_set_route(int route_sock, uint32_t *route_seq, char *veth,
+		struct in_addr tunsrc, struct in_addr tundst)
+{
+	int err;
+
+	err = ip4_addr_set(route_sock, (*route_seq)++, "lo", tunsrc, PREFIX_LEN);
+	if (err) {
+		printk("Failed to set ipv4 addr");
+		return -1;
+	}
+
+	err = ip4_route_set(route_sock, (*route_seq)++, veth, tunsrc, tundst);
+	if (err) {
+		printk("Failed to set ipv4 route");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Configure networking for one test process: enter the namespace @nsfd,
+ * give @veth an address built from INADDR_B and host number @src, bring
+ * the interface up and set up the tunnel address/route built from
+ * INADDR_A with host numbers @src and @dst.
+ * Returns 0 on success, -1 on failure.
+ */
+static int init_child(int nsfd, char *veth, unsigned int src, unsigned int dst)
+{
+ struct in_addr intsrc = inet_makeaddr(INADDR_B, src);
+ struct in_addr tunsrc = inet_makeaddr(INADDR_A, src);
+ struct in_addr tundst = inet_makeaddr(INADDR_A, dst);
+ int route_sock = -1, ret = -1;
+ uint32_t route_seq;
+
+ if (switch_ns(nsfd))
+ return -1;
+
+ /* route_sock is -1 here, so a new socket is opened and route_seq seeded */
+ if (netlink_sock(&route_sock, &route_seq, NETLINK_ROUTE)) {
+ printk("Failed to open netlink route socket in child");
+ return -1;
+ }
+
+ if (ip4_addr_set(route_sock, route_seq++, veth, intsrc, PREFIX_LEN)) {
+ printk("Failed to set ipv4 addr");
+ goto err;
+ }
+
+ if (link_set_up(route_sock, route_seq++, veth)) {
+ printk("Failed to bring up %s", veth);
+ goto err;
+ }
+
+ if (tunnel_set_route(route_sock, &route_seq, veth, tunsrc, tundst)) {
+ printk("Failed to add tunnel route on %s", veth);
+ goto err;
+ }
+ ret = 0;
+
+ /* reached on success as well: the route socket is closed either way */
+err:
+ close(route_sock);
+ return ret;
+}
+
+/* Size of the algorithm name buffers in struct xfrm_desc */
+#define ALGO_LEN 64
+/* Kinds of test a struct xfrm_desc can describe */
+enum desc_type {
+ CREATE_TUNNEL = 0,
+ ALLOCATE_SPI,
+ MONITOR_ACQUIRE,
+ EXPIRE_STATE,
+ EXPIRE_POLICY,
+};
+/* Printable names, listed in the same order as enum desc_type */
+const char *desc_name[] = {
+ "create tunnel",
+ "alloc spi",
+ "monitor acquire",
+ "expire state",
+ "expire policy"
+};
+/*
+ * Description of one xfrm test: its type, the IPsec protocol and the
+ * algorithm names. xfrm_state_pack_algo() reads a_algo as the
+ * authentication, e_algo as the encryption, c_algo as the compression
+ * and ae_algo as the AEAD algorithm; icv_len is used for AEAD only.
+ */
+struct xfrm_desc {
+ enum desc_type type;
+ uint8_t proto;
+ char a_algo[ALGO_LEN];
+ char e_algo[ALGO_LEN];
+ char c_algo[ALGO_LEN];
+ char ae_algo[ALGO_LEN];
+ unsigned int icv_len;
+ /* unsigned key_len; */
+};
+
+/* Types of the messages exchanged with write_msg()/read_msg() */
+enum msg_type {
+ MSG_ACK = 0,
+ MSG_EXIT,
+ MSG_PING,
+ MSG_XFRM_PREPARE,
+ MSG_XFRM_ADD,
+ MSG_XFRM_DEL,
+ MSG_XFRM_CLEANUP,
+};
+
+/*
+ * One message as sent by write_msg() and received by read_msg().
+ * do_ping() fills body.ping for MSG_PING with the address and UDP port
+ * it listens on; body.xfrm_desc is presumably used by the MSG_XFRM_*
+ * messages (their handlers are outside this part of the file).
+ */
+struct test_desc {
+ enum msg_type type;
+ union {
+ struct {
+ in_addr_t reply_ip;
+ unsigned int port;
+ } ping;
+ struct xfrm_desc xfrm_desc;
+ } body;
+};
+
+/* Result record written to the results_fd pipe by write_test_result() */
+struct test_result {
+ struct xfrm_desc desc;
+ unsigned int res;
+};
+
+/*
+ * Send the result @res of the test described by @d through the
+ * results_fd pipe. A failed or short write is only logged.
+ */
+static void write_test_result(unsigned int res, struct xfrm_desc *d)
+{
+	struct test_result tr = {};
+	ssize_t written;
+
+	tr.res = res;
+	tr.desc = *d;
+
+	written = write(results_fd[1], &tr, sizeof(tr));
+	if (written == sizeof(tr))
+		return;
+
+	pr_err("Failed to write the result in pipe %zd", written);
+}
+
+/*
+ * Write the message @msg to @fd.
+ * A failed or short write is logged exactly once; if @exit_of_fail is
+ * set the process then exits with KSFT_FAIL, otherwise the function
+ * just returns.
+ */
+static void write_msg(int fd, struct test_desc *msg, bool exit_of_fail)
+{
+	ssize_t bytes = write(fd, msg, sizeof(*msg));
+
+	/* Make sure that write/read is atomic to a pipe */
+	BUILD_BUG_ON(sizeof(struct test_desc) > PIPE_BUF);
+
+	if (bytes < 0) {
+		pr_err("write()");
+		if (exit_of_fail)
+			exit(KSFT_FAIL);
+	} else if (bytes != sizeof(*msg)) {
+		/* errno is not meaningful for a short write: no ": %m" here */
+		printk("sent part of the message %zd/%zu", bytes, sizeof(*msg));
+		if (exit_of_fail)
+			exit(KSFT_FAIL);
+	}
+}
+
+/*
+ * Read one message from @fd into @msg.
+ * A failed or short read (including EOF) is logged exactly once; if
+ * @exit_of_fail is set the process then exits with KSFT_FAIL, otherwise
+ * the function just returns and @msg may be incomplete.
+ */
+static void read_msg(int fd, struct test_desc *msg, bool exit_of_fail)
+{
+	ssize_t bytes = read(fd, msg, sizeof(*msg));
+
+	if (bytes < 0) {
+		pr_err("read()");
+		if (exit_of_fail)
+			exit(KSFT_FAIL);
+	} else if (bytes != sizeof(*msg)) {
+		/* errno is not meaningful for a short read: no ": %m" here */
+		printk("got incomplete message %zd/%zu", bytes, sizeof(*msg));
+		if (exit_of_fail)
+			exit(KSFT_FAIL);
+	}
+}
+
+/*
+ * Create the two UDP sockets used for pinging.
+ *
+ * sock[0] is bound to @listen_ip on a port picked by the kernel and gets
+ * a receive timeout of @u_timeout microseconds; the chosen port is
+ * stored in *@server_port in host byte order. sock[1] is left unbound
+ * and is used for sending.
+ * Returns 0 on success; on failure returns -1 with no socket left open.
+ */
+static int udp_ping_init(struct in_addr listen_ip, unsigned int u_timeout,
+		unsigned int *server_port, int sock[2])
+{
+	struct sockaddr_in server;
+	struct timeval t = { .tv_sec = 0, .tv_usec = u_timeout };
+	socklen_t s_len = sizeof(server);
+
+	sock[0] = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sock[0] < 0) {
+		pr_err("socket()");
+		return -1;
+	}
+
+	/* do not hand uninitialised padding (sin_zero) to bind() */
+	memset(&server, 0, sizeof(server));
+	server.sin_family = AF_INET;
+	server.sin_port = 0;
+	memcpy(&server.sin_addr.s_addr, &listen_ip, sizeof(struct in_addr));
+
+	if (bind(sock[0], (struct sockaddr *)&server, s_len)) {
+		pr_err("bind()");
+		goto err_close_server;
+	}
+
+	/* find out which port the kernel has picked */
+	if (getsockname(sock[0], (struct sockaddr *)&server, &s_len)) {
+		pr_err("getsockname()");
+		goto err_close_server;
+	}
+
+	*server_port = ntohs(server.sin_port);
+
+	if (setsockopt(sock[0], SOL_SOCKET, SO_RCVTIMEO, (const char *)&t, sizeof t)) {
+		pr_err("setsockopt()");
+		goto err_close_server;
+	}
+
+	sock[1] = socket(AF_INET, SOCK_DGRAM, 0);
+	if (sock[1] < 0) {
+		pr_err("socket()");
+		goto err_close_server;
+	}
+
+	return 0;
+
+err_close_server:
+	close(sock[0]);
+	return -1;
+}
+
+/*
+ * Initiating side of one ping: send @buf_len bytes of @buf from sock[1]
+ * to @dest_ip:@port, then wait on sock[0] for a reply and check that it
+ * is identical to what was sent.
+ * Returns 0 on success, -1 on any error, timeout or mismatch (a receive
+ * timeout, EAGAIN, is not logged).
+ */
+static int udp_ping_send(int sock[2], in_addr_t dest_ip, unsigned int port,
+		char *buf, size_t buf_len)
+{
+	struct sockaddr_in server;
+	const struct sockaddr *dest_addr = (struct sockaddr *)&server;
+	/* receive buffer of buf_len bytes (not buf_len pointers) */
+	char sock_buf[buf_len];
+	ssize_t r_bytes, s_bytes;
+
+	server.sin_family = AF_INET;
+	server.sin_port = htons(port);
+	server.sin_addr.s_addr = dest_ip;
+
+	s_bytes = sendto(sock[1], buf, buf_len, 0, dest_addr, sizeof(server));
+	if (s_bytes < 0) {
+		pr_err("sendto()");
+		return -1;
+	} else if ((size_t)s_bytes != buf_len) {
+		printk("send part of the message: %zd/%zu", s_bytes, buf_len);
+		return -1;
+	}
+
+	r_bytes = recv(sock[0], sock_buf, buf_len, 0);
+	if (r_bytes < 0) {
+		if (errno != EAGAIN)
+			pr_err("recv()");
+		return -1;
+	} else if (r_bytes == 0) { /* EOF */
+		printk("EOF on reply to ping");
+		return -1;
+	} else if ((size_t)r_bytes != buf_len || memcmp(buf, sock_buf, buf_len)) {
+		printk("ping reply packet is corrupted %zd/%zu", r_bytes, buf_len);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Replying side of one ping: wait on sock[0] for a packet, check that it
+ * is identical to the @buf_len bytes of @buf, then send @buf from sock[1]
+ * to @dest_ip:@port.
+ * Returns 0 on success, -1 on any error, timeout or mismatch (a receive
+ * timeout, EAGAIN, is not logged).
+ */
+static int udp_ping_reply(int sock[2], in_addr_t dest_ip, unsigned int port,
+		char *buf, size_t buf_len)
+{
+	struct sockaddr_in server;
+	const struct sockaddr *dest_addr = (struct sockaddr *)&server;
+	/* receive buffer of buf_len bytes (not buf_len pointers) */
+	char sock_buf[buf_len];
+	ssize_t r_bytes, s_bytes;
+
+	server.sin_family = AF_INET;
+	server.sin_port = htons(port);
+	server.sin_addr.s_addr = dest_ip;
+
+	r_bytes = recv(sock[0], sock_buf, buf_len, 0);
+	if (r_bytes < 0) {
+		if (errno != EAGAIN)
+			pr_err("recv()");
+		return -1;
+	}
+	if (r_bytes == 0) { /* EOF */
+		printk("EOF on reply to ping");
+		return -1;
+	}
+	if ((size_t)r_bytes != buf_len || memcmp(buf, sock_buf, buf_len)) {
+		printk("ping reply packet is corrupted %zd/%zu", r_bytes, buf_len);
+		return -1;
+	}
+
+	s_bytes = sendto(sock[1], buf, buf_len, 0, dest_addr, sizeof(server));
+	if (s_bytes < 0) {
+		pr_err("sendto()");
+		return -1;
+	} else if ((size_t)s_bytes != buf_len) {
+		printk("send part of the message: %zd/%zu", s_bytes, buf_len);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* One ping attempt: udp_ping_send() or udp_ping_reply() */
+typedef int (*ping_f)(int sock[2], in_addr_t dest_ip, unsigned int port,
+		char *buf, size_t buf_len);
+/*
+ * Run ping_count ping attempts with @func, using @buf_len bytes of @buf
+ * as the payload.
+ *
+ * The local listening address @from and port are first announced to the
+ * peer over @cmd_fd. When @init_side is set, the destination address and
+ * port are then read from the peer's MSG_PING answer (overriding @to and
+ * @d_port); otherwise @to and @d_port are used as passed.
+ * Returns 0 if at least ping_success attempts succeeded, -1 otherwise.
+ */
+static int do_ping(int cmd_fd, char *buf, size_t buf_len, struct in_addr from,
+		bool init_side, int d_port, in_addr_t to, ping_f func)
+{
+	struct test_desc msg;
+	unsigned int s_port, i, ping_succeeded = 0;
+	int ping_sock[2];
+	char to_str[IPV4_STR_SZ] = {}, from_str[IPV4_STR_SZ] = {};
+
+	if (udp_ping_init(from, ping_timeout, &s_port, ping_sock)) {
+		printk("Failed to init ping");
+		return -1;
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = MSG_PING;
+	msg.body.ping.port = s_port;
+	memcpy(&msg.body.ping.reply_ip, &from, sizeof(from));
+
+	write_msg(cmd_fd, &msg, 0);
+	if (init_side) {
+		/* The other end sends ip to ping */
+		read_msg(cmd_fd, &msg, 0);
+		if (msg.type != MSG_PING) {
+			/* do not leak the ping sockets on this error path */
+			close(ping_sock[0]);
+			close(ping_sock[1]);
+			return -1;
+		}
+		to = msg.body.ping.reply_ip;
+		d_port = msg.body.ping.port;
+	}
+
+	for (i = 0; i < ping_count ; i++) {
+		struct timespec sleep_time = {
+			.tv_sec = 0,
+			.tv_nsec = ping_delay_nsec,
+		};
+
+		/* use the caller-supplied length, which describes @buf */
+		ping_succeeded += !func(ping_sock, to, d_port, buf, buf_len);
+		nanosleep(&sleep_time, 0);
+	}
+
+	close(ping_sock[0]);
+	close(ping_sock[1]);
+
+	strncpy(to_str, inet_ntoa(*(struct in_addr *)&to), IPV4_STR_SZ - 1);
+	strncpy(from_str, inet_ntoa(from), IPV4_STR_SZ - 1);
+
+	if (ping_succeeded < ping_success) {
+		printk("ping (%s) %s->%s failed %u/%u times",
+			init_side ? "send" : "reply", from_str, to_str,
+			ping_count - ping_succeeded, ping_count);
+		return -1;
+	}
+
+#ifdef DEBUG
+	printk("ping (%s) %s->%s succeeded %u/%u times",
+		init_side ? "send" : "reply", from_str, to_str,
+		ping_succeeded, ping_count);
+#endif
+
+	return 0;
+}
+
+/*
+ * Look up the key length for the algorithm @name, store it in *@key_len
+ * and fill @buf with *@key_len random bytes.
+ *
+ * Names that are not in the table leave *@key_len as the caller set it.
+ * Returns 0 on success, -1 if *@key_len exceeds @buf_len.
+ */
+static int xfrm_fill_key(char *name, char *buf,
+		size_t buf_len, unsigned int *key_len)
+{
+	static const struct {
+		const char *algo;
+		unsigned int len;
+	} key_lens[] = {
+		{ "digest_null",			0 },
+		{ "ecb(cipher_null)",			0 },
+		{ "cbc(des)",				64 },
+		{ "hmac(md5)",				128 },
+		{ "cmac(aes)",				128 },
+		{ "xcbc(aes)",				128 },
+		{ "cbc(cast5)",				128 },
+		{ "cbc(serpent)",			128 },
+		{ "hmac(sha1)",				160 },
+		{ "hmac(rmd160)",			160 },
+		{ "cbc(des3_ede)",			192 },
+		{ "hmac(sha256)",			256 },
+		{ "cbc(aes)",				256 },
+		{ "cbc(camellia)",			256 },
+		{ "cbc(twofish)",			256 },
+		{ "rfc3686(ctr(aes))",			288 },
+		{ "hmac(sha384)",			384 },
+		{ "cbc(blowfish)",			448 },
+		{ "hmac(sha512)",			512 },
+		{ "rfc4106(gcm(aes))-128",		160 },
+		{ "rfc4543(gcm(aes))-128",		160 },
+		{ "rfc4309(ccm(aes))-128",		152 },
+		{ "rfc4106(gcm(aes))-192",		224 },
+		{ "rfc4543(gcm(aes))-192",		224 },
+		{ "rfc4309(ccm(aes))-192",		216 },
+		{ "rfc4106(gcm(aes))-256",		288 },
+		{ "rfc4543(gcm(aes))-256",		288 },
+		{ "rfc4309(ccm(aes))-256",		280 },
+		{ "rfc7539(chacha20,poly1305)-128",	0 },
+	};
+	size_t i;
+
+	/* first matching entry wins; no match leaves *key_len untouched */
+	for (i = 0; i < ARRAY_SIZE(key_lens); i++) {
+		if (strncmp(name, key_lens[i].algo, ALGO_LEN))
+			continue;
+		*key_len = key_lens[i].len;
+		break;
+	}
+
+	if (*key_len > buf_len) {
+		printk("Can't pack a key - too big for buffer");
+		return -1;
+	}
+
+	randomize_buffer(buf, *key_len);
+
+	return 0;
+}
+
+/*
+ * Pack the algorithm attribute(s) for the state described by @desc into
+ * the message @nh (@req_sz is the size of the whole request buffer).
+ *
+ * Depending on desc->proto:
+ *   IPPROTO_AH   - XFRMA_ALG_AUTH from a_algo;
+ *   IPPROTO_COMP - XFRMA_ALG_COMP from c_algo;
+ *   IPPROTO_ESP  - either XFRMA_ALG_AEAD from ae_algo, or
+ *                  XFRMA_ALG_CRYPT from e_algo plus XFRMA_ALG_AUTH
+ *                  from a_algo.
+ * Keys are generated by xfrm_fill_key().
+ * Returns 0 on success, -1 on an inconsistent @desc or if the request
+ * buffer is too small.
+ */
+static int xfrm_state_pack_algo(struct nlmsghdr *nh, size_t req_sz,
+		struct xfrm_desc *desc)
+{
+	struct {
+		union {
+			struct xfrm_algo alg;
+			struct xfrm_algo_aead aead;
+			struct xfrm_algo_auth auth;
+		} u;
+		char buf[XFRM_ALGO_KEY_BUF_SIZE];
+	} alg = {};
+	size_t alen, elen, clen, aelen;
+	unsigned short type;
+
+	alen = strlen(desc->a_algo);
+	elen = strlen(desc->e_algo);
+	clen = strlen(desc->c_algo);
+	aelen = strlen(desc->ae_algo);
+
+	/* Verify desc */
+	switch (desc->proto) {
+	case IPPROTO_AH:
+		if (!alen || elen || clen || aelen) {
+			printk("BUG: buggy ah desc");
+			return -1;
+		}
+		strncpy(alg.u.alg.alg_name, desc->a_algo, ALGO_LEN - 1);
+		if (xfrm_fill_key(desc->a_algo, alg.u.alg.alg_key,
+				sizeof(alg.buf), &alg.u.alg.alg_key_len))
+			return -1;
+		type = XFRMA_ALG_AUTH;
+		break;
+	case IPPROTO_COMP:
+		if (!clen || elen || alen || aelen) {
+			printk("BUG: buggy comp desc");
+			return -1;
+		}
+		strncpy(alg.u.alg.alg_name, desc->c_algo, ALGO_LEN - 1);
+		if (xfrm_fill_key(desc->c_algo, alg.u.alg.alg_key,
+				sizeof(alg.buf), &alg.u.alg.alg_key_len))
+			return -1;
+		type = XFRMA_ALG_COMP;
+		break;
+	case IPPROTO_ESP:
+		/*
+		 * Exactly one of "auth + enc" and "aead" must be given.
+		 * aelen is a length, so it is reduced to 0/1 before the
+		 * XOR with the boolean on the left.
+		 */
+		if (!((alen && elen) ^ !!aelen) || clen) {
+			printk("BUG: buggy esp desc");
+			return -1;
+		}
+		if (aelen) {
+			alg.u.aead.alg_icv_len = desc->icv_len;
+			strncpy(alg.u.aead.alg_name, desc->ae_algo, ALGO_LEN - 1);
+			if (xfrm_fill_key(desc->ae_algo, alg.u.aead.alg_key,
+					sizeof(alg.buf), &alg.u.aead.alg_key_len))
+				return -1;
+			type = XFRMA_ALG_AEAD;
+		} else {
+			/* the encryption algorithm is packed right here ... */
+			strncpy(alg.u.alg.alg_name, desc->e_algo, ALGO_LEN - 1);
+			type = XFRMA_ALG_CRYPT;
+			if (xfrm_fill_key(desc->e_algo, alg.u.alg.alg_key,
+					sizeof(alg.buf), &alg.u.alg.alg_key_len))
+				return -1;
+			if (rtattr_pack(nh, req_sz, type, &alg, sizeof(alg)))
+				return -1;
+
+			/*
+			 * ... and the authentication one by the common
+			 * rtattr_pack() below. strncpy() zero-pads, so the
+			 * previous name is fully overwritten; ALGO_LEN - 1
+			 * keeps the last byte as the terminating NUL.
+			 */
+			strncpy(alg.u.alg.alg_name, desc->a_algo, ALGO_LEN - 1);
+			type = XFRMA_ALG_AUTH;
+			if (xfrm_fill_key(desc->a_algo, alg.u.alg.alg_key,
+					sizeof(alg.buf), &alg.u.alg.alg_key_len))
+				return -1;
+		}
+		break;
+	default:
+		printk("BUG: unknown proto in desc");
+		return -1;
+	}
+
+	if (rtattr_pack(nh, req_sz, type, &alg, sizeof(alg)))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Derive an SPI from @src: the local (host) part of the address as
+ * returned by inet_lnaof(), passed through htonl().
+ */
+static inline uint32_t gen_spi(struct in_addr src)
+{
+ return htonl(inet_lnaof(src));
+}
+
+/*
+ * Add an IPv4 tunnel-mode xfrm state (XFRM_MSG_NEWSA) for @src -> @dst
+ * with the SPI @spi and the protocol/algorithms taken from @desc.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int xfrm_state_add(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst,
+ struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_usersa_info info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = XFRM_MSG_NEWSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill selector. */
+ memcpy(&req.info.sel.daddr, &dst, sizeof(dst));
+ memcpy(&req.info.sel.saddr, &src, sizeof(src));
+ req.info.sel.family = AF_INET;
+ req.info.sel.prefixlen_d = PREFIX_LEN;
+ req.info.sel.prefixlen_s = PREFIX_LEN;
+
+ /* Fill id */
+ memcpy(&req.info.id.daddr, &dst, sizeof(dst));
+ /* Note: zero-spi cannot be deleted */
+ req.info.id.spi = spi;
+ req.info.id.proto = desc->proto;
+
+ memcpy(&req.info.saddr, &src, sizeof(src));
+
+ /* Fill lifetime_cfg */
+ req.info.lft.soft_byte_limit = XFRM_INF;
+ req.info.lft.hard_byte_limit = XFRM_INF;
+ req.info.lft.soft_packet_limit = XFRM_INF;
+ req.info.lft.hard_packet_limit = XFRM_INF;
+
+ req.info.family = AF_INET;
+ req.info.mode = XFRM_MODE_TUNNEL;
+
+ /* algorithm attribute(s) follow the fixed xfrm_usersa_info part */
+ if (xfrm_state_pack_algo(&req.nh, sizeof(req), desc))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+/*
+ * Tell whether the dumped state @info is the one xfrm_state_add() would
+ * have created for @spi, @src, @dst and desc->proto: selector, id,
+ * source address, lifetime limits, family and mode must all match.
+ */
+static bool xfrm_usersa_found(struct xfrm_usersa_info *info, uint32_t spi,
+		struct in_addr src, struct in_addr dst,
+		struct xfrm_desc *desc)
+{
+	bool sel_ok, id_ok, lft_ok;
+
+	sel_ok = !memcmp(&info->sel.daddr, &dst, sizeof(dst)) &&
+		 !memcmp(&info->sel.saddr, &src, sizeof(src)) &&
+		 info->sel.family == AF_INET &&
+		 info->sel.prefixlen_d == PREFIX_LEN &&
+		 info->sel.prefixlen_s == PREFIX_LEN;
+	if (!sel_ok)
+		return false;
+
+	id_ok = info->id.spi == spi &&
+		info->id.proto == desc->proto &&
+		!memcmp(&info->id.daddr, &dst, sizeof(dst)) &&
+		!memcmp(&info->saddr, &src, sizeof(src));
+	if (!id_ok)
+		return false;
+
+	lft_ok = info->lft.soft_byte_limit == XFRM_INF &&
+		 info->lft.hard_byte_limit == XFRM_INF &&
+		 info->lft.soft_packet_limit == XFRM_INF &&
+		 info->lft.hard_packet_limit == XFRM_INF;
+	if (!lft_ok)
+		return false;
+
+	/* XXX: check xfrm algo, see xfrm_state_pack_algo(). */
+
+	return info->family == AF_INET && info->mode == XFRM_MODE_TUNNEL;
+}
+
+/*
+ * Dump the xfrm states (XFRM_MSG_GETSA with NLM_F_DUMP), filtered by the
+ * source address @src, and check that the state matching @spi, @src,
+ * @dst and @desc is among them.
+ * Returns 0 if the state was found, -1 otherwise or on error.
+ */
+static int xfrm_state_check(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst,
+ struct xfrm_desc *desc)
+{
+ struct {
+ struct nlmsghdr nh;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_usersa_info info;
+ int error;
+ };
+ char attrbuf[MAX_PAYLOAD];
+ } answer;
+ struct xfrm_address_filter filter = {};
+ bool found = false;
+
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(0);
+ req.nh.nlmsg_type = XFRM_MSG_GETSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+ req.nh.nlmsg_seq = seq;
+
+ /*
+ * Add dump filter by source address as there may be other tunnels
+ * in this netns (if tests run in parallel).
+ */
+ filter.family = AF_INET;
+ filter.splen = 0x1f; /* 0xffffffff mask see addr_match() */
+ memcpy(&filter.saddr, &src, sizeof(src));
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_ADDRESS_FILTER,
+ &filter, sizeof(filter)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ /*
+ * Keep reading until NLMSG_DONE or an error.
+ * NOTE(review): only the message at the start of each recv() buffer
+ * is inspected - confirm that every dumped state arrives in a
+ * datagram of its own.
+ */
+ while (1) {
+ if (recv(xfrm_sock, &answer, sizeof(answer), 0) < 0) {
+ pr_err("recv()");
+ return -1;
+ }
+ if (answer.nh.nlmsg_type == NLMSG_ERROR) {
+ printk("NLMSG_ERROR: %d: %s",
+ answer.error, strerror(-answer.error));
+ return -1;
+ } else if (answer.nh.nlmsg_type == NLMSG_DONE) {
+ if (found)
+ return 0;
+ printk("didn't find allocated xfrm state in dump");
+ return -1;
+ } else if (answer.nh.nlmsg_type == XFRM_MSG_NEWSA) {
+ if (xfrm_usersa_found(&answer.info, spi, src, dst, desc))
+ found = true;
+ }
+ }
+}
+
+/*
+ * Add the xfrm states for both directions (@src -> @dst and
+ * @dst -> @src, both with the SPI derived from @src) and verify through
+ * a state dump that both are present. *@seq is advanced once per request.
+ * @tunsrc and @tundst are not used here.
+ * Returns 0 on success, -1 on failure.
+ */
+static int xfrm_set(int xfrm_sock, uint32_t *seq,
+ struct in_addr src, struct in_addr dst,
+ struct in_addr tunsrc, struct in_addr tundst,
+ struct xfrm_desc *desc)
+{
+ int err;
+
+ err = xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc);
+ if (err) {
+ printk("Failed to add xfrm state");
+ return -1;
+ }
+
+ err = xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), dst, src, desc);
+ if (err) {
+ printk("Failed to add xfrm state");
+ return -1;
+ }
+
+ /* Check dumps for XFRM_MSG_GETSA */
+ err = xfrm_state_check(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc);
+ /* the second check runs even if the first one has failed */
+ err |= xfrm_state_check(xfrm_sock, (*seq)++, gen_spi(src), dst, src, desc);
+ if (err) {
+ printk("Failed to check xfrm state");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Add an xfrm policy (XFRM_MSG_NEWPOLICY) for direction @dir with the
+ * selector @src -> @dst and one tunnel-mode template using @spi and
+ * @proto. @tunsrc and @tundst are only used for their size in the
+ * selector memcpy() calls.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int xfrm_policy_add(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst, uint8_t dir,
+ struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userpolicy_info info;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct xfrm_user_tmpl tmpl;
+
+ memset(&req, 0, sizeof(req));
+ memset(&tmpl, 0, sizeof(tmpl));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.info));
+ req.nh.nlmsg_type = XFRM_MSG_NEWPOLICY;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill selector. */
+ memcpy(&req.info.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.info.sel.saddr, &src, sizeof(tunsrc));
+ req.info.sel.family = AF_INET;
+ req.info.sel.prefixlen_d = PREFIX_LEN;
+ req.info.sel.prefixlen_s = PREFIX_LEN;
+
+ /* Fill lifetime_cfg */
+ req.info.lft.soft_byte_limit = XFRM_INF;
+ req.info.lft.hard_byte_limit = XFRM_INF;
+ req.info.lft.soft_packet_limit = XFRM_INF;
+ req.info.lft.hard_packet_limit = XFRM_INF;
+
+ req.info.dir = dir;
+
+ /* Fill tmpl */
+ memcpy(&tmpl.id.daddr, &dst, sizeof(dst));
+ /* Note: zero-spi cannot be deleted */
+ tmpl.id.spi = spi;
+ tmpl.id.proto = proto;
+ tmpl.family = AF_INET;
+ memcpy(&tmpl.saddr, &src, sizeof(src));
+ tmpl.mode = XFRM_MODE_TUNNEL;
+ /* all bits set in the three algorithm masks */
+ tmpl.aalgos = (~(uint32_t)0);
+ tmpl.ealgos = (~(uint32_t)0);
+ tmpl.calgos = (~(uint32_t)0);
+
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_TMPL, &tmpl, sizeof(tmpl)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+/*
+ * Install the two xfrm policies of a tunnel: XFRM_POLICY_OUT for
+ * @src -> @dst and XFRM_POLICY_IN for @dst -> @src, both with the SPI
+ * derived from @src. *@seq is advanced once per request.
+ * Returns 0 on success, -1 on failure.
+ */
+static int xfrm_prepare(int xfrm_sock, uint32_t *seq,
+		struct in_addr src, struct in_addr dst,
+		struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+	int err;
+
+	err = xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst,
+			XFRM_POLICY_OUT, tunsrc, tundst, proto);
+	if (err) {
+		printk("Failed to add xfrm policy");
+		return -1;
+	}
+
+	err = xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), dst, src,
+			XFRM_POLICY_IN, tunsrc, tundst, proto);
+	if (err) {
+		printk("Failed to add xfrm policy");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Delete the xfrm policy (XFRM_MSG_DELPOLICY) identified by the selector
+ * @src -> @dst and the direction @dir. @tunsrc and @tundst are only used
+ * for their size in the memcpy() calls.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int xfrm_policy_del(int xfrm_sock, uint32_t seq,
+ struct in_addr src, struct in_addr dst, uint8_t dir,
+ struct in_addr tunsrc, struct in_addr tundst)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userpolicy_id id;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
+ req.nh.nlmsg_type = XFRM_MSG_DELPOLICY;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ /* Fill id */
+ memcpy(&req.id.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.id.sel.saddr, &src, sizeof(tunsrc));
+ req.id.sel.family = AF_INET;
+ req.id.sel.prefixlen_d = PREFIX_LEN;
+ req.id.sel.prefixlen_s = PREFIX_LEN;
+ req.id.dir = dir;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+/*
+ * Remove the two xfrm policies of a tunnel: XFRM_POLICY_OUT for
+ * @src -> @dst and XFRM_POLICY_IN for @dst -> @src.
+ * *@seq is advanced once per request.
+ * Returns 0 on success, -1 on failure.
+ */
+static int xfrm_cleanup(int xfrm_sock, uint32_t *seq,
+		struct in_addr src, struct in_addr dst,
+		struct in_addr tunsrc, struct in_addr tundst)
+{
+	if (xfrm_policy_del(xfrm_sock, (*seq)++, src, dst,
+				XFRM_POLICY_OUT, tunsrc, tundst)) {
+		printk("Failed to delete xfrm policy");
+		return -1;
+	}
+
+	if (xfrm_policy_del(xfrm_sock, (*seq)++, dst, src,
+				XFRM_POLICY_IN, tunsrc, tundst)) {
+		printk("Failed to delete xfrm policy");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Delete the xfrm state (XFRM_MSG_DELSA) identified by the destination
+ * @dst, the SPI @spi and the protocol @proto; the source address @src is
+ * passed along in an XFRMA_SRCADDR attribute.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int xfrm_state_del(int xfrm_sock, uint32_t seq, uint32_t spi,
+ struct in_addr src, struct in_addr dst, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_usersa_id id;
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ xfrm_address_t saddr = {};
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
+ req.nh.nlmsg_type = XFRM_MSG_DELSA;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = seq;
+
+ memcpy(&req.id.daddr, &dst, sizeof(dst));
+ req.id.family = AF_INET;
+ req.id.proto = proto;
+ /* Note: zero-spi cannot be deleted */
+ req.id.spi = spi;
+
+ /* the IPv4 address goes into the start of the zeroed xfrm_address_t */
+ memcpy(&saddr, &src, sizeof(src));
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_SRCADDR, &saddr, sizeof(saddr)))
+ return -1;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return -1;
+ }
+
+ return netlink_check_answer(xfrm_sock);
+}
+
+/*
+ * Delete both xfrm states of a tunnel: first the src -> dst one, then the
+ * dst -> src one. Both are looked up with the SPI derived from @src.
+ *
+ * @seq is post-incremented for every request issued. @tunsrc and @tundst
+ * are accepted for signature symmetry but not used here.
+ * Returns 0 on success, -1 on the first failing deletion.
+ */
+static int xfrm_delete(int xfrm_sock, uint32_t *seq,
+		struct in_addr src, struct in_addr dst,
+		struct in_addr tunsrc, struct in_addr tundst, uint8_t proto)
+{
+	int err;
+
+	err = xfrm_state_del(xfrm_sock, (*seq)++, gen_spi(src), src, dst, proto);
+	if (!err) {
+		/* the reverse direction is only attempted after the first succeeded */
+		err = xfrm_state_del(xfrm_sock, (*seq)++, gen_spi(src), dst, src, proto);
+	}
+
+	if (err) {
+		printk("Failed to remove xfrm state");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Send an XFRM_MSG_ALLOCSPI request for exactly one SPI value (min == max ==
+ * @spi) and check the kernel's reply.
+ *
+ * @seq is post-incremented for the request. Returns KSFT_PASS when the reply
+ * is XFRM_MSG_NEWSA carrying the requested SPI, or an NLMSG_ERROR with error
+ * code 0; returns KSFT_FAIL on send()/recv() failure, on an SPI mismatch, on
+ * an unexpected message type, or on a non-zero netlink error.
+ */
+static int xfrm_state_allocspi(int xfrm_sock, uint32_t *seq,
+ uint32_t spi, uint8_t proto)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct xfrm_userspi_info spi;
+ } req;
+ /* The reply is either a full xfrm_usersa_info or a bare error code. */
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_usersa_info info;
+ int error;
+ };
+ } answer;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.spi));
+ req.nh.nlmsg_type = XFRM_MSG_ALLOCSPI;
+ /* No NLM_F_ACK here: the function reads exactly one reply below. */
+ req.nh.nlmsg_flags = NLM_F_REQUEST;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ req.spi.info.family = AF_INET;
+ req.spi.min = spi;
+ req.spi.max = spi;
+ req.spi.info.id.proto = proto;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ return KSFT_FAIL;
+ }
+
+ if (recv(xfrm_sock, &answer, sizeof(answer), 0) < 0) {
+ pr_err("recv()");
+ return KSFT_FAIL;
+ } else if (answer.nh.nlmsg_type == XFRM_MSG_NEWSA) {
+ /*
+ * NOTE(review): htonl() is used to convert the SPI read from the
+ * reply; presumably the kernel reports it in network byte order
+ * (the swap is the same operation as ntohl()) - confirm.
+ */
+ uint32_t new_spi = htonl(answer.info.id.spi);
+
+ if (new_spi != spi) {
+ printk("allocated spi is different from requested: %#x != %#x",
+ new_spi, spi);
+ return KSFT_FAIL;
+ }
+ return KSFT_PASS;
+ } else if (answer.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)answer.nh.nlmsg_type);
+ return KSFT_FAIL;
+ }
+
+ /* Only NLMSG_ERROR replies reach this point; error 0 still counts as pass. */
+ printk("NLMSG_ERROR: %d: %s", answer.error, strerror(-answer.error));
+ return (answer.error) ? KSFT_FAIL : KSFT_PASS;
+}
+
+/*
+ * Open a netlink socket through netlink_sock() and bind it to the multicast
+ * @groups mask, then sanity-check the address reported by getsockname().
+ *
+ * On success *@sock holds the bound socket and 0 is returned. On any failure
+ * after the socket was opened it is closed again and -1 is returned.
+ */
+static int netlink_sock_bind(int *sock, uint32_t *seq, int proto, uint32_t groups)
+{
+	struct sockaddr_nl addr = {
+		.nl_family = AF_NETLINK,
+		.nl_groups = groups,
+	};
+	socklen_t len = sizeof(addr);
+
+	if (netlink_sock(sock, seq, proto)) {
+		printk("Failed to open xfrm netlink socket");
+		return -1;
+	}
+
+	/* Each check only runs if all previous ones passed. */
+	if (bind(*sock, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+		pr_err("bind()");
+	} else if (getsockname(*sock, (struct sockaddr *)&addr, &len) < 0) {
+		pr_err("getsockname()");
+	} else if (len != sizeof(addr)) {
+		printk("Wrong address length %d", len);
+	} else if (addr.nl_family != AF_NETLINK) {
+		printk("Wrong address family %d", addr.nl_family);
+	} else {
+		return 0;
+	}
+
+	close(*sock);
+	return -1;
+}
+
+/*
+ * Send an XFRM_MSG_ACQUIRE carrying recognisable aalgos/ealgos/calgos marker
+ * values and verify that a second socket subscribed to XFRMNLGRP_ACQUIRE
+ * receives a message with the same three values.
+ *
+ * @seq is post-incremented for the request; @nr is not used.
+ * Returns KSFT_PASS on success and KSFT_FAIL on local failures. When the
+ * kernel answers with a non-zero netlink error, that (negative) error code
+ * is returned instead of a KSFT_* value.
+ */
+static int xfrm_monitor_acquire(int xfrm_sock, uint32_t *seq, unsigned int nr)
+{
+ /* One buffer is reused for the request, the ack and the notification. */
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_acquire acq;
+ int error;
+ };
+ char attrbuf[MAX_PAYLOAD];
+ } req;
+ struct xfrm_user_tmpl xfrm_tmpl = {};
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ /* Subscribe before sending so the notification cannot be missed. */
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_ACQUIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.acq));
+ req.nh.nlmsg_type = XFRM_MSG_ACQUIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ req.acq.policy.sel.family = AF_INET;
+ /* Marker values compared against the notification further down. */
+ req.acq.aalgos = 0xfeed;
+ req.acq.ealgos = 0xbaad;
+ req.acq.calgos = 0xbabe;
+
+ xfrm_tmpl.family = AF_INET;
+ xfrm_tmpl.id.proto = IPPROTO_ESP;
+ if (rtattr_pack(&req.nh, sizeof(req), XFRMA_TMPL, &xfrm_tmpl, sizeof(xfrm_tmpl)))
+ goto out_close;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ /* First reply: the ack on the requesting socket (overwrites req). */
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ /* Second message: the multicast notification on the listening socket. */
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.acq.aalgos != 0xfeed || req.acq.ealgos != 0xbaad
+ || req.acq.calgos != 0xbabe) {
+ printk("xfrm_user_acquire has changed %x %x %x",
+ req.acq.aalgos, req.acq.ealgos, req.acq.calgos);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+/*
+ * Add an xfrm state for child @nr, send XFRM_MSG_EXPIRE for it and verify
+ * that a socket subscribed to XFRMNLGRP_EXPIRE receives a notification whose
+ * 'hard' field equals 0x1.
+ *
+ * @seq is post-incremented for every request issued from this function.
+ * Returns KSFT_PASS on success and KSFT_FAIL on local failures. When the
+ * kernel answers with a non-zero netlink error, that (negative) error code
+ * is returned instead of a KSFT_* value.
+ * NOTE(review): the state added here is not removed by this function.
+ */
+static int xfrm_expire_state(int xfrm_sock, uint32_t *seq,
+ unsigned int nr, struct xfrm_desc *desc)
+{
+ /* One buffer is reused for the request, the ack and the notification. */
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_expire expire;
+ int error;
+ };
+ } req;
+ struct in_addr src, dst;
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ src = inet_makeaddr(INADDR_B, child_ip(nr));
+ dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+
+ if (xfrm_state_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst, desc)) {
+ printk("Failed to add xfrm state");
+ return KSFT_FAIL;
+ }
+
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_EXPIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.expire));
+ req.nh.nlmsg_type = XFRM_MSG_EXPIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ /* Identify the state added above: same daddr, SPI and protocol. */
+ memcpy(&req.expire.state.id.daddr, &dst, sizeof(dst));
+ req.expire.state.id.spi = gen_spi(src);
+ req.expire.state.id.proto = desc->proto;
+ req.expire.state.family = AF_INET;
+ /* Sent as 0xff; the notification is expected to carry 0x1 (see below). */
+ req.expire.hard = 0xff;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ /* First reply: the ack on the requesting socket (overwrites req). */
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ /* Second message: the multicast notification on the listening socket. */
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.expire.hard != 0x1) {
+ printk("expire.hard is not set: %x", req.expire.hard);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+/*
+ * Add an outbound xfrm policy for child @nr, send XFRM_MSG_POLEXPIRE for it
+ * and verify that a socket subscribed to XFRMNLGRP_EXPIRE receives a
+ * notification whose 'hard' field equals 0x1.
+ *
+ * @seq is post-incremented for every request issued from this function.
+ * Returns KSFT_PASS on success and KSFT_FAIL on local failures. When the
+ * kernel answers with a non-zero netlink error, that (negative) error code
+ * is returned instead of a KSFT_* value.
+ * NOTE(review): the policy added here is not removed by this function.
+ */
+static int xfrm_expire_policy(int xfrm_sock, uint32_t *seq,
+ unsigned int nr, struct xfrm_desc *desc)
+{
+ /* One buffer is reused for the request, the ack and the notification. */
+ struct {
+ struct nlmsghdr nh;
+ union {
+ struct xfrm_user_polexpire expire;
+ int error;
+ };
+ } req;
+ struct in_addr src, dst, tunsrc, tundst;
+ int xfrm_listen = -1, ret = KSFT_FAIL;
+ uint32_t seq_listen;
+
+ src = inet_makeaddr(INADDR_B, child_ip(nr));
+ dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+ tunsrc = inet_makeaddr(INADDR_A, child_ip(nr));
+ tundst = inet_makeaddr(INADDR_A, grchild_ip(nr));
+
+ if (xfrm_policy_add(xfrm_sock, (*seq)++, gen_spi(src), src, dst,
+ XFRM_POLICY_OUT, tunsrc, tundst, desc->proto)) {
+ printk("Failed to add xfrm policy");
+ return KSFT_FAIL;
+ }
+
+ if (netlink_sock_bind(&xfrm_listen, &seq_listen, NETLINK_XFRM, XFRMNLGRP_EXPIRE))
+ return KSFT_FAIL;
+
+ memset(&req, 0, sizeof(req));
+ req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.expire));
+ req.nh.nlmsg_type = XFRM_MSG_POLEXPIRE;
+ req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+ req.nh.nlmsg_seq = (*seq)++;
+
+ /* Fill selector. */
+ /* The copied values are dst/src; sizeof(tundst/tunsrc) only supplies the
+ * (identical) struct in_addr size. */
+ memcpy(&req.expire.pol.sel.daddr, &dst, sizeof(tundst));
+ memcpy(&req.expire.pol.sel.saddr, &src, sizeof(tunsrc));
+ req.expire.pol.sel.family = AF_INET;
+ req.expire.pol.sel.prefixlen_d = PREFIX_LEN;
+ req.expire.pol.sel.prefixlen_s = PREFIX_LEN;
+ req.expire.pol.dir = XFRM_POLICY_OUT;
+ /* Sent as 0xff; the notification is expected to carry 0x1 (see below). */
+ req.expire.hard = 0xff;
+
+ if (send(xfrm_sock, &req, req.nh.nlmsg_len, 0) < 0) {
+ pr_err("send()");
+ goto out_close;
+ }
+
+ /* First reply: the ack on the requesting socket (overwrites req). */
+ if (recv(xfrm_sock, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ } else if (req.nh.nlmsg_type != NLMSG_ERROR) {
+ printk("expected NLMSG_ERROR, got %d", (int)req.nh.nlmsg_type);
+ goto out_close;
+ }
+
+ if (req.error) {
+ printk("NLMSG_ERROR: %d: %s", req.error, strerror(-req.error));
+ ret = req.error;
+ goto out_close;
+ }
+
+ /* Second message: the multicast notification on the listening socket. */
+ if (recv(xfrm_listen, &req, sizeof(req), 0) < 0) {
+ pr_err("recv()");
+ goto out_close;
+ }
+
+ if (req.expire.hard != 0x1) {
+ printk("expire.hard is not set: %x", req.expire.hard);
+ goto out_close;
+ }
+
+ ret = KSFT_PASS;
+out_close:
+ close(xfrm_listen);
+ return ret;
+}
+
+/*
+ * Run one CREATE_TUNNEL test case from the child side.
+ *
+ * Steps: ping the peer without xfrm, then for each phase (prepare, add,
+ * delete, cleanup) send the matching MSG_XFRM_* command to the peer over
+ * @cmd_fd and perform the same phase locally. Between "add" and "delete"
+ * the peer is pinged again, this time via the tunnel source address.
+ *
+ * A failed prepare skips straight to cleanup; a failed add or tunnel ping
+ * still goes through delete and cleanup.
+ * Returns KSFT_PASS only if every step succeeded, KSFT_FAIL otherwise.
+ */
+static int child_serv(int xfrm_sock, uint32_t *seq,
+		unsigned int nr, int cmd_fd, void *buf, struct xfrm_desc *desc)
+{
+	struct in_addr src, dst, tunsrc, tundst;
+	struct test_desc msg;
+	int ret = KSFT_FAIL;
+
+	src = inet_makeaddr(INADDR_B, child_ip(nr));
+	dst = inet_makeaddr(INADDR_B, grchild_ip(nr));
+	tunsrc = inet_makeaddr(INADDR_A, child_ip(nr));
+	tundst = inet_makeaddr(INADDR_A, grchild_ip(nr));
+
+	/* UDP pinging without xfrm */
+	if (do_ping(cmd_fd, buf, page_size, src, true, 0, 0, udp_ping_send)) {
+		printk("ping failed before setting xfrm");
+		return KSFT_FAIL;
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = MSG_XFRM_PREPARE;
+	memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+	write_msg(cmd_fd, &msg, 1);
+
+	if (xfrm_prepare(xfrm_sock, seq, src, dst, tunsrc, tundst, desc->proto)) {
+		printk("failed to prepare xfrm");
+		goto cleanup;
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	msg.type = MSG_XFRM_ADD;
+	memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+	write_msg(cmd_fd, &msg, 1);
+	if (xfrm_set(xfrm_sock, seq, src, dst, tunsrc, tundst, desc)) {
+		printk("failed to set xfrm");
+		goto delete;
+	}
+
+	/* UDP pinging with xfrm tunnel */
+	if (do_ping(cmd_fd, buf, page_size, tunsrc,
+				true, 0, 0, udp_ping_send)) {
+		printk("ping failed for xfrm");
+		goto delete;
+	}
+
+	ret = KSFT_PASS;
+delete:
+	/* xfrm delete */
+	memset(&msg, 0, sizeof(msg));
+	msg.type = MSG_XFRM_DEL;
+	memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+	write_msg(cmd_fd, &msg, 1);
+
+	if (xfrm_delete(xfrm_sock, seq, src, dst, tunsrc, tundst, desc->proto)) {
+		/* no ping is involved here; name the step that really failed */
+		printk("failed to remove xfrm");
+		ret = KSFT_FAIL;
+	}
+
+cleanup:
+	memset(&msg, 0, sizeof(msg));
+	msg.type = MSG_XFRM_CLEANUP;
+	memcpy(&msg.body.xfrm_desc, desc, sizeof(*desc));
+	write_msg(cmd_fd, &msg, 1);
+	if (xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst)) {
+		printk("failed to cleanup xfrm");
+		ret = KSFT_FAIL;
+	}
+	return ret;
+}
+
+/*
+ * Main loop of a test child: switch into the childa namespace, open an xfrm
+ * netlink socket, handshake with the peer over @cmd_fd, then read xfrm_desc
+ * test descriptions from @test_desc_fd until EOF, run each one and report
+ * its result through write_test_result().
+ *
+ * Never returns: every path ends in exit(). After EOF the peer is told to
+ * stop with MSG_EXIT and the process exits with KSFT_PASS; setup or protocol
+ * errors exit with KSFT_FAIL.
+ */
+static int child_f(unsigned int nr, int test_desc_fd, int cmd_fd, void *buf)
+{
+ struct xfrm_desc desc;
+ struct test_desc msg;
+ int xfrm_sock = -1;
+ uint32_t seq;
+
+ if (switch_ns(nsfd_childa))
+ exit(KSFT_FAIL);
+
+ if (netlink_sock(&xfrm_sock, &seq, NETLINK_XFRM)) {
+ printk("Failed to open xfrm netlink socket");
+ exit(KSFT_FAIL);
+ }
+
+ /* Check that seq sock is ready, just for sure. */
+ memset(&msg, 0, sizeof(msg));
+ msg.type = MSG_ACK;
+ write_msg(cmd_fd, &msg, 1);
+ read_msg(cmd_fd, &msg, 1);
+ if (msg.type != MSG_ACK) {
+ printk("Ack failed");
+ exit(KSFT_FAIL);
+ }
+
+ for (;;) {
+ ssize_t received = read(test_desc_fd, &desc, sizeof(desc));
+ int ret;
+
+ if (received == 0) /* EOF */
+ break;
+
+ /* Covers both read() errors (-1) and short reads. */
+ if (received != sizeof(desc)) {
+ pr_err("read() returned %zd", received);
+ exit(KSFT_FAIL);
+ }
+
+ /* Dispatch on the kind of test requested by the plan writer. */
+ switch (desc.type) {
+ case CREATE_TUNNEL:
+ ret = child_serv(xfrm_sock, &seq, nr,
+ cmd_fd, buf, &desc);
+ break;
+ case ALLOCATE_SPI:
+ ret = xfrm_state_allocspi(xfrm_sock, &seq,
+ -1, desc.proto);
+ break;
+ case MONITOR_ACQUIRE:
+ ret = xfrm_monitor_acquire(xfrm_sock, &seq, nr);
+ break;
+ case EXPIRE_STATE:
+ ret = xfrm_expire_state(xfrm_sock, &seq, nr, &desc);
+ break;
+ case EXPIRE_POLICY:
+ ret = xfrm_expire_policy(xfrm_sock, &seq, nr, &desc);
+ break;
+ default:
+ printk("Unknown desc type %d", desc.type);
+ exit(KSFT_FAIL);
+ }
+ write_test_result(ret, &desc);
+ }
+
+ close(xfrm_sock);
+
+ /* Tell the peer process to terminate as well. */
+ msg.type = MSG_EXIT;
+ write_msg(cmd_fd, &msg, 1);
+ exit(KSFT_PASS);
+}
+
+/*
+ * Handle one command message received by the grandchild (the peer side of
+ * child_f()). Addresses are the mirror image of the child's: the local side
+ * uses grchild_ip(nr), the remote side child_ip(nr).
+ *
+ * MSG_EXIT terminates the process with KSFT_PASS. Failures of the xfrm
+ * operations are only logged (after a best-effort xfrm_cleanup() for
+ * prepare/add/delete); nothing is reported back to the caller.
+ */
+static void grand_child_serv(unsigned int nr, int cmd_fd, void *buf,
+ struct test_desc *msg, int xfrm_sock, uint32_t *seq)
+{
+ struct in_addr src, dst, tunsrc, tundst;
+ bool tun_reply;
+ struct xfrm_desc *desc = &msg->body.xfrm_desc;
+
+ src = inet_makeaddr(INADDR_B, grchild_ip(nr));
+ dst = inet_makeaddr(INADDR_B, child_ip(nr));
+ tunsrc = inet_makeaddr(INADDR_A, grchild_ip(nr));
+ tundst = inet_makeaddr(INADDR_A, child_ip(nr));
+
+ switch (msg->type) {
+ case MSG_EXIT:
+ exit(KSFT_PASS);
+ case MSG_ACK:
+ /* Echo the ack back unchanged. */
+ write_msg(cmd_fd, msg, 1);
+ break;
+ case MSG_PING:
+ /* Non-zero when reply_ip differs from dst: use tunsrc as source then. */
+ tun_reply = memcmp(&dst, &msg->body.ping.reply_ip, sizeof(in_addr_t));
+ /* UDP pinging without xfrm */
+ if (do_ping(cmd_fd, buf, page_size, tun_reply ? tunsrc : src,
+ false, msg->body.ping.port,
+ msg->body.ping.reply_ip, udp_ping_reply)) {
+ printk("ping failed before setting xfrm");
+ }
+ break;
+ case MSG_XFRM_PREPARE:
+ if (xfrm_prepare(xfrm_sock, seq, src, dst, tunsrc, tundst,
+ desc->proto)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to prepare xfrm");
+ }
+ break;
+ case MSG_XFRM_ADD:
+ if (xfrm_set(xfrm_sock, seq, src, dst, tunsrc, tundst, desc)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to set xfrm");
+ }
+ break;
+ case MSG_XFRM_DEL:
+ if (xfrm_delete(xfrm_sock, seq, src, dst, tunsrc, tundst,
+ desc->proto)) {
+ xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst);
+ printk("failed to remove xfrm");
+ }
+ break;
+ case MSG_XFRM_CLEANUP:
+ if (xfrm_cleanup(xfrm_sock, seq, src, dst, tunsrc, tundst)) {
+ printk("failed to cleanup xfrm");
+ }
+ break;
+ default:
+ printk("got unknown msg type %d", msg->type);
+ };
+}
+
+/*
+ * Entry point of the grandchild process: switch into the childb namespace,
+ * open an xfrm netlink socket and serve command messages from @cmd_fd
+ * forever via grand_child_serv().
+ *
+ * Never returns. The loop has no exit condition, so the process ends either
+ * through exit(KSFT_FAIL) during setup or through the exit() that
+ * grand_child_serv() performs on MSG_EXIT.
+ */
+static int grand_child_f(unsigned int nr, int cmd_fd, void *buf)
+{
+ struct test_desc msg;
+ int xfrm_sock = -1;
+ uint32_t seq;
+
+ if (switch_ns(nsfd_childb))
+ exit(KSFT_FAIL);
+
+ if (netlink_sock(&xfrm_sock, &seq, NETLINK_XFRM)) {
+ printk("Failed to open xfrm netlink socket");
+ exit(KSFT_FAIL);
+ }
+
+ do {
+ read_msg(cmd_fd, &msg, 1);
+ grand_child_serv(nr, cmd_fd, buf, &msg, xfrm_sock, &seq);
+ } while (1);
+
+ /* Not reached: the loop above has no break. */
+ close(xfrm_sock);
+ exit(KSFT_FAIL);
+}
+
+/*
+ * Set up both namespaces for pair @nr and fork the child/grandchild pair
+ * that runs the tests.
+ *
+ * In the calling (selftest) process this returns the result of switching
+ * back to the parent namespace, or -1 if initialisation or fork() failed.
+ * In the first forked process the write end of @test_desc_fd is closed, a
+ * shared page and a SOCK_SEQPACKET socketpair are created, and a second
+ * fork() splits it into child_f() and grand_child_f(), neither of which
+ * returns. Setup failures inside the forked process return -1 to that
+ * process's caller.
+ */
+static int start_child(unsigned int nr, char *veth, int test_desc_fd[2])
+{
+ int cmd_sock[2];
+ void *data_map;
+ pid_t child;
+
+ if (init_child(nsfd_childa, veth, child_ip(nr), grchild_ip(nr)))
+ return -1;
+
+ if (init_child(nsfd_childb, veth, grchild_ip(nr), child_ip(nr)))
+ return -1;
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ } else if (child) {
+ /* in parent - selftest */
+ return switch_ns(nsfd_parent);
+ }
+
+ /* Only the reading end of the test-description pipe is needed from here on. */
+ if (close(test_desc_fd[1])) {
+ pr_err("close()");
+ return -1;
+ }
+
+ /* child */
+ /* MAP_SHARED so the mapping is shared with the process forked below. */
+ data_map = mmap(0, page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+ if (data_map == MAP_FAILED) {
+ pr_err("mmap()");
+ return -1;
+ }
+
+ randomize_buffer(data_map, page_size);
+
+ if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, cmd_sock)) {
+ pr_err("socketpair()");
+ return -1;
+ }
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ } else if (child) {
+ /* Child keeps cmd_sock[1]; the grandchild keeps cmd_sock[0]. */
+ if (close(cmd_sock[0])) {
+ pr_err("close()");
+ return -1;
+ }
+ return child_f(nr, test_desc_fd[0], cmd_sock[1], data_map);
+ }
+ if (close(cmd_sock[1])) {
+ pr_err("close()");
+ return -1;
+ }
+ return grand_child_f(nr, cmd_sock[0], data_map);
+}
+
+/* Print the command-line synopsis and terminate with KSFT_FAIL. */
+static void exit_usage(char **argv)
+{
+ printk("Usage: %s [nr_process]", argv[0]);
+ exit(KSFT_FAIL);
+}
+
+/*
+ * Write one test description to the plan pipe.
+ *
+ * Returns 0 when the whole structure was written in a single write(),
+ * -1 (after logging the write() result) on error or short write.
+ */
+static int __write_desc(int test_desc_fd, struct xfrm_desc *desc)
+{
+	ssize_t ret;
+
+	ret = write(test_desc_fd, desc, sizeof(*desc));
+
+	if (ret == sizeof(*desc))
+		return 0;
+
+	/* ssize_t needs %zd: it is not 'long' on every ABI this test runs on */
+	pr_err("Writing test's desc failed %zd", ret);
+
+	return -1;
+}
+
+/*
+ * Build a CREATE_TUNNEL description for @proto and queue it on the plan
+ * pipe. Each of @a, @e, @c and @ae may be NULL; non-NULL names are copied
+ * (truncated to ALGO_LEN - 1 bytes) into the matching algorithm field.
+ *
+ * Returns the result of __write_desc().
+ */
+static int write_desc(int proto, int test_desc_fd,
+		char *a, char *e, char *c, char *ae)
+{
+	struct xfrm_desc out = {};
+	char *field[] = { out.a_algo, out.e_algo, out.c_algo, out.ae_algo };
+	char *name[] = { a, e, c, ae };
+	unsigned int k;
+
+	out.proto = proto;
+	out.type = CREATE_TUNNEL;
+
+	/* 'out' starts zeroed, so every copied name stays NUL-terminated */
+	for (k = 0; k < ARRAY_SIZE(name); k++) {
+		if (name[k])
+			strncpy(field[k], name[k], ALGO_LEN - 1);
+	}
+
+	return __write_desc(test_desc_fd, &out);
+}
+
+/* Protocols for which write_proto_plan() generates CREATE_TUNNEL tests. */
+int proto_list[] = { IPPROTO_AH, IPPROTO_COMP, IPPROTO_ESP };
+/* Authentication algorithm names; used alone for AH and paired with e_list for ESP. */
+char *ah_list[] = {
+ "digest_null", "hmac(md5)", "hmac(sha1)", "hmac(sha256)",
+ "hmac(sha384)", "hmac(sha512)", "hmac(rmd160)",
+ "xcbc(aes)", "cmac(aes)"
+};
+/* Compression algorithm names used for IPPROTO_COMP. */
+char *comp_list[] = {
+ "deflate",
+#if 0
+ /* No compression backend realization */
+ "lzs", "lzjh"
+#endif
+};
+/* Encryption algorithm names; combined with every ah_list entry for ESP. */
+char *e_list[] = {
+ "ecb(cipher_null)", "cbc(des)", "cbc(des3_ede)", "cbc(cast5)",
+ "cbc(blowfish)", "cbc(aes)", "cbc(serpent)", "cbc(camellia)",
+ "cbc(twofish)", "rfc3686(ctr(aes))"
+};
+/* AEAD algorithm names; currently empty because all entries are compiled out. */
+char *ae_list[] = {
+#if 0
+ /* not implemented */
+ "rfc4106(gcm(aes))", "rfc4309(ccm(aes))", "rfc4543(gcm(aes))",
+ "rfc7539esp(chacha20,poly1305)"
+#endif
+};
+
+/*
+ * Number of descriptions write_proto_plan() emits over all of proto_list:
+ * one per AH algorithm, one per compression algorithm, one per
+ * (auth, encryption) pair and one per AEAD algorithm.
+ */
+const unsigned int proto_plan = ARRAY_SIZE(ah_list) + ARRAY_SIZE(comp_list) \
+ + (ARRAY_SIZE(ah_list) * ARRAY_SIZE(e_list)) \
+ + ARRAY_SIZE(ae_list);
+
+/*
+ * Queue every CREATE_TUNNEL description for @proto on the plan pipe @fd:
+ *   IPPROTO_AH   - one per entry of ah_list
+ *   IPPROTO_COMP - one per entry of comp_list
+ *   IPPROTO_ESP  - one per (ah_list, e_list) pair, then one per ae_list entry
+ *
+ * Returns 0 on success, -1 if a write fails or @proto is not one of the
+ * three protocols above.
+ */
+static int write_proto_plan(int fd, int proto)
+{
+	/* unsigned to match ARRAY_SIZE() and avoid signed/unsigned comparison */
+	unsigned int i, j;
+
+	switch (proto) {
+	case IPPROTO_AH:
+		for (i = 0; i < ARRAY_SIZE(ah_list); i++) {
+			if (write_desc(proto, fd, ah_list[i], 0, 0, 0))
+				return -1;
+		}
+		break;
+	case IPPROTO_COMP:
+		for (i = 0; i < ARRAY_SIZE(comp_list); i++) {
+			if (write_desc(proto, fd, 0, 0, comp_list[i], 0))
+				return -1;
+		}
+		break;
+	case IPPROTO_ESP:
+		for (i = 0; i < ARRAY_SIZE(ah_list); i++) {
+			for (j = 0; j < ARRAY_SIZE(e_list); j++) {
+				if (write_desc(proto, fd, ah_list[i],
+							e_list[j], 0, 0))
+					return -1;
+			}
+		}
+		for (i = 0; i < ARRAY_SIZE(ae_list); i++) {
+			if (write_desc(proto, fd, 0, 0, 0, ae_list[i]))
+				return -1;
+		}
+		break;
+	default:
+		printk("BUG: Specified unknown proto %d", proto);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Some structures in xfrm uapi header differ in size between
+ * 64-bit and 32-bit ABI:
+ *
+ * 32-bit UABI | 64-bit UABI
+ * -------------------------------------|-------------------------------------
+ * sizeof(xfrm_usersa_info) = 220 | sizeof(xfrm_usersa_info) = 224
+ * sizeof(xfrm_userpolicy_info) = 164 | sizeof(xfrm_userpolicy_info) = 168
+ * sizeof(xfrm_userspi_info) = 228 | sizeof(xfrm_userspi_info) = 232
+ * sizeof(xfrm_user_acquire) = 276 | sizeof(xfrm_user_acquire) = 280
+ * sizeof(xfrm_user_expire) = 224 | sizeof(xfrm_user_expire) = 232
+ * sizeof(xfrm_user_polexpire) = 168 | sizeof(xfrm_user_polexpire) = 176
+ *
+ * Check the structures affected by the UABI difference.
+ */
+/* Number of descriptions queued by write_compat_struct_tests(). */
+const unsigned int compat_plan = 4;
+
+/*
+ * Queue one description per compat-sensitive test, in this order:
+ * ALLOCATE_SPI, MONITOR_ACQUIRE, EXPIRE_STATE, EXPIRE_POLICY. All of them
+ * share IPPROTO_AH and the first authentication algorithm of ah_list.
+ *
+ * Returns 0 on success, -1 as soon as one write fails.
+ */
+static int write_compat_struct_tests(int test_desc_fd)
+{
+	static const int kinds[] = {
+		ALLOCATE_SPI, MONITOR_ACQUIRE, EXPIRE_STATE, EXPIRE_POLICY,
+	};
+	struct xfrm_desc desc = {};
+	unsigned int k;
+
+	desc.proto = IPPROTO_AH;
+	strncpy(desc.a_algo, ah_list[0], ALGO_LEN - 1);
+
+	/* Only the type differs between the queued descriptions. */
+	for (k = 0; k < ARRAY_SIZE(kinds); k++) {
+		desc.type = kinds[k];
+		if (__write_desc(test_desc_fd, &desc))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fork a helper process that writes the whole test plan (compat tests first,
+ * then every protocol of proto_list) to @test_desc_fd.
+ *
+ * In the caller: closes its copy of @test_desc_fd and returns 0, or returns
+ * -1 if fork() failed. The helper never returns; it exits with KSFT_PASS
+ * after writing everything or KSFT_FAIL on the first write error.
+ */
+static int write_test_plan(int test_desc_fd)
+{
+ unsigned int i;
+ pid_t child;
+
+ child = fork();
+ if (child < 0) {
+ pr_err("fork()");
+ return -1;
+ }
+ if (child) {
+ /* A close() failure is logged but not treated as an error. */
+ if (close(test_desc_fd))
+ printk("close(): %m");
+ return 0;
+ }
+
+ if (write_compat_struct_tests(test_desc_fd))
+ exit(KSFT_FAIL);
+
+ for (i = 0; i < ARRAY_SIZE(proto_list); i++) {
+ if (write_proto_plan(test_desc_fd, proto_list[i]))
+ exit(KSFT_FAIL);
+ }
+
+ exit(KSFT_PASS);
+}
+
+/*
+ * Reap every child process and fold their exit states into one verdict.
+ *
+ * Returns KSFT_FAIL if wait() fails for a reason other than ECHILD, if any
+ * child did not exit normally, or if any child exited with KSFT_FAIL;
+ * otherwise KSFT_PASS once no children are left.
+ */
+static int children_cleanup(void)
+{
+	unsigned verdict = KSFT_PASS;
+	int status;
+
+	for (;;) {
+		if (wait(&status) < 0) {
+			/* ECHILD: nothing left to reap, we are done */
+			if (errno == ECHILD)
+				break;
+
+			pr_err("wait()");
+			return KSFT_FAIL;
+		}
+
+		if (!WIFEXITED(status) || WEXITSTATUS(status) == KSFT_FAIL)
+			verdict = KSFT_FAIL;
+	}
+
+	return verdict;
+}
+
+/* Signature shared by the ksft_test_result_*() printf-style reporters used below. */
+typedef void (*print_res)(const char *, ...);
+
+/*
+ * Read test_result records from results_fd[0] until EOF and report each one
+ * through the kselftest pass/fail printers.
+ *
+ * Returns KSFT_PASS if every record carried KSFT_PASS; KSFT_FAIL if any
+ * record carried another value or a read returned an unexpected size.
+ */
+static int check_results(void)
+{
+ struct test_result tr = {};
+ struct xfrm_desc *d = &tr.desc;
+ int ret = KSFT_PASS;
+
+ while (1) {
+ ssize_t received = read(results_fd[0], &tr, sizeof(tr));
+ print_res result;
+
+ if (received == 0) /* EOF */
+ break;
+
+ /* Covers both read() errors (-1) and short reads. */
+ if (received != sizeof(tr)) {
+ pr_err("read() returned %zd", received);
+ return KSFT_FAIL;
+ }
+
+ /* Anything other than KSFT_PASS is reported as a failure. */
+ switch (tr.res) {
+ case KSFT_PASS:
+ result = ksft_test_result_pass;
+ break;
+ case KSFT_FAIL:
+ default:
+ result = ksft_test_result_fail;
+ ret = KSFT_FAIL;
+ }
+
+ /* NOTE(review): d->type indexes desc_name without a bounds check. */
+ result(" %s: [%u, '%s', '%s', '%s', '%s', %u]\n",
+ desc_name[d->type], (unsigned int)d->proto, d->a_algo,
+ d->e_algo, d->c_algo, d->ae_algo, d->icv_len);
+ }
+
+ return ret;
+}
+
+/*
+ * Test driver.
+ *
+ * Usage: prog [nr_process] - nr_process defaults to 1 and must lie in
+ * [1; MAX_PROCESSES]. The driver creates the description and result pipes,
+ * the namespaces and one veth pair plus child pair per process, writes the
+ * test plan, reports all results and finally reaps the children.
+ *
+ * Exit status: the value from check_results(), or KSFT_FAIL if
+ * children_cleanup() reports a failure. Setup problems before the children
+ * are started end in ksft_exit_skip(); later ones in ksft_exit_fail_msg().
+ */
+int main(int argc, char **argv)
+{
+ long nr_process = 1;
+ int route_sock = -1, ret = KSFT_SKIP;
+ int test_desc_fd[2];
+ uint32_t route_seq;
+ unsigned int i;
+
+ if (argc > 2)
+ exit_usage(argv);
+
+ if (argc > 1) {
+ char *endptr;
+
+ /* Reject overflow, conversion errors, empty input and trailing garbage. */
+ errno = 0;
+ nr_process = strtol(argv[1], &endptr, 10);
+ if ((errno == ERANGE && (nr_process == LONG_MAX || nr_process == LONG_MIN))
+ || (errno != 0 && nr_process == 0)
+ || (endptr == argv[1]) || (*endptr != '\0')) {
+ printk("Failed to parse [nr_process]");
+ exit_usage(argv);
+ }
+
+ if (nr_process > MAX_PROCESSES || nr_process < 1) {
+ printk("nr_process should be between [1; %u]",
+ MAX_PROCESSES);
+ exit_usage(argv);
+ }
+ }
+
+ srand(time(NULL));
+ page_size = sysconf(_SC_PAGESIZE);
+ if (page_size < 1)
+ ksft_exit_skip("sysconf(): %m\n");
+
+ /* test_desc_fd carries the plan to the children, results_fd their verdicts back. */
+ if (pipe2(test_desc_fd, O_DIRECT) < 0)
+ ksft_exit_skip("pipe(): %m\n");
+
+ if (pipe2(results_fd, O_DIRECT) < 0)
+ ksft_exit_skip("pipe(): %m\n");
+
+ if (init_namespaces())
+ ksft_exit_skip("Failed to create namespaces\n");
+
+ if (netlink_sock(&route_sock, &route_seq, NETLINK_ROUTE))
+ ksft_exit_skip("Failed to open netlink route socket\n");
+
+ /* One veth pair and one child/grandchild pair per requested process. */
+ for (i = 0; i < nr_process; i++) {
+ char veth[VETH_LEN];
+
+ snprintf(veth, VETH_LEN, VETH_FMT, i);
+
+ if (veth_add(route_sock, route_seq++, veth, nsfd_childa, veth, nsfd_childb)) {
+ close(route_sock);
+ ksft_exit_fail_msg("Failed to create veth device");
+ }
+
+ if (start_child(i, veth, test_desc_fd)) {
+ close(route_sock);
+ ksft_exit_fail_msg("Child %u failed to start", i);
+ }
+ }
+
+ /* This process only writes the plan and only reads results. */
+ if (close(route_sock) || close(test_desc_fd[0]) || close(results_fd[1]))
+ ksft_exit_fail_msg("close(): %m");
+
+ ksft_set_plan(proto_plan + compat_plan);
+
+ if (write_test_plan(test_desc_fd[1]))
+ ksft_exit_fail_msg("Failed to write test plan to pipe");
+
+ ret = check_results();
+
+ if (children_cleanup() == KSFT_FAIL)
+ exit(KSFT_FAIL);
+
+ exit(ret);
+}
diff --git a/tools/testing/selftests/net/ipv6_flowlabel.c b/tools/testing/selftests/net/ipv6_flowlabel.c
new file mode 100644
index 000000000..a7c413753
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_flowlabel.c
@@ -0,0 +1,229 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test IPV6_FLOWINFO cmsg on send and recv */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <asm/byteorder.h>
+#include <error.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/in6.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* uapi/glibc weirdness may leave this undefined */
+#ifndef IPV6_FLOWINFO
+#define IPV6_FLOWINFO 11
+#endif
+
+#ifndef IPV6_FLOWLABEL_MGR
+#define IPV6_FLOWLABEL_MGR 32
+#endif
+
+#define FLOWLABEL_WILDCARD ((uint32_t) -1)
+
+static const char cfg_data[] = "a";
+static uint32_t cfg_label = 1;
+
+/*
+ * Send the cfg_data payload on @fd. When @with_flowlabel is true an
+ * IPV6_FLOWINFO control message carrying htonl(@flowlabel) is attached.
+ * Exits through error() if sendmsg() fails; logs what was sent to stderr.
+ */
+static void do_send(int fd, bool with_flowlabel, uint32_t flowlabel)
+{
+ char control[CMSG_SPACE(sizeof(flowlabel))] = {0};
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ int ret;
+
+ /* sizeof() includes the terminating NUL of cfg_data. */
+ iov.iov_base = (char *)cfg_data;
+ iov.iov_len = sizeof(cfg_data);
+
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ if (with_flowlabel) {
+ struct cmsghdr *cm;
+
+ /* The control buffer holds exactly one cmsg with a 32-bit payload. */
+ cm = (void *)control;
+ cm->cmsg_len = CMSG_LEN(sizeof(flowlabel));
+ cm->cmsg_level = SOL_IPV6;
+ cm->cmsg_type = IPV6_FLOWINFO;
+ *(uint32_t *)CMSG_DATA(cm) = htonl(flowlabel);
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+ }
+
+ ret = sendmsg(fd, &msg, 0);
+ if (ret == -1)
+ error(1, errno, "send");
+
+ if (with_flowlabel)
+ fprintf(stderr, "sent with label %u\n", flowlabel);
+ else
+ fprintf(stderr, "sent without label\n");
+}
+
+/*
+ * Receive one datagram on @fd and validate it.
+ *
+ * The payload must equal cfg_data and must not be truncated. When
+ * @with_flowlabel is true, exactly one IPV6_FLOWINFO control message must be
+ * present and, unless @expect is FLOWLABEL_WILDCARD, its value must equal
+ * @expect. Any violation terminates the test through error().
+ */
+static void do_recv(int fd, bool with_flowlabel, uint32_t expect)
+{
+	char control[CMSG_SPACE(sizeof(expect))];
+	char data[sizeof(cfg_data)];
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	struct cmsghdr *cm;
+	uint32_t flowlabel;
+	int ret;
+
+	iov.iov_base = data;
+	iov.iov_len = sizeof(data);
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	memset(control, 0, sizeof(control));
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+
+	ret = recvmsg(fd, &msg, 0);
+	if (ret == -1)
+		error(1, errno, "recv");
+	if (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))
+		error(1, 0, "recv: truncated");
+	if (ret != sizeof(cfg_data))
+		error(1, 0, "recv: length mismatch");
+	if (memcmp(data, cfg_data, sizeof(data)))
+		error(1, 0, "recv: data mismatch");
+
+	cm = CMSG_FIRSTHDR(&msg);
+	if (with_flowlabel) {
+		if (!cm)
+			error(1, 0, "recv: missing cmsg");
+		if (CMSG_NXTHDR(&msg, cm))
+			error(1, 0, "recv: too many cmsg");
+		if (cm->cmsg_level != SOL_IPV6 ||
+		    cm->cmsg_type != IPV6_FLOWINFO)
+			error(1, 0, "recv: unexpected cmsg level or type");
+
+		flowlabel = ntohl(*(uint32_t *)CMSG_DATA(cm));
+		fprintf(stderr, "recv with label %u\n", flowlabel);
+
+		if (expect != FLOWLABEL_WILDCARD && expect != flowlabel) {
+			fprintf(stderr, "recv: incorrect flowlabel %u != %u\n",
+					flowlabel, expect);
+			/* a wrong label must fail the test, not just be logged */
+			error(1, 0, "recv: flowlabel is wrong");
+		}
+
+	} else {
+		fprintf(stderr, "recv without label\n");
+	}
+}
+
+/*
+ * Report whether the first character of
+ * /proc/sys/net/ipv6/auto_flowlabels is '1'.
+ * Any failure to open, read or close the file terminates the test through
+ * error().
+ */
+static bool get_autoflowlabel_enabled(void)
+{
+	const char *path = "/proc/sys/net/ipv6/auto_flowlabels";
+	char first;
+	int fd, n;
+
+	fd = open(path, O_RDONLY);
+	if (fd == -1)
+		error(1, errno, "open sysctl");
+
+	/* only the first byte matters for the comparison below */
+	n = read(fd, &first, 1);
+	if (n == -1)
+		error(1, errno, "read sysctl");
+	else if (n == 0)
+		error(1, 0, "read sysctl: 0");
+
+	if (close(fd) != 0)
+		error(1, errno, "close sysctl");
+
+	return first == '1';
+}
+
+/*
+ * Issue an IPV6_FL_A_GET request for @label on @fd through the
+ * IPV6_FLOWLABEL_MGR socket option, with the given @share mode and @flags.
+ * Terminates the test through error() if setsockopt() fails.
+ */
+static void flowlabel_get(int fd, uint32_t label, uint8_t share, uint16_t flags)
+{
+ struct in6_flowlabel_req req = {
+ .flr_action = IPV6_FL_A_GET,
+ .flr_label = htonl(label),
+ .flr_flags = flags,
+ .flr_share = share,
+ };
+
+ /* do not pass IPV6_ADDR_ANY or IPV6_ADDR_MAPPED */
+ /* first byte 0xfd, last byte 0x01, all other bytes zero */
+ req.flr_dst.s6_addr[0] = 0xfd;
+ req.flr_dst.s6_addr[15] = 0x1;
+
+ if (setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req)))
+ error(1, errno, "setsockopt flowlabel get");
+}
+
+/*
+ * Parse command-line options. The only option is "-l <label>", which sets
+ * cfg_label; any other option terminates the program through error().
+ */
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "l:")) != -1) {
+ switch (c) {
+ case 'l':
+ /* base 0: decimal, octal (0...) and hex (0x...) are accepted; the
+ * result is not validated */
+ cfg_label = strtoul(optarg, NULL, 0);
+ break;
+ default:
+ error(1, 0, "%s: parse error", argv[0]);
+ }
+ }
+}
+
+/*
+ * Datapath test: a sender socket connected to, and a receiver socket bound
+ * to, the IPv6 loopback address on port 8000. The sender acquires cfg_label,
+ * the receiver enables IPV6_FLOWINFO, and two datagrams are exchanged: one
+ * without and one with an explicit flow label.
+ *
+ * Returns 0 on success; every failure exits through error().
+ */
+int main(int argc, char **argv)
+{
+ struct sockaddr_in6 addr = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(8000),
+ .sin6_addr = IN6ADDR_LOOPBACK_INIT,
+ };
+ const int one = 1;
+ int fdt, fdr;
+
+ parse_opts(argc, argv);
+
+ /* fdt transmits, fdr receives. */
+ fdt = socket(PF_INET6, SOCK_DGRAM, 0);
+ if (fdt == -1)
+ error(1, errno, "socket t");
+
+ fdr = socket(PF_INET6, SOCK_DGRAM, 0);
+ if (fdr == -1)
+ error(1, errno, "socket r");
+
+ if (connect(fdt, (void *)&addr, sizeof(addr)))
+ error(1, errno, "connect");
+ if (bind(fdr, (void *)&addr, sizeof(addr)))
+ error(1, errno, "bind");
+
+ flowlabel_get(fdt, cfg_label, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE);
+
+ if (setsockopt(fdr, SOL_IPV6, IPV6_FLOWINFO, &one, sizeof(one)))
+ error(1, errno, "setsockopt flowinfo");
+
+ /* What to expect for an unlabelled send depends on the auto_flowlabels sysctl. */
+ if (get_autoflowlabel_enabled()) {
+ fprintf(stderr, "send no label: recv auto flowlabel\n");
+ do_send(fdt, false, 0);
+ do_recv(fdr, true, FLOWLABEL_WILDCARD);
+ } else {
+ fprintf(stderr, "send no label: recv no label (auto off)\n");
+ do_send(fdt, false, 0);
+ do_recv(fdr, false, 0);
+ }
+
+ fprintf(stderr, "send label\n");
+ do_send(fdt, true, cfg_label);
+ do_recv(fdr, true, cfg_label);
+
+ if (close(fdr))
+ error(1, errno, "close r");
+ if (close(fdt))
+ error(1, errno, "close t");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/ipv6_flowlabel.sh b/tools/testing/selftests/net/ipv6_flowlabel.sh
new file mode 100755
index 000000000..d3bc64427
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_flowlabel.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Regression tests for IPv6 flowlabels
+#
+# run in separate namespaces to avoid mgmt db conflicts betweent tests
+
+set -e
+
+echo "TEST management"
+./in_netns.sh ./ipv6_flowlabel_mgr
+
+echo "TEST datapath"
+./in_netns.sh \
+ sh -c 'sysctl -q -w net.ipv6.auto_flowlabels=0 && ./ipv6_flowlabel -l 1'
+
+echo "TEST datapath (with auto-flowlabels)"
+./in_netns.sh \
+ sh -c 'sysctl -q -w net.ipv6.auto_flowlabels=1 && ./ipv6_flowlabel -l 1'
+
+echo OK. All tests passed
diff --git a/tools/testing/selftests/net/ipv6_flowlabel_mgr.c b/tools/testing/selftests/net/ipv6_flowlabel_mgr.c
new file mode 100644
index 000000000..af95b48ac
--- /dev/null
+++ b/tools/testing/selftests/net/ipv6_flowlabel_mgr.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test IPV6_FLOWLABEL_MGR */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/in6.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* uapi/glibc weirdness may leave this undefined */
+#ifndef IPV6_FLOWLABEL_MGR
+#define IPV6_FLOWLABEL_MGR 32
+#endif
+
+/* from net/ipv6/ip6_flowlabel.c */
+#define FL_MIN_LINGER 6
+
+/* Print a description of the next check to stderr, but only when cfg_verbose is set. */
+#define explain(x) \
+ do { if (cfg_verbose) fprintf(stderr, " " x "\n"); } while (0)
+
+/*
+ * Core check: logs "[OK]" when the expression evaluates to 0 and aborts the
+ * test through error() otherwise. The flowlabel helpers in this file return
+ * setsockopt()'s result, so 0 means "the call succeeded".
+ */
+#define __expect(x) \
+ do { \
+ if (!(x)) \
+ fprintf(stderr, "[OK] " #x "\n"); \
+ else \
+ error(1, 0, "[ERR] " #x " (line %d)", __LINE__); \
+ } while (0)
+
+/* expect_pass(x): x must be 0; expect_fail(x): x must be non-zero. */
+#define expect_pass(x) __expect(x)
+#define expect_fail(x) __expect(!(x))
+
+static bool cfg_long_running;
+static bool cfg_verbose;
+
+/*
+ * Issue an IPV6_FL_A_GET request for @label on @fd with the given @share
+ * mode and @flags. Returns the setsockopt() result unchanged so the caller
+ * can assert on either success or failure.
+ */
+static int flowlabel_get(int fd, uint32_t label, uint8_t share, uint16_t flags)
+{
+ struct in6_flowlabel_req req = {
+ .flr_action = IPV6_FL_A_GET,
+ .flr_label = htonl(label),
+ .flr_flags = flags,
+ .flr_share = share,
+ };
+
+ /* do not pass IPV6_ADDR_ANY or IPV6_ADDR_MAPPED */
+ /* first byte 0xfd, last byte 0x01, all other bytes zero */
+ req.flr_dst.s6_addr[0] = 0xfd;
+ req.flr_dst.s6_addr[15] = 0x1;
+
+ return setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req));
+}
+
+/*
+ * Issue an IPV6_FL_A_PUT request for @label on @fd. Returns the
+ * setsockopt() result unchanged.
+ */
+static int flowlabel_put(int fd, uint32_t label)
+{
+ struct in6_flowlabel_req req = {
+ .flr_action = IPV6_FL_A_PUT,
+ .flr_label = htonl(label),
+ };
+
+ return setsockopt(fd, SOL_IPV6, IPV6_FLOWLABEL_MGR, &req, sizeof(req));
+}
+
+/*
+ * Exercise the flow label manager on socket @fd: get/put reference
+ * counting, the exclusive, per-user and per-process share modes, and (only
+ * when cfg_long_running is set) label reuse after the linger period.
+ *
+ * Every check goes through expect_pass()/expect_fail(), which abort the
+ * test on the first mismatch. Two checks run in forked children whose exit
+ * status must be 0.
+ */
+static void run_tests(int fd)
+{
+	int wstatus;
+	pid_t pid;
+
+	explain("cannot get non-existent label");
+	expect_fail(flowlabel_get(fd, 1, IPV6_FL_S_ANY, 0));
+
+	explain("cannot put non-existent label");
+	expect_fail(flowlabel_put(fd, 1));
+
+	explain("cannot create label greater than 20 bits");
+	expect_fail(flowlabel_get(fd, 0x1FFFFF, IPV6_FL_S_ANY,
+				  IPV6_FL_F_CREATE));
+
+	explain("create a new label (FL_F_CREATE)");
+	expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, IPV6_FL_F_CREATE));
+	explain("can get the label (without FL_F_CREATE)");
+	expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, 0));
+	explain("can get it again with create flag set, too");
+	expect_pass(flowlabel_get(fd, 1, IPV6_FL_S_ANY, IPV6_FL_F_CREATE));
+	explain("cannot get it again with the exclusive (FL_F_EXCL) flag");
+	expect_fail(flowlabel_get(fd, 1, IPV6_FL_S_ANY,
+				  IPV6_FL_F_CREATE | IPV6_FL_F_EXCL));
+	/* three successful gets above, so exactly three puts must succeed */
+	explain("can now put exactly three references");
+	expect_pass(flowlabel_put(fd, 1));
+	expect_pass(flowlabel_put(fd, 1));
+	expect_pass(flowlabel_put(fd, 1));
+	expect_fail(flowlabel_put(fd, 1));
+
+	explain("create a new exclusive label (FL_S_EXCL)");
+	expect_pass(flowlabel_get(fd, 2, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE));
+	explain("cannot get it again in non-exclusive mode");
+	expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_ANY, IPV6_FL_F_CREATE));
+	explain("cannot get it again in exclusive mode either");
+	expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_EXCL, IPV6_FL_F_CREATE));
+	expect_pass(flowlabel_put(fd, 2));
+
+	if (cfg_long_running) {
+		explain("cannot reuse the label, due to linger");
+		expect_fail(flowlabel_get(fd, 2, IPV6_FL_S_ANY,
+					  IPV6_FL_F_CREATE));
+		explain("after sleep, can reuse");
+		sleep(FL_MIN_LINGER * 2 + 1);
+		expect_pass(flowlabel_get(fd, 2, IPV6_FL_S_ANY,
+					  IPV6_FL_F_CREATE));
+	}
+
+	explain("create a new user-private label (FL_S_USER)");
+	expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, IPV6_FL_F_CREATE));
+	explain("cannot get it again in non-exclusive mode");
+	expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_ANY, 0));
+	explain("cannot get it again in exclusive mode");
+	expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_EXCL, 0));
+	explain("can get it again in user mode");
+	expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0));
+	explain("child process can get it too, but not after setuid(nobody)");
+	pid = fork();
+	if (pid == -1)
+		error(1, errno, "fork");
+	if (!pid) {
+		expect_pass(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0));
+		if (setuid(USHRT_MAX))
+			fprintf(stderr, "[INFO] skip setuid child test\n");
+		else
+			expect_fail(flowlabel_get(fd, 3, IPV6_FL_S_USER, 0));
+		exit(0);
+	}
+	if (wait(&wstatus) == -1)
+		error(1, errno, "wait");
+	/* wait() succeeded, so errno is stale here: report without it */
+	if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0)
+		error(1, 0, "wait: unexpected child result");
+
+	explain("create a new process-private label (FL_S_PROCESS)");
+	expect_pass(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, IPV6_FL_F_CREATE));
+	explain("can get it again");
+	expect_pass(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, 0));
+	explain("child process cannot get it");
+	pid = fork();
+	if (pid == -1)
+		error(1, errno, "fork");
+	if (!pid) {
+		expect_fail(flowlabel_get(fd, 4, IPV6_FL_S_PROCESS, 0));
+		exit(0);
+	}
+	if (wait(&wstatus) == -1)
+		error(1, errno, "wait");
+	if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0)
+		error(1, 0, "wait: unexpected child result");
+}
+
+/*
+ * Parse command-line options: "-l" sets cfg_long_running and "-v" sets
+ * cfg_verbose. Any other option terminates the program through error().
+ */
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "lv")) != -1) {
+ switch (c) {
+ case 'l':
+ cfg_long_running = true;
+ break;
+ case 'v':
+ cfg_verbose = true;
+ break;
+ default:
+ error(1, 0, "%s: parse error", argv[0]);
+ }
+ }
+}
+
+/*
+ * Open one IPv6 datagram socket, run the flow label manager tests on it and
+ * close it again. Returns 0 on success; failures exit through error().
+ */
+int main(int argc, char **argv)
+{
+ int fd;
+
+ parse_opts(argc, argv);
+
+ fd = socket(PF_INET6, SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket");
+
+ run_tests(fd);
+
+ if (close(fd))
+ error(1, errno, "close");
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/l2tp.sh b/tools/testing/selftests/net/l2tp.sh
new file mode 100755
index 000000000..578243388
--- /dev/null
+++ b/tools/testing/selftests/net/l2tp.sh
@@ -0,0 +1,382 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# L2TPv3 tunnel between 2 hosts
+#
+# host-1 | router | host-2
+# | |
+# lo l2tp | | l2tp lo
+# 172.16.101.1 172.16.1.1 | | 172.16.1.2 172.16.101.2
+# fc00:101::1 fc00:1::1 | | fc00:1::2 fc00:101::2
+# | |
+# eth0 | | eth0
+# 10.1.1.1 | | 10.1.2.1
+# 2001:db8:1::1 | | 2001:db8:2::1
+
+# Defaults; overridden by the -v and -p command line options.
+VERBOSE=0
+PAUSE_ON_FAIL=no
+
+# Use a dedicated ping6 binary when there is one, otherwise fall back to ping.
+# 'command -v' is a shell builtin, so this does not depend on 'which' being
+# installed, and the explicit if/else cannot run both branches.
+if command -v ping6 > /dev/null 2>&1; then
+	ping6=$(command -v ping6)
+else
+	ping6=$(command -v ping)
+fi
+
+################################################################################
+#
+# log_test RC EXPECTED MSG
+#
+# Print one result line for MSG: OK when RC equals EXPECTED, FAIL otherwise.
+# Updates the global counters nsuccess/nfail and sets the global ret to 1 on
+# failure. When PAUSE_ON_FAIL is "yes" a failure waits for the user; answering
+# 'q' exits the script with status 1.
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+	local answer
+
+	# Quoted so that an empty operand is reported by test as an error on
+	# that operand instead of producing a malformed expression.
+	if [ "${rc}" -eq "${expected}" ]; then
+		printf "TEST: %-60s [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "TEST: %-60s [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			# -r: take the answer literally; local: do not leak it
+			read -r answer
+			[ "$answer" = "q" ] && exit 1
+		fi
+	fi
+}
+
+# run_cmd NS CMD...
+#
+# Run CMD inside network namespace NS via "ip netns exec" and return its exit
+# status. Output (stdout and stderr) is captured and only shown, together
+# with the command line itself, when VERBOSE is "1".
+run_cmd()
+{
+	local ns
+	local cmd
+	local out
+	local rc
+
+	ns="$1"
+	shift
+	cmd="$*"
+
+	if [ "$VERBOSE" = "1" ]; then
+		# The command is data, not a format string: a '%' or a
+		# backslash in it must be printed literally.
+		printf " COMMAND: %s\n" "$cmd"
+	fi
+
+	out=$(eval ip netns exec ${ns} ${cmd} 2>&1)
+	rc=$?
+	if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+		echo " $out"
+	fi
+
+	[ "$VERBOSE" = "1" ] && echo
+
+	return $rc
+}
+
+################################################################################
+# create namespaces and interconnects
+
+# create_ns NAME [ADDR [ADDR6]]
+#
+# Create namespace NAME, bring its loopback up and optionally assign ADDR /
+# ADDR6 to it ("-" or an empty value means "no address"). A low-priority
+# unreachable default route is installed for both families and the
+# forwarding related sysctls are set.
+create_ns()
+{
+	local ns=$1
+	local addr=${2:--}
+	local addr6=${3:--}
+	local knob
+
+	ip netns add ${ns}
+
+	ip -netns ${ns} link set lo up
+	[ "${addr}" = "-" ] || ip -netns ${ns} addr add dev lo ${addr}
+	[ "${addr6}" = "-" ] || ip -netns ${ns} -6 addr add dev lo ${addr6}
+
+	ip -netns ${ns} ro add unreachable default metric 8192
+	ip -netns ${ns} -6 ro add unreachable default metric 8192
+
+	for knob in \
+		net.ipv4.ip_forward=1 \
+		net.ipv6.conf.all.keep_addr_on_down=1 \
+		net.ipv6.conf.all.forwarding=1 \
+		net.ipv6.conf.default.forwarding=1 \
+		net.ipv6.conf.default.accept_dad=0
+	do
+		ip netns exec ${ns} sysctl -qw ${knob}
+	done
+}
+
+# connect_ns NS1 DEV1 ADDR1 ADDR6_1 NS2 DEV2 ADDR2 ADDR6_2
+#
+# Link two namespaces with a veth pair (DEV1 in NS1, DEV2 in NS2), bring both
+# ends up and assign the addresses. Whether a family is configured at all is
+# decided by the NS1 value alone: "-" skips that family on both ends.
+connect_ns()
+{
+	local ns1=$1 ns1_dev=$2 ns1_addr=$3 ns1_addr6=$4
+	local ns2=$5 ns2_dev=$6 ns2_addr=$7 ns2_addr6=$8
+
+	# the peer is created under a temporary name, then moved and renamed
+	ip -netns ${ns1} li add ${ns1_dev} type veth peer name tmp
+	ip -netns ${ns1} li set ${ns1_dev} up
+	ip -netns ${ns1} li set tmp netns ${ns2} name ${ns2_dev}
+	ip -netns ${ns2} li set ${ns2_dev} up
+
+	[ "${ns1_addr}" = "-" ] || {
+		ip -netns ${ns1} addr add dev ${ns1_dev} ${ns1_addr}
+		ip -netns ${ns2} addr add dev ${ns2_dev} ${ns2_addr}
+	}
+
+	[ "${ns1_addr6}" = "-" ] || {
+		ip -netns ${ns1} addr add dev ${ns1_dev} ${ns1_addr6}
+		ip -netns ${ns2} addr add dev ${ns2_dev} ${ns2_addr6}
+	}
+}
+
+################################################################################
+# test setup
+
+# Delete the three test namespaces; errors (e.g. a namespace that does not
+# exist) are discarded.
+cleanup()
+{
+	ip netns del host-1 2>/dev/null
+	ip netns del host-2 2>/dev/null
+	ip netns del router 2>/dev/null
+}
+
+# Create the L2TPv3 tunnel that runs over the IPv4 underlay: one tunnel and
+# one session (device l2tp4) per host, point-to-point addresses on the
+# devices, and a route to the peer's loopback address through the tunnel.
+setup_l2tp_ipv4()
+{
+	local h1_eth=10.1.1.1 h2_eth=10.1.2.1
+	local h1_tun=172.16.1.1 h2_tun=172.16.1.2
+	local h1_id=1041 h2_id=1042
+
+	# host-1 end
+	ip -netns host-1 l2tp add tunnel tunnel_id ${h1_id} peer_tunnel_id ${h2_id} \
+		encap ip local ${h1_eth} remote ${h2_eth}
+	ip -netns host-1 l2tp add session name l2tp4 tunnel_id ${h1_id} \
+		session_id ${h1_id} peer_session_id ${h2_id}
+	ip -netns host-1 link set dev l2tp4 up
+	ip -netns host-1 addr add dev l2tp4 ${h1_tun} peer ${h2_tun}
+
+	# host-2 end, ids and addresses mirrored
+	ip -netns host-2 l2tp add tunnel tunnel_id ${h2_id} peer_tunnel_id ${h1_id} \
+		encap ip local ${h2_eth} remote ${h1_eth}
+	ip -netns host-2 l2tp add session name l2tp4 tunnel_id ${h2_id} \
+		session_id ${h2_id} peer_session_id ${h1_id}
+	ip -netns host-2 link set dev l2tp4 up
+	ip -netns host-2 addr add dev l2tp4 ${h2_tun} peer ${h1_tun}
+
+	# loopback addresses are reached through the tunnel
+	ip -netns host-1 ro add 172.16.101.2/32 via ${h2_tun}
+	ip -netns host-2 ro add 172.16.101.1/32 via ${h1_tun}
+}
+
+# IPv6 counterpart of the IPv4 tunnel setup: one tunnel and one session
+# (device l2tp6) per host over the 2001:db8::/32 underlay addresses,
+# point-to-point fc00:1:: addresses on the devices, and a route to the
+# peer's loopback address through the tunnel.
+setup_l2tp_ipv6()
+{
+	local h1_eth=2001:db8:1::1 h2_eth=2001:db8:2::1
+	local h1_tun=fc00:1::1 h2_tun=fc00:1::2
+	local h1_id=1061 h2_id=1062
+
+	# host-1 end
+	ip -netns host-1 l2tp add tunnel tunnel_id ${h1_id} peer_tunnel_id ${h2_id} \
+		encap ip local ${h1_eth} remote ${h2_eth}
+	ip -netns host-1 l2tp add session name l2tp6 tunnel_id ${h1_id} \
+		session_id ${h1_id} peer_session_id ${h2_id}
+	ip -netns host-1 link set dev l2tp6 up
+	ip -netns host-1 addr add dev l2tp6 ${h1_tun} peer ${h2_tun}
+
+	# host-2 end, ids and addresses mirrored
+	ip -netns host-2 l2tp add tunnel tunnel_id ${h2_id} peer_tunnel_id ${h1_id} \
+		encap ip local ${h2_eth} remote ${h1_eth}
+	ip -netns host-2 l2tp add session name l2tp6 tunnel_id ${h2_id} \
+		session_id ${h2_id} peer_session_id ${h1_id}
+	ip -netns host-2 link set dev l2tp6 up
+	ip -netns host-2 addr add dev l2tp6 ${h2_tun} peer ${h1_tun}
+
+	# loopback addresses are reached through the tunnel
+	ip -netns host-1 -6 ro add fc00:101::2/128 via ${h2_tun}
+	ip -netns host-2 -6 ro add fc00:101::1/128 via ${h1_tun}
+}
+
+# Build the whole test topology: three namespaces (host-1, host-2, router),
+# a veth link from each host to the router, static routes between the two
+# host subnets, and finally the IPv4 and IPv6 L2TP tunnels.
+setup()
+{
+ # start clean
+ cleanup
+
+ # from here until 'set +e' any failing command aborts the script
+ set -e
+ create_ns host-1 172.16.101.1/32 fc00:101::1/128
+ create_ns host-2 172.16.101.2/32 fc00:101::2/128
+ create_ns router
+
+ # host-1:eth0 <-> router:eth1
+ connect_ns host-1 eth0 10.1.1.1/24 2001:db8:1::1/64 \
+ router eth1 10.1.1.2/24 2001:db8:1::2/64
+
+ # host-2:eth0 <-> router:eth2
+ connect_ns host-2 eth0 10.1.2.1/24 2001:db8:2::1/64 \
+ router eth2 10.1.2.2/24 2001:db8:2::2/64
+
+ # each host reaches the other host's subnet via the router
+ ip -netns host-1 ro add 10.1.2.0/24 via 10.1.1.2
+ ip -netns host-1 -6 ro add 2001:db8:2::/64 via 2001:db8:1::2
+
+ ip -netns host-2 ro add 10.1.1.0/24 via 10.1.2.2
+ ip -netns host-2 -6 ro add 2001:db8:1::/64 via 2001:db8:2::2
+
+ setup_l2tp_ipv4
+ setup_l2tp_ipv6
+ set +e
+}
+
+# Install ESP transport-mode policies and states between the two hosts'
+# eth0 addresses, first for IPv4 and then for IPv6.
+#
+# Policies go through run_cmd; states are added with ip directly because
+# the algorithm name contains parentheses, which would not survive the eval
+# inside run_cmd. Both directions use the same key; SPI 0x1000 is used for
+# host-1 -> host-2 and 0x1001 for the reverse direction.
+setup_ipsec()
+{
+	local key=0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
+	local algo='rfc4106(gcm(aes))'
+	local fam h1 h2 host
+
+	# fam is the extra ip option selecting the family: empty for IPv4
+	for fam in "" "-6"; do
+		if [ -z "${fam}" ]; then
+			h1=10.1.1.1
+			h2=10.1.2.1
+		else
+			h1=2001:db8:1::1
+			h2=2001:db8:2::1
+		fi
+
+		run_cmd host-1 ip ${fam} xfrm policy add \
+			src ${h1} dst ${h2} dir out \
+			tmpl proto esp mode transport
+
+		run_cmd host-1 ip ${fam} xfrm policy add \
+			src ${h2} dst ${h1} dir in \
+			tmpl proto esp mode transport
+
+		run_cmd host-2 ip ${fam} xfrm policy add \
+			src ${h1} dst ${h2} dir in \
+			tmpl proto esp mode transport
+
+		run_cmd host-2 ip ${fam} xfrm policy add \
+			src ${h2} dst ${h1} dir out \
+			tmpl proto esp mode transport
+
+		for host in host-1 host-2; do
+			ip -netns ${host} ${fam} xfrm state add \
+				src ${h1} dst ${h2} \
+				spi 0x1000 proto esp aead "${algo}" \
+				${key} 128 mode transport
+
+			ip -netns ${host} ${fam} xfrm state add \
+				src ${h2} dst ${h1} \
+				spi 0x1001 proto esp aead "${algo}" \
+				${key} 128 mode transport
+		done
+	done
+}
+
+# Flush all xfrm states and policies on both hosts.
+teardown_ipsec()
+{
+	local host
+
+	for host in host-1 host-2; do
+		run_cmd ${host} ip xfrm state flush
+		run_cmd ${host} ip xfrm policy flush
+	done
+}
+
+################################################################################
+# generate traffic through tunnel for various cases
+
+# run_ping [DESC]
+#
+# Send one ping from host-1 for each of the four cases (IPv4/IPv6, tunnel
+# peer address / loopback address routed through the tunnel) and log each
+# result. DESC is appended to every test name.
+run_ping()
+{
+	local suffix="$1"
+	local rc
+
+	run_cmd host-1 ping -c1 -w1 172.16.1.2
+	rc=$?
+	log_test ${rc} 0 "IPv4 basic L2TP tunnel ${suffix}"
+
+	run_cmd host-1 ping -c1 -w1 -I 172.16.101.1 172.16.101.2
+	rc=$?
+	log_test ${rc} 0 "IPv4 route through L2TP tunnel ${suffix}"
+
+	run_cmd host-1 ${ping6} -c1 -w1 fc00:1::2
+	rc=$?
+	log_test ${rc} 0 "IPv6 basic L2TP tunnel ${suffix}"
+
+	run_cmd host-1 ${ping6} -c1 -w1 -I fc00:101::1 fc00:101::2
+	rc=$?
+	log_test ${rc} 0 "IPv6 route through L2TP tunnel ${suffix}"
+}
+
+# Run the ping test set three times: over the plain tunnel, with IPsec
+# protecting the underlay, and again after IPsec has been torn down.
+#
+# The IPsec stage relies on run_ping alone: the hand-written copies of the
+# same four pings that used to follow it were redundant and two of them were
+# logged with an empty description (the variable they used was never set).
+run_tests()
+{
+	setup
+	run_ping
+
+	setup_ipsec
+	run_ping "- with IPsec"
+
+	teardown_ipsec
+	run_ping "- after IPsec teardown"
+}
+
+################################################################################
+# main
+
+declare -i nfail=0
+declare -i nsuccess=0
+# overall result; log_test sets it to 1 on the first failure
+ret=0
+
+# -p: pause after a failed test, -v: verbose command output
+while getopts :pv o
+do
+	case $o in
+		p) PAUSE_ON_FAIL=yes;;
+		v) VERBOSE=$(($VERBOSE + 1));;
+		*) exit 1;;
+	esac
+done
+
+run_tests
+cleanup
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+# Report failures to the caller; without this the script always exited 0.
+exit ${ret}
diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore
new file mode 100644
index 000000000..260336d5f
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+mptcp_connect
+pm_nl_ctl
+*.pcap
diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile
new file mode 100644
index 000000000..7072ef1c0
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# Root of the kernel source tree, relative to this directory.
+top_srcdir = ../../../../..
+# NOTE(review): presumably tells lib.mk to install the kernel headers used by
+# the -I path below -- confirm against lib.mk.
+KSFT_KHDR_INSTALL := 1
+
+CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include
+
+# Test scripts.
+TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh diag.sh \
+ simult_flows.sh
+
+# Helper programs built from the C sources in this directory.
+TEST_GEN_FILES = mptcp_connect pm_nl_ctl
+
+# Support files that are not tests themselves.
+TEST_FILES := mptcp_lib.sh settings
+
+# Packet captures left behind by the scripts.
+EXTRA_CLEAN := *.pcap
+
+include ../../lib.mk
diff --git a/tools/testing/selftests/net/mptcp/config b/tools/testing/selftests/net/mptcp/config
new file mode 100644
index 000000000..8867c4025
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/config
@@ -0,0 +1,9 @@
+CONFIG_KALLSYMS=y
+CONFIG_MPTCP=y
+CONFIG_IPV6=y
+CONFIG_MPTCP_IPV6=y
+CONFIG_INET_DIAG=m
+CONFIG_INET_MPTCP_DIAG=m
+CONFIG_VETH=y
+CONFIG_NET_SCH_NETEM=m
+CONFIG_SYN_COOKIES=y
diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh
new file mode 100755
index 000000000..34577d469
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/diag.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(dirname "${0}")/mptcp_lib.sh"
+
+# $sec is not assigned anywhere in this script, so the suffix used to start
+# with a constant "0". Seed it with the current time unless a value is
+# already present.
+: "${sec:=$(date +%s)}"
+# random suffix that keeps concurrent runs from sharing a namespace name
+rndh=$(printf %x "$sec")-$(mktemp -u XXXXXX)
+ns="ns1-$rndh"
+ksft_skip=4
+# number of the next check; ret records the number of the last failed one
+test_cnt=1
+ret=0
+pids=()
+
+# Send SIGUSR1 to every recorded helper process that is still alive, then
+# forget all recorded pids.
+flush_pids()
+{
+	# mptcp_connect in join mode will sleep a bit before completing,
+	# give it some time
+	sleep 1.1
+
+	for pid in "${pids[@]}"; do
+		if [ -d /proc/$pid ]; then
+			kill -SIGUSR1 $pid >/dev/null 2>&1
+		fi
+	done
+	pids=()
+}
+
+# Exit handler: remove the test namespace and SIGKILL any recorded helper
+# process that is still running.
+cleanup()
+{
+	ip netns del $ns
+	for pid in "${pids[@]}"; do
+		if [ -d /proc/$pid ]; then
+			kill -9 $pid >/dev/null 2>&1
+		fi
+	done
+}
+
+mptcp_lib_check_mptcp
+
+# Skip (kselftest skip code) when the required tools are missing.
+if ! ip -Version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+if ! ss -h | grep -q MPTCP; then
+	echo "SKIP: ss tool does not support MPTCP"
+	exit $ksft_skip
+fi
+
+# __chk_nr CONDITION EXPECTED MSG...
+#
+# Pipe the ss socket dump for namespace $ns through the command CONDITION
+# (e.g. "grep -c token:") and compare its output with EXPECTED. Prints MSG
+# followed by the verdict; on mismatch the global ret is set to the number
+# of this check. test_cnt is incremented in both cases.
+__chk_nr()
+{
+	local condition="$1"
+	local expected=$2
+	local msg nr
+
+	shift 2
+	msg=$*
+	nr=$(ss -inmHMN $ns | $condition)
+
+	printf "%-50s" "$msg"
+	# Quoted: with an empty $nr the unquoted test was a syntax error whose
+	# non-zero status was taken for "values are equal" and reported as ok.
+	if [ "$nr" != "$expected" ]; then
+		echo "[ fail ] expected $expected found $nr"
+		ret=$test_cnt
+	else
+		echo "[ ok ]"
+	fi
+	test_cnt=$((test_cnt+1))
+}
+
+# chk_msk_nr EXPECTED MSG...: check the number of ss lines containing "token:".
+chk_msk_nr()
+{
+	# "$@" hands the message over as given, without re-splitting or globbing
+	__chk_nr "grep -c token:" "$@"
+}
+
+# chk_msk_fallback_nr EXPECTED MSG...: check the number of ss lines
+# containing "fallback".
+chk_msk_fallback_nr()
+{
+	# "$@" hands the message over as given, without re-splitting or globbing
+	__chk_nr "grep -c fallback" "$@"
+}
+
+# chk_msk_remote_key_nr EXPECTED MSG...: check the number of ss lines
+# containing "remote_key".
+chk_msk_remote_key_nr()
+{
+	# "$@" hands the message over as given, without re-splitting or globbing
+	__chk_nr "grep -c remote_key" "$@"
+}
+
+
+# Create the namespace; cleanup removes it again on any exit path.
+trap cleanup EXIT
+ip netns add $ns
+ip -n $ns link set dev lo up
+
+# Scenario 1: a listener alone must not show up as an msk; after a client
+# connects, two msk sockets with a remote key and no fallback are expected.
+echo "a" | ip netns exec $ns ./mptcp_connect -p 10000 -l 0.0.0.0 -t 100 >/dev/null &
+sleep 0.1
+pids[0]=$!
+chk_msk_nr 0 "no msk on netns creation"
+
+echo "b" | ip netns exec $ns ./mptcp_connect -p 10000 127.0.0.1 -j -t 100 >/dev/null &
+sleep 0.1
+pids[1]=$!
+chk_msk_nr 2 "after MPC handshake "
+chk_msk_remote_key_nr 2 "....chk remote_key"
+chk_msk_fallback_nr 0 "....chk no fallback"
+flush_pids
+
+
+# Scenario 2: the listener uses plain TCP (-s TCP), the client the default
+# protocol; exactly one socket is expected to report fallback.
+echo "a" | ip netns exec $ns ./mptcp_connect -p 10001 -s TCP -l 0.0.0.0 -t 100 >/dev/null &
+pids[0]=$!
+sleep 0.1
+echo "b" | ip netns exec $ns ./mptcp_connect -p 10001 127.0.0.1 -j -t 100 >/dev/null &
+pids[1]=$!
+sleep 0.1
+chk_msk_fallback_nr 1 "check fallback"
+flush_pids
+
+# Scenario 3: NR_CLIENTS listener/client pairs on consecutive ports; every
+# pair is expected to contribute two msk sockets. Listener pids are stored
+# at even indexes, client pids at odd ones.
+NR_CLIENTS=100
+for I in `seq 1 $NR_CLIENTS`; do
+ echo "a" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) -l 0.0.0.0 -t 100 -w 10 >/dev/null &
+ pids[$((I*2))]=$!
+done
+sleep 0.1
+
+for I in `seq 1 $NR_CLIENTS`; do
+ echo "b" | ip netns exec $ns ./mptcp_connect -p $((I+10001)) 127.0.0.1 -t 100 -w 10 >/dev/null &
+ pids[$((I*2 + 1))]=$!
+done
+sleep 1.5
+
+chk_msk_nr $((NR_CLIENTS*2)) "many msk socket present"
+flush_pids
+
+# 0 when every check passed, otherwise the number of the last failed check
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c
new file mode 100644
index 000000000..37c1ec888
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c
@@ -0,0 +1,912 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <sys/poll.h>
+#include <sys/random.h>
+#include <sys/sendfile.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+
+#include <netdb.h>
+#include <netinet/in.h>
+
+#include <linux/tcp.h>
+
+extern int optind;
+
+#ifndef IPPROTO_MPTCP
+#define IPPROTO_MPTCP 262
+#endif
+#ifndef TCP_ULP
+#define TCP_ULP 31
+#endif
+
+/* timeout handed to poll(), in milliseconds; -t replaces it */
+static int poll_timeout = 10 * 1000;
+/* set by -l: accept a connection instead of connecting */
+static bool listen_mode;
+/* set from the SIGUSR1 handler */
+static bool quit;
+
+/* how the payload is moved between stdin/stdout and the peer socket */
+enum cfg_mode {
+ CFG_MODE_POLL,
+ CFG_MODE_MMAP,
+ CFG_MODE_SENDFILE,
+};
+
+static enum cfg_mode cfg_mode = CFG_MODE_POLL;
+/* the single non-option argument: address to connect to or listen on */
+static const char *cfg_host;
+static const char *cfg_port = "12000";
+/* protocol passed to socket(): IPPROTO_MPTCP unless "-s TCP" is given */
+static int cfg_sock_proto = IPPROTO_MPTCP;
+/* set by -u: only run the TCP_ULP check */
+static bool tcpulp_audit;
+/* address family; -6 or a ':' in the host selects AF_INET6 */
+static int pf = AF_INET;
+/* SO_SNDBUF / SO_RCVBUF values, applied only when non-zero */
+static int cfg_sndbuf;
+static int cfg_rcvbuf;
+/* set by -j / -r; both force poll mode and a default cfg_wait */
+static bool cfg_join;
+static bool cfg_remove;
+/* delay passed to usleep(), i.e. in microseconds */
+static int cfg_wait;
+
+/* Print the option summary on stderr and exit with status 1. */
+static void die_usage(void)
+{
+	/* the trailing space keeps "[-m mode]" and "[-l]" apart once the two
+	 * literals are concatenated
+	 */
+	fprintf(stderr, "Usage: mptcp_connect [-6] [-u] [-s MPTCP|TCP] [-p port] [-m mode] "
+		"[-l] [-w sec] connect_address\n");
+	fprintf(stderr, "\t-6 use ipv6\n");
+	fprintf(stderr, "\t-l -- listen on connect_address and accept one connection\n");
+	fprintf(stderr, "\t-t num -- set poll timeout to num\n");
+	fprintf(stderr, "\t-S num -- set SO_SNDBUF to num\n");
+	fprintf(stderr, "\t-R num -- set SO_RCVBUF to num\n");
+	fprintf(stderr, "\t-p num -- use port num\n");
+	fprintf(stderr, "\t-s [MPTCP|TCP] -- use mptcp(default) or tcp sockets\n");
+	fprintf(stderr, "\t-m [poll|mmap|sendfile] -- use poll(default)/mmap+write/sendfile\n");
+	fprintf(stderr, "\t-u -- check mptcp ulp\n");
+	fprintf(stderr, "\t-w num -- wait num sec before closing the socket\n");
+	exit(1);
+}
+
+/* Signal handler (installed for SIGUSR1 in main): only records the event. */
+static void handle_signal(int nr)
+{
+ quit = true;
+}
+
+/*
+ * Translate a getaddrinfo()/getnameinfo() error code into text. EAI_SYSTEM
+ * means the real cause is in errno.
+ */
+static const char *getxinfo_strerr(int err)
+{
+	return err == EAI_SYSTEM ? strerror(errno) : gai_strerror(err);
+}
+
+/*
+ * getnameinfo() wrapper that always asks for numeric host and service
+ * strings and terminates the program on failure.
+ */
+static void xgetnameinfo(const struct sockaddr *addr, socklen_t addrlen,
+			 char *host, socklen_t hostlen,
+			 char *serv, socklen_t servlen)
+{
+	int rc;
+
+	rc = getnameinfo(addr, addrlen, host, hostlen, serv, servlen,
+			 NI_NUMERICHOST | NI_NUMERICSERV);
+	if (rc == 0)
+		return;
+
+	fprintf(stderr, "Fatal: getnameinfo: %s\n", getxinfo_strerr(rc));
+	exit(1);
+}
+
+/*
+ * getaddrinfo() wrapper that terminates the program on failure; on return
+ * *res holds a list the caller has to release with freeaddrinfo().
+ */
+static void xgetaddrinfo(const char *node, const char *service,
+			 const struct addrinfo *hints,
+			 struct addrinfo **res)
+{
+	int rc = getaddrinfo(node, service, hints, res);
+
+	if (rc == 0)
+		return;
+
+	fprintf(stderr, "Fatal: getaddrinfo(%s:%s): %s\n",
+		node ? node : "", service ? service : "",
+		getxinfo_strerr(rc));
+	exit(1);
+}
+
+/* Set SO_RCVBUF on fd to size; terminates the program on failure. */
+static void set_rcvbuf(int fd, unsigned int size)
+{
+	if (!setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &size, sizeof(size)))
+		return;
+
+	perror("set SO_RCVBUF");
+	exit(1);
+}
+
+/* Set SO_SNDBUF on fd to size; terminates the program on failure. */
+static void set_sndbuf(int fd, unsigned int size)
+{
+	if (!setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &size, sizeof(size)))
+		return;
+
+	perror("set SO_SNDBUF");
+	exit(1);
+}
+
+/*
+ * Create a listening socket bound to listenaddr:port, using address family
+ * pf and protocol cfg_sock_proto. listenaddr must be a numeric address.
+ *
+ * Returns the listening descriptor, or -1 when no usable socket could be
+ * created, bound or put into listening state.
+ */
+static int sock_listen_mptcp(const char * const listenaddr,
+			     const char * const port)
+{
+	struct addrinfo hints = {
+		.ai_protocol = IPPROTO_TCP,
+		.ai_socktype = SOCK_STREAM,
+		.ai_flags = AI_PASSIVE | AI_NUMERICHOST
+	};
+	struct addrinfo *a, *addr;
+	/* stays -1 when no candidate address yields a bound socket */
+	int sock = -1;
+	int one = 1;
+
+	hints.ai_family = pf;
+
+	xgetaddrinfo(listenaddr, port, &hints, &addr);
+
+	for (a = addr; a; a = a->ai_next) {
+		sock = socket(a->ai_family, a->ai_socktype, cfg_sock_proto);
+		if (sock < 0)
+			continue;
+
+		/* best effort: a failure here is reported but not fatal */
+		if (-1 == setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &one,
+				     sizeof(one)))
+			perror("setsockopt");
+
+		if (bind(sock, a->ai_addr, a->ai_addrlen) == 0)
+			break; /* success */
+
+		perror("bind");
+		close(sock);
+		sock = -1;
+	}
+
+	freeaddrinfo(addr);
+
+	if (sock < 0) {
+		fprintf(stderr, "Could not create listen socket\n");
+		return sock;
+	}
+
+	if (listen(sock, 20)) {
+		perror("listen");
+		close(sock);
+		return -1;
+	}
+
+	return sock;
+}
+
+/*
+ * Check that "mptcp" cannot be attached to a plain TCP socket as an upper
+ * layer protocol: setsockopt(TCP_ULP, "mptcp") is expected to fail with
+ * EOPNOTSUPP.
+ *
+ * Returns true as soon as one candidate address yields that result.
+ */
+static bool sock_test_tcpulp(const char * const remoteaddr,
+			     const char * const port)
+{
+	struct addrinfo hints = {
+		.ai_protocol = IPPROTO_TCP,
+		.ai_socktype = SOCK_STREAM,
+	};
+	struct addrinfo *a, *addr;
+	int sock = -1, ret = 0;
+	bool test_pass = false;
+
+	hints.ai_family = AF_INET;
+
+	xgetaddrinfo(remoteaddr, port, &hints, &addr);
+	for (a = addr; a; a = a->ai_next) {
+		sock = socket(a->ai_family, a->ai_socktype, IPPROTO_TCP);
+		if (sock < 0) {
+			perror("socket");
+			continue;
+		}
+		ret = setsockopt(sock, IPPROTO_TCP, TCP_ULP, "mptcp",
+				 sizeof("mptcp"));
+		if (ret == -1 && errno == EOPNOTSUPP)
+			test_pass = true;
+		close(sock);
+
+		if (test_pass)
+			break;
+		if (!ret)
+			fprintf(stderr,
+				"setsockopt(TCP_ULP) returned 0\n");
+		else
+			perror("setsockopt(TCP_ULP)");
+	}
+
+	/* the address list was previously never released */
+	freeaddrinfo(addr);
+	return test_pass;
+}
+
+/*
+ * Connect to remoteaddr:port with a socket of protocol proto, trying every
+ * address the resolver returns for family pf.
+ *
+ * Returns the connected descriptor, or -1 when every attempt failed.
+ */
+static int sock_connect_mptcp(const char * const remoteaddr,
+			      const char * const port, int proto)
+{
+	struct addrinfo hints = {
+		.ai_protocol = IPPROTO_TCP,
+		.ai_socktype = SOCK_STREAM,
+	};
+	struct addrinfo *cur, *res;
+	int sock = -1;
+
+	hints.ai_family = pf;
+	xgetaddrinfo(remoteaddr, port, &hints, &res);
+
+	for (cur = res; cur != NULL; cur = cur->ai_next) {
+		sock = socket(cur->ai_family, cur->ai_socktype, proto);
+		if (sock < 0) {
+			perror("socket");
+			continue;
+		}
+
+		if (connect(sock, cur->ai_addr, cur->ai_addrlen) != 0) {
+			perror("connect()");
+			close(sock);
+			sock = -1;
+			continue;
+		}
+
+		/* connected */
+		break;
+	}
+
+	freeaddrinfo(res);
+	return sock;
+}
+
+/*
+ * Write a random-sized chunk (at most len bytes, at most 0xffff) of buf to
+ * fd. In join mode the first write is limited to 100 bytes and followed by
+ * a pause; in remove mode every write is limited to 50 bytes and followed
+ * by a pause.
+ *
+ * Returns the result of write(): the number of bytes written, or -1 on
+ * error. The return type is signed so that the error value reaches the
+ * caller unchanged instead of going through an unsigned type.
+ */
+static ssize_t do_rnd_write(const int fd, char *buf, const size_t len)
+{
+	static bool first = true;
+	unsigned int do_w;
+	ssize_t bw;
+
+	do_w = rand() & 0xffff;
+	if (do_w == 0 || do_w > len)
+		do_w = len;
+
+	if (cfg_join && first && do_w > 100)
+		do_w = 100;
+
+	if (cfg_remove && do_w > 50)
+		do_w = 50;
+
+	bw = write(fd, buf, do_w);
+	if (bw < 0)
+		perror("write");
+
+	/* let the join handshake complete, before going on */
+	if (cfg_join && first) {
+		usleep(200000);
+		first = false;
+	}
+
+	if (cfg_remove)
+		usleep(200000);
+
+	return bw;
+}
+
+/*
+ * Write all len bytes of buf to fd, retrying after short writes.
+ * Returns len on success and 0 if any write() fails.
+ */
+static size_t do_write(const int fd, char *buf, const size_t len)
+{
+	size_t done = 0;
+
+	while (done < len) {
+		ssize_t n = write(fd, buf + done, len - done);
+
+		if (n < 0) {
+			perror("write");
+			return 0;
+		}
+
+		done += (size_t)n;
+	}
+
+	return done;
+}
+
+/*
+ * read() from fd into buf with a randomly chosen size limit: the low 16
+ * bits of rand(), where 0 is replaced by 1 and anything above len by len.
+ * Returns whatever read() returns.
+ */
+static ssize_t do_rnd_read(const int fd, char *buf, const size_t len)
+{
+	size_t cap = (size_t)rand() & 0xffff;
+
+	cap = cap == 0 ? 1 : (cap > len ? len : cap);
+
+	return read(fd, buf, cap);
+}
+
+/* Add O_NONBLOCK to fd's status flags; failures are silently ignored. */
+static void set_nonblock(int fd)
+{
+	int cur = fcntl(fd, F_GETFL);
+
+	if (cur != -1)
+		fcntl(fd, F_SETFL, cur | O_NONBLOCK);
+}
+
+/*
+ * Shuttle data in both directions with poll(): everything read from infd
+ * is sent to peerfd, everything received from peerfd is written to outfd.
+ * The loop ends when both directions are finished; peerfd is then closed.
+ *
+ * Returns 0 on success and a small positive code on failure: 1 poll error,
+ * 2 poll timeout, 3 read error on peerfd, 4 read error on infd, 5
+ * POLLERR/POLLNVAL on peerfd, 111 write error on peerfd.
+ */
+static int copyfd_io_poll(int infd, int peerfd, int outfd)
+{
+	struct pollfd fds = {
+		.fd = peerfd,
+		.events = POLLIN | POLLOUT,
+	};
+	unsigned int woff = 0;
+	/* Signed: a failed read() must stay negative. With an unsigned
+	 * counter the -1 turned into a huge "bytes pending" value, the error
+	 * branch below could never run and wbuf was read past its end.
+	 */
+	ssize_t wlen = 0;
+	char wbuf[8192];
+
+	set_nonblock(peerfd);
+
+	for (;;) {
+		char rbuf[8192];
+		ssize_t len;
+
+		if (fds.events == 0)
+			break;
+
+		switch (poll(&fds, 1, poll_timeout)) {
+		case -1:
+			if (errno == EINTR)
+				continue;
+			perror("poll");
+			return 1;
+		case 0:
+			fprintf(stderr, "%s: poll timed out (events: "
+				"POLLIN %u, POLLOUT %u)\n", __func__,
+				fds.events & POLLIN, fds.events & POLLOUT);
+			return 2;
+		}
+
+		if (fds.revents & POLLIN) {
+			len = do_rnd_read(peerfd, rbuf, sizeof(rbuf));
+			if (len == 0) {
+				/* no more data to receive:
+				 * peer has closed its write side
+				 */
+				fds.events &= ~POLLIN;
+
+				if ((fds.events & POLLOUT) == 0)
+					/* and nothing more to send */
+					break;
+
+				/* Else, still have data to transmit */
+			} else if (len < 0) {
+				perror("read");
+				return 3;
+			}
+
+			do_write(outfd, rbuf, len);
+		}
+
+		if (fds.revents & POLLOUT) {
+			if (wlen == 0) {
+				/* previous chunk fully sent: fetch the next */
+				woff = 0;
+				wlen = read(infd, wbuf, sizeof(wbuf));
+			}
+
+			if (wlen > 0) {
+				ssize_t bw;
+
+				bw = do_rnd_write(peerfd, wbuf + woff, wlen);
+				if (bw < 0)
+					return 111;
+
+				woff += bw;
+				wlen -= bw;
+			} else if (wlen == 0) {
+				/* We have no more data to send. */
+				fds.events &= ~POLLOUT;
+
+				if ((fds.events & POLLIN) == 0)
+					/* ... and peer also closed already */
+					break;
+
+				/* ... but we still receive.
+				 * Close our write side, ev. give some time
+				 * for address notification and/or checking
+				 * the current status
+				 */
+				if (cfg_wait)
+					usleep(cfg_wait);
+				shutdown(peerfd, SHUT_WR);
+			} else {
+				/* read() failed: nothing is pending, so the
+				 * next POLLOUT pass retries the read
+				 */
+				wlen = 0;
+				if (errno == EINTR)
+					continue;
+				perror("read");
+				return 4;
+			}
+		}
+
+		if (fds.revents & (POLLERR | POLLNVAL)) {
+			fprintf(stderr, "Unexpected revents: "
+				"POLLERR/POLLNVAL(%x)\n", fds.revents);
+			return 5;
+		}
+	}
+
+	/* leave some time for late join/announce */
+	if (cfg_join || cfg_remove)
+		usleep(cfg_wait);
+
+	close(peerfd);
+	return 0;
+}
+
+/*
+ * Copy everything readable from infd to outfd until end of file.
+ *
+ * Returns 0 at end of file, a negative value after a read error, and the
+ * (positive) size of the last chunk when writing it to outfd fell short.
+ */
+static int do_recvfile(int infd, int outfd)
+{
+	char buf[16384];
+	ssize_t r;
+
+	for (;;) {
+		r = do_rnd_read(infd, buf, sizeof(buf));
+		if (r < 0) {
+			perror("read");
+			break;
+		}
+		if (r == 0)
+			break;
+
+		if (write(outfd, buf, r) != r)
+			break;
+	}
+
+	return (int)r;
+}
+
+/*
+ * Map the first size bytes of infd and write them to outfd.
+ *
+ * Returns 0 when everything was written, 1 when the mapping failed, and
+ * otherwise the number of bytes that were still unsent when write() failed.
+ */
+static int do_mmap(int infd, int outfd, unsigned int size)
+{
+	char *map = mmap(NULL, size, PROT_READ, MAP_SHARED, infd, 0);
+	size_t left = size;
+	const char *cur;
+
+	if (map == MAP_FAILED) {
+		perror("mmap");
+		return 1;
+	}
+
+	for (cur = map; left > 0; ) {
+		ssize_t n = write(outfd, cur, left);
+
+		if (n < 0) {
+			perror("write");
+			break;
+		}
+
+		cur += n;
+		left -= n;
+	}
+
+	munmap(map, size);
+	return left;
+}
+
+/*
+ * Return the size in bytes of the regular file behind fd.
+ *
+ * Returns -1 when fstat() fails, -2 when fd is not a regular file and -3
+ * when the size does not fit into an int.
+ */
+static int get_infd_size(int fd)
+{
+	struct stat sb;
+	ssize_t count;
+	int err;
+
+	err = fstat(fd, &sb);
+	if (err < 0) {
+		perror("fstat");
+		return -1;
+	}
+
+	if ((sb.st_mode & S_IFMT) != S_IFREG) {
+		fprintf(stderr, "%s: stdin is not a regular file\n", __func__);
+		return -2;
+	}
+
+	count = sb.st_size;
+	if (count > INT_MAX) {
+		/* count is signed, so it needs %zd rather than %zu */
+		fprintf(stderr, "File too large: %zd\n", count);
+		return -3;
+	}
+
+	return (int)count;
+}
+
+/*
+ * Transfer count bytes from infd to outfd with sendfile(), repeating after
+ * partial transfers. Returns 0 on success and 3 when sendfile() fails.
+ */
+static int do_sendfile(int infd, int outfd, unsigned int count)
+{
+	unsigned int left;
+
+	for (left = count; left > 0; ) {
+		ssize_t r = sendfile(outfd, infd, NULL, left);
+
+		if (r < 0) {
+			perror("sendfile");
+			return 3;
+		}
+
+		left -= r;
+	}
+
+	return 0;
+}
+
+/*
+ * mmap transfer mode. A connecting process sends its whole input first,
+ * shuts down its write side and then receives the reply; a listening
+ * process receives first and sends afterwards.
+ * Returns 0 on success or the error code of the step that failed.
+ */
+static int copyfd_io_mmap(int infd, int peerfd, int outfd,
+			  unsigned int size)
+{
+	int err;
+
+	if (!listen_mode) {
+		err = do_mmap(infd, peerfd, size);
+		if (err)
+			return err;
+
+		shutdown(peerfd, SHUT_WR);
+
+		return do_recvfile(peerfd, outfd);
+	}
+
+	err = do_recvfile(peerfd, outfd);
+	if (err)
+		return err;
+
+	return do_mmap(infd, peerfd, size);
+}
+
+/*
+ * sendfile transfer mode. A connecting process sends its whole input first
+ * and then receives the reply; a listening process receives first and
+ * sends afterwards.
+ * Returns 0 on success or the error code of the step that failed.
+ */
+static int copyfd_io_sendfile(int infd, int peerfd, int outfd,
+			      unsigned int size)
+{
+	int err;
+
+	if (!listen_mode) {
+		err = do_sendfile(infd, peerfd, size);
+		if (err)
+			return err;
+
+		return do_recvfile(peerfd, outfd);
+	}
+
+	err = do_recvfile(peerfd, outfd);
+	if (err)
+		return err;
+
+	return do_sendfile(infd, peerfd, size);
+}
+
+/*
+ * Run the transfer selected by cfg_mode. The mmap and sendfile modes need
+ * the size of the input file first and fail with its (negative) error code
+ * when it cannot be determined.
+ */
+static int copyfd_io(int infd, int peerfd, int outfd)
+{
+	int file_size;
+
+	if (cfg_mode == CFG_MODE_POLL)
+		return copyfd_io_poll(infd, peerfd, outfd);
+
+	if (cfg_mode == CFG_MODE_MMAP || cfg_mode == CFG_MODE_SENDFILE) {
+		file_size = get_infd_size(infd);
+		if (file_size < 0)
+			return file_size;
+
+		if (cfg_mode == CFG_MODE_MMAP)
+			return copyfd_io_mmap(infd, peerfd, outfd, file_size);
+
+		return copyfd_io_sendfile(infd, peerfd, outfd, file_size);
+	}
+
+	fprintf(stderr, "Invalid mode %d\n", cfg_mode);
+
+	die_usage();
+	return 1;
+}
+
+/*
+ * Sanity-check the peer address returned by accept(): the source port must
+ * be non-zero, and both the length and the family must match the expected
+ * address family pf. Problems are only reported on stderr.
+ */
+static void check_sockaddr(int pf, struct sockaddr_storage *ss,
+			   socklen_t salen)
+{
+	struct sockaddr_in6 *sin6;
+	struct sockaddr_in *sin;
+	socklen_t wanted_size = 0;
+
+	switch (pf) {
+	case AF_INET:
+		wanted_size = sizeof(*sin);
+		sin = (void *)ss;
+		if (!sin->sin_port)
+			fprintf(stderr, "accept: something wrong: ip connection from port 0\n");
+		break;
+	case AF_INET6:
+		wanted_size = sizeof(*sin6);
+		sin6 = (void *)ss;
+		if (!sin6->sin6_port)
+			fprintf(stderr, "accept: something wrong: ipv6 connection from port 0\n");
+		break;
+	default:
+		fprintf(stderr, "accept: Unknown pf %d, salen %u\n", pf, salen);
+		return;
+	}
+
+	if (salen != wanted_size)
+		fprintf(stderr, "accept: size mismatch, got %d expected %d\n",
+			(int)salen, (int)wanted_size);
+
+	/* arguments in the order the message names them: expected value
+	 * first, then the family that was actually received
+	 */
+	if (ss->ss_family != pf)
+		fprintf(stderr, "accept: pf mismatch, expect %d, ss_family is %d\n",
+			pf, (int)ss->ss_family);
+}
+
+/*
+ * Verify that getpeername() on fd reports the same address accept()
+ * returned in ss/salen. Differences are only reported on stderr.
+ */
+static void check_getpeername(int fd, struct sockaddr_storage *ss, socklen_t salen)
+{
+	char acc_host[INET6_ADDRSTRLEN], acc_serv[INET6_ADDRSTRLEN];
+	char peer_host[INET6_ADDRSTRLEN], peer_serv[INET6_ADDRSTRLEN];
+	struct sockaddr_storage peerss;
+	socklen_t peersalen = sizeof(peerss);
+
+	if (getpeername(fd, (struct sockaddr *)&peerss, &peersalen) < 0) {
+		perror("getpeername");
+		return;
+	}
+
+	if (peersalen != salen) {
+		fprintf(stderr, "%s: %d vs %d\n", __func__, peersalen, salen);
+		return;
+	}
+
+	if (memcmp(ss, &peerss, peersalen) == 0)
+		return;
+
+	/* addresses differ: print both in numeric form */
+	xgetnameinfo((struct sockaddr *)ss, salen,
+		     acc_host, sizeof(acc_host), acc_serv, sizeof(acc_serv));
+
+	xgetnameinfo((struct sockaddr *)&peerss, peersalen,
+		     peer_host, sizeof(peer_host),
+		     peer_serv, sizeof(peer_serv));
+
+	fprintf(stderr, "%s: memcmp failure: accept %s vs peername %s, %s vs %s salen %d vs %d\n",
+		__func__, acc_host, peer_host, acc_serv, peer_serv,
+		peersalen, salen);
+}
+
+/*
+ * After connect(): verify that getpeername() reports the host and port
+ * given on the command line (compared as numeric strings). A difference is
+ * only reported on stderr.
+ */
+static void check_getpeername_connect(int fd)
+{
+	char host[INET6_ADDRSTRLEN], serv[INET6_ADDRSTRLEN];
+	struct sockaddr_storage ss;
+	socklen_t salen = sizeof(ss);
+	bool same;
+
+	if (getpeername(fd, (struct sockaddr *)&ss, &salen) < 0) {
+		perror("getpeername");
+		return;
+	}
+
+	xgetnameinfo((struct sockaddr *)&ss, salen,
+		     host, sizeof(host), serv, sizeof(serv));
+
+	same = strcmp(cfg_host, host) == 0 && strcmp(cfg_port, serv) == 0;
+	if (!same)
+		fprintf(stderr, "%s: %s vs %s, %s vs %s\n", __func__,
+			cfg_host, host, cfg_port, serv);
+}
+
+/*
+ * Close fd with a probability of one half, but never in join or remove
+ * mode. The random number is drawn in every case.
+ */
+static void maybe_close(int fd)
+{
+	unsigned int coin = rand();
+
+	if (cfg_join || cfg_remove)
+		return;
+
+	if (coin & 1)
+		close(fd);
+}
+
+/*
+ * Server side: wait up to poll_timeout for one incoming connection on
+ * listensock, sanity-check the peer address and run the transfer between
+ * stdin/stdout and the accepted socket.
+ *
+ * Returns 1 when poll() or accept() fails, 2 on timeout, otherwise the
+ * result of the transfer.
+ */
+int main_loop_s(int listensock)
+{
+	struct pollfd polls = {
+		.fd = listensock,
+		.events = POLLIN,
+	};
+	struct sockaddr_storage ss;
+	socklen_t salen = sizeof(ss);
+	int remotesock;
+	int ready;
+
+	ready = poll(&polls, 1, poll_timeout);
+	if (ready == -1) {
+		perror("poll");
+		return 1;
+	}
+	if (ready == 0) {
+		fprintf(stderr, "%s: timed out\n", __func__);
+		close(listensock);
+		return 2;
+	}
+
+	remotesock = accept(listensock, (struct sockaddr *)&ss, &salen);
+	if (remotesock < 0) {
+		perror("accept");
+		return 1;
+	}
+
+	maybe_close(listensock);
+	check_sockaddr(pf, &ss, salen);
+	check_getpeername(remotesock, &ss, salen);
+
+	return copyfd_io(0, remotesock, 1);
+}
+
+/* Seed rand() from getrandom(); terminates the program when that fails. */
+static void init_rng(void)
+{
+	unsigned int seed;
+
+	if (getrandom(&seed, sizeof(seed), 0) != -1) {
+		srand(seed);
+		return;
+	}
+
+	perror("getrandom");
+	exit(1);
+}
+
+/*
+ * Client side: connect to cfg_host:cfg_port, apply the optional buffer
+ * sizes and run the transfer between stdin/stdout and the socket.
+ * Returns 2 when the connection fails, otherwise the transfer result.
+ */
+int main_loop(void)
+{
+	/* listener is ready. */
+	int sock = sock_connect_mptcp(cfg_host, cfg_port, cfg_sock_proto);
+
+	if (sock < 0)
+		return 2;
+
+	check_getpeername_connect(sock);
+
+	if (cfg_rcvbuf != 0)
+		set_rcvbuf(sock, cfg_rcvbuf);
+	if (cfg_sndbuf != 0)
+		set_sndbuf(sock, cfg_sndbuf);
+
+	return copyfd_io(0, sock, 1);
+}
+
+/*
+ * Map a protocol name ("MPTCP" or "TCP", case-insensitive) to the matching
+ * IPPROTO_* value. Any other name prints an error and the usage text, then
+ * exits.
+ */
+int parse_proto(const char *proto)
+{
+	if (!strcasecmp(proto, "MPTCP"))
+		return IPPROTO_MPTCP;
+	if (!strcasecmp(proto, "TCP"))
+		return IPPROTO_TCP;
+
+	/* the full stop belongs before the newline, not on the next line */
+	fprintf(stderr, "Unknown protocol: %s.\n", proto);
+	die_usage();
+
+	/* silence compiler warning */
+	return 0;
+}
+
+/*
+ * Map a transfer mode name ("poll", "mmap" or "sendfile", case-insensitive)
+ * to its CFG_MODE_* value. Any other name prints the list of supported
+ * modes and the usage text, then exits.
+ */
+int parse_mode(const char *mode)
+{
+	static const struct {
+		const char *name;
+		int value;
+	} known[] = {
+		{ "poll", CFG_MODE_POLL },
+		{ "mmap", CFG_MODE_MMAP },
+		{ "sendfile", CFG_MODE_SENDFILE },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (!strcasecmp(mode, known[i].name))
+			return known[i].value;
+	}
+
+	fprintf(stderr, "Unknown test mode: %s\n", mode);
+	fprintf(stderr, "Supported modes are:\n");
+	fprintf(stderr, "\t\t\"poll\" - interleaved read/write using poll()\n");
+	fprintf(stderr, "\t\t\"mmap\" - send entire input file (mmap+write), then read response (-l will read input first)\n");
+	fprintf(stderr, "\t\t\"sendfile\" - send entire input file (sendfile), then read response (-l will read input first)\n");
+
+	die_usage();
+
+	/* silence compiler warning */
+	return 0;
+}
+
+/*
+ * Parse a non-negative buffer size (any base strtoul() accepts with base
+ * 0) and return it as an int. An empty string, trailing garbage, a
+ * conversion error or a value above INT_MAX prints an error and the usage
+ * text, then exits.
+ */
+static int parse_int(const char *size)
+{
+	unsigned long s;
+	char *end;
+
+	errno = 0;
+
+	s = strtoul(size, &end, 0);
+
+	if (errno) {
+		fprintf(stderr, "Invalid buffer size %s (%s)\n",
+			size, strerror(errno));
+		die_usage();
+	}
+
+	/* nothing converted, or characters left over after the number */
+	if (end == size || *end != '\0') {
+		fprintf(stderr, "Invalid buffer size %s (%s)\n",
+			size, strerror(EINVAL));
+		die_usage();
+	}
+
+	if (s > INT_MAX) {
+		fprintf(stderr, "Invalid buffer size %s (%s)\n",
+			size, strerror(ERANGE));
+		die_usage();
+	}
+
+	return (int)s;
+}
+
+/*
+ * Parse the command line into the cfg_* globals. Exactly one non-option
+ * argument (the address to connect to or listen on) is required; an
+ * address containing ':' switches the address family to IPv6.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "6jrlp:s:hut:m:S:R:w:")) != -1) {
+		switch (c) {
+		case 'j':
+			cfg_join = true;
+			cfg_mode = CFG_MODE_POLL;
+			cfg_wait = 400000;
+			break;
+		case 'r':
+			cfg_remove = true;
+			cfg_mode = CFG_MODE_POLL;
+			cfg_wait = 400000;
+			break;
+		case 'l':
+			listen_mode = true;
+			break;
+		case 'p':
+			cfg_port = optarg;
+			break;
+		case 's':
+			cfg_sock_proto = parse_proto(optarg);
+			break;
+		case 'h':
+			die_usage();
+			break;
+		case 'u':
+			tcpulp_audit = true;
+			break;
+		case '6':
+			pf = AF_INET6;
+			break;
+		case 't':
+			/* seconds on the command line, milliseconds for
+			 * poll(); zero or negative means no timeout
+			 */
+			poll_timeout = atoi(optarg) * 1000;
+			if (poll_timeout <= 0)
+				poll_timeout = -1;
+			break;
+		case 'm':
+			cfg_mode = parse_mode(optarg);
+			break;
+		case 'S':
+			cfg_sndbuf = parse_int(optarg);
+			break;
+		case 'R':
+			cfg_rcvbuf = parse_int(optarg);
+			break;
+		case 'w':
+			/* seconds on the command line, usleep() units here */
+			cfg_wait = atoi(optarg)*1000000;
+			break;
+		default:
+			/* unknown option or missing argument: getopt() has
+			 * already printed a diagnostic; stop instead of
+			 * running with a silently ignored option
+			 */
+			die_usage();
+			break;
+		}
+	}
+
+	if (optind + 1 != argc)
+		die_usage();
+	cfg_host = argv[optind];
+
+	if (strchr(cfg_host, ':'))
+		pf = AF_INET6;
+}
+
+/*
+ * Entry point: seed the RNG, install the SIGUSR1 handler, parse options
+ * and then run one of three modes: the TCP_ULP check (-u), the listener
+ * (-l) or the connecting client.
+ */
+int main(int argc, char *argv[])
+{
+	int fd;
+
+	init_rng();
+
+	signal(SIGUSR1, handle_signal);
+	parse_opts(argc, argv);
+
+	if (tcpulp_audit)
+		return !sock_test_tcpulp(cfg_host, cfg_port);
+
+	if (!listen_mode)
+		return main_loop();
+
+	fd = sock_listen_mptcp(cfg_host, cfg_port);
+	if (fd < 0)
+		return 1;
+
+	if (cfg_rcvbuf != 0)
+		set_rcvbuf(fd, cfg_rcvbuf);
+	if (cfg_sndbuf != 0)
+		set_sndbuf(fd, cfg_sndbuf);
+
+	return main_loop_s(fd);
+}
diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.sh b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
new file mode 100755
index 000000000..fb89298bd
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_connect.sh
@@ -0,0 +1,697 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(dirname "${0}")/mptcp_lib.sh"
+
+time_start=$(date +%s)
+
+optstring="S:R:d:e:l:r:h4cm:f:t"
+ret=0
+sin=""
+sout=""
+cin=""
+cout=""
+ksft_skip=4
+capture=false
+timeout=30
+ipv6=true
+ethtool_random_on=true
+tc_delay="$((RANDOM%50))"
+tc_loss=$((RANDOM%101))
+testmode=""
+sndbuf=0
+rcvbuf=0
+options_log=true
+do_tcp=0
+filesize=0
+
+if [ $tc_loss -eq 100 ];then
+ tc_loss=1%
+elif [ $tc_loss -ge 10 ]; then
+ tc_loss=0.$tc_loss%
+elif [ $tc_loss -ge 1 ]; then
+ tc_loss=0.0$tc_loss%
+else
+ tc_loss=""
+fi
+
+# Print the command-line help. The synopsis lists the options accepted by
+# $optstring; inner double quotes are escaped so they appear in the output.
+usage() {
+	echo "Usage: $0 [ -h ] [ -4 ] [ -c ] [ -t ] [ -d delay ] [ -l loss ] [ -r reorder ] [ -e feature ] [ -f size ] [ -S sndbuf ] [ -R rcvbuf ] [ -m mode ]"
+	echo -e "\t-h: show this help and exit"
+	echo -e "\t-d: tc/netem delay in milliseconds, e.g. \"-d 10\" (default random)"
+	echo -e "\t-l: tc/netem loss percentage, e.g. \"-l 0.02\" (default random)"
+	echo -e "\t-r: tc/netem reorder mode, e.g. \"-r 25% 50% gap 5\", use \"-r 0\" to disable reordering (default random)"
+	echo -e "\t-e: ethtool features to disable, e.g.: \"-e tso -e gso\" (default: randomly disable any of tso/gso/gro)"
+	echo -e "\t-4: IPv4 only: disable IPv6 tests (default: test both IPv4 and IPv6)"
+	echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+	echo -e "\t-f: size of file to transfer in bytes (default random)"
+	echo -e "\t-S: set sndbuf value (default: use kernel default)"
+	echo -e "\t-R: set rcvbuf value (default: use kernel default)"
+	echo -e "\t-m: test mode (poll, sendfile; default: poll)"
+	echo -e "\t-t: also run tests with TCP (use twice to also test non-fallback tcp)"
+}
+
+while getopts "$optstring" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "d")
+ if [ $OPTARG -ge 0 ];then
+ tc_delay="$OPTARG"
+ else
+ echo "-d requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "e")
+ ethtool_args="$ethtool_args $OPTARG off"
+ ethtool_random_on=false
+ ;;
+ "l")
+ tc_loss="$OPTARG"
+ ;;
+ "r")
+ tc_reorder="$OPTARG"
+ ;;
+ "4")
+ ipv6=false
+ ;;
+ "c")
+ capture=true
+ ;;
+ "S")
+ if [ $OPTARG -ge 0 ];then
+ sndbuf="$OPTARG"
+ else
+ echo "-S requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "R")
+ if [ $OPTARG -ge 0 ];then
+ rcvbuf="$OPTARG"
+ else
+ echo "-R requires numeric argument, got \"$OPTARG\"" 1>&2
+ exit 1
+ fi
+ ;;
+ "m")
+ testmode="$OPTARG"
+ ;;
+ "f")
+ filesize="$OPTARG"
+ ;;
+ "t")
+ do_tcp=$((do_tcp+1))
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+sec=$(date +%s)
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+ns2="ns2-$rndh"
+ns3="ns3-$rndh"
+ns4="ns4-$rndh"
+
+TEST_COUNT=0
+
+# EXIT handler: drop every temporary file and the four test namespaces.
+cleanup()
+{
+	local ns
+
+	rm -f "$cin" "$cout"
+	rm -f "$sin" "$sout"
+	rm -f "$capout"
+
+	for ns in "$ns1" "$ns2" "$ns3" "$ns4"; do
+		ip netns del $ns
+	done
+}
+
+mptcp_lib_check_mptcp
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+sin=$(mktemp)
+sout=$(mktemp)
+cin=$(mktemp)
+cout=$(mktemp)
+capout=$(mktemp)
+trap cleanup EXIT
+
+for i in "$ns1" "$ns2" "$ns3" "$ns4";do
+ ip netns add $i || exit $ksft_skip
+ ip -net $i link set lo up
+done
+
+# ns1 ns2 ns3 ns4
+# ns1eth2 ns2eth1 ns2eth3 ns3eth2 ns3eth4 ns4eth3
+# - drop 1% -> reorder 25%
+# <- TSO off -
+
+ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth1 netns "$ns2"
+ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth2 netns "$ns3"
+ip link add ns3eth4 netns "$ns3" type veth peer name ns4eth3 netns "$ns4"
+
+ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth2
+ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth2 nodad
+
+ip -net "$ns1" link set ns1eth2 up
+ip -net "$ns1" route add default via 10.0.1.2
+ip -net "$ns1" route add default via dead:beef:1::2
+
+ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
+ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
+ip -net "$ns2" link set ns2eth1 up
+
+ip -net "$ns2" addr add 10.0.2.1/24 dev ns2eth3
+ip -net "$ns2" addr add dead:beef:2::1/64 dev ns2eth3 nodad
+ip -net "$ns2" link set ns2eth3 up
+ip -net "$ns2" route add default via 10.0.2.2
+ip -net "$ns2" route add default via dead:beef:2::2
+ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1
+ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ip -net "$ns3" addr add 10.0.2.2/24 dev ns3eth2
+ip -net "$ns3" addr add dead:beef:2::2/64 dev ns3eth2 nodad
+ip -net "$ns3" link set ns3eth2 up
+
+ip -net "$ns3" addr add 10.0.3.2/24 dev ns3eth4
+ip -net "$ns3" addr add dead:beef:3::2/64 dev ns3eth4 nodad
+ip -net "$ns3" link set ns3eth4 up
+ip -net "$ns3" route add default via 10.0.2.1
+ip -net "$ns3" route add default via dead:beef:2::1
+ip netns exec "$ns3" sysctl -q net.ipv4.ip_forward=1
+ip netns exec "$ns3" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ip -net "$ns4" addr add 10.0.3.1/24 dev ns4eth3
+ip -net "$ns4" addr add dead:beef:3::1/64 dev ns4eth3 nodad
+ip -net "$ns4" link set ns4eth3 up
+ip -net "$ns4" route add default via 10.0.3.2
+ip -net "$ns4" route add default via dead:beef:3::2
+
+# $1: netns, $2: device, $3: "feature off ..." string handed to ethtool -K
+# Logs an INFO line only when ethtool succeeded; returns 1 otherwise.
+set_ethtool_flags() {
+	local netns="$1"
+	local iface="$2"
+	local features="$3"
+
+	ip netns exec $netns ethtool -K $iface $features 2>/dev/null || return 1
+	echo "INFO: set $netns dev $iface: ethtool -K $features"
+}
+
+set_random_ethtool_flags() {
+ local flags=""
+ local r=$RANDOM
+
+ local pick1=$((r & 1))
+ local pick2=$((r & 2))
+ local pick3=$((r & 4))
+
+ [ $pick1 -ne 0 ] && flags="tso off"
+ [ $pick2 -ne 0 ] && flags="$flags gso off"
+ [ $pick3 -ne 0 ] && flags="$flags gro off"
+
+ [ -z "$flags" ] && return
+
+ set_ethtool_flags "$1" "$2" "$flags"
+}
+
+if $ethtool_random_on;then
+ set_random_ethtool_flags "$ns3" ns3eth2
+ set_random_ethtool_flags "$ns4" ns4eth3
+else
+ set_ethtool_flags "$ns3" ns3eth2 "$ethtool_args"
+ set_ethtool_flags "$ns4" ns4eth3 "$ethtool_args"
+fi
+
+print_file_err()
+{
+ ls -l "$1" 1>&2
+ echo "Trailing bytes are: "
+ tail -c 27 "$1"
+}
+
+# $1: file that was sent, $2: file that was received, $3: description
+# Returns 0 when both files are byte-identical, otherwise reports and returns 1.
+check_transfer()
+{
+	local sent=$1
+	local received=$2
+	local desc=$3
+
+	if cmp "$sent" "$received" > /dev/null 2>&1; then
+		return 0
+	fi
+
+	echo "[ FAIL ] $desc does not match (in, out):"
+	print_file_err "$sent"
+	print_file_err "$received"
+
+	return 1
+}
+
+# Verify that net.mptcp.enabled defaults to 1 in a fresh netns and that
+# setting it to 0 makes creating an MPTCP socket fail.
+# Globals: reads rndh, ksft_skip, timeout, cin; sets ret=1 on failure.
+# Returns: 0 on success, 1 on failure; exits with $ksft_skip if the netns
+# cannot be created.
+check_mptcp_disabled()
+{
+	local disabled_ns
+	# use the per-run $rndh suffix ($sech was never defined)
+	disabled_ns="ns_disabled-$rndh-$(mktemp -u XXXXXX)"
+	ip netns add ${disabled_ns} || exit $ksft_skip
+
+	# net.mptcp.enabled should be enabled by default
+	if [ "$(ip netns exec ${disabled_ns} sysctl net.mptcp.enabled | awk '{ print $3 }')" -ne 1 ]; then
+		echo -e "net.mptcp.enabled sysctl is not 1 by default\t\t[ FAIL ]"
+		# do not leak the temporary namespace on this failure path
+		ip netns delete ${disabled_ns}
+		ret=1
+		return 1
+	fi
+	ip netns exec ${disabled_ns} sysctl -q net.mptcp.enabled=0
+
+	# err becomes 1 only when the expected socket() error message is seen
+	local err=0
+	LANG=C ip netns exec ${disabled_ns} ./mptcp_connect -t $timeout -p 10000 -s MPTCP 127.0.0.1 < "$cin" 2>&1 | \
+		grep -q "^socket: Protocol not available$" && err=1
+	ip netns delete ${disabled_ns}
+
+	if [ ${err} -eq 0 ]; then
+		echo -e "New MPTCP socket cannot be blocked via sysctl\t\t[ FAIL ]"
+		ret=1
+		return 1
+	fi
+
+	echo -e "New MPTCP socket can be blocked via sysctl\t\t[ OK ]"
+	return 0
+}
+
+check_mptcp_ulp_setsockopt()
+{
+ local t retval
+ t="ns_ulp-$sech-$(mktemp -u XXXXXX)"
+
+ ip netns add ${t} || exit $ksft_skip
+ if ! ip netns exec ${t} ./mptcp_connect -u -p 10000 -s TCP 127.0.0.1 2>&1; then
+ printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) allowed\t[ FAIL ]\n"
+ retval=1
+ ret=$retval
+ else
+ printf "setsockopt(..., TCP_ULP, \"mptcp\", ...) blocked\t[ OK ]\n"
+ retval=0
+ fi
+ ip netns del ${t}
+ return $retval
+}
+
+# $1: IP address
+is_v6()
+{
+ [ -z "${1##*:*}" ]
+}
+
+do_ping()
+{
+ local listener_ns="$1"
+ local connector_ns="$2"
+ local connect_addr="$3"
+ local ping_args="-q -c 1"
+
+ if is_v6 "${connect_addr}"; then
+ $ipv6 || return 0
+ ping_args="${ping_args} -6"
+ fi
+
+ ip netns exec ${connector_ns} ping ${ping_args} $connect_addr >/dev/null
+ if [ $? -ne 0 ] ; then
+ echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
+ ret=1
+
+ return 1
+ fi
+
+ return 0
+}
+
+# $1: ns, $2: port
+# Poll /proc/net/tcp* inside the namespace until a socket whose local address
+# ends in the given port shows state 0A (LISTEN in the /proc/net/tcp format).
+# Gives up silently after 10 attempts spaced 0.1s apart.
+wait_local_port_listen()
+{
+ local listener_ns="${1}"
+ local port="${2}"
+
+ local port_hex i
+
+ # /proc/net/tcp prints ports as 4-digit upper-case hex
+ port_hex="$(printf "%04X" "${port}")"
+ for i in $(seq 10); do
+ # awk exits 0 as soon as one matching line is found, 1 otherwise
+ ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+ awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+ break
+ sleep 0.1
+ done
+}
+
+# Run one bidirectional transfer between two mptcp_connect instances and
+# validate the result.
+# $1: listener ns, $2: connector ns, $3: client protocol, $4: server protocol,
+# $5: address the client connects to, $6: address the server binds to
+# Globals: reads rcvbuf, sndbuf, testmode, capture, timeout, rndh, sin, sout,
+# cin, cout, capout; updates TEST_COUNT and options_log.
+# Returns 0 when both processes exit 0 and both received files match what was
+# sent, 1 otherwise. Counter mismatches are only reported, they do not change
+# the return value.
+do_transfer()
+{
+ local listener_ns="$1"
+ local connector_ns="$2"
+ local cl_proto="$3"
+ local srv_proto="$4"
+ local connect_addr="$5"
+ local local_addr="$6"
+ local extra_args=""
+
+ # every transfer gets its own port, derived from the running test counter
+ local port
+ port=$((10000+$TEST_COUNT))
+ TEST_COUNT=$((TEST_COUNT+1))
+
+ if [ "$rcvbuf" -gt 0 ]; then
+ extra_args="$extra_args -R $rcvbuf"
+ fi
+
+ if [ "$sndbuf" -gt 0 ]; then
+ extra_args="$extra_args -S $sndbuf"
+ fi
+
+ if [ -n "$testmode" ]; then
+ extra_args="$extra_args -m $testmode"
+ fi
+
+ # log the extra options once, on the first transfer that has any
+ if [ -n "$extra_args" ] && $options_log; then
+ options_log=false
+ echo "INFO: extra options: $extra_args"
+ fi
+
+ # truncate the output files left over from the previous transfer
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ local addr_port
+ addr_port=$(printf "%s:%d" ${connect_addr} ${port})
+ printf "%.3s %-5s -> %.3s (%-20s) %-5s\t" ${connector_ns} ${cl_proto} ${listener_ns} ${addr_port} ${srv_proto}
+
+ if $capture; then
+ local capuser
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ local capfile="${rndh}-${connector_ns:0:3}-${listener_ns:0:3}-${cl_proto}-${srv_proto}-${connect_addr}-${port}"
+ local capopt="-i any -s 65535 -B 32768 ${capuser}"
+
+ ip netns exec ${listener_ns} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 &
+ local cappid_listener=$!
+
+ ip netns exec ${connector_ns} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 &
+ local cappid_connector=$!
+
+ sleep 1
+ fi
+
+ # snapshot the listener-side counters before the transfer
+ local stat_synrx_last_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableSYNRX | while read a count c rest ;do echo $count;done)
+ local stat_ackrx_last_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableACKRX | while read a count c rest ;do echo $count;done)
+ local stat_cookietx_last=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesSent | while read a count c rest ;do echo $count;done)
+ local stat_cookierx_last=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesRecv | while read a count c rest ;do echo $count;done)
+
+ ip netns exec ${listener_ns} ./mptcp_connect -t $timeout -l -p $port -s ${srv_proto} $extra_args $local_addr < "$sin" > "$sout" &
+ local spid=$!
+
+ # do not start the client before the server socket is listening
+ wait_local_port_listen "${listener_ns}" "${port}"
+
+ local start
+ start=$(date +%s%3N)
+ ip netns exec ${connector_ns} ./mptcp_connect -t $timeout -p $port -s ${cl_proto} $extra_args $connect_addr < "$cin" > "$cout" &
+ local cpid=$!
+
+ wait $cpid
+ local retc=$?
+ wait $spid
+ local rets=$?
+
+ local stop
+ stop=$(date +%s%3N)
+
+ if $capture; then
+ sleep 1
+ kill ${cappid_listener}
+ kill ${cappid_connector}
+ fi
+
+ local duration
+ duration=$((stop-start))
+ duration=$(printf "(duration %05sms)" $duration)
+ # a failing process: dump socket state for this port and stop here
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo "$duration [ FAIL ] client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
+
+ cat "$capout"
+ return 1
+ fi
+
+ # from here on retc/rets hold the file comparison results
+ check_transfer $sin $cout "file received by client"
+ retc=$?
+ check_transfer $cin $sout "file received by server"
+ rets=$?
+
+ local stat_synrx_now_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableSYNRX | while read a count c rest ;do echo $count;done)
+ local stat_ackrx_now_l=$(ip netns exec ${listener_ns} nstat -z -a MPTcpExtMPCapableACKRX | while read a count c rest ;do echo $count;done)
+
+ local stat_cookietx_now=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesSent | while read a count c rest ;do echo $count;done)
+ local stat_cookierx_now=$(ip netns exec ${listener_ns} nstat -z -a TcpExtSyncookiesRecv | while read a count c rest ;do echo $count;done)
+
+ expect_synrx=$((stat_synrx_last_l))
+ expect_ackrx=$((stat_ackrx_last_l))
+
+ cookies=$(ip netns exec ${listener_ns} sysctl net.ipv4.tcp_syncookies)
+ cookies=${cookies##*=}
+
+ # the MP_CAPABLE counters are expected to advance by one only when both
+ # ends use MPTCP
+ if [ ${cl_proto} = "MPTCP" ] && [ ${srv_proto} = "MPTCP" ]; then
+ expect_synrx=$((stat_synrx_last_l+1))
+ expect_ackrx=$((stat_ackrx_last_l+1))
+ fi
+ # with tcp_syncookies=2 the cookie counters must advance, otherwise they
+ # must stay unchanged
+ if [ $cookies -eq 2 ];then
+ if [ $stat_cookietx_last -ge $stat_cookietx_now ] ;then
+ echo "${listener_ns} CookieSent: ${cl_proto} -> ${srv_proto}: did not advance"
+ fi
+ if [ $stat_cookierx_last -ge $stat_cookierx_now ] ;then
+ echo "${listener_ns} CookieRecv: ${cl_proto} -> ${srv_proto}: did not advance"
+ fi
+ else
+ if [ $stat_cookietx_last -ne $stat_cookietx_now ] ;then
+ echo "${listener_ns} CookieSent: ${cl_proto} -> ${srv_proto}: changed"
+ fi
+ if [ $stat_cookierx_last -ne $stat_cookierx_now ] ;then
+ echo "${listener_ns} CookieRecv: ${cl_proto} -> ${srv_proto}: changed"
+ fi
+ fi
+
+ if [ $expect_synrx -ne $stat_synrx_now_l ] ;then
+ echo "${listener_ns} SYNRX: ${cl_proto} -> ${srv_proto}: expect ${expect_synrx}, got ${stat_synrx_now_l}"
+ fi
+ if [ $expect_ackrx -ne $stat_ackrx_now_l ] ;then
+ echo "${listener_ns} ACKRX: ${cl_proto} -> ${srv_proto}: expect ${expect_ackrx}, got ${stat_ackrx_now_l} "
+ fi
+
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
+ echo "$duration [ OK ]"
+ cat "$capout"
+ return 0
+ fi
+
+ cat "$capout"
+ return 1
+}
+
+make_file()
+{
+ local name=$1
+ local who=$2
+ local SIZE=$filesize
+ local ksize
+ local rem
+
+ if [ $SIZE -eq 0 ]; then
+ local MAXSIZE=$((1024 * 1024 * 8))
+ local MINSIZE=$((1024 * 256))
+
+ SIZE=$(((RANDOM * RANDOM + MINSIZE) % MAXSIZE))
+ fi
+
+ ksize=$((SIZE / 1024))
+ rem=$((SIZE - (ksize * 1024)))
+
+ dd if=/dev/urandom of="$name" bs=1024 count=$ksize 2> /dev/null
+ dd if=/dev/urandom conv=notrunc of="$name" bs=1 count=$rem 2> /dev/null
+ echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
+
+ echo "Created $name (size $(du -b "$name")) containing data sent by $who"
+}
+
+# $1: listener ns, $2: connector ns, $3: connect address,
+# $4: non-zero to also run when listener and connector are the same ns
+# Always runs MPTCP->MPTCP. The mixed MPTCP/TCP combinations run when -t was
+# given or when both ends share a namespace; TCP->TCP needs -t twice.
+# On the first failing transfer sets ret and returns 1.
+run_tests_lo()
+{
+	local listener_ns="$1"
+	local connector_ns="$2"
+	local connect_addr="$3"
+	local loopback="$4"
+	local lret=0
+	local local_addr
+	local protos
+
+	# skip if test programs are running inside same netns for subsequent runs.
+	if [ $loopback -eq 0 ] && [ ${listener_ns} = ${connector_ns} ]; then
+		return 0
+	fi
+
+	# skip if we don't want v6
+	if ! $ipv6 && is_v6 "${connect_addr}"; then
+		return 0
+	fi
+
+	# listen on the wildcard address of the matching family
+	local_addr="0.0.0.0"
+	if is_v6 "${connect_addr}"; then
+		local_addr="::"
+	fi
+
+	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${local_addr} || lret=$?
+	if [ $lret -ne 0 ]; then
+		ret=$lret
+		return 1
+	fi
+
+	# don't bother testing fallback tcp except for loopback case.
+	if [ $do_tcp -eq 0 ] && [ ${listener_ns} != ${connector_ns} ]; then
+		return 0
+	fi
+
+	for protos in "MPTCP TCP" "TCP MPTCP"; do
+		do_transfer ${listener_ns} ${connector_ns} ${protos} ${connect_addr} ${local_addr} || lret=$?
+		if [ $lret -ne 0 ]; then
+			ret=$lret
+			return 1
+		fi
+	done
+
+	if [ $do_tcp -gt 1 ] ;then
+		do_transfer ${listener_ns} ${connector_ns} TCP TCP ${connect_addr} ${local_addr} || lret=$?
+		if [ $lret -ne 0 ]; then
+			ret=$lret
+			return 1
+		fi
+	fi
+
+	return 0
+}
+
+# $1: listener ns, $2: connector ns, $3: connect address
+# Same as run_tests_lo with the loopback flag cleared.
+run_tests()
+{
+	local no_loopback=0
+
+	run_tests_lo $1 $2 $3 $no_loopback
+}
+
+make_file "$cin" "client"
+make_file "$sin" "server"
+
+check_mptcp_disabled
+
+check_mptcp_ulp_setsockopt
+
+echo "INFO: validating network environment with pings"
+for sender in "$ns1" "$ns2" "$ns3" "$ns4";do
+ do_ping "$ns1" $sender 10.0.1.1
+ do_ping "$ns1" $sender dead:beef:1::1
+
+ do_ping "$ns2" $sender 10.0.1.2
+ do_ping "$ns2" $sender dead:beef:1::2
+ do_ping "$ns2" $sender 10.0.2.1
+ do_ping "$ns2" $sender dead:beef:2::1
+
+ do_ping "$ns3" $sender 10.0.2.2
+ do_ping "$ns3" $sender dead:beef:2::2
+ do_ping "$ns3" $sender 10.0.3.2
+ do_ping "$ns3" $sender dead:beef:3::2
+
+ do_ping "$ns4" $sender 10.0.3.1
+ do_ping "$ns4" $sender dead:beef:3::1
+done
+
+[ -n "$tc_loss" ] && tc -net "$ns2" qdisc add dev ns2eth3 root netem loss random $tc_loss delay ${tc_delay}ms
+echo -n "INFO: Using loss of $tc_loss "
+test "$tc_delay" -gt 0 && echo -n "delay $tc_delay ms "
+
+reorder_delay=$(($tc_delay / 4))
+
+if [ -z "${tc_reorder}" ]; then
+ reorder1=$((RANDOM%10))
+ reorder1=$((100 - reorder1))
+ reorder2=$((RANDOM%100))
+
+ if [ $reorder_delay -gt 0 ] && [ $reorder1 -lt 100 ] && [ $reorder2 -gt 0 ]; then
+ tc_reorder="reorder ${reorder1}% ${reorder2}%"
+ echo -n "$tc_reorder with delay ${reorder_delay}ms "
+ fi
+elif [ "$tc_reorder" = "0" ];then
+ tc_reorder=""
+elif [ "$reorder_delay" -gt 0 ];then
+ # reordering requires some delay
+ tc_reorder="reorder $tc_reorder"
+ echo -n "$tc_reorder with delay ${reorder_delay}ms "
+fi
+
+echo "on ns3eth4"
+
+tc -net "$ns3" qdisc add dev ns3eth4 root netem delay ${reorder_delay}ms $tc_reorder
+
+# Run the transfer matrix from every namespace. The two loopback runs against
+# ns1 are mandatory: any failure recorded in $ret aborts the whole script.
+for sender in $ns1 $ns2 $ns3 $ns4;do
+	run_tests_lo "$ns1" "$sender" 10.0.1.1 1
+	if [ $ret -ne 0 ] ;then
+		echo "FAIL: Could not even run loopback test" 1>&2
+		exit $ret
+	fi
+	run_tests_lo "$ns1" $sender dead:beef:1::1 1
+	if [ $ret -ne 0 ] ;then
+		# diagnostics go to stderr, like the IPv4 message above
+		echo "FAIL: Could not even run loopback v6 test" 1>&2
+		exit $ret
+	fi
+
+	# ns1<->ns2 is not subject to reordering/tc delays. Use it to test
+	# mptcp syncookie support.
+	if [ $sender = $ns1 ]; then
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=2
+	else
+		ip netns exec "$ns2" sysctl -q net.ipv4.tcp_syncookies=1
+	fi
+
+	run_tests "$ns2" $sender 10.0.1.2
+	run_tests "$ns2" $sender dead:beef:1::2
+	run_tests "$ns2" $sender 10.0.2.1
+	run_tests "$ns2" $sender dead:beef:2::1
+
+	run_tests "$ns3" $sender 10.0.2.2
+	run_tests "$ns3" $sender dead:beef:2::2
+	run_tests "$ns3" $sender 10.0.3.2
+	run_tests "$ns3" $sender dead:beef:3::2
+
+	run_tests "$ns4" $sender 10.0.3.1
+	run_tests "$ns4" $sender dead:beef:3::1
+done
+
+time_end=$(date +%s)
+time_run=$((time_end-time_start))
+
+echo "Time: ${time_run} seconds"
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
new file mode 100755
index 000000000..d205828d7
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
@@ -0,0 +1,629 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(dirname "${0}")/mptcp_lib.sh"
+
+ret=0
+sin=""
+sout=""
+cin=""
+cout=""
+ksft_skip=4
+timeout=30
+mptcp_connect=""
+capture=0
+
+TEST_COUNT=0
+
+init()
+{
+ capout=$(mktemp)
+
+ rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+
+ ns1="ns1-$rndh"
+ ns2="ns2-$rndh"
+
+ for netns in "$ns1" "$ns2";do
+ ip netns add $netns || exit $ksft_skip
+ ip -net $netns link set lo up
+ ip netns exec $netns sysctl -q net.mptcp.enabled=1
+ ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0
+ ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0
+ done
+
+ # ns1 ns2
+ # ns1eth1 ns2eth1
+ # ns1eth2 ns2eth2
+ # ns1eth3 ns2eth3
+ # ns1eth4 ns2eth4
+
+ for i in `seq 1 4`; do
+ ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2"
+ ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i
+ ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad
+ ip -net "$ns1" link set ns1eth$i up
+
+ ip -net "$ns2" addr add 10.0.$i.2/24 dev ns2eth$i
+ ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad
+ ip -net "$ns2" link set ns2eth$i up
+
+ # let $ns2 reach any $ns1 address from any interface
+ ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i
+ done
+}
+
+# Remove the per-test state: the capture log and both namespaces.
+cleanup_partial()
+{
+	rm -f "$capout"
+
+	for netns in "$ns1" "$ns2"
+	do
+		ip netns del $netns
+	done
+}
+
+# EXIT handler: remove the transfer files, then the per-test state.
+cleanup()
+{
+	local f
+
+	for f in "$cin" "$cout" "$sin" "$sout"; do
+		rm -f "$f"
+	done
+	cleanup_partial
+}
+
+# Tear down the current namespaces and build a fresh pair, so each test case
+# starts from init()'s configuration.
+reset()
+{
+ cleanup_partial
+ init
+}
+
+# Like reset, but additionally sets net.ipv4.tcp_syncookies=2 in both
+# namespaces.
+reset_with_cookies()
+{
+	reset
+
+	for netns in "$ns1" "$ns2"
+	do
+		ip netns exec $netns sysctl -q net.ipv4.tcp_syncookies=2
+	done
+}
+
+for arg in "$@"; do
+ if [ "$arg" = "-c" ]; then
+ capture=1
+ fi
+done
+
+mptcp_lib_check_mptcp
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+
+check_transfer()
+{
+ in=$1
+ out=$2
+ what=$3
+
+ cmp "$in" "$out" > /dev/null 2>&1
+ if [ $? -ne 0 ] ;then
+ echo "[ FAIL ] $what does not match (in, out):"
+ print_file_err "$in"
+ print_file_err "$out"
+
+ return 1
+ fi
+
+ return 0
+}
+
+do_ping()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ connect_addr="$3"
+
+ ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null
+ if [ $? -ne 0 ] ; then
+ echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2
+ ret=1
+ fi
+}
+
+# $1: ns ; $2: counter
+get_counter()
+{
+ local ns="${1}"
+ local counter="${2}"
+ local count
+
+ count=$(ip netns exec ${ns} nstat -asz "${counter}" | awk 'NR==1 {next} {print $2}')
+ if [ -z "${count}" ]; then
+ mptcp_lib_fail_if_expected_feature "${counter} counter"
+ return 1
+ fi
+
+ echo "${count}"
+}
+
+# Run one transfer between a listening and a connecting mptcp_connect and
+# compare the files exchanged.
+# $1: listener ns, $2: connector ns, $3: client protocol, $4: server protocol,
+# $5: connect address, $6: number of "pm_nl_ctl del" calls (ids 1..n) issued
+# in the listener ns, $7: same for the connector ns
+# All variables assigned here are globals (no "local" is used).
+# Returns 0 when both processes exit 0 and both files match, 1 otherwise.
+do_transfer()
+{
+ listener_ns="$1"
+ connector_ns="$2"
+ cl_proto="$3"
+ srv_proto="$4"
+ connect_addr="$5"
+ rm_nr_ns1="$6"
+ rm_nr_ns2="$7"
+
+ # every transfer gets its own port, derived from the running test counter
+ port=$((10000+$TEST_COUNT))
+ TEST_COUNT=$((TEST_COUNT+1))
+
+ # truncate the output files left over from the previous transfer
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ if [ $capture -eq 1 ]; then
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ capfile=$(printf "mp_join-%02u-%s.pcap" "$TEST_COUNT" "${listener_ns}")
+
+ echo "Capturing traffic for test $TEST_COUNT into $capfile"
+ ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 &
+ cappid=$!
+
+ sleep 1
+ fi
+
+ # -j when nothing is to be removed, -r when any removal is requested
+ if [[ $rm_nr_ns1 -eq 0 && $rm_nr_ns2 -eq 0 ]]; then
+ mptcp_connect="./mptcp_connect -j"
+ else
+ mptcp_connect="./mptcp_connect -r"
+ fi
+
+ ip netns exec ${listener_ns} $mptcp_connect -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" &
+ spid=$!
+
+ # fixed delay between starting the listener and the client
+ sleep 1
+
+ ip netns exec ${connector_ns} $mptcp_connect -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" &
+ cpid=$!
+
+ # while the transfer runs, delete ids 1..rm_nr_ns1 in the listener ns,
+ # one per second
+ if [ $rm_nr_ns1 -gt 0 ]; then
+ counter=1
+ sleep 1
+
+ while [ $counter -le $rm_nr_ns1 ]
+ do
+ ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ fi
+
+ # same for ids 1..rm_nr_ns2 in the connector ns
+ if [ $rm_nr_ns2 -gt 0 ]; then
+ counter=1
+ sleep 1
+
+ while [ $counter -le $rm_nr_ns2 ]
+ do
+ ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
+ sleep 1
+ let counter+=1
+ done
+ fi
+
+ wait $cpid
+ retc=$?
+ wait $spid
+ rets=$?
+
+ if [ $capture -eq 1 ]; then
+ sleep 1
+ kill $cappid
+ fi
+
+ # a failing process: dump socket state for this port and stop here
+ if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
+ echo " client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${listener_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${connector_ns} socket stat for ${port}:" 1>&2
+ ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port"
+
+ cat "$capout"
+ return 1
+ fi
+
+ # from here on retc/rets hold the file comparison results
+ check_transfer $sin $cout "file received by client"
+ retc=$?
+ check_transfer $cin $sout "file received by server"
+ rets=$?
+
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ];then
+ cat "$capout"
+ return 0
+ fi
+
+ cat "$capout"
+ return 1
+}
+
+# $1: file to create, $2: label used in the log message
+# Writes SIZE KB of random data followed by the end marker line.
+make_file()
+{
+	SIZE=1
+	name=$1
+	who=$2
+
+	dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
+	echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name"
+
+	echo "Created $name (size $SIZE KB) containing data sent by $who"
+}
+
+# $1: listener ns, $2: connector ns, $3: connect address
+# Run one MPTCP<->MPTCP transfer without any removal; a non-zero result is
+# recorded in the global ret.
+run_tests()
+{
+	listener_ns="$1"
+	connector_ns="$2"
+	connect_addr="$3"
+	lret=0
+
+	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} 0 0
+	lret=$?
+	[ $lret -eq 0 ] || ret=$lret
+}
+
+# $1: listener ns, $2: connector ns, $3: connect address,
+# $4: removals to issue in the listener ns, $5: removals in the connector ns
+# Run one MPTCP<->MPTCP transfer with removals; a non-zero result is recorded
+# in the global ret.
+run_remove_tests()
+{
+	listener_ns="$1"
+	connector_ns="$2"
+	connect_addr="$3"
+	rm_nr_ns1="$4"
+	rm_nr_ns2="$5"
+	lret=0
+
+	do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} ${rm_nr_ns1} ${rm_nr_ns2}
+	lret=$?
+	[ $lret -eq 0 ] || ret=$lret
+}
+
+# Print one result line comparing the MP_JOIN counters with expectations.
+# $1: test title, $2: expected MPTcpExtMPJoinSynRx in ns1,
+# $3: expected MPTcpExtMPJoinSynAckRx in ns2,
+# $4: expected MPTcpExtMPJoinAckRx in ns1
+# An empty counter value is shown as [skip]; a mismatch sets ret=1 and
+# triggers a dump of the MPTcp counters of both namespaces.
+chk_join_nr()
+{
+ local msg="$1"
+ local syn_nr=$2
+ local syn_ack_nr=$3
+ local ack_nr=$4
+ local count
+ local dump_stats
+
+ printf "%02u %-36s %s" "$TEST_COUNT" "$msg" "syn"
+ count=$(get_counter ${ns1} "MPTcpExtMPJoinSynRx")
+ if [ -z "$count" ]; then
+ echo -n "[skip]"
+ elif [ "$count" != "$syn_nr" ]; then
+ echo "[fail] got $count JOIN[s] syn expected $syn_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - synack"
+ count=$(get_counter ${ns2} "MPTcpExtMPJoinSynAckRx")
+ if [ -z "$count" ]; then
+ echo -n "[skip]"
+ elif [ "$count" != "$syn_ack_nr" ]; then
+ echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - ack"
+ count=$(get_counter ${ns1} "MPTcpExtMPJoinAckRx")
+ if [ -z "$count" ]; then
+ echo "[skip]"
+ elif [ "$count" != "$ack_nr" ]; then
+ echo "[fail] got $count JOIN[s] ack expected $ack_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+ # on any mismatch, show all MPTcp counters to help debugging
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+# Print one result line comparing the ADD_ADDR counters with expectations.
+# $1: expected MPTcpExtAddAddr in ns2, $2: expected MPTcpExtEchoAdd in ns1
+# An empty counter value is shown as [skip]; a mismatch sets ret=1 and
+# triggers a dump of the MPTcp counters of both namespaces.
+chk_add_nr()
+{
+ local add_nr=$1
+ local echo_nr=$2
+ local count
+ local dump_stats
+
+ printf "%-39s %s" " " "add"
+ count=$(get_counter ${ns2} "MPTcpExtAddAddr")
+ if [ -z "$count" ]; then
+ echo -n "[skip]"
+ elif [ "$count" != "$add_nr" ]; then
+ echo "[fail] got $count ADD_ADDR[s] expected $add_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - echo "
+ count=$(get_counter ${ns1} "MPTcpExtEchoAdd")
+ if [ -z "$count" ]; then
+ echo "[skip]"
+ elif [ "$count" != "$echo_nr" ]; then
+ echo "[fail] got $count ADD_ADDR echo[s] expected $echo_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+
+ # on any mismatch, show all MPTcp counters to help debugging
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+# Print one result line comparing the removal counters with expectations.
+# $1: expected MPTcpExtRmAddr in ns1, $2: expected MPTcpExtRmSubflow in ns2
+# An empty counter value is shown as [skip]; a mismatch sets ret=1 and
+# triggers a dump of the MPTcp counters of both namespaces.
+chk_rm_nr()
+{
+ local rm_addr_nr=$1
+ local rm_subflow_nr=$2
+ local count
+ local dump_stats
+
+ printf "%-39s %s" " " "rm "
+ count=$(get_counter ${ns1} "MPTcpExtRmAddr")
+ if [ -z "$count" ]; then
+ echo -n "[skip]"
+ elif [ "$count" != "$rm_addr_nr" ]; then
+ echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo -n "[ ok ]"
+ fi
+
+ echo -n " - sf "
+ count=$(get_counter ${ns2} "MPTcpExtRmSubflow")
+ if [ -z "$count" ]; then
+ echo "[skip]"
+ elif [ "$count" != "$rm_subflow_nr" ]; then
+ echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr"
+ ret=1
+ dump_stats=1
+ else
+ echo "[ ok ]"
+ fi
+
+ # on any mismatch, show all MPTcp counters to help debugging
+ if [ "${dump_stats}" = 1 ]; then
+ echo Server ns stats
+ ip netns exec $ns1 nstat -as | grep MPTcp
+ echo Client ns stats
+ ip netns exec $ns2 nstat -as | grep MPTcp
+ fi
+}
+
+sin=$(mktemp)
+sout=$(mktemp)
+cin=$(mktemp)
+cout=$(mktemp)
+init
+make_file "$cin" "client"
+make_file "$sin" "server"
+trap cleanup EXIT
+
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "no JOIN" "0" "0" "0"
+
+# subflow limited by client
+reset
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow, limited by client" 0 0 0
+
+# subflow limited by server
+reset
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow, limited by server" 1 1 0
+
+# subflow
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow" 1 1 1
+
+# multiple subflows
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows" 2 2 2
+
+# multiple subflows limited by server
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows, limited by server" 2 2 1
+
+# add_address, unused
+reset
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "unused signal address" 0 0 0
+chk_add_nr 1 1
+
+# accept and use add_addr
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "signal address" 1 1 1
+chk_add_nr 1 1
+
+# accept and use add_addr with an additional subflow
+# note: signal address in server ns and local addresses in client ns must
+# belong to different subnets or one of the listed local address could be
+# used for 'add_addr' subflow
+reset
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflow and signal" 2 2 2
+chk_add_nr 1 1
+
+# accept and use add_addr with additional subflows
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows and signal" 3 3 3
+chk_add_nr 1 1
+
+# single subflow, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 0 1
+chk_join_nr "remove single subflow" 1 1 1
+chk_rm_nr 1 1
+
+# multiple subflows, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 0 2
+chk_join_nr "remove multiple subflows" 2 2 2
+chk_rm_nr 2 2
+
+# single address, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+run_remove_tests $ns1 $ns2 10.0.1.1 1 0
+chk_join_nr "remove single address" 1 1 1
+chk_add_nr 1 1
+chk_rm_nr 0 0
+
+# subflow and signal, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 1 1
+chk_join_nr "remove subflow and signal" 2 2 2
+chk_add_nr 1 1
+chk_rm_nr 1 1
+
+# subflows and signal, remove
+reset
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_remove_tests $ns1 $ns2 10.0.1.1 1 2
+chk_join_nr "remove subflows and signal" 3 3 3
+chk_add_nr 1 1
+chk_rm_nr 2 2
+
+# single subflow, syncookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "single subflow with syn cookies" 1 1 1
+
+# multiple subflows with syn cookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "multiple subflows with syn cookies" 2 2 2
+
+# multiple subflows limited by server
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflows limited by server w cookies" 2 2 1
+
+# test signal address with cookies
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 1
+ip netns exec $ns2 ./pm_nl_ctl limits 1 1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "signal address with syn cookies" 1 1 1
+chk_add_nr 1 1
+
+# test cookie with subflow and signal
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns1 ./pm_nl_ctl limits 0 2
+ip netns exec $ns2 ./pm_nl_ctl limits 1 2
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflow and signal w cookies" 2 2 2
+chk_add_nr 1 1
+
+# accept and use add_addr with additional subflows
+reset_with_cookies
+ip netns exec $ns1 ./pm_nl_ctl limits 0 3
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal
+ip netns exec $ns2 ./pm_nl_ctl limits 1 3
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
+ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
+run_tests $ns1 $ns2 10.0.1.1
+chk_join_nr "subflows and signal w. cookies" 3 3 3
+chk_add_nr 1 1
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/mptcp_lib.sh b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
new file mode 100644
index 000000000..f32045b23
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/mptcp_lib.sh
@@ -0,0 +1,104 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+readonly KSFT_FAIL=1
+readonly KSFT_SKIP=4
+
+# SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES env var can be set when validating all
+# features using the last version of the kernel and the selftests to make sure
+# a test is not being skipped by mistake.
+mptcp_lib_expect_all_features() {
+ [ "${SELFTESTS_MPTCP_LIB_EXPECT_ALL_FEATURES:-}" = "1" ]
+}
+
+# $1: msg
+mptcp_lib_fail_if_expected_feature() {
+ if mptcp_lib_expect_all_features; then
+ echo "ERROR: missing feature: ${*}"
+ exit ${KSFT_FAIL}
+ fi
+
+ return 1
+}
+
+# $1: file
+mptcp_lib_has_file() {
+ local f="${1}"
+
+ if [ -f "${f}" ]; then
+ return 0
+ fi
+
+ mptcp_lib_fail_if_expected_feature "${f} file not found"
+}
+
+mptcp_lib_check_mptcp() {
+ if ! mptcp_lib_has_file "/proc/sys/net/mptcp/enabled"; then
+ echo "SKIP: MPTCP support is not available"
+ exit ${KSFT_SKIP}
+ fi
+}
+
+mptcp_lib_check_kallsyms() {
+ if ! mptcp_lib_has_file "/proc/kallsyms"; then
+ echo "SKIP: CONFIG_KALLSYMS is missing"
+ exit ${KSFT_SKIP}
+ fi
+}
+
+# Internal: use mptcp_lib_kallsyms_has() instead
+__mptcp_lib_kallsyms_has() {
+ local sym="${1}"
+
+ mptcp_lib_check_kallsyms
+
+ grep -q " ${sym}" /proc/kallsyms
+}
+
+# $1: part of a symbol to look at, add '$' at the end for full name
+mptcp_lib_kallsyms_has() {
+ local sym="${1}"
+
+ if __mptcp_lib_kallsyms_has "${sym}"; then
+ return 0
+ fi
+
+ mptcp_lib_fail_if_expected_feature "${sym} symbol not found"
+}
+
+# $1: part of a symbol to look at, add '$' at the end for full name
+mptcp_lib_kallsyms_doesnt_have() {
+ local sym="${1}"
+
+ if ! __mptcp_lib_kallsyms_has "${sym}"; then
+ return 0
+ fi
+
+ mptcp_lib_fail_if_expected_feature "${sym} symbol has been found"
+}
+
+# !!!AVOID USING THIS!!!
+# Features might not land in the expected version and features can be backported
+#
+# $1: kernel version, e.g. 6.3
+mptcp_lib_kversion_ge() {
+ # Returns 0 when the running kernel is at least version "$1"
+ # (<major>.<minor>); otherwise hands over to
+ # mptcp_lib_fail_if_expected_feature, which either exits or returns 1.
+ local exp_maj="${1%.*}"
+ local exp_min="${1#*.}"
+ local v maj min
+
+ # If the kernel has backported features, set this env var to 1:
+ if [ "${SELFTESTS_MPTCP_LIB_NO_KVERSION_CHECK:-}" = "1" ]; then
+ return 0
+ fi
+
+ # Keep only "<major>.<minor>" from the running kernel release
+ v=$(uname -r | cut -d'.' -f1,2)
+ maj=${v%.*}
+ min=${v#*.}
+
+ if [ "${maj}" -gt "${exp_maj}" ] ||
+ { [ "${maj}" -eq "${exp_maj}" ] && [ "${min}" -ge "${exp_min}" ]; }; then
+ return 0
+ fi
+
+ # Here the running kernel (${v}) is older than the requested one (${1})
+ mptcp_lib_fail_if_expected_feature "kernel version ${v} lower than ${1}"
+}
diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh
new file mode 100755
index 000000000..fff6f74eb
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh
@@ -0,0 +1,136 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(dirname "${0}")/mptcp_lib.sh"
+
+ksft_skip=4
+ret=0
+
+usage() {
+ echo "Usage: $0 [ -h ]"
+}
+
+
+while getopts "h" option;do
+ # Only -h is accepted. The option string is spelled out here because no
+ # 'optstring' variable is defined in this script: with an empty string,
+ # getopts rejects every option, -h included.
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+sec=$(date +%s)
+rndh=$(printf %x $sec)-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+err=$(mktemp)
+ret=0
+
+cleanup()
+{
+ rm -f $err
+ ip netns del $ns1
+}
+
+mptcp_lib_check_mptcp
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+trap cleanup EXIT
+
+ip netns add $ns1 || exit $ksft_skip
+ip -net $ns1 link set lo up
+ip netns exec $ns1 sysctl -q net.mptcp.enabled=1
+
+check()
+{
+ # Run "$1", capturing stdout in 'out' and stderr in the $err file, then
+ # compare stdout with "$2"; "$3" is the title printed before the
+ # [ OK ]/[FAIL] verdict. Any failure sets the global 'ret' to 1.
+ local cmd="$1"
+ local expected="$2"
+ local msg="$3"
+ local out=`$cmd 2>$err`
+ # NOTE(review): $? below is the status of the 'local' builtin on the
+ # previous line, not the one of $cmd, so the first branch of the 'if'
+ # looks unreachable. Some callers expect an empty output from a lookup
+ # of an id that should not exist -- confirm what pm_nl_ctl returns in
+ # that case before changing this.
+ local cmd_ret=$?
+
+ printf "%-50s %s" "$msg"
+ if [ $cmd_ret -ne 0 ]; then
+ echo "[FAIL] command execution '$cmd' stderr "
+ cat $err
+ ret=1
+ elif [ "$out" = "$expected" ]; then
+ echo "[ OK ]"
+ else
+ echo -n "[FAIL] "
+ echo "expected '$expected' got '$out'"
+ ret=1
+ fi
+}
+
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list"
+
+default_limits="$(ip netns exec $ns1 ./pm_nl_ctl limits)"
+if mptcp_lib_expect_all_features; then
+ check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0
+subflows 0" "defaults limits"
+fi
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup
+check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1" "simple add/get addr"
+
+check "ip netns exec $ns1 ./pm_nl_ctl dump" \
+"id 1 flags 10.0.1.1
+id 2 flags subflow dev lo 10.0.1.2
+id 3 flags signal,backup 10.0.1.3" "dump addrs"
+
+ip netns exec $ns1 ./pm_nl_ctl del 2
+check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr"
+check "ip netns exec $ns1 ./pm_nl_ctl dump" \
+"id 1 flags 10.0.1.1
+id 3 flags signal,backup 10.0.1.3" "dump addrs after del"
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3
+check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr"
+
+ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal
+check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4" "id addr increment"
+
+for i in `seq 5 9`; do
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1
+done
+check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9" "hard addr limit"
+check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit"
+
+for i in `seq 9 256`; do
+ ip netns exec $ns1 ./pm_nl_ctl del $i
+ ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9
+done
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1
+id 3 flags signal,backup 10.0.1.3
+id 4 flags signal 10.0.1.4
+id 5 flags signal 10.0.1.5
+id 6 flags signal 10.0.1.6
+id 7 flags signal 10.0.1.7
+id 8 flags signal 10.0.1.8" "id limit"
+
+ip netns exec $ns1 ./pm_nl_ctl flush
+check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 9 1
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "rcv addrs above hard limit"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 1 9
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "$default_limits" "subflows above hard limit"
+
+ip netns exec $ns1 ./pm_nl_ctl limits 8 8
+check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8
+subflows 8" "set limits"
+
+exit $ret
diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
new file mode 100644
index 000000000..b24a2f17d
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c
@@ -0,0 +1,616 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/socket.h>
+#include <sys/types.h>
+
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <linux/rtnetlink.h>
+#include <linux/genetlink.h>
+
+#include "linux/mptcp.h"
+
+#ifndef MPTCP_PM_NAME
+#define MPTCP_PM_NAME "mptcp_pm"
+#endif
+
+/* Print the command line help on stderr, then terminate the process. */
+static void syntax(char *argv[])
+{
+ /* the sub-command names must match the ones dispatched by main() */
+ fprintf(stderr, "%s add|get|del|flush|dump|limits [<args>]\n", argv[0]);
+ /* add_addr() reads the address from argv[2], before any keyword */
+ fprintf(stderr, "\tadd <ip> [flags signal|subflow|backup] [id <nr>] [dev <name>]\n");
+ fprintf(stderr, "\tdel <id>\n");
+ fprintf(stderr, "\tget <id>\n");
+ fprintf(stderr, "\tflush\n");
+ fprintf(stderr, "\tdump\n");
+ fprintf(stderr, "\tlimits [<rcv addr max> <subflow max>]\n");
+ exit(0);
+}
+
+static int init_genl_req(char *data, int family, int cmd, int version)
+{
+ struct nlmsghdr *nh = (void *)data;
+ struct genlmsghdr *gh;
+ int off = 0;
+
+ nh->nlmsg_type = family;
+ nh->nlmsg_flags = NLM_F_REQUEST;
+ nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
+ off += NLMSG_ALIGN(sizeof(*nh));
+
+ gh = (void *)(data + off);
+ gh->cmd = cmd;
+ gh->version = version;
+ off += NLMSG_ALIGN(sizeof(*gh));
+ return off;
+}
+
+/*
+ * Report the content of an NLMSG_ERROR message on stderr: the attributes
+ * carrying a text message or an offset when the error code is 0, the error
+ * code itself otherwise. Exits if the message is too short.
+ */
+static void nl_error(struct nlmsghdr *nh)
+{
+ struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh);
+ int len = nh->nlmsg_len - sizeof(*nh);
+ uint32_t off;
+
+ /* compare as signed values, so that a negative 'len' is caught too */
+ if (len < (int)sizeof(struct nlmsgerr))
+ error(1, 0, "netlink error message truncated %d min %zu", len,
+ sizeof(struct nlmsgerr));
+
+ if (!err->error) {
+ /* check messages from kernel */
+ struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh);
+
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == NLMSGERR_ATTR_MSG)
+ fprintf(stderr, "netlink ext ack msg: %s\n",
+ (char *)RTA_DATA(attrs));
+ if (attrs->rta_type == NLMSGERR_ATTR_OFFS) {
+ memcpy(&off, RTA_DATA(attrs), 4);
+ fprintf(stderr, "netlink err off %d\n",
+ (int)off);
+ }
+ attrs = RTA_NEXT(attrs, len);
+ }
+ } else {
+ /* terminate the line: the caller prints its own message next */
+ fprintf(stderr, "netlink error %d\n", err->error);
+ }
+}
+
+/* do a netlink command and, if max > 0, fetch the reply */
+static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max)
+{
+ struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+ socklen_t addr_len;
+ void *data = nh;
+ int rem, ret;
+ int err = 0;
+
+ nh->nlmsg_len = len;
+ ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr));
+ if (ret != len)
+ error(1, errno, "send netlink: %uB != %uB\n", ret, len);
+ if (max == 0)
+ return 0;
+
+ addr_len = sizeof(nladdr);
+ rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len);
+ if (ret < 0)
+ error(1, errno, "recv netlink: %uB\n", ret);
+
+ /* Beware: the NLMSG_NEXT macro updates the 'rem' argument */
+ for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) {
+ if (nh->nlmsg_type == NLMSG_ERROR) {
+ nl_error(nh);
+ err = 1;
+ }
+ }
+ if (err)
+ error(1, 0, "bailing out due to netlink error[s]");
+ return ret;
+}
+
+static int genl_parse_getfamily(struct nlmsghdr *nlh)
+{
+ struct genlmsghdr *ghdr = NLMSG_DATA(nlh);
+ int len = nlh->nlmsg_len;
+ struct rtattr *attrs;
+
+ if (nlh->nlmsg_type != GENL_ID_CTRL)
+ error(1, errno, "Not a controller message, len=%d type=0x%x\n",
+ nlh->nlmsg_len, nlh->nlmsg_type);
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+
+ if (len < 0)
+ error(1, errno, "wrong controller message len %d\n", len);
+
+ if (ghdr->cmd != CTRL_CMD_NEWFAMILY)
+ error(1, errno, "Unknown controller command %d\n", ghdr->cmd);
+
+ attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == CTRL_ATTR_FAMILY_ID)
+ return *(__u16 *)RTA_DATA(attrs);
+ attrs = RTA_NEXT(attrs, len);
+ }
+
+ error(1, errno, "can't find CTRL_ATTR_FAMILY_ID attr");
+ return -1;
+}
+
+static int resolve_mptcp_pm_netlink(int fd)
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct nlmsghdr *nh;
+ struct rtattr *rta;
+ int namelen;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0);
+
+ rta = (void *)(data + off);
+ namelen = strlen(MPTCP_PM_NAME) + 1;
+ rta->rta_type = CTRL_ATTR_FAMILY_NAME;
+ rta->rta_len = RTA_LENGTH(namelen);
+ memcpy(RTA_DATA(rta), MPTCP_PM_NAME, namelen);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ do_nl_req(fd, nh, off, sizeof(data));
+ return genl_parse_getfamily((void *)data);
+}
+
+/*
+ * Build and send an MPTCP_PM_CMD_ADD_ADDR request.
+ *
+ * argv[2] is the IPv4 or IPv6 address; the following arguments are optional
+ * "flags <comma separated list>", "id <nr>" and "dev <name>" keywords.
+ * No reply is fetched. Returns 0; every failure terminates the process.
+ */
+int add_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ u_int16_t family;
+ u_int32_t flags;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+ int arg;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR,
+ MPTCP_PM_VER);
+
+ if (argc < 3)
+ syntax(argv);
+
+ /* open the nested address attribute, its length is fixed up at the end */
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* addr data */
+ rta = (void *)(data + off);
+ if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) {
+ family = AF_INET;
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
+ rta->rta_len = RTA_LENGTH(4);
+ } else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) {
+ family = AF_INET6;
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
+ rta->rta_len = RTA_LENGTH(16);
+ } else
+ error(1, errno, "can't parse ip %s", argv[2]);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* family */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
+ rta->rta_len = RTA_LENGTH(2);
+ memcpy(RTA_DATA(rta), &family, 2);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ for (arg = 3; arg < argc; arg++) {
+ if (!strcmp(argv[arg], "flags")) {
+ char *tok, *str;
+
+ /* flags */
+ flags = 0;
+ if (++arg >= argc)
+ error(1, 0, " missing flags value");
+
+ /* comma separated flag list */
+ for (str = argv[arg]; (tok = strtok(str, ","));
+ str = NULL) {
+ if (!strcmp(tok, "subflow"))
+ flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
+ else if (!strcmp(tok, "signal"))
+ flags |= MPTCP_PM_ADDR_FLAG_SIGNAL;
+ else if (!strcmp(tok, "backup"))
+ flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ /*
+ * strtok() has already cut argv[arg]:
+ * report the offending token instead
+ */
+ error(1, 0, "unknown flag %s", tok);
+ }
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &flags, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else if (!strcmp(argv[arg], "id")) {
+ if (++arg >= argc)
+ error(1, 0, " missing id value");
+
+ id = atoi(argv[arg]);
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else if (!strcmp(argv[arg], "dev")) {
+ int32_t ifindex;
+
+ if (++arg >= argc)
+ error(1, 0, " missing dev name");
+
+ ifindex = if_nametoindex(argv[arg]);
+ if (!ifindex)
+ error(1, errno, "unknown device %s", argv[arg]);
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &ifindex, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+ } else
+ error(1, 0, "unknown keyword %s", argv[arg]);
+ }
+ /* close the nest: it covers everything appended since nest_start */
+ nest->rta_len = off - nest_start;
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+int del_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR,
+ MPTCP_PM_VER);
+
+ /* the only argument is the address id */
+ if (argc != 3)
+ syntax(argv);
+
+ id = atoi(argv[2]);
+
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* build a dummy addr with only the ID set */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ nest->rta_len = off - nest_start;
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+/*
+ * Print on a single stdout line the address described by the attributes
+ * found in the 'len' bytes starting at 'attrs': id, flags, device and IP,
+ * each one in the order it appears in the message.
+ */
+static void print_addr(struct rtattr *attrs, int len)
+{
+ uint16_t family = 0;
+ char str[1024];
+ uint32_t flags;
+ uint8_t id;
+
+ while (RTA_OK(attrs, len)) {
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY)
+ memcpy(&family, RTA_DATA(attrs), 2);
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) {
+ /* not a libc failure: do not append a stale errno text */
+ if (family != AF_INET)
+ error(1, 0, "wrong IP (v4) for family %d",
+ family);
+ inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str));
+ printf("%s", str);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) {
+ if (family != AF_INET6)
+ error(1, 0, "wrong IP (v6) for family %d",
+ family);
+ inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str));
+ printf("%s", str);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) {
+ memcpy(&id, RTA_DATA(attrs), 1);
+ printf("id %d ", id);
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) {
+ memcpy(&flags, RTA_DATA(attrs), 4);
+
+ /* each printed flag is cleared, a ',' follows if more remain */
+ printf("flags ");
+ if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+ printf("signal");
+ flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL;
+ if (flags)
+ printf(",");
+ }
+
+ if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ printf("subflow");
+ flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW;
+ if (flags)
+ printf(",");
+ }
+
+ if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) {
+ printf("backup");
+ flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+ if (flags)
+ printf(",");
+ }
+
+ /* dump unknown flags, if any */
+ if (flags)
+ printf("0x%x", flags);
+ printf(" ");
+ }
+ if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) {
+ char name[IF_NAMESIZE], *ret;
+ int32_t ifindex;
+
+ memcpy(&ifindex, RTA_DATA(attrs), 4);
+ ret = if_indextoname(ifindex, name);
+ if (ret)
+ printf("dev %s ", ret);
+ else
+ /* keep the separator, as in the branch above */
+ printf("dev unknown/%d ", ifindex);
+ }
+
+ attrs = RTA_NEXT(attrs, len);
+ }
+ printf("\n");
+}
+
+/*
+ * Walk the netlink messages found in the 'total_len' bytes starting at 'nh'
+ * and print every nested MPTCP_PM_ATTR_ADDR attribute carried by a
+ * 'pm_family' message. Stops at NLMSG_DONE; NLMSG_ERROR messages are
+ * reported via nl_error().
+ */
+static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+ struct rtattr *attrs;
+
+ for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+ int len = nh->nlmsg_len;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ if (nh->nlmsg_type == NLMSG_ERROR)
+ nl_error(nh);
+ if (nh->nlmsg_type != pm_family)
+ continue;
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+ attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+ GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ /*
+ * Only the payload belongs to the nest: rta_len also
+ * counts the attribute header, which would let the
+ * nested walk run past the end of this attribute.
+ */
+ if (attrs->rta_type ==
+ (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED))
+ print_addr((void *)RTA_DATA(attrs),
+ RTA_PAYLOAD(attrs));
+ attrs = RTA_NEXT(attrs, len);
+ }
+ }
+}
+
+int get_addr(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct rtattr *rta, *nest;
+ struct nlmsghdr *nh;
+ int nest_start;
+ u_int8_t id;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+ MPTCP_PM_VER);
+
+ /* the only argument is the address id */
+ if (argc != 3)
+ syntax(argv);
+
+ id = atoi(argv[2]);
+
+ nest_start = off;
+ nest = (void *)(data + off);
+ nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+ nest->rta_len = RTA_LENGTH(0);
+ off += NLMSG_ALIGN(nest->rta_len);
+
+ /* build a dummy addr with only the ID set */
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+ rta->rta_len = RTA_LENGTH(1);
+ memcpy(RTA_DATA(rta), &id, 1);
+ off += NLMSG_ALIGN(rta->rta_len);
+ nest->rta_len = off - nest_start;
+
+ print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+ return 0;
+}
+
+int dump_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ pid_t pid = getpid();
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+ MPTCP_PM_VER);
+ nh->nlmsg_flags |= NLM_F_DUMP;
+ nh->nlmsg_seq = 1;
+ nh->nlmsg_pid = pid;
+ nh->nlmsg_len = off;
+
+ print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+ return 0;
+}
+
+int flush_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS,
+ MPTCP_PM_VER);
+
+ do_nl_req(fd, nh, off, 0);
+ return 0;
+}
+
+static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+ struct rtattr *attrs;
+ uint32_t max;
+
+ for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+ int len = nh->nlmsg_len;
+
+ if (nh->nlmsg_type == NLMSG_DONE)
+ break;
+ if (nh->nlmsg_type == NLMSG_ERROR)
+ nl_error(nh);
+ if (nh->nlmsg_type != pm_family)
+ continue;
+
+ len -= NLMSG_LENGTH(GENL_HDRLEN);
+ attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+ GENL_HDRLEN);
+ while (RTA_OK(attrs, len)) {
+ int type = attrs->rta_type;
+
+ if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS &&
+ type != MPTCP_PM_ATTR_SUBFLOWS)
+ goto next;
+
+ memcpy(&max, RTA_DATA(attrs), 4);
+ printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ?
+ "subflows" : "accept", max);
+
+next:
+ attrs = RTA_NEXT(attrs, len);
+ }
+ }
+}
+
+int get_set_limits(int fd, int pm_family, int argc, char *argv[])
+{
+ char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+ NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+ 1024];
+ uint32_t rcv_addr = 0, subflows = 0;
+ int cmd, len = sizeof(data);
+ struct nlmsghdr *nh;
+ int off = 0;
+
+ /* limit */
+ if (argc == 4) {
+ rcv_addr = atoi(argv[2]);
+ subflows = atoi(argv[3]);
+ cmd = MPTCP_PM_CMD_SET_LIMITS;
+ } else {
+ cmd = MPTCP_PM_CMD_GET_LIMITS;
+ }
+
+ memset(data, 0, sizeof(data));
+ nh = (void *)data;
+ off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER);
+
+ /* limit */
+ if (cmd == MPTCP_PM_CMD_SET_LIMITS) {
+ struct rtattr *rta = (void *)(data + off);
+
+ rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &rcv_addr, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ rta = (void *)(data + off);
+ rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS;
+ rta->rta_len = RTA_LENGTH(4);
+ memcpy(RTA_DATA(rta), &subflows, 4);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* do not expect a reply */
+ len = 0;
+ }
+
+ len = do_nl_req(fd, nh, off, len);
+ if (cmd == MPTCP_PM_CMD_GET_LIMITS)
+ print_limits(nh, pm_family, len);
+ return 0;
+}
+
+/*
+ * Open a generic netlink socket, resolve the MPTCP path manager family and
+ * run the sub-command named by argv[1]. A missing or unknown sub-command
+ * ends in syntax().
+ */
+int main(int argc, char *argv[])
+{
+ int fd, pm_family;
+
+ if (argc < 2)
+ syntax(argv);
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+ if (fd == -1)
+ error(1, errno, "socket netlink");
+
+ pm_family = resolve_mptcp_pm_netlink(fd);
+
+ if (!strcmp(argv[1], "add"))
+ return add_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "del"))
+ return del_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "flush"))
+ return flush_addrs(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "get"))
+ return get_addr(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "dump"))
+ return dump_addrs(fd, pm_family, argc, argv);
+ else if (!strcmp(argv[1], "limits"))
+ return get_set_limits(fd, pm_family, argc, argv);
+
+ /* end the line here: syntax() prints the usage right after */
+ fprintf(stderr, "unknown sub-command: %s\n", argv[1]);
+ syntax(argv);
+ return 0;
+}
diff --git a/tools/testing/selftests/net/mptcp/settings b/tools/testing/selftests/net/mptcp/settings
new file mode 100644
index 000000000..026384c18
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/settings
@@ -0,0 +1 @@
+timeout=450
diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh
new file mode 100755
index 000000000..b51afba24
--- /dev/null
+++ b/tools/testing/selftests/net/mptcp/simult_flows.sh
@@ -0,0 +1,297 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+. "$(dirname "${0}")/mptcp_lib.sh"
+
+rndh=$(printf %x "$(date +%s)")-$(mktemp -u XXXXXX)
+ns1="ns1-$rndh"
+ns2="ns2-$rndh"
+ns3="ns3-$rndh"
+capture=false
+ksft_skip=4
+timeout=30
+test_cnt=1
+ret=0
+bail=0
+
+usage() {
+ # Print the command line help on stdout
+ echo "Usage: $0 [ -b ] [ -c ] [ -d ]"
+ echo -e "\t-b: bail out after first error, otherwise runs all testcases"
+ echo -e "\t-c: capture packets for each test using tcpdump (default: no capture)"
+ echo -e "\t-d: debug this script"
+}
+
+cleanup()
+{
+ rm -f "$cout" "$sout"
+ rm -f "$large" "$small"
+ rm -f "$capout"
+
+ local netns
+ for netns in "$ns1" "$ns2" "$ns3";do
+ ip netns del $netns
+ done
+}
+
+mptcp_lib_check_mptcp
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+# "$ns1" ns2 ns3
+# ns1eth1 ns2eth1 ns2eth3 ns3eth1
+# netem
+# ns1eth2 ns2eth2
+# netem
+
+setup()
+{
+ large=$(mktemp)
+ small=$(mktemp)
+ sout=$(mktemp)
+ cout=$(mktemp)
+ capout=$(mktemp)
+ size=$((2048 * 4096))
+ dd if=/dev/zero of=$small bs=4096 count=20 >/dev/null 2>&1
+ dd if=/dev/zero of=$large bs=4096 count=$((size / 4096)) >/dev/null 2>&1
+
+ trap cleanup EXIT
+
+ for i in "$ns1" "$ns2" "$ns3";do
+ ip netns add $i || exit $ksft_skip
+ ip -net $i link set lo up
+ done
+
+ ip link add ns1eth1 netns "$ns1" type veth peer name ns2eth1 netns "$ns2"
+ ip link add ns1eth2 netns "$ns1" type veth peer name ns2eth2 netns "$ns2"
+ ip link add ns2eth3 netns "$ns2" type veth peer name ns3eth1 netns "$ns3"
+
+ ip -net "$ns1" addr add 10.0.1.1/24 dev ns1eth1
+ ip -net "$ns1" addr add dead:beef:1::1/64 dev ns1eth1 nodad
+ ip -net "$ns1" link set ns1eth1 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.1.2
+ ip -net "$ns1" route add default via dead:beef:1::2
+
+ ip -net "$ns1" addr add 10.0.2.1/24 dev ns1eth2
+ ip -net "$ns1" addr add dead:beef:2::1/64 dev ns1eth2 nodad
+ ip -net "$ns1" link set ns1eth2 up mtu 1500
+ ip -net "$ns1" route add default via 10.0.2.2 metric 101
+ ip -net "$ns1" route add default via dead:beef:2::2 metric 101
+
+ ip netns exec "$ns1" ./pm_nl_ctl limits 1 1
+ ip netns exec "$ns1" ./pm_nl_ctl add 10.0.2.1 dev ns1eth2 flags subflow
+ ip netns exec "$ns1" sysctl -q net.ipv4.conf.all.rp_filter=0
+
+ ip -net "$ns2" addr add 10.0.1.2/24 dev ns2eth1
+ ip -net "$ns2" addr add dead:beef:1::2/64 dev ns2eth1 nodad
+ ip -net "$ns2" link set ns2eth1 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.2.2/24 dev ns2eth2
+ ip -net "$ns2" addr add dead:beef:2::2/64 dev ns2eth2 nodad
+ ip -net "$ns2" link set ns2eth2 up mtu 1500
+
+ ip -net "$ns2" addr add 10.0.3.2/24 dev ns2eth3
+ ip -net "$ns2" addr add dead:beef:3::2/64 dev ns2eth3 nodad
+ ip -net "$ns2" link set ns2eth3 up mtu 1500
+ ip netns exec "$ns2" sysctl -q net.ipv4.ip_forward=1
+ ip netns exec "$ns2" sysctl -q net.ipv6.conf.all.forwarding=1
+
+ ip -net "$ns3" addr add 10.0.3.3/24 dev ns3eth1
+ ip -net "$ns3" addr add dead:beef:3::3/64 dev ns3eth1 nodad
+ ip -net "$ns3" link set ns3eth1 up mtu 1500
+ ip -net "$ns3" route add default via 10.0.3.2
+ ip -net "$ns3" route add default via dead:beef:3::2
+
+ ip netns exec "$ns3" ./pm_nl_ctl limits 1 1
+}
+
+# $1: ns, $2: port
+wait_local_port_listen()
+{
+ local listener_ns="${1}"
+ local port="${2}"
+
+ local port_hex i
+
+ port_hex="$(printf "%04X" "${port}")"
+ for i in $(seq 10); do
+ ip netns exec "${listener_ns}" cat /proc/net/tcp* | \
+ awk "BEGIN {rc=1} {if (\$2 ~ /:${port_hex}\$/ && \$4 ~ /0A/) {rc=0; exit}} END {exit rc}" &&
+ break
+ sleep 0.1
+ done
+}
+
+do_transfer()
+{
+ local cin=$1
+ local sin=$2
+ local max_time=$3
+ local port
+ port=$((10000+$test_cnt))
+ test_cnt=$((test_cnt+1))
+
+ :> "$cout"
+ :> "$sout"
+ :> "$capout"
+
+ local addr_port
+ addr_port=$(printf "%s:%d" ${connect_addr} ${port})
+
+ if $capture; then
+ local capuser
+ if [ -z $SUDO_USER ] ; then
+ capuser=""
+ else
+ capuser="-Z $SUDO_USER"
+ fi
+
+ local capfile="${rndh}-${port}"
+ local capopt="-i any -s 65535 -B 32768 ${capuser}"
+
+ ip netns exec ${ns3} tcpdump ${capopt} -w "${capfile}-listener.pcap" >> "${capout}" 2>&1 &
+ local cappid_listener=$!
+
+ ip netns exec ${ns1} tcpdump ${capopt} -w "${capfile}-connector.pcap" >> "${capout}" 2>&1 &
+ local cappid_connector=$!
+
+ sleep 1
+ fi
+
+ ip netns exec ${ns3} ./mptcp_connect -jt $timeout -l -p $port 0.0.0.0 < "$sin" > "$sout" &
+ local spid=$!
+
+ wait_local_port_listen "${ns3}" "${port}"
+
+ local start
+ start=$(date +%s%3N)
+ ip netns exec ${ns1} ./mptcp_connect -jt $timeout -p $port 10.0.3.3 < "$cin" > "$cout" &
+ local cpid=$!
+
+ wait $cpid
+ local retc=$?
+ wait $spid
+ local rets=$?
+
+ local stop
+ stop=$(date +%s%3N)
+
+ if $capture; then
+ sleep 1
+ kill ${cappid_listener}
+ kill ${cappid_connector}
+ fi
+
+ local duration
+ duration=$((stop-start))
+
+ cmp $sin $cout > /dev/null 2>&1
+ local cmps=$?
+ cmp $cin $sout > /dev/null 2>&1
+ local cmpc=$?
+
+ printf "%16s" "$duration max $max_time "
+ if [ $retc -eq 0 ] && [ $rets -eq 0 ] && \
+ [ $cmpc -eq 0 ] && [ $cmps -eq 0 ] && \
+ [ $duration -lt $max_time ]; then
+ echo "[ OK ]"
+ cat "$capout"
+ return 0
+ fi
+
+ echo " [ fail ]"
+ echo "client exit code $retc, server $rets" 1>&2
+ echo -e "\nnetns ${ns3} socket stat for $port:" 1>&2
+ ip netns exec ${ns3} ss -nita 1>&2 -o "sport = :$port"
+ echo -e "\nnetns ${ns1} socket stat for $port:" 1>&2
+ ip netns exec ${ns1} ss -nita 1>&2 -o "dport = :$port"
+ ls -l $sin $cout
+ ls -l $cin $sout
+
+ cat "$capout"
+ return 1
+}
+
+run_test()
+{
+ local rate1=$1
+ local rate2=$2
+ local delay1=$3
+ local delay2=$4
+ local lret
+ local dev
+ shift 4
+ local msg=$*
+
+ [ $delay1 -gt 0 ] && delay1="delay $delay1" || delay1=""
+ [ $delay2 -gt 0 ] && delay2="delay $delay2" || delay2=""
+
+ for dev in ns1eth1 ns1eth2; do
+ tc -n $ns1 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ for dev in ns2eth1 ns2eth2; do
+ tc -n $ns2 qdisc del dev $dev root >/dev/null 2>&1
+ done
+ tc -n $ns1 qdisc add dev ns1eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns1 qdisc add dev ns1eth2 root netem rate ${rate2}mbit $delay2
+ tc -n $ns2 qdisc add dev ns2eth1 root netem rate ${rate1}mbit $delay1
+ tc -n $ns2 qdisc add dev ns2eth2 root netem rate ${rate2}mbit $delay2
+
+ # time is measured in ms
+ local time=$((size * 8 * 1000 / (( $rate1 + $rate2) * 1024 *1024) ))
+
+ # mptcp_connect will do some sleeps to allow the mp_join handshake
+ # completion
+ time=$((time + 1350))
+
+ printf "%-50s" "$msg"
+ do_transfer $small $large $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+
+ printf "%-50s" "$msg - reverse direction"
+ do_transfer $large $small $((time * 11 / 10))
+ lret=$?
+ if [ $lret -ne 0 ]; then
+ ret=$lret
+ [ $bail -eq 0 ] || exit $ret
+ fi
+}
+
+while getopts "bcdh" option;do
+ case "$option" in
+ "h")
+ usage $0
+ exit 0
+ ;;
+ "b")
+ bail=1
+ ;;
+ "c")
+ capture=true
+ ;;
+ "d")
+ set -x
+ ;;
+ "?")
+ usage $0
+ exit 1
+ ;;
+ esac
+done
+
+setup
+run_test 10 10 0 0 "balanced bwidth"
+run_test 10 10 1 50 "balanced bwidth with unbalanced delay"
+
+# we still need some additional infrastructure to pass the following test-cases
+# run_test 30 10 0 0 "unbalanced bwidth"
+# run_test 30 10 1 50 "unbalanced bwidth with unbalanced delay"
+# run_test 30 10 50 1 "unbalanced bwidth with opposed, unbalanced delay"
+exit $ret
diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
new file mode 100644
index 000000000..bdc03a209
--- /dev/null
+++ b/tools/testing/selftests/net/msg_zerocopy.c
@@ -0,0 +1,811 @@
+/* Evaluate MSG_ZEROCOPY
+ *
+ * Send traffic between two processes over one of the supported
+ * protocols and modes:
+ *
+ * PF_INET/PF_INET6
+ * - SOCK_STREAM
+ * - SOCK_DGRAM
+ * - SOCK_DGRAM with UDP_CORK
+ * - SOCK_RAW
+ * - SOCK_RAW with IP_HDRINCL
+ *
+ * PF_PACKET
+ * - SOCK_DGRAM
+ * - SOCK_RAW
+ *
+ * PF_RDS
+ * - SOCK_SEQPACKET
+ *
+ * Start this program on two connected hosts, one in send mode and
+ * the other with option '-r' to put it in receiver mode.
+ *
+ * If zerocopy mode ('-z') is enabled, the sender will verify that
+ * the kernel queues completions on the error queue for all zerocopy
+ * transfers.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/errqueue.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <linux/rds.h>
+
+#ifndef SO_EE_ORIGIN_ZEROCOPY
+#define SO_EE_ORIGIN_ZEROCOPY 5
+#endif
+
+#ifndef SO_ZEROCOPY
+#define SO_ZEROCOPY 60
+#endif
+
+#ifndef SO_EE_CODE_ZEROCOPY_COPIED
+#define SO_EE_CODE_ZEROCOPY_COPIED 1
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+static int cfg_cork;
+static bool cfg_cork_mixed;
+static int cfg_cpu = -1; /* default: pin to last cpu */
+static int cfg_family = PF_UNSPEC;
+static int cfg_ifindex = 1;
+static int cfg_payload_len;
+static int cfg_port = 8000;
+static bool cfg_rx;
+static int cfg_runtime_ms = 4200;
+static int cfg_verbose;
+static int cfg_waittime_ms = 500;
+static bool cfg_zerocopy;
+
+static socklen_t cfg_alen;
+static struct sockaddr_storage cfg_dst_addr;
+static struct sockaddr_storage cfg_src_addr;
+
+static char payload[IP_MAXPACKET];
+static long packets, bytes, completions, expected_completions;
+static int zerocopied = -1;
+static uint32_t next_completion;
+
+/* Current gettimeofday() reading, converted to milliseconds. */
+static unsigned long gettimeofday_ms(void)
+{
+	struct timeval now;
+	unsigned long ms;
+
+	gettimeofday(&now, NULL);
+	ms = now.tv_sec * 1000;
+	ms += now.tv_usec / 1000;
+
+	return ms;
+}
+
+/* One's-complement checksum over @num_words 16-bit words starting at @start:
+ * sum the words, fold the carries back into the low 16 bits, then invert.
+ */
+static uint16_t get_ip_csum(const uint16_t *start, int num_words)
+{
+	unsigned long acc = 0;
+
+	while (num_words-- > 0)
+		acc += *start++;
+
+	/* fold until nothing is left above bit 15 */
+	for (; acc >> 16; )
+		acc = (acc >> 16) + (acc & 0xFFFF);
+
+	return ~acc;
+}
+
+/* Pin the calling process to @cpu.
+ *
+ * A negative @cpu (the cfg_cpu default) selects the last online cpu.
+ * CPU_SET() cannot represent a negative index, so without this step the
+ * mask would stay empty and sched_setaffinity() would always fail.
+ *
+ * Failure to pin is only reported, never fatal. Always returns 0.
+ */
+static int do_setcpu(int cpu)
+{
+	cpu_set_t mask;
+
+	if (cpu < 0) {
+		long ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+		/* fall back to cpu 0 if the cpu count is unavailable */
+		cpu = ncpus > 0 ? ncpus - 1 : 0;
+	}
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask))
+		fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
+	else if (cfg_verbose)
+		fprintf(stderr, "cpu: %u\n", cpu);
+
+	return 0;
+}
+
+/* Set an integer socket option; any failure terminates the program. */
+static void do_setsockopt(int fd, int level, int optname, int val)
+{
+	int rc = setsockopt(fd, level, optname, &val, sizeof(val));
+
+	if (rc != 0)
+		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
+}
+
+/* Wait up to cfg_waittime_ms for @events on @fd.
+ * Returns non-zero only if one of the requested events was reported;
+ * a poll() error terminates the program.
+ */
+static int do_poll(int fd, int events)
+{
+	struct pollfd pfd = { .fd = fd, .events = events, .revents = 0 };
+	int nready;
+
+	nready = poll(&pfd, 1, cfg_waittime_ms);
+	if (nready == -1)
+		error(1, errno, "poll");
+	if (!nready)
+		return 0;
+
+	return (pfd.revents & events) != 0;
+}
+
+/* Accept one connection on listening socket @fd, close the listener and
+ * return the connected socket. Any failure terminates the program.
+ */
+static int do_accept(int fd)
+{
+	int conn = accept(fd, NULL, NULL);
+
+	if (conn == -1)
+		error(1, errno, "accept");
+	if (close(fd) != 0)
+		error(1, errno, "close listen sock");
+
+	return conn;
+}
+
+/* Write an RDS zerocopy cookie control message carrying @cookie into the
+ * control buffer already attached to @msg.
+ */
+static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
+{
+	struct cmsghdr *hdr = msg->msg_control;
+
+	if (hdr == NULL)
+		error(1, errno, "NULL cookie");
+
+	hdr->cmsg_level = SOL_RDS;
+	hdr->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
+	hdr->cmsg_len = CMSG_LEN(sizeof(cookie));
+	memcpy(CMSG_DATA(hdr), &cookie, sizeof(cookie));
+}
+
+/* Send @msg on @fd without blocking and update the tx counters.
+ *
+ * With @do_zerocopy the send uses MSG_ZEROCOPY; for PF_RDS a cookie control
+ * message is attached for the duration of the call and detached afterwards,
+ * because the control buffer lives on this function's stack.
+ *
+ * Returns false if the socket would block (EAGAIN), true otherwise.
+ * Any other send error terminates the program.
+ */
+static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
+{
+	int ret, len, i, flags;
+	static uint32_t cookie;
+	char ckbuf[CMSG_SPACE(sizeof(cookie))];
+
+	len = 0;
+	for (i = 0; i < msg->msg_iovlen; i++)
+		len += msg->msg_iov[i].iov_len;
+
+	flags = MSG_DONTWAIT;
+	if (do_zerocopy) {
+		flags |= MSG_ZEROCOPY;
+		if (domain == PF_RDS) {
+			/* clear the buffer itself (padding included), not the
+			 * pointer field that is assigned right below
+			 */
+			memset(ckbuf, 0, sizeof(ckbuf));
+			msg->msg_control = ckbuf;
+			msg->msg_controllen = sizeof(ckbuf);
+			add_zcopy_cookie(msg, ++cookie);
+		}
+	}
+
+	ret = sendmsg(fd, msg, flags);
+	if (ret == -1 && errno == EAGAIN)
+		return false;
+	if (ret == -1)
+		error(1, errno, "send");
+	if (cfg_verbose && ret != len)
+		fprintf(stderr, "send: ret=%d != %d\n", ret, len);
+
+	if (len) {
+		packets++;
+		bytes += ret;
+		/* a completion is expected only if data was actually queued */
+		if (do_zerocopy && ret)
+			expected_completions++;
+	}
+	if (do_zerocopy && domain == PF_RDS) {
+		/* ckbuf goes out of scope on return: do not leave it attached */
+		msg->msg_control = NULL;
+		msg->msg_controllen = 0;
+	}
+
+	return true;
+}
+
+/* Send one payload as cfg_cork fragments under UDP_CORK.
+ * The payload is split evenly; when it does not divide evenly, the first
+ * fragment takes the remainder.
+ */
+static void do_sendmsg_corked(int fd, struct msghdr *msg)
+{
+	const int domain = cfg_dst_addr.ss_family == AF_INET ?
+			   PF_INET : PF_INET6;
+	const int frag_len = cfg_payload_len / cfg_cork;
+	const int remainder = cfg_payload_len - (cfg_cork * frag_len);
+	bool zc = cfg_zerocopy;
+	int n;
+
+	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
+
+	for (n = 0; n < cfg_cork; n++) {
+		/* in mixed-frags mode, alternate zerocopy and copy frags
+		 * start with non-zerocopy, to ensure attach later works
+		 */
+		if (cfg_cork_mixed)
+			zc = n & 1;
+
+		msg->msg_iov[0].iov_len = n ? frag_len : frag_len + remainder;
+
+		do_sendmsg(fd, msg, zc, domain);
+	}
+
+	do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
+}
+
+/* Fill @iph with an IPv4 header for a packet carrying @payload_len payload
+ * bytes, addressed from cfg_src_addr to cfg_dst_addr with protocol
+ * IPPROTO_EGP. Returns the size of the header in bytes.
+ */
+static int setup_iph(struct iphdr *iph, uint16_t payload_len)
+{
+ struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
+ struct sockaddr_in *saddr = (void *) &cfg_src_addr;
+
+ memset(iph, 0, sizeof(*iph));
+
+ iph->version = 4;
+ iph->tos = 0;
+ iph->ihl = 5;
+ iph->ttl = 2;
+ iph->saddr = saddr->sin_addr.s_addr;
+ iph->daddr = daddr->sin_addr.s_addr;
+ iph->protocol = IPPROTO_EGP;
+ iph->tot_len = htons(sizeof(*iph) + payload_len);
+ /* computed last, while the check field is still zero from the memset;
+  * ihl << 1 is the word count passed to get_ip_csum(), which sums
+  * 16-bit words
+  */
+ iph->check = get_ip_csum((void *) iph, iph->ihl << 1);
+
+ return sizeof(*iph);
+}
+
+/* Fill @ip6h with an IPv6 header for @payload_len payload bytes, addressed
+ * from cfg_src_addr to cfg_dst_addr with next header IPPROTO_EGP.
+ * Returns the size of the header in bytes.
+ */
+static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
+{
+ struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
+ struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
+
+ memset(ip6h, 0, sizeof(*ip6h));
+
+ ip6h->version = 6;
+ ip6h->payload_len = htons(payload_len);
+ ip6h->nexthdr = IPPROTO_EGP;
+ ip6h->hop_limit = 2;
+ ip6h->saddr = saddr->sin6_addr;
+ ip6h->daddr = daddr->sin6_addr;
+
+ return sizeof(*ip6h);
+}
+
+
+/* Initialise @sockaddr as an IPv4 or IPv6 socket address for @domain, using
+ * port cfg_port. @str_addr is the textual address; when it is NULL the
+ * address bytes stay zeroed from the memset. A parse failure or any other
+ * @domain terminates the program.
+ */
+static void setup_sockaddr(int domain, const char *str_addr,
+ struct sockaddr_storage *sockaddr)
+{
+ /* two typed views of the same storage; only one is used per call */
+ struct sockaddr_in6 *addr6 = (void *) sockaddr;
+ struct sockaddr_in *addr4 = (void *) sockaddr;
+
+ switch (domain) {
+ case PF_INET:
+ memset(addr4, 0, sizeof(*addr4));
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(cfg_port);
+ if (str_addr &&
+ inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+ error(1, 0, "ipv4 parse error: %s", str_addr);
+ break;
+ case PF_INET6:
+ memset(addr6, 0, sizeof(*addr6));
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(cfg_port);
+ if (str_addr &&
+ inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+ error(1, 0, "ipv6 parse error: %s", str_addr);
+ break;
+ default:
+ error(1, 0, "illegal domain");
+ }
+}
+
+/* Create and configure the sending socket.
+ * PF_RDS sockets are bound to cfg_src_addr, PF_PACKET sockets are left
+ * unconnected, every other domain is connected to cfg_dst_addr.
+ */
+static int do_setup_tx(int domain, int type, int protocol)
+{
+	int sock = socket(domain, type, protocol);
+
+	if (sock == -1)
+		error(1, errno, "socket t");
+
+	do_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, 1 << 21);
+	if (cfg_zerocopy)
+		do_setsockopt(sock, SOL_SOCKET, SO_ZEROCOPY, 1);
+
+	switch (domain) {
+	case PF_PACKET:
+		break;
+	case PF_RDS:
+		if (bind(sock, (void *) &cfg_src_addr, cfg_alen))
+			error(1, errno, "bind");
+		break;
+	default:
+		if (connect(sock, (void *) &cfg_dst_addr, cfg_alen))
+			error(1, errno, "connect");
+		break;
+	}
+
+	return sock;
+}
+
+/* Validate a batch of RDS zerocopy cookies and return how many it holds.
+ * The cookies are printed at verbosity level 2 and above.
+ */
+static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
+{
+	int i;
+
+	if (ck->num > RDS_MAX_ZCOOKIES)
+		error(1, 0, "Returned %d cookies, max expected %d\n",
+			ck->num, RDS_MAX_ZCOOKIES);
+
+	if (cfg_verbose >= 2) {
+		for (i = 0; i < ck->num; i++)
+			fprintf(stderr, "%d\n", ck->cookies[i]);
+	}
+
+	return ck->num;
+}
+
+/* Read one RDS zerocopy completion notification from @fd without blocking.
+ * Adds the number of reported cookies to the global completions counter.
+ * Returns true if a completion control message was processed.
+ */
+static bool do_recvmsg_completion(int fd)
+{
+ char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
+ struct rds_zcopy_cookies *ck;
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+ bool ret = false;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_control = cmsgbuf;
+ msg.msg_controllen = sizeof(cmsgbuf);
+
+ /* any non-zero return (error or payload bytes) is treated as
+  * "no completion"
+  */
+ if (recvmsg(fd, &msg, MSG_DONTWAIT))
+ return ret;
+
+ /* NOTE(review): errno is not set by this check, so the errno text in
+  * the message may be unrelated
+  */
+ if (msg.msg_flags & MSG_CTRUNC)
+ error(1, errno, "recvmsg notification: truncated");
+
+ /* stop at the first completion message; anything else is reported
+  * and skipped
+  */
+ for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
+ if (cmsg->cmsg_level == SOL_RDS &&
+ cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
+
+ ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
+ completions += do_process_zerocopy_cookies(ck);
+ ret = true;
+ break;
+ }
+ error(0, 0, "ignoring cmsg at level %d type %d\n",
+ cmsg->cmsg_level, cmsg->cmsg_type);
+ }
+ return ret;
+}
+
+/* Read one zerocopy completion notification for @fd.
+ *
+ * PF_RDS is delegated to do_recvmsg_completion(). For every other domain
+ * the notification is read from the socket error queue; it reports an
+ * inclusive range [ee_info, ee_data] that is added to the global
+ * completions counter.
+ *
+ * Returns false when the error queue is empty (EAGAIN), true after a
+ * notification was consumed. Malformed notifications terminate the program.
+ */
+static bool do_recv_completion(int fd, int domain)
+{
+ struct sock_extended_err *serr;
+ struct msghdr msg = {};
+ struct cmsghdr *cm;
+ uint32_t hi, lo, range;
+ int ret, zerocopy;
+ char control[100];
+
+ if (domain == PF_RDS)
+ return do_recvmsg_completion(fd);
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (ret == -1 && errno == EAGAIN)
+ return false;
+ if (ret == -1)
+ error(1, errno, "recvmsg notification");
+ /* NOTE(review): errno is not set by this check */
+ if (msg.msg_flags & MSG_CTRUNC)
+ error(1, errno, "recvmsg notification: truncated");
+
+ /* only the first control message is inspected */
+ cm = CMSG_FIRSTHDR(&msg);
+ if (!cm)
+ error(1, 0, "cmsg: no cmsg");
+ if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
+ (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
+ (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
+ error(1, 0, "serr: wrong type: %d.%d",
+ cm->cmsg_level, cm->cmsg_type);
+
+ serr = (void *) CMSG_DATA(cm);
+
+ if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
+ error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
+ if (serr->ee_errno != 0)
+ error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
+
+ /* inclusive range of completed sends */
+ hi = serr->ee_data;
+ lo = serr->ee_info;
+ range = hi - lo + 1;
+
+ /* Detect notification gaps. These should not happen often, if at all.
+ * Gaps can occur due to drops, reordering and retransmissions.
+ */
+ if (lo != next_completion)
+ fprintf(stderr, "gap: %u..%u does not append to %u\n",
+ lo, hi, next_completion);
+ next_completion = hi + 1;
+
+ /* track whether sends were zerocopy or fell back to copying;
+  * a change between notifications is reported
+  */
+ zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
+ if (zerocopied == -1)
+ zerocopied = zerocopy;
+ else if (zerocopied != zerocopy) {
+ fprintf(stderr, "serr: inconsistent\n");
+ zerocopied = zerocopy;
+ }
+
+ if (cfg_verbose >= 2)
+ fprintf(stderr, "completed: %u (h=%u l=%u)\n",
+ range, hi, lo);
+
+ completions += range;
+ return true;
+}
+
+/* Drain the errqueue: keep reading until no notification is left */
+static void do_recv_completions(int fd, int domain)
+{
+	for (;;) {
+		if (!do_recv_completion(fd, domain))
+			break;
+	}
+}
+
+/* Collect the completions still outstanding after the last send, giving up
+ * after cfg_waittime_ms. Missing completions are reported on stderr.
+ */
+static void do_recv_remaining_completions(int fd, int domain)
+{
+	const int wait_events = domain == PF_RDS ? POLLIN : POLLERR;
+	int64_t deadline = gettimeofday_ms() + cfg_waittime_ms;
+
+	for (;;) {
+		if (completions >= expected_completions)
+			break;
+		if (gettimeofday_ms() >= deadline)
+			break;
+		if (do_poll(fd, wait_events))
+			do_recv_completions(fd, domain);
+	}
+
+	if (completions < expected_completions)
+		fprintf(stderr, "missing notifications: %lu < %lu\n",
+			completions, expected_completions);
+}
+
+/* Sender main loop: transmit payloads for cfg_runtime_ms, collect zerocopy
+ * completions when cfg_zerocopy is set, then print the tx statistics.
+ *
+ * The message is built from up to three iovec slots, filled from the back:
+ *   iov[0] ethernet header (PF_PACKET with SOCK_RAW only)
+ *   iov[1] network header  (PF_PACKET, or raw sockets with IPPROTO_RAW)
+ *   iov[2] payload         (always)
+ * msg.msg_iov is then pointed at the first slot that is in use.
+ */
+static void do_tx(int domain, int type, int protocol)
+{
+ struct iovec iov[3] = { {0} };
+ struct sockaddr_ll laddr;
+ struct msghdr msg = {0};
+ struct ethhdr eth;
+ union {
+ struct ipv6hdr ip6h;
+ struct iphdr iph;
+ } nh;
+ uint64_t tstop;
+ int fd;
+
+ fd = do_setup_tx(domain, type, protocol);
+
+ if (domain == PF_PACKET) {
+ uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
+
+ /* sock_raw passes ll header as data */
+ if (type == SOCK_RAW) {
+ memset(eth.h_dest, 0x06, ETH_ALEN);
+ memset(eth.h_source, 0x02, ETH_ALEN);
+ eth.h_proto = htons(proto);
+ iov[0].iov_base = &eth;
+ iov[0].iov_len = sizeof(eth);
+ msg.msg_iovlen++;
+ }
+
+ /* both sock_raw and sock_dgram expect name */
+ memset(&laddr, 0, sizeof(laddr));
+ laddr.sll_family = AF_PACKET;
+ laddr.sll_ifindex = cfg_ifindex;
+ laddr.sll_protocol = htons(proto);
+ laddr.sll_halen = ETH_ALEN;
+
+ memset(laddr.sll_addr, 0x06, ETH_ALEN);
+
+ msg.msg_name = &laddr;
+ msg.msg_namelen = sizeof(laddr);
+ }
+
+ /* packet and raw sockets with hdrincl must pass network header */
+ if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
+ if (cfg_family == PF_INET)
+ iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
+ else
+ iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
+
+ iov[1].iov_base = (void *) &nh;
+ msg.msg_iovlen++;
+ }
+
+ /* RDS sockets are not connected in do_setup_tx(): pass the
+  * destination with every message
+  */
+ if (domain == PF_RDS) {
+ msg.msg_name = &cfg_dst_addr;
+ msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ }
+
+ iov[2].iov_base = payload;
+ iov[2].iov_len = cfg_payload_len;
+ msg.msg_iovlen++;
+ /* the used slots are the last msg_iovlen entries of iov[] */
+ msg.msg_iov = &iov[3 - msg.msg_iovlen];
+
+ tstop = gettimeofday_ms() + cfg_runtime_ms;
+ do {
+ if (cfg_cork)
+ do_sendmsg_corked(fd, &msg);
+ else
+ do_sendmsg(fd, &msg, cfg_zerocopy, domain);
+
+ /* until the socket is writable again, use the wait to reap
+  * completions
+  */
+ while (!do_poll(fd, POLLOUT)) {
+ if (cfg_zerocopy)
+ do_recv_completions(fd, domain);
+ }
+
+ } while (gettimeofday_ms() < tstop);
+
+ if (cfg_zerocopy)
+ do_recv_remaining_completions(fd, domain);
+
+ if (close(fd))
+ error(1, errno, "close");
+
+ fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n",
+ packets, bytes >> 20, completions,
+ zerocopied == 1 ? 'y' : 'n');
+}
+
+/* Create the receiving socket, bind it to cfg_dst_addr and, for
+ * SOCK_STREAM, wait for and accept a single connection.
+ * Returns the socket to read from. Unsupported combinations and any
+ * failure terminate the program.
+ */
+static int do_setup_rx(int domain, int type, int protocol)
+{
+ int fd;
+
+ /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
+ * to recv the only copy of the packet, not a clone
+ */
+ if (domain == PF_PACKET)
+ error(1, 0, "Use PF_INET/SOCK_RAW to read");
+
+ if (type == SOCK_RAW && protocol == IPPROTO_RAW)
+ error(1, 0, "IPPROTO_RAW: not supported on Rx");
+
+ fd = socket(domain, type, protocol);
+ if (fd == -1)
+ error(1, errno, "socket r");
+
+ do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
+ do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
+ do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
+
+ if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
+ error(1, errno, "bind");
+
+ if (type == SOCK_STREAM) {
+ if (listen(fd, 1))
+ error(1, errno, "listen");
+ /* do_accept() closes the listening socket */
+ fd = do_accept(fd);
+ }
+
+ return fd;
+}
+
+/* Discard whatever is queued on the tcp socket and account for it */
+static void do_flush_tcp(int fd)
+{
+	/* MSG_TRUNC flushes up to len bytes */
+	int n = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
+
+	if (n == -1) {
+		if (errno == EAGAIN)
+			return;
+		error(1, errno, "flush");
+	}
+
+	if (n > 0) {
+		packets++;
+		bytes += n;
+	}
+}
+
+/* Read one queued datagram, verify its length and its first bytes against
+ * the expected payload pattern, and account for it.
+ * Returns silently when nothing is queued (EAGAIN). Any other receive
+ * error or mismatch terminates the program.
+ */
+static void do_flush_datagram(int fd, int type)
+{
+	int ret, off = 0;
+	char buf[64];
+
+	/* MSG_TRUNC will return full datagram length */
+	ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
+	if (ret == -1 && errno == EAGAIN)
+		return;
+	/* test for failure before ret is adjusted below: afterwards -1 no
+	 * longer identifies an error
+	 */
+	if (ret == -1)
+		error(1, errno, "recv");
+
+	/* raw ipv4 return with header, raw ipv6 without */
+	if (cfg_family == PF_INET && type == SOCK_RAW) {
+		off += sizeof(struct iphdr);
+		ret -= sizeof(struct iphdr);
+	}
+
+	if (ret != cfg_payload_len)
+		error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
+	/* only the part of the payload that fits in buf can be compared */
+	if (ret > sizeof(buf) - off)
+		ret = sizeof(buf) - off;
+	if (memcmp(buf + off, payload, ret))
+		error(1, 0, "recv: data mismatch");
+
+	packets++;
+	bytes += cfg_payload_len;
+}
+
+/* Receiver main loop: drain the socket until cfg_runtime_ms plus a fixed
+ * grace period has elapsed, then print the rx statistics.
+ */
+static void do_rx(int domain, int type, int protocol)
+{
+	const int cfg_receiver_wait_ms = 400;
+	const bool is_stream = type == SOCK_STREAM;
+	uint64_t deadline;
+	int sock;
+
+	sock = do_setup_rx(domain, type, protocol);
+
+	deadline = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
+	for (;;) {
+		if (is_stream)
+			do_flush_tcp(sock);
+		else
+			do_flush_datagram(sock, type);
+
+		do_poll(sock, POLLIN);
+
+		if (gettimeofday_ms() >= deadline)
+			break;
+	}
+
+	if (close(sock) != 0)
+		error(1, errno, "close");
+
+	fprintf(stderr, "rx=%lu (%lu MB)\n", packets, bytes >> 20);
+}
+
+/* Run one test in the configured direction: pin the cpu, fill the payload
+ * buffer with a repeating a..z pattern, then receive or transmit.
+ */
+static void do_test(int domain, int type, int protocol)
+{
+	const bool corkable = domain != PF_PACKET && type == SOCK_DGRAM;
+	int off;
+
+	if (cfg_cork && !corkable)
+		error(1, 0, "can only cork udp sockets");
+
+	do_setcpu(cfg_cpu);
+
+	for (off = 0; off < IP_MAXPACKET; off++)
+		payload[off] = 'a' + (off % 26);
+
+	if (!cfg_rx)
+		do_tx(domain, type, protocol);
+	else
+		do_rx(domain, type, protocol);
+}
+
+/* Print the usage line for @filepath and terminate with exit status 1 */
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [options] <test>", filepath);
+}
+
+/* Parse the command line into the cfg_* globals and build the source and
+ * destination socket addresses.
+ *
+ * Exactly one positional argument, the test name, must follow the options.
+ * This is verified before the name is looked at, so a malformed command
+ * line produces the usage text rather than an unrelated error.
+ * Every validation failure terminates the program.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	const int max_payload_len = sizeof(payload) -
+				    sizeof(struct ipv6hdr) -
+				    sizeof(struct tcphdr) -
+				    40 /* max tcp options */;
+	int c;
+	char *daddr = NULL, *saddr = NULL;
+	char *cfg_test;
+
+	cfg_payload_len = max_payload_len;
+
+	while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
+		switch (c) {
+		case '4':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET6;
+			cfg_alen = sizeof(struct sockaddr_in6);
+			break;
+		case 'c':
+			cfg_cork = strtol(optarg, NULL, 0);
+			break;
+		case 'C':
+			cfg_cpu = strtol(optarg, NULL, 0);
+			break;
+		case 'D':
+			daddr = optarg;
+			break;
+		case 'i':
+			cfg_ifindex = if_nametoindex(optarg);
+			if (cfg_ifindex == 0)
+				error(1, errno, "invalid iface: %s", optarg);
+			break;
+		case 'm':
+			cfg_cork_mixed = true;
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'r':
+			cfg_rx = true;
+			break;
+		case 's':
+			cfg_payload_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'S':
+			saddr = optarg;
+			break;
+		case 't':
+			/* seconds on the command line, plus 200 ms */
+			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
+			break;
+		case 'v':
+			cfg_verbose++;
+			break;
+		case 'z':
+			cfg_zerocopy = true;
+			break;
+		}
+	}
+
+	/* validate the positional argument before using it */
+	if (optind != argc - 1)
+		usage(argv[0]);
+
+	cfg_test = argv[optind];
+	if (strcmp(cfg_test, "rds") == 0) {
+		if (!daddr)
+			error(1, 0, "-D <server addr> required for PF_RDS\n");
+		if (!cfg_rx && !saddr)
+			error(1, 0, "-S <client addr> required for PF_RDS\n");
+	}
+	setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
+	setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
+
+	if (cfg_payload_len > max_payload_len)
+		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
+	if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
+		error(1, 0, "-m: cork_mixed requires corking and zerocopy");
+}
+
+/* Parse the options, then map the test name (the last command-line
+ * argument) to the socket domain, type and protocol passed to do_test().
+ */
+int main(int argc, char **argv)
+{
+ const char *cfg_test;
+
+ parse_opts(argc, argv);
+
+ cfg_test = argv[argc - 1];
+
+ if (!strcmp(cfg_test, "packet"))
+ do_test(PF_PACKET, SOCK_RAW, 0);
+ else if (!strcmp(cfg_test, "packet_dgram"))
+ do_test(PF_PACKET, SOCK_DGRAM, 0);
+ else if (!strcmp(cfg_test, "raw"))
+ do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
+ else if (!strcmp(cfg_test, "raw_hdrincl"))
+ do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
+ else if (!strcmp(cfg_test, "tcp"))
+ do_test(cfg_family, SOCK_STREAM, 0);
+ else if (!strcmp(cfg_test, "udp"))
+ do_test(cfg_family, SOCK_DGRAM, 0);
+ else if (!strcmp(cfg_test, "rds"))
+ do_test(PF_RDS, SOCK_SEQPACKET, 0);
+ else
+ error(1, 0, "unknown cfg_test %s", cfg_test);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/msg_zerocopy.sh b/tools/testing/selftests/net/msg_zerocopy.sh
new file mode 100755
index 000000000..825ffec85
--- /dev/null
+++ b/tools/testing/selftests/net/msg_zerocopy.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+#
+# Send data between two processes across namespaces
+# Run twice: once without and once with zerocopy
+
+set -e
+
+readonly DEV="veth0"
+readonly DEV_MTU=65535
+readonly BIN="./msg_zerocopy"
+
+readonly RAND="$(mktemp -u XXXXXX)"
+readonly NSPREFIX="ns-${RAND}"
+readonly NS1="${NSPREFIX}1"
+readonly NS2="${NSPREFIX}2"
+
+readonly SADDR4='192.168.1.1'
+readonly DADDR4='192.168.1.2'
+readonly SADDR6='fd::1'
+readonly DADDR6='fd::2'
+
+readonly path_sysctl_mem="net.core.optmem_max"
+
+# No arguments: automated test
+# Re-invoke this script once per protocol; "$0" is quoted so that a script
+# path containing spaces still works.
+if [[ "$#" -eq "0" ]]; then
+	"$0" 4 tcp -t 1
+	"$0" 6 tcp -t 1
+	"$0" 4 udp -t 1
+	"$0" 6 udp -t 1
+	echo "OK. All tests passed"
+	exit 0
+fi
+
+# Argument parsing
+if [[ "$#" -lt "2" ]]; then
+	echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] <args>"
+	exit 1
+fi
+
+readonly IP="$1"
+shift
+readonly TXMODE="$1"
+shift
+# All remaining arguments, joined into one string that is word-split again
+# when it is handed to the test binary
+readonly EXTRA_ARGS="$*"
+
+# Argument parsing: configure addresses
+if [[ "${IP}" == "4" ]]; then
+ readonly SADDR="${SADDR4}"
+ readonly DADDR="${DADDR4}"
+elif [[ "${IP}" == "6" ]]; then
+ readonly SADDR="${SADDR6}"
+ readonly DADDR="${DADDR6}"
+else
+ echo "Invalid IP version ${IP}"
+ exit 1
+fi
+
+# Argument parsing: select receive mode
+#
+# This differs from send mode for
+# - packet: use raw recv, because packet receives skb clones
+# - raw_hdrincl: use raw recv, because hdrincl is a tx-only option
+case "${TXMODE}" in
+'packet' | 'packet_dgram' | 'raw_hdrincl')
+ RXMODE='raw'
+ ;;
+*)
+ RXMODE="${TXMODE}"
+ ;;
+esac
+
+# Start of state changes: install cleanup handler
+save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"
+
+cleanup() {
+ ip netns del "${NS2}"
+ ip netns del "${NS1}"
+ sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
+}
+
+trap cleanup EXIT
+
+# Configure system settings
+sysctl -w -q "${path_sysctl_mem}=1000000"
+
+# Create virtual ethernet pair between network namespaces
+ip netns add "${NS1}"
+ip netns add "${NS2}"
+
+ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
+	peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"
+
+# Bring the devices up
+ip -netns "${NS1}" link set "${DEV}" up
+ip -netns "${NS2}" link set "${DEV}" up
+
+# Set fixed MAC addresses on the devices
+ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02
+ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06
+
+# Add fixed IP addresses to the devices
+# Use the same constants that are passed to the test binary, so the
+# configured and the requested addresses cannot drift apart.
+ip -netns "${NS1}" addr add "${SADDR4}/24" dev "${DEV}"
+ip -netns "${NS2}" addr add "${DADDR4}/24" dev "${DEV}"
+ip -netns "${NS1}" addr add "${SADDR6}/64" dev "${DEV}" nodad
+ip -netns "${NS2}" addr add "${DADDR6}/64" dev "${DEV}" nodad
+
+# Optionally disable sg or csum offload to test edge cases
+# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off
+
+do_test() {
+ local readonly ARGS="$1"
+
+ echo "ipv${IP} ${TXMODE} ${ARGS}"
+ ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" &
+ sleep 0.2
+ ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}"
+ wait
+}
+
+do_test "${EXTRA_ARGS}"
+do_test "-z ${EXTRA_ARGS}"
+echo ok
diff --git a/tools/testing/selftests/net/netdevice.sh b/tools/testing/selftests/net/netdevice.sh
new file mode 100755
index 000000000..e3afcb424
--- /dev/null
+++ b/tools/testing/selftests/net/netdevice.sh
@@ -0,0 +1,205 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test is for checking network interface
+# For the moment it tests only ethernet interface (but wifi could be easily added)
+#
+# We assume that all network driver are loaded
+# if not they probably have failed earlier in the boot process and their logged error will be catched by another test
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# this function will try to up the interface
+# if already up, nothing done
+# arg1: network interface name
+kci_net_start()
+{
+	netdev=$1
+
+	# an interface that is already up is left alone
+	if ip link show "$netdev" | grep -q UP; then
+		echo "SKIP: $netdev: interface already up"
+		return $ksft_skip
+	fi
+
+	if ! ip link set "$netdev" up; then
+		echo "FAIL: $netdev: Fail to up interface"
+		return 1
+	fi
+
+	echo "PASS: $netdev: set interface up"
+	# remember that this run changed the interface state
+	NETDEV_STARTED=1
+	return 0
+}
+
+# this function will try to setup an IP and MAC address on a network interface
+# Doing nothing if the interface was already up
+# arg1: network interface name
+kci_net_setup()
+{
+	netdev=$1
+
+	# nothing to configure unless this run brought the interface up
+	if [ $NETDEV_STARTED -eq 0 ]; then
+		return 0
+	fi
+
+	# set a fixed MAC address and read it back to confirm it took effect
+	MACADDR='02:03:04:05:06:07'
+	if ! ip link set dev $netdev address "$MACADDR"; then
+		echo "FAIL: $netdev: Cannot set MAC address"
+	elif ip link show $netdev | grep -q "$MACADDR"; then
+		echo "PASS: $netdev: set MAC address"
+	else
+		echo "FAIL: $netdev: Cannot set MAC address"
+	fi
+
+	# an interface that already carries an address is not touched
+	if ip address show "$netdev" | grep '^[[:space:]]*inet'; then
+		echo "SKIP: $netdev: already have an IP"
+		return $ksft_skip
+	fi
+
+	# TODO what ipaddr to set ? DHCP ?
+	echo "SKIP: $netdev: set IP address"
+	return $ksft_skip
+}
+
+# test an ethtool command
+# arg1: return code for not supported (see ethtool code source)
+# arg2: summary of the command
+# arg3: command to execute
+kci_netdev_ethtool_test()
+{
+	if [ $# -lt 3 ]; then
+		echo "SKIP: $netdev: ethtool: invalid number of arguments"
+		return 1
+	fi
+
+	# the command string is deliberately word-split into command + args
+	$3 >/dev/null
+	ret=$?
+
+	if [ $ret -eq 0 ]; then
+		echo "PASS: $netdev: ethtool $2"
+		return 0
+	fi
+
+	# exit status $1 means the operation is not supported
+	if [ $ret -eq "$1" ]; then
+		echo "SKIP: $netdev: ethtool $2 not supported"
+		return $ksft_skip
+	fi
+
+	echo "FAIL: $netdev: ethtool $2"
+	return 1
+}
+
+# test ethtool commands
+# arg1: network interface name
+kci_netdev_ethtool()
+{
+ netdev=$1
+
+ #check presence of ethtool
+ ethtool --version 2>/dev/null >/dev/null
+ if [ $? -ne 0 ];then
+ echo "SKIP: ethtool not present"
+ return $ksft_skip
+ fi
+
+ # the feature list is written to a temporary file; nothing reads it
+ # back yet (see the TODO below)
+ TMP_ETHTOOL_FEATURES="$(mktemp)"
+ if [ ! -e "$TMP_ETHTOOL_FEATURES" ];then
+ # NOTE(review): reported as SKIP but returns 1, not $ksft_skip
+ echo "SKIP: Cannot create a tmp file"
+ return 1
+ fi
+
+ ethtool -k "$netdev" > "$TMP_ETHTOOL_FEATURES"
+ if [ $? -ne 0 ];then
+ echo "FAIL: $netdev: ethtool list features"
+ rm "$TMP_ETHTOOL_FEATURES"
+ return 1
+ fi
+ echo "PASS: $netdev: ethtool list features"
+ #TODO for each non fixed features, try to turn them on/off
+ rm "$TMP_ETHTOOL_FEATURES"
+
+ # first argument: the exit status that kci_netdev_ethtool_test reports
+ # as "not supported"; the results of these two checks do not affect
+ # the return value
+ kci_netdev_ethtool_test 74 'dump' "ethtool -d $netdev"
+ kci_netdev_ethtool_test 94 'stats' "ethtool -S $netdev"
+ return 0
+}
+
+# stop a netdev
+# arg1: network interface name
+kci_netdev_stop()
+{
+	netdev=$1
+
+	# only take down what this run brought up
+	if [ $NETDEV_STARTED -eq 0 ]; then
+		echo "SKIP: $netdev: interface kept up"
+		return 0
+	fi
+
+	if ip link set "$netdev" down; then
+		echo "PASS: $netdev: stop interface"
+		return 0
+	fi
+
+	echo "FAIL: $netdev: stop interface"
+	return 1
+}
+
+# run all test on a netdev
+# arg1: network interface name
+kci_test_netdev()
+{
+	NETDEV_STARTED=0
+	IFACE_TO_UPDOWN="$1"
+	IFACE_TO_TEST="$1"
+
+	# VLAN interfaces are listed as "name@master": test "name" but bring
+	# "master" up and down. The name is split only when it really
+	# contains an "@" ('cut' echoes delimiter-less input unchanged, which
+	# made the former check always true).
+	MASTER_IFACE=""
+	case "$1" in
+	*@*)
+		MASTER_IFACE="${1#*@}"
+		MASTER_IFACE="${MASTER_IFACE%%@*}"
+		;;
+	esac
+	if [ -n "$MASTER_IFACE" ]; then
+		IFACE_TO_UPDOWN="$MASTER_IFACE"
+		IFACE_TO_TEST="${1%%@*}"
+	fi
+
+	kci_net_start "$IFACE_TO_UPDOWN"
+
+	kci_net_setup "$IFACE_TO_TEST"
+
+	kci_netdev_ethtool "$IFACE_TO_TEST"
+
+	kci_netdev_stop "$IFACE_TO_UPDOWN"
+	# the individual results are only printed, never propagated
+	return 0
+}
+
+#check for needed privileges
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip
+fi
+
+# the whole test relies on the ip tool
+ip link show 2>/dev/null >/dev/null
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without the ip tool"
+ exit $ksft_skip
+fi
+
+TMP_LIST_NETDEV="$(mktemp)"
+if [ ! -e "$TMP_LIST_NETDEV" ];then
+ echo "FAIL: Cannot create a tmp file"
+ exit 1
+fi
+
+# collect the interface names matching the eth<N> / enp<N>s<N> patterns,
+# one per line, then run the tests on each of them
+ip link show |grep '^[0-9]' | grep -oE '[[:space:]].*eth[0-9]*:|[[:space:]].*enp[0-9]s[0-9]:' | cut -d\ -f2 | cut -d: -f1> "$TMP_LIST_NETDEV"
+while read netdev
+do
+ kci_test_netdev "$netdev"
+done < "$TMP_LIST_NETDEV"
+
+rm "$TMP_LIST_NETDEV"
+# always exits 0: failures of individual tests are only printed
+exit 0
diff --git a/tools/testing/selftests/net/nettest.c b/tools/testing/selftests/net/nettest.c
new file mode 100644
index 000000000..f75c53ce0
--- /dev/null
+++ b/tools/testing/selftests/net/nettest.c
@@ -0,0 +1,1815 @@
+// SPDX-License-Identifier: GPL-2.0
+/* nettest - used for functional tests of networking APIs
+ *
+ * Copyright (c) 2013-2019 David Ahern <dsahern@gmail.com>. All rights reserved.
+ */
+
+#define _GNU_SOURCE
+#include <features.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <linux/tcp.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <errno.h>
+
+/* Fallback value used only when the included headers do not define it. */
+#ifndef IPV6_UNICAST_IF
+#define IPV6_UNICAST_IF 76
+#endif
+/* Fallback value used only when the included headers do not define it. */
+#ifndef IPV6_MULTICAST_IF
+#define IPV6_MULTICAST_IF 17
+#endif
+
+/* Initial value of sock_args.port in main(). */
+#define DEFAULT_PORT 12345
+
+/* Classic two-argument max/min; each argument may be evaluated twice. */
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* Options collected from the command line and consulted by the
+ * socket setup, send and receive helpers in this file.
+ */
+struct sock_args {
+ /* local address */
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } local_addr;
+
+ /* remote address */
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } remote_addr;
+ int scope_id; /* remote scope; v6 send only */
+
+ struct in_addr grp; /* multicast group */
+
+ /* presence flags for the optional values of this struct;
+  * bind_test_only makes do_server()/do_client() stop once the
+  * socket has been set up
+  */
+ unsigned int has_local_ip:1,
+ has_remote_ip:1,
+ has_grp:1,
+ has_expected_laddr:1,
+ has_expected_raddr:1,
+ bind_test_only:1;
+
+ /* host byte order; converted with htons() where it is used */
+ unsigned short port;
+
+ int type; /* DGRAM, STREAM, RAW */
+ int protocol;
+ int version; /* AF_INET/AF_INET6 */
+
+ /* use_setsockopt: select the device via set_unicast_if() /
+  * set_multicast_if(); use_cmsg: with ifindex set, send datagrams
+  * through send_msg_cmsg(); dev: name passed to bind_to_device()
+  */
+ int use_setsockopt;
+ int use_cmsg;
+ const char *dev;
+ int ifindex;
+
+ /* TCP MD5 key; NULL-ness is tested before tcp_md5sig() is called */
+ const char *password;
+ /* prefix for MD5 password */
+ union {
+ struct sockaddr_in v4;
+ struct sockaddr_in6 v6;
+ } md5_prefix;
+ /* non-zero switches tcp_md5sig() to TCP_MD5SIG_EXT with a prefix */
+ unsigned int prefix_len;
+
+ /* expected addresses and device index for connection */
+ /* 0 disables the device index comparison */
+ int expected_ifindex;
+
+ /* local address */
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } expected_laddr;
+
+ /* remote address */
+ union {
+ struct in_addr in;
+ struct in6_addr in6;
+ } expected_raddr;
+};
+
+/* non-zero: log lines are tagged "server" instead of "client" */
+static int server_mode;
+/* seconds used for the select() timeouts in this file */
+static unsigned int prog_timeout = 5;
+/* non-zero: also read stdin in msg_loop() and do not echo as server */
+static unsigned int interactive;
+/* number of message rounds in msg_loop(); -1 disables the countdown */
+static int iter = 1;
+/* payload sent by the send_msg*() helpers */
+static char *msg = "Hello world!";
+/* strlen(msg), computed in msg_loop() */
+static int msglen;
+/* non-zero: the log_*() helpers print nothing */
+static int quiet;
+/* cleared after the first EACCES retry that enables SO_BROADCAST */
+static int try_broadcast = 1;
+
+/* Format the current local time as HH:MM:SS into timebuf.
+ * Falls back to "00:00:00" when strftime() produces nothing.
+ * Returns timebuf so the call can be used directly as an argument.
+ */
+static char *timestamp(char *timebuf, int buflen)
+{
+	time_t now = time(NULL);
+	size_t written = strftime(timebuf, buflen, "%T", localtime(&now));
+
+	if (written != 0)
+		return timebuf;
+
+	memset(timebuf, 0, buflen);
+	strncpy(timebuf, "00:00:00", buflen-1);
+	return timebuf;
+}
+
+/* printf-style message to stdout, prefixed with the time and the
+ * "server"/"client" role. Prints nothing when quiet is set.
+ */
+static void log_msg(const char *format, ...)
+{
+	const char *role = server_mode ? "server" : "client";
+	char timebuf[64];
+	va_list args;
+
+	if (quiet)
+		return;
+
+	timestamp(timebuf, sizeof(timebuf));
+	fprintf(stdout, "%s %s:", timebuf, role);
+
+	va_start(args, format);
+	vfprintf(stdout, format, args);
+	va_end(args);
+
+	fflush(stdout);
+}
+
+/* printf-style message to stderr, prefixed with the time and the
+ * "server"/"client" role. Prints nothing when quiet is set.
+ */
+static void log_error(const char *format, ...)
+{
+	const char *role = server_mode ? "server" : "client";
+	char timebuf[64];
+	va_list args;
+
+	if (quiet)
+		return;
+
+	timestamp(timebuf, sizeof(timebuf));
+	fprintf(stderr, "%s %s:", timebuf, role);
+
+	va_start(args, format);
+	vfprintf(stderr, format, args);
+	va_end(args);
+
+	fflush(stderr);
+}
+
+/* Like log_error(), then appends ": <errno>: <strerror(errno)>\n".
+ * Prints nothing when quiet is set.
+ */
+static void log_err_errno(const char *fmt, ...)
+{
+	/* Capture errno first: the time formatting and stdio calls below
+	 * may overwrite it before it gets printed.
+	 */
+	int saved_errno = errno;
+	char timebuf[64];
+	va_list args;
+
+	if (quiet)
+		return;
+
+	fprintf(stderr, "%s %s: ",
+		timestamp(timebuf, sizeof(timebuf)),
+		server_mode ? "server" : "client");
+	va_start(args, fmt);
+	vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	fprintf(stderr, ": %d: %s\n", saved_errno, strerror(saved_errno));
+	fflush(stderr);
+}
+
+/* Log "<desc> addr:port" (IPv4) or "<desc> [addr]:port" (IPv6) followed by
+ * a newline. Other families produce only the newline. No-op when quiet.
+ */
+static void log_address(const char *desc, struct sockaddr *sa)
+{
+	char addrstr[64];
+
+	if (quiet)
+		return;
+
+	switch (sa->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *s = (struct sockaddr_in *) sa;
+		const char *txt = inet_ntop(AF_INET, &s->sin_addr, addrstr,
+					    sizeof(addrstr));
+
+		log_msg("%s %s:%d", desc, txt, ntohs(s->sin_port));
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) sa;
+		const char *txt = inet_ntop(AF_INET6, &s6->sin6_addr, addrstr,
+					    sizeof(addrstr));
+
+		log_msg("%s [%s]:%d", desc, txt, ntohs(s6->sin6_port));
+		break;
+	}
+	default:
+		break;
+	}
+
+	printf("\n");
+
+	fflush(stdout);
+}
+
+/* Install args->password as the TCP MD5 signature key on sd.
+ * addr/alen: peer address the key applies to; replaced by
+ *            args->md5_prefix when args->prefix_len is set.
+ * Uses TCP_MD5SIG_EXT when a prefix length or an ifindex is given.
+ * Returns 0 on success (ENOENT from the kernel counts as success),
+ * negative on failure.
+ */
+static int tcp_md5sig(int sd, void *addr, socklen_t alen, struct sock_args *args)
+{
+	size_t keylen = strlen(args->password);
+	struct tcp_md5sig md5sig = {};
+	int opt = TCP_MD5SIG;
+	int rc;
+
+	/* tcpm_key is a fixed-size array; refuse anything that would
+	 * overflow it instead of copying past its end.
+	 */
+	if (keylen > TCP_MD5SIG_MAXKEYLEN) {
+		log_error("MD5 password longer than %d bytes\n",
+			  TCP_MD5SIG_MAXKEYLEN);
+		return -1;
+	}
+	/* same reasoning for the address copy below */
+	if (alen > sizeof(md5sig.tcpm_addr)) {
+		log_error("address too large for TCP_MD5SIG\n");
+		return -1;
+	}
+
+	md5sig.tcpm_keylen = keylen;
+	memcpy(md5sig.tcpm_key, args->password, keylen);
+
+	if (args->prefix_len) {
+		opt = TCP_MD5SIG_EXT;
+		md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_PREFIX;
+
+		md5sig.tcpm_prefixlen = args->prefix_len;
+		addr = &args->md5_prefix;
+	}
+	memcpy(&md5sig.tcpm_addr, addr, alen);
+
+	if (args->ifindex) {
+		opt = TCP_MD5SIG_EXT;
+		md5sig.tcpm_flags |= TCP_MD5SIG_FLAG_IFINDEX;
+
+		md5sig.tcpm_ifindex = args->ifindex;
+	}
+
+	rc = setsockopt(sd, IPPROTO_TCP, opt, &md5sig, sizeof(md5sig));
+	if (rc < 0) {
+		/* ENOENT is harmless. Returned when a password is cleared */
+		if (errno == ENOENT)
+			rc = 0;
+		else
+			log_err_errno("setsockopt(TCP_MD5SIG)");
+	}
+
+	return rc;
+}
+
+/* Build the remote sockaddr from args (family, port, remote address) and
+ * install the MD5 key for it via tcp_md5sig().
+ * Returns 0 on success, -1 on failure; exits on an unknown family.
+ */
+static int tcp_md5_remote(int sd, struct sock_args *args)
+{
+	struct sockaddr_in sin = { .sin_family = AF_INET };
+	struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6 };
+	void *addr;
+	int alen;
+
+	if (args->version == AF_INET) {
+		sin.sin_port = htons(args->port);
+		sin.sin_addr = args->remote_addr.in;
+		addr = &sin;
+		alen = sizeof(sin);
+	} else if (args->version == AF_INET6) {
+		sin6.sin6_port = htons(args->port);
+		sin6.sin6_addr = args->remote_addr.in6;
+		addr = &sin6;
+		alen = sizeof(sin6);
+	} else {
+		log_error("unknown address family\n");
+		exit(1);
+	}
+
+	return tcp_md5sig(sd, addr, alen, args) ? -1 : 0;
+}
+
+/* Look up the interface index for ifname with SIOCGIFINDEX.
+ * Returns the index, or -1 when ifname is NULL/empty/too long or the
+ * lookup fails.
+ */
+static int get_ifidx(const char *ifname)
+{
+	struct ifreq ifdata;
+	int sd, rc;
+
+	if (!ifname || *ifname == '\0')
+		return -1;
+
+	/* ifr_name is a fixed-size array; a longer name cannot name a real
+	 * device and would overflow the copy below.
+	 */
+	if (strlen(ifname) >= sizeof(ifdata.ifr_name)) {
+		log_error("interface name too long\n");
+		return -1;
+	}
+
+	memset(&ifdata, 0, sizeof(ifdata));
+
+	strcpy(ifdata.ifr_name, ifname);
+
+	sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	if (sd < 0) {
+		log_err_errno("socket failed");
+		return -1;
+	}
+
+	rc = ioctl(sd, SIOCGIFINDEX, (char *)&ifdata);
+	close(sd);
+	if (rc != 0) {
+		log_err_errno("ioctl(SIOCGIFINDEX) failed");
+		return -1;
+	}
+
+	return ifdata.ifr_ifindex;
+}
+
+/* Bind sd to the device called name via SO_BINDTODEVICE.
+ * Returns the setsockopt() result; a failure is logged.
+ */
+static int bind_to_device(int sd, const char *name)
+{
+	socklen_t optlen = strlen(name)+1;
+	int rc = setsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, optlen);
+
+	if (rc >= 0)
+		return rc;
+
+	log_err_errno("setsockopt(SO_BINDTODEVICE)");
+	return rc;
+}
+
+/* Read the SO_BINDTODEVICE device name of sd into name (len bytes).
+ * name is set to the empty string before the call.
+ * Returns the getsockopt() result; a failure is logged.
+ */
+static int get_bind_to_device(int sd, char *name, size_t len)
+{
+	int rc;
+	socklen_t optlen = len;
+
+	name[0] = '\0';
+	rc = getsockopt(sd, SOL_SOCKET, SO_BINDTODEVICE, name, &optlen);
+	if (rc < 0)
+		/* name the call that actually failed */
+		log_err_errno("getsockopt(SO_BINDTODEVICE)");
+
+	return rc;
+}
+
+/* Log the device sd is bound to and, when args->expected_ifindex is set,
+ * compare it with the bound device's index.
+ * Returns 0 on match or when no expectation is set, 1 on mismatch.
+ */
+static int check_device(int sd, struct sock_args *args)
+{
+	char name[32];
+	int ifindex = 0;
+
+	if (get_bind_to_device(sd, name, sizeof(name)) == 0)
+		ifindex = get_ifidx(name);
+	else
+		*name = '\0';
+
+	log_msg(" bound to device %s/%d\n",
+		*name ? name : "<none>", ifindex);
+
+	if (args->expected_ifindex == 0)
+		return 0;
+
+	if (args->expected_ifindex == ifindex) {
+		log_msg("Device index matches: expected %d have %d\n",
+			args->expected_ifindex, ifindex);
+		return 0;
+	}
+
+	log_error("Device index mismatch: expected %d have %d\n",
+		  args->expected_ifindex, ifindex);
+	return 1;
+}
+
+/* Enable IP_PKTINFO ancillary data on sd.
+ * Returns the setsockopt() result. A failure is logged unless the cause
+ * is ENOTSUP.
+ */
+static int set_pktinfo_v4(int sd)
+{
+	int one = 1;
+	int rc;
+
+	rc = setsockopt(sd, SOL_IP, IP_PKTINFO, &one, sizeof(one));
+	/* setsockopt() returns -1 and reports the cause through errno */
+	if (rc < 0 && errno != ENOTSUP)
+		log_err_errno("setsockopt(IP_PKTINFO)");
+
+	return rc;
+}
+
+/* Enable IPV6_RECVPKTINFO ancillary data on sd.
+ * Returns the setsockopt() result. A failure is logged unless the cause
+ * is ENOTSUP.
+ */
+static int set_recvpktinfo_v6(int sd)
+{
+	int one = 1;
+	int rc;
+
+	rc = setsockopt(sd, SOL_IPV6, IPV6_RECVPKTINFO, &one, sizeof(one));
+	/* setsockopt() returns -1 and reports the cause through errno */
+	if (rc < 0 && errno != ENOTSUP)
+		log_err_errno("setsockopt(IPV6_RECVPKTINFO)");
+
+	return rc;
+}
+
+/* Enable IP_RECVERR on sd.
+ * Returns the setsockopt() result. A failure is logged unless the cause
+ * is ENOTSUP.
+ */
+static int set_recverr_v4(int sd)
+{
+	int one = 1;
+	int rc;
+
+	rc = setsockopt(sd, SOL_IP, IP_RECVERR, &one, sizeof(one));
+	/* setsockopt() returns -1 and reports the cause through errno */
+	if (rc < 0 && errno != ENOTSUP)
+		log_err_errno("setsockopt(IP_RECVERR)");
+
+	return rc;
+}
+
+/* Enable IPV6_RECVERR on sd.
+ * Returns the setsockopt() result. A failure is logged unless the cause
+ * is ENOTSUP.
+ */
+static int set_recverr_v6(int sd)
+{
+	int one = 1;
+	int rc;
+
+	rc = setsockopt(sd, SOL_IPV6, IPV6_RECVERR, &one, sizeof(one));
+	/* setsockopt() returns -1 and reports the cause through errno */
+	if (rc < 0 && errno != ENOTSUP)
+		log_err_errno("setsockopt(IPV6_RECVERR)");
+
+	return rc;
+}
+
+/* Set the unicast output interface of sd to ifindex, using
+ * IPV6_UNICAST_IF for AF_INET6 and IP_UNICAST_IF otherwise.
+ * The index is passed to the kernel in network byte order.
+ * Returns the setsockopt() result; a failure is logged.
+ */
+static int set_unicast_if(int sd, int ifindex, int version)
+{
+	const int is_v6 = (version == AF_INET6);
+	int level = is_v6 ? SOL_IPV6 : SOL_IP;
+	int opt = is_v6 ? IPV6_UNICAST_IF : IP_UNICAST_IF;
+	int rc;
+
+	ifindex = htonl(ifindex);
+
+	rc = setsockopt(sd, level, opt, &ifindex, sizeof(ifindex));
+	if (rc >= 0)
+		return rc;
+
+	log_err_errno("setsockopt(IP_UNICAST_IF)");
+	return rc;
+}
+
+/* Select ifindex as the outgoing multicast interface of sd
+ * (IP_MULTICAST_IF with a struct ip_mreqn).
+ * Returns the setsockopt() result; a failure is logged.
+ */
+static int set_multicast_if(int sd, int ifindex)
+{
+	struct ip_mreqn mreq = { .imr_ifindex = ifindex };
+	int rc = setsockopt(sd, SOL_IP, IP_MULTICAST_IF, &mreq, sizeof(mreq));
+
+	if (rc >= 0)
+		return rc;
+
+	log_err_errno("setsockopt(IP_MULTICAST_IF)");
+	return rc;
+}
+
+/* Join multicast group grp on sd (IP_ADD_MEMBERSHIP).
+ * addr: local interface address, ifindex: device index; at least one of
+ * them must be given (addr != INADDR_ANY or ifindex != 0).
+ * Returns 0 on success, -1 on failure.
+ */
+static int set_membership(int sd, uint32_t grp, uint32_t addr, int ifindex)
+{
+	struct ip_mreqn mreq;
+
+	if (!ifindex && addr == htonl(INADDR_ANY)) {
+		log_error("Either local address or device needs to be given for multicast membership\n");
+		return -1;
+	}
+
+	mreq.imr_multiaddr.s_addr = grp;
+	mreq.imr_address.s_addr = addr;
+	mreq.imr_ifindex = ifindex;
+
+	if (setsockopt(sd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+		       &mreq, sizeof(mreq)) >= 0)
+		return 0;
+
+	log_err_errno("setsockopt(IP_ADD_MEMBERSHIP)");
+	return -1;
+}
+
+/* Enable SO_BROADCAST on sd. Returns 0 on success, -1 (logged) on failure. */
+static int set_broadcast(int sd)
+{
+	unsigned int one = 1;
+
+	if (setsockopt(sd, SOL_SOCKET, SO_BROADCAST, &one, sizeof(one)) == 0)
+		return 0;
+
+	log_err_errno("setsockopt(SO_BROADCAST)");
+	return -1;
+}
+
+/* Enable SO_REUSEPORT on sd. Returns 0 on success, -1 (logged) on failure. */
+static int set_reuseport(int sd)
+{
+	unsigned int one = 1;
+
+	if (setsockopt(sd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) == 0)
+		return 0;
+
+	log_err_errno("setsockopt(SO_REUSEPORT)");
+	return -1;
+}
+
+/* Enable SO_REUSEADDR on sd. Returns 0 on success, -1 (logged) on failure. */
+static int set_reuseaddr(int sd)
+{
+	unsigned int one = 1;
+
+	if (setsockopt(sd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) == 0)
+		return 0;
+
+	log_err_errno("setsockopt(SO_REUSEADDR)");
+	return -1;
+}
+
+/* Parse str as an unsigned number (base auto-detected by strtoul()).
+ * The whole string must be consumed, apart from one optional trailing
+ * newline, and the value must lie in [min, max].
+ * Returns 0 and stores the number in *value on success, -1 otherwise.
+ */
+static int str_to_uint(const char *str, int min, int max, unsigned int *value)
+{
+	unsigned long number;
+	char *end;
+
+	errno = 0;
+	number = strtoul(str, &end, 0);
+
+	/* nothing parsed, or trailing garbage */
+	if (end == str || (*end != '\0' && *end != '\n'))
+		return -1;
+
+	/* Check the full-width result before narrowing it: truncating
+	 * first would let e.g. "4294967297" slip through as 1.
+	 */
+	if (errno == ERANGE || number > INT_MAX)
+		return -1;
+
+	if ((int) number < min || (int) number > max)
+		return -1;
+
+	*value = (unsigned int) number;
+	return 0;
+}
+
+/* Compare the address in sa with expected (struct in_addr or
+ * struct in6_addr, chosen by sa->sa_family) and log the outcome.
+ * Returns 0 on match, 1 on mismatch or unknown family.
+ */
+static int expected_addr_match(struct sockaddr *sa, void *expected,
+			       const char *desc)
+{
+	char addrstr[64];
+	int rc = 0;
+
+	switch (sa->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *s = (struct sockaddr_in *) sa;
+		struct in_addr *exp_in = (struct in_addr *) expected;
+
+		if (s->sin_addr.s_addr == exp_in->s_addr)
+			break;
+
+		log_error("%s address does not match expected %s",
+			  desc,
+			  inet_ntop(AF_INET, exp_in,
+				    addrstr, sizeof(addrstr)));
+		rc = 1;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) sa;
+		struct in6_addr *exp_in = (struct in6_addr *) expected;
+
+		if (memcmp(&s6->sin6_addr, exp_in, sizeof(*exp_in)) == 0)
+			break;
+
+		log_error("%s address does not match expected %s",
+			  desc,
+			  inet_ntop(AF_INET6, exp_in,
+				    addrstr, sizeof(addrstr)));
+		rc = 1;
+		break;
+	}
+	default:
+		log_error("%s address does not match expected - unknown family",
+			  desc);
+		rc = 1;
+		break;
+	}
+
+	if (rc == 0)
+		log_msg("%s address matches expected\n", desc);
+
+	return rc;
+}
+
+/* Log the local and peer addresses of sd and, when requested through
+ * args->has_expected_laddr / has_expected_raddr, compare them with the
+ * expected ones.
+ * Returns 0 when every requested comparison matched, non-zero otherwise.
+ * Failing getsockname()/getpeername() calls are logged but do not change
+ * the return value.
+ */
+static int show_sockstat(int sd, struct sock_args *args)
+{
+	struct sockaddr_in6 local_addr, remote_addr;
+	socklen_t alen = sizeof(local_addr);
+	struct sockaddr *sa;
+	const char *desc;
+	int rc = 0;
+
+	desc = server_mode ? "server local:" : "client local:";
+	sa = (struct sockaddr *) &local_addr;
+	if (getsockname(sd, sa, &alen) == 0) {
+		log_address(desc, sa);
+
+		if (args->has_expected_laddr) {
+			rc = expected_addr_match(sa, &args->expected_laddr,
+						 "local");
+		}
+	} else {
+		log_err_errno("getsockname failed");
+	}
+
+	sa = (struct sockaddr *) &remote_addr;
+	desc = server_mode ? "server peer:" : "client peer:";
+	/* alen is value-result: getsockname() shrank it to the size it
+	 * wrote, so restore the full buffer size for the next call.
+	 */
+	alen = sizeof(remote_addr);
+	if (getpeername(sd, sa, &alen) == 0) {
+		log_address(desc, sa);
+
+		if (args->has_expected_raddr) {
+			rc |= expected_addr_match(sa, &args->expected_raddr,
+						  "remote");
+		}
+	} else {
+		log_err_errno("getpeername failed");
+	}
+
+	return rc;
+}
+
+/* Walk the control messages of m and return the interface index carried
+ * in an IP_PKTINFO or IPV6_PKTINFO entry (the last one found wins).
+ * Returns 0 when no such entry is present; a non-zero index is logged
+ * together with the destination address.
+ */
+static int get_index_from_cmsg(struct msghdr *m)
+{
+	struct cmsghdr *cm = (struct cmsghdr *)CMSG_FIRSTHDR(m);
+	int ifindex = 0;
+	char buf[64];
+
+	while (m->msg_controllen != 0 && cm) {
+		if (cm->cmsg_level == SOL_IP &&
+		    cm->cmsg_type == IP_PKTINFO) {
+			struct in_pktinfo *pi;
+
+			pi = (struct in_pktinfo *)(CMSG_DATA(cm));
+			inet_ntop(AF_INET, &pi->ipi_addr, buf, sizeof(buf));
+			ifindex = pi->ipi_ifindex;
+		} else if (cm->cmsg_level == SOL_IPV6 &&
+			   cm->cmsg_type == IPV6_PKTINFO) {
+			struct in6_pktinfo *pi6;
+
+			pi6 = (struct in6_pktinfo *)(CMSG_DATA(cm));
+			inet_ntop(AF_INET6, &pi6->ipi6_addr, buf, sizeof(buf));
+			ifindex = pi6->ipi6_ifindex;
+		}
+
+		cm = (struct cmsghdr *)CMSG_NXTHDR(m, cm);
+	}
+
+	if (ifindex != 0)
+		log_msg(" pktinfo: ifindex %d dest addr %s\n",
+			ifindex, buf);
+
+	return ifindex;
+}
+
+/* Send the global msg to addr with sendto().
+ * On the first EACCES (while try_broadcast is set) SO_BROADCAST is
+ * enabled and the send is retried.
+ * Returns 0 on success, 1 on failure (logged).
+ */
+static int send_msg_no_cmsg(int sd, void *addr, socklen_t alen)
+{
+	for (;;) {
+		if (sendto(sd, msg, msglen, 0, addr, alen) >= 0)
+			return 0;
+
+		if (errno != EACCES || !try_broadcast)
+			break;
+
+		try_broadcast = 0;
+		if (set_broadcast(sd)) {
+			/* report the original send error */
+			errno = EACCES;
+			break;
+		}
+	}
+
+	log_err_errno("sendto failed");
+	return 1;
+}
+
+/* Send the global msg to addr with sendmsg(), attaching an IP_PKTINFO
+ * (AF_INET) or IPV6_PKTINFO (AF_INET6) control message that carries
+ * ifindex. For any other version no control data is attached.
+ * On the first EACCES (while try_broadcast is set) SO_BROADCAST is
+ * enabled and the send is retried.
+ * Returns 0 on success, 1 on failure (logged).
+ */
+static int send_msg_cmsg(int sd, void *addr, socklen_t alen,
+			 int ifindex, int version)
+{
+	unsigned char cmsgbuf[64];
+	struct iovec iov[2];
+	struct cmsghdr *cm;
+	/* zero-initialised so msg_flags and, for an unexpected version,
+	 * msg_controllen never hold stack garbage
+	 */
+	struct msghdr m = {};
+	int err;
+
+	iov[0].iov_base = msg;
+	iov[0].iov_len = msglen;
+	m.msg_iov = iov;
+	m.msg_iovlen = 1;
+	m.msg_name = (caddr_t)addr;
+	m.msg_namelen = alen;
+
+	memset(cmsgbuf, 0, sizeof(cmsgbuf));
+	cm = (struct cmsghdr *)cmsgbuf;
+	m.msg_control = (caddr_t)cm;
+
+	if (version == AF_INET) {
+		struct in_pktinfo *pi;
+
+		cm->cmsg_level = SOL_IP;
+		cm->cmsg_type = IP_PKTINFO;
+		cm->cmsg_len = CMSG_LEN(sizeof(struct in_pktinfo));
+		pi = (struct in_pktinfo *)(CMSG_DATA(cm));
+		pi->ipi_ifindex = ifindex;
+
+		m.msg_controllen = cm->cmsg_len;
+
+	} else if (version == AF_INET6) {
+		struct in6_pktinfo *pi6;
+
+		cm->cmsg_level = SOL_IPV6;
+		cm->cmsg_type = IPV6_PKTINFO;
+		cm->cmsg_len = CMSG_LEN(sizeof(struct in6_pktinfo));
+
+		pi6 = (struct in6_pktinfo *)(CMSG_DATA(cm));
+		pi6->ipi6_ifindex = ifindex;
+
+		m.msg_controllen = cm->cmsg_len;
+	}
+
+again:
+	err = sendmsg(sd, &m, 0);
+	if (err < 0) {
+		if (errno == EACCES && try_broadcast) {
+			try_broadcast = 0;
+			if (!set_broadcast(sd))
+				goto again;
+			/* report the original send error */
+			errno = EACCES;
+		}
+
+		log_err_errno("sendmsg failed");
+		return 1;
+	}
+
+	return 0;
+}
+
+
+/* Send the global msg on sd: write() for stream sockets, the cmsg variant
+ * when both args->ifindex and args->use_cmsg are set, plain sendto()
+ * otherwise. Logs the first 24 bytes of what was sent.
+ * Returns 0 on success, 1 on failure.
+ */
+static int send_msg(int sd, void *addr, socklen_t alen, struct sock_args *args)
+{
+	int failed;
+
+	if (args->type == SOCK_STREAM) {
+		failed = write(sd, msg, msglen) < 0;
+		if (failed)
+			log_err_errno("write failed sending msg to peer");
+	} else if (args->ifindex && args->use_cmsg) {
+		failed = send_msg_cmsg(sd, addr, alen, args->ifindex,
+				       args->version);
+	} else {
+		failed = send_msg_no_cmsg(sd, addr, alen);
+	}
+
+	if (failed)
+		return 1;
+
+	log_msg("Sent message:\n");
+	log_msg(" %.24s%s\n", msg, msglen > 24 ? " ..." : "");
+
+	return 0;
+}
+
+/* Receive one datagram on sd, log sender and payload, check the receiving
+ * device against args->expected_ifindex and, as a non-interactive server,
+ * echo the payload back to the sender.
+ * Returns 1 when a message was handled, 0 when recvmsg() returned 0,
+ * -1 on a receive/send error or a device index mismatch.
+ */
+static int socket_read_dgram(int sd, struct sock_args *args)
+{
+	unsigned char addr[sizeof(struct sockaddr_in6)];
+	struct sockaddr *sa = (struct sockaddr *) addr;
+	socklen_t alen = sizeof(addr);
+	struct iovec iov[2];
+	struct msghdr m = {
+		.msg_name = (caddr_t)addr,
+		.msg_namelen = alen,
+		.msg_iov = iov,
+		.msg_iovlen = 1,
+	};
+	unsigned char cmsgbuf[256];
+	struct cmsghdr *cm = (struct cmsghdr *)cmsgbuf;
+	char buf[16*1024];
+	int ifindex;
+	int len;
+
+	iov[0].iov_base = (caddr_t)buf;
+	/* keep one byte spare for the terminating NUL written below */
+	iov[0].iov_len = sizeof(buf) - 1;
+
+	memset(cmsgbuf, 0, sizeof(cmsgbuf));
+	m.msg_control = (caddr_t)cm;
+	m.msg_controllen = sizeof(cmsgbuf);
+
+	len = recvmsg(sd, &m, 0);
+	if (len == 0) {
+		log_msg("peer closed connection.\n");
+		return 0;
+	} else if (len < 0) {
+		log_msg("failed to read message: %d: %s\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	buf[len] = '\0';
+
+	log_address("Message from:", sa);
+	log_msg(" %.24s%s\n", buf, len > 24 ? " ..." : "");
+
+	ifindex = get_index_from_cmsg(&m);
+	if (args->expected_ifindex) {
+		if (args->expected_ifindex != ifindex) {
+			log_error("Device index mismatch: expected %d have %d\n",
+				  args->expected_ifindex, ifindex);
+			return -1;
+		}
+		log_msg("Device index matches: expected %d have %d\n",
+			args->expected_ifindex, ifindex);
+	}
+
+	if (!interactive && server_mode) {
+		if (sa->sa_family == AF_INET6) {
+			struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) sa;
+			struct in6_addr *in6 = &s6->sin6_addr;
+
+			if (IN6_IS_ADDR_V4MAPPED(in6)) {
+				const uint32_t *pa = (uint32_t *) &in6->s6_addr;
+				struct in_addr in4;
+				struct sockaddr_in *sin;
+
+				sin = (struct sockaddr_in *) addr;
+				pa += 3;
+				in4.s_addr = *pa;
+				sin->sin_addr = in4;
+				sin->sin_family = AF_INET;
+				/* NOTE(review): send_msg_cmsg() returns 0 or
+				 * 1, so this "< 0" test never fires - confirm
+				 * the intended handling.
+				 */
+				if (send_msg_cmsg(sd, addr, alen,
+						  ifindex, AF_INET) < 0)
+					goto out_err;
+			}
+		}
+again:
+		/* echo exactly the bytes that were received */
+		iov[0].iov_len = len;
+
+		if (args->version == AF_INET6) {
+			struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) sa;
+
+			if (args->dev) {
+				/* avoid PKTINFO conflicts with bindtodev */
+				if (sendto(sd, buf, len, 0,
+					   (void *) addr, alen) < 0)
+					goto out_err;
+			} else {
+				/* kernel is allowing scope_id to be set to VRF
+				 * index for LLA. for sends to global address
+				 * reset scope id
+				 */
+				s6->sin6_scope_id = ifindex;
+				if (sendmsg(sd, &m, 0) < 0)
+					goto out_err;
+			}
+		} else {
+			int err;
+
+			err = sendmsg(sd, &m, 0);
+			if (err < 0) {
+				if (errno == EACCES && try_broadcast) {
+					try_broadcast = 0;
+					if (!set_broadcast(sd))
+						goto again;
+					errno = EACCES;
+				}
+				goto out_err;
+			}
+		}
+		log_msg("Sent message:\n");
+		log_msg(" %.24s%s\n", buf, len > 24 ? " ..." : "");
+	}
+
+	return 1;
+out_err:
+	log_err_errno("failed to send msg to peer");
+	return -1;
+}
+
+/* Read one chunk from a stream socket, log it and, as a non-interactive
+ * server, write it back to the peer.
+ * Returns 1 when data was handled, 0 when the peer closed, -1 on error.
+ */
+static int socket_read_stream(int sd)
+{
+	char buf[1024];
+	int len = read(sd, buf, sizeof(buf)-1);
+
+	if (len < 0) {
+		log_msg("failed to read message\n");
+		return -1;
+	}
+	if (len == 0) {
+		log_msg("client closed connection.\n");
+		return 0;
+	}
+
+	buf[len] = '\0';
+	log_msg("Incoming message:\n");
+	log_msg(" %.24s%s\n", buf, len > 24 ? " ..." : "");
+
+	if (interactive || !server_mode)
+		return 1;
+
+	if (write(sd, buf, len) < 0) {
+		log_err_errno("failed to send buf");
+		return -1;
+	}
+	log_msg("Sent message:\n");
+	log_msg(" %.24s%s\n", buf, len > 24 ? " ..." : "");
+
+	return 1;
+}
+
+/* Dispatch to the stream or datagram reader according to args->type. */
+static int socket_read(int sd, struct sock_args *args)
+{
+	return args->type == SOCK_STREAM ? socket_read_stream(sd)
+					 : socket_read_dgram(sd, args);
+}
+
+/* Read one line from stdin and send it on sd: write() for stream sockets,
+ * sendto() to addr otherwise (with a single SO_BROADCAST retry on EACCES).
+ * Returns 1 when a line was sent, 0 when fgets() returned NULL,
+ * -1 on a send error.
+ */
+static int stdin_to_socket(int sd, int type, void *addr, socklen_t alen)
+{
+	char buf[1024];
+	int len;
+
+	if (!fgets(buf, sizeof(buf), stdin))
+		return 0;
+
+	len = strlen(buf);
+	if (type == SOCK_STREAM) {
+		if (write(sd, buf, len) < 0) {
+			log_err_errno("failed to send buf");
+			return -1;
+		}
+	} else {
+		while (sendto(sd, buf, len, 0, addr, alen) < 0) {
+			if (errno == EACCES && try_broadcast) {
+				try_broadcast = 0;
+				if (!set_broadcast(sd))
+					continue;
+				/* report the original send error */
+				errno = EACCES;
+			}
+			log_err_errno("failed to send msg to peer");
+			return -1;
+		}
+	}
+	log_msg("Sent message:\n");
+	log_msg(" %.24s%s\n", buf, len > 24 ? " ..." : "");
+
+	return 1;
+}
+
+/* Enable packet-info and error-queue reception on sd for the given
+ * address family. Results of the individual calls are ignored.
+ */
+static void set_recv_attr(int sd, int version)
+{
+	if (version != AF_INET6) {
+		set_pktinfo_v4(sd);
+		set_recverr_v4(sd);
+		return;
+	}
+
+	set_recvpktinfo_v6(sd);
+	set_recverr_v6(sd);
+}
+
+/* Exchange messages on sd until done.
+ * client: non-zero when this side sends first (and again each round)
+ * addr/alen: destination used by the send helpers
+ * Returns 0 when the loop ends normally, 1 on a send/receive/select
+ * error, 2 when select() times out.
+ */
+static int msg_loop(int client, int sd, void *addr, socklen_t alen,
+ struct sock_args *args)
+{
+ struct timeval timeout = { .tv_sec = prog_timeout }, *ptval = NULL;
+ fd_set rfds;
+ int nfds;
+ int rc;
+
+ if (args->type != SOCK_STREAM)
+ set_recv_attr(sd, args->version);
+
+ if (msg) {
+ msglen = strlen(msg);
+
+ /* client sends first message */
+ if (client) {
+ if (send_msg(sd, addr, alen, args))
+ return 1;
+ }
+ /* a select() timeout is only armed in non-interactive mode;
+  * a prog_timeout of 0 falls back to 5 seconds here
+  */
+ if (!interactive) {
+ ptval = &timeout;
+ if (!prog_timeout)
+ timeout.tv_sec = 5;
+ }
+ }
+
+ nfds = interactive ? MAX(fileno(stdin), sd) + 1 : sd + 1;
+ while (1) {
+ FD_ZERO(&rfds);
+ FD_SET(sd, &rfds);
+ if (interactive)
+ FD_SET(fileno(stdin), &rfds);
+
+ rc = select(nfds, &rfds, NULL, NULL, ptval);
+ if (rc < 0) {
+ if (errno == EINTR)
+ continue;
+
+ rc = 1;
+ log_err_errno("select failed");
+ break;
+ } else if (rc == 0) {
+ log_error("Timed out waiting for response\n");
+ rc = 2;
+ break;
+ }
+
+ if (FD_ISSET(sd, &rfds)) {
+ rc = socket_read(sd, args);
+ if (rc < 0) {
+ rc = 1;
+ break;
+ }
+ /* 0 from socket_read(): peer closed, leave with rc 0 */
+ if (rc == 0)
+ break;
+ }
+
+ rc = 0;
+
+ if (FD_ISSET(fileno(stdin), &rfds)) {
+ if (stdin_to_socket(sd, args->type, addr, alen) <= 0)
+ break;
+ }
+
+ if (interactive)
+ continue;
+
+ /* iter == -1 means no limit on the number of rounds */
+ if (iter != -1) {
+ --iter;
+ if (iter == 0)
+ break;
+ }
+
+ /* only the first round is logged */
+ log_msg("Going into quiet mode\n");
+ quiet = 1;
+
+ if (client) {
+ if (send_msg(sd, addr, alen, args)) {
+ rc = 1;
+ break;
+ }
+ }
+ }
+
+ return rc;
+}
+
+/* Create and bind the IPv4 UDP socket used for multicast.
+ * server: non-zero binds to INADDR_ANY and joins args->grp; zero binds to
+ *         args->local_addr when one was given.
+ * Returns the socket descriptor, or -1 on failure.
+ */
+static int msock_init(struct sock_args *args, int server)
+{
+	struct sockaddr_in laddr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(args->port),
+	};
+	uint32_t if_addr;
+	int one = 1;
+	int sd;
+
+	if (!server && args->has_local_ip)
+		if_addr = args->local_addr.in.s_addr;
+	else
+		if_addr = htonl(INADDR_ANY);
+
+	sd = socket(PF_INET, SOCK_DGRAM, 0);
+	if (sd < 0) {
+		log_err_errno("socket");
+		return -1;
+	}
+
+	if (setsockopt(sd, SOL_SOCKET, SO_REUSEADDR,
+		       (char *)&one, sizeof(one)) < 0) {
+		log_err_errno("Setting SO_REUSEADDR error");
+		goto out_err;
+	}
+
+	/* failing to enable broadcast is logged but not fatal */
+	if (setsockopt(sd, SOL_SOCKET, SO_BROADCAST,
+		       (char *)&one, sizeof(one)) < 0)
+		log_err_errno("Setting SO_BROADCAST error");
+
+	if (args->dev && bind_to_device(sd, args->dev) != 0)
+		goto out_err;
+
+	if (args->use_setsockopt && set_multicast_if(sd, args->ifindex))
+		goto out_err;
+
+	laddr.sin_addr.s_addr = if_addr;
+
+	if (bind(sd, (struct sockaddr *) &laddr, sizeof(laddr)) < 0) {
+		log_err_errno("bind failed");
+		goto out_err;
+	}
+
+	if (server &&
+	    set_membership(sd, args->grp.s_addr,
+			   args->local_addr.in.s_addr, args->ifindex))
+		goto out_err;
+
+	return sd;
+
+out_err:
+	close(sd);
+	return -1;
+}
+
+/* Multicast socket for the server side: msock_init() with server = 1. */
+static int msock_server(struct sock_args *args)
+{
+	const int as_server = 1;
+
+	return msock_init(args, as_server);
+}
+
+/* Multicast socket for the client side: msock_init() with server = 0. */
+static int msock_client(struct sock_args *args)
+{
+	const int as_server = 0;
+
+	return msock_init(args, as_server);
+}
+
+/* Bind sd to args->local_addr and args->port for the configured family.
+ * Raw sockets without a local address are left unbound.
+ * Returns 0 on success (or when nothing had to be done), -1 on failure.
+ */
+static int bind_socket(int sd, struct sock_args *args)
+{
+	struct sockaddr_in serv_addr = { .sin_family = AF_INET };
+	struct sockaddr_in6 serv6_addr = { .sin6_family = AF_INET6 };
+	socklen_t alen;
+	void *addr;
+
+	if (args->type == SOCK_RAW && !args->has_local_ip)
+		return 0;
+
+	if (args->version == AF_INET) {
+		serv_addr.sin_port = htons(args->port);
+		serv_addr.sin_addr = args->local_addr.in;
+		addr = &serv_addr;
+		alen = sizeof(serv_addr);
+	} else if (args->version == AF_INET6) {
+		serv6_addr.sin6_port = htons(args->port);
+		serv6_addr.sin6_addr = args->local_addr.in6;
+		addr = &serv6_addr;
+		alen = sizeof(serv6_addr);
+	} else {
+		log_error("Invalid address family\n");
+		return -1;
+	}
+
+	if (bind(sd, addr, alen) >= 0)
+		return 0;
+
+	log_err_errno("error binding socket");
+	return -1;
+}
+
+/* Create the server socket: set reuse options, apply device binding,
+ * bind, and (unless args->bind_test_only) listen for stream sockets and
+ * switch to non-blocking, close-on-exec mode.
+ * Returns the socket descriptor, or -1 on failure.
+ */
+static int lsock_init(struct sock_args *args)
+{
+	long flags;
+	int sd;
+
+	sd = socket(args->version, args->type, args->protocol);
+	if (sd < 0) {
+		log_err_errno("Error opening socket");
+		return -1;
+	}
+
+	if (set_reuseaddr(sd) != 0 || set_reuseport(sd) != 0)
+		goto err;
+
+	if (args->dev && bind_to_device(sd, args->dev) != 0)
+		goto err;
+
+	if (args->use_setsockopt &&
+	    set_unicast_if(sd, args->ifindex, args->version))
+		goto err;
+
+	if (bind_socket(sd, args))
+		goto err;
+
+	/* a bind-only test stops once the socket is bound */
+	if (args->bind_test_only)
+		return sd;
+
+	if (args->type == SOCK_STREAM && listen(sd, 1) < 0) {
+		log_err_errno("listen failed");
+		goto err;
+	}
+
+	flags = fcntl(sd, F_GETFL);
+	if (flags < 0 || fcntl(sd, F_SETFL, flags|O_NONBLOCK) < 0) {
+		log_err_errno("Failed to set non-blocking option");
+		goto err;
+	}
+
+	/* failing to set close-on-exec is logged but not fatal */
+	if (fcntl(sd, F_SETFD, FD_CLOEXEC) < 0)
+		log_err_errno("Failed to set close-on-exec flag");
+
+	return sd;
+
+err:
+	close(sd);
+	return -1;
+}
+
+/* Run the server side: set up the listening (or multicast) socket, then
+ * either run the message loop directly (non-stream sockets) or accept
+ * connections and run the message loop on each one.
+ * Returns 0 on success, 1 on error, 2 when waiting for a client timed
+ * out; otherwise the result of msg_loop() or of the address/device checks.
+ */
+static int do_server(struct sock_args *args)
+{
+	struct timeval timeout = { .tv_sec = prog_timeout }, *ptval = NULL;
+	unsigned char addr[sizeof(struct sockaddr_in6)] = {};
+	socklen_t alen = sizeof(addr);
+	int lsd, csd = -1;
+
+	fd_set rfds;
+	int rc;
+
+	/* a timeout of 0 means wait for a client without limit */
+	if (prog_timeout)
+		ptval = &timeout;
+
+	if (args->has_grp)
+		lsd = msock_server(args);
+	else
+		lsd = lsock_init(args);
+
+	if (lsd < 0)
+		return 1;
+
+	if (args->bind_test_only) {
+		close(lsd);
+		return 0;
+	}
+
+	if (args->type != SOCK_STREAM) {
+		rc = msg_loop(0, lsd, (void *) addr, alen, args);
+		close(lsd);
+		return rc;
+	}
+
+	if (args->password && tcp_md5_remote(lsd, args)) {
+		close(lsd);
+		return 1;
+	}
+
+	while (1) {
+		log_msg("\n");
+		log_msg("waiting for client connection.\n");
+		FD_ZERO(&rfds);
+		FD_SET(lsd, &rfds);
+
+		rc = select(lsd+1, &rfds, NULL, NULL, ptval);
+		if (rc == 0) {
+			rc = 2;
+			break;
+		}
+
+		if (rc < 0) {
+			if (errno == EINTR)
+				continue;
+
+			log_err_errno("select failed");
+			/* same error code msg_loop() uses for this case */
+			rc = 1;
+			break;
+		}
+
+		if (FD_ISSET(lsd, &rfds)) {
+			/* alen is value-result; restore it for every accept */
+			alen = sizeof(addr);
+			csd = accept(lsd, (void *) addr, &alen);
+			if (csd < 0) {
+				log_err_errno("accept failed");
+				rc = 1;
+				break;
+			}
+
+			rc = show_sockstat(csd, args);
+			if (!rc)
+				rc = check_device(csd, args);
+			if (rc) {
+				/* do not leak the accepted socket */
+				close(csd);
+				break;
+			}
+		}
+
+		rc = msg_loop(0, csd, (void *) addr, alen, args);
+		close(csd);
+
+		if (!interactive)
+			break;
+	}
+
+	close(lsd);
+
+	return rc;
+}
+
+/* Wait until the non-blocking connect on sd finishes.
+ * Waits at most prog_timeout seconds (no limit when prog_timeout is 0).
+ * Returns 0 on success, -1 when the connect failed, -2 on timeout,
+ * -3 when select() failed, -4 when SO_ERROR could not be read.
+ */
+static int wait_for_connect(int sd)
+{
+	struct timeval limit = { .tv_sec = prog_timeout };
+	struct timeval *tv = prog_timeout ? &limit : NULL;
+	socklen_t sz = sizeof(int);
+	int val = 0;
+	fd_set wfd;
+	int rc;
+
+	FD_ZERO(&wfd);
+	FD_SET(sd, &wfd);
+
+	rc = select(FD_SETSIZE, NULL, &wfd, NULL, tv);
+	if (rc < 0) {
+		log_err_errno("select failed");
+		return -3;
+	}
+	if (rc == 0) {
+		log_error("connect timed out\n");
+		return -2;
+	}
+
+	if (getsockopt(sd, SOL_SOCKET, SO_ERROR, &val, &sz) < 0) {
+		log_err_errno("getsockopt(SO_ERROR) failed");
+		return -4;
+	}
+
+	if (val == 0)
+		return 0;
+
+	log_error("connect failed: %d: %s\n", val, strerror(val));
+	return -1;
+}
+
+/* Create the client socket, apply the options from args and, for stream
+ * sockets, connect to addr.
+ * Returns the socket descriptor on success. On failure the socket is
+ * closed and a negative value is returned: -1, or the code from
+ * wait_for_connect() when the non-blocking connect did not complete.
+ */
+static int connectsock(void *addr, socklen_t alen, struct sock_args *args)
+{
+ int sd, rc = -1;
+ long flags;
+
+ sd = socket(args->version, args->type, args->protocol);
+ if (sd < 0) {
+ log_err_errno("Failed to create socket");
+ return -1;
+ }
+
+ flags = fcntl(sd, F_GETFL);
+ if ((flags < 0) || (fcntl(sd, F_SETFL, flags|O_NONBLOCK) < 0)) {
+ log_err_errno("Failed to set non-blocking option");
+ goto err;
+ }
+
+ if (set_reuseport(sd) != 0)
+ goto err;
+
+ /* the else branch also runs when a device was given and bound */
+ if (args->dev && bind_to_device(sd, args->dev) != 0)
+ goto err;
+ else if (args->use_setsockopt &&
+ set_unicast_if(sd, args->ifindex, args->version))
+ goto err;
+
+ if (args->has_local_ip && bind_socket(sd, args))
+ goto err;
+
+ /* only stream sockets are connected here */
+ if (args->type != SOCK_STREAM)
+ goto out;
+
+ if (args->password && tcp_md5sig(sd, addr, alen, args))
+ goto err;
+
+ if (args->bind_test_only)
+ goto out;
+
+ if (connect(sd, addr, alen) < 0) {
+ /* EINPROGRESS is expected: the socket is non-blocking */
+ if (errno != EINPROGRESS) {
+ log_err_errno("Failed to connect to remote host");
+ rc = -1;
+ goto err;
+ }
+ rc = wait_for_connect(sd);
+ if (rc < 0)
+ goto err;
+ }
+out:
+ return sd;
+
+err:
+ close(sd);
+ return rc;
+}
+
+/* Run the client side: build the destination address from args, create
+ * the socket (multicast or connected) and run the message loop.
+ * Returns 0 on success; 1 when no destination was given; the negated
+ * socket-setup error code; or the result of show_sockstat()/msg_loop().
+ */
+static int do_client(struct sock_args *args)
+{
+	struct sockaddr_in sin = { .sin_family = AF_INET };
+	struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6 };
+	void *addr;
+	int alen;
+	int rc = 0;
+	int sd;
+
+	if (!(args->has_remote_ip || args->has_grp)) {
+		fprintf(stderr, "remote IP or multicast group not given\n");
+		return 1;
+	}
+
+	if (args->version == AF_INET) {
+		sin.sin_port = htons(args->port);
+		sin.sin_addr = args->has_grp ? args->grp
+					     : args->remote_addr.in;
+		addr = &sin;
+		alen = sizeof(sin);
+	} else if (args->version == AF_INET6) {
+		sin6.sin6_port = htons(args->port);
+		sin6.sin6_addr = args->remote_addr.in6;
+		sin6.sin6_scope_id = args->scope_id;
+		addr = &sin6;
+		alen = sizeof(sin6);
+	}
+
+	sd = args->has_grp ? msock_client(args)
+			   : connectsock(addr, alen, args);
+	if (sd < 0)
+		return -sd;
+
+	if (!args->bind_test_only) {
+		if (args->type == SOCK_STREAM)
+			rc = show_sockstat(sd, args);
+
+		if (rc == 0)
+			rc = msg_loop(1, sd, addr, alen, args);
+	}
+
+	close(sd);
+
+	return rc;
+}
+
+/* Selects which field of struct sock_args convert_addr() fills in. */
+enum addr_type {
+ ADDR_TYPE_LOCAL, /* local_addr */
+ ADDR_TYPE_REMOTE, /* remote_addr */
+ ADDR_TYPE_MCAST, /* grp */
+ ADDR_TYPE_EXPECTED_LOCAL, /* expected_laddr */
+ ADDR_TYPE_EXPECTED_REMOTE, /* expected_raddr */
+ ADDR_TYPE_MD5_PREFIX, /* md5_prefix and prefix_len */
+};
+
+/* Parse the textual address _str into the field of args selected by atype,
+ * using the address family in args->version.
+ * For ADDR_TYPE_MD5_PREFIX an optional "/len" suffix sets args->prefix_len
+ * (defaults to the full address length). For IPv6 an optional "%dev"
+ * suffix is resolved into args->scope_id.
+ * Returns 0 on success, -1 on an invalid address or scope, 1 on an
+ * invalid MD5 prefix request, -ENOMEM when the working copy cannot be
+ * allocated. Exits on an unknown atype.
+ */
+static int convert_addr(struct sock_args *args, const char *_str,
+			enum addr_type atype)
+{
+	int pfx_len_max = args->version == AF_INET6 ? 128 : 32;
+	int family = args->version;
+	char *str, *dev, *sep;
+	struct in6_addr *in6;
+	struct in_addr *in;
+	const char *desc;
+	void *addr;
+	int rc = 0;
+
+	/* work on a private copy: the separators are overwritten below */
+	str = strdup(_str);
+	if (!str)
+		return -ENOMEM;
+
+	switch (atype) {
+	case ADDR_TYPE_LOCAL:
+		desc = "local";
+		addr = &args->local_addr;
+		break;
+	case ADDR_TYPE_REMOTE:
+		desc = "remote";
+		addr = &args->remote_addr;
+		break;
+	case ADDR_TYPE_MCAST:
+		desc = "mcast grp";
+		addr = &args->grp;
+		break;
+	case ADDR_TYPE_EXPECTED_LOCAL:
+		desc = "expected local";
+		addr = &args->expected_laddr;
+		break;
+	case ADDR_TYPE_EXPECTED_REMOTE:
+		desc = "expected remote";
+		addr = &args->expected_raddr;
+		break;
+	case ADDR_TYPE_MD5_PREFIX:
+		desc = "md5 prefix";
+		if (family == AF_INET) {
+			args->md5_prefix.v4.sin_family = AF_INET;
+			addr = &args->md5_prefix.v4.sin_addr;
+		} else if (family == AF_INET6) {
+			args->md5_prefix.v6.sin6_family = AF_INET6;
+			addr = &args->md5_prefix.v6.sin6_addr;
+		} else {
+			/* leave through "out" so the copy is freed */
+			rc = 1;
+			goto out;
+		}
+
+		sep = strchr(str, '/');
+		if (sep) {
+			*sep = '\0';
+			sep++;
+			if (str_to_uint(sep, 1, pfx_len_max,
+					&args->prefix_len) != 0) {
+				fprintf(stderr, "Invalid prefix length\n");
+				rc = 1;
+				goto out;
+			}
+		} else {
+			args->prefix_len = pfx_len_max;
+		}
+		break;
+	default:
+		log_error("unknown address type");
+		exit(1);
+	}
+
+	switch (family) {
+	case AF_INET:
+		in = (struct in_addr *) addr;
+		if (inet_pton(AF_INET, str, in) == 0) {
+			log_error("Invalid %s IP address\n", desc);
+			rc = -1;
+			goto out;
+		}
+		break;
+
+	case AF_INET6:
+		/* split off an optional "%dev" scope suffix */
+		dev = strchr(str, '%');
+		if (dev) {
+			*dev = '\0';
+			dev++;
+		}
+
+		in6 = (struct in6_addr *) addr;
+		if (inet_pton(AF_INET6, str, in6) == 0) {
+			log_error("Invalid %s IPv6 address\n", desc);
+			rc = -1;
+			goto out;
+		}
+		if (dev) {
+			args->scope_id = get_ifidx(dev);
+			if (args->scope_id < 0) {
+				log_error("Invalid scope on %s IPv6 address\n",
+					  desc);
+				rc = -1;
+				goto out;
+			}
+		}
+		break;
+
+	default:
+		log_error("Invalid address family\n");
+	}
+
+out:
+	free(str);
+	return rc;
+}
+
+/* Allocate a NUL-terminated string of len characters made of the lowercase
+ * alphabet repeated as often as needed.
+ * Returns NULL when len <= 0 or the allocation fails; the caller owns the
+ * returned buffer.
+ */
+static char *random_msg(int len)
+{
+	static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz";
+	char *m;
+	int pos;
+
+	if (len <= 0)
+		return NULL;
+
+	m = malloc(len + 1);
+	if (!m)
+		return NULL;
+
+	for (pos = 0; pos < len; pos++)
+		m[pos] = alphabet[pos % 26];
+	m[len] = '\0';
+
+	return m;
+}
+
+/* getopt() option string used by main(); a ':' marks an option that
+ * takes an argument.
+ */
+#define GETOPT_STR "sr:l:p:t:g:P:DRn:M:m:d:SCi6L:0:1:2:Fbq"
+
+/* Print the command line help to stdout.
+ * prog: program name shown in the usage line.
+ */
+static void print_usage(char *prog)
+{
+	printf(
+	"usage: %s OPTS\n"
+	"Required:\n"
+	" -r addr remote address to connect to (client mode only)\n"
+	" -p port port to connect to (client mode)/listen on (server mode)\n"
+	" (default: %d)\n"
+	" -s server mode (default: client mode)\n"
+	/* prog_timeout is initialised to 5, so that is the real default */
+	" -t timeout seconds (default: 5)\n"
+	"\n"
+	"Optional:\n"
+	" -F Restart server loop\n"
+	" -6 IPv6 (default is IPv4)\n"
+	" -P proto protocol for socket: icmp, ospf (default: none)\n"
+	" -D|R datagram (D) / raw (R) socket (default stream)\n"
+	" -l addr local address to bind to\n"
+	"\n"
+	" -d dev bind socket to given device name\n"
+	" -S use setsockopt (IP_UNICAST_IF or IP_MULTICAST_IF)\n"
+	" to set device binding\n"
+	" -C use cmsg and IP_PKTINFO to specify device binding\n"
+	"\n"
+	" -L len send random message of given length\n"
+	" -n num number of times to send message\n"
+	"\n"
+	" -M password use MD5 sum protection\n"
+	" -m prefix/len prefix and length to use for MD5 key\n"
+	" -g grp multicast group (e.g., 239.1.1.1)\n"
+	" -i interactive mode (default is echo and terminate)\n"
+	"\n"
+	" -0 addr Expected local address\n"
+	" -1 addr Expected remote address\n"
+	" -2 dev Expected device name (or index) to receive packet\n"
+	"\n"
+	" -b Bind test only.\n"
+	" -q Be quiet. Run test without printing anything.\n"
+	, prog, DEFAULT_PORT);
+}
+
+int main(int argc, char *argv[])
+{
+ struct sock_args args = {
+ .version = AF_INET,
+ .type = SOCK_STREAM,
+ .port = DEFAULT_PORT,
+ };
+ struct protoent *pe;
+ unsigned int tmp;
+ int forever = 0;
+
+ /* process inputs */
+ extern char *optarg;
+ int rc = 0;
+
+ /*
+ * process input args
+ */
+
+ while ((rc = getopt(argc, argv, GETOPT_STR)) != -1) {
+ switch (rc) {
+ case 's':
+ server_mode = 1;
+ break;
+ case 'F':
+ forever = 1;
+ break;
+ case 'l':
+ args.has_local_ip = 1;
+ if (convert_addr(&args, optarg, ADDR_TYPE_LOCAL) < 0)
+ return 1;
+ break;
+ case 'r':
+ args.has_remote_ip = 1;
+ if (convert_addr(&args, optarg, ADDR_TYPE_REMOTE) < 0)
+ return 1;
+ break;
+ case 'p':
+ if (str_to_uint(optarg, 1, 65535, &tmp) != 0) {
+ fprintf(stderr, "Invalid port\n");
+ return 1;
+ }
+ args.port = (unsigned short) tmp;
+ break;
+ case 't':
+ if (str_to_uint(optarg, 0, INT_MAX,
+ &prog_timeout) != 0) {
+ fprintf(stderr, "Invalid timeout\n");
+ return 1;
+ }
+ break;
+ case 'D':
+ args.type = SOCK_DGRAM;
+ break;
+ case 'R':
+ args.type = SOCK_RAW;
+ args.port = 0;
+ if (!args.protocol)
+ args.protocol = IPPROTO_RAW;
+ break;
+ case 'P':
+ pe = getprotobyname(optarg);
+ if (pe) {
+ args.protocol = pe->p_proto;
+ } else {
+ if (str_to_uint(optarg, 0, 0xffff, &tmp) != 0) {
+ fprintf(stderr, "Invalid protocol\n");
+ return 1;
+ }
+ args.protocol = tmp;
+ }
+ break;
+ case 'n':
+ iter = atoi(optarg);
+ break;
+ case 'L':
+ msg = random_msg(atoi(optarg));
+ break;
+ case 'M':
+ args.password = optarg;
+ break;
+ case 'm':
+ if (convert_addr(&args, optarg, ADDR_TYPE_MD5_PREFIX) < 0)
+ return 1;
+ break;
+ case 'S':
+ args.use_setsockopt = 1;
+ break;
+ case 'C':
+ args.use_cmsg = 1;
+ break;
+ case 'd':
+ args.dev = optarg;
+ args.ifindex = get_ifidx(optarg);
+ if (args.ifindex < 0) {
+ fprintf(stderr, "Invalid device name\n");
+ return 1;
+ }
+ break;
+ case 'i':
+ interactive = 1;
+ break;
+ case 'g':
+ args.has_grp = 1;
+ if (convert_addr(&args, optarg, ADDR_TYPE_MCAST) < 0)
+ return 1;
+ args.type = SOCK_DGRAM;
+ break;
+ case '6':
+ args.version = AF_INET6;
+ break;
+ case 'b':
+ args.bind_test_only = 1;
+ break;
+ case '0':
+ args.has_expected_laddr = 1;
+ if (convert_addr(&args, optarg,
+ ADDR_TYPE_EXPECTED_LOCAL))
+ return 1;
+ break;
+ case '1':
+ args.has_expected_raddr = 1;
+ if (convert_addr(&args, optarg,
+ ADDR_TYPE_EXPECTED_REMOTE))
+ return 1;
+
+ break;
+ case '2':
+ if (str_to_uint(optarg, 0, INT_MAX, &tmp) == 0) {
+ args.expected_ifindex = (int)tmp;
+ } else {
+ args.expected_ifindex = get_ifidx(optarg);
+ if (args.expected_ifindex < 0) {
+ fprintf(stderr,
+ "Invalid expected device\n");
+ return 1;
+ }
+ }
+ break;
+ case 'q':
+ quiet = 1;
+ break;
+ default:
+ print_usage(argv[0]);
+ return 1;
+ }
+ }
+
+ if (args.password &&
+ ((!args.has_remote_ip && !args.prefix_len) || args.type != SOCK_STREAM)) {
+ log_error("MD5 passwords apply to TCP only and require a remote ip for the password\n");
+ return 1;
+ }
+
+ if (args.prefix_len && !args.password) {
+ log_error("Prefix range for MD5 protection specified without a password\n");
+ return 1;
+ }
+
+ if ((args.use_setsockopt || args.use_cmsg) && !args.ifindex) {
+ fprintf(stderr, "Device binding not specified\n");
+ return 1;
+ }
+ if (args.use_setsockopt || args.use_cmsg)
+ args.dev = NULL;
+
+ if (iter == 0) {
+ fprintf(stderr, "Invalid number of messages to send\n");
+ return 1;
+ }
+
+ if (args.type == SOCK_STREAM && !args.protocol)
+ args.protocol = IPPROTO_TCP;
+ if (args.type == SOCK_DGRAM && !args.protocol)
+ args.protocol = IPPROTO_UDP;
+
+ if ((args.type == SOCK_STREAM || args.type == SOCK_DGRAM) &&
+ args.port == 0) {
+ fprintf(stderr, "Invalid port number\n");
+ return 1;
+ }
+
+ if (!server_mode && !args.has_grp &&
+ !args.has_remote_ip && !args.has_local_ip) {
+ fprintf(stderr,
+ "Local (server mode) or remote IP (client IP) required\n");
+ return 1;
+ }
+
+ if (interactive) {
+ prog_timeout = 0;
+ msg = NULL;
+ }
+
+ if (server_mode) {
+ do {
+ rc = do_server(&args);
+ } while (forever);
+
+ return rc;
+ }
+ return do_client(&args);
+}
diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
new file mode 100755
index 000000000..3253fdc78
--- /dev/null
+++ b/tools/testing/selftests/net/pmtu.sh
@@ -0,0 +1,1924 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check that route PMTU values match expectations, and that initial device MTU
+# values are assigned correctly
+#
+# Tests currently implemented:
+#
+# - pmtu_ipv4
+# Set up two namespaces, A and B, with two paths between them over routers
+# R1 and R2 (also implemented with namespaces), with different MTUs:
+#
+# segment a_r1 segment b_r1 a_r1: 2000
+# .--------------R1--------------. b_r1: 1400
+# A B a_r2: 2000
+# '--------------R2--------------' b_r2: 1500
+# segment a_r2 segment b_r2
+#
+# Check that PMTU exceptions with the correct PMTU are created. Then
+# decrease and increase the MTU of the local link for one of the paths,
+# A to R1, checking that route exception PMTU changes accordingly over
+# this path. Also check that locked exceptions are created when an ICMP
+# message advertising a PMTU smaller than net.ipv4.route.min_pmtu is
+# received
+#
+# - pmtu_ipv6
+# Same as pmtu_ipv4, except for locked PMTU tests, using IPv6
+#
+# - pmtu_ipv4_vxlan4_exception
+# Set up the same network topology as pmtu_ipv4, create a VXLAN tunnel
+# over IPv4 between A and B, routed via R1. On the link between R1 and B,
+# set a MTU lower than the VXLAN MTU and the MTU on the link between A and
+# R1. Send IPv4 packets, exceeding the MTU between R1 and B, over VXLAN
+# from A to B and check that the PMTU exception is created with the right
+# value on A
+#
+# - pmtu_ipv6_vxlan4_exception
+# Same as pmtu_ipv4_vxlan4_exception, but send IPv6 packets from A to B
+#
+# - pmtu_ipv4_vxlan6_exception
+# Same as pmtu_ipv4_vxlan4_exception, but use IPv6 transport from A to B
+#
+# - pmtu_ipv6_vxlan6_exception
+# Same as pmtu_ipv4_vxlan6_exception, but send IPv6 packets from A to B
+#
+# - pmtu_ipv4_geneve4_exception
+# Same as pmtu_ipv4_vxlan4_exception, but using a GENEVE tunnel instead of
+# VXLAN
+#
+# - pmtu_ipv6_geneve4_exception
+# Same as pmtu_ipv6_vxlan4_exception, but using a GENEVE tunnel instead of
+# VXLAN
+#
+# - pmtu_ipv4_geneve6_exception
+# Same as pmtu_ipv4_vxlan6_exception, but using a GENEVE tunnel instead of
+# VXLAN
+#
+# - pmtu_ipv6_geneve6_exception
+# Same as pmtu_ipv6_vxlan6_exception, but using a GENEVE tunnel instead of
+# VXLAN
+#
+# - pmtu_ipv{4,6}_br_vxlan{4,6}_exception
+# Set up three namespaces, A, B, and C, with routing between A and B over
+# R1. R2 is unused in these tests. A has a veth connection to C, and is
+# connected to B via a VXLAN endpoint, which is directly bridged to C.
+# MTU on the B-R1 link is lower than other MTUs.
+#
+# Check that both C and A are able to communicate with B over the VXLAN
+# tunnel, and that PMTU exceptions with the correct values are created.
+#
+# segment a_r1 segment b_r1 b_r1: 4000
+# .--------------R1--------------. everything
+# C---veth A B else: 5000
+# ' bridge |
+# '---- - - - - - VXLAN - - - - - - - '
+#
+# - pmtu_ipv{4,6}_br_geneve{4,6}_exception
+# Same as pmtu_ipv{4,6}_br_vxlan{4,6}_exception, with a GENEVE tunnel
+# instead.
+#
+# - pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception
+# Set up two namespaces, B, and C, with routing between the init namespace
+# and B over R1. A and R2 are unused in these tests. The init namespace
+# has a veth connection to C, and is connected to B via a VXLAN endpoint,
+# which is handled by Open vSwitch and bridged to C. MTU on the B-R1 link
+# is lower than other MTUs.
+#
+# Check that C is able to communicate with B over the VXLAN tunnel, and
+# that PMTU exceptions with the correct values are created.
+#
+# segment a_r1 segment b_r1 b_r1: 4000
+# .--------------R1--------------. everything
+# C---veth init B else: 5000
+# '- ovs |
+# '---- - - - - - VXLAN - - - - - - - '
+#
+# - pmtu_ipv{4,6}_ovs_geneve{4,6}_exception
+# Same as pmtu_ipv{4,6}_ovs_vxlan{4,6}_exception, with a GENEVE tunnel
+# instead.
+#
+# - pmtu_ipv{4,6}_fou{4,6}_exception
+# Same as pmtu_ipv4_vxlan4_exception, but using a direct IPv4/IPv6
+# encapsulation (FoU) over IPv4/IPv6, instead of VXLAN
+#
+# - pmtu_ipv{4,6}_gue{4,6}_exception
+# Same as pmtu_ipv4_vxlan4_exception, but using a generic UDP IPv4/IPv6
+# encapsulation (GUE) over IPv4/IPv6, instead of VXLAN
+#
+# - pmtu_ipv{4,6}_ipv{4,6}_exception
+# Same as pmtu_ipv4_vxlan4_exception, but using an IPv4/IPv6 tunnel over
+# IPv4/IPv6, instead of VXLAN
+#
+# - pmtu_vti4_exception
+# Set up vti tunnel on top of veth, with xfrm states and policies, in two
+# namespaces with matching endpoints. Check that route exception is not
+# created if link layer MTU is not exceeded, then exceed it and check that
+# exception is created with the expected PMTU. The approach described
+# below for IPv6 doesn't apply here, because, on IPv4, administrative MTU
+# changes alone won't affect PMTU
+#
+# - pmtu_vti6_exception
+# Set up vti6 tunnel on top of veth, with xfrm states and policies, in two
+# namespaces with matching endpoints. Check that route exception is
+# created by exceeding link layer MTU with ping to other endpoint. Then
+# decrease and increase MTU of tunnel, checking that route exception PMTU
+# changes accordingly
+#
+# - pmtu_vti4_default_mtu
+# Set up vti4 tunnel on top of veth, in two namespaces with matching
+# endpoints. Check that MTU assigned to vti interface is the MTU of the
+# lower layer (veth) minus additional lower layer headers (zero, for veth)
+# minus IPv4 header length
+#
+# - pmtu_vti6_default_mtu
+# Same as above, for IPv6
+#
+# - pmtu_vti4_link_add_mtu
+# Set up vti4 interface passing MTU value at link creation, check MTU is
+# configured, and that link is not created with invalid MTU values
+#
+# - pmtu_vti6_link_add_mtu
+# Same as above, for IPv6
+#
+# - pmtu_vti6_link_change_mtu
+# Set up two dummy interfaces with different MTUs, create a vti6 tunnel
+# and check that configured MTU is used on link creation and changes, and
+# that MTU is properly calculated instead when MTU is not configured from
+# userspace
+#
+# - cleanup_ipv4_exception
+# Similar to pmtu_ipv4_vxlan4_exception, but explicitly generate PMTU
+# exceptions on multiple CPUs and check that the veth device tear-down
+# happens in a timely manner
+#
+# - cleanup_ipv6_exception
+# Same as above, but use IPv6 transport from A to B
+#
+# - list_flush_ipv4_exception
+# Using the same topology as in pmtu_ipv4, create exceptions, and check
+# they are shown when listing exception caches, gone after flushing them
+#
+# - list_flush_ipv6_exception
+# Using the same topology as in pmtu_ipv6, create exceptions, and check
+# they are shown when listing exception caches, gone after flushing them
+#
+# - pmtu_ipv4_route_change
+# Use the same topology as in pmtu_ipv4, but issue a route replacement
+# command and delete the corresponding device afterward. This tests for
+# proper cleanup of the PMTU exceptions by the route replacement path.
+# Device unregistration should complete successfully
+#
+# - pmtu_ipv6_route_change
+# Same as above but with IPv6
+
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

PAUSE_ON_FAIL=no
VERBOSE=0
TRACING=0

# Some systems don't have a ping6 binary anymore: fall back to plain ping.
# "command -v" is the POSIX way to locate a command ("which" is not guaranteed
# to be installed), and each candidate is looked up only once.
ping6="$(command -v ping6 2> /dev/null)" || ping6="$(command -v ping 2> /dev/null)"
+
+# Name Description re-run with nh
+tests="
+ pmtu_ipv4_exception ipv4: PMTU exceptions 1
+ pmtu_ipv6_exception ipv6: PMTU exceptions 1
+ pmtu_ipv4_vxlan4_exception IPv4 over vxlan4: PMTU exceptions 1
+ pmtu_ipv6_vxlan4_exception IPv6 over vxlan4: PMTU exceptions 1
+ pmtu_ipv4_vxlan6_exception IPv4 over vxlan6: PMTU exceptions 1
+ pmtu_ipv6_vxlan6_exception IPv6 over vxlan6: PMTU exceptions 1
+ pmtu_ipv4_geneve4_exception IPv4 over geneve4: PMTU exceptions 1
+ pmtu_ipv6_geneve4_exception IPv6 over geneve4: PMTU exceptions 1
+ pmtu_ipv4_geneve6_exception IPv4 over geneve6: PMTU exceptions 1
+ pmtu_ipv6_geneve6_exception IPv6 over geneve6: PMTU exceptions 1
+ pmtu_ipv4_br_vxlan4_exception IPv4, bridged vxlan4: PMTU exceptions 1
+ pmtu_ipv6_br_vxlan4_exception IPv6, bridged vxlan4: PMTU exceptions 1
+ pmtu_ipv4_br_vxlan6_exception IPv4, bridged vxlan6: PMTU exceptions 1
+ pmtu_ipv6_br_vxlan6_exception IPv6, bridged vxlan6: PMTU exceptions 1
+ pmtu_ipv4_br_geneve4_exception IPv4, bridged geneve4: PMTU exceptions 1
+ pmtu_ipv6_br_geneve4_exception IPv6, bridged geneve4: PMTU exceptions 1
+ pmtu_ipv4_br_geneve6_exception IPv4, bridged geneve6: PMTU exceptions 1
+ pmtu_ipv6_br_geneve6_exception IPv6, bridged geneve6: PMTU exceptions 1
+ pmtu_ipv4_ovs_vxlan4_exception IPv4, OVS vxlan4: PMTU exceptions 1
+ pmtu_ipv6_ovs_vxlan4_exception IPv6, OVS vxlan4: PMTU exceptions 1
+ pmtu_ipv4_ovs_vxlan6_exception IPv4, OVS vxlan6: PMTU exceptions 1
+ pmtu_ipv6_ovs_vxlan6_exception IPv6, OVS vxlan6: PMTU exceptions 1
+ pmtu_ipv4_ovs_geneve4_exception IPv4, OVS geneve4: PMTU exceptions 1
+ pmtu_ipv6_ovs_geneve4_exception IPv6, OVS geneve4: PMTU exceptions 1
+ pmtu_ipv4_ovs_geneve6_exception IPv4, OVS geneve6: PMTU exceptions 1
+ pmtu_ipv6_ovs_geneve6_exception IPv6, OVS geneve6: PMTU exceptions 1
+ pmtu_ipv4_fou4_exception IPv4 over fou4: PMTU exceptions 1
+ pmtu_ipv6_fou4_exception IPv6 over fou4: PMTU exceptions 1
+ pmtu_ipv4_fou6_exception IPv4 over fou6: PMTU exceptions 1
+ pmtu_ipv6_fou6_exception IPv6 over fou6: PMTU exceptions 1
+ pmtu_ipv4_gue4_exception IPv4 over gue4: PMTU exceptions 1
+ pmtu_ipv6_gue4_exception IPv6 over gue4: PMTU exceptions 1
+ pmtu_ipv4_gue6_exception IPv4 over gue6: PMTU exceptions 1
+ pmtu_ipv6_gue6_exception IPv6 over gue6: PMTU exceptions 1
+ pmtu_ipv4_ipv4_exception IPv4 over IPv4: PMTU exceptions 1
+ pmtu_ipv6_ipv4_exception IPv6 over IPv4: PMTU exceptions 1
+ pmtu_ipv4_ipv6_exception IPv4 over IPv6: PMTU exceptions 1
+ pmtu_ipv6_ipv6_exception IPv6 over IPv6: PMTU exceptions 1
+ pmtu_vti6_exception vti6: PMTU exceptions 0
+ pmtu_vti4_exception vti4: PMTU exceptions 0
+ pmtu_vti4_default_mtu vti4: default MTU assignment 0
+ pmtu_vti6_default_mtu vti6: default MTU assignment 0
+ pmtu_vti4_link_add_mtu vti4: MTU setting on link creation 0
+ pmtu_vti6_link_add_mtu vti6: MTU setting on link creation 0
+ pmtu_vti6_link_change_mtu vti6: MTU changes on link changes 0
+ cleanup_ipv4_exception ipv4: cleanup of cached exceptions 1
+ cleanup_ipv6_exception ipv6: cleanup of cached exceptions 1
+ list_flush_ipv4_exception ipv4: list and flush cached exceptions 1
+ list_flush_ipv6_exception ipv6: list and flush cached exceptions 1
+ pmtu_ipv4_route_change ipv4: PMTU exception w/route replace 1
+ pmtu_ipv6_route_change ipv6: PMTU exception w/route replace 1"
+
+NS_A="ns-A"
+NS_B="ns-B"
+NS_C="ns-C"
+NS_R1="ns-R1"
+NS_R2="ns-R2"
+ns_a="ip netns exec ${NS_A}"
+ns_b="ip netns exec ${NS_B}"
+ns_c="ip netns exec ${NS_C}"
+ns_r1="ip netns exec ${NS_R1}"
+ns_r2="ip netns exec ${NS_R2}"
+
+# Addressing and routing for tests with routers: four network segments, with
+# index SEGMENT between 1 and 4, a common prefix (PREFIX4 or PREFIX6) and an
+# identifier ID, which is 1 for hosts (A and B), 2 for routers (R1 and R2).
+# Addresses are:
+# - IPv4: PREFIX4.SEGMENT.ID (/24)
+# - IPv6: PREFIX6:SEGMENT::ID (/64)
+prefix4="10.0"
+prefix6="fc00"
+a_r1=1
+a_r2=2
+b_r1=3
+b_r2=4
+# ns peer segment
+routing_addrs="
+ A R1 ${a_r1}
+ A R2 ${a_r2}
+ B R1 ${b_r1}
+ B R2 ${b_r2}
+"
+# Traffic from A to B goes through R1 by default, and through R2, if destined to
+# B's address on the b_r2 segment.
+# Traffic from B to A goes through R1.
+# ns destination gateway
+routes="
+ A default ${prefix4}.${a_r1}.2
+ A ${prefix4}.${b_r2}.1 ${prefix4}.${a_r2}.2
+ B default ${prefix4}.${b_r1}.2
+
+ A default ${prefix6}:${a_r1}::2
+ A ${prefix6}:${b_r2}::1 ${prefix6}:${a_r2}::2
+ B default ${prefix6}:${b_r1}::2
+"
+
+USE_NH="no"
+# ns family nh id destination gateway
+nexthops="
+ A 4 41 ${prefix4}.${a_r1}.2 veth_A-R1
+ A 4 42 ${prefix4}.${a_r2}.2 veth_A-R2
+ B 4 41 ${prefix4}.${b_r1}.2 veth_B-R1
+
+ A 6 61 ${prefix6}:${a_r1}::2 veth_A-R1
+ A 6 62 ${prefix6}:${a_r2}::2 veth_A-R2
+ B 6 61 ${prefix6}:${b_r1}::2 veth_B-R1
+"
+
+# nexthop id correlates to id in nexthops config above
+# ns family prefix nh id
+routes_nh="
+ A 4 default 41
+ A 4 ${prefix4}.${b_r2}.1 42
+ B 4 default 41
+
+ A 6 default 61
+ A 6 ${prefix6}:${b_r2}::1 62
+ B 6 default 61
+"
+
+veth4_a_addr="192.168.1.1"
+veth4_b_addr="192.168.1.2"
+veth4_c_addr="192.168.2.10"
+veth4_mask="24"
+veth6_a_addr="fd00:1::a"
+veth6_b_addr="fd00:1::b"
+veth6_c_addr="fd00:2::c"
+veth6_mask="64"
+
+tunnel4_a_addr="192.168.2.1"
+tunnel4_b_addr="192.168.2.2"
+tunnel4_mask="24"
+tunnel6_a_addr="fd00:2::a"
+tunnel6_b_addr="fd00:2::b"
+tunnel6_mask="64"
+
+dummy6_0_prefix="fc00:1000::"
+dummy6_1_prefix="fc00:1001::"
+dummy6_mask="64"
+
+err_buf=
+tcpdump_pids=
+
# Queue one error message: append $1 and a newline to the err_buf buffer.
# Nothing is printed here; err_flush emits the buffer.
err() {
	# $2 holds the newline that terminates the queued message
	set -- "${1}" '
'
	err_buf="${err_buf}${1}${2}"
}
+
# Print every message queued by err, exactly as stored, then empty the buffer.
err_flush() {
	# printf '%s' writes the buffer verbatim. "echo -n" is unspecified for
	# /bin/sh and, depending on the shell, would swallow a buffer that
	# looks like an option or interpret backslashes inside the messages.
	printf '%s' "${err_buf}"
	err_buf=
}
+
# Run a command given as separate words, capturing stdout and stderr together.
#
# Globals written: cmd (the joined command line), out (captured output),
#                  rc (exit status).
# With VERBOSE=1 the command line is printed first, and its output afterwards
# if there was any.
# Returns the exit status of the command.
run_cmd() {
	cmd="$*"

	if [ "$VERBOSE" = "1" ]; then
		# The command is data, not a format string: a '%' in it (for
		# instance a scoped IPv6 address) must be printed literally.
		printf ' COMMAND: %s\n' "$cmd"
	fi

	# Deliberately unquoted: the joined line is split back into words
	out="$($cmd 2>&1)"
	rc=$?
	if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
		printf ' %s\n\n' "$out"
	fi

	return $rc
}
+
# Print the namespace name stored in the variable NS_<id>, where <id> is $1
# (for example "A" or "R1").
nsname() {
	eval "echo \${NS_${1}}"
}
+
# Create a FoU or GUE encapsulated tunnel between namespaces A and B and
# bring it up.
#
# Arguments:
#   $1 - outer (transport) IP version: 4 or 6
#   $2 - inner (payload) IP version: 4 or 6
#   $3 - encapsulation, used as the "encap" type and as the device name
#        prefix: fou or gue
#
# Returns 2 if the fou/fou6 module cannot be loaded or if the receive port or
# tunnel device on A cannot be created; otherwise the status of the last
# command run.
setup_fou_or_gue() {
	outer="${1}"
	inner="${2}"
	encap="${3}"

	# "mode" is only meaningful for ip6tnl links. It is a global that
	# setup_ipvX_over_ipvY assigns as well, so clear it instead of passing
	# a value left behind by an earlier setup to "ip link add".
	mode=""

	if [ "${outer}" = "4" ]; then
		modprobe fou || return 2
		a_addr="${prefix4}.${a_r1}.1"
		b_addr="${prefix4}.${b_r1}.1"
		if [ "${inner}" = "4" ]; then
			type="ipip"
			ipproto="4"
		else
			type="sit"
			ipproto="41"
		fi
	else
		modprobe fou6 || return 2
		a_addr="${prefix6}:${a_r1}::1"
		b_addr="${prefix6}:${b_r1}::1"
		type="ip6tnl"
		if [ "${inner}" = "4" ]; then
			mode="mode ipip6"
			ipproto="4 -6"
		else
			mode="mode ip6ip6"
			ipproto="41 -6"
		fi
	fi

	# ${ipproto} and ${mode} are left unquoted on purpose: each may expand
	# to several words, or to nothing at all.
	run_cmd ${ns_a} ip fou add port 5555 ipproto ${ipproto} || return 2
	run_cmd ${ns_a} ip link add ${encap}_a type ${type} ${mode} local ${a_addr} remote ${b_addr} encap ${encap} encap-sport auto encap-dport 5556 || return 2

	run_cmd ${ns_b} ip fou add port 5556 ipproto ${ipproto}
	run_cmd ${ns_b} ip link add ${encap}_b type ${type} ${mode} local ${b_addr} remote ${a_addr} encap ${encap} encap-sport auto encap-dport 5555

	if [ "${inner}" = "4" ]; then
		run_cmd ${ns_a} ip addr add ${tunnel4_a_addr}/${tunnel4_mask} dev ${encap}_a
		run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${encap}_b
	else
		run_cmd ${ns_a} ip addr add ${tunnel6_a_addr}/${tunnel6_mask} dev ${encap}_a
		run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${encap}_b
	fi

	run_cmd ${ns_a} ip link set ${encap}_a up
	run_cmd ${ns_b} ip link set ${encap}_b up
}
+
# Thin wrappers around setup_fou_or_gue. The two digits in each name are the
# outer (transport) and inner (payload) IP versions, in that order.
setup_fou44() { setup_fou_or_gue 4 4 fou; }
setup_fou46() { setup_fou_or_gue 4 6 fou; }
setup_fou64() { setup_fou_or_gue 6 4 fou; }
setup_fou66() { setup_fou_or_gue 6 6 fou; }

setup_gue44() { setup_fou_or_gue 4 4 gue; }
setup_gue46() { setup_fou_or_gue 4 6 gue; }
setup_gue64() { setup_fou_or_gue 6 4 gue; }
setup_gue66() { setup_fou_or_gue 6 6 gue; }
+
# Create a plain IP-in-IP tunnel (ip_a in namespace A, ip_b in namespace B),
# bring it up and assign the tunnel addresses.
#
# Arguments:
#   $1 - inner (payload) IP version: 4 or 6
#   $2 - outer (transport) IP version: 4 or 6
#
# Returns 2 if the endpoint on A cannot be created.
setup_ipvX_over_ipvY() {
	inner=${1}
	outer=${2}

	# Pick endpoint addresses, link type and tunnel mode. Each branch
	# starts from the IPv6-payload values and overrides them for IPv4.
	if [ "${outer}" -eq 4 ]; then
		a_addr="${prefix4}.${a_r1}.1"
		b_addr="${prefix4}.${b_r1}.1"
		type="sit"
		mode="ip6ip"
		if [ "${inner}" -eq 4 ]; then
			type="ipip"
			mode="ipip"
		fi
	else
		a_addr="${prefix6}:${a_r1}::1"
		b_addr="${prefix6}:${b_r1}::1"
		type="ip6tnl"
		mode="ip6ip6"
		if [ "${inner}" -eq 4 ]; then
			mode="ipip6"
		fi
	fi

	run_cmd ${ns_a} ip link add ip_a type ${type} local ${a_addr} remote ${b_addr} mode ${mode} || return 2
	run_cmd ${ns_b} ip link add ip_b type ${type} local ${b_addr} remote ${a_addr} mode ${mode}

	run_cmd ${ns_a} ip link set ip_a up
	run_cmd ${ns_b} ip link set ip_b up

	# Tunnel addresses follow the payload family
	case "${inner}" in
	4)
		run_cmd ${ns_a} ip addr add "${tunnel4_a_addr}/${tunnel4_mask}" dev ip_a
		run_cmd ${ns_b} ip addr add "${tunnel4_b_addr}/${tunnel4_mask}" dev ip_b
		;;
	*)
		run_cmd ${ns_a} ip addr add "${tunnel6_a_addr}/${tunnel6_mask}" dev ip_a
		run_cmd ${ns_b} ip addr add "${tunnel6_b_addr}/${tunnel6_mask}" dev ip_b
		;;
	esac
}
+
# Wrappers for setup_ipvX_over_ipvY, named setup_ip<inner>ip<outer>: the
# first digit is the payload IP version, the second the transport one.
setup_ip4ip4() { setup_ipvX_over_ipvY 4 4; }
setup_ip6ip4() { setup_ipvX_over_ipvY 6 4; }
setup_ip4ip6() { setup_ipvX_over_ipvY 4 6; }
setup_ip6ip6() { setup_ipvX_over_ipvY 6 6; }
+
# Create the five test namespaces (A, B, C, R1, R2).
# Returns 1 as soon as one of them cannot be created.
setup_namespaces() {
	for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
		if ! ip netns add ${n}; then
			return 1
		fi

		# With DAD disabled, configured IPv6 addresses can be used
		# right away instead of after a delay
		ip netns exec ${n} sysctl -q net/ipv6/conf/default/accept_dad=0
	done
}
+
# Connect namespaces A and B directly with a veth pair (veth_a / veth_b),
# assign the IPv4 and IPv6 veth addresses and bring both ends up.
# Returns 1 if the pair cannot be created.
setup_veth() {
	# The pair is created in A, then the peer is moved to B
	if ! run_cmd ${ns_a} ip link add veth_a type veth peer name veth_b; then
		return 1
	fi
	run_cmd ${ns_a} ip link set veth_b netns ${NS_B}

	# IPv4
	run_cmd ${ns_a} ip addr add "${veth4_a_addr}/${veth4_mask}" dev veth_a
	run_cmd ${ns_b} ip addr add "${veth4_b_addr}/${veth4_mask}" dev veth_b

	# IPv6
	run_cmd ${ns_a} ip addr add "${veth6_a_addr}/${veth6_mask}" dev veth_a
	run_cmd ${ns_b} ip addr add "${veth6_b_addr}/${veth6_mask}" dev veth_b

	run_cmd ${ns_a} ip link set veth_a up
	run_cmd ${ns_b} ip link set veth_b up
}
+
# Create a vti (IPv4) or vti6 (IPv6) tunnel between A and B, with key 10.
#
# Arguments:
#   $1 - IP version: 6 selects vti6, anything else vti
#   $2 - tunnel endpoint (veth) address on A
#   $3 - tunnel endpoint (veth) address on B
#   $4 - address to assign to the vti device on A
#   $5 - address to assign to the vti device on B
#   $6 - prefix length for both vti addresses
#
# Returns 1 if the device on A cannot be created.
setup_vti() {
	proto=${1}
	veth_a_addr="${2}"
	veth_b_addr="${3}"
	vti_a_addr="${4}"
	vti_b_addr="${5}"
	vti_mask=${6}

	if [ ${proto} -eq 6 ]; then
		vti_type="vti6"
	else
		vti_type="vti"
	fi

	run_cmd ${ns_a} ip link add vti${proto}_a type ${vti_type} local ${veth_a_addr} remote ${veth_b_addr} key 10 || return 1
	run_cmd ${ns_b} ip link add vti${proto}_b type ${vti_type} local ${veth_b_addr} remote ${veth_a_addr} key 10

	run_cmd ${ns_a} ip addr add "${vti_a_addr}/${vti_mask}" dev "vti${proto}_a"
	run_cmd ${ns_b} ip addr add "${vti_b_addr}/${vti_mask}" dev "vti${proto}_b"

	run_cmd ${ns_a} ip link set "vti${proto}_a" up
	run_cmd ${ns_b} ip link set "vti${proto}_b" up
}
+
# vti over the IPv4 veth addresses, carrying the IPv4 tunnel addresses
setup_vti4() {
	setup_vti 4 \
		${veth4_a_addr} ${veth4_b_addr} \
		${tunnel4_a_addr} ${tunnel4_b_addr} ${tunnel4_mask}
}

# vti6 over the IPv6 veth addresses, carrying the IPv6 tunnel addresses
setup_vti6() {
	setup_vti 6 \
		${veth6_a_addr} ${veth6_b_addr} \
		${tunnel6_a_addr} ${tunnel6_b_addr} ${tunnel6_mask}
}
+
# Create a VXLAN or GENEVE tunnel (id 1) between namespaces A and B.
#
# Arguments:
#   $1 - tunnel type: vxlan or geneve
#   $2 - underlay address of A
#   $3 - underlay address of B
#   $4 - extra "ip link add" options, may be empty
#   $5 - optional bridge in A: if given, A's tunnel addresses go on the
#        bridge and the tunnel device is enslaved to it
#
# Returns 1 if the device on A cannot be created.
setup_vxlan_or_geneve() {
	type="${1}"
	a_addr="${2}"
	b_addr="${3}"
	opts="${4}"
	br_if_a="${5}"

	# Only VXLAN gets explicit local endpoints, a TTL and a destination
	# port; GENEVE takes the caller's options as they are
	opts_a=""
	opts_b=""
	if [ "${type}" = "vxlan" ]; then
		opts="${opts} ttl 64 dstport 4789"
		opts_a="local ${a_addr}"
		opts_b="local ${b_addr}"
	fi

	run_cmd ${ns_a} ip link add ${type}_a type ${type} id 1 ${opts_a} remote ${b_addr} ${opts} || return 1
	run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts}

	if [ -z "${br_if_a}" ]; then
		run_cmd ${ns_a} ip addr add "${tunnel4_a_addr}/${tunnel4_mask}" dev ${type}_a
		run_cmd ${ns_a} ip addr add "${tunnel6_a_addr}/${tunnel6_mask}" dev ${type}_a
	else
		run_cmd ${ns_a} ip addr add "${tunnel4_a_addr}/${tunnel4_mask}" dev ${br_if_a}
		run_cmd ${ns_a} ip addr add "${tunnel6_a_addr}/${tunnel6_mask}" dev ${br_if_a}
		run_cmd ${ns_a} ip link set ${type}_a master ${br_if_a}
	fi

	run_cmd ${ns_b} ip addr add "${tunnel4_b_addr}/${tunnel4_mask}" dev ${type}_b
	run_cmd ${ns_b} ip addr add "${tunnel6_b_addr}/${tunnel6_mask}" dev ${type}_b

	run_cmd ${ns_a} ip link set ${type}_a up
	run_cmd ${ns_b} ip link set ${type}_b up
}
+
# Wrappers for setup_vxlan_or_geneve. The trailing digit is the IP version of
# the underlay; the IPv4 variants pass "df set", the IPv6 ones pass no extra
# options.
setup_geneve4() { setup_vxlan_or_geneve geneve "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1" "df set"; }
setup_vxlan4()  { setup_vxlan_or_geneve vxlan "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1" "df set"; }
setup_geneve6() { setup_vxlan_or_geneve geneve "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1" ""; }
setup_vxlan6()  { setup_vxlan_or_geneve vxlan "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1" ""; }

# Same tunnels, with the endpoint on A attached to the bridge br0
setup_bridged_geneve4() { setup_vxlan_or_geneve geneve "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1" "df set" "br0"; }
setup_bridged_vxlan4()  { setup_vxlan_or_geneve vxlan "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1" "df set" "br0"; }
setup_bridged_geneve6() { setup_vxlan_or_geneve geneve "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1" "" "br0"; }
setup_bridged_vxlan6()  { setup_vxlan_or_geneve vxlan "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1" "" "br0"; }
+
# Install matching ESP tunnel-mode xfrm states and policies in A and B.
#
# Arguments:
#   $1 - IP version passed to ip as -4 / -6
#   $2 - tunnel endpoint address on A
#   $3 - tunnel endpoint address on B
#
# Returns 1 if the first state cannot be added in A.
setup_xfrm() {
	proto=${1}
	veth_a_addr="${2}"
	veth_b_addr="${3}"

	# From here on the positional parameters hold the part shared by all
	# four "xfrm state add" commands: protocol, AEAD algorithm, key, ICV
	# length and mode.
	set -- proto esp aead 'rfc4106(gcm(aes))' 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f 128 mode tunnel

	# Namespace A: states for both directions, then out/in policies
	run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 "$@" || return 1
	run_cmd ${ns_a} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 "$@"
	run_cmd ${ns_a} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
	run_cmd ${ns_a} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel

	# Namespace B: same states, policies with the directions mirrored
	run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_a_addr} dst ${veth_b_addr} spi 0x1000 "$@"
	run_cmd ${ns_b} ip -${proto} xfrm state add src ${veth_b_addr} dst ${veth_a_addr} spi 0x1001 "$@"
	run_cmd ${ns_b} ip -${proto} xfrm policy add dir out mark 10 tmpl src ${veth_b_addr} dst ${veth_a_addr} proto esp mode tunnel
	run_cmd ${ns_b} ip -${proto} xfrm policy add dir in mark 10 tmpl src ${veth_a_addr} dst ${veth_b_addr} proto esp mode tunnel
}
+
# xfrm states and policies between the IPv4 / IPv6 veth addresses of A and B
setup_xfrm4() { setup_xfrm 4 ${veth4_a_addr} ${veth4_b_addr}; }
setup_xfrm6() { setup_xfrm 6 ${veth6_a_addr} ${veth6_b_addr}; }
+
# Install the routes listed in ${routes} using plain "via" gateways.
# The list is a flat sequence of words, consumed three at a time:
# namespace id, destination, gateway.
setup_routing_old() {
	for i in ${routes}; do
		# Collect one field per iteration until the record is complete
		if [ -z "${ns}" ]; then
			ns="${i}"
			continue
		fi
		if [ -z "${addr}" ]; then
			addr="${i}"
			continue
		fi
		if [ -z "${gw}" ]; then
			gw="${i}"
		fi

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} route add ${addr} via ${gw}

		# Start over for the next record
		ns=""
		addr=""
		gw=""
	done
}
+
# Install routes using nexthop objects.
#
# ${nexthops} is a flat word list consumed five at a time (namespace id,
# family, nexthop id, gateway, device); ${routes_nh} is consumed four at a
# time (namespace id, family, destination, nexthop id).
setup_routing_new() {
	# The field parser below decides which field comes next by looking
	# for the first empty variable. These are globals, and "dev" in
	# particular is also assigned by mtu(), so clear them all before
	# parsing instead of trusting the state left by earlier code.
	ns=""; fam=""; nhid=""; gw=""; dev=""; addr=""

	for i in ${nexthops}; do
		[ "${ns}" = "" ] && ns="${i}" && continue
		[ "${fam}" = "" ] && fam="${i}" && continue
		[ "${nhid}" = "" ] && nhid="${i}" && continue
		[ "${gw}" = "" ] && gw="${i}" && continue
		[ "${dev}" = "" ] && dev="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} -${fam} nexthop add id ${nhid} via ${gw} dev ${dev}

		ns=""; fam=""; nhid=""; gw=""; dev=""

	done

	# Drop any partial record left over from the first list
	ns=""; fam=""; nhid=""; gw=""; dev=""; addr=""

	for i in ${routes_nh}; do
		[ "${ns}" = "" ] && ns="${i}" && continue
		[ "${fam}" = "" ] && fam="${i}" && continue
		[ "${addr}" = "" ] && addr="${i}" && continue
		[ "${nhid}" = "" ] && nhid="${i}"

		ns_name="$(nsname ${ns})"

		ip -n ${ns_name} -${fam} route add ${addr} nhid ${nhid}

		ns=""; fam=""; addr=""; nhid=""
	done
}
+
# Build the routed topology: enable forwarding on R1 and R2, create one veth
# link per entry of ${routing_addrs} (namespace id, peer id, segment), assign
# .1/::1 to the namespace side and .2/::2 to the peer side, then install the
# routes with nexthop objects (USE_NH=yes) or plain gateways.
#
# Returns 1 if a veth link cannot be created, 0 otherwise.
setup_routing() {
	for i in ${NS_R1} ${NS_R2}; do
		ip netns exec ${i} sysctl -q net/ipv4/ip_forward=1
		ip netns exec ${i} sysctl -q net/ipv6/conf/all/forwarding=1
	done

	for i in ${routing_addrs}; do
		# Collect one field per iteration until the record is complete
		if [ -z "${ns}" ]; then
			ns="${i}"
			continue
		fi
		if [ -z "${peer}" ]; then
			peer="${i}"
			continue
		fi
		if [ -z "${segment}" ]; then
			segment="${i}"
		fi

		ns_name="$(nsname ${ns})"
		peer_name="$(nsname ${peer})"
		if="veth_${ns}-${peer}"
		ifpeer="veth_${peer}-${ns}"

		# Create veth links
		if ! ip link add ${if} up netns ${ns_name} type veth peer name ${ifpeer} netns ${peer_name}; then
			return 1
		fi
		ip -n ${peer_name} link set dev ${ifpeer} up

		# Add addresses: namespace side first, then the peer side
		ip -n ${ns_name} addr add "${prefix4}.${segment}.1/24" dev ${if}
		ip -n ${ns_name} addr add "${prefix6}:${segment}::1/64" dev ${if}

		ip -n ${peer_name} addr add "${prefix4}.${segment}.2/24" dev ${ifpeer}
		ip -n ${peer_name} addr add "${prefix6}:${segment}::2/64" dev ${ifpeer}

		ns=""
		peer=""
		segment=""
	done

	case "$USE_NH" in
	yes)
		setup_routing_new
		;;
	*)
		setup_routing_old
		;;
	esac

	return 0
}
+
# Create bridge br0 in namespace A and connect namespace C to it through a
# veth pair (veth_C-A in C, veth_A-C enslaved to br0 in A).
# Returns 2 if the bridge cannot be created.
setup_bridge() {
	run_cmd ${ns_a} ip link add br0 type bridge || return 2
	run_cmd ${ns_a} ip link set br0 up

	# The pair is created in C; the A side is then moved to A. Use
	# ${NS_A} like the rest of the script rather than a literal name,
	# so that the namespace name is defined in a single place.
	run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
	run_cmd ${ns_c} ip link set veth_A-C netns ${NS_A}

	run_cmd ${ns_a} ip link set veth_A-C up
	run_cmd ${ns_c} ip link set veth_C-A up
	run_cmd ${ns_c} ip addr add ${veth4_c_addr}/${veth4_mask} dev veth_C-A
	run_cmd ${ns_c} ip addr add ${veth6_c_addr}/${veth6_mask} dev veth_C-A
	run_cmd ${ns_a} ip link set veth_A-C master br0
}
+
# Create a VXLAN or GENEVE tunnel whose local end is an Open vSwitch port on
# ovs_br0 and whose remote end is a regular tunnel device in namespace B.
#
# Arguments:
#   $1 - tunnel type: vxlan or geneve
#   $2 - underlay address of the local (OVS) side
#   $3 - underlay address of B
#
# Returns 1 if either end cannot be created.
setup_ovs_vxlan_or_geneve() {
	type="${1}"
	a_addr="${2}"
	b_addr="${3}"

	# "opts" and "opts_b" are globals that setup_vxlan_or_geneve assigns
	# too. Start from empty values so that options from an earlier setup,
	# or from the environment, cannot leak into the command for B.
	opts=""
	opts_b=""

	if [ "${type}" = "vxlan" ]; then
		opts="ttl 64 dstport 4789"
		opts_b="local ${b_addr}"
	fi

	run_cmd ovs-vsctl add-port ovs_br0 ${type}_a -- \
		set interface ${type}_a type=${type} \
		options:remote_ip=${b_addr} options:key=1 options:csum=true || return 1

	run_cmd ${ns_b} ip link add ${type}_b type ${type} id 1 ${opts_b} remote ${a_addr} ${opts} || return 1

	run_cmd ${ns_b} ip addr add ${tunnel4_b_addr}/${tunnel4_mask} dev ${type}_b
	run_cmd ${ns_b} ip addr add ${tunnel6_b_addr}/${tunnel6_mask} dev ${type}_b

	run_cmd ${ns_b} ip link set ${type}_b up
}
+
# Wrappers for setup_ovs_vxlan_or_geneve; the trailing digit is the IP
# version of the underlay addresses.
setup_ovs_geneve4() { setup_ovs_vxlan_or_geneve geneve "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1"; }
setup_ovs_vxlan4()  { setup_ovs_vxlan_or_geneve vxlan "${prefix4}.${a_r1}.1" "${prefix4}.${b_r1}.1"; }
setup_ovs_geneve6() { setup_ovs_vxlan_or_geneve geneve "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1"; }
setup_ovs_vxlan6()  { setup_ovs_vxlan_or_geneve vxlan "${prefix6}:${a_r1}::1" "${prefix6}:${b_r1}::1"; }
+
# Create the Open vSwitch bridge ovs_br0 in the current namespace, connect
# namespace C to it with a veth pair, and take over A's link towards R1
# (veth_A-R1) together with its addresses and the routes to B.
# Returns 2 if the bridge cannot be created.
setup_ovs_bridge() {
	if ! run_cmd ovs-vsctl add-br ovs_br0; then
		return 2
	fi
	run_cmd ip link set ovs_br0 up

	# veth pair between C and here: created in C, one end moved out
	run_cmd ${ns_c} ip link add veth_C-A type veth peer name veth_A-C
	run_cmd ${ns_c} ip link set veth_A-C netns 1

	run_cmd ip link set veth_A-C up
	run_cmd ${ns_c} ip link set veth_C-A up
	run_cmd ${ns_c} ip addr add "${veth4_c_addr}/${veth4_mask}" dev veth_C-A
	run_cmd ${ns_c} ip addr add "${veth6_c_addr}/${veth6_mask}" dev veth_C-A
	run_cmd ovs-vsctl add-port ovs_br0 veth_A-C

	# Move veth_A-R1 to init
	run_cmd ${ns_a} ip link set veth_A-R1 netns 1
	run_cmd ip addr add "${prefix4}.${a_r1}.1/${veth4_mask}" dev veth_A-R1
	run_cmd ip addr add "${prefix6}:${a_r1}::1/${veth6_mask}" dev veth_A-R1
	run_cmd ip link set veth_A-R1 up
	run_cmd ip route add "${prefix4}.${b_r1}.1" via "${prefix4}.${a_r1}.2"
	run_cmd ip route add "${prefix6}:${b_r1}::1" via "${prefix6}:${a_r1}::2"
}
+
# Run setup_<name> for every argument, in order.
# Returns $ksft_skip when not running as root, and 1 (after reporting which
# step failed) as soon as one setup step returns non-zero.
setup() {
	if [ "$(id -u)" -ne 0 ]; then
		echo " need to run as root" && return $ksft_skip
	fi

	for arg in "$@"; do
		if ! eval setup_${arg}; then
			echo " ${arg} not supported"
			return 1
		fi
	done
}
+
# Start background packet captures when TRACING is enabled.
#
# Arguments come in pairs: a namespace command prefix (for example the value
# of ${ns_a}; an empty string means the current namespace) followed by an
# interface name. One tcpdump is started per pair, writing to
# "${name}_<interface>.pcap", and its PID is appended to tcpdump_pids.
trace() {
	[ $TRACING -eq 0 ] && return

	# Consume the arguments strictly two at a time. This does not depend
	# on the previous value of the global ns_cmd (which mtu, link_get and
	# route_get_dst_exception also assign), and it lets an empty prefix
	# be passed for the current namespace.
	while [ $# -ge 2 ]; do
		ns_cmd="${1}"
		${ns_cmd} tcpdump --immediate-mode -s 0 -i "${2}" -w "${name}_${2}.pcap" 2> /dev/null &
		tcpdump_pids="${tcpdump_pids} $!"
		shift 2
	done
	ns_cmd=

	# Pause before returning so the captures have had time to start
	sleep 1
}
+
# Undo whatever the setup functions and trace may have created: stop the
# captures, delete the namespaces, and remove the devices and Open vSwitch
# objects created outside them. Failures to delete are not reported.
cleanup() {
	# Captures started by trace
	for pid in ${tcpdump_pids}; do
		kill ${pid}
	done
	tcpdump_pids=

	for n in ${NS_A} ${NS_B} ${NS_C} ${NS_R1} ${NS_R2}; do
		ip netns del ${n} 2> /dev/null
	done

	# Links that the OVS setup leaves in the current namespace
	for n in veth_A-C veth_A-R1; do
		ip link del ${n} 2>/dev/null
	done

	ovs-vsctl --if-exists del-port vxlan_a 2>/dev/null
	ovs-vsctl --if-exists del-br ovs_br0 2>/dev/null
}
+
# Set the MTU of a device.
#   $1 - namespace command prefix (for example the value of ${ns_a})
#   $2 - device name
#   $3 - MTU to set
# The arguments are also stored in the globals ns_cmd, dev and mtu.
mtu() {
	ns_cmd="${1}"; dev="${2}"; mtu="${3}"

	# The prefix is deliberately unquoted: it is a multi-word command
	${1} ip link set dev ${2} mtu ${3}
}
+
# Extract the MTU from "ip link show" or "ip route get" output.
#   $1 - the text to scan
# Prints the word following the first "mtu" keyword, or "lock <value>" when
# that word is "lock"; prints nothing if there is no "mtu" keyword.
mtu_parse() {
	input="${1}"

	# next: 0 = looking for "mtu", 1 = just saw "mtu", 2 = just saw
	# "mtu lock". Each state is tested on its own: combining tests with
	# "[ ... -a ... ]" is obsolescent and ambiguous for some operands.
	next=0
	for i in ${input}; do
		case "${next}" in
		1)
			if [ "${i}" = "lock" ]; then
				next=2
				continue
			fi
			echo "${i}"
			return
			;;
		2)
			echo "lock ${i}"
			return
			;;
		esac
		[ "${i}" = "mtu" ] && next=1
	done
}
+
+# Print the "ip link show" output for a device.
+# Arguments: $1 - command prefix selecting the namespace, $2 - device name.
+# Note: overwrites the global variables ns_cmd and name.
+link_get() {
+ ns_cmd="${1}"
+ name="${2}"
+
+ ${ns_cmd} ip link show dev "${name}"
+}
+
+# Print the MTU of a device, as parsed by mtu_parse from link_get output.
+# Arguments: $1 - command prefix selecting the namespace, $2 - device name.
+link_get_mtu() {
+ ns_cmd="${1}"
+ name="${2}"
+
+ mtu_parse "$(link_get "${ns_cmd}" ${name})"
+}
+
+# Print the "ip route get" output for a destination.
+# Arguments: $1 - command prefix selecting the namespace,
+#            $2 - destination address.
+route_get_dst_exception() {
+ ns_cmd="${1}"
+ dst="${2}"
+
+ ${ns_cmd} ip route get "${dst}"
+}
+
+# Print the MTU found in the "ip route get" output for a destination, as
+# parsed by mtu_parse; prints nothing if that output carries no "mtu" word.
+# Arguments: $1 - command prefix selecting the namespace,
+#            $2 - destination address.
+route_get_dst_pmtu_from_exception() {
+ ns_cmd="${1}"
+ dst="${2}"
+
+ mtu_parse "$(route_get_dst_exception "${ns_cmd}" ${dst})"
+}
+
+# Compare a PMTU value against the expected one.
+# Arguments: $1 - expected value: a number, "lock <number>", "any" (accept
+#                 every non-empty value) or "" (no exception expected),
+#            $2 - value actually found, possibly empty,
+#            $3 - description of the triggering event, used in messages.
+# Returns 0 on match; otherwise reports the mismatch through err and
+# returns 1.
+check_pmtu_value() {
+ expected="${1}"
+ value="${2}"
+ event="${3}"
+
+ [ "${expected}" = "any" ] && [ -n "${value}" ] && return 0
+ [ "${value}" = "${expected}" ] && return 0
+ [ -z "${value}" ] && err " PMTU exception wasn't created after ${event}" && return 1
+ [ -z "${expected}" ] && err " PMTU exception shouldn't exist after ${event}" && return 1
+ err " found PMTU exception with incorrect MTU ${value}, expected ${expected}, after ${event}"
+ return 1
+}
+
+# Check creation and update of PMTU route exceptions on plain routed paths.
+# Arguments: $1 - IP family, 4 or 6.
+# Pings two addresses of B (one on the R1 link, one on the R2 link) from A with
+# packets larger than the MTU configured further along the path, then checks
+# the PMTU reported by "ip route get" after each local or remote MTU change.
+# Returns 2 if setup fails and 1 on the first failed check.
+test_pmtu_ipvX() {
+ family=${1}
+
+ setup namespaces routing || return 2
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
+ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
+ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst1="${prefix4}.${b_r1}.1"
+ dst2="${prefix4}.${b_r2}.1"
+ else
+ ping=${ping6}
+ dst1="${prefix6}:${b_r1}::1"
+ dst2="${prefix6}:${b_r2}::1"
+ fi
+
+ # Set up initial MTU values
+ mtu "${ns_a}" veth_A-R1 2000
+ mtu "${ns_r1}" veth_R1-A 2000
+ mtu "${ns_r1}" veth_R1-B 1400
+ mtu "${ns_b}" veth_B-R1 1400
+
+ mtu "${ns_a}" veth_A-R2 2000
+ mtu "${ns_r2}" veth_R2-A 2000
+ mtu "${ns_r2}" veth_R2-B 1500
+ mtu "${ns_b}" veth_B-R2 1500
+
+ # Create route exceptions
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1}
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2}
+
+ # Check that exceptions have been created with the correct PMTU
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
+ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
+
+ # Decrease local MTU below PMTU, check for PMTU decrease in route exception
+ mtu "${ns_a}" veth_A-R1 1300
+ mtu "${ns_r1}" veth_R1-A 1300
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
+ check_pmtu_value "1300" "${pmtu_1}" "decreasing local MTU" || return 1
+ # Second exception shouldn't be modified
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1
+
+ # Increase MTU, check for PMTU increase in route exception
+ mtu "${ns_a}" veth_A-R1 1700
+ mtu "${ns_r1}" veth_R1-A 1700
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
+ check_pmtu_value "1700" "${pmtu_1}" "increasing local MTU" || return 1
+ # Second exception shouldn't be modified
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "1500" "${pmtu_2}" "changing local MTU on a link not on this path" || return 1
+
+ # Skip PMTU locking tests for IPv6
+ [ $family -eq 6 ] && return 0
+
+ # Decrease remote MTU on path via R2, get new exception
+ mtu "${ns_r2}" veth_R2-B 400
+ mtu "${ns_b}" veth_B-R2 400
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2}
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1
+
+ # Decrease local MTU below PMTU
+ mtu "${ns_a}" veth_A-R2 500
+ mtu "${ns_r2}" veth_R2-A 500
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "500" "${pmtu_2}" "decreasing local MTU" || return 1
+
+ # Increase local MTU
+ mtu "${ns_a}" veth_A-R2 1500
+ mtu "${ns_r2}" veth_R2-A 1500
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "1500" "${pmtu_2}" "increasing local MTU" || return 1
+
+ # Get new exception
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2}
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1
+}
+
+# Entry points for the routed-path PMTU exception test, one per IP family
+# (the argument of test_pmtu_ipvX).
+test_pmtu_ipv4_exception() {
+ test_pmtu_ipvX 4
+}
+
+test_pmtu_ipv6_exception() {
+ test_pmtu_ipvX 6
+}
+
+# Check that a PMTU exception with the expected value is created for traffic
+# sent over a VXLAN or GENEVE tunnel when the underlying link MTU is exceeded.
+# Arguments: $1 - tunnel type, "vxlan" or "geneve",
+#            $2 - family of the tunnelled traffic, 4 or 6,
+#            $3 - family of the tunnel endpoints, 4 or 6.
+# Returns 2 if setup fails, otherwise the status of the final check.
+test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+ outer_family=${3}
+ ll_mtu=4000
+
+ if [ ${outer_family} -eq 4 ]; then
+ setup namespaces routing ${type}4 || return 2
+ # Overhead: IPv4 header (20), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14))
+ else
+ setup namespaces routing ${type}6 || return 2
+ # Overhead: IPv6 header (40), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14))
+ fi
+
+ trace "${ns_a}" ${type}_a "${ns_b}" ${type}_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
+
+ # Check that exception was created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${type} interface"
+}
+
+# Entry points for the VXLAN/GENEVE tunnel tests. Each passes
+# <tunnel type> <inner family> <outer family> to the common implementation;
+# in the function names, "ipvX" is the inner family and the digit after the
+# tunnel type is the outer one.
+test_pmtu_ipv4_vxlan4_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 4
+}
+
+test_pmtu_ipv6_vxlan4_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 4
+}
+
+test_pmtu_ipv4_geneve4_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 4
+}
+
+test_pmtu_ipv6_geneve4_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 4
+}
+
+test_pmtu_ipv4_vxlan6_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 4 6
+}
+
+test_pmtu_ipv6_vxlan6_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception vxlan 6 6
+}
+
+test_pmtu_ipv4_geneve6_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 4 6
+}
+
+test_pmtu_ipv6_geneve6_exception() {
+ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception geneve 6 6
+}
+
+# Check that PMTU exceptions with the expected value are created for traffic
+# sent over a bridged VXLAN or GENEVE tunnel, both for packets coming from
+# namespace C across the bridge and for packets sent locally from A.
+# Arguments: $1 - tunnel type, "vxlan" or "geneve",
+#            $2 - family of the tunnelled traffic, 4 or 6,
+#            $3 - family of the tunnel endpoints, 4 or 6.
+# Returns 2 if setup fails, 1 if a ping or a PMTU check fails.
+test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+ outer_family=${3}
+ ll_mtu=4000
+
+ if [ ${outer_family} -eq 4 ]; then
+ setup namespaces routing bridge bridged_${type}4 || return 2
+ # Overhead: IPv4 header (20), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14))
+ else
+ setup namespaces routing bridge bridged_${type}6 || return 2
+ # Overhead: IPv6 header (40), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14))
+ fi
+
+ # The interface towards C is veth_A-C, the same name used by the mtu
+ # call below
+ trace "${ns_a}" ${type}_a "${ns_b}" ${type}_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B \
+ "${ns_a}" br0 "${ns_a}" veth_A-C \
+ "${ns_c}" veth_C-A
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_a}" br0 $((${ll_mtu} + 1000))
+ mtu "${ns_a}" veth_A-C $((${ll_mtu} + 1000))
+ mtu "${ns_c}" veth_C-A $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+
+ run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 10 -s $((${ll_mtu} + 500)) ${dst} || return 1
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} || return 1
+
+ # Check that exceptions were created. The first check must fail the test
+ # on its own: without the explicit return its status would be overwritten
+ # by the second check.
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on bridged ${type} interface" || return 1
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on locally bridged ${type} interface"
+}
+
+# Entry points for the bridged VXLAN/GENEVE tunnel tests. Each passes
+# <tunnel type> <inner family> <outer family> to the common implementation.
+test_pmtu_ipv4_br_vxlan4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 4
+}
+
+test_pmtu_ipv6_br_vxlan4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 4
+}
+
+test_pmtu_ipv4_br_geneve4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 4
+}
+
+test_pmtu_ipv6_br_geneve4_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 4
+}
+
+test_pmtu_ipv4_br_vxlan6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 4 6
+}
+
+test_pmtu_ipv6_br_vxlan6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception vxlan 6 6
+}
+
+test_pmtu_ipv4_br_geneve6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 4 6
+}
+
+test_pmtu_ipv6_br_geneve6_exception() {
+ test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception geneve 6 6
+}
+
+# Check that a PMTU exception with the expected value is created in
+# namespace C for traffic forwarded by Open vSwitch into a VXLAN or GENEVE
+# tunnel. The A side of the topology lives in the initial namespace, hence
+# the empty namespace prefixes passed to trace and mtu.
+# Arguments: $1 - tunnel type, "vxlan" or "geneve",
+#            $2 - family of the tunnelled traffic, 4 or 6,
+#            $3 - family of the tunnel endpoints, 4 or 6.
+# Returns 2 if setup fails, 1 if the ping or the PMTU check fails.
+test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception() {
+ type=${1}
+ family=${2}
+ outer_family=${3}
+ ll_mtu=4000
+
+ if [ ${outer_family} -eq 4 ]; then
+ setup namespaces routing ovs_bridge ovs_${type}4 || return 2
+ # Overhead: IPv4 header (20), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 20 - 8 - 8 - 14))
+ else
+ setup namespaces routing ovs_bridge ovs_${type}6 || return 2
+ # Overhead: IPv6 header (40), UDP header (8), VXLAN/GENEVE header (8),
+ # Ethernet header (14)
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - 14))
+ fi
+
+ # Name of the tunnel device on the local side, which differs from the
+ # ${type}_b name used in namespace B
+ if [ "${type}" = "vxlan" ]; then
+ tun_a="vxlan_sys_4789"
+ elif [ "${type}" = "geneve" ]; then
+ tun_a="genev_sys_6081"
+ fi
+
+ # The interface towards C is veth_A-C, the same name used by the mtu
+ # call below
+ trace "" "${tun_a}" "${ns_b}" ${type}_b \
+ "" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B \
+ "" ovs_br0 "" veth_A-C \
+ "${ns_c}" veth_C-A
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "" ovs_br0 $((${ll_mtu} + 1000))
+ mtu "" veth_A-C $((${ll_mtu} + 1000))
+ mtu "${ns_c}" veth_C-A $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "" ${tun_a} $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000))
+
+ run_cmd ${ns_c} ${ping} -q -M want -i 0.1 -c 20 -s $((${ll_mtu} + 500)) ${dst} || return 1
+
+ # Check that exceptions were created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_c}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on Open vSwitch ${type} interface"
+}
+
+# Entry points for the Open vSwitch VXLAN/GENEVE tunnel tests. Each passes
+# <tunnel type> <inner family> <outer family> to the common implementation.
+test_pmtu_ipv4_ovs_vxlan4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 4
+}
+
+test_pmtu_ipv6_ovs_vxlan4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 4
+}
+
+test_pmtu_ipv4_ovs_geneve4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 4
+}
+
+test_pmtu_ipv6_ovs_geneve4_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 4
+}
+
+test_pmtu_ipv4_ovs_vxlan6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 4 6
+}
+
+test_pmtu_ipv6_ovs_vxlan6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception vxlan 6 6
+}
+
+test_pmtu_ipv4_ovs_geneve6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 4 6
+}
+
+test_pmtu_ipv6_ovs_geneve6_exception() {
+ test_pmtu_ipvX_over_ovs_vxlanY_or_geneveY_exception geneve 6 6
+}
+
+# Check that a PMTU exception with the expected value is created for traffic
+# sent over a FoU or GUE encapsulated tunnel when the link MTU is exceeded.
+# Arguments: $1 - family of the tunnelled traffic, 4 or 6,
+#            $2 - family of the outer packets, 4 or 6,
+#            $3 - encapsulation, "fou" or "gue".
+# Returns 2 if setup fails, otherwise the status of the final check.
+test_pmtu_ipvX_over_fouY_or_gueY() {
+ inner_family=${1}
+ outer_family=${2}
+ encap=${3}
+ ll_mtu=4000
+
+ # The setup step is named <encap><outer family><inner family>
+ setup namespaces routing ${encap}${outer_family}${inner_family} || return 2
+ trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B
+
+ if [ ${inner_family} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ # GUE is accounted four more bytes than FoU
+ if [ "${encap}" = "gue" ]; then
+ encap_overhead=4
+ else
+ encap_overhead=0
+ fi
+
+ if [ ${outer_family} -eq 4 ]; then
+ # Overhead: IPv4 header (20), UDP header (8)
+ exp_mtu=$((${ll_mtu} - 20 - 8 - ${encap_overhead}))
+ else
+ # Overhead: IPv6 header (40), Option 4 (8), UDP header (8)
+ exp_mtu=$((${ll_mtu} - 40 - 8 - 8 - ${encap_overhead}))
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000))
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
+
+ # Check that exception was created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ${encap} interface"
+}
+
+# Entry points for the FoU/GUE tests. Each passes
+# <inner family> <outer family> <encapsulation> to the common implementation.
+test_pmtu_ipv4_fou4_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 4 4 fou
+}
+
+test_pmtu_ipv6_fou4_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 6 4 fou
+}
+
+test_pmtu_ipv4_fou6_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 4 6 fou
+}
+
+test_pmtu_ipv6_fou6_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 6 6 fou
+}
+
+test_pmtu_ipv4_gue4_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 4 4 gue
+}
+
+test_pmtu_ipv6_gue4_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 6 4 gue
+}
+
+test_pmtu_ipv4_gue6_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 4 6 gue
+}
+
+test_pmtu_ipv6_gue6_exception() {
+ test_pmtu_ipvX_over_fouY_or_gueY 6 6 gue
+}
+
+# Check that a PMTU exception with the expected value is created for traffic
+# sent over an IP-in-IP tunnel (devices ip_a / ip_b) when the link MTU is
+# exceeded.
+# Arguments: $1 - family of the tunnelled traffic, 4 or 6,
+#            $2 - family of the outer packets, 4 or 6.
+# Returns 2 if setup fails, the status of mtu if a tunnel MTU cannot be set,
+# otherwise the status of the final check.
+test_pmtu_ipvX_over_ipvY_exception() {
+ inner=${1}
+ outer=${2}
+ ll_mtu=4000
+
+ setup namespaces routing ip${inner}ip${outer} || return 2
+
+ trace "${ns_a}" ip_a "${ns_b}" ip_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B
+
+ if [ ${inner} -eq 4 ]; then
+ ping=ping
+ dst=${tunnel4_b_addr}
+ else
+ ping=${ping6}
+ dst=${tunnel6_b_addr}
+ fi
+
+ if [ ${outer} -eq 4 ]; then
+ # Overhead: IPv4 header (20)
+ exp_mtu=$((${ll_mtu} - 20))
+ else
+ # Overhead: IPv6 header (40), Option 4 (8)
+ exp_mtu=$((${ll_mtu} - 40 - 8))
+ fi
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ip_a $((${ll_mtu} + 1000)) || return
+ mtu "${ns_b}" ip_b $((${ll_mtu} + 1000)) || return
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst}
+
+ # Check that exception was created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})"
+ check_pmtu_value ${exp_mtu} "${pmtu}" "exceeding link layer MTU on ip${inner}ip${outer} interface"
+}
+
+# Entry points for the IP-in-IP tunnel tests. Each passes
+# <inner family> <outer family> to the common implementation.
+test_pmtu_ipv4_ipv4_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 4 4
+}
+
+test_pmtu_ipv6_ipv4_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 6 4
+}
+
+test_pmtu_ipv4_ipv6_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 4 6
+}
+
+test_pmtu_ipv6_ipv6_exception() {
+ test_pmtu_ipvX_over_ipvY_exception 6 6
+}
+
+# Check PMTU exceptions on a vti4 tunnel: a packet that just fits must not
+# create an exception, a packet one byte larger must create one with the
+# expected value.
+# Returns 2 if setup fails, 1 if the first check fails, otherwise the status
+# of the second check.
+test_pmtu_vti4_exception() {
+ setup namespaces veth vti4 xfrm4 || return 2
+ trace "${ns_a}" veth_a "${ns_b}" veth_b \
+ "${ns_a}" vti4_a "${ns_b}" vti4_b
+
+ veth_mtu=1500
+ vti_mtu=$((veth_mtu - 20))
+
+ # Subtracted, in order: SPI (4), SN (4), IV (8), ICV (16), pad length (1),
+ # next header (1)
+ esp_payload_rfc4106=$((vti_mtu - 4 - 4 - 8 - 16 - 1 - 1))
+ # NOTE(review): 28 is presumably the IPv4 (20) plus ICMP (8) header
+ # length -- confirm
+ ping_payload=$((esp_payload_rfc4106 - 28))
+
+ mtu "${ns_a}" veth_a ${veth_mtu}
+ mtu "${ns_b}" veth_b ${veth_mtu}
+ mtu "${ns_a}" vti4_a ${vti_mtu}
+ mtu "${ns_b}" vti4_b ${vti_mtu}
+
+ # Send DF packet without exceeding link layer MTU, check that no
+ # exception is created
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr}
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
+ check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1
+
+ # Now exceed link layer MTU by one byte, check that exception is created
+ # with the right PMTU value
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload + 1)) ${tunnel4_b_addr}
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})"
+ check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))"
+}
+
+# Check PMTU exceptions on a vti6 tunnel: an exception must appear once the
+# link MTU is exceeded, and its value must follow later changes of the
+# tunnel MTU.
+# Returns 2 if setup fails, 1 if no exception is created or if any of the
+# follow-up checks fails (those are all run before returning).
+test_pmtu_vti6_exception() {
+ setup namespaces veth vti6 xfrm6 || return 2
+ trace "${ns_a}" veth_a "${ns_b}" veth_b \
+ "${ns_a}" vti6_a "${ns_b}" vti6_b
+ fail=0
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_a 4000
+ mtu "${ns_b}" veth_b 4000
+ mtu "${ns_a}" vti6_a 5000
+ mtu "${ns_b}" vti6_b 5000
+ run_cmd ${ns_a} ${ping6} -q -i 0.1 -w 1 -s 60000 ${tunnel6_b_addr}
+
+ # Check that exception was created
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
+ check_pmtu_value any "${pmtu}" "creating tunnel exceeding link layer MTU" || return 1
+
+ # Decrease tunnel MTU, check for PMTU decrease in route exception
+ mtu "${ns_a}" vti6_a 3000
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
+ check_pmtu_value "3000" "${pmtu}" "decreasing tunnel MTU" || fail=1
+
+ # Increase tunnel MTU, check for PMTU increase in route exception
+ mtu "${ns_a}" vti6_a 9000
+ pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})"
+ check_pmtu_value "9000" "${pmtu}" "increasing tunnel MTU" || fail=1
+
+ return ${fail}
+}
+
+# Check the default MTU of a vti4 device against the MTU of veth_a.
+# Returns 2 if setup fails, 1 if the difference is not 20 bytes.
+test_pmtu_vti4_default_mtu() {
+ setup namespaces veth vti4 || return 2
+
+ # Check that MTU of vti device is MTU of veth minus IPv4 header length
+ veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
+ vti4_mtu="$(link_get_mtu "${ns_a}" vti4_a)"
+ if [ $((veth_mtu - vti4_mtu)) -ne 20 ]; then
+ err " vti MTU ${vti4_mtu} is not veth MTU ${veth_mtu} minus IPv4 header length"
+ return 1
+ fi
+}
+
+# Check the default MTU of a vti6 device against the MTU of veth_a.
+# Returns 2 if setup fails, 1 if the difference is not 40 bytes.
+test_pmtu_vti6_default_mtu() {
+ setup namespaces veth vti6 || return 2
+
+ # Check that MTU of vti device is MTU of veth minus IPv6 header length
+ veth_mtu="$(link_get_mtu "${ns_a}" veth_a)"
+ vti6_mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ if [ $((veth_mtu - vti6_mtu)) -ne 40 ]; then
+ err " vti MTU ${vti6_mtu} is not veth MTU ${veth_mtu} minus IPv6 header length"
+ return 1
+ fi
+}
+
+# Check the MTU of vti devices created with an explicit "mtu" argument:
+# values just outside [min, max] must be rejected or adjusted into range,
+# values inside the range must be applied as given.
+# Returns 2 if setup fails or vti devices cannot be created, 1 if any check
+# fails, 0 otherwise.
+test_pmtu_vti4_link_add_mtu() {
+ setup namespaces || return 2
+
+ # Probe for vti support by creating and deleting a device
+ run_cmd ${ns_a} ip link add vti4_a type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10
+ [ $? -ne 0 ] && err " vti not supported" && return 2
+ run_cmd ${ns_a} ip link del vti4_a
+
+ fail=0
+
+ min=68
+ max=$((65535 - 20))
+ # Check invalid values first
+ for v in $((min - 1)) $((max + 1)); do
+ run_cmd ${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10
+ # This can fail, or MTU can be adjusted to a proper value
+ [ $? -ne 0 ] && continue
+ mtu="$(link_get_mtu "${ns_a}" vti4_a)"
+ if [ ${mtu} -lt ${min} -o ${mtu} -gt ${max} ]; then
+ err " vti tunnel created with invalid MTU ${mtu}"
+ fail=1
+ fi
+ run_cmd ${ns_a} ip link del vti4_a
+ done
+
+ # Now check valid values
+ for v in ${min} 1300 ${max}; do
+ run_cmd ${ns_a} ip link add vti4_a mtu ${v} type vti local ${veth4_a_addr} remote ${veth4_b_addr} key 10
+ mtu="$(link_get_mtu "${ns_a}" vti4_a)"
+ run_cmd ${ns_a} ip link del vti4_a
+ if [ "${mtu}" != "${v}" ]; then
+ err " vti MTU ${mtu} doesn't match configured value ${v}"
+ fail=1
+ fi
+ done
+
+ return ${fail}
+}
+
+# Check the MTU of vti6 devices created with an explicit "mtu" argument:
+# values just outside [min, max] must be rejected or adjusted into range,
+# values inside the range must be applied as given.
+# Returns 2 if setup fails or vti6 devices cannot be created, 1 if any check
+# fails, 0 otherwise.
+test_pmtu_vti6_link_add_mtu() {
+ setup namespaces || return 2
+
+ # Probe for vti6 support by creating and deleting a device
+ run_cmd ${ns_a} ip link add vti6_a type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
+ [ $? -ne 0 ] && err " vti6 not supported" && return 2
+ run_cmd ${ns_a} ip link del vti6_a
+
+ fail=0
+
+ min=68 # vti6 can carry IPv4 packets too
+ max=$((65535 - 40))
+ # Check invalid values first
+ for v in $((min - 1)) $((max + 1)); do
+ run_cmd ${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
+ # This can fail, or MTU can be adjusted to a proper value
+ [ $? -ne 0 ] && continue
+ mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ if [ ${mtu} -lt ${min} -o ${mtu} -gt ${max} ]; then
+ # Report the MTU the device actually got, not the requested one
+ err " vti6 tunnel created with invalid MTU ${mtu}"
+ fail=1
+ fi
+ run_cmd ${ns_a} ip link del vti6_a
+ done
+
+ # Now check valid values
+ for v in ${min} 1280 1300 ${max}; do
+ run_cmd ${ns_a} ip link add vti6_a mtu ${v} type vti6 local ${veth6_a_addr} remote ${veth6_b_addr} key 10
+ mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ run_cmd ${ns_a} ip link del vti6_a
+ if [ "${mtu}" != "${v}" ]; then
+ err " vti6 MTU ${mtu} doesn't match configured value ${v}"
+ fail=1
+ fi
+ done
+
+ return ${fail}
+}
+
+# Check how the MTU of a vti6 device behaves when its endpoints are moved
+# between two dummy devices with different MTUs (1500 and 3000), with and
+# without an explicit "mtu" argument.
+# Returns 2 if setup fails or dummy devices cannot be created, 1 if any check
+# fails, 0 otherwise.
+test_pmtu_vti6_link_change_mtu() {
+ setup namespaces || return 2
+
+ run_cmd ${ns_a} ip link add dummy0 mtu 1500 type dummy
+ [ $? -ne 0 ] && err " dummy not supported" && return 2
+ run_cmd ${ns_a} ip link add dummy1 mtu 3000 type dummy
+ run_cmd ${ns_a} ip link set dummy0 up
+ run_cmd ${ns_a} ip link set dummy1 up
+
+ run_cmd ${ns_a} ip addr add ${dummy6_0_prefix}1/${dummy6_mask} dev dummy0
+ run_cmd ${ns_a} ip addr add ${dummy6_1_prefix}1/${dummy6_mask} dev dummy1
+
+ fail=0
+
+ # Create vti6 interface bound to device, passing MTU, check it
+ run_cmd ${ns_a} ip link add vti6_a mtu 1300 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1
+ mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ if [ ${mtu} -ne 1300 ]; then
+ err " vti6 MTU ${mtu} doesn't match configured value 1300"
+ fail=1
+ fi
+
+ # Move to another device with different MTU, without passing MTU, check
+ # MTU is adjusted
+ run_cmd ${ns_a} ip link set vti6_a type vti6 remote ${dummy6_1_prefix}2 local ${dummy6_1_prefix}1
+ mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ if [ ${mtu} -ne $((3000 - 40)) ]; then
+ err " vti MTU ${mtu} is not dummy MTU 3000 minus IPv6 header length"
+ fail=1
+ fi
+
+ # Move it back, passing MTU, check MTU is not overridden
+ run_cmd ${ns_a} ip link set vti6_a mtu 1280 type vti6 remote ${dummy6_0_prefix}2 local ${dummy6_0_prefix}1
+ mtu="$(link_get_mtu "${ns_a}" vti6_a)"
+ if [ ${mtu} -ne 1280 ]; then
+ err " vti6 MTU ${mtu} doesn't match configured value 1280"
+ fail=1
+ fi
+
+ return ${fail}
+}
+
+# Check that a command is available.
+# Arguments: $1 - name of the command.
+# Returns 0 if the command is found; otherwise reports it through err and
+# returns 1.
+check_command() {
+ cmd=${1}
+
+ # "command -v" is a shell builtin, so the lookup does not itself depend
+ # on an external tool such as "which" being installed; the script's
+ # argument check uses it for the same reason.
+ if ! command -v "${cmd}" > /dev/null 2>&1; then
+ err " missing required command: '${cmd}'"
+ return 1
+ fi
+ return 0
+}
+
+# Check that a device can still be deleted promptly after PMTU exceptions
+# were created through it from more than one CPU.
+# Arguments: $1 - family of the VXLAN tunnel endpoints, 4 or 6.
+# Returns 2 if taskset is missing or setup fails, 1 if "ip link del" is still
+# running one second after it was started.
+test_cleanup_vxlanX_exception() {
+ outer="${1}"
+ encap="vxlan"
+ ll_mtu=4000
+
+ check_command taskset || return 2
+ # Second space-separated field of the first two "processor" lines
+ cpu_list=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2)
+
+ setup namespaces routing ${encap}${outer} || return 2
+ trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \
+ "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B
+
+ # Create route exception by exceeding link layer MTU
+ mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000))
+ mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000))
+ mtu "${ns_b}" veth_B-R1 ${ll_mtu}
+ mtu "${ns_r1}" veth_R1-B ${ll_mtu}
+
+ mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000))
+ mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000))
+
+ # Fill exception cache for multiple CPUs (2)
+ # we can always use inner IPv4 for that
+ for cpu in ${cpu_list}; do
+ run_cmd taskset --cpu-list ${cpu} ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${tunnel4_b_addr}
+ done
+
+ # Delete the device in the background and look at the process one second
+ # later: if its command line (NUL separators removed) is still the
+ # "ip link del" one, the deletion has not completed
+ ${ns_a} ip link del dev veth_A-R1 &
+ iplink_pid=$!
+ sleep 1
+ if [ "$(cat /proc/${iplink_pid}/cmdline 2>/dev/null | tr -d '\0')" = "iplinkdeldevveth_A-R1" ]; then
+ err " can't delete veth device in a timely manner, PMTU dst likely leaked"
+ return 1
+ fi
+}
+
+# Entry points for the device cleanup test, one per family of the VXLAN
+# tunnel endpoints.
+test_cleanup_ipv6_exception() {
+ test_cleanup_vxlanX_exception 6
+}
+
+test_cleanup_ipv4_exception() {
+ test_cleanup_vxlanX_exception 4
+}
+
+# Run one test in a subshell and print its result line.
+# Arguments: $1 - test name (the function test_<name> is called),
+#            $2 - description printed in the result line.
+# The test status selects the result: 0 - OK, 1 - FAIL, 2 - SKIP.
+# Returns that status and sets the global exitcode to 1 when it is not 0.
+run_test() {
+ (
+ tname="$1"
+ tdesc="$2"
+
+ # Go back to default word splitting inside the test
+ unset IFS
+
+ # Since cleanup() relies on variables modified by this subshell, it
+ # has to run in this context.
+ trap cleanup EXIT
+
+ if [ "$VERBOSE" = "1" ]; then
+ printf "\n##########################################################################\n\n"
+ fi
+
+ eval test_${tname}
+ ret=$?
+
+ if [ $ret -eq 0 ]; then
+ printf "TEST: %-60s [ OK ]\n" "${tdesc}"
+ elif [ $ret -eq 1 ]; then
+ printf "TEST: %-60s [FAIL]\n" "${tdesc}"
+ # Pause before the subshell exits and the EXIT trap runs cleanup
+ if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+ echo
+ echo "Pausing. Hit enter to continue"
+ read a
+ fi
+ err_flush
+ exit 1
+ elif [ $ret -eq 2 ]; then
+ printf "TEST: %-60s [SKIP]\n" "${tdesc}"
+ err_flush
+ fi
+
+ return $ret
+ )
+ # Status of the subshell, that is, of the test
+ ret=$?
+ [ $ret -ne 0 ] && exitcode=1
+
+ return $ret
+}
+
+# Run a test once more with USE_NH set to "yes", appending
+# " - nexthop objects" to its description; USE_NH is set back to "no"
+# afterwards.
+# Arguments: $1 - test name, $2 - test description.
+run_test_nh() {
+ tname="$1"
+ tdesc="$2"
+
+ USE_NH=yes
+ run_test "${tname}" "${tdesc} - nexthop objects"
+ USE_NH=no
+}
+
+test_list_flush_ipv4_exception() {
+ setup namespaces routing || return 2
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
+ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
+ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
+
+ dst_prefix1="${prefix4}.${b_r1}."
+ dst2="${prefix4}.${b_r2}.1"
+
+ # Set up initial MTU values
+ mtu "${ns_a}" veth_A-R1 2000
+ mtu "${ns_r1}" veth_R1-A 2000
+ mtu "${ns_r1}" veth_R1-B 1500
+ mtu "${ns_b}" veth_B-R1 1500
+
+ mtu "${ns_a}" veth_A-R2 2000
+ mtu "${ns_r2}" veth_R2-A 2000
+ mtu "${ns_r2}" veth_R2-B 1500
+ mtu "${ns_b}" veth_B-R2 1500
+
+ fail=0
+
+ # Add 100 addresses for veth endpoint on B reached by default A route
+ for i in $(seq 100 199); do
+ run_cmd ${ns_b} ip addr add "${dst_prefix1}${i}" dev veth_B-R1
+ done
+
+ # Create 100 cached route exceptions for path via R1, one via R2. Note
+ # that with IPv4 we need to actually cause a route lookup that matches
+ # the exception caused by ICMP, in order to actually have a cached
+ # route, so we need to ping each destination twice
+ for i in $(seq 100 199); do
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst_prefix1}${i}"
+ done
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -c 2 -s 1800 "${dst2}"
+
+ if [ "$(${ns_a} ip -oneline route list cache | wc -l)" -ne 101 ]; then
+ err " can't list cached exceptions"
+ fail=1
+ fi
+
+ run_cmd ${ns_a} ip route flush cache
+ pmtu1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst_prefix}1)"
+ pmtu2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst_prefix}2)"
+ if [ -n "${pmtu1}" ] || [ -n "${pmtu2}" ] || \
+ [ -n "$(${ns_a} ip route list cache)" ]; then
+ err " can't flush cached exceptions"
+ fail=1
+ fi
+
+ return ${fail}
+}
+
+test_list_flush_ipv6_exception() {
+ setup namespaces routing || return 2
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
+ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
+ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
+
+ dst_prefix1="${prefix6}:${b_r1}::"
+ dst2="${prefix6}:${b_r2}::1"
+
+ # Set up initial MTU values
+ mtu "${ns_a}" veth_A-R1 2000
+ mtu "${ns_r1}" veth_R1-A 2000
+ mtu "${ns_r1}" veth_R1-B 1500
+ mtu "${ns_b}" veth_B-R1 1500
+
+ mtu "${ns_a}" veth_A-R2 2000
+ mtu "${ns_r2}" veth_R2-A 2000
+ mtu "${ns_r2}" veth_R2-B 1500
+ mtu "${ns_b}" veth_B-R2 1500
+
+ fail=0
+
+ # Add 100 addresses for veth endpoint on B reached by default A route
+ for i in $(seq 100 199); do
+ run_cmd ${ns_b} ip addr add "${dst_prefix1}${i}" dev veth_B-R1
+ done
+
+ # Create 100 cached route exceptions for path via R1, one via R2
+ for i in $(seq 100 199); do
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst_prefix1}${i}"
+ done
+ run_cmd ${ns_a} ping -q -M want -i 0.1 -w 1 -s 1800 "${dst2}"
+ if [ "$(${ns_a} ip -oneline -6 route list cache | wc -l)" -ne 101 ]; then
+ err " can't list cached exceptions"
+ fail=1
+ fi
+
+ run_cmd ${ns_a} ip -6 route flush cache
+ pmtu1="$(route_get_dst_pmtu_from_exception "${ns_a}" "${dst_prefix1}100")"
+ pmtu2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ if [ -n "${pmtu1}" ] || [ -n "${pmtu2}" ] || \
+ [ -n "$(${ns_a} ip -6 route list cache)" ]; then
+ err " can't flush cached exceptions"
+ fail=1
+ fi
+
+ return ${fail}
+}
+
+# Create PMTU exceptions on the routed paths, then change the default route
+# in A and delete veth_A-R1 while the exceptions exist.
+# Arguments: $1 - IP family, 4 or 6.
+# Returns 2 if setup fails, 1 if an exception is not created with the
+# expected value, otherwise the status of the final device deletion.
+test_pmtu_ipvX_route_change() {
+ family=${1}
+
+ setup namespaces routing || return 2
+ trace "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \
+ "${ns_r1}" veth_R1-B "${ns_b}" veth_B-R1 \
+ "${ns_a}" veth_A-R2 "${ns_r2}" veth_R2-A \
+ "${ns_r2}" veth_R2-B "${ns_b}" veth_B-R2
+
+ if [ ${family} -eq 4 ]; then
+ ping=ping
+ dst1="${prefix4}.${b_r1}.1"
+ dst2="${prefix4}.${b_r2}.1"
+ gw="${prefix4}.${a_r1}.2"
+ else
+ ping=${ping6}
+ dst1="${prefix6}:${b_r1}::1"
+ dst2="${prefix6}:${b_r2}::1"
+ gw="${prefix6}:${a_r1}::2"
+ fi
+
+ # Set up initial MTU values
+ mtu "${ns_a}" veth_A-R1 2000
+ mtu "${ns_r1}" veth_R1-A 2000
+ mtu "${ns_r1}" veth_R1-B 1400
+ mtu "${ns_b}" veth_B-R1 1400
+
+ mtu "${ns_a}" veth_A-R2 2000
+ mtu "${ns_r2}" veth_R2-A 2000
+ mtu "${ns_r2}" veth_R2-B 1500
+ mtu "${ns_b}" veth_B-R2 1500
+
+ # Create route exceptions
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1}
+ run_cmd ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2}
+
+ # Check that exceptions have been created with the correct PMTU
+ pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})"
+ check_pmtu_value "1400" "${pmtu_1}" "exceeding MTU" || return 1
+ pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})"
+ check_pmtu_value "1500" "${pmtu_2}" "exceeding MTU" || return 1
+
+ # Replace the route from A to R1
+ run_cmd ${ns_a} ip route change default via ${gw}
+
+ # Delete the device in A
+ run_cmd ${ns_a} ip link del "veth_A-R1"
+}
+
+# Entry points for the route change test, one per IP family.
+test_pmtu_ipv4_route_change() {
+ test_pmtu_ipvX_route_change 4
+}
+
+test_pmtu_ipv6_route_change() {
+ test_pmtu_ipvX_route_change 6
+}
+
+usage() {
+ echo
+ echo "$0 [OPTIONS] [TEST]..."
+ echo "If no TEST argument is given, all tests will be run."
+ echo
+ echo "Options"
+ echo " --trace: capture traffic to TEST_INTERFACE.pcap"
+ echo
+ echo "Available tests${tests}"
+ exit 1
+}
+
+################################################################################
+# Main: parse options, check the requested test names, then run the tests
+# listed in ${tests}
+exitcode=0
+desc=0
+
+while getopts :ptv o
+do
+ case $o in
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=1;;
+ t) if which tcpdump > /dev/null 2>&1; then
+ TRACING=1
+ else
+ echo "=== tcpdump not available, tracing disabled"
+ fi
+ ;;
+ *) usage;;
+ esac
+done
+shift $(($OPTIND-1))
+
+# Split on newlines only from here on, so that the loop over ${tests} below
+# gets one whole line per iteration
+IFS="
+"
+
+for arg do
+ # Check first that all requested tests are available before running any
+ command -v > /dev/null "test_${arg}" || { echo "=== Test ${arg} not found"; usage; }
+done
+
+trap cleanup EXIT
+
+# start clean
+cleanup
+
+# Detect whether this "ip" supports the "nexthop" command
+HAVE_NH=no
+ip nexthop ls >/dev/null 2>&1
+[ $? -eq 0 ] && HAVE_NH=yes
+
+# ${tests} is consumed three lines at a time: test name, description, and a
+# third field that, when it is "1" and nexthop support was detected, requests
+# a second run through run_test_nh
+name=""
+desc=""
+rerun_nh=0
+for t in ${tests}; do
+ [ "${name}" = "" ] && name="${t}" && continue
+ [ "${desc}" = "" ] && desc="${t}" && continue
+
+ if [ "${HAVE_NH}" = "yes" ]; then
+ rerun_nh="${t}"
+ fi
+
+ # Run every test if no test names were given, otherwise only the ones
+ # named on the command line; arguments starting with "--" are ignored
+ run_this=1
+ for arg do
+ [ "${arg}" != "${arg#--*}" ] && continue
+ [ "${arg}" = "${name}" ] && run_this=1 && break
+ run_this=0
+ done
+ if [ $run_this -eq 1 ]; then
+ run_test "${name}" "${desc}"
+ # if test was skipped no need to retry with nexthop objects
+ [ $? -eq 2 ] && rerun_nh=0
+
+ if [ "${rerun_nh}" = "1" ]; then
+ run_test_nh "${name}" "${desc}"
+ fi
+ fi
+ name=""
+ desc=""
+ rerun_nh=0
+done
+
+exit ${exitcode}
diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
new file mode 100644
index 000000000..2c522f7a0
--- /dev/null
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013 Google Inc.
+ * Author: Willem de Bruijn (willemb@google.com)
+ *
+ * A basic test of packet socket fanout behavior.
+ *
+ * Control:
+ * - create fanout fails as expected with illegal flag combinations
+ * - join fanout fails as expected with diverging types or flags
+ *
+ * Datapath:
+ * Open a pair of packet sockets and a pair of INET sockets, send a known
+ * number of packets across the two INET sockets and count the number of
+ * packets enqueued onto the two packet sockets.
+ *
+ * The test currently runs for
+ * - PACKET_FANOUT_HASH
+ * - PACKET_FANOUT_HASH with PACKET_FANOUT_FLAG_ROLLOVER
+ * - PACKET_FANOUT_LB
+ * - PACKET_FANOUT_CPU
+ * - PACKET_FANOUT_ROLLOVER
+ * - PACKET_FANOUT_CBPF
+ * - PACKET_FANOUT_EBPF
+ *
+ * Todo:
+ * - functionality: PACKET_FANOUT_FLAG_DEFRAG
+ */
+
+#define _GNU_SOURCE /* for sched_setaffinity */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/unistd.h> /* for __NR_bpf */
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/if_packet.h>
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "psock_lib.h"
+
+#define RING_NUM_FRAMES 20
+
+/* Open a socket in a given fanout mode.
+ * @typeflags: fanout type and flags; placed in the upper 16 bits of the
+ * PACKET_FANOUT option value
+ * @group_id: fanout group id; placed in the lower 16 bits
+ * Exits the process if the socket cannot be created, "lo" cannot be
+ * resolved, or bind() fails.
+ * @return -1 if mode is bad, a valid socket otherwise */
+static int sock_fanout_open(uint16_t typeflags, uint16_t group_id)
+{
+ struct sockaddr_ll addr = {0};
+ int fd, val;
+
+ fd = socket(PF_PACKET, SOCK_RAW, 0);
+ if (fd < 0) {
+ perror("socket packet");
+ exit(1);
+ }
+
+ /* only let the test's own UDP packets through (see psock_lib.h) */
+ pair_udp_setfilter(fd);
+
+ addr.sll_family = AF_PACKET;
+ addr.sll_protocol = htons(ETH_P_IP);
+ addr.sll_ifindex = if_nametoindex("lo");
+ if (addr.sll_ifindex == 0) {
+ perror("if_nametoindex");
+ exit(1);
+ }
+ if (bind(fd, (void *) &addr, sizeof(addr))) {
+ perror("bind packet");
+ exit(1);
+ }
+
+ val = (((int) typeflags) << 16) | group_id;
+ if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val))) {
+ /* joining failed: release the socket and report -1 to the caller */
+ if (close(fd)) {
+ perror("close packet");
+ exit(1);
+ }
+ return -1;
+ }
+
+ return fd;
+}
+
+/* Install a two-instruction classic BPF program as the fanout data of @fd:
+ * it loads the byte at absolute offset 80 and returns it.
+ * Exits the process if setsockopt() fails.
+ */
+static void sock_fanout_set_cbpf(int fd)
+{
+ struct sock_filter bpf_filter[] = {
+ BPF_STMT(BPF_LD+BPF_B+BPF_ABS, 80), /* ldb [80] */
+ BPF_STMT(BPF_RET+BPF_A, 0), /* ret A */
+ };
+ struct sock_fprog bpf_prog;
+
+ bpf_prog.filter = bpf_filter;
+ bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
+
+ if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &bpf_prog,
+ sizeof(bpf_prog))) {
+ perror("fanout data cbpf");
+ exit(1);
+ }
+}
+
+/* Read back the PACKET_FANOUT option of @fd and split it into its two
+ * 16-bit halves: type/flags (upper half, stored in @typeflags) and group id
+ * (lower half, stored in @group_id).
+ * Exits the process if getsockopt() fails.
+ */
+static void sock_fanout_getopts(int fd, uint16_t *typeflags, uint16_t *group_id)
+{
+	int sockopt;
+	socklen_t sockopt_len = sizeof(sockopt);
+
+	if (getsockopt(fd, SOL_PACKET, PACKET_FANOUT,
+		       &sockopt, &sockopt_len)) {
+		perror("failed to getsockopt");
+		exit(1);
+	}
+	*typeflags = sockopt >> 16;
+	/* the group id is exactly the low 16 bits of the option value */
+	*group_id = sockopt & 0xffff;
+}
+
+/* Load a small eBPF socket-filter program and install it as the fanout data
+ * of @fd.
+ *
+ * The program reads skb->len; for packets of at least DATA_LEN bytes it
+ * loads the byte at absolute offset 0x50 and returns it when it equals
+ * DATA_CHAR or DATA_CHAR_1, and returns 0 for any other byte value. Shorter
+ * packets jump straight to the exit instruction.
+ *
+ * Exits the process if the program cannot be loaded (after dumping the
+ * verifier log), if setsockopt() fails, or if the program fd cannot be
+ * closed.
+ */
+static void sock_fanout_set_ebpf(int fd)
+{
+	static char log_buf[65536];
+
+	const int len_off = __builtin_offsetof(struct __sk_buff, len);
+	struct bpf_insn prog[] = {
+		{ BPF_ALU64 | BPF_MOV | BPF_X,   6, 1, 0, 0 },
+		{ BPF_LDX   | BPF_W   | BPF_MEM, 0, 6, len_off, 0 },
+		{ BPF_JMP   | BPF_JGE | BPF_K,   0, 0, 1, DATA_LEN },
+		{ BPF_JMP   | BPF_JA  | BPF_K,   0, 0, 4, 0 },
+		{ BPF_LD    | BPF_B   | BPF_ABS, 0, 0, 0, 0x50 },
+		{ BPF_JMP   | BPF_JEQ | BPF_K,   0, 0, 2, DATA_CHAR },
+		{ BPF_JMP   | BPF_JEQ | BPF_K,   0, 0, 1, DATA_CHAR_1 },
+		{ BPF_ALU   | BPF_MOV | BPF_K,   0, 0, 0, 0 },
+		{ BPF_JMP   | BPF_EXIT,          0, 0, 0, 0 }
+	};
+	union bpf_attr attr;
+	int pfd;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	attr.insns = (unsigned long) prog;
+	attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
+	attr.license = (unsigned long) "GPL";
+	/* each assignment is its own statement (these used to be chained
+	 * into the syscall line by stray commas)
+	 */
+	attr.log_buf = (unsigned long) log_buf;
+	attr.log_size = sizeof(log_buf);
+	attr.log_level = 1;
+
+	pfd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+	if (pfd < 0) {
+		perror("bpf");
+		fprintf(stderr, "bpf verifier:\n%s\n", log_buf);
+		exit(1);
+	}
+
+	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &pfd, sizeof(pfd))) {
+		perror("fanout data ebpf");
+		exit(1);
+	}
+
+	/* the program fd is no longer needed once it has been attached */
+	if (close(pfd)) {
+		perror("close ebpf");
+		exit(1);
+	}
+}
+
+/* Switch @fd to TPACKET_V2, configure an RX ring of RING_NUM_FRAMES blocks
+ * with one page-sized frame per block, and mmap() it.
+ * Exits the process on any failure; otherwise returns the mapped ring.
+ */
+static char *sock_fanout_open_ring(int fd)
+{
+ struct tpacket_req req = {
+ .tp_block_size = getpagesize(),
+ .tp_frame_size = getpagesize(),
+ .tp_block_nr = RING_NUM_FRAMES,
+ .tp_frame_nr = RING_NUM_FRAMES,
+ };
+ char *ring;
+ int val = TPACKET_V2;
+
+ if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,
+ sizeof(val))) {
+ perror("packetsock ring setsockopt version");
+ exit(1);
+ }
+ if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req,
+ sizeof(req))) {
+ perror("packetsock ring setsockopt");
+ exit(1);
+ }
+
+ /* map the whole ring: block size times number of blocks */
+ ring = mmap(0, req.tp_block_size * req.tp_block_nr,
+ PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (ring == MAP_FAILED) {
+ perror("packetsock ring mmap");
+ exit(1);
+ }
+
+ return ring;
+}
+
+/* Count the leading frames of @ring whose header has TP_STATUS_USER set,
+ * stepping one page per frame and stopping at RING_NUM_FRAMES.
+ * @fd is not used. Returns the number of such frames.
+ */
+static int sock_fanout_read_ring(int fd, void *ring)
+{
+ struct tpacket2_hdr *header = ring;
+ int count = 0;
+
+ while (count < RING_NUM_FRAMES && header->tp_status & TP_STATUS_USER) {
+ count++;
+ /* arithmetic on void * relies on the GNU C extension */
+ header = ring + (count * getpagesize());
+ }
+
+ return count;
+}
+
+/* Read the fill level of both rings and compare the pair against @expect.
+ * The two counts are accepted in either order.
+ * Returns 0 on a match, 1 (after printing a warning) otherwise.
+ */
+static int sock_fanout_read(int fds[], char *rings[], const int expect[])
+{
+ int ret[2];
+
+ ret[0] = sock_fanout_read_ring(fds[0], rings[0]);
+ ret[1] = sock_fanout_read_ring(fds[1], rings[1]);
+
+ fprintf(stderr, "info: count=%d,%d, expect=%d,%d\n",
+ ret[0], ret[1], expect[0], expect[1]);
+
+ /* fail only if neither the straight nor the swapped order matches */
+ if ((!(ret[0] == expect[0] && ret[1] == expect[1])) &&
+ (!(ret[0] == expect[1] && ret[1] == expect[0]))) {
+ fprintf(stderr, "warning: incorrect queue lengths\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Test illegal mode + flag combination: opening a socket with both
+ * PACKET_FANOUT_ROLLOVER and PACKET_FANOUT_FLAG_ROLLOVER must fail.
+ * Exits the process if the open unexpectedly succeeds.
+ */
+static void test_control_single(void)
+{
+ fprintf(stderr, "test: control single socket\n");
+
+ if (sock_fanout_open(PACKET_FANOUT_ROLLOVER |
+ PACKET_FANOUT_FLAG_ROLLOVER, 0) != -1) {
+ fprintf(stderr, "ERROR: opened socket with dual rollover\n");
+ exit(1);
+ }
+}
+
+/* Test illegal group with different modes or flags: once a HASH group with
+ * id 0 exists, joining it with extra flags or another mode must fail, while
+ * joining with identical settings must succeed.
+ * Exits the process on any unexpected result.
+ */
+static void test_control_group(void)
+{
+ int fds[2];
+
+ fprintf(stderr, "test: control multiple sockets\n");
+
+ /* create the reference group */
+ fds[0] = sock_fanout_open(PACKET_FANOUT_HASH, 0);
+ if (fds[0] == -1) {
+ fprintf(stderr, "ERROR: failed to open HASH socket\n");
+ exit(1);
+ }
+ if (sock_fanout_open(PACKET_FANOUT_HASH |
+ PACKET_FANOUT_FLAG_DEFRAG, 0) != -1) {
+ fprintf(stderr, "ERROR: joined group with wrong flag defrag\n");
+ exit(1);
+ }
+ if (sock_fanout_open(PACKET_FANOUT_HASH |
+ PACKET_FANOUT_FLAG_ROLLOVER, 0) != -1) {
+ fprintf(stderr, "ERROR: joined group with wrong flag ro\n");
+ exit(1);
+ }
+ if (sock_fanout_open(PACKET_FANOUT_CPU, 0) != -1) {
+ fprintf(stderr, "ERROR: joined group with wrong mode\n");
+ exit(1);
+ }
+ /* identical type and flags: this join must succeed */
+ fds[1] = sock_fanout_open(PACKET_FANOUT_HASH, 0);
+ if (fds[1] == -1) {
+ fprintf(stderr, "ERROR: failed to join group\n");
+ exit(1);
+ }
+ if (close(fds[1]) || close(fds[0])) {
+ fprintf(stderr, "ERROR: closing sockets\n");
+ exit(1);
+ }
+}
+
+/* Test creating unique fanout group ids with PACKET_FANOUT_FLAG_UNIQUEID.
+ * Exits the process on any unexpected result.
+ */
+static void test_unique_fanout_group_ids(void)
+{
+ int fds[3];
+ uint16_t typeflags, first_group_id, second_group_id;
+
+ fprintf(stderr, "test: unique ids\n");
+
+ fds[0] = sock_fanout_open(PACKET_FANOUT_HASH |
+ PACKET_FANOUT_FLAG_UNIQUEID, 0);
+ if (fds[0] == -1) {
+ fprintf(stderr, "ERROR: failed to create a unique id group.\n");
+ exit(1);
+ }
+
+ /* the value read back is expected to be the bare type, without the
+ * UNIQUEID flag
+ */
+ sock_fanout_getopts(fds[0], &typeflags, &first_group_id);
+ if (typeflags != PACKET_FANOUT_HASH) {
+ fprintf(stderr, "ERROR: unexpected typeflags %x\n", typeflags);
+ exit(1);
+ }
+
+ if (sock_fanout_open(PACKET_FANOUT_CPU, first_group_id) != -1) {
+ fprintf(stderr, "ERROR: joined group with wrong type.\n");
+ exit(1);
+ }
+
+ /* joining the first group by its id with a matching type must work */
+ fds[1] = sock_fanout_open(PACKET_FANOUT_HASH, first_group_id);
+ if (fds[1] == -1) {
+ fprintf(stderr,
+ "ERROR: failed to join previously created group.\n");
+ exit(1);
+ }
+
+ fds[2] = sock_fanout_open(PACKET_FANOUT_HASH |
+ PACKET_FANOUT_FLAG_UNIQUEID, 0);
+ if (fds[2] == -1) {
+ fprintf(stderr,
+ "ERROR: failed to create a second unique id group.\n");
+ exit(1);
+ }
+
+ /* requesting a unique id while also passing an explicit id must fail */
+ sock_fanout_getopts(fds[2], &typeflags, &second_group_id);
+ if (sock_fanout_open(PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_UNIQUEID,
+ second_group_id) != -1) {
+ fprintf(stderr,
+ "ERROR: specified a group id when requesting unique id\n");
+ exit(1);
+ }
+
+ if (close(fds[0]) || close(fds[1]) || close(fds[2])) {
+ fprintf(stderr, "ERROR: closing sockets\n");
+ exit(1);
+ }
+}
+
+/* Run one datapath test for the fanout mode in @typeflags.
+ *
+ * Two packet sockets join fanout group 0, each with its own RX ring. Two UDP
+ * socket pairs are opened on PORT_BASE and PORT_BASE + @port_off. Ring fill
+ * levels are checked three times: before any traffic (expect 0,0), after a
+ * first batch (@expect1) and after a second batch (@expect2).
+ *
+ * Returns non-zero if the @expect1 or @expect2 check failed; the result of
+ * the initial empty-ring check is not included. Exits the process on setup
+ * or teardown errors.
+ */
+static int test_datapath(uint16_t typeflags, int port_off,
+ const int expect1[], const int expect2[])
+{
+ const int expect0[] = { 0, 0 };
+ char *rings[2];
+ uint8_t type = typeflags & 0xFF;
+ int fds[2], fds_udp[2][2], ret;
+
+ fprintf(stderr, "\ntest: datapath 0x%hx ports %hu,%hu\n",
+ typeflags, (uint16_t)PORT_BASE,
+ (uint16_t)(PORT_BASE + port_off));
+
+ fds[0] = sock_fanout_open(typeflags, 0);
+ fds[1] = sock_fanout_open(typeflags, 0);
+ if (fds[0] == -1 || fds[1] == -1) {
+ fprintf(stderr, "ERROR: failed open\n");
+ exit(1);
+ }
+ /* the BPF modes need a program; it is set through the first socket */
+ if (type == PACKET_FANOUT_CBPF)
+ sock_fanout_set_cbpf(fds[0]);
+ else if (type == PACKET_FANOUT_EBPF)
+ sock_fanout_set_ebpf(fds[0]);
+
+ rings[0] = sock_fanout_open_ring(fds[0]);
+ rings[1] = sock_fanout_open_ring(fds[1]);
+ pair_udp_open(fds_udp[0], PORT_BASE);
+ pair_udp_open(fds_udp[1], PORT_BASE + port_off);
+ sock_fanout_read(fds, rings, expect0);
+
+ /* Send data, but not enough to overflow a queue */
+ pair_udp_send(fds_udp[0], 15);
+ pair_udp_send_char(fds_udp[1], 5, DATA_CHAR_1);
+ ret = sock_fanout_read(fds, rings, expect1);
+
+ /* Send more data, overflow the queue */
+ pair_udp_send_char(fds_udp[0], 15, DATA_CHAR_1);
+ /* TODO: ensure consistent order between expect1 and expect2 */
+ ret |= sock_fanout_read(fds, rings, expect2);
+
+ if (munmap(rings[1], RING_NUM_FRAMES * getpagesize()) ||
+ munmap(rings[0], RING_NUM_FRAMES * getpagesize())) {
+ fprintf(stderr, "close rings\n");
+ exit(1);
+ }
+ if (close(fds_udp[1][1]) || close(fds_udp[1][0]) ||
+ close(fds_udp[0][1]) || close(fds_udp[0][0]) ||
+ close(fds[1]) || close(fds[0])) {
+ fprintf(stderr, "close datapath\n");
+ exit(1);
+ }
+
+ return ret;
+}
+
+/* Pin the calling process to CPU @cpuid.
+ * Returns 0 on success and 1 if sched_setaffinity() fails with EINVAL;
+ * exits the process on any other error.
+ */
+static int set_cpuaffinity(int cpuid)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpuid, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask)) {
+ /* EINVAL is tolerated so callers can skip a CPU-specific test */
+ if (errno != EINVAL) {
+ fprintf(stderr, "setaffinity %d\n", cpuid);
+ exit(1);
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+/* Run the control tests, then the datapath test for each fanout mode.
+ * Each expect_* table holds the expected ring fill levels after the first
+ * and after the second batch of packets.
+ * Returns 0 if every datapath check passed, 1 otherwise.
+ */
+int main(int argc, char **argv)
+{
+ const int expect_hash[2][2] = { { 15, 5 }, { 20, 5 } };
+ const int expect_hash_rb[2][2] = { { 15, 5 }, { 20, 15 } };
+ const int expect_lb[2][2] = { { 10, 10 }, { 18, 17 } };
+ const int expect_rb[2][2] = { { 15, 5 }, { 20, 15 } };
+ const int expect_cpu0[2][2] = { { 20, 0 }, { 20, 0 } };
+ const int expect_cpu1[2][2] = { { 0, 20 }, { 0, 20 } };
+ const int expect_bpf[2][2] = { { 15, 5 }, { 15, 20 } };
+ const int expect_uniqueid[2][2] = { { 20, 20}, { 20, 20 } };
+ int port_off = 2, tries = 20, ret;
+
+ test_control_single();
+ test_control_group();
+ test_unique_fanout_group_ids();
+
+ /* find a set of ports that do not collide onto the same socket */
+ ret = test_datapath(PACKET_FANOUT_HASH, port_off,
+ expect_hash[0], expect_hash[1]);
+ while (ret) {
+ fprintf(stderr, "info: trying alternate ports (%d)\n", tries);
+ ret = test_datapath(PACKET_FANOUT_HASH, ++port_off,
+ expect_hash[0], expect_hash[1]);
+ if (!--tries) {
+ fprintf(stderr, "too many collisions\n");
+ return 1;
+ }
+ }
+
+ /* the remaining modes reuse the port offset found above */
+ ret |= test_datapath(PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER,
+ port_off, expect_hash_rb[0], expect_hash_rb[1]);
+ ret |= test_datapath(PACKET_FANOUT_LB,
+ port_off, expect_lb[0], expect_lb[1]);
+ ret |= test_datapath(PACKET_FANOUT_ROLLOVER,
+ port_off, expect_rb[0], expect_rb[1]);
+
+ ret |= test_datapath(PACKET_FANOUT_CBPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
+ ret |= test_datapath(PACKET_FANOUT_EBPF,
+ port_off, expect_bpf[0], expect_bpf[1]);
+
+ set_cpuaffinity(0);
+ ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
+ expect_cpu0[0], expect_cpu0[1]);
+ /* the CPU 1 variant only runs if pinning to CPU 1 succeeded */
+ if (!set_cpuaffinity(1))
+ /* TODO: test that choice alternates with previous */
+ ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
+ expect_cpu1[0], expect_cpu1[1]);
+
+ ret |= test_datapath(PACKET_FANOUT_FLAG_UNIQUEID, port_off,
+ expect_uniqueid[0], expect_uniqueid[1]);
+
+ if (ret)
+ return 1;
+
+ printf("OK. All tests passed\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/psock_lib.h b/tools/testing/selftests/net/psock_lib.h
new file mode 100644
index 000000000..faa884385
--- /dev/null
+++ b/tools/testing/selftests/net/psock_lib.h
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2013 Google Inc.
+ * Author: Willem de Bruijn <willemb@google.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ */
+
+#ifndef PSOCK_LIB_H
+#define PSOCK_LIB_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+
+#define DATA_LEN 100
+#define DATA_CHAR 'a'
+#define DATA_CHAR_1 'b'
+
+#define PORT_BASE 8000
+
+#ifndef __maybe_unused
+# define __maybe_unused __attribute__ ((__unused__))
+#endif
+
+/* Attach a classic BPF socket filter to @fd that only accepts the UDP test
+ * packets described below. Exits the process if setsockopt() fails.
+ */
+static __maybe_unused void pair_udp_setfilter(int fd)
+{
+ /* the filter below checks for all of the following conditions that
+ * are based on the contents of create_payload()
+ * ether type 0x800 and
+ * ip proto udp and
+ * skb->len >= DATA_LEN and
+ * udp[38] == 'a' or udp[38] == 'b'
+ * It can be generated from the following bpf_asm input:
+ * ldh [12]
+ * jne #0x800, drop ; ETH_P_IP
+ * ldb [23]
+ * jneq #17, drop ; IPPROTO_UDP
+ * ld len ; ld skb->len
+ * jlt #100, drop ; DATA_LEN
+ * ldb [80]
+ * jeq #97, pass ; DATA_CHAR
+ * jne #98, drop ; DATA_CHAR_1
+ * pass:
+ * ret #-1
+ * drop:
+ * ret #0
+ */
+ struct sock_filter bpf_filter[] = {
+ { 0x28, 0, 0, 0x0000000c },
+ { 0x15, 0, 8, 0x00000800 },
+ { 0x30, 0, 0, 0x00000017 },
+ { 0x15, 0, 6, 0x00000011 },
+ { 0x80, 0, 0, 0000000000 },
+ { 0x35, 0, 4, 0x00000064 },
+ { 0x30, 0, 0, 0x00000050 },
+ { 0x15, 1, 0, 0x00000061 },
+ { 0x15, 0, 1, 0x00000062 },
+ { 0x06, 0, 0, 0xffffffff },
+ { 0x06, 0, 0, 0000000000 },
+ };
+ struct sock_fprog bpf_prog;
+
+ bpf_prog.filter = bpf_filter;
+ bpf_prog.len = sizeof(bpf_filter) / sizeof(struct sock_filter);
+
+ if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &bpf_prog,
+ sizeof(bpf_prog))) {
+ perror("setsockopt SO_ATTACH_FILTER");
+ exit(1);
+ }
+}
+
+/* Open a connected pair of loopback UDP sockets.
+ * fds[0] is bound to @port and connected to @port + 1; fds[1] is bound to
+ * @port + 1. Exits the process on any failure.
+ */
+static __maybe_unused void pair_udp_open(int fds[], uint16_t port)
+{
+ struct sockaddr_in saddr, daddr;
+
+ fds[0] = socket(PF_INET, SOCK_DGRAM, 0);
+ fds[1] = socket(PF_INET, SOCK_DGRAM, 0);
+ if (fds[0] == -1 || fds[1] == -1) {
+ fprintf(stderr, "ERROR: socket dgram\n");
+ exit(1);
+ }
+
+ memset(&saddr, 0, sizeof(saddr));
+ saddr.sin_family = AF_INET;
+ saddr.sin_port = htons(port);
+ saddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+ memset(&daddr, 0, sizeof(daddr));
+ daddr.sin_family = AF_INET;
+ daddr.sin_port = htons(port + 1);
+ daddr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+ /* must bind both to get consistent hash result */
+ if (bind(fds[1], (void *) &daddr, sizeof(daddr))) {
+ perror("bind");
+ exit(1);
+ }
+ if (bind(fds[0], (void *) &saddr, sizeof(saddr))) {
+ perror("bind");
+ exit(1);
+ }
+ if (connect(fds[0], (void *) &daddr, sizeof(daddr))) {
+ perror("connect");
+ exit(1);
+ }
+}
+
+/* Send @num datagrams of DATA_LEN bytes, each filled with @payload, from
+ * fds[0] and read each one back on fds[1], verifying length and content.
+ * Exits the process on a short write, short read or data mismatch.
+ */
+static __maybe_unused void pair_udp_send_char(int fds[], int num, char payload)
+{
+ char buf[DATA_LEN], rbuf[DATA_LEN];
+
+ memset(buf, payload, sizeof(buf));
+ while (num--) {
+ /* Should really handle EINTR and EAGAIN */
+ if (write(fds[0], buf, sizeof(buf)) != sizeof(buf)) {
+ fprintf(stderr, "ERROR: send failed left=%d\n", num);
+ exit(1);
+ }
+ if (read(fds[1], rbuf, sizeof(rbuf)) != sizeof(rbuf)) {
+ fprintf(stderr, "ERROR: recv failed left=%d\n", num);
+ exit(1);
+ }
+ if (memcmp(buf, rbuf, sizeof(buf))) {
+ fprintf(stderr, "ERROR: data failed left=%d\n", num);
+ exit(1);
+ }
+ }
+}
+
+/* Send @num datagrams filled with the default payload byte DATA_CHAR over
+ * the socket pair @fds; see pair_udp_send_char() for the failure behavior.
+ */
+static __maybe_unused void pair_udp_send(int fds[], int num)
+{
+	/* plain call: a void function must not 'return' an expression in C */
+	pair_udp_send_char(fds, num, DATA_CHAR);
+}
+
+/* Close both sockets of a pair; close() errors are ignored. */
+static __maybe_unused void pair_udp_close(int fds[])
+{
+ close(fds[0]);
+ close(fds[1]);
+}
+
+#endif /* PSOCK_LIB_H */
diff --git a/tools/testing/selftests/net/psock_snd.c b/tools/testing/selftests/net/psock_snd.c
new file mode 100644
index 000000000..7d15e10a9
--- /dev/null
+++ b/tools/testing/selftests/net/psock_snd.c
@@ -0,0 +1,397 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/virtio_net.h>
+#include <net/if.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "psock_lib.h"
+
+static bool cfg_use_bind;
+static bool cfg_use_csum_off;
+static bool cfg_use_csum_off_bad;
+static bool cfg_use_dgram;
+static bool cfg_use_gso;
+static bool cfg_use_qdisc_bypass;
+static bool cfg_use_vlan;
+static bool cfg_use_vnet;
+
+static char *cfg_ifname = "lo";
+static int cfg_mtu = 1500;
+static int cfg_payload_len = DATA_LEN;
+static int cfg_truncate_len = INT_MAX;
+static uint16_t cfg_port = 8000;
+
+/* test sending up to max mtu + 1 */
+#define TEST_SZ (sizeof(struct virtio_net_hdr) + ETH_HLEN + ETH_MAX_MTU + 1)
+
+static char tbuf[TEST_SZ], rbuf[TEST_SZ];
+
+/* Add up @num_u16 consecutive 16-bit words starting at @start.
+ * The sum is returned unfolded; carries above bit 15 are kept.
+ */
+static unsigned long add_csum_hword(const uint16_t *start, int num_u16)
+{
+	const uint16_t *word = start;
+	unsigned long total = 0;
+	int left;
+
+	for (left = num_u16; left > 0; left--)
+		total += *word++;
+
+	return total;
+}
+
+/* Compute a 16-bit ones'-complement checksum: add the @num_u16 words at
+ * @start to the initial value @sum, fold the carries and invert the result.
+ */
+static uint16_t build_ip_csum(const uint16_t *start, int num_u16,
+			      unsigned long sum)
+{
+	unsigned long acc = sum + add_csum_hword(start, num_u16);
+
+	/* fold carries back into the low 16 bits until none remain */
+	for (; acc >> 16; )
+		acc = (acc >> 16) + (acc & 0xffff);
+
+	return ~acc;
+}
+
+/* Fill in the struct virtio_net_hdr at @header according to the cfg_use_*
+ * flags. Fields that are not set here keep whatever value the buffer held.
+ * Returns the size of the header.
+ */
+static int build_vnet_header(void *header)
+{
+ struct virtio_net_hdr *vh = header;
+
+ vh->hdr_len = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct udphdr);
+
+ if (cfg_use_csum_off) {
+ vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ vh->csum_start = ETH_HLEN + sizeof(struct iphdr);
+ vh->csum_offset = __builtin_offsetof(struct udphdr, check);
+
+ /* position check field exactly one byte beyond end of packet */
+ if (cfg_use_csum_off_bad)
+ vh->csum_start += sizeof(struct udphdr) + cfg_payload_len -
+ vh->csum_offset - 1;
+ }
+
+ if (cfg_use_gso) {
+ vh->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ vh->gso_size = cfg_mtu - sizeof(struct iphdr);
+ }
+
+ return sizeof(*vh);
+}
+
+/* Write the Ethernet protocol field at @header; MAC addresses are not
+ * touched. With cfg_use_vlan the frame is marked 802.1Q and ETH_P_IP is
+ * written into the second 16-bit word after the Ethernet header.
+ * Returns the number of header bytes: ETH_HLEN, or ETH_HLEN + 4 with VLAN.
+ */
+static int build_eth_header(void *header)
+{
+ struct ethhdr *eth = header;
+
+ if (cfg_use_vlan) {
+ /* tag[0] is left untouched; only the encapsulated protocol is set */
+ uint16_t *tag = header + ETH_HLEN;
+
+ eth->h_proto = htons(ETH_P_8021Q);
+ tag[1] = htons(ETH_P_IP);
+ return ETH_HLEN + 4;
+ }
+
+ eth->h_proto = htons(ETH_P_IP);
+ return ETH_HLEN;
+}
+
+/* Fill in a 20-byte IPv4 header at @header for a UDP datagram carrying
+ * @payload_len bytes, from 172.17.0.2 to 172.17.0.1.
+ * Returns the header length in bytes.
+ */
+static int build_ipv4_header(void *header, int payload_len)
+{
+ struct iphdr *iph = header;
+
+ iph->ihl = 5;
+ iph->version = 4;
+ iph->ttl = 8;
+ iph->tot_len = htons(sizeof(*iph) + sizeof(struct udphdr) + payload_len);
+ iph->id = htons(1337);
+ iph->protocol = IPPROTO_UDP;
+ iph->saddr = htonl((172 << 24) | (17 << 16) | 2);
+ iph->daddr = htonl((172 << 24) | (17 << 16) | 1);
+ /* ihl counts 32-bit words, so ihl << 1 is the header size in 16-bit
+ * words. NOTE(review): assumes iph->check is still zero at this point.
+ */
+ iph->check = build_ip_csum((void *) iph, iph->ihl << 1, 0);
+
+ return iph->ihl << 2;
+}
+
+/* Fill in the UDP header at @header for @payload_len bytes of payload,
+ * source port 9 and destination port cfg_port.
+ * Returns the size of the UDP header.
+ */
+static int build_udp_header(void *header, int payload_len)
+{
+ const int alen = sizeof(uint32_t);
+ struct udphdr *udph = header;
+ int len = sizeof(*udph) + payload_len;
+
+ udph->source = htons(9);
+ udph->dest = htons(cfg_port);
+ udph->len = htons(len);
+
+ /* With checksum offload the field is seeded from the 8 bytes right
+ * before this header (source and destination address when an IPv4
+ * header without options precedes it) plus protocol and UDP length.
+ */
+ if (cfg_use_csum_off)
+ udph->check = build_ip_csum(header - (2 * alen), alen,
+ htons(IPPROTO_UDP) + udph->len);
+ else
+ udph->check = 0;
+
+ return sizeof(*udph);
+}
+
+/* Build the complete test packet in tbuf: virtio-net header, Ethernet
+ * header, IPv4 header, UDP header and @payload_len bytes of DATA_CHAR.
+ * Terminates via error() if the packet does not fit in tbuf.
+ * Returns the total number of bytes written, vnet header included.
+ */
+static int build_packet(int payload_len)
+{
+ int off = 0;
+
+ off += build_vnet_header(tbuf);
+ off += build_eth_header(tbuf + off);
+ off += build_ipv4_header(tbuf + off, payload_len);
+ off += build_udp_header(tbuf + off, payload_len);
+
+ if (off + payload_len > sizeof(tbuf))
+ error(1, 0, "payload length exceeds max");
+
+ memset(tbuf + off, DATA_CHAR, payload_len);
+
+ return off + payload_len;
+}
+
+/* Bind packet socket @fd to interface cfg_ifname for ETH_P_IP.
+ * Terminates via error() if the interface lookup or bind() fails.
+ */
+static void do_bind(int fd)
+{
+ struct sockaddr_ll laddr = {0};
+
+ laddr.sll_family = AF_PACKET;
+ laddr.sll_protocol = htons(ETH_P_IP);
+ laddr.sll_ifindex = if_nametoindex(cfg_ifname);
+ if (!laddr.sll_ifindex)
+ error(1, errno, "if_nametoindex");
+
+ if (bind(fd, (void *)&laddr, sizeof(laddr)))
+ error(1, errno, "bind");
+}
+
+/* Transmit @len bytes of @buf on packet socket @fd.
+ *
+ * @buf starts with a virtio-net header followed by the Ethernet header;
+ * either is skipped when the socket is not configured to take it (no
+ * PACKET_VNET_HDR, or SOCK_DGRAM). A bound socket uses write(), an unbound
+ * one sendto() with the interface index of cfg_ifname.
+ * Terminates via error() on failure or on a short write.
+ */
+static void do_send(int fd, char *buf, int len)
+{
+	int ret;
+
+	if (!cfg_use_vnet) {
+		buf += sizeof(struct virtio_net_hdr);
+		len -= sizeof(struct virtio_net_hdr);
+	}
+	if (cfg_use_dgram) {
+		buf += ETH_HLEN;
+		len -= ETH_HLEN;
+	}
+
+	if (cfg_use_bind) {
+		ret = write(fd, buf, len);
+	} else {
+		struct sockaddr_ll laddr = {0};
+
+		laddr.sll_protocol = htons(ETH_P_IP);
+		laddr.sll_ifindex = if_nametoindex(cfg_ifname);
+		if (!laddr.sll_ifindex)
+			error(1, errno, "if_nametoindex");
+
+		ret = sendto(fd, buf, len, 0, (void *)&laddr, sizeof(laddr));
+	}
+
+	if (ret == -1)
+		error(1, errno, "write");
+	/* ret and len are signed ints: print them with %d, not %u */
+	if (ret != len)
+		error(1, 0, "write: %d %d", ret, len);
+
+	fprintf(stderr, "tx: %d\n", ret);
+}
+
+/* Open a packet socket (SOCK_DGRAM or SOCK_RAW depending on cfg_use_dgram),
+ * apply the configured socket options, build the test packet and send it,
+ * truncated to cfg_truncate_len if that is shorter.
+ * Returns the length handed to do_send(), vnet header included.
+ * Terminates via error() on any failure.
+ */
+static int do_tx(void)
+{
+ const int one = 1;
+ int fd, len;
+
+ fd = socket(PF_PACKET, cfg_use_dgram ? SOCK_DGRAM : SOCK_RAW, 0);
+ if (fd == -1)
+ error(1, errno, "socket t");
+
+ if (cfg_use_bind)
+ do_bind(fd);
+
+ if (cfg_use_qdisc_bypass &&
+ setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one)))
+ error(1, errno, "setsockopt qdisc bypass");
+
+ if (cfg_use_vnet &&
+ setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one)))
+ error(1, errno, "setsockopt vnet");
+
+ len = build_packet(cfg_payload_len);
+
+ if (cfg_truncate_len < len)
+ len = cfg_truncate_len;
+
+ do_send(fd, tbuf, len);
+
+ if (close(fd))
+ error(1, errno, "close t");
+
+ return len;
+}
+
+/* Open the UDP receive socket: bound to cfg_port on INADDR_ANY, with a
+ * 100 ms receive timeout. Returns the socket; terminates via error() on
+ * failure.
+ */
+static int setup_rx(void)
+{
+ struct timeval tv = { .tv_usec = 100 * 1000 };
+ struct sockaddr_in raddr = {0};
+ int fd;
+
+ fd = socket(PF_INET, SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket r");
+
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+ error(1, errno, "setsockopt rcv timeout");
+
+ raddr.sin_family = AF_INET;
+ raddr.sin_port = htons(cfg_port);
+ raddr.sin_addr.s_addr = htonl(INADDR_ANY);
+
+ if (bind(fd, (void *)&raddr, sizeof(raddr)))
+ error(1, errno, "bind r");
+
+ return fd;
+}
+
+/* Receive one packet on @fd into rbuf and verify that it is exactly
+ * @expected_len bytes long and byte-identical to @expected.
+ * Terminates via error() on receive failure, length or data mismatch.
+ */
+static void do_rx(int fd, int expected_len, char *expected)
+{
+	int ret;
+
+	ret = recv(fd, rbuf, sizeof(rbuf), 0);
+	if (ret == -1)
+		error(1, errno, "recv");
+	/* both values are signed ints: print them with %d, not %u */
+	if (ret != expected_len)
+		error(1, 0, "recv: %d != %d", ret, expected_len);
+
+	if (memcmp(rbuf, expected, ret))
+		error(1, 0, "recv: data mismatch");
+
+	fprintf(stderr, "rx: %d\n", ret);
+}
+
+/* Open a raw packet socket with a 100 ms receive timeout, attach the test
+ * BPF filter and bind it to cfg_ifname. Returns the socket; terminates via
+ * error() on failure.
+ */
+static int setup_sniffer(void)
+{
+ struct timeval tv = { .tv_usec = 100 * 1000 };
+ int fd;
+
+ fd = socket(PF_PACKET, SOCK_RAW, 0);
+ if (fd == -1)
+ error(1, errno, "socket p");
+
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+ error(1, errno, "setsockopt rcv timeout");
+
+ pair_udp_setfilter(fd);
+ do_bind(fd);
+
+ return fd;
+}
+
+/* Parse the command line into the cfg_* globals and reject inconsistent
+ * option combinations.
+ *
+ * -b bind the tx socket          -c request checksum offload
+ * -C bad checksum offset         -d use SOCK_DGRAM
+ * -g request GSO                 -l <len> payload length
+ * -q qdisc bypass                -t <len> truncate the sent packet
+ * -v send a virtio-net header    -V add a VLAN tag
+ *
+ * Terminates via error() on an unknown option or a conflicting combination.
+ * Numeric arguments are converted with strtoul() without validation.
+ */
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "bcCdgl:qt:vV")) != -1) {
+ switch (c) {
+ case 'b':
+ cfg_use_bind = true;
+ break;
+ case 'c':
+ cfg_use_csum_off = true;
+ break;
+ case 'C':
+ cfg_use_csum_off_bad = true;
+ break;
+ case 'd':
+ cfg_use_dgram = true;
+ break;
+ case 'g':
+ cfg_use_gso = true;
+ break;
+ case 'l':
+ cfg_payload_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'q':
+ cfg_use_qdisc_bypass = true;
+ break;
+ case 't':
+ cfg_truncate_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'v':
+ cfg_use_vnet = true;
+ break;
+ case 'V':
+ cfg_use_vlan = true;
+ break;
+ default:
+ error(1, 0, "%s: parse error", argv[0]);
+ }
+ }
+
+ if (cfg_use_vlan && cfg_use_dgram)
+ error(1, 0, "option vlan (-V) conflicts with dgram (-d)");
+
+ if (cfg_use_csum_off && !cfg_use_vnet)
+ error(1, 0, "option csum offload (-c) requires vnet (-v)");
+
+ if (cfg_use_csum_off_bad && !cfg_use_csum_off)
+ error(1, 0, "option csum bad (-C) requires csum offload (-c)");
+
+ if (cfg_use_gso && !cfg_use_csum_off)
+ error(1, 0, "option gso (-g) requires csum offload (-c)");
+}
+
+/* Run one send/receive round: set up the UDP receiver and the packet
+ * sniffer, transmit the test packet, then verify what each of them saw.
+ * Terminates via error() on any failure.
+ */
+static void run_test(void)
+{
+ int fdr, fds, total_len;
+
+ fdr = setup_rx();
+ fds = setup_sniffer();
+
+ total_len = do_tx();
+
+ /* BPF filter accepts only this length, vlan changes MAC */
+ if (cfg_payload_len == DATA_LEN && !cfg_use_vlan)
+ do_rx(fds, total_len - sizeof(struct virtio_net_hdr),
+ tbuf + sizeof(struct virtio_net_hdr));
+
+ /* the UDP socket must see exactly the payload bytes at the end of tbuf */
+ do_rx(fdr, cfg_payload_len, tbuf + total_len - cfg_payload_len);
+
+ if (close(fds))
+ error(1, errno, "close s");
+ if (close(fdr))
+ error(1, errno, "close r");
+}
+
+/* Parse options, configure the loopback device (MTU 1500, address
+ * 172.17.0.1/24) through the ip tool, run the test and print "OK".
+ */
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+
+ /* NOTE(review): when the command itself exits non-zero, system() does
+ * not set errno, so the errno text printed by error() may be unrelated.
+ */
+ if (system("ip link set dev lo mtu 1500"))
+ error(1, errno, "ip link set mtu");
+ if (system("ip addr add dev lo 172.17.0.1/24"))
+ error(1, errno, "ip addr add");
+
+ run_test();
+
+ fprintf(stderr, "OK\n\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/psock_snd.sh b/tools/testing/selftests/net/psock_snd.sh
new file mode 100755
index 000000000..170be65e0
--- /dev/null
+++ b/tools/testing/selftests/net/psock_snd.sh
@@ -0,0 +1,98 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of packet socket send regression tests
+#
+# Each case runs ./psock_snd through ./in_netns.sh. Cases marked "expected to
+# fail" are written as '(! cmd)': the negated command runs in a subshell, so
+# an unexpected success gives the subshell a non-zero status and 'set -e'
+# aborts the script.
+
+set -e
+
+# Header and MTU sizes used to derive the payload lengths passed with -l
+readonly mtu=1500
+readonly iphlen=20
+readonly udphlen=8
+
+readonly vnet_hlen=10
+readonly eth_hlen=14
+
+# Largest UDP payload that fits in one MTU, and one byte more
+readonly mss="$((${mtu} - ${iphlen} - ${udphlen}))"
+readonly mss_exceeds="$((${mss} + 1))"
+
+# Same for the largest packet size used by the gso cases
+readonly max_mtu=65535
+readonly max_mss="$((${max_mtu} - ${iphlen} - ${udphlen}))"
+readonly max_mss_exceeds="$((${max_mss} + 1))"
+
+# functional checks (not a full cross-product)
+
+echo "dgram"
+./in_netns.sh ./psock_snd -d
+
+echo "dgram bind"
+./in_netns.sh ./psock_snd -d -b
+
+echo "raw"
+./in_netns.sh ./psock_snd
+
+echo "raw bind"
+./in_netns.sh ./psock_snd -b
+
+echo "raw qdisc bypass"
+./in_netns.sh ./psock_snd -q
+
+echo "raw vlan"
+./in_netns.sh ./psock_snd -V
+
+echo "raw vnet hdr"
+./in_netns.sh ./psock_snd -v
+
+echo "raw csum_off"
+./in_netns.sh ./psock_snd -v -c
+
+echo "raw csum_off with bad offset (expected to fail)"
+(! ./in_netns.sh ./psock_snd -v -c -C)
+
+
+# bounds check: send {max, max + 1, min, min - 1} lengths
+
+echo "raw min size"
+./in_netns.sh ./psock_snd -l 0
+
+echo "raw mtu size"
+./in_netns.sh ./psock_snd -l "${mss}"
+
+echo "raw mtu size + 1 (expected to fail)"
+(! ./in_netns.sh ./psock_snd -l "${mss_exceeds}")
+
+# fails due to ARPHRD_ETHER check in packet_extra_vlan_len_allowed
+#
+# echo "raw vlan mtu size"
+# ./in_netns.sh ./psock_snd -V -l "${mss}"
+
+echo "raw vlan mtu size + 1 (expected to fail)"
+(! ./in_netns.sh ./psock_snd -V -l "${mss_exceeds}")
+
+echo "dgram mtu size"
+./in_netns.sh ./psock_snd -d -l "${mss}"
+
+echo "dgram mtu size + 1 (expected to fail)"
+(! ./in_netns.sh ./psock_snd -d -l "${mss_exceeds}")
+
+echo "raw truncate hlen (expected to fail: does not arrive)"
+(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen}))")
+
+echo "raw truncate hlen - 1 (expected to fail: EINVAL)"
+(! ./in_netns.sh ./psock_snd -t "$((${vnet_hlen} + ${eth_hlen} - 1))")
+
+
+# gso checks: implies -l, because with gso len must exceed gso_size
+
+echo "raw gso min size"
+./in_netns.sh ./psock_snd -v -c -g -l "${mss_exceeds}"
+
+echo "raw gso min size - 1 (expected to fail)"
+(! ./in_netns.sh ./psock_snd -v -c -g -l "${mss}")
+
+echo "raw gso max size"
+./in_netns.sh ./psock_snd -v -c -g -l "${max_mss}"
+
+echo "raw gso max size + 1 (expected to fail)"
+(! ./in_netns.sh ./psock_snd -v -c -g -l "${max_mss_exceeds}")
+
+echo "OK. All tests passed"
diff --git a/tools/testing/selftests/net/psock_tpacket.c b/tools/testing/selftests/net/psock_tpacket.c
new file mode 100644
index 000000000..404a2ce75
--- /dev/null
+++ b/tools/testing/selftests/net/psock_tpacket.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013 Red Hat, Inc.
+ * Author: Daniel Borkmann <dborkman@redhat.com>
+ * Chetan Loke <loke.chetan@gmail.com> (TPACKET_V3 usage example)
+ *
+ * A basic test of packet socket's TPACKET_V1/TPACKET_V2/TPACKET_V3 behavior.
+ *
+ * Control:
+ * Test the setup of the TPACKET socket with different patterns that are
+ * known to fail (TODO) resp. succeed (OK).
+ *
+ * Datapath:
+ * Open a pair of packet sockets and send resp. receive an a priori known
+ * packet pattern across the sockets and check if it was received resp.
+ * sent correctly. Fanout in combination with RX_RING is currently not
+ * tested here.
+ *
+ * The test currently runs for
+ * - TPACKET_V1: RX_RING, TX_RING
+ * - TPACKET_V2: RX_RING, TX_RING
+ * - TPACKET_V3: RX_RING
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <linux/if_packet.h>
+#include <linux/filter.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <bits/wordsize.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+#include <net/if.h>
+#include <inttypes.h>
+#include <poll.h>
+
+#include "psock_lib.h"
+
+#include "../kselftest.h"
+
+#ifndef bug_on
+# define bug_on(cond) assert(!(cond))
+#endif
+
+#ifndef __aligned_tpacket
+# define __aligned_tpacket __attribute__((aligned(TPACKET_ALIGNMENT)))
+#endif
+
+#ifndef __align_tpacket
+# define __align_tpacket(x) __attribute__((aligned(TPACKET_ALIGN(x))))
+#endif
+
+#define NUM_PACKETS 100
+#define ALIGN_8(x) (((x) + 8 - 1) & ~(8 - 1))
+
+struct ring {
+ struct iovec *rd;
+ uint8_t *mm_space;
+ size_t mm_len, rd_len;
+ struct sockaddr_ll ll;
+ void (*walk)(int sock, struct ring *ring);
+ int type, rd_num, flen, version;
+ union {
+ struct tpacket_req req;
+ struct tpacket_req3 req3;
+ };
+};
+
+struct block_desc {
+ uint32_t version;
+ uint32_t offset_to_priv;
+ struct tpacket_hdr_v1 h1;
+};
+
+union frame_map {
+ struct {
+ struct tpacket_hdr tp_h __aligned_tpacket;
+ struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket_hdr));
+ } *v1;
+ struct {
+ struct tpacket2_hdr tp_h __aligned_tpacket;
+ struct sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr));
+ } *v2;
+ void *raw;
+};
+
+static unsigned int total_packets, total_bytes;
+
+static int pfsocket(int ver)
+{
+ int ret, sock = socket(PF_PACKET, SOCK_RAW, 0);
+ if (sock == -1) {
+ perror("socket");
+ exit(1);
+ }
+
+ ret = setsockopt(sock, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
+ if (ret == -1) {
+ perror("setsockopt");
+ exit(1);
+ }
+
+ return sock;
+}
+
+static void status_bar_update(void)
+{
+ if (total_packets % 10 == 0) {
+ fprintf(stderr, ".");
+ fflush(stderr);
+ }
+}
+
+static void test_payload(void *pay, size_t len)
+{
+ struct ethhdr *eth = pay;
+
+ if (len < sizeof(struct ethhdr)) {
+ fprintf(stderr, "test_payload: packet too "
+ "small: %zu bytes!\n", len);
+ exit(1);
+ }
+
+ if (eth->h_proto != htons(ETH_P_IP)) {
+ fprintf(stderr, "test_payload: wrong ethernet "
+ "type: 0x%x!\n", ntohs(eth->h_proto));
+ exit(1);
+ }
+}
+
+static void create_payload(void *pay, size_t *len)
+{
+ int i;
+ struct ethhdr *eth = pay;
+ struct iphdr *ip = pay + sizeof(*eth);
+
+ /* Lets create some broken crap, that still passes
+ * our BPF filter.
+ */
+
+ *len = DATA_LEN + 42;
+
+ memset(pay, 0xff, ETH_ALEN * 2);
+ eth->h_proto = htons(ETH_P_IP);
+
+ for (i = 0; i < sizeof(*ip); ++i)
+ ((uint8_t *) pay)[i + sizeof(*eth)] = (uint8_t) rand();
+
+ ip->ihl = 5;
+ ip->version = 4;
+ ip->protocol = 0x11;
+ ip->frag_off = 0;
+ ip->ttl = 64;
+ ip->tot_len = htons((uint16_t) *len - sizeof(*eth));
+
+ ip->saddr = htonl(INADDR_LOOPBACK);
+ ip->daddr = htonl(INADDR_LOOPBACK);
+
+ memset(pay + sizeof(*eth) + sizeof(*ip),
+ DATA_CHAR, DATA_LEN);
+}
+
+static inline int __v1_rx_kernel_ready(struct tpacket_hdr *hdr)
+{
+ return ((hdr->tp_status & TP_STATUS_USER) == TP_STATUS_USER);
+}
+
+static inline void __v1_rx_user_ready(struct tpacket_hdr *hdr)
+{
+ hdr->tp_status = TP_STATUS_KERNEL;
+ __sync_synchronize();
+}
+
+static inline int __v2_rx_kernel_ready(struct tpacket2_hdr *hdr)
+{
+ return ((hdr->tp_status & TP_STATUS_USER) == TP_STATUS_USER);
+}
+
+static inline void __v2_rx_user_ready(struct tpacket2_hdr *hdr)
+{
+ hdr->tp_status = TP_STATUS_KERNEL;
+ __sync_synchronize();
+}
+
+static inline int __v1_v2_rx_kernel_ready(void *base, int version)
+{
+ switch (version) {
+ case TPACKET_V1:
+ return __v1_rx_kernel_ready(base);
+ case TPACKET_V2:
+ return __v2_rx_kernel_ready(base);
+ default:
+ bug_on(1);
+ return 0;
+ }
+}
+
+static inline void __v1_v2_rx_user_ready(void *base, int version)
+{
+ switch (version) {
+ case TPACKET_V1:
+ __v1_rx_user_ready(base);
+ break;
+ case TPACKET_V2:
+ __v2_rx_user_ready(base);
+ break;
+ }
+}
+
+static void walk_v1_v2_rx(int sock, struct ring *ring)
+{
+ struct pollfd pfd;
+ int udp_sock[2];
+ union frame_map ppd;
+ unsigned int frame_num = 0;
+
+ bug_on(ring->type != PACKET_RX_RING);
+
+ pair_udp_open(udp_sock, PORT_BASE);
+
+ memset(&pfd, 0, sizeof(pfd));
+ pfd.fd = sock;
+ pfd.events = POLLIN | POLLERR;
+ pfd.revents = 0;
+
+ pair_udp_send(udp_sock, NUM_PACKETS);
+
+ while (total_packets < NUM_PACKETS * 2) {
+ while (__v1_v2_rx_kernel_ready(ring->rd[frame_num].iov_base,
+ ring->version)) {
+ ppd.raw = ring->rd[frame_num].iov_base;
+
+ switch (ring->version) {
+ case TPACKET_V1:
+ test_payload((uint8_t *) ppd.raw + ppd.v1->tp_h.tp_mac,
+ ppd.v1->tp_h.tp_snaplen);
+ total_bytes += ppd.v1->tp_h.tp_snaplen;
+ break;
+
+ case TPACKET_V2:
+ test_payload((uint8_t *) ppd.raw + ppd.v2->tp_h.tp_mac,
+ ppd.v2->tp_h.tp_snaplen);
+ total_bytes += ppd.v2->tp_h.tp_snaplen;
+ break;
+ }
+
+ status_bar_update();
+ total_packets++;
+
+ __v1_v2_rx_user_ready(ppd.raw, ring->version);
+
+ frame_num = (frame_num + 1) % ring->rd_num;
+ }
+
+ poll(&pfd, 1, 1);
+ }
+
+ pair_udp_close(udp_sock);
+
+ if (total_packets != 2 * NUM_PACKETS) {
+ fprintf(stderr, "walk_v%d_rx: received %u out of %u pkts\n",
+ ring->version, total_packets, NUM_PACKETS);
+ exit(1);
+ }
+
+ fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, total_bytes >> 1);
+}
+
+static inline int __v1_tx_kernel_ready(struct tpacket_hdr *hdr)
+{
+ return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static inline void __v1_tx_user_ready(struct tpacket_hdr *hdr)
+{
+ hdr->tp_status = TP_STATUS_SEND_REQUEST;
+ __sync_synchronize();
+}
+
+static inline int __v2_tx_kernel_ready(struct tpacket2_hdr *hdr)
+{
+ return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static inline void __v2_tx_user_ready(struct tpacket2_hdr *hdr)
+{
+ hdr->tp_status = TP_STATUS_SEND_REQUEST;
+ __sync_synchronize();
+}
+
+static inline int __v3_tx_kernel_ready(struct tpacket3_hdr *hdr)
+{
+ return !(hdr->tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING));
+}
+
+static inline void __v3_tx_user_ready(struct tpacket3_hdr *hdr)
+{
+ hdr->tp_status = TP_STATUS_SEND_REQUEST;
+ __sync_synchronize();
+}
+
+static inline int __tx_kernel_ready(void *base, int version)
+{
+ switch (version) {
+ case TPACKET_V1:
+ return __v1_tx_kernel_ready(base);
+ case TPACKET_V2:
+ return __v2_tx_kernel_ready(base);
+ case TPACKET_V3:
+ return __v3_tx_kernel_ready(base);
+ default:
+ bug_on(1);
+ return 0;
+ }
+}
+
+static inline void __tx_user_ready(void *base, int version)
+{
+ switch (version) {
+ case TPACKET_V1:
+ __v1_tx_user_ready(base);
+ break;
+ case TPACKET_V2:
+ __v2_tx_user_ready(base);
+ break;
+ case TPACKET_V3:
+ __v3_tx_user_ready(base);
+ break;
+ }
+}
+
+static void __v1_v2_set_packet_loss_discard(int sock)
+{
+ int ret, discard = 1;
+
+ ret = setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard,
+ sizeof(discard));
+ if (ret == -1) {
+ perror("setsockopt");
+ exit(1);
+ }
+}
+
+static inline void *get_next_frame(struct ring *ring, int n)
+{
+ uint8_t *f0 = ring->rd[0].iov_base; /* base address of the first ring descriptor */
+
+ switch (ring->version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ return ring->rd[n].iov_base; /* V1/V2: one rd[] entry per frame */
+ case TPACKET_V3:
+ return f0 + (n * ring->req3.tp_frame_size); /* V3: index by frame size from the base */
+ }
+ bug_on(1); /* unknown ring version */
+ return NULL; /* only reached when NDEBUG compiles bug_on() out */
+}
+
+static void walk_tx(int sock, struct ring *ring)
+{
+ struct pollfd pfd;
+ int rcv_sock, ret;
+ size_t packet_len;
+ union frame_map ppd;
+ char packet[1024];
+ unsigned int frame_num = 0, got = 0;
+ struct sockaddr_ll ll = {
+ .sll_family = PF_PACKET,
+ .sll_halen = ETH_ALEN,
+ };
+ int nframes;
+
+ /* TPACKET_V{1,2} sets up the ring->rd* related variables based
+ * on frames (e.g., rd_num is tp_frame_nr) whereas V3 sets these
+ * up based on blocks (e.g, rd_num is tp_block_nr)
+ */
+ if (ring->version <= TPACKET_V2)
+ nframes = ring->rd_num;
+ else
+ nframes = ring->req3.tp_frame_nr;
+
+ bug_on(ring->type != PACKET_TX_RING);
+ bug_on(nframes < NUM_PACKETS);
+
+ rcv_sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+ if (rcv_sock == -1) {
+ perror("socket");
+ exit(1);
+ }
+
+ pair_udp_setfilter(rcv_sock);
+
+ ll.sll_ifindex = if_nametoindex("lo");
+ ret = bind(rcv_sock, (struct sockaddr *) &ll, sizeof(ll));
+ if (ret == -1) {
+ perror("bind");
+ exit(1);
+ }
+
+ memset(&pfd, 0, sizeof(pfd));
+ pfd.fd = sock;
+ pfd.events = POLLOUT | POLLERR;
+ pfd.revents = 0;
+
+ total_packets = NUM_PACKETS; /* counts down while frames are queued */
+ create_payload(packet, &packet_len);
+
+ while (total_packets > 0) {
+ void *next = get_next_frame(ring, frame_num);
+
+ while (__tx_kernel_ready(next, ring->version) &&
+ total_packets > 0) {
+ ppd.raw = next;
+
+ switch (ring->version) {
+ case TPACKET_V1:
+ ppd.v1->tp_h.tp_snaplen = packet_len;
+ ppd.v1->tp_h.tp_len = packet_len;
+
+ memcpy((uint8_t *) ppd.raw + TPACKET_HDRLEN -
+ sizeof(struct sockaddr_ll), packet,
+ packet_len);
+ total_bytes += ppd.v1->tp_h.tp_snaplen;
+ break;
+
+ case TPACKET_V2:
+ ppd.v2->tp_h.tp_snaplen = packet_len;
+ ppd.v2->tp_h.tp_len = packet_len;
+
+ memcpy((uint8_t *) ppd.raw + TPACKET2_HDRLEN -
+ sizeof(struct sockaddr_ll), packet,
+ packet_len);
+ total_bytes += ppd.v2->tp_h.tp_snaplen;
+ break;
+ case TPACKET_V3: {
+ struct tpacket3_hdr *tx = next;
+
+ tx->tp_snaplen = packet_len;
+ tx->tp_len = packet_len;
+ tx->tp_next_offset = 0;
+
+ memcpy((uint8_t *)tx + TPACKET3_HDRLEN -
+ sizeof(struct sockaddr_ll), packet,
+ packet_len);
+ total_bytes += tx->tp_snaplen;
+ break;
+ }
+ }
+
+ status_bar_update();
+ total_packets--;
+
+ __tx_user_ready(next, ring->version);
+
+ frame_num = (frame_num + 1) % nframes;
+ }
+
+ poll(&pfd, 1, 1);
+ }
+
+ bug_on(total_packets != 0);
+
+ ret = sendto(sock, NULL, 0, 0, NULL, 0); /* empty send on the ring socket after all frames are marked ready */
+ if (ret == -1) {
+ perror("sendto");
+ exit(1);
+ }
+
+ while (total_packets < NUM_PACKETS &&
+ (ret = recvfrom(rcv_sock, packet, sizeof(packet),
+ 0, NULL, NULL)) > 0) {
+ got += ret;
+ test_payload(packet, ret);
+
+ status_bar_update();
+ total_packets++; /* now counts up: packets read back on rcv_sock */
+ }
+
+ close(rcv_sock);
+
+ if (total_packets != NUM_PACKETS) {
+ fprintf(stderr, "walk_v%d_tx: received %u out of %u pkts\n",
+ ring->version, total_packets, NUM_PACKETS);
+ exit(1);
+ }
+
+ fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, got);
+}
+
+static void walk_v1_v2(int sock, struct ring *ring)
+{
+ if (ring->type == PACKET_RX_RING)
+ walk_v1_v2_rx(sock, ring);
+ else
+ walk_tx(sock, ring);
+}
+
+static uint64_t __v3_prev_block_seq_num = 0;
+
+void __v3_test_block_seq_num(struct block_desc *pbd)
+{
+ if (__v3_prev_block_seq_num + 1 != pbd->h1.seq_num) {
+ fprintf(stderr, "\nprev_block_seq_num:%"PRIu64", expected "
+ "seq:%"PRIu64" != actual seq:%"PRIu64"\n",
+ __v3_prev_block_seq_num, __v3_prev_block_seq_num + 1,
+ (uint64_t) pbd->h1.seq_num);
+ exit(1);
+ }
+
+ __v3_prev_block_seq_num = pbd->h1.seq_num;
+}
+
+static void __v3_test_block_len(struct block_desc *pbd, uint32_t bytes, int block_num)
+{
+ if (pbd->h1.num_pkts && bytes != pbd->h1.blk_len) {
+ fprintf(stderr, "\nblock:%u with %upackets, expected "
+ "len:%u != actual len:%u\n", block_num,
+ pbd->h1.num_pkts, bytes, pbd->h1.blk_len);
+ exit(1);
+ }
+}
+
+static void __v3_test_block_header(struct block_desc *pbd, const int block_num)
+{
+ if ((pbd->h1.block_status & TP_STATUS_USER) == 0) {
+ fprintf(stderr, "\nblock %u: not in TP_STATUS_USER\n", block_num);
+ exit(1);
+ }
+
+ __v3_test_block_seq_num(pbd);
+}
+
+static void __v3_walk_block(struct block_desc *pbd, const int block_num)
+{
+ int num_pkts = pbd->h1.num_pkts, i;
+ unsigned long bytes = 0, bytes_with_padding = ALIGN_8(sizeof(*pbd));
+ struct tpacket3_hdr *ppd;
+
+ __v3_test_block_header(pbd, block_num);
+
+ ppd = (struct tpacket3_hdr *) ((uint8_t *) pbd +
+ pbd->h1.offset_to_first_pkt);
+
+ for (i = 0; i < num_pkts; ++i) {
+ bytes += ppd->tp_snaplen;
+
+ if (ppd->tp_next_offset)
+ bytes_with_padding += ppd->tp_next_offset;
+ else
+ bytes_with_padding += ALIGN_8(ppd->tp_snaplen + ppd->tp_mac);
+
+ test_payload((uint8_t *) ppd + ppd->tp_mac, ppd->tp_snaplen);
+
+ status_bar_update();
+ total_packets++;
+
+ ppd = (struct tpacket3_hdr *) ((uint8_t *) ppd + ppd->tp_next_offset);
+ __sync_synchronize();
+ }
+
+ __v3_test_block_len(pbd, bytes_with_padding, block_num);
+ total_bytes += bytes;
+}
+
+void __v3_flush_block(struct block_desc *pbd)
+{
+ pbd->h1.block_status = TP_STATUS_KERNEL;
+ __sync_synchronize();
+}
+
+static void walk_v3_rx(int sock, struct ring *ring)
+{
+ unsigned int block_num = 0;
+ struct pollfd pfd;
+ struct block_desc *pbd;
+ int udp_sock[2];
+
+ bug_on(ring->type != PACKET_RX_RING);
+
+ pair_udp_open(udp_sock, PORT_BASE);
+
+ memset(&pfd, 0, sizeof(pfd));
+ pfd.fd = sock;
+ pfd.events = POLLIN | POLLERR;
+ pfd.revents = 0;
+
+ pair_udp_send(udp_sock, NUM_PACKETS);
+
+ while (total_packets < NUM_PACKETS * 2) {
+ pbd = (struct block_desc *) ring->rd[block_num].iov_base;
+
+ while ((pbd->h1.block_status & TP_STATUS_USER) == 0)
+ poll(&pfd, 1, 1);
+
+ __v3_walk_block(pbd, block_num);
+ __v3_flush_block(pbd);
+
+ block_num = (block_num + 1) % ring->rd_num;
+ }
+
+ pair_udp_close(udp_sock);
+
+ if (total_packets != 2 * NUM_PACKETS) {
+ fprintf(stderr, "walk_v3_rx: received %u out of %u pkts\n",
+ total_packets, NUM_PACKETS);
+ exit(1);
+ }
+
+ fprintf(stderr, " %u pkts (%u bytes)", NUM_PACKETS, total_bytes >> 1);
+}
+
+static void walk_v3(int sock, struct ring *ring)
+{
+ if (ring->type == PACKET_RX_RING)
+ walk_v3_rx(sock, ring);
+ else
+ walk_tx(sock, ring);
+}
+
+static void __v1_v2_fill(struct ring *ring, unsigned int blocks)
+{
+ ring->req.tp_block_size = getpagesize() << 2;
+ ring->req.tp_frame_size = TPACKET_ALIGNMENT << 7;
+ ring->req.tp_block_nr = blocks;
+
+ ring->req.tp_frame_nr = ring->req.tp_block_size /
+ ring->req.tp_frame_size *
+ ring->req.tp_block_nr;
+
+ ring->mm_len = ring->req.tp_block_size * ring->req.tp_block_nr;
+ ring->walk = walk_v1_v2;
+ ring->rd_num = ring->req.tp_frame_nr;
+ ring->flen = ring->req.tp_frame_size;
+}
+
+static void __v3_fill(struct ring *ring, unsigned int blocks, int type)
+{
+ if (type == PACKET_RX_RING) {
+ ring->req3.tp_retire_blk_tov = 64;
+ ring->req3.tp_sizeof_priv = 0;
+ ring->req3.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;
+ }
+ ring->req3.tp_block_size = getpagesize() << 2;
+ ring->req3.tp_frame_size = TPACKET_ALIGNMENT << 7;
+ ring->req3.tp_block_nr = blocks;
+
+ ring->req3.tp_frame_nr = ring->req3.tp_block_size /
+ ring->req3.tp_frame_size *
+ ring->req3.tp_block_nr;
+
+ ring->mm_len = ring->req3.tp_block_size * ring->req3.tp_block_nr;
+ ring->walk = walk_v3;
+ ring->rd_num = ring->req3.tp_block_nr;
+ ring->flen = ring->req3.tp_block_size;
+}
+
+static void setup_ring(int sock, struct ring *ring, int version, int type)
+{
+ int ret = 0;
+ unsigned int blocks = 256;
+
+ ring->type = type;
+ ring->version = version;
+
+ switch (version) {
+ case TPACKET_V1:
+ case TPACKET_V2:
+ if (type == PACKET_TX_RING)
+ __v1_v2_set_packet_loss_discard(sock);
+ __v1_v2_fill(ring, blocks);
+ ret = setsockopt(sock, SOL_PACKET, type, &ring->req,
+ sizeof(ring->req));
+ break;
+
+ case TPACKET_V3:
+ __v3_fill(ring, blocks, type);
+ ret = setsockopt(sock, SOL_PACKET, type, &ring->req3,
+ sizeof(ring->req3));
+ break;
+ }
+
+ if (ret == -1) {
+ perror("setsockopt");
+ exit(1);
+ }
+
+ ring->rd_len = ring->rd_num * sizeof(*ring->rd);
+ ring->rd = malloc(ring->rd_len);
+ if (ring->rd == NULL) {
+ perror("malloc");
+ exit(1);
+ }
+
+ total_packets = 0;
+ total_bytes = 0;
+}
+
+static void mmap_ring(int sock, struct ring *ring)
+{
+ int i;
+
+ ring->mm_space = mmap(0, ring->mm_len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_LOCKED | MAP_POPULATE, sock, 0);
+ if (ring->mm_space == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ memset(ring->rd, 0, ring->rd_len);
+ for (i = 0; i < ring->rd_num; ++i) {
+ ring->rd[i].iov_base = ring->mm_space + (i * ring->flen);
+ ring->rd[i].iov_len = ring->flen;
+ }
+}
+
+static void bind_ring(int sock, struct ring *ring)
+{
+ int ret;
+
+ pair_udp_setfilter(sock);
+
+ ring->ll.sll_family = PF_PACKET;
+ ring->ll.sll_protocol = htons(ETH_P_ALL);
+ ring->ll.sll_ifindex = if_nametoindex("lo");
+ ring->ll.sll_hatype = 0;
+ ring->ll.sll_pkttype = 0;
+ ring->ll.sll_halen = 0;
+
+ ret = bind(sock, (struct sockaddr *) &ring->ll, sizeof(ring->ll));
+ if (ret == -1) {
+ perror("bind");
+ exit(1);
+ }
+}
+
+static void walk_ring(int sock, struct ring *ring)
+{
+ ring->walk(sock, ring);
+}
+
+static void unmap_ring(int sock, struct ring *ring)
+{
+ munmap(ring->mm_space, ring->mm_len);
+ free(ring->rd);
+}
+
+static int test_kernel_bit_width(void)
+{
+ char in[512], *ptr;
+ int num = 0, fd;
+ ssize_t ret;
+
+ fd = open("/proc/kallsyms", O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+
+ ret = read(fd, in, sizeof(in)); /* buffer is NOT NUL-terminated; only in[0..ret) is valid */
+ if (ret <= 0) {
+ perror("read");
+ exit(1);
+ }
+
+ close(fd);
+
+ ptr = in; /* count the characters of the first whitespace-delimited token */
+ while (ptr < in + ret && !isspace((unsigned char) *ptr)) {
+ num++;
+ ptr++;
+ }
+
+ return num * 4; /* token length times 4, i.e. 4 bits per character of the first field */
+}
+
+static int test_user_bit_width(void)
+{
+ return __WORDSIZE;
+}
+
+static const char *tpacket_str[] = {
+ [TPACKET_V1] = "TPACKET_V1",
+ [TPACKET_V2] = "TPACKET_V2",
+ [TPACKET_V3] = "TPACKET_V3",
+};
+
+static const char *type_str[] = {
+ [PACKET_RX_RING] = "PACKET_RX_RING",
+ [PACKET_TX_RING] = "PACKET_TX_RING",
+};
+
+static int test_tpacket(int version, int type)
+{
+ int sock;
+ struct ring ring;
+
+ fprintf(stderr, "test: %s with %s ", tpacket_str[version],
+ type_str[type]);
+ fflush(stderr);
+
+ if (version == TPACKET_V1 &&
+ test_kernel_bit_width() != test_user_bit_width()) {
+ fprintf(stderr, "test: skip %s %s since user and kernel "
+ "space have different bit width\n",
+ tpacket_str[version], type_str[type]);
+ return KSFT_SKIP;
+ }
+
+ sock = pfsocket(version);
+ memset(&ring, 0, sizeof(ring));
+ setup_ring(sock, &ring, version, type);
+ mmap_ring(sock, &ring);
+ bind_ring(sock, &ring);
+ walk_ring(sock, &ring);
+ unmap_ring(sock, &ring);
+ close(sock);
+
+ fprintf(stderr, "\n");
+ return 0;
+}
+
+int main(void)
+{
+ int ret = 0;
+
+ ret |= test_tpacket(TPACKET_V1, PACKET_RX_RING);
+ ret |= test_tpacket(TPACKET_V1, PACKET_TX_RING);
+
+ ret |= test_tpacket(TPACKET_V2, PACKET_RX_RING);
+ ret |= test_tpacket(TPACKET_V2, PACKET_TX_RING);
+
+ ret |= test_tpacket(TPACKET_V3, PACKET_RX_RING);
+ ret |= test_tpacket(TPACKET_V3, PACKET_TX_RING);
+
+ if (ret)
+ return 1;
+
+ printf("OK. All tests passed\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/reuseaddr_conflict.c b/tools/testing/selftests/net/reuseaddr_conflict.c
new file mode 100644
index 000000000..7c5b12664
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_conflict.c
@@ -0,0 +1,114 @@
+/*
+ * Test for the regression introduced by
+ *
+ * b9470c27607b ("inet: kill smallest_size and smallest_port")
+ *
+ * If we open an ipv4 socket on a port with reuseaddr we shouldn't reset the tb
+ * when we open the ipv6 counterpart, which is what was happening previously.
+ */
+#include <errno.h>
+#include <error.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#define PORT 9999
+
+int open_port(int ipv6, int any)
+{
+ int fd = -1;
+ int reuseaddr = 1;
+ int v6only = 1;
+ int addrlen;
+ int ret = -1;
+ struct sockaddr *addr;
+ int family = ipv6 ? AF_INET6 : AF_INET;
+
+ struct sockaddr_in6 addr6 = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htons(PORT),
+ .sin6_addr = in6addr_any
+ };
+ struct sockaddr_in addr4 = {
+ .sin_family = AF_INET,
+ .sin_port = htons(PORT),
+ .sin_addr.s_addr = any ? htonl(INADDR_ANY) : inet_addr("127.0.0.1"),
+ };
+
+
+ if (ipv6) {
+ addr = (struct sockaddr*)&addr6;
+ addrlen = sizeof(addr6);
+ } else {
+ addr = (struct sockaddr*)&addr4;
+ addrlen = sizeof(addr4);
+ }
+
+ if ((fd = socket(family, SOCK_STREAM, IPPROTO_TCP)) < 0) {
+ perror("socket");
+ goto out;
+ }
+
+ if (ipv6 && setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (void*)&v6only,
+ sizeof(v6only)) < 0) {
+ perror("setsockopt IPV6_V6ONLY");
+ goto out;
+ }
+
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr,
+ sizeof(reuseaddr)) < 0) {
+ perror("setsockopt SO_REUSEADDR");
+ goto out;
+ }
+
+ if (bind(fd, addr, addrlen) < 0) {
+ perror("bind");
+ goto out;
+ }
+
+ if (any)
+ return fd;
+
+ if (listen(fd, 1) < 0) {
+ perror("listen");
+ goto out;
+ }
+ return fd;
+out:
+ close(fd);
+ return ret;
+}
+
+int main(void)
+{
+ int listenfd;
+ int fd1, fd2;
+
+ fprintf(stderr, "Opening 127.0.0.1:%d\n", PORT);
+ listenfd = open_port(0, 0);
+ if (listenfd < 0)
+ error(1, errno, "Couldn't open listen socket");
+ fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT);
+ fd1 = open_port(0, 1);
+ if (fd1 >= 0)
+ error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket");
+ fprintf(stderr, "Opening in6addr_any:%d\n", PORT);
+ fd1 = open_port(1, 1);
+ if (fd1 < 0)
+ error(1, errno, "Couldn't open ipv6 reuseport");
+ fprintf(stderr, "Opening INADDR_ANY:%d\n", PORT);
+ fd2 = open_port(0, 1);
+ if (fd2 >= 0)
+ error(1, 0, "Was allowed to create an ipv4 reuseport on a already bound non-reuseport socket");
+ close(fd1);
+ fprintf(stderr, "Opening INADDR_ANY:%d after closing ipv6 socket\n", PORT);
+ fd1 = open_port(0, 1);
+ if (fd1 >= 0)
+ error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6");
+ fprintf(stderr, "Success");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.c b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
new file mode 100644
index 000000000..066efd30e
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.c
@@ -0,0 +1,162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Check if we can fully utilize 4-tuples for connect().
+ *
+ * Rules to bind sockets to the same port when all ephemeral ports are
+ * exhausted.
+ *
+ * 1. if there are TCP_LISTEN sockets on the port, fail to bind.
+ * 2. if there are sockets without SO_REUSEADDR, fail to bind.
+ * 3. if SO_REUSEADDR is disabled, fail to bind.
+ * 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled,
+ * succeed to bind.
+ * 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and
+ * there is no socket having the both options and the same EUID,
+ * succeed to bind.
+ * 6. fail to bind.
+ *
+ * Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+ */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "../kselftest_harness.h"
+
+struct reuse_opts {
+ int reuseaddr[2];
+ int reuseport[2];
+};
+
+struct reuse_opts unreusable_opts[12] = {
+ {{0, 0}, {0, 0}},
+ {{0, 0}, {0, 1}},
+ {{0, 0}, {1, 0}},
+ {{0, 0}, {1, 1}},
+ {{0, 1}, {0, 0}},
+ {{0, 1}, {0, 1}},
+ {{0, 1}, {1, 0}},
+ {{0, 1}, {1, 1}},
+ {{1, 0}, {0, 0}},
+ {{1, 0}, {0, 1}},
+ {{1, 0}, {1, 0}},
+ {{1, 0}, {1, 1}},
+};
+
+struct reuse_opts reusable_opts[4] = {
+ {{1, 1}, {0, 0}},
+ {{1, 1}, {0, 1}},
+ {{1, 1}, {1, 0}},
+ {{1, 1}, {1, 1}},
+};
+
+int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
+{
+ struct sockaddr_in local_addr;
+ int len = sizeof(local_addr);
+ int fd, ret;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_NE(-1, fd) TH_LOG("failed to open socket.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR.");
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int));
+ ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT.");
+
+ local_addr.sin_family = AF_INET;
+ local_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
+ local_addr.sin_port = 0;
+
+ if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) {
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+TEST(reuseaddr_ports_exhausted_unreusable)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 12; i++) {
+ opts = &unreusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind.");
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_same_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, fd[2];
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++)
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+
+ if (opts->reuseport[0] && opts->reuseport[1]) {
+ EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
+ } else {
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST(reuseaddr_ports_exhausted_reusable_different_euid)
+{
+ struct reuse_opts *opts;
+ int i, j, ret, fd[2];
+ uid_t euid[2] = {10, 20};
+
+ for (i = 0; i < 4; i++) {
+ opts = &reusable_opts[i];
+
+ for (j = 0; j < 2; j++) {
+ ret = seteuid(euid[j]);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]);
+
+ fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
+
+ ret = seteuid(0);
+ ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0.");
+ }
+
+ ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
+ EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid.");
+
+ if (fd[1] != -1) {
+ ret = listen(fd[0], 5);
+ ASSERT_EQ(0, ret) TH_LOG("failed to listen.");
+
+ ret = listen(fd[1], 5);
+ EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN.");
+ }
+
+ for (j = 0; j < 2; j++)
+ if (fd[j] != -1)
+ close(fd[j]);
+ }
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
new file mode 100755
index 000000000..20e3a2913
--- /dev/null
+++ b/tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run tests when all ephemeral ports are exhausted.
+#
+# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
+
+set +x
+set -e
+
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_local_port_range="32768 32768" \
+ > /dev/null 2>&1
+ ip netns exec "${NETNS}" \
+ sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null 2>&1
+}
+
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+trap cleanup EXIT
+setup
+
+do_test() {
+ ip netns exec "${NETNS}" ./reuseaddr_ports_exhausted
+}
+
+do_test
+echo "tests done"
diff --git a/tools/testing/selftests/net/reuseport_addr_any.c b/tools/testing/selftests/net/reuseport_addr_any.c
new file mode 100644
index 000000000..b8475cb29
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_addr_any.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Test that sockets listening on a specific address are preferred
+ * over sockets listening on addr_any.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/dccp.h>
+#include <linux/in.h>
+#include <linux/unistd.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#ifndef SOL_DCCP
+#define SOL_DCCP 269
+#endif
+
+static const char *IP4_ADDR = "127.0.0.1";
+static const char *IP6_ADDR = "::1";
+static const char *IP4_MAPPED6 = "::ffff:127.0.0.1";
+
+static const int PORT = 8888;
+
+static void build_rcv_fd(int family, int proto, int *rcv_fds, int count,
+ const char *addr_str)
+{
+ struct sockaddr_in addr4 = {0};
+ struct sockaddr_in6 addr6 = {0};
+ struct sockaddr *addr;
+ int opt, i, sz;
+
+ memset(&addr, 0, sizeof(addr));
+
+ switch (family) {
+ case AF_INET:
+ addr4.sin_family = family;
+ if (!addr_str)
+ addr4.sin_addr.s_addr = htonl(INADDR_ANY);
+ else if (!inet_pton(family, addr_str, &addr4.sin_addr.s_addr))
+ error(1, errno, "inet_pton failed: %s", addr_str);
+ addr4.sin_port = htons(PORT);
+ sz = sizeof(addr4);
+ addr = (struct sockaddr *)&addr4;
+ break;
+ case AF_INET6:
+ addr6.sin6_family = AF_INET6;
+ if (!addr_str)
+ addr6.sin6_addr = in6addr_any;
+ else if (!inet_pton(family, addr_str, &addr6.sin6_addr))
+ error(1, errno, "inet_pton failed: %s", addr_str);
+ addr6.sin6_port = htons(PORT);
+ sz = sizeof(addr6);
+ addr = (struct sockaddr *)&addr6;
+ break;
+ default:
+ error(1, 0, "Unsupported family %d", family);
+ /* clang does not recognize error() above as terminating
+ * the program, so it complains that addr, sz are
+ * not initialized when this code path is taken. Silence it.
+ */
+ return;
+ }
+
+ for (i = 0; i < count; ++i) {
+ rcv_fds[i] = socket(family, proto, 0);
+ if (rcv_fds[i] < 0)
+ error(1, errno, "failed to create receive socket");
+
+ opt = 1;
+ if (setsockopt(rcv_fds[i], SOL_SOCKET, SO_REUSEPORT, &opt,
+ sizeof(opt)))
+ error(1, errno, "failed to set SO_REUSEPORT");
+
+ if (bind(rcv_fds[i], addr, sz))
+ error(1, errno, "failed to bind receive socket");
+
+ if (proto == SOCK_STREAM && listen(rcv_fds[i], 10))
+ error(1, errno, "tcp: failed to listen on receive port");
+ else if (proto == SOCK_DCCP) {
+ if (setsockopt(rcv_fds[i], SOL_DCCP,
+ DCCP_SOCKOPT_SERVICE,
+ &(int) {htonl(42)}, sizeof(int)))
+ error(1, errno, "failed to setsockopt");
+
+ if (listen(rcv_fds[i], 10))
+ error(1, errno, "dccp: failed to listen on receive port");
+ }
+ }
+}
+
+static int connect_and_send(int family, int proto)
+{
+ struct sockaddr_in saddr4 = {0};
+ struct sockaddr_in daddr4 = {0};
+ struct sockaddr_in6 saddr6 = {0};
+ struct sockaddr_in6 daddr6 = {0};
+ struct sockaddr *saddr, *daddr;
+ int fd, sz;
+
+ switch (family) {
+ case AF_INET:
+ saddr4.sin_family = AF_INET;
+ saddr4.sin_addr.s_addr = htonl(INADDR_ANY);
+ saddr4.sin_port = 0;
+
+ daddr4.sin_family = AF_INET;
+ if (!inet_pton(family, IP4_ADDR, &daddr4.sin_addr.s_addr))
+ error(1, errno, "inet_pton failed: %s", IP4_ADDR);
+ daddr4.sin_port = htons(PORT);
+
+ sz = sizeof(saddr4);
+ saddr = (struct sockaddr *)&saddr4;
+ daddr = (struct sockaddr *)&daddr4;
+ break;
+ case AF_INET6:
+ saddr6.sin6_family = AF_INET6;
+ saddr6.sin6_addr = in6addr_any;
+
+ daddr6.sin6_family = AF_INET6;
+ if (!inet_pton(family, IP6_ADDR, &daddr6.sin6_addr))
+ error(1, errno, "inet_pton failed: %s", IP6_ADDR);
+ daddr6.sin6_port = htons(PORT);
+
+ sz = sizeof(saddr6);
+ saddr = (struct sockaddr *)&saddr6;
+ daddr = (struct sockaddr *)&daddr6;
+ break;
+ default:
+ error(1, 0, "Unsupported family %d", family);
+ /* clang does not recognize error() above as terminating
+ * the program, so it complains that saddr, daddr, sz are
+ * not initialized when this code path is taken. Silence it.
+ */
+ return -1;
+ }
+
+ fd = socket(family, proto, 0);
+ if (fd < 0)
+ error(1, errno, "failed to create send socket");
+
+ if (proto == SOCK_DCCP &&
+ setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
+ &(int){htonl(42)}, sizeof(int)))
+ error(1, errno, "failed to setsockopt");
+
+ if (bind(fd, saddr, sz))
+ error(1, errno, "failed to bind send socket");
+
+ if (connect(fd, daddr, sz))
+ error(1, errno, "failed to connect send socket");
+
+ if (send(fd, "a", 1, 0) < 0)
+ error(1, errno, "failed to send message");
+
+ return fd;
+}
+
+static int receive_once(int epfd, int proto)
+{
+ struct epoll_event ev;
+ int i, fd;
+ char buf[8];
+
+ i = epoll_wait(epfd, &ev, 1, 3);
+ if (i <= 0) /* 0 means the 3 ms wait expired and ev was never filled in */
+ error(1, i ? errno : ETIMEDOUT, "epoll_wait failed");
+
+ if (proto == SOCK_STREAM || proto == SOCK_DCCP) {
+ fd = accept(ev.data.fd, NULL, NULL);
+ if (fd < 0)
+ error(1, errno, "failed to accept");
+ i = recv(fd, buf, sizeof(buf), 0);
+ close(fd);
+ } else {
+ i = recv(ev.data.fd, buf, sizeof(buf), 0);
+ }
+
+ if (i < 0)
+ error(1, errno, "failed to recv");
+
+ return ev.data.fd; /* the listening/receiving socket that epoll reported, not the accepted fd */
+}
+
+static void test(int *rcv_fds, int count, int family, int proto, int fd)
+{
+ struct epoll_event ev;
+ int epfd, i, send_fd, recv_fd;
+
+ epfd = epoll_create(1);
+ if (epfd < 0)
+ error(1, errno, "failed to create epoll");
+
+ ev.events = EPOLLIN;
+ for (i = 0; i < count; ++i) {
+ ev.data.fd = rcv_fds[i];
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fds[i], &ev))
+ error(1, errno, "failed to register sock epoll");
+ }
+
+ send_fd = connect_and_send(family, proto);
+
+ recv_fd = receive_once(epfd, proto);
+ if (recv_fd != fd)
+ error(1, 0, "received on an unexpected socket");
+
+ close(send_fd);
+ close(epfd);
+}
+
+
+static void run_one_test(int fam_send, int fam_rcv, int proto,
+ const char *addr_str)
+{
+ /* Below we test that a socket listening on a specific address
+ * is always selected in preference over a socket listening
+ * on addr_any. Bugs where this is not the case often result
+ * in sockets created first or last to get picked. So below
+ * we make sure that there are always addr_any sockets created
+ * before and after a specific socket is created.
+ */
+ int rcv_fds[10], i;
+
+ build_rcv_fd(AF_INET, proto, rcv_fds, 2, NULL);
+ build_rcv_fd(AF_INET6, proto, rcv_fds + 2, 2, NULL);
+ build_rcv_fd(fam_rcv, proto, rcv_fds + 4, 1, addr_str);
+ build_rcv_fd(AF_INET, proto, rcv_fds + 5, 2, NULL);
+ build_rcv_fd(AF_INET6, proto, rcv_fds + 7, 2, NULL);
+ test(rcv_fds, 9, fam_send, proto, rcv_fds[4]);
+ for (i = 0; i < 9; ++i)
+ close(rcv_fds[i]);
+ fprintf(stderr, "pass\n");
+}
+
+static void test_proto(int proto, const char *proto_str)
+{
+ if (proto == SOCK_DCCP) {
+ int test_fd;
+
+ test_fd = socket(AF_INET, proto, 0);
+ if (test_fd < 0) {
+ if (errno == ESOCKTNOSUPPORT) {
+ fprintf(stderr, "DCCP not supported: skipping DCCP tests\n");
+ return;
+ } else
+ error(1, errno, "failed to create a DCCP socket");
+ }
+ close(test_fd);
+ }
+
+ fprintf(stderr, "%s IPv4 ... ", proto_str);
+ run_one_test(AF_INET, AF_INET, proto, IP4_ADDR);
+
+ fprintf(stderr, "%s IPv6 ... ", proto_str);
+ run_one_test(AF_INET6, AF_INET6, proto, IP6_ADDR);
+
+ fprintf(stderr, "%s IPv4 mapped to IPv6 ... ", proto_str);
+ run_one_test(AF_INET, AF_INET6, proto, IP4_MAPPED6);
+}
+
+int main(void)
+{
+ test_proto(SOCK_DGRAM, "UDP");
+ test_proto(SOCK_STREAM, "TCP");
+ test_proto(SOCK_DCCP, "DCCP");
+
+ fprintf(stderr, "SUCCESS\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/reuseport_addr_any.sh b/tools/testing/selftests/net/reuseport_addr_any.sh
new file mode 100755
index 000000000..104592f62
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_addr_any.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+./in_netns.sh ./reuseport_addr_any
diff --git a/tools/testing/selftests/net/reuseport_bpf.c b/tools/testing/selftests/net/reuseport_bpf.c
new file mode 100644
index 000000000..b0cc082fb
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_bpf.c
@@ -0,0 +1,641 @@
+/*
+ * Test functionality of BPF filters for SO_REUSEPORT. The tests below will use
+ * a BPF program (both classic and extended) to read the first word from an
+ * incoming packet (expected to be in network byte-order), calculate a modulus
+ * of that number, and then dispatch the packet to the Nth socket using the
+ * result. These tests are run for each supported address family and protocol.
+ * Additionally, a few edge cases in the implementation are tested.
+ */
+
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/unistd.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/resource.h>
+#include <unistd.h>
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+struct test_params {
+ int recv_family;
+ int send_family;
+ int protocol;
+ size_t recv_socks;
+ uint16_t recv_port;
+ uint16_t send_port_min;
+};
+
+/*
+ * Length passed to bind()/sendto() for every address in this file: the
+ * address helpers allocate a full struct sockaddr_storage.
+ */
+static size_t sockaddr_size(void)
+{
+	const size_t storage_len = sizeof(struct sockaddr_storage);
+
+	return storage_len;
+}
+
+/*
+ * Allocate a zeroed sockaddr_storage holding the wildcard address of the
+ * given family (AF_INET or AF_INET6) and the given port (host byte order).
+ * The caller owns the result and must free() it. Exits via error() on an
+ * unsupported family or allocation failure.
+ */
+static struct sockaddr *new_any_sockaddr(int family, uint16_t port)
+{
+	struct sockaddr_storage *addr;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+
+	addr = malloc(sizeof(struct sockaddr_storage));
+	/* Fail loudly rather than memset() through a NULL pointer. */
+	if (!addr)
+		error(1, errno, "failed to allocate sockaddr");
+	memset(addr, 0, sizeof(struct sockaddr_storage));
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)addr;
+		addr4->sin_family = AF_INET;
+		addr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		addr4->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)addr;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_addr = in6addr_any;
+		addr6->sin6_port = htons(port);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+	return (struct sockaddr *)addr;
+}
+
+/*
+ * Same as new_any_sockaddr(), but with the address part replaced by the
+ * loopback address of the requested family. The caller frees the result.
+ */
+static struct sockaddr *new_loopback_sockaddr(int family, uint16_t port)
+{
+	struct sockaddr *sa = new_any_sockaddr(family, port);
+
+	if (family == AF_INET) {
+		struct sockaddr_in *in4 = (struct sockaddr_in *)sa;
+
+		in4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+	} else if (family == AF_INET6) {
+		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sa;
+
+		in6->sin6_addr = in6addr_loopback;
+	} else {
+		error(1, 0, "Unsupported family %d", family);
+	}
+	return sa;
+}
+
+/*
+ * Load a four-instruction eBPF socket-filter program that returns
+ * (first 32-bit word of the packet) % mod, and attach it to fd with
+ * SO_ATTACH_REUSEPORT_EBPF. Any failure terminates the test via error();
+ * a load failure also prints the verifier log.
+ */
+static void attach_ebpf(int fd, uint16_t mod)
+{
+ static char bpf_log_buf[65536];
+ static const char bpf_license[] = "GPL";
+
+ int bpf_fd;
+ const struct bpf_insn prog[] = {
+ /* BPF_MOV64_REG(BPF_REG_6, BPF_REG_1) */
+ { BPF_ALU64 | BPF_MOV | BPF_X, BPF_REG_6, BPF_REG_1, 0, 0 },
+ /* BPF_LD_ABS(BPF_W, 0) R0 = (uint32_t)skb[0] */
+ { BPF_LD | BPF_ABS | BPF_W, 0, 0, 0, 0 },
+ /* BPF_ALU64_IMM(BPF_MOD, BPF_REG_0, mod) */
+ { BPF_ALU64 | BPF_MOD | BPF_K, BPF_REG_0, 0, 0, mod },
+ /* BPF_EXIT_INSN() */
+ { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
+ };
+ union bpf_attr attr;
+
+ /* Zero the whole attribute union so unused fields are well defined. */
+ memset(&attr, 0, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ attr.insn_cnt = ARRAY_SIZE(prog);
+ attr.insns = (unsigned long) &prog;
+ attr.license = (unsigned long) &bpf_license;
+ attr.log_buf = (unsigned long) &bpf_log_buf;
+ attr.log_size = sizeof(bpf_log_buf);
+ attr.log_level = 1;
+ attr.kern_version = 0;
+
+ /* Raw bpf(2) syscall; on success it returns an fd for the program. */
+ bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ if (bpf_fd < 0)
+ error(1, errno, "ebpf error. log:\n%s\n", bpf_log_buf);
+
+ if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd,
+ sizeof(bpf_fd)))
+ error(1, errno, "failed to set SO_ATTACH_REUSEPORT_EBPF");
+
+ /* Our descriptor is no longer needed once the program is attached
+ * (presumably the socket keeps its own reference -- TODO confirm).
+ */
+ close(bpf_fd);
+}
+
+/*
+ * Attach a classic BPF program to fd via SO_ATTACH_REUSEPORT_CBPF. The
+ * program returns (first 32-bit word of the packet) % mod. Exits via
+ * error() if the socket option cannot be set.
+ */
+static void attach_cbpf(int fd, uint16_t mod)
+{
+	struct sock_filter insns[] = {
+		/* A = (uint32_t)skb[0] */
+		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, 0 },
+		/* A = A % mod */
+		{ BPF_ALU | BPF_MOD, 0, 0, mod },
+		/* return A */
+		{ BPF_RET | BPF_A, 0, 0, 0 },
+	};
+	struct sock_fprog fprog;
+	int rc;
+
+	memset(&fprog, 0, sizeof(fprog));
+	fprog.filter = insns;
+	fprog.len = ARRAY_SIZE(insns);
+
+	rc = setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &fprog,
+			sizeof(fprog));
+	if (rc != 0)
+		error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF");
+}
+
+/*
+ * Create p.recv_socks sockets of p.recv_family/p.protocol, all bound with
+ * SO_REUSEPORT to the wildcard address on p.recv_port, and store them in
+ * fd[]. attach_bpf(fd[0], mod) is called on the first socket only, before
+ * it is bound. SOCK_STREAM sockets additionally get TCP_FASTOPEN set and
+ * are put into the listening state. Any failure exits via error().
+ */
+static void build_recv_group(const struct test_params p, int fd[], uint16_t mod,
+ void (*attach_bpf)(int, uint16_t))
+{
+ struct sockaddr * const addr =
+ new_any_sockaddr(p.recv_family, p.recv_port);
+ int i, opt;
+
+ for (i = 0; i < p.recv_socks; ++i) {
+ fd[i] = socket(p.recv_family, p.protocol, 0);
+ if (fd[i] < 0)
+ error(1, errno, "failed to create recv %d", i);
+
+ opt = 1;
+ if (setsockopt(fd[i], SOL_SOCKET, SO_REUSEPORT, &opt,
+ sizeof(opt)))
+ error(1, errno, "failed to set SO_REUSEPORT on %d", i);
+
+ /* Only the first socket of the group gets the filter attached. */
+ if (i == 0)
+ attach_bpf(fd[i], mod);
+
+ if (bind(fd[i], addr, sockaddr_size()))
+ error(1, errno, "failed to bind recv socket %d", i);
+
+ if (p.protocol == SOCK_STREAM) {
+ /* presumably the fastopen queue length -- TODO confirm */
+ opt = 4;
+ if (setsockopt(fd[i], SOL_TCP, TCP_FASTOPEN, &opt,
+ sizeof(opt)))
+ error(1, errno,
+ "failed to set TCP_FASTOPEN on %d", i);
+ if (listen(fd[i], p.recv_socks * 10))
+ error(1, errno, "failed to listen on socket");
+ }
+ }
+ /* addr was heap-allocated by new_any_sockaddr(). */
+ free(addr);
+}
+
+/*
+ * Send len bytes of buf from a fresh socket bound to the wildcard address
+ * on source port sport, to the loopback address on p.recv_port. The socket
+ * is closed and both temporary addresses are freed before returning. Any
+ * failure exits via error().
+ */
+static void send_from(struct test_params p, uint16_t sport, char *buf,
+ size_t len)
+{
+ struct sockaddr * const saddr = new_any_sockaddr(p.send_family, sport);
+ struct sockaddr * const daddr =
+ new_loopback_sockaddr(p.send_family, p.recv_port);
+ const int fd = socket(p.send_family, p.protocol, 0), one = 1;
+
+ if (fd < 0)
+ error(1, errno, "failed to create send socket");
+
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)))
+ error(1, errno, "failed to set reuseaddr");
+
+ if (bind(fd, saddr, sockaddr_size()))
+ error(1, errno, "failed to bind send socket");
+
+ /* No connect(): the destination is passed to sendto() together with
+ * MSG_FASTOPEN (main() enables TCP fastopen before the TCP tests).
+ */
+ if (sendto(fd, buf, len, MSG_FASTOPEN, daddr, sockaddr_size()) < 0)
+ error(1, errno, "failed to send message");
+
+ close(fd);
+ free(saddr);
+ free(daddr);
+}
+
+/*
+ * Send p.recv_socks * 2 messages, one per consecutive source port starting
+ * at p.send_port_min, each carrying a 32-bit counter in network byte order.
+ * After each send, wait on epoll for the receiving socket, read the message
+ * (accepting the connection first for SOCK_STREAM) and verify that it
+ * arrived on the socket whose index in fd[] equals (source port % mod).
+ * Any failure exits via error().
+ */
+static void test_recv_order(const struct test_params p, int fd[], int mod)
+{
+	char recv_buf[8], send_buf[8];
+	struct msghdr msg;
+	struct iovec recv_io = { recv_buf, 8 };
+	struct epoll_event ev;
+	int epfd, conn, i, sport, expected;
+	uint32_t data, ndata;
+
+	epfd = epoll_create(1);
+	if (epfd < 0)
+		error(1, errno, "failed to create epoll");
+	for (i = 0; i < p.recv_socks; ++i) {
+		ev.events = EPOLLIN;
+		ev.data.fd = fd[i];
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd[i], &ev))
+			error(1, errno, "failed to register sock %d epoll", i);
+	}
+
+	memset(&msg, 0, sizeof(msg));
+	msg.msg_iov = &recv_io;
+	msg.msg_iovlen = 1;
+
+	for (data = 0; data < p.recv_socks * 2; ++data) {
+		sport = p.send_port_min + data;
+		ndata = htonl(data);
+		memcpy(send_buf, &ndata, sizeof(ndata));
+		send_from(p, sport, send_buf, sizeof(ndata));
+
+		i = epoll_wait(epfd, &ev, 1, -1);
+		if (i < 0)
+			error(1, errno, "epoll wait failed");
+
+		if (p.protocol == SOCK_STREAM) {
+			conn = accept(ev.data.fd, NULL, NULL);
+			if (conn < 0)
+				error(1, errno, "error accepting");
+			i = recvmsg(conn, &msg, 0);
+			close(conn);
+		} else {
+			i = recvmsg(ev.data.fd, &msg, 0);
+		}
+		if (i < 0)
+			error(1, errno, "recvmsg error");
+		if (i != sizeof(ndata))
+			error(1, 0, "expected size %zu got %d",
+			      sizeof(ndata), i);
+
+		/* Map the ready descriptor back to its index in fd[]. If it
+		 * is not found, i ends up as p.recv_socks and the check
+		 * against expected below fails.
+		 */
+		for (i = 0; i < p.recv_socks; ++i)
+			if (ev.data.fd == fd[i])
+				break;
+		memcpy(&ndata, recv_buf, sizeof(ndata));
+		fprintf(stderr, "Socket %d: %u\n", i, ntohl(ndata));
+
+		expected = (sport % mod);
+		if (i != expected)
+			error(1, 0, "expected socket %d", expected);
+	}
+
+	/* The epoll instance is private to this call; do not leak it. */
+	close(epfd);
+}
+
+/*
+ * Build a reuseport group with an eBPF selector of modulus p.recv_socks and
+ * check delivery order, then re-attach with half that modulus on a fresh
+ * source port range and check again. All group sockets are closed at the end.
+ */
+static void test_reuseport_ebpf(struct test_params p)
+{
+	const size_t full_mod = p.recv_socks;
+	const size_t half_mod = p.recv_socks / 2;
+	int fd[p.recv_socks];
+	size_t n;
+
+	fprintf(stderr, "Testing EBPF mod %zd...\n", full_mod);
+	build_recv_group(p, fd, full_mod, attach_ebpf);
+	test_recv_order(p, fd, full_mod);
+
+	/* The first round used recv_socks * 2 source ports; skip past them. */
+	p.send_port_min += full_mod * 2;
+	fprintf(stderr, "Reprograming, testing mod %zd...\n", half_mod);
+	attach_ebpf(fd[0], half_mod);
+	test_recv_order(p, fd, half_mod);
+
+	for (n = 0; n < full_mod; ++n)
+		close(fd[n]);
+}
+
+/*
+ * Build a reuseport group with a classic BPF selector of modulus
+ * p.recv_socks and check delivery order, then re-attach with half that
+ * modulus on a fresh source port range and check again. All group sockets
+ * are closed at the end.
+ */
+static void test_reuseport_cbpf(struct test_params p)
+{
+	const size_t full_mod = p.recv_socks;
+	const size_t half_mod = p.recv_socks / 2;
+	int fd[p.recv_socks];
+	size_t n;
+
+	fprintf(stderr, "Testing CBPF mod %zd...\n", full_mod);
+	build_recv_group(p, fd, full_mod, attach_cbpf);
+	test_recv_order(p, fd, full_mod);
+
+	/* The first round used recv_socks * 2 source ports; skip past them. */
+	p.send_port_min += full_mod * 2;
+	fprintf(stderr, "Reprograming, testing mod %zd...\n", half_mod);
+	attach_cbpf(fd[0], half_mod);
+	test_recv_order(p, fd, half_mod);
+
+	for (n = 0; n < full_mod; ++n)
+		close(fd[n]);
+}
+
+/*
+ * Two SO_REUSEPORT sockets each get their own eBPF program attached before
+ * binding. Binding the first must succeed; binding the second to the same
+ * address must fail with EADDRINUSE. Both sockets are closed before
+ * returning so they do not stay open for the rest of the run.
+ */
+static void test_extra_filter(const struct test_params p)
+{
+	struct sockaddr * const addr =
+		new_any_sockaddr(p.recv_family, p.recv_port);
+	int fd1, fd2, opt;
+
+	fprintf(stderr, "Testing too many filters...\n");
+	fd1 = socket(p.recv_family, p.protocol, 0);
+	if (fd1 < 0)
+		error(1, errno, "failed to create socket 1");
+	fd2 = socket(p.recv_family, p.protocol, 0);
+	if (fd2 < 0)
+		error(1, errno, "failed to create socket 2");
+
+	opt = 1;
+	if (setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)))
+		error(1, errno, "failed to set SO_REUSEPORT on socket 1");
+	if (setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)))
+		error(1, errno, "failed to set SO_REUSEPORT on socket 2");
+
+	attach_ebpf(fd1, 10);
+	attach_ebpf(fd2, 10);
+
+	if (bind(fd1, addr, sockaddr_size()))
+		error(1, errno, "failed to bind recv socket 1");
+
+	/* Reset errno so an unexpected success does not report a stale code. */
+	errno = 0;
+	if (!bind(fd2, addr, sockaddr_size()) || errno != EADDRINUSE)
+		error(1, errno, "bind socket 2 should fail with EADDRINUSE");
+
+	close(fd1);
+	close(fd2);
+	free(addr);
+}
+
+/*
+ * Attaching a reuseport filter (eBPF or classic) to a bound socket that does
+ * not have SO_REUSEPORT set must fail with EINVAL. The socket and the loaded
+ * program descriptor are closed before returning.
+ */
+static void test_filter_no_reuseport(const struct test_params p)
+{
+	struct sockaddr * const addr =
+		new_any_sockaddr(p.recv_family, p.recv_port);
+	const char bpf_license[] = "GPL";
+	struct bpf_insn ecode[] = {
+		{ BPF_ALU64 | BPF_MOV | BPF_K, BPF_REG_0, 0, 0, 10 },
+		{ BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
+	};
+	struct sock_filter ccode[] = {{ BPF_RET | BPF_A, 0, 0, 0 }};
+	union bpf_attr eprog;
+	struct sock_fprog cprog;
+	int fd, bpf_fd;
+
+	fprintf(stderr, "Testing filters on non-SO_REUSEPORT socket...\n");
+
+	memset(&eprog, 0, sizeof(eprog));
+	eprog.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	eprog.insn_cnt = ARRAY_SIZE(ecode);
+	eprog.insns = (unsigned long) &ecode;
+	eprog.license = (unsigned long) &bpf_license;
+	eprog.kern_version = 0;
+
+	memset(&cprog, 0, sizeof(cprog));
+	cprog.len = ARRAY_SIZE(ccode);
+	cprog.filter = ccode;
+
+	bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &eprog, sizeof(eprog));
+	if (bpf_fd < 0)
+		error(1, errno, "ebpf error");
+	fd = socket(p.recv_family, p.protocol, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create socket 1");
+
+	if (bind(fd, addr, sockaddr_size()))
+		error(1, errno, "failed to bind recv socket 1");
+
+	/* errno is cleared first so an unexpected success reports 0. */
+	errno = 0;
+	if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd,
+			sizeof(bpf_fd)) || errno != EINVAL)
+		error(1, errno, "setsockopt should have returned EINVAL");
+
+	errno = 0;
+	if (!setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &cprog,
+			sizeof(cprog)) || errno != EINVAL)
+		error(1, errno, "setsockopt should have returned EINVAL");
+
+	/* Release the descriptors instead of leaking them until exit. */
+	close(fd);
+	close(bpf_fd);
+	free(addr);
+}
+
+/*
+ * Attach an eBPF filter to one SO_REUSEPORT UDP socket and a classic BPF
+ * filter to another, without binding either socket. attach_ebpf() and
+ * attach_cbpf() exit the test themselves if attaching fails.
+ */
+static void test_filter_without_bind(void)
+{
+	const int enable = 1;
+	int ebpf_sock, cbpf_sock;
+
+	fprintf(stderr, "Testing filter add without bind...\n");
+
+	ebpf_sock = socket(AF_INET, SOCK_DGRAM, 0);
+	if (ebpf_sock < 0)
+		error(1, errno, "failed to create socket 1");
+	cbpf_sock = socket(AF_INET, SOCK_DGRAM, 0);
+	if (cbpf_sock < 0)
+		error(1, errno, "failed to create socket 2");
+
+	if (setsockopt(ebpf_sock, SOL_SOCKET, SO_REUSEPORT, &enable,
+		       sizeof(enable)) != 0)
+		error(1, errno, "failed to set SO_REUSEPORT on socket 1");
+	if (setsockopt(cbpf_sock, SOL_SOCKET, SO_REUSEPORT, &enable,
+		       sizeof(enable)) != 0)
+		error(1, errno, "failed to set SO_REUSEPORT on socket 2");
+
+	attach_ebpf(ebpf_sock, 10);
+	attach_cbpf(cbpf_sock, 10);
+
+	close(ebpf_sock);
+	close(cbpf_sock);
+}
+
+/*
+ * Make sure both the client and server bits of the tcp_fastopen sysctl are
+ * set, rewriting the sysctl only when at least one of them is missing. Any
+ * other bits already set are preserved. Exits via error() if the sysctl
+ * cannot be opened, read or written.
+ */
+void enable_fastopen(void)
+{
+	static const char path[] = "/proc/sys/net/ipv4/tcp_fastopen";
+	const int rw_mask = 3;	/* 0x1: client side; 0x2: server side */
+	char buf[16];
+	ssize_t len;
+	int fd, val, size;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		error(1, errno, "Unable to open tcp_fastopen sysctl");
+	/* read() does not NUL-terminate: keep one byte for the terminator
+	 * so atoi() never scans past the data that was actually read.
+	 */
+	len = read(fd, buf, sizeof(buf) - 1);
+	if (len <= 0)
+		error(1, errno, "Unable to read tcp_fastopen sysctl");
+	buf[len] = '\0';
+	val = atoi(buf);
+	close(fd);
+
+	if ((val & rw_mask) != rw_mask) {
+		fd = open(path, O_RDWR);
+		if (fd < 0)
+			error(1, errno,
+			      "Unable to open tcp_fastopen sysctl for writing");
+		val |= rw_mask;
+		size = snprintf(buf, sizeof(buf), "%d", val);
+		if (write(fd, buf, size) <= 0)
+			error(1, errno, "Unable to write tcp_fastopen sysctl");
+		close(fd);
+	}
+}
+
+static struct rlimit rlim_old;
+
+/*
+ * Add extra to lim without wrapping around. An unlimited value, or a sum
+ * that would overflow rlim_t, yields RLIM_INFINITY.
+ */
+static rlim_t memlock_limit_add(rlim_t lim, rlim_t extra)
+{
+	rlim_t sum = lim + extra;
+
+	if (lim == RLIM_INFINITY || sum < lim)
+		return RLIM_INFINITY;
+	return sum;
+}
+
+/*
+ * Runs before main(): save the current RLIMIT_MEMLOCK in rlim_old and,
+ * unless the soft limit is already unlimited, try to raise the soft and
+ * hard limits by 1 MiB each (presumably so the BPF program loads are not
+ * rejected for exceeding the locked-memory limit -- TODO confirm). This is
+ * best effort: a setrlimit() failure is ignored.
+ */
+static __attribute__((constructor)) void main_ctor(void)
+{
+	const rlim_t extra = 1UL << 20;
+	struct rlimit rlim_new;
+
+	/* Without a valid current limit there is nothing sensible to raise. */
+	if (getrlimit(RLIMIT_MEMLOCK, &rlim_old))
+		return;
+
+	if (rlim_old.rlim_cur == RLIM_INFINITY)
+		return;
+
+	/* A finite soft limit can come with an unlimited hard limit; a plain
+	 * addition would wrap the hard limit below the soft limit and make
+	 * setrlimit() fail.
+	 */
+	rlim_new.rlim_cur = memlock_limit_add(rlim_old.rlim_cur, extra);
+	rlim_new.rlim_max = memlock_limit_add(rlim_old.rlim_max, extra);
+	setrlimit(RLIMIT_MEMLOCK, &rlim_new);
+}
+
+/*
+ * Runs after main() returns or exit() is called: put RLIMIT_MEMLOCK back to
+ * the value saved in rlim_old. The setrlimit() result is not checked.
+ */
+static __attribute__((destructor)) void main_dtor(void)
+{
+ setrlimit(RLIMIT_MEMLOCK, &rlim_old);
+}
+
+/*
+ * Test driver. Runs the eBPF and classic-BPF reuseport selection tests plus
+ * the filter edge-case tests for UDP (IPv4, IPv6, IPv4 sent to an IPv6
+ * group) and then, after enabling TCP fastopen, for TCP. Each test gets its
+ * own receive port; the two group sizes of one test reuse the same port
+ * because the group is closed in between. Any failure exits through
+ * error(), so reaching the end means every test passed.
+ */
+int main(void)
+{
+ fprintf(stderr, "---- IPv4 UDP ----\n");
+ /* NOTE: UDP socket lookups traverse a different code path when there
+ * are > 10 sockets in a group. Run the bpf test through both paths.
+ */
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8000,
+ .send_port_min = 9000});
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8000,
+ .send_port_min = 9000});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8001,
+ .send_port_min = 9020});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8001,
+ .send_port_min = 9020});
+ test_extra_filter((struct test_params) {
+ .recv_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_port = 8002});
+ test_filter_no_reuseport((struct test_params) {
+ .recv_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_port = 8008});
+
+ fprintf(stderr, "---- IPv6 UDP ----\n");
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8003,
+ .send_port_min = 9040});
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8003,
+ .send_port_min = 9040});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8004,
+ .send_port_min = 9060});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8004,
+ .send_port_min = 9060});
+ test_extra_filter((struct test_params) {
+ .recv_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_port = 8005});
+ test_filter_no_reuseport((struct test_params) {
+ .recv_family = AF_INET6,
+ .protocol = SOCK_DGRAM,
+ .recv_port = 8009});
+
+ /* Receivers are AF_INET6 sockets, senders are AF_INET sockets. */
+ fprintf(stderr, "---- IPv6 UDP w/ mapped IPv4 ----\n");
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8006,
+ .send_port_min = 9080});
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8006,
+ .send_port_min = 9080});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 10,
+ .recv_port = 8007,
+ .send_port_min = 9100});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_DGRAM,
+ .recv_socks = 20,
+ .recv_port = 8007,
+ .send_port_min = 9100});
+
+ /* TCP fastopen is required for the TCP tests */
+ enable_fastopen();
+ fprintf(stderr, "---- IPv4 TCP ----\n");
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8008,
+ .send_port_min = 9120});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET,
+ .send_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8009,
+ .send_port_min = 9160});
+ test_extra_filter((struct test_params) {
+ .recv_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_port = 8010});
+ test_filter_no_reuseport((struct test_params) {
+ .recv_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_port = 8011});
+
+ fprintf(stderr, "---- IPv6 TCP ----\n");
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8012,
+ .send_port_min = 9200});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET6,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8013,
+ .send_port_min = 9240});
+ test_extra_filter((struct test_params) {
+ .recv_family = AF_INET6,
+ .protocol = SOCK_STREAM,
+ .recv_port = 8014});
+ test_filter_no_reuseport((struct test_params) {
+ .recv_family = AF_INET6,
+ .protocol = SOCK_STREAM,
+ .recv_port = 8015});
+
+ /* Receivers are AF_INET6 sockets, senders are AF_INET sockets. */
+ fprintf(stderr, "---- IPv6 TCP w/ mapped IPv4 ----\n");
+ test_reuseport_ebpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8016,
+ .send_port_min = 9320});
+ test_reuseport_cbpf((struct test_params) {
+ .recv_family = AF_INET6,
+ .send_family = AF_INET,
+ .protocol = SOCK_STREAM,
+ .recv_socks = 10,
+ .recv_port = 8017,
+ .send_port_min = 9360});
+
+ test_filter_without_bind();
+
+ fprintf(stderr, "SUCCESS\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/net/reuseport_bpf_cpu.c b/tools/testing/selftests/net/reuseport_bpf_cpu.c
new file mode 100644
index 000000000..2d6461747
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_bpf_cpu.c
@@ -0,0 +1,259 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test functionality of BPF filters with SO_REUSEPORT. This program creates
+ * an SO_REUSEPORT receiver group containing one socket per CPU core. It then
+ * creates a BPF program that will select a socket from this group based
+ * on the core id that receives the packet. The sending code artificially
+ * moves itself to run on different core ids and sends one message from
+ * each core. Since these packets are delivered over loopback, they should
+ * arrive on the same core that sent them. The receiving code then ensures
+ * that the packet was received on the socket for the corresponding core id.
+ * This entire process is done for several different core id permutations
+ * and for each IPv4/IPv6 and TCP/UDP combination.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/filter.h>
+#include <linux/in.h>
+#include <linux/unistd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+static const int PORT = 8888;
+
+/*
+ * Create len sockets of the given family and type, each with SO_REUSEPORT
+ * set and bound to the wildcard address on PORT, and store them in rcv_fd[].
+ * SOCK_STREAM sockets are also put into the listening state. Any failure
+ * exits via error().
+ */
+static void build_rcv_group(int *rcv_fd, size_t len, int family, int proto)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+	size_t i;
+	int opt;
+
+	/* The whole storage is handed to bind(); zero it so fields that are
+	 * not set below (e.g. sin6_flowinfo, sin6_scope_id) are not stack
+	 * garbage.
+	 */
+	memset(&addr, 0, sizeof(addr));
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = AF_INET;
+		addr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		addr4->sin_port = htons(PORT);
+		break;
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_addr = in6addr_any;
+		addr6->sin6_port = htons(PORT);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+
+	for (i = 0; i < len; ++i) {
+		rcv_fd[i] = socket(family, proto, 0);
+		if (rcv_fd[i] < 0)
+			error(1, errno, "failed to create receive socket");
+
+		opt = 1;
+		if (setsockopt(rcv_fd[i], SOL_SOCKET, SO_REUSEPORT, &opt,
+			       sizeof(opt)))
+			error(1, errno, "failed to set SO_REUSEPORT");
+
+		if (bind(rcv_fd[i], (struct sockaddr *)&addr, sizeof(addr)))
+			error(1, errno, "failed to bind receive socket");
+
+		if (proto == SOCK_STREAM && listen(rcv_fd[i], len * 10))
+			error(1, errno, "failed to listen on receive port");
+	}
+}
+
+/*
+ * Attach a two-instruction classic BPF program to fd via
+ * SO_ATTACH_REUSEPORT_CBPF. The program loads the SKF_AD_CPU ancillary
+ * value and returns it. Exits via error() if the option cannot be set.
+ */
+static void attach_bpf(int fd)
+{
+	struct sock_filter insns[] = {
+		/* A = raw_smp_processor_id() */
+		{ BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
+		/* return A */
+		{ BPF_RET | BPF_A, 0, 0, 0 },
+	};
+	struct sock_fprog fprog;
+	int rc;
+
+	memset(&fprog, 0, sizeof(fprog));
+	fprog.filter = insns;
+	fprog.len = sizeof(insns) / sizeof(insns[0]);
+
+	rc = setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF, &fprog,
+			sizeof(fprog));
+	if (rc != 0)
+		error(1, errno, "failed to set SO_ATTACH_REUSEPORT_CBPF");
+}
+
+/*
+ * Pin the calling thread to cpu_id, then send a one-byte message from a
+ * fresh socket (bound to the wildcard address, port 0) to the loopback
+ * address on PORT. The socket is closed before returning; the CPU affinity
+ * is left as set. Any failure exits via error().
+ */
+static void send_from_cpu(int cpu_id, int family, int proto)
+{
+	struct sockaddr_storage saddr, daddr;
+	struct sockaddr_in *saddr4, *daddr4;
+	struct sockaddr_in6 *saddr6, *daddr6;
+	cpu_set_t cpu_set;
+	int fd;
+
+	/* Both structures are passed whole to bind()/connect(); zero them so
+	 * fields that are not set below (e.g. sin6_flowinfo, sin6_scope_id)
+	 * are not stack garbage.
+	 */
+	memset(&saddr, 0, sizeof(saddr));
+	memset(&daddr, 0, sizeof(daddr));
+
+	switch (family) {
+	case AF_INET:
+		saddr4 = (struct sockaddr_in *)&saddr;
+		saddr4->sin_family = AF_INET;
+		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		saddr4->sin_port = 0;
+
+		daddr4 = (struct sockaddr_in *)&daddr;
+		daddr4->sin_family = AF_INET;
+		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		daddr4->sin_port = htons(PORT);
+		break;
+	case AF_INET6:
+		saddr6 = (struct sockaddr_in6 *)&saddr;
+		saddr6->sin6_family = AF_INET6;
+		saddr6->sin6_addr = in6addr_any;
+		saddr6->sin6_port = 0;
+
+		daddr6 = (struct sockaddr_in6 *)&daddr;
+		daddr6->sin6_family = AF_INET6;
+		daddr6->sin6_addr = in6addr_loopback;
+		daddr6->sin6_port = htons(PORT);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+
+	memset(&cpu_set, 0, sizeof(cpu_set));
+	CPU_SET(cpu_id, &cpu_set);
+	if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0)
+		error(1, errno, "failed to pin to cpu");
+
+	fd = socket(family, proto, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create send socket");
+
+	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
+		error(1, errno, "failed to bind send socket");
+
+	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
+		error(1, errno, "failed to connect send socket");
+
+	if (send(fd, "a", 1, 0) < 0)
+		error(1, errno, "failed to send message");
+
+	close(fd);
+}
+
+/*
+ * Wait for one message on the epoll set, read it (accepting the connection
+ * first for SOCK_STREAM) and check that the socket it arrived on sits at
+ * index cpu_id in rcv_fd[]. Any failure or mismatch exits via error().
+ */
+static
+void receive_on_cpu(int *rcv_fd, int len, int epfd, int cpu_id, int proto)
+{
+	struct epoll_event event;
+	char payload[8];
+	int ret, sock_idx;
+
+	ret = epoll_wait(epfd, &event, 1, -1);
+	if (ret < 0)
+		error(1, errno, "epoll_wait failed");
+
+	if (proto != SOCK_STREAM) {
+		ret = recv(event.data.fd, payload, sizeof(payload), 0);
+	} else {
+		int conn = accept(event.data.fd, NULL, NULL);
+
+		if (conn < 0)
+			error(1, errno, "failed to accept");
+		ret = recv(conn, payload, sizeof(payload), 0);
+		close(conn);
+	}
+	if (ret < 0)
+		error(1, errno, "failed to recv");
+
+	/* Translate the ready descriptor into its position in rcv_fd[]. */
+	sock_idx = 0;
+	while (sock_idx < len && rcv_fd[sock_idx] != event.data.fd)
+		++sock_idx;
+	if (sock_idx == len)
+		error(1, 0, "failed to find socket");
+
+	fprintf(stderr, "send cpu %d, receive socket %d\n", cpu_id, sock_idx);
+	if (sock_idx != cpu_id)
+		error(1, 0, "cpu id/receive socket mismatch");
+}
+
+/*
+ * Run one family/protocol combination: build a receive group of len sockets,
+ * attach the CPU-selecting filter to the first one, register every socket
+ * with a fresh epoll instance, then send from and receive on each CPU id in
+ * four orders (ascending, descending, even ids, odd ids). The epoll
+ * instance and all receive sockets are closed at the end, so rcv_fd[] can
+ * be reused by the next call.
+ */
+static void test(int *rcv_fd, int len, int family, int proto)
+{
+ struct epoll_event ev;
+ int epfd, cpu;
+
+ build_rcv_group(rcv_fd, len, family, proto);
+ attach_bpf(rcv_fd[0]);
+
+ epfd = epoll_create(1);
+ if (epfd < 0)
+ error(1, errno, "failed to create epoll");
+ for (cpu = 0; cpu < len; ++cpu) {
+ ev.events = EPOLLIN;
+ ev.data.fd = rcv_fd[cpu];
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fd[cpu], &ev))
+ error(1, errno, "failed to register sock epoll");
+ }
+
+ /* Forward iterate */
+ for (cpu = 0; cpu < len; ++cpu) {
+ send_from_cpu(cpu, family, proto);
+ receive_on_cpu(rcv_fd, len, epfd, cpu, proto);
+ }
+
+ /* Reverse iterate */
+ for (cpu = len - 1; cpu >= 0; --cpu) {
+ send_from_cpu(cpu, family, proto);
+ receive_on_cpu(rcv_fd, len, epfd, cpu, proto);
+ }
+
+ /* Even cores */
+ for (cpu = 0; cpu < len; cpu += 2) {
+ send_from_cpu(cpu, family, proto);
+ receive_on_cpu(rcv_fd, len, epfd, cpu, proto);
+ }
+
+ /* Odd cores */
+ for (cpu = 1; cpu < len; cpu += 2) {
+ send_from_cpu(cpu, family, proto);
+ receive_on_cpu(rcv_fd, len, epfd, cpu, proto);
+ }
+
+ close(epfd);
+ for (cpu = 0; cpu < len; ++cpu)
+ close(rcv_fd[cpu]);
+}
+
+/*
+ * Allocate one receive-socket slot per online CPU and run the test for each
+ * IPv4/IPv6 and UDP/TCP combination.
+ */
+int main(void)
+{
+	static const struct {
+		const char *banner;
+		int family;
+		int proto;
+	} runs[] = {
+		{ "---- IPv4 UDP ----\n", AF_INET, SOCK_DGRAM },
+		{ "---- IPv6 UDP ----\n", AF_INET6, SOCK_DGRAM },
+		{ "---- IPv4 TCP ----\n", AF_INET, SOCK_STREAM },
+		{ "---- IPv6 TCP ----\n", AF_INET6, SOCK_STREAM },
+	};
+	unsigned int r;
+	int *rcv_fd, cpus;
+
+	cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (cpus <= 0)
+		error(1, errno, "failed counting cpus");
+
+	rcv_fd = calloc(cpus, sizeof(int));
+	if (!rcv_fd)
+		error(1, 0, "failed to allocate array");
+
+	for (r = 0; r < sizeof(runs) / sizeof(runs[0]); ++r) {
+		fprintf(stderr, "%s", runs[r].banner);
+		test(rcv_fd, cpus, runs[r].family, runs[r].proto);
+	}
+
+	free(rcv_fd);
+
+	fprintf(stderr, "SUCCESS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/reuseport_bpf_numa.c b/tools/testing/selftests/net/reuseport_bpf_numa.c
new file mode 100644
index 000000000..c9f478b40
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_bpf_numa.c
@@ -0,0 +1,258 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test functionality of BPF filters with SO_REUSEPORT. Same test as
+ * in reuseport_bpf_cpu, only as one socket per NUMA node.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/in.h>
+#include <linux/unistd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <numa.h>
+
+#include "../kselftest.h"
+
+static const int PORT = 8888;
+
+/*
+ * Create len sockets of the given family and type, each with SO_REUSEPORT
+ * set and bound to the wildcard address on PORT, and store them in rcv_fd[].
+ * SOCK_STREAM sockets are also put into the listening state. Any failure
+ * exits via error().
+ */
+static void build_rcv_group(int *rcv_fd, size_t len, int family, int proto)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+	size_t i;
+	int opt;
+
+	/* The whole storage is handed to bind(); zero it so fields that are
+	 * not set below (e.g. sin6_flowinfo, sin6_scope_id) are not stack
+	 * garbage.
+	 */
+	memset(&addr, 0, sizeof(addr));
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = AF_INET;
+		addr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		addr4->sin_port = htons(PORT);
+		break;
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_addr = in6addr_any;
+		addr6->sin6_port = htons(PORT);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+
+	for (i = 0; i < len; ++i) {
+		rcv_fd[i] = socket(family, proto, 0);
+		if (rcv_fd[i] < 0)
+			error(1, errno, "failed to create receive socket");
+
+		opt = 1;
+		if (setsockopt(rcv_fd[i], SOL_SOCKET, SO_REUSEPORT, &opt,
+			       sizeof(opt)))
+			error(1, errno, "failed to set SO_REUSEPORT");
+
+		if (bind(rcv_fd[i], (struct sockaddr *)&addr, sizeof(addr)))
+			error(1, errno, "failed to bind receive socket");
+
+		if (proto == SOCK_STREAM && listen(rcv_fd[i], len * 10))
+			error(1, errno, "failed to listen on receive port");
+	}
+}
+
+/*
+ * Load a two-instruction eBPF socket-filter program that calls the
+ * get_numa_node_id helper and returns its result, then attach it to fd with
+ * SO_ATTACH_REUSEPORT_EBPF. Any failure terminates the test via error();
+ * a load failure also prints the verifier log.
+ */
+static void attach_bpf(int fd)
+{
+ static char bpf_log_buf[65536];
+ /* NOTE(review): the license string is empty here -- presumably the
+ * helper used below does not require a GPL-compatible program; confirm.
+ */
+ static const char bpf_license[] = "";
+
+ int bpf_fd;
+ const struct bpf_insn prog[] = {
+ /* R0 = bpf_get_numa_node_id() */
+ { BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_numa_node_id },
+ /* return R0 */
+ { BPF_JMP | BPF_EXIT, 0, 0, 0, 0 }
+ };
+ union bpf_attr attr;
+
+ /* Zero the whole attribute union so unused fields are well defined. */
+ memset(&attr, 0, sizeof(attr));
+ attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+ attr.insn_cnt = sizeof(prog) / sizeof(prog[0]);
+ attr.insns = (unsigned long) &prog;
+ attr.license = (unsigned long) &bpf_license;
+ attr.log_buf = (unsigned long) &bpf_log_buf;
+ attr.log_size = sizeof(bpf_log_buf);
+ attr.log_level = 1;
+
+ /* Raw bpf(2) syscall; on success it returns an fd for the program. */
+ bpf_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
+ if (bpf_fd < 0)
+ error(1, errno, "ebpf error. log:\n%s\n", bpf_log_buf);
+
+ if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF, &bpf_fd,
+ sizeof(bpf_fd)))
+ error(1, errno, "failed to set SO_ATTACH_REUSEPORT_EBPF");
+
+ /* Our descriptor is no longer needed once the program is attached. */
+ close(bpf_fd);
+}
+
+/*
+ * Restrict the calling thread to NUMA node node_id, then send a one-byte
+ * message from a fresh socket (bound to the wildcard address, port 0) to the
+ * loopback address on PORT. The socket is closed before returning; the node
+ * binding is left as set. Any failure exits via error().
+ */
+static void send_from_node(int node_id, int family, int proto)
+{
+	struct sockaddr_storage saddr, daddr;
+	struct sockaddr_in *saddr4, *daddr4;
+	struct sockaddr_in6 *saddr6, *daddr6;
+	int fd;
+
+	/* Both structures are passed whole to bind()/connect(); zero them so
+	 * fields that are not set below (e.g. sin6_flowinfo, sin6_scope_id)
+	 * are not stack garbage.
+	 */
+	memset(&saddr, 0, sizeof(saddr));
+	memset(&daddr, 0, sizeof(daddr));
+
+	switch (family) {
+	case AF_INET:
+		saddr4 = (struct sockaddr_in *)&saddr;
+		saddr4->sin_family = AF_INET;
+		saddr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		saddr4->sin_port = 0;
+
+		daddr4 = (struct sockaddr_in *)&daddr;
+		daddr4->sin_family = AF_INET;
+		daddr4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		daddr4->sin_port = htons(PORT);
+		break;
+	case AF_INET6:
+		saddr6 = (struct sockaddr_in6 *)&saddr;
+		saddr6->sin6_family = AF_INET6;
+		saddr6->sin6_addr = in6addr_any;
+		saddr6->sin6_port = 0;
+
+		daddr6 = (struct sockaddr_in6 *)&daddr;
+		daddr6->sin6_family = AF_INET6;
+		daddr6->sin6_addr = in6addr_loopback;
+		daddr6->sin6_port = htons(PORT);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+
+	if (numa_run_on_node(node_id) < 0)
+		error(1, errno, "failed to pin to node");
+
+	fd = socket(family, proto, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create send socket");
+
+	if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)))
+		error(1, errno, "failed to bind send socket");
+
+	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)))
+		error(1, errno, "failed to connect send socket");
+
+	if (send(fd, "a", 1, 0) < 0)
+		error(1, errno, "failed to send message");
+
+	close(fd);
+}
+
+/*
+ * Wait for one message on the epoll set, read it (accepting the connection
+ * first for SOCK_STREAM) and check that the socket it arrived on sits at
+ * index node_id in rcv_fd[]. Any failure or mismatch exits via error().
+ */
+static
+void receive_on_node(int *rcv_fd, int len, int epfd, int node_id, int proto)
+{
+	struct epoll_event event;
+	char payload[8];
+	int ret, sock_idx;
+
+	ret = epoll_wait(epfd, &event, 1, -1);
+	if (ret < 0)
+		error(1, errno, "epoll_wait failed");
+
+	if (proto != SOCK_STREAM) {
+		ret = recv(event.data.fd, payload, sizeof(payload), 0);
+	} else {
+		int conn = accept(event.data.fd, NULL, NULL);
+
+		if (conn < 0)
+			error(1, errno, "failed to accept");
+		ret = recv(conn, payload, sizeof(payload), 0);
+		close(conn);
+	}
+	if (ret < 0)
+		error(1, errno, "failed to recv");
+
+	/* Translate the ready descriptor into its position in rcv_fd[]. */
+	sock_idx = 0;
+	while (sock_idx < len && rcv_fd[sock_idx] != event.data.fd)
+		++sock_idx;
+	if (sock_idx == len)
+		error(1, 0, "failed to find socket");
+
+	fprintf(stderr, "send node %d, receive socket %d\n", node_id, sock_idx);
+	if (sock_idx != node_id)
+		error(1, 0, "node id/receive socket mismatch");
+}
+
+/*
+ * Run one family/protocol combination: build a receive group of len sockets,
+ * attach the NUMA-node-selecting filter to the first one, register every
+ * socket with a fresh epoll instance, then send from and receive on each
+ * node id in ascending and then descending order. The epoll instance and
+ * all receive sockets are closed at the end, so rcv_fd[] can be reused by
+ * the next call.
+ */
+static void test(int *rcv_fd, int len, int family, int proto)
+{
+ struct epoll_event ev;
+ int epfd, node;
+
+ build_rcv_group(rcv_fd, len, family, proto);
+ attach_bpf(rcv_fd[0]);
+
+ epfd = epoll_create(1);
+ if (epfd < 0)
+ error(1, errno, "failed to create epoll");
+ for (node = 0; node < len; ++node) {
+ ev.events = EPOLLIN;
+ ev.data.fd = rcv_fd[node];
+ if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fd[node], &ev))
+ error(1, errno, "failed to register sock epoll");
+ }
+
+ /* Forward iterate */
+ for (node = 0; node < len; ++node) {
+ send_from_node(node, family, proto);
+ receive_on_node(rcv_fd, len, epfd, node, proto);
+ }
+
+ /* Reverse iterate */
+ for (node = len - 1; node >= 0; --node) {
+ send_from_node(node, family, proto);
+ receive_on_node(rcv_fd, len, epfd, node, proto);
+ }
+
+ close(epfd);
+ for (node = 0; node < len; ++node)
+ close(rcv_fd[node]);
+}
+
+/*
+ * Skip the test when the NUMA API is unavailable; otherwise allocate one
+ * receive-socket slot per node id (0..numa_max_node()) and run the test for
+ * each IPv4/IPv6 and UDP/TCP combination.
+ */
+int main(void)
+{
+	static const struct {
+		const char *banner;
+		int family;
+		int proto;
+	} runs[] = {
+		{ "---- IPv4 UDP ----\n", AF_INET, SOCK_DGRAM },
+		{ "---- IPv6 UDP ----\n", AF_INET6, SOCK_DGRAM },
+		{ "---- IPv4 TCP ----\n", AF_INET, SOCK_STREAM },
+		{ "---- IPv6 TCP ----\n", AF_INET6, SOCK_STREAM },
+	};
+	unsigned int r;
+	int *rcv_fd, nodes;
+
+	if (numa_available() < 0)
+		ksft_exit_skip("no numa api support\n");
+
+	nodes = numa_max_node() + 1;
+
+	rcv_fd = calloc(nodes, sizeof(int));
+	if (!rcv_fd)
+		error(1, 0, "failed to allocate array");
+
+	for (r = 0; r < sizeof(runs) / sizeof(runs[0]); ++r) {
+		fprintf(stderr, "%s", runs[r].banner);
+		test(rcv_fd, nodes, runs[r].family, runs[r].proto);
+	}
+
+	free(rcv_fd);
+
+	fprintf(stderr, "SUCCESS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/reuseport_dualstack.c b/tools/testing/selftests/net/reuseport_dualstack.c
new file mode 100644
index 000000000..fb7a59ed7
--- /dev/null
+++ b/tools/testing/selftests/net/reuseport_dualstack.c
@@ -0,0 +1,210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * It is possible to use SO_REUSEPORT to open multiple sockets bound to
+ * equivalent local addresses using AF_INET and AF_INET6 at the same time. If
+ * the AF_INET6 socket has IPV6_V6ONLY set, it's clear which socket should
+ * receive a given incoming packet. However, when it is not set, incoming v4
+ * packets should prefer the AF_INET socket(s). This behavior was defined with
+ * the original SO_REUSEPORT implementation, but broke with
+ * e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
+ * This test creates these mixed AF_INET/AF_INET6 sockets and asserts the
+ * AF_INET preference for v4 packets.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/in.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+static const int PORT = 8888;
+
+/*
+ * Create @count sockets of the given @family and @proto, each with
+ * SO_REUSEPORT set and bound to the wildcard address on PORT. Stream
+ * sockets are additionally put into the listening state. The descriptors
+ * are written to @rcv_fds[0..count-1]. Any failure ends the program through
+ * error().
+ */
+static void build_rcv_fd(int family, int proto, int *rcv_fds, int count)
+{
+	struct sockaddr_storage addr;
+	struct sockaddr_in *addr4;
+	struct sockaddr_in6 *addr6;
+	int opt, i;
+
+	/*
+	 * Only family, address and port are filled in below, yet bind() is
+	 * handed sizeof(addr) bytes. Zero the storage first so the remaining
+	 * fields (e.g. sin6_flowinfo, sin6_scope_id) are not stack garbage.
+	 */
+	memset(&addr, 0, sizeof(addr));
+
+	switch (family) {
+	case AF_INET:
+		addr4 = (struct sockaddr_in *)&addr;
+		addr4->sin_family = AF_INET;
+		addr4->sin_addr.s_addr = htonl(INADDR_ANY);
+		addr4->sin_port = htons(PORT);
+		break;
+	case AF_INET6:
+		addr6 = (struct sockaddr_in6 *)&addr;
+		addr6->sin6_family = AF_INET6;
+		addr6->sin6_addr = in6addr_any;
+		addr6->sin6_port = htons(PORT);
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+	}
+
+	for (i = 0; i < count; ++i) {
+		rcv_fds[i] = socket(family, proto, 0);
+		if (rcv_fds[i] < 0)
+			error(1, errno, "failed to create receive socket");
+
+		/* SO_REUSEPORT must be set before bind() on every socket. */
+		opt = 1;
+		if (setsockopt(rcv_fds[i], SOL_SOCKET, SO_REUSEPORT, &opt,
+			       sizeof(opt)))
+			error(1, errno, "failed to set SO_REUSEPORT");
+
+		if (bind(rcv_fds[i], (struct sockaddr *)&addr, sizeof(addr)))
+			error(1, errno, "failed to bind receive socket");
+
+		if (proto == SOCK_STREAM && listen(rcv_fds[i], 10))
+			error(1, errno, "failed to listen on receive port");
+	}
+}
+
+/*
+ * Send the single byte "a" over IPv4 to PORT on the loopback address.
+ * A new socket of type @proto is created, bound to INADDR_ANY with port 0,
+ * connected, used for one send() and closed again.
+ */
+static void send_from_v4(int proto)
+{
+	struct sockaddr_in src, dst;
+	int sock;
+
+	src.sin_family = AF_INET;
+	src.sin_port = 0;
+	src.sin_addr.s_addr = htonl(INADDR_ANY);
+
+	dst.sin_family = AF_INET;
+	dst.sin_port = htons(PORT);
+	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+	sock = socket(AF_INET, proto, 0);
+	if (sock < 0)
+		error(1, errno, "failed to create send socket");
+
+	if (bind(sock, (struct sockaddr *)&src, sizeof(src)) != 0)
+		error(1, errno, "failed to bind send socket");
+
+	if (connect(sock, (struct sockaddr *)&dst, sizeof(dst)) != 0)
+		error(1, errno, "failed to connect send socket");
+
+	if (send(sock, "a", 1, 0) < 0)
+		error(1, errno, "failed to send message");
+
+	close(sock);
+}
+
+/*
+ * Block until one socket in @epfd becomes readable and consume the pending
+ * data from it. For SOCK_STREAM the ready socket is a listener, so the
+ * connection is accepted, read from and closed. Returns the descriptor that
+ * epoll reported, i.e. the receive socket that was selected.
+ */
+static int receive_once(int epfd, int proto)
+{
+	struct epoll_event ev;
+	char buf[8];
+	int nread, conn;
+
+	if (epoll_wait(epfd, &ev, 1, -1) < 0)
+		error(1, errno, "epoll_wait failed");
+
+	if (proto != SOCK_STREAM) {
+		nread = recv(ev.data.fd, buf, sizeof(buf), 0);
+	} else {
+		conn = accept(ev.data.fd, NULL, NULL);
+		if (conn < 0)
+			error(1, errno, "failed to accept");
+		nread = recv(conn, buf, sizeof(buf), 0);
+		close(conn);
+	}
+
+	if (nread < 0)
+		error(1, errno, "failed to recv");
+
+	return ev.data.fd;
+}
+
+/*
+ * Register the @count sockets in @rcv_fds with a fresh epoll instance, send
+ * one IPv4 message, and fail unless the socket that received it reports
+ * AF_INET as its SO_DOMAIN. The receive sockets are left open for the
+ * caller to close.
+ */
+static void test(int *rcv_fds, int count, int proto)
+{
+	struct epoll_event ev;
+	int epfd, idx, winner;
+	int domain;
+	socklen_t optlen = sizeof(domain);
+
+	epfd = epoll_create(1);
+	if (epfd < 0)
+		error(1, errno, "failed to create epoll");
+
+	ev.events = EPOLLIN;
+	idx = 0;
+	while (idx < count) {
+		ev.data.fd = rcv_fds[idx];
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fds[idx], &ev) != 0)
+			error(1, errno, "failed to register sock epoll");
+		idx++;
+	}
+
+	send_from_v4(proto);
+
+	winner = receive_once(epfd, proto);
+	if (getsockopt(winner, SOL_SOCKET, SO_DOMAIN, &domain, &optlen) != 0)
+		error(1, errno, "failed to read socket domain");
+	if (domain != AF_INET)
+		error(1, 0, "expected to receive on v4 socket but got v6 (%d)",
+		      domain);
+
+	close(epfd);
+}
+
+/*
+ * Entry point: for each scenario create two equally sized groups of
+ * reuseport sockets (one per address family, in the listed creation order),
+ * run test() over the combined set and close all sockets again.
+ */
+int main(void)
+{
+	/*
+	 * NOTE: UDP socket lookups traverse a different code path when there
+	 * are > 10 sockets in a group; the "(large)" scenarios cover that.
+	 */
+	static const struct {
+		const char *banner;
+		int first;		/* family created first */
+		int second;		/* family created second */
+		int proto;
+		int per_family;		/* sockets per family */
+	} runs[] = {
+		{ "---- UDP IPv4 created before IPv6 ----\n",
+		  AF_INET, AF_INET6, SOCK_DGRAM, 5 },
+		{ "---- UDP IPv6 created before IPv4 ----\n",
+		  AF_INET6, AF_INET, SOCK_DGRAM, 5 },
+		{ "---- UDP IPv4 created before IPv6 (large) ----\n",
+		  AF_INET, AF_INET6, SOCK_DGRAM, 16 },
+		{ "---- UDP IPv6 created before IPv4 (large) ----\n",
+		  AF_INET6, AF_INET, SOCK_DGRAM, 16 },
+		{ "---- TCP IPv4 created before IPv6 ----\n",
+		  AF_INET, AF_INET6, SOCK_STREAM, 5 },
+		{ "---- TCP IPv6 created before IPv4 ----\n",
+		  AF_INET6, AF_INET, SOCK_STREAM, 5 },
+	};
+	int rcv_fds[32], i, half, total;
+	unsigned int r;
+
+	for (r = 0; r < sizeof(runs) / sizeof(runs[0]); r++) {
+		half = runs[r].per_family;
+		total = 2 * half;
+
+		fputs(runs[r].banner, stderr);
+		build_rcv_fd(runs[r].first, runs[r].proto, rcv_fds, half);
+		build_rcv_fd(runs[r].second, runs[r].proto,
+			     &(rcv_fds[half]), half);
+		test(rcv_fds, total, runs[r].proto);
+		for (i = 0; i < total; ++i)
+			close(rcv_fds[i]);
+	}
+
+	fprintf(stderr, "SUCCESS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/route_localnet.sh b/tools/testing/selftests/net/route_localnet.sh
new file mode 100755
index 000000000..116bfeab7
--- /dev/null
+++ b/tools/testing/selftests/net/route_localnet.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a couple of tests when route_localnet = 1.
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+setup() {
+ ip netns add "${PEER_NS}"
+ ip -netns "${PEER_NS}" link set dev lo up
+ ip link add name veth0 type veth peer name veth1
+ ip link set dev veth0 up
+ ip link set dev veth1 netns "${PEER_NS}"
+
+ # Enable route_localnet and delete useless route 127.0.0.0/8.
+ sysctl -w net.ipv4.conf.veth0.route_localnet=1
+ ip netns exec "${PEER_NS}" sysctl -w net.ipv4.conf.veth1.route_localnet=1
+ ip route del 127.0.0.0/8 dev lo table local
+ ip netns exec "${PEER_NS}" ip route del 127.0.0.0/8 dev lo table local
+
+ ifconfig veth0 127.25.3.4/24 up
+ ip netns exec "${PEER_NS}" ifconfig veth1 127.25.3.14/24 up
+
+ ip route flush cache
+ ip netns exec "${PEER_NS}" ip route flush cache
+}
+
+cleanup() {
+ ip link del veth0
+ ip route add local 127.0.0.0/8 dev lo proto kernel scope host src 127.0.0.1
+ local -r ns="$(ip netns list|grep $PEER_NS)"
+ [ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+
+# Run test when arp_announce = 2.
+# Prints "ok" or "failed" and returns 0 if the ping succeeded, 1 otherwise.
+run_arp_announce_test() {
+	local rc=0
+
+	echo "run arp_announce test"
+	setup
+
+	sysctl -w net.ipv4.conf.veth0.arp_announce=2
+	ip netns exec "${PEER_NS}" sysctl -w net.ipv4.conf.veth1.arp_announce=2
+	if ping -c5 -I veth0 127.25.3.14; then
+		echo "ok"
+	else
+		echo "failed"
+		rc=1
+	fi
+
+	# Always clean up, then report the ping result rather than cleanup's.
+	cleanup
+	return $rc
+}
+
+# Run test when arp_ignore = 3.
+# Prints "ok" or "failed" and returns 0 if the ping succeeded, 1 otherwise.
+run_arp_ignore_test() {
+	local rc=0
+
+	echo "run arp_ignore test"
+	setup
+
+	sysctl -w net.ipv4.conf.veth0.arp_ignore=3
+	ip netns exec "${PEER_NS}" sysctl -w net.ipv4.conf.veth1.arp_ignore=3
+	if ping -c5 -I veth0 127.25.3.14; then
+		echo "ok"
+	else
+		echo "failed"
+		rc=1
+	fi
+
+	# Always clean up, then report the ping result rather than cleanup's.
+	cleanup
+	return $rc
+}
+
+# Run every test case; returns non-zero if any of them failed so that a
+# failure is visible in the status of whoever calls this function.
+run_all_tests() {
+	local rc=0
+
+	run_arp_announce_test || rc=1
+	run_arp_ignore_test || rc=1
+	return $rc
+}
+
+run_all_tests
diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh
new file mode 100755
index 000000000..cbf166df5
--- /dev/null
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -0,0 +1,1301 @@
+#!/bin/bash
+#
+# This test checks rtnetlink callpaths and tries to get as much coverage as possible.
+#
+# set -e
+
+devdummy="test-dummy0"
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Store exit status $1 in "ret", but only while "ret" is still zero, so a
+# previously recorded failure is never overwritten.
+check_err()
+{
+	[ $ret -eq 0 ] || return 0
+	ret=$1
+}
+
+# Inverse of check_err: for commands that must fail for the test to pass.
+# Sets "ret" to 1 when the given exit status $1 is zero.
+check_fail()
+{
+	[ $1 -eq 0 ] || return 0
+	ret=1
+}
+
+# Create the dummy device named by $devdummy and bring it up; failures are
+# recorded through check_err.
+kci_add_dummy()
+{
+	local rc
+
+	ip link add name "$devdummy" type dummy
+	rc=$?
+	check_err $rc
+
+	ip link set "$devdummy" up
+	rc=$?
+	check_err $rc
+}
+
+# Delete the dummy device named by $devdummy; a failure is recorded through
+# check_err.
+kci_del_dummy()
+{
+	local rc
+
+	ip link del dev "$devdummy"
+	rc=$?
+	check_err $rc
+}
+
+# Run "ip netconf show" for device $1, once without an address family and
+# once each for IPv4 and IPv6. On failure a FAIL line is printed and 1 is
+# returned; "ret" is reset to 0 if it was 0 on entry, so the failure stays
+# confined to the return value.
+kci_test_netconf()
+{
+	dev="$1"
+	r=$ret
+
+	# The empty entry stands for the plain call without -4/-6.
+	for f in "" 4 6; do
+		ip ${f:+-$f} netconf show dev "$dev" > /dev/null
+		check_err $?
+	done
+
+	[ $ret -ne 0 ] || return 0
+
+	echo "FAIL: ip netconf show $dev"
+	test $r -eq 0 && ret=0
+	return 1
+}
+
+# Put the dummy device into a new bridge, stack a VLAN device on the bridge,
+# give it IPv4/IPv6 addresses, dump links and routes, run the netconf checks
+# on all three devices and tear everything down again.
+kci_test_bridge()
+{
+	devbr="test-br0"
+	vlandev="testbr-vlan1"
+
+	local ret=0
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip link add name "$devbr" type bridge || check_err $?
+	ip link set dev "$devdummy" master "$devbr" || check_err $?
+	ip link set "$devbr" up || check_err $?
+
+	ip link add link "$devbr" name "$vlandev" type vlan id 1 || check_err $?
+	ip addr add dev "$vlandev" 10.200.7.23/30 || check_err $?
+	ip -6 addr add dev "$vlandev" dead:42::1234/64 || check_err $?
+	ip -d link > /dev/null || check_err $?
+	ip r s t all > /dev/null || check_err $?
+
+	for name in "$devbr" "$vlandev" "$devdummy" ; do
+		kci_test_netconf "$name"
+	done
+
+	ip -6 addr del dev "$vlandev" dead:42::1234/64 || check_err $?
+
+	ip link del dev "$vlandev" || check_err $?
+	ip link del dev "$devbr" || check_err $?
+
+	if [ $ret -eq 0 ]; then
+		echo "PASS: bridge setup"
+		return
+	fi
+
+	echo "FAIL: bridge setup"
+	return 1
+}
+
+# Create a GRE tunnel endpoint, assign an address and a route to it, add an
+# address to the dummy device, dump links and addresses, run the netconf
+# check on the tunnel and remove everything again.
+kci_test_gre()
+{
+	gredev=neta
+	rem=10.42.42.1
+	loc=10.0.0.1
+
+	local ret=0
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip tunnel add $gredev mode gre remote $rem local $loc ttl 1 || check_err $?
+	ip link set $gredev up || check_err $?
+	ip addr add 10.23.7.10 dev $gredev || check_err $?
+	ip route add 10.23.8.0/30 dev $gredev || check_err $?
+	ip addr add dev "$devdummy" 10.23.7.11/24 || check_err $?
+	ip link > /dev/null || check_err $?
+	ip addr > /dev/null || check_err $?
+
+	kci_test_netconf "$gredev"
+
+	ip addr del dev "$devdummy" 10.23.7.11/24 || check_err $?
+
+	ip link del $gredev || check_err $?
+
+	if [ $ret -eq 0 ]; then
+		echo "PASS: gre tunnel endpoint"
+		return
+	fi
+
+	echo "FAIL: gre tunnel endpoint"
+	return 1
+}
+
+# tc uses rtnetlink too, for full tc testing
+# please see tools/testing/selftests/tc-testing.
+# Build a small htb hierarchy with u32 filters on "lo", list the filters,
+# delete one of them, list again and remove the qdisc.
+kci_test_tc()
+{
+	dev=lo
+	local ret=0
+	local htid host
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	tc qdisc add dev "$dev" root handle 1: htb || check_err $?
+	tc class add dev "$dev" parent 1: classid 1:10 htb rate 1mbit || check_err $?
+
+	# Three u32 hash tables: ffe:, ffd: and ffc:
+	for htid in ffe ffd ffc; do
+		tc filter add dev "$dev" parent 1:0 prio 5 handle ${htid}: protocol ip u32 divisor 256 ||
+			check_err $?
+	done
+
+	# Two match rules in table ffe:2:, for 10.0.0.3 and 10.0.0.2
+	for host in 3 2; do
+		tc filter add dev "$dev" protocol ip parent 1: prio 5 handle ffe:2:${host} u32 ht ffe:2: match ip src 10.0.0.${host} flowid 1:10 ||
+			check_err $?
+	done
+
+	tc filter show dev "$dev" parent 1:0 > /dev/null || check_err $?
+	tc filter del dev "$dev" protocol ip parent 1: prio 5 handle ffe:2:3 u32 || check_err $?
+	tc filter show dev "$dev" parent 1:0 > /dev/null || check_err $?
+	tc qdisc del dev "$dev" root handle 1: htb || check_err $?
+
+	if [ $ret -eq 0 ]; then
+		echo "PASS: tc htb hierarchy"
+		return
+	fi
+
+	echo "FAIL: tc htb hierarchy"
+	return 1
+}
+
+# Add a fwmark rule pointing at table 100 plus a local default route in that
+# table, dump all routes, then delete the rule and the route again.
+kci_test_polrouting()
+{
+	local ret=0
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip rule add fwmark 1 lookup 100 || check_err $?
+	ip route add local 0.0.0.0/0 dev lo table 100 || check_err $?
+	ip r s t all > /dev/null || check_err $?
+	ip rule del fwmark 1 lookup 100 || check_err $?
+	ip route del local 0.0.0.0/0 dev lo table 100 || check_err $?
+
+	if [ $ret -eq 0 ]; then
+		echo "PASS: policy routing"
+		return
+	fi
+
+	echo "FAIL: policy route test"
+	return 1
+}
+
+kci_test_route_get()
+{
+ local hash_policy=$(sysctl -n net.ipv4.fib_multipath_hash_policy)
+
+ local ret=0
+
+ ip route get 127.0.0.1 > /dev/null
+ check_err $?
+ ip route get 127.0.0.1 dev "$devdummy" > /dev/null
+ check_err $?
+ ip route get ::1 > /dev/null
+ check_err $?
+ ip route get fe80::1 dev "$devdummy" > /dev/null
+ check_err $?
+ ip route get 127.0.0.1 from 127.0.0.1 oif lo tos 0x1 mark 0x1 > /dev/null
+ check_err $?
+ ip route get ::1 from ::1 iif lo oif lo tos 0x1 mark 0x1 > /dev/null
+ check_err $?
+ ip addr add dev "$devdummy" 10.23.7.11/24
+ check_err $?
+ ip route get 10.23.7.11 from 10.23.7.12 iif "$devdummy" > /dev/null
+ check_err $?
+ ip route add 10.23.8.0/24 \
+ nexthop via 10.23.7.13 dev "$devdummy" \
+ nexthop via 10.23.7.14 dev "$devdummy"
+ check_err $?
+ sysctl -wq net.ipv4.fib_multipath_hash_policy=0
+ ip route get 10.23.8.11 > /dev/null
+ check_err $?
+ sysctl -wq net.ipv4.fib_multipath_hash_policy=1
+ ip route get 10.23.8.11 > /dev/null
+ check_err $?
+ sysctl -wq net.ipv4.fib_multipath_hash_policy="$hash_policy"
+ ip route del 10.23.8.0/24
+ check_err $?
+ ip addr del dev "$devdummy" 10.23.7.11/24
+ check_err $?
+
+ if [ $ret -ne 0 ];then
+ echo "FAIL: route get"
+ return 1
+ fi
+
+ echo "PASS: route get"
+}
+
+kci_test_addrlft()
+{
+ for i in $(seq 10 100) ;do
+ lft=$(((RANDOM%3) + 1))
+ ip addr add 10.23.11.$i/32 dev "$devdummy" preferred_lft $lft valid_lft $((lft+1))
+ check_err $?
+ done
+
+ sleep 5
+
+ ip addr show dev "$devdummy" | grep "10.23.11."
+ if [ $? -eq 0 ]; then
+ echo "FAIL: preferred_lft addresses remaining"
+ check_err 1
+ return
+ fi
+
+ echo "PASS: preferred_lft addresses have expired"
+}
+
+# Enable promote_secondaries on the dummy device, add addresses
+# 10.23.11.2 - 10.23.11.254 with both "ip" and "ifconfig", flush them all
+# and switch the sysctl back off if it was off before.
+kci_test_promote_secondaries()
+{
+	promote=$(sysctl -n net.ipv4.conf.$devdummy.promote_secondaries)
+
+	sysctl -q net.ipv4.conf.$devdummy.promote_secondaries=1
+
+	for i in $(seq 2 254);do
+		IP="10.23.11.$i"
+		ip -f inet addr add $IP/16 brd + dev "$devdummy"
+		ifconfig "$devdummy" $IP netmask 255.255.0.0
+	done
+
+	ip addr flush dev "$devdummy"
+
+	if [ $promote -eq 0 ]; then
+		sysctl -q net.ipv4.conf.$devdummy.promote_secondaries=0
+	fi
+
+	echo "PASS: promote_secondaries complete"
+}
+
+kci_test_addrlabel()
+{
+ local ret=0
+
+ ip addrlabel add prefix dead::/64 dev lo label 1
+ check_err $?
+
+ ip addrlabel list |grep -q "prefix dead::/64 dev lo label 1"
+ check_err $?
+
+ ip addrlabel del prefix dead::/64 dev lo label 1 2> /dev/null
+ check_err $?
+
+ ip addrlabel add prefix dead::/64 label 1 2> /dev/null
+ check_err $?
+
+ ip addrlabel del prefix dead::/64 label 1 2> /dev/null
+ check_err $?
+
+ # concurrent add/delete
+ for i in $(seq 1 1000); do
+ ip addrlabel add prefix 1c3::/64 label 12345 2>/dev/null
+ done &
+
+ for i in $(seq 1 1000); do
+ ip addrlabel del prefix 1c3::/64 label 12345 2>/dev/null
+ done
+
+ wait
+
+ ip addrlabel del prefix 1c3::/64 label 12345 2>/dev/null
+
+ if [ $ret -ne 0 ];then
+ echo "FAIL: ipv6 addrlabel"
+ return 1
+ fi
+
+ echo "PASS: ipv6 addrlabel"
+}
+
+# Set an interface alias on the dummy device through "ip link" and, when the
+# sysfs ifalias file is readable, also read, overwrite and clear it there,
+# including 100 concurrent writers.
+kci_test_ifalias()
+{
+	local ret=0
+	namewant=$(uuidgen)
+	syspathname="/sys/class/net/$devdummy/ifalias"
+
+	if ! ip link set dev "$devdummy" alias "$namewant"; then
+		echo "FAIL: cannot set interface alias of $devdummy to $namewant"
+		return 1
+	fi
+
+	ip link show "$devdummy" | grep -q "alias $namewant" || check_err $?
+
+	if [ -r "$syspathname" ] ; then
+		read namehave < "$syspathname"
+		if [ "$namewant" != "$namehave" ]; then
+			echo "FAIL: did set ifalias $namewant but got $namehave"
+			return 1
+		fi
+
+		# Set a fresh alias through sysfs and look for it via "ip link".
+		namewant=$(uuidgen)
+		echo "$namewant" > "$syspathname"
+		ip link show "$devdummy" | grep -q "alias $namewant" || check_err $?
+
+		# sysfs interface allows to delete alias again
+		echo "" > "$syspathname"
+
+		ip link show "$devdummy" | grep -q "alias $namewant"
+		check_fail $?
+
+		for i in $(seq 1 100); do
+			uuidgen > "$syspathname" &
+		done
+
+		wait
+
+		# re-add the alias -- kernel should free mem when dummy dev is removed
+		ip link set dev "$devdummy" alias "$namewant" || check_err $?
+	fi
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: set interface alias $devdummy to $namewant"
+		return 1
+	fi
+
+	echo "PASS: set ifalias $namewant for $devdummy"
+}
+
+# Create a VRF device, check that it is listed, bring it up, enslave the
+# dummy device to it and delete it again. Returns $ksft_skip when "ip" does
+# not understand the vrf link type.
+kci_test_vrf()
+{
+	vrfname="test-vrf"
+	local ret=0
+
+	if ! ip link show type vrf 2>/dev/null; then
+		echo "SKIP: vrf: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	# Failing to create the device is reported but returns 0.
+	if ! ip link add "$vrfname" type vrf table 10; then
+		echo "FAIL: can't add vrf interface, skipping test"
+		return 0
+	fi
+
+	if ! ip -br link show type vrf | grep -q "$vrfname"; then
+		echo "FAIL: created vrf device not found"
+		return 1
+	fi
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip link set dev "$vrfname" up || check_err $?
+
+	ip link set dev "$devdummy" master "$vrfname" || check_err $?
+	ip link del dev "$vrfname" || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: vrf"
+		return 1
+	fi
+
+	echo "PASS: vrf"
+}
+
+# Create a vxlan device with a VLAN on top inside namespace $1, then try a
+# series of "ip link set ... type vxlan" changes: "ttl 64" and "nolearning"
+# must succeed, every other attempted change must be rejected.
+kci_test_encap_vxlan()
+{
+	local ret=0
+	local attr
+	vxlan="test-vxlan0"
+	vlan="test-vlan0"
+	testns="$1"
+
+	# Failing to create the device is reported but returns 0.
+	if ! ip -netns "$testns" link add "$vxlan" type vxlan id 42 group 239.1.1.1 \
+		dev "$devdummy" dstport 4789 2>/dev/null; then
+		echo "FAIL: can't add vxlan interface, skipping test"
+		return 0
+	fi
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip -netns "$testns" addr add 10.2.11.49/24 dev "$vxlan" || check_err $?
+
+	ip -netns "$testns" link set up dev "$vxlan" || check_err $?
+
+	ip -netns "$testns" link add link "$vxlan" name "$vlan" type vlan id 1 ||
+		check_err $?
+
+	# changelink testcases
+	ip -netns "$testns" link set dev "$vxlan" type vxlan vni 43 2>/dev/null
+	check_fail $?
+
+	ip -netns "$testns" link set dev "$vxlan" type vxlan group ffe5::5 dev "$devdummy" 2>/dev/null
+	check_fail $?
+
+	ip -netns "$testns" link set dev "$vxlan" type vxlan ttl inherit 2>/dev/null
+	check_fail $?
+
+	ip -netns "$testns" link set dev "$vxlan" type vxlan ttl 64 || check_err $?
+
+	ip -netns "$testns" link set dev "$vxlan" type vxlan nolearning || check_err $?
+
+	# Single-keyword attributes that must all be refused on changelink.
+	for attr in proxy norsc l2miss l3miss external udpcsum \
+		    udp6zerocsumtx udp6zerocsumrx remcsumtx remcsumrx \
+		    gbp gpe; do
+		ip -netns "$testns" link set dev "$vxlan" type vxlan $attr 2>/dev/null
+		check_fail $?
+	done
+
+	ip -netns "$testns" link del "$vxlan" || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: vxlan"
+		return 1
+	fi
+	echo "PASS: vxlan"
+}
+
+# Add and delete FOU ports inside namespace $1; deleting a port that was
+# never added must fail. Returns $ksft_skip when "ip fou" or the fou module
+# is not available.
+kci_test_encap_fou()
+{
+	local ret=0
+	name="test-fou"
+	testns="$1"
+
+	if ! ip fou help 2>&1 |grep -q 'Usage: ip fou'; then
+		echo "SKIP: fou: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	# Dry run (-n) first to see whether the module can be found at all.
+	if ! /sbin/modprobe -q -n fou; then
+		echo "SKIP: module fou is not found"
+		return $ksft_skip
+	fi
+	/sbin/modprobe -q fou
+
+	if ! ip -netns "$testns" fou add port 7777 ipproto 47 2>/dev/null; then
+		echo "FAIL: can't add fou port 7777, skipping test"
+		return 1
+	fi
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip -netns "$testns" fou add port 8888 ipproto 4 || check_err $?
+
+	ip -netns "$testns" fou del port 9999 2>/dev/null
+	check_fail $?
+
+	ip -netns "$testns" fou del port 7777 || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: fou"
+		return 1
+	fi
+
+	echo "PASS: fou"
+}
+
+# test various encap methods, use netns to avoid unwanted interference
+kci_test_encap()
+{
+ testns="testns"
+ local ret=0
+
+ ip netns add "$testns"
+ if [ $? -ne 0 ]; then
+ echo "SKIP encap tests: cannot add net namespace $testns"
+ return $ksft_skip
+ fi
+
+ ip -netns "$testns" link set lo up
+ check_err $?
+
+ ip -netns "$testns" link add name "$devdummy" type dummy
+ check_err $?
+ ip -netns "$testns" link set "$devdummy" up
+ check_err $?
+
+ kci_test_encap_vxlan "$testns"
+ check_err $?
+ kci_test_encap_fou "$testns"
+ check_err $?
+
+ ip netns del "$testns"
+ return $ret
+}
+
+# Create a macsec device on top of the dummy device, add a TX SA, an RX
+# channel and an RX SA, dump the configuration and delete the device.
+# Returns $ksft_skip when "ip macsec" is not available.
+kci_test_macsec()
+{
+	msname="test_macsec0"
+	local ret=0
+
+	if ! ip macsec help 2>&1 | grep -q "^Usage: ip macsec"; then
+		echo "SKIP: macsec: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	if ! ip link add link "$devdummy" "$msname" type macsec port 42 encrypt on; then
+		echo "FAIL: can't add macsec interface, skipping test"
+		return 1
+	fi
+
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip macsec add "$msname" tx sa 0 pn 1024 on key 01 12345678901234567890123456789012 ||
+		check_err $?
+
+	ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" ||
+		check_err $?
+
+	ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" sa 0 pn 1 on key 00 0123456789abcdef0123456789abcdef ||
+		check_err $?
+
+	ip macsec show > /dev/null || check_err $?
+
+	ip link del dev "$msname" || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: macsec"
+		return 1
+	fi
+
+	echo "PASS: macsec"
+}
+
+#-------------------------------------------------------------------
+# Example commands
+# ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07 replay-window 32 \
+# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+# sel src 14.0.0.52/24 dst 14.0.0.70/24
+# ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+# tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07
+#
+# Subcommands not tested
+# ip x s update
+# ip x s allocspi
+# ip x s deleteall
+# ip x p update
+# ip x p deleteall
+# ip x p set
+#-------------------------------------------------------------------
+# Add, list, count, get and delete one xfrm state and one xfrm policy while
+# "ip x m" (xfrm monitor) logs to a temp file in the background, then check
+# the number of lines the monitor wrote.
+# Prints PASS/FAIL; returns 1 on failure.
+kci_test_ipsec()
+{
+ local ret=0
+ algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+ srcip=192.168.123.1
+ dstip=192.168.123.2
+ spi=7
+
+ ip addr add $srcip dev $devdummy
+
+ # flush to be sure there's nothing configured
+ # (only the status of the policy flush reaches check_err)
+ ip x s flush ; ip x p flush
+ check_err $?
+
+ # start the monitor in the background
+ tmpfile=`mktemp /var/run/ipsectestXXX`
+ mpid=`(ip x m > $tmpfile & echo $!) 2>/dev/null`
+ sleep 0.2
+
+ # $ipsecid and $algo are expanded unquoted on purpose: each holds
+ # several words that must become separate arguments.
+ ipsecid="proto esp src $srcip dst $dstip spi 0x07"
+ ip x s add $ipsecid \
+ mode transport reqid 0x07 replay-window 32 \
+ $algo sel src $srcip/24 dst $dstip/24
+ check_err $?
+
+ # the new state must show up as two lines naming both addresses
+ lines=`ip x s list | grep $srcip | grep $dstip | wc -l`
+ test $lines -eq 2
+ check_err $?
+
+ ip x s count | grep -q "SAD count 1"
+ check_err $?
+
+ lines=`ip x s get $ipsecid | grep $srcip | grep $dstip | wc -l`
+ test $lines -eq 2
+ check_err $?
+
+ ip x s delete $ipsecid
+ check_err $?
+
+ # after the delete the state list must be empty
+ lines=`ip x s list | wc -l`
+ test $lines -eq 0
+ check_err $?
+
+ # same sequence for a policy; $ipsecsel is also split into words
+ ipsecsel="dir out src $srcip/24 dst $dstip/24"
+ ip x p add $ipsecsel \
+ tmpl proto esp src $srcip dst $dstip \
+ spi 0x07 mode transport reqid 0x07
+ check_err $?
+
+ lines=`ip x p list | grep $srcip | grep $dstip | wc -l`
+ test $lines -eq 2
+ check_err $?
+
+ ip x p count | grep -q "SPD IN 0 OUT 1 FWD 0"
+ check_err $?
+
+ lines=`ip x p get $ipsecsel | grep $srcip | grep $dstip | wc -l`
+ test $lines -eq 2
+ check_err $?
+
+ ip x p delete $ipsecsel
+ check_err $?
+
+ lines=`ip x p list | wc -l`
+ test $lines -eq 0
+ check_err $?
+
+ # check the monitor results
+ # (the commands above are expected to have produced exactly 20 lines)
+ kill $mpid
+ lines=`wc -l $tmpfile | cut "-d " -f1`
+ test $lines -eq 20
+ check_err $?
+ rm -rf $tmpfile
+
+ # clean up any leftovers
+ ip x s flush
+ check_err $?
+ ip x p flush
+ check_err $?
+ ip addr del $srcip/32 dev $devdummy
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec"
+ return 1
+ fi
+ echo "PASS: ipsec"
+}
+
+#-------------------------------------------------------------------
+# Example commands
+# ip x s add proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07 replay-window 32 \
+# aead 'rfc4106(gcm(aes))' 1234567890123456dcba 128 \
+# sel src 14.0.0.52/24 dst 14.0.0.70/24
+# offload dev sim1 dir out
+# ip x p add dir out src 14.0.0.52/24 dst 14.0.0.70/24 \
+# tmpl proto esp src 14.0.0.52 dst 14.0.0.70 \
+# spi 0x07 mode transport reqid 0x07
+#
+#-------------------------------------------------------------------
+kci_test_ipsec_offload()
+{
+ local ret=0
+ algo="aead rfc4106(gcm(aes)) 0x3132333435363738393031323334353664636261 128"
+ srcip=192.168.123.3
+ dstip=192.168.123.4
+ sysfsd=/sys/kernel/debug/netdevsim/netdevsim0/ports/0/
+ sysfsf=$sysfsd/ipsec
+ sysfsnet=/sys/bus/netdevsim/devices/netdevsim0/net/
+ probed=false
+
+ # setup netdevsim since dummydev doesn't have offload support
+ if [ ! -w /sys/bus/netdevsim/new_device ] ; then
+ modprobe -q netdevsim
+ check_err $?
+ if [ $ret -ne 0 ]; then
+ echo "SKIP: ipsec_offload can't load netdevsim"
+ return $ksft_skip
+ fi
+ probed=true
+ fi
+
+ echo "0" > /sys/bus/netdevsim/new_device
+ while [ ! -d $sysfsnet ] ; do :; done
+ udevadm settle
+ dev=`ls $sysfsnet`
+
+ ip addr add $srcip dev $dev
+ ip link set $dev up
+ if [ ! -d $sysfsd ] ; then
+ echo "FAIL: ipsec_offload can't create device $dev"
+ return 1
+ fi
+ if [ ! -f $sysfsf ] ; then
+ echo "FAIL: ipsec_offload netdevsim doesn't support IPsec offload"
+ return 1
+ fi
+
+ # flush to be sure there's nothing configured
+ ip x s flush ; ip x p flush
+
+ # create offloaded SAs, both in and out
+ ip x p add dir out src $srcip/24 dst $dstip/24 \
+ tmpl proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42
+ check_err $?
+ ip x p add dir in src $dstip/24 dst $srcip/24 \
+ tmpl proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42
+ check_err $?
+
+ ip x s add proto esp src $srcip dst $dstip spi 9 \
+ mode transport reqid 42 $algo sel src $srcip/24 dst $dstip/24 \
+ offload dev $dev dir out
+ check_err $?
+ ip x s add proto esp src $dstip dst $srcip spi 9 \
+ mode transport reqid 42 $algo sel src $dstip/24 dst $srcip/24 \
+ offload dev $dev dir in
+ check_err $?
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec_offload can't create SA"
+ return 1
+ fi
+
+ # does offload show up in ip output
+ lines=`ip x s list | grep -c "crypto offload parameters: dev $dev dir"`
+ if [ $lines -ne 2 ] ; then
+ echo "FAIL: ipsec_offload SA offload missing from list output"
+ check_err 1
+ fi
+
+ # use ping to exercise the Tx path
+ ping -I $dev -c 3 -W 1 -i 0 $dstip >/dev/null
+
+ # does driver have correct offload info
+ diff $sysfsf - << EOF
+SA count=2 tx=3
+sa[0] tx ipaddr=0x00000000 00000000 00000000 00000000
+sa[0] spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
+sa[0] key=0x34333231 38373635 32313039 36353433
+sa[1] rx ipaddr=0x00000000 00000000 00000000 037ba8c0
+sa[1] spi=0x00000009 proto=0x32 salt=0x61626364 crypt=1
+sa[1] key=0x34333231 38373635 32313039 36353433
+EOF
+ if [ $? -ne 0 ] ; then
+ echo "FAIL: ipsec_offload incorrect driver data"
+ check_err 1
+ fi
+
+ # does offload get removed from driver
+ ip x s flush
+ ip x p flush
+ lines=`grep -c "SA count=0" $sysfsf`
+ if [ $lines -ne 1 ] ; then
+ echo "FAIL: ipsec_offload SA not removed from driver"
+ check_err 1
+ fi
+
+ # clean up any leftovers
+ echo 0 > /sys/bus/netdevsim/del_device
+ $probed && rmmod netdevsim
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ipsec_offload"
+ return 1
+ fi
+ echo "PASS: ipsec_offload"
+}
+
+kci_test_gretap()
+{
+ testns="testns"
+ DEV_NS=gretap00
+ local ret=0
+
+ ip netns add "$testns"
+ if [ $? -ne 0 ]; then
+ echo "SKIP gretap tests: cannot add net namespace $testns"
+ return $ksft_skip
+ fi
+
+ ip link help gretap 2>&1 | grep -q "^Usage:"
+ if [ $? -ne 0 ];then
+ echo "SKIP: gretap: iproute2 too old"
+ ip netns del "$testns"
+ return $ksft_skip
+ fi
+
+ # test native tunnel
+ ip -netns "$testns" link add dev "$DEV_NS" type gretap seq \
+ key 102 local 172.16.1.100 remote 172.16.1.200
+ check_err $?
+
+ ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24
+ check_err $?
+
+ ip -netns "$testns" link set dev $DEV_NS up
+ check_err $?
+
+ ip -netns "$testns" link del "$DEV_NS"
+ check_err $?
+
+ # test external mode
+ ip -netns "$testns" link add dev "$DEV_NS" type gretap external
+ check_err $?
+
+ ip -netns "$testns" link del "$DEV_NS"
+ check_err $?
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: gretap"
+ ip netns del "$testns"
+ return 1
+ fi
+ echo "PASS: gretap"
+
+ ip netns del "$testns"
+}
+
+kci_test_ip6gretap()
+{
+ testns="testns"
+ DEV_NS=ip6gretap00
+ local ret=0
+
+ ip netns add "$testns"
+ if [ $? -ne 0 ]; then
+ echo "SKIP ip6gretap tests: cannot add net namespace $testns"
+ return $ksft_skip
+ fi
+
+ ip link help ip6gretap 2>&1 | grep -q "^Usage:"
+ if [ $? -ne 0 ];then
+ echo "SKIP: ip6gretap: iproute2 too old"
+ ip netns del "$testns"
+ return $ksft_skip
+ fi
+
+ # test native tunnel
+ ip -netns "$testns" link add dev "$DEV_NS" type ip6gretap seq \
+ key 102 local fc00:100::1 remote fc00:100::2
+ check_err $?
+
+ ip -netns "$testns" addr add dev "$DEV_NS" fc00:200::1/96
+ check_err $?
+
+ ip -netns "$testns" link set dev $DEV_NS up
+ check_err $?
+
+ ip -netns "$testns" link del "$DEV_NS"
+ check_err $?
+
+ # test external mode
+ ip -netns "$testns" link add dev "$DEV_NS" type ip6gretap external
+ check_err $?
+
+ ip -netns "$testns" link del "$DEV_NS"
+ check_err $?
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: ip6gretap"
+ ip netns del "$testns"
+ return 1
+ fi
+ echo "PASS: ip6gretap"
+
+ ip netns del "$testns"
+}
+
+# Inside a dedicated namespace, create an erspan device three times: native
+# mode with erspan_ver 1, native mode with erspan_ver 2, and external mode.
+# Returns $ksft_skip if "ip" does not know erspan or the namespace cannot be
+# created.
+kci_test_erspan()
+{
+	testns="testns"
+	DEV_NS=erspan00
+	local ret=0
+
+	if ! ip link help erspan 2>&1 | grep -q "^Usage:"; then
+		echo "SKIP: erspan: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	if ! ip netns add "$testns"; then
+		echo "SKIP erspan tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+
+	# test native tunnel erspan v1
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip -netns "$testns" link add dev "$DEV_NS" type erspan seq \
+		key 102 local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 1 erspan 488 ||
+		check_err $?
+
+	ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 || check_err $?
+
+	ip -netns "$testns" link set dev $DEV_NS up || check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	# test native tunnel erspan v2
+	ip -netns "$testns" link add dev "$DEV_NS" type erspan seq \
+		key 102 local 172.16.1.100 remote 172.16.1.200 \
+		erspan_ver 2 erspan_dir ingress erspan_hwid 7 ||
+		check_err $?
+
+	ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 || check_err $?
+
+	ip -netns "$testns" link set dev $DEV_NS up || check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	# test external mode
+	ip -netns "$testns" link add dev "$DEV_NS" type erspan external ||
+		check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: erspan"
+		ip netns del "$testns"
+		return 1
+	fi
+	echo "PASS: erspan"
+
+	ip netns del "$testns"
+}
+
+# Inside a dedicated namespace, create an ip6erspan device three times:
+# native mode with erspan_ver 1, native mode with erspan_ver 2, and external
+# mode. Returns $ksft_skip if "ip" does not know ip6erspan or the namespace
+# cannot be created.
+kci_test_ip6erspan()
+{
+	testns="testns"
+	DEV_NS=ip6erspan00
+	local ret=0
+
+	if ! ip link help ip6erspan 2>&1 | grep -q "^Usage:"; then
+		echo "SKIP: ip6erspan: iproute2 too old"
+		return $ksft_skip
+	fi
+
+	if ! ip netns add "$testns"; then
+		echo "SKIP ip6erspan tests: cannot add net namespace $testns"
+		return $ksft_skip
+	fi
+
+	# test native tunnel ip6erspan v1
+	# check_err with status 0 is a no-op, so only call it on failure.
+	ip -netns "$testns" link add dev "$DEV_NS" type ip6erspan seq \
+		key 102 local fc00:100::1 remote fc00:100::2 \
+		erspan_ver 1 erspan 488 ||
+		check_err $?
+
+	ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 || check_err $?
+
+	ip -netns "$testns" link set dev $DEV_NS up || check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	# test native tunnel ip6erspan v2
+	ip -netns "$testns" link add dev "$DEV_NS" type ip6erspan seq \
+		key 102 local fc00:100::1 remote fc00:100::2 \
+		erspan_ver 2 erspan_dir ingress erspan_hwid 7 ||
+		check_err $?
+
+	ip -netns "$testns" addr add dev "$DEV_NS" 10.1.1.100/24 || check_err $?
+
+	ip -netns "$testns" link set dev $DEV_NS up || check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	# test external mode
+	ip -netns "$testns" link add dev "$DEV_NS" \
+		type ip6erspan external ||
+		check_err $?
+
+	ip -netns "$testns" link del "$DEV_NS" || check_err $?
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: ip6erspan"
+		ip netns del "$testns"
+		return 1
+	fi
+	echo "PASS: ip6erspan"
+
+	ip netns del "$testns"
+}
+
+kci_test_fdb_get()
+{
+ IP="ip -netns testns"
+ BRIDGE="bridge -netns testns"
+ brdev="test-br0"
+ vxlandev="vxlan10"
+ test_mac=de:ad:be:ef:13:37
+ localip="10.0.2.2"
+ dstip="10.0.2.3"
+ local ret=0
+
+ bridge fdb help 2>&1 |grep -q 'bridge fdb get'
+ if [ $? -ne 0 ];then
+ echo "SKIP: fdb get tests: iproute2 too old"
+ return $ksft_skip
+ fi
+
+ ip netns add testns
+ if [ $? -ne 0 ]; then
+ echo "SKIP fdb get tests: cannot add net namespace $testns"
+ return $ksft_skip
+ fi
+
+ $IP link add "$vxlandev" type vxlan id 10 local $localip \
+ dstport 4789 2>/dev/null
+ check_err $?
+ $IP link add name "$brdev" type bridge &>/dev/null
+ check_err $?
+ $IP link set dev "$vxlandev" master "$brdev" &>/dev/null
+ check_err $?
+ $BRIDGE fdb add $test_mac dev "$vxlandev" master &>/dev/null
+ check_err $?
+ $BRIDGE fdb add $test_mac dev "$vxlandev" dst $dstip self &>/dev/null
+ check_err $?
+
+ $BRIDGE fdb get $test_mac brport "$vxlandev" 2>/dev/null | grep -q "dev $vxlandev master $brdev"
+ check_err $?
+ $BRIDGE fdb get $test_mac br "$brdev" 2>/dev/null | grep -q "dev $vxlandev master $brdev"
+ check_err $?
+ $BRIDGE fdb get $test_mac dev "$vxlandev" self 2>/dev/null | grep -q "dev $vxlandev dst $dstip"
+ check_err $?
+
+ ip netns del testns &>/dev/null
+
+ if [ $ret -ne 0 ]; then
+ echo "FAIL: bridge fdb get"
+ return 1
+ fi
+
+ echo "PASS: bridge fdb get"
+}
+
+kci_test_neigh_get()
+{
+ dstmac=de:ad:be:ef:13:37
+ dstip=10.0.2.4
+ dstip6=dead::2
+ local ret=0
+
+ ip neigh help 2>&1 |grep -q 'ip neigh get'
+ if [ $? -ne 0 ];then
+ echo "SKIP: fdb get tests: iproute2 too old"
+ return $ksft_skip
+ fi
+
+ # ipv4
+ ip neigh add $dstip lladdr $dstmac dev "$devdummy" > /dev/null
+ check_err $?
+ ip neigh get $dstip dev "$devdummy" 2> /dev/null | grep -q "$dstmac"
+ check_err $?
+ ip neigh del $dstip lladdr $dstmac dev "$devdummy" > /dev/null
+ check_err $?
+
+ # ipv4 proxy
+ ip neigh add proxy $dstip dev "$devdummy" > /dev/null
+ check_err $?
+ ip neigh get proxy $dstip dev "$devdummy" 2>/dev/null | grep -q "$dstip"
+ check_err $?
+ ip neigh del proxy $dstip dev "$devdummy" > /dev/null
+ check_err $?
+
+ # ipv6
+ ip neigh add $dstip6 lladdr $dstmac dev "$devdummy" > /dev/null
+ check_err $?
+ ip neigh get $dstip6 dev "$devdummy" 2> /dev/null | grep -q "$dstmac"
+ check_err $?
+ ip neigh del $dstip6 lladdr $dstmac dev "$devdummy" > /dev/null
+ check_err $?
+
+ # ipv6 proxy
+ ip neigh add proxy $dstip6 dev "$devdummy" > /dev/null
+ check_err $?
+ ip neigh get proxy $dstip6 dev "$devdummy" 2>/dev/null | grep -q "$dstip6"
+ check_err $?
+ ip neigh del proxy $dstip6 dev "$devdummy" > /dev/null
+ check_err $?
+
+ if [ $ret -ne 0 ];then
+ echo "FAIL: neigh get"
+ return 1
+ fi
+
+ echo "PASS: neigh get"
+}
+
+# Enslave a bond made of two netdevsim ports to a bridge and check that the
+# final "set master" step succeeds.
+# Loads netdevsim if its sysfs control file is not writable, and unloads it
+# again afterwards in that case.
+# Returns $ksft_skip when netdevsim cannot be loaded, 1 on failure, 0 otherwise.
+kci_test_bridge_parent_id()
+{
+	local ret=0
+	local nsim_id
+	local nsim_tries
+	sysfsnet=/sys/bus/netdevsim/devices/netdevsim
+	probed=false
+
+	if [ ! -w /sys/bus/netdevsim/new_device ] ; then
+		modprobe -q netdevsim
+		check_err $?
+		if [ $ret -ne 0 ]; then
+			echo "SKIP: bridge_parent_id can't load netdevsim"
+			return $ksft_skip
+		fi
+		probed=true
+	fi
+
+	for nsim_id in 10 20; do
+		echo "$nsim_id 1" > /sys/bus/netdevsim/new_device
+		# Wait for the device directory, but give up after ~5s instead
+		# of spinning forever when the device never shows up.
+		nsim_tries=50
+		while [ ! -d "${sysfsnet}${nsim_id}" ] && [ $nsim_tries -gt 0 ]; do
+			sleep 0.1
+			nsim_tries=$((nsim_tries - 1))
+		done
+		[ -d "${sysfsnet}${nsim_id}" ]
+		check_err $?
+	done
+
+	if [ $ret -eq 0 ]; then
+		udevadm settle
+		dev10=$(ls ${sysfsnet}10/net/)
+		dev20=$(ls ${sysfsnet}20/net/)
+
+		ip link add name test-bond0 type bond mode 802.3ad
+		ip link set dev $dev10 master test-bond0
+		ip link set dev $dev20 master test-bond0
+		ip link add name test-br0 type bridge
+		# only this last step decides the verdict
+		ip link set dev test-bond0 master test-br0
+		check_err $?
+
+		# clean up any leftovers
+		ip link del dev test-br0
+		ip link del dev test-bond0
+	fi
+
+	# remove only the netdevsim devices that actually exist
+	for nsim_id in 20 10; do
+		if [ -d "${sysfsnet}${nsim_id}" ]; then
+			echo $nsim_id > /sys/bus/netdevsim/del_device
+		fi
+	done
+	$probed && rmmod netdevsim
+
+	if [ $ret -ne 0 ]; then
+		echo "FAIL: bridge_parent_id"
+		return 1
+	fi
+	echo "PASS: bridge_parent_id"
+}
+
+# Run every rtnetlink sub-test between kci_add_dummy and kci_del_dummy.
+#
+# Returns the first failing status if any sub-test failed, $ksft_skip if
+# nothing failed but at least one sub-test was skipped, and 0 otherwise.
+# Skips are tracked separately from failures so that an early skip can no
+# longer hide a later failure behind the "skipped" exit code.
+kci_test_rtnl()
+{
+	local ret=0
+	# Prefixed names: bash scoping is dynamic, so a sub-test assigning an
+	# undeclared variable of the same name would overwrite these.
+	local rtnl_skipped=false
+	local rtnl_rc
+	local rtnl_test
+
+	# NOTE(review): kci_add_dummy presumably reports errors by updating
+	# "ret" in this scope (through check_err) - confirm against its body.
+	kci_add_dummy
+	if [ $ret -ne 0 ];then
+		echo "FAIL: cannot add dummy interface"
+		return 1
+	fi
+
+	for rtnl_test in \
+		kci_test_polrouting \
+		kci_test_route_get \
+		kci_test_addrlft \
+		kci_test_promote_secondaries \
+		kci_test_tc \
+		kci_test_gre \
+		kci_test_gretap \
+		kci_test_ip6gretap \
+		kci_test_erspan \
+		kci_test_ip6erspan \
+		kci_test_bridge \
+		kci_test_addrlabel \
+		kci_test_ifalias \
+		kci_test_vrf \
+		kci_test_encap \
+		kci_test_macsec \
+		kci_test_ipsec \
+		kci_test_ipsec_offload \
+		kci_test_fdb_get \
+		kci_test_neigh_get \
+		kci_test_bridge_parent_id
+	do
+		"$rtnl_test"
+		rtnl_rc=$?
+		if [ "$rtnl_rc" -eq "$ksft_skip" ]; then
+			rtnl_skipped=true
+		elif [ "$rtnl_rc" -ne 0 ] && [ $ret -eq 0 ]; then
+			# keep the first real failure
+			ret=$rtnl_rc
+		fi
+	done
+
+	kci_del_dummy
+
+	# Report "skipped" only when there was no failure to report.
+	if [ $ret -eq 0 ] && $rtnl_skipped; then
+		ret=$ksft_skip
+	fi
+	return $ret
+}
+
+#check for needed privileges
+if [ "$(id -u)" -ne 0 ];then
+ echo "SKIP: Need root privileges"
+ exit $ksft_skip
+fi
+
+for x in ip tc;do
+ $x -Version 2>/dev/null >/dev/null
+ if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without the $x tool"
+ exit $ksft_skip
+ fi
+done
+
+kci_test_rtnl
+
+exit $?
diff --git a/tools/testing/selftests/net/run_afpackettests b/tools/testing/selftests/net/run_afpackettests
new file mode 100755
index 000000000..8b42e8b04
--- /dev/null
+++ b/tools/testing/selftests/net/run_afpackettests
@@ -0,0 +1,46 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Run the AF_PACKET selftests through ./in_netns.sh and exit non-zero if any
+# of them failed.
+
+if [ "$(id -u)" != 0 ]; then
+	echo "$0: must be run as root" >&2
+	# 4 is the kselftest "skipped" code; exiting 0 here would report a
+	# pass although no test ran.
+	exit 4
+fi
+
+ret=0
+
+# Print the banner announcing test "$1".
+banner()
+{
+	echo "--------------------"
+	echo "running $1 test"
+	echo "--------------------"
+}
+
+# Run ./$1 through ./in_netns.sh, print [PASS]/[FAIL], record failure in ret.
+run_in_netns()
+{
+	if ./in_netns.sh "./$1"; then
+		echo "[PASS]"
+	else
+		echo "[FAIL]"
+		ret=1
+	fi
+}
+
+banner psock_fanout
+run_in_netns psock_fanout
+
+banner psock_tpacket
+if [ -f /proc/kallsyms ]; then
+	run_in_netns psock_tpacket
+else
+	echo "[SKIP] CONFIG_KALLSYMS not enabled"
+fi
+
+banner txring_overwrite
+run_in_netns txring_overwrite
+
+exit $ret
diff --git a/tools/testing/selftests/net/run_netsocktests b/tools/testing/selftests/net/run_netsocktests
new file mode 100755
index 000000000..14e41faf2
--- /dev/null
+++ b/tools/testing/selftests/net/run_netsocktests
@@ -0,0 +1,13 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# Run the ./socket test binary and report its result; exit 1 on failure.
+echo "--------------------"
+echo "running socket test"
+echo "--------------------"
+if ./socket; then
+	echo "[PASS]"
+else
+	echo "[FAIL]"
+	exit 1
+fi
diff --git a/tools/testing/selftests/net/rxtimestamp.c b/tools/testing/selftests/net/rxtimestamp.c
new file mode 100644
index 000000000..e4613ce4e
--- /dev/null
+++ b/tools/testing/selftests/net/rxtimestamp.c
@@ -0,0 +1,430 @@
+#include <errno.h>
+#include <error.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <asm/types.h>
+#include <linux/net_tstamp.h>
+#include <linux/errqueue.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/*
+ * Timestamp socket options for the receiving socket. Each non-zero field is
+ * passed to setsockopt() as the value of the option of the same name
+ * (SO_TIMESTAMP, SO_TIMESTAMPNS, SO_TIMESTAMPING).
+ */
+struct options {
+ int so_timestamp;
+ int so_timestampns;
+ int so_timestamping;
+};
+
+/*
+ * Set of timestamp control messages, used both for what a test case expects
+ * and for what do_recv() observed: SCM_TIMESTAMP, SCM_TIMESTAMPNS, and the
+ * ts[0] (swtstamp) / ts[2] (hwtstamp) slots of SCM_TIMESTAMPING.
+ */
+struct tstamps {
+ bool tstamp;
+ bool tstampns;
+ bool swtstamp;
+ bool hwtstamp;
+};
+
+/*
+ * One socket flavour to test. type and protocol are passed to socket();
+ * enabled is set from the command line when a protocol is selected.
+ */
+struct socket_type {
+ char *friendly_name;
+ int type;
+ int protocol;
+ bool enabled;
+};
+
+/*
+ * One test: the options to set and the control messages expected back.
+ * enabled is set by --test_num; with warn_on_fail a mismatch is printed but
+ * only counted as a failure when --strict is given.
+ */
+struct test_case {
+ struct options sockopt;
+ struct tstamps expected;
+ bool enabled;
+ bool warn_on_fail;
+};
+
+/* Maps one SOF_TIMESTAMPING_* bit to its name, for printing. */
+struct sof_flag {
+ int mask;
+ char *name;
+};
+
+/* SO_TIMESTAMPING flags that print_test_case() knows how to name. */
+static struct sof_flag sof_flags[] = {
+#define SOF_FLAG(f) { f, #f }
+ SOF_FLAG(SOF_TIMESTAMPING_SOFTWARE),
+ SOF_FLAG(SOF_TIMESTAMPING_RX_SOFTWARE),
+ SOF_FLAG(SOF_TIMESTAMPING_RX_HARDWARE),
+};
+
+/*
+ * Socket flavours under test. The order matters: main() enables entries by
+ * index (0 = ip, 1 = udp, 2 = tcp).
+ */
+static struct socket_type socket_types[] = {
+ { "ip", SOCK_RAW, IPPROTO_EGP },
+ { "udp", SOCK_DGRAM, IPPROTO_UDP },
+ { "tcp", SOCK_STREAM, IPPROTO_TCP },
+};
+
+/*
+ * Test matrix: { socket options to set, control messages expected }.
+ * An empty expectation means no timestamp cmsg may arrive.
+ */
+static struct test_case test_cases[] = {
+ { {}, {} },
+ {
+ { .so_timestamp = 1 },
+ { .tstamp = true }
+ },
+ {
+ { .so_timestampns = 1 },
+ { .tstampns = true }
+ },
+ {
+ { .so_timestamp = 1, .so_timestampns = 1 },
+ { .tstampns = true }
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE },
+ {}
+ },
+ {
+ /* Loopback device does not support hw timestamps. */
+ { .so_timestamping = SOF_TIMESTAMPING_RX_HARDWARE },
+ {}
+ },
+ {
+ /* Nothing expected; a mismatch only warns unless --strict. */
+ { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE },
+ .warn_on_fail = true
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE
+ | SOF_TIMESTAMPING_RX_HARDWARE },
+ {}
+ },
+ {
+ { .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+ | SOF_TIMESTAMPING_RX_SOFTWARE },
+ { .swtstamp = true }
+ },
+ {
+ { .so_timestamp = 1, .so_timestamping = SOF_TIMESTAMPING_SOFTWARE
+ | SOF_TIMESTAMPING_RX_SOFTWARE },
+ { .tstamp = true, .swtstamp = true }
+ },
+};
+
+/* Long-only command line options; main() switches on the last field. */
+static struct option long_options[] = {
+ { "list_tests", no_argument, 0, 'l' },
+ { "test_num", required_argument, 0, 'n' },
+ { "op_size", required_argument, 0, 's' },
+ { "tcp", no_argument, 0, 't' },
+ { "udp", no_argument, 0, 'u' },
+ { "ip", no_argument, 0, 'i' },
+ { "strict", no_argument, 0, 'S' },
+ { "ipv4", no_argument, 0, '4' },
+ { "ipv6", no_argument, 0, '6' },
+ { NULL, 0, NULL, 0 },
+};
+
+/* Next port to bind; incremented for every non-raw test case. */
+static int next_port = 19999;
+/* Payload size in bytes for each send; overridden by --op_size. */
+static int op_size = 10 * 1024;
+
+/*
+ * Print one line describing test case t: the socket options it sets and the
+ * control messages it expects ("0" = software slot, "2" = hardware slot of
+ * SCM_TIMESTAMPING).
+ */
+void print_test_case(struct test_case *t)
+{
+	const struct options *opts = &t->sockopt;
+	const struct tstamps *want = &t->expected;
+	int i;
+
+	printf("sockopts {");
+	if (opts->so_timestamp)
+		printf(" SO_TIMESTAMP ");
+	if (opts->so_timestampns)
+		printf(" SO_TIMESTAMPNS ");
+	if (opts->so_timestamping) {
+		printf(" SO_TIMESTAMPING: {");
+		for (i = 0; i < ARRAY_SIZE(sof_flags); i++) {
+			if (!(opts->so_timestamping & sof_flags[i].mask))
+				continue;
+			printf(" %s |", sof_flags[i].name);
+		}
+		printf("}");
+	}
+
+	printf("} expected cmsgs: {");
+	if (want->tstamp)
+		printf(" SCM_TIMESTAMP ");
+	if (want->tstampns)
+		printf(" SCM_TIMESTAMPNS ");
+	if (want->swtstamp || want->hwtstamp) {
+		printf(" SCM_TIMESTAMPING {");
+		if (want->swtstamp) {
+			printf("0");
+			if (want->hwtstamp)
+				printf(",");
+		}
+		if (want->hwtstamp)
+			printf("2");
+		printf("}");
+	}
+	printf("}\n");
+}
+
+/*
+ * Write op_size bytes of 'z' to socket src. Exits the program on allocation
+ * or write failure.
+ */
+void do_send(int src)
+{
+	int r;
+	char *buf = malloc(op_size);
+
+	/* also catches a negative op_size, which converts to a huge size_t */
+	if (!buf)
+		error(1, errno, "Failed to allocate %d byte send buffer",
+		      op_size);
+
+	memset(buf, 'z', op_size);
+	r = write(src, buf, op_size);
+	if (r < 0)
+		error(1, errno, "Failed to sendmsg");
+
+	free(buf);
+}
+
+/*
+ * Receive one message of exactly read_size bytes on rcv and compare the
+ * timestamp control messages that came with it against expected.
+ *
+ * Returns true when the observed set differs from expected (i.e. the check
+ * FAILED), false when it matches. Exits the program on receive errors, short
+ * reads, truncation, or an unexpected control message.
+ */
+bool do_recv(int rcv, int read_size, struct tstamps expected)
+{
+	/* union forces struct cmsghdr alignment, as the CMSG_* macros need */
+	union {
+		char buf[1024];
+		struct cmsghdr align;
+	} control;
+	struct scm_timestamping *ts;
+	struct tstamps actual = {};
+	struct iovec recv_iov;
+	struct cmsghdr *cmsg;
+	bool failed = false;
+	struct msghdr hdr;
+	int flags = 0;
+	int r;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_iov = &recv_iov;
+	hdr.msg_iovlen = 1;
+	recv_iov.iov_base = malloc(read_size);
+	if (!recv_iov.iov_base)
+		error(1, errno, "Failed to allocate %d byte receive buffer",
+		      read_size);
+	recv_iov.iov_len = read_size;
+
+	hdr.msg_control = control.buf;
+	hdr.msg_controllen = sizeof(control.buf);
+
+	r = recvmsg(rcv, &hdr, flags);
+	if (r < 0)
+		error(1, errno, "Failed to recvmsg");
+	if (r != read_size)
+		error(1, 0, "Only received %d bytes of payload.", r);
+
+	if (hdr.msg_flags & (MSG_TRUNC | MSG_CTRUNC))
+		error(1, 0, "Message was truncated.");
+
+	/* record which timestamp control messages were delivered */
+	for (cmsg = CMSG_FIRSTHDR(&hdr); cmsg != NULL;
+	     cmsg = CMSG_NXTHDR(&hdr, cmsg)) {
+		if (cmsg->cmsg_level != SOL_SOCKET)
+			error(1, 0, "Unexpected cmsg_level %d",
+			      cmsg->cmsg_level);
+		switch (cmsg->cmsg_type) {
+		case SCM_TIMESTAMP:
+			actual.tstamp = true;
+			break;
+		case SCM_TIMESTAMPNS:
+			actual.tstampns = true;
+			break;
+		case SCM_TIMESTAMPING:
+			ts = (struct scm_timestamping *)CMSG_DATA(cmsg);
+			/* ts[0] is the software slot, ts[2] the hardware one */
+			actual.swtstamp = !!ts->ts[0].tv_sec;
+			if (ts->ts[1].tv_sec != 0)
+				error(0, 0, "ts[1] should not be set.");
+			actual.hwtstamp = !!ts->ts[2].tv_sec;
+			break;
+		default:
+			error(1, 0, "Unexpected cmsg_type %d", cmsg->cmsg_type);
+		}
+	}
+
+/* Report (without exiting) any field that differs from the expectation. */
+#define VALIDATE(field) \
+	do { \
+		if (expected.field != actual.field) { \
+			if (expected.field) \
+				error(0, 0, "Expected " #field " to be set."); \
+			else \
+				error(0, 0, \
+				      "Expected " #field " to not be set."); \
+			failed = true; \
+		} \
+	} while (0)
+
+	VALIDATE(tstamp);
+	VALIDATE(tstampns);
+	VALIDATE(swtstamp);
+	VALIDATE(hwtstamp);
+#undef VALIDATE
+
+	free(recv_iov.iov_base);
+
+	return failed;
+}
+
+/*
+ * Configure socket rcv: always enable SO_REUSEADDR, then set each timestamp
+ * option whose value in o is non-zero. Exits the program if any setsockopt()
+ * fails.
+ */
+void config_so_flags(int rcv, struct options o)
+{
+	int enable = 1;
+
+	if (setsockopt(rcv, SOL_SOCKET, SO_REUSEADDR,
+		       &enable, sizeof(enable)) < 0)
+		error(1, errno, "Failed to enable SO_REUSEADDR");
+
+	if (o.so_timestamp) {
+		if (setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMP,
+			       &o.so_timestamp, sizeof(o.so_timestamp)) < 0)
+			error(1, errno, "Failed to enable SO_TIMESTAMP");
+	}
+
+	if (o.so_timestampns) {
+		if (setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMPNS,
+			       &o.so_timestampns,
+			       sizeof(o.so_timestampns)) < 0)
+			error(1, errno, "Failed to enable SO_TIMESTAMPNS");
+	}
+
+	if (o.so_timestamping) {
+		if (setsockopt(rcv, SOL_SOCKET, SO_TIMESTAMPING,
+			       &o.so_timestamping,
+			       sizeof(o.so_timestamping)) < 0)
+			error(1, errno, "Failed to set SO_TIMESTAMPING");
+	}
+}
+
+/*
+ * Run test case test_num over socket flavour s on the IPv4 ('4') or IPv6
+ * loopback address: connect a sender to a receiver, apply the test's socket
+ * options to the receiver, send one payload and validate the control messages
+ * received.
+ *
+ * Returns true when the test failed and the failure counts, i.e. it is not
+ * downgraded to a warning by warn_on_fail in non-strict mode.
+ */
+bool run_test_case(struct socket_type *s, int test_num, char ip_version,
+		   bool strict)
+{
+	union {
+		struct sockaddr_in6 addr6;
+		struct sockaddr_in addr4;
+		struct sockaddr addr_un;
+	} addr;
+	struct test_case *tc = &test_cases[test_num];
+	int tx_fd, listen_fd, rx_fd, port;
+	int expect_len = op_size;
+	socklen_t addr_len;
+	bool mismatch;
+
+	/* raw sockets have no port; every other test gets a fresh one */
+	if (s->type == SOCK_RAW)
+		port = 0;
+	else
+		port = next_port++;
+
+	memset(&addr, 0, sizeof(addr));
+	if (ip_version == '4') {
+		addr.addr4.sin_family = AF_INET;
+		addr.addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		addr.addr4.sin_port = htons(port);
+		addr_len = sizeof(addr.addr4);
+		if (s->type == SOCK_RAW)
+			expect_len += 20; /* for IPv4 header */
+	} else {
+		addr.addr6.sin6_family = AF_INET6;
+		addr.addr6.sin6_addr = in6addr_loopback;
+		addr.addr6.sin6_port = htons(port);
+		addr_len = sizeof(addr.addr6);
+	}
+	printf("Starting testcase %d over ipv%c...\n", test_num, ip_version);
+
+	tx_fd = socket(addr.addr_un.sa_family, s->type, s->protocol);
+	if (tx_fd < 0)
+		error(1, errno, "Failed to open src socket");
+
+	listen_fd = socket(addr.addr_un.sa_family, s->type, s->protocol);
+	if (listen_fd < 0)
+		error(1, errno, "Failed to open dst socket");
+
+	if (bind(listen_fd, &addr.addr_un, addr_len) < 0)
+		error(1, errno, "Failed to bind to port %d", port);
+
+	if (s->type == SOCK_STREAM) {
+		if (listen(listen_fd, 1) < 0)
+			error(1, errno, "Failed to listen");
+	}
+
+	if (connect(tx_fd, &addr.addr_un, addr_len) < 0)
+		error(1, errno, "Failed to connect");
+
+	/* stream sockets receive on the accepted connection */
+	if (s->type != SOCK_STREAM) {
+		rx_fd = listen_fd;
+	} else {
+		rx_fd = accept(listen_fd, NULL, NULL);
+		if (rx_fd < 0)
+			error(1, errno, "Failed to accept");
+		close(listen_fd);
+	}
+
+	config_so_flags(rx_fd, tc->sockopt);
+	usleep(20000); /* setsockopt for SO_TIMESTAMPING is asynchronous */
+	do_send(tx_fd);
+
+	mismatch = do_recv(rx_fd, expect_len, tc->expected);
+
+	close(rx_fd);
+	close(tx_fd);
+
+	if (!mismatch)
+		return false;
+
+	printf("FAILURE in testcase %d over ipv%c ", test_num,
+	       ip_version);
+	print_test_case(tc);
+	if (!strict && tc->warn_on_fail)
+		mismatch = false;
+
+	return mismatch;
+}
+
+/*
+ * Parse the command line, then run every enabled test case over every enabled
+ * socket type and IP version. With no protocol, test or IP version selected,
+ * all of them run. Returns the number of failed test cases (0 on success).
+ */
+int main(int argc, char **argv)
+{
+	bool all_protocols = true;
+	bool all_tests = true;
+	bool cfg_ipv4 = false;
+	bool cfg_ipv6 = false;
+	bool strict = false;
+	int arg_index = 0;
+	int failures = 0;
+	int s, t, opt;
+
+	while ((opt = getopt_long(argc, argv, "", long_options,
+				  &arg_index)) != -1) {
+		switch (opt) {
+		case 'l':
+			for (t = 0; t < ARRAY_SIZE(test_cases); t++) {
+				printf("%d\t", t);
+				print_test_case(&test_cases[t]);
+			}
+			return 0;
+		case 'n':
+			t = atoi(optarg);
+			/* reject negatives explicitly, not via unsigned wrap */
+			if (t < 0 || t >= ARRAY_SIZE(test_cases))
+				error(1, 0, "Invalid test case: %d", t);
+			all_tests = false;
+			test_cases[t].enabled = true;
+			break;
+		case 's':
+			op_size = atoi(optarg);
+			/* a non-positive size cannot be allocated or sent */
+			if (op_size <= 0)
+				error(1, 0, "Invalid op_size: %s", optarg);
+			break;
+		case 't':
+			all_protocols = false;
+			socket_types[2].enabled = true;
+			break;
+		case 'u':
+			all_protocols = false;
+			socket_types[1].enabled = true;
+			break;
+		case 'i':
+			all_protocols = false;
+			socket_types[0].enabled = true;
+			break;
+		case 'S':
+			strict = true;
+			break;
+		case '4':
+			cfg_ipv4 = true;
+			break;
+		case '6':
+			cfg_ipv6 = true;
+			break;
+		default:
+			error(1, 0, "Failed to parse parameters.");
+		}
+	}
+
+	for (s = 0; s < ARRAY_SIZE(socket_types); s++) {
+		if (!all_protocols && !socket_types[s].enabled)
+			continue;
+
+		printf("Testing %s...\n", socket_types[s].friendly_name);
+		for (t = 0; t < ARRAY_SIZE(test_cases); t++) {
+			if (!all_tests && !test_cases[t].enabled)
+				continue;
+			/* each family runs if requested, or if none was */
+			if (cfg_ipv4 || !cfg_ipv6)
+				if (run_test_case(&socket_types[s], t, '4',
+						  strict))
+					failures++;
+			if (cfg_ipv6 || !cfg_ipv4)
+				if (run_test_case(&socket_types[s], t, '6',
+						  strict))
+					failures++;
+		}
+	}
+	if (!failures)
+		printf("PASSED.\n");
+	return failures;
+}
diff --git a/tools/testing/selftests/net/rxtimestamp.sh b/tools/testing/selftests/net/rxtimestamp.sh
new file mode 100755
index 000000000..91631e88b
--- /dev/null
+++ b/tools/testing/selftests/net/rxtimestamp.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Run rxtimestamp through in_netns.sh; "$@" is quoted so every argument is
+# forwarded as one unmodified word.
+./in_netns.sh ./rxtimestamp "$@"
diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c
new file mode 100644
index 000000000..3155fbbf6
--- /dev/null
+++ b/tools/testing/selftests/net/so_txtime.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test the SO_TXTIME API
+ *
+ * Takes two streams of { payload, delivery time }[], one input and one output.
+ * Sends the input stream and verifies arrival matches the output stream.
+ * The two streams can differ due to out-of-order delivery and drops.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/net_tstamp.h>
+#include <linux/errqueue.h>
+#include <linux/if_ether.h>
+#include <linux/ipv6.h>
+#include <linux/udp.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+static int cfg_clockid = CLOCK_TAI;
+static bool cfg_do_ipv4;
+static bool cfg_do_ipv6;
+static uint16_t cfg_port = 8000;
+static int cfg_variance_us = 4000;
+
+static uint64_t glob_tstart;
+
+/* encode one timed transmission (of a 1B payload) */
+struct timed_send {
+ char data;
+ int64_t delay_us;
+};
+
+#define MAX_NUM_PKT 8
+static struct timed_send cfg_in[MAX_NUM_PKT];
+static struct timed_send cfg_out[MAX_NUM_PKT];
+static int cfg_num_pkt;
+
+static int cfg_errq_level;
+static int cfg_errq_type;
+
+/*
+ * Return the current time of the configured clock (cfg_clockid) in
+ * nanoseconds. Exits the program if the clock cannot be read.
+ */
+static uint64_t gettime_ns(void)
+{
+	struct timespec now;
+	uint64_t ns;
+
+	if (clock_gettime(cfg_clockid, &now) != 0)
+		error(1, errno, "gettime");
+
+	ns = now.tv_sec * (1000ULL * 1000 * 1000);
+	ns += now.tv_nsec;
+
+	return ns;
+}
+
+/*
+ * Send the one-byte payload of ts on fdt. When ts->delay_us is non-negative,
+ * attach an SCM_TXTIME control message asking for delivery at
+ * glob_tstart + delay; a negative delay sends without a control message.
+ * Exits the program on send failure.
+ */
+static void do_send_one(int fdt, struct timed_send *ts)
+{
+	char cbuf[CMSG_SPACE(sizeof(uint64_t))];
+	struct iovec vec = { .iov_base = &ts->data, .iov_len = 1 };
+	struct msghdr mh = { .msg_iov = &vec, .msg_iovlen = 1 };
+	struct cmsghdr *hdr;
+	uint64_t when;
+	int sent;
+
+	if (ts->delay_us >= 0) {
+		memset(cbuf, 0, sizeof(cbuf));
+		mh.msg_control = cbuf;
+		mh.msg_controllen = sizeof(cbuf);
+
+		/* delay is in microseconds, the txtime in nanoseconds */
+		when = glob_tstart + ts->delay_us * 1000;
+
+		hdr = CMSG_FIRSTHDR(&mh);
+		hdr->cmsg_level = SOL_SOCKET;
+		hdr->cmsg_type = SCM_TXTIME;
+		hdr->cmsg_len = CMSG_LEN(sizeof(when));
+		memcpy(CMSG_DATA(hdr), &when, sizeof(when));
+	}
+
+	sent = sendmsg(fdt, &mh, 0);
+	if (sent == -1)
+		error(1, errno, "write");
+	if (sent == 0)
+		error(1, 0, "write: 0B");
+}
+
+/*
+ * Receive one packet on fdr and check it against ts: the payload byte must
+ * match and the arrival time, relative to glob_tstart, must be within
+ * cfg_variance_us of the expected delay.
+ *
+ * Returns true when the receive timed out (EAGAIN), false when a matching
+ * packet arrived. Exits the program on any mismatch or receive error.
+ */
+static bool do_recv_one(int fdr, struct timed_send *ts)
+{
+	int64_t arrived_us, expect_us;
+	char rbuf[2];
+	int n;
+
+	n = recv(fdr, rbuf, sizeof(rbuf), 0);
+	if (n == -1) {
+		if (errno == EAGAIN)
+			return true;
+		error(1, errno, "read");
+	}
+	if (n != 1)
+		error(1, 0, "read: %dB", n);
+
+	arrived_us = (gettime_ns() - glob_tstart) / 1000;
+	/* packets sent without a txtime are expected immediately */
+	if (ts->delay_us >= 0)
+		expect_us = ts->delay_us;
+	else
+		expect_us = 0;
+
+	fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n",
+		rbuf[0], (long long)arrived_us, (long long)expect_us);
+
+	if (rbuf[0] != ts->data)
+		error(1, 0, "payload mismatch. expected %c", ts->data);
+
+	if (llabs(arrived_us - expect_us) > cfg_variance_us)
+		error(1, 0, "exceeds variance (%d us)", cfg_variance_us);
+
+	return false;
+}
+
+/*
+ * Check that nothing is left to read on fdr: a recv() must fail with EAGAIN.
+ * Exits the program otherwise.
+ */
+static void do_recv_verify_empty(int fdr)
+{
+	char scratch[1];
+	int n;
+
+	n = recv(fdr, scratch, sizeof(scratch), 0);
+	if (!(n == -1 && errno == EAGAIN))
+		error(1, 0, "recv: not empty as expected (%d, %d)", n, errno);
+}
+
+/*
+ * Called after a receive timeout: drain the error queue of the sending socket
+ * fdt, print why each queued packet was dropped, then exit the program with
+ * "recv: timeout". Also exits on any malformed error-queue entry.
+ */
+static void do_recv_errqueue_timeout(int fdt)
+{
+	char control[CMSG_SPACE(sizeof(struct sock_extended_err)) +
+		     CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0};
+	char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) +
+		  sizeof(struct udphdr) + 1];
+	struct sock_extended_err *err;
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	struct cmsghdr *cm;
+	int64_t tstamp = 0;
+	int ret;
+
+	iov.iov_base = data;
+	iov.iov_len = sizeof(data);
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+
+	while (1) {
+		const char *reason;
+
+		ret = recvmsg(fdt, &msg, MSG_ERRQUEUE);
+		if (ret == -1 && errno == EAGAIN)
+			break;
+		if (ret == -1)
+			error(1, errno, "errqueue");
+		if (msg.msg_flags != MSG_ERRQUEUE)
+			error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
+
+		cm = CMSG_FIRSTHDR(&msg);
+		/* CMSG_FIRSTHDR() is NULL when no control message arrived */
+		if (!cm)
+			error(1, 0, "errqueue: no control message");
+		if (cm->cmsg_level != cfg_errq_level ||
+		    cm->cmsg_type != cfg_errq_type)
+			error(1, 0, "errqueue: type 0x%x.0x%x\n",
+			      cm->cmsg_level, cm->cmsg_type);
+
+		err = (struct sock_extended_err *)CMSG_DATA(cm);
+		if (err->ee_origin != SO_EE_ORIGIN_TXTIME)
+			error(1, 0, "errqueue: origin 0x%x\n", err->ee_origin);
+
+		switch (err->ee_errno) {
+		case ECANCELED:
+			if (err->ee_code != SO_EE_CODE_TXTIME_MISSED)
+				error(1, 0, "errqueue: unknown ECANCELED %u\n",
+				      err->ee_code);
+			reason = "missed txtime";
+			break;
+		case EINVAL:
+			if (err->ee_code != SO_EE_CODE_TXTIME_INVALID_PARAM)
+				error(1, 0, "errqueue: unknown EINVAL %u\n",
+				      err->ee_code);
+			reason = "invalid txtime";
+			break;
+		default:
+			error(1, 0, "errqueue: errno %u code %u\n",
+			      err->ee_errno, err->ee_code);
+		}
+
+		/* ee_data holds the high, ee_info the low 32 bits */
+		tstamp = ((int64_t) err->ee_data) << 32 | err->ee_info;
+		tstamp -= (int64_t) glob_tstart;
+		tstamp /= 1000 * 1000;
+		/* payload is the last byte read; avoid data[-1] on ret == 0 */
+		fprintf(stderr, "send: pkt %c at %" PRId64 "ms dropped: %s\n",
+			ret > 0 ? data[ret - 1] : '?', tstamp, reason);
+
+		/* recvmsg() overwrote these; reset them for the next entry */
+		msg.msg_flags = 0;
+		msg.msg_controllen = sizeof(control);
+	}
+
+	error(1, 0, "recv: timeout");
+}
+
+/*
+ * Enable SO_TXTIME on fd with the configured clock and error reporting, then
+ * read the option back and check it matches what was set. Exits the program
+ * on any failure or mismatch.
+ */
+static void setsockopt_txtime(int fd)
+{
+	struct sock_txtime want = {
+		.clockid = cfg_clockid,
+		.flags = SOF_TXTIME_REPORT_ERRORS,
+	};
+	struct sock_txtime got = { 0 };
+	socklen_t got_len = sizeof(want);
+
+	if (setsockopt(fd, SOL_SOCKET, SO_TXTIME, &want, sizeof(want)) != 0)
+		error(1, errno, "setsockopt txtime");
+
+	if (getsockopt(fd, SOL_SOCKET, SO_TXTIME, &got, &got_len) != 0)
+		error(1, errno, "getsockopt txtime");
+
+	if (got_len != sizeof(want) || memcmp(&want, &got, got_len) != 0)
+		error(1, 0, "getsockopt txtime: mismatch");
+}
+
+/*
+ * Create the sending datagram socket, connect it to addr and enable
+ * SO_TXTIME on it. Returns the socket; exits the program on failure.
+ */
+static int setup_tx(struct sockaddr *addr, socklen_t alen)
+{
+	int sock = socket(addr->sa_family, SOCK_DGRAM, 0);
+
+	if (sock == -1)
+		error(1, errno, "socket t");
+
+	if (connect(sock, addr, alen) != 0)
+		error(1, errno, "connect");
+
+	setsockopt_txtime(sock);
+
+	return sock;
+}
+
+/*
+ * Create the receiving datagram socket, bind it to addr and give it a 100 ms
+ * receive timeout. Returns the socket; exits the program on failure.
+ */
+static int setup_rx(struct sockaddr *addr, socklen_t alen)
+{
+	struct timeval rcv_timeout = { .tv_usec = 100 * 1000 };
+	int sock = socket(addr->sa_family, SOCK_DGRAM, 0);
+
+	if (sock == -1)
+		error(1, errno, "socket r");
+
+	if (bind(sock, addr, alen) != 0)
+		error(1, errno, "bind");
+
+	if (setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
+		       &rcv_timeout, sizeof(rcv_timeout)) != 0)
+		error(1, errno, "setsockopt rcv timeout");
+
+	return sock;
+}
+
+/*
+ * Run one pass over addr: send every packet of cfg_in, then receive and
+ * verify them against cfg_out in order, and finally check that nothing else
+ * is queued. A receive timeout diverts to the error-queue report, which
+ * exits the program.
+ */
+static void do_test(struct sockaddr *addr, socklen_t alen)
+{
+	char ipver = addr->sa_family == PF_INET ? '4' : '6';
+	const char *clock_name;
+	int fdt, fdr, i;
+
+	if (cfg_clockid == CLOCK_TAI)
+		clock_name = "tai";
+	else
+		clock_name = "monotonic";
+
+	fprintf(stderr, "\nSO_TXTIME ipv%c clock %s\n", ipver, clock_name);
+
+	fdt = setup_tx(addr, alen);
+	fdr = setup_rx(addr, alen);
+
+	/* all delays are measured from this instant */
+	glob_tstart = gettime_ns();
+
+	for (i = 0; i < cfg_num_pkt; i++)
+		do_send_one(fdt, &cfg_in[i]);
+
+	for (i = 0; i < cfg_num_pkt; i++) {
+		if (!do_recv_one(fdr, &cfg_out[i]))
+			continue;
+		do_recv_errqueue_timeout(fdt);
+	}
+
+	do_recv_verify_empty(fdr);
+
+	if (close(fdr) != 0)
+		error(1, errno, "close r");
+	if (close(fdt) != 0)
+		error(1, errno, "close t");
+}
+
+/*
+ * Parse a comma-separated "<char>,<delay_ms>[,<char>,<delay_ms>...]" stream
+ * description into array, storing each delay in microseconds.
+ *
+ * Returns the number of complete { payload, delay } pairs parsed. Exits the
+ * program when the description holds more than MAX_NUM_PKT packets.
+ */
+static int parse_io(const char *optarg, struct timed_send *array)
+{
+	char *copy, *arg, *tok;
+	int aoff = 0;
+
+	/* strtok() modifies its input, so work on a private copy */
+	copy = strdup(optarg);
+	if (!copy)
+		error(1, errno, "strdup");
+
+	arg = copy;
+	while ((tok = strtok(arg, ","))) {
+		arg = NULL;	/* only pass non-zero on first call */
+
+		if (aoff / 2 == MAX_NUM_PKT)
+			error(1, 0, "exceeds max pkt count (%d)", MAX_NUM_PKT);
+
+		if (aoff & 1) {	/* parse delay */
+			array->delay_us = strtol(tok, NULL, 0) * 1000;
+			array++;
+		} else {	/* parse character */
+			array->data = tok[0];
+		}
+
+		aoff++;
+	}
+
+	/* free the copy itself: the cursor is NULL by now, which leaked it */
+	free(copy);
+
+	return aoff / 2;
+}
+
+/*
+ * Parse the command line: -4 / -6 select address families, -c selects the
+ * clock, and the two positional arguments describe the input and output
+ * packet streams, which must hold the same number of packets. Exits the
+ * program on any usage error.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int opt, n_in, n_out;
+
+	while ((opt = getopt(argc, argv, "46c:")) != -1) {
+		if (opt == '4') {
+			cfg_do_ipv4 = true;
+		} else if (opt == '6') {
+			cfg_do_ipv6 = true;
+		} else if (opt == 'c') {
+			if (strcmp(optarg, "tai") == 0)
+				cfg_clockid = CLOCK_TAI;
+			else if (strcmp(optarg, "monotonic") == 0 ||
+				 strcmp(optarg, "mono") == 0)
+				cfg_clockid = CLOCK_MONOTONIC;
+			else
+				error(1, 0, "unknown clock id %s", optarg);
+		} else {
+			error(1, 0, "parse error at %d", optind);
+		}
+	}
+
+	if (argc - optind != 2)
+		error(1, 0, "Usage: %s [-46] -c <clock> <in> <out>", argv[0]);
+
+	n_in = parse_io(argv[optind], cfg_in);
+	n_out = parse_io(argv[optind + 1], cfg_out);
+	if (n_in != n_out)
+		error(1, 0, "i/o streams len mismatch (%d, %d)\n", n_in, n_out);
+	cfg_num_pkt = n_in;
+}
+
+/*
+ * Parse the options, then run the test over the IPv6 loopback and/or the IPv4
+ * loopback as requested, IPv6 first. Failures exit inside the helpers, so
+ * reaching the end means success.
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (cfg_do_ipv6) {
+		struct sockaddr_in6 addr6 = {
+			.sin6_family = AF_INET6,
+			.sin6_port = htons(cfg_port),
+		};
+
+		addr6.sin6_addr = in6addr_loopback;
+
+		/* where IPv6 error-queue messages are expected to come from */
+		cfg_errq_level = SOL_IPV6;
+		cfg_errq_type = IPV6_RECVERR;
+
+		do_test((void *)&addr6, sizeof(addr6));
+	}
+
+	if (cfg_do_ipv4) {
+		struct sockaddr_in addr4 = {
+			.sin_family = AF_INET,
+			.sin_port = htons(cfg_port),
+		};
+
+		addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+		/* where IPv4 error-queue messages are expected to come from */
+		cfg_errq_level = SOL_IP;
+		cfg_errq_type = IP_RECVERR;
+
+		do_test((void *)&addr4, sizeof(addr4));
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh
new file mode 100755
index 000000000..3f7800eae
--- /dev/null
+++ b/tools/testing/selftests/net/so_txtime.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Regression tests for the SO_TXTIME interface
+
+# Run in network namespace
+if [[ $# -eq 0 ]]; then
+	if ! ./in_netns.sh "$0" __subprocess; then
+		# test is time sensitive, can be flaky
+		echo "test failed: retry once"
+		./in_netns.sh "$0" __subprocess
+	fi
+
+	exit $?
+fi
+
+set -e
+
+# Run so_txtime with the given arguments and require that it FAILS.
+# A bare "! cmd" is exempt from "set -e", so it can never abort the script
+# when cmd unexpectedly succeeds; this helper makes the expectation binding.
+expect_failure()
+{
+	if ./so_txtime "$@"; then
+		echo "so_txtime $* unexpectedly succeeded"
+		exit 1
+	fi
+}
+
+tc qdisc add dev lo root fq
+./so_txtime -4 -6 -c mono a,-1 a,-1
+./so_txtime -4 -6 -c mono a,0 a,0
+./so_txtime -4 -6 -c mono a,10 a,10
+./so_txtime -4 -6 -c mono a,10,b,20 a,10,b,20
+./so_txtime -4 -6 -c mono a,20,b,10 b,20,a,20
+
+if tc qdisc replace dev lo root etf clockid CLOCK_TAI delta 400000; then
+	expect_failure -4 -6 -c tai a,-1 a,-1
+	expect_failure -4 -6 -c tai a,0 a,0
+	./so_txtime -4 -6 -c tai a,10 a,10
+	./so_txtime -4 -6 -c tai a,10,b,20 a,10,b,20
+	./so_txtime -4 -6 -c tai a,20,b,10 b,10,a,20
+else
+	echo "tc ($(tc -V)) does not support qdisc etf. skipping"
+fi
+
+echo OK. All tests passed
diff --git a/tools/testing/selftests/net/socket.c b/tools/testing/selftests/net/socket.c
new file mode 100644
index 000000000..afca1ead6
--- /dev/null
+++ b/tools/testing/selftests/net/socket.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+struct socket_testcase {
+ int domain;
+ int type;
+ int protocol;
+
+ /* 0 = valid file descriptor
+ * -foo = error foo
+ */
+ int expect;
+
+ /* If non-zero, accept EAFNOSUPPORT to handle the case
+ * of the protocol not being configured into the kernel.
+ */
+ int nosupport_ok;
+};
+
+static struct socket_testcase tests[] = {
+ { AF_MAX, 0, 0, -EAFNOSUPPORT, 0 },
+ { AF_INET, SOCK_STREAM, IPPROTO_TCP, 0, 1 },
+ { AF_INET, SOCK_DGRAM, IPPROTO_TCP, -EPROTONOSUPPORT, 1 },
+ { AF_INET, SOCK_DGRAM, IPPROTO_UDP, 0, 1 },
+ { AF_INET, SOCK_STREAM, IPPROTO_UDP, -EPROTONOSUPPORT, 1 },
+};
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#define ERR_STRING_SZ 64
+
+/*
+ * Call socket() for every entry of tests[] and compare the outcome with the
+ * entry's expectation. Stops at the first mismatch.
+ *
+ * Returns 0 when every entry behaved as expected, -1 otherwise (after
+ * printing a description of the mismatch to stderr).
+ */
+static int run_tests(void)
+{
+	char err_string1[ERR_STRING_SZ];
+	char err_string2[ERR_STRING_SZ];
+	int i, err;
+
+	err = 0;
+	for (i = 0; i < ARRAY_SIZE(tests); i++) {
+		struct socket_testcase *s = &tests[i];
+		int fd, sock_errno;
+
+		fd = socket(s->domain, s->type, s->protocol);
+		if (fd < 0) {
+			/* save errno before later calls can overwrite it */
+			sock_errno = errno;
+
+			if (s->nosupport_ok &&
+			    sock_errno == EAFNOSUPPORT)
+				continue;
+
+			if (s->expect < 0 &&
+			    sock_errno == -s->expect)
+				continue;
+
+			strerror_r(-s->expect, err_string1, ERR_STRING_SZ);
+			strerror_r(sock_errno, err_string2, ERR_STRING_SZ);
+
+			fprintf(stderr, "socket(%d, %d, %d) expected "
+				"err (%s) got (%s)\n",
+				s->domain, s->type, s->protocol,
+				err_string1, err_string2);
+
+			err = -1;
+			break;
+		}
+
+		close(fd);
+
+		if (s->expect < 0) {
+			/*
+			 * socket() succeeded although an error was expected:
+			 * name the expected error, errno is stale here.
+			 */
+			strerror_r(-s->expect, err_string1, ERR_STRING_SZ);
+
+			fprintf(stderr, "socket(%d, %d, %d) expected "
+				"err (%s) got success\n",
+				s->domain, s->type, s->protocol,
+				err_string1);
+
+			err = -1;
+			break;
+		}
+	}
+
+	return err;
+}
+
+/* Run all socket() test cases; the process status is run_tests()'s result. */
+int main(void)
+{
+	return run_tests();
+}
diff --git a/tools/testing/selftests/net/tcp_fastopen_backup_key.c b/tools/testing/selftests/net/tcp_fastopen_backup_key.c
new file mode 100644
index 000000000..9c55ec44f
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_fastopen_backup_key.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test key rotation for TFO.
+ * New keys are 'rotated' in two steps:
+ * 1) Add new key as the 'backup' key 'behind' the primary key
+ * 2) Make new key the primary by swapping the backup and primary keys
+ *
+ * The rotation is done in stages using multiple sockets bound
+ * to the same port via SO_REUSEPORT. This simulates key rotation
+ * behind say a load balancer. We verify that across the rotation
+ * there are no cases in which a cookie is not accepted by verifying
+ * that TcpExtTCPFastOpenPassiveFail remains 0.
+ */
+#define _GNU_SOURCE
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <unistd.h>
+#include <netinet/tcp.h>
+#include <fcntl.h>
+#include <time.h>
+
+#ifndef TCP_FASTOPEN_KEY
+#define TCP_FASTOPEN_KEY 33
+#endif
+
+#define N_LISTEN 10
+#define PROC_FASTOPEN_KEY "/proc/sys/net/ipv4/tcp_fastopen_key"
+#define KEY_LENGTH 16
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+#endif
+
+static bool do_ipv6;
+static bool do_sockopt;
+static bool do_rotate;
+static int key_len = KEY_LENGTH;
+static int rcv_fds[N_LISTEN];
+static int proc_fd;
+static const char *IP4_ADDR = "127.0.0.1";
+static const char *IP6_ADDR = "::1";
+static const int PORT = 8891;
+
+/*
+ * Fetch the current TFO keys into @keys (room for 8 words is required).
+ *
+ * With -s (do_sockopt) the keys are read from socket @fd through the
+ * TCP_FASTOPEN_KEY socket option; otherwise they are parsed from
+ * PROC_FASTOPEN_KEY and @fd is unused.  Any failure terminates the test.
+ */
+static void get_keys(int fd, uint32_t *keys)
+{
+	char buf[128];
+	socklen_t len = KEY_LENGTH * 2;
+	ssize_t nread;
+
+	if (do_sockopt) {
+		if (getsockopt(fd, SOL_TCP, TCP_FASTOPEN_KEY, keys, &len))
+			error(1, errno, "Unable to get key");
+		return;
+	}
+	lseek(proc_fd, 0, SEEK_SET);
+	/* read() does not terminate the data and sscanf() needs a C string,
+	 * so keep one byte free for the terminator.
+	 */
+	nread = read(proc_fd, buf, sizeof(buf) - 1);
+	if (nread <= 0)
+		error(1, errno, "Unable to read %s", PROC_FASTOPEN_KEY);
+	buf[nread] = '\0';
+	if (sscanf(buf, "%x-%x-%x-%x,%x-%x-%x-%x", keys, keys + 1, keys + 2,
+		   keys + 3, keys + 4, keys + 5, keys + 6, keys + 7) != 8)
+		error(1, 0, "Unable to parse %s", PROC_FASTOPEN_KEY);
+}
+
+/*
+ * Install the TFO keys held in @keys.
+ *
+ * With -s (do_sockopt) key_len bytes are set on socket @fd through the
+ * TCP_FASTOPEN_KEY socket option.  Otherwise the keys are formatted as
+ * hex words and written to PROC_FASTOPEN_KEY: primary and backup key when
+ * -r (do_rotate) is set, the primary key alone if not.
+ * Any failure terminates the test.
+ */
+static void set_keys(int fd, uint32_t *keys)
+{
+	char buf[128];
+	int len;
+
+	if (do_sockopt) {
+		if (setsockopt(fd, SOL_TCP, TCP_FASTOPEN_KEY, keys,
+			       key_len))
+			error(1, errno, "Unable to set key");
+		return;
+	}
+	if (do_rotate)
+		len = snprintf(buf, sizeof(buf),
+			       "%08x-%08x-%08x-%08x,%08x-%08x-%08x-%08x",
+			       keys[0], keys[1], keys[2], keys[3], keys[4],
+			       keys[5], keys[6], keys[7]);
+	else
+		len = snprintf(buf, sizeof(buf), "%08x-%08x-%08x-%08x",
+			       keys[0], keys[1], keys[2], keys[3]);
+	lseek(proc_fd, 0, SEEK_SET);
+	/* Write only the formatted text; the remainder of buf is
+	 * uninitialized stack memory.
+	 */
+	if (write(proc_fd, buf, (size_t)len) <= 0)
+		error(1, errno, "Unable to write %s", PROC_FASTOPEN_KEY);
+}
+
+/*
+ * Create N_LISTEN sockets of type @proto for @family, all bound to PORT on
+ * the wildcard address via SO_REUSEPORT, enable TCP_FASTOPEN on each and
+ * give them all the same freshly generated random keys.  Stream sockets
+ * are also put into the listening state.  The descriptors are stored in
+ * @rcv_fds; any failure terminates the test.
+ */
+static void build_rcv_fd(int family, int proto, int *rcv_fds)
+{
+	struct sockaddr_in6 addr6 = {0};
+	struct sockaddr_in addr4 = {0};
+	struct sockaddr *bind_addr;
+	uint32_t keys[8];
+	int tfo_qlen = 100;
+	int reuse = 1;
+	int bind_len, n;
+
+	if (family == AF_INET) {
+		addr4.sin_family = family;
+		addr4.sin_addr.s_addr = htonl(INADDR_ANY);
+		addr4.sin_port = htons(PORT);
+		bind_len = sizeof(addr4);
+		bind_addr = (struct sockaddr *)&addr4;
+	} else if (family == AF_INET6) {
+		addr6.sin6_family = AF_INET6;
+		addr6.sin6_addr = in6addr_any;
+		addr6.sin6_port = htons(PORT);
+		bind_len = sizeof(addr6);
+		bind_addr = (struct sockaddr *)&addr6;
+	} else {
+		error(1, 0, "Unsupported family %d", family);
+		/* clang does not treat error() above as terminating the
+		 * program and would warn that the bind address and its
+		 * length are uninitialized past this point. Silence it.
+		 */
+		return;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(keys); n++)
+		keys[n] = rand();
+
+	for (n = 0; n < N_LISTEN; n++) {
+		int fd = socket(family, proto, 0);
+
+		rcv_fds[n] = fd;
+		if (fd < 0)
+			error(1, errno, "failed to create receive socket");
+		if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuse,
+			       sizeof(reuse)))
+			error(1, errno, "failed to set SO_REUSEPORT");
+		if (bind(fd, bind_addr, bind_len))
+			error(1, errno, "failed to bind receive socket");
+		if (setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &tfo_qlen,
+			       sizeof(tfo_qlen)))
+			error(1, errno, "failed to set TCP_FASTOPEN");
+		set_keys(fd, keys);
+		if (proto == SOCK_STREAM && listen(fd, 10))
+			error(1, errno, "failed to listen on receive port");
+	}
+}
+
+/*
+ * Open a @proto socket for @family, bind it to the wildcard address with
+ * an ephemeral port and send one byte to the loopback address on PORT
+ * using MSG_FASTOPEN.
+ *
+ * Returns the sending socket, which the caller must close.  Any failure
+ * terminates the test.
+ */
+static int connect_and_send(int family, int proto)
+{
+	struct sockaddr_in saddr4 = {0};
+	struct sockaddr_in daddr4 = {0};
+	struct sockaddr_in6 saddr6 = {0};
+	struct sockaddr_in6 daddr6 = {0};
+	struct sockaddr *saddr, *daddr;
+	int fd, sz, ret;
+	char data[1];
+
+	switch (family) {
+	case AF_INET:
+		saddr4.sin_family = AF_INET;
+		saddr4.sin_addr.s_addr = htonl(INADDR_ANY);
+		saddr4.sin_port = 0;
+
+		daddr4.sin_family = AF_INET;
+		/* inet_pton() returns 1 on success, 0 for an unparsable
+		 * string and -1 on error; only 1 is acceptable.
+		 */
+		if (inet_pton(family, IP4_ADDR, &daddr4.sin_addr.s_addr) != 1)
+			error(1, errno, "inet_pton failed: %s", IP4_ADDR);
+		daddr4.sin_port = htons(PORT);
+
+		sz = sizeof(saddr4);
+		saddr = (struct sockaddr *)&saddr4;
+		daddr = (struct sockaddr *)&daddr4;
+		break;
+	case AF_INET6:
+		saddr6.sin6_family = AF_INET6;
+		saddr6.sin6_addr = in6addr_any;
+
+		daddr6.sin6_family = AF_INET6;
+		if (inet_pton(family, IP6_ADDR, &daddr6.sin6_addr) != 1)
+			error(1, errno, "inet_pton failed: %s", IP6_ADDR);
+		daddr6.sin6_port = htons(PORT);
+
+		sz = sizeof(saddr6);
+		saddr = (struct sockaddr *)&saddr6;
+		daddr = (struct sockaddr *)&daddr6;
+		break;
+	default:
+		error(1, 0, "Unsupported family %d", family);
+		/* clang does not recognize error() above as terminating
+		 * the program, so it complains that saddr, daddr, sz are
+		 * not initialized when this code path is taken. Silence it.
+		 */
+		return -1;
+	}
+	fd = socket(family, proto, 0);
+	if (fd < 0)
+		error(1, errno, "failed to create send socket");
+	if (bind(fd, saddr, sz))
+		error(1, errno, "failed to bind send socket");
+	data[0] = 'a';
+	ret = sendto(fd, data, 1, MSG_FASTOPEN, daddr, sz);
+	if (ret != 1)
+		error(1, errno, "failed to sendto");
+
+	return fd;
+}
+
+/* Report whether @fd is one of the listening sockets stored in rcv_fds[]. */
+static bool is_listen_fd(int fd)
+{
+	const int *slot = rcv_fds;
+	const int *end = rcv_fds + N_LISTEN;
+
+	while (slot < end) {
+		if (*slot++ == fd)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Perform one step of the two-phase key rotation on @fd.
+ *
+ * A call counter cycles through 2 * N_LISTEN steps.  During the first
+ * N_LISTEN steps the new key (generated on step 0) is installed as the
+ * backup key; during the remaining steps primary and backup are swapped.
+ */
+static void rotate_key(int fd)
+{
+	static uint32_t new_key[4];
+	static int iter;
+	uint32_t keys[8];
+	uint32_t old_backup[4];
+	int w;
+
+	/* a fresh key is drawn at the start of every rotation cycle */
+	if (iter == 0) {
+		for (w = 0; w < ARRAY_SIZE(new_key); w++)
+			new_key[w] = rand();
+	}
+
+	get_keys(fd, keys);
+	if (iter < N_LISTEN) {
+		/* phase 1: new key goes behind the primary as backup */
+		memcpy(&keys[4], new_key, KEY_LENGTH);
+	} else {
+		/* phase 2: exchange primary and backup */
+		memcpy(old_backup, &keys[4], KEY_LENGTH);
+		memcpy(&keys[4], &keys[0], KEY_LENGTH);
+		memcpy(&keys[0], old_backup, KEY_LENGTH);
+	}
+	set_keys(fd, keys);
+
+	iter = (iter + 1) % (N_LISTEN * 2);
+}
+
+/*
+ * Drive 10000 one-byte TFO connections for @family against the N_LISTEN
+ * listeners built by build_rcv_fd().  With -r (do_rotate) one listener's
+ * keys are rotated every key_rotate_interval iterations, cycling through
+ * the listeners.
+ *
+ * Each iteration sends one byte with connect_and_send() and then services
+ * epoll events until that byte has been read from an accepted socket.
+ * Any failure terminates the test through error().
+ */
+static void run_one_test(int family)
+{
+	struct epoll_event ev = { .events = EPOLLIN };
+	int i, send_fd;
+	int n_loops = 10000;
+	int rotate_key_fd = 0;
+	int key_rotate_interval = 50;
+	int fd, epfd;
+	char buf[1];
+
+	build_rcv_fd(family, SOCK_STREAM, rcv_fds);
+	epfd = epoll_create(1);
+	if (epfd < 0)
+		error(1, errno, "failed to create epoll");
+	for (i = 0; i < N_LISTEN; i++) {
+		ev.data.fd = rcv_fds[i];
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, rcv_fds[i], &ev))
+			error(1, errno, "failed to register sock epoll");
+	}
+	while (n_loops--) {
+		send_fd = connect_and_send(family, SOCK_STREAM);
+		if (do_rotate && ((n_loops % key_rotate_interval) == 0)) {
+			rotate_key(rcv_fds[rotate_key_fd]);
+			if (++rotate_key_fd >= N_LISTEN)
+				rotate_key_fd = 0;
+		}
+		while (1) {
+			i = epoll_wait(epfd, &ev, 1, -1);
+			if (i < 0)
+				error(1, errno, "epoll_wait failed");
+			if (is_listen_fd(ev.data.fd)) {
+				/* Register the new connection with an explicit
+				 * mask rather than the one epoll_wait() just
+				 * returned in ev.
+				 */
+				struct epoll_event conn_ev = {
+					.events = EPOLLIN,
+				};
+
+				fd = accept(ev.data.fd, NULL, NULL);
+				if (fd < 0)
+					error(1, errno, "failed to accept");
+				conn_ev.data.fd = fd;
+				if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd,
+					      &conn_ev))
+					error(1, errno, "failed epoll add");
+				continue;
+			}
+			i = recv(ev.data.fd, buf, sizeof(buf), 0);
+			if (i != 1)
+				error(1, errno, "failed recv data");
+			if (epoll_ctl(epfd, EPOLL_CTL_DEL, ev.data.fd, NULL))
+				error(1, errno, "failed epoll del");
+			close(ev.data.fd);
+			break;
+		}
+		close(send_fd);
+	}
+	/* Release the epoll instance along with the listeners. */
+	close(epfd);
+	for (i = 0; i < N_LISTEN; i++)
+		close(rcv_fds[i]);
+}
+
+/*
+ * Translate the command line into the global test switches:
+ *   -4 / -6  address family (do_ipv6)
+ *   -s       use the socket option instead of the proc file (do_sockopt)
+ *   -r       rotate keys; also doubles key_len (do_rotate)
+ * Any other option terminates the program.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int opt;
+
+	for (;;) {
+		opt = getopt(argc, argv, "46sr");
+		if (opt == -1)
+			break;
+
+		if (opt == '4') {
+			do_ipv6 = false;
+		} else if (opt == '6') {
+			do_ipv6 = true;
+		} else if (opt == 's') {
+			do_sockopt = true;
+		} else if (opt == 'r') {
+			do_rotate = true;
+			key_len = KEY_LENGTH * 2;
+		} else {
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+}
+
+/*
+ * Parse options, open the proc key file, seed rand() and run the test for
+ * the selected address family.  Prints "PASS" on stderr when the run
+ * completes; every failure exits earlier through error().
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	proc_fd = open(PROC_FASTOPEN_KEY, O_RDWR);
+	if (proc_fd < 0)
+		error(1, errno, "Unable to open %s", PROC_FASTOPEN_KEY);
+
+	srand(time(NULL));
+	run_one_test(do_ipv6 ? AF_INET6 : AF_INET);
+
+	close(proc_fd);
+	fprintf(stderr, "PASS\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/net/tcp_fastopen_backup_key.sh b/tools/testing/selftests/net/tcp_fastopen_backup_key.sh
new file mode 100755
index 000000000..f6e65674b
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_fastopen_backup_key.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# rotate TFO keys for ipv4/ipv6 and verify that the client does
+# not present an invalid cookie.
+
+# no command tracing; abort on the first failing command
+set +x
+set -e
+
+# namespace name with a random suffix (mktemp -u only generates a name)
+readonly NETNS="ns-$(mktemp -u XXXXXX)"
+
+# Create the test namespace, bring its loopback device up and set
+# net.ipv4.tcp_fastopen to 3 inside it (sysctl output is discarded).
+setup() {
+ ip netns add "${NETNS}"
+ ip -netns "${NETNS}" link set lo up
+ ip netns exec "${NETNS}" sysctl -w net.ipv4.tcp_fastopen=3 \
+ >/dev/null 2>&1
+}
+
+# Delete the test namespace; installed as the EXIT trap handler.
+cleanup() {
+ ip netns del "${NETNS}"
+}
+
+# register the handler before setup so the namespace is removed on any exit
+trap cleanup EXIT
+setup
+
+# Run one tcp_fastopen_backup_key invocation inside the namespace and fail
+# if TcpExtTCPFastOpenPassiveFail is anything other than 0 afterwards.
+#   $1 - option string passed to ./tcp_fastopen_backup_key (e.g. "-4sr")
+do_test() {
+	# drop cached TCP metrics before each run, otherwise successive runs
+	# can initially present an old TFO cookie
+	ip netns exec "${NETNS}" ip tcp_metrics flush
+	ip netns exec "${NETNS}" ./tcp_fastopen_backup_key "$1"
+	val=$(ip netns exec "${NETNS}" nstat -az |
+		awk '/TcpExtTCPFastOpenPassiveFail/ {print $2}')
+	if [ "$val" = 0 ]; then
+		return 0
+	fi
+	echo "FAIL: TcpExtTCPFastOpenPassiveFail non-zero"
+	return 1
+}
+
+# Each option set (plain, -s, -r, -sr) is exercised twice for IPv4 and
+# IPv6, alternating the address family on every run.
+for opts in "" s r sr; do
+	for round in 1 2; do
+		do_test "-4${opts}"
+		do_test "-6${opts}"
+	done
+done
+echo "all tests done"
diff --git a/tools/testing/selftests/net/tcp_inq.c b/tools/testing/selftests/net/tcp_inq.c
new file mode 100644
index 000000000..bd6a9c7a3
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_inq.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2018 Google Inc.
+ * Author: Soheil Hassas Yeganeh (soheil@google.com)
+ *
+ * Simple example on how to use TCP_INQ and TCP_CM_INQ.
+ */
+#define _GNU_SOURCE
+
+#include <error.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#ifndef TCP_INQ
+#define TCP_INQ 36
+#endif
+
+#ifndef TCP_CM_INQ
+#define TCP_CM_INQ TCP_INQ
+#endif
+
+#define BUF_SIZE 8192
+#define CMSG_SIZE 32
+
+static int family = AF_INET6;
+static socklen_t addr_len = sizeof(struct sockaddr_in6);
+static int port = 4974;
+
+/*
+ * Fill @sockaddr with the loopback address of @family and the global
+ * port.  Only PF_INET and PF_INET6 are accepted; anything else terminates
+ * the program.
+ */
+static void setup_loopback_addr(int family, struct sockaddr_storage *sockaddr)
+{
+	if (family == PF_INET) {
+		struct sockaddr_in *v4 = (void *) sockaddr;
+
+		memset(v4, 0, sizeof(*v4));
+		v4->sin_family = AF_INET;
+		v4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+		v4->sin_port = htons(port);
+		return;
+	}
+
+	if (family == PF_INET6) {
+		struct sockaddr_in6 *v6 = (void *) sockaddr;
+
+		memset(v6, 0, sizeof(*v6));
+		v6->sin6_family = AF_INET6;
+		v6->sin6_addr = in6addr_loopback;
+		v6->sin6_port = htons(port);
+		return;
+	}
+
+	error(1, 0, "illegal family");
+}
+
+/*
+ * Server thread: accept connections on the listening socket passed in
+ * @arg (an fd cast through unsigned long) and send BUF_SIZE bytes on each
+ * one.  The loop ends when accept() fails; the listening socket is closed
+ * before the thread exits.
+ */
+void *start_server(void *arg)
+{
+	int server_fd = (int)(unsigned long)arg;
+	/* Large enough for either address family of the listener. */
+	struct sockaddr_storage addr;
+	socklen_t addrlen;
+	char *buf;
+	int fd;
+	int r;
+
+	/* The payload content is irrelevant, but do not send heap garbage. */
+	buf = calloc(1, BUF_SIZE);
+	if (!buf) {
+		perror("calloc");
+		close(server_fd);
+		pthread_exit(0);
+	}
+
+	for (;;) {
+		/* addrlen is value-result: accept() overwrites it, so it has
+		 * to be reset before every call.
+		 */
+		addrlen = sizeof(addr);
+		fd = accept(server_fd, (struct sockaddr *)&addr, &addrlen);
+		if (fd == -1) {
+			perror("accept");
+			break;
+		}
+		do {
+			r = send(fd, buf, BUF_SIZE, 0);
+		} while (r < 0 && errno == EINTR);
+		if (r < 0)
+			perror("send");
+		if (r != BUF_SIZE)
+			fprintf(stderr, "can only send %d bytes\n", r);
+		/* TCP_INQ can overestimate in-queue by one byte if we send
+		 * the FIN packet. Sleep for 1 second, so that the client
+		 * likely invoked recvmsg().
+		 */
+		sleep(1);
+		close(fd);
+	}
+
+	free(buf);
+	close(server_fd);
+	pthread_exit(0);
+}
+
+/*
+ * Start a loopback server thread, connect to it, enable TCP_INQ on the
+ * client socket and read half of the BUF_SIZE bytes the server sends.
+ * The TCP_CM_INQ control message returned by recvmsg() must report the
+ * other half as still queued.
+ *
+ * Options: -4 / -6 select the address family, -p <port> the port.
+ * Prints "PASSED" and returns 0 on success; exits with status 1 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+	struct sockaddr_storage listen_addr, addr;
+	int c, one = 1, inq = -1;
+	pthread_t server_thread;
+	char cmsgbuf[CMSG_SIZE];
+	struct iovec iov[1];
+	struct cmsghdr *cm;
+	struct msghdr msg;
+	int server_fd, fd;
+	char *buf;
+	int rc;
+
+	while ((c = getopt(argc, argv, "46p:")) != -1) {
+		switch (c) {
+		case '4':
+			family = PF_INET;
+			addr_len = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			family = PF_INET6;
+			addr_len = sizeof(struct sockaddr_in6);
+			break;
+		case 'p':
+			port = atoi(optarg);
+			break;
+		}
+	}
+
+	server_fd = socket(family, SOCK_STREAM, 0);
+	if (server_fd < 0)
+		error(1, errno, "server socket");
+	setup_loopback_addr(family, &listen_addr);
+	if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR,
+		       &one, sizeof(one)) != 0)
+		error(1, errno, "setsockopt(SO_REUSEADDR)");
+	if (bind(server_fd, (const struct sockaddr *)&listen_addr,
+		 addr_len) == -1)
+		error(1, errno, "bind");
+	if (listen(server_fd, 128) == -1)
+		error(1, errno, "listen");
+	/* pthread_create() returns the error number; it does not set errno. */
+	rc = pthread_create(&server_thread, NULL, start_server,
+			    (void *)(unsigned long)server_fd);
+	if (rc != 0)
+		error(1, rc, "pthread_create");
+
+	fd = socket(family, SOCK_STREAM, 0);
+	if (fd < 0)
+		error(1, errno, "client socket");
+	setup_loopback_addr(family, &addr);
+	if (connect(fd, (const struct sockaddr *)&addr, addr_len) == -1)
+		error(1, errno, "connect");
+	if (setsockopt(fd, SOL_TCP, TCP_INQ, &one, sizeof(one)) != 0)
+		error(1, errno, "setsockopt(TCP_INQ)");
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_iov = iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = cmsgbuf;
+	msg.msg_controllen = sizeof(cmsgbuf);
+	msg.msg_flags = 0;
+
+	buf = malloc(BUF_SIZE);
+	if (!buf)
+		error(1, errno, "malloc");
+	/* Read only half of what the server sends so that the rest is
+	 * still queued when the INQ value is sampled.
+	 */
+	iov[0].iov_base = buf;
+	iov[0].iov_len = BUF_SIZE / 2;
+
+	if (recvmsg(fd, &msg, 0) != iov[0].iov_len)
+		error(1, errno, "recvmsg");
+	if (msg.msg_flags & MSG_CTRUNC)
+		error(1, 0, "control message is truncated");
+
+	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
+		if (cm->cmsg_level == SOL_TCP && cm->cmsg_type == TCP_CM_INQ)
+			/* CMSG_DATA() is not guaranteed to be int-aligned. */
+			memcpy(&inq, CMSG_DATA(cm), sizeof(inq));
+
+	if (inq != BUF_SIZE - iov[0].iov_len) {
+		fprintf(stderr, "unexpected inq: %d\n", inq);
+		exit(1);
+	}
+
+	printf("PASSED\n");
+	free(buf);
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/net/tcp_mmap.c b/tools/testing/selftests/net/tcp_mmap.c
new file mode 100644
index 000000000..00f837c9b
--- /dev/null
+++ b/tools/testing/selftests/net/tcp_mmap.c
@@ -0,0 +1,517 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2018 Google Inc.
+ * Author: Eric Dumazet (edumazet@google.com)
+ *
+ * Reference program demonstrating tcp mmap() usage,
+ * and SO_RCVLOWAT hints for receiver.
+ *
+ * Note : NIC with header split is needed to use mmap() on TCP :
+ * Each incoming frame must be a multiple of PAGE_SIZE bytes of TCP payload.
+ *
+ * How to use on loopback interface :
+ *
+ * ifconfig lo mtu 61512 # 15*4096 + 40 (ipv6 header) + 32 (TCP with TS option header)
+ * tcp_mmap -s -z &
+ * tcp_mmap -H ::1 -z
+ *
+ * Or leave default lo mtu, but use -M option to set TCP_MAXSEG option to (4096 + 12)
+ * (4096 : page size on x86, 12: TCP TS option length)
+ * tcp_mmap -s -z -M $((4096+12)) &
+ * tcp_mmap -H ::1 -z -M $((4096+12))
+ *
+ * Note: -z option on sender uses MSG_ZEROCOPY, which forces a copy when packets go through loopback interface.
+ * We might use sendfile() instead, but really this test program is about mmap(), for receivers ;)
+ *
+ * $ ./tcp_mmap -s & # Without mmap()
+ * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
+ * received 32768 MB (0 % mmap'ed) in 14.1157 s, 19.4732 Gbit
+ * cpu usage user:0.057 sys:7.815, 240.234 usec per MB, 65531 c-switches
+ * received 32768 MB (0 % mmap'ed) in 14.6833 s, 18.7204 Gbit
+ * cpu usage user:0.043 sys:8.103, 248.596 usec per MB, 65524 c-switches
+ * received 32768 MB (0 % mmap'ed) in 11.143 s, 24.6682 Gbit
+ * cpu usage user:0.044 sys:6.576, 202.026 usec per MB, 65519 c-switches
+ * received 32768 MB (0 % mmap'ed) in 14.9056 s, 18.4413 Gbit
+ * cpu usage user:0.036 sys:8.193, 251.129 usec per MB, 65530 c-switches
+ * $ kill %1 # kill tcp_mmap server
+ *
+ * $ ./tcp_mmap -s -z & # With mmap()
+ * $ for i in {1..4}; do ./tcp_mmap -H ::1 -z ; done
+ * received 32768 MB (99.9939 % mmap'ed) in 6.73792 s, 40.7956 Gbit
+ * cpu usage user:0.045 sys:2.827, 87.6465 usec per MB, 65532 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.26732 s, 37.8238 Gbit
+ * cpu usage user:0.037 sys:3.087, 95.3369 usec per MB, 65532 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.61661 s, 36.0893 Gbit
+ * cpu usage user:0.046 sys:3.559, 110.016 usec per MB, 65529 c-switches
+ * received 32768 MB (99.9939 % mmap'ed) in 7.43764 s, 36.9577 Gbit
+ * cpu usage user:0.035 sys:3.467, 106.873 usec per MB, 65530 c-switches
+ */
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <error.h>
+#include <sys/socket.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/time.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <poll.h>
+#include <linux/tcp.h>
+#include <assert.h>
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+#define FILE_SZ (1ULL << 35)
+static int cfg_family = AF_INET6;
+static socklen_t cfg_alen = sizeof(struct sockaddr_in6);
+static int cfg_port = 8787;
+
+static int rcvbuf; /* Default: autotuning. Can be set with -r <integer> option */
+static int sndbuf; /* Default: autotuning. Can be set with -w <integer> option */
+static int zflg; /* zero copy option (-z): MSG_ZEROCOPY for sender, mmap() for receiver */
+static int xflg; /* hash received data (simple xor) (-x option) */
+static int keepflag; /* -k option: receiver shall keep all received file in memory (no munmap() calls) */
+
+static size_t chunk_size = 512*1024;
+
+static size_t map_align;
+
+unsigned long htotal;
+
+/* Issue a prefetcht0 hint for the address @x on x86_64; compiles to
+ * nothing on other architectures.
+ */
+static inline void prefetch(const void *x)
+{
+#if defined(__x86_64__)
+ asm volatile("prefetcht0 %P0" : : "m" (*(const char *)x));
+#endif
+}
+
+/*
+ * Fold @length bytes starting at @zone into the global htotal with XOR.
+ * Full groups of eight longs are consumed a word at a time (prefetching
+ * 384 bytes ahead); whatever is left is folded in byte by byte.
+ */
+void hash_zone(void *zone, unsigned int length)
+{
+	const unsigned int stride = 8 * sizeof(long);
+	unsigned char *cur = zone;
+	unsigned long acc = htotal;
+
+	for (; length >= stride; length -= stride, cur += stride) {
+		const unsigned long *w = (const unsigned long *)cur;
+
+		prefetch(cur + 384);
+		acc ^= w[0] ^ w[1] ^ w[2] ^ w[3];
+		acc ^= w[4] ^ w[5] ^ w[6] ^ w[7];
+	}
+	for (; length; length--)
+		acc ^= *cur++;
+
+	htotal = acc;
+}
+
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+
+
+/*
+ * Map an anonymous private read/write buffer of at least @need bytes.
+ *
+ * A MAP_HUGETLB mapping of @need rounded up to map_align is tried first;
+ * if that fails a regular mapping of exactly @need bytes is used and a
+ * hint is printed on stderr.  *@allocated receives the size of the last
+ * mapping attempted, which is the size to hand to munmap().
+ *
+ * Returns the mapping, or MAP_FAILED when both attempts fail.
+ */
+static void *mmap_large_buffer(size_t need, size_t *allocated)
+{
+	const int prot = PROT_READ | PROT_WRITE;
+	const int base_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+	size_t huge_sz = ALIGN_UP(need, map_align);
+	void *buffer;
+
+	/* Attempt to use huge pages if possible. */
+	buffer = mmap(NULL, huge_sz, prot, base_flags | MAP_HUGETLB, -1, 0);
+	if (buffer != MAP_FAILED) {
+		*allocated = huge_sz;
+		return buffer;
+	}
+
+	buffer = mmap(NULL, need, prot, base_flags, -1, 0);
+	if (buffer != MAP_FAILED)
+		fprintf(stderr, "MAP_HUGETLB attempt failed, look at /sys/kernel/mm/hugepages for optimal performance\n");
+	*allocated = need;
+	return buffer;
+}
+
+/*
+ * Receiver thread for one accepted connection; @arg carries the socket
+ * descriptor cast through unsigned long.
+ *
+ * Data is drained either with TCP_ZEROCOPY_RECEIVE into a mapping of the
+ * socket (when zflg is set and the mapping succeeds) or with plain read()
+ * into a private buffer.  If xflg is set every received byte is folded
+ * into htotal by hash_zone().  When more than 1 MB was received, timing
+ * and rusage statistics are printed before the thread exits.
+ */
+void *child_thread(void *arg)
+{
+ unsigned long total_mmap = 0, total = 0;
+ struct tcp_zerocopy_receive zc;
+ unsigned long delta_usec;
+ int flags = MAP_SHARED;
+ struct timeval t0, t1;
+ char *buffer = NULL;
+ void *raddr = NULL;
+ void *addr = NULL;
+ double throughput;
+ struct rusage ru;
+ size_t buffer_sz;
+ int lu, fd;
+
+ fd = (int)(unsigned long)arg;
+
+ gettimeofday(&t0, NULL);
+
+ /* reads must not block; the loop below waits in poll() instead */
+ fcntl(fd, F_SETFL, O_NDELAY);
+ buffer = mmap_large_buffer(chunk_size, &buffer_sz);
+ if (buffer == (void *)-1) {
+ perror("mmap");
+ goto error;
+ }
+ if (zflg) {
+ /* map map_align extra bytes so that addr can be aligned up */
+ raddr = mmap(NULL, chunk_size + map_align, PROT_READ, flags, fd, 0);
+ if (raddr == (void *)-1) {
+ perror("mmap");
+ /* NOTE(review): zflg is a global, so this also turns
+ * zero-copy off for the other receiver threads - confirm
+ * that this is intended.
+ */
+ zflg = 0;
+ } else {
+ addr = ALIGN_PTR_UP(raddr, map_align);
+ }
+ }
+ while (1) {
+ struct pollfd pfd = { .fd = fd, .events = POLLIN, };
+ int sub;
+
+ /* wait up to 10 seconds for data; the result is not checked */
+ poll(&pfd, 1, 10000);
+ if (zflg) {
+ socklen_t zc_len = sizeof(zc);
+ int res;
+
+ memset(&zc, 0, sizeof(zc));
+ zc.address = (__u64)((unsigned long)addr);
+ zc.length = chunk_size;
+
+ res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
+ &zc, &zc_len);
+ if (res == -1)
+ break;
+
+ if (zc.length) {
+ assert(zc.length <= chunk_size);
+ total_mmap += zc.length;
+ if (xflg)
+ hash_zone(addr, zc.length);
+ /* It is more efficient to unmap the pages right now,
+ * instead of doing this in next TCP_ZEROCOPY_RECEIVE.
+ */
+ madvise(addr, zc.length, MADV_DONTNEED);
+ total += zc.length;
+ }
+ if (zc.recv_skip_hint) {
+ /* these bytes are fetched with a regular read() */
+ assert(zc.recv_skip_hint <= chunk_size);
+ lu = read(fd, buffer, zc.recv_skip_hint);
+ if (lu > 0) {
+ if (xflg)
+ hash_zone(buffer, lu);
+ total += lu;
+ }
+ }
+ continue;
+ }
+ /* copy path: fill the buffer until read() fails or hits EOF */
+ sub = 0;
+ while (sub < chunk_size) {
+ lu = read(fd, buffer + sub, chunk_size - sub);
+ if (lu == 0)
+ goto end;
+ if (lu < 0)
+ break;
+ if (xflg)
+ hash_zone(buffer + sub, lu);
+ total += lu;
+ sub += lu;
+ }
+ }
+end:
+ gettimeofday(&t1, NULL);
+ delta_usec = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+
+ throughput = 0;
+ if (delta_usec)
+ throughput = total * 8.0 / (double)delta_usec / 1000.0;
+ getrusage(RUSAGE_THREAD, &ru);
+ if (total > 1024*1024) {
+ unsigned long total_usec;
+ unsigned long mb = total >> 20;
+ /* user + system CPU time of this thread, in microseconds */
+ total_usec = 1000000*ru.ru_utime.tv_sec + ru.ru_utime.tv_usec +
+ 1000000*ru.ru_stime.tv_sec + ru.ru_stime.tv_usec;
+ printf("received %lg MB (%lg %% mmap'ed) in %lg s, %lg Gbit\n"
+ " cpu usage user:%lg sys:%lg, %lg usec per MB, %lu c-switches\n",
+ total / (1024.0 * 1024.0),
+ 100.0*total_mmap/total,
+ (double)delta_usec / 1000000.0,
+ throughput,
+ (double)ru.ru_utime.tv_sec + (double)ru.ru_utime.tv_usec / 1000000.0,
+ (double)ru.ru_stime.tv_sec + (double)ru.ru_stime.tv_usec / 1000000.0,
+ (double)total_usec/mb,
+ ru.ru_nvcsw);
+ }
+error:
+ munmap(buffer, buffer_sz);
+ close(fd);
+ if (zflg)
+ munmap(raddr, chunk_size + map_align);
+ pthread_exit(0);
+}
+
+/*
+ * Apply the -r / -w buffer sizes to @fd.  A size of zero leaves the
+ * kernel default untouched; a failing setsockopt() is only reported.
+ */
+static void apply_rcvsnd_buf(int fd)
+{
+	if (rcvbuf) {
+		if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF,
+			       &rcvbuf, sizeof(rcvbuf)) == -1)
+			perror("setsockopt SO_RCVBUF");
+	}
+
+	if (sndbuf) {
+		if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF,
+			       &sndbuf, sizeof(sndbuf)) == -1)
+			perror("setsockopt SO_SNDBUF");
+	}
+}
+
+
+/*
+ * Initialize @sockaddr for @domain (PF_INET or PF_INET6) with cfg_port.
+ * When @str_addr is non-NULL it is parsed as the address; otherwise the
+ * address bytes stay zero.  A parse failure or an unsupported domain
+ * terminates the program.
+ */
+static void setup_sockaddr(int domain, const char *str_addr,
+			   struct sockaddr_storage *sockaddr)
+{
+	if (domain == PF_INET) {
+		struct sockaddr_in *v4 = (void *) sockaddr;
+
+		memset(v4, 0, sizeof(*v4));
+		v4->sin_family = AF_INET;
+		v4->sin_port = htons(cfg_port);
+		if (str_addr &&
+		    inet_pton(AF_INET, str_addr, &(v4->sin_addr)) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+	} else if (domain == PF_INET6) {
+		struct sockaddr_in6 *v6 = (void *) sockaddr;
+
+		memset(v6, 0, sizeof(*v6));
+		v6->sin6_family = AF_INET6;
+		v6->sin6_port = htons(cfg_port);
+		if (str_addr &&
+		    inet_pton(AF_INET6, str_addr, &(v6->sin6_addr)) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+	} else {
+		error(1, 0, "illegal domain");
+	}
+}
+
+/*
+ * Server loop: set SO_RCVLOWAT to chunk_size and the configured buffer
+ * sizes on the listener, then accept connections forever, handing each
+ * one to a detached child_thread().  This function never returns.
+ */
+static void do_accept(int fdlisten)
+{
+	int rcvlowat = chunk_size;
+	pthread_attr_t attr;
+
+	pthread_attr_init(&attr);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+
+	if (setsockopt(fdlisten, SOL_SOCKET, SO_RCVLOWAT,
+		       &rcvlowat, sizeof(rcvlowat)) == -1)
+		perror("setsockopt SO_RCVLOWAT");
+
+	apply_rcvsnd_buf(fdlisten);
+
+	for (;;) {
+		struct sockaddr_in peer;
+		socklen_t peer_len = sizeof(peer);
+		pthread_t tid;
+		int conn, rc;
+
+		conn = accept(fdlisten, (struct sockaddr *)&peer, &peer_len);
+		if (conn == -1) {
+			perror("accept");
+			continue;
+		}
+
+		rc = pthread_create(&tid, &attr, child_thread,
+				    (void *)(unsigned long)conn);
+		if (!rc)
+			continue;
+
+		/* pthread_create() hands back the error code directly */
+		errno = rc;
+		perror("pthread_create");
+		close(conn);
+	}
+}
+
+/* Each thread should reserve a big enough vma to avoid
+ * spinlock collisions in ptl locks.
+ * This size is 2MB on x86_64, and is exported in /proc/meminfo.
+ *
+ * Returns the "Hugepagesize:" value of /proc/meminfo converted from kB to
+ * bytes, or 0 when the file cannot be opened or has no such line.
+ */
+static unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t cap = 0;
+	FILE *meminfo;
+
+	meminfo = fopen("/proc/meminfo", "r");
+	if (meminfo == NULL)
+		return 0;
+
+	while (getline(&line, &cap, meminfo) > 0) {
+		if (sscanf(line, "Hugepagesize: %lu kB", &hps) != 1)
+			continue;
+		hps *= 1024;
+		break;
+	}
+
+	free(line);
+	fclose(meminfo);
+	return hps;
+}
+
+/*
+ * Entry point.  With -s the program acts as the receiver: it listens on
+ * cfg_port and hands off to do_accept(), which loops forever.  Without -s
+ * it acts as the sender: it connects to the host given with -H and sends
+ * FILE_SZ bytes in chunk_size pieces.
+ *
+ * Options handled below: -4 -6 -p <port> -H <host> -s -r <rcvbuf>
+ * -w <sndbuf> -z -M <mss> -x -k -P <pacing rate> -C <chunk size>
+ * -a <map alignment>.  Any other option exits with status 1.
+ */
+int main(int argc, char *argv[])
+{
+ struct sockaddr_storage listenaddr, addr;
+ unsigned int max_pacing_rate = 0;
+ uint64_t total = 0;
+ char *host = NULL;
+ int fd, c, on = 1;
+ size_t buffer_sz;
+ char *buffer;
+ int sflg = 0;
+ int mss = 0;
+
+ while ((c = getopt(argc, argv, "46p:svr:w:H:zxkP:M:C:a:")) != -1) {
+ switch (c) {
+ case '4':
+ cfg_family = PF_INET;
+ cfg_alen = sizeof(struct sockaddr_in);
+ break;
+ case '6':
+ cfg_family = PF_INET6;
+ cfg_alen = sizeof(struct sockaddr_in6);
+ break;
+ case 'p':
+ cfg_port = atoi(optarg);
+ break;
+ case 'H':
+ host = optarg;
+ break;
+ case 's': /* server : listen for incoming connections */
+ sflg++;
+ break;
+ case 'r':
+ rcvbuf = atoi(optarg);
+ break;
+ case 'w':
+ sndbuf = atoi(optarg);
+ break;
+ case 'z':
+ zflg = 1;
+ break;
+ case 'M':
+ mss = atoi(optarg);
+ break;
+ case 'x':
+ xflg = 1;
+ break;
+ case 'k':
+ keepflag = 1;
+ break;
+ case 'P':
+ max_pacing_rate = atoi(optarg) ;
+ break;
+ case 'C':
+ chunk_size = atol(optarg);
+ break;
+ case 'a':
+ map_align = atol(optarg);
+ break;
+ default:
+ /* also reached for 'v', which is in the option string
+ * but has no case of its own
+ */
+ exit(1);
+ }
+ }
+ if (!map_align) {
+ map_align = default_huge_page_size();
+ /* if really /proc/meminfo is not helping,
+ * we use the default x86_64 hugepagesize.
+ */
+ if (!map_align)
+ map_align = 2*1024*1024;
+ }
+ if (sflg) {
+ int fdlisten = socket(cfg_family, SOCK_STREAM, 0);
+
+ if (fdlisten == -1) {
+ perror("socket");
+ exit(1);
+ }
+ apply_rcvsnd_buf(fdlisten);
+ setsockopt(fdlisten, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+ setup_sockaddr(cfg_family, host, &listenaddr);
+
+ if (mss &&
+ setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
+ &mss, sizeof(mss)) == -1) {
+ perror("setsockopt TCP_MAXSEG");
+ exit(1);
+ }
+ if (bind(fdlisten, (const struct sockaddr *)&listenaddr, cfg_alen) == -1) {
+ perror("bind");
+ exit(1);
+ }
+ if (listen(fdlisten, 128) == -1) {
+ perror("listen");
+ exit(1);
+ }
+ /* do_accept() loops forever, so the sender code below is
+ * only reached in client mode
+ */
+ do_accept(fdlisten);
+ }
+
+ /* sender: the buffer content is never written, only its size matters */
+ buffer = mmap_large_buffer(chunk_size, &buffer_sz);
+ if (buffer == (char *)-1) {
+ perror("mmap");
+ exit(1);
+ }
+
+ fd = socket(cfg_family, SOCK_STREAM, 0);
+ if (fd == -1) {
+ perror("socket");
+ exit(1);
+ }
+ apply_rcvsnd_buf(fd);
+
+ setup_sockaddr(cfg_family, host, &addr);
+
+ if (mss &&
+ setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
+ perror("setsockopt TCP_MAXSEG");
+ exit(1);
+ }
+ if (connect(fd, (const struct sockaddr *)&addr, cfg_alen) == -1) {
+ perror("connect");
+ exit(1);
+ }
+ if (max_pacing_rate &&
+ setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
+ &max_pacing_rate, sizeof(max_pacing_rate)) == -1)
+ perror("setsockopt SO_MAX_PACING_RATE");
+
+ /* fall back to copying sends if SO_ZEROCOPY cannot be enabled */
+ if (zflg && setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
+ &on, sizeof(on)) == -1) {
+ perror("setsockopt SO_ZEROCOPY, (-z option disabled)");
+ zflg = 0;
+ }
+ while (total < FILE_SZ) {
+ int64_t wr = FILE_SZ - total;
+
+ if (wr > chunk_size)
+ wr = chunk_size;
+ /* Note : we just want to fill the pipe with 0 bytes */
+ wr = send(fd, buffer, (size_t)wr, zflg ? MSG_ZEROCOPY : 0);
+ if (wr <= 0)
+ break;
+ total += wr;
+ }
+ close(fd);
+ munmap(buffer, buffer_sz);
+ return 0;
+}
diff --git a/tools/testing/selftests/net/test_blackhole_dev.sh b/tools/testing/selftests/net/test_blackhole_dev.sh
new file mode 100755
index 000000000..3119b80e7
--- /dev/null
+++ b/tools/testing/selftests/net/test_blackhole_dev.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs blackhole-dev test using blackhole-dev kernel module
+
+# Loading the module runs its tests; a load failure means the test failed.
+if ! /sbin/modprobe -q test_blackhole_dev; then
+	echo "test_blackhole_dev: [FAIL]"
+	exit 1
+fi
+
+# Loaded fine: unload it again and report success.
+/sbin/modprobe -q -r test_blackhole_dev
+echo "test_blackhole_dev: ok"
diff --git a/tools/testing/selftests/net/test_bpf.sh b/tools/testing/selftests/net/test_bpf.sh
new file mode 100755
index 000000000..65677909c
--- /dev/null
+++ b/tools/testing/selftests/net/test_bpf.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs bpf test using test_bpf kernel module
+
+# Loading the module runs its tests; a load failure means the test failed.
+if ! /sbin/modprobe -q test_bpf; then
+	echo "test_bpf: [FAIL]"
+	exit 1
+fi
+
+# Loaded fine: unload it again and report success.
+/sbin/modprobe -q -r test_bpf
+echo "test_bpf: ok"
diff --git a/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh
new file mode 100755
index 000000000..2d442cdab
--- /dev/null
+++ b/tools/testing/selftests/net/test_vxlan_fdb_changelink.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Check FDB default-remote handling across "ip link set".
+
+# Check that the FDB dump of device vx contains exactly two lines with the
+# all-zero MAC address and print the verdict.
+#   $1 - label of the step just performed, used in the report line
+# Sets EXIT_STATUS=1 on a mismatch.
+check_remotes()
+{
+	local what=$1; shift
+	local N=$(bridge fdb sh dev vx | grep -c 00:00:00:00:00:00)
+
+	echo -ne "expected two remotes after $what\t"
+	if [[ $N == 2 ]]; then
+		echo "[ OK ]"
+	else
+		echo "[FAIL]"
+		EXIT_STATUS=1
+	fi
+}
+
+# Create the vxlan device and append two FDB entries for the all-zero MAC.
+ip link add name vx up type vxlan id 2000 dstport 4789
+bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.20 self permanent
+bridge fdb ap dev vx 00:00:00:00:00:00 dst 192.0.2.30 self permanent
+check_remotes "fdb append"
+
+# Changing the remote through "ip link set" must leave two entries in place.
+ip link set dev vx type vxlan remote 192.0.2.30
+check_remotes "link set"
+
+ip link del dev vx
+# EXIT_STATUS is only assigned by a failing check_remotes
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/net/test_vxlan_under_vrf.sh b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
new file mode 100755
index 000000000..a44b9aca7
--- /dev/null
+++ b/tools/testing/selftests/net/test_vxlan_under_vrf.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking VXLAN underlay in a non-default VRF.
+#
+# It simulates two hypervisors running a VM each using four network namespaces:
+# two for the HVs, two for the VMs.
+# A small VXLAN tunnel is made between the two hypervisors to have the two vms
+# in the same virtual L2:
+#
+# +-------------------+ +-------------------+
+# | | | |
+# | vm-1 netns | | vm-2 netns |
+# | | | |
+# | +-------------+ | | +-------------+ |
+# | | veth-hv | | | | veth-hv | |
+# | | 10.0.0.1/24 | | | | 10.0.0.2/24 | |
+# | +-------------+ | | +-------------+ |
+# | . | | . |
+# +-------------------+ +-------------------+
+# . .
+# . .
+# . .
+# +-----------------------------------+ +------------------------------------+
+# | . | | . |
+# | +----------+ | | +----------+ |
+# | | veth-tap | | | | veth-tap | |
+# | +----+-----+ | | +----+-----+ |
+# | | | | | |
+# | +--+--+ +--------------+ | | +--------------+ +--+--+ |
+# | | br0 | | vrf-underlay | | | | vrf-underlay | | br0 | |
+# | +--+--+ +-------+------+ | | +------+-------+ +--+--+ |
+# | | | | | | | |
+# | +---+----+ +-------+-------+ | | +-------+-------+ +---+----+ |
+# | | vxlan0 |....| veth0 |.|...|.| veth0 |....| vxlan0 | |
+# | +--------+ | 172.16.0.1/24 | | | | 172.16.0.2/24 | +--------+ |
+# | +---------------+ | | +---------------+ |
+# | | | |
+# | hv-1 netns | | hv-2 netns |
+# | | | |
+# +-----------------------------------+ +------------------------------------+
+#
+# This tests both the connectivity between vm-1 and vm-2, and that the underlay
+# can be moved in and out of the vrf by unsetting and setting veth0's master.
+
+set -e
+
+# Best-effort teardown of everything this test creates: the two veth devices
+# that may still sit in the root namespace and the four namespaces. Every
+# deletion is allowed to fail so that "set -e" never aborts the cleanup.
+cleanup() {
+	for dev in veth-hv-1 veth-tap; do
+		ip link del $dev 2>/dev/null || true
+	done
+
+	for ns in hv-1 hv-2 vm-1 vm-2; do
+		ip netns del $ns || true
+	done
+}
+
+# Clean start: remove leftovers of a previous run, hiding all output
+cleanup &> /dev/null
+
+# When called with "clean" as first argument, only do the cleanup above.
+[[ $1 == "clean" ]] && exit 0
+
+# From here on run cleanup whenever the script exits, including an exit
+# caused by a failing command under "set -e".
+trap cleanup EXIT
+
+# Setup "Hypervisors" simulated with netns
+ip link add veth-hv-1 type veth peer name veth-hv-2
+
+# Build one simulated hypervisor.
+# Arguments: $1 - hypervisor index; selects the namespace hv-$1, the
+#                 veth-hv-$1 end of the veth pair and the address 172.16.0.$1
+setup-hv-networking() {
+	local hv=$1
+	local ns="hv-$hv"
+
+	# Move this hypervisor's end of the veth pair into its namespace and
+	# give it the same name (veth0) on both hypervisors.
+	ip netns add "$ns"
+	ip link set "veth-hv-$hv" netns "$ns"
+	ip -netns "$ns" link set "veth-hv-$hv" name veth0
+
+	# The VRF is created here; veth0 is only moved into it later in the test.
+	ip -netns "$ns" link add vrf-underlay type vrf table 1
+	ip -netns "$ns" link set vrf-underlay up
+	ip -netns "$ns" addr add "172.16.0.$hv/24" dev veth0
+	ip -netns "$ns" link set veth0 up
+
+	ip -netns "$ns" link add br0 type bridge
+	ip -netns "$ns" link set br0 up
+
+	# VXLAN endpoint bound to veth0 and attached to the bridge.
+	ip -netns "$ns" link add vxlan0 type vxlan id 10 local "172.16.0.$hv" dev veth0 dstport 4789
+	ip -netns "$ns" link set vxlan0 master br0
+	ip -netns "$ns" link set vxlan0 up
+}
+setup-hv-networking 1
+setup-hv-networking 2
+
+# Check connectivity between HVs by pinging hv-2 from hv-1
+echo -n "Checking HV connectivity "
+if ! ip netns exec hv-1 ping -c 1 -W 1 172.16.0.2 &> /dev/null; then
+	echo "[FAIL]"
+	exit 1
+fi
+echo "[ OK ]"
+
+# Sets up a "VM" simulated by a netns and a veth pair
+# Arguments: $1 - VM index; the VM lives in netns vm-$1, is plugged into the
+#                 bridge of hv-$1 and gets the address 10.0.0.$1
+setup-vm() {
+	local id=$1
+
+	ip netns add "vm-$id"
+	ip link add veth-tap type veth peer name veth-hv
+
+	# Hypervisor side of the pair: a port of br0 in hv-$id.
+	ip link set veth-tap netns "hv-$id"
+	ip -netns "hv-$id" link set veth-tap master br0
+	ip -netns "hv-$id" link set veth-tap up
+
+	# VM side of the pair: carries the VM's address.
+	ip link set veth-hv netns "vm-$id"
+	ip -netns "vm-$id" addr add "10.0.0.$id/24" dev veth-hv
+	ip -netns "vm-$id" link set veth-hv up
+}
+setup-vm 1
+setup-vm 2
+
+# Setup VTEP routes to make ARP work
+bridge -netns hv-1 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.2 self permanent
+bridge -netns hv-2 fdb add 00:00:00:00:00:00 dev vxlan0 dst 172.16.0.1 self permanent
+
+echo -n "Check VM connectivity through VXLAN (underlay in the default VRF) "
+if ! ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null; then
+	echo "[FAIL]"
+	exit 1
+fi
+echo "[ OK ]"
+
+# Move the underlay to a non-default VRF, then take vxlan0 down and up again
+# on each hypervisor (hv-1 first, then hv-2)
+for hv_ns in hv-1 hv-2; do
+	ip -netns $hv_ns link set veth0 vrf vrf-underlay
+	ip -netns $hv_ns link set vxlan0 down
+	ip -netns $hv_ns link set vxlan0 up
+done
+
+echo -n "Check VM connectivity through VXLAN (underlay in a VRF) "
+if ! ip netns exec vm-1 ping -c 1 -W 1 10.0.0.2 &> /dev/null; then
+	echo "[FAIL]"
+	exit 1
+fi
+echo "[ OK ]"
diff --git a/tools/testing/selftests/net/timestamping.c b/tools/testing/selftests/net/timestamping.c
new file mode 100644
index 000000000..f4bb4fef0
--- /dev/null
+++ b/tools/testing/selftests/net/timestamping.c
@@ -0,0 +1,515 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This program demonstrates how the various time stamping features in
+ * the Linux kernel work. It emulates the behavior of a PTP
+ * implementation in stand-alone master mode by sending PTPv1 Sync
+ * multicasts once every five seconds. It looks for similar packets, but
+ * beyond that doesn't actually implement PTP.
+ *
+ * Outgoing packets are time stamped with SO_TIMESTAMPING with or
+ * without hardware support.
+ *
+ * Incoming packets are time stamped with SO_TIMESTAMPING with or
+ * without hardware support, SIOCGSTAMP[NS] (per-socket time stamp) and
+ * SO_TIMESTAMP[NS].
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <sys/select.h>
+#include <sys/ioctl.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+
+#include <asm/types.h>
+#include <linux/net_tstamp.h>
+#include <linux/errqueue.h>
+#include <linux/sockios.h>
+
+#ifndef SO_TIMESTAMPING
+# define SO_TIMESTAMPING 37
+# define SCM_TIMESTAMPING SO_TIMESTAMPING
+#endif
+
+#ifndef SO_TIMESTAMPNS
+# define SO_TIMESTAMPNS 35
+#endif
+
+/*
+ * Print the command-line help and terminate with exit status 1.
+ *
+ * @error: option text to report as invalid before the help, or NULL to
+ *         print only the help.
+ *
+ * SO_TIMESTAMP is described with microsecond ("us") resolution: this
+ * program prints it from a struct timeval (tv_sec/tv_usec).
+ */
+static void usage(const char *error)
+{
+	if (error)
+		printf("invalid option: %s\n", error);
+	printf("timestamping interface option*\n\n"
+	       "Options:\n"
+	       " IP_MULTICAST_LOOP - looping outgoing multicasts\n"
+	       " SO_TIMESTAMP - normal software time stamping, us resolution\n"
+	       " SO_TIMESTAMPNS - more accurate software time stamping\n"
+	       " SOF_TIMESTAMPING_TX_HARDWARE - hardware time stamping of outgoing packets\n"
+	       " SOF_TIMESTAMPING_TX_SOFTWARE - software fallback for outgoing packets\n"
+	       " SOF_TIMESTAMPING_RX_HARDWARE - hardware time stamping of incoming packets\n"
+	       " SOF_TIMESTAMPING_RX_SOFTWARE - software fallback for incoming packets\n"
+	       " SOF_TIMESTAMPING_SOFTWARE - request reporting of software time stamps\n"
+	       " SOF_TIMESTAMPING_RAW_HARDWARE - request reporting of raw HW time stamps\n"
+	       " SIOCGSTAMP - check last socket time stamp\n"
+	       " SIOCGSTAMPNS - more accurate socket time stamp\n");
+	exit(1);
+}
+
+/*
+ * Print "<error>: <description of the current errno>" to stdout and
+ * terminate the program with exit status 1.
+ */
+static void bail(const char *error)
+{
+	const char *reason = strerror(errno);
+
+	printf("%s: %s\n", error, reason);
+	exit(1);
+}
+
+/*
+ * Fixed payload transmitted by sendpacket(). printpacket() compares the tail
+ * of data read from the error queue against it to recognise a bounced copy
+ * of our own packet. According to the file header this stands in for a PTPv1
+ * Sync message; apart from that comparison the bytes are never interpreted
+ * by this program.
+ */
+static const unsigned char sync[] = {
+ 0x00, 0x01, 0x00, 0x01,
+ 0x5f, 0x44, 0x46, 0x4c,
+ 0x54, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x01, 0x00, 0x37,
+ 0x00, 0x00, 0x00, 0x08,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x49, 0x05, 0xcd, 0x01,
+ 0x29, 0xb1, 0x8d, 0xb0,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x00, 0x00, 0x37,
+ 0x00, 0x00, 0x00, 0x04,
+ 0x44, 0x46, 0x4c, 0x54,
+ 0x00, 0x00, 0xf0, 0x60,
+ 0x00, 0x01, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x01,
+ 0x00, 0x00, 0xf0, 0x60,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x04,
+ 0x44, 0x46, 0x4c, 0x54,
+ 0x00, 0x01,
+
+ /* fake uuid */
+ 0x00, 0x01,
+ 0x02, 0x03, 0x04, 0x05,
+
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00
+};
+
+/*
+ * Transmit the sync[] payload to @addr on @sock and log the outcome. The
+ * logged time is the wall-clock time read right after sendto() returned.
+ */
+static void sendpacket(int sock, struct sockaddr *addr, socklen_t addr_len)
+{
+	struct timeval sent_at;
+	int nbytes;
+
+	nbytes = sendto(sock, sync, sizeof(sync), 0, addr, addr_len);
+	gettimeofday(&sent_at, 0);
+
+	if (nbytes >= 0) {
+		printf("%ld.%06ld: sent %d bytes\n",
+		       (long)sent_at.tv_sec, (long)sent_at.tv_usec,
+		       nbytes);
+		return;
+	}
+
+	printf("%s: %s\n", "send", strerror(errno));
+}
+
+/*
+ * Dump one message returned by recvmsg(): a summary line, every control
+ * message attached to it and, on request, the per-socket time stamps.
+ *
+ * @msg:           header filled in by recvmsg(); msg_name is read as a
+ *                 struct sockaddr_in
+ * @res:           number of bytes received into @data
+ * @data:          buffer holding the received bytes
+ * @sock:          socket the message came from (used for the ioctls)
+ * @recvmsg_flags: flags the caller passed to recvmsg(); MSG_ERRQUEUE selects
+ *                 the "error" wording
+ * @siocgstamp:    non-zero to also query and print SIOCGSTAMP
+ * @siocgstampns:  non-zero to also query and print SIOCGSTAMPNS
+ */
+static void printpacket(struct msghdr *msg, int res,
+ char *data,
+ int sock, int recvmsg_flags,
+ int siocgstamp, int siocgstampns)
+{
+ struct sockaddr_in *from_addr = (struct sockaddr_in *)msg->msg_name;
+ struct cmsghdr *cmsg;
+ struct timeval tv;
+ struct timespec ts;
+ struct timeval now;
+
+ gettimeofday(&now, 0);
+
+ printf("%ld.%06ld: received %s data, %d bytes from %s, %zu bytes control messages\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
+ res,
+ inet_ntoa(from_addr->sin_addr),
+ msg->msg_controllen);
+ /* one output line per control message */
+ for (cmsg = CMSG_FIRSTHDR(msg);
+ cmsg;
+ cmsg = CMSG_NXTHDR(msg, cmsg)) {
+ printf(" cmsg len %zu: ", cmsg->cmsg_len);
+ switch (cmsg->cmsg_level) {
+ case SOL_SOCKET:
+ printf("SOL_SOCKET ");
+ switch (cmsg->cmsg_type) {
+ case SO_TIMESTAMP: {
+ struct timeval *stamp =
+ (struct timeval *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMP %ld.%06ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_usec);
+ break;
+ }
+ case SO_TIMESTAMPNS: {
+ struct timespec *stamp =
+ (struct timespec *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMPNS %ld.%09ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ break;
+ }
+ case SO_TIMESTAMPING: {
+ /*
+  * The payload is read as three consecutive struct
+  * timespec: [0] is printed as the software stamp,
+  * [1] is skipped, [2] is printed as the raw
+  * hardware stamp. cmsg_len is not checked first.
+  */
+ struct timespec *stamp =
+ (struct timespec *)CMSG_DATA(cmsg);
+ printf("SO_TIMESTAMPING ");
+ printf("SW %ld.%09ld ",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ stamp++;
+ /* skip deprecated HW transformed */
+ stamp++;
+ printf("HW raw %ld.%09ld",
+ (long)stamp->tv_sec,
+ (long)stamp->tv_nsec);
+ break;
+ }
+ default:
+ printf("type %d", cmsg->cmsg_type);
+ break;
+ }
+ break;
+ case IPPROTO_IP:
+ printf("IPPROTO_IP ");
+ switch (cmsg->cmsg_type) {
+ case IP_RECVERR: {
+ struct sock_extended_err *err =
+ (struct sock_extended_err *)CMSG_DATA(cmsg);
+ printf("IP_RECVERR ee_errno '%s' ee_origin %d => %s",
+ strerror(err->ee_errno),
+ err->ee_origin,
+#ifdef SO_EE_ORIGIN_TIMESTAMPING
+ err->ee_origin == SO_EE_ORIGIN_TIMESTAMPING ?
+ "bounced packet" : "unexpected origin"
+#else
+ "probably SO_EE_ORIGIN_TIMESTAMPING"
+#endif
+ );
+ /*
+  * The last sizeof(sync) bytes of the received data are
+  * compared with the payload this program sends.
+  */
+ if (res < sizeof(sync))
+ printf(" => truncated data?!");
+ else if (!memcmp(sync, data + res - sizeof(sync),
+ sizeof(sync)))
+ printf(" => GOT OUR DATA BACK (HURRAY!)");
+ break;
+ }
+ case IP_PKTINFO: {
+ struct in_pktinfo *pktinfo =
+ (struct in_pktinfo *)CMSG_DATA(cmsg);
+ printf("IP_PKTINFO interface index %u",
+ pktinfo->ipi_ifindex);
+ break;
+ }
+ default:
+ printf("type %d", cmsg->cmsg_type);
+ break;
+ }
+ break;
+ default:
+ printf("level %d type %d",
+ cmsg->cmsg_level,
+ cmsg->cmsg_type);
+ break;
+ }
+ printf("\n");
+ }
+
+ /* optional per-socket time stamps, queried with ioctl() */
+ if (siocgstamp) {
+ if (ioctl(sock, SIOCGSTAMP, &tv))
+ printf(" %s: %s\n", "SIOCGSTAMP", strerror(errno));
+ else
+ printf("SIOCGSTAMP %ld.%06ld\n",
+ (long)tv.tv_sec,
+ (long)tv.tv_usec);
+ }
+ if (siocgstampns) {
+ if (ioctl(sock, SIOCGSTAMPNS, &ts))
+ printf(" %s: %s\n", "SIOCGSTAMPNS", strerror(errno));
+ else
+ printf("SIOCGSTAMPNS %ld.%09ld\n",
+ (long)ts.tv_sec,
+ (long)ts.tv_nsec);
+ }
+}
+
+/*
+ * Try one non-blocking recvmsg() on @sock (with @recvmsg_flags plus
+ * MSG_DONTWAIT) and either report the failure or hand the message to
+ * printpacket(). @siocgstamp and @siocgstampns are passed through unchanged.
+ */
+static void recvpacket(int sock, int recvmsg_flags,
+		       int siocgstamp, int siocgstampns)
+{
+	char payload[256];
+	struct msghdr hdr;
+	struct iovec iov;
+	struct sockaddr_in sender;
+	struct {
+		struct cmsghdr cm;
+		char control[512];
+	} cbuf;
+	int nbytes;
+
+	iov.iov_base = payload;
+	iov.iov_len = sizeof(payload);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.msg_name = (caddr_t)&sender;
+	hdr.msg_namelen = sizeof(sender);
+	hdr.msg_iov = &iov;
+	hdr.msg_iovlen = 1;
+	hdr.msg_control = &cbuf;
+	hdr.msg_controllen = sizeof(cbuf);
+
+	nbytes = recvmsg(sock, &hdr, recvmsg_flags|MSG_DONTWAIT);
+	if (nbytes >= 0) {
+		printpacket(&hdr, nbytes, payload,
+			    sock, recvmsg_flags,
+			    siocgstamp, siocgstampns);
+		return;
+	}
+
+	printf("%s %s: %s\n",
+	       "recvmsg",
+	       (recvmsg_flags & MSG_ERRQUEUE) ? "error" : "regular",
+	       strerror(errno));
+}
+
+/*
+ * Usage: timestamping <interface> [option ...]
+ *
+ * Parses the options listed by usage(), configures hardware time stamping on
+ * the interface, opens a UDP socket bound to port 319 that joins multicast
+ * group 224.0.1.130, enables the requested time stamping socket options and
+ * then loops forever: it sends the sync[] payload every five seconds and, in
+ * between, prints whatever arrives on the socket and on its error queue.
+ * Setup failures end the program through bail().
+ */
+int main(int argc, char **argv)
+{
+ int so_timestamping_flags = 0;
+ int so_timestamp = 0;
+ int so_timestampns = 0;
+ int siocgstamp = 0;
+ int siocgstampns = 0;
+ int ip_multicast_loop = 0;
+ char *interface;
+ int i;
+ int enabled = 1;
+ int sock;
+ struct ifreq device;
+ struct ifreq hwtstamp;
+ struct hwtstamp_config hwconfig, hwconfig_requested;
+ struct sockaddr_in addr;
+ struct ip_mreq imr;
+ struct in_addr iaddr;
+ int val;
+ socklen_t len;
+ struct timeval next;
+ size_t if_len;
+
+ if (argc < 2)
+ usage(0);
+ interface = argv[1];
+ /* the name is copied into ifr_name below, terminator included */
+ if_len = strlen(interface);
+ if (if_len >= IFNAMSIZ) {
+ printf("interface name exceeds IFNAMSIZ\n");
+ exit(1);
+ }
+
+ /* option names are matched case-insensitively; unknown ones end in usage() */
+ for (i = 2; i < argc; i++) {
+ if (!strcasecmp(argv[i], "SO_TIMESTAMP"))
+ so_timestamp = 1;
+ else if (!strcasecmp(argv[i], "SO_TIMESTAMPNS"))
+ so_timestampns = 1;
+ else if (!strcasecmp(argv[i], "SIOCGSTAMP"))
+ siocgstamp = 1;
+ else if (!strcasecmp(argv[i], "SIOCGSTAMPNS"))
+ siocgstampns = 1;
+ else if (!strcasecmp(argv[i], "IP_MULTICAST_LOOP"))
+ ip_multicast_loop = 1;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_TX_HARDWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_TX_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RX_HARDWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RX_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RX_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_SOFTWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_SOFTWARE;
+ else if (!strcasecmp(argv[i], "SOF_TIMESTAMPING_RAW_HARDWARE"))
+ so_timestamping_flags |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ else
+ usage(argv[i]);
+ }
+
+ sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+ if (sock < 0)
+ bail("socket");
+
+ /* the interface address is used further down as the multicast interface */
+ memset(&device, 0, sizeof(device));
+ memcpy(device.ifr_name, interface, if_len + 1);
+ if (ioctl(sock, SIOCGIFADDR, &device) < 0)
+ bail("getting interface IP address");
+
+ /*
+  * Hardware time stamping: TX and RX are switched on only when the
+  * matching SOF_TIMESTAMPING_*_HARDWARE option was given. A copy of the
+  * request is kept so it can be printed next to the contents of hwconfig
+  * after the ioctl.
+  */
+ memset(&hwtstamp, 0, sizeof(hwtstamp));
+ memcpy(hwtstamp.ifr_name, interface, if_len + 1);
+ hwtstamp.ifr_data = (void *)&hwconfig;
+ memset(&hwconfig, 0, sizeof(hwconfig));
+ hwconfig.tx_type =
+ (so_timestamping_flags & SOF_TIMESTAMPING_TX_HARDWARE) ?
+ HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
+ hwconfig.rx_filter =
+ (so_timestamping_flags & SOF_TIMESTAMPING_RX_HARDWARE) ?
+ HWTSTAMP_FILTER_PTP_V1_L4_SYNC : HWTSTAMP_FILTER_NONE;
+ hwconfig_requested = hwconfig;
+ if (ioctl(sock, SIOCSHWTSTAMP, &hwtstamp) < 0) {
+ /* tolerated only when nothing but "off" was requested */
+ if ((errno == EINVAL || errno == ENOTSUP) &&
+ hwconfig_requested.tx_type == HWTSTAMP_TX_OFF &&
+ hwconfig_requested.rx_filter == HWTSTAMP_FILTER_NONE)
+ printf("SIOCSHWTSTAMP: disabling hardware time stamping not possible\n");
+ else
+ bail("SIOCSHWTSTAMP");
+ }
+ printf("SIOCSHWTSTAMP: tx_type %d requested, got %d; rx_filter %d requested, got %d\n",
+ hwconfig_requested.tx_type, hwconfig.tx_type,
+ hwconfig_requested.rx_filter, hwconfig.rx_filter);
+
+ /* bind to PTP port */
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = htons(319 /* PTP event port */);
+ if (bind(sock,
+ (struct sockaddr *)&addr,
+ sizeof(struct sockaddr_in)) < 0)
+ bail("bind");
+
+ /* set multicast group for outgoing packets */
+ inet_aton("224.0.1.130", &iaddr); /* alternate PTP domain 1 */
+ /* addr is reused from here on as the destination of sendpacket() */
+ addr.sin_addr = iaddr;
+ imr.imr_multiaddr.s_addr = iaddr.s_addr;
+ imr.imr_interface.s_addr =
+ ((struct sockaddr_in *)&device.ifr_addr)->sin_addr.s_addr;
+ if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF,
+ &imr.imr_interface.s_addr, sizeof(struct in_addr)) < 0)
+ bail("set multicast");
+
+ /* join multicast group, loop our own packet */
+ if (setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+ &imr, sizeof(struct ip_mreq)) < 0)
+ bail("join multicast group");
+
+ /* NOTE(review): the length is sizeof(enabled), not sizeof(ip_multicast_loop); both are int */
+ if (setsockopt(sock, IPPROTO_IP, IP_MULTICAST_LOOP,
+ &ip_multicast_loop, sizeof(enabled)) < 0) {
+ bail("loop multicast");
+ }
+
+ /* set socket options for time stamping */
+ if (so_timestamp &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMP,
+ &enabled, sizeof(enabled)) < 0)
+ bail("setsockopt SO_TIMESTAMP");
+
+ if (so_timestampns &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS,
+ &enabled, sizeof(enabled)) < 0)
+ bail("setsockopt SO_TIMESTAMPNS");
+
+ if (so_timestamping_flags &&
+ setsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING,
+ &so_timestamping_flags,
+ sizeof(so_timestamping_flags)) < 0)
+ bail("setsockopt SO_TIMESTAMPING");
+
+ /* request IP_PKTINFO for debugging purposes */
+ if (setsockopt(sock, SOL_IP, IP_PKTINFO,
+ &enabled, sizeof(enabled)) < 0)
+ printf("%s: %s\n", "setsockopt IP_PKTINFO", strerror(errno));
+
+ /* verify socket options */
+ /* read-back failures are only reported, they do not end the program */
+ len = sizeof(val);
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMP, &val, &len) < 0)
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMP", strerror(errno));
+ else
+ printf("SO_TIMESTAMP %d\n", val);
+
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPNS, &val, &len) < 0)
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMPNS",
+ strerror(errno));
+ else
+ printf("SO_TIMESTAMPNS %d\n", val);
+
+ if (getsockopt(sock, SOL_SOCKET, SO_TIMESTAMPING, &val, &len) < 0) {
+ printf("%s: %s\n", "getsockopt SO_TIMESTAMPING",
+ strerror(errno));
+ } else {
+ printf("SO_TIMESTAMPING %d\n", val);
+ if (val != so_timestamping_flags)
+ printf(" not the expected value %d\n",
+ so_timestamping_flags);
+ }
+
+ /* send packets forever every five seconds */
+ gettimeofday(&next, 0);
+ /* round (now + 1s) down to a multiple of five seconds */
+ next.tv_sec = (next.tv_sec + 1) / 5 * 5;
+ next.tv_usec = 0;
+ while (1) {
+ struct timeval now;
+ struct timeval delta;
+ long delta_us;
+ int res;
+ fd_set readfs, errorfs;
+
+ gettimeofday(&now, 0);
+ /* microseconds left until the next scheduled send */
+ delta_us = (long)(next.tv_sec - now.tv_sec) * 1000000 +
+ (long)(next.tv_usec - now.tv_usec);
+ if (delta_us > 0) {
+ /* continue waiting for timeout or data */
+ delta.tv_sec = delta_us / 1000000;
+ delta.tv_usec = delta_us % 1000000;
+
+ FD_ZERO(&readfs);
+ FD_ZERO(&errorfs);
+ FD_SET(sock, &readfs);
+ FD_SET(sock, &errorfs);
+ printf("%ld.%06ld: select %ldus\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ delta_us);
+ res = select(sock + 1, &readfs, 0, &errorfs, &delta);
+ gettimeofday(&now, 0);
+ printf("%ld.%06ld: select returned: %d, %s\n",
+ (long)now.tv_sec, (long)now.tv_usec,
+ res,
+ res < 0 ? strerror(errno) : "success");
+ if (res > 0) {
+ if (FD_ISSET(sock, &readfs))
+ printf("ready for reading\n");
+ if (FD_ISSET(sock, &errorfs))
+ printf("has error\n");
+ /* both queues are tried regardless of which fd set fired */
+ recvpacket(sock, 0,
+ siocgstamp,
+ siocgstampns);
+ recvpacket(sock, MSG_ERRQUEUE,
+ siocgstamp,
+ siocgstampns);
+ }
+ } else {
+ /* write one packet */
+ sendpacket(sock,
+ (struct sockaddr *)&addr,
+ sizeof(addr));
+ next.tv_sec += 5;
+ continue;
+ }
+ }
+
+ /* not reached: the loop above has no exit */
+ return 0;
+}
diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
new file mode 100644
index 000000000..44a25a9f1
--- /dev/null
+++ b/tools/testing/selftests/net/tls.c
@@ -0,0 +1,1335 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <linux/tls.h>
+#include <linux/tcp.h>
+#include <linux/socket.h>
+
+#include <sys/types.h>
+#include <sys/sendfile.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "../kselftest_harness.h"
+
+#define TLS_PAYLOAD_MAX_LEN 16384
+#define SOL_TLS 282
+
+/*
+ * State shared by the tls_basic tests: fd is the connecting socket, cfd the
+ * accepted peer socket, notls records that TCP_ULP "tls" could not be set.
+ */
+FIXTURE(tls_basic)
+{
+ int fd, cfd;
+ bool notls;
+};
+
+/*
+ * Connect self->fd to a temporary listener on an ephemeral port, keep the
+ * accepted socket in self->cfd and attach the "tls" ULP to both ends. No
+ * crypto keys are installed here.
+ */
+FIXTURE_SETUP(tls_basic)
+{
+ struct sockaddr_in addr;
+ socklen_t len;
+ int sfd, ret;
+
+ self->notls = false;
+ len = sizeof(addr);
+
+ /* port 0: the port actually bound is read back with getsockname() */
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = 0;
+
+ self->fd = socket(AF_INET, SOCK_STREAM, 0);
+ sfd = socket(AF_INET, SOCK_STREAM, 0);
+
+ ret = bind(sfd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+ ret = listen(sfd, 10);
+ ASSERT_EQ(ret, 0);
+
+ ret = getsockname(sfd, &addr, &len);
+ ASSERT_EQ(ret, 0);
+
+ ret = connect(self->fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+
+ self->cfd = accept(sfd, &addr, &len);
+ ASSERT_GE(self->cfd, 0);
+
+ close(sfd);
+
+ /* ENOENT is the only accepted failure; the tests then run on plain TCP */
+ ret = setsockopt(self->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+ if (ret != 0) {
+ ASSERT_EQ(errno, ENOENT);
+ self->notls = true;
+ printf("Failure setting TCP_ULP, testing without tls\n");
+ return;
+ }
+
+ ret = setsockopt(self->cfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+ ASSERT_EQ(ret, 0);
+}
+
+/* Close both ends of the connection created by the setup. */
+FIXTURE_TEARDOWN(tls_basic)
+{
+ close(self->fd);
+ close(self->cfd);
+}
+
+/*
+ * Send some data through with ULP but no keys: the 10 bytes of "test_read"
+ * (terminator included) sent on fd must be what is read back on cfd.
+ */
+TEST_F(tls_basic, base_base)
+{
+ char const *test_str = "test_read";
+ int send_len = 10;
+ char buf[10];
+
+ /* guards the hard-coded length against edits of test_str */
+ ASSERT_EQ(strlen(test_str) + 1, send_len);
+
+ EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+ EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+};
+
+/*
+ * State shared by the tls tests: fd is the connecting socket (TLS_TX is set
+ * on it by the setup), cfd the accepted peer (TLS_RX), notls records that
+ * TCP_ULP "tls" could not be set on fd.
+ */
+FIXTURE(tls)
+{
+ int fd, cfd;
+ bool notls;
+};
+
+/* Per-variant parameter: the TLS version written into the crypto info by the setup. */
+FIXTURE_VARIANT(tls)
+{
+ unsigned int tls_version;
+};
+
+/* Variant "12": TLS_1_2_VERSION. */
+FIXTURE_VARIANT_ADD(tls, 12)
+{
+ .tls_version = TLS_1_2_VERSION,
+};
+
+/* Variant "13": TLS_1_3_VERSION. */
+FIXTURE_VARIANT_ADD(tls, 13)
+{
+ .tls_version = TLS_1_3_VERSION,
+};
+
+/*
+ * Connect self->fd to a temporary listener on an ephemeral port and keep the
+ * accepted socket in self->cfd. Unless the "tls" ULP cannot be attached to
+ * fd, install the same AES-GCM-128 crypto info as TLS_TX on fd and as TLS_RX
+ * on cfd, using the TLS version of the current variant.
+ */
+FIXTURE_SETUP(tls)
+{
+ struct tls12_crypto_info_aes_gcm_128 tls12;
+ struct sockaddr_in addr;
+ socklen_t len;
+ int sfd, ret;
+
+ self->notls = false;
+ len = sizeof(addr);
+
+ /* only version and cipher type are filled in; all other fields stay zero */
+ memset(&tls12, 0, sizeof(tls12));
+ tls12.info.version = variant->tls_version;
+ tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+
+ /* port 0: the port actually bound is read back with getsockname() */
+ addr.sin_family = AF_INET;
+ addr.sin_addr.s_addr = htonl(INADDR_ANY);
+ addr.sin_port = 0;
+
+ self->fd = socket(AF_INET, SOCK_STREAM, 0);
+ sfd = socket(AF_INET, SOCK_STREAM, 0);
+
+ ret = bind(sfd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+ ret = listen(sfd, 10);
+ ASSERT_EQ(ret, 0);
+
+ ret = getsockname(sfd, &addr, &len);
+ ASSERT_EQ(ret, 0);
+
+ ret = connect(self->fd, &addr, sizeof(addr));
+ ASSERT_EQ(ret, 0);
+
+ /* any failure here (errno is not inspected) switches to plain TCP */
+ ret = setsockopt(self->fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+ if (ret != 0) {
+ self->notls = true;
+ printf("Failure setting TCP_ULP, testing without tls\n");
+ }
+
+ if (!self->notls) {
+ ret = setsockopt(self->fd, SOL_TLS, TLS_TX, &tls12,
+ sizeof(tls12));
+ ASSERT_EQ(ret, 0);
+ }
+
+ self->cfd = accept(sfd, &addr, &len);
+ ASSERT_GE(self->cfd, 0);
+
+ if (!self->notls) {
+ ret = setsockopt(self->cfd, IPPROTO_TCP, TCP_ULP, "tls",
+ sizeof("tls"));
+ ASSERT_EQ(ret, 0);
+
+ ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12,
+ sizeof(tls12));
+ ASSERT_EQ(ret, 0);
+ }
+
+ close(sfd);
+}
+
+/* Close both ends of the connection created by the setup. */
+FIXTURE_TEARDOWN(tls)
+{
+ close(self->fd);
+ close(self->cfd);
+}
+
+/*
+ * sendfile() the running test binary (/proc/self/exe) into the TLS socket;
+ * only the return value of sendfile() is checked.
+ */
+TEST_F(tls, sendfile)
+{
+	int filefd = open("/proc/self/exe", O_RDONLY);
+	struct stat st;
+
+	/* without a valid descriptor and size the sendfile() call is meaningless */
+	ASSERT_GE(filefd, 0);
+	ASSERT_EQ(fstat(filefd, &st), 0);
+	EXPECT_GE(sendfile(self->fd, filefd, 0, st.st_size), 0);
+
+	/* release the descriptor instead of leaking it */
+	close(filefd);
+}
+
+/*
+ * A small send() followed by a sendfile() of the running test binary: the
+ * string must arrive intact and the receiver must then get st_size more
+ * bytes. The content of the file data is not compared.
+ */
+TEST_F(tls, send_then_sendfile)
+{
+	int filefd = open("/proc/self/exe", O_RDONLY);
+	char const *test_str = "test_send";
+	int to_send = strlen(test_str) + 1;
+	char recv_buf[10];
+	struct stat st;
+	char *buf;
+
+	/* st.st_size sizes both the buffer and the transfer, so it must be valid */
+	ASSERT_GE(filefd, 0);
+	ASSERT_EQ(fstat(filefd, &st), 0);
+	buf = (char *)malloc(st.st_size);
+
+	EXPECT_EQ(send(self->fd, test_str, to_send, 0), to_send);
+	EXPECT_EQ(recv(self->cfd, recv_buf, to_send, MSG_WAITALL), to_send);
+	EXPECT_EQ(memcmp(test_str, recv_buf, to_send), 0);
+
+	EXPECT_GE(sendfile(self->fd, filefd, 0, st.st_size), 0);
+	EXPECT_EQ(recv(self->cfd, buf, st.st_size, MSG_WAITALL), st.st_size);
+
+	/* release what this test allocated and opened */
+	free(buf);
+	close(filefd);
+}
+
+/*
+ * Write chunk_size + extra_payload_size bytes (at most TLS_PAYLOAD_MAX_LEN)
+ * to an already-unlinked temporary file, push the file through self->fd with
+ * sendfile() in pieces of at most chunk_size bytes and expect the whole
+ * payload to arrive on self->cfd.
+ *
+ * @_metadata:          test metadata, required by the ASSERT/EXPECT macros
+ * @self:               fixture data providing the fd/cfd socket pair
+ * @chunk_size:         byte count requested from each sendfile() call
+ * @extra_payload_size: bytes written to the file beyond the first chunk
+ */
+static void chunked_sendfile(struct __test_metadata *_metadata,
+			     struct _test_data_tls *self,
+			     uint16_t chunk_size,
+			     uint16_t extra_payload_size)
+{
+	char buf[TLS_PAYLOAD_MAX_LEN];
+	uint16_t test_payload_size;
+	int size = 0;
+	int ret;
+	char filename[] = "/tmp/mytemp.XXXXXX";
+	int fd = mkstemp(filename);
+	off_t offset = 0;
+
+	unlink(filename);
+	ASSERT_GE(fd, 0);
+	EXPECT_GE(chunk_size, 1);
+	test_payload_size = chunk_size + extra_payload_size;
+	ASSERT_GE(TLS_PAYLOAD_MAX_LEN, test_payload_size);
+	memset(buf, 1, test_payload_size);
+	size = write(fd, buf, test_payload_size);
+	EXPECT_EQ(size, test_payload_size);
+	fsync(fd);
+
+	while (size > 0) {
+		ret = sendfile(self->fd, fd, &offset, chunk_size);
+		/*
+		 * Every call has to make progress. A return of 0 or -1 would
+		 * leave size unchanged (or grow it) and the loop would never
+		 * end, so assert instead of merely expecting.
+		 */
+		ASSERT_GE(ret, 1);
+		size -= ret;
+	}
+
+	EXPECT_EQ(recv(self->cfd, buf, test_payload_size, MSG_WAITALL),
+		  test_payload_size);
+
+	close(fd);
+}
+
+/*
+ * Run chunked_sendfile() over a fixed list of (chunk size, extra payload)
+ * combinations, in this order.
+ */
+TEST_F(tls, multi_chunk_sendfile)
+{
+	static const struct {
+		uint16_t chunk_size;
+		uint16_t extra_payload_size;
+	} combos[] = {
+		{ 4096, 4096 },
+		{ 4096, 0 },
+		{ 4096, 1 },
+		{ 4096, 2048 },
+		{ 8192, 2048 },
+		{ 4096, 8192 },
+		{ 8192, 4096 },
+		{ 12288, 1024 },
+		{ 12288, 2000 },
+		{ 15360, 100 },
+		{ 15360, 300 },
+		{ 1, 4096 },
+		{ 2048, 4096 },
+		{ 2048, 8192 },
+		{ 4096, 8192 },
+		{ 1024, 12288 },
+		{ 2000, 12288 },
+		{ 100, 15360 },
+		{ 300, 15360 },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(combos) / sizeof(combos[0]); i++)
+		chunked_sendfile(_metadata, self, combos[i].chunk_size,
+				 combos[i].extra_payload_size);
+}
+
+/*
+ * Send one buffer of TLS_PAYLOAD_MAX_LEN bytes and compare it with what a
+ * single recv() delivers. buf is sent as-is, without being filled first.
+ */
+TEST_F(tls, recv_max)
+{
+ unsigned int send_len = TLS_PAYLOAD_MAX_LEN;
+ char recv_mem[TLS_PAYLOAD_MAX_LEN];
+ char buf[TLS_PAYLOAD_MAX_LEN];
+
+ EXPECT_GE(send(self->fd, buf, send_len, 0), 0);
+ EXPECT_NE(recv(self->cfd, recv_mem, send_len, 0), -1);
+ EXPECT_EQ(memcmp(buf, recv_mem, send_len), 0);
+}
+
+/* Send the short string "test_read" (with terminator) and read it back unchanged. */
+TEST_F(tls, recv_small)
+{
+ char const *test_str = "test_read";
+ int send_len = 10;
+ char buf[10];
+
+ /* overrides the initial value with the real length, terminator included */
+ send_len = strlen(test_str) + 1;
+ EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+ EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/*
+ * Data sent with MSG_MORE must not be readable yet (a non-blocking recv
+ * returns -1); after a second send without MSG_MORE both pieces arrive.
+ * Only the first send_len bytes of the result are compared with test_str.
+ */
+TEST_F(tls, msg_more)
+{
+ char const *test_str = "test_read";
+ int send_len = 10;
+ char buf[10 * 2];
+
+ EXPECT_EQ(send(self->fd, test_str, send_len, MSG_MORE), send_len);
+ EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_DONTWAIT), -1);
+ EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+ EXPECT_EQ(recv(self->cfd, buf, send_len * 2, MSG_WAITALL),
+ send_len * 2);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/*
+ * Data sent with MSG_MORE and never followed by another send must not be
+ * readable: a non-blocking recv on the peer returns -1.
+ */
+TEST_F(tls, msg_more_unsent)
+{
+ char const *test_str = "test_read";
+ int send_len = 10;
+ char buf[10];
+
+ EXPECT_EQ(send(self->fd, test_str, send_len, MSG_MORE), send_len);
+ EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_DONTWAIT), -1);
+}
+
+/* sendmsg() with a single iovec holding "test_sendmsg"; the peer must read the same 13 bytes. */
+TEST_F(tls, sendmsg_single)
+{
+ struct msghdr msg;
+
+ char const *test_str = "test_sendmsg";
+ size_t send_len = 13;
+ struct iovec vec;
+ char buf[13];
+
+ vec.iov_base = (char *)test_str;
+ vec.iov_len = send_len;
+ memset(&msg, 0, sizeof(struct msghdr));
+ msg.msg_iov = &vec;
+ msg.msg_iovlen = 1;
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), send_len);
+ EXPECT_EQ(recv(self->cfd, buf, send_len, MSG_WAITALL), send_len);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/* Both macros are local to sendmsg_fragmented and undefined again after it. */
+#define MAX_FRAGS 64
+#define SEND_LEN 13
+/*
+ * For every fragment count from 1 to MAX_FRAGS: sendmsg() that many iovecs,
+ * each pointing at the same SEND_LEN-byte string, then check that the peer
+ * receives all bytes and that every SEND_LEN-sized slice equals the string.
+ */
+TEST_F(tls, sendmsg_fragmented)
+{
+ char const *test_str = "test_sendmsg";
+ char buf[SEND_LEN * MAX_FRAGS];
+ struct iovec vec[MAX_FRAGS];
+ struct msghdr msg;
+ int i, frags;
+
+ for (frags = 1; frags <= MAX_FRAGS; frags++) {
+ for (i = 0; i < frags; i++) {
+ vec[i].iov_base = (char *)test_str;
+ vec[i].iov_len = SEND_LEN;
+ }
+
+ memset(&msg, 0, sizeof(struct msghdr));
+ msg.msg_iov = vec;
+ msg.msg_iovlen = frags;
+
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), SEND_LEN * frags);
+ EXPECT_EQ(recv(self->cfd, buf, SEND_LEN * frags, MSG_WAITALL),
+ SEND_LEN * frags);
+
+ for (i = 0; i < frags; i++)
+ EXPECT_EQ(memcmp(buf + SEND_LEN * i,
+ test_str, SEND_LEN), 0);
+ }
+}
+#undef MAX_FRAGS
+#undef SEND_LEN
+
+/*
+ * Queue 128 sendmsg() calls of 16384 bytes each, then issue 128 recv() calls
+ * on the peer. Only return values are checked; the buffer content is never
+ * initialised or compared.
+ */
+TEST_F(tls, sendmsg_large)
+{
+ void *mem = malloc(16384);
+ size_t send_len = 16384;
+ size_t sends = 128;
+ struct msghdr msg;
+ size_t recvs = 0;
+ size_t sent = 0;
+
+ memset(&msg, 0, sizeof(struct msghdr));
+ while (sent++ < sends) {
+ struct iovec vec = { (void *)mem, send_len };
+
+ msg.msg_iov = &vec;
+ msg.msg_iovlen = 1;
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), send_len);
+ }
+
+ while (recvs++ < sends) {
+ EXPECT_NE(recv(self->cfd, mem, send_len, 0), -1);
+ }
+
+ free(mem);
+}
+
+/*
+ * sendmsg() five heap-allocated copies of one string as five iovecs, then
+ * walk the received buffer string by string and compare each with its
+ * source copy.
+ */
+TEST_F(tls, sendmsg_multiple)
+{
+ char const *test_str = "test_sendmsg_multiple";
+ struct iovec vec[5];
+ char *test_strs[5];
+ struct msghdr msg;
+ int total_len = 0;
+ int len_cmp = 0;
+ int iov_len = 5;
+ char *buf;
+ int i;
+
+ memset(&msg, 0, sizeof(struct msghdr));
+ for (i = 0; i < iov_len; i++) {
+ test_strs[i] = (char *)malloc(strlen(test_str) + 1);
+ snprintf(test_strs[i], strlen(test_str) + 1, "%s", test_str);
+ vec[i].iov_base = (void *)test_strs[i];
+ vec[i].iov_len = strlen(test_strs[i]) + 1;
+ total_len += vec[i].iov_len;
+ }
+ msg.msg_iov = vec;
+ msg.msg_iovlen = iov_len;
+
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), total_len);
+ buf = malloc(total_len);
+ EXPECT_NE(recv(self->cfd, buf, total_len, 0), -1);
+ for (i = 0; i < iov_len; i++) {
+ EXPECT_EQ(memcmp(test_strs[i], buf + len_cmp,
+ strlen(test_strs[i])),
+ 0);
+ /* advance past the string just compared and its terminator */
+ len_cmp += strlen(buf + len_cmp) + 1;
+ }
+ for (i = 0; i < iov_len; i++)
+ free(test_strs[i]);
+ free(buf);
+}
+
+/*
+ * sendmsg() 1024 iovecs of 16 bytes each (1 << 14 bytes in total) and read
+ * them back with one recv(). Only the return values are checked.
+ */
+TEST_F(tls, sendmsg_multiple_stress)
+{
+ char const *test_str = "abcdefghijklmno";
+ struct iovec vec[1024];
+ char *test_strs[1024];
+ int iov_len = 1024;
+ int total_len = 0;
+ char buf[1 << 14];
+ struct msghdr msg;
+ int len_cmp = 0;
+ int i;
+
+ memset(&msg, 0, sizeof(struct msghdr));
+ for (i = 0; i < iov_len; i++) {
+ test_strs[i] = (char *)malloc(strlen(test_str) + 1);
+ snprintf(test_strs[i], strlen(test_str) + 1, "%s", test_str);
+ vec[i].iov_base = (void *)test_strs[i];
+ vec[i].iov_len = strlen(test_strs[i]) + 1;
+ total_len += vec[i].iov_len;
+ }
+ msg.msg_iov = vec;
+ msg.msg_iovlen = iov_len;
+
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), total_len);
+ EXPECT_NE(recv(self->cfd, buf, total_len, 0), -1);
+
+ /* NOTE(review): walks the received strings, but nothing is compared or asserted here */
+ for (i = 0; i < iov_len; i++)
+ len_cmp += strlen(buf + len_cmp) + 1;
+
+ for (i = 0; i < iov_len; i++)
+ free(test_strs[i]);
+}
+
+/*
+ * Write TLS_PAYLOAD_MAX_LEN bytes into a pipe, splice() them from the pipe
+ * into the TLS socket and compare what the peer receives with the source
+ * buffer. mem_send is used as-is, without being filled first.
+ */
+TEST_F(tls, splice_from_pipe)
+{
+ int send_len = TLS_PAYLOAD_MAX_LEN;
+ char mem_send[TLS_PAYLOAD_MAX_LEN];
+ char mem_recv[TLS_PAYLOAD_MAX_LEN];
+ int p[2];
+
+ ASSERT_GE(pipe(p), 0);
+ EXPECT_GE(write(p[1], mem_send, send_len), 0);
+ EXPECT_GE(splice(p[0], NULL, self->fd, NULL, send_len, 0), 0);
+ EXPECT_EQ(recv(self->cfd, mem_recv, send_len, MSG_WAITALL), send_len);
+ EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+/*
+ * Same as splice_from_pipe, but the 16000 bytes reach the socket as two
+ * 8000-byte halves spliced from two different pipes; the peer must receive
+ * them as one contiguous 16000-byte stream equal to mem_send.
+ */
+TEST_F(tls, splice_from_pipe2)
+{
+ int send_len = 16000;
+ char mem_send[16000];
+ char mem_recv[16000];
+ int p2[2];
+ int p[2];
+
+ ASSERT_GE(pipe(p), 0);
+ ASSERT_GE(pipe(p2), 0);
+ EXPECT_GE(write(p[1], mem_send, 8000), 0);
+ EXPECT_GE(splice(p[0], NULL, self->fd, NULL, 8000, 0), 0);
+ EXPECT_GE(write(p2[1], mem_send + 8000, 8000), 0);
+ EXPECT_GE(splice(p2[0], NULL, self->fd, NULL, 8000, 0), 0);
+ EXPECT_EQ(recv(self->cfd, mem_recv, send_len, MSG_WAITALL), send_len);
+ EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+/*
+ * Mix both transmit paths on one connection: first a plain send() of a short
+ * string, verified on the peer, then TLS_PAYLOAD_MAX_LEN bytes spliced in
+ * from a pipe, verified as well.
+ */
+TEST_F(tls, send_and_splice)
+{
+ int send_len = TLS_PAYLOAD_MAX_LEN;
+ char mem_send[TLS_PAYLOAD_MAX_LEN];
+ char mem_recv[TLS_PAYLOAD_MAX_LEN];
+ char const *test_str = "test_read";
+ int send_len2 = 10;
+ char buf[10];
+ int p[2];
+
+ ASSERT_GE(pipe(p), 0);
+ EXPECT_EQ(send(self->fd, test_str, send_len2, 0), send_len2);
+ EXPECT_EQ(recv(self->cfd, buf, send_len2, MSG_WAITALL), send_len2);
+ EXPECT_EQ(memcmp(test_str, buf, send_len2), 0);
+
+ EXPECT_GE(write(p[1], mem_send, send_len), send_len);
+ EXPECT_GE(splice(p[0], NULL, self->fd, NULL, send_len, 0), send_len);
+
+ EXPECT_EQ(recv(self->cfd, mem_recv, send_len, MSG_WAITALL), send_len);
+ EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+/*
+ * Receive direction: send TLS_PAYLOAD_MAX_LEN bytes, splice() them from the
+ * receiving socket into a pipe, read the pipe and compare with the source.
+ */
+TEST_F(tls, splice_to_pipe)
+{
+ int send_len = TLS_PAYLOAD_MAX_LEN;
+ char mem_send[TLS_PAYLOAD_MAX_LEN];
+ char mem_recv[TLS_PAYLOAD_MAX_LEN];
+ int p[2];
+
+ ASSERT_GE(pipe(p), 0);
+ EXPECT_GE(send(self->fd, mem_send, send_len, 0), 0);
+ EXPECT_GE(splice(self->cfd, NULL, p[1], NULL, send_len, 0), 0);
+ EXPECT_GE(read(p[0], mem_recv, send_len), 0);
+ EXPECT_EQ(memcmp(mem_send, mem_recv, send_len), 0);
+}
+
+/*
+ * Send one short string and receive it with recvmsg() into a single iovec.
+ */
+TEST_F(tls, recvmsg_single)
+{
+	char const *text = "test_recvmsg_single";
+	int len = strlen(text) + 1;	/* include the terminating NUL */
+	char rx[20];
+	struct iovec iov = {
+		.iov_base = (char *)rx,
+		.iov_len = len,
+	};
+	/* every member not named here starts out zeroed */
+	struct msghdr mh = {
+		.msg_iov = &iov,
+		.msg_iovlen = 1,
+	};
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+	EXPECT_NE(recvmsg(self->cfd, &mh, 0), -1);
+	EXPECT_EQ(memcmp(text, rx, len), 0);
+}
+
+/*
+ * Send a maximum-size payload (TLS_PAYLOAD_MAX_LEN bytes) and receive it with
+ * recvmsg() into a single iovec of the same size.
+ */
+TEST_F(tls, recvmsg_single_max)
+{
+	int send_len = TLS_PAYLOAD_MAX_LEN;
+	char send_mem[TLS_PAYLOAD_MAX_LEN];
+	char recv_mem[TLS_PAYLOAD_MAX_LEN];
+	struct iovec vec;
+	struct msghdr hdr;
+
+	/*
+	 * recvmsg() reads msg_name, msg_control and their lengths as well;
+	 * clear the whole header so they are not stack garbage.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+
+	EXPECT_EQ(send(self->fd, send_mem, send_len, 0), send_len);
+	vec.iov_base = (char *)recv_mem;
+	vec.iov_len = TLS_PAYLOAD_MAX_LEN;
+
+	hdr.msg_iovlen = 1;
+	hdr.msg_iov = &vec;
+	EXPECT_NE(recvmsg(self->cfd, &hdr, 0), -1);
+	EXPECT_EQ(memcmp(send_mem, recv_mem, send_len), 0);
+}
+
+/*
+ * Send 16 KiB and receive it with a single recvmsg() call that scatters the
+ * data over 1024 heap-allocated iovecs of 16 bytes each. Only the success of
+ * the call is checked, not the received contents.
+ */
+TEST_F(tls, recvmsg_multiple)
+{
+	unsigned int msg_iovlen = 1024;
+	struct iovec vec[1024];
+	char *iov_base[1024];
+	unsigned int iov_len = 16;
+	int send_len = 1 << 14;
+	char buf[1 << 14];
+	struct msghdr hdr;
+	int i;
+
+	/*
+	 * recvmsg() reads msg_name, msg_control and their lengths as well;
+	 * clear the whole header so they are not stack garbage.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+
+	EXPECT_EQ(send(self->fd, buf, send_len, 0), send_len);
+	for (i = 0; i < msg_iovlen; i++) {
+		iov_base[i] = (char *)malloc(iov_len);
+		/* never hand a NULL segment to the kernel */
+		ASSERT_NE(iov_base[i], NULL);
+		vec[i].iov_base = iov_base[i];
+		vec[i].iov_len = iov_len;
+	}
+
+	hdr.msg_iovlen = msg_iovlen;
+	hdr.msg_iov = vec;
+	EXPECT_NE(recvmsg(self->cfd, &hdr, 0), -1);
+
+	for (i = 0; i < msg_iovlen; i++)
+		free(iov_base[i]);
+}
+
+/*
+ * Send 2 * TLS_PAYLOAD_MAX_LEN bytes with one send() and collect them with
+ * two recv() calls of TLS_PAYLOAD_MAX_LEN bytes each.
+ */
+TEST_F(tls, single_send_multiple_recv)
+{
+	unsigned int chunk = TLS_PAYLOAD_MAX_LEN;
+	unsigned int whole = TLS_PAYLOAD_MAX_LEN * 2;
+	char tx[TLS_PAYLOAD_MAX_LEN * 2];
+	char rx[TLS_PAYLOAD_MAX_LEN * 2];
+
+	EXPECT_GE(send(self->fd, tx, whole, 0), 0);
+	memset(rx, 0, whole);
+
+	/* first half, then second half right behind it */
+	EXPECT_NE(recv(self->cfd, rx, chunk, 0), -1);
+	EXPECT_NE(recv(self->cfd, rx + chunk, chunk, 0), -1);
+
+	EXPECT_EQ(memcmp(tx, rx, whole), 0);
+}
+
+/*
+ * Send the same 10-byte buffer twice and receive both copies with a single
+ * 20-byte MSG_WAITALL recv().
+ */
+TEST_F(tls, multiple_send_single_recv)
+{
+	unsigned int piece = 10;
+	unsigned int whole = 2 * 10;
+	char tx[10];
+	char rx[2 * 10];
+
+	EXPECT_GE(send(self->fd, tx, piece, 0), 0);
+	EXPECT_GE(send(self->fd, tx, piece, 0), 0);
+
+	memset(rx, 0, whole);
+	EXPECT_EQ(recv(self->cfd, rx, whole, MSG_WAITALL), whole);
+
+	/* both halves of the receive buffer must equal the sent buffer */
+	EXPECT_EQ(memcmp(tx, rx, piece), 0);
+	EXPECT_EQ(memcmp(tx, rx + piece, piece), 0);
+}
+
+/*
+ * Send 15 bytes and read them with two 10-byte recv() calls: the first must
+ * return 10 bytes, the second the remaining 5.
+ */
+TEST_F(tls, single_send_multiple_recv_non_align)
+{
+	const unsigned int sent = 15;
+	const unsigned int want = 10;
+	char rx[want * 2];
+	char tx[sent];
+
+	EXPECT_GE(send(self->fd, tx, sent, 0), 0);
+	memset(rx, 0, sent);
+
+	EXPECT_EQ(recv(self->cfd, rx, want, 0), want);
+	EXPECT_EQ(recv(self->cfd, rx + want, want, 0), 5);
+
+	EXPECT_EQ(memcmp(tx, rx, sent), 0);
+}
+
+/*
+ * Send one string and consume it in two MSG_WAITALL reads, checking that the
+ * first read yields the leading part and the second the trailing part.
+ */
+TEST_F(tls, recv_partial)
+{
+	char const *whole = "test_read_partial";
+	char const *head = "test_read";
+	char const *tail = "_partial";
+	int whole_len = strlen(whole) + 1;
+	char rx[18];
+
+	memset(rx, 0, sizeof(rx));
+	EXPECT_EQ(send(self->fd, whole, whole_len, 0), whole_len);
+
+	/* leading part */
+	EXPECT_NE(recv(self->cfd, rx, strlen(head), MSG_WAITALL), -1);
+	EXPECT_EQ(memcmp(head, rx, strlen(head)), 0);
+
+	/* trailing part, into a freshly cleared buffer */
+	memset(rx, 0, sizeof(rx));
+	EXPECT_NE(recv(self->cfd, rx, strlen(tail), MSG_WAITALL), -1);
+	EXPECT_EQ(memcmp(tail, rx, strlen(tail)), 0);
+}
+
+/*
+ * A MSG_DONTWAIT recv() on a socket with nothing queued must fail with
+ * EAGAIN or EWOULDBLOCK.
+ */
+TEST_F(tls, recv_nonblock)
+{
+	char scratch[4096];
+	bool would_block;
+
+	EXPECT_EQ(recv(self->cfd, scratch, sizeof(scratch), MSG_DONTWAIT), -1);
+	would_block = (errno == EAGAIN || errno == EWOULDBLOCK);
+	EXPECT_EQ(would_block, true);
+}
+
+/*
+ * Data read with MSG_PEEK must stay queued: a following plain recv() has to
+ * return the same bytes again.
+ */
+TEST_F(tls, recv_peek)
+{
+	char const *text = "test_read_peek";
+	int len = strlen(text) + 1;
+	char rx[15];
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+
+	/* look without consuming */
+	EXPECT_NE(recv(self->cfd, rx, len, MSG_PEEK), -1);
+	EXPECT_EQ(memcmp(text, rx, len), 0);
+
+	/* now consume for real */
+	memset(rx, 0, sizeof(rx));
+	EXPECT_NE(recv(self->cfd, rx, len, 0), -1);
+	EXPECT_EQ(memcmp(text, rx, len), 0);
+}
+
+/*
+ * Peek at the same queued message 100 times in a row, then consume it; every
+ * read must return the original bytes.
+ */
+TEST_F(tls, recv_peek_multiple)
+{
+	char const *text = "test_read_peek";
+	int len = strlen(text) + 1;
+	unsigned int peeks_left = 100;
+	char rx[15];
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+
+	while (peeks_left--) {
+		EXPECT_NE(recv(self->cfd, rx, len, MSG_PEEK), -1);
+		EXPECT_EQ(memcmp(text, rx, len), 0);
+		/* clear so the next iteration cannot pass on stale data */
+		memset(rx, 0, sizeof(rx));
+	}
+
+	EXPECT_NE(recv(self->cfd, rx, len, 0), -1);
+	EXPECT_EQ(memcmp(text, rx, len), 0);
+}
+
+/*
+ * Exercise MSG_PEEK when the data is split over two send() calls, first as
+ * two independent sends and then with MSG_MORE on the first send.
+ */
+TEST_F(tls, recv_peek_multiple_records)
+{
+	char const *whole = "test_read_peek_mult_recs";
+	char const *head = "test_read_peek";
+	char const *tail = "_mult_recs";
+	int head_len = strlen(head);
+	int tail_len = strlen(tail) + 1;
+	int whole_len = strlen(whole) + 1;
+	char rx[64];
+
+	EXPECT_EQ(send(self->fd, head, head_len, 0), head_len);
+	EXPECT_EQ(send(self->fd, tail, tail_len, 0), tail_len);
+
+	memset(rx, 0, head_len);
+	EXPECT_EQ(recv(self->cfd, rx, head_len, MSG_PEEK | MSG_WAITALL),
+		  head_len);
+
+	/* MSG_PEEK can only peek into the current record. */
+	EXPECT_EQ(memcmp(head, rx, head_len), 0);
+
+	memset(rx, 0, whole_len);
+	EXPECT_EQ(recv(self->cfd, rx, whole_len, MSG_WAITALL), whole_len);
+
+	/* Non-MSG_PEEK will advance strparser (and therefore record)
+	 * however.
+	 */
+	EXPECT_EQ(memcmp(whole, rx, whole_len), 0);
+
+	/* MSG_MORE will hold current record open, so later MSG_PEEK
+	 * will see everything.
+	 */
+	EXPECT_EQ(send(self->fd, head, head_len, MSG_MORE), head_len);
+	EXPECT_EQ(send(self->fd, tail, tail_len, 0), tail_len);
+
+	memset(rx, 0, whole_len);
+	EXPECT_EQ(recv(self->cfd, rx, whole_len, MSG_PEEK | MSG_WAITALL),
+		  whole_len);
+
+	EXPECT_EQ(memcmp(whole, rx, whole_len), 0);
+}
+
+/*
+ * Send a string in two pieces, then peek with MSG_PEEK | MSG_WAITALL using a
+ * length that covers both pieces and expect to see the complete string.
+ */
+TEST_F(tls, recv_peek_large_buf_mult_recs)
+{
+	char const *whole = "test_read_peek_mult_recs";
+	char const *head = "test_read_peek";
+	char const *tail = "_mult_recs";
+	int head_len = strlen(head);
+	int tail_len = strlen(tail) + 1;
+	int whole_len = strlen(whole) + 1;
+	int peeked;
+	char rx[64];
+
+	EXPECT_EQ(send(self->fd, head, head_len, 0), head_len);
+	EXPECT_EQ(send(self->fd, tail, tail_len, 0), tail_len);
+
+	memset(rx, 0, whole_len);
+	EXPECT_NE((peeked = recv(self->cfd, rx, whole_len,
+				 MSG_PEEK | MSG_WAITALL)), -1);
+	EXPECT_EQ(memcmp(whole, rx, whole_len), 0);
+}
+
+/*
+ * With SO_RCVLOWAT set to 8 on the receiver, queue a 10-byte and a 5-byte
+ * message and drain them with reads of 1, 6 and up to 10 bytes; the last
+ * read is expected to return the remaining 8 bytes.
+ */
+TEST_F(tls, recv_lowat)
+{
+	char tx[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+	char rx[20];
+	int low_water = 8;
+
+	EXPECT_EQ(send(self->fd, tx, 10, 0), 10);
+	EXPECT_EQ(send(self->fd, tx, 5, 0), 5);
+
+	memset(rx, 0, 20);
+	EXPECT_EQ(setsockopt(self->cfd, SOL_SOCKET, SO_RCVLOWAT,
+			     &low_water, sizeof(low_water)), 0);
+
+	EXPECT_EQ(recv(self->cfd, rx, 1, MSG_WAITALL), 1);
+	EXPECT_EQ(recv(self->cfd, rx + 1, 6, MSG_WAITALL), 6);
+	EXPECT_EQ(recv(self->cfd, rx + 7, 10, 0), 8);
+
+	/* 10 bytes of the first message followed by 5 of the second */
+	EXPECT_EQ(memcmp(tx, rx, 10), 0);
+	EXPECT_EQ(memcmp(tx, rx + 10, 5), 0);
+}
+
+/*
+ * Check that data flows in both directions. Unless the fixture runs without
+ * TLS (self->notls), the reverse direction is first enabled by configuring
+ * TLS_RX on self->fd and TLS_TX on self->cfd with an all-zero AES-GCM-128
+ * key; then a 10-byte string is sent fd -> cfd and cfd -> fd and verified
+ * each time.
+ */
+TEST_F(tls, bidir)
+{
+	char const *test_str = "test_read";
+	int send_len = 10;
+	char buf[10];
+	int ret;
+
+	if (!self->notls) {
+		struct tls12_crypto_info_aes_gcm_128 tls12;
+
+		memset(&tls12, 0, sizeof(tls12));
+		tls12.info.version = variant->tls_version;
+		tls12.info.cipher_type = TLS_CIPHER_AES_GCM_128;
+
+		ret = setsockopt(self->fd, SOL_TLS, TLS_RX, &tls12,
+				 sizeof(tls12));
+		ASSERT_EQ(ret, 0);
+
+		ret = setsockopt(self->cfd, SOL_TLS, TLS_TX, &tls12,
+				 sizeof(tls12));
+		ASSERT_EQ(ret, 0);
+	}
+
+	/* send_len must cover the string and its terminating NUL */
+	ASSERT_EQ(strlen(test_str) + 1, send_len);
+
+	EXPECT_EQ(send(self->fd, test_str, send_len, 0), send_len);
+	EXPECT_NE(recv(self->cfd, buf, send_len, 0), -1);
+	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+
+	memset(buf, 0, sizeof(buf));
+
+	/* and the same again in the opposite direction */
+	EXPECT_EQ(send(self->cfd, test_str, send_len, 0), send_len);
+	EXPECT_NE(recv(self->fd, buf, send_len, 0), -1);
+	EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/*
+ * poll() must report POLLIN once data is queued and must time out again
+ * after that data has been consumed.
+ */
+TEST_F(tls, pollin)
+{
+	char const *text = "test_poll";
+	struct pollfd pfd = { .fd = self->cfd, .events = POLLIN };
+	int len = 10;
+	char rx[10];
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+
+	EXPECT_EQ(poll(&pfd, 1, 20), 1);
+	EXPECT_EQ(pfd.revents & POLLIN, 1);
+	EXPECT_EQ(recv(self->cfd, rx, len, MSG_WAITALL), len);
+	/* Test timing out */
+	EXPECT_EQ(poll(&pfd, 1, 20), 0);
+}
+
+/*
+ * poll() with an infinite timeout must return as soon as the data sent
+ * beforehand is readable.
+ */
+TEST_F(tls, poll_wait)
+{
+	char const *text = "test_poll_wait";
+	struct pollfd pfd = { .fd = self->cfd, .events = POLLIN };
+	int len = strlen(text) + 1;
+	char rx[15];
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+	/* Set timeout to inf. secs */
+	EXPECT_EQ(poll(&pfd, 1, -1), 1);
+	EXPECT_EQ(pfd.revents & POLLIN, 1);
+	EXPECT_EQ(recv(self->cfd, rx, len, MSG_WAITALL), len);
+}
+
+/*
+ * Send 20 bytes, read 15 of them, and check that poll() still reports POLLIN
+ * for the 5 bytes left over, which a second recv() must then return.
+ */
+TEST_F(tls, poll_wait_split)
+{
+	struct pollfd pfd = { .fd = self->cfd, .events = POLLIN };
+	char tx[20] = {};
+	char rx[15];
+
+	/* Send 20 bytes */
+	EXPECT_EQ(send(self->fd, tx, sizeof(tx), 0), sizeof(tx));
+	/* Poll with inf. timeout */
+	EXPECT_EQ(poll(&pfd, 1, -1), 1);
+	EXPECT_EQ(pfd.revents & POLLIN, 1);
+	EXPECT_EQ(recv(self->cfd, rx, sizeof(rx), MSG_WAITALL), sizeof(rx));
+
+	/* Now the remaining 5 bytes of record data are in TLS ULP */
+	pfd.fd = self->cfd;
+	pfd.events = POLLIN;
+	EXPECT_EQ(poll(&pfd, 1, -1), 1);
+	EXPECT_EQ(pfd.revents & POLLIN, 1);
+	EXPECT_EQ(recv(self->cfd, rx, sizeof(rx), 0),
+		  sizeof(tx) - sizeof(rx));
+}
+
+/*
+ * Transfer 100000 bytes between two processes using blocking sockets: the
+ * parent sends in chunks of up to 16384 bytes while the forked child
+ * receives. The parent then reaps the child and checks its exit status.
+ */
+TEST_F(tls, blocking)
+{
+	size_t data = 100000;
+	int res = fork();
+
+	/* without a child there is nobody to drain the socket */
+	ASSERT_NE(res, -1);
+
+	if (res) {
+		/* parent */
+		size_t left = data;
+		char buf[16384];
+		int status;
+		int pid2;
+
+		while (left) {
+			int n = send(self->fd, buf,
+				     left > 16384 ? 16384 : left, 0);
+
+			EXPECT_GE(n, 0);
+			/*
+			 * Stop on error: subtracting -1 from the unsigned
+			 * counter would make it grow and never reach zero.
+			 */
+			if (n < 0)
+				break;
+			left -= n;
+		}
+
+		pid2 = wait(&status);
+		EXPECT_EQ(status, 0);
+		EXPECT_EQ(res, pid2);
+	} else {
+		/* child */
+		size_t left = data;
+		char buf[16384];
+
+		while (left) {
+			int n = recv(self->cfd, buf,
+				     left > 16384 ? 16384 : left, 0);
+
+			EXPECT_GE(n, 0);
+			/* error or end of stream: nothing more will arrive */
+			if (n <= 0)
+				break;
+			left -= n;
+		}
+	}
+}
+
+/*
+ * Transfer 100000 bytes between two processes using non-blocking sockets and
+ * a tiny send buffer. Parent (sender) and child (receiver) each retry after a
+ * 10 ms sleep on EAGAIN, and each expects to have hit EAGAIN at least once.
+ */
+TEST_F(tls, nonblocking)
+{
+	size_t data = 100000;
+	int sendbuf = 100;
+	int flags;
+	int res;
+
+	/* each socket keeps its own flags; only O_NONBLOCK is added */
+	flags = fcntl(self->fd, F_GETFL, 0);
+	fcntl(self->fd, F_SETFL, flags | O_NONBLOCK);
+	flags = fcntl(self->cfd, F_GETFL, 0);
+	fcntl(self->cfd, F_SETFL, flags | O_NONBLOCK);
+
+	/* Ensure nonblocking behavior by imposing a small send
+	 * buffer.
+	 */
+	EXPECT_EQ(setsockopt(self->fd, SOL_SOCKET, SO_SNDBUF,
+			     &sendbuf, sizeof(sendbuf)), 0);
+
+	res = fork();
+	/* without a child there is nobody to drain the socket */
+	ASSERT_NE(res, -1);
+
+	if (res) {
+		/* parent */
+		bool eagain = false;
+		size_t left = data;
+		char buf[16384];
+		int status;
+		int pid2;
+
+		while (left) {
+			int n = send(self->fd, buf,
+				     left > 16384 ? 16384 : left, 0);
+
+			if (n == -1 && errno == EAGAIN) {
+				eagain = true;
+				usleep(10000);
+				continue;
+			}
+			EXPECT_GE(n, 0);
+			/*
+			 * Stop on any other error: subtracting -1 from the
+			 * unsigned counter would make it grow forever.
+			 */
+			if (n < 0)
+				break;
+			left -= n;
+		}
+
+		EXPECT_TRUE(eagain);
+		pid2 = wait(&status);
+
+		EXPECT_EQ(status, 0);
+		EXPECT_EQ(res, pid2);
+	} else {
+		/* child */
+		bool eagain = false;
+		size_t left = data;
+		char buf[16384];
+
+		while (left) {
+			int n = recv(self->cfd, buf,
+				     left > 16384 ? 16384 : left, 0);
+
+			if (n == -1 && errno == EAGAIN) {
+				eagain = true;
+				usleep(10000);
+				continue;
+			}
+			EXPECT_GE(n, 0);
+			/* error or end of stream: nothing more will arrive */
+			if (n <= 0)
+				break;
+			left -= n;
+		}
+		EXPECT_TRUE(eagain);
+	}
+}
+
+/*
+ * Stress one connection from several processes at once.
+ *
+ * Forks n_readers + n_writers children that all share self->fd/self->cfd.
+ * The first n_readers children recv() from self->cfd; the rest write to
+ * self->fd, using sendfile() from a temporary file when @sendpg is true and
+ * send() otherwise. Each child moves a multiple of 6,000,000 bytes chosen so
+ * that the total read equals the total written. The parent only waits for
+ * the children and checks their exit status.
+ *
+ * One of n_readers/n_writers must be a multiple of the other.
+ */
+static void
+test_mutliproc(struct __test_metadata *_metadata, struct _test_data_tls *self,
+	       bool sendpg, unsigned int n_readers, unsigned int n_writers)
+{
+	const unsigned int n_children = n_readers + n_writers;
+	const size_t data = 6 * 1000 * 1000;
+	const size_t file_sz = data / 100;
+	size_t read_bias, write_bias;
+	int i, fd, child_id;
+	char buf[file_sz];
+	pid_t pid;
+
+	/* Only allow multiples for simplicity */
+	ASSERT_EQ(!(n_readers % n_writers) || !(n_writers % n_readers), true);
+	/* the less numerous side moves proportionally more data per child */
+	read_bias = n_writers / n_readers ?: 1;
+	write_bias = n_readers / n_writers ?: 1;
+
+	/* prep a file to send */
+	fd = open("/tmp/", O_TMPFILE | O_RDWR, 0600);
+	ASSERT_GE(fd, 0);
+
+	memset(buf, 0xac, file_sz);
+	ASSERT_EQ(write(fd, buf, file_sz), file_sz);
+
+	/* spawn children */
+	for (child_id = 0; child_id < n_children; child_id++) {
+		pid = fork();
+		ASSERT_NE(pid, -1);
+		if (!pid)
+			break;
+	}
+
+	/* parent waits for all children */
+	if (pid) {
+		for (i = 0; i < n_children; i++) {
+			int status;
+
+			wait(&status);
+			EXPECT_EQ(status, 0);
+		}
+
+		close(fd);
+		return;
+	}
+
+	/* Split child processes into readers and writers */
+	if (child_id < n_readers) {
+		size_t left = data * read_bias;
+		char rb[8001];
+
+		while (left) {
+			int res;
+
+			res = recv(self->cfd, rb,
+				   left > sizeof(rb) ? sizeof(rb) : left, 0);
+
+			EXPECT_GE(res, 0);
+			/* error or end of stream: nothing more will arrive */
+			if (res <= 0)
+				break;
+			left -= res;
+		}
+	} else {
+		size_t left = data * write_bias;
+
+		while (left) {
+			int res;
+
+			ASSERT_EQ(lseek(fd, 0, SEEK_SET), 0);
+			if (sendpg)
+				res = sendfile(self->fd, fd, NULL,
+					       left > file_sz ? file_sz : left);
+			else
+				res = send(self->fd, buf,
+					   left > file_sz ? file_sz : left, 0);
+
+			EXPECT_GE(res, 0);
+			/*
+			 * Stop on error: subtracting -1 from the unsigned
+			 * counter would make it grow and never reach zero.
+			 */
+			if (res < 0)
+				break;
+			left -= res;
+		}
+	}
+
+	close(fd);
+}
+
+/* 6 reader and 6 writer processes; writers use send(). */
+TEST_F(tls, mutliproc_even)
+{
+	const unsigned int readers = 6, writers = 6;
+
+	test_mutliproc(_metadata, self, false, readers, writers);
+}
+
+/* 4 reader and 12 writer processes; writers use send(). */
+TEST_F(tls, mutliproc_readers)
+{
+	const unsigned int readers = 4, writers = 12;
+
+	test_mutliproc(_metadata, self, false, readers, writers);
+}
+
+/* 10 reader and 2 writer processes; writers use send(). */
+TEST_F(tls, mutliproc_writers)
+{
+	const unsigned int readers = 10, writers = 2;
+
+	test_mutliproc(_metadata, self, false, readers, writers);
+}
+
+/* 6 reader and 6 writer processes; writers use sendfile(). */
+TEST_F(tls, mutliproc_sendpage_even)
+{
+	const unsigned int readers = 6, writers = 6;
+
+	test_mutliproc(_metadata, self, true, readers, writers);
+}
+
+/* 4 reader and 12 writer processes; writers use sendfile(). */
+TEST_F(tls, mutliproc_sendpage_readers)
+{
+	const unsigned int readers = 4, writers = 12;
+
+	test_mutliproc(_metadata, self, true, readers, writers);
+}
+
+/* 10 reader and 2 writer processes; writers use sendfile(). */
+TEST_F(tls, mutliproc_sendpage_writers)
+{
+	const unsigned int readers = 10, writers = 2;
+
+	test_mutliproc(_metadata, self, true, readers, writers);
+}
+
+/*
+ * Send one record whose record type (100) is selected through a
+ * SOL_TLS/TLS_SET_RECORD_TYPE control message, then read it back twice with
+ * recvmsg(): first with MSG_PEEK, then for real. Each time the record type
+ * reported in the TLS_GET_RECORD_TYPE control message and the payload are
+ * checked. Does nothing when the fixture runs without TLS (self->notls).
+ */
+TEST_F(tls, control_msg)
+{
+ if (self->notls)
+ return;
+
+ char cbuf[CMSG_SPACE(sizeof(char))];
+ char const *test_str = "test_read";
+ int cmsg_len = sizeof(char);
+ char record_type = 100;
+ struct cmsghdr *cmsg;
+ struct msghdr msg;
+ int send_len = 10;
+ struct iovec vec;
+ char buf[10];
+
+ /* Payload: the 10 bytes of test_str, including its NUL. */
+ vec.iov_base = (char *)test_str;
+ vec.iov_len = 10;
+ memset(&msg, 0, sizeof(struct msghdr));
+ msg.msg_iov = &vec;
+ msg.msg_iovlen = 1;
+ msg.msg_control = cbuf;
+ msg.msg_controllen = sizeof(cbuf);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg->cmsg_level = SOL_TLS;
+ /* test sending non-record types. */
+ cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+ cmsg->cmsg_len = CMSG_LEN(cmsg_len);
+ *CMSG_DATA(cmsg) = record_type;
+ /* Shrink the control length to the single cmsg actually filled in. */
+ msg.msg_controllen = cmsg->cmsg_len;
+
+ EXPECT_EQ(sendmsg(self->fd, &msg, 0), send_len);
+ /* Should fail because we didn't provide a control message */
+ EXPECT_EQ(recv(self->cfd, buf, send_len, 0), -1);
+
+ /*
+ * The same msghdr is reused for receiving: point the iovec at the
+ * receive buffer; msg_control still refers to cbuf.
+ */
+ vec.iov_base = buf;
+ EXPECT_EQ(recvmsg(self->cfd, &msg, MSG_WAITALL | MSG_PEEK), send_len);
+
+ cmsg = CMSG_FIRSTHDR(&msg);
+ /* NOTE(review): cmsg is dereferenced below even if this check fails. */
+ EXPECT_NE(cmsg, NULL);
+ EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
+ EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
+ record_type = *((unsigned char *)CMSG_DATA(cmsg));
+ EXPECT_EQ(record_type, 100);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+
+ /* Recv the message again without MSG_PEEK */
+ record_type = 0;
+ memset(buf, 0, sizeof(buf));
+
+ EXPECT_EQ(recvmsg(self->cfd, &msg, MSG_WAITALL), send_len);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ EXPECT_NE(cmsg, NULL);
+ EXPECT_EQ(cmsg->cmsg_level, SOL_TLS);
+ EXPECT_EQ(cmsg->cmsg_type, TLS_GET_RECORD_TYPE);
+ record_type = *((unsigned char *)CMSG_DATA(cmsg));
+ EXPECT_EQ(record_type, 100);
+ EXPECT_EQ(memcmp(buf, test_str, send_len), 0);
+}
+
+/*
+ * Exchange one 10-byte message, then shut down both ends of the connection
+ * in both directions.
+ */
+TEST_F(tls, shutdown)
+{
+	char const *text = "test_read";
+	int len = 10;
+	char rx[10];
+
+	/* len must cover the string and its terminating NUL */
+	ASSERT_EQ(strlen(text) + 1, len);
+
+	EXPECT_EQ(send(self->fd, text, len, 0), len);
+	EXPECT_NE(recv(self->cfd, rx, len, 0), -1);
+	EXPECT_EQ(memcmp(rx, text, len), 0);
+
+	shutdown(self->fd, SHUT_RDWR);
+	shutdown(self->cfd, SHUT_RDWR);
+}
+
+/*
+ * Queue 10 bytes with MSG_MORE and shut both sockets down without ever
+ * completing the send or reading the data.
+ */
+TEST_F(tls, shutdown_unsent)
+{
+	char const *text = "test_read";
+	int len = 10;
+
+	EXPECT_EQ(send(self->fd, text, len, MSG_MORE), len);
+
+	shutdown(self->fd, SHUT_RDWR);
+	shutdown(self->cfd, SHUT_RDWR);
+}
+
+/*
+ * After shutting the connection down, try to reuse self->fd: bind() is
+ * expected to succeed, listen() to fail with EINVAL and connect() to fail
+ * with EISCONN.
+ */
+TEST_F(tls, shutdown_reuse)
+{
+	struct sockaddr_in any;
+	int rc;
+
+	shutdown(self->fd, SHUT_RDWR);
+	shutdown(self->cfd, SHUT_RDWR);
+	close(self->cfd);
+
+	/* wildcard IPv4 address, port 0 */
+	any.sin_family = AF_INET;
+	any.sin_addr.s_addr = htonl(INADDR_ANY);
+	any.sin_port = 0;
+
+	rc = bind(self->fd, &any, sizeof(any));
+	EXPECT_EQ(rc, 0);
+
+	rc = listen(self->fd, 10);
+	EXPECT_EQ(rc, -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	rc = connect(self->fd, &any, sizeof(any));
+	EXPECT_EQ(rc, -1);
+	EXPECT_EQ(errno, EISCONN);
+}
+
+/*
+ * The "tls" ULP may only be attached to a connected TCP socket: attaching it
+ * to an unconnected or a listening socket must fail with ENOTCONN, attaching
+ * it after connect() must succeed, and attaching it a second time must fail
+ * with EEXIST. The test ends early when the first attempt reports ENOENT.
+ */
+TEST(non_established) {
+	struct sockaddr_in addr;
+	int sfd, ret, fd;
+	socklen_t len;
+
+	len = sizeof(addr);
+
+	addr.sin_family = AF_INET;
+	addr.sin_addr.s_addr = htonl(INADDR_ANY);
+	addr.sin_port = 0;
+
+	fd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(fd, 0);
+	sfd = socket(AF_INET, SOCK_STREAM, 0);
+	ASSERT_GE(sfd, 0);
+
+	ret = bind(sfd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+	ret = listen(sfd, 10);
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	EXPECT_EQ(ret, -1);
+	/* TLS ULP not supported */
+	if (errno == ENOENT) {
+		/* do not leak the two sockets on the early exit */
+		close(fd);
+		close(sfd);
+		return;
+	}
+	EXPECT_EQ(errno, ENOTCONN);
+
+	ret = setsockopt(sfd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, ENOTCONN);
+
+	/* learn the port the kernel picked for the listener */
+	ret = getsockname(sfd, &addr, &len);
+	ASSERT_EQ(ret, 0);
+
+	ret = connect(fd, &addr, sizeof(addr));
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	ASSERT_EQ(ret, 0);
+
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, EEXIST);
+
+	close(fd);
+	close(sfd);
+}
+
+/*
+ * Connect a client to a local listener and, if the "tls" ULP can be
+ * attached, configure TLS 1.2 AES-GCM-256 (all-zero key material) for TX on
+ * the client and RX on the accepted socket.
+ */
+TEST(keysizes) {
+	struct tls12_crypto_info_aes_gcm_256 crypto;
+	struct sockaddr_in sa;
+	socklen_t sa_len = sizeof(sa);
+	int listener, client, peer, rc;
+	bool have_tls = true;
+
+	memset(&crypto, 0, sizeof(crypto));
+	crypto.info.version = TLS_1_2_VERSION;
+	crypto.info.cipher_type = TLS_CIPHER_AES_GCM_256;
+
+	sa.sin_family = AF_INET;
+	sa.sin_addr.s_addr = htonl(INADDR_ANY);
+	sa.sin_port = 0;
+
+	client = socket(AF_INET, SOCK_STREAM, 0);
+	listener = socket(AF_INET, SOCK_STREAM, 0);
+
+	rc = bind(listener, &sa, sizeof(sa));
+	ASSERT_EQ(rc, 0);
+	rc = listen(listener, 10);
+	ASSERT_EQ(rc, 0);
+
+	/* learn the port the kernel picked for the listener */
+	rc = getsockname(listener, &sa, &sa_len);
+	ASSERT_EQ(rc, 0);
+
+	rc = connect(client, &sa, sizeof(sa));
+	ASSERT_EQ(rc, 0);
+
+	rc = setsockopt(client, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls"));
+	if (rc != 0) {
+		have_tls = false;
+		printf("Failure setting TCP_ULP, testing without tls\n");
+	} else {
+		rc = setsockopt(client, SOL_TLS, TLS_TX, &crypto,
+				sizeof(crypto));
+		EXPECT_EQ(rc, 0);
+	}
+
+	peer = accept(listener, &sa, &sa_len);
+	ASSERT_GE(peer, 0);
+
+	if (have_tls) {
+		rc = setsockopt(peer, IPPROTO_TCP, TCP_ULP, "tls",
+				sizeof("tls"));
+		EXPECT_EQ(rc, 0);
+
+		rc = setsockopt(peer, SOL_TLS, TLS_RX, &crypto,
+				sizeof(crypto));
+		EXPECT_EQ(rc, 0);
+	}
+
+	close(listener);
+	close(client);
+	close(peer);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/net/traceroute.sh b/tools/testing/selftests/net/traceroute.sh
new file mode 100755
index 000000000..de9ca97ab
--- /dev/null
+++ b/tools/testing/selftests/net/traceroute.sh
@@ -0,0 +1,322 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run traceroute/traceroute6 tests
+#
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+
+################################################################################
+#
+# Record and print the outcome of one test.
+# Arguments: $1 - exit status that was observed
+#            $2 - exit status that was expected
+#            $3 - description printed in the result line
+# Globals:   nsuccess / nfail are incremented; ret is set to 1 on failure;
+#            PAUSE_ON_FAIL is read.
+# With PAUSE_ON_FAIL=yes a failure waits for input; 'q' exits the script.
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+	local a
+
+	if [ "${rc}" -eq "${expected}" ]; then
+		printf "TEST: %-60s [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "TEST: %-60s [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			# -r: take the reply literally, no backslash processing
+			read -r a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+run_cmd()
+{
+ local ns
+ local cmd
+ local out
+ local rc
+
+ ns="$1"
+ shift
+ cmd="$*"
+
+ if [ "$VERBOSE" = "1" ]; then
+ printf " COMMAND: $cmd\n"
+ fi
+
+ out=$(eval ip netns exec ${ns} ${cmd} 2>&1)
+ rc=$?
+ if [ "$VERBOSE" = "1" -a -n "$out" ]; then
+ echo " $out"
+ fi
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+################################################################################
+# create namespaces and interconnects
+
+create_ns()
+{
+ local ns=$1
+ local addr=$2
+ local addr6=$3
+
+ [ -z "${addr}" ] && addr="-"
+ [ -z "${addr6}" ] && addr6="-"
+
+ ip netns add ${ns}
+
+ ip netns exec ${ns} ip link set lo up
+ if [ "${addr}" != "-" ]; then
+ ip netns exec ${ns} ip addr add dev lo ${addr}
+ fi
+ if [ "${addr6}" != "-" ]; then
+ ip netns exec ${ns} ip -6 addr add dev lo ${addr6}
+ fi
+
+ ip netns exec ${ns} ip ro add unreachable default metric 8192
+ ip netns exec ${ns} ip -6 ro add unreachable default metric 8192
+
+ ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0
+}
+
+# Link two namespaces with a veth pair, bring both ends up and assign the
+# requested addresses. An address given as "-" is skipped.
+# Arguments: $1..$4 - first namespace, its device name, IPv4 and IPv6 address
+#            $5..$8 - the same four values for the second namespace
+connect_ns()
+{
+	local near_ns=$1
+	local near_dev=$2
+	local near_v4=$3
+	local near_v6=$4
+	local far_ns=$5
+	local far_dev=$6
+	local far_v4=$7
+	local far_v6=$8
+
+	# the peer is created as "tmp" and renamed while moving it across
+	ip netns exec ${near_ns} ip li add ${near_dev} type veth peer name tmp
+	ip netns exec ${near_ns} ip li set ${near_dev} up
+	ip netns exec ${near_ns} ip li set tmp netns ${far_ns} name ${far_dev}
+	ip netns exec ${far_ns} ip li set ${far_dev} up
+
+	[ "${near_v4}" = "-" ] ||
+		ip netns exec ${near_ns} ip addr add dev ${near_dev} ${near_v4}
+
+	[ "${far_v4}" = "-" ] ||
+		ip netns exec ${far_ns} ip addr add dev ${far_dev} ${far_v4}
+
+	[ "${near_v6}" = "-" ] ||
+		ip netns exec ${near_ns} ip addr add dev ${near_dev} ${near_v6}
+
+	[ "${far_v6}" = "-" ] ||
+		ip netns exec ${far_ns} ip addr add dev ${far_dev} ${far_v6}
+}
+
+################################################################################
+# traceroute6 test
+#
+# Verify that in this scenario
+#
+#   ------------------------ N2
+#    |                    |
+#  ------              ------  N3  ----
+#  | R1 |              | R2 |------|H2|
+#  ------              ------      ----
+#    |                    |
+#   ------------------------ N1
+#             |
+#            ----
+#            |H1|
+#            ----
+#
+# where H1's default route goes through R1 and R1's default route goes
+# through R2 over N2, traceroute6 from H1 to H2 reports R2's address
+# on N2 and not N1.
+#
+# Addresses are assigned as follows:
+#
+# N1: 2000:101::/64
+# N2: 2000:102::/64
+# N3: 2000:103::/64
+#
+# R1's host part of address: 1
+# R2's host part of address: 2
+# H1's host part of address: 3
+# H2's host part of address: 4
+#
+# For example:
+# the IPv6 address of R1's interface on N2 is 2000:102::1/64
+
+cleanup_traceroute6()
+{
+ local ns
+
+ for ns in host-1 host-2 router-1 router-2
+ do
+ ip netns del ${ns} 2>/dev/null
+ done
+}
+
+setup_traceroute6()
+{
+ brdev=br0
+
+ # start clean
+ cleanup_traceroute6
+
+ set -e
+ create_ns host-1
+ create_ns host-2
+ create_ns router-1
+ create_ns router-2
+
+ # Setup N3
+ connect_ns router-2 eth3 - 2000:103::2/64 host-2 eth3 - 2000:103::4/64
+ ip netns exec host-2 ip route add default via 2000:103::2
+
+ # Setup N2
+ connect_ns router-1 eth2 - 2000:102::1/64 router-2 eth2 - 2000:102::2/64
+ ip netns exec router-1 ip route add default via 2000:102::2
+
+ # Setup N1. host-1 and router-2 connect to a bridge in router-1.
+ ip netns exec router-1 ip link add name ${brdev} type bridge
+ ip netns exec router-1 ip link set ${brdev} up
+ ip netns exec router-1 ip addr add 2000:101::1/64 dev ${brdev}
+
+ connect_ns host-1 eth0 - 2000:101::3/64 router-1 eth0 - -
+ ip netns exec router-1 ip link set dev eth0 master ${brdev}
+ ip netns exec host-1 ip route add default via 2000:101::1
+
+ connect_ns router-2 eth1 - 2000:101::2/64 router-1 eth1 - -
+ ip netns exec router-1 ip link set dev eth1 master ${brdev}
+
+ # Prime the network
+ ip netns exec host-1 ping6 -c5 2000:103::4 >/dev/null 2>&1
+
+ set +e
+}
+
+# IPv6 test: skipped when no executable traceroute6 is found; otherwise
+# builds the topology, runs traceroute6 from host-1 to host-2, logs whether
+# 2000:102::2 shows up in the output, and tears the topology down.
+run_traceroute6()
+{
+	local tool
+	local rc
+
+	tool="$(command -v traceroute6)"
+	if [ ! -x "${tool}" ]; then
+		echo "SKIP: Could not run IPV6 test without traceroute6"
+		return
+	fi
+
+	setup_traceroute6
+
+	# traceroute6 host-2 from host-1 (expects 2000:102::2)
+	run_cmd host-1 "traceroute6 2000:103::4 | grep -q 2000:102::2"
+	rc=$?
+	log_test ${rc} 0 "IPV6 traceroute"
+
+	cleanup_traceroute6
+}
+
+################################################################################
+# traceroute test
+#
+# Verify that traceroute from H1 to H2 shows 1.0.1.1 in this scenario
+#
+#                    1.0.3.1/24
+# ---- 1.0.1.3/24    1.0.1.1/24 ---- 1.0.2.1/24    1.0.2.4/24 ----
+# |H1|--------------------------|R1|--------------------------|H2|
+# ----            N1            ----            N2            ----
+#
+# where net.ipv4.icmp_errors_use_inbound_ifaddr is set on R1 and
+# 1.0.3.1/24 and 1.0.1.1/24 are respectively R1's primary and secondary
+# address on N1.
+#
+
+cleanup_traceroute()
+{
+ local ns
+
+ for ns in host-1 host-2 router
+ do
+ ip netns del ${ns} 2>/dev/null
+ done
+}
+
+setup_traceroute()
+{
+ # start clean
+ cleanup_traceroute
+
+ set -e
+ create_ns host-1
+ create_ns host-2
+ create_ns router
+
+ connect_ns host-1 eth0 1.0.1.3/24 - \
+ router eth1 1.0.3.1/24 -
+ ip netns exec host-1 ip route add default via 1.0.1.1
+
+ ip netns exec router ip addr add 1.0.1.1/24 dev eth1
+ ip netns exec router sysctl -qw \
+ net.ipv4.icmp_errors_use_inbound_ifaddr=1
+
+ connect_ns host-2 eth0 1.0.2.4/24 - \
+ router eth2 1.0.2.1/24 -
+ ip netns exec host-2 ip route add default via 1.0.2.1
+
+ # Prime the network
+ ip netns exec host-1 ping -c5 1.0.2.4 >/dev/null 2>&1
+
+ set +e
+}
+
+# IPv4 test: skipped when no executable traceroute is found; otherwise builds
+# the topology, runs traceroute from host-1 to host-2, logs whether 1.0.1.1
+# shows up in the output, and tears the topology down.
+run_traceroute()
+{
+	local tool
+	local rc
+
+	tool="$(command -v traceroute)"
+	if [ ! -x "${tool}" ]; then
+		echo "SKIP: Could not run IPV4 test without traceroute"
+		return
+	fi
+
+	setup_traceroute
+
+	# traceroute host-2 from host-1 (expects 1.0.1.1). Takes a while.
+	run_cmd host-1 "traceroute 1.0.2.4 | grep -q 1.0.1.1"
+	rc=$?
+	log_test ${rc} 0 "IPV4 traceroute"
+
+	cleanup_traceroute
+}
+
+################################################################################
+# Run tests
+
+# Run every test case: IPv6 first, then IPv4.
+run_tests()
+{
+	local testcase
+
+	for testcase in run_traceroute6 run_traceroute; do
+		${testcase}
+	done
+}
+
+################################################################################
+# main
+
+# Counters updated by log_test.
+declare -i nfail=0
+declare -i nsuccess=0
+# Overall exit status; log_test sets it to 1 when a test fails.
+ret=0
+
+# -p: pause after a failing test, -v: verbose output
+while getopts :pv o
+do
+	case $o in
+	p) PAUSE_ON_FAIL=yes;;
+	v) VERBOSE=$(($VERBOSE + 1));;
+	*) exit 1;;
+	esac
+done
+
+run_tests
+
+printf "\nTests passed: %3d\n" "${nsuccess}"
+printf "Tests failed: %3d\n" "${nfail}"
+
+# report failures to the caller instead of the status of the last printf
+exit ${ret}
diff --git a/tools/testing/selftests/net/txring_overwrite.c b/tools/testing/selftests/net/txring_overwrite.c
new file mode 100644
index 000000000..7d9ea0394
--- /dev/null
+++ b/tools/testing/selftests/net/txring_overwrite.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Verify that consecutive sends over packet tx_ring are mirrored
+ * with their original content intact.
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <error.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/filter.h>
+#include <linux/if_packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/* Offset from the start of a ring slot at which send_pkt() builds the frame. */
+const int eth_off = TPACKET_HDRLEN - sizeof(struct sockaddr_ll);
+/* Length in bytes of every frame that send_pkt() builds and queues. */
+const int cfg_frame_size = 1000;
+
+/*
+ * Fill @buffer (@blen bytes) with an Ethernet + IPv4 + UDP frame whose UDP
+ * payload consists solely of @payload_char. The frame goes from
+ * INADDR_LOOPBACK to INADDR_LOOPBACK + 1, UDP port 8001 to port 8000; both
+ * checksums and the Ethernet addresses are left at zero.
+ */
+static void build_packet(void *buffer, size_t blen, char payload_char)
+{
+	char *frame = buffer;
+	struct ethhdr *eth = (struct ethhdr *)frame;
+	struct iphdr *iph = (struct iphdr *)(frame + sizeof(*eth));
+	struct udphdr *udph = (struct udphdr *)((char *)iph + sizeof(*iph));
+	char *payload = (char *)udph + sizeof(*udph);
+	/* lengths of what follows the Ethernet and the IPv4 header */
+	const size_t ip_len = blen - sizeof(*eth);
+	const size_t udp_len = ip_len - sizeof(*iph);
+
+	memset(frame, 0, blen);
+
+	eth->h_proto = htons(ETH_P_IP);
+
+	iph->ttl = 8;
+	iph->ihl = 5;
+	iph->version = 4;
+	iph->saddr = htonl(INADDR_LOOPBACK);
+	iph->daddr = htonl(INADDR_LOOPBACK + 1);
+	iph->protocol = IPPROTO_UDP;
+	iph->tot_len = htons(ip_len);
+	iph->check = 0;
+
+	udph->dest = htons(8000);
+	udph->source = htons(8001);
+	udph->len = htons(udp_len);
+	udph->check = 0;
+
+	memset(payload, payload_char, udp_len - sizeof(*udph));
+}
+
+/*
+ * Open the raw packet socket on which the test reads frames back
+ * (protocol ETH_P_IP). Exits the program if the socket cannot be created.
+ */
+static int setup_rx(void)
+{
+	int sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_IP));
+
+	if (sock == -1)
+		error(1, errno, "socket r");
+
+	return sock;
+}
+
+/*
+ * Open a packet socket bound to "lo", attach a PACKET_TX_RING made of one
+ * block holding one page-sized frame, and map it. The mapping is stored in
+ * *@ring and the socket is returned. Any failure exits the program.
+ */
+static int setup_tx(char **ring)
+{
+	const int page = getpagesize();
+	struct tpacket_req req = {
+		.tp_block_size = page,
+		.tp_block_nr = 1,
+		.tp_frame_size = page,
+		.tp_frame_nr = 1,
+	};
+	struct sockaddr_ll laddr = {};
+	int sock;
+
+	sock = socket(PF_PACKET, SOCK_RAW, 0);
+	if (sock == -1)
+		error(1, errno, "socket t");
+
+	laddr.sll_family = AF_PACKET;
+	laddr.sll_protocol = htons(0);
+	laddr.sll_ifindex = if_nametoindex("lo");
+	if (!laddr.sll_ifindex)
+		error(1, errno, "if_nametoindex");
+
+	if (bind(sock, (void *)&laddr, sizeof(laddr)))
+		error(1, errno, "bind fdt");
+
+	if (setsockopt(sock, SOL_PACKET, PACKET_TX_RING,
+		       (void *)&req, sizeof(req)))
+		error(1, errno, "setsockopt ring");
+
+	*ring = mmap(0, req.tp_block_size * req.tp_block_nr,
+		     PROT_READ | PROT_WRITE, MAP_SHARED, sock, 0);
+	if (*ring == MAP_FAILED)
+		error(1, errno, "mmap");
+
+	return sock;
+}
+
+/*
+ * Queue one frame filled with @payload_char in ring slot @slot and kick
+ * transmission on @fdt. Polls in 1 ms steps until the slot's status is
+ * TP_STATUS_AVAILABLE before touching it. Exits the program if the kick
+ * fails.
+ */
+static void send_pkt(int fdt, void *slot, char payload_char)
+{
+	struct tpacket_hdr *hdr = slot;
+
+	for (;;) {
+		if (hdr->tp_status == TP_STATUS_AVAILABLE)
+			break;
+		usleep(1000);
+	}
+
+	build_packet(slot + eth_off, cfg_frame_size, payload_char);
+
+	hdr->tp_len = cfg_frame_size;
+	hdr->tp_status = TP_STATUS_SEND_REQUEST;
+
+	/* an empty sendto() makes the kernel process the ring */
+	if (sendto(fdt, NULL, 0, 0, NULL, 0) == -1)
+		error(1, errno, "kick tx");
+}
+
+/*
+ * Read the first 100 bytes of the next frame from @fdr and check that the
+ * byte at offset 60 equals @payload_char. Returns 0 on a match and 1 on a
+ * mismatch; exits the program if the read does not return exactly 100 bytes.
+ */
+static int read_verify_pkt(int fdr, char payload_char)
+{
+	char frame[100];
+	int got = read(fdr, frame, sizeof(frame));
+
+	if (got != sizeof(frame))
+		error(1, errno, "read");
+
+	if (frame[60] == payload_char) {
+		printf("read: %c (0x%x)\n", frame[60], frame[60]);
+		return 0;
+	}
+
+	printf("wrong pattern: 0x%x != 0x%x\n", frame[60], payload_char);
+	return 1;
+}
+
+/*
+ * Queue two frames with different payload bytes ('a', then 'b') through the
+ * same TX ring slot and verify that both are read back with their own
+ * payload. Returns 0 when both match, non-zero otherwise.
+ */
+int main(int argc, char **argv)
+{
+	const char payload_patterns[] = "ab";
+	int rx_sock, tx_sock;
+	int failed = 0;
+	char *ring;
+	int i;
+
+	rx_sock = setup_rx();
+	tx_sock = setup_tx(&ring);
+
+	/* both sends reuse the single slot at the start of the ring */
+	for (i = 0; i < 2; i++)
+		send_pkt(tx_sock, ring, payload_patterns[i]);
+
+	for (i = 0; i < 2; i++)
+		failed |= read_verify_pkt(rx_sock, payload_patterns[i]);
+
+	if (close(tx_sock))
+		error(1, errno, "close t");
+	if (close(rx_sock))
+		error(1, errno, "close r");
+
+	return failed;
+}
diff --git a/tools/testing/selftests/net/txtimestamp.c b/tools/testing/selftests/net/txtimestamp.c
new file mode 100644
index 000000000..fabb1d555
--- /dev/null
+++ b/tools/testing/selftests/net/txtimestamp.c
@@ -0,0 +1,922 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014 Google Inc.
+ * Author: willemb@google.com (Willem de Bruijn)
+ *
+ * Test software tx timestamping, including
+ *
+ * - SCHED, SND and ACK timestamps
+ * - RAW, UDP and TCP
+ * - IPv4 and IPv6
+ * - various packet sizes (to test GSO and TSO)
+ *
+ * Consult the command line arguments for help on running
+ * the various testcases.
+ *
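+ * Unless -L is passed (the test then listens on the destination port
+ * itself), this test requires a dummy TCP server, or a UDP one with -u.
+ * A simple `nc6 [-u] -l -p $DESTPORT` will do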
+ */
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <asm/types.h>
+#include <error.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/errqueue.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/ipv6.h>
+#include <linux/net_tstamp.h>
+#include <netdb.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <poll.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/select.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#define NSEC_PER_USEC 1000L
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000LL
+
+/* command line parameters */
+static int cfg_proto = SOCK_STREAM;
+static int cfg_ipproto = IPPROTO_TCP;
+static int cfg_num_pkts = 4;
+static int do_ipv4 = 1;
+static int do_ipv6 = 1;
+static int cfg_payload_len = 10;
+static int cfg_poll_timeout = 100;
+static int cfg_delay_snd;
+static int cfg_delay_ack;
+static int cfg_delay_tolerance_usec = 500;
+static bool cfg_show_payload;
+static bool cfg_do_pktinfo;
+static bool cfg_busy_poll;
+static int cfg_sleep_usec = 50 * 1000;
+static bool cfg_loop_nodata;
+static bool cfg_use_cmsg;
+static bool cfg_use_pf_packet;
+static bool cfg_use_epoll;
+static bool cfg_epollet;
+static bool cfg_do_listen;
+static uint16_t dest_port = 9000;
+static bool cfg_print_nsec;
+
+static struct sockaddr_in daddr;
+static struct sockaddr_in6 daddr6;
+static struct timespec ts_usr;
+
+static int saved_tskey = -1;
+static int saved_tskey_type = -1;
+
+/*
+ * Running statistics for one timestamp type. add_timing_event() feeds it
+ * with nanosecond deltas: min/max track the extremes, total and count
+ * are used by print_timing_event() to compute the average.
+ */
+struct timing_event {
+ int64_t min;
+ int64_t max;
+ int64_t total;
+ int count;
+};
+
+static struct timing_event usr_enq;
+static struct timing_event usr_snd;
+static struct timing_event usr_ack;
+
+static bool test_failed;
+
+/* Convert a timespec to a signed 64-bit number of nanoseconds. */
+static int64_t timespec_to_ns64(struct timespec *ts)
+{
+	int64_t ns = ts->tv_sec * NSEC_PER_SEC;
+
+	ns += ts->tv_nsec;
+	return ns;
+}
+
+/*
+ * Convert a timespec to a signed 64-bit number of microseconds.
+ * USEC_PER_SEC is a plain long, so tv_sec is widened first: otherwise the
+ * multiplication overflows on targets where long is 32 bits.
+ */
+static int64_t timespec_to_us64(struct timespec *ts)
+{
+	return (int64_t)ts->tv_sec * USEC_PER_SEC +
+	       ts->tv_nsec / NSEC_PER_USEC;
+}
+
+/*
+ * Reset all statistics in te. min starts at INT64_MAX so that any
+ * recorded delta lowers it; max, total and count start at zero.
+ */
+static void init_timing_event(struct timing_event *te)
+{
+	*te = (struct timing_event) {
+		.min = INT64_MAX,
+	};
+}
+
+/* Fold the nanosecond delta t_end - t_start into the statistics in te. */
+static void add_timing_event(struct timing_event *te,
+			     struct timespec *t_start, struct timespec *t_end)
+{
+	int64_t start_ns = timespec_to_ns64(t_start);
+	int64_t delta_ns = timespec_to_ns64(t_end) - start_ns;
+
+	te->total += delta_ns;
+	te->count++;
+
+	if (delta_ns > te->max)
+		te->max = delta_ns;
+	if (delta_ns < te->min)
+		te->min = delta_ns;
+}
+
+/*
+ * Check that tskey advanced by the expected step since the previous
+ * timestamp: cfg_payload_len for SOCK_STREAM, 1 otherwise. A mismatch is
+ * reported on stderr and marks the test as failed.
+ */
+static void validate_key(int tskey, int tstype)
+{
+	int expected;
+
+	/* compare key for each subsequent request
+	 * must only test for one type, the first one requested
+	 */
+	if (saved_tskey == -1)
+		saved_tskey_type = tstype;
+	if (saved_tskey_type != tstype)
+		return;
+
+	expected = saved_tskey;
+	expected += cfg_proto == SOCK_STREAM ? cfg_payload_len : 1;
+
+	if (tskey != expected) {
+		fprintf(stderr, "ERROR: key %d, expected %d\n",
+			tskey, expected);
+		test_failed = true;
+	}
+
+	saved_tskey = tskey;
+}
+
+/*
+ * Check that cur lies between min_delay and
+ * min_delay + cfg_delay_tolerance_usec microseconds after ts_usr.
+ * A violation is reported on stderr and marks the test as failed.
+ */
+static void validate_timestamp(struct timespec *cur, int min_delay)
+{
+	int64_t cur64, start64;
+	int max_delay;
+
+	cur64 = timespec_to_us64(cur);
+	start64 = timespec_to_us64(&ts_usr);
+	max_delay = min_delay + cfg_delay_tolerance_usec;
+
+	if (cur64 < start64 + min_delay || cur64 > start64 + max_delay) {
+		/* the delta is signed and negative when cur precedes ts_usr */
+		fprintf(stderr,
+			"ERROR: %" PRId64 " us expected between %d and %d\n",
+			cur64 - start64, min_delay, max_delay);
+		test_failed = true;
+	}
+}
+
+/*
+ * Print a nanosecond delta to stderr, in ns when cfg_print_nsec is set
+ * and in us otherwise. The value is signed, so PRId64 is used.
+ */
+static void __print_ts_delta_formatted(int64_t ts_delta)
+{
+	if (cfg_print_nsec)
+		fprintf(stderr, "%" PRId64 " ns", ts_delta);
+	else
+		fprintf(stderr, "%" PRId64 " us",
+			(int64_t)(ts_delta / NSEC_PER_USEC));
+}
+
+/*
+ * Print one timestamp line to stderr: name, seconds, sub-second part
+ * (ns or us, per cfg_print_nsec), key and payload length. For anything
+ * other than ts_usr itself the offset from ts_usr is appended.
+ */
+static void __print_timestamp(const char *name, struct timespec *cur,
+			      uint32_t key, int payload_len)
+{
+	/* an all-zero timestamp is not printed */
+	if (cur->tv_sec == 0 && cur->tv_nsec == 0)
+		return;
+
+	if (!cfg_print_nsec)
+		fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)",
+			name, cur->tv_sec, cur->tv_nsec / NSEC_PER_USEC,
+			key, payload_len);
+	else
+		fprintf(stderr, " %s: %lu s %lu ns (seq=%u, len=%u)",
+			name, cur->tv_sec, cur->tv_nsec,
+			key, payload_len);
+
+	if (cur != &ts_usr) {
+		int64_t since_usr;
+
+		since_usr = timespec_to_ns64(cur) - timespec_to_ns64(&ts_usr);
+		fputs(" (USR +", stderr);
+		__print_ts_delta_formatted(since_usr);
+		fputs(")", stderr);
+	}
+
+	fputs("\n", stderr);
+}
+
+/* Store the current CLOCK_REALTIME time in ts_usr and print it. */
+static void print_timestamp_usr(void)
+{
+	const int err = clock_gettime(CLOCK_REALTIME, &ts_usr);
+
+	if (err)
+		error(1, errno, "clock_gettime");
+
+	__print_timestamp(" USR", &ts_usr, 0, 0);
+}
+
+/*
+ * Validate and print one reported tx timestamp.
+ *
+ * The key is checked first, then tss->ts[0] is checked against the
+ * minimum delay expected for tstype, added to the statistics of that
+ * type and printed. An unknown tstype is fatal.
+ */
+static void print_timestamp(struct scm_timestamping *tss, int tstype,
+			    int tskey, int payload_len)
+{
+	struct timing_event *stats;
+	const char *tsname;
+	int min_delay;
+
+	validate_key(tskey, tstype);
+
+	switch (tstype) {
+	case SCM_TSTAMP_SCHED:
+		tsname = " ENQ";
+		min_delay = 0;
+		stats = &usr_enq;
+		break;
+	case SCM_TSTAMP_SND:
+		tsname = " SND";
+		min_delay = cfg_delay_snd;
+		stats = &usr_snd;
+		break;
+	case SCM_TSTAMP_ACK:
+		tsname = " ACK";
+		min_delay = cfg_delay_ack;
+		stats = &usr_ack;
+		break;
+	default:
+		error(1, 0, "unknown timestamp type: %u",
+		      tstype);
+	}
+
+	validate_timestamp(&tss->ts[0], min_delay);
+	add_timing_event(stats, &ts_usr, &tss->ts[0]);
+	__print_timestamp(tsname, &tss->ts[0], tskey, payload_len);
+}
+
+/*
+ * Print count, average, minimum and maximum of te to stderr.
+ * Nothing is printed when no event was recorded.
+ */
+static void print_timing_event(char *name, struct timing_event *te)
+{
+	int64_t avg;
+
+	if (te->count == 0)
+		return;
+
+	avg = te->total / te->count;
+
+	fprintf(stderr, " %s: count=%d", name, te->count);
+	fputs(", avg=", stderr);
+	__print_ts_delta_formatted(avg);
+	fputs(", min=", stderr);
+	__print_ts_delta_formatted(te->min);
+	fputs(", max=", stderr);
+	__print_ts_delta_formatted(te->max);
+	fputs("\n", stderr);
+}
+
+/* TODO: convert to check_and_print payload once API is stable */
+/* Hex-dump at most the first 70 bytes of data to stderr. */
+static void print_payload(char *data, int len)
+{
+	const int shown = len > 70 ? 70 : len;
+	int pos = 0;
+
+	if (len == 0)
+		return;
+
+	fputs("payload: ", stderr);
+	while (pos < shown)
+		fprintf(stderr, "%02hhx ", data[pos++]);
+	fputs("\n", stderr);
+}
+
+/*
+ * Print interface index and source/destination address of a pktinfo
+ * cmsg. A NULL address pointer is printed as "unknown".
+ */
+static void print_pktinfo(int family, int ifindex, void *saddr, void *daddr)
+{
+	char sa[INET6_ADDRSTRLEN], da[INET6_ADDRSTRLEN];
+	const char *src = "unknown";
+	const char *dst = "unknown";
+
+	if (saddr)
+		src = inet_ntop(family, saddr, sa, sizeof(sa));
+	if (daddr)
+		dst = inet_ntop(family, daddr, da, sizeof(da));
+
+	fprintf(stderr, " pktinfo: ifindex=%u src=%s dst=%s\n",
+		ifindex, src, dst);
+}
+
+/*
+ * Wait up to cfg_poll_timeout ms for exactly one event on epfd.
+ * Any other outcome, a timeout included, is fatal.
+ */
+static void __epoll(int epfd)
+{
+	struct epoll_event ev;
+
+	memset(&ev, 0, sizeof(ev));
+	if (epoll_wait(epfd, &ev, 1, cfg_poll_timeout) != 1)
+		error(1, errno, "epoll_wait");
+}
+
+/*
+ * Wait up to cfg_poll_timeout ms for fd to be reported by poll(). No
+ * events are requested in .events. Any result other than one ready
+ * descriptor is fatal.
+ */
+static void __poll(int fd)
+{
+	struct pollfd pfd;
+
+	memset(&pfd, 0, sizeof(pfd));
+	pfd.fd = fd;
+
+	if (poll(&pfd, 1, cfg_poll_timeout) != 1)
+		error(1, errno, "poll");
+}
+
+/*
+ * Walk the control messages of one error-queue message and print every
+ * timestamp found in it.
+ *
+ * A timestamp is complete once both an SCM_TIMESTAMPING cmsg (tss) and a
+ * matching extended-error cmsg (serr) have been seen; the pair is then
+ * printed and reset, so one message may carry several timestamps.
+ * PKTINFO cmsgs are printed as they are encountered.
+ *
+ * msg:         message filled in by recvmsg(MSG_ERRQUEUE)
+ * payload_len: length passed on to print_timestamp()
+ */
+static void __recv_errmsg_cmsg(struct msghdr *msg, int payload_len)
+{
+ struct sock_extended_err *serr = NULL;
+ struct scm_timestamping *tss = NULL;
+ struct cmsghdr *cm;
+ int batch = 0;
+
+ for (cm = CMSG_FIRSTHDR(msg);
+ cm && cm->cmsg_len;
+ cm = CMSG_NXTHDR(msg, cm)) {
+ if (cm->cmsg_level == SOL_SOCKET &&
+ cm->cmsg_type == SCM_TIMESTAMPING) {
+ tss = (void *) CMSG_DATA(cm);
+ } else if ((cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_RECVERR) ||
+ (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_RECVERR) ||
+ (cm->cmsg_level == SOL_PACKET &&
+ cm->cmsg_type == PACKET_TX_TIMESTAMP)) {
+ serr = (void *) CMSG_DATA(cm);
+ /* only ENOMSG errors originating from timestamping are accepted */
+ if (serr->ee_errno != ENOMSG ||
+ serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) {
+ fprintf(stderr, "unknown ip error %d %d\n",
+ serr->ee_errno,
+ serr->ee_origin);
+ serr = NULL;
+ }
+ } else if (cm->cmsg_level == SOL_IP &&
+ cm->cmsg_type == IP_PKTINFO) {
+ struct in_pktinfo *info = (void *) CMSG_DATA(cm);
+ print_pktinfo(AF_INET, info->ipi_ifindex,
+ &info->ipi_spec_dst, &info->ipi_addr);
+ } else if (cm->cmsg_level == SOL_IPV6 &&
+ cm->cmsg_type == IPV6_PKTINFO) {
+ struct in6_pktinfo *info6 = (void *) CMSG_DATA(cm);
+ print_pktinfo(AF_INET6, info6->ipi6_ifindex,
+ NULL, &info6->ipi6_addr);
+ } else
+ fprintf(stderr, "unknown cmsg %d,%d\n",
+ cm->cmsg_level, cm->cmsg_type);
+
+ /* ee_info is passed as the timestamp type, ee_data as the key */
+ if (serr && tss) {
+ print_timestamp(tss, serr->ee_info, serr->ee_data,
+ payload_len);
+ serr = NULL;
+ tss = NULL;
+ batch++;
+ }
+ }
+
+ if (batch > 1)
+ fprintf(stderr, "batched %d timestamps\n", batch);
+}
+
+/*
+ * Read one message from the error queue of fd and print the timestamps
+ * it carries, plus the looped payload when cfg_show_payload is set.
+ *
+ * Returns 1 when the queue is empty (recvmsg failed with EAGAIN) and 0
+ * when a message was processed. Any other recvmsg error is fatal.
+ */
+static int recv_errmsg(int fd)
+{
+	static char ctrl[1024 /* overprovision*/];
+	struct msghdr msg;
+	struct iovec entry;
+	char *data;
+	int ret = 0;
+
+	data = malloc(cfg_payload_len);
+	if (!data)
+		error(1, 0, "malloc");
+
+	memset(&msg, 0, sizeof(msg));
+	memset(&entry, 0, sizeof(entry));
+	memset(ctrl, 0, sizeof(ctrl));
+
+	entry.iov_base = data;
+	entry.iov_len = cfg_payload_len;
+	msg.msg_iov = &entry;
+	msg.msg_iovlen = 1;
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = ctrl;
+	msg.msg_controllen = sizeof(ctrl);
+
+	ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+	if (ret == -1 && errno != EAGAIN)
+		error(1, errno, "recvmsg");
+
+	if (ret >= 0) {
+		__recv_errmsg_cmsg(&msg, ret);
+		/* only the first ret bytes of data were written by recvmsg;
+		 * the rest of the malloc'ed buffer is uninitialized
+		 */
+		if (cfg_show_payload)
+			print_payload(data, ret);
+	}
+
+	free(data);
+	return ret == -1;
+}
+
+/*
+ * Sum num_words 16-bit words from start on top of the initial value sum,
+ * fold the carries back into the low 16 bits and return the complement.
+ */
+static uint16_t get_ip_csum(const uint16_t *start, int num_words,
+			    unsigned long sum)
+{
+	while (num_words-- > 0)
+		sum += *start++;
+
+	/* fold until nothing is left above bit 15 */
+	for (; sum > 0xFFFF; sum = (sum & 0xFFFF) + (sum >> 16))
+		;
+
+	return ~sum;
+}
+
+/*
+ * Compute the UDP checksum for the datagram at udph. The source and
+ * destination addresses, alen bytes each, are read from the 2 * alen
+ * bytes that directly precede the UDP header.
+ */
+static uint16_t get_udp_csum(const struct udphdr *udph, int alen)
+{
+	const int addrs_len = alen * 2;
+	const void *csum_start;
+	unsigned long pseudo_sum, csum_len;
+
+	/* pseudo header part not present in the buffer: protocol + length */
+	pseudo_sum = htons(IPPROTO_UDP);
+	pseudo_sum += udph->len;
+
+	/* checksum ip(v6) addresses + udp header + payload */
+	csum_start = (const void *)udph - addrs_len;
+	csum_len = ntohs(udph->len) + addrs_len;
+
+	return get_ip_csum(csum_start, csum_len >> 1, pseudo_sum);
+}
+
+/*
+ * Write an IPv4 header for a UDP packet to daddr at p.
+ * Returns the size of the header written.
+ */
+static int fill_header_ipv4(void *p)
+{
+	struct iphdr *iph = p;
+
+	/* kernel writes saddr, csum, len */
+	*iph = (struct iphdr) {
+		.version = 4,
+		.ihl = 5,
+		.ttl = 2,
+		.protocol = IPPROTO_UDP,
+		.saddr = daddr.sin_addr.s_addr, /* set for udp csum calc */
+		.daddr = daddr.sin_addr.s_addr,
+	};
+
+	return sizeof(*iph);
+}
+
+/*
+ * Write an IPv6 header for a UDP packet of cfg_payload_len bytes at p,
+ * with daddr6 as both source and destination address.
+ * Returns the size of the header written.
+ */
+static int fill_header_ipv6(void *p)
+{
+	struct ipv6hdr *ip6h = p;
+	const uint16_t plen = sizeof(struct udphdr) + cfg_payload_len;
+
+	memset(ip6h, 0, sizeof(*ip6h));
+
+	/* kernel does not write saddr in case of ipv6 */
+	ip6h->saddr = daddr6.sin6_addr;
+	ip6h->daddr = daddr6.sin6_addr;
+
+	ip6h->version = 6;
+	ip6h->nexthdr = IPPROTO_UDP;
+	ip6h->hop_limit = 64;
+	ip6h->payload_len = htons(plen);
+
+	return sizeof(*ip6h);
+}
+
+/*
+ * Write a UDP header at p for a payload of cfg_payload_len bytes and
+ * compute its checksum. The IP addresses must already be in place
+ * directly in front of p, because get_udp_csum() reads them from there.
+ *
+ * is_ipv4 selects the address size used for the checksum.
+ */
+static void fill_header_udp(void *p, bool is_ipv4)
+{
+	struct udphdr *udph = p;
+
+	/* host to network order: htons(), not ntohs() */
+	udph->source = htons(dest_port + 1);	/* spoof */
+	udph->dest = htons(dest_port);
+	udph->len = htons(sizeof(*udph) + cfg_payload_len);
+	udph->check = 0;
+
+	udph->check = get_udp_csum(udph, is_ipv4 ? sizeof(struct in_addr) :
+						    sizeof(struct in6_addr));
+}
+
+/*
+ * Run one timestamping test on a fresh socket.
+ *
+ * Opens a socket according to the cfg_* settings, enables software tx
+ * timestamping, sends cfg_num_pkts packets and drains the error queue
+ * after each one. Per-type timing statistics are printed at the end.
+ *
+ * family:     PF_INET or PF_INET6
+ * report_opt: SOF_TIMESTAMPING_TX_* flags selecting the events to
+ *             report, set on the socket or, with cfg_use_cmsg, passed
+ *             with every sendmsg()
+ *
+ * Any system call failure is fatal.
+ */
+static void do_test(int family, unsigned int report_opt)
+{
+	char control[CMSG_SPACE(sizeof(uint32_t))];
+	struct sockaddr_ll laddr;
+	unsigned int sock_opt;
+	struct cmsghdr *cmsg;
+	struct msghdr msg;
+	struct iovec iov;
+	char *buf;
+	int fd, i, val = 1, total_len, epfd = -1;
+
+	init_timing_event(&usr_enq);
+	init_timing_event(&usr_snd);
+	init_timing_event(&usr_ack);
+
+	total_len = cfg_payload_len;
+	if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) {
+		total_len += sizeof(struct udphdr);
+		if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) {
+			if (family == PF_INET)
+				total_len += sizeof(struct iphdr);
+			else
+				total_len += sizeof(struct ipv6hdr);
+		}
+		/* special case, only rawv6_sendmsg:
+		 * pass proto in sin6_port if not connected
+		 * also see ANK comment in net/ipv4/raw.c
+		 */
+		daddr6.sin6_port = htons(cfg_ipproto);
+	}
+
+	buf = malloc(total_len);
+	if (!buf)
+		error(1, 0, "malloc");
+
+	fd = socket(cfg_use_pf_packet ? PF_PACKET : family,
+		    cfg_proto, cfg_ipproto);
+	if (fd < 0)
+		error(1, errno, "socket");
+
+	if (cfg_use_epoll) {
+		struct epoll_event ev;
+
+		memset(&ev, 0, sizeof(ev));
+		ev.data.fd = fd;
+		if (cfg_epollet)
+			ev.events |= EPOLLET;
+		epfd = epoll_create(1);
+		/* 0 is a valid descriptor; only negative values are errors */
+		if (epfd < 0)
+			error(1, errno, "epoll_create");
+		if (epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev))
+			error(1, errno, "epoll_ctl");
+	}
+
+	/* reset expected key on each new socket */
+	saved_tskey = -1;
+
+	if (cfg_proto == SOCK_STREAM) {
+		if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+			       (char*) &val, sizeof(val)))
+			error(1, errno, "setsockopt no nagle");
+
+		if (family == PF_INET) {
+			if (connect(fd, (void *) &daddr, sizeof(daddr)))
+				error(1, errno, "connect ipv4");
+		} else {
+			if (connect(fd, (void *) &daddr6, sizeof(daddr6)))
+				error(1, errno, "connect ipv6");
+		}
+	}
+
+	if (cfg_do_pktinfo) {
+		if (family == AF_INET6) {
+			if (setsockopt(fd, SOL_IPV6, IPV6_RECVPKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv6");
+		} else {
+			if (setsockopt(fd, SOL_IP, IP_PKTINFO,
+				       &val, sizeof(val)))
+				error(1, errno, "setsockopt pktinfo ipv4");
+		}
+	}
+
+	sock_opt = SOF_TIMESTAMPING_SOFTWARE |
+		   SOF_TIMESTAMPING_OPT_CMSG |
+		   SOF_TIMESTAMPING_OPT_ID;
+
+	/* with cmsg the report flags travel with each sendmsg() instead */
+	if (!cfg_use_cmsg)
+		sock_opt |= report_opt;
+
+	if (cfg_loop_nodata)
+		sock_opt |= SOF_TIMESTAMPING_OPT_TSONLY;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
+		       (char *) &sock_opt, sizeof(sock_opt)))
+		error(1, errno, "setsockopt timestamping");
+
+	for (i = 0; i < cfg_num_pkts; i++) {
+		memset(&msg, 0, sizeof(msg));
+		memset(buf, 'a' + i, total_len);
+
+		if (cfg_use_pf_packet || cfg_proto == SOCK_RAW) {
+			int off = 0;
+
+			if (cfg_use_pf_packet || cfg_ipproto == IPPROTO_RAW) {
+				if (family == PF_INET)
+					off = fill_header_ipv4(buf);
+				else
+					off = fill_header_ipv6(buf);
+			}
+
+			fill_header_udp(buf + off, family == PF_INET);
+		}
+
+		print_timestamp_usr();
+
+		iov.iov_base = buf;
+		iov.iov_len = total_len;
+
+		if (cfg_proto != SOCK_STREAM) {
+			if (cfg_use_pf_packet) {
+				memset(&laddr, 0, sizeof(laddr));
+
+				laddr.sll_family = AF_PACKET;
+				laddr.sll_ifindex = 1;
+				laddr.sll_protocol = htons(family == AF_INET ? ETH_P_IP : ETH_P_IPV6);
+				laddr.sll_halen = ETH_ALEN;
+
+				msg.msg_name = (void *)&laddr;
+				msg.msg_namelen = sizeof(laddr);
+			} else if (family == PF_INET) {
+				msg.msg_name = (void *)&daddr;
+				msg.msg_namelen = sizeof(daddr);
+			} else {
+				msg.msg_name = (void *)&daddr6;
+				msg.msg_namelen = sizeof(daddr6);
+			}
+		}
+
+		msg.msg_iov = &iov;
+		msg.msg_iovlen = 1;
+
+		if (cfg_use_cmsg) {
+			memset(control, 0, sizeof(control));
+
+			msg.msg_control = control;
+			msg.msg_controllen = sizeof(control);
+
+			cmsg = CMSG_FIRSTHDR(&msg);
+			cmsg->cmsg_level = SOL_SOCKET;
+			cmsg->cmsg_type = SO_TIMESTAMPING;
+			cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t));
+
+			*((uint32_t *) CMSG_DATA(cmsg)) = report_opt;
+		}
+
+		val = sendmsg(fd, &msg, 0);
+		if (val != total_len)
+			error(1, errno, "send");
+
+		/* wait for all errors to be queued, else ACKs arrive OOO */
+		if (cfg_sleep_usec)
+			usleep(cfg_sleep_usec);
+
+		if (!cfg_busy_poll) {
+			if (cfg_use_epoll)
+				__epoll(epfd);
+			else
+				__poll(fd);
+		}
+
+		/* drain the error queue until it reports empty */
+		while (!recv_errmsg(fd)) {}
+	}
+
+	print_timing_event("USR-ENQ", &usr_enq);
+	print_timing_event("USR-SND", &usr_snd);
+	print_timing_event("USR-ACK", &usr_ack);
+
+	if (close(fd))
+		error(1, errno, "close");
+
+	/* do_test() runs several times per process: do not leak the
+	 * epoll instance created above
+	 */
+	if (epfd >= 0 && close(epfd))
+		error(1, errno, "close epoll");
+
+	free(buf);
+	usleep(100 * NSEC_PER_USEC);
+}
+
+/*
+ * Print the command line help to stderr and exit with status 1.
+ * filepath is the program name shown in the usage line.
+ */
+static void __attribute__((noreturn)) usage(const char *filepath)
+{
+ fprintf(stderr, "\nUsage: %s [options] hostname\n"
+ "\nwhere options are:\n"
+ " -4: only IPv4\n"
+ " -6: only IPv6\n"
+ " -h: show this message\n"
+ " -b: busy poll to read from error queue\n"
+ " -c N: number of packets for each test\n"
+ " -C: use cmsg to set tstamp recording options\n"
+ " -e: use level-triggered epoll() instead of poll()\n"
+ " -E: use event-triggered epoll() instead of poll()\n"
+ " -F: poll()/epoll() waits forever for an event\n"
+ " -I: request PKTINFO\n"
+ " -l N: send N bytes at a time\n"
+ " -L listen on hostname and port\n"
+ " -n: set no-payload option\n"
+ " -N: print timestamps and durations in nsec (instead of usec)\n"
+ " -p N: connect to port N\n"
+ " -P: use PF_PACKET\n"
+ " -r: use raw\n"
+ " -R: use raw (IP_HDRINCL)\n"
+ " -S N: usec to sleep before reading error queue\n"
+ " -t N: tolerance (usec) for timestamp validation\n"
+ " -u: use udp\n"
+ " -v: validate SND delay (usec)\n"
+ " -V: validate ACK delay (usec)\n"
+ " -x: show payload (up to 70 bytes)\n",
+ filepath);
+ exit(1);
+}
+
+/*
+ * Parse the command line into the cfg_* globals and reject option
+ * combinations that cannot work. Exactly one non-option argument, the
+ * hostname, must remain; it is not consumed here.
+ *
+ * Invalid input is fatal: usage() or error() exit the program.
+ */
+static void parse_opt(int argc, char **argv)
+{
+	int proto_count = 0;
+	int c;
+
+	while ((c = getopt(argc, argv,
+			   "46bc:CeEFhIl:LnNp:PrRS:t:uv:V:x")) != -1) {
+		switch (c) {
+		case '4':
+			do_ipv6 = 0;
+			break;
+		case '6':
+			do_ipv4 = 0;
+			break;
+		case 'b':
+			cfg_busy_poll = true;
+			break;
+		case 'c':
+			cfg_num_pkts = strtoul(optarg, NULL, 10);
+			break;
+		case 'C':
+			cfg_use_cmsg = true;
+			break;
+		case 'e':
+			cfg_use_epoll = true;
+			break;
+		case 'E':
+			cfg_use_epoll = true;
+			cfg_epollet = true;
+			/* -E must not imply -F: the infinite timeout is a
+			 * separate, documented option
+			 */
+			break;
+		case 'F':
+			cfg_poll_timeout = -1;
+			break;
+		case 'I':
+			cfg_do_pktinfo = true;
+			break;
+		case 'l':
+			cfg_payload_len = strtoul(optarg, NULL, 10);
+			break;
+		case 'L':
+			cfg_do_listen = true;
+			break;
+		case 'n':
+			cfg_loop_nodata = true;
+			break;
+		case 'N':
+			cfg_print_nsec = true;
+			break;
+		case 'p':
+			dest_port = strtoul(optarg, NULL, 10);
+			break;
+		case 'P':
+			proto_count++;
+			cfg_use_pf_packet = true;
+			cfg_proto = SOCK_DGRAM;
+			cfg_ipproto = 0;
+			break;
+		case 'r':
+			proto_count++;
+			cfg_proto = SOCK_RAW;
+			cfg_ipproto = IPPROTO_UDP;
+			break;
+		case 'R':
+			proto_count++;
+			cfg_proto = SOCK_RAW;
+			cfg_ipproto = IPPROTO_RAW;
+			break;
+		case 'S':
+			cfg_sleep_usec = strtoul(optarg, NULL, 10);
+			break;
+		case 't':
+			cfg_delay_tolerance_usec = strtoul(optarg, NULL, 10);
+			break;
+		case 'u':
+			proto_count++;
+			cfg_proto = SOCK_DGRAM;
+			cfg_ipproto = IPPROTO_UDP;
+			break;
+		case 'v':
+			cfg_delay_snd = strtoul(optarg, NULL, 10);
+			break;
+		case 'V':
+			cfg_delay_ack = strtoul(optarg, NULL, 10);
+			break;
+		case 'x':
+			cfg_show_payload = true;
+			break;
+		case 'h':
+		default:
+			usage(argv[0]);
+		}
+	}
+
+	if (!cfg_payload_len)
+		error(1, 0, "payload may not be zero");
+	if (cfg_proto != SOCK_STREAM && cfg_payload_len > 1472)
+		error(1, 0, "udp packet might exceed expected MTU");
+	/* -4 clears do_ipv6 and -6 clears do_ipv4: both leaves nothing */
+	if (!do_ipv4 && !do_ipv6)
+		error(1, 0, "pass -4 or -6, not both");
+	if (proto_count > 1)
+		error(1, 0, "pass -P, -r, -R or -u, not multiple");
+	if (cfg_do_pktinfo && cfg_use_pf_packet)
+		error(1, 0, "cannot ask for pktinfo over pf_packet");
+	if (cfg_busy_poll && cfg_use_epoll)
+		error(1, 0, "pass epoll or busy_poll, not both");
+
+	if (optind != argc - 1)
+		error(1, 0, "missing required hostname argument");
+}
+
+/*
+ * Resolve hostname into daddr (IPv4) and/or daddr6 (IPv6), with the port
+ * set to dest_port. IPv4 is looked up first when enabled; a second
+ * lookup with AF_INET6 follows when IPv6 is enabled. A family for which
+ * no address was found is switched off in do_ipv4 / do_ipv6.
+ *
+ * A failing getaddrinfo() call is fatal.
+ */
+static void resolve_hostname(const char *hostname)
+{
+	struct addrinfo hints = { .ai_family = do_ipv4 ? AF_INET : AF_INET6 };
+	struct addrinfo *addrs, *cur;
+	int have_ipv4 = 0, have_ipv6 = 0;
+	int ret;
+
+retry:
+	/* getaddrinfo reports errors through its return value, not errno */
+	ret = getaddrinfo(hostname, NULL, &hints, &addrs);
+	if (ret)
+		error(1, 0, "getaddrinfo: %s", gai_strerror(ret));
+
+	/* take the first address of each family; the flags must not end
+	 * the walk, or the IPv6 pass would be skipped after an IPv4 hit
+	 */
+	for (cur = addrs; cur; cur = cur->ai_next) {
+		if (!have_ipv4 && cur->ai_family == AF_INET) {
+			memcpy(&daddr, cur->ai_addr, sizeof(daddr));
+			daddr.sin_port = htons(dest_port);
+			have_ipv4 = 1;
+		} else if (!have_ipv6 && cur->ai_family == AF_INET6) {
+			memcpy(&daddr6, cur->ai_addr, sizeof(daddr6));
+			daddr6.sin6_port = htons(dest_port);
+			have_ipv6 = 1;
+		}
+	}
+	if (addrs)
+		freeaddrinfo(addrs);
+
+	if (do_ipv6 && hints.ai_family != AF_INET6) {
+		hints.ai_family = AF_INET6;
+		goto retry;
+	}
+
+	do_ipv4 &= have_ipv4;
+	do_ipv6 &= have_ipv6;
+}
+
+/*
+ * Open a receiving socket bound to addr so that the packets sent by the
+ * test have a destination. SOCK_RAW tests get a SOCK_DGRAM receiver.
+ */
+static void do_listen(int family, void *addr, int alen)
+{
+	const int type = cfg_proto == SOCK_RAW ? SOCK_DGRAM : cfg_proto;
+	const int fd = socket(family, type, 0);
+
+	if (fd == -1)
+		error(1, errno, "socket rx");
+
+	if (bind(fd, addr, alen))
+		error(1, errno, "bind rx");
+
+	/* leave fd open, will be closed on process exit.
+	 * this enables connect() to succeed and avoids icmp replies
+	 */
+	if (type != SOCK_STREAM)
+		return;
+
+	if (listen(fd, 10))
+		error(1, errno, "listen rx");
+}
+
+/*
+ * Run all timestamp combinations for one address family: SND, ENQ and
+ * ENQ + SND always, the three ACK variants only for SOCK_STREAM.
+ */
+static void do_main(int family)
+{
+	static const struct {
+		const char *banner;
+		unsigned int report_opt;
+	} tests[] = {
+		{ "test SND\n", SOF_TIMESTAMPING_TX_SOFTWARE },
+		{ "test ENQ\n", SOF_TIMESTAMPING_TX_SCHED },
+		{ "test ENQ + SND\n", SOF_TIMESTAMPING_TX_SCHED |
+				      SOF_TIMESTAMPING_TX_SOFTWARE },
+		/* entries below request ACKs: SOCK_STREAM only */
+		{ "\ntest ACK\n", SOF_TIMESTAMPING_TX_ACK },
+		{ "\ntest SND + ACK\n", SOF_TIMESTAMPING_TX_SOFTWARE |
+					SOF_TIMESTAMPING_TX_ACK },
+		{ "\ntest ENQ + SND + ACK\n", SOF_TIMESTAMPING_TX_SCHED |
+					      SOF_TIMESTAMPING_TX_SOFTWARE |
+					      SOF_TIMESTAMPING_TX_ACK },
+	};
+	const int num_tests = cfg_proto == SOCK_STREAM ? 6 : 3;
+	int i;
+
+	fprintf(stderr, "family: %s %s\n",
+		family == PF_INET ? "INET" : "INET6",
+		cfg_use_pf_packet ? "(PF_PACKET)" : "");
+
+	for (i = 0; i < num_tests; i++) {
+		fputs(tests[i].banner, stderr);
+		do_test(family, tests[i].report_opt);
+	}
+}
+
+const char *sock_names[] = { NULL, "TCP", "UDP", "RAW" };
+
+/*
+ * Parse options, resolve the hostname given as last argument and run
+ * the tests for every enabled address family. The exit status is
+ * non-zero when any validation failed.
+ */
+int main(int argc, char **argv)
+{
+	if (argc == 1)
+		usage(argv[0]);
+
+	parse_opt(argc, argv);
+	resolve_hostname(argv[argc - 1]);
+
+	fprintf(stderr, "protocol: %s\npayload: %u\nserver port: %u\n\n",
+		sock_names[cfg_proto], cfg_payload_len, dest_port);
+
+	if (do_ipv4 && cfg_do_listen)
+		do_listen(PF_INET, &daddr, sizeof(daddr));
+	if (do_ipv4)
+		do_main(PF_INET);
+
+	if (do_ipv6 && cfg_do_listen)
+		do_listen(PF_INET6, &daddr6, sizeof(daddr6));
+	if (do_ipv6)
+		do_main(PF_INET6);
+
+	return test_failed;
+}
diff --git a/tools/testing/selftests/net/txtimestamp.sh b/tools/testing/selftests/net/txtimestamp.sh
new file mode 100755
index 000000000..31637769f
--- /dev/null
+++ b/tools/testing/selftests/net/txtimestamp.sh
@@ -0,0 +1,82 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Send packets with transmit timestamps over loopback with netem
+# Verify that timestamps correspond to netem delay
+
+set -e
+
+# Install the netem delays the timestamp validation relies on:
+# 1ms on lo egress, plus 2ms on lo ingress by redirecting it through
+# the egress of an ifb device.
+setup() {
+ # set 1ms delay on lo egress
+ tc qdisc add dev lo root netem delay 1ms
+
+ # set 2ms delay on ifb_netem0 egress
+ modprobe ifb
+ ip link add ifb_netem0 type ifb
+ ip link set dev ifb_netem0 up
+ tc qdisc add dev ifb_netem0 root netem delay 2ms
+
+ # redirect lo ingress through ifb_netem0 egress
+ tc qdisc add dev lo handle ffff: ingress
+ tc filter add dev lo parent ffff: \
+ u32 match mark 0 0xffff \
+ action mirred egress redirect dev ifb_netem0
+}
+
+run_test_v4v6() {
+ # SND will be delayed 1000us
+ # ACK will be delayed 6000us: 1 + 2 ms round-trip
+ local -r args="$@ -v 1000 -V 6000"
+
+ ./txtimestamp ${args} -4 -L 127.0.0.1
+ ./txtimestamp ${args} -6 -L ::1
+}
+
+run_test_tcpudpraw() {
+ local -r args=$@
+
+ run_test_v4v6 ${args} # tcp
+ run_test_v4v6 ${args} -u # udp
+ run_test_v4v6 ${args} -r # raw
+ run_test_v4v6 ${args} -R # raw (IPPROTO_RAW)
+ run_test_v4v6 ${args} -P # pf_packet
+}
+
+# Install the netem delays, then run the full matrix three times: with
+# options set by setsockopt, by cmsg (-C) and with -n (timestamp w/o data).
+# The script runs under set -e, so the OK line is only reached when
+# every txtimestamp run exited with status 0.
+run_test_all() {
+ setup
+ run_test_tcpudpraw # setsockopt
+ run_test_tcpudpraw -C # cmsg
+ run_test_tcpudpraw -n # timestamp w/o data
+ echo "OK. All tests passed"
+}
+
+# Install the netem delays and run txtimestamp once.
+# Arguments: the txtimestamp command line, passed through verbatim.
+run_test_one() {
+	setup
+	# quoted so each argument reaches txtimestamp as given
+	./txtimestamp "$@"
+}
+
+# Print the command line help to stdout.
+usage() {
+	printf '%s\n' \
+		"Usage: $0 [ -r | --run ] <txtimestamp args> | [ -h | --help ]" \
+		" (no args) Run all tests" \
+		" -r|--run Run an individual test with arguments" \
+		" -h|--help Help"
+}
+
+# Entry point inside the network namespace.
+# No arguments: run all tests. -r|--run ARGS...: run txtimestamp once
+# with ARGS. Anything else prints the usage text.
+main() {
+	if [[ $# -eq 0 ]]; then
+		run_test_all
+	else
+		if [[ "$1" = "-r" || "$1" == "--run" ]]; then
+			shift
+			# quoted so the remaining arguments are passed on as given
+			run_test_one "$@"
+		else
+			usage
+		fi
+	fi
+}
+
+# Re-execute this script through ./in_netns.sh when `ip netns identify`
+# reports no namespace for the current process; otherwise run directly.
+if [[ -z "$(ip netns identify)" ]]; then
+	./in_netns.sh "$0" "$@"
+else
+	main "$@"
+fi
diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
new file mode 100755
index 000000000..f8a19f548
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro.sh
@@ -0,0 +1,216 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgro functional tests.
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+# set global exit status, but never reset nonzero one.
+# Record $1 in the global ret unless an earlier failure is already stored.
+check_err()
+{
+	[ $ret -eq 0 ] || return 0
+	ret=$1
+}
+
+# EXIT handler: signal any background jobs that are still listed and
+# delete the peer namespace if `ip netns list` still shows it.
+# Errors from kill and ip are discarded.
+cleanup() {
+ local -r jobs="$(jobs -p)"
+ local -r ns="$(ip netns list|grep $PEER_NS)"
+
+ # signal 1 is SIGHUP
+ [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
+ [ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+trap cleanup EXIT
+
+# Create the peer namespace and a veth pair: veth0 stays in the current
+# namespace (192.168.1.2, 2001:db8::2), veth1 moves into the peer
+# namespace (192.168.1.1, 2001:db8::1) and gets the xdp_dummy program
+# attached.
+cfg_veth() {
+ ip netns add "${PEER_NS}"
+ ip -netns "${PEER_NS}" link set lo up
+ ip link add type veth
+ ip link set dev veth0 up
+ ip addr add dev veth0 192.168.1.2/24
+ ip addr add dev veth0 2001:db8::2/64 nodad
+
+ ip link set dev veth1 netns "${PEER_NS}"
+ ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
+ ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
+ ip -netns "${PEER_NS}" link set dev veth1 up
+ ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy
+}
+
+# Run one sender/receiver pair over the veth link.
+# Arguments: sender options, the literal word rx, receiver options.
+# The receiver runs in the background in the peer namespace and prints
+# "ok" or "failed"; the return value is the sender's exit status.
+run_one() {
+ # use 'rx' as separator between sender args and receiver args
+ local -r all="$@"
+ local -r tx_args=${all%rx*}
+ local -r rx_args=${all#*rx}
+
+ cfg_veth
+
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \
+ echo "ok" || \
+ echo "failed" &
+
+ # Hack: let bg programs complete the startup
+ sleep 0.1
+ ./udpgso_bench_tx ${tx_args}
+ ret=$?
+ # let the background receiver finish before returning
+ wait $(jobs -p)
+ return $ret
+}
+
+# Print the test name and run one test in a fresh network namespace.
+# Arguments: $1 - test name, $2 - sender options, $3 - receiver options.
+run_test() {
+	printf " %-40s" "$1"
+	# $2 and $3 each hold several options: left unquoted on purpose so
+	# they are split into separate words
+	./in_netns.sh "$0" __subprocess $2 rx -G -r $3
+}
+
+# Like run_one, but with a DNAT rule in the peer namespace that rewrites
+# the destination to a second address on veth1. A UDP_GRO-enabled
+# receiver listens on the original address, a plain one on the rewritten
+# address; the latter prints "ok" or "failed".
+# Arguments: sender options, the literal word rx, receiver options.
+# Returns the sender's exit status.
+run_one_nat() {
+ # use 'rx' as separator between sender args and receiver args
+ local addr1 addr2 pid family="" ipt_cmd=ip6tables
+ local -r all="$@"
+ local -r tx_args=${all%rx*}
+ local -r rx_args=${all#*rx}
+
+ # IPv6 is the default; a -4 among the sender options selects IPv4
+ if [[ ${tx_args} = *-4* ]]; then
+ ipt_cmd=iptables
+ family=-4
+ addr1=192.168.1.1
+ addr2=192.168.1.3/24
+ else
+ addr1=2001:db8::1
+ addr2="2001:db8::3/64 nodad"
+ fi
+
+ cfg_veth
+ ip -netns "${PEER_NS}" addr add dev veth1 ${addr2}
+
+ # fool the GRO engine changing the destination address ...
+ ip netns exec "${PEER_NS}" $ipt_cmd -t nat -I PREROUTING -d ${addr1} -j DNAT --to-destination ${addr2%/*}
+
+ # ... so that GRO will match the UDP_GRO enabled socket, but packets
+ # will land on the 'plain' one
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 &
+ pid=$!
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \
+ echo "ok" || \
+ echo "failed"&
+
+ sleep 0.1
+ ./udpgso_bench_tx ${tx_args}
+ ret=$?
+ # the first receiver is started with -n 0 and is interrupted here
+ kill -INT $pid
+ wait $(jobs -p)
+ return $ret
+}
+
+# Like run_one, but with two receivers: one on port 12345 and one on the
+# default port. The sender runs once against each; only the second
+# receiver prints "ok" or "failed".
+# Arguments: sender options, the literal word rx, receiver options.
+# Returns the exit status of the second sender run.
+run_one_2sock() {
+ # use 'rx' as separator between sender args and receiver args
+ local -r all="$@"
+ local -r tx_args=${all%rx*}
+ local -r rx_args=${all#*rx}
+
+ cfg_veth
+
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \
+ echo "ok" || \
+ echo "failed" &
+
+ # Hack: let bg programs complete the startup
+ sleep 0.1
+ ./udpgso_bench_tx ${tx_args} -p 12345
+ sleep 0.1
+ # first UDP GSO socket should be closed at this point
+ ./udpgso_bench_tx ${tx_args}
+ ret=$?
+ wait $(jobs -p)
+ return $ret
+}
+
+# Print the test name and run one NAT test in a fresh network namespace.
+# Arguments: $1 - test name, $2 - sender options, $3 - receiver options.
+run_nat_test() {
+	printf " %-40s" "$1"
+	# $2 and $3 each hold several options: left unquoted on purpose so
+	# they are split into separate words
+	./in_netns.sh "$0" __subprocess_nat $2 rx -r $3
+}
+
+# Print the test name and run one two-socket test in a fresh network
+# namespace.
+# Arguments: $1 - test name, $2 - sender options, $3 - receiver options.
+run_2sock_test() {
+	printf " %-40s" "$1"
+	# $2 and $3 each hold several options: left unquoted on purpose so
+	# they are split into separate words
+	./in_netns.sh "$0" __subprocess_2sock $2 rx -G -r $3
+}
+
+# Run the whole IPv4 and IPv6 test matrix. Every test runs even after a
+# failure; the return value is the first non-zero status recorded by
+# check_err, or 0 when all tests passed.
+run_all() {
+ local -r core_args="-l 4"
+ local -r ipv4_args="${core_args} -4 -D 192.168.1.1"
+ local -r ipv6_args="${core_args} -6 -D 2001:db8::1"
+ ret=0
+
+ echo "ipv4"
+ run_test "no GRO" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400"
+ check_err $?
+
+ # explicitly check we are not receiving UDP_SEGMENT cmsg (-S -1)
+ # when GRO does not take place
+ run_test "no GRO chk cmsg" "${ipv4_args} -M 10 -s 1400" "-4 -n 10 -l 1400 -S -1"
+ check_err $?
+
+ # the GSO packets are aggregated because:
+ # * veth schedule napi after each xmit
+ # * segmentation happens in BH context, veth napi poll is delayed after
+ # the transmission of the last segment
+ run_test "GRO" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720"
+ check_err $?
+ run_test "GRO chk cmsg" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720 -S 1472"
+ check_err $?
+ run_test "GRO with custom segment size" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720"
+ check_err $?
+ run_test "GRO with custom segment size cmsg" "${ipv4_args} -M 1 -s 14720 -S 500 " "-4 -n 1 -l 14720 -S 500"
+ check_err $?
+
+ run_nat_test "bad GRO lookup" "${ipv4_args} -M 1 -s 14720 -S 0" "-n 10 -l 1472"
+ check_err $?
+ run_2sock_test "multiple GRO socks" "${ipv4_args} -M 1 -s 14720 -S 0 " "-4 -n 1 -l 14720 -S 1472"
+ check_err $?
+
+ # same matrix over IPv6, with 14520-byte sends instead of 14720
+ echo "ipv6"
+ run_test "no GRO" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400"
+ check_err $?
+ run_test "no GRO chk cmsg" "${ipv6_args} -M 10 -s 1400" "-n 10 -l 1400 -S -1"
+ check_err $?
+ run_test "GRO" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520"
+ check_err $?
+ run_test "GRO chk cmsg" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 1 -l 14520 -S 1452"
+ check_err $?
+ run_test "GRO with custom segment size" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520"
+ check_err $?
+ run_test "GRO with custom segment size cmsg" "${ipv6_args} -M 1 -s 14520 -S 500" "-n 1 -l 14520 -S 500"
+ check_err $?
+
+ run_nat_test "bad GRO lookup" "${ipv6_args} -M 1 -s 14520 -S 0" "-n 10 -l 1452"
+ check_err $?
+ run_2sock_test "multiple GRO socks" "${ipv6_args} -M 1 -s 14520 -S 0 " "-n 1 -l 14520 -S 1452"
+ check_err $?
+ return $ret
+}
+
+# cfg_veth attaches ../bpf/xdp_dummy.o to veth1, so refuse to start
+# without it.
+if [ ! -f ../bpf/xdp_dummy.o ]; then
+ echo "Missing xdp_dummy helper. Build bpf selftest first"
+ exit -1
+fi
+
+# Without arguments run the whole suite. The __subprocess* words are
+# used by the run_*test helpers when they re-invoke this script through
+# ./in_netns.sh to run a single test.
+if [[ $# -eq 0 ]]; then
+ run_all
+elif [[ $1 == "__subprocess" ]]; then
+ shift
+ run_one $@
+elif [[ $1 == "__subprocess_nat" ]]; then
+ shift
+ run_one_nat $@
+elif [[ $1 == "__subprocess_2sock" ]]; then
+ shift
+ run_one_2sock $@
+fi
+
+# propagate the status of whichever branch ran
+exit $?
diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
new file mode 100755
index 000000000..820bc50f6
--- /dev/null
+++ b/tools/testing/selftests/net/udpgro_bench.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgro benchmarks
+
+readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+
+# EXIT handler: interrupt any background jobs that are still listed and
+# delete the peer namespace if `ip netns list` still shows it.
+# Errors from kill and ip are discarded.
+cleanup() {
+ local -r jobs="$(jobs -p)"
+ local -r ns="$(ip netns list|grep $PEER_NS)"
+
+ [ -n "${jobs}" ] && kill -INT ${jobs} 2>/dev/null
+ [ -n "$ns" ] && ip netns del $ns 2>/dev/null
+}
+trap cleanup EXIT
+
+# Set up the peer namespace and veth pair, start two background
+# receivers in the peer namespace (one of them with -t) and run the
+# sender against them.
+# Arguments: sender options, the literal word rx, receiver options.
+run_one() {
+ # use 'rx' as separator between sender args and receiver args
+ local -r all="$@"
+ local -r tx_args=${all%rx*}
+ local rx_args=${all#*rx}
+
+ # pass -4 on to the receivers when the sender options contain it
+ [[ "${tx_args}" == *"-4"* ]] && rx_args="${rx_args} -4"
+
+ ip netns add "${PEER_NS}"
+ ip -netns "${PEER_NS}" link set lo up
+ ip link add type veth
+ ip link set dev veth0 up
+ ip addr add dev veth0 192.168.1.2/24
+ ip addr add dev veth0 2001:db8::2/64 nodad
+
+ ip link set dev veth1 netns "${PEER_NS}"
+ ip -netns "${PEER_NS}" addr add dev veth1 192.168.1.1/24
+ ip -netns "${PEER_NS}" addr add dev veth1 2001:db8::1/64 nodad
+ ip -netns "${PEER_NS}" link set dev veth1 up
+
+ ip -n "${PEER_NS}" link set veth1 xdp object ../bpf/xdp_dummy.o section xdp_dummy
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
+ ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r &
+
+ # Hack: let bg programs complete the startup
+ sleep 0.1
+ ./udpgso_bench_tx ${tx_args}
+}
+
+# Re-run this script as a __subprocess through ./in_netns.sh.
+# Arguments: sender options, the literal word rx, receiver options.
+run_in_netns() {
+	# unquoted on purpose: the arguments are re-split on whitespace
+	./in_netns.sh $0 __subprocess $@
+}
+
+# Run the UDP benchmarks: first with -S 0 on the sender only, then with
+# -G added on the receiver side.
+# Arguments: common sender options.
+run_udp() {
+	local -r sender_opts="$*"
+
+	echo "udp gso - over veth touching data"
+	run_in_netns ${sender_opts} -S 0 rx
+
+	echo "udp gso and gro - over veth touching data"
+	run_in_netns ${sender_opts} -S 0 rx -G
+}
+
+run_tcp() {
+ local -r args=$@
+
+ echo "tcp - over veth touching data"
+ run_in_netns ${args} -t rx
+}
+
+# Run the TCP and UDP benchmarks over IPv4, then over IPv6.
+run_all() {
+	local -r core_args="-l 4"
+	local -r ipv4_args="${core_args} -4 -D 192.168.1.1"
+	local -r ipv6_args="${core_args} -6 -D 2001:db8::1"
+
+	echo "ipv4"
+	run_tcp "${ipv4_args}"
+	run_udp "${ipv4_args}"
+
+	echo "ipv6"
+	# the TCP run in the ipv6 section must use the IPv6 arguments
+	run_tcp "${ipv6_args}"
+	run_udp "${ipv6_args}"
+}
+
+# run_one attaches ../bpf/xdp_dummy.o to veth1, so refuse to start
+# without it.
+if [ ! -f ../bpf/xdp_dummy.o ]; then
+ echo "Missing xdp_dummy helper. Build bpf selftest first"
+ exit -1
+fi
+
+# No arguments: run every benchmark. __subprocess: second stage, invoked
+# through ./in_netns.sh by run_in_netns. Anything else is treated as
+# benchmark arguments and run in a fresh namespace.
+if [[ $# -eq 0 ]]; then
+ run_all
+elif [[ $1 == "__subprocess" ]]; then
+ shift
+ run_one $@
+else
+ run_in_netns $@
+fi
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
new file mode 100644
index 000000000..7badaf215
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.c
@@ -0,0 +1,685 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <net/if.h>
+#include <linux/in.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT 103
+#endif
+
+#ifndef UDP_MAX_SEGMENTS
+#define UDP_MAX_SEGMENTS (1 << 6UL)
+#endif
+
+#define CONST_MTU_TEST 1500
+
+#define CONST_HDRLEN_V4 (sizeof(struct iphdr) + sizeof(struct udphdr))
+#define CONST_HDRLEN_V6 (sizeof(struct ip6_hdr) + sizeof(struct udphdr))
+
+#define CONST_MSS_V4 (CONST_MTU_TEST - CONST_HDRLEN_V4)
+#define CONST_MSS_V6 (CONST_MTU_TEST - CONST_HDRLEN_V6)
+
+#define CONST_MAX_SEGS_V4 (ETH_MAX_MTU / CONST_MSS_V4)
+#define CONST_MAX_SEGS_V6 (ETH_MAX_MTU / CONST_MSS_V6)
+
+static bool cfg_do_ipv4;
+static bool cfg_do_ipv6;
+static bool cfg_do_connected;
+static bool cfg_do_connectionless;
+static bool cfg_do_msgmore;
+static bool cfg_do_setsockopt;
+static int cfg_specific_test_id = -1;
+
+static const char cfg_ifname[] = "lo";
+static unsigned short cfg_port = 9000;
+
+static char buf[ETH_MAX_MTU];
+
+/*
+ * One send/receive scenario. Unset fields default to zero; an entry whose
+ * tlen is zero terminates a testcase table (see the loop in run_all()).
+ */
+struct testcase {
+ int tlen; /* send() buffer size, may exceed mss */
+ bool tfail; /* send() call is expected to fail */
+ int gso_len; /* mss after applying gso */
+ int r_num_mss; /* recv(): number of calls of full mss */
+ int r_len_last; /* recv(): size of last non-mss dgram, if any */
+};
+
+const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
+const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
+
+/*
+ * IPv4 scenarios. Sizes are derived from CONST_MTU_TEST and the IPv4 + UDP
+ * header length; entries without .gso_len send without a segment size.
+ * The list ends with an all-zero entry.
+ */
+struct testcase testcases_v4[] = {
+ {
+ /* no GSO: send a single byte */
+ .tlen = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* no GSO: send a single MSS */
+ .tlen = CONST_MSS_V4,
+ .r_num_mss = 1,
+ },
+ {
+ /* no GSO: send a single MSS + 1B: fail */
+ .tlen = CONST_MSS_V4 + 1,
+ .tfail = true,
+ },
+ {
+ /* send a single MSS: will fall back to no GSO */
+ .tlen = CONST_MSS_V4,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 1,
+ },
+ {
+ /* send a single MSS + 1B */
+ .tlen = CONST_MSS_V4 + 1,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* send exactly 2 MSS */
+ .tlen = CONST_MSS_V4 * 2,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2 MSS + 1B */
+ .tlen = (CONST_MSS_V4 * 2) + 1,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send MAX segs */
+ .tlen = (ETH_MAX_MTU / CONST_MSS_V4) * CONST_MSS_V4,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = (ETH_MAX_MTU / CONST_MSS_V4),
+ },
+
+ {
+ /* send MAX bytes */
+ .tlen = ETH_MAX_MTU - CONST_HDRLEN_V4,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = CONST_MAX_SEGS_V4,
+ .r_len_last = ETH_MAX_MTU - CONST_HDRLEN_V4 -
+ (CONST_MAX_SEGS_V4 * CONST_MSS_V4),
+ },
+ {
+ /* send MAX + 1: fail */
+ .tlen = ETH_MAX_MTU - CONST_HDRLEN_V4 + 1,
+ .gso_len = CONST_MSS_V4,
+ .tfail = true,
+ },
+ {
+ /* send a single 1B MSS: will fall back to no GSO */
+ .tlen = 1,
+ .gso_len = 1,
+ .r_num_mss = 1,
+ },
+ {
+ /* send 2 1B segments */
+ .tlen = 2,
+ .gso_len = 1,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2B + 2B + 1B segments */
+ .tlen = 5,
+ .gso_len = 2,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send max number of min sized segments */
+ .tlen = UDP_MAX_SEGMENTS,
+ .gso_len = 1,
+ .r_num_mss = UDP_MAX_SEGMENTS,
+ },
+ {
+ /* send max number + 1 of min sized segments: fail */
+ .tlen = UDP_MAX_SEGMENTS + 1,
+ .gso_len = 1,
+ .tfail = true,
+ },
+ {
+ /* EOL */
+ }
+};
+
+#ifndef IP6_MAX_MTU
+#define IP6_MAX_MTU (ETH_MAX_MTU + sizeof(struct ip6_hdr))
+#endif
+
+/*
+ * IPv6 scenarios, mirroring the IPv4 table with the IPv6 header length and
+ * IP6_MAX_MTU as the upper bound. The list ends with an all-zero entry.
+ */
+struct testcase testcases_v6[] = {
+ {
+ /* no GSO: send a single byte */
+ .tlen = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* no GSO: send a single MSS */
+ .tlen = CONST_MSS_V6,
+ .r_num_mss = 1,
+ },
+ {
+ /* no GSO: send a single MSS + 1B: fail */
+ .tlen = CONST_MSS_V6 + 1,
+ .tfail = true,
+ },
+ {
+ /* send a single MSS: will fall back to no GSO */
+ .tlen = CONST_MSS_V6,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 1,
+ },
+ {
+ /* send a single MSS + 1B */
+ .tlen = CONST_MSS_V6 + 1,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* send exactly 2 MSS */
+ .tlen = CONST_MSS_V6 * 2,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2 MSS + 1B */
+ .tlen = (CONST_MSS_V6 * 2) + 1,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send MAX segs */
+ .tlen = (IP6_MAX_MTU / CONST_MSS_V6) * CONST_MSS_V6,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = (IP6_MAX_MTU / CONST_MSS_V6),
+ },
+
+ {
+ /* send MAX bytes */
+ .tlen = IP6_MAX_MTU - CONST_HDRLEN_V6,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = CONST_MAX_SEGS_V6,
+ .r_len_last = IP6_MAX_MTU - CONST_HDRLEN_V6 -
+ (CONST_MAX_SEGS_V6 * CONST_MSS_V6),
+ },
+ {
+ /* send MAX + 1: fail */
+ .tlen = IP6_MAX_MTU - CONST_HDRLEN_V6 + 1,
+ .gso_len = CONST_MSS_V6,
+ .tfail = true,
+ },
+ {
+ /* send a single 1B MSS: will fall back to no GSO */
+ .tlen = 1,
+ .gso_len = 1,
+ .r_num_mss = 1,
+ },
+ {
+ /* send 2 1B segments */
+ .tlen = 2,
+ .gso_len = 1,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2B + 2B + 1B segments */
+ .tlen = 5,
+ .gso_len = 2,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send max number of min sized segments */
+ .tlen = UDP_MAX_SEGMENTS,
+ .gso_len = 1,
+ .r_num_mss = UDP_MAX_SEGMENTS,
+ },
+ {
+ /* send max number + 1 of min sized segments: fail */
+ .tlen = UDP_MAX_SEGMENTS + 1,
+ .gso_len = 1,
+ .tfail = true,
+ },
+ {
+ /* EOL */
+ }
+};
+
+/*
+ * Return the MTU of interface @ifname, read with SIOCGIFMTU on socket @fd.
+ * Exits the program if the ioctl fails.
+ */
+static unsigned int get_device_mtu(int fd, const char *ifname)
+{
+	struct ifreq ifr;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	/* Bounded copy; the memset above keeps the name NUL-terminated */
+	strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name) - 1);
+
+	if (ioctl(fd, SIOCGIFMTU, &ifr))
+		error(1, errno, "ioctl get mtu");
+
+	return ifr.ifr_mtu;
+}
+
+/*
+ * Set the MTU of interface @ifname to @mtu with SIOCSIFMTU on socket @fd.
+ * Exits the program if the ioctl fails; the result is not read back here.
+ */
+static void __set_device_mtu(int fd, const char *ifname, unsigned int mtu)
+{
+	struct ifreq ifr;
+
+	memset(&ifr, 0, sizeof(ifr));
+
+	ifr.ifr_mtu = mtu;
+	/* Bounded copy; the memset above keeps the name NUL-terminated */
+	strncpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name) - 1);
+
+	if (ioctl(fd, SIOCSIFMTU, &ifr))
+		error(1, errno, "ioctl set mtu");
+}
+
+/*
+ * Set the MTU of the test interface (cfg_ifname) to @mtu and verify it by
+ * reading the value back. Logs the original and the new MTU to stderr and
+ * exits the program if the device does not report the requested value.
+ */
+static void set_device_mtu(int fd, int mtu)
+{
+	unsigned int val;
+
+	val = get_device_mtu(fd, cfg_ifname);
+	fprintf(stderr, "device mtu (orig): %u\n", val);
+
+	__set_device_mtu(fd, cfg_ifname, mtu);
+	val = get_device_mtu(fd, cfg_ifname);
+	if (val != (unsigned int)mtu)
+		/* report the value we asked for, not the one read back */
+		error(1, 0, "unable to set device mtu to %u\n",
+		      (unsigned int)mtu);
+
+	fprintf(stderr, "device mtu (test): %u\n", val);
+}
+
+/*
+ * Switch socket @fd to the "DO" path MTU discovery mode of its address
+ * family (IPv4 when @is_ipv4, IPv6 otherwise). Exits on failure.
+ */
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+	const int level = is_ipv4 ? SOL_IP : SOL_IPV6;
+	const int name = is_ipv4 ? IP_MTU_DISCOVER : IPV6_MTU_DISCOVER;
+	int val = is_ipv4 ? IP_PMTUDISC_DO : IPV6_PMTUDISC_DO;
+
+	if (setsockopt(fd, level, name, &val, sizeof(val)))
+		error(1, errno, "setsockopt path mtu");
+}
+
+/*
+ * Read the path MTU the kernel currently associates with socket @fd
+ * (IP_MTU or IPV6_MTU), log it to stderr and return it. Exits on failure.
+ */
+static unsigned int get_path_mtu(int fd, bool is_ipv4)
+{
+	const int level = is_ipv4 ? SOL_IP : SOL_IPV6;
+	const int name = is_ipv4 ? IP_MTU : IPV6_MTU;
+	unsigned int mtu;
+	socklen_t vallen = sizeof(mtu);
+
+	if (getsockopt(fd, level, name, &mtu, &vallen))
+		error(1, errno, "getsockopt mtu");
+
+	fprintf(stderr, "path mtu (read): %u\n", mtu);
+	return mtu;
+}
+
+/*
+ * very wordy version of system("ip route add dev lo mtu 1500 127.0.0.3/32")
+ *
+ * Hand-builds one RTM_NEWROUTE request and sends it on a NETLINK_ROUTE
+ * socket: a host route (full-length prefix) to addr4 or addr6 with @mtu
+ * attached as an RTAX_MTU metric. NLM_F_ACK is not set and no reply is
+ * read, so only local socket/send/close errors are detected here.
+ */
+static void set_route_mtu(int mtu, bool is_ipv4)
+{
+ struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+ struct nlmsghdr *nh;
+ struct rtattr *rta;
+ struct rtmsg *rt;
+ /* sized for the larger (IPv6) address so both families fit */
+ char data[NLMSG_ALIGN(sizeof(*nh)) +
+ NLMSG_ALIGN(sizeof(*rt)) +
+ NLMSG_ALIGN(RTA_LENGTH(sizeof(addr6))) +
+ NLMSG_ALIGN(RTA_LENGTH(sizeof(int))) +
+ NLMSG_ALIGN(RTA_LENGTH(0) + RTA_LENGTH(sizeof(int)))];
+ int fd, ret, alen, off = 0;
+
+ alen = is_ipv4 ? sizeof(addr4) : sizeof(addr6);
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (fd == -1)
+ error(1, errno, "socket netlink");
+
+ memset(data, 0, sizeof(data));
+
+ /* netlink header; nlmsg_len is filled in once the total is known */
+ nh = (void *)data;
+ nh->nlmsg_type = RTM_NEWROUTE;
+ nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+ off += NLMSG_ALIGN(sizeof(*nh));
+
+ /* route message: prefix length is the full address width in bits */
+ rt = (void *)(data + off);
+ rt->rtm_family = is_ipv4 ? AF_INET : AF_INET6;
+ rt->rtm_table = RT_TABLE_MAIN;
+ rt->rtm_dst_len = alen << 3;
+ rt->rtm_protocol = RTPROT_BOOT;
+ rt->rtm_scope = RT_SCOPE_UNIVERSE;
+ rt->rtm_type = RTN_UNICAST;
+ off += NLMSG_ALIGN(sizeof(*rt));
+
+ /* attribute 1: destination address */
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_DST;
+ rta->rta_len = RTA_LENGTH(alen);
+ if (is_ipv4)
+ memcpy(RTA_DATA(rta), &addr4, alen);
+ else
+ memcpy(RTA_DATA(rta), &addr6, alen);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* attribute 2: output interface, hard-coded to ifindex 1 */
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_OIF;
+ rta->rta_len = RTA_LENGTH(sizeof(int));
+ *((int *)(RTA_DATA(rta))) = 1; //if_nametoindex("lo");
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* MTU is a subtype in a metrics type */
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_METRICS;
+ rta->rta_len = RTA_LENGTH(0) + RTA_LENGTH(sizeof(int));
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* now fill MTU subtype. Note that it fits within above rta_len */
+ rta = (void *)(((char *) rta) + RTA_LENGTH(0));
+ rta->rta_type = RTAX_MTU;
+ rta->rta_len = RTA_LENGTH(sizeof(int));
+ *((int *)(RTA_DATA(rta))) = mtu;
+
+ nh->nlmsg_len = off;
+
+ ret = sendto(fd, data, off, 0, (void *)&nladdr, sizeof(nladdr));
+ if (ret != off)
+ error(1, errno, "send netlink: %uB != %uB\n", ret, off);
+
+ if (close(fd))
+ error(1, errno, "close netlink");
+
+ fprintf(stderr, "route mtu (test): %u\n", mtu);
+}
+
+/*
+ * sendmsg() wrapper used by send_one().
+ *
+ * Returns false when the kernel rejects the send with EMSGSIZE, ENOMEM or
+ * EINVAL (the failures the negative testcases expect), true when the whole
+ * first iovec was sent. Any other error, a short send, or returned message
+ * flags terminate the program.
+ */
+static bool __send_one(int fd, struct msghdr *msg, int flags)
+{
+	int ret = sendmsg(fd, msg, flags);
+
+	if (ret == -1) {
+		switch (errno) {
+		case EMSGSIZE:
+		case ENOMEM:
+		case EINVAL:
+			return false;
+		default:
+			error(1, errno, "sendmsg");
+		}
+	}
+
+	if (ret != msg->msg_iov->iov_len)
+		error(1, 0, "sendto: %d != %llu", ret,
+		      (unsigned long long)msg->msg_iov->iov_len);
+	if (msg->msg_flags)
+		error(1, 0, "sendmsg: return flags 0x%x\n", msg->msg_flags);
+
+	return true;
+}
+
+/*
+ * Send @len bytes from the global buf on @fd.
+ *
+ * @gso_len: segment size; when non-zero and the setsockopt mode is off it
+ *           is passed per call as a UDP_SEGMENT control message.
+ * @addr, @alen: destination, used as msg_name/msg_namelen.
+ *
+ * With cfg_do_msgmore the payload is split into a 1-byte MSG_MORE send
+ * followed by the remainder. Returns the result of the final __send_one().
+ */
+static bool send_one(int fd, int len, int gso_len,
+		     struct sockaddr *addr, socklen_t alen)
+{
+	char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	struct cmsghdr *cm;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	msg.msg_name = addr;
+	msg.msg_namelen = alen;
+
+	if (gso_len && !cfg_do_setsockopt) {
+		msg.msg_control = control;
+		msg.msg_controllen = sizeof(control);
+
+		cm = CMSG_FIRSTHDR(&msg);
+		cm->cmsg_level = SOL_UDP;
+		cm->cmsg_type = UDP_SEGMENT;
+		cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
+		*((uint16_t *) CMSG_DATA(cm)) = gso_len;
+	}
+
+	/* If MSG_MORE, send 1 byte followed by remainder */
+	if (cfg_do_msgmore && len > 1) {
+		iov.iov_len = 1;
+		if (!__send_one(fd, &msg, MSG_MORE))
+			error(1, 0, "send 1B failed");
+
+		/* arithmetic on void * is a GNU extension: step via char * */
+		iov.iov_base = (char *)iov.iov_base + 1;
+		iov.iov_len = len - 1;
+	}
+
+	return __send_one(fd, &msg, 0);
+}
+
+/*
+ * Receive one datagram from @fd into the global buf and return its length.
+ * A non-blocking read (MSG_DONTWAIT in @flags) that finds nothing queued
+ * returns 0; every other error terminates the program.
+ */
+static int recv_one(int fd, int flags)
+{
+	int ret = recv(fd, buf, sizeof(buf), flags);
+
+	if (ret != -1)
+		return ret;
+
+	if (errno == EAGAIN && (flags & MSG_DONTWAIT))
+		return 0;
+
+	error(1, errno, "recv");
+	return ret;
+}
+
+/*
+ * Execute one testcase: send @test->tlen bytes on @fdt and check on @fdr
+ * that exactly the expected datagrams arrive.
+ *
+ * The send outcome must match @test->tfail. After a successful send,
+ * r_num_mss datagrams of the segment size are expected, then optionally one
+ * datagram of r_len_last bytes, and nothing more.
+ */
+static void run_one(struct testcase *test, int fdt, int fdr,
+		    struct sockaddr *addr, socklen_t alen)
+{
+	const bool is_ipv4 = addr->sa_family == AF_INET;
+	int i, ret, mss;
+	bool sent;
+
+	fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
+		is_ipv4 ? 4 : 6,
+		test->tlen, test->gso_len,
+		test->tfail ? "(fail)" : "");
+
+	if (cfg_do_setsockopt) {
+		int val = test->gso_len;
+
+		if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
+			error(1, errno, "setsockopt udp segment");
+	}
+
+	sent = send_one(fdt, test->tlen, test->gso_len, addr, alen);
+	if (sent) {
+		if (test->tfail)
+			error(1, 0, "send succeeded while expecting failure");
+	} else {
+		if (!test->tfail)
+			error(1, 0, "send failed while expecting success");
+		/* expected failure: nothing was sent, nothing to receive */
+		return;
+	}
+
+	/* Without GSO a "full" datagram is one MSS of the test MTU */
+	mss = test->gso_len;
+	if (!mss)
+		mss = is_ipv4 ? CONST_MSS_V4 : CONST_MSS_V6;
+
+	/* Recv all full MSS datagrams */
+	i = 0;
+	while (i < test->r_num_mss) {
+		ret = recv_one(fdr, 0);
+		if (ret != mss)
+			error(1, 0, "recv.%d: %d != %d", i, ret, mss);
+		i++;
+	}
+
+	/* Recv the non-full last datagram, if tlen was not a multiple of mss */
+	if (test->r_len_last) {
+		ret = recv_one(fdr, 0);
+		if (ret != test->r_len_last)
+			error(1, 0, "recv.%d: %d != %d (last)",
+			      i, ret, test->r_len_last);
+	}
+
+	/* Verify received all data */
+	if (recv_one(fdr, MSG_DONTWAIT))
+		error(1, 0, "recv: unexpected datagram");
+}
+
+/*
+ * Run every testcase of the table matching @addr's family, or only the one
+ * selected with cfg_specific_test_id. The table ends at the first entry
+ * whose tlen is zero.
+ */
+static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
+{
+	struct testcase *tests;
+	int idx;
+
+	if (addr->sa_family == AF_INET)
+		tests = testcases_v4;
+	else
+		tests = testcases_v6;
+
+	for (idx = 0; tests[idx].tlen; idx++) {
+		/* if a specific test is given, then skip all others */
+		if (cfg_specific_test_id != -1 && cfg_specific_test_id != idx)
+			continue;
+
+		run_one(&tests[idx], fdt, fdr, addr, alen);
+	}
+}
+
+/*
+ * Set up a receiver socket bound to @addr and a sender socket of the same
+ * family, then run the testcase table in the selected modes:
+ *  - connectionless: device MTU set to CONST_MTU_TEST, explicit destination
+ *  - connected: device MTU raised by 100, a route MTU of CONST_MTU_TEST
+ *    installed, and the connected socket's path MTU verified
+ */
+static void run_test(struct sockaddr *addr, socklen_t alen)
+{
+	const bool is_ipv4 = addr->sa_family == AF_INET;
+	struct timeval rcv_timeout = { .tv_usec = 100 * 1000 };
+	int fdr, fdt;
+
+	fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fdr == -1)
+		error(1, errno, "socket r");
+
+	if (bind(fdr, addr, alen))
+		error(1, errno, "bind");
+
+	/* Have tests fail quickly instead of hang */
+	if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO,
+		       &rcv_timeout, sizeof(rcv_timeout)))
+		error(1, errno, "setsockopt rcv timeout");
+
+	fdt = socket(addr->sa_family, SOCK_DGRAM, 0);
+	if (fdt == -1)
+		error(1, errno, "socket t");
+
+	/* Do not fragment these datagrams: only succeed if GSO works */
+	set_pmtu_discover(fdt, is_ipv4);
+
+	if (cfg_do_connectionless) {
+		set_device_mtu(fdt, CONST_MTU_TEST);
+		run_all(fdt, fdr, addr, alen);
+	}
+
+	if (cfg_do_connected) {
+		int pmtu;
+
+		set_device_mtu(fdt, CONST_MTU_TEST + 100);
+		set_route_mtu(CONST_MTU_TEST, is_ipv4);
+
+		if (connect(fdt, addr, alen))
+			error(1, errno, "connect");
+
+		pmtu = get_path_mtu(fdt, is_ipv4);
+		if (pmtu != CONST_MTU_TEST)
+			error(1, 0, "bad path mtu %u\n", pmtu);
+
+		run_all(fdt, fdr, addr, 0 /* use connected addr */);
+	}
+
+	if (close(fdt))
+		error(1, errno, "close t");
+	if (close(fdr))
+		error(1, errno, "close r");
+}
+
+/* Run the tests over IPv4 against addr4 on cfg_port. */
+static void run_test_v4(void)
+{
+	struct sockaddr_in addr = {
+		.sin_family = AF_INET,
+		.sin_port = htons(cfg_port),
+		.sin_addr = addr4,
+	};
+
+	run_test((void *)&addr, sizeof(addr));
+}
+
+/* Run the tests over IPv6 against addr6 on cfg_port. */
+static void run_test_v6(void)
+{
+	struct sockaddr_in6 addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = htons(cfg_port),
+		.sin6_addr = addr6,
+	};
+
+	run_test((void *)&addr, sizeof(addr));
+}
+
+/*
+ * Parse the command line into the cfg_* globals:
+ *   -4 / -6   test IPv4 / IPv6
+ *   -c / -C   connected / connectionless mode
+ *   -m        split sends with MSG_MORE
+ *   -s        set the segment size with setsockopt instead of a cmsg
+ *   -t id     run only the testcase with this table index
+ * Any other option terminates the program.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	int opt;
+
+	for (;;) {
+		opt = getopt(argc, argv, "46cCmst:");
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case '4':
+			cfg_do_ipv4 = true;
+			break;
+		case '6':
+			cfg_do_ipv6 = true;
+			break;
+		case 'c':
+			cfg_do_connected = true;
+			break;
+		case 'C':
+			cfg_do_connectionless = true;
+			break;
+		case 'm':
+			cfg_do_msgmore = true;
+			break;
+		case 's':
+			cfg_do_setsockopt = true;
+			break;
+		case 't':
+			cfg_specific_test_id = strtoul(optarg, NULL, 0);
+			break;
+		default:
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+}
+
+/*
+ * Entry point: run the IPv4 and/or IPv6 test sets selected on the command
+ * line. Any failing check exits from within the helpers, so reaching the
+ * end means everything that ran has passed.
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+
+	if (cfg_do_ipv4)
+		run_test_v4();
+
+	if (cfg_do_ipv6)
+		run_test_v6();
+
+	fputs("OK\n", stderr);
+	return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
new file mode 100755
index 000000000..fec24f584
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -0,0 +1,29 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso regression tests
+
+echo "ipv4 cmsg"
+./in_netns.sh ./udpgso -4 -C
+
+echo "ipv4 setsockopt"
+./in_netns.sh ./udpgso -4 -C -s
+
+echo "ipv6 cmsg"
+./in_netns.sh ./udpgso -6 -C
+
+echo "ipv6 setsockopt"
+./in_netns.sh ./udpgso -6 -C -s
+
+echo "ipv4 connected"
+./in_netns.sh ./udpgso -4 -c
+
+# blocked on 2nd loopback address
+# echo "ipv6 connected"
+# ./in_netns.sh ./udpgso -6 -c
+
+echo "ipv4 msg_more"
+./in_netns.sh ./udpgso -4 -C -m
+
+echo "ipv6 msg_more"
+./in_netns.sh ./udpgso -6 -C -m
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
new file mode 100755
index 000000000..640bc4345
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso benchmarks
+
+readonly GREEN='\033[0;92m'
+readonly YELLOW='\033[0;33m'
+readonly RED='\033[0;31m'
+readonly NC='\033[0m' # No Color
+readonly TESTPORT=8000
+
+readonly KSFT_PASS=0
+readonly KSFT_FAIL=1
+readonly KSFT_SKIP=4
+
+num_pass=0
+num_err=0
+num_skip=0
+
+kselftest_test_exitcode() {
+ local -r exitcode=$1
+
+ if [[ ${exitcode} -eq ${KSFT_PASS} ]]; then
+ num_pass=$(( $num_pass + 1 ))
+ elif [[ ${exitcode} -eq ${KSFT_SKIP} ]]; then
+ num_skip=$(( $num_skip + 1 ))
+ else
+ num_err=$(( $num_err + 1 ))
+ fi
+}
+
+# Print the pass/skip/fail totals plus a coloured verdict, then exit with
+# the matching kselftest status. Any failure wins over skips, any skip wins
+# over a plain pass.
+kselftest_exit() {
+	local -r prog="$(basename $0)"
+	local verdict
+	local status
+
+	echo -e "${prog}: PASS=${num_pass} SKIP=${num_skip} FAIL=${num_err}"
+
+	if [[ $num_err -ne 0 ]]; then
+		verdict="${RED}FAIL${NC}"
+		status=${KSFT_FAIL}
+	elif [[ $num_skip -ne 0 ]]; then
+		verdict="${YELLOW}SKIP${NC}"
+		status=${KSFT_SKIP}
+	else
+		verdict="${GREEN}PASS${NC}"
+		status=${KSFT_PASS}
+	fi
+
+	echo -e "${prog}: ${verdict}"
+	exit ${status}
+}
+
+wake_children() {
+ local -r jobs="$(jobs -p)"
+
+ if [[ "${jobs}" != "" ]]; then
+ kill -1 ${jobs} 2>/dev/null
+ fi
+}
+trap wake_children EXIT
+
+run_one() {
+ local -r args=$@
+ local nr_socks=0
+ local i=0
+ local -r timeout=10
+
+ ./udpgso_bench_rx -p "$TESTPORT" &
+ ./udpgso_bench_rx -p "$TESTPORT" -t &
+
+ # Wait for the above test program to get ready to receive connections.
+ while [ "$i" -lt "$timeout" ]; do
+ nr_socks="$(ss -lnHi | grep -c "\*:${TESTPORT}")"
+ [ "$nr_socks" -eq 2 ] && break
+ i=$((i + 1))
+ sleep 1
+ done
+ if [ "$nr_socks" -ne 2 ]; then
+ echo "timed out while waiting for udpgso_bench_rx"
+ exit 1
+ fi
+
+ ./udpgso_bench_tx -p "$TESTPORT" ${args}
+}
+
+# Re-run this script through the in_netns.sh helper with the "__subprocess"
+# marker and our arguments, then record its exit code in the counters.
+run_in_netns() {
+	# Flattened to one string and re-split on whitespace on purpose:
+	# callers pack several options into a single argument.
+	local -r forwarded="$*"
+
+	./in_netns.sh $0 __subprocess ${forwarded}
+	kselftest_test_exitcode $?
+}
+
+run_udp() {
+ local -r args=$@
+
+ echo "udp"
+ run_in_netns ${args}
+
+ echo "udp gso"
+ run_in_netns ${args} -S 0
+
+ echo "udp gso zerocopy"
+ run_in_netns ${args} -S 0 -z
+
+ echo "udp gso timestamp"
+ run_in_netns ${args} -S 0 -T
+
+ echo "udp gso zerocopy audit"
+ run_in_netns ${args} -S 0 -z -a
+
+ echo "udp gso timestamp audit"
+ run_in_netns ${args} -S 0 -T -a
+
+ echo "udp gso zerocopy timestamp audit"
+ run_in_netns ${args} -S 0 -T -z -a
+}
+
+# Run the TCP benchmark variants on top of the given base options.
+run_tcp() {
+	local -r base="$*"
+
+	echo "tcp"
+	run_in_netns ${base} -t
+
+	echo "tcp zerocopy"
+	run_in_netns ${base} -t -z
+
+	# excluding for now because test fails intermittently
+	# add -P option to include poll() to reduce possibility of lost messages
+	#echo "tcp zerocopy audit"
+	#run_in_netns ${base} -t -z -P -a
+}
+
+# Run the TCP and UDP benchmark sets over loopback, IPv4 first, then IPv6.
+# Each option string is passed as ONE word; the run_* helpers split it.
+run_all() {
+	local -r common="-l 3"
+	local -r v4_opts="${common} -4 -D 127.0.0.1"
+	local -r v6_opts="${common} -6 -D ::1"
+
+	echo "ipv4"
+	run_tcp "${v4_opts}"
+	run_udp "${v4_opts}"
+
+	echo "ipv6"
+	run_tcp "${v6_opts}"
+	run_udp "${v6_opts}"
+}
+
+if [[ $# -eq 0 ]]; then
+ run_all
+ kselftest_exit
+elif [[ $1 == "__subprocess" ]]; then
+ shift
+ run_one $@
+else
+ run_in_netns $@
+fi
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
new file mode 100644
index 000000000..f35a924d4
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -0,0 +1,409 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/errqueue.h>
+#include <linux/if_packet.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#ifndef UDP_GRO
+#define UDP_GRO 104
+#endif
+
+static int cfg_port = 8000;
+static bool cfg_tcp;
+static bool cfg_verify;
+static bool cfg_read_all;
+static bool cfg_gro_segment;
+static int cfg_family = PF_INET6;
+static int cfg_alen = sizeof(struct sockaddr_in6);
+static int cfg_expected_pkt_nr;
+static int cfg_expected_pkt_len;
+static int cfg_expected_gso_size;
+static int cfg_connect_timeout_ms;
+static int cfg_rcv_timeout_ms;
+static struct sockaddr_storage cfg_bind_addr;
+
+static bool interrupted;
+static unsigned long packets, bytes;
+
+/* Signal handler: flag the main loops to stop once SIGINT arrives. */
+static void sigint_handler(int signum)
+{
+	if (signum != SIGINT)
+		return;
+
+	interrupted = true;
+}
+
+/*
+ * Fill @sockaddr with family, cfg_port and the address parsed from
+ * @str_addr, as sockaddr_in for PF_INET or sockaddr_in6 for PF_INET6.
+ * Exits on an unparsable address or any other @domain.
+ */
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+	if (domain == PF_INET) {
+		struct sockaddr_in *sin = sockaddr;
+
+		sin->sin_family = AF_INET;
+		sin->sin_port = htons(cfg_port);
+		if (inet_pton(AF_INET, str_addr, &sin->sin_addr) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+	} else if (domain == PF_INET6) {
+		struct sockaddr_in6 *sin6 = sockaddr;
+
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = htons(cfg_port);
+		if (inet_pton(AF_INET6, str_addr, &sin6->sin6_addr) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+	} else {
+		error(1, 0, "illegal domain");
+	}
+}
+
+/* Return the current gettimeofday() time converted to milliseconds. */
+static unsigned long gettimeofday_ms(void)
+{
+	struct timeval now;
+	unsigned long ms;
+
+	gettimeofday(&now, NULL);
+
+	ms = now.tv_sec * 1000;
+	ms += now.tv_usec / 1000;
+	return ms;
+}
+
+/*
+ * Wait until @fd becomes readable, polling in 10 ms slices so that the
+ * 'interrupted' flag is noticed promptly.
+ *
+ * @timeout_ms: 0 waits indefinitely; otherwise 'interrupted' is set once
+ *              roughly that many milliseconds pass without an event.
+ *
+ * Exits the program on poll() failure or on any event other than POLLIN.
+ */
+static void do_poll(int fd, int timeout_ms)
+{
+	struct pollfd pfd;
+	int ret;
+
+	pfd.events = POLLIN;
+	pfd.revents = 0;
+	pfd.fd = fd;
+
+	do {
+		ret = poll(&pfd, 1, 10);
+		if (interrupted)
+			break;
+		if (ret == -1)
+			error(1, errno, "poll");
+		if (ret == 0) {
+			if (!timeout_ms)
+				continue;
+
+			timeout_ms -= 10;
+			if (timeout_ms <= 0) {
+				interrupted = true;
+				break;
+			}
+
+			/* no events and more time to wait, do poll again */
+			continue;
+		}
+		/* poll() succeeded here, so errno is stale: do not report it */
+		if (pfd.revents != POLLIN)
+			error(1, 0, "poll: 0x%x expected 0x%x\n",
+			      pfd.revents, POLLIN);
+	} while (!ret);
+}
+
+/*
+ * Create the receive socket: enlarge its receive buffer to 2 MB, enable
+ * SO_REUSEPORT and bind it to cfg_bind_addr.
+ *
+ * @do_tcp: create a stream socket, listen, wait up to
+ *          cfg_connect_timeout_ms for a client and return the accepted
+ *          connection; otherwise return the bound datagram socket.
+ *
+ * Exits with status 0 if interrupted (or timed out) before a client
+ * connects, and with an error on any failing system call.
+ */
+static int do_socket(bool do_tcp)
+{
+	int fd, val;
+
+	/* follow the parameter, not the cfg_tcp global, like the code below */
+	fd = socket(cfg_family, do_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket");
+
+	val = 1 << 21;
+	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
+		error(1, errno, "setsockopt rcvbuf");
+	val = 1;
+	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
+		error(1, errno, "setsockopt reuseport");
+
+	if (bind(fd, (void *)&cfg_bind_addr, cfg_alen))
+		error(1, errno, "bind");
+
+	if (do_tcp) {
+		int accept_fd = fd;
+
+		if (listen(accept_fd, 1))
+			error(1, errno, "listen");
+
+		do_poll(accept_fd, cfg_connect_timeout_ms);
+		if (interrupted)
+			exit(0);
+
+		fd = accept(accept_fd, NULL, NULL);
+		if (fd == -1)
+			error(1, errno, "accept");
+		if (close(accept_fd))
+			error(1, errno, "close accept fd");
+	}
+
+	return fd;
+}
+
+/*
+ * Drain the TCP receive queue of @fd without copying any data, adding what
+ * was discarded to the packets/bytes counters. Returns once the queue is
+ * empty; exits with status 0 when the peer has closed the connection.
+ */
+static void do_flush_tcp(int fd)
+{
+	int ret;
+
+	for (;;) {
+		/* MSG_TRUNC flushes up to len bytes */
+		ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
+		if (ret > 0) {
+			packets++;
+			bytes += ret;
+			continue;
+		}
+
+		if (ret == 0) {
+			/* client detached */
+			exit(0);
+		}
+
+		if (errno == EAGAIN)
+			return;
+
+		error(1, errno, "flush");
+	}
+}
+
+/* Map anything outside 'a'..'z' to '.' so it can be printed safely. */
+static char sanitized_char(char val)
+{
+	if (val < 'a' || val > 'z')
+		return '.';
+
+	return val;
+}
+
+/*
+ * Check that @data holds @len bytes of the cyclic pattern a, b, ..., z, a,
+ * ... starting at whatever lowercase letter data[0] contains. Exits with a
+ * diagnostic on the first mismatch. Reads data[0] unconditionally.
+ */
+static void do_verify_udp(const char *data, int len)
+{
+	char expect = data[0];
+	int pos;
+
+	/* verify contents */
+	if (expect < 'a' || expect > 'z')
+		error(1, 0, "data initial byte out of range");
+
+	for (pos = 1; pos < len; pos++) {
+		/* advance the pattern, wrapping from 'z' back to 'a' */
+		expect = (expect == 'z') ? 'a' : expect + 1;
+
+		if (data[pos] == expect)
+			continue;
+
+		error(1, 0, "data[%d]: len %d, %c(%hhu) != %c(%hhu)\n",
+		      pos, len,
+		      sanitized_char(data[pos]), data[pos],
+		      sanitized_char(expect), expect);
+	}
+}
+
+/*
+ * Non-blocking recvmsg() into @buf (@len bytes) with MSG_TRUNC set.
+ *
+ * *@gso_size receives the value of a UDP_GRO control message if one came
+ * with the data, and -1 otherwise. Returns what recvmsg() returned; on -1
+ * errno is left for the caller to inspect.
+ */
+static int recv_msg(int fd, char *buf, int len, int *gso_size)
+{
+	char control[CMSG_SPACE(sizeof(int))] = {0};
+	struct iovec iov = {0};
+	struct msghdr msg = {0};
+	struct cmsghdr *cmsg;
+	int ret;
+
+	iov.iov_base = buf;
+	iov.iov_len = len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+
+	*gso_size = -1;
+	ret = recvmsg(fd, &msg, MSG_TRUNC | MSG_DONTWAIT);
+	if (ret == -1)
+		return ret;
+
+	/* pick up the first UDP_GRO cmsg, if there is one */
+	cmsg = CMSG_FIRSTHDR(&msg);
+	while (cmsg != NULL) {
+		if (cmsg->cmsg_level == SOL_UDP &&
+		    cmsg->cmsg_type == UDP_GRO) {
+			*gso_size = *(int *)CMSG_DATA(cmsg);
+			break;
+		}
+		cmsg = CMSG_NXTHDR(&msg, cmsg);
+	}
+
+	return ret;
+}
+
+/*
+ * Flush outstanding datagrams from @fd, at most 256 per call.
+ *
+ * Payload is only copied out when cfg_read_all is set; MSG_TRUNC still
+ * yields the full datagram length otherwise. Depending on configuration,
+ * each datagram is checked against the expected length, the expected GRO
+ * segment size and (cfg_verify) the payload pattern. Stops early when the
+ * queue is empty or cfg_expected_pkt_nr packets have been counted.
+ */
+static void do_flush_udp(int fd)
+{
+	static char rbuf[ETH_MAX_MTU];
+	int ret, len, gso_size = 0, budget = 256;
+
+	len = cfg_read_all ? sizeof(rbuf) : 0;
+	while (budget--) {
+		/* MSG_TRUNC will make return value full datagram length */
+		if (!cfg_expected_gso_size)
+			ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+		else
+			ret = recv_msg(fd, rbuf, len, &gso_size);
+		if (ret == -1 && errno == EAGAIN)
+			break;
+		if (ret == -1)
+			error(1, errno, "recv");
+		if (cfg_expected_pkt_len && ret != cfg_expected_pkt_len)
+			error(1, 0, "recv: bad packet len, got %d,"
+			      " expected %d\n", ret, cfg_expected_pkt_len);
+		if (len && cfg_verify) {
+			/* recv succeeded, so errno is stale: do not report it */
+			if (ret == 0)
+				error(1, 0, "recv: 0 byte datagram\n");
+
+			do_verify_udp(rbuf, ret);
+		}
+		if (cfg_expected_gso_size && cfg_expected_gso_size != gso_size)
+			error(1, 0, "recv: bad gso size, got %d, expected %d "
+			      "(-1 == no gso cmsg)\n", gso_size,
+			      cfg_expected_gso_size);
+
+		packets++;
+		bytes += ret;
+		if (cfg_expected_pkt_nr && packets >= cfg_expected_pkt_nr)
+			break;
+	}
+}
+
+/* Print the command line synopsis for @filepath and exit with status 1. */
+static void usage(const char *filepath)
+{
+	error(1, 0,
+	      "Usage: %s [-C connect_timeout] [-Grtv] [-b addr] [-p port]"
+	      " [-l pktlen] [-n packetnr] [-R rcv_timeout]"
+	      " [-S gsosize]",
+	      filepath);
+}
+
+/*
+ * Parse the command line into the cfg_* globals and resolve the bind
+ * address:
+ *   -4          use IPv4 (default is IPv6)
+ *   -b addr     bind address (default: the wildcard address of the family)
+ *   -C ms       timeout used while waiting for the first connection/packet
+ *   -G          enable UDP_GRO on the socket
+ *   -l len      expected length of every datagram
+ *   -n nr       expected number of packets
+ *   -p port     port to bind to
+ *   -r          copy the payload out of the kernel
+ *   -R ms       receive timeout
+ *   -S size     expected GRO segment size
+ *   -t          TCP instead of UDP
+ *   -v          verify the payload pattern (implies -r)
+ * Unknown options and stray arguments print the usage text and exit.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	const char *bind_addr = NULL;
+	int c;
+
+	while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) {
+		switch (c) {
+		case '4':
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case 'b':
+			bind_addr = optarg;
+			break;
+		case 'C':
+			cfg_connect_timeout_ms = strtoul(optarg, NULL, 0);
+			break;
+		case 'G':
+			cfg_gro_segment = true;
+			break;
+		case 'l':
+			cfg_expected_pkt_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'n':
+			cfg_expected_pkt_nr = strtoul(optarg, NULL, 0);
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'r':
+			cfg_read_all = true;
+			break;
+		case 'R':
+			cfg_rcv_timeout_ms = strtoul(optarg, NULL, 0);
+			break;
+		case 'S':
+			cfg_expected_gso_size = strtol(optarg, NULL, 0);
+			break;
+		case 't':
+			cfg_tcp = true;
+			break;
+		case 'v':
+			cfg_verify = true;
+			cfg_read_all = true;
+			break;
+		default:
+			/* same exit status (1) as before, plus the synopsis */
+			usage(argv[0]);
+		}
+	}
+
+	if (!bind_addr)
+		bind_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0";
+
+	/* done after the loop so that -p and -4 apply in any order */
+	setup_sockaddr(cfg_family, bind_addr, &cfg_bind_addr);
+
+	if (optind != argc)
+		usage(argv[0]);
+
+	if (cfg_tcp && cfg_verify)
+		error(1, 0, "TODO: implement verify mode for tcp");
+}
+
+/*
+ * Main receive loop: open the socket, then poll and flush until
+ * 'interrupted' is set (SIGINT or a poll timeout), printing the byte and
+ * call counters to stderr about once per second.
+ *
+ * For UDP the first wait uses the connect timeout; for TCP that timeout
+ * has already been applied inside do_socket(). Later waits use the receive
+ * timeout. If an expected packet count is configured, the counter is
+ * compared against it after the loop.
+ */
+static void do_recv(void)
+{
+	int timeout_ms = cfg_tcp ? cfg_rcv_timeout_ms : cfg_connect_timeout_ms;
+	unsigned long tnow, treport;
+	int fd;
+
+	fd = do_socket(cfg_tcp);
+
+	if (cfg_gro_segment && !cfg_tcp) {
+		int val = 1;
+		if (setsockopt(fd, IPPROTO_UDP, UDP_GRO, &val, sizeof(val)))
+			error(1, errno, "setsockopt UDP_GRO");
+	}
+
+	treport = gettimeofday_ms() + 1000;
+	do {
+		do_poll(fd, timeout_ms);
+
+		if (cfg_tcp)
+			do_flush_tcp(fd);
+		else
+			do_flush_udp(fd);
+
+		tnow = gettimeofday_ms();
+		if (tnow > treport) {
+			if (packets)
+				fprintf(stderr,
+					"%s rx: %6lu MB/s %8lu calls/s\n",
+					cfg_tcp ? "tcp" : "udp",
+					bytes >> 20, packets);
+			/* counters are per reporting interval */
+			bytes = packets = 0;
+			treport = tnow + 1000;
+		}
+
+		timeout_ms = cfg_rcv_timeout_ms;
+
+	} while (!interrupted);
+
+	/* packets is unsigned long: print it with %lu, not %ld */
+	if (cfg_expected_pkt_nr && (packets != cfg_expected_pkt_nr))
+		error(1, 0, "wrong packet number! got %lu, expected %d\n",
+		      packets, cfg_expected_pkt_nr);
+
+	if (close(fd))
+		error(1, errno, "close");
+}
+
+/*
+ * Entry point: parse the options, arrange for SIGINT to stop the receive
+ * loop, then receive until interrupted or timed out.
+ */
+int main(int argc, char **argv)
+{
+	parse_opts(argc, argv);
+	signal(SIGINT, sigint_handler);
+	do_recv();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
new file mode 100644
index 000000000..477392715
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/errqueue.h>
+#include <linux/net_tstamp.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/poll.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT 103
+#endif
+
+#ifndef SO_ZEROCOPY
+#define SO_ZEROCOPY 60
+#endif
+
+#ifndef SO_EE_ORIGIN_ZEROCOPY
+#define SO_EE_ORIGIN_ZEROCOPY 5
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+#ifndef ENOTSUPP
+#define ENOTSUPP 524
+#endif
+
+/* number of payload buffers; main() rotates through them when -c is given */
+#define NUM_PKT 100
+
+/* Configuration, filled in by parse_opts(). */
+static bool cfg_cache_trash;
+static int cfg_cpu = -1;
+static int cfg_connected = true;
+static int cfg_family = PF_UNSPEC;
+static uint16_t cfg_mss;
+static int cfg_payload_len = (1472 * 42);
+static int cfg_port = 8000;
+static int cfg_runtime_ms = -1;
+static bool cfg_poll;
+static int cfg_poll_loop_timeout_ms = 2000;
+static bool cfg_segment;
+static bool cfg_sendmmsg;
+static bool cfg_tcp;
+static uint32_t cfg_tx_ts = SOF_TIMESTAMPING_TX_SOFTWARE;
+static bool cfg_tx_tstamp;
+static bool cfg_audit;
+static bool cfg_verbose;
+static bool cfg_zerocopy;
+static int cfg_msg_nr;
+static uint16_t cfg_gso_size;
+/* Run-time counters; the totals are only accumulated in audit mode (-a). */
+static unsigned long total_num_msgs;
+static unsigned long total_num_sends;
+static unsigned long stat_tx_ts;
+static unsigned long stat_tx_ts_errors;
+static unsigned long tstart;
+static unsigned long tend;
+static unsigned long stat_zcopies;
+
+/* Destination address and its length, set up by parse_opts(). */
+static socklen_t cfg_alen;
+static struct sockaddr_storage cfg_dst_addr;
+
+/*
+ * Set from the SIGINT handler and polled by the send loop in main().
+ * volatile sig_atomic_t is the object type ISO C guarantees can be written
+ * from an asynchronous signal handler and observed by the interrupted code.
+ */
+static volatile sig_atomic_t interrupted;
+
+/* NUM_PKT payload buffers; main() fills them with a repeating a-z pattern. */
+static char buf[NUM_PKT][ETH_MAX_MTU];
+
+/* SIGINT handler: ask the send loop to stop after the current iteration. */
+static void sigint_handler(int signum)
+{
+	if (signum == SIGINT)
+		interrupted = 1;
+}
+
+/* Return the current gettimeofday() time expressed in milliseconds. */
+static unsigned long gettimeofday_ms(void)
+{
+	struct timeval now;
+
+	gettimeofday(&now, NULL);
+
+	/* whole seconds scaled up, plus the sub-second part scaled down */
+	return (now.tv_usec / 1000) + (now.tv_sec * 1000);
+}
+
+/*
+ * Pin the calling process to @cpu.
+ *
+ * Exits with status 1 if the affinity mask cannot be applied; otherwise
+ * always returns 0.
+ */
+static int set_cpu(int cpu)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+	/* pass errno so the reason for the failure is part of the message */
+	if (sched_setaffinity(0, sizeof(mask), &mask))
+		error(1, errno, "setaffinity %d", cpu);
+
+	return 0;
+}
+
+/*
+ * Fill @sockaddr with @str_addr and cfg_port for the given @domain.
+ *
+ * @domain:   PF_INET or PF_INET6; anything else is a fatal error
+ * @str_addr: textual address, parsed with inet_pton()
+ * @sockaddr: storage large enough for the matching sockaddr_in{,6}
+ */
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+	if (domain == PF_INET) {
+		struct sockaddr_in *sin = sockaddr;
+
+		sin->sin_family = AF_INET;
+		sin->sin_port = htons(cfg_port);
+		if (inet_pton(AF_INET, str_addr, &sin->sin_addr) != 1)
+			error(1, 0, "ipv4 parse error: %s", str_addr);
+	} else if (domain == PF_INET6) {
+		struct sockaddr_in6 *sin6 = sockaddr;
+
+		sin6->sin6_family = AF_INET6;
+		sin6->sin6_port = htons(cfg_port);
+		if (inet_pton(AF_INET6, str_addr, &sin6->sin6_addr) != 1)
+			error(1, 0, "ipv6 parse error: %s", str_addr);
+	} else {
+		error(1, 0, "illegal domain");
+	}
+}
+
+/*
+ * Account for one control message read from the socket error queue.
+ *
+ * - SOL_SOCKET/SO_TIMESTAMPING: a zero tv_sec in the inspected timestamp
+ *   slot (index 2 when hardware timestamps were requested, index 0
+ *   otherwise) is counted in stat_tx_ts_errors. Any other SOL_SOCKET
+ *   type is fatal.
+ * - SOL_IP/SOL_IPV6 with IP_RECVERR/IPV6_RECVERR: dispatch on ee_origin,
+ *   counting tx timestamps in stat_tx_ts and zerocopy completions in
+ *   stat_zcopies.
+ * - Anything else is reported and ignored.
+ */
+static void flush_cmsg(struct cmsghdr *cmsg)
+{
+	struct sock_extended_err *err;
+	struct scm_timestamping *tss;
+	__u32 lo;
+	__u32 hi;
+	int i;
+
+	switch (cmsg->cmsg_level) {
+	case SOL_SOCKET:
+		if (cmsg->cmsg_type == SO_TIMESTAMPING) {
+			i = (cfg_tx_ts == SOF_TIMESTAMPING_TX_HARDWARE) ? 2 : 0;
+			tss = (struct scm_timestamping *)CMSG_DATA(cmsg);
+			if (tss->ts[i].tv_sec == 0)
+				stat_tx_ts_errors++;
+		} else {
+			error(1, 0, "unknown SOL_SOCKET cmsg type=%u\n",
+			      cmsg->cmsg_type);
+		}
+		break;
+	case SOL_IP:
+	case SOL_IPV6:
+		switch (cmsg->cmsg_type) {
+		case IP_RECVERR:
+		case IPV6_RECVERR:
+			err = (struct sock_extended_err *)CMSG_DATA(cmsg);
+			switch (err->ee_origin) {
+			case SO_EE_ORIGIN_TIMESTAMPING:
+				/* Got a TX timestamp from error queue */
+				stat_tx_ts++;
+				break;
+			case SO_EE_ORIGIN_ICMP:
+			case SO_EE_ORIGIN_ICMP6:
+				if (cfg_verbose)
+					fprintf(stderr,
+						"received ICMP error: type=%u, code=%u\n",
+						err->ee_type, err->ee_code);
+				break;
+			case SO_EE_ORIGIN_ZEROCOPY:
+				lo = err->ee_info;
+				hi = err->ee_data;
+				/* range of IDs acknowledged */
+				stat_zcopies += hi - lo + 1;
+				break;
+			case SO_EE_ORIGIN_LOCAL:
+				if (cfg_verbose)
+					fprintf(stderr,
+						"received packet with local origin: %u\n",
+						err->ee_origin);
+				break;
+			default:
+				/*
+				 * error(status, errnum, ...): status 0 keeps
+				 * this non-fatal; errnum 0 avoids appending an
+				 * unrelated strerror() text to the message.
+				 */
+				error(0, 0, "received packet with origin: %u",
+				      err->ee_origin);
+			}
+			break;
+		default:
+			/* non-fatal, no errno text: see comment above */
+			error(0, 0, "unknown IP msg type=%u\n",
+			      cmsg->cmsg_type);
+			break;
+		}
+		break;
+	default:
+		/* non-fatal, no errno text: see comment above */
+		error(0, 0, "unknown cmsg level=%u\n",
+		      cmsg->cmsg_level);
+	}
+}
+
+/*
+ * Drain the error queue of @fd: keep calling recvmsg(MSG_ERRQUEUE) until it
+ * fails with EAGAIN. Any other recvmsg() error, or a message whose flags are
+ * not exactly MSG_ERRQUEUE, is fatal. The control messages are only
+ * inspected (via flush_cmsg()) in audit mode.
+ */
+static void flush_errqueue_recv(int fd)
+{
+ /* room for a timestamp, an extended error and an IPv6 address */
+ char control[CMSG_SPACE(sizeof(struct scm_timestamping)) +
+ CMSG_SPACE(sizeof(struct sock_extended_err)) +
+ CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0};
+ struct msghdr msg = {0};
+ struct cmsghdr *cmsg;
+ int ret;
+
+ while (1) {
+ /* re-arm the control buffer before every call */
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ /* EAGAIN ends the drain loop */
+ if (ret == -1 && errno == EAGAIN)
+ break;
+ if (ret == -1)
+ error(1, errno, "errqueue");
+ if (msg.msg_flags != MSG_ERRQUEUE)
+ error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
+ if (cfg_audit) {
+ for (cmsg = CMSG_FIRSTHDR(&msg);
+ cmsg;
+ cmsg = CMSG_NXTHDR(&msg, cmsg))
+ flush_cmsg(cmsg);
+ }
+ msg.msg_flags = 0;
+ }
+}
+
+/*
+ * Optionally wait for activity on @fd, then drain its error queue.
+ *
+ * @do_poll:      poll() on the socket before draining
+ * @poll_timeout: poll() timeout in milliseconds
+ * @poll_err:     log a poll timeout (verbose mode only)
+ */
+static void flush_errqueue(int fd, const bool do_poll,
+			   unsigned long poll_timeout, const bool poll_err)
+{
+	if (do_poll) {
+		/* no events are requested in .events */
+		struct pollfd pfd = { .fd = fd };
+		int nready = poll(&pfd, 1, poll_timeout);
+
+		if (nready < 0)
+			error(1, errno, "poll");
+		if (nready == 0 && cfg_verbose && poll_err)
+			fprintf(stderr, "poll timeout\n");
+	}
+
+	flush_errqueue_recv(fd);
+}
+
+/*
+ * Keep polling and draining the error queue of @fd until stat_zcopies
+ * equals @num_sends or cfg_poll_loop_timeout_ms has elapsed, whichever
+ * comes first. At least one attempt is always made.
+ */
+static void flush_errqueue_retry(int fd, unsigned long num_sends)
+{
+ unsigned long tnow, tstop;
+ bool first_try = true;
+
+ tnow = gettimeofday_ms();
+ tstop = tnow + cfg_poll_loop_timeout_ms;
+ do {
+ /* wait at most for the time left; only the first try logs a timeout */
+ flush_errqueue(fd, true, tstop - tnow, first_try);
+ first_try = false;
+ tnow = gettimeofday_ms();
+ } while ((stat_zcopies != num_sends) && (tnow < tstop));
+}
+
+/*
+ * Write cfg_payload_len bytes from @data to the stream socket @fd,
+ * resuming after short writes. A failing send() is fatal.
+ *
+ * Returns the number of send() calls that were needed.
+ */
+static int send_tcp(int fd, char *data)
+{
+	const int flags = cfg_zerocopy ? MSG_ZEROCOPY : 0;
+	int sent = 0, calls = 0;
+
+	for (; sent < cfg_payload_len; calls++) {
+		int n = send(fd, data + sent, cfg_payload_len - sent, flags);
+
+		if (n == -1)
+			error(1, errno, "write");
+
+		sent += n;
+	}
+
+	return calls;
+}
+
+/*
+ * Send cfg_payload_len bytes as a series of datagrams of at most cfg_mss
+ * bytes each, one sendto() per datagram. Every datagram starts at @data.
+ * The destination address is only passed for unconnected sockets.
+ *
+ * A failed or short write is fatal. Returns the number of sendto() calls.
+ */
+static int send_udp(int fd, char *data)
+{
+	int ret, total_len, len, count = 0;
+
+	total_len = cfg_payload_len;
+
+	while (total_len) {
+		len = total_len < cfg_mss ? total_len : cfg_mss;
+
+		ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
+			     cfg_connected ? NULL : (void *)&cfg_dst_addr,
+			     cfg_connected ? 0 : cfg_alen);
+		if (ret == -1)
+			error(1, errno, "write");
+		/*
+		 * A short write is not a syscall failure, so errno is stale
+		 * here: report the lengths only, as signed ints.
+		 */
+		if (ret != len)
+			error(1, 0, "write: %dB != %dB\n", ret, len);
+
+		total_len -= len;
+		count++;
+	}
+
+	return count;
+}
+
+/*
+ * Fill @cm as a SOL_SOCKET/SO_TIMESTAMPING control message whose payload
+ * is the configured tx timestamp flag (cfg_tx_ts).
+ */
+static void send_ts_cmsg(struct cmsghdr *cm)
+{
+	cm->cmsg_len = CMSG_LEN(sizeof(cfg_tx_ts));
+	cm->cmsg_level = SOL_SOCKET;
+	cm->cmsg_type = SO_TIMESTAMPING;
+
+	/* copy the 32-bit flag word into the cmsg payload */
+	memcpy(CMSG_DATA(cm), &cfg_tx_ts, sizeof(cfg_tx_ts));
+}
+
+/*
+ * Split cfg_payload_len bytes starting at @data into messages of at most
+ * cfg_mss bytes and hand them to the kernel with a single sendmmsg() call.
+ * With tx timestamping enabled, every message points at the same
+ * SO_TIMESTAMPING control buffer.
+ *
+ * A sendmmsg() failure is fatal. Returns the value returned by sendmmsg().
+ */
+static int send_udp_sendmmsg(int fd, char *data)
+{
+ char control[CMSG_SPACE(sizeof(cfg_tx_ts))] = {0};
+ /* upper bound on the number of messages built below */
+ const int max_nr_msg = ETH_MAX_MTU / ETH_DATA_LEN;
+ struct mmsghdr mmsgs[max_nr_msg];
+ struct iovec iov[max_nr_msg];
+ unsigned int off = 0, left;
+ size_t msg_controllen = 0;
+ int i = 0, ret;
+
+ memset(mmsgs, 0, sizeof(mmsgs));
+
+ if (cfg_tx_tstamp) {
+ /* temporary msghdr, used only to locate the first cmsg in control[] */
+ struct msghdr msg = {0};
+ struct cmsghdr *cmsg;
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+ cmsg = CMSG_FIRSTHDR(&msg);
+ send_ts_cmsg(cmsg);
+ msg_controllen += CMSG_SPACE(sizeof(cfg_tx_ts));
+ }
+
+ left = cfg_payload_len;
+ while (left) {
+ if (i == max_nr_msg)
+ error(1, 0, "sendmmsg: exceeds max_nr_msg");
+
+ /* one iovec per message, each covering the next slice of @data */
+ iov[i].iov_base = data + off;
+ iov[i].iov_len = cfg_mss < left ? cfg_mss : left;
+
+ mmsgs[i].msg_hdr.msg_iov = iov + i;
+ mmsgs[i].msg_hdr.msg_iovlen = 1;
+
+ mmsgs[i].msg_hdr.msg_name = (void *)&cfg_dst_addr;
+ mmsgs[i].msg_hdr.msg_namelen = cfg_alen;
+ /* msg_controllen is non-zero only when tx timestamping is on */
+ if (msg_controllen) {
+ mmsgs[i].msg_hdr.msg_control = control;
+ mmsgs[i].msg_hdr.msg_controllen = msg_controllen;
+ }
+
+ off += iov[i].iov_len;
+ left -= iov[i].iov_len;
+ i++;
+ }
+
+ ret = sendmmsg(fd, mmsgs, i, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "sendmmsg");
+
+ return ret;
+}
+
+/*
+ * Fill @cm as a SOL_UDP/UDP_SEGMENT control message whose payload is the
+ * configured segment size (cfg_gso_size).
+ */
+static void send_udp_segment_cmsg(struct cmsghdr *cm)
+{
+	cm->cmsg_len = CMSG_LEN(sizeof(cfg_gso_size));
+	cm->cmsg_level = SOL_UDP;
+	cm->cmsg_type = UDP_SEGMENT;
+
+	/* copy the 16-bit segment size into the cmsg payload */
+	memcpy(CMSG_DATA(cm), &cfg_gso_size, sizeof(cfg_gso_size));
+}
+
+/*
+ * Send cfg_payload_len bytes from @data in one sendmsg() call, attaching a
+ * UDP_SEGMENT control message carrying cfg_gso_size and, when tx
+ * timestamping is enabled, an SO_TIMESTAMPING control message as well.
+ *
+ * A failed or short sendmsg() is fatal. Always returns 1 (one send call).
+ */
+static int send_udp_segment(int fd, char *data)
+{
+	char control[CMSG_SPACE(sizeof(cfg_gso_size)) +
+		     CMSG_SPACE(sizeof(cfg_tx_ts))] = {0};
+	struct msghdr msg = {0};
+	struct iovec iov = {0};
+	size_t msg_controllen;
+	struct cmsghdr *cmsg;
+	int ret;
+
+	iov.iov_base = data;
+	iov.iov_len = cfg_payload_len;
+
+	msg.msg_iov = &iov;
+	msg.msg_iovlen = 1;
+
+	/* expose the whole buffer while the cmsgs are being laid out */
+	msg.msg_control = control;
+	msg.msg_controllen = sizeof(control);
+	cmsg = CMSG_FIRSTHDR(&msg);
+	send_udp_segment_cmsg(cmsg);
+	/* size the space by the value actually stored: cfg_gso_size */
+	msg_controllen = CMSG_SPACE(sizeof(cfg_gso_size));
+	if (cfg_tx_tstamp) {
+		cmsg = CMSG_NXTHDR(&msg, cmsg);
+		send_ts_cmsg(cmsg);
+		msg_controllen += CMSG_SPACE(sizeof(cfg_tx_ts));
+	}
+
+	/* shrink to the part of the buffer that is really in use */
+	msg.msg_controllen = msg_controllen;
+	msg.msg_name = (void *)&cfg_dst_addr;
+	msg.msg_namelen = cfg_alen;
+
+	ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+	if (ret == -1)
+		error(1, errno, "sendmsg");
+	/* ret is non-negative here, so the cast cannot wrap */
+	if ((size_t)ret != iov.iov_len)
+		error(1, 0, "sendmsg: %d != %llu\n", ret,
+		      (unsigned long long)iov.iov_len);
+
+	return 1;
+}
+
+/* Print the usage line for @filepath via error() with exit status 1. */
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [-46acmHPtTuvz] [-C cpu] [-D dst ip] [-l secs] "
+ "[-L secs] [-M messagenr] [-p port] [-s sendsize] [-S gsosize]",
+ filepath);
+}
+
+/*
+ * Parse the command line into the cfg_* globals, build the destination
+ * address and derive cfg_mss / the default cfg_gso_size.
+ *
+ * Any invalid option or option combination terminates the program with
+ * exit status 1.
+ */
+static void parse_opts(int argc, char **argv)
+{
+	const char *dst_addr = NULL;
+	int max_len, hdrlen;
+	int c;
+
+	while ((c = getopt(argc, argv, "46acC:D:Hl:L:mM:p:s:PS:tTuvz")) != -1) {
+		switch (c) {
+		case '4':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET;
+			cfg_alen = sizeof(struct sockaddr_in);
+			break;
+		case '6':
+			if (cfg_family != PF_UNSPEC)
+				error(1, 0, "Pass one of -4 or -6");
+			cfg_family = PF_INET6;
+			cfg_alen = sizeof(struct sockaddr_in6);
+			break;
+		case 'a':
+			cfg_audit = true;
+			break;
+		case 'c':
+			cfg_cache_trash = true;
+			break;
+		case 'C':
+			cfg_cpu = strtol(optarg, NULL, 0);
+			break;
+		case 'D':
+			dst_addr = optarg;
+			break;
+		case 'l':
+			/* given in seconds, stored in milliseconds */
+			cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
+			break;
+		case 'L':
+			/* given in seconds, stored in milliseconds */
+			cfg_poll_loop_timeout_ms = strtoul(optarg, NULL, 10) * 1000;
+			break;
+		case 'm':
+			cfg_sendmmsg = true;
+			break;
+		case 'M':
+			cfg_msg_nr = strtoul(optarg, NULL, 10);
+			break;
+		case 'p':
+			cfg_port = strtoul(optarg, NULL, 0);
+			break;
+		case 'P':
+			cfg_poll = true;
+			break;
+		case 's':
+			cfg_payload_len = strtoul(optarg, NULL, 0);
+			break;
+		case 'S':
+			cfg_gso_size = strtoul(optarg, NULL, 0);
+			cfg_segment = true;
+			break;
+		case 'H':
+			/* hardware tx timestamps imply tx timestamping */
+			cfg_tx_ts = SOF_TIMESTAMPING_TX_HARDWARE;
+			cfg_tx_tstamp = true;
+			break;
+		case 't':
+			cfg_tcp = true;
+			break;
+		case 'T':
+			cfg_tx_tstamp = true;
+			break;
+		case 'u':
+			cfg_connected = false;
+			break;
+		case 'v':
+			cfg_verbose = true;
+			break;
+		case 'z':
+			cfg_zerocopy = true;
+			break;
+		default:
+			exit(1);
+		}
+	}
+
+	if (optind != argc)
+		usage(argv[0]);
+
+	/*
+	 * Validate the family before building the address: setup_sockaddr()
+	 * dies with a terse "illegal domain" on PF_UNSPEC, which would
+	 * otherwise hide this more helpful message.
+	 */
+	if (cfg_family == PF_UNSPEC)
+		error(1, 0, "must pass one of -4 or -6");
+
+	/* without -D, fall back to the wildcard address of the family */
+	if (!dst_addr)
+		dst_addr = cfg_family == PF_INET6 ? "::" : "0.0.0.0";
+
+	/* runs after the option loop so that -p is honoured in any position */
+	setup_sockaddr(cfg_family, dst_addr, &cfg_dst_addr);
+
+	if (cfg_tcp && !cfg_connected)
+		error(1, 0, "connectionless tcp makes no sense");
+	if (cfg_segment && cfg_sendmmsg)
+		error(1, 0, "cannot combine segment offload and sendmmsg");
+	if (cfg_tx_tstamp && !(cfg_segment || cfg_sendmmsg))
+		error(1, 0, "Options -T and -H require either -S or -m option");
+
+	if (cfg_family == PF_INET)
+		hdrlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+	else
+		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+
+	cfg_mss = ETH_DATA_LEN - hdrlen;
+	max_len = ETH_MAX_MTU - hdrlen;
+	/* without -S, segments default to one full-sized datagram payload */
+	if (!cfg_gso_size)
+		cfg_gso_size = cfg_mss;
+
+	if (cfg_payload_len > max_len)
+		error(1, 0, "payload length %u exceeds max %u",
+		      cfg_payload_len, max_len);
+}
+
+/*
+ * Set the path MTU discovery mode of @fd to the *_PMTUDISC_DO value of the
+ * address family selected by @is_ipv4. A setsockopt() failure is fatal.
+ */
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+	const int level = is_ipv4 ? SOL_IP : SOL_IPV6;
+	const int name = is_ipv4 ? IP_MTU_DISCOVER : IPV6_MTU_DISCOVER;
+	int val = is_ipv4 ? IP_PMTUDISC_DO : IPV6_PMTUDISC_DO;
+
+	if (setsockopt(fd, level, name, &val, sizeof(val)))
+		error(1, errno, "setsockopt path mtu");
+}
+
+/*
+ * Enable SO_TIMESTAMPING on @fd with the OPT_CMSG, OPT_ID and OPT_TSONLY
+ * options, reporting software or raw hardware timestamps depending on
+ * cfg_tx_ts. A setsockopt() failure is fatal.
+ */
+static void set_tx_timestamping(int fd)
+{
+	int flags;
+
+	if (cfg_tx_ts == SOF_TIMESTAMPING_TX_SOFTWARE)
+		flags = SOF_TIMESTAMPING_SOFTWARE;
+	else
+		flags = SOF_TIMESTAMPING_RAW_HARDWARE;
+
+	flags |= SOF_TIMESTAMPING_OPT_CMSG;
+	flags |= SOF_TIMESTAMPING_OPT_ID;
+	flags |= SOF_TIMESTAMPING_OPT_TSONLY;
+
+	if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags)))
+		error(1, errno, "setsockopt tx timestamping");
+}
+
+/*
+ * Print the end-of-run summary for audit mode and verify the counters.
+ *
+ * @num_msgs:  number of messages sent over the whole run
+ * @num_sends: number of send calls made over the whole run
+ *
+ * Does nothing when the measured duration (tend - tstart) is zero.
+ * Exits with status 1 if tx timestamp errors were seen, or if the number
+ * of tx timestamps or zerocopy completions differs from @num_sends.
+ */
+static void print_audit_report(unsigned long num_msgs, unsigned long num_sends)
+{
+ unsigned long tdelta;
+
+ /* duration in milliseconds; also the divisor below */
+ tdelta = tend - tstart;
+ if (!tdelta)
+ return;
+
+ fprintf(stderr, "Summary over %lu.%03lu seconds...\n",
+ tdelta / 1000, tdelta % 1000);
+ /* (bytes >> 10) / milliseconds is printed as the MB/s figure */
+ fprintf(stderr,
+ "sum %s tx: %6lu MB/s %10lu calls (%lu/s) %10lu msgs (%lu/s)\n",
+ cfg_tcp ? "tcp" : "udp",
+ ((num_msgs * cfg_payload_len) >> 10) / tdelta,
+ num_sends, num_sends * 1000 / tdelta,
+ num_msgs, num_msgs * 1000 / tdelta);
+
+ if (cfg_tx_tstamp) {
+ if (stat_tx_ts_errors)
+ error(1, 0,
+ "Expected clean TX Timestamps: %9lu msgs received %6lu errors",
+ stat_tx_ts, stat_tx_ts_errors);
+ if (stat_tx_ts != num_sends)
+ error(1, 0,
+ "Unexpected number of TX Timestamps: %9lu expected %9lu received",
+ num_sends, stat_tx_ts);
+ fprintf(stderr,
+ "Tx Timestamps: %19lu received %17lu errors\n",
+ stat_tx_ts, stat_tx_ts_errors);
+ }
+
+ if (cfg_zerocopy) {
+ if (stat_zcopies != num_sends)
+ error(1, 0, "Unexpected number of Zerocopy completions: %9lu expected %9lu received",
+ num_sends, stat_zcopies);
+ fprintf(stderr,
+ "Zerocopy acks: %19lu\n",
+ stat_zcopies);
+ }
+}
+
+/*
+ * Print the periodic throughput line for the interval that just ended and,
+ * in audit mode, add the interval counters to the run totals.
+ */
+static void print_report(unsigned long num_msgs, unsigned long num_sends)
+{
+	const char *proto = cfg_tcp ? "tcp" : "udp";
+	unsigned long mbytes = (num_msgs * cfg_payload_len) >> 20;
+
+	fprintf(stderr,
+		"%s tx: %6lu MB/s %8lu calls/s %6lu msg/s\n",
+		proto, mbytes, num_sends, num_msgs);
+
+	if (!cfg_audit)
+		return;
+
+	total_num_sends += num_sends;
+	total_num_msgs += num_msgs;
+}
+
+/*
+ * Sender entry point: parse options, prepare the payload buffers and the
+ * socket, then send messages in a loop until interrupted, until the
+ * configured runtime (-l) has elapsed or until the requested number of
+ * messages (-M) has been sent. A throughput line is printed once per
+ * second; audit mode (-a) prints a summary at the end.
+ */
+int main(int argc, char **argv)
+{
+	unsigned long num_msgs, num_sends;
+	unsigned long tnow, treport, tstop;
+	int fd, i, val, ret;
+
+	parse_opts(argc, argv);
+
+	/* cfg_cpu defaults to -1 (no pinning); CPU 0 is a valid choice */
+	if (cfg_cpu >= 0)
+		set_cpu(cfg_cpu);
+
+	/* fill the first buffer with a repeating a-z pattern, then clone it */
+	for (i = 0; i < (int)sizeof(buf[0]); i++)
+		buf[0][i] = 'a' + (i % 26);
+	for (i = 1; i < NUM_PKT; i++)
+		memcpy(buf[i], buf[0], sizeof(buf[0]));
+
+	signal(SIGINT, sigint_handler);
+
+	fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket");
+
+	if (cfg_zerocopy) {
+		val = 1;
+
+		ret = setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY,
+				 &val, sizeof(val));
+		if (ret) {
+			/* missing kernel support is a skip, not a failure */
+			if (errno == ENOPROTOOPT || errno == ENOTSUPP) {
+				fprintf(stderr, "SO_ZEROCOPY not supported\n");
+				exit(KSFT_SKIP);
+			}
+			error(1, errno, "setsockopt zerocopy");
+		}
+	}
+
+	if (cfg_connected &&
+	    connect(fd, (void *)&cfg_dst_addr, cfg_alen))
+		error(1, errno, "connect");
+
+	if (cfg_segment)
+		set_pmtu_discover(fd, cfg_family == PF_INET);
+
+	if (cfg_tx_tstamp)
+		set_tx_timestamping(fd);
+
+	num_msgs = num_sends = 0;
+	tnow = gettimeofday_ms();
+	tstart = tnow;
+	tend = tnow;
+	tstop = tnow + cfg_runtime_ms;
+	treport = tnow + 1000;
+
+	i = 0;
+	do {
+		if (cfg_tcp)
+			num_sends += send_tcp(fd, buf[i]);
+		else if (cfg_segment)
+			num_sends += send_udp_segment(fd, buf[i]);
+		else if (cfg_sendmmsg)
+			num_sends += send_udp_sendmmsg(fd, buf[i]);
+		else
+			num_sends += send_udp(fd, buf[i]);
+		num_msgs++;
+		/* zerocopy: drain every 16 messages; timestamps: every time */
+		if ((cfg_zerocopy && ((num_msgs & 0xF) == 0)) || cfg_tx_tstamp)
+			flush_errqueue(fd, cfg_poll, 500, true);
+
+		if (cfg_msg_nr && num_msgs >= cfg_msg_nr)
+			break;
+
+		tnow = gettimeofday_ms();
+		if (tnow >= treport) {
+			/* the counters are per reporting interval */
+			print_report(num_msgs, num_sends);
+			num_msgs = num_sends = 0;
+			treport = tnow + 1000;
+		}
+
+		/* cold cache when writing buffer */
+		if (cfg_cache_trash)
+			i = (i + 1) % NUM_PKT;
+
+	} while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop));
+
+	/*
+	 * stat_zcopies counts completions over the whole run, so compare it
+	 * with the whole-run number of sends: the totals folded in by
+	 * print_report() (audit mode only, otherwise 0) plus the current,
+	 * not yet reported interval.
+	 */
+	if (cfg_zerocopy || cfg_tx_tstamp)
+		flush_errqueue_retry(fd, total_num_sends + num_sends);
+
+	if (close(fd))
+		error(1, errno, "close");
+
+	if (cfg_audit) {
+		tend = tnow;
+		/* fold in the last, not yet reported interval */
+		total_num_msgs += num_msgs;
+		total_num_sends += num_sends;
+		print_audit_report(total_num_msgs, total_num_sends);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/net/vrf-xfrm-tests.sh b/tools/testing/selftests/net/vrf-xfrm-tests.sh
new file mode 100755
index 000000000..452638ae8
--- /dev/null
+++ b/tools/testing/selftests/net/vrf-xfrm-tests.sh
@@ -0,0 +1,436 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Various combinations of VRF with xfrms and qdisc.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+PAUSE_ON_FAIL=no
+VERBOSE=0
+ret=0
+
+HOST1_4=192.168.1.1
+HOST2_4=192.168.1.2
+HOST1_6=2001:db8:1::1
+HOST2_6=2001:db8:1::2
+
+XFRM1_4=10.0.1.1
+XFRM2_4=10.0.1.2
+XFRM1_6=fc00:1000::1
+XFRM2_6=fc00:1000::2
+IF_ID=123
+
+VRF=red
+TABLE=300
+
+AUTH_1=0xd94fcfea65fddf21dc6e0d24a0253508
+AUTH_2=0xdc6e0d24a0253508d94fcfea65fddf21
+ENC_1=0xfc46c20f8048be9725930ff3fb07ac2a91f0347dffeacf62
+ENC_2=0x3fb07ac2a91f0347dffeacf62fc46c20f8048be9725930ff
+SPI_1=0x02122b77
+SPI_2=0x2b770212
+
+which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)
+
+################################################################################
+#
+# Record and print the result of one test.
+#   $1 - return code of the command under test
+#   $2 - expected return code
+#   $3 - test description
+# Updates the nsuccess/nfail counters and sets ret=1 on failure. With
+# PAUSE_ON_FAIL=yes, waits for input after a failure; 'q' exits the script.
+log_test()
+{
+	local rc=$1 expected=$2 msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		printf "TEST: %-60s [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
+		return
+	fi
+
+	ret=1
+	nfail=$((nfail+1))
+	printf "TEST: %-60s [FAIL]\n" "${msg}"
+
+	[ "${PAUSE_ON_FAIL}" = "yes" ] || return 0
+
+	echo
+	echo "hit enter to continue, 'q' to quit"
+	read a
+	[ "$a" = "q" ] && exit 1
+}
+
+# Run a command inside the host1 namespace.
+#   $* - command line; it is passed through eval
+# With VERBOSE=1 the command and its combined stdout/stderr are printed.
+# Returns the exit status of the command.
+run_cmd_host1()
+{
+	local cmd="$*"
+	local out
+	local rc
+
+	if [ "$VERBOSE" = "1" ]; then
+		# the command is data, not a format: a '%' or '\' in it
+		# must be printed as is
+		printf ' COMMAND: %s\n' "$cmd"
+	fi
+
+	out=$(eval ip netns exec host1 $cmd 2>&1)
+	rc=$?
+	if [ "$VERBOSE" = "1" ]; then
+		if [ -n "$out" ]; then
+			echo
+			echo " $out"
+		fi
+		echo
+	fi
+
+	return $rc
+}
+
+################################################################################
+# create namespaces for hosts and sws
+
+create_vrf()
+{
+ local ns=$1
+ local vrf=$2
+ local table=$3
+
+ if [ -n "${ns}" ]; then
+ ns="-netns ${ns}"
+ fi
+
+ ip ${ns} link add ${vrf} type vrf table ${table}
+ ip ${ns} link set ${vrf} up
+ ip ${ns} route add vrf ${vrf} unreachable default metric 8192
+ ip ${ns} -6 route add vrf ${vrf} unreachable default metric 8192
+
+ ip ${ns} addr add 127.0.0.1/8 dev ${vrf}
+ ip ${ns} -6 addr add ::1 dev ${vrf} nodad
+
+ ip ${ns} ru del pref 0
+ ip ${ns} ru add pref 32765 from all lookup local
+ ip ${ns} -6 ru del pref 0
+ ip ${ns} -6 ru add pref 32765 from all lookup local
+}
+
+create_ns()
+{
+ local ns=$1
+ local addr=$2
+ local addr6=$3
+
+ [ -z "${addr}" ] && addr="-"
+ [ -z "${addr6}" ] && addr6="-"
+
+ ip netns add ${ns}
+
+ ip -netns ${ns} link set lo up
+ if [ "${addr}" != "-" ]; then
+ ip -netns ${ns} addr add dev lo ${addr}
+ fi
+ if [ "${addr6}" != "-" ]; then
+ ip -netns ${ns} -6 addr add dev lo ${addr6}
+ fi
+
+ ip -netns ${ns} ro add unreachable default metric 8192
+ ip -netns ${ns} -6 ro add unreachable default metric 8192
+
+ ip netns exec ${ns} sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.all.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.forwarding=1
+ ip netns exec ${ns} sysctl -qw net.ipv6.conf.default.accept_dad=0
+}
+
+# create veth pair to connect namespaces and apply addresses.
+#   $1 - first namespace       $5 - second namespace
+#   $2 - device name in $1     $6 - device name in $5
+#   $3 - IPv4 address for $2   $7 - IPv4 address for $6
+#   $4 - IPv6 address for $2   $8 - IPv6 address for $6
+# Passing "-" as $3 (or $4) skips IPv4 (or IPv6) addressing on both ends.
+connect_ns()
+{
+ local ns1=$1
+ local ns1_dev=$2
+ local ns1_addr=$3
+ local ns1_addr6=$4
+ local ns2=$5
+ local ns2_dev=$6
+ local ns2_addr=$7
+ local ns2_addr6=$8
+ local ns1arg
+ local ns2arg
+
+ if [ -n "${ns1}" ]; then
+ ns1arg="-netns ${ns1}"
+ fi
+ if [ -n "${ns2}" ]; then
+ ns2arg="-netns ${ns2}"
+ fi
+
+ # the peer is created as "tmp" in ns1, then moved to ns2 and renamed
+ # NOTE(review): the move uses ${ns2} directly, so an empty $5 looks
+ # unsupported even though ns2arg allows for it -- confirm.
+ ip ${ns1arg} li add ${ns1_dev} type veth peer name tmp
+ ip ${ns1arg} li set ${ns1_dev} up
+ ip ${ns1arg} li set tmp netns ${ns2} name ${ns2_dev}
+ ip ${ns2arg} li set ${ns2_dev} up
+
+ if [ "${ns1_addr}" != "-" ]; then
+ ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr}
+ ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr}
+ fi
+
+ if [ "${ns1_addr6}" != "-" ]; then
+ ip ${ns1arg} addr add dev ${ns1_dev} ${ns1_addr6} nodad
+ ip ${ns2arg} addr add dev ${ns2_dev} ${ns2_addr6} nodad
+ fi
+}
+
+################################################################################
+
+# Delete the two test namespaces.
+cleanup()
+{
+	local netns
+
+	for netns in host1 host2; do
+		ip netns del ${netns}
+	done
+}
+
+# Build the topology: namespaces host1 and host2 connected by a veth pair
+# (eth0 on both sides), with host1's eth0 enslaved to the VRF.
+setup()
+{
+ create_ns "host1"
+ create_ns "host2"
+
+ connect_ns "host1" eth0 ${HOST1_4}/24 ${HOST1_6}/64 \
+ "host2" eth0 ${HOST2_4}/24 ${HOST2_6}/64
+
+ create_vrf "host1" ${VRF} ${TABLE}
+ ip -netns host1 link set dev eth0 master ${VRF}
+}
+
+# Flush all xfrm states and policies in both namespaces.
+# Note: the loop variables ns and x are not declared local.
+cleanup_xfrm()
+{
+ for ns in host1 host2
+ do
+ for x in state policy
+ do
+ ip -netns ${ns} xfrm ${x} flush
+ ip -6 -netns ${ns} xfrm ${x} flush
+ done
+ done
+}
+
+# Install ESP tunnel-mode policies and states for both directions and both
+# address families on host1 and host2. The tunnel endpoints are always the
+# HOST{1,2}_{4,6} addresses.
+#   $1 - IPv4 selector address on the host1 side
+#   $2 - IPv4 selector address on the host2 side
+#   $3 - IPv6 selector address on the host1 side
+#   $4 - IPv6 selector address on the host2 side
+#   $5 - optional extra arguments, appended only to host1's policies and
+#        state selectors
+setup_xfrm()
+{
+ local h1_4=$1
+ local h2_4=$2
+ local h1_6=$3
+ local h2_6=$4
+ local devarg="$5"
+
+ #
+ # policy
+ #
+
+ # host1 - IPv4 out
+ ip -netns host1 xfrm policy add \
+ src ${h1_4} dst ${h2_4} ${devarg} dir out \
+ tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel
+
+ # host2 - IPv4 in
+ ip -netns host2 xfrm policy add \
+ src ${h1_4} dst ${h2_4} dir in \
+ tmpl src ${HOST1_4} dst ${HOST2_4} proto esp mode tunnel
+
+ # host1 - IPv4 in
+ ip -netns host1 xfrm policy add \
+ src ${h2_4} dst ${h1_4} ${devarg} dir in \
+ tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel
+
+ # host2 - IPv4 out
+ ip -netns host2 xfrm policy add \
+ src ${h2_4} dst ${h1_4} dir out \
+ tmpl src ${HOST2_4} dst ${HOST1_4} proto esp mode tunnel
+
+
+ # host1 - IPv6 out
+ ip -6 -netns host1 xfrm policy add \
+ src ${h1_6} dst ${h2_6} ${devarg} dir out \
+ tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel
+
+ # host2 - IPv6 in
+ ip -6 -netns host2 xfrm policy add \
+ src ${h1_6} dst ${h2_6} dir in \
+ tmpl src ${HOST1_6} dst ${HOST2_6} proto esp mode tunnel
+
+ # host1 - IPv6 in
+ ip -6 -netns host1 xfrm policy add \
+ src ${h2_6} dst ${h1_6} ${devarg} dir in \
+ tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel
+
+ # host2 - IPv6 out
+ ip -6 -netns host2 xfrm policy add \
+ src ${h2_6} dst ${h1_6} dir out \
+ tmpl src ${HOST2_6} dst ${HOST1_6} proto esp mode tunnel
+
+ #
+ # state
+ #
+ # host1 -> host2 direction uses SPI_1/AUTH_1/ENC_1, the reverse
+ # direction SPI_2/AUTH_2/ENC_2; each state is installed on both hosts.
+ ip -netns host1 xfrm state add src ${HOST1_4} dst ${HOST2_4} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \
+ enc 'cbc(aes)' ${ENC_1} \
+ sel src ${h1_4} dst ${h2_4} ${devarg}
+
+ ip -netns host2 xfrm state add src ${HOST1_4} dst ${HOST2_4} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \
+ enc 'cbc(aes)' ${ENC_1} \
+ sel src ${h1_4} dst ${h2_4}
+
+
+ ip -netns host1 xfrm state add src ${HOST2_4} dst ${HOST1_4} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \
+ enc 'cbc(aes)' ${ENC_2} \
+ sel src ${h2_4} dst ${h1_4} ${devarg}
+
+ ip -netns host2 xfrm state add src ${HOST2_4} dst ${HOST1_4} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \
+ enc 'cbc(aes)' ${ENC_2} \
+ sel src ${h2_4} dst ${h1_4}
+
+
+ ip -6 -netns host1 xfrm state add src ${HOST1_6} dst ${HOST2_6} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \
+ enc 'cbc(aes)' ${ENC_1} \
+ sel src ${h1_6} dst ${h2_6} ${devarg}
+
+ ip -6 -netns host2 xfrm state add src ${HOST1_6} dst ${HOST2_6} \
+ proto esp spi ${SPI_1} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_1} 96 \
+ enc 'cbc(aes)' ${ENC_1} \
+ sel src ${h1_6} dst ${h2_6}
+
+
+ ip -6 -netns host1 xfrm state add src ${HOST2_6} dst ${HOST1_6} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \
+ enc 'cbc(aes)' ${ENC_2} \
+ sel src ${h2_6} dst ${h1_6} ${devarg}
+
+ ip -6 -netns host2 xfrm state add src ${HOST2_6} dst ${HOST1_6} \
+ proto esp spi ${SPI_2} reqid 0 mode tunnel \
+ replay-window 4 replay-oseq 0x4 \
+ auth-trunc 'hmac(sha1)' ${AUTH_2} 96 \
+ enc 'cbc(aes)' ${ENC_2} \
+ sel src ${h2_6} dst ${h1_6}
+}
+
+# Undo setup_xfrm_dev: delete xfrm0 in host1 and remove the XFRM2 addresses
+# from host2's eth0. xfrm states and policies are not flushed here.
+cleanup_xfrm_dev()
+{
+ ip -netns host1 li del xfrm0
+ ip -netns host2 addr del ${XFRM2_4}/24 dev eth0
+ ip -netns host2 addr del ${XFRM2_6}/64 dev eth0
+}
+
+# Create an xfrm device on top of eth0 in host1, put it into the VRF, give
+# both ends XFRM addresses and install xfrm policies/states bound to IF_ID.
+setup_xfrm_dev()
+{
+ local vrfarg="vrf ${VRF}"
+
+ # no name is given on creation; the commands below refer to it as xfrm0
+ ip -netns host1 li add type xfrm dev eth0 if_id ${IF_ID}
+ ip -netns host1 li set xfrm0 ${vrfarg} up
+ ip -netns host1 addr add ${XFRM1_4}/24 dev xfrm0
+ ip -netns host1 addr add ${XFRM1_6}/64 dev xfrm0
+
+ ip -netns host2 addr add ${XFRM2_4}/24 dev eth0
+ ip -netns host2 addr add ${XFRM2_6}/64 dev eth0
+
+ setup_xfrm ${XFRM1_4} ${XFRM2_4} ${XFRM1_6} ${XFRM2_6} "if_id ${IF_ID}"
+}
+
+# Run the ping tests from host1's VRF towards host2: without xfrm, with
+# address based xfrm policies and with an xfrm device. Every result is
+# recorded through log_test.
+run_tests()
+{
+ cleanup_xfrm
+
+ # no IPsec
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ log_test $? 0 "IPv4 no xfrm policy"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 no xfrm policy"
+
+ # xfrm without VRF in sel
+ setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6}
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ log_test $? 0 "IPv4 xfrm policy based on address"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 xfrm policy based on address"
+ cleanup_xfrm
+
+ # xfrm with VRF in sel
+ # Known failure: ipv4 resets the flow oif after the lookup. Fix is
+ # not straightforward.
+ # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev ${VRF}"
+ # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ # log_test $? 0 "IPv4 xfrm policy with VRF in selector"
+ # NOTE(review): with the setup_xfrm call above commented out, the IPv6
+ # test below runs right after cleanup_xfrm, i.e. with no xfrm policy
+ # or state installed -- confirm whether that is intended.
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ log_test $? 0 "IPv6 xfrm policy with VRF in selector"
+ cleanup_xfrm
+
+ # xfrm with enslaved device in sel
+ # Known failures: combined with the above, __xfrm{4,6}_selector_match
+ # needs to consider both l3mdev and enslaved device index.
+ # setup_xfrm ${HOST1_4} ${HOST2_4} ${HOST1_6} ${HOST2_6} "dev eth0"
+ # run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${HOST2_4}
+ # log_test $? 0 "IPv4 xfrm policy with enslaved device in selector"
+ # run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${HOST2_6}
+ # log_test $? 0 "IPv6 xfrm policy with enslaved device in selector"
+ # cleanup_xfrm
+
+ # xfrm device
+ setup_xfrm_dev
+ run_cmd_host1 ip vrf exec ${VRF} ping -c1 -w1 ${XFRM2_4}
+ log_test $? 0 "IPv4 xfrm policy with xfrm device"
+ run_cmd_host1 ip vrf exec ${VRF} ${ping6} -c1 -w1 ${XFRM2_6}
+ log_test $? 0 "IPv6 xfrm policy with xfrm device"
+ # states/policies are flushed by cleanup_xfrm at the start of the next run
+ cleanup_xfrm_dev
+}
+
+################################################################################
+# usage
+
+# Print the command line help to stdout.
+usage()
+{
+	cat <<EOF
+usage: ${0##*/} OPTS
+
+ -h Show this help
+ -p Pause on fail
+ -v verbose mode (show commands and output)
+EOF
+}
+
+################################################################################
+# main
+
+# 'h' has to be part of the option string, otherwise -h is reported as an
+# unknown option and the script exits with an error instead of 0.
+while getopts :hpv o
+do
+	case $o in
+		h) usage; exit 0;;
+		p) PAUSE_ON_FAIL=yes;;
+		v) VERBOSE=$(($VERBOSE + 1));;
+		*) usage; exit 1;;
+	esac
+done
+
+# counters updated by log_test; start from a defined value
+nsuccess=0
+nfail=0
+
+# remove leftovers of a previous run; errors are expected on a clean system
+cleanup 2>/dev/null
+setup
+
+echo
+echo "No qdisc on VRF device"
+run_tests
+
+run_cmd_host1 tc qdisc add dev ${VRF} root netem delay 100ms
+echo
+echo "netem qdisc on VRF device"
+run_tests
+
+printf "\nTests passed: %3d\n" "${nsuccess}"
+printf "Tests failed: %3d\n" "${nfail}"
+
+# do not leave the test namespaces behind
+cleanup 2>/dev/null
+
+exit $ret
diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh
new file mode 100755
index 000000000..23cf92475
--- /dev/null
+++ b/tools/testing/selftests/net/vrf_route_leaking.sh
@@ -0,0 +1,626 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2019 David Ahern <dsahern@gmail.com>. All rights reserved.
+# Copyright (c) 2020 Michael Jeanson <mjeanson@efficios.com>. All rights reserved.
+#
+# Requires CONFIG_NET_VRF, CONFIG_VETH, CONFIG_BRIDGE and CONFIG_NET_NS.
+#
+#
+# Symmetric routing topology
+#
+# blue red
+# +----+ .253 +----+ .253 +----+
+# | h1 |-------------------| r1 |-------------------| h2 |
+# +----+ .1 +----+ .2 +----+
+# 172.16.1/24 172.16.2/24
+# 2001:db8:16:1/64 2001:db8:16:2/64
+#
+#
+# Route from h1 to h2 and back goes through r1, incoming vrf blue has a route
+# to the outgoing vrf red for the n2 network and red has a route back to n1.
+# The red VRF interface has a MTU of 1400.
+#
+# The first test sends a ping with a ttl of 1 from h1 to h2 and parses the
+# output of the command to check that a ttl expired error is received.
+#
+# The second test runs traceroute from h1 to h2 and parses the output to check
+# for a hop on r1.
+#
+# The third test sends a ping with a packet size of 1450 from h1 to h2 and
+# parses the output of the command to check that a fragmentation error is
+# received.
+#
+#
+# Asymmetric routing topology
+#
+# This topology represents a customer setup where the issue with icmp errors
+# and VRF route leaking was initially reported. The MTU test isn't done here
+# because of the lack of a return route in the red VRF.
+#
+# blue red
+# .253 +----+ .253
+# +----| r1 |----+
+# | +----+ |
+# +----+ | | +----+
+# | h1 |--------------+ +--------------| h2 |
+# +----+ .1 | | .2 +----+
+# 172.16.1/24 | +----+ | 172.16.2/24
+# 2001:db8:16:1/64 +----| r2 |----+ 2001:db8:16:2/64
+# .254 +----+ .254
+#
+#
+# Route from h1 to h2 goes through r1, incoming vrf blue has a route to the
+# outgoing vrf red for the n2 network but red doesn't have a route back to n1.
+# Route from h2 to h1 goes through r2.
+#
+# The objective is to check that the incoming vrf routing table is selected
+# to send an ICMP error back to the source when the ttl of a packet reaches 1
+# while it is forwarded between different vrfs.
+
+VERBOSE=0
+PAUSE_ON_FAIL=no
+DEFAULT_TTYPE=sym
+
+H1_N1=172.16.1.0/24
+H1_N1_6=2001:db8:16:1::/64
+
+H1_N1_IP=172.16.1.1
+R1_N1_IP=172.16.1.253
+R2_N1_IP=172.16.1.254
+
+H1_N1_IP6=2001:db8:16:1::1
+R1_N1_IP6=2001:db8:16:1::253
+R2_N1_IP6=2001:db8:16:1::254
+
+H2_N2=172.16.2.0/24
+H2_N2_6=2001:db8:16:2::/64
+
+H2_N2_IP=172.16.2.2
+R1_N2_IP=172.16.2.253
+R2_N2_IP=172.16.2.254
+
+H2_N2_IP6=2001:db8:16:2::2
+R1_N2_IP6=2001:db8:16:2::253
+R2_N2_IP6=2001:db8:16:2::254
+
+################################################################################
+# helpers
+
+# Print the arguments as a section title framed by two rulers and blank lines.
+log_section()
+{
+	local ruler="###########################################################################"
+
+	echo
+	echo "${ruler}"
+	echo "$*"
+	echo "${ruler}"
+	echo
+}
+
+# Record one test result.
+# $1: observed return code, $2: expected return code, $3: test description.
+# Updates the global counters nsuccess/nfail and sets ret=1 on a mismatch.
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ "${rc}" -eq "${expected}" ]; then
+		printf "TEST: %-60s [ OK ]\n" "${msg}"
+		nsuccess=$((nsuccess+1))
+		return
+	fi
+
+	ret=1
+	nfail=$((nfail+1))
+	printf "TEST: %-60s [FAIL]\n" "${msg}"
+
+	# nothing more to do unless the user asked to stop at each failure
+	[ "${PAUSE_ON_FAIL}" = "yes" ] || return 0
+
+	echo
+	echo "hit enter to continue, 'q' to quit"
+	read -r a
+	[ "$a" = "q" ] && exit 1
+}
+
+# Run the command given as arguments with stdout and stderr captured.
+# In verbose mode the command line and its output are echoed.
+# Returns the exit status of the command.
+run_cmd()
+{
+	local cmd="$*"
+	local out
+	local rc
+
+	[ "$VERBOSE" = "1" ] && echo "COMMAND: $cmd"
+
+	# shellcheck disable=SC2086
+	out=$(eval $cmd 2>&1)
+	rc=$?
+
+	if [ "$VERBOSE" = "1" ]; then
+		[ -n "$out" ] && echo "$out"
+		echo
+	fi
+
+	return $rc
+}
+
+run_cmd_grep()
+{
+ local grep_pattern="$1"
+ shift
+ local cmd="$*"
+ local out
+ local rc
+
+ if [ "$VERBOSE" = "1" ]; then
+ echo "COMMAND: $cmd"
+ fi
+
+ # shellcheck disable=SC2086
+ out=$(eval $cmd 2>&1)
+ if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then
+ echo "$out"
+ fi
+
+ echo "$out" | grep -q "$grep_pattern"
+ rc=$?
+
+ [ "$VERBOSE" = "1" ] && echo
+
+ return $rc
+}
+
+################################################################################
+# setup and teardown
+
+# Delete every namespace the tests may have created; errors for namespaces
+# that do not exist are silenced.
+cleanup()
+{
+	local node
+
+	for node in h1 h2 r1 r2
+	do
+		ip netns del $node 2>/dev/null
+	done
+}
+
+# In namespace $1, replace the rule at preference 0 by a "lookup local" rule
+# at preference 32765, first for IPv4 and then for IPv6.
+setup_vrf()
+{
+	local ns=$1
+	local family
+
+	# the empty entry selects IPv4 (no family option on the command line)
+	for family in "" -6; do
+		ip -netns "${ns}" ${family} rule del pref 0
+		ip -netns "${ns}" ${family} rule add pref 32765 from all lookup local
+	done
+}
+
+create_vrf()
+{
+ local ns=$1
+ local vrf=$2
+ local table=$3
+
+ ip -netns "${ns}" link add "${vrf}" type vrf table "${table}"
+ ip -netns "${ns}" link set "${vrf}" up
+ ip -netns "${ns}" route add vrf "${vrf}" unreachable default metric 8192
+ ip -netns "${ns}" -6 route add vrf "${vrf}" unreachable default metric 8192
+
+ ip -netns "${ns}" addr add 127.0.0.1/8 dev "${vrf}"
+ ip -netns "${ns}" -6 addr add ::1 dev "${vrf}" nodad
+}
+
+# Build the symmetric routing topology: namespaces h1, h2 and r1, where r1
+# holds VRF blue (eth0, facing h1) and VRF red (eth1, facing h2) and leaks
+# routes between the two VRFs in both directions.
+setup_sym()
+{
+ local ns
+
+ # make sure we are starting with a clean slate
+ cleanup
+
+ #
+ # create nodes as namespaces
+ #
+ for ns in h1 h2 r1; do
+ ip netns add $ns
+ ip -netns $ns link set lo up
+
+ # hosts do not forward, the router forwards both address families
+ case "${ns}" in
+ h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+ ;;
+ r1) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
+ esac
+ done
+
+ #
+ # create interconnects
+ #
+ # each veth pair is created in the host namespace, then the peer end is
+ # moved into r1 and renamed
+ ip -netns h1 link add eth0 type veth peer name r1h1
+ ip -netns h1 link set r1h1 netns r1 name eth0 up
+
+ ip -netns h2 link add eth0 type veth peer name r1h2
+ ip -netns h2 link set r1h2 netns r1 name eth1 up
+
+ #
+ # h1
+ #
+ ip -netns h1 addr add dev eth0 ${H1_N1_IP}/24
+ ip -netns h1 -6 addr add dev eth0 ${H1_N1_IP6}/64 nodad
+ ip -netns h1 link set eth0 up
+
+ # h1 to h2 via r1
+ ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev eth0
+ ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev eth0
+
+ #
+ # h2
+ #
+ ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24
+ ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad
+ ip -netns h2 link set eth0 up
+
+ # h2 to h1 via r1
+ ip -netns h2 route add default via ${R1_N2_IP} dev eth0
+ ip -netns h2 -6 route add default via ${R1_N2_IP6} dev eth0
+
+ #
+ # r1
+ #
+ setup_vrf r1
+ create_vrf r1 blue 1101
+ create_vrf r1 red 1102
+ # smaller MTU on the red side; the fragmentation tests send 1450 bytes
+ ip -netns r1 link set mtu 1400 dev eth1
+ ip -netns r1 link set eth0 vrf blue up
+ ip -netns r1 link set eth1 vrf red up
+ ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
+ ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
+ ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24
+ ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad
+
+ # Route leak from blue to red
+ ip -netns r1 route add vrf blue ${H2_N2} dev red
+ ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red
+
+ # Route leak from red to blue
+ ip -netns r1 route add vrf red ${H1_N1} dev blue
+ ip -netns r1 -6 route add vrf red ${H1_N1_6} dev blue
+
+
+ # Wait for ip config to settle
+ sleep 2
+}
+
+# Build the asymmetric routing topology: namespaces h1, h2, r1 and r2.
+# h1 reaches h2 through r1 (VRF blue leaking into VRF red), while h2 reaches
+# h1 through r2; r1 has no route leak from red back to blue.
+setup_asym()
+{
+ local ns
+
+ # make sure we are starting with a clean slate
+ cleanup
+
+ #
+ # create nodes as namespaces
+ #
+ for ns in h1 h2 r1 r2; do
+ ip netns add $ns
+ ip -netns $ns link set lo up
+
+ # hosts do not forward, routers forward both address families
+ case "${ns}" in
+ h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1
+ ;;
+ r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1
+ ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1
+ esac
+ done
+
+ #
+ # create interconnects
+ #
+ # every host gets one veth pair per router; the peer end is moved into
+ # the router namespace and renamed (eth0 faces h1, eth1 faces h2)
+ ip -netns h1 link add eth0 type veth peer name r1h1
+ ip -netns h1 link set r1h1 netns r1 name eth0 up
+
+ ip -netns h1 link add eth1 type veth peer name r2h1
+ ip -netns h1 link set r2h1 netns r2 name eth0 up
+
+ ip -netns h2 link add eth0 type veth peer name r1h2
+ ip -netns h2 link set r1h2 netns r1 name eth1 up
+
+ ip -netns h2 link add eth1 type veth peer name r2h2
+ ip -netns h2 link set r2h2 netns r2 name eth1 up
+
+ #
+ # h1
+ #
+ # both uplinks are enslaved to br0, which carries the host addresses
+ ip -netns h1 link add br0 type bridge
+ ip -netns h1 link set br0 up
+ ip -netns h1 addr add dev br0 ${H1_N1_IP}/24
+ ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad
+ ip -netns h1 link set eth0 master br0 up
+ ip -netns h1 link set eth1 master br0 up
+
+ # h1 to h2 via r1
+ ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev br0
+ ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev br0
+
+ #
+ # h2
+ #
+ ip -netns h2 link add br0 type bridge
+ ip -netns h2 link set br0 up
+ ip -netns h2 addr add dev br0 ${H2_N2_IP}/24
+ ip -netns h2 -6 addr add dev br0 ${H2_N2_IP6}/64 nodad
+ ip -netns h2 link set eth0 master br0 up
+ ip -netns h2 link set eth1 master br0 up
+
+ # h2 to h1 via r2
+ ip -netns h2 route add default via ${R2_N2_IP} dev br0
+ ip -netns h2 -6 route add default via ${R2_N2_IP6} dev br0
+
+ #
+ # r1
+ #
+ setup_vrf r1
+ create_vrf r1 blue 1101
+ create_vrf r1 red 1102
+ ip -netns r1 link set mtu 1400 dev eth1
+ ip -netns r1 link set eth0 vrf blue up
+ ip -netns r1 link set eth1 vrf red up
+ ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24
+ ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad
+ ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24
+ ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad
+
+ # Route leak from blue to red
+ ip -netns r1 route add vrf blue ${H2_N2} dev red
+ ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red
+
+ # No route leak from red to blue
+
+ #
+ # r2
+ #
+ # r2 uses no VRF; it only gets addresses on both networks
+ ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24
+ ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad
+ ip -netns r2 addr add dev eth1 ${R2_N2_IP}/24
+ ip -netns r2 -6 addr add dev eth1 ${R2_N2_IP6}/64 nodad
+
+ # Wait for ip config to settle
+ sleep 2
+}
+
+# Verify that h1 can ping h2 over IPv4 and log the result.
+# Returns the exit status of ping so that callers can bail out with
+# "check_connectivity || return" when there is no basic connectivity.
+check_connectivity()
+{
+	local rc
+
+	ip netns exec h1 ping -c1 -w1 ${H2_N2_IP} >/dev/null 2>&1
+	# Save the status now: after log_test, $? holds log_test's own status,
+	# which does not reflect whether the ping succeeded.
+	rc=$?
+	log_test $rc 0 "Basic IPv4 connectivity"
+	return $rc
+}
+
+# Verify that h1 can ping h2 over IPv6 and log the result.
+# Returns the exit status of ping so that callers can bail out with
+# "check_connectivity6 || return" when there is no basic connectivity.
+check_connectivity6()
+{
+	local rc
+
+	ip netns exec h1 "${ping6}" -c1 -w1 ${H2_N2_IP6} >/dev/null 2>&1
+	# Save the status now: after log_test, $? holds log_test's own status,
+	# which does not reflect whether the ping succeeded.
+	rc=$?
+	log_test $rc 0 "Basic IPv6 connectivity"
+	return $rc
+}
+
+# Return 0 when an executable traceroute is found in PATH; otherwise print a
+# SKIP message and return 1.
+check_traceroute()
+{
+	if [ -x "$(command -v traceroute)" ]; then
+		return 0
+	fi
+
+	echo "SKIP: Could not run IPV4 test without traceroute"
+	return 1
+}
+
+# Return 0 when an executable traceroute6 is found in PATH; otherwise print a
+# SKIP message and return 1.
+check_traceroute6()
+{
+	if [ -x "$(command -v traceroute6)" ]; then
+		return 0
+	fi
+
+	echo "SKIP: Could not run IPV6 test without traceroute6"
+	return 1
+}
+
+ipv4_traceroute()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP error route lookup traceroute"
+
+ check_traceroute || return
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "${R1_N1_IP}" ip netns exec h1 traceroute ${H2_N2_IP}
+ log_test $? 0 "Traceroute reports a hop on r1"
+}
+
+ipv4_traceroute_asym()
+{
+ ipv4_traceroute asym
+}
+
+ipv6_traceroute()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP error route lookup traceroute"
+
+ check_traceroute6 || return
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "${R1_N1_IP6}" ip netns exec h1 traceroute6 ${H2_N2_IP6}
+ log_test $? 0 "Traceroute6 reports a hop on r1"
+}
+
+ipv6_traceroute_asym()
+{
+ ipv6_traceroute asym
+}
+
+ipv4_ping_ttl()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP ttl error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "Time to live exceeded" ip netns exec h1 ping -t1 -c1 -W2 ${H2_N2_IP}
+ log_test $? 0 "Ping received ICMP ttl exceeded"
+}
+
+ipv4_ping_ttl_asym()
+{
+ ipv4_ping_ttl asym
+}
+
+ipv4_ping_frag()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv4 ($ttype route): VRF ICMP fragmentation error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity || return
+
+ run_cmd_grep "Frag needed" ip netns exec h1 ping -s 1450 -Mdo -c1 -W2 ${H2_N2_IP}
+ log_test $? 0 "Ping received ICMP Frag needed"
+}
+
+ipv4_ping_frag_asym()
+{
+ ipv4_ping_frag asym
+}
+
+ipv6_ping_ttl()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP ttl error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "Time exceeded: Hop limit" ip netns exec h1 "${ping6}" -t1 -c1 -W2 ${H2_N2_IP6}
+ log_test $? 0 "Ping received ICMP Hop limit"
+}
+
+ipv6_ping_ttl_asym()
+{
+ ipv6_ping_ttl asym
+}
+
+ipv6_ping_frag()
+{
+ local ttype="$1"
+
+ [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE"
+
+ log_section "IPv6 ($ttype route): VRF ICMP fragmentation error route lookup ping"
+
+ setup_"$ttype"
+
+ check_connectivity6 || return
+
+ run_cmd_grep "Packet too big" ip netns exec h1 "${ping6}" -s 1450 -Mdo -c1 -W2 ${H2_N2_IP6}
+ log_test $? 0 "Ping received ICMP Packet too big"
+}
+
+ipv6_ping_frag_asym()
+{
+ ipv6_ping_frag asym
+}
+
+################################################################################
+# usage
+
+# Print the command-line help text to stdout.
+usage()
+{
+	cat <<EOF
+usage: ${0##*/} OPTS
+
+ -4 Run IPv4 tests only
+ -6 Run IPv6 tests only
+ -t TEST Run only TEST
+ -p Pause on fail
+ -v verbose mode (show commands and output)
+ -h Show this help
+EOF
+}
+
+################################################################################
+# main
+
+# Some systems don't have a ping6 binary anymore
+command -v ping6 > /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping)
+
+TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym"
+TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_frag ipv6_ping_ttl_asym ipv6_traceroute_asym"
+
+ret=0
+nsuccess=0
+nfail=0
+
+while getopts :46t:pvh o
+do
+ case $o in
+ 4) TESTS=ipv4;;
+ 6) TESTS=ipv6;;
+ t) TESTS=$OPTARG;;
+ p) PAUSE_ON_FAIL=yes;;
+ v) VERBOSE=1;;
+ h) usage; exit 0;;
+ *) usage; exit 1;;
+ esac
+done
+
+#
+# show user test config
+#
+if [ -z "$TESTS" ]; then
+ TESTS="$TESTS_IPV4 $TESTS_IPV6"
+elif [ "$TESTS" = "ipv4" ]; then
+ TESTS="$TESTS_IPV4"
+elif [ "$TESTS" = "ipv6" ]; then
+ TESTS="$TESTS_IPV6"
+fi
+
+for t in $TESTS
+do
+ case $t in
+ ipv4_ping_ttl|ping) ipv4_ping_ttl;;&
+ ipv4_ping_ttl_asym|ping) ipv4_ping_ttl_asym;;&
+ ipv4_traceroute|traceroute) ipv4_traceroute;;&
+ ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;&
+ ipv4_ping_frag|ping) ipv4_ping_frag;;&
+
+ ipv6_ping_ttl|ping) ipv6_ping_ttl;;&
+ ipv6_ping_ttl_asym|ping) ipv6_ping_ttl_asym;;&
+ ipv6_traceroute|traceroute) ipv6_traceroute;;&
+ ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;&
+ ipv6_ping_frag|ping) ipv6_ping_frag;;&
+
+ # setup namespaces and config, but do not run any tests
+ setup_sym|setup) setup_sym; exit 0;;
+ setup_asym) setup_asym; exit 0;;
+
+ help) echo "Test names: $TESTS"; exit 0;;
+ esac
+done
+
+cleanup
+
+printf "\nTests passed: %3d\n" ${nsuccess}
+printf "Tests failed: %3d\n" ${nfail}
+
+exit $ret
diff --git a/tools/testing/selftests/net/vrf_strict_mode_test.sh b/tools/testing/selftests/net/vrf_strict_mode_test.sh
new file mode 100755
index 000000000..18b982d61
--- /dev/null
+++ b/tools/testing/selftests/net/vrf_strict_mode_test.sh
@@ -0,0 +1,396 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is designed for testing the new VRF strict_mode functionality.
+
+ret=0
+
+# identifies the "init" network namespace which is often called root network
+# namespace.
+INIT_NETNS_NAME="init"
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+
+# Record one test result.
+# $1: observed return code, $2: expected return code, $3: test description.
+# Updates the global counters nsuccess/nfail and sets ret=1 on a mismatch.
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	# operands are quoted so that an empty value cannot change the shape
+	# of the test expression
+	if [ "${rc}" -eq "${expected}" ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n TEST: %-60s [ OK ]\n" "${msg}"
+	else
+		ret=1
+		nfail=$((nfail+1))
+		printf "\n TEST: %-60s [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			# -r: take the typed line literally (no backslash processing)
+			read -r a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+# Print the pass/fail summary, unless TESTS is set to "none".
+print_log_test_results()
+{
+	if [ "$TESTS" = "none" ]; then
+		return 0
+	fi
+
+	printf "\nTests passed: %3d\n" ${nsuccess}
+	printf "Tests failed: %3d\n" ${nfail}
+}
+
+# Print a section header: a blank line, then "TEST SECTION: <arguments>"
+# between two rulers.
+log_section()
+{
+	local ruler="################################################################################"
+
+	echo
+	echo "${ruler}"
+	echo "TEST SECTION: $*"
+	echo "${ruler}"
+}
+
+# Print the "ip" option that selects namespace $1: "-netns <name>" for a named
+# namespace, an empty line for the init namespace.
+ip_expand_args()
+{
+	local nsname=$1
+
+	if [ "${nsname}" = "${INIT_NETNS_NAME}" ]; then
+		echo ""
+	else
+		echo "-netns ${nsname}"
+	fi
+}
+
+# Print the number of VRF devices in namespace $1.
+vrf_count()
+{
+	local nsname=$1
+	local nsarg
+
+	nsarg="$(ip_expand_args ${nsname})"
+
+	# -o prints one line per device, so the line count is the VRF count
+	ip ${nsarg} -o link show type vrf | wc -l
+}
+
+# Print the number of VRF devices in namespace $1 that are bound to routing
+# table $2.
+count_vrf_by_table_id()
+{
+	local nsname=$1
+	local tableid=$2
+	local nsarg
+
+	nsarg="$(ip_expand_args ${nsname})"
+
+	# -w: match the table id as a whole word, so that e.g. "table 100" does
+	#     not also count a VRF bound to table 1000.
+	# -c: let grep count the matching lines itself.
+	# grep exits non-zero when the count is 0; "|| true" keeps the function
+	# status at 0 as before.
+	ip ${nsarg} -d -o link show type vrf | grep -cw "table ${tableid}" || true
+}
+
+# Silently try to create VRF device $2 bound to table $3 in namespace $1.
+# Returns the exit status of "ip link add".
+add_vrf()
+{
+	local nsname=$1
+	local vrfname=$2
+	local vrftable=$3
+	local nsarg
+
+	nsarg="$(ip_expand_args ${nsname})"
+
+	ip ${nsarg} link add ${vrfname} type vrf table ${vrftable} >/dev/null 2>&1
+}
+
+add_vrf_and_check()
+{
+ local nsname=$1
+ local vrfname=$2
+ local vrftable=$3
+ local cnt
+ local rc
+
+ add_vrf ${nsname} ${vrfname} ${vrftable}; rc=$?
+
+ cnt=$(count_vrf_by_table_id ${nsname} ${vrftable})
+
+ log_test ${rc} 0 "${nsname}: add vrf ${vrfname}, ${cnt} vrfs for table ${vrftable}"
+}
+
+# Try to create VRF $2 on table $3 in namespace $1 and log a test that expects
+# the creation to fail with return code 2.
+add_vrf_and_check_fail()
+{
+	local nsname=$1
+	local vrfname=$2
+	local vrftable=$3
+	local nvrfs
+	local rc
+
+	add_vrf ${nsname} ${vrfname} ${vrftable}
+	rc=$?
+
+	# number of VRFs on the table after the attempt, for the log message
+	nvrfs=$(count_vrf_by_table_id ${nsname} ${vrftable})
+
+	log_test ${rc} 2 "${nsname}: CANNOT add vrf ${vrfname}, ${nvrfs} vrfs for table ${vrftable}"
+}
+
+# Delete VRF device $2 from namespace $1 and log a test that expects the
+# deletion to succeed.
+del_vrf_and_check()
+{
+	local nsname=$1
+	local vrfname=$2
+	local nsarg
+
+	nsarg="$(ip_expand_args ${nsname})"
+
+	ip ${nsarg} link del ${vrfname}
+	log_test $? 0 "${nsname}: remove vrf ${vrfname}"
+}
+
+# Bring VRF device $3 up in namespace $1 and assign address $2 to it; log a
+# test that expects both steps to succeed.
+config_vrf_and_check()
+{
+	local nsname=$1
+	local addr=$2
+	local vrfname=$3
+	local nsarg
+	local rc
+
+	nsarg="$(ip_expand_args ${nsname})"
+
+	# the address is only added when the device could be brought up
+	ip ${nsarg} link set dev ${vrfname} up &&
+		ip ${nsarg} addr add ${addr} dev ${vrfname}
+	rc=$?
+
+	log_test ${rc} 0 "${nsname}: vrf ${vrfname} up, addr ${addr}"
+}
+
+# Print the value of /proc/sys/net/vrf/strict_mode as seen from namespace $1.
+# On success prints 0 or 1 and returns 0. When the value cannot be read, or
+# is not exactly 0 or 1, prints 255 and returns 1.
+read_strict_mode()
+{
+ local nsname=$1
+ local rval
+ local rc=0
+ local nsexec=""
+
+ if [ "${nsname}" != "${INIT_NETNS_NAME}" ]; then
+ # a custom network namespace is provided
+ nsexec="ip netns exec ${nsname}"
+ fi
+
+ # $? below is the status of the command substitution, i.e. of grep: it is
+ # non-zero when the output is not a single "0" or "1" line
+ rval="$(${nsexec} bash -c "cat /proc/sys/net/vrf/strict_mode" | \
+ grep -E "^[0-1]$")" &> /dev/null
+ if [ $? -ne 0 ]; then
+ # set errors
+ rval=255
+ rc=1
+ fi
+
+ # on success, rval can be only 0 or 1; on error, rval is equal to 255
+ echo ${rval}
+ return ${rc}
+}
+
+# Read strict_mode in namespace $1 and log a test that expects it to equal $2.
+read_strict_mode_compare_and_check()
+{
+	local nsname=$1
+	local expected=$2
+	local mode
+
+	mode="$(read_strict_mode ${nsname})"
+
+	log_test ${mode} ${expected} "${nsname}: check strict_mode=${mode}"
+}
+
+# Write value $2 to /proc/sys/net/vrf/strict_mode in namespace $1, with all
+# output discarded. Returns the exit status of the write.
+set_strict_mode()
+{
+	local nsname=$1
+	local val=$2
+	local nsexec=""
+
+	# a custom network namespace needs "ip netns exec" in front
+	[ "${nsname}" = "${INIT_NETNS_NAME}" ] || nsexec="ip netns exec ${nsname}"
+
+	${nsexec} bash -c "echo ${val} >/proc/sys/net/vrf/strict_mode" >/dev/null 2>&1
+}
+
+# Set strict_mode to 1 in namespace $1.
+enable_strict_mode()
+{
+	set_strict_mode ${1} 1
+}
+
+# Set strict_mode to 0 in namespace $1.
+disable_strict_mode()
+{
+	set_strict_mode ${1} 0
+}
+
+# Disable strict_mode in namespace $1; the write is expected to succeed.
+disable_strict_mode_and_check()
+{
+	local nsname=$1
+	local rc
+
+	disable_strict_mode ${nsname}
+	rc=$?
+	log_test ${rc} 0 "${nsname}: disable strict_mode (=0)"
+}
+
+# Enable strict_mode in namespace $1; the write is expected to succeed.
+enable_strict_mode_and_check()
+{
+	local nsname=$1
+	local rc
+
+	enable_strict_mode ${nsname}
+	rc=$?
+	log_test ${rc} 0 "${nsname}: enable strict_mode (=1)"
+}
+
+# Enable strict_mode in namespace $1; the write is expected to fail with
+# return code 1.
+enable_strict_mode_and_check_fail()
+{
+	local nsname=$1
+	local rc
+
+	enable_strict_mode ${nsname}
+	rc=$?
+	log_test ${rc} 1 "${nsname}: CANNOT enable strict_mode"
+}
+
+# Log a test that expects strict_mode to read 0 in namespace $1; the current
+# number of VRFs is included in the message.
+strict_mode_check_default()
+{
+	local nsname=$1
+	local nvrfs
+	local mode
+
+	nvrfs=$(vrf_count ${nsname})
+	mode=$(read_strict_mode ${nsname})
+
+	log_test ${mode} 0 "${nsname}: strict_mode=0 by default, ${nvrfs} vrfs"
+}
+
+# Load the vrf module and create the "testns" namespace with its loopback
+# device up.
+setup()
+{
+ modprobe vrf
+
+ ip netns add testns
+ ip netns exec testns ip link set lo up
+}
+
+# Undo everything the tests may have left behind: the testns namespace, the
+# VRF devices created in the init namespace, and strict_mode in the init
+# namespace (reset to 0). Errors for missing objects are silenced.
+cleanup()
+{
+ ip netns del testns 2>/dev/null
+
+ ip link del vrf100 2>/dev/null
+ ip link del vrf101 2>/dev/null
+ ip link del vrf102 2>/dev/null
+
+ echo 0 >/proc/sys/net/vrf/strict_mode 2>/dev/null
+}
+
+# strict_mode test sequence for the init network namespace. The steps depend
+# on each other and must run in this order.
+vrf_strict_mode_tests_init()
+{
+ vrf_strict_mode_check_support init
+
+ strict_mode_check_default init
+
+ add_vrf_and_check init vrf100 100
+ config_vrf_and_check init 172.16.100.1/24 vrf100
+
+ enable_strict_mode_and_check init
+
+ # strict_mode is on: a second VRF on table 100 is expected to be refused
+ add_vrf_and_check_fail init vrf101 100
+
+ disable_strict_mode_and_check init
+
+ # strict_mode is off: sharing table 100 is expected to work
+ add_vrf_and_check init vrf101 100
+ config_vrf_and_check init 172.16.101.1/24 vrf101
+
+ # two VRFs share table 100: enabling strict_mode is expected to fail
+ enable_strict_mode_and_check_fail init
+
+ del_vrf_and_check init vrf101
+
+ enable_strict_mode_and_check init
+
+ add_vrf_and_check init vrf102 102
+ config_vrf_and_check init 172.16.102.1/24 vrf102
+
+ # the strict_mode is enabled in the init
+}
+
+# strict_mode test sequence for the testns network namespace. The steps
+# depend on each other and must run in this order.
+vrf_strict_mode_tests_testns()
+{
+ vrf_strict_mode_check_support testns
+
+ strict_mode_check_default testns
+
+ enable_strict_mode_and_check testns
+
+ add_vrf_and_check testns vrf100 100
+ config_vrf_and_check testns 10.0.100.1/24 vrf100
+
+ # strict_mode is on: further VRFs on table 100 are expected to be refused
+ add_vrf_and_check_fail testns vrf101 100
+
+ add_vrf_and_check_fail testns vrf102 100
+
+ # a VRF on a table that is not in use yet is still expected to work
+ add_vrf_and_check testns vrf200 200
+
+ disable_strict_mode_and_check testns
+
+ # strict_mode is off: sharing table 100 is expected to work
+ add_vrf_and_check testns vrf101 100
+
+ add_vrf_and_check testns vrf102 100
+
+ #the strict_mode is disabled in the testns
+}
+
+# Checks that mix both namespaces: strict_mode is read and toggled in init
+# and in testns in turn, and each namespace is expected to keep its own value.
+# Relies on the state left behind by the init and testns test sequences.
+vrf_strict_mode_tests_mix()
+{
+ read_strict_mode_compare_and_check init 1
+
+ read_strict_mode_compare_and_check testns 0
+
+ # drop the VRFs sharing table 100 so strict_mode can be enabled in testns
+ del_vrf_and_check testns vrf101
+
+ del_vrf_and_check testns vrf102
+
+ disable_strict_mode_and_check init
+
+ enable_strict_mode_and_check testns
+
+ # writing the same value twice is expected to succeed both times
+ enable_strict_mode_and_check init
+ enable_strict_mode_and_check init
+
+ disable_strict_mode_and_check testns
+ disable_strict_mode_and_check testns
+
+ read_strict_mode_compare_and_check init 1
+
+ read_strict_mode_compare_and_check testns 0
+}
+
+# Run the three test groups in order: init namespace, testns namespace, then
+# the checks mixing both.
+vrf_strict_mode_tests()
+{
+ log_section "VRF strict_mode test on init network namespace"
+ vrf_strict_mode_tests_init
+
+ log_section "VRF strict_mode test on testns network namespace"
+ vrf_strict_mode_tests_testns
+
+ log_section "VRF strict_mode test mixing init and testns network namespaces"
+ vrf_strict_mode_tests_mix
+}
+
+vrf_strict_mode_check_support()
+{
+ local nsname=$1
+ local output
+ local rc
+
+ output="$(lsmod | grep '^vrf' | awk '{print $1}')"
+ if [ -z "${output}" ]; then
+ modinfo vrf || return $?
+ fi
+
+ # we do not care about the value of the strict_mode; we only check if
+ # the strict_mode parameter is available or not.
+ read_strict_mode ${nsname} &>/dev/null; rc=$?
+ log_test ${rc} 0 "${nsname}: net.vrf.strict_mode is available"
+
+ return ${rc}
+}
+
+# Kselftest framework requirement - SKIP code is 4. Exiting with 0 from the
+# checks below would report a skipped run as a pass.
+ksft_skip=4
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit $ksft_skip
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+# the sysctl only exists once the vrf module is loaded
+modprobe vrf &>/dev/null
+if [ ! -e /proc/sys/net/vrf/strict_mode ]; then
+	echo "SKIP: vrf sysctl does not exist"
+	exit $ksft_skip
+fi
+
+# remove leftovers of a previous run, if any
+cleanup &> /dev/null
+
+setup
+vrf_strict_mode_tests
+cleanup
+
+print_log_test_results
+
+exit $ret
diff --git a/tools/testing/selftests/net/xfrm_policy.sh b/tools/testing/selftests/net/xfrm_policy.sh
new file mode 100755
index 000000000..bdf450eaf
--- /dev/null
+++ b/tools/testing/selftests/net/xfrm_policy.sh
@@ -0,0 +1,486 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Check xfrm policy resolution. Topology:
+#
+# 1.2 1.1 3.1 3.10 2.1 2.2
+# eth1 eth1 veth0 veth0 eth1 eth1
+# ns1 ---- ns3 ----- ns4 ---- ns2
+#
+# ns3 and ns4 are connected via ipsec tunnel.
+# pings from ns1 to ns2 (and vice versa) are supposed to work like this:
+# ns1: ping 10.0.2.2: passes via ipsec tunnel.
+# ns2: ping 10.0.1.2: passes via ipsec tunnel.
+
+# ns1: ping 10.0.1.253: passes via ipsec tunnel (direct policy)
+# ns2: ping 10.0.2.253: passes via ipsec tunnel (direct policy)
+#
+# ns1: ping 10.0.2.254: does NOT pass via ipsec tunnel (exception)
+# ns2: ping 10.0.1.254: does NOT pass via ipsec tunnel (exception)
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+policy_checks_ok=1
+
+KEY_SHA=0xdeadbeef1234567890abcdefabcdefabcdefabcd
+KEY_AES=0x0123456789abcdef0123456789012345
+SPI1=0x1
+SPI2=0x2
+
+# Install the ESP tunnel policies for one tunnel endpoint.
+# $1: namespace, $2: local tunnel address, $3: remote tunnel address,
+# $4: local network, $5: remote network.
+do_esp_policy() {
+	local ns=$1
+	local me=$2
+	local remote=$3
+	local lnet=$4
+	local rnet=$5
+	# trailing part shared by both policies
+	local common="proto esp mode tunnel priority 100 action allow"
+
+	# to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+	ip -net $ns xfrm policy add src $lnet dst $rnet dir out \
+		tmpl src $me dst $remote $common
+	# to fwd decrypted packets after esp processing:
+	ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd \
+		tmpl src $remote dst $me $common
+}
+
+# Install the ESP states for both directions of a tunnel, then the matching
+# policies.
+# $1: namespace, $2: local tunnel address, $3: remote tunnel address,
+# $4: local network, $5: remote network, $6: outbound SPI, $7: inbound SPI.
+do_esp() {
+	local ns=$1
+	local me=$2
+	local remote=$3
+	local lnet=$4
+	local rnet=$5
+	local spi_out=$6
+	local spi_in=$7
+	# algorithm, key and mode arguments shared by both states
+	local crypto="enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel"
+
+	ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in \
+		$crypto sel src $rnet dst $lnet
+	ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out \
+		$crypto sel src $lnet dst $rnet
+
+	do_esp_policy $ns $me $remote $lnet $rnet
+}
+
+# add policies with different netmasks, to make sure kernel carries
+# the policies contained within new netmask over when search tree is
+# re-built.
+# peer netns that are supposed to be encapsulated via esp have addresses
+# in the 10.0.1.0/24 and 10.0.2.0/24 subnets, respectively.
+#
+# Adding a policy for '10.0.1.0/23' will make it necessary to
+# alter the prefix of 10.0.1.0 subnet.
+# In case new prefix overlaps with existing node, the node and all
+# policies it carries need to be merged with the existing one(s).
+#
+# Do that here.
+# Add block policies with overlapping prefixes to namespace $1.
+do_overlap()
+{
+ local ns=$1
+
+ # adds new nodes to tree (neither network exists yet in policy database).
+ ip -net $ns xfrm policy add src 10.1.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block
+
+ # adds a new node in the 10.0.0.0/24 tree (dst node exists).
+ ip -net $ns xfrm policy add src 10.2.0.0/24 dst 10.0.0.0/24 dir fwd priority 200 action block
+
+ # adds a 10.2.0.0/23 node, but for different dst.
+ ip -net $ns xfrm policy add src 10.2.0.0/23 dst 10.0.1.0/24 dir fwd priority 200 action block
+
+ # dst now overlaps with the 10.0.1.0/24 ESP policy in fwd.
+ # kernel must 'promote' existing one (10.0.0.0/24) to 10.0.0.0/23.
+ # But 10.0.0.0/23 also includes existing 10.0.1.0/24, so that node
+ # also has to be merged too, including source-sorted subtrees.
+ # old:
+ # 10.0.0.0/24 (node 1 in dst tree of the bin)
+ # 10.1.0.0/24 (node in src tree of dst node 1)
+ # 10.2.0.0/24 (node in src tree of dst node 1)
+ # 10.0.1.0/24 (node 2 in dst tree of the bin)
+ # 10.0.2.0/24 (node in src tree of dst node 2)
+ # 10.2.0.0/24 (node in src tree of dst node 2)
+ #
+ # The next 'policy add' adds dst '10.0.0.0/23', which means
+ # that dst node 1 and dst node 2 have to be merged including
+ # the sub-tree. As no duplicates are allowed, policies in
+ # the two '10.0.2.0/24' are also merged.
+ #
+ # after the 'add', internal search tree should look like this:
+ # 10.0.0.0/23 (node in dst tree of bin)
+ # 10.0.2.0/24 (node in src tree of dst node)
+ # 10.1.0.0/24 (node in src tree of dst node)
+ # 10.2.0.0/24 (node in src tree of dst node)
+ #
+ # 10.0.0.0/24 and 10.0.1.0/24 nodes have been merged as 10.0.0.0/23.
+ ip -net $ns xfrm policy add src 10.1.0.0/24 dst 10.0.0.0/23 dir fwd priority 200 action block
+
+ # similar to above: add policies (with partially random address), with shrinking prefixes.
+ # errors from these adds are discarded (2>/dev/null)
+ for p in 29 28 27;do
+ for k in $(seq 1 32); do
+ ip -net $ns xfrm policy add src 10.253.1.$((RANDOM%255))/$p dst 10.254.1.$((RANDOM%255))/$p dir fwd priority $((200+k)) action block 2>/dev/null
+ done
+ done
+}
+
+do_esp_policy_get_check() {
+ local ns=$1
+ local lnet=$2
+ local rnet=$3
+
+ ip -net $ns xfrm policy get src $lnet dst $rnet dir out > /dev/null
+ if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then
+ policy_checks_ok=0
+ echo "FAIL: ip -net $ns xfrm policy get src $lnet dst $rnet dir out"
+ ret=1
+ fi
+
+ ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd > /dev/null
+ if [ $? -ne 0 ] && [ $policy_checks_ok -eq 1 ] ;then
+ policy_checks_ok=0
+ echo "FAIL: ip -net $ns xfrm policy get src $rnet dst $lnet dir fwd"
+ ret=1
+ fi
+}
+
+# Install the exception policies in namespace $1.
+# $2: local tunnel address, $3: remote tunnel address,
+# $4: destination that must use the tunnel, $5: network that bypasses it.
+do_exception() {
+	local ns=$1
+	local me=$2
+	local remote=$3
+	local encryptip=$4
+	local plain=$5
+
+	# network $plain passes without tunnel
+	ip -net $ns xfrm policy add dst $plain dir out priority 10 action allow
+
+	# direct policy for $encryptip, use tunnel, higher prio takes precedence
+	ip -net $ns xfrm policy add dst $encryptip dir out \
+		tmpl src $me dst $remote proto esp mode tunnel \
+		priority 1 action allow
+}
+
+# policies that are not supposed to match any packets generated in this test.
+# Add IPv4 dummy block policies to namespace $1. The loops only print the
+# commands; the whole list is fed to a single "ip -batch" run.
+do_dummies4() {
+ local ns=$1
+
+ for i in $(seq 10 16);do
+ # dummy policy with wildcard src/dst.
+ echo netns exec $ns ip xfrm policy add src 0.0.0.0/0 dst 10.$i.99.0/30 dir out action block
+ echo netns exec $ns ip xfrm policy add src 10.$i.99.0/30 dst 0.0.0.0/0 dir out action block
+ for j in $(seq 32 64);do
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/30 dst 10.$i.$j.0/30 dir out action block
+ # silly, as it encompasses the one above too, but it's allowed:
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/29 dst 10.$i.$j.0/29 dir out action block
+ # and yet again, even more broad one.
+ echo netns exec $ns ip xfrm policy add src 10.$i.1.0/24 dst 10.$i.$j.0/24 dir out action block
+ echo netns exec $ns ip xfrm policy add src 10.$i.$j.0/24 dst 10.$i.1.0/24 dir fwd action block
+ done
+ done | ip -batch /dev/stdin
+}
+
+# Add IPv6 dummy block policies to namespace $1. The loops only print the
+# commands; the whole list is fed to a single "ip -batch" run.
+do_dummies6() {
+ local ns=$1
+
+ for i in $(seq 10 16);do
+ for j in $(seq 32 64);do
+ echo netns exec $ns ip xfrm policy add src dead:$i::/64 dst dead:$i:$j::/64 dir out action block
+ echo netns exec $ns ip xfrm policy add src dead:$i:$j::/64 dst dead:$i::/24 dir fwd action block
+ done
+ done | ip -batch /dev/stdin
+}
+
+check_ipt_policy_count()
+{
+ ns=$1
+
+ ip netns exec $ns iptables-save -c |grep policy | ( read c rest
+ ip netns exec $ns iptables -Z
+ if [ x"$c" = x'[0:0]' ]; then
+ exit 0
+ elif [ x"$c" = x ]; then
+ echo "ERROR: No counters"
+ ret=1
+ exit 111
+ else
+ exit 1
+ fi
+ )
+}
+
+check_xfrm() {
+ # 0: iptables -m policy rule count == 0
+ # 1: iptables -m policy rule count != 0
+ rval=$1
+ ip=$2
+ local lret=0
+
+ ip netns exec ns1 ping -q -c 1 10.0.2.$ip > /dev/null
+
+ check_ipt_policy_count ns3
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+ check_ipt_policy_count ns4
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+
+ ip netns exec ns2 ping -q -c 1 10.0.1.$ip > /dev/null
+
+ check_ipt_policy_count ns3
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+ check_ipt_policy_count ns4
+ if [ $? -ne $rval ] ; then
+ lret=1
+ fi
+
+ return $lret
+}
+
+# Run the three tunnel/bypass checks and print PASS or FAIL for each.
+# $1: text appended in parentheses to every message.
+# Returns 0 when all three checks pass, 1 otherwise.
+check_exceptions()
+{
+	logpostfix="$1"
+	local lret=0
+
+	# ping to .254 should be excluded from the tunnel (exception is in place).
+	if check_xfrm 0 254; then
+		echo "PASS: ping to .254 bypassed ipsec tunnel ($logpostfix)"
+	else
+		echo "FAIL: expected ping to .254 to fail ($logpostfix)"
+		lret=1
+	fi
+
+	# ping to .253 should use ipsec due to direct policy exception.
+	if check_xfrm 1 253; then
+		echo "PASS: direct policy matches ($logpostfix)"
+	else
+		echo "FAIL: expected ping to .253 to use ipsec tunnel ($logpostfix)"
+		lret=1
+	fi
+
+	# ping to .2 should use ipsec.
+	if check_xfrm 1 2; then
+		echo "PASS: policy matches ($logpostfix)"
+	else
+		echo "FAIL: expected ping to .2 to use ipsec tunnel ($logpostfix)"
+		lret=1
+	fi
+
+	return $lret
+}
+
+# Repeat a policy update followed by an hthresh6 change ten times in ns1.
+# $1: text for the PASS/FAIL message.
+# Returns 0 when every command of every round succeeded; otherwise prints a
+# FAIL message to stderr, sets ret=1 and returns 1.
+check_hthresh_repeat()
+{
+	local log=$1
+	local ok=1
+	i=0
+
+	# Failure is tracked with a flag. Comparing the loop counter with 10
+	# after the loop misses a failure in the last round, where the counter
+	# has already reached 10.
+	for i in $(seq 1 10);do
+		ip -net ns1 xfrm policy update src e000:0001::0000 dst ff01::0014:0000:0001 dir in tmpl src :: dst :: proto esp mode tunnel priority 100 action allow || { ok=0; break; }
+		ip -net ns1 xfrm policy set hthresh6 0 28 || { ok=0; break; }
+
+		ip -net ns1 xfrm policy update src e000:0001::0000 dst ff01::01 dir in tmpl src :: dst :: proto esp mode tunnel priority 100 action allow || { ok=0; break; }
+		ip -net ns1 xfrm policy set hthresh6 0 28 || { ok=0; break; }
+	done
+
+	if [ $ok -ne 1 ] ;then
+		echo "FAIL: $log" 1>&2
+		ret=1
+		return 1
+	fi
+
+	echo "PASS: $log"
+	return 0
+}
+
+# insert non-overlapping policies in a random order and check that
+# all of them can be fetched using the traffic selectors.
+# This is done 100 times with IPv4 /24 selectors and then 100 times with
+# IPv6 /56 selectors; the policy table is flushed before every round.
+#
+# $1: namespace to operate on
+# $2: description used in the PASS/FAIL line
+# Returns 0 on success, 1 as soon as one policy cannot be fetched.
+check_random_order()
+{
+	local ns=$1
+	local log=$2
+	local i j addr
+
+	for i in $(seq 100); do
+		ip -net $ns xfrm policy flush
+		for j in $(seq 0 16 255 | sort -R); do
+			ip -net $ns xfrm policy add dst $j.0.0.0/24 dir out priority 10 action allow
+		done
+		for j in $(seq 0 16 255); do
+			if ! ip -net $ns xfrm policy get dst $j.0.0.0/24 dir out > /dev/null; then
+				echo "FAIL: $log" 1>&2
+				return 1
+			fi
+		done
+	done
+
+	for i in $(seq 100); do
+		ip -net $ns xfrm policy flush
+		for j in $(seq 0 16 255 | sort -R); do
+			addr=$(printf "e000:0000:%02x00::/56" $j)
+			ip -net $ns xfrm policy add dst $addr dir out priority 10 action allow
+		done
+		for j in $(seq 0 16 255); do
+			addr=$(printf "e000:0000:%02x00::/56" $j)
+			if ! ip -net $ns xfrm policy get dst $addr dir out > /dev/null; then
+				echo "FAIL: $log" 1>&2
+				return 1
+			fi
+		done
+	done
+
+	ip -net $ns xfrm policy flush
+
+	echo "PASS: $log"
+	return 0
+}
+
+# Prerequisites: root privileges plus the ip and iptables tools.
+# Any missing prerequisite ends the script with the kselftest skip code.
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit $ksft_skip
+fi
+
+if ! ip -Version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without the ip tool"
+	exit $ksft_skip
+fi
+
+# iptables is needed to check if policy lookup got valid ipsec result
+if ! iptables --version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without iptables tool"
+	exit $ksft_skip
+fi
+
+for i in 1 2 3 4; do
+ ip netns add ns$i
+ ip -net ns$i link set lo up
+done
+
+DEV=veth0
+ip link add $DEV netns ns1 type veth peer name eth1 netns ns3
+ip link add $DEV netns ns2 type veth peer name eth1 netns ns4
+
+ip link add $DEV netns ns3 type veth peer name veth0 netns ns4
+
+DEV=veth0
+for i in 1 2; do
+ ip -net ns$i link set $DEV up
+ ip -net ns$i addr add 10.0.$i.2/24 dev $DEV
+ ip -net ns$i addr add dead:$i::2/64 dev $DEV
+
+ ip -net ns$i addr add 10.0.$i.253 dev $DEV
+ ip -net ns$i addr add 10.0.$i.254 dev $DEV
+ ip -net ns$i addr add dead:$i::fd dev $DEV
+ ip -net ns$i addr add dead:$i::fe dev $DEV
+done
+
+for i in 3 4; do
+ip -net ns$i link set eth1 up
+ip -net ns$i link set veth0 up
+done
+
+ip -net ns1 route add default via 10.0.1.1
+ip -net ns2 route add default via 10.0.2.1
+
+ip -net ns3 addr add 10.0.1.1/24 dev eth1
+ip -net ns3 addr add 10.0.3.1/24 dev veth0
+ip -net ns3 addr add 2001:1::1/64 dev eth1
+ip -net ns3 addr add 2001:3::1/64 dev veth0
+
+ip -net ns3 route add default via 10.0.3.10
+
+ip -net ns4 addr add 10.0.2.1/24 dev eth1
+ip -net ns4 addr add 10.0.3.10/24 dev veth0
+ip -net ns4 addr add 2001:2::1/64 dev eth1
+ip -net ns4 addr add 2001:3::10/64 dev veth0
+ip -net ns4 route add default via 10.0.3.1
+
+for j in 4 6; do
+ for i in 3 4;do
+ ip netns exec ns$i sysctl net.ipv$j.conf.eth1.forwarding=1 > /dev/null
+ ip netns exec ns$i sysctl net.ipv$j.conf.veth0.forwarding=1 > /dev/null
+ done
+done
+
+# abuse iptables rule counter to check if ping matches a policy
+ip netns exec ns3 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec
+ip netns exec ns4 iptables -p icmp -A FORWARD -m policy --dir out --pol ipsec
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not insert iptables rule"
+ for i in 1 2 3 4;do ip netns del ns$i;done
+ exit $ksft_skip
+fi
+
+# localip remoteip localnet remotenet
+do_esp ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+do_esp ns3 dead:3::1 dead:3::10 dead:1::/64 dead:2::/64 $SPI1 $SPI2
+do_esp ns4 10.0.3.10 10.0.3.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+do_esp ns4 dead:3::10 dead:3::1 dead:2::/64 dead:1::/64 $SPI2 $SPI1
+
+do_dummies4 ns3
+do_dummies6 ns4
+
+do_esp_policy_get_check ns3 10.0.1.0/24 10.0.2.0/24
+do_esp_policy_get_check ns4 10.0.2.0/24 10.0.1.0/24
+do_esp_policy_get_check ns3 dead:1::/64 dead:2::/64
+do_esp_policy_get_check ns4 dead:2::/64 dead:1::/64
+
+# ping to .254 should use ipsec, exception is not installed.
+check_xfrm 1 254
+if [ $? -ne 0 ]; then
+ echo "FAIL: expected ping to .254 to use ipsec tunnel"
+ ret=1
+else
+ echo "PASS: policy before exception matches"
+fi
+
+# installs exceptions
+# localip remoteip encryptdst plaindst
+do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28
+do_exception ns4 10.0.3.10 10.0.3.1 10.0.1.253 10.0.1.240/28
+
+do_exception ns3 dead:3::1 dead:3::10 dead:2::fd dead:2:f0::/96
+do_exception ns4 dead:3::10 dead:3::1 dead:1::fd dead:1:f0::/96
+
+check_exceptions "exceptions"
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+# insert block policies with adjacent/overlapping netmasks
+do_overlap ns3
+
+check_exceptions "exceptions and block policies"
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+for n in ns3 ns4;do
+ ip -net $n xfrm policy set hthresh4 28 24 hthresh6 126 125
+ sleep $((RANDOM%5))
+done
+
+check_exceptions "exceptions and block policies after hresh changes"
+
+# full flush of policy db, check everything gets freed incl. internal meta data
+ip -net ns3 xfrm policy flush
+
+do_esp_policy ns3 10.0.3.1 10.0.3.10 10.0.1.0/24 10.0.2.0/24
+do_exception ns3 10.0.3.1 10.0.3.10 10.0.2.253 10.0.2.240/28
+
+# move inexact policies to hash table
+ip -net ns3 xfrm policy set hthresh4 16 16
+
+sleep $((RANDOM%5))
+check_exceptions "exceptions and block policies after hthresh change in ns3"
+
+# restore original hthresh settings -- move policies back to tables
+for n in ns3 ns4;do
+ ip -net $n xfrm policy set hthresh4 32 32 hthresh6 128 128
+ sleep $((RANDOM%5))
+done
+check_exceptions "exceptions and block policies after htresh change to normal"
+
+check_hthresh_repeat "policies with repeated htresh change"
+
+check_random_order ns3 "policies inserted in random order"
+
+for i in 1 2 3 4;do ip netns del ns$i;done
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/.gitignore b/tools/testing/selftests/netfilter/.gitignore
new file mode 100644
index 000000000..8448f74ad
--- /dev/null
+++ b/tools/testing/selftests/netfilter/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+nf-queue
diff --git a/tools/testing/selftests/netfilter/Makefile b/tools/testing/selftests/netfilter/Makefile
new file mode 100644
index 000000000..a56cfc4f2
--- /dev/null
+++ b/tools/testing/selftests/netfilter/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for netfilter selftests
+
+TEST_PROGS := nft_trans_stress.sh nft_nat.sh bridge_brouter.sh \
+ conntrack_icmp_related.sh nft_flowtable.sh ipvs.sh \
+ nft_concat_range.sh nft_conntrack_helper.sh \
+ nft_queue.sh nft_meta.sh \
+ conntrack_vrf.sh
+
+LDLIBS = -lmnl
+TEST_GEN_FILES = nf-queue
+
+include ../lib.mk
diff --git a/tools/testing/selftests/netfilter/bridge_brouter.sh b/tools/testing/selftests/netfilter/bridge_brouter.sh
new file mode 100755
index 000000000..29f3955b9
--- /dev/null
+++ b/tools/testing/selftests/netfilter/bridge_brouter.sh
@@ -0,0 +1,146 @@
+#!/bin/bash
+#
+# This test is for bridge 'brouting', i.e. make some packets being routed
+# rather than getting bridged even though they arrive on interface that is
+# part of a bridge.
+
+# eth0 br0 eth0
+# setup is: ns1 <-> ns0 <-> ns2
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+ebtables -V > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ebtables"
+ exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+ip netns add ns0
+ip netns add ns1
+ip netns add ns2
+
+# From here on the namespaces exist, so every early exit removes them again.
+ip link add veth0 netns ns0 type veth peer name eth0 netns ns1
+if [ $? -ne 0 ]; then
+	echo "SKIP: Can't create veth device"
+	for i in 0 1 2; do ip netns del ns$i;done
+	exit $ksft_skip
+fi
+ip link add veth1 netns ns0 type veth peer name eth0 netns ns2
+
+ip -net ns0 link set lo up
+ip -net ns0 link set veth0 up
+ip -net ns0 link set veth1 up
+
+ip -net ns0 link add br0 type bridge
+if [ $? -ne 0 ]; then
+	echo "SKIP: Can't create bridge br0"
+	for i in 0 1 2; do ip netns del ns$i;done
+	exit $ksft_skip
+fi
+
+ip -net ns0 link set veth0 master br0
+ip -net ns0 link set veth1 master br0
+ip -net ns0 link set br0 up
+ip -net ns0 addr add 10.0.0.1/24 dev br0
+
+# place both in same subnet, ns1 and ns2 connected via ns0:br0
+for i in 1 2; do
+ ip -net ns$i link set lo up
+ ip -net ns$i link set eth0 up
+ ip -net ns$i addr add 10.0.0.1$i/24 dev eth0
+done
+
+test_ebtables_broute()
+{
+ local cipt
+
+ # redirect is needed so the dstmac is rewritten to the bridge itself,
+ # ip stack won't process OTHERHOST (foreign unicast mac) packets.
+ ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add ebtables broute redirect rule"
+ return $ksft_skip
+ fi
+
+ # ping netns1, expected to not work (ip forwarding is off)
+ ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ echo "ERROR: ping works, should have failed" 1>&2
+ return 1
+ fi
+
+ # enable forwarding on both interfaces.
+ # neither needs an ip address, but at least the bridge needs
+ # an ip address in same network segment as ns1 and ns2 (ns0
+ # needs to be able to determine route for to-be-forwarded packet).
+ ip netns exec ns0 sysctl -q net.ipv4.conf.veth0.forwarding=1
+ ip netns exec ns0 sysctl -q net.ipv4.conf.veth1.forwarding=1
+
+ sleep 1
+
+ ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null
+ if [ $? -ne 0 ]; then
+ echo "ERROR: ping did not work, but it should (broute+forward)" 1>&2
+ return 1
+ fi
+
+ echo "PASS: ns1/ns2 connectivity with active broute rule"
+ ip netns exec ns0 ebtables -t broute -F
+
+ # ping netns1, expected to work (frames are bridged)
+ ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null
+ if [ $? -ne 0 ]; then
+ echo "ERROR: ping did not work, but it should (bridged)" 1>&2
+ return 1
+ fi
+
+ ip netns exec ns0 ebtables -t filter -A FORWARD -p ipv4 --ip-protocol icmp -j DROP
+
+ # ping netns1, expected to not work (DROP in bridge forward)
+ ip netns exec ns1 ping -q -c 1 10.0.0.12 > /dev/null 2>&1
+ if [ $? -eq 0 ]; then
+ echo "ERROR: ping works, should have failed (icmp forward drop)" 1>&2
+ return 1
+ fi
+
+ # re-activate brouter
+ ip netns exec ns0 ebtables -t broute -A BROUTING -p ipv4 --ip-protocol icmp -j redirect --redirect-target=DROP
+
+ ip netns exec ns2 ping -q -c 1 10.0.0.11 > /dev/null
+ if [ $? -ne 0 ]; then
+ echo "ERROR: ping did not work, but it should (broute+forward 2)" 1>&2
+ return 1
+ fi
+
+ echo "PASS: ns1/ns2 connectivity with active broute rule and bridge forward drop"
+ return 0
+}
+
+# test basic connectivity
+ip netns exec ns1 ping -c 1 -q 10.0.0.12 > /dev/null
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not reach ns2 from ns1" 1>&2
+	ret=1
+fi
+
+ip netns exec ns2 ping -c 1 -q 10.0.0.11 > /dev/null
+if [ $? -ne 0 ]; then
+	echo "ERROR: Could not reach ns1 from ns2" 1>&2
+	ret=1
+fi
+
+if [ $ret -eq 0 ];then
+	echo "PASS: netns connectivity: ns1 and ns2 can reach each other"
+fi
+
+test_ebtables_broute
+rc=$?
+# Only take over the broute result if basic connectivity was fine;
+# an earlier connectivity failure must stay visible in the exit code.
+if [ $ret -eq 0 ]; then
+	ret=$rc
+fi
+for i in 0 1 2; do ip netns del ns$i;done
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/config b/tools/testing/selftests/netfilter/config
new file mode 100644
index 000000000..4faf2ce02
--- /dev/null
+++ b/tools/testing/selftests/netfilter/config
@@ -0,0 +1,8 @@
+CONFIG_NET_NS=y
+CONFIG_NF_TABLES_INET=y
+CONFIG_NFT_QUEUE=m
+CONFIG_NFT_NAT=m
+CONFIG_NFT_REDIR=m
+CONFIG_NFT_MASQ=m
+CONFIG_NFT_FLOW_OFFLOAD=m
+CONFIG_NF_CT_NETLINK=m
diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh
new file mode 100755
index 000000000..76645aaf2
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh
@@ -0,0 +1,315 @@
+#!/bin/bash
+#
+# check that ICMP df-needed/pkttoobig errors are set as related
+# state
+#
+# Setup is:
+#
+# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2
+# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280).
+# ping nsclient2 from nsclient1, checking that conntrack did set RELATED
+# 'fragmentation needed' icmp packet.
+#
+# In addition, nsrouter1 will perform IP masquerading, i.e. also
+# check the icmp errors are propagated to the correct host as per
+# nat of "established" icmp-echo "connection".
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without nft tool"
+ exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+# Remove the two client and the two router namespaces used by this test.
+cleanup() {
+	for i in 1 2; do
+		ip netns del nsclient$i
+	done
+	for i in 1 2; do
+		ip netns del nsrouter$i
+	done
+}
+
+trap cleanup EXIT
+
+# Print the IPv4 address 192.168.$1.2 without a trailing newline.
+ipv4() {
+	printf '%s' "192.168.$1.2"
+}
+
+# Print the IPv6 address dead:$1::2 without a trailing newline.
+ipv6 () {
+	printf '%s' "dead:$1::2"
+}
+
+# Check that a named counter in the "inet filter" table of a namespace
+# lists the expected packet/byte values.
+#
+# $1: namespace
+# $2: counter name
+# $3: string that must appear in the counter listing
+# Returns 0 on match. Otherwise prints an error plus the actual counter
+# to stderr and returns 1.
+check_counter()
+{
+	local ns=$1
+	local name=$2
+	local expect=$3
+
+	# grep -q produces no output, only its exit status matters here
+	if ! ip netns exec $ns nft list counter inet filter "$name" | grep -q "$expect"; then
+		echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
+		ip netns exec $ns nft list counter inet filter "$name" 1>&2
+		return 1
+	fi
+
+	return 0
+}
+
+# Verify that the "unknown" counter is still zero in all four namespaces.
+# Returns 1 at the first namespace where it is not, 0 otherwise.
+check_unknown()
+{
+	expect="packets 0 bytes 0"
+	for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
+		check_counter $n "unknown" "$expect" || return 1
+	done
+
+	return 0
+}
+
+for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
+ ip netns add $n
+ ip -net $n link set lo up
+done
+
+DEV=veth0
+ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1
+DEV=veth0
+ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2
+
+DEV=veth0
+ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2
+
+DEV=veth0
+for i in 1 2; do
+ ip -net nsclient$i link set $DEV up
+ ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV
+ ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV
+done
+
+ip -net nsrouter1 link set eth1 up
+ip -net nsrouter1 link set veth0 up
+
+ip -net nsrouter2 link set eth1 up
+ip -net nsrouter2 link set eth2 up
+
+# Default routes of the clients point at their router; IPv6 routes are
+# given with -6 throughout.
+ip -net nsclient1 route add default via 192.168.1.1
+ip -net nsclient1 -6 route add default via dead:1::1
+
+ip -net nsclient2 route add default via 192.168.2.1
+ip -net nsclient2 -6 route add default via dead:2::1
+
+# nsrouter1: eth1 faces nsclient1, veth0 faces nsrouter2
+ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1
+ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0
+ip -net nsrouter1 addr add dead:1::1/64 dev eth1
+ip -net nsrouter1 addr add dead:3::1/64 dev veth0
+ip -net nsrouter1 route add default via 192.168.3.10
+ip -net nsrouter1 -6 route add default via dead:3::10
+
+# nsrouter2: eth1 faces nsclient2, eth2 faces nsrouter1
+ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1
+ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2
+ip -net nsrouter2 addr add dead:2::1/64 dev eth1
+ip -net nsrouter2 addr add dead:3::10/64 dev eth2
+ip -net nsrouter2 route add default via 192.168.3.1
+ip -net nsrouter2 -6 route add default via dead:3::1
+
+sleep 2
+for i in 4 6; do
+ ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1
+ ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1
+done
+
+for netns in nsrouter1 nsrouter2; do
+ip netns exec $netns nft -f - <<EOF
+table inet filter {
+ counter unknown { }
+ counter related { }
+ chain forward {
+ type filter hook forward priority 0; policy accept;
+ meta l4proto icmpv6 icmpv6 type "packet-too-big" ct state "related" counter name "related" accept
+ meta l4proto icmp icmp type "destination-unreachable" ct state "related" counter name "related" accept
+ meta l4proto { icmp, icmpv6 } ct state new,established accept
+ counter name "unknown" drop
+ }
+}
+EOF
+done
+
+ip netns exec nsclient1 nft -f - <<EOF
+table inet filter {
+ counter unknown { }
+ counter related { }
+ counter redir4 { }
+ counter redir6 { }
+ chain input {
+ type filter hook input priority 0; policy accept;
+
+ icmp type "redirect" ct state "related" counter name "redir4" accept
+ icmpv6 type "nd-redirect" ct state "related" counter name "redir6" accept
+
+ meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+ meta l4proto { icmp, icmpv6 } ct state "related" counter name "related" accept
+
+ counter name "unknown" drop
+ }
+}
+EOF
+
+ip netns exec nsclient2 nft -f - <<EOF
+table inet filter {
+ counter unknown { }
+ counter new { }
+ counter established { }
+
+ chain input {
+ type filter hook input priority 0; policy accept;
+ meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+
+ meta l4proto { icmp, icmpv6 } ct state "new" counter name "new" accept
+ meta l4proto { icmp, icmpv6 } ct state "established" counter name "established" accept
+ counter name "unknown" drop
+ }
+ chain output {
+ type filter hook output priority 0; policy accept;
+ meta l4proto { icmp, icmpv6 } ct state established,untracked accept
+
+ meta l4proto { icmp, icmpv6 } ct state "new" counter name "new"
+ meta l4proto { icmp, icmpv6 } ct state "established" counter name "established"
+ counter name "unknown" drop
+ }
+}
+EOF
+
+
+# make sure NAT core rewrites the address of the icmp error, if nat is used,
+# according to conntrack nat information (icmp error will be directed at
+# nsrouter1 address, but it needs to be routed to nsclient1 address).
+ip netns exec nsrouter1 nft -f - <<EOF
+table ip nat {
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ ip protocol icmp oifname "veth0" counter masquerade
+ }
+}
+table ip6 nat {
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ ip6 nexthdr icmpv6 oifname "veth0" counter masquerade
+ }
+}
+EOF
+
+ip netns exec nsrouter2 ip link set eth1 mtu 1280
+ip netns exec nsclient2 ip link set veth0 mtu 1280
+sleep 1
+
+# Sanity check: both address families must work before the real test.
+# No explicit cleanup call on failure: the EXIT trap already runs cleanup,
+# calling it here as well would try to delete every namespace twice.
+ip netns exec nsclient1 ping -c 1 -s 1000 -q -M do 192.168.2.2 >/dev/null
+if [ $? -ne 0 ]; then
+	echo "ERROR: netns ip routing/connectivity broken" 1>&2
+	exit 1
+fi
+ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null
+if [ $? -ne 0 ]; then
+	echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2
+	exit 1
+fi
+
+check_unknown
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+expect="packets 0 bytes 0"
+for netns in nsrouter1 nsrouter2 nsclient1;do
+ check_counter "$netns" "related" "$expect"
+ if [ $? -ne 0 ]; then
+ ret=1
+ fi
+done
+
+expect="packets 2 bytes 2076"
+check_counter nsclient2 "new" "$expect"
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null
+if [ $? -eq 0 ]; then
+ echo "ERROR: ping should have failed with PMTU too big error" 1>&2
+ ret=1
+fi
+
+# nsrouter2 should have generated the icmp error, so its
+# related counter should be 0 (the counter is in the forward chain).
+expect="packets 0 bytes 0"
+check_counter "nsrouter2" "related" "$expect"
+if [ $? -ne 0 ]; then
+ ret=1
+fi
+
+# but nsrouter1 should have seen it, same for nsclient1.
+expect="packets 1 bytes 576"
+for netns in nsrouter1 nsclient1;do
+ check_counter "$netns" "related" "$expect"
+ if [ $? -ne 0 ]; then
+ ret=1
+ fi
+done
+
+ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null
+if [ $? -eq 0 ]; then
+ echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2
+ ret=1
+fi
+
+expect="packets 2 bytes 1856"
+for netns in nsrouter1 nsclient1;do
+ check_counter "$netns" "related" "$expect"
+ if [ $? -ne 0 ]; then
+ ret=1
+ fi
+done
+
+if [ $ret -eq 0 ];then
+ echo "PASS: icmp mtu error had RELATED state"
+else
+ echo "ERROR: icmp error RELATED state test has failed"
+fi
+
+# add 'bad' route, expect icmp REDIRECT to be generated
+ip netns exec nsclient1 ip route add 192.168.1.42 via 192.168.1.1
+ip netns exec nsclient1 ip route add dead:1::42 via dead:1::1
+
+ip netns exec "nsclient1" ping -q -c 2 192.168.1.42 > /dev/null
+
+expect="packets 1 bytes 112"
+check_counter nsclient1 "redir4" "$expect"
+if [ $? -ne 0 ];then
+ ret=1
+fi
+
+ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null
+expect="packets 1 bytes 192"
+check_counter nsclient1 "redir6" "$expect"
+if [ $? -ne 0 ];then
+ ret=1
+fi
+
+if [ $ret -eq 0 ];then
+ echo "PASS: icmp redirects had RELATED state"
+else
+ echo "ERROR: icmp redirect RELATED state test has failed"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/conntrack_vrf.sh b/tools/testing/selftests/netfilter/conntrack_vrf.sh
new file mode 100755
index 000000000..8b5ea9234
--- /dev/null
+++ b/tools/testing/selftests/netfilter/conntrack_vrf.sh
@@ -0,0 +1,241 @@
+#!/bin/sh
+
+# This script demonstrates interaction of conntrack and vrf.
+# The vrf driver calls the netfilter hooks again, with oif/iif
+# pointing at the VRF device.
+#
+# For ingress, this means the first iteration has the iifname of the
+# lower/real device. In this script, that's veth0.
+# Second iteration is iifname set to vrf device, tvrf in this script.
+#
+# For egress, this is reversed: first iteration has the vrf device,
+# second iteration is done with the lower/real/veth0 device.
+#
+# test_ct_zone_in demonstrates an unexpected change of nftables
+# behavior caused by commit 09e856d54bda5f28 "vrf: Reset skb conntrack
+# connection on VRF rcv".
+#
+# It was possible to assign a conntrack zone to a packet (or mark it for
+# `notracking`) in the prerouting chain before conntrack, based on real iif.
+#
+# After the change, that zone assignment is lost. Instead, the zone is
+# assigned by the rule whose `iif` matches the VRF master interface (in
+# case such a rule exists).
+# Thus it is impossible to distinguish packets based on the original
+# interface.
+#
+# test_masquerade_vrf and test_masquerade_veth0 demonstrate the problem
+# that was supposed to be fixed by the commit mentioned above to make sure
+# that any fix to test case 1 won't break masquerade again.
+
+ksft_skip=4
+
+IP0=172.30.30.1
+IP1=172.30.30.2
+PFXL=30
+ret=0
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns0="ns0-$sfx"
+ns1="ns1-$sfx"
+
+# Kill whatever is still running inside the two test namespaces, then
+# remove the namespaces. Installed as the EXIT trap handler.
+cleanup()
+{
+	ip netns pids $ns0 | xargs kill 2>/dev/null
+	ip netns pids $ns1 | xargs kill 2>/dev/null
+
+	# one namespace per call: "ip netns del" takes a single name
+	ip netns del $ns0
+	ip netns del $ns1
+}
+
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without nft tool"
+ exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+ip netns add "$ns0"
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not create net namespace $ns0"
+ exit $ksft_skip
+fi
+ip netns add "$ns1"
+
+trap cleanup EXIT
+
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.default.rp_filter=0
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+ip netns exec $ns0 sysctl -q -w net.ipv4.conf.all.rp_filter=0
+
+ip link add veth0 netns "$ns0" type veth peer name veth0 netns "$ns1" > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not add veth device"
+ exit $ksft_skip
+fi
+
+ip -net $ns0 li add tvrf type vrf table 9876
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not add vrf device"
+ exit $ksft_skip
+fi
+
+ip -net $ns0 li set lo up
+
+ip -net $ns0 li set veth0 master tvrf
+ip -net $ns0 li set tvrf up
+ip -net $ns0 li set veth0 up
+ip -net $ns1 li set veth0 up
+
+ip -net $ns0 addr add $IP0/$PFXL dev veth0
+ip -net $ns1 addr add $IP1/$PFXL dev veth0
+
+# "$?" right after starting a background job does not tell whether the
+# command could be run, so check for the tool before starting the server.
+if ! command -v iperf3 > /dev/null 2>&1; then
+	echo "SKIP: Could not start iperf3"
+	exit $ksft_skip
+fi
+ip netns exec $ns1 iperf3 -s > /dev/null 2>&1 &
+
+# test vrf ingress handling.
+# The incoming connection should be placed in conntrack zone 1,
+# as decided by the first iteration of the ruleset.
+#
+# Loads the "testct" ruleset into $ns0, sends one ping from $ns1 to $IP0
+# and then looks for the resulting icmp entry in conntrack zone 1.
+# Prints PASS or FAIL; on FAIL it also reports whether the entry ended up
+# in zone 2 and otherwise dumps the conntrack table and the ruleset.
+# NOTE(review): the FAIL paths do not set ret, so a failure here does not
+# change the exit status of the script -- confirm this is intended.
+test_ct_zone_in()
+{
+ip netns exec $ns0 nft -f - <<EOF
+table testct {
+ chain rawpre {
+ type filter hook prerouting priority raw;
+
+ iif { veth0, tvrf } counter meta nftrace set 1
+ iif veth0 counter ct zone set 1 counter return
+ iif tvrf counter ct zone set 2 counter return
+ ip protocol icmp counter
+ notrack counter
+ }
+
+ chain rawout {
+ type filter hook output priority raw;
+
+ oif veth0 counter ct zone set 1 counter return
+ oif tvrf counter ct zone set 2 counter return
+ notrack counter
+ }
+}
+EOF
+ ip netns exec $ns1 ping -W 1 -c 1 -I veth0 $IP0 > /dev/null
+
+ # should be in zone 1, not zone 2
+ # (count is the number of lines conntrack -L prints for that zone)
+ count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 1 2>/dev/null | wc -l)
+ if [ $count -eq 1 ]; then
+ echo "PASS: entry found in conntrack zone 1"
+ else
+ echo "FAIL: entry not found in conntrack zone 1"
+ count=$(ip netns exec $ns0 conntrack -L -s $IP1 -d $IP0 -p icmp --zone 2 2> /dev/null | wc -l)
+ if [ $count -eq 1 ]; then
+ echo "FAIL: entry found in zone 2 instead"
+ else
+ echo "FAIL: entry not in zone 1 or 2, dumping table"
+ ip netns exec $ns0 conntrack -L
+ ip netns exec $ns0 nft list ruleset
+ fi
+ fi
+}
+
+# add masq rule that gets evaluated w. outif set to vrf device.
+# This tests the first iteration of the packet through conntrack,
+# oifname is the vrf device.
+#
+# $1: qdisc to attach to tvrf for the duration of the test, or "default"
+#     to leave the device's qdisc untouched
+# Sets ret to 1 on failure.
+# NOTE(review): the early return after an iperf3 failure skips the qdisc
+# removal at the end -- confirm that is acceptable.
+test_masquerade_vrf()
+{
+ local qdisc=$1
+
+ if [ "$qdisc" != "default" ]; then
+ tc -net $ns0 qdisc add dev tvrf root $qdisc
+ fi
+
+ # start from an empty conntrack table
+ ip netns exec $ns0 conntrack -F 2>/dev/null
+
+ip netns exec $ns0 nft -f - <<EOF
+flush ruleset
+table ip nat {
+ chain rawout {
+ type filter hook output priority raw;
+
+ oif tvrf ct state untracked counter
+ }
+ chain postrouting2 {
+ type filter hook postrouting priority mangle;
+
+ oif tvrf ct state untracked counter
+ }
+ chain postrouting {
+ type nat hook postrouting priority 0;
+ # NB: masquerade should always be combined with 'oif(name) bla',
+ # lack of this is intentional here, we want to exercise double-snat.
+ ip saddr 172.30.30.0/30 counter masquerade random
+ }
+}
+EOF
+ # one-second iperf3 run from inside the vrf to the server at $IP1
+ ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 >/dev/null
+ if [ $? -ne 0 ]; then
+ echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on vrf device"
+ ret=1
+ return
+ fi
+
+ # must also check that nat table was evaluated on second (lower device) iteration.
+ # Both greps have to match: a counter showing 2 packets and a non-zero
+ # "untracked" counter.
+ ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2' &&
+ ip netns exec $ns0 nft list table ip nat |grep -q 'untracked counter packets [1-9]'
+ if [ $? -eq 0 ]; then
+ echo "PASS: iperf3 connect with masquerade + sport rewrite on vrf device ($qdisc qdisc)"
+ else
+ echo "FAIL: vrf rules have unexpected counter value"
+ ret=1
+ fi
+
+ if [ "$qdisc" != "default" ]; then
+ tc -net $ns0 qdisc del dev tvrf root
+ fi
+}
+
+# add masq rule that gets evaluated w. outif set to veth device.
+# This tests the 2nd iteration of the packet through conntrack,
+# oifname is the lower device (veth0 in this case).
+#
+# Takes no arguments. Sets ret to 1 on failure.
+test_masquerade_veth()
+{
+ # start from an empty conntrack table
+ ip netns exec $ns0 conntrack -F 2>/dev/null
+ip netns exec $ns0 nft -f - <<EOF
+flush ruleset
+table ip nat {
+ chain postrouting {
+ type nat hook postrouting priority 0;
+ meta oif veth0 ip saddr 172.30.30.0/30 counter masquerade random
+ }
+}
+EOF
+ # one-second iperf3 run from inside the vrf to the server at $IP1
+ ip netns exec $ns0 ip vrf exec tvrf iperf3 -t 1 -c $IP1 > /dev/null
+ if [ $? -ne 0 ]; then
+ echo "FAIL: iperf3 connect failure with masquerade + sport rewrite on veth device"
+ ret=1
+ return
+ fi
+
+ # must also check that nat table was evaluated on second (lower device) iteration.
+ ip netns exec $ns0 nft list table ip nat |grep -q 'counter packets 2'
+ if [ $? -eq 0 ]; then
+ echo "PASS: iperf3 connect with masquerade + sport rewrite on veth device"
+ else
+ echo "FAIL: vrf masq rule has unexpected counter value"
+ ret=1
+ fi
+}
+
+test_ct_zone_in
+test_masquerade_vrf "default"
+test_masquerade_vrf "pfifo"
+test_masquerade_veth
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/ipvs.sh b/tools/testing/selftests/netfilter/ipvs.sh
new file mode 100755
index 000000000..c3b8f90c4
--- /dev/null
+++ b/tools/testing/selftests/netfilter/ipvs.sh
@@ -0,0 +1,228 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# End-to-end ipvs test suite
+# Topology:
+#--------------------------------------------------------------+
+# | |
+# ns0 | ns1 |
+# ----------- | ----------- ----------- |
+# | veth01 | --------- | veth10 | | veth12 | |
+# ----------- peer ----------- ----------- |
+# | | | |
+# ----------- | | |
+# | br0 | |----------------- peer |--------------|
+# ----------- | | |
+# | | | |
+# ---------- peer ---------- ----------- |
+# | veth02 | --------- | veth20 | | veth21 | |
+# ---------- | ---------- ----------- |
+# | ns2 |
+# | |
+#--------------------------------------------------------------+
+#
+# We assume that all network drivers are loaded
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+GREEN='\033[0;92m'
+RED='\033[0;31m'
+NC='\033[0m' # No Color
+
+readonly port=8080
+
+readonly vip_v4=207.175.44.110
+readonly cip_v4=10.0.0.2
+readonly gip_v4=10.0.0.1
+readonly dip_v4=172.16.0.1
+readonly rip_v4=172.16.0.2
+readonly sip_v4=10.0.0.3
+
+readonly infile="$(mktemp)"
+readonly outfile="$(mktemp)"
+readonly datalen=32
+
+# The temporary files above already exist at this point and the EXIT trap
+# is only installed further down, so each SKIP exit removes them itself.
+sysipvsnet="/proc/sys/net/ipv4/vs/"
+if [ ! -d $sysipvsnet ]; then
+	modprobe -q ip_vs
+	if [ $? -ne 0 ]; then
+		echo "skip: could not run test without ipvs module"
+		rm -f "${infile}" "${outfile}"
+		exit $ksft_skip
+	fi
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+	echo "SKIP: Could not run test without ip tool"
+	rm -f "${infile}" "${outfile}"
+	exit $ksft_skip
+fi
+
+ipvsadm -v > /dev/null 2>&1
+if [ $? -ne 0 ]; then
+	echo "SKIP: Could not run test without ipvsadm"
+	rm -f "${infile}" "${outfile}"
+	exit $ksft_skip
+fi
+
+# Build the topology shown at the top of this file: namespaces ns0, ns1
+# and ns2, three veth pairs between them and bridge br0 in ns0, then
+# assign the addresses and fill ${infile} with ${datalen} random bytes.
+setup() {
+ ip netns add ns0
+ ip netns add ns1
+ ip netns add ns2
+
+ # veth pairs: ns0<->ns1, ns0<->ns2 and ns1<->ns2
+ ip link add veth01 netns ns0 type veth peer name veth10 netns ns1
+ ip link add veth02 netns ns0 type veth peer name veth20 netns ns2
+ ip link add veth12 netns ns1 type veth peer name veth21 netns ns2
+
+ # ns0: both veth ends are bridge ports, the address lives on br0
+ ip netns exec ns0 ip link set veth01 up
+ ip netns exec ns0 ip link set veth02 up
+ ip netns exec ns0 ip link add br0 type bridge
+ ip netns exec ns0 ip link set veth01 master br0
+ ip netns exec ns0 ip link set veth02 master br0
+ ip netns exec ns0 ip link set br0 up
+ ip netns exec ns0 ip addr add ${cip_v4}/24 dev br0
+
+ # ns1: ${gip_v4} towards ns0, ${dip_v4} towards ns2
+ ip netns exec ns1 ip link set lo up
+ ip netns exec ns1 ip link set veth10 up
+ ip netns exec ns1 ip addr add ${gip_v4}/24 dev veth10
+ ip netns exec ns1 ip link set veth12 up
+ ip netns exec ns1 ip addr add ${dip_v4}/24 dev veth12
+
+ # ns2: ${rip_v4} towards ns1, ${sip_v4} towards ns0
+ ip netns exec ns2 ip link set lo up
+ ip netns exec ns2 ip link set veth21 up
+ ip netns exec ns2 ip addr add ${rip_v4}/24 dev veth21
+ ip netns exec ns2 ip link set veth20 up
+ ip netns exec ns2 ip addr add ${sip_v4}/24 dev veth20
+
+ sleep 1
+
+ # payload that client_connect sends and verify_data compares
+ dd if=/dev/urandom of="${infile}" bs="${datalen}" count=1 status=none
+}
+
+cleanup() {
+ for i in 0 1 2
+ do
+ ip netns del ns$i > /dev/null 2>&1
+ done
+
+ if [ -f "${outfile}" ]; then
+ rm "${outfile}"
+ fi
+ if [ -f "${infile}" ]; then
+ rm "${infile}"
+ fi
+}
+
+# Start a background nc listener in ns2 that writes what it receives to
+# ${outfile}. The pid is stored in server_pid, then the function waits
+# 0.2 seconds before returning.
+server_listen() {
+	# listen on ${port}, the same port the ipvs service and client use
+	ip netns exec ns2 nc -l -p ${port} > "${outfile}" &
+	server_pid=$!
+	sleep 0.2
+}
+
+# Send ${infile} from ns0 to the virtual service address; the whole
+# attempt is limited to 2 seconds by timeout.
+client_connect() {
+	ip netns exec ns0 \
+		timeout 2 nc -w 1 ${vip_v4} ${port} < "${infile}"
+}
+
+# Wait for the background listener to exit, then compare what it received
+# with what was sent. The return status is that of cmp.
+verify_data() {
+	wait "${server_pid}"
+	cmp "${infile}" "${outfile}" 2>/dev/null
+}
+
+# One transfer through the virtual service: start the listener, send the
+# payload, compare. The return status is that of verify_data.
+test_service() {
+ server_listen
+ client_connect
+ verify_data
+}
+
+
+# DR mode test: ns1 hosts the virtual service ${vip_v4}:${port} with ns2
+# (${rip_v4}) as its only real server. The return status is that of
+# test_service.
+test_dr() {
+ # client reaches the virtual address via ns1
+ ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
+
+ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
+ ip netns exec ns1 ipvsadm -a -t ${vip_v4}:${port} -r ${rip_v4}:${port}
+ ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
+
+ # avoid incorrect arp response
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2
+ # avoid reverse route lookup
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0
+ # the real server also carries the virtual address
+ ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1
+
+ test_service
+}
+
+# NAT mode test: same virtual service as in test_dr, but the real server
+# is added with "ipvsadm -m". The return status is that of test_service.
+test_nat() {
+ ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
+
+ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=1
+ ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
+ ip netns exec ns1 ipvsadm -a -m -t ${vip_v4}:${port} -r ${rip_v4}:${port}
+ ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
+
+ # drop the direct ns2<->ns0 link and send everything from ns2 via ns1
+ ip netns exec ns2 ip link del veth20
+ ip netns exec ns2 ip route add default via ${dip_v4} dev veth21
+
+ test_service
+}
+
+# Tunnel mode test: the real server is added with "ipvsadm -i" and both
+# ns1 and ns2 bring up tunl0. The return status is that of test_service.
+test_tun() {
+ ip netns exec ns0 ip route add ${vip_v4} via ${gip_v4} dev br0
+
+ ip netns exec ns1 modprobe ipip
+ ip netns exec ns1 ip link set tunl0 up
+ # unlike the DR and NAT tests, ip_forward is switched off here
+ ip netns exec ns1 sysctl -qw net.ipv4.ip_forward=0
+ ip netns exec ns1 sysctl -qw net.ipv4.conf.all.send_redirects=0
+ ip netns exec ns1 sysctl -qw net.ipv4.conf.default.send_redirects=0
+ ip netns exec ns1 ipvsadm -A -t ${vip_v4}:${port} -s rr
+ ip netns exec ns1 ipvsadm -a -i -t ${vip_v4}:${port} -r ${rip_v4}:${port}
+ ip netns exec ns1 ip addr add ${vip_v4}/32 dev lo:1
+
+ ip netns exec ns2 modprobe ipip
+ ip netns exec ns2 ip link set tunl0 up
+ # same arp and rp_filter settings as in test_dr, plus tunl0
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_ignore=1
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.arp_announce=2
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.all.rp_filter=0
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.tunl0.rp_filter=0
+ ip netns exec ns2 sysctl -qw net.ipv4.conf.veth21.rp_filter=0
+ ip netns exec ns2 ip addr add ${vip_v4}/32 dev lo:1
+
+ test_service
+}
+
+run_tests() {
+ local errors=
+
+ echo "Testing DR mode..."
+ cleanup
+ setup
+ test_dr
+ errors=$(( $errors + $? ))
+
+ echo "Testing NAT mode..."
+ cleanup
+ setup
+ test_nat
+ errors=$(( $errors + $? ))
+
+ echo "Testing Tunnel mode..."
+ cleanup
+ setup
+ test_tun
+ errors=$(( $errors + $? ))
+
+ return $errors
+}
+
+trap cleanup EXIT
+
+run_tests
+
+# printf with %b is used for the colour escapes: this script runs under
+# #!/bin/sh, where "echo -e" is not portable.
+if [ $? -ne 0 ]; then
+	printf '%s: %bFAIL%b\n' "$(basename $0)" "${RED}" "${NC}"
+	exit 1
+fi
+printf '%s: %bPASS%b\n' "$(basename $0)" "${GREEN}" "${NC}"
+exit 0
diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c
new file mode 100644
index 000000000..9e56b9d47
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nf-queue.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <time.h>
+#include <arpa/inet.h>
+
+#include <libmnl/libmnl.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_queue.h>
+
+/* Run-time settings, filled in by main() and parse_opts(). */
+struct options {
+ /* -c: count packets per hook and print the statistics on exit */
+ bool count_packets;
+ /* true by default, cleared by -G; requests NFQA_CFG_F_GSO in open_queue() */
+ bool gso_enabled;
+ /* incremented once for every -v */
+ int verbose;
+ /* -q: queue number to bind to (values above 0xffff are reset to 0) */
+ unsigned int queue_num;
+ /* -t: receive timeout in seconds, 0 means no SO_RCVTIMEO is set */
+ unsigned int timeout;
+ /* NF_ACCEPT by default; -Q sets it to (dst_queue << 16) | NF_QUEUE */
+ uint32_t verdict;
+ /* -d: delay in milliseconds applied before each verdict is sent */
+ uint32_t delay_ms;
+};
+
+/* Per-hook packet counters, indexed by the hook number of each packet. */
+static unsigned int queue_stats[5];
+static struct options opts;
+
+/* Print the command-line synopsis to stdout; @p is the program name. */
+static void help(const char *p)
+{
+	static const char usage_fmt[] =
+		"Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n";
+
+	printf(usage_fmt, p);
+}
+
+/*
+ * Attribute callback handed to mnl_attr_parse() by queue_cb().
+ *
+ * @attr: attribute currently being visited
+ * @data: array of attribute pointers indexed by attribute type; the caller
+ *        passes an array of NFQA_MAX + 1 entries
+ *
+ * Known attribute types are validated, then the attribute is stored in the
+ * table. Returns MNL_CB_OK to continue, or MNL_CB_ERROR if validation fails.
+ */
+static int parse_attr_cb(const struct nlattr *attr, void *data)
+{
+ const struct nlattr **tb = data;
+ int type = mnl_attr_get_type(attr);
+
+ /* skip unsupported attribute in user-space */
+ if (mnl_attr_type_valid(attr, NFQA_MAX) < 0)
+ return MNL_CB_OK;
+
+ switch (type) {
+ /* these attributes must carry a 32-bit value */
+ case NFQA_MARK:
+ case NFQA_IFINDEX_INDEV:
+ case NFQA_IFINDEX_OUTDEV:
+ case NFQA_IFINDEX_PHYSINDEV:
+ case NFQA_IFINDEX_PHYSOUTDEV:
+ if (mnl_attr_validate(attr, MNL_TYPE_U32) < 0) {
+ perror("mnl_attr_validate");
+ return MNL_CB_ERROR;
+ }
+ break;
+ /* fixed-size structures: check the payload length */
+ case NFQA_TIMESTAMP:
+ if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+ sizeof(struct nfqnl_msg_packet_timestamp)) < 0) {
+ perror("mnl_attr_validate2");
+ return MNL_CB_ERROR;
+ }
+ break;
+ case NFQA_HWADDR:
+ if (mnl_attr_validate2(attr, MNL_TYPE_UNSPEC,
+ sizeof(struct nfqnl_msg_packet_hw)) < 0) {
+ perror("mnl_attr_validate2");
+ return MNL_CB_ERROR;
+ }
+ break;
+ /* payload is accepted without validation */
+ case NFQA_PAYLOAD:
+ break;
+ }
+ /* every other in-range attribute type is stored unvalidated */
+ tb[type] = attr;
+ return MNL_CB_OK;
+}
+
+/*
+ * Message callback for mnl_cb_run(): parse one queued-packet message,
+ * optionally print details about it and update the per-hook counters.
+ *
+ * @nlh:  netlink message received from the queue
+ * @data: unused
+ *
+ * Returns MNL_CB_ERROR for an unknown hook number. Otherwise returns
+ * MNL_CB_OK plus the packet id (0 if the message has no packet header);
+ * mainloop() subtracts MNL_CB_OK again to recover the id for the verdict.
+ */
+static int queue_cb(const struct nlmsghdr *nlh, void *data)
+{
+	struct nlattr *tb[NFQA_MAX+1] = { 0 };
+	struct nfqnl_msg_packet_hdr *ph = NULL;
+	uint32_t id = 0;
+
+	(void)data;
+
+	mnl_attr_parse(nlh, sizeof(struct nfgenmsg), parse_attr_cb, tb);
+	if (tb[NFQA_PACKET_HDR]) {
+		ph = mnl_attr_get_payload(tb[NFQA_PACKET_HDR]);
+		id = ntohl(ph->packet_id);
+
+		/* arguments must follow the order of the format: hook, hwproto */
+		if (opts.verbose > 0)
+			printf("packet hook=%u, hwproto 0x%x",
+				ph->hook, ntohs(ph->hw_protocol));
+
+		/* queue_stats[] only has room for hooks 0..4 */
+		if (ph->hook >= 5) {
+			fprintf(stderr, "Unknown hook %d\n", ph->hook);
+			return MNL_CB_ERROR;
+		}
+
+		if (opts.verbose > 0) {
+			uint32_t skbinfo = 0;
+
+			if (tb[NFQA_SKB_INFO])
+				skbinfo = ntohl(mnl_attr_get_u32(tb[NFQA_SKB_INFO]));
+			if (skbinfo & NFQA_SKB_CSUMNOTREADY)
+				printf(" csumnotready");
+			if (skbinfo & NFQA_SKB_GSO)
+				printf(" gso");
+			if (skbinfo & NFQA_SKB_CSUM_NOTVERIFIED)
+				printf(" csumnotverified");
+			/* terminate the line started by the first printf */
+			puts("");
+		}
+
+		if (opts.count_packets)
+			queue_stats[ph->hook]++;
+	}
+
+	return MNL_CB_OK + id;
+}
+
+/*
+ * Build an NFQNL_MSG_CONFIG request in @buf that carries @command as an
+ * NFQA_CFG_CMD attribute for queue @queue_num. Returns the message header,
+ * which lives inside @buf.
+ */
+static struct nlmsghdr *
+nfq_build_cfg_request(char *buf, uint8_t command, int queue_num)
+{
+	struct nfqnl_msg_config_cmd cfg_cmd;
+	struct nfgenmsg *genmsg;
+	struct nlmsghdr *hdr;
+
+	memset(&cfg_cmd, 0, sizeof(cfg_cmd));
+	cfg_cmd.pf = htons(AF_INET);
+	cfg_cmd.command = command;
+
+	hdr = mnl_nlmsg_put_header(buf);
+	hdr->nlmsg_flags = NLM_F_REQUEST;
+	hdr->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+
+	genmsg = mnl_nlmsg_put_extra_header(hdr, sizeof(*genmsg));
+	genmsg->res_id = htons(queue_num);
+	genmsg->version = NFNETLINK_V0;
+	genmsg->nfgen_family = AF_UNSPEC;
+
+	mnl_attr_put(hdr, NFQA_CFG_CMD, sizeof(cfg_cmd), &cfg_cmd);
+
+	return hdr;
+}
+
+/*
+ * Build an NFQNL_MSG_CONFIG request in @buf that sets the copy mode @mode
+ * and copy range @range (NFQA_CFG_PARAMS) for queue @queue_num. Returns the
+ * message header, which lives inside @buf.
+ */
+static struct nlmsghdr *
+nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num)
+{
+	struct nfqnl_msg_config_params cfg = {
+		.copy_mode = mode,
+		.copy_range = htonl(range),
+	};
+	struct nlmsghdr *hdr = mnl_nlmsg_put_header(buf);
+	struct nfgenmsg *genmsg;
+
+	hdr->nlmsg_flags = NLM_F_REQUEST;
+	hdr->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_CONFIG;
+
+	genmsg = mnl_nlmsg_put_extra_header(hdr, sizeof(*genmsg));
+	genmsg->res_id = htons(queue_num);
+	genmsg->version = NFNETLINK_V0;
+	genmsg->nfgen_family = AF_UNSPEC;
+
+	mnl_attr_put(hdr, NFQA_CFG_PARAMS, sizeof(cfg), &cfg);
+
+	return hdr;
+}
+
+/*
+ * Build an NFQNL_MSG_VERDICT message in @buf that applies verdict @verd to
+ * packet @id of queue @queue_num. Returns the message header, which lives
+ * inside @buf.
+ */
+static struct nlmsghdr *
+nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd)
+{
+	struct nfqnl_msg_verdict_hdr verdict_hdr = {
+		.id = htonl(id),
+		.verdict = htonl(verd),
+	};
+	struct nlmsghdr *hdr = mnl_nlmsg_put_header(buf);
+	struct nfgenmsg *genmsg;
+
+	hdr->nlmsg_flags = NLM_F_REQUEST;
+	hdr->nlmsg_type = (NFNL_SUBSYS_QUEUE << 8) | NFQNL_MSG_VERDICT;
+
+	genmsg = mnl_nlmsg_put_extra_header(hdr, sizeof(*genmsg));
+	genmsg->res_id = htons(queue_num);
+	genmsg->version = NFNETLINK_V0;
+	genmsg->nfgen_family = AF_UNSPEC;
+
+	mnl_attr_put(hdr, NFQA_VERDICT_HDR, sizeof(verdict_hdr), &verdict_hdr);
+
+	return hdr;
+}
+
+/* Print the packet count of every hook followed by the grand total. */
+static void print_stats(void)
+{
+	const int nhooks = sizeof(queue_stats) / sizeof(queue_stats[0]);
+	unsigned int sum = 0;
+	int hook = 0;
+
+	while (hook < nhooks) {
+		printf("hook %d packets %08u\n", hook, queue_stats[hook]);
+		sum += queue_stats[hook];
+		hook++;
+	}
+
+	printf("%u packets total\n", sum);
+}
+
+/*
+ * Open a NETLINK_NETFILTER socket, bind it to queue opts.queue_num and
+ * configure the queue (packet copy mode, flags, optional receive timeout).
+ *
+ * Any failure is reported with perror() and terminates the program. The
+ * configuration messages are sent without reading a reply here.
+ *
+ * Returns the socket; the caller closes it with mnl_socket_close().
+ */
+struct mnl_socket *open_queue(void)
+{
+ char buf[MNL_SOCKET_BUFFER_SIZE];
+ unsigned int queue_num;
+ struct mnl_socket *nl;
+ struct nlmsghdr *nlh;
+ struct timeval tv;
+ uint32_t flags;
+
+ nl = mnl_socket_open(NETLINK_NETFILTER);
+ if (nl == NULL) {
+ perror("mnl_socket_open");
+ exit(EXIT_FAILURE);
+ }
+
+ if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
+ perror("mnl_socket_bind");
+ exit(EXIT_FAILURE);
+ }
+
+ /* first message: bind this socket to the queue */
+ queue_num = opts.queue_num;
+ nlh = nfq_build_cfg_request(buf, NFQNL_CFG_CMD_BIND, queue_num);
+
+ if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+ perror("mnl_socket_sendto");
+ exit(EXIT_FAILURE);
+ }
+
+ /* second message: copy packets with a range of 0xFFFF and set flags */
+ nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num);
+
+ /* GSO flag only if enabled; UID/GID flag always; mask equals the flags */
+ flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0;
+ flags |= NFQA_CFG_F_UID_GID;
+ mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags));
+ mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags));
+
+ if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+ perror("mnl_socket_sendto");
+ exit(EXIT_FAILURE);
+ }
+
+ /* a receive timeout is only installed when -t gave a non-zero value */
+ memset(&tv, 0, sizeof(tv));
+ tv.tv_sec = opts.timeout;
+ if (opts.timeout && setsockopt(mnl_socket_get_fd(nl),
+ SOL_SOCKET, SO_RCVTIMEO,
+ &tv, sizeof(tv))) {
+ perror("setsockopt(SO_RCVTIMEO)");
+ exit(EXIT_FAILURE);
+ }
+
+ return nl;
+}
+
+/* Sleep for @delay milliseconds; the nanosleep() result is ignored. */
+static void sleep_ms(uint32_t delay)
+{
+	struct timespec req;
+
+	/* whole seconds first, the remaining milliseconds as nanoseconds */
+	req.tv_sec = delay / 1000;
+	req.tv_nsec = (delay % 1000) * 1000llu * 1000llu;
+
+	nanosleep(&req, NULL);
+}
+
+/*
+ * Receive packets from the queue and answer each received message with
+ * the configured verdict, until the receive timeout expires.
+ *
+ * ENOBUFS and EINTR on receive are retried. EAGAIN (receive timeout) ends
+ * the loop and makes the function return 0. Any other receive, parse or
+ * send error is reported with perror() and terminates the program.
+ *
+ * The receive buffer is released before returning.
+ */
+static int mainloop(void)
+{
+	unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE;
+	struct mnl_socket *nl;
+	struct nlmsghdr *nlh;
+	unsigned int portid;
+	char *buf;
+	int ret;
+
+	buf = malloc(buflen);
+	if (!buf) {
+		perror("malloc");
+		exit(EXIT_FAILURE);
+	}
+
+	nl = open_queue();
+	portid = mnl_socket_get_portid(nl);
+
+	for (;;) {
+		uint32_t id;
+
+		ret = mnl_socket_recvfrom(nl, buf, buflen);
+		if (ret == -1) {
+			if (errno == ENOBUFS || errno == EINTR)
+				continue;
+
+			/* receive timeout: regular end of the run */
+			if (errno == EAGAIN) {
+				errno = 0;
+				ret = 0;
+				break;
+			}
+
+			perror("mnl_socket_recvfrom");
+			exit(EXIT_FAILURE);
+		}
+
+		ret = mnl_cb_run(buf, ret, 0, portid, queue_cb, NULL);
+		if (ret < 0) {
+			perror("mnl_cb_run");
+			exit(EXIT_FAILURE);
+		}
+
+		/* queue_cb() returns MNL_CB_OK + packet id */
+		id = ret - MNL_CB_OK;
+		if (opts.delay_ms)
+			sleep_ms(opts.delay_ms);
+
+		/* the receive buffer is reused to build the verdict */
+		nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict);
+		if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
+			perror("mnl_socket_sendto");
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	mnl_socket_close(nl);
+	free(buf);
+
+	return ret;
+}
+
+/*
+ * Parse the command line into the global opts.
+ *
+ * Exits with status 0 after printing the usage for -h, and with status 1
+ * for an out-of-range -Q value, a zero -d value, or when the destination
+ * queue given with -Q equals the source queue.
+ *
+ * NOTE(review): the switch has no default case, so options rejected by
+ * getopt() do not stop the program here.
+ */
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) {
+ switch (c) {
+ case 'c':
+ opts.count_packets = true;
+ break;
+ case 'h':
+ help(argv[0]);
+ exit(0);
+ break;
+ case 'q':
+ /* out-of-range queue numbers silently fall back to queue 0 */
+ opts.queue_num = atoi(optarg);
+ if (opts.queue_num > 0xffff)
+ opts.queue_num = 0;
+ break;
+ case 'Q':
+ opts.verdict = atoi(optarg);
+ if (opts.verdict > 0xffff) {
+ fprintf(stderr, "Expected destination queue number\n");
+ exit(1);
+ }
+
+ /* destination queue goes in the upper 16 bits, NF_QUEUE in the lower */
+ opts.verdict <<= 16;
+ opts.verdict |= NF_QUEUE;
+ break;
+ case 'd':
+ opts.delay_ms = atoi(optarg);
+ if (opts.delay_ms == 0) {
+ fprintf(stderr, "Expected nonzero delay (in milliseconds)\n");
+ exit(1);
+ }
+ break;
+ case 't':
+ opts.timeout = atoi(optarg);
+ break;
+ case 'G':
+ /* GSO is on by default (see main()), -G turns it off */
+ opts.gso_enabled = false;
+ break;
+ case 'v':
+ opts.verbose++;
+ break;
+ }
+ }
+
+ /* re-queueing a packet to the queue it came from is rejected */
+ if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) {
+ fprintf(stderr, "Cannot use same destination and source queue\n");
+ exit(1);
+ }
+}
+
+/*
+ * Set the defaults (accept verdict, GSO enabled), parse the command line,
+ * run the receive loop and, if requested with -c, print the statistics.
+ * The exit status is the value returned by mainloop().
+ */
+int main(int argc, char *argv[])
+{
+	int status;
+
+	opts.gso_enabled = true;
+	opts.verdict = NF_ACCEPT;
+
+	parse_opts(argc, argv);
+	status = mainloop();
+
+	if (opts.count_packets)
+		print_stats();
+
+	return status;
+}
diff --git a/tools/testing/selftests/netfilter/nft_concat_range.sh b/tools/testing/selftests/netfilter/nft_concat_range.sh
new file mode 100755
index 000000000..af3461cb5
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_concat_range.sh
@@ -0,0 +1,1586 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# nft_concat_range.sh - Tests for sets with concatenation of ranged fields
+#
+# Copyright (c) 2019 Red Hat GmbH
+#
+# Author: Stefano Brivio <sbrivio@redhat.com>
+#
+# shellcheck disable=SC2154,SC2034,SC2016,SC2030,SC2031
+# ^ Configuration and templates sourced with eval, counters reused in subshells
+
+KSELFTEST_SKIP=4
+
+# Available test groups:
+# - reported_issues: check for issues that were reported in the past
+# - correctness: check that packets match given entries, and only those
+# - concurrency: attempt races between insertion, deletion and lookup
+# - timeout: check that packets match entries until they expire
+# - performance: estimate matching rate, compare with rbtree and hash baselines
+TESTS="reported_issues correctness concurrency timeout"
+[ "${quicktest}" != "1" ] && TESTS="${TESTS} performance"
+
+# Set types, defined by TYPE_ variables below
+TYPES="net_port port_net net6_port port_proto net6_port_mac net6_port_mac_proto
+ net_port_net net_mac net_mac_icmp net6_mac_icmp net6_port_net6_port
+ net_port_mac_proto_net"
+
+# Reported bugs, also described by TYPE_ variables below
+BUGS="flush_remove_add reload"
+
+# List of possible paths to pktgen script from kernel tree for performance tests
+PKTGEN_SCRIPT_PATHS="
+ ../../../../samples/pktgen/pktgen_bench_xmit_mode_netif_receive.sh
+ pktgen/pktgen_bench_xmit_mode_netif_receive.sh"
+
+# Definition of set types:
+# display display text for test report
+# type_spec nftables set type specifier
+# chain_spec nftables type specifier for rules mapping to set
+# dst call sequence of format_*() functions for destination fields
+# src call sequence of format_*() functions for source fields
+# start initial integer used to generate addresses and ports
+# count count of entries to generate and match
+# src_delta number summed to destination generator for source fields
+# tools list of tools for correctness and timeout tests, any can be used
+# proto L4 protocol of test packets
+#
+# race_repeat race attempts per thread, 0 disables concurrency test for type
+# flood_tools list of tools for concurrency tests, any can be used
+# flood_proto L4 protocol of test packets for concurrency tests
+# flood_spec nftables type specifier for concurrency tests
+#
+# perf_duration duration of single pktgen injection test
+# perf_spec nftables type specifier for performance tests
+# perf_dst format_*() functions for destination fields in performance test
+# perf_src format_*() functions for source fields in performance test
+# perf_entries number of set entries for performance test
+# perf_proto L3 protocol of test packets
+TYPE_net_port="
+display net,port
+type_spec ipv4_addr . inet_service
+chain_spec ip daddr . udp dport
+dst addr4 port
+src
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto udp
+flood_spec ip daddr . udp dport
+
+perf_duration 5
+perf_spec ip daddr . udp dport
+perf_dst addr4 port
+perf_src
+perf_entries 1000
+perf_proto ipv4
+"
+
+TYPE_port_net="
+display port,net
+type_spec inet_service . ipv4_addr
+chain_spec udp dport . ip daddr
+dst port addr4
+src
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto udp
+flood_spec udp dport . ip daddr
+
+perf_duration 5
+perf_spec udp dport . ip daddr
+perf_dst port addr4
+perf_src
+perf_entries 100
+perf_proto ipv4
+"
+
+TYPE_net6_port="
+display net6,port
+type_spec ipv6_addr . inet_service
+chain_spec ip6 daddr . udp dport
+dst addr6 port
+src
+start 10
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp6
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto tcp6
+flood_spec ip6 daddr . udp dport
+
+perf_duration 5
+perf_spec ip6 daddr . udp dport
+perf_dst addr6 port
+perf_src
+perf_entries 1000
+perf_proto ipv6
+"
+
+TYPE_port_proto="
+display port,proto
+type_spec inet_service . inet_proto
+chain_spec udp dport . meta l4proto
+dst port proto
+src
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 0
+
+perf_duration 5
+perf_spec udp dport . meta l4proto
+perf_dst port proto
+perf_src
+perf_entries 30000
+perf_proto ipv4
+"
+
+TYPE_net6_port_mac="
+display net6,port,mac
+type_spec ipv6_addr . inet_service . ether_addr
+chain_spec ip6 daddr . udp dport . ether saddr
+dst addr6 port
+src mac
+start 10
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp6
+
+race_repeat 0
+
+perf_duration 5
+perf_spec ip6 daddr . udp dport . ether daddr
+perf_dst addr6 port mac
+perf_src
+perf_entries 10
+perf_proto ipv6
+"
+
+TYPE_net6_port_mac_proto="
+display net6,port,mac,proto
+type_spec ipv6_addr . inet_service . ether_addr . inet_proto
+chain_spec ip6 daddr . udp dport . ether saddr . meta l4proto
+dst addr6 port
+src mac proto
+start 10
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp6
+
+race_repeat 0
+
+perf_duration 5
+perf_spec ip6 daddr . udp dport . ether daddr . meta l4proto
+perf_dst addr6 port mac proto
+perf_src
+perf_entries 1000
+perf_proto ipv6
+"
+
+TYPE_net_port_net="
+display net,port,net
+type_spec ipv4_addr . inet_service . ipv4_addr
+chain_spec ip daddr . udp dport . ip saddr
+dst addr4 port
+src addr4
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto tcp
+flood_spec ip daddr . udp dport . ip saddr
+
+perf_duration 0
+"
+
+TYPE_net6_port_net6_port="
+display net6,port,net6,port
+type_spec ipv6_addr . inet_service . ipv6_addr . inet_service
+chain_spec ip6 daddr . udp dport . ip6 saddr . udp sport
+dst addr6 port
+src addr6 port
+start 10
+count 5
+src_delta 2000
+tools sendip nc
+proto udp6
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto tcp6
+flood_spec ip6 daddr . tcp dport . ip6 saddr . tcp sport
+
+perf_duration 0
+"
+
+TYPE_net_port_mac_proto_net="
+display net,port,mac,proto,net
+type_spec ipv4_addr . inet_service . ether_addr . inet_proto . ipv4_addr
+chain_spec ip daddr . udp dport . ether saddr . meta l4proto . ip saddr
+dst addr4 port
+src mac proto addr4
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 0
+
+perf_duration 0
+"
+
+TYPE_net_mac="
+display net,mac
+type_spec ipv4_addr . ether_addr
+chain_spec ip daddr . ether saddr
+dst addr4
+src mac
+start 1
+count 5
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 0
+
+perf_duration 5
+perf_spec ip daddr . ether daddr
+perf_dst addr4 mac
+perf_src
+perf_entries 1000
+perf_proto ipv4
+"
+
+TYPE_net_mac_icmp="
+display net,mac - ICMP
+type_spec ipv4_addr . ether_addr
+chain_spec ip daddr . ether saddr
+dst addr4
+src mac
+start 1
+count 5
+src_delta 2000
+tools ping
+proto icmp
+
+race_repeat 0
+
+perf_duration 0
+"
+
+TYPE_net6_mac_icmp="
+display net6,mac - ICMPv6
+type_spec ipv6_addr . ether_addr
+chain_spec ip6 daddr . ether saddr
+dst addr6
+src mac
+start 10
+count 50
+src_delta 2000
+tools ping
+proto icmp6
+
+race_repeat 0
+
+perf_duration 0
+"
+
+TYPE_net_port_proto_net="
+display net,port,proto,net
+type_spec ipv4_addr . inet_service . inet_proto . ipv4_addr
+chain_spec ip daddr . udp dport . meta l4proto . ip saddr
+dst addr4 port proto
+src addr4
+start 1
+count 5
+src_delta 2000
+tools sendip nc
+proto udp
+
+race_repeat 3
+flood_tools iperf3 iperf netperf
+flood_proto tcp
+flood_spec ip daddr . tcp dport . meta l4proto . ip saddr
+
+perf_duration 0
+"
+
+# Definition of tests for bugs reported in the past:
+# display display text for test report
+TYPE_flush_remove_add="
+display Add two elements, flush, re-add
+"
+
+TYPE_reload="
+display net,mac with reload
+type_spec ipv4_addr . ether_addr
+chain_spec ip daddr . ether saddr
+dst addr4
+src mac
+start 1
+count 1
+src_delta 2000
+tools sendip nc bash
+proto udp
+
+race_repeat 0
+
+perf_duration 0
+"
+
+# Set template for all tests, types and rules are filled in depending on test
+set_template='
+flush ruleset
+
+table inet filter {
+ counter test {
+ packets 0 bytes 0
+ }
+
+ set test {
+ type ${type_spec}
+ flags interval,timeout
+ }
+
+ chain input {
+ type filter hook prerouting priority 0; policy accept;
+ ${chain_spec} @test counter name \"test\"
+ }
+}
+
+table netdev perf {
+ counter test {
+ packets 0 bytes 0
+ }
+
+ counter match {
+ packets 0 bytes 0
+ }
+
+ set test {
+ type ${type_spec}
+ flags interval
+ }
+
+ set norange {
+ type ${type_spec}
+ }
+
+ set noconcat {
+ type ${type_spec%% *}
+ flags interval
+ }
+
+ chain test {
+ type filter hook ingress device veth_a priority 0;
+ }
+}
+'
+
+err_buf=
+info_buf=
+
+# Queue one line of error text: $1 plus a newline is appended to err_buf
+err() {
+	err_buf="${err_buf}${1}"'
+'
+}
+
+# Queue one line of informational text: $1 plus a newline goes to info_buf
+info() {
+	info_buf="${info_buf}${1}"'
+'
+}
+
+# Flush error buffer to stdout
+err_flush() {
+ printf "%s" "${err_buf}"
+ err_buf=
+}
+
+# Flush information buffer to stdout
+info_flush() {
+ printf "%s" "${info_buf}"
+ info_buf=
+}
+
+# Setup veth pair: this namespace receives traffic, B generates it
+#
+# Creates namespace B, the veth_a/veth_b pair (veth_b is moved to B),
+# addresses and default routes on both ends, and defines the B() helper.
+# Returns 1 if the veth pair cannot be created.
+setup_veth() {
+ ip netns add B
+ ip link add veth_a type veth peer name veth_b || return 1
+
+ ip link set veth_a up
+ ip link set veth_b netns B
+
+ ip -n B link set veth_b up
+
+ ip addr add dev veth_a 10.0.0.1
+ ip route add default dev veth_a
+
+ ip -6 addr add fe80::1/64 dev veth_a nodad
+ ip -6 addr add 2001:db8::1/64 dev veth_a nodad
+ ip -6 route add default dev veth_a
+
+ ip -n B route add default dev veth_b
+
+ ip -6 -n B addr add fe80::2/64 dev veth_b nodad
+ ip -6 -n B addr add 2001:db8::2/64 dev veth_b nodad
+ ip -6 -n B route add default dev veth_b
+
+ # Run a command inside namespace B, discarding all of its output
+ B() {
+ ip netns exec B "$@" >/dev/null 2>&1
+ }
+
+ sleep 2
+}
+
+# Fill in set template and initialise set
+#
+# eval expands the ${...} references kept literal in set_template (using the
+# current values of type_spec and chain_spec); the result is loaded by nft.
+setup_set() {
+ eval "echo \"${set_template}\"" | nft -f -
+}
+
+# Check that at least one of the needed tools is available
+#
+# Reads ${tools} and ${proto}. Returns 0 if ${tools} is empty or as soon as
+# one listed tool is found; otherwise queues an error that names the
+# candidates and returns 1.
+check_tools() {
+ [ -z "${tools}" ] && return 0
+
+ __tools=
+ for tool in ${tools}; do
+ # For udp6, nc only counts if the "-u -w0" probe succeeds (the same
+ # probe is used elsewhere in this script to tell OpenBSD netcat apart)
+ if [ "${tool}" = "nc" ] && [ "${proto}" = "udp6" ] && \
+ ! nc -u -w0 1.1.1.1 1 2>/dev/null; then
+ # Some GNU netcat builds might not support IPv6
+ __tools="${__tools} netcat-openbsd"
+ continue
+ fi
+ __tools="${__tools} ${tool}"
+
+ command -v "${tool}" >/dev/null && return 0
+ done
+ err "need one of:${__tools}, skipping" && return 1
+}
+
+# Set up function to send ICMP packets
+#
+# Defines send_icmp(), which sends one echo request with a one second
+# timeout from namespace B to ${dst_addr4}.
+setup_send_icmp() {
+ send_icmp() {
+ B ping -c1 -W1 "${dst_addr4}" >/dev/null 2>&1
+ }
+}
+
+# Set up function to send ICMPv6 packets
+#
+# Defines send_icmp6() using ping6 if that command exists and "ping -6"
+# otherwise. Both variants first add ${dst_addr6} to veth_a, then send one
+# echo request from namespace B.
+setup_send_icmp6() {
+ if command -v ping6 >/dev/null; then
+ send_icmp6() {
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+ B ping6 -q -c1 -W1 "${dst_addr6}"
+ }
+ else
+ send_icmp6() {
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+ B ping -q -6 -c1 -W1 "${dst_addr6}"
+ }
+ fi
+}
+
+# Set up function to send single UDP packets on IPv4
+setup_send_udp() {
+ if command -v sendip >/dev/null; then
+ send_udp() {
+ [ -n "${src_port}" ] && src_port="-us ${src_port}"
+ [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}"
+ [ -n "${src_addr4}" ] && src_addr4="-is ${src_addr4}"
+
+ # shellcheck disable=SC2086 # sendip needs split options
+ B sendip -p ipv4 -p udp ${src_addr4} ${src_port} \
+ ${dst_port} "${dst_addr4}"
+
+ src_port=
+ dst_port=
+ src_addr4=
+ }
+ elif command -v nc >/dev/null; then
+ if nc -u -w0 1.1.1.1 1 2>/dev/null; then
+ # OpenBSD netcat
+ nc_opt="-w0"
+ else
+ # GNU netcat
+ nc_opt="-q0"
+ fi
+
+ send_udp() {
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}" dev veth_b
+ __src_addr4="-s ${src_addr4}"
+ fi
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+ [ -n "${src_port}" ] && src_port="-p ${src_port}"
+
+ echo "" | B nc -u "${nc_opt}" "${__src_addr4}" \
+ "${src_port}" "${dst_addr4}" "${dst_port}"
+
+ src_addr4=
+ src_port=
+ }
+ elif [ -z "$(bash -c 'type -p')" ]; then
+ send_udp() {
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ B ip route add default dev veth_b
+ fi
+
+ B bash -c "echo > /dev/udp/${dst_addr4}/${dst_port}"
+
+ if [ -n "${src_addr4}" ]; then
+ B ip addr del "${src_addr4}/16" dev veth_b
+ fi
+ src_addr4=
+ }
+ else
+ return 1
+ fi
+}
+
+# Set up function to send single UDP packets on IPv6
+setup_send_udp6() {
+ if command -v sendip >/dev/null; then
+ send_udp6() {
+ [ -n "${src_port}" ] && src_port="-us ${src_port}"
+ [ -n "${dst_port}" ] && dst_port="-ud ${dst_port}"
+ if [ -n "${src_addr6}" ]; then
+ src_addr6="-6s ${src_addr6}"
+ else
+ src_addr6="-6s 2001:db8::2"
+ fi
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ B sendip -p ipv6 -p udp ${src_addr6} ${src_port} \
+ ${dst_port} "${dst_addr6}"
+
+ src_port=
+ dst_port=
+ src_addr6=
+ }
+ elif command -v nc >/dev/null && nc -u -w0 1.1.1.1 1 2>/dev/null; then
+ # GNU netcat might not work with IPv6, try next tool
+ send_udp6() {
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+ if [ -n "${src_addr6}" ]; then
+ B ip addr add "${src_addr6}" dev veth_b nodad
+ else
+ src_addr6="2001:db8::2"
+ fi
+ [ -n "${src_port}" ] && src_port="-p ${src_port}"
+
+ # shellcheck disable=SC2086 # this needs split options
+ echo "" | B nc -u w0 "-s${src_addr6}" ${src_port} \
+ ${dst_addr6} ${dst_port}
+
+ src_addr6=
+ src_port=
+ }
+ elif [ -z "$(bash -c 'type -p')" ]; then
+ send_udp6() {
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+ B ip addr add "${src_addr6}" dev veth_b nodad
+ B bash -c "echo > /dev/udp/${dst_addr6}/${dst_port}"
+ ip -6 addr del "${dst_addr6}" dev veth_a 2>/dev/null
+ }
+ else
+ return 1
+ fi
+}
+
+# Set up function to send TCP traffic on IPv4
+#
+# Defines flood_tcp() using the first available tool among iperf3, iperf and
+# netperf; returns 1 if none is installed. flood_tcp() reads dst_addr4 and
+# the optional dst_port, src_addr4 and src_port, starts a server in this
+# namespace, runs the client from namespace B for up to 1000 seconds, and
+# finally clears src_addr4, src_port and dst_port.
+setup_flood_tcp() {
+ if command -v iperf3 >/dev/null; then
+ flood_tcp() {
+ # Turn the set values into iperf3 options
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ src_addr4="-B ${src_addr4}"
+ else
+ B ip addr add dev veth_b 10.0.0.2
+ src_addr4="-B 10.0.0.2"
+ fi
+ if [ -n "${src_port}" ]; then
+ src_port="--cport ${src_port}"
+ fi
+ B ip route add default dev veth_b 2>/dev/null
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ iperf3 -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf3 -c "${dst_addr4}" ${dst_port} ${src_port} \
+ ${src_addr4} -l16 -t 1000
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ elif command -v iperf >/dev/null; then
+ flood_tcp() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ src_addr4="-B ${src_addr4}"
+ else
+ B ip addr add dev veth_b 10.0.0.2 2>/dev/null
+ src_addr4="-B 10.0.0.2"
+ fi
+ # The source port is appended to the bind address as ":port"
+ if [ -n "${src_port}" ]; then
+ src_addr4="${src_addr4}:${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ iperf -s -DB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf -c "${dst_addr4}" ${dst_port} ${src_addr4} \
+ -l20 -t 1000
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ elif command -v netperf >/dev/null; then
+ flood_tcp() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ else
+ B ip addr add dev veth_b 10.0.0.2
+ src_addr4="10.0.0.2"
+ fi
+ # The source port is appended to the port option as ",port"
+ if [ -n "${src_port}" ]; then
+ dst_port="${dst_port},${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ netserver -4 ${dst_port} -L "${dst_addr4}" \
+ >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B netperf -4 -H "${dst_addr4}" ${dst_port} \
+ -L "${src_addr4}" -l 1000 -t TCP_STREAM
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ else
+ return 1
+ fi
+}
+
+# Set up function to send TCP traffic on IPv6
+#
+# Defines flood_tcp6() using the first available tool among iperf3, iperf
+# and netperf; returns 1 if none is installed. flood_tcp6() reads dst_addr6
+# and the optional dst_port, src_addr6 and src_port (2001:db8::2 is the
+# source address if none is given), starts a server in this namespace, runs
+# the client from namespace B for up to 1000 seconds, and finally clears
+# src_addr6, src_port and dst_port.
+setup_flood_tcp6() {
+ if command -v iperf3 >/dev/null; then
+ flood_tcp6() {
+ # Turn the set values into iperf3 options
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr6}" ]; then
+ B ip addr add "${src_addr6}" dev veth_b nodad
+ src_addr6="-B ${src_addr6}"
+ else
+ src_addr6="-B 2001:db8::2"
+ fi
+ if [ -n "${src_port}" ]; then
+ src_port="--cport ${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ iperf3 -s -DB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf3 -c "${dst_addr6}" ${dst_port} \
+ ${src_port} ${src_addr6} -l16 -t 1000
+
+ src_addr6=
+ src_port=
+ dst_port=
+ }
+ elif command -v iperf >/dev/null; then
+ flood_tcp6() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr6}" ]; then
+ B ip addr add "${src_addr6}" dev veth_b nodad
+ src_addr6="-B ${src_addr6}"
+ else
+ src_addr6="-B 2001:db8::2"
+ fi
+ # The source port is appended to the bind address as ":port"
+ if [ -n "${src_port}" ]; then
+ src_addr6="${src_addr6}:${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ iperf -s -VDB "${dst_addr6}" ${dst_port} >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf -c "${dst_addr6}" -V ${dst_port} \
+ ${src_addr6} -l1 -t 1000
+
+ src_addr6=
+ src_port=
+ dst_port=
+ }
+ elif command -v netperf >/dev/null; then
+ flood_tcp6() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr6}" ]; then
+ B ip addr add "${src_addr6}" dev veth_b nodad
+ else
+ src_addr6="2001:db8::2"
+ fi
+ # The source port is appended to the port option as ",port"
+ if [ -n "${src_port}" ]; then
+ dst_port="${dst_port},${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip -6 addr add "${dst_addr6}" dev veth_a nodad \
+ 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ netserver -6 ${dst_port} -L "${dst_addr6}" \
+ >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B netperf -6 -H "${dst_addr6}" ${dst_port} \
+ -L "${src_addr6}" -l 1000 -t TCP_STREAM
+
+ src_addr6=
+ src_port=
+ dst_port=
+ }
+ else
+ return 1
+ fi
+}
+
+# Set up function to send UDP traffic on IPv4
+#
+# Defines flood_udp() using the first available tool among iperf3, iperf and
+# netperf; returns 1 if none is installed. flood_udp() reads dst_addr4 and
+# the optional dst_port, src_addr4 and src_port, starts a server in this
+# namespace, runs the client from namespace B for up to 1000 seconds, and
+# finally clears src_addr4, src_port and dst_port.
+setup_flood_udp() {
+ if command -v iperf3 >/dev/null; then
+ flood_udp() {
+ # Turn the set values into iperf3 options
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ src_addr4="-B ${src_addr4}"
+ else
+ B ip addr add dev veth_b 10.0.0.2 2>/dev/null
+ src_addr4="-B 10.0.0.2"
+ fi
+ if [ -n "${src_port}" ]; then
+ src_port="--cport ${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # NOTE(review): unlike the other server invocations in this script,
+ # the output of this one is not redirected to /dev/null
+ # shellcheck disable=SC2086 # this needs split options
+ iperf3 -s -DB "${dst_addr4}" ${dst_port}
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf3 -u -c "${dst_addr4}" -Z -b 100M -l16 -t1000 \
+ ${dst_port} ${src_port} ${src_addr4}
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ elif command -v iperf >/dev/null; then
+ flood_udp() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ src_addr4="-B ${src_addr4}"
+ else
+ B ip addr add dev veth_b 10.0.0.2
+ src_addr4="-B 10.0.0.2"
+ fi
+ # The source port is appended to the bind address as ":port"
+ if [ -n "${src_port}" ]; then
+ src_addr4="${src_addr4}:${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ iperf -u -sDB "${dst_addr4}" ${dst_port} >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B iperf -u -c "${dst_addr4}" -b 100M -l1 -t1000 \
+ ${dst_port} ${src_addr4}
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ elif command -v netperf >/dev/null; then
+ flood_udp() {
+ [ -n "${dst_port}" ] && dst_port="-p ${dst_port}"
+ if [ -n "${src_addr4}" ]; then
+ B ip addr add "${src_addr4}/16" dev veth_b
+ else
+ B ip addr add dev veth_b 10.0.0.2
+ src_addr4="10.0.0.2"
+ fi
+ # The source port is appended to the port option as ",port"
+ if [ -n "${src_port}" ]; then
+ dst_port="${dst_port},${src_port}"
+ fi
+ B ip route add default dev veth_b
+ ip addr add "${dst_addr4}" dev veth_a 2>/dev/null
+
+ # shellcheck disable=SC2086 # this needs split options
+ netserver -4 ${dst_port} -L "${dst_addr4}" \
+ >/dev/null 2>&1
+ sleep 2
+
+ # shellcheck disable=SC2086 # this needs split options
+ B netperf -4 -H "${dst_addr4}" ${dst_port} \
+ -L "${src_addr4}" -l 1000 -t UDP_STREAM
+
+ src_addr4=
+ src_port=
+ dst_port=
+ }
+ else
+ return 1
+ fi
+}
+
+# Find pktgen script and set up function to start pktgen injection
+#
+# Returns 1 if none of PKTGEN_SCRIPT_PATHS is an executable command.
+# Otherwise defines perf_ipv4() and perf_ipv6(), which start the script in
+# the background on veth_a and store its PID in perf_pid.
+setup_perf() {
+ # The trailing __notfound sentinel is what remains if no path matched
+ for pktgen_script_path in ${PKTGEN_SCRIPT_PATHS} __notfound; do
+ command -v "${pktgen_script_path}" >/dev/null && break
+ done
+ [ "${pktgen_script_path}" = "__notfound" ] && return 1
+
+ perf_ipv4() {
+ # -t: one thread per five CPUs reported by nproc, plus one
+ ${pktgen_script_path} -s80 \
+ -i veth_a -d "${dst_addr4}" -p "${dst_port}" \
+ -m "${dst_mac}" \
+ -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null &
+ perf_pid=$!
+ }
+ perf_ipv6() {
+ # Same as perf_ipv4(), with IP6=6 in the environment of the script
+ IP6=6 ${pktgen_script_path} -s100 \
+ -i veth_a -d "${dst_addr6}" -p "${dst_port}" \
+ -m "${dst_mac}" \
+ -t $(($(nproc) / 5 + 1)) -b10000 -n0 2>/dev/null &
+ perf_pid=$!
+ }
+}
+
+# Clean up before each test
+#
+# Best effort: every step discards its errors, so it is safe to call this
+# when nothing has been set up yet. Resets nft state, removes the links,
+# default routes and namespace B, clears ${timeout}, kills leftover traffic
+# tools and removes the file(s) named by ${tmp}.
+cleanup() {
+ nft reset counter inet filter test >/dev/null 2>&1
+ nft flush ruleset >/dev/null 2>&1
+ ip link del dummy0 2>/dev/null
+ ip route del default 2>/dev/null
+ ip -6 route del default 2>/dev/null
+ ip netns del B 2>/dev/null
+ ip link del veth_a 2>/dev/null
+ timeout=
+ killall iperf3 2>/dev/null
+ killall iperf 2>/dev/null
+ killall netperf 2>/dev/null
+ killall netserver 2>/dev/null
+ # ${tmp} is unquoted, so it is word-split; an empty value adds no operand
+ rm -f ${tmp}
+ sleep 2
+}
+
+# Entry point for setup functions
+#
+# Arguments: names of setup steps; setup_<name> is called for each one, in
+# order. Exits with KSELFTEST_SKIP if not run as root. Returns 1 if no
+# needed tool is available or if a setup step fails (an error is queued
+# with err() in that case).
+setup() {
+ if [ "$(id -u)" -ne 0 ]; then
+ echo " need to run as root"
+ exit ${KSELFTEST_SKIP}
+ fi
+
+ cleanup
+ check_tools || return 1
+ # "for arg do" iterates over the positional parameters
+ for arg do
+ if ! eval setup_"${arg}"; then
+ err " ${arg} not supported"
+ return 1
+ fi
+ done
+}
+
+# Print the integer $1, offset by 10.0.0.5 (arbitrary), as a dotted-quad
+# IPv4 address without a trailing newline
+format_addr4() {
+	# 167772165 is 16777216 * 10 + 5, that is, 10.0.0.5
+	a=$((${1} + 167772165))
+	set -- "$((a / 16777216))" "$((a % 16777216 / 65536))" \
+		"$((a % 65536 / 256))" "$((a % 256))"
+	printf "%i.%i.%i.%i" "$@"
+}
+
+# Print the integer $1 as the last two groups of an address in 2001:db8::,
+# without a trailing newline
+format_addr6() {
+	set -- "$((${1} / 65536))" "$((${1} % 65536))"
+	printf "2001:db8::%04x:%04x" "$@"
+}
+
+# Print the integer $1 as the last four octets of an EUI-48 address that
+# starts with 00:01, without a trailing newline
+format_mac() {
+	set -- "$((${1} / 16777216))" "$((${1} % 16777216 / 65536))" \
+		"$((${1} % 65536 / 256))" "$((${1} % 256))"
+	printf "00:01:%02x:%02x:%02x:%02x" "$@"
+}
+
+# Print the integer $1 mapped to a port number; adding 1 after the modulo
+# means that port 0 is never produced
+format_port() {
+	set -- "$((${1} % 65534 + 1))"
+	printf "%i" "${1}"
+}
+
+# Drop suffixed '6' from L4 protocol, if any
+format_proto() {
+ printf "%s" "${proto}" | tr -d 6
+}
+
+# Format destination and source fields into nft concatenated type
+#
+# Reads ${dst} and ${src} (lists of field names), ${start} and ${end} for
+# destination fields, ${srcstart} and ${srcend} for source fields, and
+# ${timeout}. Each field is formatted by format_<field>() and printed as a
+# start-end range, except "proto", which is printed as a single value.
+# Fields are joined with " . "; the result is written to stdout in braces,
+# with a "timeout <n>s" suffix if ${timeout} is set.
+format() {
+ __start=
+ __end=
+ __expr="{ "
+
+ for f in ${dst}; do
+ # No separator before the very first field
+ [ "${__expr}" != "{ " ] && __expr="${__expr} . "
+
+ __start="$(eval format_"${f}" "${start}")"
+ __end="$(eval format_"${f}" "${end}")"
+
+ if [ "${f}" = "proto" ]; then
+ __expr="${__expr}${__start}"
+ else
+ __expr="${__expr}${__start}-${__end}"
+ fi
+ done
+ for f in ${src}; do
+ # Source fields are always preceded by a separator
+ __expr="${__expr} . "
+ __start="$(eval format_"${f}" "${srcstart}")"
+ __end="$(eval format_"${f}" "${srcend}")"
+
+ if [ "${f}" = "proto" ]; then
+ __expr="${__expr}${__start}"
+ else
+ __expr="${__expr}${__start}-${__end}"
+ fi
+ done
+
+ if [ -n "${timeout}" ]; then
+ echo "${__expr} timeout ${timeout}s }"
+ else
+ echo "${__expr} }"
+ fi
+}
+
+# Format destination and source fields into nft type, start element only
+# Both destination and source fields are derived from ${start}.
+format_norange() {
+	__expr="{ "
+
+	for f in ${dst}; do
+		if [ "${__expr}" = "{ " ]; then
+			__expr="{ $(eval format_"${f}" "${start}")"
+		else
+			__expr="${__expr} . $(eval format_"${f}" "${start}")"
+		fi
+	done
+	for f in ${src}; do
+		__expr="${__expr} . $(eval format_"${f}" "${start}")"
+	done
+
+	echo "${__expr} }"
+}
+
+# Format first destination field into nft type
+# Only the first word of ${dst} is used: the loop returns after one pass.
+format_noconcat() {
+	for f in ${dst}; do
+		__start="$(eval format_"${f}" "${start}")"
+		__end="$(eval format_"${f}" "${end}")"
+
+		case "${f}" in
+		proto)	echo "{ ${__start} }" ;;
+		*)	echo "{ ${__start}-${__end} }" ;;
+		esac
+		return
+	done
+}
+
+# Add single entry to 'test' set in 'inet filter' table
+add() {
+ if ! nft add element inet filter test "${1}"; then
+ err "Failed to add ${1} given ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+# Format and output entries for sets in 'netdev perf' table
+add_perf() {
+ if [ "${1}" = "test" ]; then
+ echo "add element netdev perf test $(format)"
+ elif [ "${1}" = "norange" ]; then
+ echo "add element netdev perf norange $(format_norange)"
+ elif [ "${1}" = "noconcat" ]; then
+ echo "add element netdev perf noconcat $(format_noconcat)"
+ fi
+}
+
+# Add single entry to 'norange' set in 'netdev perf' table
+add_perf_norange() {
+ if ! nft add element netdev perf norange "${1}"; then
+ err "Failed to add ${1} given ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+# Add single entry to 'noconcat' set in 'netdev perf' table
+add_perf_noconcat() {
+ if ! nft add element netdev perf noconcat "${1}"; then
+ err "Failed to add ${1} given ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+# Delete single entry from set
+del() {
+ if ! nft delete element inet filter test "${1}"; then
+ err "Failed to delete ${1} given ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+# Return packet count from 'test' counter in 'inet filter' table
+# Prints the word that follows "packets" in the 'nft list counter' output;
+# prints nothing if no such word is found.
+count_packets() {
+ found=0
+ for token in $(nft list counter inet filter test); do
+ # The previous word was "packets", so this word is the count
+ [ ${found} -eq 1 ] && echo "${token}" && return
+ [ "${token}" = "packets" ] && found=1
+ done
+}
+
+# Return packet count from 'test' counter in 'netdev perf' table
+# Prints the word that follows "packets" in the 'nft list counter' output;
+# prints nothing if no such word is found.
+count_perf_packets() {
+ found=0
+ for token in $(nft list counter netdev perf test); do
+ # The previous word was "packets", so this word is the count
+ [ ${found} -eq 1 ] && echo "${token}" && return
+ [ "${token}" = "packets" ] && found=1
+ done
+}
+
+# Set MAC addresses, send traffic according to specifier
+# $1: integer used to derive the veth_a MAC and every dst_<field> value
+# $2: integer used to derive the veth_b MAC and every src_<field> value
+flood() {
+ ip link set veth_a address "$(format_mac "${1}")"
+ ip -n B link set veth_b address "$(format_mac "${2}")"
+
+ # For each field name f, assign dst_<f>/src_<f> = $(format_<f> <integer>)
+ for f in ${dst}; do
+ eval dst_"$f"=\$\(format_\$f "${1}"\)
+ done
+ for f in ${src}; do
+ eval src_"$f"=\$\(format_\$f "${2}"\)
+ done
+ # Dispatch to the protocol-specific sender named flood_<proto>
+ eval flood_\$proto
+}
+
+# Set MAC addresses, start pktgen injection
+# $1: integer used to derive dst_mac (applied to veth_a) and dst_<field>
+# $2: integer used to derive src_<field>
+# Only veth_a is re-addressed here; veth_b is left untouched.
+perf() {
+ dst_mac="$(format_mac "${1}")"
+ ip link set veth_a address "${dst_mac}"
+
+ # For each field name f, assign dst_<f>/src_<f> = $(format_<f> <integer>)
+ for f in ${dst}; do
+ eval dst_"$f"=\$\(format_\$f "${1}"\)
+ done
+ for f in ${src}; do
+ eval src_"$f"=\$\(format_\$f "${2}"\)
+ done
+ # Dispatch to the injector named perf_<perf_proto>
+ eval perf_\$perf_proto
+}
+
+# Set MAC addresses, send single packet, check that it matches, reset counter
+# $1: integer used to derive destination fields and the veth_a MAC
+# $2: integer used to derive source fields and the veth_b MAC
+# Returns 1 (after logging packet fields and ruleset via err) unless the
+# 'test' counter reads exactly 1 packet after sending.
+send_match() {
+ ip link set veth_a address "$(format_mac "${1}")"
+ ip -n B link set veth_b address "$(format_mac "${2}")"
+
+ # For each field name f, assign dst_<f>/src_<f> = $(format_<f> <integer>)
+ for f in ${dst}; do
+ eval dst_"$f"=\$\(format_\$f "${1}"\)
+ done
+ for f in ${src}; do
+ eval src_"$f"=\$\(format_\$f "${2}"\)
+ done
+ # Dispatch to the sender named send_<proto>
+ eval send_\$proto
+ if [ "$(count_packets)" != "1" ]; then
+ err "${proto} packet to:"
+ err " $(for f in ${dst}; do
+ eval format_\$f "${1}"; printf ' '; done)"
+ err "from:"
+ err " $(for f in ${src}; do
+ eval format_\$f "${2}"; printf ' '; done)"
+ err "should have matched ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+ # Zero the counter so the next packet can be checked independently
+ nft reset counter inet filter test >/dev/null
+}
+
+# Set MAC addresses, send single packet, check that it doesn't match
+# $1: integer used to derive destination fields and the veth_a MAC
+# $2: integer used to derive source fields and the veth_b MAC
+# Returns 1 (after logging packet fields and ruleset via err) unless the
+# 'test' counter reads 0 packets after sending. The counter is not reset.
+send_nomatch() {
+ ip link set veth_a address "$(format_mac "${1}")"
+ ip -n B link set veth_b address "$(format_mac "${2}")"
+
+ # For each field name f, assign dst_<f>/src_<f> = $(format_<f> <integer>)
+ for f in ${dst}; do
+ eval dst_"$f"=\$\(format_\$f "${1}"\)
+ done
+ for f in ${src}; do
+ eval src_"$f"=\$\(format_\$f "${2}"\)
+ done
+ # Dispatch to the sender named send_<proto>
+ eval send_\$proto
+ if [ "$(count_packets)" != "0" ]; then
+ err "${proto} packet to:"
+ err " $(for f in ${dst}; do
+ eval format_\$f "${1}"; printf ' '; done)"
+ err "from:"
+ err " $(for f in ${src}; do
+ eval format_\$f "${2}"; printf ' '; done)"
+ err "should not have matched ruleset:"
+ err "$(nft -a list ruleset)"
+ return 1
+ fi
+}
+
+# Correctness test template:
+# - add ranged element, check that packets match it
+# - check that packets outside range don't match it
+# - remove some elements, check that packets don't match anymore
+# Reads globals start, count, src_delta and proto; advances start as it goes.
+# Returns KSELFTEST_SKIP if setup fails, 1 on the first failed check.
+test_correctness() {
+ setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+
+ # Each iteration inserts a range one element wider than the previous one
+ range_size=1
+ for i in $(seq "${start}" $((start + count))); do
+ end=$((start + range_size))
+
+ # Avoid negative or zero-sized port ranges
+ # (format_port reduces values modulo 65534, so a range crossing a
+ # multiple of 65534 would wrap around)
+ if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+ start=${end}
+ end=$((end + 1))
+ fi
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" || return 1
+ # Probe the range at a stride of range_size / 2 + 1
+ for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+ send_match "${j}" $((j + src_delta)) || return 1
+ done
+ # One past the end of the range must not match
+ send_nomatch $((end + 1)) $((end + 1 + src_delta)) || return 1
+
+ # Delete elements now and then
+ if [ $((i % 3)) -eq 0 ]; then
+ del "$(format)" || return 1
+ for j in $(seq ${start} \
+ $((range_size / 2 + 1)) ${end}); do
+ send_nomatch "${j}" $((j + src_delta)) \
+ || return 1
+ done
+ fi
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+}
+
+# Concurrency test template:
+# - add all the elements
+# - start a thread for each physical thread that:
+# - adds all the elements
+# - flushes the set
+# - adds all the elements
+# - flushes the entire ruleset
+# - adds the set back
+# - adds all the elements
+# - delete all the elements
+# Reads globals flood_proto, flood_tools, flood_spec, start, count, src_delta
+# and race_repeat. Always returns 0 once setup succeeded; errors from the
+# racing add/del commands are discarded on purpose.
+test_concurrency() {
+ proto=${flood_proto}
+ tools=${flood_tools}
+ chain_spec=${flood_spec}
+ setup veth flood_"${proto}" set || return ${KSELFTEST_SKIP}
+
+ range_size=1
+ # Saved so every worker subshell can rewind to the same first element
+ cstart=${start}
+ flood_pids=
+ for i in $(seq ${start} $((start + count))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" || return 1
+
+ # One background traffic generator per element
+ flood "${i}" $((i + src_delta)) & flood_pids="${flood_pids} $!"
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ sleep 10
+
+ pids=
+ # One worker subshell per CPU reported by nproc
+ for c in $(seq 1 "$(nproc)"); do (
+ for r in $(seq 1 "${race_repeat}"); do
+ range_size=1
+
+ # $start needs to be local to this subshell
+ # shellcheck disable=SC2030
+ start=${cstart}
+ for i in $(seq ${start} $((start + count))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" 2>/dev/null
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ nft flush inet filter test 2>/dev/null
+
+ range_size=1
+ start=${cstart}
+ for i in $(seq ${start} $((start + count))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" 2>/dev/null
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ nft flush ruleset
+ setup set 2>/dev/null
+
+ range_size=1
+ start=${cstart}
+ for i in $(seq ${start} $((start + count))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" 2>/dev/null
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ range_size=1
+ start=${cstart}
+ for i in $(seq ${start} $((start + count))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ del "$(format)" 2>/dev/null
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+ done
+ ) & pids="${pids} $!"
+ done
+
+ # Wait for the workers first, then stop and reap the traffic generators
+ # shellcheck disable=SC2046,SC2086 # word splitting wanted here
+ wait $(for pid in ${pids}; do echo ${pid}; done)
+ # shellcheck disable=SC2046,SC2086
+ kill $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null
+ # shellcheck disable=SC2046,SC2086
+ wait $(for pid in ${flood_pids}; do echo ${pid}; done) 2>/dev/null
+
+ return 0
+}
+
+# Timeout test template:
+# - add all the elements with 3s timeout while checking that packets match
+# - wait 3s after the last insertion, check that packets don't match any entry
+# Reads globals start, count, src_delta and proto; sets timeout, which
+# format() appends to every element it emits.
+# Returns KSELFTEST_SKIP if setup fails, 1 on the first failed check.
+test_timeout() {
+	setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+
+	timeout=3
+	range_size=1
+	# Remember the first element so the expiry pass below can revisit
+	# exactly the ranges inserted here
+	tstart=${start}
+	for i in $(seq "${start}" $((start + count))); do
+		end=$((start + range_size))
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		add "$(format)" || return 1
+
+		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+			send_match "${j}" $((j + src_delta)) || return 1
+		done
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+
+	# Let the most recently inserted element expire too
+	sleep "${timeout}"
+
+	# Rewind: without this the loop would probe ranges beyond the ones
+	# that were inserted, which can never match and so prove nothing
+	start=${tstart}
+	range_size=1
+	for i in $(seq "${start}" $((start + count))); do
+		end=$((start + range_size))
+		srcstart=$((start + src_delta))
+		srcend=$((end + src_delta))
+
+		for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+			send_nomatch "${j}" $((j + src_delta)) || return 1
+		done
+
+		range_size=$((range_size + 1))
+		start=$((end + range_size))
+	done
+}
+
+# Performance test template:
+# - add concatenated ranged entries
+# - add non-ranged concatenated entries (for hash set matching rate baseline)
+# - add ranged entries with first field only (for rbhash baseline)
+# - start pktgen injection directly on device rx path of this namespace
+# - measure drop only rate, hash and rbtree baselines, then matching rate
+# Reads globals perf_spec, perf_dst, perf_src, perf_entries, perf_duration,
+# start, src_delta and tmp. Results are reported through info.
+test_performance() {
+ chain_spec=${perf_spec}
+ dst="${perf_dst}"
+ src="${perf_src}"
+ setup veth perf set || return ${KSELFTEST_SKIP}
+
+ first=${start}
+ range_size=1
+ # Populate each set from a batch file: the inner loop's output is
+ # collected in ${tmp} and loaded with a single 'nft -f'
+ for set in test norange noconcat; do
+ start=${first}
+ for i in $(seq ${start} $((start + perf_entries))); do
+ end=$((start + range_size))
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ # Same port-wrap guard as the correctness test, plus a guard
+ # against empty ranges
+ if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+ start=${end}
+ end=$((end + 1))
+ elif [ ${start} -eq ${end} ]; then
+ end=$((start + 1))
+ fi
+
+ add_perf ${set}
+
+ start=$((end + range_size))
+ done > "${tmp}"
+ nft -f "${tmp}"
+ done
+
+ perf $((end - 1)) ${srcstart}
+
+ sleep 2
+
+ # Each measurement below: install one rule, zero the counter, wait
+ # perf_duration seconds, report packets per second, then remove the rule
+ # by the handle taken from the end of its 'nft -a list chain' line
+ nft add rule netdev perf test counter name \"test\" drop
+ nft reset counter netdev perf test >/dev/null 2>&1
+ sleep "${perf_duration}"
+ pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+ info " baseline (drop from netdev hook): ${pps}pps"
+ handle="$(nft -a list chain netdev perf test | grep counter)"
+ handle="${handle##* }"
+ nft delete rule netdev perf test handle "${handle}"
+
+ nft add rule "netdev perf test ${chain_spec} @norange \
+ counter name \"test\" drop"
+ nft reset counter netdev perf test >/dev/null 2>&1
+ sleep "${perf_duration}"
+ pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+ info " baseline hash (non-ranged entries): ${pps}pps"
+ handle="$(nft -a list chain netdev perf test | grep counter)"
+ handle="${handle##* }"
+ nft delete rule netdev perf test handle "${handle}"
+
+ # ${chain_spec%%. *} keeps only the text before the first ". "
+ nft add rule "netdev perf test ${chain_spec%%. *} @noconcat \
+ counter name \"test\" drop"
+ nft reset counter netdev perf test >/dev/null 2>&1
+ sleep "${perf_duration}"
+ pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+ info " baseline rbtree (match on first field only): ${pps}pps"
+ handle="$(nft -a list chain netdev perf test | grep counter)"
+ handle="${handle##* }"
+ nft delete rule netdev perf test handle "${handle}"
+
+ nft add rule "netdev perf test ${chain_spec} @test \
+ counter name \"test\" drop"
+ nft reset counter netdev perf test >/dev/null 2>&1
+ sleep "${perf_duration}"
+ pps="$(printf %10s $(($(count_perf_packets) / perf_duration)))"
+ p5="$(printf %5s "${perf_entries}")"
+ info " set with ${p5} full, ranged entries: ${pps}pps"
+ # Stop the injector whose PID was recorded in perf_pid
+ kill "${perf_pid}"
+}
+
+test_bug_flush_remove_add() {
+ set_cmd='{ set s { type ipv4_addr . inet_service; flags interval; }; }'
+ elem1='{ 10.0.0.1 . 22-25, 10.0.0.1 . 10-20 }'
+ elem2='{ 10.0.0.1 . 10-20, 10.0.0.1 . 22-25 }'
+ for i in `seq 1 100`; do
+ nft add table t ${set_cmd} || return ${KSELFTEST_SKIP}
+ nft add element t s ${elem1} 2>/dev/null || return 1
+ nft flush set t s 2>/dev/null || return 1
+ nft add element t s ${elem2} 2>/dev/null || return 1
+ done
+ nft flush ruleset
+}
+
+# - add ranged element, check that packets match it
+# - reload the set, check packets still match
+# Reads globals start, count, src_delta and proto.
+# Returns KSELFTEST_SKIP if setup fails, 1 on the first failed check.
+test_bug_reload() {
+ setup veth send_"${proto}" set || return ${KSELFTEST_SKIP}
+ # Saved so the verification pass can rewind to the first element
+ rstart=${start}
+
+ range_size=1
+ for i in $(seq "${start}" $((start + count))); do
+ end=$((start + range_size))
+
+ # Avoid negative or zero-sized port ranges
+ if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+ start=${end}
+ end=$((end + 1))
+ fi
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ add "$(format)" || return 1
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ # check kernel does allocate pcpu scratch map
+ # for reload with no element add/delete
+ # (one batch: flush the set, then re-create it from its own listing)
+ ( echo flush set inet filter test ;
+ nft list set inet filter test ) | nft -f -
+
+ start=${rstart}
+ range_size=1
+
+ for i in $(seq "${start}" $((start + count))); do
+ end=$((start + range_size))
+
+ # Avoid negative or zero-sized port ranges
+ if [ $((end / 65534)) -gt $((start / 65534)) ]; then
+ start=${end}
+ end=$((end + 1))
+ fi
+ srcstart=$((start + src_delta))
+ srcend=$((end + src_delta))
+
+ for j in $(seq ${start} $((range_size / 2 + 1)) ${end}); do
+ send_match "${j}" $((j + src_delta)) || return 1
+ done
+
+ range_size=$((range_size + 1))
+ start=$((end + range_size))
+ done
+
+ nft flush ruleset
+}
+
+# Dispatch to the regression test named test_bug_<subtest>; ${subtest} is
+# set by the runner loop at the bottom of this script.
+test_reported_issues() {
+ eval test_bug_"${subtest}"
+}
+
+# Run everything in a separate network namespace
+# (unless invoked with "run", re-execute this script under 'unshare -n' and
+# pass its exit status through)
+[ "${1}" != "run" ] && { unshare -n "${0}" run; exit $?; }
+tmp="$(mktemp)"
+trap cleanup EXIT
+
+# Entry point for test runs
+# Every name in TESTS is run once per subtest: BUGS for reported_issues,
+# TYPES otherwise.
+passed=0
+for name in ${TESTS}; do
+ printf "TEST: %s\n" "$(echo ${name} | tr '_' ' ')"
+ if [ "${name}" = "reported_issues" ]; then
+ SUBTESTS="${BUGS}"
+ else
+ SUBTESTS="${TYPES}"
+ fi
+
+ for subtest in ${SUBTESTS}; do
+ # Load the description block TYPE_<subtest> and turn each of its
+ # lines into a shell variable assignment (first word = name, last
+ # word = value)
+ eval desc=\$TYPE_"${subtest}"
+ IFS='
+'
+ for __line in ${desc}; do
+ # shellcheck disable=SC2086
+ eval ${__line%% *}=\"${__line##* }\";
+ done
+ # NOTE(review): this assignment presumably restores the default IFS
+ # (space, tab, newline); whitespace may have been collapsed in this
+ # rendering - confirm against the original file
+ IFS='
+'
+
+ # A repeat count / duration of 0 disables the respective test
+ if [ "${name}" = "concurrency" ] && \
+ [ "${race_repeat}" = "0" ]; then
+ continue
+ fi
+ if [ "${name}" = "performance" ] && \
+ [ "${perf_duration}" = "0" ]; then
+ continue
+ fi
+
+ printf " %-60s " "${display}"
+ eval test_"${name}"
+ ret=$?
+
+ # 0 = pass, 1 = fail (abort the whole run), KSELFTEST_SKIP = skip;
+ # any other status prints nothing
+ if [ $ret -eq 0 ]; then
+ printf "[ OK ]\n"
+ info_flush
+ passed=$((passed + 1))
+ elif [ $ret -eq 1 ]; then
+ printf "[FAIL]\n"
+ err_flush
+ exit 1
+ elif [ $ret -eq ${KSELFTEST_SKIP} ]; then
+ printf "[SKIP]\n"
+ err_flush
+ fi
+ done
+done
+
+# Report SKIP when nothing passed, success otherwise
+[ ${passed} -eq 0 ] && exit ${KSELFTEST_SKIP} || exit 0
diff --git a/tools/testing/selftests/netfilter/nft_conntrack_helper.sh b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
new file mode 100755
index 000000000..bf6b9626c
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_conntrack_helper.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+#
+# This tests connection tracking helper assignment:
+# 1. can attach ftp helper to a connection from nft ruleset.
+# 2. auto-assign still works.
+#
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+testipv6=1
+
+# Remove both test namespaces; installed as the EXIT trap.
+cleanup()
+{
+	local n
+
+	for n in "${ns1}" "${ns2}"; do
+		ip netns del "${n}"
+	done
+}
+
+# Skip the whole test when a required tool is missing.
+if ! nft --version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+if ! ip -Version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+if ! conntrack -V > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without conntrack tool"
+	exit $ksft_skip
+fi
+
+if ! which nc >/dev/null 2>&1; then
+	echo "SKIP: Could not run test without netcat tool"
+	exit $ksft_skip
+fi
+
+trap cleanup EXIT
+
+ip netns add ${ns1}
+ip netns add ${ns2}
+
+ip link add veth0 netns ${ns1} type veth peer name veth0 netns ${ns2} > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: No virtual ethernet pair device support in kernel"
+ exit $ksft_skip
+fi
+
+ip -net ${ns1} link set lo up
+ip -net ${ns1} link set veth0 up
+
+ip -net ${ns2} link set lo up
+ip -net ${ns2} link set veth0 up
+
+ip -net ${ns1} addr add 10.0.1.1/24 dev veth0
+ip -net ${ns1} addr add dead:1::1/64 dev veth0
+
+ip -net ${ns2} addr add 10.0.1.2/24 dev veth0
+ip -net ${ns2} addr add dead:1::2/64 dev veth0
+
+load_ruleset_family() {
+ local family=$1
+ local ns=$2
+
+ip netns exec ${ns} nft -f - <<EOF
+table $family raw {
+ ct helper ftp {
+ type "ftp" protocol tcp
+ }
+ chain pre {
+ type filter hook prerouting priority 0; policy accept;
+ tcp dport 2121 ct helper set "ftp"
+ }
+ chain output {
+ type filter hook output priority 0; policy accept;
+ tcp dport 2121 ct helper set "ftp"
+ }
+}
+EOF
+ return $?
+}
+
+# Check that a tracked tcp connection has the ftp helper attached.
+# $1: network namespace to inspect
+# $2: description for the log; if it contains "ipv6" the ipv6 conntrack
+#     table is listed, otherwise ipv4
+# $3: destination port of the connection
+# On failure prints FAIL, sets the global ret=1 and returns 1; on success
+# prints PASS and returns 0.
+check_for_helper()
+{
+	local netns=$1
+	local message=$2
+	local port=$3
+	local family="ipv4"
+
+	case "$message" in
+	*ipv6*) family="ipv6" ;;
+	esac
+
+	if ! ip netns exec "${netns}" conntrack -L -f "$family" -p tcp --dport "$port" 2> /dev/null | grep -q 'helper=ftp'; then
+		echo "FAIL: ${netns} did not show attached helper $message" 1>&2
+		ret=1
+		# Stop here so a failed check is not also reported as PASS
+		return 1
+	fi
+
+	echo "PASS: ${netns} connection on port $port has ftp helper attached" 1>&2
+	return 0
+}
+
+# Open a tcp connection from ns1 to ns2 and check in both namespaces that
+# the ftp helper got attached; repeat over IPv6 unless testipv6 is 0.
+# $1: port to listen on and connect to
+# $2: description appended to the log messages
+test_helper()
+{
+ local port=$1
+ local msg=$2
+
+ # 'sleep N |' feeds nc a stdin that produces no data and closes after N
+ # seconds
+ sleep 3 | ip netns exec ${ns2} nc -w 2 -l -p $port > /dev/null &
+
+ sleep 1 | ip netns exec ${ns1} nc -w 2 10.0.1.2 $port > /dev/null &
+ sleep 1
+
+ check_for_helper "$ns1" "ip $msg" $port
+ check_for_helper "$ns2" "ip $msg" $port
+
+ # Reap both background nc pipelines before going on
+ wait
+
+ if [ $testipv6 -eq 0 ] ;then
+ return 0
+ fi
+
+ # Flush the conntrack tables before the IPv6 round
+ ip netns exec ${ns1} conntrack -F 2> /dev/null
+ ip netns exec ${ns2} conntrack -F 2> /dev/null
+
+ sleep 3 | ip netns exec ${ns2} nc -w 2 -6 -l -p $port > /dev/null &
+
+ sleep 1 | ip netns exec ${ns1} nc -w 2 -6 dead:1::2 $port > /dev/null &
+ sleep 1
+
+ check_for_helper "$ns1" "ipv6 $msg" $port
+ check_for_helper "$ns2" "ipv6 $msg" $port
+
+ wait
+}
+
+# ns1 gets separate ip and ip6 tables. An ip6 failure only disables the
+# IPv6 half of the test.
+if ! load_ruleset_family ip ${ns1}; then
+	echo "FAIL: ${ns1} cannot load ip ruleset" 1>&2
+	exit 1
+fi
+
+if ! load_ruleset_family ip6 ${ns1}; then
+	echo "SKIP: ${ns1} cannot load ip6 ruleset" 1>&2
+	testipv6=0
+fi
+
+# ns2 prefers a single inet table and falls back to ip (+ ip6) tables.
+if ! load_ruleset_family inet ${ns2}; then
+	# The namespace that failed here is ns2, so report that one
+	echo "SKIP: ${ns2} cannot load inet ruleset" 1>&2
+	if ! load_ruleset_family ip ${ns2}; then
+		echo "FAIL: ${ns2} cannot load ip ruleset" 1>&2
+		exit 1
+	fi
+
+	if [ $testipv6 -eq 1 ] ;then
+		if ! load_ruleset_family ip6 ${ns2}; then
+			echo "FAIL: ${ns2} cannot load ip6 ruleset" 1>&2
+			exit 1
+		fi
+	fi
+fi
+
+# Port 2121 is covered by the ruleset; port 21 is tried after enabling
+# the nf_conntrack_helper sysctl in both namespaces.
+test_helper 2121 "set via ruleset"
+ip netns exec ${ns1} sysctl -q 'net.netfilter.nf_conntrack_helper=1'
+ip netns exec ${ns2} sysctl -q 'net.netfilter.nf_conntrack_helper=1'
+test_helper 21 "auto-assign"
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh
new file mode 100755
index 000000000..aefe50e0e
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_flowtable.sh
@@ -0,0 +1,420 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This tests basic flowtable functionality.
+# Creates following default topology:
+#
+# Originator (MTU 9000) <-Router1-> MTU 1500 <-Router2-> Responder (MTU 2000)
+# Router1 is the one doing flow offloading, Router2 has no special
+# purpose other than having a link that is smaller than either Originator
+# and responder, i.e. TCPMSS announced values are too large and will still
+# result in fragmentation and/or PMTU discovery.
+#
+# You can check with different Originator/Link/Responder MTU eg:
+# nft_flowtable.sh -o8000 -l1500 -r2000
+#
+
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+ns1in=""
+ns2in=""
+ns1out=""
+ns2out=""
+
+log_netns=$(sysctl -n net.netfilter.nf_log_all_netns)
+
+checktool (){
+ if ! $1 > /dev/null 2>&1; then
+ echo "SKIP: Could not $2"
+ exit $ksft_skip
+ fi
+}
+
+checktool "nft --version" "run test without nft tool"
+checktool "ip -Version" "run test without ip tool"
+checktool "which nc" "run test without nc (netcat)"
+checktool "ip netns add nsr1" "create net namespace"
+
+ip netns add ns1
+ip netns add ns2
+
+ip netns add nsr2
+
+# EXIT trap: delete the four namespaces and the temporary files, and write
+# nf_log_all_netns back to 0 if that is what it was before the test.
+cleanup() {
+	for i in 1 2; do
+		ip netns del ns$i
+		ip netns del nsr$i
+	done
+
+	rm -f "$ns1in" "$ns1out"
+	rm -f "$ns2in" "$ns2out"
+
+	# log_netns is empty when the sysctl could not be read at startup;
+	# the string comparison avoids the "unary operator expected" error
+	# that an unquoted numeric test produces in that case
+	if [ "${log_netns:-}" = "0" ]; then
+		sysctl -q net.netfilter.nf_log_all_netns=0
+	fi
+}
+
+trap cleanup EXIT
+
+sysctl -q net.netfilter.nf_log_all_netns=1
+
+# Links: ns1:eth0 <-> nsr1:veth0, nsr1:veth1 <-> nsr2:veth0,
+# nsr2:veth1 <-> ns2:eth0
+ip link add veth0 netns nsr1 type veth peer name eth0 netns ns1
+ip link add veth1 netns nsr1 type veth peer name veth0 netns nsr2
+
+ip link add veth1 netns nsr2 type veth peer name eth0 netns ns2
+
+for dev in lo veth0 veth1; do
+ for i in 1 2; do
+ ip -net nsr$i link set $dev up
+ done
+done
+
+ip -net nsr1 addr add 10.0.1.1/24 dev veth0
+ip -net nsr1 addr add dead:1::1/64 dev veth0
+
+ip -net nsr2 addr add 10.0.2.1/24 dev veth1
+ip -net nsr2 addr add dead:2::1/64 dev veth1
+
+# set different MTUs so we need to push packets coming from ns1 (large MTU)
+# to ns2 (smaller MTU) to stack either to perform fragmentation (ip_no_pmtu_disc=1),
+# or to do PMTU discovery (send ICMP error back to originator).
+# ns2 is going via nsr2 with a smaller mtu, so that TCPMSS announced by both peers
+# is NOT the lowest link mtu.
+
+# Defaults; overridable with -o, -l and -r
+omtu=9000
+lmtu=1500
+rmtu=2000
+
+# Print the option summary on stdout and exit with status 1.
+usage(){
+	printf '%s\n' \
+		"nft_flowtable.sh [OPTIONS]" \
+		"" \
+		"MTU options" \
+		" -o originator" \
+		" -l link" \
+		" -r responder"
+	exit 1
+}
+
+# Parse -o/-l/-r MTU overrides; any other option prints usage and exits
+while getopts "o:l:r:" o
+do
+ case $o in
+ o) omtu=$OPTARG;;
+ l) lmtu=$OPTARG;;
+ r) rmtu=$OPTARG;;
+ *) usage;;
+ esac
+done
+
+# Apply the originator and responder MTUs to both ends of their links;
+# give up if the router side rejects the value
+if ! ip -net nsr1 link set veth0 mtu $omtu; then
+ exit 1
+fi
+
+ip -net ns1 link set eth0 mtu $omtu
+
+if ! ip -net nsr2 link set veth1 mtu $rmtu; then
+ exit 1
+fi
+
+ip -net ns2 link set eth0 mtu $rmtu
+
+# transfer-net between nsr1 and nsr2.
+# these addresses are not used for connections.
+ip -net nsr1 addr add 192.168.10.1/24 dev veth1
+ip -net nsr1 addr add fee1:2::1/64 dev veth1
+
+ip -net nsr2 addr add 192.168.10.2/24 dev veth0
+ip -net nsr2 addr add fee1:2::2/64 dev veth0
+
+# Per index i: enable IPv4 forwarding on router nsr$i, then bring up and
+# address endpoint ns$i (10.0.$i.99, dead:$i::99) with default routes via
+# the router
+for i in 1 2; do
+ ip netns exec nsr$i sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ ip netns exec nsr$i sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+ ip -net ns$i link set lo up
+ ip -net ns$i link set eth0 up
+ ip -net ns$i addr add 10.0.$i.99/24 dev eth0
+ ip -net ns$i route add default via 10.0.$i.1
+ ip -net ns$i addr add dead:$i::99/64 dev eth0
+ ip -net ns$i route add default via dead:$i::1
+ if ! ip netns exec ns$i sysctl net.ipv4.tcp_no_metrics_save=1 > /dev/null; then
+ echo "ERROR: Check Originator/Responder values (problem during address addition)"
+ exit 1
+ fi
+
+ # don't set ip DF bit for first two tests
+ ip netns exec ns$i sysctl net.ipv4.ip_no_pmtu_disc=1 > /dev/null
+done
+
+# The routers reach each other's networks over the transfer net
+ip -net nsr1 route add default via 192.168.10.2
+ip -net nsr2 route add default via 192.168.10.1
+
+ip netns exec nsr1 nft -f - <<EOF
+table inet filter {
+ flowtable f1 {
+ hook ingress priority 0
+ devices = { veth0, veth1 }
+ }
+
+ chain forward {
+ type filter hook forward priority 0; policy drop;
+
+ # flow offloaded? Tag ct with mark 1, so we can detect when it fails.
+ meta oif "veth1" tcp dport 12345 flow offload @f1 counter
+
+ # use packet size to trigger 'should be offloaded by now'.
+ # otherwise, if 'flow offload' expression never offloads, the
+ # test will pass.
+ tcp dport 12345 meta length gt 200 ct mark set 1 counter
+
+ # this turns off flow offloading internally, so expect packets again
+ tcp flags fin,rst ct mark set 0 accept
+
+ # this allows large packets from responder, we need this as long
+ # as PMTUd is off.
+ # This rule is deleted for the last test, when we expect PMTUd
+ # to kick in and ensure all packets meet mtu requirements.
+ meta length gt $lmtu accept comment something-to-grep-for
+
+ # next line blocks connection w.o. working offload.
+ # we only do this for reverse dir, because we expect packets to
+ # enter slow path due to MTU mismatch of veth0 and veth1.
+ tcp sport 12345 ct mark 1 counter log prefix "mark failure " drop
+
+ ct state established,related accept
+
+ # for packets that we can't offload yet, i.e. SYN (any ct that is not confirmed)
+ meta length lt 200 oif "veth1" tcp dport 12345 counter accept
+
+ meta nfproto ipv4 meta l4proto icmp accept
+ meta nfproto ipv6 meta l4proto icmpv6 accept
+ }
+}
+EOF
+
+if [ $? -ne 0 ]; then
+ echo "SKIP: Could not load nft ruleset"
+ exit $ksft_skip
+fi
+
+# test basic connectivity
+if ! ip netns exec ns1 ping -c 1 -q 10.0.2.99 > /dev/null; then
+ echo "ERROR: ns1 cannot reach ns2" 1>&2
+ exit 1
+fi
+
+if ! ip netns exec ns2 ping -c 1 -q 10.0.1.99 > /dev/null; then
+ echo "ERROR: ns2 cannot reach ns1" 1>&2
+ exit 1
+fi
+
+# ret is still 0 here: both failure paths above exit instead of setting it
+if [ $ret -eq 0 ];then
+ echo "PASS: netns routing/connectivity: ns1 can reach ns2"
+fi
+
+# Input and received-output files for each side of the transfers
+ns1in=$(mktemp)
+ns1out=$(mktemp)
+ns2in=$(mktemp)
+ns2out=$(mktemp)
+
+make_file()
+{
+ name=$1
+
+ SIZE=$((RANDOM % (1024 * 8)))
+ TSIZE=$((SIZE * 1024))
+
+ dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null
+
+ SIZE=$((RANDOM % 1024))
+ SIZE=$((SIZE + 128))
+ TSIZE=$((TSIZE + SIZE))
+ dd if=/dev/urandom conf=notrunc of="$name" bs=1 count=$SIZE 2> /dev/null
+}
+
+# Compare a sent file with the received copy.
+# $1: file that was sent  $2: file that was received  $3: label for the log
+# Returns 0 if identical; otherwise prints FAIL on stderr, lists both files
+# and returns 1.
+check_transfer()
+{
+	in=$1
+	out=$2
+	what=$3
+
+	cmp "$in" "$out" > /dev/null 2>&1 && return 0
+
+	echo "FAIL: file mismatch for $what" 1>&2
+	ls -l "$in"
+	ls -l "$out"
+	return 1
+}
+
+# Run one bidirectional nc transfer and compare what each side received.
+# $1: namespace running the client   $2: namespace running the listener
+# $3: address the client connects to $4: port the client connects to
+# The listener always binds port 12345 and the files are always the global
+# ns1in/ns1out (client) and ns2in/ns2out (listener), whatever $1/$2 are.
+# Returns 0 if both directions transferred intact, 1 otherwise.
+test_tcp_forwarding_ip()
+{
+ local nsa=$1
+ local nsb=$2
+ local dstip=$3
+ local dstport=$4
+ local lret=0
+
+ ip netns exec $nsb nc -w 5 -l -p 12345 < "$ns2in" > "$ns2out" &
+ lpid=$!
+
+ # Give the listener time to start before connecting
+ sleep 1
+ ip netns exec $nsa nc -w 4 "$dstip" "$dstport" < "$ns1in" > "$ns1out" &
+ cpid=$!
+
+ sleep 3
+
+ # Kill whichever side is still running after the grace period
+ if ps -p $lpid > /dev/null;then
+ kill $lpid
+ fi
+
+ if ps -p $cpid > /dev/null;then
+ kill $cpid
+ fi
+
+ wait
+
+ if ! check_transfer "$ns1in" "$ns2out" "ns1 -> ns2"; then
+ lret=1
+ fi
+
+ if ! check_transfer "$ns2in" "$ns1out" "ns1 <- ns2"; then
+ lret=1
+ fi
+
+ return $lret
+}
+
+test_tcp_forwarding()
+{
+ test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
+
+ return $?
+}
+
+# Transfer test first against 10.0.2.99:12345 and, only if that passes,
+# against 10.6.6.6:1666.
+# $1: client namespace  $2: listener namespace
+# Returns the status of the first failing transfer, or of the second one.
+test_tcp_forwarding_nat()
+{
+	local lret
+
+	test_tcp_forwarding_ip "$1" "$2" 10.0.2.99 12345
+	lret=$?
+	[ $lret -ne 0 ] && return $lret
+
+	test_tcp_forwarding_ip "$1" "$2" 10.6.6.6 1666
+}
+
+# Random payloads for both directions
+make_file "$ns1in"
+make_file "$ns2in"
+
+# First test:
+# No PMTU discovery, nsr1 is expected to fragment packets from ns1 to ns2 as needed.
+if test_tcp_forwarding ns1 ns2; then
+ echo "PASS: flow offloaded for ns1/ns2"
+else
+ echo "FAIL: flow offload for ns1/ns2:" 1>&2
+ ip netns exec nsr1 nft list ruleset
+ ret=1
+fi
+
+# delete default route, i.e. ns2 won't be able to reach ns1 and
+# will depend on ns1 being masqueraded in nsr1.
+# expect ns1 has nsr1 address.
+ip -net ns2 route del default via 10.0.2.1
+ip -net ns2 route del default via dead:2::1
+ip -net ns2 route add 192.168.10.1 via 10.0.2.1
+
+# Second test:
+# Same, but with NAT enabled.
+# (dnat 10.6.6.6:1666 to the responder, masquerade everything leaving veth1)
+ip netns exec nsr1 nft -f - <<EOF
+table ip nat {
+ chain prerouting {
+ type nat hook prerouting priority 0; policy accept;
+ meta iif "veth0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
+ }
+
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ meta oifname "veth1" counter masquerade
+ }
+}
+EOF
+
+if test_tcp_forwarding_nat ns1 ns2; then
+ echo "PASS: flow offloaded for ns1/ns2 with NAT"
+else
+ echo "FAIL: flow offload for ns1/ns2 with NAT" 1>&2
+ ip netns exec nsr1 nft list ruleset
+ ret=1
+fi
+
+# Third test:
+# Same as second test, but with PMTU discovery enabled.
+# Find the large-packet accept rule by its comment; the text after '#' in
+# the listing is " handle N", which is passed on unquoted so that it
+# splits into two words.
+handle=$(ip netns exec nsr1 nft -a list table inet filter | grep something-to-grep-for | cut -d \# -f 2)
+
+if ! ip netns exec nsr1 nft delete rule inet filter forward $handle; then
+	echo "FAIL: Could not delete large-packet accept rule"
+	exit 1
+fi
+
+ip netns exec ns1 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+ip netns exec ns2 sysctl net.ipv4.ip_no_pmtu_disc=0 > /dev/null
+
+if test_tcp_forwarding_nat ns1 ns2; then
+	echo "PASS: flow offloaded for ns1/ns2 with NAT and pmtu discovery"
+else
+	echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2
+	ip netns exec nsr1 nft list ruleset
+	# Record the failure in the exit status, as the first two tests do
+	ret=1
+fi
+
+# Key material for the ESP test: hex digests of the current process list
+# (presumably just a convenient source of varying data - confirm), plus
+# two distinct SPI values.
+KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
+KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
+SPI1=$RANDOM
+SPI2=$RANDOM
+
+# Make sure the two SPIs differ
+if [ $SPI1 -eq $SPI2 ]; then
+ SPI2=$((SPI2+1))
+fi
+
+do_esp() {
+ local ns=$1
+ local me=$2
+ local remote=$3
+ local lnet=$4
+ local rnet=$5
+ local spi_out=$6
+ local spi_in=$7
+
+ ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
+ ip -net $ns xfrm state add src $me dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet
+
+ # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+ ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
+ # to fwd decrypted packets after esp processing:
+ ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow
+
+}
+
+do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+
+do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+
+ip netns exec nsr1 nft delete table ip nat
+
+# restore default routes
+ip -net ns2 route del 192.168.10.1 via 10.0.2.1
+ip -net ns2 route add default via 10.0.2.1
+ip -net ns2 route add default via dead:2::1
+
+if test_tcp_forwarding ns1 ns2; then
+ echo "PASS: ipsec tunnel mode for ns1/ns2"
+else
+ echo "FAIL: ipsec tunnel mode for ns1/ns2"
+ ip netns exec nsr1 nft list ruleset 1>&2
+ ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh
new file mode 100755
index 000000000..f33154c04
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_meta.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+
+# check iif/iifname/oifgroup/iiftype match.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+sfx=$(mktemp -u "XXXXXXXX")
+ns0="ns0-$sfx"
+
+if ! nft --version > /dev/null 2>&1; then
+ echo "SKIP: Could not run test without nft tool"
+ exit $ksft_skip
+fi
+
+# EXIT trap handler: delete the test namespace named by $ns0.
+cleanup()
+{
+ ip netns del "$ns0"
+}
+
+ip netns add "$ns0"
+ip -net "$ns0" link set lo up
+ip -net "$ns0" addr add 127.0.0.1 dev lo
+
+trap cleanup EXIT
+
+currentyear=$(date +%Y)
+lastyear=$((currentyear-1))
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table inet filter {
+ counter iifcount {}
+ counter iifnamecount {}
+ counter iifgroupcount {}
+ counter iiftypecount {}
+ counter infproto4count {}
+ counter il4protocounter {}
+ counter imarkcounter {}
+ counter icpu0counter {}
+ counter ilastyearcounter {}
+ counter icurrentyearcounter {}
+
+ counter oifcount {}
+ counter oifnamecount {}
+ counter oifgroupcount {}
+ counter oiftypecount {}
+ counter onfproto4count {}
+ counter ol4protocounter {}
+ counter oskuidcounter {}
+ counter oskgidcounter {}
+ counter omarkcounter {}
+
+ chain input {
+ type filter hook input priority 0; policy accept;
+
+ meta iif lo counter name "iifcount"
+ meta iifname "lo" counter name "iifnamecount"
+ meta iifgroup "default" counter name "iifgroupcount"
+ meta iiftype "loopback" counter name "iiftypecount"
+ meta nfproto ipv4 counter name "infproto4count"
+ meta l4proto icmp counter name "il4protocounter"
+ meta mark 42 counter name "imarkcounter"
+ meta cpu 0 counter name "icpu0counter"
+ meta time "$lastyear-01-01" - "$lastyear-12-31" counter name ilastyearcounter
+ meta time "$currentyear-01-01" - "$currentyear-12-31" counter name icurrentyearcounter
+ }
+
+ chain output {
+ type filter hook output priority 0; policy accept;
+ meta oif lo counter name "oifcount" counter
+ meta oifname "lo" counter name "oifnamecount"
+ meta oifgroup "default" counter name "oifgroupcount"
+ meta oiftype "loopback" counter name "oiftypecount"
+ meta nfproto ipv4 counter name "onfproto4count"
+ meta l4proto icmp counter name "ol4protocounter"
+ meta skuid 0 counter name "oskuidcounter"
+ meta skgid 0 counter name "oskgidcounter"
+ meta mark 42 counter name "omarkcounter"
+ }
+}
+EOF
+
+if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add test ruleset"
+ exit $ksft_skip
+fi
+
+ret=0
+
+# Verify that a named counter in "inet filter" shows the wanted packet count.
+#   $1 counter name
+#   $2 expected number of packets
+#   $3 accepted for compatibility with existing callers, not used
+# On mismatch prints a FAIL line plus the counter listing and sets the
+# global ret to 1.
+check_one_counter()
+{
+	local cname="$1"
+	local want="packets $2"
+
+	# -w: match whole words only, so "packets 2" is not satisfied by
+	# a counter that reads "packets 20".
+	if ! ip netns exec "$ns0" nft list counter inet filter "$cname" | grep -qw -- "$want"; then
+		echo "FAIL: $cname, want \"$want\", got"
+		ret=1
+		ip netns exec "$ns0" nft list counter inet filter "$cname"
+	fi
+}
+
+# Check every counter that a ping over loopback is expected to touch.
+#   $1 expected packet count, $2 passed through to check_one_counter
+check_lo_counters()
+{
+	local expected="$1"
+	local flag="$2"
+	local name
+	# whitespace separated list, deliberately split by the loop below
+	local names="iifcount iifnamecount iifgroupcount iiftypecount infproto4count
+		oifcount oifnamecount oifgroupcount oiftypecount onfproto4count
+		il4protocounter icurrentyearcounter ol4protocounter"
+
+	for name in $names; do
+		check_one_counter "$name" "$expected" "$flag"
+	done
+}
+
+check_lo_counters "0" false
+ip netns exec "$ns0" ping -q -c 1 127.0.0.1 -m 42 > /dev/null
+
+check_lo_counters "2" true
+
+check_one_counter oskuidcounter "1" true
+check_one_counter oskgidcounter "1" true
+check_one_counter imarkcounter "1" true
+check_one_counter omarkcounter "1" true
+check_one_counter ilastyearcounter "0" true
+
+if [ $ret -eq 0 ];then
+ echo "OK: nftables meta iif/oif counters at expected values"
+else
+ exit $ret
+fi
+
+#First CPU execution and counter
+taskset -p 01 $$ > /dev/null
+ip netns exec "$ns0" nft reset counters > /dev/null
+ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null
+check_one_counter icpu0counter "2" true
+
+if [ $ret -eq 0 ];then
+ echo "OK: nftables meta cpu counter at expected values"
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_nat.sh b/tools/testing/selftests/netfilter/nft_nat.sh
new file mode 100755
index 000000000..67697d8ea
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_nat.sh
@@ -0,0 +1,914 @@
+#!/bin/bash
+#
+# This test is for basic NAT functionality: snat, dnat, redirect, masquerade.
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+test_inet_nat=true
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns0="ns0-$sfx"
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+
+# EXIT trap handler: remove the three test namespaces.
+cleanup()
+{
+	local netns
+
+	for netns in "$ns0" "$ns1" "$ns2"; do
+		ip netns del "$netns"
+	done
+}
+
+# Required tools; skip the test when one of them is missing.
+if ! nft --version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+if ! ip -Version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+if ! ip netns add "$ns0"; then
+	echo "SKIP: Could not create net namespace $ns0"
+	exit $ksft_skip
+fi
+
+# At least one namespace exists from here on, so arm the cleanup handler.
+trap cleanup EXIT
+
+if ! ip netns add "$ns1"; then
+	echo "SKIP: Could not create net namespace $ns1"
+	exit $ksft_skip
+fi
+
+if ! ip netns add "$ns2"; then
+	echo "SKIP: Could not create net namespace $ns2"
+	exit $ksft_skip
+fi
+
+# ns0 is connected to ns1 and to ns2 through one veth pair each.
+if ! ip link add veth0 netns "$ns0" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
+	echo "SKIP: No virtual ethernet pair device support in kernel"
+	exit $ksft_skip
+fi
+ip link add veth1 netns "$ns0" type veth peer name eth0 netns "$ns2"
+
+# ns0 side: .1 / ::1 on each link
+ip -net "$ns0" link set lo up
+ip -net "$ns0" link set veth0 up
+ip -net "$ns0" addr add 10.0.1.1/24 dev veth0
+ip -net "$ns0" addr add dead:1::1/64 dev veth0
+
+ip -net "$ns0" link set veth1 up
+ip -net "$ns0" addr add 10.0.2.1/24 dev veth1
+ip -net "$ns0" addr add dead:2::1/64 dev veth1
+
+# peer side: .99 / ::99, default routes via ns0
+for i in 1 2; do
+	ip -net "ns$i-$sfx" link set lo up
+	ip -net "ns$i-$sfx" link set eth0 up
+	ip -net "ns$i-$sfx" addr add "10.0.$i.99/24" dev eth0
+	ip -net "ns$i-$sfx" route add default via "10.0.$i.1"
+	ip -net "ns$i-$sfx" addr add "dead:$i::99/64" dev eth0
+	ip -net "ns$i-$sfx" route add default via "dead:$i::1"
+done
+
+# Report an unexpected counter value on stderr and dump the counter.
+#   $1 namespace, $2 counter name, $3 expected text, $4 caller tag
+bad_counter()
+{
+	local ns=$1 counter=$2
+	local expect=$3 tag=$4
+
+	{
+		echo "ERROR: $counter counter in $ns has unexpected value (expected $expect) at $tag"
+		ip netns exec $ns nft list counter inet filter $counter
+	} 1>&2
+}
+
+check_counters()
+{
+ ns=$1
+ local lret=0
+
+ cnt=$(ip netns exec $ns nft list counter inet filter ns0in | grep -q "packets 1 bytes 84")
+ if [ $? -ne 0 ]; then
+ bad_counter $ns ns0in "packets 1 bytes 84" "check_counters 1"
+ lret=1
+ fi
+ cnt=$(ip netns exec $ns nft list counter inet filter ns0out | grep -q "packets 1 bytes 84")
+ if [ $? -ne 0 ]; then
+ bad_counter $ns ns0out "packets 1 bytes 84" "check_counters 2"
+ lret=1
+ fi
+
+ expect="packets 1 bytes 104"
+ cnt=$(ip netns exec $ns nft list counter inet filter ns0in6 | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter $ns ns0in6 "$expect" "check_counters 3"
+ lret=1
+ fi
+ cnt=$(ip netns exec $ns nft list counter inet filter ns0out6 | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter $ns ns0out6 "$expect" "check_counters 4"
+ lret=1
+ fi
+
+ return $lret
+}
+
+check_ns0_counters()
+{
+ local ns=$1
+ local lret=0
+
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in | grep -q "packets 0 bytes 0")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0in "packets 0 bytes 0" "check_ns0_counters 1"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0in6 | grep -q "packets 0 bytes 0")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0in6 "packets 0 bytes 0"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out | grep -q "packets 0 bytes 0")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0out "packets 0 bytes 0" "check_ns0_counters 2"
+ lret=1
+ fi
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns0out6 | grep -q "packets 0 bytes 0")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0out6 "packets 0 bytes 0" "check_ns0_counters3 "
+ lret=1
+ fi
+
+ for dir in "in" "out" ; do
+ expect="packets 1 bytes 84"
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" $ns$dir "$expect" "check_ns0_counters 4"
+ lret=1
+ fi
+
+ expect="packets 1 bytes 104"
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ${ns}${dir}6 | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" $ns$dir6 "$expect" "check_ns0_counters 5"
+ lret=1
+ fi
+ done
+
+ return $lret
+}
+
+# Reset all "inet" family counters in the three test namespaces.
+reset_counters()
+{
+	# local loop variable: this function is called from inside
+	# "for i in ..." loops and must not touch the caller's $i
+	local netns
+
+	for netns in "$ns0" "$ns1" "$ns2"; do
+		ip netns exec "$netns" nft reset counters inet > /dev/null
+	done
+}
+
+test_local_dnat6()
+{
+ local family=$1
+ local lret=0
+ local IPF=""
+
+ if [ $family = "inet" ];then
+ IPF="ip6"
+ fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+ chain output {
+ type nat hook output priority 0; policy accept;
+ ip6 daddr dead:1::99 dnat $IPF to dead:2::99
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add add $family dnat hook"
+ return $ksft_skip
+ fi
+
+ # ping netns1, expect rewrite to netns2
+ ip netns exec "$ns0" ping -q -c 1 dead:1::99 > /dev/null
+ if [ $? -ne 0 ]; then
+ lret=1
+ echo "ERROR: ping6 failed"
+ return $lret
+ fi
+
+ expect="packets 0 bytes 0"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat6 1"
+ lret=1
+ fi
+ done
+
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat6 2"
+ lret=1
+ fi
+ done
+
+ # expect 0 count in ns1
+ expect="packets 0 bytes 0"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat6 3"
+ lret=1
+ fi
+ done
+
+ # expect 1 packet in ns2
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat6 4"
+ lret=1
+ fi
+ done
+
+ test $lret -eq 0 && echo "PASS: ipv6 ping to $ns1 was $family NATted to $ns2"
+ ip netns exec "$ns0" nft flush chain ip6 nat output
+
+ return $lret
+}
+
+test_local_dnat()
+{
+ local family=$1
+ local lret=0
+ local IPF=""
+
+ if [ $family = "inet" ];then
+ IPF="ip"
+ fi
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF 2>/dev/null
+table $family nat {
+ chain output {
+ type nat hook output priority 0; policy accept;
+ ip daddr 10.0.1.99 dnat $IPF to 10.0.2.99
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ if [ $family = "inet" ];then
+ echo "SKIP: inet nat tests"
+ test_inet_nat=false
+ return $ksft_skip
+ fi
+ echo "SKIP: Could not add add $family dnat hook"
+ return $ksft_skip
+ fi
+
+ # ping netns1, expect rewrite to netns2
+ ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null
+ if [ $? -ne 0 ]; then
+ lret=1
+ echo "ERROR: ping failed"
+ return $lret
+ fi
+
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns1$dir "$expect" "test_local_dnat 1"
+ lret=1
+ fi
+ done
+
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 2"
+ lret=1
+ fi
+ done
+
+ # expect 0 count in ns1
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_local_dnat 3"
+ lret=1
+ fi
+ done
+
+ # expect 1 packet in ns2
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 4"
+ lret=1
+ fi
+ done
+
+ test $lret -eq 0 && echo "PASS: ping to $ns1 was $family NATted to $ns2"
+
+ ip netns exec "$ns0" nft flush chain $family nat output
+
+ reset_counters
+ ip netns exec "$ns0" ping -q -c 1 10.0.1.99 > /dev/null
+ if [ $? -ne 0 ]; then
+ lret=1
+ echo "ERROR: ping failed"
+ return $lret
+ fi
+
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns1$dir "$expect" "test_local_dnat 5"
+ lret=1
+ fi
+ done
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns2$dir "$expect" "test_local_dnat 6"
+ lret=1
+ fi
+ done
+
+ # expect 1 count in ns1
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0$dir "$expect" "test_local_dnat 7"
+ lret=1
+ fi
+ done
+
+ # expect 0 packet in ns2
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns0$dir "$expect" "test_local_dnat 8"
+ lret=1
+ fi
+ done
+
+ test $lret -eq 0 && echo "PASS: ping to $ns1 OK after $family nat output chain flush"
+
+ return $lret
+}
+
+# DNAT that rewrites only the destination port: every locally generated
+# TCP connection from ns0 is redirected to port 2000, where a socat
+# listener in ns1 answers with "SERVER-<family>".
+#   $1 nft table family
+#   $2 address in ns1 to connect to
+# Sets the global ret to 1 on failure and clears test_inet_nat when an
+# inet nat table cannot be loaded.
+test_local_dnat_portonly()
+{
+	local family=$1
+	local daddr=$2
+	local sc_s
+	local result
+
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+	chain output {
+		type nat hook output priority 0; policy accept;
+		meta l4proto tcp dnat to :2000
+
+	}
+}
+EOF
+	if [ $? -ne 0 ]; then
+		if [ "$family" = "inet" ];then
+			echo "SKIP: inet port test"
+			test_inet_nat=false
+			return
+		fi
+		echo "SKIP: Could not add $family dnat hook"
+		return
+	fi
+
+	echo SERVER-$family | ip netns exec "$ns1" timeout 5 socat -u STDIN TCP-LISTEN:2000 &
+	# pid of the background listener; recorded but not waited on
+	sc_s=$!
+
+	# give the listener time to bind before connecting
+	sleep 1
+
+	result=$(ip netns exec "$ns0" timeout 1 socat TCP:$daddr:2000 STDOUT)
+
+	# compare against what the listener sends for this family
+	if [ "$result" = "SERVER-$family" ];then
+		echo "PASS: $family port rewrite without l3 address"
+	else
+		echo "ERROR: $family port rewrite"
+		ret=1
+	fi
+}
+
+test_masquerade6()
+{
+ local family=$1
+ local natflags=$2
+ local lret=0
+
+ ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+ ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 via ipv6"
+ return 1
+ lret=1
+ fi
+
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns2$dir "$expect" "test_masquerade6 1"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 2"
+ lret=1
+ fi
+ done
+
+ reset_counters
+
+# add masquerading rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ meta oif veth0 masquerade $natflags
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add add $family masquerade hook"
+ return $ksft_skip
+ fi
+
+ ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
+ lret=1
+ fi
+
+ # ns1 should have seen packets from ns0, due to masquerade
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 3"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_masquerade6 4"
+ lret=1
+ fi
+ done
+
+ # ns1 should not have seen packets from ns2, due to masquerade
+ expect="packets 0 bytes 0"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_masquerade6 5"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns1$dir "$expect" "test_masquerade6 6"
+ lret=1
+ fi
+ done
+
+ ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 with active ipv6 masquerade $natflags (attempt 2)"
+ lret=1
+ fi
+
+ ip netns exec "$ns0" nft flush chain $family nat postrouting
+ if [ $? -ne 0 ]; then
+ echo "ERROR: Could not flush $family nat postrouting" 1>&2
+ lret=1
+ fi
+
+ test $lret -eq 0 && echo "PASS: $family IPv6 masquerade $natflags for $ns2"
+
+ return $lret
+}
+
+test_masquerade()
+{
+ local family=$1
+ local natflags=$2
+ local lret=0
+
+ ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+ ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from "$ns2" $natflags"
+ lret=1
+ fi
+
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns2$dir "$expect" "test_masquerade 1"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 2"
+ lret=1
+ fi
+ done
+
+ reset_counters
+
+# add masquerading rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ meta oif veth0 masquerade $natflags
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add add $family masquerade hook"
+ return $ksft_skip
+ fi
+
+ ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 with active $family masquerade $natflags"
+ lret=1
+ fi
+
+ # ns1 should have seen packets from ns0, due to masquerade
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns0${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 3"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_masquerade 4"
+ lret=1
+ fi
+ done
+
+ # ns1 should not have seen packets from ns2, due to masquerade
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_masquerade 5"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns1$dir "$expect" "test_masquerade 6"
+ lret=1
+ fi
+ done
+
+ ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 with active ip masquerade $natflags (attempt 2)"
+ lret=1
+ fi
+
+ ip netns exec "$ns0" nft flush chain $family nat postrouting
+ if [ $? -ne 0 ]; then
+ echo "ERROR: Could not flush $family nat postrouting" 1>&2
+ lret=1
+ fi
+
+ test $lret -eq 0 && echo "PASS: $family IP masquerade $natflags for $ns2"
+
+ return $lret
+}
+
+test_redirect6()
+{
+ local family=$1
+ local lret=0
+
+ ip netns exec "$ns0" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+
+ ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannnot ping $ns1 from $ns2 via ipv6"
+ lret=1
+ fi
+
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns2$dir "$expect" "test_redirect6 1"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_redirect6 2"
+ lret=1
+ fi
+ done
+
+ reset_counters
+
+# add redirect rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+ chain prerouting {
+ type nat hook prerouting priority 0; policy accept;
+ meta iif veth1 meta l4proto icmpv6 ip6 saddr dead:2::99 ip6 daddr dead:1::99 redirect
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add add $family redirect hook"
+ return $ksft_skip
+ fi
+
+ ip netns exec "$ns2" ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 via ipv6 with active $family redirect"
+ lret=1
+ fi
+
+ # ns1 should have seen no packets from ns2, due to redirection
+ expect="packets 0 bytes 0"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 3"
+ lret=1
+ fi
+ done
+
+ # ns0 should have seen packets from ns2, due to masquerade
+ expect="packets 1 bytes 104"
+ for dir in "in6" "out6" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_redirect6 4"
+ lret=1
+ fi
+ done
+
+ ip netns exec "$ns0" nft delete table $family nat
+ if [ $? -ne 0 ]; then
+ echo "ERROR: Could not delete $family nat table" 1>&2
+ lret=1
+ fi
+
+ test $lret -eq 0 && echo "PASS: $family IPv6 redirection for $ns2"
+
+ return $lret
+}
+
+test_redirect()
+{
+ local family=$1
+ local lret=0
+
+ ip netns exec "$ns0" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ ip netns exec "$ns0" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+ ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2"
+ lret=1
+ fi
+
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" $ns2$dir "$expect" "test_redirect 1"
+ lret=1
+ fi
+
+ cnt=$(ip netns exec "$ns2" nft list counter inet filter ns1${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns2" ns1$dir "$expect" "test_redirect 2"
+ lret=1
+ fi
+ done
+
+ reset_counters
+
+# add redirect rule
+ip netns exec "$ns0" nft -f /dev/stdin <<EOF
+table $family nat {
+ chain prerouting {
+ type nat hook prerouting priority 0; policy accept;
+ meta iif veth1 ip protocol icmp ip saddr 10.0.2.99 ip daddr 10.0.1.99 redirect
+ }
+}
+EOF
+ if [ $? -ne 0 ]; then
+ echo "SKIP: Could not add add $family redirect hook"
+ return $ksft_skip
+ fi
+
+ ip netns exec "$ns2" ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
+ if [ $? -ne 0 ] ; then
+ echo "ERROR: cannot ping $ns1 from $ns2 with active $family ip redirect"
+ lret=1
+ fi
+
+ # ns1 should have seen no packets from ns2, due to redirection
+ expect="packets 0 bytes 0"
+ for dir in "in" "out" ; do
+
+ cnt=$(ip netns exec "$ns1" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns1" ns0$dir "$expect" "test_redirect 3"
+ lret=1
+ fi
+ done
+
+ # ns0 should have seen packets from ns2, due to masquerade
+ expect="packets 1 bytes 84"
+ for dir in "in" "out" ; do
+ cnt=$(ip netns exec "$ns0" nft list counter inet filter ns2${dir} | grep -q "$expect")
+ if [ $? -ne 0 ]; then
+ bad_counter "$ns0" ns0$dir "$expect" "test_redirect 4"
+ lret=1
+ fi
+ done
+
+ ip netns exec "$ns0" nft delete table $family nat
+ if [ $? -ne 0 ]; then
+ echo "ERROR: Could not delete $family nat table" 1>&2
+ lret=1
+ fi
+
+ test $lret -eq 0 && echo "PASS: $family IP redirection for $ns2"
+
+ return $lret
+}
+
+
+# ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99
+# Load the same accounting ruleset into all three namespaces.
+# It declares one named counter per peer and direction (IPv4 and IPv6) and
+# four maps that pick the counter by address: the input chain looks up the
+# source address, the output chain the destination address.  Both of ns0's
+# addresses map to the ns0* counters.  The IPv6 rules only count ICMPv6
+# echo requests and replies; the IPv4 rules have no such restriction.
+for i in 0 1 2; do
+ip netns exec ns$i-$sfx nft -f /dev/stdin <<EOF
+table inet filter {
+ counter ns0in {}
+ counter ns1in {}
+ counter ns2in {}
+
+ counter ns0out {}
+ counter ns1out {}
+ counter ns2out {}
+
+ counter ns0in6 {}
+ counter ns1in6 {}
+ counter ns2in6 {}
+
+ counter ns0out6 {}
+ counter ns1out6 {}
+ counter ns2out6 {}
+
+ map nsincounter {
+ type ipv4_addr : counter
+ elements = { 10.0.1.1 : "ns0in",
+ 10.0.2.1 : "ns0in",
+ 10.0.1.99 : "ns1in",
+ 10.0.2.99 : "ns2in" }
+ }
+
+ map nsincounter6 {
+ type ipv6_addr : counter
+ elements = { dead:1::1 : "ns0in6",
+ dead:2::1 : "ns0in6",
+ dead:1::99 : "ns1in6",
+ dead:2::99 : "ns2in6" }
+ }
+
+ map nsoutcounter {
+ type ipv4_addr : counter
+ elements = { 10.0.1.1 : "ns0out",
+ 10.0.2.1 : "ns0out",
+ 10.0.1.99: "ns1out",
+ 10.0.2.99: "ns2out" }
+ }
+
+ map nsoutcounter6 {
+ type ipv6_addr : counter
+ elements = { dead:1::1 : "ns0out6",
+ dead:2::1 : "ns0out6",
+ dead:1::99 : "ns1out6",
+ dead:2::99 : "ns2out6" }
+ }
+
+ chain input {
+ type filter hook input priority 0; policy accept;
+ counter name ip saddr map @nsincounter
+ icmpv6 type { "echo-request", "echo-reply" } counter name ip6 saddr map @nsincounter6
+ }
+ chain output {
+ type filter hook output priority 0; policy accept;
+ counter name ip daddr map @nsoutcounter
+ icmpv6 type { "echo-request", "echo-reply" } counter name ip6 daddr map @nsoutcounter6
+ }
+}
+EOF
+done
+
+sleep 3
+# Baseline: ns0 must reach both peers over IPv4 and IPv6 before any NAT
+# rule is installed, and the counters must reflect exactly those pings.
+for i in 1 2; do
+	if ! ip netns exec "$ns0" ping -c 1 -q 10.0.$i.99 > /dev/null; then
+		echo "ERROR: Could not reach other namespace(s)" 1>&2
+		ret=1
+	fi
+
+	if ! ip netns exec "$ns0" ping -c 1 -q dead:$i::99 > /dev/null; then
+		echo "ERROR: Could not reach other namespace(s) via ipv6" 1>&2
+		ret=1
+	fi
+
+	# counters as seen by the peer ...
+	check_counters ns$i-$sfx || ret=1
+
+	# ... and as seen by ns0
+	check_ns0_counters ns$i || ret=1
+
+	reset_counters
+done
+
+if [ $ret -eq 0 ]; then
+	echo "PASS: netns routing/connectivity: $ns0 can reach $ns1 and $ns2"
+fi
+
+reset_counters
+test_local_dnat ip
+test_local_dnat6 ip6
+
+reset_counters
+test_local_dnat_portonly inet 10.0.1.99
+
+reset_counters
+$test_inet_nat && test_local_dnat inet
+$test_inet_nat && test_local_dnat6 inet
+
+for flags in "" "fully-random"; do
+reset_counters
+test_masquerade ip $flags
+test_masquerade6 ip6 $flags
+reset_counters
+$test_inet_nat && test_masquerade inet $flags
+$test_inet_nat && test_masquerade6 inet $flags
+done
+
+reset_counters
+test_redirect ip
+test_redirect6 ip6
+reset_counters
+$test_inet_nat && test_redirect inet
+$test_inet_nat && test_redirect6 inet
+
+if [ $ret -ne 0 ];then
+ echo -n "FAIL: "
+ nft --version
+fi
+
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh
new file mode 100755
index 000000000..3d202b90b
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_queue.sh
@@ -0,0 +1,376 @@
+#!/bin/bash
+#
+# This tests nf_queue:
+# 1. can process packets from all hooks
+# 2. support running nfqueue from more than one base chain
+#
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+ret=0
+
+sfx=$(mktemp -u "XXXXXXXX")
+ns1="ns1-$sfx"
+ns2="ns2-$sfx"
+nsrouter="nsrouter-$sfx"
+timeout=4
+
+# EXIT trap handler: remove the namespaces and the temporary files.
+cleanup()
+{
+	local netns
+
+	for netns in ${ns1} ${ns2} ${nsrouter}; do
+		ip netns del $netns
+	done
+
+	rm -f "$TMPFILE0" "$TMPFILE1" "$TMPFILE2" "$TMPFILE3"
+}
+
+# Required tools; skip the test when one of them is missing.
+if ! nft --version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without nft tool"
+	exit $ksft_skip
+fi
+
+if ! ip -Version > /dev/null 2>&1; then
+	echo "SKIP: Could not run test without ip tool"
+	exit $ksft_skip
+fi
+
+if ! ip netns add ${nsrouter}; then
+	echo "SKIP: Could not create net namespace"
+	exit $ksft_skip
+fi
+
+# Scratch files, removed again by the EXIT handler.
+TMPFILE0=$(mktemp)
+TMPFILE1=$(mktemp)
+TMPFILE2=$(mktemp)
+TMPFILE3=$(mktemp)
+trap cleanup EXIT
+
+ip netns add ${ns1}
+ip netns add ${ns2}
+
+# The router namespace is connected to ns1 and ns2 with one veth pair each.
+if ! ip link add veth0 netns ${nsrouter} type veth peer name eth0 netns ${ns1} > /dev/null 2>&1; then
+	echo "SKIP: No virtual ethernet pair device support in kernel"
+	exit $ksft_skip
+fi
+ip link add veth1 netns ${nsrouter} type veth peer name eth0 netns ${ns2}
+
+# router: .1 / ::1 on both links
+ip -net ${nsrouter} link set lo up
+ip -net ${nsrouter} link set veth0 up
+ip -net ${nsrouter} addr add 10.0.1.1/24 dev veth0
+ip -net ${nsrouter} addr add dead:1::1/64 dev veth0
+
+ip -net ${nsrouter} link set veth1 up
+ip -net ${nsrouter} addr add 10.0.2.1/24 dev veth1
+ip -net ${nsrouter} addr add dead:2::1/64 dev veth1
+
+# ns1: .99 / ::99, default routes via the router
+ip -net ${ns1} link set lo up
+ip -net ${ns1} link set eth0 up
+ip -net ${ns1} addr add 10.0.1.99/24 dev eth0
+ip -net ${ns1} addr add dead:1::99/64 dev eth0
+ip -net ${ns1} route add default via 10.0.1.1
+ip -net ${ns1} route add default via dead:1::1
+
+# ns2: same layout on the second link
+ip -net ${ns2} link set lo up
+ip -net ${ns2} link set eth0 up
+ip -net ${ns2} addr add 10.0.2.99/24 dev eth0
+ip -net ${ns2} addr add dead:2::99/64 dev eth0
+ip -net ${ns2} route add default via 10.0.2.1
+ip -net ${ns2} route add default via dead:2::1
+
+# Load an inet table into the router namespace that queues traffic from
+# every hook.
+#   $1 table name
+#   $2 priority used for all five base chains
+# The shared "nfq" chain queues ICMP with "queue bypass" (no queue number
+# given) and ICMPv6 echo request/reply to queue 1.  In addition, TCP to
+# port 12345 goes to queue 2 in forward and to queue 3 in output.
+load_ruleset() {
+ local name=$1
+ local prio=$2
+
+ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
+table inet $name {
+ chain nfq {
+ ip protocol icmp queue bypass
+ icmpv6 type { "echo-request", "echo-reply" } queue num 1 bypass
+ }
+ chain pre {
+ type filter hook prerouting priority $prio; policy accept;
+ jump nfq
+ }
+ chain input {
+ type filter hook input priority $prio; policy accept;
+ jump nfq
+ }
+ chain forward {
+ type filter hook forward priority $prio; policy accept;
+ tcp dport 12345 queue num 2
+ jump nfq
+ }
+ chain output {
+ type filter hook output priority $prio; policy accept;
+ tcp dport 12345 queue num 3
+ jump nfq
+ }
+ chain post {
+ type filter hook postrouting priority $prio; policy accept;
+ jump nfq
+ }
+}
+EOF
+}
+
+# Load the table "inet countrules" into the router namespace: one base
+# chain per hook, each holding a single anonymous counter rule.
+#   $1 priority used for all five base chains
+load_counter_ruleset() {
+ local prio=$1
+
+ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
+table inet countrules {
+ chain pre {
+ type filter hook prerouting priority $prio; policy accept;
+ counter
+ }
+ chain input {
+ type filter hook input priority $prio; policy accept;
+ counter
+ }
+ chain forward {
+ type filter hook forward priority $prio; policy accept;
+ counter
+ }
+ chain output {
+ type filter hook output priority $prio; policy accept;
+ counter
+ }
+ chain post {
+ type filter hook postrouting priority $prio; policy accept;
+ counter
+ }
+}
+EOF
+}
+
+test_ping() {
+ ip netns exec ${ns1} ping -c 1 -q 10.0.2.99 > /dev/null
+ if [ $? -ne 0 ];then
+ return 1
+ fi
+
+ ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null
+ if [ $? -ne 0 ];then
+ return 1
+ fi
+
+ return 0
+}
+
+test_ping_router() {
+ ip netns exec ${ns1} ping -c 1 -q 10.0.2.1 > /dev/null
+ if [ $? -ne 0 ];then
+ return 1
+ fi
+
+ ip netns exec ${ns1} ping -c 1 -q dead:2::1 > /dev/null
+ if [ $? -ne 0 ];then
+ return 1
+ fi
+
+ return 0
+}
+
+# Queue all forwarded packets of one family to queue 600, where nothing
+# listens, and verify that a ping through the router then fails.
+#   $1 table family, "ip" or "ip6"
+# Exits the script with status 1 if the ping unexpectedly succeeds or the
+# temporary table cannot be deleted.
+test_queue_blackhole() {
+	local proto=$1
+	local lret
+
+ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
+table $proto blackh {
+	chain forward {
+	type filter hook forward priority 0; policy accept;
+		queue num 600
+	}
+}
+EOF
+	case "$proto" in
+	ip)
+		ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null
+		lret=$?
+		;;
+	ip6)
+		ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null
+		lret=$?
+		;;
+	*)
+		# unknown family: nothing was pinged, use a non-zero marker value
+		lret=111
+		;;
+	esac
+
+	# queue without bypass keyword should drop traffic if no listener exists.
+	if [ $lret -eq 0 ];then
+		echo "FAIL: $proto expected failure, got $lret" 1>&2
+		exit 1
+	fi
+
+	if ! ip netns exec ${nsrouter} nft delete table $proto blackh; then
+		echo "FAIL: $proto: Could not delete blackh table"
+		exit 1
+	fi
+
+	echo "PASS: $proto: statement with no listener results in packet drop"
+}
+
+# Start nf-queue listeners on queues 0 and 1, run both ping tests and
+# compare the last line each listener printed with the expected total.
+# $1: number of packets each listener is expected to report.
+# Exits the script on a ping failure or a count mismatch.
+test_queue()
+{
+	local expected=$1
+	local last=""
+
+	# spawn nf-queue listeners
+	ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" &
+	ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" &
+	sleep 1
+	test_ping
+	ret=$?
+	if [ $ret -ne 0 ];then
+		# the messages used to expand an unset \$queue variable
+		echo "FAIL: netns routing/connectivity with active listeners on queues 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	test_ping_router
+	ret=$?
+	if [ $ret -ne 0 ];then
+		echo "FAIL: netns router unreachable with active listeners on queues 0 and 1: $ret" 1>&2
+		exit $ret
+	fi
+
+	# let both listeners finish so their output files are complete
+	wait
+	ret=$?
+
+	for file in $TMPFILE0 $TMPFILE1; do
+		last=$(tail -n1 "$file")
+		if [ x"$last" != x"$expected packets total" ]; then
+			echo "FAIL: Expected $expected packets total, but got $last" 1>&2
+			cat "$file" 1>&2
+
+			ip netns exec ${nsrouter} nft list ruleset
+			exit 1
+		fi
+	done
+
+	echo "PASS: Expected and received $last"
+}
+
+# Transfer a 200M sparse file over TCP port 12345 between an nc listener
+# in ${ns2} and an nc client in ${ns1} while nf-queue listens on queue 2
+# in ${nsrouter}.  PASS is printed only when both nc processes exit with
+# status 0.
+test_tcp_forward()
+{
+	ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout &
+	local nfqpid=$!
+
+	tmpfile=$(mktemp) || exit 1
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
+	ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
+	local rpid=$!
+
+	sleep 1
+	ip netns exec ${ns1} nc -w 5 10.0.2.99 12345 <"$tmpfile" >/dev/null &
+	# Remember the client pid.  It used to be left unset, which turned
+	# "wait $lpid" into a bare "wait" that always returns 0.
+	local lpid=$!
+
+	# both nc processes already have the file open on stdin
+	rm -f "$tmpfile"
+
+	wait $rpid
+	local srv_ret=$?
+	wait $lpid
+	local cli_ret=$?
+
+	[ $srv_ret -eq 0 ] && [ $cli_ret -eq 0 ] && echo "PASS: tcp and nfqueue in forward chain"
+}
+
+# Transfer a 200M sparse file over 127.0.0.1:12345 inside ${nsrouter}
+# while nf-queue listens on queue 3.  PASS is printed when the nc
+# listener exits with status 0.
+test_tcp_localhost()
+{
+	tmpfile=$(mktemp) || exit 1
+
+	dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
+	ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
+	local server_pid=$!
+
+	ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout &
+
+	sleep 1
+	ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null
+	rm -f "$tmpfile"
+
+	if wait $server_pid; then
+		echo "PASS: tcp via loopback"
+	fi
+
+	# reap the remaining background job(s)
+	wait 2>/dev/null
+}
+
+# Replace the whole ruleset in ${nsrouter} with rate-limited "queue num 0"
+# rules in the output and postrouting hooks, run a loopback TCP transfer
+# with two nf-queue listeners and compare what both listeners printed.
+# Prints PASS when the two output files are identical, FAIL otherwise.
+test_tcp_localhost_requeue()
+{
+ip netns exec ${nsrouter} nft -f /dev/stdin <<EOF
+flush ruleset
+table inet filter {
+ chain output {
+ type filter hook output priority 0; policy accept;
+ tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+ }
+ chain post {
+ type filter hook postrouting priority 0; policy accept;
+ tcp dport 12345 limit rate 1/second burst 1 packets counter queue num 0
+ }
+}
+EOF
+ tmpfile=$(mktemp) || exit 1
+ dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile
+ ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null &
+ local rpid=$!
+
+ # listener for queue 1
+ ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" &
+
+ # The rules above send packets to queue 0.  The listener for queue 0 is
+ # started with "-Q 1"; presumably that makes it re-queue each packet to
+ # the queue 1 listener started above (TODO confirm against nf-queue.c).
+ ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" &
+
+ sleep 1
+ ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null
+ rm -f "$tmpfile"
+
+ # wait for the nc listener and both nf-queue programs
+ wait
+
+ if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then
+ echo "FAIL: lost packets during requeue?!" 1>&2
+ return
+ fi
+
+ echo "PASS: tcp via loopback and re-queueing"
+}
+
+# Enable IPv6 forwarding globally and IPv4 forwarding on veth0/veth1
+# inside the router namespace.
+ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
+ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
+ip netns exec ${nsrouter} sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
+
+load_ruleset "filter" 0
+
+sleep 3
+
+# No nf-queue listener is running at this point.
+test_ping
+ret=$?
+if [ $ret -eq 0 ];then
+ # queue bypass works (rules were skipped, no listener)
+ echo "PASS: ${ns1} can reach ${ns2}"
+else
+ echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
+ exit $ret
+fi
+
+test_queue_blackhole ip
+test_queue_blackhole ip6
+
+# dummy ruleset to add base chains between the
+# queueing rules. We don't want the second reinject
+# to re-execute the old hooks.
+load_counter_ruleset 10
+
+# we are hooking all: prerouting/input/forward/output/postrouting.
+# we ping ${ns2} from ${ns1} via ${nsrouter} using ipv4 and ipv6, so:
+# 1x icmp prerouting,forward,postrouting -> 3 queue events (6 incl. reply).
+# 1x icmp prerouting,input,output postrouting -> 4 queue events incl. reply.
+# so we expect that userspace program receives 10 packets.
+test_queue 10
+
+# same. We queue to a second program as well.
+load_ruleset "filter2" 20
+test_queue 20
+
+test_tcp_forward
+test_tcp_localhost
+test_tcp_localhost_requeue
+
+# $ret still holds the value last stored by test_queue; the three tcp
+# tests above do not assign it.
+exit $ret
diff --git a/tools/testing/selftests/netfilter/nft_trans_stress.sh b/tools/testing/selftests/netfilter/nft_trans_stress.sh
new file mode 100755
index 000000000..f1affd12c
--- /dev/null
+++ b/tools/testing/selftests/netfilter/nft_trans_stress.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#
+# This test is for stress-testing the nf_tables config plane path vs.
+# packet path processing: Make sure we never release rules that are
+# still visible to other cpus.
+#
+# set -e
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+testns=testns1
+tables="foo bar baz quux"
+
+# Skip (rather than fail) when the nft or ip tools cannot be run.
+nft --version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without nft tool"
+ exit $ksft_skip
+fi
+
+ip -Version > /dev/null 2>&1
+if [ $? -ne 0 ];then
+ echo "SKIP: Could not run test without ip tool"
+ exit $ksft_skip
+fi
+
+# Build one nft batch file.  For every table it adds and flushes the
+# table, adds INPUT/OUTPUT base chains and 400 regular chains, then adds
+# a counting jump from both base chains to every regular chain plus a
+# counting return rule inside each regular chain.
+tmp=$(mktemp)
+
+for table in $tables; do
+ echo add table inet "$table" >> "$tmp"
+ echo flush table inet "$table" >> "$tmp"
+
+ echo "add chain inet $table INPUT { type filter hook input priority 0; }" >> "$tmp"
+ echo "add chain inet $table OUTPUT { type filter hook output priority 0; }" >> "$tmp"
+ for c in $(seq 1 400); do
+ chain=$(printf "chain%03u" "$c")
+ echo "add chain inet $table $chain" >> "$tmp"
+ done
+
+ for c in $(seq 1 400); do
+ chain=$(printf "chain%03u" "$c")
+ for BASE in INPUT OUTPUT; do
+ echo "add rule inet $table $BASE counter jump $chain" >> "$tmp"
+ done
+ echo "add rule inet $table $chain counter return" >> "$tmp"
+ done
+done
+
+# All pings and ruleset changes run inside a dedicated namespace.
+ip netns add "$testns"
+ip -netns "$testns" link set lo up
+
+# Take the CPU count from the "CPU(s):" line of lscpu and, for every CPU
+# index, start one IPv4 and one IPv6 flood ping in the background, each
+# run through taskset with a single-bit mask for that index.
+lscpu | grep ^CPU\(s\): | ( read cpu cpunum ;
+cpunum=$((cpunum-1))
+for i in $(seq 0 $cpunum);do
+ mask=$(printf 0x%x $((1<<$i)))
+ ip netns exec "$testns" taskset $mask ping -4 127.0.0.1 -fq > /dev/null &
+ ip netns exec "$testns" taskset $mask ping -6 ::1 -fq > /dev/null &
+done)
+
+sleep 1
+
+# Load the batch file ten times concurrently.
+for i in $(seq 1 10) ; do ip netns exec "$testns" nft -f "$tmp" & done
+
+# Delete the tables one by one, each after a random pause of 0-9 seconds.
+# Errors from the delete are hidden.
+for table in $tables;do
+ randsleep=$((RANDOM%10))
+ sleep $randsleep
+ ip netns exec "$testns" nft delete table inet $table 2>/dev/null
+done
+
+randsleep=$((RANDOM%10))
+sleep $randsleep
+
+# NOTE(review): pkill selects by process name, so this is not limited to
+# the pings started above.
+pkill -9 ping
+
+# wait for the background nft loads started by this shell
+wait
+
+rm -f "$tmp"
+ip netns del "$testns"
diff --git a/tools/testing/selftests/nsfs/.gitignore b/tools/testing/selftests/nsfs/.gitignore
new file mode 100644
index 000000000..ed79ebdf2
--- /dev/null
+++ b/tools/testing/selftests/nsfs/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+owner
+pidns
diff --git a/tools/testing/selftests/nsfs/Makefile b/tools/testing/selftests/nsfs/Makefile
new file mode 100644
index 000000000..dd9bd50b7
--- /dev/null
+++ b/tools/testing/selftests/nsfs/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+TEST_GEN_PROGS := owner pidns
+
+CFLAGS := -Wall -Werror
+
+include ../lib.mk
diff --git a/tools/testing/selftests/nsfs/config b/tools/testing/selftests/nsfs/config
new file mode 100644
index 000000000..598d0a225
--- /dev/null
+++ b/tools/testing/selftests/nsfs/config
@@ -0,0 +1,3 @@
+CONFIG_USER_NS=y
+CONFIG_UTS_NS=y
+CONFIG_PID_NS=y
diff --git a/tools/testing/selftests/nsfs/owner.c b/tools/testing/selftests/nsfs/owner.c
new file mode 100644
index 000000000..96a976c74
--- /dev/null
+++ b/tools/testing/selftests/nsfs/owner.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+#define NSIO 0xb7
+#define NS_GET_USERNS _IO(NSIO, 0x1)
+
+/*
+ * Print "<function>:<line>:<message>: <strerror(errno)>" to stderr (the
+ * trailing %m expands to the errno text) and evaluate to 1, so that
+ * callers can write "return pr_err(...)".
+ */
+#define pr_err(fmt, ...) \
+ ({ \
+ fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+ __func__, __LINE__, ##__VA_ARGS__); \
+ 1; \
+ })
+
+/*
+ * Exercise the NS_GET_USERNS ioctl.
+ *
+ * A child unshares new UTS and user namespaces and then sleeps.  The
+ * parent opens the child's UTS namespace, asks for its owning user
+ * namespace and checks that it has the same inode as /proc/<pid>/ns/user.
+ * It then checks that the remaining NS_GET_USERNS calls below fail with
+ * EPERM, both before and after the parent itself unshares a new user
+ * namespace.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int main(int argc, char *argvp[])
+{
+	int pfd[2], ns, uns, init_uns;
+	struct stat st1, st2;
+	char path[128];
+	pid_t pid;
+	char c;
+
+	if (pipe(pfd))
+		return 1;
+
+	pid = fork();
+	if (pid < 0)
+		return pr_err("fork");
+	if (pid == 0) {
+		prctl(PR_SET_PDEATHSIG, SIGKILL);
+		if (unshare(CLONE_NEWUTS | CLONE_NEWUSER))
+			return pr_err("unshare");
+		/* closing the write end is what wakes up the parent */
+		close(pfd[0]);
+		close(pfd[1]);
+		while (1)
+			sleep(1);
+		return 0;
+	}
+	close(pfd[1]);
+	/*
+	 * Nothing is ever written to the pipe: read() returning 0 (EOF)
+	 * means the child has closed its write end.
+	 */
+	if (read(pfd[0], &c, 1) != 0)
+		return pr_err("Unable to read from pipe");
+	close(pfd[0]);
+
+	snprintf(path, sizeof(path), "/proc/%d/ns/uts", pid);
+	ns = open(path, O_RDONLY);
+	if (ns < 0)
+		return pr_err("Unable to open %s", path);
+
+	uns = ioctl(ns, NS_GET_USERNS);
+	if (uns < 0)
+		return pr_err("Unable to get an owning user namespace");
+
+	if (fstat(uns, &st1))
+		return pr_err("fstat");
+
+	snprintf(path, sizeof(path), "/proc/%d/ns/user", pid);
+	if (stat(path, &st2))
+		return pr_err("stat");
+
+	if (st1.st_ino != st2.st_ino)
+		return pr_err("NS_GET_USERNS returned a wrong namespace");
+
+	init_uns = ioctl(uns, NS_GET_USERNS);
+	/* check the descriptor that was just requested, not uns again */
+	if (init_uns < 0)
+		return pr_err("Unable to get an owning user namespace");
+
+	if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+		return pr_err("Don't get EPERM");
+
+	if (unshare(CLONE_NEWUSER))
+		return pr_err("unshare");
+
+	if (ioctl(ns, NS_GET_USERNS) >= 0 || errno != EPERM)
+		return pr_err("Don't get EPERM");
+	if (ioctl(init_uns, NS_GET_USERNS) >= 0 || errno != EPERM)
+		return pr_err("Don't get EPERM");
+
+	kill(pid, SIGKILL);
+	wait(NULL);
+	return 0;
+}
diff --git a/tools/testing/selftests/nsfs/pidns.c b/tools/testing/selftests/nsfs/pidns.c
new file mode 100644
index 000000000..e3c772c6a
--- /dev/null
+++ b/tools/testing/selftests/nsfs/pidns.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+
+/*
+ * Print "<function>:<line>:<message>: <strerror(errno)>" to stderr (the
+ * trailing %m expands to the errno text) and evaluate to 1, so that
+ * callers can write "return pr_err(...)".
+ */
+#define pr_err(fmt, ...) \
+ ({ \
+ fprintf(stderr, "%s:%d:" fmt ": %m\n", \
+ __func__, __LINE__, ##__VA_ARGS__); \
+ 1; \
+ })
+
+#define NSIO 0xb7
+#define NS_GET_USERNS _IO(NSIO, 0x1)
+#define NS_GET_PARENT _IO(NSIO, 0x2)
+
+/*
+ * Stack buffer for clone(): a 128-byte, 16-byte aligned array followed by
+ * a zero-length marker member.  main() passes stack_ptr, i.e. the address
+ * just past the end of stack[], as the child stack argument.
+ */
+#define __stack_aligned__ __attribute__((aligned(16)))
+struct cr_clone_arg {
+ char stack[128] __stack_aligned__;
+ char stack_ptr[];
+};
+
+/*
+ * Entry point of the cloned child: ask to be killed with SIGKILL when the
+ * parent dies, then sleep forever.
+ */
+static int child(void *args)
+{
+	prctl(PR_SET_PDEATHSIG, SIGKILL);
+
+	for (;;)
+		sleep(1);
+
+	/* not reached */
+	exit(0);
+}
+
+/*
+ * Exercise the NS_GET_PARENT ioctl.
+ *
+ * Clone a child into new user and pid namespaces.  For each of "pid" and
+ * "user", open the child's namespace file, ask for its parent and check
+ * that the parent has the same inode as the caller's own namespace of
+ * that type, and that asking for the parent of that parent fails with
+ * EPERM.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	char *ns_strs[] = {"pid", "user"};
+	char path[] = "/proc/0123456789/ns/pid";
+	struct stat st_parent, st_self;
+	struct cr_clone_arg ca;
+	int ns_fd, parent_fd;
+	pid_t child_pid;
+	int idx = 0;
+
+	child_pid = clone(child, ca.stack_ptr,
+			  CLONE_NEWUSER | CLONE_NEWPID | SIGCHLD, NULL);
+	if (child_pid < 0)
+		return pr_err("clone");
+
+	while (idx < 2) {
+		const char *type = ns_strs[idx];
+
+		snprintf(path, sizeof(path), "/proc/%d/ns/%s", child_pid, type);
+		ns_fd = open(path, O_RDONLY);
+		if (ns_fd < 0)
+			return pr_err("Unable to open %s", path);
+
+		parent_fd = ioctl(ns_fd, NS_GET_PARENT);
+		if (parent_fd < 0)
+			return pr_err("Unable to get a parent pidns");
+
+		/* path is reused for the caller's own namespace file */
+		snprintf(path, sizeof(path), "/proc/self/ns/%s", type);
+		if (stat(path, &st_self))
+			return pr_err("Unable to stat %s", path);
+		if (fstat(parent_fd, &st_parent))
+			return pr_err("Unable to stat the parent pidns");
+		if (st_parent.st_ino != st_self.st_ino)
+			return pr_err("NS_GET_PARENT returned a wrong namespace");
+
+		if (ioctl(parent_fd, NS_GET_PARENT) >= 0 || errno != EPERM)
+			return pr_err("Don't get EPERM");
+
+		idx++;
+	}
+
+	kill(child_pid, SIGKILL);
+	wait(NULL);
+	return 0;
+}
diff --git a/tools/testing/selftests/ntb/ntb_test.sh b/tools/testing/selftests/ntb/ntb_test.sh
new file mode 100755
index 000000000..020137b61
--- /dev/null
+++ b/tools/testing/selftests/ntb/ntb_test.sh
@@ -0,0 +1,631 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2016 Microsemi. All Rights Reserved.
+#
+# Author: Logan Gunthorpe <logang@deltatee.com>
+
+REMOTE_HOST=
+LIST_DEVS=FALSE
+
+DEBUGFS=${DEBUGFS-/sys/kernel/debug}
+
+PERF_RUN_ORDER=32
+MAX_MW_SIZE=0
+RUN_DMA_TESTS=
+DONT_CLEANUP=
+MW_SIZE=65536
+
+# Print the usage text, including the current MW_SIZE and PERF_RUN_ORDER
+# defaults, to stdout.
+function show_help()
+{
+	cat <<EOF
+Usage: $0 [OPTIONS] LOCAL_DEV REMOTE_DEV
+Run tests on a pair of NTB endpoints.
+
+If the NTB device loops back to the same host then,
+just specifying the two PCI ids on the command line is
+sufficient. Otherwise, if the NTB link spans two hosts
+use the -r option to specify the hostname for the remote
+device. SSH will then be used to test the remote side.
+An SSH key between the root users of the host would then
+be highly recommended.
+
+Options:
+ -C don't cleanup ntb modules on exit
+ -h show this help message
+ -l list available local and remote PCI ids
+ -r REMOTE_HOST specify the remote's hostname to connect
+ to for the test (using ssh)
+ -m MW_SIZE memory window size for ntb_tool
+ (default: $MW_SIZE)
+ -d run dma tests for ntb_perf
+ -p ORDER total data order for ntb_perf
+ (default: $PERF_RUN_ORDER)
+ -w MAX_MW_SIZE maxmium memory window size for ntb_perf
+
+EOF
+}
+
+# Parse the option flags at the front of "$@" into the global settings
+# (DONT_CLEANUP, RUN_DMA_TESTS, LIST_DEVS, MW_SIZE, REMOTE_HOST,
+# PERF_RUN_ORDER, MAX_MW_SIZE).  -h prints the help and exits 0; an
+# invalid option exits 1.  OPTIND is left for the caller to shift by.
+function parse_args()
+{
+ # restart getopts: this function is called more than once
+ OPTIND=0
+ # NOTE(review): the option string accepts "b:" but no case arm handles it.
+ while getopts "b:Cdhlm:r:p:w:" opt; do
+ case "$opt" in
+ C) DONT_CLEANUP=1 ;;
+ d) RUN_DMA_TESTS=1 ;;
+ h) show_help; exit 0 ;;
+ l) LIST_DEVS=TRUE ;;
+ m) MW_SIZE=${OPTARG} ;;
+ r) REMOTE_HOST=${OPTARG} ;;
+ p) PERF_RUN_ORDER=${OPTARG} ;;
+ w) MAX_MW_SIZE=${OPTARG} ;;
+ \?)
+ echo "Invalid option: -$OPTARG" >&2
+ exit 1
+ ;;
+ esac
+ done
+}
+
+parse_args "$@"
+shift $((OPTIND-1))
+LOCAL_DEV=$1
+shift
+parse_args "$@"
+shift $((OPTIND-1))
+REMOTE_DEV=$1
+shift
+parse_args "$@"
+
+set -e
+
+# Run modprobe with the given arguments locally and, when REMOTE_HOST is
+# set, on the remote host over ssh as well.
+# Returns 1 if either invocation fails.
+function _modprobe()
+{
+	if ! modprobe "$@"; then
+		return 1
+	fi
+
+	# single-host setup: nothing more to do
+	[[ -z "$REMOTE_HOST" ]] && return 0
+
+	ssh "$REMOTE_HOST" modprobe "$@" || return 1
+}
+
+# Split "host:/path" into the globals REMOTE (text before the first ':')
+# and VPATH (text after it).  An argument without ":/" is stored in VPATH
+# unchanged and REMOTE is left empty.
+function split_remote()
+{
+	VPATH=$1
+	REMOTE=
+
+	case "$VPATH" in
+	*":/"*)
+		REMOTE=${VPATH%%:*}
+		VPATH=${VPATH#*:}
+		;;
+	esac
+}
+
+function read_file()
+{
+ split_remote $1
+ if [[ "$REMOTE" != "" ]]; then
+ ssh "$REMOTE" cat "$VPATH"
+ else
+ cat "$VPATH"
+ fi
+}
+
+# Write the value $1 to the file $2, which may be a local path or
+# "host:/path" (written with echo over ssh).
+function write_file()
+{
+	split_remote $2
+	VALUE=$1
+
+	if [[ "$REMOTE" == "" ]]; then
+		echo "$VALUE" > "$VPATH"
+	else
+		ssh "$REMOTE" "echo \"$VALUE\" > \"$VPATH\""
+	fi
+}
+
+# Succeed if $1 exists; $1 may be a local path or "host:/path" (checked
+# over ssh).
+function check_file()
+{
+	split_remote $1
+
+	if [[ "$REMOTE" == "" ]]; then
+		[[ -e ${VPATH} ]]
+	else
+		ssh "$REMOTE" "[[ -e ${VPATH} ]]"
+	fi
+}
+
+# Print the name of the directory that contains $1, i.e. the
+# second-to-last component of the path.
+function subdirname()
+{
+ echo $(basename $(dirname $1)) 2> /dev/null
+}
+
+# Look through $2/peer0 .. $2/peer63 for the peer whose "port" file
+# equals $1 and print its index.
+# Returns 0 when found; returns 1 when a peer directory is missing or no
+# peer matches.
+function find_pidx()
+{
+	PORT=$1
+	PPATH=$2
+
+	i=0
+	while ((i < 64)); do
+		PEER_DIR="$PPATH/peer$i"
+
+		# peers are numbered without gaps: stop at the first missing one
+		check_file ${PEER_DIR} || break
+
+		PEER_PORT=$(read_file "${PEER_DIR}/port")
+		if [[ ${PORT} -eq $PEER_PORT ]]; then
+			echo $i
+			return 0
+		fi
+
+		i=$((i + 1))
+	done
+
+	return 1
+}
+
+# Read the port number on both sides and look up the peer index each side
+# uses for the other.  The results stay in the globals LOCAL_PORT,
+# REMOTE_PORT, LOCAL_PIDX and REMOTE_PIDX.
+# $1: local device directory, $2: remote device directory.
+function port_test()
+{
+	LOC=$1
+	REM=$2
+
+	printf '%s\n' "Running port tests on: $(basename $LOC) / $(basename $REM)"
+
+	LOCAL_PORT=$(read_file "$LOC/port")
+	REMOTE_PORT=$(read_file "$REM/port")
+
+	LOCAL_PIDX=$(find_pidx ${REMOTE_PORT} "$LOC")
+	REMOTE_PIDX=$(find_pidx ${LOCAL_PORT} "$REM")
+
+	printf '%s\n' "Local port ${LOCAL_PORT} with index ${REMOTE_PIDX} on remote host"
+	printf '%s\n' "Peer port ${REMOTE_PORT} with index ${LOCAL_PIDX} on local host"
+
+	printf '%s\n' " Passed"
+}
+
+# Write "N" to $1/../link and $1/link_event, check that $2/link then
+# reads "N", and write "Y" back to $1/../link.
+# Prints " Unsupported" and returns if the first write fails; exits the
+# script if the other side does not read "N".
+function link_test()
+{
+	LOC=$1
+	REM=$2
+	EXP=0
+
+	echo "Running link tests on: $(subdirname $LOC) / $(subdirname $REM)"
+
+	if ! write_file "N" "$LOC/../link" 2> /dev/null; then
+		echo " Unsupported"
+		return
+	fi
+
+	write_file "N" "$LOC/link_event"
+
+	[[ $(read_file "$REM/link") == "N" ]] || {
+		echo "Expected link to be down in $REM/link" >&2
+		exit -1
+	}
+
+	write_file "Y" "$LOC/../link"
+
+	echo " Passed"
+}
+
+# Doorbell test between $1 (LOC) and $2 (REM).
+# For each of 64 bit positions: check that $REM/db equals the expected
+# value accumulated so far, then write "s <bit>" to $LOC/peer_db for the
+# next bit (masked with db_valid_mask).  Exits the script on a mismatch.
+# NOTE(review): the "c"/"s" prefixes presumably mean clear/set in the
+# ntb_tool debugfs interface - confirm against the driver.
+function doorbell_test()
+{
+ LOC=$1
+ REM=$2
+ EXP=0
+
+ echo "Running db tests on: $(basename $LOC) / $(basename $REM)"
+
+ DB_VALID_MASK=$(read_file "$LOC/db_valid_mask")
+
+ write_file "c $DB_VALID_MASK" "$REM/db"
+
+ for ((i = 0; i < 64; i++)); do
+ DB=$(read_file "$REM/db")
+ if [[ "$DB" -ne "$EXP" ]]; then
+ echo "Doorbell doesn't match expected value $EXP " \
+ "in $REM/db" >&2
+ exit -1
+ fi
+
+ # let returns non-zero when the result is 0; "|| true" keeps set -e
+ # from aborting the script in that case
+ let "MASK = (1 << $i) & $DB_VALID_MASK" || true
+ let "EXP = $EXP | $MASK" || true
+
+ write_file "s $MASK" "$LOC/peer_db"
+ done
+
+ write_file "c $DB_VALID_MASK" "$REM/db_mask"
+ write_file $DB_VALID_MASK "$REM/db_event"
+ write_file "s $DB_VALID_MASK" "$REM/db_mask"
+
+ write_file "c $DB_VALID_MASK" "$REM/db"
+
+ echo " Passed"
+}
+
+# Print how many entries in directory $2 have a name starting with $1.
+# $2 may be a local path or "host:/path" (counted over ssh).
+function get_files_count()
+{
+ NAME=$1
+ LOC=$2
+
+ split_remote $LOC
+
+ if [[ "$REMOTE" == "" ]]; then
+ echo $(ls -1 "$VPATH"/${NAME}* 2>/dev/null | wc -l)
+ else
+ echo $(ssh "$REMOTE" "ls -1 \"$VPATH\"/${NAME}* | \
+ wc -l" 2> /dev/null)
+ fi
+}
+
+# For every spad file under $1: write a random value to $1/spad<i> and
+# check that $2/../spad<i> reads back the same value.
+# Prints " Unsupported" when $1 has no spad files; exits the script on a
+# mismatch.
+function scratchpad_test()
+{
+	LOC=$1
+	REM=$2
+
+	echo "Running spad tests on: $(subdirname $LOC) / $(subdirname $REM)"
+
+	CNT=$(get_files_count "spad" "$LOC")
+
+	if [[ $CNT -eq 0 ]]; then
+		echo " Unsupported"
+		return
+	fi
+
+	for ((i = 0; i < $CNT; i++)); do
+		VAL=$RANDOM
+		write_file "$VAL" "$LOC/spad$i"
+		RVAL=$(read_file "$REM/../spad$i")
+
+		# values agree: go on with the next scratchpad
+		[[ "$VAL" -ne "$RVAL" ]] || continue
+
+		echo "Scratchpad $i value $RVAL doesn't match $VAL" >&2
+		exit -1
+	done
+
+	echo " Passed"
+}
+
+# For every msg file under $1: write a random value to $1/msg<i> and check
+# that the text before "<-" in $2/../msg<i> is the same value.
+# Prints " Unsupported" when $1 has no msg files; exits the script on a
+# mismatch.
+function message_test()
+{
+	LOC=$1
+	REM=$2
+
+	echo "Running msg tests on: $(subdirname $LOC) / $(subdirname $REM)"
+
+	CNT=$(get_files_count "msg" "$LOC")
+
+	if [[ $CNT -eq 0 ]]; then
+		echo " Unsupported"
+		return
+	fi
+
+	MSG_OUTBITS_MASK=$(read_file "$LOC/../msg_inbits")
+	MSG_INBITS_MASK=$(read_file "$REM/../msg_inbits")
+
+	write_file "c $MSG_OUTBITS_MASK" "$LOC/../msg_sts"
+	write_file "c $MSG_INBITS_MASK" "$REM/../msg_sts"
+
+	for ((i = 0; i < $CNT; i++)); do
+		VAL=$RANDOM
+		write_file "$VAL" "$LOC/msg$i"
+		RVAL=$(read_file "$REM/../msg$i")
+
+		# only the part in front of "<-" is compared
+		[[ "$VAL" -ne "${RVAL%%<-*}" ]] || continue
+
+		echo "Message $i value $RVAL doesn't match $VAL" >&2
+		exit -1
+	done
+
+	echo " Passed"
+}
+
+# Filter stdin for a line that starts with the key $1, followed by
+# optional blanks and a 0x-prefixed hex number (optionally followed by
+# "[p]"), and print just the hex number.
+function get_number()
+{
+ KEY=$1
+
+ sed -n "s/^\(${KEY}\)[ \t]*\(0x[0-9a-fA-F]*\)\(\[p\]\)\?$/\2/p"
+}
+
+# Set up memory window $1 between $2 (LOC) and $3 (REM).
+# Writes MW_SIZE to $LOC/mw_trans<idx>, reads the file back to get the
+# "Window Size" and "DMA Address" values, and writes "<addr>:<size>" to
+# $REM/peer_mw_trans<idx>.  The results stay in the globals
+# MW_ALIGNED_SIZE and MW_DMA_ADDR.
+function mw_alloc()
+{
+ IDX=$1
+ LOC=$2
+ REM=$3
+
+ write_file $MW_SIZE "$LOC/mw_trans$IDX"
+
+ INB_MW=$(read_file "$LOC/mw_trans$IDX")
+ MW_ALIGNED_SIZE=$(echo "$INB_MW" | get_number "Window Size")
+ MW_DMA_ADDR=$(echo "$INB_MW" | get_number "DMA Address")
+
+ # $((...)) turns the hex size printed by get_number into decimal
+ write_file "$MW_DMA_ADDR:$(($MW_ALIGNED_SIZE))" "$REM/peer_mw_trans$IDX"
+
+ if [[ $MW_SIZE -ne $MW_ALIGNED_SIZE ]]; then
+ echo "MW $IDX size aligned to $MW_ALIGNED_SIZE"
+ fi
+}
+
+# Fill a memory window file with random data.
+# The path (local or "host:/path") is taken from $2 when given, otherwise
+# from $1.  The only caller in this script passes a single argument, which
+# the old "\$2"-only lookup turned into an empty path, so nothing was
+# ever written.
+# dd errors are hidden and ignored on purpose (see "|| true").
+function write_mw()
+{
+	split_remote "${2:-$1}"
+
+	if [[ "$REMOTE" != "" ]]; then
+		ssh "$REMOTE" \
+			dd if=/dev/urandom "of=$VPATH" 2> /dev/null || true
+	else
+		dd if=/dev/urandom "of=$VPATH" 2> /dev/null || true
+	fi
+}
+
+function mw_check()
+{
+ IDX=$1
+ LOC=$2
+ REM=$3
+
+ write_mw "$LOC/mw$IDX"
+
+ split_remote "$LOC/mw$IDX"
+ if [[ "$REMOTE" == "" ]]; then
+ A=$VPATH
+ else
+ A=/tmp/ntb_test.$$.A
+ ssh "$REMOTE" cat "$VPATH" > "$A"
+ fi
+
+ split_remote "$REM/peer_mw$IDX"
+ if [[ "$REMOTE" == "" ]]; then
+ B=$VPATH
+ else
+ B=/tmp/ntb_test.$$.B
+ ssh "$REMOTE" cat "$VPATH" > "$B"
+ fi
+
+ cmp -n $MW_ALIGNED_SIZE "$A" "$B"
+ if [[ $? != 0 ]]; then
+ echo "Memory window $MW did not match!" >&2
+ fi
+
+ if [[ "$A" == "/tmp/*" ]]; then
+ rm "$A"
+ fi
+
+ if [[ "$B" == "/tmp/*" ]]; then
+ rm "$B"
+ fi
+}
+
+# Tear down memory window $1: write "<MW_DMA_ADDR>:0" to
+# $3/peer_mw_trans<idx> and 0 to $2/mw_trans<idx>.
+# MW_DMA_ADDR is the global left behind by mw_alloc.
+function mw_free()
+{
+ IDX=$1
+ LOC=$2
+ REM=$3
+
+ write_file "$MW_DMA_ADDR:0" "$REM/peer_mw_trans$IDX"
+
+ write_file 0 "$LOC/mw_trans$IDX"
+}
+
+# Run mw_alloc, mw_check and mw_free for every mw_trans file found
+# under $1.
+# $1: local peer directory, $2: remote peer directory.
+function mw_test()
+{
+	LOC=$1
+	REM=$2
+
+	CNT=$(get_files_count "mw_trans" "$LOC")
+
+	i=0
+	while ((i < $CNT)); do
+		echo "Running mw$i tests on: $(subdirname $LOC) / " \
+			"$(subdirname $REM)"
+
+		mw_alloc $i $LOC $REM
+
+		mw_check $i $LOC $REM
+
+		mw_free $i $LOC $REM
+
+		echo " Passed"
+
+		i=$((i + 1))
+	done
+
+}
+
+# Read the "count" file on both sides, wait 7 seconds and read it again.
+# Exits the script with status 1 if either counter did not change.
+function pingpong_test()
+{
+	LOC=$1
+	REM=$2
+
+	echo "Running ping pong tests on: $(basename $LOC) / $(basename $REM)"
+
+	LOC_START=$(read_file "$LOC/count")
+	REM_START=$(read_file "$REM/count")
+
+	sleep 7
+
+	LOC_END=$(read_file "$LOC/count")
+	REM_END=$(read_file "$REM/count")
+
+	# both counters must have moved
+	if [[ $LOC_START != $LOC_END && $REM_START != $REM_END ]]; then
+		echo " Passed"
+		return
+	fi
+
+	echo "Ping pong counter not incrementing!" >&2
+	exit 1
+}
+
+# Write 1 to $1/ready, then for every index below the number in $1/count:
+# write the index to $1/trigger and check that $2/../irq<i>_occurrences
+# grew by exactly one.  Exits the script with status 1 otherwise.
+function msi_test()
+{
+	LOC=$1
+	REM=$2
+
+	write_file 1 $LOC/ready
+
+	echo "Running MSI interrupt tests on: $(subdirname $LOC) / $(subdirname $REM)"
+
+	CNT=$(read_file "$LOC/count")
+	for ((i = 0; i < $CNT; i++)); do
+		START=$(read_file $REM/../irq${i}_occurrences)
+		write_file $i $LOC/trigger
+		END=$(read_file $REM/../irq${i}_occurrences)
+
+		[[ $(($END - $START)) != 1 ]] || continue
+
+		echo "MSI did not trigger the interrupt on the remote side!" >&2
+		exit 1
+	done
+
+	echo " Passed"
+}
+
+function perf_test()
+{
+ USE_DMA=$1
+
+ if [[ $USE_DMA == "1" ]]; then
+ WITH="with"
+ else
+ WITH="without"
+ fi
+
+ _modprobe ntb_perf total_order=$PERF_RUN_ORDER \
+ max_mw_size=$MAX_MW_SIZE use_dma=$USE_DMA
+
+ echo "Running local perf test $WITH DMA"
+ write_file "$LOCAL_PIDX" "$LOCAL_PERF/run"
+ echo -n " "
+ read_file "$LOCAL_PERF/run"
+ echo " Passed"
+
+ echo "Running remote perf test $WITH DMA"
+ write_file "$REMOTE_PIDX" "$REMOTE_PERF/run"
+ echo -n " "
+ read_file "$REMOTE_PERF/run"
+ echo " Passed"
+
+ _modprobe -r ntb_perf
+}
+
+# Load ntb_tool and run the port, link, doorbell, scratchpad, message and
+# memory window tests in both directions, then unload the module.
+function ntb_tool_tests()
+{
+ LOCAL_TOOL="$DEBUGFS/ntb_tool/$LOCAL_DEV"
+ # with an empty REMOTE_HOST this starts with ":/", which split_remote
+ # turns into a plain local path
+ REMOTE_TOOL="$REMOTE_HOST:$DEBUGFS/ntb_tool/$REMOTE_DEV"
+
+ echo "Starting ntb_tool tests..."
+
+ _modprobe ntb_tool
+
+ # port_test also sets LOCAL_PIDX and REMOTE_PIDX, used just below
+ port_test "$LOCAL_TOOL" "$REMOTE_TOOL"
+
+ LOCAL_PEER_TOOL="$LOCAL_TOOL/peer$LOCAL_PIDX"
+ REMOTE_PEER_TOOL="$REMOTE_TOOL/peer$REMOTE_PIDX"
+
+ link_test "$LOCAL_PEER_TOOL" "$REMOTE_PEER_TOOL"
+ link_test "$REMOTE_PEER_TOOL" "$LOCAL_PEER_TOOL"
+
+ #Ensure the link is up on both sides before continuing
+ write_file "Y" "$LOCAL_PEER_TOOL/link_event"
+ write_file "Y" "$REMOTE_PEER_TOOL/link_event"
+
+ doorbell_test "$LOCAL_TOOL" "$REMOTE_TOOL"
+ doorbell_test "$REMOTE_TOOL" "$LOCAL_TOOL"
+
+ scratchpad_test "$LOCAL_PEER_TOOL" "$REMOTE_PEER_TOOL"
+ scratchpad_test "$REMOTE_PEER_TOOL" "$LOCAL_PEER_TOOL"
+
+ message_test "$LOCAL_PEER_TOOL" "$REMOTE_PEER_TOOL"
+ message_test "$REMOTE_PEER_TOOL" "$LOCAL_PEER_TOOL"
+
+ mw_test "$LOCAL_PEER_TOOL" "$REMOTE_PEER_TOOL"
+ mw_test "$REMOTE_PEER_TOOL" "$LOCAL_PEER_TOOL"
+
+ _modprobe -r ntb_tool
+}
+
+# Load ntb_pingpong, run pingpong_test on the two device directories and
+# unload the module again.
+function ntb_pingpong_tests()
+{
+ LOCAL_PP="$DEBUGFS/ntb_pingpong/$LOCAL_DEV"
+ REMOTE_PP="$REMOTE_HOST:$DEBUGFS/ntb_pingpong/$REMOTE_DEV"
+
+ echo "Starting ntb_pingpong tests..."
+
+ _modprobe ntb_pingpong
+
+ pingpong_test $LOCAL_PP $REMOTE_PP
+
+ _modprobe -r ntb_pingpong
+}
+
+# Load ntb_msi_test and run msi_test in both directions, then unload the
+# module.  If the module cannot be loaded the tests are skipped with a
+# message instead of failing.
+function ntb_msi_tests()
+{
+ LOCAL_MSI="$DEBUGFS/ntb_msi_test/$LOCAL_DEV"
+ REMOTE_MSI="$REMOTE_HOST:$DEBUGFS/ntb_msi_test/$REMOTE_DEV"
+
+ echo "Starting ntb_msi_test tests..."
+
+ if ! _modprobe ntb_msi_test 2> /dev/null; then
+ echo " Not doing MSI tests seeing the module is not available."
+ return
+ fi
+
+ # port_test sets LOCAL_PIDX and REMOTE_PIDX, used just below
+ port_test $LOCAL_MSI $REMOTE_MSI
+
+ LOCAL_PEER="$LOCAL_MSI/peer$LOCAL_PIDX"
+ REMOTE_PEER="$REMOTE_MSI/peer$REMOTE_PIDX"
+
+ msi_test $LOCAL_PEER $REMOTE_PEER
+ msi_test $REMOTE_PEER $LOCAL_PEER
+
+ _modprobe -r ntb_msi_test
+}
+
+# Run perf_test without DMA and, when RUN_DMA_TESTS is non-empty, once
+# more with DMA.
+function ntb_perf_tests()
+{
+	LOCAL_PERF="$DEBUGFS/ntb_perf/$LOCAL_DEV"
+	REMOTE_PERF="$REMOTE_HOST:$DEBUGFS/ntb_perf/$REMOTE_DEV"
+
+	echo "Starting ntb_perf tests..."
+
+	perf_test 0
+
+	if [[ -n $RUN_DMA_TESTS ]]; then
+		perf_test 1
+	fi
+}
+
+# Unload every NTB test module, ignoring and hiding failures; errexit is
+# switched off for the duration and back on afterwards.
+function cleanup()
+{
+	local mod
+
+	set +e
+	for mod in ntb_tool ntb_perf ntb_pingpong ntb_transport ntb_msi_test; do
+		_modprobe -r $mod 2> /dev/null
+	done
+	set -e
+}
+
+cleanup
+
+if ! [[ $$DONT_CLEANUP ]]; then
+ trap cleanup EXIT
+fi
+
+if [ "$(id -u)" != "0" ]; then
+ echo "This script must be run as root" 1>&2
+ exit 1
+fi
+
+if [[ "$LIST_DEVS" == TRUE ]]; then
+ echo "Local Devices:"
+ ls -1 /sys/bus/ntb/devices
+ echo
+
+ if [[ "$REMOTE_HOST" != "" ]]; then
+ echo "Remote Devices:"
+ ssh $REMOTE_HOST ls -1 /sys/bus/ntb/devices
+ fi
+
+ exit 0
+fi
+
+if [[ "$LOCAL_DEV" == $"" ]] || [[ "$REMOTE_DEV" == $"" ]]; then
+ show_help
+ exit 1
+fi
+
+ntb_tool_tests
+echo
+ntb_pingpong_tests
+echo
+ntb_msi_tests
+echo
+ntb_perf_tests
+echo
diff --git a/tools/testing/selftests/openat2/.gitignore b/tools/testing/selftests/openat2/.gitignore
new file mode 100644
index 000000000..82a4846cb
--- /dev/null
+++ b/tools/testing/selftests/openat2/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/*_test
diff --git a/tools/testing/selftests/openat2/Makefile b/tools/testing/selftests/openat2/Makefile
new file mode 100644
index 000000000..843ba56d8
--- /dev/null
+++ b/tools/testing/selftests/openat2/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g -fsanitize=address -fsanitize=undefined
+TEST_GEN_PROGS := openat2_test resolve_test rename_attack_test
+
+include ../lib.mk
+
+$(TEST_GEN_PROGS): helpers.c helpers.h
diff --git a/tools/testing/selftests/openat2/helpers.c b/tools/testing/selftests/openat2/helpers.c
new file mode 100644
index 000000000..5074681ff
--- /dev/null
+++ b/tools/testing/selftests/openat2/helpers.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <string.h>
+#include <syscall.h>
+#include <limits.h>
+
+#include "helpers.h"
+
+/* Return true when @how has any resolve flag set. */
+bool needs_openat2(const struct open_how *how)
+{
+	if (how->resolve)
+		return true;
+	return false;
+}
+
+/*
+ * Invoke the openat2 syscall directly with an arbitrary @how buffer of
+ * @size bytes.  Returns the new file descriptor, or -errno on failure.
+ */
+int raw_openat2(int dfd, const char *path, void *how, size_t size)
+{
+	int fd = syscall(__NR_openat2, dfd, path, how, size);
+
+	if (fd < 0)
+		return -errno;
+	return fd;
+}
+
+/*
+ * openat2 with a struct open_how argument: passes sizeof(struct open_how)
+ * as the size.  Returns the new file descriptor, or -errno on failure.
+ */
+int sys_openat2(int dfd, const char *path, struct open_how *how)
+{
+	size_t how_size = sizeof(struct open_how);
+
+	return raw_openat2(dfd, path, how, how_size);
+}
+
+/*
+ * Call openat() with the flags and mode taken from @how.
+ * Returns the new file descriptor, or -errno on failure.
+ */
+int sys_openat(int dfd, const char *path, struct open_how *how)
+{
+	int fd = openat(dfd, path, how->flags, how->mode);
+
+	if (fd < 0)
+		return -errno;
+	return fd;
+}
+
+/*
+ * Invoke the renameat2 syscall directly.
+ * Returns the syscall's result, or -errno on failure.
+ */
+int sys_renameat2(int olddirfd, const char *oldpath,
+		  int newdirfd, const char *newpath, unsigned int flags)
+{
+	int rc = syscall(__NR_renameat2, olddirfd, oldpath,
+			 newdirfd, newpath, flags);
+
+	if (rc < 0)
+		return -errno;
+	return rc;
+}
+
+/*
+ * Open @path relative to @dfd with O_CREAT (mode 0700) and close it again
+ * right away.  Returns the (already closed) descriptor number on success,
+ * or the negative openat() result on failure.
+ */
+int touchat(int dfd, const char *path)
+{
+	int fd = openat(dfd, path, O_CREAT, 0700);
+
+	if (fd < 0)
+		return fd;
+
+	close(fd);
+	return fd;
+}
+
+/*
+ * Return the target of /proc/self/fd/<fd> as a newly allocated,
+ * NUL-terminated string.  The caller frees the result.
+ * Any failure ends the test through ksft_exit_fail_msg().
+ */
+char *fdreadlink(int fd)
+{
+	char *target, *tmp;
+
+	E_asprintf(&tmp, "/proc/self/fd/%d", fd);
+
+	target = malloc(PATH_MAX);
+	if (!target)
+		ksft_exit_fail_msg("fdreadlink: malloc failed\n");
+	memset(target, 0, PATH_MAX);
+
+	/*
+	 * readlink() does not append a terminating NUL.  Hand it one byte
+	 * less than the zeroed buffer so the result is always terminated,
+	 * even when the link target fills the space it is given.
+	 */
+	E_readlink(tmp, target, PATH_MAX - 1);
+	free(tmp);
+	return target;
+}
+
+/*
+ * Compare the path @fd refers to (per fdreadlink()) with an expected
+ * path built from @dfd and @path:
+ *   - @path is NULL:          the path of @dfd itself
+ *   - @path starts with '/':  @path as given
+ *   - otherwise:              "<path of dfd>/<path>"
+ * Returns true when the two strings are identical.
+ */
+bool fdequal(int fd, int dfd, const char *path)
+{
+	char *actual, *base, *expected;
+	bool same;
+
+	actual = fdreadlink(fd);
+	base = fdreadlink(dfd);
+
+	if (path && *path == '/')
+		E_asprintf(&expected, "%s", path);
+	else if (path)
+		E_asprintf(&expected, "%s/%s", base, path);
+	else
+		E_asprintf(&expected, "%s", base);
+
+	same = (strcmp(actual, expected) == 0);
+
+	free(actual);
+	free(base);
+	free(expected);
+	return same;
+}
+
+bool openat2_supported = false;
+
+/*
+ * Constructor: probe for openat2(2) by opening "." with an all-zero
+ * struct open_how and record the outcome in openat2_supported.
+ */
+void __attribute__((constructor)) init(void)
+{
+	struct open_how how = {};
+	int probe_fd;
+
+	BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_VER0);
+
+	/* Check openat2(2) support. */
+	probe_fd = sys_openat2(AT_FDCWD, ".", &how);
+	if (probe_fd < 0) {
+		openat2_supported = false;
+		return;
+	}
+
+	openat2_supported = true;
+	close(probe_fd);
+}
diff --git a/tools/testing/selftests/openat2/helpers.h b/tools/testing/selftests/openat2/helpers.h
new file mode 100644
index 000000000..7056340b9
--- /dev/null
+++ b/tools/testing/selftests/openat2/helpers.h
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#ifndef __RESOLVEAT_H__
+#define __RESOLVEAT_H__
+
+#define _GNU_SOURCE
+#include <stdint.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <linux/types.h>
+#include "../kselftest.h"
+
+/* Number of elements in the array X (X must be an array, not a pointer). */
+#define ARRAY_LEN(X) (sizeof (X) / sizeof (*(X)))
+/*
+ * Compile-time assertion: when e is non-zero the anonymous bit-field gets
+ * a negative width, which the compiler rejects.
+ */
+#define BUILD_BUG_ON(e) ((void)(sizeof(struct { int:(-!!(e)); })))
+
+#ifndef SYS_openat2
+#ifndef __NR_openat2
+#define __NR_openat2 437
+#endif /* __NR_openat2 */
+#define SYS_openat2 __NR_openat2
+#endif /* SYS_openat2 */
+
+/*
+ * Arguments for how openat2(2) should open the target path. If @resolve is
+ * zero, then openat2(2) operates very similarly to openat(2).
+ *
+ * However, unlike openat(2), unknown bits in @flags result in -EINVAL rather
+ * than being silently ignored. @mode must be zero unless one of {O_CREAT,
+ * O_TMPFILE} are set.
+ *
+ * @flags: O_* flags.
+ * @mode: O_CREAT/O_TMPFILE file mode.
+ * @resolve: RESOLVE_* flags.
+ */
+struct open_how {
+ __u64 flags;
+ __u64 mode;
+ __u64 resolve;
+};
+
+#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */
+#define OPEN_HOW_SIZE_LATEST OPEN_HOW_SIZE_VER0
+
+bool needs_openat2(const struct open_how *how);
+
+/* Fallback definitions, used only when no header has defined them yet. */
+#ifndef RESOLVE_IN_ROOT
+/* how->resolve flags for openat2(2). */
+#define RESOLVE_NO_XDEV 0x01 /* Block mount-point crossings
+ (includes bind-mounts). */
+#define RESOLVE_NO_MAGICLINKS 0x02 /* Block traversal through procfs-style
+ "magic-links". */
+#define RESOLVE_NO_SYMLINKS 0x04 /* Block traversal through all symlinks
+ (implies RESOLVE_NO_MAGICLINKS) */
+#define RESOLVE_BENEATH 0x08 /* Block "lexical" trickery like
+ "..", symlinks, and absolute
+ paths which escape the dirfd. */
+#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
+ be scoped inside the dirfd
+ (similar to chroot(2)). */
+#endif /* RESOLVE_IN_ROOT */
+
+/*
+ * Call func(...) and end the test through ksft_exit_fail_msg() if it
+ * returns a negative value.  The message names the file, line, function
+ * and errno.  errno is reset to 0 before the call.
+ */
+#define E_func(func, ...) \
+ do { \
+ errno = 0; \
+ if (func(__VA_ARGS__) < 0) \
+ ksft_exit_fail_msg("%s:%d %s failed - errno:%d\n", \
+ __FILE__, __LINE__, #func, errno); \
+ } while (0)
+
+#define E_asprintf(...) E_func(asprintf, __VA_ARGS__)
+#define E_chmod(...) E_func(chmod, __VA_ARGS__)
+#define E_dup2(...) E_func(dup2, __VA_ARGS__)
+#define E_fchdir(...) E_func(fchdir, __VA_ARGS__)
+#define E_fstatat(...) E_func(fstatat, __VA_ARGS__)
+#define E_kill(...) E_func(kill, __VA_ARGS__)
+#define E_mkdirat(...) E_func(mkdirat, __VA_ARGS__)
+#define E_mount(...) E_func(mount, __VA_ARGS__)
+#define E_prctl(...) E_func(prctl, __VA_ARGS__)
+#define E_readlink(...) E_func(readlink, __VA_ARGS__)
+#define E_setresuid(...) E_func(setresuid, __VA_ARGS__)
+#define E_symlinkat(...) E_func(symlinkat, __VA_ARGS__)
+#define E_touchat(...) E_func(touchat, __VA_ARGS__)
+#define E_unshare(...) E_func(unshare, __VA_ARGS__)
+
+/*
+ * End the test through ksft_exit_fail_msg() when expr is false.  The
+ * message contains the file, line and expression text followed by the
+ * printf-style msg and its arguments.
+ */
+#define E_assert(expr, msg, ...) \
+ do { \
+ if (!(expr)) \
+ ksft_exit_fail_msg("ASSERT(%s:%d) failed (%s): " msg "\n", \
+ __FILE__, __LINE__, #expr, ##__VA_ARGS__); \
+ } while (0)
+
+int raw_openat2(int dfd, const char *path, void *how, size_t size);
+int sys_openat2(int dfd, const char *path, struct open_how *how);
+int sys_openat(int dfd, const char *path, struct open_how *how);
+int sys_renameat2(int olddirfd, const char *oldpath,
+ int newdirfd, const char *newpath, unsigned int flags);
+
+int touchat(int dfd, const char *path);
+char *fdreadlink(int fd);
+bool fdequal(int fd, int dfd, const char *path);
+
+extern bool openat2_supported;
+
+#endif /* __RESOLVEAT_H__ */
diff --git a/tools/testing/selftests/openat2/openat2_test.c b/tools/testing/selftests/openat2/openat2_test.c
new file mode 100644
index 000000000..453152b58
--- /dev/null
+++ b/tools/testing/selftests/openat2/openat2_test.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+/*
+ * O_LARGEFILE is set to 0 by glibc.
+ * XXX: This is wrong on {mips, parisc, powerpc, sparc}.
+ */
+#undef O_LARGEFILE
+#define O_LARGEFILE 0x8000
+
+/*
+ * A struct open_how followed by extra fields, used to pass openat2 a
+ * structure larger than struct open_how. The tests set extra1, extra2 or
+ * extra3 to a non-zero value to place data at the start, in the middle or at
+ * the end of the trailing area; the pad arrays separate those positions.
+ */
+struct open_how_ext {
+ struct open_how inner;
+ uint32_t extra1;
+ char pad1[128];
+ uint32_t extra2;
+ char pad2[128];
+ uint32_t extra3;
+};
+
+/* One test_openat2_struct() table entry. */
+struct struct_test {
+ const char *name; /* description used in the result line */
+ struct open_how_ext arg; /* structure contents handed to openat2 */
+ size_t size; /* size argument passed to raw_openat2() */
+ int err; /* expected negative errno; >= 0 means success is expected */
+};
+
+#define NUM_OPENAT2_STRUCT_TESTS 7
+#define NUM_OPENAT2_STRUCT_VARIATIONS 13
+
+/*
+ * Check how openat2(2) validates the size of its struct open_how argument and
+ * the bytes that follow the part described by struct open_how. Every table
+ * entry is tried once per entry of misalignments[], with the structure copied
+ * into a heap buffer at that byte offset. Each combination reports one
+ * kselftest result; all of them are reported as skipped when
+ * openat2_supported is false.
+ */
+void test_openat2_struct(void)
+{
+ int misalignments[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 17, 87 };
+
+ struct struct_test tests[] = {
+ /* Normal struct. */
+ { .name = "normal struct",
+ .arg.inner.flags = O_RDONLY,
+ .size = sizeof(struct open_how) },
+ /* Bigger struct, with zeroed out end. */
+ { .name = "bigger struct (zeroed out)",
+ .arg.inner.flags = O_RDONLY,
+ .size = sizeof(struct open_how_ext) },
+
+ /* TODO: Once expanded, check zero-padding. */
+
+ /* Smaller than version-0 struct. */
+ { .name = "zero-sized 'struct'",
+ .arg.inner.flags = O_RDONLY, .size = 0, .err = -EINVAL },
+ { .name = "smaller-than-v0 struct",
+ .arg.inner.flags = O_RDONLY,
+ .size = OPEN_HOW_SIZE_VER0 - 1, .err = -EINVAL },
+
+ /* Bigger struct, with non-zero trailing bytes. */
+ { .name = "bigger struct (non-zero data in first 'future field')",
+ .arg.inner.flags = O_RDONLY, .arg.extra1 = 0xdeadbeef,
+ .size = sizeof(struct open_how_ext), .err = -E2BIG },
+ { .name = "bigger struct (non-zero data in middle of 'future fields')",
+ .arg.inner.flags = O_RDONLY, .arg.extra2 = 0xfeedcafe,
+ .size = sizeof(struct open_how_ext), .err = -E2BIG },
+ { .name = "bigger struct (non-zero data at end of 'future fields')",
+ .arg.inner.flags = O_RDONLY, .arg.extra3 = 0xabad1dea,
+ .size = sizeof(struct open_how_ext), .err = -E2BIG },
+ };
+
+ /* Keep the counts used for the test plan in sync with the tables. */
+ BUILD_BUG_ON(ARRAY_LEN(misalignments) != NUM_OPENAT2_STRUCT_VARIATIONS);
+ BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_STRUCT_TESTS);
+
+ for (int i = 0; i < ARRAY_LEN(tests); i++) {
+ struct struct_test *test = &tests[i];
+ struct open_how_ext how_ext = test->arg;
+
+ for (int j = 0; j < ARRAY_LEN(misalignments); j++) {
+ int fd, misalign = misalignments[j];
+ char *fdpath = NULL;
+ bool failed;
+ void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+
+ void *copy = NULL, *how_copy = &how_ext;
+
+ if (!openat2_supported) {
+ ksft_print_msg("openat2(2) unsupported\n");
+ resultfn = ksft_test_result_skip;
+ goto skip;
+ }
+
+ if (misalign) {
+ /*
+ * Explicitly misalign the structure by copying it to the given
+ * (mis)alignment offset. The bytes in front of it are set to be
+ * non-zero to make sure that non-zero bytes outside the struct
+ * aren't checked.
+ *
+ * This is effectively to check that is_zeroed_user() works.
+ */
+ copy = malloc(misalign + sizeof(how_ext));
+ /* Do not hand a NULL buffer to memset()/memcpy() below. */
+ E_assert(copy, "failed to allocate misaligned copy");
+ how_copy = (char *) copy + misalign;
+ memset(copy, 0xff, misalign);
+ memcpy(how_copy, &how_ext, sizeof(how_ext));
+ }
+
+ fd = raw_openat2(AT_FDCWD, ".", how_copy, test->size);
+ /* err >= 0 means "must succeed"; otherwise the exact error must match. */
+ if (test->err >= 0)
+ failed = (fd < 0);
+ else
+ failed = (fd != test->err);
+ if (fd >= 0) {
+ fdpath = fdreadlink(fd);
+ close(fd);
+ }
+
+ if (failed) {
+ resultfn = ksft_test_result_fail;
+
+ ksft_print_msg("openat2 unexpectedly returned ");
+ if (fdpath)
+ ksft_print_msg("%d['%s']\n", fd, fdpath);
+ else
+ ksft_print_msg("%d (%s)\n", fd, strerror(-fd));
+ }
+
+skip:
+ if (test->err >= 0)
+ resultfn("openat2 with %s argument [misalign=%d] succeeds\n",
+ test->name, misalign);
+ else
+ resultfn("openat2 with %s argument [misalign=%d] fails with %d (%s)\n",
+ test->name, misalign, test->err,
+ strerror(-test->err));
+
+ free(copy);
+ free(fdpath);
+ fflush(stdout);
+ }
+ }
+}
+
+/* One test_openat2_flags() table entry. */
+struct flag_test {
+ const char *name; /* description used in the result line */
+ struct open_how how; /* argument passed to sys_openat2() */
+ int err; /* expected negative errno; >= 0 means success is expected */
+};
+
+#define NUM_OPENAT2_FLAG_TESTS 23
+
+/*
+ * Check which combinations of open_how flags, mode and resolve values
+ * openat2(2) accepts. For calls that succeed, the flags read back from the
+ * new descriptor through fcntl() must also match the requested ones. Each
+ * table entry reports one kselftest result; entries are reported as skipped
+ * when openat2_supported is false or when the call fails with -EOPNOTSUPP.
+ */
+void test_openat2_flags(void)
+{
+ struct flag_test tests[] = {
+ /* O_TMPFILE is incompatible with O_PATH and O_CREAT. */
+ { .name = "incompatible flags (O_TMPFILE | O_PATH)",
+ .how.flags = O_TMPFILE | O_PATH | O_RDWR, .err = -EINVAL },
+ { .name = "incompatible flags (O_TMPFILE | O_CREAT)",
+ .how.flags = O_TMPFILE | O_CREAT | O_RDWR, .err = -EINVAL },
+
+ /* O_PATH only permits certain other flags to be set ... */
+ { .name = "compatible flags (O_PATH | O_CLOEXEC)",
+ .how.flags = O_PATH | O_CLOEXEC },
+ { .name = "compatible flags (O_PATH | O_DIRECTORY)",
+ .how.flags = O_PATH | O_DIRECTORY },
+ { .name = "compatible flags (O_PATH | O_NOFOLLOW)",
+ .how.flags = O_PATH | O_NOFOLLOW },
+ /* ... and others are absolutely not permitted. */
+ { .name = "incompatible flags (O_PATH | O_RDWR)",
+ .how.flags = O_PATH | O_RDWR, .err = -EINVAL },
+ { .name = "incompatible flags (O_PATH | O_CREAT)",
+ .how.flags = O_PATH | O_CREAT, .err = -EINVAL },
+ { .name = "incompatible flags (O_PATH | O_EXCL)",
+ .how.flags = O_PATH | O_EXCL, .err = -EINVAL },
+ { .name = "incompatible flags (O_PATH | O_NOCTTY)",
+ .how.flags = O_PATH | O_NOCTTY, .err = -EINVAL },
+ { .name = "incompatible flags (O_PATH | O_DIRECT)",
+ .how.flags = O_PATH | O_DIRECT, .err = -EINVAL },
+ { .name = "incompatible flags (O_PATH | O_LARGEFILE)",
+ .how.flags = O_PATH | O_LARGEFILE, .err = -EINVAL },
+
+ /* ->mode must only be set with O_{CREAT,TMPFILE}. */
+ { .name = "non-zero how.mode and O_RDONLY",
+ .how.flags = O_RDONLY, .how.mode = 0600, .err = -EINVAL },
+ { .name = "non-zero how.mode and O_PATH",
+ .how.flags = O_PATH, .how.mode = 0600, .err = -EINVAL },
+ { .name = "valid how.mode and O_CREAT",
+ .how.flags = O_CREAT, .how.mode = 0600 },
+ { .name = "valid how.mode and O_TMPFILE",
+ .how.flags = O_TMPFILE | O_RDWR, .how.mode = 0600 },
+ /* ->mode must only contain 0777 bits. */
+ { .name = "invalid how.mode and O_CREAT",
+ .how.flags = O_CREAT,
+ .how.mode = 0xFFFF, .err = -EINVAL },
+ { .name = "invalid (very large) how.mode and O_CREAT",
+ .how.flags = O_CREAT,
+ .how.mode = 0xC000000000000000ULL, .err = -EINVAL },
+ { .name = "invalid how.mode and O_TMPFILE",
+ .how.flags = O_TMPFILE | O_RDWR,
+ .how.mode = 0x1337, .err = -EINVAL },
+ { .name = "invalid (very large) how.mode and O_TMPFILE",
+ .how.flags = O_TMPFILE | O_RDWR,
+ .how.mode = 0x0000A00000000000ULL, .err = -EINVAL },
+
+ /* ->resolve must only contain RESOLVE_* flags. */
+ { .name = "invalid how.resolve and O_RDONLY",
+ .how.flags = O_RDONLY,
+ .how.resolve = 0x1337, .err = -EINVAL },
+ { .name = "invalid how.resolve and O_CREAT",
+ .how.flags = O_CREAT,
+ .how.resolve = 0x1337, .err = -EINVAL },
+ { .name = "invalid how.resolve and O_TMPFILE",
+ .how.flags = O_TMPFILE | O_RDWR,
+ .how.resolve = 0x1337, .err = -EINVAL },
+ { .name = "invalid how.resolve and O_PATH",
+ .how.flags = O_PATH,
+ .how.resolve = 0x1337, .err = -EINVAL },
+ };
+
+ /* Keep the count used for the test plan in sync with the table. */
+ BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_FLAG_TESTS);
+
+ for (int i = 0; i < ARRAY_LEN(tests); i++) {
+ int fd, fdflags = -1;
+ char *path, *fdpath = NULL;
+ bool failed = false;
+ struct flag_test *test = &tests[i];
+ void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+
+ if (!openat2_supported) {
+ ksft_print_msg("openat2(2) unsupported\n");
+ resultfn = ksft_test_result_skip;
+ goto skip;
+ }
+
+ /* O_CREAT entries need a creatable name; remove any leftover first. */
+ path = (test->how.flags & O_CREAT) ? "/tmp/ksft.openat2_tmpfile" : ".";
+ unlink(path);
+
+ fd = sys_openat2(AT_FDCWD, path, &test->how);
+ if (fd == -EOPNOTSUPP) {
+ /*
+ * Skip the testcase if it failed because not supported
+ * by FS. (e.g. a valid O_TMPFILE combination on NFS)
+ */
+ ksft_test_result_skip("openat2 with %s fails with %d (%s)\n",
+ test->name, fd, strerror(-fd));
+ goto next;
+ }
+
+ /* err >= 0 means "must succeed"; otherwise the exact error must match. */
+ if (test->err >= 0)
+ failed = (fd < 0);
+ else
+ failed = (fd != test->err);
+ if (fd >= 0) {
+ int otherflags;
+
+ fdpath = fdreadlink(fd);
+ fdflags = fcntl(fd, F_GETFL);
+ otherflags = fcntl(fd, F_GETFD);
+ close(fd);
+
+ E_assert(fdflags >= 0, "fcntl F_GETFL of new fd");
+ E_assert(otherflags >= 0, "fcntl F_GETFD of new fd");
+
+ /* O_CLOEXEC isn't shown in F_GETFL. */
+ if (otherflags & FD_CLOEXEC)
+ fdflags |= O_CLOEXEC;
+ /* O_CREAT is hidden from F_GETFL. */
+ if (test->how.flags & O_CREAT)
+ fdflags |= O_CREAT;
+ /* Ignore O_LARGEFILE in the result unless the test asked for it. */
+ if (!(test->how.flags & O_LARGEFILE))
+ fdflags &= ~O_LARGEFILE;
+ failed |= (fdflags != test->how.flags);
+ }
+
+ if (failed) {
+ resultfn = ksft_test_result_fail;
+
+ ksft_print_msg("openat2 unexpectedly returned ");
+ /* how.flags is wider than int: cast it and print it with %llX. */
+ if (fdpath)
+ ksft_print_msg("%d['%s'] with %X (!= %llX)\n",
+ fd, fdpath, fdflags,
+ (unsigned long long) test->how.flags);
+ else
+ ksft_print_msg("%d (%s)\n", fd, strerror(-fd));
+ }
+
+skip:
+ if (test->err >= 0)
+ resultfn("openat2 with %s succeeds\n", test->name);
+ else
+ resultfn("openat2 with %s fails with %d (%s)\n",
+ test->name, test->err, strerror(-test->err));
+next:
+ free(fdpath);
+ fflush(stdout);
+ }
+}
+
+#define NUM_TESTS (NUM_OPENAT2_STRUCT_VARIATIONS * NUM_OPENAT2_STRUCT_TESTS + \
+ NUM_OPENAT2_FLAG_TESTS)
+
+/* Announce the plan, run the struct and flag tests, then exit with the result. */
+int main(int argc, char **argv)
+{
+ bool any_failed;
+
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ test_openat2_struct();
+ test_openat2_flags();
+
+ any_failed = ksft_get_fail_cnt() + ksft_get_error_cnt() > 0;
+ if (!any_failed)
+ ksft_exit_pass();
+ else
+ ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/openat2/rename_attack_test.c b/tools/testing/selftests/openat2/rename_attack_test.c
new file mode 100644
index 000000000..0a770728b
--- /dev/null
+++ b/tools/testing/selftests/openat2/rename_attack_test.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <syscall.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+/* Construct a test directory with the following structure:
+ *
+ * root/
+ * |-- a/
+ * | `-- c/
+ * `-- b/
+ */
+int setup_testdir(void)
+{
+ /* Created in this order, so "a" exists before "a/c". */
+ const char *subdirs[] = { "a", "b", "a/c" };
+ char tmpl[] = "/tmp/ksft-openat2-rename-attack.XXXXXX";
+ int rootfd;
+
+ /* Create the top-level directory and get a handle on it. */
+ if (mkdtemp(tmpl) == NULL)
+ ksft_exit_fail_msg("setup_testdir: failed to create tmpdir\n");
+ rootfd = open(tmpl, O_PATH | O_DIRECTORY);
+ if (rootfd < 0)
+ ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n");
+
+ for (int i = 0; i < ARRAY_LEN(subdirs); i++)
+ E_mkdirat(rootfd, subdirs[i], 0755);
+
+ return rootfd;
+}
+
+/* Swap @dirfd/@a and @dirfd/@b constantly. Parent must kill this process. */
+/*
+ * Returns the child's pid in the parent and never returns in the child.
+ * Exits the test program if the child cannot be created.
+ */
+pid_t spawn_attack(int dirfd, char *a, char *b)
+{
+ pid_t child = fork();
+
+ /*
+ * Do not hand -1 back to the caller: it later passes the pid to
+ * kill(2), where -1 addresses every process it is allowed to signal.
+ */
+ if (child < 0)
+ ksft_exit_fail_msg("spawn_attack: fork failed - errno:%d\n", errno);
+ if (child != 0)
+ return child;
+
+ /* If the parent (the test process) dies, kill ourselves too. */
+ E_prctl(PR_SET_PDEATHSIG, SIGKILL);
+
+ /* Swap @a and @b. */
+ for (;;)
+ renameat2(dirfd, a, dirfd, b, RENAME_EXCHANGE);
+ exit(1);
+}
+
+#define NUM_RENAME_TESTS 2
+#define ROUNDS 400000
+
+/* Map a RESOLVE_* value used by these tests to its name for log output. */
+const char *flagname(int resolve)
+{
+ if (resolve == RESOLVE_IN_ROOT)
+ return "RESOLVE_IN_ROOT";
+ if (resolve == RESOLVE_BENEATH)
+ return "RESOLVE_BENEATH";
+ return "(unknown)";
+}
+
+/*
+ * Resolve a '..'-heavy path relative to directory "a" ROUNDS times while a
+ * child process keeps exchanging "a/c" and "b", and report one kselftest
+ * result for the given RESOLVE_* flag. A round counts as an escape when the
+ * open fails with -ENOENT or returns a descriptor for which
+ * fdequal(fd, afd, NULL) is false; any escape fails the test.
+ */
+void test_rename_attack(int resolve)
+{
+ int dfd, afd;
+ pid_t child;
+ void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+ int escapes = 0, other_errs = 0, exdevs = 0, eagains = 0, successes = 0;
+
+ struct open_how how = {
+ .flags = O_PATH,
+ .resolve = resolve,
+ };
+
+ if (!openat2_supported) {
+ how.resolve = 0;
+ ksft_print_msg("openat2(2) unsupported -- using openat(2) instead\n");
+ }
+
+ dfd = setup_testdir();
+ afd = openat(dfd, "a", O_PATH);
+ if (afd < 0)
+ ksft_exit_fail_msg("test_rename_attack: failed to open 'a'\n");
+
+ child = spawn_attack(dfd, "a/c", "b");
+
+ for (int i = 0; i < ROUNDS; i++) {
+ int fd;
+ char *victim_path = "c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../../c/../..";
+
+ if (openat2_supported)
+ fd = sys_openat2(afd, victim_path, &how);
+ else
+ fd = sys_openat(afd, victim_path, &how);
+
+ if (fd < 0) {
+ if (fd == -EAGAIN)
+ eagains++;
+ else if (fd == -EXDEV)
+ exdevs++;
+ else if (fd == -ENOENT)
+ escapes++; /* escaped outside and got ENOENT... */
+ else
+ other_errs++; /* unexpected error */
+ } else {
+ if (fdequal(fd, afd, NULL))
+ successes++;
+ else
+ escapes++; /* we got an unexpected fd */
+ /* Only descriptors that were actually opened get closed. */
+ close(fd);
+ }
+ }
+
+ if (escapes > 0)
+ resultfn = ksft_test_result_fail;
+ ksft_print_msg("non-escapes: EAGAIN=%d EXDEV=%d E<other>=%d success=%d\n",
+ eagains, exdevs, other_errs, successes);
+ resultfn("rename attack with %s (%d runs, got %d escapes)\n",
+ flagname(resolve), ROUNDS, escapes);
+
+ /* Should be killed anyway, but might as well make sure. */
+ E_kill(child, SIGKILL);
+
+ /* Release the directory handles so repeated calls do not leak fds. */
+ close(afd);
+ close(dfd);
+}
+
+#define NUM_TESTS NUM_RENAME_TESTS
+
+/* Run the rename attack once per RESOLVE_* flag and exit with the result. */
+int main(int argc, char **argv)
+{
+ bool any_failed;
+
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ test_rename_attack(RESOLVE_BENEATH);
+ test_rename_attack(RESOLVE_IN_ROOT);
+
+ any_failed = ksft_get_fail_cnt() + ksft_get_error_cnt() > 0;
+ if (!any_failed)
+ ksft_exit_pass();
+ else
+ ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/openat2/resolve_test.c b/tools/testing/selftests/openat2/resolve_test.c
new file mode 100644
index 000000000..bbafad440
--- /dev/null
+++ b/tools/testing/selftests/openat2/resolve_test.c
@@ -0,0 +1,523 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2018-2019 SUSE LLC.
+ */
+
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <sched.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "helpers.h"
+
+/*
+ * Construct a test directory with the following structure:
+ *
+ * root/
+ * |-- procexe -> /proc/self/exe
+ * |-- procroot -> /proc/self/root
+ * |-- root/
+ * |-- mnt/ [mountpoint]
+ * | |-- self -> ../mnt/
+ * | `-- absself -> /mnt/
+ * |-- etc/
+ * | `-- passwd
+ * |-- creatlink -> /newfile3
+ * |-- reletc -> etc/
+ * |-- relsym -> etc/passwd
+ * |-- absetc -> /etc/
+ * |-- abssym -> /etc/passwd
+ * |-- abscheeky -> /cheeky
+ * `-- cheeky/
+ * |-- absself -> /
+ * |-- self -> ../../root/
+ * |-- garbageself -> /../../root/
+ * |-- passwd -> ../cheeky/../etc/../etc/passwd
+ * |-- abspasswd -> /../cheeky/../etc/../etc/passwd
+ * |-- dotdotlink -> ../../../../../../../../../../../../../../etc/passwd
+ * `-- garbagelink -> /../../../../../../../../../../../../../../etc/passwd
+ */
+/*
+ * Build the test tree and return an O_PATH descriptor for its "root"
+ * directory. Every E_*() step aborts the test program on failure. As a side
+ * effect the process changes its working directory to the returned directory.
+ */
+int setup_testdir(void)
+{
+ int dfd, tmpfd;
+ char dirname[] = "/tmp/ksft-openat2-testdir.XXXXXX";
+
+ /*
+ * Unshare the mount namespace and mark /tmp MS_PRIVATE before mounting
+ * anything below it (presumably so that the tmpfs mount made further
+ * down stays local to this test -- see mount(2)).
+ */
+ E_unshare(CLONE_NEWNS);
+ E_mount("", "/tmp", "", MS_PRIVATE, "");
+
+ /* Make the top-level directory. */
+ if (!mkdtemp(dirname))
+ ksft_exit_fail_msg("setup_testdir: failed to create tmpdir\n");
+ dfd = open(dirname, O_PATH | O_DIRECTORY);
+ if (dfd < 0)
+ ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n");
+
+ /* A sub-directory which is actually used for tests. */
+ E_mkdirat(dfd, "root", 0755);
+ tmpfd = openat(dfd, "root", O_PATH | O_DIRECTORY);
+ /* NOTE(review): this reports "tmpdir" although the "root" open failed. */
+ if (tmpfd < 0)
+ ksft_exit_fail_msg("setup_testdir: failed to open tmpdir\n");
+ /* From here on dfd refers to the "root" sub-directory. */
+ close(dfd);
+ dfd = tmpfd;
+
+ E_symlinkat("/proc/self/exe", dfd, "procexe");
+ E_symlinkat("/proc/self/root", dfd, "procroot");
+ E_mkdirat(dfd, "root", 0755);
+
+ /* There is no mountat(2), so use chdir. */
+ E_mkdirat(dfd, "mnt", 0755);
+ E_fchdir(dfd);
+ E_mount("tmpfs", "./mnt", "tmpfs", MS_NOSUID | MS_NODEV, "");
+ /* These two links are created inside the freshly mounted tmpfs. */
+ E_symlinkat("../mnt/", dfd, "mnt/self");
+ E_symlinkat("/mnt/", dfd, "mnt/absself");
+
+ E_mkdirat(dfd, "etc", 0755);
+ E_touchat(dfd, "etc/passwd");
+
+ /* Relative and absolute links to entries of the test tree. */
+ E_symlinkat("/newfile3", dfd, "creatlink");
+ E_symlinkat("etc/", dfd, "reletc");
+ E_symlinkat("etc/passwd", dfd, "relsym");
+ E_symlinkat("/etc/", dfd, "absetc");
+ E_symlinkat("/etc/passwd", dfd, "abssym");
+ E_symlinkat("/cheeky", dfd, "abscheeky");
+
+ E_mkdirat(dfd, "cheeky", 0755);
+
+ /* Links whose targets use "/" or ".." to point at or above the tree root. */
+ E_symlinkat("/", dfd, "cheeky/absself");
+ E_symlinkat("../../root/", dfd, "cheeky/self");
+ E_symlinkat("/../../root/", dfd, "cheeky/garbageself");
+
+ E_symlinkat("../cheeky/../etc/../etc/passwd", dfd, "cheeky/passwd");
+ E_symlinkat("/../cheeky/../etc/../etc/passwd", dfd, "cheeky/abspasswd");
+
+ E_symlinkat("../../../../../../../../../../../../../../etc/passwd",
+ dfd, "cheeky/dotdotlink");
+ E_symlinkat("/../../../../../../../../../../../../../../etc/passwd",
+ dfd, "cheeky/garbagelink");
+
+ return dfd;
+}
+
+/* One test_openat2_opath_tests() table entry. */
+struct basic_test {
+ const char *name; /* description used in the result line */
+ const char *dir; /* directory to open as dirfd; NULL uses the test root */
+ const char *path; /* path handed to sys_openat2() */
+ struct open_how how; /* flags/mode/resolve for the call */
+ bool pass; /* true: the open must succeed; selects which 'out' member is used */
+ union {
+ int err; /* pass == false: expected negative errno */
+ const char *path; /* pass == true: expected location, passed to fdequal() */
+ } out;
+};
+
+#define NUM_OPENAT2_OPATH_TESTS 88
+
+/*
+ * Run the RESOLVE_* path-resolution table against the tree built by
+ * setup_testdir(). Each entry opens its path with sys_openat2() relative to
+ * either the test root or test->dir and expects either a specific negative
+ * errno or a descriptor for which fdequal(fd, rootfd, test->out.path) holds.
+ * Each entry reports one kselftest result; all of them are reported as
+ * skipped when openat2_supported is false.
+ */
+void test_openat2_opath_tests(void)
+{
+ int rootfd, hardcoded_fd;
+ char *procselfexe, *hardcoded_fdpath;
+
+ E_asprintf(&procselfexe, "/proc/%d/exe", getpid());
+ rootfd = setup_testdir();
+
+ /*
+ * Reserve a descriptor number so that the table below can contain a
+ * fixed "self/fd/<n>" path; the loop re-points it with dup2().
+ */
+ hardcoded_fd = open("/dev/null", O_RDONLY);
+ E_assert(hardcoded_fd >= 0, "open fd to hardcode");
+ E_asprintf(&hardcoded_fdpath, "self/fd/%d", hardcoded_fd);
+
+ struct basic_test tests[] = {
+ /** RESOLVE_BENEATH **/
+ /* Attempts to cross dirfd should be blocked. */
+ { .name = "[beneath] jump to /",
+ .path = "/", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] absolute link to $root",
+ .path = "cheeky/absself", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] chained absolute links to $root",
+ .path = "abscheeky/absself", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] jump outside $root",
+ .path = "..", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] temporary jump outside $root",
+ .path = "../root/", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] symlink temporary jump outside $root",
+ .path = "cheeky/self", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] chained symlink temporary jump outside $root",
+ .path = "abscheeky/self", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] garbage links to $root",
+ .path = "cheeky/garbageself", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] chained garbage links to $root",
+ .path = "abscheeky/garbageself", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ /* Only relative paths that stay inside dirfd should work. */
+ { .name = "[beneath] ordinary path to 'root'",
+ .path = "root", .how.resolve = RESOLVE_BENEATH,
+ .out.path = "root", .pass = true },
+ { .name = "[beneath] ordinary path to 'etc'",
+ .path = "etc", .how.resolve = RESOLVE_BENEATH,
+ .out.path = "etc", .pass = true },
+ { .name = "[beneath] ordinary path to 'etc/passwd'",
+ .path = "etc/passwd", .how.resolve = RESOLVE_BENEATH,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[beneath] relative symlink inside $root",
+ .path = "relsym", .how.resolve = RESOLVE_BENEATH,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[beneath] chained-'..' relative symlink inside $root",
+ .path = "cheeky/passwd", .how.resolve = RESOLVE_BENEATH,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[beneath] absolute symlink component outside $root",
+ .path = "abscheeky/passwd", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] absolute symlink target outside $root",
+ .path = "abssym", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] absolute path outside $root",
+ .path = "/etc/passwd", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] cheeky absolute path outside $root",
+ .path = "cheeky/abspasswd", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] chained cheeky absolute path outside $root",
+ .path = "abscheeky/abspasswd", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ /* Tricky paths should fail. */
+ { .name = "[beneath] tricky '..'-chained symlink outside $root",
+ .path = "cheeky/dotdotlink", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] tricky absolute + '..'-chained symlink outside $root",
+ .path = "abscheeky/dotdotlink", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] tricky garbage link outside $root",
+ .path = "cheeky/garbagelink", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[beneath] tricky absolute + garbage link outside $root",
+ .path = "abscheeky/garbagelink", .how.resolve = RESOLVE_BENEATH,
+ .out.err = -EXDEV, .pass = false },
+
+ /** RESOLVE_IN_ROOT **/
+ /* All attempts to cross the dirfd will be scoped-to-root. */
+ { .name = "[in_root] jump to /",
+ .path = "/", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = NULL, .pass = true },
+ { .name = "[in_root] absolute symlink to /root",
+ .path = "cheeky/absself", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = NULL, .pass = true },
+ { .name = "[in_root] chained absolute symlinks to /root",
+ .path = "abscheeky/absself", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = NULL, .pass = true },
+ { .name = "[in_root] '..' at root",
+ .path = "..", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = NULL, .pass = true },
+ { .name = "[in_root] '../root' at root",
+ .path = "../root/", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "root", .pass = true },
+ { .name = "[in_root] relative symlink containing '..' above root",
+ .path = "cheeky/self", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "root", .pass = true },
+ { .name = "[in_root] garbage link to /root",
+ .path = "cheeky/garbageself", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "root", .pass = true },
+ { .name = "[in_root] chained garbage links to /root",
+ .path = "abscheeky/garbageself", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "root", .pass = true },
+ { .name = "[in_root] relative path to 'root'",
+ .path = "root", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "root", .pass = true },
+ { .name = "[in_root] relative path to 'etc'",
+ .path = "etc", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc", .pass = true },
+ { .name = "[in_root] relative path to 'etc/passwd'",
+ .path = "etc/passwd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] relative symlink to 'etc/passwd'",
+ .path = "relsym", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] chained-'..' relative symlink to 'etc/passwd'",
+ .path = "cheeky/passwd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] chained-'..' absolute + relative symlink to 'etc/passwd'",
+ .path = "abscheeky/passwd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] absolute symlink to 'etc/passwd'",
+ .path = "abssym", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] absolute path 'etc/passwd'",
+ .path = "/etc/passwd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] cheeky absolute path 'etc/passwd'",
+ .path = "cheeky/abspasswd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] chained cheeky absolute path 'etc/passwd'",
+ .path = "abscheeky/abspasswd", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky '..'-chained symlink outside $root",
+ .path = "cheeky/dotdotlink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky absolute + '..'-chained symlink outside $root",
+ .path = "abscheeky/dotdotlink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky absolute path + absolute + '..'-chained symlink outside $root",
+ .path = "/../../../../abscheeky/dotdotlink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky garbage link outside $root",
+ .path = "cheeky/garbagelink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky absolute + garbage link outside $root",
+ .path = "abscheeky/garbagelink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ { .name = "[in_root] tricky absolute path + absolute + garbage link outside $root",
+ .path = "/../../../../abscheeky/garbagelink", .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "etc/passwd", .pass = true },
+ /* O_CREAT should handle trailing symlinks correctly. */
+ { .name = "[in_root] O_CREAT of relative path inside $root",
+ .path = "newfile1", .how.flags = O_CREAT,
+ .how.mode = 0700,
+ .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "newfile1", .pass = true },
+ { .name = "[in_root] O_CREAT of absolute path",
+ .path = "/newfile2", .how.flags = O_CREAT,
+ .how.mode = 0700,
+ .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "newfile2", .pass = true },
+ { .name = "[in_root] O_CREAT of tricky symlink outside root",
+ .path = "/creatlink", .how.flags = O_CREAT,
+ .how.mode = 0700,
+ .how.resolve = RESOLVE_IN_ROOT,
+ .out.path = "newfile3", .pass = true },
+
+ /** RESOLVE_NO_XDEV **/
+ /* Crossing *down* into a mountpoint is disallowed. */
+ { .name = "[no_xdev] cross into $mnt",
+ .path = "mnt", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] cross into $mnt/",
+ .path = "mnt/", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] cross into $mnt/.",
+ .path = "mnt/.", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ /* Crossing *up* out of a mountpoint is disallowed. */
+ { .name = "[no_xdev] goto mountpoint root",
+ .dir = "mnt", .path = ".", .how.resolve = RESOLVE_NO_XDEV,
+ .out.path = "mnt", .pass = true },
+ { .name = "[no_xdev] cross up through '..'",
+ .dir = "mnt", .path = "..", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] temporary cross up through '..'",
+ .dir = "mnt", .path = "../mnt", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] temporary relative symlink cross up",
+ .dir = "mnt", .path = "self", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] temporary absolute symlink cross up",
+ .dir = "mnt", .path = "absself", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ /* Jumping to "/" is ok, but later components cannot cross. */
+ { .name = "[no_xdev] jump to / directly",
+ .dir = "mnt", .path = "/", .how.resolve = RESOLVE_NO_XDEV,
+ .out.path = "/", .pass = true },
+ { .name = "[no_xdev] jump to / (from /) directly",
+ .dir = "/", .path = "/", .how.resolve = RESOLVE_NO_XDEV,
+ .out.path = "/", .pass = true },
+ { .name = "[no_xdev] jump to / then proc",
+ .path = "/proc/1", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] jump to / then tmp",
+ .path = "/tmp", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ /* Magic-links are blocked since they can switch vfsmounts. */
+ { .name = "[no_xdev] cross through magic-link to self/root",
+ .dir = "/proc", .path = "self/root", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ { .name = "[no_xdev] cross through magic-link to self/cwd",
+ .dir = "/proc", .path = "self/cwd", .how.resolve = RESOLVE_NO_XDEV,
+ .out.err = -EXDEV, .pass = false },
+ /* Except magic-link jumps inside the same vfsmount. */
+ { .name = "[no_xdev] jump through magic-link to same procfs",
+ .dir = "/proc", .path = hardcoded_fdpath, .how.resolve = RESOLVE_NO_XDEV,
+ .out.path = "/proc", .pass = true, },
+
+ /** RESOLVE_NO_MAGICLINKS **/
+ /* Regular symlinks should work. */
+ { .name = "[no_magiclinks] ordinary relative symlink",
+ .path = "relsym", .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.path = "etc/passwd", .pass = true },
+ /* Magic-links should not work. */
+ { .name = "[no_magiclinks] symlink to magic-link",
+ .path = "procexe", .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_magiclinks] normal path to magic-link",
+ .path = "/proc/self/exe", .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_magiclinks] normal path to magic-link with O_NOFOLLOW",
+ .path = "/proc/self/exe", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.path = procselfexe, .pass = true },
+ { .name = "[no_magiclinks] symlink to magic-link path component",
+ .path = "procroot/etc", .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_magiclinks] magic-link path component",
+ .path = "/proc/self/root/etc", .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_magiclinks] magic-link path component with O_NOFOLLOW",
+ .path = "/proc/self/root/etc", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_MAGICLINKS,
+ .out.err = -ELOOP, .pass = false },
+
+ /** RESOLVE_NO_SYMLINKS **/
+ /* Normal paths should work. */
+ { .name = "[no_symlinks] ordinary path to '.'",
+ .path = ".", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = NULL, .pass = true },
+ { .name = "[no_symlinks] ordinary path to 'root'",
+ .path = "root", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "root", .pass = true },
+ { .name = "[no_symlinks] ordinary path to 'etc'",
+ .path = "etc", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "etc", .pass = true },
+ { .name = "[no_symlinks] ordinary path to 'etc/passwd'",
+ .path = "etc/passwd", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "etc/passwd", .pass = true },
+ /* Regular symlinks are blocked. */
+ { .name = "[no_symlinks] relative symlink target",
+ .path = "relsym", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] relative symlink component",
+ .path = "reletc/passwd", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] absolute symlink target",
+ .path = "abssym", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] absolute symlink component",
+ .path = "absetc/passwd", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] cheeky garbage link",
+ .path = "cheeky/garbagelink", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] cheeky absolute + garbage link",
+ .path = "abscheeky/garbagelink", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] cheeky absolute + absolute symlink",
+ .path = "abscheeky/absself", .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ /* Trailing symlinks with NO_FOLLOW. */
+ { .name = "[no_symlinks] relative symlink with O_NOFOLLOW",
+ .path = "relsym", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "relsym", .pass = true },
+ { .name = "[no_symlinks] absolute symlink with O_NOFOLLOW",
+ .path = "abssym", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "abssym", .pass = true },
+ { .name = "[no_symlinks] trailing symlink with O_NOFOLLOW",
+ .path = "cheeky/garbagelink", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.path = "cheeky/garbagelink", .pass = true },
+ { .name = "[no_symlinks] multiple symlink components with O_NOFOLLOW",
+ .path = "abscheeky/absself", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ { .name = "[no_symlinks] multiple symlink (and garbage link) components with O_NOFOLLOW",
+ .path = "abscheeky/garbagelink", .how.flags = O_NOFOLLOW,
+ .how.resolve = RESOLVE_NO_SYMLINKS,
+ .out.err = -ELOOP, .pass = false },
+ };
+
+ /* Keep the count used for the test plan in sync with the table. */
+ BUILD_BUG_ON(ARRAY_LEN(tests) != NUM_OPENAT2_OPATH_TESTS);
+
+ for (int i = 0; i < ARRAY_LEN(tests); i++) {
+ int dfd, fd;
+ char *fdpath = NULL;
+ bool failed;
+ void (*resultfn)(const char *msg, ...) = ksft_test_result_pass;
+ struct basic_test *test = &tests[i];
+
+ if (!openat2_supported) {
+ ksft_print_msg("openat2(2) unsupported\n");
+ resultfn = ksft_test_result_skip;
+ goto skip;
+ }
+
+ /* Auto-set O_PATH. */
+ if (!(test->how.flags & O_CREAT))
+ test->how.flags |= O_PATH;
+
+ if (test->dir)
+ dfd = openat(rootfd, test->dir, O_PATH | O_DIRECTORY);
+ else
+ dfd = dup(rootfd);
+ /* openat()/dup() report failure as -1, which is non-zero. */
+ E_assert(dfd >= 0, "failed to openat root '%s': %m",
+ test->dir ?: ".");
+
+ /* Make the "self/fd/<n>" path used by the table refer to dfd. */
+ E_dup2(dfd, hardcoded_fd);
+
+ fd = sys_openat2(dfd, test->path, &test->how);
+ if (test->pass)
+ failed = (fd < 0 || !fdequal(fd, rootfd, test->out.path));
+ else
+ failed = (fd != test->out.err);
+ if (fd >= 0) {
+ fdpath = fdreadlink(fd);
+ close(fd);
+ }
+ close(dfd);
+
+ if (failed) {
+ resultfn = ksft_test_result_fail;
+
+ ksft_print_msg("openat2 unexpectedly returned ");
+ if (fdpath)
+ ksft_print_msg("%d['%s']\n", fd, fdpath);
+ else
+ ksft_print_msg("%d (%s)\n", fd, strerror(-fd));
+ }
+
+skip:
+ if (test->pass)
+ resultfn("%s gives path '%s'\n", test->name,
+ test->out.path ?: ".");
+ else
+ resultfn("%s fails with %d (%s)\n", test->name,
+ test->out.err, strerror(-test->out.err));
+
+ fflush(stdout);
+ free(fdpath);
+ }
+
+ free(procselfexe);
+ close(rootfd);
+
+ free(hardcoded_fdpath);
+ close(hardcoded_fd);
+}
+
+#define NUM_TESTS NUM_OPENAT2_OPATH_TESTS
+
+/* Skip unless running with euid 0, else run the resolve table and exit with the result. */
+int main(int argc, char **argv)
+{
+ bool any_failed;
+
+ ksft_print_header();
+ ksft_set_plan(NUM_TESTS);
+
+ /* NOTE: We should be checking for CAP_SYS_ADMIN here... */
+ if (geteuid())
+ ksft_exit_skip("all tests require euid == 0\n");
+
+ test_openat2_opath_tests();
+
+ any_failed = ksft_get_fail_cnt() + ksft_get_error_cnt() > 0;
+ if (!any_failed)
+ ksft_exit_pass();
+ else
+ ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore
new file mode 100644
index 000000000..93ab9d7e5
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/.gitignore
@@ -0,0 +1 @@
+regression_enomem
diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile
new file mode 100644
index 000000000..dcaefa224
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -g -I../../../../usr/include/
+
+TEST_GEN_PROGS := regression_enomem
+
+include ../lib.mk
+
+$(OUTPUT)/regression_enomem: regression_enomem.c ../pidfd/pidfd.h
diff --git a/tools/testing/selftests/pid_namespace/config b/tools/testing/selftests/pid_namespace/config
new file mode 100644
index 000000000..26cdb27e7
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/config
@@ -0,0 +1,2 @@
+CONFIG_PID_NS=y
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/pid_namespace/regression_enomem.c b/tools/testing/selftests/pid_namespace/regression_enomem.c
new file mode 100644
index 000000000..7d84097ad
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/regression_enomem.c
@@ -0,0 +1,44 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+/*
+ * Regression test for:
+ * 35f71bc0a09a ("fork: report pid reservation failure properly")
+ * b26ebfe12f34 ("pid: Fix error return value in some cases")
+ */
+TEST(regression_enomem)
+{
+ pid_t pid;
+
+ /*
+ * Non-root callers enter a new user namespace first.
+ * NOTE(review): presumably this is what makes the following
+ * unshare(CLONE_NEWPID) permitted for unprivileged users - confirm.
+ */
+ if (geteuid())
+ EXPECT_EQ(0, unshare(CLONE_NEWUSER));
+
+ EXPECT_EQ(0, unshare(CLONE_NEWPID));
+
+ /* First fork after the unshare: must succeed; the child exits at once. */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ /* Reap the first child and require a zero exit status. */
+ EXPECT_EQ(0, wait_for_pid(pid));
+
+ /*
+ * Second fork: the regression being tested is that this fails and
+ * that the failure is reported as ENOMEM rather than another errno.
+ * NOTE(review): per pid_namespaces(7) the first child is the init of
+ * the new pid namespace, so no further process can be created in it
+ * once that child has exited - confirm against the referenced commits.
+ */
+ pid = fork();
+ ASSERT_LT(pid, 0);
+ ASSERT_EQ(errno, ENOMEM);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
new file mode 100644
index 000000000..973198a3e
--- /dev/null
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+pidfd_open_test
+pidfd_poll_test
+pidfd_test
+pidfd_wait
+pidfd_fdinfo_test
+pidfd_getfd_test
+pidfd_setns_test
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
new file mode 100644
index 000000000..f4a2f28f9
--- /dev/null
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -g -I../../../../usr/include/ -pthread
+
+TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
+ pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config
new file mode 100644
index 000000000..f6f2965e1
--- /dev/null
+++ b/tools/testing/selftests/pidfd/config
@@ -0,0 +1,7 @@
+CONFIG_UTS_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+CONFIG_CGROUPS=y
+CONFIG_CHECKPOINT_RESTORE=y
diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h
new file mode 100644
index 000000000..6922d6417
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd.h
@@ -0,0 +1,118 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PIDFD_H
+#define __PIDFD_H
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
+
+#ifndef P_PIDFD
+#define P_PIDFD 3
+#endif
+
+#ifndef CLONE_NEWTIME
+#define CLONE_NEWTIME 0x00000080
+#endif
+
+#ifndef CLONE_PIDFD
+#define CLONE_PIDFD 0x00001000
+#endif
+
+#ifndef __NR_pidfd_open
+#define __NR_pidfd_open -1
+#endif
+
+#ifndef __NR_pidfd_send_signal
+#define __NR_pidfd_send_signal -1
+#endif
+
+#ifndef __NR_clone3
+#define __NR_clone3 -1
+#endif
+
+#ifndef __NR_pidfd_getfd
+#define __NR_pidfd_getfd -1
+#endif
+
+#ifndef PIDFD_NONBLOCK
+#define PIDFD_NONBLOCK O_NONBLOCK
+#endif
+
+/*
+ * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c.
+ * That means, when it wraps around any pid < 300 will be skipped.
+ * So we need to use a pid > 300 in order to test recycling.
+ */
+#define PID_RECYCLE 1000
+
+/*
+ * Define a few custom error codes for the child process to clearly indicate
+ * what is happening. This way we can tell the difference between a system
+ * error, a test error, etc.
+ */
+#define PIDFD_PASS 0
+#define PIDFD_FAIL 1
+#define PIDFD_ERROR 2
+#define PIDFD_SKIP 3
+#define PIDFD_XFAIL 4
+
+/*
+ * Reap @pid, transparently retrying when waitpid() is interrupted by a
+ * signal.
+ *
+ * Returns the child's exit status when it exited normally. Returns -1 when
+ * waitpid() failed or when the child did not terminate through exit; a
+ * diagnostic line is printed in every case.
+ */
+static inline int wait_for_pid(pid_t pid)
+{
+	int status, ret;
+
+	do {
+		ret = waitpid(pid, &status, 0);
+	} while (ret == -1 && errno == EINTR);
+
+	if (ret == -1) {
+		ksft_print_msg("waitpid returned -1, errno=%d\n", errno);
+		return -1;
+	}
+
+	if (WIFEXITED(status)) {
+		ret = WEXITSTATUS(status);
+		ksft_print_msg("waitpid WEXITSTATUS=%d\n", ret);
+		return ret;
+	}
+
+	/* Terminated by something other than a normal exit. */
+	ksft_print_msg(
+		"waitpid !WIFEXITED, WIFSIGNALED=%d, WTERMSIG=%d\n",
+		WIFSIGNALED(status), WTERMSIG(status));
+	return -1;
+}
+
+/*
+ * Raw wrapper around the pidfd_open syscall; returns whatever syscall()
+ * returns. __NR_pidfd_open is defined as -1 in this header when the system
+ * headers do not provide it.
+ */
+static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
+{
+ return syscall(__NR_pidfd_open, pid, flags);
+}
+
+/*
+ * Raw wrapper around the pidfd_send_signal syscall: forwards @pidfd, @sig,
+ * @info and @flags unchanged and returns the syscall() result.
+ */
+static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
+ unsigned int flags)
+{
+ return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
+}
+
+/*
+ * Raw wrapper around the pidfd_getfd syscall: forwards @pidfd, the remote fd
+ * number @fd and @flags, and returns the syscall() result.
+ */
+static inline int sys_pidfd_getfd(int pidfd, int fd, int flags)
+{
+ return syscall(__NR_pidfd_getfd, pidfd, fd, flags);
+}
+
+/*
+ * Raw wrapper around the memfd_create syscall; returns the syscall() result.
+ * Unlike the pidfd syscalls, this header provides no fallback definition for
+ * __NR_memfd_create.
+ */
+static inline int sys_memfd_create(const char *name, unsigned int flags)
+{
+ return syscall(__NR_memfd_create, name, flags);
+}
+
+#endif /* __PIDFD_H */
diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
new file mode 100644
index 000000000..3bc46d615
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c
@@ -0,0 +1,310 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+/* Outcome of one test: a PIDFD_* result code plus a formatted explanation. */
+struct error {
+ int code; /* PIDFD_PASS (0) until error_set() records a failure */
+ char msg[512]; /* message written by error_set() together with the code */
+};
+
+/*
+ * Record the first non-passing result in @err.
+ *
+ * Nothing is stored when @code is PIDFD_PASS, when @err is NULL, or when @err
+ * already holds a non-passing code. Otherwise the code is stored and the
+ * printf-style message is formatted into err->msg (truncation trips the
+ * assert). @code is always returned so callers can "return error_set(...)".
+ */
+static int error_set(struct error *err, int code, const char *fmt, ...)
+{
+	va_list ap;
+	int len;
+
+	if (code != PIDFD_PASS && err && err->code == PIDFD_PASS) {
+		err->code = code;
+
+		va_start(ap, fmt);
+		len = vsnprintf(err->msg, sizeof(err->msg), fmt, ap);
+		assert((size_t)len < sizeof(err->msg));
+		va_end(ap);
+	}
+
+	return code;
+}
+
+/*
+ * Emit the kselftest result line for @test_name according to err->code.
+ * PIDFD_ERROR and unknown codes go through ksft_exit_fail_msg(); the other
+ * codes produce a single test result.
+ */
+static void error_report(struct error *err, const char *test_name)
+{
+	const int code = err->code;
+
+	if (code == PIDFD_PASS) {
+		ksft_test_result_pass("%s test: Passed\n", test_name);
+	} else if (code == PIDFD_XFAIL) {
+		ksft_test_result_pass("%s test: Expected failure: %s\n",
+				      test_name, err->msg);
+	} else if (code == PIDFD_SKIP) {
+		/* will be: not ok %d # SKIP %s test: %s */
+		ksft_test_result_skip("%s test: %s\n", test_name, err->msg);
+	} else if (code == PIDFD_FAIL) {
+		/* will be: not ok %d # error %s test: %s */
+		ksft_test_result_error("%s test: %s\n", test_name, err->msg);
+	} else if (code == PIDFD_ERROR) {
+		ksft_exit_fail_msg("%s test: Fatal: %s\n", test_name, err->msg);
+	} else {
+		ksft_exit_fail_msg("%s test: Unknown code: %d %s\n",
+				   test_name, err->code, err->msg);
+	}
+}
+
+/*
+ * Hand a fatal result (PIDFD_ERROR) to error_report() right away; any other
+ * code is left for later reporting. Returns the code stored in @err.
+ */
+static inline int error_check(struct error *err, const char *test_name)
+{
+	const int fatal = (err->code == PIDFD_ERROR);
+
+	/* In case of error we bail out and terminate the test program */
+	if (fatal)
+		error_report(err, test_name);
+
+	return err->code;
+}
+
+#define CHILD_STACK_SIZE 8192
+
+/* Bookkeeping for a process created by clone_newns(). */
+struct child {
+ char *stack; /* mmap()ed stack of CHILD_STACK_SIZE bytes */
+ pid_t pid; /* return value of clone() */
+ int fd; /* pidfd slot handed to clone() together with CLONE_PIDFD */
+};
+
+/*
+ * Start @fn(@args) in a new pid and mount namespace on a freshly mapped stack
+ * and request a pidfd for it.
+ *
+ * When the caller is not root, CLONE_NEWUSER is added to the (static) flag
+ * set once and kept for later calls.
+ *
+ * Returns a struct child describing the new process. On failure the fields
+ * that could not be filled stay at their "invalid" defaults (pid/fd == -1)
+ * and @err is set to PIDFD_ERROR, which error_check() treats as fatal. The
+ * stack mapping is not released on clone failure because callers terminate
+ * the program on PIDFD_ERROR.
+ */
+static struct child clone_newns(int (*fn)(void *), void *args,
+				struct error *err)
+{
+	static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;
+	/* Never hand uninitialised pid/fd values back to the caller. */
+	struct child ret = {
+		.stack = NULL,
+		.pid = -1,
+		.fd = -1,
+	};
+
+	if (!(flags & CLONE_NEWUSER) && geteuid() != 0)
+		flags |= CLONE_NEWUSER;
+
+	ret.stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+	if (ret.stack == MAP_FAILED) {
+		/*
+		 * Use a real PIDFD_* code: a bare -1 is not recognised by
+		 * error_check(), so the caller would carry on with an
+		 * unusable child.
+		 */
+		error_set(err, PIDFD_ERROR, "mmap of stack failed (errno %d)",
+			  errno);
+		return ret;
+	}
+
+#ifdef __ia64__
+	ret.pid = __clone2(fn, ret.stack, CHILD_STACK_SIZE, flags, args, &ret.fd);
+#else
+	ret.pid = clone(fn, ret.stack + CHILD_STACK_SIZE, flags, args, &ret.fd);
+#endif
+
+	if (ret.pid < 0) {
+		/* Report clone()'s return value, not the unset pidfd slot. */
+		error_set(err, PIDFD_ERROR, "clone failed (ret %d, errno %d)",
+			  ret.pid, errno);
+		return ret;
+	}
+
+	ksft_print_msg("New child: %d, fd: %d\n", ret.pid, ret.fd);
+
+	return ret;
+}
+
+/* Close the pidfd stored in @child; the close() result is not checked. */
+static inline void child_close(struct child *child)
+{
+ close(child->fd);
+}
+
+/*
+ * Wait for @child to terminate and unmap its stack.
+ *
+ * A negative wait result is recorded in @err as PIDFD_ERROR; a positive exit
+ * status is recorded as-is (it is used as the result code). Returns the wait
+ * result, or -1 when unmapping the stack failed.
+ */
+static inline int child_join(struct child *child, struct error *err)
+{
+	int status = wait_for_pid(child->pid);
+
+	if (status > 0)
+		error_set(err, status, "child %d reported: %d", child->pid,
+			  status);
+	else if (status < 0)
+		error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
+			  status, errno);
+
+	if (munmap(child->stack, CHILD_STACK_SIZE) != 0) {
+		error_set(err, -1, "munmap of child stack failed (errno %d)", errno);
+		status = -1;
+	}
+
+	return status;
+}
+
+/*
+ * Close the child's pidfd first, then wait for the child via child_join().
+ * Returns child_join()'s result.
+ */
+static inline int child_join_close(struct child *child, struct error *err)
+{
+ child_close(child);
+ return child_join(child, err);
+}
+
+/*
+ * Cut @str at its last '\n' (if any) by overwriting that newline with a NUL
+ * terminator. Strings without a newline are left untouched.
+ */
+static inline void trim_newline(char *str)
+{
+	size_t idx = strlen(str);
+
+	/* Scan backwards so that the last newline is the one that is hit. */
+	while (idx > 0) {
+		idx--;
+		if (str[idx] == '\n') {
+			str[idx] = '\0';
+			return;
+		}
+	}
+}
+
+/*
+ * Check one line of /proc/self/fdinfo/<pidfd>.
+ *
+ * The first line starting with @prefix (compared over @prefix_len bytes) is
+ * located and the remainder of that line is compared with the printf-style
+ * expectation built from @expect and the variadic arguments. The expectation
+ * must include the leading whitespace and the trailing newline of the line.
+ *
+ * Returns PIDFD_PASS on a match, PIDFD_FAIL when the line is missing or its
+ * value differs, and PIDFD_ERROR when the fdinfo file cannot be opened. Every
+ * failure is also recorded in @err through error_set().
+ */
+static int verify_fdinfo(int pidfd, struct error *err, const char *prefix,
+			 size_t prefix_len, const char *expect, ...)
+{
+	char buffer[512] = {0, };
+	char path[512] = {0, };
+	va_list args;
+	FILE *f;
+	char *line = NULL;
+	size_t n = 0;
+	int found = 0;
+	int result = PIDFD_PASS;
+	int r;
+
+	va_start(args, expect);
+	r = vsnprintf(buffer, sizeof(buffer), expect, args);
+	assert((size_t)r < sizeof(buffer));
+	va_end(args);
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+	f = fopen(path, "re");
+	if (!f)
+		return error_set(err, PIDFD_ERROR, "fdinfo open failed for %d",
+				 pidfd);
+
+	while (getline(&line, &n, f) != -1) {
+		char *val;
+
+		if (strncmp(line, prefix, prefix_len))
+			continue;
+
+		found = 1;
+
+		val = line + prefix_len;
+		if (strcmp(val, buffer) != 0) {
+			/* Strip newlines only for the diagnostic text. */
+			trim_newline(line);
+			trim_newline(buffer);
+			/*
+			 * Propagate the mismatch to the caller as well;
+			 * callers that act on the return value would
+			 * otherwise see a pass.
+			 */
+			result = error_set(err, PIDFD_FAIL, "%s '%s' != '%s'",
+					   prefix, val, buffer);
+		}
+		break;
+	}
+
+	free(line);
+	fclose(f);
+
+	if (found == 0)
+		return error_set(err, PIDFD_FAIL, "%s not found for fd %d",
+				 prefix, pidfd);
+
+	return result;
+}
+
+/*
+ * Body of the cloned children.
+ *
+ * @args is either NULL (nothing to verify, the child passes immediately) or a
+ * pointer to a pidfd whose fdinfo is inspected from inside this child's
+ * namespaces. Returns a PIDFD_* code that becomes the child's exit status.
+ */
+static int child_fdinfo_nspid_test(void *args)
+{
+	/*
+	 * Must start out as PIDFD_PASS with an empty message: error_set()
+	 * only records into a pristine struct, and err.msg is printed below.
+	 */
+	struct error err = {0, };
+	int pidfd;
+	int r;
+
+	/* if we got no fd for the sibling, we are done */
+	if (!args)
+		return PIDFD_PASS;
+
+	/* Verify that we cannot resolve the pidfd of a process in a
+	 * sibling pid namespace, i.e. a pid namespace that is neither
+	 * our own nor one of its descendants.
+	 */
+	r = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount / private\n");
+		return PIDFD_ERROR;
+	}
+
+	/* Replace /proc with an instance mounted from within this child. */
+	(void)umount2("/proc", MNT_DETACH);
+	r = mount("proc", "/proc", "proc", 0, NULL);
+	if (r < 0) {
+		ksft_print_msg("Failed to remount /proc\n");
+		return PIDFD_ERROR;
+	}
+
+	pidfd = *(int *)args;
+	r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n");
+
+	if (r != PIDFD_PASS)
+		ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
+
+	return r;
+}
+
+/*
+ * Test: the NSpid line of a pidfd's fdinfo as seen from the parent and from a
+ * child living in a sibling pid namespace. Reports one kselftest result.
+ */
+static void test_pidfd_fdinfo_nspid(void)
+{
+ struct child a, b;
+ struct error err = {0, };
+ const char *test_name = "pidfd check for NSpid in fdinfo";
+
+ /* Create a new child in a new pid and mount namespace */
+ a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+ error_check(&err, test_name);
+
+ /* Pass the pidfd representing the first child to the
+ * second child, which will be in a sibling pid namespace,
+ * which means that the fdinfo NSpid entry for the pidfd
+ * should only contain '0'.
+ */
+ b = clone_newns(child_fdinfo_nspid_test, &a.fd, &err);
+ error_check(&err, test_name);
+
+ /* The children will have pid 1 in the new pid namespace,
+ * so the line must be 'NSpid:\t<pid>\t1'.
+ */
+ verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1);
+ verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1);
+
+ /* wait for the process, check the exit status and set
+ * 'err' accordingly, if it is not already set.
+ */
+ child_join_close(&a, &err);
+ child_join_close(&b, &err);
+
+ error_report(&err, test_name);
+}
+
+/*
+ * Test: once the child has been waited for, the Pid and NSpid lines of its
+ * pidfd's fdinfo must both read -1. Reports one kselftest result.
+ */
+static void test_pidfd_dead_fdinfo(void)
+{
+ struct child a;
+ struct error err = {0, };
+ const char *test_name = "pidfd check fdinfo for dead process";
+
+ /* Create a new child in a new pid and mount namespace */
+ a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
+ error_check(&err, test_name);
+ /* Reap the child first; the pidfd stays open for the checks below. */
+ child_join(&a, &err);
+
+ verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n");
+ verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n");
+ child_close(&a);
+ error_report(&err, test_name);
+}
+
+/* Entry point: announce a plan of two tests, run both fdinfo tests, exit. */
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(2);
+
+ test_pidfd_fdinfo_nspid();
+ test_pidfd_dead_fdinfo();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
new file mode 100644
index 000000000..0930e2411
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/kcmp.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+/*
+ * UNKNOWN_FD is an fd number that should never exist in the child, as it is
+ * used to check the negative case.
+ */
+#define UNKNOWN_FD 111
+#define UID_NOBODY 65535
+
+/*
+ * Raw wrapper around the kcmp syscall: forwards all five arguments unchanged
+ * and returns the syscall() result.
+ */
+static int sys_kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1,
+ unsigned long idx2)
+{
+ return syscall(__NR_kcmp, pid1, pid2, type, idx1, idx2);
+}
+
+/*
+ * Child-side protocol loop.
+ *
+ * Sends the number of @memfd to the parent over @sk, then serves one-byte
+ * commands until the peer closes the socket. Returns 0 on a clean shutdown
+ * and -1 on any failure (a message is written to stderr).
+ */
+static int __child(int sk, int memfd)
+{
+ int ret;
+ char buf;
+
+ /*
+ * Ensure we don't leave around a bunch of orphaned children if our
+ * tests fail.
+ */
+ ret = prctl(PR_SET_PDEATHSIG, SIGKILL);
+ if (ret) {
+ fprintf(stderr, "%s: Child could not set DEATHSIG\n",
+ strerror(errno));
+ return -1;
+ }
+
+ ret = send(sk, &memfd, sizeof(memfd), 0);
+ if (ret != sizeof(memfd)) {
+ fprintf(stderr, "%s: Child failed to send fd number\n",
+ strerror(errno));
+ return -1;
+ }
+
+ /*
+ * The fixture setup is completed at this point. The tests will run.
+ *
+ * This blocking recv enables the parent to message the child.
+ * Either we will read 'P' off of the sk, indicating that we need
+ * to disable ptrace, or we will read a 0, indicating that the other
+ * side has closed the sk. This occurs during fixture teardown time,
+ * indicating that the child should exit.
+ */
+ while ((ret = recv(sk, &buf, sizeof(buf), 0)) > 0) {
+ if (buf == 'P') {
+ ret = prctl(PR_SET_DUMPABLE, 0);
+ if (ret < 0) {
+ fprintf(stderr,
+ "%s: Child failed to disable ptrace\n",
+ strerror(errno));
+ return -1;
+ }
+ } else {
+ fprintf(stderr, "Child received unknown command %c\n",
+ buf);
+ return -1;
+ }
+ /* Acknowledge by echoing the command byte back to the parent. */
+ ret = send(sk, &buf, sizeof(buf), 0);
+ if (ret != 1) {
+ fprintf(stderr, "%s: Child failed to ack\n",
+ strerror(errno));
+ return -1;
+ }
+ }
+ /* ret is the result of the recv() that ended the loop: 0 or negative. */
+ if (ret < 0) {
+ fprintf(stderr, "%s: Child failed to read from socket\n",
+ strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Child entry point: create a memfd named "test", run the protocol loop with
+ * it and release both the memfd and the socket @sk before returning.
+ * Returns the protocol loop's result, or -1 when the memfd cannot be created.
+ */
+static int child(int sk)
+{
+	int status = -1;
+	int memfd = sys_memfd_create("test", 0);
+
+	if (memfd >= 0) {
+		status = __child(sk, memfd);
+		close(memfd);
+	} else {
+		fprintf(stderr, "%s: Child could not create memfd\n",
+			strerror(errno));
+	}
+
+	close(sk);
+	return status;
+}
+
+/* Per-test state shared by all TEST_F(child, ...) cases. */
+FIXTURE(child)
+{
+ /*
+ * remote_fd is the number of the FD which we are trying to retrieve
+ * from the child.
+ */
+ int remote_fd;
+ /* pid points to the child which we are fetching FDs from */
+ pid_t pid;
+ /* pidfd is the pidfd of the child */
+ int pidfd;
+ /*
+ * sk is our side of the socketpair used to communicate with the child.
+ * When it is closed, the child will exit.
+ */
+ int sk;
+};
+
+/*
+ * Fork the helper child, open a pidfd for it and wait until it has reported
+ * the number of its memfd over the socketpair.
+ */
+FIXTURE_SETUP(child)
+{
+ int ret, sk_pair[2];
+
+ ASSERT_EQ(0, socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
+ TH_LOG("%s: failed to create socketpair", strerror(errno));
+ }
+ self->sk = sk_pair[0];
+
+ self->pid = fork();
+ ASSERT_GE(self->pid, 0);
+
+ if (self->pid == 0) {
+ /* Child: keep only its own end and never return into the harness. */
+ close(sk_pair[0]);
+ if (child(sk_pair[1]))
+ _exit(EXIT_FAILURE);
+ _exit(EXIT_SUCCESS);
+ }
+
+ /* Parent: drop the child's end of the socketpair. */
+ close(sk_pair[1]);
+
+ self->pidfd = sys_pidfd_open(self->pid, 0);
+ ASSERT_GE(self->pidfd, 0);
+
+ /*
+ * Wait for the child to complete setup. It'll send the remote memfd's
+ * number when ready.
+ */
+ ret = recv(sk_pair[0], &self->remote_fd, sizeof(self->remote_fd), 0);
+ ASSERT_EQ(sizeof(self->remote_fd), ret);
+}
+
+/*
+ * Close the pidfd and our socket end, then expect the child to exit with
+ * status 0.
+ */
+FIXTURE_TEARDOWN(child)
+{
+ EXPECT_EQ(0, close(self->pidfd));
+ EXPECT_EQ(0, close(self->sk));
+
+ EXPECT_EQ(0, wait_for_pid(self->pid));
+}
+
+/*
+ * After the child has cleared its dumpable flag, fetching one of its fds must
+ * fail with EPERM.
+ */
+TEST_F(child, disable_ptrace)
+{
+ int uid, fd;
+ char c;
+
+ /*
+ * Turn into nobody if we're root, to avoid CAP_SYS_PTRACE
+ *
+ * The tests should run in their own process, so even if this test
+ * fails, it shouldn't result in subsequent tests failing.
+ */
+ uid = getuid();
+ if (uid == 0)
+ ASSERT_EQ(0, seteuid(UID_NOBODY));
+
+ /* Ask the child to run prctl(PR_SET_DUMPABLE, 0) and wait for its ack. */
+ ASSERT_EQ(1, send(self->sk, "P", 1, 0));
+ ASSERT_EQ(1, recv(self->sk, &c, 1, 0));
+
+ fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
+ EXPECT_EQ(-1, fd);
+ EXPECT_EQ(EPERM, errno);
+
+ /* Restore the effective uid that was dropped above. */
+ if (uid == 0)
+ ASSERT_EQ(0, seteuid(0));
+}
+
+/*
+ * Fetch the child's memfd through pidfd_getfd() and check that
+ *  - the local fd refers to the same open file as the child's fd (kcmp), and
+ *  - the local fd has the close-on-exec flag set.
+ * The test is skipped when kcmp() is not available.
+ */
+TEST_F(child, fetch_fd)
+{
+	int fd, ret;
+
+	fd = sys_pidfd_getfd(self->pidfd, self->remote_fd, 0);
+	ASSERT_GE(fd, 0);
+
+	ret = sys_kcmp(getpid(), self->pid, KCMP_FILE, fd, self->remote_fd);
+	if (ret < 0 && errno == ENOSYS) {
+		/* Do not leak the fetched fd on the skip path. */
+		close(fd);
+		SKIP(return, "kcmp() syscall not supported");
+	}
+	EXPECT_EQ(ret, 0);
+
+	ret = fcntl(fd, F_GETFD);
+	ASSERT_GE(ret, 0);
+	/*
+	 * "ret & FD_CLOEXEC" can never be negative, so comparing it with
+	 * ">= 0" checks nothing; require the flag to actually be set.
+	 */
+	EXPECT_NE(ret & FD_CLOEXEC, 0);
+
+	close(fd);
+}
+
+/*
+ * Negative case: asking for UNKNOWN_FD, which the child is not expected to
+ * have open, must fail with EBADF.
+ */
+TEST_F(child, test_unknown_fd)
+{
+ int fd;
+
+ fd = sys_pidfd_getfd(self->pidfd, UNKNOWN_FD, 0);
+ EXPECT_EQ(-1, fd) {
+ TH_LOG("getfd succeeded while fetching unknown fd");
+ };
+ EXPECT_EQ(EBADF, errno) {
+ TH_LOG("%s: getfd did not get EBADF", strerror(errno));
+ }
+}
+
+/* A non-zero flags argument must be rejected with EINVAL. */
+TEST(flags_set)
+{
+ ASSERT_EQ(-1, sys_pidfd_getfd(0, 0, 1));
+ EXPECT_EQ(errno, EINVAL);
+}
+
+/*
+ * pidfd.h defines __NR_pidfd_getfd as -1 when the system headers lack it; in
+ * that case replace the harness main with one that reports a skip.
+ */
+#if __NR_pidfd_getfd == -1
+int main(void)
+{
+ fprintf(stderr, "__NR_pidfd_getfd undefined. The pidfd_getfd syscall is unavailable. Test aborting\n");
+ return KSFT_SKIP;
+}
+#else
+TEST_HARNESS_MAIN
+#endif
diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c
new file mode 100644
index 000000000..8a59438cc
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_open_test.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+/*
+ * Convert @numstr to an int, accepting the bases strtol() detects with base 0.
+ *
+ * On success stores the value in *@converted and returns 0. Returns -ERANGE
+ * when the value does not fit into a long or an int, and -EINVAL when the
+ * string is empty, has trailing characters or cannot be parsed. *@converted
+ * is left untouched on failure.
+ */
+static int safe_int(const char *numstr, int *converted)
+{
+	char *end = NULL;
+	long value;
+
+	errno = 0;
+	value = strtol(numstr, &end, 0);
+
+	/* Overflow or underflow of the long conversion itself. */
+	if (errno == ERANGE && (value == LONG_MAX || value == LONG_MIN))
+		return -ERANGE;
+
+	/* Conversion error, no digits consumed, or trailing garbage. */
+	if ((value == 0 && errno != 0) || end == numstr || *end != '\0')
+		return -EINVAL;
+
+	if (value < INT_MIN || value > INT_MAX)
+		return -ERANGE;
+
+	*converted = (int)value;
+	return 0;
+}
+
+/*
+ * Return the index of the first byte in @buffer[0..@len) that is neither a
+ * space nor a tab. Returns 0 when there is no such byte (including @len == 0).
+ */
+static int char_left_gc(const char *buffer, size_t len)
+{
+	size_t pos = 0;
+
+	while (pos < len && (buffer[pos] == ' ' || buffer[pos] == '\t'))
+		pos++;
+
+	return pos < len ? pos : 0;
+}
+
+/*
+ * Return the length of @buffer[0..@len) after dropping trailing spaces, tabs,
+ * newlines and NUL bytes, i.e. one past the index of the last byte that is
+ * none of those. Returns 0 when every byte is such a character.
+ */
+static int char_right_gc(const char *buffer, size_t len)
+{
+	int idx = len - 1;
+
+	while (idx >= 0) {
+		switch (buffer[idx]) {
+		case ' ':
+		case '\t':
+		case '\n':
+		case '\0':
+			idx--;
+			break;
+		default:
+			return idx + 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Trim @buffer in place: skip leading blanks as determined by char_left_gc()
+ * and terminate the string after the last byte kept by char_right_gc().
+ * Returns a pointer into @buffer at the first kept character.
+ */
+static char *trim_whitespace_in_place(char *buffer)
+{
+	char *start = buffer + char_left_gc(buffer, strlen(buffer));
+	int keep = char_right_gc(start, strlen(start));
+
+	start[keep] = '\0';
+	return start;
+}
+
+/*
+ * Look up the numeric value of the first /proc/self/fdinfo/<pidfd> line that
+ * starts with @key (compared over @keylen bytes).
+ *
+ * Returns the parsed number, or -1 when the file cannot be opened, no line
+ * matches, or the value is not a valid int.
+ */
+static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
+{
+	char path[512];
+	FILE *f;
+	size_t n = 0;
+	pid_t result = -1;
+	char *line = NULL;
+
+	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
+
+	f = fopen(path, "re");
+	if (!f)
+		return -1;
+
+	while (getline(&line, &n, f) != -1) {
+		char *numstr;
+
+		if (strncmp(line, key, keylen))
+			continue;
+
+		/*
+		 * Skip exactly the key that was matched; a fixed offset of 4
+		 * only works for four-character keys such as "Pid:".
+		 */
+		numstr = trim_whitespace_in_place(line + keylen);
+
+		/*
+		 * safe_int() only writes to result on success, so a parse
+		 * failure leaves the -1 default in place.
+		 */
+		(void)safe_int(numstr, &result);
+		break;
+	}
+
+	free(line);
+	fclose(f);
+	return result;
+}
+
+/*
+ * Entry point of the pidfd_open test. Three checks are planned:
+ *  1. an invalid pid (-1) is rejected,
+ *  2. a non-zero flags value is rejected,
+ *  3. a pidfd for the calling process can be opened.
+ * The first unexpected outcome aborts the run with a failing exit status.
+ */
+int main(int argc, char **argv)
+{
+	int pidfd = -1, ret = 1;
+	pid_t pid;
+
+	/* Emit the TAP header before the plan, like the other tests do. */
+	ksft_print_header();
+	ksft_set_plan(3);
+
+	pidfd = sys_pidfd_open(-1, 0);
+	if (pidfd >= 0) {
+		ksft_print_msg(
+			"%s - succeeded to open pidfd for invalid pid -1\n",
+			strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("do not allow invalid pid test: passed\n");
+
+	pidfd = sys_pidfd_open(getpid(), 1);
+	if (pidfd >= 0) {
+		ksft_print_msg(
+			"%s - succeeded to open pidfd with invalid flag value specified\n",
+			strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("do not allow invalid flag test: passed\n");
+
+	pidfd = sys_pidfd_open(getpid(), 0);
+	if (pidfd < 0) {
+		ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
+		goto on_error;
+	}
+	ksft_test_result_pass("open a new pidfd test: passed\n");
+
+	/* Informational only: the parsed pid is printed, not verified. */
+	pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
+	ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
+
+	ret = 0;
+
+on_error:
+	if (pidfd >= 0)
+		close(pidfd);
+
+	return !ret ? ksft_exit_pass() : ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_poll_test.c b/tools/testing/selftests/pidfd/pidfd_poll_test.c
new file mode 100644
index 000000000..610811275
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_poll_test.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+/*
+ * Set by the SIGALRM handler and read by main(). An object shared with a
+ * signal handler has to be a volatile sig_atomic_t so that the store is
+ * well-defined and the reader actually re-reads it.
+ */
+static volatile sig_atomic_t timeout;
+
+/* SIGALRM handler: note that the death notification took too long. */
+static void handle_alarm(int sig)
+{
+	timeout = 1;
+}
+
+/*
+ * Entry point of the pidfd poll test.
+ *
+ * For each iteration (10000 by default, or argv[1]): fork a sleeping child,
+ * open a pidfd for it, kill it through the pidfd and poll() the pidfd for the
+ * death notification, guarded by a 3 second alarm. Any unexpected result
+ * terminates the program with a failure; otherwise one passing result is
+ * reported.
+ */
+int main(int argc, char **argv)
+{
+	struct pollfd fds;
+	int iter, nevents;
+	int nr_iterations = 10000;
+
+	fds.events = POLLIN;
+
+	ksft_print_header();
+	ksft_set_plan(1);
+
+	if (argc > 2)
+		ksft_exit_fail_msg("Unexpected command line argument\n");
+
+	if (argc == 2) {
+		nr_iterations = atoi(argv[1]);
+		if (nr_iterations <= 0)
+			ksft_exit_fail_msg("invalid input parameter %s\n",
+					   argv[1]);
+	}
+
+	ksft_print_msg("running pidfd poll test for %d iterations\n",
+		       nr_iterations);
+
+	for (iter = 0; iter < nr_iterations; iter++) {
+		int pidfd;
+		int child_pid = fork();
+
+		if (child_pid < 0) {
+			/* Transient resource shortage: redo this iteration. */
+			if (errno == EAGAIN) {
+				iter--;
+				continue;
+			}
+			ksft_exit_fail_msg(
+				"%s - failed to fork a child process\n",
+				strerror(errno));
+		}
+
+		if (child_pid == 0) {
+			/* Child process just sleeps for a min and exits */
+			sleep(60);
+			exit(EXIT_SUCCESS);
+		}
+
+		/* Parent kills the child and waits for its death */
+		pidfd = sys_pidfd_open(child_pid, 0);
+		if (pidfd < 0)
+			ksft_exit_fail_msg("%s - pidfd_open failed\n",
+					   strerror(errno));
+
+		/* Setup 3 sec alarm - plenty of time */
+		if (signal(SIGALRM, handle_alarm) == SIG_ERR)
+			ksft_exit_fail_msg("%s - signal failed\n",
+					   strerror(errno));
+		alarm(3);
+
+		/* Send SIGKILL to the child */
+		if (sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0))
+			ksft_exit_fail_msg("%s - pidfd_send_signal failed\n",
+					   strerror(errno));
+
+		/* Wait for the death notification */
+		fds.fd = pidfd;
+		nevents = poll(&fds, 1, -1);
+
+		/* Check for error conditions */
+		if (nevents < 0) {
+			/*
+			 * poll() is interrupted (EINTR) when the alarm
+			 * handler runs, so an expired alarm shows up here
+			 * and not as a successful return.
+			 */
+			if (errno == EINTR && timeout)
+				ksft_exit_fail_msg(
+					"death notification wait timeout\n");
+			ksft_exit_fail_msg("%s - poll failed\n",
+					   strerror(errno));
+		}
+
+		if (nevents != 1)
+			ksft_exit_fail_msg("unexpected poll result: %d\n",
+					   nevents);
+
+		if (!(fds.revents & POLLIN))
+			ksft_exit_fail_msg(
+				"unexpected event type received: 0x%x\n",
+				fds.revents);
+
+		if (timeout)
+			ksft_exit_fail_msg(
+				"death notification wait timeout\n");
+
+		/* Disarm the alarm so it cannot fire during a later step. */
+		alarm(0);
+
+		close(pidfd);
+		/* Wait for child to prevent zombies */
+		if (waitpid(child_pid, NULL, 0) < 0)
+			ksft_exit_fail_msg("%s - waitpid failed\n",
+					   strerror(errno));
+
+	}
+
+	ksft_test_result_pass("pidfd poll test: pass\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c
new file mode 100644
index 000000000..6e2f2cd40
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include "pidfd.h"
+#include "../clone3/clone3_selftests.h"
+#include "../kselftest_harness.h"
+
+/*
+ * Indices into ns_info[] and into the per-process nsfds arrays of the
+ * fixture; PIDFD_NS_MAX is the number of entries.
+ */
+enum {
+ PIDFD_NS_USER,
+ PIDFD_NS_MNT,
+ PIDFD_NS_PID,
+ PIDFD_NS_UTS,
+ PIDFD_NS_IPC,
+ PIDFD_NS_NET,
+ PIDFD_NS_CGROUP,
+ PIDFD_NS_PIDCLD,
+ PIDFD_NS_TIME,
+ PIDFD_NS_MAX
+};
+
+/*
+ * Per-namespace description: @name is the entry name used below
+ * /proc/<pid>/ns/, @flag the matching CLONE_NEW* constant (0 for
+ * "pid_for_children", which has no flag of its own in this table).
+ */
+const struct ns_info {
+ const char *name;
+ int flag;
+} ns_info[] = {
+ [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, },
+ [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, },
+ [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, },
+ [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, },
+ [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, },
+ [PIDFD_NS_NET] = { "net", CLONE_NEWNET, },
+ [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, },
+ [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, },
+ [PIDFD_NS_TIME] = { "time", CLONE_NEWTIME, },
+};
+
+/*
+ * Fixture state: the test process itself, one child that has already exited,
+ * and two children that stay alive until teardown.
+ */
+FIXTURE(current_nsset)
+{
+ /* The test process: pid, pidfd and one fd per namespace entry. */
+ pid_t pid;
+ int pidfd;
+ int nsfds[PIDFD_NS_MAX];
+
+ /* Child that exits immediately after creation. */
+ pid_t child_pid_exited;
+ int child_pidfd_exited;
+
+ /* First long-lived child and its namespace fds. */
+ pid_t child_pid1;
+ int child_pidfd1;
+ int child_nsfds1[PIDFD_NS_MAX];
+
+ /* Second long-lived child and its namespace fds. */
+ pid_t child_pid2;
+ int child_pidfd2;
+ int child_nsfds2[PIDFD_NS_MAX];
+};
+
+/*
+ * Raw wrapper around the waitid syscall that passes NULL for both the
+ * siginfo and the rusage argument; returns the syscall() result.
+ */
+static int sys_waitid(int which, pid_t pid, int options)
+{
+ return syscall(__NR_waitid, which, pid, NULL, options, NULL);
+}
+
+/*
+ * Create a child through clone3() with CLONE_PIDFD plus the caller's @flags
+ * and SIGCHLD as exit signal. The pidfd of the child is stored in *@pidfd.
+ * Returns the sys_clone3() result (as with fork(), the child sees 0).
+ */
+pid_t create_child(int *pidfd, unsigned flags)
+{
+	struct __clone_args args = {
+		.flags		= CLONE_PIDFD | flags,
+		.exit_signal	= SIGCHLD,
+		.pidfd		= ptr_to_u64(pidfd),
+	};
+
+	/*
+	 * Pass the size of the object that is actually handed over. The
+	 * previous "sizeof(struct clone_args)" named a different type, which
+	 * is only correct as long as both layouts have the same size.
+	 */
+	return sys_clone3(&args, sizeof(args));
+}
+
+/*
+ * Unshare a new time namespace and then enter it via
+ * /proc/self/ns/time_for_children. Returns true only when every step
+ * succeeded.
+ */
+static bool switch_timens(void)
+{
+	bool entered;
+	int nsfd;
+
+	if (unshare(CLONE_NEWTIME) != 0)
+		return false;
+
+	nsfd = open("/proc/self/ns/time_for_children", O_RDONLY | O_CLOEXEC);
+	if (nsfd < 0)
+		return false;
+
+	entered = (setns(nsfd, CLONE_NEWTIME) == 0);
+	close(nsfd);
+
+	return entered;
+}
+
+/*
+ * read() that is retried for as long as it fails with EINTR. Returns the
+ * result of the first call that is not interrupted.
+ */
+static ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	for (;;) {
+		ssize_t nread = read(fd, buf, count);
+
+		if (nread >= 0 || errno != EINTR)
+			return nread;
+	}
+}
+
+/*
+ * write() that is retried for as long as it fails with EINTR. Returns the
+ * result of the first call that is not interrupted.
+ */
+static ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	for (;;) {
+		ssize_t nwritten = write(fd, buf, count);
+
+		if (nwritten >= 0 || errno != EINTR)
+			return nwritten;
+	}
+}
+
+/*
+ * Prepare the fixture:
+ *  - open one fd per namespace entry of the test process plus its pidfd,
+ *  - create a child that exits immediately (left unreaped via WNOWAIT),
+ *  - create two children in fresh namespaces that block in pause(), each
+ *    signalling readiness through a socketpair,
+ *  - open the namespace fds of both long-lived children.
+ * Namespace entries that do not exist (ENOENT) are tolerated and leave a
+ * negative value in the corresponding slot.
+ */
+FIXTURE_SETUP(current_nsset)
+{
+	int i, proc_fd, ret;
+	int ipc_sockets[2];
+	char c;
+
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		self->nsfds[i]		= -EBADF;
+		self->child_nsfds1[i]	= -EBADF;
+		self->child_nsfds2[i]	= -EBADF;
+	}
+
+	proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC);
+	ASSERT_GE(proc_fd, 0) {
+		TH_LOG("%m - Failed to open /proc/self/ns");
+	}
+
+	/* Namespace fds of the test process; opened exactly once. */
+	self->pid = getpid();
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+		self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC);
+		if (self->nsfds[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->pid);
+			}
+		}
+	}
+
+	/* Opened exactly once so that no descriptor is leaked. */
+	self->pidfd = sys_pidfd_open(self->pid, 0);
+	EXPECT_GE(self->pidfd, 0) {
+		TH_LOG("%m - Failed to open pidfd for process %d", self->pid);
+	}
+
+	/* Create task that exits right away. */
+	self->child_pid_exited = create_child(&self->child_pidfd_exited,
+					      CLONE_NEWUSER | CLONE_NEWNET);
+	/* Let the child leave before the parent-side expectation runs. */
+	if (self->child_pid_exited == 0)
+		_exit(EXIT_SUCCESS);
+
+	EXPECT_GT(self->child_pid_exited, 0);
+
+	ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	/* Create tasks that will be stopped. */
+	self->child_pid1 = create_child(&self->child_pidfd1,
+					CLONE_NEWUSER | CLONE_NEWNS |
+					CLONE_NEWCGROUP | CLONE_NEWIPC |
+					CLONE_NEWUTS | CLONE_NEWPID |
+					CLONE_NEWNET);
+	EXPECT_GE(self->child_pid1, 0);
+
+	if (self->child_pid1 == 0) {
+		close(ipc_sockets[0]);
+
+		if (!switch_timens())
+			_exit(EXIT_FAILURE);
+
+		/* Tell the parent that the namespaces are in place. */
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	EXPECT_EQ(ret, 0);
+
+	self->child_pid2 = create_child(&self->child_pidfd2,
+					CLONE_NEWUSER | CLONE_NEWNS |
+					CLONE_NEWCGROUP | CLONE_NEWIPC |
+					CLONE_NEWUTS | CLONE_NEWPID |
+					CLONE_NEWNET);
+	EXPECT_GE(self->child_pid2, 0);
+
+	if (self->child_pid2 == 0) {
+		close(ipc_sockets[0]);
+
+		if (!switch_timens())
+			_exit(EXIT_FAILURE);
+
+		if (write_nointr(ipc_sockets[1], "1", 1) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(ipc_sockets[1]);
+
+		pause();
+		_exit(EXIT_SUCCESS);
+	}
+
+	close(ipc_sockets[1]);
+	ASSERT_EQ(read_nointr(ipc_sockets[0], &c, 1), 1);
+	close(ipc_sockets[0]);
+
+	/* Namespace fds of the two long-lived children. */
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		char p[100];
+
+		const struct ns_info *info = &ns_info[i];
+
+		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+			       self->child_pid1, info->name);
+		EXPECT_GT(ret, 0);
+		EXPECT_LT(ret, sizeof(p));
+
+		self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC);
+		if (self->child_nsfds1[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->child_pid1);
+			}
+		}
+
+		ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s",
+			       self->child_pid2, info->name);
+		EXPECT_GT(ret, 0);
+		EXPECT_LT(ret, sizeof(p));
+
+		self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC);
+		if (self->child_nsfds2[i] < 0) {
+			EXPECT_EQ(errno, ENOENT) {
+				TH_LOG("%m - Failed to open %s namespace for process %d",
+				       info->name, self->child_pid2);
+			}
+		}
+	}
+
+	close(proc_fd);
+}
+
+/*
+ * Kill both long-lived children through their pidfds, close the namespace
+ * fds and child pidfds that were opened, and reap all three children.
+ */
+FIXTURE_TEARDOWN(current_nsset)
+{
+ int i;
+
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1,
+ SIGKILL, NULL, 0), 0);
+ ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2,
+ SIGKILL, NULL, 0), 0);
+
+ /* Slots that were never opened hold a negative value and are skipped. */
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ if (self->nsfds[i] >= 0)
+ close(self->nsfds[i]);
+ if (self->child_nsfds1[i] >= 0)
+ close(self->child_nsfds1[i]);
+ if (self->child_nsfds2[i] >= 0)
+ close(self->child_nsfds2[i]);
+ }
+
+ if (self->child_pidfd1 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd1));
+ if (self->child_pidfd2 >= 0)
+ EXPECT_EQ(0, close(self->child_pidfd2));
+ /* The exited child was only waited for with WNOWAIT during setup. */
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0);
+ ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0);
+}
+
+/*
+ * Open /proc/<pid>/ns/<ns> read-only with O_CLOEXEC.
+ *
+ * Returns the new file descriptor, -EIO if the path could not be formatted
+ * into the local buffer, or whatever open() returns on failure.
+ */
+static int preserve_ns(const int pid, const char *ns)
+{
+	char ns_path[50];
+	int len = snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/%s", pid, ns);
+
+	/* Output error from snprintf(). */
+	if (len < 0)
+		return -EIO;
+
+	/* The path did not fit and would have been truncated. */
+	if ((size_t)len >= sizeof(ns_path))
+		return -EIO;
+
+	return open(ns_path, O_RDONLY | O_CLOEXEC);
+}
+
+/*
+ * Check whether the namespace referred to by @ns_fd1 is the same as the @ns
+ * namespace of process @pid2, by comparing device and inode numbers.
+ *
+ * Returns 1 if they match, 0 if they differ, and -1 if either namespace
+ * could not be opened or stat'ed.
+ */
+static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns)
+{
+	struct stat st_ours, st_theirs;
+	int their_fd, err;
+
+	if (fstat(ns_fd1, &st_ours) < 0)
+		return -1;
+
+	their_fd = preserve_ns(pid2, ns);
+	if (their_fd < 0)
+		return -1;
+
+	/* The fd is only needed for the stat, so drop it right away. */
+	err = fstat(their_fd, &st_theirs);
+	close(their_fd);
+	if (err < 0)
+		return -1;
+
+	/* Same device and inode: same namespace (1); otherwise different (0). */
+	return st_ours.st_dev == st_theirs.st_dev &&
+	       st_ours.st_ino == st_theirs.st_ino;
+}
+
+/*
+ * Test that we can't pass garbage to the kernel: every setns() call below on
+ * the fixture's pidfd must fail with EINVAL.
+ */
+TEST_F(current_nsset, invalid_flags)
+{
+ /* No flags at all. */
+ ASSERT_NE(setns(self->pidfd, 0), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* Every bit set. */
+ ASSERT_NE(setns(self->pidfd, -1), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* CLONE_VM on its own. */
+ ASSERT_NE(setns(self->pidfd, CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+
+ /* CLONE_NEWUSER combined with CLONE_VM. */
+ ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0);
+ EXPECT_EQ(errno, EINVAL);
+}
+
+/*
+ * Test that we can't attach to a task that has already exited: setns() on
+ * child_pidfd_exited must fail with ESRCH and must leave every namespace of
+ * the calling process untouched.
+ */
+TEST_F(current_nsset, pidfd_exited_child)
+{
+ int i;
+ pid_t pid;
+
+ ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET),
+ 0);
+ EXPECT_EQ(errno, ESRCH);
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ /* Verify that we haven't changed any namespaces. */
+ /* Entries with a negative fd are skipped. */
+ if (self->nsfds[i] >= 0)
+ ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1);
+ }
+}
+
+/*
+ * Attach to child 1's namespaces one at a time through its pidfd, checking
+ * after each step that the caller ended up in the expected namespace.
+ */
+TEST_F(current_nsset, pidfd_incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ /* Skip namespace types for which no child fd is available. */
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ /* Entries with a zero flag are only verified, not attached to. */
+ if (info->flag) {
+ ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd1);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ /*
+ * For CLONE_NEWPID the comparison is against our own original fd.
+ * NOTE(review): presumably because setns() to a pid namespace only
+ * affects future children, not the caller itself - confirm.
+ */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1,
+ self->child_pidfd1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d",
+ info->name, self->child_pid1, self->child_pidfd1);
+ }
+}
+
+/*
+ * Same as pidfd_incremental_setns, but each setns() call is made on the
+ * individual namespace fd of child 1 instead of on its pidfd.
+ */
+TEST_F(current_nsset, nsfd_incremental_setns)
+{
+ int i;
+ pid_t pid;
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ /* Skip namespace types for which no child fd is available. */
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ /* Entries with a zero flag are only verified, not attached to. */
+ if (info->flag) {
+ ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) {
+ TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_nsfds1[i]);
+ }
+ }
+
+ /* Verify that we have changed to the correct namespaces. */
+ /*
+ * For CLONE_NEWPID the comparison is against our own original fd.
+ * NOTE(review): presumably because setns() to a pid namespace only
+ * affects future children, not the caller itself - confirm.
+ */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1,
+ self->child_nsfds1[i]);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d",
+ info->name, self->child_pid1, self->child_nsfds1[i]);
+ }
+}
+
+/*
+ * Attach to all available namespaces of child 1 with a single setns() call
+ * on its pidfd, then verify each namespace individually.
+ */
+TEST_F(current_nsset, pidfd_one_shot_setns)
+{
+ unsigned flags = 0;
+ int i;
+ pid_t pid;
+
+ /* Collect the flag of every namespace type we hold a child fd for. */
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ flags |= info->flag;
+ TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+ info->name, self->child_pid1);
+ }
+
+ ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+ TH_LOG("%m - Failed to setns to namespaces of %d",
+ self->child_pid1);
+ }
+
+ pid = getpid();
+ for (i = 0; i < PIDFD_NS_MAX; i++) {
+ const struct ns_info *info = &ns_info[i];
+ int nsfd;
+
+ if (self->child_nsfds1[i] < 0)
+ continue;
+
+ /* Verify that we have changed to the correct namespaces. */
+ /*
+ * For CLONE_NEWPID the comparison is against our own original fd.
+ * NOTE(review): presumably because setns() to a pid namespace only
+ * affects future children, not the caller itself - confirm.
+ */
+ if (info->flag == CLONE_NEWPID)
+ nsfd = self->nsfds[i];
+ else
+ nsfd = self->child_nsfds1[i];
+ ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) {
+ TH_LOG("setns failed to place us correctly into %s namespace of %d",
+ info->name, self->child_pid1);
+ }
+ TH_LOG("Managed to correctly setns to %s namespace of %d",
+ info->name, self->child_pid1);
+ }
+}
+
+/*
+ * Attach to all available namespaces of child 1 in one setns() call, then
+ * verify that every attempt to enter one of child 2's namespaces - through
+ * its pidfd as well as through the individual namespace fd - is rejected.
+ */
+TEST_F(current_nsset, no_foul_play)
+{
+	unsigned flags = 0;
+	int i;
+
+	/* Collect the flag of every namespace type we hold a child fd for. */
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		if (self->child_nsfds1[i] < 0)
+			continue;
+
+		flags |= info->flag;
+		if (info->flag) /* No use logging pid_for_children. */
+			TH_LOG("Adding %s namespace of %d to list of namespaces to attach to",
+			       info->name, self->child_pid1);
+	}
+
+	ASSERT_EQ(setns(self->child_pidfd1, flags), 0) {
+		TH_LOG("%m - Failed to setns to namespaces of %d via pidfd %d",
+		       self->child_pid1, self->child_pidfd1);
+	}
+
+	/*
+	 * Can't setns to a user namespace outside of our hierarchy since we
+	 * don't have caps in there and didn't create it. That means that under
+	 * no circumstances should we be able to setns to any of the other
+	 * ones since they aren't owned by our user namespace.
+	 */
+	for (i = 0; i < PIDFD_NS_MAX; i++) {
+		const struct ns_info *info = &ns_info[i];
+
+		/* Skip types without a child 2 fd and entries without a flag. */
+		if (self->child_nsfds2[i] < 0 || !info->flag)
+			continue;
+
+		/* Attempt via the pidfd of child 2. */
+		ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via pidfd %d",
+			       info->name, self->child_pid2,
+			       self->child_pidfd2);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d",
+		       info->name, self->child_pid2,
+		       self->child_pidfd2);
+
+		/* Attempt via the namespace fd of child 2. */
+		ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) {
+			TH_LOG("Managed to setns to %s namespace of %d via nsfd %d",
+			       info->name, self->child_pid2,
+			       self->child_nsfds2[i]);
+		}
+		TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d",
+		       info->name, self->child_pid2,
+		       self->child_nsfds2[i]);
+	}
+}
+
+/*
+ * setns() with no flags on a file descriptor that is not a namespace fd or
+ * pidfd (here: a memfd) must fail with EINVAL.
+ */
+TEST(setns_einval)
+{
+	int fd;
+
+	fd = sys_memfd_create("rostock", 0);
+	/*
+	 * 0 is a valid descriptor, and without a usable fd the setns() call
+	 * below would be testing nothing, so abort on failure.
+	 */
+	ASSERT_GE(fd, 0);
+
+	ASSERT_NE(setns(fd, 0), 0);
+	EXPECT_EQ(errno, EINVAL);
+	close(fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/pidfd/pidfd_test.c b/tools/testing/selftests/pidfd/pidfd_test.c
new file mode 100644
index 000000000..79f543ad3
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_test.c
@@ -0,0 +1,573 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest.h"
+
+#define str(s) _str(s)
+#define _str(s) #s
+#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
+
+#define MAX_EVENTS 5
+
+static bool have_pidfd_send_signal;
+
+/*
+ * Start a child running @fn via clone() (or __clone2() on ia64) with @flags
+ * plus SIGCHLD. @pidfd is handed to clone() in the argument slot that follows
+ * the NULL fn-argument.
+ * NOTE(review): presumably the kernel stores the new pidfd there when
+ * CLONE_PIDFD is part of @flags - confirm against clone(2).
+ *
+ * Returns whatever clone()/__clone2() returns.
+ */
+static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
+{
+ /*
+ * The child's stack is an array of 1024 pointers in this function's
+ * frame; "stack + stack_size" is one element past its end.
+ * NOTE(review): stack_size counts array elements, yet the ia64 path
+ * passes it to __clone2() as the stack size - confirm the intended unit.
+ */
+ size_t stack_size = 1024;
+ char *stack[1024] = { 0 };
+
+#ifdef __ia64__
+ return __clone2(fn, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
+#else
+ return clone(fn, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
+#endif
+}
+
+/*
+ * Flag raised by the SIGUSR1 handler below. It is written from signal
+ * context, so it is declared volatile sig_atomic_t.
+ */
+static volatile sig_atomic_t signal_received;
+
+/* Signal handler: record that SIGUSR1 was delivered. */
+static void set_signal_received_on_sigusr1(int sig)
+{
+	if (sig == SIGUSR1)
+		signal_received = 1;
+}
+
+/*
+ * Straightforward test to see whether pidfd_send_signal() works is to send
+ * a signal to ourself.
+ *
+ * Reports a skip when pidfd_send_signal() is unavailable, otherwise a pass;
+ * any failure ends the run via ksft_exit_fail_msg(). Always returns 0.
+ */
+static int test_pidfd_send_signal_simple_success(void)
+{
+ int pidfd, ret;
+ const char *test_name = "pidfd_send_signal send SIGUSR1";
+
+ if (!have_pidfd_send_signal) {
+ ksft_test_result_skip(
+ "%s test: pidfd_send_signal() syscall not supported\n",
+ test_name);
+ return 0;
+ }
+
+ /* The /proc/self directory fd is what gets passed as the pidfd. */
+ pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
+ if (pidfd < 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to open process file descriptor\n",
+ test_name);
+
+ signal(SIGUSR1, set_signal_received_on_sigusr1);
+
+ ret = sys_pidfd_send_signal(pidfd, SIGUSR1, NULL, 0);
+ close(pidfd);
+ if (ret < 0)
+ ksft_exit_fail_msg("%s test: Failed to send signal\n",
+ test_name);
+
+ /* The handler must have set the flag by the time we get here. */
+ if (signal_received != 1)
+ ksft_exit_fail_msg("%s test: Failed to receive signal\n",
+ test_name);
+
+ /* Reset the flag so later code starts from a clean state. */
+ signal_received = 0;
+ ksft_test_result_pass("%s test: Sent signal\n", test_name);
+ return 0;
+}
+
+/*
+ * Check that signalling a process that has exited and been waited for fails
+ * with ESRCH when done through a /proc/<pid> fd opened before the wait.
+ *
+ * Reports a skip when pidfd_send_signal() is unavailable, otherwise a pass;
+ * any failure ends the run via ksft_exit_fail_msg(). Always returns 0.
+ */
+static int test_pidfd_send_signal_exited_fail(void)
+{
+ int pidfd, ret, saved_errno;
+ char buf[256];
+ pid_t pid;
+ const char *test_name = "pidfd_send_signal signal exited process";
+
+ if (!have_pidfd_send_signal) {
+ ksft_test_result_skip(
+ "%s test: pidfd_send_signal() syscall not supported\n",
+ test_name);
+ return 0;
+ }
+
+ pid = fork();
+ if (pid < 0)
+ ksft_exit_fail_msg("%s test: Failed to create new process\n",
+ test_name);
+
+ /* The child exits immediately. */
+ if (pid == 0)
+ _exit(EXIT_SUCCESS);
+
+ snprintf(buf, sizeof(buf), "/proc/%d", pid);
+
+ /* Open the fd before waiting on the child. */
+ pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+
+ (void)wait_for_pid(pid);
+
+ if (pidfd < 0)
+ ksft_exit_fail_msg(
+ "%s test: Failed to open process file descriptor\n",
+ test_name);
+
+ /* Signal 0 delivers nothing; only the result of the call matters. */
+ ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
+ /* Save errno now: close() below may overwrite it. */
+ saved_errno = errno;
+ close(pidfd);
+ if (ret == 0)
+ ksft_exit_fail_msg(
+ "%s test: Managed to send signal to process even though it should have failed\n",
+ test_name);
+
+ if (saved_errno != ESRCH)
+ ksft_exit_fail_msg(
+ "%s test: Expected to receive ESRCH as errno value but received %d instead\n",
+ test_name, saved_errno);
+
+ ksft_test_result_pass("%s test: Failed to send signal as expected\n",
+ test_name);
+ return 0;
+}
+
+/*
+ * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
+ * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
+ * times then we skip the test to not go into an infinite loop or block for a
+ * long time.
+ */
+#define PIDFD_MAX_DEFAULT 0x8000
+
+/*
+ * Check that a pidfd keeps referring to the original process after its pid
+ * number has been handed out again: signalling through the stale fd must
+ * fail with ESRCH.
+ *
+ * The work is done by a child forked after unsharing the pid and mount
+ * namespaces. That child first forks until a process with pid PID_RECYCLE
+ * appears and opens /proc/PID_RECYCLE for it, then keeps forking until
+ * PID_RECYCLE is assigned again and signals through the old fd. The child
+ * reports the outcome through its exit status (PIDFD_* codes), which the
+ * parent maps to a kselftest result.
+ *
+ * Returns 0 when the test passed or was skipped; failures end the run via
+ * ksft_exit_fail_msg().
+ */
+static int test_pidfd_send_signal_recycled_pid_fail(void)
+{
+ int i, ret;
+ pid_t pid1;
+ const char *test_name = "pidfd_send_signal signal recycled pid";
+
+ if (!have_pidfd_send_signal) {
+ ksft_test_result_skip(
+ "%s test: pidfd_send_signal() syscall not supported\n",
+ test_name);
+ return 0;
+ }
+
+ /* Lacking permission to unshare is a skip, not a failure. */
+ ret = unshare(CLONE_NEWPID);
+ if (ret < 0) {
+ if (errno == EPERM) {
+ ksft_test_result_skip("%s test: Unsharing pid namespace not permitted\n",
+ test_name);
+ return 0;
+ }
+ ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n",
+ test_name);
+ }
+
+ ret = unshare(CLONE_NEWNS);
+ if (ret < 0) {
+ if (errno == EPERM) {
+ ksft_test_result_skip("%s test: Unsharing mount namespace not permitted\n",
+ test_name);
+ return 0;
+ }
+ ksft_exit_fail_msg("%s test: Failed to unshare mount namespace\n",
+ test_name);
+ }
+
+ /* Make all mounts private before /proc is remounted in the child. */
+ ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
+ if (ret < 0)
+ ksft_exit_fail_msg("%s test: Failed to remount / private\n",
+ test_name);
+
+ /* pid 1 in new pid namespace */
+ pid1 = fork();
+ if (pid1 < 0)
+ ksft_exit_fail_msg("%s test: Failed to create new process\n",
+ test_name);
+
+ if (pid1 == 0) {
+ char buf[256];
+ pid_t pid2;
+ int pidfd = -1;
+
+ /* Replace /proc with a fresh procfs mount. */
+ (void)umount2("/proc", MNT_DETACH);
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret < 0)
+ _exit(PIDFD_ERROR);
+
+ /* grab pid PID_RECYCLE */
+ for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
+ pid2 = fork();
+ if (pid2 < 0)
+ _exit(PIDFD_ERROR);
+
+ if (pid2 == 0)
+ _exit(PIDFD_PASS);
+
+ /* Open the fd before the child is waited for below. */
+ if (pid2 == PID_RECYCLE) {
+ snprintf(buf, sizeof(buf), "/proc/%d", pid2);
+ ksft_print_msg("pid to recycle is %d\n", pid2);
+ pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
+ }
+
+ if (wait_for_pid(pid2))
+ _exit(PIDFD_ERROR);
+
+ if (pid2 >= PID_RECYCLE)
+ break;
+ }
+
+ /*
+ * We want to be as predictable as we can so if we haven't been
+ * able to grab pid PID_RECYCLE skip the test.
+ */
+ if (pid2 != PID_RECYCLE) {
+ /* skip test */
+ /* pidfd is still -1 on this path, since it was never opened. */
+ close(pidfd);
+ _exit(PIDFD_SKIP);
+ }
+
+ if (pidfd < 0)
+ _exit(PIDFD_ERROR);
+
+ /* Fork until the pid number PID_RECYCLE is handed out again. */
+ for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
+ char c;
+ int pipe_fds[2];
+ pid_t recycled_pid;
+ int child_ret = PIDFD_PASS;
+
+ ret = pipe2(pipe_fds, O_CLOEXEC);
+ if (ret < 0)
+ _exit(PIDFD_ERROR);
+
+ recycled_pid = fork();
+ if (recycled_pid < 0)
+ _exit(PIDFD_ERROR);
+
+ /* The child blocks on the pipe until the parent closes its write end. */
+ if (recycled_pid == 0) {
+ close(pipe_fds[1]);
+ (void)read(pipe_fds[0], &c, 1);
+ close(pipe_fds[0]);
+
+ _exit(PIDFD_PASS);
+ }
+
+ /*
+ * Stop the child so we can inspect whether we have
+ * recycled pid PID_RECYCLE.
+ */
+ close(pipe_fds[0]);
+ ret = kill(recycled_pid, SIGSTOP);
+ close(pipe_fds[1]);
+ if (ret) {
+ (void)wait_for_pid(recycled_pid);
+ _exit(PIDFD_ERROR);
+ }
+
+ /*
+ * We have recycled the pid. Try to signal it. This
+ * needs to fail since this is a different process than
+ * the one the pidfd refers to.
+ */
+ if (recycled_pid == PID_RECYCLE) {
+ ret = sys_pidfd_send_signal(pidfd, SIGCONT,
+ NULL, 0);
+ if (ret && errno == ESRCH)
+ child_ret = PIDFD_XFAIL;
+ else
+ child_ret = PIDFD_FAIL;
+ }
+
+ /* let the process move on */
+ ret = kill(recycled_pid, SIGCONT);
+ if (ret)
+ (void)kill(recycled_pid, SIGKILL);
+
+ if (wait_for_pid(recycled_pid))
+ _exit(PIDFD_ERROR);
+
+ /* A verdict (FAIL/XFAIL) ends the child; PASS means keep looking. */
+ switch (child_ret) {
+ case PIDFD_FAIL:
+ /* fallthrough */
+ case PIDFD_XFAIL:
+ _exit(child_ret);
+ case PIDFD_PASS:
+ break;
+ default:
+ /* not reached */
+ _exit(PIDFD_ERROR);
+ }
+
+ /*
+ * If the user set a custom pid_max limit we could be
+ * in the millions.
+ * Skip the test in this case.
+ */
+ if (recycled_pid > PIDFD_MAX_DEFAULT)
+ _exit(PIDFD_SKIP);
+ }
+
+ /* failed to recycle pid */
+ _exit(PIDFD_SKIP);
+ }
+
+ /* Translate the child's exit code into a kselftest result. */
+ ret = wait_for_pid(pid1);
+ switch (ret) {
+ case PIDFD_FAIL:
+ ksft_exit_fail_msg(
+ "%s test: Managed to signal recycled pid %d\n",
+ test_name, PID_RECYCLE);
+ case PIDFD_PASS:
+ ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n",
+ test_name, PID_RECYCLE);
+ case PIDFD_SKIP:
+ ksft_test_result_skip("%s test: Skipping test\n", test_name);
+ ret = 0;
+ break;
+ case PIDFD_XFAIL:
+ ksft_test_result_pass(
+ "%s test: Failed to signal recycled pid as expected\n",
+ test_name);
+ ret = 0;
+ break;
+ default /* PIDFD_ERROR */:
+ ksft_exit_fail_msg("%s test: Error while running tests\n",
+ test_name);
+ }
+
+ return ret;
+}
+
+/*
+ * Probe whether the kernel implements pidfd_send_signal() by sending signal 0
+ * to ourselves through a /proc/self fd.
+ *
+ * Sets have_pidfd_send_signal on success and reports a pass; reports a skip
+ * when the syscall returns ENOSYS; any other failure ends the run via
+ * ksft_exit_fail_msg(). Always returns 0.
+ */
+static int test_pidfd_send_signal_syscall_support(void)
+{
+	int pidfd, ret, saved_errno;
+	const char *test_name = "pidfd_send_signal check for support";
+
+	pidfd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
+	if (pidfd < 0)
+		ksft_exit_fail_msg(
+			"%s test: Failed to open process file descriptor\n",
+			test_name);
+
+	ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
+	/*
+	 * Close the fd on every path (the skip path used to leak it) and
+	 * keep errno from being overwritten by close().
+	 */
+	saved_errno = errno;
+	close(pidfd);
+	if (ret < 0) {
+		if (saved_errno == ENOSYS) {
+			ksft_test_result_skip(
+				"%s test: pidfd_send_signal() syscall not supported\n",
+				test_name);
+			return 0;
+		}
+		ksft_exit_fail_msg("%s test: Failed to send signal\n",
+				   test_name);
+	}
+
+	have_pidfd_send_signal = true;
+	ksft_test_result_pass(
+		"%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n",
+		test_name);
+	return 0;
+}
+
+/*
+ * Thread body for the exec test: replace the process image with
+ * "/bin/sleep CHILD_THREAD_MIN_WAIT" from a non-leader thread.
+ */
+static void *test_pidfd_poll_exec_thread(void *priv)
+{
+ ksft_print_msg("Child Thread: starting. pid %d tid %ld ; and sleeping\n",
+ getpid(), syscall(SYS_gettid));
+ ksft_print_msg("Child Thread: doing exec of sleep\n");
+
+ /* str() turns the CHILD_THREAD_MIN_WAIT macro into a string argument. */
+ execl("/bin/sleep", "sleep", str(CHILD_THREAD_MIN_WAIT), (char *)NULL);
+
+ /* Only reached if execl() failed. */
+ ksft_print_msg("Child Thread: DONE. pid %d tid %ld\n",
+ getpid(), syscall(SYS_gettid));
+ return NULL;
+}
+
+/*
+ * Wait up to 5 seconds for @pidfd to become readable using epoll.
+ *
+ * Returns normally only if exactly one EPOLLIN event was reported; every
+ * other outcome ends the run via ksft_exit_fail_msg(). @test_name is used as
+ * a prefix in failure messages.
+ */
+static void poll_pidfd(const char *test_name, int pidfd)
+{
+	int c;
+	int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+	struct epoll_event event, events[MAX_EVENTS];
+
+	if (epoll_fd == -1)
+		ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor "
+				   "(errno %d)\n",
+				   test_name, errno);
+
+	/*
+	 * Zero both so that events[0].events holds a defined value when it is
+	 * printed after a timeout or an epoll_wait() error.
+	 */
+	memset(&event, 0, sizeof(event));
+	memset(events, 0, sizeof(events));
+
+	event.events = EPOLLIN;
+	event.data.fd = pidfd;
+
+	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
+		ksft_exit_fail_msg("%s test: Failed to add epoll file descriptor "
+				   "(errno %d)\n",
+				   test_name, errno);
+	}
+
+	c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
+	/*
+	 * The two literals form one format string; a stray comma between
+	 * them used to turn the second one into the first vararg.
+	 */
+	if (c != 1 || !(events[0].events & EPOLLIN))
+		ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) "
+				   "(errno %d)\n",
+				   test_name, c, events[0].events, errno);
+
+	close(epoll_fd);
+}
+
+/*
+ * Entry point of the cloned child for the exec test: start a thread that
+ * execs, then loop forever in the group leader.
+ */
+static int child_poll_exec_test(void *args)
+{
+ pthread_t t1;
+
+ ksft_print_msg("Child (pidfd): starting. pid %d tid %ld\n", getpid(),
+ syscall(SYS_gettid));
+ pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
+ /*
+ * Exec in the non-leader thread will destroy the leader immediately.
+ * If the wait in the parent returns too soon, the test fails.
+ */
+ while (1)
+ sleep(1);
+}
+
+/*
+ * Check that the parent is not notified prematurely when a non-leader thread
+ * of the child calls exec.
+ *
+ * @use_waitpid: non-zero waits with waitpid(), zero polls the pidfd instead.
+ *
+ * Passes when the elapsed time lies within
+ * [CHILD_THREAD_MIN_WAIT, CHILD_THREAD_MIN_WAIT + 2] seconds; otherwise the
+ * run ends via ksft_exit_fail_msg().
+ */
+static void test_pidfd_poll_exec(int use_waitpid)
+{
+ int pid, pidfd = 0;
+ int status, ret;
+ time_t prog_start = time(NULL);
+ const char *test_name = "pidfd_poll check for premature notification on child thread exec";
+
+ ksft_print_msg("Parent: pid: %d\n", getpid());
+ pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_exec_test);
+ if (pid < 0)
+ ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+ test_name, pid, errno);
+
+ ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+ if (use_waitpid) {
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1)
+ ksft_print_msg("Parent: error\n");
+
+ if (ret == pid)
+ ksft_print_msg("Parent: Child process waited for.\n");
+ } else {
+ poll_pidfd(test_name, pidfd);
+ }
+
+ /* Seconds elapsed since this function was entered. */
+ time_t prog_time = time(NULL) - prog_start;
+
+ ksft_print_msg("Time waited for child: %lu\n", prog_time);
+
+ close(pidfd);
+
+ /* Fail on an early return, or on one more than two seconds late. */
+ if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
+ ksft_exit_fail_msg("%s test: Failed\n", test_name);
+ else
+ ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+/*
+ * Thread body for the leader-exit test: sleep for CHILD_THREAD_MIN_WAIT
+ * seconds and return.
+ */
+static void *test_pidfd_poll_leader_exit_thread(void *priv)
+{
+ ksft_print_msg("Child Thread: starting. pid %d tid %ld ; and sleeping\n",
+ getpid(), syscall(SYS_gettid));
+ sleep(CHILD_THREAD_MIN_WAIT);
+ ksft_print_msg("Child Thread: DONE. pid %d tid %ld\n", getpid(), syscall(SYS_gettid));
+ return NULL;
+}
+
+/*
+ * Points at a MAP_SHARED mapping set up by test_pidfd_poll_leader_exit();
+ * the child stores the time of its leader's exit there for the parent.
+ */
+static time_t *child_exit_secs;
+/*
+ * Entry point of the cloned child for the leader-exit test: start two
+ * sleeping threads, record the current time, then exit only the leader.
+ */
+static int child_poll_leader_exit_test(void *args)
+{
+ pthread_t t1, t2;
+
+ ksft_print_msg("Child: starting. pid %d tid %ld\n", getpid(), syscall(SYS_gettid));
+ pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+ pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
+
+ /*
+ * glibc exit calls exit_group syscall, so explicitly call exit only
+ * so that only the group leader exits, leaving the threads alone.
+ */
+ *child_exit_secs = time(NULL);
+ syscall(SYS_exit, 0);
+ /* Never reached, but appeases compiler thinking we should return. */
+ exit(0);
+}
+
+/*
+ * Check that the parent is not notified prematurely when the child's thread
+ * group leader exits while other threads of the group are still running.
+ *
+ * @use_waitpid: non-zero waits with waitpid(), zero polls the pidfd instead.
+ *
+ * The child records the time of its leader's exit in a shared mapping. The
+ * test passes when the parent's wait returns between CHILD_THREAD_MIN_WAIT
+ * and CHILD_THREAD_MIN_WAIT + 2 seconds after that moment; otherwise the run
+ * ends via ksft_exit_fail_msg().
+ */
+static void test_pidfd_poll_leader_exit(int use_waitpid)
+{
+	int pid, pidfd = 0;
+	int status, ret = 0;
+	/* Note the trailing space: the two literals are concatenated. */
+	const char *test_name = "pidfd_poll check for premature notification on non-empty "
+				"group leader exit";
+
+	/* Shared mapping, so the child's write is visible to the parent. */
+	child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+
+	if (child_exit_secs == MAP_FAILED)
+		ksft_exit_fail_msg("%s test: mmap failed (errno %d)\n",
+				   test_name, errno);
+
+	ksft_print_msg("Parent: pid: %d\n", getpid());
+	pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_leader_exit_test);
+	if (pid < 0)
+		ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
+				   test_name, pid, errno);
+
+	ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);
+
+	if (use_waitpid) {
+		ret = waitpid(pid, &status, 0);
+		if (ret == -1)
+			ksft_print_msg("Parent: error\n");
+	} else {
+		/*
+		 * This sleep tests for the case where if the child exits, and is in
+		 * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll
+		 * doesn't prematurely return even though there are active threads
+		 */
+		sleep(1);
+		poll_pidfd(test_name, pidfd);
+	}
+
+	if (ret == pid)
+		ksft_print_msg("Parent: Child process waited for.\n");
+
+	time_t since_child_exit = time(NULL) - *child_exit_secs;
+
+	ksft_print_msg("Time since child exit: %lu\n", since_child_exit);
+
+	close(pidfd);
+
+	/*
+	 * The timestamp has been consumed; release the mapping so repeated
+	 * invocations of this test do not leak one mapping each.
+	 */
+	munmap(child_exit_secs, sizeof *child_exit_secs);
+	child_exit_secs = NULL;
+
+	if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
+			since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
+		ksft_exit_fail_msg("%s test: Failed\n", test_name);
+	else
+		ksft_test_result_pass("%s test: Passed\n", test_name);
+}
+
+/* Entry point: declare a plan of 8 results and run the tests in order. */
+int main(int argc, char **argv)
+{
+ ksft_print_header();
+ ksft_set_plan(8);
+
+ /* Each poll test runs twice: 0 polls the pidfd, 1 uses waitpid(). */
+ test_pidfd_poll_exec(0);
+ test_pidfd_poll_exec(1);
+ test_pidfd_poll_leader_exit(0);
+ test_pidfd_poll_leader_exit(1);
+ /* Runs first among the send_signal tests: it sets have_pidfd_send_signal. */
+ test_pidfd_send_signal_syscall_support();
+ test_pidfd_send_signal_simple_success();
+ test_pidfd_send_signal_exited_fail();
+ test_pidfd_send_signal_recycled_pid_fail();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/pidfd/pidfd_wait.c b/tools/testing/selftests/pidfd/pidfd_wait.c
new file mode 100644
index 000000000..17999e082
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_wait.c
@@ -0,0 +1,224 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sched.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "../kselftest_harness.h"
+
+#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
+
+/* Attempt to de-conflict with the selftests tree. */
+#ifndef SKIP
+#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
+#endif
+
+/* Invoke the raw clone3 syscall with @args and the size of its struct. */
+static pid_t sys_clone3(struct clone_args *args)
+{
+	const size_t args_size = sizeof(*args);
+
+	return syscall(__NR_clone3, args, args_size);
+}
+
+/*
+ * Invoke the raw waitid syscall, passing all five arguments through
+ * unchanged (including the rusage pointer @ru).
+ */
+static int sys_waitid(int which, pid_t pid, siginfo_t *info, int options,
+ struct rusage *ru)
+{
+ return syscall(__NR_waitid, which, pid, info, options, ru);
+}
+
+/*
+ * Basic waitid(P_PIDFD) checks: waiting on a /proc/self directory fd and on
+ * a /dev/null fd must not return 0, while waiting on the pidfd of a child
+ * created with clone3() must report a normal exit.
+ */
+TEST(wait_simple)
+{
+ int pidfd = -1;
+ pid_t parent_tid = -1;
+ /* clone3() is given the addresses of pidfd and parent_tid to fill in. */
+ struct clone_args args = {
+ .parent_tid = ptr_to_u64(&parent_tid),
+ .pidfd = ptr_to_u64(&pidfd),
+ .flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
+ .exit_signal = SIGCHLD,
+ };
+ pid_t pid;
+ siginfo_t info = {
+ .si_signo = 0,
+ };
+
+ /* Case 1: a /proc/self directory fd. */
+ pidfd = open("/proc/self", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ASSERT_NE(pid, 0);
+ EXPECT_EQ(close(pidfd), 0);
+ pidfd = -1;
+
+ /* Case 2: an fd that does not refer to a process at all. */
+ pidfd = open("/dev/null", O_RDONLY | O_CLOEXEC);
+ ASSERT_GE(pidfd, 0);
+
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ASSERT_NE(pid, 0);
+ EXPECT_EQ(close(pidfd), 0);
+ pidfd = -1;
+
+ /* Case 3: a real child that exits immediately. */
+ pid = sys_clone3(&args);
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ pid = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+ ASSERT_GE(pid, 0);
+ ASSERT_EQ(WIFEXITED(info.si_status), true);
+ ASSERT_EQ(WEXITSTATUS(info.si_status), 0);
+ EXPECT_EQ(close(pidfd), 0);
+
+ /* The reported pid must match the tid stored via CLONE_PARENT_SETTID. */
+ ASSERT_EQ(info.si_signo, SIGCHLD);
+ ASSERT_EQ(info.si_code, CLD_EXITED);
+ ASSERT_EQ(info.si_pid, parent_tid);
+}
+
+/*
+ * Walk a child through stopped -> continued -> stopped -> killed and check
+ * that waitid(P_PIDFD) reports each state change with the matching si_code.
+ */
+TEST(wait_states)
+{
+	int pidfd = -1;
+	pid_t parent_tid = -1;
+	/* clone3() is given the addresses of pidfd and parent_tid to fill in. */
+	struct clone_args args = {
+		.parent_tid = ptr_to_u64(&parent_tid),
+		.pidfd = ptr_to_u64(&pidfd),
+		.flags = CLONE_PIDFD | CLONE_PARENT_SETTID,
+		.exit_signal = SIGCHLD,
+	};
+	pid_t pid;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+	pid = sys_clone3(&args);
+	ASSERT_GE(pid, 0);
+
+	/* The child stops itself twice, once per stop the parent waits for. */
+	if (pid == 0) {
+		kill(getpid(), SIGSTOP);
+		kill(getpid(), SIGSTOP);
+		exit(EXIT_SUCCESS);
+	}
+
+	/* First stop. */
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
+
+	/* Continued by the SIGCONT just sent. */
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WCONTINUED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_CONTINUED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	/* Second stop. */
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WUNTRACED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0), 0);
+
+	/* Killed while stopped. */
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_KILLED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+/*
+ * Check waitid(P_PIDFD) semantics for pidfds opened with PIDFD_NONBLOCK:
+ * ECHILD when the pidfd does not refer to a child, EAGAIN while the child has
+ * not exited, 0 when WNOHANG is passed explicitly, and ordinary blocking
+ * behaviour once O_NONBLOCK has been cleared again.
+ *
+ * The test is skipped when pidfd_open() rejects PIDFD_NONBLOCK with EINVAL.
+ */
+TEST(wait_nonblock)
+{
+	int pidfd;
+	/* Signed, so that a failing fcntl(F_GETFL) (-1) is caught below. */
+	int flags = 0;
+	pid_t parent_tid = -1;
+	struct clone_args args = {
+		.parent_tid = ptr_to_u64(&parent_tid),
+		.flags = CLONE_PARENT_SETTID,
+		.exit_signal = SIGCHLD,
+	};
+	int ret;
+	pid_t pid;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+	/*
+	 * Callers need to see ECHILD with non-blocking pidfds when no child
+	 * processes exists.
+	 */
+	pidfd = sys_pidfd_open(getpid(), PIDFD_NONBLOCK);
+	EXPECT_GE(pidfd, 0) {
+		/* pidfd_open() doesn't support PIDFD_NONBLOCK. */
+		ASSERT_EQ(errno, EINVAL);
+		SKIP(return, "Skipping PIDFD_NONBLOCK test");
+	}
+
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, ECHILD);
+	EXPECT_EQ(close(pidfd), 0);
+
+	pid = sys_clone3(&args);
+	ASSERT_GE(pid, 0);
+
+	/* The child stops itself, so it is alive but not exited below. */
+	if (pid == 0) {
+		kill(getpid(), SIGSTOP);
+		exit(EXIT_SUCCESS);
+	}
+
+	pidfd = sys_pidfd_open(pid, PIDFD_NONBLOCK);
+	EXPECT_GE(pidfd, 0) {
+		/* pidfd_open() doesn't support PIDFD_NONBLOCK. */
+		ASSERT_EQ(errno, EINVAL);
+		SKIP(return, "Skipping PIDFD_NONBLOCK test");
+	}
+
+	/* The new pidfd must report O_NONBLOCK among its status flags. */
+	flags = fcntl(pidfd, F_GETFL, 0);
+	ASSERT_GT(flags, 0);
+	ASSERT_GT((flags & O_NONBLOCK), 0);
+
+	/*
+	 * Callers need to see EAGAIN/EWOULDBLOCK with non-blocking pidfd when
+	 * child processes exist but none have exited.
+	 */
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL);
+	ASSERT_LT(ret, 0);
+	ASSERT_EQ(errno, EAGAIN);
+
+	/*
+	 * Callers need to continue seeing 0 with non-blocking pidfd and
+	 * WNOHANG raised explicitly when child processes exist but none have
+	 * exited.
+	 */
+	ret = sys_waitid(P_PIDFD, pidfd, &info, WEXITED | WNOHANG, NULL);
+	ASSERT_EQ(ret, 0);
+
+	/* Switch the pidfd back to blocking mode for the remaining waits. */
+	ASSERT_EQ(fcntl(pidfd, F_SETFL, (flags & ~O_NONBLOCK)), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WSTOPPED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_STOPPED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	ASSERT_EQ(sys_pidfd_send_signal(pidfd, SIGCONT, NULL, 0), 0);
+
+	ASSERT_EQ(sys_waitid(P_PIDFD, pidfd, &info, WEXITED, NULL), 0);
+	ASSERT_EQ(info.si_signo, SIGCHLD);
+	ASSERT_EQ(info.si_code, CLD_EXITED);
+	ASSERT_EQ(info.si_pid, parent_tid);
+
+	EXPECT_EQ(close(pidfd), 0);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
new file mode 100644
index 000000000..0830e6381
--- /dev/null
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -0,0 +1,77 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for powerpc selftests
+
+# ARCH can be overridden by the user for cross compiling
+ARCH ?= $(shell uname -m)
+# Map any value matching ppc.* (substring match) onto "powerpc" for the check below.
+ARCH := $(shell echo $(ARCH) | sed -e s/ppc.*/powerpc/)
+
+# Everything in this conditional is only set up when building for powerpc;
+# on other architectures SUB_DIRS stays empty.
+ifeq ($(ARCH),powerpc)
+
+# Version string embedded into the tests; "unknown" if git describe fails.
+GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
+
+# Prepend our flags to any CFLAGS supplied by the caller and export the
+# result to the sub-makes.
+CFLAGS := -std=gnu99 -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)/include $(CFLAGS)
+
+export CFLAGS
+
+# Sub-directories, each built and run through its own Makefile.
+SUB_DIRS = alignment \
+ benchmarks \
+ cache_shape \
+ copyloops \
+ dscr \
+ mm \
+ nx-gzip \
+ pmu \
+ signal \
+ primitives \
+ stringloops \
+ switch_endian \
+ syscalls \
+ tm \
+ eeh \
+ vphn \
+ math \
+ ptrace \
+ security
+
+endif
+
+# Default target: build every sub-directory.
+all: $(SUB_DIRS)
+
+# Build one sub-directory into $(OUTPUT)/<dir>, creating that directory first.
+# -k lets the sub-make keep going after a failed target.
+$(SUB_DIRS):
+ BUILD_TARGET=$(OUTPUT)/$@; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $@ all
+
+include ../lib.mk
+
+# Run the tests by invoking "run_tests" in every sub-directory.
+# NOTE(review): presumably replaces the RUN_TESTS definition from ../lib.mk - confirm.
+override define RUN_TESTS
+ @for TARGET in $(SUB_DIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests;\
+ done;
+endef
+
+# Install by invoking "install" in every sub-directory.
+# NOTE(review): presumably replaces the INSTALL_RULE definition from ../lib.mk - confirm.
+override define INSTALL_RULE
+ @for TARGET in $(SUB_DIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install;\
+ done;
+endef
+
+# Emit the test list by invoking "emit_tests" in every sub-directory; -s keeps
+# make itself silent.
+# NOTE(review): presumably replaces the EMIT_TESTS definition from ../lib.mk - confirm.
+override define EMIT_TESTS
+ @for TARGET in $(SUB_DIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests;\
+ done;
+endef
+
+# Clean every sub-directory, then remove the tags file created by "tags".
+# NOTE(review): presumably replaces the CLEAN definition from ../lib.mk - confirm.
+override define CLEAN
+ @for TARGET in $(SUB_DIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$TARGET; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean; \
+ done;
+ rm -f tags
+endef
+
+# Build a ctags index over all .c and .h files below this directory.
+tags:
+ find . -name '*.c' -o -name '*.h' | xargs ctags
+
+# These targets are names of actions or directories, never up-to-date files.
+.PHONY: tags $(SUB_DIRS)
diff --git a/tools/testing/selftests/powerpc/alignment/.gitignore b/tools/testing/selftests/powerpc/alignment/.gitignore
new file mode 100644
index 000000000..28bc6ca13
--- /dev/null
+++ b/tools/testing/selftests/powerpc/alignment/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+copy_first_unaligned
+alignment_handler
diff --git a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile
new file mode 100644
index 000000000..93e9af374
--- /dev/null
+++ b/tools/testing/selftests/powerpc/alignment/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Test programs generated in this directory.
+TEST_GEN_PROGS := copy_first_unaligned alignment_handler
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# Every test program also depends on the shared powerpc harness and utils sources.
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/alignment/alignment_handler.c b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
new file mode 100644
index 000000000..c25cf7cd4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/alignment/alignment_handler.c
@@ -0,0 +1,689 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test the powerpc alignment handler on POWER8/POWER9
+ *
+ * Copyright (C) 2017 IBM Corporation (Michael Neuling, Andrew Donnellan)
+ */
+
+/*
+ * This selftest exercises the powerpc alignment fault handler.
+ *
+ * We create two sets of source and destination buffers, one in regular memory,
+ * the other cache-inhibited (by default we use /dev/fb0 for this, but an
+ * alternative path for cache-inhibited memory may be provided).
+ *
+ * One way to get cache-inhibited memory is to use the "mem" kernel parameter
+ * to limit the kernel to less memory than actually exists. Addresses above
+ * the limit may still be accessed but will be treated as cache-inhibited. For
+ * example, if there is actually 4GB of memory and the parameter "mem=3GB" is
+ * used, memory from address 0xC0000000 onwards is treated as cache-inhibited.
+ * To access this region /dev/mem is used. The kernel should be configured
+ * without CONFIG_STRICT_DEVMEM. In this case use:
+ * ./alignment_handler /dev/mem 0xc0000000
+ *
+ * We initialise the source buffers, then use whichever set of load/store
+ * instructions is under test to copy bytes from the source buffers to the
+ * destination buffers. For the regular buffers, these instructions will
+ * execute normally. For the cache-inhibited buffers, these instructions
+ * will trap and cause an alignment fault, and the alignment fault handler
+ * will emulate the particular instruction under test. We then compare the
+ * destination buffers to ensure that the native and emulated cases give the
+ * same result.
+ *
+ * TODO:
+ * - Any FIXMEs below
+ * - Test VSX regs < 32 and > 32
+ * - Test all loads and stores
+ * - Check update forms do update register
+ * - Test alignment faults over page boundary
+ *
+ * Some old binutils may not support all the instructions.
+ */
+
+
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <getopt.h>
+#include <setjmp.h>
+#include <signal.h>
+
+#include "utils.h"
+#include "instructions.h"
+
+int bufsize;
+int debug;
+int testing;
+volatile int gotsig;
+bool prefixes_enabled;
+char *cipath = "/dev/fb0";
+long cioffset;
+
+/*
+ * Signal handler for the faults a test may raise (main() installs it for
+ * SIGSEGV, SIGBUS and SIGILL).
+ *
+ * When no test is running (testing == 0) the default disposition is restored
+ * and the signal is re-sent with kill(0, sig). The signal number is then
+ * stored in gotsig and the saved next-instruction pointer (PT_NIP) is moved
+ * past the faulting instruction so the interrupted code can continue.
+ */
+void sighandler(int sig, siginfo_t *info, void *ctx)
+{
+ ucontext_t *ucp = ctx;
+
+ if (!testing) {
+ signal(sig, SIG_DFL);
+ kill(0, sig);
+ }
+ gotsig = sig;
+#ifdef __powerpc64__
+ if (prefixes_enabled) {
+ /* a primary opcode (top 6 bits) of 1 is skipped as an 8-byte instruction */
+ u32 inst = *(u32 *)ucp->uc_mcontext.gp_regs[PT_NIP];
+ ucp->uc_mcontext.gp_regs[PT_NIP] += ((inst >> 26 == 1) ? 8 : 4);
+ } else {
+ ucp->uc_mcontext.gp_regs[PT_NIP] += 4;
+ }
+#else
+ ucp->uc_mcontext.uc_regs->gregs[PT_NIP] += 4;
+#endif
+}
+
+/*
+ * Operand-string builders used by TEST(). XFORM produces "<reg>,%<n>,%2"
+ * (asm operand n plus asm operand 2); DFORM produces "<reg>,0(%<n>)"
+ * (displacement 0 from asm operand n).
+ */
+#define XFORM(reg, n) " " #reg " ,%"#n",%2 ;"
+#define DFORM(reg, n) " " #reg " ,0(%"#n") ;"
+
+/*
+ * Define a function test_<name>(s, d) whose inline asm executes ld_op on
+ * ld_reg with asm operand 0 (s), then st_op on st_reg with asm operand 1 (d),
+ * using "form" (XFORM or DFORM) to build the operand strings. Asm operand 2
+ * is the constant 0. vs0, vs32 and r31 are listed as clobbers.
+ *
+ * The macro then runs the new function through do_test() and ORs the result
+ * into a variable named rc, so it must be used as a statement inside a
+ * function that has an int rc in scope (the definition becomes a nested
+ * function there).
+ */
+#define TEST(name, ld_op, st_op, form, ld_reg, st_reg) \
+ void test_##name(char *s, char *d) \
+ { \
+ asm volatile( \
+ #ld_op form(ld_reg, 0) \
+ #st_op form(st_reg, 1) \
+ :: "r"(s), "r"(d), "r"(0) \
+ : "memory", "vs0", "vs32", "r31"); \
+ } \
+ rc |= do_test(#name, test_##name)
+
+/*
+ * Variant of TEST() for prefixed instructions. Here ld_op and st_op are
+ * themselves macros (presumably the encoders from instructions.h -- confirm)
+ * invoked as op(reg, base, 0, 0) with asm operand 0 (s) and asm operand 1 (d)
+ * as the base. As with TEST(), the generated test_<name>() is run through
+ * do_test() and the result is ORed into the caller's rc.
+ */
+#define TESTP(name, ld_op, st_op, ld_reg, st_reg) \
+ void test_##name(char *s, char *d) \
+ { \
+ asm volatile( \
+ ld_op(ld_reg, %0, 0, 0) \
+ st_op(st_reg, %1, 0, 0) \
+ :: "r"(s), "r"(d), "r"(0) \
+ : "memory", "vs0", "vs32", "r31"); \
+ } \
+ rc |= do_test(#name, test_##name)
+
+/*
+ * Per-instruction-class wrappers around TEST()/TESTP(). Each takes the
+ * instruction under test as op and pairs it with a fixed companion load or
+ * store, plus the register numbers to use on each side.
+ */
+
+/* VSX and VMX forms: companion is stxvd2x/lxvd2x (X-form) or stxv/lxv (D-form) */
+#define LOAD_VSX_XFORM_TEST(op) TEST(op, op, stxvd2x, XFORM, 32, 32)
+#define STORE_VSX_XFORM_TEST(op) TEST(op, lxvd2x, op, XFORM, 32, 32)
+#define LOAD_VSX_DFORM_TEST(op) TEST(op, op, stxv, DFORM, 32, 32)
+#define STORE_VSX_DFORM_TEST(op) TEST(op, lxv, op, DFORM, 32, 32)
+#define LOAD_VMX_XFORM_TEST(op) TEST(op, op, stxvd2x, XFORM, 0, 32)
+#define STORE_VMX_XFORM_TEST(op) TEST(op, lxvd2x, op, XFORM, 32, 0)
+#define LOAD_VMX_DFORM_TEST(op) TEST(op, op, stxv, DFORM, 0, 32)
+#define STORE_VMX_DFORM_TEST(op) TEST(op, lxv, op, DFORM, 32, 0)
+
+/* Integer forms: companion is stdx/ldx or std/ld, register 31 on both sides */
+#define LOAD_XFORM_TEST(op) TEST(op, op, stdx, XFORM, 31, 31)
+#define STORE_XFORM_TEST(op) TEST(op, ldx, op, XFORM, 31, 31)
+#define LOAD_DFORM_TEST(op) TEST(op, op, std, DFORM, 31, 31)
+#define STORE_DFORM_TEST(op) TEST(op, ld, op, DFORM, 31, 31)
+
+/* Floating point forms: companion is stfd/lfd or stfdx/lfdx, register 0 */
+#define LOAD_FLOAT_DFORM_TEST(op) TEST(op, op, stfd, DFORM, 0, 0)
+#define STORE_FLOAT_DFORM_TEST(op) TEST(op, lfd, op, DFORM, 0, 0)
+#define LOAD_FLOAT_XFORM_TEST(op) TEST(op, op, stfdx, XFORM, 0, 0)
+#define STORE_FLOAT_XFORM_TEST(op) TEST(op, lfdx, op, XFORM, 0, 0)
+
+/* Prefixed integer forms: companion is PSTD/PLD, register 31 */
+#define LOAD_MLS_PREFIX_TEST(op) TESTP(op, op, PSTD, 31, 31)
+#define STORE_MLS_PREFIX_TEST(op) TESTP(op, PLD, op, 31, 31)
+
+#define LOAD_8LS_PREFIX_TEST(op) TESTP(op, op, PSTD, 31, 31)
+#define STORE_8LS_PREFIX_TEST(op) TESTP(op, PLD, op, 31, 31)
+
+/* Prefixed floating point forms: companion is PSTFD/PLFD, register 0 */
+#define LOAD_FLOAT_MLS_PREFIX_TEST(op) TESTP(op, op, PSTFD, 0, 0)
+#define STORE_FLOAT_MLS_PREFIX_TEST(op) TESTP(op, PLFD, op, 0, 0)
+
+/* Prefixed VSX forms: "tail" is pasted onto PSTXV/PLXV to pick the companion */
+#define LOAD_VSX_8LS_PREFIX_TEST(op, tail) TESTP(op, op, PSTXV ## tail, 0, 32)
+#define STORE_VSX_8LS_PREFIX_TEST(op, tail) TESTP(op, PLXV ## tail, op, 32, 0)
+
+/* FIXME: Unimplemented tests: */
+// STORE_DFORM_TEST(stq) /* FIXME: need two registers for quad */
+// STORE_DFORM_TEST(stswi) /* FIXME: string instruction */
+
+// STORE_XFORM_TEST(stwat) /* AMO can't emulate or run on CI */
+// STORE_XFORM_TEST(stdat) /* ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ */
+
+
+/*
+ * Write the ascending byte pattern 0, 1, 2, ... into width bytes of dst,
+ * beginning offset bytes into the buffer. Bytes are stored one at a time.
+ */
+void preload_data(void *dst, int offset, int width)
+{
+	char *base = (char *)dst + offset;
+	int n = 0;
+
+	while (n < width) {
+		base[n] = n;
+		n++;
+	}
+}
+
+/*
+ * Run test_func once to copy from src + offset to dst + offset.
+ *
+ * size must be 16 (asserted). The testing flag is raised around the call so
+ * sighandler() treats any fault as part of the test. Returns 1 if a signal
+ * was recorded in gotsig while test_func ran, 0 otherwise.
+ */
+int test_memcpy(void *dst, void *src, int size, int offset,
+		void (*test_func)(char *, char *))
+{
+	char *from = (char *)src + offset;
+	char *to = (char *)dst + offset;
+
+	assert(size == 16);
+
+	gotsig = 0;
+	testing = 1;
+
+	test_func(from, to); /* run the actual test */
+
+	testing = 0;
+
+	if (!gotsig)
+		return 0;
+
+	if (debug)
+		printf(" Got signal %i\n", gotsig);
+	return 1;
+}
+
+/*
+ * Print the first n bytes of s1 (labelled "mem") and s2 (labelled "ci") as
+ * two rows of two-digit hex values, preceded by the test name.
+ */
+void dumpdata(char *s1, char *s2, int n, char *test_name)
+{
+	int i;
+
+	printf(" %s: unexpected result:\n", test_name);
+	printf(" mem:");
+	for (i = 0; i < n; i++) {
+		/* cast so bytes >= 0x80 are not sign-extended where char is signed */
+		printf(" %02x", (unsigned char)s1[i]);
+	}
+	printf("\n");
+	printf(" ci: ");
+	for (i = 0; i < n; i++)
+		printf(" %02x", (unsigned char)s2[i]);
+	printf("\n");
+}
+
+/*
+ * Compare n bytes of s1 and s2, both taken offset bytes into their buffers.
+ * Returns 0 when they match and 1 when they differ; in debug mode a mismatch
+ * is also reported and both byte ranges are dumped.
+ */
+int test_memcmp(void *s1, void *s2, int n, int offset, char *test_name)
+{
+	char *lhs = (char *)s1 + offset;
+	char *rhs = (char *)s2 + offset;
+
+	if (memcmp(lhs, rhs, n) == 0)
+		return 0;
+
+	if (debug) {
+		printf("\n Compare failed. Offset:%i length:%i\n",
+		       offset, n);
+		dumpdata(lhs, rhs, n, test_name);
+	}
+	return 1;
+}
+
+/*
+ * Run the same copy test twice using the same instructions: once on a pair
+ * of buffers in regular (cacheable) memory and once on a pair mapped from
+ * the cache-inhibited file at cipath, then check that both destination
+ * buffers hold identical data. This is repeated for byte offsets 0..15.
+ *
+ * test_name is printed in the progress line; test_func performs the copy
+ * from its first argument to its second.
+ *
+ * Returns 0 if every offset passed. Returns non-zero if the file cannot be
+ * opened, an allocation fails, a signal was taken or the data differed.
+ * A failing mmap() leaves through SKIP_IF(1) (defined in utils.h, not
+ * visible here; it is expected to return from this function).
+ *
+ * Without -d the loop stops at the first failing offset. With -d all offsets
+ * are run so every failure is dumped, and the overall result is still a
+ * failure.
+ */
+int do_test(char *test_name, void (*test_func)(char *, char *))
+{
+	int offset, width, fd, rc, r;
+	void *mem0, *mem1, *ci0, *ci1;
+
+	printf("\tDoing %s:\t", test_name);
+
+	fd = open(cipath, O_RDWR);
+	if (fd < 0) {
+		printf("\n");
+		perror("Can't open ci file now?");
+		return 1;
+	}
+
+	ci0 = mmap(NULL, bufsize, PROT_WRITE | PROT_READ, MAP_SHARED,
+		   fd, cioffset);
+	ci1 = mmap(NULL, bufsize, PROT_WRITE | PROT_READ, MAP_SHARED,
+		   fd, cioffset + bufsize);
+
+	if ((ci0 == MAP_FAILED) || (ci1 == MAP_FAILED)) {
+		printf("\n");
+		perror("mmap failed");
+		/* release what was acquired before SKIP_IF() leaves the function */
+		if (ci0 != MAP_FAILED)
+			munmap(ci0, bufsize);
+		if (ci1 != MAP_FAILED)
+			munmap(ci1, bufsize);
+		close(fd);
+		SKIP_IF(1);
+	}
+
+	rc = posix_memalign(&mem0, bufsize, bufsize);
+	if (rc) {
+		printf("\n");
+		goto out_unmap;
+	}
+
+	rc = posix_memalign(&mem1, bufsize, bufsize);
+	if (rc) {
+		printf("\n");
+		goto out_free_mem0;
+	}
+
+	rc = 0;
+	/*
+	 * offset = 0 is aligned but tests the workaround for the P9N
+	 * DD2.1 vector CI load issue (see 5080332c2c89 "powerpc/64s:
+	 * Add workaround for P9 vector CI load issue")
+	 */
+	for (offset = 0; offset < 16; offset++) {
+		width = 16; /* vsx == 16 bytes */
+		r = 0;
+
+		/* load pattern into memory byte by byte */
+		preload_data(ci0, offset, width);
+		preload_data(mem0, offset, width); // FIXME: remove??
+		memcpy(ci0, mem0, bufsize);
+		memcpy(ci1, mem1, bufsize); /* initialise output to the same */
+
+		/* sanity check */
+		test_memcmp(mem0, ci0, width, offset, test_name);
+
+		r |= test_memcpy(ci1, ci0, width, offset, test_func);
+		r |= test_memcpy(mem1, mem0, width, offset, test_func);
+		if (r) {
+			/* record the failure even when debug keeps the loop going */
+			rc = 1;
+			if (!debug) {
+				printf("FAILED: Got signal");
+				break;
+			}
+		}
+
+		r |= test_memcmp(mem1, ci1, width, offset, test_name);
+		if (r) {
+			rc = 1;
+			if (!debug) {
+				printf("FAILED: Wrong Data");
+				break;
+			}
+		}
+	}
+
+	if (rc == 0)
+		printf("PASSED");
+	else if (debug)
+		printf("FAILED"); /* details were already dumped per offset */
+
+	printf("\n");
+
+	free(mem1);
+out_free_mem0:
+	free(mem0);
+out_unmap:
+	munmap(ci0, bufsize);
+	munmap(ci1, bufsize);
+	close(fd);
+
+	return rc;
+}
+
+/* Report whether the cache-inhibited file at cipath can be opened read/write. */
+static bool can_open_cifile(void)
+{
+	int fd = open(cipath, O_RDWR);
+
+	if (fd >= 0) {
+		close(fd);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * VSX loads/stores gated on PPC_FEATURE_ARCH_2_06. Skipped when the
+ * cache-inhibited file cannot be opened or the hwcap bit is absent.
+ * Returns the OR of every do_test() result (each *_TEST() macro updates rc).
+ */
+int test_alignment_handler_vsx_206(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
+ printf("VSX: 2.06B\n");
+ LOAD_VSX_XFORM_TEST(lxvd2x);
+ LOAD_VSX_XFORM_TEST(lxvw4x);
+ LOAD_VSX_XFORM_TEST(lxsdx);
+ LOAD_VSX_XFORM_TEST(lxvdsx);
+ STORE_VSX_XFORM_TEST(stxvd2x);
+ STORE_VSX_XFORM_TEST(stxvw4x);
+ STORE_VSX_XFORM_TEST(stxsdx);
+ return rc;
+}
+
+/*
+ * VSX loads/stores gated on PPC_FEATURE2_ARCH_2_07. Skipped when the
+ * cache-inhibited file cannot be opened or the hwcap2 bit is absent.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_vsx_207(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+ printf("VSX: 2.07B\n");
+ LOAD_VSX_XFORM_TEST(lxsspx);
+ LOAD_VSX_XFORM_TEST(lxsiwax);
+ LOAD_VSX_XFORM_TEST(lxsiwzx);
+ STORE_VSX_XFORM_TEST(stxsspx);
+ STORE_VSX_XFORM_TEST(stxsiwx);
+ return rc;
+}
+
+/*
+ * VSX loads/stores gated on PPC_FEATURE2_ARCH_3_00. Skipped when the
+ * cache-inhibited file cannot be opened or the hwcap2 bit is absent.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_vsx_300(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00));
+ printf("VSX: 3.00B\n");
+ LOAD_VMX_DFORM_TEST(lxsd);
+ LOAD_VSX_XFORM_TEST(lxsibzx);
+ LOAD_VSX_XFORM_TEST(lxsihzx);
+ LOAD_VMX_DFORM_TEST(lxssp);
+ LOAD_VSX_DFORM_TEST(lxv);
+ LOAD_VSX_XFORM_TEST(lxvb16x);
+ LOAD_VSX_XFORM_TEST(lxvh8x);
+ LOAD_VSX_XFORM_TEST(lxvx);
+ LOAD_VSX_XFORM_TEST(lxvwsx);
+ LOAD_VSX_XFORM_TEST(lxvl);
+ LOAD_VSX_XFORM_TEST(lxvll);
+ STORE_VMX_DFORM_TEST(stxsd);
+ STORE_VSX_XFORM_TEST(stxsibx);
+ STORE_VSX_XFORM_TEST(stxsihx);
+ STORE_VMX_DFORM_TEST(stxssp);
+ STORE_VSX_DFORM_TEST(stxv);
+ STORE_VSX_XFORM_TEST(stxvb16x);
+ STORE_VSX_XFORM_TEST(stxvh8x);
+ STORE_VSX_XFORM_TEST(stxvx);
+ STORE_VSX_XFORM_TEST(stxvl);
+ STORE_VSX_XFORM_TEST(stxvll);
+ return rc;
+}
+
+/*
+ * Prefixed VSX loads/stores gated on PPC_FEATURE2_ARCH_3_1. Skipped when the
+ * cache-inhibited file cannot be opened or the hwcap2 bit is absent.
+ * The second macro argument selects the companion PSTXV<n>/PLXV<n> encoder.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_vsx_prefix(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+ printf("VSX: PREFIX\n");
+ LOAD_VSX_8LS_PREFIX_TEST(PLXSD, 0);
+ LOAD_VSX_8LS_PREFIX_TEST(PLXSSP, 0);
+ LOAD_VSX_8LS_PREFIX_TEST(PLXV0, 0);
+ LOAD_VSX_8LS_PREFIX_TEST(PLXV1, 1);
+ STORE_VSX_8LS_PREFIX_TEST(PSTXSD, 0);
+ STORE_VSX_8LS_PREFIX_TEST(PSTXSSP, 0);
+ STORE_VSX_8LS_PREFIX_TEST(PSTXV0, 0);
+ STORE_VSX_8LS_PREFIX_TEST(PSTXV1, 1);
+ return rc;
+}
+
+/*
+ * Integer loads/stores with no hwcap gate; only skipped when the
+ * cache-inhibited file cannot be opened. lmw/stmw are built in for
+ * big-endian targets only. Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_integer(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+
+ printf("Integer\n");
+ LOAD_DFORM_TEST(lbz);
+ LOAD_DFORM_TEST(lbzu);
+ LOAD_XFORM_TEST(lbzx);
+ LOAD_XFORM_TEST(lbzux);
+ LOAD_DFORM_TEST(lhz);
+ LOAD_DFORM_TEST(lhzu);
+ LOAD_XFORM_TEST(lhzx);
+ LOAD_XFORM_TEST(lhzux);
+ LOAD_DFORM_TEST(lha);
+ LOAD_DFORM_TEST(lhau);
+ LOAD_XFORM_TEST(lhax);
+ LOAD_XFORM_TEST(lhaux);
+ LOAD_XFORM_TEST(lhbrx);
+ LOAD_DFORM_TEST(lwz);
+ LOAD_DFORM_TEST(lwzu);
+ LOAD_XFORM_TEST(lwzx);
+ LOAD_XFORM_TEST(lwzux);
+ LOAD_DFORM_TEST(lwa);
+ LOAD_XFORM_TEST(lwax);
+ LOAD_XFORM_TEST(lwaux);
+ LOAD_XFORM_TEST(lwbrx);
+ LOAD_DFORM_TEST(ld);
+ LOAD_DFORM_TEST(ldu);
+ LOAD_XFORM_TEST(ldx);
+ LOAD_XFORM_TEST(ldux);
+ STORE_DFORM_TEST(stb);
+ STORE_XFORM_TEST(stbx);
+ STORE_DFORM_TEST(stbu);
+ STORE_XFORM_TEST(stbux);
+ STORE_DFORM_TEST(sth);
+ STORE_XFORM_TEST(sthx);
+ STORE_DFORM_TEST(sthu);
+ STORE_XFORM_TEST(sthux);
+ STORE_XFORM_TEST(sthbrx);
+ STORE_DFORM_TEST(stw);
+ STORE_XFORM_TEST(stwx);
+ STORE_DFORM_TEST(stwu);
+ STORE_XFORM_TEST(stwux);
+ STORE_XFORM_TEST(stwbrx);
+ STORE_DFORM_TEST(std);
+ STORE_XFORM_TEST(stdx);
+ STORE_DFORM_TEST(stdu);
+ STORE_XFORM_TEST(stdux);
+
+#ifdef __BIG_ENDIAN__
+ LOAD_DFORM_TEST(lmw);
+ STORE_DFORM_TEST(stmw);
+#endif
+
+ return rc;
+}
+
+/*
+ * Integer byte-reversed doubleword load/store, gated on
+ * PPC_FEATURE_ARCH_2_06. Skipped when the cache-inhibited file cannot be
+ * opened or the hwcap bit is absent. Returns the OR of the do_test() results.
+ */
+int test_alignment_handler_integer_206(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
+ printf("Integer: 2.06\n");
+
+ LOAD_XFORM_TEST(ldbrx);
+ STORE_XFORM_TEST(stdbrx);
+
+ return rc;
+}
+
+/*
+ * Prefixed integer loads/stores gated on PPC_FEATURE2_ARCH_3_1. Skipped when
+ * the cache-inhibited file cannot be opened or the hwcap2 bit is absent.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_integer_prefix(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+ printf("Integer: PREFIX\n");
+ LOAD_MLS_PREFIX_TEST(PLBZ);
+ LOAD_MLS_PREFIX_TEST(PLHZ);
+ LOAD_MLS_PREFIX_TEST(PLHA);
+ LOAD_MLS_PREFIX_TEST(PLWZ);
+ LOAD_8LS_PREFIX_TEST(PLWA);
+ LOAD_8LS_PREFIX_TEST(PLD);
+ STORE_MLS_PREFIX_TEST(PSTB);
+ STORE_MLS_PREFIX_TEST(PSTH);
+ STORE_MLS_PREFIX_TEST(PSTW);
+ STORE_8LS_PREFIX_TEST(PSTD);
+ return rc;
+}
+
+/*
+ * VMX (Altivec) loads/stores gated on PPC_FEATURE_HAS_ALTIVEC. Skipped when
+ * the cache-inhibited file cannot be opened or the hwcap bit is absent.
+ * Only lvx is tested on the load side; see the FIXME below for the rest.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_vmx(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap(PPC_FEATURE_HAS_ALTIVEC));
+
+ printf("VMX\n");
+ LOAD_VMX_XFORM_TEST(lvx);
+
+ /*
+ * FIXME: These loads only load part of the register, so our
+ * testing method doesn't work. Also they don't take alignment
+ * faults, so it's kinda pointless anyway
+ *
+ LOAD_VMX_XFORM_TEST(lvebx)
+ LOAD_VMX_XFORM_TEST(lvehx)
+ LOAD_VMX_XFORM_TEST(lvewx)
+ LOAD_VMX_XFORM_TEST(lvxl)
+ */
+ STORE_VMX_XFORM_TEST(stvx);
+ STORE_VMX_XFORM_TEST(stvebx);
+ STORE_VMX_XFORM_TEST(stvehx);
+ STORE_VMX_XFORM_TEST(stvewx);
+ STORE_VMX_XFORM_TEST(stvxl);
+ return rc;
+}
+
+/*
+ * Floating point loads/stores with no hwcap gate; only skipped when the
+ * cache-inhibited file cannot be opened. Returns the OR of every do_test()
+ * result.
+ */
+int test_alignment_handler_fp(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+
+ printf("Floating point\n");
+ LOAD_FLOAT_DFORM_TEST(lfd);
+ LOAD_FLOAT_XFORM_TEST(lfdx);
+ LOAD_FLOAT_DFORM_TEST(lfdu);
+ LOAD_FLOAT_XFORM_TEST(lfdux);
+ LOAD_FLOAT_DFORM_TEST(lfs);
+ LOAD_FLOAT_XFORM_TEST(lfsx);
+ LOAD_FLOAT_DFORM_TEST(lfsu);
+ LOAD_FLOAT_XFORM_TEST(lfsux);
+ STORE_FLOAT_DFORM_TEST(stfd);
+ STORE_FLOAT_XFORM_TEST(stfdx);
+ STORE_FLOAT_DFORM_TEST(stfdu);
+ STORE_FLOAT_XFORM_TEST(stfdux);
+ STORE_FLOAT_DFORM_TEST(stfs);
+ STORE_FLOAT_XFORM_TEST(stfsx);
+ STORE_FLOAT_DFORM_TEST(stfsu);
+ STORE_FLOAT_XFORM_TEST(stfsux);
+ STORE_FLOAT_XFORM_TEST(stfiwx);
+
+ return rc;
+}
+
+/*
+ * Floating point loads/stores gated on PPC_FEATURE_ARCH_2_05. Skipped when
+ * the cache-inhibited file cannot be opened or the hwcap bit is absent.
+ * Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_fp_205(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_05));
+
+ printf("Floating point: 2.05\n");
+
+ LOAD_FLOAT_DFORM_TEST(lfdp);
+ LOAD_FLOAT_XFORM_TEST(lfdpx);
+ LOAD_FLOAT_XFORM_TEST(lfiwax);
+ STORE_FLOAT_DFORM_TEST(stfdp);
+ STORE_FLOAT_XFORM_TEST(stfdpx);
+
+ return rc;
+}
+
+/*
+ * Floating point load (lfiwzx) gated on PPC_FEATURE_ARCH_2_06. Skipped when
+ * the cache-inhibited file cannot be opened or the hwcap bit is absent.
+ * Returns the do_test() result.
+ */
+int test_alignment_handler_fp_206(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
+ printf("Floating point: 2.06\n");
+
+ LOAD_FLOAT_XFORM_TEST(lfiwzx);
+
+ return rc;
+}
+
+
+/*
+ * Prefixed floating point loads/stores gated on PPC_FEATURE2_ARCH_3_1.
+ * Skipped when the cache-inhibited file cannot be opened or the hwcap2 bit is
+ * absent. Returns the OR of every do_test() result.
+ */
+int test_alignment_handler_fp_prefix(void)
+{
+ int rc = 0;
+
+ SKIP_IF(!can_open_cifile());
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_1));
+
+ printf("Floating point: PREFIX\n");
+ /*
+ * NOTE(review): lfs is a non-prefixed instruction already covered by
+ * test_alignment_handler_fp(); confirm whether it is intended here.
+ */
+ LOAD_FLOAT_DFORM_TEST(lfs);
+ LOAD_FLOAT_MLS_PREFIX_TEST(PLFS);
+ LOAD_FLOAT_MLS_PREFIX_TEST(PLFD);
+ STORE_FLOAT_MLS_PREFIX_TEST(PSTFS);
+ STORE_FLOAT_MLS_PREFIX_TEST(PSTFD);
+ return rc;
+}
+
+/* Print the command line synopsis and the hardware requirements to stdout. */
+void usage(char *prog)
+{
+	printf("Usage: %s [options] [path [offset]]\n", prog);
+	fputs(" -d Enable debug error output\n", stdout);
+	fputs("\n", stdout);
+	fputs("This test requires a POWER8, POWER9 or POWER10 CPU ", stdout);
+	fputs("and either a usable framebuffer at /dev/fb0 or ", stdout);
+	fputs("the path to usable cache inhibited memory and optional ", stdout);
+	fputs("offset to be provided\n", stdout);
+}
+
+/*
+ * Entry point: parse "-d" plus the optional cache-inhibited path and hex
+ * offset, install sighandler() for SIGSEGV/SIGBUS/SIGILL, then run every
+ * test group through test_harness() and return the OR of the results.
+ */
+int main(int argc, char *argv[])
+{
+	/* test groups, in the order they are run */
+	static const struct {
+		int (*fn)(void);
+		char *name;
+	} tests[] = {
+		{ test_alignment_handler_vsx_206,
+		  "test_alignment_handler_vsx_206" },
+		{ test_alignment_handler_vsx_207,
+		  "test_alignment_handler_vsx_207" },
+		{ test_alignment_handler_vsx_300,
+		  "test_alignment_handler_vsx_300" },
+		{ test_alignment_handler_vsx_prefix,
+		  "test_alignment_handler_vsx_prefix" },
+		{ test_alignment_handler_integer,
+		  "test_alignment_handler_integer" },
+		{ test_alignment_handler_integer_206,
+		  "test_alignment_handler_integer_206" },
+		{ test_alignment_handler_integer_prefix,
+		  "test_alignment_handler_integer_prefix" },
+		{ test_alignment_handler_vmx,
+		  "test_alignment_handler_vmx" },
+		{ test_alignment_handler_fp,
+		  "test_alignment_handler_fp" },
+		{ test_alignment_handler_fp_205,
+		  "test_alignment_handler_fp_205" },
+		{ test_alignment_handler_fp_206,
+		  "test_alignment_handler_fp_206" },
+		{ test_alignment_handler_fp_prefix,
+		  "test_alignment_handler_fp_prefix" },
+	};
+	struct sigaction sa;
+	unsigned int t;
+	int rc = 0;
+	int option = 0;
+
+	while ((option = getopt(argc, argv, "d")) != -1) {
+		if (option != 'd') {
+			usage(argv[0]);
+			exit(1);
+		}
+		debug++;
+	}
+	argc -= optind;
+	argv += optind;
+
+	/* optional positional arguments: path, then offset in hex */
+	if (argc > 0)
+		cipath = argv[0];
+	if (argc > 1)
+		cioffset = strtol(argv[1], 0, 0x10);
+
+	bufsize = getpagesize();
+
+	sa.sa_sigaction = sighandler;
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGSEGV, &sa, NULL) == -1
+	    || sigaction(SIGBUS, &sa, NULL) == -1
+	    || sigaction(SIGILL, &sa, NULL) == -1) {
+		perror("sigaction");
+		exit(1);
+	}
+
+	prefixes_enabled = have_hwcap2(PPC_FEATURE2_ARCH_3_1);
+
+	for (t = 0; t < sizeof(tests) / sizeof(tests[0]); t++)
+		rc |= test_harness(tests[t].fn, tests[t].name);
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/alignment/copy_first_unaligned.c b/tools/testing/selftests/powerpc/alignment/copy_first_unaligned.c
new file mode 100644
index 000000000..db4e8c680
--- /dev/null
+++ b/tools/testing/selftests/powerpc/alignment/copy_first_unaligned.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Chris Smart, IBM Corporation.
+ *
+ * Calls to copy_first which are not 128-byte aligned should be
+ * caught and sent a SIGBUS.
+ */
+
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include "utils.h"
+#include "instructions.h"
+
+unsigned int expected_instruction = PPC_INST_COPY_FIRST;
+unsigned int instruction_mask = 0xfc2007fe;
+
+/*
+ * SIGBUS handler: exit with status 0 when the instruction at the faulting
+ * NIP matches expected_instruction under instruction_mask, otherwise exit
+ * with status 1. The handler never returns.
+ */
+void signal_action_handler(int signal_num, siginfo_t *info, void *ptr)
+{
+	ucontext_t *uc = ptr;
+	unsigned int *nip;
+	int hit;
+
+#ifdef __powerpc64__
+	nip = (unsigned int *)uc->uc_mcontext.gp_regs[PT_NIP];
+#else
+	nip = (unsigned int *)uc->uc_mcontext.uc_regs->gregs[PT_NIP];
+#endif
+
+	/*
+	 * Compare under a mask because the compiler assigns the register
+	 * at RB.
+	 */
+	hit = (*nip & instruction_mask) == expected_instruction;
+
+	/* 0: we hit the right instruction, 1: we did not */
+	_exit(hit ? 0 : 1);
+}
+
+/* Install signal_action_handler() as the SA_SIGINFO handler for SIGBUS. */
+void setup_signal_handler(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	act.sa_flags = SA_SIGINFO;
+	act.sa_sigaction = signal_action_handler;
+
+	sigaction(SIGBUS, &act, NULL);
+}
+
+char cacheline_buf[128] __cacheline_aligned;
+
+/*
+ * Issue copy_first on an address one byte past a cacheline-aligned buffer.
+ * The expected outcome is a SIGBUS, which signal_action_handler() turns into
+ * the process exit status; returning from this function therefore means the
+ * signal was not delivered and the test failed.
+ */
+int test_copy_first_unaligned(void)
+{
+ /* Only run this test on a P9 or later */
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_3_00));
+
+ /* Register our signal handler with SIGBUS */
+ setup_signal_handler();
+
+ /* +1 makes buf unaligned */
+ copy_first(cacheline_buf+1);
+
+ /* We should not get here */
+ return 1;
+}
+
+/* Run the single test through the shared harness and return its result. */
+int main(int argc, char *argv[])
+{
+ return test_harness(test_copy_first_unaligned, "test_copy_first_unaligned");
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/.gitignore b/tools/testing/selftests/powerpc/benchmarks/.gitignore
new file mode 100644
index 000000000..c9ce13983
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+gettimeofday
+context_switch
+fork
+exec_target
+mmap_bench
+futex_bench
+null_syscall
diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile
new file mode 100644
index 000000000..a32a6ab89
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/Makefile
@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: GPL-2.0
+# Benchmark programs, plus exec_target which is listed as a generated file
+# rather than a test program (fork.c execs "./exec_target").
+TEST_GEN_PROGS := gettimeofday context_switch fork mmap_bench futex_bench null_syscall
+TEST_GEN_FILES := exec_target
+
+TEST_FILES := settings
+
+CFLAGS += -O2
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# Every benchmark links the shared harness.
+$(TEST_GEN_PROGS): ../harness.c
+
+# context_switch also needs the utility code, vector support and pthreads.
+$(OUTPUT)/context_switch: ../utils.c
+$(OUTPUT)/context_switch: CFLAGS += -maltivec -mvsx -mabi=altivec
+$(OUTPUT)/context_switch: LDLIBS += -lpthread
+
+$(OUTPUT)/fork: LDLIBS += -lpthread
+
+# exec_target defines its own _start, so it is built static without the
+# normal start files.
+$(OUTPUT)/exec_target: CFLAGS += -static -nostartfiles
diff --git a/tools/testing/selftests/powerpc/benchmarks/context_switch.c b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
new file mode 100644
index 000000000..96554e279
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/context_switch.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Context switch microbenchmark.
+ *
+ * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <signal.h>
+#include <assert.h>
+#include <pthread.h>
+#include <limits.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <linux/futex.h>
+#ifdef __powerpc__
+#include <altivec.h>
+#endif
+#include "utils.h"
+
+static unsigned int timeout = 30;
+
+static int touch_vdso;
+struct timeval tv;
+
+static int touch_fp = 1;
+double fp;
+
+static int touch_vector = 1;
+vector int a, b, c;
+
+#ifdef __powerpc__
+static int touch_altivec = 1;
+
+/*
+ * Vector add on the global vectors, compiled with VSX disabled for this one
+ * function via the target attribute.
+ *
+ * Note: LTO (Link Time Optimisation) doesn't play well with this function
+ * attribute. Be very careful enabling LTO for this test.
+ */
+static void __attribute__((__target__("no-vsx"))) altivec_touch_fn(void)
+{
+ c = a + b;
+}
+#endif
+
+/*
+ * Use each register class that is enabled by its touch_* flag: a
+ * gettimeofday() call for touch_vdso, a floating point add for touch_fp,
+ * altivec_touch_fn() for touch_altivec and a vector add for touch_vector.
+ */
+static void touch(void)
+{
+ if (touch_vdso)
+ gettimeofday(&tv, NULL);
+
+ if (touch_fp)
+ fp += 0.1;
+
+#ifdef __powerpc__
+ if (touch_altivec)
+ altivec_touch_fn();
+#endif
+
+ if (touch_vector)
+ c = a + b;
+
+ /*
+ * Emits only an assembler comment but takes the addresses of the results
+ * as inputs -- presumably so the compiler cannot discard the work above.
+ */
+ asm volatile("# %0 %1 %2": : "r"(&tv), "r"(&fp), "r"(&c));
+}
+
+/*
+ * Start fn(arg) in a new thread whose affinity is restricted to cpu.
+ *
+ * Any pthread failure is reported with perror() and terminates the program.
+ * The thread is never joined; the benchmark threads run until the process
+ * exits.
+ */
+static void start_thread_on(void *(*fn)(void *), void *arg, unsigned long cpu)
+{
+	int rc;
+	pthread_t tid;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	rc = pthread_attr_init(&attr);
+	if (rc) {
+		errno = rc;
+		perror("pthread_attr_init");
+		exit(1);
+	}
+
+	rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+	if (rc) {
+		errno = rc;
+		perror("pthread_attr_setaffinity_np");
+		exit(1);
+	}
+
+	rc = pthread_create(&tid, &attr, fn, arg);
+	if (rc) {
+		errno = rc;
+		perror("pthread_create");
+		exit(1);
+	}
+
+	/* the attribute object is only needed while the thread is created */
+	pthread_attr_destroy(&attr);
+}
+
+/*
+ * Fork a child, pin it to cpu and run fn(arg) in it. The parent returns
+ * right after the fork; the child exits with status 0 once fn returns.
+ * Failures are reported with perror() and terminate the calling process.
+ */
+static void start_process_on(void *(*fn)(void *), void *arg, unsigned long cpu)
+{
+	int pid, ncpus;
+	cpu_set_t *cpuset;
+	size_t size;
+
+	pid = fork();
+	if (pid == -1) {
+		perror("fork");
+		exit(1);
+	}
+
+	if (pid)
+		return;
+
+	ncpus = get_nprocs();
+	/*
+	 * get_nprocs() is a count, not the highest CPU number. Make sure the
+	 * set is wide enough to hold the requested cpu, otherwise CPU_SET_S()
+	 * silently ignores it and the affinity mask ends up empty.
+	 */
+	if (cpu >= (unsigned long)ncpus)
+		ncpus = cpu + 1;
+
+	size = CPU_ALLOC_SIZE(ncpus);
+	cpuset = CPU_ALLOC(ncpus);
+	if (!cpuset) {
+		perror("malloc");
+		exit(1);
+	}
+	CPU_ZERO_S(size, cpuset);
+	CPU_SET_S(cpu, size, cpuset);
+
+	if (sched_setaffinity(0, size, cpuset)) {
+		perror("sched_setaffinity");
+		CPU_FREE(cpuset);
+		exit(1);
+	}
+
+	CPU_FREE(cpuset);
+	fn(arg);
+
+	exit(0);
+}
+
+static unsigned long iterations;
+static unsigned long iterations_prev;
+
+/*
+ * SIGALRM handler, fired once a second: print how many iterations completed
+ * since the previous tick, count the timeout down and re-arm the alarm.
+ * When the timeout reaches zero SIGUSR1 is sent to the whole process group.
+ */
+static void sigalrm_handler(int junk)
+{
+	unsigned long i = iterations;
+
+	/* the difference is unsigned long, so print it as such */
+	printf("%lu\n", i - iterations_prev);
+	iterations_prev = i;
+
+	if (--timeout == 0)
+		kill(0, SIGUSR1);
+
+	alarm(1);
+}
+
+/* SIGUSR1 handler: terminate the receiving process with status 0. */
+static void sigusr1_handler(int junk)
+{
+ exit(0);
+}
+
+/* One benchmark variant: a setup hook plus the two bodies that ping-pong. */
+struct actions {
+ void (*setup)(int, int); /* called by main() with (cpu1, cpu2) before starting */
+ void *(*thread1)(void *); /* started on cpu1 */
+ void *(*thread2)(void *); /* started on cpu2 */
+};
+
+#define READ 0
+#define WRITE 1
+
+static int pipe_fd1[2];
+static int pipe_fd2[2];
+
+/* Create the two pipes used by the pipe benchmark; exit(1) if either fails. */
+static void pipe_setup(int cpu1, int cpu2)
+{
+	if (pipe(pipe_fd1) != 0)
+		exit(1);
+
+	if (pipe(pipe_fd2) != 0)
+		exit(1);
+}
+
+/*
+ * First half of the pipe ping-pong. Arms the once-a-second SIGALRM
+ * reporting, then loops forever: read one byte from pipe 1, write one byte
+ * to pipe 2, calling touch() after each transfer and counting two
+ * iterations per round trip. A short or failed transfer aborts the program.
+ */
+static void *pipe_thread1(void *arg)
+{
+	signal(SIGALRM, sigalrm_handler);
+	alarm(1);
+
+	while (1) {
+		/*
+		 * The transfers are checked outside assert() so they are still
+		 * performed when the file is built with NDEBUG.
+		 */
+		if (read(pipe_fd1[READ], &c, 1) != 1)
+			abort();
+		touch();
+
+		if (write(pipe_fd2[WRITE], &c, 1) != 1)
+			abort();
+		touch();
+
+		iterations += 2;
+	}
+
+	return NULL;
+}
+
+/*
+ * Second half of the pipe ping-pong: loops forever writing one byte to
+ * pipe 1 and reading one byte back from pipe 2, calling touch() after each
+ * transfer. A short or failed transfer aborts the program.
+ */
+static void *pipe_thread2(void *arg)
+{
+	while (1) {
+		/*
+		 * The transfers are checked outside assert() so they are still
+		 * performed when the file is built with NDEBUG.
+		 */
+		if (write(pipe_fd1[WRITE], &c, 1) != 1)
+			abort();
+		touch();
+
+		if (read(pipe_fd2[READ], &c, 1) != 1)
+			abort();
+		touch();
+	}
+
+	return NULL;
+}
+
+/* Benchmark variant selected by --test=pipe. */
+static struct actions pipe_actions = {
+ .setup = pipe_setup,
+ .thread1 = pipe_thread1,
+ .thread2 = pipe_thread2,
+};
+
+/* The yield benchmark only makes sense on one CPU; refuse anything else. */
+static void yield_setup(int cpu1, int cpu2)
+{
+	if (cpu1 == cpu2)
+		return;
+
+	fprintf(stderr, "Both threads must be on the same CPU for yield test\n");
+	exit(1);
+}
+
+/*
+ * First half of the yield benchmark. Arms the once-a-second SIGALRM
+ * reporting, then loops forever calling sched_yield() and touch(), adding
+ * two iterations per pass.
+ */
+static void *yield_thread1(void *arg)
+{
+ signal(SIGALRM, sigalrm_handler);
+ alarm(1);
+
+ while (1) {
+ sched_yield();
+ touch();
+
+ iterations += 2;
+ }
+
+ return NULL;
+}
+
+/* Second half of the yield benchmark: yield and touch() forever. */
+static void *yield_thread2(void *arg)
+{
+ while (1) {
+ sched_yield();
+ touch();
+ }
+
+ return NULL;
+}
+
+/* Benchmark variant selected by --test=yield; also main()'s default. */
+static struct actions yield_actions = {
+ .setup = yield_setup,
+ .thread1 = yield_thread1,
+ .thread2 = yield_thread2,
+};
+
+/* Thin wrapper that issues the raw futex system call via syscall(). */
+static long sys_futex(void *addr1, int op, int val1, struct timespec *timeout,
+ void *addr2, int val3)
+{
+ return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
+}
+
+/*
+ * Sequentially consistent compare-and-exchange: store desired in *p only if
+ * *p equals expected. Returns the value *p held before the operation, which
+ * equals expected exactly when the exchange happened.
+ */
+static unsigned long cmpxchg(unsigned long *p, unsigned long expected,
+			     unsigned long desired)
+{
+	unsigned long seen = expected;
+
+	/* on failure the builtin writes the current *p into seen */
+	(void)__atomic_compare_exchange_n(p, &seen, desired, 0,
+					  __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
+
+	return seen;
+}
+
+/* Sequentially consistent exchange: store val in *p, return the old value. */
+static unsigned long xchg(unsigned long *p, unsigned long val)
+{
+	unsigned long previous = __atomic_exchange_n(p, val, __ATOMIC_SEQ_CST);
+
+	return previous;
+}
+
+static int processes;
+
+/*
+ * Acquire the futex-based lock at m. As used here the word holds 0 when
+ * free, 1 when held and 2 when held with a possible waiter. FUTEX_PRIVATE_FLAG
+ * is added unless the benchmark runs with processes. Always returns 0.
+ */
+static int mutex_lock(unsigned long *m)
+{
+ int c;
+ int flags = FUTEX_WAIT;
+ if (!processes)
+ flags |= FUTEX_PRIVATE_FLAG;
+
+ /* fast path: 0 -> 1 means the lock was free and is now ours */
+ c = cmpxchg(m, 0, 1);
+ if (!c)
+ return 0;
+
+ /* contended: mark the lock as having a waiter */
+ if (c == 1)
+ c = xchg(m, 2);
+
+ /* sleep while the word is 2, retrying until the old value was 0 */
+ while (c) {
+ sys_futex(m, flags, 2, NULL, NULL, 0);
+ c = xchg(m, 2);
+ }
+
+ return 0;
+}
+
+/*
+ * Release the futex-based lock at m and wake one waiter when the word showed
+ * that one might exist. FUTEX_PRIVATE_FLAG is added unless the benchmark runs
+ * with processes. Always returns 0.
+ */
+static int mutex_unlock(unsigned long *m)
+{
+ int flags = FUTEX_WAKE;
+ if (!processes)
+ flags |= FUTEX_PRIVATE_FLAG;
+
+ /* 2: clear and fall through to the wake; old value 1: no waiter, done */
+ if (*m == 2)
+ *m = 0;
+ else if (xchg(m, 0) == 1)
+ return 0;
+
+ sys_futex(m, flags, 1, NULL, NULL, 0);
+
+ return 0;
+}
+
+static unsigned long *m1, *m2;
+
+/*
+ * Prepare the two lock words for the futex benchmark. With threads they are
+ * function-local statics; with processes they live in a SysV shared memory
+ * page so that forked children see the same words. Both locks are
+ * initialised to 0 and then taken before the benchmark bodies start.
+ */
+static void futex_setup(int cpu1, int cpu2)
+{
+ if (!processes) {
+ static unsigned long _m1, _m2;
+ m1 = &_m1;
+ m2 = &_m2;
+ } else {
+ int shmid;
+ void *shmaddr;
+
+ shmid = shmget(IPC_PRIVATE, getpagesize(), SHM_R | SHM_W);
+ if (shmid < 0) {
+ perror("shmget");
+ exit(1);
+ }
+
+ shmaddr = shmat(shmid, NULL, 0);
+ if (shmaddr == (char *)-1) {
+ perror("shmat");
+ shmctl(shmid, IPC_RMID, NULL);
+ exit(1);
+ }
+
+ /* mark the segment for removal now that it is attached */
+ shmctl(shmid, IPC_RMID, NULL);
+
+ m1 = shmaddr;
+ m2 = shmaddr + sizeof(*m1);
+ }
+
+ *m1 = 0;
+ *m2 = 0;
+
+ mutex_lock(m1);
+ mutex_lock(m2);
+}
+
+/*
+ * First half of the futex ping-pong. Arms the once-a-second SIGALRM
+ * reporting, then loops forever: take m2, release m1, and count two
+ * iterations per pass.
+ */
+static void *futex_thread1(void *arg)
+{
+ signal(SIGALRM, sigalrm_handler);
+ alarm(1);
+
+ while (1) {
+ mutex_lock(m2);
+ mutex_unlock(m1);
+
+ iterations += 2;
+ }
+
+ return NULL;
+}
+
+/* Second half of the futex ping-pong: release m2, then take m1, forever. */
+static void *futex_thread2(void *arg)
+{
+ while (1) {
+ mutex_unlock(m2);
+ mutex_lock(m1);
+ }
+
+ return NULL;
+}
+
+/* Benchmark variant selected by --test=futex. */
+static struct actions futex_actions = {
+ .setup = futex_setup,
+ .thread1 = futex_thread1,
+ .thread2 = futex_thread2,
+};
+
+/*
+ * Long options for getopt_long(). Entries with a non-NULL flag pointer store
+ * their value directly into that variable; "test" and "timeout" are returned
+ * to main() as 't' and 's'.
+ */
+static struct option options[] = {
+ { "test", required_argument, 0, 't' },
+ { "process", no_argument, &processes, 1 },
+ { "timeout", required_argument, 0, 's' },
+ { "vdso", no_argument, &touch_vdso, 1 },
+ { "no-fp", no_argument, &touch_fp, 0 },
+#ifdef __powerpc__
+ { "no-altivec", no_argument, &touch_altivec, 0 },
+#endif
+ { "no-vector", no_argument, &touch_vector, 0 },
+ { 0, },
+};
+
+/*
+ * Print the command line help to stderr. The program name matches the
+ * binary this file is built into (context_switch).
+ */
+static void usage(void)
+{
+	fprintf(stderr, "Usage: context_switch <options> CPU1 CPU2\n\n");
+	fprintf(stderr, "\t\t--test=X\tpipe, futex or yield (default)\n");
+	fprintf(stderr, "\t\t--process\tUse processes (default threads)\n");
+	fprintf(stderr, "\t\t--timeout=X\tDuration in seconds to run (default 30)\n");
+	fprintf(stderr, "\t\t--vdso\t\ttouch VDSO\n");
+	fprintf(stderr, "\t\t--no-fp\t\tDon't touch FP\n");
+#ifdef __powerpc__
+	fprintf(stderr, "\t\t--no-altivec\tDon't touch altivec\n");
+#endif
+	fprintf(stderr, "\t\t--no-vector\tDon't touch vector\n");
+}
+
+/*
+ * Entry point: parse the long options, choose the benchmark variant and
+ * whether to use threads or processes, pick the CPUs, then start the two
+ * benchmark bodies and sleep until a SIGUSR1 ends the process.
+ */
+int main(int argc, char *argv[])
+{
+ /* local c shadows the global vector c; signed so that -1 compares equal */
+ signed char c;
+ struct actions *actions = &yield_actions;
+ int cpu1;
+ int cpu2;
+ static void (*start_fn)(void *(*fn)(void *), void *arg, unsigned long cpu);
+
+ while (1) {
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "", options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 0:
+ /* flag-style option: getopt_long() already stored the value */
+ if (options[option_index].flag != 0)
+ break;
+
+ usage();
+ exit(1);
+ break;
+
+ case 't':
+ if (!strcmp(optarg, "pipe")) {
+ actions = &pipe_actions;
+ } else if (!strcmp(optarg, "yield")) {
+ actions = &yield_actions;
+ } else if (!strcmp(optarg, "futex")) {
+ actions = &futex_actions;
+ } else {
+ usage();
+ exit(1);
+ }
+ break;
+
+ case 's':
+ /* NOTE(review): 0 or a negative value is not rejected here */
+ timeout = atoi(optarg);
+ break;
+
+ default:
+ usage();
+ exit(1);
+ }
+ }
+
+ if (processes)
+ start_fn = start_process_on;
+ else
+ start_fn = start_thread_on;
+
+ /* without exactly two CPU arguments both bodies share one online CPU */
+ if (((argc - optind) != 2)) {
+ cpu1 = cpu2 = pick_online_cpu();
+ } else {
+ cpu1 = atoi(argv[optind++]);
+ cpu2 = atoi(argv[optind++]);
+ }
+
+ printf("Using %s with ", processes ? "processes" : "threads");
+
+ if (actions == &pipe_actions)
+ printf("pipe");
+ else if (actions == &yield_actions)
+ printf("yield");
+ else
+ printf("futex");
+
+ /* never touch a register class the hardware does not report */
+ if (!have_hwcap(PPC_FEATURE_HAS_ALTIVEC))
+ touch_altivec = 0;
+
+ if (!have_hwcap(PPC_FEATURE_HAS_VSX))
+ touch_vector = 0;
+
+ printf(" on cpus %d/%d touching FP:%s altivec:%s vector:%s vdso:%s\n",
+ cpu1, cpu2, touch_fp ? "yes" : "no", touch_altivec ? "yes" : "no",
+ touch_vector ? "yes" : "no", touch_vdso ? "yes" : "no");
+
+ /* Create a new process group so we can signal everyone for exit */
+ setpgid(getpid(), getpid());
+
+ signal(SIGUSR1, sigusr1_handler);
+
+ actions->setup(cpu1, cpu2);
+
+ start_fn(actions->thread1, NULL, cpu1);
+ start_fn(actions->thread2, NULL, cpu2);
+
+ /* the work happens in the started bodies; wait here until signalled */
+ while (1)
+ sleep(3600);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/exec_target.c b/tools/testing/selftests/powerpc/benchmarks/exec_target.c
new file mode 100644
index 000000000..c14b0fc1e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/exec_target.c
@@ -0,0 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Part of fork context switch microbenchmark.
+ *
+ * Copyright 2018, Anton Blanchard, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/*
+ * Program entry point (this file defines _start itself rather than main):
+ * exit immediately with status 0 through the raw exit system call.
+ */
+void _start(void)
+{
+ syscall(SYS_exit, 0);
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/fork.c b/tools/testing/selftests/powerpc/benchmarks/fork.c
new file mode 100644
index 000000000..d312e638c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/fork.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Context switch microbenchmark.
+ *
+ * Copyright 2018, Anton Blanchard, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/shm.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static unsigned int timeout = 30;
+
+static void set_cpu(int cpu)
+{
+	cpu_set_t mask;
+
+	/* A cpu of -1 means no pinning was requested: leave affinity alone */
+	if (cpu != -1) {
+		CPU_ZERO(&mask);
+		CPU_SET(cpu, &mask);
+
+		if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+			perror("sched_setaffinity");
+			exit(1);
+		}
+	}
+}
+
+static void start_process_on(void *(*fn)(void *), void *arg, int cpu)
+{
+	int child;
+
+	child = fork();
+	switch (child) {
+	case -1:
+		perror("fork");
+		exit(1);
+	case 0:
+		/* Child: pin to the requested CPU, run the workload, then quit */
+		set_cpu(cpu);
+		fn(arg);
+		exit(0);
+	default:
+		/* Parent: nothing else to do here, the child runs by itself */
+		return;
+	}
+}
+
+static int cpu;
+static int do_fork = 0;
+static int do_vfork = 0;
+static int do_exec = 0;
+static char *exec_file;
+static int exec_target = 0;
+static unsigned long iterations;
+static unsigned long iterations_prev;
+
+static void run_exec(void)
+{
+ char *const argv[] = { "./exec_target", NULL };
+
+ if (execve("./exec_target", argv, NULL) == -1) {
+ perror("execve");
+ exit(1);
+ }
+}
+
+static void bench_fork(void)
+{
+ while (1) {
+ pid_t pid = fork();
+ if (pid == -1) {
+ perror("fork");
+ exit(1);
+ }
+ if (pid == 0) {
+ if (do_exec)
+ run_exec();
+ _exit(0);
+ }
+ pid = waitpid(pid, NULL, 0);
+ if (pid == -1) {
+ perror("waitpid");
+ exit(1);
+ }
+ iterations++;
+ }
+}
+
+static void bench_vfork(void)
+{
+	while (1) {	/* never returns; the run ends when the process is signalled */
+		pid_t pid = vfork();
+		if (pid == -1) {
+			perror("vfork");	/* report the call that actually failed */
+			exit(1);
+		}
+		if (pid == 0) {
+			if (do_exec)
+				run_exec();
+			_exit(0);
+		}
+		pid = waitpid(pid, NULL, 0);
+		if (pid == -1) {
+			perror("waitpid");
+			exit(1);
+		}
+		iterations++;	/* one child created and reaped; read by sigalrm_handler() */
+	}
+}
+
+static void *null_fn(void *arg)
+{
+ pthread_exit(NULL);
+}
+
+static void bench_thread(void)
+{
+ pthread_t tid;
+ cpu_set_t cpuset;
+ pthread_attr_t attr;
+ int rc;
+
+ rc = pthread_attr_init(&attr);
+ if (rc) {
+ errno = rc;
+ perror("pthread_attr_init");
+ exit(1);
+ }
+
+ if (cpu != -1) {
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+
+ rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+ if (rc) {
+ errno = rc;
+ perror("pthread_attr_setaffinity_np");
+ exit(1);
+ }
+ }
+
+ while (1) {
+ rc = pthread_create(&tid, &attr, null_fn, NULL);
+ if (rc) {
+ errno = rc;
+ perror("pthread_create");
+ exit(1);
+ }
+ rc = pthread_join(tid, NULL);
+ if (rc) {
+ errno = rc;
+ perror("pthread_join");
+ exit(1);
+ }
+ iterations++;
+ }
+}
+
+static void sigalrm_handler(int junk)
+{
+	unsigned long i = iterations;	/* snapshot: the benchmark loop keeps counting */
+
+	printf("%lu\n", i - iterations_prev);	/* delta since the last tick; %lu matches unsigned long */
+	iterations_prev = i;
+
+	if (--timeout == 0)
+		kill(0, SIGUSR1);	/* pid 0 targets every process in our process group */
+
+	alarm(1);	/* re-arm for the next one-second report */
+}
+
+static void sigusr1_handler(int junk)
+{
+ exit(0);
+}
+
+static void *bench_proc(void *arg)
+{
+ signal(SIGALRM, sigalrm_handler);
+ alarm(1);
+
+ if (do_fork)
+ bench_fork();
+ else if (do_vfork)
+ bench_vfork();
+ else
+ bench_thread();
+
+ return NULL;
+}
+
+static struct option options[] = {
+ { "fork", no_argument, &do_fork, 1 },
+ { "vfork", no_argument, &do_vfork, 1 },
+ { "exec", no_argument, &do_exec, 1 },
+ { "timeout", required_argument, 0, 's' },
+ { "exec-target", no_argument, &exec_target, 1 },
+ { NULL },
+};
+
+static void usage(void)
+{
+ fprintf(stderr, "Usage: fork <options> CPU\n\n");
+ fprintf(stderr, "\t\t--fork\tUse fork() (default threads)\n");
+ fprintf(stderr, "\t\t--vfork\tUse vfork() (default threads)\n");
+ fprintf(stderr, "\t\t--exec\tAlso exec() (default no exec)\n");
+ fprintf(stderr, "\t\t--timeout=X\tDuration in seconds to run (default 30)\n");
+ fprintf(stderr, "\t\t--exec-target\tInternal option for exec workload\n");
+}
+
+int main(int argc, char *argv[])
+{
+ signed char c;
+
+ while (1) {
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "", options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 0:
+ if (options[option_index].flag != 0)
+ break;
+
+ usage();
+ exit(1);
+ break;
+
+ case 's':
+ timeout = atoi(optarg);
+ break;
+
+ default:
+ usage();
+ exit(1);
+ }
+ }
+
+ if (do_fork && do_vfork) {
+ usage();
+ exit(1);
+ }
+ if (do_exec && !do_fork && !do_vfork) {
+ usage();
+ exit(1);
+ }
+
+ if (do_exec) {
+ char *dirname = strdup(argv[0]);
+ int i;
+ i = strlen(dirname) - 1;
+ while (i) {
+ if (dirname[i] == '/') {
+ dirname[i] = '\0';
+ if (chdir(dirname) == -1) {
+ perror("chdir");
+ exit(1);
+ }
+ break;
+ }
+ i--;
+ }
+ }
+
+ if (exec_target) {
+ exit(0);
+ }
+
+ if (((argc - optind) != 1)) {
+ cpu = -1;
+ } else {
+ cpu = atoi(argv[optind++]);
+ }
+
+ if (do_exec)
+ exec_file = argv[0];
+
+ set_cpu(cpu);
+
+ printf("Using ");
+ if (do_fork)
+ printf("fork");
+ else if (do_vfork)
+ printf("vfork");
+ else
+ printf("clone");
+
+ if (do_exec)
+ printf(" + exec");
+
+ printf(" on cpu %d\n", cpu);
+
+ /* Create a new process group so we can signal everyone for exit */
+ setpgid(getpid(), getpid());
+
+ signal(SIGUSR1, sigusr1_handler);
+
+ start_process_on(bench_proc, NULL, cpu);
+
+ while (1)
+ sleep(3600);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/futex_bench.c b/tools/testing/selftests/powerpc/benchmarks/futex_bench.c
new file mode 100644
index 000000000..017057090
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/futex_bench.c
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2016, Anton Blanchard, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/futex.h>
+
+#include "utils.h"
+
+#define ITERATIONS 100000000
+
+#define futex(A, B, C, D, E, F) syscall(__NR_futex, A, B, C, D, E, F)
+
+int test_futex(void)
+{
+ struct timespec ts_start, ts_end;
+ unsigned long i = ITERATIONS;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+ while (i--) {
+ unsigned int addr = 0;
+ futex(&addr, FUTEX_WAKE, 1, NULL, NULL, 0);
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_end);
+
+ printf("time = %.6f\n", ts_end.tv_sec - ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9);
+
+ return 0;
+}
+
+int main(void)
+{
+ test_harness_set_timeout(300);
+ return test_harness(test_futex, "futex_bench");
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c
new file mode 100644
index 000000000..6b4156833
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/gettimeofday.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Anton Blanchard, IBM Corp.
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+
+#include "utils.h"
+
+static int test_gettimeofday(void)
+{
+ int i;
+
+ struct timeval tv_start, tv_end;
+
+ gettimeofday(&tv_start, NULL);
+
+ for(i = 0; i < 100000000; i++) {
+ gettimeofday(&tv_end, NULL);
+ }
+
+ printf("time = %.6f\n", tv_end.tv_sec - tv_start.tv_sec + (tv_end.tv_usec - tv_start.tv_usec) * 1e-6);
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(test_gettimeofday, "gettimeofday");
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c b/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c
new file mode 100644
index 000000000..2525adf64
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/mmap_bench.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2016, Anton Blanchard, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <getopt.h>
+
+#include "utils.h"
+
+#define ITERATIONS 5000000
+
+#define MEMSIZE (1UL << 27)
+#define PAGE_SIZE (1UL << 16)
+#define CHUNK_COUNT (MEMSIZE/PAGE_SIZE)
+
+static int pg_fault;
+static int iterations = ITERATIONS;
+
+static struct option options[] = {
+ { "pgfault", no_argument, &pg_fault, 1 },
+ { "iterations", required_argument, 0, 'i' },
+ { 0, },
+};
+
+static void usage(void)
+{
+ printf("mmap_bench <--pgfault> <--iterations count>\n");
+}
+
+int test_mmap(void)
+{
+	struct timespec ts_start, ts_end;
+	unsigned long i = iterations > 0 ? iterations : 0;	/* a negative --iterations must not wrap */
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+	while (i--) {
+		char *c = mmap(NULL, MEMSIZE, PROT_READ|PROT_WRITE,
+			       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+		FAIL_IF(c == MAP_FAILED);
+		if (pg_fault) {
+			unsigned long count;	/* write one byte in each PAGE_SIZE chunk of the mapping */
+			for (count = 0; count < CHUNK_COUNT; count++)
+				c[count * PAGE_SIZE] = 'c';
+		}
+		munmap(c, MEMSIZE);
+	}
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_end);
+
+	printf("time = %.6f\n", ts_end.tv_sec - ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9);
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ signed char c;
+ while (1) {
+ int option_index = 0;
+
+ c = getopt_long(argc, argv, "", options, &option_index);
+
+ if (c == -1)
+ break;
+
+ switch (c) {
+ case 0:
+ if (options[option_index].flag != 0)
+ break;
+
+ usage();
+ exit(1);
+ break;
+ case 'i':
+ iterations = atoi(optarg);
+ break;
+ default:
+ usage();
+ exit(1);
+ }
+ }
+
+ test_harness_set_timeout(300);
+ return test_harness(test_mmap, "mmap_bench");
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/null_syscall.c b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
new file mode 100644
index 000000000..579f0215c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/null_syscall.c
@@ -0,0 +1,153 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test null syscall performance
+ *
+ * Copyright (C) 2009-2015 Anton Blanchard, IBM
+ */
+
+#define NR_LOOPS 10000000
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <signal.h>
+
+static volatile int soak_done;
+unsigned long long clock_frequency;
+unsigned long long timebase_frequency;
+double timebase_multiplier;
+
+static inline unsigned long mftb(void)
+{
+ unsigned long low;
+
+ asm volatile("mftb %0" : "=r" (low));
+
+ return low;
+}
+
+static void sigalrm_handler(int unused)
+{
+ soak_done = 1;
+}
+
+/*
+ * Use a timer instead of busy looping on clock_gettime() so we don't
+ * pollute profiles with glibc and VDSO hits.
+ */
+static void cpu_soak_usecs(unsigned long usecs)
+{
+	struct itimerval val;
+
+	memset(&val, 0, sizeof(val));
+	val.it_value.tv_sec = usecs / 1000000;	/* setitimer() rejects tv_usec >= 1000000 */
+	val.it_value.tv_usec = usecs % 1000000;
+
+	soak_done = 0;	/* let the soak be run more than once */
+	signal(SIGALRM, sigalrm_handler);
+	/* Spin only when a timer is really pending, else nothing would stop us */
+	if (usecs && setitimer(ITIMER_REAL, &val, NULL) == 0)
+		while (!soak_done)
+			;
+
+	signal(SIGALRM, SIG_DFL);
+}
+
+/*
+ * This only works with recent kernels where cpufreq modifies
+ * /proc/cpuinfo dynamically.
+ */
+static void get_proc_frequency(void)
+{
+	FILE *f;
+	char line[128];
+	char *p, *end;
+	unsigned long long v;	/* same width as strtoull() and timebase_frequency */
+	double d;
+	char *override;
+
+	/* Report raw timebase ticks unless both frequencies turn out to be known */
+	timebase_multiplier = 1;
+
+	/* Try to get out of low power/low frequency mode */
+	cpu_soak_usecs(0.25 * 1000000);
+
+	f = fopen("/proc/cpuinfo", "r");
+	if (f == NULL)
+		return;
+
+	timebase_frequency = 0;
+
+	while (fgets(line, sizeof(line), f) != NULL) {
+		if (strncmp(line, "timebase", 8) == 0) {
+			p = strchr(line, ':');
+			if (p != NULL) {
+				v = strtoull(p + 1, &end, 0);
+				if (end != p + 1)	/* at least one digit was parsed */
+					timebase_frequency = v;
+			}
+		}
+
+		if (((strncmp(line, "clock", 5) == 0) ||
+		     (strncmp(line, "cpu MHz", 7) == 0))) {
+			p = strchr(line, ':');
+			if (p != NULL) {
+				d = strtod(p + 1, &end);
+				if (end != p + 1) {
+					/* Find fastest clock frequency */
+					if ((d * 1000000ULL) > clock_frequency)
+						clock_frequency = d * 1000000ULL;
+				}
+			}
+		}
+	}
+
+	fclose(f);
+
+	override = getenv("FREQUENCY");	/* user-supplied clock frequency wins over /proc/cpuinfo */
+	if (override)
+		clock_frequency = strtoull(override, NULL, 10);
+
+	if (timebase_frequency && clock_frequency)	/* otherwise keep the 1:1 fallback set above */
+		timebase_multiplier = (double)clock_frequency / timebase_frequency;
+}
+
+static void do_null_syscall(unsigned long nr)
+{
+	unsigned long left = nr;
+
+	while (left--)
+		getppid();	/* the "null" syscall being timed by main() */
+}
+
+#define TIME(A, STR) \
+
+int main(void)
+{
+ unsigned long tb_start, tb_now;
+ struct timespec tv_start, tv_now;
+ unsigned long long elapsed_ns, elapsed_tb;
+
+ get_proc_frequency();
+
+ clock_gettime(CLOCK_MONOTONIC, &tv_start);
+ tb_start = mftb();
+
+ do_null_syscall(NR_LOOPS);
+
+ clock_gettime(CLOCK_MONOTONIC, &tv_now);
+ tb_now = mftb();
+
+ elapsed_ns = (tv_now.tv_sec - tv_start.tv_sec) * 1000000000ULL +
+ (tv_now.tv_nsec - tv_start.tv_nsec);
+ elapsed_tb = tb_now - tb_start;
+
+ printf("%10.2f ns %10.2f cycles\n", (float)elapsed_ns / NR_LOOPS,
+ (float)elapsed_tb * timebase_multiplier / NR_LOOPS);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/powerpc/benchmarks/settings b/tools/testing/selftests/powerpc/benchmarks/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/powerpc/benchmarks/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/powerpc/cache_shape/.gitignore b/tools/testing/selftests/powerpc/cache_shape/.gitignore
new file mode 100644
index 000000000..b385eee30
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cache_shape
diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile
new file mode 100644
index 000000000..689f6c8eb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := cache_shape
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/cache_shape/cache_shape.c b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
new file mode 100644
index 000000000..171b6c948
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
@@ -0,0 +1,121 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corp.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#ifndef AT_L1I_CACHESIZE
+#define AT_L1I_CACHESIZE 40
+#define AT_L1I_CACHEGEOMETRY 41
+#define AT_L1D_CACHESIZE 42
+#define AT_L1D_CACHEGEOMETRY 43
+#define AT_L2_CACHESIZE 44
+#define AT_L2_CACHEGEOMETRY 45
+#define AT_L3_CACHESIZE 46
+#define AT_L3_CACHEGEOMETRY 47
+#endif
+
+static void print_size(const char *label, uint32_t val)
+{
+	printf("%s cache size: %#10x %10uB %10uK\n", label, val, val, val / 1024);	/* %u: val is unsigned */
+}
+
+static void print_geo(const char *label, uint32_t val)
+{
+	/* Low 16 bits are printed as the line size, high 16 bits as the ways */
+	uint16_t ways = val >> 16;
+
+	printf("%s line size: %#10x ", label, val & 0xFFFF);
+
+	if (ways == 0)
+		printf("fully");
+	else
+		printf("%u-way", ways);
+
+	printf(" associative\n");
+}
+
+static int test_cache_shape()
+{
+ static char buffer[4096];
+ ElfW(auxv_t) *p;
+ int found;
+
+ FAIL_IF(read_auxv(buffer, sizeof(buffer)));
+
+ found = 0;
+
+ p = find_auxv_entry(AT_L1I_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L1I ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1I_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L1I ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1D_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L1D ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1D_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L1D ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L2_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L2 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L2_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L2 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L3_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L3 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L3_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L3 ", (uint32_t)p->a_un.a_val);
+ }
+
+ /* If we found none we're probably on a system where they don't exist */
+ SKIP_IF(found == 0);
+
+ /* But if we found any, we expect to find them all */
+ FAIL_IF(found != 8);
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(test_cache_shape, "cache_shape");
+}
diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore b/tools/testing/selftests/powerpc/copyloops/.gitignore
new file mode 100644
index 000000000..994b11af7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+copyuser_64_t0
+copyuser_64_t1
+copyuser_64_t2
+copyuser_p7_t0
+copyuser_p7_t1
+memcpy_64_t0
+memcpy_64_t1
+memcpy_64_t2
+memcpy_p7_t0
+memcpy_p7_t1
+copyuser_64_exc_t0
+copyuser_64_exc_t1
+copyuser_64_exc_t2
+copy_mc_64
diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile
new file mode 100644
index 000000000..3095b1f1c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/Makefile
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0
+# The loops are all 64-bit code
+CFLAGS += -m64
+CFLAGS += -I$(CURDIR)
+CFLAGS += -D SELFTEST
+CFLAGS += -maltivec
+CFLAGS += -mcpu=power4
+
+# Use our CFLAGS for the implicit .S rule & set the asm machine type
+ASFLAGS = $(CFLAGS) -Wa,-mpower4
+
+TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \
+ copyuser_p7_t0 copyuser_p7_t1 \
+ memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \
+ memcpy_p7_t0 memcpy_p7_t1 copy_mc_64 \
+ copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2
+
+EXTRA_SOURCES := validate.c ../harness.c stubs.S
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES)
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test___copy_tofrom_user_base \
+ -D SELFTEST_CASE=$(subst copyuser_64_t,,$(notdir $@)) \
+ -o $@ $^
+
+$(OUTPUT)/copyuser_p7_t%: copyuser_power7.S $(EXTRA_SOURCES)
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test___copy_tofrom_user_power7 \
+ -D SELFTEST_CASE=$(subst copyuser_p7_t,,$(notdir $@)) \
+ -o $@ $^
+
+# Strictly speaking, we only need the memcpy_64 test cases for big-endian
+$(OUTPUT)/memcpy_64_t%: memcpy_64.S $(EXTRA_SOURCES)
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test_memcpy \
+ -D SELFTEST_CASE=$(subst memcpy_64_t,,$(notdir $@)) \
+ -o $@ $^
+
+$(OUTPUT)/memcpy_p7_t%: memcpy_power7.S $(EXTRA_SOURCES)
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test_memcpy_power7 \
+ -D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \
+ -o $@ $^
+
+$(OUTPUT)/copy_mc_64: copy_mc_64.S $(EXTRA_SOURCES)
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test_copy_mc_generic \
+ -o $@ $^
+
+$(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \
+ copy_tofrom_user_reference.S stubs.S
+ $(CC) $(CPPFLAGS) $(CFLAGS) \
+ -D COPY_LOOP=test___copy_tofrom_user_base \
+ -D SELFTEST_CASE=$(subst copyuser_64_exc_t,,$(notdir $@)) \
+ -o $@ $^
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h b/tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/asm-compat.h
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h b/tools/testing/selftests/powerpc/copyloops/asm/export.h
new file mode 100644
index 000000000..e6b80d5fb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h
@@ -0,0 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define EXPORT_SYMBOL(x)
+#define EXPORT_SYMBOL_GPL(x)
+#define EXPORT_SYMBOL_KASAN(x)
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h b/tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/feature-fixups.h
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/kasan.h b/tools/testing/selftests/powerpc/copyloops/asm/kasan.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/kasan.h
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
new file mode 100644
index 000000000..58c1cef3e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SELFTESTS_POWERPC_PPC_ASM_H
+#define __SELFTESTS_POWERPC_PPC_ASM_H
+#include <ppc-asm.h>
+
+#define CONFIG_ALTIVEC
+
+#define r1 1
+
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+#define _GLOBAL(A) FUNC_START(test_ ## A)
+#define _GLOBAL_TOC(A) _GLOBAL(A)
+#define _GLOBAL_TOC_KASAN(A) _GLOBAL(A)
+
+#define PPC_MTOCRF(A, B) mtocrf A, B
+
+#define EX_TABLE(x, y) \
+ .section __ex_table,"a"; \
+ .8byte x, y; \
+ .previous
+
+#define BEGIN_FTR_SECTION .if test_feature
+#define FTR_SECTION_ELSE .else
+#define ALT_FTR_SECTION_END_IFCLR(x) .endif
+#define ALT_FTR_SECTION_END_IFSET(x) .endif
+#define ALT_FTR_SECTION_END(x, y) .endif
+#define END_FTR_SECTION_IFCLR(x) .endif
+#define END_FTR_SECTION_IFSET(x) .endif
+
+/* Default to taking the first of any alternative feature sections */
+test_feature = 1
+
+#endif /* __SELFTESTS_POWERPC_PPC_ASM_H */
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/processor.h b/tools/testing/selftests/powerpc/copyloops/asm/processor.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/asm/processor.h
diff --git a/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S b/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S
new file mode 120000
index 000000000..dcbe06d50
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/copy_mc_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/copy_mc_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/copyloops/copy_tofrom_user_reference.S b/tools/testing/selftests/powerpc/copyloops/copy_tofrom_user_reference.S
new file mode 100644
index 000000000..3363b8640
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/copy_tofrom_user_reference.S
@@ -0,0 +1,24 @@
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copy_tofrom_user_reference)
+ cmpdi r5,0
+ beq 4f
+
+ mtctr r5
+
+1: lbz r6,0(r4)
+2: stb r6,0(r3)
+ addi r3,r3,1
+ addi r4,r4,1
+ bdnz 1b
+
+3: mfctr r3
+ blr
+
+4: mr r3,r5
+ blr
+
+.section __ex_table,"a"
+ .llong 1b,3b
+ .llong 2b,3b
+.text
diff --git a/tools/testing/selftests/powerpc/copyloops/copyuser_64.S b/tools/testing/selftests/powerpc/copyloops/copyuser_64.S
new file mode 120000
index 000000000..f1c418a25
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/copyuser_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/copyuser_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/copyloops/copyuser_power7.S b/tools/testing/selftests/powerpc/copyloops/copyuser_power7.S
new file mode 120000
index 000000000..478689598
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/copyuser_power7.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/copyuser_power7.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/copyloops/exc_validate.c b/tools/testing/selftests/powerpc/copyloops/exc_validate.c
new file mode 100644
index 000000000..c896ea9a7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/exc_validate.c
@@ -0,0 +1,124 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "utils.h"
+
+extern char __start___ex_table[];
+extern char __stop___ex_table[];
+
+#if defined(__powerpc64__)
+#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP]
+#elif defined(__powerpc__)
+#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP]
+#else
+#error implement UCONTEXT_NIA
+#endif
+
+static void segv_handler(int signr, siginfo_t *info, void *ptr)
+{
+ ucontext_t *uc = (ucontext_t *)ptr;
+ unsigned long addr = (unsigned long)info->si_addr; /* faulting data address, only used in the diagnostic */
+ unsigned long *ip = &UCONTEXT_NIA(uc); /* saved next-instruction address of the interrupted code */
+ unsigned long *ex_p = (unsigned long *)__start___ex_table;
+
+ while (ex_p < (unsigned long *)__stop___ex_table) { /* table is read as (insn, fixup) address pairs */
+ unsigned long insn, fixup;
+
+ insn = *ex_p++;
+ fixup = *ex_p++;
+
+ if (insn == *ip) {
+ *ip = fixup; /* redirect the saved NIA to the fixup address */
+ return;
+ }
+ }
+
+ printf("No exception table match for NIA %lx ADDR %lx\n", *ip, addr);
+ abort(); /* fault did not come from an instruction listed in the table */
+}
+
+static void setup_segv_handler(void)
+{
+ struct sigaction action;
+
+ memset(&action, 0, sizeof(action));
+ action.sa_sigaction = segv_handler;
+ action.sa_flags = SA_SIGINFO;
+ sigaction(SIGSEGV, &action, NULL);
+}
+
+unsigned long COPY_LOOP(void *to, const void *from, unsigned long size);
+unsigned long test_copy_tofrom_user_reference(void *to, const void *from, unsigned long size);
+
+static int total_passed;
+static int total_failed;
+
+static void do_one_test(char *dstp, char *srcp, unsigned long len)
+{
+ unsigned long got, expected;
+
+ got = COPY_LOOP(dstp, srcp, len);
+ expected = test_copy_tofrom_user_reference(dstp, srcp, len);
+
+ if (got != expected) {
+ total_failed++;
+ printf("FAIL from=%p to=%p len=%ld returned %ld, expected %ld\n",
+ srcp, dstp, len, got, expected);
+ //abort();
+ } else
+ total_passed++;
+}
+
+//#define MAX_LEN 512
+#define MAX_LEN 16
+
+int test_copy_exception(void)
+{
+ int page_size;
+ static char *p, *q;
+ unsigned long src, dst, len;
+
+ page_size = getpagesize();
+ p = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); /* two pages: the second becomes a guard page below */
+
+ if (p == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ memset(p, 0, page_size);
+
+ setup_segv_handler();
+
+ if (mprotect(p + page_size, page_size, PROT_NONE)) { /* any access to the second page now faults */
+ perror("mprotect");
+ exit(1);
+ }
+
+ q = p + page_size - MAX_LEN; /* last MAX_LEN bytes of the accessible page */
+
+ for (src = 0; src < MAX_LEN; src++) {
+ for (dst = 0; dst < MAX_LEN; dst++) {
+ for (len = 0; len < MAX_LEN+1; len++) {
+ // printf("from=%p to=%p len=%ld\n", q+src, q+dst, len);
+ do_one_test(q+dst, q+src, len); /* arguments are (to, from, len) */
+ }
+ }
+ }
+
+ printf("Totals:\n");
+ printf(" Pass: %d\n", total_passed);
+ printf(" Fail: %d\n", total_failed);
+
+ return 0; /* NOTE(review): returns success even when total_failed != 0 - confirm this is intended */
+}
+
+int main(void)
+{
+ return test_harness(test_copy_exception, str(COPY_LOOP));
+}
diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_64.S b/tools/testing/selftests/powerpc/copyloops/memcpy_64.S
new file mode 120000
index 000000000..cce33fb6f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/memcpy_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcpy_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_power7.S b/tools/testing/selftests/powerpc/copyloops/memcpy_power7.S
new file mode 120000
index 000000000..0d6fbfaf3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/memcpy_power7.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcpy_power7.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/copyloops/stubs.S b/tools/testing/selftests/powerpc/copyloops/stubs.S
new file mode 100644
index 000000000..ec8bcf2bf
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/stubs.S
@@ -0,0 +1,19 @@
+#include <asm/ppc_asm.h>
+
+FUNC_START(enter_vmx_usercopy)
+ li r3,1
+ blr
+
+FUNC_START(exit_vmx_usercopy)
+ li r3,0
+ blr
+
+FUNC_START(enter_vmx_ops)
+ li r3,1
+ blr
+
+FUNC_START(exit_vmx_ops)
+ blr
+
+FUNC_START(__copy_tofrom_user_base)
+ blr
diff --git a/tools/testing/selftests/powerpc/copyloops/validate.c b/tools/testing/selftests/powerpc/copyloops/validate.c
new file mode 100644
index 000000000..0f6873618
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/validate.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <malloc.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "utils.h"
+
+#define MAX_LEN 8192
+#define MAX_OFFSET 16
+#define MIN_REDZONE 128
+#define BUFLEN (MAX_LEN+MAX_OFFSET+2*MIN_REDZONE)
+#define POISON 0xa5
+
+unsigned long COPY_LOOP(void *to, const void *from, unsigned long size);
+
+static void do_one(char *src, char *dst, unsigned long src_off,
+ unsigned long dst_off, unsigned long len, void *redzone,
+ void *fill)
+{
+ char *srcp, *dstp;
+ unsigned long ret;
+ unsigned long i;
+
+ srcp = src + MIN_REDZONE + src_off;
+ dstp = dst + MIN_REDZONE + dst_off;
+
+ memset(src, POISON, BUFLEN);
+ memset(dst, POISON, BUFLEN);
+ memcpy(srcp, fill, len);
+
+ ret = COPY_LOOP(dstp, srcp, len);
+ if (ret && ret != (unsigned long)dstp) {
+ printf("(%p,%p,%ld) returned %ld\n", dstp, srcp, len, ret);
+ abort();
+ }
+
+ if (memcmp(dstp, srcp, len)) {
+ printf("(%p,%p,%ld) miscompare\n", dstp, srcp, len);
+ printf("src: ");
+ for (i = 0; i < len; i++)
+ printf("%02x ", srcp[i]);
+ printf("\ndst: ");
+ for (i = 0; i < len; i++)
+ printf("%02x ", dstp[i]);
+ printf("\n");
+ abort();
+ }
+
+ if (memcmp(dst, redzone, dstp - dst)) {
+ printf("(%p,%p,%ld) redzone before corrupted\n",
+ dstp, srcp, len);
+ abort();
+ }
+
+ if (memcmp(dstp+len, redzone, dst+BUFLEN-(dstp+len))) {
+ printf("(%p,%p,%ld) redzone after corrupted\n",
+ dstp, srcp, len);
+ abort();
+ }
+}
+
+int test_copy_loop(void)
+{
+ char *src, *dst, *redzone, *fill;
+ unsigned long len, src_off, dst_off;
+ unsigned long i;
+
+ src = memalign(BUFLEN, BUFLEN);
+ dst = memalign(BUFLEN, BUFLEN);
+ redzone = malloc(BUFLEN);
+ fill = malloc(BUFLEN);
+
+ if (!src || !dst || !redzone || !fill) {
+ fprintf(stderr, "malloc failed\n");
+ exit(1);
+ }
+
+ memset(redzone, POISON, BUFLEN);
+
+ /* Fill with sequential bytes */
+ for (i = 0; i < BUFLEN; i++)
+ fill[i] = i & 0xff;
+
+ for (len = 1; len < MAX_LEN; len++) {
+ for (src_off = 0; src_off < MAX_OFFSET; src_off++) {
+ for (dst_off = 0; dst_off < MAX_OFFSET; dst_off++) {
+ do_one(src, dst, src_off, dst_off, len,
+ redzone, fill);
+ }
+ }
+ }
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(test_copy_loop, str(COPY_LOOP));
+}
diff --git a/tools/testing/selftests/powerpc/dscr/.gitignore b/tools/testing/selftests/powerpc/dscr/.gitignore
new file mode 100644
index 000000000..1d08b15af
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/.gitignore
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+dscr_default_test
+dscr_explicit_test
+dscr_inherit_exec_test
+dscr_inherit_test
+dscr_sysfs_test
+dscr_sysfs_thread_test
+dscr_user_test
diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile
new file mode 100644
index 000000000..845db6273
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \
+ dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test \
+ dscr_sysfs_thread_test
+
+TEST_FILES := settings
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(OUTPUT)/dscr_default_test: LDLIBS += -lpthread
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/dscr/dscr.h b/tools/testing/selftests/powerpc/dscr/dscr.h
new file mode 100644
index 000000000..13e9b9e28
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr.h
@@ -0,0 +1,122 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * POWER Data Stream Control Register (DSCR)
+ *
+ * This header file contains helper functions and macros
+ * required for all the DSCR related test cases.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#ifndef _SELFTESTS_POWERPC_DSCR_DSCR_H
+#define _SELFTESTS_POWERPC_DSCR_DSCR_H
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+#include "utils.h"
+
+#define THREADS 100 /* Max threads */
+#define COUNT 100 /* Max iterations */
+#define DSCR_MAX 16 /* Max DSCR value */
+#define LEN_MAX 100 /* Max name length */
+
+#define DSCR_DEFAULT "/sys/devices/system/cpu/dscr_default"
+#define CPU_PATH "/sys/devices/system/cpu/"
+
+#define rmb() asm volatile("lwsync":::"memory")
+#define wmb() asm volatile("lwsync":::"memory")
+
+#define READ_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/* Privileged state DSCR access */
+static inline unsigned long get_dscr(void)
+{
+	unsigned long ret;
+
+	/* mfspr through the privileged-state SPR number (SPRN_DSCR_PRIV) */
+	asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_DSCR_PRIV));
+	return ret;
+}
+
+static inline void set_dscr(unsigned long val)
+{
+	asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR_PRIV));
+}
+
+/* Problem state DSCR access */
+static inline unsigned long get_dscr_usr(void)
+{
+	unsigned long ret;
+
+	/* mfspr through the problem-state SPR number (SPRN_DSCR) */
+	asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_DSCR));
+	return ret;
+}
+
+static inline void set_dscr_usr(unsigned long val)
+{
+	asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR));
+}
+
+/* Default DSCR access through sysfs; both helpers exit(1) on failure */
+unsigned long get_default_dscr(void)
+{
+	char buf[16] = "";
+	unsigned long val;
+	int fd, ret;
+
+	fd = open(DSCR_DEFAULT, O_RDONLY);
+	if (fd == -1) {
+		perror("open() failed");
+		exit(1);
+	}
+	/* Leave the final byte untouched so buf is always NUL-terminated */
+	ret = read(fd, buf, sizeof(buf) - 1);
+	if (ret == -1) {
+		perror("read() failed");
+		exit(1);
+	}
+	close(fd);
+	if (sscanf(buf, "%lx", &val) != 1) {
+		fprintf(stderr, "unable to parse %s\n", DSCR_DEFAULT);
+		exit(1);
+	}
+	return val;
+}
+
+/* Write @val to the DSCR_DEFAULT sysfs file; exit(1) on open/write failure */
+void set_default_dscr(unsigned long val)
+{
+	char buf[24];	/* room for 16 hex digits + '\n' + NUL */
+	int fd, ret;
+
+	fd = open(DSCR_DEFAULT, O_RDWR);
+	if (fd == -1) {
+		perror("open() failed");
+		exit(1);
+	}
+	/* Written as hex text, the same format get_default_dscr() parses */
+	snprintf(buf, sizeof(buf), "%lx\n", val);
+	ret = write(fd, buf, strlen(buf));
+	if (ret == -1) {
+		perror("write() failed");
+		exit(1);
+	}
+	close(fd);
+}
+
+double uniform_deviate(int seed)
+{
+ return seed * (1.0 / (RAND_MAX + 1.0));
+}
+#endif /* _SELFTESTS_POWERPC_DSCR_DSCR_H */
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_default_test.c b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
new file mode 100644
index 000000000..e76611e60
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_default_test.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) default test
+ *
+ * This test modifies the system wide default DSCR through
+ * its sysfs interface and then verifies that all threads
+ * see the correct changed DSCR value immediately.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+static unsigned long dscr; /* System DSCR default */
+static unsigned long sequence;
+static unsigned long result[THREADS];
+
+static void *do_test(void *in)
+{
+ unsigned long thread = (unsigned long)in;
+ unsigned long i;
+
+ for (i = 0; i < COUNT; i++) {
+ unsigned long d, cur_dscr, cur_dscr_usr;
+ unsigned long s1, s2;
+
+ s1 = READ_ONCE(sequence);
+ if (s1 & 1)
+ continue;
+ rmb();
+
+ d = dscr;
+ cur_dscr = get_dscr();
+ cur_dscr_usr = get_dscr_usr();
+
+ rmb();
+ s2 = sequence;
+
+ if (s1 != s2)
+ continue;
+
+ if (cur_dscr != d) {
+ fprintf(stderr, "thread %ld kernel DSCR should be %ld "
+ "but is %ld\n", thread, d, cur_dscr);
+ result[thread] = 1;
+ pthread_exit(&result[thread]);
+ }
+
+ if (cur_dscr_usr != d) {
+ fprintf(stderr, "thread %ld user DSCR should be %ld "
+ "but is %ld\n", thread, d, cur_dscr_usr);
+ result[thread] = 1;
+ pthread_exit(&result[thread]);
+ }
+ }
+ result[thread] = 0;
+ pthread_exit(&result[thread]);
+}
+
+int dscr_default(void)
+{
+ pthread_t threads[THREADS];
+ unsigned long i, *status[THREADS];
+ unsigned long orig_dscr_default;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ orig_dscr_default = get_default_dscr();
+
+ /* Initial DSCR default */
+ dscr = 1;
+ set_default_dscr(dscr);
+
+ /* Spawn all testing threads */
+ for (i = 0; i < THREADS; i++) {
+ if (pthread_create(&threads[i], NULL, do_test, (void *)i)) {
+ perror("pthread_create() failed");
+ goto fail;
+ }
+ }
+
+ srand(getpid());
+
+ /* Keep changing the DSCR default */
+ for (i = 0; i < COUNT; i++) {
+ double ret = uniform_deviate(rand());
+
+ if (ret < 0.0001) {
+ sequence++;
+ wmb();
+
+ dscr++;
+ if (dscr > DSCR_MAX)
+ dscr = 0;
+
+ set_default_dscr(dscr);
+
+ wmb();
+ sequence++;
+ }
+ }
+
+ /* Individual testing thread exit status */
+ for (i = 0; i < THREADS; i++) {
+ if (pthread_join(threads[i], (void **)&(status[i]))) {
+ perror("pthread_join() failed");
+ goto fail;
+ }
+
+ if (*status[i]) {
+ printf("%ldth thread failed to join with %ld status\n",
+ i, *status[i]);
+ goto fail;
+ }
+ }
+ set_default_dscr(orig_dscr_default);
+ return 0;
+fail:
+ set_default_dscr(orig_dscr_default);
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_default, "dscr_default_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
new file mode 100644
index 000000000..32fcf2b32
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_explicit_test.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) explicit test
+ *
+ * This test modifies the DSCR value using mtspr instruction and
+ * verifies the change with mfspr instruction. It uses both the
+ * privilege state SPR and the problem state SPR for this purpose.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are privileged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be executed
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+int dscr_explicit(void)
+{
+ unsigned long i, dscr = 0;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ srand(getpid());
+ set_dscr(dscr);
+
+ for (i = 0; i < COUNT; i++) {
+ unsigned long cur_dscr, cur_dscr_usr;
+ double ret = uniform_deviate(rand());
+
+ if (ret < 0.001) {
+ dscr++;
+ if (dscr > DSCR_MAX)
+ dscr = 0;
+
+ set_dscr(dscr);
+ }
+
+ cur_dscr = get_dscr();
+ if (cur_dscr != dscr) {
+ fprintf(stderr, "Kernel DSCR should be %ld but "
+ "is %ld\n", dscr, cur_dscr);
+ return 1;
+ }
+
+ ret = uniform_deviate(rand());
+ if (ret < 0.001) {
+ dscr++;
+ if (dscr > DSCR_MAX)
+ dscr = 0;
+
+ set_dscr_usr(dscr);
+ }
+
+ cur_dscr_usr = get_dscr_usr();
+ if (cur_dscr_usr != dscr) {
+ fprintf(stderr, "User DSCR should be %ld but "
+ "is %ld\n", dscr, cur_dscr_usr);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_explicit, "dscr_explicit_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
new file mode 100644
index 000000000..c6a81b2d6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_exec_test.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) fork exec test
+ *
+ * This testcase modifies the DSCR using mtspr, forks & execs and
+ * verifies that the child is using the changed DSCR using mfspr.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are privileged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be executed
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+static char *prog;
+
+static void do_exec(unsigned long parent_dscr)
+{
+ unsigned long cur_dscr, cur_dscr_usr;
+
+ cur_dscr = get_dscr();
+ cur_dscr_usr = get_dscr_usr();
+
+ if (cur_dscr != parent_dscr) {
+ fprintf(stderr, "Parent DSCR %ld was not inherited "
+ "over exec (kernel value)\n", parent_dscr);
+ exit(1);
+ }
+
+ if (cur_dscr_usr != parent_dscr) {
+ fprintf(stderr, "Parent DSCR %ld was not inherited "
+ "over exec (user value)\n", parent_dscr);
+ exit(1);
+ }
+ exit(0);
+}
+
+int dscr_inherit_exec(void)
+{
+ unsigned long i, dscr = 0;
+ pid_t pid;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ for (i = 0; i < COUNT; i++) {
+ dscr++;
+ if (dscr > DSCR_MAX)
+ dscr = 0;
+
+ if (dscr == get_default_dscr())
+ continue;
+
+ if (i % 2 == 0)
+ set_dscr_usr(dscr);
+ else
+ set_dscr(dscr);
+
+ pid = fork();
+ if (pid == -1) {
+ perror("fork() failed");
+ exit(1);
+ } else if (pid) {
+ int status;
+
+ if (waitpid(pid, &status, 0) == -1) {
+ perror("waitpid() failed");
+ exit(1);
+ }
+
+ if (!WIFEXITED(status)) {
+ fprintf(stderr, "Child didn't exit cleanly\n");
+ exit(1);
+ }
+
+ if (WEXITSTATUS(status) != 0) {
+ fprintf(stderr, "Child didn't exit cleanly\n");
+ return 1;
+ }
+ } else {
+ char dscr_str[16];
+
+ sprintf(dscr_str, "%ld", dscr);
+ execlp(prog, prog, "exec", dscr_str, NULL);
+ exit(1);
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ if (argc == 3 && !strcmp(argv[1], "exec")) {
+ unsigned long parent_dscr;
+
+ parent_dscr = atoi(argv[2]);
+ do_exec(parent_dscr);
+ } else if (argc != 1) {
+ fprintf(stderr, "Usage: %s\n", argv[0]);
+ exit(1);
+ }
+
+ prog = argv[0];
+ return test_harness(dscr_inherit_exec, "dscr_inherit_exec_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
new file mode 100644
index 000000000..f9dfd3d3c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_inherit_test.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) fork test
+ *
+ * This testcase modifies the DSCR using mtspr, forks and then
+ * verifies that the child process has the correct changed DSCR
+ * value using mfspr.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are privileged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be executed
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2012, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+int dscr_inherit(void)
+{
+ unsigned long i, dscr = 0;
+ pid_t pid;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ srand(getpid());
+ set_dscr(dscr);
+
+ for (i = 0; i < COUNT; i++) {
+ unsigned long cur_dscr, cur_dscr_usr;
+
+ dscr++;
+ if (dscr > DSCR_MAX)
+ dscr = 0;
+
+ if (i % 2 == 0)
+ set_dscr_usr(dscr);
+ else
+ set_dscr(dscr);
+
+ pid = fork();
+ if (pid == -1) {
+ perror("fork() failed");
+ exit(1);
+ } else if (pid) {
+ int status;
+
+ if (waitpid(pid, &status, 0) == -1) {
+ perror("waitpid() failed");
+ exit(1);
+ }
+
+ if (!WIFEXITED(status)) {
+ fprintf(stderr, "Child didn't exit cleanly\n");
+ exit(1);
+ }
+
+ if (WEXITSTATUS(status) != 0) {
+ fprintf(stderr, "Child didn't exit cleanly\n");
+ return 1;
+ }
+ } else {
+ cur_dscr = get_dscr();
+ if (cur_dscr != dscr) {
+ fprintf(stderr, "Kernel DSCR should be %ld "
+ "but is %ld\n", dscr, cur_dscr);
+ exit(1);
+ }
+
+ cur_dscr_usr = get_dscr_usr();
+ if (cur_dscr_usr != dscr) {
+ fprintf(stderr, "User DSCR should be %ld "
+ "but is %ld\n", dscr, cur_dscr_usr);
+ exit(1);
+ }
+ exit(0);
+ }
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_inherit, "dscr_inherit_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
new file mode 100644
index 000000000..f20d1c166
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_test.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) sysfs interface test
+ *
+ * This test updates the system wide DSCR default through the sysfs interface
+ * and then verifies, through their own sysfs interfaces, that all the CPU
+ * specific DSCR defaults are updated as well.
+ *
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+/* Return 0 if the hex value read from @file equals @val, 1 on mismatch or I/O error */
+static int check_cpu_dscr_default(char *file, unsigned long val)
+{
+	char buf[10];
+	int fd, rc;
+
+	fd = open(file, O_RDWR);
+	if (fd == -1) {
+		perror("open() failed");
+		return 1;
+	}
+	/* Reserve one byte so that buf[rc] = '\0' below stays in bounds */
+	rc = read(fd, buf, sizeof(buf) - 1);
+	if (rc == -1) {
+		perror("read() failed");
+		close(fd);
+		return 1;
+	}
+	close(fd);
+	buf[rc] = '\0';
+	if (strtol(buf, NULL, 16) != val) {
+		printf("DSCR match failed: %ld (system) %ld (cpu)\n",
+			val, strtol(buf, NULL, 16));
+		return 1;
+	}
+	return 0;
+}
+
+static int check_all_cpu_dscr_defaults(unsigned long val)
+{
+ DIR *sysfs;
+ struct dirent *dp;
+ char file[LEN_MAX];
+
+ sysfs = opendir(CPU_PATH);
+ if (!sysfs) {
+ perror("opendir() failed");
+ return 1;
+ }
+
+ while ((dp = readdir(sysfs))) {
+ int len;
+
+ if (!(dp->d_type & DT_DIR))
+ continue;
+ if (!strcmp(dp->d_name, "cpuidle"))
+ continue;
+ if (!strstr(dp->d_name, "cpu"))
+ continue;
+
+ len = snprintf(file, LEN_MAX, "%s%s/dscr", CPU_PATH, dp->d_name);
+ if (len >= LEN_MAX)
+ continue;
+ if (access(file, F_OK))
+ continue;
+
+ if (check_cpu_dscr_default(file, val)) {
+ closedir(sysfs);
+ return 1;
+ }
+ }
+ closedir(sysfs);
+ return 0;
+}
+
+int dscr_sysfs(void)
+{
+ unsigned long orig_dscr_default;
+ int i, j;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ orig_dscr_default = get_default_dscr();
+ for (i = 0; i < COUNT; i++) {
+ for (j = 0; j < DSCR_MAX; j++) {
+ set_default_dscr(j);
+ if (check_all_cpu_dscr_defaults(j))
+ goto fail;
+ }
+ }
+ set_default_dscr(orig_dscr_default);
+ return 0;
+fail:
+ set_default_dscr(orig_dscr_default);
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_sysfs, "dscr_sysfs_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
new file mode 100644
index 000000000..191ed126f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_sysfs_thread_test.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) sysfs thread test
+ *
+ * This test updates the system wide DSCR default value through
+ * sysfs interface which should then update all the CPU specific
+ * DSCR default values which must also be then visible to threads
+ * executing on individual CPUs on the system.
+ *
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#define _GNU_SOURCE
+#include "dscr.h"
+
+static int test_thread_dscr(unsigned long val)
+{
+ unsigned long cur_dscr, cur_dscr_usr;
+
+ cur_dscr = get_dscr();
+ cur_dscr_usr = get_dscr_usr();
+
+ if (val != cur_dscr) {
+ printf("[cpu %d] Kernel DSCR should be %ld but is %ld\n",
+ sched_getcpu(), val, cur_dscr);
+ return 1;
+ }
+
+ if (val != cur_dscr_usr) {
+ printf("[cpu %d] User DSCR should be %ld but is %ld\n",
+ sched_getcpu(), val, cur_dscr_usr);
+ return 1;
+ }
+ return 0;
+}
+
+static int check_cpu_dscr_thread(unsigned long val)
+{
+ cpu_set_t mask;
+ int cpu;
+
+ for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask))
+ continue;
+
+ if (test_thread_dscr(val))
+ return 1;
+ }
+ return 0;
+
+}
+
+int dscr_sysfs_thread(void)
+{
+ unsigned long orig_dscr_default;
+ int i, j;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ orig_dscr_default = get_default_dscr();
+ for (i = 0; i < COUNT; i++) {
+ for (j = 0; j < DSCR_MAX; j++) {
+ set_default_dscr(j);
+ if (check_cpu_dscr_thread(j))
+ goto fail;
+ }
+ }
+ set_default_dscr(orig_dscr_default);
+ return 0;
+fail:
+ set_default_dscr(orig_dscr_default);
+ return 1;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_sysfs_thread, "dscr_sysfs_thread_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/dscr_user_test.c b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
new file mode 100644
index 000000000..e09072446
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/dscr_user_test.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * POWER Data Stream Control Register (DSCR) SPR test
+ *
+ * This test modifies the DSCR value with the mtspr instruction
+ * through each of the two SPR numbers and then makes sure that
+ * the same is reflected through the mfspr instruction using
+ * either of the SPR numbers.
+ *
+ * When using the privilege state SPR, the instructions such as
+ * mfspr or mtspr are privileged and the kernel emulates them
+ * for us. Instructions using problem state SPR can be executed
+ * directly without any emulation if the HW supports them. Else
+ * they also get emulated by the kernel.
+ *
+ * Copyright 2013, Anton Blanchard, IBM Corporation.
+ * Copyright 2015, Anshuman Khandual, IBM Corporation.
+ */
+#include "dscr.h"
+
+static int check_dscr(char *str)
+{
+ unsigned long cur_dscr, cur_dscr_usr;
+
+ cur_dscr = get_dscr();
+ cur_dscr_usr = get_dscr_usr();
+ if (cur_dscr != cur_dscr_usr) {
+ printf("%s set, kernel get %lx != user get %lx\n",
+ str, cur_dscr, cur_dscr_usr);
+ return 1;
+ }
+ return 0;
+}
+
+int dscr_user(void)
+{
+ int i;
+
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_DSCR));
+
+ check_dscr("");
+
+ for (i = 0; i < COUNT; i++) {
+ set_dscr(i);
+ if (check_dscr("kernel"))
+ return 1;
+ }
+
+ for (i = 0; i < COUNT; i++) {
+ set_dscr_usr(i);
+ if (check_dscr("user"))
+ return 1;
+ }
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(dscr_user, "dscr_user_test");
+}
diff --git a/tools/testing/selftests/powerpc/dscr/settings b/tools/testing/selftests/powerpc/dscr/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/powerpc/dscr/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile
new file mode 100644
index 000000000..b397babd5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+ $(MAKE) -C ../
+
+TEST_PROGS := eeh-basic.sh
+TEST_FILES := eeh-functions.sh
+
+top_srcdir = ../../../../..
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
new file mode 100755
index 000000000..64779f073
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -0,0 +1,90 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+KSELFTESTS_SKIP=4
+
+. ./eeh-functions.sh
+
+if ! eeh_supported ; then
+ echo "EEH not supported on this system, skipping"
+ exit $KSELFTESTS_SKIP;
+fi
+
+if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
+   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
+	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
+	exit $KSELFTESTS_SKIP;
+fi # eeh_one_dev writes to both files, so skip when either one is absent
+
+pre_lspci=`mktemp`
+lspci > $pre_lspci
+
+# Bump the max freeze count to something absurd so we don't
+# trip over it while breaking things.
+echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
+
+# record the devices that we break in here. Assuming everything
+# goes to plan we should get them back once the recover process
+# is finished.
+devices=""
+
+# Build up a list of candidate devices.
+for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
+ # skip bridges since we can't recover them (yet...)
+ if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
+ echo "$dev, Skipped: bridge"
+ continue;
+ fi
+
+ # Skip VFs for now since we don't have a reliable way
+ # to break them.
+ if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
+ echo "$dev, Skipped: virtfn"
+ continue;
+ fi
+
+ if [ "ahci" = "$(basename $(realpath /sys/bus/pci/devices/$dev/driver))" ] ; then
+ echo "$dev, Skipped: ahci doesn't support recovery"
+ continue
+ fi
+
+ # Don't inject errors into an already-frozen PE. This happens with
+ # PEs that contain multiple PCI devices (e.g. multi-function cards)
+ # and injecting new errors during the recovery process will probably
+ # result in the recovery failing and the device being marked as
+ # failed.
+ if ! pe_ok $dev ; then
+ echo "$dev, Skipped: Bad initial PE state"
+ continue;
+ fi
+
+ echo "$dev, Added"
+
+ # Add to the list of devices to check
+ devices="$devices $dev"
+done
+
+dev_count="$(echo $devices | wc -w)"
+echo "Found ${dev_count} breakable devices..."
+
+failed=0
+for dev in $devices ; do
+ echo "Breaking $dev..."
+
+ if ! pe_ok $dev ; then
+ echo "Skipping $dev, Initial PE state is not ok"
+ failed="$((failed + 1))"
+ continue;
+ fi
+
+ if ! eeh_one_dev $dev ; then
+ failed="$((failed + 1))"
+ fi
+done
+
+echo "$failed devices failed to recover ($dev_count tested)"
+lspci | diff -u $pre_lspci -
+rm -f $pre_lspci
+
+test "$failed" -eq 0
+exit $?
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
new file mode 100755
index 000000000..00dc32c0e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -0,0 +1,85 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+pe_ok() {
+ local dev="$1"
+ local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
+
+ # if a driver doesn't support the error handling callbacks then the
+ # device is recovered by removing and re-probing it. This causes the
+ # sysfs directory to disappear so read the PE state once and squash
+ # any potential error messages
+ local eeh_state="$(cat $path 2>/dev/null)"
+ if [ -z "$eeh_state" ]; then
+ return 1;
+ fi
+
+ local fw_state="$(echo $eeh_state | cut -d' ' -f1)"
+ local sw_state="$(echo $eeh_state | cut -d' ' -f2)"
+
+ # If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
+ # error state or being recovered. Either way, not ok.
+ if [ "$((sw_state & 0x3))" -ne 0 ] ; then
+ return 1
+ fi
+
+ # A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
+ # EEH_STATE_DMA_ACTIVE flags set. For some goddamn stupid reason
+ # the platform backends set these when the PE is in reset. The
+ # RECOVERING check above should stop any false positives though.
+ if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
+ return 1
+ fi
+
+ return 0;
+}
+
+eeh_supported() {
+ test -e /proc/powerpc/eeh && \
+ grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
+}
+
+eeh_one_dev() {
+ local dev="$1"
+
+ # Using this function from the command line is sometimes useful for
+ # testing so check that the argument is a well-formed sysfs device
+ # name.
+ if ! test -e /sys/bus/pci/devices/$dev/ ; then
+ echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
+ return 1;
+ fi
+
+ # Break it
+ echo $dev >/sys/kernel/debug/powerpc/eeh_dev_break
+
+ # Force an EEH device check. If the kernel has already
+ # noticed the EEH (due to a driver poll or whatever), this
+ # is a no-op.
+ echo $dev >/sys/kernel/debug/powerpc/eeh_dev_check
+
+ # Default to a 60s timeout when waiting for a device to recover. This
+ # is an arbitrary default which can be overridden by setting the
+ # EEH_MAX_WAIT environment variable when required.
+
+ # The current record holder for longest recovery time is:
+ # "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds
+ max_wait=${EEH_MAX_WAIT:=60}
+
+ for i in `seq 0 ${max_wait}` ; do
+ if pe_ok $dev ; then
+ break;
+ fi
+ echo "$dev, waited $i/${max_wait}"
+ sleep 1
+ done
+
+ if ! pe_ok $dev ; then
+ echo "$dev, Failed to recover!"
+ return 1;
+ fi
+
+ echo "$dev, Recovered after $i seconds"
+ return 0;
+}
+
diff --git a/tools/testing/selftests/powerpc/harness.c b/tools/testing/selftests/powerpc/harness.c
new file mode 100644
index 000000000..0ad4f12b3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/harness.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <link.h>
+#include <sys/stat.h>
+
+#include "subunit.h"
+#include "utils.h"
+
+#define KILL_TIMEOUT 5
+
+/* Setting timeout to -1 disables the alarm */
+static uint64_t timeout = 120;
+
+int run_test(int (test_function)(void), char *name)
+{
+ bool terminated;
+ int rc, status;
+ pid_t pid;
+
+ /* Make sure output is flushed before forking */
+ fflush(stdout);
+
+ pid = fork();
+ if (pid == 0) {
+ setpgid(0, 0);
+ exit(test_function());
+ } else if (pid == -1) {
+ perror("fork");
+ return 1;
+ }
+
+ setpgid(pid, pid);
+
+ if (timeout != -1)
+ /* Wake us up in timeout seconds */
+ alarm(timeout);
+ terminated = false;
+
+wait:
+ rc = waitpid(pid, &status, 0);
+ if (rc == -1) {
+ if (errno != EINTR) {
+ printf("unknown error from waitpid\n");
+ return 1;
+ }
+
+ if (terminated) {
+ printf("!! force killing %s\n", name);
+ kill(-pid, SIGKILL);
+ return 1;
+ } else {
+ printf("!! killing %s\n", name);
+ kill(-pid, SIGTERM);
+ terminated = true;
+ alarm(KILL_TIMEOUT);
+ goto wait;
+ }
+ }
+
+ /* Kill anything else in the process group that is still running */
+ kill(-pid, SIGTERM);
+
+ if (WIFEXITED(status))
+ status = WEXITSTATUS(status);
+ else {
+ if (WIFSIGNALED(status))
+ printf("!! child died by signal %d\n", WTERMSIG(status));
+ else
+ printf("!! child died by unknown cause\n");
+
+ status = 1; /* Signal or other */
+ }
+
+ return status;
+}
+
+static void sig_handler(int signum)
+{
+ /* Just wake us up from waitpid */
+}
+
+static struct sigaction sig_action = {
+ .sa_handler = sig_handler,
+};
+
+void test_harness_set_timeout(uint64_t time)
+{
+ timeout = time;
+}
+
+int test_harness(int (test_function)(void), char *name)
+{
+ int rc;
+
+ test_start(name);
+ test_set_git_version(GIT_VERSION);
+
+ if (sigaction(SIGINT, &sig_action, NULL)) {
+ perror("sigaction (sigint)");
+ test_error(name);
+ return 1;
+ }
+
+ if (sigaction(SIGALRM, &sig_action, NULL)) {
+ perror("sigaction (sigalrm)");
+ test_error(name);
+ return 1;
+ }
+
+ rc = run_test(test_function, name);
+
+ if (rc == MAGIC_SKIP_RETURN_VALUE) {
+ test_skip(name);
+ /* so that skipped test is not marked as failed */
+ rc = 0;
+ } else
+ test_finish(name, rc);
+
+ return rc;
+}
diff --git a/tools/testing/selftests/powerpc/include/basic_asm.h b/tools/testing/selftests/powerpc/include/basic_asm.h
new file mode 100644
index 000000000..886dc026f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/basic_asm.h
@@ -0,0 +1,74 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SELFTESTS_POWERPC_BASIC_ASM_H
+#define _SELFTESTS_POWERPC_BASIC_ASM_H
+
+#include <ppc-asm.h>
+#include <asm/unistd.h>
+
+#define LOAD_REG_IMMEDIATE(reg, expr) \
+ lis reg, (expr)@highest; \
+ ori reg, reg, (expr)@higher; \
+ rldicr reg, reg, 32, 31; \
+ oris reg, reg, (expr)@high; \
+ ori reg, reg, (expr)@l;
+
+/*
+ * Note: These macros assume that variables being stored on the stack are
+ * doublewords. While this is usually the case, it may not hold for every
+ * use case.
+ */
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define STACK_FRAME_MIN_SIZE 32
+#define STACK_FRAME_TOC_POS 24
+#define __STACK_FRAME_PARAM(_param) (32 + ((_param)*8))
+#define __STACK_FRAME_LOCAL(_num_params, _var_num) \
+ ((STACK_FRAME_PARAM(_num_params)) + ((_var_num)*8))
+#else
+#define STACK_FRAME_MIN_SIZE 112
+#define STACK_FRAME_TOC_POS 40
+#define __STACK_FRAME_PARAM(i) (48 + ((i)*8))
+
+/*
+ * Caveat: if a function passed more than 8 doublewords, the caller will have
+ * made more space... which would render the 112 incorrect.
+ */
+#define __STACK_FRAME_LOCAL(_num_params, _var_num) \
+ (112 + ((_var_num)*8))
+#endif
+
+/* Parameter x saved to the stack */
+#define STACK_FRAME_PARAM(var) __STACK_FRAME_PARAM(var)
+
+/* Local variable x saved to the stack after x parameters */
+#define STACK_FRAME_LOCAL(num_params, var) \
+ __STACK_FRAME_LOCAL(num_params, var)
+#define STACK_FRAME_LR_POS 16
+#define STACK_FRAME_CR_POS 8
+
+/*
+ * It is very important to note here that _extra is the extra amount of
+ * stack space needed. This space can be accessed using STACK_FRAME_PARAM()
+ * or STACK_FRAME_LOCAL() macros.
+ *
+ * r1 and r2 are not defined in ppc-asm.h (instead they are defined as sp
+ * and toc). Kernel programmers tend to prefer rX even for r1 and r2, hence
+ * %r1 and %r2. r0 is defined in ppc-asm.h and therefore %r0 gets
+ * preprocessed incorrectly, hence r0.
+ */
+#define PUSH_BASIC_STACK(_extra) \
+ mflr r0; \
+ std r0, STACK_FRAME_LR_POS(%r1); \
+ stdu %r1, -(_extra + STACK_FRAME_MIN_SIZE)(%r1); \
+ mfcr r0; \
+ stw r0, STACK_FRAME_CR_POS(%r1); \
+ std %r2, STACK_FRAME_TOC_POS(%r1);
+
+#define POP_BASIC_STACK(_extra) \
+ ld %r2, STACK_FRAME_TOC_POS(%r1); \
+ lwz r0, STACK_FRAME_CR_POS(%r1); \
+ mtcr r0; \
+ addi %r1, %r1, (_extra + STACK_FRAME_MIN_SIZE); \
+ ld r0, STACK_FRAME_LR_POS(%r1); \
+ mtlr r0;
+
+#endif /* _SELFTESTS_POWERPC_BASIC_ASM_H */
diff --git a/tools/testing/selftests/powerpc/include/fpu_asm.h b/tools/testing/selftests/powerpc/include/fpu_asm.h
new file mode 100644
index 000000000..58ac2ce33
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/fpu_asm.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_FPU_ASM_H
+#define _SELFTESTS_POWERPC_FPU_ASM_H
+#include "basic_asm.h"
+
+#define PUSH_FPU(stack_size) \
+ stfd f31,(stack_size + STACK_FRAME_MIN_SIZE)(%r1); \
+ stfd f30,(stack_size + STACK_FRAME_MIN_SIZE - 8)(%r1); \
+ stfd f29,(stack_size + STACK_FRAME_MIN_SIZE - 16)(%r1); \
+ stfd f28,(stack_size + STACK_FRAME_MIN_SIZE - 24)(%r1); \
+ stfd f27,(stack_size + STACK_FRAME_MIN_SIZE - 32)(%r1); \
+ stfd f26,(stack_size + STACK_FRAME_MIN_SIZE - 40)(%r1); \
+ stfd f25,(stack_size + STACK_FRAME_MIN_SIZE - 48)(%r1); \
+ stfd f24,(stack_size + STACK_FRAME_MIN_SIZE - 56)(%r1); \
+ stfd f23,(stack_size + STACK_FRAME_MIN_SIZE - 64)(%r1); \
+ stfd f22,(stack_size + STACK_FRAME_MIN_SIZE - 72)(%r1); \
+ stfd f21,(stack_size + STACK_FRAME_MIN_SIZE - 80)(%r1); \
+ stfd f20,(stack_size + STACK_FRAME_MIN_SIZE - 88)(%r1); \
+ stfd f19,(stack_size + STACK_FRAME_MIN_SIZE - 96)(%r1); \
+ stfd f18,(stack_size + STACK_FRAME_MIN_SIZE - 104)(%r1); \
+ stfd f17,(stack_size + STACK_FRAME_MIN_SIZE - 112)(%r1); \
+ stfd f16,(stack_size + STACK_FRAME_MIN_SIZE - 120)(%r1); \
+ stfd f15,(stack_size + STACK_FRAME_MIN_SIZE - 128)(%r1); \
+ stfd f14,(stack_size + STACK_FRAME_MIN_SIZE - 136)(%r1);
+
+#define POP_FPU(stack_size) \
+ lfd f31,(stack_size + STACK_FRAME_MIN_SIZE)(%r1); \
+ lfd f30,(stack_size + STACK_FRAME_MIN_SIZE - 8)(%r1); \
+ lfd f29,(stack_size + STACK_FRAME_MIN_SIZE - 16)(%r1); \
+ lfd f28,(stack_size + STACK_FRAME_MIN_SIZE - 24)(%r1); \
+ lfd f27,(stack_size + STACK_FRAME_MIN_SIZE - 32)(%r1); \
+ lfd f26,(stack_size + STACK_FRAME_MIN_SIZE - 40)(%r1); \
+ lfd f25,(stack_size + STACK_FRAME_MIN_SIZE - 48)(%r1); \
+ lfd f24,(stack_size + STACK_FRAME_MIN_SIZE - 56)(%r1); \
+ lfd f23,(stack_size + STACK_FRAME_MIN_SIZE - 64)(%r1); \
+ lfd f22,(stack_size + STACK_FRAME_MIN_SIZE - 72)(%r1); \
+ lfd f21,(stack_size + STACK_FRAME_MIN_SIZE - 80)(%r1); \
+ lfd f20,(stack_size + STACK_FRAME_MIN_SIZE - 88)(%r1); \
+ lfd f19,(stack_size + STACK_FRAME_MIN_SIZE - 96)(%r1); \
+ lfd f18,(stack_size + STACK_FRAME_MIN_SIZE - 104)(%r1); \
+ lfd f17,(stack_size + STACK_FRAME_MIN_SIZE - 112)(%r1); \
+ lfd f16,(stack_size + STACK_FRAME_MIN_SIZE - 120)(%r1); \
+ lfd f15,(stack_size + STACK_FRAME_MIN_SIZE - 128)(%r1); \
+ lfd f14,(stack_size + STACK_FRAME_MIN_SIZE - 136)(%r1);
+
+/*
+ * Careful calling this, it will 'clobber' fpu (by design)
+ * Don't call this from C
+ */
+FUNC_START(load_fpu)
+ lfd f14,0(r3)
+ lfd f15,8(r3)
+ lfd f16,16(r3)
+ lfd f17,24(r3)
+ lfd f18,32(r3)
+ lfd f19,40(r3)
+ lfd f20,48(r3)
+ lfd f21,56(r3)
+ lfd f22,64(r3)
+ lfd f23,72(r3)
+ lfd f24,80(r3)
+ lfd f25,88(r3)
+ lfd f26,96(r3)
+ lfd f27,104(r3)
+ lfd f28,112(r3)
+ lfd f29,120(r3)
+ lfd f30,128(r3)
+ lfd f31,136(r3)
+ blr
+FUNC_END(load_fpu)
+
+#endif /* _SELFTESTS_POWERPC_FPU_ASM_H */
diff --git a/tools/testing/selftests/powerpc/include/gpr_asm.h b/tools/testing/selftests/powerpc/include/gpr_asm.h
new file mode 100644
index 000000000..5db74f5c6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/gpr_asm.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_GPR_ASM_H
+#define _SELFTESTS_POWERPC_GPR_ASM_H
+
+#include "basic_asm.h"
+/* Store r14-r31 on the stack: r31 at top_pos(%r1) down to r14 at (top_pos - 136)(%r1). */
+#define __PUSH_NVREGS(top_pos) \
+ std r31,(top_pos)(%r1); \
+ std r30,(top_pos - 8)(%r1); \
+ std r29,(top_pos - 16)(%r1); \
+ std r28,(top_pos - 24)(%r1); \
+ std r27,(top_pos - 32)(%r1); \
+ std r26,(top_pos - 40)(%r1); \
+ std r25,(top_pos - 48)(%r1); \
+ std r24,(top_pos - 56)(%r1); \
+ std r23,(top_pos - 64)(%r1); \
+ std r22,(top_pos - 72)(%r1); \
+ std r21,(top_pos - 80)(%r1); \
+ std r20,(top_pos - 88)(%r1); \
+ std r19,(top_pos - 96)(%r1); \
+ std r18,(top_pos - 104)(%r1); \
+ std r17,(top_pos - 112)(%r1); \
+ std r16,(top_pos - 120)(%r1); \
+ std r15,(top_pos - 128)(%r1); \
+ std r14,(top_pos - 136)(%r1)
+/* Reload r14-r31 from the stack: r31 from top_pos(%r1) down to r14 from (top_pos - 136)(%r1). */
+#define __POP_NVREGS(top_pos) \
+ ld r31,(top_pos)(%r1); \
+ ld r30,(top_pos - 8)(%r1); \
+ ld r29,(top_pos - 16)(%r1); \
+ ld r28,(top_pos - 24)(%r1); \
+ ld r27,(top_pos - 32)(%r1); \
+ ld r26,(top_pos - 40)(%r1); \
+ ld r25,(top_pos - 48)(%r1); \
+ ld r24,(top_pos - 56)(%r1); \
+ ld r23,(top_pos - 64)(%r1); \
+ ld r22,(top_pos - 72)(%r1); \
+ ld r21,(top_pos - 80)(%r1); \
+ ld r20,(top_pos - 88)(%r1); \
+ ld r19,(top_pos - 96)(%r1); \
+ ld r18,(top_pos - 104)(%r1); \
+ ld r17,(top_pos - 112)(%r1); \
+ ld r16,(top_pos - 120)(%r1); \
+ ld r15,(top_pos - 128)(%r1); \
+ ld r14,(top_pos - 136)(%r1)
+
+#define PUSH_NVREGS(stack_size) \
+ __PUSH_NVREGS(stack_size + STACK_FRAME_MIN_SIZE)
+
+/* 18 NV FPU REGS */
+#define PUSH_NVREGS_BELOW_FPU(stack_size) \
+ __PUSH_NVREGS(stack_size + STACK_FRAME_MIN_SIZE - (18 * 8))
+
+#define POP_NVREGS(stack_size) \
+ __POP_NVREGS(stack_size + STACK_FRAME_MIN_SIZE)
+
+/* 18 NV FPU REGS */
+#define POP_NVREGS_BELOW_FPU(stack_size) \
+ __POP_NVREGS(stack_size + STACK_FRAME_MIN_SIZE - (18 * 8))
+
+/*
+ * Careful calling this, it will 'clobber' NVGPRs (by design)
+ * Don't call this from C
+ */
+FUNC_START(load_gpr)
+ ld r14,0(r3)
+ ld r15,8(r3)
+ ld r16,16(r3)
+ ld r17,24(r3)
+ ld r18,32(r3)
+ ld r19,40(r3)
+ ld r20,48(r3)
+ ld r21,56(r3)
+ ld r22,64(r3)
+ ld r23,72(r3)
+ ld r24,80(r3)
+ ld r25,88(r3)
+ ld r26,96(r3)
+ ld r27,104(r3)
+ ld r28,112(r3)
+ ld r29,120(r3)
+ ld r30,128(r3)
+ ld r31,136(r3)
+ blr
+FUNC_END(load_gpr)
+
+
+#endif /* _SELFTESTS_POWERPC_GPR_ASM_H */
diff --git a/tools/testing/selftests/powerpc/include/instructions.h b/tools/testing/selftests/powerpc/include/instructions.h
new file mode 100644
index 000000000..4efa6314b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/instructions.h
@@ -0,0 +1,146 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _SELFTESTS_POWERPC_INSTRUCTIONS_H
+#define _SELFTESTS_POWERPC_INSTRUCTIONS_H
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* This defines the "copy" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define __COPY(RA, RB, L) \
+ (0x7c00060c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10))
+#define COPY(RA, RB, L) \
+ .long __COPY((RA), (RB), (L))
+
+static inline void copy(void *i)
+{
+ asm volatile(str(COPY(0, %0, 0))";"
+ :
+ : "b" (i)
+ : "memory"
+ );
+}
+
+static inline void copy_first(void *i)
+{
+ asm volatile(str(COPY(0, %0, 1))";"
+ :
+ : "b" (i)
+ : "memory"
+ );
+}
+
+/* This defines the "paste" instruction from Power ISA 3.0 Book II, section 4.4. */
+#define __PASTE(RA, RB, L, RC) \
+ (0x7c00070c | (RA) << (31-15) | (RB) << (31-20) | (L) << (31-10) | (RC) << (31-31))
+#define PASTE(RA, RB, L, RC) \
+ .long __PASTE((RA), (RB), (L), (RC))
+
+static inline int paste(void *i)
+{
+ int cr;
+
+ asm volatile(str(PASTE(0, %1, 0, 0))";"
+ "mfcr %0;"
+ : "=r" (cr)
+ : "b" (i)
+ : "memory"
+ );
+ return cr;
+}
+/* Emit PASTE(0, i, L=1, RC=1) and return the CR image read by mfcr; RC=1 updates CR0. */
+static inline int paste_last(void *i)
+{
+ int cr;
+
+ asm volatile(str(PASTE(0, %1, 1, 1))";"
+ "mfcr %0;"
+ : "=r" (cr)
+ : "b" (i)
+ : "memory", "cr0"
+ );
+ return cr;
+}
+
+#define PPC_INST_COPY __COPY(0, 0, 0)
+#define PPC_INST_COPY_FIRST __COPY(0, 0, 1)
+#define PPC_INST_PASTE __PASTE(0, 0, 0, 0)
+#define PPC_INST_PASTE_LAST __PASTE(0, 0, 1, 1)
+
+/* This defines the prefixed load/store instructions */
+#ifdef __ASSEMBLY__
+# define stringify_in_c(...) __VA_ARGS__
+#else
+# define __stringify_in_c(...) #__VA_ARGS__
+# define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
+#endif
+
+#define __PPC_RA(a) (((a) & 0x1f) << 16)
+#define __PPC_RS(s) (((s) & 0x1f) << 21)
+#define __PPC_RT(t) __PPC_RS(t)
+#define __PPC_PREFIX_R(r) (((r) & 0x1) << 20)
+
+#define PPC_PREFIX_MLS 0x06000000
+#define PPC_PREFIX_8LS 0x04000000
+
+#define PPC_INST_LBZ 0x88000000
+#define PPC_INST_LHZ 0xa0000000
+#define PPC_INST_LHA 0xa8000000
+#define PPC_INST_LWZ 0x80000000
+#define PPC_INST_STB 0x98000000
+#define PPC_INST_STH 0xb0000000
+#define PPC_INST_STW 0x90000000
+#define PPC_INST_STD 0xf8000000
+#define PPC_INST_LFS 0xc0000000
+#define PPC_INST_LFD 0xc8000000
+#define PPC_INST_STFS 0xd0000000
+#define PPC_INST_STFD 0xd8000000
+
+#define PREFIX_MLS(instr, t, a, r, d) stringify_in_c(.balign 64, , 4;) \
+ stringify_in_c(.long PPC_PREFIX_MLS | \
+ __PPC_PREFIX_R(r) | \
+ (((d) >> 16) & 0x3ffff);) \
+ stringify_in_c(.long (instr) | \
+ __PPC_RT(t) | \
+ __PPC_RA(a) | \
+ ((d) & 0xffff);\n)
+
+#define PREFIX_8LS(instr, t, a, r, d) stringify_in_c(.balign 64, , 4;) \
+ stringify_in_c(.long PPC_PREFIX_8LS | \
+ __PPC_PREFIX_R(r) | \
+ (((d) >> 16) & 0x3ffff);) \
+ stringify_in_c(.long (instr) | \
+ __PPC_RT(t) | \
+ __PPC_RA(a) | \
+ ((d) & 0xffff);\n)
+
+/* Prefixed Integer Load/Store instructions */
+#define PLBZ(t, a, r, d) PREFIX_MLS(PPC_INST_LBZ, t, a, r, d)
+#define PLHZ(t, a, r, d) PREFIX_MLS(PPC_INST_LHZ, t, a, r, d)
+#define PLHA(t, a, r, d) PREFIX_MLS(PPC_INST_LHA, t, a, r, d)
+#define PLWZ(t, a, r, d) PREFIX_MLS(PPC_INST_LWZ, t, a, r, d)
+#define PLWA(t, a, r, d) PREFIX_8LS(0xa4000000, t, a, r, d)
+#define PLD(t, a, r, d) PREFIX_8LS(0xe4000000, t, a, r, d)
+#define PLQ(t, a, r, d) PREFIX_8LS(0xe0000000, t, a, r, d)
+#define PSTB(s, a, r, d) PREFIX_MLS(PPC_INST_STB, s, a, r, d)
+#define PSTH(s, a, r, d) PREFIX_MLS(PPC_INST_STH, s, a, r, d)
+#define PSTW(s, a, r, d) PREFIX_MLS(PPC_INST_STW, s, a, r, d)
+#define PSTD(s, a, r, d) PREFIX_8LS(0xf4000000, s, a, r, d)
+#define PSTQ(s, a, r, d) PREFIX_8LS(0xf0000000, s, a, r, d)
+
+/* Prefixed Floating-Point Load/Store Instructions */
+#define PLFS(frt, a, r, d) PREFIX_MLS(PPC_INST_LFS, frt, a, r, d)
+#define PLFD(frt, a, r, d) PREFIX_MLS(PPC_INST_LFD, frt, a, r, d)
+#define PSTFS(frs, a, r, d) PREFIX_MLS(PPC_INST_STFS, frs, a, r, d)
+#define PSTFD(frs, a, r, d) PREFIX_MLS(PPC_INST_STFD, frs, a, r, d)
+
+/* Prefixed VSX Load/Store Instructions */
+#define PLXSD(vrt, a, r, d) PREFIX_8LS(0xa8000000, vrt, a, r, d)
+#define PLXSSP(vrt, a, r, d) PREFIX_8LS(0xac000000, vrt, a, r, d)
+#define PLXV0(s, a, r, d) PREFIX_8LS(0xc8000000, s, a, r, d)
+#define PLXV1(s, a, r, d) PREFIX_8LS(0xcc000000, s, a, r, d)
+#define PSTXSD(vrs, a, r, d) PREFIX_8LS(0xb8000000, vrs, a, r, d)
+#define PSTXSSP(vrs, a, r, d) PREFIX_8LS(0xbc000000, vrs, a, r, d)
+#define PSTXV0(s, a, r, d) PREFIX_8LS(0xd8000000, s, a, r, d)
+#define PSTXV1(s, a, r, d) PREFIX_8LS(0xdc000000, s, a, r, d)
+
+#endif /* _SELFTESTS_POWERPC_INSTRUCTIONS_H */
diff --git a/tools/testing/selftests/powerpc/include/pkeys.h b/tools/testing/selftests/powerpc/include/pkeys.h
new file mode 100644
index 000000000..3312cb1b0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/pkeys.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_PKEYS_H
+#define _SELFTESTS_POWERPC_PKEYS_H
+
+#include <sys/mman.h>
+
+#include "reg.h"
+#include "utils.h"
+
+/*
+ * Older versions of libc use the Intel-specific access rights.
+ * Hence, override the definitions as they might be incorrect.
+ */
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x3
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#undef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE 0x4
+
+/* Older versions of libc do not define this */
+#ifndef SEGV_PKUERR
+#define SEGV_PKUERR 4
+#endif
+
+#define SI_PKEY_OFFSET 0x20
+
+#define __NR_pkey_mprotect 386
+#define __NR_pkey_alloc 384
+#define __NR_pkey_free 385
+
+#define PKEY_BITS_PER_PKEY 2
+#define NR_PKEYS 32
+#define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1)
+/* Read the Authority Mask Register (SPRN_AMR); static so each includer gets its own copy. */
+static inline unsigned long pkeyreg_get(void)
+{
+ return mfspr(SPRN_AMR);
+}
+/* Write amr to the AMR through set_amr(); static so the header-defined body always links. */
+static inline void pkeyreg_set(unsigned long amr)
+{
+ set_amr(amr);
+}
+
+void pkey_set_rights(int pkey, unsigned long rights)
+{
+ unsigned long amr, shift;
+
+ shift = (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+ amr = pkeyreg_get();
+ amr &= ~(PKEY_BITS_MASK << shift);
+ amr |= (rights & PKEY_BITS_MASK) << shift;
+ pkeyreg_set(amr);
+}
+
+int sys_pkey_mprotect(void *addr, size_t len, int prot, int pkey)
+{
+ return syscall(__NR_pkey_mprotect, addr, len, prot, pkey);
+}
+
+int sys_pkey_alloc(unsigned long flags, unsigned long rights)
+{
+ return syscall(__NR_pkey_alloc, flags, rights);
+}
+
+int sys_pkey_free(int pkey)
+{
+ return syscall(__NR_pkey_free, pkey);
+}
+
+int pkeys_unsupported(void)
+{
+ bool hash_mmu = false;
+ int pkey;
+
+ /* Protection keys are currently supported on Hash MMU only */
+ FAIL_IF(using_hash_mmu(&hash_mmu));
+ SKIP_IF(!hash_mmu);
+
+ /* Check if the system call is supported */
+ pkey = sys_pkey_alloc(0, 0);
+ SKIP_IF(pkey < 0);
+ sys_pkey_free(pkey);
+
+ return 0;
+}
+
+int siginfo_pkey(siginfo_t *si)
+{
+ /*
+ * In older versions of libc, siginfo_t does not have si_pkey as
+ * a member.
+ */
+#ifdef si_pkey
+ return si->si_pkey;
+#else
+ return *((int *)(((char *) si) + SI_PKEY_OFFSET));
+#endif
+}
+/* Render rights as an "rwx"-style string in a per-call-site static buffer, rebuilt on every use. */
+#define pkey_rights(r) ({ \
+ static char buf[4]; \
+ unsigned int amr_bits; \
+ amr_bits = (r) & PKEY_BITS_MASK; \
+ /* every character is rewritten so an earlier '-' never leaks into a later call */ \
+ buf[0] = (amr_bits & PKEY_DISABLE_ACCESS & ~PKEY_DISABLE_WRITE) ? \
+ '-' : 'r'; \
+ buf[1] = (amr_bits & PKEY_DISABLE_WRITE) ? '-' : 'w'; \
+ buf[2] = ((r) & PKEY_DISABLE_EXECUTE) ? '-' : 'x'; \
+ buf[3] = '\0'; \
+ buf; \
+})
+
+unsigned long next_pkey_rights(unsigned long rights)
+{
+ if (rights == PKEY_DISABLE_ACCESS)
+ return PKEY_DISABLE_EXECUTE;
+ else if (rights == (PKEY_DISABLE_ACCESS | PKEY_DISABLE_EXECUTE))
+ return 0;
+
+ if ((rights & PKEY_BITS_MASK) == 0)
+ rights |= PKEY_DISABLE_WRITE;
+ else if ((rights & PKEY_BITS_MASK) == PKEY_DISABLE_WRITE)
+ rights |= PKEY_DISABLE_ACCESS;
+
+ return rights;
+}
+
+#endif /* _SELFTESTS_POWERPC_PKEYS_H */
diff --git a/tools/testing/selftests/powerpc/include/reg.h b/tools/testing/selftests/powerpc/include/reg.h
new file mode 100644
index 000000000..c0f2742a3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/reg.h
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_REG_H
+#define _SELFTESTS_POWERPC_REG_H
+
+#define __stringify_1(x) #x
+#define __stringify(x) __stringify_1(x)
+/* mfspr(rn) evaluates to SPR rn's value; mtspr(rn, v) writes v to SPR rn. */
+#define mfspr(rn) ({unsigned long rval; \
+ asm volatile("mfspr %0," __stringify(rn) \
+ : "=r" (rval)); rval; })
+#define mtspr(rn, v) asm volatile("mtspr " __stringify(rn) ",%0" : \
+ : "r" ((unsigned long)(v)) \
+ : "memory")
+
+#define mb() asm volatile("sync" : : : "memory");
+#define barrier() asm volatile("" : : : "memory");
+
+#define SPRN_MMCR2 769
+#define SPRN_MMCRA 770
+#define SPRN_MMCR0 779
+#define MMCR0_PMAO 0x00000080
+#define MMCR0_PMAE 0x04000000
+#define MMCR0_FC 0x80000000
+#define SPRN_EBBHR 804
+#define SPRN_EBBRR 805
+#define SPRN_BESCR 806 /* Branch event status & control register */
+#define SPRN_BESCRS 800 /* Branch event status & control set (1 bits set to 1) */
+#define SPRN_BESCRSU 801 /* Branch event status & control set upper */
+#define SPRN_BESCRR 802 /* Branch event status & control REset (1 bits set to 0) */
+#define SPRN_BESCRRU 803 /* Branch event status & control REset upper */
+
+#define BESCR_PMEO 0x1 /* PMU Event-based exception Occurred */
+#define BESCR_PME (0x1ul << 32) /* PMU Event-based exception Enable */
+
+#define SPRN_PMC1 771
+#define SPRN_PMC2 772
+#define SPRN_PMC3 773
+#define SPRN_PMC4 774
+#define SPRN_PMC5 775
+#define SPRN_PMC6 776
+
+#define SPRN_SIAR 780
+#define SPRN_SDAR 781
+#define SPRN_SIER 768
+
+#define SPRN_TEXASR 0x82 /* Transaction Exception and Status Register */
+#define SPRN_TFIAR 0x81 /* Transaction Failure Inst Addr */
+#define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */
+#define SPRN_TAR 0x32f /* Target Address Register */
+
+#define SPRN_DSCR_PRIV 0x11 /* Privilege State DSCR */
+#define SPRN_DSCR 0x03 /* Data Stream Control Register */
+#define SPRN_PPR 896 /* Program Priority Register */
+#define SPRN_AMR 13 /* Authority Mask Register - problem state */
+
+#define set_amr(v) asm volatile("isync;" \
+ "mtspr " __stringify(SPRN_AMR) ",%0;" \
+ "isync" : \
+ : "r" ((unsigned long)(v)) \
+ : "memory")
+
+/* TEXASR register bits */
+#define TEXASR_FC 0xFE00000000000000
+#define TEXASR_FP 0x0100000000000000
+#define TEXASR_DA 0x0080000000000000
+#define TEXASR_NO 0x0040000000000000
+#define TEXASR_FO 0x0020000000000000
+#define TEXASR_SIC 0x0010000000000000
+#define TEXASR_NTC 0x0008000000000000
+#define TEXASR_TC 0x0004000000000000
+#define TEXASR_TIC 0x0002000000000000
+#define TEXASR_IC 0x0001000000000000
+#define TEXASR_IFC 0x0000800000000000
+#define TEXASR_ABT 0x0000000100000000
+#define TEXASR_SPD 0x0000000080000000
+#define TEXASR_HV 0x0000000020000000
+#define TEXASR_PR 0x0000000010000000
+#define TEXASR_FS 0x0000000008000000
+#define TEXASR_TE 0x0000000004000000
+#define TEXASR_ROT 0x0000000002000000
+
+/* MSR register bits */
+#define MSR_TS_S_LG 33 /* Trans Mem state: Suspended */
+#define MSR_TS_T_LG 34 /* Trans Mem state: Active */
+
+#define __MASK(X) (1UL<<(X))
+
+/* macro to check TM MSR bits */
+#define MSR_TS_S __MASK(MSR_TS_S_LG) /* Transaction Suspended */
+#define MSR_TS_T __MASK(MSR_TS_T_LG) /* Transaction Transactional */
+
+/* Vector Instructions */
+#define VSX_XX1(xs, ra, rb) (((xs) & 0x1f) << 21 | ((ra) << 16) | \
+ ((rb) << 11) | (((xs) >> 5)))
+#define STXVD2X(xs, ra, rb) .long (0x7c000798 | VSX_XX1((xs), (ra), (rb)))
+#define LXVD2X(xs, ra, rb) .long (0x7c000698 | VSX_XX1((xs), (ra), (rb)))
+
+#define ASM_LOAD_GPR_IMMED(_asm_symbol_name_immed) \
+ "li 14, %[" #_asm_symbol_name_immed "];" \
+ "li 15, %[" #_asm_symbol_name_immed "];" \
+ "li 16, %[" #_asm_symbol_name_immed "];" \
+ "li 17, %[" #_asm_symbol_name_immed "];" \
+ "li 18, %[" #_asm_symbol_name_immed "];" \
+ "li 19, %[" #_asm_symbol_name_immed "];" \
+ "li 20, %[" #_asm_symbol_name_immed "];" \
+ "li 21, %[" #_asm_symbol_name_immed "];" \
+ "li 22, %[" #_asm_symbol_name_immed "];" \
+ "li 23, %[" #_asm_symbol_name_immed "];" \
+ "li 24, %[" #_asm_symbol_name_immed "];" \
+ "li 25, %[" #_asm_symbol_name_immed "];" \
+ "li 26, %[" #_asm_symbol_name_immed "];" \
+ "li 27, %[" #_asm_symbol_name_immed "];" \
+ "li 28, %[" #_asm_symbol_name_immed "];" \
+ "li 29, %[" #_asm_symbol_name_immed "];" \
+ "li 30, %[" #_asm_symbol_name_immed "];" \
+ "li 31, %[" #_asm_symbol_name_immed "];"
+
+#define ASM_LOAD_FPR_SINGLE_PRECISION(_asm_symbol_name_addr) \
+ "lfs 0, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 1, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 2, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 3, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 4, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 5, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 6, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 7, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 8, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 9, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 10, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 11, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 12, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 13, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 14, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 15, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 16, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 17, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 18, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 19, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 20, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 21, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 22, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 23, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 24, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 25, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 26, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 27, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 28, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 29, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 30, 0(%[" #_asm_symbol_name_addr "]);" \
+ "lfs 31, 0(%[" #_asm_symbol_name_addr "]);"
+
+#ifndef __ASSEMBLER__
+void store_gpr(unsigned long *addr);
+void load_gpr(unsigned long *addr);
+void load_fpr_single_precision(float *addr);
+void store_fpr_single_precision(float *addr);
+#endif /* end of __ASSEMBLER__ */
+
+#endif /* _SELFTESTS_POWERPC_REG_H */
diff --git a/tools/testing/selftests/powerpc/include/subunit.h b/tools/testing/selftests/powerpc/include/subunit.h
new file mode 100644
index 000000000..068d55fdf
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/subunit.h
@@ -0,0 +1,52 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_SUBUNIT_H
+#define _SELFTESTS_POWERPC_SUBUNIT_H
+
+static inline void test_start(char *name)
+{
+ printf("test: %s\n", name);
+}
+
+static inline void test_failure_detail(char *name, char *detail)
+{
+ printf("failure: %s [%s]\n", name, detail);
+}
+
+static inline void test_failure(char *name)
+{
+ printf("failure: %s\n", name);
+}
+
+static inline void test_error(char *name)
+{
+ printf("error: %s\n", name);
+}
+
+static inline void test_skip(char *name)
+{
+ printf("skip: %s\n", name);
+}
+
+static inline void test_success(char *name)
+{
+ printf("success: %s\n", name);
+}
+
+static inline void test_finish(char *name, int status)
+{
+ if (status)
+ test_failure(name);
+ else
+ test_success(name);
+}
+
+static inline void test_set_git_version(char *value)
+{
+ printf("tags: git_version:%s\n", value);
+}
+
+#endif /* _SELFTESTS_POWERPC_SUBUNIT_H */
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
new file mode 100644
index 000000000..b7d188fc8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_UTILS_H
+#define _SELFTESTS_POWERPC_UTILS_H
+
+#define __cacheline_aligned __attribute__((aligned(128)))
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/auxvec.h>
+#include <linux/perf_event.h>
+#include <asm/cputable.h>
+#include "reg.h"
+
+/* Avoid headaches with PRI?64 - just use %ll? always */
+typedef unsigned long long u64;
+typedef signed long long s64;
+
+/* Just for familiarity */
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+void test_harness_set_timeout(uint64_t time);
+int test_harness(int (test_function)(void), char *name);
+
+int read_auxv(char *buf, ssize_t buf_size);
+void *find_auxv_entry(int type, char *auxv);
+void *get_auxv_entry(int type);
+
+int pick_online_cpu(void);
+
+int read_debugfs_file(char *debugfs_file, int *result);
+int write_debugfs_file(char *debugfs_file, int result);
+int read_sysfs_file(char *debugfs_file, char *result, size_t result_size);
+int perf_event_open_counter(unsigned int type,
+ unsigned long config, int group_fd);
+int perf_event_enable(int fd);
+int perf_event_disable(int fd);
+int perf_event_reset(int fd);
+
+struct perf_event_read {
+ __u64 nr;
+ __u64 l1d_misses;
+};
+
+#if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 30)
+#include <unistd.h>
+#include <sys/syscall.h>
+
+static inline pid_t gettid(void)
+{
+ return syscall(SYS_gettid);
+}
+#endif
+
+static inline bool have_hwcap(unsigned long ftr)
+{
+ return ((unsigned long)get_auxv_entry(AT_HWCAP) & ftr) == ftr;
+}
+
+#ifdef AT_HWCAP2
+static inline bool have_hwcap2(unsigned long ftr2)
+{
+ return ((unsigned long)get_auxv_entry(AT_HWCAP2) & ftr2) == ftr2;
+}
+#else
+static inline bool have_hwcap2(unsigned long ftr2)
+{
+ return false;
+}
+#endif
+
+bool is_ppc64le(void);
+int using_hash_mmu(bool *using_hash);
+
+/* Yes, this is evil */
+#define FAIL_IF(x) \
+do { \
+ if ((x)) { \
+ fprintf(stderr, \
+ "[FAIL] Test FAILED on line %d\n", __LINE__); \
+ return 1; \
+ } \
+} while (0)
+
+#define FAIL_IF_EXIT(x) \
+do { \
+ if ((x)) { \
+ fprintf(stderr, \
+ "[FAIL] Test FAILED on line %d\n", __LINE__); \
+ _exit(1); \
+ } \
+} while (0)
+
+/* The test harness uses this, yes it's gross */
+#define MAGIC_SKIP_RETURN_VALUE 99
+
+#define SKIP_IF(x) \
+do { \
+ if ((x)) { \
+ fprintf(stderr, \
+ "[SKIP] Test skipped on line %d\n", __LINE__); \
+ return MAGIC_SKIP_RETURN_VALUE; \
+ } \
+} while (0)
+
+#define SKIP_IF_MSG(x, msg) \
+do { \
+ if ((x)) { \
+ fprintf(stderr, \
+ "[SKIP] Test skipped on line %d: %s\n", \
+ __LINE__, msg); \
+ return MAGIC_SKIP_RETURN_VALUE; \
+ } \
+} while (0)
+
+#define _str(s) #s
+#define str(s) _str(s)
+
+#define sigsafe_err(msg) ({ \
+ ssize_t nbytes __attribute__((unused)); \
+ nbytes = write(STDERR_FILENO, msg, strlen(msg)); })
+
+/* POWER9 feature */
+#ifndef PPC_FEATURE2_ARCH_3_00
+#define PPC_FEATURE2_ARCH_3_00 0x00800000
+#endif
+
+/* POWER10 feature */
+#ifndef PPC_FEATURE2_ARCH_3_1
+#define PPC_FEATURE2_ARCH_3_1 0x00040000
+#endif
+
+#if defined(__powerpc64__)
+#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.gp_regs[PT_NIP]
+#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.gp_regs[PT_MSR]
+#elif defined(__powerpc__)
+#define UCONTEXT_NIA(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_NIP]
+#define UCONTEXT_MSR(UC) (UC)->uc_mcontext.uc_regs->gregs[PT_MSR]
+#else
+#error implement UCONTEXT_NIA
+#endif
+
+#endif /* _SELFTESTS_POWERPC_UTILS_H */
diff --git a/tools/testing/selftests/powerpc/include/vmx_asm.h b/tools/testing/selftests/powerpc/include/vmx_asm.h
new file mode 100644
index 000000000..ad9fb1b40
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/vmx_asm.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+
+/* POS MUST BE 16 ALIGNED! */
+#define PUSH_VMX(pos,reg) \
+ li reg,pos; \
+ stvx v20,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v21,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v22,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v23,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v24,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v25,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v26,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v27,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v28,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v29,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v30,reg,%r1; \
+ addi reg,reg,16; \
+ stvx v31,reg,%r1;
+
+/* POS MUST BE 16 ALIGNED! */
+#define POP_VMX(pos,reg) \
+ li reg,pos; \
+ lvx v20,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v21,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v22,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v23,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v24,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v25,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v26,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v27,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v28,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v29,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v30,reg,%r1; \
+ addi reg,reg,16; \
+ lvx v31,reg,%r1;
+
+/*
+ * Careful this will 'clobber' vmx (by design)
+ * Don't call this from C
+ */
+FUNC_START(load_vmx)
+ li r5,0
+ lvx v20,r5,r3
+ addi r5,r5,16
+ lvx v21,r5,r3
+ addi r5,r5,16
+ lvx v22,r5,r3
+ addi r5,r5,16
+ lvx v23,r5,r3
+ addi r5,r5,16
+ lvx v24,r5,r3
+ addi r5,r5,16
+ lvx v25,r5,r3
+ addi r5,r5,16
+ lvx v26,r5,r3
+ addi r5,r5,16
+ lvx v27,r5,r3
+ addi r5,r5,16
+ lvx v28,r5,r3
+ addi r5,r5,16
+ lvx v29,r5,r3
+ addi r5,r5,16
+ lvx v30,r5,r3
+ addi r5,r5,16
+ lvx v31,r5,r3
+ blr
+FUNC_END(load_vmx)
diff --git a/tools/testing/selftests/powerpc/include/vsx_asm.h b/tools/testing/selftests/powerpc/include/vsx_asm.h
new file mode 100644
index 000000000..434ca2f9b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/include/vsx_asm.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+
+/*
+ * Careful this will 'clobber' vsx (by design), VSX are always
+ * volatile though so unlike vmx this isn't so much of an issue
+ * Still should avoid calling from C
+ */
+FUNC_START(load_vsx)
+ li r5,0
+ lxvd2x vs20,r5,r3
+ addi r5,r5,16
+ lxvd2x vs21,r5,r3
+ addi r5,r5,16
+ lxvd2x vs22,r5,r3
+ addi r5,r5,16
+ lxvd2x vs23,r5,r3
+ addi r5,r5,16
+ lxvd2x vs24,r5,r3
+ addi r5,r5,16
+ lxvd2x vs25,r5,r3
+ addi r5,r5,16
+ lxvd2x vs26,r5,r3
+ addi r5,r5,16
+ lxvd2x vs27,r5,r3
+ addi r5,r5,16
+ lxvd2x vs28,r5,r3
+ addi r5,r5,16
+ lxvd2x vs29,r5,r3
+ addi r5,r5,16
+ lxvd2x vs30,r5,r3
+ addi r5,r5,16
+ lxvd2x vs31,r5,r3
+ blr
+FUNC_END(load_vsx)
+
+FUNC_START(store_vsx)
+ li r5,0
+ stxvd2x vs20,r5,r3
+ addi r5,r5,16
+ stxvd2x vs21,r5,r3
+ addi r5,r5,16
+ stxvd2x vs22,r5,r3
+ addi r5,r5,16
+ stxvd2x vs23,r5,r3
+ addi r5,r5,16
+ stxvd2x vs24,r5,r3
+ addi r5,r5,16
+ stxvd2x vs25,r5,r3
+ addi r5,r5,16
+ stxvd2x vs26,r5,r3
+ addi r5,r5,16
+ stxvd2x vs27,r5,r3
+ addi r5,r5,16
+ stxvd2x vs28,r5,r3
+ addi r5,r5,16
+ stxvd2x vs29,r5,r3
+ addi r5,r5,16
+ stxvd2x vs30,r5,r3
+ addi r5,r5,16
+ stxvd2x vs31,r5,r3
+ blr
+FUNC_END(store_vsx)
diff --git a/tools/testing/selftests/powerpc/lib/reg.S b/tools/testing/selftests/powerpc/lib/reg.S
new file mode 100644
index 000000000..9304ea7d5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/lib/reg.S
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * test helper assembly functions
+ *
+ * Copyright (C) 2016 Simon Guo, IBM Corporation.
+ */
+#include <ppc-asm.h>
+#include "reg.h"
+
+
+/* Non volatile GPR - unsigned long buf[18] */
+FUNC_START(load_gpr)
+ ld 14, 0*8(3)
+ ld 15, 1*8(3)
+ ld 16, 2*8(3)
+ ld 17, 3*8(3)
+ ld 18, 4*8(3)
+ ld 19, 5*8(3)
+ ld 20, 6*8(3)
+ ld 21, 7*8(3)
+ ld 22, 8*8(3)
+ ld 23, 9*8(3)
+ ld 24, 10*8(3)
+ ld 25, 11*8(3)
+ ld 26, 12*8(3)
+ ld 27, 13*8(3)
+ ld 28, 14*8(3)
+ ld 29, 15*8(3)
+ ld 30, 16*8(3)
+ ld 31, 17*8(3)
+ blr
+FUNC_END(load_gpr)
+
+FUNC_START(store_gpr)
+ std 14, 0*8(3)
+ std 15, 1*8(3)
+ std 16, 2*8(3)
+ std 17, 3*8(3)
+ std 18, 4*8(3)
+ std 19, 5*8(3)
+ std 20, 6*8(3)
+ std 21, 7*8(3)
+ std 22, 8*8(3)
+ std 23, 9*8(3)
+ std 24, 10*8(3)
+ std 25, 11*8(3)
+ std 26, 12*8(3)
+ std 27, 13*8(3)
+ std 28, 14*8(3)
+ std 29, 15*8(3)
+ std 30, 16*8(3)
+ std 31, 17*8(3)
+ blr
+FUNC_END(store_gpr)
+
+/* Single Precision Float - float buf[32] */
+FUNC_START(load_fpr_single_precision)
+ lfs 0, 0*4(3)
+ lfs 1, 1*4(3)
+ lfs 2, 2*4(3)
+ lfs 3, 3*4(3)
+ lfs 4, 4*4(3)
+ lfs 5, 5*4(3)
+ lfs 6, 6*4(3)
+ lfs 7, 7*4(3)
+ lfs 8, 8*4(3)
+ lfs 9, 9*4(3)
+ lfs 10, 10*4(3)
+ lfs 11, 11*4(3)
+ lfs 12, 12*4(3)
+ lfs 13, 13*4(3)
+ lfs 14, 14*4(3)
+ lfs 15, 15*4(3)
+ lfs 16, 16*4(3)
+ lfs 17, 17*4(3)
+ lfs 18, 18*4(3)
+ lfs 19, 19*4(3)
+ lfs 20, 20*4(3)
+ lfs 21, 21*4(3)
+ lfs 22, 22*4(3)
+ lfs 23, 23*4(3)
+ lfs 24, 24*4(3)
+ lfs 25, 25*4(3)
+ lfs 26, 26*4(3)
+ lfs 27, 27*4(3)
+ lfs 28, 28*4(3)
+ lfs 29, 29*4(3)
+ lfs 30, 30*4(3)
+ lfs 31, 31*4(3)
+ blr
+FUNC_END(load_fpr_single_precision)
+
+/* Single Precision Float - float buf[32] */
+FUNC_START(store_fpr_single_precision)
+ stfs 0, 0*4(3)
+ stfs 1, 1*4(3)
+ stfs 2, 2*4(3)
+ stfs 3, 3*4(3)
+ stfs 4, 4*4(3)
+ stfs 5, 5*4(3)
+ stfs 6, 6*4(3)
+ stfs 7, 7*4(3)
+ stfs 8, 8*4(3)
+ stfs 9, 9*4(3)
+ stfs 10, 10*4(3)
+ stfs 11, 11*4(3)
+ stfs 12, 12*4(3)
+ stfs 13, 13*4(3)
+ stfs 14, 14*4(3)
+ stfs 15, 15*4(3)
+ stfs 16, 16*4(3)
+ stfs 17, 17*4(3)
+ stfs 18, 18*4(3)
+ stfs 19, 19*4(3)
+ stfs 20, 20*4(3)
+ stfs 21, 21*4(3)
+ stfs 22, 22*4(3)
+ stfs 23, 23*4(3)
+ stfs 24, 24*4(3)
+ stfs 25, 25*4(3)
+ stfs 26, 26*4(3)
+ stfs 27, 27*4(3)
+ stfs 28, 28*4(3)
+ stfs 29, 29*4(3)
+ stfs 30, 30*4(3)
+ stfs 31, 31*4(3)
+ blr
+FUNC_END(store_fpr_single_precision)
+
+/* VMX/VSX registers - unsigned long buf[128] */
+FUNC_START(loadvsx)
+ lis 4, 0
+ LXVD2X (0,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (1,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (2,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (3,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (4,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (5,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (6,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (7,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (8,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (9,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (10,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (11,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (12,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (13,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (14,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (15,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (16,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (17,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (18,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (19,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (20,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (21,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (22,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (23,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (24,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (25,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (26,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (27,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (28,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (29,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (30,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (31,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (32,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (33,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (34,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (35,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (36,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (37,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (38,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (39,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (40,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (41,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (42,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (43,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (44,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (45,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (46,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (47,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (48,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (49,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (50,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (51,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (52,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (53,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (54,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (55,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (56,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (57,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (58,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (59,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (60,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (61,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (62,(4),(3))
+ addi 4, 4, 16
+ LXVD2X (63,(4),(3))
+ blr
+FUNC_END(loadvsx)
+
+FUNC_START(storevsx)
+ lis 4, 0
+ STXVD2X (0,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (1,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (2,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (3,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (4,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (5,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (6,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (7,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (8,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (9,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (10,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (11,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (12,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (13,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (14,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (15,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (16,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (17,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (18,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (19,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (20,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (21,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (22,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (23,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (24,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (25,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (26,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (27,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (28,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (29,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (30,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (31,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (32,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (33,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (34,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (35,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (36,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (37,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (38,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (39,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (40,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (41,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (42,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (43,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (44,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (45,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (46,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (47,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (48,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (49,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (50,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (51,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (52,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (53,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (54,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (55,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (56,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (57,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (58,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (59,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (60,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (61,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (62,(4),(3))
+ addi 4, 4, 16
+ STXVD2X (63,(4),(3))
+ blr
+FUNC_END(storevsx)
diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
new file mode 100644
index 000000000..d0c23b2e4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/.gitignore
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+fpu_syscall
+vmx_syscall
+fpu_preempt
+vmx_preempt
+fpu_signal
+vmx_signal
+vsx_preempt
+fpu_denormal
diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
new file mode 100644
index 000000000..fcc91c205
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal fpu_denormal vmx_syscall vmx_preempt vmx_signal vsx_preempt
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c
+$(TEST_GEN_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
+
+$(OUTPUT)/fpu_syscall: fpu_asm.S
+$(OUTPUT)/fpu_preempt: fpu_asm.S
+$(OUTPUT)/fpu_signal: fpu_asm.S
+
+$(OUTPUT)/vmx_syscall: vmx_asm.S ../utils.c
+$(OUTPUT)/vmx_preempt: vmx_asm.S ../utils.c
+$(OUTPUT)/vmx_signal: vmx_asm.S ../utils.c
+
+$(OUTPUT)/vsx_preempt: CFLAGS += -mvsx
+$(OUTPUT)/vsx_preempt: vsx_asm.S ../utils.c
diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
new file mode 100644
index 000000000..9dc0c158f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+#include "fpu_asm.h"
+
+FUNC_START(check_fpu)
+ mr r4,r3
+ li r3,1 # assume a bad result
+ lfd f0,0(r4)
+ fcmpu cr1,f0,f14
+ bne cr1,1f
+ lfd f0,8(r4)
+ fcmpu cr1,f0,f15
+ bne cr1,1f
+ lfd f0,16(r4)
+ fcmpu cr1,f0,f16
+ bne cr1,1f
+ lfd f0,24(r4)
+ fcmpu cr1,f0,f17
+ bne cr1,1f
+ lfd f0,32(r4)
+ fcmpu cr1,f0,f18
+ bne cr1,1f
+ lfd f0,40(r4)
+ fcmpu cr1,f0,f19
+ bne cr1,1f
+ lfd f0,48(r4)
+ fcmpu cr1,f0,f20
+ bne cr1,1f
+ lfd f0,56(r4)
+ fcmpu cr1,f0,f21
+ bne cr1,1f
+ lfd f0,64(r4)
+ fcmpu cr1,f0,f22
+ bne cr1,1f
+ lfd f0,72(r4)
+ fcmpu cr1,f0,f23
+ bne cr1,1f
+ lfd f0,80(r4)
+ fcmpu cr1,f0,f24
+ bne cr1,1f
+ lfd f0,88(r4)
+ fcmpu cr1,f0,f25
+ bne cr1,1f
+ lfd f0,96(r4)
+ fcmpu cr1,f0,f26
+ bne cr1,1f
+ lfd f0,104(r4)
+ fcmpu cr1,f0,f27
+ bne cr1,1f
+ lfd f0,112(r4)
+ fcmpu cr1,f0,f28
+ bne cr1,1f
+ lfd f0,120(r4)
+ fcmpu cr1,f0,f29
+ bne cr1,1f
+ lfd f0,128(r4)
+ fcmpu cr1,f0,f30
+ bne cr1,1f
+ lfd f0,136(r4)
+ fcmpu cr1,f0,f31
+ bne cr1,1f
+ li r3,0 # Success!!!
+1: blr
+
+FUNC_START(test_fpu)
+ # r3 holds pointer to darray, the values check_fpu expects in f14-f31
+ # r4 holds pointer to the pid_t that receives the result of fork
+ # f14-f31 are non volatiles
+ PUSH_BASIC_STACK(256)
+ PUSH_FPU(256)
+ std r3,STACK_FRAME_PARAM(0)(sp) # Address of darray
+ std r4,STACK_FRAME_PARAM(1)(sp) # Address of pid
+
+ bl load_fpu
+ nop
+ li r0,__NR_fork
+ sc
+
+ # pass the result of the fork to the caller (pid_t is 32 bits wide)
+ ld r9,STACK_FRAME_PARAM(1)(sp)
+ stw r3,0(r9)
+
+ ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl check_fpu
+ nop
+
+ POP_FPU(256)
+ POP_BASIC_STACK(256)
+ blr
+FUNC_END(test_fpu)
+
+# int preempt_fpu(double *darray, int *threads_starting, int *running)
+# On starting will (atomically) decrement threads_starting as a signal that
+# the FPU has been loaded with darray. Will proceed to check the validity of
+# the FPU registers while running is not zero.
+FUNC_START(preempt_fpu)
+ PUSH_BASIC_STACK(256)
+ PUSH_FPU(256)
+ std r3,STACK_FRAME_PARAM(0)(sp) # double *darray
+ std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+ std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+
+ bl load_fpu
+ nop
+
+ sync
+ # Atomic DEC
+ ld r3,STACK_FRAME_PARAM(1)(sp)
+1: lwarx r4,0,r3
+ addi r4,r4,-1
+ stwcx. r4,0,r3
+ bne- 1b
+
+2: ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl check_fpu
+ nop
+ cmpdi r3,0
+ bne 3f
+ ld r4,STACK_FRAME_PARAM(2)(sp)
+ lwz r5,0(r4) # running is an int: load 32 bits only
+ cmpwi r5,0
+ bne 2b
+
+3: POP_FPU(256)
+ POP_BASIC_STACK(256)
+ blr
+FUNC_END(preempt_fpu)
diff --git a/tools/testing/selftests/powerpc/math/fpu_denormal.c b/tools/testing/selftests/powerpc/math/fpu_denormal.c
new file mode 100644
index 000000000..5f96682ab
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_denormal.c
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright IBM Corp. 2020
+ *
+ * This test attempts to cause a FP denormal exception on POWER8 CPUs. Unfortunately
+ * if the denormal handler is not configured or working properly, this can cause a bad
+ * crash in kernel mode when the kernel tries to save FP registers when the process
+ * exits.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "utils.h"
+
+static int test_denormal_fpu(void)
+{
+ unsigned int m32;
+ unsigned long m64;
+ volatile float f;
+ volatile double d;
+
+ /* try to induce lfs <denormal> ; stfd */
+
+ m32 = 0x00715fcf; /* random denormal */
+ memcpy((float *)&f, &m32, sizeof(f));
+ d = f;
+ memcpy(&m64, (double *)&d, sizeof(d));
+
+ FAIL_IF((long)(m64 != 0x380c57f3c0000000)); /* renormalised value */
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_denormal_fpu, "fpu_denormal");
+}
diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c
new file mode 100644
index 000000000..3e5b5663d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the FPU registers change across preemption.
+ * Two things should be noted here a) The check_fpu function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+int threads_starting;
+int running;
+
+extern int preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void *preempt_fpu_c(void *p)
+{
+ double *slot = darray;
+
+ /* Seed the generator from the thread id */
+ srand(pthread_self());
+ /* Overwrite all 21 entries of this thread's darray with random values */
+ while (slot < darray + 21)
+ *slot++ = rand();
+
+ /* Returns once running is cleared (0) or check_fpu reports a mismatch */
+ return (void *)(long)preempt_fpu(darray, &threads_starting, &running);
+}
+
+int test_preempt_fpu(void)
+{
+ int i, rc, threads;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc((threads) * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, preempt_fpu_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ /* Not really necessary but nice to wait for every thread to start */
+ printf("\tWaiting for all workers to start...");
+ while(threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+ sleep(PREEMPT_TIME);
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ /*
+ * Workers are checking this value every loop. In preempt_fpu 'cmpwi r5,0; bne 2b'.
+ * r5 will have loaded the value of running.
+ */
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ void *rc_p;
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why preempt_fpu
+ * returned
+ */
+ if ((long) rc_p)
+ printf("oops\n");
+ FAIL_IF((long) rc_p);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_preempt_fpu, "fpu_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/math/fpu_signal.c b/tools/testing/selftests/powerpc/math/fpu_signal.c
new file mode 100644
index 000000000..7b1addd50
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_signal.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the FPU registers are correctly reported in a
+ * signal context. Each worker just spins checking its FPU registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+bool bad_context;
+int threads_starting;
+int running;
+
+extern long preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void signal_fpu_sig(int sig, siginfo_t *info, void *context)
+{
+ /* Machine context saved in the signal frame */
+ mcontext_t *regs = &((ucontext_t *)context)->uc_mcontext;
+ int n = 0;
+
+ /* Compare f14-f31 in the frame with darray[0..17] (non volatiles only) */
+ while (n < 18 && regs->fp_regs[14 + n] == darray[n])
+ n++;
+
+ /* Stopping before all 18 matched means the context is wrong */
+ if (n < 18)
+ bad_context = true;
+}
+
+void *signal_fpu_c(void *p)
+{
+ int i;
+ long rc;
+ struct sigaction act = { .sa_flags = SA_SIGINFO };
+ act.sa_sigaction = signal_fpu_sig;
+ sigemptyset(&act.sa_mask); /* was uninitialised stack garbage */
+ rc = sigaction(SIGUSR1, &act, NULL);
+ if (rc)
+ return p;
+
+ srand(pthread_self());
+ for (i = 0; i < 21; i++)
+ darray[i] = rand();
+
+ rc = preempt_fpu(darray, &threads_starting, &running);
+
+ return (void *) rc;
+}
+
+int test_signal_fpu(void)
+{
+ int i, j, rc, threads;
+ void *rc_p;
+ pthread_t *tids;
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, signal_fpu_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ printf("\tWaiting for all workers to start...");
+ while (threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tSending signals to all threads %d times...", ITERATIONS);
+ for (i = 0; i < ITERATIONS; i++) {
+ for (j = 0; j < threads; j++) {
+ pthread_kill(tids[j], SIGUSR1);
+ }
+ sleep(1);
+ }
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why signal_fpu
+ * returned
+ */
+ if ((long) rc_p || bad_context)
+ printf("oops\n");
+ if (bad_context)
+ fprintf(stderr, "\t!! bad_context is true\n");
+ FAIL_IF((long) rc_p || bad_context);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_signal_fpu, "fpu_signal");
+}
diff --git a/tools/testing/selftests/powerpc/math/fpu_syscall.c b/tools/testing/selftests/powerpc/math/fpu_syscall.c
new file mode 100644
index 000000000..694f225c7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_syscall.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the FPU registers change across a syscall (fork).
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+#include "utils.h"
+
+extern int test_fpu(double *darray, pid_t *pid);
+
+double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+ 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+ 2.1};
+
+int syscall_fpu(void)
+{
+ pid_t fork_pid;
+ int i;
+ int ret;
+ int child_ret;
+ for (i = 0; i < 1000; i++) {
+ /* test_fpu will fork() */
+ ret = test_fpu(darray, &fork_pid);
+ if (fork_pid == -1)
+ return -1;
+ if (fork_pid == 0)
+ exit(ret);
+ waitpid(fork_pid, &child_ret, 0);
+ if (ret || child_ret)
+ return 1;
+ }
+
+ return 0;
+}
+
+int test_syscall_fpu(void)
+{
+ /*
+ * Setup an environment with much context switching
+ */
+ pid_t pid2;
+ pid_t pid = fork();
+ int ret;
+ int child_ret;
+ FAIL_IF(pid == -1);
+
+ pid2 = fork();
+ /* Can't FAIL_IF(pid2 == -1); because already forked once */
+ if (pid2 == -1) {
+ /*
+ * Couldn't fork, ensure test is a fail
+ */
+ child_ret = ret = 1;
+ } else {
+ ret = syscall_fpu();
+ if (pid2)
+ waitpid(pid2, &child_ret, 0);
+ else
+ exit(ret);
+ }
+
+ ret |= child_ret;
+
+ if (pid)
+ waitpid(pid, &child_ret, 0);
+ else
+ exit(ret);
+
+ FAIL_IF(ret || child_ret);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_syscall_fpu, "syscall_fpu");
+
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_asm.S b/tools/testing/selftests/powerpc/math/vmx_asm.S
new file mode 100644
index 000000000..11b0704c5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_asm.S
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+#include "vmx_asm.h"
+
+# Should be safe from C, only touches r0, r3, r4, r5 and v0,v1,v2
+FUNC_START(check_vmx)
+ PUSH_BASIC_STACK(32)
+ mr r4,r3
+ li r3,1 # assume a bad result
+ li r5,0
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v20
+ vmr v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v21
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v22
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v23
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v24
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v25
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v26
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v27
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v28
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v29
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v30
+ vand v2,v2,v1
+
+ addi r5,r5,16
+ lvx v0,r5,r4
+ vcmpequd. v1,v0,v31
+ vand v2,v2,v1
+
+ li r5,STACK_FRAME_LOCAL(0,0)
+ stvx v2,r5,sp
+ ldx r0,r5,sp
+ cmpdi r0,0xffffffffffffffff
+ bne 1f
+ li r3,0
+1: POP_BASIC_STACK(32)
+ blr
+FUNC_END(check_vmx)
+
+# Safe from C
+FUNC_START(test_vmx)
+ # r3 holds pointer to varray, the values check_vmx expects in v20-v31
+ # r4 holds pointer to the pid_t that receives the result of fork
+ # v20-v31 are non-volatile
+ PUSH_BASIC_STACK(512)
+ std r3,STACK_FRAME_PARAM(0)(sp) # Address of varray
+ std r4,STACK_FRAME_PARAM(1)(sp) # address of pid
+ PUSH_VMX(STACK_FRAME_LOCAL(2,0),r4)
+
+ bl load_vmx
+ nop
+
+ li r0,__NR_fork
+ sc
+ # Pass the result of fork back to the caller (pid_t is 32 bits wide)
+ ld r9,STACK_FRAME_PARAM(1)(sp)
+ stw r3,0(r9)
+
+ ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl check_vmx
+ nop
+
+ POP_VMX(STACK_FRAME_LOCAL(2,0),r4)
+ POP_BASIC_STACK(512)
+ blr
+FUNC_END(test_vmx)
+
+# int preempt_vmx(vector int *varray, int *threads_starting, int *running)
+# On starting will (atomically) decrement threads_starting as a signal that
+# the VMX have been loaded with varray. Will proceed to check the validity of
+# the VMX registers while running is not zero.
+FUNC_START(preempt_vmx)
+ PUSH_BASIC_STACK(512)
+ std r3,STACK_FRAME_PARAM(0)(sp) # vector int *varray
+ std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+ std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+ # VMX need to write to 16 byte aligned addresses, skip STACK_FRAME_LOCAL(3,0)
+ PUSH_VMX(STACK_FRAME_LOCAL(4,0),r4)
+
+ bl load_vmx
+ nop
+
+ sync
+ # Atomic DEC
+ ld r3,STACK_FRAME_PARAM(1)(sp)
+1: lwarx r4,0,r3
+ addi r4,r4,-1
+ stwcx. r4,0,r3
+ bne- 1b
+
+2: ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl check_vmx
+ nop
+ cmpdi r3,0
+ bne 3f
+ ld r4,STACK_FRAME_PARAM(2)(sp)
+ lwz r5,0(r4) # running is an int: load 32 bits only
+ cmpwi r5,0
+ bne 2b
+
+3: POP_VMX(STACK_FRAME_LOCAL(4,0),r4)
+ POP_BASIC_STACK(512)
+ blr
+FUNC_END(preempt_vmx)
diff --git a/tools/testing/selftests/powerpc/math/vmx_preempt.c b/tools/testing/selftests/powerpc/math/vmx_preempt.c
new file mode 100644
index 000000000..6f7cf400c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_preempt.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the VMX registers change across preemption.
+ * Two things should be noted here a) The check_vmx function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+int threads_starting;
+int running;
+
+extern int preempt_vmx(vector int *varray, int *threads_starting, int *running);
+
+void *preempt_vmx_c(void *p)
+{
+ int i, j;
+ long rc;
+
+ srand(pthread_self());
+ for (i = 0; i < 12; i++)
+ for (j = 0; j < 4; j++)
+ varray[i][j] = rand();
+
+ rc = preempt_vmx(varray, &threads_starting, &running);
+
+ return (void *)rc;
+}
+
+int test_preempt_vmx(void)
+{
+ int i, rc, threads;
+ pthread_t *tids;
+
+ // vcmpequd used in vmx_asm.S is v2.07
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, preempt_vmx_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ /* Not really necessary but nice to wait for every thread to start */
+ printf("\tWaiting for all workers to start...");
+ while(threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+ sleep(PREEMPT_TIME);
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ /*
+ * Workers are checking this value every loop. In preempt_vmx 'cmpwi r5,0; bne 2b'.
+ * r5 will have loaded the value of running.
+ */
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ void *rc_p;
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why preempt_vmx
+ * returned
+ */
+ if ((long) rc_p)
+ printf("oops\n");
+ FAIL_IF((long) rc_p);
+ }
+ printf("done\n");
+ free(tids); /* release the thread-id array, as fpu_preempt.c does */
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_preempt_vmx, "vmx_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_signal.c b/tools/testing/selftests/powerpc/math/vmx_signal.c
new file mode 100644
index 000000000..b340a5c4e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_signal.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the VMX registers are correctly reported in a
+ * signal context. Each worker just spins checking its VMX registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <altivec.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+bool bad_context;
+int running;
+int threads_starting;
+
+extern int preempt_vmx(vector int *varray, int *threads_starting, int *sentinal);
+
+void signal_vmx_sig(int sig, siginfo_t *info, void *context)
+{
+ int i;
+ ucontext_t *uc = context;
+ mcontext_t *mc = &uc->uc_mcontext;
+
+ /* Only the non volatiles were loaded up */
+ for (i = 20; i < 32; i++) {
+ if (memcmp(mc->v_regs->vrregs[i], &varray[i - 20], 16)) {
+ int j;
+ /*
+ * Shouldn't printf() in a signal handler, however, this is a
+ * test and we've detected failure. Understanding what failed
+ * is paramount. All that happens after this is tests exit with
+ * failure.
+ */
+ printf("VMX mismatch at reg %d!\n", i);
+ printf("Reg | Actual | Expected\n");
+ for (j = 20; j < 32; j++) {
+ printf("%d | 0x%04x%04x%04x%04x | 0x%04x%04x%04x%04x\n", j, mc->v_regs->vrregs[j][0],
+ mc->v_regs->vrregs[j][1], mc->v_regs->vrregs[j][2], mc->v_regs->vrregs[j][3],
+ varray[j - 20][0], varray[j - 20][1], varray[j - 20][2], varray[j - 20][3]);
+ }
+ bad_context = true;
+ break;
+ }
+ }
+}
+
+void *signal_vmx_c(void *p)
+{
+ int i, j;
+ long rc;
+ struct sigaction act = { .sa_flags = SA_SIGINFO };
+ act.sa_sigaction = signal_vmx_sig;
+ sigemptyset(&act.sa_mask); /* was uninitialised stack garbage */
+ rc = sigaction(SIGUSR1, &act, NULL);
+ if (rc)
+ return p;
+
+ srand(pthread_self());
+ for (i = 0; i < 12; i++)
+ for (j = 0; j < 4; j++)
+ varray[i][j] = rand();
+
+ rc = preempt_vmx(varray, &threads_starting, &running);
+
+ return (void *) rc;
+}
+
+int test_signal_vmx(void)
+{
+ int i, j, rc, threads;
+ void *rc_p;
+ pthread_t *tids;
+
+ // vcmpequd used in vmx_asm.S is v2.07
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, signal_vmx_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ printf("\tWaiting for %d workers to start... %d", threads, threads_starting);
+ while (threads_starting) {
+ asm volatile("": : :"memory");
+ usleep(1000);
+ printf(", %d", threads_starting);
+ }
+ printf(" ...done\n");
+
+ printf("\tSending signals to all threads %d times...", ITERATIONS);
+ for (i = 0; i < ITERATIONS; i++) {
+ for (j = 0; j < threads; j++) {
+ pthread_kill(tids[j], SIGUSR1);
+ }
+ sleep(1);
+ }
+ printf("done\n");
+
+ printf("\tKilling workers...");
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why signal_vmx
+ * returned
+ */
+ if ((long) rc_p || bad_context)
+ printf("oops\n");
+ if (bad_context)
+ fprintf(stderr, "\t!! bad_context is true\n");
+ FAIL_IF((long) rc_p || bad_context);
+ }
+ printf("done\n");
+
+ free(tids);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_signal_vmx, "vmx_signal");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_syscall.c b/tools/testing/selftests/powerpc/math/vmx_syscall.c
new file mode 100644
index 000000000..03c78dfe3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_syscall.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the VMX registers change across a syscall (fork).
+ */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "utils.h"
+
+vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+extern int test_vmx(vector int *varray, pid_t *pid);
+
+int vmx_syscall(void)
+{
+ pid_t fork_pid;
+ int i;
+ int ret;
+ int child_ret;
+ for (i = 0; i < 1000; i++) {
+ /* test_vmx will fork() */
+ ret = test_vmx(varray, &fork_pid);
+ if (fork_pid == -1)
+ return -1;
+ if (fork_pid == 0)
+ exit(ret);
+ waitpid(fork_pid, &child_ret, 0);
+ if (ret || child_ret)
+ return 1;
+ }
+
+ return 0;
+}
+
+int test_vmx_syscall(void)
+{
+ /*
+ * Setup an environment with much context switching
+ */
+ pid_t pid2;
+ pid_t pid;
+ int ret;
+ int child_ret;
+
+ // vcmpequd used in vmx_asm.S is v2.07
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+ pid = fork();
+ FAIL_IF(pid == -1);
+
+ pid2 = fork();
+ ret = vmx_syscall();
+ /* Can't FAIL_IF(pid2 == -1); because we've already forked */
+ if (pid2 == -1) {
+ /*
+ * Couldn't fork, ensure child_ret is set and is a fail
+ */
+ ret = child_ret = 1;
+ } else {
+ if (pid2)
+ waitpid(pid2, &child_ret, 0);
+ else
+ exit(ret);
+ }
+
+ ret |= child_ret;
+
+ if (pid)
+ waitpid(pid, &child_ret, 0);
+ else
+ exit(ret);
+
+ FAIL_IF(ret || child_ret);
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_vmx_syscall, "vmx_syscall");
+
+}
diff --git a/tools/testing/selftests/powerpc/math/vsx_asm.S b/tools/testing/selftests/powerpc/math/vsx_asm.S
new file mode 100644
index 000000000..ffc165d98
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vsx_asm.S
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+#include "vsx_asm.h"
+
+#long check_vsx(vector int *r3);
+#This function wraps storing VSX regs to the end of an array and a
+#call to a comparison function in C which boils down to a memcmp()
+FUNC_START(check_vsx)
+ PUSH_BASIC_STACK(32)
+ std r3,STACK_FRAME_PARAM(0)(sp)
+ addi r3, r3, 16 * 12 #Second half of array
+ bl store_vsx
+ ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl vsx_memcmp
+ POP_BASIC_STACK(32)
+ blr
+FUNC_END(check_vsx)
+
+# long preempt_vsx(vector int *varray, int *threads_starting,
+# int *running);
+# On starting will (atomically) decrement threads_starting as a signal
+# that the VSX have been loaded with varray. Will proceed to check the
+# validity of the VSX registers while running is not zero.
+FUNC_START(preempt_vsx)
+ PUSH_BASIC_STACK(512)
+ std r3,STACK_FRAME_PARAM(0)(sp) # vector int *varray
+ std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+ std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+
+ bl load_vsx
+ nop
+
+ sync
+ # Atomic DEC
+ ld r3,STACK_FRAME_PARAM(1)(sp)
+1: lwarx r4,0,r3
+ addi r4,r4,-1
+ stwcx. r4,0,r3
+ bne- 1b
+
+2: ld r3,STACK_FRAME_PARAM(0)(sp)
+ bl check_vsx
+ nop
+ cmpdi r3,0
+ bne 3f
+ ld r4,STACK_FRAME_PARAM(2)(sp)
+ lwz r5,0(r4) # running is an int: load 32 bits only
+ cmpwi r5,0
+ bne 2b
+
+3: POP_BASIC_STACK(512)
+ blr
+FUNC_END(preempt_vsx)
diff --git a/tools/testing/selftests/powerpc/math/vsx_preempt.c b/tools/testing/selftests/powerpc/math/vsx_preempt.c
new file mode 100644
index 000000000..d1601bb88
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vsx_preempt.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This test attempts to see if the VSX registers change across preemption.
+ * There is no way to be sure preemption happened so this test just
+ * uses many threads and a long wait. As such, a successful test
+ * doesn't mean much but a failure is bad.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+/*
+ * Ensure there is twice the number of non-volatile VMX regs!
+ * check_vsx() is going to use the other half as space to put the live
+ * registers before calling vsx_memcmp()
+ */
+__thread vector int varray[24] = {
+ {1, 2, 3, 4 }, {5, 6, 7, 8 }, {9, 10,11,12},
+ {13,14,15,16}, {17,18,19,20}, {21,22,23,24},
+ {25,26,27,28}, {29,30,31,32}, {33,34,35,36},
+ {37,38,39,40}, {41,42,43,44}, {45,46,47,48}
+};
+
+int threads_starting;
+int running;
+
+extern long preempt_vsx(vector int *varray, int *threads_starting, int *running);
+
+long vsx_memcmp(vector int *a) {
+ vector int zero = {0, 0, 0, 0};
+ int i;
+
+ FAIL_IF(a != varray);
+
+ for(i = 0; i < 12; i++) {
+ if (memcmp(&a[i + 12], &zero, sizeof(vector int)) == 0) {
+ fprintf(stderr, "Detected zero from the VSX reg %d\n", i + 12);
+ return 2;
+ }
+ }
+
+ if (memcmp(a, &a[12], 12 * sizeof(vector int))) {
+ long *p = (long *)a;
+ fprintf(stderr, "VSX mismatch\n");
+ for (i = 0; i < 24; i=i+2)
+ fprintf(stderr, "%d: 0x%08lx%08lx | 0x%08lx%08lx\n",
+ i/2 + i%2 + 20, p[i], p[i + 1], p[i + 24], p[i + 25]);
+ return 1;
+ }
+ return 0;
+}
+
+void *preempt_vsx_c(void *p)
+{
+ int i, j;
+ long rc;
+ srand(pthread_self());
+ for (i = 0; i < 12; i++)
+ for (j = 0; j < 4; j++) {
+ varray[i][j] = rand();
+ /* Don't want zero because it hides kernel problems */
+ if (varray[i][j] == 0)
+ j--;
+ }
+ rc = preempt_vsx(varray, &threads_starting, &running);
+ if (rc == 2)
+ fprintf(stderr, "Caught zeros in VSX compares\n");
+ return (void *)rc;
+}
+
+int test_preempt_vsx(void)
+{
+ int i, rc, threads;
+ pthread_t *tids;
+
+ SKIP_IF(!have_hwcap(PPC_FEATURE_HAS_VSX));
+
+ threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+ tids = malloc(threads * sizeof(pthread_t));
+ FAIL_IF(!tids);
+
+ running = true;
+ threads_starting = threads;
+ for (i = 0; i < threads; i++) {
+ rc = pthread_create(&tids[i], NULL, preempt_vsx_c, NULL);
+ FAIL_IF(rc);
+ }
+
+ setbuf(stdout, NULL);
+ /* Not really necessary but nice to wait for every thread to start */
+ printf("\tWaiting for %d workers to start...", threads);
+ while(threads_starting)
+ asm volatile("": : :"memory");
+ printf("done\n");
+
+ printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+ sleep(PREEMPT_TIME);
+ printf("done\n");
+
+ printf("\tStopping workers...");
+ /*
+ * Workers are checking this value every loop. In preempt_vsx 'cmpwi r5,0; bne 2b'.
+ * r5 will have loaded the value of running.
+ */
+ running = 0;
+ for (i = 0; i < threads; i++) {
+ void *rc_p;
+ pthread_join(tids[i], &rc_p);
+
+ /*
+ * Harness will say the fail was here, look at why preempt_vsx
+ * returned
+ */
+ if ((long) rc_p)
+ printf("oops\n");
+ FAIL_IF((long) rc_p);
+ }
+ printf("done\n");
+ free(tids); /* release the thread-id array, as fpu_preempt.c does */
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_preempt_vsx, "vsx_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore
new file mode 100644
index 000000000..aac4a59f9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hugetlb_vs_thp_test
+subpage_prot
+tempfile
+prot_sao
+segv_errors
+wild_bctr
+large_vm_fork_separation
+bad_accesses
+tlbie_test
+pkey_exec_prot
+pkey_siginfo
+stack_expansion_ldst
+stack_expansion_signal
diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile
new file mode 100644
index 000000000..defe488d6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -0,0 +1,34 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+ $(MAKE) -C ../
+# ^ noarg is the first target, so a plain "make" in this directory re-invokes make in ../
+TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \
+ large_vm_fork_separation bad_accesses pkey_exec_prot \
+ pkey_siginfo stack_expansion_signal stack_expansion_ldst
+# tlbie_test is kept out of TEST_GEN_PROGS; tempfile is a generated file (see its dd rule below)
+TEST_GEN_PROGS_EXTENDED := tlbie_test
+TEST_GEN_FILES := tempfile
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+# Every program in TEST_GEN_PROGS additionally depends on ../harness.c and ../utils.c
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+$(OUTPUT)/prot_sao: ../utils.c
+# The following tests add -m64 to CFLAGS
+$(OUTPUT)/wild_bctr: CFLAGS += -m64
+$(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
+$(OUTPUT)/bad_accesses: CFLAGS += -m64
+$(OUTPUT)/pkey_exec_prot: CFLAGS += -m64
+$(OUTPUT)/pkey_siginfo: CFLAGS += -m64
+
+$(OUTPUT)/stack_expansion_signal: ../utils.c ../pmu/lib.c
+
+$(OUTPUT)/stack_expansion_ldst: CFLAGS += -fno-stack-protector
+$(OUTPUT)/stack_expansion_ldst: ../utils.c
+# tempfile is a single 64k block read from /dev/zero
+$(OUTPUT)/tempfile:
+ dd if=/dev/zero of=$@ bs=64k count=1
+# Tests that link with -lpthread
+$(OUTPUT)/tlbie_test: LDLIBS += -lpthread
+$(OUTPUT)/pkey_siginfo: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c b/tools/testing/selftests/powerpc/mm/bad_accesses.c
new file mode 100644
index 000000000..fd747b2ff
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright 2019, Michael Ellerman, IBM Corp.
+//
+// Test that out-of-bounds reads/writes behave as expected.
+
+#include <setjmp.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+// Old distros (Ubuntu 16.04 at least) don't define this
+#ifndef SEGV_BNDERR
+#define SEGV_BNDERR 3
+#endif
+
+// 64-bit kernel is always here
+#define PAGE_OFFSET (0xcul << 60)
+
+// Upper bound (exclusive) of the kernel virtual range checked in bad_access(); set by test()
+static unsigned long kernel_virt_end;
+
+// si_code and si_addr of the most recent SIGSEGV, recorded by segv_handler()
+static volatile int fault_code;
+static volatile unsigned long fault_addr;
+// Filled in by sigsetjmp() and consumed by siglongjmp(), which take a
+// sigjmp_buf rather than a plain jmp_buf.
+static sigjmp_buf setjmp_env;
+
+// SIGSEGV handler: remember the si_code and faulting address, then jump
+// back to the sigsetjmp() point so that sigsetjmp() returns 1.
+static void segv_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+	const int code = info->si_code;
+	void *const where = info->si_addr;
+
+	fault_code = code;
+	fault_addr = (unsigned long)where;
+
+	siglongjmp(setjmp_env, 1);
+}
+
+// Perform a single load (or a store when @write is true) at @p and check
+// that it faults in the expected way.
+//
+// Returns 1 if the access completed without raising SIGSEGV, and 0 if the
+// fault recorded by segv_handler() passes the checks below. The FAIL_IF()
+// checks reject a SEGV_MAPERR whose address lies outside
+// [PAGE_OFFSET, kernel_virt_end) and any si_code other than SEGV_MAPERR or
+// SEGV_BNDERR.
+int bad_access(char *p, bool write)
+{
+	// Initialised so that the "no SEGV" message never prints an
+	// indeterminate value when the store path was taken.
+	char x = 0;
+
+	fault_code = 0;
+	fault_addr = 0;
+
+	if (sigsetjmp(setjmp_env, 1) == 0) {
+		if (write)
+			*p = 1;
+		else
+			x = *p;
+
+		printf("Bad - no SEGV! (%c)\n", x);
+		return 1;
+	}
+
+	// If we see MAPERR that means we took a page fault rather than an SLB
+	// miss. We only expect to take page faults for addresses within the
+	// valid kernel range.
+	FAIL_IF(fault_code == SEGV_MAPERR && \
+		(fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end));
+
+	FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR);
+
+	return 0;
+}
+
+static int test(void)
+{ // Probe one address per (region, power-of-two offset) pair with a load and a store via bad_access()
+ unsigned long i, j, addr, region_shift, page_shift, page_size;
+ struct sigaction sig;
+ bool hash_mmu;
+
+ sig = (struct sigaction) {
+ .sa_sigaction = segv_handler,
+ .sa_flags = SA_SIGINFO,
+ };
+
+ FAIL_IF(sigaction(SIGSEGV, &sig, NULL) != 0);
+
+ FAIL_IF(using_hash_mmu(&hash_mmu));
+
+ page_size = sysconf(_SC_PAGESIZE);
+ if (page_size == (64 * 1024))
+ page_shift = 16;
+ else
+ page_shift = 12; // any page size other than 64K is treated as 4K here
+
+ if (page_size == (64 * 1024) || !hash_mmu) {
+ region_shift = 52;
+
+ // We have 7 512T regions (4 kernel linear, vmalloc, io, vmemmap)
+ kernel_virt_end = PAGE_OFFSET + (7 * (512ul << 40));
+ } else if (page_size == (4 * 1024) && hash_mmu) {
+ region_shift = 46;
+
+ // We have 7 64T regions (4 kernel linear, vmalloc, io, vmemmap)
+ kernel_virt_end = PAGE_OFFSET + (7 * (64ul << 40));
+ } else
+ FAIL_IF(true); // hash MMU with a page size that is neither 4K nor 64K
+
+ printf("Using %s MMU, PAGE_SIZE = %dKB start address 0x%016lx\n",
+ hash_mmu ? "hash" : "radix",
+ (1 << page_shift) >> 10,
+ 1ul << region_shift);
+
+ // This generates access patterns like:
+ // 0x0010000000000000
+ // 0x0010000000010000
+ // 0x0010000000020000
+ // ...
+ // 0x0014000000000000
+ // 0x0018000000000000
+ // 0x0020000000000000
+ // 0x0020000000010000
+ // 0x0020000000020000
+ // ...
+ // 0xf400000000000000
+ // 0xf800000000000000
+
+ for (i = 1; i <= ((0xful << 60) >> region_shift); i++) { // i selects the region: base = i << region_shift
+ for (j = page_shift - 1; j < 60; j++) { // the first delta is below the page size, so it is masked off and addr == base
+ unsigned long base, delta;
+
+ base = i << region_shift;
+ delta = 1ul << j;
+
+ if (delta >= base) // only probe offsets smaller than the region base
+ break;
+
+ addr = (base | delta) & ~((1 << page_shift) - 1); // clear the low page_shift bits
+
+ FAIL_IF(bad_access((char *)addr, false));
+ FAIL_IF(bad_access((char *)addr, true));
+ }
+ }
+
+ return 0;
+}
+
+// Entry point: set the harness timeout to 300, then run test() as "bad_accesses".
+int main(void)
+{
+	int rc;
+
+	test_harness_set_timeout(300);
+	rc = test_harness(test, "bad_accesses");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c b/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c
new file mode 100644
index 000000000..9932359ce
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+/* This must match the huge page & THP size */
+#define SIZE (16 * 1024 * 1024)
+
+/*
+ * One iteration of the test: try a MAP_HUGETLB mapping at a fixed hint
+ * address, unmap it again if that succeeded, then map ordinary anonymous
+ * memory at the same hint and write to it.
+ *
+ * Returns 0 on success, 1 if unmapping the huge page mapping fails or the
+ * second mmap() fails.
+ */
+static int test_body(void)
+{
+	void *addr;
+	char *p;
+
+	addr = (void *)0xa0000000;
+
+	p = mmap(addr, SIZE, PROT_READ | PROT_WRITE,
+		 MAP_HUGETLB | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p != MAP_FAILED) {
+		/*
+		 * Typically the mmap will fail because no huge pages are
+		 * allocated on the system. But if there are huge pages
+		 * allocated the mmap will succeed. That's fine too, we just
+		 * munmap here before continuing.  munmap() length of
+		 * MAP_HUGETLB memory must be hugepage aligned.
+		 *
+		 * addr is only a hint (MAP_FIXED is not used), so unmap the
+		 * address mmap() actually returned.
+		 */
+		if (munmap(p, SIZE)) {
+			perror("munmap");
+			return 1;
+		}
+	}
+
+	p = mmap(addr, SIZE, PROT_READ | PROT_WRITE,
+		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (p == MAP_FAILED) {
+		printf("Mapping failed @ %p\n", addr);
+		perror("mmap");
+		return 1;
+	}
+
+	/*
+	 * Either a user or kernel access is sufficient to trigger the bug.
+	 * A kernel access is easier to spot & debug, as it will trigger the
+	 * softlockup or RCU stall detectors, and when the system is kicked
+	 * into xmon we get a backtrace in the kernel.
+	 *
+	 * A good option is:
+	 *  getcwd(p, SIZE);
+	 *
+	 * For the purposes of this testcase it's preferable to spin in
+	 * userspace, so the harness can kill us if we get stuck. That way we
+	 * see a test failure rather than a dead system.
+	 */
+	*p = 0xf;
+
+	/*
+	 * Release the mapping we were actually given; unmapping addr would
+	 * leak it on every iteration if the hint was not honoured.
+	 */
+	munmap(p, SIZE);
+
+	return 0;
+}
+
+/*
+ * Run test_body() up to 10,000 times, stopping at the first failure.
+ *
+ * Returns 0 if every iteration succeeded, 1 otherwise.
+ */
+static int test_main(void)
+{
+	/* 10,000 because it's a "bunch", and completes reasonably quickly */
+	int remaining = 10000;
+
+	while (remaining-- > 0) {
+		if (test_body() != 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Entry point: run test_main() through test_harness() as "hugetlb_vs_thp". */
+int main(void)
+{
+	int rc = test_harness(test_main, "hugetlb_vs_thp");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/large_vm_fork_separation.c b/tools/testing/selftests/powerpc/mm/large_vm_fork_separation.c
new file mode 100644
index 000000000..2363a7f3a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/large_vm_fork_separation.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Copyright 2019, Michael Ellerman, IBM Corp.
+//
+// Test that allocating memory beyond the memory limit and then forking is
+// handled correctly, ie. the child is able to access the mappings beyond the
+// memory limit and the child's writes are not visible to the parent.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE MAP_FIXED // "Should be safe" above 512TB
+#endif
+
+
+// Map one page at 512TB, write 1 to it, fork, let the child overwrite the
+// page with its pid, and check that the parent still reads 1.
+//
+// Parent and child are sequenced with two pipes (p2c: parent to child,
+// c2p: child to parent), one byte per hand-over.
+//
+// Skipped unless the page size is 64K. Returns 1 if the mapping cannot be
+// created or the parent observes the child's write, 0 otherwise; FAIL_IF()
+// checks cover the pipe, fork and wait calls.
+static int test(void)
+{
+	int p2c[2], c2p[2], rc, status, c, *p;
+	unsigned long page_size;
+	pid_t pid;
+
+	page_size = sysconf(_SC_PAGESIZE);
+	SKIP_IF(page_size != 65536);
+
+	// Create a mapping at 512TB to allocate an extended_id
+	p = mmap((void *)(512ul << 40), page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE, -1, 0);
+	if (p == MAP_FAILED) {
+		perror("mmap");
+		printf("Error: couldn't mmap(), confirm kernel has 4TB support?\n");
+		return 1;
+	}
+
+	printf("parent writing %p = 1\n", p);
+	*p = 1;
+
+	FAIL_IF(pipe(p2c) == -1 || pipe(c2p) == -1);
+
+	pid = fork();
+	// Without this check a failed fork() would leave the parent blocked
+	// forever in read(c2p[0]), since it holds the write end itself.
+	FAIL_IF(pid == -1);
+	if (pid == 0) {
+		// Child: wait for the parent's go-ahead before writing
+		FAIL_IF(read(p2c[0], &c, 1) != 1);
+
+		pid = getpid();
+		printf("child writing  %p = %d\n", p, pid);
+		*p = pid;
+
+		// Tell the parent the write is done, then wait to be released
+		FAIL_IF(write(c2p[1], &c, 1) != 1);
+		FAIL_IF(read(p2c[0], &c, 1) != 1);
+		exit(0);
+	}
+
+	c = 0;
+	FAIL_IF(write(p2c[1], &c, 1) != 1);
+	FAIL_IF(read(c2p[0], &c, 1) != 1);
+
+	// Prevent compiler optimisation
+	barrier();
+
+	rc = 0;
+	printf("parent reading %p = %d\n", p, *p);
+	if (*p != 1) {
+		printf("Error: BUG! parent saw child's write! *p = %d\n", *p);
+		rc = 1;
+	}
+
+	// Release the child and collect its exit status
+	FAIL_IF(write(p2c[1], &c, 1) != 1);
+	FAIL_IF(waitpid(pid, &status, 0) == -1);
+	FAIL_IF(!WIFEXITED(status) || WEXITSTATUS(status));
+
+	if (rc == 0)
+		printf("success: test completed OK\n");
+
+	return rc;
+}
+
+// Entry point: run test() through test_harness() as "large_vm_fork_separation".
+int main(void)
+{
+	int rc = test_harness(test, "large_vm_fork_separation");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
new file mode 100644
index 000000000..0af4f0266
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ *
+ * Test if applying execute protection on pages using memory
+ * protection keys works as expected.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+
+#include <unistd.h>
+
+#include "pkeys.h"
+
+#define PPC_INST_NOP 0x60000000
+#define PPC_INST_TRAP 0x7fe00008
+#define PPC_INST_BLR 0x4e800020
+
+static volatile sig_atomic_t fault_pkey, fault_code, fault_type; /* shared between test() and segv_handler() */
+static volatile sig_atomic_t remaining_faults; /* faults still allowed for the current test case */
+static volatile unsigned int *fault_addr; /* address every fault is expected to report */
+static unsigned long pgsize, numinsns; /* page size, and instruction words per page */
+static unsigned int *insns; /* the one-page instruction buffer mapped by test() */
+
+/*
+ * SIGTRAP handler: always terminates the process with status 1, first
+ * reporting when the trap address differs from fault_addr.
+ */
+static void trap_handler(int signum, siginfo_t *sinfo, void *ctx)
+{
+	void *expected = (void *) fault_addr;
+
+	/* Check if this fault originated from the expected address */
+	if (expected != sinfo->si_addr)
+		sigsafe_err("got a fault for an unexpected address\n");
+
+	_exit(1);
+}
+
+static void segv_handler(int signum, siginfo_t *sinfo, void *ctx)
+{ /* SIGSEGV handler: check the fault is the expected one, then lift the restriction that caused it */
+ int signal_pkey; /* pkey reported for this fault by siginfo_pkey() */
+
+ signal_pkey = siginfo_pkey(sinfo);
+ fault_code = sinfo->si_code;
+
+ /* Check if this fault originated from the expected address */
+ if (sinfo->si_addr != (void *) fault_addr) {
+ sigsafe_err("got a fault for an unexpected address\n");
+ _exit(1);
+ }
+
+ /* Check if too many faults have occurred for a single test case */
+ if (!remaining_faults) {
+ sigsafe_err("got too many faults for the same address\n");
+ _exit(1);
+ }
+
+
+ /*
+ * Restore permissions in order to continue. Every path that does not
+ * recognise the fault terminates the process with _exit(1).
+ */
+ switch (fault_code) {
+ case SEGV_ACCERR: /* make the page readable and writable again */
+ if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE)) {
+ sigsafe_err("failed to set access permissions\n");
+ _exit(1);
+ }
+ break;
+ case SEGV_PKUERR: /* the reported key must be the one test() stored in fault_pkey */
+ if (signal_pkey != fault_pkey) {
+ sigsafe_err("got a fault for an unexpected pkey\n");
+ _exit(1);
+ }
+
+ switch (fault_type) {
+ case PKEY_DISABLE_ACCESS:
+ pkey_set_rights(fault_pkey, 0); /* rights of 0: no restrictions on the key */
+ break;
+ case PKEY_DISABLE_EXECUTE:
+ /*
+ * Reassociate the exec-only pkey with the region
+ * to be able to continue. Unlike AMR, we cannot
+ * set IAMR directly from userspace to restore the
+ * permissions.
+ */
+ if (mprotect(insns, pgsize, PROT_EXEC)) {
+ sigsafe_err("failed to set execute permissions\n");
+ _exit(1);
+ }
+ break;
+ default:
+ sigsafe_err("got a fault with an unexpected type\n");
+ _exit(1);
+ }
+ break;
+ default:
+ sigsafe_err("got a fault with an unexpected code\n");
+ _exit(1);
+ }
+
+ remaining_faults--; /* one expected fault has been consumed */
+}
+
+/*
+ * Exercise execute-disable protection keys on one page of generated
+ * instructions (a trap, a run of no-ops and a final blr).
+ *
+ * The page is first read and written under a key that only disables
+ * execution, then under a key that also disables access, and finally
+ * executed once for every combination returned by next_pkey_rights().
+ * Before each step remaining_faults is set to the number of SIGSEGVs the
+ * step is expected to take; segv_handler() counts it down.
+ *
+ * Returns the result of pkeys_unsupported() if that is non-zero and 0 on
+ * success; every FAIL_IF() check covers an unexpected result.
+ */
+static int test(void)
+{
+	struct sigaction segv_act, trap_act;
+	unsigned long rights;
+	int pkey, ret, i;
+
+	ret = pkeys_unsupported();
+	if (ret)
+		return ret;
+
+	/* Setup SIGSEGV handler */
+	segv_act.sa_handler = 0;
+	segv_act.sa_sigaction = segv_handler;
+	FAIL_IF(sigprocmask(SIG_SETMASK, 0, &segv_act.sa_mask) != 0);
+	segv_act.sa_flags = SA_SIGINFO;
+	segv_act.sa_restorer = 0;
+	FAIL_IF(sigaction(SIGSEGV, &segv_act, NULL) != 0);
+
+	/* Setup SIGTRAP handler */
+	trap_act.sa_handler = 0;
+	trap_act.sa_sigaction = trap_handler;
+	FAIL_IF(sigprocmask(SIG_SETMASK, 0, &trap_act.sa_mask) != 0);
+	trap_act.sa_flags = SA_SIGINFO;
+	trap_act.sa_restorer = 0;
+	FAIL_IF(sigaction(SIGTRAP, &trap_act, NULL) != 0);
+
+	/* Setup executable region */
+	pgsize = getpagesize();
+	numinsns = pgsize / sizeof(unsigned int);
+	insns = (unsigned int *) mmap(NULL, pgsize, PROT_READ | PROT_WRITE,
+				      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	FAIL_IF(insns == MAP_FAILED);
+
+	/* Write the instruction words */
+	for (i = 1; i < numinsns - 1; i++)
+		insns[i] = PPC_INST_NOP;
+
+	/*
+	 * Set the first instruction as an unconditional trap. If
+	 * the last write to this address succeeds, this should
+	 * get overwritten by a no-op.
+	 */
+	insns[0] = PPC_INST_TRAP;
+
+	/*
+	 * Later, to jump to the executable region, we use a branch
+	 * and link instruction (bctrl) which sets the return address
+	 * automatically in LR. Use that to return back.
+	 */
+	insns[numinsns - 1] = PPC_INST_BLR;
+
+	/* Allocate a pkey that restricts execution */
+	rights = PKEY_DISABLE_EXECUTE;
+	pkey = sys_pkey_alloc(0, rights);
+	FAIL_IF(pkey < 0);
+
+	/*
+	 * Pick the first instruction's address from the executable
+	 * region.
+	 */
+	fault_addr = insns;
+
+	/* The following two cases will avoid SEGV_PKUERR */
+	fault_type = -1;
+	fault_pkey = -1;
+
+	/*
+	 * Read an instruction word from the address when AMR bits
+	 * are not set i.e. the pkey permits both read and write
+	 * access.
+	 *
+	 * This should not generate a fault as having PROT_EXEC
+	 * implies PROT_READ on GNU systems. The pkey currently
+	 * restricts execution only based on the IAMR bits. The
+	 * AMR bits are cleared.
+	 */
+	remaining_faults = 0;
+	FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+	printf("read from %p, pkey permissions are %s\n", fault_addr,
+	       pkey_rights(rights));
+	i = *fault_addr;
+	FAIL_IF(remaining_faults != 0);
+
+	/*
+	 * Write an instruction word to the address when AMR bits
+	 * are not set i.e. the pkey permits both read and write
+	 * access.
+	 *
+	 * This should generate an access fault as having just
+	 * PROT_EXEC also restricts writes. The pkey currently
+	 * restricts execution only based on the IAMR bits. The
+	 * AMR bits are cleared.
+	 */
+	remaining_faults = 1;
+	FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+	printf("write to %p, pkey permissions are %s\n", fault_addr,
+	       pkey_rights(rights));
+	*fault_addr = PPC_INST_TRAP;
+	FAIL_IF(remaining_faults != 0 || fault_code != SEGV_ACCERR);
+
+	/* The following three cases will generate SEGV_PKUERR */
+	rights |= PKEY_DISABLE_ACCESS;
+	fault_type = PKEY_DISABLE_ACCESS;
+	fault_pkey = pkey;
+
+	/*
+	 * Read an instruction word from the address when AMR bits
+	 * are set i.e. the pkey permits neither read nor write
+	 * access.
+	 *
+	 * This should generate a pkey fault based on AMR bits only
+	 * as having PROT_EXEC implicitly allows reads.
+	 */
+	remaining_faults = 1;
+	FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+	pkey_set_rights(pkey, rights);
+	printf("read from %p, pkey permissions are %s\n", fault_addr,
+	       pkey_rights(rights));
+	i = *fault_addr;
+	FAIL_IF(remaining_faults != 0 || fault_code != SEGV_PKUERR);
+
+	/*
+	 * Write an instruction word to the address when AMR bits
+	 * are set i.e. the pkey permits neither read nor write
+	 * access.
+	 *
+	 * This should generate two faults. First, a pkey fault
+	 * based on AMR bits and then an access fault since
+	 * PROT_EXEC does not allow writes.
+	 */
+	remaining_faults = 2;
+	FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+	pkey_set_rights(pkey, rights);
+	printf("write to %p, pkey permissions are %s\n", fault_addr,
+	       pkey_rights(rights));
+	*fault_addr = PPC_INST_NOP;
+	FAIL_IF(remaining_faults != 0 || fault_code != SEGV_ACCERR);
+
+	/* Free the current pkey */
+	sys_pkey_free(pkey);
+
+	rights = 0;
+	do {
+		/*
+		 * Allocate pkeys with all valid combinations of read,
+		 * write and execute restrictions.
+		 */
+		pkey = sys_pkey_alloc(0, rights);
+		FAIL_IF(pkey < 0);
+
+		/*
+		 * Jump to the executable region. AMR bits may or may not
+		 * be set but they should not affect execution.
+		 *
+		 * This should generate pkey faults based on IAMR bits which
+		 * may be set to restrict execution.
+		 *
+		 * The first iteration also checks if the overwrite of the
+		 * first instruction word from a trap to a no-op succeeded.
+		 */
+		fault_pkey = pkey;
+		fault_type = -1;
+		remaining_faults = 0;
+		if (rights & PKEY_DISABLE_EXECUTE) {
+			fault_type = PKEY_DISABLE_EXECUTE;
+			remaining_faults = 1;
+		}
+
+		FAIL_IF(sys_pkey_mprotect(insns, pgsize, PROT_EXEC, pkey) != 0);
+		printf("execute at %p, pkey permissions are %s\n", fault_addr,
+		       pkey_rights(rights));
+		/*
+		 * mtctr overwrites CTR and bctrl overwrites LR, so both must
+		 * be declared as clobbered for the compiler.
+		 */
+		asm volatile("mtctr	%0; bctrl" : : "r"(insns) : "ctr", "lr");
+		FAIL_IF(remaining_faults != 0);
+		if (rights & PKEY_DISABLE_EXECUTE)
+			FAIL_IF(fault_code != SEGV_PKUERR);
+
+		/* Free the current pkey */
+		sys_pkey_free(pkey);
+
+		/* Find next valid combination of pkey rights */
+		rights = next_pkey_rights(rights);
+	} while (rights);
+
+	/* Cleanup */
+	munmap((void *) insns, pgsize);
+
+	return 0;
+}
+
+/* Entry point: run test() through test_harness() as "pkey_exec_prot". */
+int main(void)
+{
+	int rc = test_harness(test, "pkey_exec_prot");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/pkey_siginfo.c b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
new file mode 100644
index 000000000..2db76e56d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/pkey_siginfo.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ *
+ * Test if the signal information reports the correct memory protection
+ * key upon getting a key access violation fault for a page that was
+ * attempted to be protected by two different keys from two competing
+ * threads at the same time.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+
+#include "pkeys.h"
+
+#define PPC_INST_NOP 0x60000000
+#define PPC_INST_BLR 0x4e800020
+#define PROT_RWX (PROT_READ | PROT_WRITE | PROT_EXEC)
+
+#define NUM_ITERATIONS 1000000
+
+static volatile sig_atomic_t perm_pkey, rest_pkey; /* keys allocated by protect() and protect_access() respectively */
+static volatile sig_atomic_t rights, fault_count; /* restriction under test; faults taken in the current iteration */
+static volatile unsigned int *volatile fault_addr; /* address the next fault is expected to report */
+static pthread_barrier_t iteration_barrier; /* both threads wait on this at the start of every iteration */
+
+static void segv_handler(int signum, siginfo_t *sinfo, void *ctx)
+{ /* SIGSEGV handler: accept only the one expected pkey fault per iteration, then lift the restriction */
+ void *pgstart;
+ size_t pgsize;
+ int pkey;
+
+ pkey = siginfo_pkey(sinfo);
+
+ /* Check if this fault originated from a pkey access violation */
+ if (sinfo->si_code != SEGV_PKUERR) {
+ sigsafe_err("got a fault for an unexpected reason\n");
+ _exit(1);
+ }
+
+ /* Check if this fault originated from the expected address */
+ if (sinfo->si_addr != (void *) fault_addr) {
+ sigsafe_err("got a fault for an unexpected address\n");
+ _exit(1);
+ }
+
+ /* Check if this fault originated from the restrictive pkey */
+ if (pkey != rest_pkey) {
+ sigsafe_err("got a fault for an unexpected pkey\n");
+ _exit(1);
+ }
+
+ /* Check if too many faults have occurred for the same iteration */
+ if (fault_count > 0) {
+ sigsafe_err("got too many faults for the same address\n");
+ _exit(1);
+ }
+
+ pgsize = getpagesize();
+ pgstart = (void *) ((unsigned long) fault_addr & ~(pgsize - 1)); /* start of the page containing fault_addr */
+
+ /*
+ * If the current fault occurred due to lack of execute rights,
+ * reassociate the page with the exec-only pkey since execute
+ * rights cannot be changed directly for the faulting pkey as
+ * IAMR is inaccessible from userspace.
+ *
+ * Otherwise, if the current fault occurred due to lack of
+ * read-write rights, change the AMR permission bits for the
+ * pkey.
+ *
+ * This will let the test continue.
+ *
+ * NOTE(review): when rights == PKEY_DISABLE_EXECUTE and mprotect()
+ * succeeds (returns 0), the condition below is false, so the else
+ * branch runs pkey_set_rights(pkey, 0) in that case as well.
+ */
+ if (rights == PKEY_DISABLE_EXECUTE &&
+ mprotect(pgstart, pgsize, PROT_EXEC))
+ _exit(1);
+ else
+ pkey_set_rights(pkey, 0);
+
+ fault_count++;
+}
+
+struct region {
+ unsigned long rights; /* PKEY_DISABLE_* value that protect_access() copies into the file-scope 'rights' */
+ unsigned int *base; /* start of the mapping shared by both threads */
+ size_t size; /* length of the mapping in bytes */
+};
+
+static void *protect(void *p)
+{ /* Thread body: allocate the permissive pkey, then tag the region with it once per iteration; p is a struct region */
+ unsigned long rights; /* local; shadows the file-scope 'rights' inside this function */
+ unsigned int *base;
+ size_t size;
+ int tid, i;
+
+ tid = gettid();
+ base = ((struct region *) p)->base;
+ size = ((struct region *) p)->size;
+ FAIL_IF_EXIT(!base);
+
+ /* No read, write and execute restrictions */
+ rights = 0;
+
+ printf("tid %d, pkey permissions are %s\n", tid, pkey_rights(rights));
+
+ /* Allocate the permissive pkey */
+ perm_pkey = sys_pkey_alloc(0, rights);
+ FAIL_IF_EXIT(perm_pkey < 0);
+
+ /*
+ * Repeatedly try to protect the common region with a permissive
+ * pkey
+ */
+ for (i = 0; i < NUM_ITERATIONS; i++) {
+ /*
+ * Wait until the other thread has finished allocating the
+ * restrictive pkey or until the next iteration has begun
+ */
+ pthread_barrier_wait(&iteration_barrier);
+
+ /* Try to associate the permissive pkey with the region */
+ FAIL_IF_EXIT(sys_pkey_mprotect(base, size, PROT_RWX,
+ perm_pkey));
+ }
+
+ /* Free the permissive pkey */
+ sys_pkey_free(perm_pkey);
+
+ return NULL; /* the thread result carries no information */
+}
+
+static void *protect_access(void *p)
+{ /* Thread body: tag the region with the restrictive pkey each iteration, then read, write or execute in it; p is a struct region */
+ size_t size, numinsns;
+ unsigned int *base;
+ int tid, i;
+
+ tid = gettid();
+ base = ((struct region *) p)->base;
+ size = ((struct region *) p)->size;
+ rights = ((struct region *) p)->rights; /* the file-scope 'rights', which segv_handler() also reads */
+ numinsns = size / sizeof(base[0]);
+ FAIL_IF_EXIT(!base);
+
+ /* Allocate the restrictive pkey */
+ rest_pkey = sys_pkey_alloc(0, rights);
+ FAIL_IF_EXIT(rest_pkey < 0);
+
+ printf("tid %d, pkey permissions are %s\n", tid, pkey_rights(rights));
+ printf("tid %d, %s randomly in range [%p, %p]\n", tid,
+ (rights == PKEY_DISABLE_EXECUTE) ? "execute" :
+ (rights == PKEY_DISABLE_WRITE) ? "write" : "read",
+ base, base + numinsns);
+
+ /*
+ * Repeatedly try to protect the common region with a restrictive
+ * pkey and read, write or execute from it
+ */
+ for (i = 0; i < NUM_ITERATIONS; i++) {
+ /*
+ * Wait until the other thread has finished allocating the
+ * permissive pkey or until the next iteration has begun
+ */
+ pthread_barrier_wait(&iteration_barrier);
+
+ /* Try to associate the restrictive pkey with the region */
+ FAIL_IF_EXIT(sys_pkey_mprotect(base, size, PROT_RWX,
+ rest_pkey));
+
+ /* Choose a random instruction word address from the region */
+ fault_addr = base + (rand() % numinsns);
+ fault_count = 0; /* segv_handler() exits if it sees a second fault before this is reset */
+
+ switch (rights) {
+ /* Read protection test */
+ case PKEY_DISABLE_ACCESS:
+ /*
+ * Read an instruction word from the region and
+ * verify if it has not been overwritten to
+ * something unexpected
+ */
+ FAIL_IF_EXIT(*fault_addr != PPC_INST_NOP &&
+ *fault_addr != PPC_INST_BLR);
+ break;
+
+ /* Write protection test */
+ case PKEY_DISABLE_WRITE:
+ /*
+ * Write an instruction word to the region and
+ * verify if the overwrite has succeeded
+ */
+ *fault_addr = PPC_INST_BLR;
+ FAIL_IF_EXIT(*fault_addr != PPC_INST_BLR);
+ break;
+
+ /* Execute protection test */
+ case PKEY_DISABLE_EXECUTE:
+ /* Jump to the region and execute instructions */
+ asm volatile(
+ "mtctr %0; bctrl"
+ : : "r"(fault_addr) : "ctr", "lr");
+ break;
+ }
+
+ /*
+ * Restore the restrictions originally imposed by the
+ * restrictive pkey as the signal handler would have
+ * cleared out the corresponding AMR bits
+ */
+ pkey_set_rights(rest_pkey, rights);
+ }
+
+ /* Free restrictive pkey */
+ sys_pkey_free(rest_pkey);
+
+ return NULL; /* the thread result carries no information */
+}
+
+/*
+ * Allocate NR_PKEYS keys with the given @rights and then free them all
+ * again. Allocation results are not checked: whatever sys_pkey_alloc()
+ * returned for a slot is what gets passed to sys_pkey_free().
+ */
+static void reset_pkeys(unsigned long rights)
+{
+	int keys[NR_PKEYS];
+	int *k;
+
+	/* Exhaustively allocate all available pkeys */
+	for (k = keys; k < keys + NR_PKEYS; k++)
+		*k = sys_pkey_alloc(0, rights);
+
+	/* Free all allocated pkeys */
+	for (k = keys; k < keys + NR_PKEYS; k++)
+		sys_pkey_free(*k);
+}
+
+static int test(void)
+{ /* Run three (protect, protect_access) thread pairs over one RWX page: read, write and execute restrictions */
+ pthread_t prot_thread, pacc_thread;
+ struct sigaction act;
+ pthread_attr_t attr;
+ size_t numinsns;
+ struct region r;
+ int ret, i;
+
+ srand(time(NULL));
+ ret = pkeys_unsupported();
+ if (ret)
+ return ret;
+
+ /* Allocate the region */
+ r.size = getpagesize();
+ r.base = mmap(NULL, r.size, PROT_RWX,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ FAIL_IF(r.base == MAP_FAILED);
+
+ /*
+ * Fill the region with no-ops with a branch at the end
+ * for returning to the caller
+ */
+ numinsns = r.size / sizeof(r.base[0]);
+ for (i = 0; i < numinsns - 1; i++)
+ r.base[i] = PPC_INST_NOP;
+ r.base[i] = PPC_INST_BLR; /* i == numinsns - 1 after the loop, i.e. the last word */
+
+ /* Setup SIGSEGV handler */
+ act.sa_handler = 0;
+ act.sa_sigaction = segv_handler;
+ FAIL_IF(sigprocmask(SIG_SETMASK, 0, &act.sa_mask) != 0);
+ act.sa_flags = SA_SIGINFO;
+ act.sa_restorer = 0;
+ FAIL_IF(sigaction(SIGSEGV, &act, NULL) != 0);
+
+ /*
+ * For these tests, the parent process should clear all bits of
+ * AMR and IAMR, i.e. impose no restrictions, for all available
+ * pkeys. This will be the base for the initial AMR and IAMR
+ * values for all the test thread pairs.
+ *
+ * If the AMR and IAMR bits of all available pkeys are cleared
+ * before running the tests and a fault is generated when
+ * attempting to read, write or execute instructions from a
+ * pkey protected region, the pkey responsible for this must be
+ * the one from the protect-and-access thread since the other
+ * one is fully permissive. Despite that, if the pkey reported
+ * by siginfo is not the restrictive pkey, then there must be a
+ * kernel bug.
+ */
+ reset_pkeys(0);
+
+ /* Setup barrier for protect and protect-and-access threads */
+ FAIL_IF(pthread_attr_init(&attr) != 0);
+ FAIL_IF(pthread_barrier_init(&iteration_barrier, NULL, 2) != 0); /* count of 2: one per thread of a pair */
+
+ /* Setup and start protect and protect-and-read threads */
+ puts("starting thread pair (protect, protect-and-read)");
+ r.rights = PKEY_DISABLE_ACCESS;
+ FAIL_IF(pthread_create(&prot_thread, &attr, &protect, &r) != 0);
+ FAIL_IF(pthread_create(&pacc_thread, &attr, &protect_access, &r) != 0);
+ FAIL_IF(pthread_join(prot_thread, NULL) != 0);
+ FAIL_IF(pthread_join(pacc_thread, NULL) != 0);
+
+ /* Setup and start protect and protect-and-write threads */
+ puts("starting thread pair (protect, protect-and-write)");
+ r.rights = PKEY_DISABLE_WRITE;
+ FAIL_IF(pthread_create(&prot_thread, &attr, &protect, &r) != 0);
+ FAIL_IF(pthread_create(&pacc_thread, &attr, &protect_access, &r) != 0);
+ FAIL_IF(pthread_join(prot_thread, NULL) != 0);
+ FAIL_IF(pthread_join(pacc_thread, NULL) != 0);
+
+ /* Setup and start protect and protect-and-execute threads */
+ puts("starting thread pair (protect, protect-and-execute)");
+ r.rights = PKEY_DISABLE_EXECUTE;
+ FAIL_IF(pthread_create(&prot_thread, &attr, &protect, &r) != 0);
+ FAIL_IF(pthread_create(&pacc_thread, &attr, &protect_access, &r) != 0);
+ FAIL_IF(pthread_join(prot_thread, NULL) != 0);
+ FAIL_IF(pthread_join(pacc_thread, NULL) != 0);
+
+ /* Cleanup */
+ FAIL_IF(pthread_attr_destroy(&attr) != 0);
+ FAIL_IF(pthread_barrier_destroy(&iteration_barrier) != 0);
+ munmap(r.base, r.size);
+
+ return 0;
+}
+
+/* Entry point: run test() through test_harness() as "pkey_siginfo". */
+int main(void)
+{
+	int rc = test_harness(test, "pkey_siginfo");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/prot_sao.c b/tools/testing/selftests/powerpc/mm/prot_sao.c
new file mode 100644
index 000000000..30b71b1d7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/prot_sao.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2016, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <asm/cputable.h>
+
+#include "utils.h"
+
+#define SIZE (64 * 1024)
+
+/*
+ * Check that mmap() accepts PROT_SAO and that the resulting mapping can be
+ * written to.
+ *
+ * Skipped via SKIP_IF() when the hwcap or guest checks below say so;
+ * FAIL_IF() covers an mmap() failure. Returns 0 once the mapping has been
+ * written.
+ */
+int test_prot_sao(void)
+{
+	const int prot = PROT_READ | PROT_WRITE | PROT_SAO;
+	const int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+	char *mapping;
+
+	/*
+	 * SAO was introduced in 2.06 and removed in 3.1. It's disabled in
+	 * guests/LPARs by default, so also skip if we are running in a guest.
+	 */
+	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06) ||
+		have_hwcap2(PPC_FEATURE2_ARCH_3_1) ||
+		access("/proc/device-tree/rtas/ibm,hypertas-functions", F_OK) == 0);
+
+	/*
+	 * Ensure we can ask for PROT_SAO.
+	 * We can't really verify that it does the right thing, but at least we
+	 * confirm the kernel will accept it.
+	 */
+	mapping = mmap(NULL, SIZE, prot, flags, -1, 0);
+	FAIL_IF(mapping == MAP_FAILED);
+
+	/* Write to the mapping, to at least cause a fault */
+	memset(mapping, 0xaa, SIZE);
+
+	return 0;
+}
+
+/* Entry point: run test_prot_sao() through test_harness() as "prot-sao". */
+int main(void)
+{
+	int rc = test_harness(test_prot_sao, "prot-sao");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/segv_errors.c b/tools/testing/selftests/powerpc/mm/segv_errors.c
new file mode 100644
index 000000000..06ae76ee3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/segv_errors.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2017 John Sperbeck
+ *
+ * Test that an access to a mapped but inaccessible area causes a SEGV and
+ * reports si_code == SEGV_ACCERR.
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/mman.h>
+#include <assert.h>
+#include <ucontext.h>
+
+#include "utils.h"
+
+static bool faulted; /* set to true by segv_handler() */
+static int si_code; /* si_code of the last SIGSEGV, stored by segv_handler() */
+
+/*
+ * SIGSEGV handler: note that a fault happened and its si_code, then advance
+ * the saved nip by 4 before returning.
+ */
+static void segv_handler(int n, siginfo_t *info, void *ctxt_v)
+{
+	ucontext_t *uc = (ucontext_t *)ctxt_v;
+
+	faulted = true;
+	si_code = info->si_code;
+
+	uc->uc_mcontext.regs->nip += 4;
+}
+
+int test_segv_errors(void)
+{ /* Check that a load and a store to a page mapped with prot 0 each raise SIGSEGV with si_code SEGV_ACCERR */
+ struct sigaction act = {
+ .sa_sigaction = segv_handler,
+ .sa_flags = SA_SIGINFO,
+ };
+ char c, *p = NULL;
+
+ p = mmap(NULL, getpagesize(), 0, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); /* prot == 0: no access permitted */
+ FAIL_IF(p == MAP_FAILED);
+
+ FAIL_IF(sigaction(SIGSEGV, &act, NULL) != 0);
+
+ faulted = false;
+ si_code = 0;
+
+ /*
+ * We just need a compiler barrier, but mb() works and has the nice
+ * property of being easy to spot in the disassembly.
+ */
+ mb();
+ c = *p; /* load: expected to fault; segv_handler() advances nip by 4 */
+ mb();
+
+ FAIL_IF(!faulted);
+ FAIL_IF(si_code != SEGV_ACCERR);
+
+ faulted = false;
+ si_code = 0;
+
+ mb();
+ *p = c; /* store: expected to fault in the same way */
+ mb();
+
+ FAIL_IF(!faulted);
+ FAIL_IF(si_code != SEGV_ACCERR);
+
+ return 0;
+}
+
+/* Entry point: run test_segv_errors() through test_harness() as "segv_errors". */
+int main(void)
+{
+	int rc = test_harness(test_segv_errors, "segv_errors");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
new file mode 100644
index 000000000..ed9143990
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_ldst.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that loads/stores expand the stack segment, or trigger a SEGV, in
+ * various conditions.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+/* Address of the local in the deepest consume_stack() frame; printed by child(). */
+volatile char *stack_top_ptr;
+/* Stack pointer sampled in the deepest consume_stack() frame; printed by child(). */
+volatile unsigned long stack_top_sp;
+/* Source (for stores) and sink (for loads) of the byte used by the test access. */
+volatile char c;
+
+/* Kind of memory access consume_stack() performs at the target address. */
+enum access_type {
+ LOAD,
+ STORE,
+};
+
+/*
+ * Consume stack by recursing until the address of a local is at or below
+ * @target_sp, then do a single one-byte access (load or store, per @type) at
+ * address @stack_high - @delta + 1, i.e. @delta bytes below the end of the
+ * stack mapping.
+ *
+ * Also records the address of the deepest local in stack_top_ptr and the
+ * stack pointer at that depth in stack_top_sp.
+ *
+ * Returns 0 if the access did not kill the process.
+ */
+__attribute__ ((noinline))
+int consume_stack(unsigned long target_sp, unsigned long stack_high, int delta, enum access_type type)
+{
+ unsigned long target;
+ char stack_cur;
+
+ if ((unsigned long)&stack_cur > target_sp)
+ return consume_stack(target_sp, stack_high, delta, type);
+ else {
+ // We don't really need this, but without it GCC might not
+ // generate a recursive call above.
+ stack_top_ptr = &stack_cur;
+
+ // Sample the hardware stack pointer (r1 on powerpc, rsp otherwise).
+#ifdef __powerpc__
+ asm volatile ("mr %[sp], %%r1" : [sp] "=r" (stack_top_sp));
+#else
+ asm volatile ("mov %%rsp, %[sp]" : [sp] "=r" (stack_top_sp));
+#endif
+ target = stack_high - delta + 1;
+ volatile char *p = (char *)target;
+
+ if (type == STORE)
+ *p = c;
+ else
+ c = *p;
+
+ // Do something to prevent the stack frame being popped prior to
+ // our access above.
+ getpid();
+ }
+
+ return 0;
+}
+
+/*
+ * Look up the first mapping in /proc/self/maps whose name contains @needle.
+ *
+ * On success stores the mapping's first address in @low and its last valid
+ * address (end - 1) in @high and returns 0. Returns -1 if the file cannot be
+ * opened, a line cannot be parsed, or no mapping matches.
+ */
+static int search_proc_maps(char *needle, unsigned long *low, unsigned long *high)
+{
+	unsigned long start, end;
+	static char buf[4096];
+	char name[128];
+	int fields;
+	int rc = -1;	// stays -1 unless the needle is found
+	FILE *f;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		// The device field is "major:minor" in hex, so parse it with %x.
+		fields = sscanf(buf, "%lx-%lx %*c%*c%*c%*c %*x %*x:%*x %*d %127s\n",
+				&start, &end, name);
+
+		// Two conversions means the mapping has no name; skip it.
+		if (fields == 2)
+			continue;
+
+		if (fields != 3) {
+			printf("sscanf errored\n");
+			break;
+		}
+
+		if (strstr(name, needle)) {
+			*low = start;
+			*high = end - 1;
+			rc = 0;
+			break;
+		}
+	}
+
+	fclose(f);
+
+	return rc;
+}
+
+/*
+ * Body of the forked child: find the [stack] mapping, consume @stack_used
+ * bytes of stack, perform the @type access @delta bytes below the end of the
+ * stack mapping and, if that did not kill us, report the result.
+ *
+ * Returns 0. The asserts are always active because this file undefines
+ * NDEBUG before including <assert.h>.
+ */
+int child(unsigned int stack_used, int delta, enum access_type type)
+{
+ unsigned long low, stack_high;
+
+ assert(search_proc_maps("[stack]", &low, &stack_high) == 0);
+
+ assert(consume_stack(stack_high - stack_used, stack_high, delta, type) == 0);
+
+ printf("Access OK: %s delta %-7d used size 0x%06x stack high 0x%lx top_ptr %p top sp 0x%lx actual used 0x%lx\n",
+ type == LOAD ? "load" : "store", delta, stack_used, stack_high,
+ stack_top_ptr, stack_top_sp, stack_high - stack_top_sp + 1);
+
+ return 0;
+}
+
+/*
+ * Run one access test in a forked child and classify the outcome.
+ *
+ * Returns 0 if the child exited normally with status 0 (the access
+ * succeeded), or 1 if the child did not exit normally, in which case the
+ * terminating signal is printed. A normal exit with a non-zero status trips
+ * the assert.
+ */
+static int test_one(unsigned int stack_used, int delta, enum access_type type)
+{
+ pid_t pid;
+ int rc;
+
+ pid = fork();
+ if (pid == 0)
+ exit(child(stack_used, delta, type));
+
+ // NOTE(review): a failed fork() (pid == -1) is not handled explicitly;
+ // waitpid(-1, ...) would then wait for any child.
+ assert(waitpid(pid, &rc, 0) != -1);
+
+ if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0)
+ return 0;
+
+ // We don't expect a non-zero exit that's not a signal
+ assert(!WIFEXITED(rc));
+
+ printf("Faulted: %s delta %-7d used size 0x%06x signal %d\n",
+ type == LOAD ? "load" : "store", delta, stack_used,
+ WTERMSIG(rc));
+
+ return 1;
+}
+
+// This is fairly arbitrary but is well below any of the targets below,
+// so that the delta between the stack pointer and the target is large.
+#define DEFAULT_SIZE (32 * _KB)
+
+/*
+ * For one access @type, check that an access at every @page_size step below
+ * the end of the stack mapping, up to and including @rlim_cur, succeeds, and
+ * that an access one byte past @rlim_cur faults.
+ *
+ * NOTE(review): delta is an unsigned long here but an int in test_one(), and
+ * the loop relies on @rlim_cur being a finite value.
+ */
+static void test_one_type(enum access_type type, unsigned long page_size, unsigned long rlim_cur)
+{
+ unsigned long delta;
+
+ // We should be able to access anywhere within the rlimit
+ for (delta = page_size; delta <= rlim_cur; delta += page_size)
+ assert(test_one(DEFAULT_SIZE, delta, type) == 0);
+
+ assert(test_one(DEFAULT_SIZE, rlim_cur, type) == 0);
+
+ // But if we go past the rlimit it should fail
+ assert(test_one(DEFAULT_SIZE, rlim_cur + 1, type) != 0);
+}
+
+/*
+ * Test entry point: read the stack rlimit and run the load and store
+ * variants of the test against it.
+ *
+ * Returns 0 on success and 1 if the rlimit cannot be read or is unlimited
+ * (the per-page sweep in test_one_type() needs a finite limit). Failed
+ * expectations abort via assert().
+ */
+static int test(void)
+{
+	unsigned long page_size;
+	struct rlimit rlimit;
+
+	page_size = getpagesize();
+
+	// Don't run the sweep against an uninitialised limit.
+	if (getrlimit(RLIMIT_STACK, &rlimit) != 0) {
+		perror("getrlimit");
+		return 1;
+	}
+
+	printf("Stack rlimit is 0x%lx\n", (unsigned long)rlimit.rlim_cur);
+
+	// With an unlimited stack there is no boundary to probe, and the
+	// sweep "delta <= rlim_cur" would never reach its end.
+	if (rlimit.rlim_cur == RLIM_INFINITY) {
+		printf("Stack rlimit is unlimited, cannot test\n");
+		return 1;
+	}
+
+	printf("Testing loads ...\n");
+	test_one_type(LOAD, page_size, rlimit.rlim_cur);
+	printf("Testing stores ...\n");
+	test_one_type(STORE, page_size, rlimit.rlim_cur);
+
+	printf("All OK\n");
+
+	return 0;
+}
+
+/*
+ * On powerpc run test() under the selftest harness from utils.h; on any
+ * other architecture call it directly.
+ */
+#ifdef __powerpc__
+#include "utils.h"
+
+int main(void)
+{
+ return test_harness(test, "stack_expansion_ldst");
+}
+#else
+int main(void)
+{
+ return test();
+}
+#endif
diff --git a/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
new file mode 100644
index 000000000..c8b32a29e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/stack_expansion_signal.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that signal delivery is able to expand the stack segment without
+ * triggering a SEGV.
+ *
+ * Based on test code by Tom Lane.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../pmu/lib.h"
+#include "utils.h"
+
+#define _KB (1024)
+#define _MB (1024 * 1024)
+
+/* Reference address near the child's initial stack frame, rounded up to 64K in child(). */
+static char *stack_base_ptr;
+/* Address of the local in the deepest consume_stack() frame. */
+static char *stack_top_ptr;
+
+/* Set to 1 by sigusr1_handler(); polled by consume_stack(). */
+static volatile sig_atomic_t sig_occurred = 0;
+
+/* SIGUSR1 handler: just flag that the signal was delivered. */
+static void sigusr1_handler(int signal_arg)
+{
+ sig_occurred = 1;
+}
+
+/*
+ * Recurse until at least @stack_size bytes lie between stack_base_ptr and the
+ * current frame, then tell the parent we are ready and spin until the
+ * SIGUSR1 handler has run.
+ *
+ * Returns 0 once the signal has been seen.
+ */
+static int consume_stack(unsigned int stack_size, union pipe write_pipe)
+{
+	char stack_cur;
+
+	/* Not deep enough yet: burn another frame. */
+	if ((stack_base_ptr - &stack_cur) < stack_size)
+		return consume_stack(stack_size, write_pipe);
+
+	stack_top_ptr = &stack_cur;
+
+	FAIL_IF(notify_parent(write_pipe));
+
+	/* Wait here, at full depth, for the signal to be delivered. */
+	while (!sig_occurred)
+		barrier();
+
+	return 0;
+}
+
+/*
+ * Body of the forked child: install the SIGUSR1 handler, pick a 64K-aligned
+ * reference address just above the current frame, consume @stack_size bytes
+ * of stack below it and wait there for the parent's signal.
+ *
+ * Returns 0 once the signal was handled; exits with status 1 if the handler
+ * cannot be installed.
+ */
+static int child(unsigned int stack_size, union pipe write_pipe)
+{
+	char stack_base;
+	struct sigaction act;
+
+	/* Round the address of our local up to the next 64K boundary. */
+	stack_base_ptr = (char *) (((size_t) &stack_base + 65535) & ~65535UL);
+
+	act.sa_flags = 0;
+	act.sa_handler = sigusr1_handler;
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGUSR1, &act, NULL) < 0)
+		err(1, "sigaction");
+
+	FAIL_IF(consume_stack(stack_size, write_pipe));
+
+	printf("size 0x%06x: OK, stack base %p top %p (%zx used)\n",
+	       stack_size, stack_base_ptr, stack_top_ptr,
+	       stack_base_ptr - stack_top_ptr);
+
+	return 0;
+}
+
+/*
+ * Fork a child that consumes @stack_size bytes of stack, wait until it
+ * reports that it is at full depth, send it SIGUSR1 and check that it then
+ * exits cleanly.
+ *
+ * Returns 0 on success and non-zero if any step fails.
+ */
+static int test_one_size(unsigned int stack_size)
+{
+	union pipe read_pipe, write_pipe;
+	pid_t pid;
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	if (pid == -1) {
+		/*
+		 * Never carry on with pid == -1: kill(-1, SIGUSR1) below
+		 * would signal every process we are allowed to signal.
+		 */
+		perror("fork");
+		close(read_pipe.read_fd);
+		close(read_pipe.write_fd);
+		close(write_pipe.read_fd);
+		close(write_pipe.write_fd);
+		return 1;
+	}
+
+	if (pid == 0) {
+		close(read_pipe.read_fd);
+		close(write_pipe.write_fd);
+		/* The pipe the parent reads from is the one the child writes to. */
+		exit(child(stack_size, read_pipe));
+	}
+
+	close(read_pipe.write_fd);
+	close(write_pipe.read_fd);
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	kill(pid, SIGUSR1);
+
+	FAIL_IF(wait_for_child(pid));
+
+	close(read_pipe.read_fd);
+	close(write_pipe.write_fd);
+
+	return 0;
+}
+
+/*
+ * Run test_one_size() for every used-stack size from 1MB - 64K up to (but
+ * not including) 1MB + 64K, in steps of 64 bytes.
+ *
+ * Returns 0 if every size passed.
+ */
+int test(void)
+{
+	unsigned int size;
+
+	// Test with used stack from 1MB - 64K to 1MB + 64K
+	// Increment by 64 to get more coverage of odd sizes
+	for (size = (1 * _MB) - (64 * _KB);
+	     size < (1 * _MB) + (64 * _KB);
+	     size += 64)
+		FAIL_IF(test_one_size(size));
+
+	return 0;
+}
+
+/* Run test() under the selftest harness as "stack_expansion_signal". */
+int main(void)
+{
+ return test_harness(test, "stack_expansion_signal");
+}
diff --git a/tools/testing/selftests/powerpc/mm/subpage_prot.c b/tools/testing/selftests/powerpc/mm/subpage_prot.c
new file mode 100644
index 000000000..3ae77ba93
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/subpage_prot.c
@@ -0,0 +1,236 @@
+/*
+ * Copyright IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/syscall.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+/* Path of the file mapped by test_file(); set in main(). */
+char *file_name;
+
+/* Non-zero while run_test() is probing; segv() treats any other fault as fatal. */
+int in_test;
+/* Set to 1 by segv() when an access faulted; cleared by check_faulted(). */
+volatile int faulted;
+/* Faulting address (regs->dar) recorded by segv(). */
+volatile void *dar;
+/* Number of mismatches counted by check_faulted() in the current run_test(). */
+int errors;
+
+/*
+ * SIGSEGV handler. While a test is running, record the fault and its address
+ * and step the saved NIP past the faulting instruction; a fault at any other
+ * time is fatal.
+ */
+static void segv(int signum, siginfo_t *info, void *ctxt_v)
+{
+	ucontext_t *uc = ctxt_v;
+	struct pt_regs *r = uc->uc_mcontext.regs;
+
+	if (in_test) {
+		faulted = 1;
+		dar = (void *)r->dar;
+		r->nip += 4;
+		return;
+	}
+
+	fprintf(stderr, "Segfault outside of test !\n");
+	exit(1);
+}
+
+/*
+ * Load one 32-bit word from @addr and discard it.
+ *
+ * NOTE(review): the "twi 0,%0,0; isync" pair after the lwz looks like the
+ * usual powerpc idiom for forcing the load to complete before continuing;
+ * confirm against the ISA documentation.
+ */
+static inline void do_read(const volatile void *addr)
+{
+ int ret;
+
+ asm volatile("lwz %0,0(%1); twi 0,%0,0; isync;\n"
+ : "=r" (ret) : "r" (addr) : "memory");
+}
+
+/* Store the 32-bit constant 0x1234567 to @addr, followed by a sync. */
+static inline void do_write(const volatile void *addr)
+{
+ int val = 0x1234567;
+
+ asm volatile("stw %0,0(%1); sync; \n"
+ : : "r" (val), "r" (addr) : "memory");
+}
+
+/*
+ * Compare the outcome of the last access to @addr with what the protection
+ * map set up by run_test() implies, then reset the fault state.
+ *
+ * Any access to subpage (page + 3) % 16 must fault; a write must also fault
+ * on subpage (page + 1) % 16. A mismatch is printed and counted in errors. A
+ * fault reported at an address other than @addr is printed but not counted.
+ */
+static inline void check_faulted(void *addr, long page, long subpage, int write)
+{
+	int want_fault;
+
+	want_fault = (subpage == ((page + 3) % 16)) ||
+		     (write && subpage == ((page + 1) % 16));
+
+	if (faulted != want_fault) {
+		printf("Failed at %p (p=%ld,sp=%ld,w=%d), want=%s, got=%s !\n",
+		       addr, page, subpage, write,
+		       want_fault ? "fault" : "pass",
+		       faulted ? "fault" : "pass");
+		++errors;
+	}
+
+	if (!faulted)
+		return;
+
+	if (dar != addr)
+		printf("Fault expected at %p and happened at %p !\n",
+		       addr, dar);
+
+	faulted = 0;
+	asm volatile("sync" : : : "memory");
+}
+
+/*
+ * Apply a per-subpage protection pattern to the @size bytes at @addr with the
+ * subpage_prot syscall, then read and write every 4K subpage and check that
+ * exactly the expected accesses fault.
+ *
+ * @addr is treated as a sequence of 64K pages of sixteen 4K subpages each.
+ *
+ * Returns 0 if every access behaved as expected, 1 if the syscall failed or
+ * any mismatch was detected.
+ */
+static int run_test(void *addr, unsigned long size)
+{
+	unsigned int *map;
+	long i, j, pages, err;
+
+	pages = size / 0x10000;
+	map = malloc(pages * 4);
+	assert(map);
+
+	/*
+	 * for each page, mark subpage (i + 1) % 16 read only and subpage
+	 * (i + 3) % 16 inaccessible
+	 */
+	for (i = 0; i < pages; i++) {
+		map[i] = (0x40000000 >> (((i + 1) * 2) % 32)) |
+			 (0xc0000000 >> (((i + 3) * 2) % 32));
+	}
+
+	err = syscall(__NR_subpage_prot, addr, size, map);
+	/* Report before freeing so that errno is still the syscall's. */
+	if (err)
+		perror("subpage_perm");
+	/* The map is not needed after the syscall, on either path. */
+	free(map);
+	if (err)
+		return 1;
+
+	in_test = 1;
+	errors = 0;
+	for (i = 0; i < pages; i++) {
+		for (j = 0; j < 16; j++, addr += 0x1000) {
+			do_read(addr);
+			check_faulted(addr, i, j, 0);
+			do_write(addr);
+			check_faulted(addr, i, j, 1);
+		}
+	}
+
+	in_test = 0;
+	if (errors) {
+		printf("%d errors detected\n", errors);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Probe for the subpage_prot syscall with an all-zero call.
+ *
+ * Returns non-zero if the call succeeded or failed with anything other than
+ * ENOENT or ENOSYS, and 0 otherwise.
+ */
+static int syscall_available(void)
+{
+	errno = 0;
+
+	if (syscall(__NR_subpage_prot, 0, 0, 0) == 0)
+		return 1;
+
+	return errno != ENOENT && errno != ENOSYS;
+}
+
+/*
+ * Run the subpage protection test on a 64MB, 64K-aligned anonymous block
+ * obtained from posix_memalign().
+ *
+ * Skipped if the syscall is unavailable. Returns 1 if the kernel page size
+ * is not 64K, otherwise the result of run_test().
+ */
+int test_anon(void)
+{
+	struct sigaction sa = {
+		.sa_flags = SA_SIGINFO,
+		.sa_sigaction = segv
+	};
+	unsigned long mallocsize = 4 * 16 * 1024 * 1024;
+	unsigned long aligned;
+	void *mallocblock;
+
+	SKIP_IF(!syscall_available());
+
+	if (getpagesize() != 0x10000) {
+		fprintf(stderr, "Kernel page size must be 64K!\n");
+		return 1;
+	}
+
+	sigaction(SIGSEGV, &sa, NULL);
+
+	FAIL_IF(posix_memalign(&mallocblock, 64 * 1024, mallocsize));
+
+	/* Round the block start up to a 64K boundary if it is not on one. */
+	aligned = (unsigned long)mallocblock;
+	if (aligned & 0xffff)
+		aligned = (aligned | 0xffff) + 1;
+	mallocblock = (void *)aligned;
+
+	printf("allocated malloc block of 0x%lx bytes at %p\n",
+	       mallocsize, mallocblock);
+
+	printf("testing malloc block...\n");
+
+	return run_test(mallocblock, mallocsize);
+}
+
+/*
+ * Run the subpage protection test on a shared, read/write mapping of
+ * file_name, using the file's size rounded down to a multiple of 64K.
+ *
+ * Skipped if the syscall is unavailable. Returns 1 if the file cannot be
+ * opened, sized or mapped, otherwise the result of run_test().
+ */
+int test_file(void)
+{
+	struct sigaction act = {
+		.sa_sigaction = segv,
+		.sa_flags = SA_SIGINFO
+	};
+	void *fileblock;
+	off_t filesize;
+	int fd;
+
+	SKIP_IF(!syscall_available());
+
+	fd = open(file_name, O_RDWR);
+	if (fd == -1) {
+		perror("failed to open file");
+		return 1;
+	}
+	sigaction(SIGSEGV, &act, NULL);
+
+	filesize = lseek(fd, 0, SEEK_END);
+	if (filesize == (off_t)-1) {
+		/* Don't feed the error value into the size arithmetic below. */
+		perror("failed to get file size");
+		close(fd);
+		return 1;
+	}
+	if (filesize & 0xffff)
+		filesize &= ~0xfffful;
+
+	fileblock = mmap(NULL, filesize, PROT_READ | PROT_WRITE,
+			 MAP_SHARED, fd, 0);
+	if (fileblock == MAP_FAILED) {
+		perror("failed to map file");
+		close(fd);
+		return 1;
+	}
+	/* The mapping stays valid without the descriptor. */
+	close(fd);
+
+	printf("allocated %s for 0x%lx bytes at %p\n",
+	       file_name, (unsigned long)filesize, fileblock);
+
+	printf("testing file map...\n");
+
+	return run_test(fileblock, filesize);
+}
+
+/*
+ * Run the anonymous-memory test and, if it passes, the file-backed test.
+ * The file to map is argv[1] if given, otherwise "tempfile".
+ */
+int main(int argc, char *argv[])
+{
+	int rc = test_harness(test_anon, "subpage_prot_anon");
+
+	if (rc != 0)
+		return rc;
+
+	file_name = (argc > 1) ? argv[1] : "tempfile";
+
+	return test_harness(test_file, "subpage_prot_file");
+}
diff --git a/tools/testing/selftests/powerpc/mm/tlbie_test.c b/tools/testing/selftests/powerpc/mm/tlbie_test.c
new file mode 100644
index 000000000..f85a0938a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/tlbie_test.c
@@ -0,0 +1,734 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2019, Nick Piggin, Gautham R. Shenoy, Aneesh Kumar K.V, IBM Corp.
+ */
+
+/*
+ *
+ * Test tlbie/mtpidr race. We have 4 threads doing a flush/load/compare/store
+ * sequence in a loop. The same threads also run a context switch task
+ * that does sched_yield() in a loop.
+ *
+ * The snapshot thread marks the mmap area PROT_READ in between, makes a copy
+ * and copies it back to the original area. This helps us to detect if any
+ * store continued to happen after we marked the memory PROT_READ.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <linux/futex.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <time.h>
+#include <stdarg.h>
+#include <sched.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+/*
+ * Issue a dcbf on the address @addr followed by a sync; callers use it to
+ * flush a word from the data cache before re-reading it.
+ */
+static inline void dcbf(volatile unsigned int *addr)
+{
+ __asm__ __volatile__ ("dcbf %y0; sync" : : "Z"(*(unsigned char *)addr) : "memory");
+}
+
+/* Print @msg and the current time inside a banner, then exit with status 1. */
+static void err_msg(char *msg)
+{
+	time_t now = time(NULL);
+
+	printf("=================================\n");
+	printf(" Error: %s\n", msg);
+	printf(" %s", ctime(&now));
+	printf("=================================\n");
+	exit(1);
+}
+
+/* First attachment of the shared segment: the rim_threads' chunks live here and its protection is toggled. */
+static char *map1;
+/* Second attachment of the same segment: used by the snapshot thread for the copy-back. */
+static char *map2;
+/* Printed as "PID" in the header of each verification log. */
+static pid_t rim_process_pid;
+
+/*
+ * A "rim-sequence" is defined to be the sequence of the following
+ * operations performed on a memory word:
+ * 1) FLUSH the contents of that word.
+ * 2) LOAD the contents of that word.
+ * 3) COMPARE the contents of that word with the content that was
+ * previously stored at that word
+ * 4) STORE new content into that word.
+ *
+ * The threads in this test that perform the rim-sequence are termed
+ * as rim_threads.
+ */
+
+/*
+ * A "corruption" is defined to be the failed COMPARE operation in a
+ * rim-sequence.
+ *
+ * A rim_thread that detects a corruption informs about it to all the
+ * other rim_threads, and the mem_snapshot thread.
+ */
+static volatile unsigned int corruption_found;
+
+/*
+ * This defines the maximum number of rim_threads in this test.
+ *
+ * The THREAD_ID_BITS denote the number of bits required
+ * to represent the thread_ids [0..MAX_THREADS - 1].
+ * We are being a bit paranoid here and set it to 8 bits,
+ * though 6 bits suffice.
+ *
+ */
+#define MAX_THREADS 64
+#define THREAD_ID_BITS 8
+#define THREAD_ID_MASK ((1 << THREAD_ID_BITS) - 1)
+static unsigned int rim_thread_ids[MAX_THREADS];
+static pthread_t rim_threads[MAX_THREADS];
+
+
+/*
+ * Each rim_thread works on an exclusive "chunk" of size
+ * RIM_CHUNK_SIZE.
+ *
+ * The ith rim_thread works on the ith chunk.
+ *
+ * The ith chunk begins at
+ * map1 + (i * RIM_CHUNK_SIZE)
+ */
+#define RIM_CHUNK_SIZE 1024
+#define BITS_PER_BYTE 8
+#define WORD_SIZE (sizeof(unsigned int))
+#define WORD_BITS (WORD_SIZE * BITS_PER_BYTE)
+#define WORDS_PER_CHUNK (RIM_CHUNK_SIZE/WORD_SIZE)
+
+/* Return the address of the first byte of the chunk owned by @thread_id. */
+static inline char *compute_chunk_start_addr(unsigned int thread_id)
+{
+	return (char *)((unsigned long)map1 + (thread_id * RIM_CHUNK_SIZE));
+}
+
+/*
+ * The "word-offset" of a word-aligned address inside a chunk, is
+ * defined to be the number of words that precede the address in that
+ * chunk.
+ *
+ * WORD_OFFSET_BITS denote the number of bits required to represent
+ * the word-offsets of all the word-aligned addresses of a chunk.
+ */
+#define WORD_OFFSET_BITS (__builtin_ctz(WORDS_PER_CHUNK))
+#define WORD_OFFSET_MASK ((1 << WORD_OFFSET_BITS) - 1)
+
+/* Return the number of whole words between the chunk start @start and @addr. */
+static inline unsigned int compute_word_offset(char *start, unsigned int *addr)
+{
+	unsigned int delta_bytes = (unsigned long)addr - (unsigned long)start;
+
+	return delta_bytes / WORD_SIZE;
+}
+
+/*
+ * A "sweep" is defined to be the sequential execution of the
+ * rim-sequence by a rim_thread on its chunk one word at a time,
+ * starting from the first word of its chunk and ending with the last
+ * word of its chunk.
+ *
+ * Each sweep of a rim_thread is uniquely identified by a sweep_id.
+ * SWEEP_ID_BITS denote the number of bits required to represent
+ * the sweep_ids of rim_threads.
+ *
+ * As to why SWEEP_ID_BITS are computed as a function of THREAD_ID_BITS,
+ * WORD_OFFSET_BITS, and WORD_BITS, see the "store-pattern" below.
+ */
+#define SWEEP_ID_BITS (WORD_BITS - (THREAD_ID_BITS + WORD_OFFSET_BITS))
+#define SWEEP_ID_MASK ((1 << SWEEP_ID_BITS) - 1)
+
+/*
+ * A "store-pattern" is the word-pattern that is stored into a word
+ * location in the 4)STORE step of the rim-sequence.
+ *
+ * In the store-pattern, we shall encode:
+ *
+ * - The thread-id of the rim_thread performing the store
+ * (The most significant THREAD_ID_BITS)
+ *
+ * - The word-offset of the address into which the store is being
+ * performed (The next WORD_OFFSET_BITS)
+ *
+ * - The sweep_id of the current sweep in which the store is
+ * being performed. (The lower SWEEP_ID_BITS)
+ *
+ * Store Pattern: 32 bits
+ * |------------------|--------------------|---------------------------------|
+ * | Thread id | Word offset | sweep_id |
+ * |------------------|--------------------|---------------------------------|
+ * THREAD_ID_BITS WORD_OFFSET_BITS SWEEP_ID_BITS
+ *
+ * In the store pattern, the (Thread-id + Word-offset) uniquely identify the
+ * address to which the store is being performed i.e,
+ * address == map1 +
+ * (Thread-id * RIM_CHUNK_SIZE) + (Word-offset * WORD_SIZE)
+ *
+ * And the sweep_id in the store pattern identifies the time when the
+ * store was performed by the rim_thread.
+ *
+ * We shall use this property in the 3)COMPARE step of the
+ * rim-sequence.
+ */
+#define SWEEP_ID_SHIFT 0
+#define WORD_OFFSET_SHIFT (SWEEP_ID_BITS)
+#define THREAD_ID_SHIFT (WORD_OFFSET_BITS + SWEEP_ID_BITS)
+
+/*
+ * Compute the store pattern for a given thread with id @tid, at
+ * location @addr in the sweep identified by @sweep_id
+ */
+/*
+ * Build the store pattern for thread @tid at word @addr in sweep @sweep_id:
+ * thread id in the top bits, the word's offset within the thread's chunk in
+ * the middle, and the sweep id in the low bits.
+ */
+static inline unsigned int compute_store_pattern(unsigned int tid,
+						 unsigned int *addr,
+						 unsigned int sweep_id)
+{
+	unsigned int word_offset =
+		compute_word_offset(compute_chunk_start_addr(tid), addr);
+
+	return ((tid & THREAD_ID_MASK) << THREAD_ID_SHIFT) +
+	       ((word_offset & WORD_OFFSET_MASK) << WORD_OFFSET_SHIFT) +
+	       ((sweep_id & SWEEP_ID_MASK) << SWEEP_ID_SHIFT);
+}
+
+/* Return the thread-id field of the store-pattern @pattern. */
+static inline unsigned int extract_tid(unsigned int pattern)
+{
+	return (pattern >> THREAD_ID_SHIFT) & THREAD_ID_MASK;
+}
+
+/* Return the word-offset field of the store-pattern @pattern. */
+static inline unsigned int extract_word_offset(unsigned int pattern)
+{
+	return (pattern >> WORD_OFFSET_SHIFT) & WORD_OFFSET_MASK;
+}
+
+/* Return the sweep-id field of the store-pattern @pattern. */
+static inline unsigned int extract_sweep_id(unsigned int pattern)
+{
+	return (pattern >> SWEEP_ID_SHIFT) & SWEEP_ID_MASK;
+}
+
+/************************************************************
+ * *
+ * Logging the output of the verification *
+ * *
+ ************************************************************/
+/* Size of the buffer holding the log directory name, including the NUL. */
+#define LOGDIR_NAME_SIZE 100
+/* Directory the per-thread log files are created in; set in main(). */
+static char logdir[LOGDIR_NAME_SIZE];
+
+/* Open log file of each thread, indexed by thread id. */
+static FILE *fp[MAX_THREADS];
+/* printf format of a log file name; takes the thread id. */
+static const char logfilename[] ="Thread-%02d-Chunk";
+
+/*
+ * Create the log file of thread @tid in logdir, remember it in fp[tid] and
+ * write the header describing the thread's chunk and where its current sweep
+ * stopped. Exits through err_msg() if the file cannot be created.
+ */
+static inline void start_verification_log(unsigned int tid,
+					  unsigned int *addr,
+					  unsigned int cur_sweep_id,
+					  unsigned int prev_sweep_id)
+{
+	char *chunk_start = compute_chunk_start_addr(tid);
+	unsigned int size = RIM_CHUNK_SIZE;
+	char path[LOGDIR_NAME_SIZE + 30];
+	char logfile[30];
+	FILE *log;
+
+	/* <logdir>/<logfile> */
+	sprintf(logfile, logfilename, tid);
+	sprintf(path, "%s/%s", logdir, logfile);
+
+	log = fopen(path, "w");
+	if (!log)
+		err_msg("Unable to create logfile\n");
+
+	fp[tid] = log;
+
+	fprintf(log, "----------------------------------------------------------\n");
+	fprintf(log, "PID = %d\n", rim_process_pid);
+	fprintf(log, "Thread id = %02d\n", tid);
+	fprintf(log, "Chunk Start Addr = 0x%016lx\n", (unsigned long)chunk_start);
+	fprintf(log, "Chunk Size = %d\n", size);
+	fprintf(log, "Next Store Addr = 0x%016lx\n", (unsigned long)addr);
+	fprintf(log, "Current sweep-id = 0x%08x\n", cur_sweep_id);
+	fprintf(log, "Previous sweep-id = 0x%08x\n", prev_sweep_id);
+	fprintf(log, "----------------------------------------------------------\n");
+}
+
+/*
+ * Append one mismatch to the log of thread @tid: the address, the expected
+ * and observed words, and both words decoded into thread id, word offset and
+ * sweep id.
+ */
+static inline void log_anamoly(unsigned int tid, unsigned int *addr,
+			       unsigned int expected, unsigned int observed)
+{
+	FILE *log = fp[tid];
+	unsigned int exp_tid = extract_tid(expected);
+	unsigned int obs_tid = extract_tid(observed);
+	unsigned int exp_off = extract_word_offset(expected);
+	unsigned int obs_off = extract_word_offset(observed);
+	unsigned int exp_sweep = extract_sweep_id(expected);
+	unsigned int obs_sweep = extract_sweep_id(observed);
+
+	fprintf(log, "Thread %02d: Addr 0x%lx: Expected 0x%x, Observed 0x%x\n",
+		tid, (unsigned long)addr, expected, observed);
+	fprintf(log, "Thread %02d: Expected Thread id = %02d\n", tid, exp_tid);
+	fprintf(log, "Thread %02d: Observed Thread id = %02d\n", tid, obs_tid);
+	fprintf(log, "Thread %02d: Expected Word offset = %03d\n", tid, exp_off);
+	fprintf(log, "Thread %02d: Observed Word offset = %03d\n", tid, obs_off);
+	fprintf(log, "Thread %02d: Expected sweep-id = 0x%x\n", tid, exp_sweep);
+	fprintf(log, "Thread %02d: Observed sweep-id = 0x%x\n", tid, obs_sweep);
+	fprintf(log, "----------------------------------------------------------\n");
+}
+
+/*
+ * Close the log of thread @tid. If the verification found no anomalies the
+ * log file is deleted; otherwise a summary pointing at it is printed.
+ */
+static inline void end_verification_log(unsigned int tid, unsigned nr_anamolies)
+{
+	FILE *f = fp[tid];
+	char logfile[30];
+	char path[LOGDIR_NAME_SIZE + 30];
+	char separator[] = "/";
+
+	fclose(f);
+
+	/*
+	 * Build the path before using it: both the removal below and the
+	 * summary message need it.
+	 */
+	sprintf(logfile, logfilename, tid);
+	strcpy(path, logdir);
+	strcat(path, separator);
+	strcat(path, logfile);
+
+	if (nr_anamolies == 0) {
+		remove(path);
+		return;
+	}
+
+	printf("Thread %02d chunk has %d corrupted words. For details check %s\n",
+	       tid, nr_anamolies, path);
+}
+
+/*
+ * When a COMPARE step of a rim-sequence fails, the rim_thread informs
+ * everyone else via the shared_memory pointed to by
+ * corruption_found variable. On seeing this, every thread verifies the
+ * content of its chunk as follows.
+ *
+ * Suppose a thread identified with @tid was about to store (but had not
+ * yet stored) to @next_store_addr in its current sweep, identified by
+ * @cur_sweep_id. Let @prev_sweep_id indicate the previous sweep_id.
+ *
+ * This implies that for all the addresses @addr < @next_store_addr,
+ * Thread @tid has already performed a store as part of its current
+ * sweep. Hence we expect the content of such @addr to be:
+ * |-------------------------------------------------|
+ * | tid | word_offset(addr) | cur_sweep_id |
+ * |-------------------------------------------------|
+ *
+ * Since Thread @tid is yet to perform stores on address
+ * @next_store_addr and above, we expect the content of such an
+ * address @addr to be:
+ * |-------------------------------------------------|
+ * | tid | word_offset(addr) | prev_sweep_id |
+ * |-------------------------------------------------|
+ *
+ * The verifier function @verify_chunk does this verification and logs
+ * any anomalies that it finds.
+ */
+/*
+ * Check every word of the chunk of thread @tid against the store pattern it
+ * should hold: words below @next_store_addr must carry @cur_sweep_id, the
+ * rest @prev_sweep_id. Each mismatch is written to the thread's log.
+ */
+static void verify_chunk(unsigned int tid, unsigned int *next_store_addr,
+			 unsigned int cur_sweep_id,
+			 unsigned int prev_sweep_id)
+{
+	char *chunk_start = compute_chunk_start_addr(tid);
+	unsigned int *chunk_end = (unsigned int *)(chunk_start + RIM_CHUNK_SIZE);
+	unsigned int *word;
+	int nr_anamolies = 0;
+
+	start_verification_log(tid, next_store_addr,
+			       cur_sweep_id, prev_sweep_id);
+
+	for (word = (unsigned int *)chunk_start; word < chunk_end; word++) {
+		unsigned int sweep = (word < next_store_addr) ?
+				     cur_sweep_id : prev_sweep_id;
+		unsigned int expected = compute_store_pattern(tid, word, sweep);
+		unsigned int observed;
+
+		dcbf((volatile unsigned int*)word); //Flush before reading
+		observed = *word;
+
+		if (observed == expected)
+			continue;
+
+		nr_anamolies++;
+		log_anamoly(tid, word, expected, observed);
+	}
+
+	end_verification_log(tid, nr_anamolies);
+}
+
+/*
+ * Restrict thread @th to run on @cpu only. The result of
+ * pthread_setaffinity_np() is not checked.
+ *
+ * The SCHED_FIFO request below is compiled out by the leading "0 &&".
+ * NOTE(review): if it were re-enabled, sched_setscheduler(0, ...) would act
+ * on the caller rather than on @th; confirm that is intended.
+ */
+static void set_pthread_cpu(pthread_t th, int cpu)
+{
+ cpu_set_t run_cpu_mask;
+ struct sched_param param;
+
+ CPU_ZERO(&run_cpu_mask);
+ CPU_SET(cpu, &run_cpu_mask);
+ pthread_setaffinity_np(th, sizeof(cpu_set_t), &run_cpu_mask);
+
+ param.sched_priority = 1;
+ if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+ /* haven't reproduced with this setting, it kills random preemption which may be a factor */
+ fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+ }
+}
+
+/*
+ * Restrict the calling task to run on @cpu only. The result of
+ * sched_setaffinity() is not checked.
+ *
+ * The SCHED_FIFO request below is compiled out by the leading "0 &&".
+ */
+static void set_mycpu(int cpu)
+{
+ cpu_set_t run_cpu_mask;
+ struct sched_param param;
+
+ CPU_ZERO(&run_cpu_mask);
+ CPU_SET(cpu, &run_cpu_mask);
+ sched_setaffinity(0, sizeof(cpu_set_t), &run_cpu_mask);
+
+ param.sched_priority = 1;
+ if (0 && sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+ fprintf(stderr, "could not set SCHED_FIFO, run as root?\n");
+ }
+}
+
+/* Non-zero while the snapshot thread has map1 write-protected. */
+static volatile int segv_wait;
+
+/*
+ * SIGSEGV handler for the rim_threads: yield the CPU for as long as
+ * segv_wait is set, then return. The saved context is not modified, so the
+ * faulting access is presumably retried on return.
+ */
+static void segv_handler(int signo, siginfo_t *info, void *extra)
+{
+ while (segv_wait) {
+ sched_yield();
+ }
+
+}
+
+/*
+ * Install segv_handler() as the process-wide SIGSEGV handler.
+ * Exits with EXIT_FAILURE if sigaction() fails.
+ */
+static void set_segv_handler(void)
+{
+	/* Zero every field we do not set explicitly. */
+	struct sigaction sa = {
+		.sa_flags = SA_SIGINFO,
+		.sa_sigaction = segv_handler,
+	};
+
+	/* Do not block any extra signals while the handler runs. */
+	sigemptyset(&sa.sa_mask);
+
+	if (sigaction(SIGSEGV, &sa, NULL) == -1) {
+		perror("sigaction");
+		exit(EXIT_FAILURE);
+	}
+}
+
+/* Set by the SIGALRM handler and polled by the worker loops, hence volatile. */
+volatile int timeout = 0;
+/*
+ * This function is executed by every rim_thread.
+ *
+ * This function performs sweeps over the exclusive chunks of the
+ * rim_threads executing the rim-sequence one word at a time.
+ *
+ * @arg points to the thread's id (an unsigned int). The thread first
+ * initializes its chunk with sweep-id 0, then sweeps until a corruption is
+ * flagged or the timeout fires; in either case it verifies its own chunk
+ * before returning. The return value is always a null pointer.
+ */
+static void *rim_fn(void *arg)
+{
+ unsigned int tid = *((unsigned int *)arg);
+
+ int size = RIM_CHUNK_SIZE;
+ char *chunk_start = compute_chunk_start_addr(tid);
+
+ unsigned int prev_sweep_id;
+ unsigned int cur_sweep_id = 0;
+
+ /* word access */
+ unsigned int pattern = cur_sweep_id;
+ unsigned int *pattern_ptr = &pattern;
+ unsigned int *w_ptr, read_data;
+
+ set_segv_handler();
+
+ /*
+ * Let us initialize the chunk:
+ *
+ * Each word-aligned address addr in the chunk,
+ * is initialized to :
+ * |-------------------------------------------------|
+ * | tid | word_offset(addr) | 0 |
+ * |-------------------------------------------------|
+ */
+ for (w_ptr = (unsigned int *)chunk_start;
+ (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+ w_ptr++) {
+
+ *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+ *w_ptr = *pattern_ptr;
+ }
+
+ while (!corruption_found && !timeout) {
+ prev_sweep_id = cur_sweep_id;
+ cur_sweep_id = cur_sweep_id + 1;
+
+ for (w_ptr = (unsigned int *)chunk_start;
+ (unsigned long)w_ptr < (unsigned long)(chunk_start) + size;
+ w_ptr++) {
+ unsigned int old_pattern;
+
+ /*
+ * Compute the pattern that we would have
+ * stored at this location in the previous
+ * sweep.
+ */
+ old_pattern = compute_store_pattern(tid, w_ptr, prev_sweep_id);
+
+ /*
+ * FLUSH:Ensure that we flush the contents of
+ * the cache before loading
+ */
+ dcbf((volatile unsigned int*)w_ptr); //Flush
+
+ /* LOAD: Read the value */
+ read_data = *w_ptr; //Load
+
+ /*
+ * COMPARE: Is it the same as what we had stored
+ * in the previous sweep ? It better be!
+ */
+ if (read_data != old_pattern) {
+ /* No it isn't! Tell everyone */
+ corruption_found = 1;
+ }
+
+ /*
+ * Before performing a store, let us check if
+ * any rim_thread has found a corruption.
+ */
+ if (corruption_found || timeout) {
+ /*
+ * Yes. Someone (including us!) has found
+ * a corruption :(
+ *
+ * Let us verify that our chunk is
+ * correct.
+ */
+ /*
+ * NOTE(review): an earlier comment here spoke of letting
+ * the dust settle first, but no wait is performed before
+ * the verification. This branch is also taken on timeout,
+ * not only on corruption.
+ */
+ verify_chunk(tid, w_ptr, cur_sweep_id, prev_sweep_id);
+
+ return 0;
+ }
+
+ /*
+ * Compute the new pattern that we are going
+ * to write to this location
+ */
+ *pattern_ptr = compute_store_pattern(tid, w_ptr, cur_sweep_id);
+
+ /*
+ * STORE: Now let us write this pattern into
+ * the location
+ */
+ *w_ptr = *pattern_ptr;
+ }
+ }
+
+ return NULL;
+}
+
+
+/* First CPU to pin to; thread i is pinned to start_cpu + i (-r option). */
+static unsigned long start_cpu = 0;
+/* Number of rim_threads to start (-n option). */
+static unsigned long nrthreads = 4;
+
+/* The thread running mem_snapshot_fn(). */
+static pthread_t mem_snapshot_thread;
+
+/*
+ * Body of the snapshot thread. Until a corruption is flagged or the timeout
+ * fires, repeatedly: write-protect the first page of map1, copy it into a
+ * private buffer, copy the buffer back through the map2 alias, and make map1
+ * writable again. rim_threads that fault meanwhile wait in segv_handler()
+ * while segv_wait is set.
+ *
+ * @arg is unused. Always returns a null pointer; exits through err_msg() if
+ * the snapshot buffer cannot be allocated.
+ */
+static void *mem_snapshot_fn(void *arg)
+{
+	int page_size = getpagesize();
+	size_t size = page_size;
+	void *tmp = malloc(size);
+
+	if (!tmp)
+		err_msg("Failed malloc");
+
+	while (!corruption_found && !timeout) {
+		/* Stop memory migration once corruption is found */
+		segv_wait = 1;
+
+		mprotect(map1, size, PROT_READ);
+
+		/*
+		 * Load from the working alias (map1). Loading from map2
+		 * also fails.
+		 */
+		memcpy(tmp, map1, size);
+
+		/*
+		 * Stores must go via map2 which has write permissions, but
+		 * the corrupted data tends to be seen in the snapshot buffer,
+		 * so corruption does not appear to be introduced at the
+		 * copy-back via map2 alias here.
+		 */
+		memcpy(map2, tmp, size);
+		/*
+		 * Before releasing other threads, must ensure the copy
+		 * back via map2 has completed.
+		 */
+		asm volatile("sync" ::: "memory");
+		mprotect(map1, size, PROT_READ|PROT_WRITE);
+		asm volatile("sync" ::: "memory");
+		segv_wait = 0;
+
+		usleep(1); /* This value makes a big difference */
+	}
+
+	free(tmp);
+
+	return 0;
+}
+
+/* SIGALRM handler: flag that the configured run time has elapsed. */
+void alrm_sighandler(int sig)
+{
+ timeout = 1;
+}
+
+/*
+ * Parse the options, attach one page of SysV shared memory twice (map1 and
+ * map2), start one sched_yield() spinner process and one rim_thread per
+ * requested CPU plus the snapshot thread, and wait for them to finish.
+ *
+ * Options: -r <start_cpu>, -n <nrthreads>, -l <logdir>, -t <timeout in
+ * seconds>, -h for usage.
+ *
+ * Returns 0 if the run ended by timeout and 1 if a corruption was detected.
+ * Setup failures exit through err_msg().
+ */
+int main(int argc, char *argv[])
+{
+	int c;
+	int page_size = getpagesize();
+	unsigned long max_chunks = page_size / RIM_CHUNK_SIZE;
+	time_t now;
+	int i, dir_error;
+	pthread_attr_t attr;
+	key_t shm_key = (key_t) getpid();
+	int shmid, run_time = 20 * 60;
+	struct sigaction sa_alrm;
+
+	/* The verification logs print this as "PID". */
+	rim_process_pid = getpid();
+
+	snprintf(logdir, LOGDIR_NAME_SIZE,
+		 "/tmp/logdir-%u", (unsigned int)getpid());
+	while ((c = getopt(argc, argv, "r:hn:l:t:")) != -1) {
+		switch(c) {
+		case 'r':
+			start_cpu = strtoul(optarg, NULL, 10);
+			break;
+		case 'h':
+			printf("%s [-r <start_cpu>] [-n <nrthreads>] [-l <logdir>] [-t <timeout>]\n", argv[0]);
+			exit(0);
+			break;
+		case 'n':
+			nrthreads = strtoul(optarg, NULL, 10);
+			break;
+		case 'l':
+			strncpy(logdir, optarg, LOGDIR_NAME_SIZE - 1);
+			break;
+		case 't':
+			run_time = strtoul(optarg, NULL, 10);
+			break;
+		default:
+			printf("invalid option\n");
+			exit(0);
+			break;
+		}
+	}
+
+	if (nrthreads > MAX_THREADS)
+		nrthreads = MAX_THREADS;
+
+	/*
+	 * Each rim_thread owns one RIM_CHUNK_SIZE chunk of the single
+	 * page_size segment, so never start more threads than there are
+	 * chunks in it.
+	 */
+	if (nrthreads > max_chunks)
+		nrthreads = max_chunks;
+
+	shmid = shmget(shm_key, page_size, IPC_CREAT|0666);
+	if (shmid < 0) {
+		err_msg("Failed shmget\n");
+	}
+
+	map1 = shmat(shmid, NULL, 0);
+	if (map1 != (void *) -1)
+		map2 = shmat(shmid, NULL, 0);
+
+	/*
+	 * Mark the segment for removal now that the attachments exist: it
+	 * stays usable until the last detach and no longer outlives the
+	 * test, whichever way the test ends.
+	 */
+	shmctl(shmid, IPC_RMID, NULL);
+
+	if (map1 == (void *) -1 || map2 == (void *) -1) {
+		err_msg("Failed shmat");
+	}
+
+	dir_error = mkdir(logdir, 0755);
+
+	if (dir_error) {
+		err_msg("Failed mkdir");
+	}
+
+	printf("start_cpu list:%lu\n", start_cpu);
+	printf("number of worker threads:%lu + 1 snapshot thread\n", nrthreads);
+	printf("Allocated address:0x%016lx + secondary map:0x%016lx\n", (unsigned long)map1, (unsigned long)map2);
+	printf("logdir at : %s\n", logdir);
+	printf("Timeout: %d seconds\n", run_time);
+
+	time(&now);
+	printf("=================================\n");
+	printf(" Starting Test\n");
+	printf(" %s", ctime(&now));
+	printf("=================================\n");
+
+	/*
+	 * One spinner process per worker CPU, doing nothing but
+	 * sched_yield(); each is killed when this process dies.
+	 */
+	for (i = 0; i < nrthreads; i++) {
+		if (1 && !fork()) {
+			prctl(PR_SET_PDEATHSIG, SIGKILL);
+			set_mycpu(start_cpu + i);
+			for (;;)
+				sched_yield();
+			exit(0);
+		}
+	}
+
+
+	sa_alrm.sa_handler = &alrm_sighandler;
+	sigemptyset(&sa_alrm.sa_mask);
+	sa_alrm.sa_flags = 0;
+
+	if (sigaction(SIGALRM, &sa_alrm, 0) == -1) {
+		err_msg("Failed signal handler registration\n");
+	}
+
+	alarm(run_time);
+
+	pthread_attr_init(&attr);
+	for (i = 0; i < nrthreads; i++) {
+		rim_thread_ids[i] = i;
+		/* Joining a thread that was never created is undefined. */
+		if (pthread_create(&rim_threads[i], &attr, rim_fn, &rim_thread_ids[i]))
+			err_msg("Failed pthread_create");
+		set_pthread_cpu(rim_threads[i], start_cpu + i);
+	}
+
+	if (pthread_create(&mem_snapshot_thread, &attr, mem_snapshot_fn, map1))
+		err_msg("Failed pthread_create");
+	set_pthread_cpu(mem_snapshot_thread, start_cpu + i);
+
+
+	pthread_join(mem_snapshot_thread, NULL);
+	for (i = 0; i < nrthreads; i++) {
+		pthread_join(rim_threads[i], NULL);
+	}
+
+	/* The workers only stop early when a corruption was flagged. */
+	if (!timeout) {
+		time(&now);
+		printf("=================================\n");
+		printf(" Data Corruption Detected\n");
+		printf(" %s", ctime(&now));
+		printf(" See logfiles in %s\n", logdir);
+		printf("=================================\n");
+		return 1;
+	}
+	return 0;
+}
diff --git a/tools/testing/selftests/powerpc/mm/wild_bctr.c b/tools/testing/selftests/powerpc/mm/wild_bctr.c
new file mode 100644
index 000000000..f2fa101c5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/wild_bctr.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2018, Michael Ellerman, IBM Corp.
+ *
+ * Test that an out-of-bounds branch to counter behaves as expected.
+ */
+
+#include <setjmp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <ucontext.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+
+#define BAD_NIP 0x788c545a18000000ull
+
+static struct pt_regs signal_regs;
+static jmp_buf setjmp_env;
+
+/* Snapshot the register block referenced by a signal context into signal_regs. */
+static void save_regs(ucontext_t *ctxt)
+{
+	/* Copy directly from the pt_regs the ucontext points at. */
+	memcpy(&signal_regs, ctxt->uc_mcontext.regs, sizeof(signal_regs));
+}
+
+/*
+ * SIGSEGV handler: record the registers of the faulting context, then
+ * unwind to the setjmp() point with a non-zero value.
+ */
+static void segv_handler(int signum, siginfo_t *info, void *ctxt_v)
+{
+	ucontext_t *uc = ctxt_v;
+
+	save_regs(uc);
+	longjmp(setjmp_env, 1);
+}
+
+/* SIGUSR2 handler: record the registers of the interrupted context and return. */
+static void usr2_handler(int signum, siginfo_t *info, void *ctxt_v)
+{
+	ucontext_t *uc = ctxt_v;
+
+	save_regs(uc);
+}
+
+/* Valid branch target for the sanity call: prints a message, returns 0. */
+static int ok(void)
+{
+	fputs("Everything is OK in here.\n", stdout);
+	return 0;
+}
+
+/*
+ * Poison pattern for register n: REG_POISON in the upper half of each 32-bit
+ * word and the register number in the lower half, e.g. r15 becomes
+ * 0x5a5a000f5a5a000f.
+ *
+ * (n) is widened to unsigned long before shifting so that passing an int
+ * does not shift by more than the width of its type.
+ */
+#define REG_POISON	0x5a5a
+#define POISONED_REG(n)	((((unsigned long)REG_POISON) << 48) | \
+			 (((unsigned long)(n)) << 32) | \
+			 (((unsigned long)REG_POISON) << 16) | \
+			 ((unsigned long)(n)))
+
+/*
+ * Load each of r15..r29 with the value POISONED_REG(n) describes.
+ *
+ * Per register the sequence is: lis puts REG_POISON in the upper 16 bits of
+ * the low word, addi adds the register number (n is used both as the
+ * register operand and as the immediate), sldi moves that into the high
+ * word, then oris/addi build the same pattern again in the low word.
+ */
+static inline void poison_regs(void)
+{
+ #define POISON_REG(n) \
+ "lis " __stringify(n) "," __stringify(REG_POISON) ";" \
+ "addi " __stringify(n) "," __stringify(n) "," __stringify(n) ";" \
+ "sldi " __stringify(n) "," __stringify(n) ", 32 ;" \
+ "oris " __stringify(n) "," __stringify(n) "," __stringify(REG_POISON) ";" \
+ "addi " __stringify(n) "," __stringify(n) "," __stringify(n) ";"
+
+ asm (POISON_REG(15)
+ POISON_REG(16)
+ POISON_REG(17)
+ POISON_REG(18)
+ POISON_REG(19)
+ POISON_REG(20)
+ POISON_REG(21)
+ POISON_REG(22)
+ POISON_REG(23)
+ POISON_REG(24)
+ POISON_REG(25)
+ POISON_REG(26)
+ POISON_REG(27)
+ POISON_REG(28)
+ POISON_REG(29)
+ : // outputs (none; extended asm lists outputs first)
+ : // inputs (none)
+ /* clobbers: every register written by the template above */
+ : "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25",
+ "26", "27", "28", "29"
+ );
+ #undef POISON_REG
+}
+
+/*
+ * Verify that r15..r29 as captured in signal_regs still hold their poison
+ * values. Returns 0 when they all match; FAIL_IF() handles a mismatch.
+ */
+static int check_regs(void)
+{
+	unsigned long reg = 15;
+
+	while (reg <= 29) {
+		FAIL_IF(signal_regs.gpr[reg] != POISONED_REG(reg));
+		reg++;
+	}
+
+	printf("Regs OK\n");
+	return 0;
+}
+
+/* Print the 32 GPRs captured in signal_regs, four registers per line. */
+static void dump_regs(void)
+{
+	int base = 0;
+
+	while (base < 32) {
+		printf("r%02d 0x%016lx r%02d 0x%016lx " \
+			"r%02d 0x%016lx r%02d 0x%016lx\n",
+			base, signal_regs.gpr[base],
+			base+1, signal_regs.gpr[base+1],
+			base+2, signal_regs.gpr[base+2],
+			base+3, signal_regs.gpr[base+3]);
+		base += 4;
+	}
+}
+
+/*
+ * BAD_FUNC is the value test_wild_bctr() casts to a function pointer.
+ *
+ * When _CALL_AIXDESC is defined it is the address of a descriptor whose ip
+ * field holds BAD_NIP (toc and env are left zero); otherwise it is BAD_NIP
+ * itself.
+ * NOTE(review): presumably _CALL_AIXDESC selects the function-descriptor
+ * (ELFv1) calling convention - confirm against the ABI documentation.
+ */
+#ifdef _CALL_AIXDESC
+struct opd {
+ unsigned long ip;
+ unsigned long toc;
+ unsigned long env;
+};
+static struct opd bad_opd = {
+ .ip = BAD_NIP,
+};
+#define BAD_FUNC (&bad_opd)
+#else
+#define BAD_FUNC BAD_NIP
+#endif
+
+/*
+ * Branch through a function pointer to BAD_NIP and check the outcome.
+ *
+ * The test passes (returns 0) when the call raises SIGSEGV, the NIP captured
+ * by the handler equals BAD_NIP, and r15..r29 still hold the values written
+ * by poison_regs(). Any FAIL_IF() that triggers ends the test early.
+ */
+int test_wild_bctr(void)
+{
+ int (*func_ptr)(void);
+ struct sigaction segv = {
+ .sa_sigaction = segv_handler,
+ .sa_flags = SA_SIGINFO
+ };
+ struct sigaction usr2 = {
+ .sa_sigaction = usr2_handler,
+ .sa_flags = SA_SIGINFO
+ };
+
+ FAIL_IF(sigaction(SIGSEGV, &segv, NULL));
+ FAIL_IF(sigaction(SIGUSR2, &usr2, NULL));
+
+ bzero(&signal_regs, sizeof(signal_regs));
+
+ /* setjmp() returns 0 here; segv_handler() re-enters with 1 via longjmp() */
+ if (setjmp(setjmp_env) == 0) {
+ /* Sanity check: an indirect call to a valid function works */
+ func_ptr = ok;
+ func_ptr();
+
+ /* Signal ourselves so usr2_handler() captures the registers for the dump */
+ kill(getpid(), SIGUSR2);
+ printf("Regs before:\n");
+ dump_regs();
+ /* Discard that snapshot so only the SEGV handler's copy is checked later */
+ bzero(&signal_regs, sizeof(signal_regs));
+
+ poison_regs();
+
+ /* The wild branch itself: expected to fault and not return */
+ func_ptr = (int (*)(void))BAD_FUNC;
+ func_ptr();
+
+ FAIL_IF(1); /* we didn't segv? */
+ }
+
+ /* Reached via longjmp() from segv_handler(); signal_regs holds the fault context */
+ FAIL_IF(signal_regs.nip != BAD_NIP);
+
+ printf("All good - took SEGV as expected branching to 0x%llx\n", BAD_NIP);
+
+ dump_regs();
+ FAIL_IF(check_regs());
+
+ return 0;
+}
+
+/* Run test_wild_bctr under the selftest harness and return its result. */
+int main(void)
+{
+	int rc = test_harness(test_wild_bctr, "wild_bctr");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
new file mode 100644
index 000000000..5a7118495
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/99-nx-gzip.rules
@@ -0,0 +1 @@
+SUBSYSTEM=="nxgzip", KERNEL=="nx-gzip", MODE="0666"
diff --git a/tools/testing/selftests/powerpc/nx-gzip/Makefile b/tools/testing/selftests/powerpc/nx-gzip/Makefile
new file mode 100644
index 000000000..640fad6cc
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/Makefile
@@ -0,0 +1,8 @@
+# Optimized 64-bit build; project headers are looked up in ./include.
+CFLAGS = -O3 -m64 -I./include
+
+# Binaries to build, and the script listed as the test program.
+TEST_GEN_FILES := gzfht_test gunz_test
+TEST_PROGS := nx-gzip-test.sh
+
+include ../../lib.mk
+
+# Each generated binary additionally depends on the shared gzip_vas.c source.
+$(TEST_GEN_FILES): gzip_vas.c
diff --git a/tools/testing/selftests/powerpc/nx-gzip/README b/tools/testing/selftests/powerpc/nx-gzip/README
new file mode 100644
index 000000000..9809dbaa1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/README
@@ -0,0 +1,45 @@
+Test the nx-gzip function:
+=========================
+
+Verify that the following device exists:
+ /dev/crypto/nx-gzip
+If you get a permission error run as sudo or set the device permissions:
+ sudo chmod go+rw /dev/crypto/nx-gzip
+However, the chmod does not survive a reboot. To make the permissions
+persistent, install a udev rule (see 99-nx-gzip.rules in this directory) as:
+ /etc/udev/rules.d/99-nx-gzip.rules
+
+
+To manually build and run:
+$ gcc -O3 -I./include -o gzfht_test gzfht_test.c gzip_vas.c
+$ gcc -O3 -I./include -o gunz_test gunz_test.c gzip_vas.c
+
+
+Compress any file using Fixed Huffman mode. Output will have a .nx.gz suffix:
+$ ./gzfht_test gzip_vas.c
+file gzip_vas.c read, 6413 bytes
+compressed 6413 to 3124 bytes total, crc32 checksum = abd15e8a
+
+
+Uncompress the previous output. Output will have a .nx.gunzip suffix:
+$ ./gunz_test gzip_vas.c.nx.gz
+gzHeader FLG 0
+00 00 00 00 04 03
+gzHeader MTIME, XFL, OS ignored
+computed checksum abd15e8a isize 0000190d
+stored checksum abd15e8a isize 0000190d
+decomp is complete: fclose
+
+
+Compare two files:
+$ sha1sum gzip_vas.c.nx.gz.nx.gunzip gzip_vas.c
+bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c.nx.gz.nx.gunzip
+bf43e3c0c3651f5f22b6f9784cd9b1eeab4120b6 gzip_vas.c
+
+
+Note that the code here is intended for testing the nx-gzip hardware function.
+It is not intended for demonstrating performance or compression ratio.
+To stay simple, these selftests allocate the entire set of source and target
+pages in memory, so the system needs enough free memory for them to work.
+For more information and source code consider using:
+https://github.com/libnxz/power-gzip
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
new file mode 100644
index 000000000..7c23d3dd7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gunz_test.c
@@ -0,0 +1,1028 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* P9 gunzip sample code for demonstrating the P9 NX hardware
+ * interface. Not intended for production use or for performance or
+ * compression ratio measurements. Note also that /dev/crypto/nx-gzip,
+ * VAS and skiboot support are required.
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ * https://github.com/libnxz/power-gzip for zlib api and other utils
+ * Definitions of acronyms used here. See
+ * P9 NX Gzip Accelerator User's Manual for details:
+ * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce: completion extension
+ * cpb: coprocessor parameter block (metadata)
+ * crb: coprocessor request block (command)
+ * csb: coprocessor status block (status)
+ * dht: dynamic huffman table
+ * dde: data descriptor element (address, length)
+ * ddl: list of ddes
+ * dh/fh: dynamic and fixed huffman types
+ * fc: coprocessor function code
+ * histlen: history/dictionary length
+ * history: sliding window of up to 32KB of data
+ * lzcount: Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt: source final block type; last block's type during decomp
+ * spbc: source processed byte count
+ * subc: source unprocessed bit count
+ * tebc: target ending bit count; valid bits in the last byte
+ * tpbc: target processed byte count
+ * vas: virtual accelerator switch; the user mode interface
+ */
+
+#define _ISOC11_SOURCE // For aligned_alloc()
+#define _DEFAULT_SOURCE // For endian.h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+#include "crb.h"
+
+int nx_dbg;
+FILE *nx_gzip_log;
+
+/* Minimum / maximum of two values; note that arguments are evaluated twice */
+#define NX_MIN(X, Y) (((X) < (Y))?(X):(Y))
+#define NX_MAX(X, Y) (((X) > (Y))?(X):(Y))
+
+/* Read one character from stream X */
+#define GETINPC(X) fgetc(X)
+/* Size of the buffer that receives the gzip header FNAME field */
+#define FNAME_MAX 1024
+
+/*
+ * fifo queue management
+ *
+ * A fifo is a circular buffer of len bytes holding used bytes of data that
+ * start at offset cur. While cur + used <= len the data is one contiguous
+ * run; otherwise it wraps around to offset 0. "first" is the part that
+ * begins at the higher offset, "last" is the part at the start of the
+ * buffer.
+ */
+#define fifo_used_bytes(used) (used)
+#define fifo_free_bytes(used, len) ((len)-(used))
+/* amount of free bytes in the first and last parts */
+#define fifo_free_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \
+ ? (len)-((cur)+(used)) : 0)
+#define fifo_free_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \
+ ? (cur) : (len)-(used))
+/* amount of used bytes in the first and last parts */
+#define fifo_used_first_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \
+ ? (used) : (len)-(cur))
+#define fifo_used_last_bytes(cur, used, len) ((((cur)+(used)) <= (len)) \
+ ? 0 : ((used)+(cur))-(len))
+/* first and last free parts start here */
+#define fifo_free_first_offset(cur, used) ((cur)+(used))
+/* the last free part begins right after the wrapped-around used bytes */
+#define fifo_free_last_offset(cur, used, len) \
+ fifo_used_last_bytes(cur, used, len)
+/* first and last used parts start here */
+#define fifo_used_first_offset(cur) (cur)
+#define fifo_used_last_offset(cur) (0)
+
+/* Sizes in bytes: 16 MiB fifos, 64 KiB page, 128 byte line, 32 KiB window */
+const int fifo_in_len = 1<<24;
+const int fifo_out_len = 1<<24;
+const int page_sz = 1<<16;
+const int line_sz = 1<<7;
+const int window_max = 1<<15;
+
+/*
+ * Adds an (address, len) pair to the list of ddes (ddl) and updates
+ * the base dde. ddl[0] is the only dde in a direct dde which
+ * contains a single (addr,len) pair. For more pairs, ddl[0] becomes
+ * the indirect (base) dde that points to a list of direct ddes.
+ * See Section 6.4 of the NX-gzip user manual for DDE description.
+ * Addr=NULL, len=0 clears the ddl[0]. Returns the total number of
+ * bytes in ddl. Caller is responsible for allocating the array of
+ * nx_dde_t *ddl. If N addresses are required in the scatter-gather
+ * list, the ddl array must have N+1 entries minimum.
+ */
+static inline uint32_t nx_append_dde(struct nx_dde_t *ddl, void *addr,
+					uint32_t len)
+{
+	uint32_t ddecnt;
+	uint32_t bytes;
+
+	if (addr == NULL && len == 0) {
+		clearp_dde(ddl);
+		return 0;
+	}
+
+	/* Argument order must follow the format: line, function, addr, len */
+	NXPRT(fprintf(stderr, "%d: %s addr %p len %x\n", __LINE__, __func__,
+			addr, len));
+
+	/* Number of ddes in the dde list ; == 0 when it is a direct dde */
+	ddecnt = getpnn(ddl, dde_count);
+	bytes = getp32(ddl, ddebc);
+
+	if (ddecnt == 0 && bytes == 0) {
+		/* First dde is unused; make it a direct dde */
+		bytes = len;
+		putp32(ddl, ddebc, bytes);
+		putp64(ddl, ddead, (uint64_t) addr);
+	} else if (ddecnt == 0) {
+		/* Converting direct to indirect dde
+		 * ddl[0] becomes head dde of ddl
+		 * copy direct to indirect first.
+		 */
+		ddl[1] = ddl[0];
+
+		/* Add the new dde next */
+		clear_dde(ddl[2]);
+		put32(ddl[2], ddebc, len);
+		put64(ddl[2], ddead, (uint64_t) addr);
+
+		/* Ddl head points to 2 direct ddes */
+		ddecnt = 2;
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes);
+		/* Pointer to the first direct dde */
+		putp64(ddl, ddead, (uint64_t) &ddl[1]);
+	} else {
+		/* Append a dde to an existing indirect ddl */
+		++ddecnt;
+		clear_dde(ddl[ddecnt]);
+		put64(ddl[ddecnt], ddead, (uint64_t) addr);
+		put32(ddl[ddecnt], ddebc, len);
+
+		putpnn(ddl, dde_count, ddecnt);
+		bytes = bytes + len;
+		putp32(ddl, ddebc, bytes); /* byte sum of all dde */
+	}
+	return bytes;
+}
+
+/*
+ * Touch specified number of pages represented in number bytes
+ * beginning from the first buffer in a dde list.
+ * Do not touch the pages past buf_sz-th byte's page.
+ *
+ * Set buf_sz = 0 to touch all pages described by the ddep.
+ *
+ * ddep:    direct dde, or the head of an indirect dde list
+ * buf_sz:  number of bytes to touch, counted from the first buffer
+ * page_sz: page size handed on to nxu_touch_pages()
+ * wr:      handed on to nxu_touch_pages() unchanged
+ *
+ * Returns ERR_NX_OK, or ERR_NX_EXCESSIVE_DDE when the indirect list holds
+ * more than MAX_DDE_COUNT entries.
+ */
+static int nx_touch_pages_dde(struct nx_dde_t *ddep, long buf_sz, long page_sz,
+				int wr)
+{
+	uint32_t indirect_count;
+	uint32_t buf_len;
+	long total;
+	long remaining;
+	uint64_t buf_addr;
+	struct nx_dde_t *dde_list;
+	int i;
+
+	assert(!!ddep);
+
+	indirect_count = getpnn(ddep, dde_count);
+
+	NXPRT(fprintf(stderr, "%s dde_count %d request len ", __func__,
+			indirect_count));
+	NXPRT(fprintf(stderr, "0x%lx\n", buf_sz));
+
+	if (indirect_count == 0) {
+		/* Direct dde */
+		buf_len = getp32(ddep, ddebc);
+		buf_addr = getp64(ddep, ddead);
+
+		NXPRT(fprintf(stderr, "touch direct ddebc 0x%x ddead %p\n",
+				buf_len, (void *)buf_addr));
+
+		if (buf_sz == 0)
+			nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+		else
+			nxu_touch_pages((void *)buf_addr, NX_MIN(buf_len,
+					buf_sz), page_sz, wr);
+
+		return ERR_NX_OK;
+	}
+
+	/* Indirect dde */
+	if (indirect_count > MAX_DDE_COUNT)
+		return ERR_NX_EXCESSIVE_DDE;
+
+	/* First address of the list */
+	dde_list = (struct nx_dde_t *) getp64(ddep, ddead);
+
+	if (buf_sz == 0)
+		buf_sz = getp32(ddep, ddebc);
+
+	total = 0;
+	for (i = 0; i < indirect_count; i++) {
+		buf_len = get32(dde_list[i], ddebc);
+		buf_addr = get64(dde_list[i], ddead);
+		total += buf_len;
+
+		NXPRT(fprintf(stderr, "touch loop len 0x%x ddead %p total ",
+				buf_len, (void *)buf_addr));
+		NXPRT(fprintf(stderr, "0x%lx\n", total));
+
+		/* Touching fewer pages than encoded in the ddebc */
+		if (total > buf_sz) {
+			/* Only the bytes of this buffer that lie before the
+			 * buf_sz limit are touched: the buffer length minus
+			 * the overshoot (total - buf_sz).
+			 */
+			remaining = buf_sz - (total - (long)buf_len);
+			buf_len = (remaining > 0) ? remaining : 0;
+			if (buf_len > 0)
+				nxu_touch_pages((void *)buf_addr, buf_len,
+						page_sz, wr);
+			NXPRT(fprintf(stderr, "touch loop break len 0x%x ",
+					buf_len));
+			NXPRT(fprintf(stderr, "ddead %p\n", (void *)buf_addr));
+			break;
+		}
+		nxu_touch_pages((void *)buf_addr, buf_len, page_sz, wr);
+	}
+	return ERR_NX_OK;
+}
+
+/*
+ * Fill in the request block from the given scatter gather lists and hand
+ * the job to the accelerator.
+ *
+ * src, dst: source and target dde lists, copied into the crb
+ * cmdp:     command block; the function code and the other parameters are
+ *           expected to be set by the caller
+ * handle:   passed through to nxu_submit_job()
+ *
+ * Returns whatever nxu_submit_job() returns.
+ */
+static int nx_submit_job(struct nx_dde_t *src, struct nx_dde_t *dst,
+			 struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	uint64_t csb_ptr;
+
+	/* Data descriptors for this job */
+	cmdp->crb.target_dde = *dst;
+	cmdp->crb.source_dde = *src;
+
+	/* Status, output byte count in tpbc: start from a zeroed csb */
+	memset((void *)&cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+	csb_ptr = ((uint64_t) &cmdp->crb.csb) & csb_address_mask;
+	put64(cmdp->crb, csb_address, csb_ptr);
+
+	/* Clear output */
+	put32(cmdp->cpb, out_adler, INIT_ADLER);
+	put32(cmdp->cpb, out_crc, INIT_CRC);
+
+	/* NX reports input bytes in spbc; cleared */
+	cmdp->cpb.out_spbc_decomp = 0;
+	cmdp->cpb.out_spbc_comp_with_count = 0;
+	cmdp->cpb.out_spbc_comp_wrap = 0;
+
+	/* Submit the crb, the job descriptor, to the accelerator. */
+	return nxu_submit_job(cmdp, handle);
+}
+
+/*
+ * Decompress a gzip stream using the NX accelerator.
+ *
+ * argc/argv: with no file argument, reads stdin and writes stdout; with one
+ *            file argument, reads that file and writes <basename>.nx.gunzip
+ *            in the current directory.
+ * devhandle: accelerator handle passed through to the job submission.
+ *
+ * The function is a goto-driven state machine:
+ *   read_state    - refill fifo_in from the input file
+ *   write_state   - drain fifo_out to the output file
+ *   decomp_state  - build the source/target dde lists and size the job
+ *   restart_nx    - touch pages, submit the job, act on the condition code
+ *   ok_cc3        - set up the resume parameters from sfbt/subc
+ *   offsets_state - advance the fifo positions and the ratio estimate
+ *   finish_state  - loop again, or compare the trailing crc32/isize
+ *
+ * Returns 0 on success and -1 on any error. Files opened here and the fifo
+ * buffers are released on every path that leaves through the labels at the
+ * end of the function.
+ */
+int decompress_file(int argc, char **argv, void *devhandle)
+{
+	FILE *inpf = NULL;
+	FILE *outf = NULL;
+
+	int c, expect, i, cc, rc = 0;
+	char gzfname[FNAME_MAX];
+
+	/* Queuing, file ops, byte counting */
+	char *fifo_in = NULL, *fifo_out = NULL;
+	char *fifo_out_base = NULL; /* as returned by the allocator */
+	int used_in, cur_in, used_out, cur_out, read_sz, n;
+	int first_free, last_free, first_used, last_used;
+	int first_offset, last_offset;
+	int write_sz, free_space, source_sz;
+	int source_sz_estimate, target_sz_estimate;
+	uint64_t last_comp_ratio = 0; /* 1000 max */
+	uint64_t total_out = 0;
+	int is_final, is_eof;
+
+	/* nx hardware */
+	int sfbt, subc, spbc, tpbc, nx_ce, fc, resuming = 0;
+	int history_len = 0;
+	struct nx_gzip_crb_cpb_t cmd, *cmdp;
+	struct nx_dde_t *ddl_in;
+	struct nx_dde_t dde_in[6] __aligned(128);
+	struct nx_dde_t *ddl_out;
+	struct nx_dde_t dde_out[6] __aligned(128);
+	int pgfault_retries;
+
+	/* when using mmap'ed files */
+	off_t input_file_offset = 0;
+
+	if (argc > 2) {
+		fprintf(stderr, "usage: %s <fname> or stdin\n", argv[0]);
+		fprintf(stderr, " writes to stdout or <fname>.nx.gunzip\n");
+		return -1;
+	}
+
+	if (argc == 1) {
+		inpf = stdin;
+		outf = stdout;
+	} else if (argc == 2) {
+		char w[1024];
+		char *wp;
+
+		inpf = fopen(argv[1], "r");
+		if (inpf == NULL) {
+			perror(argv[1]);
+			return -1;
+		}
+
+		/* Make a new file name to write to. Ignoring '.gz' */
+		wp = strrchr(argv[1], '/');
+		wp = (wp != NULL) ? (wp + 1) : argv[1];
+
+		/* Bounded formatting: refuse names that do not fit in w */
+		n = snprintf(w, sizeof(w), "%s.nx.gunzip", wp);
+		if (n < 0 || n >= (int) sizeof(w)) {
+			fprintf(stderr, "error: file name too long: %s\n",
+				argv[1]);
+			fclose(inpf);
+			return -1;
+		}
+
+		outf = fopen(w, "w");
+		if (outf == NULL) {
+			perror(w);
+			fclose(inpf);
+			return -1;
+		}
+	}
+
+	/* Decode the gzip header */
+	c = GETINPC(inpf); expect = 0x1f; /* ID1 */
+	if (c != expect)
+		goto err1;
+
+	c = GETINPC(inpf); expect = 0x8b; /* ID2 */
+	if (c != expect)
+		goto err1;
+
+	c = GETINPC(inpf); expect = 0x08; /* CM */
+	if (c != expect)
+		goto err1;
+
+	int flg = GETINPC(inpf); /* FLG */
+
+	if (flg & 0xE0 || flg & 0x4 || flg == EOF)
+		goto err2;
+
+	fprintf(stderr, "gzHeader FLG %x\n", flg);
+
+	/* Read 6 bytes; ignoring the MTIME, XFL, OS fields in this
+	 * sample code. The byte is kept in an int so that EOF stays
+	 * distinguishable from a 0xff data byte, whatever the signedness
+	 * of char.
+	 */
+	for (i = 0; i < 6; i++) {
+		c = GETINPC(inpf);
+		if (c == EOF)
+			goto err3;
+		fprintf(stderr, "%02x ", c);
+		if (i == 5)
+			fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "gzHeader MTIME, XFL, OS ignored\n");
+
+	/* FNAME */
+	if (flg & 0x8) {
+		int k = 0;
+
+		do {
+			c = GETINPC(inpf);
+			if (c == EOF || k >= FNAME_MAX)
+				goto err3;
+			gzfname[k++] = c;
+		} while (c);
+		fprintf(stderr, "gzHeader FNAME: %s\n", gzfname);
+	}
+
+	/* FHCRC */
+	if (flg & 0x2) {
+		c = GETINPC(inpf);
+		if (c == EOF)
+			goto err3;
+		c = GETINPC(inpf);
+		if (c == EOF)
+			goto err3;
+		fprintf(stderr, "gzHeader FHCRC: ignored\n");
+	}
+
+	used_in = cur_in = used_out = cur_out = 0;
+	is_final = is_eof = 0;
+
+	/* Allocate one page larger to prevent page faults due to NX
+	 * overfetching.
+	 * Either do this (char*)(uintptr_t)aligned_alloc or use
+	 * -std=c11 flag to make the int-to-pointer warning go away.
+	 *
+	 * The allocations are made outside of assert() so that they still
+	 * happen when the file is built with NDEBUG.
+	 */
+	fifo_in = (char *)(uintptr_t)aligned_alloc(line_sz,
+					fifo_in_len + page_sz);
+	fifo_out_base = (char *)(uintptr_t)aligned_alloc(line_sz,
+					fifo_out_len + page_sz + line_sz);
+	if (fifo_in == NULL || fifo_out_base == NULL) {
+		fprintf(stderr, "error: cannot allocate fifo buffers\n");
+		rc = -1;
+		goto cleanup;
+	}
+	/* Leave unused space due to history rounding rules */
+	fifo_out = fifo_out_base + line_sz;
+	nxu_touch_pages(fifo_out, fifo_out_len, page_sz, 1);
+
+	ddl_in = &dde_in[0];
+	ddl_out = &dde_out[0];
+	cmdp = &cmd;
+	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+read_state:
+
+	/* Read from .gz file */
+
+	NXPRT(fprintf(stderr, "read_state:\n"));
+
+	if (is_eof != 0)
+		goto write_state;
+
+	/* We read in to fifo_in in two steps: first: read in to from
+	 * cur_in to the end of the buffer. last: if free space wrapped
+	 * around, read from fifo_in offset 0 to offset cur_in.
+	 */
+
+	/* Reset fifo head to reduce unnecessary wrap arounds */
+	cur_in = (used_in == 0) ? 0 : cur_in;
+
+	/* Free space total is reduced by a gap */
+	free_space = NX_MAX(0, fifo_free_bytes(used_in, fifo_in_len)
+				- line_sz);
+
+	/* Free space may wrap around as first and last */
+	first_free = fifo_free_first_bytes(cur_in, used_in, fifo_in_len);
+	last_free = fifo_free_last_bytes(cur_in, used_in, fifo_in_len);
+
+	/* Start offsets of the free memory */
+	first_offset = fifo_free_first_offset(cur_in, used_in);
+	last_offset = fifo_free_last_offset(cur_in, used_in, fifo_in_len);
+
+	/* Reduce read_sz because of the line_sz gap */
+	read_sz = NX_MIN(free_space, first_free);
+	n = 0;
+	if (read_sz > 0) {
+		/* Read in to offset cur_in + used_in */
+		n = fread(fifo_in + first_offset, 1, read_sz, inpf);
+		used_in = used_in + n;
+		free_space = free_space - n;
+		assert(n <= read_sz);
+		if (n != read_sz) {
+			/* Either EOF or error; exit the read loop */
+			is_eof = 1;
+			goto write_state;
+		}
+	}
+
+	/* If free space wrapped around */
+	if (last_free > 0) {
+		/* Reduce read_sz because of the line_sz gap */
+		read_sz = NX_MIN(free_space, last_free);
+		n = 0;
+		if (read_sz > 0) {
+			n = fread(fifo_in + last_offset, 1, read_sz, inpf);
+			used_in = used_in + n; /* Increase used space */
+			free_space = free_space - n; /* Decrease free space */
+			assert(n <= read_sz);
+			if (n != read_sz) {
+				/* Either EOF or error; exit the read loop */
+				is_eof = 1;
+				goto write_state;
+			}
+		}
+	}
+
+	/* At this point we have used_in bytes in fifo_in with the
+	 * data head starting at cur_in and possibly wrapping around.
+	 */
+
+write_state:
+
+	/* Write decompressed data to output file */
+
+	NXPRT(fprintf(stderr, "write_state:\n"));
+
+	if (used_out == 0)
+		goto decomp_state;
+
+	/* If fifo_out has data waiting, write it out to the file to
+	 * make free target space for the accelerator used bytes in
+	 * the first and last parts of fifo_out.
+	 */
+
+	first_used = fifo_used_first_bytes(cur_out, used_out, fifo_out_len);
+	last_used = fifo_used_last_bytes(cur_out, used_out, fifo_out_len);
+
+	write_sz = first_used;
+
+	n = 0;
+	if (write_sz > 0) {
+		n = fwrite(fifo_out + cur_out, 1, write_sz, outf);
+		used_out = used_out - n;
+		/* Move head of the fifo */
+		cur_out = (cur_out + n) % fifo_out_len;
+		assert(n <= write_sz);
+		if (n != write_sz) {
+			fprintf(stderr, "error: write\n");
+			rc = -1;
+			goto err5;
+		}
+	}
+
+	if (last_used > 0) { /* If more data available in the last part */
+		write_sz = last_used; /* Keep it here for later */
+		n = 0;
+		if (write_sz > 0) {
+			n = fwrite(fifo_out, 1, write_sz, outf);
+			used_out = used_out - n;
+			cur_out = (cur_out + n) % fifo_out_len;
+			assert(n <= write_sz);
+			if (n != write_sz) {
+				fprintf(stderr, "error: write\n");
+				rc = -1;
+				goto err5;
+			}
+		}
+	}
+
+decomp_state:
+
+	/* NX decompresses input data */
+
+	NXPRT(fprintf(stderr, "decomp_state:\n"));
+
+	if (is_final)
+		goto finish_state;
+
+	/* Address/len lists */
+	clearp_dde(ddl_in);
+	clearp_dde(ddl_out);
+
+	/* FC, CRC, HistLen, Table 6-6 */
+	if (resuming) {
+		/* Resuming a partially decompressed input.
+		 * The key to resume is supplying the 32KB
+		 * dictionary (history) to NX, which is basically
+		 * the last 32KB of output produced.
+		 */
+		fc = GZIP_FC_DECOMPRESS_RESUME;
+
+		cmdp->cpb.in_crc = cmdp->cpb.out_crc;
+		cmdp->cpb.in_adler = cmdp->cpb.out_adler;
+
+		/* Round up the history size to quadword. Section 2.10 */
+		history_len = (history_len + 15) / 16;
+		putnn(cmdp->cpb, in_histlen, history_len);
+		history_len = history_len * 16; /* bytes */
+
+		if (history_len > 0) {
+			/* Chain in the history buffer to the DDE list */
+			if (cur_out >= history_len) {
+				nx_append_dde(ddl_in, fifo_out
+						+ (cur_out - history_len),
+						history_len);
+			} else {
+				nx_append_dde(ddl_in, fifo_out
+						+ ((fifo_out_len + cur_out)
+						- history_len),
+						history_len - cur_out);
+				/* Up to 32KB history wraps around fifo_out */
+				nx_append_dde(ddl_in, fifo_out, cur_out);
+			}
+
+		}
+	} else {
+		/* First decompress job */
+		fc = GZIP_FC_DECOMPRESS;
+
+		history_len = 0;
+		/* Writing 0 clears out subc as well */
+		cmdp->cpb.in_histlen = 0;
+		total_out = 0;
+
+		put32(cmdp->cpb, in_crc, INIT_CRC);
+		put32(cmdp->cpb, in_adler, INIT_ADLER);
+		put32(cmdp->cpb, out_crc, INIT_CRC);
+		put32(cmdp->cpb, out_adler, INIT_ADLER);
+
+		/* Assuming 10% compression ratio initially; use the
+		 * most recently measured compression ratio as a
+		 * heuristic to estimate the input and output
+		 * sizes. If we give too much input, the target buffer
+		 * overflows and NX cycles are wasted, and then we
+		 * must retry with smaller input size. 1000 is 100%.
+		 */
+		last_comp_ratio = 100UL;
+	}
+	cmdp->crb.gzip_fc = 0;
+	putnn(cmdp->crb, gzip_fc, fc);
+
+	/*
+	 * NX source buffers
+	 */
+	first_used = fifo_used_first_bytes(cur_in, used_in, fifo_in_len);
+	last_used = fifo_used_last_bytes(cur_in, used_in, fifo_in_len);
+
+	if (first_used > 0)
+		nx_append_dde(ddl_in, fifo_in + cur_in, first_used);
+
+	if (last_used > 0)
+		nx_append_dde(ddl_in, fifo_in, last_used);
+
+	/*
+	 * NX target buffers
+	 */
+	first_free = fifo_free_first_bytes(cur_out, used_out, fifo_out_len);
+	last_free = fifo_free_last_bytes(cur_out, used_out, fifo_out_len);
+
+	/* Reduce output free space amount not to overwrite the history */
+	int target_max = NX_MAX(0, fifo_free_bytes(used_out, fifo_out_len)
+				- (1<<16));
+
+	NXPRT(fprintf(stderr, "target_max %d (0x%x)\n", target_max,
+			target_max));
+
+	first_free = NX_MIN(target_max, first_free);
+	if (first_free > 0) {
+		first_offset = fifo_free_first_offset(cur_out, used_out);
+		nx_append_dde(ddl_out, fifo_out + first_offset, first_free);
+	}
+
+	if (last_free > 0) {
+		last_free = NX_MIN(target_max - first_free, last_free);
+		if (last_free > 0) {
+			last_offset = fifo_free_last_offset(cur_out, used_out,
+								fifo_out_len);
+			nx_append_dde(ddl_out, fifo_out + last_offset,
+					last_free);
+		}
+	}
+
+	/* Target buffer size is used to limit the source data size
+	 * based on previous measurements of compression ratio.
+	 */
+
+	/* source_sz includes history */
+	source_sz = getp32(ddl_in, ddebc);
+	assert(source_sz > history_len);
+	source_sz = source_sz - history_len;
+
+	/* Estimating how much source is needed to 3/4 fill a
+	 * target_max size target buffer. If we overshoot, then NX
+	 * must repeat the job with smaller input and we waste
+	 * bandwidth. If we undershoot then we use more NX calls than
+	 * necessary.
+	 */
+
+	source_sz_estimate = ((uint64_t)target_max * last_comp_ratio * 3UL)
+				/ 4000;
+
+	if (source_sz_estimate < source_sz) {
+		/* Target might be small, therefore limiting the
+		 * source data.
+		 */
+		source_sz = source_sz_estimate;
+		target_sz_estimate = target_max;
+	} else {
+		/* Source file might be small, therefore limiting target
+		 * touch pages to a smaller value to save processor cycles.
+		 */
+		target_sz_estimate = ((uint64_t)source_sz * 1000UL)
+					/ (last_comp_ratio + 1);
+		target_sz_estimate = NX_MIN(2 * target_sz_estimate,
+						target_max);
+	}
+
+	source_sz = source_sz + history_len;
+
+	/* Some NX condition codes require submitting the NX job again.
+	 * Kernel doesn't handle NX page faults. Expects user code to
+	 * touch pages.
+	 */
+	pgfault_retries = NX_MAX_FAULTS;
+
+restart_nx:
+
+	putp32(ddl_in, ddebc, source_sz);
+
+	/* Fault in pages */
+	nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), page_sz, 1);
+	nx_touch_pages_dde(ddl_in, 0, page_sz, 0);
+	nx_touch_pages_dde(ddl_out, target_sz_estimate, page_sz, 1);
+
+	/* Send job to NX */
+	cc = nx_submit_job(ddl_in, ddl_out, cmdp, devhandle);
+
+	switch (cc) {
+
+	case ERR_NX_AT_FAULT:
+
+		/* We touched the pages ahead of time. In the most common case
+		 * we shouldn't be here. But may be some pages were paged out.
+		 * Kernel should have placed the faulting address to fsaddr.
+		 */
+		NXPRT(fprintf(stderr, "ERR_NX_AT_FAULT %p\n",
+				(void *)cmdp->crb.csb.fsaddr));
+
+		if (pgfault_retries == NX_MAX_FAULTS) {
+			/* Try once with exact number of pages */
+			--pgfault_retries;
+			goto restart_nx;
+		} else if (pgfault_retries > 0) {
+			/* If still faulting try fewer input pages
+			 * assuming memory outage
+			 */
+			if (source_sz > page_sz)
+				source_sz = NX_MAX(source_sz / 2, page_sz);
+			--pgfault_retries;
+			goto restart_nx;
+		} else {
+			fprintf(stderr, "cannot make progress; too many ");
+			fprintf(stderr, "page fault retries cc= %d\n", cc);
+			rc = -1;
+			goto err5;
+		}
+
+	case ERR_NX_DATA_LENGTH:
+
+		NXPRT(fprintf(stderr, "ERR_NX_DATA_LENGTH; "));
+		NXPRT(fprintf(stderr, "stream may have trailing data\n"));
+
+		/* Not an error in the most common case; it just says
+		 * there is trailing data that we must examine.
+		 *
+		 * CC=3 CE(1)=0 CE(0)=1 indicates partial completion
+		 * Fig.6-7 and Table 6-8.
+		 */
+		nx_ce = get_csb_ce_ms3b(cmdp->crb.csb);
+
+		if (!csb_ce_termination(nx_ce) &&
+		    csb_ce_partial_completion(nx_ce)) {
+			/* Check CPB for more information
+			 * spbc and tpbc are valid
+			 */
+			sfbt = getnn(cmdp->cpb, out_sfbt); /* Table 6-4 */
+			subc = getnn(cmdp->cpb, out_subc); /* Table 6-4 */
+			spbc = get32(cmdp->cpb, out_spbc_decomp);
+			tpbc = get32(cmdp->crb.csb, tpbc);
+			assert(target_max >= tpbc);
+
+			goto ok_cc3; /* not an error */
+		} else {
+			/* History length error when CE(1)=1 CE(0)=0. */
+			rc = -1;
+			fprintf(stderr, "history length error cc= %d\n", cc);
+			goto err5;
+		}
+
+	case ERR_NX_TARGET_SPACE:
+
+		/* Target buffer not large enough; retry smaller input
+		 * data; give at least 1 byte. SPBC/TPBC are not valid.
+		 */
+		assert(source_sz > history_len);
+		source_sz = ((source_sz - history_len + 2) / 2) + history_len;
+		NXPRT(fprintf(stderr, "ERR_NX_TARGET_SPACE; retry with "));
+		NXPRT(fprintf(stderr, "smaller input data src %d hist %d\n",
+				source_sz, history_len));
+		goto restart_nx;
+
+	case ERR_NX_OK:
+
+		/* This should not happen for gzip formatted data;
+		 * we need trailing crc and isize
+		 */
+		fprintf(stderr, "ERR_NX_OK\n");
+		spbc = get32(cmdp->cpb, out_spbc_decomp);
+		tpbc = get32(cmdp->crb.csb, tpbc);
+		assert(target_max >= tpbc);
+		assert(spbc >= history_len);
+		source_sz = spbc - history_len;
+		goto offsets_state;
+
+	default:
+		fprintf(stderr, "error: cc= %d\n", cc);
+		rc = -1;
+		goto err5;
+	}
+
+ok_cc3:
+
+	NXPRT(fprintf(stderr, "cc3: sfbt: %x\n", sfbt));
+
+	assert(spbc > history_len);
+	source_sz = spbc - history_len;
+
+	/* Table 6-4: Source Final Block Type (SFBT) describes the
+	 * last processed deflate block and clues the software how to
+	 * resume the next job. SUBC indicates how many input bits NX
+	 * consumed but did not process. SPBC indicates how many
+	 * bytes of source were given to the accelerator including
+	 * history bytes.
+	 */
+
+	switch (sfbt) {
+		int dhtlen;
+
+	case 0x0: /* Deflate final EOB received */
+
+		/* Calculating the checksum start position. */
+
+		source_sz = source_sz - subc / 8;
+		is_final = 1;
+		break;
+
+		/* Resume decompression cases are below. Basically
+		 * indicates where NX has suspended and how to resume
+		 * the input stream.
+		 */
+
+	case 0x8: /* Within a literal block; use rembytecount */
+	case 0x9: /* Within a literal block; use rembytecount; bfinal=1 */
+
+		/* Supply the partially processed source byte again */
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* SUBC LS 3bits: number of bits in the first source byte need
+		 * to be processed.
+		 * 000 means all 8 bits; Table 6-3
+		 * Clear subc, histlen, sfbt, rembytecnt, dhtlen
+		 */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+		putnn(cmdp->cpb, in_rembytecnt, getnn(cmdp->cpb,
+							out_rembytecnt));
+		break;
+
+	case 0xA: /* Within a FH block; */
+	case 0xB: /* Within a FH block; bfinal=1 */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+		break;
+
+	case 0xC: /* Within a DH block; */
+	case 0xD: /* Within a DH block; bfinal=1 */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+
+		dhtlen = getnn(cmdp->cpb, out_dhtlen);
+		putnn(cmdp->cpb, in_dhtlen, dhtlen);
+		assert(dhtlen >= 42);
+
+		/* Round up to a qword */
+		dhtlen = (dhtlen + 127) / 128;
+
+		while (dhtlen > 0) { /* Copy dht from cpb.out to cpb.in */
+			--dhtlen;
+			cmdp->cpb.in_dht[dhtlen] = cmdp->cpb.out_dht[dhtlen];
+		}
+		break;
+
+	case 0xE: /* Within a block header; bfinal=0; */
+		/* Also given if source data exactly ends (SUBC=0) with
+		 * EOB code with BFINAL=0. Means the next byte will
+		 * contain a block header.
+		 */
+	case 0xF: /* within a block header with BFINAL=1. */
+
+		source_sz = source_sz - ((subc + 7) / 8);
+
+		/* Clear subc, histlen, sfbt, rembytecnt, dhtlen */
+		cmdp->cpb.in_subc = 0;
+		cmdp->cpb.in_sfbt = 0;
+		putnn(cmdp->cpb, in_subc, subc % 8);
+		putnn(cmdp->cpb, in_sfbt, sfbt);
+
+		/* Engine did not process any data */
+		if (is_eof && (source_sz == 0))
+			is_final = 1;
+	}
+
+offsets_state:
+
+	/* Adjust the source and target buffer offsets and lengths */
+
+	NXPRT(fprintf(stderr, "offsets_state:\n"));
+
+	/* Delete input data from fifo_in */
+	used_in = used_in - source_sz;
+	cur_in = (cur_in + source_sz) % fifo_in_len;
+	input_file_offset = input_file_offset + source_sz;
+
+	/* Add output data to fifo_out */
+	used_out = used_out + tpbc;
+
+	assert(used_out <= fifo_out_len);
+
+	total_out = total_out + tpbc;
+
+	/* Deflate history is 32KB max. No need to supply more
+	 * than 32KB on a resume.
+	 */
+	history_len = (total_out > window_max) ? window_max : total_out;
+
+	/* To estimate expected expansion in the next NX job; 500 means 50%.
+	 * Deflate best case is around 1 to 1000.
+	 */
+	last_comp_ratio = (1000UL * ((uint64_t)source_sz + 1))
+				/ ((uint64_t)tpbc + 1);
+	last_comp_ratio = NX_MAX(NX_MIN(1000UL, last_comp_ratio), 1);
+	NXPRT(fprintf(stderr, "comp_ratio %ld source_sz %d spbc %d tpbc %d\n",
+			last_comp_ratio, source_sz, spbc, tpbc));
+
+	resuming = 1;
+
+finish_state:
+
+	NXPRT(fprintf(stderr, "finish_state:\n"));
+
+	if (is_final) {
+		if (used_out)
+			goto write_state; /* More data to write out */
+		else if (used_in < 8) {
+			/* Need at least 8 more bytes containing gzip crc
+			 * and isize.
+			 */
+			rc = -1;
+			goto err4;
+		} else {
+			/* Compare checksums and exit */
+			int i;
+			unsigned char tail[8];
+			uint32_t cksum, isize;
+
+			for (i = 0; i < 8; i++)
+				tail[i] = fifo_in[(cur_in + i) % fifo_in_len];
+			fprintf(stderr, "computed checksum %08x isize %08x\n",
+				cmdp->cpb.out_crc, (uint32_t) (total_out
+				% (1ULL<<32)));
+			cksum = ((uint32_t) tail[0] | (uint32_t) tail[1]<<8
+				| (uint32_t) tail[2]<<16
+				| (uint32_t) tail[3]<<24);
+			isize = ((uint32_t) tail[4] | (uint32_t) tail[5]<<8
+				| (uint32_t) tail[6]<<16
+				| (uint32_t) tail[7]<<24);
+			fprintf(stderr, "stored checksum %08x isize %08x\n",
+				cksum, isize);
+
+			if (cksum == cmdp->cpb.out_crc && isize == (uint32_t)
+			    (total_out % (1ULL<<32))) {
+				rc = 0; goto ok1;
+			} else {
+				rc = -1; goto err4;
+			}
+		}
+	} else
+		goto read_state;
+
+	return -1;
+
+err1:
+	fprintf(stderr, "error: not a gzip file, expect %x, read %x\n",
+		expect, c);
+	rc = -1;
+	goto cleanup;
+
+err2:
+	fprintf(stderr, "error: the FLG byte is wrong or not being handled\n");
+	rc = -1;
+	goto cleanup;
+
+err3:
+	fprintf(stderr, "error: gzip header\n");
+	rc = -1;
+	goto cleanup;
+
+err4:
+	fprintf(stderr, "error: checksum missing or mismatch\n");
+
+err5:
+ok1:
+	fprintf(stderr, "decomp is complete: fclose\n");
+
+cleanup:
+	/* Release everything acquired above; free(NULL) is a no-op */
+	if (outf != NULL)
+		fclose(outf);
+	if (inpf != NULL && inpf != stdin)
+		fclose(inpf);
+	free(fifo_in);
+	free(fifo_out_base);
+
+	return rc;
+}
+
+
+/*
+ * Install the NX SIGSEGV handler, open the accelerator, run
+ * decompress_file() on the command line arguments and return its result.
+ */
+int main(int argc, char **argv)
+{
+	struct sigaction act;
+	void *handle;
+	int rc;
+
+	nx_dbg = 0;
+	nx_gzip_log = NULL;
+
+	/* SA_SIGINFO style handler with an empty signal mask */
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO;
+	act.sa_restorer = 0;
+	act.sa_handler = 0;
+	act.sa_sigaction = nxu_sigsegv_handler;
+	sigaction(SIGSEGV, &act, NULL);
+
+	handle = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+	if (handle == NULL) {
+		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+		exit(-1);
+	}
+
+	rc = decompress_file(argc, argv, handle);
+	nx_function_end(handle);
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
new file mode 100644
index 000000000..02dffb65d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzfht_test.c
@@ -0,0 +1,433 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/* P9 gzip sample code for demonstrating the P9 NX hardware interface.
+ * Not intended for productive uses or for performance or compression
+ * ratio measurements. For simplicity of demonstration, this sample
+ * code compresses in to fixed Huffman blocks only (Deflate btype=1)
+ * and has very simple memory management. Dynamic Huffman blocks
+ * (Deflate btype=2) are more involved as detailed in the user guide.
+ * Note also that /dev/crypto/gzip, VAS and skiboot support are
+ * required.
+ *
+ * Copyright 2020 IBM Corp.
+ *
+ * https://github.com/libnxz/power-gzip for zlib api and other utils
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ * Definitions of acronyms used here. See
+ * P9 NX Gzip Accelerator User's Manual for details:
+ * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce: completion extension
+ * cpb: coprocessor parameter block (metadata)
+ * crb: coprocessor request block (command)
+ * csb: coprocessor status block (status)
+ * dht: dynamic huffman table
+ * dde: data descriptor element (address, length)
+ * ddl: list of ddes
+ * dh/fh: dynamic and fixed huffman types
+ * fc: coprocessor function code
+ * histlen: history/dictionary length
+ * history: sliding window of up to 32KB of data
+ * lzcount: Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt: source final block type; last block's type during decomp
+ * spbc: source processed byte count
+ * subc: source unprocessed bit count
+ * tebc: target ending bit count; valid bits in the last byte
+ * tpbc: target processed byte count
+ * vas: virtual accelerator switch; the user mode interface
+ */
+
+#define _ISOC11_SOURCE // For aligned_alloc()
+#define _DEFAULT_SOURCE // For endian.h
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "nxu.h"
+#include "nx.h"
+
+int nx_dbg;
+FILE *nx_gzip_log;
+
+#define NX_MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+#define FNAME_MAX 1024
+#define FEXT ".nx.gz"
+
+/*
+ * Builds one fixed-Huffman compression request in *cmdp and submits it
+ * through nxu_submit_job(), whose return value is passed back.
+ *
+ * src/srclen:  source buffer and its byte count (source DDE)
+ * dst/dstlen:  target buffer and its capacity (target DDE)
+ * with_count:  nonzero selects GZIP_FC_COMPRESS_RESUME_FHT_COUNT,
+ *              zero selects GZIP_FC_COMPRESS_RESUME_FHT
+ * cmdp:        request/parameter block; must not be NULL (asserted)
+ * handle:      passed through to nxu_submit_job()
+ *
+ * LZ counts returned in the user supplied nx_gzip_crb_cpb_t structure.
+ */
+static int compress_fht_sample(char *src, uint32_t srclen, char *dst,
+ uint32_t dstlen, int with_count,
+ struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+ uint32_t fc;
+
+ assert(!!cmdp);
+
+ /* Zero the whole function code word before setting the fc field */
+ put32(cmdp->crb, gzip_fc, 0); /* clear */
+ fc = (with_count) ? GZIP_FC_COMPRESS_RESUME_FHT_COUNT :
+ GZIP_FC_COMPRESS_RESUME_FHT;
+ putnn(cmdp->crb, gzip_fc, fc);
+ putnn(cmdp->cpb, in_histlen, 0); /* resuming with no history */
+ /* Status block starts zeroed for this job */
+ memset((void *) &cmdp->crb.csb, 0, sizeof(cmdp->crb.csb));
+
+ /* Section 6.6 programming notes; spbc may be in two different
+ * places depending on FC.
+ */
+ if (!with_count)
+ put32(cmdp->cpb, out_spbc_comp, 0);
+ else
+ put32(cmdp->cpb, out_spbc_comp_with_count, 0);
+
+ /* Figure 6-3 6-4; CSB location */
+ put64(cmdp->crb, csb_address, 0);
+ put64(cmdp->crb, csb_address,
+ (uint64_t) &cmdp->crb.csb & csb_address_mask);
+
+ /* Source direct dde (scatter-gather list) */
+ clear_dde(cmdp->crb.source_dde);
+ putnn(cmdp->crb.source_dde, dde_count, 0);
+ put32(cmdp->crb.source_dde, ddebc, srclen);
+ put64(cmdp->crb.source_dde, ddead, (uint64_t) src);
+
+ /* Target direct dde (scatter-gather list) */
+ clear_dde(cmdp->crb.target_dde);
+ putnn(cmdp->crb.target_dde, dde_count, 0);
+ put32(cmdp->crb.target_dde, ddebc, dstlen);
+ put64(cmdp->crb.target_dde, ddead, (uint64_t) dst);
+
+ /* Submit the crb, the job descriptor, to the accelerator */
+ return nxu_submit_job(cmdp, handle);
+}
+
+/*
+ * Writes a minimal gzip header (no file name, no timestamp, no
+ * optional fields) at buf and returns how many bytes were written.
+ * Gzip specification at https://tools.ietf.org/html/rfc1952
+ */
+int gzip_header_blank(char *buf)
+{
+	static const unsigned char blank_hdr[] = {
+		0x1f,			/* ID1 */
+		0x8b,			/* ID2 */
+		0x08,			/* CM */
+		0x00,			/* FLG */
+		0x00, 0x00, 0x00, 0x00,	/* MTIME */
+		0x04,			/* XFL 4=fastest */
+		0x03,			/* OS UNIX */
+	};
+
+	memcpy(buf, blank_hdr, sizeof(blank_hdr));
+
+	return (int) sizeof(blank_hdr);
+}
+
+/*
+ * Reads the whole file fname into a freshly malloc()ed buffer.
+ *
+ * On success returns 0, stores the buffer in *buf and the number of
+ * bytes read in *bufsize; the caller must free() the buffer.
+ * On any error prints a diagnostic, releases everything it acquired
+ * and returns nonzero; *buf and *bufsize are left untouched.
+ */
+int read_alloc_input_file(char *fname, char **buf, size_t *bufsize)
+{
+	struct stat statbuf;
+	FILE *fp;
+	char *p;
+	size_t want, num_bytes;
+
+	if (stat(fname, &statbuf)) {
+		perror(fname);
+		return -1;
+	}
+	fp = fopen(fname, "r");
+	if (fp == NULL) {
+		perror(fname);
+		return -1;
+	}
+
+	want = (size_t) statbuf.st_size;
+	/* malloc(0) may return NULL, so always request at least a byte.
+	 * The allocation is done outside of assert() so that it survives
+	 * an NDEBUG build.
+	 */
+	p = malloc(want ? want : 1);
+	if (p == NULL) {
+		perror("malloc");
+		fclose(fp);
+		return -1;
+	}
+
+	num_bytes = fread(p, 1, want, fp);
+	if (ferror(fp)) {
+		perror(fname);
+		free(p);
+		fclose(fp);
+		return -1;
+	}
+	if (num_bytes != want) {
+		/* No stream error, so errno is meaningless here */
+		fprintf(stderr, "%s: short read\n", fname);
+		free(p);
+		fclose(fp);
+		return -1;
+	}
+	fclose(fp);
+
+	*buf = p;
+	*bufsize = num_bytes;
+	return 0;
+}
+
+/*
+ * Writes bufsize bytes from buf to the file fname, creating or
+ * truncating it.  Returns 0 on success and nonzero on error; the
+ * stream is closed on every path.
+ */
+int write_output_file(char *fname, char *buf, size_t bufsize)
+{
+	FILE *fp;
+	size_t num_bytes;
+	int rc = 0;
+
+	fp = fopen(fname, "w");
+	if (fp == NULL) {
+		perror(fname);
+		return -1;
+	}
+	num_bytes = fwrite(buf, 1, bufsize, fp);
+	if (ferror(fp) || (num_bytes != bufsize)) {
+		perror(fname);
+		rc = -1;
+	}
+	/* fclose() flushes buffered data; a failure here means that the
+	 * file is incomplete even though fwrite() reported success.
+	 */
+	if (fclose(fp) != 0) {
+		if (!rc)
+			perror(fname);
+		rc = -1;
+	}
+	return rc;
+}
+
+/*
+ * Appends an empty stored block (Z_SYNC_FLUSH as described in zlib.h)
+ * so that the stream ends on a byte boundary.  buf points just past
+ * the last compressed byte, tebc is the number of valid bits in that
+ * last byte (0 when it is full) and final supplies the BFINAL bit.
+ * Returns number of appended bytes
+ */
+int append_sync_flush(char *buf, int tebc, int final)
+{
+	int nbits = (tebc & 0x7);
+	uint64_t bits;
+	char *out;
+
+	if (tebc > 0) {
+		/* Last byte is partially full; keep only its valid bits */
+		out = buf - 1;
+		*out = *out & (unsigned char) ((1<<tebc)-1);
+	} else {
+		out = buf;
+		*out = 0;
+	}
+
+	bits = ((0x1ULL & final) << nbits) | *out;
+
+	/* BFINAL and BTYPE take 3 bits; pad to the next byte boundary */
+	nbits = (nbits + 3 <= 8) ? 8 : 16;
+
+	/* Zero length block: LEN = 0x0000, NLEN = 0xffff */
+	bits |= (0xFFFF0000ULL) << nbits;
+
+	for (nbits += 32; nbits > 0; nbits -= 8) {
+		*out++ = (unsigned char) (bits & 0xffULL);
+		bits >>= 8;
+	}
+
+	if (tebc == 0 || tebc > 5)
+		return 5;
+	return 4;
+}
+
+/*
+ * Sets (bfinal nonzero) or clears (bfinal zero) the final-block bit,
+ * i.e. bit 0 of the first byte at buf.  This call assumes the block
+ * beginning is byte aligned.
+ */
+static void set_bfinal(void *buf, int bfinal)
+{
+	char *first = buf;
+
+	*first = bfinal ? (*first | (unsigned char) 0x01)
+			: (*first & (unsigned char) 0xfe);
+}
+
+/*
+ * Compresses the file named by argv[1] into "<argv[1]>.nx.gz" using
+ * fixed Huffman blocks, one NX job per chunk of input.
+ *
+ * argc/argv: command line; exactly one file name is expected
+ * handle:    NX handle passed through to compress_fht_sample()
+ *
+ * Returns 0 on success.  Usage errors, allocation failures, I/O errors
+ * and unrecoverable NX errors terminate the process with exit(-1).
+ */
+int compress_file(int argc, char **argv, void *handle)
+{
+	char *inbuf, *outbuf, *srcbuf, *dstbuf;
+	char outname[FNAME_MAX];
+	uint32_t srclen, dstlen;
+	uint32_t flushlen, chunk;
+	size_t inlen, outlen, dsttotlen, srctotlen;
+	/* crc starts at 0 so that the trailer is defined for empty input */
+	uint32_t crc = 0, isize, spbc, tpbc, tebc;
+	int lzcounts = 0;
+	int cc;
+	int num_hdr_bytes;
+	struct nx_gzip_crb_cpb_t *cmdp;
+	uint32_t pagelen = 65536;
+	int fault_tries = NX_MAX_FAULTS;
+
+	if (argc != 2) {
+		fprintf(stderr, "usage: %s <fname>\n", argv[0]);
+		exit(-1);
+	}
+
+	/* Check the output name up front, with a real test rather than
+	 * an assert() that disappears under NDEBUG.
+	 */
+	if (strlen(argv[1]) + strlen(FEXT) >= FNAME_MAX) {
+		fprintf(stderr, "error: file name too long: %s\n", argv[1]);
+		exit(-1);
+	}
+
+	cmdp = aligned_alloc(sizeof(struct nx_gzip_crb_cpb_t),
+			     sizeof(struct nx_gzip_crb_cpb_t));
+	if (cmdp == NULL) {
+		fprintf(stderr, "error: cannot allocate the NX command\n");
+		exit(-1);
+	}
+
+	if (read_alloc_input_file(argv[1], &inbuf, &inlen))
+		exit(-1);
+	fprintf(stderr, "file %s read, %zu bytes\n", argv[1], inlen);
+
+	/* Generous output buffer for header/trailer */
+	outlen = 2 * inlen + 1024;
+
+	outbuf = malloc(outlen);
+	if (outbuf == NULL) {
+		fprintf(stderr, "error: cannot allocate the output buffer\n");
+		exit(-1);
+	}
+	nxu_touch_pages(outbuf, outlen, pagelen, 1);
+
+	/* Compress piecemeal in smallish chunks */
+	chunk = 1<<22;
+
+	/* Write the gzip header to the stream */
+	num_hdr_bytes = gzip_header_blank(outbuf);
+	dstbuf = outbuf + num_hdr_bytes;
+	outlen = outlen - num_hdr_bytes;
+	dsttotlen = num_hdr_bytes;
+
+	srcbuf = inbuf;
+	srctotlen = 0;
+
+	/* Init the CRB, the coprocessor request block */
+	memset(&cmdp->crb, 0, sizeof(cmdp->crb));
+
+	/* Initial gzip crc32 */
+	put32(cmdp->cpb, in_crc, 0);
+
+	while (inlen > 0) {
+
+		/* Submit chunk size source data per job */
+		srclen = NX_MIN(chunk, inlen);
+		/* Supply large target in case data expands */
+		dstlen = NX_MIN(2*srclen, outlen);
+
+		/* Page faults are handled by the user code */
+
+		/* Fault-in pages; an improved code wouldn't touch so
+		 * many pages but would try to estimate the
+		 * compression ratio and adjust both the src and dst
+		 * touch amounts.
+		 */
+		nxu_touch_pages(cmdp, sizeof(struct nx_gzip_crb_cpb_t), pagelen,
+				1);
+		nxu_touch_pages(srcbuf, srclen, pagelen, 0);
+		nxu_touch_pages(dstbuf, dstlen, pagelen, 1);
+
+		cc = compress_fht_sample(
+			srcbuf, srclen,
+			dstbuf, dstlen,
+			lzcounts, cmdp, handle);
+
+		if (cc != ERR_NX_OK && cc != ERR_NX_TPBC_GT_SPBC &&
+		    cc != ERR_NX_AT_FAULT) {
+			fprintf(stderr, "nx error: cc= %d\n", cc);
+			exit(-1);
+		}
+
+		/* Page faults are handled by the user code */
+		if (cc == ERR_NX_AT_FAULT) {
+			NXPRT(fprintf(stderr, "page fault: cc= %d, ", cc));
+			NXPRT(fprintf(stderr, "try= %d, fsa= %08llx\n",
+				  fault_tries,
+				  (unsigned long long) cmdp->crb.csb.fsaddr));
+			fault_tries--;
+			if (fault_tries > 0) {
+				/* Retry the same chunk */
+				continue;
+			} else {
+				fprintf(stderr, "error: cannot progress; ");
+				fprintf(stderr, "too many faults\n");
+				exit(-1);
+			}
+		}
+
+		fault_tries = NX_MAX_FAULTS; /* Reset for the next chunk */
+
+		inlen = inlen - srclen;
+		srcbuf = srcbuf + srclen;
+		srctotlen = srctotlen + srclen;
+
+		/* Two possible locations for spbc depending on the function
+		 * code.
+		 */
+		spbc = (!lzcounts) ? get32(cmdp->cpb, out_spbc_comp) :
+			get32(cmdp->cpb, out_spbc_comp_with_count);
+		assert(spbc == srclen);
+
+		/* Target byte count */
+		tpbc = get32(cmdp->crb.csb, tpbc);
+		/* Target ending bit count */
+		tebc = getnn(cmdp->cpb, out_tebc);
+		NXPRT(fprintf(stderr, "compressed chunk %d ", spbc));
+		NXPRT(fprintf(stderr, "to %d bytes, tebc= %d\n", tpbc, tebc));
+
+		if (inlen > 0) { /* More chunks to go */
+			set_bfinal(dstbuf, 0);
+			dstbuf = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen = outlen - tpbc;
+			/* Round up to the next byte with a flush
+			 * block; do not set the BFINAL bit.
+			 */
+			flushlen = append_sync_flush(dstbuf, tebc, 0);
+			dsttotlen = dsttotlen + flushlen;
+			outlen = outlen - flushlen;
+			dstbuf = dstbuf + flushlen;
+			NXPRT(fprintf(stderr, "added sync_flush %d bytes\n",
+				      flushlen));
+		} else {  /* Done */
+			/* Set the BFINAL bit of the last block per Deflate
+			 * specification.
+			 */
+			set_bfinal(dstbuf, 1);
+			dstbuf = dstbuf + tpbc;
+			dsttotlen = dsttotlen + tpbc;
+			outlen = outlen - tpbc;
+		}
+
+		/* Resuming crc32 for the next chunk */
+		crc = get32(cmdp->cpb, out_crc);
+		put32(cmdp->cpb, in_crc, crc);
+		crc = be32toh(crc);
+	}
+
+	if (srctotlen == 0) {
+		/* Empty input: no NX job ran, so emit the two byte empty
+		 * fixed Huffman final block (BFINAL=1, BTYPE=01, EOB) to
+		 * keep the gzip stream valid.
+		 */
+		dstbuf[0] = 0x03;
+		dstbuf[1] = 0x00;
+		dstbuf = dstbuf + 2;
+		dsttotlen = dsttotlen + 2;
+		outlen = outlen - 2;
+	}
+
+	/* Append crc32 and ISIZE to the end.  ISIZE is the input size
+	 * modulo 2^32 stored little endian regardless of the host.
+	 */
+	memcpy(dstbuf, &crc, 4);
+	isize = htole32((uint32_t) srctotlen);
+	memcpy(dstbuf+4, &isize, 4);
+	dsttotlen = dsttotlen + 8;
+	outlen = outlen - 8;
+
+	strcpy(outname, argv[1]);
+	strcat(outname, FEXT);
+	if (write_output_file(outname, outbuf, dsttotlen)) {
+		fprintf(stderr, "write error: %s\n", outname);
+		exit(-1);
+	}
+
+	fprintf(stderr, "compressed %zu to %zu bytes total, ", srctotlen,
+		dsttotlen);
+	fprintf(stderr, "crc32 checksum = %08x\n", crc);
+
+	free(inbuf);
+	free(outbuf);
+	free(cmdp);
+
+	return 0;
+}
+
+/*
+ * Program entry point: installs the SIGSEGV handler used for NX page
+ * fault reporting, opens the NX gzip engine, runs compress_file() on
+ * the command line arguments and returns its result.
+ */
+int main(int argc, char **argv)
+{
+	struct sigaction segv_action;
+	void *nx;
+	int status;
+
+	/* No log file is configured for this sample */
+	nx_gzip_log = NULL;
+	nx_dbg = 0;
+
+	/* sa_handler is cleared first; sa_sigaction is the one in use */
+	segv_action.sa_handler = 0;
+	segv_action.sa_sigaction = nxu_sigsegv_handler;
+	segv_action.sa_restorer = 0;
+	segv_action.sa_flags = SA_SIGINFO;
+	sigemptyset(&segv_action.sa_mask);
+	sigaction(SIGSEGV, &segv_action, NULL);
+
+	nx = nx_function_begin(NX_FUNC_COMP_GZIP, 0);
+	if (nx == NULL) {
+		fprintf(stderr, "Unable to init NX, errno %d\n", errno);
+		exit(-1);
+	}
+
+	status = compress_file(argc, argv, nx);
+	nx_function_end(nx);
+
+	return status;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
new file mode 100644
index 000000000..c055885da
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/gzip_vas.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * Copyright 2020 IBM Corp.
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <endian.h>
+#include <bits/endian.h>
+#include <sys/ioctl.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include "vas-api.h"
+#include "nx.h"
+#include "copy-paste.h"
+#include "nxu.h"
+#include "nx_dbg.h"
+#include <sys/platform/ppc.h>
+
+#define barrier()
+#define hwsync() ({ asm volatile("sync" ::: "memory"); })
+
+#ifndef NX_NO_CPU_PRI
+#define cpu_pri_default() ({ asm volatile ("or 2, 2, 2"); })
+#define cpu_pri_low() ({ asm volatile ("or 31, 31, 31"); })
+#else
+#define cpu_pri_default()
+#define cpu_pri_low()
+#endif
+
+void *nx_fault_storage_address;
+
+struct nx_handle {
+ int fd;
+ int function;
+ void *paste_addr;
+};
+
+/*
+ * Opens the NX device node devname, opens a VAS send window on it and
+ * maps the window's paste region.
+ *
+ * devname: path of the device node
+ * pri:     stored in the vas_id field of the window-open request
+ * handle:  on success receives the open fd and the paste address
+ *          (mapping base + 0x400)
+ *
+ * Returns 0 on success with the fd left open for nx_function_end() to
+ * close, or a negative errno value with the fd closed.
+ */
+static int open_device_nodes(char *devname, int pri, struct nx_handle *handle)
+{
+	int rc, fd, err;
+	void *addr;
+	struct vas_tx_win_open_attr txattr;
+
+	fd = open(devname, O_RDWR);
+	if (fd < 0) {
+		/* Capture errno before fprintf() can overwrite it */
+		err = errno;
+		fprintf(stderr, " open device name %s\n", devname);
+		return -err;
+	}
+
+	memset(&txattr, 0, sizeof(txattr));
+	txattr.version = 1;
+	txattr.vas_id = pri;
+	rc = ioctl(fd, VAS_TX_WIN_OPEN, (unsigned long)&txattr);
+	if (rc < 0) {
+		err = errno;
+		fprintf(stderr, "ioctl() n %d, error %d\n", rc, err);
+		rc = -err;
+		goto out_close;
+	}
+
+	addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0ULL);
+	if (addr == MAP_FAILED) {
+		err = errno;
+		fprintf(stderr, "mmap() failed, errno %d\n", err);
+		rc = -err;
+		goto out_close;
+	}
+
+	/* The handle now owns fd; it must stay open until
+	 * nx_function_end() closes it.
+	 */
+	handle->fd = fd;
+	handle->paste_addr = (void *)((char *)addr + 0x400);
+
+	return 0;
+
+out_close:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Allocates an NX handle and opens the gzip accelerator device.
+ *
+ * function: must be NX_FUNC_COMP_GZIP
+ * pri:      passed through to open_device_nodes()
+ *
+ * Returns the handle, to be released with nx_function_end(), or NULL
+ * with errno set (EINVAL, ENOMEM or the device open error).
+ */
+void *nx_function_begin(int function, int pri)
+{
+	int rc;
+	char *devname = "/dev/crypto/nx-gzip";
+	struct nx_handle *nxhandle;
+
+	/* errno is always assigned last so that the diagnostics and the
+	 * cleanup cannot overwrite the value the caller will read.
+	 */
+	if (function != NX_FUNC_COMP_GZIP) {
+		fprintf(stderr, " NX_FUNC_COMP_GZIP not found\n");
+		errno = EINVAL;
+		return NULL;
+	}
+
+	nxhandle = malloc(sizeof(*nxhandle));
+	if (!nxhandle) {
+		fprintf(stderr, " No memory\n");
+		errno = ENOMEM;
+		return NULL;
+	}
+
+	nxhandle->function = function;
+	rc = open_device_nodes(devname, pri, nxhandle);
+	if (rc < 0) {
+		fprintf(stderr, " open_device_nodes failed\n");
+		/* Do not leak the handle on failure */
+		free(nxhandle);
+		errno = -rc;
+		return NULL;
+	}
+
+	return nxhandle;
+}
+
+/*
+ * Releases a handle obtained from nx_function_begin(): unmaps the
+ * paste region, closes the device fd and frees the handle.
+ *
+ * Returns the munmap() result (0 on success, negative on failure).
+ * The fd and the handle are released even when munmap() fails.
+ * Returns -1 with errno set to EINVAL for a NULL handle.
+ */
+int nx_function_end(void *handle)
+{
+	int rc;
+	struct nx_handle *nxhandle = handle;
+
+	if (!nxhandle) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* paste_addr is the mapping base + 0x400; arithmetic is done on
+	 * a char pointer since it is not defined for void pointers.
+	 */
+	rc = munmap((char *) nxhandle->paste_addr - 0x400, 4096);
+	if (rc < 0)
+		fprintf(stderr, "munmap() failed, errno %d\n", errno);
+
+	close(nxhandle->fd);
+	free(nxhandle);
+
+	return rc;
+}
+
+/*
+ * Polls the status block of cmdp until its valid field (csb_v) is set.
+ *
+ * Returns 0 once csb_v is nonzero, -EAGAIN as soon as
+ * nx_fault_storage_address is found set (by the SIGSEGV handler), and
+ * -ETIMEDOUT if csb_v is still zero after more than CSB_MAX_POLL polls.
+ */
+static int nx_wait_for_csb(struct nx_gzip_crb_cpb_t *cmdp)
+{
+ long poll = 0;
+ uint64_t t;
+
+ /* Save power and let other threads use the h/w. top may show
+ * 100% but only because OS doesn't know we slowed this
+ * h/w thread while polling. We're letting other threads have
+ * higher throughput on the core.
+ */
+ cpu_pri_low();
+
+#define CSB_MAX_POLL 200000000UL
+#define USLEEP_TH 300000UL
+
+ /* Timebase at the start of the wait; compared against USLEEP_TH */
+ t = __ppc_get_timebase();
+
+ while (getnn(cmdp->crb.csb, csb_v) == 0) {
+ ++poll;
+ hwsync();
+
+ cpu_pri_low();
+
+ /* usleep(0) takes around 29000 ticks ~60 us.
+ * 300000 is spinning for about 600 us then
+ * start sleeping.
+ * NOTE(review): the figure above is quoted for usleep(0)
+ * while the code calls usleep(1).
+ */
+ if ((__ppc_get_timebase() - t) > USLEEP_TH) {
+ cpu_pri_default();
+ usleep(1);
+ }
+
+ /* Give up polling; the check after the loop reports it */
+ if (poll > CSB_MAX_POLL)
+ break;
+
+ /* Fault address from signal handler */
+ if (nx_fault_storage_address) {
+ cpu_pri_default();
+ return -EAGAIN;
+ }
+
+ }
+
+ cpu_pri_default();
+
+ /* hw has updated csb and output buffer */
+ hwsync();
+
+ /* Check CSB flags. */
+ if (getnn(cmdp->crb.csb, csb_v) == 0) {
+ fprintf(stderr, "CSB still not valid after %d polls.\n",
+ (int) poll);
+ prt_err("CSB still not valid after %d polls, giving up.\n",
+ (int) poll);
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+
+/*
+ * Submits the request block of cmdp to the accelerator with
+ * copy/paste and waits for its completion.
+ *
+ * A paste result of 2 or 3 is treated as accepted, after which the
+ * status block is polled with nx_wait_for_csb().  A rejected paste is
+ * retried after a short spin (first attempts) or a sleep.  When the
+ * wait reports -EAGAIN the faulting address recorded by the SIGSEGV
+ * handler is touched and the job is resubmitted.
+ *
+ * Returns 0 when the status block became valid, the negative error
+ * from nx_wait_for_csb(), or -EBUSY when every paste attempt was
+ * rejected.
+ */
+static int nxu_run_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int i, ret, retries;
+	/* Lives outside the loop so that the failure message really is
+	 * printed only once every 100 rejected pastes.
+	 */
+	unsigned int pr = 0;
+	struct nx_handle *nxhandle = handle;
+
+	assert(handle != NULL);
+	i = 0;
+	retries = 5000;
+	ret = -EBUSY;
+	while (i++ < retries) {
+		hwsync();
+		vas_copy(&cmdp->crb, 0);
+		ret = vas_paste(nxhandle->paste_addr, 0);
+		hwsync();
+
+		NXPRT(fprintf(stderr, "Paste attempt %d/%d returns 0x%x\n",
+				i, retries, ret));
+
+		if ((ret == 2) || (ret == 3)) {
+
+			ret = nx_wait_for_csb(cmdp);
+			if (!ret) {
+				goto out;
+			} else if (ret == -EAGAIN) {
+				long x;
+
+				prt_err("Touching address %p, 0x%lx\n",
+					 nx_fault_storage_address,
+					 *(long *) nx_fault_storage_address);
+				/* Read and write back to fault the page in */
+				x = *(long *) nx_fault_storage_address;
+				*(long *) nx_fault_storage_address = x;
+				nx_fault_storage_address = 0;
+				continue;
+			} else {
+				prt_err("wait_for_csb() returns %d\n", ret);
+				break;
+			}
+		} else {
+			if (i < 10) {
+				/* spin for few ticks */
+#define SPIN_TH 500UL
+				uint64_t fail_spin;
+
+				fail_spin = __ppc_get_timebase();
+				while ((__ppc_get_timebase() - fail_spin) <
+					 SPIN_TH)
+					;
+			} else {
+				/* sleep */
+				if (pr++ % 100 == 0) {
+					prt_err("Paste attempt %d/", i);
+					prt_err("%d, failed pid= %d\n", retries,
+						getpid());
+				}
+				usleep(1);
+			}
+			/* A rejected paste must never be mistaken for
+			 * success if the retries run out here.
+			 */
+			ret = -EBUSY;
+			continue;
+		}
+	}
+
+out:
+	cpu_pri_default();
+
+	return ret;
+}
+
+/*
+ * Runs one NX job.  Returns the nonzero result of nxu_run_job() when
+ * the job could not be run, otherwise the completion code read from
+ * the status block.
+ */
+int nxu_submit_job(struct nx_gzip_crb_cpb_t *cmdp, void *handle)
+{
+	int run_rc = nxu_run_job(cmdp, handle);
+
+	if (run_rc)
+		return run_rc;
+
+	/* CC Table 6-8 */
+	return getnn(cmdp->crb.csb, csb_cc);
+}
+
+
+/*
+ * SIGSEGV handler: logs the signal details and records the faulting
+ * address in nx_fault_storage_address.
+ */
+void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx)
+{
+	void *fault_addr = info->si_addr;
+
+	fprintf(stderr, "%d: Got signal %d si_code %d, si_addr %p\n", getpid(),
+		sig, info->si_code, fault_addr);
+
+	nx_fault_storage_address = fault_addr;
+}
+
+/*
+ * Fault in pages prior to NX job submission. wr=1 may be required to
+ * touch writeable pages. System zero pages do not fault-in the page as
+ * intended. Typically set wr=1 for NX target pages and set wr=0 for NX
+ * source pages.
+ *
+ * buf/buf_len: region to touch
+ * page_len:    stride between touched bytes; must be positive
+ * wr:          nonzero to also write each touched byte back
+ *
+ * Returns 0 on success, -1 when there is nothing valid to touch
+ * (empty region or non-positive page_len).
+ */
+int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr)
+{
+	char *base = buf;
+	long off, last;
+	volatile char t;
+
+	assert(buf_len >= 0 && !!buf);
+
+	NXPRT(fprintf(stderr, "touch %p %p len 0x%lx wr=%d\n", buf,
+			(void *) (base + buf_len), buf_len, wr));
+
+	/* A non-positive stride would never advance the loop below */
+	if (buf_len <= 0 || buf == NULL || page_len <= 0)
+		return -1;
+
+	last = buf_len - 1;
+	off = 0;
+	do {
+		t = base[off];
+		if (wr)
+			base[off] = t;
+		off = off + page_len;
+	} while (off < last);
+
+	/* When buf_len is small or buf tail is in another page */
+	t = base[last];
+	if (wr)
+		base[last] = t;
+
+	return 0;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h
new file mode 100644
index 000000000..0db2d6485
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/copy-paste.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+/* From asm-compat.h */
+#define __stringify_in_c(...) #__VA_ARGS__
+#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
+
+/*
+ * Macros taken from arch/powerpc/include/asm/ppc-opcode.h and other
+ * header files.
+ */
+#define ___PPC_RA(a) (((a) & 0x1f) << 16)
+#define ___PPC_RB(b) (((b) & 0x1f) << 11)
+
+#define PPC_INST_COPY 0x7c20060c
+#define PPC_INST_PASTE 0x7c20070d
+
+#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \
+ ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_PASTE(a, b) stringify_in_c(.long PPC_INST_PASTE | \
+ ___PPC_RA(a) | ___PPC_RB(b))
+#define CR0_SHIFT 28
+#define CR0_MASK 0xF
+/*
+ * Copy/paste instructions:
+ *
+ * copy RA,RB
+ * Copy contents of address (RA) + effective_address(RB)
+ * to internal copy-buffer.
+ *
+ * paste RA,RB
+ * Paste contents of internal copy-buffer to the address
+ * (RA) + effective_address(RB)
+ */
+/*
+ * Issues the copy instruction (emitted as a raw opcode via PPC_COPY)
+ * with offset and crb as its two register operands.  Always returns 0.
+ */
+static inline int vas_copy(void *crb, int offset)
+{
+ /* "memory" clobber keeps the compiler from reordering memory
+ * accesses across the instruction
+ */
+ asm volatile(PPC_COPY(%0, %1)";"
+ :
+ : "b" (offset), "b" (crb)
+ : "memory");
+
+ return 0;
+}
+
+/*
+ * Issues the paste instruction (emitted as a raw opcode via PPC_PASTE)
+ * with offset and paste_address as its register operands, then reads
+ * the condition register with mfocrf.  Returns the 4-bit CR0 field,
+ * extracted with CR0_SHIFT and CR0_MASK.
+ */
+static inline int vas_paste(void *paste_address, int offset)
+{
+ __u32 cr;
+
+ cr = 0;
+ /* cr0 is listed as clobbered because paste updates it */
+ asm volatile(PPC_PASTE(%1, %2)";"
+ "mfocrf %0, 0x80;"
+ : "=r" (cr)
+ : "b" (offset), "b" (paste_address)
+ : "memory", "cr0");
+
+ return (cr >> CR0_SHIFT) & CR0_MASK;
+}
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/crb.h b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h
new file mode 100644
index 000000000..ab101085f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/crb.h
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __CRB_H
+#define __CRB_H
+#include <linux/types.h>
+#include "nx.h"
+
+/* CCW 842 CI/FC masks
+ * NX P8 workbook, section 4.3.1, figure 4-6
+ * "CI/FC Boundary by NX CT type"
+ */
+#define CCW_CI_842 (0x00003ff8)
+#define CCW_FC_842 (0x00000007)
+
+/* Chapter 6.5.8 Coprocessor-Completion Block (CCB) */
+
+#define CCB_VALUE (0x3fffffffffffffff)
+#define CCB_ADDRESS (0xfffffffffffffff8)
+#define CCB_CM (0x0000000000000007)
+#define CCB_CM0 (0x0000000000000004)
+#define CCB_CM12 (0x0000000000000003)
+
+#define CCB_CM0_ALL_COMPLETIONS (0x0)
+#define CCB_CM0_LAST_IN_CHAIN (0x4)
+#define CCB_CM12_STORE (0x0)
+#define CCB_CM12_INTERRUPT (0x1)
+
+#define CCB_SIZE (0x10)
+#define CCB_ALIGN CCB_SIZE
+
+/* Coprocessor-Completion Block: two big-endian 64-bit words, aligned
+ * to CCB_ALIGN bytes.
+ */
+struct coprocessor_completion_block {
+ __be64 value; /* presumably masked by CCB_VALUE - TODO confirm */
+ __be64 address; /* presumably split by CCB_ADDRESS / CCB_CM - TODO confirm */
+} __aligned(CCB_ALIGN);
+
+
+/* Chapter 6.5.7 Coprocessor-Status Block (CSB) */
+
+#define CSB_V (0x80)
+#define CSB_F (0x04)
+#define CSB_CH (0x03)
+#define CSB_CE_INCOMPLETE (0x80)
+#define CSB_CE_TERMINATION (0x40)
+#define CSB_CE_TPBC (0x20)
+
+#define CSB_CC_SUCCESS (0)
+#define CSB_CC_INVALID_ALIGN (1)
+#define CSB_CC_OPERAND_OVERLAP (2)
+#define CSB_CC_DATA_LENGTH (3)
+#define CSB_CC_TRANSLATION (5)
+#define CSB_CC_PROTECTION (6)
+#define CSB_CC_RD_EXTERNAL (7)
+#define CSB_CC_INVALID_OPERAND (8)
+#define CSB_CC_PRIVILEGE (9)
+#define CSB_CC_INTERNAL (10)
+#define CSB_CC_WR_EXTERNAL (12)
+#define CSB_CC_NOSPC (13)
+#define CSB_CC_EXCESSIVE_DDE (14)
+#define CSB_CC_WR_TRANSLATION (15)
+#define CSB_CC_WR_PROTECTION (16)
+#define CSB_CC_UNKNOWN_CODE (17)
+#define CSB_CC_ABORT (18)
+#define CSB_CC_TRANSPORT (20)
+#define CSB_CC_SEGMENTED_DDL (31)
+#define CSB_CC_PROGRESS_POINT (32)
+#define CSB_CC_DDE_OVERFLOW (33)
+#define CSB_CC_SESSION (34)
+#define CSB_CC_PROVISION (36)
+#define CSB_CC_CHAIN (37)
+#define CSB_CC_SEQUENCE (38)
+#define CSB_CC_HW (39)
+
+#define CSB_SIZE (0x10)
+#define CSB_ALIGN CSB_SIZE
+
+/* Coprocessor-Status Block, aligned to CSB_ALIGN bytes */
+struct coprocessor_status_block {
+ __u8 flags; /* presumably the CSB_V / CSB_F / CSB_CH bits - TODO confirm */
+ __u8 cs;
+ __u8 cc; /* presumably one of the CSB_CC_* codes - TODO confirm */
+ __u8 ce; /* presumably the CSB_CE_* bits - TODO confirm */
+ __be32 count; /* big-endian */
+ __be64 address; /* big-endian */
+} __aligned(CSB_ALIGN);
+
+
+/* Chapter 6.5.10 Data-Descriptor List (DDL)
+ * each list contains one or more Data-Descriptor Entries (DDE)
+ */
+
+#define DDE_P (0x8000)
+
+#define DDE_SIZE (0x10)
+#define DDE_ALIGN DDE_SIZE
+
+/* One Data-Descriptor Entry of a DDL, aligned to DDE_ALIGN bytes */
+struct data_descriptor_entry {
+ __be16 flags; /* presumably carries DDE_P - TODO confirm */
+ __u8 count;
+ __u8 index;
+ __be32 length; /* big-endian */
+ __be64 address; /* big-endian */
+} __aligned(DDE_ALIGN);
+
+
+/* Chapter 6.5.2 Coprocessor-Request Block (CRB) */
+
+#define CRB_SIZE (0x80)
+#define CRB_ALIGN (0x100) /* Errata: requires 256 alignment */
+
+
+/* Coprocessor Status Block field
+ * ADDRESS address of CSB
+ * C CCB is valid
+ * AT 0 = addrs are virtual, 1 = addrs are phys
+ * M enable perf monitor
+ */
+#define CRB_CSB_ADDRESS (0xfffffffffffffff0)
+#define CRB_CSB_C (0x0000000000000008)
+#define CRB_CSB_AT (0x0000000000000002)
+#define CRB_CSB_M (0x0000000000000001)
+
+/* Coprocessor-Request Block, aligned to CRB_ALIGN bytes.  It embeds
+ * the source and target descriptors, the completion block and the
+ * status block.
+ */
+struct coprocessor_request_block {
+ __be32 ccw; /* presumably built from the CCW_* masks - TODO confirm */
+ __be32 flags;
+ __be64 csb_addr; /* presumably split by the CRB_CSB_* masks - TODO confirm */
+
+ struct data_descriptor_entry source;
+ struct data_descriptor_entry target;
+
+ struct coprocessor_completion_block ccb;
+
+ __u8 reserved[48];
+
+ struct coprocessor_status_block csb;
+} __aligned(CRB_ALIGN);
+
+#define crb_csb_addr(c) __be64_to_cpu(c->csb_addr)
+#define crb_nx_fault_addr(c) __be64_to_cpu(c->stamp.nx.fault_storage_addr)
+#define crb_nx_flags(c) c->stamp.nx.flags
+#define crb_nx_fault_status(c) c->stamp.nx.fault_status
+#define crb_nx_pswid(c) c->stamp.nx.pswid
+
+
+/* RFC02167 Initiate Coprocessor Instructions document
+ * Chapter 8.2.1.1.1 RS
+ * Chapter 8.2.3 Coprocessor Directive
+ * Chapter 8.2.4 Execution
+ *
+ * The CCW must be converted to BE before passing to icswx()
+ */
+
+#define CCW_PS (0xff000000)
+#define CCW_CT (0x00ff0000)
+#define CCW_CD (0x0000ffff)
+#define CCW_CL (0x0000c000)
+
+#endif
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h
new file mode 100644
index 000000000..1abe23fc2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2020 IBM Corp.
+ *
+ */
+#ifndef _NX_H
+#define _NX_H
+
+#include <stdbool.h>
+
+#define NX_FUNC_COMP_842 1
+#define NX_FUNC_COMP_GZIP 2
+
+#ifndef __aligned
+#define __aligned(x) __attribute__((aligned(x)))
+#endif
+
+/* Options for an 842 request; presumably what nx_function() receives
+ * through its arg pointer - TODO confirm.
+ */
+struct nx842_func_args {
+ bool use_crc;
+ bool decompress; /* true decompress; false compress */
+ bool move_data;
+ int timeout; /* seconds */
+};
+
+/* A buffer pointer with its length; the type of the in and out
+ * arguments of nx_function().
+ */
+struct nxbuf_t {
+ int len; /* presumably the byte count of buf - TODO confirm */
+ char *buf;
+};
+
+/* @function should be EFT (aka 842), GZIP etc */
+void *nx_function_begin(int function, int pri);
+
+int nx_function(void *handle, struct nxbuf_t *in, struct nxbuf_t *out,
+ void *arg);
+
+int nx_function_end(void *handle);
+
+#endif /* _NX_H */
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h
new file mode 100644
index 000000000..16464e19c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/nx_dbg.h
@@ -0,0 +1,95 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2020 IBM Corporation
+ *
+ */
+
+#ifndef _NXU_DBG_H_
+#define _NXU_DBG_H_
+
+#include <sys/file.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <time.h>
+#include <pthread.h>
+
+extern FILE * nx_gzip_log;
+extern int nx_gzip_trace;
+extern unsigned int nx_gzip_inflate_impl;
+extern unsigned int nx_gzip_deflate_impl;
+extern unsigned int nx_gzip_inflate_flags;
+extern unsigned int nx_gzip_deflate_flags;
+
+extern int nx_dbg;
+pthread_mutex_t mutex_log;
+
+#define nx_gzip_trace_enabled() (nx_gzip_trace & 0x1)
+#define nx_gzip_hw_trace_enabled() (nx_gzip_trace & 0x2)
+#define nx_gzip_sw_trace_enabled() (nx_gzip_trace & 0x4)
+#define nx_gzip_gather_statistics() (nx_gzip_trace & 0x8)
+#define nx_gzip_per_stream_stat() (nx_gzip_trace & 0x10)
+
+/*
+ * Writes one timestamped, pid-tagged line to nx_gzip_log while holding
+ * mutex_log and an exclusive flock() on the log file.  Does nothing
+ * when no log file is open (nx_gzip_log == NULL) instead of
+ * dereferencing a NULL stream.
+ */
+#define prt(fmt, ...) do {						\
+	if (nx_gzip_log != NULL) {					\
+		time_t t; struct tm *m;					\
+		pthread_mutex_lock(&mutex_log);				\
+		flock(fileno(nx_gzip_log), LOCK_EX);			\
+		time(&t); m = localtime(&t);				\
+		fprintf(nx_gzip_log, "[%04d/%02d/%02d %02d:%02d:%02d] "	\
+			"pid %d: " fmt,					\
+			(int)m->tm_year + 1900, (int)m->tm_mon+1,	\
+			(int)m->tm_mday, (int)m->tm_hour,		\
+			(int)m->tm_min, (int)m->tm_sec,			\
+			(int)getpid(), ## __VA_ARGS__);			\
+		fflush(nx_gzip_log);					\
+		flock(fileno(nx_gzip_log), LOCK_UN);			\
+		pthread_mutex_unlock(&mutex_log);			\
+	}								\
+} while (0)
+
+/* Use in case of an error */
+#define prt_err(fmt, ...) do { if (nx_dbg >= 0) { \
+ prt("%s:%u: Error: "fmt, \
+ __FILE__, __LINE__, ## __VA_ARGS__); \
+}} while (0)
+
+/* Use in case of an warning */
+#define prt_warn(fmt, ...) do { if (nx_dbg >= 1) { \
+ prt("%s:%u: Warning: "fmt, \
+ __FILE__, __LINE__, ## __VA_ARGS__); \
+}} while (0)
+
+/* Informational printouts */
+#define prt_info(fmt, ...) do { if (nx_dbg >= 2) { \
+ prt("Info: "fmt, ## __VA_ARGS__); \
+}} while (0)
+
+/* Trace zlib wrapper code */
+#define prt_trace(fmt, ...) do { if (nx_gzip_trace_enabled()) { \
+ prt("### "fmt, ## __VA_ARGS__); \
+}} while (0)
+
+/* Trace statistics */
+#define prt_stat(fmt, ...) do { if (nx_gzip_gather_statistics()) { \
+ prt("### "fmt, ## __VA_ARGS__); \
+}} while (0)
+
+/* Trace zlib hardware implementation */
+#define hw_trace(fmt, ...) do { \
+ if (nx_gzip_hw_trace_enabled()) \
+ fprintf(nx_gzip_log, "hhh " fmt, ## __VA_ARGS__); \
+ } while (0)
+
+/* Trace zlib software implementation */
+#define sw_trace(fmt, ...) do { \
+ if (nx_gzip_sw_trace_enabled()) \
+ fprintf(nx_gzip_log, "sss " fmt, ## __VA_ARGS__); \
+ } while (0)
+
+
+/**
+ * str_to_num - Convert string into number and copy with endings like
+ * KiB for kilobyte
+ * MiB for megabyte
+ * GiB for gigabyte
+ */
+uint64_t str_to_num(char *str);
+void nx_lib_debug(int onoff);
+
+#endif /* _NXU_DBG_H_ */
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h
new file mode 100644
index 000000000..20a4e883e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/nxu.h
@@ -0,0 +1,650 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Hardware interface of the NX-GZIP compression accelerator
+ *
+ * Copyright (C) IBM Corporation, 2020
+ *
+ * Author: Bulent Abali <abali@us.ibm.com>
+ *
+ */
+
+#ifndef _NXU_H
+#define _NXU_H
+
+#include <stdint.h>
+#include <endian.h>
+#include "nx.h"
+
+/*
+ * deflate: number of literal/length (LLSZ) and distance (DSZ) symbols;
+ * out_lzcount[] in the CPB holds LLSZ+DSZ counters.
+ */
+#define LLSZ 286
+#define DSZ 30
+
+/*
+ * nx: DHTSZ is the number of quadwords in the in_dht/out_dht arrays and
+ * DHT_MAXSZ the size of the byte view in_dht_char[] (18 * 16 = 288).
+ */
+#define DHTSZ 18
+#define DHT_MAXSZ 288
+#define MAX_DDE_COUNT 256
+
+/* util: NXPRT(X) expands to X only when NXDBG is defined, else to nothing */
+#ifdef NXDBG
+#define NXPRT(X) X
+#else
+#define NXPRT(X)
+#endif
+
+/*
+ * NX_CLK(X) expands to X only when NXTIMER is defined. Without NXTIMER
+ * nx_get_time() and nx_get_freq() are the constant -1.
+ */
+#ifdef NXTIMER
+#include <sys/platform/ppc.h>
+#define NX_CLK(X) X
+#define nx_get_time() __ppc_get_timebase()
+#define nx_get_freq() __ppc_get_timebase_freq()
+#else
+#define NX_CLK(X)
+#define nx_get_time() (-1)
+#define nx_get_freq() (-1)
+#endif
+
+#define NX_MAX_FAULTS 500
+
+/*
+ * Definitions of acronyms used here. See
+ * P9 NX Gzip Accelerator User's Manual for details:
+ * https://github.com/libnxz/power-gzip/blob/develop/doc/power_nx_gzip_um.pdf
+ *
+ * adler/crc: 32 bit checksums appended to stream tail
+ * ce: completion extension
+ * cpb: coprocessor parameter block (metadata)
+ * crb: coprocessor request block (command)
+ * csb: coprocessor status block (status)
+ * dht: dynamic huffman table
+ * dde: data descriptor element (address, length)
+ * ddl: list of ddes
+ * dh/fh: dynamic and fixed huffman types
+ * fc: coprocessor function code
+ * histlen: history/dictionary length
+ * history: sliding window of up to 32KB of data
+ * lzcount: Deflate LZ symbol counts
+ * rembytecnt: remaining byte count
+ * sfbt: source final block type; last block's type during decomp
+ * spbc: source processed byte count
+ * subc: source unprocessed bit count
+ * tebc: target ending bit count; valid bits in the last byte
+ * tpbc: target processed byte count
+ * vas: virtual accelerator switch; the user mode interface
+ */
+
+/*
+ * One quadword (16 bytes), viewable as four 32-bit words or two 64-bit
+ * doublewords. __aligned() is not defined in this header; presumably it
+ * comes from "nx.h" - confirm.
+ */
+union nx_qw_t {
+ uint32_t word[4];
+ uint64_t dword[2];
+} __aligned(16);
+
+/*
+ * Note: NX registers with fewer than 32 bits are declared by
+ * convention as uint32_t variables in unions. If *_offset and *_mask
+ * are defined for a variable, then use get_ put_ macros to
+ * conveniently access the register fields for endian conversions.
+ */
+
+/*
+ * Describes one data buffer, or a list of buffers, for the coprocessor.
+ * dde_count is a bit field inside its 32-bit word: access it with
+ * getnn()/putnn() using dde_count_mask/dde_count_offset (8 bits ending at
+ * bit 23). ddebc and ddead are full-width and use the 32/64 bit accessors.
+ */
+struct nx_dde_t {
+ /* Data Descriptor Element, Section 6.4 */
+ union {
+ uint32_t dde_count;
+ /* When dde_count == 0 ddead is a pointer to a data buffer;
+ * ddebc is the buffer length in bytes.
+ * When dde_count > 0 dde is an indirect dde; ddead is a
+ * pointer to a contiguous list of direct ddes; ddebc is the
+ * total length of all data pointed to by the list of direct
+ * ddes. Note that only one level of indirection is permitted.
+ * See Section 6.4 of the user manual for additional details.
+ */
+ };
+ uint32_t ddebc; /* dde byte count */
+ uint64_t ddead; /* dde address */
+} __aligned(16);
+
+/*
+ * Status written back by the coprocessor. All members of the first union
+ * alias the same 32-bit word; each name selects the matching
+ * <name>_mask/<name>_offset pair when used with getnn()/putnn():
+ * v = 1 bit at bit 0, f = 1 bit at bit 6, cs = 8 bits ending at bit 15,
+ * cc = 8 bits ending at bit 23, ce = 8 bits ending at bit 31.
+ */
+struct nx_csb_t {
+ /* Coprocessor Status Block, Section 6.6 */
+ union {
+ uint32_t csb_v;
+ /* Valid bit. v must be set to 0 by the program
+ * before submitting the coprocessor command.
+ * Software can poll for the v bit
+ */
+
+ uint32_t csb_f;
+ /* 16B CSB size. Written to 0 by DMA when it writes the CPB */
+
+ uint32_t csb_cs;
+ /* cs completion sequence; unused */
+
+ uint32_t csb_cc;
+ /* cc completion code; cc != 0 exception occurred */
+
+ uint32_t csb_ce;
+ /* ce completion extension */
+
+ };
+ uint32_t tpbc;
+ /* target processed byte count TPBC */
+
+ uint64_t fsaddr;
+ /* Section 6.12.1 CSB NonZero error summary. FSA Failing storage
+ * address. Address where error occurred. When available, written
+ * to A field of CSB
+ */
+} __aligned(16);
+
+/*
+ * Three reserved words followed by one word that is either reserved or
+ * holds cm. ccb_cm is accessed with getnn()/putnn() through
+ * ccb_cm_mask/ccb_cm_offset (3 bits ending at bit 31).
+ */
+struct nx_ccb_t {
+ /* Coprocessor Completion Block, Section 6.7 */
+
+ uint32_t reserved[3];
+ union {
+ /* When crb.c==0 (no ccb defined) it is reserved;
+ * When crb.c==1 (ccb defined) it is cm
+ */
+
+ uint32_t ccb_cm;
+ /* Signal interrupt of crb.c==1 and cm==1 */
+
+ uint32_t word;
+ /* generic access to the 32bit word */
+ };
+} __aligned(16);
+
+/*
+ * Four 32-bit words (one quadword). The bit fields are accessed with
+ * getnn()/putnn(): vas_buf_num = 6 bits ending at bit 5, send_wc_id and
+ * recv_wc_id = 16 bits ending at bit 31, vas_invalid = 1 bit at bit 31.
+ */
+struct vas_stamped_crb_t {
+ /*
+ * CRB operand of the paste coprocessor instruction is stamped
+ * in quadword 4 with the information shown here as it is written
+ * into the receive FIFO of the coprocessor
+ */
+
+ union {
+ uint32_t vas_buf_num;
+ /* Verification only vas buffer number which correlates to
+ * the low order bits of the atag in the paste command
+ */
+
+ uint32_t send_wc_id;
+ /* Pointer to Send Window Context that provides for NX address
+ * translation information, such as MSR and LPCR bits, job
+ * completion interrupt RA, PSWID, and job utilization counter.
+ */
+
+ };
+ union {
+ uint32_t recv_wc_id;
+ /* Pointer to Receive Window Context. NX uses this to return
+ * credits to a Receive FIFO as entries are dequeued.
+ */
+
+ };
+ uint32_t reserved2;
+ union {
+ uint32_t vas_invalid;
+ /* Invalid bit. If this bit is 1 the CRB is discarded by
+ * NX upon fetching from the receive FIFO. If this bit is 0
+ * the CRB is processed normally. The bit is stamped to 0
+ * by VAS and may be written to 1 by hypervisor while
+ * the CRB is in the receive FIFO (in memory).
+ */
+
+ };
+};
+
+/*
+ * One quadword: 64-bit fsa, one word carrying the nxsf_t (1 bit at bit 23)
+ * and nxsf_fs (8 bits ending at bit 31) fields, and the pswid word.
+ * NOTE(review): the field meanings are not spelled out here; fsa is
+ * presumably the failing storage address - confirm against the manual.
+ */
+struct nx_stamped_fault_crb_t {
+ /*
+ * A CRB that has a translation fault is stamped by NX in quadword 4
+ * and pasted to the Fault Send Window in VAS.
+ */
+ uint64_t fsa;
+ union {
+ uint32_t nxsf_t;
+ uint32_t nxsf_fs;
+ };
+ uint32_t pswid;
+};
+
+/* Quadword 4 of a CRB as stamped either by VAS or, on a fault, by NX */
+union stamped_crb_t {
+ struct vas_stamped_crb_t vas;
+ struct nx_stamped_fault_crb_t nx;
+};
+
+/*
+ * Parameter block shared with the accelerator. The first anonymous struct
+ * is the input area, quadwords 0-23; the second, declared volatile, is the
+ * output area starting at quadword 24. Members that share a union are bit
+ * fields of the same 32-bit word and must be read/written with
+ * getnn()/putnn() and their <name>_mask/<name>_offset definitions; the bit
+ * ranges in the comments are positions within the containing quadword.
+ */
+struct nx_gzip_cpb_t {
+ /*
+ * Coprocessor Parameter Block In/Out are used to pass metadata
+ * to/from accelerator. Tables 6.5 and 6.6 of the user manual.
+ */
+
+ /* CPBInput */
+
+ struct {
+ union {
+ union nx_qw_t qw0;
+ struct {
+ uint32_t in_adler; /* bits 0:31 */
+ uint32_t in_crc; /* bits 32:63 */
+ union {
+ uint32_t in_histlen; /* bits 64:75 */
+ uint32_t in_subc; /* bits 93:95 */
+ };
+ union {
+ /* bits 108:111 */
+ uint32_t in_sfbt;
+ /* bits 112:127 */
+ uint32_t in_rembytecnt;
+ /* bits 116:127 */
+ uint32_t in_dhtlen;
+ };
+ };
+ };
+ union {
+ union nx_qw_t in_dht[DHTSZ]; /* qw[1:18] */
+ char in_dht_char[DHT_MAXSZ]; /* byte access */
+ };
+ union nx_qw_t reserved[5]; /* qw[19:23] */
+ };
+
+ /* CPBOutput */
+
+ volatile struct {
+ union {
+ union nx_qw_t qw24;
+ struct {
+ uint32_t out_adler; /* bits 0:31 qw[24] */
+ uint32_t out_crc; /* bits 32:63 qw[24] */
+ union {
+ /* bits 77:79 qw[24] */
+ uint32_t out_tebc;
+ /* bits 80:95 qw[24] */
+ uint32_t out_subc;
+ };
+ union {
+ /* bits 108:111 qw[24] */
+ uint32_t out_sfbt;
+ /* bits 112:127 qw[24] */
+ uint32_t out_rembytecnt;
+ /* bits 116:127 qw[24] */
+ uint32_t out_dhtlen;
+ };
+ };
+ };
+ /* the members below are alternative views of qw[25:103] */
+ union {
+ union nx_qw_t qw25[79]; /* qw[25:103] */
+ /* qw[25] compress no lzcounts or wrap */
+ uint32_t out_spbc_comp_wrap;
+ uint32_t out_spbc_wrap; /* qw[25] wrap */
+ /* qw[25] compress no lzcounts */
+ uint32_t out_spbc_comp;
+ /* 286 LL and 30 D symbol counts */
+ uint32_t out_lzcount[LLSZ+DSZ];
+ struct {
+ union nx_qw_t out_dht[DHTSZ]; /* qw[25:42] */
+ /* qw[43] decompress */
+ uint32_t out_spbc_decomp;
+ };
+ };
+ /* qw[104] compress with lzcounts */
+ uint32_t out_spbc_comp_with_count;
+ };
+} __aligned(128);
+
+/*
+ * Request block (command) for the gzip engine.
+ * gzip_fc is 8 bits ending at bit 31 of word 0. crb_c (bit 28) and crb_at
+ * (bit 30) live in the low-order word of csb_address, i.e. within the four
+ * low address bits that csb_address_mask clears. The ccb, the stamp area
+ * and the csb are declared volatile.
+ */
+struct nx_gzip_crb_t {
+ union { /* byte[0:3] */
+ uint32_t gzip_fc; /* bits[24-31] */
+ };
+ uint32_t reserved1; /* byte[4:7] */
+ union {
+ uint64_t csb_address; /* byte[8:15] */
+ struct {
+ uint32_t reserved2;
+ union {
+ uint32_t crb_c;
+ /* c==0 no ccb defined */
+
+ uint32_t crb_at;
+ /* at==0 address type is ignored;
+ * all addrs effective assumed.
+ */
+
+ };
+ };
+ };
+ struct nx_dde_t source_dde; /* byte[16:31] */
+ struct nx_dde_t target_dde; /* byte[32:47] */
+ volatile struct nx_ccb_t ccb; /* byte[48:63] */
+ volatile union {
+ /* byte[64:239] shift csb by 128 bytes out of the crb; csb was
+ * in crb earlier; JReilly says csb written with partial inject
+ */
+ union nx_qw_t reserved64[11];
+ union stamped_crb_t stamp; /* byte[64:79] */
+ };
+ volatile struct nx_csb_t csb;
+} __aligned(128);
+
+/*
+ * A request block followed by its parameter block, declared with
+ * __aligned(2048). This is the object handed to nxu_submit_job().
+ */
+struct nx_gzip_crb_cpb_t {
+ struct nx_gzip_crb_t crb;
+ struct nx_gzip_cpb_t cpb;
+} __aligned(2048);
+
+
+/*
+ * NX hardware convention has the msb bit on the left numbered 0.
+ * The defines below have *_offset defined as the right most bit
+ * position of a field. x of size_mask(x) is the field width in bits.
+ *
+ * Example: csb_cc is 8 bits wide with offset 23, so it occupies bits
+ * 16:23 and getnn() shifts the word right by 31-23 = 8 before masking.
+ */
+
+/* mask of the x low-order bits; x must be smaller than the width of 1U */
+#define size_mask(x) ((1U<<(x))-1)
+
+/*
+ * Offsets and Widths within the containing 32 bits of the various NX
+ * gzip hardware registers. Use the getnn/putnn macros to access
+ * these regs
+ */
+
+#define dde_count_mask size_mask(8)
+#define dde_count_offset 23
+
+/* CSB */
+
+#define csb_v_mask size_mask(1)
+#define csb_v_offset 0
+#define csb_f_mask size_mask(1)
+#define csb_f_offset 6
+#define csb_cs_mask size_mask(8)
+#define csb_cs_offset 15
+#define csb_cc_mask size_mask(8)
+#define csb_cc_offset 23
+#define csb_ce_mask size_mask(8)
+#define csb_ce_offset 31
+
+/* CCB */
+
+#define ccb_cm_mask size_mask(3)
+#define ccb_cm_offset 31
+
+/* VAS stamped CRB fields */
+
+#define vas_buf_num_mask size_mask(6)
+#define vas_buf_num_offset 5
+#define send_wc_id_mask size_mask(16)
+#define send_wc_id_offset 31
+#define recv_wc_id_mask size_mask(16)
+#define recv_wc_id_offset 31
+#define vas_invalid_mask size_mask(1)
+#define vas_invalid_offset 31
+
+/* NX stamped fault CRB fields */
+
+#define nxsf_t_mask size_mask(1)
+#define nxsf_t_offset 23
+#define nxsf_fs_mask size_mask(8)
+#define nxsf_fs_offset 31
+
+/* CPB input */
+
+#define in_histlen_mask size_mask(12)
+#define in_histlen_offset 11
+#define in_dhtlen_mask size_mask(12)
+#define in_dhtlen_offset 31
+#define in_subc_mask size_mask(3)
+#define in_subc_offset 31
+#define in_sfbt_mask size_mask(4)
+#define in_sfbt_offset 15
+#define in_rembytecnt_mask size_mask(16)
+#define in_rembytecnt_offset 31
+
+/* CPB output */
+
+#define out_tebc_mask size_mask(3)
+#define out_tebc_offset 15
+#define out_subc_mask size_mask(16)
+#define out_subc_offset 31
+#define out_sfbt_mask size_mask(4)
+#define out_sfbt_offset 15
+#define out_rembytecnt_mask size_mask(16)
+#define out_rembytecnt_offset 31
+#define out_dhtlen_mask size_mask(12)
+#define out_dhtlen_offset 31
+
+/* CRB */
+
+#define gzip_fc_mask size_mask(8)
+#define gzip_fc_offset 31
+#define crb_c_mask size_mask(1)
+#define crb_c_offset 28
+#define crb_at_mask size_mask(1)
+#define crb_at_offset 30
+#define csb_address_mask ~(15UL) /* mask off bottom 4b */
+
+/*
+ * Access macros for the registers. Do not access registers directly
+ * because of the endian conversion. P9 processor may run either as
+ * Little or Big endian. However the NX coprocessor regs are always
+ * big endian.
+ * Use the 32 and 64b macros to access respective
+ * register sizes.
+ * Use nn forms for the register fields shorter than 32 bits.
+ *
+ * ST is a structure (plain forms) or a pointer to one ("p" forms) and REG
+ * is the member name. The nn forms additionally require REG##_mask and
+ * REG##_offset to be defined. ST is parenthesised in every expansion so
+ * that any expression yielding the structure (e.g. *ptr or array[i]) can
+ * be passed, not only a plain identifier.
+ */
+
+/* read a bit field: convert from big endian, shift it down, mask it */
+#define getnn(ST, REG)		((be32toh((ST).REG) >> (31-REG##_offset)) \
+				 & REG##_mask)
+#define getpnn(ST, REG)		((be32toh((ST)->REG) >> (31-REG##_offset)) \
+				 & REG##_mask)
+#define get32(ST, REG)		(be32toh((ST).REG))
+#define getp32(ST, REG)		(be32toh((ST)->REG))
+#define get64(ST, REG)		(be64toh((ST).REG))
+#define getp64(ST, REG)		(be64toh((ST)->REG))
+
+#define unget32(ST, REG)	(get32(ST, REG) & ~((REG##_mask) \
+				 << (31-REG##_offset)))
+/* get 32bits less the REG field */
+
+#define ungetp32(ST, REG)	(getp32(ST, REG) & ~((REG##_mask) \
+				 << (31-REG##_offset)))
+/* get 32bits less the REG field */
+
+#define clear_regs(ST)		memset((void *)(&(ST)), 0, sizeof(ST))
+#define clear_dde(ST)		do { (ST).dde_count = (ST).ddebc = 0; \
+				     (ST).ddead = 0; \
+				} while (0)
+#define clearp_dde(ST)		do { (ST)->dde_count = (ST)->ddebc = 0; \
+				     (ST)->ddead = 0; \
+				} while (0)
+#define clear_struct(ST)	memset((void *)(&(ST)), 0, sizeof(ST))
+
+/*
+ * write a bit field: keep the other bits of the word (unget32), insert
+ * the masked and shifted value, store the result back as big endian
+ */
+#define putnn(ST, REG, X)	((ST).REG = htobe32(unget32(ST, REG) | (((X) \
+				 & REG##_mask) << (31-REG##_offset))))
+#define putpnn(ST, REG, X)	((ST)->REG = htobe32(ungetp32(ST, REG) \
+				 | (((X) & REG##_mask) << (31-REG##_offset))))
+
+#define put32(ST, REG, X)	((ST).REG = htobe32(X))
+#define putp32(ST, REG, X)	((ST)->REG = htobe32(X))
+#define put64(ST, REG, X)	((ST).REG = htobe64(X))
+#define putp64(ST, REG, X)	((ST)->REG = htobe64(X))
+
+/*
+ * Completion extension ce(0) ce(1) ce(2). Bits ce(3-7)
+ * unused. Section 6.6 Figure 6.7.
+ */
+
+/* the whole 8-bit ce field of a CSB */
+#define get_csb_ce(ST) ((uint32_t)getnn(ST, csb_ce))
+/* the three most significant ce bits, ce(0..2), as a value 0..7 */
+#define get_csb_ce_ms3b(ST) (get_csb_ce(ST) >> 5)
+#define put_csb_ce_ms3b(ST, X) putnn(ST, csb_ce, ((uint32_t)(X) << 5))
+
+/*
+ * Flag bits; presumably meant for the 3-bit value returned by
+ * get_csb_ce_ms3b(), where ce(0) is the 0x4 bit - confirm against callers.
+ */
+#define CSB_CE_PARTIAL 0x4
+#define CSB_CE_TERMINATE 0x2
+#define CSB_CE_TPBC_VALID 0x1
+
+#define csb_ce_termination(X) (!!((X) & CSB_CE_TERMINATE))
+/* termination, output buffers may be modified, SPBC/TPBC invalid Fig.6-7 */
+
+#define csb_ce_check_completion(X) (!csb_ce_termination(X))
+/* if not terminated then check full or partial completion */
+
+#define csb_ce_partial_completion(X) (!!((X) & CSB_CE_PARTIAL))
+#define csb_ce_full_completion(X) (!csb_ce_partial_completion(X))
+#define csb_ce_tpbc_valid(X) (!!((X) & CSB_CE_TPBC_VALID))
+/* TPBC indicates successfully stored data count */
+
+#define csb_ce_default_err(X) csb_ce_termination(X)
+/* most error CEs have CE(0)=0 and CE(1)=1 */
+
+#define csb_ce_cc3_partial(X) csb_ce_partial_completion(X)
+/* some CC=3 are partially completed, Table 6-8 */
+
+/*
+ * True (1) when neither the PARTIAL nor the TERMINATE bit is set in X.
+ * The bitwise AND must be parenthesised: == binds tighter than &, so
+ * without the inner parentheses the test collapses to (X) & 0.
+ */
+#define csb_ce_cc64(X)	(((X) & (CSB_CE_PARTIAL \
+				 | CSB_CE_TERMINATE)) == 0)
+/* Compression: when TPBC>SPBC then CC=64 Table 6-8; target didn't
+ * compress smaller than source.
+ */
+
+/*
+ * Decompress SFBT combinations Tables 5-3, 6-4, 6-6
+ *
+ * Values for the 4-bit in_sfbt/out_sfbt fields (source final block type).
+ * NOTE(review): the names suggest BFINAL is a flag bit that is combined
+ * with one of the block-type values below - confirm against the tables.
+ */
+
+#define SFBT_BFINAL 0x1
+#define SFBT_LIT 0x4
+#define SFBT_FHT 0x5
+#define SFBT_DHT 0x6
+#define SFBT_HDR 0x7
+
+/*
+ * NX gzip function codes. Table 6.2.
+ * Bits 0:4 are the FC. Bit 5 is used by the DMA controller to
+ * select one of the two Byte Count Limits.
+ */
+
+#define GZIP_FC_LIMIT_MASK 0x01
+#define GZIP_FC_COMPRESS_FHT 0x00
+#define GZIP_FC_COMPRESS_DHT 0x02
+#define GZIP_FC_COMPRESS_FHT_COUNT 0x04
+#define GZIP_FC_COMPRESS_DHT_COUNT 0x06
+#define GZIP_FC_COMPRESS_RESUME_FHT 0x08
+#define GZIP_FC_COMPRESS_RESUME_DHT 0x0a
+#define GZIP_FC_COMPRESS_RESUME_FHT_COUNT 0x0c
+#define GZIP_FC_COMPRESS_RESUME_DHT_COUNT 0x0e
+#define GZIP_FC_DECOMPRESS 0x10
+#define GZIP_FC_DECOMPRESS_SINGLE_BLK_N_SUSPEND 0x12
+#define GZIP_FC_DECOMPRESS_RESUME 0x14
+#define GZIP_FC_DECOMPRESS_RESUME_SINGLE_BLK_N_SUSPEND 0x16
+#define GZIP_FC_WRAP 0x1e
+
+/*
+ * All compress codes above are below 0x10, so a clear 0x10 bit identifies
+ * them; every decompress code and GZIP_FC_WRAP has that bit set.
+ */
+#define fc_is_compress(fc) (((fc) & 0x10) == 0)
+/* the *_COUNT compress codes are exactly those with the 0x4 bit set */
+#define fc_has_count(fc) (fc_is_compress(fc) && (((fc) & 0x4) != 0))
+
+/*
+ * CSB.CC Error codes
+ *
+ * csb_cc is an 8-bit field, so real codes are 0..255; ERR_NX_END (256) is
+ * one past the largest possible code.
+ */
+
+#define ERR_NX_OK 0
+#define ERR_NX_ALIGNMENT 1
+#define ERR_NX_OPOVERLAP 2
+#define ERR_NX_DATA_LENGTH 3
+#define ERR_NX_TRANSLATION 5
+#define ERR_NX_PROTECTION 6
+#define ERR_NX_EXTERNAL_UE7 7
+#define ERR_NX_INVALID_OP 8
+#define ERR_NX_PRIVILEGE 9
+#define ERR_NX_INTERNAL_UE 10
+#define ERR_NX_EXTERN_UE_WR 12
+#define ERR_NX_TARGET_SPACE 13
+#define ERR_NX_EXCESSIVE_DDE 14
+#define ERR_NX_TRANSL_WR 15
+#define ERR_NX_PROTECT_WR 16
+#define ERR_NX_SUBFUNCTION 17
+#define ERR_NX_FUNC_ABORT 18
+#define ERR_NX_BYTE_MAX 19
+#define ERR_NX_CORRUPT_CRB 20
+#define ERR_NX_INVALID_CRB 21
+#define ERR_NX_INVALID_DDE 30
+#define ERR_NX_SEGMENTED_DDL 31
+#define ERR_NX_DDE_OVERFLOW 33
+#define ERR_NX_TPBC_GT_SPBC 64
+#define ERR_NX_MISSING_CODE 66
+#define ERR_NX_INVALID_DIST 67
+#define ERR_NX_INVALID_DHT 68
+#define ERR_NX_EXTERNAL_UE90 90
+#define ERR_NX_WDOG_TIMER 224
+#define ERR_NX_AT_FAULT 250
+#define ERR_NX_INTR_SERVER 252
+#define ERR_NX_UE253 253
+#define ERR_NX_NO_HW 254
+#define ERR_NX_HUNG_OP 255
+#define ERR_NX_END 256
+
+/* initial values for non-resume operations */
+#define INIT_CRC 0 /* crc32(0L, Z_NULL, 0) */
+#define INIT_ADLER 1 /* adler32(0L, Z_NULL, 0) adler is initialized to 1 */
+
+/* prototypes */
+int nxu_submit_job(struct nx_gzip_crb_cpb_t *c, void *handle);
+
+/*
+ * NOTE(review): siginfo_t comes from <signal.h>, which this header does
+ * not include itself; it presumably relies on "nx.h" or on the includer -
+ * confirm.
+ */
+extern void nxu_sigsegv_handler(int sig, siginfo_t *info, void *ctx);
+extern int nxu_touch_pages(void *buf, long buf_len, long page_len, int wr);
+
+/* caller supplies a print buffer 4*sizeof(crb) */
+
+char *nx_crb_str(struct nx_gzip_crb_t *crb, char *prbuf);
+char *nx_cpb_str(struct nx_gzip_cpb_t *cpb, char *prbuf);
+char *nx_prt_hex(void *cp, int sz, char *prbuf);
+char *nx_lzcount_str(struct nx_gzip_cpb_t *cpb, char *prbuf);
+char *nx_strerror(int e);
+
+/* simulator entry points, only declared when NX_SIM is defined */
+#ifdef NX_SIM
+#include <stdio.h>
+int nx_sim_init(void *ctx);
+int nx_sim_end(void *ctx);
+int nxu_run_sim_job(struct nx_gzip_crb_cpb_t *c, void *ctx);
+#endif /* NX_SIM */
+
+/* Deflate stream manipulation */
+
+/*
+ * Set / clear the least significant bit of the lvalue x.
+ * x is not parenthesised in the expansion, so pass a simple lvalue.
+ */
+#define set_final_bit(x) (x |= (unsigned char)1)
+#define clr_final_bit(x) (x &= ~(unsigned char)1)
+
+/*
+ * Writes two bytes at p: p[0] = 2 | (b & 1) and p[1] = 0.
+ */
+#define append_empty_fh_blk(p, b) do { *(p) = (2 | (1&(b))); *((p)+1) = 0; \
+ } while (0)
+/* append 10 bits 0000001b 00...... ;
+ * assumes appending starts on a byte boundary; b is the final bit.
+ */
+
+
+#ifdef NX_842
+
+/* 842 Engine */
+
+/*
+ * Request block for the 842 engine. Same layout as nx_gzip_crb_t up to
+ * byte 63, but with only three reserved quadwords before the csb and
+ * without the volatile qualifiers.
+ */
+struct nx_eft_crb_t {
+ union { /* byte[0:3] */
+ uint32_t eft_fc; /* bits[29-31] */
+ };
+ uint32_t reserved1; /* byte[4:7] */
+ union {
+ uint64_t csb_address; /* byte[8:15] */
+ struct {
+ uint32_t reserved2;
+ union {
+ uint32_t crb_c;
+ /* c==0 no ccb defined */
+
+ uint32_t crb_at;
+ /* at==0 address type is ignored;
+ * all addrs effective assumed.
+ */
+
+ };
+ };
+ };
+ struct nx_dde_t source_dde; /* byte[16:31] */
+ struct nx_dde_t target_dde; /* byte[32:47] */
+ struct nx_ccb_t ccb; /* byte[48:63] */
+ union {
+ union nx_qw_t reserved64[3]; /* byte[64:111], 3 x 16 bytes */
+ };
+ struct nx_csb_t csb;
+} __aligned(128);
+
+/*
+ * 842 CRB
+ *
+ * NOTE(review): getnn()/putnn() paste the member name to form
+ * <member>_mask and <member>_offset, i.e. eft_fc_mask/eft_fc_offset for
+ * the eft_fc member. Only the upper-case spellings are defined here, so
+ * those macros cannot be used on eft_fc as it stands - confirm whether
+ * that is intended.
+ */
+
+#define EFT_FC_MASK size_mask(3)
+#define EFT_FC_OFFSET 31
+#define EFT_FC_COMPRESS 0x0
+#define EFT_FC_COMPRESS_WITH_CRC 0x1
+#define EFT_FC_DECOMPRESS 0x2
+#define EFT_FC_DECOMPRESS_WITH_CRC 0x3
+#define EFT_FC_BLK_DATA_MOVE 0x4
+#endif /* NX_842 */
+
+#endif /* _NXU_H */
diff --git a/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h
new file mode 120000
index 000000000..77fb4c723
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/include/vas-api.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/uapi/asm/vas-api.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh
new file mode 100755
index 000000000..c7b46c5fd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/nx-gzip/nx-gzip-test.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+# The test needs write access to the NX-GZIP device node. Without it,
+# report the test as skipped (exit status 4) rather than failed.
+if ! [[ -w /dev/crypto/nx-gzip ]]; then
+	echo "Can't access /dev/crypto/nx-gzip, skipping"
+	echo "skip: $0"
+	exit 4
+fi
+
+# From here on any failing command aborts the script.
+set -e
+
+# Remove every temporary file left behind in the current directory.
+cleanup() {
+	rm -f nx-tempfile*
+}
+
+# Run the cleanup on every exit path of the main shell.
+trap cleanup EXIT
+
+# Run the compress and decompress helpers on one random file per size.
+#   $1 - tag used in the temporary file name, so that concurrent
+#        invocations each work on their own file
+# Relies on the script-wide 'set -e': the first failing command ends the
+# (sub)shell running this function with a non-zero status.
+function test_sizes
+{
+	local n=$1
+	local fname="nx-tempfile.$n"
+	local size	# keep the loop variable out of the global scope
+
+	for size in 4K 64K 1M 64M
+	do
+		echo "Testing $size ($n) ..."
+		dd if=/dev/urandom of="$fname" bs="$size" count=1
+		./gzfht_test "$fname"
+		# gzfht_test is expected to have written ${fname}.nx.gz
+		./gunz_test "${fname}.nx.gz"
+	done
+}
+
+echo "Doing basic test of different sizes ..."
+test_sizes 0
+
+echo "Running tests in parallel ..."
+pids=()
+for i in {1..16}
+do
+	test_sizes "$i" &
+	pids+=("$!")
+done
+
+# A bare 'wait' always returns 0 and 'set -e' does not see failures of
+# background jobs, so reap every job by pid and record any failure.
+failed=0
+for pid in "${pids[@]}"
+do
+	wait "$pid" || failed=1
+done
+
+if [[ $failed -ne 0 ]]; then
+	echo "FAIL: at least one parallel test failed"
+	exit 1
+fi
+
+echo "OK"
+
+exit 0
diff --git a/tools/testing/selftests/powerpc/pmu/.gitignore b/tools/testing/selftests/powerpc/pmu/.gitignore
new file mode 100644
index 000000000..f69b1e264
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+count_instructions
+l3_bank_test
+per_event_excludes
+count_stcx_fail
diff --git a/tools/testing/selftests/powerpc/pmu/Makefile b/tools/testing/selftests/powerpc/pmu/Makefile
new file mode 100644
index 000000000..904672fb7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/Makefile
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+# First rule in the file: a bare "make" here builds from the parent directory.
+noarg:
+ $(MAKE) -C ../
+
+TEST_GEN_PROGS := count_instructions count_stcx_fail l3_bank_test per_event_excludes
+# Helper sources every test program depends on
+EXTRA_SOURCES := ../harness.c event.c lib.c ../utils.c
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# The ebb subdirectory has its own Makefile and is built by the ebb rule below
+all: $(TEST_GEN_PROGS) ebb
+
+$(TEST_GEN_PROGS): $(EXTRA_SOURCES)
+
+# loop.S can only be built 64-bit
+$(OUTPUT)/count_instructions: CFLAGS += -m64
+$(OUTPUT)/count_instructions: loop.S count_instructions.c $(EXTRA_SOURCES)
+
+$(OUTPUT)/count_stcx_fail: CFLAGS += -m64
+$(OUTPUT)/count_stcx_fail: loop.S $(EXTRA_SOURCES)
+
+
+$(OUTPUT)/per_event_excludes: ../utils.c
+
+# Extend the run/emit/install rules so that each one, after doing its
+# default work, also recurses into the ebb subdirectory.
+DEFAULT_RUN_TESTS := $(RUN_TESTS)
+override define RUN_TESTS
+ $(DEFAULT_RUN_TESTS)
+ TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET run_tests
+endef
+
+DEFAULT_EMIT_TESTS := $(EMIT_TESTS)
+override define EMIT_TESTS
+ $(DEFAULT_EMIT_TESTS)
+ TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -s -C $$TARGET emit_tests
+endef
+
+DEFAULT_INSTALL_RULE := $(INSTALL_RULE)
+override define INSTALL_RULE
+ $(DEFAULT_INSTALL_RULE)
+ TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET install
+endef
+
+# Remove the local programs and loop.o, then clean the ebb subdirectory
+clean:
+ $(RM) $(TEST_GEN_PROGS) $(OUTPUT)/loop.o
+ TARGET=ebb; BUILD_TARGET=$$OUTPUT/$$TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -C $$TARGET clean
+
+# Build the ebb tests into $(OUTPUT)/ebb; -k keeps going after a failure
+ebb:
+ TARGET=$@; BUILD_TARGET=$$OUTPUT/$$TARGET; mkdir -p $$BUILD_TARGET; $(MAKE) OUTPUT=$$BUILD_TARGET -k -C $$TARGET all
+
+.PHONY: all run_tests clean ebb
diff --git a/tools/testing/selftests/powerpc/pmu/count_instructions.c b/tools/testing/selftests/powerpc/pmu/count_instructions.c
new file mode 100644
index 000000000..a3984ef1e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/count_instructions.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/prctl.h>
+
+#include "event.h"
+#include "utils.h"
+#include "lib.h"
+
+extern void thirty_two_instruction_loop(u64 loops);
+
+/*
+ * Prepare @e as a PERF_TYPE_HARDWARE event for @config, labelled @name.
+ * The event starts out disabled and does not count kernel, hypervisor or
+ * idle activity.
+ */
+static void setup_event(struct event *e, u64 config, char *name)
+{
+	event_init_opts(e, config, PERF_TYPE_HARDWARE, name);
+
+	/* restrict counting first ... */
+	e->attr.exclude_idle = 1;
+	e->attr.exclude_hv = 1;
+	e->attr.exclude_kernel = 1;
+
+	/* ... and leave the event switched off until explicitly enabled */
+	e->attr.disabled = 1;
+}
+
+/*
+ * Run the instruction loop with the events enabled and compare the
+ * instruction count in events[0] against @instructions + @overhead.
+ *
+ * @events:       events[0] is the instruction counter, events[1] is read,
+ *                reported and reset alongside it
+ * @instructions: number of loop instructions to execute
+ * @overhead:     instructions expected on top of the loop itself
+ * @report:       print the events and the comparison when true
+ *
+ * Returns 0 when the count is within tolerance, -1 otherwise. The value
+ * read into events[0].result.value is left in place for the caller.
+ */
+static int do_count_loop(struct event *events, u64 instructions,
+			 u64 overhead, bool report)
+{
+	s64 difference, expected;
+	double percentage;
+
+	prctl(PR_TASK_PERF_EVENTS_ENABLE);
+
+	/* The loop body is 32 instructions, hence the division by 32 */
+	thirty_two_instruction_loop(instructions >> 5);
+
+	prctl(PR_TASK_PERF_EVENTS_DISABLE);
+
+	event_read(&events[0]);
+	event_read(&events[1]);
+
+	expected = instructions + overhead;
+	difference = events[0].result.value - expected;
+	percentage = (double)difference / events[0].result.value * 100;
+
+	if (report) {
+		event_report(&events[0]);
+		event_report(&events[1]);
+
+		printf("Looped for %llu instructions, overhead %llu\n", instructions, overhead);
+		/* expected is signed, like difference below */
+		printf("Expected %lld\n", expected);
+		printf("Actual   %llu\n", events[0].result.value);
+		printf("Delta    %lld, %f%%\n", difference, percentage);
+	}
+
+	event_reset(&events[0]);
+	event_reset(&events[1]);
+
+	if (difference < 0)
+		difference = -difference;
+
+	/*
+	 * A count of zero cannot be right since code did run, and it would
+	 * be the divisor of the integer division below.
+	 */
+	if (!events[0].result.value)
+		return -1;
+
+	/* Tolerate a difference below 0.0001 % */
+	difference *= 10000 * 100;
+	if (difference / events[0].result.value)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Measure the instruction cost of the counting harness itself: run the
+ * loop with zero instructions once, then 100 more times, and keep the
+ * smallest instruction count seen.
+ */
+static u64 determine_overhead(struct event *events)
+{
+	u64 lowest, sample;
+	int remaining;
+
+	do_count_loop(events, 0, 0, false);
+	lowest = events[0].result.value;
+
+	for (remaining = 100; remaining > 0; remaining--) {
+		do_count_loop(events, 0, 0, false);
+		sample = events[0].result.value;
+		if (sample >= lowest)
+			continue;
+
+		printf("Replacing overhead %llu with %llu\n", lowest, sample);
+		lowest = sample;
+	}
+
+	return lowest;
+}
+
+/*
+ * Open an instruction counter with a cycle counter grouped under it,
+ * determine the harness overhead, then check the instruction count for
+ * loops of increasing length. Returns 0 on success; returns -1 if an
+ * event cannot be opened, and leaves through FAIL_IF() if a count is out
+ * of tolerance.
+ */
+static int test_body(void)
+{
+ struct event events[2];
+ u64 overhead;
+
+ setup_event(&events[0], PERF_COUNT_HW_INSTRUCTIONS, "instructions");
+ setup_event(&events[1], PERF_COUNT_HW_CPU_CYCLES, "cycles");
+
+ if (event_open(&events[0])) {
+ perror("perf_event_open");
+ return -1;
+ }
+
+ /* events[0] is the group leader for the cycle counter */
+ if (event_open_with_group(&events[1], events[0].fd)) {
+ perror("perf_event_open");
+ return -1;
+ }
+
+ overhead = determine_overhead(events);
+ printf("Overhead of null loop: %llu instructions\n", overhead);
+
+ /* Run for 1 million instructions */
+ FAIL_IF(do_count_loop(events, 1000000, overhead, true));
+
+ /* Run for 10 million instructions */
+ FAIL_IF(do_count_loop(events, 10000000, overhead, true));
+
+ /* Run for 100 million instructions */
+ FAIL_IF(do_count_loop(events, 100000000, overhead, true));
+
+ /* Run for 1 billion instructions */
+ FAIL_IF(do_count_loop(events, 1000000000, overhead, true));
+
+ /* Run for 16 billion instructions */
+ FAIL_IF(do_count_loop(events, 16000000000, overhead, true));
+
+ /* Run for 64 billion instructions */
+ FAIL_IF(do_count_loop(events, 64000000000, overhead, true));
+
+ event_close(&events[0]);
+ event_close(&events[1]);
+
+ return 0;
+}
+
+/*
+ * Run test_body() through eat_cpu() and return its result.
+ * NOTE(review): eat_cpu() is declared elsewhere; presumably it keeps the
+ * CPU busy while the test runs - confirm.
+ */
+static int count_instructions(void)
+{
+ return eat_cpu(test_body);
+}
+
+/* Run the test under the selftest harness as "count_instructions" */
+int main(void)
+{
+ return test_harness(count_instructions, "count_instructions");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
new file mode 100644
index 000000000..2070a1e2b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/count_stcx_fail.c
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ * Licensed under GPLv2.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/prctl.h>
+
+#include "event.h"
+#include "utils.h"
+#include "lib.h"
+
+extern void thirty_two_instruction_loop_with_ll_sc(u64 loops, u64 *ll_sc_target);
+
+/*
+ * Prepare @e as an event of perf type @type for @config, labelled @name.
+ * The event starts out disabled and does not count kernel, hypervisor or
+ * idle activity.
+ */
+static void setup_event(struct event *e, u64 config, int type, char *name)
+{
+	event_init_opts(e, config, type, name);
+
+	/* restrict counting first ... */
+	e->attr.exclude_idle = 1;
+	e->attr.exclude_hv = 1;
+	e->attr.exclude_kernel = 1;
+
+	/* ... and leave the event switched off until explicitly enabled */
+	e->attr.disabled = 1;
+}
+
+/*
+ * Run the ll/sc instruction loop with the events enabled and compare the
+ * instruction count in events[0] against the expected value.
+ *
+ * @events:       events[0] counts instructions, events[2] counts stcx
+ *                failures; events[1] is read, reported and reset too
+ * @instructions: number of loop instructions to execute
+ * @overhead:     instructions expected on top of the loop itself
+ * @report:       print the events and the comparison when true
+ *
+ * Every stcx failure counted by events[2] adds 10 instructions to the
+ * expectation. Returns 0 when the count is within tolerance, -1
+ * otherwise. The value read into events[0].result.value is left in place
+ * for the caller.
+ */
+static int do_count_loop(struct event *events, u64 instructions,
+			 u64 overhead, bool report)
+{
+	s64 difference, expected;
+	double percentage;
+	u64 dummy;
+
+	prctl(PR_TASK_PERF_EVENTS_ENABLE);
+
+	/* The loop body is 32 instructions, hence the division by 32 */
+	thirty_two_instruction_loop_with_ll_sc(instructions >> 5, &dummy);
+
+	prctl(PR_TASK_PERF_EVENTS_DISABLE);
+
+	event_read(&events[0]);
+	event_read(&events[1]);
+	event_read(&events[2]);
+
+	expected = instructions + overhead + (events[2].result.value * 10);
+	difference = events[0].result.value - expected;
+	percentage = (double)difference / events[0].result.value * 100;
+
+	if (report) {
+		printf("-----\n");
+		event_report(&events[0]);
+		event_report(&events[1]);
+		event_report(&events[2]);
+
+		printf("Looped for %llu instructions, overhead %llu\n", instructions, overhead);
+		/* expected is signed, like difference below */
+		printf("Expected %lld\n", expected);
+		printf("Actual   %llu\n", events[0].result.value);
+		printf("Delta    %lld, %f%%\n", difference, percentage);
+	}
+
+	event_reset(&events[0]);
+	event_reset(&events[1]);
+	event_reset(&events[2]);
+
+	if (difference < 0)
+		difference = -difference;
+
+	/*
+	 * A count of zero cannot be right since code did run, and it would
+	 * be the divisor of the integer division below.
+	 */
+	if (!events[0].result.value)
+		return -1;
+
+	/* Tolerate a difference below 0.0001 % */
+	difference *= 10000 * 100;
+	if (difference / events[0].result.value)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Measure the instruction cost of the counting harness itself: run the
+ * loop with zero instructions once, then 100 more times, and keep the
+ * smallest instruction count seen.
+ */
+static u64 determine_overhead(struct event *events)
+{
+	u64 lowest, sample;
+	int remaining;
+
+	do_count_loop(events, 0, 0, false);
+	lowest = events[0].result.value;
+
+	for (remaining = 100; remaining > 0; remaining--) {
+		do_count_loop(events, 0, 0, false);
+		sample = events[0].result.value;
+		if (sample >= lowest)
+			continue;
+
+		printf("Replacing overhead %llu with %llu\n", lowest, sample);
+		lowest = sample;
+	}
+
+	return lowest;
+}
+
+/*
+ * Raw event codes. test_body() opens PM_STCX_FAIL as a PERF_TYPE_RAW
+ * event; PM_MRK_STCX_FAIL is not referenced in the code shown here.
+ */
+#define PM_MRK_STCX_FAIL 0x03e158
+#define PM_STCX_FAIL 0x01e058
+
+/*
+ * Open an instruction counter with a cycle counter and a raw stcx-failure
+ * counter grouped under it, determine the harness overhead, then check
+ * the instruction count for loops of increasing length. Returns 0 on
+ * success; returns -1 if an event cannot be opened, and leaves through
+ * SKIP_IF()/FAIL_IF() when the test is skipped or a count is out of
+ * tolerance.
+ */
+static int test_body(void)
+{
+	struct event events[3];
+	u64 overhead;
+
+	// The STCX_FAIL event we use works on Power8 or later
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+	setup_event(&events[0], PERF_COUNT_HW_INSTRUCTIONS, PERF_TYPE_HARDWARE, "instructions");
+	setup_event(&events[1], PERF_COUNT_HW_CPU_CYCLES, PERF_TYPE_HARDWARE, "cycles");
+	setup_event(&events[2], PM_STCX_FAIL, PERF_TYPE_RAW, "stcx_fail");
+
+	if (event_open(&events[0])) {
+		perror("perf_event_open");
+		return -1;
+	}
+
+	/* events[0] is the group leader for the other two events */
+	if (event_open_with_group(&events[1], events[0].fd)) {
+		perror("perf_event_open");
+		return -1;
+	}
+
+	if (event_open_with_group(&events[2], events[0].fd)) {
+		perror("perf_event_open");
+		return -1;
+	}
+
+	overhead = determine_overhead(events);
+	printf("Overhead of null loop: %llu instructions\n", overhead);
+
+	/* Run for 1 million instructions */
+	FAIL_IF(do_count_loop(events, 1000000, overhead, true));
+
+	/* Run for 10 million instructions */
+	FAIL_IF(do_count_loop(events, 10000000, overhead, true));
+
+	/* Run for 100 million instructions */
+	FAIL_IF(do_count_loop(events, 100000000, overhead, true));
+
+	/* Run for 1 billion instructions */
+	FAIL_IF(do_count_loop(events, 1000000000, overhead, true));
+
+	/* Run for 16 billion instructions */
+	FAIL_IF(do_count_loop(events, 16000000000, overhead, true));
+
+	/* Run for 64 billion instructions */
+	FAIL_IF(do_count_loop(events, 64000000000, overhead, true));
+
+	/* Close everything that was opened, including the stcx_fail event */
+	event_close(&events[0]);
+	event_close(&events[1]);
+	event_close(&events[2]);
+
+	return 0;
+}
+
+/*
+ * Run test_body() through eat_cpu() and return its result.
+ * NOTE(review): eat_cpu() is declared elsewhere; presumably it keeps the
+ * CPU busy while the test runs - confirm.
+ */
+static int count_ll_sc(void)
+{
+ return eat_cpu(test_body);
+}
+
+/* Run the test under the selftest harness as "count_ll_sc" */
+int main(void)
+{
+ return test_harness(count_ll_sc, "count_ll_sc");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/.gitignore b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore
new file mode 100644
index 000000000..2920fb394
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/.gitignore
@@ -0,0 +1,23 @@
+# SPDX-License-Identifier: GPL-2.0-only
+reg_access_test
+event_attributes_test
+cycles_test
+cycles_with_freeze_test
+pmc56_overflow_test
+ebb_vs_cpu_event_test
+cpu_event_vs_ebb_test
+cpu_event_pinned_vs_ebb_test
+task_event_vs_ebb_test
+task_event_pinned_vs_ebb_test
+multi_ebb_procs_test
+multi_counter_test
+pmae_handling_test
+close_clears_pmcc_test
+instruction_count_test
+fork_cleanup_test
+ebb_on_child_test
+ebb_on_willing_child_test
+back_to_back_ebbs_test
+lost_exception_test
+no_handler_test
+cycles_with_mmcr2_test
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/Makefile b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
new file mode 100644
index 000000000..af3df79d8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/Makefile
@@ -0,0 +1,37 @@
+# SPDX-License-Identifier: GPL-2.0
+# Provides try-run, used for the -no-pie probe below
+include ../../../../../../scripts/Kbuild.include
+
+# First rule in the file: a bare "make" here builds from two levels up.
+noarg:
+ $(MAKE) -C ../../
+
+# The EBB handler is 64-bit code and everything links against it
+CFLAGS += -m64
+
+# NOTE(review): TMPOUT is not used in this file; presumably try-run in
+# Kbuild.include reads it - confirm.
+TMPOUT = $(OUTPUT)/TMPDIR/
+# Toolchains may build PIE by default which breaks the assembly
+no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \
+ $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie)
+
+LDFLAGS += $(no-pie-option)
+
+TEST_GEN_PROGS := reg_access_test event_attributes_test cycles_test \
+ cycles_with_freeze_test pmc56_overflow_test \
+ ebb_vs_cpu_event_test cpu_event_vs_ebb_test \
+ cpu_event_pinned_vs_ebb_test task_event_vs_ebb_test \
+ task_event_pinned_vs_ebb_test multi_ebb_procs_test \
+ multi_counter_test pmae_handling_test \
+ close_clears_pmcc_test instruction_count_test \
+ fork_cleanup_test ebb_on_child_test \
+ ebb_on_willing_child_test back_to_back_ebbs_test \
+ lost_exception_test no_handler_test \
+ cycles_with_mmcr2_test
+
+top_srcdir = ../../../../../..
+include ../../../lib.mk
+
+# Every test depends on the common harness, event and EBB support sources
+$(TEST_GEN_PROGS): ../../harness.c ../../utils.c ../event.c ../lib.c \
+ ebb.c ebb_handler.S trace.c busy_loop.S
+
+$(OUTPUT)/instruction_count_test: ../loop.S
+
+$(OUTPUT)/lost_exception_test: ../lib.c
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c b/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c
new file mode 100644
index 000000000..a26ac122c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+#define NUMBER_OF_EBBS 50
+
+/*
+ * Test that if we overflow the counter while in the EBB handler, we take
+ * another EBB on exiting from the handler.
+ *
+ * We do this by counting with a stupidly low sample period, causing us to
+ * overflow the PMU while we're still in the EBB handler, leading to another
+ * EBB.
+ *
+ * We get out of what would otherwise be an infinite loop by leaving the
+ * counter frozen once we've taken enough EBBs.
+ */
+
+static void ebb_callee(void) /* EBB callee for this test, registered with setup_ebb_handler() */
+{
+ uint64_t siar, val;
+
+ val = mfspr(SPRN_BESCR);
+ if (!(val & BESCR_PMEO)) { /* BESCR[PMEO] clear: count the EBB as spurious and skip the PMC accounting */
+ ebb_state.stats.spurious++;
+ goto out;
+ }
+
+ ebb_state.stats.ebb_count++;
+ trace_log_counter(ebb_state.trace, ebb_state.stats.ebb_count);
+
+ /* Accounts the PMC1 count and resets PMC1 for the next sample period */
+ count_pmc(1, sample_period);
+
+out:
+ if (ebb_state.stats.ebb_count == NUMBER_OF_EBBS)
+ /* Reset but leave counters frozen: only PMAO is cleared, not FC */
+ reset_ebb_with_clear_mask(MMCR0_PMAO);
+ else
+ /* Unfreezes: reset_ebb() clears MMCR0_FC as well as MMCR0_PMAO */
+ reset_ebb();
+
+ /* Do some stuff to chew some cycles and pop the counter */
+ siar = mfspr(SPRN_SIAR);
+ trace_log_reg(ebb_state.trace, SPRN_SIAR, siar);
+
+ val = mfspr(SPRN_PMC1);
+ trace_log_reg(ebb_state.trace, SPRN_PMC1, val);
+
+ val = mfspr(SPRN_MMCR0);
+ trace_log_reg(ebb_state.trace, SPRN_MMCR0, val);
+}
+
+int back_to_back_ebbs(void) /* Test body: expects exactly NUMBER_OF_EBBS EBBs with a sample period of 5 */
+{
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ setup_ebb_handler(ebb_callee);
+
+ FAIL_IF(ebb_event_enable(&event));
+
+ sample_period = 5; /* the "stupidly low" period: PMC1 overflows again while the handler is still running */
+
+ ebb_freeze_pmcs(); /* keep the counters frozen while PMC1 is loaded and EBBs are enabled */
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+ ebb_global_enable();
+ ebb_unfreeze_pmcs();
+
+ while (ebb_state.stats.ebb_count < NUMBER_OF_EBBS) /* ebb_callee() increments ebb_count */
+ FAIL_IF(core_busy_loop());
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count != NUMBER_OF_EBBS); /* the callee leaves the counters frozen once the target is hit, so no extra EBBs are expected */
+
+ return 0;
+}
+
+int main(void) /* Runs back_to_back_ebbs() through test_harness() and returns its result */
+{
+ return test_harness(back_to_back_ebbs, "back_to_back_ebbs");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/busy_loop.S b/tools/testing/selftests/powerpc/pmu/ebb/busy_loop.S
new file mode 100644
index 000000000..4866a3a76
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/busy_loop.S
@@ -0,0 +1,271 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <ppc-asm.h>
+
+ .text
+
+FUNC_START(core_busy_loop) /* Returns 0 in r3 if every register and stack check passes, 1 otherwise */
+ stdu %r1, -168(%r1) /* allocate a 168-byte stack frame */
+ std r14, 160(%r1) /* save r14-r31 in the frame; they are reloaded at label 1 before returning */
+ std r15, 152(%r1)
+ std r16, 144(%r1)
+ std r17, 136(%r1)
+ std r18, 128(%r1)
+ std r19, 120(%r1)
+ std r20, 112(%r1)
+ std r21, 104(%r1)
+ std r22, 96(%r1)
+ std r23, 88(%r1)
+ std r24, 80(%r1)
+ std r25, 72(%r1)
+ std r26, 64(%r1)
+ std r27, 56(%r1)
+ std r28, 48(%r1)
+ std r29, 40(%r1)
+ std r30, 32(%r1)
+ std r31, 24(%r1)
+ /* Load a known pattern into each register and store a copy at a negative offset from r1 */
+ li r3, 0x3030
+ std r3, -96(%r1)
+ li r4, 0x4040
+ std r4, -104(%r1)
+ li r5, 0x5050
+ std r5, -112(%r1)
+ li r6, 0x6060
+ std r6, -120(%r1)
+ li r7, 0x7070
+ std r7, -128(%r1)
+ li r8, 0x0808
+ std r8, -136(%r1)
+ li r9, 0x0909
+ std r9, -144(%r1)
+ li r10, 0x1010
+ std r10, -152(%r1)
+ li r11, 0x1111
+ std r11, -160(%r1)
+ li r14, 0x1414
+ std r14, -168(%r1)
+ li r15, 0x1515
+ std r15, -176(%r1)
+ li r16, 0x1616
+ std r16, -184(%r1)
+ li r17, 0x1717
+ std r17, -192(%r1)
+ li r18, 0x1818
+ std r18, -200(%r1)
+ li r19, 0x1919
+ std r19, -208(%r1)
+ li r20, 0x2020
+ std r20, -216(%r1)
+ li r21, 0x2121
+ std r21, -224(%r1)
+ li r22, 0x2222
+ std r22, -232(%r1)
+ li r23, 0x2323
+ std r23, -240(%r1)
+ li r24, 0x2424
+ std r24, -248(%r1)
+ li r25, 0x2525
+ std r25, -256(%r1)
+ li r26, 0x2626
+ std r26, -264(%r1)
+ li r27, 0x2727
+ std r27, -272(%r1)
+ li r28, 0x2828
+ std r28, -280(%r1)
+ li r29, 0x2929
+ std r29, -288(%r1)
+ li r30, 0x3030 /* r30 and r31 get a pattern but no stack copy */
+ li r31, 0x3131
+
+ li r3, 0 /* r3 is reused as the loop counter, so it is not part of the register check below */
+0: addi r3, r3, 1
+ cmpwi r3, 100 /* spin for 100 iterations */
+ blt 0b
+
+ /* Return 1 (fail) unless we get through all the checks */
+ li r3, 1
+
+ /* Check none of our registers have been corrupted */
+ cmpwi r4, 0x4040
+ bne 1f
+ cmpwi r5, 0x5050
+ bne 1f
+ cmpwi r6, 0x6060
+ bne 1f
+ cmpwi r7, 0x7070
+ bne 1f
+ cmpwi r8, 0x0808
+ bne 1f
+ cmpwi r9, 0x0909
+ bne 1f
+ cmpwi r10, 0x1010
+ bne 1f
+ cmpwi r11, 0x1111
+ bne 1f
+ cmpwi r14, 0x1414
+ bne 1f
+ cmpwi r15, 0x1515
+ bne 1f
+ cmpwi r16, 0x1616
+ bne 1f
+ cmpwi r17, 0x1717
+ bne 1f
+ cmpwi r18, 0x1818
+ bne 1f
+ cmpwi r19, 0x1919
+ bne 1f
+ cmpwi r20, 0x2020
+ bne 1f
+ cmpwi r21, 0x2121
+ bne 1f
+ cmpwi r22, 0x2222
+ bne 1f
+ cmpwi r23, 0x2323
+ bne 1f
+ cmpwi r24, 0x2424
+ bne 1f
+ cmpwi r25, 0x2525
+ bne 1f
+ cmpwi r26, 0x2626
+ bne 1f
+ cmpwi r27, 0x2727
+ bne 1f
+ cmpwi r28, 0x2828
+ bne 1f
+ cmpwi r29, 0x2929
+ bne 1f
+ cmpwi r30, 0x3030
+ bne 1f
+ cmpwi r31, 0x3131
+ bne 1f
+
+ /* Load junk into all our registers before we reload them from the stack. */
+ li r3, 0xde
+ li r4, 0xad
+ li r5, 0xbe
+ li r6, 0xef
+ li r7, 0xde
+ li r8, 0xad
+ li r9, 0xbe
+ li r10, 0xef
+ li r11, 0xde
+ li r14, 0xad
+ li r15, 0xbe
+ li r16, 0xef
+ li r17, 0xde
+ li r18, 0xad
+ li r19, 0xbe
+ li r20, 0xef
+ li r21, 0xde
+ li r22, 0xad
+ li r23, 0xbe
+ li r24, 0xef
+ li r25, 0xde
+ li r26, 0xad
+ li r27, 0xbe
+ li r28, 0xef
+ li r29, 0xdd
+ /* Reload each stack copy stored above and check it still holds its pattern */
+ ld r3, -96(%r1)
+ cmpwi r3, 0x3030
+ bne 1f
+ ld r4, -104(%r1)
+ cmpwi r4, 0x4040
+ bne 1f
+ ld r5, -112(%r1)
+ cmpwi r5, 0x5050
+ bne 1f
+ ld r6, -120(%r1)
+ cmpwi r6, 0x6060
+ bne 1f
+ ld r7, -128(%r1)
+ cmpwi r7, 0x7070
+ bne 1f
+ ld r8, -136(%r1)
+ cmpwi r8, 0x0808
+ bne 1f
+ ld r9, -144(%r1)
+ cmpwi r9, 0x0909
+ bne 1f
+ ld r10, -152(%r1)
+ cmpwi r10, 0x1010
+ bne 1f
+ ld r11, -160(%r1)
+ cmpwi r11, 0x1111
+ bne 1f
+ ld r14, -168(%r1)
+ cmpwi r14, 0x1414
+ bne 1f
+ ld r15, -176(%r1)
+ cmpwi r15, 0x1515
+ bne 1f
+ ld r16, -184(%r1)
+ cmpwi r16, 0x1616
+ bne 1f
+ ld r17, -192(%r1)
+ cmpwi r17, 0x1717
+ bne 1f
+ ld r18, -200(%r1)
+ cmpwi r18, 0x1818
+ bne 1f
+ ld r19, -208(%r1)
+ cmpwi r19, 0x1919
+ bne 1f
+ ld r20, -216(%r1)
+ cmpwi r20, 0x2020
+ bne 1f
+ ld r21, -224(%r1)
+ cmpwi r21, 0x2121
+ bne 1f
+ ld r22, -232(%r1)
+ cmpwi r22, 0x2222
+ bne 1f
+ ld r23, -240(%r1)
+ cmpwi r23, 0x2323
+ bne 1f
+ ld r24, -248(%r1)
+ cmpwi r24, 0x2424
+ bne 1f
+ ld r25, -256(%r1)
+ cmpwi r25, 0x2525
+ bne 1f
+ ld r26, -264(%r1)
+ cmpwi r26, 0x2626
+ bne 1f
+ ld r27, -272(%r1)
+ cmpwi r27, 0x2727
+ bne 1f
+ ld r28, -280(%r1)
+ cmpwi r28, 0x2828
+ bne 1f
+ ld r29, -288(%r1)
+ cmpwi r29, 0x2929
+ bne 1f
+
+ /* Load 0 (success) to return */
+ li r3, 0
+ /* Common exit for pass and fail: restore r14-r31, pop the frame, return r3 */
+1: ld r14, 160(%r1)
+ ld r15, 152(%r1)
+ ld r16, 144(%r1)
+ ld r17, 136(%r1)
+ ld r18, 128(%r1)
+ ld r19, 120(%r1)
+ ld r20, 112(%r1)
+ ld r21, 104(%r1)
+ ld r22, 96(%r1)
+ ld r23, 88(%r1)
+ ld r24, 80(%r1)
+ ld r25, 72(%r1)
+ ld r26, 64(%r1)
+ ld r27, 56(%r1)
+ ld r28, 48(%r1)
+ ld r29, 40(%r1)
+ ld r30, 32(%r1)
+ ld r31, 24(%r1)
+ addi %r1, %r1, 168
+ blr
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/close_clears_pmcc_test.c b/tools/testing/selftests/powerpc/pmu/ebb/close_clears_pmcc_test.c
new file mode 100644
index 000000000..ca9aeb0d8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/close_clears_pmcc_test.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <setjmp.h>
+#include <signal.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test that closing the EBB event clears MMCR0_PMCC, preventing further access
+ * by userspace to the PMU hardware.
+ */
+
+int close_clears_pmcc(void) /* Test body: take one EBB, close the event, then expect a PMC1 write to raise SIGILL */
+{
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(1);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ while (ebb_state.stats.ebb_count < 1) /* loop until at least one EBB has been counted */
+ FAIL_IF(core_busy_loop());
+
+ ebb_global_disable();
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ /* The real test is here, do we take a SIGILL when writing PMU regs now
+ * that we have closed the event. We expect that we will. */
+
+ FAIL_IF(catch_sigill(write_pmc1)); /* catch_sigill() returns 0 only when write_pmc1() raised SIGILL */
+
+ /* We should still be able to read EBB regs though */
+ mfspr(SPRN_EBBHR);
+ mfspr(SPRN_EBBRR);
+ mfspr(SPRN_BESCR);
+
+ return 0;
+}
+
+int main(void) /* Runs close_clears_pmcc() through test_harness() and returns its result */
+{
+ return test_harness(close_clears_pmcc, "close_clears_pmcc");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_pinned_vs_ebb_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_pinned_vs_ebb_test.c
new file mode 100644
index 000000000..3cd33eb51
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_pinned_vs_ebb_test.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests a pinned cpu event vs an EBB - in that order. The pinned cpu event
+ * should remain and the EBB event should fail to enable.
+ */
+
+static int setup_cpu_event(struct event *event, int cpu)
+{
+	/* PM_RUN_INST_CMPL, pinned, with kernel, hypervisor and idle excluded */
+	event_init_named(event, 0x400FA, "PM_RUN_INST_CMPL");
+	event->attr.exclude_idle = 1;
+	event->attr.exclude_hv = 1;
+	event->attr.exclude_kernel = 1;
+	event->attr.pinned = 1;
+
+	/* Skip if require_paranoia_below(1) reports failure, then open on 'cpu' and enable */
+	SKIP_IF(require_paranoia_below(1));
+	FAIL_IF(event_open_with_cpu(event, cpu));
+	FAIL_IF(event_enable(event));
+
+	/* The event is open and enabled */
+	return 0;
+}
+
+int cpu_event_pinned_vs_ebb(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	int cpu, rc;
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+	FAIL_IF(bind_to_cpu(cpu));
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	FAIL_IF(pid < 0); /* fork() failed: there is no child to sync with, kill or wait for */
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(ebb_child(write_pipe, read_pipe));
+	}
+
+	/* We setup the cpu event first */
+	rc = setup_cpu_event(&event, cpu);
+	if (rc) {
+		kill_child_and_wait(pid);
+		return rc;
+	}
+
+	/* Signal the child to install its EBB event and wait */
+	if (sync_with_child(read_pipe, write_pipe))
+		/* If it fails, wait for it to exit */
+		goto wait;
+
+	/* Signal the child to run */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+wait:
+	/* We expect it to fail to read the event: ebb_child() returns 2 in that case */
+	FAIL_IF(wait_for_child(pid) != 2);
+
+	FAIL_IF(event_disable(&event));
+	FAIL_IF(event_read(&event));
+	event_report(&event);
+
+	/* The cpu event should have run */
+	FAIL_IF(event.result.value == 0);
+	FAIL_IF(event.result.enabled != event.result.running);
+
+	return 0;
+}
+
+int main(void) /* Runs cpu_event_pinned_vs_ebb() through test_harness() and returns its result */
+{
+ return test_harness(cpu_event_pinned_vs_ebb, "cpu_event_pinned_vs_ebb");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_vs_ebb_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_vs_ebb_test.c
new file mode 100644
index 000000000..8466ef9d7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cpu_event_vs_ebb_test.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests a cpu event vs an EBB - in that order. The EBB should force the cpu
+ * event off the PMU.
+ */
+
+static int setup_cpu_event(struct event *event, int cpu)
+{
+	/* PM_RUN_INST_CMPL with kernel, hypervisor and idle excluded */
+	event_init_named(event, 0x400FA, "PM_RUN_INST_CMPL");
+	event->attr.exclude_idle = 1;
+	event->attr.exclude_hv = 1;
+	event->attr.exclude_kernel = 1;
+
+	/* Skip if require_paranoia_below(1) reports failure, then open on 'cpu' and enable */
+	SKIP_IF(require_paranoia_below(1));
+	FAIL_IF(event_open_with_cpu(event, cpu));
+	FAIL_IF(event_enable(event));
+	return 0;
+}
+
+int cpu_event_vs_ebb(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	int cpu, rc;
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+	FAIL_IF(bind_to_cpu(cpu));
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	FAIL_IF(pid < 0); /* fork() failed: there is no child to sync with, kill or wait for */
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(ebb_child(write_pipe, read_pipe));
+	}
+
+	/* We setup the cpu event first */
+	rc = setup_cpu_event(&event, cpu);
+	if (rc) {
+		kill_child_and_wait(pid);
+		return rc;
+	}
+
+	/* Signal the child to install its EBB event and wait */
+	if (sync_with_child(read_pipe, write_pipe))
+		/* If it fails, wait for it to exit */
+		goto wait;
+
+	/* Signal the child to run */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+wait:
+	/* We expect the child to succeed, i.e. wait_for_child() to return 0 */
+	FAIL_IF(wait_for_child(pid));
+
+	FAIL_IF(event_disable(&event));
+	FAIL_IF(event_read(&event));
+	event_report(&event);
+
+	/* The cpu event may have run, so its counts are reported but not checked */
+
+	return 0;
+}
+
+int main(void) /* Runs cpu_event_vs_ebb() through test_harness() and returns its result */
+{
+ return test_harness(cpu_event_vs_ebb, "cpu_event_vs_ebb");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c
new file mode 100644
index 000000000..bb9f587fa
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+/*
+ * Basic test that counts user cycles and takes EBBs.
+ */
+int cycles(void) /* Test body: take 10 EBBs on a cycles event, then check the accumulated PMC1 count */
+{
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(1); /* have standard_ebb_callee() account PMC1 */
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ while (ebb_state.stats.ebb_count < 10) {
+ FAIL_IF(core_busy_loop());
+ FAIL_IF(ebb_check_mmcr0()); /* fails if MMCR0 has FC set without PMAO */
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+ FAIL_IF(!ebb_check_count(1, sample_period, 100)); /* PMC1 total must be within ebb_count * (sample_period +/- 100) */
+
+ return 0;
+}
+
+int main(void) /* Runs cycles() through test_harness() and returns its result */
+{
+ return test_harness(cycles, "cycles");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c
new file mode 100644
index 000000000..9ae795ce3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test of counting cycles while using MMCR0_FC (freeze counters) to only count
+ * parts of the code. This is complicated by the fact that FC is set by the
+ * hardware when the event overflows. We may take the EBB after we have set FC,
+ * so we have to be careful about whether we clear FC at the end of the EBB
+ * handler or not.
+ */
+
+static bool counters_frozen = false;
+static int ebbs_while_frozen = 0;
+
+static void ebb_callee(void) /* EBB callee for this test: only clears MMCR0[FC] when the main loop has not frozen the counters */
+{
+ uint64_t mask, val;
+
+ mask = MMCR0_PMAO | MMCR0_FC; /* default: clear both PMAO and FC on exit */
+
+ val = mfspr(SPRN_BESCR);
+ if (!(val & BESCR_PMEO)) { /* BESCR[PMEO] clear: count the EBB as spurious */
+ ebb_state.stats.spurious++;
+ goto out;
+ }
+
+ ebb_state.stats.ebb_count++;
+ trace_log_counter(ebb_state.trace, ebb_state.stats.ebb_count);
+
+ val = mfspr(SPRN_MMCR0);
+ trace_log_reg(ebb_state.trace, SPRN_MMCR0, val);
+
+ if (counters_frozen) { /* set by the main loop just before it sets FC itself */
+ trace_log_string(ebb_state.trace, "frozen");
+ ebbs_while_frozen++;
+ mask &= ~MMCR0_FC; /* leave FC alone so the main loop's freeze survives this EBB */
+ }
+
+ count_pmc(1, sample_period);
+out:
+ reset_ebb_with_clear_mask(mask);
+}
+
+int cycles_with_freeze(void) /* Test body: fails if MMCR0[FC] is ever found clear right after the loop has set it */
+{
+ struct event event;
+ uint64_t val;
+ bool fc_cleared;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ setup_ebb_handler(ebb_callee);
+ ebb_global_enable();
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ fc_cleared = false;
+
+ /* Make sure we loop until we take at least one EBB */
+ while ((ebb_state.stats.ebb_count < 20 && !fc_cleared) ||
+ ebb_state.stats.ebb_count < 1)
+ {
+ counters_frozen = false; /* tell ebb_callee() it may clear FC */
+ mb();
+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+
+ FAIL_IF(core_busy_loop());
+
+ counters_frozen = true; /* tell ebb_callee() to leave FC alone */
+ mb();
+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+
+ val = mfspr(SPRN_MMCR0);
+ if (! (val & MMCR0_FC)) { /* FC was just set above, so something cleared it */
+ printf("Outside of loop, FC NOT set MMCR0 0x%lx\n", val);
+ fc_cleared = true;
+ }
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ printf("EBBs while frozen %d\n", ebbs_while_frozen);
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+ FAIL_IF(fc_cleared);
+
+ return 0;
+}
+
+int main(void) /* Runs cycles_with_freeze() through test_harness() and returns its result */
+{
+ return test_harness(cycles_with_freeze, "cycles_with_freeze");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c
new file mode 100644
index 000000000..4b45a2e70
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/cycles_with_mmcr2_test.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test of counting cycles while manipulating the user accessible bits in MMCR2.
+ */
+
+/* We use two values because the first freezes PMC1 and so we would get no EBBs */
+#define MMCR2_EXPECTED_1 0x4020100804020000UL /* (FC1P|FC2P|FC3P|FC4P|FC5P|FC6P) */
+#define MMCR2_EXPECTED_2 0x0020100804020000UL /* ( FC2P|FC3P|FC4P|FC5P|FC6P) */
+
+
+int cycles_with_mmcr2(void) /* Test body: fails if MMCR2 ever reads back different from the value written before the busy loop */
+{
+ struct event event;
+ uint64_t val, expected[2], actual;
+ int i;
+ bool bad_mmcr2;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(1);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ /* XXX Set of MMCR2 must be after enable */
+ expected[0] = MMCR2_EXPECTED_1;
+ expected[1] = MMCR2_EXPECTED_2;
+ i = 0;
+ bad_mmcr2 = false;
+
+ /* Make sure we loop until we take at least one EBB */
+ while ((ebb_state.stats.ebb_count < 20 && !bad_mmcr2) ||
+ ebb_state.stats.ebb_count < 1)
+ {
+ mtspr(SPRN_MMCR2, expected[i % 2]); /* alternate between the two expected values */
+
+ FAIL_IF(core_busy_loop());
+
+ val = mfspr(SPRN_MMCR2);
+ if (val != expected[i % 2]) {
+ bad_mmcr2 = true;
+ actual = val; /* only assigned, and only printed below, when bad_mmcr2 is set */
+ }
+
+ i++;
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ if (bad_mmcr2)
+ printf("Bad MMCR2 value seen is 0x%lx\n", actual);
+
+ FAIL_IF(bad_mmcr2);
+
+ return 0;
+}
+
+int main(void) /* Runs cycles_with_mmcr2() through test_harness() and returns its result */
+{
+ return test_harness(cycles_with_mmcr2, "cycles_with_mmcr2");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
new file mode 100644
index 000000000..21537d6eb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE /* For CPU_ZERO etc. */
+
+#include <sched.h>
+#include <sys/wait.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "trace.h"
+#include "ebb.h"
+
+
+void (*ebb_user_func)(void);
+
+void ebb_hook(void) /* Calls the callee registered by setup_ebb_handler(), if one is set */
+{
+ if (ebb_user_func)
+ ebb_user_func();
+}
+
+struct ebb_state ebb_state;
+
+u64 sample_period = 0x40000000ull;
+
+void reset_ebb_with_clear_mask(unsigned long mmcr0_clear_mask) /* Re-arm for the next EBB, clearing the given bits in MMCR0 */
+{
+ u64 val;
+ /* Steps 0) and 1), reading and resetting the PMC, are done in count_pmc() */
+ /* 2) clear MMCR0[PMAO] - docs say BESCR[PMEO] should do this */
+ /* 3) set MMCR0[PMAE] - docs say BESCR[PME] should do this */
+ val = mfspr(SPRN_MMCR0);
+ mtspr(SPRN_MMCR0, (val & ~mmcr0_clear_mask) | MMCR0_PMAE);
+
+ /* 4) clear BESCR[PMEO] */
+ mtspr(SPRN_BESCRR, BESCR_PMEO);
+
+ /* 5) set BESCR[PME] */
+ mtspr(SPRN_BESCRS, BESCR_PME);
+
+ /* 6) rfebb 1 - done in our caller */
+}
+
+void reset_ebb(void) /* Re-arm for the next EBB, clearing both MMCR0[PMAO] and MMCR0[FC] */
+{
+ reset_ebb_with_clear_mask(MMCR0_PMAO | MMCR0_FC);
+}
+
+/* Called outside of the EBB handler to check MMCR0 is sane: returns 1 if FC is set without PMAO, else 0 */
+int ebb_check_mmcr0(void)
+{
+ u64 val;
+
+ val = mfspr(SPRN_MMCR0);
+ if ((val & (MMCR0_FC | MMCR0_PMAO)) == MMCR0_FC) {
+ /* It's OK if we see FC & PMAO, but not FC by itself */
+ printf("Outside of loop, only FC set 0x%llx\n", val);
+ return 1;
+ }
+
+ return 0;
+}
+
+bool ebb_check_count(int pmc, u64 sample_period, int fudge)
+{
+	u64 total = ebb_state.stats.pmc_count[PMC_INDEX(pmc)];
+	u64 low, high;
+
+	/* Each EBB should account for sample_period events, give or take 'fudge' */
+	low = ebb_state.stats.ebb_count * (sample_period - fudge);
+	if (total < low) {
+		printf("PMC%d count (0x%llx) below lower limit 0x%llx (-0x%llx)\n",
+			pmc, total, low, low - total);
+		return false;
+	}
+
+	/* The upper bound is only computed once the lower bound has passed */
+	high = ebb_state.stats.ebb_count * (sample_period + fudge);
+	if (total > high) {
+		printf("PMC%d count (0x%llx) above upper limit 0x%llx (+0x%llx)\n",
+			pmc, total, high, total - high);
+		return false;
+	}
+
+	/* In range: report both bounds and the distance from each of them */
+	printf("PMC%d count (0x%llx) is between 0x%llx and 0x%llx delta +0x%llx/-0x%llx\n",
+		pmc, total, low, high, total - low, high - total);
+
+	return true;
+}
+
+void standard_ebb_callee(void) /* Default EBB callee: accounts every enabled PMC, then re-arms with reset_ebb() */
+{
+ int found, i;
+ u64 val;
+
+ val = mfspr(SPRN_BESCR);
+ if (!(val & BESCR_PMEO)) { /* BESCR[PMEO] clear: count the EBB as spurious */
+ ebb_state.stats.spurious++;
+ goto out;
+ }
+
+ ebb_state.stats.ebb_count++;
+ trace_log_counter(ebb_state.trace, ebb_state.stats.ebb_count);
+
+ val = mfspr(SPRN_MMCR0);
+ trace_log_reg(ebb_state.trace, SPRN_MMCR0, val);
+
+ found = 0;
+ for (i = 1; i <= 6; i++) { /* PMC numbers are 1-based; PMC_INDEX() maps them to array slots */
+ if (ebb_state.pmc_enable[PMC_INDEX(i)])
+ found += count_pmc(i, sample_period); /* count_pmc() returns non-zero if that PMC had overflowed */
+ }
+
+ if (!found) /* an EBB was taken but no enabled PMC had overflowed */
+ ebb_state.stats.no_overflow++;
+
+out:
+ reset_ebb();
+}
+
+extern void ebb_handler(void);
+
+void setup_ebb_handler(void (*callee)(void)) /* Register 'callee' for ebb_hook() and point EBBHR at ebb_handler */
+{
+ u64 entry;
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+ entry = (u64)ebb_handler;
+#else
+ struct opd
+ {
+ u64 entry;
+ u64 toc;
+ } *opd;
+
+ opd = (struct opd *)ebb_handler; /* here ebb_handler is treated as a descriptor; its first word is the entry address */
+ entry = opd->entry;
+#endif
+ printf("EBB Handler is at %#llx\n", entry);
+
+ ebb_user_func = callee;
+
+ /* Ensure ebb_user_func is set before we set the handler */
+ mb();
+ mtspr(SPRN_EBBHR, entry);
+
+ /* Make sure the handler is set before we return */
+ mb();
+}
+
+void clear_ebb_stats(void) /* Zero ebb_state.stats; pmc_enable and trace are left untouched */
+{
+ memset(&ebb_state.stats, 0, sizeof(ebb_state.stats));
+}
+
+void dump_summary_ebb_state(void) /* Print the counters in ebb_state.stats; pmc[N] in the output is pmc_count[N - 1] */
+{
+ printf("ebb_state:\n" \
+ " ebb_count = %d\n" \
+ " spurious = %d\n" \
+ " negative = %d\n" \
+ " no_overflow = %d\n" \
+ " pmc[1] count = 0x%llx\n" \
+ " pmc[2] count = 0x%llx\n" \
+ " pmc[3] count = 0x%llx\n" \
+ " pmc[4] count = 0x%llx\n" \
+ " pmc[5] count = 0x%llx\n" \
+ " pmc[6] count = 0x%llx\n",
+ ebb_state.stats.ebb_count, ebb_state.stats.spurious,
+ ebb_state.stats.negative, ebb_state.stats.no_overflow,
+ ebb_state.stats.pmc_count[0], ebb_state.stats.pmc_count[1],
+ ebb_state.stats.pmc_count[2], ebb_state.stats.pmc_count[3],
+ ebb_state.stats.pmc_count[4], ebb_state.stats.pmc_count[5]);
+}
+
+static char *decode_mmcr0(u32 value)
+{
+	static char buf[16]; /* longest result "FC PMAE PMAO " is 13 chars + NUL; reused on every call */
+
+	buf[0] = '\0';
+	/* Unsigned masks: (1 << 31) shifts into the sign bit of int, which is undefined behaviour */
+	if (value & (1u << 31))
+		strcat(buf, "FC ");
+	if (value & (1u << 26))
+		strcat(buf, "PMAE ");
+	if (value & (1u << 7))
+		strcat(buf, "PMAO ");
+	/* Returns the names of the flags set in 'value', each followed by a space (empty if none) */
+	return buf;
+}
+
+static char *decode_bescr(u64 value)
+{
+	static char buf[16]; /* longest result "GE PME PMEO " is 12 chars + NUL; reused on every call */
+
+	buf[0] = '\0';
+	/* BESCR's PMU bits are PME (bit 32) and PMEO (bit 0); PMAE/PMAO are the MMCR0 names */
+	if (value & (1ull << 63))
+		strcat(buf, "GE ");
+	if (value & (1ull << 32))
+		strcat(buf, "PME ");
+	if (value & 1)
+		strcat(buf, "PMEO ");
+	/* Returns the names of the flags set in 'value', each followed by a space (empty if none) */
+	return buf;
+}
+
+void dump_ebb_hw_state(void) /* Print the current MMCR0, MMCR2, EBBHR, BESCR, PMC1-6 and SIAR values */
+{
+ u64 bescr;
+ u32 mmcr0; /* NOTE(review): MMCR0 is read into a u32, so only its low 32 bits are shown - confirm that is intended */
+
+ mmcr0 = mfspr(SPRN_MMCR0);
+ bescr = mfspr(SPRN_BESCR);
+
+ printf("HW state:\n" \
+ "MMCR0 0x%016x %s\n" \
+ "MMCR2 0x%016lx\n" \
+ "EBBHR 0x%016lx\n" \
+ "BESCR 0x%016llx %s\n" \
+ "PMC1 0x%016lx\n" \
+ "PMC2 0x%016lx\n" \
+ "PMC3 0x%016lx\n" \
+ "PMC4 0x%016lx\n" \
+ "PMC5 0x%016lx\n" \
+ "PMC6 0x%016lx\n" \
+ "SIAR 0x%016lx\n",
+ mmcr0, decode_mmcr0(mmcr0), mfspr(SPRN_MMCR2),
+ mfspr(SPRN_EBBHR), bescr, decode_bescr(bescr),
+ mfspr(SPRN_PMC1), mfspr(SPRN_PMC2), mfspr(SPRN_PMC3),
+ mfspr(SPRN_PMC4), mfspr(SPRN_PMC5), mfspr(SPRN_PMC6),
+ mfspr(SPRN_SIAR));
+}
+
+void dump_ebb_state(void) /* Full dump: the stats summary, the hardware registers, then the trace buffer */
+{
+ dump_summary_ebb_state();
+
+ dump_ebb_hw_state();
+
+ trace_buffer_print(ebb_state.trace);
+}
+
+int count_pmc(int pmc, uint32_t sample_period) /* Account PMC 'pmc' into the stats and reset it; returns non-zero if it had overflowed */
+{
+ uint32_t start_value;
+ u64 val;
+
+ /* 0) Read PMC */
+ start_value = pmc_sample_period(sample_period); /* the value the PMC is reset to in step 1 */
+
+ val = read_pmc(pmc);
+ if (val < start_value) /* the PMC is below its reset value: count it as negative, add nothing */
+ ebb_state.stats.negative++;
+ else
+ ebb_state.stats.pmc_count[PMC_INDEX(pmc)] += val - start_value;
+
+ trace_log_reg(ebb_state.trace, SPRN_PMC1 + pmc - 1, val);
+
+ /* 1) Reset PMC */
+ write_pmc(pmc, start_value);
+
+ /* Report if we overflowed */
+ return val >= COUNTER_OVERFLOW;
+}
+
+int ebb_event_enable(struct event *e) /* Enable the event and read it once; returns 0 if both succeed, else the failing call's result */
+{
+ int rc;
+
+ /* Ensure any SPR writes are ordered vs us */
+ mb();
+
+ rc = ioctl(e->fd, PERF_EVENT_IOC_ENABLE);
+ if (rc)
+ return rc;
+
+ rc = event_read(e);
+
+ /* Ditto */
+ mb();
+
+ return rc;
+}
+
+void ebb_freeze_pmcs(void) /* Freeze the counters by setting MMCR0[FC] */
+{
+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+ mb();
+}
+
+void ebb_unfreeze_pmcs(void)
+{
+ /* Unfreeze counters by clearing MMCR0[FC] */
+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+ mb();
+}
+
+void ebb_global_enable(void)
+{
+ /* Enable EBBs globally and PMU EBBs: writes BESCR with only bits 63 and 32 set */
+ mtspr(SPRN_BESCR, 0x8000000100000000ull);
+ mb();
+}
+
+void ebb_global_disable(void)
+{
+ /* Disable PMU EBBs by clearing BESCR[PME]; MMCR0 is not touched here (callers use ebb_freeze_pmcs()), events are still scheduled */
+ mtspr(SPRN_BESCRR, BESCR_PME);
+ mb();
+}
+
+bool ebb_is_supported(void) /* True if hwcap2 reports PPC_FEATURE2_EBB; always false when that macro is not defined at build time */
+{
+#ifdef PPC_FEATURE2_EBB
+ /* EBB requires at least POWER8 */
+ return have_hwcap2(PPC_FEATURE2_EBB);
+#else
+ return false;
+#endif
+}
+
+void event_ebb_init(struct event *e) /* Sets bit 63 of attr.config; presumably the flag that requests an EBB event - confirm */
+{
+ e->attr.config |= (1ull << 63);
+}
+
+void event_bhrb_init(struct event *e, unsigned ifm) /* Sets bit 62 of attr.config and ORs in 'ifm' shifted left by 60 */
+{
+ e->attr.config |= (1ull << 62) | ((u64)ifm << 60);
+}
+
+void event_leader_ebb_init(struct event *e) /* event_ebb_init() plus the exclusive and pinned attributes */
+{
+ event_ebb_init(e);
+
+ e->attr.exclusive = 1;
+ e->attr.pinned = 1;
+}
+
+int ebb_child(union pipe read_pipe, union pipe write_pipe) /* Child side of the parent/child tests; returns 0 on success, 2 if the first event_read() fails */
+{
+ struct event event;
+ uint64_t val;
+
+ FAIL_IF(wait_for_parent(read_pipe));
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(1);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(event_enable(&event));
+
+ if (event_read(&event)) {
+ /*
+ * Some tests expect to fail here, so don't report an error on
+ * this line, and return a distinguishable error code. Tell the
+ * parent an error happened.
+ */
+ notify_parent_of_error(write_pipe);
+ return 2;
+ }
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ FAIL_IF(notify_parent(write_pipe));
+ FAIL_IF(wait_for_parent(read_pipe));
+ FAIL_IF(notify_parent(write_pipe));
+
+ while (ebb_state.stats.ebb_count < 20) {
+ FAIL_IF(core_busy_loop());
+
+ /* To try and hit SIGILL case; val itself is never used */
+ val = mfspr(SPRN_MMCRA);
+ val |= mfspr(SPRN_MMCR2);
+ val |= mfspr(SPRN_MMCR0);
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ return 0;
+}
+
+static sigjmp_buf setjmp_env; /* resume point for sigill_handler(), saved together with the signal mask */
+/* SIGILL handler: report the signal and jump back into catch_sigill() */
+static void sigill_handler(int signal)
+{
+	printf("Took sigill\n");
+	siglongjmp(setjmp_env, 1); /* also restores the saved mask, so SIGILL is unblocked again */
+}
+/* Installed by catch_sigill(); every other field is zero-initialised */
+static struct sigaction sigill_action = {
+	.sa_handler = sigill_handler,
+};
+/* Run func() expecting a SIGILL: returns 0 if it was raised, 1 if it was not or if sigaction() failed */
+int catch_sigill(void (*func)(void))
+{
+	if (sigaction(SIGILL, &sigill_action, NULL)) {
+		perror("sigaction");
+		return 1;
+	}
+	/* Save the signal mask: plain setjmp()/longjmp() out of the handler may leave SIGILL blocked */
+	if (sigsetjmp(setjmp_env, 1) == 0) {
+		func();
+		return 1;
+	}
+
+	return 0;
+}
+
+void write_pmc1(void) /* Writes 0 to PMC1; passed to catch_sigill() by close_clears_pmcc() */
+{
+ mtspr(SPRN_PMC1, 0);
+}
+
+void write_pmc(int pmc, u64 value) /* Write 'value' to PMC number 'pmc' (1-6); any other number is silently ignored */
+{
+ switch (pmc) {
+ case 1: mtspr(SPRN_PMC1, value); break;
+ case 2: mtspr(SPRN_PMC2, value); break;
+ case 3: mtspr(SPRN_PMC3, value); break;
+ case 4: mtspr(SPRN_PMC4, value); break;
+ case 5: mtspr(SPRN_PMC5, value); break;
+ case 6: mtspr(SPRN_PMC6, value); break;
+ }
+}
+
+u64 read_pmc(int pmc) /* Read PMC number 'pmc' (1-6); returns 0 for any other number */
+{
+ switch (pmc) {
+ case 1: return mfspr(SPRN_PMC1);
+ case 2: return mfspr(SPRN_PMC2);
+ case 3: return mfspr(SPRN_PMC3);
+ case 4: return mfspr(SPRN_PMC4);
+ case 5: return mfspr(SPRN_PMC5);
+ case 6: return mfspr(SPRN_PMC6);
+ }
+
+ return 0;
+}
+
+static void term_handler(int signal) /* SIGTERM handler installed by ebb_init(): dump the stats and HW state, then abort() */
+{
+ dump_summary_ebb_state();
+ dump_ebb_hw_state();
+ abort();
+}
+
+struct sigaction term_action = {
+ .sa_handler = term_handler,
+};
+
+static void __attribute__((constructor)) ebb_init(void) /* Constructor: zero the stats, install the SIGTERM handler, allocate the trace buffer */
+{
+ clear_ebb_stats();
+
+ if (sigaction(SIGTERM, &term_action, NULL)) /* failure is only reported; setup continues */
+ perror("sigaction");
+
+ ebb_state.trace = trace_buffer_allocate(1 * 1024 * 1024);
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb.h b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h
new file mode 100644
index 000000000..b5bc2b616
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_PMU_EBB_EBB_H
+#define _SELFTESTS_POWERPC_PMU_EBB_EBB_H
+
+#include "../event.h"
+#include "../lib.h"
+#include "trace.h"
+#include "reg.h"
+
+#define PMC_INDEX(pmc) ((pmc)-1)
+
+#define NUM_PMC_VALUES 128
+
+struct ebb_state /* State shared between a test and its EBB callee: statistics, per-PMC enables, trace buffer */
+{
+ struct {
+ u64 pmc_count[6]; /* per-PMC totals accumulated by count_pmc(), indexed with PMC_INDEX() */
+ volatile int ebb_count; /* EBBs counted by the callee; the tests' loops poll it */
+ int spurious; /* EBBs taken with BESCR[PMEO] clear */
+ int negative; /* count_pmc() calls where the PMC read back below its reset value */
+ int no_overflow; /* standard_ebb_callee() runs where no enabled PMC had overflowed */
+ } stats;
+
+ bool pmc_enable[6]; /* PMCs that standard_ebb_callee() accounts; set by ebb_enable_pmc_counting() */
+ struct trace_buffer *trace; /* allocated in ebb_init() */
+};
+
+extern struct ebb_state ebb_state;
+
+#define COUNTER_OVERFLOW 0x80000000ull
+
+static inline uint32_t pmc_sample_period(uint32_t value) /* PMC start value that is 'value' counts below COUNTER_OVERFLOW */
+{
+ return COUNTER_OVERFLOW - value;
+}
+
+static inline void ebb_enable_pmc_counting(int pmc) /* Mark PMC 'pmc' (1-based) for accounting by standard_ebb_callee() */
+{
+ ebb_state.pmc_enable[PMC_INDEX(pmc)] = true;
+}
+
+bool ebb_check_count(int pmc, u64 sample_period, int fudge);
+void event_leader_ebb_init(struct event *e);
+void event_ebb_init(struct event *e);
+void event_bhrb_init(struct event *e, unsigned ifm);
+void setup_ebb_handler(void (*callee)(void));
+void standard_ebb_callee(void);
+int ebb_event_enable(struct event *e);
+void ebb_global_enable(void);
+void ebb_global_disable(void);
+bool ebb_is_supported(void);
+void ebb_freeze_pmcs(void);
+void ebb_unfreeze_pmcs(void);
+void event_ebb_init(struct event *e);
+void event_leader_ebb_init(struct event *e);
+int count_pmc(int pmc, uint32_t sample_period);
+void dump_ebb_state(void);
+void dump_summary_ebb_state(void);
+void dump_ebb_hw_state(void);
+void clear_ebb_stats(void);
+void write_pmc(int pmc, u64 value);
+u64 read_pmc(int pmc);
+void reset_ebb_with_clear_mask(unsigned long mmcr0_clear_mask);
+void reset_ebb(void);
+int ebb_check_mmcr0(void);
+
+extern u64 sample_period;
+
+int core_busy_loop(void);
+int ebb_child(union pipe read_pipe, union pipe write_pipe);
+int catch_sigill(void (*func)(void));
+void write_pmc1(void);
+
+#endif /* _SELFTESTS_POWERPC_PMU_EBB_EBB_H */
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb_handler.S b/tools/testing/selftests/powerpc/pmu/ebb/ebb_handler.S
new file mode 100644
index 000000000..c170398de
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb_handler.S
@@ -0,0 +1,365 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <ppc-asm.h>
+#include "reg.h"
+
+
+/* ppc-asm.h defines most of the reg aliases, but not r1/r2. */
+#define r1 1
+#define r2 2
+
+#define RFEBB .long 0x4c000924
+
+/* Stack layout:
+ *
+ *                     ^
+ *  User stack         |
+ *  Back chain ------+ <- r1        <-------+
+ *  ...                                     |
+ *  Red zone / ABI Gap                      |
+ *  ...                                     |
+ *  vr63       <+                           |
+ *  vr0         |                           |
+ *  VSCR        |                           |
+ *  FSCR        |                           |
+ *  r31         | Save area                 |
+ *  r0          |                           |
+ *  XER         |                           |
+ *  CTR         |                           |
+ *  LR          |                           |
+ *  CCR        <+                           |
+ *  ...        <+                           |
+ *  LR          | Caller frame              |
+ *  CCR         |                           |
+ *  Back chain <+ <- updated r1     --------+
+ *
+ */
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define ABIGAP 512
+#else
+#define ABIGAP 288
+#endif
+
+#define NR_GPR 32
+#define NR_SPR 6
+#define NR_VSR 64
+
+#define SAVE_AREA ((NR_GPR + NR_SPR) * 8 + (NR_VSR * 16))
+#define CALLER_FRAME 112
+
+#define STACK_FRAME (ABIGAP + SAVE_AREA + CALLER_FRAME)
+
+/*
+ * Offsets of the save-area slots from the updated r1, lowest address first.
+ * The macro argument is parenthesised so that an expression argument such as
+ * GPR_SAVE(a + 1) scales correctly.
+ */
+#define CCR_SAVE	(CALLER_FRAME)
+#define LR_SAVE		(CCR_SAVE + 8)
+#define CTR_SAVE	(LR_SAVE  + 8)
+#define XER_SAVE	(CTR_SAVE + 8)
+#define GPR_SAVE(n)	(XER_SAVE + 8 + (8 * (n)))
+#define FSCR_SAVE	(GPR_SAVE(31) + 8)
+#define VSCR_SAVE	(FSCR_SAVE + 8)
+#define VSR_SAVE(n)	(VSCR_SAVE + 8 + (16 * (n)))
+
+#define SAVE_GPR(n) std n,GPR_SAVE(n)(r1)
+#define REST_GPR(n) ld n,GPR_SAVE(n)(r1)
+#define TRASH_GPR(n) lis n,0xaaaa
+
+#define SAVE_VSR(n, b) li b, VSR_SAVE(n); stxvd2x n,b,r1
+#define LOAD_VSR(n, b) li b, VSR_SAVE(n); lxvd2x n,b,r1
+
+#define LOAD_REG_IMMEDIATE(reg,expr) \
+ lis reg,(expr)@highest; \
+ ori reg,reg,(expr)@higher; \
+ rldicr reg,reg,32,31; \
+ oris reg,reg,(expr)@h; \
+ ori reg,reg,(expr)@l;
+
+
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define ENTRY_POINT(name) \
+ .type FUNC_NAME(name),@function; \
+ .globl FUNC_NAME(name); \
+ FUNC_NAME(name):
+
+#define RESTORE_TOC(name) \
+ /* Restore our TOC pointer using our entry point */ \
+ LOAD_REG_IMMEDIATE(r12, name) \
+0: addis r2,r12,(.TOC.-0b)@ha; \
+ addi r2,r2,(.TOC.-0b)@l;
+
+#else
+#define ENTRY_POINT(name) FUNC_START(name)
+#define RESTORE_TOC(name) \
+ /* Restore our TOC pointer via our opd entry */ \
+ LOAD_REG_IMMEDIATE(r2, name) \
+ ld r2,8(r2);
+#endif
+
+ .text
+
+/*
+ * EBB entry point: save the interrupted context into a fresh stack frame,
+ * call ebb_hook, restore the context and return through RFEBB.
+ */
+ENTRY_POINT(ebb_handler)
+ /* Open a frame of STACK_FRAME bytes, which includes the ABI gap */
+ stdu r1,-STACK_FRAME(r1)
+ /* r0 is saved first so it can be used as scratch for the SPR saves */
+ SAVE_GPR(0)
+ mflr r0
+ std r0,LR_SAVE(r1)
+ mfcr r0
+ std r0,CCR_SAVE(r1)
+ mfctr r0
+ std r0,CTR_SAVE(r1)
+ mfxer r0
+ std r0,XER_SAVE(r1)
+ /* r1 is the stack pointer itself, so the GPR saves skip it */
+ SAVE_GPR(2)
+ SAVE_GPR(3)
+ SAVE_GPR(4)
+ SAVE_GPR(5)
+ SAVE_GPR(6)
+ SAVE_GPR(7)
+ SAVE_GPR(8)
+ SAVE_GPR(9)
+ SAVE_GPR(10)
+ SAVE_GPR(11)
+ SAVE_GPR(12)
+ SAVE_GPR(13)
+ SAVE_GPR(14)
+ SAVE_GPR(15)
+ SAVE_GPR(16)
+ SAVE_GPR(17)
+ SAVE_GPR(18)
+ SAVE_GPR(19)
+ SAVE_GPR(20)
+ SAVE_GPR(21)
+ SAVE_GPR(22)
+ SAVE_GPR(23)
+ SAVE_GPR(24)
+ SAVE_GPR(25)
+ SAVE_GPR(26)
+ SAVE_GPR(27)
+ SAVE_GPR(28)
+ SAVE_GPR(29)
+ SAVE_GPR(30)
+ SAVE_GPR(31)
+ /*
+ * r3 was saved above, so SAVE_VSR can use it for the slot offsets. VSR 0 is
+ * stored before f0 is used as scratch for the two status words below.
+ */
+ SAVE_VSR(0, r3)
+ mffs f0
+ stfd f0, FSCR_SAVE(r1)
+ /*
+ * NOTE(review): mfvscr takes a vector register, so "f0" here presumably
+ * assembles as v0 (VSR 32), which has not been saved yet, while the stfd
+ * below stores FPR 0 - confirm that VSCR really reaches VSCR_SAVE.
+ */
+ mfvscr f0
+ stfd f0, VSCR_SAVE(r1)
+ SAVE_VSR(1, r3)
+ SAVE_VSR(2, r3)
+ SAVE_VSR(3, r3)
+ SAVE_VSR(4, r3)
+ SAVE_VSR(5, r3)
+ SAVE_VSR(6, r3)
+ SAVE_VSR(7, r3)
+ SAVE_VSR(8, r3)
+ SAVE_VSR(9, r3)
+ SAVE_VSR(10, r3)
+ SAVE_VSR(11, r3)
+ SAVE_VSR(12, r3)
+ SAVE_VSR(13, r3)
+ SAVE_VSR(14, r3)
+ SAVE_VSR(15, r3)
+ SAVE_VSR(16, r3)
+ SAVE_VSR(17, r3)
+ SAVE_VSR(18, r3)
+ SAVE_VSR(19, r3)
+ SAVE_VSR(20, r3)
+ SAVE_VSR(21, r3)
+ SAVE_VSR(22, r3)
+ SAVE_VSR(23, r3)
+ SAVE_VSR(24, r3)
+ SAVE_VSR(25, r3)
+ SAVE_VSR(26, r3)
+ SAVE_VSR(27, r3)
+ SAVE_VSR(28, r3)
+ SAVE_VSR(29, r3)
+ SAVE_VSR(30, r3)
+ SAVE_VSR(31, r3)
+ SAVE_VSR(32, r3)
+ SAVE_VSR(33, r3)
+ SAVE_VSR(34, r3)
+ SAVE_VSR(35, r3)
+ SAVE_VSR(36, r3)
+ SAVE_VSR(37, r3)
+ SAVE_VSR(38, r3)
+ SAVE_VSR(39, r3)
+ SAVE_VSR(40, r3)
+ SAVE_VSR(41, r3)
+ SAVE_VSR(42, r3)
+ SAVE_VSR(43, r3)
+ SAVE_VSR(44, r3)
+ SAVE_VSR(45, r3)
+ SAVE_VSR(46, r3)
+ SAVE_VSR(47, r3)
+ SAVE_VSR(48, r3)
+ SAVE_VSR(49, r3)
+ SAVE_VSR(50, r3)
+ SAVE_VSR(51, r3)
+ SAVE_VSR(52, r3)
+ SAVE_VSR(53, r3)
+ SAVE_VSR(54, r3)
+ SAVE_VSR(55, r3)
+ SAVE_VSR(56, r3)
+ SAVE_VSR(57, r3)
+ SAVE_VSR(58, r3)
+ SAVE_VSR(59, r3)
+ SAVE_VSR(60, r3)
+ SAVE_VSR(61, r3)
+ SAVE_VSR(62, r3)
+ SAVE_VSR(63, r3)
+
+ /*
+ * Overwrite every GPR except r0, r1 and r13 with the TRASH_GPR pattern;
+ * presumably so that a register that is not restored shows up.
+ */
+ TRASH_GPR(2)
+ TRASH_GPR(3)
+ TRASH_GPR(4)
+ TRASH_GPR(5)
+ TRASH_GPR(6)
+ TRASH_GPR(7)
+ TRASH_GPR(8)
+ TRASH_GPR(9)
+ TRASH_GPR(10)
+ TRASH_GPR(11)
+ TRASH_GPR(12)
+ TRASH_GPR(14)
+ TRASH_GPR(15)
+ TRASH_GPR(16)
+ TRASH_GPR(17)
+ TRASH_GPR(18)
+ TRASH_GPR(19)
+ TRASH_GPR(20)
+ TRASH_GPR(21)
+ TRASH_GPR(22)
+ TRASH_GPR(23)
+ TRASH_GPR(24)
+ TRASH_GPR(25)
+ TRASH_GPR(26)
+ TRASH_GPR(27)
+ TRASH_GPR(28)
+ TRASH_GPR(29)
+ TRASH_GPR(30)
+ TRASH_GPR(31)
+
+ /* r2 was just trashed; rebuild the TOC pointer before calling into C */
+ RESTORE_TOC(ebb_handler)
+
+ /*
+ * r13 is our TLS pointer. We leave whatever value was in there when the
+ * EBB fired. That seems to be OK because once set the TLS pointer is not
+ * changed - but presumably that could change in future.
+ */
+
+ bl ebb_hook
+ nop
+
+ /* r2 may be changed here but we don't care */
+
+ /* Restore in roughly the reverse order: status words, VSRs, SPRs, GPRs */
+ lfd f0, FSCR_SAVE(r1)
+ mtfsf 0xff,f0
+ lfd f0, VSCR_SAVE(r1)
+ /* NOTE(review): same register-class question as the mfvscr above */
+ mtvscr f0
+ LOAD_VSR(0, r3)
+ LOAD_VSR(1, r3)
+ LOAD_VSR(2, r3)
+ LOAD_VSR(3, r3)
+ LOAD_VSR(4, r3)
+ LOAD_VSR(5, r3)
+ LOAD_VSR(6, r3)
+ LOAD_VSR(7, r3)
+ LOAD_VSR(8, r3)
+ LOAD_VSR(9, r3)
+ LOAD_VSR(10, r3)
+ LOAD_VSR(11, r3)
+ LOAD_VSR(12, r3)
+ LOAD_VSR(13, r3)
+ LOAD_VSR(14, r3)
+ LOAD_VSR(15, r3)
+ LOAD_VSR(16, r3)
+ LOAD_VSR(17, r3)
+ LOAD_VSR(18, r3)
+ LOAD_VSR(19, r3)
+ LOAD_VSR(20, r3)
+ LOAD_VSR(21, r3)
+ LOAD_VSR(22, r3)
+ LOAD_VSR(23, r3)
+ LOAD_VSR(24, r3)
+ LOAD_VSR(25, r3)
+ LOAD_VSR(26, r3)
+ LOAD_VSR(27, r3)
+ LOAD_VSR(28, r3)
+ LOAD_VSR(29, r3)
+ LOAD_VSR(30, r3)
+ LOAD_VSR(31, r3)
+ LOAD_VSR(32, r3)
+ LOAD_VSR(33, r3)
+ LOAD_VSR(34, r3)
+ LOAD_VSR(35, r3)
+ LOAD_VSR(36, r3)
+ LOAD_VSR(37, r3)
+ LOAD_VSR(38, r3)
+ LOAD_VSR(39, r3)
+ LOAD_VSR(40, r3)
+ LOAD_VSR(41, r3)
+ LOAD_VSR(42, r3)
+ LOAD_VSR(43, r3)
+ LOAD_VSR(44, r3)
+ LOAD_VSR(45, r3)
+ LOAD_VSR(46, r3)
+ LOAD_VSR(47, r3)
+ LOAD_VSR(48, r3)
+ LOAD_VSR(49, r3)
+ LOAD_VSR(50, r3)
+ LOAD_VSR(51, r3)
+ LOAD_VSR(52, r3)
+ LOAD_VSR(53, r3)
+ LOAD_VSR(54, r3)
+ LOAD_VSR(55, r3)
+ LOAD_VSR(56, r3)
+ LOAD_VSR(57, r3)
+ LOAD_VSR(58, r3)
+ LOAD_VSR(59, r3)
+ LOAD_VSR(60, r3)
+ LOAD_VSR(61, r3)
+ LOAD_VSR(62, r3)
+ LOAD_VSR(63, r3)
+
+ /* SPRs go back through r0, which is itself restored right after */
+ ld r0,XER_SAVE(r1)
+ mtxer r0
+ ld r0,CTR_SAVE(r1)
+ mtctr r0
+ ld r0,LR_SAVE(r1)
+ mtlr r0
+ ld r0,CCR_SAVE(r1)
+ mtcr r0
+ REST_GPR(0)
+ REST_GPR(2)
+ REST_GPR(3)
+ REST_GPR(4)
+ REST_GPR(5)
+ REST_GPR(6)
+ REST_GPR(7)
+ REST_GPR(8)
+ REST_GPR(9)
+ REST_GPR(10)
+ REST_GPR(11)
+ REST_GPR(12)
+ REST_GPR(13)
+ REST_GPR(14)
+ REST_GPR(15)
+ REST_GPR(16)
+ REST_GPR(17)
+ REST_GPR(18)
+ REST_GPR(19)
+ REST_GPR(20)
+ REST_GPR(21)
+ REST_GPR(22)
+ REST_GPR(23)
+ REST_GPR(24)
+ REST_GPR(25)
+ REST_GPR(26)
+ REST_GPR(27)
+ REST_GPR(28)
+ REST_GPR(29)
+ REST_GPR(30)
+ REST_GPR(31)
+ /* Pop our frame, then return via the hand-encoded RFEBB word */
+ addi r1,r1,STACK_FRAME
+ RFEBB
+FUNC_END(ebb_handler)
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_child_test.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_child_test.c
new file mode 100644
index 000000000..8980f054d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_child_test.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests we can setup an EBB on our child. Nothing interesting happens, because
+ * even though the event is enabled and running the child hasn't enabled the
+ * actual delivery of the EBBs.
+ */
+
+/*
+ * Child side of the test. Handshakes twice with the parent over the pipes
+ * (the parent creates and enables the EBB event in between), writes PMC1 and
+ * then spins for a while with the event enabled.
+ */
+static int victim_child(union pipe read_pipe, union pipe write_pipe)
+{
+	/* volatile so the compiler cannot delete the empty delay loop below */
+	volatile int i;
+
+	FAIL_IF(wait_for_parent(read_pipe));
+	FAIL_IF(notify_parent(write_pipe));
+
+	/* Parent creates EBB event */
+
+	FAIL_IF(wait_for_parent(read_pipe));
+	FAIL_IF(notify_parent(write_pipe));
+
+	/* Check the EBB is enabled by writing PMC1 */
+	write_pmc1();
+
+	/* EBB event is enabled here */
+	for (i = 0; i < 1000000; i++)
+		;
+
+	return 0;
+}
+
+/*
+ * Parent side of the test: fork a child, open an EBB "cycles" event on the
+ * child's pid, enable it, and check that the child exits cleanly.
+ */
+int ebb_on_child(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	/* Without a child there is nobody to sync with or attach the event to */
+	FAIL_IF(pid == -1);
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(victim_child(write_pipe, read_pipe));
+	}
+
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	/* Child is running now */
+
+	event_init_named(&event, 0x1001e, "cycles");
+	event_leader_ebb_init(&event);
+
+	event.attr.exclude_kernel = 1;
+	event.attr.exclude_hv = 1;
+	event.attr.exclude_idle = 1;
+
+	FAIL_IF(event_open_with_pid(&event, pid));
+	FAIL_IF(ebb_event_enable(&event));
+
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	/* Child should just exit happily */
+	FAIL_IF(wait_for_child(pid));
+
+	event_close(&event);
+
+	return 0;
+}
+
+/* Run ebb_on_child under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(ebb_on_child, "ebb_on_child");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c
new file mode 100644
index 000000000..b208bf6ad
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests we can setup an EBB on our child. The child expects this and enables
+ * EBBs, which are then delivered to the child, even though the event is
+ * created by the parent.
+ */
+
+/*
+ * Child side of the test: once the parent says go, install an EBB handler
+ * and enable EBBs, tell the parent we are ready, then spin until 20 EBBs have
+ * been counted.
+ */
+static int victim_child(union pipe read_pipe, union pipe write_pipe)
+{
+ FAIL_IF(wait_for_parent(read_pipe));
+
+ /* Setup our EBB handler, before the EBB event is created */
+ ebb_enable_pmc_counting(1);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(notify_parent(write_pipe));
+
+ /* ebb_count is volatile and is advanced from the EBB callee */
+ while (ebb_state.stats.ebb_count < 20) {
+ FAIL_IF(core_busy_loop());
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ return 0;
+}
+
+/*
+ * Tests we can set up an EBB on our child - if it's expecting it.
+ *
+ * The parent forks the child, waits for it to install its EBB handler, then
+ * opens and enables an EBB "cycles" event on the child's pid and waits for
+ * the child to exit.
+ */
+int ebb_on_willing_child(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	/* Without a child there is nobody to sync with or attach the event to */
+	FAIL_IF(pid == -1);
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(victim_child(write_pipe, read_pipe));
+	}
+
+	/* Signal the child to setup its EBB handler */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	/* Child is running now */
+
+	event_init_named(&event, 0x1001e, "cycles");
+	event_leader_ebb_init(&event);
+
+	event.attr.exclude_kernel = 1;
+	event.attr.exclude_hv = 1;
+	event.attr.exclude_idle = 1;
+
+	FAIL_IF(event_open_with_pid(&event, pid));
+	FAIL_IF(ebb_event_enable(&event));
+
+	/* Child should now take EBBs and then exit */
+	FAIL_IF(wait_for_child(pid));
+
+	event_close(&event);
+
+	return 0;
+}
+
+/* Run ebb_on_willing_child under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(ebb_on_willing_child, "ebb_on_willing_child");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/ebb_vs_cpu_event_test.c b/tools/testing/selftests/powerpc/pmu/ebb/ebb_vs_cpu_event_test.c
new file mode 100644
index 000000000..4d822cb35
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/ebb_vs_cpu_event_test.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests an EBB vs a cpu event - in that order. The EBB should force the cpu
+ * event off the PMU.
+ */
+
+/*
+ * Initialise, open and enable a PM_RUN_INST_CMPL event bound to 'cpu',
+ * excluding kernel, hypervisor and idle.
+ *
+ * Returns 0 on success. The caller treats any non-zero return as "skip or
+ * fail" and propagates it; presumably that is what SKIP_IF/FAIL_IF return.
+ */
+static int setup_cpu_event(struct event *event, int cpu)
+{
+ event_init_named(event, 0x400FA, "PM_RUN_INST_CMPL");
+
+ event->attr.exclude_kernel = 1;
+ event->attr.exclude_hv = 1;
+ event->attr.exclude_idle = 1;
+
+ SKIP_IF(require_paranoia_below(1));
+ FAIL_IF(event_open_with_cpu(event, cpu));
+ FAIL_IF(event_enable(event));
+
+ return 0;
+}
+
+/*
+ * Bind to one CPU, let a forked child install its EBB event there, then try
+ * to add a per-cpu event on the same CPU and check that the cpu event did not
+ * get the PMU for the whole time it was enabled.
+ */
+int ebb_vs_cpu_event(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	int cpu, rc;
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+	FAIL_IF(bind_to_cpu(cpu));
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	/* Without a child there is nobody to sync with, kill or wait for */
+	FAIL_IF(pid == -1);
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(ebb_child(write_pipe, read_pipe));
+	}
+
+	/* Signal the child to install its EBB event and wait */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	/* Now try to install our CPU event */
+	rc = setup_cpu_event(&event, cpu);
+	if (rc) {
+		/* The child is blocked waiting for us, so reap it before leaving */
+		kill_child_and_wait(pid);
+		return rc;
+	}
+
+	/* Signal the child to run */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+	/* .. and wait for it to complete */
+	FAIL_IF(wait_for_child(pid));
+	FAIL_IF(event_disable(&event));
+	FAIL_IF(event_read(&event));
+
+	event_report(&event);
+
+	/* The cpu event may have run, but we don't expect 100% */
+	FAIL_IF(event.result.enabled >= event.result.running);
+
+	return 0;
+}
+
+/* Run ebb_vs_cpu_event under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(ebb_vs_cpu_event, "ebb_vs_cpu_event");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/event_attributes_test.c b/tools/testing/selftests/powerpc/pmu/ebb/event_attributes_test.c
new file mode 100644
index 000000000..6e6dd0bce
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/event_attributes_test.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test various attributes of the EBB event are enforced.
+ *
+ * Each case starts from a freshly initialised event, applies the EBB
+ * defaults with event_leader_ebb_init() / event_ebb_init(), changes one
+ * thing, and checks that opening the event succeeds or fails as stated in
+ * the comment above the check.
+ */
+int event_attributes(void)
+{
+ struct event event, leader;
+
+ SKIP_IF(!ebb_is_supported());
+
+ /* Baseline: an unmodified EBB leader event */
+ event_init(&event, 0x1001e);
+ event_leader_ebb_init(&event);
+ /* Expected to succeed */
+ FAIL_IF(event_open(&event));
+ event_close(&event);
+
+
+ event_init(&event, 0x001e); /* CYCLES - no PMC specified */
+ event_leader_ebb_init(&event);
+ /* Expected to fail, no PMC specified */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ event_init(&event, 0x2001e);
+ event_leader_ebb_init(&event);
+ event.attr.exclusive = 0;
+ /* Expected to fail, not exclusive */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ event_init(&event, 0x3001e);
+ event_leader_ebb_init(&event);
+ event.attr.freq = 1;
+ /* Expected to fail, sets freq */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ event_init(&event, 0x4001e);
+ event_leader_ebb_init(&event);
+ event.attr.sample_period = 1;
+ /* Expected to fail, sets sample_period */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ event_init(&event, 0x1001e);
+ event_leader_ebb_init(&event);
+ event.attr.enable_on_exec = 1;
+ /* Expected to fail, sets enable_on_exec */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ event_init(&event, 0x1001e);
+ event_leader_ebb_init(&event);
+ event.attr.inherit = 1;
+ /* Expected to fail, sets inherit */
+ FAIL_IF(event_open(&event) == 0);
+
+
+ /* Group cases: an EBB leader plus a second event opened into its group */
+ event_init(&leader, 0x1001e);
+ event_leader_ebb_init(&leader);
+ FAIL_IF(event_open(&leader));
+
+ event_init(&event, 0x20002);
+ event_ebb_init(&event);
+
+ /* Expected to succeed */
+ FAIL_IF(event_open_with_group(&event, leader.fd));
+ event_close(&leader);
+ event_close(&event);
+
+
+ event_init(&leader, 0x1001e);
+ event_leader_ebb_init(&leader);
+ FAIL_IF(event_open(&leader));
+
+ /* No event_ebb_init() here: the member deliberately does not ask for EBB */
+ event_init(&event, 0x20002);
+
+ /* Expected to fail, event doesn't request EBB, leader does */
+ FAIL_IF(event_open_with_group(&event, leader.fd) == 0);
+ event_close(&leader);
+
+
+ event_init(&leader, 0x1001e);
+ event_leader_ebb_init(&leader);
+ /* Clear the EBB flag */
+ leader.attr.config &= ~(1ull << 63);
+
+ FAIL_IF(event_open(&leader));
+
+ event_init(&event, 0x20002);
+ event_ebb_init(&event);
+
+ /* Expected to fail, leader doesn't request EBB */
+ FAIL_IF(event_open_with_group(&event, leader.fd) == 0);
+ event_close(&leader);
+
+
+ event_init(&leader, 0x1001e);
+ event_leader_ebb_init(&leader);
+ leader.attr.exclusive = 0;
+ /* Expected to fail, leader isn't exclusive */
+ FAIL_IF(event_open(&leader) == 0);
+
+
+ event_init(&leader, 0x1001e);
+ event_leader_ebb_init(&leader);
+ leader.attr.pinned = 0;
+ /* Expected to fail, leader isn't pinned */
+ FAIL_IF(event_open(&leader) == 0);
+
+ event_init(&event, 0x1001e);
+ event_leader_ebb_init(&event);
+ /* Expected to fail, not a task event */
+ SKIP_IF(require_paranoia_below(1));
+ FAIL_IF(event_open_with_cpu(&event, 0) == 0);
+
+ return 0;
+}
+
+/* Run event_attributes under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(event_attributes, "event_attributes");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/fixed_instruction_loop.S b/tools/testing/selftests/powerpc/pmu/ebb/fixed_instruction_loop.S
new file mode 100644
index 000000000..08a7b5f13
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/fixed_instruction_loop.S
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <ppc-asm.h>
+
+ .text
+
+FUNC_START(thirty_two_instruction_loop)
+ cmpwi r3,0
+ beqlr
+ addi r4,r3,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1 # 28 addi's
+ subi r3,r3,1
+ b FUNC_NAME(thirty_two_instruction_loop)
+FUNC_END(thirty_two_instruction_loop)
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
new file mode 100644
index 000000000..2b25b5545
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <setjmp.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test that a fork clears the PMU state of the child. eg. BESCR/EBBHR/EBBRR
+ * are cleared, and MMCR0_PMCC is reset, preventing the child from accessing
+ * the PMU.
+ */
+
+static struct event event;
+
+/*
+ * Runs in the forked child: checks that the EBB registers read back as zero,
+ * runs write_pmc1 under catch_sigill(), and checks that the event inherited
+ * from the parent can still be read.
+ */
+static int child(void)
+{
+ /* Even though we have EBE=0 we can still see the EBB regs */
+ FAIL_IF(mfspr(SPRN_BESCR) != 0);
+ FAIL_IF(mfspr(SPRN_EBBHR) != 0);
+ FAIL_IF(mfspr(SPRN_EBBRR) != 0);
+
+ /* NOTE(review): presumably catch_sigill() returns 0 when the write faults - confirm */
+ FAIL_IF(catch_sigill(write_pmc1));
+
+ /* We can still read from the event, though it is on our parent */
+ FAIL_IF(event_read(&event));
+
+ return 0;
+}
+
+/*
+ * Tests that fork clears EBB state.
+ *
+ * The parent sets up a fully enabled EBB event and handler, freezes the
+ * counters, loads PMC1, and then forks; all the checks run in the child.
+ */
+int fork_cleanup(void)
+{
+	pid_t pid;
+
+	SKIP_IF(!ebb_is_supported());
+
+	event_init_named(&event, 0x1001e, "cycles");
+	event_leader_ebb_init(&event);
+
+	FAIL_IF(event_open(&event));
+
+	ebb_enable_pmc_counting(1);
+	setup_ebb_handler(standard_ebb_callee);
+	ebb_global_enable();
+
+	FAIL_IF(ebb_event_enable(&event));
+
+	mtspr(SPRN_MMCR0, MMCR0_FC);
+	mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+	/* Don't need to actually take any EBBs */
+
+	pid = fork();
+	/* The child does all the checking, so a failed fork is a failed test */
+	FAIL_IF(pid == -1);
+	if (pid == 0)
+		exit(child());
+
+	/* Child does the actual testing */
+	FAIL_IF(wait_for_child(pid));
+
+	/* After fork */
+	event_close(&event);
+
+	return 0;
+}
+
+/* Run fork_cleanup under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(fork_cleanup, "fork_cleanup");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/instruction_count_test.c b/tools/testing/selftests/powerpc/pmu/ebb/instruction_count_test.c
new file mode 100644
index 000000000..eed338b18
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/instruction_count_test.c
@@ -0,0 +1,167 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/prctl.h>
+
+#include "ebb.h"
+
+
+/*
+ * Run a calibrated instruction loop and count instructions executed using
+ * EBBs. Make sure the counts look right.
+ */
+
+extern void thirty_two_instruction_loop(uint64_t loops);
+
+static bool counters_frozen = true;
+
+/*
+ * Run the calibrated loop for 'instructions' instructions with the counters
+ * unfrozen, then compare what was counted on PMC4 with
+ * 'instructions + overhead'.
+ *
+ * The measured count is always left in event->result.value, so callers may
+ * use it even when they ignore the return value. Returns 0 when the count is
+ * within tolerance and -1 otherwise. The figures are printed when 'report'
+ * is true.
+ */
+static int do_count_loop(struct event *event, uint64_t instructions,
+			 uint64_t overhead, bool report)
+{
+	int64_t difference, expected;
+	double percentage;
+
+	clear_ebb_stats();
+
+	/* Let the EBB callee know the counters are live before unfreezing */
+	counters_frozen = false;
+	mb();
+	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+
+	/* One pass of the loop is 32 instructions, hence the shift by 5 */
+	thirty_two_instruction_loop(instructions >> 5);
+
+	counters_frozen = true;
+	mb();
+	mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+
+	count_pmc(4, sample_period);
+
+	event->result.value = ebb_state.stats.pmc_count[PMC_INDEX(4)];
+	expected = instructions + overhead;
+	difference = event->result.value - expected;
+	percentage = (double)difference / event->result.value * 100;
+
+	if (report) {
+		printf("Looped for %lu instructions, overhead %lu\n", instructions, overhead);
+		printf("Expected %lu\n", expected);
+		printf("Actual   %llu\n", event->result.value);
+		printf("Delta    %ld, %f%%\n", difference, percentage);
+		printf("Took %d EBBs\n", ebb_state.stats.ebb_count);
+	}
+
+	if (difference < 0)
+		difference = -difference;
+
+	/*
+	 * A zero count cannot be used as a divisor: it only matches when
+	 * nothing was expected either.
+	 */
+	if (event->result.value == 0)
+		return difference ? -1 : 0;
+
+	/* Tolerate a difference of up to 0.0001 % */
+	difference *= 10000 * 100;
+	if (difference / event->result.value)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Count how many instructions it takes to do a null loop.
+ *
+ * Takes one measurement as the starting point, then repeats it 100 times and
+ * keeps the smallest count seen.
+ */
+static uint64_t determine_overhead(struct event *event)
+{
+	uint64_t best, sample;
+	int runs_left;
+
+	do_count_loop(event, 0, 0, false);
+	best = event->result.value;
+
+	for (runs_left = 100; runs_left > 0; runs_left--) {
+		do_count_loop(event, 0, 0, false);
+		sample = event->result.value;
+		if (sample >= best)
+			continue;
+
+		printf("Replacing overhead %lu with %lu\n", best, sample);
+		best = sample;
+	}
+
+	return best;
+}
+
+/*
+ * EBB callee for the instruction count test. A call with BESCR_PMEO set is
+ * counted as an EBB and folded into the PMC4 total; any other call is counted
+ * as spurious. Either way the EBB is re-armed, additionally passing
+ * MMCR0_PMAO as the clear mask while the test has the counters frozen.
+ */
+static void pmc4_ebb_callee(void)
+{
+	uint64_t bescr = mfspr(SPRN_BESCR);
+
+	if (bescr & BESCR_PMEO) {
+		ebb_state.stats.ebb_count++;
+		count_pmc(4, sample_period);
+	} else {
+		ebb_state.stats.spurious++;
+	}
+
+	if (counters_frozen)
+		reset_ebb_with_clear_mask(MMCR0_PMAO);
+	else
+		reset_ebb();
+}
+
+/*
+ * Open an EBB PM_RUN_INST_CMPL event, measure the overhead of an empty run,
+ * then run the calibrated loop for increasing instruction counts and fail if
+ * any run is outside the tolerance checked by do_count_loop().
+ */
+int instruction_count(void)
+{
+ struct event event;
+ uint64_t overhead;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x400FA, "PM_RUN_INST_CMPL");
+ event_leader_ebb_init(&event);
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+ FAIL_IF(ebb_event_enable(&event));
+
+ sample_period = COUNTER_OVERFLOW;
+
+ setup_ebb_handler(pmc4_ebb_callee);
+ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+ ebb_global_enable();
+
+ overhead = determine_overhead(&event);
+ printf("Overhead of null loop: %lu instructions\n", overhead);
+
+ /* The sizes below are binary multiples: "1M" is 0x100000 = 2^20, etc. */
+
+ /* Run for 1M instructions */
+ FAIL_IF(do_count_loop(&event, 0x100000, overhead, true));
+
+ /* Run for 10M instructions */
+ FAIL_IF(do_count_loop(&event, 0xa00000, overhead, true));
+
+ /* Run for 100M instructions */
+ FAIL_IF(do_count_loop(&event, 0x6400000, overhead, true));
+
+ /* Run for 1G instructions */
+ FAIL_IF(do_count_loop(&event, 0x40000000, overhead, true));
+
+ /* Run for 16G instructions */
+ FAIL_IF(do_count_loop(&event, 0x400000000, overhead, true));
+
+ /* Run for 64G instructions */
+ FAIL_IF(do_count_loop(&event, 0x1000000000, overhead, true));
+
+ /* Run for 128G instructions */
+ FAIL_IF(do_count_loop(&event, 0x2000000000, overhead, true));
+
+ ebb_global_disable();
+ event_close(&event);
+
+ printf("Finished OK\n");
+
+ return 0;
+}
+
+/*
+ * Set the harness timeout to 300 (the long runs need it), then run
+ * instruction_count under the harness and exit with its result.
+ */
+int main(void)
+{
+	int rc;
+
+	test_harness_set_timeout(300);
+	rc = test_harness(instruction_count, "instruction_count");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c b/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c
new file mode 100644
index 000000000..ba2681a12
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test that tries to trigger CPU_FTR_PMAO_BUG, a hardware defect where an
+ * exception triggers but we context switch before it is delivered and lose
+ * the exception.
+ */
+
+/*
+ * Body of the test, run via eat_cpu(). Takes EBBs on PMC4 with a short,
+ * slowly varying sample period while calling sched_yield() in a tight loop,
+ * until one million EBBs have been counted, then checks the PMC4 count.
+ */
+static int test_body(void)
+{
+ int i, orig_period, max_period;
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ /* We use PMC4 to make sure the kernel switches all counters correctly */
+ event_init_named(&event, 0x40002, "instructions");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(4);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+ FAIL_IF(ebb_event_enable(&event));
+
+ /*
+ * We want a low sample period, but we also want to get out of the EBB
+ * handler without tripping up again.
+ *
+ * This value picked after much experimentation.
+ */
+ orig_period = max_period = sample_period = 400;
+
+ mtspr(SPRN_PMC4, pmc_sample_period(sample_period));
+
+ while (ebb_state.stats.ebb_count < 1000000) {
+ /*
+ * We are trying to get the EBB exception to race exactly with
+ * us entering the kernel to do the syscall. We then need the
+ * kernel to decide our timeslice is up and context switch to
+ * the other thread. When we come back our EBB will have been
+ * lost and we'll spin in this while loop forever.
+ */
+
+ for (i = 0; i < 100000; i++)
+ sched_yield();
+
+ /*
+ * Change the sample period slightly to try and hit the race.
+ * It walks from orig_period up to orig_period + 200 and wraps.
+ */
+ if (sample_period >= (orig_period + 200))
+ sample_period = orig_period;
+ else
+ sample_period++;
+
+ /* Remember the largest period used, for the fudge factor below */
+ if (sample_period > max_period)
+ max_period = sample_period;
+ }
+
+ ebb_freeze_pmcs();
+ ebb_global_disable();
+
+ mtspr(SPRN_PMC4, 0xdead);
+
+ dump_summary_ebb_state();
+ dump_ebb_hw_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ /* We vary our sample period so we need extra fudge here */
+ FAIL_IF(!ebb_check_count(4, orig_period, 2 * (max_period - orig_period)));
+
+ return 0;
+}
+
+/* Run test_body through eat_cpu() and hand back its result. */
+static int lost_exception(void)
+{
+	int rc = eat_cpu(test_body);
+
+	return rc;
+}
+
+/*
+ * Set the harness timeout to 300, then run lost_exception under the harness
+ * and exit with its result.
+ */
+int main(void)
+{
+	int rc;
+
+	test_harness_set_timeout(300);
+	rc = test_harness(lost_exception, "lost_exception");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c b/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c
new file mode 100644
index 000000000..791d37ba3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test counting multiple events using EBBs.
+ *
+ * Six events are opened as one group led by events[0], counting is enabled
+ * on PMC1..PMC6, and the test spins until 50 EBBs have been taken.
+ */
+int multi_counter(void)
+{
+	struct event events[6];
+	int group_fd = -1;
+	int i, pmc;
+
+	SKIP_IF(!ebb_is_supported());
+
+	event_init_named(&events[0], 0x1001C, "PM_CMPLU_STALL_THRD");
+	event_init_named(&events[1], 0x2D016, "PM_CMPLU_STALL_FXU");
+	event_init_named(&events[2], 0x30006, "PM_CMPLU_STALL_OTHER_CMPL");
+	event_init_named(&events[3], 0x4000A, "PM_CMPLU_STALL");
+	event_init_named(&events[4], 0x600f4, "PM_RUN_CYC");
+	event_init_named(&events[5], 0x500fa, "PM_RUN_INST_CMPL");
+
+	/* events[0] leads the group, the other five are plain EBB members */
+	event_leader_ebb_init(&events[0]);
+	i = 1;
+	while (i < 6) {
+		event_ebb_init(&events[i]);
+		i++;
+	}
+
+	for (i = 0; i < 6; i++) {
+		events[i].attr.exclude_kernel = 1;
+		events[i].attr.exclude_hv = 1;
+		events[i].attr.exclude_idle = 1;
+
+		FAIL_IF(event_open_with_group(&events[i], group_fd));
+
+		/* Once the leader is open, everything else joins its group */
+		if (group_fd == -1)
+			group_fd = events[0].fd;
+	}
+
+	/* Enable counting on PMC1 through PMC6 */
+	for (pmc = 1; pmc <= 6; pmc++)
+		ebb_enable_pmc_counting(pmc);
+	setup_ebb_handler(standard_ebb_callee);
+
+	FAIL_IF(ioctl(events[0].fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP));
+	FAIL_IF(event_read(&events[0]));
+
+	ebb_global_enable();
+
+	mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+	mtspr(SPRN_PMC2, pmc_sample_period(sample_period));
+	mtspr(SPRN_PMC3, pmc_sample_period(sample_period));
+	mtspr(SPRN_PMC4, pmc_sample_period(sample_period));
+	mtspr(SPRN_PMC5, pmc_sample_period(sample_period));
+	mtspr(SPRN_PMC6, pmc_sample_period(sample_period));
+
+	for (;;) {
+		if (ebb_state.stats.ebb_count >= 50)
+			break;
+
+		FAIL_IF(core_busy_loop());
+		FAIL_IF(ebb_check_mmcr0());
+	}
+
+	ebb_global_disable();
+	ebb_freeze_pmcs();
+
+	dump_ebb_state();
+
+	for (i = 0; i < 6; i++)
+		event_close(&events[i]);
+
+	FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+	return 0;
+}
+
+/* Run multi_counter under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(multi_counter, "multi_counter");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c b/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c
new file mode 100644
index 000000000..9b0f70d59
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test running multiple EBB-using processes at once on a single CPU. They
+ * should all run happily without interfering with each other.
+ */
+
+/*
+ * Set from the SIGINT handler to tell the child's busy loop to finish.
+ * volatile sig_atomic_t is the type that may safely be written from a signal
+ * handler and read from the interrupted code.
+ */
+static volatile sig_atomic_t child_should_exit;
+
+/* SIGINT handler: only raises the exit flag. */
+static void sigint_handler(int signal)
+{
+	child_should_exit = 1;
+}
+
+struct sigaction sigint_action = {
+	.sa_handler = sigint_handler,
+};
+
+/*
+ * Body of each forked child: install the SIGINT handler, open and enable an
+ * EBB "cycles" event on PMC1, and take EBBs in a busy loop until SIGINT sets
+ * child_should_exit. Fails if no EBB was taken at all.
+ */
+static int cycles_child(void)
+{
+ struct event event;
+
+ if (sigaction(SIGINT, &sigint_action, NULL)) {
+ perror("sigaction");
+ return 1;
+ }
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ ebb_enable_pmc_counting(1);
+ setup_ebb_handler(standard_ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ /* Runs until the parent sends SIGINT */
+ while (!child_should_exit) {
+ FAIL_IF(core_busy_loop());
+ FAIL_IF(ebb_check_mmcr0());
+ }
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_summary_ebb_state();
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+
+ return 0;
+}
+
+#define NR_CHILDREN 4
+
+/*
+ * Bind to one CPU, fork NR_CHILDREN children that each take EBBs there, let
+ * them run for ten seconds, then stop them with SIGINT and collect their exit
+ * statuses. Returns non-zero if any child failed or could not be forked.
+ */
+int multi_ebb_procs(void)
+{
+	pid_t pids[NR_CHILDREN];
+	int cpu, rc, i;
+
+	SKIP_IF(!ebb_is_supported());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+	FAIL_IF(bind_to_cpu(cpu));
+
+	for (i = 0; i < NR_CHILDREN; i++) {
+		pids[i] = fork();
+		if (pids[i] == 0)
+			exit(cycles_child());
+	}
+
+	/* Have them all run for "a while" */
+	sleep(10);
+
+	rc = 0;
+	for (i = 0; i < NR_CHILDREN; i++) {
+		/*
+		 * A failed fork() left -1 here. kill(-1, ...) would signal
+		 * every process we are allowed to signal, so never pass it
+		 * on; just record the failure.
+		 */
+		if (pids[i] < 0) {
+			rc |= 1;
+			continue;
+		}
+
+		/* Tell them to stop */
+		kill(pids[i], SIGINT);
+		/* And wait */
+		rc |= wait_for_child(pids[i]);
+	}
+
+	return rc;
+}
+
+/* Run multi_ebb_procs under the selftest harness and exit with its result. */
+int main(void)
+{
+	int rc = test_harness(multi_ebb_procs, "multi_ebb_procs");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c b/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c
new file mode 100644
index 000000000..01e827c31
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <setjmp.h>
+#include <signal.h>
+
+#include "ebb.h"
+
+
+/* Test that things work sanely if we have no handler */
+
+/*
+ * Enable a "cycles" EBB event without ever installing an EBB handler, let
+ * PMC1 overflow, and check the resulting MMCR0 value.
+ *
+ * Returns 0 on success.
+ */
+static int no_handler_test(void)
+{
+ struct event event;
+ u64 val;
+ int i;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ /* Don't count kernel, hypervisor or idle time */
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+ FAIL_IF(ebb_event_enable(&event));
+
+ /* No handler was installed, so the handler address register must be 0 */
+ val = mfspr(SPRN_EBBHR);
+ FAIL_IF(val != 0);
+
+ /* Make sure it overflows quickly */
+ sample_period = 1000;
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ /* Spin to make sure the event has time to overflow */
+ for (i = 0; i < 1000; i++)
+ mb();
+
+ dump_ebb_state();
+
+ /* We expect to see the PMU frozen & PMAO set */
+ val = mfspr(SPRN_MMCR0);
+ FAIL_IF(val != 0x0000000080000080);
+
+ event_close(&event);
+
+ /* The real test is that we never took an EBB at 0x0 */
+
+ return 0;
+}
+
+/* Entry point: run no_handler_test() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(no_handler_test,"no_handler_test");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c b/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c
new file mode 100644
index 000000000..2904c741e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test that the kernel properly handles PMAE across context switches.
+ *
+ * We test this by calling into the kernel inside our EBB handler, where PMAE
+ * is clear. A cpu eater companion thread is running on the same CPU as us to
+ * encourage the scheduler to switch us.
+ *
+ * The kernel must make sure that when it context switches us back in, it
+ * honours the fact that we had PMAE clear.
+ *
+ * Observed to hit the failing case on the first EBB with a broken kernel.
+ */
+
+static bool mmcr0_mismatch;
+static uint64_t before, after;
+
+/*
+ * EBB handler that calls sched_yield() while handling the EBB and records in
+ * mmcr0_mismatch whether MMCR0 read differently before and after the call.
+ */
+static void syscall_ebb_callee(void)
+{
+ uint64_t val;
+
+ /* Without BESCR_PMEO set, count this as spurious and skip the check */
+ val = mfspr(SPRN_BESCR);
+ if (!(val & BESCR_PMEO)) {
+ ebb_state.stats.spurious++;
+ goto out;
+ }
+
+ ebb_state.stats.ebb_count++;
+ count_pmc(1, sample_period);
+
+ before = mfspr(SPRN_MMCR0);
+
+ /* Try and get ourselves scheduled, to force a PMU context switch */
+ sched_yield();
+
+ after = mfspr(SPRN_MMCR0);
+ if (before != after)
+ mmcr0_mismatch = true;
+
+out:
+ reset_ebb();
+}
+
+/*
+ * Run a "cycles" EBB event with syscall_ebb_callee() as the handler until 20
+ * EBBs have been counted or an MMCR0 mismatch has been seen.
+ *
+ * Returns 0 on success; fails if no EBB was taken or a mismatch was recorded.
+ */
+static int test_body(void)
+{
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ event_init_named(&event, 0x1001e, "cycles");
+ event_leader_ebb_init(&event);
+
+ /* Don't count kernel, hypervisor or idle time */
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ setup_ebb_handler(syscall_ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC1, pmc_sample_period(sample_period));
+
+ /* Stop early as soon as the handler has seen a mismatch */
+ while (ebb_state.stats.ebb_count < 20 && !mmcr0_mismatch)
+ FAIL_IF(core_busy_loop());
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ if (mmcr0_mismatch)
+ printf("Saw MMCR0 before 0x%lx after 0x%lx\n", before, after);
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0);
+ FAIL_IF(mmcr0_mismatch);
+
+ return 0;
+}
+
+/* Run test_body() via eat_cpu() and return whatever eat_cpu() returns. */
+int pmae_handling(void)
+{
+ return eat_cpu(test_body);
+}
+
+/* Entry point: run pmae_handling() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(pmae_handling, "pmae_handling");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c b/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c
new file mode 100644
index 000000000..b29f8ba22
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test that PMC5 & 6 are frozen (ie. don't overflow) when they are not being
+ * used. Tests the MMCR0_FC56 logic in the kernel.
+ */
+
+static int pmc56_overflowed;
+
+/*
+ * EBB handler: counts the EBB against PMC2 and bumps pmc56_overflowed each
+ * time PMC5 or PMC6 is found at or above COUNTER_OVERFLOW.
+ */
+static void ebb_callee(void)
+{
+ uint64_t val;
+
+ /* Without BESCR_PMEO set, count this as spurious and skip the checks */
+ val = mfspr(SPRN_BESCR);
+ if (!(val & BESCR_PMEO)) {
+ ebb_state.stats.spurious++;
+ goto out;
+ }
+
+ ebb_state.stats.ebb_count++;
+ count_pmc(2, sample_period);
+
+ val = mfspr(SPRN_PMC5);
+ if (val >= COUNTER_OVERFLOW)
+ pmc56_overflowed++;
+
+ count_pmc(5, COUNTER_OVERFLOW);
+
+ val = mfspr(SPRN_PMC6);
+ if (val >= COUNTER_OVERFLOW)
+ pmc56_overflowed++;
+
+ count_pmc(6, COUNTER_OVERFLOW);
+
+out:
+ reset_ebb();
+}
+
+/*
+ * Run an EBB event on PMC2 for at least 10 EBBs with PMC5 and PMC6 zeroed,
+ * then fail if no EBB was taken or ebb_callee() ever saw PMC5/6 overflowed.
+ *
+ * Returns 0 on success.
+ */
+int pmc56_overflow(void)
+{
+ struct event event;
+
+ SKIP_IF(!ebb_is_supported());
+
+ /* Use PMC2 so we set PMCjCE, which enables PMC5/6 */
+ event_init(&event, 0x2001e);
+ event_leader_ebb_init(&event);
+
+ /* Don't count kernel, hypervisor or idle time */
+ event.attr.exclude_kernel = 1;
+ event.attr.exclude_hv = 1;
+ event.attr.exclude_idle = 1;
+
+ FAIL_IF(event_open(&event));
+
+ setup_ebb_handler(ebb_callee);
+ ebb_global_enable();
+
+ FAIL_IF(ebb_event_enable(&event));
+
+ mtspr(SPRN_PMC2, pmc_sample_period(sample_period));
+ mtspr(SPRN_PMC5, 0);
+ mtspr(SPRN_PMC6, 0);
+
+ while (ebb_state.stats.ebb_count < 10)
+ FAIL_IF(core_busy_loop());
+
+ ebb_global_disable();
+ ebb_freeze_pmcs();
+
+ dump_ebb_state();
+
+ printf("PMC5/6 overflow %d\n", pmc56_overflowed);
+
+ event_close(&event);
+
+ FAIL_IF(ebb_state.stats.ebb_count == 0 || pmc56_overflowed != 0);
+
+ return 0;
+}
+
+/* Entry point: run pmc56_overflow() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(pmc56_overflow, "pmc56_overflow");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
new file mode 100644
index 000000000..bd1ace9a0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ebb.h"
+
+
+/*
+ * Test basic access to the EBB regs, they should be user accessible with no
+ * kernel interaction required.
+ */
+/* Write a known value to BESCR and to EBBHR and check each reads back intact. */
+int reg_access(void)
+{
+ uint64_t val, expected;
+
+ SKIP_IF(!ebb_is_supported());
+
+ /* BESCR round trip */
+ expected = 0x8000000100000000ull;
+ mtspr(SPRN_BESCR, expected);
+ val = mfspr(SPRN_BESCR);
+
+ FAIL_IF(val != expected);
+
+ /* EBBHR round trip */
+ expected = 0x0000000001000000ull;
+ mtspr(SPRN_EBBHR, expected);
+ val = mfspr(SPRN_EBBHR);
+
+ FAIL_IF(val != expected);
+
+ return 0;
+}
+
+/* Entry point: run reg_access() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(reg_access, "reg_access");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/task_event_pinned_vs_ebb_test.c b/tools/testing/selftests/powerpc/pmu/ebb/task_event_pinned_vs_ebb_test.c
new file mode 100644
index 000000000..0aa2aefd3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/task_event_pinned_vs_ebb_test.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests a pinned per-task event vs an EBB - in that order. The pinned per-task
+ * event should prevent the EBB event from being enabled.
+ */
+
+/*
+ * Initialise, open and enable a pinned PM_RUN_INST_CMPL event attached to
+ * child_pid. Returns 0 on success.
+ */
+static int setup_child_event(struct event *event, pid_t child_pid)
+{
+ event_init_named(event, 0x400FA, "PM_RUN_INST_CMPL");
+
+ event->attr.pinned = 1;
+
+ /* Don't count kernel, hypervisor or idle time */
+ event->attr.exclude_kernel = 1;
+ event->attr.exclude_hv = 1;
+ event->attr.exclude_idle = 1;
+
+ FAIL_IF(event_open_with_pid(event, child_pid));
+ FAIL_IF(event_enable(event));
+
+ return 0;
+}
+
+/*
+ * Fork a child that runs ebb_child(), attach a pinned per-task event to it
+ * first, then let the child try to install its EBB event.
+ *
+ * The child is expected to exit with status 2 and the pinned event is expected
+ * to have counted. Returns 0 on success.
+ */
+int task_event_pinned_vs_ebb(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	pid_t pid;
+	int rc;
+
+	SKIP_IF(!ebb_is_supported());
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	if (pid < 0) {
+		/*
+		 * Bail out here: carrying -1 forward would end up in
+		 * kill_child_and_wait(-1), i.e. kill(-1, SIGTERM), which
+		 * signals every process we are permitted to signal.
+		 */
+		perror("fork");
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(ebb_child(write_pipe, read_pipe));
+	}
+
+	/* We setup the task event first */
+	rc = setup_child_event(&event, pid);
+	if (rc) {
+		kill_child_and_wait(pid);
+		return rc;
+	}
+
+	/* Signal the child to install its EBB event and wait */
+	if (sync_with_child(read_pipe, write_pipe))
+		/* If it fails, wait for it to exit */
+		goto wait;
+
+	/* Signal the child to run */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+wait:
+	/* We expect it to fail to read the event */
+	FAIL_IF(wait_for_child(pid) != 2);
+	FAIL_IF(event_disable(&event));
+	FAIL_IF(event_read(&event));
+
+	event_report(&event);
+
+	FAIL_IF(event.result.value == 0);
+	/*
+	 * For reasons I don't understand enabled is usually just slightly
+	 * lower than running. Would be good to confirm why.
+	 */
+	FAIL_IF(event.result.enabled == 0);
+	FAIL_IF(event.result.running == 0);
+
+	return 0;
+}
+
+/* Entry point: run task_event_pinned_vs_ebb() through test_harness(). */
+int main(void)
+{
+ return test_harness(task_event_pinned_vs_ebb, "task_event_pinned_vs_ebb");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/task_event_vs_ebb_test.c b/tools/testing/selftests/powerpc/pmu/ebb/task_event_vs_ebb_test.c
new file mode 100644
index 000000000..3e9d95ad9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/task_event_vs_ebb_test.c
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "ebb.h"
+
+
+/*
+ * Tests a per-task event vs an EBB - in that order. The EBB should push the
+ * per-task event off the PMU.
+ */
+
+/*
+ * Initialise, open and enable a (non-pinned) PM_RUN_INST_CMPL event attached
+ * to child_pid. Returns 0 on success.
+ */
+static int setup_child_event(struct event *event, pid_t child_pid)
+{
+ event_init_named(event, 0x400FA, "PM_RUN_INST_CMPL");
+
+ /* Don't count kernel, hypervisor or idle time */
+ event->attr.exclude_kernel = 1;
+ event->attr.exclude_hv = 1;
+ event->attr.exclude_idle = 1;
+
+ FAIL_IF(event_open_with_pid(event, child_pid));
+ FAIL_IF(event_enable(event));
+
+ return 0;
+}
+
+/*
+ * Fork a child that runs ebb_child(), attach a per-task event to it first,
+ * then let the child install and run its EBB event.
+ *
+ * The child is expected to exit successfully. Returns 0 on success.
+ */
+int task_event_vs_ebb(void)
+{
+	union pipe read_pipe, write_pipe;
+	struct event event;
+	pid_t pid;
+	int rc;
+
+	SKIP_IF(!ebb_is_supported());
+
+	FAIL_IF(pipe(read_pipe.fds) == -1);
+	FAIL_IF(pipe(write_pipe.fds) == -1);
+
+	pid = fork();
+	if (pid < 0) {
+		/*
+		 * Bail out here: carrying -1 forward would end up in
+		 * kill_child_and_wait(-1), i.e. kill(-1, SIGTERM), which
+		 * signals every process we are permitted to signal.
+		 */
+		perror("fork");
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* NB order of pipes looks reversed */
+		exit(ebb_child(write_pipe, read_pipe));
+	}
+
+	/* We setup the task event first */
+	rc = setup_child_event(&event, pid);
+	if (rc) {
+		kill_child_and_wait(pid);
+		return rc;
+	}
+
+	/* Signal the child to install its EBB event and wait */
+	if (sync_with_child(read_pipe, write_pipe))
+		/* If it fails, wait for it to exit */
+		goto wait;
+
+	/* Signal the child to run */
+	FAIL_IF(sync_with_child(read_pipe, write_pipe));
+
+wait:
+	/* The EBB event should push the task event off so the child should succeed */
+	FAIL_IF(wait_for_child(pid));
+	FAIL_IF(event_disable(&event));
+	FAIL_IF(event_read(&event));
+
+	event_report(&event);
+
+	/* The task event may have run, or not so we can't assert anything about it */
+
+	return 0;
+}
+
+/* Entry point: run task_event_vs_ebb() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(task_event_vs_ebb, "task_event_vs_ebb");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/trace.c b/tools/testing/selftests/powerpc/pmu/ebb/trace.c
new file mode 100644
index 000000000..0c59f66a6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/trace.c
@@ -0,0 +1,300 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#include "trace.h"
+
+
+/*
+ * Map an anonymous private region of 'size' bytes and set it up as an empty
+ * trace buffer. 'size' covers the header as well as the data area.
+ *
+ * Returns the buffer, or NULL if 'size' can't even hold the header or the
+ * mapping fails.
+ */
+struct trace_buffer *trace_buffer_allocate(u64 size)
+{
+	struct trace_buffer *buf;
+	void *mapping;
+
+	if (size < sizeof(struct trace_buffer)) {
+		fprintf(stderr, "Error: trace buffer too small\n");
+		return NULL;
+	}
+
+	mapping = mmap(NULL, size, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mapping == MAP_FAILED) {
+		perror("mmap");
+		return NULL;
+	}
+
+	/* Empty buffer: the next entry goes at the start of the data area */
+	buf = mapping;
+	buf->overflow = false;
+	buf->tail = buf->data;
+	buf->size = size;
+
+	return buf;
+}
+
+/* True if p lies strictly before the end of the buffer (tb + tb->size). */
+static bool trace_check_bounds(struct trace_buffer *tb, void *p)
+{
+ return p < ((void *)tb + tb->size);
+}
+
+/*
+ * Decide whether an allocation ending at p may go ahead.
+ *
+ * Overflow is sticky: once one allocation has failed nothing more is accepted,
+ * so the log simply stops at that point instead of ending up patchy (a large
+ * item dropped, later small ones kept). If you're overflowing, make your
+ * buffer bigger.
+ */
+static bool trace_check_alloc(struct trace_buffer *tb, void *p)
+{
+	if (tb->overflow)
+		return false;
+
+	if (trace_check_bounds(tb, p))
+		return true;
+
+	tb->overflow = true;
+	return false;
+}
+
+/*
+ * Carve 'bytes' bytes off the tail of the buffer. Returns the start of the
+ * reserved region, or NULL if trace_check_alloc() refuses it.
+ */
+static void *trace_alloc(struct trace_buffer *tb, int bytes)
+{
+	void *start = tb->tail;
+	void *end = start + bytes;
+
+	if (!trace_check_alloc(tb, end))
+		return NULL;
+
+	tb->tail = end;
+
+	return start;
+}
+
+/*
+ * Reserve an entry with payload_size bytes of payload and record that size in
+ * the entry header.
+ *
+ * Returns NULL if the buffer has no room, or if the payload is too large to be
+ * described by the entry's 8-bit length field.
+ */
+static struct trace_entry *trace_alloc_entry(struct trace_buffer *tb, int payload_size)
+{
+	struct trace_entry *e;
+
+	/*
+	 * e->length is a u8. A bigger payload would be stored with a truncated
+	 * length, and trace_buffer_print() would then step into the middle of
+	 * the payload and misread it as entries. Refuse it instead.
+	 */
+	if (payload_size < 0 || payload_size > 0xff)
+		return NULL;
+
+	e = trace_alloc(tb, sizeof(*e) + payload_size);
+	if (e)
+		e->length = payload_size;
+
+	return e;
+}
+
+/*
+ * Log a register number together with its value.
+ * Returns 0 on success or -ENOSPC if no entry could be allocated.
+ */
+int trace_log_reg(struct trace_buffer *tb, u64 reg, u64 value)
+{
+	struct trace_entry *entry;
+	u64 *payload;
+
+	entry = trace_alloc_entry(tb, sizeof(reg) + sizeof(value));
+	if (entry == NULL)
+		return -ENOSPC;
+
+	/* Payload layout: register number first, then its value */
+	payload = (u64 *)entry->data;
+	payload[0] = reg;
+	payload[1] = value;
+	entry->type = TRACE_TYPE_REG;
+
+	return 0;
+}
+
+/*
+ * Log a single counter value.
+ * Returns 0 on success or -ENOSPC if no entry could be allocated.
+ */
+int trace_log_counter(struct trace_buffer *tb, u64 value)
+{
+	struct trace_entry *entry;
+
+	entry = trace_alloc_entry(tb, sizeof(value));
+	if (entry == NULL)
+		return -ENOSPC;
+
+	*(u64 *)entry->data = value;
+	entry->type = TRACE_TYPE_COUNTER;
+
+	return 0;
+}
+
+/*
+ * Log a copy of str, including a terminating NUL so it can be printed
+ * straight out of the buffer.
+ * Returns 0 on success or -ENOSPC if no entry could be allocated.
+ */
+int trace_log_string(struct trace_buffer *tb, char *str)
+{
+	struct trace_entry *entry;
+	char *dst;
+	int n = strlen(str);
+
+	/* One extra byte for the terminator */
+	entry = trace_alloc_entry(tb, n + 1);
+	if (entry == NULL)
+		return -ENOSPC;
+
+	entry->type = TRACE_TYPE_STRING;
+
+	dst = (char *)entry->data;
+	memcpy(dst, str, n);
+	dst[n] = '\0';
+
+	return 0;
+}
+
+/*
+ * Log a payload-less marker that opens an indented section in the dump.
+ * Returns 0 on success or -ENOSPC if no entry could be allocated.
+ */
+int trace_log_indent(struct trace_buffer *tb)
+{
+	struct trace_entry *entry = trace_alloc_entry(tb, 0);
+
+	if (entry == NULL)
+		return -ENOSPC;
+
+	entry->type = TRACE_TYPE_INDENT;
+	return 0;
+}
+
+/*
+ * Log a payload-less marker that closes an indented section in the dump.
+ * Returns 0 on success or -ENOSPC if no entry could be allocated.
+ */
+int trace_log_outdent(struct trace_buffer *tb)
+{
+	struct trace_entry *entry = trace_alloc_entry(tb, 0);
+
+	if (entry == NULL)
+		return -ENOSPC;
+
+	entry->type = TRACE_TYPE_OUTDENT;
+	return 0;
+}
+
+/* Print 'prefix' columns of padding followed by the "[seq]: " tag. */
+static void trace_print_header(int seq, int prefix)
+{
+ printf("%*s[%d]: ", prefix, "", seq);
+}
+
+/*
+ * Map a register number to its SPRN_* name.
+ * Returns NULL for a number that isn't in the table.
+ */
+static char *trace_decode_reg(int reg)
+{
+	/* Each case returns directly, so no break statements are needed */
+	switch (reg) {
+	case 769: return "SPRN_MMCR2";
+	case 770: return "SPRN_MMCRA";
+	case 779: return "SPRN_MMCR0";
+	case 804: return "SPRN_EBBHR";
+	case 805: return "SPRN_EBBRR";
+	case 806: return "SPRN_BESCR";
+	case 800: return "SPRN_BESCRS";
+	case 801: return "SPRN_BESCRSU";
+	case 802: return "SPRN_BESCRR";
+	case 803: return "SPRN_BESCRRU";
+	case 771: return "SPRN_PMC1";
+	case 772: return "SPRN_PMC2";
+	case 773: return "SPRN_PMC3";
+	case 774: return "SPRN_PMC4";
+	case 775: return "SPRN_PMC5";
+	case 776: return "SPRN_PMC6";
+	case 780: return "SPRN_SIAR";
+	case 781: return "SPRN_SDAR";
+	case 768: return "SPRN_SIER";
+	}
+
+	return NULL;
+}
+
+/*
+ * Print a TRACE_TYPE_REG entry. The payload is the register number followed
+ * by its value; the number is shown by name when trace_decode_reg() knows it.
+ */
+static void trace_print_reg(struct trace_entry *e)
+{
+	u64 *p, *reg, *value;
+	char *name;
+
+	p = (u64 *)e->data;
+	reg = p++;
+	value = p;
+
+	name = trace_decode_reg(*reg);
+	if (name)
+		printf("register %-10s = 0x%016llx\n", name, *value);
+	else
+		/* u64 is unsigned, so use %llu as the rest of this file does */
+		printf("register %llu = 0x%016llx\n", *reg, *value);
+}
+
+/* Print a TRACE_TYPE_COUNTER entry, whose payload is a single u64 value. */
+static void trace_print_counter(struct trace_entry *e)
+{
+	u64 *value;
+
+	value = (u64 *)e->data;
+	/* %llu, not %lld: the value is unsigned and may exceed INT64_MAX */
+	printf("counter = %llu\n", *value);
+}
+
+/*
+ * Print a TRACE_TYPE_STRING entry. The payload is NUL terminated by
+ * trace_log_string(), so it can be handed to puts() directly.
+ */
+static void trace_print_string(struct trace_entry *e)
+{
+ char *str;
+
+ str = (char *)e->data;
+ puts(str);
+}
+
+#define BASE_PREFIX 2
+#define PREFIX_DELTA 8
+
+/*
+ * Print one entry with its sequence number at the current indentation, and
+ * update *prefix when the entry is an indent or outdent marker.
+ */
+static void trace_print_entry(struct trace_entry *e, int seq, int *prefix)
+{
+	/*
+	 * The closing brace is printed at the outer level, so step the prefix
+	 * back (never below the base) before the header goes out.
+	 */
+	if (e->type == TRACE_TYPE_OUTDENT) {
+		*prefix -= PREFIX_DELTA;
+		if (*prefix < BASE_PREFIX)
+			*prefix = BASE_PREFIX;
+	}
+
+	trace_print_header(seq, *prefix);
+
+	switch (e->type) {
+	case TRACE_TYPE_REG:
+		trace_print_reg(e);
+		break;
+	case TRACE_TYPE_COUNTER:
+		trace_print_counter(e);
+		break;
+	case TRACE_TYPE_STRING:
+		trace_print_string(e);
+		break;
+	case TRACE_TYPE_INDENT:
+		/* The opening brace stays at the outer level */
+		puts("{");
+		*prefix += PREFIX_DELTA;
+		break;
+	case TRACE_TYPE_OUTDENT:
+		puts("}");
+		break;
+	default:
+		printf("entry @ %p type %d\n", e, e->type);
+		break;
+	}
+}
+
+/*
+ * Dump the buffer header followed by every logged entry, walking from the
+ * start of the data area up to the tail.
+ */
+void trace_buffer_print(struct trace_buffer *tb)
+{
+	struct trace_entry *entry;
+	int seq, indent = BASE_PREFIX;
+	void *cursor;
+
+	printf("Trace buffer dump:\n");
+	printf(" address %p \n", tb);
+	printf(" tail %p\n", tb->tail);
+	printf(" size %llu\n", tb->size);
+	printf(" overflow %s\n", tb->overflow ? "TRUE" : "false");
+	printf(" Content:\n");
+
+	cursor = tb->data;
+	for (seq = 0; trace_check_bounds(tb, cursor) && cursor < tb->tail; seq++) {
+		entry = cursor;
+
+		trace_print_entry(entry, seq, &indent);
+
+		/* Entries are packed: header, then 'length' payload bytes */
+		cursor += sizeof(*entry) + entry->length;
+	}
+}
+
+/* Print the size (in hex) and address of the trace buffer. */
+void trace_print_location(struct trace_buffer *tb)
+{
+ printf("Trace buffer 0x%llx bytes @ %p\n", tb->size, tb);
+}
diff --git a/tools/testing/selftests/powerpc/pmu/ebb/trace.h b/tools/testing/selftests/powerpc/pmu/ebb/trace.h
new file mode 100644
index 000000000..da2a3be54
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/ebb/trace.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_PMU_EBB_TRACE_H
+#define _SELFTESTS_POWERPC_PMU_EBB_TRACE_H
+
+#include "utils.h"
+
+#define TRACE_TYPE_REG 1
+#define TRACE_TYPE_COUNTER 2
+#define TRACE_TYPE_STRING 3
+#define TRACE_TYPE_INDENT 4
+#define TRACE_TYPE_OUTDENT 5
+
+/* Header placed in front of every record in a trace buffer. */
+struct trace_entry
+{
+ u8 type; /* one of the TRACE_TYPE_* values */
+ u8 length; /* number of payload bytes in data[] */
+ u8 data[]; /* payload, layout depends on type */
+};
+
+/* A trace buffer: this header followed directly by the packed entries. */
+struct trace_buffer
+{
+ u64 size; /* size of the whole buffer, this header included */
+ bool overflow; /* set once an allocation didn't fit */
+ void *tail; /* where the next entry will be placed */
+ u8 data[]; /* start of the entries */
+};
+
+struct trace_buffer *trace_buffer_allocate(u64 size);
+int trace_log_reg(struct trace_buffer *tb, u64 reg, u64 value);
+int trace_log_counter(struct trace_buffer *tb, u64 value);
+int trace_log_string(struct trace_buffer *tb, char *str);
+int trace_log_indent(struct trace_buffer *tb);
+int trace_log_outdent(struct trace_buffer *tb);
+void trace_buffer_print(struct trace_buffer *tb);
+void trace_print_location(struct trace_buffer *tb);
+
+#endif /* _SELFTESTS_POWERPC_PMU_EBB_TRACE_H */
diff --git a/tools/testing/selftests/powerpc/pmu/event.c b/tools/testing/selftests/powerpc/pmu/event.c
new file mode 100644
index 000000000..48e3a413b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/ioctl.h>
+
+#include "event.h"
+
+
+/* Thin wrapper that issues the perf_event_open system call via syscall(). */
+int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
+ int group_fd, unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, attr, pid, cpu,
+ group_fd, flags);
+}
+
+/*
+ * Zero *e and fill in its name and the perf attributes: type, config, size
+ * and a read_format that asks for both enabled and running times.
+ */
+void event_init_opts(struct event *e, u64 config, int type, char *name)
+{
+ memset(e, 0, sizeof(*e));
+
+ e->name = name;
+
+ e->attr.type = type;
+ e->attr.config = config;
+ e->attr.size = sizeof(e->attr);
+ /* This has to match the structure layout in the header */
+ e->attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | \
+ PERF_FORMAT_TOTAL_TIME_RUNNING;
+}
+
+/* Initialise a PERF_TYPE_RAW event with the given config and name. */
+void event_init_named(struct event *e, u64 config, char *name)
+{
+ event_init_opts(e, config, PERF_TYPE_RAW, name);
+}
+
+/* Initialise a PERF_TYPE_RAW event with the default name "event". */
+void event_init(struct event *e, u64 config)
+{
+ event_init_opts(e, config, PERF_TYPE_RAW, "event");
+}
+
+#define PERF_CURRENT_PID 0
+#define PERF_NO_PID -1
+#define PERF_NO_CPU -1
+#define PERF_NO_GROUP -1
+
+/*
+ * Open the event for the given pid/cpu/group and store the fd in e->fd.
+ * Returns 0 on success, or -1 after reporting the error with perror().
+ */
+int event_open_with_options(struct event *e, pid_t pid, int cpu, int group_fd)
+{
+ e->fd = perf_event_open(&e->attr, pid, cpu, group_fd, 0);
+ if (e->fd == -1) {
+ perror("perf_event_open");
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Open the event on the current task, any CPU, as a member of group_fd. */
+int event_open_with_group(struct event *e, int group_fd)
+{
+ return event_open_with_options(e, PERF_CURRENT_PID, PERF_NO_CPU, group_fd);
+}
+
+/* Open the event on task 'pid', any CPU, without a group. */
+int event_open_with_pid(struct event *e, pid_t pid)
+{
+ return event_open_with_options(e, pid, PERF_NO_CPU, PERF_NO_GROUP);
+}
+
+/* Open the event on CPU 'cpu' with no specific task and no group. */
+int event_open_with_cpu(struct event *e, int cpu)
+{
+ return event_open_with_options(e, PERF_NO_PID, cpu, PERF_NO_GROUP);
+}
+
+/* Open the event on the current task, any CPU, without a group. */
+int event_open(struct event *e)
+{
+ return event_open_with_options(e, PERF_CURRENT_PID, PERF_NO_CPU, PERF_NO_GROUP);
+}
+
+/* Close the event's file descriptor; the result of close() is ignored. */
+void event_close(struct event *e)
+{
+ close(e->fd);
+}
+
+/* Issue PERF_EVENT_IOC_ENABLE on the event; returns the ioctl() result. */
+int event_enable(struct event *e)
+{
+ return ioctl(e->fd, PERF_EVENT_IOC_ENABLE);
+}
+
+/* Issue PERF_EVENT_IOC_DISABLE on the event; returns the ioctl() result. */
+int event_disable(struct event *e)
+{
+ return ioctl(e->fd, PERF_EVENT_IOC_DISABLE);
+}
+
+/* Issue PERF_EVENT_IOC_RESET on the event; returns the ioctl() result. */
+int event_reset(struct event *e)
+{
+ return ioctl(e->fd, PERF_EVENT_IOC_RESET);
+}
+
+/*
+ * Read the event's current counts into e->result.
+ * Returns 0 if a complete result was read, otherwise reports on stderr and
+ * returns -1.
+ */
+int event_read(struct event *e)
+{
+	int nread = read(e->fd, &e->result, sizeof(e->result));
+
+	if (nread == sizeof(e->result))
+		return 0;
+
+	fprintf(stderr, "read error on event %p!\n", e);
+	return -1;
+}
+
+/*
+ * Print the event's name, value and timing on one line, padding the name and
+ * the value to the requested widths. When running and enabled times are equal
+ * they are printed once.
+ */
+void event_report_justified(struct event *e, int name_width, int result_width)
+{
+	printf("%*s: result %*llu ", name_width, e->name, result_width,
+	       e->result.value);
+
+	if (e->result.running != e->result.enabled) {
+		printf("running %llu enabled %llu\n", e->result.running,
+		       e->result.enabled);
+		return;
+	}
+
+	printf("running/enabled %llu\n", e->result.running);
+}
+
+/* Print the event's result with no column padding. */
+void event_report(struct event *e)
+{
+ event_report_justified(e, 0, 0);
+}
diff --git a/tools/testing/selftests/powerpc/pmu/event.h b/tools/testing/selftests/powerpc/pmu/event.h
new file mode 100644
index 000000000..302eaab51
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/event.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_PMU_EVENT_H
+#define _SELFTESTS_POWERPC_PMU_EVENT_H
+
+#include <unistd.h>
+#include <linux/perf_event.h>
+
+#include "utils.h"
+
+
+/* A perf event together with the counts most recently read from it. */
+struct event {
+	struct perf_event_attr attr;
+	char *name;
+	int fd;
+	/*
+	 * This must match the read_format we use. With
+	 * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING the
+	 * kernel returns the value, then time_enabled, then time_running, so
+	 * 'enabled' has to come before 'running'.
+	 */
+	struct {
+		u64 value;
+		u64 enabled;
+		u64 running;
+	} result;
+};
+
+void event_init(struct event *e, u64 config);
+void event_init_named(struct event *e, u64 config, char *name);
+void event_init_opts(struct event *e, u64 config, int type, char *name);
+int event_open_with_options(struct event *e, pid_t pid, int cpu, int group_fd);
+int event_open_with_group(struct event *e, int group_fd);
+int event_open_with_pid(struct event *e, pid_t pid);
+int event_open_with_cpu(struct event *e, int cpu);
+int event_open(struct event *e);
+void event_close(struct event *e);
+int event_enable(struct event *e);
+int event_disable(struct event *e);
+int event_reset(struct event *e);
+int event_read(struct event *e);
+void event_report_justified(struct event *e, int name_width, int result_width);
+void event_report(struct event *e);
+
+#endif /* _SELFTESTS_POWERPC_PMU_EVENT_H */
diff --git a/tools/testing/selftests/powerpc/pmu/l3_bank_test.c b/tools/testing/selftests/powerpc/pmu/l3_bank_test.c
new file mode 100644
index 000000000..a5dfa9bf3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/l3_bank_test.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "event.h"
+#include "utils.h"
+
+#define MALLOC_SIZE (0x10000 * 10) /* Ought to be enough .. */
+
+/*
+ * Tests that the L3 bank handling is correct. We fixed it in commit e9aaac1.
+ */
+/*
+ * Open raw event 0x84918F, touch one byte in each 64K step of a malloc'ed
+ * region, then read the event back and check that it was both enabled and
+ * running.
+ *
+ * Returns 0 on success.
+ */
+static int l3_bank_test(void)
+{
+	struct event event;
+	char *p;
+	int i;
+
+	// The L3 bank logic is only used on Power8 or later
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+	p = malloc(MALLOC_SIZE);
+	FAIL_IF(!p);
+
+	event_init(&event, 0x84918F);
+
+	FAIL_IF(event_open(&event));
+
+	for (i = 0; i < MALLOC_SIZE; i += 0x10000)
+		p[i] = i;
+
+	/*
+	 * Fail on a read error explicitly, as the other PMU tests do, rather
+	 * than relying on the zeroed result tripping the checks below.
+	 */
+	FAIL_IF(event_read(&event));
+	event_report(&event);
+
+	FAIL_IF(event.result.running == 0);
+	FAIL_IF(event.result.enabled == 0);
+
+	event_close(&event);
+	free(p);
+
+	return 0;
+}
+
+/* Entry point: run l3_bank_test() through test_harness() under that name. */
+int main(void)
+{
+ return test_harness(l3_bank_test, "l3_bank_test");
+}
diff --git a/tools/testing/selftests/powerpc/pmu/lib.c b/tools/testing/selftests/powerpc/pmu/lib.c
new file mode 100644
index 000000000..88690b97b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/lib.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE /* For CPU_ZERO etc. */
+
+#include <errno.h>
+#include <sched.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <sys/wait.h>
+
+#include "utils.h"
+#include "lib.h"
+
+
+/*
+ * Restrict the calling process to the single CPU 'cpu'.
+ * Returns the result of sched_setaffinity().
+ */
+int bind_to_cpu(int cpu)
+{
+ cpu_set_t mask;
+
+ printf("Binding to cpu %d\n", cpu);
+
+ /* Affinity mask with only 'cpu' set */
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+
+ return sched_setaffinity(0, sizeof(mask), &mask);
+}
+
+#define PARENT_TOKEN 0xAA
+#define CHILD_TOKEN 0x55
+
+/*
+ * Parent side of the handshake: send PARENT_TOKEN to the child and wait for
+ * its one-byte reply. Returns 0 if the reply is CHILD_TOKEN, 1 if it is
+ * anything else.
+ */
+int sync_with_child(union pipe read_pipe, union pipe write_pipe)
+{
+ char c = PARENT_TOKEN;
+
+ FAIL_IF(write(write_pipe.write_fd, &c, 1) != 1);
+ FAIL_IF(read(read_pipe.read_fd, &c, 1) != 1);
+ if (c != CHILD_TOKEN) /* sometimes expected */
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Child side of the handshake: block until one byte arrives from the parent
+ * and check that it is PARENT_TOKEN. Returns 0 on success.
+ */
+int wait_for_parent(union pipe read_pipe)
+{
+ char c;
+
+ FAIL_IF(read(read_pipe.read_fd, &c, 1) != 1);
+ /* NOTE(review): 0xAA only compares equal if plain char is unsigned - confirm */
+ FAIL_IF(c != PARENT_TOKEN);
+
+ return 0;
+}
+
+/* Send CHILD_TOKEN to the parent. Returns 0 on success. */
+int notify_parent(union pipe write_pipe)
+{
+ char c = CHILD_TOKEN;
+
+ FAIL_IF(write(write_pipe.write_fd, &c, 1) != 1);
+
+ return 0;
+}
+
+/*
+ * Send the bitwise complement of CHILD_TOKEN to the parent, which
+ * sync_with_child() reports as a failed sync. Returns 0 once the byte is sent.
+ */
+int notify_parent_of_error(union pipe write_pipe)
+{
+ char c = ~CHILD_TOKEN;
+
+ FAIL_IF(write(write_pipe.write_fd, &c, 1) != 1);
+
+ return 0;
+}
+
+/*
+ * Wait for child_pid to terminate.
+ * Returns its exit status if it exited normally, and 1 if waitpid() fails or
+ * the child ended any other way.
+ */
+int wait_for_child(pid_t child_pid)
+{
+	int status;
+
+	if (waitpid(child_pid, &status, 0) == -1) {
+		perror("waitpid");
+		return 1;
+	}
+
+	/* Signal or other abnormal termination counts as a failure */
+	return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
+}
+
+/* Send SIGTERM to child_pid and return what wait_for_child() reports. */
+int kill_child_and_wait(pid_t child_pid)
+{
+ kill(child_pid, SIGTERM);
+
+ return wait_for_child(child_pid);
+}
+
+/*
+ * Child body for eat_cpu(): handshake with the parent, then spin forever.
+ * The final return is never reached; the child only ends when it is killed.
+ */
+static int eat_cpu_child(union pipe read_pipe, union pipe write_pipe)
+{
+ /* volatile so the increment in the spin loop can't be optimised away */
+ volatile int i = 0;
+
+ /*
+ * We are just here to eat cpu and die. So make sure we can be killed,
+ * and also don't do any custom SIGTERM handling.
+ */
+ signal(SIGTERM, SIG_DFL);
+
+ notify_parent(write_pipe);
+ wait_for_parent(read_pipe);
+
+ /* Soak up cpu forever */
+ while (1) i++;
+
+ return 0;
+}
+
+/*
+ * Run test_function() while a companion child process spins on the same CPU.
+ *
+ * The caller is bound to one online CPU, a child running eat_cpu_child() is
+ * forked and synchronised with, and then test_function() is called. The child
+ * is always killed and reaped before returning.
+ *
+ * Returns whatever test_function() returns, or -1 if the pipes, the fork or
+ * the handshake with the child fail. (The return type is pid_t, but the value
+ * is a test result, not a pid.)
+ */
+pid_t eat_cpu(int (test_function)(void))
+{
+	union pipe read_pipe, write_pipe;
+	int cpu, rc;
+	pid_t pid;
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+	FAIL_IF(bind_to_cpu(cpu));
+
+	if (pipe(read_pipe.fds) == -1)
+		return -1;
+
+	if (pipe(write_pipe.fds) == -1) {
+		rc = -1;
+		goto out_close_read;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		/*
+		 * Must not fall through to kill() with pid == -1, that would
+		 * signal every process we are permitted to signal.
+		 */
+		perror("fork");
+		rc = -1;
+		goto out_close_write;
+	}
+
+	if (pid == 0)
+		exit(eat_cpu_child(write_pipe, read_pipe));
+
+	if (sync_with_child(read_pipe, write_pipe)) {
+		rc = -1;
+		goto out_kill;
+	}
+
+	printf("main test running as pid %d\n", getpid());
+
+	rc = test_function();
+out_kill:
+	kill(pid, SIGKILL);
+	/* Reap the child so it doesn't linger as a zombie */
+	waitpid(pid, NULL, 0);
+out_close_write:
+	close(write_pipe.read_fd);
+	close(write_pipe.write_fd);
+out_close_read:
+	close(read_pipe.read_fd);
+	close(read_pipe.write_fd);
+
+	return rc;
+}
+
+struct addr_range libc, vdso;
+
+/*
+ * Scan /proc/self/maps and record the address range of the executable
+ * mapping whose name contains "libc" in 'libc', and of the "[vdso]" mapping in
+ * 'vdso'. 'last' is the final valid address (end - 1).
+ *
+ * Returns 0 on success or -1 if the file can't be opened.
+ */
+int parse_proc_maps(void)
+{
+	unsigned long start, end;
+	char execute, name[128], line[256];
+	FILE *f;
+	int c;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	/*
+	 * Parse one line at a time so that a line without a name (an anonymous
+	 * mapping) can't make the scan swallow the start of the next line.
+	 */
+	while (fgets(line, sizeof(line), f)) {
+		/*
+		 * A longer line only has more of the name than we keep, so
+		 * throw the rest of it away.
+		 */
+		if (!strchr(line, '\n'))
+			while ((c = fgetc(f)) != EOF && c != '\n')
+				;
+
+		/*
+		 * start-end perms offset dev inode name
+		 * dev is major:minor in hex. Only lines with all four fields
+		 * we convert are used, so 'name' is never read uninitialised.
+		 */
+		if (sscanf(line, "%lx-%lx %*c%*c%c%*c %*x %*x:%*x %*d %127s",
+			   &start, &end, &execute, name) != 4)
+			continue;
+
+		if (execute != 'x')
+			continue;
+
+		if (strstr(name, "libc")) {
+			libc.first = start;
+			libc.last = end - 1;
+		} else if (strstr(name, "[vdso]")) {
+			vdso.first = start;
+			vdso.last = end - 1;
+		}
+	}
+
+	fclose(f);
+
+	return 0;
+}
+
+#define PARANOID_PATH "/proc/sys/kernel/perf_event_paranoid"
+
+/*
+ * Check the kernel's perf_event_paranoid setting.
+ * Returns true only if the value could be read and parsed and is strictly
+ * below 'level'; any failure yields false.
+ */
+bool require_paranoia_below(int level)
+{
+	bool below = false;
+	char line[16], *stop;
+	long value;
+	FILE *fp;
+
+	fp = fopen(PARANOID_PATH, "r");
+	if (fp == NULL) {
+		perror("fopen");
+		return false;
+	}
+
+	if (fgets(line, sizeof(line), fp) == NULL) {
+		printf("Couldn't read " PARANOID_PATH "?\n");
+	} else {
+		value = strtol(line, &stop, 10);
+
+		/* No digits consumed means there was nothing to parse */
+		if (stop == line)
+			printf("Couldn't parse " PARANOID_PATH "?\n");
+		else
+			below = value < level;
+	}
+
+	fclose(fp);
+
+	return below;
+}
+
diff --git a/tools/testing/selftests/powerpc/pmu/lib.h b/tools/testing/selftests/powerpc/pmu/lib.h
new file mode 100644
index 000000000..bf1bec013
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/lib.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef __SELFTESTS_POWERPC_PMU_LIB_H
+#define __SELFTESTS_POWERPC_PMU_LIB_H
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * The two descriptors of a pipe, usable either as the fds[] array passed to
+ * pipe() or by name: read_fd overlays fds[0] and write_fd overlays fds[1].
+ */
+union pipe {
+ struct {
+ int read_fd;
+ int write_fd;
+ };
+ int fds[2];
+};
+
+extern int bind_to_cpu(int cpu);
+extern int kill_child_and_wait(pid_t child_pid);
+extern int wait_for_child(pid_t child_pid);
+extern int sync_with_child(union pipe read_pipe, union pipe write_pipe);
+extern int wait_for_parent(union pipe read_pipe);
+extern int notify_parent(union pipe write_pipe);
+extern int notify_parent_of_error(union pipe write_pipe);
+extern pid_t eat_cpu(int (test_function)(void));
+extern bool require_paranoia_below(int level);
+
+/* An address range given by its first and last address. */
+struct addr_range {
+ uint64_t first, last;
+};
+
+extern struct addr_range libc, vdso;
+
+int parse_proc_maps(void);
+
+#endif /* __SELFTESTS_POWERPC_PMU_LIB_H */
diff --git a/tools/testing/selftests/powerpc/pmu/loop.S b/tools/testing/selftests/powerpc/pmu/loop.S
new file mode 100644
index 000000000..c52ba09b6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/loop.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2013, Michael Ellerman, IBM Corp.
+ */
+
+#include <ppc-asm.h>
+
+ .text
+
+FUNC_START(thirty_two_instruction_loop)
+ cmpdi r3,0
+ beqlr
+ addi r4,r3,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1
+ addi r4,r4,1 # 28 addi's
+ subi r3,r3,1
+ b FUNC_NAME(thirty_two_instruction_loop)
+FUNC_END(thirty_two_instruction_loop)
+
+/*
+ * Variant of the 32 instruction loop with a ldarx/stdcx. pair in the middle.
+ * r3 holds the remaining iteration count and r4 the address used by the
+ * ldarx/stdcx. pair (presumably the first and second arguments). r5 and r6
+ * are scratch.
+ *
+ * An iteration is 32 instructions only when the stdcx. succeeds first time;
+ * every retry from label 1 adds to that.
+ */
+FUNC_START(thirty_two_instruction_loop_with_ll_sc)
+ cmpdi r3,0
+ beqlr
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1 # 5
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+1: ldarx r6,0,r4 # 10
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1 # 15
+ addi r5,r5,1
+ addi r5,r5,1
+ stdcx. r6,0,r4
+ bne- 1b
+ addi r5,r5,1 # 20
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1 # 25
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1
+ addi r5,r5,1 # 30
+ subi r3,r3,1
+ b FUNC_NAME(thirty_two_instruction_loop_with_ll_sc)
+FUNC_END(thirty_two_instruction_loop_with_ll_sc)
diff --git a/tools/testing/selftests/powerpc/pmu/per_event_excludes.c b/tools/testing/selftests/powerpc/pmu/per_event_excludes.c
new file mode 100644
index 000000000..ad32a09a6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/pmu/per_event_excludes.c
@@ -0,0 +1,111 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE
+
+#include <elf.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/prctl.h>
+
+#include "event.h"
+#include "lib.h"
+#include "utils.h"
+
+/*
+ * Test that per-event excludes work.
+ */
+
+/*
+ * Open four instruction-count events in one group, each with a different
+ * combination of exclude_* bits, run them over a busy loop and check that
+ * they were all scheduled for the whole time and that the unfiltered count
+ * is the largest.
+ */
+static int per_event_excludes(void)
+{
+	struct event events[4];
+	int idx;
+
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+	/*
+	 * Every event starts out disabled; otherwise the running/enabled
+	 * counts don't match up.
+	 */
+
+	/* events[0]: no excludes at all. */
+	event_init_opts(&events[0], PERF_COUNT_HW_INSTRUCTIONS,
+			PERF_TYPE_HARDWARE, "instructions");
+	events[0].attr.disabled = 1;
+
+	/* events[1]: user and hypervisor excluded. */
+	event_init_opts(&events[1], PERF_COUNT_HW_INSTRUCTIONS,
+			PERF_TYPE_HARDWARE, "instructions(k)");
+	events[1].attr.disabled = 1;
+	events[1].attr.exclude_user = 1;
+	events[1].attr.exclude_hv = 1;
+
+	/* events[2]: user and kernel excluded. */
+	event_init_opts(&events[2], PERF_COUNT_HW_INSTRUCTIONS,
+			PERF_TYPE_HARDWARE, "instructions(h)");
+	events[2].attr.disabled = 1;
+	events[2].attr.exclude_user = 1;
+	events[2].attr.exclude_kernel = 1;
+
+	/* events[3]: hypervisor and kernel excluded. */
+	event_init_opts(&events[3], PERF_COUNT_HW_INSTRUCTIONS,
+			PERF_TYPE_HARDWARE, "instructions(u)");
+	events[3].attr.disabled = 1;
+	events[3].attr.exclude_hv = 1;
+	events[3].attr.exclude_kernel = 1;
+
+	/* The first event is the group leader. */
+	FAIL_IF(event_open(&events[0]));
+
+	/*
+	 * Attaching the remaining events to the leader's group only works
+	 * with per-event exclude support, because their exclude settings
+	 * are incompatible with the leader's.
+	 */
+	idx = 1;
+	while (idx < 4) {
+		FAIL_IF(event_open_with_group(&events[idx], events[0].fd));
+		idx++;
+	}
+
+	prctl(PR_TASK_PERF_EVENTS_ENABLE);
+
+	/* Burn some cycles so there is something to count. */
+	for (idx = 0; idx < INT_MAX; idx++)
+		asm volatile("" : : : "memory");
+
+	prctl(PR_TASK_PERF_EVENTS_DISABLE);
+
+	idx = 0;
+	while (idx < 4) {
+		FAIL_IF(event_read(&events[idx]));
+		event_report(&events[idx]);
+		idx++;
+	}
+
+	/*
+	 * enabled == running for every event shows that they were all on
+	 * the PMU at once.
+	 */
+	for (idx = 0; idx < 4; idx++)
+		FAIL_IF(events[idx].result.running != events[idx].result.enabled);
+
+	/*
+	 * The unfiltered event counts all instructions, the others only a
+	 * subset, so it must never be smaller than any of them.
+	 */
+	for (idx = 1; idx < 4; idx++)
+		FAIL_IF(events[0].result.value < events[idx].result.value);
+
+	for (idx = 0; idx < 4; idx++)
+		event_close(&events[idx]);
+
+	return 0;
+}
+
+/* Hand per_event_excludes() and its name to test_harness() and return its result. */
+int main(void)
+{
+ return test_harness(per_event_excludes, "per_event_excludes");
+}
diff --git a/tools/testing/selftests/powerpc/primitives/.gitignore b/tools/testing/selftests/powerpc/primitives/.gitignore
new file mode 100644
index 000000000..1e5c04e24
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+load_unaligned_zeropad
diff --git a/tools/testing/selftests/powerpc/primitives/Makefile b/tools/testing/selftests/powerpc/primitives/Makefile
new file mode 100644
index 000000000..9b9491a63
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -I$(CURDIR)
+
+TEST_GEN_PROGS := load_unaligned_zeropad
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c
diff --git a/tools/testing/selftests/powerpc/primitives/asm/asm-compat.h b/tools/testing/selftests/powerpc/primitives/asm/asm-compat.h
new file mode 120000
index 000000000..b14255e15
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/asm-compat.h
@@ -0,0 +1 @@
+../.././../../../../arch/powerpc/include/asm/asm-compat.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/primitives/asm/asm-const.h b/tools/testing/selftests/powerpc/primitives/asm/asm-const.h
new file mode 120000
index 000000000..18d8be13e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/asm-const.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/asm-const.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h b/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h
new file mode 120000
index 000000000..8dc6d4d46
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/feature-fixups.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/feature-fixups.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/primitives/asm/firmware.h b/tools/testing/selftests/powerpc/primitives/asm/firmware.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/firmware.h
diff --git a/tools/testing/selftests/powerpc/primitives/asm/ppc-opcode.h b/tools/testing/selftests/powerpc/primitives/asm/ppc-opcode.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/ppc-opcode.h
diff --git a/tools/testing/selftests/powerpc/primitives/asm/ppc_asm.h b/tools/testing/selftests/powerpc/primitives/asm/ppc_asm.h
new file mode 120000
index 000000000..66c819322
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/ppc_asm.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/ppc_asm.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/primitives/asm/processor.h b/tools/testing/selftests/powerpc/primitives/asm/processor.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/asm/processor.h
diff --git a/tools/testing/selftests/powerpc/primitives/linux/stringify.h b/tools/testing/selftests/powerpc/primitives/linux/stringify.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/linux/stringify.h
diff --git a/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c b/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c
new file mode 100644
index 000000000..1439c8c7f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/load_unaligned_zeropad.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Userspace test harness for load_unaligned_zeropad. Creates two
+ * pages and uses mprotect to prevent access to the second page and
+ * a SEGV handler that walks the exception tables and runs the fixup
+ * routine.
+ *
+ * The results are compared against a normal load that is
+ * performed while access to the second page is enabled via mprotect.
+ *
+ * Copyright (C) 2014 Anton Blanchard <anton@au.ibm.com>, IBM
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#define FIXUP_SECTION ".ex_fixup"
+
+static inline unsigned long __fls(unsigned long x);
+
+#include "word-at-a-time.h"
+
+#include "utils.h"
+
+/*
+ * Userspace replacement for __fls(), needed by word-at-a-time.h: returns the
+ * bit index (0 = least significant) of the most significant set bit of @x.
+ *
+ * The leading-zero count comes from the PPC_CNTLZL instruction macro, which
+ * is defined outside this file. The index is derived from the width of
+ * unsigned long in *bits*; subtracting from sizeof() alone would use the
+ * width in bytes and wrap for almost every input.
+ *
+ * NOTE(review): @x is expected to be non-zero; with no bit set the
+ * subtraction wraps - confirm no caller relies on that.
+ */
+static inline unsigned long __fls(unsigned long x)
+{
+	int lz;
+
+	asm (PPC_CNTLZL "%0,%1" : "=r" (lz) : "r" (x));
+	return sizeof(unsigned long) * 8 - 1 - lz;
+}
+
+static int page_size;
+static char *mem_region;
+
+/*
+ * Remove all access rights from the page that follows the first page of
+ * mem_region. Returns 0 on success, 1 (after perror) if mprotect() fails.
+ */
+static int protect_region(void)
+{
+	char *second_page = mem_region + page_size;
+
+	if (mprotect(second_page, page_size, PROT_NONE) == 0)
+		return 0;
+
+	perror("mprotect");
+	return 1;
+}
+
+/*
+ * Make the page that follows the first page of mem_region readable and
+ * writable again. Returns 0 on success, 1 (after perror) if mprotect() fails.
+ */
+static int unprotect_region(void)
+{
+	char *second_page = mem_region + page_size;
+
+	if (mprotect(second_page, page_size, PROT_READ|PROT_WRITE) == 0)
+		return 0;
+
+	perror("mprotect");
+	return 1;
+}
+
+extern char __start___ex_table[];
+extern char __stop___ex_table[];
+
+/*
+ * One exception table entry as walked by segv_handler(). Both fields are
+ * signed offsets that segv_handler() adds to the address of the field itself
+ * to obtain the faulting instruction address and the fixup address.
+ */
+struct extbl_entry {
+ int insn;
+ int fixup;
+};
+
+/*
+ * SIGSEGV handler: look up the faulting instruction address (NIA from the
+ * signal context) in the exception table between __start___ex_table and
+ * __stop___ex_table. On a match the NIA is rewritten to the entry's fixup
+ * address, so execution resumes in the fixup code when the handler returns.
+ * If no entry matches, the NIA and fault address are printed and the process
+ * aborts.
+ */
+static void segv_handler(int signr, siginfo_t *info, void *ptr)
+{
+	ucontext_t *uc = (ucontext_t *)ptr;
+	unsigned long addr = (unsigned long)info->si_addr;
+	unsigned long *ip = &UCONTEXT_NIA(uc);
+	struct extbl_entry *entry = (struct extbl_entry *)__start___ex_table;
+	struct extbl_entry *end = (struct extbl_entry *)__stop___ex_table;
+
+	/*
+	 * Step through every entry. Without advancing the cursor the walk
+	 * would spin forever as soon as the first entry does not match.
+	 */
+	for (; entry < end; entry++) {
+		unsigned long insn, fixup;
+
+		/* Entries store offsets relative to their own location. */
+		insn = (unsigned long)&entry->insn + entry->insn;
+		fixup = (unsigned long)&entry->fixup + entry->fixup;
+
+		if (insn == *ip) {
+			*ip = fixup;
+			return;
+		}
+	}
+
+	printf("No exception table match for NIA %lx ADDR %lx\n", *ip, addr);
+	abort();
+}
+
+/* Install segv_handler() as the SA_SIGINFO-style handler for SIGSEGV. */
+static void setup_segv_handler(void)
+{
+	struct sigaction sa = {
+		.sa_sigaction = segv_handler,
+		.sa_flags = SA_SIGINFO,
+	};
+
+	sigaction(SIGSEGV, &sa, NULL);
+}
+
+/*
+ * Compare load_unaligned_zeropad(@p) against a plain unsigned long load from
+ * @p. The plain load is done while the second page is accessible, the
+ * zeropad load after access has been revoked again. @page_offset is only
+ * used for the diagnostic. Returns 0 when both values agree, 1 otherwise.
+ */
+static int do_one_test(char *p, int page_offset)
+{
+	unsigned long expected, actual;
+
+	/* Reference value, taken with the second page readable. */
+	FAIL_IF(unprotect_region());
+	expected = *(unsigned long *)p;
+	FAIL_IF(protect_region());
+
+	actual = load_unaligned_zeropad(p);
+	if (actual == expected)
+		return 0;
+
+	printf("offset %u load_unaligned_zeropad returned 0x%lx, should be 0x%lx\n", page_offset, actual, expected);
+	return 1;
+}
+
+/*
+ * Map two anonymous pages, fill the first with a byte pattern and the second
+ * with zeros, install the SIGSEGV handler and run do_one_test() for every
+ * byte offset within the first page.
+ */
+static int test_body(void)
+{
+	unsigned long off;
+
+	page_size = getpagesize();
+	mem_region = mmap(NULL, page_size * 2, PROT_READ|PROT_WRITE,
+			  MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+
+	FAIL_IF(mem_region == MAP_FAILED);
+
+	/* First page: each byte holds the low bits of its own offset. */
+	off = 0;
+	while (off < page_size) {
+		mem_region[off] = off;
+		off++;
+	}
+
+	/* Second page: all zeros. */
+	memset(mem_region + page_size, 0, page_size);
+
+	setup_segv_handler();
+
+	off = 0;
+	while (off < page_size) {
+		FAIL_IF(do_one_test(mem_region + off, off));
+		off++;
+	}
+
+	return 0;
+}
+
+/* Hand test_body() and the test name to test_harness() and return its result. */
+int main(void)
+{
+ return test_harness(test_body, "load_unaligned_zeropad");
+}
diff --git a/tools/testing/selftests/powerpc/primitives/word-at-a-time.h b/tools/testing/selftests/powerpc/primitives/word-at-a-time.h
new file mode 120000
index 000000000..eb74401b5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/primitives/word-at-a-time.h
@@ -0,0 +1 @@
+../../../../../arch/powerpc/include/asm/word-at-a-time.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/ptrace/.gitignore b/tools/testing/selftests/powerpc/ptrace/.gitignore
new file mode 100644
index 000000000..0e96150b7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/.gitignore
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
+ptrace-gpr
+ptrace-tm-gpr
+ptrace-tm-spd-gpr
+ptrace-tar
+ptrace-tm-tar
+ptrace-tm-spd-tar
+ptrace-vsx
+ptrace-tm-vsx
+ptrace-tm-spd-vsx
+ptrace-tm-spr
+ptrace-hwbreak
+perf-hwbreak
+core-pkey
+ptrace-pkey
+ptrace-syscall
diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile b/tools/testing/selftests/powerpc/ptrace/Makefile
new file mode 100644
index 000000000..8d3f006c9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
+ ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx ptrace-tm-vsx \
+ ptrace-tm-spd-vsx ptrace-tm-spr ptrace-hwbreak ptrace-pkey core-pkey \
+ perf-hwbreak ptrace-syscall
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+CFLAGS += -m64 -I../../../../../usr/include -I../tm -mhtm -fno-pie
+
+$(OUTPUT)/ptrace-pkey $(OUTPUT)/core-pkey: child.h
+$(OUTPUT)/ptrace-pkey $(OUTPUT)/core-pkey: LDLIBS += -pthread
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c ../lib/reg.S ptrace.h
diff --git a/tools/testing/selftests/powerpc/ptrace/child.h b/tools/testing/selftests/powerpc/ptrace/child.h
new file mode 100644
index 000000000..d7275b7b3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/child.h
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Helper functions to sync execution between parent and child processes.
+ *
+ * Copyright 2018, Thiago Jung Bauermann, IBM Corporation.
+ */
+#include <stdio.h>
+#include <stdbool.h>
+#include <semaphore.h>
+
+/*
+ * Information in a shared memory location for synchronization between child and
+ * parent.
+ */
+/*
+ * Both semaphores are set up by init_child_sync() as process-shared
+ * (pshared = 1) with an initial count of 0.
+ */
+struct child_sync {
+ /* The parent waits on this semaphore. */
+ sem_t sem_parent;
+
+ /* If true, the child should give up as well. */
+ bool parent_gave_up;
+
+ /* The child waits on this semaphore. */
+ sem_t sem_child;
+
+ /* If true, the parent should give up as well. */
+ bool child_gave_up;
+};
+
+/*
+ * For use in the child: if @x is true, report the failing line on stderr,
+ * set sync->child_gave_up, prod the parent so it stops waiting, and return 1
+ * from the enclosing function.
+ */
+#define CHILD_FAIL_IF(x, sync) \
+ do { \
+ if (x) { \
+ fprintf(stderr, \
+ "[FAIL] Test FAILED on line %d\n", __LINE__); \
+ (sync)->child_gave_up = true; \
+ prod_parent(sync); \
+ return 1; \
+ } \
+ } while (0)
+
+/*
+ * For use in the parent: if @x is true, report the failing line on stderr,
+ * set sync->parent_gave_up, prod the child so it stops waiting, and return 1
+ * from the enclosing function.
+ */
+#define PARENT_FAIL_IF(x, sync) \
+ do { \
+ if (x) { \
+ fprintf(stderr, \
+ "[FAIL] Test FAILED on line %d\n", __LINE__); \
+ (sync)->parent_gave_up = true; \
+ prod_child(sync); \
+ return 1; \
+ } \
+ } while (0)
+
+/*
+ * For use in the parent: if @x is -1 and errno is ENODEV or EINVAL, set
+ * sync->parent_gave_up, prod the child so it stops waiting, and leave the
+ * enclosing function through SKIP_IF(1). SKIP_IF and errno must be provided
+ * by the including file; this header does not pull them in itself.
+ */
+#define PARENT_SKIP_IF_UNSUPPORTED(x, sync) \
+ do { \
+ if ((x) == -1 && (errno == ENODEV || errno == EINVAL)) { \
+ (sync)->parent_gave_up = true; \
+ prod_child(sync); \
+ SKIP_IF(1); \
+ } \
+ } while (0)
+
+/*
+ * Initialise both semaphores of @sync as process-shared with a count of 0.
+ * Returns 0 on success, 1 (after perror) if either sem_init() fails.
+ */
+int init_child_sync(struct child_sync *sync)
+{
+	if (sem_init(&sync->sem_parent, 1, 0) != 0) {
+		perror("Semaphore initialization failed");
+		return 1;
+	}
+
+	if (sem_init(&sync->sem_child, 1, 0) != 0) {
+		perror("Semaphore initialization failed");
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Destroy both semaphores of @sync; the results of sem_destroy() are ignored. */
+void destroy_child_sync(struct child_sync *sync)
+{
+ sem_destroy(&sync->sem_parent);
+ sem_destroy(&sync->sem_child);
+}
+
+/*
+ * Called by the parent: block on sem_parent until the child prods us.
+ * Returns 1 (after perror) if sem_wait() fails, otherwise the value of
+ * sync->child_gave_up.
+ */
+int wait_child(struct child_sync *sync)
+{
+	if (sem_wait(&sync->sem_parent) == 0)
+		return sync->child_gave_up;
+
+	perror("Error waiting for child");
+	return 1;
+}
+
+/*
+ * Called by the parent: post sem_child so that a child blocked in
+ * wait_parent() can continue. Returns 0 on success, 1 (after perror) if
+ * sem_post() fails.
+ */
+int prod_child(struct child_sync *sync)
+{
+	if (sem_post(&sync->sem_child) == 0)
+		return 0;
+
+	perror("Error prodding child");
+	return 1;
+}
+
+/*
+ * Called by the child: block on sem_child until the parent prods us.
+ * Returns 1 (after perror) if sem_wait() fails, otherwise the value of
+ * sync->parent_gave_up.
+ */
+int wait_parent(struct child_sync *sync)
+{
+	if (sem_wait(&sync->sem_child) == 0)
+		return sync->parent_gave_up;
+
+	perror("Error waiting for parent");
+	return 1;
+}
+
+/*
+ * Called by the child: post sem_parent so that a parent blocked in
+ * wait_child() can continue. Returns 0 on success, 1 (after perror) if
+ * sem_post() fails.
+ */
+int prod_parent(struct child_sync *sync)
+{
+	if (sem_post(&sync->sem_parent) == 0)
+		return 0;
+
+	perror("Error prodding parent");
+	return 1;
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
new file mode 100644
index 000000000..bbc05ffc5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Ptrace test for Memory Protection Key registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ * Copyright (C) 2018 IBM Corporation.
+ */
+#include <limits.h>
+#include <linux/kernel.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "ptrace.h"
+#include "child.h"
+
+#ifndef __NR_pkey_alloc
+#define __NR_pkey_alloc 384
+#endif
+
+#ifndef __NR_pkey_free
+#define __NR_pkey_free 385
+#endif
+
+#ifndef NT_PPC_PKEY
+#define NT_PPC_PKEY 0x110
+#endif
+
+#ifndef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE 0x4
+#endif
+
+#define AMR_BITS_PER_PKEY 2
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
+
+#define CORE_FILE_LIMIT (5 * 1024 * 1024) /* 5 MB should be enough */
+
+static const char core_pattern_file[] = "/proc/sys/kernel/core_pattern";
+
+static const char user_write[] = "[User Write (Running)]";
+static const char core_read_running[] = "[Core Read (Running)]";
+
+/* Information shared between the parent and the child. */
+struct shared_info {
+ struct child_sync child_sync;
+
+ /* AMR value the parent expects to read in the core file. */
+ unsigned long amr;
+
+ /* IAMR value the parent expects to read in the core file. */
+ unsigned long iamr;
+
+ /* UAMOR value the parent expects to read in the core file. */
+ unsigned long uamor;
+
+ /* When the child crashed. */
+ time_t core_time;
+};
+
+/*
+ * Thin wrapper that issues the pkey_alloc system call by number and returns
+ * the result of syscall() unchanged; callers in this file treat a negative
+ * value as failure.
+ */
+static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+{
+ return syscall(__NR_pkey_alloc, flags, init_access_rights);
+}
+
+/* Thin wrapper that issues the pkey_free system call by number for @pkey. */
+static int sys_pkey_free(int pkey)
+{
+ return syscall(__NR_pkey_free, pkey);
+}
+
+/*
+ * Make sure the soft limit of @resource is at least CORE_FILE_LIMIT. The
+ * hard limit is raised too when it is finite and below that value. Limits
+ * that are already large enough (or infinite) are left untouched.
+ */
+static int raise_limit_to_core_size(int resource)
+{
+	struct rlimit lim;
+
+	FAIL_IF(getrlimit(resource, &lim));
+
+	/* Nothing to do when the soft limit already suffices. */
+	if (lim.rlim_cur == RLIM_INFINITY || lim.rlim_cur >= CORE_FILE_LIMIT)
+		return TEST_PASS;
+
+	lim.rlim_cur = CORE_FILE_LIMIT;
+	if (lim.rlim_max != RLIM_INFINITY && lim.rlim_max < CORE_FILE_LIMIT)
+		lim.rlim_max = CORE_FILE_LIMIT;
+
+	FAIL_IF(setrlimit(resource, &lim));
+
+	return TEST_PASS;
+}
+
+/*
+ * Raise the core-dump size limit and the file-size limit far enough for a
+ * core file of CORE_FILE_LIMIT bytes to be written.
+ */
+static int increase_core_file_limit(void)
+{
+	int ret;
+
+	ret = raise_limit_to_core_size(RLIMIT_CORE);
+	if (ret)
+		return ret;
+
+	return raise_limit_to_core_size(RLIMIT_FSIZE);
+}
+
+/*
+ * Body of the forked child. It waits for the parent to publish the initial
+ * register values, raises the core file limits, allocates three pkeys and
+ * records in @info the AMR/IAMR/UAMOR values the parent should later find in
+ * the core file. It then writes the AMR, frees pkey3, stores the current
+ * time in info->core_time and deliberately writes through a NULL pointer to
+ * dump core. Only returns if something went wrong.
+ */
+static int child(struct shared_info *info)
+{
+ bool disable_execute = true;
+ int pkey1, pkey2, pkey3;
+ int *ptr, ret;
+
+ /* Wait until parent fills out the initial register values. */
+ ret = wait_parent(&info->child_sync);
+ if (ret)
+ return ret;
+
+ ret = increase_core_file_limit();
+ FAIL_IF(ret);
+
+ /* Get some pkeys so that we can change their bits in the AMR. */
+ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE);
+ if (pkey1 < 0) {
+ /* Allocation with PKEY_DISABLE_EXECUTE failed: retry without it. */
+ pkey1 = sys_pkey_alloc(0, 0);
+ FAIL_IF(pkey1 < 0);
+
+ disable_execute = false;
+ }
+
+ pkey2 = sys_pkey_alloc(0, 0);
+ FAIL_IF(pkey2 < 0);
+
+ pkey3 = sys_pkey_alloc(0, 0);
+ FAIL_IF(pkey3 < 0);
+
+ /* Update the expected register values shared with the parent. */
+ info->amr |= 3ul << pkeyshift(pkey1) | 2ul << pkeyshift(pkey2);
+
+ if (disable_execute)
+ info->iamr |= 1ul << pkeyshift(pkey1);
+ else
+ info->iamr &= ~(1ul << pkeyshift(pkey1));
+
+ info->iamr &= ~(1ul << pkeyshift(pkey2) | 1ul << pkeyshift(pkey3));
+
+ info->uamor |= 3ul << pkeyshift(pkey1) | 3ul << pkeyshift(pkey2);
+
+ printf("%-30s AMR: %016lx pkey1: %d pkey2: %d pkey3: %d\n",
+ user_write, info->amr, pkey1, pkey2, pkey3);
+
+ set_amr(info->amr);
+
+ /*
+ * We won't use pkey3. This tests whether the kernel restores the UAMOR
+ * permissions after a key is freed.
+ */
+ sys_pkey_free(pkey3);
+
+ /* Timestamp used by the parent to reject stale core files. */
+ info->core_time = time(NULL);
+
+ /* Crash. */
+ ptr = 0;
+ *ptr = 1;
+
+ /* Shouldn't get here. */
+ FAIL_IF(true);
+
+ return TEST_FAIL;
+}
+
+/*
+ * Return the file size if @filename exists and its modification time is not
+ * older than info->core_time, or TEST_FAIL if not. The caller compares the
+ * result against TEST_FAIL, not against zero. @pid is currently unused.
+ */
+static off_t try_core_file(const char *filename, struct shared_info *info,
+ pid_t pid)
+{
+ struct stat buf;
+ int ret;
+
+ ret = stat(filename, &buf);
+ if (ret == -1)
+ return TEST_FAIL;
+
+ /* Make sure we're not using a stale core file. */
+ return buf.st_mtime >= info->core_time ? buf.st_size : TEST_FAIL;
+}
+
+/*
+ * Return the address just past @nhdr: the header itself plus its name and
+ * descriptor, each rounded up to a multiple of 4 bytes.
+ */
+static Elf64_Nhdr *next_note(Elf64_Nhdr *nhdr)
+{
+	size_t name_len = __ALIGN_KERNEL(nhdr->n_namesz, 4);
+	size_t desc_len = __ALIGN_KERNEL(nhdr->n_descsz, 4);
+
+	return (void *) nhdr + sizeof(*nhdr) + name_len + desc_len;
+}
+
+/*
+ * Validate the mapped core file at @ehdr (of @core_size bytes) and compare
+ * the three registers stored in its NT_PPC_PKEY note against the AMR, IAMR
+ * and UAMOR values recorded in @info.
+ *
+ * Returns TEST_PASS when the file is a PPC64 ELF core whose pkey note
+ * matches; any failed check leaves through FAIL_IF().
+ */
+static int check_core_file(struct shared_info *info, Elf64_Ehdr *ehdr,
+			   off_t core_size)
+{
+	unsigned long *regs;
+	Elf64_Phdr *phdr;
+	Elf64_Nhdr *nhdr;
+	size_t phdr_size;
+	void *p = ehdr, *note;
+	int ret;
+
+	ret = memcmp(ehdr->e_ident, ELFMAG, SELFMAG);
+	FAIL_IF(ret);
+
+	FAIL_IF(ehdr->e_type != ET_CORE);
+	FAIL_IF(ehdr->e_machine != EM_PPC64);
+	FAIL_IF(ehdr->e_phoff == 0 || ehdr->e_phnum == 0);
+
+	/* Each table entry must be able to hold the fields read below. */
+	FAIL_IF(ehdr->e_phentsize < sizeof(*phdr));
+
+	/*
+	 * e_phnum and e_phentsize are both at most 65535, so calculating the
+	 * size of the program header table cannot overflow.
+	 */
+	phdr_size = (size_t) ehdr->e_phentsize * ehdr->e_phnum;
+
+	/* Sanity check the program header table location. */
+	FAIL_IF(ehdr->e_phoff + phdr_size < ehdr->e_phoff);
+	FAIL_IF(ehdr->e_phoff + phdr_size > core_size);
+
+	/*
+	 * Find the PT_NOTE segment. The table is stepped in bytes: adding
+	 * e_phentsize directly to a typed Elf64_Phdr pointer would skip
+	 * e_phentsize whole entries per iteration.
+	 */
+	for (phdr = p + ehdr->e_phoff;
+	     (void *) phdr < p + ehdr->e_phoff + phdr_size;
+	     phdr = (void *) phdr + ehdr->e_phentsize)
+		if (phdr->p_type == PT_NOTE)
+			break;
+
+	FAIL_IF((void *) phdr >= p + ehdr->e_phoff + phdr_size);
+
+	/* The note segment must lie inside the mapped file. */
+	FAIL_IF(phdr->p_offset + phdr->p_filesz < phdr->p_offset);
+	FAIL_IF(phdr->p_offset + phdr->p_filesz > core_size);
+
+	/* Find the NT_PPC_PKEY note. */
+	for (nhdr = p + phdr->p_offset;
+	     (void *) nhdr < p + phdr->p_offset + phdr->p_filesz;
+	     nhdr = next_note(nhdr))
+		if (nhdr->n_type == NT_PPC_PKEY)
+			break;
+
+	FAIL_IF((void *) nhdr >= p + phdr->p_offset + phdr->p_filesz);
+
+	/* Three registers are read from the descriptor below. */
+	FAIL_IF(nhdr->n_descsz < 3 * sizeof(*regs));
+
+	/* The descriptor follows the header and the 4-byte aligned name. */
+	p = nhdr;
+	note = p + sizeof(*nhdr) + __ALIGN_KERNEL(nhdr->n_namesz, 4);
+
+	regs = (unsigned long *) note;
+
+	printf("%-30s AMR: %016lx IAMR: %016lx UAMOR: %016lx\n",
+	       core_read_running, regs[0], regs[1], regs[2]);
+
+	FAIL_IF(regs[0] != info->amr);
+	FAIL_IF(regs[1] != info->iamr);
+	FAIL_IF(regs[2] != info->uamor);
+
+	return TEST_PASS;
+}
+
+/*
+ * Parent side of the test. Reads the child's initial AMR/IAMR/UAMOR through
+ * ptrace and publishes them in @info, wakes the child, waits for it to die
+ * with a core dump, then locates the core file (trying "core-pkey.<pid>",
+ * "core.<pid>" and "core"), maps it and hands it to check_core_file().
+ * The core file is unlinked once it has been checked.
+ *
+ * Returns the result of check_core_file(), or TEST_FAIL if any earlier step
+ * fails; unsupported pkey regsets leave through PARENT_SKIP_IF_UNSUPPORTED().
+ */
+static int parent(struct shared_info *info, pid_t pid)
+{
+	char *filenames, *filename[3];
+	int fd, i, ret, status;
+	int remaining;
+	unsigned long regs[3];
+	off_t core_size;
+	void *core;
+
+	/*
+	 * Get the initial values for AMR, IAMR and UAMOR and communicate them
+	 * to the child.
+	 */
+	ret = ptrace_read_regs(pid, NT_PPC_PKEY, regs, 3);
+	PARENT_SKIP_IF_UNSUPPORTED(ret, &info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	info->amr = regs[0];
+	info->iamr = regs[1];
+	info->uamor = regs[2];
+
+	/* Wake up child so that it can set itself up. */
+	ret = prod_child(&info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	ret = wait(&status);
+	if (ret != pid) {
+		printf("Child's exit status not captured\n");
+		return TEST_FAIL;
+	} else if (!WIFSIGNALED(status) || !WCOREDUMP(status)) {
+		printf("Child didn't dump core\n");
+		return TEST_FAIL;
+	}
+
+	/*
+	 * Construct array of core file names to try. The first two names
+	 * share one PATH_MAX buffer, stored back to back.
+	 */
+
+	filename[0] = filenames = malloc(PATH_MAX);
+	if (!filenames) {
+		perror("Error allocating memory");
+		return TEST_FAIL;
+	}
+
+	ret = snprintf(filename[0], PATH_MAX, "core-pkey.%d", pid);
+	if (ret < 0 || ret >= PATH_MAX) {
+		ret = TEST_FAIL;
+		goto out;
+	}
+
+	/*
+	 * Remember how much room is left before ret is overwritten, so the
+	 * truncation check compares against the size actually passed in.
+	 */
+	filename[1] = filename[0] + ret + 1;
+	remaining = PATH_MAX - ret - 1;
+	ret = snprintf(filename[1], remaining, "core.%d", pid);
+	if (ret < 0 || ret >= remaining) {
+		ret = TEST_FAIL;
+		goto out;
+	}
+	filename[2] = "core";
+
+	for (i = 0; i < 3; i++) {
+		core_size = try_core_file(filename[i], info, pid);
+		if (core_size != TEST_FAIL)
+			break;
+	}
+
+	if (i == 3) {
+		printf("Couldn't find core file\n");
+		ret = TEST_FAIL;
+		goto out;
+	}
+
+	fd = open(filename[i], O_RDONLY);
+	if (fd == -1) {
+		perror("Error opening core file");
+		ret = TEST_FAIL;
+		goto out;
+	}
+
+	core = mmap(NULL, core_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (core == MAP_FAILED) {
+		perror("Error mmaping core file");
+		/* Don't leak the descriptor on this error path. */
+		close(fd);
+		ret = TEST_FAIL;
+		goto out;
+	}
+
+	ret = check_core_file(info, core, core_size);
+
+	munmap(core, core_size);
+	close(fd);
+	unlink(filename[i]);
+
+ out:
+	free(filenames);
+
+	return ret;
+}
+
+/*
+ * Write @core_pattern to the core_pattern sysctl file. Leaves through
+ * SKIP_IF_MSG() when the file cannot be opened for writing; returns
+ * TEST_FAIL (after perror) on a short write and TEST_PASS otherwise.
+ */
+static int write_core_pattern(const char *core_pattern)
+{
+	size_t want = strlen(core_pattern);
+	size_t written;
+	FILE *fp = fopen(core_pattern_file, "w");
+
+	SKIP_IF_MSG(!fp, "Try with root privileges");
+
+	written = fwrite(core_pattern, 1, want, fp);
+	fclose(fp);
+	if (written == want)
+		return TEST_PASS;
+
+	perror("Error writing to core_pattern file");
+	return TEST_FAIL;
+}
+
+/*
+ * Read the current core_pattern and, unless it already yields a core file
+ * name this test can predict ("core" or "core.%p"), replace it with
+ * "core-pkey.%p".
+ *
+ * On success returns TEST_PASS, stores the original pattern (a malloc'd,
+ * NUL-terminated string the caller must free) in *@core_pattern_ and sets
+ * *@changed_ to tell the caller whether the pattern has to be restored.
+ * On failure returns a non-zero value and stores nothing.
+ */
+static int setup_core_pattern(char **core_pattern_, bool *changed_)
+{
+	FILE *f;
+	char *core_pattern;
+	size_t len;
+	int ret;
+
+	core_pattern = malloc(PATH_MAX);
+	if (!core_pattern) {
+		perror("Error allocating memory");
+		return TEST_FAIL;
+	}
+
+	f = fopen(core_pattern_file, "r");
+	if (!f) {
+		perror("Error opening core_pattern file");
+		ret = TEST_FAIL;
+		goto out;
+	}
+
+	/* fread() does not terminate the buffer: keep one byte for the NUL. */
+	len = fread(core_pattern, 1, PATH_MAX - 1, f);
+	fclose(f);
+	if (!len) {
+		perror("Error reading core_pattern file");
+		ret = TEST_FAIL;
+		goto out;
+	}
+	core_pattern[len] = '\0';
+
+	/*
+	 * Drop a trailing newline, if there is one, so that the string
+	 * comparisons below can actually match.
+	 */
+	if (core_pattern[len - 1] == '\n')
+		core_pattern[len - 1] = '\0';
+
+	/* Check whether we can predict the name of the core file. */
+	if (!strcmp(core_pattern, "core") || !strcmp(core_pattern, "core.%p"))
+		*changed_ = false;
+	else {
+		ret = write_core_pattern("core-pkey.%p");
+		if (ret)
+			goto out;
+
+		*changed_ = true;
+	}
+
+	*core_pattern_ = core_pattern;
+	ret = TEST_PASS;
+
+ out:
+	if (ret)
+		free(core_pattern);
+
+	return ret;
+}
+
+/*
+ * Test entry point: prepare core_pattern, create the shared memory segment
+ * holding struct shared_info, fork, and run child() / parent() in the two
+ * processes.
+ *
+ * The parent (also when fork() failed) destroys the semaphores while the
+ * segment is still attached, removes the segment and restores core_pattern
+ * if it was changed. The child only detaches and frees its own copy of the
+ * pattern buffer. Every error path after setup_core_pattern() goes through
+ * the same clean-up, so the system's core_pattern is never left modified.
+ */
+static int core_pkey(void)
+{
+	char *core_pattern;
+	bool changed_core_pattern;
+	struct shared_info *info;
+	int shm_id;
+	int ret;
+	pid_t pid;
+
+	ret = setup_core_pattern(&core_pattern, &changed_core_pattern);
+	if (ret)
+		return ret;
+
+	shm_id = shmget(IPC_PRIVATE, sizeof(*info), 0777 | IPC_CREAT);
+	if (shm_id < 0) {
+		perror("shmget() failed");
+		ret = TEST_FAIL;
+		goto out_restore;
+	}
+
+	info = shmat(shm_id, NULL, 0);
+	if (info == (void *) -1) {
+		perror("shmat() failed");
+		ret = TEST_FAIL;
+		goto out_rmid;
+	}
+
+	ret = init_child_sync(&info->child_sync);
+	if (ret) {
+		shmdt(info);
+		goto out_rmid;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork() failed");
+		ret = TEST_FAIL;
+	} else if (pid == 0)
+		ret = child(info);
+	else
+		ret = parent(info, pid);
+
+	if (pid == 0) {
+		/* Child: the shared resources belong to the parent. */
+		shmdt(info);
+		free(core_pattern);
+		return ret;
+	}
+
+	/* The semaphores live in the segment: destroy them before detaching. */
+	destroy_child_sync(&info->child_sync);
+	shmdt(info);
+
+ out_rmid:
+	shmctl(shm_id, IPC_RMID, NULL);
+
+ out_restore:
+	if (changed_core_pattern)
+		write_core_pattern(core_pattern);
+
+	free(core_pattern);
+
+	return ret;
+}
+
+/* Hand core_pkey() and its name to test_harness(); argc/argv are unused. */
+int main(int argc, char *argv[])
+{
+ return test_harness(core_pkey, "core_pkey");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
new file mode 100644
index 000000000..c1f324afd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -0,0 +1,308 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * perf events self profiling example test case for hw breakpoints.
+ *
+ * This tests perf PERF_TYPE_BREAKPOINT parameters
+ * 1) tests all variants of the break on read/write flags
+ * 2) tests exclude_user == 0 and 1
+ * 3) test array matches (if DAWR is supported)
+ * 4) test different numbers of breakpoints matches
+ *
+ * Configure this breakpoint, then read and write the data a number of
+ * times. Then check the output count from perf is as expected.
+ *
+ * Based on:
+ * http://ozlabs.org/~anton/junkcode/perf_events_example1.c
+ *
+ * Copyright (C) 2018 Michael Neuling, IBM Corporation.
+ */
+
+#include <unistd.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <elf.h>
+#include <pthread.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include "utils.h"
+
+#define MAX_LOOPS 10000
+
+#define DAWR_LENGTH_MAX ((0x3f + 1) * 8)
+
+/*
+ * Issue the perf_event_open system call. As a side effect attr->size is set
+ * to sizeof(*attr) before the call. Returns the result of syscall()
+ * unchanged; callers in this file treat a negative value as failure.
+ */
+static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
+ int cpu, int group_fd,
+ unsigned long flags)
+{
+ attr->size = sizeof(*attr);
+ return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+/*
+ * Probe whether a disabled read breakpoint of @len bytes can be opened for
+ * the current task. The event is closed again right away; only the success
+ * of the open is reported.
+ */
+static inline bool breakpoint_test(int len)
+{
+	struct perf_event_attr attr;
+	int fd;
+
+	memset(&attr, 0, sizeof(attr));
+	attr.type = PERF_TYPE_BREAKPOINT;
+	attr.bp_type = HW_BREAKPOINT_R;
+	attr.bp_len = len;
+	/* bp_addr can point anywhere but needs to be aligned */
+	attr.bp_addr = (__u64)(&attr) & 0xfffffffffffff800;
+	attr.disabled = 1;
+
+	fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	if (fd >= 0) {
+		close(fd);
+		return true;
+	}
+
+	return false;
+}
+
+/* True if a 4-byte read breakpoint can be opened through perf. */
+static inline bool perf_breakpoint_supported(void)
+{
+ return breakpoint_test(4);
+}
+
+/*
+ * True if a read breakpoint of DAWR_LENGTH_MAX bytes can be opened through
+ * perf; runtest() uses this to decide whether to run the array test.
+ */
+static inline bool dawr_supported(void)
+{
+ return breakpoint_test(DAWR_LENGTH_MAX);
+}
+
+/*
+ * Run one breakpoint counting test.
+ *
+ * @readwriteflag: HW_BREAKPOINT_R / HW_BREAKPOINT_W bits to watch for.
+ * @exclude_user:  value for attr.exclude_user (0 or 1); with 1 no hits are
+ *                 expected.
+ * @arraytest:     if non-zero, watch a DAWR_LENGTH_MAX byte window of an
+ *                 aligned array instead of a single int.
+ *
+ * The watched location is read and written once per loop iteration and the
+ * resulting perf count is compared with the expected number of hits.
+ * Returns 0 on a match and 1 on a mismatch; exits the process if the event
+ * cannot be opened.
+ */
+static int runtestsingle(int readwriteflag, int exclude_user, int arraytest)
+{
+	int i,j;
+	struct perf_event_attr attr;
+	size_t res;
+	unsigned long long breaks, needed;
+	/* Initialised so the read in the loop below is well defined. */
+	int readint = 0;
+	/*
+	 * Aligning up to a 0x800 boundary can skip as many as 0x7ff bytes,
+	 * and DAWR_LENGTH_MAX bytes are used from the aligned address on, so
+	 * the buffer has to cover both.
+	 */
+	int readintarraybig[(0x800 + DAWR_LENGTH_MAX)/sizeof(int)];
+	int *readintalign;
+	volatile int *ptr;
+	int break_fd;
+	int loop_num = MAX_LOOPS - (rand() % 100); /* provide some variability */
+	volatile int *k;
+
+	/* round the array address up to the next 0x800 boundary */
+	readintalign = (int *)(((unsigned long)readintarraybig + 0x7ff) &
+			       0xfffffffffffff800);
+
+	ptr = &readint;
+	if (arraytest)
+		ptr = &readintalign[0];
+
+	/* setup counters */
+	memset(&attr, 0, sizeof(attr));
+	attr.disabled = 1;
+	attr.type = PERF_TYPE_BREAKPOINT;
+	attr.bp_type = readwriteflag;
+	attr.bp_addr = (__u64)ptr;
+	attr.bp_len = sizeof(int);
+	if (arraytest)
+		attr.bp_len = DAWR_LENGTH_MAX;
+	attr.exclude_user = exclude_user;
+	break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+	if (break_fd < 0) {
+		perror("sys_perf_event_open");
+		exit(1);
+	}
+
+	/* start counters */
+	ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+
+	/* Test a bunch of reads and writes */
+	k = &readint;
+	for (i = 0; i < loop_num; i++) {
+		if (arraytest)
+			k = &(readintalign[i % (DAWR_LENGTH_MAX/sizeof(int))]);
+
+		j = *k;
+		*k = j;
+	}
+
+	/* stop counters */
+	ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+
+	/* read and check counters */
+	res = read(break_fd, &breaks, sizeof(unsigned long long));
+	assert(res == sizeof(unsigned long long));
+	/*
+	 * Every loop iteration does one read and one write, so each watched
+	 * access type contributes loop_num hits - unless user space is
+	 * excluded, in which case none are expected.
+	 */
+	needed = 0;
+	if (readwriteflag & HW_BREAKPOINT_R)
+		needed += loop_num;
+	if (readwriteflag & HW_BREAKPOINT_W)
+		needed += loop_num;
+	needed = needed * (1 - exclude_user);
+	printf("TESTED: addr:0x%lx brks:% 8lld loops:% 8i rw:%i !user:%i array:%i\n",
+	       (unsigned long int)ptr, breaks, loop_num, readwriteflag, exclude_user, arraytest);
+	if (breaks != needed) {
+		printf("FAILED: 0x%lx brks:%lld needed:%lli %i %i %i\n\n",
+		       (unsigned long int)ptr, breaks, needed, loop_num, readwriteflag, exclude_user);
+		/* Don't leak the event on the failure path. */
+		close(break_fd);
+		return 1;
+	}
+	close(break_fd);
+
+	return 0;
+}
+
+/*
+ * Check breakpoint hits for accesses that start outside the watched range.
+ *
+ * An 8-byte buffer is allocated and a read/write breakpoint is placed on the
+ * 4 bytes starting at offset 2. Five accesses are then made, each as one
+ * read followed by one write, with the counter reset before each:
+ *   - 2 bytes at offset 0: no overlap, 0 hits expected
+ *   - 2 bytes at offset 1: partial overlap, 2 hits expected
+ *   - 2 bytes at offset 5: partial overlap, 2 hits expected
+ *   - 2 bytes at offset 6: no overlap, 0 hits expected
+ *   - 8 bytes at offset 0: full overlap, 2 hits expected
+ *
+ * Returns 0 if every count matched and 1 otherwise; exits the process if
+ * the buffer or the perf event cannot be set up.
+ */
+static int runtest_dar_outside(void)
+{
+ void *target;
+ volatile __u16 temp16;
+ volatile __u64 temp64;
+ struct perf_event_attr attr;
+ int break_fd;
+ unsigned long long breaks;
+ int fail = 0;
+ size_t res;
+
+ target = malloc(8);
+ if (!target) {
+ perror("malloc failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /* setup counters */
+ memset(&attr, 0, sizeof(attr));
+ attr.disabled = 1;
+ attr.type = PERF_TYPE_BREAKPOINT;
+ attr.exclude_kernel = 1;
+ attr.exclude_hv = 1;
+ attr.exclude_guest = 1;
+ attr.bp_type = HW_BREAKPOINT_RW;
+ /* watch middle half of target array */
+ attr.bp_addr = (__u64)(target + 2);
+ attr.bp_len = 4;
+ break_fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+ if (break_fd < 0) {
+ free(target);
+ perror("sys_perf_event_open");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Shouldn't hit. */
+ ioctl(break_fd, PERF_EVENT_IOC_RESET);
+ ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+ temp16 = *((__u16 *)target);
+ *((__u16 *)target) = temp16;
+ ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+ res = read(break_fd, &breaks, sizeof(unsigned long long));
+ assert(res == sizeof(unsigned long long));
+ if (breaks == 0) {
+ printf("TESTED: No overlap\n");
+ } else {
+ printf("FAILED: No overlap: %lld != 0\n", breaks);
+ fail = 1;
+ }
+
+ /* Hit */
+ ioctl(break_fd, PERF_EVENT_IOC_RESET);
+ ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+ temp16 = *((__u16 *)(target + 1));
+ *((__u16 *)(target + 1)) = temp16;
+ ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+ res = read(break_fd, &breaks, sizeof(unsigned long long));
+ assert(res == sizeof(unsigned long long));
+ if (breaks == 2) {
+ printf("TESTED: Partial overlap\n");
+ } else {
+ printf("FAILED: Partial overlap: %lld != 2\n", breaks);
+ fail = 1;
+ }
+
+ /* Hit */
+ ioctl(break_fd, PERF_EVENT_IOC_RESET);
+ ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+ temp16 = *((__u16 *)(target + 5));
+ *((__u16 *)(target + 5)) = temp16;
+ ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+ res = read(break_fd, &breaks, sizeof(unsigned long long));
+ assert(res == sizeof(unsigned long long));
+ if (breaks == 2) {
+ printf("TESTED: Partial overlap\n");
+ } else {
+ printf("FAILED: Partial overlap: %lld != 2\n", breaks);
+ fail = 1;
+ }
+
+ /* Shouldn't Hit */
+ ioctl(break_fd, PERF_EVENT_IOC_RESET);
+ ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+ temp16 = *((__u16 *)(target + 6));
+ *((__u16 *)(target + 6)) = temp16;
+ ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+ res = read(break_fd, &breaks, sizeof(unsigned long long));
+ assert(res == sizeof(unsigned long long));
+ if (breaks == 0) {
+ printf("TESTED: No overlap\n");
+ } else {
+ printf("FAILED: No overlap: %lld != 0\n", breaks);
+ fail = 1;
+ }
+
+ /* Hit */
+ ioctl(break_fd, PERF_EVENT_IOC_RESET);
+ ioctl(break_fd, PERF_EVENT_IOC_ENABLE);
+ temp64 = *((__u64 *)target);
+ *((__u64 *)target) = temp64;
+ ioctl(break_fd, PERF_EVENT_IOC_DISABLE);
+ res = read(break_fd, &breaks, sizeof(unsigned long long));
+ assert(res == sizeof(unsigned long long));
+ if (breaks == 2) {
+ printf("TESTED: Full overlap\n");
+ } else {
+ printf("FAILED: Full overlap: %lld != 2\n", breaks);
+ fail = 1;
+ }
+
+ free(target);
+ close(break_fd);
+ return fail;
+}
+
+/*
+ * Run runtestsingle() for every combination of read/write flags and
+ * exclude_user, adding the array variant whenever dawr_supported() says so,
+ * and finish with runtest_dar_outside(). Stops at the first failing test
+ * and returns its result.
+ */
+static int runtest(void)
+{
+	int rw, excl, rc;
+
+	/*
+	 * perf defines the flag as two bits, read and write, of which at
+	 * least one must be set: values 1 to 3.
+	 */
+	for (rw = 1; rw <= 3; rw++) {
+		for (excl = 0; excl <= 1; excl++) {
+			rc = runtestsingle(rw, excl, 0);
+			if (rc)
+				return rc;
+
+			/* the array test needs the DAWR */
+			if (dawr_supported()) {
+				rc = runtestsingle(rw, excl, 1);
+				if (rc)
+					return rc;
+			}
+		}
+	}
+
+	return runtest_dar_outside();
+}
+
+
+/*
+ * Seed rand() from the current time (runtestsingle() uses it to vary the
+ * loop count), skip if perf breakpoints cannot be opened, then run the tests.
+ */
+static int perf_hwbreak(void)
+{
+ srand ( time(NULL) );
+
+ SKIP_IF(!perf_breakpoint_supported());
+
+ return runtest();
+}
+
+/* Hand perf_hwbreak() and its name to test_harness(); the arguments are unused. */
+int main(int argc, char *argv[], char **envp)
+{
+ return test_harness(perf_hwbreak, "perf_hwbreak");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.c
new file mode 100644
index 000000000..17cd480c8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for GPR/FPR registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "ptrace-gpr.h"
+#include "reg.h"
+
+/* Tracer and Tracee Shared Data */
+int shm_id;
+int *cptr, *pptr;
+
+float a = FPR_1;
+float b = FPR_2;
+float c = FPR_3;
+
+/*
+ * Tracee (child) side of the test. Attaches the shared segment, runs the
+ * inline asm below with GPR_1 and the address of 'a' as inputs, then sets
+ * cptr[1] and spins until cptr[0] becomes non-zero. Afterwards it snapshots
+ * its registers with store_gpr()/store_fpr_single_precision() and checks
+ * them against GPR_3 and 'c'. Never returns: exits with 0 when both checks
+ * pass and with 1 otherwise.
+ */
+void gpr(void)
+{
+ unsigned long gpr_buf[18];
+ float fpr_buf[32];
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+
+ /*
+ * NOTE(review): the two ASM_LOAD_* macros are defined elsewhere;
+ * presumably they load the GPRs with GPR_1 and the FPRs from 'a' -
+ * confirm against reg.h. r6..r31 are declared as clobbered.
+ */
+ asm __volatile__(
+ ASM_LOAD_GPR_IMMED(gpr_1)
+ ASM_LOAD_FPR_SINGLE_PRECISION(flt_1)
+ :
+ : [gpr_1]"i"(GPR_1), [flt_1] "b" (&a)
+ : "memory", "r6", "r7", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15", "r16", "r17",
+ "r18", "r19", "r20", "r21", "r22", "r23", "r24",
+ "r25", "r26", "r27", "r28", "r29", "r30", "r31"
+ );
+
+ /* Slot 1 is the flag the parent polls (as pptr[1]) before tracing */
+ cptr[1] = 1;
+
+ /*
+ * Busy-wait on slot 0. The empty asm with a "memory" clobber is a
+ * compiler barrier, so cptr[0] is re-read on every iteration.
+ */
+ while (!cptr[0])
+ asm volatile("" : : : "memory");
+
+ shmdt((void *)cptr);
+ store_gpr(gpr_buf);
+ store_fpr_single_precision(fpr_buf);
+
+ if (validate_gpr(gpr_buf, GPR_3))
+ exit(1);
+
+ if (validate_fpr_float(fpr_buf, c))
+ exit(1);
+
+ exit(0);
+}
+
+/*
+ * Tracer (parent) side. Between start_trace() and stop_trace() it reads the
+ * child's GPRs and FPRs, validates them against GPR_1 and FPR_1_REP, and
+ * then writes GPR_3 and FPR_3_REP into the child. Returns TEST_PASS when
+ * the whole sequence completes.
+ *
+ * NOTE(review): FAIL_IF() is defined elsewhere; presumably it returns a
+ * failure code from this function when its argument is non-zero - confirm.
+ */
+int trace_gpr(pid_t child)
+{
+ unsigned long gpr[18];
+ unsigned long fpr[32];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_gpr(child, gpr));
+ FAIL_IF(validate_gpr(gpr, GPR_1));
+ FAIL_IF(show_fpr(child, fpr));
+ FAIL_IF(validate_fpr(fpr, FPR_1_REP));
+ FAIL_IF(write_gpr(child, GPR_3));
+ FAIL_IF(write_fpr(child, FPR_3_REP));
+ FAIL_IF(stop_trace(child));
+
+ return TEST_PASS;
+}
+
+/*
+ * Test entry point. Creates a two-int shared segment used as a handshake
+ * (slot 1: child is ready, slot 0: parent is done), forks the tracee and
+ * runs trace_gpr() against it from the parent.
+ *
+ * Returns TEST_PASS only when tracing succeeded and the child exited
+ * normally with status 0. Every other outcome - setup failure, tracing
+ * failure, a child that could not be reaped, or a child that was killed by
+ * a signal - is reported as TEST_FAIL. The shared segment is removed on
+ * every parent-side exit path.
+ */
+int ptrace_gpr(void)
+{
+	pid_t pid;
+	int ret, status;
+
+	shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
+	if (shm_id < 0) {
+		perror("shmget() failed");
+		return TEST_FAIL;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork() failed");
+		/* Nobody will ever attach, so do not leak the segment */
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	if (pid == 0) {
+		gpr();
+		/* Not reached: every path through gpr() calls exit() */
+		return TEST_PASS;
+	}
+
+	pptr = (int *)shmat(shm_id, NULL, 0);
+	if (pptr == (void *)-1) {
+		perror("shmat() failed");
+		kill(pid, SIGTERM);
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	/* Wait until the child has loaded its registers and raised slot 1 */
+	while (!pptr[1])
+		asm volatile("" : : : "memory");
+
+	ret = trace_gpr(pid);
+	if (ret) {
+		kill(pid, SIGTERM);
+		shmdt((void *)pptr);
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	/* Release the child so that it validates the values we wrote */
+	pptr[0] = 1;
+	shmdt((void *)pptr);
+
+	ret = wait(&status);
+	shmctl(shm_id, IPC_RMID, NULL);
+	if (ret != pid) {
+		printf("Child's exit status not captured\n");
+		return TEST_FAIL;
+	}
+
+	/*
+	 * A child that died from a signal never validated anything, so it
+	 * must count as a failure just like a non-zero exit status.
+	 */
+	if (!WIFEXITED(status) || WEXITSTATUS(status))
+		return TEST_FAIL;
+
+	return TEST_PASS;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_gpr, "ptrace_gpr");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.h b/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.h
new file mode 100644
index 000000000..c5cd53181
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-gpr.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#define GPR_1 1
+#define GPR_2 2
+#define GPR_3 3
+#define GPR_4 4
+
+#define FPR_1 0.001
+#define FPR_2 0.002
+#define FPR_3 0.003
+#define FPR_4 0.004
+
+#define FPR_1_REP 0x3f50624de0000000
+#define FPR_2_REP 0x3f60624de0000000
+#define FPR_3_REP 0x3f689374c0000000
+#define FPR_4_REP 0x3f70624de0000000
+
+/*
+ * Compare all 18 entries of @gpr with @val and print every entry that
+ * differs, labelled GPR[14] through GPR[31]. Returns TEST_PASS when all
+ * entries match and TEST_FAIL otherwise. The buffer must have 18 elements.
+ */
+int validate_gpr(unsigned long *gpr, unsigned long val)
+{
+	int idx;
+	int result = TEST_PASS;
+
+	for (idx = 0; idx < 18; idx++) {
+		if (gpr[idx] == val)
+			continue;
+
+		printf("GPR[%d]: %lx Expected: %lx\n",
+				idx + 14, gpr[idx], val);
+		result = TEST_FAIL;
+	}
+
+	return result;
+}
+
+/*
+ * Compare all 32 entries of @fpr with the raw value @val and print every
+ * entry that differs. Returns TEST_PASS when all entries match and
+ * TEST_FAIL otherwise. The buffer must have 32 elements.
+ */
+int validate_fpr(unsigned long *fpr, unsigned long val)
+{
+	int idx;
+	int result = TEST_PASS;
+
+	for (idx = 0; idx < 32; idx++) {
+		if (fpr[idx] == val)
+			continue;
+
+		printf("FPR[%d]: %lx Expected: %lx\n", idx, fpr[idx], val);
+		result = TEST_FAIL;
+	}
+
+	return result;
+}
+
+/*
+ * Single-precision variant of validate_fpr(): compare all 32 entries of
+ * @fpr with @val using exact equality and print every entry that differs.
+ * Returns TEST_PASS when all entries match and TEST_FAIL otherwise. The
+ * buffer must have 32 elements.
+ */
+int validate_fpr_float(float *fpr, float val)
+{
+	int idx;
+	int result = TEST_PASS;
+
+	for (idx = 0; idx < 32; idx++) {
+		if (fpr[idx] == val)
+			continue;
+
+		printf("FPR[%d]: %f Expected: %f\n", idx, fpr[idx], val);
+		result = TEST_FAIL;
+	}
+
+	return result;
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
new file mode 100644
index 000000000..2e0d86e06
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-hwbreak.c
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Ptrace test for hw breakpoints
+ *
+ * Based on tools/testing/selftests/breakpoints/breakpoint_test.c
+ *
+ * This test forks and the parent then traces the child doing various
+ * types of ptrace enabled breakpoints
+ *
+ * Copyright (C) 2018 Michael Neuling, IBM Corporation.
+ */
+
+#include <sys/ptrace.h>
+#include <unistd.h>
+#include <stddef.h>
+#include <sys/user.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <linux/limits.h>
+#include "ptrace.h"
+
+#define SPRN_PVR 0x11F
+#define PVR_8xx 0x00500000
+
+bool is_8xx;
+
+/*
+ * Use volatile on all global var so that compiler doesn't
+ * optimise their load/stores. Otherwise selftest can fail.
+ */
+static volatile __u64 glvar;
+
+#define DAWR_MAX_LEN 512
+static volatile __u8 big_var[DAWR_MAX_LEN] __attribute__((aligned(512)));
+
+#define A_LEN 6
+#define B_LEN 6
+struct gstruct {
+ __u8 a[A_LEN]; /* double word aligned */
+ __u8 b[B_LEN]; /* double word unaligned */
+};
+static volatile struct gstruct gstruct __attribute__((aligned(512)));
+
+static volatile char cwd[PATH_MAX] __attribute__((aligned(8)));
+
+static void get_dbginfo(pid_t child_pid, struct ppc_debug_info *dbginfo)
+{
+ if (ptrace(PPC_PTRACE_GETHWDBGINFO, child_pid, NULL, dbginfo)) {
+ perror("Can't get breakpoint info");
+ exit(-1);
+ }
+}
+
+static bool dawr_present(struct ppc_debug_info *dbginfo)
+{
+ return !!(dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_DAWR);
+}
+
+static void write_var(int len)
+{
+ __u8 *pcvar;
+ __u16 *psvar;
+ __u32 *pivar;
+ __u64 *plvar;
+
+ switch (len) {
+ case 1:
+ pcvar = (__u8 *)&glvar;
+ *pcvar = 0xff;
+ break;
+ case 2:
+ psvar = (__u16 *)&glvar;
+ *psvar = 0xffff;
+ break;
+ case 4:
+ pivar = (__u32 *)&glvar;
+ *pivar = 0xffffffff;
+ break;
+ case 8:
+ plvar = (__u64 *)&glvar;
+ *plvar = 0xffffffffffffffffLL;
+ break;
+ }
+}
+
+static void read_var(int len)
+{
+ __u8 cvar __attribute__((unused));
+ __u16 svar __attribute__((unused));
+ __u32 ivar __attribute__((unused));
+ __u64 lvar __attribute__((unused));
+
+ switch (len) {
+ case 1:
+ cvar = (__u8)glvar;
+ break;
+ case 2:
+ svar = (__u16)glvar;
+ break;
+ case 4:
+ ivar = (__u32)glvar;
+ break;
+ case 8:
+ lvar = (__u64)glvar;
+ break;
+ }
+}
+
+static void test_workload(void)
+{
+ __u8 cvar __attribute__((unused));
+ __u32 ivar __attribute__((unused));
+ int len = 0;
+
+ if (ptrace(PTRACE_TRACEME, 0, NULL, 0)) {
+ perror("Child can't be traced?");
+ exit(-1);
+ }
+
+ /* Wake up father so that it sets up the first test */
+ kill(getpid(), SIGUSR1);
+
+ /* PTRACE_SET_DEBUGREG, WO test */
+ for (len = 1; len <= sizeof(glvar); len <<= 1)
+ write_var(len);
+
+ /* PTRACE_SET_DEBUGREG, RO test */
+ for (len = 1; len <= sizeof(glvar); len <<= 1)
+ read_var(len);
+
+ /* PTRACE_SET_DEBUGREG, RW test */
+ for (len = 1; len <= sizeof(glvar); len <<= 1) {
+ if (rand() % 2)
+ read_var(len);
+ else
+ write_var(len);
+ }
+
+ /* PTRACE_SET_DEBUGREG, Kernel Access Userspace test */
+ syscall(__NR_getcwd, &cwd, PATH_MAX);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO test */
+ write_var(1);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RO test */
+ read_var(1);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RW test */
+ if (rand() % 2)
+ write_var(1);
+ else
+ read_var(1);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, Kernel Access Userspace test */
+ syscall(__NR_getcwd, &cwd, PATH_MAX);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, WO test */
+ gstruct.a[rand() % A_LEN] = 'a';
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RO test */
+ cvar = gstruct.a[rand() % A_LEN];
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RW test */
+ if (rand() % 2)
+ gstruct.a[rand() % A_LEN] = 'a';
+ else
+ cvar = gstruct.a[rand() % A_LEN];
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, WO test */
+ gstruct.b[rand() % B_LEN] = 'b';
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RO test */
+ cvar = gstruct.b[rand() % B_LEN];
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RW test */
+ if (rand() % 2)
+ gstruct.b[rand() % B_LEN] = 'b';
+ else
+ cvar = gstruct.b[rand() % B_LEN];
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, DAR OUTSIDE, RW test */
+ if (rand() % 2)
+ *((int *)(gstruct.a + 4)) = 10;
+ else
+ ivar = *((int *)(gstruct.a + 4));
+
+ /* PPC_PTRACE_SETHWDEBUG. DAWR_MAX_LEN. RW test */
+ if (rand() % 2)
+ big_var[rand() % DAWR_MAX_LEN] = 'a';
+ else
+ cvar = big_var[rand() % DAWR_MAX_LEN];
+}
+
+/*
+ * Wait for the child to stop and verify that it stopped with SIGTRAP at an
+ * address inside the 8-byte-aligned window that covers @len bytes starting
+ * at @saddr. Prints "<name>, <type>, len: <len>: Ok" on success; otherwise
+ * prints the same line ending in "Fail" and exits with -1. Except on 8xx,
+ * the child is then single-stepped over the trapping instruction.
+ */
+static void check_success(pid_t child_pid, const char *name, const char *type,
+			  unsigned long saddr, int len)
+{
+	/* Round the watched range outwards to 8-byte boundaries */
+	const unsigned long first = saddr & ~0x7UL;
+	const unsigned long last = (saddr + len - 1) | 0x7;
+	siginfo_t info;
+	int wstatus;
+	bool hit;
+
+	/* Wait for the child to SIGTRAP */
+	wait(&wstatus);
+
+	ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info);
+
+	/* si_addr is only looked at once the stop itself is the right kind */
+	hit = WIFSTOPPED(wstatus) && WSTOPSIG(wstatus) == SIGTRAP;
+	if (hit) {
+		unsigned long fault = (unsigned long)info.si_addr;
+
+		hit = fault >= first && fault <= last;
+	}
+
+	if (!hit) {
+		printf("%s, %s, len: %d: Fail\n", name, type, len);
+		exit(-1);
+	}
+
+	printf("%s, %s, len: %d: Ok\n", name, type, len);
+
+	if (is_8xx)
+		return;
+
+	/*
+	 * For ptrace registered watchpoint, signal is generated
+	 * before executing load/store. Singlestep the instruction
+	 * and then continue the test.
+	 */
+	ptrace(PTRACE_SINGLESTEP, child_pid, NULL, 0);
+	wait(NULL);
+}
+
+static void ptrace_set_debugreg(pid_t child_pid, unsigned long wp_addr)
+{
+ if (ptrace(PTRACE_SET_DEBUGREG, child_pid, 0, wp_addr)) {
+ perror("PTRACE_SET_DEBUGREG failed");
+ exit(-1);
+ }
+}
+
+/*
+ * Install the hardware breakpoint described by @info in @child_pid with
+ * PPC_PTRACE_SETHWDEBUG and return the value ptrace() handed back; callers
+ * pass that value to ptrace_delhwdebug() later. A result <= 0 is treated as
+ * fatal: a message is printed with perror() and the process exits with -1.
+ */
+static int ptrace_sethwdebug(pid_t child_pid, struct ppc_hw_breakpoint *info)
+{
+ int wh = ptrace(PPC_PTRACE_SETHWDEBUG, child_pid, 0, info);
+
+ if (wh <= 0) {
+ perror("PPC_PTRACE_SETHWDEBUG failed");
+ exit(-1);
+ }
+ return wh;
+}
+
+static void ptrace_delhwdebug(pid_t child_pid, int wh)
+{
+ if (ptrace(PPC_PTRACE_DELHWDEBUG, child_pid, 0, wh) < 0) {
+ perror("PPC_PTRACE_DELHWDEBUG failed");
+ exit(-1);
+ }
+}
+
+#define DABR_READ_SHIFT 0
+#define DABR_WRITE_SHIFT 1
+#define DABR_TRANSLATION_SHIFT 2
+
+/*
+ * Exercise PTRACE_SET_DEBUGREG on glvar in three passes - write-only,
+ * read-only and read/write. In each pass the watchpoint is re-armed and the
+ * child continued once per access length (1, 2, 4, 8), and check_success()
+ * verifies the resulting trap. The debug register is cleared at the end.
+ * Always returns 0; failures terminate the process inside the helpers.
+ */
+static int test_set_debugreg(pid_t child_pid)
+{
+	static const struct {
+		unsigned long bits;
+		const char *type;
+	} passes[] = {
+		{ 1UL << DABR_WRITE_SHIFT, "WO" },
+		{ 1UL << DABR_READ_SHIFT, "RO" },
+		{ (1UL << DABR_READ_SHIFT) | (1UL << DABR_WRITE_SHIFT), "RW" },
+	};
+	/* The low three bits of the register value carry the mode flags */
+	const unsigned long base = (unsigned long)&glvar & ~0x7UL;
+	char *name = "PTRACE_SET_DEBUGREG";
+	unsigned long wp_addr;
+	unsigned int p;
+	int len;
+
+	for (p = 0; p < sizeof(passes) / sizeof(passes[0]); p++) {
+		wp_addr = base | passes[p].bits |
+			  (1UL << DABR_TRANSLATION_SHIFT);
+
+		for (len = 1; len <= sizeof(glvar); len <<= 1) {
+			ptrace_set_debugreg(child_pid, wp_addr);
+			ptrace(PTRACE_CONT, child_pid, NULL, 0);
+			check_success(child_pid, name, passes[p].type,
+				      wp_addr, len);
+		}
+	}
+
+	ptrace_set_debugreg(child_pid, 0);
+	return 0;
+}
+
+static int test_set_debugreg_kernel_userspace(pid_t child_pid)
+{
+ unsigned long wp_addr = (unsigned long)cwd;
+ char *name = "PTRACE_SET_DEBUGREG";
+
+ /* PTRACE_SET_DEBUGREG, Kernel Access Userspace test */
+ wp_addr &= ~0x7UL;
+ wp_addr |= (1Ul << DABR_READ_SHIFT);
+ wp_addr |= (1UL << DABR_WRITE_SHIFT);
+ wp_addr |= (1UL << DABR_TRANSLATION_SHIFT);
+ ptrace_set_debugreg(child_pid, wp_addr);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "Kernel Access Userspace", wp_addr, 8);
+
+ ptrace_set_debugreg(child_pid, 0);
+ return 0;
+}
+
+static void get_ppc_hw_breakpoint(struct ppc_hw_breakpoint *info, int type,
+ unsigned long addr, int len)
+{
+ info->version = 1;
+ info->trigger_type = type;
+ info->condition_mode = PPC_BREAKPOINT_CONDITION_NONE;
+ info->addr = (__u64)addr;
+ info->addr2 = (__u64)addr + len;
+ info->condition_value = 0;
+ if (!len)
+ info->addr_mode = PPC_BREAKPOINT_MODE_EXACT;
+ else
+ info->addr_mode = PPC_BREAKPOINT_MODE_RANGE_INCLUSIVE;
+}
+
+static void test_sethwdebug_exact(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr = (unsigned long)&glvar;
+ char *name = "PPC_PTRACE_SETHWDEBUG, MODE_EXACT";
+ int len = 1; /* hardcoded in kernel */
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, WO test */
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr, 0);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "WO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RO test */
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_READ, wp_addr, 0);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, RW test */
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, 0);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RW", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+}
+
+static void test_sethwdebug_exact_kernel_userspace(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr = (unsigned long)&cwd;
+ char *name = "PPC_PTRACE_SETHWDEBUG, MODE_EXACT";
+ int len = 1; /* hardcoded in kernel */
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_EXACT, Kernel Access Userspace test */
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr, 0);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "Kernel Access Userspace", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+}
+
+static void test_sethwdebug_range_aligned(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr;
+ char *name = "PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED";
+ int len;
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, WO test */
+ wp_addr = (unsigned long)&gstruct.a;
+ len = A_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "WO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RO test */
+ wp_addr = (unsigned long)&gstruct.a;
+ len = A_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_READ, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW ALIGNED, RW test */
+ wp_addr = (unsigned long)&gstruct.a;
+ len = A_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RW", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+}
+
+static void test_sethwdebug_range_unaligned(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr;
+ char *name = "PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED";
+ int len;
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, WO test */
+ wp_addr = (unsigned long)&gstruct.b;
+ len = B_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_WRITE, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "WO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RO test */
+ wp_addr = (unsigned long)&gstruct.b;
+ len = B_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_READ, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RO", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+ /* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, RW test */
+ wp_addr = (unsigned long)&gstruct.b;
+ len = B_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RW", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+
+}
+
+/*
+ * Range watchpoint on gstruct.b, hit by an access that starts at
+ * gstruct.a + 4, i.e. before the first watched byte.
+ *
+ * The matching step in test_workload() picks a load or a store at random,
+ * and the result is reported as "RW", so the breakpoint has to be armed for
+ * both directions. With a write-only trigger the test would only work when
+ * the child happens to choose the store.
+ */
+static void test_sethwdebug_range_unaligned_dar(pid_t child_pid)
+{
+	struct ppc_hw_breakpoint info;
+	unsigned long wp_addr;
+	char *name = "PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, DAR OUTSIDE";
+	int len;
+	int wh;
+
+	/* PPC_PTRACE_SETHWDEBUG, MODE_RANGE, DW UNALIGNED, DAR OUTSIDE, RW test */
+	wp_addr = (unsigned long)&gstruct.b;
+	len = B_LEN;
+	get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len);
+	wh = ptrace_sethwdebug(child_pid, &info);
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	check_success(child_pid, name, "RW", wp_addr, len);
+	ptrace_delhwdebug(child_pid, wh);
+}
+
+static void test_sethwdebug_dawr_max_range(pid_t child_pid)
+{
+ struct ppc_hw_breakpoint info;
+ unsigned long wp_addr;
+ char *name = "PPC_PTRACE_SETHWDEBUG, DAWR_MAX_LEN";
+ int len;
+ int wh;
+
+ /* PPC_PTRACE_SETHWDEBUG, DAWR_MAX_LEN, RW test */
+ wp_addr = (unsigned long)big_var;
+ len = DAWR_MAX_LEN;
+ get_ppc_hw_breakpoint(&info, PPC_BREAKPOINT_TRIGGER_RW, wp_addr, len);
+ wh = ptrace_sethwdebug(child_pid, &info);
+ ptrace(PTRACE_CONT, child_pid, NULL, 0);
+ check_success(child_pid, name, "RW", wp_addr, len);
+ ptrace_delhwdebug(child_pid, wh);
+}
+
+/* Set the breakpoints and check that the child successfully triggers them */
+static void
+run_tests(pid_t child_pid, struct ppc_debug_info *dbginfo, bool dawr)
+{
+ test_set_debugreg(child_pid);
+ test_set_debugreg_kernel_userspace(child_pid);
+ test_sethwdebug_exact(child_pid);
+ test_sethwdebug_exact_kernel_userspace(child_pid);
+ if (dbginfo->features & PPC_DEBUG_FEATURE_DATA_BP_RANGE) {
+ test_sethwdebug_range_aligned(child_pid);
+ if (dawr || is_8xx) {
+ test_sethwdebug_range_unaligned(child_pid);
+ test_sethwdebug_range_unaligned_dar(child_pid);
+ test_sethwdebug_dawr_max_range(child_pid);
+ }
+ }
+}
+
+/*
+ * Test entry point. Forks a child that runs test_workload() as a tracee,
+ * waits for its first stop and then drives all breakpoint tests from the
+ * parent. The test is skipped when the kernel reports no data breakpoints.
+ *
+ * Returns TEST_FAIL when the child cannot be created. Otherwise returns
+ * TEST_PASS, because each individual check exits with -1 by itself on
+ * failure.
+ */
+static int ptrace_hwbreak(void)
+{
+	pid_t child_pid;
+	struct ppc_debug_info dbginfo;
+	bool dawr;
+
+	child_pid = fork();
+	if (child_pid < 0) {
+		/*
+		 * Without this check the failure would only surface later
+		 * as a misleading "Can't get breakpoint info" error.
+		 */
+		perror("fork() failed");
+		return TEST_FAIL;
+	}
+	if (!child_pid) {
+		test_workload();
+		return 0;
+	}
+
+	/* Wait for the stop the child raises right after PTRACE_TRACEME */
+	wait(NULL);
+
+	get_dbginfo(child_pid, &dbginfo);
+	SKIP_IF(dbginfo.num_data_bps == 0);
+
+	dawr = dawr_present(&dbginfo);
+	run_tests(child_pid, &dbginfo, dawr);
+
+	/* Let the child exit first. */
+	ptrace(PTRACE_CONT, child_pid, NULL, 0);
+	wait(NULL);
+
+	/*
+	 * Testcases exit immediately with -1 on any failure. If
+	 * we have reached here, it means all tests were successful.
+	 */
+	return TEST_PASS;
+}
+
+int main(int argc, char **argv, char **envp)
+{
+ int pvr = 0;
+ asm __volatile__ ("mfspr %0,%1" : "=r"(pvr) : "i"(SPRN_PVR));
+ if (pvr == PVR_8xx)
+ is_8xx = true;
+
+ return test_harness(ptrace_hwbreak, "ptrace-hwbreak");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
new file mode 100644
index 000000000..bc454f899
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Ptrace test for Memory Protection Key registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ * Copyright (C) 2018 IBM Corporation.
+ */
+#include "ptrace.h"
+#include "child.h"
+
+#ifndef __NR_pkey_alloc
+#define __NR_pkey_alloc 384
+#endif
+
+#ifndef __NR_pkey_free
+#define __NR_pkey_free 385
+#endif
+
+#ifndef NT_PPC_PKEY
+#define NT_PPC_PKEY 0x110
+#endif
+
+#ifndef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE 0x4
+#endif
+
+#define AMR_BITS_PER_PKEY 2
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
+
+static const char user_read[] = "[User Read (Running)]";
+static const char user_write[] = "[User Write (Running)]";
+static const char ptrace_read_running[] = "[Ptrace Read (Running)]";
+static const char ptrace_write_running[] = "[Ptrace Write (Running)]";
+
+/* Information shared between the parent and the child. */
+struct shared_info {
+ struct child_sync child_sync;
+
+ /* AMR value the parent expects to read from the child. */
+ unsigned long amr1;
+
+ /* AMR value the parent is expected to write to the child. */
+ unsigned long amr2;
+
+ /* AMR value that ptrace should refuse to write to the child. */
+ unsigned long invalid_amr;
+
+ /* IAMR value the parent expects to read from the child. */
+ unsigned long expected_iamr;
+
+ /* UAMOR value the parent expects to read from the child. */
+ unsigned long expected_uamor;
+
+ /*
+ * IAMR and UAMOR values that ptrace should refuse to write to the child
+ * (even though they're valid ones) because userspace doesn't have
+ * access to those registers.
+ */
+ unsigned long invalid_iamr;
+ unsigned long invalid_uamor;
+};
+
+static int sys_pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+{
+ return syscall(__NR_pkey_alloc, flags, init_access_rights);
+}
+
+static int child(struct shared_info *info)
+{
+ unsigned long reg;
+ bool disable_execute = true;
+ int pkey1, pkey2, pkey3;
+ int ret;
+
+ /* Wait until parent fills out the initial register values. */
+ ret = wait_parent(&info->child_sync);
+ if (ret)
+ return ret;
+
+ /* Get some pkeys so that we can change their bits in the AMR. */
+ pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE);
+ if (pkey1 < 0) {
+ pkey1 = sys_pkey_alloc(0, 0);
+ CHILD_FAIL_IF(pkey1 < 0, &info->child_sync);
+
+ disable_execute = false;
+ }
+
+ pkey2 = sys_pkey_alloc(0, 0);
+ CHILD_FAIL_IF(pkey2 < 0, &info->child_sync);
+
+ pkey3 = sys_pkey_alloc(0, 0);
+ CHILD_FAIL_IF(pkey3 < 0, &info->child_sync);
+
+ info->amr1 |= 3ul << pkeyshift(pkey1);
+ info->amr2 |= 3ul << pkeyshift(pkey2);
+ /*
+ * invalid amr value where we try to force write
+ * things which are denied by a uamor setting.
+ */
+ info->invalid_amr = info->amr2 | (~0x0UL & ~info->expected_uamor);
+
+ /*
+ * if PKEY_DISABLE_EXECUTE succeeded we should update the expected_iamr
+ */
+ if (disable_execute)
+ info->expected_iamr |= 1ul << pkeyshift(pkey1);
+ else
+ info->expected_iamr &= ~(1ul << pkeyshift(pkey1));
+
+ /*
+ * We allocated pkey2 and pkey 3 above. Clear the IAMR bits.
+ */
+ info->expected_iamr &= ~(1ul << pkeyshift(pkey2));
+ info->expected_iamr &= ~(1ul << pkeyshift(pkey3));
+
+ /*
+ * Create an IAMR value different from expected value.
+ * Kernel will reject an IAMR and UAMOR change.
+ */
+ info->invalid_iamr = info->expected_iamr | (1ul << pkeyshift(pkey1) | 1ul << pkeyshift(pkey2));
+ info->invalid_uamor = info->expected_uamor & ~(0x3ul << pkeyshift(pkey1));
+
+ printf("%-30s AMR: %016lx pkey1: %d pkey2: %d pkey3: %d\n",
+ user_write, info->amr1, pkey1, pkey2, pkey3);
+
+ set_amr(info->amr1);
+
+ /* Wait for parent to read our AMR value and write a new one. */
+ ret = prod_parent(&info->child_sync);
+ CHILD_FAIL_IF(ret, &info->child_sync);
+
+ ret = wait_parent(&info->child_sync);
+ if (ret)
+ return ret;
+
+ reg = mfspr(SPRN_AMR);
+
+ printf("%-30s AMR: %016lx\n", user_read, reg);
+
+ CHILD_FAIL_IF(reg != info->amr2, &info->child_sync);
+
+ /*
+ * Wait for parent to try to write an invalid AMR value.
+ */
+ ret = prod_parent(&info->child_sync);
+ CHILD_FAIL_IF(ret, &info->child_sync);
+
+ ret = wait_parent(&info->child_sync);
+ if (ret)
+ return ret;
+
+ reg = mfspr(SPRN_AMR);
+
+ printf("%-30s AMR: %016lx\n", user_read, reg);
+
+ CHILD_FAIL_IF(reg != info->amr2, &info->child_sync);
+
+ /*
+ * Wait for parent to try to write an IAMR and a UAMOR value. We can't
+ * verify them, but we can verify that the AMR didn't change.
+ */
+ ret = prod_parent(&info->child_sync);
+ CHILD_FAIL_IF(ret, &info->child_sync);
+
+ ret = wait_parent(&info->child_sync);
+ if (ret)
+ return ret;
+
+ reg = mfspr(SPRN_AMR);
+
+ printf("%-30s AMR: %016lx\n", user_read, reg);
+
+ CHILD_FAIL_IF(reg != info->amr2, &info->child_sync);
+
+ /* Now let parent know that we are finished. */
+
+ ret = prod_parent(&info->child_sync);
+ CHILD_FAIL_IF(ret, &info->child_sync);
+
+ return TEST_PASS;
+}
+
+/*
+ * Tracer side of the pkey test. Walks through the handshake with the child:
+ *
+ *  1. read AMR/IAMR/UAMOR through ptrace and publish them in @info,
+ *  2. after the child has set amr1, read the registers back and compare,
+ *  3. write amr2 and let the child confirm it,
+ *  4. write invalid_amr and let the child confirm the AMR is unchanged,
+ *  5. attempt IAMR and IAMR+UAMOR writes, which are expected to fail,
+ *  6. re-read all three registers and compare with the expected values,
+ *  7. reap the child and translate its exit status.
+ *
+ * Returns TEST_PASS only when every step succeeded and the child exited
+ * normally with status 0. A child that cannot be reaped is reported as
+ * TEST_FAIL, since its result is unknown.
+ */
+static int parent(struct shared_info *info, pid_t pid)
+{
+	unsigned long regs[3];
+	int ret, status;
+
+	/*
+	 * Get the initial values for AMR, IAMR and UAMOR and communicate them
+	 * to the child.
+	 */
+	ret = ptrace_read_regs(pid, NT_PPC_PKEY, regs, 3);
+	PARENT_SKIP_IF_UNSUPPORTED(ret, &info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	info->amr1 = info->amr2 = regs[0];
+	info->expected_iamr = regs[1];
+	info->expected_uamor = regs[2];
+
+	/* Wake up child so that it can set itself up. */
+	ret = prod_child(&info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	ret = wait_child(&info->child_sync);
+	if (ret)
+		return ret;
+
+	/* Verify that we can read the pkey registers from the child. */
+	ret = ptrace_read_regs(pid, NT_PPC_PKEY, regs, 3);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx IAMR: %016lx UAMOR: %016lx\n",
+	       ptrace_read_running, regs[0], regs[1], regs[2]);
+
+	PARENT_FAIL_IF(regs[0] != info->amr1, &info->child_sync);
+	PARENT_FAIL_IF(regs[1] != info->expected_iamr, &info->child_sync);
+	PARENT_FAIL_IF(regs[2] != info->expected_uamor, &info->child_sync);
+
+	/* Write valid AMR value in child. */
+	ret = ptrace_write_regs(pid, NT_PPC_PKEY, &info->amr2, 1);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx\n", ptrace_write_running, info->amr2);
+
+	/* Wake up child so that it can verify it changed. */
+	ret = prod_child(&info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	ret = wait_child(&info->child_sync);
+	if (ret)
+		return ret;
+
+	/* Write invalid AMR value in child. */
+	ret = ptrace_write_regs(pid, NT_PPC_PKEY, &info->invalid_amr, 1);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx\n", ptrace_write_running, info->invalid_amr);
+
+	/* Wake up child so that it can verify it didn't change. */
+	ret = prod_child(&info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	ret = wait_child(&info->child_sync);
+	if (ret)
+		return ret;
+
+	/* Try to write to IAMR. The write is expected to be refused. */
+	regs[0] = info->amr1;
+	regs[1] = info->invalid_iamr;
+	ret = ptrace_write_regs(pid, NT_PPC_PKEY, regs, 2);
+	PARENT_FAIL_IF(!ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx IAMR: %016lx\n",
+	       ptrace_write_running, regs[0], regs[1]);
+
+	/* Try to write to IAMR and UAMOR. Expected to be refused as well. */
+	regs[2] = info->invalid_uamor;
+	ret = ptrace_write_regs(pid, NT_PPC_PKEY, regs, 3);
+	PARENT_FAIL_IF(!ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx IAMR: %016lx UAMOR: %016lx\n",
+	       ptrace_write_running, regs[0], regs[1], regs[2]);
+
+	/* Verify that all registers still have their expected values. */
+	ret = ptrace_read_regs(pid, NT_PPC_PKEY, regs, 3);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	printf("%-30s AMR: %016lx IAMR: %016lx UAMOR: %016lx\n",
+	       ptrace_read_running, regs[0], regs[1], regs[2]);
+
+	PARENT_FAIL_IF(regs[0] != info->amr2, &info->child_sync);
+	PARENT_FAIL_IF(regs[1] != info->expected_iamr, &info->child_sync);
+	PARENT_FAIL_IF(regs[2] != info->expected_uamor, &info->child_sync);
+
+	/* Wake up child so that it can verify AMR didn't change and wrap up. */
+	ret = prod_child(&info->child_sync);
+	PARENT_FAIL_IF(ret, &info->child_sync);
+
+	ret = wait(&status);
+	if (ret != pid) {
+		/* The child's verdict is unknown, so this cannot be a pass */
+		printf("Child's exit status not captured\n");
+		ret = TEST_FAIL;
+	} else if (!WIFEXITED(status)) {
+		printf("Child exited abnormally\n");
+		ret = TEST_FAIL;
+	} else
+		ret = WEXITSTATUS(status) ? TEST_FAIL : TEST_PASS;
+
+	return ret;
+}
+
+/*
+ * Test entry point. Sets up a shared_info block in SysV shared memory,
+ * initialises the parent/child synchronisation in it, forks, and runs
+ * child() or parent() on the respective side.
+ *
+ * Returns whatever child()/parent() returned, TEST_FAIL when the shared
+ * memory or the fork cannot be set up, or the init_child_sync() error.
+ * The shared segment is detached on all paths; the non-child side also
+ * destroys the sync object and removes the segment.
+ */
+static int ptrace_pkey(void)
+{
+	struct shared_info *info;
+	int shm_id;
+	int ret;
+	pid_t pid;
+
+	shm_id = shmget(IPC_PRIVATE, sizeof(*info), 0777 | IPC_CREAT);
+	if (shm_id < 0) {
+		perror("shmget() failed");
+		return TEST_FAIL;
+	}
+
+	info = shmat(shm_id, NULL, 0);
+	if (info == (void *)-1) {
+		perror("shmat() failed");
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	ret = init_child_sync(&info->child_sync);
+	if (ret) {
+		/* Do not leak the segment when the sync setup fails */
+		shmdt(info);
+		shmctl(shm_id, IPC_RMID, NULL);
+		return ret;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork() failed");
+		ret = TEST_FAIL;
+	} else if (pid == 0)
+		ret = child(info);
+	else
+		ret = parent(info, pid);
+
+	/*
+	 * The sync object lives inside the shared mapping, so it has to be
+	 * destroyed before the mapping is detached, not after.
+	 */
+	if (pid)
+		destroy_child_sync(&info->child_sync);
+
+	shmdt(info);
+
+	if (pid)
+		shmctl(shm_id, IPC_RMID, NULL);
+
+	return ret;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_pkey, "ptrace_pkey");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c b/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c
new file mode 100644
index 000000000..3353210dc
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-syscall.c
@@ -0,0 +1,228 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A ptrace test for testing PTRACE_SYSEMU, PTRACE_SETREGS and
+ * PTRACE_GETREGS. This test creates a child process that executes syscalls
+ * and the parent process checks that it is being traced appropriately.
+ *
+ * This test is heavily based on tools/testing/selftests/x86/ptrace_syscall.c
+ * test, and it was adapted to run on Powerpc by
+ * Breno Leitao <leitao@debian.org>
+ */
+#define _GNU_SOURCE
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <sys/auxv.h>
+#include "utils.h"
+
+/* Bitness-agnostic defines for user_regs_struct fields. */
+/* The syscall number is read from gpr[0], the six arguments from gpr[3..8]. */
+#define user_syscall_nr gpr[0]
+#define user_arg0 gpr[3]
+#define user_arg1 gpr[4]
+#define user_arg2 gpr[5]
+#define user_arg3 gpr[6]
+#define user_arg4 gpr[7]
+#define user_arg5 gpr[8]
+#define user_ip nip
+
+/*
+ * ptrace request number used for PTRACE_SYSEMU, defined here by hand.
+ * NOTE(review): presumably because the libc headers may not provide it -
+ * confirm.
+ */
+#define PTRACE_SYSEMU 0x1d
+
+/* Count of failed checks; ptrace_syscall() returns it as the test result. */
+static int nerrs;
+
+/*
+ * Block until @chld changes state and insist that the change is a ptrace
+ * trap. Any other outcome (waitid() failure, an event for a different pid, or
+ * an event whose code is not CLD_TRAPPED) ends the test with exit status 1.
+ */
+static void wait_trap(pid_t chld)
+{
+ siginfo_t si;
+
+ if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0)
+ err(1, "waitid");
+ /* errx() appends the newline itself, so the messages carry none. */
+ if (si.si_pid != chld)
+ errx(1, "got unexpected pid in event");
+ if (si.si_code != CLD_TRAPPED)
+ errx(1, "got unexpected event type %d", si.si_code);
+}
+
+/*
+ * Compare the syscall number and the six argument registers in @regs with
+ * @nr and with the consecutive values @arg_base .. @arg_base + 5.
+ *
+ * On a match an "[OK]" line carrying @ok_msg is printed. Otherwise a "[FAIL]"
+ * line carrying @fail_msg and the observed values is printed and nerrs is
+ * incremented.
+ */
+static void check_syscall_regs(const struct pt_regs *regs, unsigned long nr,
+ unsigned long arg_base,
+ const char *fail_msg, const char *ok_msg)
+{
+ if (regs->user_syscall_nr != nr ||
+ regs->user_arg0 != arg_base || regs->user_arg1 != arg_base + 1 ||
+ regs->user_arg2 != arg_base + 2 || regs->user_arg3 != arg_base + 3 ||
+ regs->user_arg4 != arg_base + 4 || regs->user_arg5 != arg_base + 5) {
+ printf("[FAIL]\t%s (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n",
+ fail_msg,
+ (unsigned long)regs->user_syscall_nr,
+ (unsigned long)regs->user_arg0,
+ (unsigned long)regs->user_arg1,
+ (unsigned long)regs->user_arg2,
+ (unsigned long)regs->user_arg3,
+ (unsigned long)regs->user_arg4,
+ (unsigned long)regs->user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s\n", ok_msg);
+ }
+}
+
+/*
+ * Fork a child that stops itself and then issues gettid(10, ..., 15), and
+ * drive it from the parent with PTRACE_SYSEMU:
+ *
+ * 1) stop at the syscall and check the number and arguments;
+ * 2) rewind the instruction pointer by 4, stop again, check again;
+ * 3) replace the syscall with getpid(20, ..., 25), rewind, stop, check;
+ * 4) let the child run to completion and check that it exited with 0.
+ *
+ * Every failed check increments nerrs. ptrace/waitpid errors end the test
+ * through err()/errx().
+ */
+static void test_ptrace_syscall_restart(void)
+{
+ int status;
+ struct pt_regs regs;
+ pid_t chld;
+
+ printf("[RUN]\tptrace-induced syscall restart\n");
+
+ chld = fork();
+ if (chld < 0)
+ err(1, "fork");
+
+ /*
+ * Child process is running 4 syscalls after ptrace.
+ *
+ * 1) getpid()
+ * 2) gettid()
+ * 3) tgkill() -> Send SIGSTOP
+ * 4) gettid() -> Where the tests will happen essentially
+ */
+ if (chld == 0) {
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+ printf("\tChild will make one syscall\n");
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+ syscall(SYS_gettid, 10, 11, 12, 13, 14, 15);
+ _exit(0);
+ }
+ /* Parent process below */
+
+ /* Wait for SIGSTOP sent by tgkill above. */
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+ err(1, "waitpid");
+
+ printf("[RUN]\tSYSEMU\n");
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ /*
+ * Ptrace trapped prior to executing the syscall, thus r0 still holds
+ * the syscall number and r3 still holds the first argument instead of
+ * the sys_gettid() result.
+ */
+ check_syscall_regs(&regs, SYS_gettid, 10,
+ "Initial args are wrong",
+ "Initial nr and args are correct");
+
+ printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ /*
+ * Rewind to retry the same syscall again. This will basically test
+ * the rewind process together with PTRACE_SETREGS and PTRACE_GETREGS.
+ */
+ regs.user_ip -= 4;
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ check_syscall_regs(&regs, SYS_gettid, 10,
+ "Restart nr or args are wrong",
+ "Restarted nr and args are correct");
+
+ printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ /*
+ * Inject a new syscall (getpid) in the same place the previous
+ * syscall (gettid), rewind and re-execute.
+ */
+ regs.user_syscall_nr = SYS_getpid;
+ regs.user_arg0 = 20;
+ regs.user_arg1 = 21;
+ regs.user_arg2 = 22;
+ regs.user_arg3 = 23;
+ regs.user_arg4 = 24;
+ regs.user_arg5 = 25;
+ regs.user_ip -= 4;
+
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ /*
+ * Check that ptrace stopped at the new syscall that was injected, and
+ * guarantee that it has not executed, i.e. the argument registers
+ * still contain the arguments and not the syscall return value.
+ * The failure text names the replacement so it cannot be confused
+ * with the restart check above.
+ */
+ check_syscall_regs(&regs, SYS_getpid, 20,
+ "Replacement nr or args are wrong",
+ "Replacement nr and args are correct");
+
+ if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+ err(1, "PTRACE_CONT");
+
+ if (waitpid(chld, &status, 0) != chld)
+ err(1, "waitpid");
+
+ /* Guarantee that the process executed properly, returning 0 */
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild exited cleanly\n");
+ }
+}
+
+/*
+ * Test body handed to the harness: runs the syscall-restart scenario and
+ * returns the number of failed checks accumulated in nerrs (0 when every
+ * check passed).
+ */
+int ptrace_syscall(void)
+{
+ test_ptrace_syscall_restart();
+
+ return nerrs;
+}
+
+/*
+ * Program entry point: runs ptrace_syscall() through test_harness() under
+ * the name "ptrace_syscall" and returns the harness result.
+ */
+int main(void)
+{
+ return test_harness(ptrace_syscall, "ptrace_syscall");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tar.c
new file mode 100644
index 000000000..4436ca9d3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tar.c
@@ -0,0 +1,134 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for TAR, PPR, DSCR registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "ptrace-tar.h"
+
+/* Tracer and Tracee Shared Data */
+int shm_id;
+int *cptr;
+int *pptr;
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Writes TAR_1/PPR_1/DSCR_1 into the SPRs, raises slot 2 of the shared area
+ * and spins until slot 0 becomes non-zero. It then reads the three SPRs back,
+ * raises slot 1 and exits with 0 if the values equal TAR_2/PPR_2/DSCR_2, or
+ * with 1 otherwise.
+ */
+void tar(void)
+{
+ unsigned long reg[3];
+ int ret;
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+ printf("%-30s TAR: %u PPR: %lx DSCR: %u\n",
+ user_write, TAR_1, PPR_1, DSCR_1);
+
+ mtspr(SPRN_TAR, TAR_1);
+ mtspr(SPRN_PPR, PPR_1);
+ mtspr(SPRN_DSCR, DSCR_1);
+
+ /* Registers are in place: release the parent, which polls slot 2. */
+ cptr[2] = 1;
+
+ /* Wait on parent */
+ /* The empty asm with a "memory" clobber forces cptr[0] to be re-read. */
+ while (!cptr[0])
+ asm volatile("" : : : "memory");
+
+ reg[0] = mfspr(SPRN_TAR);
+ reg[1] = mfspr(SPRN_PPR);
+ reg[2] = mfspr(SPRN_DSCR);
+
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ user_read, reg[0], reg[1], reg[2]);
+
+ /* Unblock the parent now */
+ cptr[1] = 1;
+ shmdt((int *)cptr);
+
+ /* The parent wrote the *_2 values through ptrace while we were waiting. */
+ ret = validate_tar_registers(reg, TAR_2, PPR_2, DSCR_2);
+ if (ret)
+ exit(1);
+ exit(0);
+}
+
+/*
+ * Tracer step 1: between start_trace() and stop_trace(), fetch the child's
+ * TAR/PPR/DSCR with show_tar_registers(), print them and compare them with
+ * TAR_1/PPR_1/DSCR_1.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tar(pid_t child)
+{
+ unsigned long reg[3];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_tar_registers(child, reg));
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ ptrace_read_running, reg[0], reg[1], reg[2]);
+
+ FAIL_IF(validate_tar_registers(reg, TAR_1, PPR_1, DSCR_1));
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Tracer step 2: between start_trace() and stop_trace(), write
+ * TAR_2/PPR_2/DSCR_2 into the child with write_tar_registers() and print the
+ * values written.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tar_write(pid_t child)
+{
+ FAIL_IF(start_trace(child));
+ FAIL_IF(write_tar_registers(child, TAR_2, PPR_2, DSCR_2));
+ printf("%-30s TAR: %u PPR: %lx DSCR: %u\n",
+ ptrace_write_running, TAR_2, PPR_2, DSCR_2);
+
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for TAR/PPR/DSCR tracing.
+ *
+ * Creates a three-int shared area, forks tar() as the tracee and, in the
+ * parent, waits for slot 2, reads and then rewrites the child's registers,
+ * raises slot 0 to release the child, waits for slot 1 and finally reaps the
+ * child.
+ *
+ * Returns TEST_FAIL if fork() fails or the child exits with a non-zero
+ * status, the failing step's result if tracing fails, TEST_PASS otherwise.
+ */
+int ptrace_tar(void)
+{
+ pid_t pid;
+ int ret, status;
+
+ // TAR was added in v2.07
+ SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+ shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
+ pid = fork();
+ if (pid < 0) {
+ perror("fork() failed");
+ /* Nobody else will remove the segment. */
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ if (pid == 0)
+ tar();
+
+ if (pid) {
+ pptr = (int *)shmat(shm_id, NULL, 0);
+ pptr[0] = 0;
+ pptr[1] = 0;
+
+ while (!pptr[2])
+ asm volatile("" : : : "memory");
+ ret = trace_tar(pid);
+ if (!ret)
+ ret = trace_tar_write(pid);
+ if (ret) {
+ /*
+ * The child is still spinning on slot 0, which will now
+ * never be raised. Kill it and release the shared memory
+ * instead of leaking both.
+ */
+ kill(pid, SIGKILL);
+ shmdt((int *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return ret;
+ }
+
+ /* Unblock the child now */
+ pptr[0] = 1;
+
+ /* Wait on child */
+ while (!pptr[1])
+ asm volatile("" : : : "memory");
+
+ shmdt((int *)pptr);
+
+ ret = wait(&status);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (ret != pid) {
+ /*
+ * NOTE(review): the sibling TM tests report TEST_FAIL in
+ * this situation; kept as TEST_PASS here to preserve the
+ * existing result.
+ */
+ printf("Child's exit status not captured\n");
+ return TEST_PASS;
+ }
+
+ return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+ TEST_PASS;
+ }
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tar() through test_harness() under the
+ * name "ptrace_tar" and returns the harness result. Arguments are unused.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tar, "ptrace_tar");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tar.h b/tools/testing/selftests/powerpc/ptrace/ptrace-tar.h
new file mode 100644
index 000000000..d6a4c0aab
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tar.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#define TAR_1 10
+#define TAR_2 20
+#define TAR_3 30
+#define TAR_4 40
+#define TAR_5 50
+
+#define DSCR_1 100
+#define DSCR_2 200
+#define DSCR_3 300
+#define DSCR_4 400
+#define DSCR_5 500
+
+#define PPR_1 0x4000000000000 /* or 31,31,31*/
+#define PPR_2 0x8000000000000 /* or 1,1,1 */
+#define PPR_3 0xc000000000000 /* or 6,6,6 */
+#define PPR_4 0x10000000000000 /* or 2,2,2 */
+
+/*
+ * Labels the tests print (with a "%-30s" conversion) in front of each
+ * register dump, naming who accessed the registers and in which state.
+ */
+char *user_read = "[User Read (Running)]";
+char *user_write = "[User Write (Running)]";
+char *ptrace_read_running = "[Ptrace Read (Running)]";
+char *ptrace_write_running = "[Ptrace Write (Running)]";
+char *ptrace_read_ckpt = "[Ptrace Read (Checkpointed)]";
+char *ptrace_write_ckpt = "[Ptrace Write (Checkpointed)]";
+
+/*
+ * Check a register triple against expected values.
+ *
+ * @reg:  reg[0] is compared with @tar, reg[1] with @ppr, reg[2] with @dscr.
+ *
+ * Returns TEST_PASS when all three agree and TEST_FAIL as soon as any of
+ * them differs.
+ */
+int validate_tar_registers(unsigned long *reg, unsigned long tar,
+ unsigned long ppr, unsigned long dscr)
+{
+ if (reg[0] == tar && reg[1] == ppr && reg[2] == dscr)
+ return TEST_PASS;
+
+ return TEST_FAIL;
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
new file mode 100644
index 000000000..82f7bdc2e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-gpr.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for GPR/FPR registers in TM context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "ptrace-gpr.h"
+#include "tm.h"
+
+/* Tracer and Tracee Shared Data */
+int shm_id;
+unsigned long *cptr, *pptr;
+
+float a = FPR_1;
+float b = FPR_2;
+float c = FPR_3;
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Loads GPR_1 / FPR a outside a transaction, starts a transaction, loads
+ * GPR_2 / FPR b inside it, raises slot 1 of the shared area while suspended
+ * and then spins inside the resumed transaction. When the abort handler runs
+ * and slot 0 is still zero the whole sequence is retried. Once slot 0 is set,
+ * the registers are stored and compared with GPR_3 / c: exit status 0 on a
+ * match, 1 otherwise.
+ */
+void tm_gpr(void)
+{
+ unsigned long gpr_buf[18];
+ unsigned long result, texasr;
+ float fpr_buf[32];
+
+ printf("Starting the child\n");
+ cptr = (unsigned long *)shmat(shm_id, NULL, 0);
+
+trans:
+ /* Clear the "inside transaction" flag before every attempt. */
+ cptr[1] = 0;
+ asm __volatile__(
+ ASM_LOAD_GPR_IMMED(gpr_1)
+ ASM_LOAD_FPR_SINGLE_PRECISION(flt_1)
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+ ASM_LOAD_GPR_IMMED(gpr_2)
+ ASM_LOAD_FPR_SINGLE_PRECISION(flt_2)
+ /* Store 1 to cptr[1] while suspended, then spin in the transaction. */
+ "tsuspend.;"
+ "li 7, 1;"
+ "stw 7, 0(%[cptr1]);"
+ "tresume.;"
+ "b .;"
+
+ /* Not reachable by fall-through: "b ." above branches to itself. */
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ /* Transaction abort handler */
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2),
+ [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a),
+ [flt_2] "b" (&b), [cptr1] "b" (&cptr[1])
+ : "memory", "r7", "r8", "r9", "r10",
+ "r11", "r12", "r13", "r14", "r15", "r16",
+ "r17", "r18", "r19", "r20", "r21", "r22",
+ "r23", "r24", "r25", "r26", "r27", "r28",
+ "r29", "r30", "r31"
+ );
+
+ /* result is 1 only when the abort handler ran. */
+ if (result) {
+ /* Slot 0 is raised by the tracer; until then, try again. */
+ if (!cptr[0])
+ goto trans;
+
+ shmdt((void *)cptr);
+ store_gpr(gpr_buf);
+ store_fpr_single_precision(fpr_buf);
+
+ /* The tracer wrote GPR_3 / FPR_3_REP into the checkpointed state. */
+ if (validate_gpr(gpr_buf, GPR_3))
+ exit(1);
+
+ if (validate_fpr_float(fpr_buf, c))
+ exit(1);
+
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), check that the
+ * child's running GPRs/FPRs hold the *_2 values and its checkpointed
+ * GPRs/FPRs hold the *_1 values, then overwrite the checkpointed state with
+ * GPR_3 / FPR_3_REP and raise slot 0 of the shared area.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_gpr(pid_t child)
+{
+ unsigned long gpr[18];
+ unsigned long fpr[32];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_gpr(child, gpr));
+ FAIL_IF(validate_gpr(gpr, GPR_2));
+ FAIL_IF(show_fpr(child, fpr));
+ FAIL_IF(validate_fpr(fpr, FPR_2_REP));
+ FAIL_IF(show_ckpt_fpr(child, fpr));
+ FAIL_IF(validate_fpr(fpr, FPR_1_REP));
+ FAIL_IF(show_ckpt_gpr(child, gpr));
+ FAIL_IF(validate_gpr(gpr, GPR_1));
+ FAIL_IF(write_ckpt_gpr(child, GPR_3));
+ FAIL_IF(write_ckpt_fpr(child, FPR_3_REP));
+
+ /* Tell the child that the new checkpointed values are in place. */
+ pptr[0] = 1;
+ FAIL_IF(stop_trace(child));
+
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for GPR/FPR tracing inside a transaction.
+ *
+ * Creates a shared area of two unsigned long slots, forks tm_gpr() as the
+ * tracee and, in the parent, waits for slot 1, runs trace_tm_gpr() and reaps
+ * the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_gpr(void)
+{
+ pid_t pid;
+ int ret, status;
+
+ SKIP_IF(!have_htm());
+ /* cptr/pptr are unsigned long pointers, so size the slots accordingly. */
+ shm_id = shmget(IPC_PRIVATE, sizeof(unsigned long) * 2, 0777|IPC_CREAT);
+ pid = fork();
+ if (pid < 0) {
+ perror("fork() failed");
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+ if (pid == 0)
+ tm_gpr();
+
+ if (pid) {
+ pptr = (unsigned long *)shmat(shm_id, NULL, 0);
+
+ while (!pptr[1])
+ asm volatile("" : : : "memory");
+ ret = trace_tm_gpr(pid);
+ if (ret) {
+ kill(pid, SIGTERM);
+ /* Release the shared memory on the failure path as well. */
+ shmdt((void *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ shmdt((void *)pptr);
+
+ ret = wait(&status);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (ret != pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+ TEST_PASS;
+ }
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_gpr() through test_harness() under the
+ * name "ptrace_tm_gpr" and returns the harness result. Arguments are unused.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_gpr, "ptrace_tm_gpr");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
new file mode 100644
index 000000000..ad65be6e8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-gpr.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for GPR/FPR registers in TM Suspend context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "ptrace-gpr.h"
+#include "tm.h"
+
+/* Tracer and Tracee Shared Data */
+int shm_id;
+int *cptr, *pptr;
+
+float a = FPR_1;
+float b = FPR_2;
+float c = FPR_3;
+float d = FPR_4;
+
+/*
+ * Reached from the inline asm in tm_spd_gpr() through "bl wait_parent":
+ * raise slot 2 of the shared area, then spin until slot 1 becomes non-zero.
+ * The "used" attribute keeps the function although no C code calls it; the
+ * empty asm with a "memory" clobber forces cptr[1] to be re-read each time.
+ */
+__attribute__((used)) void wait_parent(void)
+{
+ cptr[2] = 1;
+ while (!cptr[1])
+ asm volatile("" : : : "memory");
+}
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Loads GPR_1 / FPR a outside a transaction, starts a transaction and loads
+ * GPR_2, suspends it, loads GPR_4 / FPR d and calls wait_parent() while
+ * suspended. If the asm ends through the abort handler and slot 0 of the
+ * shared area is still zero, the sequence is retried. Once slot 0 is set the
+ * registers are stored and compared with GPR_3 / c: exit status 0 on a
+ * match, 1 otherwise. Completing the transaction without an abort also exits
+ * with status 1.
+ */
+void tm_spd_gpr(void)
+{
+ unsigned long gpr_buf[18];
+ unsigned long result, texasr;
+ float fpr_buf[32];
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+
+trans:
+ /* Clear the "suspended and waiting" flag before every attempt. */
+ cptr[2] = 0;
+ asm __volatile__(
+ ASM_LOAD_GPR_IMMED(gpr_1)
+ ASM_LOAD_FPR_SINGLE_PRECISION(flt_1)
+
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+
+ ASM_LOAD_GPR_IMMED(gpr_2)
+ "tsuspend.;"
+ ASM_LOAD_GPR_IMMED(gpr_4)
+ ASM_LOAD_FPR_SINGLE_PRECISION(flt_4)
+
+ /* Block in wait_parent() until the tracer raises slot 1. */
+ "bl wait_parent;"
+ "tresume.;"
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ /* Transaction abort handler */
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [gpr_1]"i"(GPR_1), [gpr_2]"i"(GPR_2), [gpr_4]"i"(GPR_4),
+ [sprn_texasr] "i" (SPRN_TEXASR), [flt_1] "b" (&a),
+ [flt_4] "b" (&d)
+ : "memory", "r5", "r6", "r7",
+ "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
+ "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
+ "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"
+ );
+
+ /* result is 1 only when the abort handler ran. */
+ if (result) {
+ /* Slot 0 is raised by the tracer; until then, try again. */
+ if (!cptr[0])
+ goto trans;
+
+ shmdt((void *)cptr);
+ store_gpr(gpr_buf);
+ store_fpr_single_precision(fpr_buf);
+
+ /* The tracer wrote GPR_3 / FPR_3_REP into the checkpointed state. */
+ if (validate_gpr(gpr_buf, GPR_3))
+ exit(1);
+
+ if (validate_fpr_float(fpr_buf, c))
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), check that the
+ * child's running GPRs/FPRs hold the *_4 values and its checkpointed
+ * GPRs/FPRs hold the *_1 values, then overwrite the checkpointed state with
+ * GPR_3 / FPR_3_REP and raise slots 0 and 1 of the shared area.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_spd_gpr(pid_t child)
+{
+ unsigned long gpr[18];
+ unsigned long fpr[32];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_gpr(child, gpr));
+ FAIL_IF(validate_gpr(gpr, GPR_4));
+ FAIL_IF(show_fpr(child, fpr));
+ FAIL_IF(validate_fpr(fpr, FPR_4_REP));
+ FAIL_IF(show_ckpt_fpr(child, fpr));
+ FAIL_IF(validate_fpr(fpr, FPR_1_REP));
+ FAIL_IF(show_ckpt_gpr(child, gpr));
+ FAIL_IF(validate_gpr(gpr, GPR_1));
+ FAIL_IF(write_ckpt_gpr(child, GPR_3));
+ FAIL_IF(write_ckpt_fpr(child, FPR_3_REP));
+
+ /* Slot 0: new values are in place. Slot 1: releases wait_parent(). */
+ pptr[0] = 1;
+ pptr[1] = 1;
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for GPR/FPR tracing in the TM suspend state.
+ *
+ * Sets up a three-int shared area and forks. The child runs tm_spd_gpr();
+ * the parent clears slots 0 and 1, waits for slot 2, runs
+ * trace_tm_spd_gpr() and reaps the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_spd_gpr(void)
+{
+ pid_t child_pid;
+ int rc, wstatus;
+
+ SKIP_IF(!have_htm());
+ shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
+ child_pid = fork();
+ if (child_pid < 0) {
+ perror("fork() failed");
+ return TEST_FAIL;
+ }
+
+ /* Tracee side. */
+ if (child_pid == 0) {
+ tm_spd_gpr();
+ return TEST_PASS;
+ }
+
+ /* Tracer side from here on. */
+ pptr = (int *)shmat(shm_id, NULL, 0);
+ pptr[0] = 0;
+ pptr[1] = 0;
+
+ while (pptr[2] == 0)
+ asm volatile("" : : : "memory");
+
+ rc = trace_tm_spd_gpr(child_pid);
+ if (rc != 0) {
+ kill(child_pid, SIGTERM);
+ shmdt((void *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ shmdt((void *)pptr);
+
+ rc = wait(&wstatus);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (rc != child_pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus))
+ return TEST_FAIL;
+
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_spd_gpr() through test_harness() under
+ * the name "ptrace_tm_spd_gpr" and returns the harness result.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_spd_gpr, "ptrace_tm_spd_gpr");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
new file mode 100644
index 000000000..2ecfa1158
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-tar.c
@@ -0,0 +1,170 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for TAR, PPR, DSCR registers in the TM Suspend context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "tm.h"
+#include "ptrace-tar.h"
+
+int shm_id;
+int *cptr, *pptr;
+
+/*
+ * Reached from the inline asm in tm_spd_tar() through "bl wait_parent":
+ * raise slot 2 of the shared area, then spin until slot 1 becomes non-zero.
+ * The "used" attribute keeps the function although no C code calls it.
+ */
+__attribute__((used)) void wait_parent(void)
+{
+ cptr[2] = 1;
+ while (!cptr[1])
+ asm volatile("" : : : "memory");
+}
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Sets TAR_1/DSCR_1/PPR_1 outside a transaction, TAR_2/DSCR_2/PPR_2 inside
+ * it and TAR_3/DSCR_3/PPR_3 while suspended, then blocks in wait_parent().
+ * If the asm ends through the abort handler and slot 0 of the shared area is
+ * still zero, the sequence is retried. Once slot 0 is set the SPRs are read
+ * back and compared with TAR_4/PPR_4/DSCR_4: exit status 0 on a match, 1
+ * otherwise. Completing the transaction without an abort also exits with 1.
+ */
+void tm_spd_tar(void)
+{
+ unsigned long result, texasr;
+ unsigned long regs[3];
+ int ret;
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+
+trans:
+ /* Clear the "suspended and waiting" flag before every attempt. */
+ cptr[2] = 0;
+ asm __volatile__(
+ "li 4, %[tar_1];"
+ "mtspr %[sprn_tar], 4;" /* TAR_1 */
+ "li 4, %[dscr_1];"
+ "mtspr %[sprn_dscr], 4;" /* DSCR_1 */
+ "or 31,31,31;" /* PPR_1*/
+
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+
+ "li 4, %[tar_2];"
+ "mtspr %[sprn_tar], 4;" /* TAR_2 */
+ "li 4, %[dscr_2];"
+ "mtspr %[sprn_dscr], 4;" /* DSCR_2 */
+ "or 1,1,1;" /* PPR_2 */
+
+ "tsuspend.;"
+ "li 4, %[tar_3];"
+ "mtspr %[sprn_tar], 4;" /* TAR_3 */
+ "li 4, %[dscr_3];"
+ "mtspr %[sprn_dscr], 4;" /* DSCR_3 */
+ "or 6,6,6;" /* PPR_3 */
+ "bl wait_parent;"
+ "tresume.;"
+
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ /* Transaction abort handler */
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [sprn_dscr]"i"(SPRN_DSCR),
+ [sprn_tar]"i"(SPRN_TAR), [sprn_ppr]"i"(SPRN_PPR),
+ [sprn_texasr]"i"(SPRN_TEXASR), [tar_1]"i"(TAR_1),
+ [dscr_1]"i"(DSCR_1), [tar_2]"i"(TAR_2), [dscr_2]"i"(DSCR_2),
+ [tar_3]"i"(TAR_3), [dscr_3]"i"(DSCR_3)
+ : "memory", "r0", "r3", "r4", "r5", "r6", "lr"
+ );
+
+ /* TM failed, analyse */
+ if (result) {
+ /* Slot 0 is raised by the tracer; until then, try again. */
+ if (!cptr[0])
+ goto trans;
+
+ regs[0] = mfspr(SPRN_TAR);
+ regs[1] = mfspr(SPRN_PPR);
+ regs[2] = mfspr(SPRN_DSCR);
+
+ /* shmdt() takes the attached address, not the pointer's address. */
+ shmdt((void *)cptr);
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ user_read, regs[0], regs[1], regs[2]);
+
+ /* The tracer wrote the *_4 values into the checkpointed state. */
+ ret = validate_tar_registers(regs, TAR_4, PPR_4, DSCR_4);
+ if (ret)
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), check that the
+ * child's running TAR/PPR/DSCR hold the *_3 values and its checkpointed
+ * state holds the *_1 values, then write the *_4 values into the
+ * checkpointed state and raise slots 0 and 1 of the shared area.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_spd_tar(pid_t child)
+{
+ unsigned long regs[3];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_tar_registers(child, regs));
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ ptrace_read_running, regs[0], regs[1], regs[2]);
+
+ FAIL_IF(validate_tar_registers(regs, TAR_3, PPR_3, DSCR_3));
+ FAIL_IF(show_tm_checkpointed_state(child, regs));
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ ptrace_read_ckpt, regs[0], regs[1], regs[2]);
+
+ FAIL_IF(validate_tar_registers(regs, TAR_1, PPR_1, DSCR_1));
+ FAIL_IF(write_ckpt_tar_registers(child, TAR_4, PPR_4, DSCR_4));
+ printf("%-30s TAR: %u PPR: %lx DSCR: %u\n",
+ ptrace_write_ckpt, TAR_4, PPR_4, DSCR_4);
+
+ /* Slot 0: new values are in place. Slot 1: releases wait_parent(). */
+ pptr[0] = 1;
+ pptr[1] = 1;
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for TAR/PPR/DSCR tracing in the TM suspend state.
+ *
+ * Sets up a three-int shared area and forks. The child runs tm_spd_tar();
+ * the parent clears slots 0 and 1, waits for slot 2, runs
+ * trace_tm_spd_tar() and reaps the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_spd_tar(void)
+{
+ pid_t pid;
+ int ret, status;
+
+ SKIP_IF(!have_htm());
+ shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
+ pid = fork();
+ if (pid < 0) {
+ /*
+ * Without this check pid == -1 would take the parent path and
+ * spin forever waiting for a child that does not exist.
+ */
+ perror("fork() failed");
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ if (pid == 0)
+ tm_spd_tar();
+
+ pptr = (int *)shmat(shm_id, NULL, 0);
+ pptr[0] = 0;
+ pptr[1] = 0;
+
+ if (pid) {
+ while (!pptr[2])
+ asm volatile("" : : : "memory");
+ ret = trace_tm_spd_tar(pid);
+ if (ret) {
+ kill(pid, SIGTERM);
+ /* shmdt() takes the attached address itself. */
+ shmdt((void *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ shmdt((void *)pptr);
+
+ ret = wait(&status);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (ret != pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+ TEST_PASS;
+ }
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_spd_tar() through test_harness() under
+ * the name "ptrace_tm_spd_tar" and returns the harness result.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_spd_tar, "ptrace_tm_spd_tar");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c
new file mode 100644
index 000000000..6f7fb51f0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spd-vsx.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for VMX/VSX registers in the TM Suspend context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "tm.h"
+#include "ptrace-vsx.h"
+
+int shm_id;
+int *cptr, *pptr;
+
+unsigned long fp_load[VEC_MAX];
+unsigned long fp_load_new[VEC_MAX];
+unsigned long fp_store[VEC_MAX];
+unsigned long fp_load_ckpt[VEC_MAX];
+unsigned long fp_load_ckpt_new[VEC_MAX];
+
+/*
+ * Asm-callable wrapper ("bl load_vsx" in tm_spd_vsx()): passes fp_load and 0
+ * to loadvsx(). NOTE(review): presumably this loads the vector registers
+ * from fp_load - confirm against loadvsx().
+ */
+__attribute__((used)) void load_vsx(void)
+{
+ loadvsx(fp_load, 0);
+}
+
+/*
+ * Asm-callable wrapper ("bl load_vsx_new" in tm_spd_vsx()): passes
+ * fp_load_new and 0 to loadvsx().
+ */
+__attribute__((used)) void load_vsx_new(void)
+{
+ loadvsx(fp_load_new, 0);
+}
+
+/*
+ * Asm-callable wrapper ("bl load_vsx_ckpt" in tm_spd_vsx(), issued before
+ * tbegin.): passes fp_load_ckpt and 0 to loadvsx().
+ */
+__attribute__((used)) void load_vsx_ckpt(void)
+{
+ loadvsx(fp_load_ckpt, 0);
+}
+
+/*
+ * Reached from the inline asm in tm_spd_vsx() through "bl wait_parent":
+ * raise slot 2 of the shared area, then spin until slot 1 becomes non-zero.
+ * The "used" attribute keeps the function although no C code calls it.
+ */
+__attribute__((used)) void wait_parent(void)
+{
+ cptr[2] = 1;
+ while (!cptr[1])
+ asm volatile("" : : : "memory");
+}
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Calls load_vsx_ckpt() before the transaction, load_vsx_new() inside it and
+ * load_vsx() while suspended, then blocks in wait_parent(). If the asm ends
+ * through the abort handler and slot 0 of the shared area is still zero, the
+ * sequence is retried. Once slot 0 is set, storevsx() captures the registers
+ * into fp_store and they are compared with fp_load_ckpt_new: exit status 0
+ * when compare_vsx_vmx() returns 0, 1 otherwise. Completing the transaction
+ * without an abort also exits with 1.
+ */
+void tm_spd_vsx(void)
+{
+ unsigned long result, texasr;
+ int ret;
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+
+trans:
+ /* Clear the "suspended and waiting" flag before every attempt. */
+ cptr[2] = 0;
+ asm __volatile__(
+ "bl load_vsx_ckpt;"
+
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+
+ "bl load_vsx_new;"
+ "tsuspend.;"
+ "bl load_vsx;"
+ "bl wait_parent;"
+ "tresume.;"
+
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ /* Transaction abort handler */
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [sprn_texasr] "i" (SPRN_TEXASR)
+ : "memory", "r0", "r3", "r4",
+ "r7", "r8", "r9", "r10", "r11", "lr"
+ );
+
+ /* result is 1 only when the abort handler ran. */
+ if (result) {
+ /* Slot 0 is raised by the tracer; until then, try again. */
+ if (!cptr[0])
+ goto trans;
+ shmdt((void *)cptr);
+
+ /* The tracer wrote fp_load_ckpt_new into the checkpointed state. */
+ storevsx(fp_store, 0);
+ ret = compare_vsx_vmx(fp_store, fp_load_ckpt_new);
+ if (ret)
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), check that the
+ * child's running VSX/VMX state matches fp_load and its checkpointed state
+ * matches fp_load_ckpt, then build new register images from
+ * fp_load_ckpt_new, write them into the checkpointed state and raise slots
+ * 0 and 1 of the shared area.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_spd_vsx(pid_t child)
+{
+ unsigned long vsx[VSX_MAX];
+ unsigned long vmx[VMX_MAX + 2][2];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_vsx(child, vsx));
+ FAIL_IF(validate_vsx(vsx, fp_load));
+ FAIL_IF(show_vmx(child, vmx));
+ FAIL_IF(validate_vmx(vmx, fp_load));
+ FAIL_IF(show_vsx_ckpt(child, vsx));
+ FAIL_IF(validate_vsx(vsx, fp_load_ckpt));
+ FAIL_IF(show_vmx_ckpt(child, vmx));
+ FAIL_IF(validate_vmx(vmx, fp_load_ckpt));
+
+ /* Start from zeroed buffers before filling in the new images. */
+ memset(vsx, 0, sizeof(vsx));
+ memset(vmx, 0, sizeof(vmx));
+
+ load_vsx_vmx(fp_load_ckpt_new, vsx, vmx);
+
+ FAIL_IF(write_vsx_ckpt(child, vsx));
+ FAIL_IF(write_vmx_ckpt(child, vmx));
+
+ /* Slot 0: new values are in place. Slot 1: releases wait_parent(). */
+ pptr[0] = 1;
+ pptr[1] = 1;
+ FAIL_IF(stop_trace(child));
+
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for VSX/VMX tracing in the TM suspend state.
+ *
+ * Fills the four pattern buffers with pseudo-random values, sets up a
+ * three-int shared area and forks. The child runs tm_spd_vsx(); the parent
+ * waits for slot 2, runs trace_tm_spd_vsx() and reaps the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_spd_vsx(void)
+{
+ pid_t pid;
+ int ret, status, i;
+
+ SKIP_IF(!have_htm());
+ shm_id = shmget(IPC_PRIVATE, sizeof(int) * 3, 0777|IPC_CREAT);
+
+ /* The buffers are declared with VEC_MAX entries; use the same bound. */
+ for (i = 0; i < VEC_MAX; i++) {
+ fp_load[i] = 1 + rand();
+ fp_load_new[i] = 1 + 2 * rand();
+ fp_load_ckpt[i] = 1 + 3 * rand();
+ fp_load_ckpt_new[i] = 1 + 4 * rand();
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ perror("fork() failed");
+ /* Nobody else will remove the segment. */
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ if (pid == 0)
+ tm_spd_vsx();
+
+ if (pid) {
+ pptr = (int *)shmat(shm_id, NULL, 0);
+ while (!pptr[2])
+ asm volatile("" : : : "memory");
+
+ ret = trace_tm_spd_vsx(pid);
+ if (ret) {
+ kill(pid, SIGKILL);
+ shmdt((void *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ shmdt((void *)pptr);
+ ret = wait(&status);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (ret != pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+ TEST_PASS;
+ }
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_spd_vsx() through test_harness() under
+ * the name "ptrace_tm_spd_vsx" and returns the harness result.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_spd_vsx, "ptrace_tm_spd_vsx");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c
new file mode 100644
index 000000000..068bfed2e
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-spr.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test TM SPR registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "tm.h"
+
+/* Tracee and tracer shared data */
+struct shared {
+ int flag;
+ struct tm_spr_regs regs;
+};
+unsigned long tfhar;
+
+int shm_id;
+struct shared *cptr, *pptr;
+
+int shm_id1;
+int *cptr1, *pptr1;
+
+#define TM_KVM_SCHED 0xe0000001ac000001
+/*
+ * Check the TM SPR values captured by the tracer.
+ *
+ * Fails when regs->tm_tfhar differs from the tfhar value computed in
+ * tm_spr(), or when TEXASR equals TM_KVM_SCHED while TFIAR is non-zero.
+ * Returns TEST_PASS otherwise.
+ */
+int validate_tm_spr(struct tm_spr_regs *regs)
+{
+ FAIL_IF(regs->tm_tfhar != tfhar);
+ FAIL_IF((regs->tm_texasr == TM_KVM_SCHED) && (regs->tm_tfiar != 0));
+
+ return TEST_PASS;
+}
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Records the address of label 4 (taken from LR after "bl 4f"), starts a
+ * transaction, raises the flag in the second shared area while suspended and
+ * then spins inside the resumed transaction. If the asm ends through the
+ * abort handler and the tracer has not yet set cptr->flag, the sequence is
+ * retried. Once the flag is set, the SPR values the tracer stored in
+ * cptr->regs are validated: exit status 0 on success, 1 otherwise.
+ */
+void tm_spr(void)
+{
+ unsigned long result, texasr;
+ int ret;
+
+ cptr = (struct shared *)shmat(shm_id, NULL, 0);
+ cptr1 = (int *)shmat(shm_id1, NULL, 0);
+
+trans:
+ /* Clear the "inside transaction" flag before every attempt. */
+ cptr1[0] = 0;
+ asm __volatile__(
+ "1: ;"
+ /* TM failover handler should follow "tbegin.;" */
+ "mflr 31;"
+ "bl 4f;" /* $ = TFHAR - 12 */
+ "4: ;"
+ "mflr %[tfhar];"
+ "mtlr 31;"
+
+ "tbegin.;"
+ "beq 2f;"
+
+ "tsuspend.;"
+ "li 8, 1;"
+ "sth 8, 0(%[cptr1]);"
+ "tresume.;"
+ "b .;"
+
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ "2: ;"
+
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+ /*
+ * cptr1 is read by the "sth" above, so it must be a read-write
+ * ("+b") operand: a write-only ("=b") operand gives no guarantee
+ * that the register holds the pointer on entry. Keeping it among
+ * the outputs also keeps it in a register distinct from tfhar,
+ * which is written before cptr1 is used.
+ */
+ : [tfhar] "=r" (tfhar), [res] "=r" (result),
+ [texasr] "=r" (texasr), [cptr1] "+b" (cptr1)
+ : [sprn_texasr] "i" (SPRN_TEXASR)
+ : "memory", "r0", "r8", "r31"
+ );
+
+ /*
+ * tfhar currently holds the address of label 4. Three 32-bit
+ * instructions follow it up to and including tbegin. (mflr, mtlr,
+ * tbegin.), so the instruction after tbegin. is 12 bytes further on.
+ */
+ tfhar += 12;
+
+ /* result is 1 only when the abort handler ran. */
+ if (result) {
+ if (!cptr->flag)
+ goto trans;
+
+ ret = validate_tm_spr((struct tm_spr_regs *)&cptr->regs);
+ shmdt((void *)cptr);
+ shmdt((void *)cptr1);
+ if (ret)
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ shmdt((void *)cptr1);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), fetch the child's TM
+ * SPRs with show_tm_spr() straight into the shared area, print them and set
+ * the shared flag so the child goes on to validate them.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_spr(pid_t child)
+{
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_tm_spr(child, (struct tm_spr_regs *)&pptr->regs));
+
+ printf("TFHAR: %lx TEXASR: %lx TFIAR: %lx\n", pptr->regs.tm_tfhar,
+ pptr->regs.tm_texasr, pptr->regs.tm_tfiar);
+
+ pptr->flag = 1;
+ FAIL_IF(stop_trace(child));
+
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for TM SPR tracing.
+ *
+ * Creates two shared areas (a struct shared and a single int) and forks. The
+ * child runs tm_spr(); the parent waits for the int to become non-zero, runs
+ * trace_tm_spr() and reaps the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_spr(void)
+{
+ pid_t child_pid;
+ int rc, wstatus;
+
+ SKIP_IF(!have_htm());
+ shm_id = shmget(IPC_PRIVATE, sizeof(struct shared), 0777|IPC_CREAT);
+ shm_id1 = shmget(IPC_PRIVATE, sizeof(int), 0777|IPC_CREAT);
+ child_pid = fork();
+ if (child_pid < 0) {
+ perror("fork() failed");
+ return TEST_FAIL;
+ }
+
+ /* Tracee side. */
+ if (child_pid == 0) {
+ tm_spr();
+ return TEST_PASS;
+ }
+
+ /* Tracer side from here on. */
+ pptr = (struct shared *)shmat(shm_id, NULL, 0);
+ pptr1 = (int *)shmat(shm_id1, NULL, 0);
+
+ while (pptr1[0] == 0)
+ asm volatile("" : : : "memory");
+
+ rc = trace_tm_spr(child_pid);
+ if (rc != 0) {
+ kill(child_pid, SIGKILL);
+ shmdt((void *)pptr);
+ shmdt((void *)pptr1);
+ shmctl(shm_id, IPC_RMID, NULL);
+ shmctl(shm_id1, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ shmdt((void *)pptr);
+ shmdt((void *)pptr1);
+ rc = wait(&wstatus);
+ shmctl(shm_id, IPC_RMID, NULL);
+ shmctl(shm_id1, IPC_RMID, NULL);
+ if (rc != child_pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus))
+ return TEST_FAIL;
+
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_spr() through test_harness() under the
+ * name "ptrace_tm_spr" and returns the harness result. Arguments are unused.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_spr, "ptrace_tm_spr");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
new file mode 100644
index 000000000..46ef378a1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-tar.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for TAR, PPR, DSCR registers in the TM context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "tm.h"
+#include "ptrace-tar.h"
+
+int shm_id;
+unsigned long *cptr, *pptr;
+
+
+/*
+ * Tracee body; runs in the forked child and never returns.
+ *
+ * Sets TAR_1/DSCR_1/PPR_1 outside a transaction and TAR_2/DSCR_2/PPR_2
+ * inside it, raises slot 1 of the shared area while suspended and then spins
+ * inside the resumed transaction. If the asm ends through the abort handler
+ * and slot 0 is still zero, the sequence is retried. Once slot 0 is set the
+ * SPRs are read back and compared with TAR_4/PPR_4/DSCR_4: exit status 0 on
+ * a match, 1 otherwise.
+ */
+void tm_tar(void)
+{
+ unsigned long result, texasr;
+ unsigned long regs[3];
+ int ret;
+
+ cptr = (unsigned long *)shmat(shm_id, NULL, 0);
+
+trans:
+ /* Clear the "inside transaction" flag before every attempt. */
+ cptr[1] = 0;
+ asm __volatile__(
+ "li 4, %[tar_1];"
+ "mtspr %[sprn_tar], 4;" /* TAR_1 */
+ "li 4, %[dscr_1];"
+ "mtspr %[sprn_dscr], 4;" /* DSCR_1 */
+ "or 31,31,31;" /* PPR_1*/
+
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+
+ "li 4, %[tar_2];"
+ "mtspr %[sprn_tar], 4;" /* TAR_2 */
+ "li 4, %[dscr_2];"
+ "mtspr %[sprn_dscr], 4;" /* DSCR_2 */
+ "or 1,1,1;" /* PPR_2 */
+ "tsuspend.;"
+ "li 0, 1;"
+ "stw 0, 0(%[cptr1]);"
+ "tresume.;"
+ "b .;"
+
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ /* Transaction abort handler */
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [sprn_dscr]"i"(SPRN_DSCR), [sprn_tar]"i"(SPRN_TAR),
+ [sprn_ppr]"i"(SPRN_PPR), [sprn_texasr]"i"(SPRN_TEXASR),
+ [tar_1]"i"(TAR_1), [dscr_1]"i"(DSCR_1), [tar_2]"i"(TAR_2),
+ [dscr_2]"i"(DSCR_2), [cptr1] "b" (&cptr[1])
+ : "memory", "r0", "r3", "r4", "r5", "r6"
+ );
+
+ /* TM failed, analyse */
+ if (result) {
+ /* Slot 0 is raised by the tracer; until then, try again. */
+ if (!cptr[0])
+ goto trans;
+
+ regs[0] = mfspr(SPRN_TAR);
+ regs[1] = mfspr(SPRN_PPR);
+ regs[2] = mfspr(SPRN_DSCR);
+
+ /* shmdt() takes the attached address, not the pointer's address. */
+ shmdt((void *)cptr);
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ user_read, regs[0], regs[1], regs[2]);
+
+ /* The tracer wrote the *_4 values into the checkpointed state. */
+ ret = validate_tar_registers(regs, TAR_4, PPR_4, DSCR_4);
+ if (ret)
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer side: between start_trace() and stop_trace(), check that the
+ * child's running TAR/PPR/DSCR hold the *_2 values and its checkpointed
+ * state holds the *_1 values, then write the *_4 values into the
+ * checkpointed state and raise slot 0 of the shared area.
+ *
+ * Returns TEST_PASS when the end of the function is reached. Each step is
+ * wrapped in FAIL_IF(), which is expected to bail out on a non-zero result.
+ */
+int trace_tm_tar(pid_t child)
+{
+ unsigned long regs[3];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_tar_registers(child, regs));
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ ptrace_read_running, regs[0], regs[1], regs[2]);
+
+ FAIL_IF(validate_tar_registers(regs, TAR_2, PPR_2, DSCR_2));
+ FAIL_IF(show_tm_checkpointed_state(child, regs));
+ printf("%-30s TAR: %lu PPR: %lx DSCR: %lu\n",
+ ptrace_read_ckpt, regs[0], regs[1], regs[2]);
+
+ FAIL_IF(validate_tar_registers(regs, TAR_1, PPR_1, DSCR_1));
+ FAIL_IF(write_ckpt_tar_registers(child, TAR_4, PPR_4, DSCR_4));
+ printf("%-30s TAR: %u PPR: %lx DSCR: %u\n",
+ ptrace_write_ckpt, TAR_4, PPR_4, DSCR_4);
+
+ /* Tell the child that the new checkpointed values are in place. */
+ pptr[0] = 1;
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Test driver for TAR/PPR/DSCR tracing inside a transaction.
+ *
+ * Sets up a shared area of two unsigned long slots and forks. The child runs
+ * tm_tar(); the parent clears slot 0, waits for slot 1, runs trace_tm_tar()
+ * and reaps the child.
+ *
+ * Returns TEST_FAIL if fork() or tracing fails, if the child cannot be
+ * reaped, or if it exits with a non-zero status; TEST_PASS otherwise.
+ */
+int ptrace_tm_tar(void)
+{
+ pid_t pid;
+ int ret, status;
+
+ SKIP_IF(!have_htm());
+ /* cptr/pptr are unsigned long pointers, so size the slots accordingly. */
+ shm_id = shmget(IPC_PRIVATE, sizeof(unsigned long) * 2, 0777|IPC_CREAT);
+ pid = fork();
+ if (pid < 0) {
+ /*
+ * Without this check pid == -1 would take the parent path and
+ * spin forever waiting for a child that does not exist.
+ */
+ perror("fork() failed");
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+
+ if (pid == 0)
+ tm_tar();
+
+ pptr = (unsigned long *)shmat(shm_id, NULL, 0);
+ pptr[0] = 0;
+
+ if (pid) {
+ while (!pptr[1])
+ asm volatile("" : : : "memory");
+ ret = trace_tm_tar(pid);
+ if (ret) {
+ kill(pid, SIGTERM);
+ /* shmdt() takes the attached address itself. */
+ shmdt((void *)pptr);
+ shmctl(shm_id, IPC_RMID, NULL);
+ return TEST_FAIL;
+ }
+ shmdt((void *)pptr);
+
+ ret = wait(&status);
+ shmctl(shm_id, IPC_RMID, NULL);
+ if (ret != pid) {
+ printf("Child's exit status not captured\n");
+ return TEST_FAIL;
+ }
+
+ return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+ TEST_PASS;
+ }
+ return TEST_PASS;
+}
+
+/*
+ * Program entry point: runs ptrace_tm_tar() through test_harness() under the
+ * name "ptrace_tm_tar" and returns the harness result. Arguments are unused.
+ */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_tar, "ptrace_tm_tar");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c
new file mode 100644
index 000000000..70ca01234
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-tm-vsx.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for VMX/VSX registers in the TM context
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "tm.h"
+#include "ptrace-vsx.h"
+
+int shm_id;
+unsigned long *cptr, *pptr;
+
+unsigned long fp_load[VEC_MAX];
+unsigned long fp_store[VEC_MAX];
+unsigned long fp_load_ckpt[VEC_MAX];
+unsigned long fp_load_ckpt_new[VEC_MAX];
+
+/*
+ * Pass the fp_load[] image to loadvsx(). Called by name ("bl load_vsx")
+ * from the inline asm in tm_vsx(), hence __attribute__((used)).
+ */
+__attribute__((used)) void load_vsx(void)
+{
+ loadvsx(fp_load, 0);
+}
+
+/*
+ * Pass the fp_load_ckpt[] image to loadvsx(). Called by name
+ * ("bl load_vsx_ckpt") from the inline asm in tm_vsx() before the
+ * transaction begins, hence __attribute__((used)).
+ */
+__attribute__((used)) void load_vsx_ckpt(void)
+{
+ loadvsx(fp_load_ckpt, 0);
+}
+
+/*
+ * Child (tracee) side of the test; never returns, always exit()s.
+ *
+ * Attaches the shared segment, calls load_vsx_ckpt(), then starts a
+ * transaction with tbegin. Inside the transaction it calls load_vsx(),
+ * suspends, stores 1 to cptr[1] (the flag the parent spins on), resumes
+ * and spins at "b .". The code at label 2 is the target of the "beq 2f"
+ * following tbegin. (the transaction-failure path per the Power ISA TM
+ * facility); it sets result to 1 and reads TEXASR.
+ *
+ * When result is non-zero and the tracer has not yet set cptr[0], the
+ * whole sequence is retried. Once cptr[0] is set, the registers are
+ * stored with storevsx() and compared against fp_load_ckpt_new[]; the exit
+ * status is 0 on match and 1 otherwise. A zero result (the tend. path)
+ * also exits with 1.
+ */
+void tm_vsx(void)
+{
+ unsigned long result, texasr;
+ int ret;
+
+ cptr = (unsigned long *)shmat(shm_id, NULL, 0);
+
+trans:
+ cptr[1] = 0;
+ asm __volatile__(
+ "bl load_vsx_ckpt;"
+
+ "1: ;"
+ "tbegin.;"
+ "beq 2f;"
+
+ "bl load_vsx;"
+ "tsuspend.;"
+ "li 7, 1;"
+ "stw 7, 0(%[cptr1]);"
+ "tresume.;"
+ "b .;"
+
+ "tend.;"
+ "li 0, 0;"
+ "ori %[res], 0, 0;"
+ "b 3f;"
+
+ "2: ;"
+ "li 0, 1;"
+ "ori %[res], 0, 0;"
+ "mfspr %[texasr], %[sprn_texasr];"
+
+ "3: ;"
+ : [res] "=r" (result), [texasr] "=r" (texasr)
+ : [sprn_texasr] "i" (SPRN_TEXASR), [cptr1] "b" (&cptr[1])
+ : "memory", "r0", "r3", "r4",
+ "r7", "r8", "r9", "r10", "r11", "lr"
+ );
+
+ if (result) {
+ /* Tracer has not finished yet: run the transaction again. */
+ if (!cptr[0])
+ goto trans;
+
+ shmdt((void *)cptr);
+ storevsx(fp_store, 0);
+ ret = compare_vsx_vmx(fp_store, fp_load_ckpt_new);
+ if (ret)
+ exit(1);
+ exit(0);
+ }
+ shmdt((void *)cptr);
+ exit(1);
+}
+
+/*
+ * Tracer half of the TM VSX/VMX test.
+ *
+ * Attaches to @child, checks the running VSX/VMX state against fp_load[]
+ * and the checkpointed state against fp_load_ckpt[], then writes the
+ * fp_load_ckpt_new[] image into the checkpointed VSX/VMX state, sets
+ * pptr[0] (the flag tm_vsx() checks as cptr[0]) and detaches.
+ *
+ * vmx[] has VMX_MAX + 2 rows because show_vmx_ckpt()/write_vmx_ckpt() copy
+ * 34 register pairs.
+ *
+ * Returns TEST_PASS when every step succeeded.
+ */
+int trace_tm_vsx(pid_t child)
+{
+ unsigned long vsx[VSX_MAX];
+ unsigned long vmx[VMX_MAX + 2][2];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_vsx(child, vsx));
+ FAIL_IF(validate_vsx(vsx, fp_load));
+ FAIL_IF(show_vmx(child, vmx));
+ FAIL_IF(validate_vmx(vmx, fp_load));
+ FAIL_IF(show_vsx_ckpt(child, vsx));
+ FAIL_IF(validate_vsx(vsx, fp_load_ckpt));
+ FAIL_IF(show_vmx_ckpt(child, vmx));
+ FAIL_IF(validate_vmx(vmx, fp_load_ckpt));
+ memset(vsx, 0, sizeof(vsx));
+ memset(vmx, 0, sizeof(vmx));
+
+ load_vsx_vmx(fp_load_ckpt_new, vsx, vmx);
+
+ FAIL_IF(write_vsx_ckpt(child, vsx));
+ FAIL_IF(write_vmx_ckpt(child, vmx));
+ pptr[0] = 1;
+ FAIL_IF(stop_trace(child));
+ return TEST_PASS;
+}
+
+/*
+ * Test body: fill the three register images with pseudo-random data, fork
+ * a child running tm_vsx() and drive it from the parent through ptrace.
+ *
+ * A two-word System V shared memory segment is the handshake area
+ * (pptr[1]: child is inside the transaction, pptr[0]: tracer is done).
+ *
+ * Returns TEST_PASS if the tracer succeeded and the child exited with
+ * status 0, TEST_FAIL otherwise.
+ */
+int ptrace_tm_vsx(void)
+{
+	pid_t pid;
+	int ret, status, i;
+
+	SKIP_IF(!have_htm());
+
+	/* cptr/pptr are unsigned long pointers; size the segment to match. */
+	shm_id = shmget(IPC_PRIVATE, sizeof(unsigned long) * 2,
+			0777|IPC_CREAT);
+	if (shm_id < 0) {
+		perror("shmget() failed");
+		return TEST_FAIL;
+	}
+
+	/* Unsigned arithmetic: rand() can be RAND_MAX, so int math overflows. */
+	for (i = 0; i < VEC_MAX; i++) {
+		fp_load[i] = 1UL + rand();
+		fp_load_ckpt[i] = 1UL + 2UL * rand();
+		fp_load_ckpt_new[i] = 1UL + 3UL * rand();
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork() failed");
+		/* Do not leave the IPC segment behind. */
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	if (pid == 0)
+		tm_vsx();
+
+	if (pid) {
+		pptr = (unsigned long *)shmat(shm_id, NULL, 0);
+		/* Spin until the child signals that it is ready. */
+		while (!pptr[1])
+			asm volatile("" : : : "memory");
+
+		ret = trace_tm_vsx(pid);
+		if (ret) {
+			kill(pid, SIGKILL);
+			shmdt((void *)pptr);
+			shmctl(shm_id, IPC_RMID, NULL);
+			return TEST_FAIL;
+		}
+
+		shmdt((void *)pptr);
+		ret = wait(&status);
+		shmctl(shm_id, IPC_RMID, NULL);
+		if (ret != pid) {
+			printf("Child's exit status not captured\n");
+			return TEST_FAIL;
+		}
+
+		return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+			TEST_PASS;
+	}
+	return TEST_PASS;
+}
+
+/* Run ptrace_tm_vsx() through test_harness(); argc/argv are unused. */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_tm_vsx, "ptrace_tm_vsx");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.c b/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.c
new file mode 100644
index 000000000..cb9875f76
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Ptrace test for VMX/VSX registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include "ptrace.h"
+#include "ptrace-vsx.h"
+
+/* Tracer and Tracee Shared Data */
+int shm_id;
+int *cptr, *pptr;
+
+unsigned long fp_load[VEC_MAX];
+unsigned long fp_load_new[VEC_MAX];
+unsigned long fp_store[VEC_MAX];
+
+/*
+ * Child (tracee) side of the test; never returns, always exit()s.
+ *
+ * Passes fp_load[] to loadvsx(), sets cptr[1] to tell the parent it is
+ * ready, then spins until the parent sets cptr[0]. Afterwards the
+ * registers are stored with storevsx() and compared against fp_load_new[]
+ * (the image the tracer writes); exit status is 0 on match, 1 otherwise.
+ */
+void vsx(void)
+{
+ int ret;
+
+ cptr = (int *)shmat(shm_id, NULL, 0);
+ loadvsx(fp_load, 0);
+ cptr[1] = 1;
+
+ /* Empty asm with a "memory" clobber forces cptr[0] to be re-read. */
+ while (!cptr[0])
+ asm volatile("" : : : "memory");
+ shmdt((void *) cptr);
+
+ storevsx(fp_store, 0);
+ ret = compare_vsx_vmx(fp_store, fp_load_new);
+ if (ret)
+ exit(1);
+ exit(0);
+}
+
+/*
+ * Tracer half of the VSX/VMX test.
+ *
+ * Attaches to @child, checks the VSX and VMX registers read through
+ * ptrace against fp_load[], then writes the fp_load_new[] image into both
+ * register sets and detaches.
+ *
+ * Returns TEST_PASS when every step succeeded.
+ */
+int trace_vsx(pid_t child)
+{
+ unsigned long vsx[VSX_MAX];
+ unsigned long vmx[VMX_MAX + 2][2];
+
+ FAIL_IF(start_trace(child));
+ FAIL_IF(show_vsx(child, vsx));
+ FAIL_IF(validate_vsx(vsx, fp_load));
+ FAIL_IF(show_vmx(child, vmx));
+ FAIL_IF(validate_vmx(vmx, fp_load));
+
+ /* Start from zeroed buffers before building the new image. */
+ memset(vsx, 0, sizeof(vsx));
+ memset(vmx, 0, sizeof(vmx));
+ load_vsx_vmx(fp_load_new, vsx, vmx);
+
+ FAIL_IF(write_vsx(child, vsx));
+ FAIL_IF(write_vmx(child, vmx));
+ FAIL_IF(stop_trace(child));
+
+ return TEST_PASS;
+}
+
+/*
+ * Test body: fill fp_load[] and fp_load_new[] with pseudo-random data,
+ * fork a child running vsx() and drive it from the parent through ptrace.
+ *
+ * A two-int System V shared memory segment is the handshake area
+ * (pptr[1]: child has loaded its registers, pptr[0]: tracer is done).
+ *
+ * Returns TEST_PASS if the tracer succeeded and the child exited with
+ * status 0, TEST_FAIL otherwise.
+ */
+int ptrace_vsx(void)
+{
+	pid_t pid;
+	int ret, status, i;
+
+	SKIP_IF(!have_hwcap(PPC_FEATURE_HAS_VSX));
+
+	shm_id = shmget(IPC_PRIVATE, sizeof(int) * 2, 0777|IPC_CREAT);
+	if (shm_id < 0) {
+		perror("shmget() failed");
+		return TEST_FAIL;
+	}
+
+	/* Unsigned arithmetic: int math on rand() results can overflow. */
+	for (i = 0; i < VEC_MAX; i++)
+		fp_load[i] = (unsigned long)i + rand();
+
+	for (i = 0; i < VEC_MAX; i++)
+		fp_load_new[i] = (unsigned long)i + 2UL * rand();
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork() failed");
+		/* Do not leave the IPC segment behind. */
+		shmctl(shm_id, IPC_RMID, NULL);
+		return TEST_FAIL;
+	}
+
+	if (pid == 0)
+		vsx();
+
+	if (pid) {
+		pptr = (int *)shmat(shm_id, NULL, 0);
+		/* Spin until the child signals that it is ready. */
+		while (!pptr[1])
+			asm volatile("" : : : "memory");
+
+		ret = trace_vsx(pid);
+		if (ret) {
+			kill(pid, SIGTERM);
+			shmdt((void *)pptr);
+			shmctl(shm_id, IPC_RMID, NULL);
+			return TEST_FAIL;
+		}
+
+		/* Release the child so it can verify the written registers. */
+		pptr[0] = 1;
+		shmdt((void *)pptr);
+
+		ret = wait(&status);
+		shmctl(shm_id, IPC_RMID, NULL);
+		if (ret != pid) {
+			printf("Child's exit status not captured\n");
+			return TEST_FAIL;
+		}
+
+		return (WIFEXITED(status) && WEXITSTATUS(status)) ? TEST_FAIL :
+			TEST_PASS;
+	}
+	return TEST_PASS;
+}
+
+/* Run ptrace_vsx() through test_harness(); argc/argv are unused. */
+int main(int argc, char *argv[])
+{
+ return test_harness(ptrace_vsx, "ptrace_vsx");
+}
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.h b/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.h
new file mode 100644
index 000000000..663348521
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-vsx.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#define VEC_MAX 128
+#define VSX_MAX 32
+#define VMX_MAX 32
+
+/*
+ * Compare the VSX_MAX values in vsx[] with the odd-indexed entries of the
+ * load[] image: vsx[i] is expected to equal load[2 * i + 1].
+ *
+ * unsigned long vsx[32]
+ * unsigned long load[128]
+ *
+ * Prints the first mismatch and returns TEST_FAIL; returns TEST_PASS when
+ * all entries match.
+ */
+int validate_vsx(unsigned long *vsx, unsigned long *load)
+{
+	int reg, idx;
+
+	for (reg = 0, idx = 1; reg < VSX_MAX; reg++, idx += 2) {
+		if (vsx[reg] == load[idx])
+			continue;
+
+		printf("vsx[%d]: %lx load[%d] %lx\n",
+				reg, vsx[reg], idx, load[idx]);
+		return TEST_FAIL;
+	}
+	return TEST_PASS;
+}
+
+/*
+ * Compare the VMX_MAX register pairs in vmx[][] with entries 64..127 of
+ * the load[] image.
+ *
+ * unsigned long vmx[32][2]
+ * unsigned long load[128]
+ *
+ * On big endian vmx[i][0]/vmx[i][1] map to load[64 + 2i]/load[65 + 2i];
+ * on little endian the two halves of each pair are swapped.
+ *
+ * Prints the first mismatching pair and returns TEST_FAIL; returns
+ * TEST_PASS when all pairs match.
+ */
+int validate_vmx(unsigned long vmx[][2], unsigned long *load)
+{
+	int i, first, second;
+
+	for (i = 0; i < VMX_MAX; i++) {
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		first = 64 + 2 * i;
+		second = 65 + 2 * i;
+#else
+		/* In LE each value pair is stored in an alternate manner. */
+		first = 65 + 2 * i;
+		second = 64 + 2 * i;
+#endif
+		if (vmx[i][0] == load[first] && vmx[i][1] == load[second])
+			continue;
+
+		printf("vmx[%d][0]: %lx load[%d] %lx\n",
+				i, vmx[i][0], first, load[first]);
+		printf("vmx[%d][1]: %lx load[%d] %lx\n",
+				i, vmx[i][1], second, load[second]);
+		return TEST_FAIL;
+	}
+	return TEST_PASS;
+}
+
+/*
+ * Compare a register image written by storevsx() (store[]) with the image
+ * that was expected to be in the registers (load[]).
+ *
+ * unsigned long store[128]
+ * unsigned long load[128]
+ *
+ * Entries 0..63: only the odd indices are compared, store[2i + 1] against
+ * load[2i + 1].
+ * Entries 64..127: compared one to one on big endian; on little endian
+ * the two entries of each even/odd pair are swapped.
+ *
+ * Prints the first mismatch and returns TEST_FAIL; returns TEST_PASS when
+ * everything matches.
+ */
+int compare_vsx_vmx(unsigned long *store, unsigned long *load)
+{
+	int i;
+
+	for (i = 0; i < VSX_MAX; i++) {
+		if (store[1 + 2 * i] != load[1 + 2 * i]) {
+			/* Report the values that were actually compared. */
+			printf("store[%d]: %lx load[%d] %lx\n",
+					1 + 2 * i, store[1 + 2 * i],
+					1 + 2 * i, load[1 + 2 * i]);
+			return TEST_FAIL;
+		}
+	}
+
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	for (i = 64; i < VEC_MAX; i++) {
+		if (store[i] != load[i]) {
+			printf("store[%d]: %lx load[%d] %lx\n",
+					i, store[i], i, load[i]);
+			return TEST_FAIL;
+		}
+	}
+#else	/* In LE each value pair is stored in an alternate manner */
+	for (i = 64; i < VEC_MAX; i++) {
+		if (!(i % 2) && (store[i] != load[i+1])) {
+			printf("store[%d]: %lx load[%d] %lx\n",
+					i, store[i], i+1, load[i+1]);
+			return TEST_FAIL;
+		}
+		if ((i % 2) && (store[i] != load[i-1])) {
+			printf("here store[%d]: %lx load[%d] %lx\n",
+					i, store[i], i-1, load[i-1]);
+			return TEST_FAIL;
+		}
+	}
+#endif
+	return TEST_PASS;
+}
+
+/*
+ * Split a 128-entry load[] image into the buffers used by the ptrace
+ * write helpers: vsx[i] takes load[2i + 1] and vmx[i][0..1] takes
+ * load[64 + 2i] and load[65 + 2i].
+ */
+void load_vsx_vmx(unsigned long *load, unsigned long *vsx,
+		unsigned long vmx[][2])
+{
+	const unsigned long *vmx_src = &load[64];
+	int n;
+
+	for (n = 0; n < VSX_MAX; n++)
+		vsx[n] = load[2 * n + 1];
+
+	for (n = 0; n < VMX_MAX; n++, vmx_src += 2) {
+		vmx[n][0] = vmx_src[0];
+		vmx[n][1] = vmx_src[1];
+	}
+}
+
+void loadvsx(void *p, int tmp);
+void storevsx(void *p, int tmp);
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace.h b/tools/testing/selftests/powerpc/ptrace/ptrace.h
new file mode 100644
index 000000000..5181ad9b4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace.h
@@ -0,0 +1,745 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Ptrace interface test helper functions
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ */
+#include <inttypes.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <malloc.h>
+#include <errno.h>
+#include <time.h>
+#include <sys/ptrace.h>
+#include <sys/ioctl.h>
+#include <sys/uio.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/signal.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/user.h>
+#include <linux/elf.h>
+#include <linux/types.h>
+#include <linux/auxvec.h>
+#include "reg.h"
+#include "utils.h"
+
+#define TEST_PASS 0
+#define TEST_FAIL 1
+
+/*
+ * Buffer layout used by the FPR helpers in this header for
+ * PTRACE_GETFPREGS/PTRACE_SETFPREGS and the NT_PPC_TM_CFPR regset:
+ * 32 register slots followed by fpscr.
+ */
+struct fpr_regs {
+ unsigned long fpr[32];
+ unsigned long fpscr;
+};
+
+/* Buffer layout used by show_tm_spr() for the NT_PPC_TM_SPR regset. */
+struct tm_spr_regs {
+ unsigned long tm_tfhar;
+ unsigned long tm_texasr;
+ unsigned long tm_tfiar;
+};
+
+#ifndef NT_PPC_TAR
+#define NT_PPC_TAR 0x103
+#define NT_PPC_PPR 0x104
+#define NT_PPC_DSCR 0x105
+#define NT_PPC_EBB 0x106
+#define NT_PPC_PMU 0x107
+#define NT_PPC_TM_CGPR 0x108
+#define NT_PPC_TM_CFPR 0x109
+#define NT_PPC_TM_CVMX 0x10a
+#define NT_PPC_TM_CVSX 0x10b
+#define NT_PPC_TM_SPR 0x10c
+#define NT_PPC_TM_CTAR 0x10d
+#define NT_PPC_TM_CPPR 0x10e
+#define NT_PPC_TM_CDSCR 0x10f
+#endif
+
+/* Basic ptrace operations */
+/*
+ * Attach to @child with PTRACE_ATTACH and wait for it with waitpid().
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int start_trace(pid_t child)
+{
+	if (ptrace(PTRACE_ATTACH, child, NULL, NULL)) {
+		perror("ptrace(PTRACE_ATTACH) failed");
+		return TEST_FAIL;
+	}
+
+	if (waitpid(child, NULL, 0) != child) {
+		perror("waitpid() failed");
+		return TEST_FAIL;
+	}
+
+	return TEST_PASS;
+}
+
+/*
+ * Detach from @child with PTRACE_DETACH.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int stop_trace(pid_t child)
+{
+	if (ptrace(PTRACE_DETACH, child, NULL, NULL) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_DETACH) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Resume the stopped tracee @child with PTRACE_CONT.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int cont_trace(pid_t child)
+{
+	if (ptrace(PTRACE_CONT, child, NULL, NULL) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_CONT) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Attach to @child, read regset @type into regs[] (room for @n unsigned
+ * longs) with PTRACE_GETREGSET, and detach again.
+ *
+ * Returns 0 on success, the non-zero ptrace() result if the read failed,
+ * or a failure code from FAIL_IF() if attaching or detaching failed.
+ */
+int ptrace_read_regs(pid_t child, unsigned long type, unsigned long regs[],
+		     int n)
+{
+	struct iovec iov;
+	long ret;
+
+	FAIL_IF(start_trace(child));
+
+	iov.iov_base = regs;
+	iov.iov_len = n * sizeof(unsigned long);
+
+	ret = ptrace(PTRACE_GETREGSET, child, type, &iov);
+
+	/*
+	 * Detach even when the read failed, as ptrace_write_regs() does;
+	 * otherwise the child would be left attached and stopped.
+	 */
+	FAIL_IF(stop_trace(child));
+
+	return ret;
+}
+
+/*
+ * Attach to @child, write @n unsigned longs from regs[] into regset @type
+ * with PTRACE_SETREGSET, and detach again.
+ *
+ * Returns the ptrace(PTRACE_SETREGSET) result; attach/detach failures go
+ * through FAIL_IF() (defined outside this file).
+ */
+long ptrace_write_regs(pid_t child, unsigned long type, unsigned long regs[],
+ int n)
+{
+ struct iovec iov;
+ long ret;
+
+ FAIL_IF(start_trace(child));
+
+ iov.iov_base = regs;
+ iov.iov_len = n * sizeof(unsigned long);
+
+ ret = ptrace(PTRACE_SETREGSET, child, type, &iov);
+
+ /* Detach before returning, whether or not the write succeeded. */
+ FAIL_IF(stop_trace(child));
+
+ return ret;
+}
+
+/* TAR, PPR, DSCR */
+/*
+ * Read the TAR, PPR and DSCR regsets of the traced @child, in that order.
+ * When @out is non-NULL the values are stored in out[0], out[1], out[2].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) on the first
+ * failing step.
+ */
+int show_tar_registers(pid_t child, unsigned long *out)
+{
+	const unsigned long notes[] = {
+		NT_PPC_TAR, NT_PPC_PPR, NT_PPC_DSCR
+	};
+	struct iovec iov;
+	unsigned long *reg;
+	int status = TEST_FAIL;
+	unsigned int n;
+
+	reg = malloc(sizeof(unsigned long));
+	if (!reg) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+	iov.iov_base = (u64 *) reg;
+	iov.iov_len = sizeof(unsigned long);
+
+	for (n = 0; n < sizeof(notes) / sizeof(notes[0]); n++) {
+		if (ptrace(PTRACE_GETREGSET, child, notes[n], &iov)) {
+			perror("ptrace(PTRACE_GETREGSET) failed");
+			goto done;
+		}
+		if (out)
+			out[n] = *reg;
+	}
+	status = TEST_PASS;
+done:
+	free(reg);
+	return status;
+}
+
+/*
+ * Write @tar, @ppr and @dscr into the TAR, PPR and DSCR regsets of the
+ * traced @child, in that order.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) on the first
+ * failing step.
+ */
+int write_tar_registers(pid_t child, unsigned long tar,
+		unsigned long ppr, unsigned long dscr)
+{
+	const unsigned long notes[] = {
+		NT_PPC_TAR, NT_PPC_PPR, NT_PPC_DSCR
+	};
+	const unsigned long values[] = { tar, ppr, dscr };
+	struct iovec iov;
+	unsigned long *reg;
+	int status = TEST_FAIL;
+	unsigned int n;
+
+	reg = malloc(sizeof(unsigned long));
+	if (!reg) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = (u64 *) reg;
+	iov.iov_len = sizeof(unsigned long);
+
+	for (n = 0; n < sizeof(notes) / sizeof(notes[0]); n++) {
+		*reg = values[n];
+		if (ptrace(PTRACE_SETREGSET, child, notes[n], &iov)) {
+			perror("ptrace(PTRACE_SETREGSET) failed");
+			goto done;
+		}
+	}
+	status = TEST_PASS;
+done:
+	free(reg);
+	return status;
+}
+
+/*
+ * Read the checkpointed TAR, PPR and DSCR regsets (NT_PPC_TM_CTAR,
+ * NT_PPC_TM_CPPR, NT_PPC_TM_CDSCR) of the traced @child, in that order.
+ * When @out is non-NULL the values are stored in out[0], out[1], out[2].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) on the first
+ * failing step.
+ */
+int show_tm_checkpointed_state(pid_t child, unsigned long *out)
+{
+	const unsigned long notes[] = {
+		NT_PPC_TM_CTAR, NT_PPC_TM_CPPR, NT_PPC_TM_CDSCR
+	};
+	struct iovec iov;
+	unsigned long *reg;
+	int status = TEST_FAIL;
+	unsigned int n;
+
+	reg = malloc(sizeof(unsigned long));
+	if (!reg) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = (u64 *) reg;
+	iov.iov_len = sizeof(unsigned long);
+
+	for (n = 0; n < sizeof(notes) / sizeof(notes[0]); n++) {
+		if (ptrace(PTRACE_GETREGSET, child, notes[n], &iov)) {
+			perror("ptrace(PTRACE_GETREGSET) failed");
+			goto done;
+		}
+		if (out)
+			out[n] = *reg;
+	}
+	status = TEST_PASS;
+done:
+	free(reg);
+	return status;
+}
+
+/*
+ * Write @tar, @ppr and @dscr into the checkpointed TAR, PPR and DSCR
+ * regsets (NT_PPC_TM_CTAR, NT_PPC_TM_CPPR, NT_PPC_TM_CDSCR) of the traced
+ * @child, in that order.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) on the first
+ * failing step.
+ */
+int write_ckpt_tar_registers(pid_t child, unsigned long tar,
+		unsigned long ppr, unsigned long dscr)
+{
+	struct iovec iov;
+	unsigned long *reg;
+	int ret;
+
+	reg = malloc(sizeof(unsigned long));
+	if (!reg) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = (u64 *) reg;
+	iov.iov_len = sizeof(unsigned long);
+
+	/* The diagnostics name the request that is actually issued. */
+	*reg = tar;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CTAR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGSET) failed");
+		goto fail;
+	}
+
+	*reg = ppr;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CPPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGSET) failed");
+		goto fail;
+	}
+
+	*reg = dscr;
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CDSCR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGSET) failed");
+		goto fail;
+	}
+
+	free(reg);
+	return TEST_PASS;
+fail:
+	free(reg);
+	return TEST_FAIL;
+}
+
+/* FPR */
+/*
+ * Read the floating point registers of the traced @child with
+ * PTRACE_GETFPREGS. When @fpr is non-NULL the 32 FPR values are copied
+ * into fpr[0..31].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_fpr(pid_t child, unsigned long *fpr)
+{
+	struct fpr_regs *regs;
+	int ret, i;
+
+	regs = (struct fpr_regs *) malloc(sizeof(struct fpr_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	ret = ptrace(PTRACE_GETFPREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_GETFPREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	if (fpr) {
+		for (i = 0; i < 32; i++)
+			fpr[i] = regs->fpr[i];
+	}
+
+	/* The scratch buffer is only needed for the copy above. */
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Set all 32 floating point registers of the traced @child to @val.
+ * The current state is read first with PTRACE_GETFPREGS so that fpscr is
+ * written back unchanged by PTRACE_SETFPREGS.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_fpr(pid_t child, unsigned long val)
+{
+	struct fpr_regs *regs;
+	int ret, i;
+
+	regs = (struct fpr_regs *) malloc(sizeof(struct fpr_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	ret = ptrace(PTRACE_GETFPREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_GETFPREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	for (i = 0; i < 32; i++)
+		regs->fpr[i] = val;
+
+	ret = ptrace(PTRACE_SETFPREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_SETFPREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Read the checkpointed floating point registers (NT_PPC_TM_CFPR regset)
+ * of the traced @child. When @fpr is non-NULL the 32 FPR values are
+ * copied into fpr[0..31].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_ckpt_fpr(pid_t child, unsigned long *fpr)
+{
+	struct fpr_regs *regs;
+	struct iovec iov;
+	int ret, i;
+
+	regs = (struct fpr_regs *) malloc(sizeof(struct fpr_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = regs;
+	iov.iov_len = sizeof(struct fpr_regs);
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CFPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	if (fpr) {
+		for (i = 0; i < 32; i++)
+			fpr[i] = regs->fpr[i];
+	}
+
+	/* The scratch buffer is only needed for the copy above. */
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Set all 32 checkpointed floating point registers (NT_PPC_TM_CFPR
+ * regset) of the traced @child to @val. The regset is read first so that
+ * fpscr is written back unchanged.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_ckpt_fpr(pid_t child, unsigned long val)
+{
+	struct fpr_regs *regs;
+	struct iovec iov;
+	int ret, i;
+
+	regs = (struct fpr_regs *) malloc(sizeof(struct fpr_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = regs;
+	iov.iov_len = sizeof(struct fpr_regs);
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CFPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	for (i = 0; i < 32; i++)
+		regs->fpr[i] = val;
+
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CFPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	free(regs);
+	return TEST_PASS;
+}
+
+/* GPR */
+/*
+ * Read the general purpose registers of the traced @child with
+ * PTRACE_GETREGS. When @gpr is non-NULL, GPR14..GPR31 are copied into
+ * gpr[0..17].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_gpr(pid_t child, unsigned long *gpr)
+{
+	struct pt_regs *regs;
+	int ret, i;
+
+	regs = (struct pt_regs *) malloc(sizeof(struct pt_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	ret = ptrace(PTRACE_GETREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	if (gpr) {
+		for (i = 14; i < 32; i++)
+			gpr[i-14] = regs->gpr[i];
+	}
+
+	/* The scratch buffer is only needed for the copy above. */
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Set GPR14..GPR31 of the traced @child to @val. The full register set is
+ * read first with PTRACE_GETREGS so that all other fields are written
+ * back unchanged by PTRACE_SETREGS.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_gpr(pid_t child, unsigned long val)
+{
+	struct pt_regs *regs;
+	int i, ret;
+
+	regs = (struct pt_regs *) malloc(sizeof(struct pt_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	ret = ptrace(PTRACE_GETREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	for (i = 14; i < 32; i++)
+		regs->gpr[i] = val;
+
+	ret = ptrace(PTRACE_SETREGS, child, NULL, regs);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGS) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Read the checkpointed general purpose registers (NT_PPC_TM_CGPR regset)
+ * of the traced @child. When @gpr is non-NULL, GPR14..GPR31 are copied
+ * into gpr[0..17].
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_ckpt_gpr(pid_t child, unsigned long *gpr)
+{
+	struct pt_regs *regs;
+	struct iovec iov;
+	int ret, i;
+
+	regs = (struct pt_regs *) malloc(sizeof(struct pt_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = (u64 *) regs;
+	iov.iov_len = sizeof(struct pt_regs);
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CGPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	if (gpr) {
+		for (i = 14; i < 32; i++)
+			gpr[i-14] = regs->gpr[i];
+	}
+
+	/* The scratch buffer is only needed for the copy above. */
+	free(regs);
+	return TEST_PASS;
+}
+
+/*
+ * Set the checkpointed GPR14..GPR31 (NT_PPC_TM_CGPR regset) of the traced
+ * @child to @val. The regset is read first so that all other fields are
+ * written back unchanged.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_ckpt_gpr(pid_t child, unsigned long val)
+{
+	struct pt_regs *regs;
+	struct iovec iov;
+	int ret, i;
+
+	regs = (struct pt_regs *) malloc(sizeof(struct pt_regs));
+	if (!regs) {
+		/* perror() appends its own newline; none in the prefix. */
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+	iov.iov_base = (u64 *) regs;
+	iov.iov_len = sizeof(struct pt_regs);
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CGPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	for (i = 14; i < 32; i++)
+		regs->gpr[i] = val;
+
+	ret = ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CGPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_SETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	free(regs);
+	return TEST_PASS;
+}
+
+/* VMX */
+/*
+ * Read the VMX register state of the traced @child into vmx[][] with
+ * PTRACE_GETVRREGS.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_vmx(pid_t child, unsigned long vmx[][2])
+{
+	if (ptrace(PTRACE_GETVRREGS, child, 0, vmx) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_GETVRREGS) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Read the checkpointed VMX state (NT_PPC_TM_CVMX regset) of the traced
+ * @child through a local 34-pair buffer and copy all of it into vmx[][],
+ * which therefore must have room for at least 34 pairs.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_vmx_ckpt(pid_t child, unsigned long vmx[][2])
+{
+	unsigned long ckpt[34][2];
+	struct iovec iov = {
+		.iov_base = (u64 *) ckpt,
+		.iov_len = sizeof(ckpt),
+	};
+
+	if (ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CVMX, &iov)) {
+		perror("ptrace(PTRACE_GETREGSET, NT_PPC_TM_CVMX) failed");
+		return TEST_FAIL;
+	}
+
+	memcpy(vmx, ckpt, sizeof(ckpt));
+	return TEST_PASS;
+}
+
+
+/*
+ * Write vmx[][] into the VMX register state of the traced @child with
+ * PTRACE_SETVRREGS.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_vmx(pid_t child, unsigned long vmx[][2])
+{
+	if (ptrace(PTRACE_SETVRREGS, child, 0, vmx) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_SETVRREGS) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Write the checkpointed VMX state (NT_PPC_TM_CVMX regset) of the traced
+ * @child from vmx[][]. 34 pairs are copied into a local buffer first, so
+ * vmx[][] must hold at least 34 pairs.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_vmx_ckpt(pid_t child, unsigned long vmx[][2])
+{
+	unsigned long ckpt[34][2];
+	struct iovec iov;
+
+	memcpy(ckpt, vmx, sizeof(ckpt));
+	iov.iov_base = (u64 *) ckpt;
+	iov.iov_len = sizeof(ckpt);
+
+	if (ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CVMX, &iov)) {
+		perror("ptrace(PTRACE_SETREGSET, NT_PPC_TM_CVMX) failed");
+		return TEST_FAIL;
+	}
+
+	return TEST_PASS;
+}
+
+/* VSX */
+/*
+ * Read the VSX register state of the traced @child into vsx[] with
+ * PTRACE_GETVSRREGS.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_vsx(pid_t child, unsigned long *vsx)
+{
+	if (ptrace(PTRACE_GETVSRREGS, child, 0, vsx) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_GETVSRREGS) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Read the checkpointed VSX state (NT_PPC_TM_CVSX regset) of the traced
+ * @child through a local 32-entry buffer and copy it into vsx[], which
+ * must have room for at least 32 entries.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_vsx_ckpt(pid_t child, unsigned long *vsx)
+{
+	unsigned long ckpt[32];
+	struct iovec iov = {
+		.iov_base = (u64 *) ckpt,
+		.iov_len = sizeof(ckpt),
+	};
+
+	if (ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_CVSX, &iov)) {
+		perror("ptrace(PTRACE_GETREGSET, NT_PPC_TM_CVSX) failed");
+		return TEST_FAIL;
+	}
+
+	memcpy(vsx, ckpt, sizeof(ckpt));
+	return TEST_PASS;
+}
+
+/*
+ * Write vsx[] into the VSX register state of the traced @child with
+ * PTRACE_SETVSRREGS.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_vsx(pid_t child, unsigned long *vsx)
+{
+	if (ptrace(PTRACE_SETVSRREGS, child, 0, vsx) == 0)
+		return TEST_PASS;
+
+	perror("ptrace(PTRACE_SETVSRREGS) failed");
+	return TEST_FAIL;
+}
+
+/*
+ * Write the checkpointed VSX state (NT_PPC_TM_CVSX regset) of the traced
+ * @child from vsx[]. 32 entries are copied into a local buffer first, so
+ * vsx[] must hold at least 32 entries.
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int write_vsx_ckpt(pid_t child, unsigned long *vsx)
+{
+	unsigned long ckpt[32];
+	struct iovec iov;
+
+	memcpy(ckpt, vsx, sizeof(ckpt));
+	iov.iov_base = (u64 *) ckpt;
+	iov.iov_len = sizeof(ckpt);
+
+	if (ptrace(PTRACE_SETREGSET, child, NT_PPC_TM_CVSX, &iov)) {
+		perror("ptrace(PTRACE_SETREGSET, NT_PPC_TM_CVSX) failed");
+		return TEST_FAIL;
+	}
+
+	return TEST_PASS;
+}
+
+/* TM SPR */
+/*
+ * Read the TM special purpose registers (NT_PPC_TM_SPR regset: TFHAR,
+ * TEXASR, TFIAR) of the traced @child. When @out is non-NULL the result
+ * is copied into *out.
+ *
+ * Returns TEST_PASS on success, TEST_FAIL (after perror()) otherwise.
+ */
+int show_tm_spr(pid_t child, struct tm_spr_regs *out)
+{
+	struct tm_spr_regs *regs;
+	struct iovec iov;
+	int ret;
+
+	regs = (struct tm_spr_regs *) malloc(sizeof(struct tm_spr_regs));
+	if (!regs) {
+		perror("malloc() failed");
+		return TEST_FAIL;
+	}
+
+	iov.iov_base = (u64 *) regs;
+	iov.iov_len = sizeof(struct tm_spr_regs);
+
+	ret = ptrace(PTRACE_GETREGSET, child, NT_PPC_TM_SPR, &iov);
+	if (ret) {
+		perror("ptrace(PTRACE_GETREGSET) failed");
+		free(regs);
+		return TEST_FAIL;
+	}
+
+	if (out)
+		memcpy(out, regs, sizeof(struct tm_spr_regs));
+
+	/* The scratch buffer is only needed for the copy above. */
+	free(regs);
+	return TEST_PASS;
+}
+
+
+
+/* Analyse TEXASR after TM failure */
+/*
+ * Return the current value of the TFIAR special purpose register.
+ *
+ * static inline: this definition lives in a header, and a plain C99
+ * "inline" definition provides no out-of-line copy, which can leave an
+ * undefined reference when the compiler decides not to inline the call.
+ */
+static inline unsigned long get_tfiar(void)
+{
+	unsigned long ret;
+
+	asm volatile("mfspr %0,%1" : "=r" (ret) : "i" (SPRN_TFIAR));
+	return ret;
+}
+
+/*
+ * Print @texasr in hex followed by the name of every TEXASR_* flag that
+ * is set in it, then the current TFIAR value.
+ */
+void analyse_texasr(unsigned long texasr)
+{
+	const struct {
+		unsigned long long mask;
+		const char *label;
+	} flags[] = {
+		{ TEXASR_FP,  "TEXASR_FP " },
+		{ TEXASR_DA,  "TEXASR_DA " },
+		{ TEXASR_NO,  "TEXASR_NO " },
+		{ TEXASR_FO,  "TEXASR_FO " },
+		{ TEXASR_SIC, "TEXASR_SIC " },
+		{ TEXASR_NTC, "TEXASR_NTC " },
+		{ TEXASR_TC,  "TEXASR_TC " },
+		{ TEXASR_TIC, "TEXASR_TIC " },
+		{ TEXASR_IC,  "TEXASR_IC " },
+		{ TEXASR_IFC, "TEXASR_IFC " },
+		{ TEXASR_ABT, "TEXASR_ABT " },
+		{ TEXASR_SPD, "TEXASR_SPD " },
+		{ TEXASR_HV,  "TEXASR_HV " },
+		{ TEXASR_PR,  "TEXASR_PR " },
+		{ TEXASR_FS,  "TEXASR_FS " },
+		{ TEXASR_TE,  "TEXASR_TE " },
+		{ TEXASR_ROT, "TEXASR_ROT " },
+	};
+	unsigned int n;
+
+	printf("TEXASR: %16lx\t", texasr);
+
+	/* Flags are reported in table order. */
+	for (n = 0; n < sizeof(flags) / sizeof(flags[0]); n++) {
+		if (texasr & flags[n].mask)
+			printf("%s", flags[n].label);
+	}
+
+	printf("TFIAR :%lx\n", get_tfiar());
+}
+
+void store_gpr(unsigned long *addr);
+void store_fpr(float *addr);
diff --git a/tools/testing/selftests/powerpc/scripts/hmi.sh b/tools/testing/selftests/powerpc/scripts/hmi.sh
new file mode 100755
index 000000000..dcdb392e8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/scripts/hmi.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Copyright 2015, Daniel Axtens, IBM Corporation
+#
+
+
+# Locate getscom/putscom: prefer executables in the current directory,
+# otherwise require BOTH tools in $PATH. Sets GETSCOM and PUTSCOM, or
+# prints a hint and exits 1.
+if [ -x ./getscom ] && [ -x ./putscom ]; then
+	GETSCOM=./getscom
+	PUTSCOM=./putscom
+elif command -v getscom > /dev/null && command -v putscom > /dev/null; then
+	# command -v is the POSIX lookup; 'which' is not guaranteed to exist
+	GETSCOM=$(command -v getscom)
+	PUTSCOM=$(command -v putscom)
+else
+	cat <<EOF
+Can't find getscom/putscom in . or \$PATH.
+See https://github.com/open-power/skiboot.
+The tool is in external/xscom-utils
+EOF
+	exit 1
+fi
+
+# We will get 8 HMI events per injection
+# todo: deal with things being offline
+expected_hmis=8
+# Print the number of dmesg lines that report a harmless HMI.
+COUNT_HMIS() {
+ dmesg | grep -c 'Harmless Hypervisor Maintenance interrupt'
+}
+
+# massively expand snooze delay, allowing injection on all cores
+ppc64_cpu --smt-snooze-delay=1000000000
+
+# when we exit, restore it
+trap "ppc64_cpu --smt-snooze-delay=100" 0 1
+
+# for each chip+core combination
+# todo - less fragile parsing
+egrep -o 'OCC: Chip [0-9a-f]+ Core [0-9a-f]' < /sys/firmware/opal/msglog |
+while read chipcore; do
+ chip=$(echo "$chipcore"|awk '{print $3}')
+ core=$(echo "$chipcore"|awk '{print $5}')
+ fir="0x1${core}013100"
+
+ # verify that Core FIR is zero as expected
+ if [ "$($GETSCOM -c 0x${chip} $fir)" != 0 ]; then
+ echo "FIR was not zero before injection for chip $chip, core $core. Aborting!"
+ echo "Result of $GETSCOM -c 0x${chip} $fir:"
+ $GETSCOM -c 0x${chip} $fir
+ echo "If you get a -5 error, the core may be in idle state. Try stress-ng."
+ echo "Otherwise, try $PUTSCOM -c 0x${chip} $fir 0"
+ exit 1
+ fi
+
+ # keep track of the number of HMIs handled
+ old_hmis=$(COUNT_HMIS)
+
+ # do injection, adding a marker to dmesg for clarity
+ echo "Injecting HMI on core $core, chip $chip" | tee /dev/kmsg
+ # inject a RegFile recoverable error
+ if ! $PUTSCOM -c 0x${chip} $fir 2000000000000000 > /dev/null; then
+ echo "Error injecting. Aborting!"
+ exit 1
+ fi
+
+ # now we want to wait for all the HMIs to be processed
+ # we expect one per thread on the core
+ i=0;
+ new_hmis=$(COUNT_HMIS)
+ while [ $new_hmis -lt $((old_hmis + expected_hmis)) ] && [ $i -lt 12 ]; do
+ echo "Seen $((new_hmis - old_hmis)) HMI(s) out of $expected_hmis expected, sleeping"
+ sleep 5;
+ i=$((i + 1))
+ new_hmis=$(COUNT_HMIS)
+ done
+ if [ $i = 12 ]; then
+ echo "Haven't seen expected $expected_hmis recoveries after 1 min. Aborting."
+ exit 1
+ fi
+ echo "Processed $expected_hmis events; presumed success. Check dmesg."
+ echo ""
+done
diff --git a/tools/testing/selftests/powerpc/security/.gitignore b/tools/testing/selftests/powerpc/security/.gitignore
new file mode 100644
index 000000000..4257a1f15
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rfi_flush
+entry_flush
diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile
new file mode 100644
index 000000000..f25e854fe
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0+
+
+TEST_GEN_PROGS := rfi_flush entry_flush spectre_v2
+top_srcdir = ../../../../..
+
+CFLAGS += -I../../../../../usr/include
+
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+$(OUTPUT)/spectre_v2: CFLAGS += -m64
+$(OUTPUT)/spectre_v2: ../pmu/event.c branch_loops.S
+$(OUTPUT)/rfi_flush: flush_utils.c
+$(OUTPUT)/entry_flush: flush_utils.c
diff --git a/tools/testing/selftests/powerpc/security/branch_loops.S b/tools/testing/selftests/powerpc/security/branch_loops.S
new file mode 100644
index 000000000..22e9204e3
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/branch_loops.S
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2019, Michael Ellerman, IBM Corp.
+ */
+
+#include <ppc-asm.h>
+
+ .data
+
+/*
+ * One 32-bit entry per state: the offset of .Lstate_N from .Lstate_0.
+ * pattern_cache_loop indexes this table with the current state number and
+ * adds the entry to the address of .Lstate_0 to form its branch target.
+ */
+jump_table:
+ .long 0x0
+ .long (.Lstate_1 - .Lstate_0)
+ .long (.Lstate_2 - .Lstate_0)
+ .long (.Lstate_3 - .Lstate_0)
+ .long (.Lstate_4 - .Lstate_0)
+ .long (.Lstate_5 - .Lstate_0)
+ .long (.Lstate_6 - .Lstate_0)
+ .long (.Lstate_7 - .Lstate_0)
+
+ .text
+
+#define ITER_SHIFT 31
+
+/*
+ * Emit the 32-byte aligned block for state \number: load the next state
+ * number into r3 (state 7 wraps around to 0) and branch back to .Lloop.
+ */
+.macro state number
+ .balign 32
+.Lstate_\number:
+ .if \number==7
+ li r3, 0
+ .else
+ li r3, \number+1
+ .endif
+ b .Lloop
+.endm
+
+/*
+ * Run 1 << ITER_SHIFT iterations of an indirect branch whose target
+ * cycles through the eight state blocks 0, 1, ..., 7, 0, ...
+ *
+ * r3 holds the current state number, r4 the remaining iteration count.
+ * Returns (beqlr) once r4 reaches zero.
+ */
+FUNC_START(pattern_cache_loop)
+ li r3, 0
+ li r4, 1
+ sldi r4, r4, ITER_SHIFT
+
+.Lloop: cmpdi r4, 0
+ beqlr
+
+ addi r4, r4, -1
+
+ /* r6 = jump_table[r3] (4-byte entries), r7 = address of .Lstate_0 */
+ ld r6, jump_table@got(%r2)
+ sldi r5, r3, 2
+ lwax r6, r5, r6
+ ld r7, .Lstate_0@got(%r2)
+ /* branch through CTR to .Lstate_0 + offset */
+ add r6, r6, r7
+ mtctr r6
+ bctr
+
+ state 0
+ state 1
+ state 2
+ state 3
+ state 4
+ state 5
+ state 6
+ state 7
+
+FUNC_END(pattern_cache_loop)
+
+
+/*
+ * Run 1 << ITER_SHIFT iterations of an indirect branch that always goes
+ * to the same target (local label 2), which branches straight back to
+ * the loop head.
+ *
+ * r3 holds the remaining iteration count; returns (beqlr) once it
+ * reaches zero.
+ */
+FUNC_START(indirect_branch_loop)
+ li r3, 1
+ sldi r3, r3, ITER_SHIFT
+
+1: cmpdi r3, 0
+ beqlr
+
+ addi r3, r3, -1
+
+ /* load the address of label 2 from the GOT and branch via CTR */
+ ld r4, 2f@got(%r2)
+ mtctr r4
+ bctr
+
+ .balign 32
+2: b 1b
+
+FUNC_END(indirect_branch_loop)
diff --git a/tools/testing/selftests/powerpc/security/entry_flush.c b/tools/testing/selftests/powerpc/security/entry_flush.c
new file mode 100644
index 000000000..68ce377b2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/entry_flush.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2018 IBM Corporation.
+ */
+
+#define __SANE_USERSPACE_TYPES__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "utils.h"
+#include "flush_utils.h"
+
+/*
+ * Check that the powerpc/entry_flush debugfs switch has the expected
+ * effect on L1D read misses.
+ *
+ * With rfi_flush forced to 0, a zero_size byte buffer is touched one
+ * cache line at a time around a system call (syscall_loop()) while a perf
+ * counter records L1D read misses. This is repeated `repetitions` times
+ * with entry_flush at its original value and again with it inverted:
+ *   - entry_flush set:   misses >= iterations * (lines - 2) are expected
+ *   - entry_flush clear: fewer than half of that is expected
+ * Every repetition must meet the expectation for a setting to pass.
+ *
+ * The original rfi_flush and entry_flush values are restored at the end.
+ * Returns 0 on success and non-zero on failure; skipped unless run as
+ * root on a CPU with PPC_FEATURE_ARCH_2_06 and both debugfs files present.
+ */
+int entry_flush_test(void)
+{
+	char *p;
+	int repetitions = 10;
+	int fd, passes = 0, iter, rc = 0;
+	struct perf_event_read v;
+	__u64 l1d_misses_total = 0;
+	unsigned long iterations = 100000, zero_size = 24 * 1024;
+	unsigned long l1d_misses_expected;
+	int rfi_flush_orig;
+	int entry_flush, entry_flush_orig;
+
+	SKIP_IF(geteuid() != 0);
+
+	// The PMU event we use only works on Power7 or later
+	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
+	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
+		perror("Unable to read powerpc/rfi_flush debugfs file");
+		SKIP_IF(1);
+	}
+
+	if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) {
+		perror("Unable to read powerpc/entry_flush debugfs file");
+		SKIP_IF(1);
+	}
+
+	if (rfi_flush_orig != 0) {
+		if (write_debugfs_file("powerpc/rfi_flush", 0) < 0) {
+			perror("error writing to powerpc/rfi_flush debugfs file");
+			FAIL_IF(1);
+		}
+	}
+
+	entry_flush = entry_flush_orig;
+
+	fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1);
+	FAIL_IF(fd < 0);
+
+	/*
+	 * memalign(alignment, size): a zero_size byte buffer aligned to a
+	 * cache line. With the arguments the other way round only
+	 * CACHELINE_SIZE bytes are allocated while syscall_loop() reads
+	 * zero_size bytes.
+	 */
+	p = (char *)memalign(CACHELINE_SIZE, zero_size);
+	FAIL_IF(!p);
+
+	FAIL_IF(perf_event_enable(fd));
+
+	// disable L1 prefetching
+	set_dscr(1);
+
+	iter = repetitions;
+
+	/*
+	 * We expect to see l1d miss for each cacheline access when entry_flush
+	 * is set. Allow a small variation on this.
+	 */
+	l1d_misses_expected = iterations * (zero_size / CACHELINE_SIZE - 2);
+
+again:
+	FAIL_IF(perf_event_reset(fd));
+
+	syscall_loop(p, iterations, zero_size);
+
+	FAIL_IF(read(fd, &v, sizeof(v)) != sizeof(v));
+
+	if (entry_flush && v.l1d_misses >= l1d_misses_expected)
+		passes++;
+	else if (!entry_flush && v.l1d_misses < (l1d_misses_expected / 2))
+		passes++;
+
+	l1d_misses_total += v.l1d_misses;
+
+	if (--iter)
+		goto again;
+
+	if (passes < repetitions) {
+		printf("FAIL (L1D misses with entry_flush=%d: %llu %c %lu) [%d/%d failures]\n",
+		       entry_flush, l1d_misses_total, entry_flush ? '<' : '>',
+		       entry_flush ? repetitions * l1d_misses_expected :
+		       repetitions * l1d_misses_expected / 2,
+		       repetitions - passes, repetitions);
+		rc = 1;
+	} else {
+		printf("PASS (L1D misses with entry_flush=%d: %llu %c %lu) [%d/%d pass]\n",
+		       entry_flush, l1d_misses_total, entry_flush ? '>' : '<',
+		       entry_flush ? repetitions * l1d_misses_expected :
+		       repetitions * l1d_misses_expected / 2,
+		       passes, repetitions);
+	}
+
+	/* Second round: same measurement with entry_flush inverted. */
+	if (entry_flush == entry_flush_orig) {
+		entry_flush = !entry_flush_orig;
+		if (write_debugfs_file("powerpc/entry_flush", entry_flush) < 0) {
+			perror("error writing to powerpc/entry_flush debugfs file");
+			return 1;
+		}
+		iter = repetitions;
+		l1d_misses_total = 0;
+		passes = 0;
+		goto again;
+	}
+
+	perf_event_disable(fd);
+	close(fd);
+	free(p);
+
+	set_dscr(0);
+
+	if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) {
+		perror("unable to restore original value of powerpc/rfi_flush debugfs file");
+		return 1;
+	}
+
+	if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) {
+		perror("unable to restore original value of powerpc/entry_flush debugfs file");
+		return 1;
+	}
+
+	return rc;
+}
+
+/* Run entry_flush_test() through test_harness(); argc/argv are unused. */
+int main(int argc, char *argv[])
+{
+ return test_harness(entry_flush_test, "entry_flush_test");
+}
diff --git a/tools/testing/selftests/powerpc/security/flush_utils.c b/tools/testing/selftests/powerpc/security/flush_utils.c
new file mode 100644
index 000000000..0c3c4c40c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/flush_utils.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2018 IBM Corporation.
+ */
+
+#define __SANE_USERSPACE_TYPES__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "utils.h"
+#include "flush_utils.h"
+
+/*
+ * Perform one doubleword load ("ld") from @addr and return the value read.
+ * The load is issued through volatile inline asm.
+ */
+static inline __u64 load(void *addr)
+{
+	__u64 value;
+
+	asm volatile("ld %0,0(%1)" : "=r"(value) : "b"(addr));
+
+	return value;
+}
+
+/*
+ * Repeat @iterations times: load once from every CACHELINE_SIZE-strided
+ * offset within the first @zero_size bytes of @p, then call getppid().
+ */
+void syscall_loop(char *p, unsigned long iterations,
+		  unsigned long zero_size)
+{
+	unsigned long round, offset;
+
+	for (round = 0; round < iterations; round++) {
+		offset = 0;
+		while (offset < zero_size) {
+			load(p + offset);
+			offset += CACHELINE_SIZE;
+		}
+
+		getppid();
+	}
+}
+
+/*
+ * SIGILL handler installed by set_dscr().
+ *
+ * If the instruction at the saved NIA matches the "mtspr 3,RS" encoding
+ * (the DSCR write issued by set_dscr()), print a one-time warning and
+ * advance the saved NIA by one instruction (4 bytes) so execution resumes
+ * after it. Any other illegal instruction is reported and the process
+ * aborts.
+ *
+ * @signr:  signal number (unused)
+ * @info:   siginfo (unused)
+ * @unused: the ucontext_t of the interrupted context
+ */
+static void sigill_handler(int signr, siginfo_t *info, void *unused)
+{
+	static int warned;
+	ucontext_t *ctx = (ucontext_t *)unused;
+	unsigned long *pc = &UCONTEXT_NIA(ctx);
+
+	/* mtspr 3,RS to check for move to DSCR below */
+	if ((*((unsigned int *)*pc) & 0xfc1fffff) == 0x7c0303a6) {
+		if (!warned++)
+			printf("WARNING: Skipping over dscr setup. Consider running 'ppc64_cpu --dscr=1' manually.\n");
+		*pc += 4;
+	} else {
+		/*
+		 * Report the faulting instruction address (*pc), not the
+		 * address of the NIA slot inside the ucontext (pc).
+		 */
+		printf("SIGILL at %p\n", (void *)*pc);
+		abort();
+	}
+}
+
+/*
+ * Write @val to the DSCR with mtspr.
+ *
+ * On the first call a SIGILL handler (sigill_handler) is registered so that
+ * a faulting mtspr can be stepped over instead of killing the process. A
+ * failure to register the handler is reported with perror() and otherwise
+ * ignored.
+ */
+void set_dscr(unsigned long val)
+{
+	static int handler_installed;
+
+	if (handler_installed == 0) {
+		struct sigaction sa;
+
+		memset(&sa, 0, sizeof(sa));
+		sa.sa_flags = SA_SIGINFO;
+		sa.sa_sigaction = sigill_handler;
+		if (sigaction(SIGILL, &sa, NULL) != 0)
+			perror("sigill_handler");
+		handler_installed = 1;
+	}
+
+	asm volatile("mtspr %1,%0" : : "r" (val), "i" (SPRN_DSCR));
+}
diff --git a/tools/testing/selftests/powerpc/security/flush_utils.h b/tools/testing/selftests/powerpc/security/flush_utils.h
new file mode 100644
index 000000000..7a3d60292
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/flush_utils.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+
+/*
+ * Copyright 2018 IBM Corporation.
+ */
+
+#ifndef _SELFTESTS_POWERPC_SECURITY_FLUSH_UTILS_H
+#define _SELFTESTS_POWERPC_SECURITY_FLUSH_UTILS_H
+
+/*
+ * Stride, in bytes, between successive loads in syscall_loop().
+ * NOTE(review): presumably the L1D cache line size of the CPUs under test -
+ * confirm.
+ */
+#define CACHELINE_SIZE 128
+
+/*
+ * perf_event config value combining the L1D cache id, the READ op (shifted
+ * left by 8) and the MISS result (shifted left by 16). The flush tests pass
+ * it to perf_event_open_counter() together with PERF_TYPE_HW_CACHE.
+ */
+#define PERF_L1D_READ_MISS_CONFIG ((PERF_COUNT_HW_CACHE_L1D) | \
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) | \
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
+
+/*
+ * Run @iterations rounds of: one load per CACHELINE_SIZE bytes across the
+ * first @zero_size bytes of @p, followed by a getppid() call.
+ */
+void syscall_loop(char *p, unsigned long iterations,
+ unsigned long zero_size);
+
+/*
+ * Write @val to the DSCR; installs a SIGILL handler on first use so a
+ * faulting write is skipped with a warning.
+ */
+void set_dscr(unsigned long val);
+
+#endif /* _SELFTESTS_POWERPC_SECURITY_FLUSH_UTILS_H */
diff --git a/tools/testing/selftests/powerpc/security/rfi_flush.c b/tools/testing/selftests/powerpc/security/rfi_flush.c
new file mode 100644
index 000000000..f73484a64
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/rfi_flush.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2018 IBM Corporation.
+ */
+
+#define __SANE_USERSPACE_TYPES__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include "utils.h"
+#include "flush_utils.h"
+
+
+/*
+ * Check that the L1D read-miss count seen from userspace follows the
+ * powerpc/rfi_flush debugfs setting.
+ *
+ * The test runs two phases: first with rfi_flush at its original value, then
+ * with the inverse. In each phase syscall_loop() is run `repetitions` times
+ * over a zero_size-byte buffer while a perf counter records L1D read misses.
+ * With rfi_flush set every run must reach l1d_misses_expected; with it clear
+ * every run must stay below half of that.
+ *
+ * powerpc/entry_flush, when the file exists, is written to 0 up front so it
+ * does not contribute misses. On the normal exit path both debugfs files are
+ * restored to their original values.
+ *
+ * Skips when not running as root, when the CPU lacks PPC_FEATURE_ARCH_2_06,
+ * or when powerpc/rfi_flush cannot be read.
+ *
+ * Returns 0 when both phases pass, non-zero otherwise.
+ */
+int rfi_flush_test(void)
+{
+	char *p;
+	int repetitions = 10;
+	int fd, passes = 0, iter, rc = 0;
+	struct perf_event_read v;
+	__u64 l1d_misses_total = 0;
+	unsigned long iterations = 100000, zero_size = 24 * 1024;
+	unsigned long l1d_misses_expected;
+	int rfi_flush_orig, rfi_flush;
+	int have_entry_flush, entry_flush_orig;
+
+	SKIP_IF(geteuid() != 0);
+
+	// The PMU event we use only works on Power7 or later
+	SKIP_IF(!have_hwcap(PPC_FEATURE_ARCH_2_06));
+
+	if (read_debugfs_file("powerpc/rfi_flush", &rfi_flush_orig) < 0) {
+		perror("Unable to read powerpc/rfi_flush debugfs file");
+		SKIP_IF(1);
+	}
+
+	if (read_debugfs_file("powerpc/entry_flush", &entry_flush_orig) < 0) {
+		have_entry_flush = 0;
+	} else {
+		have_entry_flush = 1;
+
+		if (entry_flush_orig != 0) {
+			if (write_debugfs_file("powerpc/entry_flush", 0) < 0) {
+				perror("error writing to powerpc/entry_flush debugfs file");
+				return 1;
+			}
+		}
+	}
+
+	rfi_flush = rfi_flush_orig;
+
+	fd = perf_event_open_counter(PERF_TYPE_HW_CACHE, PERF_L1D_READ_MISS_CONFIG, -1);
+	FAIL_IF(fd < 0);
+
+	/*
+	 * memalign() takes (alignment, size). syscall_loop() reads zero_size
+	 * bytes from p, so the buffer must be zero_size bytes long and only
+	 * needs cache-line alignment.
+	 */
+	p = (char *)memalign(CACHELINE_SIZE, zero_size);
+	FAIL_IF(!p);
+
+	FAIL_IF(perf_event_enable(fd));
+
+	// disable L1 prefetching
+	set_dscr(1);
+
+	iter = repetitions;
+
+	/*
+	 * We expect to see l1d miss for each cacheline access when rfi_flush
+	 * is set. Allow a small variation on this.
+	 */
+	l1d_misses_expected = iterations * (zero_size / CACHELINE_SIZE - 2);
+
+again:
+	FAIL_IF(perf_event_reset(fd));
+
+	syscall_loop(p, iterations, zero_size);
+
+	FAIL_IF(read(fd, &v, sizeof(v)) != sizeof(v));
+
+	if (rfi_flush && v.l1d_misses >= l1d_misses_expected)
+		passes++;
+	else if (!rfi_flush && v.l1d_misses < (l1d_misses_expected / 2))
+		passes++;
+
+	l1d_misses_total += v.l1d_misses;
+
+	/* Repeat the measurement until `repetitions` runs have been made. */
+	while (--iter)
+		goto again;
+
+	if (passes < repetitions) {
+		printf("FAIL (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d failures]\n",
+		       rfi_flush, l1d_misses_total, rfi_flush ? '<' : '>',
+		       rfi_flush ? repetitions * l1d_misses_expected :
+		       repetitions * l1d_misses_expected / 2,
+		       repetitions - passes, repetitions);
+		rc = 1;
+	} else
+		printf("PASS (L1D misses with rfi_flush=%d: %llu %c %lu) [%d/%d pass]\n",
+		       rfi_flush, l1d_misses_total, rfi_flush ? '>' : '<',
+		       rfi_flush ? repetitions * l1d_misses_expected :
+		       repetitions * l1d_misses_expected / 2,
+		       passes, repetitions);
+
+	/* After the first phase, flip rfi_flush and measure again. */
+	if (rfi_flush == rfi_flush_orig) {
+		rfi_flush = !rfi_flush_orig;
+		if (write_debugfs_file("powerpc/rfi_flush", rfi_flush) < 0) {
+			perror("error writing to powerpc/rfi_flush debugfs file");
+			return 1;
+		}
+		iter = repetitions;
+		l1d_misses_total = 0;
+		passes = 0;
+		goto again;
+	}
+
+	perf_event_disable(fd);
+	close(fd);
+	free(p);
+
+	set_dscr(0);
+
+	if (write_debugfs_file("powerpc/rfi_flush", rfi_flush_orig) < 0) {
+		perror("unable to restore original value of powerpc/rfi_flush debugfs file");
+		return 1;
+	}
+
+	if (have_entry_flush) {
+		if (write_debugfs_file("powerpc/entry_flush", entry_flush_orig) < 0) {
+			perror("unable to restore original value of powerpc/entry_flush "
+			       "debugfs file");
+			return 1;
+		}
+	}
+
+	return rc;
+}
+
+/* Program entry: hand rfi_flush_test() to the selftest harness. */
+int main(int argc, char *argv[])
+{
+	int status;
+
+	status = test_harness(rfi_flush_test, "rfi_flush_test");
+
+	return status;
+}
diff --git a/tools/testing/selftests/powerpc/security/spectre_v2.c b/tools/testing/selftests/powerpc/security/spectre_v2.c
new file mode 100644
index 000000000..83647b827
--- /dev/null
+++ b/tools/testing/selftests/powerpc/security/spectre_v2.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2018-2019 IBM Corporation.
+ */
+
+#define __SANE_USERSPACE_TYPES__
+
+#include <sys/types.h>
+#include <stdint.h>
+#include <malloc.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/prctl.h>
+#include "utils.h"
+
+#include "../pmu/event.h"
+
+
+extern void pattern_cache_loop(void);
+extern void indirect_branch_loop(void);
+
+/*
+ * Run the branch workload with the perf events enabled and compute the
+ * percentage of mispredicted branches.
+ *
+ * @events:       opened events; [0]/[1] are the predicted/mispredicted
+ *                counts, and on P9 [2]/[3] are added to them respectively
+ * @is_p9:        selects pattern_cache_loop() instead of
+ *                indirect_branch_loop() and enables events [2] and [3]
+ * @miss_percent: out parameter, 100 * mispredictions / predictions
+ *
+ * Returns 0 on success. Fails (via FAIL_IF) when an event did not run for
+ * the whole time it was enabled, or when no predictions were counted.
+ */
+static int do_count_loop(struct event *events, bool is_p9, s64 *miss_percent)
+{
+	u64 pred, mpred;
+
+	prctl(PR_TASK_PERF_EVENTS_ENABLE);
+
+	if (is_p9)
+		pattern_cache_loop();
+	else
+		indirect_branch_loop();
+
+	prctl(PR_TASK_PERF_EVENTS_DISABLE);
+
+	event_read(&events[0]);
+	event_read(&events[1]);
+
+	// We could scale all the events by running/enabled but we're lazy
+	// As long as the PMU is uncontended they should all run
+	FAIL_IF(events[0].result.running != events[0].result.enabled);
+	FAIL_IF(events[1].result.running != events[1].result.enabled);
+
+	pred = events[0].result.value;
+	mpred = events[1].result.value;
+
+	if (is_p9) {
+		event_read(&events[2]);
+		event_read(&events[3]);
+		FAIL_IF(events[2].result.running != events[2].result.enabled);
+		FAIL_IF(events[3].result.running != events[3].result.enabled);
+
+		pred += events[2].result.value;
+		mpred += events[3].result.value;
+	}
+
+	// A zero prediction count would make the division below undefined
+	FAIL_IF(pred == 0);
+
+	*miss_percent = 100 * mpred / pred;
+
+	return 0;
+}
+
+/*
+ * Initialise event @e for @config under @name, created disabled and
+ * excluding kernel, hypervisor and idle time from the count.
+ */
+static void setup_event(struct event *e, u64 config, char *name)
+{
+	event_init_named(e, config, name);
+
+	/* Only count while running in userspace */
+	e->attr.exclude_idle = 1;
+	e->attr.exclude_hv = 1;
+	e->attr.exclude_kernel = 1;
+
+	/* Start stopped */
+	e->attr.disabled = 1;
+}
+
+/*
+ * Spectre v2 mitigation state, as parsed from the sysfs vulnerabilities file
+ * by get_sysfs_state().
+ */
+enum spectre_v2_state {
+ VULNERABLE = 0,
+ /*
+ * Also the result when a FAIL_IF() in get_sysfs_state() bails out;
+ * presumably FAIL_IF() returns 1 - confirm against utils.h.
+ */
+ UNKNOWN = 1, // Works with FAIL_IF()
+ NOT_AFFECTED,
+ BRANCH_SERIALISATION,
+ COUNT_CACHE_DISABLED,
+ COUNT_CACHE_FLUSH_SW,
+ COUNT_CACHE_FLUSH_HW,
+ BTB_FLUSH,
+};
+
+/*
+ * Read devices/system/cpu/vulnerabilities/spectre_v2 from sysfs, print it,
+ * and map its text to a spectre_v2_state.
+ *
+ * Returns UNKNOWN when the text matches none of the known strings. A read
+ * failure or an empty file bails out through FAIL_IF(), which the enum
+ * definition arranges to read as UNKNOWN as well.
+ */
+static enum spectre_v2_state get_sysfs_state(void)
+{
+	enum spectre_v2_state state = UNKNOWN;
+	char buf[256];
+	int len;
+
+	memset(buf, 0, sizeof(buf));
+	FAIL_IF(read_sysfs_file("devices/system/cpu/vulnerabilities/spectre_v2", buf, sizeof(buf)));
+
+	// Make sure it's NULL terminated
+	buf[sizeof(buf) - 1] = '\0';
+
+	// Trim the trailing newline, but only if there really is one
+	len = strlen(buf);
+	FAIL_IF(len < 1);
+	if (buf[len - 1] == '\n')
+		buf[len - 1] = '\0';
+
+	printf("sysfs reports: '%s'\n", buf);
+
+	// Order matters: the "hardware accelerated" string contains the plain
+	// "Software count cache flush" string, so it has to be tested first
+	if (strstr(buf, "Vulnerable"))
+		state = VULNERABLE;
+	else if (strstr(buf, "Not affected"))
+		state = NOT_AFFECTED;
+	else if (strstr(buf, "Indirect branch serialisation (kernel only)"))
+		state = BRANCH_SERIALISATION;
+	else if (strstr(buf, "Indirect branch cache disabled"))
+		state = COUNT_CACHE_DISABLED;
+	else if (strstr(buf, "Software count cache flush (hardware accelerated)"))
+		state = COUNT_CACHE_FLUSH_HW;
+	else if (strstr(buf, "Software count cache flush"))
+		state = COUNT_CACHE_FLUSH_SW;
+	else if (strstr(buf, "Branch predictor state flush"))
+		state = BTB_FLUSH;
+
+	return state;
+}
+
+/*
+ * Event config values handed to setup_event() by spectre_v2_test().
+ * NOTE(review): presumably raw PMU event codes for branch prediction and
+ * misprediction counts - confirm against the PMU documentation.
+ */
+#define PM_BR_PRED_CCACHE 0x040a4 // P8 + P9
+#define PM_BR_MPRED_CCACHE 0x040ac // P8 + P9
+#define PM_BR_PRED_PCACHE 0x048a0 // P9 only
+#define PM_BR_MPRED_PCACHE 0x048b0 // P9 only
+
+/* SPR number read with mfspr() in spectre_v2_test() to identify the CPU */
+#define SPRN_PVR 287
+
+/*
+ * Compare the measured userspace branch misprediction rate with the
+ * Spectre v2 mitigation that sysfs reports.
+ *
+ * The count cache prediction/misprediction events (plus the pattern cache
+ * events when the PVR identifies the CPU as P9) are counted around a branch
+ * workload, and the resulting miss percentage is checked against the range
+ * expected for the reported mitigation state.
+ *
+ * Returns 0 when the rate matches, 1 on a mismatch or an undecidable state,
+ * 4 (skip) when a state that should not affect userspace shows > 95%
+ * misses, and -1 when the sysfs state cannot be determined. Skips when the
+ * CPU lacks PPC_FEATURE2_ARCH_2_07.
+ */
+int spectre_v2_test(void)
+{
+	enum spectre_v2_state state;
+	struct event events[4];
+	s64 miss_percent;
+	bool is_p9;
+
+	// The PMU events we use only work on Power8 or later
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+
+	state = get_sysfs_state();
+	if (state == UNKNOWN) {
+		printf("Error: couldn't determine spectre_v2 mitigation state?\n");
+		return -1;
+	}
+
+	memset(events, 0, sizeof(events));
+
+	setup_event(&events[0], PM_BR_PRED_CCACHE, "PM_BR_PRED_CCACHE");
+	setup_event(&events[1], PM_BR_MPRED_CCACHE, "PM_BR_MPRED_CCACHE");
+	FAIL_IF(event_open(&events[0]));
+	FAIL_IF(event_open_with_group(&events[1], events[0].fd) == -1);
+
+	is_p9 = ((mfspr(SPRN_PVR) >> 16) & 0xFFFF) == 0x4e;
+
+	if (is_p9) {
+		// Count pattern cache too
+		setup_event(&events[2], PM_BR_PRED_PCACHE, "PM_BR_PRED_PCACHE");
+		setup_event(&events[3], PM_BR_MPRED_PCACHE, "PM_BR_MPRED_PCACHE");
+
+		FAIL_IF(event_open_with_group(&events[2], events[0].fd) == -1);
+		FAIL_IF(event_open_with_group(&events[3], events[0].fd) == -1);
+	}
+
+	FAIL_IF(do_count_loop(events, is_p9, &miss_percent));
+
+	event_report_justified(&events[0], 18, 10);
+	event_report_justified(&events[1], 18, 10);
+	event_close(&events[0]);
+	event_close(&events[1]);
+
+	if (is_p9) {
+		event_report_justified(&events[2], 18, 10);
+		event_report_justified(&events[3], 18, 10);
+		event_close(&events[2]);
+		event_close(&events[3]);
+	}
+
+	printf("Miss percent %lld %%\n", miss_percent);
+
+	switch (state) {
+	case VULNERABLE:
+	case NOT_AFFECTED:
+	case COUNT_CACHE_FLUSH_SW:
+	case COUNT_CACHE_FLUSH_HW:
+		// These should all not affect userspace branch prediction
+		if (miss_percent > 15) {
+			printf("Branch misses > 15%% unexpected in this configuration!\n");
+			printf("Possible mis-match between reported & actual mitigation\n");
+			/*
+			 * Such a mismatch may be caused by a guest system
+			 * reporting as vulnerable when the host is mitigated.
+			 * Return skip code to avoid detecting this as an error.
+			 * We are not vulnerable and reporting otherwise, so
+			 * missing such a mismatch is safe.
+			 */
+			if (miss_percent > 95)
+				return 4;
+
+			return 1;
+		}
+		break;
+	case BRANCH_SERIALISATION:
+		// This seems to affect userspace branch prediction a bit?
+		if (miss_percent > 25) {
+			printf("Branch misses > 25%% unexpected in this configuration!\n");
+			printf("Possible mis-match between reported & actual mitigation\n");
+			return 1;
+		}
+		break;
+	case COUNT_CACHE_DISABLED:
+		if (miss_percent < 95) {
+			// The message must quote the threshold that is actually tested
+			printf("Branch misses < 95%% unexpected in this configuration!\n");
+			printf("Possible mis-match between reported & actual mitigation\n");
+			return 1;
+		}
+		break;
+	case UNKNOWN:
+	case BTB_FLUSH:
+		printf("Not sure!\n");
+		return 1;
+	}
+
+	printf("OK - Measured branch prediction rates match reported spectre v2 mitigation.\n");
+
+	return 0;
+}
+
+/* Program entry: hand spectre_v2_test() to the selftest harness. */
+int main(int argc, char *argv[])
+{
+	int status;
+
+	status = test_harness(spectre_v2_test, "spectre_v2");
+
+	return status;
+}
diff --git a/tools/testing/selftests/powerpc/signal/.gitignore b/tools/testing/selftests/powerpc/signal/.gitignore
new file mode 100644
index 000000000..405b53640
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+signal
+signal_tm
+sigfuz
+sigreturn_vdso
+sig_sc_double_restart
diff --git a/tools/testing/selftests/powerpc/signal/Makefile b/tools/testing/selftests/powerpc/signal/Makefile
new file mode 100644
index 000000000..d6ae54663
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0
+# Test programs generated in this directory.
+TEST_GEN_PROGS := signal signal_tm sigfuz sigreturn_vdso sig_sc_double_restart
+
+# All programs get -maltivec; signal_tm additionally gets -mhtm and sigfuz
+# gets -pthread -m64.
+CFLAGS += -maltivec
+$(OUTPUT)/signal_tm: CFLAGS += -mhtm
+$(OUTPUT)/sigfuz: CFLAGS += -pthread -m64
+
+# Extra file listed alongside the tests (the per-directory settings file).
+TEST_FILES := settings
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# Every program also depends on the shared harness, the utils and signal.S.
+$(TEST_GEN_PROGS): ../harness.c ../utils.c signal.S
diff --git a/tools/testing/selftests/powerpc/signal/settings b/tools/testing/selftests/powerpc/signal/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c
new file mode 100644
index 000000000..e39722646
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sig_sc_double_restart.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Test that a syscall does not get restarted twice, handled by trap_norestart()
+ *
+ * Based on Al's description, and a test for the bug fixed in this commit:
+ *
+ * commit 9a81c16b527528ad307843be5571111aa8d35a80
+ * Author: Al Viro <viro@zeniv.linux.org.uk>
+ * Date: Mon Sep 20 21:48:57 2010 +0100
+ *
+ * powerpc: fix double syscall restarts
+ *
+ * Make sigreturn zero regs->trap, make do_signal() do the same on all
+ * paths. As it is, signal interrupting e.g. read() from fd 512 (==
+ * ERESTARTSYS) with another signal getting unblocked when the first
+ * handler finishes will lead to restart one insn earlier than it ought
+ * to. Same for multiple signals with in-kernel handlers interrupting
+ * that sucker at the same time. Same for multiple signals of any kind
+ * interrupting that sucker on 64bit...
+ */
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <signal.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "utils.h"
+
+/*
+ * SIGUSR1 handler: send SIGUSR2 to this same process.
+ *
+ * SIGUSR2 is blocked until this handler exits, at which point it will be
+ * raised again and think there is a restart to be done because the pending
+ * restarted syscall has 512 (ERESTARTSYS) in r3. The second restart will
+ * retreat NIP another 4 bytes to the fail case branch.
+ */
+static void SIGUSR1_handler(int sig)
+{
+	pid_t self = getpid();
+
+	kill(self, SIGUSR2);
+}
+
+/*
+ * SIGUSR2 handler: intentionally empty. It only needs to exist so that
+ * SIGUSR2 is delivered to a handler rather than taking its default action.
+ */
+static void SIGUSR2_handler(int sig)
+{
+}
+
+/*
+ * Issue the read system call directly with inline asm, with a trap placed
+ * just before the "sc" instruction.
+ *
+ * The asm starts with "b 0f" (jump to the sc) followed by "b 1f", so the
+ * instruction immediately preceding the sc is a branch to the failure path
+ * at label 1. If execution ever resumes one instruction before the sc, that
+ * branch is taken and -ENOANO is loaded into the result register.
+ *
+ * Returns the value left in r3 when it is non-negative; otherwise stores the
+ * negated value in errno and returns -1. errno == ENOANO therefore means the
+ * failure path at label 1 was executed.
+ */
+static ssize_t raw_read(int fd, void *buf, size_t count)
+{
+ /* Pin the syscall number and arguments to the registers used by "sc" */
+ register long nr asm("r0") = __NR_read;
+ register long _fd asm("r3") = fd;
+ register void *_buf asm("r4") = buf;
+ register size_t _count asm("r5") = count;
+
+ /*
+ * After the sc: "bns 2f" skips the negation when the summary-overflow
+ * bit is clear; otherwise r3 is negated so the result is negative.
+ * NOTE(review): presumably the kernel flags a syscall error via cr0.SO
+ * with a positive errno in r3 - confirm against the powerpc syscall ABI.
+ */
+ asm volatile(
+" b 0f \n"
+" b 1f \n"
+" 0: sc 0 \n"
+" bns 2f \n"
+" neg %0,%0 \n"
+" b 2f \n"
+" 1: \n"
+" li %0,%4 \n"
+" 2: \n"
+ : "+r"(_fd), "+r"(nr), "+r"(_buf), "+r"(_count)
+ : "i"(-ENOANO)
+ : "memory", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "ctr", "cr0");
+
+ /* Convert a negative result into the usual errno / -1 convention */
+ if (_fd < 0) {
+ errno = -_fd;
+ _fd = -1;
+ }
+
+ return _fd;
+}
+
+/* Payload written through the pipe, and its length including the NUL */
+#define DATA "test 123"
+#define DLEN (strlen(DATA)+1)
+
+/*
+ * Fork a child that blocks in raw_read() on file descriptor 512 of a pipe,
+ * interrupt it with SIGUSR1 (whose handler queues SIGUSR2), then write DATA
+ * to the pipe and check that the child read it back intact.
+ *
+ * The child exits with failure when raw_read() reports ENOANO, i.e. when
+ * the restart landed on the branch placed before the sc instruction.
+ *
+ * Returns 0 on success; the parent fails when the child did not exit
+ * normally with EXIT_SUCCESS.
+ */
+int test_restart(void)
+{
+ int pipefd[2];
+ pid_t pid;
+ char buf[512];
+
+ if (pipe(pipefd) == -1) {
+ perror("pipe");
+ exit(EXIT_FAILURE);
+ }
+
+ pid = fork();
+ if (pid == -1) {
+ perror("fork");
+ exit(EXIT_FAILURE);
+ }
+
+ if (pid == 0) { /* Child reads from pipe */
+ struct sigaction act;
+ int fd;
+
+ /* SIGUSR1: restartable, with SIGUSR2 blocked while the handler runs */
+ memset(&act, 0, sizeof(act));
+ sigaddset(&act.sa_mask, SIGUSR2);
+ act.sa_handler = SIGUSR1_handler;
+ act.sa_flags = SA_RESTART;
+ if (sigaction(SIGUSR1, &act, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ /* SIGUSR2: restartable, no-op handler */
+ memset(&act, 0, sizeof(act));
+ act.sa_handler = SIGUSR2_handler;
+ act.sa_flags = SA_RESTART;
+ if (sigaction(SIGUSR2, &act, NULL) == -1) {
+ perror("sigaction");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Let's get ERESTARTSYS into r3 */
+ /* Keep duplicating the read end until the new descriptor is 512 */
+ while ((fd = dup(pipefd[0])) != 512) {
+ if (fd == -1) {
+ perror("dup");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ if (raw_read(fd, buf, 512) == -1) {
+ if (errno == ENOANO) {
+ fprintf(stderr, "Double restart moved restart before sc instruction.\n");
+ _exit(EXIT_FAILURE);
+ }
+ perror("read");
+ exit(EXIT_FAILURE);
+ }
+
+ if (strncmp(buf, DATA, DLEN)) {
+ fprintf(stderr, "bad test string %s\n", buf);
+ exit(EXIT_FAILURE);
+ }
+
+ return 0;
+
+ } else {
+ int wstatus;
+
+ usleep(100000); /* Hack to get reader waiting */
+ kill(pid, SIGUSR1);
+ /* Give the child time to run its handlers before data arrives */
+ usleep(100000);
+ if (write(pipefd[1], DATA, DLEN) != DLEN) {
+ perror("write");
+ exit(EXIT_FAILURE);
+ }
+ close(pipefd[0]);
+ close(pipefd[1]);
+ if (wait(&wstatus) == -1) {
+ perror("wait");
+ exit(EXIT_FAILURE);
+ }
+ if (!WIFEXITED(wstatus)) {
+ fprintf(stderr, "child exited abnormally\n");
+ exit(EXIT_FAILURE);
+ }
+
+ FAIL_IF(WEXITSTATUS(wstatus) != EXIT_SUCCESS);
+
+ return 0;
+ }
+}
+
+/* Program entry: run test_restart() under the harness with a 10 timeout. */
+int main(void)
+{
+	int status;
+
+	test_harness_set_timeout(10);
+	status = test_harness(test_restart, "sig sys restart");
+
+	return status;
+}
diff --git a/tools/testing/selftests/powerpc/signal/sigfuz.c b/tools/testing/selftests/powerpc/signal/sigfuz.c
new file mode 100644
index 000000000..08f9afe3b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sigfuz.c
@@ -0,0 +1,325 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018, Breno Leitao, IBM Corp.
+ * Licensed under GPLv2.
+ *
+ * Sigfuz(tm): A PowerPC TM-aware signal fuzzer.
+ *
+ * This is a new selftest that raises SIGUSR1 signals and handles it in a set
+ * of different ways, trying to create different scenario for testing
+ * purpose.
+ *
+ * This test works raising a signal and calling sigreturn interleaved with
+ * TM operations, as starting, suspending and terminating a transaction. The
+ * test depends on random numbers, and, based on them, it sets different TM
+ * states.
+ *
+ * Other than that, the test fills out the user context struct that is passed
+ * to the sigreturn system call with random data, in order to make sure that
+ * the signal handler syscall can handle different and invalid states
+ * properly.
+ *
+ * This selftest has command line parameters to control what kind of tests the
+ * user wants to run, as for example, if a transaction should be started prior
+ * to signal being raised, or, after the signal being raised and before the
+ * sigreturn. If no parameter is given, the default is enabling all options.
+ *
+ * This test does not check if the user context is being read and set
+ * properly by the kernel. Its purpose, at this time, is basically
+ * guaranteeing that the kernel does not crash on invalid scenarios.
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include "utils.h"
+
+/* Selftest defaults */
+#define COUNT_MAX 600 /* Default number of fork/raise iterations per thread */
+#define THREADS 16 /* Default number of threads */
+
+/* Arguments options (bit flags stored in 'args') */
+#define ARG_MESS_WITH_TM_AT 0x1 /* mess_with_tm() in the child before raise() */
+#define ARG_MESS_WITH_TM_BEFORE 0x2 /* mess_with_tm() at the end of the signal handler */
+#define ARG_MESS_WITH_MSR_AT 0x4 /* set MSR TS bits in the contexts inside the handler */
+#define ARG_FOREVER 0x10 /* never advance the iteration counter */
+/* Every fuzzing option; used when none is selected on the command line */
+#define ARG_COMPLETE (ARG_MESS_WITH_TM_AT | \
+ ARG_MESS_WITH_TM_BEFORE | \
+ ARG_MESS_WITH_MSR_AT)
+
+/* Run-time configuration, filled in from the command line by main() */
+static int args;
+static int nthread = THREADS;
+static int count_max = COUNT_MAX;
+
+/*
+ * checkpoint context: buffer installed as uc_link by the signal handler.
+ * A single pointer shared by all sigfuz_test() threads; no locking is
+ * visible in this file.
+ */
+static ucontext_t *tmp_uc;
+
+/*
+ * Draw one rand() value and report whether it is a multiple of @x:
+ * returns 1 with roughly 1/x probability, 0 otherwise.
+ */
+static int one_in_chance(int x)
+{
+	int draw = rand();
+
+	return (draw % x) ? 0 : 1;
+}
+
+/*
+ * Change TM states at random: possibly begin a transaction, possibly
+ * suspend it, and independently possibly issue a tend.
+ */
+static void mess_with_tm(void)
+{
+ /* Starts a transaction 33% of the time */
+ if (one_in_chance(3)) {
+ /* tbegin. followed by a beq that is taken when tbegin. sets cr0.EQ */
+ asm ("tbegin. ;"
+ "beq 8 ;");
+
+ /* And suspended half of them */
+ if (one_in_chance(2))
+ asm("tsuspend. ;");
+ }
+
+ /* Call 'tend' in 5% of the runs */
+ if (one_in_chance(20))
+ asm("tend. ;");
+}
+
+/*
+ * Signal handler that will be invoked with raise().
+ *
+ * Fuzzes the user context that is consumed when the handler returns:
+ * installs and randomly fills uc_link, optionally sets MSR TS bits in both
+ * the current and the linked context, overwrites a set of gp_regs entries
+ * with random values, and optionally changes the TM state before returning.
+ *
+ * @signo: signal number (unused)
+ * @si:    siginfo (unused)
+ * @uc:    the ucontext_t of the interrupted context
+ */
+static void trap_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ ucontext_t *ucp = uc;
+
+ ucp->uc_link = tmp_uc;
+
+ /*
+ * Set uc_link in three possible ways:
+ * - Setting a single 'int' in the whole chunk
+ * - Cloning ucp into uc_link
+ * - Allocating a new memory chunk
+ * When none of the branches is taken, uc_link keeps pointing at the
+ * current tmp_uc contents.
+ */
+ if (one_in_chance(3)) {
+ memset(ucp->uc_link, rand(), sizeof(ucontext_t));
+ } else if (one_in_chance(2)) {
+ memcpy(ucp->uc_link, uc, sizeof(ucontext_t));
+ } else if (one_in_chance(2)) {
+ if (tmp_uc) {
+ free(tmp_uc);
+ tmp_uc = NULL;
+ }
+ tmp_uc = malloc(sizeof(ucontext_t));
+ ucp->uc_link = tmp_uc;
+ /* Trying to cause a major page fault at Kernel level */
+ madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED);
+ }
+
+ if (args & ARG_MESS_WITH_MSR_AT) {
+ /* Changing the checkpointed registers */
+ if (one_in_chance(4)) {
+ ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S;
+ } else {
+ if (one_in_chance(2)) {
+ ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |=
+ MSR_TS_T;
+ } else if (one_in_chance(2)) {
+ ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] |=
+ MSR_TS_T | MSR_TS_S;
+ }
+ }
+
+ /* Checking the current register context */
+ if (one_in_chance(2)) {
+ ucp->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S;
+ } else if (one_in_chance(2)) {
+ if (one_in_chance(2))
+ ucp->uc_mcontext.gp_regs[PT_MSR] |=
+ MSR_TS_T;
+ else if (one_in_chance(2))
+ ucp->uc_mcontext.gp_regs[PT_MSR] |=
+ MSR_TS_T | MSR_TS_S;
+ }
+ }
+
+ /* In 1 of 20 runs, stop here and leave the remaining registers alone */
+ if (one_in_chance(20)) {
+ /* Nested transaction start */
+ if (one_in_chance(5))
+ mess_with_tm();
+
+ /* Return without changing any other context info */
+ return;
+ }
+
+ /* MSR and NIP of both contexts are each randomised 1 time in 10 */
+ if (one_in_chance(10))
+ ucp->uc_mcontext.gp_regs[PT_MSR] = random();
+ if (one_in_chance(10))
+ ucp->uc_mcontext.gp_regs[PT_NIP] = random();
+ if (one_in_chance(10))
+ ucp->uc_link->uc_mcontext.gp_regs[PT_MSR] = random();
+ if (one_in_chance(10))
+ ucp->uc_link->uc_mcontext.gp_regs[PT_NIP] = random();
+
+ /* The remaining listed registers of the current context: always random */
+ ucp->uc_mcontext.gp_regs[PT_TRAP] = random();
+ ucp->uc_mcontext.gp_regs[PT_DSISR] = random();
+ ucp->uc_mcontext.gp_regs[PT_DAR] = random();
+ ucp->uc_mcontext.gp_regs[PT_ORIG_R3] = random();
+ ucp->uc_mcontext.gp_regs[PT_XER] = random();
+ ucp->uc_mcontext.gp_regs[PT_RESULT] = random();
+ ucp->uc_mcontext.gp_regs[PT_SOFTE] = random();
+ ucp->uc_mcontext.gp_regs[PT_DSCR] = random();
+ ucp->uc_mcontext.gp_regs[PT_CTR] = random();
+ ucp->uc_mcontext.gp_regs[PT_LNK] = random();
+ ucp->uc_mcontext.gp_regs[PT_CCR] = random();
+ ucp->uc_mcontext.gp_regs[PT_REGS_COUNT] = random();
+
+ /* Same set of registers in the linked (checkpoint) context */
+ ucp->uc_link->uc_mcontext.gp_regs[PT_TRAP] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_DSISR] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_DAR] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_ORIG_R3] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_XER] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_RESULT] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_SOFTE] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_DSCR] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_CTR] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_LNK] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_CCR] = random();
+ ucp->uc_link->uc_mcontext.gp_regs[PT_REGS_COUNT] = random();
+
+ /* Optionally change the TM state right before the handler returns */
+ if (args & ARG_MESS_WITH_TM_BEFORE) {
+ if (one_in_chance(2))
+ mess_with_tm();
+ }
+}
+
+/*
+ * SIGSEGV handler: a process that segfaults after its context was fuzzed
+ * simply terminates with a clean (zero) exit status.
+ */
+static void seg_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+	exit(EXIT_SUCCESS);
+}
+
+/*
+ * Thread body: install the fuzzing SIGUSR1 handler and a SIGSEGV handler,
+ * then fork count_max children (or loop forever with ARG_FOREVER). Each
+ * child seeds rand(), optionally changes the TM state, raises SIGUSR1 and
+ * exits; the parent waits for it before forking the next one.
+ *
+ * @thrid: unused
+ *
+ * Always returns NULL.
+ */
+static void *sigfuz_test(void *thrid)
+{
+	struct sigaction trap_sa, seg_sa;
+	int ret, i = 0;
+	pid_t t;
+
+	tmp_uc = malloc(sizeof(ucontext_t));
+
+	/*
+	 * Start from fully initialised structures: sigaction() reads sa_mask
+	 * as well, so it must not be left as stack garbage.
+	 */
+	memset(&trap_sa, 0, sizeof(trap_sa));
+	memset(&seg_sa, 0, sizeof(seg_sa));
+	sigemptyset(&trap_sa.sa_mask);
+	sigemptyset(&seg_sa.sa_mask);
+
+	/* Main signal handler */
+	trap_sa.sa_flags = SA_SIGINFO;
+	trap_sa.sa_sigaction = trap_signal_handler;
+
+	/* SIGSEGV signal handler */
+	seg_sa.sa_flags = SA_SIGINFO;
+	seg_sa.sa_sigaction = seg_signal_handler;
+
+	/* The signal handler will enable MSR_TS */
+	sigaction(SIGUSR1, &trap_sa, NULL);
+
+	/* If it does not crash, it will segfault, avoid it to retest */
+	sigaction(SIGSEGV, &seg_sa, NULL);
+
+	while (i < count_max) {
+		t = fork();
+
+		if (t == 0) {
+			/* One seed per process */
+			srand(time(NULL) + getpid());
+			if (args & ARG_MESS_WITH_TM_AT) {
+				if (one_in_chance(2))
+					mess_with_tm();
+			}
+			raise(SIGUSR1);
+			exit(0);
+		} else {
+			waitpid(t, &ret, 0);
+		}
+		/* With ARG_FOREVER the counter never advances */
+		if (!(args & ARG_FOREVER))
+			i++;
+	}
+
+	/* If not freed already, free now */
+	if (tmp_uc) {
+		free(tmp_uc);
+		tmp_uc = NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Harness entry point: start nthread threads running sigfuz_test() and
+ * wait for all of them.
+ *
+ * Only threads that were actually created are joined. pthread_create() and
+ * pthread_join() return the error number instead of setting errno, so
+ * failures are reported with strerror() on that return value.
+ *
+ * Returns EXIT_SUCCESS when every thread was created and joined, and
+ * EXIT_FAILURE otherwise.
+ */
+static int signal_fuzzer(void)
+{
+	int t, rc, created = 0, status = EXIT_SUCCESS;
+	pthread_t *threads;
+
+	threads = malloc(nthread * sizeof(pthread_t));
+	if (!threads && nthread > 0) {
+		perror("malloc");
+		return EXIT_FAILURE;
+	}
+
+	for (t = 0; t < nthread; t++) {
+		/* sigfuz_test() ignores its argument */
+		rc = pthread_create(&threads[created], NULL, sigfuz_test, NULL);
+		if (rc) {
+			fprintf(stderr, "Thread creation error: %s\n",
+				strerror(rc));
+			status = EXIT_FAILURE;
+			continue;
+		}
+		created++;
+	}
+
+	for (t = 0; t < created; t++) {
+		rc = pthread_join(threads[t], NULL);
+		if (rc) {
+			fprintf(stderr, "Thread join error: %s\n",
+				strerror(rc));
+			status = EXIT_FAILURE;
+		}
+	}
+
+	free(threads);
+
+	return status;
+}
+
+/* Print the usage text for program @name and terminate with exit(-1). */
+static void show_help(char *name)
+{
+	printf("%s: Sigfuzzer for powerpc\n", name);
+	fputs("Usage:\n"
+	      "\t-b\t Mess with TM before raising a SIGUSR1 signal\n"
+	      "\t-a\t Mess with TM after raising a SIGUSR1 signal\n"
+	      "\t-m\t Mess with MSR[TS] bits at mcontext\n"
+	      "\t-x\t Mess with everything above\n"
+	      "\t-f\t Run forever (Press ^C to Quit)\n", stdout);
+	printf("\t-i\t Amount of interactions. (Default = %d)\n", COUNT_MAX);
+	printf("\t-t\t Amount of threads. (Default = %d)\n", THREADS);
+	exit(-1);
+}
+
+/*
+ * Parse the command line into args / nthread / count_max and run the
+ * fuzzer under the selftest harness.
+ *
+ * When no fuzzing option is given, all of them (ARG_COMPLETE) are enabled.
+ * Unrecognised options are ignored.
+ *
+ * Returns the harness result so that a failing run is visible in the exit
+ * status.
+ */
+int main(int argc, char **argv)
+{
+	int opt;
+
+	while ((opt = getopt(argc, argv, "bamxt:fi:h")) != -1) {
+		switch (opt) {
+		case 'b':
+			printf("Mess with TM before signal\n");
+			args |= ARG_MESS_WITH_TM_BEFORE;
+			break;
+		case 'a':
+			printf("Mess with TM at signal handler\n");
+			args |= ARG_MESS_WITH_TM_AT;
+			break;
+		case 'm':
+			printf("Mess with MSR[TS] bits in mcontext\n");
+			args |= ARG_MESS_WITH_MSR_AT;
+			break;
+		case 'x':
+			printf("Running with all options enabled\n");
+			args |= ARG_COMPLETE;
+			break;
+		case 't':
+			nthread = atoi(optarg);
+			printf("Threads = %d\n", nthread);
+			break;
+		case 'f':
+			args |= ARG_FOREVER;
+			printf("Press ^C to stop\n");
+			test_harness_set_timeout(-1);
+			break;
+		case 'i':
+			count_max = atoi(optarg);
+			printf("Running for %d interactions\n", count_max);
+			break;
+		case 'h':
+			show_help(argv[0]);
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Default test suite */
+	if (!args)
+		args = ARG_COMPLETE;
+
+	return test_harness(signal_fuzzer, "signal_fuzzer");
+}
diff --git a/tools/testing/selftests/powerpc/signal/signal.S b/tools/testing/selftests/powerpc/signal/signal.S
new file mode 100644
index 000000000..228fba499
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/signal.S
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+
+/*
+ * long signal_self(pid_t pid, int sig);
+ *
+ * Issue the kill system call (number 37) with the incoming pid/sig
+ * arguments left untouched in r3/r4, and return whatever ends up in r3.
+ */
+FUNC_START(signal_self)
+ li r0,37 /* sys_kill */
+ /* r3 already has our pid in it */
+ /* r4 already has signal type in it */
+ sc
+ /*
+ * Skip the fix-up when CR bit 3 is clear.
+ * NOTE(review): presumably cr0.SO, i.e. the syscall reported no error -
+ * confirm against the powerpc syscall ABI.
+ */
+ bc 4,3,1f
+ subfze r3,r3
+1: blr
+FUNC_END(signal_self)
+
+/*
+ * long tm_signal_self(pid_t pid, int sig, long *ret);
+ *
+ * Start a transaction, suspend it, issue the kill system call (number 37)
+ * with the incoming pid/sig, and store the syscall result as a doubleword
+ * through ret. The transaction is then aborted and resumed.
+ *
+ * Returns 0 through the failure handler at label 1 (the expected path), or
+ * 1 if execution unexpectedly falls through after tresume.
+ */
+FUNC_START(tm_signal_self)
+ PUSH_BASIC_STACK(8)
+ std r5,STACK_FRAME_PARAM(0)(sp) /* ret */
+ tbegin.
+ beq 1f
+ tsuspend.
+ li r0,37 /* sys_kill */
+ /* r3 already has our pid in it */
+ /* r4 already has signal type in it */
+ sc
+ /* Reload ret from the stack slot saved above */
+ ld r5,STACK_FRAME_PARAM(0)(sp) /* ret */
+ /* Same CR bit 3 test and fix-up as in signal_self */
+ bc 4,3,2f
+ subfze r3,r3
+2: std r3,0(r5)
+ tabort. 0
+ tresume. /* Be nice to some cleanup, jumps back to tbegin then to 1: */
+ /*
+ * Transaction should be proper doomed and we should never get
+ * here
+ */
+ li r3,1
+ POP_BASIC_STACK(8)
+ blr
+1: li r3,0
+ POP_BASIC_STACK(8)
+ blr
+FUNC_END(tm_signal_self)
diff --git a/tools/testing/selftests/powerpc/signal/signal.c b/tools/testing/selftests/powerpc/signal/signal.c
new file mode 100644
index 000000000..766e484d9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/signal.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Sending one self a signal should always get delivered.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+
+#define MAX_ATTEMPT 500000
+#define TIMEOUT 5
+
+extern long signal_self(pid_t pid, int sig);
+
+static sig_atomic_t signaled;
+static sig_atomic_t fail;
+
+/*
+ * Shared handler for SIGUSR1 and SIGALRM: SIGUSR1 sets 'signaled', any
+ * other signal sets 'fail'.
+ */
+static void signal_handler(int sig)
+{
+	if (sig != SIGUSR1) {
+		fail = 1;
+		return;
+	}
+
+	signaled = 1;
+}
+
+/*
+ * Check that SIGUSR1 sent to this process is always delivered.
+ *
+ * Phase 1 (1000 rounds): fork a child that signals the parent with
+ * signal_self(); the parent spins until the handler sets 'signaled' or a
+ * 2 second alarm sets 'fail'.
+ * Phase 2 (MAX_ATTEMPT rounds): the process signals itself with
+ * signal_self() under a TIMEOUT second alarm and spins the same way.
+ *
+ * Returns 0 on success; fails via FAIL_IF() when a signal was not seen or
+ * signal_self() returned non-zero.
+ */
+static int test_signal()
+{
+	int i;
+	struct sigaction act;
+	pid_t ppid = getpid();
+	pid_t pid;
+
+	act.sa_handler = signal_handler;
+	act.sa_flags = 0;
+	sigemptyset(&act.sa_mask);
+	if (sigaction(SIGUSR1, &act, NULL) < 0) {
+		perror("sigaction SIGUSR1");
+		exit(1);
+	}
+	if (sigaction(SIGALRM, &act, NULL) < 0) {
+		perror("sigaction SIGALRM");
+		exit(1);
+	}
+
+	/* Don't do this for MAX_ATTEMPT, its simply too long */
+	for (i = 0; i < 1000; i++) {
+		/*
+		 * Clear the flag before the child exists: a stale 1 from the
+		 * previous round would satisfy the wait below immediately, and
+		 * clearing it after fork() could lose the child's signal.
+		 */
+		signaled = 0;
+		pid = fork();
+		if (pid == -1) {
+			perror("fork");
+			exit(1);
+		}
+		if (pid == 0) {
+			signal_self(ppid, SIGUSR1);
+			exit(1);
+		} else {
+			alarm(0); /* Disable any pending */
+			alarm(2);
+			/* The empty asm forces 'signaled' and 'fail' to be re-read */
+			while (!signaled && !fail)
+				asm volatile("": : :"memory");
+			if (!signaled) {
+				fprintf(stderr, "Didn't get signal from child\n");
+				FAIL_IF(1); /* For the line number */
+			}
+			/* Otherwise we'll loop too fast and fork() will eventually fail */
+			waitpid(pid, NULL, 0);
+		}
+	}
+
+	for (i = 0; i < MAX_ATTEMPT; i++) {
+		long rc;
+
+		alarm(0); /* Disable any pending */
+		signaled = 0;
+		alarm(TIMEOUT);
+		rc = signal_self(ppid, SIGUSR1);
+		if (rc) {
+			fprintf(stderr, "(%d) Fail reason: %d rc=0x%lx\n",
+				i, fail, rc);
+			FAIL_IF(1); /* For the line number */
+		}
+		while (!signaled && !fail)
+			asm volatile("": : :"memory");
+		if (!signaled) {
+			fprintf(stderr, "(%d) Fail reason: %d rc=0x%lx\n",
+				i, fail, rc);
+			FAIL_IF(1); /* For the line number */
+		}
+	}
+
+	return 0;
+}
+
+/* Program entry: run test_signal() under the harness with a 300 timeout. */
+int main(void)
+{
+	int status;
+
+	test_harness_set_timeout(300);
+	status = test_harness(test_signal, "signal");
+
+	return status;
+}
diff --git a/tools/testing/selftests/powerpc/signal/signal_tm.c b/tools/testing/selftests/powerpc/signal/signal_tm.c
new file mode 100644
index 000000000..5bf2224ef
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/signal_tm.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Sending oneself a signal should always result in the signal being delivered.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+#include "../tm/tm.h"
+
+#define MAX_ATTEMPT 500000
+#define TIMEOUT 10
+
+extern long tm_signal_self(pid_t pid, int sig, long *ret);
+
+static sig_atomic_t signaled;
+static sig_atomic_t fail;
+
+/*
+ * Signal handler for the TM test. tcheck_active() returning true inside the
+ * handler is recorded as failure code 2; otherwise SIGUSR1 marks the signal
+ * as delivered and any other signal is recorded as failure code 1.
+ */
+static void signal_handler(int sig)
+{
+	if (tcheck_active())
+		fail = 2;
+	else if (sig == SIGUSR1)
+		signaled = 1;
+	else
+		fail = 1;
+}
+
+/*
+ * Repeatedly send SIGUSR1 to this process through tm_signal_self() and check
+ * that the handler sees it each time.
+ *
+ * Returns 0 once MAX_ATTEMPT iterations have run without a detected failure.
+ */
+static int test_signal_tm()
+{
+ int i;
+ struct sigaction act;
+
+ /* The same handler is installed for SIGUSR1 and for the SIGALRM timeout */
+ act.sa_handler = signal_handler;
+ act.sa_flags = 0;
+ sigemptyset(&act.sa_mask);
+ if (sigaction(SIGUSR1, &act, NULL) < 0) {
+ perror("sigaction SIGUSR1");
+ exit(1);
+ }
+ if (sigaction(SIGALRM, &act, NULL) < 0) {
+ perror("sigaction SIGALRM");
+ exit(1);
+ }
+
+ SKIP_IF(!have_htm());
+
+ for (i = 0; i < MAX_ATTEMPT; i++) {
+ /*
+ * If anything bad happens in ASM and we fail to set ret
+ * because *handwave* TM this will cause failure
+ */
+ long ret = 0xdead;
+ long rc = 0xbeef;
+
+ alarm(0); /* Disable any pending */
+ signaled = 0;
+ alarm(TIMEOUT);
+ FAIL_IF(tcheck_transactional());
+ rc = tm_signal_self(getpid(), SIGUSR1, &ret);
+ if (ret == 0xdead)
+ /*
+ * This basically means the transaction aborted before we
+ * even got to the suspend... this is crazy but it
+ * happens.
+ * Yes this also means we might never make forward
+ * progress... the alarm() will trip eventually...
+ */
+ /*
+ * NOTE(review): the next iteration starts with alarm(0), which
+ * cancels the alarm armed above, and i still advances, so a run
+ * in which every attempt takes this path ends after MAX_ATTEMPT
+ * iterations and returns 0 - confirm that is intended.
+ */
+ continue;
+
+ if (rc || ret) {
+ /* Ret is actually an errno */
+ printf("TEXASR 0x%016lx, TFIAR 0x%016lx\n",
+ __builtin_get_texasr(), __builtin_get_tfiar());
+ fprintf(stderr, "(%d) Fail reason: %d rc=0x%lx ret=0x%lx\n",
+ i, fail, rc, ret);
+ /* Only a non-zero ret fails here; rc alone falls through to the wait below */
+ FAIL_IF(ret);
+ }
+ /* Spin until the handler sets one of the flags; the asm is a compiler barrier */
+ while(!signaled && !fail)
+ asm volatile("": : :"memory");
+ if (!signaled) {
+ fprintf(stderr, "(%d) Fail reason: %d rc=0x%lx ret=0x%lx\n",
+ i, fail, rc, ret);
+ FAIL_IF(fail); /* For the line number */
+ }
+ }
+
+ return 0;
+}
+
+/* Run test_signal_tm() under the selftest harness. */
+int main(void)
+{
+	int rc = test_harness(test_signal_tm, "signal_tm");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c b/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c
new file mode 100644
index 000000000..e282fff0f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/signal/sigreturn_vdso.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test that we can take signals with and without the VDSO mapped, which trigger
+ * different paths in the signal handling code.
+ *
+ * See handle_rt_signal64() and setup_trampoline() in signal_64.c
+ */
+
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <stdio.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+// Ensure assert() is not compiled out
+#undef NDEBUG
+#include <assert.h>
+
+#include "utils.h"
+
+/*
+ * Find the first mapping in /proc/self/maps whose name contains @needle.
+ *
+ * On a match the first and last byte address of the mapping are stored in
+ * @low and @high and 0 is returned. Returns -1 if the file cannot be
+ * opened, a line cannot be parsed, or no mapping matches.
+ */
+static int search_proc_maps(char *needle, unsigned long *low, unsigned long *high)
+{
+	unsigned long start, end;
+	static char buf[4096];
+	char name[128];
+	int result = -1;
+	int fields;
+	FILE *f;
+
+	f = fopen("/proc/self/maps", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(buf, sizeof(buf), f)) {
+		/*
+		 * The device field is "major:minor" in hexadecimal, so it is
+		 * skipped with %*x; %*d would stop at a digit such as 'f'.
+		 */
+		fields = sscanf(buf, "%lx-%lx %*c%*c%*c%*c %*x %*x:%*x %*d %127s\n",
+				&start, &end, name);
+
+		/* Only the address range converted: mapping has no name */
+		if (fields == 2)
+			continue;
+
+		if (fields != 3) {
+			printf("sscanf errored\n");
+			break;
+		}
+
+		if (strstr(name, needle)) {
+			*low = start;
+			*high = end - 1;
+			result = 0;
+			break;
+		}
+	}
+
+	fclose(f);
+
+	/* Still -1 unless a mapping matched, never a leftover sscanf() count */
+	return result;
+}
+
+static volatile sig_atomic_t took_signal = 0;
+
+/* Count each delivered signal so the test can check the running total. */
+static void sigusr1_handler(int sig)
+{
+	took_signal += 1;
+}
+
+/*
+ * Deliver SIGUSR1 to ourselves three times: with the VDSO where it was
+ * originally mapped, after moving it with mremap(), and after unmapping it.
+ *
+ * Setup and delivery are checked with assert() (NDEBUG is undefined in this
+ * file). Returns 0 when all three signals were taken.
+ */
+int test_sigreturn_vdso(void)
+{
+	unsigned long low, high, size;
+	struct sigaction act;
+	char *p;
+
+	act.sa_handler = sigusr1_handler;
+	act.sa_flags = 0;
+	sigemptyset(&act.sa_mask);
+
+	assert(sigaction(SIGUSR1, &act, NULL) == 0);
+
+	// Confirm the VDSO is mapped, and work out where it is
+	assert(search_proc_maps("[vdso]", &low, &high) == 0);
+	size = high - low + 1;
+	printf("VDSO is at 0x%lx-0x%lx (%lu bytes)\n", low, high, size);
+
+	kill(getpid(), SIGUSR1);
+	assert(took_signal == 1);
+	printf("Signal delivered OK with VDSO mapped\n");
+
+	// Remap the VDSO somewhere else
+	p = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	assert(p != MAP_FAILED);
+	assert(mremap((void *)low, size, size, MREMAP_MAYMOVE|MREMAP_FIXED, p) != MAP_FAILED);
+	assert(search_proc_maps("[vdso]", &low, &high) == 0);
+	size = high - low + 1;
+	printf("VDSO moved to 0x%lx-0x%lx (%lu bytes)\n", low, high, size);
+
+	kill(getpid(), SIGUSR1);
+	assert(took_signal == 2);
+	printf("Signal delivered OK with VDSO moved\n");
+
+	assert(munmap((void *)low, size) == 0);
+	printf("Unmapped VDSO\n");
+
+	// Confirm the VDSO is not mapped anymore
+	assert(search_proc_maps("[vdso]", &low, &high) != 0);
+
+	// Make the stack executable
+	assert(search_proc_maps("[stack]", &low, &high) == 0);
+	size = high - low + 1;
+	// Report the outcome honestly, but keep going either way so the
+	// signal below is still attempted.
+	if (mprotect((void *)low, size, PROT_READ|PROT_WRITE|PROT_EXEC) == 0)
+		printf("Remapped the stack executable\n");
+	else
+		perror("mprotect");
+
+	kill(getpid(), SIGUSR1);
+	assert(took_signal == 3);
+	printf("Signal delivered OK with VDSO unmapped\n");
+
+	return 0;
+}
+
+/* Run test_sigreturn_vdso() under the selftest harness. */
+int main(void)
+{
+	int rc = test_harness(test_sigreturn_vdso, "sigreturn_vdso");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/.gitignore b/tools/testing/selftests/powerpc/stringloops/.gitignore
new file mode 100644
index 000000000..b0dfc74aa
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+memcmp_64
+memcmp_32
+strlen
+strlen_32
diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile
new file mode 100644
index 000000000..9c39f55a5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -0,0 +1,35 @@
+# SPDX-License-Identifier: GPL-2.0
+# memcmp_64 and strlen are always built; the *_32 variants are added only
+# when the compiler accepts -m32 (see build_32bit below).
+CFLAGS += -I$(CURDIR)
+
+EXTRA_SOURCES := ../harness.c
+
+# Evaluates to "1" when a trial compile of memcmp.c with -m32 succeeds.
+build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi)
+
+TEST_GEN_PROGS := memcmp_64 strlen
+
+$(OUTPUT)/memcmp_64: memcmp.c ../utils.c
+$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
+
+ifeq ($(build_32bit),1)
+$(OUTPUT)/memcmp_32: memcmp.c
+$(OUTPUT)/memcmp_32: CFLAGS += -m32
+
+TEST_GEN_PROGS += memcmp_32
+endif
+
+# This strlen binary links the C implementation from string.c.
+$(OUTPUT)/strlen: strlen.c string.c
+
+ifeq ($(build_32bit),1)
+$(OUTPUT)/strlen_32: strlen.c
+$(OUTPUT)/strlen_32: CFLAGS += -m32
+
+TEST_GEN_PROGS += strlen_32
+endif
+
+# Assemble with the same flags as the C sources.
+ASFLAGS = $(CFLAGS)
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): $(EXTRA_SOURCES)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/cache.h b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
new file mode 100644
index 000000000..8a2840831
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
@@ -0,0 +1 @@
+/* NOTE(review): presumably read by the arch/powerpc/lib sources symlinked into this directory - confirm */
+#define IFETCH_ALIGN_BYTES 4
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/export.h b/tools/testing/selftests/powerpc/stringloops/asm/export.h
new file mode 100644
index 000000000..2d14a9b42
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/export.h
@@ -0,0 +1 @@
+/* Expands to nothing, so any EXPORT_SYMBOL() use compiles away in this build. */
+#define EXPORT_SYMBOL(x)
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
new file mode 100644
index 000000000..3edd1a1d9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2009 Freescale Semiconductor, Inc.
+ *
+ * provides masks and opcode images for use by code generation, emulation
+ * and for instructions that older assemblers might not know about
+ */
+#ifndef _ASM_POWERPC_PPC_OPCODE_H
+#define _ASM_POWERPC_PPC_OPCODE_H
+
+
+/* Both helpers pass their argument through unchanged in this build. */
+# define stringify_in_c(...) __VA_ARGS__
+# define ASM_CONST(x) x
+
+
+/* Base opcode images for the two instructions encoded by hand below. */
+#define PPC_INST_VCMPEQUD_RC 0x100000c7
+#define PPC_INST_VCMPEQUB_RC 0x10000006
+
+/* Bit 10 of the instruction word, OR-ed into both encodings below. */
+#define __PPC_RC21 (0x1 << 10)
+
+/* macros to insert fields into opcodes */
+#define ___PPC_RA(a) (((a) & 0x1f) << 16)
+#define ___PPC_RB(b) (((b) & 0x1f) << 11)
+#define ___PPC_RS(s) (((s) & 0x1f) << 21)
+#define ___PPC_RT(t) ___PPC_RS(t)
+
+/* Emit the instruction as a raw .long built from the opcode and register fields. */
+#define VCMPEQUD_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUD_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#define VCMPEQUB_RC(vrt, vra, vrb) stringify_in_c(.long PPC_INST_VCMPEQUB_RC | \
+ ___PPC_RT(vrt) | ___PPC_RA(vra) | \
+ ___PPC_RB(vrb) | __PPC_RC21)
+
+#endif /* _ASM_POWERPC_PPC_OPCODE_H */
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
new file mode 100644
index 000000000..2b488b78c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/ppc_asm.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PPC_ASM_H
+#define _PPC_ASM_H
+#include <ppc-asm.h>
+
+#ifndef r1
+#define r1 sp
+#endif
+
+/*
+ * Functions declared with _GLOBAL()/_GLOBAL_TOC() get a test_ prefix, e.g.
+ * _GLOBAL(memcmp) defines test_memcmp.
+ */
+#define _GLOBAL(A) FUNC_START(test_ ## A)
+#define _GLOBAL_TOC(A) FUNC_START(test_ ## A)
+
+#define CONFIG_ALTIVEC
+
+/* Upper-case register names map straight onto the lower-case ones. */
+#define R14 r14
+#define R15 r15
+#define R16 r16
+#define R17 r17
+#define R18 r18
+#define R19 r19
+#define R20 r20
+#define R21 r21
+#define R22 r22
+#define R29 r29
+#define R30 r30
+#define R31 r31
+
+#define STACKFRAMESIZE 256
+/* Stack offset for register i: 8 bytes per register, r14 at offset 112. */
+#define STK_REG(i) (112 + ((i)-14)*8)
+
+/* The feature-section markers expand to nothing in this build. */
+#define BEGIN_FTR_SECTION
+#define END_FTR_SECTION_IFSET(val)
+#endif
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c b/tools/testing/selftests/powerpc/stringloops/memcmp.c
new file mode 100644
index 000000000..cb2f18855
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <time.h>
+
+#include "utils.h"
+
+#define SIZE 256
+#define ITERATIONS 10000
+
+#define LARGE_SIZE (5 * 1024)
+#define LARGE_ITERATIONS 1000
+#define LARGE_MAX_OFFSET 32
+#define LARGE_SIZE_START 4096
+
+/* This is big enough to fit LARGE_SIZE and works on 4K & 64K kernels */
+#define MAP_SIZE (64 * 1024)
+
+#define MAX_OFFSET_DIFF_S1_S2 48
+
+/* Number of enter_vmx_ops() calls not yet matched by exit_vmx_ops(). */
+int vmx_count;
+
+/* Record one more open VMX section; always returns 1. */
+int enter_vmx_ops(void)
+{
+	vmx_count += 1;
+
+	return 1;
+}
+
+/* Close one VMX section by decrementing the vmx_count counter. */
+void exit_vmx_ops(void)
+{
+	vmx_count -= 1;
+}
+int test_memcmp(const void *s1, const void *s2, size_t n);
+
+/*
+ * Compare test_memcmp() with the C library memcmp() for every offset in
+ * [0, max_offset) and every size in [size_start, max_size - offset).
+ *
+ * Only the sign of the result (negative, zero, positive) has to agree.
+ * Aborts on a mismatch, or when vmx_count is non-zero after a call.
+ */
+static void test_one(char *s1, char *s2, unsigned long max_offset,
+		     unsigned long size_start, unsigned long max_size)
+{
+	unsigned long offset, size;
+
+	for (offset = 0; offset < max_offset; offset++) {
+		for (size = size_start; size < (max_size - offset); size++) {
+			int x, y;
+			unsigned long i;
+
+			y = memcmp(s1+offset, s2+offset, size);
+			x = test_memcmp(s1+offset, s2+offset, size);
+
+			/*
+			 * Compare the sign class of both results. Testing only
+			 * (x ^ y) < 0 would let "0 vs. positive" slip through.
+			 */
+			if (((x > 0) - (x < 0)) != ((y > 0) - (y < 0))) {
+				printf("memcmp returned %d, should have returned %d (offset %lu size %lu)\n", x, y, offset, size);
+
+				/* Dump as unsigned bytes so the output does not depend on char signedness */
+				for (i = offset; i < offset+size; i++)
+					printf("%02x ", (unsigned char)s1[i]);
+				printf("\n");
+
+				for (i = offset; i < offset+size; i++)
+					printf("%02x ", (unsigned char)s2[i]);
+				printf("\n");
+				abort();
+			}
+
+			if (vmx_count != 0) {
+				printf("vmx enter/exit not paired.(offset:%lu size:%lu s1:%p s2:%p vc:%d)\n",
+					offset, size, s1, s2, vmx_count);
+				printf("\n");
+				abort();
+			}
+		}
+	}
+}
+
+/*
+ * Run test_one() on randomly filled buffers.
+ * @islarge: use the LARGE_* size and iteration count instead of SIZE and
+ *           ITERATIONS.
+ *
+ * Each buffer ends directly in front of an unmapped region so that reading
+ * past its end faults. The first pass differs in one byte, the second in
+ * up to comp_size / 8 bytes.
+ *
+ * Returns 0 on success; a failed mmap() is reported through FAIL_IF().
+ */
+static int testcase(bool islarge)
+{
+	unsigned long i, comp_size, alloc_size;
+	char *p, *s1, *s2;
+	int iterations;
+
+	comp_size = (islarge ? LARGE_SIZE : SIZE);
+	alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+	iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+	p = mmap(NULL, 4 * MAP_SIZE, PROT_READ | PROT_WRITE,
+		 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	FAIL_IF(p == MAP_FAILED);
+
+	/* Put s1/s2 at the end of a page */
+	s1 = p + MAP_SIZE - alloc_size;
+	s2 = p + 3 * MAP_SIZE - alloc_size;
+
+	/* And unmap the subsequent page to force a fault if we overread */
+	munmap(p + MAP_SIZE, MAP_SIZE);
+	munmap(p + 3 * MAP_SIZE, MAP_SIZE);
+
+	srandom(time(0));
+
+	for (i = 0; i < iterations; i++) {
+		unsigned long j;
+		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
+
+		for (j = 0; j < alloc_size; j++)
+			s1[j] = random();
+
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
+
+		/* change one byte */
+		change = random() % comp_size;
+		rand_s2[change] = random() & 0xff;
+
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+				 LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
+	}
+
+	srandom(time(0));
+
+	for (i = 0; i < iterations; i++) {
+		unsigned long j;
+		unsigned long change;
+		char *rand_s1 = s1;
+		char *rand_s2 = s2;
+
+		for (j = 0; j < alloc_size; j++)
+			s1[j] = random();
+
+		rand_s1 += random() % MAX_OFFSET_DIFF_S1_S2;
+		rand_s2 += random() % MAX_OFFSET_DIFF_S1_S2;
+		memcpy(rand_s2, rand_s1, comp_size);
+
+		/*
+		 * change multiple bytes, 1/8 of total. Index the compared
+		 * window (rand_s2), as the single-byte pass above does, so
+		 * every change lands inside the region handed to test_one().
+		 */
+		for (j = 0; j < comp_size / 8; j++) {
+			change = random() % comp_size;
+			rand_s2[change] = random() & 0xff;
+		}
+
+		if (islarge)
+			test_one(rand_s1, rand_s2, LARGE_MAX_OFFSET,
+				 LARGE_SIZE_START, comp_size);
+		else
+			test_one(rand_s1, rand_s2, SIZE, 0, comp_size);
+	}
+
+	/* Release the two regions that are still mapped */
+	munmap(p, MAP_SIZE);
+	munmap(p + 2 * MAP_SIZE, MAP_SIZE);
+
+	return 0;
+}
+
+/*
+ * Run the small and the large memcmp test case.
+ *
+ * Returns 0 when both pass; a non-zero result from either one is reported
+ * through FAIL_IF() instead of being dropped.
+ */
+static int testcases(void)
+{
+#ifdef __powerpc64__
+	// vcmpequd used in memcmp_64.S is v2.07
+	SKIP_IF(!have_hwcap2(PPC_FEATURE2_ARCH_2_07));
+#endif
+
+	FAIL_IF(testcase(0));
+	FAIL_IF(testcase(1));
+	return 0;
+}
+
+/* Run testcases() under the selftest harness with a timeout value of 300. */
+int main(void)
+{
+	int rc;
+
+	test_harness_set_timeout(300);
+	rc = test_harness(testcases, "memcmp");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_32.S b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
new file mode 120000
index 000000000..056f2b3af
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_32.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_64.S b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
new file mode 120000
index 000000000..9bc87e438
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_64.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/stringloops/string.c b/tools/testing/selftests/powerpc/stringloops/string.c
new file mode 100644
index 000000000..45e777541
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/string.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copied from linux/lib/string.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include <stddef.h>
+
+/**
+ * test_strlen - Find the length of a string
+ * @s: The string to be sized
+ *
+ * Returns the number of characters in front of the terminating NUL.
+ */
+size_t test_strlen(const char *s)
+{
+	size_t len = 0;
+
+	while (s[len] != '\0')
+		len++;
+
+	return len;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen.c b/tools/testing/selftests/powerpc/stringloops/strlen.c
new file mode 100644
index 000000000..9055ebc48
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <malloc.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "utils.h"
+
+#define SIZE 256
+#define ITERATIONS 1000
+#define ITERATIONS_BENCH 100000
+
+int test_strlen(const void *s);
+
+/*
+ * Compare test_strlen() with the C library strlen() for every start offset
+ * in [0, SIZE) of @s. A mismatch dumps the rest of the buffer and aborts,
+ * so the failure is visible in the test result and not only in the output.
+ */
+static void test_one(char *s)
+{
+	unsigned long offset;
+
+	for (offset = 0; offset < SIZE; offset++) {
+		int x, y;
+		unsigned long i;
+
+		y = strlen(s + offset);
+		x = test_strlen(s + offset);
+
+		if (x != y) {
+			printf("strlen() returned %d, should have returned %d (%p offset %lu)\n", x, y, s, offset);
+
+			/* Dump as unsigned bytes so the output does not depend on char signedness */
+			for (i = offset; i < SIZE; i++)
+				printf("%02x ", (unsigned char)s[i]);
+			printf("\n");
+			abort();
+		}
+	}
+}
+
+/*
+ * Time ITERATIONS_BENCH calls of test_strlen() on @s with CLOCK_MONOTONIC
+ * and print the string length together with the elapsed seconds.
+ */
+static void bench_test(char *s)
+{
+	struct timespec ts_start, ts_end;
+	double elapsed;
+	int len;
+	int i;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+	for (i = 0; i < ITERATIONS_BENCH; i++)
+		test_strlen(s);
+
+	clock_gettime(CLOCK_MONOTONIC, &ts_end);
+
+	len = test_strlen(s);
+	elapsed = ts_end.tv_sec - ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9;
+
+	printf("len %3.3d : time = %.6f\n", len, elapsed);
+}
+
+/*
+ * Check test_strlen() on a 128-byte aligned buffer: while the string grows
+ * one byte at a time, with the terminator at each of the last sizeof(long)
+ * positions of random strings, and finally as a small benchmark.
+ *
+ * Returns 0; exits with status 1 if the buffer cannot be allocated.
+ */
+static int testcase(void)
+{
+	char *s;
+	unsigned long i;
+
+	/*
+	 * One byte more than SIZE: s[SIZE] stays NUL for the whole test, so
+	 * strlen() is still terminated when all SIZE bytes are non-zero.
+	 */
+	s = memalign(128, SIZE + 1);
+	if (!s) {
+		perror("memalign");
+		exit(1);
+	}
+
+	srandom(1);
+
+	memset(s, 0, SIZE + 1);
+	for (i = 0; i < SIZE; i++) {
+		char c;
+
+		do {
+			c = random() & 0x7f;
+		} while (!c);
+		s[i] = c;
+		test_one(s);
+	}
+
+	for (i = 0; i < ITERATIONS; i++) {
+		unsigned long j;
+
+		for (j = 0; j < SIZE; j++) {
+			char c;
+
+			do {
+				c = random() & 0x7f;
+			} while (!c);
+			s[j] = c;
+		}
+		for (j = 0; j < sizeof(long); j++) {
+			s[SIZE - 1 - j] = 0;
+			test_one(s);
+		}
+	}
+
+	for (i = 0; i < SIZE; i++) {
+		char c;
+
+		do {
+			c = random() & 0x7f;
+		} while (!c);
+		s[i] = c;
+	}
+
+	/* Full-length string first, then progressively shorter ones */
+	bench_test(s);
+
+	s[16] = 0;
+	bench_test(s);
+
+	s[8] = 0;
+	bench_test(s);
+
+	s[4] = 0;
+	bench_test(s);
+
+	s[3] = 0;
+	bench_test(s);
+
+	s[2] = 0;
+	bench_test(s);
+
+	s[1] = 0;
+	bench_test(s);
+
+	free(s);
+
+	return 0;
+}
+
+/* Run testcase() under the selftest harness. */
+int main(void)
+{
+	int rc = test_harness(testcase, "strlen");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen_32.S b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
new file mode 120000
index 000000000..72b13731b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/strlen_32.S \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/switch_endian/.gitignore b/tools/testing/selftests/powerpc/switch_endian/.gitignore
new file mode 100644
index 000000000..30e962cf8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+switch_endian_test
+check-reversed.S
diff --git a/tools/testing/selftests/powerpc/switch_endian/Makefile b/tools/testing/selftests/powerpc/switch_endian/Makefile
new file mode 100644
index 000000000..bdc081afe
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/Makefile
@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := switch_endian_test
+
+ASFLAGS += -O2 -Wall -g -nostdlib -m64
+
+EXTRA_CLEAN = $(OUTPUT)/*.o $(OUTPUT)/check-reversed.S
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# check-reversed.S is generated into $(OUTPUT), so add that to the include path.
+$(OUTPUT)/switch_endian_test: ASFLAGS += -I $(OUTPUT)
+$(OUTPUT)/switch_endian_test: $(OUTPUT)/check-reversed.S
+
+# Extract .text from check.o as a raw binary with every 4-byte group byte-reversed.
+$(OUTPUT)/check-reversed.o: $(OUTPUT)/check.o
+ $(CROSS_COMPILE)objcopy -j .text --reverse-bytes=4 -O binary $< $@
+
+# Turn that binary into assembler source, one .byte directive per byte.
+$(OUTPUT)/check-reversed.S: $(OUTPUT)/check-reversed.o
+ hexdump -v -e '/1 ".byte 0x%02X\n"' $< > $@
diff --git a/tools/testing/selftests/powerpc/switch_endian/check.S b/tools/testing/selftests/powerpc/switch_endian/check.S
new file mode 100644
index 000000000..927a5c675
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/check.S
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include "common.h"
+
+/*
+ * Checks that registers contain what we expect, ie. they were not clobbered by
+ * the syscall.
+ *
+ * r15: pattern to check registers against.
+ *
+ * At the end r3 == 0 if everything's OK.
+ */
+/*
+ * Each check loads the expected value into r9 and branches to label 1 on a
+ * mismatch (the CR check sets r9 to pattern + 34 first). Label 1 copies r9
+ * into r3 and makes the exit syscall. When every check passes, control
+ * reaches label 2, which loads the switch_endian syscall number into r0 and
+ * falls through to whatever follows this file in the including source.
+ */
+ nop # guaranteed to be illegal in reverse-endian
+ mr r9,r15
+ cmpd r9,r3 # check r3
+ bne 1f
+ addi r9,r15,4 # check r4
+ cmpd r9,r4
+ bne 1f
+ lis r9,0x00FF # check CR
+ ori r9,r9,0xF000
+ mfcr r10
+ and r10,r10,r9
+ cmpw r9,r10
+ addi r9,r15,34
+ bne 1f
+ addi r9,r15,32 # check LR
+ mflr r10
+ cmpd r9,r10
+ bne 1f
+ addi r9,r15,5 # check r5
+ cmpd r9,r5
+ bne 1f
+ addi r9,r15,6 # check r6
+ cmpd r9,r6
+ bne 1f
+ addi r9,r15,7 # check r7
+ cmpd r9,r7
+ bne 1f
+ addi r9,r15,8 # check r8
+ cmpd r9,r8
+ bne 1f
+ addi r9,r15,13 # check r13
+ cmpd r9,r13
+ bne 1f
+ addi r9,r15,14 # check r14
+ cmpd r9,r14
+ bne 1f
+ addi r9,r15,16 # check r16
+ cmpd r9,r16
+ bne 1f
+ addi r9,r15,17 # check r17
+ cmpd r9,r17
+ bne 1f
+ addi r9,r15,18 # check r18
+ cmpd r9,r18
+ bne 1f
+ addi r9,r15,19 # check r19
+ cmpd r9,r19
+ bne 1f
+ addi r9,r15,20 # check r20
+ cmpd r9,r20
+ bne 1f
+ addi r9,r15,21 # check r21
+ cmpd r9,r21
+ bne 1f
+ addi r9,r15,22 # check r22
+ cmpd r9,r22
+ bne 1f
+ addi r9,r15,23 # check r23
+ cmpd r9,r23
+ bne 1f
+ addi r9,r15,24 # check r24
+ cmpd r9,r24
+ bne 1f
+ addi r9,r15,25 # check r25
+ cmpd r9,r25
+ bne 1f
+ addi r9,r15,26 # check r26
+ cmpd r9,r26
+ bne 1f
+ addi r9,r15,27 # check r27
+ cmpd r9,r27
+ bne 1f
+ addi r9,r15,28 # check r28
+ cmpd r9,r28
+ bne 1f
+ addi r9,r15,29 # check r29
+ cmpd r9,r29
+ bne 1f
+ addi r9,r15,30 # check r30
+ cmpd r9,r30
+ bne 1f
+ addi r9,r15,31 # check r31
+ cmpd r9,r31
+ bne 1f
+ b 2f
+1: mr r3, r9
+ li r0, __NR_exit
+ sc
+2: li r0, __NR_switch_endian
+ nop
diff --git a/tools/testing/selftests/powerpc/switch_endian/common.h b/tools/testing/selftests/powerpc/switch_endian/common.h
new file mode 100644
index 000000000..1434cbc2a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/common.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <ppc-asm.h>
+#include <asm/unistd.h>
+
+/* Fallback syscall number, used only when the included headers do not define it. */
+#ifndef __NR_switch_endian
+#define __NR_switch_endian 363
+#endif
diff --git a/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S b/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S
new file mode 100644
index 000000000..7887f78cf
--- /dev/null
+++ b/tools/testing/selftests/powerpc/switch_endian/switch_endian_test.S
@@ -0,0 +1,97 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include "common.h"
+
+/*
+ * Fill the registers with values derived from a known pattern, make the
+ * switch_endian syscall, run the register checks in the switched mode
+ * (check-reversed.S), switch back, run them again (check.S), then write a
+ * success or failure message to stdout and exit with status 0 or 1.
+ */
+
+ .data
+ .balign 8
+success_message:
+ .ascii "success: switch_endian_test\n\0"
+
+ .balign 8
+failure_message:
+ .ascii "failure: switch_endian_test\n\0"
+
+ .section ".toc"
+ .balign 8
+pattern:
+ .8byte 0x5555AAAA5555AAAA
+
+ .text
+FUNC_START(_start)
+ /* Load the pattern */
+ ld r15, pattern@TOC(%r2)
+
+ /* Setup CR, only CR2-CR4 are maintained */
+ lis r3, 0x00FF
+ ori r3, r3, 0xF000
+ mtcr r3
+
+ /* Load the pattern slightly modified into the registers */
+ mr r3, r15
+ addi r4, r15, 4
+
+ /* LR gets pattern + 32; r5 is only a scratch register here and is reloaded below */
+ addi r5, r15, 32
+ mtlr r5
+
+ addi r5, r15, 5
+ addi r6, r15, 6
+ addi r7, r15, 7
+ addi r8, r15, 8
+
+ /* r9 - r12 are clobbered */
+
+ addi r13, r15, 13
+ addi r14, r15, 14
+
+ /* Skip r15 we're using it */
+
+ addi r16, r15, 16
+ addi r17, r15, 17
+ addi r18, r15, 18
+ addi r19, r15, 19
+ addi r20, r15, 20
+ addi r21, r15, 21
+ addi r22, r15, 22
+ addi r23, r15, 23
+ addi r24, r15, 24
+ addi r25, r15, 25
+ addi r26, r15, 26
+ addi r27, r15, 27
+ addi r28, r15, 28
+ addi r29, r15, 29
+ addi r30, r15, 30
+ addi r31, r15, 31
+
+ /*
+ * Call the syscall to switch endian.
+ * It clobbers r9-r12, XER, CTR and CR0-1,5-7.
+ */
+ li r0, __NR_switch_endian
+ sc
+
+ tdi 0, 0, 0x48 // b +8 if the endian was switched
+ b .Lfail // exit if endian didn't switch
+
+ /* Generated file: the byte-reversed image of check.S, emitted as .byte directives */
+#include "check-reversed.S"
+
+ /* Flip back, r0 already has the switch syscall number */
+ .long 0x02000044 /* sc */
+
+#include "check.S"
+
+ ld r4, success_message@got(%r2)
+ li r5, 28 // strlen(success_message)
+ li r14, 0 // exit status
+ /* Common exit path: write r5 bytes from r4 to stdout, then exit with the status in r14 */
+.Lout:
+ li r0, __NR_write
+ li r3, 1 /* stdout */
+ sc
+ li r0, __NR_exit
+ mr r3, r14
+ sc
+ b .
+
+.Lfail:
+ ld r4, failure_message@got(%r2)
+ li r5, 28 // strlen(failure_message)
+ li r14, 1
+ b .Lout
diff --git a/tools/testing/selftests/powerpc/syscalls/.gitignore b/tools/testing/selftests/powerpc/syscalls/.gitignore
new file mode 100644
index 000000000..b00cab225
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# One entry per binary listed in TEST_GEN_PROGS of this directory's Makefile
+ipc_unmuxed
+rtas_filter
diff --git a/tools/testing/selftests/powerpc/syscalls/Makefile b/tools/testing/selftests/powerpc/syscalls/Makefile
new file mode 100644
index 000000000..b63f8459c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+TEST_GEN_PROGS := ipc_unmuxed rtas_filter
+
+# Search the usr/include directory at the top of the source tree for headers.
+CFLAGS += -I../../../../../usr/include
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+# Every test program also depends on the shared harness source.
+$(TEST_GEN_PROGS): ../harness.c
diff --git a/tools/testing/selftests/powerpc/syscalls/ipc.h b/tools/testing/selftests/powerpc/syscalls/ipc.h
new file mode 100644
index 000000000..26a20682c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/ipc.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * List of IPC syscalls, one DO_TEST(name, number) entry each. The including
+ * file defines DO_TEST before every inclusion; an entry is left out when
+ * its __NR_* constant is not defined.
+ */
+#ifdef __NR_semop
+DO_TEST(semop, __NR_semop)
+#endif
+
+#ifdef __NR_semget
+DO_TEST(semget, __NR_semget)
+#endif
+
+#ifdef __NR_semctl
+DO_TEST(semctl, __NR_semctl)
+#endif
+
+#ifdef __NR_semtimedop
+DO_TEST(semtimedop, __NR_semtimedop)
+#endif
+
+#ifdef __NR_msgsnd
+DO_TEST(msgsnd, __NR_msgsnd)
+#endif
+
+#ifdef __NR_msgrcv
+DO_TEST(msgrcv, __NR_msgrcv)
+#endif
+
+#ifdef __NR_msgget
+DO_TEST(msgget, __NR_msgget)
+#endif
+
+#ifdef __NR_msgctl
+DO_TEST(msgctl, __NR_msgctl)
+#endif
+
+#ifdef __NR_shmat
+DO_TEST(shmat, __NR_shmat)
+#endif
+
+#ifdef __NR_shmdt
+DO_TEST(shmdt, __NR_shmdt)
+#endif
+
+#ifdef __NR_shmget
+DO_TEST(shmget, __NR_shmget)
+#endif
+
+#ifdef __NR_shmctl
+DO_TEST(shmctl, __NR_shmctl)
+#endif
diff --git a/tools/testing/selftests/powerpc/syscalls/ipc_unmuxed.c b/tools/testing/selftests/powerpc/syscalls/ipc_unmuxed.c
new file mode 100644
index 000000000..4c582524a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/ipc_unmuxed.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2015, Michael Ellerman, IBM Corp.
+ *
+ * This test simply tests that certain syscalls are implemented. It doesn't
+ * actually exercise their logic in any way.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include "utils.h"
+
+
+/*
+ * First expansion of the ipc.h list: define test_<name>() for every entry.
+ * Each function makes the syscall with -1 as the first argument and zeroes
+ * for the rest, prints the result, and returns non-zero only when errno is
+ * ENOSYS afterwards.
+ */
+#define DO_TEST(_name, _num) \
+static int test_##_name(void) \
+{ \
+ int rc; \
+ printf("Testing " #_name); \
+ errno = 0; \
+ rc = syscall(_num, -1, 0, 0, 0, 0, 0); \
+ printf("\treturned %d, errno %d\n", rc, errno); \
+ return errno == ENOSYS; \
+}
+
+#include "ipc.h"
+#undef DO_TEST
+
+/*
+ * Call every test_<name>() generated from ipc.h and count how many ran.
+ * Returns 0 when no call reported ENOSYS.
+ */
+static int ipc_unmuxed(void)
+{
+ int tests_done = 0;
+
+/* Second expansion of the ipc.h list: run each generated test and count it. */
+#define DO_TEST(_name, _num) \
+ FAIL_IF(test_##_name()); \
+ tests_done++;
+
+#include "ipc.h"
+#undef DO_TEST
+
+ /*
+ * If we ran no tests then it means none of the syscall numbers were
+ * defined, possibly because we were built against old headers. But it
+ * means we didn't really test anything, so instead of passing mark it
+ * as a skip to give the user a clue.
+ */
+ SKIP_IF(tests_done == 0);
+
+ return 0;
+}
+
+/* Run ipc_unmuxed() under the selftest harness. */
+int main(void)
+{
+	int rc = test_harness(ipc_unmuxed, "ipc_unmuxed");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/syscalls/rtas_filter.c b/tools/testing/selftests/powerpc/syscalls/rtas_filter.c
new file mode 100644
index 000000000..03b487f18
--- /dev/null
+++ b/tools/testing/selftests/powerpc/syscalls/rtas_filter.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright 2005-2020 IBM Corporation.
+ *
+ * Includes code from librtas (https://github.com/ibm-power-utilities/librtas/)
+ */
+
+#include <byteswap.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <errno.h>
+#include "utils.h"
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define cpu_to_be32(x) bswap_32(x)
+#define be32_to_cpu(x) bswap_32(x)
+#else
+#define cpu_to_be32(x) (x)
+#define be32_to_cpu(x) (x)
+#endif
+
+#define RTAS_IO_ASSERT -1098 /* Unexpected I/O Error */
+#define RTAS_UNKNOWN_OP -1099 /* No Firmware Implementation of Function */
+#define BLOCK_SIZE 4096
+#define PAGE_SIZE 4096
+#define MAX_PAGES 64
+
+static const char *ofdt_rtas_path = "/proc/device-tree/rtas";
+
+typedef __be32 uint32_t;
+struct rtas_args {
+ __be32 token;
+ __be32 nargs;
+ __be32 nret;
+ __be32 args[16];
+ __be32 *rets; /* Pointer to return values in args[]. */
+};
+
+struct region {
+ uint64_t addr;
+ uint32_t size;
+ struct region *next;
+};
+
+/*
+ * Read from @fd in BLOCK_SIZE chunks until a read returns fewer than
+ * BLOCK_SIZE bytes.
+ * @buf: receives a malloc()ed buffer holding the data followed by a NUL
+ *       byte; the caller frees it.
+ * @len: if not NULL, receives the number of data bytes (NUL not counted).
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be grown (*buf is
+ * NULL then) or -EIO if read() fails (*buf is left for the caller to free).
+ */
+int read_entire_file(int fd, char **buf, size_t *len)
+{
+	size_t buf_size = 0;
+	size_t off = 0;
+	char *grown;
+	int rc;
+
+	*buf = NULL;
+	do {
+		buf_size += BLOCK_SIZE;
+
+		/*
+		 * realloc(NULL, n) behaves like malloc(n). Keep the old
+		 * pointer until the call has succeeded so it is not leaked.
+		 */
+		grown = realloc(*buf, buf_size);
+		if (grown == NULL) {
+			free(*buf);
+			*buf = NULL;
+			return -ENOMEM;
+		}
+		*buf = grown;
+
+		rc = read(fd, *buf + off, BLOCK_SIZE);
+		if (rc < 0)
+			return -EIO;
+
+		off += rc;
+	} while (rc == BLOCK_SIZE);
+
+	/* The last read was short, so off < buf_size and the NUL fits */
+	(*buf)[off] = '\0';
+
+	if (len)
+		*len = off;
+
+	return 0;
+}
+
+/*
+ * Open "<prop_path>/<prop_name>" read-only.
+ * @fd: receives the descriptor returned by open().
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
+ * negated errno of the failed open().
+ */
+static int open_prop_file(const char *prop_path, const char *prop_name, int *fd)
+{
+	char *path;
+	int saved_errno;
+	int len;
+
+	/* allocate enough for two strings, a slash and the trailing NUL */
+	len = strlen(prop_path) + strlen(prop_name) + 1 + 1;
+	path = malloc(len);
+	if (path == NULL)
+		return -ENOMEM;
+
+	snprintf(path, len, "%s/%s", prop_path, prop_name);
+
+	*fd = open(path, O_RDONLY);
+	/* free() is allowed to change errno, so capture it first */
+	saved_errno = errno;
+	free(path);
+	if (*fd < 0)
+		return -saved_errno;
+
+	return 0;
+}
+
+/*
+ * Read the whole of "<prop_path>/<prop_name>" into a newly allocated buffer.
+ * @prop_val and @prop_len are handed to read_entire_file() unchanged.
+ *
+ * Returns 0 on success, otherwise the error from open_prop_file() or
+ * read_entire_file().
+ */
+static int get_property(const char *prop_path, const char *prop_name,
+			char **prop_val, size_t *prop_len)
+{
+	int fd;
+	int err = open_prop_file(prop_path, prop_name, &fd);
+
+	if (err != 0)
+		return err;
+
+	err = read_entire_file(fd, prop_val, prop_len);
+	close(fd);
+
+	return err;
+}
+
+/*
+ * Look up the token for @call_name: the first 32-bit big-endian value in
+ * the property of that name under ofdt_rtas_path.
+ *
+ * Returns the token, or RTAS_UNKNOWN_OP if the property cannot be read or
+ * holds fewer than four bytes.
+ */
+int rtas_token(const char *call_name)
+{
+	char *prop_buf = NULL;
+	uint32_t raw;
+	size_t len;
+	int rc;
+
+	rc = get_property(ofdt_rtas_path, call_name, &prop_buf, &len);
+	/* A short property must not be dereferenced as a full token */
+	if (rc < 0 || len < sizeof(raw)) {
+		rc = RTAS_UNKNOWN_OP;
+		goto err;
+	}
+
+	/* memcpy() instead of a pointer cast: no alignment or aliasing assumptions */
+	memcpy(&raw, prop_buf, sizeof(raw));
+	rc = be32_to_cpu(raw);
+
+err:
+	free(prop_buf);
+	return rc;
+}
+
+/*
+ * Read the address and size of the region from /proc/ppc64/rtas/rmo_buffer
+ * into @kregion.
+ *
+ * Returns 0 on success, RTAS_IO_ASSERT if the file cannot be opened or its
+ * content is not a plausible "<addr> <size>" pair, or the error from
+ * read_entire_file().
+ */
+static int read_kregion_bounds(struct region *kregion)
+{
+	char *buf;
+	int parsed;
+	int fd;
+	int rc;
+
+	fd = open("/proc/ppc64/rtas/rmo_buffer", O_RDONLY);
+	if (fd < 0) {
+		printf("Could not open rmo_buffer file\n");
+		return RTAS_IO_ASSERT;
+	}
+
+	rc = read_entire_file(fd, &buf, NULL);
+	close(fd);
+	if (rc) {
+		free(buf);
+		return rc;
+	}
+
+	parsed = sscanf(buf, "%" SCNx64 " %x", &kregion->addr, &kregion->size);
+	free(buf);
+
+	/* Both fields must convert, otherwise the checks below read garbage */
+	if (parsed != 2) {
+		printf("Unexpected kregion bounds\n");
+		return RTAS_IO_ASSERT;
+	}
+
+	if (!(kregion->size && kregion->addr) ||
+	    (kregion->size > (PAGE_SIZE * MAX_PAGES))) {
+		printf("Unexpected kregion bounds\n");
+		return RTAS_IO_ASSERT;
+	}
+
+	return 0;
+}
+
+/*
+ * Make the RTAS call @name through the rtas syscall.
+ * @nargs: number of 32-bit input arguments that follow @nrets.
+ * @nrets: number of __be32 * output pointers that follow the inputs.
+ *
+ * Input arguments are stored as passed, so callers convert them to
+ * big-endian themselves. Only the first return value is converted to CPU
+ * byte order; the others are copied as they are.
+ *
+ * Returns 0 on success, RTAS_UNKNOWN_OP if the call has no token, -E2BIG if
+ * @nargs and @nrets do not fit the argument buffer, or the negated errno
+ * of the syscall.
+ */
+static int rtas_call(const char *name, int nargs,
+		     int nrets, ...)
+{
+	struct rtas_args args;
+	__be32 *rets[16];
+	int i, rc, token;
+	va_list ap;
+
+	va_start(ap, nrets);
+
+	/* args.args[] holds the inputs followed by the outputs: 16 slots in total */
+	if (nargs < 0 || nrets < 0 || nargs + nrets > 16) {
+		rc = -E2BIG;
+		goto err;
+	}
+
+	token = rtas_token(name);
+	if (token == RTAS_UNKNOWN_OP) {
+		// We don't care if the call doesn't exist
+		printf("call '%s' not available, skipping...", name);
+		rc = RTAS_UNKNOWN_OP;
+		goto err;
+	}
+
+	args.token = cpu_to_be32(token);
+	args.nargs = cpu_to_be32(nargs);
+	args.nret = cpu_to_be32(nrets);
+
+	/*
+	 * Fetch each argument with the type the callers actually pass:
+	 * 32-bit integers for the inputs, pointers for the outputs.
+	 */
+	for (i = 0; i < nargs; i++)
+		args.args[i] = (__be32) va_arg(ap, uint32_t);
+
+	for (i = 0; i < nrets; i++)
+		rets[i] = va_arg(ap, __be32 *);
+
+	rc = syscall(__NR_rtas, &args);
+	if (rc) {
+		rc = -errno;
+		goto err;
+	}
+
+	if (nrets) {
+		*(rets[0]) = be32_to_cpu(args.args[nargs]);
+
+		for (i = 1; i < nrets; i++) {
+			*(rets[i]) = args.args[nargs + i];
+		}
+	}
+
+err:
+	va_end(ap);
+	return rc;
+}
+
+/*
+ * Make a series of RTAS calls and check the result of each against the
+ * outcome named in the comment above it. A call that is not available
+ * (RTAS_UNKNOWN_OP) is accepted in every case.
+ *
+ * Returns 0 when all checks pass, and also when the RMO bounds cannot be
+ * read, in which case the buffer cases are not run.
+ */
+static int test(void)
+{
+ struct region rmo_region;
+ uint32_t rmo_start;
+ uint32_t rmo_end;
+ __be32 rets[1];
+ int rc;
+
+ // Test a legitimate harmless call
+ // Expected: call succeeds
+ printf("Test a permitted call, no parameters... ");
+ rc = rtas_call("get-time-of-day", 0, 1, rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != 0 && rc != RTAS_UNKNOWN_OP);
+
+ // Test a prohibited call
+ // Expected: call returns -EINVAL
+ printf("Test a prohibited call... ");
+ rc = rtas_call("nvram-fetch", 0, 1, rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != -EINVAL && rc != RTAS_UNKNOWN_OP);
+
+ // Get RMO
+ rc = read_kregion_bounds(&rmo_region);
+ if (rc) {
+ printf("Couldn't read RMO region bounds, skipping remaining cases\n");
+ return 0;
+ }
+ // addr is 64 bits wide in struct region; only the low 32 bits are kept here
+ rmo_start = rmo_region.addr;
+ // rmo_end is the last byte inside the region
+ rmo_end = rmo_start + rmo_region.size - 1;
+ printf("RMO range: %08x - %08x\n", rmo_start, rmo_end);
+
+ // Test a permitted call, user-supplied size, buffer inside RMO
+ // Expected: call succeeds
+ printf("Test a permitted call, user-supplied size, buffer inside RMO... ");
+ rc = rtas_call("ibm,get-system-parameter", 3, 1, 0, cpu_to_be32(rmo_start),
+ cpu_to_be32(rmo_end - rmo_start + 1), rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != 0 && rc != RTAS_UNKNOWN_OP);
+
+ // Test a permitted call, user-supplied size, buffer start outside RMO
+ // Expected: call returns -EINVAL
+ printf("Test a permitted call, user-supplied size, buffer start outside RMO... ");
+ rc = rtas_call("ibm,get-system-parameter", 3, 1, 0, cpu_to_be32(rmo_end + 1),
+ cpu_to_be32(4000), rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != -EINVAL && rc != RTAS_UNKNOWN_OP);
+
+ // Test a permitted call, user-supplied size, buffer end outside RMO
+ // Expected: call returns -EINVAL
+ printf("Test a permitted call, user-supplied size, buffer end outside RMO... ");
+ rc = rtas_call("ibm,get-system-parameter", 3, 1, 0, cpu_to_be32(rmo_start),
+ cpu_to_be32(rmo_end - rmo_start + 2), rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != -EINVAL && rc != RTAS_UNKNOWN_OP);
+
+ // Test a permitted call, fixed size, buffer end outside RMO
+ // Expected: call returns -EINVAL
+ printf("Test a permitted call, fixed size, buffer end outside RMO... ");
+ rc = rtas_call("ibm,configure-connector", 2, 1, cpu_to_be32(rmo_end - 4000), 0, rets);
+ printf("rc: %d\n", rc);
+ FAIL_IF(rc != -EINVAL && rc != RTAS_UNKNOWN_OP);
+
+ return 0;
+}
+
+/* Run test() under the selftest harness; the command line is not used. */
+int main(int argc, char *argv[])
+{
+	int rc = test_harness(test, "rtas_filter");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/tm/.gitignore b/tools/testing/selftests/powerpc/tm/.gitignore
new file mode 100644
index 000000000..d8900a0c4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+tm-resched-dscr
+tm-syscall
+tm-signal-msr-resv
+tm-signal-stack
+tm-vmxcopy
+tm-fork
+tm-tar
+tm-tmspr
+tm-exec
+tm-signal-context-chk-fpu
+tm-signal-context-chk-gpr
+tm-signal-context-chk-vmx
+tm-signal-context-chk-vsx
+tm-signal-context-force-tm
+tm-signal-sigreturn-nt
+tm-signal-pagefault
+tm-vmx-unavail
+tm-unavailable
+tm-trap
+tm-sigreturn
+tm-poison
diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile
new file mode 100644
index 000000000..5881e97c7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -0,0 +1,32 @@
+# SPDX-License-Identifier: GPL-2.0
+SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr tm-signal-context-chk-fpu \
+ tm-signal-context-chk-vmx tm-signal-context-chk-vsx
+# tm-exec.c lives in this directory and is listed in .gitignore: build and run it as well.
+TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv tm-signal-stack \
+	tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable tm-trap \
+	$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
+	tm-signal-context-force-tm tm-poison tm-signal-pagefault tm-exec
+
+TEST_FILES := settings
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c ../utils.c
+
+CFLAGS += -mhtm
+
+$(OUTPUT)/tm-syscall: tm-syscall-asm.S
+$(OUTPUT)/tm-syscall: CFLAGS += -I../../../../../usr/include
+$(OUTPUT)/tm-tmspr: CFLAGS += -pthread
+$(OUTPUT)/tm-vmx-unavail: CFLAGS += -pthread -m64
+$(OUTPUT)/tm-resched-dscr: ../pmu/lib.c
+$(OUTPUT)/tm-unavailable: CFLAGS += -O0 -pthread -m64 -Wno-error=uninitialized -mvsx
+$(OUTPUT)/tm-trap: CFLAGS += -O0 -pthread -m64
+$(OUTPUT)/tm-signal-context-force-tm: CFLAGS += -pthread -m64
+$(OUTPUT)/tm-signal-pagefault: CFLAGS += -pthread -m64
+$(OUTPUT)/tm-poison: CFLAGS += -m64
+
+SIGNAL_CONTEXT_CHK_TESTS := $(patsubst %,$(OUTPUT)/%,$(SIGNAL_CONTEXT_CHK_TESTS))
+$(SIGNAL_CONTEXT_CHK_TESTS): tm-signal.S
+$(SIGNAL_CONTEXT_CHK_TESTS): CFLAGS += -mhtm -m64 -mvsx
diff --git a/tools/testing/selftests/powerpc/tm/settings b/tools/testing/selftests/powerpc/tm/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/powerpc/tm/tm-exec.c b/tools/testing/selftests/powerpc/tm/tm-exec.c
new file mode 100644
index 000000000..260cfdb97
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-exec.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Syscalls can be performed provided the transactions are suspended.
+ * The exec() class of syscall is unique as a new process is loaded.
+ *
+ * It makes little sense for the previously suspended transaction to
+ * still exist after an exec() call.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <inttypes.h>
+#include <libgen.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+static char *path;
+
+static int test_exec(void)
+{
+	SKIP_IF(!have_htm());
+	/* Begin a transaction and suspend it, so exec is called with it still suspended. */
+	asm __volatile__(
+		"tbegin.;"
+		"blt 1f; "
+		"tsuspend.;"
+		"1: ;"
+		: : : "memory");
+	/* Re-exec this binary as the "--child" stage; a variadic sentinel must be (char *)NULL. */
+	execl(path, "tm-exec", "--child", (char *)NULL);
+
+	/* Shouldn't get here: execl() only returns on failure */
+	perror("execl() failed");
+	return 1;
+}
+
+static int after_exec(void)
+{
+ asm __volatile__( /* same tbegin./tsuspend. sequence as test_exec(), now run in the exec'd image */
+ "tbegin.;"
+ "blt 1f;"
+ "tsuspend.;"
+ "1: ;"
+ : : : "memory");
+
+ FAIL_IF(failure_is_nesting()); /* NOTE(review): presumably true if the pre-exec transaction survived - confirm in tm.h */
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ path = argv[0]; /* test_exec() passes this to execl() to re-run the same binary */
+
+ if (argc > 1 && strcmp(argv[1], "--child") == 0) /* second stage: we are the exec'd image */
+ return after_exec();
+
+ return test_harness(test_exec, "tm_exec"); /* first stage: run test_exec() under the harness */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-fork.c b/tools/testing/selftests/powerpc/tm/tm-fork.c
new file mode 100644
index 000000000..6efa5a685
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-fork.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ *
+ * Edited: Rashmica Gupta, Nov 2015
+ *
+ * This test does a fork syscall inside a transaction. Basic sniff test
+ * to see if we can enter the kernel during a transaction.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int test_fork(void)
+{
+ SKIP_IF(!have_htm()); /* skip when have_htm() reports no HTM support */
+
+ asm __volatile__(
+ "tbegin.;"
+ "blt 1f; "
+ "li 0, 2;" /* fork syscall */
+ "sc ;" /* enter the kernel while still inside the transaction */
+ "tend.;"
+ "1: ;"
+ : : : "memory", "r0");
+ /* If we reach here, we've passed. Otherwise we've probably crashed
+ * the kernel. Nothing is checked: the test always returns 0. */
+
+ return 0;
+}
+
+int main(int argc, char *argv[])
+{
+ return test_harness(test_fork, "tm_fork"); /* exit status is the harness result for test_fork() */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c b/tools/testing/selftests/powerpc/tm/tm-poison.c
new file mode 100644
index 000000000..29e5f26af
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp.
+ *
+ * This test will spawn two processes. Both will be attached to the same
+ * CPU (CPU 0). The child will be in a loop writing to FP register f31 and
+ * VMX/VEC/Altivec register vr31 a known value, called poison, calling
+ * sched_yield syscall after to allow the parent to switch on the CPU.
+ * Parent will set f31 and vr31 to 1 and in a loop will check if f31 and
+ * vr31 remain 1 as expected until a given timeout (2m). If the issue is
+ * present child's poison will leak into parent's f31 or vr31 registers,
+ * otherwise, poison will never leak into parent's f31 and vr31 registers.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <inttypes.h>
+
+#include "tm.h"
+
+int tm_poison_test(void)
+{
+	int cpu, pid;
+	cpu_set_t cpuset;
+	uint64_t poison = 0xdeadbeefc0dec0fe;
+	uint64_t unknown = 0;
+	bool fail_fp = false;
+	bool fail_vr = false;
+
+	SKIP_IF(!have_htm());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+
+	// Attach both Child and Parent to the same CPU
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	FAIL_IF(sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0);
+
+	pid = fork();
+	// No child on fork() failure: bail out, or kill(-1, SIGKILL) below hits every process
+	FAIL_IF(pid < 0);
+	if (!pid) {
+		// child: keep writing poison to f31/vr31 until the parent SIGKILLs us
+		while (1) {
+			sched_yield();
+			asm (
+				"mtvsrd 31, %[poison];" // f31 = poison
+				"mtvsrd 63, %[poison];" // vr31 = poison
+
+				: : [poison] "r" (poison) : );
+		}
+	}
+
+	/**
+	 * parent
+	 */
+	asm (
+		/*
+		 * Set r3, r4, and f31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		" li 3, 0x1 ;"
+		" li 4, 0x1 ;"
+		" mtvsrd 31, 4 ;"
+
+		/*
+		 * The Time Base (TB) is a 64-bit counter register that is
+		 * independent of the CPU clock and which is incremented
+		 * at a frequency of 512000000 Hz, so every 1.953125ns.
+		 * A 2 minute timeout therefore takes
+		 * 120s/0.000000001953125s = 61440000000 increments. Below we set that
+		 * value in r5 and then use r6 to track initial TB value,
+		 * updating TB values in r7 at every iteration and comparing it
+		 * to r6. When r7 (current) - r6 (initial) > 61440000000 we bail
+		 * out since for sure we spent already 2 minutes in the loop.
+		 * SPR 268 is the TB register.
+		 */
+		" lis 5, 14 ;"
+		" ori 5, 5, 19996 ;"
+		" sldi 5, 5, 16 ;" // r5 = 61440000000
+
+		" mfspr 6, 268 ;" // r6 (TB initial)
+		"1: mfspr 7, 268 ;" // r7 (TB current)
+		" subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ?
+		" cmpd 7, 5 ;"
+		" bgt 3f ;" // yes, exit
+
+		/*
+		 * Main loop to check f31
+		 */
+		" tbegin. ;" // no, try again
+		" beq 1b ;" // restart if no timeout
+		" mfvsrd 3, 31 ;" // read f31
+		" cmpd 3, 4 ;" // f31 == 1 ?
+		" bne 2f ;" // broken :-(
+		" tabort. 3 ;" // try another transaction
+		"2: tend. ;" // commit transaction
+		"3: mr %[unknown], 3 ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs31"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_fp = unknown != 0x1;
+	if (fail_fp)
+		printf("Unknown value %#"PRIx64" leaked into f31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into FP registers\n");
+
+	asm (
+		/*
+		 * Set r3, r4, and vr31 to known value 1 before entering
+		 * in transaction. They won't be written after that.
+		 */
+		" li 3, 0x1 ;"
+		" li 4, 0x1 ;"
+		" mtvsrd 63, 4 ;"
+
+		" lis 5, 14 ;"
+		" ori 5, 5, 19996 ;"
+		" sldi 5, 5, 16 ;" // r5 = 61440000000
+
+		" mfspr 6, 268 ;" // r6 (TB initial)
+		"1: mfspr 7, 268 ;" // r7 (TB current)
+		" subf 7, 6, 7 ;" // r7 - r6 > 61440000000 ?
+		" cmpd 7, 5 ;"
+		" bgt 3f ;" // yes, exit
+
+		/*
+		 * Main loop to check vr31
+		 */
+		" tbegin. ;" // no, try again
+		" beq 1b ;" // restart if no timeout
+		" mfvsrd 3, 63 ;" // read vr31
+		" cmpd 3, 4 ;" // vr31 == 1 ?
+		" bne 2f ;" // broken :-(
+		" tabort. 3 ;" // try another transaction
+		"2: tend. ;" // commit transaction
+		"3: mr %[unknown], 3 ;" // record r3
+
+		: [unknown] "=r" (unknown)
+		:
+		: "cr0", "r3", "r4", "r5", "r6", "r7", "vs63"
+
+		);
+
+	/*
+	 * On leak 'unknown' will contain 'poison' value from child,
+	 * otherwise (no leak) 'unknown' will contain the same value
+	 * as r3 before entering in transactional mode, i.e. 0x1.
+	 */
+	fail_vr = unknown != 0x1;
+	if (fail_vr)
+		printf("Unknown value %#"PRIx64" leaked into vr31!\n", unknown);
+	else
+		printf("Good, no poison or leaked value into VEC registers\n");
+
+	kill(pid, SIGKILL);
+
+	return (fail_fp | fail_vr);
+}
+
+int main(int argc, char *argv[])
+{
+ /* Two consecutive 2-minute polling loops (f31, then vr31): about 4m in total */
+ test_harness_set_timeout(250); /* NOTE(review): presumably seconds - confirm against the harness */
+ return test_harness(tm_poison_test, "tm_poison_test");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
new file mode 100644
index 000000000..4cdb83964
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-resched-dscr.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test context switching to see if the DSCR SPR is correctly preserved
+ * when within a transaction.
+ *
+ * Note: We assume that the DSCR has been left at the default value (0)
+ * for all CPUs.
+ *
+ * Method:
+ *
+ * Set a value into the DSCR.
+ *
+ * Start a transaction, and suspend it (*).
+ *
+ * Hard loop checking to see if the transaction has become doomed.
+ *
+ * Now that we *may* have been preempted, record the DSCR and TEXASR SPRS.
+ *
+ * If the abort was because of a context switch, check the DSCR value.
+ * Otherwise, try again.
+ *
+ * (*) If the transaction is not suspended we can't see the problem because
+ * the transaction abort handler will restore the DSCR to its checkpointed
+ * value before we regain control.
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <asm/tm.h>
+
+#include "utils.h"
+#include "tm.h"
+#include "../pmu/lib.h"
+
+#define SPRN_DSCR 0x03
+
+int test_body(void)
+{
+ uint64_t rv, dscr1 = 1, dscr2, texasr;
+
+ SKIP_IF(!have_htm());
+
+ printf("Check DSCR TM context switch: ");
+ fflush(stdout);
+ for (;;) { /* retry until an abort caused by a reschedule has been observed */
+ asm __volatile__ (
+ /* set a known value (dscr1) into the DSCR */
+ "ld 3, %[dscr1];"
+ "mtspr %[sprn_dscr], 3;"
+
+ "li %[rv], 1;" /* rv stays 1 if we leave through the abort path (label 1) */
+ /* start and suspend a transaction */
+ "tbegin.;"
+ "beq 1f;"
+ "tsuspend.;"
+
+ /* hard loop until the transaction becomes doomed */
+ "2: ;"
+ "tcheck 0;"
+ "bc 4, 0, 2b;"
+
+ /* record DSCR and TEXASR into dscr2 and texasr */
+ "mfspr 3, %[sprn_dscr];"
+ "std 3, %[dscr2];"
+ "mfspr 3, %[sprn_texasr];"
+ "std 3, %[texasr];"
+
+ "tresume.;"
+ "tend.;"
+ "li %[rv], 0;" /* only reached if the transaction did not abort */
+ "1: ;"
+ : [rv]"=r"(rv), [dscr2]"=m"(dscr2), [texasr]"=m"(texasr)
+ : [dscr1]"m"(dscr1)
+ , [sprn_dscr]"i"(SPRN_DSCR), [sprn_texasr]"i"(SPRN_TEXASR)
+ : "memory", "r3"
+ );
+ assert(rv); /* make sure the transaction aborted */
+ if ((texasr >> 56) != TM_CAUSE_RESCHED) { /* abort was not due to a context switch */
+ continue;
+ }
+ if (dscr2 != dscr1) { /* DSCR read back after the reschedule differs from what was set */
+ printf(" FAIL\n");
+ return 1;
+ } else {
+ printf(" OK\n");
+ return 0;
+ }
+ }
+}
+
+static int tm_resched_dscr(void)
+{
+ return eat_cpu(test_body); /* NOTE(review): eat_cpu() is from ../pmu/lib.h; presumably runs test_body against a competing CPU hog to force context switches - confirm */
+}
+
+int main(int argc, const char *argv[])
+{
+ return test_harness(tm_resched_dscr, "tm_resched_dscr"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
new file mode 100644
index 000000000..254f912ad
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-fpu.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Test the kernel's signal frame code.
+ *
+ * The kernel sets up two sets of ucontexts if the signal was to be
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
+ * Expected behaviour is that the checkpointed state is in the user
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
+ *
+ * The rationale for this is that if TM unaware code (which linked
+ * against TM libs) installs a signal handler it will not know of the
+ * speculative nature of the 'live' registers and may infer the wrong
+ * thing.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+#include "tm.h"
+
+#define MAX_ATTEMPT 500000
+
+#define NV_FPU_REGS 18 /* Number of non-volatile FP registers */
+#define FPR14 14 /* First non-volatile FP register to check in f14-31 subset */
+
+long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
+
+/* Test only non-volatile registers, i.e. 18 fpr registers from f14 to f31 */
+static double fps[] = {
+ /* First context will be set with these values, i.e. non-speculative */
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ /* Second context will be set with these values, i.e. speculative */
+ -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
+};
+
+static sig_atomic_t fail, broken;
+
+static void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+	ucontext_t *first_uc = uc;
+	ucontext_t *second_uc = first_uc->uc_link;
+	int n;
+
+	/* First context: f14-f31 against the first half of fps[]; report every mismatch. */
+	for (n = 0; n < NV_FPU_REGS; n++) {
+		fail = (first_uc->uc_mcontext.fp_regs[FPR14 + n] != fps[n]);
+		if (!fail)
+			continue;
+		broken = 1;
+		printf("FPR%d (1st context) == %g instead of %g (expected)\n",
+		       FPR14 + n, first_uc->uc_mcontext.fp_regs[FPR14 + n], fps[n]);
+	}
+
+	/* Second context (uc_link): f14-f31 against the second half of fps[]. */
+	for (n = 0; n < NV_FPU_REGS; n++) {
+		fail = (second_uc->uc_mcontext.fp_regs[FPR14 + n] != fps[NV_FPU_REGS + n]);
+		if (!fail)
+			continue;
+		broken = 1;
+		printf("FPR%d (2nd context) == %g instead of %g (expected)\n",
+		       FPR14 + n, second_uc->uc_mcontext.fp_regs[FPR14 + n], fps[NV_FPU_REGS + n]);
+	}
+}
+
+static int tm_signal_context_chk_fpu()
+{
+	struct sigaction sa;
+	pid_t self = getpid();
+	long ret;
+	int attempt;
+
+	SKIP_IF(!have_htm());
+
+	/* Install signal_usr1() as the SA_SIGINFO handler for SIGUSR1. */
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = signal_usr1;
+	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+		perror("sigaction sigusr1");
+		exit(1);
+	}
+
+	/*
+	 * tm_signal_self_context_load will set both first and second
+	 * contexts according to the values passed through non-NULL
+	 * array pointers to it, in this case 'fps', and invoke the
+	 * signal handler installed for SIGUSR1.
+	 * Repeat up to MAX_ATTEMPT times, stopping early once the
+	 * handler has flagged a mismatch in 'broken'.
+	 */
+	for (attempt = 0; attempt < MAX_ATTEMPT && !broken; attempt++) {
+		ret = tm_signal_self_context_load(self, NULL, fps, NULL, NULL);
+		FAIL_IF(ret != self);
+	}
+	return (broken);
+}
+
+int main(void)
+{
+ return test_harness(tm_signal_context_chk_fpu, "tm_signal_context_chk_fpu"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
new file mode 100644
index 000000000..0cc680f61
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-gpr.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Test the kernel's signal frame code.
+ *
+ * The kernel sets up two sets of ucontexts if the signal was to be
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
+ * Expected behaviour is that the checkpointed state is in the user
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
+ *
+ * The rationale for this is that if TM unaware code (which linked
+ * against TM libs) installs a signal handler it will not know of the
+ * speculative nature of the 'live' registers and may infer the wrong
+ * thing.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+#include "tm.h"
+
+#define MAX_ATTEMPT 500000
+
+#define NV_GPR_REGS 18 /* Number of non-volatile GPR registers */
+#define R14 14 /* First non-volatile register to check in r14-r31 subset */
+
+long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
+
+static sig_atomic_t fail, broken;
+
+/* Test only non-volatile general purpose registers, i.e. r14-r31 */
+static long gprs[] = {
+ /* First context will be set with these values, i.e. non-speculative */
+ /* R14, R15, ... */
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ /* Second context will be set with these values, i.e. speculative */
+ /* R14, R15, ... */
+ -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16,-17,-18
+};
+
+static void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+	int i;
+	ucontext_t *ucp = uc;
+	ucontext_t *tm_ucp = ucp->uc_link;
+
+	/* Check first context (uc). Print all mismatches, as signed longs like gprs[]. */
+	for (i = 0; i < NV_GPR_REGS; i++) {
+		fail = (ucp->uc_mcontext.gp_regs[R14 + i] != gprs[i]);
+		if (fail) {
+			broken = 1;
+			printf("GPR%d (1st context) == %ld instead of %ld (expected)\n",
+			       R14 + i, (long)ucp->uc_mcontext.gp_regs[R14 + i], gprs[i]);
+		}
+	}
+
+	/* Check second context (uc_link). Its expected values are negative, hence %ld. */
+	for (i = 0; i < NV_GPR_REGS; i++) {
+		fail = (tm_ucp->uc_mcontext.gp_regs[R14 + i] != gprs[NV_GPR_REGS + i]);
+		if (fail) {
+			broken = 1;
+			printf("GPR%d (2nd context) == %ld instead of %ld (expected)\n",
+			       R14 + i, (long)tm_ucp->uc_mcontext.gp_regs[R14 + i], gprs[NV_GPR_REGS + i]);
+		}
+	}
+}
+
+static int tm_signal_context_chk_gpr()
+{
+	struct sigaction sa;
+	pid_t self = getpid();
+	long ret;
+	int attempt;
+
+	SKIP_IF(!have_htm());
+
+	/* Install signal_usr1() as the SA_SIGINFO handler for SIGUSR1. */
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = signal_usr1;
+	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+		perror("sigaction sigusr1");
+		exit(1);
+	}
+
+	/*
+	 * tm_signal_self_context_load will set both first and second
+	 * contexts according to the values passed through non-NULL
+	 * array pointers to it, in this case 'gprs', and invoke the
+	 * signal handler installed for SIGUSR1.
+	 * Repeat up to MAX_ATTEMPT times, stopping early once the
+	 * handler has flagged a mismatch in 'broken'.
+	 */
+	for (attempt = 0; attempt < MAX_ATTEMPT && !broken; attempt++) {
+		ret = tm_signal_self_context_load(self, gprs, NULL, NULL, NULL);
+		FAIL_IF(ret != self);
+	}
+	return broken;
+}
+
+int main(void)
+{
+ return test_harness(tm_signal_context_chk_gpr, "tm_signal_context_chk_gpr"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
new file mode 100644
index 000000000..b6d52730a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vmx.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Test the kernel's signal frame code.
+ *
+ * The kernel sets up two sets of ucontexts if the signal was to be
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
+ * Expected behaviour is that the checkpointed state is in the user
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
+ *
+ * The rationale for this is that if TM unaware code (which linked
+ * against TM libs) installs a signal handler it will not know of the
+ * speculative nature of the 'live' registers and may infer the wrong
+ * thing.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+#include "tm.h"
+
+#define MAX_ATTEMPT 500000
+
+#define NV_VMX_REGS 12 /* Number of non-volatile VMX registers */
+#define VMX20 20 /* First non-volatile register to check in vr20-31 subset */
+
+long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
+
+static sig_atomic_t fail, broken;
+
+/* Test only non-volatile registers, i.e. 12 vmx registers from vr20 to vr31 */
+vector int vms[] = {
+ /* First context will be set with these values, i.e. non-speculative */
+ /* VMX20 , VMX21 , ... */
+ { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48},
+ /* Second context will be set with these values, i.e. speculative */
+ /* VMX20 , VMX21 , ... */
+ { -1, -2, -3, -4},{ -5, -6, -7, -8},{ -9,-10,-11,-12},
+ {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
+ {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
+ {-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48}
+};
+
+static void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+	int i, j;
+	ucontext_t *ucp = uc;
+	ucontext_t *tm_ucp = ucp->uc_link;
+
+	for (i = 0; i < NV_VMX_REGS; i++) {
+		/* Check first context (uc) against the first half of vms[]. Print all mismatches. */
+		fail = memcmp(ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
+			      &vms[i], sizeof(vector int));
+		if (fail) {
+			broken = 1;
+			printf("VMX%d (1st context) == 0x", VMX20 + i);
+			/* Print actual value in first context. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
+			printf(" instead of 0x");
+			/* Print expected value. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", vms[i][j]);
+			printf(" (expected)\n");
+		}
+	}
+
+	for (i = 0; i < NV_VMX_REGS; i++) {
+		/* Check second context (uc_link) against the second half of vms[]. Print all mismatches. */
+		fail = memcmp(tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i],
+			      &vms[NV_VMX_REGS + i], sizeof (vector int));
+		if (fail) {
+			broken = 1;
+			printf("VMX%d (2nd context) == 0x", VMX20 + i); /* register number, not the vms[] index */
+			/* Print actual value in second context. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", tm_ucp->uc_mcontext.v_regs->vrregs[VMX20 + i][j]);
+			printf(" instead of 0x");
+			/* Print expected value. */
+			for (j = 0; j < 4; j++)
+				printf("%08x", vms[NV_VMX_REGS + i][j]);
+			printf(" (expected)\n");
+		}
+	}
+}
+
+static int tm_signal_context_chk()
+{
+	struct sigaction sa;
+	pid_t self = getpid();
+	long ret;
+	int attempt;
+
+	SKIP_IF(!have_htm());
+
+	/* Install signal_usr1() as the SA_SIGINFO handler for SIGUSR1. */
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = signal_usr1;
+	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+		perror("sigaction sigusr1");
+		exit(1);
+	}
+
+	/*
+	 * tm_signal_self_context_load will set both first and second
+	 * contexts according to the values passed through non-NULL
+	 * array pointers to it, in this case 'vms', and invoke the
+	 * signal handler installed for SIGUSR1.
+	 * Repeat up to MAX_ATTEMPT times, stopping early once the
+	 * handler has flagged a mismatch in 'broken'.
+	 */
+	for (attempt = 0; attempt < MAX_ATTEMPT && !broken; attempt++) {
+		ret = tm_signal_self_context_load(self, NULL, NULL, vms, NULL);
+		FAIL_IF(ret != self);
+	}
+	return (broken);
+}
+
+int main(void)
+{
+ return test_harness(tm_signal_context_chk, "tm_signal_context_chk_vmx"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
new file mode 100644
index 000000000..8e25e2072
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-chk-vsx.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright 2016, Cyril Bur, IBM Corp.
+ *
+ * Test the kernel's signal frame code.
+ *
+ * The kernel sets up two sets of ucontexts if the signal was to be
+ * delivered while the thread was in a transaction (referred to as
+ * first and second contexts).
+ * Expected behaviour is that the checkpointed state is in the user
+ * context passed to the signal handler (first context). The speculated
+ * state can be accessed with the uc_link pointer (second context).
+ *
+ * The rationale for this is that if TM unaware code (which linked
+ * against TM libs) installs a signal handler it will not know of the
+ * speculative nature of the 'live' registers and may infer the wrong
+ * thing.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <altivec.h>
+
+#include "utils.h"
+#include "tm.h"
+
+#define MAX_ATTEMPT 500000
+
+#define NV_VSX_REGS 12 /* Number of VSX registers to check. */
+#define VSX20 20 /* First VSX register to check in vsr20-vsr31 subset */
+#define FPR20 20 /* FPR20 overlaps VSX20 most significant doubleword */
+
+long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector int *vms, vector int *vss);
+
+static sig_atomic_t fail, broken;
+
+/* Test only 12 vsx registers from vsr20 to vsr31 */
+vector int vsxs[] = {
+ /* First context will be set with these values, i.e. non-speculative */
+ /* VSX20 , VSX21 , ... */
+ { 1, 2, 3, 4},{ 5, 6, 7, 8},{ 9,10,11,12},
+ {13,14,15,16},{17,18,19,20},{21,22,23,24},
+ {25,26,27,28},{29,30,31,32},{33,34,35,36},
+ {37,38,39,40},{41,42,43,44},{45,46,47,48},
+ /* Second context will be set with these values, i.e. speculative */
+ /* VSX20 , VSX21 , ... */
+ {-1, -2, -3, -4 },{-5, -6, -7, -8 },{-9, -10,-11,-12},
+ {-13,-14,-15,-16},{-17,-18,-19,-20},{-21,-22,-23,-24},
+ {-25,-26,-27,-28},{-29,-30,-31,-32},{-33,-34,-35,-36},
+ {-37,-38,-39,-40},{-41,-42,-43,-44},{-45,-46,-47,-48}
+};
+
+static void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+	int i, j;
+	uint8_t vsx[sizeof(vector int)];
+	uint8_t vsx_tm[sizeof(vector int)];
+	ucontext_t *ucp = uc;
+	ucontext_t *tm_ucp = ucp->uc_link;
+
+	/*
+	 * FP registers and VMX registers overlap the VSX registers.
+	 *
+	 * FP registers (f0-31) overlap the most significant 64 bits of VSX
+	 * registers vsr0-31, whilst VMX registers vr0-31, being 128-bit like
+	 * the VSX registers, overlap fully the other half of VSX registers,
+	 * i.e. vr0-31 overlaps fully vsr32-63.
+	 *
+	 * Due to compatibility and historical reasons (VMX/Altivec support
+	 * appeared first on the architecture), VMX registers vr0-31 (so VSX
+	 * half vsr32-63 too) are stored right after the v_regs pointer, in an
+	 * area allocated for 'vmx_reserve' array (please see
+	 * arch/powerpc/include/uapi/asm/sigcontext.h for details about the
+	 * mcontext_t structure on Power).
+	 *
+	 * The other VSX half (vsr0-31) is hence stored below vr0-31/vsr32-63
+	 * registers, but only the least significant 64 bits of vsr0-31. The
+	 * most significant 64 bits of vsr0-31 (f0-31), as it overlaps the FP
+	 * registers, is kept in fp_regs.
+	 *
+	 * v_regs is a 16 byte aligned pointer at the start of vmx_reserve
+	 * (vmx_reserve may or may not be 16 aligned) where the v_regs structure
+	 * exists, so v_regs points to where vr0-31 / vsr32-63 registers are
+	 * fully stored. Since v_regs type is elf_vrregset_t, v_regs + 1
+	 * skips all the slots used to store vr0-31 / vsr32-63 and points to
+	 * part of one VSX half, i.e. v_regs + 1 points to the least significant
+	 * 64 bits of vsr0-31. The other part of this half (the most significant
+	 * part of vsr0-31) is stored in fp_regs.
+	 *
+	 */
+	/* Get pointer to least significant doubleword of vsr0-31 */
+	long *vsx_ptr = (long *)(ucp->uc_mcontext.v_regs + 1);
+	long *tm_vsx_ptr = (long *)(tm_ucp->uc_mcontext.v_regs + 1);
+
+	/* Check first context. Print all mismatches. */
+	for (i = 0; i < NV_VSX_REGS; i++) {
+		/*
+		 * Copy VSX most significant doubleword from fp_regs and
+		 * copy VSX least significant one from 64-bit slots below
+		 * saved VMX registers.
+		 */
+		memcpy(vsx, &ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
+		memcpy(vsx + 8, &vsx_ptr[VSX20 + i], 8);
+
+		fail = memcmp(vsx, &vsxs[i], sizeof(vector int));
+
+		if (fail) {
+			broken = 1;
+			printf("VSX%d (1st context) == 0x", VSX20 + i);
+			for (j = 0; j < 16; j++)
+				printf("%02x", vsx[j]);
+			printf(" instead of 0x");
+			for (j = 0; j < 4; j++)
+				printf("%08x", vsxs[i][j]);
+			printf(" (expected)\n");
+		}
+	}
+
+	/* Check second context. Print all mismatches. */
+	for (i = 0; i < NV_VSX_REGS; i++) {
+		/*
+		 * Copy VSX most significant doubleword from fp_regs and
+		 * copy VSX least significant one from 64-bit slots below
+		 * saved VMX registers.
+		 */
+		memcpy(vsx_tm, &tm_ucp->uc_mcontext.fp_regs[FPR20 + i], 8);
+		memcpy(vsx_tm + 8, &tm_vsx_ptr[VSX20 + i], 8);
+
+		fail = memcmp(vsx_tm, &vsxs[NV_VSX_REGS + i], sizeof(vector int));
+
+		if (fail) {
+			broken = 1;
+			printf("VSX%d (2nd context) == 0x", VSX20 + i);
+			for (j = 0; j < 16; j++)
+				printf("%02x", vsx_tm[j]);
+			printf(" instead of 0x");
+			for (j = 0; j < 4; j++)
+				printf("%08x", vsxs[NV_VSX_REGS + i][j]);
+			printf(" (expected)\n"); /* leading space keeps the format identical to the 1st-context message */
+		}
+	}
+}
+
+static int tm_signal_context_chk()
+{
+	struct sigaction sa;
+	pid_t self = getpid();
+	long ret;
+	int attempt;
+
+	SKIP_IF(!have_htm());
+
+	/* Install signal_usr1() as the SA_SIGINFO handler for SIGUSR1. */
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = SA_SIGINFO;
+	sa.sa_sigaction = signal_usr1;
+	if (sigaction(SIGUSR1, &sa, NULL) < 0) {
+		perror("sigaction sigusr1");
+		exit(1);
+	}
+
+	/*
+	 * tm_signal_self_context_load will set both first and second
+	 * contexts according to the values passed through non-NULL
+	 * array pointers to it, in this case 'vsxs', and invoke the
+	 * signal handler installed for SIGUSR1.
+	 * Repeat up to MAX_ATTEMPT times, stopping early once the
+	 * handler has flagged a mismatch in 'broken'.
+	 */
+	for (attempt = 0; attempt < MAX_ATTEMPT && !broken; attempt++) {
+		ret = tm_signal_self_context_load(self, NULL, NULL, NULL, vsxs);
+		FAIL_IF(ret != self);
+	}
+	return (broken);
+}
+
+int main(void)
+{
+ return test_harness(tm_signal_context_chk, "tm_signal_context_chk_vsx"); /* exit status is the harness result */
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
new file mode 100644
index 000000000..421cb082f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-context-force-tm.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp.
+ *
+ * This test raises a SIGUSR1 signal, and toggles the MSR[TS]
+ * fields at the signal handler. With MSR[TS] being set, the kernel will
+ * force a recheckpoint, which may cause a segfault when returning to
+ * user space. Since the test needs to re-run, the segfault needs to be
+ * caught and handled.
+ *
+ * In order to continue the test even after a segfault, the context is
+ * saved prior to the signal being raised, and it is restored when there is
+ * a segmentation fault. This happens for COUNT_MAX times.
+ *
+ * This test never fails (as in returning EXIT_FAILURE). It either succeeds,
+ * or crashes the kernel (on a buggy kernel).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <ucontext.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include "tm.h"
+#include "utils.h"
+#include "reg.h"
+
+#define COUNT_MAX 5000 /* Number of interactions */
+
+/*
+ * This test only runs on 64 bits system. Unsetting MSR_TS_S to avoid
+ * compilation issue on 32 bits system. There is no side effect, since the
+ * whole test will be skipped if it is not running on 64 bits system.
+ */
+#ifndef __powerpc64__
+#undef MSR_TS_S
+#define MSR_TS_S 0
+#endif
+
+/* Setting contexts because the test will crash and we want to recover */
+ucontext_t init_context;
+
+/* count is changed in the signal handler, so it must be volatile */
+static volatile int count;
+
+/*
+ * SIGUSR1 handler. Points uc_link at a fresh anonymous mapping, copies the
+ * current machine context into it, sets MSR_TS_S in the context's MSR and
+ * then forks. The child leaves with count == COUNT_MAX so that it stops
+ * after its next segfault; the parent keeps looping.
+ */
+void usr_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ ucontext_t *ucp = uc;
+ int ret;
+
+ /*
+ * Allocating memory in a signal handler, and never freeing it on
+ * purpose, forcing the heap increase, so, the memory leak is what
+ * we want here.
+ *
+ * Anonymous mappings take fd == -1 and report failure as MAP_FAILED.
+ */
+ ucp->uc_link = mmap(NULL, sizeof(ucontext_t),
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (ucp->uc_link == MAP_FAILED) {
+ perror("Mmap failed");
+ exit(-1);
+ }
+
+ /* Forcing the page to be allocated in a page fault */
+ ret = madvise(ucp->uc_link, sizeof(ucontext_t), MADV_DONTNEED);
+ if (ret) {
+ perror("madvise failed");
+ exit(-1);
+ }
+
+ memcpy(&ucp->uc_link->uc_mcontext, &ucp->uc_mcontext,
+ sizeof(ucp->uc_mcontext));
+
+ /* Force MSR[TS] on in the context that will be restored */
+ UCONTEXT_MSR(ucp) |= MSR_TS_S;
+
+ /*
+ * A fork inside a signal handler seems to be more efficient than a
+ * fork() prior to the signal being raised.
+ */
+ if (fork() == 0) {
+ /*
+ * Both child and parent will return, but, child returns
+ * with count set so it will exit in the next segfault.
+ * Parent will continue to loop.
+ */
+ count = COUNT_MAX;
+ }
+
+ /*
+ * If the change above does not hit the bug, it will cause a
+ * segmentation fault, since the ck structures are NULL.
+ */
+}
+
+/*
+ * SIGSEGV handler: count one more attempt and jump back to the context
+ * saved in init_context, so this handler never returns normally.
+ */
+void seg_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ count++;
+
+ /* Reexecute the test */
+ setcontext(&init_context);
+}
+
+/*
+ * Main loop of the test. Saves a context to come back to and then, until
+ * 'count' reaches COUNT_MAX, maps a fresh alternate signal stack, installs
+ * the SIGUSR1 and SIGSEGV handlers and raises SIGUSR1. Exits the process
+ * with -1 if any setup call fails.
+ */
+void tm_trap_test(void)
+{
+ struct sigaction usr_sa, seg_sa;
+ stack_t ss;
+
+ /*
+ * Both structures live on the stack, so give sa_mask a defined (empty)
+ * value instead of handing indeterminate bytes to sigaction().
+ */
+ sigemptyset(&usr_sa.sa_mask);
+ usr_sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ usr_sa.sa_sigaction = usr_signal_handler;
+
+ sigemptyset(&seg_sa.sa_mask);
+ seg_sa.sa_flags = SA_SIGINFO;
+ seg_sa.sa_sigaction = seg_signal_handler;
+
+ /*
+ * Set initial context. Will get back here from
+ * seg_signal_handler()
+ */
+ getcontext(&init_context);
+
+ while (count < COUNT_MAX) {
+ /* Allocated an alternative signal stack area */
+ ss.ss_sp = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ ss.ss_size = SIGSTKSZ;
+ ss.ss_flags = 0;
+
+ if (ss.ss_sp == MAP_FAILED) {
+ perror("mmap error\n");
+ exit(-1);
+ }
+
+ /* Force the allocation through a page fault */
+ if (madvise(ss.ss_sp, SIGSTKSZ, MADV_DONTNEED)) {
+ perror("madvise\n");
+ exit(-1);
+ }
+
+ /*
+ * Setting an alternative stack to generate a page fault when
+ * the signal is raised.
+ */
+ if (sigaltstack(&ss, NULL)) {
+ perror("sigaltstack\n");
+ exit(-1);
+ }
+
+ /* The signal handler will enable MSR_TS */
+ if (sigaction(SIGUSR1, &usr_sa, NULL)) {
+ perror("sigaction SIGUSR1");
+ exit(-1);
+ }
+ /* If it does not crash, it might segfault, avoid it to retest */
+ if (sigaction(SIGSEGV, &seg_sa, NULL)) {
+ perror("sigaction SIGSEGV");
+ exit(-1);
+ }
+
+ raise(SIGUSR1);
+ count++;
+ }
+}
+
+/*
+ * Harness entry point: skip unless have_htm() and is_ppc64le() both hold,
+ * otherwise run tm_trap_test() and report success once it returns.
+ */
+int tm_signal_context_force_tm(void)
+{
+ SKIP_IF(!have_htm());
+ /*
+ * Skipping if not running on 64 bits system, since I think it is
+ * not possible to set mcontext's [MSR] with TS, due to it being 32
+ * bits.
+ */
+ SKIP_IF(!is_ppc64le());
+
+ tm_trap_test();
+
+ return EXIT_SUCCESS;
+}
+
+/*
+ * Run the test under the harness. The harness result must be returned,
+ * otherwise the process always exits with status 0.
+ */
+int main(int argc, char **argv)
+{
+ return test_harness(tm_signal_context_force_tm, "tm_signal_context_force_tm");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
new file mode 100644
index 000000000..4a61e9bd1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ *
+ * Test the kernel's signal return code to ensure that it doesn't
+ * crash when both the transactional and suspend MSR bits are set in
+ * the signal context.
+ *
+ * For this test, we send ourselves a SIGUSR1. In the SIGUSR1 handler
+ * we modify the signal context to set both MSR TM S and T bits (which
+ * is "reserved" by the PowerISA). When we return from the signal
+ * handler (implicit sigreturn), the kernel should detect reserved MSR
+ * value and send us with a SIGSEGV.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int segv_expected = 0;
+
+/*
+ * SIGSEGV handler: exit 0 if the segfault was announced through
+ * segv_expected, exit 1 otherwise. Uses _exit(), so it never returns.
+ */
+void signal_segv(int signum)
+{
+ if (segv_expected && (signum == SIGSEGV))
+ _exit(0);
+ _exit(1);
+}
+
+/*
+ * SIGUSR1 handler: makes the signal context invalid by linking it to
+ * itself and OR-ing 7 into the MSR's TM bit positions, then records that
+ * a SIGSEGV is now expected.
+ */
+void signal_usr1(int signum, siginfo_t *info, void *uc)
+{
+ ucontext_t *ucp = uc;
+
+ /* Link tm checkpointed context to normal context */
+ ucp->uc_link = ucp;
+ /* Set all TM bits so that the context is now invalid */
+#ifdef __powerpc64__
+ ucp->uc_mcontext.gp_regs[PT_MSR] |= (7ULL << 32);
+#else
+ ucp->uc_mcontext.uc_regs->gregs[PT_MSR] |= (7ULL);
+#endif
+ /* Should segv on return because of invalid context */
+ segv_expected = 1;
+}
+
+/*
+ * Test body: install the SIGUSR1 and SIGSEGV handlers and raise SIGUSR1.
+ * The process is expected to leave through signal_segv(); reaching the
+ * final return is reported as failure (1).
+ */
+int tm_signal_msr_resv()
+{
+ struct sigaction act;
+
+ SKIP_IF(!have_htm());
+
+ act.sa_sigaction = signal_usr1;
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGUSR1, &act, NULL) < 0) {
+ perror("sigaction sigusr1");
+ exit(1);
+ }
+ if (signal(SIGSEGV, signal_segv) == SIG_ERR)
+ exit(1);
+
+ raise(SIGUSR1);
+
+ /* We shouldn't get here as we exit in the segv handler */
+ return 1;
+}
+
+/* Run tm_signal_msr_resv under the harness and exit with its result. */
+int main(void)
+{
+ return test_harness(tm_signal_msr_resv, "tm_signal_msr_resv");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c
new file mode 100644
index 000000000..5908bc6ab
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-pagefault.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2020, Gustavo Luiz Duarte, IBM Corp.
+ *
+ * This test starts a transaction and triggers a signal, forcing a pagefault to
+ * happen when the kernel signal handling code touches the user signal stack.
+ *
+ * In order to avoid pre-faulting the signal stack memory and to force the
+ * pagefault to happen precisely in the kernel signal handling code, the
+ * pagefault handling is done in userspace using the userfaultfd facility.
+ *
+ * Further pagefaults are triggered by crafting the signal handler's ucontext
+ * to point to additional memory regions managed by the userfaultfd, so using
+ * the same mechanism used to avoid pre-faulting the signal stack memory.
+ *
+ * On failure (bug is present) kernel crashes or never returns control back to
+ * userspace. If the bug is not present, the test completes almost immediately.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <linux/userfaultfd.h>
+#include <poll.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <signal.h>
+#include <errno.h>
+
+#include "tm.h"
+
+
+#define UF_MEM_SIZE 655360 /* 10 x 64k pages */
+
+/* Memory handled by userfaultfd */
+static char *uf_mem;
+static size_t uf_mem_offset = 0;
+
+/*
+ * Data that will be copied into the faulting pages (instead of zero-filled
+ * pages). This is used to make the test more reliable and avoid segfaulting
+ * when we return from the signal handler. Since we are making the signal
+ * handler's ucontext point to newly allocated memory, when that memory is
+ * paged-in it will contain the expected content.
+ */
+static char backing_mem[UF_MEM_SIZE];
+
+static size_t pagesize;
+
+/*
+ * Return a chunk of at least 'size' bytes of memory that will be handled by
+ * userfaultfd. If 'backing_data' is not NULL, its content will be saved to
+ * 'backing_mem' and then copied into the faulting pages when the page fault
+ * is handled.
+ */
+void *get_uf_mem(size_t size, void *backing_data)
+{
+ void *ret;
+
+ /* Refuse requests that would run past the end of the region */
+ if (uf_mem_offset + size > UF_MEM_SIZE) {
+ fprintf(stderr, "Requesting more uf_mem than expected!\n");
+ exit(EXIT_FAILURE);
+ }
+
+ ret = &uf_mem[uf_mem_offset];
+
+ /* Save the data that will be copied into the faulting page */
+ if (backing_data != NULL)
+ memcpy(&backing_mem[uf_mem_offset], backing_data, size);
+
+ /* Reserve the requested amount of uf_mem */
+ uf_mem_offset += size;
+ /*
+ * Keep uf_mem_offset aligned to the page size (round up). This uses
+ * the global 'pagesize', which setup_uf_mem() initialises.
+ */
+ uf_mem_offset = (uf_mem_offset + pagesize - 1) & ~(pagesize - 1);
+
+ return ret;
+}
+
+/*
+ * Thread routine that services userfaultfd events forever. 'arg' carries
+ * the userfaultfd descriptor cast to a pointer. Each page fault is resolved
+ * by copying one page from the matching offset of backing_mem into the
+ * faulting page. Any error, or any event other than a page fault, ends the
+ * whole process with EXIT_FAILURE.
+ */
+void *fault_handler_thread(void *arg)
+{
+ struct uffd_msg msg; /* Data read from userfaultfd */
+ long uffd; /* userfaultfd file descriptor */
+ struct uffdio_copy uffdio_copy;
+ struct pollfd pollfd;
+ ssize_t nread, offset;
+
+ uffd = (long) arg;
+
+ for (;;) {
+ /* Block until the descriptor has an event to read */
+ pollfd.fd = uffd;
+ pollfd.events = POLLIN;
+ if (poll(&pollfd, 1, -1) == -1) {
+ perror("poll() failed");
+ exit(EXIT_FAILURE);
+ }
+
+ nread = read(uffd, &msg, sizeof(msg));
+ if (nread == 0) {
+ fprintf(stderr, "read(): EOF on userfaultfd\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (nread == -1) {
+ perror("read() failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /* We expect only one kind of event */
+ if (msg.event != UFFD_EVENT_PAGEFAULT) {
+ fprintf(stderr, "Unexpected event on userfaultfd\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * We need to handle page faults in units of pages(!).
+ * So, round faulting address down to page boundary.
+ */
+ uffdio_copy.dst = msg.arg.pagefault.address & ~(pagesize-1);
+
+ /* Source is the same offset inside backing_mem */
+ offset = (char *) uffdio_copy.dst - uf_mem;
+ uffdio_copy.src = (unsigned long) &backing_mem[offset];
+
+ uffdio_copy.len = pagesize;
+ uffdio_copy.mode = 0;
+ uffdio_copy.copy = 0;
+ if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
+ perror("ioctl-UFFDIO_COPY failed");
+ exit(EXIT_FAILURE);
+ }
+ }
+}
+
+/*
+ * One-time setup: record the page size, create a userfaultfd object, map
+ * UF_MEM_SIZE bytes of anonymous memory into uf_mem, register that range
+ * for missing-page tracking and start fault_handler_thread to service it.
+ * Any failure ends the process with EXIT_FAILURE.
+ */
+void setup_uf_mem(void)
+{
+ long uffd; /* userfaultfd file descriptor */
+ pthread_t thr;
+ struct uffdio_api uffdio_api;
+ struct uffdio_register uffdio_register;
+ int ret;
+
+ pagesize = sysconf(_SC_PAGE_SIZE);
+
+ /* Create and enable userfaultfd object */
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd == -1) {
+ perror("userfaultfd() failed");
+ exit(EXIT_FAILURE);
+ }
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = 0;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+ perror("ioctl-UFFDIO_API failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Create a private anonymous mapping. The memory will be demand-zero
+ * paged, that is, not yet allocated. When we actually touch the memory
+ * the related page will be allocated via the userfaultfd mechanism.
+ */
+ uf_mem = mmap(NULL, UF_MEM_SIZE, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (uf_mem == MAP_FAILED) {
+ perror("mmap() failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Register the memory range of the mapping we've just mapped to be
+ * handled by the userfaultfd object. In 'mode' we request to track
+ * missing pages (i.e. pages that have not yet been faulted-in).
+ */
+ uffdio_register.range.start = (unsigned long) uf_mem;
+ uffdio_register.range.len = UF_MEM_SIZE;
+ uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+ if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+ perror("ioctl-UFFDIO_REGISTER");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Create a thread that will process the userfaultfd events. The
+ * descriptor travels to the thread inside the pointer argument.
+ */
+ ret = pthread_create(&thr, NULL, fault_handler_thread, (void *) uffd);
+ if (ret != 0) {
+ fprintf(stderr, "pthread_create(): Error. Returned %d\n", ret);
+ exit(EXIT_FAILURE);
+ }
+}
+
+/*
+ * Assumption: the signal was delivered while userspace was in transactional or
+ * suspended state, i.e. uc->uc_link != NULL.
+ */
+void signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ ucontext_t *ucp = uc;
+
+ /* Skip 'trap' after returning, otherwise we get a SIGTRAP again */
+ ucp->uc_link->uc_mcontext.regs->nip += 4;
+
+ /*
+ * Redirect both v_regs pointers and then uc_link itself to chunks
+ * handed out by get_uf_mem(). The current contents are passed as
+ * backing data for each chunk.
+ */
+ ucp->uc_mcontext.v_regs =
+ get_uf_mem(sizeof(elf_vrreg_t), ucp->uc_mcontext.v_regs);
+
+ ucp->uc_link->uc_mcontext.v_regs =
+ get_uf_mem(sizeof(elf_vrreg_t), ucp->uc_link->uc_mcontext.v_regs);
+
+ /* Done last: the lines above still dereference the old uc_link */
+ ucp->uc_link = get_uf_mem(sizeof(ucontext_t), ucp->uc_link);
+}
+
+/*
+ * Probe for the userfaultfd system call by invoking it with -1 as the
+ * flags argument. Returns true unless the call fails with ENOSYS.
+ */
+bool have_userfaultfd(void)
+{
+ long rc;
+
+ errno = 0;
+ rc = syscall(__NR_userfaultfd, -1);
+
+ return rc == 0 || errno != ENOSYS;
+}
+
+/*
+ * Test body: set up the userfaultfd-backed memory, use part of it as the
+ * alternate signal stack, install signal_handler for SIGTRAP and then
+ * execute 'trap' once inside a transaction and once inside a suspended
+ * transaction. Returns EXIT_SUCCESS when control comes back. Skipped
+ * without HTM or userfaultfd.
+ */
+int tm_signal_pagefault(void)
+{
+ struct sigaction sa;
+ stack_t ss;
+
+ SKIP_IF(!have_htm());
+ SKIP_IF(!have_userfaultfd());
+
+ setup_uf_mem();
+
+ /*
+ * Set an alternative stack that will generate a page fault when the
+ * signal is raised. The page fault will be treated via userfaultfd,
+ * i.e. via fault_handler_thread.
+ */
+ ss.ss_sp = get_uf_mem(SIGSTKSZ, NULL);
+ ss.ss_size = SIGSTKSZ;
+ ss.ss_flags = 0;
+ if (sigaltstack(&ss, NULL) == -1) {
+ perror("sigaltstack() failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /* 'sa' is on the stack: give sa_mask a defined (empty) value */
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
+ sa.sa_sigaction = signal_handler;
+ if (sigaction(SIGTRAP, &sa, NULL) == -1) {
+ perror("sigaction() failed");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Trigger a SIGTRAP in transactional state. 'tbegin.' writes CR0,
+ * so CR0 is listed as clobbered.
+ */
+ asm __volatile__(
+ "tbegin.;"
+ "beq 1f;"
+ "trap;"
+ "1: ;"
+ : : : "memory", "cr0");
+
+ /* Trigger a SIGTRAP in suspended state */
+ asm __volatile__(
+ "tbegin.;"
+ "beq 1f;"
+ "tsuspend.;"
+ "trap;"
+ "tresume.;"
+ "1: ;"
+ : : : "memory", "cr0");
+
+ return EXIT_SUCCESS;
+}
+
+/* Run tm_signal_pagefault under the harness with a 2 second timeout. */
+int main(int argc, char **argv)
+{
+ /*
+ * Depending on kernel config, the TM Bad Thing might not result in a
+ * crash, instead the kernel never returns control back to userspace, so
+ * set a tight timeout. If the test passes it completes almost
+ * immediately.
+ */
+ test_harness_set_timeout(2);
+ return test_harness(tm_signal_pagefault, "tm_signal_pagefault");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
new file mode 100644
index 000000000..07c388147
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-sigreturn-nt.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2018, Breno Leitao, Gustavo Romero, IBM Corp.
+ *
+ * A test case that creates a signal and starts a suspended transaction
+ * inside the signal handler.
+ *
+ * It returns from the signal handler with the CPU at suspended state, but
+ * without setting usercontext MSR Transaction State (TS) fields.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+
+#include "utils.h"
+#include "tm.h"
+
+/*
+ * SIGTRAP handler: starts a transaction and suspends it, then advances the
+ * saved nip by 4 (one instruction) before returning. The MSR in the saved
+ * context is left untouched.
+ */
+void trap_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ ucontext_t *ucp = (ucontext_t *) uc;
+
+ asm("tbegin.; tsuspend.;");
+
+ /* Skip 'trap' instruction if it succeed */
+ ucp->uc_mcontext.regs->nip += 4;
+}
+
+/*
+ * Test body: install trap_signal_handler for SIGTRAP and raise SIGTRAP
+ * once. Returns EXIT_SUCCESS if control comes back from the handler;
+ * exits with EXIT_FAILURE if the handler cannot be installed. Skipped
+ * without HTM.
+ */
+int tm_signal_sigreturn_nt(void)
+{
+ struct sigaction trap_sa;
+
+ SKIP_IF(!have_htm());
+
+ /* trap_sa is on the stack: give sa_mask a defined (empty) value */
+ sigemptyset(&trap_sa.sa_mask);
+ trap_sa.sa_flags = SA_SIGINFO;
+ trap_sa.sa_sigaction = trap_signal_handler;
+
+ if (sigaction(SIGTRAP, &trap_sa, NULL)) {
+ perror("sigaction SIGTRAP");
+ exit(EXIT_FAILURE);
+ }
+
+ raise(SIGTRAP);
+
+ return EXIT_SUCCESS;
+}
+
+/*
+ * Run the test under the harness. The harness result must be returned,
+ * otherwise the process always exits with status 0.
+ */
+int main(int argc, char **argv)
+{
+ return test_harness(tm_signal_sigreturn_nt, "tm_signal_sigreturn_nt");
+}
+
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-stack.c b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
new file mode 100644
index 000000000..cdcf8c5bb
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-stack.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ *
+ * Test the kernel's signal delivery code to ensure that we don't
+ * treclaim twice in the kernel signal delivery code. This can happen
+ * if we trigger a signal when in a transaction and the stack pointer
+ * is bogus.
+ *
+ * This test case registers a SEGV handler, sets the stack pointer
+ * (r1) to NULL, starts a transaction and then generates a SEGV. The
+ * SEGV should be handled but we exit here as the stack pointer is
+ * invalid and hence we can't sigreturn. We only need to check that
+ * this flow doesn't crash the kernel.
+ */
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <signal.h>
+
+#include "utils.h"
+#include "tm.h"
+
+/* SIGSEGV handler registered by tm_signal_stack(); exits with status 1. */
+void signal_segv(int signum)
+{
+ /* This should never actually run since stack is foobar */
+ exit(1);
+}
+
+/*
+ * Test body: fork a child that registers a SIGSEGV handler, zeroes r1,
+ * starts and suspends a transaction and then loads through r1. The parent
+ * only waits for the child and returns 0 once it has exited, whatever its
+ * status.
+ */
+int tm_signal_stack()
+{
+ int pid;
+
+ SKIP_IF(!have_htm());
+
+ pid = fork();
+ if (pid < 0)
+ exit(1);
+
+ if (pid) { /* Parent */
+ /*
+ * It's likely the whole machine will crash here so if
+ * the child ever exits, we are good.
+ */
+ wait(NULL);
+ return 0;
+ }
+
+ /*
+ * The flow here is:
+ * 1) register a signal handler (so signal delivery occurs)
+ * 2) make stack pointer (r1) = NULL
+ * 3) start transaction
+ * 4) cause segv
+ */
+ if (signal(SIGSEGV, signal_segv) == SIG_ERR)
+ exit(1);
+ asm volatile("li 1, 0 ;" /* stack ptr == NULL */
+ "1:"
+ "tbegin.;"
+ "beq 1b ;" /* retry forever */
+ "tsuspend.;"
+ "ld 2, 0(1) ;" /* trigger segv */
+ : : : "memory");
+
+ /* This should never get here due to above segv */
+ return 1;
+}
+
+/* Run tm_signal_stack under the harness and exit with its result. */
+int main(void)
+{
+ return test_harness(tm_signal_stack, "tm_signal_stack");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal.S b/tools/testing/selftests/powerpc/tm/tm-signal.S
new file mode 100644
index 000000000..c80c91366
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-signal.S
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ */
+
+#include "basic_asm.h"
+#include "gpr_asm.h"
+#include "fpu_asm.h"
+#include "vmx_asm.h"
+#include "vsx_asm.h"
+
+/*
+ * Large caveat here being that the caller cannot expect the
+ * signal to always be sent! The hardware can (AND WILL!) abort
+ * the transaction between the tbegin and the tsuspend (however
+ * unlikely it seems or infrequently it actually happens).
+ * You have been warned.
+ */
+/* long tm_signal_self_context_load(pid_t pid, long *gprs, double *fps, vector *vms, vector *vss); */
+FUNC_START(tm_signal_self_context_load)
+ PUSH_BASIC_STACK(512)
+ /*
+ * Don't strictly need to save and restore as it depends on if
+ * we're going to use them, however this reduces messy logic
+ */
+ PUSH_VMX(STACK_FRAME_LOCAL(5,0),r8)
+ PUSH_FPU(512)
+ PUSH_NVREGS_BELOW_FPU(512)
+ /* Keep all five arguments in the frame; r3 is reused below */
+ std r3, STACK_FRAME_PARAM(0)(sp) /* pid */
+ std r4, STACK_FRAME_PARAM(1)(sp) /* gps */
+ std r5, STACK_FRAME_PARAM(2)(sp) /* fps */
+ std r6, STACK_FRAME_PARAM(3)(sp) /* vms */
+ std r7, STACK_FRAME_PARAM(4)(sp) /* vss */
+
+ /*
+ * Before the transaction: for each array pointer that is non-NULL,
+ * call the matching load_* routine with the start of the array.
+ */
+ ld r3, STACK_FRAME_PARAM(1)(sp)
+ cmpdi r3, 0
+ beq skip_gpr_lc
+ bl load_gpr
+skip_gpr_lc:
+ ld r3, STACK_FRAME_PARAM(2)(sp)
+ cmpdi r3, 0
+ beq skip_fpu_lc
+ bl load_fpu
+skip_fpu_lc:
+ ld r3, STACK_FRAME_PARAM(3)(sp)
+ cmpdi r3, 0
+ beq skip_vmx_lc
+ bl load_vmx
+skip_vmx_lc:
+ ld r3, STACK_FRAME_PARAM(4)(sp)
+ cmpdi r3, 0
+ beq skip_vsx_lc
+ bl load_vsx
+skip_vsx_lc:
+ /*
+ * Set r3 (return value) before tbegin. Use the pid as a known
+ * 'all good' return value, zero is used to indicate a non-doomed
+ * transaction.
+ */
+ ld r3, STACK_FRAME_PARAM(0)(sp)
+ tbegin.
+ beq 1f
+ tsuspend. /* Can't enter a syscall transactionally */
+ /*
+ * With the transaction suspended: same sequence again, this time
+ * passing the second half of each non-NULL array.
+ */
+ ld r3, STACK_FRAME_PARAM(1)(sp)
+ cmpdi r3, 0
+ beq skip_gpr_lt
+ /* Get the second half of the array */
+ addi r3, r3, 8 * 18
+ bl load_gpr
+skip_gpr_lt:
+ ld r3, STACK_FRAME_PARAM(2)(sp)
+ cmpdi r3, 0
+ beq skip_fpu_lt
+ /* Get the second half of the array */
+ addi r3, r3, 8 * 18
+ bl load_fpu
+skip_fpu_lt:
+ ld r3, STACK_FRAME_PARAM(3)(sp)
+ cmpdi r3, 0
+ beq skip_vmx_lt
+ /* Get the second half of the array */
+ addi r3, r3, 16 * 12
+ bl load_vmx
+skip_vmx_lt:
+ ld r3, STACK_FRAME_PARAM(4)(sp)
+ cmpdi r3, 0
+ beq skip_vsx_lt
+ /* Get the second half of the array */
+ addi r3, r3, 16 * 12
+ bl load_vsx
+skip_vsx_lt:
+ /* kill(pid, SIGUSR1) issued directly with 'sc' */
+ li r0, 37 /* sys_kill */
+ ld r3, STACK_FRAME_PARAM(0)(sp) /* pid */
+ li r4, 10 /* SIGUSR1 */
+ sc /* Taking the signal will doom the transaction */
+ tabort. 0
+ tresume. /* Be super sure we abort */
+ /*
+ * This will cause us to resume doomed transaction and cause
+ * hardware to cleanup, we'll end up at 1: anything between
+ * tresume. and 1: shouldn't ever run.
+ */
+ li r3, 0
+ 1:
+ /* Common exit: undo the pushes from the prologue and return */
+ POP_VMX(STACK_FRAME_LOCAL(5,0),r4)
+ POP_FPU(512)
+ POP_NVREGS_BELOW_FPU(512)
+ POP_BASIC_STACK(512)
+ blr
+FUNC_END(tm_signal_self_context_load)
diff --git a/tools/testing/selftests/powerpc/tm/tm-sigreturn.c b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c
new file mode 100644
index 000000000..9a6017a1d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-sigreturn.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2015, Laurent Dufour, IBM Corp.
+ *
+ * Test the kernel's signal returning code to check reclaim is done if the
+ * sigreturn() is called while in a transaction (suspended since active is
+ * already dropped through the system call path).
+ *
+ * The kernel must discard the transaction when entering sigreturn, since
+ * restoring the potential TM SPRS from the signal frame is requiring to not be
+ * in a transaction.
+ */
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "tm.h"
+#include "utils.h"
+
+
+/*
+ * SIGSEGV handler: tries to start a transaction and suspend it. 'ret' is
+ * stored as 0 when tbegin. succeeded and as 1 when it failed; on failure
+ * the process exits with status 1. Otherwise the handler simply returns.
+ */
+void handler(int sig)
+{
+ uint64_t ret;
+
+ asm __volatile__(
+ "li 3,1 ;"
+ "tbegin. ;"
+ "beq 1f ;"
+ "li 3,0 ;"
+ "tsuspend. ;"
+ "1: ;"
+ "std%X[ret] 3, %[ret] ;"
+ : [ret] "=m"(ret)
+ :
+ : "memory", "3", "cr0");
+
+ if (ret)
+ exit(1);
+
+ /*
+ * We return from the signal handle while in a suspended transaction
+ */
+}
+
+
+/*
+ * Test body: install 'handler' for SIGSEGV, then store to address 0 from
+ * inside a transaction. The asm leaves 'ret' as 1 if the transaction ran
+ * to tend. and as 2 if the failure path at label 1 was taken. Only 2 is
+ * accepted. This function never returns: it ends with exit(0) on success
+ * and exit(1) otherwise.
+ */
+int tm_sigreturn(void)
+{
+ struct sigaction sa;
+ uint64_t ret = 0;
+
+ SKIP_IF(!have_htm());
+ SKIP_IF(!is_ppc64le());
+
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_handler = handler;
+ sigemptyset(&sa.sa_mask);
+
+ if (sigaction(SIGSEGV, &sa, NULL))
+ exit(1);
+
+ asm __volatile__(
+ "tbegin. ;"
+ "beq 1f ;"
+ "li 3,0 ;"
+ "std 3,0(3) ;" /* trigger SEGV */
+ "li 3,1 ;"
+ "std%X[ret] 3,%[ret] ;"
+ "tend. ;"
+ "b 2f ;"
+ "1: ;"
+ "li 3,2 ;"
+ "std%X[ret] 3,%[ret] ;"
+ "2: ;"
+ : [ret] "=m"(ret)
+ :
+ : "memory", "3", "cr0");
+
+ if (ret != 2)
+ exit(1);
+
+ exit(0);
+}
+
+/* Run tm_sigreturn under the harness and exit with its result. */
+int main(void)
+{
+ return test_harness(tm_sigreturn, "tm_sigreturn");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S b/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S
new file mode 100644
index 000000000..bd1ca25fe
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall-asm.S
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <ppc-asm.h>
+#include <asm/unistd.h>
+
+ .text
+/*
+ * int getppid_tm_active(void): issue the getppid system call from inside
+ * an active transaction. Returns -1 in r3 when the failure path at label 1
+ * is taken; otherwise returns with whatever the system call left in r3.
+ */
+FUNC_START(getppid_tm_active)
+ tbegin.
+ beq 1f
+ li r0, __NR_getppid
+ sc
+ tend.
+ blr
+1:
+ li r3, -1
+ blr
+
+/*
+ * int getppid_tm_suspended(void): as getppid_tm_active, but the transaction
+ * is suspended around the 'sc' and resumed before tend. Returns -1 in r3
+ * when the failure path at label 1 is taken.
+ */
+FUNC_START(getppid_tm_suspended)
+ tbegin.
+ beq 1f
+ li r0, __NR_getppid
+ tsuspend.
+ sc
+ tresume.
+ tend.
+ blr
+1:
+ li r3, -1
+ blr
diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c b/tools/testing/selftests/powerpc/tm/tm-syscall.c
new file mode 100644
index 000000000..becb8207b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Sam Bobroff, IBM Corp.
+ *
+ * Test the kernel's system call code to ensure that a system call
+ * made from within an active HTM transaction is aborted with the
+ * correct failure code.
+ * Conversely, ensure that a system call made from within a
+ * suspended transaction can succeed.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/tm.h>
+#include <sys/time.h>
+#include <stdlib.h>
+
+#include "utils.h"
+#include "tm.h"
+
+extern int getppid_tm_active(void);
+extern int getppid_tm_suspended(void);
+
+unsigned retries = 0;
+
+#define TEST_DURATION 10 /* seconds */
+#define TM_RETRIES 100
+
+/*
+ * Call getppid from inside a transaction, retrying up to TM_RETRIES times.
+ * 'suspend' selects the suspended variant over the active one. Returns the
+ * pid on success, or -1 when the failure is persistent and was caused by
+ * the system call. Any other persistent failure, or running out of
+ * retries, prints TEXASR/TFIAR and ends the process with exit(-1).
+ */
+pid_t getppid_tm(bool suspend)
+{
+ int i;
+ pid_t pid;
+
+ for (i = 0; i < TM_RETRIES; i++) {
+ if (suspend)
+ pid = getppid_tm_suspended();
+ else
+ pid = getppid_tm_active();
+
+ if (pid >= 0)
+ return pid;
+
+ if (failure_is_persistent()) {
+ if (failure_is_syscall())
+ return -1;
+
+ printf("Unexpected persistent transaction failure.\n");
+ printf("TEXASR 0x%016lx, TFIAR 0x%016lx.\n",
+ __builtin_get_texasr(), __builtin_get_tfiar());
+ exit(-1);
+ }
+
+ /* Failure was not persistent: count it and try again */
+ retries++;
+ }
+
+ printf("Exceeded limit of %d temporary transaction failures.\n", TM_RETRIES);
+ printf("TEXASR 0x%016lx, TFIAR 0x%016lx.\n",
+ __builtin_get_texasr(), __builtin_get_tfiar());
+
+ exit(-1);
+}
+
+/*
+ * Test body: for TEST_DURATION seconds, alternately check that getppid
+ * succeeds inside a suspended transaction and that it fails, persistently
+ * and with the syscall failure code, inside an active one. Returns 0 on
+ * success; the FAIL_IF checks report failure. Skipped when have_htm_nosc()
+ * is false.
+ */
+int tm_syscall(void)
+{
+ unsigned count = 0;
+ struct timeval end, now;
+
+ SKIP_IF(!have_htm_nosc());
+
+ setbuf(stdout, NULL);
+
+ printf("Testing transactional syscalls for %d seconds...\n", TEST_DURATION);
+
+ /*
+ * 'now' first serves as the duration that is added to the current
+ * time to form 'end'; the loop then refreshes it with the real time
+ * after every iteration.
+ */
+ gettimeofday(&end, NULL);
+ now.tv_sec = TEST_DURATION;
+ now.tv_usec = 0;
+ timeradd(&end, &now, &end);
+
+ for (count = 0; timercmp(&now, &end, <); count++) {
+ /*
+ * Test a syscall within a suspended transaction and verify
+ * that it succeeds.
+ */
+ FAIL_IF(getppid_tm(true) == -1); /* Should succeed. */
+
+ /*
+ * Test a syscall within an active transaction and verify that
+ * it fails with the correct failure code.
+ */
+ FAIL_IF(getppid_tm(false) != -1); /* Should fail... */
+ FAIL_IF(!failure_is_persistent()); /* ...persistently... */
+ FAIL_IF(!failure_is_syscall()); /* ...with code syscall. */
+ gettimeofday(&now, 0);
+ }
+
+ /* Both counters are unsigned, so print them with %u */
+ printf("%u active and suspended transactions behaved correctly.\n", count);
+ printf("(There were %u transaction retries.)\n", retries);
+
+ return 0;
+}
+
+/* Run tm_syscall under the harness and exit with its result. */
+int main(void)
+{
+ return test_harness(tm_syscall, "tm_syscall");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-tar.c b/tools/testing/selftests/powerpc/tm/tm-tar.c
new file mode 100644
index 000000000..03be8c472
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tar.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ * Original: Michael Neuling 19/7/2013
+ * Edited: Rashmica Gupta 01/12/2015
+ *
+ * Do some transactions, see if the tar is corrupted.
+ * If the transaction is aborted, the TAR should be rolled back to the
+ * checkpointed value before the transaction began. The value written to
+ * TAR in suspended mode should only remain in TAR if the transaction
+ * completes.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int num_loops = 10000;
+
+/*
+ * Test body: num_loops times, set TAR to 1, run a transaction that keeps
+ * writing 2 (transactionally) and 3 (suspended) to TAR, then read TAR
+ * back. The result must be 7 (committed, TAR == 3) or 9 (aborted, TAR
+ * rolled back to 1). Returns 1 on any other value, 0 otherwise. Skipped
+ * unless have_htm() and is_ppc64le() hold.
+ */
+int test_tar(void)
+{
+ int i;
+
+ SKIP_IF(!have_htm());
+ SKIP_IF(!is_ppc64le());
+
+ for (i = 0; i < num_loops; i++)
+ {
+ uint64_t result = 0;
+ asm __volatile__(
+ "li 7, 1;"
+ "mtspr %[tar], 7;" /* tar = 1 */
+ "tbegin.;"
+ "beq 3f;"
+ "li 4, 0x7000;" /* Loop lots, to use time */
+ "2:;" /* Start loop */
+ "li 7, 2;"
+ "mtspr %[tar], 7;" /* tar = 2 */
+ "tsuspend.;"
+ "li 7, 3;"
+ "mtspr %[tar], 7;" /* tar = 3 */
+ "tresume.;"
+ "subi 4, 4, 1;"
+ "cmpdi 4, 0;"
+ "bne 2b;"
+ "tend.;"
+
+ /* Transaction success! TAR should be 3 */
+ "mfspr 7, %[tar];"
+ "ori %[res], 7, 4;" // res = 3|4 = 7
+ "b 4f;"
+
+ /* Abort handler. TAR should be rolled back to 1 */
+ "3:;"
+ "mfspr 7, %[tar];"
+ "ori %[res], 7, 8;" // res = 1|8 = 9
+ "4:;"
+
+ : [res]"=r"(result)
+ : [tar]"i"(SPRN_TAR)
+ /* tbegin. and cmpdi both write CR0 */
+ : "memory", "r0", "r4", "r7", "cr0");
+
+ /* If result is anything else other than 7 or 9, the tar
+ * value must have been corrupted. */
+ if ((result != 7) && (result != 9))
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Optional first argument: "-h" prints the syntax and returns 1; anything
+ * else is converted with atoi() and replaces the default num_loops.
+ */
+int main(int argc, char *argv[])
+{
+ /* A low number of iterations (eg 100) can cause a false pass */
+ if (argc > 1) {
+ if (strcmp(argv[1], "-h") == 0) {
+ printf("Syntax:\n\t%s [<num loops>]\n",
+ argv[0]);
+ return 1;
+ } else {
+ num_loops = atoi(argv[1]);
+ }
+ }
+
+ printf("Starting, %d loops\n", num_loops);
+
+ return test_harness(test_tar, "tm_tar");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-tmspr.c b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
new file mode 100644
index 000000000..794d574db
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-tmspr.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ *
+ * Original: Michael Neuling 3/4/2014
+ * Modified: Rashmica Gupta 8/12/2015
+ *
+ * Check if any of the Transaction Memory SPRs get corrupted.
+ * - TFIAR - stores address of location of transaction failure
+ * - TFHAR - stores address of software failure handler (if transaction
+ * fails)
+ * - TEXASR - lots of info about the transaction(s)
+ *
+ * (1) create more threads than cpus
+ * (2) in each thread:
+ * (a) set TFIAR and TFHAR a unique value
+ * (b) loop for awhile, continually checking to see if
+ * either register has been corrupted.
+ *
+ * (3) Loop:
+ * (a) begin transaction
+ * (b) abort transaction
+ * (c) check TEXASR to see if FS has been corrupted
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "utils.h"
+#include "tm.h"
+
+int num_loops = 1000000;
+int passed = 1;
+
+/*
+ * Thread routine: write values derived from 'in' to TFIAR and TFHAR, then
+ * read both back num_loops times. Clears 'passed' and stops at the first
+ * mismatch.
+ *
+ * Returns void * (always NULL) so that the function has the type
+ * pthread_create() expects of a start routine; callers that ignore the
+ * result are unaffected.
+ */
+void *tfiar_tfhar(void *in)
+{
+ unsigned long tfhar, tfhar_rd, tfiar, tfiar_rd;
+ int i;
+
+ /* TFIAR: Last bit has to be high so userspace can read register */
+ tfiar = ((unsigned long)in) + 1;
+ tfiar += 2;
+ mtspr(SPRN_TFIAR, tfiar);
+
+ /* TFHAR: Last two bits are reserved */
+ tfhar = ((unsigned long)in);
+ tfhar &= ~0x3UL;
+ tfhar += 4;
+ mtspr(SPRN_TFHAR, tfhar);
+
+ for (i = 0; i < num_loops; i++) {
+ tfhar_rd = mfspr(SPRN_TFHAR);
+ tfiar_rd = mfspr(SPRN_TFIAR);
+ if ( (tfhar != tfhar_rd) || (tfiar != tfiar_rd) ) {
+ passed = 0;
+ return NULL;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Thread routine: num_loops times, begin a transaction, abort it and check
+ * that TEXASR has TEXASR_FS set. Clears 'passed' and stops at the first
+ * iteration where it is not. 'in' is unused.
+ *
+ * Returns void * (always NULL) so that the function has the type
+ * pthread_create() expects of a start routine; callers that ignore the
+ * result are unaffected.
+ */
+void *texasr(void *in)
+{
+ unsigned long i;
+ uint64_t result = 0;
+
+ for (i = 0; i < num_loops; i++) {
+ asm __volatile__(
+ "tbegin.;"
+ "beq 3f ;"
+ "tabort. 0 ;"
+ "tend.;"
+
+ /* Abort handler */
+ "3: ;"
+ /* tbegin. writes CR0, so list it as clobbered */
+ ::: "memory", "cr0");
+
+ /* Check the TEXASR */
+ result = mfspr(SPRN_TEXASR);
+ if ((result & TEXASR_FS) == 0) {
+ passed = 0;
+ return NULL;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Test body: start ten threads per online CPU. Even-numbered slots run
+ * tfiar_tfhar, odd-numbered slots run texasr, and each thread receives its
+ * slot index as argument. After joining them all, returns 0 if 'passed' is
+ * still set and 1 otherwise. Returns EXIT_FAILURE if allocation, thread
+ * creation or joining fails.
+ */
+int test_tmspr()
+{
+ pthread_t *thread;
+ int thread_num;
+ unsigned long i;
+
+ SKIP_IF(!have_htm());
+
+ /* To cause some context switching */
+ thread_num = 10 * sysconf(_SC_NPROCESSORS_ONLN);
+
+ thread = malloc(thread_num * sizeof(pthread_t));
+ if (thread == NULL)
+ return EXIT_FAILURE;
+
+ /* Test TFIAR and TFHAR */
+ for (i = 0; i < thread_num; i += 2) {
+ if (pthread_create(&thread[i], NULL, (void *)tfiar_tfhar,
+ (void *)i))
+ return EXIT_FAILURE;
+ }
+ /* Test TEXASR */
+ for (i = 1; i < thread_num; i += 2) {
+ if (pthread_create(&thread[i], NULL, (void *)texasr, (void *)i))
+ return EXIT_FAILURE;
+ }
+
+ for (i = 0; i < thread_num; i++) {
+ if (pthread_join(thread[i], NULL) != 0)
+ return EXIT_FAILURE;
+ }
+
+ free(thread);
+
+ if (passed)
+ return 0;
+ else
+ return 1;
+}
+
+/*
+ * Optional first argument: "-h" prints the syntax and returns 0; anything
+ * else is converted with atoi() and replaces the default num_loops.
+ */
+int main(int argc, char *argv[])
+{
+ if (argc > 1) {
+ if (strcmp(argv[1], "-h") == 0) {
+ /* Include the program name in the usage line */
+ printf("Syntax:\n\t%s [<num loops>]\n", argv[0]);
+ return 0;
+ } else {
+ num_loops = atoi(argv[1]);
+ }
+ }
+ return test_harness(test_tmspr, "tm_tmspr");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-trap.c b/tools/testing/selftests/powerpc/tm/tm-trap.c
new file mode 100644
index 000000000..c75960af8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-trap.c
@@ -0,0 +1,333 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Gustavo Romero, IBM Corp.
+ *
+ * Check if thread endianness is flipped inadvertently to BE on trap
+ * caught in TM whilst MSR.FP and MSR.VEC are zero (i.e. just after
+ * load_fp and load_vec overflowed).
+ *
+ * The issue can be checked on LE machines simply by zeroing load_fp
+ * and load_vec and then causing a trap in TM. Since the endianness
+ * changes to BE on return from the signal handler, 'nop' is
+ * treated as an illegal instruction in the following sequence:
+ * tbegin.
+ * beq 1f
+ * trap
+ * tend.
+ * 1: nop
+ *
+ * However, although the issue is also present on BE machines, it's a
+ * bit trickier to check it on BE machines because MSR.LE bit is set
+ * to zero which determines a BE endianness that is the native
+ * endianness on BE machines, so nothing notably critical happens,
+ * i.e. no illegal instruction is observed immediately after returning
+ * from the signal handler (as it happens on LE machines). Thus to test
+ * it on BE machines LE endianness is forced after a first trap and then
+ * the endianness is verified on subsequent traps to determine if the
+ * endianness "flipped back" to the native endianness (BE).
+ */
+
+#define _GNU_SOURCE
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <htmintrin.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+
+#include "tm.h"
+#include "utils.h"
+
+/* Report an error with file/line information; status 1 is passed to error_at_line() */
+#define pr_error(error_code, format, ...) \
+ error_at_line(1, error_code, __FILE__, __LINE__, format, ##__VA_ARGS__)
+
+/* Mask applied to the saved MSR to extract the LE bit */
+#define MSR_LE 1UL
+/* Value of the masked LE bit when the thread runs little-endian */
+#define LE 1UL
+
+/* Thread handles; ping() signals t1_pong with SIGUSR1 when it is done */
+pthread_t t0_ping;
+pthread_t t1_pong;
+
+/* Set to 1 by usr1_signal_handler(); makes pong() leave its loop */
+int exit_from_pong;
+
+/* Number of SIGTRAPs handled so far; incremented by trap_signal_handler() */
+int trap_event;
+/* Non-zero when the machine is little-endian; set in tm_trap_test() */
+int le;
+
+/* Test verdict written by ping() */
+bool success;
+
+/*
+ * SIGTRAP handler driving the test state machine.
+ *
+ * It reads the LE bit from the MSR saved in the signal context and, depending
+ * on the machine endianness ('le') and on how many traps were already seen
+ * ('trap_event'), adjusts the saved MSR and/or NIP so that execution resumes
+ * at the instruction of the sequence in ping() that encodes the verdict.
+ * 'trap_event' is incremented on every invocation.
+ */
+void trap_signal_handler(int signo, siginfo_t *si, void *uc)
+{
+ ucontext_t *ucp = uc;
+ uint64_t thread_endianness;
+
+ /* Get thread endianness: extract bit LE from MSR */
+ thread_endianness = MSR_LE & ucp->uc_mcontext.gp_regs[PT_MSR];
+
+ /***
+ * Little-Endian Machine
+ */
+
+ if (le) {
+ /* First trap event */
+ if (trap_event == 0) {
+ /* Do nothing. Since it is returning from this trap
+ * event that endianness is flipped by the bug, so just
+ * let the process return from the signal handler and
+ * check on the second trap event if endianness is
+ * flipped or not.
+ */
+ }
+ /* Second trap event */
+ else if (trap_event == 1) {
+ /*
+ * Since trap was caught in TM on first trap event, if
+ * endianness was still LE (not flipped inadvertently)
+ * after returning from the signal handler instruction
+ * (1) is executed (basically a 'nop'), as it's located
+ * at address of tbegin. +4 (rollback addr). As (1) on
+ * LE endianness does in effect nothing, instruction (2)
+ * is then executed again as 'trap', generating a second
+ * trap event (note that in that case 'trap' is caught
+ * not in transactional mode). On the other hand, if after
+ * the return from the signal handler the endianness in-
+ * advertently flipped, instruction (1) is treated as a
+ * branch instruction, i.e. b .+8, hence instruction (3)
+ * and (4) are executed (tbegin.; trap;) and we get sim-
+ * ilarly on the trap signal handler, but now in TM mode.
+ * Either way, it's now possible to check the MSR LE bit
+ * once in the trap handler to verify if endianness was
+ * flipped or not after the return from the second trap
+ * event. If endianness is flipped, the bug is present.
+ * Finally, getting a trap in TM mode or not is just
+ * worth noting because it affects the math to determine
+ * the offset added to the NIP on return: the NIP for a
+ * trap caught in TM is the rollback address, i.e. the
+ * next instruction after 'tbegin.', whilst the NIP for
+ * a trap caught in non-transactional mode is the very
+ * same address of the 'trap' instruction that generated
+ * the trap event.
+ */
+
+ if (thread_endianness == LE) {
+ /* Go to 'success', i.e. instruction (6) */
+ ucp->uc_mcontext.gp_regs[PT_NIP] += 16;
+ } else {
+ /*
+ * Thread endianness is BE, so it flipped
+ * inadvertently. Thus we flip back to LE and
+ * set NIP to go to 'failure', instruction (5).
+ */
+ ucp->uc_mcontext.gp_regs[PT_MSR] |= 1UL;
+ ucp->uc_mcontext.gp_regs[PT_NIP] += 4;
+ }
+ }
+ }
+
+ /***
+ * Big-Endian Machine
+ */
+
+ else {
+ /* First trap event */
+ if (trap_event == 0) {
+ /*
+ * Force thread endianness to be LE. Instructions (1),
+ * (3), and (4) will be executed, generating a second
+ * trap in TM mode.
+ */
+ ucp->uc_mcontext.gp_regs[PT_MSR] |= 1UL;
+ }
+ /* Second trap event */
+ else if (trap_event == 1) {
+ /*
+ * Do nothing. If bug is present on return from this
+ * second trap event endianness will flip back "automat-
+ * ically" to BE, otherwise thread endianness will
+ * continue to be LE, just as it was set above.
+ */
+ }
+ /* A third trap event */
+ else {
+ /*
+ * Once here it means that after returning from the sec-
+ * ond trap event instruction (4) (trap) was executed
+ * as LE, generating a third trap event. In that case
+ * endianness is still LE as set on return from the
+ * first trap event, hence no bug. Otherwise, the bug
+ * flipped it back to BE on return from the second trap
+ * event and instruction (4) was executed as 'tdi' (so
+ * basically a 'nop') and branch to 'failure' in
+ * instruction (5) was taken to indicate failure and we
+ * never get here.
+ */
+
+ /*
+ * Flip back to BE and go to instruction (6), i.e. go to
+ * 'success'.
+ */
+ ucp->uc_mcontext.gp_regs[PT_MSR] &= ~1UL;
+ ucp->uc_mcontext.gp_regs[PT_NIP] += 8;
+ }
+ }
+
+ /* Count this trap so the next invocation takes the next branch */
+ trap_event++;
+}
+
+/*
+ * SIGUSR1 handler. ping() sends SIGUSR1 when it has finished, so all that is
+ * left to do is raise the flag that makes pong() leave its loop. None of the
+ * arguments are used.
+ */
+void usr1_signal_handler(int signo, siginfo_t *si, void *not_used)
+{
+	(void)signo;
+	(void)si;
+	(void)not_used;
+
+	exit_from_pong = 1;
+}
+
+/*
+ * Thread that provokes the traps.
+ *
+ * It resets 'trap_event', busy-waits, then runs the instruction sequence
+ * below. trap_signal_handler() steers execution to either the 'failure' or
+ * the 'success' branch, which set the global 'success' accordingly. Before
+ * returning it sends SIGUSR1 to the pong thread. The argument is unused and
+ * the return value is always NULL.
+ */
+void *ping(void *not_used)
+{
+ uint64_t i;
+
+ trap_event = 0;
+
+ /*
+ * Wait an amount of context switches so load_fp and load_vec overflows
+ * and MSR_[FP|VEC|V] is 0.
+ */
+ for (i = 0; i < 1024*1024*512; i++)
+ ;
+
+ asm goto(
+ /*
+ * [NA] means "Native Endianness", i.e. it tells how an
+ * instruction is executed on machine's native endianness (in
+ * other words, native endianness matches kernel endianness).
+ * [OP] means "Opposite Endianness", i.e. on a BE machine, it
+ * tells how an instruction is executed as a LE instruction; con-
+ * versely, on a LE machine, it tells how an instruction is
+ * executed as a BE instruction. When [NA] is omitted, it means
+ * that the native interpretation of a given instruction is not
+ * relevant for the test. Likewise when [OP] is omitted.
+ */
+
+ " tbegin. ;" /* (0) tbegin. [NA] */
+ " tdi 0, 0, 0x48;" /* (1) nop [NA]; b (3) [OP] */
+ " trap ;" /* (2) trap [NA] */
+ ".long 0x1D05007C;" /* (3) tbegin. [OP] */
+ ".long 0x0800E07F;" /* (4) trap [OP]; nop [NA] */
+ " b %l[failure] ;" /* (5) b [NA]; MSR.LE flipped (bug) */
+ " b %l[success] ;" /* (6) b [NA]; MSR.LE did not flip (ok)*/
+
+ : : : : failure, success);
+
+failure:
+ success = false;
+ goto exit_from_ping;
+
+success:
+ success = true;
+
+exit_from_ping:
+ /* Tell pong() to exit before leaving */
+ pthread_kill(t1_pong, SIGUSR1);
+ return NULL;
+}
+
+/*
+ * Companion thread of ping(): it keeps yielding the CPU so that the ping()
+ * thread gets context switched, and returns NULL once exit_from_pong has
+ * been raised. The argument is unused.
+ */
+void *pong(void *not_used)
+{
+	for (;;) {
+		/* ping() signals us when it has finished its job */
+		if (exit_from_pong)
+			break;
+
+		/* Induce a context switch on the ping() thread */
+		sched_yield();
+	}
+
+	return NULL;
+}
+
+/*
+ * Test body: install the SIGTRAP and SIGUSR1 handlers, bind the ping() and
+ * pong() threads to one online CPU, run them and report the verdict that
+ * ping() left in 'success'.
+ *
+ * Returns EXIT_SUCCESS when endianness did not flip, EXIT_FAILURE when it
+ * did. Skips when HTM is not available. pthread setup errors are reported
+ * through pr_error().
+ */
+int tm_trap_test(void)
+{
+	uint16_t k = 1;
+	int cpu, rc;
+
+	pthread_attr_t attr;
+	cpu_set_t cpuset;
+
+	struct sigaction trap_sa;
+	struct sigaction usr1_sa;
+
+	SKIP_IF(!have_htm());
+
+	/*
+	 * The sigaction structs live on the stack, so sa_mask has to be
+	 * initialised explicitly; otherwise an arbitrary set of signals
+	 * would be blocked while the handlers run.
+	 */
+	sigemptyset(&trap_sa.sa_mask);
+	trap_sa.sa_flags = SA_SIGINFO;
+	trap_sa.sa_sigaction = trap_signal_handler;
+	FAIL_IF(sigaction(SIGTRAP, &trap_sa, NULL));
+
+	sigemptyset(&usr1_sa.sa_mask);
+	usr1_sa.sa_flags = SA_SIGINFO;
+	usr1_sa.sa_sigaction = usr1_signal_handler;
+	FAIL_IF(sigaction(SIGUSR1, &usr1_sa, NULL));
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+
+	// Set only one CPU in the mask. Both threads will be bound to that CPU.
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	/* Init pthread attribute */
+	rc = pthread_attr_init(&attr);
+	if (rc)
+		pr_error(rc, "pthread_attr_init()");
+
+	/*
+	 * Bind thread ping() and pong() both to the CPU picked above so they
+	 * ping-pong and speed up context switches on ping() thread, speeding
+	 * up the load_fp and load_vec overflow.
+	 */
+	rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+	if (rc)
+		pr_error(rc, "pthread_attr_setaffinity()");
+
+	/* Figure out the machine endianness */
+	le = (int) *(uint8_t *)&k;
+
+	printf("%s machine detected. Checking if endianness flips %s",
+		le ? "Little-Endian" : "Big-Endian",
+		"inadvertently on trap in TM... ");
+
+	rc = fflush(0);
+	if (rc)
+		pr_error(rc, "fflush()");
+
+	/* Reset the exit flag before any thread that may raise it exists */
+	exit_from_pong = 0;
+
+	/* Launch ping() */
+	rc = pthread_create(&t0_ping, &attr, ping, NULL);
+	if (rc)
+		pr_error(rc, "pthread_create()");
+
+	/* Launch pong() */
+	rc = pthread_create(&t1_pong, &attr, pong, NULL);
+	if (rc)
+		pr_error(rc, "pthread_create()");
+
+	rc = pthread_join(t0_ping, NULL);
+	if (rc)
+		pr_error(rc, "pthread_join()");
+
+	rc = pthread_join(t1_pong, NULL);
+	if (rc)
+		pr_error(rc, "pthread_join()");
+
+	pthread_attr_destroy(&attr);
+
+	if (success) {
+		printf("no.\n"); /* no, endianness did not flip inadvertently */
+		return EXIT_SUCCESS;
+	}
+
+	printf("yes!\n"); /* yes, endianness did flip inadvertently */
+	return EXIT_FAILURE;
+}
+
+/* Hand tm_trap_test to test_harness() and return its result; arguments are unused. */
+int main(int argc, char **argv)
+{
+ return test_harness(tm_trap_test, "tm_trap_test");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-unavailable.c b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
new file mode 100644
index 000000000..a1348a5f7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-unavailable.c
@@ -0,0 +1,410 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Gustavo Romero, Breno Leitao, Cyril Bur, IBM Corp.
+ *
+ * Force FP, VEC and VSX unavailable exception during transaction in all
+ * possible scenarios regarding the MSR.FP and MSR.VEC state, e.g. when FP
+ * is enabled and VEC is disabled, when FP is disabled and VEC is enabled, and
+ * so on. Then we check if the restored state is correctly set for the
+ * FP and VEC registers to the previous state we set just before we entered
+ * in TM, i.e. we check if it corrupts somehow the recheckpointed FP and
+ * VEC/Altivec registers on abortion due to an unavailable exception in TM.
+ * N.B. In this test we do not test all the FP/Altivec/VSX registers for
+ * corruption, but only for registers vs0 and vs32, which are respectively
+ * representatives of FP and VEC/Altivec reg sets.
+ */
+
+#define _GNU_SOURCE
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+
+#include "tm.h"
+
+#define DEBUG 0
+
+/* Unavailable exceptions to test in HTM */
+#define FP_UNA_EXCEPTION 0
+#define VEC_UNA_EXCEPTION 1
+#define VSX_UNA_EXCEPTION 2
+
+#define NUM_EXCEPTIONS 3
+/*
+ * Wrapper around error_at_line() that fills in the current file and line.
+ * The comma before ##__VA_ARGS__ is required: without it the format string
+ * is token-pasted onto the first variadic argument and any call that passes
+ * format arguments fails to compile.
+ */
+#define err_at_line(status, errnum, format, ...) \
+	error_at_line(status, errnum, __FILE__, __LINE__, format, ##__VA_ARGS__)
+
+/* pr_warn() reports and continues (status 0); pr_err() passes status 1 */
+#define pr_warn(code, format, ...) err_at_line(0, code, format, ##__VA_ARGS__)
+#define pr_err(code, format, ...) err_at_line(1, code, format, ##__VA_ARGS__)
+
+/* Parameters and result shared between the test driver and tm_una_ping() */
+struct Flags {
+ int touch_fp; /* non-zero: execute an FP instruction before the transaction */
+ int touch_vec; /* non-zero: execute a VEC instruction before the transaction */
+ int result; /* incremented on each detected problem; > 0 means failure */
+ int exception; /* one of FP_UNA_EXCEPTION, VEC_UNA_EXCEPTION, VSX_UNA_EXCEPTION */
+} flags;
+
+/*
+ * Tell whether the transaction in tm_una_ping() is expected to fail for the
+ * combination currently stored in 'flags'.
+ *
+ * No failure is expected when the facility about to be used inside the
+ * transaction was already touched beforehand. For VSX that means both FP and
+ * VEC were touched: touching VSX may still raise an exception, but since FP
+ * and VEC state are already correctly loaded the transaction is not aborted
+ * (i.e. treclaimed/trecheckpointed) and MSR.VSX is just set as 1.
+ */
+bool expecting_failure(void)
+{
+	switch (flags.exception) {
+	case FP_UNA_EXCEPTION:
+		return !flags.touch_fp;
+	case VEC_UNA_EXCEPTION:
+		return !flags.touch_vec;
+	case VSX_UNA_EXCEPTION:
+		return !(flags.touch_fp && flags.touch_vec);
+	default:
+		return true;
+	}
+}
+
+/* Check if failure occurred whilst in transaction. */
+/*
+ * Check if failure occurred whilst in transaction.
+ *
+ * 'condition_reg' is a copy of CR; its CR0 field is taken from bits 31..28.
+ * When failure handling occurs CR0 is 0b1010 (0xa), whereas a transaction
+ * that reaches 'tend.' leaves CR0 as 0b0100 (0x4). Returns true when both
+ * bits of 0xa are set in CR0.
+ */
+bool is_failure(uint64_t condition_reg)
+{
+	const uint64_t failure_bits = 0xa;
+	const uint64_t cr0 = condition_reg >> 28;
+
+	return (cr0 & failure_bits) == failure_bits;
+}
+
+/*
+ * Thread that provokes the unavailable exception selected in flags.exception
+ * inside a transaction and then checks vs0 and vs32.
+ *
+ * It loads a known pattern into vs0/vs32, busy-waits, optionally touches FP
+ * and/or VEC (flags.touch_fp / flags.touch_vec), runs the transaction and
+ * reads the registers and CR back.
+ *
+ * Returns NULL when the run is conclusive (problems are counted in
+ * flags.result) and (void *)-1 when the transaction failed for an unexpected
+ * reason, which the caller uses as a hint to retry. 'input' is not read.
+ */
+void *tm_una_ping(void *input)
+{
+
+ /*
+ * Expected values for vs0 and vs32 after a TM failure. They must never
+ * change, otherwise they got corrupted.
+ */
+ uint64_t high_vs0 = 0x5555555555555555;
+ uint64_t low_vs0 = 0xffffffffffffffff;
+ uint64_t high_vs32 = 0x5555555555555555;
+ uint64_t low_vs32 = 0xffffffffffffffff;
+
+ /* Counter for busy wait */
+ uint64_t counter = 0x1ff000000;
+
+ /*
+ * Variable to keep a copy of CR register content taken just after we
+ * leave the transactional state.
+ */
+ uint64_t cr_ = 0;
+
+ /*
+ * Wait a bit so thread can get its name "ping". This is not important
+ * to reproduce the issue but it's nice to have for systemtap debugging.
+ */
+ if (DEBUG)
+ sleep(1);
+
+ printf("If MSR.FP=%d MSR.VEC=%d: ", flags.touch_fp, flags.touch_vec);
+
+ if (flags.exception != FP_UNA_EXCEPTION &&
+ flags.exception != VEC_UNA_EXCEPTION &&
+ flags.exception != VSX_UNA_EXCEPTION) {
+ printf("No valid exception specified to test.\n");
+ return NULL;
+ }
+
+ asm (
+ /* Prepare to merge low and high. */
+ " mtvsrd 33, %[high_vs0] ;"
+ " mtvsrd 34, %[low_vs0] ;"
+
+ /*
+ * Adjust VS0 expected value after a TM failure,
+ * i.e. vs0 = 0x5555555555555555FFFFFFFFFFFFFFFF
+ */
+ " xxmrghd 0, 33, 34 ;"
+
+ /*
+ * Adjust VS32 expected value after a TM failure,
+ * i.e. vs32 = 0x5555555555555555FFFFFFFFFFFFFFFF
+ */
+ " xxmrghd 32, 33, 34 ;"
+
+ /*
+ * Wait an amount of context switches so load_fp and load_vec
+ * overflow and MSR.FP, MSR.VEC, and MSR.VSX become zero (off).
+ */
+ " mtctr %[counter] ;"
+
+ /* Decrement CTR branch if CTR non zero. */
+ "1: bdnz 1b ;"
+
+ /*
+ * Check if we want to touch FP prior to the test in order
+ * to set MSR.FP = 1 before provoking an unavailable
+ * exception in TM.
+ */
+ " cmpldi %[touch_fp], 0 ;"
+ " beq no_fp ;"
+ " fadd 10, 10, 10 ;"
+ "no_fp: ;"
+
+ /*
+ * Check if we want to touch VEC prior to the test in order
+ * to set MSR.VEC = 1 before provoking an unavailable
+ * exception in TM.
+ */
+ " cmpldi %[touch_vec], 0 ;"
+ " beq no_vec ;"
+ " vaddcuw 10, 10, 10 ;"
+ "no_vec: ;"
+
+ /*
+ * Perhaps it would be a better idea to do the
+ * compares outside transactional context and simply
+ * duplicate code.
+ */
+ " tbegin. ;"
+ " beq trans_fail ;"
+
+ /* Do we do FP Unavailable? */
+ " cmpldi %[exception], %[ex_fp] ;"
+ " bne 1f ;"
+ " fadd 10, 10, 10 ;"
+ " b done ;"
+
+ /* Do we do VEC Unavailable? */
+ "1: cmpldi %[exception], %[ex_vec] ;"
+ " bne 2f ;"
+ " vaddcuw 10, 10, 10 ;"
+ " b done ;"
+
+ /*
+ * Not FP or VEC, therefore VSX. Ensure this
+ * instruction always generates a VSX Unavailable.
+ * ISA 3.0 is tricky here.
+ * (xxmrghd will on ISA 2.07 and ISA 3.0)
+ */
+ "2: xxmrghd 10, 10, 10 ;"
+
+ "done: tend. ;"
+
+ "trans_fail: ;"
+
+ /* Give values back to C. */
+ " mfvsrd %[high_vs0], 0 ;"
+ " xxsldwi 3, 0, 0, 2 ;"
+ " mfvsrd %[low_vs0], 3 ;"
+ " mfvsrd %[high_vs32], 32 ;"
+ " xxsldwi 3, 32, 32, 2 ;"
+ " mfvsrd %[low_vs32], 3 ;"
+
+ /* Give CR back to C so that it can check what happened. */
+ " mfcr %[cr_] ;"
+
+ : [high_vs0] "+r" (high_vs0),
+ [low_vs0] "+r" (low_vs0),
+ [high_vs32] "=r" (high_vs32),
+ [low_vs32] "=r" (low_vs32),
+ [cr_] "+r" (cr_)
+ : [touch_fp] "r" (flags.touch_fp),
+ [touch_vec] "r" (flags.touch_vec),
+ [exception] "r" (flags.exception),
+ [ex_fp] "i" (FP_UNA_EXCEPTION),
+ [ex_vec] "i" (VEC_UNA_EXCEPTION),
+ [ex_vsx] "i" (VSX_UNA_EXCEPTION),
+ [counter] "r" (counter)
+
+ : "cr0", "ctr", "v10", "vs0", "vs10", "vs3", "vs32", "vs33",
+ "vs34", "fr10"
+
+ );
+
+ /*
+ * Check if we were expecting a failure and it did not occur by checking
+ * CR0 state just after we leave the transaction. Either way we check if
+ * vs0 or vs32 got corrupted.
+ */
+ if (expecting_failure() && !is_failure(cr_)) {
+ printf("\n\tExpecting the transaction to fail, %s",
+ "but it didn't\n\t");
+ flags.result++;
+ }
+
+ /* Check if we were not expecting a failure and it occurred. */
+ if (!expecting_failure() && is_failure(cr_) &&
+ !failure_is_reschedule()) {
+ printf("\n\tUnexpected transaction failure 0x%02lx\n\t",
+ failure_code());
+ return (void *) -1;
+ }
+
+ /*
+ * Check if TM failed due to the cause we were expecting. 0xda is a
+ * TM_CAUSE_FAC_UNAV cause, otherwise it's an unexpected cause, unless
+ * it was caused by a reschedule.
+ */
+ if (is_failure(cr_) && !failure_is_unavailable() &&
+ !failure_is_reschedule()) {
+ printf("\n\tUnexpected failure cause 0x%02lx\n\t",
+ failure_code());
+ return (void *) -1;
+ }
+
+ /* 0x4 is a success and 0xa is a fail. See comment in is_failure(). */
+ if (DEBUG)
+ printf("CR0: 0x%1lx ", cr_ >> 28);
+
+ /* Check FP (vs0) for the expected value. */
+ if (high_vs0 != 0x5555555555555555 || low_vs0 != 0xFFFFFFFFFFFFFFFF) {
+ printf("FP corrupted!");
+ printf(" high = %#16" PRIx64 " low = %#16" PRIx64 " ",
+ high_vs0, low_vs0);
+ flags.result++;
+ } else
+ printf("FP ok ");
+
+ /* Check VEC (vs32) for the expected value. */
+ if (high_vs32 != 0x5555555555555555 || low_vs32 != 0xFFFFFFFFFFFFFFFF) {
+ printf("VEC corrupted!");
+ printf(" high = %#16" PRIx64 " low = %#16" PRIx64,
+ high_vs32, low_vs32);
+ flags.result++;
+ } else
+ printf("VEC ok");
+
+ putchar('\n');
+
+ return NULL;
+}
+
+/* Thread to force context switch */
+/*
+ * Thread to force context switches: it yields the CPU in an endless loop and
+ * never returns. The argument is unused.
+ */
+void *tm_una_pong(void *not_used)
+{
+	/* Wait thread get its name "pong". */
+	if (DEBUG)
+		sleep(1);
+
+	/* Classed as an interactive-like thread. */
+	for (;;)
+		sched_yield();
+}
+
+/* Function that creates a thread and launches the "ping" task. */
+/*
+ * Run tm_una_ping() in its own thread for one FP/VEC "touched" combination.
+ *
+ * fp, vec: values stored in flags.touch_fp / flags.touch_vec.
+ * attr:    thread attributes (carrying the CPU affinity) for the new thread.
+ *
+ * The run is attempted at most twice. flags.result is set to 1 only when the
+ * last attempt still ended inconclusively (non-NULL thread return value); a
+ * retry that succeeds does not count as a failure.
+ */
+void test_fp_vec(int fp, int vec, pthread_attr_t *attr)
+{
+	int retries = 2;
+	void *ret_value;
+	pthread_t t0;
+
+	flags.touch_fp = fp;
+	flags.touch_vec = vec;
+
+	/*
+	 * Without luck it's possible that the transaction is aborted not due to
+	 * the unavailable exception caught in the middle as we expect but also,
+	 * for instance, due to a context switch or due to a KVM reschedule (if
+	 * it's running on a VM). Thus we try a few times before giving up,
+	 * checking if the failure cause is the one we expect.
+	 */
+	do {
+		int rc;
+
+		/* Bind to the CPU specified in 'attr'. */
+		rc = pthread_create(&t0, attr, tm_una_ping, (void *) &flags);
+		if (rc)
+			pr_err(rc, "pthread_create()");
+		rc = pthread_setname_np(t0, "tm_una_ping");
+		if (rc)
+			pr_warn(rc, "pthread_setname_np");
+		rc = pthread_join(t0, &ret_value);
+		if (rc)
+			pr_err(rc, "pthread_join");
+
+		retries--;
+	} while (ret_value != NULL && retries);
+
+	/*
+	 * Judge by the outcome of the last attempt, not by the retry counter:
+	 * the counter also reaches zero when the final retry succeeded.
+	 */
+	if (ret_value != NULL) {
+		flags.result = 1;
+		if (DEBUG)
+			printf("All transactions failed unexpectedly\n");
+
+	}
+}
+
+/*
+ * Test body: start the pong thread, then for each unavailable exception
+ * (FP, VEC, VSX) run tm_una_ping() for all four FP/VEC "touched"
+ * combinations on the same CPU.
+ *
+ * The function does not return normally: it calls exit(1) when flags.result
+ * is positive and exit(0) otherwise. The pong thread loops forever and is
+ * never joined, so leaving through exit() also ends it.
+ */
+int tm_unavailable_test(void)
+{
+	int cpu, rc, exception; /* FP = 0, VEC = 1, VSX = 2 */
+	pthread_t t1;
+	pthread_attr_t attr;
+	cpu_set_t cpuset;
+
+	SKIP_IF(!have_htm());
+
+	cpu = pick_online_cpu();
+	FAIL_IF(cpu < 0);
+
+	// Set only one CPU in the mask. Both threads will be bound to that CPU.
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+
+	/* Init pthread attribute. */
+	rc = pthread_attr_init(&attr);
+	if (rc)
+		pr_err(rc, "pthread_attr_init()");
+
+	/* Set the single-CPU mask into the pthread attribute. */
+	rc = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+	if (rc)
+		pr_err(rc, "pthread_attr_setaffinity_np()");
+
+	/* Bound to the picked CPU through 'attr' */
+	rc = pthread_create(&t1, &attr, tm_una_pong, NULL);
+	if (rc)
+		pr_err(rc, "pthread_create()");
+
+	/* Name it for systemtap convenience */
+	rc = pthread_setname_np(t1, "tm_una_pong");
+	if (rc)
+		pr_warn(rc, "pthread_setname_np()");
+
+	flags.result = 0;
+
+	for (exception = 0; exception < NUM_EXCEPTIONS; exception++) {
+		printf("Checking if FP/VEC registers are sane after");
+
+		if (exception == FP_UNA_EXCEPTION)
+			printf(" a FP unavailable exception...\n");
+
+		else if (exception == VEC_UNA_EXCEPTION)
+			printf(" a VEC unavailable exception...\n");
+
+		else
+			printf(" a VSX unavailable exception...\n");
+
+		flags.exception = exception;
+
+		test_fp_vec(0, 0, &attr);
+		test_fp_vec(1, 0, &attr);
+		test_fp_vec(0, 1, &attr);
+		test_fp_vec(1, 1, &attr);
+
+	}
+
+	if (flags.result > 0) {
+		printf("result: failed!\n");
+		exit(1);
+	} else {
+		printf("result: success\n");
+		exit(0);
+	}
+}
+
+/*
+ * Set the harness timeout to 220 (presumably seconds - confirm against the
+ * harness), then hand tm_unavailable_test to test_harness() and return its
+ * result. Arguments are unused.
+ */
+int main(int argc, char **argv)
+{
+ test_harness_set_timeout(220);
+ return test_harness(tm_unavailable_test, "tm_unavailable_test");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c
new file mode 100644
index 000000000..e2a0c07e8
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-vmx-unavail.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Michael Neuling, IBM Corp.
+ * Original: Breno Leitao <brenohl@br.ibm.com> &
+ * Gustavo Bueno Romero <gromero@br.ibm.com>
+ * Edited: Michael Neuling
+ *
+ * Force VMX unavailable during a transaction and see if it corrupts
+ * the checkpointed VMX register state after the abort.
+ */
+
+#include <inttypes.h>
+#include <htmintrin.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <pthread.h>
+
+#include "tm.h"
+#include "utils.h"
+
+int passed;
+
+/*
+ * Thread body: load a non-zero value into VMX register 0, busy-wait, then
+ * execute a VMX instruction inside a transaction.
+ *
+ * If the transaction aborts, VMX0 is compared against the value stored in
+ * memory; on a mismatch TEXASR details are printed and the global 'passed'
+ * is cleared. Always returns NULL; the argument is unused.
+ */
+void *worker(void *unused)
+{
+ __int128 vmx0;
+ uint64_t texasr;
+
+ asm goto (
+ "li 3, 1;" /* Stick non-zero value in VMX0 */
+ "std 3, 0(%[vmx0_ptr]);"
+ "lvx 0, 0, %[vmx0_ptr];"
+
+ /* Wait here a bit so we get scheduled out 255 times */
+ "lis 3, 0x3fff;"
+ "1: ;"
+ "addi 3, 3, -1;"
+ "cmpdi 3, 0;"
+ "bne 1b;"
+
+ /* Kernel will hopefully turn VMX off now */
+
+ "tbegin. ;"
+ "beq failure;"
+
+ /* Cause VMX unavail. Any VMX instruction */
+ "vaddcuw 0,0,0;"
+
+ "tend. ;"
+ "b %l[success];"
+
+ /* Check VMX0 sanity after abort */
+ "failure: ;"
+ "lvx 1, 0, %[vmx0_ptr];"
+ "vcmpequb. 2, 0, 1;"
+ "bc 4, 24, %l[value_mismatch];"
+ "b %l[value_match];"
+ :
+ : [vmx0_ptr] "r"(&vmx0)
+ : "r3"
+ : success, value_match, value_mismatch
+ );
+
+ /* HTM aborted and VMX0 is corrupted */
+value_mismatch:
+ texasr = __builtin_get_texasr();
+
+ printf("\n\n==============\n\n");
+ printf("Failure with error: %lx\n", _TEXASR_FAILURE_CODE(texasr));
+ printf("Summary error : %lx\n", _TEXASR_FAILURE_SUMMARY(texasr));
+ printf("TFIAR exact : %lx\n\n", _TEXASR_TFIAR_EXACT(texasr));
+
+ passed = 0;
+ return NULL;
+
+ /* HTM aborted but VMX0 is correct */
+value_match:
+// printf("!");
+ return NULL;
+
+ /* Transaction committed without abort */
+success:
+// printf(".");
+ return NULL;
+}
+
+/*
+ * Test body: run four worker() threads per online CPU and wait for them.
+ *
+ * Returns EXIT_SUCCESS when no worker cleared 'passed' and every thread
+ * could be created and joined, EXIT_FAILURE otherwise. Only threads that
+ * were actually created are joined.
+ */
+int tm_vmx_unavail_test()
+{
+	int threads, created, i;
+	pthread_t *thread;
+
+	SKIP_IF(!have_htm());
+
+	passed = 1;
+
+	threads = sysconf(_SC_NPROCESSORS_ONLN) * 4;
+	thread = malloc(sizeof(pthread_t)*threads);
+	if (!thread)
+		return EXIT_FAILURE;
+
+	for (created = 0; created < threads; created++) {
+		if (pthread_create(&thread[created], NULL, &worker, NULL)) {
+			/* Stop here: the remaining slots hold no valid thread */
+			passed = 0;
+			break;
+		}
+	}
+
+	for (i = 0; i < created; i++) {
+		if (pthread_join(thread[i], NULL))
+			passed = 0;
+	}
+
+	free(thread);
+
+	return passed ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+
+/* Hand tm_vmx_unavail_test to test_harness() and return its result; arguments are unused. */
+int main(int argc, char **argv)
+{
+ return test_harness(tm_vmx_unavail_test, "tm_vmx_unavail_test");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
new file mode 100644
index 000000000..c1e788a6d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-vmxcopy.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2015, Michael Neuling, IBM Corp.
+ *
+ * Original: Michael Neuling 4/12/2013
+ * Edited: Rashmica Gupta 4/12/2015
+ *
+ * See if the altivec state is leaked out of an aborted transaction due to
+ * kernel vmx copy loops.
+ *
+ * When the transaction aborts, VSR values should rollback to the values
+ * they held before the transaction commenced. Using VSRs while transaction
+ * is suspended should not affect the checkpointed values.
+ *
+ * (1) write A to a VSR
+ * (2) start transaction
+ * (3) suspend transaction
+ * (4) change the VSR to B
+ * (5) trigger kernel vmx copy loop
+ * (6) abort transaction
+ * (7) check that the VSR value is A
+ */
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <assert.h>
+
+#include "tm.h"
+#include "utils.h"
+
+/*
+ * Check that a VSR changed while a transaction is suspended rolls back to
+ * its checkpointed value when the transaction aborts.
+ *
+ * A 16-page temporary file is created, unlinked and mapped privately; the
+ * store into the mapping inside the suspended transaction is what is meant
+ * to trigger the kernel vmx copy loop.
+ *
+ * Returns 0 on success and 1 when the transaction aborted and the value read
+ * back from VSR 40 differs from the one loaded before. The mapping and the
+ * file descriptor are released on both paths. Skips without HTM or when not
+ * running on ppc64le.
+ */
+int test_vmxcopy()
+{
+	long double vecin = 1.3;
+	long double vecout;
+	unsigned long pgsize = getpagesize();
+	int i;
+	int fd;
+	int rc = 0;
+	int size = pgsize*16;
+	char tmpfile[] = "/tmp/page_faultXXXXXX";
+	char buf[pgsize];
+	char *a;
+	uint64_t aborted = 0;
+
+	SKIP_IF(!have_htm());
+	SKIP_IF(!is_ppc64le());
+
+	fd = mkstemp(tmpfile);
+	assert(fd >= 0);
+
+	memset(buf, 0, pgsize);
+	for (i = 0; i < size; i += pgsize) {
+		/* Keep the write outside assert() so NDEBUG cannot drop it */
+		ssize_t written = write(fd, buf, pgsize);
+
+		assert(written == pgsize);
+	}
+
+	unlink(tmpfile);
+
+	a = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+	assert(a != MAP_FAILED);
+
+	asm __volatile__(
+		"lxvd2x 40,0,%[vecinptr];" /* set 40 to initial value*/
+		"tbegin.;"
+		"beq 3f;"
+		"tsuspend.;"
+		"xxlxor 40,40,40;" /* set 40 to 0 */
+		"std 5, 0(%[map]);" /* cause kernel vmx copy page */
+		"tabort. 0;"
+		"tresume.;"
+		"tend.;"
+		"li %[res], 0;"
+		"b 5f;"
+
+		/* Abort handler */
+		"3:;"
+		"li %[res], 1;"
+
+		"5:;"
+		"stxvd2x 40,0,%[vecoutptr];"
+		: [res]"=&r"(aborted)
+		: [vecinptr]"r"(&vecin),
+		  [vecoutptr]"r"(&vecout),
+		  [map]"r"(a)
+		: "memory", "r0", "r3", "r4", "r5", "r6", "r7");
+
+	if (aborted && (vecin != vecout)){
+		printf("FAILED: vector state leaked on abort %f != %f\n",
+		       (double)vecin, (double)vecout);
+		rc = 1;
+	}
+
+	/* Release the mapping and the descriptor on success and failure alike */
+	munmap(a, size);
+
+	close(fd);
+
+	return rc;
+}
+
+/* Hand test_vmxcopy to test_harness() under the name "tm_vmxcopy" and return its result. */
+int main(void)
+{
+ return test_harness(test_vmxcopy, "tm_vmxcopy");
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm.h b/tools/testing/selftests/powerpc/tm/tm.h
new file mode 100644
index 000000000..c5a1e5c16
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2015, Michael Ellerman, IBM Corp.
+ */
+
+#ifndef _SELFTESTS_POWERPC_TM_TM_H
+#define _SELFTESTS_POWERPC_TM_TM_H
+
+#include <stdbool.h>
+#include <asm/tm.h>
+
+#include "utils.h"
+
+/*
+ * Return whether AT_HWCAP2 advertises PPC_FEATURE2_HTM. When the macro is
+ * not defined at build time a message is printed and false is returned.
+ */
+static inline bool have_htm(void)
+{
+#ifdef PPC_FEATURE2_HTM
+ return have_hwcap2(PPC_FEATURE2_HTM);
+#else
+ printf("PPC_FEATURE2_HTM not defined, can't check AT_HWCAP2\n");
+ return false;
+#endif
+}
+
+/*
+ * Same check as have_htm() for PPC_FEATURE2_HTM_NOSC: prints a message and
+ * returns false when the macro is not defined at build time.
+ */
+static inline bool have_htm_nosc(void)
+{
+#ifdef PPC_FEATURE2_HTM_NOSC
+ return have_hwcap2(PPC_FEATURE2_HTM_NOSC);
+#else
+ printf("PPC_FEATURE2_HTM_NOSC not defined, can't check AT_HWCAP2\n");
+ return false;
+#endif
+}
+
+/* Return the upper TEXASR word shifted right by 24, i.e. the failure code field. */
+static inline long failure_code(void)
+{
+ return __builtin_get_texasru() >> 24;
+}
+
+/* True when every bit of TM_CAUSE_PERSISTENT is set in the failure code. */
+static inline bool failure_is_persistent(void)
+{
+ return (failure_code() & TM_CAUSE_PERSISTENT) == TM_CAUSE_PERSISTENT;
+}
+
+/* True when every bit of TM_CAUSE_SYSCALL is set in the failure code. */
+static inline bool failure_is_syscall(void)
+{
+ return (failure_code() & TM_CAUSE_SYSCALL) == TM_CAUSE_SYSCALL;
+}
+
+/* True when every bit of TM_CAUSE_FAC_UNAV is set in the failure code. */
+static inline bool failure_is_unavailable(void)
+{
+ return (failure_code() & TM_CAUSE_FAC_UNAV) == TM_CAUSE_FAC_UNAV;
+}
+
+/*
+ * True when the failure code carries all bits of TM_CAUSE_RESCHED,
+ * TM_CAUSE_KVM_RESCHED or TM_CAUSE_KVM_FAC_UNAV. The causes are checked in
+ * that order and the failure code is re-read for each check.
+ */
+static inline bool failure_is_reschedule(void)
+{
+	if ((failure_code() & TM_CAUSE_RESCHED) == TM_CAUSE_RESCHED)
+		return true;
+
+	if ((failure_code() & TM_CAUSE_KVM_RESCHED) == TM_CAUSE_KVM_RESCHED)
+		return true;
+
+	return (failure_code() & TM_CAUSE_KVM_FAC_UNAV) == TM_CAUSE_KVM_FAC_UNAV;
+}
+
+/* True when bit 0x400000 of the upper TEXASR word is set. */
+static inline bool failure_is_nesting(void)
+{
+ return (__builtin_get_texasru() & 0x400000);
+}
+
+/*
+ * Execute 'tcheck 0' and return the resulting 4-bit CR0 field.
+ *
+ * tcheck only updates CR0, so the condition register has to be copied into
+ * 'cr' with mfcr before it can be inspected; CR0 is then found in bits
+ * 31..28. The whole field is returned because the tcheck_*() helpers test
+ * the 0x8, 0x4 and 0x2 bits of the result.
+ */
+static inline int tcheck(void)
+{
+	long cr;
+
+	asm volatile ("tcheck 0; mfcr %0" : "=r"(cr) : : "cr0");
+	return (cr >> 28) & 0xf;
+}
+
+/* True when bit 0x8 of the tcheck() result is set. */
+static inline bool tcheck_doomed(void)
+{
+ return tcheck() & 8;
+}
+
+/* True when bit 0x4 of the tcheck() result is set. */
+static inline bool tcheck_active(void)
+{
+ return tcheck() & 4;
+}
+
+/* True when bit 0x2 of the tcheck() result is set. */
+static inline bool tcheck_suspended(void)
+{
+ return tcheck() & 2;
+}
+
+/* True when either the 0x4 (active) or the 0x2 (suspended) bit is set. */
+static inline bool tcheck_transactional(void)
+{
+ return tcheck() & 6;
+}
+
+#endif /* _SELFTESTS_POWERPC_TM_TM_H */
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
new file mode 100644
index 000000000..1f36ee1a9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -0,0 +1,303 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2013-2015, Michael Ellerman, IBM Corp.
+ */
+
+#define _GNU_SOURCE /* For CPU_ZERO etc. */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <asm/unistd.h>
+#include <linux/limits.h>
+
+#include "utils.h"
+
+static char auxv[4096];
+
+/*
+ * Read /proc/self/auxv into 'buf'.
+ *
+ * buf:      destination buffer.
+ * buf_size: size of 'buf' in bytes.
+ *
+ * Returns 0 on success, -errno when the file cannot be opened, -EIO when the
+ * read fails and -EOVERFLOW when the read filled the whole buffer, in which
+ * case the vector may be truncated and cannot be walked safely.
+ */
+int read_auxv(char *buf, ssize_t buf_size)
+{
+	ssize_t num;
+	int rc, fd;
+
+	fd = open("/proc/self/auxv", O_RDONLY);
+	if (fd == -1) {
+		/* Save errno first: perror() is allowed to modify it */
+		rc = -errno;
+		perror("open");
+		return rc;
+	}
+
+	num = read(fd, buf, buf_size);
+	if (num < 0) {
+		perror("read");
+		rc = -EIO;
+		goto out;
+	}
+
+	/*
+	 * read() never returns more than buf_size, so a completely filled
+	 * buffer is the only observable sign of truncation.
+	 */
+	if (num >= buf_size) {
+		printf("overflowed auxv buffer\n");
+		rc = -EOVERFLOW;
+		goto out;
+	}
+
+	rc = 0;
+out:
+	close(fd);
+	return rc;
+}
+
+/*
+ * Walk the auxiliary vector stored in 'auxv' and return a pointer to the
+ * first entry whose a_type equals 'type', or NULL when the AT_NULL
+ * terminator is reached first.
+ */
+void *find_auxv_entry(int type, char *auxv)
+{
+	ElfW(auxv_t) *entry;
+
+	for (entry = (ElfW(auxv_t) *)auxv; entry->a_type != AT_NULL; entry++) {
+		if (entry->a_type == type)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Re-read the process's auxiliary vector into the file-level 'auxv' buffer
+ * and return the value of the entry of the given type, cast to a pointer.
+ * Returns NULL when the vector cannot be read or holds no such entry.
+ */
+void *get_auxv_entry(int type)
+{
+	ElfW(auxv_t) *entry = NULL;
+
+	if (read_auxv(auxv, sizeof(auxv)) == 0)
+		entry = find_auxv_entry(type, auxv);
+
+	return entry ? (void *)entry->a_un.a_val : NULL;
+}
+
+/*
+ * Pick a CPU from the calling thread's affinity mask.
+ *
+ * CPUs 8, 16, 24, ... are tried first (a primary thread is preferred, but
+ * CPU 0 is skipped); failing that, the mask is searched from the highest CPU
+ * number downwards. Returns the chosen CPU number, or -1 when the mask
+ * cannot be allocated or queried, or when it contains no CPU.
+ */
+int pick_online_cpu(void)
+{
+	int nr_cpus, candidate, chosen = -1;
+	cpu_set_t *set;
+	size_t set_size;
+
+	nr_cpus = get_nprocs_conf();
+	set_size = CPU_ALLOC_SIZE(nr_cpus);
+	set = CPU_ALLOC(nr_cpus);
+	if (!set) {
+		perror("malloc");
+		return -1;
+	}
+
+	CPU_ZERO_S(set_size, set);
+
+	if (sched_getaffinity(0, set_size, set)) {
+		perror("sched_getaffinity");
+	} else {
+		/* We prefer a primary thread, but skip 0 */
+		for (candidate = 8; chosen < 0 && candidate < nr_cpus; candidate += 8) {
+			if (CPU_ISSET_S(candidate, set_size, set))
+				chosen = candidate;
+		}
+
+		/* Search for anything, but in reverse */
+		for (candidate = nr_cpus - 1; chosen < 0 && candidate >= 0; candidate--) {
+			if (CPU_ISSET_S(candidate, set_size, set))
+				chosen = candidate;
+		}
+
+		if (chosen < 0)
+			printf("No cpus in affinity mask?!\n");
+	}
+
+	CPU_FREE(set);
+	return chosen;
+}
+
+/*
+ * Return true when uname() reports the machine as "ppc64le". errno is
+ * cleared before the call; if uname() fails the error is printed and false
+ * is returned.
+ */
+bool is_ppc64le(void)
+{
+	struct utsname info;
+
+	errno = 0;
+	if (uname(&info) != 0) {
+		perror("uname");
+		return false;
+	}
+
+	return !strcmp(info.machine, "ppc64le");
+}
+
+/*
+ * Read up to 'result_size' bytes of the sysfs file "/sys/<fpath>" into
+ * 'result'. The data is not NUL-terminated by this function.
+ *
+ * Returns 0 on success, -1 when the file cannot be opened and the negative
+ * read() result when reading fails.
+ */
+int read_sysfs_file(char *fpath, char *result, size_t result_size)
+{
+	char full_path[PATH_MAX];
+	int fd, nread;
+
+	/* Over-long paths are truncated to fit the buffer */
+	snprintf(full_path, sizeof(full_path), "/sys/%s", fpath);
+
+	fd = open(full_path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	nread = read(fd, result, result_size);
+	close(fd);
+
+	return nread < 0 ? nread : 0;
+}
+
+/*
+ * Read the debugfs file "/sys/kernel/debug/<debugfs_file>" and store its
+ * contents, converted with atoi(), in '*result'.
+ *
+ * Returns 0 on success, -1 when the file cannot be opened and the negative
+ * read() result when reading fails; '*result' is left untouched on failure.
+ * The descriptor is closed on every path.
+ */
+int read_debugfs_file(char *debugfs_file, int *result)
+{
+	int rc = -1, fd;
+	char path[PATH_MAX];
+	char value[16];
+
+	strcpy(path, "/sys/kernel/debug/");
+	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
+
+	if ((fd = open(path, O_RDONLY)) < 0)
+		return rc;
+
+	/* Leave room for the terminator added below */
+	rc = read(fd, value, sizeof(value) - 1);
+	close(fd);
+	if (rc < 0)
+		return rc;
+
+	/* Terminate right after the bytes read so atoi() sees no stale data */
+	value[rc] = 0;
+	*result = atoi(value);
+
+	return 0;
+}
+
+/*
+ * Write 'result', formatted as a decimal string, to the debugfs file
+ * "/sys/kernel/debug/<debugfs_file>".
+ *
+ * Returns 0 on success, -1 when the file cannot be opened and the negative
+ * write() result when writing fails. The descriptor is closed on every path.
+ */
+int write_debugfs_file(char *debugfs_file, int result)
+{
+	int rc = -1, fd;
+	char path[PATH_MAX];
+	char value[16];
+
+	strcpy(path, "/sys/kernel/debug/");
+	strncat(path, debugfs_file, PATH_MAX - strlen(path) - 1);
+
+	if ((fd = open(path, O_WRONLY)) < 0)
+		return rc;
+
+	snprintf(value, 16, "%d", result);
+
+	rc = write(fd, value, strlen(value));
+	close(fd);
+	if (rc < 0)
+		return rc;
+
+	return 0;
+}
+
+/* Thin wrapper that issues the perf_event_open system call via syscall(). */
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+ int cpu, int group_fd, unsigned long flags)
+{
+ return syscall(__NR_perf_event_open, hw_event, pid, cpu,
+ group_fd, flags);
+}
+
+/*
+ * Zero '*event_attr' and fill it in for an event of the given type/config:
+ * group read format, initially disabled, and with the kernel, hypervisor and
+ * guest exclusion flags set.
+ */
+static void perf_event_attr_init(struct perf_event_attr *event_attr,
+ unsigned int type,
+ unsigned long config)
+{
+ memset(event_attr, 0, sizeof(*event_attr));
+
+ event_attr->type = type;
+ event_attr->size = sizeof(struct perf_event_attr);
+ event_attr->config = config;
+ event_attr->read_format = PERF_FORMAT_GROUP;
+ event_attr->disabled = 1;
+ event_attr->exclude_kernel = 1;
+ event_attr->exclude_hv = 1;
+ event_attr->exclude_guest = 1;
+}
+
+/*
+ * Open a perf counter of the given type/config for the calling process
+ * (pid 0, cpu -1) in the group identified by 'group_fd'.
+ *
+ * Returns the descriptor from perf_event_open(); when it is negative an
+ * error message is printed first.
+ */
+int perf_event_open_counter(unsigned int type,
+			    unsigned long config, int group_fd)
+{
+	struct perf_event_attr attr;
+	int counter_fd;
+
+	perf_event_attr_init(&attr, type, config);
+
+	counter_fd = perf_event_open(&attr, 0, -1, group_fd, 0);
+	if (counter_fd >= 0)
+		return counter_fd;
+
+	perror("perf_event_open() failed");
+	return counter_fd;
+}
+
+/*
+ * Issue PERF_EVENT_IOC_ENABLE with PERF_IOC_FLAG_GROUP on 'fd'.
+ * Returns 0 on success, -1 (after printing an error) when the ioctl fails.
+ */
+int perf_event_enable(int fd)
+{
+ if (ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ perror("error while enabling perf events");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Issue PERF_EVENT_IOC_DISABLE with PERF_IOC_FLAG_GROUP on 'fd'.
+ * Returns 0 on success, -1 (after printing an error) when the ioctl fails.
+ */
+int perf_event_disable(int fd)
+{
+ if (ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP) == -1) {
+ perror("error disabling perf events");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Issue PERF_EVENT_IOC_RESET with PERF_IOC_FLAG_GROUP on 'fd'.
+ * Returns 0 on success, -1 (after printing an error) when the ioctl fails.
+ */
+int perf_event_reset(int fd)
+{
+ if (ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP) == -1) {
+ perror("error resetting perf events");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Scan /proc/cpuinfo line by line to find out which MMU is in use.
+ *
+ * '*using_hash' is set to true on the first line that reports the Hash MMU
+ * or the Cell/PowerMac platforms, and to false on the first line that
+ * reports the Radix MMU. Lines are compared in full, whitespace included.
+ *
+ * Returns 0 when one of those lines was found and -1 otherwise, in which
+ * case '*using_hash' is not written.
+ */
+int using_hash_mmu(bool *using_hash)
+{
+	char line[128];
+	int rc = -1;
+	FILE *f;
+
+	f = fopen("/proc/cpuinfo", "r");
+	FAIL_IF(!f);
+
+	while (fgets(line, sizeof(line), f) != NULL) {
+		if (!strcmp(line, "MMU : Hash\n") ||
+		    !strcmp(line, "platform : Cell\n") ||
+		    !strcmp(line, "platform : PowerMac\n")) {
+			*using_hash = true;
+			rc = 0;
+			break;
+		}
+
+		if (strcmp(line, "MMU : Radix\n") == 0) {
+			*using_hash = false;
+			rc = 0;
+			break;
+		}
+	}
+
+	fclose(f);
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/vphn/.gitignore b/tools/testing/selftests/powerpc/vphn/.gitignore
new file mode 100644
index 000000000..b744aedfd
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+test-vphn
diff --git a/tools/testing/selftests/powerpc/vphn/Makefile b/tools/testing/selftests/powerpc/vphn/Makefile
new file mode 100644
index 000000000..cf65cbf33
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
+TEST_GEN_PROGS := test-vphn
+
+CFLAGS += -m64 -I$(CURDIR)
+
+top_srcdir = ../../../../..
+include ../../lib.mk
+
+$(TEST_GEN_PROGS): ../harness.c
+
diff --git a/tools/testing/selftests/powerpc/vphn/asm/lppaca.h b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h
new file mode 120000
index 000000000..942b1d009
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/asm/lppaca.h
@@ -0,0 +1 @@
+../../../../../../arch/powerpc/include/asm/lppaca.h \ No newline at end of file
diff --git a/tools/testing/selftests/powerpc/vphn/test-vphn.c b/tools/testing/selftests/powerpc/vphn/test-vphn.c
new file mode 100644
index 000000000..81d3069ff
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/test-vphn.c
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <byteswap.h>
+#include "utils.h"
+#include "subunit.h"
+
+/*
+ * Userspace stand-ins for the kernel byte-order helpers: byte-swap on a
+ * little-endian build, pass the value through unchanged otherwise.
+ *
+ * be16_to_cpup() takes a pointer; its argument is parenthesised before being
+ * dereferenced so that an expression argument (for example "p + 1") is
+ * dereferenced as a whole.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#define cpu_to_be32(x) bswap_32(x)
+#define be32_to_cpu(x) bswap_32(x)
+#define be16_to_cpup(x) bswap_16(*(x))
+#define cpu_to_be64(x) bswap_64(x)
+#else
+#define cpu_to_be32(x) (x)
+#define be32_to_cpu(x) (x)
+#define be16_to_cpup(x) (*(x))
+#define cpu_to_be64(x) (x)
+#endif
+
+#include "vphn.c"
+
+static struct test {
+ char *descr;
+ long input[VPHN_REGISTER_COUNT];
+ u32 expected[VPHN_ASSOC_BUFSIZE];
+} all_tests[] = {
+ {
+ "vphn: no data",
+ {
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000000
+ }
+ },
+ {
+ "vphn: 1 x 16-bit value",
+ {
+ 0x8001ffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000001,
+ 0x00000001
+ }
+ },
+ {
+ "vphn: 2 x 16-bit values",
+ {
+ 0x80018002ffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000002,
+ 0x00000001,
+ 0x00000002
+ }
+ },
+ {
+ "vphn: 3 x 16-bit values",
+ {
+ 0x800180028003ffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000003,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003
+ }
+ },
+ {
+ "vphn: 4 x 16-bit values",
+ {
+ 0x8001800280038004,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000004,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004
+ }
+ },
+ {
+ /* Parsing the next 16-bit value out of the next 64-bit input
+ * value.
+ */
+ "vphn: 5 x 16-bit values",
+ {
+ 0x8001800280038004,
+ 0x8005ffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ },
+ {
+ 0x00000005,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004,
+ 0x00000005
+ }
+ },
+ {
+ /* Parse at most 6 x 64-bit input values */
+ "vphn: 24 x 16-bit values",
+ {
+ 0x8001800280038004,
+ 0x8005800680078008,
+ 0x8009800a800b800c,
+ 0x800d800e800f8010,
+ 0x8011801280138014,
+ 0x8015801680178018
+ },
+ {
+ 0x00000018,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004,
+ 0x00000005,
+ 0x00000006,
+ 0x00000007,
+ 0x00000008,
+ 0x00000009,
+ 0x0000000a,
+ 0x0000000b,
+ 0x0000000c,
+ 0x0000000d,
+ 0x0000000e,
+ 0x0000000f,
+ 0x00000010,
+ 0x00000011,
+ 0x00000012,
+ 0x00000013,
+ 0x00000014,
+ 0x00000015,
+ 0x00000016,
+ 0x00000017,
+ 0x00000018
+ }
+ },
+ {
+ "vphn: 1 x 32-bit value",
+ {
+ 0x00000001ffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000001,
+ 0x00000001
+ }
+ },
+ {
+ "vphn: 2 x 32-bit values",
+ {
+ 0x0000000100000002,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000002,
+ 0x00000001,
+ 0x00000002
+ }
+ },
+ {
+ /* Parsing the next 32-bit value out of the next 64-bit input
+ * value.
+ */
+ "vphn: 3 x 32-bit values",
+ {
+ 0x0000000100000002,
+ 0x00000003ffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000003,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003
+ }
+ },
+ {
+ /* Parse at most 6 x 64-bit input values */
+ "vphn: 12 x 32-bit values",
+ {
+ 0x0000000100000002,
+ 0x0000000300000004,
+ 0x0000000500000006,
+ 0x0000000700000008,
+ 0x000000090000000a,
+ 0x0000000b0000000c
+ },
+ {
+ 0x0000000c,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004,
+ 0x00000005,
+ 0x00000006,
+ 0x00000007,
+ 0x00000008,
+ 0x00000009,
+ 0x0000000a,
+ 0x0000000b,
+ 0x0000000c
+ }
+ },
+ {
+ "vphn: 16-bit value followed by 32-bit value",
+ {
+ 0x800100000002ffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000002,
+ 0x00000001,
+ 0x00000002
+ }
+ },
+ {
+ "vphn: 32-bit value followed by 16-bit value",
+ {
+ 0x000000018002ffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000002,
+ 0x00000001,
+ 0x00000002
+ }
+ },
+ {
+ /* Parse a 32-bit value split across two consecutive 64-bit
+ * input values.
+ */
+ "vphn: 16-bit value followed by 2 x 32-bit values",
+ {
+ 0x8001000000020000,
+ 0x0003ffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000003,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004,
+ 0x00000005
+ }
+ },
+ {
+ /* The lower bits in 0x0001ffff don't get mixed up with the
+ * 0xffff terminator.
+ */
+ "vphn: 32-bit value has all ones in 16 lower bits",
+ {
+ 0x0001ffff80028003,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff,
+ 0xffffffffffffffff
+ },
+ {
+ 0x00000003,
+ 0x0001ffff,
+ 0x00000002,
+ 0x00000003
+ }
+ },
+ {
+ /* The following input doesn't follow the specification.
+ */
+ "vphn: last 32-bit value is truncated",
+ {
+ 0x0000000100000002,
+ 0x0000000300000004,
+ 0x0000000500000006,
+ 0x0000000700000008,
+ 0x000000090000000a,
+ 0x0000000b800c2bad
+ },
+ {
+ 0x0000000c,
+ 0x00000001,
+ 0x00000002,
+ 0x00000003,
+ 0x00000004,
+ 0x00000005,
+ 0x00000006,
+ 0x00000007,
+ 0x00000008,
+ 0x00000009,
+ 0x0000000a,
+ 0x0000000b,
+ 0x0000000c
+ }
+ },
+ {
+ "vphn: garbage after terminator",
+ {
+ 0xffff2bad2bad2bad,
+ 0x2bad2bad2bad2bad,
+ 0x2bad2bad2bad2bad,
+ 0x2bad2bad2bad2bad,
+ 0x2bad2bad2bad2bad,
+ 0x2bad2bad2bad2bad
+ },
+ {
+ 0x00000000
+ }
+ },
+ {
+ NULL
+ }
+};
+
+/*
+ * Run one table entry through vphn_unpack_associativity() and compare the
+ * unpacked result with the entry's expected array.
+ *
+ * Both arrays keep the element count in slot 0 and the elements themselves
+ * in slots 1..count.  Returns 0 when count and elements match, or 1 after
+ * printing a diagnostic for the first mismatch.
+ */
+static int test_one(struct test *test)
+{
+	__be32 output[VPHN_ASSOC_BUFSIZE] = { 0 };
+	int i, len;
+
+	vphn_unpack_associativity(test->input, output);
+
+	len = be32_to_cpu(output[0]);
+	if (len != test->expected[0]) {
+		printf("expected %d elements, got %d\n", test->expected[0],
+		       len);
+		return 1;
+	}
+
+	/*
+	 * The elements occupy indices 1..len inclusive, so the bound has to
+	 * be "<=": with "<" the last element was never compared.
+	 */
+	for (i = 1; i <= len; i++) {
+		u32 val = be32_to_cpu(output[i]);
+
+		if (val != test->expected[i]) {
+			printf("element #%d is 0x%x, should be 0x%x\n", i, val,
+			       test->expected[i]);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Harness callback: run the entries of all_tests[] in order and report each
+ * result through test_finish().  Stops at, and returns, the first non-zero
+ * result; returns 0 when every entry passes.
+ */
+static int test_vphn(void)
+{
+	static struct test *test;
+	int ret = 0;
+
+	/* The table ends with an entry whose descr is NULL. */
+	test = all_tests;
+	while (!ret && test->descr) {
+		ret = test_one(test);
+		test_finish(test->descr, ret);
+		test++;
+	}
+
+	return ret;
+}
+
+/* Entry point: run test_vphn() under the selftest harness as "test-vphn". */
+int main(int argc, char **argv)
+{
+	int rc;
+
+	rc = test_harness(test_vphn, "test-vphn");
+
+	return rc;
+}
diff --git a/tools/testing/selftests/powerpc/vphn/vphn.c b/tools/testing/selftests/powerpc/vphn/vphn.c
new file mode 120000
index 000000000..5b5fbddcc
--- /dev/null
+++ b/tools/testing/selftests/powerpc/vphn/vphn.c
@@ -0,0 +1 @@
+../../../../../arch/powerpc/platforms/pseries/vphn.c \ No newline at end of file
diff --git a/tools/testing/selftests/prctl/.gitignore b/tools/testing/selftests/prctl/.gitignore
new file mode 100644
index 000000000..91af2b631
--- /dev/null
+++ b/tools/testing/selftests/prctl/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+disable-tsc-ctxt-sw-stress-test
+disable-tsc-on-off-stress-test
+disable-tsc-test
diff --git a/tools/testing/selftests/prctl/Makefile b/tools/testing/selftests/prctl/Makefile
new file mode 100644
index 000000000..c7923b205
--- /dev/null
+++ b/tools/testing/selftests/prctl/Makefile
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0
+ifndef CROSS_COMPILE
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+ifeq ($(ARCH),x86)
+TEST_PROGS := disable-tsc-ctxt-sw-stress-test disable-tsc-on-off-stress-test \
+ disable-tsc-test
+all: $(TEST_PROGS)
+
+include ../lib.mk
+
+clean:
+ rm -fr $(TEST_PROGS)
+endif
+endif
diff --git a/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c b/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c
new file mode 100644
index 000000000..62a93cc61
--- /dev/null
+++ b/tools/testing/selftests/prctl/disable-tsc-ctxt-sw-stress-test.c
@@ -0,0 +1,98 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * at context switches
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+/*
+ * Execute the RDTSC instruction and return its result as one 64-bit value,
+ * built from the high half (EDX) and the low half (EAX).
+ */
+static uint64_t rdtsc(void)
+{
+	uint32_t eax, edx;
+	uint64_t tsc;
+
+	/* We cannot use "=A", since this would use %rax on x86_64 */
+	__asm__ __volatile__ ("rdtsc" : "=a" (eax), "=d" (edx));
+
+	tsc = edx;
+	tsc <<= 32;
+	tsc |= eax;
+
+	return tsc;
+}
+
+/* SIGSEGV handler installed by segvtask(): deliberately does nothing. */
+static void sigsegv_expect(int sig)
+{
+	(void)sig;
+}
+
+/*
+ * Child body for the "TSC disabled" case.
+ *
+ * Requests PR_TSC_SIGSEGV, installs the do-nothing SIGSEGV handler, arms a
+ * 10 second alarm and then executes rdtsc().  Getting past rdtsc() is
+ * reported as an error.  Every exit path uses status 0.
+ */
+static void segvtask(void)
+{
+	int rc = prctl(PR_SET_TSC, PR_TSC_SIGSEGV);
+
+	if (rc < 0) {
+		perror("prctl");
+		exit(0);
+	}
+
+	signal(SIGSEGV, sigsegv_expect);
+	alarm(10);
+
+	rdtsc();
+
+	fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+	exit(0);
+}
+
+
+/*
+ * SIGSEGV handler installed by rdtsctask(): a fault while the TSC is enabled
+ * is an error, so report it and exit (with status 0).
+ */
+static void sigsegv_fail(int sig)
+{
+	(void)sig;
+
+	fputs("FATAL ERROR, rdtsc() failed while enabled\n", stderr);
+	exit(0);
+}
+
+/*
+ * Child body for the "TSC enabled" case.
+ *
+ * Requests PR_TSC_ENABLE, installs sigsegv_fail() for SIGSEGV, arms a
+ * 10 second alarm and then executes rdtsc() in an endless loop.  If the
+ * prctl() fails, the error is printed and the child exits with status 0.
+ */
+static void rdtsctask(void)
+{
+	if (prctl(PR_SET_TSC, PR_TSC_ENABLE) >= 0) {
+		signal(SIGSEGV, sigsegv_fail);
+		alarm(10);
+
+		/* The loop has no exit of its own. */
+		while (1)
+			rdtsc();
+	}
+
+	perror("prctl");
+	exit(0);
+}
+
+
+/*
+ * Fork 100 children, alternating between rdtsctask() (even index) and
+ * segvtask() (odd index), then wait() once per child and exit with status 0.
+ * The return value of fork() is only compared against 0, and the children's
+ * exit statuses are not inspected.
+ */
+int main(void)
+{
+	const int n_tasks = 100;
+	int spawned, reaped;
+
+	fprintf(stderr, "[No further output means we're allright]\n");
+
+	for (spawned = 0; spawned < n_tasks; spawned++) {
+		if (fork() != 0)
+			continue;
+
+		/* Child. */
+		if (spawned % 2)
+			segvtask();
+		else
+			rdtsctask();
+	}
+
+	for (reaped = 0; reaped < n_tasks; reaped++)
+		wait(NULL);
+
+	exit(0);
+}
+
diff --git a/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c b/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c
new file mode 100644
index 000000000..79950f9a2
--- /dev/null
+++ b/tools/testing/selftests/prctl/disable-tsc-on-off-stress-test.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Tests if the control register is updated correctly
+ * when set with prctl()
+ *
+ * Warning: this test will cause a very high load for a few seconds
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+#include <wait.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+/* snippet from wikipedia :-) */
+
+/*
+ * Execute the RDTSC instruction and return its result as one 64-bit value,
+ * built from the high half (EDX) and the low half (EAX).
+ */
+static uint64_t rdtsc(void)
+{
+	uint32_t eax, edx;
+	uint64_t tsc;
+
+	/* We cannot use "=A", since this would use %rax on x86_64 */
+	__asm__ __volatile__ ("rdtsc" : "=a" (eax), "=d" (edx));
+
+	tsc = edx;
+	tsc <<= 32;
+	tsc |= eax;
+
+	return tsc;
+}
+
+/*
+ * Non-zero while the next rdtsc() is expected to fault.
+ *
+ * Written by both task() and the SIGSEGV handler.  It has to be volatile:
+ * task() re-reads it right after an inline asm that has no memory clobber,
+ * so an optimising compiler could otherwise reuse the value task() has just
+ * stored instead of the one the handler wrote.
+ */
+volatile sig_atomic_t should_segv = 0;
+
+/*
+ * SIGSEGV handler for task().
+ *
+ * A fault while should_segv is clear is an error.  Otherwise the TSC is
+ * switched back to PR_TSC_ENABLE, the flag is cleared and rdtsc() is
+ * executed once from inside the handler.  Error paths exit with status 0.
+ */
+static void sigsegv_cb(int sig)
+{
+	(void)sig;
+
+	if (should_segv == 0) {
+		fprintf(stderr, "FATAL ERROR, rdtsc() failed while enabled\n");
+		exit(0);
+	}
+
+	if (prctl(PR_SET_TSC, PR_TSC_ENABLE) < 0) {
+		perror("prctl");
+		exit(0);
+	}
+
+	should_segv = 0;
+	rdtsc();
+}
+
+/*
+ * Child body: after arming a 10 second alarm, repeat forever
+ *   1. rdtsc()
+ *   2. fail if should_segv is still set afterwards
+ *   3. switch the TSC to PR_TSC_SIGSEGV and set should_segv
+ * The SIGSEGV handler is what clears should_segv again.
+ */
+static void task(void)
+{
+	signal(SIGSEGV, sigsegv_cb);
+	alarm(10);
+
+	while (1) {
+		rdtsc();
+
+		if (should_segv != 0) {
+			fprintf(stderr, "FATAL ERROR, rdtsc() succeeded while disabled\n");
+			exit(0);
+		}
+
+		if (prctl(PR_SET_TSC, PR_TSC_SIGSEGV) < 0) {
+			perror("prctl");
+			exit(0);
+		}
+
+		should_segv = 1;
+	}
+}
+
+
+/*
+ * Fork 100 children that each run task(), wait() once per child, then exit
+ * with status 0.  The children's exit statuses are not inspected.
+ */
+int main(void)
+{
+	const int n_tasks = 100;
+	int spawned, reaped;
+
+	fprintf(stderr, "[No further output means we're allright]\n");
+
+	for (spawned = 0; spawned < n_tasks; spawned++) {
+		if (fork() != 0)
+			continue;
+
+		/* Child. */
+		task();
+	}
+
+	for (reaped = 0; reaped < n_tasks; reaped++)
+		wait(NULL);
+
+	exit(0);
+}
+
diff --git a/tools/testing/selftests/prctl/disable-tsc-test.c b/tools/testing/selftests/prctl/disable-tsc-test.c
new file mode 100644
index 000000000..f84d4ee11
--- /dev/null
+++ b/tools/testing/selftests/prctl/disable-tsc-test.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests for prctl(PR_GET_TSC, ...) / prctl(PR_SET_TSC, ...)
+ *
+ * Basic test to test behaviour of PR_GET_TSC and PR_SET_TSC
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <signal.h>
+#include <inttypes.h>
+
+
+#include <sys/prctl.h>
+#include <linux/prctl.h>
+
+/* Get/set the process' ability to use the timestamp counter instruction */
+#ifndef PR_GET_TSC
+#define PR_GET_TSC 25
+#define PR_SET_TSC 26
+# define PR_TSC_ENABLE 1 /* allow the use of the timestamp counter */
+# define PR_TSC_SIGSEGV 2 /* throw a SIGSEGV instead of reading the TSC */
+#endif
+
+/* Printable names for the value read via PR_GET_TSC, indexed by that value. */
+const char *tsc_names[] = {
+	[PR_TSC_SIGSEGV] = "PR_TSC_SIGSEGV",
+	[PR_TSC_ENABLE] = "PR_TSC_ENABLE",
+	[0] = "[not set]",
+};
+
+/*
+ * Execute the RDTSC instruction and return its result as one 64-bit value,
+ * built from the high half (EDX) and the low half (EAX).
+ */
+static uint64_t rdtsc(void)
+{
+	uint32_t eax, edx;
+	uint64_t tsc;
+
+	/* We cannot use "=A", since this would use %rax on x86_64 */
+	__asm__ __volatile__ ("rdtsc" : "=a" (eax), "=d" (edx));
+
+	tsc = edx;
+	tsc <<= 32;
+	tsc |= eax;
+
+	return tsc;
+}
+
+/*
+ * SIGSEGV handler: print the current PR_GET_TSC setting, switch the TSC back
+ * to PR_TSC_ENABLE and print the "rdtsc() == " prefix again.  prctl()
+ * failures are reported with perror() and otherwise ignored.
+ */
+static void sigsegv_cb(int sig)
+{
+	int tsc_val = 0;
+	int rc;
+
+	(void)sig;
+
+	printf("[ SIG_SEGV ]\n");
+	printf("prctl(PR_GET_TSC, &tsc_val); ");
+	fflush(stdout);
+
+	rc = prctl(PR_GET_TSC, &tsc_val);
+	if (rc == -1)
+		perror("prctl");
+
+	printf("tsc_val == %s\n", tsc_names[tsc_val]);
+	printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+	fflush(stdout);
+
+	rc = prctl(PR_SET_TSC, PR_TSC_ENABLE);
+	if (rc == -1)
+		perror("prctl");
+
+	printf("rdtsc() == ");
+}
+
+/* Print the current counter as one "rdtsc() == N" line. */
+static void show_rdtsc(void)
+{
+	printf("rdtsc() == %llu\n", (unsigned long long)rdtsc());
+}
+
+/*
+ * Walk through the PR_GET_TSC / PR_SET_TSC calls, printing each step:
+ * read the TSC, query the setting, set PR_TSC_ENABLE, then set
+ * PR_TSC_SIGSEGV and read the TSC once more with sigsegv_cb() installed as
+ * the SIGSEGV handler.  prctl() failures are only reported; the program
+ * always exits with EXIT_SUCCESS.
+ */
+int main(void)
+{
+	int tsc_val = 0;
+	int rc;
+
+	signal(SIGSEGV, sigsegv_cb);
+
+	show_rdtsc();
+	printf("prctl(PR_GET_TSC, &tsc_val); ");
+	fflush(stdout);
+
+	rc = prctl(PR_GET_TSC, &tsc_val);
+	if (rc == -1)
+		perror("prctl");
+
+	printf("tsc_val == %s\n", tsc_names[tsc_val]);
+	show_rdtsc();
+	printf("prctl(PR_SET_TSC, PR_TSC_ENABLE)\n");
+	fflush(stdout);
+
+	rc = prctl(PR_SET_TSC, PR_TSC_ENABLE);
+	if (rc == -1)
+		perror("prctl");
+
+	show_rdtsc();
+	printf("prctl(PR_SET_TSC, PR_TSC_SIGSEGV)\n");
+	fflush(stdout);
+
+	rc = prctl(PR_SET_TSC, PR_TSC_SIGSEGV);
+	if (rc == -1)
+		perror("prctl");
+
+	/* The prefix is flushed before rdtsc() runs. */
+	printf("rdtsc() == ");
+	fflush(stdout);
+	printf("%llu\n", (unsigned long long)rdtsc());
+	fflush(stdout);
+
+	exit(EXIT_SUCCESS);
+}
+
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
new file mode 100644
index 000000000..bed4b5318
--- /dev/null
+++ b/tools/testing/selftests/proc/.gitignore
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/fd-001-lookup
+/fd-002-posix-eq
+/fd-003-kthread
+/proc-fsconfig-hidepid
+/proc-loadavg-001
+/proc-multiple-procfs
+/proc-pid-vm
+/proc-self-map-files-001
+/proc-self-map-files-002
+/proc-self-syscall
+/proc-self-wchan
+/proc-uptime-001
+/proc-uptime-002
+/read
+/self
+/setns-dcache
+/setns-sysvipc
+/thread-self
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
new file mode 100644
index 000000000..8be8a03d2
--- /dev/null
+++ b/tools/testing/selftests/proc/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -Wall -O2 -Wno-unused-function
+CFLAGS += -D_GNU_SOURCE
+
+TEST_GEN_PROGS :=
+TEST_GEN_PROGS += fd-001-lookup
+TEST_GEN_PROGS += fd-002-posix-eq
+TEST_GEN_PROGS += fd-003-kthread
+TEST_GEN_PROGS += proc-loadavg-001
+TEST_GEN_PROGS += proc-pid-vm
+TEST_GEN_PROGS += proc-self-map-files-001
+TEST_GEN_PROGS += proc-self-map-files-002
+TEST_GEN_PROGS += proc-self-syscall
+TEST_GEN_PROGS += proc-self-wchan
+TEST_GEN_PROGS += proc-uptime-001
+TEST_GEN_PROGS += proc-uptime-002
+TEST_GEN_PROGS += read
+TEST_GEN_PROGS += self
+TEST_GEN_PROGS += setns-dcache
+TEST_GEN_PROGS += setns-sysvipc
+TEST_GEN_PROGS += thread-self
+TEST_GEN_PROGS += proc-multiple-procfs
+TEST_GEN_PROGS += proc-fsconfig-hidepid
+
+include ../lib.mk
diff --git a/tools/testing/selftests/proc/config b/tools/testing/selftests/proc/config
new file mode 100644
index 000000000..68fbd2b35
--- /dev/null
+++ b/tools/testing/selftests/proc/config
@@ -0,0 +1 @@
+CONFIG_PROC_FS=y
diff --git a/tools/testing/selftests/proc/fd-001-lookup.c b/tools/testing/selftests/proc/fd-001-lookup.c
new file mode 100644
index 000000000..60d7948e7
--- /dev/null
+++ b/tools/testing/selftests/proc/fd-001-lookup.c
@@ -0,0 +1,168 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test /proc/*/fd lookup.
+
+#undef NDEBUG
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <limits.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "proc.h"
+
+/*
+ * Assert that @pathname exists and is a symlink.
+ *
+ * lstat(2) has more "coverage" in case non-symlink pops up somehow.
+ */
+static void test_lookup_pass(const char *pathname)
+{
+	struct stat st;
+	int rv;
+
+	memset(&st, 0, sizeof(st));
+
+	rv = lstat(pathname, &st);
+	assert(rv == 0);
+	assert(S_ISLNK(st.st_mode));
+}
+
+/* Assert that lstat() on @pathname fails with ENOENT. */
+static void test_lookup_fail(const char *pathname)
+{
+	struct stat st;
+	int rv = lstat(pathname, &st);
+
+	assert(rv == -1 && errno == ENOENT);
+}
+
+/* Expect /proc/self/fd/<n>, with n printed as a signed int, to be absent. */
+static void lookup_fail_int(int n)
+{
+	char path[64];
+
+	snprintf(path, sizeof(path), "/proc/self/fd/%d", n);
+	test_lookup_fail(path);
+}
+
+/* Expect /proc/self/fd/<n>, with n printed as unsigned, to be absent. */
+static void lookup_fail_uint(unsigned int n)
+{
+	char path[64];
+
+	snprintf(path, sizeof(path), "/proc/self/fd/%u", n);
+	test_lookup_fail(path);
+}
+
+/*
+ * Check that /proc/self/fd/@fd resolves, and that a set of near-miss and
+ * out-of-range names does not: the number with one extra byte in front or
+ * behind, negative numbers, and numbers around INT_MAX and UINT_MAX.
+ */
+static void test_lookup(unsigned int fd)
+{
+	char buf[64];
+	unsigned int c;
+	unsigned int u;
+	int i;
+
+	snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd);
+	test_lookup_pass(buf);
+
+	/* leading junk: every byte value 1..255 except '/' */
+	for (c = 1; c <= 255; c++) {
+		if (c != '/') {
+			snprintf(buf, sizeof(buf), "/proc/self/fd/%c%u", c, fd);
+			test_lookup_fail(buf);
+		}
+	}
+
+	/* trailing junk: every byte value 1..255 except '/' */
+	for (c = 1; c <= 255; c++) {
+		if (c != '/') {
+			snprintf(buf, sizeof(buf), "/proc/self/fd/%u%c", fd, c);
+			test_lookup_fail(buf);
+		}
+	}
+
+	/* The 1024 smallest ints, then -1024..-1. */
+	for (i = INT_MIN; i < INT_MIN + 1024; i++)
+		lookup_fail_int(i);
+	for (i = -1024; i < 0; i++)
+		lookup_fail_int(i);
+
+	/* Either side of INT_MAX. */
+	for (u = INT_MAX - 1024; u <= (unsigned int)INT_MAX + 1024; u++)
+		lookup_fail_uint(u);
+
+	/* Up to UINT_MAX; the loop ends when u wraps around to 0. */
+	for (u = UINT_MAX - 1024; u != 0; u++)
+		lookup_fail_uint(u);
+}
+
+/*
+ * Empty the process's descriptor table, then run test_lookup() twice: once
+ * on descriptor 0 and once on a high-numbered duplicate of it.
+ *
+ * Returns 1 if unshare() or opendir() fails; every other failure trips an
+ * assert().
+ */
+int main(void)
+{
+ struct dirent *de;
+ unsigned int fd, target_fd;
+
+ if (unshare(CLONE_FILES) == -1)
+ return 1;
+
+ /* Wipe fdtable. */
+ /*
+ * Each pass reopens /proc/self/fd, skips "." and "..", and closes at most
+ * one descriptor. The loop ends once the only entry left is the directory
+ * stream's own descriptor.
+ */
+ do {
+ DIR *d;
+
+ d = opendir("/proc/self/fd");
+ if (!d)
+ return 1;
+
+ de = xreaddir(d);
+ assert(de->d_type == DT_DIR);
+ assert(streq(de->d_name, "."));
+
+ de = xreaddir(d);
+ assert(de->d_type == DT_DIR);
+ assert(streq(de->d_name, ".."));
+next:
+ de = xreaddir(d);
+ if (de) {
+ unsigned long long fd_ull;
+ unsigned int fd;
+ char *end;
+
+ assert(de->d_type == DT_LNK);
+
+ /* The entry name has to be a plain number that fits in unsigned int. */
+ fd_ull = xstrtoull(de->d_name, &end);
+ assert(*end == '\0');
+ assert(fd_ull == (unsigned int)fd_ull);
+
+ fd = fd_ull;
+ /* Never close the descriptor backing the directory stream itself. */
+ if (fd == dirfd(d))
+ goto next;
+ close(fd);
+ }
+
+ closedir(d);
+ } while (de);
+
+ /* Now fdtable is clean. */
+
+ /* With an empty table the open() below is asserted to return descriptor 0. */
+ fd = open("/", O_PATH|O_DIRECTORY);
+ assert(fd == 0);
+ test_lookup(fd);
+ close(fd);
+
+ /* Clean again! */
+
+ fd = open("/", O_PATH|O_DIRECTORY);
+ assert(fd == 0);
+ /* Default RLIMIT_NOFILE-1 */
+ target_fd = 1023;
+ /* Halve the target until dup2() accepts it. */
+ while (target_fd > 0) {
+ if (dup2(fd, target_fd) == target_fd)
+ break;
+ target_fd /= 2;
+ }
+ assert(target_fd > 0);
+ close(fd);
+ test_lookup(target_fd);
+ close(target_fd);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/proc/fd-002-posix-eq.c b/tools/testing/selftests/proc/fd-002-posix-eq.c
new file mode 100644
index 000000000..417322ca9
--- /dev/null
+++ b/tools/testing/selftests/proc/fd-002-posix-eq.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that open(/proc/*/fd/*) opens the same file.
+#undef NDEBUG
+#include <assert.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/*
+ * Open "/" directly, then again through /proc/self/fd/N and
+ * /proc/thread-self/fd/N, and assert that all three descriptors report the
+ * same st_dev and st_ino.
+ */
+int main(void)
+{
+	struct stat st0, st1, st2;
+	int fd0, fd1, fd2;
+	char buf[64];
+	int rv;
+
+	/* The reference descriptor. */
+	fd0 = open("/", O_DIRECTORY|O_RDONLY);
+	assert(fd0 >= 0);
+
+	/* The same file through the per-process view ... */
+	snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd0);
+	fd1 = open(buf, O_RDONLY);
+	assert(fd1 >= 0);
+
+	/* ... and through the per-thread view. */
+	snprintf(buf, sizeof(buf), "/proc/thread-self/fd/%u", fd0);
+	fd2 = open(buf, O_RDONLY);
+	assert(fd2 >= 0);
+
+	rv = fstat(fd0, &st0);
+	assert(rv == 0);
+
+	rv = fstat(fd1, &st1);
+	assert(rv == 0);
+
+	rv = fstat(fd2, &st2);
+	assert(rv == 0);
+
+	assert(st0.st_dev == st1.st_dev);
+	assert(st0.st_ino == st1.st_ino);
+
+	assert(st0.st_dev == st2.st_dev);
+	assert(st0.st_ino == st2.st_ino);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/fd-003-kthread.c b/tools/testing/selftests/proc/fd-003-kthread.c
new file mode 100644
index 000000000..dc591f97b
--- /dev/null
+++ b/tools/testing/selftests/proc/fd-003-kthread.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/$KERNEL_THREAD/fd/ is empty.
+
+#undef NDEBUG
+#include <sys/syscall.h>
+#include <assert.h>
+#include <dirent.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/* Bit tested in the flags value parsed from /proc/$PID/stat. */
+#define PF_KTHREAD 0x00200000
+
+/*
+ * Test for kernel threadness atomically with openat().
+ *
+ * /proc/$PID is opened once and both "stat" and "fd" are then opened
+ * relative to that directory descriptor.
+ *
+ * Return /proc/$PID/fd descriptor if process is kernel thread.
+ * Return -1 if a process is userspace process, or if any of the opens
+ * fails.
+ */
+static int kernel_thread_fd(unsigned int pid)
+{
+	unsigned int flags = 0;
+	char buf[4096];
+	int dir_fd, fd;
+	ssize_t rv;
+
+	snprintf(buf, sizeof(buf), "/proc/%u", pid);
+	dir_fd = open(buf, O_RDONLY|O_DIRECTORY);
+	if (dir_fd == -1)
+		return -1;
+
+	/*
+	 * Believe it or not, struct task_struct::flags is directly exposed
+	 * to userspace!
+	 */
+	fd = openat(dir_fd, "stat", O_RDONLY);
+	if (fd == -1) {
+		close(dir_fd);
+		return -1;
+	}
+	rv = read(fd, buf, sizeof(buf));
+	close(fd);
+	if (0 < rv && rv <= sizeof(buf)) {
+		unsigned long long flags_ull;
+		char *p, *end;
+		int i;
+
+		/* Replace the trailing newline with the string terminator. */
+		assert(buf[rv - 1] == '\n');
+		buf[rv - 1] = '\0';
+
+		/*
+		 * Search backwards: ->comm can contain whitespace and ')'.
+		 * Cutting off the last 43 space-separated fields leaves the
+		 * flags field as the final one in the buffer.
+		 */
+		for (i = 0; i < 43; i++) {
+			p = strrchr(buf, ' ');
+			assert(p);
+			*p = '\0';
+		}
+
+		p = strrchr(buf, ' ');
+		assert(p);
+
+		flags_ull = xstrtoull(p + 1, &end);
+		assert(*end == '\0');
+		assert(flags_ull == (unsigned int)flags_ull);
+
+		flags = flags_ull;
+	}
+
+	/* flags stays 0 when the read failed, which is treated as "not a kthread". */
+	fd = -1;
+	if (flags & PF_KTHREAD) {
+		fd = openat(dir_fd, "fd", O_RDONLY|O_DIRECTORY);
+	}
+	close(dir_fd);
+	return fd;
+}
+
+/*
+ * Assert that the directory behind @fd contains exactly "." and "..", both
+ * of type DT_DIR, and nothing else.
+ *
+ * The DIR stream created here is not closed.
+ */
+static void test_readdir(int fd)
+{
+	struct dirent *de;
+	DIR *d = fdopendir(fd);
+
+	assert(d);
+
+	/* First entry. */
+	de = xreaddir(d);
+	assert(streq(de->d_name, "."));
+	assert(de->d_type == DT_DIR);
+
+	/* Second entry. */
+	de = xreaddir(d);
+	assert(streq(de->d_name, ".."));
+	assert(de->d_type == DT_DIR);
+
+	/* There must be no third one. */
+	de = xreaddir(d);
+	assert(!de);
+}
+
+/* Invoke statx through syscall(SYS_statx, ...) with the arguments unchanged. */
+static inline int sys_statx(int dirfd, const char *pathname, int flags,
+			    unsigned int mask, void *stx)
+{
+	long rc = syscall(SYS_statx, dirfd, pathname, flags, mask, stx);
+
+	return rc;
+}
+
+/*
+ * Assert that statx() on @pathname, resolved relative to @fd without
+ * following a final symlink, fails with ENOENT.
+ */
+static void test_lookup_fail(int fd, const char *pathname)
+{
+	/* Opaque buffer handed to the kernel; its contents are never read. */
+	char stx[256] __attribute__((aligned(8)));
+	int rv = sys_statx(fd, pathname, AT_SYMLINK_NOFOLLOW, 0, (void *)stx);
+
+	assert(rv == -1 && errno == ENOENT);
+}
+
+/* Expect the name <n>, printed as a signed int, to be absent under @fd. */
+static void lookup_fail_int(int fd, int n)
+{
+	char name[64];
+
+	snprintf(name, sizeof(name), "%d", n);
+	test_lookup_fail(fd, name);
+}
+
+/* Expect the name <n>, printed as unsigned, to be absent under @fd. */
+static void lookup_fail_uint(int fd, unsigned int n)
+{
+	char name[64];
+
+	snprintf(name, sizeof(name), "%u", n);
+	test_lookup_fail(fd, name);
+}
+
+/*
+ * Assert that no numeric name can be looked up in the directory behind @fd:
+ * the smallest ints, -1024..1023, and values around INT_MAX and UINT_MAX.
+ */
+static void test_lookup(int fd)
+{
+	unsigned int u;
+	int i;
+
+	for (i = INT_MIN; i < INT_MIN + 1024; i++)
+		lookup_fail_int(fd, i);
+
+	for (i = -1024; i < 1024; i++)
+		lookup_fail_int(fd, i);
+
+	for (u = INT_MAX - 1024; u < (unsigned int)INT_MAX + 1024; u++)
+		lookup_fail_uint(fd, u);
+
+	/* The loop ends when u wraps around to 0. */
+	for (u = UINT_MAX - 1024; u != 0; u++)
+		lookup_fail_uint(fd, u);
+}
+
+/*
+ * Find a kernel thread and check that its /proc/$PID/fd directory is empty
+ * and that no numeric name resolves inside it.
+ *
+ * Returns 1 when the search reaches pid 1024, 0 otherwise.
+ */
+int main(void)
+{
+	unsigned int pid;
+	int fd;
+
+	/*
+	 * Start with kthreadd (pid 2) and probe upwards.  The search gives up
+	 * once pid reaches 1024.
+	 */
+	for (pid = 2; ; pid++) {
+		fd = kernel_thread_fd(pid);
+		if (fd != -1 || pid >= 1024)
+			break;
+	}
+
+	/* EACCES if run as non-root. */
+	if (pid >= 1024)
+		return 1;
+
+	test_readdir(fd);
+	test_lookup(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-fsconfig-hidepid.c b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c
new file mode 100644
index 000000000..b9af8f537
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-fsconfig-hidepid.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <assert.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <linux/mount.h>
+#include <linux/unistd.h>
+
+/* Invoke fsopen through syscall(__NR_fsopen, ...) with the arguments unchanged. */
+static inline int fsopen(const char *fsname, unsigned int flags)
+{
+	long rc = syscall(__NR_fsopen, fsname, flags);
+
+	return rc;
+}
+
+/* Invoke fsconfig through syscall(__NR_fsconfig, ...) with the arguments unchanged. */
+static inline int fsconfig(int fd, unsigned int cmd, const char *key, const void *val, int aux)
+{
+	long rc = syscall(__NR_fsconfig, fd, cmd, key, val, aux);
+
+	return rc;
+}
+
+/*
+ * Check how the "hidepid" option of a proc filesystem context reacts to
+ * fsconfig(): a binary value must be rejected with EINVAL, while the strings
+ * "2" and "invisible" must be accepted.
+ *
+ * Every call with a side effect is made outside assert().  This file does
+ * not #undef NDEBUG, so a call placed inside assert() would disappear from
+ * an NDEBUG build.
+ */
+int main(void)
+{
+	int fsfd, ret;
+	int hidepid = 2;
+
+	fsfd = fsopen("proc", 0);
+	assert(fsfd != -1);
+
+	/* A binary blob is not a valid way to set hidepid. */
+	ret = fsconfig(fsfd, FSCONFIG_SET_BINARY, "hidepid", &hidepid, 0);
+	assert(ret == -1);
+	assert(errno == EINVAL);
+
+	/* Both the numeric and the symbolic string form are accepted. */
+	ret = fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "2", 0);
+	assert(!ret);
+
+	ret = fsconfig(fsfd, FSCONFIG_SET_STRING, "hidepid", "invisible", 0);
+	assert(!ret);
+
+	ret = close(fsfd);
+	assert(!ret);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c
new file mode 100644
index 000000000..fb4fe9188
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-loadavg-001.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that /proc/loadavg correctly reports last pid in pid namespace. */
+#include <errno.h>
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/wait.h>
+
+/*
+ * Check the last field of /proc/loadavg inside a fresh pid namespace.
+ *
+ * The parent unshares the pid namespace and forks a child. The child reads
+ * /proc/loadavg, forks and reaps a grandchild, then reads the file again;
+ * the text is expected to end in " 1\n" the first time and " 2\n" the
+ * second time.
+ *
+ * Returns 0 on success, 1 on failure, and 4 when unshare() fails with ENOSYS
+ * or EPERM.
+ */
+int main(void)
+{
+ pid_t pid;
+ int wstatus;
+
+ if (unshare(CLONE_NEWPID) == -1) {
+ /* NOTE(review): 4 is presumably the kselftest "skip" exit code - confirm. */
+ if (errno == ENOSYS || errno == EPERM)
+ return 4;
+ return 1;
+ }
+
+ pid = fork();
+ if (pid == -1)
+ return 1;
+ if (pid == 0) {
+ char buf[128], *p;
+ int fd;
+ ssize_t rv;
+
+ fd = open("/proc/loadavg" , O_RDONLY);
+ if (fd == -1)
+ return 1;
+ rv = read(fd, buf, sizeof(buf));
+ /* At least three bytes are needed for the p[-3]..p[-1] checks below. */
+ if (rv < 3)
+ return 1;
+ p = buf + rv;
+
+ /* pid 1 */
+ if (!(p[-3] == ' ' && p[-2] == '1' && p[-1] == '\n'))
+ return 1;
+
+ /* Create and reap one more process; it exits straight away. */
+ pid = fork();
+ if (pid == -1)
+ return 1;
+ if (pid == 0)
+ return 0;
+ if (waitpid(pid, NULL, 0) == -1)
+ return 1;
+
+ /* Re-read the same descriptor from the start. */
+ lseek(fd, 0, SEEK_SET);
+ rv = read(fd, buf, sizeof(buf));
+ if (rv < 3)
+ return 1;
+ p = buf + rv;
+
+ /* pid 2 */
+ if (!(p[-3] == ' ' && p[-2] == '2' && p[-1] == '\n'))
+ return 1;
+
+ return 0;
+ }
+
+ /* Parent: pass only if the child exited normally with status 0. */
+ if (waitpid(pid, &wstatus, 0) == -1)
+ return 1;
+ if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)
+ return 0;
+ return 1;
+}
diff --git a/tools/testing/selftests/proc/proc-multiple-procfs.c b/tools/testing/selftests/proc/proc-multiple-procfs.c
new file mode 100644
index 000000000..ab912ad95
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-multiple-procfs.c
@@ -0,0 +1,48 @@
+/*
+ * Copyright © 2020 Alexey Gladkov <gladkov.alexey@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/*
+ * Mount two procfs instances with different hidepid= options and check
+ * that they report different st_dev values for their "meminfo" file.
+ *
+ * Every call with a side effect is made outside assert(), so the test
+ * still does its work in a build that defines NDEBUG. Mounts and the
+ * temporary directories are released on every exit path.
+ *
+ * Returns 0 on success, 1 on any failure.
+ */
+int main(void)
+{
+	struct stat proc_st1, proc_st2;
+	char procbuff[] = "/tmp/proc.XXXXXX/meminfo";
+	char procdir1[] = "/tmp/proc.XXXXXX";
+	char procdir2[] = "/tmp/proc.XXXXXX";
+	int made1 = 0, made2 = 0;
+	int mounted1 = 0, mounted2 = 0;
+	int ret = 1;
+
+	if (mkdtemp(procdir1) == NULL) {
+		perror("mkdtemp");
+		goto out;
+	}
+	made1 = 1;
+	if (mkdtemp(procdir2) == NULL) {
+		perror("mkdtemp");
+		goto out;
+	}
+	made2 = 1;
+
+	if (mount("proc", procdir1, "proc", 0, "hidepid=1") != 0) {
+		perror("mount hidepid=1");
+		goto out;
+	}
+	mounted1 = 1;
+	if (mount("proc", procdir2, "proc", 0, "hidepid=2") != 0) {
+		perror("mount hidepid=2");
+		goto out;
+	}
+	mounted2 = 1;
+
+	/* procbuff is sized for "<procdir>/meminfo", so this never truncates. */
+	snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir1);
+	if (stat(procbuff, &proc_st1) != 0) {
+		perror(procbuff);
+		goto out;
+	}
+
+	snprintf(procbuff, sizeof(procbuff), "%s/meminfo", procdir2);
+	if (stat(procbuff, &proc_st2) != 0) {
+		perror(procbuff);
+		goto out;
+	}
+
+	if (proc_st1.st_dev == proc_st2.st_dev) {
+		fprintf(stderr, "error: both procfs mounts report the same st_dev\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	if (mounted1)
+		umount(procdir1);
+	if (mounted2)
+		umount(procdir2);
+	if (made1)
+		rmdir(procdir1);
+	if (made2)
+		rmdir(procdir2);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c
new file mode 100644
index 000000000..18a3bde8b
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-pid-vm.c
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Fork and exec tiny 1 page executable which precisely controls its VM.
+ * Test /proc/$PID/maps
+ * Test /proc/$PID/smaps
+ * Test /proc/$PID/smaps_rollup
+ * Test /proc/$PID/statm
+ *
+ * FIXME require CONFIG_TMPFS which can be disabled
+ * FIXME test other values from "smaps"
+ * FIXME support other archs
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/uio.h>
+#include <linux/kdev_t.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/* Issue execveat(2) directly through syscall(), passing all arguments through. */
+static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags)
+{
+	const long rv = syscall(SYS_execveat, dirfd, pathname, argv, envp, flags);
+
+	return rv;
+}
+
+/*
+ * Move into a new mount namespace, mark every mount private and put a
+ * fresh tmpfs on /tmp.
+ *
+ * Exits with 4 when unshare() fails with ENOSYS or EPERM, and with 1 on
+ * any other failure.
+ */
+static void make_private_tmp(void)
+{
+	if (unshare(CLONE_NEWNS) == -1)
+		exit((errno == ENOSYS || errno == EPERM) ? 4 : 1);
+
+	/* Short-circuit keeps the original order: "/" first, then "/tmp". */
+	if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1 ||
+	    mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1)
+		exit(1);
+}
+
+/* Pid of the exec'ed payload process; -1 until main() has forked it. */
+static pid_t pid = -1;
+
+/* atexit() hook: send SIGTERM to the payload process if one was started. */
+static void ate(void)
+{
+	if (pid <= 0)
+		return;
+	kill(pid, SIGTERM);
+}
+
+/*
+ * Local definition of the 64-bit ELF file header. make_exe() fills one in
+ * and writes it out verbatim as the first bytes of the generated executable.
+ */
+struct elf64_hdr {
+ uint8_t e_ident[16];
+ uint16_t e_type;
+ uint16_t e_machine;
+ uint32_t e_version;
+ uint64_t e_entry;
+ uint64_t e_phoff;
+ uint64_t e_shoff;
+ uint32_t e_flags;
+ uint16_t e_ehsize;
+ uint16_t e_phentsize;
+ uint16_t e_phnum;
+ uint16_t e_shentsize;
+ uint16_t e_shnum;
+ uint16_t e_shstrndx;
+};
+
+/*
+ * Local definition of a 64-bit ELF program header. make_exe() writes exactly
+ * one of these, directly after the file header, describing the single
+ * segment of the generated executable.
+ */
+struct elf64_phdr {
+ uint32_t p_type;
+ uint32_t p_flags;
+ uint64_t p_offset;
+ uint64_t p_vaddr;
+ uint64_t p_paddr;
+ uint64_t p_filesz;
+ uint64_t p_memsz;
+ uint64_t p_align;
+};
+
+#ifdef __x86_64__
+/* Page size assumed for the expected mapping (start VADDR, end VADDR + PAGE_SIZE). */
+#define PAGE_SIZE 4096
+/* Virtual address at which the generated executable's segment is placed. */
+#define VADDR (1UL << 32)
+/* Offset in the expected maps/smaps_rollup header line where the name field is written. */
+#define MAPS_OFFSET 73
+
+/*
+ * Byte-sequence macros used to spell out payload[] below.
+ *
+ * NOTE: from this point on "syscall" expands to two opcode bytes, so the
+ * libc syscall() function can no longer be called by name; sys_execveat()
+ * above is defined before this macro and is unaffected.
+ */
+#define syscall 0x0f, 0x05
+/* 0x48 0xbf followed by the eight bytes of x, least significant byte first. */
+#define mov_rdi(x) \
+ 0x48, 0xbf, \
+ (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
+ ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
+
+/* 0x48 0xbe followed by the eight bytes of x, least significant byte first. */
+#define mov_rsi(x) \
+ 0x48, 0xbe, \
+ (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \
+ ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff
+
+/* 0xb8 followed by the four bytes of x, least significant byte first. */
+#define mov_eax(x) \
+ 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff
+
+/*
+ * Raw machine code placed after the ELF headers by make_exe(). In order it
+ * issues syscall 11 (munmap) on the range starting one page above VADDR,
+ * issues syscall 1 (write) of one byte to fd 0 to signal the parent, and
+ * then loops on syscall 34 (pause).
+ */
+static const uint8_t payload[] = {
+ /* Casually unmap stack, vDSO and everything else. */
+ /* munmap */
+ mov_rdi(VADDR + 4096),
+ mov_rsi((1ULL << 47) - 4096 - VADDR - 4096),
+ mov_eax(11),
+ syscall,
+
+ /* Ping parent. */
+ /* write(0, &c, 1); */
+ 0x31, 0xff, /* xor edi, edi */
+ /* The byte sent comes from the code itself; the parent ignores its value. */
+ 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */
+ 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */
+ mov_eax(1),
+ syscall,
+
+ /* 1: pause(); */
+ mov_eax(34),
+ syscall,
+
+ /* 0xf7 is -9: back over mov_eax(34) (5 bytes), syscall (2) and this jmp (2). */
+ 0xeb, 0xf7, /* jmp 1b */
+};
+
+/*
+ * Build a minimal ELF executable in an unnamed file under /tmp and return
+ * a read-only, close-on-exec descriptor for it.
+ *
+ * The file consists of one ELF header, one program header and the "len"
+ * bytes of "payload"; the single segment is placed at VADDR and the entry
+ * point is the first payload byte.
+ *
+ * Exits with status 1 on any failure, so the returned descriptor is
+ * always valid.
+ */
+static int make_exe(const uint8_t *payload, size_t len)
+{
+	struct elf64_hdr h;
+	struct elf64_phdr ph;
+	/* Headers and payload are written back to back; this is the file size. */
+	const size_t total = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
+
+	struct iovec iov[3] = {
+		{&h, sizeof(struct elf64_hdr)},
+		{&ph, sizeof(struct elf64_phdr)},
+		{(void *)payload, len},
+	};
+	int fd, fd1;
+	char buf[64];
+
+	memset(&h, 0, sizeof(h));
+	h.e_ident[0] = 0x7f;
+	h.e_ident[1] = 'E';
+	h.e_ident[2] = 'L';
+	h.e_ident[3] = 'F';
+	h.e_ident[4] = 2;
+	h.e_ident[5] = 1;
+	h.e_ident[6] = 1;
+	h.e_ident[7] = 0;
+	h.e_type = 2;
+	h.e_machine = 0x3e;
+	h.e_version = 1;
+	h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr);
+	h.e_phoff = sizeof(struct elf64_hdr);
+	h.e_shoff = 0;
+	h.e_flags = 0;
+	h.e_ehsize = sizeof(struct elf64_hdr);
+	h.e_phentsize = sizeof(struct elf64_phdr);
+	h.e_phnum = 1;
+	h.e_shentsize = 0;
+	h.e_shnum = 0;
+	h.e_shstrndx = 0;
+
+	memset(&ph, 0, sizeof(ph));
+	ph.p_type = 1;
+	ph.p_flags = (1<<2)|1;
+	ph.p_offset = 0;
+	ph.p_vaddr = VADDR;
+	ph.p_paddr = 0;
+	ph.p_filesz = total;
+	ph.p_memsz = total;
+	ph.p_align = 4096;
+
+	fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
+	if (fd == -1) {
+		exit(1);
+	}
+
+	if (writev(fd, iov, 3) != total) {
+		exit(1);
+	}
+
+	/* Avoid ETXTBSY on exec. */
+	snprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd);
+	fd1 = open(buf, O_RDONLY|O_CLOEXEC);
+	close(fd);
+	/* Never hand -1 back to the caller as if it were a usable descriptor. */
+	if (fd1 == -1) {
+		exit(1);
+	}
+
+	return fd1;
+}
+#endif
+
+/* Set to true by vsyscall() when its probe child exits with status 0. */
+static bool g_vsyscall = false;
+
+/* Line that the maps/smaps checks expect for the vsyscall page when g_vsyscall is set. */
+static const char str_vsyscall[] =
+"ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]\n";
+
+#ifdef __x86_64__
+/* SIGSEGV handler installed by the vsyscall probe child: exit with status 1. */
+static void sigaction_SIGSEGV(int sig, siginfo_t *info, void *uctx)
+{
+	(void)sig;
+	(void)info;
+	(void)uctx;
+	_exit(1);
+}
+
+/*
+ * vsyscall page can't be unmapped, probe it with memory load.
+ *
+ * A child process performs the load. If it survives and exits with
+ * status 0, g_vsyscall is set to true; otherwise g_vsyscall is left
+ * untouched. Exits with status 1 if fork() fails.
+ */
+static void vsyscall(void)
+{
+	pid_t child;
+	int wstatus;
+
+	child = fork();
+	if (child < 0) {
+		fprintf(stderr, "fork, errno %d\n", errno);
+		exit(1);
+	}
+	if (child == 0) {
+		/* Core size limit 0 for the probe child. */
+		struct rlimit rlim = {0, 0};
+		(void)setrlimit(RLIMIT_CORE, &rlim);
+
+		/* Hide "segfault at ffffffffff600000" messages. */
+		struct sigaction act;
+		memset(&act, 0, sizeof(struct sigaction));
+		act.sa_flags = SA_SIGINFO;
+		act.sa_sigaction = sigaction_SIGSEGV;
+		(void)sigaction(SIGSEGV, &act, NULL);
+
+		*(volatile int *)0xffffffffff600000UL;
+		exit(0);
+	}
+
+	/* wstatus is only meaningful when waitpid() actually reaped the child. */
+	if (waitpid(child, &wstatus, 0) != child) {
+		return;
+	}
+	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0) {
+		g_vsyscall = true;
+	}
+}
+
+/*
+ * Exec the hand-built one-segment ELF in a child and compare the child's
+ * /proc/$PID/maps, smaps, smaps_rollup and statm with the text expected
+ * for its single mapping (plus the vsyscall line when g_vsyscall is set).
+ *
+ * Returns 0 on success and 1 on a setup failure; content mismatches trip
+ * assert(). make_private_tmp() and make_exe() may exit() on their own.
+ */
+int main(void)
+{
+	int pipefd[2];
+	int exec_fd;
+
+	vsyscall();
+
+	atexit(ate);
+
+	make_private_tmp();
+
+	/* Reserve fd 0 for 1-byte pipe ping from child. */
+	close(0);
+	if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) {
+		return 1;
+	}
+
+	exec_fd = make_exe(payload, sizeof(payload));
+
+	if (pipe(pipefd) == -1) {
+		return 1;
+	}
+	/* The payload writes to fd 0, so fd 0 becomes the pipe's write end. */
+	if (dup2(pipefd[1], 0) != 0) {
+		return 1;
+	}
+
+	pid = fork();
+	if (pid == -1) {
+		return 1;
+	}
+	if (pid == 0) {
+		sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH);
+		return 1;
+	}
+
+	/* Block until the payload has pinged us. */
+	char _;
+	if (read(pipefd[0], &_, 1) != 1) {
+		return 1;
+	}
+
+	struct stat st;
+	if (fstat(exec_fd, &st) == -1) {
+		return 1;
+	}
+
+	/* Generate "head -n1 /proc/$PID/maps" */
+	char buf0[256];
+	memset(buf0, ' ', sizeof(buf0));
+	int len = snprintf(buf0, sizeof(buf0),
+			"%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu",
+			VADDR, VADDR + PAGE_SIZE,
+			MAJOR(st.st_dev), MINOR(st.st_dev),
+			(unsigned long long)st.st_ino);
+	/* Overwrite snprintf's NUL so the space padding runs up to MAPS_OFFSET. */
+	buf0[len] = ' ';
+	snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET,
+		 "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino);
+
+	/* Test /proc/$PID/maps */
+	{
+		const size_t len = strlen(buf0) + (g_vsyscall ? strlen(str_vsyscall) : 0);
+		char buf[256];
+		ssize_t rv;
+		int fd;
+
+		snprintf(buf, sizeof(buf), "/proc/%u/maps", pid);
+		fd = open(buf, O_RDONLY);
+		if (fd == -1) {
+			return 1;
+		}
+		rv = read(fd, buf, sizeof(buf));
+		assert(rv == len);
+		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
+		if (g_vsyscall) {
+			assert(memcmp(buf + strlen(buf0), str_vsyscall, strlen(str_vsyscall)) == 0);
+		}
+	}
+
+	/* Test /proc/$PID/smaps */
+	{
+		char buf[4096];
+		ssize_t rv;
+		int fd;
+
+		snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid);
+		fd = open(buf, O_RDONLY);
+		if (fd == -1) {
+			return 1;
+		}
+		rv = read(fd, buf, sizeof(buf));
+		assert(0 <= rv && rv <= sizeof(buf));
+
+		assert(rv >= strlen(buf0));
+		assert(memcmp(buf, buf0, strlen(buf0)) == 0);
+
+#define RSS1 "Rss: 4 kB\n"
+#define RSS2 "Rss: 0 kB\n"
+#define PSS1 "Pss: 4 kB\n"
+#define PSS2 "Pss: 0 kB\n"
+		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
+		       memmem(buf, rv, RSS2, strlen(RSS2)));
+		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
+		       memmem(buf, rv, PSS2, strlen(PSS2)));
+
+		static const char *S[] = {
+			"Size: 4 kB\n",
+			"KernelPageSize: 4 kB\n",
+			"MMUPageSize: 4 kB\n",
+			"Anonymous: 0 kB\n",
+			"AnonHugePages: 0 kB\n",
+			"Shared_Hugetlb: 0 kB\n",
+			"Private_Hugetlb: 0 kB\n",
+			"Locked: 0 kB\n",
+		};
+		int i;
+
+		for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
+			assert(memmem(buf, rv, S[i], strlen(S[i])));
+		}
+
+		if (g_vsyscall) {
+			assert(memmem(buf, rv, str_vsyscall, strlen(str_vsyscall)));
+		}
+	}
+
+	/* Test /proc/$PID/smaps_rollup */
+	{
+		char bufr[256];
+		memset(bufr, ' ', sizeof(bufr));
+		len = snprintf(bufr, sizeof(bufr),
+				"%08lx-%08lx ---p 00000000 00:00 0",
+				VADDR, VADDR + PAGE_SIZE);
+		bufr[len] = ' ';
+		snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET,
+			 "[rollup]\n");
+
+		char buf[1024];
+		ssize_t rv;
+		int fd;
+
+		snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid);
+		fd = open(buf, O_RDONLY);
+		if (fd == -1) {
+			return 1;
+		}
+		rv = read(fd, buf, sizeof(buf));
+		assert(0 <= rv && rv <= sizeof(buf));
+
+		assert(rv >= strlen(bufr));
+		assert(memcmp(buf, bufr, strlen(bufr)) == 0);
+
+		assert(memmem(buf, rv, RSS1, strlen(RSS1)) ||
+		       memmem(buf, rv, RSS2, strlen(RSS2)));
+		assert(memmem(buf, rv, PSS1, strlen(PSS1)) ||
+		       memmem(buf, rv, PSS2, strlen(PSS2)));
+
+		static const char *S[] = {
+			"Anonymous: 0 kB\n",
+			"AnonHugePages: 0 kB\n",
+			"Shared_Hugetlb: 0 kB\n",
+			"Private_Hugetlb: 0 kB\n",
+			"Locked: 0 kB\n",
+		};
+		int i;
+
+		for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) {
+			assert(memmem(buf, rv, S[i], strlen(S[i])));
+		}
+	}
+
+	/* Test /proc/$PID/statm */
+	{
+		char buf[64];
+		ssize_t rv;
+		int fd;
+
+		snprintf(buf, sizeof(buf), "/proc/%u/statm", pid);
+		fd = open(buf, O_RDONLY);
+		if (fd == -1) {
+			return 1;
+		}
+		rv = read(fd, buf, sizeof(buf));
+		/* Seven single-digit fields, each followed by one separator byte. */
+		assert(rv == 7 * 2);
+
+		assert(buf[0] == '1');	/* ->total_vm */
+		assert(buf[1] == ' ');
+		assert(buf[2] == '0' || buf[2] == '1');	/* rss */
+		assert(buf[3] == ' ');
+		/* Both alternatives must look at field 3 (buf[4]), not at rss again. */
+		assert(buf[4] == '0' || buf[4] == '1');	/* file rss */
+		assert(buf[5] == ' ');
+		assert(buf[6] == '1');	/* ELF executable segments */
+		assert(buf[7] == ' ');
+		assert(buf[8] == '0');
+		assert(buf[9] == ' ');
+		assert(buf[10] == '0');	/* ->data_vm + ->stack_vm */
+		assert(buf[11] == ' ');
+		assert(buf[12] == '0');
+		assert(buf[13] == '\n');
+	}
+
+	return 0;
+}
+#else
+/*
+ * Built when __x86_64__ is not defined: no payload exists for this
+ * architecture, so return 4, the same code the x86_64 path uses when
+ * unshare() is unavailable.
+ */
+int main(void)
+{
+ return 4;
+}
+#endif
diff --git a/tools/testing/selftests/proc/proc-self-map-files-001.c b/tools/testing/selftests/proc/proc-self-map-files-001.c
new file mode 100644
index 000000000..4209c6428
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-self-map-files-001.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test readlink /proc/self/map_files/... */
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+
+/*
+ * Format a path from "fmt" with "a" and "b" and require that readlink()
+ * on it succeeds; exit with status 1 otherwise.
+ */
+static void pass(const char *fmt, unsigned long a, unsigned long b)
+{
+	char target[64];
+	char path[64];
+	ssize_t n;
+
+	snprintf(path, sizeof(path), fmt, a, b);
+	n = readlink(path, target, sizeof(target));
+	if (n == -1)
+		exit(1);
+}
+
+/*
+ * Format a path from "fmt" with "a" and "b" and require that readlink()
+ * on it fails with ENOENT; exit with status 1 on any other outcome.
+ */
+static void fail(const char *fmt, unsigned long a, unsigned long b)
+{
+	char target[64];
+	char path[64];
+
+	snprintf(path, sizeof(path), fmt, a, b);
+	if (readlink(path, target, sizeof(target)) != -1)
+		exit(1);
+	if (errno != ENOENT)
+		exit(1);
+}
+
+/*
+ * Map one page of /dev/zero and check that only the exact "start-end"
+ * name resolves under /proc/self/map_files; names with stray spaces,
+ * leading zeros or out-of-range prefixes must give ENOENT.
+ *
+ * Returns 0 on success, 1 on setup failure or unexpected sizeof(long).
+ */
+int main(void)
+{
+	/* Malformed names that do not depend on the width of long. */
+	static const char *const malformed[] = {
+		"/proc/self/map_files/ %lx-%lx",
+		"/proc/self/map_files/%lx -%lx",
+		"/proc/self/map_files/%lx- %lx",
+		"/proc/self/map_files/%lx-%lx ",
+		"/proc/self/map_files/0%lx-%lx",
+		"/proc/self/map_files/%lx-0%lx",
+	};
+	const unsigned int page_size = sysconf(_SC_PAGESIZE);
+	unsigned long start, end;
+	unsigned int i;
+	void *map;
+	int zero_fd;
+
+	zero_fd = open("/dev/zero", O_RDONLY);
+	if (zero_fd == -1)
+		return 1;
+
+	map = mmap(NULL, page_size, PROT_NONE, MAP_PRIVATE|MAP_FILE, zero_fd, 0);
+	if (map == MAP_FAILED)
+		return 1;
+
+	start = (unsigned long)map;
+	end = (unsigned long)map + page_size;
+
+	pass("/proc/self/map_files/%lx-%lx", start, end);
+	for (i = 0; i < sizeof(malformed) / sizeof(malformed[0]); i++)
+		fail(malformed[i], start, end);
+
+	/* Prefix chosen so the number no longer fits in an unsigned long. */
+	switch (sizeof(long)) {
+	case 4:
+		fail("/proc/self/map_files/100000000%lx-%lx", start, end);
+		fail("/proc/self/map_files/%lx-100000000%lx", start, end);
+		break;
+	case 8:
+		fail("/proc/self/map_files/10000000000000000%lx-%lx", start, end);
+		fail("/proc/self/map_files/%lx-10000000000000000%lx", start, end);
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c
new file mode 100644
index 000000000..e6aa00a18
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-self-map-files-002.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test readlink /proc/self/map_files/... with minimum address. */
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+
+/*
+ * Build a path from "fmt", "a" and "b"; exit with status 1 unless
+ * readlink() on that path succeeds.
+ */
+static void pass(const char *fmt, unsigned long a, unsigned long b)
+{
+	char link[64];
+	char path[64];
+
+	snprintf(path, sizeof(path), fmt, a, b);
+	if (readlink(path, link, sizeof(link)) != -1)
+		return;
+	exit(1);
+}
+
+/*
+ * Build a path from "fmt", "a" and "b"; exit with status 1 unless
+ * readlink() on that path fails with ENOENT.
+ */
+static void fail(const char *fmt, unsigned long a, unsigned long b)
+{
+	char link[64];
+	char path[64];
+	ssize_t n;
+
+	snprintf(path, sizeof(path), fmt, a, b);
+	n = readlink(path, link, sizeof(link));
+	if (!(n == -1 && errno == ENOENT))
+		exit(1);
+}
+
+/*
+ * Same checks as the plain map_files readlink test, but on the lowest
+ * page below va_max that mmap(MAP_FIXED) is willing to hand out.
+ *
+ * Returns 0 on success, 1 on setup failure or unexpected sizeof(long).
+ */
+int main(void)
+{
+	static const char *const malformed[] = {
+		"/proc/self/map_files/ %lx-%lx",
+		"/proc/self/map_files/%lx -%lx",
+		"/proc/self/map_files/%lx- %lx",
+		"/proc/self/map_files/%lx-%lx ",
+		"/proc/self/map_files/0%lx-%lx",
+		"/proc/self/map_files/%lx-0%lx",
+	};
+	const int PAGE_SIZE = sysconf(_SC_PAGESIZE);
+	/*
+	 * va_max must be comfortably larger than vm.mmap_min_addr, which is
+	 * 64KB/32KB by default (depends on CONFIG_LSM_MMAP_MIN_ADDR).
+	 */
+	const unsigned long va_max = 1UL << 20;
+	unsigned long va = 0;
+	unsigned long start, end;
+	unsigned int i;
+	void *map;
+	int zero_fd;
+
+	zero_fd = open("/dev/zero", O_RDONLY);
+	if (zero_fd == -1)
+		return 1;
+
+	/* Walk upwards one page at a time until a fixed mapping sticks. */
+	while (va < va_max) {
+		map = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, zero_fd, 0);
+		if (map == (void *)va)
+			break;
+		va += PAGE_SIZE;
+	}
+	if (va == va_max) {
+		fprintf(stderr, "error: mmap doesn't like you\n");
+		return 1;
+	}
+
+	start = (unsigned long)map;
+	end = (unsigned long)map + PAGE_SIZE;
+
+	pass("/proc/self/map_files/%lx-%lx", start, end);
+	for (i = 0; i < sizeof(malformed) / sizeof(malformed[0]); i++)
+		fail(malformed[i], start, end);
+
+	switch (sizeof(long)) {
+	case 4:
+		fail("/proc/self/map_files/100000000%lx-%lx", start, end);
+		fail("/proc/self/map_files/%lx-100000000%lx", start, end);
+		break;
+	case 8:
+		fail("/proc/self/map_files/10000000000000000%lx-%lx", start, end);
+		fail("/proc/self/map_files/%lx-10000000000000000%lx", start, end);
+		break;
+	default:
+		return 1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c
new file mode 100644
index 000000000..8511dcfe6
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-self-syscall.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+
+/* Issue read(2) directly through syscall(), without the libc read() wrapper. */
+static inline ssize_t sys_read(int fd, void *buf, size_t len)
+{
+	const long rv = syscall(SYS_read, fd, buf, len);
+
+	return rv;
+}
+
+/*
+ * Read /proc/self/syscall with a raw read syscall and check that the text
+ * starts with that very syscall's number and its three arguments.
+ *
+ * Returns 0 on success, 4 when the file does not exist, 1 otherwise.
+ */
+int main(void)
+{
+	char expected[64];
+	char actual[64];
+	size_t want;
+	ssize_t got;
+	int fd;
+
+	fd = open("/proc/self/syscall", O_RDONLY);
+	if (fd == -1)
+		return errno == ENOENT ? 4 : 1;
+
+	/* Do direct system call as libc can wrap anything. */
+	snprintf(expected, sizeof(expected), "%ld 0x%lx 0x%lx 0x%lx",
+		 (long)SYS_read, (long)fd, (long)actual, (long)sizeof(actual));
+
+	memset(actual, 0, sizeof(actual));
+	got = sys_read(fd, actual, sizeof(actual));
+	if (got < 0)
+		return 1;
+
+	want = strlen(expected);
+	if ((size_t)got < want)
+		return 1;
+
+	return strncmp(expected, actual, want) != 0 ? 1 : 0;
+}
diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c
new file mode 100644
index 000000000..b467b98a4
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-self-wchan.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+
+/*
+ * Check that reading /proc/self/wchan yields exactly the single byte '0'.
+ *
+ * Returns 0 on success, 4 when the file does not exist, 1 otherwise.
+ */
+int main(void)
+{
+	char buf[64];
+	ssize_t n;
+	int fd;
+
+	fd = open("/proc/self/wchan", O_RDONLY);
+	if (fd == -1)
+		return errno == ENOENT ? 4 : 1;
+
+	buf[0] = '\0';
+	n = read(fd, buf, sizeof(buf));
+	if (n != 1)
+		return 1;
+
+	return buf[0] == '0' ? 0 : 1;
+}
diff --git a/tools/testing/selftests/proc/proc-uptime-001.c b/tools/testing/selftests/proc/proc-uptime-001.c
new file mode 100644
index 000000000..781f7a50f
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-uptime-001.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that values in /proc/uptime increment monotonically.
+#undef NDEBUG
+#include <assert.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "proc-uptime.h"
+
+/*
+ * Re-read /proc/uptime in a tight loop and assert that neither value ever
+ * goes backwards. The loop ends once the uptime value returned by
+ * proc_uptime() has advanced by at least 100 since the first sample.
+ */
+int main(void)
+{
+	uint64_t first, prev_up, prev_idle, cur_up, cur_idle;
+	int fd;
+
+	fd = open("/proc/uptime", O_RDONLY);
+	assert(fd >= 0);
+
+	proc_uptime(fd, &prev_up, &prev_idle);
+	first = prev_up;
+	for (;;) {
+		proc_uptime(fd, &cur_up, &cur_idle);
+		assert(cur_up >= prev_up);
+		assert(cur_idle >= prev_idle);
+		prev_up = cur_up;
+		prev_idle = cur_idle;
+		if (cur_up - first >= 100)
+			break;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-uptime-002.c b/tools/testing/selftests/proc/proc-uptime-002.c
new file mode 100644
index 000000000..7d0aa22bd
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-uptime-002.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that values in /proc/uptime increment monotonically
+// while shifting across CPUs.
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#include "proc-uptime.h"
+
+/* Issue sched_getaffinity(2) directly through syscall() with a byte length "len". */
+static inline int sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long *m)
+{
+	const long rv = syscall(SYS_sched_getaffinity, pid, len, m);
+
+	return rv;
+}
+
+/* Issue sched_setaffinity(2) directly through syscall() with a byte length "len". */
+static inline int sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long *m)
+{
+	const long rv = syscall(SYS_sched_setaffinity, pid, len, m);
+
+	return rv;
+}
+
+/*
+ * Pin the process to each CPU number in turn and assert that the values
+ * read from /proc/uptime never go backwards from one sample to the next.
+ *
+ * Returns 0 on success; failures trip assert().
+ */
+int main(void)
+{
+	unsigned int len;
+	unsigned long *m;
+	unsigned int cpu;
+	uint64_t u0, u1, i0, i1;
+	int fd;
+
+	/*
+	 * find out "nr_cpu_ids": grow the mask one long at a time until the
+	 * kernel stops rejecting its size with EINVAL.
+	 */
+	m = NULL;
+	len = 0;
+	do {
+		len += sizeof(unsigned long);
+		free(m);
+		m = malloc(len);
+		/* Never pass a NULL mask to the syscall or to memset() below. */
+		assert(m != NULL);
+	} while (sys_sched_getaffinity(0, len, m) == -1 && errno == EINVAL);
+
+	fd = open("/proc/uptime", O_RDONLY);
+	assert(fd >= 0);
+
+	proc_uptime(fd, &u0, &i0);
+	for (cpu = 0; cpu < len * 8; cpu++) {
+		/* Mask with only bit "cpu" set. */
+		memset(m, 0, len);
+		m[cpu / (8 * sizeof(unsigned long))] |= 1UL << (cpu % (8 * sizeof(unsigned long)));
+
+		/* CPU might not exist, ignore error */
+		sys_sched_setaffinity(0, len, m);
+
+		proc_uptime(fd, &u1, &i1);
+		assert(u1 >= u0);
+		assert(i1 >= i0);
+		u0 = u1;
+		i0 = i1;
+	}
+
+	close(fd);
+	free(m);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h
new file mode 100644
index 000000000..dc6a42b1d
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-uptime.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/*
+ * Parse one "<integer>.<2 digits><term>" field starting at *pp and return
+ * integer * 100 + the two-digit fraction. On return *pp points at the '.'
+ * of the parsed field. Any deviation from the format trips assert().
+ */
+static uint64_t proc_uptime_field(char **pp, char term)
+{
+	uint64_t whole, frac;
+	char *dot;
+
+	whole = xstrtoull(*pp, &dot);
+	assert(dot[0] == '.');
+	assert('0' <= dot[1] && dot[1] <= '9');
+	assert('0' <= dot[2] && dot[2] <= '9');
+	assert(dot[3] == term);
+
+	frac = (dot[1] - '0') * 10 + dot[2] - '0';
+	*pp = dot;
+	return whole * 100 + frac;
+}
+
+/*
+ * Read the whole of "fd" from offset 0 and parse it as
+ * "<uptime>.<dd> <idle>.<dd>\n", storing both values multiplied by 100 in
+ * *uptime and *idle. Any deviation from that format trips assert().
+ */
+static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle)
+{
+	char buf[64], *cur;
+	ssize_t nread;
+
+	/* save "p < end" checks */
+	memset(buf, 0, sizeof(buf));
+	nread = pread(fd, buf, sizeof(buf), 0);
+	assert(0 <= nread && nread <= sizeof(buf));
+	buf[sizeof(buf) - 1] = '\0';
+
+	cur = buf;
+	*uptime = proc_uptime_field(&cur, ' ');
+
+	/* Step over ".dd " to the start of the idle field. */
+	cur += 4;
+	*idle = proc_uptime_field(&cur, '\n');
+
+	/* The newline after the idle field must be the last byte read. */
+	assert(cur + 4 == buf + nread);
+}
diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h
new file mode 100644
index 000000000..b7d57ea40
--- /dev/null
+++ b/tools/testing/selftests/proc/proc.h
@@ -0,0 +1,51 @@
+#pragma once
+#undef NDEBUG
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+/* Issue getpid(2) directly through syscall(), without the libc wrapper. */
+static inline pid_t sys_getpid(void)
+{
+	const long rv = syscall(SYS_getpid);
+
+	return rv;
+}
+
+/* Issue gettid(2) directly through syscall(). */
+static inline pid_t sys_gettid(void)
+{
+	const long rv = syscall(SYS_gettid);
+
+	return rv;
+}
+
+/* Return true when the two NUL-terminated strings compare equal. */
+static inline bool streq(const char *s1, const char *s2)
+{
+	return !strcmp(s1, s2);
+}
+
+/*
+ * Strict decimal parser.
+ *
+ * A leading '0' is consumed on its own: 0 is returned and *end points at
+ * the character right after it, so zero-padded numbers are never read as
+ * one value. Otherwise the text must start with '1'..'9' and is converted
+ * with strtoull(); *end is set by strtoull().
+ *
+ * Any other first character, or a strtoull() error, trips assert(). Every
+ * path now returns a value, so control never reaches the end of this
+ * non-void function.
+ */
+static unsigned long long xstrtoull(const char *p, char **end)
+{
+	unsigned long long val;
+
+	if (*p == '0') {
+		*end = (char *)p + 1;
+		return 0;
+	}
+
+	assert('1' <= *p && *p <= '9');
+
+	/* strtoull() reports errors only through errno, so clear it first. */
+	errno = 0;
+	val = strtoull(p, end, 10);
+	assert(errno == 0);
+	return val;
+}
+
+/*
+ * readdir() wrapper: returns the next entry, or NULL at end of directory.
+ * A NULL return with errno set trips assert().
+ */
+static struct dirent *xreaddir(DIR *d)
+{
+	struct dirent *entry;
+
+	/* readdir() returns NULL for both end-of-directory and error. */
+	errno = 0;
+	entry = readdir(d);
+	if (!entry)
+		assert(errno == 0);
+	return entry;
+}
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c
new file mode 100644
index 000000000..b3ef9e14d
--- /dev/null
+++ b/tools/testing/selftests/proc/read.c
@@ -0,0 +1,146 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test
+// 1) read of every file in /proc
+// 2) readlink of every symlink in /proc
+// 3) recursively (1) + (2) for every directory in /proc
+// 4) write to /proc/*/clear_refs and /proc/*/task/*/clear_refs
+// 5) write to /proc/sysrq-trigger
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/*
+ * Open "filename" relative to directory "d" and perform one read() of up
+ * to 4096 bytes. A file that cannot be opened is skipped silently.
+ */
+static void f_reg(DIR *d, const char *filename)
+{
+	char buf[4096];
+	ssize_t nread;
+	int fd;
+
+	/* read from /proc/kmsg can block */
+	fd = openat(dirfd(d), filename, O_RDONLY|O_NONBLOCK);
+	if (fd != -1) {
+		nread = read(fd, buf, sizeof(buf));
+		assert((0 <= nread && nread <= sizeof(buf)) || nread == -1);
+		close(fd);
+	}
+}
+
+/*
+ * Open "filename" relative to directory "d" for writing and perform one
+ * write() of "len" bytes from "buf". A file that cannot be opened is
+ * skipped silently.
+ */
+static void f_reg_write(DIR *d, const char *filename, const char *buf, size_t len)
+{
+	ssize_t nwritten;
+	int fd;
+
+	fd = openat(dirfd(d), filename, O_WRONLY);
+	if (fd != -1) {
+		nwritten = write(fd, buf, len);
+		assert((0 <= nwritten && nwritten <= len) || nwritten == -1);
+		close(fd);
+	}
+}
+
+/*
+ * readlinkat() "filename" relative to directory "d" and check that the
+ * result is either -1 or a length that fits the 4096-byte buffer.
+ */
+static void f_lnk(DIR *d, const char *filename)
+{
+	char target[4096];
+	const ssize_t n = readlinkat(dirfd(d), filename, target, sizeof(target));
+
+	assert((0 <= n && n <= sizeof(target)) || n == -1);
+}
+
+/*
+ * Walk directory stream "d" recursively.
+ *
+ * The first two entries must be "." and "..". After that:
+ *  - regular files are read once, except "sysrq-trigger" at level 0 and
+ *    "clear_refs" at levels 1 and 3, which get a one-byte write instead;
+ *  - symlinks are passed to readlinkat();
+ *  - directories are opened and walked with level + 1; a directory that
+ *    cannot be opened is skipped;
+ *  - any other entry type trips assert().
+ *
+ * "level" is the depth below the directory the walk started from (0 for
+ * the top-level call).
+ */
+static void f(DIR *d, unsigned int level)
+{
+	struct dirent *de;
+	DIR *dd;
+	int fd;
+
+	de = xreaddir(d);
+	assert(de->d_type == DT_DIR);
+	assert(streq(de->d_name, "."));
+
+	de = xreaddir(d);
+	assert(de->d_type == DT_DIR);
+	assert(streq(de->d_name, ".."));
+
+	while ((de = xreaddir(d))) {
+		assert(!streq(de->d_name, "."));
+		assert(!streq(de->d_name, ".."));
+
+		switch (de->d_type) {
+		case DT_REG:
+			if (level == 0 && streq(de->d_name, "sysrq-trigger")) {
+				f_reg_write(d, de->d_name, "h", 1);
+			} else if (level == 1 && streq(de->d_name, "clear_refs")) {
+				f_reg_write(d, de->d_name, "1", 1);
+			} else if (level == 3 && streq(de->d_name, "clear_refs")) {
+				f_reg_write(d, de->d_name, "1", 1);
+			} else {
+				f_reg(d, de->d_name);
+			}
+			break;
+		case DT_DIR:
+			fd = openat(dirfd(d), de->d_name, O_DIRECTORY|O_RDONLY);
+			if (fd == -1)
+				continue;
+			dd = fdopendir(fd);
+			if (!dd) {
+				/* fdopendir() failed, so fd is still ours to close. */
+				close(fd);
+				continue;
+			}
+			f(dd, level + 1);
+			/* closedir() also closes the descriptor wrapped by dd. */
+			closedir(dd);
+			break;
+		case DT_LNK:
+			f_lnk(d, de->d_name);
+			break;
+		default:
+			assert(0);
+		}
+	}
+}
+
+/*
+ * Open /proc, verify by f_type that it is really a proc filesystem, then
+ * walk it with f().
+ *
+ * Returns 0 on success, 4 if /proc cannot be opened, 1 if fstatfs()
+ * fails and 2 if the filesystem type is not the expected one.
+ */
+int main(void)
+{
+	struct statfs fs_info;
+	DIR *proc_dir;
+
+	proc_dir = opendir("/proc");
+	if (proc_dir == NULL)
+		return 4;
+
+	/* Ensure /proc is proc. */
+	if (fstatfs(dirfd(proc_dir), &fs_info) == -1)
+		return 1;
+	if (fs_info.f_type != 0x9fa0) {
+		fprintf(stderr, "error: unexpected f_type %lx\n", (long)fs_info.f_type);
+		return 2;
+	}
+
+	f(proc_dir, 0);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/self.c b/tools/testing/selftests/proc/self.c
new file mode 100644
index 000000000..21c15a1ff
--- /dev/null
+++ b/tools/testing/selftests/proc/self.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/self gives correct TGID.
+#undef NDEBUG
+#include <assert.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/*
+ * Compare the target of the /proc/self symlink with the PID obtained from
+ * sys_getpid(), formatted in decimal. Any mismatch aborts via assert().
+ */
+int main(void)
+{
+	char expected[64], actual[64];
+	pid_t self = sys_getpid();
+	ssize_t len;
+
+	snprintf(expected, sizeof(expected), "%u", self);
+
+	/* readlink() does not NUL-terminate; the length check comes first. */
+	len = readlink("/proc/self", actual, sizeof(actual));
+	assert(len == strlen(expected));
+	actual[len] = '\0';
+	assert(streq(expected, actual));
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/setns-dcache.c b/tools/testing/selftests/proc/setns-dcache.c
new file mode 100644
index 000000000..60ab197a7
--- /dev/null
+++ b/tools/testing/selftests/proc/setns-dcache.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright © 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that setns(CLONE_NEWNET) points to new /proc/net content even
+ * if old one is in dcache.
+ *
+ * FIXME /proc/net/unix is under CONFIG_UNIX which can be disabled.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+
+static pid_t pid = -1;
+
+/* atexit() hook: send SIGTERM to the child while one is recorded in @pid. */
+static void f(void)
+{
+	if (pid <= 0)
+		return;
+	kill(pid, SIGTERM);
+}
+
+/*
+ * Move into a fresh net namespace, fork a child that creates yet another
+ * one, open /proc/net/unix, then setns() into the child's namespace and
+ * check that /proc/net/unix now shows only the header line.
+ *
+ * Returns 0 on success, 4 when unshare() reports ENOSYS or EPERM, and 1 on
+ * any other setup failure. Content mismatches abort via assert().
+ */
+int main(void)
+{
+	int fd[2];
+	char _ = 0;
+	int nsfd;
+
+	atexit(f);
+
+	/* Check for privileges and syscall availability straight away. */
+	if (unshare(CLONE_NEWNET) == -1) {
+		if (errno == ENOSYS || errno == EPERM) {
+			return 4;
+		}
+		return 1;
+	}
+	/* Distinguisher between two otherwise empty net namespaces. */
+	if (socket(AF_UNIX, SOCK_STREAM, 0) == -1) {
+		return 1;
+	}
+
+	if (pipe(fd) == -1) {
+		return 1;
+	}
+
+	pid = fork();
+	if (pid == -1) {
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* The child only ever writes to the pipe. */
+		close(fd[0]);
+
+		if (unshare(CLONE_NEWNET) == -1) {
+			return 1;
+		}
+
+		/* Tell the parent that the new namespace exists. */
+		if (write(fd[1], &_, 1) != 1) {
+			return 1;
+		}
+
+		/* Keep the namespace alive until the parent kills us. */
+		pause();
+
+		return 0;
+	}
+
+	/*
+	 * Drop our copy of the write end. While it stays open, read() below can
+	 * never see EOF, so a child that exits before writing would leave this
+	 * test blocked forever instead of failing.
+	 */
+	close(fd[1]);
+
+	if (read(fd[0], &_, 1) != 1) {
+		return 1;
+	}
+
+	{
+		char buf[64];
+		snprintf(buf, sizeof(buf), "/proc/%u/ns/net", pid);
+		nsfd = open(buf, O_RDONLY);
+		if (nsfd == -1) {
+			return 1;
+		}
+	}
+
+	/* Reliably pin dentry into dcache. */
+	(void)open("/proc/net/unix", O_RDONLY);
+
+	if (setns(nsfd, CLONE_NEWNET) == -1) {
+		return 1;
+	}
+
+	kill(pid, SIGTERM);
+	/* Child already signalled: stop the atexit() hook from doing it again. */
+	pid = 0;
+
+	{
+		char buf[4096];
+		ssize_t rv;
+		int procfd;
+
+		procfd = open("/proc/net/unix", O_RDONLY);
+		if (procfd == -1) {
+			return 1;
+		}
+
+#define S "Num RefCount Protocol Flags Type St Inode Path\n"
+		rv = read(procfd, buf, sizeof(buf));
+
+		/* Only the header may be present: no socket was created there. */
+		assert(rv == strlen(S));
+		assert(memcmp(buf, S, strlen(S)) == 0);
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/setns-sysvipc.c b/tools/testing/selftests/proc/setns-sysvipc.c
new file mode 100644
index 000000000..903890c5e
--- /dev/null
+++ b/tools/testing/selftests/proc/setns-sysvipc.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright © 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/*
+ * Test that setns(CLONE_NEWIPC) points to new /proc/sysvipc content even
+ * if old one is in dcache.
+ */
+#undef NDEBUG
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+
+static pid_t pid = -1;
+
+/* atexit() hook: send SIGTERM to the child while one is recorded in @pid. */
+static void f(void)
+{
+	if (pid <= 0)
+		return;
+	kill(pid, SIGTERM);
+}
+
+/*
+ * Move into a fresh IPC namespace, fork a child that creates yet another
+ * one, open /proc/sysvipc/shm, then setns() into the child's namespace and
+ * check that /proc/sysvipc/shm now shows only a header line.
+ *
+ * Returns 0 on success, 4 when unshare() reports ENOSYS or EPERM, and 1 on
+ * any other setup failure. Content mismatches abort via assert().
+ */
+int main(void)
+{
+	int fd[2];
+	char _ = 0;
+	int nsfd;
+
+	atexit(f);
+
+	/* Check for privileges and syscall availability straight away. */
+	if (unshare(CLONE_NEWIPC) == -1) {
+		if (errno == ENOSYS || errno == EPERM) {
+			return 4;
+		}
+		return 1;
+	}
+	/* Distinguisher between two otherwise empty IPC namespaces. */
+	if (shmget(IPC_PRIVATE, 1, IPC_CREAT) == -1) {
+		return 1;
+	}
+
+	if (pipe(fd) == -1) {
+		return 1;
+	}
+
+	pid = fork();
+	if (pid == -1) {
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* The child only ever writes to the pipe. */
+		close(fd[0]);
+
+		if (unshare(CLONE_NEWIPC) == -1) {
+			return 1;
+		}
+
+		/* Tell the parent that the new namespace exists. */
+		if (write(fd[1], &_, 1) != 1) {
+			return 1;
+		}
+
+		/* Keep the namespace alive until the parent kills us. */
+		pause();
+
+		return 0;
+	}
+
+	/*
+	 * Drop our copy of the write end. While it stays open, read() below can
+	 * never see EOF, so a child that exits before writing would leave this
+	 * test blocked forever instead of failing.
+	 */
+	close(fd[1]);
+
+	if (read(fd[0], &_, 1) != 1) {
+		return 1;
+	}
+
+	{
+		char buf[64];
+		snprintf(buf, sizeof(buf), "/proc/%u/ns/ipc", pid);
+		nsfd = open(buf, O_RDONLY);
+		if (nsfd == -1) {
+			return 1;
+		}
+	}
+
+	/* Reliably pin dentry into dcache. */
+	(void)open("/proc/sysvipc/shm", O_RDONLY);
+
+	if (setns(nsfd, CLONE_NEWIPC) == -1) {
+		return 1;
+	}
+
+	kill(pid, SIGTERM);
+	/* Child already signalled: stop the atexit() hook from doing it again. */
+	pid = 0;
+
+	{
+		char buf[4096];
+		ssize_t rv;
+		int procfd;
+
+		procfd = open("/proc/sysvipc/shm", O_RDONLY);
+		if (procfd == -1) {
+			return 1;
+		}
+
+#define S32 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n"
+#define S64 " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime rss swap\n"
+		rv = read(procfd, buf, sizeof(buf));
+		/* Accept either header variant, but nothing beyond the header. */
+		if (rv == strlen(S32)) {
+			assert(memcmp(buf, S32, strlen(S32)) == 0);
+		} else if (rv == strlen(S64)) {
+			assert(memcmp(buf, S64, strlen(S64)) == 0);
+		} else {
+			assert(0);
+		}
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/proc/thread-self.c b/tools/testing/selftests/proc/thread-self.c
new file mode 100644
index 000000000..4b23b39b7
--- /dev/null
+++ b/tools/testing/selftests/proc/thread-self.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright © 2018 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+// Test that /proc/thread-self gives correct TGID/PID.
+#undef NDEBUG
+#include <assert.h>
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#include "proc.h"
+
+/*
+ * Check that /proc/thread-self resolves to "<pid>/task/<tid>" for the
+ * calling thread, using the values from sys_getpid() and sys_gettid().
+ *
+ * With a non-NULL @arg the process exits with status 0 after a successful
+ * check; with a NULL @arg the function returns 0. A mismatch aborts via
+ * assert().
+ */
+int f(void *arg)
+{
+	char want[64], got[64];
+	pid_t tgid = sys_getpid();
+	pid_t tid = sys_gettid();
+	ssize_t n;
+
+	snprintf(want, sizeof(want), "%u/task/%u", tgid, tid);
+
+	/* readlink() does not NUL-terminate; the length check comes first. */
+	n = readlink("/proc/thread-self", got, sizeof(got));
+	assert(n == strlen(want));
+	got[n] = '\0';
+	assert(streq(want, got));
+
+	if (!arg)
+		return 0;
+	exit(0);
+}
+
+/*
+ * Run the /proc/thread-self check twice: once on the main thread and once
+ * on a second thread created with clone().
+ */
+int main(void)
+{
+	const int page_size = sysconf(_SC_PAGESIZE);
+	void *stack;
+	pid_t tid;
+
+	/* main thread */
+	f((void *)0);
+
+	/* side thread, running on a freshly mapped two-page region */
+	stack = mmap(NULL, 2 * page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	assert(stack != MAP_FAILED);
+	tid = clone(f, stack + page_size, CLONE_THREAD|CLONE_SIGHAND|CLONE_VM, (void *)1);
+	assert(tid > 0);
+
+	/* f() calls exit(0) when given a non-NULL argument, ending the process. */
+	pause();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/pstore/.gitignore b/tools/testing/selftests/pstore/.gitignore
new file mode 100644
index 000000000..9938fb406
--- /dev/null
+++ b/tools/testing/selftests/pstore/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+logs
+*uuid
diff --git a/tools/testing/selftests/pstore/Makefile b/tools/testing/selftests/pstore/Makefile
new file mode 100644
index 000000000..5ef57855a
--- /dev/null
+++ b/tools/testing/selftests/pstore/Makefile
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for pstore selftests.
+# Expects pstore backend is registered.
+
+all:
+
+TEST_PROGS := pstore_tests pstore_post_reboot_tests
+TEST_FILES := common_tests pstore_crash_test
+EXTRA_CLEAN := logs/* *uuid
+
+include ../lib.mk
+
+run_crash:
+ @sh pstore_crash_test || { echo "pstore_crash_test: [FAIL]"; exit 1; }
diff --git a/tools/testing/selftests/pstore/common_tests b/tools/testing/selftests/pstore/common_tests
new file mode 100755
index 000000000..4509f0cc9
--- /dev/null
+++ b/tools/testing/selftests/pstore/common_tests
@@ -0,0 +1,83 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# common_tests - Shell script commonly used by pstore test scripts
+#
+# Copyright (C) Hitachi Ltd., 2015
+# Written by Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
+#
+
+# Utilities
+# Print "Error: <message>" on stderr and terminate the script with status 1.
+errexit() { # message
+    echo "Error: $1" >&2
+    exit 1
+}
+
+# Print the absolute path of the directory that contains file_path.
+# Runs in a subshell so the caller's working directory is untouched.
+# Prints nothing and returns non-zero when that directory cannot be entered,
+# instead of reporting the caller's current directory by mistake.
+absdir() { # file_path
+    (cd "$(dirname "$1")" && pwd)
+}
+
+# Log "ok" when result_value is 0; otherwise log "FAIL" and set the global
+# rc to 1 so the calling script exits with a failure status.
+show_result() { # result_value
+    if ! [ $1 -eq 0 ]; then
+        prlog "FAIL"
+        rc=1
+        return
+    fi
+    prlog "ok"
+}
+
+# Check that <type>-<backend>-0 exists in the current directory. On success
+# log "ok" followed by every <type>-<backend>-* file name; otherwise log
+# "FAIL" and set the global rc to 1.
+check_files_exist() { # type of pstorefs file
+    if [ -e "${1}-${backend}-0" ]; then
+        prlog "ok"
+        # Let the shell expand the pattern instead of parsing ls output; the
+        # glob cannot come back empty because the -0 file was just found.
+        for f in "${1}-${backend}-"*; do
+            prlog -e "\t${f}"
+        done
+    else
+        prlog "FAIL"
+        rc=1
+    fi
+}
+
+# Apply an operation to each file of a whitespace-separated list and log the
+# per-file result. When the tested value is non-zero the list is not
+# processed: " ... FAIL" is logged and the global rc is set to 1.
+operate_files() { # tested value, files, operation
+    if ! [ $1 -eq 0 ]; then
+        prlog " ... FAIL"
+        rc=1
+        return
+    fi
+
+    prlog
+    # $2 is split on whitespace on purpose: it carries a list of names.
+    for f in $2; do
+        prlog -ne "\t${f} ... "
+        # execute operation
+        $3 $f
+        show_result $?
+    done
+}
+
+# Parameters
+TEST_STRING_PATTERN="Testing pstore: uuid="
+UUID=$(cat /proc/sys/kernel/random/uuid)
+TOP_DIR=$(absdir "$0")
+LOG_DIR=$TOP_DIR/logs/$(date +%Y%m%d-%H%M%S)_${UUID}/
+REBOOT_FLAG=$TOP_DIR/reboot_flag
+
+# Preparing logs
+# All paths are quoted so the tests keep working when the selftest directory
+# contains whitespace.
+LOG_FILE=$LOG_DIR/$(basename "$0").log
+mkdir -p "$LOG_DIR" || errexit "Failed to make a log directory: $LOG_DIR"
+date > "$LOG_FILE"
+
+# Echo the arguments to stdout and append them to the log file. Options such
+# as -n and -e are passed straight through to /bin/echo.
+prlog() { # messages
+    /bin/echo "$@" | tee -a "$LOG_FILE"
+}
+
+# Starting tests
+rc=0
+prlog "=== Pstore unit tests ($(basename "$0")) ==="
+prlog "UUID=$UUID"
+
+prlog -n "Checking pstore backend is registered ... "
+backend=$(cat /sys/module/pstore/parameters/backend)
+show_result $?
+prlog -e "\tbackend=${backend}"
+prlog -e "\tcmdline=$(cat /proc/cmdline)"
+# Without a registered backend none of the later checks can work.
+if [ $rc -ne 0 ]; then
+    exit 1
+fi
diff --git a/tools/testing/selftests/pstore/config b/tools/testing/selftests/pstore/config
new file mode 100644
index 000000000..d148f9f89
--- /dev/null
+++ b/tools/testing/selftests/pstore/config
@@ -0,0 +1,5 @@
+CONFIG_MISC_FILESYSTEMS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_PMSG=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=m
diff --git a/tools/testing/selftests/pstore/pstore_crash_test b/tools/testing/selftests/pstore/pstore_crash_test
new file mode 100755
index 000000000..2a329bbb4
--- /dev/null
+++ b/tools/testing/selftests/pstore/pstore_crash_test
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# pstore_crash_test - Pstore test shell script which causes crash and reboot
+#
+# Copyright (C) Hitachi Ltd., 2015
+# Written by Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
+#
+
+# exit if pstore backend is not registered
+. ./common_tests
+
+prlog "Causing kernel crash ..."
+
+# enable all functions triggered by sysrq
+echo 1 > /proc/sys/kernel/sysrq
+# setting to reboot in 3 seconds after panic
+echo 3 > /proc/sys/kernel/panic
+
+# save uuid file by different name because next test execution will replace it.
+mv $TOP_DIR/uuid $TOP_DIR/prev_uuid
+
+# create a file as reboot flag
+touch $REBOOT_FLAG
+sync
+
+# cause crash
+# Note: If you use kdump and want to see kmesg-* files after reboot, you should
+# specify 'crash_kexec_post_notifiers' in 1st kernel's cmdline.
+echo c > /proc/sysrq-trigger
diff --git a/tools/testing/selftests/pstore/pstore_post_reboot_tests b/tools/testing/selftests/pstore/pstore_post_reboot_tests
new file mode 100755
index 000000000..d6da5e86e
--- /dev/null
+++ b/tools/testing/selftests/pstore/pstore_post_reboot_tests
@@ -0,0 +1,80 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# pstore_post_reboot_tests - Check pstore's behavior after crash/reboot
+#
+# Copyright (C) Hitachi Ltd., 2015
+# Written by Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
+#
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+. ./common_tests
+
+# The crash test leaves a flag file behind; without it there is nothing to
+# verify, so skip.
+if [ -e "$REBOOT_FLAG" ]; then
+    rm "$REBOOT_FLAG"
+else
+    prlog "pstore_crash_test has not been executed yet. we skip further tests."
+    exit $ksft_skip
+fi
+
+prlog -n "Mounting pstore filesystem ... "
+mount_info=$(grep pstore /proc/mounts)
+if [ $? -eq 0 ]; then
+    # Pick the first matching line before cutting out the mount point field;
+    # an unquoted echo would have joined all lines into one.
+    mount_point=$(printf '%s\n' "$mount_info" | head -n1 | cut -d' ' -f2)
+    prlog "ok"
+else
+    mount none /sys/fs/pstore -t pstore
+    if [ $? -eq 0 ]; then
+        mount_point=$(grep pstore /proc/mounts | head -n1 | cut -d' ' -f2)
+        prlog "ok"
+    else
+        prlog "FAIL"
+        exit 1
+    fi
+fi
+
+# Every later step, including the final file removal, works on relative
+# names, so refuse to continue from the wrong directory.
+cd "${mount_point}" || errexit "Failed to enter pstore mount point: ${mount_point}"
+
+prlog -n "Checking dmesg files exist in pstore filesystem ... "
+check_files_exist dmesg
+
+prlog -n "Checking console files exist in pstore filesystem ... "
+check_files_exist console
+
+prlog -n "Checking pmsg files exist in pstore filesystem ... "
+check_files_exist pmsg
+
+prlog -n "Checking dmesg files contain oops end marker"
+# Succeeds when the given file contains the oops end marker.
+grep_end_trace() {
+    grep -q "\---\[ end trace" "$1"
+}
+# The exit status of ls tells operate_files whether any file matched.
+files=$(ls dmesg-"${backend}"-*)
+operate_files $? "$files" grep_end_trace
+
+prlog -n "Checking console file contains oops end marker ... "
+grep -q "\---\[ end trace" "console-${backend}-0"
+show_result $?
+
+prlog -n "Checking pmsg file properly keeps the content written before crash ... "
+prev_uuid=$(cat "$TOP_DIR/prev_uuid")
+if [ $? -eq 0 ]; then
+    nr_matched=$(grep -c "$TEST_STRING_PATTERN" "pmsg-${backend}-0")
+    # Treat missing grep output (e.g. unreadable file) as zero matches so the
+    # comparison stays a valid integer test.
+    if [ "${nr_matched:-0}" -eq 1 ]; then
+        grep -q "${TEST_STRING_PATTERN}${prev_uuid}" "pmsg-${backend}-0"
+        show_result $?
+    else
+        prlog "FAIL"
+        rc=1
+    fi
+else
+    prlog "FAIL"
+    rc=1
+fi
+
+prlog -n "Removing all files in pstore filesystem "
+files=$(ls *-"${backend}"-*)
+operate_files $? "$files" rm
+
+exit $rc
diff --git a/tools/testing/selftests/pstore/pstore_tests b/tools/testing/selftests/pstore/pstore_tests
new file mode 100755
index 000000000..2aa9a3852
--- /dev/null
+++ b/tools/testing/selftests/pstore/pstore_tests
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+# pstore_tests - Check pstore's behavior before crash/reboot
+#
+# Copyright (C) Hitachi Ltd., 2015
+# Written by Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com>
+#
+
+. ./common_tests
+
+prlog -n "Checking pstore console is registered ... "
+dmesg | grep -Eq "console \[(pstore|${backend})"
+show_result $?
+
+prlog -n "Checking /dev/pmsg0 exists ... "
+test -e /dev/pmsg0
+show_result $?
+
+prlog -n "Writing unique string to /dev/pmsg0 ... "
+if [ -e "/dev/pmsg0" ]; then
+    echo "${TEST_STRING_PATTERN}${UUID}" > /dev/pmsg0
+    show_result $?
+    # Keep the uuid on disk for later comparison; the target is quoted so a
+    # TOP_DIR containing whitespace still works.
+    echo "$UUID" > "$TOP_DIR/uuid"
+else
+    prlog "FAIL"
+    rc=1
+fi
+
+exit $rc
diff --git a/tools/testing/selftests/ptp/.gitignore b/tools/testing/selftests/ptp/.gitignore
new file mode 100644
index 000000000..534ca26ee
--- /dev/null
+++ b/tools/testing/selftests/ptp/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+testptp
diff --git a/tools/testing/selftests/ptp/Makefile b/tools/testing/selftests/ptp/Makefile
new file mode 100644
index 000000000..ef06de089
--- /dev/null
+++ b/tools/testing/selftests/ptp/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -I../../../../usr/include/
+TEST_PROGS := testptp
+LDLIBS += -lrt
+all: $(TEST_PROGS)
+
+include ../lib.mk
+
+clean:
+ rm -fr $(TEST_PROGS)
diff --git a/tools/testing/selftests/ptp/phc.sh b/tools/testing/selftests/ptp/phc.sh
new file mode 100755
index 000000000..ac6e5a6e1
--- /dev/null
+++ b/tools/testing/selftests/ptp/phc.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Tests run by default; a TESTS variable in the environment overrides it.
+ALL_TESTS="
+	settime
+	adjtime
+	adjfreq
+"
+# PTP device to test, taken from the first command-line argument.
+DEV=$1
+
+##############################################################################
+# Sanity checks
+
+# The script is skipped unless run as root...
+if [[ $(id -u) -ne 0 ]]; then
+	echo "SKIP: need root privileges"
+	exit 0
+fi
+
+# ...and unless a device was named on the command line.
+if [[ -z "$DEV" ]]; then
+	echo "SKIP: PTP device not provided"
+	exit 0
+fi
+
+# Skip the whole script unless the given command resolves to an executable.
+# Arguments: $1 - command name
+require_command()
+{
+	local cmd=$1; shift
+
+	if [[ ! -x "$(command -v "$cmd")" ]]; then
+		echo "SKIP: $cmd not installed"
+		# 4 is the kselftest SKIP code; exiting 1 would report this
+		# skip as a test failure.
+		exit 4
+	fi
+}
+
+# Skip the whole script when phc_ctl cannot read the clock named by $DEV.
+phc_sanity()
+{
+	if ! phc_ctl "$DEV" get &> /dev/null; then
+		echo "SKIP: unknown clock $DEV: No such device"
+		# 4 is the kselftest SKIP code; exiting 1 would report this
+		# skip as a test failure.
+		exit 4
+	fi
+}
+
+require_command phc_ctl
+phc_sanity
+
+##############################################################################
+# Helpers
+
+# Exit status to return at the end. Set in case one of the tests fails.
+EXIT_STATUS=0
+# Per-test return value. Clear at the beginning of each test.
+RET=0
+
+# Record the first non-zero status of the current test in RET.
+# Arguments: $1 - exit status to record
+check_err()
+{
+	local err=$1
+
+	# An earlier error is kept, and a zero status never overwrites RET.
+	[[ $RET -ne 0 || $err -eq 0 ]] && return 0
+
+	RET=$err
+}
+
+# Print the verdict line for a test based on RET. A failure also sets
+# EXIT_STATUS to 1.
+# Arguments: $1 - test name
+# Returns:   0 when the test passed, 1 when it failed
+log_test()
+{
+	local test_name=$1
+
+	if [[ $RET -eq 0 ]]; then
+		printf "TEST: %-60s [ OK ]\n" "$test_name"
+		return 0
+	fi
+
+	EXIT_STATUS=1
+	printf "TEST: %-60s [FAIL]\n" "$test_name"
+	return 1
+}
+
+# Call every selected test function in turn: the names listed in TESTS when
+# it is set and non-empty, otherwise those in ALL_TESTS.
+tests_run()
+{
+	local current_test
+	local _tests_to_run=${TESTS:-$ALL_TESTS}
+
+	# Split on whitespace on purpose: the value is a list of function names.
+	for current_test in $_tests_to_run; do
+		$current_test
+	done
+}
+
+##############################################################################
+# Tests
+
+# Run phc_ctl on $DEV with the given actions and print, for each output line
+# containing "clock time is", the part of its fifth field before the first
+# dot.
+phc_clock_seconds()
+{
+	phc_ctl $DEV "$@" 2> /dev/null \
+		| awk '/clock time is/ { split($5, parts, "."); print parts[1] }'
+}
+
+# Set the clock to 0, wait 120.5, and expect it to read 120.
+settime_do()
+{
+	local res
+
+	res=$(phc_clock_seconds set 0 wait 120.5 get)
+
+	(( res == 120 ))
+}
+
+# Set the clock to 0, adjust it by 10, and expect it to read 10.
+adjtime_do()
+{
+	local res
+
+	res=$(phc_clock_seconds set 0 adj 10 get)
+
+	(( res == 10 ))
+}
+
+# Apply a frequency offset, set the clock to 0, wait 100.5, and expect 101.
+adjfreq_do()
+{
+	local res
+
+	# Set the clock to be 1% faster
+	res=$(phc_clock_seconds freq 10000000 set 0 wait 100.5 get)
+
+	(( res == 101 ))
+}
+
+##############################################################################
+
+# Undo what the tests changed on $DEV: issue "freq 0.0" and a bare "set"
+# through phc_ctl, discarding all output.
+cleanup()
+{
+	phc_ctl $DEV freq 0.0 > /dev/null 2>&1
+	phc_ctl $DEV set > /dev/null 2>&1
+}
+
+# Common body of the public test functions: reset RET, run <name>_do, record
+# its status, print the verdict, then restore the clock.
+# Arguments: $1 - test name
+run_test_case()
+{
+	local name=$1
+
+	RET=0
+
+	"${name}_do"
+	check_err $?
+	log_test "$name"
+	cleanup
+}
+
+settime()
+{
+	run_test_case "settime"
+}
+
+adjtime()
+{
+	run_test_case "adjtime"
+}
+
+adjfreq()
+{
+	run_test_case "adjfreq"
+}
+
+trap cleanup EXIT
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/ptp/testptp.c b/tools/testing/selftests/ptp/testptp.c
new file mode 100644
index 000000000..aa474febb
--- /dev/null
+++ b/tools/testing/selftests/ptp/testptp.c
@@ -0,0 +1,511 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * PTP 1588 clock support - User space test program
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <math.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <linux/ptp_clock.h>
+
+#define DEVICE "/dev/ptp0"
+
+#ifndef ADJ_SETOFFSET
+#define ADJ_SETOFFSET 0x0100
+#endif
+
+#ifndef CLOCK_INVALID
+#define CLOCK_INVALID -1
+#endif
+
+#define NSEC_PER_SEC 1000000000LL
+
+/*
+ * clock_adjtime is not available in GLIBC < 2.14, so provide a thin wrapper
+ * around the raw system call there.
+ *
+ * __GLIBC_PREREQ() is only evaluated when the C library defines it: using
+ * an undefined function-like macro inside #if is a preprocessing error, not
+ * a false condition.
+ */
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if !__GLIBC_PREREQ(2, 14)
+#include <sys/syscall.h>
+static int clock_adjtime(clockid_t id, struct timex *tx)
+{
+	return syscall(__NR_clock_adjtime, id, tx);
+}
+#endif
+#endif
+
+/*
+ * Print one result line of the external time stamp flag test: the request
+ * variant, the flags tried, the ioctl return value and the current errno
+ * text. errno is cleared afterwards.
+ */
+static void show_flag_test(int rq_index, unsigned int flags, int err)
+{
+	int suffix = ' ';
+
+	/* Index 0 is printed without a suffix, index 1 as "2". */
+	if (rq_index)
+		suffix = '1' + rq_index;
+
+	printf("PTP_EXTTS_REQUEST%c flags 0x%08x : (%d) %s\n",
+	       suffix, flags, err, strerror(errno));
+	/* sigh, uClibc ... */
+	errno = 0;
+}
+
+/*
+ * Issue both external time stamp request ioctls on channel @index with
+ * every flag combination in the table below and print each result. After
+ * each attempt the request is issued again with no flags set.
+ */
+static void do_flag_test(int fd, unsigned int index)
+{
+	unsigned long ioctls[] = {
+		PTP_EXTTS_REQUEST,
+		PTP_EXTTS_REQUEST2,
+	};
+	unsigned int flag_sets[] = {
+		PTP_ENABLE_FEATURE,
+		PTP_ENABLE_FEATURE | PTP_RISING_EDGE,
+		PTP_ENABLE_FEATURE | PTP_FALLING_EDGE,
+		PTP_ENABLE_FEATURE | PTP_RISING_EDGE | PTP_FALLING_EDGE,
+		PTP_ENABLE_FEATURE | (PTP_EXTTS_VALID_FLAGS + 1),
+	};
+	struct ptp_extts_request req;
+	int err, r, s;
+
+	memset(&req, 0, sizeof(req));
+	req.index = index;
+
+	for (r = 0; r < (int)(sizeof(ioctls) / sizeof(ioctls[0])); r++) {
+		for (s = 0; s < (int)(sizeof(flag_sets) / sizeof(flag_sets[0])); s++) {
+			req.flags = flag_sets[s];
+			err = ioctl(fd, ioctls[r], &req);
+			show_flag_test(r, req.flags, err);
+
+			/* Request again with no flags before the next combination. */
+			req.flags = 0;
+			err = ioctl(fd, ioctls[r], &req);
+		}
+	}
+}
+
+/*
+ * Build a clock id from an open file descriptor: the bitwise complement of
+ * @fd shifted left by three, with CLOCKFD in the low bits.
+ */
+static clockid_t get_clockid(int fd)
+{
+#define CLOCKFD 3
+	unsigned int inverted = (unsigned int) ~fd;
+
+	return (inverted << 3) | CLOCKFD;
+}
+
+/* Convert parts per billion into the scaled ppm unit of 'struct timex'. */
+static long ppb_to_scaled_ppm(int ppb)
+{
+	/*
+	 * The 'freq' field in the 'struct timex' is in parts per
+	 * million, but with a 16 bit binary fractional field.
+	 * Instead of calculating either one of
+	 *
+	 * scaled_ppm = (ppb / 1000) << 16 [1]
+	 * scaled_ppm = (ppb << 16) / 1000 [2]
+	 *
+	 * we simply use double precision math, in order to avoid the
+	 * truncation in [1] and the possible overflow in [2].
+	 */
+	double scaled = ppb * 65.536;
+
+	return (long) scaled;
+}
+
+/* Flatten a ptp_clock_time (seconds + nanoseconds) into nanoseconds. */
+static int64_t pctns(struct ptp_clock_time *t)
+{
+	int64_t whole_seconds_ns = t->sec * NSEC_PER_SEC;
+
+	return whole_seconds_ns + t->nsec;
+}
+
+/* Print the command-line help for @progname on stderr. */
+static void usage(char *progname)
+{
+	static const char options[] =
+		" -c query the ptp clock's capabilities\n"
+		" -d name device to open\n"
+		" -e val read 'val' external time stamp events\n"
+		" -f val adjust the ptp clock frequency by 'val' ppb\n"
+		" -g get the ptp clock time\n"
+		" -h prints this message\n"
+		" -i val index for event/trigger\n"
+		" -k val measure the time offset between system and phc clock\n"
+		" for 'val' times (Maximum 25)\n"
+		" -l list the current pin configuration\n"
+		" -L pin,val configure pin index 'pin' with function 'val'\n"
+		" the channel index is taken from the '-i' option\n"
+		" 'val' specifies the auxiliary function:\n"
+		" 0 - none\n"
+		" 1 - external time stamp\n"
+		" 2 - periodic output\n"
+		" -p val enable output with a period of 'val' nanoseconds\n"
+		" -H val set output phase to 'val' nanoseconds (requires -p)\n"
+		" -w val set output pulse width to 'val' nanoseconds (requires -p)\n"
+		" -P val enable or disable (val=1|0) the system clock PPS\n"
+		" -s set the ptp clock time from the system time\n"
+		" -S set the system time from the ptp clock time\n"
+		" -t val shift the ptp clock time by 'val' seconds\n"
+		" -T val set the ptp clock time to 'val' seconds\n"
+		" -z test combinations of rising/falling external time stamp flags\n";
+
+	fprintf(stderr, "usage: %s [options]\n", progname);
+	fputs(options, stderr);
+}
+
+/*
+ * Parse the command line, open the PTP device and carry out every requested
+ * action in a fixed order (capabilities, frequency/time adjustment, get/set
+ * time, external time stamps, flag test, pin listing, periodic output, pin
+ * function, PPS, system offset measurement).
+ *
+ * Returns 0 after running the requested actions (individual action failures
+ * are reported with perror() but do not change the result), and -1 on usage
+ * errors, when the device cannot be opened, or when a periodic output
+ * request cannot be prepared.
+ */
+int main(int argc, char *argv[])
+{
+	struct ptp_clock_caps caps;
+	struct ptp_extts_event event;
+	struct ptp_extts_request extts_request;
+	struct ptp_perout_request perout_request;
+	struct ptp_pin_desc desc;
+	struct timespec ts;
+	struct timex tx;
+	struct ptp_clock_time *pct;
+	struct ptp_sys_offset *sysoff;
+
+	char *progname;
+	unsigned int i;
+	int c, cnt, fd;
+
+	char *device = DEVICE;
+	clockid_t clkid;
+	/* 0x7fffffff doubles as "no -f option given". */
+	int adjfreq = 0x7fffffff;
+	int adjtime = 0;
+	int capabilities = 0;
+	int extts = 0;
+	int flagtest = 0;
+	int gettime = 0;
+	int index = 0;
+	int list_pins = 0;
+	int pct_offset = 0;
+	int n_samples = 0;
+	int pin_index = -1, pin_func;
+	int pps = -1;
+	int seconds = 0;
+	int settime = 0;
+
+	int64_t t1, t2, tp;
+	int64_t interval, offset;
+	/* Negative values mean "option not given". */
+	int64_t perout_phase = -1;
+	int64_t pulsewidth = -1;
+	int64_t perout = -1;
+
+	progname = strrchr(argv[0], '/');
+	progname = progname ? 1+progname : argv[0];
+	while (EOF != (c = getopt(argc, argv, "cd:e:f:ghH:i:k:lL:p:P:sSt:T:w:z"))) {
+		switch (c) {
+		case 'c':
+			capabilities = 1;
+			break;
+		case 'd':
+			device = optarg;
+			break;
+		case 'e':
+			extts = atoi(optarg);
+			break;
+		case 'f':
+			adjfreq = atoi(optarg);
+			break;
+		case 'g':
+			gettime = 1;
+			break;
+		case 'H':
+			perout_phase = atoll(optarg);
+			break;
+		case 'i':
+			index = atoi(optarg);
+			break;
+		case 'k':
+			pct_offset = 1;
+			n_samples = atoi(optarg);
+			break;
+		case 'l':
+			list_pins = 1;
+			break;
+		case 'L':
+			cnt = sscanf(optarg, "%d,%d", &pin_index, &pin_func);
+			if (cnt != 2) {
+				usage(progname);
+				return -1;
+			}
+			break;
+		case 'p':
+			perout = atoll(optarg);
+			break;
+		case 'P':
+			pps = atoi(optarg);
+			break;
+		case 's':
+			settime = 1;
+			break;
+		case 'S':
+			settime = 2;
+			break;
+		case 't':
+			adjtime = atoi(optarg);
+			break;
+		case 'T':
+			settime = 3;
+			seconds = atoi(optarg);
+			break;
+		case 'w':
+			/*
+			 * 64-bit parse, like -p and -H: the value is in
+			 * nanoseconds and may exceed INT_MAX.
+			 */
+			pulsewidth = atoll(optarg);
+			break;
+		case 'z':
+			flagtest = 1;
+			break;
+		case 'h':
+			usage(progname);
+			return 0;
+		case '?':
+		default:
+			usage(progname);
+			return -1;
+		}
+	}
+
+	fd = open(device, O_RDWR);
+	if (fd < 0) {
+		fprintf(stderr, "opening %s: %s\n", device, strerror(errno));
+		return -1;
+	}
+
+	clkid = get_clockid(fd);
+	if (CLOCK_INVALID == clkid) {
+		fprintf(stderr, "failed to read clock id\n");
+		return -1;
+	}
+
+	if (capabilities) {
+		if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps)) {
+			perror("PTP_CLOCK_GETCAPS");
+		} else {
+			printf("capabilities:\n"
+			       " %d maximum frequency adjustment (ppb)\n"
+			       " %d programmable alarms\n"
+			       " %d external time stamp channels\n"
+			       " %d programmable periodic signals\n"
+			       " %d pulse per second\n"
+			       " %d programmable pins\n"
+			       " %d cross timestamping\n"
+			       " %d adjust_phase\n",
+			       caps.max_adj,
+			       caps.n_alarm,
+			       caps.n_ext_ts,
+			       caps.n_per_out,
+			       caps.pps,
+			       caps.n_pins,
+			       caps.cross_timestamping,
+			       caps.adjust_phase);
+		}
+	}
+
+	if (0x7fffffff != adjfreq) {
+		memset(&tx, 0, sizeof(tx));
+		tx.modes = ADJ_FREQUENCY;
+		tx.freq = ppb_to_scaled_ppm(adjfreq);
+		if (clock_adjtime(clkid, &tx)) {
+			perror("clock_adjtime");
+		} else {
+			puts("frequency adjustment okay");
+		}
+	}
+
+	if (adjtime) {
+		memset(&tx, 0, sizeof(tx));
+		tx.modes = ADJ_SETOFFSET;
+		tx.time.tv_sec = adjtime;
+		tx.time.tv_usec = 0;
+		if (clock_adjtime(clkid, &tx) < 0) {
+			perror("clock_adjtime");
+		} else {
+			puts("time shift okay");
+		}
+	}
+
+	if (gettime) {
+		if (clock_gettime(clkid, &ts)) {
+			perror("clock_gettime");
+		} else {
+			printf("clock time: %ld.%09ld or %s",
+			       ts.tv_sec, ts.tv_nsec, ctime(&ts.tv_sec));
+		}
+	}
+
+	if (settime == 1) {
+		clock_gettime(CLOCK_REALTIME, &ts);
+		if (clock_settime(clkid, &ts)) {
+			perror("clock_settime");
+		} else {
+			puts("set time okay");
+		}
+	}
+
+	if (settime == 2) {
+		/*
+		 * Never push an unread timestamp into the system clock: only
+		 * set it when the PTP clock could actually be read.
+		 */
+		if (clock_gettime(clkid, &ts)) {
+			perror("clock_gettime");
+		} else if (clock_settime(CLOCK_REALTIME, &ts)) {
+			perror("clock_settime");
+		} else {
+			puts("set time okay");
+		}
+	}
+
+	if (settime == 3) {
+		ts.tv_sec = seconds;
+		ts.tv_nsec = 0;
+		if (clock_settime(clkid, &ts)) {
+			perror("clock_settime");
+		} else {
+			puts("set time okay");
+		}
+	}
+
+	if (extts) {
+		memset(&extts_request, 0, sizeof(extts_request));
+		extts_request.index = index;
+		extts_request.flags = PTP_ENABLE_FEATURE;
+		if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) {
+			perror("PTP_EXTTS_REQUEST");
+			/* Nothing was enabled, so do not wait for events. */
+			extts = 0;
+		} else {
+			puts("external time stamp request okay");
+		}
+		for (; extts; extts--) {
+			cnt = read(fd, &event, sizeof(event));
+			if (cnt != sizeof(event)) {
+				perror("read");
+				break;
+			}
+			printf("event index %u at %lld.%09u\n", event.index,
+			       event.t.sec, event.t.nsec);
+			fflush(stdout);
+		}
+		/* Disable the feature again. */
+		extts_request.flags = 0;
+		if (ioctl(fd, PTP_EXTTS_REQUEST, &extts_request)) {
+			perror("PTP_EXTTS_REQUEST");
+		}
+	}
+
+	if (flagtest) {
+		do_flag_test(fd, index);
+	}
+
+	if (list_pins) {
+		int n_pins = 0;
+		if (ioctl(fd, PTP_CLOCK_GETCAPS, &caps)) {
+			perror("PTP_CLOCK_GETCAPS");
+		} else {
+			n_pins = caps.n_pins;
+		}
+		for (i = 0; i < n_pins; i++) {
+			desc.index = i;
+			if (ioctl(fd, PTP_PIN_GETFUNC, &desc)) {
+				perror("PTP_PIN_GETFUNC");
+				break;
+			}
+			printf("name %s index %u func %u chan %u\n",
+			       desc.name, desc.index, desc.func, desc.chan);
+		}
+	}
+
+	if (pulsewidth >= 0 && perout < 0) {
+		puts("-w can only be specified together with -p");
+		return -1;
+	}
+
+	if (perout_phase >= 0 && perout < 0) {
+		puts("-H can only be specified together with -p");
+		return -1;
+	}
+
+	if (perout >= 0) {
+		if (clock_gettime(clkid, &ts)) {
+			perror("clock_gettime");
+			return -1;
+		}
+		memset(&perout_request, 0, sizeof(perout_request));
+		perout_request.index = index;
+		perout_request.period.sec = perout / NSEC_PER_SEC;
+		perout_request.period.nsec = perout % NSEC_PER_SEC;
+		perout_request.flags = 0;
+		if (pulsewidth >= 0) {
+			perout_request.flags |= PTP_PEROUT_DUTY_CYCLE;
+			perout_request.on.sec = pulsewidth / NSEC_PER_SEC;
+			perout_request.on.nsec = pulsewidth % NSEC_PER_SEC;
+		}
+		if (perout_phase >= 0) {
+			perout_request.flags |= PTP_PEROUT_PHASE;
+			perout_request.phase.sec = perout_phase / NSEC_PER_SEC;
+			perout_request.phase.nsec = perout_phase % NSEC_PER_SEC;
+		} else {
+			/* No phase given: start two seconds from the time just read. */
+			perout_request.start.sec = ts.tv_sec + 2;
+			perout_request.start.nsec = 0;
+		}
+
+		if (ioctl(fd, PTP_PEROUT_REQUEST2, &perout_request)) {
+			perror("PTP_PEROUT_REQUEST");
+		} else {
+			puts("periodic output request okay");
+		}
+	}
+
+	if (pin_index >= 0) {
+		memset(&desc, 0, sizeof(desc));
+		desc.index = pin_index;
+		desc.func = pin_func;
+		desc.chan = index;
+		if (ioctl(fd, PTP_PIN_SETFUNC, &desc)) {
+			perror("PTP_PIN_SETFUNC");
+		} else {
+			puts("set pin function okay");
+		}
+	}
+
+	if (pps != -1) {
+		int enable = pps ? 1 : 0;
+		if (ioctl(fd, PTP_ENABLE_PPS, enable)) {
+			perror("PTP_ENABLE_PPS");
+		} else {
+			puts("pps for system time request okay");
+		}
+	}
+
+	if (pct_offset) {
+		if (n_samples <= 0 || n_samples > 25) {
+			puts("n_samples should be between 1 and 25");
+			usage(progname);
+			return -1;
+		}
+
+		sysoff = calloc(1, sizeof(*sysoff));
+		if (!sysoff) {
+			perror("calloc");
+			return -1;
+		}
+		sysoff->n_samples = n_samples;
+
+		if (ioctl(fd, PTP_SYS_OFFSET, sysoff)) {
+			/* No samples were returned, so there is nothing to print. */
+			perror("PTP_SYS_OFFSET");
+		} else {
+			puts("system and phc clock time offset request okay");
+
+			/*
+			 * Each sample i uses three consecutive entries: system
+			 * time, phc time, system time (shared with sample i+1).
+			 */
+			pct = &sysoff->ts[0];
+			for (i = 0; i < sysoff->n_samples; i++) {
+				t1 = pctns(pct+2*i);
+				tp = pctns(pct+2*i+1);
+				t2 = pctns(pct+2*i+2);
+				interval = t2 - t1;
+				offset = (t2 + t1) / 2 - tp;
+
+				printf("system time: %lld.%09u\n",
+				       (pct+2*i)->sec, (pct+2*i)->nsec);
+				printf("phc time: %lld.%09u\n",
+				       (pct+2*i+1)->sec, (pct+2*i+1)->nsec);
+				printf("system time: %lld.%09u\n",
+				       (pct+2*i+2)->sec, (pct+2*i+2)->nsec);
+				printf("system/phc clock time offset is %" PRId64 " ns\n"
+				       "system clock time delay is %" PRId64 " ns\n",
+				       offset, interval);
+			}
+		}
+
+		free(sysoff);
+	}
+
+	close(fd);
+	return 0;
+}
diff --git a/tools/testing/selftests/ptp/testptp.mk b/tools/testing/selftests/ptp/testptp.mk
new file mode 100644
index 000000000..4ef2d9755
--- /dev/null
+++ b/tools/testing/selftests/ptp/testptp.mk
@@ -0,0 +1,33 @@
+# PTP 1588 clock support - User space test program
+#
+# Copyright (C) 2010 OMICRON electronics GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ # Toolchain and flags.  CROSS_COMPILE and KBUILD_OUTPUT are taken from the
+ # environment or the make command line.
+ CC = $(CROSS_COMPILE)gcc
+ INC = -I$(KBUILD_OUTPUT)/usr/include
+ CFLAGS = -Wall $(INC)
+ LDLIBS = -lrt
+ PROGS = testptp
+
+ # None of these targets names a file.  Declaring them phony keeps a stray
+ # file called "all", "clean" or "distclean" from suppressing the rule.
+ .PHONY: all clean distclean
+
+ all: $(PROGS)
+
+ # No recipe: make's built-in rules compile and link testptp from testptp.o.
+ testptp: testptp.o
+
+ # Remove the intermediate object only.
+ clean:
+ 	rm -f testptp.o
+
+ # Remove the object and the built program.
+ distclean: clean
+ 	rm -f $(PROGS)
diff --git a/tools/testing/selftests/ptrace/.gitignore b/tools/testing/selftests/ptrace/.gitignore
new file mode 100644
index 000000000..792318aaa
--- /dev/null
+++ b/tools/testing/selftests/ptrace/.gitignore
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+get_syscall_info
+peeksiginfo
+vmaccess
diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile
new file mode 100644
index 000000000..2f1f532c3
--- /dev/null
+++ b/tools/testing/selftests/ptrace/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall
+
+TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess
+
+include ../lib.mk
diff --git a/tools/testing/selftests/ptrace/get_syscall_info.c b/tools/testing/selftests/ptrace/get_syscall_info.c
new file mode 100644
index 000000000..5bcd1c7b5
--- /dev/null
+++ b/tools/testing/selftests/ptrace/get_syscall_info.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2018 Dmitry V. Levin <ldv@altlinux.org>
+ * All rights reserved.
+ *
+ * Check whether PTRACE_GET_SYSCALL_INFO semantics implemented in the kernel
+ * matches userspace expectations.
+ */
+
+#include "../kselftest_harness.h"
+#include <err.h>
+#include <signal.h>
+#include <asm/unistd.h>
+#include "linux/ptrace.h"
+
+ /*
+  * Send SIGKILL to the tracee unless pid is 0 (meaning there is no tracee
+  * left).  errno is left exactly as it was on entry so that a later "%m"
+  * in a log message still describes the original failure.
+  *
+  * Returns 0 when pid is 0, otherwise the result of kill().
+  */
+ static int
+ kill_tracee(pid_t pid)
+ {
+ 	int result = 0;
+
+ 	if (pid != 0) {
+ 		const int errno_backup = errno;
+
+ 		result = kill(pid, SIGKILL);
+ 		errno = errno_backup;
+ 	}
+
+ 	return result;
+ }
+
+ /*
+  * Issue the raw ptrace system call with the given arguments and hand back
+  * whatever syscall() returns.
+  */
+ static long
+ sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data)
+ {
+ 	const long ret = syscall(__NR_ptrace, request, pid, addr, data);
+
+ 	return ret;
+ }
+
+ /*
+  * Kill the tracee and then log a printf-style message prefixed with the
+  * current wait number.  The macro reads the identifiers `pid` and
+  * `ptrace_stop` from the scope where it is expanded, so both must be
+  * visible there.  The tracee is killed before logging; kill_tracee()
+  * preserves errno, so "%m" in fmt still refers to the earlier failure.
+  */
+ #define LOG_KILL_TRACEE(fmt, ...) \
+ do { \
+ kill_tracee(pid); \
+ TH_LOG("wait #%d: " fmt, \
+ ptrace_stop, ##__VA_ARGS__); \
+ } while (0)
+
+ /*
+  * Fork a tracee that stops itself and then issues three syscalls with
+  * recognisable argument values.  At every ptrace stop the tracer calls
+  * PTRACE_GET_SYSCALL_INFO and checks the returned size, op, and the
+  * entry arguments or exit values against the tables below.
+  */
+ TEST(get_syscall_info)
+ {
+ /* Each row: syscall number followed by the six arguments passed to it. */
+ static const unsigned long args[][7] = {
+ /* a sequence of architecture-agnostic syscalls */
+ {
+ __NR_chdir,
+ (unsigned long) "",
+ 0xbad1fed1,
+ 0xbad2fed2,
+ 0xbad3fed3,
+ 0xbad4fed4,
+ 0xbad5fed5
+ },
+ {
+ __NR_gettid,
+ 0xcaf0bea0,
+ 0xcaf1bea1,
+ 0xcaf2bea2,
+ 0xcaf3bea3,
+ 0xcaf4bea4,
+ 0xcaf5bea5
+ },
+ {
+ __NR_exit_group,
+ 0,
+ 0xfac1c0d1,
+ 0xfac2c0d2,
+ 0xfac3c0d3,
+ 0xfac4c0d4,
+ 0xfac5c0d5
+ }
+ };
+ const unsigned long *exp_args;
+
+ pid_t pid = fork();
+
+ ASSERT_LE(0, pid) {
+ TH_LOG("fork: %m");
+ }
+
+ /* Tracee: request tracing, stop, then run the syscalls from args[]. */
+ if (pid == 0) {
+ /* get the pid before PTRACE_TRACEME */
+ pid = getpid();
+ ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) {
+ TH_LOG("PTRACE_TRACEME: %m");
+ }
+ ASSERT_EQ(0, kill(pid, SIGSTOP)) {
+ /* cannot happen */
+ TH_LOG("kill SIGSTOP: %m");
+ }
+ for (unsigned int i = 0; i < ARRAY_SIZE(args); ++i) {
+ syscall(args[i][0],
+ args[i][1], args[i][2], args[i][3],
+ args[i][4], args[i][5], args[i][6]);
+ }
+ /* unreachable */
+ _exit(1);
+ }
+
+ /* Expected exit-stop results for the two syscalls that return. */
+ const struct {
+ unsigned int is_error;
+ int rval;
+ } *exp_param, exit_param[] = {
+ { 1, -ENOENT }, /* chdir */
+ { 0, pid } /* gettid */
+ };
+
+ unsigned int ptrace_stop;
+
+ /* Tracer: one iteration per wait(); the loop ends when the tracee exits. */
+ for (ptrace_stop = 0; ; ++ptrace_stop) {
+ struct ptrace_syscall_info info = {
+ .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */
+ };
+ const size_t size = sizeof(info);
+ /* Expected return sizes, derived from member offsets within info. */
+ const int expected_none_size =
+ (void *) &info.entry - (void *) &info;
+ const int expected_entry_size =
+ (void *) &info.entry.args[6] - (void *) &info;
+ const int expected_exit_size =
+ (void *) (&info.exit.is_error + 1) -
+ (void *) &info;
+ int status;
+ long rc;
+
+ ASSERT_EQ(pid, wait(&status)) {
+ /* cannot happen */
+ LOG_KILL_TRACEE("wait: %m");
+ }
+ if (WIFEXITED(status)) {
+ pid = 0; /* the tracee is no more */
+ ASSERT_EQ(0, WEXITSTATUS(status));
+ break;
+ }
+ ASSERT_FALSE(WIFSIGNALED(status)) {
+ pid = 0; /* the tracee is no more */
+ LOG_KILL_TRACEE("unexpected signal %u",
+ WTERMSIG(status));
+ }
+ ASSERT_TRUE(WIFSTOPPED(status)) {
+ /* cannot happen */
+ LOG_KILL_TRACEE("unexpected wait status %#x", status);
+ }
+
+ switch (WSTOPSIG(status)) {
+ /* The initial self-inflicted SIGSTOP: only valid as the first stop. */
+ case SIGSTOP:
+ ASSERT_EQ(0, ptrace_stop) {
+ LOG_KILL_TRACEE("unexpected signal stop");
+ }
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, pid, 0,
+ PTRACE_O_TRACESYSGOOD)) {
+ LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m");
+ }
+ ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO,
+ pid, size,
+ (unsigned long) &info))) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m");
+ }
+ ASSERT_EQ(expected_none_size, rc) {
+ LOG_KILL_TRACEE("signal stop mismatch");
+ }
+ ASSERT_EQ(PTRACE_SYSCALL_INFO_NONE, info.op) {
+ LOG_KILL_TRACEE("signal stop mismatch");
+ }
+ ASSERT_TRUE(info.arch) {
+ LOG_KILL_TRACEE("signal stop mismatch");
+ }
+ ASSERT_TRUE(info.instruction_pointer) {
+ LOG_KILL_TRACEE("signal stop mismatch");
+ }
+ ASSERT_TRUE(info.stack_pointer) {
+ LOG_KILL_TRACEE("signal stop mismatch");
+ }
+ break;
+
+ /* Syscall stop, distinguishable because PTRACE_O_TRACESYSGOOD was set. */
+ case SIGTRAP | 0x80:
+ ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO,
+ pid, size,
+ (unsigned long) &info))) {
+ LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m");
+ }
+ /* Odd stop numbers are syscall entries, even ones are exits. */
+ switch (ptrace_stop) {
+ case 1: /* entering chdir */
+ case 3: /* entering gettid */
+ case 5: /* entering exit_group */
+ exp_args = args[ptrace_stop / 2];
+ ASSERT_EQ(expected_entry_size, rc) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info.op) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_TRUE(info.arch) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_TRUE(info.instruction_pointer) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_TRUE(info.stack_pointer) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[0], info.entry.nr) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[1], info.entry.args[0]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[2], info.entry.args[1]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[3], info.entry.args[2]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[4], info.entry.args[3]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[5], info.entry.args[4]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ ASSERT_EQ(exp_args[6], info.entry.args[5]) {
+ LOG_KILL_TRACEE("entry stop mismatch");
+ }
+ break;
+ case 2: /* exiting chdir */
+ case 4: /* exiting gettid */
+ exp_param = &exit_param[ptrace_stop / 2 - 1];
+ ASSERT_EQ(expected_exit_size, rc) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info.op) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_TRUE(info.arch) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_TRUE(info.instruction_pointer) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_TRUE(info.stack_pointer) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_EQ(exp_param->is_error,
+ info.exit.is_error) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ ASSERT_EQ(exp_param->rval, info.exit.rval) {
+ LOG_KILL_TRACEE("exit stop mismatch");
+ }
+ break;
+ default:
+ LOG_KILL_TRACEE("unexpected syscall stop");
+ abort();
+ }
+ break;
+
+ default:
+ LOG_KILL_TRACEE("unexpected stop signal %#x",
+ WSTOPSIG(status));
+ abort();
+ }
+
+ /* Resume the tracee until its next syscall entry or exit. */
+ ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, pid, 0, 0)) {
+ LOG_KILL_TRACEE("PTRACE_SYSCALL: %m");
+ }
+ }
+
+ /* One signal stop plus entry/exit stops, minus the exit_group exit stop. */
+ ASSERT_EQ(ARRAY_SIZE(args) * 2, ptrace_stop);
+ }
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/ptrace/peeksiginfo.c b/tools/testing/selftests/ptrace/peeksiginfo.c
new file mode 100644
index 000000000..54900657e
--- /dev/null
+++ b/tools/testing/selftests/ptrace/peeksiginfo.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+
+#include "linux/ptrace.h"
+
+ /* Invoke the raw rt_sigqueueinfo system call and return its result. */
+ static int sys_rt_sigqueueinfo(pid_t tgid, int sig, siginfo_t *uinfo)
+ {
+ 	int rc;
+
+ 	rc = syscall(SYS_rt_sigqueueinfo, tgid, sig, uinfo);
+ 	return rc;
+ }
+
+ /* Invoke the raw rt_tgsigqueueinfo system call and return its result. */
+ static int sys_rt_tgsigqueueinfo(pid_t tgid, pid_t tid,
+ 				 int sig, siginfo_t *uinfo)
+ {
+ 	int rc;
+
+ 	rc = syscall(SYS_rt_tgsigqueueinfo, tgid, tid, sig, uinfo);
+ 	return rc;
+ }
+
+ /* Invoke the raw ptrace system call and return its result as an int. */
+ static int sys_ptrace(int request, pid_t pid, void *addr, void *data)
+ {
+ 	int rc;
+
+ 	rc = syscall(SYS_ptrace, request, pid, addr, data);
+ 	return rc;
+ }
+
+#define SIGNR 10
+#define TEST_SICODE_PRIV -1
+#define TEST_SICODE_SHARE -2
+
+#ifndef PAGE_SIZE
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+#endif
+
+ /*
+  * Print a printf-style diagnostic to stderr, prefixed with the source file
+  * and line of the call site.  No newline is appended; callers include it in
+  * fmt.
+  */
+ #define err(fmt, ...) \
+ fprintf(stderr, \
+ "Error (%s:%d): " fmt, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+ /*
+  * Exercise the failure modes of PTRACE_PEEKSIGINFO on the attached child:
+  * unsupported flags, a destination buffer that is only partly writable,
+  * and a destination buffer that is not writable at all.
+  *
+  * Returns 0 when every case behaves as expected, 1 when the initial
+  * mapping cannot be created, and -1 on any other failure.
+  */
+ static int check_error_paths(pid_t child)
+ {
+ 	struct ptrace_peeksiginfo_args arg;
+ 	int ret, exit_code = -1;
+ 	void *addr_rw, *addr_ro;
+
+ 	/*
+ 	 * Allocate two contiguous pages. The first one is for read-write,
+ 	 * another is for read-only.
+ 	 */
+ 	addr_rw = mmap(NULL, 2 * PAGE_SIZE, PROT_READ | PROT_WRITE,
+ 		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ 	if (addr_rw == MAP_FAILED) {
+ 		err("mmap() failed: %m\n");
+ 		return 1;
+ 	}
+
+ 	/* Replace the second page in place with a read-only mapping. */
+ 	addr_ro = mmap(addr_rw + PAGE_SIZE, PAGE_SIZE, PROT_READ,
+ 		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+ 	if (addr_ro == MAP_FAILED) {
+ 		err("mmap() failed: %m\n");
+ 		goto out;
+ 	}
+
+ 	arg.nr = SIGNR;
+ 	arg.off = 0;
+
+ 	/* Unsupported flags */
+ 	arg.flags = ~0;
+ 	ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_rw);
+ 	if (ret != -1 || errno != EINVAL) {
+ 		err("sys_ptrace() returns %d (expected -1),"
+ 		    " errno %d (expected %d): %m\n",
+ 		    ret, errno, EINVAL);
+ 		goto out;
+ 	}
+ 	arg.flags = 0;
+
+ 	/*
+ 	 * A part of the buffer is read-only: only the two entries that fit
+ 	 * in front of the read-only page are expected back.
+ 	 */
+ 	ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg,
+ 			 addr_ro - sizeof(siginfo_t) * 2);
+ 	if (ret != 2) {
+ 		err("sys_ptrace() returns %d (expected 2): %m\n", ret);
+ 		goto out;
+ 	}
+
+ 	/*
+ 	 * Read-only buffer: the call must fail AND report EFAULT.  The two
+ 	 * conditions are joined with ||, as in the EINVAL check above; with
+ 	 * && a wrong errno (or a non-negative return that left errno at
+ 	 * EFAULT) would have been accepted.
+ 	 */
+ 	ret = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, addr_ro);
+ 	if (ret != -1 || errno != EFAULT) {
+ 		err("sys_ptrace() returns %d (expected -1),"
+ 		    " errno %d (expected %d): %m\n",
+ 		    ret, errno, EFAULT);
+ 		goto out;
+ 	}
+
+ 	exit_code = 0;
+ out:
+ 	/* Both pages lie in the original two-page range, so one unmap suffices. */
+ 	munmap(addr_rw, 2 * PAGE_SIZE);
+ 	return exit_code;
+ }
+
+ /*
+  * Read the child's queued signals with PTRACE_PEEKSIGINFO, at most nr per
+  * call, from the shared queue when shared == 1 and from the private queue
+  * otherwise.  Every entry must carry the si_code matching the chosen queue
+  * and an si_int equal to its position; exactly SIGNR entries must be seen.
+  *
+  * Returns 0 on success and -1 on any mismatch or ptrace failure.
+  */
+ int check_direct_path(pid_t child, int shared, int nr)
+ {
+ 	struct ptrace_peeksiginfo_args arg = {.flags = 0, .nr = nr, .off = 0};
+ 	siginfo_t batch[SIGNR];
+ 	int want_code = TEST_SICODE_PRIV;
+ 	int seen = 0;
+
+ 	if (shared == 1) {
+ 		arg.flags = PTRACE_PEEKSIGINFO_SHARED;
+ 		want_code = TEST_SICODE_SHARE;
+ 	}
+
+ 	while (seen < SIGNR) {
+ 		int k, got;
+
+ 		/* Continue from the first entry not yet verified. */
+ 		arg.off = seen;
+ 		got = sys_ptrace(PTRACE_PEEKSIGINFO, child, &arg, batch);
+ 		if (got == -1) {
+ 			err("ptrace() failed: %m\n");
+ 			return -1;
+ 		}
+
+ 		/* Queue exhausted. */
+ 		if (got == 0)
+ 			break;
+
+ 		for (k = 0; k < got; k++, seen++) {
+ 			if (batch[k].si_code != want_code ||
+ 			    batch[k].si_int != seen) {
+ 				err("%d: Wrong siginfo i=%d si_code=%d si_int=%d\n",
+ 				    shared, seen, batch[k].si_code,
+ 				    batch[k].si_int);
+ 				return -1;
+ 			}
+ 		}
+ 	}
+
+ 	if (seen != SIGNR) {
+ 		err("Only %d signals were read\n", seen);
+ 		return -1;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Fork a child with SIGRTMIN blocked, queue SIGNR signals on both its
+  * process-wide and per-thread queues, attach to it, and verify that
+  * PTRACE_PEEKSIGINFO reports them correctly (including the error paths).
+  *
+  * Exits with 0 and prints "PASS" on success, 1 otherwise.
+  */
+ int main(int argc, char *argv[])
+ {
+ 	siginfo_t siginfo[SIGNR];
+ 	int i, exit_code = 1;
+ 	sigset_t blockmask;
+ 	pid_t child;
+
+ 	/* Block SIGRTMIN before forking so the queued signals stay pending. */
+ 	sigemptyset(&blockmask);
+ 	sigaddset(&blockmask, SIGRTMIN);
+ 	sigprocmask(SIG_BLOCK, &blockmask, NULL);
+
+ 	child = fork();
+ 	if (child == -1) {
+ 		/* Newline added: every other err() call terminates its line. */
+ 		err("fork() failed: %m\n");
+ 		return 1;
+ 	} else if (child == 0) {
+ 		/* Child: idle until the parent goes away, then exit. */
+ 		pid_t ppid = getppid();
+ 		while (1) {
+ 			if (ppid != getppid())
+ 				break;
+ 			sleep(1);
+ 		}
+ 		return 1;
+ 	}
+
+ 	/* Send signals in process-wide and per-thread queues */
+ 	for (i = 0; i < SIGNR; i++) {
+ 		siginfo->si_code = TEST_SICODE_SHARE;
+ 		siginfo->si_int = i;
+ 		sys_rt_sigqueueinfo(child, SIGRTMIN, siginfo);
+
+ 		siginfo->si_code = TEST_SICODE_PRIV;
+ 		siginfo->si_int = i;
+ 		sys_rt_tgsigqueueinfo(child, child, SIGRTMIN, siginfo);
+ 	}
+
+ 	if (sys_ptrace(PTRACE_ATTACH, child, NULL, NULL) == -1)
+ 		return 1;
+
+ 	/* Wait for the attach stop before peeking at the queues. */
+ 	waitpid(child, NULL, 0);
+
+ 	/* Dump signals one by one */
+ 	if (check_direct_path(child, 0, 1))
+ 		goto out;
+ 	/* Dump all signals for one call */
+ 	if (check_direct_path(child, 0, SIGNR))
+ 		goto out;
+
+ 	/*
+ 	 * Dump signals from the process-wide queue.
+ 	 * The number of signals is not a multiple of the buffer size.
+ 	 */
+ 	if (check_direct_path(child, 1, 3))
+ 		goto out;
+
+ 	if (check_error_paths(child))
+ 		goto out;
+
+ 	printf("PASS\n");
+ 	exit_code = 0;
+ out:
+ 	/* Always terminate and reap the child, whatever the verdict. */
+ 	if (sys_ptrace(PTRACE_KILL, child, NULL, NULL) == -1)
+ 		return 1;
+
+ 	waitpid(child, NULL, 0);
+
+ 	return exit_code;
+ }
diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c
new file mode 100644
index 000000000..4db327b44
--- /dev/null
+++ b/tools/testing/selftests/ptrace/vmaccess.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Copyright (c) 2020 Bernd Edlinger <bernd.edlinger@hotmail.de>
+ * All rights reserved.
+ *
+ * Check whether /proc/$pid/mem can be accessed without causing deadlocks
+ * when de_thread is blocked with ->cred_guard_mutex held.
+ */
+
+#include "../kselftest_harness.h"
+#include <stdio.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+
+ /*
+  * Thread body used by both tests: request tracing by the parent via
+  * PTRACE_TRACEME and return immediately.  arg is unused and the result
+  * of ptrace() is not checked.
+  */
+ static void *thread(void *arg)
+ {
+ ptrace(PTRACE_TRACEME, 0, 0L, 0L);
+ return NULL;
+ }
+
+ /*
+  * Check that /proc/<pid>/mem of the forked child can be opened while the
+  * child is between the PTRACE_TRACEME thread and its exec of "true".
+  */
+ TEST(vmaccess)
+ {
+ int f, pid = fork();
+ char mm[64];
+
+ /* Child: run thread() to completion, then exec. */
+ if (!pid) {
+ pthread_t pt;
+
+ pthread_create(&pt, NULL, thread, NULL);
+ pthread_join(pt, NULL);
+ execlp("true", "true", NULL);
+ }
+
+ /* presumably gives the child time to reach exec - timing-based, TODO confirm */
+ sleep(1);
+ sprintf(mm, "/proc/%d/mem", pid);
+ f = open(mm, O_RDONLY);
+ ASSERT_GE(f, 0);
+ close(f);
+ /* The child must still exist for the signal to be deliverable. */
+ f = kill(pid, SIGCONT);
+ ASSERT_EQ(f, 0);
+ }
+
+ /*
+  * Check the PTRACE_ATTACH sequence against a child that ran the
+  * PTRACE_TRACEME thread and then exec'd "sleep 2": the first attach is
+  * expected to fail with EAGAIN, a later one to succeed, and after detach
+  * the child is expected to exit normally with status 0.
+  */
+ TEST(attach)
+ {
+ int s, k, pid = fork();
+
+ /* Child: run thread() to completion, then exec. */
+ if (!pid) {
+ pthread_t pt;
+
+ pthread_create(&pt, NULL, thread, NULL);
+ pthread_join(pt, NULL);
+ execlp("sleep", "sleep", "2", NULL);
+ }
+
+ sleep(1);
+ k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ /* errno is checked first, before any other call can overwrite it. */
+ ASSERT_EQ(errno, EAGAIN);
+ ASSERT_EQ(k, -1);
+ /* A waitable child other than pid must already have exited with 0. */
+ k = waitpid(-1, &s, WNOHANG);
+ ASSERT_NE(k, -1);
+ ASSERT_NE(k, 0);
+ ASSERT_NE(k, pid);
+ ASSERT_EQ(WIFEXITED(s), 1);
+ ASSERT_EQ(WEXITSTATUS(s), 0);
+ sleep(1);
+ /* Second attempt: attach must now succeed and stop pid with SIGSTOP. */
+ k = ptrace(PTRACE_ATTACH, pid, 0L, 0L);
+ ASSERT_EQ(k, 0);
+ k = waitpid(-1, &s, 0);
+ ASSERT_EQ(k, pid);
+ ASSERT_EQ(WIFSTOPPED(s), 1);
+ ASSERT_EQ(WSTOPSIG(s), SIGSTOP);
+ /* Let the child run to completion and collect its exit status. */
+ k = ptrace(PTRACE_DETACH, pid, 0L, 0L);
+ ASSERT_EQ(k, 0);
+ k = waitpid(-1, &s, 0);
+ ASSERT_EQ(k, pid);
+ ASSERT_EQ(WIFEXITED(s), 1);
+ ASSERT_EQ(WEXITSTATUS(s), 0);
+ /* No children may remain. */
+ k = waitpid(-1, NULL, 0);
+ ASSERT_EQ(k, -1);
+ ASSERT_EQ(errno, ECHILD);
+ }
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/rcutorture/.gitignore b/tools/testing/selftests/rcutorture/.gitignore
new file mode 100644
index 000000000..f6cbce774
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+initrd
+b[0-9]*
+res
+*.swp
diff --git a/tools/testing/selftests/rcutorture/Makefile b/tools/testing/selftests/rcutorture/Makefile
new file mode 100644
index 000000000..5202dc666
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0+
+ # "all" names no file; mark it phony so a stray file called "all" cannot
+ # suppress the run.  The recipe runs kvm.sh from four directories up,
+ # inside a subshell so the cd does not leak.
+ .PHONY: all
+ all:
+ 	( cd ../../../..; tools/testing/selftests/rcutorture/bin/kvm.sh --duration 10 --configs TREE01 )
diff --git a/tools/testing/selftests/rcutorture/bin/configNR_CPUS.sh b/tools/testing/selftests/rcutorture/bin/configNR_CPUS.sh
new file mode 100755
index 000000000..2deea2169
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/configNR_CPUS.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Extract the number of CPUs expected from the specified Kconfig-file
+# fragment by checking CONFIG_SMP and CONFIG_NR_CPUS. If the specified
+# file gives no clue, base the number on the number of idle CPUs on
+# the system.
+#
+# Usage: configNR_CPUS.sh config-frag
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+ cf=$1
+ # $cf is quoted everywhere.  With an empty or missing argument the unquoted
+ # form "test ! -r $cf" collapses to "test ! -r", which is false, so the
+ # readability check was silently skipped and grep then read standard input.
+ if test ! -r "$cf"
+ then
+ 	echo Unreadable config fragment "$cf" 1>&2
+ 	exit -1
+ fi
+ # A uniprocessor build always means exactly one CPU.
+ if grep -q '^CONFIG_SMP=n$' "$cf"
+ then
+ 	echo 1
+ 	exit 0
+ fi
+ # Otherwise report the leading number of CONFIG_NR_CPUS, if present.
+ if grep -q '^CONFIG_NR_CPUS=' "$cf"
+ then
+ 	grep '^CONFIG_NR_CPUS=' "$cf" |
+ 		sed -e 's/^CONFIG_NR_CPUS=\([0-9]*\).*$/\1/'
+ 	exit 0
+ fi
+ # The fragment gives no clue: fall back to cpus2use.sh.
+ cpus2use.sh
diff --git a/tools/testing/selftests/rcutorture/bin/config_override.sh b/tools/testing/selftests/rcutorture/bin/config_override.sh
new file mode 100755
index 000000000..90016c359
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/config_override.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# config_override.sh base override
+#
+# Combines base and override, removing any Kconfig options from base
+# that conflict with any in override, concatenating what remains and
+# sending the result to standard output.
+#
+# Copyright (C) IBM Corporation, 2017
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+ # Arguments are quoted below: an unquoted "test -r $base" with an empty
+ # argument collapses to "test -r", which is true.  Diagnostics go to stderr
+ # because stdout carries the combined configuration.
+ base=$1
+ if test -r "$base"
+ then
+ 	:
+ else
+ 	echo Base file $base unreadable!!! 1>&2
+ 	exit 1
+ fi
+
+ override=$2
+ if test -r "$override"
+ then
+ 	:
+ else
+ 	echo Override file $override unreadable!!! 1>&2
+ 	exit 1
+ fi
+
+ T=${TMPDIR-/tmp}/config_override.sh.$$
+ trap 'rm -rf $T' 0
+ mkdir $T
+
+ # Turn each override line "OPT=value" into a filter stage 'grep -v "OPT="'
+ # and have awk join the stages with " |" into a single pipeline.
+ sed < "$override" -e 's/^/grep -v "/' -e 's/=.*$/="/' |
+ 	awk '
+ 	{
+ 		if (last)
+ 			print last " |";
+ 		last = $0;
+ 	}
+ 	END {
+ 		if (last)
+ 			print last;
+ 	}' > $T/script
+ if test -s $T/script
+ then
+ 	sh $T/script < "$base"
+ else
+ 	# An empty override produces an empty filter script, which would
+ 	# emit nothing and thereby drop every base option.  Pass the base
+ 	# file through unchanged instead.
+ 	cat "$base"
+ fi
+ cat "$override"
diff --git a/tools/testing/selftests/rcutorture/bin/configcheck.sh b/tools/testing/selftests/rcutorture/bin/configcheck.sh
new file mode 100755
index 000000000..31584cee8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/configcheck.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Usage: configcheck.sh .config .config-template
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+T=${TMPDIR-/tmp}/abat-chk-config.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+cat $1 > $T/.config
+
+cat $2 | sed -e 's/\(.*\)=n/# \1 is not set/' -e 's/^#CHECK#//' |
+grep -v '^CONFIG_INITRAMFS_SOURCE' |
+awk '
+{
+ print "if grep -q \"" $0 "\" < '"$T/.config"'";
+ print "then";
+ print "\t:";
+ print "else";
+ if ($1 == "#") {
+ print "\tif grep -q \"" $2 "\" < '"$T/.config"'";
+ print "\tthen";
+ print "\t\tif test \"$firsttime\" = \"\""
+ print "\t\tthen"
+ print "\t\t\tfirsttime=1"
+ print "\t\tfi"
+ print "\t\techo \":" $2 ": improperly set\"";
+ print "\telse";
+ print "\t\t:";
+ print "\tfi";
+ } else {
+ print "\tif test \"$firsttime\" = \"\""
+ print "\tthen"
+ print "\t\tfirsttime=1"
+ print "\tfi"
+ print "\techo \":" $0 ": improperly set\"";
+ }
+ print "fi";
+ }' | sh
diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh
new file mode 100755
index 000000000..d6e5ce084
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/configinit.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Usage: configinit.sh config-spec-file results-dir
+#
+# Create a .config file from the spec file. Run from the kernel source tree.
+# Exits with 0 if all went well, with 1 if all went well but the config
+# did not match, and some other number for other failures.
+#
+# The first argument is the .config specification file, which contains
+# desired settings, for example, "CONFIG_NO_HZ=y". For best results,
+# this should be a full pathname.
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+T=${TMPDIR-/tmp}/configinit.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+# Capture config spec file.
+
+c=$1
+resdir=$2
+
+sed -e 's/^\(CONFIG[0-9A-Z_]*\)=.*$/grep -v "^# \1" |/' < $c > $T/u.sh
+sed -e 's/^\(CONFIG[0-9A-Z_]*=\).*$/grep -v \1 |/' < $c >> $T/u.sh
+grep '^grep' < $T/u.sh > $T/upd.sh
+echo "cat - $c" >> $T/upd.sh
+if test -z "$TORTURE_TRUST_MAKE"
+then
+ make clean > $resdir/Make.clean 2>&1
+fi
+make $TORTURE_KMAKE_ARG $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1
+mv .config .config.sav
+sh $T/upd.sh < .config.sav > .config
+cp .config .config.new
+yes '' | make $TORTURE_KMAKE_ARG oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err
+
+# verify new config matches specification.
+configcheck.sh .config $c
+
+exit 0
diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh
new file mode 100755
index 000000000..80ae7f08b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Scan standard input for error messages, dumping any found to standard
+# output.
+#
+# Usage: console-badness.sh
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+ # Select the lines that look like trouble, then drop the known-benign ones.
+ grep -E 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' |
+ grep -v \
+ 	-e 'ODEBUG: ' \
+ 	-e 'This means that this is a DEBUG kernel and it is' \
+ 	-e 'Warning: unable to open an initial console' \
+ 	-e 'NOHZ tick-stop error: Non-RCU local softirq work is pending, handler'
diff --git a/tools/testing/selftests/rcutorture/bin/cpus2use.sh b/tools/testing/selftests/rcutorture/bin/cpus2use.sh
new file mode 100755
index 000000000..1dbfb6256
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/cpus2use.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Get an estimate of how CPU-hoggy to be.
+#
+# Usage: cpus2use.sh
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+ # An explicit allotment overrides any estimate.
+ if test -n "$TORTURE_ALLOTED_CPUS"
+ then
+ 	echo $TORTURE_ALLOTED_CPUS
+ 	exit 0
+ fi
+ ncpus=`grep -c '^processor' /proc/cpuinfo`
+ if mpstat -V > /dev/null 2>&1
+ then
+ 	# Scale the CPU count by the percentage computed from the last
+ 	# line of mpstat output (field 7 plus the final field).
+ 	idlecpus=`mpstat | tail -1 | \
+ 		awk -v ncpus=$ncpus '{ print ncpus * ($7 + $NF) / 100 }'`
+ else
+ 	# No mpstat command, so use all available CPUs.  The notice goes
+ 	# to stderr: stdout is this script's result and is captured by
+ 	# callers, so printing it there corrupted the CPU count.
+ 	echo The mpstat command is not available, so greedily using all CPUs. 1>&2
+ 	idlecpus=$ncpus
+ fi
+ # Clamp to at least one CPU and at least a tenth of the machine, then
+ # round up to a whole number.
+ awk -v ncpus=$ncpus -v idlecpus=$idlecpus < /dev/null '
+ BEGIN {
+ 	cpus2use = idlecpus;
+ 	if (cpus2use < 1)
+ 		cpus2use = 1;
+ 	if (cpus2use < ncpus / 10)
+ 		cpus2use = ncpus / 10;
+ 	if (cpus2use == int(cpus2use))
+ 		cpus2use = int(cpus2use)
+ 	else
+ 		cpus2use = int(cpus2use) + 1
+ 	print cpus2use;
+ }'
+
diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
new file mode 100644
index 000000000..51f3464b9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -0,0 +1,292 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# bootparam_hotplug_cpu bootparam-string
+#
+# Returns 1 if the specified boot-parameter string tells rcutorture to
+# test CPU-hotplug operations.
+bootparam_hotplug_cpu () {
+ echo "$1" | grep -q "torture\.onoff_"
+}
+
+# checkarg --argname argtype $# arg mustmatch cannotmatch
+#
+# Checks the specified argument "arg" against the mustmatch and cannotmatch
+# patterns.
+checkarg () {
+ if test $3 -le 1
+ then
+ echo $1 needs argument $2 matching \"$5\"
+ usage
+ fi
+ if echo "$4" | grep -q -e "$5"
+ then
+ :
+ else
+ echo $1 $2 \"$4\" must match \"$5\"
+ usage
+ fi
+ if echo "$4" | grep -q -e "$6"
+ then
+ echo $1 $2 \"$4\" must not match \"$6\"
+ usage
+ fi
+}
+
+# configfrag_boot_params bootparam-string config-fragment-file
+#
+# Adds boot parameters from the .boot file, if any.
+configfrag_boot_params () {
+ if test -r "$2.boot"
+ then
+ echo $1 `grep -v '^#' "$2.boot" | tr '\012' ' '`
+ else
+ echo $1
+ fi
+}
+
+# configfrag_boot_cpus bootparam-string config-fragment-file config-cpus
+#
+# Decreases number of CPUs based on any nr_cpus= boot parameters specified.
+configfrag_boot_cpus () {
+ local bootargs="`configfrag_boot_params "$1" "$2"`"
+ local nr_cpus
+ if echo "${bootargs}" | grep -q 'nr_cpus=[0-9]'
+ then
+ nr_cpus="`echo "${bootargs}" | sed -e 's/^.*nr_cpus=\([0-9]*\).*$/\1/'`"
+ if test "$3" -gt "$nr_cpus"
+ then
+ echo $nr_cpus
+ else
+ echo $3
+ fi
+ else
+ echo $3
+ fi
+}
+
+# configfrag_boot_maxcpus bootparam-string config-fragment-file config-cpus
+#
+# Decreases number of CPUs based on any maxcpus= boot parameters specified.
+# This allows tests where additional CPUs come online later during the
+# test run. However, the torture parameters will be set based on the
+# number of CPUs initially present, so the scripting should schedule
+# test runs based on the maxcpus= boot parameter controlling the initial
+# number of CPUs instead of on the ultimate number of CPUs.
+configfrag_boot_maxcpus () {
+ local bootargs="`configfrag_boot_params "$1" "$2"`"
+ local maxcpus
+ if echo "${bootargs}" | grep -q 'maxcpus=[0-9]'
+ then
+ maxcpus="`echo "${bootargs}" | sed -e 's/^.*maxcpus=\([0-9]*\).*$/\1/'`"
+ if test "$3" -gt "$maxcpus"
+ then
+ echo $maxcpus
+ else
+ echo $3
+ fi
+ else
+ echo $3
+ fi
+}
+
+# configfrag_hotplug_cpu config-fragment-file
+#
+# Returns 1 if the config fragment specifies hotplug CPU.
+configfrag_hotplug_cpu () {
+ if test ! -r "$1"
+ then
+ echo Unreadable config fragment "$1" 1>&2
+ exit -1
+ fi
+ grep -q '^CONFIG_HOTPLUG_CPU=y$' "$1"
+}
+
+# identify_boot_image qemu-cmd
+#
+# Returns the relative path to the kernel build image. This will be
+# arch/<arch>/boot/bzImage or vmlinux if bzImage is not a target for the
+# architecture, unless overridden with the TORTURE_BOOT_IMAGE environment
+# variable.
+identify_boot_image () {
+ if test -n "$TORTURE_BOOT_IMAGE"
+ then
+ echo $TORTURE_BOOT_IMAGE
+ else
+ case "$1" in
+ qemu-system-x86_64|qemu-system-i386)
+ echo arch/x86/boot/bzImage
+ ;;
+ qemu-system-aarch64)
+ echo arch/arm64/boot/Image
+ ;;
+ *)
+ echo vmlinux
+ ;;
+ esac
+ fi
+}
+
+# identify_qemu builddir
+#
+# Returns our best guess as to which qemu command is appropriate for
+# the kernel at hand. Override with the TORTURE_QEMU_CMD environment variable.
+identify_qemu () {
+ local u="`file "$1"`"
+ if test -n "$TORTURE_QEMU_CMD"
+ then
+ echo $TORTURE_QEMU_CMD
+ elif echo $u | grep -q x86-64
+ then
+ echo qemu-system-x86_64
+ elif echo $u | grep -q "Intel 80386"
+ then
+ echo qemu-system-i386
+ elif echo $u | grep -q aarch64
+ then
+ echo qemu-system-aarch64
+ elif uname -a | grep -q ppc64
+ then
+ echo qemu-system-ppc64
+ else
+ echo Cannot figure out what qemu command to use! 1>&2
+ echo file $1 output: $u
+ # Usually this will be one of /usr/bin/qemu-system-*
+ # Use TORTURE_QEMU_CMD environment variable or appropriate
+ # argument to top-level script.
+ exit 1
+ fi
+}
+
+# identify_qemu_append qemu-cmd
+#
+# Output arguments for the qemu "-append" string based on CPU type
+# and the TORTURE_QEMU_INTERACTIVE environment variable.
+identify_qemu_append () {
+ local console=ttyS0
+ case "$1" in
+ qemu-system-x86_64|qemu-system-i386)
+ echo selinux=0 initcall_debug debug
+ ;;
+ qemu-system-aarch64)
+ console=ttyAMA0
+ ;;
+ esac
+ if test -n "$TORTURE_QEMU_INTERACTIVE"
+ then
+ echo root=/dev/sda
+ else
+ echo console=$console
+ fi
+}
+
+# identify_qemu_args qemu-cmd serial-file
+#
+# Output arguments for qemu arguments based on the TORTURE_QEMU_MAC
+# and TORTURE_QEMU_INTERACTIVE environment variables.
+identify_qemu_args () {
+ local KVM_CPU=""
+ case "$1" in
+ qemu-system-x86_64)
+ KVM_CPU=kvm64
+ ;;
+ qemu-system-i386)
+ KVM_CPU=kvm32
+ ;;
+ esac
+ case "$1" in
+ qemu-system-x86_64|qemu-system-i386)
+ echo -machine q35,accel=kvm
+ echo -cpu ${KVM_CPU}
+ ;;
+ qemu-system-aarch64)
+ echo -machine virt,gic-version=host -cpu host
+ ;;
+ qemu-system-ppc64)
+ echo -enable-kvm -M pseries -nodefaults
+ echo -device spapr-vscsi
+ if test -n "$TORTURE_QEMU_INTERACTIVE" -a -n "$TORTURE_QEMU_MAC"
+ then
+ echo -device spapr-vlan,netdev=net0,mac=$TORTURE_QEMU_MAC
+ echo -netdev bridge,br=br0,id=net0
+ fi
+ ;;
+ esac
+ if test -n "$TORTURE_QEMU_INTERACTIVE"
+ then
+ echo -monitor stdio -serial pty -S
+ else
+ echo -serial file:$2
+ fi
+}
+
+# identify_qemu_vcpus
+#
+# Returns the number of virtual CPUs available to the aggregate of the
+# guest OSes.
+identify_qemu_vcpus () {
+ lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ ]*//g'
+}
+
+# print_bug
+#
+# Prints "BUG: " in red followed by remaining arguments
+print_bug () {
+ printf '\033[031mBUG: \033[m'
+ echo $*
+}
+
+# print_warning
+#
+# Prints "WARNING: " in yellow followed by remaining arguments
+print_warning () {
+ printf '\033[033mWARNING: \033[m'
+ echo $*
+}
+
+# specify_qemu_cpus qemu-cmd qemu-args #cpus
+#
+# Appends a string containing "-smp XXX" to qemu-args, unless the incoming
+# qemu-args already contains "-smp".
+specify_qemu_cpus () {
+ local nt;
+
+ if echo $2 | grep -q -e -smp
+ then
+ echo $2
+ else
+ case "$1" in
+ qemu-system-x86_64|qemu-system-i386|qemu-system-aarch64)
+ echo $2 -smp $3
+ ;;
+ qemu-system-ppc64)
+ nt="`lscpu | grep '^NUMA node0' | sed -e 's/^[^,]*,\([0-9]*\),.*$/\1/'`"
+ echo $2 -smp cores=`expr \( $3 + $nt - 1 \) / $nt`,threads=$nt
+ ;;
+ esac
+ fi
+}
+
+# specify_qemu_net qemu-args
+#
+# Appends a string containing "-net none" to qemu-args, unless the incoming
+# qemu-args already contains "-smp" or unless the TORTURE_QEMU_INTERACTIVE
+# environment variable is set, in which case the string that is be added is
+# instead "-net nic -net user".
+specify_qemu_net () {
+ if echo $1 | grep -q -e -net
+ then
+ echo $1
+ elif test -n "$TORTURE_QEMU_INTERACTIVE"
+ then
+ echo $1 -net nic -net user
+ else
+ echo $1 -net none
+ fi
+}
diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
new file mode 100755
index 000000000..188b864bc
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Alternate sleeping and spinning on randomly selected CPUs. The purpose
+# of this script is to inflict random OS jitter on a concurrently running
+# test.
+#
+# Usage: jitter.sh me duration [ sleepmax [ spinmax ] ]
+#
+# me: Random-number-generator seed salt.
+# duration: Time to run in seconds.
+# sleepmax: Maximum microseconds to sleep, defaults to one second.
+# spinmax: Maximum microseconds to spin, defaults to one millisecond.
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+me=$(($1 * 1000))
+duration=$2
+sleepmax=${3-1000000}
+spinmax=${4-1000}
+
+n=1
+
+starttime=`gawk 'BEGIN { print systime(); }' < /dev/null`
+
+nohotplugcpus=
+for i in /sys/devices/system/cpu/cpu[0-9]*
+do
+ if test -f $i/online
+ then
+ :
+ else
+ curcpu=`echo $i | sed -e 's/^[^0-9]*//'`
+ nohotplugcpus="$nohotplugcpus $curcpu"
+ fi
+done
+
+while :
+do
+ # Check for done.
+ t=`gawk -v s=$starttime 'BEGIN { print systime() - s; }' < /dev/null`
+ if test "$t" -gt "$duration"
+ then
+ exit 0;
+ fi
+
+ # Check for stop request.
+ if test -f "$TORTURE_STOPFILE"
+ then
+ exit 1;
+ fi
+
+ # Set affinity to randomly selected online CPU
+ if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 |
+ sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
+ then
+ :
+ else
+ cpus=
+ fi
+ # Do not leave out non-hot-pluggable CPUs
+ cpus="$cpus $nohotplugcpus"
+
+ cpumask=`awk -v cpus="$cpus" -v me=$me -v n=$n 'BEGIN {
+ srand(n + me + systime());
+ ncpus = split(cpus, ca);
+ curcpu = ca[int(rand() * ncpus + 1)];
+ mask = lshift(1, curcpu);
+ if (mask + 0 <= 0)
+ mask = 1;
+ printf("%#x\n", mask);
+ }' < /dev/null`
+ n=$(($n+1))
+ if ! taskset -p $cpumask $$ > /dev/null 2>&1
+ then
+ echo taskset failure: '"taskset -p ' $cpumask $$ '"'
+ exit 1
+ fi
+
+ # Sleep a random duration
+ sleeptime=`awk -v me=$me -v n=$n -v sleepmax=$sleepmax 'BEGIN {
+ srand(n + me + systime());
+ printf("%06d", int(rand() * sleepmax));
+ }' < /dev/null`
+ n=$(($n+1))
+ sleep .$sleeptime
+
+ # Spin a random duration
+ limit=`awk -v me=$me -v n=$n -v spinmax=$spinmax 'BEGIN {
+ srand(n + me + systime());
+ printf("%06d", int(rand() * spinmax));
+ }' < /dev/null`
+ n=$(($n+1))
+ for i in {1..$limit}
+ do
+ echo > /dev/null
+ done
+done
+
+exit 1
diff --git a/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
new file mode 100755
index 000000000..e5cc6b2f1
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kcsan-collapse.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# If this was a KCSAN run, collapse the reports in the various console.log
+# files onto pairs of functions.
+#
+# Usage: kcsan-collapse.sh resultsdir
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+if test -z "$TORTURE_KCONFIG_KCSAN_ARG"
+then
+ exit 0
+fi
+cat $1/*/console.log |
+ grep "BUG: KCSAN: " |
+ sed -e 's/^\[[^]]*] //' |
+ sort |
+ uniq -c |
+ sort -k1nr > $1/kcsan.sum
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
new file mode 100755
index 000000000..115e1822b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Build a kvm-ready Linux kernel from the tree in the current directory.
+#
+# Usage: kvm-build.sh config-template resdir
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# Honor a pending request to stop the run.
+if test -f "$TORTURE_STOPFILE"
+then
+	echo "kvm-build.sh early exit due to run STOP request"
+	exit 1
+fi
+
+# First argument: the Kconfig template, which must be a readable file.
+config_template=${1}
+if test -z "$config_template" -o ! -f "$config_template" -o ! -r "$config_template"
+then
+	echo "kvm-build.sh :$config_template: Not a readable file"
+	exit 1
+fi
+# Second argument: directory that receives Make.out.
+resdir=${2}
+
+# Private scratch directory, removed on exit.  mktemp creates it
+# atomically under an unpredictable name, and failure to create it is
+# fatal rather than silently ignored.
+T="`mktemp -d ${TMPDIR-/tmp}/kvm-build.sh.XXXXXX`" || exit 1
+trap 'rm -rf "$T"' 0
+
+# Append the options needed for booting under qemu to the template.
+cp "${config_template}" "$T/config"
+cat << ___EOF___ >> "$T/config"
+CONFIG_INITRAMFS_SOURCE="$TORTURE_INITRD"
+CONFIG_VIRTIO_PCI=y
+CONFIG_VIRTIO_CONSOLE=y
+___EOF___
+
+# Only a configinit.sh status greater than 1 aborts the build.
+configinit.sh "$T/config" "$resdir"
+retval=$?
+if test $retval -gt 1
+then
+	exit 2
+fi
+ncpus=`cpus2use.sh`
+# $TORTURE_KMAKE_ARG is deliberately unquoted: it may hold several words.
+make -j$ncpus $TORTURE_KMAKE_ARG > "$resdir/Make.out" 2>&1
+retval=$?
+# Fail on a non-zero make status, on any error in the build output, or on
+# a warning in a line mentioning an rcu* file.  "grep -E" replaces the
+# obsolescent egrep.
+if test $retval -ne 0 || grep "rcu[^/]*": < "$resdir/Make.out" | grep -E -q "Stop|Error|error:|warning:" || grep -E -q "Stop|Error|error:" < "$resdir/Make.out"
+then
+	echo Kernel build error
+	grep -E "Stop|Error|error:|warning:" < "$resdir/Make.out"
+	echo Run aborted.
+	exit 3
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
new file mode 100755
index 000000000..6e65c134e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
@@ -0,0 +1,108 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a group of kvm.sh tests on the specified commits. This currently
+# unconditionally does three-minute runs on each scenario in CFLIST,
+# taking advantage of all available CPUs and trusting the "make" utility.
+# In the short term, adjustments can be made by editing this script and
+# CFLIST. If some adjustments appear to have ongoing value, this script
+# might grow some command-line arguments.
+#
+# Usage: kvm-check-branches.sh commit1 commit2..commit3 commit4 ...
+#
+# This script considers its arguments one at a time. If more elaborate
+# specification of commits is needed, please use "git rev-list" to
+# produce something that this simple script can understand. The reason
+# for retaining the simplicity is that it allows the user to more easily
+# see which commit came from which branch.
+#
+# This script creates a yyyy.mm.dd-hh.mm.ss-group entry in the "res"
+# directory. The calls to kvm.sh create the usual entries, but this script
+# moves them under the yyyy.mm.dd-hh.mm.ss-group entry, each in its own
+# directory numbered in run order, that is, "0001", "0002", and so on.
+# For successful runs, the large build artifacts are removed. Doing this
+# reduces the disk space required by about two orders of magnitude for
+# successful runs.
+#
+# Copyright (C) Facebook, 2020
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+if ! git status > /dev/null 2>&1
+then
+	echo '!!!' This script needs to run in a git archive. 1>&2
+	echo '!!!' Giving up. 1>&2
+	exit 1
+fi
+
+# Remember where we started so that we can get back at the end.
+curcommit="`git status | head -1 | awk '{ print $NF }'`"
+
+nfail=0
+ntry=0
+resdir="tools/testing/selftests/rcutorture/res"
+ds="`date +%Y.%m.%d-%H.%M.%S`-group"
+if ! test -e $resdir
+then
+	mkdir $resdir || :
+fi
+mkdir $resdir/$ds
+echo Results directory: $resdir/$ds
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+cpus="`identify_qemu_vcpus`"
+echo Using up to $cpus CPUs.
+
+# Each pass through this loop does one command-line argument.  "$@" is
+# quoted so that each argument is passed to git rev-list exactly as given.
+for gitbr in "$@"
+do
+	echo ' --- git branch ' $gitbr
+
+	# Each pass through this loop tests one commit.
+	for i in `git rev-list "$gitbr"`
+	do
+		ntry=$(($ntry + 1))
+		# Zero-padded run number used as the per-commit directory name.
+		idir=`printf '%04d' "$ntry"`
+		echo ' --- commit ' $i from branch $gitbr
+		date
+		mkdir $resdir/$ds/$idir
+		echo $gitbr > $resdir/$ds/$idir/gitbr
+		echo $i >> $resdir/$ds/$idir/gitbr
+
+		# Test the specified commit.
+		git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
+		echo git checkout return code: $? "(Commit $ntry: $i)"
+		kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
+		ret=$?
+		echo kvm.sh return code $ret for commit $i from branch $gitbr
+
+		# Move the build products to their resting place.  If kvm.sh
+		# died before announcing a results directory there is nothing
+		# to move, so say so instead of running "mv" without a source.
+		runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
+		rrd=
+		if test -n "$runresdir"
+		then
+			mv $runresdir $resdir/$ds/$idir
+			rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
+			echo Run results: $resdir/$ds/$idir/$rrd
+		else
+			echo '!!!' No results directory reported by kvm.sh for commit $i 1>&2
+		fi
+		if test "$ret" -ne 0
+		then
+			# Failure, so leave all evidence intact.
+			nfail=$(($nfail + 1))
+		else
+			# Success, so remove large files to save about 1GB.
+			# Only remove if the cd succeeded; otherwise the rm
+			# would act on the current directory.
+			( cd $resdir/$ds/$idir/$rrd && rm -f */vmlinux */bzImage */System.map */Module.symvers )
+		fi
+	done
+done
+date
+
+# Go back to the original commit.
+git checkout "$curcommit"
+
+if test $nfail -ne 0
+then
+	echo '!!! ' $nfail failures in $ntry 'runs!!!'
+	exit 1
+else
+	echo No failures in $ntry runs.
+	exit 0
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
new file mode 100755
index 000000000..6f50722f2
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh
@@ -0,0 +1,66 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Invoke a text editor on all console.log files for all runs with diagnostics,
+# that is, on all such files having a console.log.diags counterpart.
+# Note that both console.log.diags and console.log are passed to the
+# editor (currently defaulting to "vi"), allowing the user to get an
+# idea of what to search for in the console.log file.
+#
+# Usage: kvm-find-errors.sh directory
+#
+# The "directory" above should end with the date/time directory, for example,
+# "tools/testing/selftests/rcutorture/res/2018.02.25-14:27:27".
+# Returns error status reflecting the success (or not) of the specified run.
+#
+# Copyright (C) IBM Corporation, 2018
+#
+# Author: Paul E. McKenney <paulmck@linux.ibm.com>
+
+rundir="${1}"
+if test -z "$rundir" -o ! -d "$rundir"
+then
+ echo Directory "$rundir" not found.
+ echo Usage: $0 directory
+ exit 1
+fi
+editor=${EDITOR-vi}
+
+# Find builds with errors
+files=
+for i in ${rundir}/*/Make.out
+do
+ if egrep -q "error:|warning:" < $i
+ then
+ egrep "error:|warning:" < $i > $i.diags
+ files="$files $i.diags $i"
+ fi
+done
+if test -n "$files"
+then
+ $editor $files
+else
+ echo No build errors.
+fi
+if grep -q -e "--buildonly" < ${rundir}/log
+then
+ echo Build-only run, no console logs to check.
+fi
+
+# Find console logs with errors
+files=
+for i in ${rundir}/*/console.log
+do
+ if test -r $i.diags
+ then
+ files="$files $i.diags $i"
+ fi
+done
+if test -n "$files"
+then
+ $editor $files
+ exit 1
+else
+ echo No errors in console logs.
+ exit 0
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh
new file mode 100755
index 000000000..f3a7a5e2b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for locktorture progress.
+#
+# Usage: kvm-recheck-lock.sh resdir
+#
+# Copyright (C) IBM Corporation, 2014
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# The sole argument is the results directory of one scenario.
+rd="$1"
+if ! test -d "$rd" -a -r "$rd"
+then
+	echo Unreadable results directory: $rd
+	exit 1
+fi
+
+# Scenario name is the last path component of the results directory.
+scenario=$(echo $rd | sed -e 's,^.*/,,')
+# Token following "Total: " on the last "Writes: Total:" console line.
+nacq=$(grep "Writes: Total:" $rd/console.log 2> /dev/null | tail -1 | sed -e 's/^.* Total: //' -e 's/ .*$//')
+if test -z "$nacq"
+then
+	echo "$scenario -------"
+	exit 0
+fi
+
+summary="$scenario ------- $nacq acquisitions/releases"
+# Run duration as recorded in the locktorture.shutdown_secs= boot parameter.
+secs=$(sed -e 's/^.* locktorture.shutdown_secs=//' -e 's/ .*$//' < $rd/qemu-cmd 2> /dev/null)
+if test -n "$secs"
+then
+	rate=$(awk -v ncs=$nacq -v dur=$secs '
+		BEGIN { print ncs / dur }' < /dev/null)
+	summary="$summary ($rate per second)"
+fi
+echo $summary
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
new file mode 100755
index 000000000..1706cd446
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for rcutorture progress.
+#
+# Usage: kvm-recheck-rcu.sh resdir
+#
+# Copyright (C) IBM Corporation, 2014
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+i="$1"
+if test -d "$i" -a -r "$i"
+then
+ :
+else
+ echo Unreadable results directory: $i
+ exit 1
+fi
+. functions.sh
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+ngps=`grep ver: $i/console.log 2> /dev/null | tail -1 | sed -e 's/^.* ver: //' -e 's/ .*$//'`
+stopstate="`grep 'End-test grace-period state: g' $i/console.log 2> /dev/null |
+ tail -1 | sed -e 's/^\[[ 0-9.]*] //' |
+ awk '{ print \"[\" $1 \" \" $5 \" \" $6 \" \" $7 \"]\"; }' |
+ tr -d '\012\015'`"
+fwdprog="`grep 'rcu_torture_fwd_prog_cr Duration' $i/console.log 2> /dev/null | sed -e 's/^\[[^]]*] //' | sort -k15nr | head -1 | awk '{ print $14 " " $15 }'`"
+if test -z "$ngps"
+then
+ echo "$configfile ------- " $stopstate
+else
+ title="$configfile ------- $ngps GPs"
+ dur=`sed -e 's/^.* rcutorture.shutdown_secs=//' -e 's/ .*$//' < $i/qemu-cmd 2> /dev/null`
+ if test -z "$dur"
+ then
+ :
+ else
+ ngpsps=`awk -v ngps=$ngps -v dur=$dur '
+ BEGIN { print ngps / dur }' < /dev/null`
+ title="$title ($ngpsps/s)"
+ fi
+ echo $title $stopstate $fwdprog
+ nclosecalls=`grep --binary-files=text 'torture: Reader Batch' $i/console.log | tail -1 | \
+ awk -v sum=0 '
+ {
+ for (i = 0; i <= NF; i++) {
+ sum += $i;
+ if ($i ~ /Batch:/) {
+ sum = 0;
+ i = i + 2;
+ }
+ }
+ }
+
+ END {
+ print sum
+ }'`
+ if test -z "$nclosecalls"
+ then
+ exit 0
+ fi
+ if test "$nclosecalls" -eq 0
+ then
+ exit 0
+ fi
+ # Compute number of close calls per tenth of an hour
+ nclosecalls10=`awk -v nclosecalls=$nclosecalls -v dur=$dur 'BEGIN { print int(nclosecalls * 36000 / dur) }' < /dev/null`
+ if test $nclosecalls10 -gt 5 -a $nclosecalls -gt 1
+ then
+ print_bug $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i
+ else
+ print_warning $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i
+ fi
+ echo $nclosecalls "Reader Batch close calls in" $(($dur/60)) minute run: $i > $i/console.log.rcu.diags
+fi
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh
new file mode 100755
index 000000000..d4bec5380
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for rcuscale performance measurements,
+# looking for ftrace data. Exits with 0 if data was found, analyzed, and
+# printed. Intended to be invoked from kvm-recheck-rcuscale.sh after
+# argument checking.
+#
+# Usage: kvm-recheck-rcuscale-ftrace.sh resdir
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+i="$1"
+. functions.sh
+
+# Require at least 100 grace-period start events before attempting the
+# ftrace-based analysis; status 10 tells the caller that no ftrace data
+# was analyzed.
+if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100
+then
+	exit 10
+fi
+
+# Strip console timestamps, keep only the rcu_exp_grace_period trace
+# events, normalize the "us :" separator, drop carriage returns, and let
+# awk pair up start/end events into grace-period durations.
+sed -e 's/^\[[^]]*]//' < $i/console.log |
+grep 'us : rcu_exp_grace_period' |
+sed -e 's/us : / : /' |
+tr -d '\015' |
+awk '
+# A "start" event opens a grace period; an unmatched earlier start is lost.
+$8 == "start" {
+	if (startseq != "")
+		nlost++;
+	starttask = $1;
+	starttime = $3;
+	startseq = $7;
+	seqtask[startseq] = starttask;
+}
+
+# An "end" event closes the grace period if its sequence number matches.
+$8 == "end" {
+	if (startseq == $7) {
+		curgpdur = $3 - starttime;
+		gptimes[++n] = curgpdur;
+		gptaskcnt[starttask]++;
+		sum += curgpdur;
+		if (curgpdur > 1000)
+			print "Long GP " starttime "us to " $3 "us (" curgpdur "us)";
+		startseq = "";
+	} else {
+		# Lost a message or some such, reset.
+		startseq = "";
+		nlost++;
+	}
+}
+
+# A "done" event from a task other than the starter is a piggyback.
+$8 == "done" && seqtask[$7] != $1 {
+	piggybackcnt[$1]++;
+}
+
+END {
+	newNR = asort(gptimes);
+	if (newNR <= 0) {
+		print "No ftrace records found???"
+		exit 10;
+	}
+	pct50 = int(newNR * 50 / 100);
+	if (pct50 < 1)
+		pct50 = 1;
+	pct90 = int(newNR * 90 / 100);
+	if (pct90 < 1)
+		pct90 = 1;
+	pct99 = int(newNR * 99 / 100);
+	if (pct99 < 1)
+		pct99 = 1;
+	div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100;
+	print "Histogram bucket size: " div;
+	last = gptimes[1] - 10;
+	count = 0;
+	for (i = 1; i <= newNR; i++) {
+		current = div * int(gptimes[i] / div);
+		if (last == current) {
+			count++;
+		} else {
+			if (count > 0)
+				print last, count;
+			count = 1;
+			last = current;
+		}
+	}
+	if (count > 0)
+		print last, count;
+	print "Distribution of grace periods across tasks:";
+	for (i in gptaskcnt) {
+		print "\t" i, gptaskcnt[i];
+		nbatches += gptaskcnt[i];
+	}
+	ngps = nbatches;
+	print "Distribution of piggybacking across tasks:";
+	for (i in piggybackcnt) {
+		print "\t" i, piggybackcnt[i];
+		ngps += piggybackcnt[i];
+	}
+	print "Average grace-period duration: " sum / newNR " microseconds";
+	print "Minimum grace-period duration: " gptimes[1];
+	print "50th percentile grace-period duration: " gptimes[pct50];
+	print "90th percentile grace-period duration: " gptimes[pct90];
+	print "99th percentile grace-period duration: " gptimes[pct99];
+	print "Maximum grace-period duration: " gptimes[newNR];
+	print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0;
+	print "Computed from ftrace data.";
+}'
+# awk is the last stage of the pipeline, so $? is its exit status: 0 when
+# data was found, analyzed, and printed, 10 when no usable records were
+# found.  Propagate it instead of unconditionally claiming success, so
+# that the caller can fall back to the printk-based analysis.
+exit $?
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh
new file mode 100755
index 000000000..aa745152a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for rcuscale scalability measurements.
+#
+# Usage: kvm-recheck-rcuscale.sh resdir
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# The sole argument is the results directory to analyze.
+i="$1"
+if test -d "$i" -a -r "$i"
+then
+	:
+else
+	echo Unreadable results directory: $i
+	exit 1
+fi
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. functions.sh
+
+# Prefer the ftrace-based analysis when it succeeds.
+if kvm-recheck-rcuscale-ftrace.sh $i
+then
+	# ftrace data was successfully analyzed, call it good!
+	exit 0
+fi
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+
+# Fall back to the rcuscale printk output: strip console timestamps and
+# let awk collect the per-writer durations.
+sed -e 's/^\[[^]]*]//' < $i/console.log |
+awk '
+# Summary line carrying the grace-period and batch counts.
+/-scale: .* gps: .* batches:/ {
+	ngps = $9;
+	nbatches = $11;
+}
+
+# One writer-duration sample, scaled down by a factor of 1000.
+/-scale: .*writer-duration/ {
+	gptimes[++n] = $5 / 1000.;
+	sum += $5 / 1000.;
+}
+
+END {
+	newNR = asort(gptimes);
+	if (newNR <= 0) {
+		print "No rcuscale records found???"
+		exit;
+	}
+	pct50 = int(newNR * 50 / 100);
+	if (pct50 < 1)
+		pct50 = 1;
+	pct90 = int(newNR * 90 / 100);
+	if (pct90 < 1)
+		pct90 = 1;
+	pct99 = int(newNR * 99 / 100);
+	if (pct99 < 1)
+		pct99 = 1;
+	div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100;
+	print "Histogram bucket size: " div;
+	last = gptimes[1] - 10;
+	count = 0;
+	for (i = 1; i <= newNR; i++) {
+		current = div * int(gptimes[i] / div);
+		if (last == current) {
+			count++;
+		} else {
+			if (count > 0)
+				print last, count;
+			count = 1;
+			last = current;
+		}
+	}
+	if (count > 0)
+		print last, count;
+	print "Average grace-period duration: " sum / newNR " microseconds";
+	print "Minimum grace-period duration: " gptimes[1];
+	print "50th percentile grace-period duration: " gptimes[pct50];
+	print "90th percentile grace-period duration: " gptimes[pct90];
+	print "99th percentile grace-period duration: " gptimes[pct99];
+	print "Maximum grace-period duration: " gptimes[newNR];
+	# The summary line may be missing (for example, if the run was cut
+	# short), in which case nbatches is zero and dividing by it would
+	# make awk abort before printing the final lines.
+	if (nbatches + 0 > 0)
+		ratio = ngps / nbatches;
+	else
+		ratio = "n/a";
+	print "Grace periods: " (ngps + 0) " Batches: " (nbatches + 0) " Ratio: " ratio;
+	print "Computed from rcuscale printk output.";
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
new file mode 100755
index 000000000..35a463ddd
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for refscale performance measurements.
+#
+# Usage: kvm-recheck-refscale.sh resdir
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# The sole argument is the results directory to analyze.
+i="$1"
+if test -d "$i" -a -r "$i"
+then
+	:
+else
+	echo Unreadable results directory: $i
+	exit 1
+fi
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. functions.sh
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+
+# Strip console timestamps and carriage returns, then let awk collect
+# the reader durations from the first results table.
+sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' |
+awk -v configfile="$configfile" '
+# Table header: enter the data phase (dataphase 0 -> 1), first time only.
+/^[ ]*Runs Time\(ns\) *$/ {
+	if (dataphase + 0 == 0) {
+		dataphase = 1;
+		# print configfile, $0;
+	}
+	next;
+}
+
+# Table row: run number followed by a decimal time.
+/[^ ]*[0-9][0-9]* [0-9][0-9]*\.[0-9][0-9]*$/ {
+	if (dataphase == 1) {
+		# print $0;
+		readertimes[++n] = $2;
+		sum += $2;
+	}
+	next;
+}
+
+# Anything else ends the data phase.  This must be an assignment: the
+# former "dataphase == 2" was a comparison with no effect, so the data
+# phase never ended.
+{
+	if (dataphase == 1)
+		dataphase = 2;
+	next;
+}
+
+END {
+	print configfile " results:";
+	newNR = asort(readertimes);
+	if (newNR <= 0) {
+		print "No refscale records found???"
+		exit;
+	}
+	# asort() renumbers the array from 1, so for an even count the
+	# median is the mean of elements N/2 and N/2 + 1, and for an odd
+	# count it is element int(N/2) + 1.
+	medianidx = int(newNR / 2);
+	if (newNR == medianidx * 2)
+		medianvalue = (readertimes[medianidx] + readertimes[medianidx + 1]) / 2;
+	else
+		medianvalue = readertimes[medianidx + 1];
+	points = "Points:";
+	for (i = 1; i <= newNR; i++)
+		points = points " " readertimes[i];
+	print points;
+	print "Average reader duration: " sum / newNR " nanoseconds";
+	print "Minimum reader duration: " readertimes[1];
+	print "Median reader duration: " medianvalue;
+	print "Maximum reader duration: " readertimes[newNR];
+	print "Computed from refscale printk output.";
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh
new file mode 100755
index 000000000..671bfee4f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-scf.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for scftorture progress.
+#
+# Usage: kvm-recheck-scf.sh resdir
+#
+# Copyright (C) Facebook, 2020
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+# The sole argument is the results directory of one scenario.
+rd="$1"
+if ! test -d "$rd" -a -r "$rd"
+then
+	echo Unreadable results directory: $rd
+	exit 1
+fi
+. functions.sh
+
+# Scenario name is the last path component of the results directory.
+scenario=$(echo $rd | sed -e 's,^.*/,,')
+# Token following "scf_invoked_count ver: " on the last such console line,
+# with carriage returns removed.
+ninvoked="$(grep 'scf_invoked_count ver:' $rd/console.log 2> /dev/null | tail -1 | sed -e 's/^.* scf_invoked_count ver: //' -e 's/ .*$//' | tr -d '\015')"
+if test -z "$ninvoked"
+then
+	echo "$scenario ------- "
+	exit 0
+fi
+
+# Run duration as recorded in the scftorture.shutdown_secs= boot parameter.
+secs="$(sed -e 's/^.* scftorture.shutdown_secs=//' -e 's/ .*$//' < $rd/qemu-cmd 2> /dev/null)"
+persec=""
+if test -n "$secs"
+then
+	quotient=$(awk -v nscfs=$ninvoked -v dur=$secs '
+		BEGIN { print nscfs / dur }' < /dev/null)
+	persec=" ($quotient/s)"
+fi
+echo "${scenario} ------- ${ninvoked} SCF handler invocations$persec"
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
new file mode 100755
index 000000000..840a4679a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Given the results directories for previous KVM-based torture runs,
+# check the build and console output for errors. Given a directory
+# containing results directories, this recursively checks them all.
+#
+# Usage: kvm-recheck.sh resdir ...
+#
+# Returns status reflecting the success or not of the last run specified.
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# Scratch file holding the most recent configcheck.sh output and, later,
+# the kvm-find-errors.sh output.  mktemp gives it an unpredictable name,
+# honors TMPDIR like the other scripts, and guarantees that the file
+# exists when it is grepped below.
+T="`mktemp ${TMPDIR-/tmp}/kvm-recheck.sh.XXXXXX`" || exit 1
+trap 'rm -f "$T"' 0 2
+
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. functions.sh
+for rd in "$@"
+do
+	firsttime=1
+	# Every directory containing a Make.defconfig.out is one scenario.
+	dirs=`find $rd -name Make.defconfig.out -print | sort | sed -e 's,/[^/]*$,,' | sort -u`
+	for i in $dirs
+	do
+		if test -n "$firsttime"
+		then
+			# Print the first line of the run log once per argument.
+			firsttime=""
+			resdir=`echo $i | sed -e 's,/$,,' -e 's,/[^/]*$,,'`
+			head -1 $resdir/log
+		fi
+		TORTURE_SUITE="`cat $i/../TORTURE_SUITE`"
+		configfile=`echo $i | sed -e 's,^.*/,,'`
+		rm -f $i/console.log.*.diags
+		kvm-recheck-${TORTURE_SUITE}.sh $i
+		# Read qemu's exit status once; empty means none was recorded.
+		qemu_retval=
+		if test -f "$i/qemu-retval"
+		then
+			qemu_retval="`cat $i/qemu-retval`"
+		fi
+		if test -n "$qemu_retval" && test "$qemu_retval" -ne 0 && test "$qemu_retval" -ne 137
+		then
+			echo QEMU error, output:
+			cat $i/qemu-output
+		elif test -f "$i/console.log"
+		then
+			if test -n "$qemu_retval" && test "$qemu_retval" -eq 137
+			then
+				echo QEMU killed
+			fi
+			configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1
+			cat $T
+			if test -r $i/Make.oldconfig.err
+			then
+				cat $i/Make.oldconfig.err
+			fi
+			parse-build.sh $i/Make.out $configfile
+			parse-console.sh $i/console.log $configfile
+			if test -r $i/Warnings
+			then
+				cat $i/Warnings
+			fi
+		else
+			if test -f "$i/buildonly"
+			then
+				echo Build-only run, no boot/test
+				configcheck.sh $i/.config $i/ConfigFragment
+				parse-build.sh $i/Make.out $configfile
+			elif test -f "$i/qemu-cmd"
+			then
+				print_bug qemu failed
+				echo "   $i"
+			else
+				print_bug Build failed
+				echo "   $i"
+			fi
+		fi
+	done
+	if test -f "$rd/kcsan.sum"
+	then
+		# CONFIG_KCSAN=y appearing in the configcheck.sh output means
+		# the requested option did not make it into the .config.
+		if grep -q CONFIG_KCSAN=y $T
+		then
+			echo "Compiler or architecture does not support KCSAN!"
+			echo Did you forget to switch your compiler with '--kmake-arg CC=<cc-that-supports-kcsan>'?
+		elif test -s "$rd/kcsan.sum"
+		then
+			echo KCSAN summary in $rd/kcsan.sum
+		else
+			echo Clean KCSAN run in $rd
+		fi
+	fi
+done
+# Run kvm-find-errors.sh on the last argument with "echo" standing in for
+# the editor, so that $T receives the list of files it would have opened.
+EDITOR=echo kvm-find-errors.sh "${@: -1}" > $T 2>&1
+ret=$?
+builderrors="`tr ' ' '\012' < $T | grep -c '/Make.out.diags'`"
+if test "$builderrors" -gt 0
+then
+	echo $builderrors runs with build errors.
+fi
+runerrors="`tr ' ' '\012' < $T | grep -c '/console.log.diags'`"
+if test "$runerrors" -gt 0
+then
+	echo $runerrors runs with runtime errors.
+fi
+exit $ret
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
new file mode 100755
index 000000000..6dc2b49b8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -0,0 +1,287 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a kvm-based test of the specified tree on the specified configs.
+# Fully automated run and error checking, no graphics console.
+#
+# Execute this in the source tree. Do not run it as a background task
+# because qemu does not seem to like that much.
+#
+# Usage: kvm-test-1-run.sh config builddir resdir seconds qemu-args boot_args
+#
+# qemu-args defaults to "-enable-kvm -nographic", along with arguments
+# specifying the number of CPUs and other options
+# generated from the underlying CPU architecture.
+# boot_args defaults to value returned by the per_version_boot_params
+# shell function.
+#
+# Anything you specify for either qemu-args or boot_args is appended to
+# the default values. The "-smp" value is deduced from the contents of
+# the config fragment.
+#
+# More sophisticated argument parsing is clearly needed.
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+T=${TMPDIR-/tmp}/kvm-test-1-run.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+. functions.sh
+. $CONFIGFRAG/ver_functions.sh
+
+config_template=${1}
+config_dir=`echo $config_template | sed -e 's,/[^/]*$,,'`
+title=`echo $config_template | sed -e 's/^.*\///'`
+builddir=${2}
+resdir=${3}
+if test -z "$resdir" -o ! -d "$resdir" -o ! -w "$resdir"
+then
+ echo "kvm-test-1-run.sh :$resdir: Not a writable directory, cannot store results into it"
+ exit 1
+fi
+echo ' ---' `date`: Starting build
+echo ' ---' Kconfig fragment at: $config_template >> $resdir/log
+touch $resdir/ConfigFragment.input
+
+# Combine additional Kconfig options into an existing set such that
+# newer options win. The first argument is the Kconfig source ID, the
+# second the to-be-updated file within $T, and the third and final the
+# list of additional Kconfig options. Note that a $2.tmp file is
+# created when doing the update.
+config_override_param () {
+	# Nothing to merge when no additional options were supplied.
+	test -n "$3" || return 0
+
+	# One option per line, with surrounding blanks trimmed.
+	echo $3 | sed -e 's/^ *//' -e 's/ *$//' | tr -s " " "\012" > $T/Kconfig_args
+
+	# Log where these options came from, followed by the options.
+	{
+		echo " --- $1"
+		cat $T/Kconfig_args
+	} >> $resdir/ConfigFragment.input
+
+	# Merge into the target list by way of a temporary file.
+	# Note that "#CHECK#" is not permitted on commandline.
+	config_override.sh $T/$2 $T/Kconfig_args > $T/$2.tmp
+	mv $T/$2.tmp $T/$2
+}
+
+echo > $T/KcList
+config_override_param "$config_dir/CFcommon" KcList "`cat $config_dir/CFcommon 2> /dev/null`"
+config_override_param "$config_template" KcList "`cat $config_template 2> /dev/null`"
+config_override_param "--gdb options" KcList "$TORTURE_KCONFIG_GDB_ARG"
+config_override_param "--kasan options" KcList "$TORTURE_KCONFIG_KASAN_ARG"
+config_override_param "--kcsan options" KcList "$TORTURE_KCONFIG_KCSAN_ARG"
+config_override_param "--kconfig argument" KcList "$TORTURE_KCONFIG_ARG"
+cp $T/KcList $resdir/ConfigFragment
+
+base_resdir=`echo $resdir | sed -e 's/\.[0-9]\+$//'`
+if test "$base_resdir" != "$resdir" -a -f $base_resdir/bzImage -a -f $base_resdir/vmlinux
+then
+ # Rerunning previous test, so use that test's kernel.
+ QEMU="`identify_qemu $base_resdir/vmlinux`"
+ BOOT_IMAGE="`identify_boot_image $QEMU`"
+ KERNEL=$base_resdir/${BOOT_IMAGE##*/} # use the last component of ${BOOT_IMAGE}
+ ln -s $base_resdir/Make*.out $resdir # for kvm-recheck.sh
+ ln -s $base_resdir/.config $resdir # for kvm-recheck.sh
+ # Arch-independent indicator
+ touch $resdir/builtkernel
+elif kvm-build.sh $T/KcList $resdir
+then
+ # Had to build a kernel for this test.
+ QEMU="`identify_qemu vmlinux`"
+ BOOT_IMAGE="`identify_boot_image $QEMU`"
+ cp vmlinux $resdir
+ cp .config $resdir
+ cp Module.symvers $resdir > /dev/null || :
+ cp System.map $resdir > /dev/null || :
+ if test -n "$BOOT_IMAGE"
+ then
+ cp $BOOT_IMAGE $resdir
+ KERNEL=$resdir/${BOOT_IMAGE##*/}
+ # Arch-independent indicator
+ touch $resdir/builtkernel
+ else
+ echo No identifiable boot image, not running KVM, see $resdir.
+ echo Do the torture scripts know about your architecture?
+ fi
+ parse-build.sh $resdir/Make.out $title
+else
+ # Build failed.
+ cp .config $resdir || :
+ echo Build failed, not running KVM, see $resdir.
+ if test -f $builddir.wait
+ then
+ mv $builddir.wait $builddir.ready
+ fi
+ exit 1
+fi
+if test -f $builddir.wait
+then
+ mv $builddir.wait $builddir.ready
+fi
+while test -f $builddir.ready
+do
+ sleep 1
+done
+seconds=$4
+qemu_args=$5
+boot_args=$6
+
+kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
+if test -z "$TORTURE_BUILDONLY"
+then
+ echo ' ---' `date`: Starting kernel
+fi
+
+# Generate -smp qemu argument.
+qemu_args="-enable-kvm -nographic $qemu_args"
+cpu_count=`configNR_CPUS.sh $resdir/ConfigFragment`
+cpu_count=`configfrag_boot_cpus "$boot_args" "$config_template" "$cpu_count"`
+if test "$cpu_count" -gt "$TORTURE_ALLOTED_CPUS"
+then
+ echo CPU count limited from $cpu_count to $TORTURE_ALLOTED_CPUS | tee -a $resdir/Warnings
+ cpu_count=$TORTURE_ALLOTED_CPUS
+fi
+qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
+qemu_args="`specify_qemu_net "$qemu_args"`"
+
+# Generate architecture-specific and interaction-specific qemu arguments
+qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"
+
+# Generate qemu -append arguments
+qemu_append="`identify_qemu_append "$QEMU"`"
+
+# Pull in Kconfig-fragment boot parameters
+boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
+# Generate kernel-version-specific boot parameters
+boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
+if test -n "$TORTURE_BOOT_GDB_ARG"
+then
+ boot_args="$boot_args $TORTURE_BOOT_GDB_ARG"
+fi
+echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" $TORTURE_QEMU_GDB_ARG > $resdir/qemu-cmd
+
+if test -n "$TORTURE_BUILDONLY"
+then
+ echo Build-only run specified, boot/test omitted.
+ touch $resdir/buildonly
+ exit 0
+fi
+
+# Decorate qemu-cmd with redirection, backgrounding, and PID capture
+sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd
+echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd
+
+# In case qemu refuses to run...
+echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
+
+# Attempt to run qemu
+# The background subshell sources the decorated command, waits for the
+# recorded PID, and saves qemu's exit status in qemu-retval.
+( . $T/qemu-cmd; wait `cat $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
+commandcompleted=0
+if test -z "$TORTURE_KCONFIG_GDB_ARG"
+then
+ sleep 10 # Give qemu's pid a chance to reach the file
+ if test -s "$resdir/qemu_pid"
+ then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+ echo Monitoring qemu job at pid $qemu_pid
+ else
+ qemu_pid=""
+ echo Monitoring qemu job at yet-as-unknown pid
+ fi
+fi
+if test -n "$TORTURE_KCONFIG_GDB_ARG"
+then
+ # Tell the user on the terminal how to attach gdb, then restart the
+ # run-time clock.
+ echo Waiting for you to attach a debug session, for example: > /dev/tty
+ echo " gdb $base_resdir/vmlinux" > /dev/tty
+ echo 'After symbols load and the "(gdb)" prompt appears:' > /dev/tty
+ echo " target remote :1234" > /dev/tty
+ echo " continue" > /dev/tty
+ kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
+fi
+# Poll once per second until qemu exits, the requested duration elapses,
+# or the stop file appears.
+while :
+do
+ # Pick up the PID as soon as it has been written.
+ if test -z "$qemu_pid" -a -s "$resdir/qemu_pid"
+ then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+ fi
+ kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
+ # An unknown PID is treated the same as a still-running qemu.
+ if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1
+ then
+ if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE"
+ then
+ break;
+ fi
+ sleep 1
+ else
+ # qemu is gone; an early exit is logged to Warnings along with any
+ # "(qemu) qemu:" lines and, if a killer PID is named, its ps entry.
+ commandcompleted=1
+ if test $kruntime -lt $seconds
+ then
+ echo Completed in $kruntime vs. $seconds >> $resdir/Warnings 2>&1
+ grep "^(qemu) qemu:" $resdir/kvm-test-1-run.sh.out >> $resdir/Warnings 2>&1
+ killpid="`sed -n "s/^(qemu) qemu: terminating on signal [0-9]* from pid \([0-9]*\).*$/\1/p" $resdir/Warnings`"
+ if test -n "$killpid"
+ then
+ echo "ps -fp $killpid" >> $resdir/Warnings 2>&1
+ ps -fp $killpid >> $resdir/Warnings 2>&1
+ fi
+ else
+ echo ' ---' `date`: "Kernel done"
+ fi
+ break
+ fi
+done
+# Last chance to learn the PID before deciding how to shut qemu down.
+if test -z "$qemu_pid" -a -s "$resdir/qemu_pid"
+then
+ qemu_pid=`cat "$resdir/qemu_pid"`
+fi
+if test $commandcompleted -eq 0 -a -n "$qemu_pid"
+then
+ # qemu is still running past the requested duration (or a stop was
+ # requested): check on it every ten seconds, killing it on a stop
+ # request or once it appears hung.
+ if ! test -f "$TORTURE_STOPFILE"
+ then
+ echo Grace period for qemu job at pid $qemu_pid
+ fi
+ oldline="`tail $resdir/console.log`"
+ while :
+ do
+ if test -f "$TORTURE_STOPFILE"
+ then
+ echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1
+ kill -KILL $qemu_pid
+ break
+ fi
+ kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
+ if kill -0 $qemu_pid > /dev/null 2>&1
+ then
+ :
+ else
+ break
+ fi
+ must_continue=no
+ newline="`tail $resdir/console.log`"
+ # Keep waiting while the console tail is changing and contains a
+ # " <digits>us : " pattern.
+ if test "$newline" != "$oldline" && echo $newline | grep -q ' [0-9]\+us : '
+ then
+ must_continue=yes
+ fi
+ # Integer part of the most recent bracketed console timestamp, or 0
+ # if there is none.
+ last_ts="`tail $resdir/console.log | grep '^\[ *[0-9]\+\.[0-9]\+]' | tail -1 | sed -e 's/^\[ *//' -e 's/\..*$//'`"
+ if test -z "$last_ts"
+ then
+ last_ts=0
+ fi
+ # Also keep waiting while output is changing and the console
+ # timestamp is still below seconds + $TORTURE_SHUTDOWN_GRACE.
+ if test "$newline" != "$oldline" -a "$last_ts" -lt $((seconds + $TORTURE_SHUTDOWN_GRACE))
+ then
+ must_continue=yes
+ fi
+ # Otherwise, once the run time reaches seconds plus the shutdown
+ # grace, declare qemu hung and kill it.
+ if test $must_continue = no -a $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE))
+ then
+ echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1
+ kill -KILL $qemu_pid
+ break
+ fi
+ oldline=$newline
+ sleep 10
+ done
+elif test -z "$qemu_pid"
+then
+ echo Unknown PID, cannot kill qemu command
+fi
+
+# Hand the console output to parse-console.sh.
+parse-console.sh $resdir/console.log $title
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
new file mode 100755
index 000000000..c45a953ef
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Transform a qemu-cmd file to allow reuse.
+#
+# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out
+#
+# bzImage: Kernel and initrd from the same prior kvm.sh run.
+# console.log: File into which to place console output.
+#
+# The original qemu-cmd file is provided on standard input.
+# The transformed qemu-cmd file is on standard output.
+# The transformation assumes that the qemu command is confined to a
+# single line. It also assumes no whitespace in filenames.
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+# Validate the two required positional arguments.  Standard output is
+# the transformed qemu-cmd file, so diagnostics go to standard error
+# rather than being written into that file.
+image="$1"
+if test -z "$image"
+then
+	echo Need kernel image file. >&2
+	exit 1
+fi
+consolelog="$2"
+if test -z "$consolelog"
+then
+	echo "Need console log file name." >&2
+	exit 1
+fi
+
+# Copy each input line field by field, rejoining fields with single
+# spaces.  The argument following "-serial" is replaced by
+# "file:<consolelog>" and the argument following "-kernel" by <image>.
+awk -v image="$image" -v consolelog="$consolelog" '
+{
+	line = "";
+	for (i = 1; i <= NF; i++) {
+		if (line == "")
+			line = $i;
+		else
+			line = line " " $i;
+		if ($i == "-serial") {
+			# Skip the original argument and emit the replacement.
+			i++;
+			line = line " file:" consolelog;
+		}
+		if ($i == "-kernel") {
+			# Skip the original argument and emit the replacement.
+			i++;
+			line = line " " image;
+		}
+	}
+	print line;
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
new file mode 100755
index 000000000..6eb1d3f65
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -0,0 +1,536 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a series of tests under KVM. By default, this series is specified
+# by the relevant CFLIST file, but can be overridden by the --configs
+# command-line argument.
+#
+# Usage: kvm.sh [ options ]
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+scriptname=$0
+args="$*"
+
+T=${TMPDIR-/tmp}/kvm.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+cd `dirname $scriptname`/../../../../../
+
+dur=$((30*60))
+dryrun=""
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+
+TORTURE_ALLOTED_CPUS="`identify_qemu_vcpus`"
+TORTURE_DEFCONFIG=defconfig
+TORTURE_BOOT_IMAGE=""
+TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
+TORTURE_KCONFIG_ARG=""
+TORTURE_KCONFIG_GDB_ARG=""
+TORTURE_BOOT_GDB_ARG=""
+TORTURE_QEMU_GDB_ARG=""
+TORTURE_KCONFIG_KASAN_ARG=""
+TORTURE_KCONFIG_KCSAN_ARG=""
+TORTURE_KMAKE_ARG=""
+TORTURE_QEMU_MEM=512
+TORTURE_SHUTDOWN_GRACE=180
+TORTURE_SUITE=rcu
+TORTURE_TRUST_MAKE=""
+resdir=""
+configs=""
+cpus=0
+ds=`date +%Y.%m.%d-%H.%M.%S`
+jitter="-1"
+
+# usage
+#
+# Prints the list of command-line options accepted by the argument
+# parser in this script, then exits with status 1.
+usage () {
+	echo "Usage: $scriptname optional arguments:"
+	echo " --allcpus"
+	echo " --bootargs kernel-boot-arguments"
+	echo " --bootimage relative-path-to-kernel-boot-image"
+	echo " --buildonly"
+	echo " --configs \"config-file list w/ repeat factor (3*TINY01)\""
+	echo " --cpus N"
+	echo " --datestamp string"
+	echo " --defconfig string"
+	echo " --dryrun sched|script"
+	echo " --duration minutes"
+	echo " --gdb"
+	echo " --help"
+	echo " --interactive"
+	echo " --jitter N [ maxsleep (us) [ maxspin (us) ] ]"
+	echo " --kasan"
+	echo " --kconfig Kconfig-options"
+	echo " --kcsan"
+	echo " --kmake-arg kernel-make-arguments"
+	echo " --mac nn:nn:nn:nn:nn:nn"
+	echo " --memory megabytes|nnnG"
+	echo " --no-initrd"
+	echo " --qemu-args qemu-arguments"
+	echo " --qemu-cmd qemu-system-..."
+	echo " --results absolute-pathname"
+	echo " --shutdown-grace seconds"
+	echo " --torture lock|rcu|rcuscale|refscale|scf"
+	echo " --trust-make"
+	exit 1
+}
+
+while test $# -gt 0
+do
+ case "$1" in
+ --allcpus)
+ cpus=$TORTURE_ALLOTED_CPUS
+ max_cpus=$TORTURE_ALLOTED_CPUS
+ ;;
+ --bootargs|--bootarg)
+ checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
+ TORTURE_BOOTARGS="$2"
+ shift
+ ;;
+ --bootimage)
+ checkarg --bootimage "(relative path to kernel boot image)" "$#" "$2" '[a-zA-Z0-9][a-zA-Z0-9_]*' '^--'
+ TORTURE_BOOT_IMAGE="$2"
+ shift
+ ;;
+ --buildonly)
+ TORTURE_BUILDONLY=1
+ ;;
+ --configs|--config)
+ checkarg --configs "(list of config files)" "$#" "$2" '^[^/]*$' '^--'
+ configs="$2"
+ shift
+ ;;
+ --cpus)
+ checkarg --cpus "(number)" "$#" "$2" '^[0-9]*$' '^--'
+ cpus=$2
+ TORTURE_ALLOTED_CPUS="$2"
+ max_cpus="`identify_qemu_vcpus`"
+ if test "$TORTURE_ALLOTED_CPUS" -gt "$max_cpus"
+ then
+ TORTURE_ALLOTED_CPUS=$max_cpus
+ fi
+ shift
+ ;;
+ --datestamp)
+ checkarg --datestamp "(relative pathname)" "$#" "$2" '^[^/]*$' '^--'
+ ds=$2
+ shift
+ ;;
+ --defconfig)
+ checkarg --defconfig "defconfigtype" "$#" "$2" '^[^/][^/]*$' '^--'
+ TORTURE_DEFCONFIG=$2
+ shift
+ ;;
+ --dryrun)
+ checkarg --dryrun "sched|script" $# "$2" 'sched\|script' '^--'
+ dryrun=$2
+ shift
+ ;;
+ --duration)
+ checkarg --duration "(minutes)" $# "$2" '^[0-9]*$' '^error'
+ dur=$(($2*60))
+ shift
+ ;;
+ --gdb)
+ TORTURE_KCONFIG_GDB_ARG="CONFIG_DEBUG_INFO=y"; export TORTURE_KCONFIG_GDB_ARG
+ TORTURE_BOOT_GDB_ARG="nokaslr"; export TORTURE_BOOT_GDB_ARG
+ TORTURE_QEMU_GDB_ARG="-s -S"; export TORTURE_QEMU_GDB_ARG
+ ;;
+ --help|-h)
+ usage
+ ;;
+ --interactive)
+ TORTURE_QEMU_INTERACTIVE=1; export TORTURE_QEMU_INTERACTIVE
+ ;;
+ --jitter)
+ checkarg --jitter "(# threads [ sleep [ spin ] ])" $# "$2" '^-\{,1\}[0-9]\+\( \+[0-9]\+\)\{,2\} *$' '^error$'
+ jitter="$2"
+ shift
+ ;;
+ --kconfig)
+ checkarg --kconfig "(Kconfig options)" $# "$2" '^CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\( CONFIG_[A-Z0-9_]\+=\([ynm]\|[0-9]\+\)\)*$' '^error$'
+ TORTURE_KCONFIG_ARG="$2"
+ shift
+ ;;
+ --kasan)
+ TORTURE_KCONFIG_KASAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KASAN=y"; export TORTURE_KCONFIG_KASAN_ARG
+ ;;
+ --kcsan)
+ TORTURE_KCONFIG_KCSAN_ARG="CONFIG_DEBUG_INFO=y CONFIG_KCSAN=y CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000 CONFIG_KCSAN_VERBOSE=y CONFIG_KCSAN_INTERRUPT_WATCHER=y"; export TORTURE_KCONFIG_KCSAN_ARG
+ ;;
+ --kmake-arg)
+ checkarg --kmake-arg "(kernel make arguments)" $# "$2" '.*' '^error$'
+ TORTURE_KMAKE_ARG="$2"
+ shift
+ ;;
+ --mac)
+ checkarg --mac "(MAC address)" $# "$2" '^\([0-9a-fA-F]\{2\}:\)\{5\}[0-9a-fA-F]\{2\}$' error
+ TORTURE_QEMU_MAC=$2
+ shift
+ ;;
+ --memory)
+ checkarg --memory "(memory size)" $# "$2" '^[0-9]\+[MG]\?$' error
+ TORTURE_QEMU_MEM=$2
+ shift
+ ;;
+ --no-initrd)
+ TORTURE_INITRD=""; export TORTURE_INITRD
+ ;;
+ --qemu-args|--qemu-arg)
+ checkarg --qemu-args "(qemu arguments)" $# "$2" '^-' '^error'
+ TORTURE_QEMU_ARG="$2"
+ shift
+ ;;
+ --qemu-cmd)
+ checkarg --qemu-cmd "(qemu-system-...)" $# "$2" 'qemu-system-' '^--'
+ TORTURE_QEMU_CMD="$2"
+ shift
+ ;;
+ --results)
+ checkarg --results "(absolute pathname)" "$#" "$2" '^/' '^error'
+ resdir=$2
+ shift
+ ;;
+ --shutdown-grace)
+ checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error'
+ TORTURE_SHUTDOWN_GRACE=$2
+ shift
+ ;;
+ --torture)
+ checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--'
+ TORTURE_SUITE=$2
+ shift
+ if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale
+ then
+ # If you really want jitter for refscale or
+ # rcuscale, specify it after specifying the rcuscale
+ # or the refscale. (But why jitter in these cases?)
+ jitter=0
+ fi
+ ;;
+ --trust-make)
+ TORTURE_TRUST_MAKE="y"
+ ;;
+ *)
+ echo Unknown argument $1
+ usage
+ ;;
+ esac
+ shift
+done
+
+# An initrd is wanted unless --no-initrd emptied TORTURE_INITRD; abort
+# the run if mkinitrd.sh then reports failure.
+if test -n "$TORTURE_INITRD" && ! tools/testing/selftests/rcutorture/bin/mkinitrd.sh
+then
+	echo No initrd and unable to create one, aborting test >&2
+	exit 1
+fi
+
+# Per-suite directory holding the config fragments and the CFLIST file.
+export CONFIGFRAG="${KVM}/configs/${TORTURE_SUITE}"
+
+# CFLIST, with newlines turned into spaces, is the default --configs list.
+defaultconfigs="$(tr '\012' ' ' < $CONFIGFRAG/CFLIST)"
+configs=${configs:-$defaultconfigs}
+
+# Results go under $KVM/res unless --results supplied a directory.
+resdir=${resdir:-$KVM/res}
+
+# Create a file of test-name/#cpus pairs, sorted by decreasing #cpus.
+configs_derep=
+for CF in $configs
+do
+ case $CF in
+ [0-9]\**|[0-9][0-9]\**|[0-9][0-9][0-9]\**)
+ config_reps=`echo $CF | sed -e 's/\*.*$//'`
+ CF1=`echo $CF | sed -e 's/^[^*]*\*//'`
+ ;;
+ *)
+ config_reps=1
+ CF1=$CF
+ ;;
+ esac
+ for ((cur_rep=0;cur_rep<$config_reps;cur_rep++))
+ do
+ configs_derep="$configs_derep $CF1"
+ done
+done
+touch $T/cfgcpu
+configs_derep="`echo $configs_derep | sed -e "s/\<CFLIST\>/$defaultconfigs/g"`"
+if test -n "$TORTURE_KCONFIG_GDB_ARG"
+then
+ if test "`echo $configs_derep | wc -w`" -gt 1
+ then
+ echo "The --config list is: $configs_derep."
+ echo "Only one --config permitted with --gdb, terminating."
+ exit 1
+ fi
+fi
+for CF1 in $configs_derep
+do
+ if test -f "$CONFIGFRAG/$CF1"
+ then
+ cpu_count=`configNR_CPUS.sh $CONFIGFRAG/$CF1`
+ cpu_count=`configfrag_boot_cpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
+ cpu_count=`configfrag_boot_maxcpus "$TORTURE_BOOTARGS" "$CONFIGFRAG/$CF1" "$cpu_count"`
+ echo $CF1 $cpu_count >> $T/cfgcpu
+ else
+ echo "The --configs file $CF1 does not exist, terminating."
+ exit 1
+ fi
+done
+sort -k2nr $T/cfgcpu -T="$T" > $T/cfgcpu.sort
+
+# Use a greedy bin-packing algorithm, sorting the list accordingly.
+awk < $T/cfgcpu.sort > $T/cfgcpu.pack -v ncpus=$cpus '
+BEGIN {
+ njobs = 0;
+}
+
+{
+ # Read file of tests and corresponding required numbers of CPUs.
+ cf[njobs] = $1;
+ cpus[njobs] = $2;
+ njobs++;
+}
+
+END {
+ batch = 0;
+ nc = -1;
+
+ # Each pass through the following loop creates one test batch
+ # that can be executed concurrently given ncpus. Note that a
+ # given test that requires more than the available CPUs will run in
+ # its own batch. Such tests just have to make do with what
+ # is available.
+ while (nc != ncpus) {
+ batch++;
+ nc = ncpus;
+
+ # Each pass through the following loop considers one
+ # test for inclusion in the current batch.
+ for (i = 0; i < njobs; i++) {
+ if (done[i])
+ continue; # Already part of a batch.
+ if (nc >= cpus[i] || nc == ncpus) {
+
+ # This test fits into the current batch.
+ done[i] = batch;
+ nc -= cpus[i];
+ if (nc <= 0)
+ break; # Too-big test in its own batch.
+ }
+ }
+ }
+
+ # Dump out the tests in batch order.
+ for (b = 1; b <= batch; b++)
+ for (i = 0; i < njobs; i++)
+ if (done[i] == b)
+ print cf[i], cpus[i];
+}'
+
+# Generate a script to execute the tests in appropriate batches.
+cat << ___EOF___ > $T/script
+CONFIGFRAG="$CONFIGFRAG"; export CONFIGFRAG
+KVM="$KVM"; export KVM
+PATH="$PATH"; export PATH
+TORTURE_ALLOTED_CPUS="$TORTURE_ALLOTED_CPUS"; export TORTURE_ALLOTED_CPUS
+TORTURE_BOOT_IMAGE="$TORTURE_BOOT_IMAGE"; export TORTURE_BOOT_IMAGE
+TORTURE_BUILDONLY="$TORTURE_BUILDONLY"; export TORTURE_BUILDONLY
+TORTURE_DEFCONFIG="$TORTURE_DEFCONFIG"; export TORTURE_DEFCONFIG
+TORTURE_INITRD="$TORTURE_INITRD"; export TORTURE_INITRD
+TORTURE_KCONFIG_ARG="$TORTURE_KCONFIG_ARG"; export TORTURE_KCONFIG_ARG
+TORTURE_KCONFIG_GDB_ARG="$TORTURE_KCONFIG_GDB_ARG"; export TORTURE_KCONFIG_GDB_ARG
+TORTURE_BOOT_GDB_ARG="$TORTURE_BOOT_GDB_ARG"; export TORTURE_BOOT_GDB_ARG
+TORTURE_QEMU_GDB_ARG="$TORTURE_QEMU_GDB_ARG"; export TORTURE_QEMU_GDB_ARG
+TORTURE_KCONFIG_KASAN_ARG="$TORTURE_KCONFIG_KASAN_ARG"; export TORTURE_KCONFIG_KASAN_ARG
+TORTURE_KCONFIG_KCSAN_ARG="$TORTURE_KCONFIG_KCSAN_ARG"; export TORTURE_KCONFIG_KCSAN_ARG
+TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
+TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
+TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
+TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC
+TORTURE_QEMU_MEM="$TORTURE_QEMU_MEM"; export TORTURE_QEMU_MEM
+TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE
+TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE
+TORTURE_TRUST_MAKE="$TORTURE_TRUST_MAKE"; export TORTURE_TRUST_MAKE
+if ! test -e $resdir
+then
+ mkdir -p "$resdir" || :
+fi
+mkdir $resdir/$ds
+TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR
+TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE
+echo Results directory: $resdir/$ds
+echo $scriptname $args
+touch $resdir/$ds/log
+echo $scriptname $args >> $resdir/$ds/log
+echo ${TORTURE_SUITE} > $resdir/$ds/TORTURE_SUITE
+pwd > $resdir/$ds/testid.txt
+if test -d .git
+then
+ git status >> $resdir/$ds/testid.txt
+ git rev-parse HEAD >> $resdir/$ds/testid.txt
+ git diff HEAD >> $resdir/$ds/testid.txt
+fi
+___EOF___
+awk < $T/cfgcpu.pack \
+ -v TORTURE_BUILDONLY="$TORTURE_BUILDONLY" \
+ -v CONFIGDIR="$CONFIGFRAG/" \
+ -v KVM="$KVM" \
+ -v ncpus=$cpus \
+ -v jitter="$jitter" \
+ -v rd=$resdir/$ds/ \
+ -v dur=$dur \
+ -v TORTURE_QEMU_ARG="$TORTURE_QEMU_ARG" \
+ -v TORTURE_BOOTARGS="$TORTURE_BOOTARGS" \
+'BEGIN {
+ i = 0;
+}
+
+{
+ cf[i] = $1;
+ cpus[i] = $2;
+ i++;
+}
+
+# Dump out the scripting required to run one test batch.
+function dump(first, pastlast, batchnum)
+{
+ print "echo ----Start batch " batchnum ": `date` | tee -a " rd "log";
+ print "needqemurun="
+ jn=1
+ for (j = first; j < pastlast; j++) {
+ builddir=KVM "/b" j - first + 1
+ cpusr[jn] = cpus[j];
+ if (cfrep[cf[j]] == "") {
+ cfr[jn] = cf[j];
+ cfrep[cf[j]] = 1;
+ } else {
+ cfrep[cf[j]]++;
+ cfr[jn] = cf[j] "." cfrep[cf[j]];
+ }
+ if (cpusr[jn] > ncpus && ncpus != 0)
+ ovf = "-ovf";
+ else
+ ovf = "";
+ print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date` | tee -a " rd "log";
+ print "rm -f " builddir ".*";
+ print "touch " builddir ".wait";
+ print "mkdir " rd cfr[jn] " || :";
+ print "kvm-test-1-run.sh " CONFIGDIR cf[j], builddir, rd cfr[jn], dur " \"" TORTURE_QEMU_ARG "\" \"" TORTURE_BOOTARGS "\" > " rd cfr[jn] "/kvm-test-1-run.sh.out 2>&1 &"
+ print "echo ", cfr[jn], cpusr[jn] ovf ": Waiting for build to complete. `date` | tee -a " rd "log";
+ print "while test -f " builddir ".wait"
+ print "do"
+ print "\tsleep 1"
+ print "done"
+ print "echo ", cfr[jn], cpusr[jn] ovf ": Build complete. `date` | tee -a " rd "log";
+ jn++;
+ }
+ for (j = 1; j < jn; j++) {
+ builddir=KVM "/b" j
+ print "rm -f " builddir ".ready"
+ print "if test -f \"" rd cfr[j] "/builtkernel\""
+ print "then"
+ print "\techo ----", cfr[j], cpusr[j] ovf ": Kernel present. `date` | tee -a " rd "log";
+ print "\tneedqemurun=1"
+ print "fi"
+ }
+ njitter = 0;
+ split(jitter, ja);
+ if (ja[1] == -1 && ncpus == 0)
+ njitter = 1;
+ else if (ja[1] == -1)
+ njitter = ncpus;
+ else
+ njitter = ja[1];
+ if (TORTURE_BUILDONLY && njitter != 0) {
+ njitter = 0;
+ print "echo Build-only run, so suppressing jitter | tee -a " rd "log"
+ }
+ if (TORTURE_BUILDONLY) {
+ print "needqemurun="
+ }
+ print "if test -n \"$needqemurun\""
+ print "then"
+ print "\techo ---- Starting kernels. `date` | tee -a " rd "log";
+ for (j = 0; j < njitter; j++)
+ print "\tjitter.sh " j " " dur " " ja[2] " " ja[3] "&"
+ print "\twait"
+ print "\techo ---- All kernel runs complete. `date` | tee -a " rd "log";
+ print "else"
+ print "\twait"
+ print "\techo ---- No kernel runs. `date` | tee -a " rd "log";
+ print "fi"
+ for (j = 1; j < jn; j++) {
+ builddir=KVM "/b" j
+ print "echo ----", cfr[j], cpusr[j] ovf ": Build/run results: | tee -a " rd "log";
+ print "cat " rd cfr[j] "/kvm-test-1-run.sh.out | tee -a " rd "log";
+ }
+}
+
+END {
+ njobs = i;
+ nc = ncpus;
+ first = 0;
+ batchnum = 1;
+
+ # Each pass through the following loop considers one test.
+ for (i = 0; i < njobs; i++) {
+ if (ncpus == 0) {
+ # Sequential test specified, each test its own batch.
+ dump(i, i + 1, batchnum);
+ first = i;
+ batchnum++;
+ } else if (nc < cpus[i] && i != 0) {
+ # Out of CPUs, dump out a batch.
+ dump(first, i, batchnum);
+ first = i;
+ nc = ncpus;
+ batchnum++;
+ }
+ # Account for the CPUs needed by the current test.
+ nc -= cpus[i];
+ }
+ # Dump the last batch.
+ if (ncpus != 0)
+ dump(first, i, batchnum);
+}' >> $T/script
+
+cat << ___EOF___ >> $T/script
+echo
+echo
+echo " --- `date` Test summary:"
+echo Results directory: $resdir/$ds
+kcsan-collapse.sh $resdir/$ds
+kvm-recheck.sh $resdir/$ds
+___EOF___
+
+# Dispatch on the --dryrun mode: print the generated script, print only
+# the batch/build schedule extracted from it, or actually run it.
+if test "$dryrun" = script
+then
+	cat $T/script
+	exit 0
+elif test "$dryrun" = sched
+then
+	# Extract the test run schedule from the script.  Lines containing
+	# ">>" are dropped, and each remaining line is trimmed from its
+	# first colon onward and stripped of its leading "echo ".
+	grep -E 'Start batch|Starting build\.' $T/script |
+		grep -v ">>" |
+		sed -e 's/:.*$//' -e 's/^echo //'
+	exit 0
+else
+	# Not a dryrun, so run the script.
+	sh $T/script
+fi
+
+# Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier
+# Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop
+# Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y"
+# Control buffer size: --bootargs trace_buf_size=3k
+# Get trace-buffer dumps on all oopses: --bootargs ftrace_dump_on_oops
+# Ditto, but dump only the oopsing CPU: --bootargs ftrace_dump_on_oops=orig_cpu
+# Heavy-handed way to also dump on warnings: --bootargs panic_on_warn
diff --git a/tools/testing/selftests/rcutorture/bin/mkinitrd.sh b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
new file mode 100755
index 000000000..38e424d23
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/mkinitrd.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Create an initrd directory if one does not already exist.
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Author: Connor Shu <Connor.Shu@ibm.com>
+
+D=tools/testing/selftests/rcutorture
+
+# Prerequisite checks
+[ -z "$D" ] && echo >&2 "No argument supplied" && exit 1
+if [ ! -d "$D" ]; then
+ echo >&2 "$D does not exist: Malformed kernel source tree?"
+ exit 1
+fi
+if [ -s "$D/initrd/init" ]; then
+ echo "$D/initrd/init already exists, no need to create it"
+ exit 0
+fi
+
+# Create a C-language initrd/init infinite-loop program and statically
+# link it. This results in a very small initrd.
+echo "Creating a statically linked C-language initrd"
+cd $D
+mkdir -p initrd
+cd initrd
+cat > init.c << '___EOF___'
+#ifndef NOLIBC
+#include <unistd.h>
+#include <sys/time.h>
+#endif
+
+volatile unsigned long delaycount;
+
+/*
+ * Minimal init program: loop forever, alternating a one-second sleep
+ * with a busy-wait of about a millisecond so that some userspace
+ * execution time is consumed.  Never returns in practice.
+ */
+int main(int argc, char *argv[])
+{
+	int i;
+	struct timeval tv;
+	struct timeval tvb;
+
+	for (;;) {
+		sleep(1);
+		/* Need some userspace time. */
+		if (gettimeofday(&tvb, NULL))
+			continue;
+		do {
+			/* Spin; the volatile store keeps the loop from being optimized away. */
+			for (i = 0; i < 1000 * 100; i++)
+				delaycount = i * i;
+			if (gettimeofday(&tv, NULL))
+				break;
+			/* Elapsed time since tvb; bail out if it exceeds a second. */
+			tv.tv_sec -= tvb.tv_sec;
+			if (tv.tv_sec > 1)
+				break;
+			tv.tv_usec += tv.tv_sec * 1000 * 1000;
+			tv.tv_usec -= tvb.tv_usec;
+		} while (tv.tv_usec < 1000); /* Until 1000 microseconds have passed. */
+	}
+	return 0;
+}
+___EOF___
+
+# build using nolibc on supported archs (smaller executable) and fall
+# back to regular glibc on other ones.
+if echo -e "#if __x86_64__||__i386__||__i486__||__i586__||__i686__" \
+ "||__ARM_EABI__||__aarch64__\nyes\n#endif" \
+ | ${CROSS_COMPILE}gcc -E -nostdlib -xc - \
+ | grep -q '^yes'; then
+ # architecture supported by nolibc
+ ${CROSS_COMPILE}gcc -fno-asynchronous-unwind-tables -fno-ident \
+ -nostdlib -include ../../../../include/nolibc/nolibc.h \
+ -lgcc -s -static -Os -o init init.c
+else
+ ${CROSS_COMPILE}gcc -s -static -Os -o init init.c
+fi
+
+rm init.c
+echo "Done creating a statically linked C-language initrd"
+
+exit 0
diff --git a/tools/testing/selftests/rcutorture/bin/parse-build.sh b/tools/testing/selftests/rcutorture/bin/parse-build.sh
new file mode 100755
index 000000000..09155c15e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/parse-build.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Check the build output from an rcutorture run for goodness.
+# The "file" is a pathname on the local system, and "title" is
+# a text string for error-message purposes.
+#
+# The file must contain kernel build output.
+#
+# Usage: parse-build.sh file title
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+F=$1
+title=$2
+T=${TMPDIR-/tmp}/parse-build.sh.$$
+trap 'rm -rf $T' 0
+mkdir $T
+
+. functions.sh
+
+if grep -q CC < $F || test -n "$TORTURE_TRUST_MAKE"
+then
+ :
+else
+ print_bug $title no build
+ exit 1
+fi
+
+if grep -q "error:" < $F
+then
+ print_bug $title build errors:
+ grep "error:" < $F
+ exit 2
+fi
+
+grep warning: < $F > $T/warnings
+grep "include/linux/*rcu*\.h:" $T/warnings > $T/hwarnings
+grep "kernel/rcu/[^/]*:" $T/warnings > $T/cwarnings
+cat $T/hwarnings $T/cwarnings > $T/rcuwarnings
+if test -s $T/rcuwarnings
+then
+ print_warning $title build errors:
+ cat $T/rcuwarnings
+ exit 2
+fi
+exit 0
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
new file mode 100755
index 000000000..e03338091
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -0,0 +1,174 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Check the console output from an rcutorture run for oopses.
+# The "file" is a pathname on the local system, and "title" is
+# a text string for error-message purposes.
+#
+# Usage: parse-console.sh file title
+#
+# Copyright (C) IBM Corporation, 2011
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+T=${TMPDIR-/tmp}/parse-console.sh.$$
+file="$1"
+title="$2"
+
+trap 'rm -f $T.seq $T.diags' 0
+
+. functions.sh
+
+# Check for presence and readability of console output file
+if test -f "$file" -a -r "$file"
+then
+ :
+else
+ echo $title unreadable console output file: $file
+ exit 1
+fi
+if grep -Pq '\x00' < $file
+then
+ print_warning Console output contains nul bytes, old qemu still running?
+fi
+cat /dev/null > $file.diags
+
+# Check for proper termination, except for rcuscale and refscale.
+if test "$TORTURE_SUITE" != rcuscale && test "$TORTURE_SUITE" != refscale
+then
+ # check for abject failure
+
+ if grep -q FAILURE $file || grep -q -e '-torture.*!!!' $file
+ then
+		# Summarize the last console line containing "!!!".  If each of
+		# its final nine fields is a string of digits, report their sum
+		# as the instance count; otherwise report the line itself,
+		# minus any leading "[...] " prefix.
+		nerrs=`grep --binary-files=text '!!!' $file |
+			tail -1 |
+			awk '
+			{
+				normalexit = 1;
+				for (i=NF-8;i<=NF;i++) {
+					# Test the field contents, not the loop index.
+					if (i <= 0 || $i !~ /^[0-9]*$/) {
+						bangstring = $0;
+						gsub(/^\[[^]]*] /, "", bangstring);
+						print bangstring;
+						normalexit = 0;
+						exit 0;
+					}
+					sum+=$i;
+				}
+			}
+			END {
+				# Reached after exit as well, hence the normalexit guard.
+				if (normalexit)
+					print sum " instances"
+			}'`
+ print_bug $title FAILURE, $nerrs
+ exit
+ fi
+
+ grep --binary-files=text 'torture:.*ver:' $file |
+ egrep --binary-files=text -v '\(null\)|rtc: 000000000* ' |
+ sed -e 's/^(initramfs)[^]]*] //' -e 's/^\[[^]]*] //' |
+ sed -e 's/^.*ver: //' |
+ awk '
+ BEGIN {
+ ver = 0;
+ badseq = 0;
+ }
+
+ {
+ if (!badseq && ($1 + 0 != $1 || $1 <= ver)) {
+ badseqno1 = ver;
+ badseqno2 = $1;
+ badseqnr = NR;
+ badseq = 1;
+ }
+ ver = $1
+ }
+
+ END {
+ if (badseq) {
+ if (badseqno1 == badseqno2 && badseqno2 == ver)
+ print "GP HANG at " ver " torture stat " badseqnr;
+ else
+ print "BAD SEQ " badseqno1 ":" badseqno2 " last:" ver " version " badseqnr;
+ }
+ }' > $T.seq
+
+ if grep -q SUCCESS $file
+ then
+ if test -s $T.seq
+ then
+ print_warning $title `cat $T.seq`
+ echo " " $file
+ exit 2
+ fi
+ else
+ if grep -q "_HOTPLUG:" $file
+ then
+ print_warning HOTPLUG FAILURES $title `cat $T.seq`
+ echo " " $file
+ exit 3
+ fi
+ echo $title no success message, `grep --binary-files=text 'ver:' $file | wc -l` successful version messages
+ if test -s $T.seq
+ then
+ print_warning $title `cat $T.seq`
+ fi
+ exit 2
+ fi
+fi | tee -a $file.diags
+
+console-badness.sh < $file > $T.diags
+if test -s $T.diags
+then
+ print_warning "Assertion failure in $file $title"
+ # cat $T.diags
+ summary=""
+ n_badness=`grep -c Badness $file`
+ if test "$n_badness" -ne 0
+ then
+ summary="$summary Badness: $n_badness"
+ fi
+ n_warn=`grep -v 'Warning: unable to open an initial console' $file | egrep -c 'WARNING:|Warn'`
+ if test "$n_warn" -ne 0
+ then
+ summary="$summary Warnings: $n_warn"
+ fi
+ n_bugs=`egrep -c 'BUG|Oops:' $file`
+ if test "$n_bugs" -ne 0
+ then
+ summary="$summary Bugs: $n_bugs"
+ fi
+ n_calltrace=`grep -c 'Call Trace:' $file`
+ if test "$n_calltrace" -ne 0
+ then
+ summary="$summary Call Traces: $n_calltrace"
+ fi
+	# Count console lines containing the "===========" separator and
+	# report that count, not the Badness count, under the lockdep heading.
+	n_lockdep=`grep -c =========== $file`
+	if test "$n_lockdep" -ne 0
+	then
+		summary="$summary lockdep: $n_lockdep"
+	fi
+ n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state' $file`
+ if test "$n_stalls" -ne 0
+ then
+ summary="$summary Stalls: $n_stalls"
+ fi
+ n_starves=`grep -c 'rcu_.*kthread starved for' $file`
+ if test "$n_starves" -ne 0
+ then
+ summary="$summary Starves: $n_starves"
+ fi
+ print_warning Summary: $summary
+ cat $T.diags >> $file.diags
+fi
+for i in $file.*.diags
+do
+ if test -f "$i"
+ then
+ cat $i >> $file.diags
+ fi
+done
+if ! test -s $file.diags
+then
+ rm -f $file.diags
+fi
diff --git a/tools/testing/selftests/rcutorture/configs/lock/BUSTED b/tools/testing/selftests/rcutorture/configs/lock/BUSTED
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/BUSTED
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/lock/BUSTED.boot
new file mode 100644
index 000000000..6386c15e9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/BUSTED.boot
@@ -0,0 +1 @@
+locktorture.torture_type=lock_busted
diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFLIST b/tools/testing/selftests/rcutorture/configs/lock/CFLIST
new file mode 100644
index 000000000..41bae5824
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/CFLIST
@@ -0,0 +1,7 @@
+LOCK01
+LOCK02
+LOCK03
+LOCK04
+LOCK05
+LOCK06
+LOCK07
diff --git a/tools/testing/selftests/rcutorture/configs/lock/CFcommon b/tools/testing/selftests/rcutorture/configs/lock/CFcommon
new file mode 100644
index 000000000..e372dc269
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_LOCK_TORTURE_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK01 b/tools/testing/selftests/rcutorture/configs/lock/LOCK01
new file mode 100644
index 000000000..a9625e3d6
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK01
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK02 b/tools/testing/selftests/rcutorture/configs/lock/LOCK02
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK02
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK02.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK02.boot
new file mode 100644
index 000000000..5aa44b4f1
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK02.boot
@@ -0,0 +1 @@
+locktorture.torture_type=mutex_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK03 b/tools/testing/selftests/rcutorture/configs/lock/LOCK03
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK03
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK03.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK03.boot
new file mode 100644
index 000000000..a67bbe024
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK03.boot
@@ -0,0 +1 @@
+locktorture.torture_type=rwsem_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK04 b/tools/testing/selftests/rcutorture/configs/lock/LOCK04
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK04
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot
new file mode 100644
index 000000000..48c04fe47
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK04.boot
@@ -0,0 +1 @@
+locktorture.torture_type=rw_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05 b/tools/testing/selftests/rcutorture/configs/lock/LOCK05
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot
new file mode 100644
index 000000000..8ac37307c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK05.boot
@@ -0,0 +1 @@
+locktorture.torture_type=rtmutex_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06 b/tools/testing/selftests/rcutorture/configs/lock/LOCK06
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot
new file mode 100644
index 000000000..f92219cd4
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK06.boot
@@ -0,0 +1 @@
+locktorture.torture_type=percpu_rwsem_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07 b/tools/testing/selftests/rcutorture/configs/lock/LOCK07
new file mode 100644
index 000000000..1d1da1477
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07
@@ -0,0 +1,6 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot
new file mode 100644
index 000000000..97dadd1a9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/LOCK07.boot
@@ -0,0 +1 @@
+locktorture.torture_type=ww_mutex_lock
diff --git a/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh
new file mode 100644
index 000000000..d3e4b2971
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Kernel-version-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2014
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# locktorture_param_onoff bootparam-string config-file
+#
+# Emits the locktorture onoff module parameters on stdout, plus a note
+# on stderr, when the boot parameters in $1 fail the
+# bootparam_hotplug_cpu check and the config file $2 passes the
+# configfrag_hotplug_cpu check.  Emits nothing otherwise.
+locktorture_param_onoff () {
+	if bootparam_hotplug_cpu "$1"
+	then
+		return 0
+	fi
+	if ! configfrag_hotplug_cpu "$2"
+	then
+		return 0
+	fi
+	echo CPU-hotplug kernel, adding locktorture onoff. 1>&2
+	echo locktorture.onoff_interval=3 locktorture.onoff_holdoff=30
+}
+
+# per_version_boot_params bootparam-string config-file seconds
+#
+# Adds per-version torture-module parameters to kernels supporting them.
+per_version_boot_params () {
+ echo $1 `locktorture_param_onoff "$1" "$2"` \
+ locktorture.stat_interval=15 \
+ locktorture.shutdown_secs=$3 \
+ locktorture.verbose=1
+}
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED
new file mode 100644
index 000000000..48d8a245c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED
@@ -0,0 +1,7 @@
+CONFIG_RCU_TRACE=n
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot
new file mode 100644
index 000000000..be7728db4
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=busted
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFLIST b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
new file mode 100644
index 000000000..f2b20db9e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFLIST
@@ -0,0 +1,19 @@
+TREE01
+TREE02
+TREE03
+TREE04
+TREE05
+TREE07
+TREE09
+SRCU-N
+SRCU-P
+SRCU-t
+SRCU-u
+TINY01
+TINY02
+TASKS01
+TASKS02
+TASKS03
+RUDE01
+TRACE01
+TRACE02
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
new file mode 100644
index 000000000..0e92d8531
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -0,0 +1,7 @@
+CONFIG_RCU_TORTURE_TEST=y
+CONFIG_PRINTK_TIME=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC=n
+CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01 b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01
new file mode 100644
index 000000000..bafe94cbd
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01
@@ -0,0 +1,10 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
new file mode 100644
index 000000000..9363708c9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/RUDE01.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-rude
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
new file mode 100644
index 000000000..2da8b4958
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N
@@ -0,0 +1,8 @@
+CONFIG_RCU_TRACE=n
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot
new file mode 100644
index 000000000..238bfe3bd
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=srcu
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
new file mode 100644
index 000000000..ab7ccd382
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P
@@ -0,0 +1,12 @@
+CONFIG_RCU_TRACE=n
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
new file mode 100644
index 000000000..ce48c7b82
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot
@@ -0,0 +1,2 @@
+rcutorture.torture_type=srcud
+rcupdate.rcu_self_test=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
new file mode 100644
index 000000000..6c78022c8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t
@@ -0,0 +1,10 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_SRCU=y
+CONFIG_RCU_TRACE=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+#CHECK#CONFIG_PREEMPT_COUNT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot
new file mode 100644
index 000000000..238bfe3bd
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-t.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=srcu
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
new file mode 100644
index 000000000..c15ada821
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u
@@ -0,0 +1,10 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_SRCU=y
+CONFIG_RCU_TRACE=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot
new file mode 100644
index 000000000..ce48c7b82
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/SRCU-u.boot
@@ -0,0 +1,2 @@
+rcutorture.torture_type=srcud
+rcupdate.rcu_self_test=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
new file mode 100644
index 000000000..bafe94cbd
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01
@@ -0,0 +1,10 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
new file mode 100644
index 000000000..cd2a188ee
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS01.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
new file mode 100644
index 000000000..ad2be91e5
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02
@@ -0,0 +1,4 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot
new file mode 100644
index 000000000..cd2a188ee
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS02.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03 b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
new file mode 100644
index 000000000..ea4399020
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03
@@ -0,0 +1,9 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=y
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot
new file mode 100644
index 000000000..838297c58
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TASKS03.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks nohz_full=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY01 b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
new file mode 100644
index 000000000..6db705e55
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY01
@@ -0,0 +1,13 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_TRACE=n
+#CHECK#CONFIG_RCU_STALL_COMMON=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_PREEMPT_COUNT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02 b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
new file mode 100644
index 000000000..d86742643
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02
@@ -0,0 +1,14 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_RCU=y
+CONFIG_HZ_PERIODIC=y
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=n
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
new file mode 100644
index 000000000..b39f1553a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TINY02.boot
@@ -0,0 +1 @@
+rcupdate.rcu_self_test=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
new file mode 100644
index 000000000..12e7661b8
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01
@@ -0,0 +1,11 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_TASKS_TRACE_RCU_READ_MB=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
new file mode 100644
index 000000000..9675ad632
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE01.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-tracing
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02 b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
new file mode 100644
index 000000000..b69ed6673
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02
@@ -0,0 +1,11 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=4
+CONFIG_HOTPLUG_CPU=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+#CHECK#CONFIG_PROVE_RCU=n
+CONFIG_TASKS_TRACE_RCU_READ_MB=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
new file mode 100644
index 000000000..9675ad632
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRACE02.boot
@@ -0,0 +1 @@
+rcutorture.torture_type=tasks-tracing
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01 b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
new file mode 100644
index 000000000..b5b53973c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=y
+CONFIG_RCU_TRACE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_MAXSMP=y
+CONFIG_CPUMASK_OFFSTACK=y
+CONFIG_RCU_NOCB_CPU=y
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
new file mode 100644
index 000000000..d6da9a61d
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot
@@ -0,0 +1,6 @@
+maxcpus=8 nr_cpus=43
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcu_nocbs=0
+rcutorture.fwd_progress=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE02 b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
new file mode 100644
index 000000000..65daee4fb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE02
@@ -0,0 +1,20 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_RCU_FANOUT=3
+CONFIG_RCU_FANOUT_LEAF=3
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_RCU_EXPERT=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03 b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
new file mode 100644
index 000000000..2dc31b16e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=16
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=y
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_TRACE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_RCU_BOOST=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
new file mode 100644
index 000000000..1c218944b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE03.boot
@@ -0,0 +1,6 @@
+rcutorture.onoff_interval=200 rcutorture.onoff_holdoff=30
+rcutree.gp_preinit_delay=12
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcutree.kthread_prio=2
+threadirqs
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04 b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
new file mode 100644
index 000000000..f6d6a40c0
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04
@@ -0,0 +1,17 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TREE_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=y
+CONFIG_RCU_FAST_NO_HZ=y
+CONFIG_RCU_TRACE=y
+CONFIG_RCU_FANOUT=4
+CONFIG_RCU_FANOUT_LEAF=3
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_EQS_DEBUG=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
new file mode 100644
index 000000000..5adc67567
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot
@@ -0,0 +1 @@
+rcutree.rcu_fanout_leaf=4 nohz_full=1-7
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05 b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
new file mode 100644
index 000000000..4f95f8544
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05
@@ -0,0 +1,21 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TREE_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_FANOUT=6
+CONFIG_RCU_FANOUT_LEAF=6
+CONFIG_RCU_NOCB_CPU=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_PROVE_RCU_LIST=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
new file mode 100644
index 000000000..c419cac23
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot
@@ -0,0 +1,4 @@
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
+rcupdate.rcu_self_test=1
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06 b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
new file mode 100644
index 000000000..bf4980d60
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06
@@ -0,0 +1,20 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TREE_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_RCU_FANOUT=6
+CONFIG_RCU_FANOUT_LEAF=6
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+#CHECK#CONFIG_PROVE_RCU=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
new file mode 100644
index 000000000..055f4aa79
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE06.boot
@@ -0,0 +1,5 @@
+rcupdate.rcu_self_test=1
+rcutree.rcu_fanout_exact=1
+rcutree.gp_preinit_delay=3
+rcutree.gp_init_delay=3
+rcutree.gp_cleanup_delay=3
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07 b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
new file mode 100644
index 000000000..d7afb271a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07
@@ -0,0 +1,17 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=16
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TREE_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=y
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_RCU_FANOUT=2
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot
new file mode 100644
index 000000000..d44609937
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE07.boot
@@ -0,0 +1 @@
+nohz_full=2-9
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08 b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
new file mode 100644
index 000000000..c810c5276
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08
@@ -0,0 +1,20 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_RCU_FANOUT=3
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=y
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_EQS_DEBUG=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
new file mode 100644
index 000000000..22478fd3a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot
@@ -0,0 +1,3 @@
+rcupdate.rcu_self_test=1
+rcutree.rcu_fanout_exact=1
+rcu_nocbs=0-7
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
new file mode 100644
index 000000000..8523a7515
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09
@@ -0,0 +1,15 @@
+CONFIG_SMP=n
+CONFIG_NR_CPUS=1
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_TRACE=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+#CHECK#CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE10 b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
new file mode 100644
index 000000000..7311f84a5
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE10
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=56
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TREE_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_TRACE=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+#CHECK#CONFIG_PROVE_RCU=n
+CONFIG_DEBUG_OBJECTS=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=n
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL
new file mode 100644
index 000000000..5d546efa6
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL
@@ -0,0 +1,11 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=8
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot
new file mode 100644
index 000000000..7017f5f5a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/TRIVIAL.boot
@@ -0,0 +1,3 @@
+rcutorture.torture_type=trivial
+rcutorture.onoff_interval=0
+rcutorture.shuffle_interval=0
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh
new file mode 100644
index 000000000..effa415f9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Kernel-version-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2013
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# rcutorture_param_n_barrier_cbs bootparam-string
+# Prints rcutorture.n_barrier_cbs=4 on stdout unless $1 already contains the text
+# "rcutorture.n_barrier_cbs", in which case nothing is printed.
+rcutorture_param_n_barrier_cbs () {
+ if echo $1 | grep -q "rcutorture\.n_barrier_cbs"
+ then
+ :
+ else
+ echo rcutorture.n_barrier_cbs=4
+ fi
+}
+
+# rcutorture_param_onoff bootparam-string config-file
+# Prints rcutorture.onoff_interval=1000 and rcutorture.onoff_holdoff=30 on stdout (plus a note on
+# stderr) only when bootparam_hotplug_cpu fails for $1 and configfrag_hotplug_cpu succeeds for $2.
+rcutorture_param_onoff () {
+ if ! bootparam_hotplug_cpu "$1" && configfrag_hotplug_cpu "$2"
+ then
+ echo CPU-hotplug kernel, adding rcutorture onoff. 1>&2
+ echo rcutorture.onoff_interval=1000 rcutorture.onoff_holdoff=30
+ fi
+}
+
+# per_version_boot_params bootparam-string config-file seconds
+# Prints one line on stdout: $1 (unquoted, so word-split), any rcutorture_param_onoff and
+# rcutorture_param_n_barrier_cbs output, then the fixed rcutorture.* settings (shutdown_secs=$3).
+per_version_boot_params () {
+ echo $1 `rcutorture_param_onoff "$1" "$2"` \
+ `rcutorture_param_n_barrier_cbs "$1"` \
+ rcutorture.stat_interval=15 \
+ rcutorture.shutdown_secs=$3 \
+ rcutorture.test_no_idle_hz=1 \
+ rcutorture.verbose=1
+}
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST
new file mode 100644
index 000000000..c9f56cf20
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST
@@ -0,0 +1 @@
+TREE
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon
new file mode 100644
index 000000000..87caa0e93
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_RCU_SCALE_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
new file mode 100644
index 000000000..fb05ef527
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY
@@ -0,0 +1,16 @@
+CONFIG_SMP=n
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_TINY_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
new file mode 100644
index 000000000..721cfda76
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE
@@ -0,0 +1,19 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
new file mode 100644
index 000000000..7629f5dd7
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54
@@ -0,0 +1,22 @@
+CONFIG_SMP=y
+CONFIG_NR_CPUS=54
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_FANOUT=3
+CONFIG_RCU_FANOUT_LEAF=2
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
+CONFIG_RCU_TRACE=y
diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh
new file mode 100644
index 000000000..0333e9b18
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2015
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# per_version_boot_params bootparam-string config-file seconds
+# Prints one line on stdout: $1 (unquoted, so word-split) followed by rcuscale.shutdown=1 and
+# rcuscale.verbose=1.  Only $1 is referenced here; config-file and seconds are not used.
+per_version_boot_params () {
+ echo $1 rcuscale.shutdown=1 \
+ rcuscale.verbose=1
+}
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFLIST b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST
new file mode 100644
index 000000000..4d62eb4a3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST
@@ -0,0 +1,2 @@
+NOPREEMPT
+PREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon
new file mode 100644
index 000000000..a98b58b54
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_RCU_REF_SCALE_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
new file mode 100644
index 000000000..1cd25b731
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_PREEMPT_RCU=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
new file mode 100644
index 000000000..d10bc694f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
new file mode 100644
index 000000000..321e82641
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2015
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# per_version_boot_params bootparam-string config-file seconds
+# Prints one line on stdout: $1 (unquoted, so word-split) followed by refscale.shutdown=1 and
+# refscale.verbose=1.  Only $1 is referenced here; config-file and seconds are not used.
+per_version_boot_params () {
+ echo $1 refscale.shutdown=1 \
+ refscale.verbose=1
+}
diff --git a/tools/testing/selftests/rcutorture/configs/scf/CFLIST b/tools/testing/selftests/rcutorture/configs/scf/CFLIST
new file mode 100644
index 000000000..4d62eb4a3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/CFLIST
@@ -0,0 +1,2 @@
+NOPREEMPT
+PREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/scf/CFcommon b/tools/testing/selftests/rcutorture/configs/scf/CFcommon
new file mode 100644
index 000000000..c11ab91f4
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_SCF_TORTURE_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT
new file mode 100644
index 000000000..b8429d6c6
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT
@@ -0,0 +1,9 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=n
+CONFIG_NO_HZ_FULL=y
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
diff --git a/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot
new file mode 100644
index 000000000..d6a7fa097
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/NOPREEMPT.boot
@@ -0,0 +1 @@
+nohz_full=1
diff --git a/tools/testing/selftests/rcutorture/configs/scf/PREEMPT b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT
new file mode 100644
index 000000000..ae4992b14
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/PREEMPT
@@ -0,0 +1,9 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
diff --git a/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh
new file mode 100644
index 000000000..d3d9e35d3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/scf/ver_functions.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) Facebook, 2020
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+# scftorture_param_onoff bootparam-string config-file
+# Prints scftorture.onoff_interval=1000 and scftorture.onoff_holdoff=30 on stdout (plus a note on
+# stderr) only when bootparam_hotplug_cpu fails for $1 and configfrag_hotplug_cpu succeeds for $2.
+scftorture_param_onoff () {
+ if ! bootparam_hotplug_cpu "$1" && configfrag_hotplug_cpu "$2"
+ then
+ echo CPU-hotplug kernel, adding scftorture onoff. 1>&2
+ echo scftorture.onoff_interval=1000 scftorture.onoff_holdoff=30
+ fi
+}
+
+# per_version_boot_params bootparam-string config-file seconds
+# Prints one line on stdout: $1, any scftorture_param_onoff output, the fixed scftorture.* settings
+# (shutdown_secs=$3), then a bare "scf" word.  NOTE(review): that last word looks stray -- confirm.
+per_version_boot_params () {
+ echo $1 `scftorture_param_onoff "$1" "$2"` \
+ scftorture.stat_interval=15 \
+ scftorture.shutdown_secs=$3 \
+ scftorture.verbose=1 \
+ scf
+}
diff --git a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
new file mode 100644
index 000000000..a75b16991
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -0,0 +1,38 @@
+This document gives a brief rationale for the TINY_RCU test cases.
+
+
+Kconfig Parameters:
+
+CONFIG_DEBUG_LOCK_ALLOC -- Do all three and none of the three.
+CONFIG_PREEMPT_COUNT
+CONFIG_RCU_TRACE
+
+The theory here is that randconfig testing will hit the other six possible
+combinations of these parameters.
+
+
+Kconfig Parameters Ignored:
+
+CONFIG_DEBUG_OBJECTS_RCU_HEAD
+CONFIG_PROVE_RCU
+
+ In common code tested by TREE_RCU test cases.
+
+CONFIG_RCU_NOCB_CPU
+
+ Meaningless for TINY_RCU.
+
+CONFIG_RCU_STALL_COMMON
+CONFIG_RCU_TORTURE_TEST
+
+ Redundant with CONFIG_RCU_TRACE.
+
+CONFIG_HOTPLUG_CPU
+CONFIG_PREEMPT
+CONFIG_PREEMPT_RCU
+CONFIG_SMP
+CONFIG_TINY_RCU
+CONFIG_PREEMPT_RCU
+CONFIG_TREE_RCU
+
+ All forced by CONFIG_TINY_RCU.
diff --git a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
new file mode 100644
index 000000000..1b96d6847
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -0,0 +1,80 @@
+This document gives a brief rationale for the TREE_RCU-related test
+cases, a group that includes PREEMPT_RCU.
+
+
+Kconfig Parameters:
+
+CONFIG_DEBUG_LOCK_ALLOC -- Do three, covering CONFIG_PROVE_LOCKING & not.
+CONFIG_DEBUG_OBJECTS_RCU_HEAD -- Do one.
+CONFIG_HZ_PERIODIC -- Do one.
+CONFIG_NO_HZ_IDLE -- Do those not otherwise specified. (Groups of two.)
+CONFIG_NO_HZ_FULL -- Do two, one with partial CPU enablement.
+CONFIG_PREEMPT -- Do half. (First three and #8.)
+CONFIG_PROVE_LOCKING -- Do several, covering CONFIG_DEBUG_LOCK_ALLOC=y and not.
+CONFIG_PROVE_RCU -- Hardwired to CONFIG_PROVE_LOCKING.
+CONFIG_RCU_BOOST -- one of PREEMPT_RCU.
+CONFIG_RCU_FANOUT -- Cover hierarchy, but overlap with others.
+CONFIG_RCU_FANOUT_LEAF -- Do one non-default.
+CONFIG_RCU_FAST_NO_HZ -- Do one, but not with all nohz_full CPUs.
+CONFIG_RCU_NOCB_CPU -- Do three, one with no rcu_nocbs CPUs, one with
+ rcu_nocbs=0, and one with all rcu_nocbs CPUs.
+CONFIG_RCU_TRACE -- Do half.
+CONFIG_SMP -- Need one !SMP for PREEMPT_RCU.
+CONFIG_RCU_EXPERT=n -- Do a few, but these have to be vanilla configurations.
+CONFIG_RCU_EQS_DEBUG -- Do at least one for CONFIG_NO_HZ_FULL and not.
+
+RCU-bh: Do one with PREEMPT and one with !PREEMPT.
+RCU-sched: Do one with PREEMPT but not BOOST.
+
+
+Boot parameters:
+
+nohz_full -- Do at least one.
+maxcpus -- Do at least one.
+rcupdate.rcu_self_test_bh -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test_sched -- Do at least one each, offloaded and not.
+rcupdate.rcu_self_test -- Do at least one each, offloaded and not.
+rcutree.rcu_fanout_exact -- Do at least one.
+
+
+Kconfig Parameters Ignored:
+
+CONFIG_64BIT
+
+ Used only to check CONFIG_RCU_FANOUT value, inspection suffices.
+
+CONFIG_PREEMPT_COUNT
+CONFIG_PREEMPT_RCU
+
+ Redundant with CONFIG_PREEMPT, ignore.
+
+CONFIG_RCU_BOOST_DELAY
+
+ Inspection suffices, ignore.
+
+CONFIG_RCU_CPU_STALL_TIMEOUT
+
+ Inspection suffices, ignore.
+
+CONFIG_RCU_STALL_COMMON
+
+ Implied by TREE_RCU and PREEMPT_RCU.
+
+CONFIG_RCU_TORTURE_TEST
+CONFIG_RCU_TORTURE_TEST_RUNNABLE
+
+ Always used in KVM testing.
+
+CONFIG_PREEMPT_RCU
+CONFIG_TREE_RCU
+CONFIG_TINY_RCU
+CONFIG_TASKS_RCU
+
+ These are controlled by CONFIG_PREEMPT and/or CONFIG_SMP.
+
+CONFIG_SRCU
+
+ Selected by CONFIG_RCU_TORTURE_TEST, so cannot disable.
+
+
+boot parameters ignored: TBD
diff --git a/tools/testing/selftests/rcutorture/doc/initrd.txt b/tools/testing/selftests/rcutorture/doc/initrd.txt
new file mode 100644
index 000000000..41a425586
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/doc/initrd.txt
@@ -0,0 +1,16 @@
+The rcutorture scripting tools automatically create an initrd containing
+a single statically linked binary named "init" that loops over a
+very long sleep() call. This creation is done by
+tools/testing/selftests/rcutorture/bin/mkinitrd.sh.
+
+However, if you don't like the notion of statically linked bare-bones
+userspace environments, you might wish to press an existing initrd
+into service:
+
+------------------------------------------------------------------------
+cd tools/testing/selftests/rcutorture
+zcat /initrd.img > /tmp/initrd.img.zcat
+mkdir initrd
+cd initrd
+cpio -id < /tmp/initrd.img.zcat
+# Manually verify that initrd contains needed binaries and libraries.
diff --git a/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt
new file mode 100644
index 000000000..b2fc24797
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/doc/rcu-test-image.txt
@@ -0,0 +1,67 @@
+Normally, a minimal initrd is created automatically by the rcutorture
+scripting. But minimal really does mean "minimal", namely just a single
+root directory with a single statically linked executable named "init":
+
+$ size tools/testing/selftests/rcutorture/initrd/init
+ text data bss dec hex filename
+ 328 0 8 336 150 tools/testing/selftests/rcutorture/initrd/init
+
+Suppose you need to run some scripts, perhaps to monitor or control
+some aspect of the rcutorture testing. This will require a more fully
+filled-out userspace, perhaps containing libraries, executables for
+the shell and other utilities, and so forth. In that case, place your
+desired filesystem here:
+
+ tools/testing/selftests/rcutorture/initrd
+
+For example, your tools/testing/selftests/rcutorture/initrd/init might
+be a script that does any needed mount operations and starts whatever
+scripts need starting to properly monitor or control your testing.
+The next rcutorture build will then incorporate this filesystem into
+the kernel image that is passed to qemu.
+
+Or maybe you need a real root filesystem for some reason, in which case
+please read on!
+
+The remainder of this document describes one way to create the
+rcu-test-image file that contains the filesystem used by the guest-OS
+kernel. There are probably much better ways of doing this, and this
+filesystem could no doubt be smaller. It is probably also possible to
+simply download an appropriate image from any number of places.
+
+That said, here are the commands:
+
+------------------------------------------------------------------------
+dd if=/dev/zero of=rcu-test-image bs=400M count=1
+mkfs.ext3 ./rcu-test-image
+sudo mount -o loop ./rcu-test-image /mnt
+
+# Replace "precise" below with your favorite Ubuntu release.
+# Empirical evidence says this image will work for 64-bit, but...
+# Note that debootstrap does take a few minutes to run. Or longer.
+sudo debootstrap --verbose --arch i386 precise /mnt http://archive.ubuntu.com/ubuntu
+cat << '___EOF___' | sudo dd of=/mnt/etc/fstab
+# UNCONFIGURED FSTAB FOR BASE SYSTEM
+#
+/dev/vda / ext3 defaults 1 1
+dev /dev tmpfs rw 0 0
+tmpfs /dev/shm tmpfs defaults 0 0
+devpts /dev/pts devpts gid=5,mode=620 0 0
+sysfs /sys sysfs defaults 0 0
+proc /proc proc defaults 0 0
+___EOF___
+sudo umount /mnt
+------------------------------------------------------------------------
+
+
+References:
+
+ http://sripathikodi.blogspot.com/2010/02/creating-kvm-bootable-fedora-system.html
+ https://help.ubuntu.com/community/KVM/CreateGuests
+ https://help.ubuntu.com/community/JeOSVMBuilder
+ http://wiki.libvirt.org/page/UbuntuKVMWalkthrough
+ http://www.moe.co.uk/2011/01/07/pci_add_option_rom-failed-to-find-romfile-pxe-rtl8139-bin/ -- "apt-get install kvm-pxe"
+ https://www.landley.net/writing/rootfs-howto.html
+ https://en.wikipedia.org/wiki/Initrd
+ https://en.wikipedia.org/wiki/Cpio
+ http://wiki.libvirt.org/page/UbuntuKVMWalkthrough
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore
new file mode 100644
index 000000000..24e27957e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+srcu.c
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile
new file mode 100644
index 000000000..4bed0b678
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Generates the modified SRCU sources with modify_srcu.awk and then builds
+# the store_buffering test.
+
+# Neither of these targets names a file in this directory.
+.PHONY: all store_buffering
+
+all: srcu.c store_buffering
+
+# Top of the kernel source tree, relative to this directory.
+LINUX_SOURCE = ../../../../../..
+
+modified_srcu_input = $(LINUX_SOURCE)/include/linux/srcu.h \
+		      $(LINUX_SOURCE)/kernel/rcu/srcu.c
+
+modified_srcu_output = include/linux/srcu.h srcu.c
+
+# Both outputs are written by the single awk invocation below.
+include/linux/srcu.h: srcu.c
+
+srcu.c: modify_srcu.awk Makefile $(modified_srcu_input)
+	awk -f modify_srcu.awk $(modified_srcu_input) $(modified_srcu_output)
+
+# Use $(MAKE) -C so that make's flags reach the sub-make and so that a
+# missing directory is an error rather than a make run in this directory.
+store_buffering:
+	@$(MAKE) -C tests/store_buffering
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/delay.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/export.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/mutex.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/percpu.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/preempt.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/rcupdate.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/sched.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/smp.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/linux/workqueue.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/empty_includes/uapi/linux/types.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore
new file mode 100644
index 000000000..57d296341
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+srcu.h
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h
new file mode 100644
index 000000000..f2860dd1b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/kconfig.h
@@ -0,0 +1 @@
+#include <LINUX_SOURCE/linux/kconfig.h>
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
new file mode 100644
index 000000000..8bc960e5e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/include/linux/types.h
@@ -0,0 +1,152 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * This header has been modified to remove definitions of types that
+ * are defined in standard userspace headers or are problematic for some
+ * other reason.
+ */
+
+#ifndef _LINUX_TYPES_H
+#define _LINUX_TYPES_H
+
+#define __EXPORTED_HEADERS__
+#include <uapi/linux/types.h>
+
+#ifndef __ASSEMBLY__
+
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+typedef __u32 __kernel_dev_t;
+
+/* bsd */
+typedef unsigned char u_char;
+typedef unsigned short u_short;
+typedef unsigned int u_int;
+typedef unsigned long u_long;
+
+/* sysv */
+typedef unsigned char unchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+
+#ifndef __BIT_TYPES_DEFINED__
+#define __BIT_TYPES_DEFINED__
+
+typedef __u8 u_int8_t;
+typedef __s8 int8_t;
+typedef __u16 u_int16_t;
+typedef __s16 int16_t;
+typedef __u32 u_int32_t;
+typedef __s32 int32_t;
+
+#endif /* !(__BIT_TYPES_DEFINED__) */
+
+typedef __u8 uint8_t;
+typedef __u16 uint16_t;
+typedef __u32 uint32_t;
+
+/* this is a special 64bit data type that is 8-byte aligned */
+#define aligned_u64 __u64 __attribute__((aligned(8)))
+#define aligned_be64 __be64 __attribute__((aligned(8)))
+#define aligned_le64 __le64 __attribute__((aligned(8)))
+
+/**
+ * The type used for indexing onto a disc or disc partition.
+ *
+ * Linux always considers sectors to be 512 bytes long independently
+ * of the devices real block size.
+ *
+ * blkcnt_t is the type of the inode's block count.
+ */
+typedef u64 sector_t;
+
+/*
+ * The type of an index into the pagecache.
+ */
+#define pgoff_t unsigned long
+
+/*
+ * A dma_addr_t can hold any valid DMA address, i.e., any address returned
+ * by the DMA API.
+ *
+ * If the DMA API only uses 32-bit addresses, dma_addr_t need only be 32
+ * bits wide. Bus addresses, e.g., PCI BARs, may be wider than 32 bits,
+ * but drivers do memory-mapped I/O to ioremapped kernel virtual addresses,
+ * so they don't care about the size of the actual bus addresses.
+ */
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
+typedef u64 dma_addr_t;
+#else
+typedef u32 dma_addr_t;
+#endif
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+typedef u64 phys_addr_t;
+#else
+typedef u32 phys_addr_t;
+#endif
+
+typedef phys_addr_t resource_size_t;
+
+/*
+ * This type is the placeholder for a hardware interrupt number. It has to be
+ * big enough to enclose whatever representation is used by a given platform.
+ */
+typedef unsigned long irq_hw_number_t;
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+#ifdef CONFIG_64BIT
+typedef struct {
+ long counter;
+} atomic64_t;
+#endif
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+/**
+ * struct callback_head - callback structure for use with RCU and task_work
+ * @next: next update requests in a list
+ * @func: actual update function to call after the grace period.
+ *
+ * The struct is aligned to size of pointer. On most architectures it happens
+ * naturally due to ABI requirements, but some architectures (like CRIS) have
+ * weird ABI and we need to ask it explicitly.
+ *
+ * The alignment is required to guarantee that bits 0 and 1 of @next will be
+ * clear under normal conditions -- as long as we use call_rcu() or
+ * call_srcu() to queue callback.
+ *
+ * This guarantee is important for a few reasons:
+ * - future call_rcu_lazy() will make use of lower bits in the pointer;
+ * - the structure shares storage space in struct page with @compound_head,
+ * which encodes PageTail() in bit 0. The guarantee is needed to avoid
+ * false-positive PageTail().
+ */
+struct callback_head {
+ struct callback_head *next;
+ void (*func)(struct callback_head *head);
+} __attribute__((aligned(sizeof(void *))));
+#define rcu_head callback_head
+
+typedef void (*rcu_callback_t)(struct rcu_head *head);
+typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func);
+
+/* clocksource cycle base type */
+typedef u64 cycle_t;
+
+#endif /* __ASSEMBLY__ */
+#endif /* _LINUX_TYPES_H */
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
new file mode 100755
index 000000000..e05182d3e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/modify_srcu.awk
@@ -0,0 +1,376 @@
+#!/usr/bin/awk -f
+# SPDX-License-Identifier: GPL-2.0
+
+# Modify SRCU for formal verification. Takes four arguments: the input srcu.h,
+# the input srcu.c, and the paths to which the modified srcu.h and srcu.c are
+# written, in that order.
+
+BEGIN {
+	if (ARGC != 5) {
+		print "Usage: input.h input.c output.h output.c" > "/dev/stderr";
+		exit 1;
+	}
+	# The last two arguments name the outputs; lowering ARGC keeps awk
+	# from treating them as input files.
+	h_output = ARGV[3];
+	c_output = ARGV[4];
+	ARGC = 3;
+
+	# Tokenize using FS and not RS as FS supports regular expressions. Each
+	# record is one line of source, except that backslashed lines are
+	# combined. Comments are treated as field separators, as are quotes.
+	quote_regexp="\"([^\\\\\"]|\\\\.)*\"";
+	comment_regexp="\\/\\*([^*]|\\*+[^*/])*\\*\\/|\\/\\/.*(\n|$)";
+	FS="([ \\\\\t\n\v\f;,.=(){}+*/<>&|^-]|\\[|\\]|" comment_regexp "|" quote_regexp ")+";
+
+	inside_srcu_struct = 0;
+	inside_srcu_init_def = 0;
+	srcu_init_param_name = "";
+	in_macro = 0;
+	brace_nesting = 0;
+	paren_nesting = 0;
+
+	# Allow the manipulation of the last field separator after it has been
+	# seen.
+	last_fs = "";
+	# Whether the last field separator was intended to be output.
+	last_fs_print = 0;
+
+	# rcu_batches stores the initialization for each instance of struct
+	# rcu_batch
+
+	in_comment = 0;
+
+	outputfile = "";
+}
+
+# Choose the output file for this record from the input file's extension:
+# records from a *.h input go to h_output, all others go to c_output.
+{
+ prev_outputfile = outputfile;
+ if (FILENAME ~ /\.h$/) {
+ outputfile = h_output;
+ # FNR and NR differ once a second input file is being read, so this
+ # rejects a header that is not the first input.
+ if (FNR != NR) {
+ print "Incorrect file order" > "/dev/stderr";
+ exit 1;
+ }
+ }
+ else
+ outputfile = c_output;
+
+ # On a change of output file, temporarily switch back so that the
+ # separator still held in last_fs is flushed to the previous file.
+ if (prev_outputfile && outputfile != prev_outputfile) {
+ new_outputfile = outputfile;
+ outputfile = prev_outputfile;
+ update_fieldsep("", 0);
+ outputfile = new_outputfile;
+ }
+}
+
+# Combine the next line into $0, joined to the current record by a newline.
+#
+# Hitting end of input once is tolerated and recorded in eof_found; hitting
+# it a second time, or a read error, prints a message to stderr and exits
+# with status 1.
+function combine_line() {
+	ret = getline next_line;
+	if (ret == 0) {
+		# Don't allow two consecutive getlines at the end of the file
+		if (eof_found) {
+			print "Error: expected more input." > "/dev/stderr";
+			exit 1;
+		} else {
+			eof_found = 1;
+		}
+	} else if (ret == -1) {
+		# Keep a space between the message and the file name.
+		print "Error reading next line of file " FILENAME > "/dev/stderr";
+		exit 1;
+	}
+	$0 = $0 "\n" next_line;
+}
+
+# Combine backslashed lines and multiline comments.
+#
+# Keeps appending input lines for as long as the record ends in a
+# backslash or contains a "/*" that is still unclosed at the end of the
+# record.
+function combine_backslashes() {
+ while (/\\$|\/\*([^*]|\*+[^*\/])*\**$/) {
+ combine_line();
+ }
+}
+
+# Unconditionally append one more input line to $0, then any further
+# lines needed to complete a backslash continuation or an open comment.
+function read_line() {
+ combine_line();
+ combine_backslashes();
+}
+
+# Print out field separators and update variables that depend on them. Only
+# print if p is true. Call with sep="" and p=0 to print out the last field
+# separator.
+#
+# Printing is deferred by one call: each call writes the separator saved by
+# the previous call (if that one was marked printable) and then saves sep
+# and p in last_fs and last_fs_print.
+function update_fieldsep(sep, p) {
+ # Count braces and parentheses on a copy of the separator from which
+ # quoted strings and comments have been removed.
+ sep_tmp = sep;
+ gsub(quote_regexp "|" comment_regexp, "", sep_tmp);
+ # Each pass removes one bracket from the copy and adjusts the matching
+ # nesting counter; the loop ends when no bracket is left.
+ while (1)
+ {
+ if (sub("[^{}()]*\\{", "", sep_tmp)) {
+ brace_nesting++;
+ continue;
+ }
+ if (sub("[^{}()]*\\}", "", sep_tmp)) {
+ brace_nesting--;
+ if (brace_nesting < 0) {
+ print "Unbalanced braces!" > "/dev/stderr";
+ exit 1;
+ }
+ continue;
+ }
+ if (sub("[^{}()]*\\(", "", sep_tmp)) {
+ paren_nesting++;
+ continue;
+ }
+ if (sub("[^{}()]*\\)", "", sep_tmp)) {
+ paren_nesting--;
+ if (paren_nesting < 0) {
+ print "Unbalanced parenthesis!" > "/dev/stderr";
+ exit 1;
+ }
+ continue;
+ }
+
+ break;
+ }
+
+ # Emit the previously saved separator, then remember this one.
+ if (last_fs_print)
+ printf("%s", last_fs) > outputfile;
+ last_fs = sep;
+ last_fs_print = p;
+}
+
+# Shifts the fields down by n positions. Calls next if there are no more. If p
+# is true then print out field separators.
+#
+# The text of a shifted-out field is not printed here; only its trailing
+# separator is handed to update_fieldsep().
+function shift_fields(n, p) {
+ do {
+ if (match($0, FS) > 0) {
+ # Drop everything up to and including the first separator.
+ update_fieldsep(substr($0, RSTART, RLENGTH), p);
+ if (RSTART + RLENGTH <= length())
+ $0 = substr($0, RSTART + RLENGTH);
+ else
+ $0 = "";
+ } else {
+ # No separator left in the record: flush the pending
+ # separator, end the output line and move to the next record.
+ update_fieldsep("", 0);
+ print "" > outputfile;
+ next;
+ }
+ } while (--n > 0);
+}
+
+# Shifts and prints the first n fields.
+#
+# For each field: flush the pending separator, write the field text, then
+# shift it out with its trailing separator marked for printing.
+function print_fields(n) {
+ do {
+ update_fieldsep("", 0);
+ printf("%s", $1) > outputfile;
+ shift_fields(1, 1);
+ } while (--n > 0);
+}
+
+# Extend the record with any continuation lines before it is parsed.
+{
+ combine_backslashes();
+}
+
+# Print leading FS
+#
+# A separator at the very start of the record is handed to
+# update_fieldsep() for printing and removed from $0.
+{
+ if (match($0, "^(" FS ")+") > 0) {
+ update_fieldsep(substr($0, RSTART, RLENGTH), 1);
+ if (RSTART + RLENGTH <= length())
+ $0 = substr($0, RSTART + RLENGTH);
+ else
+ $0 = "";
+ }
+}
+
+# Parse the line.
+#
+# Consumes the record one leading field at a time.  Each special case below
+# either rewrites or drops the fields it recognizes, or reads more input
+# when the record is too short to decide; anything unrecognized is copied
+# through unchanged by the default print_fields(1) at the bottom.
+{
+	while (NF > 0) {
+		# Make sure a "struct" keyword has its tag in the same record.
+		if ($1 == "struct" && NF < 3) {
+			read_line();
+			continue;
+		}
+
+		# Entering the definition of struct srcu_struct in the header.
+		if (FILENAME ~ /\.h$/ && !inside_srcu_struct &&
+		    brace_nesting == 0 && paren_nesting == 0 &&
+		    $1 == "struct" && $2 == "srcu_struct" &&
+		    $0 ~ "^struct(" FS ")+srcu_struct(" FS ")+\\{") {
+			inside_srcu_struct = 1;
+			print_fields(2);
+			continue;
+		}
+		# Leaving it: declare the rcu_batch members removed from it.
+		if (inside_srcu_struct && brace_nesting == 0 &&
+		    paren_nesting == 0) {
+			inside_srcu_struct = 0;
+			update_fieldsep("", 0);
+			for (name in rcu_batches)
+				print "extern struct rcu_batch " name ";" > outputfile;
+		}
+
+		if (inside_srcu_struct && $1 == "struct" && $2 == "rcu_batch") {
+			# Move rcu_batches outside of the struct.
+			rcu_batches[$3] = "";
+			shift_fields(3, 1);
+			sub(/;[[:space:]]*$/, "", last_fs);
+			continue;
+		}
+
+		# Entering the __SRCU_STRUCT_INIT macro definition.
+		if (FILENAME ~ /\.h$/ && !inside_srcu_init_def &&
+		    $1 == "#define" && $2 == "__SRCU_STRUCT_INIT") {
+			inside_srcu_init_def = 1;
+			srcu_init_param_name = $3;
+			in_macro = 1;
+			print_fields(3);
+			continue;
+		}
+		if (inside_srcu_init_def && brace_nesting == 0 &&
+		    paren_nesting == 0) {
+			inside_srcu_init_def = 0;
+			in_macro = 0;
+			continue;
+		}
+
+		# A designated initializer (".name = ...") inside the macro:
+		# if it names an rcu_batch, cut it out of the macro and keep
+		# its text as that batch's initializer.
+		if (inside_srcu_init_def && brace_nesting == 1 &&
+		    paren_nesting == 0 && last_fs ~ /\.[[:space:]]*$/ &&
+		    $1 ~ /^[[:alnum:]_]+$/) {
+			name = $1;
+			if (name in rcu_batches) {
+				# Remove the dot.
+				sub(/\.[[:space:]]*$/, "", last_fs);
+
+				old_record = $0;
+				do
+					shift_fields(1, 0);
+				while (last_fs !~ /,/ || paren_nesting > 0);
+				end_loc = length(old_record) - length($0);
+				end_loc += index(last_fs, ",") - length(last_fs);
+
+				last_fs = substr(last_fs, index(last_fs, ",") + 1);
+				last_fs_print = 1;
+
+				match(old_record, "^"name"("FS")+=");
+				start_loc = RSTART + RLENGTH;
+
+				len = end_loc - start_loc;
+				initializer = substr(old_record, start_loc, len);
+				gsub(srcu_init_param_name "\\.", "", initializer);
+				rcu_batches[name] = initializer;
+				continue;
+			}
+		}
+
+		# Don't include a nonexistent file
+		if (!in_macro && $1 == "#include" && /^#include[[:space:]]+"rcu\.h"/) {
+			update_fieldsep("", 0);
+			next;
+		}
+
+		# Ignore most preprocessor stuff.
+		if (!in_macro && $1 ~ /#/) {
+			break;
+		}
+
+		if (brace_nesting > 0 && $1 ~ "^[[:alnum:]_]+$" && NF < 2) {
+			read_line();
+			continue;
+		}
+		if (brace_nesting > 0 &&
+		    $0 ~ "^[[:alnum:]_]+[[:space:]]*(\\.|->)[[:space:]]*[[:alnum:]_]+" &&
+		    $2 in rcu_batches) {
+			# Make uses of rcu_batches global. Somewhat unreliable.
+			shift_fields(1, 0);
+			print_fields(1);
+			continue;
+		}
+
+		if ($1 == "static" && NF < 3) {
+			read_line();
+			continue;
+		}
+		# Drop the "static" from these two function definitions.
+		if ($1 == "static" && ($2 == "bool" && $3 == "try_check_zero" ||
+		                       $2 == "void" && $3 == "srcu_flip")) {
+			shift_fields(1, 1);
+			print_fields(2);
+			continue;
+		}
+
+		# Distinguish between read-side and write-side memory barriers.
+		if ($1 == "smp_mb" && NF < 2) {
+			read_line();
+			continue;
+		}
+		if (match($0, /^smp_mb[[:space:]();\/*]*[[:alnum:]]/)) {
+			# The first alphanumeric character after the call
+			# selects the replacement barrier.
+			barrier_letter = substr($0, RLENGTH, 1);
+			if (barrier_letter ~ /A|D/)
+				new_barrier_name = "sync_smp_mb";
+			else if (barrier_letter ~ /B|C/)
+				new_barrier_name = "rs_smp_mb";
+			else {
+				# Report on stderr like every other fatal error
+				# in this script.
+				print "Unrecognized memory barrier." > "/dev/stderr";
+				exit 1;
+			}
+
+			shift_fields(1, 1);
+			printf("%s", new_barrier_name) > outputfile;
+			continue;
+		}
+
+		# Skip definition of rcu_synchronize, since it is already
+		# defined in misc.h. Only present in old versions of srcu.
+		if (brace_nesting == 0 && paren_nesting == 0 &&
+		    $1 == "struct" && $2 == "rcu_synchronize" &&
+		    $0 ~ "^struct(" FS ")+rcu_synchronize(" FS ")+\\{") {
+			shift_fields(2, 0);
+			while (brace_nesting) {
+				if (NF < 2)
+					read_line();
+				shift_fields(1, 0);
+			}
+		}
+
+		# Skip definition of wakeme_after_rcu for the same reason
+		if (brace_nesting == 0 && $1 == "static" && $2 == "void" &&
+		    $3 == "wakeme_after_rcu") {
+			while (NF < 5)
+				read_line();
+			shift_fields(3, 0);
+			do {
+				while (NF < 3)
+					read_line();
+				shift_fields(1, 0);
+			} while (paren_nesting || brace_nesting);
+		}
+
+		if ($1 ~ /^(unsigned|long)$/ && NF < 3) {
+			read_line();
+			continue;
+		}
+
+		# Give srcu_batches_completed the correct type for old SRCU.
+		if (brace_nesting == 0 && $1 == "long" &&
+		    $2 == "srcu_batches_completed") {
+			update_fieldsep("", 0);
+			printf("unsigned ") > outputfile;
+			print_fields(2);
+			continue;
+		}
+		if (brace_nesting == 0 && $1 == "unsigned" && $2 == "long" &&
+		    $3 == "srcu_batches_completed") {
+			print_fields(3);
+			continue;
+		}
+
+		# Just print out the input code by default.
+		print_fields(1);
+	}
+	# Flush the pending separator, then write what is left of the record.
+	update_fieldsep("", 0);
+	print > outputfile;
+	next;
+}
+
+# After all input: flush the pending separator, check that braces balanced,
+# and write the definitions of the collected rcu_batch variables to the
+# C output file.
+END {
+ update_fieldsep("", 0);
+
+ if (brace_nesting != 0) {
+ print "Unbalanced braces!" > "/dev/stderr";
+ exit 1;
+ }
+
+ # Define the rcu_batches
+ for (name in rcu_batches)
+ print "struct rcu_batch " name " = " rcu_batches[name] ";" > c_output;
+}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h
new file mode 100644
index 000000000..570a49d9d
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/assume.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef ASSUME_H
+#define ASSUME_H
+
+/* Provide an assumption macro that can be disabled for gcc. */
+#ifdef RUN
+#define assume(x) \
+ do { \
+ /* Evaluate x to suppress warnings. */ \
+ (void) (x); \
+ } while (0)
+
+#else
+#define assume(x) __CPROVER_assume(x)
+#endif
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h
new file mode 100644
index 000000000..3f95a768a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/barriers.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BARRIERS_H
+#define BARRIERS_H
+
+#define barrier() __asm__ __volatile__("" : : : "memory")
+
+#ifdef RUN
+#define smp_mb() __sync_synchronize()
+#define smp_mb__after_unlock_lock() __sync_synchronize()
+#else
+/*
+ * Copied from CBMC's implementation of __sync_synchronize(), which
+ * seems to be disabled by default.
+ */
+#define smp_mb() __CPROVER_fence("WWfence", "RRfence", "RWfence", "WRfence", \
+ "WWcumul", "RRcumul", "RWcumul", "WRcumul")
+#define smp_mb__after_unlock_lock() __CPROVER_fence("WWfence", "RRfence", "RWfence", "WRfence", \
+ "WWcumul", "RRcumul", "RWcumul", "WRcumul")
+#endif
+
+/*
+ * Allow memory barriers to be disabled in either the read or write side
+ * of SRCU individually.
+ */
+
+#ifndef NO_SYNC_SMP_MB
+#define sync_smp_mb() smp_mb()
+#else
+#define sync_smp_mb() do {} while (0)
+#endif
+
+#ifndef NO_READ_SIDE_SMP_MB
+#define rs_smp_mb() smp_mb()
+#else
+#define rs_smp_mb() do {} while (0)
+#endif
+
+/*
+ * Single-access helpers: READ_ONCE(x) reads x through a volatile-qualified
+ * lvalue, and WRITE_ONCE(x, val) stores val to x the same way.
+ */
+#define READ_ONCE(x) (*(volatile typeof(x) *) &(x))
+#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *) &(x)) = (val))
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h
new file mode 100644
index 000000000..5e7912c6a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/bug_on.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef BUG_ON_H
+#define BUG_ON_H
+
+#include <assert.h>
+
+#define BUG() assert(0)
+#define BUG_ON(x) assert(!(x))
+
+/* Does it make sense to treat warnings as errors? */
+#define WARN() BUG()
+#define WARN_ON(x) (BUG_ON(x), false)
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c
new file mode 100644
index 000000000..e67ee5b3d
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/combined_source.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <config.h>
+
+/* Include all source files. */
+
+#include "include_srcu.c"
+
+#include "preempt.c"
+#include "misc.c"
+
+/* Used by test.c files */
+#include <pthread.h>
+#include <stdlib.h>
+#include <linux/srcu.h>
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
new file mode 100644
index 000000000..283d71033
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/config.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* "Cheater" definitions based on restricted Kconfig choices. */
+
+#undef CONFIG_TINY_RCU
+#undef __CHECKER__
+#undef CONFIG_DEBUG_LOCK_ALLOC
+#undef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+#undef CONFIG_HOTPLUG_CPU
+#undef CONFIG_MODULES
+#undef CONFIG_NO_HZ_FULL_SYSIDLE
+#undef CONFIG_PREEMPT_COUNT
+#undef CONFIG_PREEMPT_RCU
+#undef CONFIG_PROVE_RCU
+#undef CONFIG_RCU_NOCB_CPU
+#undef CONFIG_RCU_NOCB_CPU_ALL
+#undef CONFIG_RCU_STALL_COMMON
+#undef CONFIG_RCU_TRACE
+#undef CONFIG_RCU_USER_QS
+#undef CONFIG_TASKS_RCU
+#define CONFIG_TREE_RCU
+
+#define CONFIG_GENERIC_ATOMIC64
+
+#if NR_CPUS > 1
+#define CONFIG_SMP
+#else
+#undef CONFIG_SMP
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c
new file mode 100644
index 000000000..e5202d4cf
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/include_srcu.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "int_typedefs.h"
+
+#include "barriers.h"
+#include "bug_on.h"
+#include "locks.h"
+#include "misc.h"
+#include "preempt.h"
+#include "percpu.h"
+#include "workqueues.h"
+
+#ifdef USE_SIMPLE_SYNC_SRCU
+#define synchronize_srcu(sp) synchronize_srcu_original(sp)
+#endif
+
+#include <srcu.c>
+
+#ifdef USE_SIMPLE_SYNC_SRCU
+#undef synchronize_srcu
+
+#include "simple_sync_srcu.c"
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h
new file mode 100644
index 000000000..0dd27aa51
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/int_typedefs.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef INT_TYPEDEFS_H
+#define INT_TYPEDEFS_H
+
+#include <inttypes.h>
+
+typedef int8_t s8;
+typedef uint8_t u8;
+typedef int16_t s16;
+typedef uint16_t u16;
+typedef int32_t s32;
+typedef uint32_t u32;
+typedef int64_t s64;
+typedef uint64_t u64;
+
+typedef int8_t __s8;
+typedef uint8_t __u8;
+typedef int16_t __s16;
+typedef uint16_t __u16;
+typedef int32_t __s32;
+typedef uint32_t __u32;
+typedef int64_t __s64;
+typedef uint64_t __u64;
+
+#define S8_C(x) INT8_C(x)
+#define U8_C(x) UINT8_C(x)
+#define S16_C(x) INT16_C(x)
+#define U16_C(x) UINT16_C(x)
+#define S32_C(x) INT32_C(x)
+#define U32_C(x) UINT32_C(x)
+#define S64_C(x) INT64_C(x)
+#define U64_C(x) UINT64_C(x)
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
new file mode 100644
index 000000000..cf6938d67
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/locks.h
@@ -0,0 +1,221 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LOCKS_H
+#define LOCKS_H
+
+#include <limits.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+#include "assume.h"
+#include "bug_on.h"
+#include "preempt.h"
+
+int nondet_int(void);
+
+#define __acquire(x)
+#define __acquires(x)
+#define __release(x)
+#define __releases(x)
+
+/* Only use one lock mechanism. Select which one. */
+#ifdef PTHREAD_LOCK
+struct lock_impl {
+ pthread_mutex_t mutex;
+};
+
+static inline void lock_impl_lock(struct lock_impl *lock)
+{
+ BUG_ON(pthread_mutex_lock(&lock->mutex));
+}
+
+static inline void lock_impl_unlock(struct lock_impl *lock)
+{
+ BUG_ON(pthread_mutex_unlock(&lock->mutex));
+}
+
+static inline bool lock_impl_trylock(struct lock_impl *lock)
+{
+ int err = pthread_mutex_trylock(&lock->mutex);
+
+ if (!err)
+ return true;
+ else if (err == EBUSY)
+ return false;
+ BUG();
+}
+
+/*
+ * Initialize the pthread mutex with default attributes. A failed
+ * initialization is treated as fatal, matching how lock_impl_lock() and
+ * lock_impl_unlock() handle errors from their pthread calls.
+ */
+static inline void lock_impl_init(struct lock_impl *lock)
+{
+	BUG_ON(pthread_mutex_init(&lock->mutex, NULL));
+}
+
+#define LOCK_IMPL_INITIALIZER {.mutex = PTHREAD_MUTEX_INITIALIZER}
+
+#else /* !defined(PTHREAD_LOCK) */
+/* Spinlock that assumes that it always gets the lock immediately. */
+
+struct lock_impl {
+ bool locked;
+};
+
+static inline bool lock_impl_trylock(struct lock_impl *lock)
+{
+#ifdef RUN
+ /* TODO: Should this be a test and set? */
+ return __sync_bool_compare_and_swap(&lock->locked, false, true);
+#else
+ __CPROVER_atomic_begin();
+ bool old_locked = lock->locked;
+ lock->locked = true;
+ __CPROVER_atomic_end();
+
+ /* Minimal barrier to prevent accesses leaking out of lock. */
+ __CPROVER_fence("RRfence", "RWfence");
+
+ return !old_locked;
+#endif
+}
+
+static inline void lock_impl_lock(struct lock_impl *lock)
+{
+ /*
+ * CBMC doesn't support busy waiting, so just assume that the
+ * lock is available.
+ */
+ assume(lock_impl_trylock(lock));
+
+ /*
+ * If the lock was already held by this thread then the assumption
+ * is unsatisfiable (deadlock).
+ */
+}
+
+static inline void lock_impl_unlock(struct lock_impl *lock)
+{
+#ifdef RUN
+ BUG_ON(!__sync_bool_compare_and_swap(&lock->locked, true, false));
+#else
+ /* Minimal barrier to prevent accesses leaking out of lock. */
+ __CPROVER_fence("RWfence", "WWfence");
+
+ __CPROVER_atomic_begin();
+ bool old_locked = lock->locked;
+ lock->locked = false;
+ __CPROVER_atomic_end();
+
+ BUG_ON(!old_locked);
+#endif
+}
+
+static inline void lock_impl_init(struct lock_impl *lock)
+{
+ lock->locked = false;
+}
+
+#define LOCK_IMPL_INITIALIZER {.locked = false}
+
+#endif /* !defined(PTHREAD_LOCK) */
+
+/*
+ * Implement spinlocks using the lock mechanism. Wrap the lock to prevent mixing
+ * locks of different types.
+ */
+typedef struct {
+ struct lock_impl internal_lock;
+} spinlock_t;
+
+#define SPIN_LOCK_UNLOCKED {.internal_lock = LOCK_IMPL_INITIALIZER}
+#define __SPIN_LOCK_UNLOCKED(x) SPIN_LOCK_UNLOCKED
+#define DEFINE_SPINLOCK(x) spinlock_t x = SPIN_LOCK_UNLOCKED
+
+static inline void spin_lock_init(spinlock_t *lock)
+{
+ lock_impl_init(&lock->internal_lock);
+}
+
+static inline void spin_lock(spinlock_t *lock)
+{
+ /*
+ * Spin locks also need to be removed in order to eliminate all
+ * memory barriers. They are only used by the write side anyway.
+ */
+#ifndef NO_SYNC_SMP_MB
+ preempt_disable();
+ lock_impl_lock(&lock->internal_lock);
+#endif
+}
+
+static inline void spin_unlock(spinlock_t *lock)
+{
+#ifndef NO_SYNC_SMP_MB
+ lock_impl_unlock(&lock->internal_lock);
+ preempt_enable();
+#endif
+}
+
+/* Don't bother with interrupts */
+#define spin_lock_irq(lock) spin_lock(lock)
+#define spin_unlock_irq(lock) spin_unlock(lock)
+#define spin_lock_irqsave(lock, flags) spin_lock(lock)
+#define spin_unlock_irqrestore(lock, flags) spin_unlock(lock)
+
+/*
+ * Try to take the spinlock without waiting. Returns true if the lock was
+ * acquired and false otherwise. (This is supposed to return an int, but a
+ * bool should work as well.)
+ *
+ * Preemption is left disabled only when the lock is actually taken;
+ * spin_unlock() re-enables it. On failure the preempt_disable() is undone
+ * here so that the disable count and the per-cpu preemption lock are not
+ * leaked.
+ */
+static inline bool spin_trylock(spinlock_t *lock)
+{
+#ifndef NO_SYNC_SMP_MB
+	preempt_disable();
+	if (lock_impl_trylock(&lock->internal_lock))
+		return true;
+	preempt_enable();
+	return false;
+#else
+	return true;
+#endif
+}
+
+struct completion {
+ /* Hopefully this won't overflow. */
+ unsigned int count;
+};
+
+#define COMPLETION_INITIALIZER(x) {.count = 0}
+#define DECLARE_COMPLETION(x) struct completion x = COMPLETION_INITIALIZER(x)
+#define DECLARE_COMPLETION_ONSTACK(x) DECLARE_COMPLETION(x)
+
+static inline void init_completion(struct completion *c)
+{
+ c->count = 0;
+}
+
+static inline void wait_for_completion(struct completion *c)
+{
+ unsigned int prev_count = __sync_fetch_and_sub(&c->count, 1);
+
+ assume(prev_count);
+}
+
+static inline void complete(struct completion *c)
+{
+ unsigned int prev_count = __sync_fetch_and_add(&c->count, 1);
+
+ BUG_ON(prev_count == UINT_MAX);
+}
+
+/* This function probably isn't very useful for CBMC. */
+static inline bool try_wait_for_completion(struct completion *c)
+{
+ BUG();
+}
+
+static inline bool completion_done(struct completion *c)
+{
+ return c->count;
+}
+
+/* TODO: Implement complete_all */
+static inline void complete_all(struct completion *c)
+{
+ BUG();
+}
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c
new file mode 100644
index 000000000..9440cc39e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <config.h>
+
+#include "misc.h"
+#include "bug_on.h"
+
+struct rcu_head;
+
+void wakeme_after_rcu(struct rcu_head *head)
+{
+ BUG();
+}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h
new file mode 100644
index 000000000..aca50030f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/misc.h
@@ -0,0 +1,58 @@
+#ifndef MISC_H
+#define MISC_H
+
+#include "assume.h"
+#include "int_typedefs.h"
+#include "locks.h"
+
+#include <linux/types.h>
+
+/* Probably won't need to deal with bottom halves. */
+static inline void local_bh_disable(void) {}
+static inline void local_bh_enable(void) {}
+
+#define MODULE_ALIAS(X)
+#define module_param(...)
+#define EXPORT_SYMBOL_GPL(x)
+
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member) *__mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); \
+})
+
+#ifndef USE_SIMPLE_SYNC_SRCU
+/* Abuse udelay to make sure that busy loops terminate. */
+#define udelay(x) assume(0)
+
+#else
+
+/* The simple custom synchronize_srcu is ok with try_check_zero failing. */
+#define udelay(x) do { } while (0)
+#endif
+
+#define trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \
+ do { } while (0)
+
+#define notrace
+
+/* Avoid including rcupdate.h */
+struct rcu_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
+
+void wakeme_after_rcu(struct rcu_head *head);
+
+#define rcu_lock_acquire(a) do { } while (0)
+#define rcu_lock_release(a) do { } while (0)
+#define rcu_lockdep_assert(c, s) do { } while (0)
+#define RCU_LOCKDEP_WARN(c, s) do { } while (0)
+
+/* Let CBMC non-deterministically choose between normal and expedited. */
+bool rcu_gp_is_normal(void);
+bool rcu_gp_is_expedited(void);
+
+/* Do the same for old versions of rcu. */
+#define rcu_expedited (rcu_gp_is_expedited())
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h
new file mode 100644
index 000000000..27e67a3f2
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/percpu.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef PERCPU_H
+#define PERCPU_H
+
+#include <stddef.h>
+#include "bug_on.h"
+#include "preempt.h"
+
+#define __percpu
+
+/* Maximum size of any percpu data. */
+#define PERCPU_OFFSET (4 * sizeof(long))
+
+/* Ignore alignment, as CBMC doesn't care about false sharing. */
+#define alloc_percpu(type) __alloc_percpu(sizeof(type), 1)
+
+static inline void *__alloc_percpu(size_t size, size_t align)
+{
+ BUG();
+ return NULL;
+}
+
+static inline void free_percpu(void *ptr)
+{
+ BUG();
+}
+
+/*
+ * Step PERCPU_OFFSET bytes per cpu from ptr to reach that cpu's copy.
+ * The cpu argument is parenthesized so that expressions such as "i + 1"
+ * are scaled as a whole.
+ */
+#define per_cpu_ptr(ptr, cpu) \
+	((typeof(ptr)) ((char *) (ptr) + PERCPU_OFFSET * (cpu)))
+
+#define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1)
+#define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1)
+#define __this_cpu_sub(pcp, n) __this_cpu_add(pcp, -(typeof(pcp)) (n))
+
+#define this_cpu_inc(pcp) this_cpu_add(pcp, 1)
+#define this_cpu_dec(pcp) this_cpu_sub(pcp, 1)
+#define this_cpu_sub(pcp, n) this_cpu_add(pcp, -(typeof(pcp)) (n))
+
+/* Make CBMC use atomics to work around bug. */
+#ifdef RUN
+#define THIS_CPU_ADD_HELPER(ptr, x) (*(ptr) += (x))
+#else
+/*
+ * Split the atomic into a read and a write so that it has the least
+ * possible ordering.
+ *
+ * The addend is held in a temporary of the pointed-to type (not the
+ * pointer type), so the addition below is plain arithmetic on values
+ * rather than an integer/pointer mix.
+ */
+#define THIS_CPU_ADD_HELPER(ptr, x) \
+	do { \
+		typeof(ptr) this_cpu_add_helper_ptr = (ptr); \
+		typeof(*(ptr)) this_cpu_add_helper_x = (x); \
+		typeof(*(ptr)) this_cpu_add_helper_temp; \
+		__CPROVER_atomic_begin(); \
+		this_cpu_add_helper_temp = *(this_cpu_add_helper_ptr); \
+		__CPROVER_atomic_end(); \
+		this_cpu_add_helper_temp += this_cpu_add_helper_x; \
+		__CPROVER_atomic_begin(); \
+		*(this_cpu_add_helper_ptr) = this_cpu_add_helper_temp; \
+		__CPROVER_atomic_end(); \
+	} while (0)
+#endif
+
+/*
+ * For some reason CBMC needs an atomic operation even though this is percpu
+ * data.
+ */
+#define __this_cpu_add(pcp, n) \
+ do { \
+ BUG_ON(preemptible()); \
+ THIS_CPU_ADD_HELPER(per_cpu_ptr(&(pcp), thread_cpu_id), \
+ (typeof(pcp)) (n)); \
+ } while (0)
+
+#define this_cpu_add(pcp, n) \
+ do { \
+ int this_cpu_add_impl_cpu = get_cpu(); \
+ THIS_CPU_ADD_HELPER(per_cpu_ptr(&(pcp), this_cpu_add_impl_cpu), \
+ (typeof(pcp)) (n)); \
+ put_cpu(); \
+ } while (0)
+
+/*
+ * This will cause a compiler warning because of the cast from char[][] to
+ * type*. This will cause a compile time error if type is too big.
+ */
+#define DEFINE_PER_CPU(type, name) \
+ char name[NR_CPUS][PERCPU_OFFSET]; \
+ typedef char percpu_too_big_##name \
+ [sizeof(type) > PERCPU_OFFSET ? -1 : 1]
+
+#define for_each_possible_cpu(cpu) \
+ for ((cpu) = 0; (cpu) < NR_CPUS; ++(cpu))
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c
new file mode 100644
index 000000000..b4083ae34
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.c
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <config.h>
+
+#include "preempt.h"
+
+#include "assume.h"
+#include "locks.h"
+
+/* Support NR_CPUS of at most 64 */
+#define CPU_PREEMPTION_LOCKS_INIT0 LOCK_IMPL_INITIALIZER
+#define CPU_PREEMPTION_LOCKS_INIT1 \
+ CPU_PREEMPTION_LOCKS_INIT0, CPU_PREEMPTION_LOCKS_INIT0
+#define CPU_PREEMPTION_LOCKS_INIT2 \
+ CPU_PREEMPTION_LOCKS_INIT1, CPU_PREEMPTION_LOCKS_INIT1
+#define CPU_PREEMPTION_LOCKS_INIT3 \
+ CPU_PREEMPTION_LOCKS_INIT2, CPU_PREEMPTION_LOCKS_INIT2
+#define CPU_PREEMPTION_LOCKS_INIT4 \
+ CPU_PREEMPTION_LOCKS_INIT3, CPU_PREEMPTION_LOCKS_INIT3
+#define CPU_PREEMPTION_LOCKS_INIT5 \
+ CPU_PREEMPTION_LOCKS_INIT4, CPU_PREEMPTION_LOCKS_INIT4
+
+/*
+ * Simulate disabling preemption by locking a particular cpu. NR_CPUS
+ * should be the actual number of cpus, not just the maximum.
+ */
+struct lock_impl cpu_preemption_locks[NR_CPUS] = {
+ CPU_PREEMPTION_LOCKS_INIT0
+#if (NR_CPUS - 1) & 1
+ , CPU_PREEMPTION_LOCKS_INIT0
+#endif
+#if (NR_CPUS - 1) & 2
+ , CPU_PREEMPTION_LOCKS_INIT1
+#endif
+#if (NR_CPUS - 1) & 4
+ , CPU_PREEMPTION_LOCKS_INIT2
+#endif
+#if (NR_CPUS - 1) & 8
+ , CPU_PREEMPTION_LOCKS_INIT3
+#endif
+#if (NR_CPUS - 1) & 16
+ , CPU_PREEMPTION_LOCKS_INIT4
+#endif
+#if (NR_CPUS - 1) & 32
+ , CPU_PREEMPTION_LOCKS_INIT5
+#endif
+};
+
+#undef CPU_PREEMPTION_LOCKS_INIT0
+#undef CPU_PREEMPTION_LOCKS_INIT1
+#undef CPU_PREEMPTION_LOCKS_INIT2
+#undef CPU_PREEMPTION_LOCKS_INIT3
+#undef CPU_PREEMPTION_LOCKS_INIT4
+#undef CPU_PREEMPTION_LOCKS_INIT5
+
+__thread int thread_cpu_id;
+__thread int preempt_disable_count;
+
+void preempt_disable(void)
+{
+ BUG_ON(preempt_disable_count < 0 || preempt_disable_count == INT_MAX);
+
+ if (preempt_disable_count++)
+ return;
+
+ thread_cpu_id = nondet_int();
+ assume(thread_cpu_id >= 0);
+ assume(thread_cpu_id < NR_CPUS);
+ lock_impl_lock(&cpu_preemption_locks[thread_cpu_id]);
+}
+
+void preempt_enable(void)
+{
+ BUG_ON(preempt_disable_count < 1);
+
+ if (--preempt_disable_count)
+ return;
+
+ lock_impl_unlock(&cpu_preemption_locks[thread_cpu_id]);
+}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h
new file mode 100644
index 000000000..f8b762cd2
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/preempt.h
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef PREEMPT_H
+#define PREEMPT_H
+
+#include <stdbool.h>
+
+#include "bug_on.h"
+
+/* This flag contains garbage if preempt_disable_count is 0. */
+extern __thread int thread_cpu_id;
+
+/* Support recursive preemption disabling. */
+extern __thread int preempt_disable_count;
+
+void preempt_disable(void);
+void preempt_enable(void);
+
+static inline void preempt_disable_notrace(void)
+{
+ preempt_disable();
+}
+
+static inline void preempt_enable_no_resched(void)
+{
+ preempt_enable();
+}
+
+static inline void preempt_enable_notrace(void)
+{
+ preempt_enable();
+}
+
+static inline int preempt_count(void)
+{
+ return preempt_disable_count;
+}
+
+static inline bool preemptible(void)
+{
+ return !preempt_count();
+}
+
+static inline int get_cpu(void)
+{
+ preempt_disable();
+ return thread_cpu_id;
+}
+
+static inline void put_cpu(void)
+{
+ preempt_enable();
+}
+
+static inline void might_sleep(void)
+{
+ BUG_ON(preempt_disable_count);
+}
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c
new file mode 100644
index 000000000..97f592048
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/simple_sync_srcu.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <pthread.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "int_typedefs.h"
+
+#include "barriers.h"
+#include "bug_on.h"
+#include "locks.h"
+#include "misc.h"
+#include "preempt.h"
+#include "percpu.h"
+#include "workqueues.h"
+
+#include <linux/srcu.h>
+
+/* Functions needed from modify_srcu.c */
+bool try_check_zero(struct srcu_struct *sp, int idx, int trycount);
+void srcu_flip(struct srcu_struct *sp);
+
+/* Simpler implementation of synchronize_srcu that ignores batching. */
+void synchronize_srcu(struct srcu_struct *sp)
+{
+ int idx;
+ /*
+ * This code assumes that try_check_zero will succeed anyway,
+ * so there is no point in multiple tries.
+ */
+ const int trycount = 1;
+
+ might_sleep();
+
+ /* Ignore the lock, as multiple writers aren't working yet anyway. */
+
+ idx = 1 ^ (sp->completed & 1);
+
+ /* For comments see srcu_advance_batches. */
+
+ assume(try_check_zero(sp, idx, trycount));
+
+ srcu_flip(sp);
+
+ assume(try_check_zero(sp, idx^1, trycount));
+}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h
new file mode 100644
index 000000000..28b960300
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/src/workqueues.h
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef WORKQUEUES_H
+#define WORKQUEUES_H
+
+#include <stdbool.h>
+
+#include "barriers.h"
+#include "bug_on.h"
+#include "int_typedefs.h"
+
+#include <linux/types.h>
+
+/* Stub workqueue implementation. */
+
+struct work_struct;
+typedef void (*work_func_t)(struct work_struct *work);
+void delayed_work_timer_fn(unsigned long __data);
+
+struct work_struct {
+/* atomic_long_t data; */
+ unsigned long data;
+
+ struct list_head entry;
+ work_func_t func;
+#ifdef CONFIG_LOCKDEP
+ struct lockdep_map lockdep_map;
+#endif
+};
+
+struct timer_list {
+ struct hlist_node entry;
+ unsigned long expires;
+ void (*function)(unsigned long);
+ unsigned long data;
+ u32 flags;
+ int slack;
+};
+
+struct delayed_work {
+ struct work_struct work;
+ struct timer_list timer;
+
+ /* target workqueue and CPU ->timer uses to queue ->work */
+ struct workqueue_struct *wq;
+ int cpu;
+};
+
+
+static inline bool schedule_work(struct work_struct *work)
+{
+ BUG();
+ return true;
+}
+
+static inline bool schedule_work_on(int cpu, struct work_struct *work)
+{
+ BUG();
+ return true;
+}
+
+static inline bool queue_work(struct workqueue_struct *wq,
+ struct work_struct *work)
+{
+ BUG();
+ return true;
+}
+
+static inline bool queue_delayed_work(struct workqueue_struct *wq,
+ struct delayed_work *dwork,
+ unsigned long delay)
+{
+ BUG();
+ return true;
+}
+
+#define INIT_WORK(w, f) \
+ do { \
+ (w)->data = 0; \
+ (w)->func = (f); \
+ } while (0)
+
+#define INIT_DELAYED_WORK(w, f) INIT_WORK(&(w)->work, (f))
+
+#define __WORK_INITIALIZER(n, f) { \
+ .data = 0, \
+ .entry = { &(n).entry, &(n).entry }, \
+ .func = f \
+ }
+
+/* Don't bother initializing timer. */
+#define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \
+ .work = __WORK_INITIALIZER((n).work, (f)), \
+ }
+
+/* Define a struct work_struct named n, statically initialized with handler f. */
+#define DECLARE_WORK(n, f) \
+	struct work_struct n = __WORK_INITIALIZER(n, f)
+
+#define DECLARE_DELAYED_WORK(n, f) \
+ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)
+
+#define system_power_efficient_wq ((struct workqueue_struct *) NULL)
+
+#endif
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore
new file mode 100644
index 000000000..d65462d64
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+*.out
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile
new file mode 100644
index 000000000..ad21b925f
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+CBMC_FLAGS = -I../.. -I../../src -I../../include -I../../empty_includes -32 -pointer-check -mm pso
+
+all:
+ for i in ./*.pass; do \
+ echo $$i ; \
+ CBMC_FLAGS="$(CBMC_FLAGS)" sh ../test_script.sh --should-pass $$i > $$i.out 2>&1 ; \
+ done
+ for i in ./*.fail; do \
+ echo $$i ; \
+ CBMC_FLAGS="$(CBMC_FLAGS)" sh ../test_script.sh --should-fail $$i > $$i.out 2>&1 ; \
+ done
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail
new file mode 100644
index 000000000..40c807591
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/assert_end.fail
@@ -0,0 +1 @@
+test_cbmc_options="-DASSERT_END"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail
new file mode 100644
index 000000000..ada5baf0b
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force.fail
@@ -0,0 +1 @@
+test_cbmc_options="-DFORCE_FAILURE"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail
new file mode 100644
index 000000000..8fe00c8db
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force2.fail
@@ -0,0 +1 @@
+test_cbmc_options="-DFORCE_FAILURE_2"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail
new file mode 100644
index 000000000..612ed6772
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/force3.fail
@@ -0,0 +1 @@
+test_cbmc_options="-DFORCE_FAILURE_3"
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/main.pass
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c
new file mode 100644
index 000000000..2ce2016f7
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/store_buffering/test.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <src/combined_source.c>
+
+int x;
+int y;
+
+int __unbuffered_tpr_x;
+int __unbuffered_tpr_y;
+
+DEFINE_SRCU(ss);
+
+void rcu_reader(void)
+{
+ int idx;
+
+#ifndef FORCE_FAILURE_3
+ idx = srcu_read_lock(&ss);
+#endif
+ might_sleep();
+
+ __unbuffered_tpr_y = READ_ONCE(y);
+#ifdef FORCE_FAILURE
+ srcu_read_unlock(&ss, idx);
+ idx = srcu_read_lock(&ss);
+#endif
+ WRITE_ONCE(x, 1);
+
+#ifndef FORCE_FAILURE_3
+ srcu_read_unlock(&ss, idx);
+#endif
+ might_sleep();
+}
+
+void *thread_update(void *arg)
+{
+ WRITE_ONCE(y, 1);
+#ifndef FORCE_FAILURE_2
+ synchronize_srcu(&ss);
+#endif
+ might_sleep();
+ __unbuffered_tpr_x = READ_ONCE(x);
+
+ return NULL;
+}
+
+void *thread_process_reader(void *arg)
+{
+ rcu_reader();
+
+ return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+ pthread_t tu;
+ pthread_t tpr;
+
+ if (pthread_create(&tu, NULL, thread_update, NULL))
+ abort();
+ if (pthread_create(&tpr, NULL, thread_process_reader, NULL))
+ abort();
+ if (pthread_join(tu, NULL))
+ abort();
+ if (pthread_join(tpr, NULL))
+ abort();
+ assert(__unbuffered_tpr_y != 0 || __unbuffered_tpr_x != 0);
+
+#ifdef ASSERT_END
+ assert(0);
+#endif
+
+ return 0;
+}
diff --git a/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh
new file mode 100755
index 000000000..2fe1f0339
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/formal/srcu-cbmc/tests/test_script.sh
@@ -0,0 +1,103 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+# This script expects a mode (either --should-pass or --should-fail) followed by
+# an input file. The script uses the following environment variables. The test C
+# source file is expected to be named test.c in the directory containing the
+# input file.
+#
+# CBMC: The command to run CBMC. Default: cbmc
+# CBMC_FLAGS: Additional flags to pass to CBMC
+# NR_CPUS: Number of cpus to run tests with. Default specified by the test
+# SYNC_SRCU_MODE: Choose implementation of synchronize_srcu. Defaults to simple.
+# kernel: Version included in the linux kernel source.
+# simple: Use try_check_zero directly.
+#
+# The input file is a script that is sourced by this file. It can define any of
+# the following variables to configure the test.
+#
+# test_cbmc_options: Extra options to pass to CBMC.
+# min_cpus_fail: Minimum number of CPUs (NR_CPUS) for verification to fail.
+# The test is expected to pass if it is run with fewer. (Only
+# useful for .fail files)
+# default_cpus: Quantity of CPUs to use for the test, if not specified via the
+# NR_CPUS environment variable. Default: Larger of 2 and min_cpus_fail.
+
+set -e
+
+if test "$#" -ne 2; then
+ echo "Expected one option followed by an input file" 1>&2
+ exit 99
+fi
+
+if test "x$1" = "x--should-pass"; then
+ should_pass="yes"
+elif test "x$1" = "x--should-fail"; then
+ should_pass="no"
+else
+ echo "Unrecognized argument '$1'" 1>&2
+
+ # Exit code 99 indicates a hard error.
+ exit 99
+fi
+
+CBMC=${CBMC:-cbmc}
+
+SYNC_SRCU_MODE=${SYNC_SRCU_MODE:-simple}
+
+case ${SYNC_SRCU_MODE} in
+kernel) sync_srcu_mode_flags="" ;;
+simple) sync_srcu_mode_flags="-DUSE_SIMPLE_SYNC_SRCU" ;;
+
+*)
+ echo "Unrecognized argument '${SYNC_SRCU_MODE}'" 1>&2
+ exit 99
+ ;;
+esac
+
+min_cpus_fail=1
+
+# The test source lives next to the input file.
+c_file=$(dirname "$2")/test.c
+
+# Source the input file. It may override min_cpus_fail and may set
+# default_cpus and test_cbmc_options. Quoted so that paths containing
+# whitespace or glob characters are sourced as a single file name.
+. "$2"
+
+if test "${min_cpus_fail}" -gt 2; then
+	default_default_cpus=${min_cpus_fail}
+else
+	default_default_cpus=2
+fi
+default_cpus=${default_cpus:-${default_default_cpus}}
+cpus=${NR_CPUS:-${default_cpus}}
+
+# Check if there are too few cpus to make the test fail.
+if test "${cpus}" -lt "${min_cpus_fail:-0}"; then
+	should_pass="yes"
+fi
+
+cbmc_opts="-DNR_CPUS=${cpus} ${sync_srcu_mode_flags} ${test_cbmc_options} ${CBMC_FLAGS}"
+
+echo "Running CBMC: ${CBMC} ${cbmc_opts} ${c_file}"
+if ${CBMC} ${cbmc_opts} "${c_file}"; then
+ # Verification successful. Make sure that it was supposed to verify.
+ test "x${should_pass}" = xyes
+else
+ cbmc_exit_status=$?
+
+ # An exit status of 10 indicates a failed verification.
+ # (see cbmc_parse_optionst::do_bmc in the CBMC source code)
+ if test ${cbmc_exit_status} -eq 10 && test "x${should_pass}" = xno; then
+ :
+ else
+ echo "CBMC returned ${cbmc_exit_status} exit status" 1>&2
+
+ # Parse errors have exit status 6. Any other type of error
+ # should be considered a hard error.
+ if test ${cbmc_exit_status} -ne 6 && \
+ test ${cbmc_exit_status} -ne 10; then
+ exit 99
+ else
+ exit 1
+ fi
+ fi
+fi
diff --git a/tools/testing/selftests/resctrl/Makefile b/tools/testing/selftests/resctrl/Makefile
new file mode 100644
index 000000000..6bcee2ec9
--- /dev/null
+++ b/tools/testing/selftests/resctrl/Makefile
@@ -0,0 +1,17 @@
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -g -Wall -O2 -D_FORTIFY_SOURCE=2
+SRCS=$(wildcard *.c)
+OBJS=$(SRCS:.c=.o)
+
+all: resctrl_tests
+
+$(OBJS): $(SRCS)
+ $(CC) $(CFLAGS) -c $(SRCS)
+
+resctrl_tests: $(OBJS)
+ $(CC) $(CFLAGS) -o $@ $^
+
+.PHONY: clean
+
+clean:
+ $(RM) $(OBJS) resctrl_tests
diff --git a/tools/testing/selftests/resctrl/README b/tools/testing/selftests/resctrl/README
new file mode 100644
index 000000000..20502cb47
--- /dev/null
+++ b/tools/testing/selftests/resctrl/README
@@ -0,0 +1,53 @@
+resctrl_tests - resctrl file system test suite
+
+Authors:
+ Fenghua Yu <fenghua.yu@intel.com>
+ Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
+
+resctrl_tests tests various resctrl functionalities and interfaces including
+both software and hardware.
+
+Currently it supports Memory Bandwidth Monitoring test and Memory Bandwidth
+Allocation test on Intel RDT hardware. More tests will be added in the future.
+And the test suite can be extended to cover AMD QoS and ARM MPAM hardware
+as well.
+
+BUILD
+-----
+
+Run "make" to build executable file "resctrl_tests".
+
+RUN
+---
+
+To use resctrl_tests, root or sudoer privileges are required. This is because
+the test needs to mount resctrl file system and change contents in the file
+system.
+
+Executing the test without any parameter will run all supported tests:
+
+ sudo ./resctrl_tests
+
+OVERVIEW OF EXECUTION
+---------------------
+
+A test case has four stages:
+
+ - setup: mount resctrl file system, create group, setup schemata, move test
+ process pids to tasks, start benchmark.
+ - execute: let benchmark run
+ - verify: get resctrl data and verify the data with another source, e.g.
+ perf event.
+ - teardown: umount resctrl and clear temporary files.
+
+ARGUMENTS
+---------
+
+Parameter '-h' shows usage information.
+
+usage: resctrl_tests [-h] [-b "benchmark_cmd [options]"] [-t test list] [-n no_of_bits]
+ -b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM default benchmark is builtin fill_buf
+ -t test list: run tests specified in the test list, e.g. -t mbm,mba,cqm,cat
+ -n no_of_bits: run cache tests using specified no of bits in cache bit mask
+ -p cpu_no: specify CPU number to run the test. 1 is default
+ -h: help
diff --git a/tools/testing/selftests/resctrl/cache.c b/tools/testing/selftests/resctrl/cache.c
new file mode 100644
index 000000000..b3c0e858c
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cache.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdint.h>
+#include "resctrl.h"
+
+/*
+ * Buffer layout used when read()ing the perf event fd. The event is
+ * opened with read_format = PERF_FORMAT_GROUP (see
+ * initialize_perf_event_attr()), so the data starts with the number of
+ * events followed by one value per event.
+ */
+struct read_format {
+ __u64 nr; /* The number of events */
+ struct {
+ __u64 value; /* The value of the event */
+ } values[2];
+};
+
+static struct perf_event_attr pea_llc_miss;
+static struct read_format rf_cqm;
+static int fd_lm;
+char llc_occup_path[1024];
+
+/*
+ * Fill in the generic part of pea_llc_miss: a hardware event read in
+ * group format, created disabled, inherited by children, and excluding
+ * kernel, hypervisor, idle, guest and kernel-callchain contexts.
+ * The event selector (.config) is set by the caller.
+ */
+static void initialize_perf_event_attr(void)
+{
+	struct perf_event_attr *attr = &pea_llc_miss;
+
+	attr->size = sizeof(*attr);
+	attr->type = PERF_TYPE_HARDWARE;
+	attr->read_format = PERF_FORMAT_GROUP;
+
+	/* Created disabled; enabled later through ioctl() */
+	attr->disabled = 1;
+	attr->inherit = 1;
+
+	/* Contexts that must not contribute to the count */
+	attr->exclude_kernel = 1;
+	attr->exclude_hv = 1;
+	attr->exclude_idle = 1;
+	attr->exclude_guest = 1;
+	attr->exclude_callchain_kernel = 1;
+}
+
+/*
+ * Reset the counter behind fd_lm to zero and then enable it. The ioctl()
+ * return values are not checked.
+ */
+static void ioctl_perf_event_ioc_reset_enable(void)
+{
+ ioctl(fd_lm, PERF_EVENT_IOC_RESET, 0);
+ ioctl(fd_lm, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+/*
+ * Open the LLC-miss perf event described by pea_llc_miss for @pid on
+ * @cpu_no and store the descriptor in fd_lm.
+ *
+ * Return: 0 on success. -1 on failure, after reporting the error and
+ * calling ctrlc_handler().
+ */
+static int perf_event_open_llc_miss(pid_t pid, int cpu_no)
+{
+	fd_lm = perf_event_open(&pea_llc_miss, pid, cpu_no, -1,
+				PERF_FLAG_FD_CLOEXEC);
+	if (fd_lm != -1)
+		return 0;
+
+	perror("Error opening leader");
+	ctrlc_handler(0, NULL, NULL);
+
+	return -1;
+}
+
+/*
+ * Zero the perf attribute and read buffer, then configure the attribute
+ * to count PERF_COUNT_HW_CACHE_MISSES with a single event in the group.
+ *
+ * Return: always 0.
+ */
+static int initialize_llc_perf(void)
+{
+	memset(&rf_cqm, 0, sizeof(rf_cqm));
+	memset(&pea_llc_miss, 0, sizeof(pea_llc_miss));
+
+	/* Initialize perf_event_attr structures for HW_CACHE_MISSES */
+	initialize_perf_event_attr();
+	pea_llc_miss.config = PERF_COUNT_HW_CACHE_MISSES;
+
+	rf_cqm.nr = 1;
+
+	return 0;
+}
+
+/*
+ * Open the LLC-miss event for @pid on @cpu_no, then reset and start the
+ * counter.
+ *
+ * Return: 0 on success. <0 if the event could not be opened.
+ */
+static int reset_enable_llc_perf(pid_t pid, int cpu_no)
+{
+	int err = perf_event_open_llc_miss(pid, cpu_no);
+
+	if (err < 0)
+		return err;
+
+	/* Start counters to log values */
+	ioctl_perf_event_ioc_reset_enable();
+
+	return 0;
+}
+
+/*
+ * get_llc_perf:	llc cache miss through perf events
+ * @llc_perf_miss:	output; receives the first counter value read from
+ *			the perf event fd (fd_lm)
+ *
+ * Perf events like HW_CACHE_MISSES could be used to validate number of
+ * cache lines allocated.
+ *
+ * Return: =0 on success. <0 on failure.
+ */
+static int get_llc_perf(unsigned long *llc_perf_miss)
+{
+	/* Stop counters after one span to get miss rate */
+	ioctl(fd_lm, PERF_EVENT_IOC_DISABLE, 0);
+
+	if (read(fd_lm, &rf_cqm, sizeof(rf_cqm)) == -1) {
+		perror("Could not get llc misses through perf");
+		return -1;
+	}
+
+	*llc_perf_miss = rf_cqm.values[0].value;
+
+	return 0;
+}
+
+/*
+ * Get LLC Occupancy as reported by RESCTRL FS
+ * For CQM,
+ * 1. If con_mon grp and mon grp given, then read from mon grp in
+ * con_mon grp
+ * 2. If only con_mon grp given, then read from con_mon grp
+ * 3. If both not given, then read from root con_mon grp
+ * For CAT,
+ * 1. If con_mon grp given, then read from it
+ * 2. If con_mon grp not given, then read from root con_mon grp
+ *
+ * The file that is read is the one named by the global llc_occup_path.
+ *
+ * @llc_occupancy: output; receives the value parsed from the file
+ *
+ * Return: =0 on success. <0 on failure.
+ */
+static int get_llc_occu_resctrl(unsigned long *llc_occupancy)
+{
+	FILE *fp;
+
+	fp = fopen(llc_occup_path, "r");
+	if (!fp) {
+		perror("Failed to open results file");
+
+		/*
+		 * Must be negative: the caller only treats ret < 0 as an
+		 * error, so returning a (positive) errno value here would
+		 * make the failure go unnoticed.
+		 */
+		return -1;
+	}
+	if (fscanf(fp, "%lu", llc_occupancy) <= 0) {
+		perror("Could not get llc occupancy");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * print_results_cache:	report one cache measurement
+ * @filename:		file that stores the results; the names "stdio" and
+ *			"stderr" both select standard output instead
+ * @bm_pid:		child pid that runs benchmark
+ * @llc_value:		perf miss value /
+ *			llc occupancy value reported by resctrl FS
+ *
+ * When a real file is used the record is appended to it.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+static int print_results_cache(char *filename, int bm_pid,
+			       unsigned long llc_value)
+{
+	FILE *out;
+
+	if (!strcmp(filename, "stdio") || !strcmp(filename, "stderr")) {
+		printf("Pid: %d \t LLC_value: %lu\n", bm_pid,
+		       llc_value);
+		return 0;
+	}
+
+	out = fopen(filename, "a");
+	if (!out) {
+		perror("Cannot open results file");
+
+		return errno;
+	}
+
+	fprintf(out, "Pid: %d \t llc_value: %lu\n", bm_pid, llc_value);
+	fclose(out);
+
+	return 0;
+}
+
+/*
+ * measure_cache_vals:	take one cache measurement and record it
+ * @param:		test parameters; resctrl_val selects the source
+ *			(CAT: perf cache misses, CQM: resctrl llc occupancy)
+ *			and filename selects where the result is written
+ * @bm_pid:		pid reported alongside the value
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int measure_cache_vals(struct resctrl_val_param *param, int bm_pid)
+{
+	unsigned long llc_value = 0;
+	int ret;
+
+	/* Measure cache miss from perf. */
+	if (!strncmp(param->resctrl_val, CAT_STR, sizeof(CAT_STR))) {
+		unsigned long misses = 0;
+
+		ret = get_llc_perf(&misses);
+		if (ret < 0)
+			return ret;
+		llc_value = misses;
+	}
+
+	/* Measure llc occupancy from resctrl. */
+	if (!strncmp(param->resctrl_val, CQM_STR, sizeof(CQM_STR))) {
+		unsigned long occupancy = 0;
+
+		ret = get_llc_occu_resctrl(&occupancy);
+		if (ret < 0)
+			return ret;
+		llc_value = occupancy;
+	}
+
+	return print_results_cache(param->filename, bm_pid, llc_value);
+}
+
+/*
+ * cat_val: execute benchmark and measure LLC occupancy resctrl
+ * and perf cache miss for the benchmark
+ * @param: parameters passed to cat_val()
+ *
+ * The calling process itself is the benchmark: it is bound to
+ * param->cpu_no, written into the resctrl group(s), and then runs
+ * run_fill_buf() once per loop iteration. The measurement loop body only
+ * does work when param->resctrl_val is CAT_STR.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int cat_val(struct resctrl_val_param *param)
+{
+ int malloc_and_init_memory = 1, memflush = 1, operation = 0, ret = 0;
+ char *resctrl_val = param->resctrl_val;
+ pid_t bm_pid;
+
+ /* An empty filename means "print results to standard output" */
+ if (strcmp(param->filename, "") == 0)
+ sprintf(param->filename, "stdio");
+
+ bm_pid = getpid();
+
+ /* Taskset benchmark to specified cpu */
+ ret = taskset_benchmark(bm_pid, param->cpu_no);
+ if (ret)
+ return ret;
+
+ /* Write benchmark to specified con_mon grp, mon_grp in resctrl FS*/
+ ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
+ resctrl_val);
+ if (ret)
+ return ret;
+
+ if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) {
+ ret = initialize_llc_perf();
+ if (ret)
+ return ret;
+ }
+
+ /* Test runs until the callback setup() tells the test to stop. */
+ while (1) {
+ if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR))) {
+ ret = param->setup(1, param);
+ if (ret) {
+ /* A non-zero setup() result ends the loop, not the test */
+ ret = 0;
+ break;
+ }
+ /* Opens a fresh perf fd (fd_lm) for this iteration */
+ ret = reset_enable_llc_perf(bm_pid, param->cpu_no);
+ if (ret)
+ break;
+
+ if (run_fill_buf(param->span, malloc_and_init_memory,
+ memflush, operation, resctrl_val)) {
+ fprintf(stderr, "Error-running fill buffer\n");
+ ret = -1;
+ goto pe_close;
+ }
+
+ sleep(1);
+ ret = measure_cache_vals(param, bm_pid);
+ if (ret)
+ goto pe_close;
+
+ /* Done with this iteration's perf fd */
+ close(fd_lm);
+ } else {
+ break;
+ }
+ }
+
+ return ret;
+
+ /* Error exit taken while the perf fd is still open */
+pe_close:
+ close(fd_lm);
+ return ret;
+}
diff --git a/tools/testing/selftests/resctrl/cat_test.c b/tools/testing/selftests/resctrl/cat_test.c
new file mode 100644
index 000000000..20823725d
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cat_test.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cache Allocation Technology (CAT) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+#include <unistd.h>
+
+#define RESULT_FILE_NAME1 "result_cat1"
+#define RESULT_FILE_NAME2 "result_cat2"
+#define NUM_OF_RUNS 5
+#define MAX_DIFF_PERCENT 4
+#define MAX_DIFF 1000000
+
+static int count_of_bits;
+static char cbm_mask[256];
+static unsigned long long_mask;
+static unsigned long cache_size;
+
+/*
+ * cat_setup: setup callback for the CAT test
+ * @num: anchor for the variadic list
+ * @...: a single struct resctrl_val_param pointer
+ *
+ * On the first run the mask from the parameters is written (in hex) as
+ * the schemata of the control group. Every accepted call bumps
+ * num_of_runs; once NUM_OF_RUNS runs were done the callback returns -1
+ * so that averages are taken over NUM_OF_RUNS measurements.
+ *
+ * Return: 0 to continue, -1 when all runs are done, otherwise the
+ * result of write_schemata().
+ */
+static int cat_setup(int num, ...)
+{
+	struct resctrl_val_param *p;
+	char schemata[64];
+	va_list ap;
+	int first_run, ret = 0;
+
+	va_start(ap, num);
+	p = va_arg(ap, struct resctrl_val_param *);
+	va_end(ap);
+
+	/* Run NUM_OF_RUNS times */
+	if (p->num_of_runs >= NUM_OF_RUNS)
+		return -1;
+
+	first_run = !p->num_of_runs;
+	p->num_of_runs++;
+
+	if (first_run) {
+		sprintf(schemata, "%lx", p->mask);
+		ret = write_schemata(p->ctrlgrp, schemata, p->cpu_no,
+				     p->resctrl_val);
+	}
+
+	return ret;
+}
+
+/*
+ * show_cache_info: print the CAT pass/fail line and supporting numbers
+ * @sum_llc_perf_miss: sum of the perf miss values of the counted runs
+ * @no_of_bits: number of mask bits, printed for information only
+ * @span: buffer size in bytes; span / 64 is used as the number of
+ * allocated cache lines
+ *
+ * The average is taken over NUM_OF_RUNS - 1 runs. The result is
+ * "not ok" only when not running on AMD and the percentage difference
+ * exceeds MAX_DIFF_PERCENT. Increments tests_run.
+ */
+static void show_cache_info(unsigned long sum_llc_perf_miss, int no_of_bits,
+ unsigned long span)
+{
+ unsigned long allocated_cache_lines = span / 64;
+ unsigned long avg_llc_perf_miss = 0;
+ float diff_percent;
+
+ avg_llc_perf_miss = sum_llc_perf_miss / (NUM_OF_RUNS - 1);
+ diff_percent = ((float)allocated_cache_lines - avg_llc_perf_miss) /
+ allocated_cache_lines * 100;
+
+ printf("%sok CAT: cache miss rate within %d%%\n",
+ !is_amd && abs((int)diff_percent) > MAX_DIFF_PERCENT ?
+ "not " : "", MAX_DIFF_PERCENT);
+ tests_run++;
+ printf("# Percent diff=%d\n", abs((int)diff_percent));
+ printf("# Number of bits: %d\n", no_of_bits);
+ printf("# Avg_llc_perf_miss: %lu\n", avg_llc_perf_miss);
+ printf("# Allocated cache lines: %lu\n", allocated_cache_lines);
+}
+
+/*
+ * check_results: parse the CAT result file and report pass/fail
+ * @param: test parameters; filename is the result file, mask and span
+ *	   are forwarded to the report
+ *
+ * Each line is split on ':' and tab; field 3 is the perf miss value.
+ * Lines with fewer than four fields are ignored instead of being
+ * dereferenced, and no more fields are stored than token_array holds.
+ *
+ * Return: 0 on success. non-zero if the file cannot be opened.
+ */
+static int check_results(struct resctrl_val_param *param)
+{
+	char *token_array[8], temp[512];
+	const int max_fields = sizeof(token_array) / sizeof(token_array[0]);
+	unsigned long sum_llc_perf_miss = 0;
+	int runs = 0, no_of_bits = 0;
+	FILE *fp;
+
+	printf("# Checking for pass/fail\n");
+	fp = fopen(param->filename, "r");
+	if (!fp) {
+		perror("# Cannot open file");
+
+		return errno;
+	}
+
+	while (fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token && fields < max_fields) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Skip blank or malformed lines: field 3 must exist */
+		if (fields < 4)
+			continue;
+
+		/*
+		 * Discard the first value which is inaccurate due to monitoring
+		 * setup transition phase.
+		 */
+		if (runs > 0)
+			sum_llc_perf_miss += strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+
+	fclose(fp);
+	no_of_bits = count_bits(param->mask);
+
+	show_cache_info(sum_llc_perf_miss, no_of_bits, param->span);
+
+	return 0;
+}
+
+/* Remove both CAT result files; errors from remove() are ignored. */
+void cat_test_cleanup(void)
+{
+ remove(RESULT_FILE_NAME1);
+ remove(RESULT_FILE_NAME2);
+}
+
+/*
+ * cat_perf_miss_val:	run the CAT test
+ * @cpu_no:		CPU the parent benchmark runs on
+ * @n:			number of cache bit mask bits given to the child;
+ *			must be in the range 1 .. (bits in default mask - 1)
+ * @cache_type:	cache name handed to get_cbm_mask()/get_cache_size()
+ *
+ * The default bit mask is split in two: the forked child runs on a
+ * sibling CPU in group c1/m1 with n bits, the parent runs in group
+ * c2/m2 with the remaining bits. Both run cat_val() and check their own
+ * result file. The child then signals the parent over a pipe and spins
+ * until the parent kills it.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int cat_perf_miss_val(int cpu_no, int n, char *cache_type)
+{
+	unsigned long l_mask, l_mask_1;
+	int ret, pipefd[2], sibling_cpu_no;
+	char pipe_message;
+	pid_t bm_pid;
+
+	cache_size = 0;
+
+	ret = remount_resctrlfs(true);
+	if (ret)
+		return ret;
+
+	if (!validate_resctrl_feature_request("cat"))
+		return -1;
+
+	/* Get default cbm mask for L3/L2 cache */
+	ret = get_cbm_mask(cache_type, cbm_mask);
+	if (ret)
+		return ret;
+
+	long_mask = strtoul(cbm_mask, NULL, 16);
+
+	/* Get L3/L2 cache size */
+	ret = get_cache_size(cpu_no, cache_type, &cache_size);
+	if (ret)
+		return ret;
+	printf("cache size :%lu\n", cache_size);
+
+	/* Get max number of bits from default cbm mask */
+	count_of_bits = count_bits(long_mask);
+
+	if (n < 1 || n > count_of_bits - 1) {
+		printf("Invalid input value for no_of_bits n!\n");
+		printf("Please Enter value in range 1 to %d\n",
+		       count_of_bits - 1);
+		return -1;
+	}
+
+	/* Get core id from same socket for running another thread */
+	sibling_cpu_no = get_core_sibling(cpu_no);
+	if (sibling_cpu_no < 0)
+		return -1;
+
+	struct resctrl_val_param param = {
+		.resctrl_val	= CAT_STR,
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 0,
+		.setup		= cat_setup,
+	};
+
+	l_mask = long_mask >> n;
+	l_mask_1 = ~l_mask & long_mask;
+
+	/* Set param values for parent thread which will be allocated bitmask
+	 * with (max_bits - n) bits
+	 */
+	param.span = cache_size * (count_of_bits - n) / count_of_bits;
+	strcpy(param.ctrlgrp, "c2");
+	strcpy(param.mongrp, "m2");
+	strcpy(param.filename, RESULT_FILE_NAME2);
+	param.mask = l_mask;
+	param.num_of_runs = 0;
+
+	if (pipe(pipefd)) {
+		perror("# Unable to create pipe");
+		return errno;
+	}
+
+	bm_pid = fork();
+	if (bm_pid == -1) {
+		/*
+		 * Must not fall through as "parent": the cleanup below
+		 * would end up calling kill(-1, SIGKILL), which signals
+		 * every process the caller is allowed to signal.
+		 */
+		perror("# Unable to fork");
+		close(pipefd[0]);
+		close(pipefd[1]);
+		return -1;
+	}
+
+	/* Set param values for child thread which will be allocated bitmask
+	 * with n bits
+	 */
+	if (bm_pid == 0) {
+		param.mask = l_mask_1;
+		strcpy(param.ctrlgrp, "c1");
+		strcpy(param.mongrp, "m1");
+		param.span = cache_size * n / count_of_bits;
+		strcpy(param.filename, RESULT_FILE_NAME1);
+		param.num_of_runs = 0;
+		param.cpu_no = sibling_cpu_no;
+	}
+
+	remove(param.filename);
+
+	ret = cat_val(&param);
+	if (ret)
+		return ret;
+
+	ret = check_results(&param);
+	if (ret)
+		return ret;
+
+	if (bm_pid == 0) {
+		/* Tell parent that child is ready */
+		close(pipefd[0]);
+		pipe_message = 1;
+		/*
+		 * Compare as signed: write() returns -1 on error, which a
+		 * comparison against the unsigned sizeof() would turn into
+		 * a huge positive value and never detect.
+		 */
+		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) !=
+		    (ssize_t)sizeof(pipe_message)) {
+			perror("# failed signaling parent process");
+			close(pipefd[1]);
+			return -1;
+		}
+
+		close(pipefd[1]);
+		/* Wait here until the parent sends SIGKILL */
+		while (1)
+			;
+	} else {
+		/* Parent waits for child to be ready. */
+		close(pipefd[1]);
+		pipe_message = 0;
+		while (pipe_message != 1) {
+			/* Signed compare so that -1 (error) and 0 (EOF) both stop */
+			if (read(pipefd[0], &pipe_message,
+				 sizeof(pipe_message)) !=
+			    (ssize_t)sizeof(pipe_message)) {
+				perror("# failed reading from child process");
+				break;
+			}
+		}
+		close(pipefd[0]);
+		kill(bm_pid, SIGKILL);
+	}
+
+	cat_test_cleanup();
+	if (bm_pid)
+		umount_resctrlfs();
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/cqm_test.c b/tools/testing/selftests/resctrl/cqm_test.c
new file mode 100644
index 000000000..271752e9e
--- /dev/null
+++ b/tools/testing/selftests/resctrl/cqm_test.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Cache Monitoring Technology (CQM) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+#include <unistd.h>
+
+#define RESULT_FILE_NAME "result_cqm"
+#define NUM_OF_RUNS 5
+#define MAX_DIFF 2000000
+#define MAX_DIFF_PERCENT 15
+
+static int count_of_bits;
+static char cbm_mask[256];
+static unsigned long long_mask;
+static unsigned long cache_size;
+
+/*
+ * cqm_setup: setup callback for the CQM test
+ * @num: anchor for the variadic list
+ * @...: a single struct resctrl_val_param pointer
+ *
+ * Counts the runs in the parameter block.
+ *
+ * Return: 0 while fewer than NUM_OF_RUNS runs were started, -1 after.
+ */
+static int cqm_setup(int num, ...)
+{
+	struct resctrl_val_param *p;
+	va_list ap;
+
+	va_start(ap, num);
+	p = va_arg(ap, struct resctrl_val_param *);
+	va_end(ap);
+
+	/* Run NUM_OF_RUNS times */
+	if (p->num_of_runs < NUM_OF_RUNS) {
+		p->num_of_runs++;
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * show_cache_info: print the CQM pass/fail line and supporting numbers
+ * @sum_llc_occu_resc:	sum of the llc occupancy values of the counted runs
+ * @no_of_bits:	number of mask bits, printed for information only
+ * @span:		expected llc occupancy in bytes
+ *
+ * The average is taken over NUM_OF_RUNS - 1 runs. The test passes when
+ * either the percentage difference is within MAX_DIFF_PERCENT or the
+ * absolute difference is within MAX_DIFF. Increments tests_run.
+ */
+static void show_cache_info(unsigned long sum_llc_occu_resc, int no_of_bits,
+			    unsigned long span)
+{
+	unsigned long avg_llc_occu_resc = 0;
+	float diff_percent;
+	long avg_diff = 0;
+	bool res;
+
+	avg_llc_occu_resc = sum_llc_occu_resc / (NUM_OF_RUNS - 1);
+	/* labs() on longs: abs() would truncate the difference to int */
+	avg_diff = labs((long)span - (long)avg_llc_occu_resc);
+
+	diff_percent = (((float)span - avg_llc_occu_resc) / span) * 100;
+
+	res = abs((int)diff_percent) <= MAX_DIFF_PERCENT ||
+	      avg_diff <= MAX_DIFF;
+
+	/* "not " needs the trailing blank so that the line reads "not ok" */
+	printf("%sok CQM: diff within %d, %d%%\n", res ? "" : "not ",
+	       MAX_DIFF, (int)MAX_DIFF_PERCENT);
+
+	printf("# diff: %ld\n", avg_diff);
+	printf("# percent diff=%d\n", abs((int)diff_percent));
+	printf("# Results are displayed in (Bytes)\n");
+	printf("# Number of bits: %d\n", no_of_bits);
+	printf("# Avg_llc_occu_resc: %lu\n", avg_llc_occu_resc);
+	printf("# llc_occu_exp (span): %lu\n", span);
+
+	tests_run++;
+}
+
+/*
+ * check_results: parse the CQM result file and report pass/fail
+ * @param:	test parameters; filename is the result file, span is the
+ *		expected occupancy
+ * @no_of_bits: number of mask bits, forwarded to the report
+ *
+ * Each line is split on ':' and tab; field 3 is the llc occupancy
+ * value. The first counted line is discarded. Lines with fewer than four
+ * fields are ignored instead of being dereferenced, and no more fields
+ * are stored than token_array holds.
+ *
+ * Return: 0 on success. non-zero if the file cannot be opened.
+ */
+static int check_results(struct resctrl_val_param *param, int no_of_bits)
+{
+	char *token_array[8], temp[512];
+	const int max_fields = sizeof(token_array) / sizeof(token_array[0]);
+	unsigned long sum_llc_occu_resc = 0;
+	int runs = 0;
+	FILE *fp;
+
+	printf("# checking for pass/fail\n");
+	fp = fopen(param->filename, "r");
+	if (!fp) {
+		/* No "\n" here: perror() appends ": <reason>\n" itself */
+		perror("# Error in opening file");
+
+		return errno;
+	}
+
+	while (fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token && fields < max_fields) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Skip blank or malformed lines: field 3 must exist */
+		if (fields < 4)
+			continue;
+
+		/* Field 3 is llc occ resc value */
+		if (runs > 0)
+			sum_llc_occu_resc += strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+	fclose(fp);
+	show_cache_info(sum_llc_occu_resc, no_of_bits, param->span);
+
+	return 0;
+}
+
+/* Remove the CQM result file; an error from remove() is ignored. */
+void cqm_test_cleanup(void)
+{
+ remove(RESULT_FILE_NAME);
+}
+
+/*
+ * cqm_resctrl_val: run the CQM test
+ * @cpu_no: CPU the benchmark is bound to
+ * @n: number of cache bit mask bits to use; must be in the range
+ * 1 .. number of bits set in the default L3 mask
+ * @benchmark_cmd: benchmark command; when it is the builtin "fill_buf",
+ * its first argument is overwritten with the computed span
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd)
+{
+ int ret, mum_resctrlfs;
+
+ cache_size = 0;
+ mum_resctrlfs = 1;
+
+ ret = remount_resctrlfs(mum_resctrlfs);
+ if (ret)
+ return ret;
+
+ if (!validate_resctrl_feature_request("cqm"))
+ return -1;
+
+ /* Default L3 bit mask, read as a hex string */
+ ret = get_cbm_mask("L3", cbm_mask);
+ if (ret)
+ return ret;
+
+ long_mask = strtoul(cbm_mask, NULL, 16);
+
+ ret = get_cache_size(cpu_no, "L3", &cache_size);
+ if (ret)
+ return ret;
+ printf("cache size :%lu\n", cache_size);
+
+ count_of_bits = count_bits(long_mask);
+
+ if (n < 1 || n > count_of_bits) {
+ printf("Invalid input value for numbr_of_bits n!\n");
+ printf("Please Enter value in range 1 to %d\n", count_of_bits);
+ return -1;
+ }
+
+ /*
+ * The span is the share of the cache that corresponds to n out of
+ * count_of_bits mask bits.
+ */
+ struct resctrl_val_param param = {
+ .resctrl_val = CQM_STR,
+ .ctrlgrp = "c1",
+ .mongrp = "m1",
+ .cpu_no = cpu_no,
+ .mum_resctrlfs = 0,
+ .filename = RESULT_FILE_NAME,
+ .mask = ~(long_mask << n) & long_mask,
+ .span = cache_size * n / count_of_bits,
+ .num_of_runs = 0,
+ .setup = cqm_setup,
+ };
+
+ if (strcmp(benchmark_cmd[0], "fill_buf") == 0)
+ sprintf(benchmark_cmd[1], "%lu", param.span);
+
+ /* Start from an empty result file */
+ remove(RESULT_FILE_NAME);
+
+ ret = resctrl_val(benchmark_cmd, &param);
+ if (ret)
+ return ret;
+
+ ret = check_results(&param, n);
+ if (ret)
+ return ret;
+
+ cqm_test_cleanup();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/resctrl/fill_buf.c b/tools/testing/selftests/resctrl/fill_buf.c
new file mode 100644
index 000000000..ab1d91328
--- /dev/null
+++ b/tools/testing/selftests/resctrl/fill_buf.c
@@ -0,0 +1,218 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fill_buf benchmark
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <inttypes.h>
+#include <malloc.h>
+#include <string.h>
+
+#include "resctrl.h"
+
+#define CL_SIZE (64)
+#define PAGE_SIZE (4 * 1024)
+#define MB (1024 * 1024)
+
+static unsigned char *startptr;
+
+/*
+ * Store barrier: issues an sfence instruction on x86 (32 and 64 bit).
+ * Compiles to an empty function on other architectures.
+ */
+static void sb(void)
+{
+#if defined(__i386) || defined(__x86_64)
+ asm volatile("sfence\n\t"
+ : : : "memory");
+#endif
+}
+
+/*
+ * Signal handler installed by run_fill_buf() for SIGINT and SIGHUP:
+ * frees the benchmark buffer (startptr), prints a message and exits
+ * successfully.
+ *
+ * NOTE(review): free() and printf() are not on the POSIX list of
+ * async-signal-safe functions - confirm this is acceptable here.
+ */
+static void ctrl_handler(int signo)
+{
+ free(startptr);
+ printf("\nEnding\n");
+ sb();
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * Flush the cache line containing @p using clflush on x86 (32 and 64
+ * bit). Compiles to an empty function on other architectures.
+ */
+static void cl_flush(void *p)
+{
+#if defined(__i386) || defined(__x86_64)
+ asm volatile("clflush (%0)\n\t"
+ : : "r"(p) : "memory");
+#endif
+}
+
+/*
+ * Flush every complete CL_SIZE sized line of the @s bytes starting at
+ * @p, then issue a store barrier.
+ */
+static void mem_flush(void *p, size_t s)
+{
+	char *line = p;
+	size_t nr_lines = s / CL_SIZE;	/* mem size in cache lines */
+
+	while (nr_lines--) {
+		cl_flush(line);
+		line += CL_SIZE;
+	}
+
+	sb();
+}
+
+/*
+ * malloc_and_init_memory: allocate a page aligned buffer and touch it
+ * @s: size of the buffer in bytes
+ *
+ * One 64 bit word per CL_SIZE bytes is set to a rand() value.
+ *
+ * The loop is index based so that a size which is not a multiple of
+ * CL_SIZE terminates at the end of the buffer; counting an unsigned
+ * remainder down in CL_SIZE steps would wrap around and keep writing
+ * past the allocation.
+ *
+ * Return: the buffer (to be released with free()), or NULL if the
+ * allocation failed.
+ */
+static void *malloc_and_init_memory(size_t s)
+{
+	const size_t stride = CL_SIZE / sizeof(uint64_t);
+	uint64_t *p64;
+	size_t s64, i;
+
+	void *p = memalign(PAGE_SIZE, s);
+	if (!p)
+		return NULL;
+
+	p64 = (uint64_t *)p;
+	s64 = s / sizeof(uint64_t);
+
+	for (i = 0; i < s64; i += stride)
+		p64[i] = (uint64_t)rand();
+
+	return p;
+}
+
+/*
+ * Read one byte every CL_SIZE / 2 bytes from @start_ptr up to (not
+ * including) @end_ptr.
+ *
+ * Return: the sum of the bytes read, accumulated in an unsigned char.
+ */
+static int fill_one_span_read(unsigned char *start_ptr, unsigned char *end_ptr)
+{
+	unsigned char acc = 0;
+	unsigned char *cur;
+
+	for (cur = start_ptr; cur < end_ptr; cur += (CL_SIZE / 2))
+		acc += *cur;
+
+	return acc;
+}
+
+/*
+ * Write the character '1' every CL_SIZE / 2 bytes from @start_ptr up to
+ * (not including) @end_ptr.
+ */
+static
+void fill_one_span_write(unsigned char *start_ptr, unsigned char *end_ptr)
+{
+	unsigned char *cur;
+
+	for (cur = start_ptr; cur < end_ptr; cur += (CL_SIZE / 2))
+		*cur = '1';
+}
+
+/*
+ * fill_cache_read: read the buffer span by span
+ * @start_ptr:	first byte of the buffer
+ * @end_ptr:	one past the last byte of the buffer
+ * @resctrl_val: test name; for CAT_STR a single span is read, for any
+ *		other value the loop never ends on its own
+ *
+ * Return: 0 on success, -1 if /dev/null cannot be opened.
+ */
+static int fill_cache_read(unsigned char *start_ptr, unsigned char *end_ptr,
+			   char *resctrl_val)
+{
+	int sum = 0;
+	FILE *sink;
+
+	do {
+		sum = fill_one_span_read(start_ptr, end_ptr);
+	} while (strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)));
+
+	/* Consume read result so that reading memory is not optimized out. */
+	sink = fopen("/dev/null", "w");
+	if (!sink) {
+		perror("Unable to write to /dev/null");
+		return -1;
+	}
+	fprintf(sink, "Sum: %d ", sum);
+	fclose(sink);
+
+	return 0;
+}
+
+/*
+ * fill_cache_write: write the buffer span by span
+ * @start_ptr:	first byte of the buffer
+ * @end_ptr:	one past the last byte of the buffer
+ * @resctrl_val: test name; for CAT_STR a single span is written, for any
+ *		other value the loop never ends on its own
+ *
+ * Return: always 0.
+ */
+static int fill_cache_write(unsigned char *start_ptr, unsigned char *end_ptr,
+			    char *resctrl_val)
+{
+	do {
+		fill_one_span_write(start_ptr, end_ptr);
+	} while (strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)));
+
+	return 0;
+}
+
+/*
+ * fill_cache: allocate a buffer and run the read or write benchmark on it
+ * @buf_size: size of the buffer in bytes
+ * @malloc_and_init: non-zero to use malloc_and_init_memory(), zero to use
+ * plain malloc() followed by filling every byte with rand()
+ * @memflush: non-zero to flush the buffer from the cache before use
+ * @op: 0 selects the read benchmark, anything else the write one
+ * @resctrl_val: test name, forwarded to the benchmark loop
+ *
+ * The buffer address is also stored in the global startptr, which the
+ * signal handler frees.
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+static int
+fill_cache(unsigned long long buf_size, int malloc_and_init, int memflush,
+ int op, char *resctrl_val)
+{
+ unsigned char *start_ptr, *end_ptr;
+ unsigned long long i;
+ int ret;
+
+ if (malloc_and_init)
+ start_ptr = malloc_and_init_memory(buf_size);
+ else
+ start_ptr = malloc(buf_size);
+
+ if (!start_ptr)
+ return -1;
+
+ startptr = start_ptr;
+ end_ptr = start_ptr + buf_size;
+
+ /*
+ * It's better to touch the memory once to avoid any compiler
+ * optimizations
+ */
+ if (!malloc_and_init) {
+ for (i = 0; i < buf_size; i++)
+ *start_ptr++ = (unsigned char)rand();
+ }
+
+ /* The fill loop above advanced start_ptr; rewind to the buffer start */
+ start_ptr = startptr;
+
+ /* Flush the memory before using to avoid "cache hot pages" effect */
+ if (memflush)
+ mem_flush(start_ptr, buf_size);
+
+ if (op == 0)
+ ret = fill_cache_read(start_ptr, end_ptr, resctrl_val);
+ else
+ ret = fill_cache_write(start_ptr, end_ptr, resctrl_val);
+
+ free(startptr);
+
+ if (ret) {
+ printf("\n Error in fill cache read/write...\n");
+ return -1;
+ }
+
+
+ return 0;
+}
+
+/*
+ * run_fill_buf: entry point of the builtin fill_buf benchmark
+ * @span:			buffer size in bytes
+ * @malloc_and_init_memory:	forwarded to fill_cache()
+ * @memflush:			forwarded to fill_cache()
+ * @op:			forwarded to fill_cache()
+ * @resctrl_val:		forwarded to fill_cache()
+ *
+ * Installs ctrl_handler for SIGINT and SIGHUP before running; a failure
+ * to install a handler is only reported.
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+int run_fill_buf(unsigned long span, int malloc_and_init_memory,
+		 int memflush, int op, char *resctrl_val)
+{
+	/* set up ctrl-c handler */
+	if (signal(SIGINT, ctrl_handler) == SIG_ERR)
+		printf("Failed to catch SIGINT!\n");
+	if (signal(SIGHUP, ctrl_handler) == SIG_ERR)
+		printf("Failed to catch SIGHUP!\n");
+
+	if (fill_cache(span, malloc_and_init_memory, memflush, op,
+		       resctrl_val)) {
+		printf("\n Error in fill cache\n");
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/mba_test.c b/tools/testing/selftests/resctrl/mba_test.c
new file mode 100644
index 000000000..6cfddd1d4
--- /dev/null
+++ b/tools/testing/selftests/resctrl/mba_test.c
@@ -0,0 +1,176 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Bandwidth Allocation (MBA) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define RESULT_FILE_NAME "result_mba"
+#define NUM_OF_RUNS 5
+#define MAX_DIFF 300
+#define ALLOCATION_MAX 100
+#define ALLOCATION_MIN 10
+#define ALLOCATION_STEP 10
+
+/*
+ * Change schemata percentage from 100 to 10%. Write schemata to specified
+ * con_mon grp, mon_grp in resctrl FS.
+ * For each allocation, run 5 times in order to get average values.
+ *
+ * The progress is kept in function-static variables, so it persists
+ * across calls and is never reset.
+ *
+ * Return: 0 to continue, -1 once the allocation left the
+ * ALLOCATION_MIN..ALLOCATION_MAX range, <0 if write_schemata() failed.
+ */
+static int mba_setup(int num, ...)
+{
+ static int runs_per_allocation, allocation = 100;
+ struct resctrl_val_param *p;
+ char allocation_str[64];
+ va_list param;
+ int ret;
+
+ va_start(param, num);
+ p = va_arg(param, struct resctrl_val_param *);
+ va_end(param);
+
+ /* Start a new group of NUM_OF_RUNS runs */
+ if (runs_per_allocation >= NUM_OF_RUNS)
+ runs_per_allocation = 0;
+
+ /* Only set up schemata once every NUM_OF_RUNS of allocations */
+ if (runs_per_allocation++ != 0)
+ return 0;
+
+ if (allocation < ALLOCATION_MIN || allocation > ALLOCATION_MAX)
+ return -1;
+
+ sprintf(allocation_str, "%d", allocation);
+
+ ret = write_schemata(p->ctrlgrp, allocation_str, p->cpu_no,
+ p->resctrl_val);
+ if (ret < 0)
+ return ret;
+
+ /* Next group of runs uses the next lower percentage */
+ allocation -= ALLOCATION_STEP;
+
+ return 0;
+}
+
+/*
+ * show_mba_info: print pass/fail for every allocation step
+ * @bw_imc: per-run values parsed from field 3 of the result file
+ * @bw_resc: per-run values parsed from field 5 of the result file
+ *
+ * Both arrays are read in groups of NUM_OF_RUNS consecutive entries, one
+ * group per allocation step, for ALLOCATION_MAX / ALLOCATION_STEP
+ * groups. A step fails when the averages differ by more than MAX_DIFF.
+ * Increments tests_run once per step and once for the summary line.
+ */
+static void show_mba_info(unsigned long *bw_imc, unsigned long *bw_resc)
+{
+ int allocation, runs;
+ bool failed = false;
+
+ printf("# Results are displayed in (MB)\n");
+ /* Memory bandwidth from 100% down to 10% */
+ for (allocation = 0; allocation < ALLOCATION_MAX / ALLOCATION_STEP;
+ allocation++) {
+ unsigned long avg_bw_imc, avg_bw_resc;
+ unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+ unsigned long avg_diff;
+
+ /*
+ * The first run is discarded due to inaccurate value from
+ * phase transition.
+ */
+ for (runs = NUM_OF_RUNS * allocation + 1;
+ runs < NUM_OF_RUNS * allocation + NUM_OF_RUNS ; runs++) {
+ sum_bw_imc += bw_imc[runs];
+ sum_bw_resc += bw_resc[runs];
+ }
+
+ avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1);
+ avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1);
+ avg_diff = labs((long)(avg_bw_resc - avg_bw_imc));
+
+ printf("%sok MBA schemata percentage %u smaller than %d %%\n",
+ avg_diff > MAX_DIFF ? "not " : "",
+ ALLOCATION_MAX - ALLOCATION_STEP * allocation,
+ MAX_DIFF);
+ tests_run++;
+ printf("# avg_diff: %lu\n", avg_diff);
+ printf("# avg_bw_imc: %lu\n", avg_bw_imc);
+ printf("# avg_bw_resc: %lu\n", avg_bw_resc);
+ if (avg_diff > MAX_DIFF)
+ failed = true;
+ }
+
+ printf("%sok schemata change using MBA%s\n", failed ? "not " : "",
+ failed ? " # at least one test failed" : "");
+ tests_run++;
+}
+
+/*
+ * check_results: parse the MBA result file and report pass/fail
+ *
+ * Each line is split on ':' and tab; field 3 is the perf imc value and
+ * field 5 the resctrl value. Parsing stops when the arrays are full,
+ * lines with fewer than six fields are ignored, and the arrays are zero
+ * initialised so that show_mba_info() never reads indeterminate values
+ * when the file holds fewer lines than expected.
+ *
+ * Return: 0 on success. non-zero if the file cannot be opened.
+ */
+static int check_results(void)
+{
+	char *token_array[8], output[] = RESULT_FILE_NAME, temp[512];
+	unsigned long bw_imc[1024] = { 0 }, bw_resc[1024] = { 0 };
+	const int max_fields = sizeof(token_array) / sizeof(token_array[0]);
+	const int max_runs = sizeof(bw_imc) / sizeof(bw_imc[0]);
+	int runs;
+	FILE *fp;
+
+	fp = fopen(output, "r");
+	if (!fp) {
+		perror(output);
+
+		return errno;
+	}
+
+	runs = 0;
+	while (runs < max_runs && fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int fields = 0;
+
+		while (token && fields < max_fields) {
+			token_array[fields++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Skip blank or malformed lines: fields 3 and 5 must exist */
+		if (fields < 6)
+			continue;
+
+		/* Field 3 is perf imc value */
+		bw_imc[runs] = strtoul(token_array[3], NULL, 0);
+		/* Field 5 is resctrl value */
+		bw_resc[runs] = strtoul(token_array[5], NULL, 0);
+		runs++;
+	}
+
+	fclose(fp);
+
+	show_mba_info(bw_imc, bw_resc);
+
+	return 0;
+}
+
+/* Remove the MBA result file; an error from remove() is ignored. */
+void mba_test_cleanup(void)
+{
+ remove(RESULT_FILE_NAME);
+}
+
+/*
+ * mba_schemata_change: run the MBA test
+ * @cpu_no:		CPU the benchmark is bound to
+ * @bw_report:		bandwidth report type
+ * @benchmark_cmd:	benchmark command handed to resctrl_val()
+ *
+ * The result file is removed before the run and again after a
+ * successful check; it is left in place when a step fails.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd)
+{
+	int ret;
+	struct resctrl_val_param param = {
+		.resctrl_val	= MBA_STR,
+		.ctrlgrp	= "c1",
+		.mongrp		= "m1",
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 1,
+		.filename	= RESULT_FILE_NAME,
+		.bw_report	= bw_report,
+		.setup		= mba_setup
+	};
+
+	remove(RESULT_FILE_NAME);
+
+	if (!validate_resctrl_feature_request("mba"))
+		return -1;
+
+	ret = resctrl_val(benchmark_cmd, &param);
+	if (!ret)
+		ret = check_results();
+	if (!ret)
+		mba_test_cleanup();
+
+	return ret;
+}
diff --git a/tools/testing/selftests/resctrl/mbm_test.c b/tools/testing/selftests/resctrl/mbm_test.c
new file mode 100644
index 000000000..ec6cfe01c
--- /dev/null
+++ b/tools/testing/selftests/resctrl/mbm_test.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory Bandwidth Monitoring (MBM) test
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define RESULT_FILE_NAME "result_mbm"
+#define MAX_DIFF 300
+#define NUM_OF_RUNS 5
+
+/*
+ * show_bw_info: print the MBM pass/fail line and supporting numbers
+ * @bw_imc:	per-run values parsed from field 3 of the result file
+ * @bw_resc:	per-run values parsed from field 5 of the result file
+ * @span:	span in MB, printed for information only
+ *
+ * Entries 1 .. NUM_OF_RUNS - 1 of both arrays are averaged; the test
+ * fails when the averages differ by more than MAX_DIFF. Increments
+ * tests_run.
+ */
+static void
+show_bw_info(unsigned long *bw_imc, unsigned long *bw_resc, int span)
+{
+	unsigned long avg_bw_imc = 0, avg_bw_resc = 0;
+	unsigned long sum_bw_imc = 0, sum_bw_resc = 0;
+	long avg_diff = 0;
+	int runs;
+
+	/*
+	 * Discard the first value which is inaccurate due to monitoring setup
+	 * transition phase.
+	 */
+	for (runs = 1; runs < NUM_OF_RUNS ; runs++) {
+		sum_bw_imc += bw_imc[runs];
+		sum_bw_resc += bw_resc[runs];
+	}
+
+	/* Divide by the number of summed runs rather than a literal */
+	avg_bw_imc = sum_bw_imc / (NUM_OF_RUNS - 1);
+	avg_bw_resc = sum_bw_resc / (NUM_OF_RUNS - 1);
+	avg_diff = avg_bw_resc - avg_bw_imc;
+
+	printf("%sok MBM: diff within %d%%\n",
+	       labs(avg_diff) > MAX_DIFF ? "not " : "", MAX_DIFF);
+	tests_run++;
+	/* labs() returns long, hence %ld */
+	printf("# avg_diff: %ld\n", labs(avg_diff));
+	printf("# Span (MB): %d\n", span);
+	printf("# avg_bw_imc: %lu\n", avg_bw_imc);
+	printf("# avg_bw_resc: %lu\n", avg_bw_resc);
+}
+
+/*
+ * check_results: parse the MBM result file and report pass/fail
+ * @span: span in MB, forwarded to the report
+ *
+ * Each line is split on ':' and tab; field 3 is the perf imc value and
+ * field 5 the resctrl value. At most NUM_OF_RUNS lines are stored, since
+ * that is the size of the arrays; lines with fewer than six fields are
+ * ignored. The arrays are zero initialised so that a short file never
+ * leads to indeterminate values being averaged.
+ *
+ * Return: 0 on success. non-zero if the file cannot be opened.
+ */
+static int check_results(int span)
+{
+	unsigned long bw_imc[NUM_OF_RUNS] = { 0 }, bw_resc[NUM_OF_RUNS] = { 0 };
+	char temp[1024], *token_array[8];
+	const int max_fields = sizeof(token_array) / sizeof(token_array[0]);
+	char output[] = RESULT_FILE_NAME;
+	int runs;
+	FILE *fp;
+
+	printf("# Checking for pass/fail\n");
+
+	fp = fopen(output, "r");
+	if (!fp) {
+		perror(output);
+
+		return errno;
+	}
+
+	runs = 0;
+	while (runs < NUM_OF_RUNS && fgets(temp, sizeof(temp), fp)) {
+		char *token = strtok(temp, ":\t");
+		int i = 0;
+
+		while (token && i < max_fields) {
+			token_array[i++] = token;
+			token = strtok(NULL, ":\t");
+		}
+
+		/* Skip blank or malformed lines: fields 3 and 5 must exist */
+		if (i < 6)
+			continue;
+
+		bw_resc[runs] = strtoul(token_array[5], NULL, 0);
+		bw_imc[runs] = strtoul(token_array[3], NULL, 0);
+		runs++;
+	}
+
+	show_bw_info(bw_imc, bw_resc, span);
+
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * mbm_setup: setup callback for the MBM test
+ * @num: anchor for the variadic list
+ * @...: a single struct resctrl_val_param pointer
+ *
+ * The run counter is function-static, so it persists across calls and
+ * is never reset.
+ *
+ * Return: -1 once more than NUM_OF_RUNS calls were made, otherwise ret
+ * (0 unless write_schemata() is reached and fails).
+ */
+static int mbm_setup(int num, ...)
+{
+ struct resctrl_val_param *p;
+ static int num_of_runs;
+ va_list param;
+ int ret = 0;
+
+ /* Run NUM_OF_RUNS times */
+ if (num_of_runs++ >= NUM_OF_RUNS)
+ return -1;
+
+ va_start(param, num);
+ p = va_arg(param, struct resctrl_val_param *);
+ va_end(param);
+
+ /*
+ * Set up schemata with 100% allocation on the first run.
+ *
+ * NOTE(review): num_of_runs was already post-incremented above, so
+ * it is at least 1 here and this condition is never true - as
+ * written, write_schemata() is not called from this function.
+ * Confirm the intent (and what write_schemata() does for "mbm")
+ * before changing it.
+ */
+ if (num_of_runs == 0)
+ ret = write_schemata(p->ctrlgrp, "100", p->cpu_no,
+ p->resctrl_val);
+
+ return ret;
+}
+
+/* Remove the MBM result file; an error from remove() is ignored. */
+void mbm_test_cleanup(void)
+{
+ remove(RESULT_FILE_NAME);
+}
+
+/*
+ * mbm_bw_change: run the MBM test
+ * @span:		span forwarded to the benchmark and the report
+ * @cpu_no:		CPU the benchmark is bound to
+ * @bw_report:		bandwidth report type
+ * @benchmark_cmd:	benchmark command handed to resctrl_val()
+ *
+ * The result file is removed before the run and again after a
+ * successful check; it is left in place when a step fails.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd)
+{
+	int ret;
+	struct resctrl_val_param param = {
+		.resctrl_val	= MBM_STR,
+		.ctrlgrp	= "c1",
+		.mongrp		= "m1",
+		.span		= span,
+		.cpu_no		= cpu_no,
+		.mum_resctrlfs	= 1,
+		.filename	= RESULT_FILE_NAME,
+		.bw_report	= bw_report,
+		.setup		= mbm_setup
+	};
+
+	remove(RESULT_FILE_NAME);
+
+	if (!validate_resctrl_feature_request("mbm"))
+		return -1;
+
+	ret = resctrl_val(benchmark_cmd, &param);
+	if (!ret)
+		ret = check_results(span);
+	if (!ret)
+		mbm_test_cleanup();
+
+	return ret;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl.h b/tools/testing/selftests/resctrl/resctrl.h
new file mode 100644
index 000000000..c38f2d58d
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+#ifndef RESCTRL_H
+#define RESCTRL_H
+#include <stdio.h>
+#include <stdarg.h>
+#include <math.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <dirent.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/eventfd.h>
+#include <asm/unistd.h>
+#include <linux/perf_event.h>
+
+#define MB (1024 * 1024)
+#define RESCTRL_PATH "/sys/fs/resctrl"
+#define PHYS_ID_PATH "/sys/devices/system/cpu/cpu"
+#define CBM_MASK_PATH "/sys/fs/resctrl/info"
+
+/*
+ * PARENT_EXIT: abort the test run from a benchmark process.
+ * Reports err_msg through perror(), sends SIGKILL to the process whose
+ * pid is in the global ppid, unmounts the resctrl file system and exits
+ * with EXIT_FAILURE.
+ */
+#define PARENT_EXIT(err_msg) \
+ do { \
+ perror(err_msg); \
+ kill(ppid, SIGKILL); \
+ umount_resctrlfs(); \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+/*
+ * resctrl_val_param: resctrl test parameters
+ * @resctrl_val: Resctrl feature (Eg: mbm, mba.. etc)
+ * @ctrlgrp: Name of the control monitor group (con_mon grp)
+ * @mongrp: Name of the monitor group (mon grp)
+ * @cpu_no: CPU number to which the benchmark would be binded
+ * @span: Memory bytes accessed in each benchmark iteration
+ * @mum_resctrlfs: Should the resctrl FS be remounted?
+ * @filename: Name of file to which the o/p should be written
+ * @bw_report: Bandwidth report type (reads vs writes)
+ * @mask: Cache bit mask; the CAT setup callback writes it (in hex)
+ * as the schemata
+ * @num_of_runs: Run counter maintained by the CAT and CQM setup callbacks
+ * @setup: Call back function to setup test environment
+ */
+struct resctrl_val_param {
+ char *resctrl_val;
+ char ctrlgrp[64];
+ char mongrp[64];
+ int cpu_no;
+ unsigned long span;
+ int mum_resctrlfs;
+ char filename[64];
+ char *bw_report;
+ unsigned long mask;
+ int num_of_runs;
+ int (*setup)(int num, ...);
+};
+
+#define MBM_STR "mbm"
+#define MBA_STR "mba"
+#define CQM_STR "cqm"
+#define CAT_STR "cat"
+
+extern pid_t bm_pid, ppid;
+extern int tests_run;
+
+extern char llc_occup_path[1024];
+extern bool is_amd;
+
+bool check_resctrlfs_support(void);
+int filter_dmesg(void);
+int remount_resctrlfs(bool mum_resctrlfs);
+int get_resource_id(int cpu_no, int *resource_id);
+int umount_resctrlfs(void);
+int validate_bw_report_request(char *bw_report);
+bool validate_resctrl_feature_request(char *resctrl_val);
+char *fgrep(FILE *inf, const char *str);
+int taskset_benchmark(pid_t bm_pid, int cpu_no);
+void run_benchmark(int signum, siginfo_t *info, void *ucontext);
+int write_schemata(char *ctrlgrp, char *schemata, int cpu_no,
+ char *resctrl_val);
+int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
+ char *resctrl_val);
+int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
+ int group_fd, unsigned long flags);
+int run_fill_buf(unsigned long span, int malloc_and_init_memory, int memflush,
+ int op, char *resctrl_va);
+int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param);
+int mbm_bw_change(int span, int cpu_no, char *bw_report, char **benchmark_cmd);
+void tests_cleanup(void);
+void mbm_test_cleanup(void);
+int mba_schemata_change(int cpu_no, char *bw_report, char **benchmark_cmd);
+void mba_test_cleanup(void);
+int get_cbm_mask(char *cache_type, char *cbm_mask);
+int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size);
+void ctrlc_handler(int signum, siginfo_t *info, void *ptr);
+int cat_val(struct resctrl_val_param *param);
+void cat_test_cleanup(void);
+int cat_perf_miss_val(int cpu_no, int no_of_bits, char *cache_type);
+int cqm_resctrl_val(int cpu_no, int n, char **benchmark_cmd);
+unsigned int count_bits(unsigned long n);
+void cqm_test_cleanup(void);
+int get_core_sibling(int cpu_no);
+int measure_cache_vals(struct resctrl_val_param *param, int bm_pid);
+
+#endif /* RESCTRL_H */
diff --git a/tools/testing/selftests/resctrl/resctrl_tests.c b/tools/testing/selftests/resctrl/resctrl_tests.c
new file mode 100644
index 000000000..b33d1d6dd
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl_tests.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resctrl tests
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define BENCHMARK_ARGS 64
+#define BENCHMARK_ARG_SIZE 64
+
+bool is_amd;
+
+/*
+ * detect_amd - Set the global is_amd flag from /proc/cpuinfo
+ *
+ * Asks fgrep() for a "vendor_id" line and marks the CPU as AMD when the text
+ * from the colon onwards is exactly ": AuthenticAMD\n". is_amd is left
+ * untouched when /proc/cpuinfo cannot be opened or no such line is returned.
+ */
+void detect_amd(void)
+{
+	char *vendor_line, *colon;
+	FILE *cpuinfo;
+
+	cpuinfo = fopen("/proc/cpuinfo", "r");
+	if (!cpuinfo)
+		return;
+
+	vendor_line = fgrep(cpuinfo, "vendor_id");
+	if (vendor_line) {
+		colon = strchr(vendor_line, ':');
+		if (colon && strcmp(colon, ": AuthenticAMD\n") == 0)
+			is_amd = true;
+		else
+			is_amd = false;
+
+		/* The line is released here, as in every fgrep() caller. */
+		free(vendor_line);
+	}
+
+	fclose(cpuinfo);
+}
+
+/*
+ * cmd_help - Print the command line usage of resctrl_tests to stdout
+ *
+ * The synopsis lists every option accepted by the getopt() string in main()
+ * ("ht:b:n:p:"), and each option description ends with a newline so that the
+ * continuation line of "-b" starts on its own line.
+ */
+static void cmd_help(void)
+{
+	printf("usage: resctrl_tests [-h] [-b \"benchmark_cmd [options]\"] [-t test list] [-n no_of_bits] [-p cpu_no]\n");
+	printf("\t-b benchmark_cmd [options]: run specified benchmark for MBM, MBA and CQM\n");
+	printf("\t default benchmark is builtin fill_buf\n");
+	printf("\t-t test list: run tests specified in the test list, ");
+	printf("e.g. -t mbm,mba,cqm,cat\n");
+	printf("\t-n no_of_bits: run cache tests using specified no of bits in cache bit mask\n");
+	printf("\t-p cpu_no: specify CPU number to run the test. 1 is default\n");
+	printf("\t-h: help\n");
+}
+
+/*
+ * tests_cleanup - Run the cleanup routine of every test (MBM, MBA, CQM, CAT)
+ *
+ * All four per-test cleanup helpers are called unconditionally, whether or
+ * not the corresponding test was actually run.
+ */
+void tests_cleanup(void)
+{
+ mbm_test_cleanup();
+ mba_test_cleanup();
+ cqm_test_cleanup();
+ cat_test_cleanup();
+}
+
+/*
+ * main - Parse the command line and run the selected resctrl tests
+ * @argc:	number of command line arguments
+ * @argv:	command line arguments
+ *
+ * Options handled: -t <list> selects tests, -p <cpu> picks the CPU,
+ * -n <bits> sets the number of cache mask bits, -h prints help. Everything
+ * following the first "-b" is taken verbatim as the benchmark command; when
+ * "-b" is absent the builtin fill_buf benchmark is used.
+ *
+ * Results are printed in TAP format. MBM and MBA are skipped when is_amd
+ * is set.
+ *
+ * Return: 0 once the tests have been run (or help was printed), -1 on an
+ * invalid argument.
+ */
+int main(int argc, char **argv)
+{
+	bool has_ben = false, mbm_test = true, mba_test = true, cqm_test = true;
+	int res, c, cpu_no = 1, span = 250, argc_new = argc, i, no_of_bits = 5;
+	char *benchmark_cmd[BENCHMARK_ARGS], bw_report[64], bm_type[64];
+	char benchmark_cmd_area[BENCHMARK_ARGS][BENCHMARK_ARG_SIZE];
+	int ben_ind = 0, ben_count = 0;
+	bool cat_test = true;
+
+	/*
+	 * Everything after the first "-b" belongs to the benchmark, so hide
+	 * it from getopt() by shrinking the argument count.
+	 */
+	for (i = 0; i < argc; i++) {
+		if (strcmp(argv[i], "-b") == 0) {
+			ben_ind = i + 1;
+			ben_count = argc - ben_ind;
+			argc_new = ben_ind - 1;
+			has_ben = true;
+			break;
+		}
+	}
+
+	while ((c = getopt(argc_new, argv, "ht:b:n:p:")) != -1) {
+		char *token;
+
+		switch (c) {
+		case 't':
+			token = strtok(optarg, ",");
+
+			/* An explicit list replaces the "run everything" default. */
+			mbm_test = false;
+			mba_test = false;
+			cqm_test = false;
+			cat_test = false;
+			while (token) {
+				if (!strncmp(token, MBM_STR, sizeof(MBM_STR))) {
+					mbm_test = true;
+				} else if (!strncmp(token, MBA_STR, sizeof(MBA_STR))) {
+					mba_test = true;
+				} else if (!strncmp(token, CQM_STR, sizeof(CQM_STR))) {
+					cqm_test = true;
+				} else if (!strncmp(token, CAT_STR, sizeof(CAT_STR))) {
+					cat_test = true;
+				} else {
+					printf("invalid argument\n");
+
+					return -1;
+				}
+				token = strtok(NULL, ",");
+			}
+			break;
+		case 'p':
+			cpu_no = atoi(optarg);
+			break;
+		case 'n':
+			no_of_bits = atoi(optarg);
+			break;
+		case 'h':
+			cmd_help();
+
+			return 0;
+		default:
+			printf("invalid argument\n");
+
+			return -1;
+		}
+	}
+
+	printf("TAP version 13\n");
+
+	/*
+	 * Typically we need root privileges, because:
+	 * 1. We write to resctrl FS
+	 * 2. We execute perf commands
+	 */
+	if (geteuid() != 0)
+		printf("# WARNING: not running as root, tests may fail.\n");
+
+	/* Detect AMD vendor */
+	detect_amd();
+
+	if (has_ben) {
+		/* "-b" as the last argument leaves nothing to execute. */
+		if (ben_count <= 0)
+			ksft_exit_fail_msg("Missing benchmark command after -b.\n");
+
+		/* One slot must stay free for the terminating NULL. */
+		if (ben_count >= BENCHMARK_ARGS)
+			ksft_exit_fail_msg("Too long benchmark command.\n");
+
+		/*
+		 * Extract benchmark command from command line. Both arrays
+		 * are indexed relative to ben_ind; indexing the storage area
+		 * by the argv position would run past its end.
+		 */
+		for (i = ben_ind; i < argc; i++) {
+			if (strlen(argv[i]) >= BENCHMARK_ARG_SIZE)
+				ksft_exit_fail_msg("Too long benchmark command argument.\n");
+			benchmark_cmd[i - ben_ind] = benchmark_cmd_area[i - ben_ind];
+			sprintf(benchmark_cmd[i - ben_ind], "%s", argv[i]);
+		}
+		benchmark_cmd[ben_count] = NULL;
+	} else {
+		/* If no benchmark is given by "-b" argument, use fill_buf. */
+		for (i = 0; i < 6; i++)
+			benchmark_cmd[i] = benchmark_cmd_area[i];
+
+		strcpy(benchmark_cmd[0], "fill_buf");
+		sprintf(benchmark_cmd[1], "%d", span);
+		strcpy(benchmark_cmd[2], "1");
+		strcpy(benchmark_cmd[3], "1");
+		strcpy(benchmark_cmd[4], "0");
+		strcpy(benchmark_cmd[5], "");
+		benchmark_cmd[6] = NULL;
+	}
+
+	sprintf(bw_report, "reads");
+	sprintf(bm_type, "fill_buf");
+
+	check_resctrlfs_support();
+	filter_dmesg();
+
+	if (!is_amd && mbm_test) {
+		printf("# Starting MBM BW change ...\n");
+		/*
+		 * NOTE(review): the MBM run tags the default benchmark with
+		 * MBA_STR rather than MBM_STR - confirm this is intentional.
+		 */
+		if (!has_ben)
+			sprintf(benchmark_cmd[5], "%s", MBA_STR);
+		res = mbm_bw_change(span, cpu_no, bw_report, benchmark_cmd);
+		printf("%sok MBM: bw change\n", res ? "not " : "");
+		mbm_test_cleanup();
+		tests_run++;
+	}
+
+	if (!is_amd && mba_test) {
+		printf("# Starting MBA Schemata change ...\n");
+		if (!has_ben)
+			sprintf(benchmark_cmd[1], "%d", span);
+		res = mba_schemata_change(cpu_no, bw_report, benchmark_cmd);
+		printf("%sok MBA: schemata change\n", res ? "not " : "");
+		mba_test_cleanup();
+		tests_run++;
+	}
+
+	if (cqm_test) {
+		printf("# Starting CQM test ...\n");
+		if (!has_ben)
+			sprintf(benchmark_cmd[5], "%s", CQM_STR);
+		res = cqm_resctrl_val(cpu_no, no_of_bits, benchmark_cmd);
+		printf("%sok CQM: test\n", res ? "not " : "");
+		cqm_test_cleanup();
+		tests_run++;
+	}
+
+	if (cat_test) {
+		printf("# Starting CAT test ...\n");
+		res = cat_perf_miss_val(cpu_no, no_of_bits, "L3");
+		printf("%sok CAT: test\n", res ? "not " : "");
+		tests_run++;
+		cat_test_cleanup();
+	}
+
+	/* TAP plan line, printed last because helpers also bump tests_run. */
+	printf("1..%d\n", tests_run);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/resctrl/resctrl_val.c b/tools/testing/selftests/resctrl/resctrl_val.c
new file mode 100644
index 000000000..8df557894
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrl_val.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Memory bandwidth monitoring and allocation library
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+#define UNCORE_IMC "uncore_imc"
+#define READ_FILE_NAME "events/cas_count_read"
+#define WRITE_FILE_NAME "events/cas_count_write"
+#define DYN_PMU_PATH "/sys/bus/event_source/devices"
+#define SCALE 0.00006103515625
+#define MAX_IMCS 20
+#define MAX_TOKENS 5
+#define READ 0
+#define WRITE 1
+#define CON_MON_MBM_LOCAL_BYTES_PATH \
+ "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define CON_MBM_LOCAL_BYTES_PATH \
+ "%s/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define MON_MBM_LOCAL_BYTES_PATH \
+ "%s/mon_groups/%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define MBM_LOCAL_BYTES_PATH \
+ "%s/mon_data/mon_L3_%02d/mbm_local_bytes"
+
+#define CON_MON_LCC_OCCUP_PATH \
+ "%s/%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define CON_LCC_OCCUP_PATH \
+ "%s/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define MON_LCC_OCCUP_PATH \
+ "%s/mon_groups/%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+#define LCC_OCCUP_PATH \
+ "%s/mon_data/mon_L3_%02d/llc_occupancy"
+
+/*
+ * Buffer that get_mem_bw_imc() passes to read() on an iMC perf event fd.
+ * NOTE(review): which of these fields the kernel actually fills depends on
+ * the read_format bits set on the event - confirm against perf_event_open(2).
+ */
+struct membw_read_format {
+ __u64 value; /* The value of the event */
+ __u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */
+ __u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */
+ __u64 id; /* if PERF_FORMAT_ID */
+};
+
+/*
+ * Per-iMC, per-direction (READ/WRITE) perf counter state. One entry of
+ * imc_counters_config[][] holds everything needed to open the event and the
+ * last value read back from it.
+ */
+struct imc_counter_config {
+ __u32 type; /* PMU type, read from the iMC's sysfs "type" file */
+ __u64 event; /* "event" field parsed from the cas_count_* sysfs file */
+ __u64 umask; /* "umask" field parsed from the cas_count_* sysfs file */
+ struct perf_event_attr pe; /* attr handed to perf_event_open() */
+ struct membw_read_format return_value; /* result of read() on fd */
+ int fd; /* perf event fd, -1 when opening failed */
+};
+
+static char mbm_total_path[1024];
+static int imcs;
+static struct imc_counter_config imc_counters_config[MAX_IMCS][2];
+
+/*
+ * membw_initialize_perf_event_attr - Fill in the perf_event_attr of a counter
+ * @i:	iMC index into imc_counters_config[]
+ * @j:	direction index (READ or WRITE)
+ *
+ * The attr is zeroed and then populated from the type, event and umask that
+ * were stored for this counter. The event starts disabled.
+ */
+void membw_initialize_perf_event_attr(int i, int j)
+{
+	struct imc_counter_config *cfg = &imc_counters_config[i][j];
+	struct perf_event_attr *attr = &cfg->pe;
+
+	memset(attr, 0, sizeof(struct perf_event_attr));
+
+	attr->type = cfg->type;
+	attr->size = sizeof(struct perf_event_attr);
+	attr->disabled = 1;
+	attr->inherit = 1;
+	attr->exclude_guest = 0;
+	/* umask occupies the bits directly above the 8-bit event code */
+	attr->config = cfg->umask << 8 | cfg->event;
+	attr->sample_type = PERF_SAMPLE_IDENTIFIER;
+	attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+			    PERF_FORMAT_TOTAL_TIME_RUNNING;
+}
+
+/*
+ * membw_ioctl_perf_event_ioc_reset_enable - Reset, then enable, one counter
+ * @i:	iMC index into imc_counters_config[]
+ * @j:	direction index (READ or WRITE)
+ *
+ * The ioctl() return values are not checked.
+ */
+void membw_ioctl_perf_event_ioc_reset_enable(int i, int j)
+{
+	int event_fd = imc_counters_config[i][j].fd;
+
+	ioctl(event_fd, PERF_EVENT_IOC_RESET, 0);
+	ioctl(event_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+/*
+ * membw_ioctl_perf_event_ioc_disable - Disable one counter
+ * @i:	iMC index into imc_counters_config[]
+ * @j:	direction index (READ or WRITE)
+ *
+ * The ioctl() return value is not checked.
+ */
+void membw_ioctl_perf_event_ioc_disable(int i, int j)
+{
+	int event_fd = imc_counters_config[i][j].fd;
+
+	ioctl(event_fd, PERF_EVENT_IOC_DISABLE, 0);
+}
+
+/*
+ * get_event_and_umask: Parse config into event and umask
+ * @cas_count_cfg:	Config string; modified in place (a ',' is appended
+ *			and strtok() splits it), so the buffer needs one
+ *			spare byte
+ * @count:		iMC number
+ * @op:			Operation (read/write)
+ *
+ * The string is split on '=' and ',' into at most MAX_TOKENS tokens. The
+ * token following "event" or "umask" is parsed as a hexadecimal number and
+ * stored in imc_counters_config[count][op]. A key without a following value
+ * is ignored.
+ */
+void get_event_and_umask(char *cas_count_cfg, int count, bool op)
+{
+	struct imc_counter_config *cfg =
+		&imc_counters_config[count][op ? WRITE : READ];
+	char *token[MAX_TOKENS];
+	int i = 0;
+
+	strcat(cas_count_cfg, ",");
+	token[0] = strtok(cas_count_cfg, "=,");
+
+	/* Once strtok() returns NULL it keeps doing so: the tail is NULL. */
+	for (i = 1; i < MAX_TOKENS; i++)
+		token[i] = strtok(NULL, "=,");
+
+	/*
+	 * Every key needs its value in token[i + 1], so stop one element
+	 * short of the array end and bail out when the value is missing.
+	 */
+	for (i = 0; i < MAX_TOKENS - 1; i++) {
+		if (!token[i] || !token[i + 1])
+			break;
+		if (strcmp(token[i], "event") == 0)
+			cfg->event = strtol(token[i + 1], NULL, 16);
+		if (strcmp(token[i], "umask") == 0)
+			cfg->umask = strtol(token[i + 1], NULL, 16);
+	}
+}
+
+/*
+ * open_perf_event - Open the perf event of one iMC counter
+ * @i:		iMC index into imc_counters_config[]
+ * @cpu_no:	CPU the event is opened on
+ * @j:		direction index (READ or WRITE)
+ *
+ * The resulting fd is stored in the counter's config entry.
+ *
+ * Return: 0 on success, -1 when perf_event_open() fails.
+ */
+static int open_perf_event(int i, int cpu_no, int j)
+{
+	struct imc_counter_config *cfg = &imc_counters_config[i][j];
+
+	cfg->fd = perf_event_open(&cfg->pe, -1, cpu_no, -1,
+				  PERF_FLAG_FD_CLOEXEC);
+	if (cfg->fd != -1)
+		return 0;
+
+	fprintf(stderr, "Error opening leader %llx\n", cfg->pe.config);
+
+	return -1;
+}
+
+/*
+ * read_from_imc_dir - Get type and config (read and write) of an iMC counter
+ * @imc_dir:	iMC PMU directory path; the file names are appended directly,
+ *		so it must end with '/'
+ * @count:	index of the iMC in imc_counters_config[]
+ *
+ * Reads the PMU "type" file and the cas_count_read / cas_count_write event
+ * descriptions, storing type, event and umask for both directions.
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+static int read_from_imc_dir(char *imc_dir, int count)
+{
+	char cas_count_cfg[1024], imc_counter_cfg[1024], imc_counter_type[1024];
+	FILE *fp;
+
+	/* Get type of iMC counter */
+	snprintf(imc_counter_type, sizeof(imc_counter_type), "%s%s", imc_dir,
+		 "type");
+	fp = fopen(imc_counter_type, "r");
+	if (!fp) {
+		perror("Failed to open imc counter type file");
+
+		return -1;
+	}
+	if (fscanf(fp, "%u", &imc_counters_config[count][READ].type) <= 0) {
+		perror("Could not get imc type");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	/* Both directions are events of the same PMU. */
+	imc_counters_config[count][WRITE].type =
+				imc_counters_config[count][READ].type;
+
+	/* Get read config */
+	snprintf(imc_counter_cfg, sizeof(imc_counter_cfg), "%s%s", imc_dir,
+		 READ_FILE_NAME);
+	fp = fopen(imc_counter_cfg, "r");
+	if (!fp) {
+		perror("Failed to open imc config file");
+
+		return -1;
+	}
+	/*
+	 * The field width leaves room for the ',' that
+	 * get_event_and_umask() appends plus the terminating NUL.
+	 */
+	if (fscanf(fp, "%1022s", cas_count_cfg) <= 0) {
+		perror("Could not get imc cas count read");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	get_event_and_umask(cas_count_cfg, count, READ);
+
+	/* Get write config */
+	snprintf(imc_counter_cfg, sizeof(imc_counter_cfg), "%s%s", imc_dir,
+		 WRITE_FILE_NAME);
+	fp = fopen(imc_counter_cfg, "r");
+	if (!fp) {
+		perror("Failed to open imc config file");
+
+		return -1;
+	}
+	if (fscanf(fp, "%1022s", cas_count_cfg) <= 0) {
+		perror("Could not get imc cas count write");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	get_event_and_umask(cas_count_cfg, count, WRITE);
+
+	return 0;
+}
+
+/*
+ * A system can have 'n' number of iMC (Integrated Memory Controller)
+ * counters, get that 'n'. For each iMC counter get its type and config.
+ * Also, each counter has two configs, one for read and the other for write.
+ * A config again has two parts, event and umask.
+ * Enumerate all these details into an array of structures.
+ *
+ * At most MAX_IMCS counters can be stored; finding more is an error.
+ *
+ * Return: number of iMC counters (> 0) on success. < 0 on failure.
+ */
+static int num_of_imcs(void)
+{
+	char imc_dir[512], *temp;
+	unsigned int count = 0;
+	struct dirent *ep;
+	int ret;
+	DIR *dp;
+
+	dp = opendir(DYN_PMU_PATH);
+	if (!dp) {
+		perror("Unable to open PMU directory!\n");
+
+		return -1;
+	}
+
+	while ((ep = readdir(dp))) {
+		temp = strstr(ep->d_name, UNCORE_IMC);
+		if (!temp)
+			continue;
+
+		/*
+		 * A name that ends right after "uncore_imc" has no <n>;
+		 * skipping it keeps the pointer below from stepping past
+		 * the terminating NUL.
+		 */
+		if (temp[sizeof(UNCORE_IMC) - 1] == '\0')
+			continue;
+
+		/*
+		 * imc counters are named as "uncore_imc_<n>", hence
+		 * increment the pointer to point to <n>. Note that
+		 * sizeof(UNCORE_IMC) would count for null character as
+		 * well and hence the last underscore character in
+		 * uncore_imc'_' need not be counted.
+		 */
+		temp = temp + sizeof(UNCORE_IMC);
+
+		/*
+		 * Some directories under "DYN_PMU_PATH" could have
+		 * names like "uncore_imc_free_running", hence, check if
+		 * first character is a numerical digit or not.
+		 */
+		if (temp[0] < '0' || temp[0] > '9')
+			continue;
+
+		/* imc_counters_config[] only has room for MAX_IMCS entries. */
+		if (count >= MAX_IMCS) {
+			fprintf(stderr, "More than %d iMC counters found\n",
+				MAX_IMCS);
+			closedir(dp);
+
+			return -1;
+		}
+
+		sprintf(imc_dir, "%s/%s/", DYN_PMU_PATH, ep->d_name);
+		ret = read_from_imc_dir(imc_dir, count);
+		if (ret) {
+			closedir(dp);
+
+			return ret;
+		}
+		count++;
+	}
+	closedir(dp);
+
+	if (count == 0) {
+		perror("Unable find iMC counters!\n");
+
+		return -1;
+	}
+
+	return count;
+}
+
+/*
+ * initialize_mem_bw_imc - Enumerate the iMC counters and prepare their attrs
+ *
+ * Stores the number of counters in the file-scope "imcs" and initializes
+ * the read and write perf_event_attr of each one.
+ *
+ * Return: 0 on success, otherwise the (<= 0) result of num_of_imcs().
+ */
+static int initialize_mem_bw_imc(void)
+{
+	int idx;
+
+	imcs = num_of_imcs();
+	if (imcs <= 0)
+		return imcs;
+
+	/* Initialize perf_event_attr structures for all iMC's */
+	for (idx = 0; idx < imcs; idx++) {
+		membw_initialize_perf_event_attr(idx, 0);
+		membw_initialize_perf_event_attr(idx, 1);
+	}
+
+	return 0;
+}
+
+/*
+ * get_mem_bw_imc: Memory band width as reported by iMC counters
+ * @cpu_no:	CPU number that the benchmark PID is binded to
+ * @bw_report:	Bandwidth report type (reads, writes)
+ * @bw_imc:	Output: measured bandwidth; reads, writes, or their sum when
+ *		@bw_report is neither "reads" nor "writes"
+ *
+ * Memory B/W utilized by a process on a socket can be calculated using
+ * iMC counters. Perf events are used to read these counters. The counters
+ * are sampled for one second. Every perf fd opened here is closed again
+ * before returning, on success and on failure.
+ *
+ * Return: = 0 on success. < 0 on failure.
+ */
+static int get_mem_bw_imc(int cpu_no, char *bw_report, float *bw_imc)
+{
+	float reads = 0, writes = 0, of_mul_read, of_mul_write;
+	int imc, j, ret = 0;
+	int opened = 0;	/* number of iMCs with both fds open */
+
+	/* Start all iMC counters to log values (both read and write) */
+	for (imc = 0; imc < imcs; imc++) {
+		for (j = 0; j < 2; j++) {
+			if (open_perf_event(imc, cpu_no, j)) {
+				/* READ of this iMC is already open */
+				if (j == WRITE)
+					close(imc_counters_config[imc][READ].fd);
+				ret = -1;
+				goto close_fds;
+			}
+		}
+		opened++;
+
+		for (j = 0; j < 2; j++)
+			membw_ioctl_perf_event_ioc_reset_enable(imc, j);
+	}
+
+	sleep(1);
+
+	/* Stop counters after a second to get results (both read and write) */
+	for (imc = 0; imc < imcs; imc++) {
+		for (j = 0; j < 2; j++)
+			membw_ioctl_perf_event_ioc_disable(imc, j);
+	}
+
+	/*
+	 * Get results which are stored in struct type imc_counter_config
+	 * Take overflow into consideration before calculating total b/w
+	 */
+	for (imc = 0; imc < imcs; imc++) {
+		struct imc_counter_config *r =
+			&imc_counters_config[imc][READ];
+		struct imc_counter_config *w =
+			&imc_counters_config[imc][WRITE];
+		__u64 r_time_enabled, r_time_running;
+		__u64 w_time_enabled, w_time_running;
+
+		if (read(r->fd, &r->return_value,
+			 sizeof(struct membw_read_format)) == -1) {
+			perror("Couldn't get read b/w through iMC");
+			ret = -1;
+			goto close_fds;
+		}
+
+		if (read(w->fd, &w->return_value,
+			 sizeof(struct membw_read_format)) == -1) {
+			perror("Couldn't get write bw through iMC");
+			ret = -1;
+			goto close_fds;
+		}
+
+		/*
+		 * The scaling factor belongs to one counter only, so start
+		 * from 1 for every iMC instead of inheriting the factor of
+		 * the previous one.
+		 */
+		of_mul_read = 1;
+		of_mul_write = 1;
+
+		r_time_enabled = r->return_value.time_enabled;
+		r_time_running = r->return_value.time_running;
+
+		/* A counter that never ran must not cause a division by 0. */
+		if (r_time_running && r_time_enabled != r_time_running)
+			of_mul_read = (float)r_time_enabled /
+					(float)r_time_running;
+
+		w_time_enabled = w->return_value.time_enabled;
+		w_time_running = w->return_value.time_running;
+
+		if (w_time_running && w_time_enabled != w_time_running)
+			of_mul_write = (float)w_time_enabled /
+					(float)w_time_running;
+
+		reads += r->return_value.value * of_mul_read * SCALE;
+		writes += w->return_value.value * of_mul_write * SCALE;
+	}
+
+close_fds:
+	for (imc = 0; imc < opened; imc++) {
+		close(imc_counters_config[imc][READ].fd);
+		close(imc_counters_config[imc][WRITE].fd);
+	}
+
+	if (ret)
+		return ret;
+
+	if (strcmp(bw_report, "reads") == 0) {
+		*bw_imc = reads;
+		return 0;
+	}
+
+	if (strcmp(bw_report, "writes") == 0) {
+		*bw_imc = writes;
+		return 0;
+	}
+
+	*bw_imc = reads + writes;
+	return 0;
+}
+
+/*
+ * set_mbm_path - Build the mbm_local_bytes path into "mbm_total_path"
+ * @ctrlgrp:		Name of the control monitor group, or NULL
+ * @mongrp:		Name of the monitor group, or NULL
+ * @resource_id:	Domain id used in the mon_L3_<id> directory name
+ *
+ * Which path template is used depends on which of the two group names are
+ * non-NULL.
+ */
+void set_mbm_path(const char *ctrlgrp, const char *mongrp, int resource_id)
+{
+	if (mongrp) {
+		if (ctrlgrp)
+			sprintf(mbm_total_path, CON_MON_MBM_LOCAL_BYTES_PATH,
+				RESCTRL_PATH, ctrlgrp, mongrp, resource_id);
+		else
+			sprintf(mbm_total_path, MON_MBM_LOCAL_BYTES_PATH,
+				RESCTRL_PATH, mongrp, resource_id);
+		return;
+	}
+
+	if (ctrlgrp)
+		sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
+			ctrlgrp, resource_id);
+	else
+		sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH, RESCTRL_PATH,
+			resource_id);
+}
+
+/*
+ * initialize_mem_bw_resctrl: Appropriately populate "mbm_total_path"
+ * @ctrlgrp:		Name of the control monitor group (con_mon grp)
+ * @mongrp:		Name of the monitor group (mon grp)
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ *
+ * For MBM the path honours both groups; for MBA only the control group is
+ * considered. Nothing is written when the resource id cannot be determined.
+ */
+static void initialize_mem_bw_resctrl(const char *ctrlgrp, const char *mongrp,
+				      int cpu_no, char *resctrl_val)
+{
+	bool is_mbm, is_mba;
+	int resource_id;
+
+	if (get_resource_id(cpu_no, &resource_id) < 0) {
+		perror("Could not get resource_id");
+		return;
+	}
+
+	is_mbm = strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) == 0;
+	is_mba = strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) == 0;
+
+	if (is_mbm)
+		set_mbm_path(ctrlgrp, mongrp, resource_id);
+
+	if (!is_mba)
+		return;
+
+	if (ctrlgrp)
+		sprintf(mbm_total_path, CON_MBM_LOCAL_BYTES_PATH,
+			RESCTRL_PATH, ctrlgrp, resource_id);
+	else
+		sprintf(mbm_total_path, MBM_LOCAL_BYTES_PATH,
+			RESCTRL_PATH, resource_id);
+}
+
+/*
+ * get_mem_bw_resctrl - Get MBM local bytes as reported by resctrl FS
+ * @mbm_total:	Output: value read from the file named by "mbm_total_path"
+ *
+ * The path must have been set up beforehand. It is chosen as follows.
+ * For MBM,
+ * 1. If con_mon grp and mon grp are given, then read from con_mon grp's mon grp
+ * 2. If only con_mon grp is given, then read from con_mon grp
+ * 3. If both are not given, then read from root con_mon grp
+ * For MBA,
+ * 1. If con_mon grp is given, then read from it
+ * 2. If con_mon grp is not given, then read from root con_mon grp
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+static int get_mem_bw_resctrl(unsigned long *mbm_total)
+{
+	FILE *bw_file;
+	int status = 0;
+
+	bw_file = fopen(mbm_total_path, "r");
+	if (!bw_file) {
+		perror("Failed to open total bw file");
+
+		return -1;
+	}
+
+	if (fscanf(bw_file, "%lu", mbm_total) <= 0) {
+		perror("Could not get mbm local bytes");
+		status = -1;
+	}
+	fclose(bw_file);
+
+	return status;
+}
+
+pid_t bm_pid, ppid;
+
+/*
+ * ctrlc_handler - Signal handler that tears the test down and exits
+ * @signum:	signal number (unused)
+ * @info:	signal info (unused)
+ * @ptr:	user context (unused)
+ *
+ * resctrl_val() installs this for SIGINT and SIGHUP in the parent. It kills
+ * the benchmark child, unmounts resctrl, runs every test cleanup and then
+ * terminates the process with EXIT_SUCCESS.
+ */
+void ctrlc_handler(int signum, siginfo_t *info, void *ptr)
+{
+ /* bm_pid is the file-scope pid saved by resctrl_val() after fork() */
+ kill(bm_pid, SIGKILL);
+ umount_resctrlfs();
+ tests_cleanup();
+ printf("Ending\n\n");
+
+ exit(EXIT_SUCCESS);
+}
+
+/*
+ * print_results_bw: the memory bandwidth results are stored in a file
+ * @filename:	file that stores the results; "stdio" and "stderr" both
+ *		print to stdout instead
+ * @bm_pid:	child pid that runs benchmark
+ * @bw_imc:	perf imc counter value
+ * @bw_resc:	memory bandwidth value
+ *
+ * Return: 0 on success. non-zero on failure (the errno of the failing call,
+ * or -1 when that call did not set errno).
+ */
+static int print_results_bw(char *filename, int bm_pid, float bw_imc,
+			    unsigned long bw_resc)
+{
+	unsigned long diff = fabs(bw_imc - bw_resc);
+	int err;
+	FILE *fp;
+
+	if (strcmp(filename, "stdio") == 0 || strcmp(filename, "stderr") == 0) {
+		printf("Pid: %d \t Mem_BW_iMC: %f \t ", bm_pid, bw_imc);
+		printf("Mem_BW_resc: %lu \t Difference: %lu\n", bw_resc, diff);
+
+		return 0;
+	}
+
+	fp = fopen(filename, "a");
+	if (!fp) {
+		/* Save errno first: perror() is allowed to modify it. */
+		err = errno;
+		perror("Cannot open results file");
+
+		return err ? err : -1;
+	}
+
+	if (fprintf(fp, "Pid: %d \t Mem_BW_iMC: %f \t Mem_BW_resc: %lu \t Difference: %lu\n",
+		    bm_pid, bw_imc, bw_resc, diff) <= 0) {
+		/*
+		 * fclose() may overwrite errno, so capture the fprintf()
+		 * error first and restore it for the message.
+		 */
+		err = errno;
+		fclose(fp);
+		errno = err;
+		perror("Could not log results.");
+
+		return err ? err : -1;
+	}
+	fclose(fp);
+
+	return 0;
+}
+
+/*
+ * set_cqm_path - Build the llc_occupancy path into "llc_occup_path"
+ * @ctrlgrp:	Name of the control monitor group; empty string for none
+ * @mongrp:	Name of the monitor group; empty string for none
+ * @sock_num:	Domain id used in the mon_L3_<id> directory name
+ *
+ * Which path template is used depends on which of the two group names are
+ * non-empty.
+ */
+static void set_cqm_path(const char *ctrlgrp, const char *mongrp, char sock_num)
+{
+	bool has_ctrlgrp = strlen(ctrlgrp) != 0;
+	bool has_mongrp = strlen(mongrp) != 0;
+
+	if (has_ctrlgrp) {
+		if (has_mongrp)
+			sprintf(llc_occup_path, CON_MON_LCC_OCCUP_PATH,
+				RESCTRL_PATH, ctrlgrp, mongrp, sock_num);
+		else
+			sprintf(llc_occup_path, CON_LCC_OCCUP_PATH,
+				RESCTRL_PATH, ctrlgrp, sock_num);
+		return;
+	}
+
+	if (has_mongrp)
+		sprintf(llc_occup_path, MON_LCC_OCCUP_PATH, RESCTRL_PATH,
+			mongrp, sock_num);
+	else
+		sprintf(llc_occup_path, LCC_OCCUP_PATH, RESCTRL_PATH, sock_num);
+}
+
+/*
+ * initialize_llc_occu_resctrl: Appropriately populate "llc_occup_path"
+ * @ctrlgrp:		Name of the control monitor group (con_mon grp)
+ * @mongrp:		Name of the monitor group (mon grp)
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ * @resctrl_val:	Resctrl feature (Eg: cat, cqm.. etc)
+ *
+ * The path is only set for the CQM feature, and only when the resource id
+ * of @cpu_no can be determined.
+ */
+static void initialize_llc_occu_resctrl(const char *ctrlgrp, const char *mongrp,
+					int cpu_no, char *resctrl_val)
+{
+	int domain_id;
+
+	if (get_resource_id(cpu_no, &domain_id) < 0) {
+		perror("# Unable to resource_id");
+		return;
+	}
+
+	if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) != 0)
+		return;
+
+	set_cqm_path(ctrlgrp, mongrp, domain_id);
+}
+
+/*
+ * measure_vals - Take one bandwidth sample and log it
+ * @param:		test parameters (cpu_no, bw_report and filename are used)
+ * @bw_resc_start:	in: resctrl byte counter at the previous sample;
+ *			out: counter at this sample (only updated on success)
+ *
+ * Return: 0 on success, non-zero on failure.
+ */
+static int
+measure_vals(struct resctrl_val_param *param, unsigned long *bw_resc_start)
+{
+	unsigned long resc_now, resc_delta_mb;
+	float imc_bw;
+	int err;
+
+	/*
+	 * Measure memory bandwidth from resctrl and from
+	 * another source which is perf imc value or could
+	 * be something else if perf imc event is not available.
+	 * Compare the two values to validate resctrl value.
+	 * It takes 1sec to measure the data.
+	 */
+	err = get_mem_bw_imc(param->cpu_no, param->bw_report, &imc_bw);
+	if (err < 0)
+		return err;
+
+	err = get_mem_bw_resctrl(&resc_now);
+	if (err < 0)
+		return err;
+
+	/* resctrl reports a running byte count; log the delta in MB */
+	resc_delta_mb = (resc_now - *bw_resc_start) / MB;
+
+	err = print_results_bw(param->filename, bm_pid, imc_bw, resc_delta_mb);
+	if (err)
+		return err;
+
+	*bw_resc_start = resc_now;
+
+	return 0;
+}
+
+/*
+ * resctrl_val:	execute benchmark and measure memory bandwidth on
+ *			the benchmark
+ * @benchmark_cmd:	benchmark command and its arguments
+ * @param:		parameters passed to resctrl_val()
+ *
+ * The benchmark runs in a forked child that waits for SIGUSR1. The child
+ * reports readiness through a pipe; the parent then pins it to the CPU,
+ * moves it into the resctrl groups, signals it to start and samples until
+ * param->setup() returns non-zero. The child is killed and resctrl is
+ * unmounted before returning.
+ *
+ * Return: 0 on success. non-zero on failure.
+ */
+int resctrl_val(char **benchmark_cmd, struct resctrl_val_param *param)
+{
+	char *resctrl_val = param->resctrl_val;
+	unsigned long bw_resc_start = 0;
+	struct sigaction sigact;
+	int ret = 0, pipefd[2];
+	char pipe_message = 0;
+	union sigval value;
+
+	if (strcmp(param->filename, "") == 0)
+		sprintf(param->filename, "stdio");
+
+	if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) ||
+	    !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) {
+		ret = validate_bw_report_request(param->bw_report);
+		if (ret)
+			return ret;
+	}
+
+	ret = remount_resctrlfs(param->mum_resctrlfs);
+	if (ret)
+		return ret;
+
+	/*
+	 * If benchmark wasn't successfully started by child, then child should
+	 * kill parent, so save parent's pid
+	 */
+	ppid = getpid();
+
+	if (pipe(pipefd)) {
+		perror("# Unable to create pipe");
+
+		return -1;
+	}
+
+	/*
+	 * Fork to start benchmark, save child's pid so that it can be killed
+	 * when needed
+	 */
+	bm_pid = fork();
+	if (bm_pid == -1) {
+		perror("# Unable to fork");
+		/* No child exists, so nobody else will close the pipe. */
+		close(pipefd[0]);
+		close(pipefd[1]);
+
+		return -1;
+	}
+
+	if (bm_pid == 0) {
+		/*
+		 * Mask all signals except SIGUSR1, parent uses SIGUSR1 to
+		 * start benchmark
+		 */
+		sigfillset(&sigact.sa_mask);
+		sigdelset(&sigact.sa_mask, SIGUSR1);
+
+		sigact.sa_sigaction = run_benchmark;
+		sigact.sa_flags = SA_SIGINFO;
+
+		/* Register for "SIGUSR1" signal from parent */
+		if (sigaction(SIGUSR1, &sigact, NULL))
+			PARENT_EXIT("Can't register child for signal");
+
+		/* Tell parent that child is ready */
+		close(pipefd[0]);
+		pipe_message = 1;
+		/*
+		 * Compare as signed: against the unsigned sizeof a return
+		 * value of -1 would never be seen as "less than".
+		 */
+		if (write(pipefd[1], &pipe_message, sizeof(pipe_message)) !=
+		    (ssize_t)sizeof(pipe_message)) {
+			perror("# failed signaling parent process");
+			close(pipefd[1]);
+			/*
+			 * NOTE(review): this returns into the caller while
+			 * still running in the child process - confirm
+			 * whether PARENT_EXIT() is the intended handling.
+			 */
+			return -1;
+		}
+		close(pipefd[1]);
+
+		/* Suspend child until delivery of "SIGUSR1" from parent */
+		sigsuspend(&sigact.sa_mask);
+
+		PARENT_EXIT("Child is done");
+	}
+
+	/* The parent only ever reads from the pipe. */
+	close(pipefd[1]);
+
+	printf("# benchmark PID: %d\n", bm_pid);
+
+	/*
+	 * Register CTRL-C handler for parent, as it has to kill benchmark
+	 * before exiting
+	 */
+	sigact.sa_sigaction = ctrlc_handler;
+	sigemptyset(&sigact.sa_mask);
+	sigact.sa_flags = SA_SIGINFO;
+	if (sigaction(SIGINT, &sigact, NULL) ||
+	    sigaction(SIGHUP, &sigact, NULL)) {
+		perror("# sigaction");
+		ret = errno;
+		goto out;
+	}
+
+	value.sival_ptr = benchmark_cmd;
+
+	/* Taskset benchmark to specified cpu */
+	ret = taskset_benchmark(bm_pid, param->cpu_no);
+	if (ret)
+		goto out;
+
+	/* Write benchmark to specified control&monitoring grp in resctrl FS */
+	ret = write_bm_pid_to_resctrl(bm_pid, param->ctrlgrp, param->mongrp,
+				      resctrl_val);
+	if (ret)
+		goto out;
+
+	if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) ||
+	    !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
+		ret = initialize_mem_bw_imc();
+		if (ret)
+			goto out;
+
+		initialize_mem_bw_resctrl(param->ctrlgrp, param->mongrp,
+					  param->cpu_no, resctrl_val);
+	} else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)))
+		initialize_llc_occu_resctrl(param->ctrlgrp, param->mongrp,
+					    param->cpu_no, resctrl_val);
+
+	/* Parent waits for child to be ready. */
+	while (pipe_message != 1) {
+		/* Both an error (-1) and EOF (0) mean the child is gone. */
+		if (read(pipefd[0], &pipe_message, sizeof(pipe_message)) !=
+		    (ssize_t)sizeof(pipe_message)) {
+			perror("# failed reading message from child process");
+			ret = -1;
+			goto out;
+		}
+	}
+	close(pipefd[0]);
+	pipefd[0] = -1;
+
+	/* Signal child to start benchmark */
+	if (sigqueue(bm_pid, SIGUSR1, value) == -1) {
+		perror("# sigqueue SIGUSR1 to child");
+		ret = errno;
+		goto out;
+	}
+
+	/* Give benchmark enough time to fully run */
+	sleep(1);
+
+	/* Test runs until the callback setup() tells the test to stop. */
+	while (1) {
+		if (!strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR)) ||
+		    !strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR))) {
+			ret = param->setup(1, param);
+			if (ret) {
+				ret = 0;
+				break;
+			}
+
+			ret = measure_vals(param, &bw_resc_start);
+			if (ret)
+				break;
+		} else if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR))) {
+			ret = param->setup(1, param);
+			if (ret) {
+				ret = 0;
+				break;
+			}
+			sleep(1);
+			ret = measure_cache_vals(param, bm_pid);
+			if (ret)
+				break;
+		} else {
+			break;
+		}
+	}
+
+out:
+	/* Still open when an error path was taken before the handshake. */
+	if (pipefd[0] >= 0)
+		close(pipefd[0]);
+	kill(bm_pid, SIGKILL);
+	umount_resctrlfs();
+
+	return ret;
+}
diff --git a/tools/testing/selftests/resctrl/resctrlfs.c b/tools/testing/selftests/resctrl/resctrlfs.c
new file mode 100644
index 000000000..4174e48e0
--- /dev/null
+++ b/tools/testing/selftests/resctrl/resctrlfs.c
@@ -0,0 +1,723 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Basic resctrl file system operations
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Authors:
+ * Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>,
+ * Fenghua Yu <fenghua.yu@intel.com>
+ */
+#include "resctrl.h"
+
+int tests_run;
+
+/*
+ * find_resctrl_mount - Look for a mounted resctrl file system
+ * @buffer:	if non-NULL, receives the mount point (copied with a limit
+ *		of 256 bytes)
+ *
+ * Scans /proc/mounts for the first entry whose third field is "resctrl".
+ *
+ * Return: 0 when found, -ENXIO when /proc/mounts cannot be opened,
+ * -ENOENT when no resctrl mount exists.
+ */
+static int find_resctrl_mount(char *buffer)
+{
+	char entry[256], *device, *dir, *fstype;
+	FILE *fp;
+
+	fp = fopen("/proc/mounts", "r");
+	if (!fp) {
+		perror("/proc/mounts");
+		return -ENXIO;
+	}
+
+	while (fgets(entry, 256, fp)) {
+		/* fields: <device> <mount point> <fs type> ... */
+		device = strtok(entry, " \t");
+		if (!device)
+			continue;
+		dir = strtok(NULL, " \t");
+		if (!dir)
+			continue;
+		fstype = strtok(NULL, " \t");
+		if (!fstype || strcmp(fstype, "resctrl") != 0)
+			continue;
+
+		fclose(fp);
+		if (buffer)
+			strncpy(buffer, dir, 256);
+
+		return 0;
+	}
+
+	fclose(fp);
+
+	return -ENOENT;
+}
+
+/*
+ * remount_resctrlfs - Remount resctrl FS at /sys/fs/resctrl
+ * @mum_resctrlfs: Should the resctrl FS be remounted?
+ *
+ * If not mounted, mount it.
+ * If mounted and mum_resctrlfs then remount resctrl FS.
+ * If mounted and !mum_resctrlfs then noop
+ *
+ * Each unmount failure and each mount attempt is reported as a TAP result
+ * line and counted in the global tests_run.
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int remount_resctrlfs(bool mum_resctrlfs)
+{
+ char mountpoint[256];
+ int ret;
+
+ /* ret == 0 means resctrl is currently mounted at "mountpoint" */
+ ret = find_resctrl_mount(mountpoint);
+ if (ret)
+ strcpy(mountpoint, RESCTRL_PATH);
+
+ /* A failed unmount is reported but does not stop the mount below */
+ if (!ret && mum_resctrlfs && umount(mountpoint)) {
+ printf("not ok unmounting \"%s\"\n", mountpoint);
+ perror("# umount");
+ tests_run++;
+ }
+
+ /* Already mounted and no remount requested: nothing to do */
+ if (!ret && !mum_resctrlfs)
+ return 0;
+
+ ret = mount("resctrl", RESCTRL_PATH, "resctrl", 0, NULL);
+ printf("%sok mounting resctrl to \"%s\"\n", ret ? "not " : "",
+ RESCTRL_PATH);
+ if (ret)
+ perror("# mount");
+
+ tests_run++;
+
+ return ret;
+}
+
+/*
+ * umount_resctrlfs - Unmount the resctrl FS from RESCTRL_PATH
+ *
+ * Return: 0 on success, otherwise the value of errno after the failure
+ * has been reported.
+ */
+int umount_resctrlfs(void)
+{
+	if (umount(RESCTRL_PATH) == 0)
+		return 0;
+
+	perror("# Unable to umount resctrl");
+
+	return errno;
+}
+
+/*
+ * get_resource_id - Get socket number/l3 id for a specified CPU
+ * @cpu_no:		CPU number
+ * @resource_id:	Socket number or l3_id
+ *
+ * When is_amd is set the id comes from the CPU's cache/index3/id file,
+ * otherwise from topology/physical_package_id.
+ *
+ * Return: 0 on success, -1 on failure.
+ */
+int get_resource_id(int cpu_no, int *resource_id)
+{
+	const char *id_file;
+	char id_path[1024];
+	int status = 0;
+	FILE *fp;
+
+	id_file = is_amd ? "cache/index3/id" : "topology/physical_package_id";
+	sprintf(id_path, "%s%d/%s", PHYS_ID_PATH, cpu_no, id_file);
+
+	fp = fopen(id_path, "r");
+	if (!fp) {
+		perror("Failed to open physical_package_id");
+
+		return -1;
+	}
+
+	if (fscanf(fp, "%d", resource_id) <= 0) {
+		perror("Could not get socket number or l3 id");
+		status = -1;
+	}
+	fclose(fp);
+
+	return status;
+}
+
+/*
+ * get_cache_size - Get cache size for a specified CPU
+ * @cpu_no:	CPU number
+ * @cache_type:	Cache level L2/L3
+ * @cache_size:	pointer to cache_size
+ *
+ * The sysfs "size" string is parsed as a decimal number followed by an
+ * optional 'K' (x1024) or 'M' (x1024x1024) suffix; parsing stops at the
+ * first other character.
+ *
+ * Return: = 0 on success, < 0 on failure.
+ */
+int get_cache_size(int cpu_no, char *cache_type, unsigned long *cache_size)
+{
+	char cache_path[1024], cache_str[64];
+	int length, i, cache_num;
+	FILE *fp;
+
+	if (!strcmp(cache_type, "L3")) {
+		cache_num = 3;
+	} else if (!strcmp(cache_type, "L2")) {
+		cache_num = 2;
+	} else {
+		/* Not an errno condition, so perror() would mislead. */
+		fprintf(stderr, "Invalid cache level\n");
+		return -1;
+	}
+
+	sprintf(cache_path, "/sys/bus/cpu/devices/cpu%d/cache/index%d/size",
+		cpu_no, cache_num);
+	fp = fopen(cache_path, "r");
+	if (!fp) {
+		perror("Failed to open cache size");
+
+		return -1;
+	}
+	/* Field width keeps the scan inside cache_str[64]. */
+	if (fscanf(fp, "%63s", cache_str) <= 0) {
+		perror("Could not get cache_size");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	length = (int)strlen(cache_str);
+
+	*cache_size = 0;
+
+	for (i = 0; i < length; i++) {
+		if ((cache_str[i] >= '0') && (cache_str[i] <= '9'))
+			*cache_size = *cache_size * 10 + (cache_str[i] - '0');
+		else if (cache_str[i] == 'K')
+			*cache_size = *cache_size * 1024;
+		else if (cache_str[i] == 'M')
+			*cache_size = *cache_size * 1024 * 1024;
+		else
+			break;
+	}
+
+	return 0;
+}
+
+#define CORE_SIBLINGS_PATH "/sys/bus/cpu/devices/cpu"
+
+/*
+ * get_cbm_mask - Get cbm mask for given cache
+ * @cache_type:	Cache level L2/L3
+ * @cbm_mask:	cbm_mask returned as a string; the caller provides the
+ *		buffer
+ *
+ * Return: = 0 on success, < 0 on failure.
+ */
+int get_cbm_mask(char *cache_type, char *cbm_mask)
+{
+	char mask_file[1024];
+	int status = 0;
+	FILE *fp;
+
+	if (!cbm_mask)
+		return -1;
+
+	sprintf(mask_file, "%s/%s/cbm_mask", CBM_MASK_PATH, cache_type);
+
+	fp = fopen(mask_file, "r");
+	if (!fp) {
+		perror("Failed to open cache level");
+
+		return -1;
+	}
+
+	if (fscanf(fp, "%s", cbm_mask) <= 0) {
+		perror("Could not get max cbm_mask");
+		status = -1;
+	}
+	fclose(fp);
+
+	return status;
+}
+
+/*
+ * get_core_sibling - Get sibling core id from the same socket for given CPU
+ * @cpu_no:	CPU number
+ *
+ * Reads the CPU's core_siblings_list and returns the first number in it
+ * that is not 0. Only the beginning of the list is examined, so the read is
+ * limited to the size of the local buffer.
+ *
+ * Return: > 0 on success, < 0 on failure. The last parsed value (0) is
+ * returned when the list holds no CPU other than 0.
+ */
+int get_core_sibling(int cpu_no)
+{
+	char core_siblings_path[1024], cpu_list_str[64];
+	int sibling_cpu_no = -1;
+	char *token;
+	FILE *fp;
+
+	sprintf(core_siblings_path, "%s%d/topology/core_siblings_list",
+		CORE_SIBLINGS_PATH, cpu_no);
+
+	fp = fopen(core_siblings_path, "r");
+	if (!fp) {
+		perror("Failed to open core siblings path");
+
+		return -1;
+	}
+	/*
+	 * Sibling lists can be long (e.g. interleaved CPU numbering), so
+	 * bound the scan to cpu_list_str[64].
+	 */
+	if (fscanf(fp, "%63s", cpu_list_str) <= 0) {
+		perror("Could not get core_siblings list");
+		fclose(fp);
+
+		return -1;
+	}
+	fclose(fp);
+
+	/* Range ends ("0-3") and list items ("0,2") are treated alike. */
+	token = strtok(cpu_list_str, "-,");
+
+	while (token) {
+		sibling_cpu_no = atoi(token);
+		/* Skipping core 0 as we don't want to run test on core 0 */
+		if (sibling_cpu_no != 0)
+			break;
+		token = strtok(NULL, "-,");
+	}
+
+	return sibling_cpu_no;
+}
+
+/*
+ * taskset_benchmark - Taskset PID (i.e. benchmark) to a specified cpu
+ * @bm_pid:	PID that should be binded
+ * @cpu_no:	CPU number at which the PID would be binded
+ *
+ * Return: 0 on success, -1 on failure
+ */
+int taskset_benchmark(pid_t bm_pid, int cpu_no)
+{
+	cpu_set_t allowed;
+
+	/* Affinity mask holding only the requested CPU */
+	CPU_ZERO(&allowed);
+	CPU_SET(cpu_no, &allowed);
+
+	if (sched_setaffinity(bm_pid, sizeof(cpu_set_t), &allowed) == 0)
+		return 0;
+
+	perror("Unable to taskset benchmark");
+
+	return -1;
+}
+
+/*
+ * run_benchmark - Run a specified benchmark or fill_buf (default benchmark)
+ * in specified signal. Direct benchmark stdio to /dev/null.
+ * @signum: signal number
+ * @info: signal info; si_ptr carries the benchmark command (char **)
+ * @ucontext: user context in signal handling
+ *
+ * Installed by resctrl_val() as the child's SIGUSR1 handler.
+ *
+ * Return: void
+ */
+void run_benchmark(int signum, siginfo_t *info, void *ucontext)
+{
+ int operation, ret, malloc_and_init_memory, memflush;
+ unsigned long span, buffer_span;
+ char **benchmark_cmd;
+ char resctrl_val[64];
+ FILE *fp;
+
+ /* Command array that the parent attached to the signal via sigqueue() */
+ benchmark_cmd = info->si_ptr;
+
+ /*
+ * Direct stdio of child to /dev/null, so that only parent writes to
+ * stdio (console)
+ */
+ fp = freopen("/dev/null", "w", stdout);
+ if (!fp)
+ PARENT_EXIT("Unable to direct benchmark status to /dev/null");
+
+ if (strcmp(benchmark_cmd[0], "fill_buf") == 0) {
+ /* Execute default fill_buf benchmark */
+ /* Argument layout: [1] span, [2] malloc+init, [3] memflush, [4] op, [5] feature */
+ span = strtoul(benchmark_cmd[1], NULL, 10);
+ malloc_and_init_memory = atoi(benchmark_cmd[2]);
+ memflush = atoi(benchmark_cmd[3]);
+ operation = atoi(benchmark_cmd[4]);
+ sprintf(resctrl_val, "%s", benchmark_cmd[5]);
+
+ /* span is scaled by MB for every feature except CQM */
+ if (strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)))
+ buffer_span = span * MB;
+ else
+ buffer_span = span;
+
+ if (run_fill_buf(buffer_span, malloc_and_init_memory, memflush,
+ operation, resctrl_val))
+ fprintf(stderr, "Error in running fill buffer\n");
+ } else {
+ /* Execute specified benchmark */
+ ret = execvp(benchmark_cmd[0], benchmark_cmd);
+ if (ret)
+ perror("wrong\n");
+ }
+
+ /* Reached when run_fill_buf() returns or execvp() fails */
+ fclose(stdout);
+ PARENT_EXIT("Unable to run specified benchmark");
+}
+
+/*
+ * create_grp - Create a group only if one doesn't exist
+ * @grp_name:	Name of the group
+ * @grp:	Full path and name of the group
+ * @parent_grp:	Full path and name of the parent group
+ *
+ * An empty @grp_name selects the root con_mon group, which needs no creation.
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+static int create_grp(const char *grp_name, char *grp, const char *parent_grp)
+{
+	struct dirent *entry;
+	int exists = 0;
+	DIR *dir;
+
+	/*
+	 * resctrl FS is mounted by now; an empty name means the caller wants
+	 * the root con_mon grp, so there is nothing to create.
+	 */
+	if (grp_name[0] == '\0')
+		return 0;
+
+	dir = opendir(parent_grp);
+	if (!dir) {
+		perror("Unable to open resctrl for group");
+
+		return -1;
+	}
+
+	/* Look for an existing entry carrying the requested name */
+	for (entry = readdir(dir); entry; entry = readdir(dir)) {
+		if (!strcmp(entry->d_name, grp_name))
+			exists = 1;
+	}
+	closedir(dir);
+
+	if (exists)
+		return 0;
+
+	/* Requested grp doesn't exist, hence create it */
+	if (mkdir(grp, 0) == -1) {
+		perror("Unable to create group");
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * write_pid_to_tasks - Write a PID into a tasks file
+ * @tasks:	Path of the file to open for writing
+ * @pid:	PID to write (as a decimal number followed by a newline)
+ *
+ * stdio buffers the fprintf(), so a rejected write only becomes visible when
+ * the stream is flushed; the fclose() result is therefore checked as well.
+ *
+ * Return: 0 on success, -1 on failure
+ */
+static int write_pid_to_tasks(char *tasks, pid_t pid)
+{
+	FILE *fp;
+
+	fp = fopen(tasks, "w");
+	if (!fp) {
+		perror("Failed to open tasks file");
+
+		return -1;
+	}
+	if (fprintf(fp, "%d\n", (int)pid) < 0) {
+		perror("Failed to wr pid to tasks file");
+		fclose(fp);
+
+		return -1;
+	}
+	/* The actual write() happens here; report its failure too */
+	if (fclose(fp) == EOF) {
+		perror("Failed to wr pid to tasks file");
+
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * write_bm_pid_to_resctrl - Write a PID (i.e. benchmark) to resctrl FS
+ * @bm_pid:		PID that should be written
+ * @ctrlgrp:		Name of the control monitor group (con_mon grp)
+ * @mongrp:		Name of the monitor group (mon grp)
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ *
+ * If a con_mon grp is requested, create it and write pid to it, otherwise
+ * write pid to root con_mon grp.
+ * If a mon grp is requested, create it and write pid to it, otherwise
+ * pid is not written, this means that pid is in con_mon grp and hence
+ * should consult con_mon grp's mon_data directory for results.
+ *
+ * Group names that do not fit the internal path buffers are rejected with
+ * errno set to ENAMETOOLONG instead of overflowing the buffers.
+ *
+ * Return: 0 on success, non-zero on failure
+ */
+int write_bm_pid_to_resctrl(pid_t bm_pid, char *ctrlgrp, char *mongrp,
+			    char *resctrl_val)
+{
+	char controlgroup[128], monitorgroup[512], monitorgroup_p[256];
+	char tasks[1024];
+	int len, ret = 0;
+
+	if (strlen(ctrlgrp))
+		len = snprintf(controlgroup, sizeof(controlgroup), "%s/%s",
+			       RESCTRL_PATH, ctrlgrp);
+	else
+		len = snprintf(controlgroup, sizeof(controlgroup), "%s",
+			       RESCTRL_PATH);
+	if (len < 0 || (size_t)len >= sizeof(controlgroup)) {
+		errno = ENAMETOOLONG;
+		ret = -1;
+		goto out;
+	}
+
+	/* Create control and monitoring group and write pid into it */
+	ret = create_grp(ctrlgrp, controlgroup, RESCTRL_PATH);
+	if (ret)
+		goto out;
+	snprintf(tasks, sizeof(tasks), "%s/tasks", controlgroup);
+	ret = write_pid_to_tasks(tasks, bm_pid);
+	if (ret)
+		goto out;
+
+	/* Create mon grp and write pid into it for "mbm" and "cqm" test */
+	if (!strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)) ||
+	    !strncmp(resctrl_val, MBM_STR, sizeof(MBM_STR))) {
+		if (strlen(mongrp)) {
+			snprintf(monitorgroup_p, sizeof(monitorgroup_p),
+				 "%s/mon_groups", controlgroup);
+			len = snprintf(monitorgroup, sizeof(monitorgroup),
+				       "%s/%s", monitorgroup_p, mongrp);
+			if (len < 0 || (size_t)len >= sizeof(monitorgroup)) {
+				errno = ENAMETOOLONG;
+				ret = -1;
+				goto out;
+			}
+			ret = create_grp(mongrp, monitorgroup, monitorgroup_p);
+			if (ret)
+				goto out;
+
+			snprintf(tasks, sizeof(tasks),
+				 "%s/mon_groups/%s/tasks",
+				 controlgroup, mongrp);
+			ret = write_pid_to_tasks(tasks, bm_pid);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	printf("%sok writing benchmark parameters to resctrl FS\n",
+	       ret ? "not " : "");
+	if (ret)
+		perror("# writing to resctrlfs");
+
+	tests_run++;
+
+	return ret;
+}
+
+/*
+ * write_schemata - Update schemata of a con_mon grp
+ * @ctrlgrp:		Name of the con_mon grp
+ * @schemata:		Schemata that should be updated to
+ * @cpu_no:		CPU number that the benchmark PID is binded to
+ * @resctrl_val:	Resctrl feature (Eg: mbm, mba.. etc)
+ *
+ * Update schemata of a con_mon grp *only* if requested resctrl feature is
+ * one of MBA_STR, CAT_STR or CQM_STR. CAT and CQM write an "L3:" line, MBA
+ * writes an "MB:" line.
+ *
+ * Return: 0 on success, -ENOENT if the feature takes no schemata update,
+ * other non-zero values on failure
+ */
+int write_schemata(char *ctrlgrp, char *schemata, int cpu_no, char *resctrl_val)
+{
+	/* Both strings are printed at "out"; start empty so that is always safe */
+	char controlgroup[1024], schema[1024] = "", reason[64] = "";
+	int resource_id, ret = 0;
+	FILE *fp;
+
+	if (strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)) &&
+	    strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) &&
+	    strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)))
+		return -ENOENT;
+
+	if (!schemata) {
+		printf("# Skipping empty schemata update\n");
+
+		return -1;
+	}
+
+	if (get_resource_id(cpu_no, &resource_id) < 0) {
+		snprintf(reason, sizeof(reason), "Failed to get resource id");
+		ret = -1;
+
+		goto out;
+	}
+
+	if (strlen(ctrlgrp) != 0)
+		snprintf(controlgroup, sizeof(controlgroup), "%s/%s/schemata",
+			 RESCTRL_PATH, ctrlgrp);
+	else
+		snprintf(controlgroup, sizeof(controlgroup), "%s/schemata",
+			 RESCTRL_PATH);
+
+	if (!strncmp(resctrl_val, CAT_STR, sizeof(CAT_STR)) ||
+	    !strncmp(resctrl_val, CQM_STR, sizeof(CQM_STR)))
+		snprintf(schema, sizeof(schema), "%s%d%c%s", "L3:",
+			 resource_id, '=', schemata);
+	if (!strncmp(resctrl_val, MBA_STR, sizeof(MBA_STR)))
+		snprintf(schema, sizeof(schema), "%s%d%c%s", "MB:",
+			 resource_id, '=', schemata);
+
+	fp = fopen(controlgroup, "w");
+	if (!fp) {
+		snprintf(reason, sizeof(reason), "Failed to open control group");
+		ret = -1;
+
+		goto out;
+	}
+
+	if (fprintf(fp, "%s\n", schema) < 0) {
+		snprintf(reason, sizeof(reason),
+			 "Failed to write schemata in control group");
+		fclose(fp);
+		ret = -1;
+
+		goto out;
+	}
+	/* The buffered line is only written out here; catch a rejected write */
+	if (fclose(fp) == EOF) {
+		snprintf(reason, sizeof(reason),
+			 "Failed to write schemata in control group");
+		ret = -1;
+
+		goto out;
+	}
+
+out:
+	printf("%sok Write schema \"%s\" to resctrl FS%s%s\n",
+	       ret ? "not " : "", schema, ret ? " # " : "",
+	       ret ? reason : "");
+	tests_run++;
+
+	return ret;
+}
+
+/*
+ * check_resctrlfs_support - Report whether the kernel lists the resctrl FS
+ *
+ * Prints one result line for the /proc/filesystems lookup and one for the
+ * existence of the RESCTRL_PATH directory (bumping tests_run for each), then
+ * a comment line with the current mount state.
+ *
+ * Return: true if /proc/filesystems has a "nodev\tresctrl" line, false
+ * otherwise (including when /proc/filesystems cannot be opened)
+ */
+bool check_resctrlfs_support(void)
+{
+	bool supported = false;
+	char *match;
+	FILE *inf;
+	DIR *dp;
+
+	inf = fopen("/proc/filesystems", "r");
+	if (!inf)
+		return false;
+
+	match = fgrep(inf, "nodev\tresctrl\n");
+	if (match) {
+		supported = true;
+		free(match);
+	}
+	fclose(inf);
+
+	printf("%sok kernel supports resctrl filesystem\n",
+	       supported ? "" : "not ");
+	tests_run++;
+
+	dp = opendir(RESCTRL_PATH);
+	printf("%sok resctrl mountpoint \"%s\" exists\n",
+	       dp ? "" : "not ", RESCTRL_PATH);
+	if (dp)
+		closedir(dp);
+	tests_run++;
+
+	/* A zero result from find_resctrl_mount() is reported as "is mounted" */
+	printf("# resctrl filesystem %s mounted\n",
+	       find_resctrl_mount(NULL) ? "not" : "is");
+
+	return supported;
+}
+
+/*
+ * fgrep - Find the first line of a stream that starts with a given string
+ * @inf:	Stream to read from (read position is advanced)
+ * @str:	Prefix to look for at the start of each line
+ *
+ * Lines are read in pieces of at most 255 characters.
+ *
+ * Return: a strdup()'d copy of the matching piece (caller frees), or NULL if
+ * nothing matched
+ */
+char *fgrep(FILE *inf, const char *str)
+{
+	int prefix_len = strlen(str);
+	char buf[256];
+
+	while (!feof(inf) && fgets(buf, 256, inf)) {
+		if (!strncmp(buf, str, prefix_len))
+			return strdup(buf);
+	}
+
+	return NULL;
+}
+
+/*
+ * validate_resctrl_feature_request - Check if requested feature is valid.
+ * @resctrl_val:	Requested feature
+ *
+ * Looks for @resctrl_val as a substring of the first "flags" line of
+ * /proc/cpuinfo. The whole line is read with getline(), since the flags line
+ * is normally longer than a fixed 255-character buffer.
+ *
+ * Return: true if the feature string is present in the flags line, false if
+ * it is absent or /proc/cpuinfo cannot be read
+ */
+bool validate_resctrl_feature_request(char *resctrl_val)
+{
+	bool found = false;
+	char *line = NULL;
+	size_t cap = 0;
+	FILE *inf;
+
+	if (!resctrl_val)
+		return false;
+
+	inf = fopen("/proc/cpuinfo", "r");
+	if (!inf)
+		return false;
+
+	while (getline(&line, &cap, inf) != -1) {
+		char *s;
+
+		if (strncmp(line, "flags", strlen("flags")))
+			continue;
+
+		/* Only the part after the ':' separator holds flag names */
+		s = strchr(line, ':');
+		found = s && strstr(s, resctrl_val);
+		break;
+	}
+	free(line);
+	fclose(inf);
+
+	return found;
+}
+
+/*
+ * filter_dmesg - Print the dmesg lines that mention resctrl
+ *
+ * Runs "dmesg" in a child process with its stdout connected to a pipe and
+ * echoes every line containing "intel_rdt:" or "resctrl:" prefixed with
+ * "# dmesg: ".
+ *
+ * Return: 0 on success, non-zero if the pipe, the child or the stream could
+ * not be set up
+ */
+int filter_dmesg(void)
+{
+	char line[1024];
+	FILE *fp;
+	int pipefds[2];
+	pid_t pid;
+	int ret;
+
+	ret = pipe(pipefds);
+	if (ret) {
+		perror("pipe");
+		return ret;
+	}
+	pid = fork();
+	if (pid < 0) {
+		/*
+		 * Bail out here: using pid == -1 below would make kill() and
+		 * waitpid() address every process / any child.
+		 */
+		perror("fork");
+		close(pipefds[0]);
+		close(pipefds[1]);
+
+		return -1;
+	}
+	if (pid == 0) {
+		close(pipefds[0]);
+		dup2(pipefds[1], STDOUT_FILENO);
+		execlp("dmesg", "dmesg", NULL);
+		perror("executing dmesg");
+		exit(1);
+	}
+	close(pipefds[1]);
+	fp = fdopen(pipefds[0], "r");
+	if (!fp) {
+		perror("fdopen(pipe)");
+		close(pipefds[0]);
+		kill(pid, SIGTERM);
+		/* Reap the child so it does not linger as a zombie */
+		waitpid(pid, NULL, 0);
+
+		return -1;
+	}
+
+	while (fgets(line, 1024, fp)) {
+		if (strstr(line, "intel_rdt:"))
+			printf("# dmesg: %s", line);
+		if (strstr(line, "resctrl:"))
+			printf("# dmesg: %s", line);
+	}
+	fclose(fp);
+	waitpid(pid, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * validate_bw_report_request - Check a bandwidth report type
+ * @bw_report:	Report type string; "nt-writes" is rewritten in place to
+ *		"writes"
+ *
+ * Accepted values are "reads", "writes", "nt-writes" and "total".
+ *
+ * Return: 0 if the type is accepted, -1 otherwise
+ */
+int validate_bw_report_request(char *bw_report)
+{
+	static const char * const accepted[] = { "reads", "writes", "total" };
+	size_t idx;
+
+	if (!strcmp(bw_report, "nt-writes")) {
+		strcpy(bw_report, "writes");
+		return 0;
+	}
+
+	for (idx = 0; idx < sizeof(accepted) / sizeof(accepted[0]); idx++) {
+		if (!strcmp(bw_report, accepted[idx]))
+			return 0;
+	}
+
+	fprintf(stderr, "Requested iMC B/W report type unavailable\n");
+
+	return -1;
+}
+
+/*
+ * perf_event_open - Thin wrapper around the perf_event_open system call
+ *
+ * All arguments are handed to syscall(__NR_perf_event_open, ...) unchanged.
+ *
+ * Return: the syscall() result converted to int
+ */
+int perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu,
+		    int group_fd, unsigned long flags)
+{
+	return syscall(__NR_perf_event_open, hw_event, pid, cpu,
+		       group_fd, flags);
+}
+
+/*
+ * count_bits - Count the bits set in a value
+ * @n:	Value to inspect
+ *
+ * Return: number of 1 bits in @n
+ */
+unsigned int count_bits(unsigned long n)
+{
+	unsigned int set;
+
+	/* Each pass clears the lowest set bit, so the loop runs once per bit */
+	for (set = 0; n != 0; set++)
+		n &= n - 1;
+
+	return set;
+}
diff --git a/tools/testing/selftests/rseq/.gitignore b/tools/testing/selftests/rseq/.gitignore
new file mode 100644
index 000000000..5910888eb
--- /dev/null
+++ b/tools/testing/selftests/rseq/.gitignore
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+basic_percpu_ops_test
+basic_test
+basic_rseq_op_test
+param_test
+param_test_benchmark
+param_test_compare_twice
diff --git a/tools/testing/selftests/rseq/Makefile b/tools/testing/selftests/rseq/Makefile
new file mode 100644
index 000000000..82ceca6aa
--- /dev/null
+++ b/tools/testing/selftests/rseq/Makefile
@@ -0,0 +1,40 @@
+# SPDX-License-Identifier: GPL-2.0+ OR MIT
+
+# When $(CC) identifies itself as clang, build with -no-integrated-as.
+ifneq ($(shell $(CC) --version 2>&1 | head -n 1 | grep clang),)
+CLANG_FLAGS += -no-integrated-as
+endif
+
+top_srcdir = ../../../..
+
+CFLAGS += -O2 -Wall -g -I./ -I../../../../usr/include/ -L$(OUTPUT) -Wl,-rpath=./ \
+ $(CLANG_FLAGS) -I$(top_srcdir)/tools/include
+LDLIBS += -lpthread -ldl
+
+# Own dependencies because we only want to build against 1st prerequisite, but
+# still track changes to header files and depend on shared object.
+OVERRIDE_TARGETS = 1
+
+TEST_GEN_PROGS = basic_test basic_percpu_ops_test param_test \
+ param_test_benchmark param_test_compare_twice
+
+TEST_GEN_PROGS_EXTENDED = librseq.so
+
+TEST_PROGS = run_param_test.sh
+
+TEST_FILES := settings
+
+include ../lib.mk
+
+# Shared library built from rseq.c; every test program below links it via -lrseq.
+$(OUTPUT)/librseq.so: rseq.c rseq.h rseq-*.h
+ $(CC) $(CFLAGS) -shared -fPIC $< $(LDLIBS) -o $@
+
+# Generic rule: one program per .c file, compiled from the first prerequisite only.
+$(OUTPUT)/%: %.c $(TEST_GEN_PROGS_EXTENDED) rseq.h rseq-*.h
+ $(CC) $(CFLAGS) $< $(LDLIBS) -lrseq -o $@
+
+# param_test.c rebuilt with -DBENCHMARK.
+$(OUTPUT)/param_test_benchmark: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
+ rseq.h rseq-*.h
+ $(CC) $(CFLAGS) -DBENCHMARK $< $(LDLIBS) -lrseq -o $@
+
+# param_test.c rebuilt with -DRSEQ_COMPARE_TWICE.
+$(OUTPUT)/param_test_compare_twice: param_test.c $(TEST_GEN_PROGS_EXTENDED) \
+ rseq.h rseq-*.h
+ $(CC) $(CFLAGS) -DRSEQ_COMPARE_TWICE $< $(LDLIBS) -lrseq -o $@
diff --git a/tools/testing/selftests/rseq/basic_percpu_ops_test.c b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
new file mode 100644
index 000000000..517756afc
--- /dev/null
+++ b/tools/testing/selftests/rseq/basic_percpu_ops_test.c
@@ -0,0 +1,311 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+
+#include "../kselftest.h"
+#include "rseq.h"
+
+/*
+ * One lock word per CPU slot: 0 when free, 1 when held (see
+ * rseq_this_cpu_lock() / rseq_percpu_unlock()). Entries are aligned to
+ * 128 bytes.
+ */
+struct percpu_lock_entry {
+ intptr_t v;
+} __attribute__((aligned(128)));
+
+/* Array of lock entries indexed by CPU number. */
+struct percpu_lock {
+ struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+/* Per-CPU counter slot, aligned to 128 bytes. */
+struct test_data_entry {
+ intptr_t count;
+} __attribute__((aligned(128)));
+
+/*
+ * Shared state of the spinlock test: the per-CPU lock, the per-CPU counters
+ * it protects, and the number of iterations each thread performs.
+ */
+struct spinlock_test_data {
+ struct percpu_lock lock;
+ struct test_data_entry c[CPU_SETSIZE];
+ int reps;
+};
+
+/* Singly linked list node carrying one integer payload. */
+struct percpu_list_node {
+ intptr_t data;
+ struct percpu_list_node *next;
+};
+
+/* Head pointer of one CPU's list, aligned to 128 bytes. */
+struct percpu_list_entry {
+ struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+/* Array of list heads indexed by CPU number. */
+struct percpu_list {
+ struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+/*
+ * Per-cpu spinlock acquire: spins until the lock word of the CPU reported by
+ * rseq_cpu_start() has been switched from 0 to 1.
+ *
+ * Returns the cpu whose lock was taken.
+ */
+int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu, ret;
+
+	/* Try again whenever the comparison fails or the rseq aborts. */
+	do {
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v, 0, 1, cpu);
+	} while (rseq_unlikely(ret));
+
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+
+	return cpu;
+}
+
+/* Drop the per-cpu lock previously taken on @cpu; it must currently be held. */
+void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	intptr_t *word = &lock->c[cpu].v;
+
+	assert(*word == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(word, 0);
+}
+
+/*
+ * Thread body of the spinlock test: registers with rseq, bumps the counter of
+ * whichever cpu's lock it obtains data->reps times, then unregisters.
+ * Aborts the process if registration or unregistration fails.
+ */
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_test_data *data = arg;
+	int iter;
+
+	if (rseq_register_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	iter = 0;
+	while (iter < data->reps) {
+		int cpu = rseq_this_cpu_lock(&data->lock);
+
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+		iter++;
+	}
+
+	if (rseq_unregister_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	return NULL;
+}
+
+/*
+ * A simple test which implements a sharded counter using a per-cpu
+ * lock.  Obviously real applications might prefer to simply use a
+ * per-cpu increment; however, this is reasonable for a test and the
+ * lock can be extended to synchronize more complicated operations.
+ *
+ * Starts 200 threads doing 5000 locked increments each and asserts that the
+ * per-cpu counters add up to the total. Aborts if a thread cannot be created
+ * or joined.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = 200;
+	int i, ret;
+	uint64_t sum;
+	pthread_t test_threads[num_threads];
+	struct spinlock_test_data data;
+
+	memset(&data, 0, sizeof(data));
+	data.reps = 5000;
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread, &data);
+		if (ret) {
+			/* pthread functions return the error, errno is unset */
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	sum = 0;
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)data.reps * num_threads);
+}
+
+/*
+ * Push @node onto the list of the cpu the caller is running on.
+ * If @_cpu is non-NULL it receives the cpu whose list was updated.
+ */
+void this_cpu_list_push(struct percpu_list *list,
+			struct percpu_list_node *node,
+			int *_cpu)
+{
+	int cpu, ret;
+
+	/* Retry if comparison fails or rseq aborts. */
+	do {
+		intptr_t old_head;
+
+		cpu = rseq_cpu_start();
+		/* Load list->c[cpu].head with single-copy atomicity. */
+		old_head = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+		node->next = (struct percpu_list_node *)old_head;
+		ret = rseq_cmpeqv_storev((intptr_t *)&list->c[cpu].head,
+					 old_head, (intptr_t)node, cpu);
+	} while (rseq_unlikely(ret));
+
+	if (_cpu)
+		*_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ *
+ * Returns the popped node, or NULL when rseq_cmpnev_storeoffp_load()
+ * returns a positive value. If @_cpu is non-NULL it receives the cpu the
+ * node was taken from (only written when a node is returned).
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+ int *_cpu)
+{
+ for (;;) {
+ struct percpu_list_node *head;
+ intptr_t *targetptr, expectnot, *load;
+ long offset;
+ int ret, cpu;
+
+ cpu = rseq_cpu_start();
+ targetptr = (intptr_t *)&list->c[cpu].head;
+ expectnot = (intptr_t)NULL;
+ offset = offsetof(struct percpu_list_node, next);
+ load = (intptr_t *)&head;
+ /*
+ * NOTE(review): presumably this stores head->next into the list head
+ * and hands the old head back through @load when the head is not
+ * NULL - confirm against the rseq headers.
+ */
+ ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+ offset, load, cpu);
+ if (rseq_likely(!ret)) {
+ if (_cpu)
+ *_cpu = cpu;
+ return head;
+ }
+ /* A positive result ends the attempt without a node. */
+ if (ret > 0)
+ return NULL;
+ /* Retry if rseq aborts. */
+ }
+}
+
+/*
+ * __percpu_list_pop is not safe against concurrent accesses. Should
+ * only be used on lists that are not concurrently modified.
+ *
+ * Detaches and returns the first node of @cpu's list, or NULL if that list
+ * is empty.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_entry *slot = &list->c[cpu];
+	struct percpu_list_node *first = slot->head;
+
+	if (first)
+		slot->head = first->next;
+
+	return first;
+}
+
+/*
+ * Thread body of the list test: 100000 times, pop a node from the current
+ * cpu's list, yield, and push the node back (onto whichever cpu the thread
+ * runs on by then). Aborts if rseq registration or unregistration fails.
+ */
+void *test_percpu_list_thread(void *arg)
+{
+	struct percpu_list *list = (struct percpu_list *)arg;
+	int round = 0;
+
+	if (rseq_register_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	while (round < 100000) {
+		struct percpu_list_node *node = this_cpu_list_pop(list, NULL);
+
+		sched_yield();  /* encourage shuffling */
+		if (node != NULL)
+			this_cpu_list_push(list, node, NULL);
+		round++;
+	}
+
+	if (rseq_unregister_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		abort();
+	}
+
+	return NULL;
+}
+
+/*
+ * Simultaneous modification to a per-cpu linked list from many threads.
+ *
+ * Seeds every allowed cpu's list with nodes 1..100, lets 200 threads shuffle
+ * nodes between lists, then drains the lists and asserts that the payload sum
+ * is unchanged. Aborts if the affinity mask cannot be read or a thread cannot
+ * be created or joined.
+ */
+void test_percpu_list(void)
+{
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[200];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/* Generate list entries for every usable cpu. */
+	if (sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus)) {
+		/* allowed_cpus would be uninitialized below */
+		perror("sched_getaffinity");
+		abort();
+	}
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < 200; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			/* pthread functions return the error, errno is unset */
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < 200; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+/*
+ * Registers the main thread with rseq, runs the spinlock and list tests, and
+ * unregisters again. Returns 0 on success, -1 if either rseq call fails.
+ */
+int main(int argc, char **argv)
+{
+	if (rseq_register_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	printf("spinlock\n");
+	test_percpu_spinlock();
+	printf("percpu_list\n");
+	test_percpu_list();
+
+	if (rseq_unregister_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/rseq/basic_test.c b/tools/testing/selftests/rseq/basic_test.c
new file mode 100644
index 000000000..d8efbfb89
--- /dev/null
+++ b/tools/testing/selftests/rseq/basic_test.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * Basic test coverage for critical regions and rseq_current_cpu().
+ */
+
+#define _GNU_SOURCE
+#include <assert.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+
+#include "rseq.h"
+
+/*
+ * Pins the thread to each cpu of its original affinity mask in turn and
+ * asserts that sched_getcpu() and the rseq cpu accessors all report that cpu.
+ * The original affinity mask is restored at the end.
+ */
+void test_cpu_pointer(void)
+{
+	cpu_set_t saved, single;
+	int cpu;
+
+	sched_getaffinity(0, sizeof(saved), &saved);
+	CPU_ZERO(&single);
+
+	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		if (!CPU_ISSET(cpu, &saved))
+			continue;
+
+		/* Restrict the thread to exactly this cpu */
+		CPU_SET(cpu, &single);
+		sched_setaffinity(0, sizeof(single), &single);
+
+		assert(sched_getcpu() == cpu);
+		assert(rseq_current_cpu() == cpu);
+		assert(rseq_current_cpu_raw() == cpu);
+		assert(rseq_cpu_start() == cpu);
+
+		CPU_CLR(cpu, &single);
+	}
+
+	sched_setaffinity(0, sizeof(saved), &saved);
+}
+
+/*
+ * Registers the main thread with rseq, runs the current-cpu test, and
+ * unregisters again. Returns 0 on success, -1 if either rseq call fails.
+ */
+int main(int argc, char **argv)
+{
+	if (rseq_register_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	printf("testing current cpu\n");
+	test_cpu_pointer();
+
+	if (rseq_unregister_current_thread() != 0) {
+		fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+			errno, strerror(errno));
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/rseq/compiler.h b/tools/testing/selftests/rseq/compiler.h
new file mode 100644
index 000000000..876eb6a7f
--- /dev/null
+++ b/tools/testing/selftests/rseq/compiler.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-only OR MIT */
+/*
+ * rseq/compiler.h
+ *
+ * Work-around asm goto compiler bugs.
+ *
+ * (C) Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef RSEQ_COMPILER_H
+#define RSEQ_COMPILER_H
+
+/*
+ * gcc prior to 4.8.2 miscompiles asm goto.
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
+ *
+ * gcc prior to 8.1.0 miscompiles asm goto at O1.
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103908
+ *
+ * clang prior to version 13.0.1 miscompiles asm goto at O2.
+ * https://github.com/llvm/llvm-project/issues/52735
+ *
+ * Work around these issues by adding a volatile inline asm with
+ * memory clobber in the fallthrough after the asm goto and at each
+ * label target. Emit this for all compilers in case other similar
+ * issues are found in the future.
+ */
+#define rseq_after_asm_goto() asm volatile ("" : : : "memory")
+
+#endif /* RSEQ_COMPILER_H */
diff --git a/tools/testing/selftests/rseq/param_test.c b/tools/testing/selftests/rseq/param_test.c
new file mode 100644
index 000000000..e29ecc715
--- /dev/null
+++ b/tools/testing/selftests/rseq/param_test.c
@@ -0,0 +1,1550 @@
+// SPDX-License-Identifier: LGPL-2.1
+#define _GNU_SOURCE
+#include <assert.h>
+#include <linux/membarrier.h>
+#include <pthread.h>
+#include <sched.h>
+#include <stdatomic.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <signal.h>
+#include <errno.h>
+#include <stddef.h>
+
+/* Return the calling thread's id as reported by the gettid system call. */
+static inline pid_t rseq_gettid(void)
+{
+	pid_t tid = syscall(__NR_gettid);
+
+	return tid;
+}
+
+/*
+ * Delay-loop iteration counts, read by RSEQ_INJECT_C(n) and by the
+ * non-x86 RSEQ_INJECT_ASM(n) variants (entries 1..6).
+ */
+#define NR_INJECT 9
+static int loop_cnt[NR_INJECT + 1];
+
+/*
+ * Copies of the loop counts under fixed assembler names (asm_loop_cnt_<n>),
+ * which the i386/x86_64 RSEQ_INJECT_ASM(n) variants reference directly.
+ */
+static int loop_cnt_1 asm("asm_loop_cnt_1") __attribute__((used));
+static int loop_cnt_2 asm("asm_loop_cnt_2") __attribute__((used));
+static int loop_cnt_3 asm("asm_loop_cnt_3") __attribute__((used));
+static int loop_cnt_4 asm("asm_loop_cnt_4") __attribute__((used));
+static int loop_cnt_5 asm("asm_loop_cnt_5") __attribute__((used));
+static int loop_cnt_6 asm("asm_loop_cnt_6") __attribute__((used));
+
+/* opt_modulo: period used by RSEQ_INJECT_C(); verbose: gates printf_verbose() */
+static int opt_modulo, verbose;
+
+/* Run-time options and their defaults (200 threads, test 's'). */
+static int opt_yield, opt_signal, opt_sleep,
+ opt_disable_rseq, opt_threads = 200,
+ opt_disable_mod = 0, opt_test = 's', opt_mb = 0;
+
+/* Iterations per thread: 5000 by default, 100 when RSEQ_SKIP_FASTPATH is set. */
+#ifndef RSEQ_SKIP_FASTPATH
+static long long opt_reps = 5000;
+#else
+static long long opt_reps = 100;
+#endif
+
+/* Per-thread count of delivered signals, reported by the test threads. */
+static __thread __attribute__((tls_model("initial-exec")))
+unsigned int signals_delivered;
+
+/*
+ * Everything down to the matching #else is compiled only for non-BENCHMARK
+ * builds: the delay/abort injection hooks and the verbose printf helper.
+ */
+#ifndef BENCHMARK
+
+/* Per-thread state: position within the opt_modulo period, and abort count. */
+static __thread __attribute__((tls_model("initial-exec"), unused))
+unsigned int yield_mod_cnt, nr_abort;
+
+/* printf() that only prints when the global 'verbose' flag is set. */
+#define printf_verbose(fmt, ...) \
+ do { \
+ if (verbose) \
+ printf(fmt, ## __VA_ARGS__); \
+ } while (0)
+
+/*
+ * Per-architecture RSEQ_INJECT_ASM(n): load the n-th loop count into a
+ * scratch register, skip ahead (label 333) if it is zero, otherwise count it
+ * down to zero in a busy loop (label 222). RSEQ_INJECT_CLOBBER names the
+ * scratch register(s); RSEQ_INJECT_INPUT, where defined, passes loop_cnt[1..6]
+ * as asm operands.
+ */
+#ifdef __i386__
+
+#define INJECT_ASM_REG "eax"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "mov asm_loop_cnt_" #n ", %%" INJECT_ASM_REG "\n\t" \
+ "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+ "jz 333f\n\t" \
+ "222:\n\t" \
+ "dec %%" INJECT_ASM_REG "\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t"
+
+#elif defined(__x86_64__)
+
+#define INJECT_ASM_REG_P "rax"
+#define INJECT_ASM_REG "eax"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG_P \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "lea asm_loop_cnt_" #n "(%%rip), %%" INJECT_ASM_REG_P "\n\t" \
+ "mov (%%" INJECT_ASM_REG_P "), %%" INJECT_ASM_REG "\n\t" \
+ "test %%" INJECT_ASM_REG ",%%" INJECT_ASM_REG "\n\t" \
+ "jz 333f\n\t" \
+ "222:\n\t" \
+ "dec %%" INJECT_ASM_REG "\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t"
+
+#elif defined(__s390__)
+
+#define RSEQ_INJECT_INPUT \
+ , [loop_cnt_1]"m"(loop_cnt[1]) \
+ , [loop_cnt_2]"m"(loop_cnt[2]) \
+ , [loop_cnt_3]"m"(loop_cnt[3]) \
+ , [loop_cnt_4]"m"(loop_cnt[4]) \
+ , [loop_cnt_5]"m"(loop_cnt[5]) \
+ , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "r12"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "l %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+ "ltr %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG "\n\t" \
+ "je 333f\n\t" \
+ "222:\n\t" \
+ "ahi %%" INJECT_ASM_REG ", -1\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t"
+
+#elif defined(__ARMEL__)
+
+#define RSEQ_INJECT_INPUT \
+ , [loop_cnt_1]"m"(loop_cnt[1]) \
+ , [loop_cnt_2]"m"(loop_cnt[2]) \
+ , [loop_cnt_3]"m"(loop_cnt[3]) \
+ , [loop_cnt_4]"m"(loop_cnt[4]) \
+ , [loop_cnt_5]"m"(loop_cnt[5]) \
+ , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "r4"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+ "cmp " INJECT_ASM_REG ", #0\n\t" \
+ "beq 333f\n\t" \
+ "222:\n\t" \
+ "subs " INJECT_ASM_REG ", #1\n\t" \
+ "bne 222b\n\t" \
+ "333:\n\t"
+
+#elif defined(__AARCH64EL__)
+
+#define RSEQ_INJECT_INPUT \
+ , [loop_cnt_1] "Qo" (loop_cnt[1]) \
+ , [loop_cnt_2] "Qo" (loop_cnt[2]) \
+ , [loop_cnt_3] "Qo" (loop_cnt[3]) \
+ , [loop_cnt_4] "Qo" (loop_cnt[4]) \
+ , [loop_cnt_5] "Qo" (loop_cnt[5]) \
+ , [loop_cnt_6] "Qo" (loop_cnt[6])
+
+#define INJECT_ASM_REG RSEQ_ASM_TMP_REG32
+
+#define RSEQ_INJECT_ASM(n) \
+ " ldr " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n" \
+ " cbz " INJECT_ASM_REG ", 333f\n" \
+ "222:\n" \
+ " sub " INJECT_ASM_REG ", " INJECT_ASM_REG ", #1\n" \
+ " cbnz " INJECT_ASM_REG ", 222b\n" \
+ "333:\n"
+
+#elif defined(__PPC__)
+
+#define RSEQ_INJECT_INPUT \
+ , [loop_cnt_1]"m"(loop_cnt[1]) \
+ , [loop_cnt_2]"m"(loop_cnt[2]) \
+ , [loop_cnt_3]"m"(loop_cnt[3]) \
+ , [loop_cnt_4]"m"(loop_cnt[4]) \
+ , [loop_cnt_5]"m"(loop_cnt[5]) \
+ , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "r18"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "lwz %%" INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+ "cmpwi %%" INJECT_ASM_REG ", 0\n\t" \
+ "beq 333f\n\t" \
+ "222:\n\t" \
+ "subic. %%" INJECT_ASM_REG ", %%" INJECT_ASM_REG ", 1\n\t" \
+ "bne 222b\n\t" \
+ "333:\n\t"
+
+#elif defined(__mips__)
+
+#define RSEQ_INJECT_INPUT \
+ , [loop_cnt_1]"m"(loop_cnt[1]) \
+ , [loop_cnt_2]"m"(loop_cnt[2]) \
+ , [loop_cnt_3]"m"(loop_cnt[3]) \
+ , [loop_cnt_4]"m"(loop_cnt[4]) \
+ , [loop_cnt_5]"m"(loop_cnt[5]) \
+ , [loop_cnt_6]"m"(loop_cnt[6])
+
+#define INJECT_ASM_REG "$5"
+
+#define RSEQ_INJECT_CLOBBER \
+ , INJECT_ASM_REG
+
+#define RSEQ_INJECT_ASM(n) \
+ "lw " INJECT_ASM_REG ", %[loop_cnt_" #n "]\n\t" \
+ "beqz " INJECT_ASM_REG ", 333f\n\t" \
+ "222:\n\t" \
+ "addiu " INJECT_ASM_REG ", -1\n\t" \
+ "bnez " INJECT_ASM_REG ", 222b\n\t" \
+ "333:\n\t"
+
+#else
+#error unsupported target
+#endif
+
+/* Hook that bumps the per-thread nr_abort counter. */
+#define RSEQ_INJECT_FAILED \
+ nr_abort++;
+
+/*
+ * C-level injection point n: spin loop_cnt[n] times; when loop_cnt[n] is -1
+ * and opt_modulo is set, every opt_modulo-th call additionally sleeps
+ * (opt_sleep ms via poll), yields and/or raises SIGUSR1 as selected by the
+ * options.
+ */
+#define RSEQ_INJECT_C(n) \
+{ \
+ int loc_i, loc_nr_loops = loop_cnt[n]; \
+ \
+ for (loc_i = 0; loc_i < loc_nr_loops; loc_i++) { \
+ rseq_barrier(); \
+ } \
+ if (loc_nr_loops == -1 && opt_modulo) { \
+ if (yield_mod_cnt == opt_modulo - 1) { \
+ if (opt_sleep > 0) \
+ poll(NULL, 0, opt_sleep); \
+ if (opt_yield) \
+ sched_yield(); \
+ if (opt_signal) \
+ raise(SIGUSR1); \
+ yield_mod_cnt = 0; \
+ } else { \
+ yield_mod_cnt++; \
+ } \
+ } \
+}
+
+#else
+
+/* BENCHMARK build: verbose output is compiled out entirely. */
+#define printf_verbose(fmt, ...)
+
+#endif /* BENCHMARK */
+
+#include "rseq.h"
+
+/*
+ * One lock word per CPU slot: 0 when free, 1 when held (see
+ * rseq_this_cpu_lock() / rseq_percpu_unlock()). Entries are aligned to
+ * 128 bytes.
+ */
+struct percpu_lock_entry {
+ intptr_t v;
+} __attribute__((aligned(128)));
+
+/* Array of lock entries indexed by CPU number. */
+struct percpu_lock {
+ struct percpu_lock_entry c[CPU_SETSIZE];
+};
+
+/* Per-CPU counter slot, aligned to 128 bytes. */
+struct test_data_entry {
+ intptr_t count;
+} __attribute__((aligned(128)));
+
+/* Shared state of the spinlock test: per-CPU lock plus the counters it guards. */
+struct spinlock_test_data {
+ struct percpu_lock lock;
+ struct test_data_entry c[CPU_SETSIZE];
+};
+
+/*
+ * Per-thread argument of the spinlock test: shared data, iteration count, and
+ * whether this thread registers with rseq (reg != 0).
+ */
+struct spinlock_thread_test_data {
+ struct spinlock_test_data *data;
+ long long reps;
+ int reg;
+};
+
+/* Shared state of the increment test: one counter per CPU. */
+struct inc_test_data {
+ struct test_data_entry c[CPU_SETSIZE];
+};
+
+/*
+ * Per-thread argument of the increment test: shared data, iteration count,
+ * and whether this thread registers with rseq (reg != 0).
+ */
+struct inc_thread_test_data {
+ struct inc_test_data *data;
+ long long reps;
+ int reg;
+};
+
+/* Singly linked list node carrying one integer payload. */
+struct percpu_list_node {
+ intptr_t data;
+ struct percpu_list_node *next;
+};
+
+/* Head pointer of one CPU's list, aligned to 128 bytes. */
+struct percpu_list_entry {
+ struct percpu_list_node *head;
+} __attribute__((aligned(128)));
+
+/* Array of list heads indexed by CPU number. */
+struct percpu_list {
+ struct percpu_list_entry c[CPU_SETSIZE];
+};
+
+#define BUFFER_ITEM_PER_CPU 100
+
+/* Element referenced from a per-CPU pointer buffer. */
+struct percpu_buffer_node {
+ intptr_t data;
+};
+
+/*
+ * One CPU's buffer of node pointers.
+ * NOTE(review): offset/buflen look like fill level and capacity of 'array' -
+ * confirm against the push/pop code.
+ */
+struct percpu_buffer_entry {
+ intptr_t offset;
+ intptr_t buflen;
+ struct percpu_buffer_node **array;
+} __attribute__((aligned(128)));
+
+/* Array of pointer buffers indexed by CPU number. */
+struct percpu_buffer {
+ struct percpu_buffer_entry c[CPU_SETSIZE];
+};
+
+#define MEMCPY_BUFFER_ITEM_PER_CPU 100
+
+/* Element stored by value in a per-CPU memcpy buffer. */
+struct percpu_memcpy_buffer_node {
+ intptr_t data1;
+ uint64_t data2;
+};
+
+/*
+ * One CPU's buffer of nodes stored by value.
+ * NOTE(review): offset/buflen look like fill level and capacity of 'array' -
+ * confirm against the push/pop code.
+ */
+struct percpu_memcpy_buffer_entry {
+ intptr_t offset;
+ intptr_t buflen;
+ struct percpu_memcpy_buffer_node *array;
+} __attribute__((aligned(128)));
+
+/* Array of by-value buffers indexed by CPU number. */
+struct percpu_memcpy_buffer {
+ struct percpu_memcpy_buffer_entry c[CPU_SETSIZE];
+};
+
+/*
+ * Per-cpu spinlock acquire: spins until the lock word of the CPU reported by
+ * rseq_cpu_start() has been switched from 0 to 1.
+ *
+ * Returns the cpu whose lock was taken.
+ */
+static int rseq_this_cpu_lock(struct percpu_lock *lock)
+{
+	int cpu, ret;
+
+	/* Try again whenever the comparison fails or the rseq aborts. */
+	do {
+		cpu = rseq_cpu_start();
+		ret = rseq_cmpeqv_storev(&lock->c[cpu].v, 0, 1, cpu);
+	} while (rseq_unlikely(ret));
+
+	/*
+	 * Acquire semantic when taking lock after control dependency.
+	 * Matches rseq_smp_store_release().
+	 */
+	rseq_smp_acquire__after_ctrl_dep();
+
+	return cpu;
+}
+
+/* Drop the per-cpu lock previously taken on @cpu; it must currently be held. */
+static void rseq_percpu_unlock(struct percpu_lock *lock, int cpu)
+{
+	intptr_t *word = &lock->c[cpu].v;
+
+	assert(*word == 1);
+	/*
+	 * Release lock, with release semantic. Matches
+	 * rseq_smp_acquire__after_ctrl_dep().
+	 */
+	rseq_smp_store_release(word, 0);
+}
+
+/*
+ * Thread body of the per-cpu spinlock test.
+ * @arg: struct spinlock_thread_test_data for this thread
+ *
+ * Registers with rseq unless disabled globally (opt_disable_rseq) or for this
+ * thread (reg == 0), performs 'reps' locked increments of the counter of the
+ * cpu whose lock was obtained, and unregisters again. Aborts the process if
+ * registration or unregistration fails.
+ */
+void *test_percpu_spinlock_thread(void *arg)
+{
+	struct spinlock_thread_test_data *thread_data = arg;
+	struct spinlock_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int cpu = rseq_this_cpu_lock(&data->lock);
+
+		data->c[cpu].count++;
+		rseq_percpu_unlock(&data->lock, cpu);
+#ifndef BENCHMARK
+		/*
+		 * Progress line every tenth of the run. reps / 10 is 0 for
+		 * reps < 10, so skip the modulo then instead of dividing by
+		 * zero.
+		 */
+		if (i != 0 && reps >= 10 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n",
+				       (int) rseq_gettid(), i);
+#endif
+	}
+	/* nr_abort is unsigned, hence %u */
+	printf_verbose("tid %d: number of rseq abort: %u, signals delivered: %u\n",
+		       (int) rseq_gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * Sharded-counter test built on the per-cpu lock: opt_threads threads each
+ * perform opt_reps locked increments, and the per-cpu counters must add up to
+ * opt_reps * opt_threads. Real applications might prefer a plain per-cpu
+ * increment; the lock is used here because it can be extended to guard more
+ * complicated operations.
+ *
+ * With opt_disable_mod > 0, every thread whose index is a multiple of it
+ * runs without registering with rseq.
+ */
+void test_percpu_spinlock(void)
+{
+	const int num_threads = opt_threads;
+	pthread_t test_threads[num_threads];
+	struct spinlock_thread_test_data thread_data[num_threads];
+	struct spinlock_test_data data;
+	uint64_t sum = 0;
+	int i, ret;
+
+	memset(&data, 0, sizeof(data));
+
+	for (i = 0; i < num_threads; i++) {
+		struct spinlock_thread_test_data *td = &thread_data[i];
+
+		td->reps = opt_reps;
+		td->reg = (opt_disable_mod <= 0 || (i % opt_disable_mod)) ? 1 : 0;
+		td->data = &data;
+
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_spinlock_thread, td);
+		if (ret != 0) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret != 0) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+/*
+ * Thread body of the per-cpu increment test.
+ * @arg: struct inc_thread_test_data for this thread
+ *
+ * Registers with rseq unless disabled globally (opt_disable_rseq) or for this
+ * thread (reg == 0), performs 'reps' rseq_addv() increments of the current
+ * cpu's counter (retrying each until it succeeds), and unregisters again.
+ * Aborts the process if registration or unregistration fails.
+ */
+void *test_percpu_inc_thread(void *arg)
+{
+	struct inc_thread_test_data *thread_data = arg;
+	struct inc_test_data *data = thread_data->data;
+	long long i, reps;
+
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_register_current_thread())
+		abort();
+	reps = thread_data->reps;
+	for (i = 0; i < reps; i++) {
+		int ret;
+
+		do {
+			int cpu;
+
+			cpu = rseq_cpu_start();
+			ret = rseq_addv(&data->c[cpu].count, 1, cpu);
+		} while (rseq_unlikely(ret));
+#ifndef BENCHMARK
+		/*
+		 * Progress line every tenth of the run. reps / 10 is 0 for
+		 * reps < 10, so skip the modulo then instead of dividing by
+		 * zero.
+		 */
+		if (i != 0 && reps >= 10 && !(i % (reps / 10)))
+			printf_verbose("tid %d: count %lld\n",
+				       (int) rseq_gettid(), i);
+#endif
+	}
+	/* nr_abort is unsigned, hence %u */
+	printf_verbose("tid %d: number of rseq abort: %u, signals delivered: %u\n",
+		       (int) rseq_gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && thread_data->reg &&
+	    rseq_unregister_current_thread())
+		abort();
+	return NULL;
+}
+
+/*
+ * Per-cpu counter increment test: starts opt_threads workers running
+ * test_percpu_inc_thread(), joins them, then asserts that the per-cpu
+ * counts add up to opt_reps increments per worker. Aborts on any pthread
+ * failure.
+ */
+void test_percpu_inc(void)
+{
+	const int num_threads = opt_threads;
+	struct inc_thread_test_data thread_data[num_threads];
+	pthread_t test_threads[num_threads];
+	struct inc_test_data data;
+	uint64_t sum = 0;
+	int i, ret;
+
+	memset(&data, 0, sizeof(data));
+
+	for (i = 0; i < num_threads; i++) {
+		struct inc_thread_test_data *td = &thread_data[i];
+
+		td->reps = opt_reps;
+		/* When opt_disable_mod > 0, workers whose index is a multiple of it get reg = 0. */
+		td->reg = (opt_disable_mod <= 0 || (i % opt_disable_mod)) ? 1 : 0;
+		td->data = &data;
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_inc_thread, td);
+		if (!ret)
+			continue;
+		errno = ret;
+		perror("pthread_create");
+		abort();
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (!ret)
+			continue;
+		errno = ret;
+		perror("pthread_join");
+		abort();
+	}
+
+	/* Fold every per-cpu counter into the grand total. */
+	for (i = 0; i < CPU_SETSIZE; i++)
+		sum += data.c[i].count;
+
+	assert(sum == (uint64_t)opt_reps * num_threads);
+}
+
+/*
+ * Push node onto the list head indexed by the CPU returned from
+ * rseq_cpu_start(), looping until rseq_cmpeqv_storev() returns 0.
+ *
+ * list: per-cpu list to push onto.
+ * node: node to insert; its next pointer is rewritten on every attempt.
+ * _cpu: optional out-parameter; when non-NULL it receives the CPU index
+ *       used by the successful attempt.
+ */
+void this_cpu_list_push(struct percpu_list *list,
+ struct percpu_list_node *node,
+ int *_cpu)
+{
+ int cpu;
+
+ for (;;) {
+ intptr_t *targetptr, newval, expect;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ /* Load list->c[cpu].head with single-copy atomicity. */
+ expect = (intptr_t)RSEQ_READ_ONCE(list->c[cpu].head);
+ newval = (intptr_t)node;
+ targetptr = (intptr_t *)&list->c[cpu].head;
+ /* Chain the new node in front of the head snapshot before trying to publish it. */
+ node->next = (struct percpu_list_node *)expect;
+ ret = rseq_cmpeqv_storev(targetptr, expect, newval, cpu);
+ if (rseq_likely(!ret))
+ break;
+ /* Retry if comparison fails or rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+}
+
+/*
+ * Unlike a traditional lock-less linked list; the availability of a
+ * rseq primitive allows us to implement pop without concerns over
+ * ABA-type races.
+ *
+ * Returns the node handed back by rseq_cmpnev_storeoffp_load() when it
+ * returns 0, or NULL when it returns a positive value (presumably the head
+ * compared equal to NULL, i.e. the list for this CPU is empty -- confirm
+ * against the rseq headers). A negative return is retried.
+ * When _cpu is non-NULL it receives the CPU index of the last attempt.
+ */
+struct percpu_list_node *this_cpu_list_pop(struct percpu_list *list,
+ int *_cpu)
+{
+ struct percpu_list_node *node = NULL;
+ int cpu;
+
+ for (;;) {
+ struct percpu_list_node *head;
+ intptr_t *targetptr, expectnot, *load;
+ long offset;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ targetptr = (intptr_t *)&list->c[cpu].head;
+ expectnot = (intptr_t)NULL;
+ /* Byte offset of the next pointer inside a node, passed to the rseq helper. */
+ offset = offsetof(struct percpu_list_node, next);
+ /* The helper is given the address of the local 'head' as its load target. */
+ load = (intptr_t *)&head;
+ ret = rseq_cmpnev_storeoffp_load(targetptr, expectnot,
+ offset, load, cpu);
+ if (rseq_likely(!ret)) {
+ node = head;
+ break;
+ }
+ if (ret > 0)
+ break;
+ /* Retry if rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+ return node;
+}
+
+/*
+ * Detach and return the first node of the list for the given cpu, or NULL
+ * if that list is empty. Uses plain loads and stores: not safe against
+ * concurrent accesses, so only use it on lists that are not concurrently
+ * modified.
+ */
+struct percpu_list_node *__percpu_list_pop(struct percpu_list *list, int cpu)
+{
+	struct percpu_list_node *first = list->c[cpu].head;
+
+	if (first)
+		list->c[cpu].head = first->next;
+	return first;
+}
+
+/*
+ * Worker for test_percpu_list(): opt_reps times, pops a node from the
+ * per-cpu list, optionally yields, and pushes the node back if one was
+ * obtained.
+ *
+ * arg: the shared struct percpu_list.
+ * Returns NULL. Aborts if rseq registration or unregistration fails.
+ */
+void *test_percpu_list_thread(void *arg)
+{
+	struct percpu_list *list = arg;
+	long long left;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	for (left = opt_reps; left > 0; left--) {
+		struct percpu_list_node *node = this_cpu_list_pop(list, NULL);
+
+		if (opt_yield)
+			sched_yield(); /* encourage shuffling */
+		if (!node)
+			continue;
+		this_cpu_list_push(list, node, NULL);
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) rseq_gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/*
+ * Simultaneous modification to a per-cpu linked list from many threads.
+ *
+ * Seeds the list of every CPU in the process affinity mask with 100 nodes
+ * holding the values 1..100, lets opt_threads workers run
+ * test_percpu_list_thread(), then drains those lists and asserts that the
+ * sum of the node values is unchanged. Aborts on pthread or
+ * sched_getaffinity() failure.
+ */
+void test_percpu_list(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_list list;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&list, 0, sizeof(list));
+
+	/*
+	 * Generate list entries for every usable cpu. The affinity mask must
+	 * be valid: if the call failed, allowed_cpus would be read
+	 * uninitialized by the CPU_ISSET() checks below.
+	 */
+	if (sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus)) {
+		perror("sched_getaffinity");
+		abort();
+	}
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		for (j = 1; j <= 100; j++) {
+			struct percpu_list_node *node;
+
+			expected_sum += j;
+
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			node->next = list.c[i].head;
+			list.c[i].head = node;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_list_thread, &list);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	/* Workers are gone, so the non-concurrent pop is safe for draining. */
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_list_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_list_pop(&list, i))) {
+			sum += node->data;
+			free(node);
+		}
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+/*
+ * Push node onto the pointer buffer indexed by the CPU returned from
+ * rseq_cpu_start().
+ *
+ * Returns true once the rseq helper returns 0, or false when that CPU's
+ * offset already equals its buflen (no free slot). A non-zero helper
+ * return is retried. With opt_mb set the _release variant of the helper
+ * is used. When _cpu is non-NULL it receives the CPU index of the last
+ * attempt.
+ */
+bool this_cpu_buffer_push(struct percpu_buffer *buffer,
+ struct percpu_buffer_node *node,
+ int *_cpu)
+{
+ bool result = false;
+ int cpu;
+
+ for (;;) {
+ intptr_t *targetptr_spec, newval_spec;
+ intptr_t *targetptr_final, newval_final;
+ intptr_t offset;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+ if (offset == buffer->c[cpu].buflen)
+ break;
+ /* "spec" pair: the node pointer and the slot at array[offset]. */
+ newval_spec = (intptr_t)node;
+ targetptr_spec = (intptr_t *)&buffer->c[cpu].array[offset];
+ /* "final" pair: the incremented offset and the offset field itself. */
+ newval_final = offset + 1;
+ targetptr_final = &buffer->c[cpu].offset;
+ if (opt_mb)
+ ret = rseq_cmpeqv_trystorev_storev_release(
+ targetptr_final, offset, targetptr_spec,
+ newval_spec, newval_final, cpu);
+ else
+ ret = rseq_cmpeqv_trystorev_storev(targetptr_final,
+ offset, targetptr_spec, newval_spec,
+ newval_final, cpu);
+ if (rseq_likely(!ret)) {
+ result = true;
+ break;
+ }
+ /* Retry if comparison fails or rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+ return result;
+}
+
+/*
+ * Pop the top node pointer from the buffer indexed by the CPU returned
+ * from rseq_cpu_start().
+ *
+ * Returns the node read from array[offset - 1] once
+ * rseq_cmpeqv_cmpeqv_storev() returns 0, or NULL when that CPU's offset is
+ * 0. A non-zero helper return is retried. When _cpu is non-NULL it
+ * receives the CPU index of the last attempt.
+ */
+struct percpu_buffer_node *this_cpu_buffer_pop(struct percpu_buffer *buffer,
+ int *_cpu)
+{
+ struct percpu_buffer_node *head;
+ int cpu;
+
+ for (;;) {
+ intptr_t *targetptr, newval;
+ intptr_t offset;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ /* Load offset with single-copy atomicity. */
+ offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+ if (offset == 0) {
+ head = NULL;
+ break;
+ }
+ head = RSEQ_READ_ONCE(buffer->c[cpu].array[offset - 1]);
+ newval = offset - 1;
+ targetptr = (intptr_t *)&buffer->c[cpu].offset;
+ /* Both the offset and the top slot are handed to the helper along with the values read above. */
+ ret = rseq_cmpeqv_cmpeqv_storev(targetptr, offset,
+ (intptr_t *)&buffer->c[cpu].array[offset - 1],
+ (intptr_t)head, newval, cpu);
+ if (rseq_likely(!ret))
+ break;
+ /* Retry if comparison fails or rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+ return head;
+}
+
+/*
+ * Remove and return the top node pointer of the buffer for the given cpu,
+ * or NULL if its offset is 0. Uses plain loads and stores: not safe
+ * against concurrent accesses, so only use it on buffers that are not
+ * concurrently modified.
+ */
+struct percpu_buffer_node *__percpu_buffer_pop(struct percpu_buffer *buffer,
+					       int cpu)
+{
+	const intptr_t top = buffer->c[cpu].offset;
+	struct percpu_buffer_node *node;
+
+	if (top == 0)
+		return NULL;
+	node = buffer->c[cpu].array[top - 1];
+	buffer->c[cpu].offset = top - 1;
+	return node;
+}
+
+/*
+ * Worker for test_percpu_buffer(): opt_reps times, pops a node pointer
+ * from the per-cpu buffer, optionally yields, and pushes it back if one
+ * was obtained. Aborts if the push reports failure.
+ *
+ * arg: the shared struct percpu_buffer.
+ * Returns NULL. Aborts if rseq registration or unregistration fails.
+ */
+void *test_percpu_buffer_thread(void *arg)
+{
+	struct percpu_buffer *buffer = arg;
+	long long left;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	for (left = opt_reps; left > 0; left--) {
+		struct percpu_buffer_node *node = this_cpu_buffer_pop(buffer, NULL);
+
+		if (opt_yield)
+			sched_yield(); /* encourage shuffling */
+		/* Should increase buffer size. */
+		if (node && !this_cpu_buffer_push(buffer, node, NULL))
+			abort();
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) rseq_gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/*
+ * Simultaneous modification to a per-cpu buffer from many threads.
+ *
+ * For every CPU in the process affinity mask, allocates a pointer array of
+ * CPU_SETSIZE * BUFFER_ITEM_PER_CPU slots and fills the first
+ * BUFFER_ITEM_PER_CPU of them with nodes holding 1..BUFFER_ITEM_PER_CPU.
+ * Then lets opt_threads workers run test_percpu_buffer_thread(), drains
+ * the buffers and asserts that the sum of the node values is unchanged.
+ * Aborts on pthread or sched_getaffinity() failure.
+ */
+void test_percpu_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/*
+	 * Generate buffer entries for every usable cpu. The affinity mask
+	 * must be valid: if the call failed, allowed_cpus would be read
+	 * uninitialized by the CPU_ISSET() checks below.
+	 */
+	if (sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus)) {
+		perror("sched_getaffinity");
+		abort();
+	}
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worst case is every item ending up on the same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= BUFFER_ITEM_PER_CPU; j++) {
+			struct percpu_buffer_node *node;
+
+			expected_sum += j;
+
+			/*
+			 * We could theoretically put the word-sized
+			 * "data" directly in the buffer. However, we
+			 * want to model objects that would not fit
+			 * within a single word, so allocate an object
+			 * for each node.
+			 */
+			node = malloc(sizeof(*node));
+			assert(node);
+			node->data = j;
+			buffer.c[i].array[j - 1] = node;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_buffer_thread, &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	/* Workers are gone, so the non-concurrent pop is safe for draining. */
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_buffer_node *node;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while ((node = __percpu_buffer_pop(&buffer, i))) {
+			sum += node->data;
+			free(node);
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+/*
+ * Push a copy of item onto the by-value buffer indexed by the CPU returned
+ * from rseq_cpu_start().
+ *
+ * Returns true once the rseq helper returns 0, or false when that CPU's
+ * offset already equals its buflen (no free slot). A non-zero helper
+ * return is retried. With opt_mb set the _release variant of the helper
+ * is used. When _cpu is non-NULL it receives the CPU index of the last
+ * attempt.
+ */
+bool this_cpu_memcpy_buffer_push(struct percpu_memcpy_buffer *buffer,
+ struct percpu_memcpy_buffer_node item,
+ int *_cpu)
+{
+ bool result = false;
+ int cpu;
+
+ for (;;) {
+ intptr_t *targetptr_final, newval_final, offset;
+ char *destptr, *srcptr;
+ size_t copylen;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ /* Load offset with single-copy atomicity. */
+ offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+ if (offset == buffer->c[cpu].buflen)
+ break;
+ /* Copy goes from the by-value argument into the slot at array[offset]. */
+ destptr = (char *)&buffer->c[cpu].array[offset];
+ srcptr = (char *)&item;
+ /* copylen must be <= 4kB. */
+ copylen = sizeof(item);
+ newval_final = offset + 1;
+ targetptr_final = &buffer->c[cpu].offset;
+ if (opt_mb)
+ ret = rseq_cmpeqv_trymemcpy_storev_release(
+ targetptr_final, offset,
+ destptr, srcptr, copylen,
+ newval_final, cpu);
+ else
+ ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+ offset, destptr, srcptr, copylen,
+ newval_final, cpu);
+ if (rseq_likely(!ret)) {
+ result = true;
+ break;
+ }
+ /* Retry if comparison fails or rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+ return result;
+}
+
+/*
+ * Pop the top element of the by-value buffer indexed by the CPU returned
+ * from rseq_cpu_start(), with *item as the copy destination.
+ *
+ * Returns true once rseq_cmpeqv_trymemcpy_storev() returns 0, or false
+ * when that CPU's offset is 0. A non-zero helper return is retried.
+ * When _cpu is non-NULL it receives the CPU index of the last attempt.
+ */
+bool this_cpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+ struct percpu_memcpy_buffer_node *item,
+ int *_cpu)
+{
+ bool result = false;
+ int cpu;
+
+ for (;;) {
+ intptr_t *targetptr_final, newval_final, offset;
+ char *destptr, *srcptr;
+ size_t copylen;
+ int ret;
+
+ cpu = rseq_cpu_start();
+ /* Load offset with single-copy atomicity. */
+ offset = RSEQ_READ_ONCE(buffer->c[cpu].offset);
+ if (offset == 0)
+ break;
+ /* Copy goes from the slot at array[offset - 1] into the caller's item. */
+ destptr = (char *)item;
+ srcptr = (char *)&buffer->c[cpu].array[offset - 1];
+ /* copylen must be <= 4kB. */
+ copylen = sizeof(*item);
+ newval_final = offset - 1;
+ targetptr_final = &buffer->c[cpu].offset;
+ ret = rseq_cmpeqv_trymemcpy_storev(targetptr_final,
+ offset, destptr, srcptr, copylen,
+ newval_final, cpu);
+ if (rseq_likely(!ret)) {
+ result = true;
+ break;
+ }
+ /* Retry if comparison fails or rseq aborts. */
+ }
+ if (_cpu)
+ *_cpu = cpu;
+ return result;
+}
+
+/*
+ * Copy the top element of the buffer for the given cpu into *item and
+ * drop it from the buffer. Returns false (leaving *item untouched) when
+ * the offset is 0, true otherwise. Uses plain loads and stores: not safe
+ * against concurrent accesses, so only use it on buffers that are not
+ * concurrently modified.
+ */
+bool __percpu_memcpy_buffer_pop(struct percpu_memcpy_buffer *buffer,
+				struct percpu_memcpy_buffer_node *item,
+				int cpu)
+{
+	const intptr_t top = buffer->c[cpu].offset;
+
+	if (top != 0) {
+		memcpy(item, &buffer->c[cpu].array[top - 1], sizeof(*item));
+		buffer->c[cpu].offset = top - 1;
+	}
+	return top != 0;
+}
+
+/*
+ * Worker for test_percpu_memcpy_buffer(): opt_reps times, pops an item
+ * from the per-cpu buffer, optionally yields, and pushes the item back if
+ * one was obtained. Aborts if the push reports failure.
+ *
+ * arg: the shared struct percpu_memcpy_buffer.
+ * Returns NULL. Aborts if rseq registration or unregistration fails.
+ */
+void *test_percpu_memcpy_buffer_thread(void *arg)
+{
+	struct percpu_memcpy_buffer *buffer = arg;
+	long long left;
+
+	if (!opt_disable_rseq && rseq_register_current_thread())
+		abort();
+
+	for (left = opt_reps; left > 0; left--) {
+		struct percpu_memcpy_buffer_node item;
+		bool popped = this_cpu_memcpy_buffer_pop(buffer, &item, NULL);
+
+		if (opt_yield)
+			sched_yield(); /* encourage shuffling */
+		/* Should increase buffer size. */
+		if (popped && !this_cpu_memcpy_buffer_push(buffer, item, NULL))
+			abort();
+	}
+
+	printf_verbose("tid %d: number of rseq abort: %d, signals delivered: %u\n",
+		       (int) rseq_gettid(), nr_abort, signals_delivered);
+	if (!opt_disable_rseq && rseq_unregister_current_thread())
+		abort();
+
+	return NULL;
+}
+
+/*
+ * Simultaneous modification to a per-cpu by-value buffer from many threads.
+ *
+ * For every CPU in the process affinity mask, allocates an array of
+ * CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU elements and fills the first
+ * MEMCPY_BUFFER_ITEM_PER_CPU of them with (data1 = j, data2 = j + 1).
+ * Then lets opt_threads workers run test_percpu_memcpy_buffer_thread(),
+ * drains the buffers and asserts that the sum of data1 + data2 over all
+ * elements is unchanged. Aborts on pthread or sched_getaffinity() failure.
+ */
+void test_percpu_memcpy_buffer(void)
+{
+	const int num_threads = opt_threads;
+	int i, j, ret;
+	uint64_t sum = 0, expected_sum = 0;
+	struct percpu_memcpy_buffer buffer;
+	pthread_t test_threads[num_threads];
+	cpu_set_t allowed_cpus;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	/*
+	 * Generate buffer entries for every usable cpu. The affinity mask
+	 * must be valid: if the call failed, allowed_cpus would be read
+	 * uninitialized by the CPU_ISSET() checks below.
+	 */
+	if (sched_getaffinity(0, sizeof(allowed_cpus), &allowed_cpus)) {
+		perror("sched_getaffinity");
+		abort();
+	}
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+		/* Worst case is every item ending up on the same CPU. */
+		buffer.c[i].array =
+			malloc(sizeof(*buffer.c[i].array) * CPU_SETSIZE *
+			       MEMCPY_BUFFER_ITEM_PER_CPU);
+		assert(buffer.c[i].array);
+		buffer.c[i].buflen = CPU_SETSIZE * MEMCPY_BUFFER_ITEM_PER_CPU;
+		for (j = 1; j <= MEMCPY_BUFFER_ITEM_PER_CPU; j++) {
+			/* data1 + data2 == j + (j + 1) for this element. */
+			expected_sum += 2 * j + 1;
+
+			/*
+			 * Elements are stored by value in the array (no
+			 * per-node allocation); two fields are used to
+			 * model objects that would not fit within a
+			 * single word.
+			 */
+			buffer.c[i].array[j - 1].data1 = j;
+			buffer.c[i].array[j - 1].data2 = j + 1;
+			buffer.c[i].offset++;
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_create(&test_threads[i], NULL,
+				     test_percpu_memcpy_buffer_thread,
+				     &buffer);
+		if (ret) {
+			errno = ret;
+			perror("pthread_create");
+			abort();
+		}
+	}
+
+	for (i = 0; i < num_threads; i++) {
+		ret = pthread_join(test_threads[i], NULL);
+		if (ret) {
+			errno = ret;
+			perror("pthread_join");
+			abort();
+		}
+	}
+
+	/* Workers are gone, so the non-concurrent pop is safe for draining. */
+	for (i = 0; i < CPU_SETSIZE; i++) {
+		struct percpu_memcpy_buffer_node item;
+
+		if (!CPU_ISSET(i, &allowed_cpus))
+			continue;
+
+		while (__percpu_memcpy_buffer_pop(&buffer, &item, i)) {
+			sum += item.data1;
+			sum += item.data2;
+		}
+		free(buffer.c[i].array);
+	}
+
+	/*
+	 * All entries should now be accounted for (unless some external
+	 * actor is interfering with our allowed affinity while this
+	 * test is running).
+	 */
+	assert(sum == expected_sum);
+}
+
+/* SIGUSR1 handler installed by set_signal_handler(): counts each delivery. */
+static void test_signal_interrupt_handler(int signo)
+{
+	signals_delivered += 1;
+}
+
+/*
+ * Install test_signal_interrupt_handler() for SIGUSR1 with an empty signal
+ * mask and no flags.
+ *
+ * Returns 0 on success, or the negative return value of the failing libc
+ * call after reporting it with perror().
+ */
+static int set_signal_handler(void)
+{
+	struct sigaction sa;
+	sigset_t sigset;
+	int ret = sigemptyset(&sigset);
+
+	if (ret < 0) {
+		perror("sigemptyset");
+		return ret;
+	}
+
+	sa.sa_flags = 0;
+	sa.sa_mask = sigset;
+	sa.sa_handler = test_signal_interrupt_handler;
+
+	ret = sigaction(SIGUSR1, &sa, NULL);
+	if (ret >= 0)
+		printf_verbose("Signal handler set for SIGUSR1\n");
+	else
+		perror("sigaction");
+
+	return ret;
+}
+
+/* Arguments shared by the membarrier manager thread and all worker threads. */
+struct test_membarrier_thread_args {
+ /* Set to 1 by test_membarrier() after the workers are joined; ends the manager loop. */
+ int stop;
+ /* Address of the currently "active" struct percpu_list as an integer; 0 until the manager publishes one. */
+ intptr_t percpu_list_ptr;
+};
+
+/*
+ * Worker threads modify data in their "active" percpu lists.
+ *
+ * arg: the shared struct test_membarrier_thread_args.
+ * Registers with rseq unconditionally (opt_disable_rseq is not consulted
+ * here), spins until percpu_list_ptr becomes non-zero, then performs
+ * opt_reps successful rseq_offset_deref_addv() calls.
+ * Returns NULL. Aborts if rseq registration or unregistration fails.
+ */
+void *test_membarrier_worker_thread(void *arg)
+{
+ struct test_membarrier_thread_args *args =
+ (struct test_membarrier_thread_args *)arg;
+ const int iters = opt_reps;
+ int i;
+
+ if (rseq_register_current_thread()) {
+ fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ abort();
+ }
+
+ /* Wait for initialization. */
+ while (!atomic_load(&args->percpu_list_ptr)) {}
+
+ for (i = 0; i < iters; ++i) {
+ int ret;
+
+ do {
+ int cpu = rseq_cpu_start();
+
+ /* The byte offset passed selects this CPU's struct percpu_list_entry. */
+ ret = rseq_offset_deref_addv(&args->percpu_list_ptr,
+ sizeof(struct percpu_list_entry) * cpu, 1, cpu);
+ } while (rseq_unlikely(ret));
+ }
+
+ if (rseq_unregister_current_thread()) {
+ fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ abort();
+ }
+ return NULL;
+}
+
+/*
+ * Zero the whole list, then give each of the CPU_SETSIZE entries a single
+ * heap-allocated node with data == 0 and no successor.
+ */
+void test_membarrier_init_percpu_list(struct percpu_list *list)
+{
+	struct percpu_list_node *node;
+	int cpu;
+
+	memset(list, 0, sizeof(*list));
+	for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+		node = malloc(sizeof(*node));
+		assert(node);
+		node->next = NULL;
+		node->data = 0;
+		list->c[cpu].head = node;
+	}
+}
+
+/* Free the head node of each of the CPU_SETSIZE entries in the list. */
+void test_membarrier_free_percpu_list(struct percpu_list *list)
+{
+	int cpu = 0;
+
+	while (cpu < CPU_SETSIZE) {
+		free(list->c[cpu].head);
+		cpu++;
+	}
+}
+
+/* Thin wrapper invoking the membarrier system call through syscall(). */
+static int sys_membarrier(int cmd, int flags, int cpu_id)
+{
+	long rc = syscall(__NR_membarrier, cmd, flags, cpu_id);
+
+	return rc;
+}
+
+/*
+ * The manager thread swaps per-cpu lists that worker threads see,
+ * and validates that there are no unexpected modifications.
+ *
+ * arg: the shared struct test_membarrier_thread_args.
+ * Loops until args->stop is set: each pass switches the published list
+ * from list_a to list_b and back, issuing a CPU-targeted membarrier after
+ * each switch, and aborts with "Membarrier test failed" if the sampled
+ * counter of the inactive list differs from the value remembered for it.
+ * Returns NULL.
+ */
+void *test_membarrier_manager_thread(void *arg)
+{
+ struct test_membarrier_thread_args *args =
+ (struct test_membarrier_thread_args *)arg;
+ struct percpu_list list_a, list_b;
+ /* Start at 0 / cpu 0: matches the data == 0 nodes set up by the init helper. */
+ intptr_t expect_a = 0, expect_b = 0;
+ int cpu_a = 0, cpu_b = 0;
+
+ if (rseq_register_current_thread()) {
+ fprintf(stderr, "Error: rseq_register_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ abort();
+ }
+
+ /* Init lists. */
+ test_membarrier_init_percpu_list(&list_a);
+ test_membarrier_init_percpu_list(&list_b);
+
+ /* Publishing a non-zero pointer releases the workers from their spin-wait. */
+ atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+
+ while (!atomic_load(&args->stop)) {
+ /* list_a is "active". */
+ cpu_a = rand() % CPU_SETSIZE;
+ /*
+ * As list_b is "inactive", we should never see changes
+ * to list_b.
+ */
+ if (expect_b != atomic_load(&list_b.c[cpu_b].head->data)) {
+ fprintf(stderr, "Membarrier test failed\n");
+ abort();
+ }
+
+ /* Make list_b "active". */
+ atomic_store(&args->percpu_list_ptr, (intptr_t)&list_b);
+ /* ENXIO is tolerated because cpu_a is picked at random from 0..CPU_SETSIZE-1. */
+ if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+ MEMBARRIER_CMD_FLAG_CPU, cpu_a) &&
+ errno != ENXIO /* missing CPU */) {
+ perror("sys_membarrier");
+ abort();
+ }
+ /*
+ * Cpu A should now only modify list_b, so the values
+ * in list_a should be stable.
+ */
+ expect_a = atomic_load(&list_a.c[cpu_a].head->data);
+
+ cpu_b = rand() % CPU_SETSIZE;
+ /*
+ * As list_a is "inactive", we should never see changes
+ * to list_a.
+ */
+ if (expect_a != atomic_load(&list_a.c[cpu_a].head->data)) {
+ fprintf(stderr, "Membarrier test failed\n");
+ abort();
+ }
+
+ /* Make list_a "active". */
+ atomic_store(&args->percpu_list_ptr, (intptr_t)&list_a);
+ if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ,
+ MEMBARRIER_CMD_FLAG_CPU, cpu_b) &&
+ errno != ENXIO /* missing CPU*/) {
+ perror("sys_membarrier");
+ abort();
+ }
+ /* Remember a value from list_b. */
+ expect_b = atomic_load(&list_b.c[cpu_b].head->data);
+ }
+
+ test_membarrier_free_percpu_list(&list_a);
+ test_membarrier_free_percpu_list(&list_b);
+
+ if (rseq_unregister_current_thread()) {
+ fprintf(stderr, "Error: rseq_unregister_current_thread(...) failed(%d): %s\n",
+ errno, strerror(errno));
+ abort();
+ }
+ return NULL;
+}
+
+/*
+ * Test the MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ membarrier command used
+ * with MEMBARRIER_CMD_FLAG_CPU.
+ *
+ * Registers the process with MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ,
+ * starts one manager thread and opt_threads worker threads, joins the
+ * workers, then sets the stop flag and joins the manager. Aborts on any
+ * membarrier or pthread failure.
+ */
+#ifdef RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+void test_membarrier(void)
+{
+ const int num_threads = opt_threads;
+ struct test_membarrier_thread_args thread_args;
+ pthread_t worker_threads[num_threads];
+ pthread_t manager_thread;
+ int i, ret;
+
+ if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ, 0, 0)) {
+ perror("sys_membarrier");
+ abort();
+ }
+
+ thread_args.stop = 0;
+ thread_args.percpu_list_ptr = 0;
+ ret = pthread_create(&manager_thread, NULL,
+ test_membarrier_manager_thread, &thread_args);
+ if (ret) {
+ errno = ret;
+ perror("pthread_create");
+ abort();
+ }
+
+ for (i = 0; i < num_threads; i++) {
+ ret = pthread_create(&worker_threads[i], NULL,
+ test_membarrier_worker_thread, &thread_args);
+ if (ret) {
+ errno = ret;
+ perror("pthread_create");
+ abort();
+ }
+ }
+
+
+ for (i = 0; i < num_threads; i++) {
+ ret = pthread_join(worker_threads[i], NULL);
+ if (ret) {
+ errno = ret;
+ perror("pthread_join");
+ abort();
+ }
+ }
+
+ /* Workers are done; tell the manager to leave its loop. */
+ atomic_store(&thread_args.stop, 1);
+ ret = pthread_join(manager_thread, NULL);
+ if (ret) {
+ errno = ret;
+ perror("pthread_join");
+ abort();
+ }
+}
+#else /* RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV */
+/* Fallback when RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV is not defined: only prints a notice to stderr. */
+void test_membarrier(void)
+{
+ fprintf(stderr, "rseq_offset_deref_addv is not implemented on this architecture. "
+ "Skipping membarrier test.\n");
+}
+#endif
+
+/*
+ * Print the command-line help to stdout. Only argv[0] is used (as the
+ * program name); argc is not read.
+ */
+static void show_usage(int argc, char **argv)
+{
+ printf("Usage : %s <OPTIONS>\n",
+ argv[0]);
+ printf("OPTIONS:\n");
+ printf(" [-1 loops] Number of loops for delay injection 1\n");
+ printf(" [-2 loops] Number of loops for delay injection 2\n");
+ printf(" [-3 loops] Number of loops for delay injection 3\n");
+ printf(" [-4 loops] Number of loops for delay injection 4\n");
+ printf(" [-5 loops] Number of loops for delay injection 5\n");
+ printf(" [-6 loops] Number of loops for delay injection 6\n");
+ printf(" [-7 loops] Number of loops for delay injection 7 (-1 to enable -m)\n");
+ printf(" [-8 loops] Number of loops for delay injection 8 (-1 to enable -m)\n");
+ printf(" [-9 loops] Number of loops for delay injection 9 (-1 to enable -m)\n");
+ printf(" [-m N] Yield/sleep/kill every modulo N (default 0: disabled) (>= 0)\n");
+ printf(" [-y] Yield\n");
+ printf(" [-k] Kill thread with signal\n");
+ printf(" [-s S] S: =0: disabled (default), >0: sleep time (ms)\n");
+ printf(" [-t N] Number of threads (default 200)\n");
+ printf(" [-r N] Number of repetitions per thread (default 5000)\n");
+ printf(" [-d] Disable rseq system call (no initialization)\n");
+ printf(" [-D M] Disable rseq for each M threads\n");
+ printf(" [-T test] Choose test: (s)pinlock, (l)ist, (b)uffer, (m)emcpy, (i)ncrement, membarrie(r)\n");
+ printf(" [-M] Push into buffer and memcpy buffer with memory barriers.\n");
+ printf(" [-v] Verbose output.\n");
+ printf(" [-h] Show this help.\n");
+ printf("\n");
+}
+
+/*
+ * Entry point: parse the command line, install the SIGUSR1 handler,
+ * register the main thread with rseq (unless -d was given) and run the
+ * test selected by opt_test.
+ *
+ * Returns 0 on success or after -h, and -1 on a usage error, a
+ * signal-handler setup failure or an rseq registration failure. A failing
+ * rseq unregistration at the end aborts instead.
+ */
+int main(int argc, char **argv)
+{
+ int i;
+
+ for (i = 1; i < argc; i++) {
+ /* Arguments not starting with '-' are silently skipped. */
+ if (argv[i][0] != '-')
+ continue;
+ switch (argv[i][1]) {
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ /* The option digit itself is the index into loop_cnt[]. */
+ loop_cnt[argv[i][1] - '0'] = atol(argv[i + 1]);
+ i++;
+ break;
+ case 'm':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ opt_modulo = atol(argv[i + 1]);
+ if (opt_modulo < 0) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 's':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ opt_sleep = atol(argv[i + 1]);
+ if (opt_sleep < 0) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 'y':
+ opt_yield = 1;
+ break;
+ case 'k':
+ opt_signal = 1;
+ break;
+ case 'd':
+ opt_disable_rseq = 1;
+ break;
+ case 'D':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ opt_disable_mod = atol(argv[i + 1]);
+ if (opt_disable_mod < 0) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 't':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ opt_threads = atol(argv[i + 1]);
+ if (opt_threads < 0) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 'r':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ opt_reps = atoll(argv[i + 1]);
+ if (opt_reps < 0) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 'h':
+ show_usage(argc, argv);
+ goto end;
+ case 'T':
+ if (argc < i + 2) {
+ show_usage(argc, argv);
+ goto error;
+ }
+ /* Only the first character of the argument selects the test. */
+ opt_test = *argv[i + 1];
+ switch (opt_test) {
+ case 's':
+ case 'l':
+ case 'i':
+ case 'b':
+ case 'm':
+ case 'r':
+ break;
+ default:
+ show_usage(argc, argv);
+ goto error;
+ }
+ i++;
+ break;
+ case 'v':
+ verbose = 1;
+ break;
+ case 'M':
+ opt_mb = 1;
+ break;
+ default:
+ show_usage(argc, argv);
+ goto error;
+ }
+ }
+
+ /* Only counters 1-6 are mirrored into the loop_cnt_N variables; 7-9 stay in loop_cnt[] only. */
+ loop_cnt_1 = loop_cnt[1];
+ loop_cnt_2 = loop_cnt[2];
+ loop_cnt_3 = loop_cnt[3];
+ loop_cnt_4 = loop_cnt[4];
+ loop_cnt_5 = loop_cnt[5];
+ loop_cnt_6 = loop_cnt[6];
+
+ if (set_signal_handler())
+ goto error;
+
+ if (!opt_disable_rseq && rseq_register_current_thread())
+ goto error;
+ switch (opt_test) {
+ case 's':
+ printf_verbose("spinlock\n");
+ test_percpu_spinlock();
+ break;
+ case 'l':
+ printf_verbose("linked list\n");
+ test_percpu_list();
+ break;
+ case 'b':
+ printf_verbose("buffer\n");
+ test_percpu_buffer();
+ break;
+ case 'm':
+ printf_verbose("memcpy buffer\n");
+ test_percpu_memcpy_buffer();
+ break;
+ case 'i':
+ printf_verbose("counter increment\n");
+ test_percpu_inc();
+ break;
+ case 'r':
+ printf_verbose("membarrier\n");
+ test_membarrier();
+ break;
+ }
+ if (!opt_disable_rseq && rseq_unregister_current_thread())
+ abort();
+end:
+ return 0;
+
+error:
+ return -1;
+}
diff --git a/tools/testing/selftests/rseq/rseq-abi.h b/tools/testing/selftests/rseq/rseq-abi.h
new file mode 100644
index 000000000..a8c44d9af
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-abi.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+#ifndef _RSEQ_ABI_H
+#define _RSEQ_ABI_H
+
+/*
+ * rseq-abi.h
+ *
+ * Restartable sequences system call API
+ *
+ * Copyright (c) 2015-2022 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/*
+ * Special values of the struct rseq_abi cpu_id field:
+ * UNINITIALIZED means "rseq uninitialized", REGISTRATION_FAILED means
+ * "rseq initialization failed".
+ */
+enum rseq_abi_cpu_id_state {
+ RSEQ_ABI_CPU_ID_UNINITIALIZED = -1,
+ RSEQ_ABI_CPU_ID_REGISTRATION_FAILED = -2,
+};
+
+/* Flag bits; presumably for the flags argument of the rseq system call -- TODO confirm. */
+enum rseq_abi_flags {
+ RSEQ_ABI_FLAG_UNREGISTER = (1 << 0),
+};
+
+/* Bit positions from which the enum rseq_abi_cs_flags masks are built. */
+enum rseq_abi_cs_flags_bit {
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT = 0,
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT = 1,
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT = 2,
+};
+
+/*
+ * Flag masks inhibiting instruction sequence block restart on preemption,
+ * signal delivery and migration respectively. Each is 1U shifted by the
+ * matching enum rseq_abi_cs_flags_bit value.
+ */
+enum rseq_abi_cs_flags {
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_PREEMPT =
+ (1U << RSEQ_ABI_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT),
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_SIGNAL =
+ (1U << RSEQ_ABI_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT),
+ RSEQ_ABI_CS_FLAG_NO_RESTART_ON_MIGRATE =
+ (1U << RSEQ_ABI_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
+};
+
+/*
+ * struct rseq_abi_cs is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line. It is usually declared as
+ * link-time constant data.
+ */
+struct rseq_abi_cs {
+ /* Version of this structure. */
+ __u32 version;
+ /* enum rseq_abi_cs_flags */
+ __u32 flags;
+ /* Presumably the address of the first instruction of the critical section -- TODO confirm. */
+ __u64 start_ip;
+ /* Offset from start_ip. */
+ __u64 post_commit_offset;
+ /* Presumably the address execution is moved to when the sequence is aborted -- TODO confirm. */
+ __u64 abort_ip;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+/*
+ * struct rseq_abi is aligned on 4 * 8 bytes to ensure it is always
+ * contained within a single cache-line.
+ *
+ * A single struct rseq_abi per thread is allowed.
+ */
+struct rseq_abi {
+ /*
+ * Restartable sequences cpu_id_start field. Updated by the
+ * kernel. Read by user-space with single-copy atomicity
+ * semantics. This field should only be read by the thread which
+ * registered this data structure. Aligned on 32-bit. Always
+ * contains a value in the range of possible CPUs, although the
+ * value may not be the actual current CPU (e.g. if rseq is not
+ * initialized). This CPU number value should always be compared
+ * against the value of the cpu_id field before performing a rseq
+ * commit or returning a value read from a data structure indexed
+ * using the cpu_id_start value.
+ */
+ __u32 cpu_id_start;
+ /*
+ * Restartable sequences cpu_id field. Updated by the kernel.
+ * Read by user-space with single-copy atomicity semantics. This
+ * field should only be read by the thread which registered this
+ * data structure. Aligned on 32-bit. Values
+ * RSEQ_ABI_CPU_ID_UNINITIALIZED and
+ * RSEQ_ABI_CPU_ID_REGISTRATION_FAILED have a special semantic:
+ * the former means "rseq uninitialized", and latter means "rseq
+ * initialization failed". This value is
+ * meant to be read within rseq critical sections and compared
+ * with the cpu_id_start value previously read, before performing
+ * the commit instruction, or read and compared with the
+ * cpu_id_start value before returning a value loaded from a data
+ * structure indexed using the cpu_id_start value.
+ */
+ __u32 cpu_id;
+ /*
+ * Restartable sequences rseq_cs field.
+ *
+ * Contains NULL when no critical section is active for the current
+ * thread, or holds a pointer to the currently active struct
+ * rseq_abi_cs.
+ *
+ * Updated by user-space, which sets the address of the currently
+ * active rseq_abi_cs at the beginning of assembly instruction sequence
+ * block, and set to NULL by the kernel when it restarts an assembly
+ * instruction sequence block, as well as when the kernel detects that
+ * it is preempting or delivering a signal outside of the range
+ * targeted by the rseq_abi_cs. Also needs to be set to NULL by
+ * user-space before reclaiming memory that contains the targeted
+ * struct rseq_abi_cs.
+ *
+ * Read and set by the kernel. Set by user-space with single-copy
+ * atomicity semantics. This field should only be updated by the
+ * thread which registered this data structure. Aligned on 64-bit.
+ */
+ union {
+ __u64 ptr64;
+
+ /*
+ * The "arch" field provides architecture accessor for
+ * the ptr field based on architecture pointer size and
+ * endianness.
+ */
+ struct {
+#ifdef __LP64__
+ __u64 ptr;
+#elif defined(__BYTE_ORDER) ? (__BYTE_ORDER == __BIG_ENDIAN) : defined(__BIG_ENDIAN)
+ __u32 padding; /* Initialized to zero. */
+ __u32 ptr;
+#else
+ __u32 ptr;
+ __u32 padding; /* Initialized to zero. */
+#endif
+ } arch;
+ } rseq_cs;
+
+ /*
+ * Restartable sequences flags field.
+ *
+ * This field should only be updated by the thread which
+ * registered this data structure. Read by the kernel.
+ * Mainly used for single-stepping through rseq critical sections
+ * with debuggers.
+ *
+ * - RSEQ_ABI_CS_FLAG_NO_RESTART_ON_PREEMPT
+ * Inhibit instruction sequence block restart on preemption
+ * for this thread.
+ * - RSEQ_ABI_CS_FLAG_NO_RESTART_ON_SIGNAL
+ * Inhibit instruction sequence block restart on signal
+ * delivery for this thread.
+ * - RSEQ_ABI_CS_FLAG_NO_RESTART_ON_MIGRATE
+ * Inhibit instruction sequence block restart on migration for
+ * this thread.
+ */
+ __u32 flags;
+} __attribute__((aligned(4 * sizeof(__u64))));
+
+#endif /* _RSEQ_ABI_H */
diff --git a/tools/testing/selftests/rseq/rseq-arm.h b/tools/testing/selftests/rseq/rseq-arm.h
new file mode 100644
index 000000000..893a11eca
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm.h
@@ -0,0 +1,827 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+/*
+ * - ARM little endian
+ *
+ * RSEQ_SIG uses the udf A32 instruction with an uncommon immediate operand
+ * value 0x5de3. This traps if user-space reaches this instruction by mistake,
+ * and the uncommon operand ensures the kernel does not move the instruction
+ * pointer to attacker-controlled code on rseq abort.
+ *
+ * The instruction pattern in the A32 instruction set is:
+ *
+ * e7f5def3 udf #24035 ; 0x5de3
+ *
+ * This translates to the following instruction pattern in the T16 instruction
+ * set:
+ *
+ * little endian:
+ * def3 udf #243 ; 0xf3
+ * e7f5 b.n <7f5>
+ *
+ * - ARMv6+ big endian (BE8):
+ *
+ * ARMv6+ -mbig-endian generates mixed endianness code vs data: little-endian
+ * code and big-endian data. The data value of the signature needs to have its
+ * byte order reversed to generate the trap instruction:
+ *
+ * Data: 0xf3def5e7
+ *
+ * Translates to this A32 instruction pattern:
+ *
+ * e7f5def3 udf #24035 ; 0x5de3
+ *
+ * Translates to this T16 instruction pattern:
+ *
+ * def3 udf #243 ; 0xf3
+ * e7f5 b.n <7f5>
+ *
+ * - Prior to ARMv6 big endian (BE32):
+ *
+ * Prior to ARMv6, -mbig-endian generates big-endian code and data
+ * (which match), so the endianness of the data representation of the
+ * signature should not be reversed. However, the choice between BE32
+ * and BE8 is done by the linker, so we cannot know whether code and
+ * data endianness will be mixed before the linker is invoked. So rather
+ * than try to play tricks with the linker, the rseq signature is simply
+ * data (not a trap instruction) prior to ARMv6 on big endian. This is
+ * why the signature is expressed as data (.word) rather than as
+ * instruction (.inst) in assembler.
+ */
+
+/*
+ * Signature word emitted in front of each abort handler. Big-endian
+ * builds (__ARMEB__) use the byte-reversed data value; little-endian
+ * builds use the A32 instruction encoding as-is.
+ */
+#ifdef __ARMEB__
+#define RSEQ_SIG 0xf3def5e7 /* udf #24035 ; 0x5de3 (ARMv6+) */
+#else
+#define RSEQ_SIG 0xe7f5def3 /* udf #24035 ; 0x5de3 */
+#endif
+
+/*
+ * Memory barriers. The full, read and write variants all expand to the
+ * same "dmb" instruction, with "memory" and "cc" listed as clobbers.
+ */
+#define rseq_smp_mb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
+#define rseq_smp_rmb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
+#define rseq_smp_wmb() __asm__ __volatile__ ("dmb" ::: "memory", "cc")
+
+/*
+ * Load-acquire: read *p once with RSEQ_READ_ONCE(), then issue a full
+ * barrier (rseq_smp_mb()) after the load. Evaluates to the loaded value.
+ */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \
+ rseq_smp_mb(); \
+ ____p1; \
+})
+
+/* Acquire ordering after a control dependency; implemented as rseq_smp_rmb(). */
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+/*
+ * Store-release: issue a full barrier (rseq_smp_mb()) first, then write
+ * v to *p once with RSEQ_WRITE_ONCE().
+ */
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_smp_mb(); \
+ RSEQ_WRITE_ONCE(*p, v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+/*
+ * Emit one critical-section descriptor into the __rseq_cs section,
+ * aligned on 32 bytes and named by the local label "label", then record
+ * the descriptor's address (label followed by "b", i.e. a backward
+ * reference) plus a 0x0 word in the __rseq_cs_ptr_array section.
+ *
+ * Descriptor layout, as .word values: version, flags, then start_ip,
+ * post_commit_offset and abort_ip, each followed by a 0x0 word.
+ * NOTE(review): the 0x0 words presumably pad each 32-bit value to a
+ * 64-bit descriptor field -- confirm against the rseq_cs ABI definition.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
+ post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".word " __rseq_str(label) "b, 0x0\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Convenience wrapper around __RSEQ_ASM_DEFINE_TABLE: version and flags
+ * are both 0x0, and the post-commit offset is computed as
+ * (post_commit_ip - start_ip).
+ */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each use appends one entry to the __rseq_exit_point_array section, made
+ * of four .word values: start_ip, 0x0, exit_ip, 0x0.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) ", 0x0\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Arm the critical section: take the address of the descriptor at
+ * cs_label into r0 (adr), store r0 into the memory operand named by
+ * rseq_cs, then define "label" right after the store. Overwrites r0.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ "adr r0, " __rseq_str(cs_label) "\n\t" \
+ "str r0, %[" __rseq_str(rseq_cs) "]\n\t" \
+ __rseq_str(label) ":\n\t"
+
+/*
+ * Load the memory operand current_cpu_id into r0, compare it with the
+ * register operand cpu_id and branch to "label" when they differ.
+ * Overwrites r0 and the condition flags.
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ "ldr r0, %[" __rseq_str(current_cpu_id) "]\n\t" \
+ "cmp %[" __rseq_str(cpu_id) "], r0\n\t" \
+ "bne " __rseq_str(label) "\n\t"
+
+/*
+ * Emit, in the current section, a 32-byte aligned descriptor named
+ * table_label (same .word layout as __RSEQ_ASM_DEFINE_TABLE), directly
+ * followed by the RSEQ_SIG word and then the abort handler itself:
+ * "label:", the caller-supplied teardown instructions, and a branch to
+ * the C label abort_label.
+ */
+#define __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \
+ abort_label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".balign 32\n\t" \
+ __rseq_str(table_label) ":\n\t" \
+ ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".word " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".word " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "b %l[" __rseq_str(abort_label) "]\n\t"
+
+/*
+ * Convenience wrapper around __RSEQ_ASM_DEFINE_ABORT: version and flags
+ * are both 0x0, and the post-commit offset is computed as
+ * (post_commit_ip - start_ip).
+ */
+#define RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, abort_label, \
+ start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \
+ abort_label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Emit an out-of-line exit stub: "label:", the caller-supplied teardown
+ * instructions, then a branch to the C label cmpfail_label.
+ */
+#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "b %l[" __rseq_str(cmpfail_label) "]\n\t"
+
+/*
+ * rseq_cmpeqv_storev - compare *v against expect and, if equal, store newv.
+ *
+ * The asm block first checks that cpu equals rseq_get_abi()->cpu_id, then
+ * that *v equals expect, and finally stores newv into *v (the final store,
+ * ending at local label 2). r0 is used as scratch.
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *   1  *v differed from expect (cmpfail).
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated before the store, and a
+ * mismatch on the repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[error2]\n\t"
+#endif
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpnev_storeoffp_load - if *v differs from expectnot, save the old
+ * *v into *load and replace *v with the word found at (old *v + voffp).
+ *
+ * The asm block checks that cpu equals rseq_get_abi()->cpu_id, loads *v
+ * into r0 and bails out to cmpfail when it equals expectnot. Otherwise it
+ * stores r0 into *load, adds voffp to r0, loads a word from that address
+ * and stores it into *v (the final store, ending at local label 2).
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *   1  *v was equal to expectnot (cmpfail).
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expectnot], r0\n\t"
+ "beq %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ "ldr r0, %[v]\n\t"
+ "cmp %[expectnot], r0\n\t"
+ "beq %l[error2]\n\t"
+#endif
+ "str r0, %[load]\n\t"
+ "add r0, %[voffp]\n\t"
+ "ldr r0, [r0]\n\t"
+ /* final store */
+ "str r0, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "Ir" (voffp),
+ [load] "m" (*load)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_addv - add count to *v.
+ *
+ * The asm block checks that cpu equals rseq_get_abi()->cpu_id, loads *v
+ * into r0, adds count and stores the sum back into *v (the final store,
+ * ending at local label 2).
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE the cpu_id check is repeated, and a mismatch on
+ * the repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+ "ldr r0, %[v]\n\t"
+ "add r0, %[count]\n\t"
+ /* final store */
+ "str r0, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [count] "Ir" (count)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpeqv_trystorev_storev - if *v equals expect, store newv2 into *v2
+ * and then newv into *v.
+ *
+ * The asm block checks that cpu equals rseq_get_abi()->cpu_id and that *v
+ * equals expect, performs the "try" store of newv2 into *v2, and finally
+ * stores newv into *v (the final store, ending at local label 2). The try
+ * store sits before the commit point, so it may have happened even when
+ * the function returns -1.
+ *
+ * Returns:
+ *   0  both stores were performed.
+ *   1  *v differed from expect (cmpfail); nothing was stored.
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[error2]\n\t"
+#endif
+ /* try store */
+ "str %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpeqv_trystorev_storev_release - same sequence as
+ * rseq_cmpeqv_trystorev_storev, with a "dmb" barrier issued between the
+ * try store to *v2 and the final store to *v so that the final store has
+ * release semantics.
+ *
+ * Returns:
+ *   0  both stores were performed.
+ *   1  *v differed from expect (cmpfail); nothing was stored.
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[error2]\n\t"
+#endif
+ /* try store */
+ "str %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ "dmb\n\t" /* full mb provides store-release */
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpeqv_cmpeqv_storev - if *v equals expect and *v2 equals expect2,
+ * store newv into *v.
+ *
+ * The asm block checks that cpu equals rseq_get_abi()->cpu_id, compares
+ * *v with expect and then *v2 with expect2, and stores newv into *v (the
+ * final store, ending at local label 2). r0 is used as scratch.
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *   1  either comparison failed (cmpfail).
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE all three checks are repeated, and a mismatch
+ * on a repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+ "ldr r0, %[v2]\n\t"
+ "cmp %[expect2], r0\n\t"
+ "bne %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne %l[error2]\n\t"
+ "ldr r0, %[v2]\n\t"
+ "cmp %[expect2], r0\n\t"
+ "bne %l[error3]\n\t"
+#endif
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpeqv_trymemcpy_storev - if *v equals expect, copy len bytes from
+ * src to dst one byte at a time, then store newv into *v.
+ *
+ * The copy loop advances the registers holding src and dst and counts the
+ * register holding len down to zero, although all three are declared as
+ * asm inputs. Their initial values are therefore saved into rseq_scratch[]
+ * before the critical section starts and reloaded by the "teardown"
+ * sequence on every exit path (success, abort, cmpfail, error1, error2).
+ *
+ * The expect mismatch branches to local label 5, which runs the teardown
+ * and then jumps to cmpfail. The copy sits before the commit point (local
+ * label 2), so part or all of it may have happened even when the function
+ * returns -1.
+ *
+ * Returns:
+ *   0  the copy and the final store were performed.
+ *   1  *v differed from expect (cmpfail); nothing was copied or stored.
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint32_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ "str %[src], %[rseq_scratch0]\n\t"
+ "str %[dst], %[rseq_scratch1]\n\t"
+ "str %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne 7f\n\t"
+#endif
+ /* try memcpy */
+ "cmp %[len], #0\n\t" \
+ "beq 333f\n\t" \
+ "222:\n\t" \
+ "ldrb %%r0, [%[src]]\n\t" \
+ "strb %%r0, [%[dst]]\n\t" \
+ "adds %[src], #1\n\t" \
+ "adds %[dst], #1\n\t" \
+ "subs %[len], #1\n\t" \
+ "bne 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t"
+ "b 8f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ abort, 1b, 2b, 4f)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ error2)
+#endif
+ "8:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpeqv_trymemcpy_storev_release - same sequence as
+ * rseq_cmpeqv_trymemcpy_storev, with a "dmb" barrier issued between the
+ * byte copy and the final store to *v so that the final store has release
+ * semantics.
+ *
+ * The copy loop modifies the registers holding src, dst and len although
+ * they are declared as asm inputs; their initial values are saved into
+ * rseq_scratch[] up front and reloaded by the "teardown" sequence on every
+ * exit path.
+ *
+ * Returns:
+ *   0  the copy and the final store were performed.
+ *   1  *v differed from expect (cmpfail, reached through local label 5);
+ *      nothing was copied or stored.
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint32_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ "str %[src], %[rseq_scratch0]\n\t"
+ "str %[dst], %[rseq_scratch1]\n\t"
+ "str %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+ "ldr r0, %[v]\n\t"
+ "cmp %[expect], r0\n\t"
+ "bne 7f\n\t"
+#endif
+ /* try memcpy */
+ "cmp %[len], #0\n\t" \
+ "beq 333f\n\t" \
+ "222:\n\t" \
+ "ldrb %%r0, [%[src]]\n\t" \
+ "strb %%r0, [%[dst]]\n\t" \
+ "adds %[src], #1\n\t" \
+ "adds %[dst], #1\n\t" \
+ "subs %[len], #1\n\t" \
+ "bne 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ "dmb\n\t" /* full mb provides store-release */
+ /* final store */
+ "str %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t"
+ "b 8f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ abort, 1b, 2b, 4f)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ /* teardown */
+ "ldr %[len], %[rseq_scratch2]\n\t"
+ "ldr %[dst], %[rseq_scratch1]\n\t"
+ "ldr %[src], %[rseq_scratch0]\n\t",
+ error2)
+#endif
+ "8:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ RSEQ_INJECT_INPUT
+ : "r0", "memory", "cc"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-arm64.h b/tools/testing/selftests/rseq/rseq-arm64.h
new file mode 100644
index 000000000..cbe190a4d
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-arm64.h
@@ -0,0 +1,695 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-arm64.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2018 - Will Deacon <will.deacon@arm.com>
+ */
+
+/*
+ * aarch64 -mbig-endian generates mixed endianness code vs data:
+ * little-endian code and big-endian data. Ensure the RSEQ_SIG signature
+ * matches code endianness.
+ */
+/* Signature as an instruction encoding; emitted with .inst in the abort handler. */
+#define RSEQ_SIG_CODE 0xd428bc00 /* BRK #0x45E0. */
+
+/*
+ * Signature as a data word: byte-reversed on big-endian builds
+ * (__AARCH64EB__), identical to RSEQ_SIG_CODE otherwise.
+ */
+#ifdef __AARCH64EB__
+#define RSEQ_SIG_DATA 0x00bc28d4 /* BRK #0x45E0. */
+#else
+#define RSEQ_SIG_DATA RSEQ_SIG_CODE
+#endif
+
+/* RSEQ_SIG is the data form of the signature. */
+#define RSEQ_SIG RSEQ_SIG_DATA
+
+/*
+ * Memory barriers: full ("dmb ish"), read ("dmb ishld") and write
+ * ("dmb ishst"). Each lists "memory" as a clobber.
+ */
+#define rseq_smp_mb() __asm__ __volatile__ ("dmb ish" ::: "memory")
+#define rseq_smp_rmb() __asm__ __volatile__ ("dmb ishld" ::: "memory")
+#define rseq_smp_wmb() __asm__ __volatile__ ("dmb ishst" ::: "memory")
+
+/*
+ * rseq_smp_load_acquire(p) - load *(p) with acquire semantics and evaluate
+ * to the loaded value.
+ *
+ * The ldarb/ldarh/ldar form is selected from sizeof(*(p)) (1, 2, 4 or 8
+ * bytes). For any other size no load is emitted and the result is
+ * indeterminate.
+ *
+ * The asm output operand is a local temporary, never *(p): using *(p) as
+ * the output would write the loaded value back to the shared location and
+ * leave the macro's result uninitialized. The temporary is a union with a
+ * same-sized char array, and the asm writes through that array, so a
+ * const- or volatile-qualified *(p) does not get in the way.
+ */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ union { __typeof(*(p)) __val; char __c[sizeof(*(p))]; } ____u; \
+ switch (sizeof(*(p))) { \
+ case 1: \
+ asm volatile ("ldarb %w0, %1" \
+ : "=r" (*(__u8 *)____u.__c) \
+ : "Q" (*(p)) : "memory"); \
+ break; \
+ case 2: \
+ asm volatile ("ldarh %w0, %1" \
+ : "=r" (*(__u16 *)____u.__c) \
+ : "Q" (*(p)) : "memory"); \
+ break; \
+ case 4: \
+ asm volatile ("ldar %w0, %1" \
+ : "=r" (*(__u32 *)____u.__c) \
+ : "Q" (*(p)) : "memory"); \
+ break; \
+ case 8: \
+ asm volatile ("ldar %0, %1" \
+ : "=r" (*(__u64 *)____u.__c) \
+ : "Q" (*(p)) : "memory"); \
+ break; \
+ } \
+ ____u.__val; \
+})
+
+/* Acquire ordering after a control dependency; implemented as rseq_smp_rmb(). */
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+/*
+ * rseq_smp_store_release(p, v) - store v into *(p) with release semantics.
+ *
+ * The stlrb/stlrh/stlr form is selected from sizeof(*(p)) (1, 2, 4 or 8
+ * bytes); for any other size nothing is emitted. v is converted to the
+ * unsigned integer type of matching width before the store.
+ *
+ * Both macro arguments are parenthesized at every use so that compound
+ * expressions (e.g. "a + b" for v, or "base + i" for p) are cast and
+ * dereferenced as a whole.
+ */
+#define rseq_smp_store_release(p, v) \
+do { \
+ switch (sizeof(*(p))) { \
+ case 1: \
+ asm volatile ("stlrb %w1, %0" \
+ : "=Q" (*(p)) \
+ : "r" ((__u8)(v)) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm volatile ("stlrh %w1, %0" \
+ : "=Q" (*(p)) \
+ : "r" ((__u16)(v)) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm volatile ("stlr %w1, %0" \
+ : "=Q" (*(p)) \
+ : "r" ((__u32)(v)) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm volatile ("stlr %1, %0" \
+ : "=Q" (*(p)) \
+ : "r" ((__u64)(v)) \
+ : "memory"); \
+ break; \
+ } \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+/*
+ * Scratch registers used by the asm helper macros below: x15 (w15 is its
+ * 32-bit view) and x14.
+ */
+#define RSEQ_ASM_TMP_REG32 "w15"
+#define RSEQ_ASM_TMP_REG "x15"
+#define RSEQ_ASM_TMP_REG_2 "x14"
+
+/*
+ * Emit one critical-section descriptor into the __rseq_cs section,
+ * aligned on 32 bytes and named by the local label "label", then record
+ * the descriptor's address (label followed by "b", i.e. a backward
+ * reference) as a .quad in the __rseq_cs_ptr_array section.
+ *
+ * Descriptor layout: two .long values (version, flags) followed by three
+ * .quad values (start_ip, post_commit_offset, abort_ip).
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
+ post_commit_offset, abort_ip) \
+ " .pushsection __rseq_cs, \"aw\"\n" \
+ " .balign 32\n" \
+ __rseq_str(label) ":\n" \
+ " .long " __rseq_str(version) ", " __rseq_str(flags) "\n" \
+ " .quad " __rseq_str(start_ip) ", " \
+ __rseq_str(post_commit_offset) ", " \
+ __rseq_str(abort_ip) "\n" \
+ " .popsection\n\t" \
+ " .pushsection __rseq_cs_ptr_array, \"aw\"\n" \
+ " .quad " __rseq_str(label) "b\n" \
+ " .popsection\n"
+
+/*
+ * Convenience wrapper around __RSEQ_ASM_DEFINE_TABLE: version and flags
+ * are both 0x0, and the post-commit offset is computed as
+ * (post_commit_ip - start_ip).
+ */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each use appends one entry to the __rseq_exit_point_array section, made
+ * of two .quad values: start_ip and exit_ip.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ " .pushsection __rseq_exit_point_array, \"aw\"\n" \
+ " .quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n" \
+ " .popsection\n"
+
+/*
+ * Arm the critical section: build the address of the descriptor at
+ * cs_label in RSEQ_ASM_TMP_REG (adrp for the page, add :lo12: for the
+ * low 12 bits), store it into the memory operand named by rseq_cs, then
+ * define "label" right after the store.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ " adrp " RSEQ_ASM_TMP_REG ", " __rseq_str(cs_label) "\n" \
+ " add " RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG \
+ ", :lo12:" __rseq_str(cs_label) "\n" \
+ " str " RSEQ_ASM_TMP_REG ", %[" __rseq_str(rseq_cs) "]\n" \
+ __rseq_str(label) ":\n"
+
+/*
+ * Emit the abort handler inline. Straight-line execution skips it with
+ * "b 222f". The handler consists of the RSEQ_SIG_CODE word (emitted with
+ * .inst) immediately followed by "label:" and a branch to the C label
+ * abort_label.
+ */
+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label) \
+ " b 222f\n" \
+ " .inst " __rseq_str(RSEQ_SIG_CODE) "\n" \
+ __rseq_str(label) ":\n" \
+ " b %l[" __rseq_str(abort_label) "]\n" \
+ "222:\n"
+
+/* Plain store (str) of the register operand "value" into the memory operand "var". */
+#define RSEQ_ASM_OP_STORE(value, var) \
+ " str %[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+/* Store-release (stlr) of the register operand "value" into the memory operand "var". */
+#define RSEQ_ASM_OP_STORE_RELEASE(value, var) \
+ " stlr %[" __rseq_str(value) "], %[" __rseq_str(var) "]\n"
+
+/* RSEQ_ASM_OP_STORE followed by the definition of post_commit_label. */
+#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \
+ RSEQ_ASM_OP_STORE(value, var) \
+ __rseq_str(post_commit_label) ":\n"
+
+/* RSEQ_ASM_OP_STORE_RELEASE followed by the definition of post_commit_label. */
+#define RSEQ_ASM_OP_FINAL_STORE_RELEASE(value, var, post_commit_label) \
+ RSEQ_ASM_OP_STORE_RELEASE(value, var) \
+ __rseq_str(post_commit_label) ":\n"
+
+/*
+ * Load "var" into RSEQ_ASM_TMP_REG, subtract "expect" and branch to
+ * "label" when the difference is non-zero, i.e. when var != expect.
+ */
+#define RSEQ_ASM_OP_CMPEQ(var, expect, label) \
+ " ldr " RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n" \
+ " sub " RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG \
+ ", %[" __rseq_str(expect) "]\n" \
+ " cbnz " RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+/*
+ * 32-bit variant of RSEQ_ASM_OP_CMPEQ: uses RSEQ_ASM_TMP_REG32 and the
+ * %w (32-bit register) form of "expect".
+ */
+#define RSEQ_ASM_OP_CMPEQ32(var, expect, label) \
+ " ldr " RSEQ_ASM_TMP_REG32 ", %[" __rseq_str(var) "]\n" \
+ " sub " RSEQ_ASM_TMP_REG32 ", " RSEQ_ASM_TMP_REG32 \
+ ", %w[" __rseq_str(expect) "]\n" \
+ " cbnz " RSEQ_ASM_TMP_REG32 ", " __rseq_str(label) "\n"
+
+/*
+ * Load "var" into RSEQ_ASM_TMP_REG, subtract "expect" and branch to
+ * "label" when the difference is zero, i.e. when var == expect.
+ */
+#define RSEQ_ASM_OP_CMPNE(var, expect, label) \
+ " ldr " RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n" \
+ " sub " RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG \
+ ", %[" __rseq_str(expect) "]\n" \
+ " cbz " RSEQ_ASM_TMP_REG ", " __rseq_str(label) "\n"
+
+/*
+ * Branch to "label" when the 32-bit memory operand current_cpu_id differs
+ * from the register operand cpu_id. Note the swapped argument order in
+ * the RSEQ_ASM_OP_CMPEQ32 call: current_cpu_id is the loaded "var".
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ RSEQ_ASM_OP_CMPEQ32(current_cpu_id, cpu_id, label)
+
+/* Load the memory operand "var" into RSEQ_ASM_TMP_REG. */
+#define RSEQ_ASM_OP_R_LOAD(var) \
+ " ldr " RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+/* Store RSEQ_ASM_TMP_REG into the memory operand "var". */
+#define RSEQ_ASM_OP_R_STORE(var) \
+ " str " RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n"
+
+/*
+ * Treat RSEQ_ASM_TMP_REG as a base address and replace it with the value
+ * loaded from [RSEQ_ASM_TMP_REG + offset].
+ */
+#define RSEQ_ASM_OP_R_LOAD_OFF(offset) \
+ " ldr " RSEQ_ASM_TMP_REG ", [" RSEQ_ASM_TMP_REG \
+ ", %[" __rseq_str(offset) "]]\n"
+
+/* Add the operand "count" to RSEQ_ASM_TMP_REG. */
+#define RSEQ_ASM_OP_R_ADD(count) \
+ " add " RSEQ_ASM_TMP_REG ", " RSEQ_ASM_TMP_REG \
+ ", %[" __rseq_str(count) "]\n"
+
+/*
+ * Store RSEQ_ASM_TMP_REG into the memory operand "var", then define
+ * post_commit_label.
+ */
+#define RSEQ_ASM_OP_R_FINAL_STORE(var, post_commit_label) \
+ " str " RSEQ_ASM_TMP_REG ", %[" __rseq_str(var) "]\n" \
+ __rseq_str(post_commit_label) ":\n"
+
+/*
+ * Byte-at-a-time copy of "len" bytes from "src" to "dst". Nothing is
+ * copied when len is zero. The index in RSEQ_ASM_TMP_REG_2 starts at len
+ * and is decremented before each access, so bytes are copied from the
+ * last one down to index 0. RSEQ_ASM_TMP_REG32 holds the byte in flight.
+ * Uses the local labels 222 and 333.
+ */
+#define RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len) \
+ " cbz %[" __rseq_str(len) "], 333f\n" \
+ " mov " RSEQ_ASM_TMP_REG_2 ", %[" __rseq_str(len) "]\n" \
+ "222: sub " RSEQ_ASM_TMP_REG_2 ", " RSEQ_ASM_TMP_REG_2 ", #1\n" \
+ " ldrb " RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(src) "]" \
+ ", " RSEQ_ASM_TMP_REG_2 "]\n" \
+ " strb " RSEQ_ASM_TMP_REG32 ", [%[" __rseq_str(dst) "]" \
+ ", " RSEQ_ASM_TMP_REG_2 "]\n" \
+ " cbnz " RSEQ_ASM_TMP_REG_2 ", 222b\n" \
+ "333:\n"
+
+/*
+ * rseq_cmpeqv_storev - compare *v against expect and, if equal, store newv.
+ *
+ * Local labels: 1 is the descriptor, 2 the start of the critical section,
+ * 3 the post-commit point and 4 the abort handler. The asm block checks
+ * that cpu equals rseq_get_abi()->cpu_id, then that *v equals expect, and
+ * finally stores newv into *v. RSEQ_ASM_TMP_REG is used as scratch.
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *   1  *v differed from expect (cmpfail).
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated before the store, and a
+ * mismatch on the repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "Qo" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * rseq_cmpnev_storeoffp_load - if *v differs from expectnot, save the old
+ * *v into *load and replace *v with the value found at (old *v + voffp).
+ *
+ * The asm block checks that cpu equals rseq_get_abi()->cpu_id and bails
+ * out to cmpfail when *v equals expectnot. Otherwise it reloads *v into
+ * RSEQ_ASM_TMP_REG, stores it into *load, loads from [that value + voffp]
+ * and stores the result into *v (the final store, ending at local label 3).
+ *
+ * Returns:
+ *   0  the final store was performed.
+ *   1  *v was equal to expectnot (cmpfail).
+ *  -1  the abort handler at local label 4 ran; the cpu_id mismatch check
+ *      branches there directly.
+ *
+ * With RSEQ_COMPARE_TWICE both checks are repeated, and a mismatch on the
+ * repeated check calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+ RSEQ_ASM_OP_R_LOAD(v)
+ RSEQ_ASM_OP_R_STORE(load)
+ RSEQ_ASM_OP_R_LOAD_OFF(voffp)
+ RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "Qo" (*v),
+ [expectnot] "r" (expectnot),
+ [load] "Qo" (*load),
+ [voffp] "r" (voffp)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_addv() - restartable-sequence fast path assembled from the
+ * RSEQ_ASM_* helper macros, which are defined outside this excerpt.
+ *
+ * NOTE(review): judging by the macro names, the sequence checks that the
+ * current CPU equals @cpu, loads *v, adds @count and commits the sum back
+ * to *v as the final store -- confirm against the macro bodies.
+ *
+ * Returns 0 when the asm falls through and -1 when it branches to the abort
+ * label; this variant has no cmpfail label. With RSEQ_COMPARE_TWICE the
+ * error1 label calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+ RSEQ_ASM_OP_R_LOAD(v)
+ RSEQ_ASM_OP_R_ADD(count)
+ RSEQ_ASM_OP_R_FINAL_STORE(v, 3)
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "Qo" (*v),
+ [count] "r" (count)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trystorev_storev() - restartable-sequence fast path assembled
+ * from the RSEQ_ASM_* helper macros, which are defined outside this excerpt.
+ *
+ * NOTE(review): judging by the macro names, the sequence checks that the
+ * current CPU equals @cpu and that *v equals @expect, stores @newv2 to *v2
+ * (a store that precedes the commit) and then commits @newv to *v as the
+ * final store -- confirm against the macro bodies.
+ *
+ * Returns 0 when the asm falls through, -1 when it branches to the abort
+ * label and 1 when it branches to the cmpfail label. With
+ * RSEQ_COMPARE_TWICE the error1/error2 labels call rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ RSEQ_ASM_OP_STORE(newv2, v2)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [expect] "r" (expect),
+ [v] "Qo" (*v),
+ [newv] "r" (newv),
+ [v2] "Qo" (*v2),
+ [newv2] "r" (newv2)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+cmpfail:
+ rseq_after_asm_goto();
+ return 1; /* reached through the cmpfail label */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trystorev_storev_release() - same statement sequence as
+ * rseq_cmpeqv_trystorev_storev(), except that the commit uses
+ * RSEQ_ASM_OP_FINAL_STORE_RELEASE instead of RSEQ_ASM_OP_FINAL_STORE.
+ *
+ * NOTE(review): the helper macros are defined outside this excerpt; the
+ * name suggests the final store to *v has release ordering with respect to
+ * the earlier store of @newv2 to *v2 -- confirm against the macro body.
+ *
+ * Returns 0 when the asm falls through, -1 when it branches to the abort
+ * label and 1 when it branches to the cmpfail label. With
+ * RSEQ_COMPARE_TWICE the error1/error2 labels call rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ RSEQ_ASM_OP_STORE(newv2, v2)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [expect] "r" (expect),
+ [v] "Qo" (*v),
+ [newv] "r" (newv),
+ [v2] "Qo" (*v2),
+ [newv2] "r" (newv2)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+cmpfail:
+ rseq_after_asm_goto();
+ return 1; /* reached through the cmpfail label */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_cmpeqv_storev() - restartable-sequence fast path assembled
+ * from the RSEQ_ASM_* helper macros, which are defined outside this excerpt.
+ *
+ * NOTE(review): judging by the macro names, the sequence checks that the
+ * current CPU equals @cpu, that *v equals @expect and that *v2 equals
+ * @expect2, then commits @newv to *v as the final store -- confirm against
+ * the macro bodies. Both value comparisons branch to the same cmpfail
+ * label, so the caller cannot tell which one failed.
+ *
+ * Returns 0 when the asm falls through, -1 when it branches to the abort
+ * label and 1 when it branches to the cmpfail label. With
+ * RSEQ_COMPARE_TWICE the error1/error2/error3 labels call rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error3])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+ RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "Qo" (*v),
+ [expect] "r" (expect),
+ [v2] "Qo" (*v2),
+ [expect2] "r" (expect2),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+cmpfail:
+ rseq_after_asm_goto();
+ return 1; /* reached through the cmpfail label (either comparison) */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trymemcpy_storev() - restartable-sequence fast path assembled
+ * from the RSEQ_ASM_* helper macros, which are defined outside this excerpt.
+ *
+ * NOTE(review): judging by the macro names, the sequence checks that the
+ * current CPU equals @cpu and that *v equals @expect, copies @len bytes
+ * from @src to @dst (a copy that precedes the commit) and then commits
+ * @newv to *v as the final store -- confirm against the macro bodies.
+ *
+ * Unlike the other variants visible here, the clobber list also names
+ * RSEQ_ASM_TMP_REG_2.
+ *
+ * Returns 0 when the asm falls through, -1 when it branches to the abort
+ * label and 1 when it branches to the cmpfail label. With
+ * RSEQ_COMPARE_TWICE the error1/error2 labels call rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 3)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [expect] "r" (expect),
+ [v] "Qo" (*v),
+ [newv] "r" (newv),
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+cmpfail:
+ rseq_after_asm_goto();
+ return 1; /* reached through the cmpfail label */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trymemcpy_storev_release() - same statement sequence as
+ * rseq_cmpeqv_trymemcpy_storev(), except that the commit uses
+ * RSEQ_ASM_OP_FINAL_STORE_RELEASE instead of RSEQ_ASM_OP_FINAL_STORE.
+ *
+ * NOTE(review): the helper macros are defined outside this excerpt; the
+ * name suggests the final store to *v has release ordering with respect to
+ * the preceding byte copy -- confirm against the macro body.
+ *
+ * Returns 0 when the asm falls through, -1 when it branches to the abort
+ * label and 1 when it branches to the cmpfail label. With
+ * RSEQ_COMPARE_TWICE the error1/error2 labels call rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(1, 2f, 3f, 4f)
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(2f, %l[error2])
+#endif
+ RSEQ_ASM_STORE_RSEQ_CS(2, 1b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ RSEQ_ASM_OP_R_BAD_MEMCPY(dst, src, len)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_OP_FINAL_STORE_RELEASE(newv, v, 3)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "Qo" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [expect] "r" (expect),
+ [v] "Qo" (*v),
+ [newv] "r" (newv),
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len)
+ RSEQ_INJECT_INPUT
+ : "memory", RSEQ_ASM_TMP_REG, RSEQ_ASM_TMP_REG_2
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0; /* asm fell through: no branch to any goto label */
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort label */
+cmpfail:
+ rseq_after_asm_goto();
+ return 1; /* reached through the cmpfail label */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-generic-thread-pointer.h b/tools/testing/selftests/rseq/rseq-generic-thread-pointer.h
new file mode 100644
index 000000000..38c584661
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-generic-thread-pointer.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: LGPL-2.1-only OR MIT */
+/*
+ * rseq-generic-thread-pointer.h
+ *
+ * (C) Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef _RSEQ_GENERIC_THREAD_POINTER
+#define _RSEQ_GENERIC_THREAD_POINTER
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Fetch the calling thread's thread pointer through the compiler builtin. */
+static inline void *rseq_thread_pointer(void)
+{
+ void *tp = __builtin_thread_pointer();
+
+ return tp;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/testing/selftests/rseq/rseq-mips.h b/tools/testing/selftests/rseq/rseq-mips.h
new file mode 100644
index 000000000..878739fae
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-mips.h
@@ -0,0 +1,777 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * Author: Paul Burton <paul.burton@mips.com>
+ * (C) Copyright 2018 MIPS Tech LLC
+ *
+ * Based on rseq-arm.h:
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+/*
+ * RSEQ_SIG uses the break instruction. The instruction pattern is:
+ *
+ * On MIPS:
+ * 0350000d break 0x350
+ *
+ * On nanoMIPS:
+ * 00100350 break 0x350
+ *
+ * On microMIPS:
+ * 0000d407 break 0x350
+ *
+ * For nanoMIPS32 and microMIPS, the instruction stream is encoded as 16-bit
+ * halfwords, so the signature halfwords need to be swapped accordingly for
+ * little-endian.
+ */
+#if defined(__nanomips__)
+# ifdef __MIPSEL__
+# define RSEQ_SIG 0x03500010 /* big-endian value with its 16-bit halfwords swapped */
+# else
+# define RSEQ_SIG 0x00100350
+# endif
+#elif defined(__mips_micromips)
+# ifdef __MIPSEL__
+# define RSEQ_SIG 0xd4070000 /* big-endian value with its 16-bit halfwords swapped */
+# else
+# define RSEQ_SIG 0x0000d407
+# endif
+#elif defined(__mips__)
+# define RSEQ_SIG 0x0350000d /* same value for either endianness */
+#else
+/* Unknown MIPS architecture: RSEQ_SIG is left undefined. */
+#endif
+/*
+ * Memory barriers. rseq_smp_mb(), rseq_smp_rmb() and rseq_smp_wmb() all
+ * expand to the same "sync" instruction with a compiler memory clobber.
+ *
+ * rseq_smp_load_acquire(p) reads *p once, issues the barrier and yields the
+ * value read; rseq_smp_store_release(p, v) issues the barrier and then
+ * writes v to *p once.
+ *
+ * The pointer argument is parenthesised before being dereferenced so that
+ * an expression argument such as "base + 1" expands to *(base + 1) rather
+ * than *base + 1.
+ */
+#define rseq_smp_mb() __asm__ __volatile__ ("sync" ::: "memory")
+#define rseq_smp_rmb() rseq_smp_mb()
+#define rseq_smp_wmb() rseq_smp_mb()
+
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*(p)) ____p1 = RSEQ_READ_ONCE(*(p)); \
+ rseq_smp_mb(); \
+ ____p1; \
+})
+
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_smp_mb(); \
+ RSEQ_WRITE_ONCE(*(p), v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+/*
+ * Word-size dependent assembler spellings. LONG is the data directive for
+ * one long, LONG_LA/LONG_L/LONG_S/LONG_ADDI are the load-address, load,
+ * store and add-immediate mnemonics of matching width.
+ *
+ * U32_U64_PAD(x) widens a long-sized value to a 64-bit field: a no-op for
+ * 64-bit longs, and a zero word placed before (big-endian) or after
+ * (little-endian) the value for 32-bit longs.
+ *
+ * Endianness is taken from the compiler-provided __MIPSEL__, as the
+ * RSEQ_SIG selection in this file already does. Testing
+ * "#ifdef __BIG_ENDIAN" is not reliable in userspace: glibc's <endian.h>
+ * defines __BIG_ENDIAN as a constant on every target, so little-endian
+ * builds would pick the big-endian layout.
+ */
+#if _MIPS_SZLONG == 64
+# define LONG ".dword"
+# define LONG_LA "dla"
+# define LONG_L "ld"
+# define LONG_S "sd"
+# define LONG_ADDI "daddiu"
+# define U32_U64_PAD(x) x
+#elif _MIPS_SZLONG == 32
+# define LONG ".word"
+# define LONG_LA "la"
+# define LONG_L "lw"
+# define LONG_S "sw"
+# define LONG_ADDI "addiu"
+# ifdef __MIPSEL__
+# define U32_U64_PAD(x) x ", 0x0"
+# else
+# define U32_U64_PAD(x) "0x0, " x
+# endif
+#else
+# error unsupported _MIPS_SZLONG
+#endif
+/*
+ * __RSEQ_ASM_DEFINE_TABLE() emits one critical-section descriptor into the
+ * __rseq_cs section, aligned to 32 bytes and labelled @label: two .word
+ * fields (@version, @flags) followed by @start_ip, @post_commit_offset and
+ * @abort_ip, each emitted with LONG and widened by U32_U64_PAD. It also
+ * emits a pointer to that descriptor (a backward reference to @label) into
+ * the __rseq_cs_ptr_array section.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, start_ip, \
+ post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(label) "b") "\n\t" \
+ ".popsection\n\t"
+/* Wrapper: version 0, flags 0, offset computed as post_commit_ip - start_ip. */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each use appends one (start_ip, exit_ip) pair to the
+ * __rseq_exit_point_array section; both values are emitted with LONG and
+ * widened by U32_U64_PAD.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(exit_ip)) "\n\t" \
+ ".popsection\n\t"
+/*
+ * RSEQ_ASM_STORE_RSEQ_CS() loads the address of @cs_label into $4, stores
+ * it to the memory operand named @rseq_cs and then defines @label right
+ * after the store. $4 is used as a scratch register.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ LONG_LA " $4, " __rseq_str(cs_label) "\n\t" \
+ LONG_S " $4, %[" __rseq_str(rseq_cs) "]\n\t" \
+ __rseq_str(label) ":\n\t"
+/*
+ * RSEQ_ASM_CMP_CPU_ID() loads the 32-bit @current_cpu_id operand into $4
+ * and branches to @label when it differs from the @cpu_id operand.
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ "lw $4, %[" __rseq_str(current_cpu_id) "]\n\t" \
+ "bne $4, %[" __rseq_str(cpu_id) "], " __rseq_str(label) "\n\t"
+/*
+ * __RSEQ_ASM_DEFINE_ABORT() emits, inline in the instruction stream and
+ * aligned to 32 bytes, a descriptor labelled @table_label with the same
+ * field layout as the __rseq_cs entries, followed by the RSEQ_SIG word.
+ * The abort handler @label comes directly after the signature: it runs the
+ * @teardown instructions and branches to the C label @abort_label.
+ */
+#define __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \
+ abort_label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".balign 32\n\t" \
+ __rseq_str(table_label) ":\n\t" \
+ ".word " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(start_ip)) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(post_commit_offset)) "\n\t" \
+ LONG " " U32_U64_PAD(__rseq_str(abort_ip)) "\n\t" \
+ ".word " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "b %l[" __rseq_str(abort_label) "]\n\t"
+
+#define RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, abort_label, \
+ start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_ABORT(table_label, label, teardown, \
+ abort_label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+/*
+ * RSEQ_ASM_DEFINE_CMPFAIL() defines local label @label, runs the @teardown
+ * instructions and branches to the C label @cmpfail_label.
+ */
+#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "b %l[" __rseq_str(cmpfail_label) "]\n\t"
+/*
+ * rseq_cmpeqv_storev() - MIPS rseq fast path: if the current CPU is @cpu
+ * and *v equals @expect, store @newv to *v.
+ *
+ * The asm publishes the address of the inline descriptor (label 3, emitted
+ * by RSEQ_ASM_DEFINE_ABORT) in rseq_cs, compares the cpu_id field with @cpu
+ * (mismatch branches to the abort handler at label 4), loads *v into $4 and
+ * branches to cmpfail if it differs from @expect, then performs the final
+ * store. Label 2 marks the post-commit address and "b 5f" jumps over the
+ * inline descriptor and abort handler.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[error2]\n\t"
+#endif
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v did not equal expect */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpnev_storeoffp_load() - MIPS rseq fast path: if the current CPU is
+ * @cpu and *v differs from @expectnot, save the old *v in *load and replace
+ * *v with the word found at (old *v + @voffp).
+ *
+ * After the CPU check, the asm loads *v into $4 and branches to cmpfail if
+ * it equals @expectnot. It then stores $4 to *load, adds @voffp to $4,
+ * loads the word at 0($4) and stores that word to *v as the final store.
+ * Label 2 marks the post-commit address and "b 5f" jumps over the inline
+ * descriptor and abort handler.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "beq $4, %[expectnot], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " $4, %[v]\n\t"
+ "beq $4, %[expectnot], %l[error2]\n\t"
+#endif
+ LONG_S " $4, %[load]\n\t" /* *load = old *v */
+ LONG_ADDI " $4, %[voffp]\n\t" /* $4 = old *v + voffp */
+ LONG_L " $4, 0($4)\n\t" /* $4 = word at that address */
+ /* final store */
+ LONG_S " $4, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "Ir" (voffp),
+ [load] "m" (*load)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v equalled expectnot */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_addv() - MIPS rseq fast path: if the current CPU is @cpu, add @count
+ * to *v.
+ *
+ * After the CPU check, the asm loads *v into $4, adds @count with LONG_ADDI
+ * and stores the sum back to *v as the final store. Label 2 marks the
+ * post-commit address and "b 5f" jumps over the inline descriptor and
+ * abort handler.
+ *
+ * Returns 0 when the asm falls through and -1 via the abort label; there
+ * is no cmpfail path. With RSEQ_COMPARE_TWICE the CPU check is repeated
+ * and a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+ LONG_L " $4, %[v]\n\t"
+ LONG_ADDI " $4, %[count]\n\t"
+ /* final store */
+ LONG_S " $4, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [count] "Ir" (count)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trystorev_storev() - MIPS rseq fast path: if the current CPU
+ * is @cpu and *v equals @expect, store @newv2 to *v2 and then @newv to *v.
+ *
+ * The store to *v2 happens inside the critical section before the final
+ * store to *v; label 2 (the post-commit address) follows the store to *v.
+ * "b 5f" jumps over the inline descriptor and abort handler.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[error2]\n\t"
+#endif
+ /* try store */
+ LONG_S " %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v did not equal expect */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trystorev_storev_release() - MIPS rseq fast path: if the
+ * current CPU is @cpu and *v equals @expect, store @newv2 to *v2, issue a
+ * "sync" and then store @newv to *v.
+ *
+ * Identical to rseq_cmpeqv_trystorev_storev() except for the "sync"
+ * between the store to *v2 and the final store to *v.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[error2]\n\t"
+#endif
+ /* try store */
+ LONG_S " %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ "sync\n\t" /* full sync provides store-release */
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v did not equal expect */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_cmpeqv_storev() - MIPS rseq fast path: if the current CPU is
+ * @cpu, *v equals @expect and *v2 equals @expect2, store @newv to *v.
+ *
+ * Both value comparisons branch to the same cmpfail label, so the caller
+ * cannot tell which of the two failed. Label 2 marks the post-commit
+ * address and "b 5f" jumps over the inline descriptor and abort handler.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE all three checks are repeated
+ * and a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+ LONG_L " $4, %[v2]\n\t"
+ "bne $4, %[expect2], %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], %l[error2]\n\t"
+ LONG_L " $4, %[v2]\n\t"
+ "bne $4, %[expect2], %l[error3]\n\t"
+#endif
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ "b 5f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4, "", abort, 1b, 2b, 4f)
+ "5:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ return 0; /* asm fell through label 5: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v != expect or *v2 != expect2 */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trymemcpy_storev() - MIPS rseq fast path: if the current CPU
+ * is @cpu and *v equals @expect, copy @len bytes from @src to @dst one byte
+ * at a time and then store @newv to *v.
+ *
+ * The copy loop advances the registers holding @src, @dst and @len even
+ * though they are declared as inputs (asm goto allows no outputs). Their
+ * original values are therefore saved to rseq_scratch[] before the
+ * critical section starts and reloaded on every exit path: after the
+ * commit, in the abort handler (label 4) and in the cmpfail/error
+ * trampolines (labels 5, 6 and 7), each of which then branches to the
+ * matching C label.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uintptr_t rseq_scratch[3]; /* [0] = src, [1] = dst, [2] = len */
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ LONG_S " %[src], %[rseq_scratch0]\n\t"
+ LONG_S " %[dst], %[rseq_scratch1]\n\t"
+ LONG_S " %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], 7f\n\t"
+#endif
+ /* try memcpy */
+ "beqz %[len], 333f\n\t" \
+ "222:\n\t" \
+ "lb $4, 0(%[src])\n\t" \
+ "sb $4, 0(%[dst])\n\t" \
+ LONG_ADDI " %[src], 1\n\t" \
+ LONG_ADDI " %[dst], 1\n\t" \
+ LONG_ADDI " %[len], -1\n\t" \
+ "bnez %[len], 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t"
+ "b 8f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ abort, 1b, 2b, 4f)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error2)
+#endif
+ "8:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 8: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v did not equal expect */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+/*
+ * rseq_cmpeqv_trymemcpy_storev_release() - MIPS rseq fast path: if the
+ * current CPU is @cpu and *v equals @expect, copy @len bytes from @src to
+ * @dst one byte at a time, issue a "sync" and then store @newv to *v.
+ *
+ * Identical to rseq_cmpeqv_trymemcpy_storev() except for the "sync"
+ * between the byte copy and the final store. As there, the registers
+ * holding @src, @dst and @len are saved to rseq_scratch[] up front and
+ * reloaded on every exit path because the copy loop modifies them.
+ *
+ * Returns 0 when the asm falls through, -1 via the abort label and 1 via
+ * the cmpfail label. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a second failure calls rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uintptr_t rseq_scratch[3]; /* [0] = src, [1] = dst, [2] = len */
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(9, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ LONG_S " %[src], %[rseq_scratch0]\n\t"
+ LONG_S " %[dst], %[rseq_scratch1]\n\t"
+ LONG_S " %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3f, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+ LONG_L " $4, %[v]\n\t"
+ "bne $4, %[expect], 7f\n\t"
+#endif
+ /* try memcpy */
+ "beqz %[len], 333f\n\t" \
+ "222:\n\t" \
+ "lb $4, 0(%[src])\n\t" \
+ "sb $4, 0(%[dst])\n\t" \
+ LONG_ADDI " %[src], 1\n\t" \
+ LONG_ADDI " %[dst], 1\n\t" \
+ LONG_ADDI " %[len], -1\n\t" \
+ "bnez %[len], 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ "sync\n\t" /* full sync provides store-release */
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t"
+ "b 8f\n\t"
+ RSEQ_ASM_DEFINE_ABORT(3, 4,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ abort, 1b, 2b, 4f)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error2)
+#endif
+ "8:\n\t"
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ RSEQ_INJECT_INPUT
+ : "$4", "memory"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ return 0; /* asm fell through label 8: final store was executed */
+abort:
+ RSEQ_INJECT_FAILED
+ return -1; /* reached through the abort handler */
+cmpfail:
+ return 1; /* *v did not equal expect */
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-ppc-thread-pointer.h b/tools/testing/selftests/rseq/rseq-ppc-thread-pointer.h
new file mode 100644
index 000000000..263eee84f
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-ppc-thread-pointer.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: LGPL-2.1-only OR MIT */
+/*
+ * rseq-ppc-thread-pointer.h
+ *
+ * (C) Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef _RSEQ_PPC_THREAD_POINTER
+#define _RSEQ_PPC_THREAD_POINTER
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void *rseq_thread_pointer(void) /* Returns the current value of the register used as thread pointer: r13 on ppc64, r2 on ppc32. */
+{
+#ifdef __powerpc64__
+ register void *__result asm ("r13"); /* pin the variable to r13 */
+#else
+ register void *__result asm ("r2"); /* pin the variable to r2 */
+#endif
+ asm ("" : "=r" (__result)); /* empty asm template: no instruction is emitted, the output operand just makes the compiler take the pinned register's contents as __result */
+ return __result;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/testing/selftests/rseq/rseq-ppc.h b/tools/testing/selftests/rseq/rseq-ppc.h
new file mode 100644
index 000000000..bab8e0b9f
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-ppc.h
@@ -0,0 +1,791 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-ppc.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * (C) Copyright 2016-2018 - Boqun Feng <boqun.feng@gmail.com>
+ */
+
+/*
+ * RSEQ_SIG is used with the following trap instruction:
+ *
+ * powerpc-be: 0f e5 00 0b twui r5,11
+ * powerpc64-le: 0b 00 e5 0f twui r5,11
+ * powerpc64-be: 0f e5 00 0b twui r5,11
+ */
+
+#define RSEQ_SIG 0x0fe5000b /* emitted as a .long right before each abort handler by RSEQ_ASM_DEFINE_ABORT */
+/* Barriers: rseq_smp_mb() issues "sync"; the read and write barriers both map to "lwsync". */
+#define rseq_smp_mb() __asm__ __volatile__ ("sync" ::: "memory", "cc")
+#define rseq_smp_lwsync() __asm__ __volatile__ ("lwsync" ::: "memory", "cc")
+#define rseq_smp_rmb() rseq_smp_lwsync()
+#define rseq_smp_wmb() rseq_smp_lwsync()
+/* Acquire load: read *p once via RSEQ_READ_ONCE(), issue lwsync, then yield the value read. */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \
+ rseq_smp_lwsync(); \
+ ____p1; \
+})
+/* Acquire ordering after a control dependency is implemented with lwsync as well. */
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_lwsync()
+/* Release store: issue lwsync first, then write v to *p once via RSEQ_WRITE_ONCE(). */
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_smp_lwsync(); \
+ RSEQ_WRITE_ONCE(*p, v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+/*
+ * The __rseq_cs_ptr_array and __rseq_cs sections can be used by debuggers to
+ * better handle single-stepping through the restartable critical sections.
+ */
+
+#ifdef __PPC64__
+
+#define RSEQ_STORE_LONG(arg) "std%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 64-bit store to memory ("m" constraint); %U/%X let the compiler pick the update/indexed mnemonic form matching the operand's address */
+#define RSEQ_STORE_INT(arg) "stw%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 32-bit store to memory ("m" constraint) */
+#define RSEQ_LOAD_LONG(arg) "ld%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 64-bit load from memory ("m" constraint) */
+#define RSEQ_LOAD_INT(arg) "lwz%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 32-bit load from memory ("m" constraint) */
+#define RSEQ_LOADX_LONG "ldx " /* 64-bit indexed load; one address register comes from a "b" (base register) constraint */
+#define RSEQ_CMP_LONG "cmpd " /* 64-bit register/register compare */
+#define RSEQ_CMP_LONG_INT "cmpdi " /* 64-bit register/immediate compare */
+/* Emit one 32-byte-aligned descriptor into __rseq_cs (two .long words, then three .quad words) and record its address in __rseq_cs_ptr_array. */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" /* "b": backward reference to the descriptor label defined just above */ \
+ ".popsection\n\t"
+
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) /* build the 64-bit address of cs_label in r17, store it to the rseq_cs memory operand, then define label */ \
+ RSEQ_INJECT_ASM(1) \
+ "lis %%r17, (" __rseq_str(cs_label) ")@highest\n\t" /* address bits 48-63 */ \
+ "ori %%r17, %%r17, (" __rseq_str(cs_label) ")@higher\n\t" /* address bits 32-47 */ \
+ "rldicr %%r17, %%r17, 32, 31\n\t" /* move what was loaded so far into the upper 32 bits */ \
+ "oris %%r17, %%r17, (" __rseq_str(cs_label) ")@high\n\t" /* address bits 16-31 */ \
+ "ori %%r17, %%r17, (" __rseq_str(cs_label) ")@l\n\t" /* address bits 0-15 */ \
+ RSEQ_STORE_LONG(rseq_cs) "%%r17, %[" __rseq_str(rseq_cs) "]\n\t" /* "m" operand: use the %U/%X-aware store so an update/indexed address still assembles, as the 32-bit variant does */ \
+ __rseq_str(label) ":\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) /* record one (start_ip, exit_ip) pair */ \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" /* two 64-bit entries */ \
+ ".popsection\n\t"
+
+#else /* #ifdef __PPC64__ */
+
+#define RSEQ_STORE_LONG(arg) "stw%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 32-bit store to memory ("m" constraint); %U/%X let the compiler pick the update/indexed mnemonic form matching the operand's address */
+#define RSEQ_STORE_INT(arg) RSEQ_STORE_LONG(arg) /* To memory ("m" constraint); same instruction as the long store on 32-bit */
+#define RSEQ_LOAD_LONG(arg) "lwz%U[" __rseq_str(arg) "]%X[" __rseq_str(arg) "] " /* 32-bit load from memory ("m" constraint) */
+#define RSEQ_LOAD_INT(arg) RSEQ_LOAD_LONG(arg) /* From memory ("m" constraint); same instruction as the long load on 32-bit */
+#define RSEQ_LOADX_LONG "lwzx " /* 32-bit indexed load; one address register comes from a "b" (base register) constraint */
+#define RSEQ_CMP_LONG "cmpw " /* 32-bit register/register compare */
+#define RSEQ_CMP_LONG_INT "cmpwi " /* 32-bit register/immediate compare */
+/* Emit one 32-byte-aligned descriptor into __rseq_cs and record its address in __rseq_cs_ptr_array; each address field is written as a zero .long followed by the 32-bit value. */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ /* 32-bit only supported on BE: the zero word comes first, i.e. it is the high half of each 64-bit field */ \
+ ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" /* "b": backward reference to the descriptor label defined just above */ \
+ ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) /* record one (start_ip, exit_ip) pair */ \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ /* 32-bit only supported on BE: each entry is a zero word followed by the 32-bit address */ \
+ ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
+ ".popsection\n\t"
+
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) /* build the 32-bit address of cs_label in r17, store it to the rseq_cs memory operand, then define label */ \
+ RSEQ_INJECT_ASM(1) \
+ "lis %%r17, (" __rseq_str(cs_label) ")@ha\n\t" /* high half, adjusted for the signed low half added next */ \
+ "addi %%r17, %%r17, (" __rseq_str(cs_label) ")@l\n\t" /* add the low 16 bits */ \
+ RSEQ_STORE_INT(rseq_cs) "%%r17, %[" __rseq_str(rseq_cs) "]\n\t" /* 32-bit store to the "m" operand */ \
+ __rseq_str(label) ":\n\t"
+
+#endif /* #ifdef __PPC64__ */
+
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) /* descriptor with version 0, flags 0 and the post-commit position expressed as an offset from start_ip */ \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+/* Load the current_cpu_id memory operand into r17, compare it with the cpu_id register operand in cr7, branch to label when they differ. */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ RSEQ_LOAD_INT(current_cpu_id) "%%r17, %[" __rseq_str(current_cpu_id) "]\n\t" \
+ "cmpw cr7, %[" __rseq_str(cpu_id) "], %%r17\n\t" \
+ "bne- cr7, " __rseq_str(label) "\n\t"
+/* Abort handler, placed in the __rseq_failure section: the RSEQ_SIG word, then label, then a branch to the C label abort_label. */
+#define RSEQ_ASM_DEFINE_ABORT(label, abort_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ ".long " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ "b %l[" __rseq_str(abort_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * RSEQ_ASM_OPs: asm operations for rseq
+ * RSEQ_ASM_OP_R_*: operate on hard-coded registers
+ * RSEQ_ASM_OP_* (else): take named operands (cr7, and r17 as scratch in the CMPEQ/CMPNE macros, are still hard-coded)
+ */
+#define RSEQ_ASM_OP_CMPEQ(var, expect, label) /* branch to label unless the var memory operand equals the expect register operand; uses r17 and cr7 */ \
+ RSEQ_LOAD_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t" \
+ RSEQ_CMP_LONG "cr7, %%r17, %[" __rseq_str(expect) "]\n\t" \
+ "bne- cr7, " __rseq_str(label) "\n\t"
+/* Branch to label if the var memory operand equals the expectnot register operand; uses r17 and cr7. */
+#define RSEQ_ASM_OP_CMPNE(var, expectnot, label) \
+ RSEQ_LOAD_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t" \
+ RSEQ_CMP_LONG "cr7, %%r17, %[" __rseq_str(expectnot) "]\n\t" \
+ "beq- cr7, " __rseq_str(label) "\n\t"
+/* Store the value register operand to the var memory operand. */
+#define RSEQ_ASM_OP_STORE(value, var) \
+ RSEQ_STORE_LONG(var) "%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n\t"
+
+/* Load @var to r17 */
+#define RSEQ_ASM_OP_R_LOAD(var) \
+ RSEQ_LOAD_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t"
+
+/* Store r17 to @var */
+#define RSEQ_ASM_OP_R_STORE(var) \
+ RSEQ_STORE_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t"
+
+/* Add @count to r17 */
+#define RSEQ_ASM_OP_R_ADD(count) \
+ "add %%r17, %[" __rseq_str(count) "], %%r17\n\t"
+
+/* Load (r17 + voffp) to r17: indexed load whose address is the sum of the voffp register operand and r17 */
+#define RSEQ_ASM_OP_R_LOADX(voffp) \
+ RSEQ_LOADX_LONG "%%r17, %[" __rseq_str(voffp) "], %%r17\n\t"
+
+/* Byte-wise copy of r19 bytes from the address in r20 to the address in r21, with r18 as scratch; r19-r21 are modified. TODO: implement a faster memcpy. */
+#define RSEQ_ASM_OP_R_MEMCPY() \
+ RSEQ_CMP_LONG_INT "%%r19, 0\n\t" /* nothing to copy when the length is zero */ \
+ "beq 333f\n\t" \
+ "addi %%r20, %%r20, -1\n\t" /* pre-decrement both pointers: lbzu/stbu below add 1 before accessing */ \
+ "addi %%r21, %%r21, -1\n\t" \
+ "222:\n\t" \
+ "lbzu %%r18, 1(%%r20)\n\t" /* load one byte and advance the source pointer */ \
+ "stbu %%r18, 1(%%r21)\n\t" /* store it and advance the destination pointer */ \
+ "addi %%r19, %%r19, -1\n\t" \
+ RSEQ_CMP_LONG_INT "%%r19, 0\n\t" \
+ "bne 222b\n\t" \
+ "333:\n\t" /* last line of the macro: no line continuation here, so the definition cannot swallow whatever follows */
+
+#define RSEQ_ASM_OP_R_FINAL_STORE(var, post_commit_label) /* store r17 to the var memory operand, then define post_commit_label right after the store */ \
+ RSEQ_STORE_LONG(var) "%%r17, %[" __rseq_str(var) "]\n\t" \
+ __rseq_str(post_commit_label) ":\n\t"
+/* Store the value register operand to the var memory operand, then define post_commit_label right after the store. */
+#define RSEQ_ASM_OP_FINAL_STORE(value, var, post_commit_label) \
+ RSEQ_STORE_LONG(var) "%[" __rseq_str(value) "], %[" __rseq_str(var) "]\n\t" \
+ __rseq_str(post_commit_label) ":\n\t"
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu and *@v == @expect, store @newv to *@v. Returns 0 on commit, -1 on abort, 1 when *@v != @expect. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the compare macros, r17 is their scratch register */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu and *@v != @expectnot: copy *@v to *@load, then store the word found at (*@v + @voffp) to *@v. Returns 0 on commit, -1 on abort, 1 when *@v == @expectnot. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v not equal to @expectnot */
+ RSEQ_ASM_OP_CMPNE(v, expectnot, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v != @expectnot check; a failure here is reported as a bug */
+ RSEQ_ASM_OP_CMPNE(v, expectnot, %l[error2])
+#endif
+ /* load the value of @v */
+ RSEQ_ASM_OP_R_LOAD(v)
+ /* store it in @load */
+ RSEQ_ASM_OP_R_STORE(load)
+ /* dereference voffp(v) */
+ RSEQ_ASM_OP_R_LOADX(voffp)
+ /* final store the value at voffp(v) */
+ RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "b" (voffp), /* "b": base register, as required by the indexed load in RSEQ_ASM_OP_R_LOADX */
+ [load] "m" (*load)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the compare macros, r17 carries the loaded values */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu, add @count to *@v. Returns 0 on commit, -1 on abort. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+ /* load the value of @v */
+ RSEQ_ASM_OP_R_LOAD(v)
+ /* add @count to it */
+ RSEQ_ASM_OP_R_ADD(count)
+ /* final store */
+ RSEQ_ASM_OP_R_FINAL_STORE(v, 2)
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "r" (count)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the cpu_id compare, r17 holds the running sum */
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu and *@v == @expect: store @newv2 to *@v2, then store @newv to *@v as the final store. Returns 0 on commit, -1 on abort, 1 when *@v != @expect. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ /* try store */
+ RSEQ_ASM_OP_STORE(newv2, v2)
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the compare macros, r17 is their scratch register */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* Same sequence as rseq_cmpeqv_trystorev_storev(), with an lwsync issued between the try store and the final store. Returns 0 on commit, -1 on abort, 1 when *@v != @expect. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ /* try store */
+ RSEQ_ASM_OP_STORE(newv2, v2)
+ RSEQ_INJECT_ASM(5)
+ /* for 'release': barrier between the try store and the final store */
+ "lwsync\n\t"
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the compare macros, r17 is their scratch register */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu, *@v == @expect and *@v2 == @expect2, store @newv to *@v. Returns 0 on commit, -1 on abort, 1 when either comparison fails. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+ /* cmp @v2 equal to @expect2 */
+ RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[cmpfail])
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+ /* repeat the @v2 == @expect2 check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v2, expect2, %l[error3])
+#endif
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17" /* cr7 is written by the compare macros, r17 is their scratch register */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* If the current cpu_id equals @cpu and *@v == @expect: copy @len bytes from @src to @dst, then store @newv to *@v as the final store. Returns 0 on commit, -1 on abort, 1 when *@v != @expect. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* setup for memcpy: copy the operands into the fixed registers RSEQ_ASM_OP_R_MEMCPY() works on */
+ "mr %%r19, %[len]\n\t"
+ "mr %%r20, %[src]\n\t"
+ "mr %%r21, %[dst]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ /* try memcpy */
+ RSEQ_ASM_OP_R_MEMCPY()
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(6)
+ /* no teardown code: the copy only modified r18-r21, never the input operands */
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17", "r18", "r19", "r20", "r21" /* cr7: compare macros; r17: scratch; r18-r21: memcpy registers */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+ /* Same sequence as rseq_cmpeqv_trymemcpy_storev(), with an lwsync issued between the copy and the final store. Returns 0 on commit, -1 on abort, 1 when *@v != @expect. */
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* setup for memcpy: copy the operands into the fixed registers RSEQ_ASM_OP_R_MEMCPY() works on */
+ "mr %%r19, %[len]\n\t"
+ "mr %%r20, %[src]\n\t"
+ "mr %%r21, %[dst]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* compare cpu_id; a mismatch branches to the abort handler (4f) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[cmpfail])
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ /* repeat the cpu_id check; a mismatch here is reported as a bug */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ /* repeat the @v == @expect check; a mismatch here is reported as a bug */
+ RSEQ_ASM_OP_CMPEQ(v, expect, %l[error2])
+#endif
+ /* try memcpy */
+ RSEQ_ASM_OP_R_MEMCPY()
+ RSEQ_INJECT_ASM(5)
+ /* for 'release': barrier between the copy and the final store */
+ "lwsync\n\t"
+ /* final store */
+ RSEQ_ASM_OP_FINAL_STORE(newv, v, 2)
+ RSEQ_INJECT_ASM(6)
+ /* no teardown code: the copy only modified r18-r21, never the input operands */
+ RSEQ_ASM_DEFINE_ABORT(4, abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "cr7", "r17", "r18", "r19", "r20", "r21" /* cr7: compare macros; r17: scratch; r18-r21: memcpy registers */
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-s390.h b/tools/testing/selftests/rseq/rseq-s390.h
new file mode 100644
index 000000000..4e6dc5f0c
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-s390.h
@@ -0,0 +1,610 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+
+/*
+ * RSEQ_SIG uses the trap4 instruction. As Linux does not make use of the
+ * access-register mode nor the linkage stack this instruction will always
+ * cause a special-operation exception (the trap-enabled bit in the DUCT
+ * is and will stay 0). The instruction pattern is
+ * b2 ff 0f ff trap4 4095(%r0)
+ *
+ * RSEQ_ASM_DEFINE_ABORT() emits this word immediately before every abort
+ * label.
+ */
+#define RSEQ_SIG 0xB2FF0FFF
+
+/*
+ * Full memory barrier: "bcr 15,0" combined with a compiler "memory" clobber.
+ * The read and write barriers are plain aliases of the full barrier.
+ */
+#define rseq_smp_mb() __asm__ __volatile__ ("bcr 15,0" ::: "memory")
+#define rseq_smp_rmb() rseq_smp_mb()
+#define rseq_smp_wmb() rseq_smp_mb()
+
+/*
+ * Load-acquire: read *p once through RSEQ_READ_ONCE(), then call
+ * rseq_barrier() (defined elsewhere) before yielding the loaded value.
+ * No barrier instruction is emitted by this macro itself.
+ */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \
+ rseq_barrier(); \
+ ____p1; \
+})
+
+/* Acquire ordering after a control dependency: same as rseq_smp_rmb(). */
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+/*
+ * Store-release: call rseq_barrier() (defined elsewhere), then write v to *p
+ * once through RSEQ_WRITE_ONCE().
+ */
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_barrier(); \
+ RSEQ_WRITE_ONCE(*p, v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+#ifdef __s390x__
+
+/*
+ * Instruction mnemonics used by the sequences below for long-sized operands
+ * when building for __s390x__: load, store, load-and-test register, compare
+ * with memory, compare registers, add immediate and add register.
+ */
+#define LONG_L "lg"
+#define LONG_S "stg"
+#define LONG_LT_R "ltgr"
+#define LONG_CMP "cg"
+#define LONG_CMP_R "cgr"
+#define LONG_ADDI "aghi"
+#define LONG_ADD_R "agr"
+
+/*
+ * Emit one critical-section descriptor at @label in section __rseq_cs,
+ * aligned on 32 bytes: two .long fields (@version, @flags) followed by three
+ * .quad fields (@start_ip, @post_commit_offset, @abort_ip). The address of
+ * the descriptor is additionally recorded in section __rseq_cs_ptr_array.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each entry is a (@start_ip, @exit_ip) pair of .quad values placed in
+ * section __rseq_exit_point_array.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
+ ".popsection\n\t"
+
+#elif __s390__
+
+/*
+ * Same descriptor as the __s390x__ variant, but every field that is a .quad
+ * there is emitted here as two .long words: a zero word followed by the
+ * value.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(label) "b\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each entry is a (@start_ip, @exit_ip) pair, each emitted as a zero .long
+ * followed by the value, placed in section __rseq_exit_point_array.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".long 0x0, " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) "\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Instruction mnemonics used by the sequences below for long-sized operands
+ * when building for __s390__ (same roles as in the __s390x__ branch).
+ */
+#define LONG_L "l"
+#define LONG_S "st"
+#define LONG_LT_R "ltr"
+#define LONG_CMP "c"
+#define LONG_CMP_R "cr"
+#define LONG_ADDI "ahi"
+#define LONG_ADD_R "ar"
+
+#endif
+
+/*
+ * Emit a descriptor with version 0 and flags 0. The post-commit field is
+ * expressed as the offset (@post_commit_ip - @start_ip).
+ */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Load the address of descriptor @cs_label into %r0 with larl, store it into
+ * the memory operand named @rseq_cs, then define @label right after the
+ * store. %r0 is overwritten, so users list "r0" as a clobber.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ "larl %%r0, " __rseq_str(cs_label) "\n\t" \
+ LONG_S " %%r0, %[" __rseq_str(rseq_cs) "]\n\t" \
+ __rseq_str(label) ":\n\t"
+
+/*
+ * Compare the register operand @cpu_id with the memory operand
+ * @current_cpu_id using "c" and branch to @label when they differ.
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ "c %[" __rseq_str(cpu_id) "], %[" __rseq_str(current_cpu_id) "]\n\t" \
+ "jnz " __rseq_str(label) "\n\t"
+
+/*
+ * Abort handler, placed in section __rseq_failure: the RSEQ_SIG word, then
+ * @label, then the caller-supplied @teardown instructions, then a jump to
+ * the C label @abort_label.
+ */
+#define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ ".long " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jg %l[" __rseq_str(abort_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Comparison-failure handler, placed in section __rseq_failure: @label, the
+ * caller-supplied @teardown instructions, then a jump to the C label
+ * @cmpfail_label. Unlike the abort handler, no signature word is emitted.
+ */
+#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jg %l[" __rseq_str(cmpfail_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Compare *@v against @expect and, when they are equal, store @newv into *@v
+ * as the final store of the critical section.
+ *
+ * Returns 0 after the final store, 1 when *@v differs from @expect (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there, and label 4 is the abort IP recorded in the
+ * descriptor. With RSEQ_COMPARE_TWICE both checks are repeated and a
+ * mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ /* cpu_id mismatch goes to the abort handler (label 4) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ /* r0 is overwritten by RSEQ_ASM_STORE_RSEQ_CS */
+ : "memory", "cc", "r0"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare *@v against @expectnot. When it does _not_ match, store the value
+ * read from *@v into *@load, then read the long located at (that value +
+ * @voffp) and store it into *@v as the final store.
+ *
+ * Returns 0 after the final store, 1 when *@v equals @expectnot (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* r1 = *v; fail the comparison when it equals @expectnot */
+ LONG_L " %%r1, %[v]\n\t"
+ LONG_CMP_R " %%r1, %[expectnot]\n\t"
+ "je %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_L " %%r1, %[v]\n\t"
+ LONG_CMP_R " %%r1, %[expectnot]\n\t"
+ "je %l[error2]\n\t"
+#endif
+ /* *load = r1; r1 = *(r1 + voffp) */
+ LONG_S " %%r1, %[load]\n\t"
+ LONG_ADD_R " %%r1, %[voffp]\n\t"
+ LONG_L " %%r1, 0(%%r1)\n\t"
+ /* final store */
+ LONG_S " %%r1, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "r" (voffp),
+ [load] "m" (*load)
+ RSEQ_INJECT_INPUT
+ /* r0: RSEQ_ASM_STORE_RSEQ_CS; r1: working register above */
+ : "memory", "cc", "r0", "r1"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Add @count to *@v: the value is loaded into %r0, incremented, and written
+ * back as the final store of the critical section.
+ *
+ * Returns 0 after the final store and -1 when the abort handler at label 4
+ * runs; the cpu_id check against @cpu branches there. With
+ * RSEQ_COMPARE_TWICE the cpu_id check is repeated and a mismatch on the
+ * second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+#endif
+ /* r0 = *v + count */
+ LONG_L " %%r0, %[v]\n\t"
+ LONG_ADD_R " %%r0, %[count]\n\t"
+ /* final store */
+ LONG_S " %%r0, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "r" (count)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "r0"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+/*
+ * Compare *@v against @expect and, when they are equal, first store @newv2
+ * into *@v2 (the "try store") and then store @newv into *@v as the final
+ * store of the critical section.
+ *
+ * Returns 0 after the final store, 1 when *@v differs from @expect (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* try store */
+ LONG_S " %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "r0"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * s390 is TSO: the release flavour adds no barrier of its own and simply
+ * forwards every argument to rseq_cmpeqv_trystorev_storev(), returning its
+ * result unchanged.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ int ret;
+
+ ret = rseq_cmpeqv_trystorev_storev(v, expect, v2, newv2, newv, cpu);
+ return ret;
+}
+
+/*
+ * Compare *@v against @expect and *@v2 against @expect2; when both are
+ * equal, store @newv into *@v as the final store of the critical section.
+ *
+ * Returns 0 after the final store, 1 when either comparison fails (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE all three checks are repeated
+ * and a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+ /* cmp @v2 equal to @expect2 */
+ LONG_CMP " %[expect2], %[v2]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, %l[error1])
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz %l[error2]\n\t"
+ LONG_CMP " %[expect2], %[v2]\n\t"
+ "jnz %l[error3]\n\t"
+#endif
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ RSEQ_INJECT_INPUT
+ : "memory", "cc", "r0"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare *@v against @expect and, when they are equal, copy @len bytes from
+ * @src to @dst one byte at a time (the "try memcpy"), then store @newv into
+ * *@v as the final store of the critical section.
+ *
+ * The copy loop advances the registers holding @src, @dst and @len, which
+ * are declared as inputs. Their initial values are therefore saved into
+ * rseq_scratch[] before the critical section starts and reloaded on every
+ * way out: after the final store and in each failure handler's teardown.
+ *
+ * Returns 0 after the final store, 1 when *@v differs from @expect (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint64_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* save the registers the copy loop modifies */
+ LONG_S " %[src], %[rseq_scratch0]\n\t"
+ LONG_S " %[dst], %[rseq_scratch1]\n\t"
+ LONG_S " %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, rseq_cs)
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect; label 5 restores registers, then cmpfail */
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, 6f)
+ LONG_CMP " %[expect], %[v]\n\t"
+ "jnz 7f\n\t"
+#endif
+ /* try memcpy */
+ /* skip the loop entirely when len is zero */
+ LONG_LT_R " %[len], %[len]\n\t"
+ "jz 333f\n\t"
+ "222:\n\t"
+ "ic %%r0,0(%[src])\n\t"
+ "stc %%r0,0(%[dst])\n\t"
+ LONG_ADDI " %[src], 1\n\t"
+ LONG_ADDI " %[dst], 1\n\t"
+ LONG_ADDI " %[len], -1\n\t"
+ "jnz 222b\n\t"
+ "333:\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ LONG_S " %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ abort)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ LONG_L " %[len], %[rseq_scratch2]\n\t"
+ LONG_L " %[dst], %[rseq_scratch1]\n\t"
+ LONG_L " %[src], %[rseq_scratch0]\n\t",
+ error2)
+#endif
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [current_cpu_id] "m" (rseq_get_abi()->cpu_id),
+ [rseq_cs] "m" (rseq_get_abi()->rseq_cs.arch.ptr),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ RSEQ_INJECT_INPUT
+ /* r0: RSEQ_ASM_STORE_RSEQ_CS and the byte moved by the copy loop */
+ : "memory", "cc", "r0"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * s390 is TSO: the release flavour adds no barrier of its own and simply
+ * forwards every argument to rseq_cmpeqv_trymemcpy_storev(), returning its
+ * result unchanged.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ int ret;
+
+ ret = rseq_cmpeqv_trymemcpy_storev(v, expect, dst, src, len, newv, cpu);
+ return ret;
+}
+#endif /* !RSEQ_SKIP_FASTPATH */
diff --git a/tools/testing/selftests/rseq/rseq-skip.h b/tools/testing/selftests/rseq/rseq-skip.h
new file mode 100644
index 000000000..7b53dac1f
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-skip.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-skip.h
+ *
+ * (C) Copyright 2017-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared or stored, all arguments are ignored, and the result
+ * is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared, loaded or stored, all arguments are ignored, and the
+ * result is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ (void) v;
+ (void) expectnot;
+ (void) voffp;
+ (void) load;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is added, all arguments are ignored, and the result is always -1,
+ * the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ (void) v;
+ (void) count;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared or stored, all arguments are ignored, and the result
+ * is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) v2;
+ (void) newv2;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared or stored, all arguments are ignored, and the result
+ * is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) v2;
+ (void) newv2;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared or stored, all arguments are ignored, and the result
+ * is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) v2;
+ (void) expect2;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared, copied or stored, all arguments are ignored, and the
+ * result is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) dst;
+ (void) src;
+ (void) len;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
+
+/*
+ * Stub used when an arch header includes rseq-skip.h (RSEQ_SKIP_FASTPATH):
+ * nothing is compared, copied or stored, all arguments are ignored, and the
+ * result is always -1, the value the asm fast paths return on abort.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ (void) v;
+ (void) expect;
+ (void) dst;
+ (void) src;
+ (void) len;
+ (void) newv;
+ (void) cpu;
+ return -1;
+}
diff --git a/tools/testing/selftests/rseq/rseq-thread-pointer.h b/tools/testing/selftests/rseq/rseq-thread-pointer.h
new file mode 100644
index 000000000..977c25d75
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-thread-pointer.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: LGPL-2.1-only OR MIT */
+/*
+ * rseq-thread-pointer.h
+ *
+ * (C) Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef _RSEQ_THREAD_POINTER
+#define _RSEQ_THREAD_POINTER
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "rseq-x86-thread-pointer.h"
+#elif defined(__PPC__)
+#include "rseq-ppc-thread-pointer.h"
+#else
+#include "rseq-generic-thread-pointer.h"
+#endif
+
+#endif
diff --git a/tools/testing/selftests/rseq/rseq-x86-thread-pointer.h b/tools/testing/selftests/rseq/rseq-x86-thread-pointer.h
new file mode 100644
index 000000000..d3133587d
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-x86-thread-pointer.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: LGPL-2.1-only OR MIT */
+/*
+ * rseq-x86-thread-pointer.h
+ *
+ * (C) Copyright 2021 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef _RSEQ_X86_THREAD_POINTER
+#define _RSEQ_X86_THREAD_POINTER
+
+#include <features.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if __GNUC_PREREQ (11, 1)
+/* Compiler passes the 11.1 version check: use the thread-pointer builtin. */
+static inline void *rseq_thread_pointer(void)
+{
+ void *tp = __builtin_thread_pointer();
+
+ return tp;
+}
+#else
+/*
+ * Otherwise read the word at offset 0 of the %fs segment (x86-64) or of the
+ * %gs segment (other x86 builds) and return it.
+ */
+static inline void *rseq_thread_pointer(void)
+{
+ void *tp;
+
+# ifdef __x86_64__
+ __asm__ ("mov %%fs:0, %0" : "=r" (tp));
+# else
+ __asm__ ("mov %%gs:0, %0" : "=r" (tp));
+# endif
+ return tp;
+}
+#endif /* !GCC 11 */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/tools/testing/selftests/rseq/rseq-x86.h b/tools/testing/selftests/rseq/rseq-x86.h
new file mode 100644
index 000000000..bd01dc41c
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq-x86.h
@@ -0,0 +1,1365 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq-x86.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#include <stdint.h>
+
+/*
+ * RSEQ_SIG is used with the following reserved undefined instructions, which
+ * trap in user-space:
+ *
+ * x86-32: 0f b9 3d 53 30 05 53 ud1 0x53053053,%edi
+ * x86-64: 0f b9 3d 53 30 05 53 ud1 0x53053053(%rip),%edi
+ *
+ * RSEQ_ASM_DEFINE_ABORT() emits the opcode bytes followed by this value
+ * immediately before every abort label.
+ */
+#define RSEQ_SIG 0x53053053
+
+/*
+ * Due to a compiler optimization bug in gcc-8 with asm goto and TLS asm input
+ * operands, we cannot use "m" input operands, and rather pass the __rseq_abi
+ * address through a "r" input operand.
+ */
+
+/*
+ * Offset of cpu_id and rseq_cs fields in struct rseq. The asm sequences add
+ * these displacements to the rseq_offset register operand, addressed through
+ * the RSEQ_ASM_TP_SEGMENT segment.
+ */
+#define RSEQ_CPU_ID_OFFSET 4
+#define RSEQ_CS_OFFSET 8
+
+#ifdef __x86_64__
+
+/* Segment register used as prefix when the asm sequences address the rseq area. */
+#define RSEQ_ASM_TP_SEGMENT %%fs
+
+/*
+ * Full memory barrier: a locked add of zero to a stack slot below %rsp, with
+ * "memory" and "cc" clobbers. The read and write barriers only call
+ * rseq_barrier() (defined elsewhere).
+ */
+#define rseq_smp_mb() \
+ __asm__ __volatile__ ("lock; addl $0,-128(%%rsp)" ::: "memory", "cc")
+#define rseq_smp_rmb() rseq_barrier()
+#define rseq_smp_wmb() rseq_barrier()
+
+/*
+ * Load-acquire: read *p once through RSEQ_READ_ONCE(), then call
+ * rseq_barrier() before yielding the loaded value.
+ */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \
+ rseq_barrier(); \
+ ____p1; \
+})
+
+/* Acquire ordering after a control dependency: same as rseq_smp_rmb(). */
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+/*
+ * Store-release: call rseq_barrier(), then write v to *p once through
+ * RSEQ_WRITE_ONCE().
+ */
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_barrier(); \
+ RSEQ_WRITE_ONCE(*p, v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+/*
+ * Emit one critical-section descriptor at @label in section __rseq_cs,
+ * aligned on 32 bytes: two .long fields (@version, @flags) followed by three
+ * .quad fields (@start_ip, @post_commit_offset, @abort_ip). The address of
+ * the descriptor is additionally recorded in section __rseq_cs_ptr_array.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(post_commit_offset) ", " __rseq_str(abort_ip) "\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(label) "b\n\t" \
+ ".popsection\n\t"
+
+
+/*
+ * Emit a descriptor with version 0 and flags 0. The post-commit field is
+ * expressed as the offset (@post_commit_ip - @start_ip).
+ */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ *
+ * Each entry is a (@start_ip, @exit_ip) pair of .quad values placed in
+ * section __rseq_exit_point_array.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".quad " __rseq_str(start_ip) ", " __rseq_str(exit_ip) "\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Load the %rip-relative address of descriptor @cs_label into %rax, store it
+ * to the location given by the @rseq_cs operand text, then define @label
+ * right after the store. %rax is overwritten, so users list "rax" as a
+ * clobber.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ "leaq " __rseq_str(cs_label) "(%%rip), %%rax\n\t" \
+ "movq %%rax, " __rseq_str(rseq_cs) "\n\t" \
+ __rseq_str(label) ":\n\t"
+
+/*
+ * Compare the operand named @cpu_id with the location given by the
+ * @current_cpu_id operand text using cmpl and branch to @label when they
+ * differ.
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ "cmpl %[" __rseq_str(cpu_id) "], " __rseq_str(current_cpu_id) "\n\t" \
+ "jnz " __rseq_str(label) "\n\t"
+
+/*
+ * Abort handler, placed in section __rseq_failure: the signature bytes, then
+ * @label, then the caller-supplied @teardown instructions, then a jump to
+ * the C label @abort_label.
+ */
+#define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ /* Disassembler-friendly signature: ud1 <sig>(%rip),%edi. */ \
+ ".byte 0x0f, 0xb9, 0x3d\n\t" \
+ ".long " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jmp %l[" __rseq_str(abort_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Comparison-failure handler, placed in section __rseq_failure: @label, the
+ * caller-supplied @teardown instructions, then a jump to the C label
+ * @cmpfail_label. Unlike the abort handler, no signature is emitted.
+ */
+#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jmp %l[" __rseq_str(cmpfail_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Compare *@v against @expect and, when they are equal, store @newv into *@v
+ * as the final store of the critical section.
+ *
+ * Returns 0 after the final store, 1 when *@v differs from @expect (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there, and label 4 is the abort IP recorded in the
+ * descriptor. With RSEQ_COMPARE_TWICE both checks are repeated and a
+ * mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ /* cpu_id mismatch goes to the abort handler (label 4) */
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* final store */
+ "movq %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ /* rax is overwritten by RSEQ_ASM_STORE_RSEQ_CS */
+ : "memory", "cc", "rax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare *@v against @expectnot. When it does _not_ match, store the value
+ * read from *@v into *@load, then read the quadword located at (that value +
+ * @voffp) and store it into *@v as the final store.
+ *
+ * Returns 0 after the final store, 1 when *@v equals @expectnot (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* rbx = *v; fail the comparison when it equals @expectnot */
+ "movq %[v], %%rbx\n\t"
+ "cmpq %%rbx, %[expectnot]\n\t"
+ "je %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "movq %[v], %%rbx\n\t"
+ "cmpq %%rbx, %[expectnot]\n\t"
+ "je %l[error2]\n\t"
+#endif
+ /* *load = rbx; rbx = *(rbx + voffp) */
+ "movq %%rbx, %[load]\n\t"
+ "addq %[voffp], %%rbx\n\t"
+ "movq (%%rbx), %%rbx\n\t"
+ /* final store */
+ "movq %%rbx, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "er" (voffp),
+ [load] "m" (*load)
+ /* rax: RSEQ_ASM_STORE_RSEQ_CS; rbx: working register above */
+ : "memory", "cc", "rax", "rbx"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Add @count to *@v with a single addq on the memory operand, which is the
+ * final store of the critical section.
+ *
+ * Returns 0 after the final store and -1 when the abort handler at label 4
+ * runs; the cpu_id check against @cpu branches there. With
+ * RSEQ_COMPARE_TWICE the cpu_id check is repeated and a mismatch on the
+ * second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+ /* final store */
+ "addq %[count], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "er" (count)
+ : "memory", "cc", "rax"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+#define RSEQ_ARCH_HAS_OFFSET_DEREF_ADDV
+
+/*
+ * pval = *(*ptr + off)
+ * *pval += inc;
+ *
+ * The value stored at *@ptr is loaded, @off is added to it, the quadword at
+ * the resulting address is loaded as pval, and @inc is added to the quadword
+ * pval points to. That last addq is the final store of the critical section.
+ *
+ * Returns 0 after the final store and -1 when the abort handler at label 4
+ * runs; the cpu_id check against @cpu branches there. With
+ * RSEQ_COMPARE_TWICE the cpu_id check is repeated and a mismatch on the
+ * second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_offset_deref_addv(intptr_t *ptr, long off, intptr_t inc, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+ /* get p+v */
+ "movq %[ptr], %%rbx\n\t"
+ "addq %[off], %%rbx\n\t"
+ /* get pv */
+ "movq (%%rbx), %%rcx\n\t"
+ /* *pv += inc */
+ "addq %[inc], (%%rcx)\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [ptr] "m" (*ptr),
+ [off] "er" (off),
+ [inc] "er" (inc)
+ : "memory", "cc", "rax", "rbx", "rcx"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ /*
+ * Every other asm goto user in this header calls rseq_after_asm_goto()
+ * after the asm statement and at each of its target labels; do the
+ * same here.
+ */
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+/*
+ * Compare *@v against @expect and, when they are equal, first store @newv2
+ * into *@v2 (the "try store") and then store @newv into *@v as the final
+ * store of the critical section.
+ *
+ * Returns 0 after the final store, 1 when *@v differs from @expect (cmpfail),
+ * and -1 when the abort handler at label 4 runs; the cpu_id check against
+ * @cpu branches there. With RSEQ_COMPARE_TWICE both checks are repeated and
+ * a mismatch on the second pass ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* cmp @v equal to @expect */
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* try store */
+ "movq %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ "movq %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ : "memory", "cc", "rax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * x86-64 is TSO: the release variant forwards directly to
+ * rseq_cmpeqv_trystorev_storev() and returns its result unchanged.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ return rseq_cmpeqv_trystorev_storev(v, expect, v2, newv2, newv, cpu);
+}
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect and *@v2 equals
+ * @expect2, store @newv into *@v (the final store).
+ *
+ * Returns 0 when the sequence committed, 1 when either comparison failed
+ * (cmpfail) and -1 when it left through the abort label. With
+ * RSEQ_COMPARE_TWICE the cpu_id and both value checks are repeated and a
+ * mismatch on a repeated check ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* first comparison: *v against expect */
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+ /* second comparison: *v2 against expect2 */
+ "cmpq %[v2], %[expect2]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpq %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+ "cmpq %[v2], %[expect2]\n\t"
+ "jnz %l[error3]\n\t"
+#endif
+ /* final store */
+ "movq %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ : "memory", "cc", "rax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect, copy @len
+ * bytes from @src to @dst one byte at a time (the "try memcpy") and then
+ * store @newv into *@v (the final store).
+ *
+ * The copy loop modifies the registers holding @src, @dst and @len, so their
+ * initial values are saved in rseq_scratch[] before the sequence starts and
+ * reloaded on every exit path (commit, abort, cmpfail and the
+ * RSEQ_COMPARE_TWICE error paths).
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint64_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* save the registers the copy loop is going to modify */
+ "movq %[src], %[rseq_scratch0]\n\t"
+ "movq %[dst], %[rseq_scratch1]\n\t"
+ "movq %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ "cmpq %[v], %[expect]\n\t"
+ "jnz 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
+ "cmpq %[v], %[expect]\n\t"
+ "jnz 7f\n\t"
+#endif
+ /* try memcpy */
+ "test %[len], %[len]\n\t" \
+ "jz 333f\n\t" \
+ "222:\n\t" \
+ "movb (%[src]), %%al\n\t" \
+ "movb %%al, (%[dst])\n\t" \
+ "inc %[src]\n\t" \
+ "inc %[dst]\n\t" \
+ "dec %[len]\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ "movq %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ "movq %[rseq_scratch2], %[len]\n\t"
+ "movq %[rseq_scratch1], %[dst]\n\t"
+ "movq %[rseq_scratch0], %[src]\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ "movq %[rseq_scratch2], %[len]\n\t"
+ "movq %[rseq_scratch1], %[dst]\n\t"
+ "movq %[rseq_scratch0], %[src]\n\t",
+ abort)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ "movq %[rseq_scratch2], %[len]\n\t"
+ "movq %[rseq_scratch1], %[dst]\n\t"
+ "movq %[rseq_scratch0], %[src]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ "movq %[rseq_scratch2], %[len]\n\t"
+ "movq %[rseq_scratch1], %[dst]\n\t"
+ "movq %[rseq_scratch0], %[src]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ "movq %[rseq_scratch2], %[len]\n\t"
+ "movq %[rseq_scratch1], %[dst]\n\t"
+ "movq %[rseq_scratch0], %[src]\n\t",
+ error2)
+#endif
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ : "memory", "cc", "rax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * x86-64 is TSO: the release variant forwards directly to
+ * rseq_cmpeqv_trymemcpy_storev() and returns its result unchanged.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ return rseq_cmpeqv_trymemcpy_storev(v, expect, dst, src, len,
+ newv, cpu);
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
+
+#elif defined(__i386__)
+
+/* Segment register prefixed to the rseq area operands in the asm below. */
+#define RSEQ_ASM_TP_SEGMENT %%gs
+
+/*
+ * The full, read and write barriers all expand to the same locked add of
+ * zero to a stack slot below %esp, with "memory" and "cc" clobbers.
+ */
+#define rseq_smp_mb() \
+ __asm__ __volatile__ ("lock; addl $0,-128(%%esp)" ::: "memory", "cc")
+#define rseq_smp_rmb() \
+ __asm__ __volatile__ ("lock; addl $0,-128(%%esp)" ::: "memory", "cc")
+#define rseq_smp_wmb() \
+ __asm__ __volatile__ ("lock; addl $0,-128(%%esp)" ::: "memory", "cc")
+
+/* Read *p once, then issue a full barrier; evaluates to the value read. */
+#define rseq_smp_load_acquire(p) \
+__extension__ ({ \
+ __typeof(*p) ____p1 = RSEQ_READ_ONCE(*p); \
+ rseq_smp_mb(); \
+ ____p1; \
+})
+
+#define rseq_smp_acquire__after_ctrl_dep() rseq_smp_rmb()
+
+/* Issue a full barrier, then write v to *p once. */
+#define rseq_smp_store_release(p, v) \
+do { \
+ rseq_smp_mb(); \
+ RSEQ_WRITE_ONCE(*p, v); \
+} while (0)
+
+#ifdef RSEQ_SKIP_FASTPATH
+#include "rseq-skip.h"
+#else /* !RSEQ_SKIP_FASTPATH */
+
+/*
+ * Use eax as scratch register and take memory operands as input to
+ * lessen register pressure. Especially needed when compiling in O0.
+ */
+/*
+ * Emit a critical section descriptor at @label in the __rseq_cs section,
+ * aligned on 32 bytes: @version and @flags, followed by @start_ip,
+ * @post_commit_offset and @abort_ip, each written as a .long followed by a
+ * zero .long. The descriptor's address is also appended to the
+ * __rseq_cs_ptr_array section.
+ */
+#define __RSEQ_ASM_DEFINE_TABLE(label, version, flags, \
+ start_ip, post_commit_offset, abort_ip) \
+ ".pushsection __rseq_cs, \"aw\"\n\t" \
+ ".balign 32\n\t" \
+ __rseq_str(label) ":\n\t" \
+ ".long " __rseq_str(version) ", " __rseq_str(flags) "\n\t" \
+ ".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(post_commit_offset) ", 0x0, " __rseq_str(abort_ip) ", 0x0\n\t" \
+ ".popsection\n\t" \
+ ".pushsection __rseq_cs_ptr_array, \"aw\"\n\t" \
+ ".long " __rseq_str(label) "b, 0x0\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Descriptor with version 0 and flags 0; the post-commit offset is computed
+ * as (post_commit_ip - start_ip).
+ */
+#define RSEQ_ASM_DEFINE_TABLE(label, start_ip, post_commit_ip, abort_ip) \
+ __RSEQ_ASM_DEFINE_TABLE(label, 0x0, 0x0, start_ip, \
+ (post_commit_ip - start_ip), abort_ip)
+
+/*
+ * Exit points of a rseq critical section consist of all instructions outside
+ * of the critical section where a critical section can either branch to or
+ * reach through the normal course of its execution. The abort IP and the
+ * post-commit IP are already part of the __rseq_cs section and should not be
+ * explicitly defined as additional exit points. Knowing all exit points is
+ * useful to assist debuggers stepping over the critical section.
+ */
+#define RSEQ_ASM_DEFINE_EXIT_POINT(start_ip, exit_ip) \
+ ".pushsection __rseq_exit_point_array, \"aw\"\n\t" \
+ ".long " __rseq_str(start_ip) ", 0x0, " __rseq_str(exit_ip) ", 0x0\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Store the address of descriptor @cs_label into the @rseq_cs operand, then
+ * define @label right after that store.
+ */
+#define RSEQ_ASM_STORE_RSEQ_CS(label, cs_label, rseq_cs) \
+ RSEQ_INJECT_ASM(1) \
+ "movl $" __rseq_str(cs_label) ", " __rseq_str(rseq_cs) "\n\t" \
+ __rseq_str(label) ":\n\t"
+
+/*
+ * Compare the @cpu_id asm operand with @current_cpu_id and jump to @label
+ * when they differ.
+ */
+#define RSEQ_ASM_CMP_CPU_ID(cpu_id, current_cpu_id, label) \
+ RSEQ_INJECT_ASM(2) \
+ "cmpl %[" __rseq_str(cpu_id) "], " __rseq_str(current_cpu_id) "\n\t" \
+ "jnz " __rseq_str(label) "\n\t"
+
+/*
+ * Emit the abort handler in the __rseq_failure section: the RSEQ_SIG
+ * signature bytes, then @label, the @teardown instructions and a jump to the
+ * C label @abort_label.
+ */
+#define RSEQ_ASM_DEFINE_ABORT(label, teardown, abort_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ /* Disassembler-friendly signature: ud1 <sig>,%edi. */ \
+ ".byte 0x0f, 0xb9, 0x3d\n\t" \
+ ".long " __rseq_str(RSEQ_SIG) "\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jmp %l[" __rseq_str(abort_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Emit a comparison-failure handler in the __rseq_failure section: @label,
+ * the @teardown instructions and a jump to the C label @cmpfail_label. No
+ * signature is emitted here.
+ */
+#define RSEQ_ASM_DEFINE_CMPFAIL(label, teardown, cmpfail_label) \
+ ".pushsection __rseq_failure, \"ax\"\n\t" \
+ __rseq_str(label) ":\n\t" \
+ teardown \
+ "jmp %l[" __rseq_str(cmpfail_label) "]\n\t" \
+ ".popsection\n\t"
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect, store @newv
+ * into *@v (the final store).
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label. With
+ * RSEQ_COMPARE_TWICE the cpu_id and value checks are repeated and a mismatch
+ * on the repeated check ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_storev(intptr_t *v, intptr_t expect, intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* leave through cmpfail unless *v == expect */
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* final store */
+ "movl %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Compare @v against @expectnot. When it does _not_ match, load @v
+ * into @load, and store the content of *@v + voffp into @v.
+ *
+ * Returns 0 when the sequence committed, 1 when *@v equals @expectnot
+ * (cmpfail) and -1 when it left through the abort label. ebx holds the value
+ * read from *@v throughout the sequence and is listed as clobbered.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpnev_storeoffp_load(intptr_t *v, intptr_t expectnot,
+ long voffp, intptr_t *load, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* ebx = *v; leave through cmpfail when it equals expectnot */
+ "movl %[v], %%ebx\n\t"
+ "cmpl %%ebx, %[expectnot]\n\t"
+ "je %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "movl %[v], %%ebx\n\t"
+ "cmpl %%ebx, %[expectnot]\n\t"
+ "je %l[error2]\n\t"
+#endif
+ /* *load = old *v, then ebx = word stored at (old *v + voffp) */
+ "movl %%ebx, %[load]\n\t"
+ "addl %[voffp], %%ebx\n\t"
+ "movl (%%ebx), %%ebx\n\t"
+ /* final store */
+ "movl %%ebx, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(5)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [expectnot] "r" (expectnot),
+ [voffp] "ir" (voffp),
+ [load] "m" (*load)
+ : "memory", "cc", "eax", "ebx"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Restartable sequence tied to @cpu: add @count to *@v with a single addl,
+ * which is the final store.
+ *
+ * Returns 0 when the sequence committed and -1 when it left through the
+ * abort label. With RSEQ_COMPARE_TWICE the cpu_id check is repeated and a
+ * mismatch on the repeated check ends in rseq_bug().
+ */
+static inline __attribute__((always_inline))
+int rseq_addv(intptr_t *v, intptr_t count, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+#endif
+ /* final store */
+ "addl %[count], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(4)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [count] "ir" (count)
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort
+#ifdef RSEQ_COMPARE_TWICE
+ , error1
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+#endif
+}
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect, store @newv2
+ * into *@v2 (the "try store") and then store @newv into *@v (the final
+ * store). @newv2 is a memory operand here and is moved through eax.
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* try store */
+ "movl %[newv2], %%eax\n\t"
+ "movl %%eax, %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* final store */
+ "movl %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "m" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "r" (newv)
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Release variant of the compare / try-store / final-store sequence: same
+ * steps, but a locked add (the instruction rseq_smp_mb() expands to) is
+ * issued between the try store to *@v2 and the final store to *@v. @expect
+ * is a memory operand here and is loaded into eax for the comparison.
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trystorev_storev_release(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t newv2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ "movl %[expect], %%eax\n\t"
+ "cmpl %[v], %%eax\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "movl %[expect], %%eax\n\t"
+ "cmpl %[v], %%eax\n\t"
+ "jnz %l[error2]\n\t"
+#endif
+ /* try store */
+ "movl %[newv2], %[v2]\n\t"
+ RSEQ_INJECT_ASM(5)
+ /* barrier between the try store and the final store */
+ "lock; addl $0,-128(%%esp)\n\t"
+ /* final store */
+ "movl %[newv], %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* try store input */
+ [v2] "m" (*v2),
+ [newv2] "r" (newv2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "m" (expect),
+ [newv] "r" (newv)
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+
+}
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect and *@v2 equals
+ * @expect2, store @newv into *@v (the final store). @newv is a memory
+ * operand here and is moved through eax.
+ *
+ * Returns 0 when the sequence committed, 1 when either comparison failed
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_cmpeqv_storev(intptr_t *v, intptr_t expect,
+ intptr_t *v2, intptr_t expect2,
+ intptr_t newv, int cpu)
+{
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error3])
+#endif
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ /* first comparison: *v against expect */
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(4)
+ /* second comparison: *v2 against expect2 */
+ "cmpl %[expect2], %[v2]\n\t"
+ "jnz %l[cmpfail]\n\t"
+ RSEQ_INJECT_ASM(5)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), %l[error1])
+ "cmpl %[v], %[expect]\n\t"
+ "jnz %l[error2]\n\t"
+ "cmpl %[expect2], %[v2]\n\t"
+ "jnz %l[error3]\n\t"
+#endif
+ "movl %[newv], %%eax\n\t"
+ /* final store */
+ "movl %%eax, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ RSEQ_ASM_DEFINE_ABORT(4, "", abort)
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* cmp2 input */
+ [v2] "m" (*v2),
+ [expect2] "r" (expect2),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "r" (expect),
+ [newv] "m" (newv)
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2, error3
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("1st expected value comparison failed");
+error3:
+ rseq_after_asm_goto();
+ rseq_bug("2nd expected value comparison failed");
+#endif
+}
+
+/*
+ * Restartable sequence tied to @cpu: when *@v equals @expect, copy @len
+ * bytes from @src to @dst one byte at a time (the "try memcpy") and then
+ * store @newv into *@v (the final store).
+ *
+ * The copy loop modifies the registers holding @src, @dst and @len, so their
+ * initial values are saved in rseq_scratch[] before the sequence starts and
+ * reloaded on every exit path. @expect and @newv are memory operands and go
+ * through eax.
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+/* TODO: implement a faster memcpy. */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint32_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* save the registers the copy loop is going to modify */
+ "movl %[src], %[rseq_scratch0]\n\t"
+ "movl %[dst], %[rseq_scratch1]\n\t"
+ "movl %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ "movl %[expect], %%eax\n\t"
+ "cmpl %%eax, %[v]\n\t"
+ "jnz 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
+ "movl %[expect], %%eax\n\t"
+ "cmpl %%eax, %[v]\n\t"
+ "jnz 7f\n\t"
+#endif
+ /* try memcpy */
+ "test %[len], %[len]\n\t" \
+ "jz 333f\n\t" \
+ "222:\n\t" \
+ "movb (%[src]), %%al\n\t" \
+ "movb %%al, (%[dst])\n\t" \
+ "inc %[src]\n\t" \
+ "inc %[dst]\n\t" \
+ "dec %[len]\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ "movl %[newv], %%eax\n\t"
+ /* final store */
+ "movl %%eax, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ abort)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ error2)
+#endif
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "m" (expect),
+ [newv] "m" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+/*
+ * Release variant of the compare / try-memcpy / final-store sequence: same
+ * steps, but a locked add (the instruction rseq_smp_mb() expands to) is
+ * issued after the byte copy and before the final store to *@v.
+ *
+ * The registers holding @src, @dst and @len are saved in rseq_scratch[]
+ * before the sequence starts and reloaded on every exit path.
+ *
+ * Returns 0 when the sequence committed, 1 when *@v did not match @expect
+ * (cmpfail) and -1 when it left through the abort label.
+ */
+/* TODO: implement a faster memcpy. */
+static inline __attribute__((always_inline))
+int rseq_cmpeqv_trymemcpy_storev_release(intptr_t *v, intptr_t expect,
+ void *dst, void *src, size_t len,
+ intptr_t newv, int cpu)
+{
+ uint32_t rseq_scratch[3];
+
+ RSEQ_INJECT_C(9)
+
+ __asm__ __volatile__ goto (
+ RSEQ_ASM_DEFINE_TABLE(3, 1f, 2f, 4f) /* start, commit, abort */
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[cmpfail])
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error1])
+ RSEQ_ASM_DEFINE_EXIT_POINT(1f, %l[error2])
+#endif
+ /* save the registers the copy loop is going to modify */
+ "movl %[src], %[rseq_scratch0]\n\t"
+ "movl %[dst], %[rseq_scratch1]\n\t"
+ "movl %[len], %[rseq_scratch2]\n\t"
+ /* Start rseq by storing table entry pointer into rseq_cs. */
+ RSEQ_ASM_STORE_RSEQ_CS(1, 3b, RSEQ_ASM_TP_SEGMENT:RSEQ_CS_OFFSET(%[rseq_offset]))
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 4f)
+ RSEQ_INJECT_ASM(3)
+ "movl %[expect], %%eax\n\t"
+ "cmpl %%eax, %[v]\n\t"
+ "jnz 5f\n\t"
+ RSEQ_INJECT_ASM(4)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_CMP_CPU_ID(cpu_id, RSEQ_ASM_TP_SEGMENT:RSEQ_CPU_ID_OFFSET(%[rseq_offset]), 6f)
+ "movl %[expect], %%eax\n\t"
+ "cmpl %%eax, %[v]\n\t"
+ "jnz 7f\n\t"
+#endif
+ /* try memcpy */
+ "test %[len], %[len]\n\t" \
+ "jz 333f\n\t" \
+ "222:\n\t" \
+ "movb (%[src]), %%al\n\t" \
+ "movb %%al, (%[dst])\n\t" \
+ "inc %[src]\n\t" \
+ "inc %[dst]\n\t" \
+ "dec %[len]\n\t" \
+ "jnz 222b\n\t" \
+ "333:\n\t" \
+ RSEQ_INJECT_ASM(5)
+ /* barrier between the copy and the final store */
+ "lock; addl $0,-128(%%esp)\n\t"
+ "movl %[newv], %%eax\n\t"
+ /* final store */
+ "movl %%eax, %[v]\n\t"
+ "2:\n\t"
+ RSEQ_INJECT_ASM(6)
+ /* teardown */
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t"
+ RSEQ_ASM_DEFINE_ABORT(4,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ abort)
+ RSEQ_ASM_DEFINE_CMPFAIL(5,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ cmpfail)
+#ifdef RSEQ_COMPARE_TWICE
+ RSEQ_ASM_DEFINE_CMPFAIL(6,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ error1)
+ RSEQ_ASM_DEFINE_CMPFAIL(7,
+ "movl %[rseq_scratch2], %[len]\n\t"
+ "movl %[rseq_scratch1], %[dst]\n\t"
+ "movl %[rseq_scratch0], %[src]\n\t",
+ error2)
+#endif
+ : /* gcc asm goto does not allow outputs */
+ : [cpu_id] "r" (cpu),
+ [rseq_offset] "r" (rseq_offset),
+ /* final store input */
+ [v] "m" (*v),
+ [expect] "m" (expect),
+ [newv] "m" (newv),
+ /* try memcpy input */
+ [dst] "r" (dst),
+ [src] "r" (src),
+ [len] "r" (len),
+ [rseq_scratch0] "m" (rseq_scratch[0]),
+ [rseq_scratch1] "m" (rseq_scratch[1]),
+ [rseq_scratch2] "m" (rseq_scratch[2])
+ : "memory", "cc", "eax"
+ RSEQ_INJECT_CLOBBER
+ : abort, cmpfail
+#ifdef RSEQ_COMPARE_TWICE
+ , error1, error2
+#endif
+ );
+ rseq_after_asm_goto();
+ return 0;
+abort:
+ rseq_after_asm_goto();
+ RSEQ_INJECT_FAILED
+ return -1;
+cmpfail:
+ rseq_after_asm_goto();
+ return 1;
+#ifdef RSEQ_COMPARE_TWICE
+error1:
+ rseq_after_asm_goto();
+ rseq_bug("cpu_id comparison failed");
+error2:
+ rseq_after_asm_goto();
+ rseq_bug("expected value comparison failed");
+#endif
+}
+
+#endif /* !RSEQ_SKIP_FASTPATH */
+
+#endif
diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c
new file mode 100644
index 000000000..e20191fb4
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: LGPL-2.1
+/*
+ * rseq.c
+ *
+ * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; only
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <assert.h>
+#include <signal.h>
+#include <limits.h>
+#include <dlfcn.h>
+#include <stddef.h>
+
+#include <linux/compiler.h>
+
+#include "../kselftest.h"
+#include "rseq.h"
+
+/*
+ * Define weak versions to play nice with binaries that are statically linked
+ * against a libc that doesn't support registering its own rseq.
+ */
+__weak ptrdiff_t __rseq_offset;
+__weak unsigned int __rseq_size;
+__weak unsigned int __rseq_flags;
+
+/*
+ * Pointers through which the libc values are read. They start out pointing
+ * at the symbols above and may be re-pointed by rseq_init() via dlsym().
+ */
+static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
+static const unsigned int *libc_rseq_size_p = &__rseq_size;
+static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
+
+/* Offset from the thread pointer to the rseq area. */
+ptrdiff_t rseq_offset;
+
+/* Size of the registered rseq area. 0 if the registration was
+ unsuccessful. */
+unsigned int rseq_size = -1U;
+
+/* Flags used during rseq registration. */
+unsigned int rseq_flags;
+
+/*
+ * Set to 1 by rseq_init() when this file, rather than libc, provides the
+ * rseq area; cleared again by rseq_exit().
+ */
+static int rseq_ownership;
+
+/* Per-thread rseq area used when this file owns the registration. */
+static
+__thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"))) = {
+ .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
+};
+
+/* Issue the raw rseq system call through syscall() and hand back its result. */
+static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
+ int flags, uint32_t sig)
+{
+ long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
+
+ return ret;
+}
+
+/*
+ * Probe for the rseq system call by invoking it with all-zero arguments.
+ * Returns 0 when the call fails with ENOSYS and 1 when it fails with EINVAL;
+ * any other outcome (including success) aborts the process.
+ */
+int rseq_available(void)
+{
+ if (sys_rseq(NULL, 0, 0, 0) != -1)
+ abort();
+ if (errno == ENOSYS)
+ return 0;
+ if (errno == EINVAL)
+ return 1;
+ abort();
+}
+
+/*
+ * Register the calling thread's __rseq_abi area with the kernel.
+ * Returns 0 on success (or when this file does not own the registration)
+ * and -1 when the system call reports a failure.
+ */
+int rseq_register_current_thread(void)
+{
+ /* Treat libc's ownership as a successful registration. */
+ if (!rseq_ownership)
+ return 0;
+ if (sys_rseq(&__rseq_abi, sizeof(struct rseq_abi), 0, RSEQ_SIG) != 0)
+ return -1;
+ assert(rseq_current_cpu_raw() >= 0);
+ return 0;
+}
+
+/*
+ * Unregister the calling thread's __rseq_abi area.
+ * Returns 0 on success (or when this file does not own the registration)
+ * and -1 when the system call reports a failure.
+ */
+int rseq_unregister_current_thread(void)
+{
+ /* Treat libc's ownership as a successful unregistration. */
+ if (!rseq_ownership)
+ return 0;
+ return sys_rseq(&__rseq_abi, sizeof(struct rseq_abi),
+ RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG) ? -1 : 0;
+}
+
+/*
+ * Constructor: decide whether libc or this file provides the rseq area and
+ * publish rseq_offset, rseq_size and rseq_flags accordingly.
+ *
+ * - When libc exposes a non-zero __rseq_size, its offset/size/flags are
+ *   copied and rseq_ownership stays 0.
+ * - Otherwise, when the kernel supports rseq, this file takes ownership and
+ *   uses its own __rseq_abi TLS area.
+ * - Otherwise rseq_size is set to 0, matching its documented meaning of an
+ *   unsuccessful registration.
+ */
+static __attribute__((constructor))
+void rseq_init(void)
+{
+ /*
+ * If the libc's registered rseq size isn't already valid, it may be
+ * because the binary is dynamically linked and not necessarily due to
+ * libc not having registered a restartable sequence. Try to find the
+ * symbols if that's the case.
+ */
+ if (!*libc_rseq_size_p) {
+ libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
+ libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
+ libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
+ }
+ if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
+ *libc_rseq_size_p != 0) {
+ /* rseq registration owned by glibc */
+ rseq_offset = *libc_rseq_offset_p;
+ rseq_size = *libc_rseq_size_p;
+ rseq_flags = *libc_rseq_flags_p;
+ return;
+ }
+ if (!rseq_available()) {
+ /* Neither libc nor the kernel provides rseq: report size 0. */
+ rseq_size = 0;
+ return;
+ }
+ rseq_ownership = 1;
+ rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
+ rseq_size = sizeof(struct rseq_abi);
+ rseq_flags = 0;
+}
+
+/*
+ * Destructor: when this file owns the rseq registration, reset the published
+ * offset and size to their initial values and drop ownership.
+ */
+static __attribute__((destructor))
+void rseq_exit(void)
+{
+ if (rseq_ownership) {
+ rseq_offset = 0;
+ rseq_size = -1U;
+ rseq_ownership = 0;
+ }
+}
+
+/*
+ * Return the current CPU number as reported by sched_getcpu(). A negative
+ * result is reported with perror() and aborts the process.
+ */
+int32_t rseq_fallback_current_cpu(void)
+{
+ int32_t cur = sched_getcpu();
+
+ if (cur >= 0)
+ return cur;
+ perror("sched_getcpu()");
+ abort();
+}
diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h
new file mode 100644
index 000000000..9d850b290
--- /dev/null
+++ b/tools/testing/selftests/rseq/rseq.h
@@ -0,0 +1,175 @@
+/* SPDX-License-Identifier: LGPL-2.1 OR MIT */
+/*
+ * rseq.h
+ *
+ * (C) Copyright 2016-2018 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ */
+
+#ifndef RSEQ_H
+#define RSEQ_H
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sched.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "rseq-abi.h"
+#include "compiler.h"
+
+/*
+ * Empty code injection macros, override when testing.
+ * It is important to consider that the ASM injection macros need to be
+ * fully reentrant (e.g. do not modify the stack).
+ */
+#ifndef RSEQ_INJECT_ASM
+#define RSEQ_INJECT_ASM(n)
+#endif
+
+#ifndef RSEQ_INJECT_C
+#define RSEQ_INJECT_C(n)
+#endif
+
+#ifndef RSEQ_INJECT_INPUT
+#define RSEQ_INJECT_INPUT
+#endif
+
+#ifndef RSEQ_INJECT_CLOBBER
+#define RSEQ_INJECT_CLOBBER
+#endif
+
+#ifndef RSEQ_INJECT_FAILED
+#define RSEQ_INJECT_FAILED
+#endif
+
+#include "rseq-thread-pointer.h"
+
+/* Offset from the thread pointer to the rseq area. */
+extern ptrdiff_t rseq_offset;
+/* Size of the registered rseq area. 0 if the registration was
+ unsuccessful. */
+extern unsigned int rseq_size;
+/* Flags used during rseq registration. */
+extern unsigned int rseq_flags;
+
+ /* Return the rseq area of the current thread: thread pointer + rseq_offset. */
+ static inline struct rseq_abi *rseq_get_abi(void)
+ {
+ 	uintptr_t tp = (uintptr_t) rseq_thread_pointer();
+
+ 	return (struct rseq_abi *) (tp + rseq_offset);
+ }
+
+#define rseq_likely(x) __builtin_expect(!!(x), 1)
+#define rseq_unlikely(x) __builtin_expect(!!(x), 0)
+#define rseq_barrier() __asm__ __volatile__("" : : : "memory")
+
+#define RSEQ_ACCESS_ONCE(x) (*(__volatile__ __typeof__(x) *)&(x))
+#define RSEQ_WRITE_ONCE(x, v) __extension__ ({ RSEQ_ACCESS_ONCE(x) = (v); })
+#define RSEQ_READ_ONCE(x) RSEQ_ACCESS_ONCE(x)
+
+#define __rseq_str_1(x) #x
+#define __rseq_str(x) __rseq_str_1(x)
+
+ /*
+  * Print a formatted message on stderr, followed by the calling function,
+  * file and line. The trailing ")" closes the "(in ..." location suffix.
+  */
+ #define rseq_log(fmt, args...) \
+ 	fprintf(stderr, fmt "(in %s() at " __FILE__ ":" __rseq_str(__LINE__) ")\n", \
+ 		## args, __func__)
+
+ /*
+  * Log a message through rseq_log() and then abort() the process.
+  * Wrapped in do { } while (0) so it behaves as a single statement.
+  */
+ #define rseq_bug(fmt, args...) \
+ do { \
+ rseq_log(fmt, ##args); \
+ abort(); \
+ } while (0)
+
+#if defined(__x86_64__) || defined(__i386__)
+#include <rseq-x86.h>
+#elif defined(__ARMEL__)
+#include <rseq-arm.h>
+#elif defined (__AARCH64EL__)
+#include <rseq-arm64.h>
+#elif defined(__PPC__)
+#include <rseq-ppc.h>
+#elif defined(__mips__)
+#include <rseq-mips.h>
+#elif defined(__s390__)
+#include <rseq-s390.h>
+#else
+#error unsupported target
+#endif
+
+/*
+ * Register rseq for the current thread. This needs to be called once
+ * by any thread which uses restartable sequences, before they start
+ * using restartable sequences, to ensure restartable sequences
+ * succeed. A restartable sequence executed from a non-registered
+ * thread will always fail.
+ */
+int rseq_register_current_thread(void);
+
+/*
+ * Unregister rseq for current thread.
+ */
+int rseq_unregister_current_thread(void);
+
+/*
+ * Restartable sequence fallback for reading the current CPU number.
+ */
+int32_t rseq_fallback_current_cpu(void);
+
+/*
+ * Values returned can be either the current CPU number, -1 (rseq is
+ * uninitialized), or -2 (rseq initialization has failed).
+ */
+ static inline int32_t rseq_current_cpu_raw(void)
+ {
+ 	struct rseq_abi *abi = rseq_get_abi();
+
+ 	/* Single volatile read of the cpu_id field. */
+ 	return RSEQ_ACCESS_ONCE(abi->cpu_id);
+ }
+
+/*
+ * Returns a possible CPU number, which is typically the current CPU.
+ * The returned CPU number can be used to prepare for an rseq critical
+ * section, which will confirm whether the cpu number is indeed the
+ * current one, and whether rseq is initialized.
+ *
+ * The CPU number returned by rseq_cpu_start should always be validated
+ * by passing it to a rseq asm sequence, or by comparing it to the
+ * return value of rseq_current_cpu_raw() if the rseq asm sequence
+ * does not need to be invoked.
+ */
+ static inline uint32_t rseq_cpu_start(void)
+ {
+ 	struct rseq_abi *abi = rseq_get_abi();
+
+ 	/* Single volatile read of the cpu_id_start field. */
+ 	return RSEQ_ACCESS_ONCE(abi->cpu_id_start);
+ }
+
+ /*
+  * Return the current CPU number: the value read by rseq_current_cpu_raw()
+  * when it is non-negative, otherwise the rseq_fallback_current_cpu() result.
+  */
+ static inline uint32_t rseq_current_cpu(void)
+ {
+ 	int32_t cpu = rseq_current_cpu_raw();
+
+ 	return rseq_unlikely(cpu < 0) ? rseq_fallback_current_cpu() : cpu;
+ }
+
+ /* Reset the rseq_cs pointer of the current thread's rseq area to 0. */
+ static inline void rseq_clear_rseq_cs(void)
+ {
+ 	struct rseq_abi *abi = rseq_get_abi();
+
+ 	RSEQ_WRITE_ONCE(abi->rseq_cs.arch.ptr, 0);
+ }
+
+/*
+ * rseq_prepare_unload() should be invoked by each thread executing a rseq
+ * critical section at least once between their last critical section and
+ * library unload of the library defining the rseq critical section (struct
+ * rseq_cs) or the code referred to by the struct rseq_cs start_ip and
+ * post_commit_offset fields. This also applies to use of rseq in code
+ * generated by JIT: rseq_prepare_unload() should be invoked at least once by
+ * each thread executing a rseq critical section before reclaim of the memory
+ * holding the struct rseq_cs or reclaim of the code pointed to by struct
+ * rseq_cs start_ip and post_commit_offset fields.
+ */
+ static inline void rseq_prepare_unload(void)
+ {
+ 	/* Same effect as rseq_clear_rseq_cs(): drop the rseq_cs pointer. */
+ 	RSEQ_WRITE_ONCE(rseq_get_abi()->rseq_cs.arch.ptr, 0);
+ }
+
+ #endif /* RSEQ_H */
diff --git a/tools/testing/selftests/rseq/run_param_test.sh b/tools/testing/selftests/rseq/run_param_test.sh
new file mode 100755
index 000000000..f51bc83c9
--- /dev/null
+++ b/tools/testing/selftests/rseq/run_param_test.sh
@@ -0,0 +1,126 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+ or MIT
+
+NR_CPUS=`grep '^processor' /proc/cpuinfo | wc -l`
+
+EXTRA_ARGS=${@}
+
+OLDIFS="$IFS"
+IFS=$'\n'
+TEST_LIST=(
+ "-T s"
+ "-T l"
+ "-T b"
+ "-T b -M"
+ "-T m"
+ "-T m -M"
+ "-T i"
+ "-T r"
+)
+
+TEST_NAME=(
+ "spinlock"
+ "list"
+ "buffer"
+ "buffer with barrier"
+ "memcpy"
+ "memcpy with barrier"
+ "increment"
+ "membarrier"
+)
+IFS="$OLDIFS"
+
+REPS=1000
+SLOW_REPS=100
+NR_THREADS=$((6*${NR_CPUS}))
+
+ # Run every TEST_LIST entry through ./param_test and
+ # ./param_test_compare_twice.
+ # Arguments: extra options passed to each invocation.
+ # Globals read: TEST_LIST, TEST_NAME, REPS, NR_THREADS, EXTRA_ARGS.
+ # Exits the script with status 1 as soon as one invocation fails.
+ function do_tests()
+ {
+ 	local i
+ 	for ((i = 0; i < ${#TEST_LIST[@]}; i++)); do
+ 		echo "Running test ${TEST_NAME[$i]}"
+ 		# TEST_LIST entries and EXTRA_ARGS hold several options in one
+ 		# string and are split on purpose; "$@" is quoted so the caller's
+ 		# arguments are passed through unchanged.
+ 		./param_test ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} "$@" ${EXTRA_ARGS} || exit 1
+ 		echo "Running compare-twice test ${TEST_NAME[$i]}"
+ 		./param_test_compare_twice ${TEST_LIST[$i]} -r ${REPS} -t ${NR_THREADS} "$@" ${EXTRA_ARGS} || exit 1
+ 	done
+ }
+
+echo "Default parameters"
+do_tests
+
+echo "Loop injection: 10000 loops"
+
+OLDIFS="$IFS"
+IFS=$'\n'
+INJECT_LIST=(
+ "1"
+ "2"
+ "3"
+ "4"
+ "5"
+ "6"
+ "7"
+ "8"
+ "9"
+)
+IFS="$OLDIFS"
+
+NR_LOOPS=10000
+
+i=0
+while [ "$i" -lt "${#INJECT_LIST[@]}" ]; do
+ echo "Injecting at <${INJECT_LIST[$i]}>"
+ do_tests -${INJECT_LIST[i]} ${NR_LOOPS}
+ let "i++"
+done
+NR_LOOPS=
+
+ # Run do_tests with injection points 7, 8 and 9, passing "-<point> -1"
+ # followed by this function's own arguments.
+ # Globals written: OLDIFS, INJECT_LIST, NR_LOOPS, i.
+ function inject_blocking()
+ {
+ 	OLDIFS="$IFS"
+ 	IFS=$'\n'
+ 	INJECT_LIST=("7" "8" "9")
+ 	IFS="$OLDIFS"
+
+ 	NR_LOOPS=-1
+
+ 	for ((i = 0; i < ${#INJECT_LIST[@]}; i++)); do
+ 		echo "Injecting at <${INJECT_LIST[$i]}>"
+ 		do_tests -${INJECT_LIST[i]} -1 ${@}
+ 	done
+ 	NR_LOOPS=
+ }
+
+echo "Yield injection (25%)"
+inject_blocking -m 4 -y
+
+echo "Yield injection (50%)"
+inject_blocking -m 2 -y
+
+echo "Yield injection (100%)"
+inject_blocking -m 1 -y
+
+echo "Kill injection (25%)"
+inject_blocking -m 4 -k
+
+echo "Kill injection (50%)"
+inject_blocking -m 2 -k
+
+echo "Kill injection (100%)"
+inject_blocking -m 1 -k
+
+echo "Sleep injection (1ms, 25%)"
+inject_blocking -m 4 -s 1
+
+echo "Sleep injection (1ms, 50%)"
+inject_blocking -m 2 -s 1
+
+echo "Sleep injection (1ms, 100%)"
+inject_blocking -m 1 -s 1
diff --git a/tools/testing/selftests/rseq/settings b/tools/testing/selftests/rseq/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/rseq/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/rtc/.gitignore b/tools/testing/selftests/rtc/.gitignore
new file mode 100644
index 000000000..fb2d533aa
--- /dev/null
+++ b/tools/testing/selftests/rtc/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rtctest
+setdate
diff --git a/tools/testing/selftests/rtc/Makefile b/tools/testing/selftests/rtc/Makefile
new file mode 100644
index 000000000..55198ecc0
--- /dev/null
+++ b/tools/testing/selftests/rtc/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -O3 -Wl,-no-as-needed -Wall
+LDLIBS += -lrt -lpthread -lm
+
+TEST_GEN_PROGS = rtctest
+
+TEST_GEN_PROGS_EXTENDED = setdate
+
+TEST_FILES := settings
+
+include ../lib.mk
diff --git a/tools/testing/selftests/rtc/rtctest.c b/tools/testing/selftests/rtc/rtctest.c
new file mode 100644
index 000000000..66af608fb
--- /dev/null
+++ b/tools/testing/selftests/rtc/rtctest.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Real Time Clock Driver Test Program
+ *
+ * Copyright (c) 2018 Alexandre Belloni <alexandre.belloni@bootlin.com>
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/rtc.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#define NUM_UIE 3
+#define ALARM_DELTA 3
+
+static char *rtc_file = "/dev/rtc0";
+
+ /* Per-test fixture state: the file descriptor opened on the RTC device. */
+ FIXTURE(rtc) {
+ int fd;
+ };
+
+ /* Open rtc_file read-only and check that open() did not return -1. */
+ FIXTURE_SETUP(rtc) {
+ self->fd = open(rtc_file, O_RDONLY);
+ ASSERT_NE(-1, self->fd);
+ }
+
+ /* Close the descriptor opened by the setup. */
+ FIXTURE_TEARDOWN(rtc) {
+ close(self->fd);
+ }
+
+ /* Read the current time with RTC_RD_TIME and log it as DD/MM/YYYY HH:MM:SS. */
+ TEST_F(rtc, date_read) {
+ int rc;
+ struct rtc_time rtc_tm;
+
+ /* Read the RTC time/date */
+ rc = ioctl(self->fd, RTC_RD_TIME, &rtc_tm);
+ ASSERT_NE(-1, rc);
+
+ /* tm_mon is logged +1 and tm_year +1900 to get calendar values. */
+ TH_LOG("Current RTC date/time is %02d/%02d/%02d %02d:%02d:%02d.",
+ rtc_tm.tm_mday, rtc_tm.tm_mon + 1, rtc_tm.tm_year + 1900,
+ rtc_tm.tm_hour, rtc_tm.tm_min, rtc_tm.tm_sec);
+ }
+
+ /*
+  * Enable update interrupts and collect NUM_UIE of them with read().
+  * The test returns early when RTC_UIE_ON fails with EINVAL.
+  */
+ TEST_F_TIMEOUT(rtc, uie_read, NUM_UIE + 2) {
+ int i, rc, irq = 0;
+ unsigned long data;
+
+ /* Turn on update interrupts */
+ rc = ioctl(self->fd, RTC_UIE_ON, 0);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip update IRQs not supported.");
+ return;
+ }
+
+ for (i = 0; i < NUM_UIE; i++) {
+ /* This read will block */
+ rc = read(self->fd, &data, sizeof(data));
+ ASSERT_NE(-1, rc);
+ irq++;
+ }
+
+ EXPECT_EQ(NUM_UIE, irq);
+
+ /* Turn update interrupts off again. */
+ rc = ioctl(self->fd, RTC_UIE_OFF, 0);
+ ASSERT_NE(-1, rc);
+ }
+
+ /*
+  * Same as uie_read, but each interrupt is first awaited with select()
+  * (2 second timeout) before it is consumed with read().
+  */
+ TEST_F(rtc, uie_select) {
+ int i, rc, irq = 0;
+ unsigned long data;
+
+ /* Turn on update interrupts */
+ rc = ioctl(self->fd, RTC_UIE_ON, 0);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip update IRQs not supported.");
+ return;
+ }
+
+ for (i = 0; i < NUM_UIE; i++) {
+ /* tv and readfds are rebuilt on every iteration before select(). */
+ struct timeval tv = { .tv_sec = 2 };
+ fd_set readfds;
+
+ FD_ZERO(&readfds);
+ FD_SET(self->fd, &readfds);
+ /* The select will wait until an RTC interrupt happens. */
+ rc = select(self->fd + 1, &readfds, NULL, NULL, &tv);
+ /* -1 is an error, 0 means the timeout expired. */
+ ASSERT_NE(-1, rc);
+ ASSERT_NE(0, rc);
+
+ /* This read won't block */
+ rc = read(self->fd, &data, sizeof(unsigned long));
+ ASSERT_NE(-1, rc);
+ irq++;
+ }
+
+ EXPECT_EQ(NUM_UIE, irq);
+
+ rc = ioctl(self->fd, RTC_UIE_OFF, 0);
+ ASSERT_NE(-1, rc);
+ }
+
+ /*
+  * Program an alarm ALARM_DELTA seconds ahead with RTC_ALM_SET, wait for it
+  * with select(), then check that the RTC time read back equals the alarm
+  * time. Returns early when RTC_ALM_SET fails with EINVAL.
+  */
+ TEST_F(rtc, alarm_alm_set) {
+ struct timeval tv = { .tv_sec = ALARM_DELTA + 2 };
+ unsigned long data;
+ struct rtc_time tm;
+ fd_set readfds;
+ time_t secs, new;
+ int rc;
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /*
+  * NOTE(review): struct rtc_time is passed to timegm()/gmtime_r() through
+  * a cast to struct tm. This assumes the two layouts are compatible and
+  * that struct tm is not larger than struct rtc_time - confirm, otherwise
+  * these calls can access memory past tm.
+  */
+ secs = timegm((struct tm *)&tm) + ALARM_DELTA;
+ gmtime_r(&secs, (struct tm *)&tm);
+
+ rc = ioctl(self->fd, RTC_ALM_SET, &tm);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip alarms are not supported.");
+ return;
+ }
+
+ rc = ioctl(self->fd, RTC_ALM_READ, &tm);
+ ASSERT_NE(-1, rc);
+
+ TH_LOG("Alarm time now set to %02d:%02d:%02d.",
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+
+ /* Enable alarm interrupts */
+ rc = ioctl(self->fd, RTC_AIE_ON, 0);
+ ASSERT_NE(-1, rc);
+
+ FD_ZERO(&readfds);
+ FD_SET(self->fd, &readfds);
+
+ /* Wait up to ALARM_DELTA + 2 seconds; 0 means the alarm never fired. */
+ rc = select(self->fd + 1, &readfds, NULL, NULL, &tv);
+ ASSERT_NE(-1, rc);
+ ASSERT_NE(0, rc);
+
+ /* Disable alarm interrupts */
+ rc = ioctl(self->fd, RTC_AIE_OFF, 0);
+ ASSERT_NE(-1, rc);
+
+ rc = read(self->fd, &data, sizeof(unsigned long));
+ ASSERT_NE(-1, rc);
+ TH_LOG("data: %lx", data);
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /* The time read right after the alarm must be the programmed second. */
+ new = timegm((struct tm *)&tm);
+ ASSERT_EQ(new, secs);
+ }
+
+ /*
+  * Program an enabled wake alarm ALARM_DELTA seconds ahead with
+  * RTC_WKALM_SET, wait for it with select(), then check that the RTC time
+  * read back equals the alarm time. Returns early when RTC_WKALM_SET fails
+  * with EINVAL.
+  */
+ TEST_F(rtc, alarm_wkalm_set) {
+ struct timeval tv = { .tv_sec = ALARM_DELTA + 2 };
+ struct rtc_wkalrm alarm = { 0 };
+ struct rtc_time tm;
+ unsigned long data;
+ fd_set readfds;
+ time_t secs, new;
+ int rc;
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &alarm.time);
+ ASSERT_NE(-1, rc);
+
+ /*
+  * NOTE(review): alarm.time (struct rtc_time) is passed to
+  * timegm()/gmtime_r() through a cast to struct tm. This assumes the two
+  * layouts are compatible and that struct tm is not larger - confirm,
+  * otherwise these calls can access memory past alarm.
+  */
+ secs = timegm((struct tm *)&alarm.time) + ALARM_DELTA;
+ gmtime_r(&secs, (struct tm *)&alarm.time);
+
+ alarm.enabled = 1;
+
+ rc = ioctl(self->fd, RTC_WKALM_SET, &alarm);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip alarms are not supported.");
+ return;
+ }
+
+ rc = ioctl(self->fd, RTC_WKALM_RD, &alarm);
+ ASSERT_NE(-1, rc);
+
+ TH_LOG("Alarm time now set to %02d/%02d/%02d %02d:%02d:%02d.",
+ alarm.time.tm_mday, alarm.time.tm_mon + 1,
+ alarm.time.tm_year + 1900, alarm.time.tm_hour,
+ alarm.time.tm_min, alarm.time.tm_sec);
+
+ FD_ZERO(&readfds);
+ FD_SET(self->fd, &readfds);
+
+ /* Wait up to ALARM_DELTA + 2 seconds; 0 means the alarm never fired. */
+ rc = select(self->fd + 1, &readfds, NULL, NULL, &tv);
+ ASSERT_NE(-1, rc);
+ ASSERT_NE(0, rc);
+
+ rc = read(self->fd, &data, sizeof(unsigned long));
+ ASSERT_NE(-1, rc);
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /* The time read right after the alarm must be the programmed second. */
+ new = timegm((struct tm *)&tm);
+ ASSERT_EQ(new, secs);
+ }
+
+ /*
+  * Like alarm_alm_set, but the alarm is programmed for the start of the
+  * next minute (current time + 60 - tm_sec) and select() waits up to 62
+  * seconds.
+  */
+ TEST_F_TIMEOUT(rtc, alarm_alm_set_minute, 65) {
+ struct timeval tv = { .tv_sec = 62 };
+ unsigned long data;
+ struct rtc_time tm;
+ fd_set readfds;
+ time_t secs, new;
+ int rc;
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /*
+  * NOTE(review): struct rtc_time is passed to timegm()/gmtime_r() through
+  * a cast to struct tm. This assumes the two layouts are compatible and
+  * that struct tm is not larger than struct rtc_time - confirm, otherwise
+  * these calls can access memory past tm.
+  */
+ secs = timegm((struct tm *)&tm) + 60 - tm.tm_sec;
+ gmtime_r(&secs, (struct tm *)&tm);
+
+ rc = ioctl(self->fd, RTC_ALM_SET, &tm);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip alarms are not supported.");
+ return;
+ }
+
+ rc = ioctl(self->fd, RTC_ALM_READ, &tm);
+ ASSERT_NE(-1, rc);
+
+ TH_LOG("Alarm time now set to %02d:%02d:%02d.",
+ tm.tm_hour, tm.tm_min, tm.tm_sec);
+
+ /* Enable alarm interrupts */
+ rc = ioctl(self->fd, RTC_AIE_ON, 0);
+ ASSERT_NE(-1, rc);
+
+ FD_ZERO(&readfds);
+ FD_SET(self->fd, &readfds);
+
+ /* 0 from select() means the alarm did not fire within 62 seconds. */
+ rc = select(self->fd + 1, &readfds, NULL, NULL, &tv);
+ ASSERT_NE(-1, rc);
+ ASSERT_NE(0, rc);
+
+ /* Disable alarm interrupts */
+ rc = ioctl(self->fd, RTC_AIE_OFF, 0);
+ ASSERT_NE(-1, rc);
+
+ rc = read(self->fd, &data, sizeof(unsigned long));
+ ASSERT_NE(-1, rc);
+ TH_LOG("data: %lx", data);
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /* The time read right after the alarm must be the programmed second. */
+ new = timegm((struct tm *)&tm);
+ ASSERT_EQ(new, secs);
+ }
+
+ /*
+  * Like alarm_wkalm_set, but the wake alarm is programmed for the start of
+  * the next minute (current time + 60 - tm_sec) and select() waits up to 62
+  * seconds.
+  */
+ TEST_F_TIMEOUT(rtc, alarm_wkalm_set_minute, 65) {
+ struct timeval tv = { .tv_sec = 62 };
+ struct rtc_wkalrm alarm = { 0 };
+ struct rtc_time tm;
+ unsigned long data;
+ fd_set readfds;
+ time_t secs, new;
+ int rc;
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &alarm.time);
+ ASSERT_NE(-1, rc);
+
+ /*
+  * NOTE(review): alarm.time (struct rtc_time) is passed to
+  * timegm()/gmtime_r() through a cast to struct tm. This assumes the two
+  * layouts are compatible and that struct tm is not larger - confirm,
+  * otherwise these calls can access memory past alarm.
+  */
+ secs = timegm((struct tm *)&alarm.time) + 60 - alarm.time.tm_sec;
+ gmtime_r(&secs, (struct tm *)&alarm.time);
+
+ alarm.enabled = 1;
+
+ rc = ioctl(self->fd, RTC_WKALM_SET, &alarm);
+ if (rc == -1) {
+ ASSERT_EQ(EINVAL, errno);
+ TH_LOG("skip alarms are not supported.");
+ return;
+ }
+
+ rc = ioctl(self->fd, RTC_WKALM_RD, &alarm);
+ ASSERT_NE(-1, rc);
+
+ TH_LOG("Alarm time now set to %02d/%02d/%02d %02d:%02d:%02d.",
+ alarm.time.tm_mday, alarm.time.tm_mon + 1,
+ alarm.time.tm_year + 1900, alarm.time.tm_hour,
+ alarm.time.tm_min, alarm.time.tm_sec);
+
+ FD_ZERO(&readfds);
+ FD_SET(self->fd, &readfds);
+
+ /* 0 from select() means the alarm did not fire within 62 seconds. */
+ rc = select(self->fd + 1, &readfds, NULL, NULL, &tv);
+ ASSERT_NE(-1, rc);
+ ASSERT_NE(0, rc);
+
+ rc = read(self->fd, &data, sizeof(unsigned long));
+ ASSERT_NE(-1, rc);
+
+ rc = ioctl(self->fd, RTC_RD_TIME, &tm);
+ ASSERT_NE(-1, rc);
+
+ /* The time read right after the alarm must be the programmed second. */
+ new = timegm((struct tm *)&tm);
+ ASSERT_EQ(new, secs);
+ }
+
+ /*
+  * Constructor that sets __constructor_order to _CONSTRUCTOR_ORDER_BACKWARD
+  * when nothing has set it yet.
+  * NOTE(review): both names come from kselftest_harness.h; presumably this
+  * lets the harness detect the order in which constructors run - confirm.
+  */
+ static void __attribute__((constructor))
+ __constructor_order_last(void)
+ {
+ if (!__constructor_order)
+ __constructor_order = _CONSTRUCTOR_ORDER_BACKWARD;
+ }
+
+ /*
+  * Entry point: an optional single argument overrides the RTC device path,
+  * any other argument count prints the usage and returns 1.
+  */
+ int main(int argc, char **argv)
+ {
+ 	if (argc != 1 && argc != 2) {
+ 		fprintf(stderr, "usage: %s [rtcdev]\n", argv[0]);
+ 		return 1;
+ 	}
+
+ 	if (argc == 2)
+ 		rtc_file = argv[1];
+
+ 	return test_harness_run(argc, argv);
+ }
diff --git a/tools/testing/selftests/rtc/setdate.c b/tools/testing/selftests/rtc/setdate.c
new file mode 100644
index 000000000..b303890b3
--- /dev/null
+++ b/tools/testing/selftests/rtc/setdate.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Real Time Clock Driver Test
+ * by: Benjamin Gaignard (benjamin.gaignard@linaro.org)
+ *
+ * To build
+ * gcc setdate.c -o rtctest_setdate
+ */
+
+#include <stdio.h>
+#include <linux/rtc.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+static const char default_time[] = "00:00:00";
+
+ /*
+  * Set the RTC given as argv[1] to the date argv[2] (DD-MM-YYYY) and the
+  * optional time argv[3] (HH:MM:SS, default 00:00:00), then read the time
+  * back and print it.
+  *
+  * Returns 0 on success and 1 on a usage or parse error; exits with the
+  * errno value of the failing open()/ioctl() otherwise.
+  */
+ int main(int argc, char **argv)
+ {
+ 	int fd, retval, err;
+ 	/* Zeroed so that fields sscanf() does not fill are never garbage. */
+ 	struct rtc_time new = { 0 }, current;
+ 	const char *rtc, *date;
+ 	const char *time = default_time;
+
+ 	switch (argc) {
+ 	case 4:
+ 		time = argv[3];
+ 		/* FALLTHROUGH */
+ 	case 3:
+ 		date = argv[2];
+ 		rtc = argv[1];
+ 		break;
+ 	default:
+ 		fprintf(stderr, "usage: rtctest_setdate <rtcdev> <DD-MM-YYYY> [HH:MM:SS]\n");
+ 		return 1;
+ 	}
+
+ 	/* Validate both strings before touching the device. */
+ 	if (sscanf(date, "%d-%d-%d", &new.tm_mday, &new.tm_mon, &new.tm_year) != 3) {
+ 		fprintf(stderr, "invalid date '%s', expected DD-MM-YYYY\n", date);
+ 		return 1;
+ 	}
+ 	new.tm_mon -= 1;
+ 	new.tm_year -= 1900;
+ 	if (sscanf(time, "%d:%d:%d", &new.tm_hour, &new.tm_min, &new.tm_sec) != 3) {
+ 		fprintf(stderr, "invalid time '%s', expected HH:MM:SS\n", time);
+ 		return 1;
+ 	}
+
+ 	fd = open(rtc, O_RDONLY);
+ 	if (fd == -1) {
+ 		/* Save errno: perror() may change it before exit() reads it. */
+ 		err = errno;
+ 		perror(rtc);
+ 		exit(err);
+ 	}
+
+ 	fprintf(stderr, "Test will set RTC date/time to %d-%d-%d, %02d:%02d:%02d.\n",
+ 		new.tm_mday, new.tm_mon + 1, new.tm_year + 1900,
+ 		new.tm_hour, new.tm_min, new.tm_sec);
+
+ 	/* Write the new date in RTC */
+ 	retval = ioctl(fd, RTC_SET_TIME, &new);
+ 	if (retval == -1) {
+ 		err = errno;
+ 		perror("RTC_SET_TIME ioctl");
+ 		close(fd);
+ 		exit(err);
+ 	}
+
+ 	/* Read back */
+ 	retval = ioctl(fd, RTC_RD_TIME, &current);
+ 	if (retval == -1) {
+ 		err = errno;
+ 		perror("RTC_RD_TIME ioctl");
+ 		close(fd);
+ 		exit(err);
+ 	}
+
+ 	fprintf(stderr, "\n\nCurrent RTC date/time is %d-%d-%d, %02d:%02d:%02d.\n",
+ 		current.tm_mday, current.tm_mon + 1, current.tm_year + 1900,
+ 		current.tm_hour, current.tm_min, current.tm_sec);
+
+ 	close(fd);
+ 	return 0;
+ }
diff --git a/tools/testing/selftests/rtc/settings b/tools/testing/selftests/rtc/settings
new file mode 100644
index 000000000..a953c96aa
--- /dev/null
+++ b/tools/testing/selftests/rtc/settings
@@ -0,0 +1 @@
+timeout=180
diff --git a/tools/testing/selftests/run_kselftest.sh b/tools/testing/selftests/run_kselftest.sh
new file mode 100755
index 000000000..97165a83d
--- /dev/null
+++ b/tools/testing/selftests/run_kselftest.sh
@@ -0,0 +1,93 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run installed kselftest tests.
+#
+ # Directory holding this script; everything below uses paths relative to it.
+ BASE_DIR=$(realpath "$(dirname "$0")")
+ # Stop here if the directory cannot be entered: runner.sh is sourced through
+ # a relative path and must not be picked up from somewhere else.
+ cd "$BASE_DIR" || exit 1
+ TESTS="$BASE_DIR"/kselftest-list.txt
+ if [ ! -r "$TESTS" ] ; then
+ 	echo "$0: Could not find list of tests to run ($TESTS)" >&2
+ 	available=""
+ else
+ 	available="$(cat "$TESTS")"
+ fi
+
+ . ./kselftest/runner.sh
+ ROOT=$PWD
+
+ # Print the option summary on stdout and exit.
+ # Arguments: $1 - exit status to terminate the script with.
+ usage()
+ {
+ cat <<EOF
+ Usage: $0 [OPTIONS]
+ -s | --summary Print summary with detailed log in output.log
+ -t | --test COLLECTION:TEST Run TEST from COLLECTION
+ -c | --collection COLLECTION Run all tests from COLLECTION
+ -l | --list List the available collection:test entries
+ -d | --dry-run Don't actually run any tests
+ -h | --help Show this usage info
+ EOF
+ exit $1
+ }
+
+ COLLECTIONS=""
+ TESTS=""
+ dryrun=""
+ # Parse the command line. The loop is bounded by the argument count so it
+ # always terminates.
+ while [ $# -gt 0 ]; do
+ 	case "$1" in
+ 		-s | --summary)
+ 			logfile="$BASE_DIR"/output.log
+ 			cat /dev/null > "$logfile"
+ 			shift ;;
+ 		-t | --test)
+ 			# "shift 2" fails when the value is missing, which would leave
+ 			# $1 in place (endless loop) or abort the shell.
+ 			[ $# -ge 2 ] || usage 1
+ 			TESTS="$TESTS $2"
+ 			shift 2 ;;
+ 		-c | --collection)
+ 			[ $# -ge 2 ] || usage 1
+ 			COLLECTIONS="$COLLECTIONS $2"
+ 			shift 2 ;;
+ 		-l | --list)
+ 			echo "$available"
+ 			exit 0 ;;
+ 		-d | --dry-run)
+ 			dryrun="echo"
+ 			shift ;;
+ 		-h | --help)
+ 			usage 0 ;;
+ 		"")
+ 			break ;;
+ 		*)
+ 			usage 1 ;;
+ 	esac
+ done
+
+ # Add all selected collections to the explicit test list.
+ if [ -n "$COLLECTIONS" ]; then
+ 	for collection in $COLLECTIONS ; do
+ 		found="$(echo "$available" | grep "^$collection:")"
+ 		if [ -z "$found" ] ; then
+ 			echo "No such collection '$collection'" >&2
+ 			exit 1
+ 		fi
+ 		TESTS="$TESTS $found"
+ 	done
+ fi
+ # Replace available test list with explicitly selected tests.
+ if [ -n "$TESTS" ]; then
+ 	valid=""
+ 	for test in $TESTS ; do
+ 		found="$(echo "$available" | grep "^${test}$")"
+ 		if [ -z "$found" ] ; then
+ 			echo "No such test '$test'" >&2
+ 			exit 1
+ 		fi
+ 		valid="$valid $found"
+ 	done
+ 	# One entry per line; tr is used because "\n" in a sed replacement
+ 	# is not portable.
+ 	available="$(echo "$valid" | tr ' ' '\n')"
+ fi
+
+ # Collect each collection once, in first-seen order. "uniq" only drops
+ # adjacent duplicates, so interleaved selections (a:x b:y a:z) would run
+ # collection "a" twice.
+ collections=""
+ for collection in $(echo "$available" | cut -d: -f1) ; do
+ 	case " $collections " in
+ 		*" $collection "*) ;;
+ 		*) collections="$collections $collection" ;;
+ 	esac
+ done
+ for collection in $collections ; do
+ 	[ -w /dev/kmsg ] && echo "kselftest: Running tests in $collection" >> /dev/kmsg
+ 	tests=$(echo "$available" | grep "^$collection:" | cut -d: -f2)
+ 	# Subshell so the "cd" does not leak into the next iteration.
+ 	($dryrun cd "$collection" && $dryrun run_many $tests)
+ done
diff --git a/tools/testing/selftests/safesetid/.gitignore b/tools/testing/selftests/safesetid/.gitignore
new file mode 100644
index 000000000..25d3db172
--- /dev/null
+++ b/tools/testing/selftests/safesetid/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+safesetid-test
diff --git a/tools/testing/selftests/safesetid/Makefile b/tools/testing/selftests/safesetid/Makefile
new file mode 100644
index 000000000..fa02c4d5e
--- /dev/null
+++ b/tools/testing/selftests/safesetid/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+ # Makefile for SafeSetID selftests.
+CFLAGS = -Wall -O2
+LDLIBS = -lcap
+
+TEST_PROGS := safesetid-test.sh
+TEST_GEN_FILES := safesetid-test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/safesetid/config b/tools/testing/selftests/safesetid/config
new file mode 100644
index 000000000..9d44e5c2e
--- /dev/null
+++ b/tools/testing/selftests/safesetid/config
@@ -0,0 +1,2 @@
+CONFIG_SECURITY=y
+CONFIG_SECURITYFS=y
diff --git a/tools/testing/selftests/safesetid/safesetid-test.c b/tools/testing/selftests/safesetid/safesetid-test.c
new file mode 100644
index 000000000..0c4d50644
--- /dev/null
+++ b/tools/testing/selftests/safesetid/safesetid-test.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <pwd.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/capability.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifndef CLONE_NEWUSER
+# define CLONE_NEWUSER 0x10000000
+#endif
+
+#define ROOT_USER 0
+#define RESTRICTED_PARENT 1
+#define ALLOWED_CHILD1 2
+#define ALLOWED_CHILD2 3
+#define NO_POLICY_USER 4
+
+char* add_whitelist_policy_file = "/sys/kernel/security/safesetid/add_whitelist_policy";
+
+ /* Print a printf-style message on stderr and exit with EXIT_FAILURE. */
+ static void die(char *fmt, ...)
+ {
+ 	va_list args;
+
+ 	va_start(args, fmt);
+ 	vfprintf(stderr, fmt, args);
+ 	va_end(args);
+
+ 	exit(EXIT_FAILURE);
+ }
+
+ /*
+  * Format a message and write it to @filename in a single write().
+  *
+  * @enoent_ok: when true, a missing file (ENOENT on open) counts as success.
+  * @filename:  file to open write-only.
+  * @fmt, @ap:  printf-style format and its arguments.
+  *
+  * Returns true on success, false on formatting, open, write or close
+  * failure. The descriptor is closed on every path once it has been opened.
+  */
+ static bool vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
+ {
+ 	char buf[4096];
+ 	int fd;
+ 	ssize_t written;
+ 	int buf_len;
+
+ 	buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
+ 	if (buf_len < 0) {
+ 		printf("vsnprintf failed: %s\n",
+ 			strerror(errno));
+ 		return false;
+ 	}
+ 	/* buf_len is known non-negative here, so the cast is safe. */
+ 	if ((size_t)buf_len >= sizeof(buf)) {
+ 		printf("vsnprintf output truncated\n");
+ 		return false;
+ 	}
+
+ 	fd = open(filename, O_WRONLY);
+ 	if (fd < 0) {
+ 		if ((errno == ENOENT) && enoent_ok)
+ 			return true;
+ 		return false;
+ 	}
+ 	written = write(fd, buf, buf_len);
+ 	if (written != buf_len) {
+ 		if (written >= 0)
+ 			printf("short write to %s\n", filename);
+ 		else
+ 			printf("write to %s failed: %s\n",
+ 				filename, strerror(errno));
+ 		/* Do not leak the descriptor on the error path. */
+ 		close(fd);
+ 		return false;
+ 	}
+ 	if (close(fd) != 0) {
+ 		printf("close of %s failed: %s\n",
+ 			filename, strerror(errno));
+ 		return false;
+ 	}
+ 	return true;
+ }
+
+ /*
+  * printf-style wrapper around vmaybe_write_file() with enoent_ok == false:
+  * a missing file is reported as a failure.
+  */
+ static bool write_file(char *filename, char *fmt, ...)
+ {
+ 	va_list args;
+ 	bool ok;
+
+ 	va_start(args, fmt);
+ 	ok = vmaybe_write_file(false, filename, fmt, args);
+ 	va_end(args);
+
+ 	return ok;
+ }
+
+ /*
+  * Make sure a passwd entry exists for @uid. When getpwuid() finds none, an
+  * entry named after the numeric uid is appended to /etc/passwd. Any failure
+  * terminates the program through die().
+  */
+ static void ensure_user_exists(uid_t uid)
+ {
+ 	struct passwd p;
+ 	FILE *fd;
+ 	char name_str[10];
+
+ 	/* Nothing to do when the user is already known. */
+ 	if (getpwuid(uid) != NULL)
+ 		return;
+
+ 	memset(&p,0x00,sizeof(p));
+ 	fd=fopen("/etc/passwd","a");
+ 	if (fd == NULL)
+ 		die("couldn't open file\n");
+ 	if (fseek(fd, 0, SEEK_END))
+ 		die("couldn't fseek\n");
+
+ 	/* The account name is the decimal uid. */
+ 	snprintf(name_str, 10, "%d", uid);
+ 	p.pw_name=name_str;
+ 	p.pw_uid=uid;
+ 	p.pw_gecos="Test account";
+ 	p.pw_dir="/dev/null";
+ 	p.pw_shell="/bin/false";
+
+ 	if (putpwent(&p,fd) != 0)
+ 		die("putpwent failed\n");
+ 	if (fclose(fd))
+ 		die("fclose failed\n");
+ }
+
+ /*
+  * Probe the policy file; when it does not exist (ENOENT), mount securityfs
+  * on /sys/kernel/security. Any other failure terminates through die().
+  */
+ static void ensure_securityfs_mounted(void)
+ {
+ 	int fd = open(add_whitelist_policy_file, O_WRONLY);
+
+ 	if (fd >= 0) {
+ 		/* The file is reachable: securityfs is already mounted. */
+ 		if (close(fd) != 0) {
+ 			die("close of %s failed: %s\n",
+ 				add_whitelist_policy_file, strerror(errno));
+ 		}
+ 		return;
+ 	}
+
+ 	if (errno != ENOENT)
+ 		die("couldn't find securityfs for unknown reason\n");
+
+ 	// Need to mount securityfs
+ 	if (mount("securityfs", "/sys/kernel/security",
+ 			"securityfs", 0, NULL) < 0)
+ 		die("mounting securityfs failed\n");
+ }
+
+ /*
+  * Write the four whitelist entries (1:2, 1:3, 2:2, 3:3) to the policy file
+  * in a single write(). Any failure terminates through die().
+  */
+ static void write_policies(void)
+ {
+ 	static char *policy_str =
+ 		"1:2\n"
+ 		"1:3\n"
+ 		"2:2\n"
+ 		"3:3\n";
+ 	size_t len = strlen(policy_str);
+ 	ssize_t written;
+ 	int fd;
+
+ 	fd = open(add_whitelist_policy_file, O_WRONLY);
+ 	if (fd < 0)
+ 		die("cant open add_whitelist_policy file\n");
+
+ 	written = write(fd, policy_str, len);
+ 	if (written < 0)
+ 		die("write to %s failed: %s\n",
+ 			add_whitelist_policy_file, strerror(errno));
+ 	if ((size_t)written != len)
+ 		die("short write to %s\n", add_whitelist_policy_file);
+
+ 	if (close(fd) != 0) {
+ 		die("close of %s failed: %s\n",
+ 			add_whitelist_policy_file, strerror(errno));
+ 	}
+ }
+
+ /*
+  * Create a child in a new user namespace and try to write its uid_map from
+  * the parent.
+  *
+  * @expect_success: whether the uid_map write is expected to succeed.
+  *
+  * Returns true when the outcome of the write matches @expect_success, and
+  * false when it does not or when clone/snprintf fails.
+  */
+ static bool test_userns(bool expect_success)
+ {
+ uid_t uid;
+ char map_file_name[32];
+ size_t sz = sizeof(map_file_name);
+ pid_t cpid;
+ bool success;
+
+ uid = getuid();
+
+ int clone_flags = CLONE_NEWUSER;
+ cpid = syscall(SYS_clone, clone_flags, NULL);
+ if (cpid == -1) {
+ printf("clone failed");
+ return false;
+ }
+
+ if (cpid == 0) { /* Code executed by child */
+ // Give parent 1 second to write map file
+ sleep(1);
+ exit(EXIT_SUCCESS);
+ } else { /* Code executed by parent */
+ /* NOTE(review): the child is not waited for in this function. */
+ if(snprintf(map_file_name, sz, "/proc/%d/uid_map", cpid) < 0) {
+ printf("preparing file name string failed");
+ return false;
+ }
+ /*
+  * NOTE(review): the format "0 0 1" has no conversion, so the uid
+  * argument is unused - confirm whether "0 %d 1" was intended.
+  */
+ success = write_file(map_file_name, "0 0 1", uid);
+ return success == expect_success;
+ }
+
+ /* Both branches above return or exit, so this is not reached. */
+ printf("should not reach here");
+ return false;
+ }
+
+ /*
+  * Fork a child that calls setuid(@child_uid) and compare the result with
+  * the expectation.
+  *
+  * @child_uid:      uid the child tries to switch to.
+  * @expect_success: whether that transition is expected to be allowed.
+  *
+  * Returns when the observed outcome matches @expect_success; every
+  * mismatch or unexpected wait status terminates through die().
+  */
+ static void test_setuid(uid_t child_uid, bool expect_success)
+ {
+ pid_t cpid, w;
+ int wstatus;
+
+ cpid = fork();
+ if (cpid == -1) {
+ die("fork\n");
+ }
+
+ if (cpid == 0) { /* Code executed by child */
+ /* Exit status reports whether the uid was really changed. */
+ if (setuid(child_uid) < 0)
+ exit(EXIT_FAILURE);
+ if (getuid() == child_uid)
+ exit(EXIT_SUCCESS);
+ else
+ exit(EXIT_FAILURE);
+ } else { /* Code executed by parent */
+ do {
+ w = waitpid(cpid, &wstatus, WUNTRACED | WCONTINUED);
+ if (w == -1) {
+ die("waitpid\n");
+ }
+
+ if (WIFEXITED(wstatus)) {
+ if (WEXITSTATUS(wstatus) == EXIT_SUCCESS) {
+ if (expect_success) {
+ return;
+ } else {
+ die("unexpected success\n");
+ }
+ } else {
+ if (expect_success) {
+ die("unexpected failure\n");
+ } else {
+ return;
+ }
+ }
+ } else if (WIFSIGNALED(wstatus)) {
+ /*
+  * Signal 9 (presumably SIGKILL) counts as a failed
+  * transition - confirm.
+  */
+ if (WTERMSIG(wstatus) == 9) {
+ if (expect_success)
+ die("killed unexpectedly\n");
+ else
+ return;
+ } else {
+ /* Prints the raw wait status, not WTERMSIG(). */
+ die("unexpected signal: %d\n", wstatus);
+ }
+ } else {
+ die("unexpected status: %d\n", wstatus);
+ }
+ } while (!WIFEXITED(wstatus) && !WIFSIGNALED(wstatus));
+ }
+
+ /* Every path above returns, exits or dies. */
+ die("should not reach here\n");
+ }
+
+ /* Make sure a passwd entry exists for every uid used by the test. */
+ static void ensure_users_exist(void)
+ {
+ 	static const uid_t uids[] = {
+ 		ROOT_USER,
+ 		RESTRICTED_PARENT,
+ 		ALLOWED_CHILD1,
+ 		ALLOWED_CHILD2,
+ 		NO_POLICY_USER,
+ 	};
+ 	size_t i;
+
+ 	for (i = 0; i < sizeof(uids) / sizeof(uids[0]); i++)
+ 		ensure_user_exists(uids[i]);
+ }
+
+ /*
+  * Adjust the capabilities of the calling process.
+  *
+  * @setid_retained: when true, raise CAP_SETUID and CAP_SETGID in the
+  *                  effective set of the current capability state; when
+  *                  false, clear every capability.
+  *
+  * Every libcap call is checked; a failure terminates through die() instead
+  * of letting later test steps run with an unexpected capability set.
+  */
+ static void drop_caps(bool setid_retained)
+ {
+ 	cap_value_t cap_values[] = {CAP_SETUID, CAP_SETGID};
+ 	cap_t caps;
+
+ 	caps = cap_get_proc();
+ 	if (caps == NULL)
+ 		die("cap_get_proc failed: %s\n", strerror(errno));
+
+ 	if (setid_retained) {
+ 		if (cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_values, CAP_SET))
+ 			die("cap_set_flag failed: %s\n", strerror(errno));
+ 	} else {
+ 		if (cap_clear(caps))
+ 			die("cap_clear failed: %s\n", strerror(errno));
+ 	}
+
+ 	if (cap_set_proc(caps))
+ 		die("cap_set_proc failed: %s\n", strerror(errno));
+ 	cap_free(caps);
+ }
+
+ /*
+  * Test driver: install the whitelist policies, then check which uid
+  * transitions and user-namespace mappings are allowed for an unrestricted
+  * user, for the restricted parent, and with all capabilities dropped.
+  * Any failed step terminates through die().
+  */
+ int main(int argc, char **argv)
+ {
+ 	ensure_users_exist();
+ 	ensure_securityfs_mounted();
+ 	write_policies();
+
+ 	if (prctl(PR_SET_KEEPCAPS, 1L))
+ 		die("Error with set keepcaps\n");
+
+ 	// First test to make sure we can write userns mappings from a user
+ 	// that doesn't have any restrictions (as long as it has CAP_SETUID);
+ 	//
+ 	// The gid is changed before the uid: moving the uid away from root
+ 	// clears the effective capability set, after which setgid() would no
+ 	// longer have CAP_SETGID.
+ 	if (setgid(NO_POLICY_USER) < 0)
+ 		die("Error with set gid(%d)\n", NO_POLICY_USER);
+ 	if (setuid(NO_POLICY_USER) < 0)
+ 		die("Error with set uid(%d)\n", NO_POLICY_USER);
+
+ 	// Take away all but setid caps
+ 	drop_caps(true);
+
+ 	// Need PR_SET_DUMPABLE flag set so we can write /proc/[pid]/uid_map
+ 	// from non-root parent process.
+ 	if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0))
+ 		die("Error with set dumpable\n");
+
+ 	if (!test_userns(true)) {
+ 		die("test_userns failed when it should work\n");
+ 	}
+
+ 	// Same order here: gid first, then the uid that puts the process
+ 	// under the restricted policy.
+ 	if (setgid(RESTRICTED_PARENT) < 0)
+ 		die("Error with set gid(%d)\n", RESTRICTED_PARENT);
+ 	if (setuid(RESTRICTED_PARENT) < 0)
+ 		die("Error with set uid(%d)\n", RESTRICTED_PARENT);
+
+ 	test_setuid(ROOT_USER, false);
+ 	test_setuid(ALLOWED_CHILD1, true);
+ 	test_setuid(ALLOWED_CHILD2, true);
+ 	test_setuid(NO_POLICY_USER, false);
+
+ 	if (!test_userns(false)) {
+ 		die("test_userns worked when it should fail\n");
+ 	}
+
+ 	// Now take away all caps
+ 	drop_caps(false);
+ 	test_setuid(ALLOWED_CHILD1, false);
+ 	test_setuid(ALLOWED_CHILD2, false);
+ 	test_setuid(NO_POLICY_USER, false);
+
+ 	// NOTE: this test doesn't clean up users that were created in
+ 	// /etc/passwd or flush policies that were added to the LSM.
+ 	return EXIT_SUCCESS;
+ }
diff --git a/tools/testing/selftests/safesetid/safesetid-test.sh b/tools/testing/selftests/safesetid/safesetid-test.sh
new file mode 100755
index 000000000..e4fdce675
--- /dev/null
+++ b/tools/testing/selftests/safesetid/safesetid-test.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+TCID="safesetid-test.sh"
+errcode=0
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_root()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo $TCID: must be run as root >&2
+ exit $ksft_skip
+ fi
+}
+
+main_function()
+{
+ check_root
+ ./safesetid-test
+}
+
+main_function
+echo "$TCID: done"
+exit $errcode
diff --git a/tools/testing/selftests/seccomp/.gitignore b/tools/testing/selftests/seccomp/.gitignore
new file mode 100644
index 000000000..dec678577
--- /dev/null
+++ b/tools/testing/selftests/seccomp/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+seccomp_bpf
+seccomp_benchmark
diff --git a/tools/testing/selftests/seccomp/Makefile b/tools/testing/selftests/seccomp/Makefile
new file mode 100644
index 000000000..585f7a0c1
--- /dev/null
+++ b/tools/testing/selftests/seccomp/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -Wl,-no-as-needed -Wall -isystem ../../../../usr/include/
+LDFLAGS += -lpthread
+
+TEST_GEN_PROGS := seccomp_bpf seccomp_benchmark
+include ../lib.mk
diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config
new file mode 100644
index 000000000..ad431a517
--- /dev/null
+++ b/tools/testing/selftests/seccomp/config
@@ -0,0 +1,4 @@
+CONFIG_PID_NS=y
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c
new file mode 100644
index 000000000..91f5a89ca
--- /dev/null
+++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c
@@ -0,0 +1,135 @@
+/*
+ * Strictly speaking, this is not a test. But it can report during test
+ * runs so relative performance can be measured.
+ */
+#define _GNU_SOURCE
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+
+/*
+ * Time @samples back-to-back getpid() syscalls against clock @clk_id.
+ *
+ * Prints the finish and start timestamps together with their difference,
+ * and returns the elapsed time in nanoseconds.
+ */
+unsigned long long timing(clockid_t clk_id, unsigned long long samples)
+{
+	struct timespec start, finish;
+	unsigned long long i;
+	pid_t pid, ret;
+	int err;
+
+	pid = getpid();
+	/* Keep the call outside assert() so it still runs under NDEBUG. */
+	err = clock_gettime(clk_id, &start);
+	assert(err == 0);
+	for (i = 0; i < samples; i++) {
+		ret = syscall(__NR_getpid);
+		assert(pid == ret);
+	}
+	err = clock_gettime(clk_id, &finish);
+	assert(err == 0);
+
+	/*
+	 * A negative tv_nsec delta is added modulo 2^64, which borrows from
+	 * the seconds term exactly as intended.
+	 */
+	i = finish.tv_sec - start.tv_sec;
+	i *= 1000000000ULL;
+	i += finish.tv_nsec - start.tv_nsec;
+
+	/* tv_sec and tv_nsec are signed; cast so the conversions match. */
+	printf("%lld.%09ld - %lld.%09ld = %llu (%.1fs)\n",
+	       (long long)finish.tv_sec, (long)finish.tv_nsec,
+	       (long long)start.tv_sec, (long)start.tv_nsec,
+	       i, (double)i / 1000000000.0);
+
+	return i;
+}
+
+/*
+ * Estimate how many getpid() syscalls fit into a 15 second window.
+ *
+ * Issues syscalls in batches of `step` until at least one second of
+ * CLOCK_MONOTONIC time has elapsed, then scales the number of syscalls made
+ * so far by the target number of seconds and returns the result.
+ */
+unsigned long long calibrate(void)
+{
+	struct timespec start, finish;
+	unsigned long long i, samples, step = 9973;
+	pid_t pid, ret;
+	int seconds = 15;
+	int err;
+
+	printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds);
+
+	samples = 0;
+	pid = getpid();
+	/* Keep the call outside assert() so it still runs under NDEBUG. */
+	err = clock_gettime(CLOCK_MONOTONIC, &start);
+	assert(err == 0);
+	do {
+		for (i = 0; i < step; i++) {
+			ret = syscall(__NR_getpid);
+			assert(pid == ret);
+		}
+		err = clock_gettime(CLOCK_MONOTONIC, &finish);
+		assert(err == 0);
+
+		samples += step;
+		/* Elapsed nanoseconds since calibration started. */
+		i = finish.tv_sec - start.tv_sec;
+		i *= 1000000000ULL;
+		i += finish.tv_nsec - start.tv_nsec;
+	} while (i < 1000000000ULL);
+
+	return samples * seconds;
+}
+
+/*
+ * Benchmark getpid() natively, then with one and with two allow-all seccomp
+ * filters installed, and print the estimated per-call overheads.
+ *
+ * An optional first argument gives the number of syscalls to time; without
+ * it the count comes from calibrate(). Returns 0 on success and 1 when the
+ * sample count is zero.
+ */
+int main(int argc, char *argv[])
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+	unsigned long long samples;
+	unsigned long long native, filter1, filter2;
+	long long overhead1, overhead2, per_filter;
+
+	printf("Current BPF sysctl settings:\n");
+	system("sysctl net.core.bpf_jit_enable");
+	system("sysctl net.core.bpf_jit_harden");
+
+	if (argc > 1)
+		samples = strtoull(argv[1], NULL, 0);
+	else
+		samples = calibrate();
+
+	/* Every per-call average below divides by the sample count. */
+	if (samples == 0) {
+		fprintf(stderr, "Sample count must be greater than zero.\n");
+		return 1;
+	}
+
+	printf("Benchmarking %llu syscalls...\n", samples);
+
+	/* Native call */
+	native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid native: %llu ns\n", native);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	assert(ret == 0);
+
+	/* One filter */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+	assert(ret == 0);
+
+	filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1);
+
+	if (filter1 == native)
+		printf("No overhead measured!? Try running again with more samples.\n");
+
+	/* Two filters */
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+	assert(ret == 0);
+
+	filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples;
+	printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2);
+
+	/*
+	 * Calculations. Measurement noise can make a filtered run faster than
+	 * the previous one, so the differences are computed as signed values
+	 * instead of wrapping around to huge unsigned numbers.
+	 */
+	overhead1 = (long long)filter1 - (long long)native;
+	overhead2 = (long long)filter2 - (long long)native;
+	per_filter = (long long)filter2 - (long long)filter1;
+
+	printf("Estimated total seccomp overhead for 1 filter: %lld ns\n",
+	       overhead1);
+
+	printf("Estimated total seccomp overhead for 2 filters: %lld ns\n",
+	       overhead2);
+
+	printf("Estimated seccomp per-filter overhead: %lld ns\n",
+	       per_filter);
+
+	printf("Estimated seccomp entry overhead: %lld ns\n",
+	       overhead1 - per_filter);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
new file mode 100644
index 000000000..413a7b9f3
--- /dev/null
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -0,0 +1,4162 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
+ *
+ * Test code for seccomp bpf.
+ */
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+
+/*
+ * glibc 2.26 and later have SIGSYS in siginfo_t. Before that,
+ * we need to use the kernel's siginfo.h file and trick glibc
+ * into accepting it.
+ */
+#if !__GLIBC_PREREQ(2, 26)
+# include <asm/siginfo.h>
+# define __have_siginfo_t 1
+# define __have_sigval_t 1
+# define __have_sigevent_t 1
+#endif
+
+#include <errno.h>
+#include <linux/filter.h>
+#include <sys/prctl.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <linux/prctl.h>
+#include <linux/ptrace.h>
+#include <linux/seccomp.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <limits.h>
+#include <linux/elf.h>
+#include <sys/uio.h>
+#include <sys/utsname.h>
+#include <sys/fcntl.h>
+#include <sys/mman.h>
+#include <sys/times.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <linux/kcmp.h>
+#include <sys/resource.h>
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <poll.h>
+
+#include "../kselftest_harness.h"
+#include "../clone3/clone3_selftests.h"
+
+/* Attempt to de-conflict with the selftests tree. */
+#ifndef SKIP
+#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__)
+#endif
+
+#ifndef PR_SET_PTRACER
+# define PR_SET_PTRACER 0x59616d61
+#endif
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#define PR_GET_NO_NEW_PRIVS 39
+#endif
+
+#ifndef PR_SECCOMP_EXT
+#define PR_SECCOMP_EXT 43
+#endif
+
+#ifndef SECCOMP_EXT_ACT
+#define SECCOMP_EXT_ACT 1
+#endif
+
+#ifndef SECCOMP_EXT_ACT_TSYNC
+#define SECCOMP_EXT_ACT_TSYNC 1
+#endif
+
+#ifndef SECCOMP_MODE_STRICT
+#define SECCOMP_MODE_STRICT 1
+#endif
+
+#ifndef SECCOMP_MODE_FILTER
+#define SECCOMP_MODE_FILTER 2
+#endif
+
+#ifndef SECCOMP_RET_ALLOW
+struct seccomp_data {
+ int nr;
+ __u32 arch;
+ __u64 instruction_pointer;
+ __u64 args[6];
+};
+#endif
+
+#ifndef SECCOMP_RET_KILL_PROCESS
+#define SECCOMP_RET_KILL_PROCESS 0x80000000U /* kill the process */
+#define SECCOMP_RET_KILL_THREAD 0x00000000U /* kill the thread */
+#endif
+#ifndef SECCOMP_RET_KILL
+#define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD
+#define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */
+#define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */
+#define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */
+#define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */
+#endif
+#ifndef SECCOMP_RET_LOG
+#define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */
+#endif
+
+#ifndef __NR_seccomp
+# if defined(__i386__)
+# define __NR_seccomp 354
+# elif defined(__x86_64__)
+# define __NR_seccomp 317
+# elif defined(__arm__)
+# define __NR_seccomp 383
+# elif defined(__aarch64__)
+# define __NR_seccomp 277
+# elif defined(__riscv)
+# define __NR_seccomp 277
+# elif defined(__csky__)
+# define __NR_seccomp 277
+# elif defined(__hppa__)
+# define __NR_seccomp 338
+# elif defined(__powerpc__)
+# define __NR_seccomp 358
+# elif defined(__s390__)
+# define __NR_seccomp 348
+# elif defined(__xtensa__)
+# define __NR_seccomp 337
+# elif defined(__sh__)
+# define __NR_seccomp 372
+# else
+# warning "seccomp syscall number unknown for this architecture"
+# define __NR_seccomp 0xffff
+# endif
+#endif
+
+#ifndef SECCOMP_SET_MODE_STRICT
+#define SECCOMP_SET_MODE_STRICT 0
+#endif
+
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+#ifndef SECCOMP_GET_ACTION_AVAIL
+#define SECCOMP_GET_ACTION_AVAIL 2
+#endif
+
+#ifndef SECCOMP_GET_NOTIF_SIZES
+#define SECCOMP_GET_NOTIF_SIZES 3
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_TSYNC
+#define SECCOMP_FILTER_FLAG_TSYNC (1UL << 0)
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_LOG
+#define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2)
+#endif
+
+#ifndef PTRACE_SECCOMP_GET_METADATA
+#define PTRACE_SECCOMP_GET_METADATA 0x420d
+
+struct seccomp_metadata {
+ __u64 filter_off; /* Input: which filter */
+ __u64 flags; /* Output: filter's flags */
+};
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
+#endif
+
+#ifndef SECCOMP_RET_USER_NOTIF
+#define SECCOMP_RET_USER_NOTIF 0x7fc00000U
+
+#define SECCOMP_IOC_MAGIC '!'
+#define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr)
+#define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOW(nr, type) _IOW(SECCOMP_IOC_MAGIC, nr, type)
+#define SECCOMP_IOWR(nr, type) _IOWR(SECCOMP_IOC_MAGIC, nr, type)
+
+/* Flags for seccomp notification fd ioctl. */
+#define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif)
+#define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \
+ struct seccomp_notif_resp)
+#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64)
+
+struct seccomp_notif {
+ __u64 id;
+ __u32 pid;
+ __u32 flags;
+ struct seccomp_data data;
+};
+
+struct seccomp_notif_resp {
+ __u64 id;
+ __s64 val;
+ __s32 error;
+ __u32 flags;
+};
+
+struct seccomp_notif_sizes {
+ __u16 seccomp_notif;
+ __u16 seccomp_notif_resp;
+ __u16 seccomp_data;
+};
+#endif
+
+#ifndef SECCOMP_IOCTL_NOTIF_ADDFD
+/* On success, the return value is the remote process's added fd number */
+#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
+ struct seccomp_notif_addfd)
+
+/* valid flags for seccomp_notif_addfd */
+#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
+
+struct seccomp_notif_addfd {
+ __u64 id;
+ __u32 flags;
+ __u32 srcfd;
+ __u32 newfd;
+ __u32 newfd_flags;
+};
+#endif
+
+/*
+ * Test-only variant of the ADDFD ioctl: it reuses ioctl number 3 (the same
+ * number as SECCOMP_IOCTL_NOTIF_ADDFD) but is encoded with a struct that
+ * holds only the id plus four bytes.
+ * NOTE(review): presumably exercises the kernel's handling of an undersized
+ * argument; confirm against the tests that use it.
+ */
+struct seccomp_notif_addfd_small {
+ __u64 id;
+ char weird[4];
+};
+#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \
+ SECCOMP_IOW(3, struct seccomp_notif_addfd_small)
+
+/*
+ * Test-only variant of the ADDFD ioctl whose argument is 8 bytes larger than
+ * struct seccomp_notif_addfd (the union pads the real struct with a bigger
+ * buffer). Note that it is encoded with SECCOMP_IOWR rather than SECCOMP_IOW.
+ * NOTE(review): presumably exercises the kernel's handling of an oversized
+ * argument; confirm against the tests that use it.
+ */
+struct seccomp_notif_addfd_big {
+ union {
+ struct seccomp_notif_addfd addfd;
+ char buf[sizeof(struct seccomp_notif_addfd) + 8];
+ };
+};
+#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG \
+ SECCOMP_IOWR(3, struct seccomp_notif_addfd_big)
+
+#ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY
+#define PTRACE_EVENTMSG_SYSCALL_ENTRY 1
+#define PTRACE_EVENTMSG_SYSCALL_EXIT 2
+#endif
+
+#ifndef SECCOMP_USER_NOTIF_FLAG_CONTINUE
+#define SECCOMP_USER_NOTIF_FLAG_CONTINUE 0x00000001
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_TSYNC_ESRCH
+#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH (1UL << 4)
+#endif
+
+#ifndef seccomp
+/*
+ * Thin wrapper that issues the seccomp syscall by number through syscall().
+ * errno is reset to 0 before the call; the raw syscall() result is returned.
+ */
+int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+ errno = 0;
+ return syscall(__NR_seccomp, op, flags, args);
+}
+#endif
+
+/*
+ * Byte offset, within struct seccomp_data, of the least-significant 32-bit
+ * word of args[_n]: the start of the 64-bit slot on little-endian, and
+ * sizeof(__u32) bytes into it on big-endian.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]))
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define syscall_arg(_n) (offsetof(struct seccomp_data, args[_n]) + sizeof(__u32))
+#else
+#error "wut? Unknown __BYTE_ORDER?!"
+#endif
+
+#define SIBLING_EXIT_UNKILLED 0xbadbeef
+#define SIBLING_EXIT_FAILURE 0xbadface
+#define SIBLING_EXIT_NEWPRIVS 0xbadfeed
+
+/*
+ * Compare fd1 in process pid1 with fd2 in process pid2 using
+ * kcmp(KCMP_FILE) and return the raw syscall result (errno is cleared
+ * beforehand). When __NR_kcmp is not known at build time, return -1 with
+ * errno set to ENOSYS instead.
+ */
+static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2)
+{
+#ifdef __NR_kcmp
+ errno = 0;
+ return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+/*
+ * Have TH_LOG report actual location filecmp() is used.
+ *
+ * Evaluates to the result of __filecmp(), except that a negative result
+ * with errno == ENOSYS is logged and turned into 0, so a missing kcmp()
+ * syscall does not fail the caller's comparison.
+ */
+#define filecmp(pid1, pid2, fd1, fd2) ({ \
+ int _ret; \
+ \
+ _ret = __filecmp(pid1, pid2, fd1, fd2); \
+ if (_ret != 0) { \
+ if (_ret < 0 && errno == ENOSYS) { \
+ TH_LOG("kcmp() syscall missing (test is less accurate)");\
+ _ret = 0; \
+ } \
+ } \
+ _ret; })
+
+/*
+ * Check that kcmp() is usable: comparing fd 1 of this process with itself
+ * must report equality (0). When the syscall is missing (ENOSYS) the test
+ * is skipped instead.
+ */
+TEST(kcmp)
+{
+	int ret;
+
+	ret = __filecmp(getpid(), getpid(), 1, 1);
+	/*
+	 * Look at errno before any EXPECT_*() runs: a failing expectation logs
+	 * through stdio, which may overwrite errno, and would also report a
+	 * failure for a test that is about to be skipped.
+	 */
+	if (ret != 0 && errno == ENOSYS)
+		SKIP(return, "Kernel does not support kcmp() (missing CONFIG_KCMP?)");
+	EXPECT_EQ(ret, 0);
+}
+
+/*
+ * Enter SECCOMP_MODE_STRICT through prctl() and then leave with a raw exit
+ * syscall.
+ */
+TEST(mode_strict_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP");
+ }
+ /* Terminate with the exit syscall directly instead of returning. */
+ syscall(__NR_exit, 0);
+}
+
+/*
+ * After entering strict mode, issue a prctl() syscall. The test is declared
+ * with TEST_SIGNAL(..., SIGKILL), i.e. it is expected to end by SIGKILL; the
+ * trailing EXPECT_FALSE(true) flags the case where the syscall returned.
+ */
+TEST_SIGNAL(mode_strict_cannot_call_prctl, SIGKILL)
+{
+ long ret;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, NULL, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP");
+ }
+ syscall(__NR_prctl, PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ NULL, NULL, NULL);
+ EXPECT_FALSE(true) {
+ TH_LOG("Unreachable!");
+ }
+}
+
+/*
+ * Note! This doesn't test no new privs behavior.
+ * It only checks that prctl(PR_SET_NO_NEW_PRIVS, 1, ...) returns 0.
+ */
+TEST(no_new_privs_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+}
+
+/* Tests kernel support by checking for a copy_from_user() fault on NULL. */
+TEST(mode_filter_support)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+ /* A NULL filter program must be rejected with EFAULT. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, NULL, NULL);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Kernel does not support CONFIG_SECCOMP_FILTER!");
+ }
+}
+
+/*
+ * Install an allow-all filter without setting no_new_privs first. The
+ * install is expected to succeed when the effective uid is 0 and to fail
+ * with EACCES otherwise.
+ */
+TEST(mode_filter_without_nnp)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_GET_NO_NEW_PRIVS, 0, NULL, 0, 0);
+ ASSERT_LE(0, ret) {
+ TH_LOG("Expected 0 or unsupported for NO_NEW_PRIVS");
+ }
+ errno = 0;
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ /* Succeeds with CAP_SYS_ADMIN, fails without */
+ /* TODO(wad) check caps not euid */
+ if (geteuid()) {
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EACCES, errno);
+ } else {
+ EXPECT_EQ(0, ret);
+ }
+}
+
+#define MAX_INSNS_PER_PATH 32768
+
+/*
+ * A single filter of BPF_MAXINSNS + 1 instructions must be rejected, while
+ * one of exactly BPF_MAXINSNS instructions must be accepted.
+ */
+TEST(filter_size_limits)
+{
+ int i;
+ int count = BPF_MAXINSNS + 1;
+ struct sock_filter allow[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter *filter;
+ struct sock_fprog prog = { };
+ long ret;
+
+ /* NOTE(review): this buffer is never freed by the test. */
+ filter = calloc(count, sizeof(*filter));
+ ASSERT_NE(NULL, filter);
+
+ /* Fill the whole program with "return ALLOW" instructions. */
+ for (i = 0; i < count; i++)
+ filter[i] = allow[0];
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ prog.filter = filter;
+ prog.len = count;
+
+ /* Too many filter instructions in a single filter. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_NE(0, ret) {
+ TH_LOG("Installing %d insn filter was allowed", prog.len);
+ }
+
+ /* One less is okay, though. */
+ prog.len -= 1;
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Installing %d insn filter wasn't allowed", prog.len);
+ }
+}
+
+/*
+ * Stack BPF_MAXINSNS-instruction filters on top of a one-instruction filter
+ * until an install fails, trying at most MAX_INSNS_PER_PATH times. The test
+ * passes only if some install was refused.
+ */
+TEST(filter_chain_limits)
+{
+ int i;
+ int count = BPF_MAXINSNS;
+ struct sock_filter allow[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter *filter;
+ struct sock_fprog prog = { };
+ long ret;
+
+ /* NOTE(review): this buffer is never freed by the test. */
+ filter = calloc(count, sizeof(*filter));
+ ASSERT_NE(NULL, filter);
+
+ for (i = 0; i < count; i++)
+ filter[i] = allow[0];
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* First install only the leading instruction as its own filter. */
+ prog.filter = filter;
+ prog.len = 1;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ prog.len = count;
+
+ /* Too many total filter instructions. */
+ for (i = 0; i < MAX_INSNS_PER_PATH; i++) {
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ if (ret != 0)
+ break;
+ }
+ ASSERT_NE(0, ret) {
+ TH_LOG("Allowed %d %d-insn filters (total with penalties:%d)",
+ i, count, i * (count + 4));
+ }
+}
+
+/*
+ * Once a filter is installed, switching to SECCOMP_MODE_STRICT must fail
+ * with EINVAL.
+ */
+TEST(mode_filter_cannot_move_to_strict)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, NULL, 0, 0);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+
+/*
+ * PR_GET_SECCOMP must report 0 before a filter is installed and 2 (the
+ * value of SECCOMP_MODE_FILTER) afterwards.
+ */
+TEST(mode_filter_get_seccomp)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+ EXPECT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+ EXPECT_EQ(2, ret);
+}
+
+
+/* Installing a one-instruction "allow everything" filter must succeed. */
+TEST(ALLOW_all)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+}
+
+/* A filter program with zero instructions must be rejected with EINVAL. */
+TEST(empty_prog)
+{
+ struct sock_filter filter[] = {
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+/*
+ * With a filter that returns SECCOMP_RET_LOG for every syscall, getppid()
+ * must still return the real parent pid.
+ */
+TEST(log_all)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ /* getppid() should succeed and be logged (no check for logging) */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+}
+
+/*
+ * A filter returning the unassigned action value 0x10000000 must behave
+ * like a kill: the test is declared to end with SIGSYS on its next syscall.
+ */
+TEST_SIGNAL(unknown_ret_is_kill_inside, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, 0x10000000U),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+ EXPECT_EQ(0, syscall(__NR_getpid)) {
+ TH_LOG("getpid() shouldn't ever return");
+ }
+}
+
+/*
+ * return code >= 0x80000000 is unused.
+ * A filter returning 0x90000000 must therefore behave like a kill: the test
+ * is declared to end with SIGSYS on its next syscall.
+ */
+TEST_SIGNAL(unknown_ret_is_kill_above_allow, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, 0x90000000U),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+ EXPECT_EQ(0, syscall(__NR_getpid)) {
+ TH_LOG("getpid() shouldn't ever return");
+ }
+}
+
+/*
+ * Install a filter that returns SECCOMP_RET_KILL for every syscall; the
+ * test is declared to end with SIGSYS.
+ */
+TEST_SIGNAL(KILL_all, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+}
+
+/*
+ * Filter that kills only on getpid and allows everything else: getppid()
+ * must still work, and the following getpid() is expected to end the test
+ * with SIGSYS.
+ */
+TEST_SIGNAL(KILL_one, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ /* nr == getpid: fall through to KILL; otherwise skip to ALLOW. */
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * Filter on the first syscall argument: times() is killed only when its
+ * argument (low 32 bits) equals the address of fatal_address. A times()
+ * call with another buffer must succeed; the call with &fatal_address is
+ * expected to end the test with SIGSYS.
+ */
+TEST_SIGNAL(KILL_one_arg_one, SIGSYS)
+{
+ void *fatal_address;
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ /* nr == times: skip the ALLOW and inspect the argument. */
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_times, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ /* Only bother with lower 32-bit for now. */
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(0)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K,
+ (unsigned long)&fatal_address, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+ struct tms timebuf;
+ clock_t clock = times(&timebuf);
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_LE(clock, syscall(__NR_times, &timebuf));
+ /* times() should never return. */
+ EXPECT_EQ(0, syscall(__NR_times, &fatal_address));
+}
+
+/*
+ * Filter on the sixth syscall argument: mmap (or mmap2 where it exists) is
+ * killed only when its last argument (low 32 bits) equals 0x0C0FFEE. A
+ * mapping with another offset must succeed; the call with the magic offset
+ * is expected to end the test with SIGSYS.
+ */
+TEST_SIGNAL(KILL_one_arg_six, SIGSYS)
+{
+#ifndef __NR_mmap2
+ int sysno = __NR_mmap;
+#else
+ int sysno = __NR_mmap2;
+#endif
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ /* nr == sysno: skip the ALLOW and inspect the argument. */
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, sysno, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ /* Only bother with lower 32-bit for now. */
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(5)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, 0x0C0FFEE, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+ int fd;
+ void *map1, *map2;
+ int page_size = sysconf(_SC_PAGESIZE);
+
+ ASSERT_LT(0, page_size);
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ fd = open("/dev/zero", O_RDONLY);
+ ASSERT_NE(-1, fd);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ map1 = (void *)syscall(sysno,
+ NULL, page_size, PROT_READ, MAP_PRIVATE, fd, page_size);
+ EXPECT_NE(MAP_FAILED, map1);
+ /* mmap2() should never return. */
+ map2 = (void *)syscall(sysno,
+ NULL, page_size, PROT_READ, MAP_PRIVATE, fd, 0x0C0FFEE);
+ EXPECT_EQ(MAP_FAILED, map2);
+
+ /* The test failed, so clean up the resources. */
+ munmap(map1, page_size);
+ munmap(map2, page_size);
+ close(fd);
+}
+
+/*
+ * Thread body used to provoke a seccomp filter violation.
+ *
+ * A NULL @data returns SIBLING_EXIT_UNKILLED straight away. Any other value
+ * makes the thread call prctl() first; SIBLING_EXIT_FAILURE is only
+ * returned if the thread survived that call.
+ */
+void *kill_thread(void *data)
+{
+	if (!(bool)data)
+		return (void *)SIBLING_EXIT_UNKILLED;
+
+	prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
+	return (void *)SIBLING_EXIT_FAILURE;
+}
+
+/* Selects which filter action kill_thread_or_group() installs for prctl. */
+enum kill_t {
+ KILL_THREAD, /* SECCOMP_RET_KILL_THREAD */
+ KILL_PROCESS, /* SECCOMP_RET_KILL_PROCESS */
+ RET_UNKNOWN /* an unassigned action value (0xAAAAAAAA) */
+};
+
+/*
+ * Prepare a thread that will kill itself or both of us.
+ *
+ * Installs a filter that punishes prctl() with the action chosen by
+ * @kill_how, then runs kill_thread() twice: once without triggering the
+ * filter and once triggering it. If the caller is still alive afterwards it
+ * exits with status 42.
+ */
+void kill_thread_or_group(struct __test_metadata *_metadata,
+ enum kill_t kill_how)
+{
+ pthread_t thread;
+ void *status;
+ /* Kill only when calling __NR_prctl. */
+ struct sock_filter filter_thread[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL_THREAD),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog_thread = {
+ .len = (unsigned short)ARRAY_SIZE(filter_thread),
+ .filter = filter_thread,
+ };
+ /* Anything other than KILL_PROCESS uses an unassigned action value. */
+ int kill = kill_how == KILL_PROCESS ? SECCOMP_RET_KILL_PROCESS : 0xAAAAAAAA;
+ struct sock_filter filter_process[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, kill),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog_process = {
+ .len = (unsigned short)ARRAY_SIZE(filter_process),
+ .filter = filter_process,
+ };
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0,
+ kill_how == KILL_THREAD ? &prog_thread
+ : &prog_process));
+
+ /*
+ * Add the KILL_THREAD rule again to make sure that the KILL_PROCESS
+ * flag cannot be downgraded by a new filter.
+ */
+ if (kill_how == KILL_PROCESS)
+ ASSERT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog_thread));
+
+ /* Start a thread that will exit immediately. */
+ ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)false));
+ ASSERT_EQ(0, pthread_join(thread, &status));
+ ASSERT_EQ(SIBLING_EXIT_UNKILLED, (unsigned long)status);
+
+ /* Start a thread that will die immediately. */
+ ASSERT_EQ(0, pthread_create(&thread, NULL, kill_thread, (void *)true));
+ ASSERT_EQ(0, pthread_join(thread, &status));
+ ASSERT_NE(SIBLING_EXIT_FAILURE, (unsigned long)status);
+
+ /*
+ * If we get here, only the spawned thread died. Let the parent know
+ * the whole process didn't die (i.e. this thread, the spawner,
+ * stayed running).
+ */
+ exit(42);
+}
+
+/*
+ * Run kill_thread_or_group(KILL_THREAD) in a forked child and require that
+ * the child exits normally with status 42, i.e. only the offending thread
+ * was killed.
+ */
+TEST(KILL_thread)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, KILL_THREAD);
+ /* Fallback status in case the helper ever returns. */
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If only the thread was killed, we'll see exit 42. */
+ ASSERT_TRUE(WIFEXITED(status));
+ ASSERT_EQ(42, WEXITSTATUS(status));
+}
+
+/*
+ * Run kill_thread_or_group(KILL_PROCESS) in a forked child and require that
+ * the whole child was terminated by SIGSYS.
+ */
+TEST(KILL_process)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, KILL_PROCESS);
+ /* Fallback status in case the helper ever returns. */
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If the entire process was killed, we'll see SIGSYS. */
+ ASSERT_TRUE(WIFSIGNALED(status));
+ ASSERT_EQ(SIGSYS, WTERMSIG(status));
+}
+
+/*
+ * Run kill_thread_or_group(RET_UNKNOWN) in a forked child: an unassigned
+ * filter action must take down the whole child with SIGSYS.
+ */
+TEST(KILL_unknown)
+{
+ int status;
+ pid_t child_pid;
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ kill_thread_or_group(_metadata, RET_UNKNOWN);
+ /* Fallback status in case the helper ever returns. */
+ _exit(38);
+ }
+
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+
+ /* If the entire process was killed, we'll see SIGSYS. */
+ EXPECT_TRUE(WIFSIGNALED(status)) {
+ TH_LOG("Unknown SECCOMP_RET is only killing the thread?");
+ }
+ ASSERT_EQ(SIGSYS, WTERMSIG(status));
+}
+
+/* TODO(wad) add 64-bit versus 32-bit arg tests. */
+/*
+ * A filter that loads from syscall_arg(6), one slot past the six-entry args
+ * array of struct seccomp_data, must be rejected with EINVAL.
+ */
+TEST(arg_out_of_range)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+}
+
+/*
+ * Declare, in the current scope, a filter `_read_filter_<name>` and a
+ * matching program `prog_<name>` that return SECCOMP_RET_ERRNO with the
+ * given errno value for the read syscall and allow every other syscall.
+ */
+#define ERRNO_FILTER(name, errno) \
+ struct sock_filter _read_filter_##name[] = { \
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS, \
+ offsetof(struct seccomp_data, nr)), \
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1), \
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | errno), \
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), \
+ }; \
+ struct sock_fprog prog_##name = { \
+ .len = (unsigned short)ARRAY_SIZE(_read_filter_##name), \
+ .filter = _read_filter_##name, \
+ }
+
+/* Make sure basic errno values are correctly passed through a filter. */
+TEST(ERRNO_valid)
+{
+ ERRNO_FILTER(valid, E2BIG);
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_valid);
+ ASSERT_EQ(0, ret);
+
+ /* Unfiltered syscalls still work; read() fails with the filter's errno. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(-1, NULL, 0));
+ EXPECT_EQ(E2BIG, errno);
+}
+
+/* Make sure an errno of zero is correctly handled by the arch code. */
+TEST(ERRNO_zero)
+{
+ ERRNO_FILTER(zero, 0);
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_zero);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* "errno" of 0 is ok. */
+ EXPECT_EQ(0, read(-1, NULL, 0));
+}
+
+/*
+ * The SECCOMP_RET_DATA mask is 16 bits wide, but errno is smaller.
+ * This tests that the errno value gets capped correctly, fixed by
+ * 580c57f10768 ("seccomp: cap SECCOMP_RET_ERRNO data to MAX_ERRNO").
+ */
+TEST(ERRNO_capped)
+{
+ ERRNO_FILTER(capped, 4096);
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_capped);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(-1, NULL, 0));
+ /* The filter asked for 4096; userspace must see 4095. */
+ EXPECT_EQ(4095, errno);
+}
+
+/*
+ * Filters are processed in reverse order: last applied is executed first.
+ * Since only the SECCOMP_RET_ACTION mask is tested for return values, the
+ * SECCOMP_RET_DATA mask results will follow the most recently applied
+ * matching filter return (and not the lowest or highest value).
+ */
+TEST(ERRNO_order)
+{
+ ERRNO_FILTER(first, 11);
+ ERRNO_FILTER(second, 13);
+ ERRNO_FILTER(third, 12);
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_first);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_second);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog_third);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(-1, read(-1, NULL, 0));
+ /* 12 comes from the filter installed last, not the min (11) or max (13). */
+ EXPECT_EQ(12, errno);
+}
+
+/* Fixture state for the TRAP tests: the filter program built during setup. */
+FIXTURE(TRAP) {
+ struct sock_fprog prog;
+};
+
+/*
+ * Build a heap copy of a filter that returns SECCOMP_RET_TRAP for getpid
+ * and allows every other syscall, and store it in self->prog.
+ */
+FIXTURE_SETUP(TRAP)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ memset(&self->prog, 0, sizeof(self->prog));
+ /* The local array goes out of scope, so keep a copy on the heap. */
+ self->prog.filter = malloc(sizeof(filter));
+ ASSERT_NE(NULL, self->prog.filter);
+ memcpy(self->prog.filter, filter, sizeof(filter));
+ self->prog.len = (unsigned short)ARRAY_SIZE(filter);
+}
+
+/* Release the filter copy allocated by the TRAP fixture setup. */
+FIXTURE_TEARDOWN(TRAP)
+{
+ if (self->prog.filter)
+ free(self->prog.filter);
+}
+
+/*
+ * With the default SIGSYS disposition, the trapped getpid() is expected to
+ * end the test with SIGSYS.
+ */
+TEST_F_SIGNAL(TRAP, dfl, SIGSYS)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ syscall(__NR_getpid);
+}
+
+/* Ensure that SIGSYS overrides SIG_IGN */
+TEST_F_SIGNAL(TRAP, ign, SIGSYS)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* Ask for SIGSYS to be ignored before the filter is installed. */
+ signal(SIGSYS, SIG_IGN);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ syscall(__NR_getpid);
+}
+
+static siginfo_t TRAP_info;
+static volatile int TRAP_nr;
+/*
+ * Signal handler for the TRAP tests: keeps a copy of the delivered siginfo
+ * in TRAP_info and the signal number in TRAP_nr for later inspection.
+ */
+static void TRAP_action(int nr, siginfo_t *info, void *void_context)
+{
+	TRAP_info = *info;
+	TRAP_nr = nr;
+}
+
+/*
+ * Install TRAP_action as the SIGSYS handler, trigger the trap with getpid()
+ * and check what the handler recorded: the signal number, the syscall
+ * number, a non-zero arch and a non-zero calling address.
+ */
+TEST_F(TRAP, handler)
+{
+ int ret, test;
+ struct sigaction act;
+ sigset_t mask;
+
+ memset(&act, 0, sizeof(act));
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGSYS);
+
+ act.sa_sigaction = &TRAP_action;
+ act.sa_flags = SA_SIGINFO;
+ ret = sigaction(SIGSYS, &act, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("sigaction failed");
+ }
+ /* Make sure SIGSYS is not blocked, so the handler can run. */
+ ret = sigprocmask(SIG_UNBLOCK, &mask, NULL);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("sigprocmask failed");
+ }
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog);
+ ASSERT_EQ(0, ret);
+ TRAP_nr = 0;
+ memset(&TRAP_info, 0, sizeof(TRAP_info));
+ /* Expect the registers to be rolled back. (nr = error) may vary
+ * based on arch. */
+ ret = syscall(__NR_getpid);
+ /* Silence gcc warning about volatile. */
+ test = TRAP_nr;
+ EXPECT_EQ(SIGSYS, test);
+ /*
+ * Local view of the SIGSYS-specific siginfo fields, overlaid on
+ * si_call_addr when the libc exposes si_syscall and on si_pid otherwise.
+ */
+ struct local_sigsys {
+ void *_call_addr; /* calling user insn */
+ int _syscall; /* triggering system call number */
+ unsigned int _arch; /* AUDIT_ARCH_* of syscall */
+ } *sigsys = (struct local_sigsys *)
+#ifdef si_syscall
+ &(TRAP_info.si_call_addr);
+#else
+ &TRAP_info.si_pid;
+#endif
+ EXPECT_EQ(__NR_getpid, sigsys->_syscall);
+ /* Make sure arch is non-zero. */
+ EXPECT_NE(0, sigsys->_arch);
+ EXPECT_NE(0, (unsigned long)sigsys->_call_addr);
+}
+
+/*
+ * Fixture state for the precedence tests: one filter program per seccomp
+ * action, filled in by the fixture setup.
+ */
+FIXTURE(precedence) {
+ struct sock_fprog allow;
+ struct sock_fprog log;
+ struct sock_fprog trace;
+ struct sock_fprog error;
+ struct sock_fprog trap;
+ struct sock_fprog kill;
+};
+
+/*
+ * Build the six filter programs used by the precedence tests.
+ *
+ * "allow" unconditionally returns SECCOMP_RET_ALLOW. Every other program
+ * loads the syscall number and returns its own action (LOG, TRACE, ERRNO,
+ * TRAP or KILL) for getpid, and SECCOMP_RET_ALLOW for everything else.
+ */
+FIXTURE_SETUP(precedence)
+{
+ struct sock_filter allow_insns[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter log_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_LOG),
+ };
+ struct sock_filter trace_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE),
+ };
+ struct sock_filter error_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO),
+ };
+ struct sock_filter trap_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRAP),
+ };
+ struct sock_filter kill_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+
+ memset(self, 0, sizeof(*self));
+ /*
+ * Copy the on-stack instruction array <name>_insns into heap memory owned
+ * by self-><name> and record its instruction count. The setup asserts
+ * that the allocation succeeded.
+ */
+#define FILTER_ALLOC(_x) \
+ self->_x.filter = malloc(sizeof(_x##_insns)); \
+ ASSERT_NE(NULL, self->_x.filter); \
+ memcpy(self->_x.filter, &_x##_insns, sizeof(_x##_insns)); \
+ self->_x.len = (unsigned short)ARRAY_SIZE(_x##_insns)
+ FILTER_ALLOC(allow);
+ FILTER_ALLOC(log);
+ FILTER_ALLOC(trace);
+ FILTER_ALLOC(error);
+ FILTER_ALLOC(trap);
+ FILTER_ALLOC(kill);
+}
+
+/* Release every filter program owned by the precedence fixture. */
+FIXTURE_TEARDOWN(precedence)
+{
+	/*
+	 * free(NULL) is a no-op, so no guard is needed. Keeping the macro a
+	 * plain expression also avoids the dangling-else hazard of a bare
+	 * "if" hidden inside a macro.
+	 */
+#define FILTER_FREE(_x) free(self->_x.filter)
+	FILTER_FREE(allow);
+	FILTER_FREE(log);
+	FILTER_FREE(trace);
+	FILTER_FREE(error);
+	FILTER_FREE(trap);
+	FILTER_FREE(kill);
+}
+
+/*
+ * Stack all six precedence filters and check that getppid(), which none of
+ * them singles out, still returns the real parent pid.
+ */
+TEST_F(precedence, allow_ok)
+{
+ pid_t parent, res = 0;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ res = syscall(__NR_getppid);
+ EXPECT_EQ(parent, res);
+}
+
+/*
+ * With all six filters stacked (kill installed last), getppid() still works
+ * but getpid() must be resolved to the kill action: the test is declared to
+ * end with SIGSYS and the check after getpid() should never be reached.
+ */
+TEST_F_SIGNAL(precedence, kill_is_highest, SIGSYS)
+{
+ pid_t parent, res = 0;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ res = syscall(__NR_getppid);
+ EXPECT_EQ(parent, res);
+ /* getpid() should never return. */
+ res = syscall(__NR_getpid);
+ EXPECT_EQ(0, res);
+}
+
+/*
+ * Same expectation as kill_is_highest, but with the kill filter installed
+ * second instead of last: installation order must not change the outcome.
+ */
+TEST_F_SIGNAL(precedence, kill_is_highest_in_any_order, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->kill);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * Without the kill filter, trap is the strongest action installed: getppid()
+ * still works, while getpid() is expected to end the test with SIGSYS.
+ */
+TEST_F_SIGNAL(precedence, trap_is_second, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * Same expectation as trap_is_second, but with the trap filter installed
+ * second instead of last: installation order must not change the outcome.
+ */
+TEST_F_SIGNAL(precedence, trap_is_second_in_any_order, SIGSYS)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trap);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * With allow, log, trace and error installed, the errno action must win for
+ * getpid(). The error filter returns SECCOMP_RET_ERRNO with no errno value
+ * OR-ed in, and the test expects getpid() to yield 0 instead of the real pid.
+ */
+TEST_F(precedence, errno_is_third)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * Same expectation as errno_is_third with the four filters installed in a
+ * different order (log, error, trace, allow).
+ */
+TEST_F(precedence, errno_is_third_in_any_order)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->error);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * With allow, log and trace installed, the trace action must win for
+ * getpid(). No tracer is attached in this test, and getpid() is expected to
+ * fail with -1.
+ */
+TEST_F(precedence, trace_is_fourth)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* No ptracer */
+ EXPECT_EQ(-1, syscall(__NR_getpid));
+}
+
+/*
+ * Same expectation as trace_is_fourth with the trace filter installed first
+ * instead of last.
+ */
+TEST_F(precedence, trace_is_fourth_in_any_order)
+{
+ pid_t parent;
+ long ret;
+
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->trace);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* No ptracer */
+ EXPECT_EQ(-1, syscall(__NR_getpid));
+}
+
+/*
+ * With only allow and log installed, getpid() hits the log action and must
+ * still execute normally: both getppid() and getpid() return the values
+ * sampled before the filters were installed.
+ */
+TEST_F(precedence, log_is_fifth)
+{
+ pid_t mypid, parent;
+ long ret;
+
+ mypid = getpid();
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* Should also work just fine */
+ EXPECT_EQ(mypid, syscall(__NR_getpid));
+}
+
+/*
+ * Same expectation as log_is_fifth with the log filter installed before the
+ * allow filter.
+ */
+TEST_F(precedence, log_is_fifth_in_any_order)
+{
+ pid_t mypid, parent;
+ long ret;
+
+ mypid = getpid();
+ parent = getppid();
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->log);
+ ASSERT_EQ(0, ret);
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->allow);
+ ASSERT_EQ(0, ret);
+ /* Should work just fine. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* Should also work just fine */
+ EXPECT_EQ(mypid, syscall(__NR_getpid));
+}
+
+/* Fallback for headers that do not define the seccomp ptrace option. */
+#ifndef PTRACE_O_TRACESECCOMP
+#define PTRACE_O_TRACESECCOMP 0x00000080
+#endif
+
+/*
+ * Catch the Ubuntu 12.04 value error: drop any header-provided
+ * PTRACE_EVENT_SECCOMP that is not 7 so the definition below takes over.
+ */
+#if PTRACE_EVENT_SECCOMP != 7
+#undef PTRACE_EVENT_SECCOMP
+#endif
+
+#ifndef PTRACE_EVENT_SECCOMP
+#define PTRACE_EVENT_SECCOMP 7
+#endif
+
+/* True when a wait() status carries PTRACE_EVENT_SECCOMP in bits 16 and up. */
+#define IS_SECCOMP_EVENT(status) (((status) >> 16) == PTRACE_EVENT_SECCOMP)
+/*
+ * Run flag of the tracer loop in start_tracer(). It is cleared from a signal
+ * handler, so it is volatile to force a fresh load on every check.
+ */
+volatile bool tracer_running;
+/* Signal handler (installed for SIGUSR1): ask the tracer loop to finish. */
+void tracer_stop(int sig)
+{
+	tracer_running = false;
+}
+
+/*
+ * Callback run by start_tracer() for every stop of the tracee that is not an
+ * exit: receives the harness metadata, the tracee pid, the wait() status and
+ * the opaque args pointer given to start_tracer().
+ */
+typedef void tracer_func_t(struct __test_metadata *_metadata,
+ pid_t tracee, int status, void *args);
+
+/*
+ * Body of the tracer process: attach to @tracee and service its stops.
+ *
+ * @_metadata:      harness state; ->passed decides the final exit code.
+ * @fd:             write end of the sync pipe; one byte is written and the
+ *                  descriptor closed once tracing is set up.
+ * @tracee:         pid to attach to.
+ * @tracer_func:    callback invoked for every non-exit stop of the tracee.
+ * @args:           opaque pointer handed to @tracer_func.
+ * @ptrace_syscall: true selects PTRACE_O_TRACESYSGOOD with PTRACE_SYSCALL,
+ *                  false selects PTRACE_O_TRACESECCOMP with PTRACE_CONT.
+ *
+ * Returns to the caller when the tracee has exited or was killed. If the loop
+ * ends because tracer_running was cleared, the process exits directly.
+ */
+void start_tracer(struct __test_metadata *_metadata, int fd, pid_t tracee,
+ tracer_func_t tracer_func, void *args, bool ptrace_syscall)
+{
+ int ret = -1;
+ struct sigaction action = {
+ .sa_handler = tracer_stop,
+ };
+
+ /* Allow external shutdown. */
+ tracer_running = true;
+ ASSERT_EQ(0, sigaction(SIGUSR1, &action, NULL));
+
+ /*
+ * Keep retrying the attach for as long as it fails with anything other
+ * than EINVAL.
+ * NOTE(review): presumably this covers the window before the tracee has
+ * called prctl(PR_SET_PTRACER) - confirm.
+ */
+ errno = 0;
+ while (ret == -1 && errno != EINVAL)
+ ret = ptrace(PTRACE_ATTACH, tracee, NULL, 0);
+ ASSERT_EQ(0, ret) {
+ kill(tracee, SIGKILL);
+ }
+ /* Wait for attach stop */
+ wait(NULL);
+
+ ret = ptrace(PTRACE_SETOPTIONS, tracee, NULL, ptrace_syscall ?
+ PTRACE_O_TRACESYSGOOD :
+ PTRACE_O_TRACESECCOMP);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Failed to set PTRACE_O_TRACESECCOMP");
+ kill(tracee, SIGKILL);
+ }
+ ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
+ tracee, NULL, 0);
+ ASSERT_EQ(0, ret);
+
+ /* Unblock the tracee */
+ ASSERT_EQ(1, write(fd, "A", 1));
+ ASSERT_EQ(0, close(fd));
+
+ /* Run until we're shut down. Must assert to stop execution. */
+ while (tracer_running) {
+ int status;
+
+ /* Ignore wait() results that are not for the tracee (or failed). */
+ if (wait(&status) != tracee)
+ continue;
+ if (WIFSIGNALED(status) || WIFEXITED(status))
+ /* Child is dead. Time to go. */
+ return;
+
+ /* Check if this is a seccomp event. */
+ ASSERT_EQ(!ptrace_syscall, IS_SECCOMP_EVENT(status));
+
+ tracer_func(_metadata, tracee, status, args);
+
+ /* Resume the tracee in the same mode it was started in. */
+ ret = ptrace(ptrace_syscall ? PTRACE_SYSCALL : PTRACE_CONT,
+ tracee, NULL, 0);
+ ASSERT_EQ(0, ret);
+ }
+ /* Directly report the status of our test harness results. */
+ syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/* Common tracer setup/teardown functions. */
+
+/* Empty signal handler; setup_trace_fixture() installs it for SIGALRM. */
+void cont_handler(int num)
+{ }
+/*
+ * Fork a tracer for the calling process and wait until it is attached.
+ *
+ * @_metadata:      harness state, shared with the forked tracer.
+ * @func:           per-stop callback passed on to start_tracer().
+ * @args:           opaque pointer passed on to start_tracer().
+ * @ptrace_syscall: tracing mode passed on to start_tracer().
+ *
+ * The child runs start_tracer() against the caller's pid. The parent names the
+ * child as its permitted ptracer and blocks on the pipe until the tracer
+ * writes its sync byte (or the write end is closed).
+ *
+ * Returns the pid of the tracer process.
+ */
+pid_t setup_trace_fixture(struct __test_metadata *_metadata,
+ tracer_func_t func, void *args, bool ptrace_syscall)
+{
+ char sync;
+ int pipefd[2];
+ pid_t tracer_pid;
+ pid_t tracee = getpid();
+
+ /* Setup a pipe for clean synchronization. */
+ ASSERT_EQ(0, pipe(pipefd));
+
+ /* Fork a child which we'll promote to tracer */
+ tracer_pid = fork();
+ ASSERT_LE(0, tracer_pid);
+ /* Runs in both parent and child: it sits before the pid check. */
+ signal(SIGALRM, cont_handler);
+ if (tracer_pid == 0) {
+ close(pipefd[0]);
+ start_tracer(_metadata, pipefd[1], tracee, func, args,
+ ptrace_syscall);
+ syscall(__NR_exit, 0);
+ }
+ close(pipefd[1]);
+ /* The results of prctl() and read() below are not checked. */
+ prctl(PR_SET_PTRACER, tracer_pid, 0, 0, 0);
+ read(pipefd[0], &sync, 1);
+ close(pipefd[0]);
+
+ return tracer_pid;
+}
+
+/*
+ * Stop the tracer and fold its result into the current test.
+ *
+ * @_metadata: harness state of the current test; ->passed is cleared when the
+ *             tracer did not finish successfully.
+ * @tracer:    pid returned by setup_trace_fixture(); 0 means nothing to do.
+ */
+void teardown_trace_fixture(struct __test_metadata *_metadata,
+			    pid_t tracer)
+{
+	if (tracer) {
+		int status;
+		/*
+		 * Extract the exit code from the other process and
+		 * adopt it for ourselves in case its asserts failed.
+		 */
+		ASSERT_EQ(0, kill(tracer, SIGUSR1));
+		ASSERT_EQ(tracer, waitpid(tracer, &status, 0));
+		/*
+		 * WEXITSTATUS() is only meaningful after a normal exit. A
+		 * tracer that was terminated by a signal (for example through
+		 * abort()) must count as a failure as well.
+		 */
+		if (!WIFEXITED(status) || WEXITSTATUS(status))
+			_metadata->passed = 0;
+	}
+}
+
+/* "poke" tracer arguments and function. */
+struct tracer_args_poke_t {
+ /* Address in the tracee that tracer_poke() writes to. */
+ unsigned long poke_addr;
+};
+
+/*
+ * Tracer callback for the TRACE_poke fixture.
+ *
+ * Checks that the event message of the stop is 0x1001 (killing the tracee if
+ * it is not) and then writes 0x1001 into the tracee at args->poke_addr.
+ * @args points to a struct tracer_args_poke_t; @status is unused.
+ */
+void tracer_poke(struct __test_metadata *_metadata, pid_t tracee, int status,
+ void *args)
+{
+ int ret;
+ unsigned long msg;
+ struct tracer_args_poke_t *info = (struct tracer_args_poke_t *)args;
+
+ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+ EXPECT_EQ(0, ret);
+ /* If this fails, don't try to recover. */
+ ASSERT_EQ(0x1001, msg) {
+ kill(tracee, SIGKILL);
+ }
+ /*
+ * Poke in the message.
+ * Registers are not touched to try to keep this relatively arch
+ * agnostic.
+ */
+ ret = ptrace(PTRACE_POKEDATA, tracee, info->poke_addr, 0x1001);
+ EXPECT_EQ(0, ret);
+}
+
+/* Fixture data for the SECCOMP_RET_TRACE "poke" tests. */
+FIXTURE(TRACE_poke) {
+ struct sock_fprog prog; /* filter installed by the tests */
+ pid_t tracer; /* pid of the forked tracer process */
+ long poked; /* word the tracer overwrites with 0x1001 */
+ struct tracer_args_poke_t tracer_args; /* carries the address of poked */
+};
+
+/*
+ * Prepare a filter that returns SECCOMP_RET_TRACE with message 0x1001 for
+ * read() and SECCOMP_RET_ALLOW for everything else, point the tracer
+ * arguments at self->poked and launch the tracer in seccomp-event mode.
+ * The filter itself is installed by the individual tests.
+ */
+FIXTURE_SETUP(TRACE_poke)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1001),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+
+ self->poked = 0;
+ memset(&self->prog, 0, sizeof(self->prog));
+ /* Keep a heap copy: the on-stack array is gone once setup returns. */
+ self->prog.filter = malloc(sizeof(filter));
+ ASSERT_NE(NULL, self->prog.filter);
+ memcpy(self->prog.filter, filter, sizeof(filter));
+ self->prog.len = (unsigned short)ARRAY_SIZE(filter);
+
+ /* Set up tracer args. */
+ self->tracer_args.poke_addr = (unsigned long)&self->poked;
+
+ /* Launch tracer. */
+ self->tracer = setup_trace_fixture(_metadata, tracer_poke,
+ &self->tracer_args, false);
+}
+
+/* Stop the tracer, adopt its result and release the filter program. */
+FIXTURE_TEARDOWN(TRACE_poke)
+{
+	teardown_trace_fixture(_metadata, self->tracer);
+	/* free(NULL) is a no-op, so no NULL check is needed. */
+	free(self->prog.filter);
+}
+
+/*
+ * A traced read() must reach the tracer: the call itself fails (-1, it is
+ * issued on fd -1) but afterwards self->poked holds the 0x1001 written by
+ * tracer_poke().
+ */
+TEST_F(TRACE_poke, read_has_side_effects)
+{
+ ssize_t ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(0, self->poked);
+ ret = read(-1, NULL, 0);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(0x1001, self->poked);
+}
+
+/*
+ * A syscall the filter does not trace (getpid) must run normally and must
+ * not involve the tracer: self->poked stays 0.
+ */
+TEST_F(TRACE_poke, getpid_runs_normally)
+{
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ EXPECT_EQ(0, self->poked);
+ EXPECT_NE(0, syscall(__NR_getpid));
+ EXPECT_EQ(0, self->poked);
+}
+
+/*
+ * Per-architecture register access for the ptrace-based helpers. Each branch
+ * provides:
+ *   ARCH_REGS           - the register-set type fetched from the tracee
+ *   SYSCALL_NUM(_regs)  - where the syscall number lives
+ *   SYSCALL_RET(_regs)  - where the syscall return value lives (not defined
+ *                         on every architecture)
+ * and, where a plain assignment is not enough, its own SYSCALL_NUM_SET()
+ * and/or SYSCALL_RET_SET(). Some of the *_SET() variants expand to ptrace()
+ * calls that use a variable named tracee from the calling scope.
+ * Unknown architectures fail the build.
+ */
+#if defined(__x86_64__)
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).orig_rax
+# define SYSCALL_RET(_regs) (_regs).rax
+#elif defined(__i386__)
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).orig_eax
+# define SYSCALL_RET(_regs) (_regs).eax
+#elif defined(__arm__)
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).ARM_r7
+# ifndef PTRACE_SET_SYSCALL
+# define PTRACE_SET_SYSCALL 23
+# endif
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ EXPECT_EQ(0, ptrace(PTRACE_SET_SYSCALL, tracee, NULL, _nr))
+# define SYSCALL_RET(_regs) (_regs).ARM_r0
+#elif defined(__aarch64__)
+# define ARCH_REGS struct user_pt_regs
+# define SYSCALL_NUM(_regs) (_regs).regs[8]
+# ifndef NT_ARM_SYSTEM_CALL
+# define NT_ARM_SYSTEM_CALL 0x404
+# endif
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ struct iovec __v; \
+ typeof(_nr) __nr = (_nr); \
+ __v.iov_base = &__nr; \
+ __v.iov_len = sizeof(__nr); \
+ EXPECT_EQ(0, ptrace(PTRACE_SETREGSET, tracee, \
+ NT_ARM_SYSTEM_CALL, &__v)); \
+ } while (0)
+# define SYSCALL_RET(_regs) (_regs).regs[0]
+#elif defined(__riscv) && __riscv_xlen == 64
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).a7
+# define SYSCALL_RET(_regs) (_regs).a0
+#elif defined(__csky__)
+# define ARCH_REGS struct pt_regs
+# if defined(__CSKYABIV2__)
+# define SYSCALL_NUM(_regs) (_regs).regs[3]
+# else
+# define SYSCALL_NUM(_regs) (_regs).regs[9]
+# endif
+# define SYSCALL_RET(_regs) (_regs).a0
+#elif defined(__hppa__)
+# define ARCH_REGS struct user_regs_struct
+# define SYSCALL_NUM(_regs) (_regs).gr[20]
+# define SYSCALL_RET(_regs) (_regs).gr[28]
+#elif defined(__powerpc__)
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).gpr[0]
+# define SYSCALL_RET(_regs) (_regs).gpr[3]
+# define SYSCALL_RET_SET(_regs, _val) \
+ do { \
+ typeof(_val) _result = (_val); \
+ if ((_regs.trap & 0xfff0) == 0x3000) { \
+ /* \
+ * scv 0 system call uses -ve result \
+ * for error, so no need to adjust. \
+ */ \
+ SYSCALL_RET(_regs) = _result; \
+ } else { \
+ /* \
+ * A syscall error is signaled by the \
+ * CR0 SO bit and the code is stored as \
+ * a positive value. \
+ */ \
+ if (_result < 0) { \
+ SYSCALL_RET(_regs) = -_result; \
+ (_regs).ccr |= 0x10000000; \
+ } else { \
+ SYSCALL_RET(_regs) = _result; \
+ (_regs).ccr &= ~0x10000000; \
+ } \
+ } \
+ } while (0)
+# define SYSCALL_RET_SET_ON_PTRACE_EXIT
+#elif defined(__s390__)
+# define ARCH_REGS s390_regs
+# define SYSCALL_NUM(_regs) (_regs).gprs[2]
+# define SYSCALL_RET_SET(_regs, _val) \
+ TH_LOG("Can't modify syscall return on this architecture")
+#elif defined(__mips__)
+# include <asm/unistd_nr_n32.h>
+# include <asm/unistd_nr_n64.h>
+# include <asm/unistd_nr_o32.h>
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) \
+ ({ \
+ typeof((_regs).regs[2]) _nr; \
+ if ((_regs).regs[2] == __NR_O32_Linux) \
+ _nr = (_regs).regs[4]; \
+ else \
+ _nr = (_regs).regs[2]; \
+ _nr; \
+ })
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ if ((_regs).regs[2] == __NR_O32_Linux) \
+ (_regs).regs[4] = _nr; \
+ else \
+ (_regs).regs[2] = _nr; \
+ } while (0)
+# define SYSCALL_RET_SET(_regs, _val) \
+ TH_LOG("Can't modify syscall return on this architecture")
+#elif defined(__xtensa__)
+# define ARCH_REGS struct user_pt_regs
+# define SYSCALL_NUM(_regs) (_regs).syscall
+/*
+ * On xtensa syscall return value is in the register
+ * a2 of the current window which is not fixed.
+ */
+#define SYSCALL_RET(_regs) (_regs).a[(_regs).windowbase * 4 + 2]
+#elif defined(__sh__)
+# define ARCH_REGS struct pt_regs
+# define SYSCALL_NUM(_regs) (_regs).regs[3]
+# define SYSCALL_RET(_regs) (_regs).regs[0]
+#else
+# error "Do not know how to find your architecture's registers and syscalls"
+#endif
+
+/*
+ * Most architectures can change the syscall by just updating the
+ * associated register. This is the default if not defined above.
+ */
+#ifndef SYSCALL_NUM_SET
+# define SYSCALL_NUM_SET(_regs, _nr) \
+ do { \
+ SYSCALL_NUM(_regs) = (_nr); \
+ } while (0)
+#endif
+/*
+ * Most architectures can change the syscall return value by just
+ * writing to the SYSCALL_RET register. This is the default if not
+ * defined above. If an architecture cannot set the return value
+ * (for example when the syscall number and the return value share a
+ * register), report it with TH_LOG() in an arch-specific definition
+ * of SYSCALL_RET_SET() above, and leave SYSCALL_RET undefined.
+ * An architecture that provides neither macro fails the build.
+ */
+#if !defined(SYSCALL_RET) && !defined(SYSCALL_RET_SET)
+# error "One of SYSCALL_RET or SYSCALL_RET_SET is needed for this arch"
+#endif
+#ifndef SYSCALL_RET_SET
+# define SYSCALL_RET_SET(_regs, _val) \
+ do { \
+ SYSCALL_RET(_regs) = (_val); \
+ } while (0)
+#endif
+
+/*
+ * EXPECT_SYSCALL_RETURN(val, action): evaluate @action and check its result
+ * against @val. A negative @val is read as -errno: @action must then return
+ * -1 with errno equal to -@val. Otherwise @action must return @val itself.
+ *
+ * When the syscall return can't be changed (no SYSCALL_RET for this
+ * architecture), stub out the check and only expect -1.
+ *
+ * @val is parenthesized in the comparison so that any expression can be
+ * passed safely.
+ */
+#ifndef SYSCALL_RET
+# define EXPECT_SYSCALL_RETURN(val, action)	EXPECT_EQ(-1, action)
+#else
+# define EXPECT_SYSCALL_RETURN(val, action)		\
+	do {						\
+		errno = 0;				\
+		if ((val) < 0) {			\
+			EXPECT_EQ(-1, action);		\
+			EXPECT_EQ(-(val), errno);	\
+		} else {				\
+			EXPECT_EQ(val, action);		\
+		}					\
+	} while (0)
+#endif
+
+/*
+ * Some architectures (e.g. powerpc) can only set syscall
+ * return values on syscall exit during ptrace.
+ *
+ * These flags tell tracer_ptrace() in which phase (true: syscall entry,
+ * false: syscall exit) each value is written. The syscall number is always
+ * set on entry; the return value is set on entry unless the architecture
+ * defined SYSCALL_RET_SET_ON_PTRACE_EXIT.
+ */
+const bool ptrace_entry_set_syscall_nr = true;
+const bool ptrace_entry_set_syscall_ret =
+#ifndef SYSCALL_RET_SET_ON_PTRACE_EXIT
+ true;
+#else
+ false;
+#endif
+
+/*
+ * Use PTRACE_GETREGS and PTRACE_SETREGS when available. This is useful for
+ * architectures without HAVE_ARCH_TRACEHOOK (e.g. User-mode Linux).
+ *
+ * Everything else goes through PTRACE_GETREGSET/PTRACE_SETREGSET with
+ * NT_PRSTATUS. Both macros use a variable named tracee from the calling
+ * scope and evaluate to the return value of ptrace().
+ */
+#if defined(__x86_64__) || defined(__i386__) || defined(__mips__)
+# define ARCH_GETREGS(_regs) ptrace(PTRACE_GETREGS, tracee, 0, &(_regs))
+# define ARCH_SETREGS(_regs) ptrace(PTRACE_SETREGS, tracee, 0, &(_regs))
+#else
+# define ARCH_GETREGS(_regs) ({ \
+ struct iovec __v; \
+ __v.iov_base = &(_regs); \
+ __v.iov_len = sizeof(_regs); \
+ ptrace(PTRACE_GETREGSET, tracee, NT_PRSTATUS, &__v); \
+ })
+# define ARCH_SETREGS(_regs) ({ \
+ struct iovec __v; \
+ __v.iov_base = &(_regs); \
+ __v.iov_len = sizeof(_regs); \
+ ptrace(PTRACE_SETREGSET, tracee, NT_PRSTATUS, &__v); \
+ })
+#endif
+
+/*
+ * Architecture-specific syscall fetching routine.
+ *
+ * Reads the registers of @tracee and returns the syscall number found there,
+ * or -1 (after recording an expectation failure) when the registers could
+ * not be read.
+ */
+int get_syscall(struct __test_metadata *_metadata, pid_t tracee)
+{
+ ARCH_REGS regs;
+
+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
+ return -1;
+ }
+
+ return SYSCALL_NUM(regs);
+}
+
+/*
+ * Architecture-specific syscall changing routine.
+ *
+ * @syscall: when non-NULL, the syscall number to install in @tracee.
+ * @ret:     when non-NULL, the syscall return value to install in @tracee.
+ *
+ * Either pointer may be NULL to leave that value alone; with both NULL the
+ * tracee is not touched at all. The registers are written back only if the
+ * local copy actually changed.
+ */
+void __change_syscall(struct __test_metadata *_metadata,
+ pid_t tracee, long *syscall, long *ret)
+{
+ ARCH_REGS orig, regs;
+
+ /* Do not get/set registers if we have nothing to do. */
+ if (!syscall && !ret)
+ return;
+
+ EXPECT_EQ(0, ARCH_GETREGS(regs)) {
+ return;
+ }
+ /* Keep a pristine copy to detect whether anything was modified. */
+ orig = regs;
+
+ if (syscall)
+ SYSCALL_NUM_SET(regs, *syscall);
+
+ if (ret)
+ SYSCALL_RET_SET(regs, *ret);
+
+ /* Flush any register changes made. */
+ if (memcmp(&orig, &regs, sizeof(orig)) != 0)
+ EXPECT_EQ(0, ARCH_SETREGS(regs));
+}
+
+/* Change only syscall number: @tracee will run @syscall instead. */
+void change_syscall_nr(struct __test_metadata *_metadata,
+ pid_t tracee, long syscall)
+{
+ __change_syscall(_metadata, tracee, &syscall, NULL);
+}
+
+/*
+ * Change syscall return value (and set syscall number to -1).
+ * @ret is the value to install as the return value in @tracee.
+ */
+void change_syscall_ret(struct __test_metadata *_metadata,
+ pid_t tracee, long ret)
+{
+ long syscall = -1;
+
+ __change_syscall(_metadata, tracee, &syscall, &ret);
+}
+
+/*
+ * Tracer callback for SECCOMP_RET_TRACE stops.
+ *
+ * Dispatches on the event message of the stop:
+ *   0x1002 - getpid:  rewritten to getppid
+ *   0x1003 - gettid:  skipped, return value forced to 45000
+ *   0x1004 - openat:  skipped, return value forced to -ESRCH
+ *   0x1005 - getppid: left untouched
+ * Any other non-zero message is reported and the tracee is killed.
+ * @status and @args are unused.
+ */
+void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee,
+ int status, void *args)
+{
+ int ret;
+ unsigned long msg;
+
+ /* Make sure we got the right message. */
+ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+ EXPECT_EQ(0, ret);
+
+ /* Validate and take action on expected syscalls. */
+ switch (msg) {
+ case 0x1002:
+ /* change getpid to getppid. */
+ EXPECT_EQ(__NR_getpid, get_syscall(_metadata, tracee));
+ change_syscall_nr(_metadata, tracee, __NR_getppid);
+ break;
+ case 0x1003:
+ /* skip gettid with valid return code. */
+ EXPECT_EQ(__NR_gettid, get_syscall(_metadata, tracee));
+ change_syscall_ret(_metadata, tracee, 45000);
+ break;
+ case 0x1004:
+ /* skip openat with error. */
+ EXPECT_EQ(__NR_openat, get_syscall(_metadata, tracee));
+ change_syscall_ret(_metadata, tracee, -ESRCH);
+ break;
+ case 0x1005:
+ /* do nothing (allow getppid) */
+ EXPECT_EQ(__NR_getppid, get_syscall(_metadata, tracee));
+ break;
+ default:
+ EXPECT_EQ(0, msg) {
+ TH_LOG("Unknown PTRACE_GETEVENTMSG: 0x%lx", msg);
+ kill(tracee, SIGKILL);
+ }
+ }
+
+}
+
+/* Fixture data for the syscall-rewriting tests. */
+FIXTURE(TRACE_syscall) {
+ struct sock_fprog prog;
+ /* Tracer pid plus the caller's tid, pid and parent pid, sampled in setup. */
+ pid_t tracer, mytid, mypid, parent;
+ /* Syscall number saved by tracer_ptrace() at syscall entry. */
+ long syscall_nr;
+};
+
+/*
+ * Tracer callback for plain PTRACE_SYSCALL stops.
+ *
+ * Applies the same rewrites as tracer_seccomp(), keyed on the syscall number
+ * seen at entry: getpid becomes getppid, gettid is skipped with return value
+ * 45000 and openat is skipped with -ESRCH. @args points to the TRACE_syscall
+ * fixture data; @status is unused.
+ */
+void tracer_ptrace(struct __test_metadata *_metadata, pid_t tracee,
+ int status, void *args)
+{
+ int ret;
+ unsigned long msg;
+ /* Persists across calls: toggled on every stop, starting with entry. */
+ static bool entry;
+ long syscall_nr_val, syscall_ret_val;
+ long *syscall_nr = NULL, *syscall_ret = NULL;
+ FIXTURE_DATA(TRACE_syscall) *self = args;
+
+ /*
+ * The traditional way to tell PTRACE_SYSCALL entry/exit
+ * is by counting.
+ */
+ entry = !entry;
+
+ /* Make sure we got an appropriate message. */
+ ret = ptrace(PTRACE_GETEVENTMSG, tracee, NULL, &msg);
+ EXPECT_EQ(0, ret);
+ EXPECT_EQ(entry ? PTRACE_EVENTMSG_SYSCALL_ENTRY
+ : PTRACE_EVENTMSG_SYSCALL_EXIT, msg);
+
+ /*
+ * Some architectures only support setting return values during
+ * syscall exit under ptrace, and on exit the syscall number may
+ * no longer be available. Therefore, save the initial syscall
+ * number here, so it can be examined during both entry and exit
+ * phases.
+ */
+ if (entry)
+ self->syscall_nr = get_syscall(_metadata, tracee);
+
+ /*
+ * Depending on the architecture's syscall setting abilities, we
+ * pick which things to set during this phase (entry or exit).
+ */
+ if (entry == ptrace_entry_set_syscall_nr)
+ syscall_nr = &syscall_nr_val;
+ if (entry == ptrace_entry_set_syscall_ret)
+ syscall_ret = &syscall_ret_val;
+
+ /* Now handle the actual rewriting cases. */
+ switch (self->syscall_nr) {
+ case __NR_getpid:
+ syscall_nr_val = __NR_getppid;
+ /* Never change syscall return for this case. */
+ syscall_ret = NULL;
+ break;
+ case __NR_gettid:
+ syscall_nr_val = -1;
+ syscall_ret_val = 45000;
+ break;
+ case __NR_openat:
+ syscall_nr_val = -1;
+ syscall_ret_val = -ESRCH;
+ break;
+ default:
+ /* Unhandled, do nothing. */
+ return;
+ }
+
+ /* NULL pointers mean "leave this value alone" in the current phase. */
+ __change_syscall(_metadata, tracee, syscall_nr, syscall_ret);
+}
+
+/* Variant parameters: every TRACE_syscall test runs once per variant below. */
+FIXTURE_VARIANT(TRACE_syscall) {
+ /*
+ * All of the SECCOMP_RET_TRACE behaviors can be tested with either
+ * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL.
+ * This indicates if we should use SECCOMP_RET_TRACE (false), or
+ * ptrace (true).
+ */
+ bool use_ptrace;
+};
+
+/* Rewrite syscalls from a plain ptrace()+PTRACE_SYSCALL tracer. */
+FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) {
+ .use_ptrace = true,
+};
+
+/* Rewrite syscalls from a SECCOMP_RET_TRACE tracer. */
+FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) {
+ .use_ptrace = false,
+};
+
+/*
+ * Record the caller's tid, pid and parent pid, launch the tracer matching
+ * the variant and, for the seccomp variant only, install a filter that
+ * returns SECCOMP_RET_TRACE with a distinct message for getpid (0x1002),
+ * gettid (0x1003), openat (0x1004) and getppid (0x1005).
+ */
+FIXTURE_SETUP(TRACE_syscall)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1002),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_gettid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1003),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_openat, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1004),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ /* Prepare some testable syscall results. */
+ self->mytid = syscall(__NR_gettid);
+ ASSERT_GT(self->mytid, 0);
+ ASSERT_NE(self->mytid, 1) {
+ TH_LOG("Running this test as init is not supported. :)");
+ }
+
+ self->mypid = getpid();
+ ASSERT_GT(self->mypid, 0);
+ ASSERT_EQ(self->mytid, self->mypid);
+
+ self->parent = getppid();
+ ASSERT_GT(self->parent, 0);
+ ASSERT_NE(self->parent, self->mypid);
+
+ /* Launch tracer. */
+ self->tracer = setup_trace_fixture(_metadata,
+ variant->use_ptrace ? tracer_ptrace
+ : tracer_seccomp,
+ self, variant->use_ptrace);
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* The ptrace variant needs no filter. */
+ if (variant->use_ptrace)
+ return;
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+}
+
+/* Stop the tracer started by the setup and adopt its result. */
+FIXTURE_TEARDOWN(TRACE_syscall)
+{
+ teardown_trace_fixture(_metadata, self->tracer);
+}
+
+/*
+ * Invalid (negative) syscall numbers must fail with -1 and errno == ENOSYS.
+ * The expected value is passed first to EXPECT_EQ(), matching the argument
+ * order used by the other errno checks in this file.
+ */
+TEST(negative_ENOSYS)
+{
+	/*
+	 * There should be no difference between an "internal" skip
+	 * and userspace asking for syscall "-1".
+	 */
+	errno = 0;
+	EXPECT_EQ(-1, syscall(-1));
+	EXPECT_EQ(ENOSYS, errno);
+	/* And no difference for "still not valid but not -1". */
+	errno = 0;
+	EXPECT_EQ(-1, syscall(-101));
+	EXPECT_EQ(ENOSYS, errno);
+}
+
+/* Re-run the plain negative_ENOSYS checks while the tracer is attached. */
+TEST_F(TRACE_syscall, negative_ENOSYS)
+{
+ negative_ENOSYS(_metadata);
+}
+
+/* A syscall the tracer leaves alone must behave as if it were untraced. */
+TEST_F(TRACE_syscall, syscall_allowed)
+{
+ /* getppid works as expected (no changes). */
+ EXPECT_EQ(self->parent, syscall(__NR_getppid));
+ EXPECT_NE(self->mypid, syscall(__NR_getppid));
+}
+
+/* The tracer swaps the syscall number: getpid() must return the parent pid. */
+TEST_F(TRACE_syscall, syscall_redirected)
+{
+ /* getpid has been redirected to getppid as expected. */
+ EXPECT_EQ(self->parent, syscall(__NR_getpid));
+ EXPECT_NE(self->mypid, syscall(__NR_getpid));
+}
+
+/* The tracer skips the syscall and injects an error return. */
+TEST_F(TRACE_syscall, syscall_errno)
+{
+ /* Tracer should skip the open syscall, resulting in ESRCH. */
+ EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat));
+}
+
+/* The tracer skips the syscall and injects a successful return value. */
+TEST_F(TRACE_syscall, syscall_faked)
+{
+ /* Tracer skips the gettid syscall and stores an altered return value. */
+ EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid));
+}
+
+/*
+ * A syscall rewritten by the tracer must still be subject to the filters:
+ * an extra filter fails getppid with EPERM, and getpid (which the tracer
+ * redirects to getppid) is expected to hit it.
+ */
+TEST_F(TRACE_syscall, skip_after)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ /* Install additional "errno on getppid" filter. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* Tracer will redirect getpid to getppid, and we should see EPERM. */
+ errno = 0;
+ EXPECT_EQ(-1, syscall(__NR_getpid));
+ EXPECT_EQ(EPERM, errno);
+}
+
+/*
+ * Like skip_after, but the extra filter kills on getppid: once the tracer
+ * has redirected getpid to getppid, the test is expected to die with SIGSYS.
+ */
+TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ /* Install additional "death on getppid" filter. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* Tracer will redirect getpid to getppid, and we should die. */
+ EXPECT_NE(self->mypid, syscall(__NR_getpid));
+}
+
+/*
+ * Argument validation of the seccomp() entry point: bogus operations, strict
+ * mode with flags or a pointer, and filter mode with bad flags or a NULL
+ * program must be rejected; a valid allow-all filter must be accepted.
+ *
+ * All checks look at errno; the value stored in ret is not examined.
+ * NOTE(review): the final EXPECT_EQ(0, errno) only works if the seccomp()
+ * helper (defined outside this view) clears errno before the call - confirm.
+ */
+TEST(seccomp_syscall)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* Reject insane operation. */
+ ret = seccomp(-1, 0, &prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject crazy op value!");
+ }
+
+ /* Reject strict with flags or pointer. */
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, -1, NULL);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject mode strict with flags!");
+ }
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, &prog);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject mode strict with uargs!");
+ }
+
+ /* Reject insane args for filter. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, -1, &prog);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Did not reject crazy filter flags!");
+ }
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, NULL);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Did not reject NULL filter!");
+ }
+
+ /* A well-formed request must finally succeed. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ EXPECT_EQ(0, errno) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER: %s",
+ strerror(errno));
+ }
+}
+
+/*
+ * Once a filter is installed, neither prctl() nor seccomp() may switch the
+ * task to strict mode: both attempts are expected to fail with EINVAL.
+ */
+TEST(seccomp_syscall_mode_lock)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Could not install filter!");
+ }
+
+ /* Make sure neither entry point will switch to strict. */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Switched to mode strict!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, 0, NULL);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Switched to mode strict!");
+ }
+}
+
+/*
+ * Test detection of known and unknown filter flags. Userspace needs to be able
+ * to check if a filter flag is supported by the current kernel and a good way
+ * of doing that is by attempting to enter filter mode, with the flag bit in
+ * question set, and a NULL pointer for the _args_ parameter. EFAULT indicates
+ * that the flag is valid and EINVAL indicates that the flag is invalid.
+ */
+TEST(detect_seccomp_filter_flags)
+{
+ /* Every flag this test knows about; each entry must be a single bit. */
+ unsigned int flags[] = { SECCOMP_FILTER_FLAG_TSYNC,
+ SECCOMP_FILTER_FLAG_LOG,
+ SECCOMP_FILTER_FLAG_SPEC_ALLOW,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER,
+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH };
+ /* Flags that are only ever tried one at a time in the combined test. */
+ unsigned int exclusive[] = {
+ SECCOMP_FILTER_FLAG_TSYNC,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER };
+ unsigned int flag, all_flags, exclusive_mask;
+ int i;
+ long ret;
+
+ /* Test detection of individual known-good filter flags */
+ for (i = 0, all_flags = 0; i < ARRAY_SIZE(flags); i++) {
+ int bits = 0;
+
+ flag = flags[i];
+ /* Make sure the flag is a single bit! */
+ while (flag) {
+ if (flag & 0x1)
+ bits ++;
+ flag >>= 1;
+ }
+ ASSERT_EQ(1, bits);
+ /* The popcount loop consumed flag; reload it. */
+ flag = flags[i];
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Failed to detect that a known-good filter flag (0x%X) is supported!",
+ flag);
+ }
+
+ all_flags |= flag;
+ }
+
+ /*
+ * Test detection of all known-good filter flags combined. But
+ * for the exclusive flags we need to mask them out and try them
+ * individually for the "all flags" testing.
+ */
+ exclusive_mask = 0;
+ for (i = 0; i < ARRAY_SIZE(exclusive); i++)
+ exclusive_mask |= exclusive[i];
+ for (i = 0; i < ARRAY_SIZE(exclusive); i++) {
+ flag = all_flags & ~exclusive_mask;
+ flag |= exclusive[i];
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EFAULT, errno) {
+ TH_LOG("Failed to detect that all known-good filter flags (0x%X) are supported!",
+ flag);
+ }
+ }
+
+ /*
+ * Test detection of unknown filter flags, without exclusives:
+ * start from all bits set and clear only the exclusive ones.
+ */
+ flag = -1;
+ flag &= ~exclusive_mask;
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported!",
+ flag);
+ }
+
+ /*
+ * Test detection of an unknown filter flag that may simply need to be
+ * added to this test. The candidate is the bit just above the last
+ * entry of flags[], which presumably is the highest known flag.
+ */
+ flag = flags[ARRAY_SIZE(flags) - 1] << 1;
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flag, NULL);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Failed to detect that an unknown filter flag (0x%X) is unsupported! Does a new flag need to be added to this test?",
+ flag);
+ }
+}
+
+/*
+ * The very first filter of a task may be installed with
+ * SECCOMP_FILTER_FLAG_TSYNC already set.
+ */
+TEST(TSYNC_first)
+{
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, NULL, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Could not install initial filter with TSYNC!");
+ }
+}
+
+/* Number of sibling threads managed by the TSYNC fixture. */
+#define TSYNC_SIBLINGS 2
+/* State shared between a TSYNC test and one of its sibling threads. */
+struct tsync_sibling {
+ pthread_t tid; /* pthread handle; zeroed by PTHREAD_JOIN after a successful join */
+ pid_t system_tid; /* gettid() value recorded by the thread itself */
+ sem_t *started; /* posted by the thread once it has started up */
+ pthread_cond_t *cond; /* condition the thread waits on until the test broadcasts */
+ pthread_mutex_t *mutex; /* held by the thread around its start-up and cond waits */
+ int diverge; /* non-zero: thread installs prog on itself via prctl() */
+ int num_waits; /* number of cond wakeups to consume before testing the policy */
+ struct sock_fprog *prog; /* filter installed when diverge is set */
+ struct __test_metadata *metadata; /* harness metadata; not read by the code visible here */
+};
+
+/*
+ * To avoid joining joined threads (which is not allowed by Bionic),
+ * make sure we both successfully join and clear the tid to skip a
+ * later join attempt during fixture teardown. Any remaining threads
+ * will be directly killed during teardown.
+ *
+ * tid is evaluated several times and assigned to, so it must be a plain
+ * lvalue without side effects. A failed join is only logged through
+ * TH_LOG; the tid is then left untouched.
+ */
+#define PTHREAD_JOIN(tid, status) \
+ do { \
+ int _rc = pthread_join(tid, status); \
+ if (_rc) { \
+ TH_LOG("pthread_join of tid %u failed: %d\n", \
+ (unsigned int)tid, _rc); \
+ } else { \
+ tid = 0; \
+ } \
+ } while (0)
+
+/*
+ * Fixture state for the TSYNC tests: two filter programs plus the sibling
+ * threads and the objects used to synchronise with them.
+ */
+FIXTURE(TSYNC) {
+ struct sock_fprog root_prog, apply_prog; /* allow-all filter / kill-on-read filter (see setup) */
+ struct tsync_sibling sibling[TSYNC_SIBLINGS]; /* per-thread state */
+ sem_t started; /* posted once by each sibling after start-up */
+ pthread_cond_t cond; /* broadcast by the test to wake the siblings */
+ pthread_mutex_t mutex; /* paired with cond */
+ int sibling_count; /* siblings seen to have started; bounds the teardown loop */
+};
+
+/*
+ * Build the TSYNC fixture: heap copies of an allow-all "root" program and
+ * of an "apply" program that kills on read(), fresh synchronisation
+ * objects, and identical default state for every sibling (no divergence,
+ * one wait, root program).
+ */
+FIXTURE_SETUP(TSYNC)
+{
+ struct sock_filter allow_all[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_filter kill_read[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ int idx;
+
+ /* Start from a fully zeroed fixture. */
+ memset(&self->root_prog, 0, sizeof(self->root_prog));
+ memset(&self->apply_prog, 0, sizeof(self->apply_prog));
+ memset(&self->sibling, 0, sizeof(self->sibling));
+
+ /* The programs outlive this function, so copy them to the heap. */
+ self->root_prog.filter = malloc(sizeof(allow_all));
+ ASSERT_NE(NULL, self->root_prog.filter);
+ memcpy(self->root_prog.filter, &allow_all, sizeof(allow_all));
+ self->root_prog.len = (unsigned short)ARRAY_SIZE(allow_all);
+
+ self->apply_prog.filter = malloc(sizeof(kill_read));
+ ASSERT_NE(NULL, self->apply_prog.filter);
+ memcpy(self->apply_prog.filter, &kill_read, sizeof(kill_read));
+ self->apply_prog.len = (unsigned short)ARRAY_SIZE(kill_read);
+
+ self->sibling_count = 0;
+ pthread_mutex_init(&self->mutex, NULL);
+ pthread_cond_init(&self->cond, NULL);
+ sem_init(&self->started, 0, 0);
+
+ /* Every sibling starts out with the same defaults. */
+ for (idx = 0; idx < TSYNC_SIBLINGS; idx++) {
+ struct tsync_sibling *sib = &self->sibling[idx];
+
+ sib->tid = 0;
+ sib->cond = &self->cond;
+ sib->started = &self->started;
+ sib->mutex = &self->mutex;
+ sib->diverge = 0;
+ sib->num_waits = 1;
+ sib->prog = &self->root_prog;
+ sib->metadata = _metadata;
+ }
+}
+
+/*
+ * Tear down the TSYNC fixture: free both filter programs, signal any
+ * sibling that was started but never joined, and destroy the
+ * synchronisation objects.
+ */
+FIXTURE_TEARDOWN(TSYNC)
+{
+ int sib = 0;
+
+ /* free(NULL) is a no-op, so no NULL checks are needed. */
+ free(self->root_prog.filter);
+ free(self->apply_prog.filter);
+
+ for ( ; sib < self->sibling_count; ++sib) {
+ struct tsync_sibling *s = &self->sibling[sib];
+
+ /* A zero tid means the thread was already joined. */
+ if (!s->tid)
+ continue;
+ /*
+ * If a thread is still running, it may be stuck, so hit
+ * it over the head really hard.
+ *
+ * NOTE(review): SIGKILL's action is process-wide, so this
+ * presumably relies on each test running in its own
+ * process -- confirm against the harness.
+ */
+ pthread_kill(s->tid, SIGKILL);
+ }
+ pthread_mutex_destroy(&self->mutex);
+ pthread_cond_destroy(&self->cond);
+ sem_destroy(&self->started);
+}
+
+/*
+ * Thread body for a TSYNC sibling; data points at its struct tsync_sibling.
+ *
+ * The thread records its tid, optionally installs its own filter
+ * ("diverges"), announces itself through the started semaphore, then
+ * sleeps on the condition variable until num_waits wakeups have been
+ * consumed. Afterwards it probes the resulting policy.
+ *
+ * Returns, cast to void *:
+ * SIBLING_EXIT_FAILURE - the diverging prctl() failed
+ * SIBLING_EXIT_NEWPRIVS - PR_GET_NO_NEW_PRIVS reported 0
+ * SIBLING_EXIT_UNKILLED - the read() probe returned
+ * If the read() probe is killed by a filter the function never returns.
+ */
+void *tsync_sibling(void *data)
+{
+ long ret = 0;
+ struct tsync_sibling *me = data;
+
+ me->system_tid = syscall(__NR_gettid);
+
+ pthread_mutex_lock(me->mutex);
+ if (me->diverge) {
+ /* Just re-apply the root prog to fork the tree */
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+ me->prog, 0, 0);
+ }
+ sem_post(me->started);
+ /* Return outside of started so parent notices failures. */
+ if (ret) {
+ pthread_mutex_unlock(me->mutex);
+ return (void *)SIBLING_EXIT_FAILURE;
+ }
+ /* Consume num_waits wakeups; the test may raise the count under the mutex. */
+ do {
+ pthread_cond_wait(me->cond, me->mutex);
+ me->num_waits = me->num_waits - 1;
+ } while (me->num_waits);
+ pthread_mutex_unlock(me->mutex);
+
+ ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+ if (!ret)
+ return (void *)SIBLING_EXIT_NEWPRIVS;
+ /* Probe syscall: the fixture's apply_prog kills on __NR_read. */
+ read(-1, NULL, 0);
+ return (void *)SIBLING_EXIT_UNKILLED;
+}
+
+/*
+ * Spawn the thread for one sibling, running tsync_sibling() with the
+ * sibling's own state as argument and storing the handle in sibling->tid.
+ * The result of pthread_create() is not checked.
+ */
+void tsync_start_sibling(struct tsync_sibling *sibling)
+{
+ pthread_t *handle = &sibling->tid;
+
+ pthread_create(handle, NULL, tsync_sibling, sibling);
+}
+
+/*
+ * Install a filter that makes every prctl() fail with EINVAL, then start
+ * both siblings with sibling 0 asked to diverge. Sibling 0's prctl() must
+ * fail (SIBLING_EXIT_FAILURE) while sibling 1 runs to the end
+ * (SIBLING_EXIT_UNKILLED).
+ */
+TEST_F(TSYNC, siblings_fail_prctl)
+{
+ long ret;
+ void *status;
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_prctl, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EINVAL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* Check prctl failure detection by requesting sib 0 diverge. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ TH_LOG("setting filter failed");
+ }
+
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ /* Wait until every sibling has posted the started semaphore. */
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ /* Signal the threads to clean up */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure diverging sibling failed to call prctl. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_FAILURE, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+}
+
+/*
+ * Install the root filter before starting the siblings, then apply the
+ * kill-on-read filter with TSYNC. The sync must succeed and both siblings
+ * must be killed by their read() probe, i.e. join with a zero status.
+ */
+TEST_F(TSYNC, two_siblings_with_ancestor)
+{
+ long ret;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ /* Wait until every sibling has posted the started semaphore. */
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret) {
+ /* This branch runs on failure, so the message must say so. */
+ TH_LOG("Could not install filter on all threads!");
+ }
+ /* Tell the siblings to test the policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ /* Ensure they are both killed and don't exit cleanly. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+}
+
+/*
+ * Start both siblings without ever setting no_new_privs or installing a
+ * filter. Each sibling must then report SIBLING_EXIT_NEWPRIVS, i.e. its
+ * PR_GET_NO_NEW_PRIVS query returned 0.
+ */
+TEST_F(TSYNC, two_sibling_want_nnp)
+{
+ void *status;
+
+ /* start siblings before any prctl() operations */
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ /* Tell the siblings to test no policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both upset about lacking nnp. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_NEWPRIVS, (long)status);
+}
+
+/*
+ * Start both siblings before any filter exists, then set no_new_privs and
+ * apply the kill-on-read filter with TSYNC. The sync must succeed and both
+ * siblings must be killed by their read() probe, i.e. join with a zero
+ * status.
+ */
+TEST_F(TSYNC, two_siblings_with_no_filter)
+{
+ long ret;
+ void *status;
+
+ /* start siblings before any prctl() operations */
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ /* This branch runs on failure, so the message must say so. */
+ TH_LOG("Could not install filter on all threads!");
+ }
+
+ /* Tell the siblings to test the policy */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both killed and don't exit cleanly. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(0x0, (long)status);
+}
+
+/*
+ * Sibling 0 installs its own filter, so a later TSYNC attempt cannot be
+ * applied to it. Without TSYNC_ESRCH the seccomp() call is expected to
+ * return sibling 0's tid, and neither sibling may end up under the
+ * kill-on-read filter.
+ */
+TEST_F(TSYNC, two_siblings_with_one_divergence)
+{
+ long ret;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ /* Wait until every sibling has posted the started semaphore. */
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ /* The failing sync reports the tid of the diverged thread. */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(self->sibling[0].system_tid, ret) {
+ TH_LOG("Did not fail on diverged sibling.");
+ }
+
+ /* Wake the threads */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both unkilled. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+}
+
+/*
+ * Same scenario as a diverged sibling blocking TSYNC, but with
+ * SECCOMP_FILTER_FLAG_TSYNC_ESRCH also set: the failure is then expected
+ * as -1 with errno ESRCH instead of a tid in the return value.
+ */
+TEST_F(TSYNC, two_siblings_with_one_divergence_no_tid_in_err)
+{
+ long ret, flags;
+ void *status;
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ /* Wait until every sibling has posted the started semaphore. */
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ flags = SECCOMP_FILTER_FLAG_TSYNC | \
+ SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, flags, &self->apply_prog);
+ ASSERT_EQ(ESRCH, errno) {
+ TH_LOG("Did not return ESRCH for diverged sibling.");
+ }
+ ASSERT_EQ(-1, ret) {
+ TH_LOG("Did not fail on diverged sibling.");
+ }
+
+ /* Wake the threads */
+ pthread_mutex_lock(&self->mutex);
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+
+ /* Ensure they are both unkilled. */
+ PTHREAD_JOIN(self->sibling[0].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+ PTHREAD_JOIN(self->sibling[1].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+}
+
+/*
+ * Start the siblings before the main thread has any filter: sibling 0
+ * installs its own filter, sibling 1 has none. The first TSYNC attempt
+ * must fail on sibling 0. After that sibling has been released and has
+ * died, a second TSYNC must succeed and kill the remaining sibling. A
+ * final TSYNC with no siblings left must also succeed.
+ */
+TEST_F(TSYNC, two_siblings_not_under_filter)
+{
+ long ret, sib;
+ void *status;
+ struct timespec delay = { .tv_nsec = 100000000 };
+
+ ASSERT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /*
+ * Sibling 0 will have its own seccomp policy
+ * and Sibling 1 will not be under seccomp at
+ * all. Sibling 1 will enter seccomp and 0
+ * will cause failure.
+ */
+ self->sibling[0].diverge = 1;
+ tsync_start_sibling(&self->sibling[0]);
+ tsync_start_sibling(&self->sibling[1]);
+
+ /* Wait until every sibling has posted the started semaphore. */
+ while (self->sibling_count < TSYNC_SIBLINGS) {
+ sem_wait(&self->started);
+ self->sibling_count++;
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &self->root_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support SECCOMP_SET_MODE_FILTER!");
+ }
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(ret, self->sibling[0].system_tid) {
+ TH_LOG("Did not fail on diverged sibling.");
+ }
+ /*
+ * Pick the sibling that blocked the sync. Given the assertion
+ * above this always selects sibling 0.
+ */
+ sib = 1;
+ if (ret == self->sibling[0].system_tid)
+ sib = 0;
+
+ pthread_mutex_lock(&self->mutex);
+
+ /* Increment the other siblings num_waits so we can clean up
+ * the one we just saw.
+ */
+ self->sibling[!sib].num_waits += 1;
+
+ /* Signal the thread to clean up */
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ PTHREAD_JOIN(self->sibling[sib].tid, &status);
+ EXPECT_EQ(SIBLING_EXIT_UNKILLED, (long)status);
+ /* Poll for actual task death. pthread_join doesn't guarantee it. */
+ while (!kill(self->sibling[sib].system_tid, 0))
+ nanosleep(&delay, NULL);
+ /* Switch to the remaining sibling */
+ sib = !sib;
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Expected the remaining sibling to sync");
+ };
+
+ pthread_mutex_lock(&self->mutex);
+
+ /* If remaining sibling didn't have a chance to wake up during
+ * the first broadcast, manually reduce the num_waits now.
+ */
+ if (self->sibling[sib].num_waits > 1)
+ self->sibling[sib].num_waits = 1;
+ ASSERT_EQ(0, pthread_cond_broadcast(&self->cond)) {
+ TH_LOG("cond broadcast non-zero");
+ }
+ pthread_mutex_unlock(&self->mutex);
+ PTHREAD_JOIN(self->sibling[sib].tid, &status);
+ /* Status 0: the sibling was killed by the synced filter. */
+ EXPECT_EQ(0, (long)status);
+ /* Poll for actual task death. pthread_join doesn't guarantee it. */
+ while (!kill(self->sibling[sib].system_tid, 0))
+ nanosleep(&delay, NULL);
+
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC,
+ &self->apply_prog);
+ ASSERT_EQ(0, ret); /* just us chickens */
+}
+
+/*
+ * Make sure restarted syscalls are seen directly as "restart_syscall".
+ *
+ * A traced child installs a filter that reports nanosleep and
+ * restart_syscall to the tracer with distinct event messages (0x100 and
+ * 0x200). The parent interrupts the child's nanosleep() with
+ * SIGSTOP/SIGCONT and checks which syscall number the tracer observes
+ * when the sleep is restarted.
+ */
+TEST(syscall_restart)
+{
+ long ret;
+ unsigned long msg;
+ pid_t child_pid;
+ int pipefd[2];
+ int status;
+ siginfo_t info = { };
+ /*
+ * Jump offsets are relative to the following instruction:
+ * sigreturn, read, exit, rt_sigreturn and write reach RET_ALLOW,
+ * nanosleep and clock_nanosleep reach RET_TRACE|0x100,
+ * restart_syscall reaches RET_TRACE|0x200, and any other syscall
+ * falls through to RET_KILL.
+ */
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+
+#ifdef __NR_sigreturn
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_sigreturn, 7, 0),
+#endif
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_read, 6, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_exit, 5, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_rt_sigreturn, 4, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_nanosleep, 5, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_clock_nanosleep, 4, 0),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_restart_syscall, 4, 0),
+
+ /* Allow __NR_write for easy logging. */
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_write, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ /* The nanosleep jump target. */
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x100),
+ /* The restart_syscall jump target. */
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE|0x200),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+#if defined(__arm__)
+ struct utsname utsbuf;
+#endif
+
+ /* The pipe carries two sync bytes from parent to child: '.' and '!'. */
+ ASSERT_EQ(0, pipe(pipefd));
+
+ child_pid = fork();
+ ASSERT_LE(0, child_pid);
+ if (child_pid == 0) {
+ /* Child uses EXPECT not ASSERT to deliver status correctly. */
+ char buf = ' ';
+ struct timespec timeout = { };
+
+ /* Attach parent as tracer and stop. */
+ EXPECT_EQ(0, ptrace(PTRACE_TRACEME));
+ EXPECT_EQ(0, raise(SIGSTOP));
+
+ EXPECT_EQ(0, close(pipefd[1]));
+
+ EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0);
+ EXPECT_EQ(0, ret) {
+ TH_LOG("Failed to install filter!");
+ }
+
+ EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
+ TH_LOG("Failed to read() sync from parent");
+ }
+ EXPECT_EQ('.', buf) {
+ TH_LOG("Failed to get sync data from read()");
+ }
+
+ /* Start nanosleep to be interrupted. */
+ timeout.tv_sec = 1;
+ errno = 0;
+ EXPECT_EQ(0, nanosleep(&timeout, NULL)) {
+ TH_LOG("Call to nanosleep() failed (errno %d)", errno);
+ }
+
+ /* Read final sync from parent. */
+ EXPECT_EQ(1, read(pipefd[0], &buf, 1)) {
+ TH_LOG("Failed final read() from parent");
+ }
+ EXPECT_EQ('!', buf) {
+ TH_LOG("Failed to get final data from read()");
+ }
+
+ /* Directly report the status of our test harness results. */
+ syscall(__NR_exit, _metadata->passed ? EXIT_SUCCESS
+ : EXIT_FAILURE);
+ }
+ EXPECT_EQ(0, close(pipefd[0]));
+
+ /* Attach to child, setup options, and release. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(0, ptrace(PTRACE_SETOPTIONS, child_pid, NULL,
+ PTRACE_O_TRACESECCOMP));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(1, write(pipefd[1], ".", 1));
+
+ /* Wait for nanosleep() to start. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
+ ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
+ ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
+ /* 0x100 is the data value of the nanosleep RET_TRACE above. */
+ ASSERT_EQ(0x100, msg);
+ ret = get_syscall(_metadata, child_pid);
+ EXPECT_TRUE(ret == __NR_nanosleep || ret == __NR_clock_nanosleep);
+
+ /* Might as well check siginfo for sanity while we're here. */
+ ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
+ ASSERT_EQ(SIGTRAP, info.si_signo);
+ ASSERT_EQ(SIGTRAP | (PTRACE_EVENT_SECCOMP << 8), info.si_code);
+ EXPECT_EQ(0, info.si_errno);
+ EXPECT_EQ(getuid(), info.si_uid);
+ /* Verify signal delivery came from child (seccomp-triggered). */
+ EXPECT_EQ(child_pid, info.si_pid);
+
+ /* Interrupt nanosleep with SIGSTOP (which we'll need to handle). */
+ ASSERT_EQ(0, kill(child_pid, SIGSTOP));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGSTOP, WSTOPSIG(status));
+ ASSERT_EQ(0, ptrace(PTRACE_GETSIGINFO, child_pid, NULL, &info));
+ /*
+ * There is no siginfo on SIGSTOP any more, so we can't verify
+ * signal delivery came from parent now (getpid() == info.si_pid).
+ * https://lkml.kernel.org/r/CAGXu5jJaZAOzP1qFz66tYrtbuywqb+UN2SOA1VLHpCCOiYvYeg@mail.gmail.com
+ * At least verify the SIGSTOP via PTRACE_GETSIGINFO.
+ */
+ EXPECT_EQ(SIGSTOP, info.si_signo);
+
+ /* Restart nanosleep with SIGCONT, which triggers restart_syscall. */
+ ASSERT_EQ(0, kill(child_pid, SIGCONT));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGCONT, WSTOPSIG(status));
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+
+ /* Wait for restart_syscall() to start. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ ASSERT_EQ(true, WIFSTOPPED(status));
+ ASSERT_EQ(SIGTRAP, WSTOPSIG(status));
+ ASSERT_EQ(PTRACE_EVENT_SECCOMP, (status >> 16));
+ ASSERT_EQ(0, ptrace(PTRACE_GETEVENTMSG, child_pid, NULL, &msg));
+
+ /* 0x200 is the data value of the restart_syscall RET_TRACE above. */
+ ASSERT_EQ(0x200, msg);
+ ret = get_syscall(_metadata, child_pid);
+#if defined(__arm__)
+ /*
+ * FIXME:
+ * - native ARM registers do NOT expose true syscall.
+ * - compat ARM registers on ARM64 DO expose true syscall.
+ */
+ ASSERT_EQ(0, uname(&utsbuf));
+ if (strncmp(utsbuf.machine, "arm", 3) == 0) {
+ EXPECT_EQ(__NR_nanosleep, ret);
+ } else
+#endif
+ {
+ EXPECT_EQ(__NR_restart_syscall, ret);
+ }
+
+ /* Write again to end test. */
+ ASSERT_EQ(0, ptrace(PTRACE_CONT, child_pid, NULL, 0));
+ ASSERT_EQ(1, write(pipefd[1], "!", 1));
+ EXPECT_EQ(0, close(pipefd[1]));
+
+ /* The child's exit status carries its own harness verdict. */
+ ASSERT_EQ(child_pid, waitpid(child_pid, &status, 0));
+ if (WIFSIGNALED(status) || WEXITSTATUS(status))
+ _metadata->passed = 0;
+}
+
+/*
+ * Check SECCOMP_FILTER_FLAG_LOG handling: the flag must be rejected in
+ * strict mode, accepted in filter mode, and a kill filter installed with
+ * the flag must still kill on getpid(). The test is declared with SIGSYS,
+ * presumably the signal the harness expects it to die from.
+ */
+TEST_SIGNAL(filter_flag_log, SIGSYS)
+{
+ struct sock_filter allow_filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ /* Kill on getpid(), allow everything else. */
+ struct sock_filter kill_filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getpid, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog allow_prog = {
+ .len = (unsigned short)ARRAY_SIZE(allow_filter),
+ .filter = allow_filter,
+ };
+ struct sock_fprog kill_prog = {
+ .len = (unsigned short)ARRAY_SIZE(kill_filter),
+ .filter = kill_filter,
+ };
+ long ret;
+ pid_t parent = getppid();
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ /* Verify that the FILTER_FLAG_LOG flag isn't accepted in strict mode */
+ ret = seccomp(SECCOMP_SET_MODE_STRICT, SECCOMP_FILTER_FLAG_LOG,
+ &allow_prog);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ EXPECT_NE(0, ret) {
+ TH_LOG("Kernel accepted FILTER_FLAG_LOG flag in strict mode!");
+ }
+ EXPECT_EQ(EINVAL, errno) {
+ TH_LOG("Kernel returned unexpected errno for FILTER_FLAG_LOG flag in strict mode!");
+ }
+
+ /* Verify that a simple, permissive filter can be added with no flags */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, 0, &allow_prog);
+ EXPECT_EQ(0, ret);
+
+ /* See if the same filter can be added with the FILTER_FLAG_LOG flag */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+ &allow_prog);
+ ASSERT_NE(EINVAL, errno) {
+ TH_LOG("Kernel does not support the FILTER_FLAG_LOG flag!");
+ }
+ EXPECT_EQ(0, ret);
+
+ /* Ensure that the kill filter works with the FILTER_FLAG_LOG flag */
+ ret = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_LOG,
+ &kill_prog);
+ EXPECT_EQ(0, ret);
+
+ /* getppid() is not matched by the kill filter and must still work. */
+ EXPECT_EQ(parent, syscall(__NR_getppid));
+ /* getpid() should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+
+/*
+ * Query SECCOMP_GET_ACTION_AVAIL for every action this test knows about
+ * and expect each to be reported as available. An unknown action value
+ * must be refused with EOPNOTSUPP.
+ */
+TEST(get_action_avail)
+{
+ __u32 known[] = { SECCOMP_RET_KILL_THREAD, SECCOMP_RET_TRAP,
+ SECCOMP_RET_ERRNO, SECCOMP_RET_TRACE,
+ SECCOMP_RET_LOG, SECCOMP_RET_ALLOW };
+ __u32 bogus = 0x10000000U;
+ __u32 *act;
+ long ret;
+
+ /* Probe with the first action to learn whether the operation exists. */
+ ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, known);
+ ASSERT_NE(ENOSYS, errno) {
+ TH_LOG("Kernel does not support seccomp syscall!");
+ }
+ ASSERT_NE(EINVAL, errno) {
+ TH_LOG("Kernel does not support SECCOMP_GET_ACTION_AVAIL operation!");
+ }
+ EXPECT_EQ(ret, 0);
+
+ /* Every known action, the first one included, must be available. */
+ for (act = known; act < known + ARRAY_SIZE(known); act++) {
+ ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, act);
+ EXPECT_EQ(ret, 0) {
+ TH_LOG("Expected action (0x%X) not available!",
+ *act);
+ }
+ }
+
+ /* Check that an unknown action is handled properly (EOPNOTSUPP) */
+ ret = seccomp(SECCOMP_GET_ACTION_AVAIL, 0, &bogus);
+ EXPECT_EQ(ret, -1);
+ EXPECT_EQ(errno, EOPNOTSUPP);
+}
+
+/*
+ * Check PTRACE_SECCOMP_GET_METADATA: a child installs two filters, the
+ * first with SECCOMP_FILTER_FLAG_LOG and the second without, and the
+ * parent reads the flags of both back through ptrace. Skipped unless run
+ * as root, or when the kernel rejects the request with EINVAL.
+ */
+TEST(get_metadata)
+{
+ pid_t pid;
+ int pipefd[2];
+ char buf;
+ struct seccomp_metadata md;
+ long ret;
+
+ /*
+ * Only real root can get metadata. SKIP() runs its first argument
+ * (as the "goto skip" use below relies on), so no explicit return
+ * is needed after it.
+ */
+ if (geteuid()) {
+ SKIP(return, "get_metadata requires real root");
+ }
+
+ /* The pipe tells the parent when the child's filters are in place. */
+ ASSERT_EQ(0, pipe(pipefd));
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+ if (pid == 0) {
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ /* one with log, one without */
+ EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER,
+ SECCOMP_FILTER_FLAG_LOG, &prog));
+ EXPECT_EQ(0, seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog));
+
+ EXPECT_EQ(0, close(pipefd[0]));
+ ASSERT_EQ(1, write(pipefd[1], "1", 1));
+ ASSERT_EQ(0, close(pipefd[1]));
+
+ /* Stay alive until the parent kills us. */
+ while (1)
+ sleep(100);
+ }
+
+ ASSERT_EQ(0, close(pipefd[1]));
+ ASSERT_EQ(1, read(pipefd[0], &buf, 1));
+
+ ASSERT_EQ(0, ptrace(PTRACE_ATTACH, pid));
+ ASSERT_EQ(pid, waitpid(pid, NULL, 0));
+
+ /* Past here must not use ASSERT or child process is never killed. */
+
+ /* filter_off 0 is expected to describe the filter installed with LOG. */
+ md.filter_off = 0;
+ errno = 0;
+ ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
+ EXPECT_EQ(sizeof(md), ret) {
+ if (errno == EINVAL)
+ SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)");
+ }
+
+ EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG);
+ EXPECT_EQ(md.filter_off, 0);
+
+ /* filter_off 1 is expected to describe the filter installed without flags. */
+ md.filter_off = 1;
+ ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md);
+ EXPECT_EQ(sizeof(md), ret);
+ EXPECT_EQ(md.flags, 0);
+ EXPECT_EQ(md.filter_off, 1);
+
+skip:
+ ASSERT_EQ(0, kill(pid, SIGKILL));
+}
+
+/*
+ * Install a filter that answers syscall number nr with
+ * SECCOMP_RET_USER_NOTIF and allows every other syscall. flags is passed
+ * straight to seccomp(SECCOMP_SET_MODE_FILTER), whose result is returned
+ * unchanged.
+ */
+static int user_notif_syscall(int nr, unsigned int flags)
+{
+ struct sock_filter notify_insns[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, nr)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, nr, 0, 1),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_USER_NOTIF),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog notify_prog = {
+ .len = (unsigned short)ARRAY_SIZE(notify_insns),
+ .filter = notify_insns,
+ };
+ int rc;
+
+ rc = seccomp(SECCOMP_SET_MODE_FILTER, flags, &notify_prog);
+ return rc;
+}
+
+/* Value a notification handler hands back as the fake syscall result. */
+#define USER_NOTIF_MAGIC INT_MAX
+/*
+ * Walk through the basic user-notification flow: a child without a
+ * listener gets ENOSYS, a listener fd can be created once per filter
+ * chain, a pending notification is signalled by poll(), malformed
+ * receive and send requests are rejected, and a valid response makes the
+ * child's getppid() return USER_NOTIF_MAGIC.
+ */
+TEST(user_notification_basic)
+{
+ pid_t pid;
+ long ret;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ struct pollfd pollfd;
+
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ /* Check that we get -ENOSYS with no listener attached */
+ if (pid == 0) {
+ if (user_notif_syscall(__NR_getppid, 0) < 0)
+ exit(1);
+ ret = syscall(__NR_getppid);
+ /* Exit status 0 only if the call failed with ENOSYS. */
+ exit(ret >= 0 || errno != ENOSYS);
+ }
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ /* Add some no-op filters for grins. */
+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+ EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0);
+
+ /* Check that the basic notification machinery works */
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ /* Installing a second listener in the chain should EBUSY */
+ EXPECT_EQ(user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER),
+ -1);
+ EXPECT_EQ(errno, EBUSY);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ /* The child's getppid() is answered by the handler code below. */
+ if (pid == 0) {
+ ret = syscall(__NR_getppid);
+ exit(ret != USER_NOTIF_MAGIC);
+ }
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ /* Only POLLIN is expected while a notification waits to be received. */
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLIN);
+
+ /* Test that we can't pass garbage to the kernel. */
+ memset(&req, 0, sizeof(req));
+ req.pid = -1;
+ errno = 0;
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req);
+ EXPECT_EQ(-1, ret);
+ EXPECT_EQ(EINVAL, errno);
+
+ /* If the garbage was rejected, retry with a clean request. */
+ if (ret) {
+ req.pid = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+ }
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ /* Only POLLOUT is expected once the notification awaits a response. */
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLOUT);
+
+ EXPECT_EQ(req.data.nr, __NR_getppid);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+
+ /* check that we make sure flags == 0 */
+ resp.flags = 1;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ resp.flags = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+/*
+ * NEW_LISTENER combined with TSYNC must be rejected with EINVAL, unless
+ * TSYNC_ESRCH is set as well, in which case a listener fd is returned.
+ */
+TEST(user_notification_with_tsync)
+{
+ int ret;
+ unsigned int flags;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* these were exclusive */
+ flags = SECCOMP_FILTER_FLAG_NEW_LISTENER |
+ SECCOMP_FILTER_FLAG_TSYNC;
+ ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags));
+ ASSERT_EQ(EINVAL, errno);
+
+ /* but now they're not */
+ flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH;
+ ret = user_notif_syscall(__NR_getppid, flags);
+ /*
+ * NOTE(review): the fd is closed before the assertion, presumably so
+ * that it is not leaked if the assertion aborts the test -- confirm.
+ */
+ close(ret);
+ ASSERT_LE(0, ret);
+}
+
+TEST(user_notification_kill_in_middle)
+{
+ pid_t pid;
+ long ret;
+ int listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ /*
+ * Check that nothing bad happens when we kill the task in the middle
+ * of a syscall.
+ */
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ret = syscall(__NR_getppid);
+ exit(ret != USER_NOTIF_MAGIC);
+ }
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), 0);
+
+ EXPECT_EQ(kill(pid, SIGKILL), 0);
+ EXPECT_EQ(waitpid(pid, NULL, 0), pid);
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ID_VALID, &req.id), -1);
+
+ resp.id = req.id;
+ ret = ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp);
+ EXPECT_EQ(ret, -1);
+ EXPECT_EQ(errno, ENOENT);
+}
+
+static int handled = -1; /* fd signal_handler() writes to; -1 until assigned */
+/* Signal handler: write the single byte "c" to the `handled` descriptor. */
+static void signal_handler(int signal)
+{
+ if (write(handled, "c", 1) != 1)
+ perror("write from signal"); /* NOTE(review): perror() is not async-signal-safe */
+}
+
+TEST(user_notification_signal)
+{
+ pid_t pid;
+ long ret;
+ int status, listener, sk_pair[2];
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ char c;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0);
+
+ listener = user_notif_syscall(__NR_gettid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ close(sk_pair[0]);
+ handled = sk_pair[1];
+ if (signal(SIGUSR1, signal_handler) == SIG_ERR) {
+ perror("signal");
+ exit(1);
+ }
+ /*
+ * ERESTARTSYS behavior is a bit hard to test, because we need
+ * to rely on a signal that has not yet been handled. Let's at
+ * least check that the error code gets propagated through, and
+ * hope that it doesn't break when there is actually a signal :)
+ */
+ ret = syscall(__NR_gettid);
+ exit(!(ret == -1 && errno == 512));
+ }
+
+ close(sk_pair[1]);
+
+ memset(&req, 0, sizeof(req));
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ EXPECT_EQ(kill(pid, SIGUSR1), 0);
+
+ /*
+ * Make sure the signal really is delivered, which means we're not
+ * stuck in the user notification code any more and the notification
+ * should be dead.
+ */
+ EXPECT_EQ(read(sk_pair[0], &c, 1), 1);
+
+ resp.id = req.id;
+ resp.error = -EPERM;
+ resp.val = 0;
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, ENOENT);
+
+ memset(&req, 0, sizeof(req));
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ resp.id = req.id;
+ resp.error = -512; /* -ERESTARTSYS */
+ resp.val = 0;
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_closed_listener)
+{
+	pid_t pid;
+	long ret;
+	int status, listener;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	listener = user_notif_syscall(__NR_getppid,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	/*
+	 * Check that we get an ENOSYS when the listener is closed.
+	 */
+	pid = fork();
+	ASSERT_GE(pid, 0);
+	if (pid == 0) {
+		close(listener);
+		ret = syscall(__NR_getppid);
+		exit(ret != -1 || errno != ENOSYS);	/* 0 only for -1 + ENOSYS */
+	}
+
+	close(listener);
+
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+/*
+ * Check that a pid in a child namespace still shows up as valid in ours.
+ */
+TEST(user_notification_child_pid_ns)
+{
+ pid_t pid;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+
+ ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) {
+ if (errno == EINVAL)
+ SKIP(return, "kernel missing CLONE_NEWUSER support");
+ };
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+ EXPECT_EQ(req.pid, pid);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+ close(listener);
+}
+
+/*
+ * Check that a pid in a sibling (i.e. unrelated) namespace shows up as 0, i.e.
+ * invalid.
+ */
+TEST(user_notification_sibling_pid_ns)
+{
+ pid_t pid, pid2;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+
+ ASSERT_EQ(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0), 0) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ ASSERT_EQ(unshare(CLONE_NEWPID), 0);
+
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0)
+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
+
+ EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+ exit(WEXITSTATUS(status));
+ }
+
+ /* Create the sibling ns, and sibling in it. */
+ ASSERT_EQ(unshare(CLONE_NEWPID), 0) {
+ if (errno == EPERM)
+ SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN");
+ }
+ ASSERT_EQ(errno, 0);
+
+ pid2 = fork();
+ ASSERT_GE(pid2, 0);
+
+ if (pid2 == 0) {
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+ /*
+ * The pid should be 0, i.e. the task is in some namespace that
+ * we can't "see".
+ */
+ EXPECT_EQ(req.pid, 0);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+ exit(0);
+ }
+
+ close(listener);
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ EXPECT_EQ(waitpid(pid2, &status, 0), pid2);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(user_notification_fault_recv)
+{
+ pid_t pid;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+
+ ASSERT_EQ(unshare(CLONE_NEWUSER), 0);
+
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
+
+ /* Do a bad recv() */
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, NULL), -1);
+ EXPECT_EQ(errno, EFAULT);
+
+ /* We should still be able to receive this notification, though. */
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+ EXPECT_EQ(req.pid, pid);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+}
+
+TEST(seccomp_get_notif_sizes)
+{
+ struct seccomp_notif_sizes sizes;
+
+ ASSERT_EQ(seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes), 0);
+ EXPECT_EQ(sizes.seccomp_notif, sizeof(struct seccomp_notif));
+ EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp));
+}
+
+TEST(user_notification_continue)
+{
+ pid_t pid;
+ long ret;
+ int status, listener;
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ struct pollfd pollfd;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int dup_fd, pipe_fds[2];
+ pid_t self;
+
+ ASSERT_GE(pipe(pipe_fds), 0);
+
+ dup_fd = dup(pipe_fds[0]);
+ ASSERT_GE(dup_fd, 0);
+ EXPECT_NE(pipe_fds[0], dup_fd);
+
+ self = getpid();
+ ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0);
+ exit(0);
+ }
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLIN);
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ pollfd.fd = listener;
+ pollfd.events = POLLIN | POLLOUT;
+
+ EXPECT_GT(poll(&pollfd, 1, -1), 0);
+ EXPECT_EQ(pollfd.revents, POLLOUT);
+
+ EXPECT_EQ(req.data.nr, __NR_dup);
+
+ resp.id = req.id;
+ resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+
+ /*
+ * Verify that setting SECCOMP_USER_NOTIF_FLAG_CONTINUE enforces other
+ * args be set to 0.
+ */
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ resp.error = USER_NOTIF_MAGIC;
+ resp.val = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), -1);
+ EXPECT_EQ(errno, EINVAL);
+
+ resp.error = 0;
+ resp.val = 0;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) {
+ if (errno == EINVAL)
+ SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE");
+ }
+
+skip:
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status)) {
+ if (WEXITSTATUS(status) == 2) {
+ SKIP(return, "Kernel does not support kcmp() syscall");
+ return;
+ }
+ }
+}
+
+TEST(user_notification_filter_empty)
+{
+ pid_t pid;
+ long ret;
+ int status;
+ struct pollfd pollfd;
+ struct __clone_args args = {
+ .flags = CLONE_FILES,
+ .exit_signal = SIGCHLD,
+ };
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ pid = sys_clone3(&args, sizeof(args));
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ int listener;
+
+ listener = user_notif_syscall(__NR_mknodat, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ if (listener < 0)
+ _exit(EXIT_FAILURE);
+
+ if (dup2(listener, 200) != 200)
+ _exit(EXIT_FAILURE);
+
+ close(listener);
+
+ _exit(EXIT_SUCCESS);
+ }
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ /*
+ * The seccomp filter has become unused so we should be notified once
+ * the kernel gets around to cleaning up task struct.
+ */
+ pollfd.fd = 200;
+ pollfd.events = POLLHUP;
+
+ EXPECT_GT(poll(&pollfd, 1, 2000), 0);
+ EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
+}
+
+static void *do_thread(void *data)
+{
+ return NULL;
+}
+
+TEST(user_notification_filter_empty_threaded)
+{
+ pid_t pid;
+ long ret;
+ int status;
+ struct pollfd pollfd;
+ struct __clone_args args = {
+ .flags = CLONE_FILES,
+ .exit_signal = SIGCHLD,
+ };
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ pid = sys_clone3(&args, sizeof(args));
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0) {
+ pid_t pid1, pid2;
+ int listener, status;
+ pthread_t thread;
+
+ listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ if (listener < 0)
+ _exit(EXIT_FAILURE);
+
+ if (dup2(listener, 200) != 200)
+ _exit(EXIT_FAILURE);
+
+ close(listener);
+
+ pid1 = fork();
+ if (pid1 < 0)
+ _exit(EXIT_FAILURE);
+
+ if (pid1 == 0)
+ _exit(EXIT_SUCCESS);
+
+ pid2 = fork();
+ if (pid2 < 0)
+ _exit(EXIT_FAILURE);
+
+ if (pid2 == 0)
+ _exit(EXIT_SUCCESS);
+
+ if (pthread_create(&thread, NULL, do_thread, NULL) ||
+ pthread_join(thread, NULL))
+ _exit(EXIT_FAILURE);
+
+ if (pthread_create(&thread, NULL, do_thread, NULL) ||
+ pthread_join(thread, NULL))
+ _exit(EXIT_FAILURE);
+
+ if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) ||
+ WEXITSTATUS(status))
+ _exit(EXIT_FAILURE);
+
+ if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) ||
+ WEXITSTATUS(status))
+ _exit(EXIT_FAILURE);
+
+ exit(EXIT_SUCCESS);
+ }
+
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ /*
+ * The seccomp filter has become unused so we should be notified once
+ * the kernel gets around to cleaning up task struct.
+ */
+ pollfd.fd = 200;
+ pollfd.events = POLLHUP;
+
+ EXPECT_GT(poll(&pollfd, 1, 2000), 0);
+ EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0);
+}
+
+TEST(user_notification_addfd)
+{
+	pid_t pid;
+	long ret;
+	int status, listener, memfd, fd;
+	struct seccomp_notif_addfd addfd = {};
+	struct seccomp_notif_addfd_small small = {};
+	struct seccomp_notif_addfd_big big = {};
+	struct seccomp_notif req = {};
+	struct seccomp_notif_resp resp = {};
+	/* 100 ms */
+	struct timespec delay = { .tv_nsec = 100000000 };
+
+	memfd = memfd_create("test", 0);
+	ASSERT_GE(memfd, 0);
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret) {
+		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+	}
+
+	/* Check that the basic notification machinery works */
+	listener = user_notif_syscall(__NR_getppid,
+				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
+	ASSERT_GE(listener, 0);
+
+	pid = fork();
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		if (syscall(__NR_getppid) != USER_NOTIF_MAGIC)
+			exit(1);
+		exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
+	}
+
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+	addfd.srcfd = memfd;
+	addfd.newfd = 0;
+	addfd.id = req.id;
+	addfd.flags = 0x0;
+
+	/* Verify bad newfd_flags cannot be set */
+	addfd.newfd_flags = ~O_CLOEXEC;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
+	EXPECT_EQ(errno, EINVAL);
+	addfd.newfd_flags = O_CLOEXEC;
+
+	/* Verify bad flags cannot be set */
+	addfd.flags = 0xff;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
+	EXPECT_EQ(errno, EINVAL);
+	addfd.flags = 0;
+
+	/* Verify that remote_fd cannot be set without setting flags */
+	addfd.newfd = 1;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
+	EXPECT_EQ(errno, EINVAL);
+	addfd.newfd = 0;
+
+	/* Verify small size cannot be set */
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1);
+	EXPECT_EQ(errno, EINVAL);
+
+	/* Verify we can't send bits filled in unknown buffer area */
+	memset(&big, 0xAA, sizeof(big));
+	big.addfd = addfd;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1);
+	EXPECT_EQ(errno, E2BIG);
+
+
+	/* Verify we can set an arbitrary remote fd */
+	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+	/*
+	 * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd),
+	 * 4(listener), so the newly allocated fd should be 5.
+	 */
+	EXPECT_EQ(fd, 5);
+	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
+
+	/* Verify we can set an arbitrary remote fd with large size */
+	memset(&big, 0x0, sizeof(big));
+	big.addfd = addfd;
+	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big);
+	EXPECT_EQ(fd, 6);
+
+	/* Verify we can set a specific remote fd */
+	addfd.newfd = 42;
+	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
+	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
+	EXPECT_EQ(fd, 42);
+	EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0);
+
+	/* Resume syscall */
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	/*
+	 * This sets the ID of the ADD FD to the last request plus 1. The
+	 * notification ID increments 1 per notification.
+	 */
+	addfd.id = req.id + 1;
+
+	/* Spin until ADDFD fails with EINPROGRESS: notification queued, unreceived */
+	while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 ||
+	       errno != EINPROGRESS)	/* errno values are positive */
+		nanosleep(&delay, NULL);
+
+	memset(&req, 0, sizeof(req));
+	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+	ASSERT_EQ(addfd.id, req.id);
+
+	resp.id = req.id;
+	resp.error = 0;
+	resp.val = USER_NOTIF_MAGIC;
+	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+	/* Wait for child to finish. */
+	EXPECT_EQ(waitpid(pid, &status, 0), pid);
+	EXPECT_EQ(true, WIFEXITED(status));
+	EXPECT_EQ(0, WEXITSTATUS(status));
+
+	close(memfd);
+}
+
+TEST(user_notification_addfd_rlimit)
+{
+ pid_t pid;
+ long ret;
+ int status, listener, memfd;
+ struct seccomp_notif_addfd addfd = {};
+ struct seccomp_notif req = {};
+ struct seccomp_notif_resp resp = {};
+ const struct rlimit lim = {
+ .rlim_cur = 0,
+ .rlim_max = 0,
+ };
+
+ memfd = memfd_create("test", 0);
+ ASSERT_GE(memfd, 0);
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret) {
+ TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
+ }
+
+ /* Check that the basic notification machinery works */
+ listener = user_notif_syscall(__NR_getppid,
+ SECCOMP_FILTER_FLAG_NEW_LISTENER);
+ ASSERT_GE(listener, 0);
+
+ pid = fork();
+ ASSERT_GE(pid, 0);
+
+ if (pid == 0)
+ exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC);
+
+
+ ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
+
+ ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0);
+
+ addfd.srcfd = memfd;
+ addfd.newfd_flags = O_CLOEXEC;
+ addfd.newfd = 0;
+ addfd.id = req.id;
+ addfd.flags = 0;
+
+ /* Should probably spot check /proc/sys/fs/file-nr */
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
+ EXPECT_EQ(errno, EMFILE);
+
+ addfd.newfd = 100;
+ addfd.flags = SECCOMP_ADDFD_FLAG_SETFD;
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1);
+ EXPECT_EQ(errno, EBADF);
+
+ resp.id = req.id;
+ resp.error = 0;
+ resp.val = USER_NOTIF_MAGIC;
+
+ EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
+
+ /* Wait for child to finish. */
+ EXPECT_EQ(waitpid(pid, &status, 0), pid);
+ EXPECT_EQ(true, WIFEXITED(status));
+ EXPECT_EQ(0, WEXITSTATUS(status));
+
+ close(memfd);
+}
+
+/*
+ * TODO:
+ * - expand NNP testing
+ * - better arch-specific TRACE and TRAP handlers.
+ * - endianness checking when appropriate
+ * - 64-bit arg prodding
+ * - arch value testing (x86 modes especially)
+ * - verify that FILTER_FLAG_LOG filters generate log messages
+ * - verify that RET_LOG generates log messages
+ */
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings
new file mode 100644
index 000000000..ba4d85f74
--- /dev/null
+++ b/tools/testing/selftests/seccomp/settings
@@ -0,0 +1 @@
+timeout=90
diff --git a/tools/testing/selftests/sigaltstack/.gitignore b/tools/testing/selftests/sigaltstack/.gitignore
new file mode 100644
index 000000000..50a19a888
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sas
diff --git a/tools/testing/selftests/sigaltstack/Makefile b/tools/testing/selftests/sigaltstack/Makefile
new file mode 100644
index 000000000..3e96d5d47
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS = -Wall
+TEST_GEN_PROGS = sas
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/sigaltstack/current_stack_pointer.h b/tools/testing/selftests/sigaltstack/current_stack_pointer.h
new file mode 100644
index 000000000..ea9bdf3a9
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/current_stack_pointer.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if __alpha__ /* each branch binds `sp` to that arch's stack-pointer register */
+register unsigned long sp asm("$30");
+#elif __arm__ || __aarch64__ || __csky__ || __m68k__ || __mips__ || __riscv
+register unsigned long sp asm("sp");
+#elif __i386__
+register unsigned long sp asm("esp");
+#elif __loongarch64
+register unsigned long sp asm("$sp");
+#elif __ppc__ || __powerpc__ /* some compilers (e.g. GCC on ppc64) define only __powerpc__ */
+register unsigned long sp asm("r1");
+#elif __s390x__
+register unsigned long sp asm("%15");
+#elif __sh__
+register unsigned long sp asm("r15");
+#elif __x86_64__
+register unsigned long sp asm("rsp");
+#elif __XTENSA__
+register unsigned long sp asm("a1");
+#else
+#error "implement current_stack_pointer equivalent"
+#endif
diff --git a/tools/testing/selftests/sigaltstack/sas.c b/tools/testing/selftests/sigaltstack/sas.c
new file mode 100644
index 000000000..41646c223
--- /dev/null
+++ b/tools/testing/selftests/sigaltstack/sas.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Stas Sergeev <stsp@users.sourceforge.net>
+ *
+ * test sigaltstack(SS_ONSTACK | SS_AUTODISARM)
+ * If that succeeds, then swapcontext() can be used inside sighandler safely.
+ *
+ */
+
+#define _GNU_SOURCE
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <ucontext.h>
+#include <alloca.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+
+#include "../kselftest.h"
+#include "current_stack_pointer.h"
+
+#ifndef SS_AUTODISARM
+#define SS_AUTODISARM (1U << 31)
+#endif
+
+static void *sstack, *ustack;
+static ucontext_t uc, sc;
+static const char *msg = "[OK]\tStack preserved";
+static const char *msg2 = "[FAIL]\tStack corrupted";
+struct stk_data {
+ char msg[128];
+ int flag;
+};
+
+void my_usr1(int sig, siginfo_t *si, void *u)
+{
+	char *aa;
+	int err;
+	stack_t stk;
+	struct stk_data *p;
+
+	if (sp < (unsigned long)sstack ||
+			sp >= (unsigned long)sstack + SIGSTKSZ) {
+		ksft_exit_fail_msg("SP is not on sigaltstack\n");
+	}
+	/* put some data on stack. other sighandler will try to overwrite it */
+	aa = alloca(1024);
+	assert(aa);
+	p = (struct stk_data *)(aa + 512);
+	strcpy(p->msg, msg);
+	p->flag = 1;	/* my_usr2() clears this if it finds and corrupts the data */
+	ksft_print_msg("[RUN]\tsignal USR1\n");
+	err = sigaltstack(NULL, &stk);	/* query only: read the current state */
+	if (err) {
+		ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+	if (stk.ss_flags != SS_DISABLE)
+		ksft_test_result_fail("ss_flags=%x, should be SS_DISABLE\n",
+				stk.ss_flags);
+	else
+		ksft_test_result_pass(
+				"sigaltstack is disabled in sighandler\n");
+	swapcontext(&sc, &uc);	/* save this context in sc and switch to uc */
+	ksft_print_msg("%s\n", p->msg);
+	if (!p->flag) {
+		ksft_exit_fail_msg("[RUN]\tAborting\n");
+		exit(EXIT_FAILURE);
+	}
+}
+
+void my_usr2(int sig, siginfo_t *si, void *u)
+{
+ char *aa;
+ struct stk_data *p;
+
+ ksft_print_msg("[RUN]\tsignal USR2\n");
+ aa = alloca(1024);
+ /* don't run valgrind on this: the fresh alloca() area is read uninitialized */
+ /* try to find the data stored by previous sighandler */
+ p = memmem(aa, 1024, msg, strlen(msg));
+ if (p) {
+ ksft_test_result_fail("sigaltstack re-used\n");
+ /* corrupt the data */
+ strcpy(p->msg, msg2);
+ /* tell other sighandler that his data is corrupted */
+ p->flag = 0;
+ }
+}
+
+static void switch_fn(void)
+{
+ ksft_print_msg("[RUN]\tswitched to user ctx\n");
+ raise(SIGUSR2);
+ setcontext(&sc);
+}
+
+int main(void)
+{
+ struct sigaction act;
+ stack_t stk;
+ int err;
+
+ ksft_print_header();
+ ksft_set_plan(3);
+
+ sigemptyset(&act.sa_mask);
+ act.sa_flags = SA_ONSTACK | SA_SIGINFO;
+ act.sa_sigaction = my_usr1;
+ sigaction(SIGUSR1, &act, NULL);
+ act.sa_sigaction = my_usr2;
+ sigaction(SIGUSR2, &act, NULL);
+ sstack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+ if (sstack == MAP_FAILED) {
+ ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+
+ err = sigaltstack(NULL, &stk);
+ if (err) {
+ ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (stk.ss_flags == SS_DISABLE) {
+ ksft_test_result_pass(
+ "Initial sigaltstack state was SS_DISABLE\n");
+ } else {
+ ksft_exit_fail_msg("Initial sigaltstack state was %x; "
+ "should have been SS_DISABLE\n", stk.ss_flags);
+ return EXIT_FAILURE;
+ }
+
+ stk.ss_sp = sstack;
+ stk.ss_size = SIGSTKSZ;
+ stk.ss_flags = SS_ONSTACK | SS_AUTODISARM;
+ err = sigaltstack(&stk, NULL);
+ if (err) {
+ if (errno == EINVAL) {
+ ksft_test_result_skip(
+ "[NOTE]\tThe running kernel doesn't support SS_AUTODISARM\n");
+ /*
+ * If test cases for the !SS_AUTODISARM variant were
+ * added, we could still run them. We don't have any
+ * test cases like that yet, so just exit and report
+ * success.
+ */
+ return 0;
+ } else {
+ ksft_exit_fail_msg(
+ "sigaltstack(SS_ONSTACK | SS_AUTODISARM) %s\n",
+ strerror(errno));
+ return EXIT_FAILURE;
+ }
+ }
+
+ ustack = mmap(NULL, SIGSTKSZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+ if (ustack == MAP_FAILED) {
+ ksft_exit_fail_msg("mmap() - %s\n", strerror(errno));
+ return EXIT_FAILURE;
+ }
+ getcontext(&uc);
+ uc.uc_link = NULL;
+ uc.uc_stack.ss_sp = ustack;
+ uc.uc_stack.ss_size = SIGSTKSZ;
+ makecontext(&uc, switch_fn, 0);
+ raise(SIGUSR1);
+
+ err = sigaltstack(NULL, &stk);
+ if (err) {
+ ksft_exit_fail_msg("sigaltstack() - %s\n", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ if (stk.ss_flags != SS_AUTODISARM) {
+ ksft_exit_fail_msg("ss_flags=%x, should be SS_AUTODISARM\n",
+ stk.ss_flags);
+ exit(EXIT_FAILURE);
+ }
+ ksft_test_result_pass(
+ "sigaltstack is still SS_AUTODISARM after signal\n");
+
+ ksft_exit_pass();
+ return 0;
+}
diff --git a/tools/testing/selftests/size/.gitignore b/tools/testing/selftests/size/.gitignore
new file mode 100644
index 000000000..923e18eed
--- /dev/null
+++ b/tools/testing/selftests/size/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+get_size
diff --git a/tools/testing/selftests/size/Makefile b/tools/testing/selftests/size/Makefile
new file mode 100644
index 000000000..b87facc00
--- /dev/null
+++ b/tools/testing/selftests/size/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS := -static -ffreestanding -nostartfiles -s
+
+TEST_GEN_PROGS := get_size
+
+include ../lib.mk
diff --git a/tools/testing/selftests/size/get_size.c b/tools/testing/selftests/size/get_size.c
new file mode 100644
index 000000000..2980b1a63
--- /dev/null
+++ b/tools/testing/selftests/size/get_size.c
@@ -0,0 +1,116 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2014 Sony Mobile Communications Inc.
+ *
+ * Selftest for runtime system size
+ *
+ * Prints the amount of RAM that the currently running system is using.
+ *
+ * This program tries to be as small as possible itself, to
+ * avoid perturbing the system memory utilization with its
+ * own execution. It also attempts to have as few dependencies
+ * on kernel features as possible.
+ *
+ * It should be statically linked, with startup libs avoided. It uses
+ * no library calls except the syscall() function for the following 3
+ * syscalls:
+ * sysinfo(), write(), and _exit()
+ *
+ * For output, it avoids printf (which in some C libraries
+ * has large external dependencies) by implementing its own
+ * number output and print routines, and using __builtin_strlen()
+ *
+ * The test may crash if any of the above syscalls fails because in some
+ * libc implementations (e.g. the GNU C Library) errno is saved in
+ * thread-local storage, which does not get initialized due to avoiding
+ * startup libs.
+ */
+
+#include <sys/sysinfo.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#define STDOUT_FILENO 1
+
+static int print(const char *s)
+{
+	const char *end = s;
+
+	while (*end)		/* hand-rolled strlen(): walk to the NUL */
+		end++;
+
+	return syscall(SYS_write, STDOUT_FILENO, s, (size_t)(end - s));
+}
+
+static inline char *num_to_str(unsigned long num, char *buf, int len)
+{
+	char *cursor = &buf[len - 1];	/* last byte holds the terminator */
+
+	/*
+	 * Fill digits back to front; do/while makes 0 come out as "0".
+	 */
+	*cursor = '\0';
+	do {
+		*--cursor = '0' + num % 10;
+		num /= 10;
+	} while (num != 0);
+
+	return cursor;
+}
+
+static int print_num(unsigned long num)
+{
+ char num_buf[30];
+
+ return print(num_to_str(num, num_buf, sizeof(num_buf)));
+}
+
+static int print_k_value(const char *s, unsigned long num, unsigned long units)
+{
+	/* widen to unsigned long long before multiplying, then scale by 1024 */
+	unsigned long long scaled = (unsigned long long)num * units / 1024;
+	int rc;
+
+	/* label first, then the scaled value, then a newline */
+	print(s);
+
+	/* narrowing back to unsigned long is what print_num() accepts */
+	rc = print_num((unsigned long)scaled);
+	print("\n");
+	return rc;
+}
+
+/* this program has no main(), as startup libraries are not used */
+void _start(void)
+{
+ int ccode;
+ struct sysinfo info;
+ unsigned long used;
+ static const char *test_name = " get runtime memory use\n";
+
+ print("TAP version 13\n");
+ print("# Testing system size.\n");
+
+ ccode = syscall(SYS_sysinfo, &info);
+ if (ccode < 0) {
+ print("not ok 1");
+ print(test_name);
+ print(" ---\n reason: \"could not get sysinfo\"\n ...\n");
+ syscall(SYS_exit, ccode);
+ }
+ print("ok 1");
+ print(test_name);
+
+ /* ignore cache complexities for now */
+ used = info.totalram - info.freeram - info.bufferram;
+ print("# System runtime memory report (units in Kilobytes):\n");
+ print(" ---\n");
+ print_k_value(" Total: ", info.totalram, info.mem_unit);
+ print_k_value(" Free: ", info.freeram, info.mem_unit);
+ print_k_value(" Buffer: ", info.bufferram, info.mem_unit);
+ print_k_value(" In use: ", used, info.mem_unit);
+ print(" ...\n");
+ print("1..1\n");
+
+ syscall(SYS_exit, 0);
+}
diff --git a/tools/testing/selftests/sparc64/Makefile b/tools/testing/selftests/sparc64/Makefile
new file mode 100644
index 000000000..a19531dba
--- /dev/null
+++ b/tools/testing/selftests/sparc64/Makefile
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: GPL-2.0
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/x86_64/x86/)
+
+ifneq ($(ARCH),sparc64)
+nothing:
+.PHONY: all clean run_tests install
+.SILENT:
+else
+
+SUBDIRS := drivers
+
+TEST_PROGS := run.sh
+
+
+.PHONY: all clean
+
+include ../lib.mk
+
+all: # build each SUBDIR into its own output dir and stage <dir>_test.sh there
+	@for DIR in $(SUBDIRS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		mkdir $$BUILD_TARGET -p; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+		#SUBDIR test prog name should be in the form: SUBDIR_test.sh \
+		TEST=$$DIR"_test.sh"; \
+		if [ -e $$DIR/$$TEST ]; then \
+			rsync -a $$DIR/$$TEST $$BUILD_TARGET/; \
+		fi \
+	done
+
+override define INSTALL_RULE
+ mkdir -p $(INSTALL_PATH)
+ install -t $(INSTALL_PATH) $(TEST_PROGS) $(TEST_PROGS_EXTENDED) $(TEST_FILES)
+
+ @for SUBDIR in $(SUBDIRS); do \
+ BUILD_TARGET=$(OUTPUT)/$$SUBDIR; \
+ mkdir $$BUILD_TARGET -p; \
+ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$SUBDIR INSTALL_PATH=$(INSTALL_PATH)/$$SUBDIR install; \
+ done;
+endef
+
+override define CLEAN
+	@for DIR in $(SUBDIRS); do \
+		BUILD_TARGET=$(OUTPUT)/$$DIR; \
+		mkdir $$BUILD_TARGET -p; \
+		$(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@;\
+	done
+endef
+endif
diff --git a/tools/testing/selftests/sparc64/drivers/.gitignore b/tools/testing/selftests/sparc64/drivers/.gitignore
new file mode 100644
index 000000000..0331f7737
--- /dev/null
+++ b/tools/testing/selftests/sparc64/drivers/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+adi-test
diff --git a/tools/testing/selftests/sparc64/drivers/Makefile b/tools/testing/selftests/sparc64/drivers/Makefile
new file mode 100644
index 000000000..deb0df415
--- /dev/null
+++ b/tools/testing/selftests/sparc64/drivers/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+INCLUDEDIR := -I.
+CFLAGS := $(CFLAGS) $(INCLUDEDIR) -Wall -O2 -g
+
+TEST_GEN_FILES := adi-test
+
+all: $(TEST_GEN_FILES)
+
+$(TEST_GEN_FILES): adi-test.c
+
+TEST_PROGS := drivers_test.sh
+
+include ../../lib.mk
+
+$(OUTPUT)/adi-test: adi-test.c
diff --git a/tools/testing/selftests/sparc64/drivers/adi-test.c b/tools/testing/selftests/sparc64/drivers/adi-test.c
new file mode 100644
index 000000000..95d93c6a8
--- /dev/null
+++ b/tools/testing/selftests/sparc64/drivers/adi-test.c
@@ -0,0 +1,721 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * selftest for sparc64's privileged ADI driver
+ *
+ * Author: Tom Hromatka <tom.hromatka@oracle.com>
+ */
+#include <linux/kernel.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "../../kselftest.h"
+
+#define DEBUG_LEVEL_1_BIT (0x0001)
+#define DEBUG_LEVEL_2_BIT (0x0002)
+#define DEBUG_LEVEL_3_BIT (0x0004)
+#define DEBUG_LEVEL_4_BIT (0x0008)
+#define DEBUG_TIMING_BIT (0x1000)
+
+#ifndef ARRAY_SIZE
+# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* bit mask of enabled bits to print */
+#define DEBUG 0x0001
+
+#define DEBUG_PRINT_L1(...) debug_print(DEBUG_LEVEL_1_BIT, __VA_ARGS__)
+#define DEBUG_PRINT_L2(...) debug_print(DEBUG_LEVEL_2_BIT, __VA_ARGS__)
+#define DEBUG_PRINT_L3(...) debug_print(DEBUG_LEVEL_3_BIT, __VA_ARGS__)
+#define DEBUG_PRINT_L4(...) debug_print(DEBUG_LEVEL_4_BIT, __VA_ARGS__)
+#define DEBUG_PRINT_T(...) debug_print(DEBUG_TIMING_BIT, __VA_ARGS__)
+
+/*
+ * printf-style debug output to stdout.  The message is written only when at
+ * least one bit of @level is also set in the compile-time DEBUG mask;
+ * otherwise the arguments are consumed and nothing is printed.
+ */
+static void debug_print(int level, const char *s, ...)
+{
+	va_list ap;
+
+	va_start(ap, s);
+	if ((level & DEBUG) != 0)
+		vfprintf(stdout, s, ap);
+	va_end(ap);
+}
+
+#ifndef min
+/*
+ * Smaller of two values.  Every use of an argument is parenthesised so that
+ * expression arguments expand safely; note that the selected argument is
+ * still evaluated twice.
+ */
+#define min(x, y) ((x) < (y) ? (x) : (y))
+#endif
+
+#define RETURN_FROM_TEST(_ret) \
+ do { \
+ DEBUG_PRINT_L1( \
+ "\tTest %s returned %d\n", __func__, _ret); \
+ return _ret; \
+ } while (0)
+
+#define ADI_BLKSZ 64
+#define ADI_MAX_VERSION 15
+
+/*
+ * Report a failed test step on stderr (value, function and line) and jump
+ * to the caller's "out" label.  The macro only prints @_ret; it does not
+ * assign it to anything, so the caller's own return variable must already
+ * hold the value it wants to return.
+ */
+#define TEST_STEP_FAILURE(_ret) \
+ do { \
+ fprintf(stderr, "\tTest step failure: %d at %s:%d\n", \
+ _ret, __func__, __LINE__); \
+ goto out; \
+ } while (0)
+
+#define RDTICK(_x) \
+ asm volatile(" rd %%tick, %0\n" : "=r" (_x))
+
+/*
+ * Return a pseudo-random ADI version in the range 0..ADI_MAX_VERSION,
+ * derived from the current value of the %tick register.
+ */
+static int random_version(void)
+{
+	/* unsigned, so the remainder below can never be negative */
+	unsigned long tick;
+
+	RDTICK(tick);
+
+	return tick % (ADI_MAX_VERSION + 1);
+}
+
+#define MAX_RANGES_SUPPORTED 5
+static const char system_ram_str[] = "System RAM\n";
+static int range_count;
+static unsigned long long int start_addr[MAX_RANGES_SUPPORTED];
+static unsigned long long int end_addr[MAX_RANGES_SUPPORTED];
+
+/* Running totals for one kind of syscall wrapper (read, pread, ...). */
+struct stats {
+ char name[16]; /* label printed by print_ustats() */
+ unsigned long total; /* sum of the measurements given to update_stats() */
+ unsigned long count; /* number of update_stats() calls */
+ unsigned long bytes; /* sum of the byte counts given to update_stats() */
+};
+
+static struct stats read_stats = {
+ .name = "read", .total = 0, .count = 0, .bytes = 0};
+static struct stats pread_stats = {
+ .name = "pread", .total = 0, .count = 0, .bytes = 0};
+static struct stats write_stats = {
+ .name = "write", .total = 0, .count = 0, .bytes = 0};
+static struct stats pwrite_stats = {
+ .name = "pwrite", .total = 0, .count = 0, .bytes = 0};
+static struct stats seek_stats = {
+ .name = "seek", .total = 0, .count = 0, .bytes = 0};
+
+/*
+ * Account one more sample in @ustats: bump the call counter and add
+ * @measurement and @bytes to the respective running sums.
+ */
+static void update_stats(struct stats * const ustats,
+			 unsigned long measurement, unsigned long bytes)
+{
+	ustats->count += 1;
+	ustats->bytes = ustats->bytes + bytes;
+	ustats->total = ustats->total + measurement;
+}
+
+/*
+ * Print one row of the statistics table: name, number of calls, average
+ * measurement per call and average byte count per call.  A counter that was
+ * never updated prints zero averages instead of dividing by zero.
+ */
+static void print_ustats(const struct stats * const ustats)
+{
+	float avg_total = 0, avg_bytes = 0;
+
+	if (ustats->count) {
+		avg_total = (float)ustats->total / (float)ustats->count;
+		avg_bytes = (float)ustats->bytes / (float)ustats->count;
+	}
+
+	/* count is an unsigned long, so it needs %lu rather than %d */
+	DEBUG_PRINT_L1("%s\t%7lu\t%7.0f\t%7.0f\n",
+		       ustats->name, ustats->count, avg_total, avg_bytes);
+}
+
+/* Print the table header followed by one row per syscall counter. */
+static void print_stats(void)
+{
+ DEBUG_PRINT_L1("\nSyscall\tCall\tAvgTime\tAvgSize\n"
+ "\tCount\t(ticks)\t(bytes)\n"
+ "-------------------------------\n");
+
+ print_ustats(&read_stats);
+ print_ustats(&pread_stats);
+ print_ustats(&write_stats);
+ print_ustats(&pwrite_stats);
+ print_ustats(&seek_stats);
+}
+
+/*
+ * Fill start_addr[]/end_addr[] with the "System RAM" ranges listed in
+ * /proc/iomem and set range_count accordingly.
+ *
+ * At most MAX_RANGES_SUPPORTED ranges are recorded; further ranges are
+ * ignored.  Lines that mention System RAM but contain no '-' separator are
+ * skipped.
+ *
+ * Returns 0 on success, -errno if /proc/iomem cannot be opened, or -1 if no
+ * range was found.
+ */
+static int build_memory_map(void)
+{
+	char line[256];
+	FILE *fp;
+	int i, err;
+
+	range_count = 0;
+
+	fp = fopen("/proc/iomem", "r");
+	if (!fp) {
+		/* save errno: fprintf()/strerror() are allowed to change it */
+		err = errno;
+		fprintf(stderr, "/proc/iomem: error %d: %s\n",
+			err, strerror(err));
+		return -err;
+	}
+
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		char *dash, *end_ptr;
+
+		if (!strstr(line, system_ram_str))
+			continue;
+
+		/* never write past the end of the fixed-size tables */
+		if (range_count >= MAX_RANGES_SUPPORTED) {
+			fprintf(stderr,
+				"More than %d RAM ranges found. Ignoring the rest.\n",
+				MAX_RANGES_SUPPORTED);
+			break;
+		}
+
+		/* Given a line like this:
+		 * d0400000-10ffaffff : System RAM
+		 * replace the "-" with a space
+		 */
+		dash = strchr(line, '-');
+		if (!dash)
+			continue;
+		dash[0] = ' ';
+
+		start_addr[range_count] = strtoull(line, &end_ptr, 16);
+		end_addr[range_count] = strtoull(end_ptr, NULL, 16);
+		range_count++;
+	}
+
+	fclose(fp);
+
+	DEBUG_PRINT_L1("RAM Ranges\n");
+	for (i = 0; i < range_count; i++)
+		DEBUG_PRINT_L1("\trange %d: 0x%llx\t- 0x%llx\n",
+			       i, start_addr[i], end_addr[i]);
+
+	if (range_count == 0) {
+		fprintf(stderr, "No valid address ranges found. Error.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * read() @buf_sz bytes from @fd into @buf, retrying after short reads.
+ *
+ * Every read() call is timed with RDTICK and accounted in read_stats with
+ * its own duration and its own byte count.  The loop stops early when a
+ * call transfers nothing, so a persistent zero-length read cannot hang the
+ * test; the caller then sees a short count.
+ *
+ * Returns the number of bytes read, or -errno if read() fails.
+ */
+static int read_adi(int fd, unsigned char *buf, int buf_sz)
+{
+	int ret, bytes_read = 0;
+	long start, end, elapsed_time = 0;
+
+	do {
+		RDTICK(start);
+		ret = read(fd, buf + bytes_read, buf_sz - bytes_read);
+		RDTICK(end);
+		if (ret < 0)
+			return -errno;
+
+		elapsed_time += end - start;
+		/* per-call figures; cumulative ones would be counted twice */
+		update_stats(&read_stats, end - start, ret);
+		bytes_read += ret;
+	} while (ret > 0 && bytes_read < buf_sz);
+
+	DEBUG_PRINT_T("\tread elapsed timed = %ld\n", elapsed_time);
+	DEBUG_PRINT_L3("\tRead %d bytes\n", bytes_read);
+
+	return bytes_read;
+}
+
+/*
+ * pread() @buf_sz bytes from @fd at @offset into @buf, retrying after short
+ * reads and advancing the offset by the amount already transferred.
+ *
+ * Every pread() call is timed with RDTICK and accounted in pread_stats with
+ * its own duration and byte count.  The loop stops early when a call
+ * transfers nothing, so the caller sees a short count instead of a hang.
+ *
+ * Returns the number of bytes read, or -errno if pread() fails.
+ */
+static int pread_adi(int fd, unsigned char *buf,
+		     int buf_sz, unsigned long offset)
+{
+	int ret, i, bytes_read = 0;
+	unsigned long cur_offset;
+	long start, end, elapsed_time = 0;
+
+	cur_offset = offset;
+	do {
+		RDTICK(start);
+		ret = pread(fd, buf + bytes_read, buf_sz - bytes_read,
+			    cur_offset);
+		RDTICK(end);
+		if (ret < 0)
+			return -errno;
+
+		elapsed_time += end - start;
+		/* per-call figures; cumulative ones would be counted twice */
+		update_stats(&pread_stats, end - start, ret);
+		bytes_read += ret;
+		cur_offset += ret;
+	} while (ret > 0 && bytes_read < buf_sz);
+
+	DEBUG_PRINT_T("\tpread elapsed timed = %ld\n", elapsed_time);
+	DEBUG_PRINT_L3("\tRead %d bytes starting at offset 0x%lx\n",
+		       bytes_read, offset);
+	for (i = 0; i < bytes_read; i++)
+		DEBUG_PRINT_L4("\t\t0x%lx\t%d\n", offset + i, buf[i]);
+
+	return bytes_read;
+}
+
+/*
+ * write() @buf_sz bytes from @buf to @fd, retrying after short writes.
+ *
+ * Every write() call is timed with RDTICK and accounted in write_stats with
+ * its own duration and byte count.  The loop stops early when a call
+ * transfers nothing, so the caller sees a short count instead of a hang.
+ *
+ * Returns the number of bytes written, or -errno if write() fails.
+ */
+static int write_adi(int fd, const unsigned char * const buf, int buf_sz)
+{
+	int ret, bytes_written = 0;
+	long start, end, elapsed_time = 0;
+
+	do {
+		RDTICK(start);
+		ret = write(fd, buf + bytes_written, buf_sz - bytes_written);
+		RDTICK(end);
+		if (ret < 0)
+			return -errno;
+
+		elapsed_time += (end - start);
+		/* per-call figures; cumulative ones would be counted twice */
+		update_stats(&write_stats, end - start, ret);
+		bytes_written += ret;
+	} while (ret > 0 && bytes_written < buf_sz);
+
+	DEBUG_PRINT_T("\twrite elapsed timed = %ld\n", elapsed_time);
+	DEBUG_PRINT_L3("\tWrote %d of %d bytes\n", bytes_written, buf_sz);
+
+	return bytes_written;
+}
+
+/*
+ * pwrite() @buf_sz bytes from @buf to @fd at @offset, retrying after short
+ * writes and advancing the offset by the amount already transferred.
+ *
+ * Every pwrite() call is timed with RDTICK and accounted in pwrite_stats
+ * with its own duration and byte count.  The loop stops early when a call
+ * transfers nothing, so the caller sees a short count instead of a hang.
+ *
+ * Returns the number of bytes written, or -errno if pwrite() fails (the
+ * error is also reported on stderr).
+ */
+static int pwrite_adi(int fd, const unsigned char * const buf,
+		      int buf_sz, unsigned long offset)
+{
+	int ret, err, bytes_written = 0;
+	unsigned long cur_offset;
+	long start, end, elapsed_time = 0;
+
+	cur_offset = offset;
+
+	do {
+		RDTICK(start);
+		ret = pwrite(fd, buf + bytes_written,
+			     buf_sz - bytes_written, cur_offset);
+		RDTICK(end);
+		if (ret < 0) {
+			/* save errno: fprintf() is allowed to change it */
+			err = errno;
+			fprintf(stderr, "pwrite(): error %d: %s\n",
+				err, strerror(err));
+			return -err;
+		}
+
+		elapsed_time += (end - start);
+		/* per-call figures; cumulative ones would be counted twice */
+		update_stats(&pwrite_stats, end - start, ret);
+		bytes_written += ret;
+		cur_offset += ret;
+	} while (ret > 0 && bytes_written < buf_sz);
+
+	DEBUG_PRINT_T("\tpwrite elapsed timed = %ld\n", elapsed_time);
+	DEBUG_PRINT_L3("\tWrote %d of %d bytes starting at address 0x%lx\n",
+		       bytes_written, buf_sz, offset);
+
+	return bytes_written;
+}
+
+/*
+ * Timed wrapper around lseek().  A successful seek is accounted in
+ * seek_stats; the lseek() result is returned unchanged either way.
+ */
+static off_t seek_adi(int fd, off_t offset, int whence)
+{
+	long start, end;
+	off_t ret;
+
+	RDTICK(start);
+	ret = lseek(fd, offset, whence);
+	RDTICK(end);
+	/* cast: off_t is not guaranteed to be long long, but %llx needs one */
+	DEBUG_PRINT_L2("\tlseek ret = 0x%llx\n", (unsigned long long)ret);
+	if (ret < 0)
+		goto out;
+
+	DEBUG_PRINT_T("\tlseek elapsed timed = %ld\n", end - start);
+	update_stats(&seek_stats, end - start, 0);
+
+out:
+	/*
+	 * NOTE(review): this SEEK_END call runs on both the success and the
+	 * failure path and its result is discarded.  It can overwrite errno,
+	 * which test4_lseek() inspects afterwards, and it would move the file
+	 * position if it ever succeeded.  Presumably the driver rejects
+	 * SEEK_END -- confirm before changing it.
+	 */
+	(void)lseek(fd, 0, SEEK_END);
+	return ret;
+}
+
+/*
+ * Write one random version byte with pwrite_adi() and read it back with
+ * pread_adi() at the same offset; the two values must match.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test0_prpw_aligned_1byte(int fd)
+{
+ /* somewhat arbitrarily chosen address */
+ unsigned long paddr =
+ (end_addr[range_count - 1] - 0x1000) & ~(ADI_BLKSZ - 1);
+ unsigned char version[1], expected_version;
+ loff_t offset;
+ int ret;
+
+ version[0] = random_version();
+ expected_version = version[0];
+
+ /* the file offset is the physical address divided by ADI_BLKSZ */
+ offset = paddr / ADI_BLKSZ;
+
+ ret = pwrite_adi(fd, version, sizeof(version), offset);
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ ret = pread_adi(fd, version, sizeof(version), offset);
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ if (expected_version != version[0]) {
+ DEBUG_PRINT_L2("\tExpected version %d but read version %d\n",
+ expected_version, version[0]);
+ /* ret still holds the non-zero byte count from pread_adi() */
+ TEST_STEP_FAILURE(-expected_version);
+ }
+
+ ret = 0;
+out:
+ RETURN_FROM_TEST(ret);
+}
+
+#define TEST1_VERSION_SZ 4096
+/*
+ * Write TEST1_VERSION_SZ random version bytes with pwrite_adi(), read them
+ * back with pread_adi() at the same offset and compare byte by byte.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test1_prpw_aligned_4096bytes(int fd)
+{
+	/* somewhat arbitrarily chosen address */
+	unsigned long paddr =
+		(end_addr[range_count - 1] - 0x6000) & ~(ADI_BLKSZ - 1);
+	unsigned char version[TEST1_VERSION_SZ],
+		expected_version[TEST1_VERSION_SZ];
+	loff_t offset;
+	int ret, i;
+
+	for (i = 0; i < TEST1_VERSION_SZ; i++) {
+		version[i] = random_version();
+		expected_version[i] = version[i];
+	}
+
+	/* the file offset is the physical address divided by ADI_BLKSZ */
+	offset = paddr / ADI_BLKSZ;
+
+	ret = pwrite_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	ret = pread_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	for (i = 0; i < TEST1_VERSION_SZ; i++) {
+		if (expected_version[i] != version[i]) {
+			/* report the element that actually mismatched */
+			DEBUG_PRINT_L2(
+				"\tExpected version %d but read version %d\n",
+				expected_version[i], version[i]);
+			/* ret still holds the non-zero pread_adi() count */
+			TEST_STEP_FAILURE(-expected_version[i]);
+		}
+	}
+
+	ret = 0;
+out:
+	RETURN_FROM_TEST(ret);
+}
+
+#define TEST2_VERSION_SZ 10327
+/*
+ * Write TEST2_VERSION_SZ random version bytes with pwrite_adi() near the
+ * start of the first RAM range, read them back with pread_adi() and compare
+ * byte by byte.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test2_prpw_aligned_10327bytes(int fd)
+{
+	/* somewhat arbitrarily chosen address */
+	unsigned long paddr =
+		(start_addr[0] + 0x6000) & ~(ADI_BLKSZ - 1);
+	unsigned char version[TEST2_VERSION_SZ],
+		expected_version[TEST2_VERSION_SZ];
+	loff_t offset;
+	int ret, i;
+
+	for (i = 0; i < TEST2_VERSION_SZ; i++) {
+		version[i] = random_version();
+		expected_version[i] = version[i];
+	}
+
+	/* the file offset is the physical address divided by ADI_BLKSZ */
+	offset = paddr / ADI_BLKSZ;
+
+	ret = pwrite_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	ret = pread_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	for (i = 0; i < TEST2_VERSION_SZ; i++) {
+		if (expected_version[i] != version[i]) {
+			/* report the element that actually mismatched */
+			DEBUG_PRINT_L2(
+				"\tExpected version %d but read version %d\n",
+				expected_version[i], version[i]);
+			/* ret still holds the non-zero pread_adi() count */
+			TEST_STEP_FAILURE(-expected_version[i]);
+		}
+	}
+
+	ret = 0;
+out:
+	RETURN_FROM_TEST(ret);
+}
+
+#define TEST3_VERSION_SZ 12541
+/*
+ * Write TEST3_VERSION_SZ random version bytes with pwrite_adi(), read them
+ * back with pread_adi() and compare byte by byte.  The physical address is
+ * deliberately pushed 17 bytes past an ADI_BLKSZ boundary.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test3_prpw_unaligned_12541bytes(int fd)
+{
+	/* somewhat arbitrarily chosen address */
+	unsigned long paddr =
+		((start_addr[0] + 0xC000) & ~(ADI_BLKSZ - 1)) + 17;
+	unsigned char version[TEST3_VERSION_SZ],
+		expected_version[TEST3_VERSION_SZ];
+	loff_t offset;
+	int ret, i;
+
+	for (i = 0; i < TEST3_VERSION_SZ; i++) {
+		version[i] = random_version();
+		expected_version[i] = version[i];
+	}
+
+	/*
+	 * NOTE(review): the integer division drops the 17-byte misalignment
+	 * added above, so the resulting offset is the same as for the aligned
+	 * address -- confirm whether that is intended.
+	 */
+	offset = paddr / ADI_BLKSZ;
+
+	ret = pwrite_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	ret = pread_adi(fd, version, sizeof(version), offset);
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	for (i = 0; i < TEST3_VERSION_SZ; i++) {
+		if (expected_version[i] != version[i]) {
+			/* report the element that actually mismatched */
+			DEBUG_PRINT_L2(
+				"\tExpected version %d but read version %d\n",
+				expected_version[i], version[i]);
+			/* ret still holds the non-zero pread_adi() count */
+			TEST_STEP_FAILURE(-expected_version[i]);
+		}
+	}
+
+	ret = 0;
+out:
+	RETURN_FROM_TEST(ret);
+}
+
+/*
+ * Exercise seek_adi() with SEEK_SET and SEEK_CUR and check the offsets it
+ * reports.  Each failing step returns a distinct negative code (-1 .. -5);
+ * 0 is returned when every step behaves as expected.
+ */
+static int test4_lseek(int fd)
+{
+#define OFFSET_ADD (0x100)
+#define OFFSET_SUBTRACT (0xFFFFFFF000000000)
+
+ off_t offset_out, offset_in;
+ int ret;
+
+
+ /* step 1: an absolute seek must report the requested offset */
+ offset_in = 0x123456789abcdef0;
+ offset_out = seek_adi(fd, offset_in, SEEK_SET);
+ if (offset_out != offset_in) {
+ ret = -1;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ /* seek to the current offset. this should return EINVAL */
+ offset_out = seek_adi(fd, offset_in, SEEK_SET);
+ if (offset_out < 0 && errno == EINVAL)
+ DEBUG_PRINT_L2(
+ "\tSEEK_SET failed as designed. Not an error\n");
+ else {
+ ret = -2;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ /* step 3: a zero-length relative seek must report the current offset */
+ offset_out = seek_adi(fd, 0, SEEK_CUR);
+ if (offset_out != offset_in) {
+ ret = -3;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ /* step 4: relative seek forwards by OFFSET_ADD */
+ offset_out = seek_adi(fd, OFFSET_ADD, SEEK_CUR);
+ if (offset_out != (offset_in + OFFSET_ADD)) {
+ ret = -4;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ /*
+ * step 5: relative seek by OFFSET_SUBTRACT.
+ * NOTE(review): the constant presumably acts as a negative off_t once
+ * converted for the call -- confirm.
+ */
+ offset_out = seek_adi(fd, OFFSET_SUBTRACT, SEEK_CUR);
+ if (offset_out != (offset_in + OFFSET_ADD + OFFSET_SUBTRACT)) {
+ ret = -5;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ ret = 0;
+out:
+ RETURN_FROM_TEST(ret);
+}
+
+/*
+ * Same round trip as test0, but using seek_adi() followed by write_adi()
+ * and read_adi() instead of the positional calls: seek, write one random
+ * version byte, seek back, read it and compare.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test5_rw_aligned_1byte(int fd)
+{
+ /* somewhat arbitrarily chosen address */
+ unsigned long paddr =
+ (end_addr[range_count - 1] - 0xF000) & ~(ADI_BLKSZ - 1);
+ unsigned char version, expected_version;
+ loff_t offset;
+ off_t oret;
+ int ret;
+
+ /* the file offset is the physical address divided by ADI_BLKSZ */
+ offset = paddr / ADI_BLKSZ;
+ version = expected_version = random_version();
+
+ oret = seek_adi(fd, offset, SEEK_SET);
+ if (oret != offset) {
+ ret = -1;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ ret = write_adi(fd, &version, sizeof(version));
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ /* go back to where the byte was written before reading it */
+ oret = seek_adi(fd, offset, SEEK_SET);
+ if (oret != offset) {
+ ret = -1;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ ret = read_adi(fd, &version, sizeof(version));
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ if (expected_version != version) {
+ DEBUG_PRINT_L2("\tExpected version %d but read version %d\n",
+ expected_version, version);
+ /* ret still holds the non-zero byte count from read_adi() */
+ TEST_STEP_FAILURE(-expected_version);
+ }
+
+ ret = 0;
+out:
+ RETURN_FROM_TEST(ret);
+}
+
+#define TEST6_VERSION_SZ 9434
+/*
+ * Seek, write TEST6_VERSION_SZ random version bytes with write_adi(), clear
+ * the buffer, seek back, read the bytes with read_adi() and compare them
+ * one by one.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test6_rw_aligned_9434bytes(int fd)
+{
+ /* somewhat arbitrarily chosen address */
+ unsigned long paddr =
+ (end_addr[range_count - 1] - 0x5F000) & ~(ADI_BLKSZ - 1);
+ unsigned char version[TEST6_VERSION_SZ],
+ expected_version[TEST6_VERSION_SZ];
+ loff_t offset;
+ off_t oret;
+ int ret, i;
+
+ /* the file offset is the physical address divided by ADI_BLKSZ */
+ offset = paddr / ADI_BLKSZ;
+ for (i = 0; i < TEST6_VERSION_SZ; i++)
+ version[i] = expected_version[i] = random_version();
+
+ oret = seek_adi(fd, offset, SEEK_SET);
+ if (oret != offset) {
+ ret = -1;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ ret = write_adi(fd, version, sizeof(version));
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ /* wipe the buffer so stale data cannot satisfy the comparison */
+ memset(version, 0, TEST6_VERSION_SZ);
+
+ oret = seek_adi(fd, offset, SEEK_SET);
+ if (oret != offset) {
+ ret = -1;
+ TEST_STEP_FAILURE(ret);
+ }
+
+ ret = read_adi(fd, version, sizeof(version));
+ if (ret != sizeof(version))
+ TEST_STEP_FAILURE(ret);
+
+ for (i = 0; i < TEST6_VERSION_SZ; i++) {
+ if (expected_version[i] != version[i]) {
+ DEBUG_PRINT_L2(
+ "\tExpected version %d but read version %d\n",
+ expected_version[i], version[i]);
+ /* ret still holds the non-zero read_adi() count */
+ TEST_STEP_FAILURE(-expected_version[i]);
+ }
+ }
+
+ ret = 0;
+out:
+ RETURN_FROM_TEST(ret);
+}
+
+#define TEST7_VERSION_SZ 14963
+/*
+ * Seek, write TEST7_VERSION_SZ random version bytes with write_adi(), clear
+ * the buffer, seek back, read the bytes with read_adi() and compare them
+ * one by one.  The physical address is pushed 39 bytes past an ADI_BLKSZ
+ * boundary.
+ * Returns 0 on success and a non-zero value on failure.
+ */
+static int test7_rw_aligned_14963bytes(int fd)
+{
+	/* somewhat arbitrarily chosen address */
+	unsigned long paddr =
+		((start_addr[range_count - 1] + 0xF000) & ~(ADI_BLKSZ - 1)) + 39;
+	unsigned char version[TEST7_VERSION_SZ],
+		expected_version[TEST7_VERSION_SZ];
+	loff_t offset;
+	off_t oret;
+	int ret, i;
+
+	/* the file offset is the physical address divided by ADI_BLKSZ */
+	offset = paddr / ADI_BLKSZ;
+	for (i = 0; i < TEST7_VERSION_SZ; i++) {
+		version[i] = random_version();
+		expected_version[i] = version[i];
+	}
+
+	oret = seek_adi(fd, offset, SEEK_SET);
+	if (oret != offset) {
+		ret = -1;
+		TEST_STEP_FAILURE(ret);
+	}
+
+	ret = write_adi(fd, version, sizeof(version));
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	/* wipe the buffer so stale data cannot satisfy the comparison */
+	memset(version, 0, TEST7_VERSION_SZ);
+
+	oret = seek_adi(fd, offset, SEEK_SET);
+	if (oret != offset) {
+		ret = -1;
+		TEST_STEP_FAILURE(ret);
+	}
+
+	ret = read_adi(fd, version, sizeof(version));
+	if (ret != sizeof(version))
+		TEST_STEP_FAILURE(ret);
+
+	/* (the former "paddr += ADI_BLKSZ" here was never read; dropped) */
+	for (i = 0; i < TEST7_VERSION_SZ; i++) {
+		if (expected_version[i] != version[i]) {
+			DEBUG_PRINT_L2(
+				"\tExpected version %d but read version %d\n",
+				expected_version[i], version[i]);
+			/* ret still holds the non-zero read_adi() count */
+			TEST_STEP_FAILURE(-expected_version[i]);
+		}
+	}
+
+	ret = 0;
+out:
+	RETURN_FROM_TEST(ret);
+}
+
+static int (*tests[])(int fd) = {
+ test0_prpw_aligned_1byte,
+ test1_prpw_aligned_4096bytes,
+ test2_prpw_aligned_10327bytes,
+ test3_prpw_unaligned_12541bytes,
+ test4_lseek,
+ test5_rw_aligned_1byte,
+ test6_rw_aligned_9434bytes,
+ test7_rw_aligned_14963bytes,
+};
+#define TEST_COUNT ARRAY_SIZE(tests)
+
+/*
+ * Build the RAM map, open /dev/adi and run every entry of tests[] against
+ * it, reporting each result through the kselftest helpers.  Statistics are
+ * printed at the end and the process exits through ksft_exit_fail() or
+ * ksft_exit_pass() depending on the failure count.
+ *
+ * Setup failures (no RAM map, /dev/adi cannot be opened) return a negative
+ * value directly.
+ */
+int main(int argc, char *argv[])
+{
+	int fd, ret, test, err;
+
+	ret = build_memory_map();
+	if (ret < 0)
+		return ret;
+
+	fd = open("/dev/adi", O_RDWR);
+	if (fd < 0) {
+		/* save errno: fprintf()/strerror() are allowed to change it */
+		err = errno;
+		fprintf(stderr, "open: error %d: %s\n",
+			err, strerror(err));
+		return -err;
+	}
+
+	/* TEST_COUNT is unsigned; cast to keep the comparison signed */
+	for (test = 0; test < (int)TEST_COUNT; test++) {
+		DEBUG_PRINT_L1("Running test #%d\n", test);
+
+		ret = (*tests[test])(fd);
+		if (ret != 0)
+			ksft_test_result_fail("Test #%d failed: error %d\n",
+					      test, ret);
+		else
+			ksft_test_result_pass("Test #%d passed\n", test);
+	}
+
+	print_stats();
+	close(fd);
+
+	if (ksft_get_fail_cnt() > 0)
+		ksft_exit_fail();
+	else
+		ksft_exit_pass();
+
+	/* it's impossible to get here, but the compiler throws a warning
+	 * about control reaching the end of non-void function. bah.
+	 */
+	return 0;
+}
diff --git a/tools/testing/selftests/sparc64/drivers/drivers_test.sh b/tools/testing/selftests/sparc64/drivers/drivers_test.sh
new file mode 100755
index 000000000..6d08273b7
--- /dev/null
+++ b/tools/testing/selftests/sparc64/drivers/drivers_test.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+SRC_TREE=../../../../
+
+test_run()
+{
+ if [ -f ${SRC_TREE}/drivers/char/adi.ko ]; then
+ insmod ${SRC_TREE}/drivers/char/adi.ko 2> /dev/null
+ if [ $? -ne 0 ]; then
+ rc=1
+ fi
+ else
+ # Use modprobe dry run to check for missing adi module
+ if ! /sbin/modprobe -q -n adi; then
+ echo "adi: [SKIP]"
+ elif /sbin/modprobe -q adi; then
+ echo "adi: ok"
+ else
+ echo "adi: [FAIL]"
+ rc=1
+ fi
+ fi
+ ./adi-test
+ rmmod adi 2> /dev/null
+}
+
+rc=0
+test_run
+exit $rc
diff --git a/tools/testing/selftests/sparc64/run.sh b/tools/testing/selftests/sparc64/run.sh
new file mode 100755
index 000000000..38ad61f93
--- /dev/null
+++ b/tools/testing/selftests/sparc64/run.sh
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+# Run the drivers test from inside its own directory.  "&&" makes a failed
+# cd fail the script instead of looking for drivers_test.sh in the wrong
+# directory.
+(cd drivers && ./drivers_test.sh)
diff --git a/tools/testing/selftests/splice/.gitignore b/tools/testing/selftests/splice/.gitignore
new file mode 100644
index 000000000..be8266f5d
--- /dev/null
+++ b/tools/testing/selftests/splice/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+default_file_splice_read
+splice_read
diff --git a/tools/testing/selftests/splice/Makefile b/tools/testing/selftests/splice/Makefile
new file mode 100644
index 000000000..541cd826d
--- /dev/null
+++ b/tools/testing/selftests/splice/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_PROGS := default_file_splice_read.sh short_splice_read.sh
+TEST_GEN_PROGS_EXTENDED := default_file_splice_read splice_read
+
+include ../lib.mk
diff --git a/tools/testing/selftests/splice/config b/tools/testing/selftests/splice/config
new file mode 100644
index 000000000..058c92836
--- /dev/null
+++ b/tools/testing/selftests/splice/config
@@ -0,0 +1 @@
+CONFIG_TEST_LKM=m
diff --git a/tools/testing/selftests/splice/default_file_splice_read.c b/tools/testing/selftests/splice/default_file_splice_read.c
new file mode 100644
index 000000000..a3c6e5672
--- /dev/null
+++ b/tools/testing/selftests/splice/default_file_splice_read.c
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <fcntl.h>
+
+/*
+ * Issue a single splice() of up to 1 GiB (1<<30 bytes) from fd 0 to fd 1.
+ * The splice() result is ignored and the program always returns 0.
+ */
+int main(int argc, char **argv)
+{
+ splice(0, 0, 1, 0, 1<<30, 0);
+ return 0;
+}
diff --git a/tools/testing/selftests/splice/default_file_splice_read.sh b/tools/testing/selftests/splice/default_file_splice_read.sh
new file mode 100755
index 000000000..490db5a2e
--- /dev/null
+++ b/tools/testing/selftests/splice/default_file_splice_read.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+n=`./default_file_splice_read </dev/null | wc -c`
+
+test "$n" = 0 && exit 0
+
+echo "default_file_splice_read broken: leaked $n"
+exit 1
diff --git a/tools/testing/selftests/splice/settings b/tools/testing/selftests/splice/settings
new file mode 100644
index 000000000..89cedfc0d
--- /dev/null
+++ b/tools/testing/selftests/splice/settings
@@ -0,0 +1 @@
+timeout=5
diff --git a/tools/testing/selftests/splice/short_splice_read.sh b/tools/testing/selftests/splice/short_splice_read.sh
new file mode 100755
index 000000000..22b6c8910
--- /dev/null
+++ b/tools/testing/selftests/splice/short_splice_read.sh
@@ -0,0 +1,133 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test for mishandling of splice() on pseudofilesystems, which should catch
+# bugs like 11990a5bd7e5 ("module: Correctly truncate sysfs sections output")
+#
+# Since splice fallback was removed as part of the set_fs() rework, many of these
+# tests expect to fail now. See https://lore.kernel.org/lkml/202009181443.C2179FB@keescook/
+set -e
+
+DIR=$(dirname "$0")
+
+ret=0
+
+expect_success()
+{
+ title="$1"
+ shift
+
+ echo "" >&2
+ echo "$title ..." >&2
+
+ set +e
+ "$@"
+ rc=$?
+ set -e
+
+ case "$rc" in
+ 0)
+ echo "ok: $title succeeded" >&2
+ ;;
+ 1)
+ echo "FAIL: $title should work" >&2
+ ret=$(( ret + 1 ))
+ ;;
+ *)
+ echo "FAIL: something else went wrong" >&2
+ ret=$(( ret + 1 ))
+ ;;
+ esac
+}
+
+expect_failure()
+{
+ title="$1"
+ shift
+
+ echo "" >&2
+ echo "$title ..." >&2
+
+ set +e
+ "$@"
+ rc=$?
+ set -e
+
+ case "$rc" in
+ 0)
+ echo "FAIL: $title unexpectedly worked" >&2
+ ret=$(( ret + 1 ))
+ ;;
+ 1)
+ echo "ok: $title correctly failed" >&2
+ ;;
+ *)
+ echo "FAIL: something else went wrong" >&2
+ ret=$(( ret + 1 ))
+ ;;
+ esac
+}
+
+do_splice()
+{
+ filename="$1"
+ bytes="$2"
+ expected="$3"
+ report="$4"
+
+ out=$("$DIR"/splice_read "$filename" "$bytes" | cat)
+ if [ "$out" = "$expected" ] ; then
+ echo " matched $report" >&2
+ return 0
+ else
+ echo " no match: '$out' vs $report" >&2
+ return 1
+ fi
+}
+
+test_splice()
+{
+ filename="$1"
+
+ echo " checking $filename ..." >&2
+
+ full=$(cat "$filename")
+ rc=$?
+ if [ $rc -ne 0 ] ; then
+ return 2
+ fi
+
+ two=$(echo "$full" | grep -m1 . | cut -c-2)
+
+ # Make sure full splice has the same contents as a standard read.
+ echo " splicing 4096 bytes ..." >&2
+ if ! do_splice "$filename" 4096 "$full" "full read" ; then
+ return 1
+ fi
+
+ # Make sure a partial splice see the first two characters.
+ echo " splicing 2 bytes ..." >&2
+ if ! do_splice "$filename" 2 "$two" "'$two'" ; then
+ return 1
+ fi
+
+ return 0
+}
+
+### /proc/$pid/ has no splice interface; these should all fail.
+expect_failure "proc_single_open(), seq_read() splice" test_splice /proc/$$/limits
+expect_failure "special open(), seq_read() splice" test_splice /proc/$$/comm
+
+### /proc/sys/ has a splice interface; these should all succeed.
+expect_success "proc_handler: proc_dointvec_minmax() splice" test_splice /proc/sys/fs/nr_open
+expect_success "proc_handler: proc_dostring() splice" test_splice /proc/sys/kernel/modprobe
+expect_success "proc_handler: special read splice" test_splice /proc/sys/kernel/version
+
+### /sys/ has no splice interface; these should all fail.
+# The sysfs files checked below belong to test_module, so load it first when
+# its sections directory is missing; the load itself is expected to succeed.
+if ! [ -d /sys/module/test_module/sections ] ; then
+ expect_success "test_module kernel module load" modprobe test_module
+fi
+expect_failure "kernfs attr splice" test_splice /sys/module/test_module/coresize
+expect_failure "kernfs binattr splice" test_splice /sys/module/test_module/sections/.init.text
+
+# Exit status is the number of checks that did not behave as expected.
+exit $ret
diff --git a/tools/testing/selftests/splice/splice_read.c b/tools/testing/selftests/splice/splice_read.c
new file mode 100644
index 000000000..46dae6a25
--- /dev/null
+++ b/tools/testing/selftests/splice/splice_read.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/*
+ * Usage: splice_read INPUT [BYTES]
+ *
+ * Open INPUT read-only and splice() it to stdout in a single call.  With
+ * exactly two arguments BYTES (a non-negative decimal number) is the splice
+ * length; otherwise the file size reported by fstat() is used, which must
+ * not exceed INT_MAX.
+ *
+ * Returns EXIT_SUCCESS when splice() does not report an error and
+ * EXIT_FAILURE on any usage, open, fstat or splice error.
+ */
+int main(int argc, char *argv[])
+{
+	int fd;
+	size_t size;
+	ssize_t spliced;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s INPUT [BYTES]\n", argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	fd = open(argv[1], O_RDONLY);
+	if (fd < 0) {
+		perror(argv[1]);
+		return EXIT_FAILURE;
+	}
+
+	if (argc == 3) {
+		char *end;
+		long bytes;
+
+		/*
+		 * Reject garbage and negative counts; a negative value
+		 * would otherwise turn into a huge size_t.
+		 */
+		errno = 0;
+		bytes = strtol(argv[2], &end, 10);
+		if (errno || end == argv[2] || *end != '\0' || bytes < 0) {
+			fprintf(stderr, "%s: Invalid byte count\n", argv[2]);
+			close(fd);
+			return EXIT_FAILURE;
+		}
+		size = bytes;
+	} else {
+		struct stat statbuf;
+
+		if (fstat(fd, &statbuf) < 0) {
+			perror(argv[1]);
+			close(fd);
+			return EXIT_FAILURE;
+		}
+
+		if (statbuf.st_size > INT_MAX) {
+			fprintf(stderr, "%s: Too big\n", argv[1]);
+			close(fd);
+			return EXIT_FAILURE;
+		}
+
+		size = statbuf.st_size;
+	}
+
+	/* splice(2) file to stdout. */
+	spliced = splice(fd, NULL, STDOUT_FILENO, NULL,
+			 size, SPLICE_F_MOVE);
+	if (spliced < 0) {
+		perror("splice");
+		close(fd);
+		return EXIT_FAILURE;
+	}
+
+	close(fd);
+	return EXIT_SUCCESS;
+}
diff --git a/tools/testing/selftests/static_keys/Makefile b/tools/testing/selftests/static_keys/Makefile
new file mode 100644
index 000000000..aa64104c7
--- /dev/null
+++ b/tools/testing/selftests/static_keys/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for static keys selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := test_static_keys.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/static_keys/config b/tools/testing/selftests/static_keys/config
new file mode 100644
index 000000000..d538fb774
--- /dev/null
+++ b/tools/testing/selftests/static_keys/config
@@ -0,0 +1 @@
+CONFIG_TEST_STATIC_KEYS=m
diff --git a/tools/testing/selftests/static_keys/test_static_keys.sh b/tools/testing/selftests/static_keys/test_static_keys.sh
new file mode 100755
index 000000000..fc9f8cde7
--- /dev/null
+++ b/tools/testing/selftests/static_keys/test_static_keys.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Runs static keys kernel module tests
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+if ! /sbin/modprobe -q -n test_static_key_base; then
+ echo "static_key: module test_static_key_base is not found [SKIP]"
+ exit $ksft_skip
+fi
+
+if ! /sbin/modprobe -q -n test_static_keys; then
+ echo "static_key: module test_static_keys is not found [SKIP]"
+ exit $ksft_skip
+fi
+
+if /sbin/modprobe -q test_static_key_base; then
+ if /sbin/modprobe -q test_static_keys; then
+ echo "static_key: ok"
+ /sbin/modprobe -q -r test_static_keys
+ /sbin/modprobe -q -r test_static_key_base
+ else
+ echo "static_keys: [FAIL]"
+ /sbin/modprobe -q -r test_static_key_base
+ fi
+else
+ echo "static_key: [FAIL]"
+ exit 1
+fi
diff --git a/tools/testing/selftests/sync/.gitignore b/tools/testing/selftests/sync/.gitignore
new file mode 100644
index 000000000..f11523577
--- /dev/null
+++ b/tools/testing/selftests/sync/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+sync_test
diff --git a/tools/testing/selftests/sync/Makefile b/tools/testing/selftests/sync/Makefile
new file mode 100644
index 000000000..d0121a8a3
--- /dev/null
+++ b/tools/testing/selftests/sync/Makefile
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -O2 -g -std=gnu89 -pthread -Wall -Wextra
+CFLAGS += -I../../../../usr/include/
+LDFLAGS += -pthread
+
+.PHONY: all clean
+
+include ../lib.mk
+
+# lib.mk TEST_CUSTOM_PROGS var is for custom tests that need special
+# build rules. lib.mk will run and install them.
+
+TEST_CUSTOM_PROGS := $(OUTPUT)/sync_test
+all: $(TEST_CUSTOM_PROGS)
+
+OBJS = sync_test.o sync.o
+
+TESTS += sync_alloc.o
+TESTS += sync_fence.o
+TESTS += sync_merge.o
+TESTS += sync_wait.o
+TESTS += sync_stress_parallelism.o
+TESTS += sync_stress_consumer.o
+TESTS += sync_stress_merge.o
+
+OBJS := $(patsubst %,$(OUTPUT)/%,$(OBJS))
+TESTS := $(patsubst %,$(OUTPUT)/%,$(TESTS))
+
+# Link the single test binary from the helper objects and all test objects.
+$(TEST_CUSTOM_PROGS): $(TESTS) $(OBJS)
+ $(CC) -o $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS) $(CFLAGS) $(LDFLAGS)
+
+# Helper objects are compiled with the full CFLAGS.
+$(OBJS): $(OUTPUT)/%.o: %.c
+ $(CC) -c $^ -o $@ $(CFLAGS)
+
+# NOTE(review): unlike the OBJS rule above, the test objects are compiled
+# without $(CFLAGS), so they get neither the warning flags nor the
+# usr/include search path.  Unclear from here whether that is deliberate --
+# confirm before changing.
+$(TESTS): $(OUTPUT)/%.o: %.c
+ $(CC) -c $^ -o $@
+
+# Generated files handed to the clean logic through EXTRA_CLEAN.
+EXTRA_CLEAN := $(TEST_CUSTOM_PROGS) $(OBJS) $(TESTS)
diff --git a/tools/testing/selftests/sync/config b/tools/testing/selftests/sync/config
new file mode 100644
index 000000000..1ab7e8130
--- /dev/null
+++ b/tools/testing/selftests/sync/config
@@ -0,0 +1,4 @@
+CONFIG_STAGING=y
+CONFIG_ANDROID=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
diff --git a/tools/testing/selftests/sync/sw_sync.h b/tools/testing/selftests/sync/sw_sync.h
new file mode 100644
index 000000000..e2cfc6bad
--- /dev/null
+++ b/tools/testing/selftests/sync/sw_sync.h
@@ -0,0 +1,46 @@
+/*
+ * sw_sync abstraction
+ *
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2013 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SELFTESTS_SW_SYNC_H
+#define SELFTESTS_SW_SYNC_H
+
+/*
+ * sw_sync is mainly intended for testing and should not be compiled into
+ * production kernels
+ */
+
+int sw_sync_timeline_create(void);
+int sw_sync_timeline_is_valid(int fd);
+int sw_sync_timeline_inc(int fd, unsigned int count);
+void sw_sync_timeline_destroy(int fd);
+
+int sw_sync_fence_create(int fd, const char *name, unsigned int value);
+int sw_sync_fence_is_valid(int fd);
+void sw_sync_fence_destroy(int fd);
+
+#endif
diff --git a/tools/testing/selftests/sync/sync.c b/tools/testing/selftests/sync/sync.c
new file mode 100644
index 000000000..7741c0518
--- /dev/null
+++ b/tools/testing/selftests/sync/sync.c
@@ -0,0 +1,221 @@
+/*
+ * sync / sw_sync abstraction
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <fcntl.h>
+#include <malloc.h>
+#include <poll.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "sync.h"
+#include "sw_sync.h"
+
+#include <linux/sync_file.h>
+
+
+/* SW_SYNC ioctls */
+/*
+ * Argument block for SW_SYNC_IOC_CREATE_FENCE, as filled in by
+ * sw_sync_fence_create() below.
+ */
+struct sw_sync_create_fence_data {
+ /* in: sync point value requested by the caller */
+ __u32 value;
+ /* in: fence name, always NUL-terminated by sw_sync_fence_create() */
+ char name[32];
+ /* out: read back after the ioctl and returned as the fence fd */
+ __s32 fence;
+};
+
+#define SW_SYNC_IOC_MAGIC 'W'
+/* Issued on a timeline fd by sw_sync_fence_create() */
+#define SW_SYNC_IOC_CREATE_FENCE _IOWR(SW_SYNC_IOC_MAGIC, 0,\
+ struct sw_sync_create_fence_data)
+/* Issued on a timeline fd by sw_sync_timeline_inc() with a __u32 count */
+#define SW_SYNC_IOC_INC _IOW(SW_SYNC_IOC_MAGIC, 1, __u32)
+
+
+/*
+ * Poll a fence file descriptor for POLLIN or POLLERR.
+ *
+ * timeout is handed to poll() as-is and poll()'s return value is passed
+ * back unchanged; the tests in this suite treat > 0 as "signaled" and 0
+ * as "timed out".
+ */
+int sync_wait(int fd, int timeout)
+{
+	struct pollfd pfd = {
+		.fd = fd,
+		.events = POLLIN | POLLERR,
+	};
+
+	return poll(&pfd, 1, timeout);
+}
+
+/*
+ * Merge the fences behind fd1 and fd2 with SYNC_IOC_MERGE (issued on fd1).
+ *
+ * name is truncated to fit sync_merge_data.name and is always
+ * NUL-terminated.  Returns the merged fence fd, or the negative ioctl()
+ * result on failure.
+ */
+int sync_merge(const char *name, int fd1, int fd2)
+{
+	struct sync_merge_data merge = {};
+	const size_t last = sizeof(merge.name) - 1;
+	int rc;
+
+	strncpy(merge.name, name, last);
+	merge.name[last] = '\0';
+	merge.fd2 = fd2;
+
+	rc = ioctl(fd1, SYNC_IOC_MERGE, &merge);
+
+	return rc < 0 ? rc : merge.fence;
+}
+
+/*
+ * Fetch the sync_file_info for fd, including its per-fence array.
+ *
+ * SYNC_IOC_FILE_INFO is issued twice: first on a zeroed structure to learn
+ * num_fences, then again with a freshly allocated sync_fence_info array of
+ * that size attached.  When num_fences is 0 only the first call is made.
+ *
+ * Returns a heap-allocated structure to be released with
+ * sync_file_info_free(), or NULL on allocation or ioctl failure.
+ */
+static struct sync_file_info *sync_file_info(int fd)
+{
+	struct sync_file_info *file_info;
+	struct sync_fence_info *fences;
+	int nfences;
+
+	file_info = calloc(1, sizeof(*file_info));
+	if (!file_info)
+		return NULL;
+
+	if (ioctl(fd, SYNC_IOC_FILE_INFO, file_info) < 0)
+		goto err_free_info;
+
+	nfences = file_info->num_fences;
+	if (!nfences)
+		return file_info;
+
+	file_info->flags = 0;
+	file_info->num_fences = nfences;
+
+	fences = calloc(nfences, sizeof(*fences));
+	if (fences == NULL)
+		goto err_free_info;
+
+	/* the structure carries the array address as a 64-bit integer */
+	file_info->sync_fence_info = (uint64_t)(unsigned long)fences;
+
+	if (ioctl(fd, SYNC_IOC_FILE_INFO, file_info) < 0) {
+		free(fences);
+		goto err_free_info;
+	}
+
+	return file_info;
+
+err_free_info:
+	free(file_info);
+	return NULL;
+}
+
+/*
+ * Release a structure obtained from sync_file_info(): first the fence
+ * array whose address is stored in sync_fence_info, then the structure.
+ */
+static void sync_file_info_free(struct sync_file_info *info)
+{
+	void *fences = (void *)(unsigned long)info->sync_fence_info;
+
+	free(fences);
+	free(info);
+}
+
+/*
+ * Return the number of fences reported for fd, or 0 when the file info
+ * cannot be retrieved.
+ */
+int sync_fence_size(int fd)
+{
+	struct sync_file_info *info;
+	int nfences;
+
+	info = sync_file_info(fd);
+	if (info == NULL)
+		return 0;
+
+	nfences = info->num_fences;
+	sync_file_info_free(info);
+
+	return nfences;
+}
+
+/*
+ * Count the fences behind fd whose status field equals status (one of the
+ * FENCE_STATUS_* values).  Returns -1 when the file info cannot be
+ * retrieved.
+ */
+int sync_fence_count_with_status(int fd, int status)
+{
+	struct sync_file_info *info = sync_file_info(fd);
+	struct sync_fence_info *fences;
+	unsigned int idx, matches = 0;
+
+	if (info == NULL)
+		return -1;
+
+	fences = (struct sync_fence_info *)(unsigned long)info->sync_fence_info;
+
+	idx = 0;
+	while (idx < info->num_fences) {
+		if (fences[idx].status == status)
+			matches++;
+		idx++;
+	}
+
+	sync_file_info_free(info);
+
+	return matches;
+}
+
+/*
+ * Create a timeline by opening the sw_sync debugfs node read/write.
+ * Returns the open() result: a file descriptor, or -1 on failure.
+ */
+int sw_sync_timeline_create(void)
+{
+	int timeline_fd;
+
+	timeline_fd = open("/sys/kernel/debug/sync/sw_sync", O_RDWR);
+
+	return timeline_fd;
+}
+
+/*
+ * Advance timeline fd by count using SW_SYNC_IOC_INC.
+ * Returns the ioctl() result.
+ */
+int sw_sync_timeline_inc(int fd, unsigned int count)
+{
+	__u32 delta = count;
+	int rc;
+
+	rc = ioctl(fd, SW_SYNC_IOC_INC, &delta);
+
+	return rc;
+}
+
+/*
+ * Report whether fd refers to an open descriptor: returns 0 for -1 or when
+ * fcntl(F_GETFD) fails, 1 otherwise.
+ */
+int sw_sync_timeline_is_valid(int fd)
+{
+	if (fd == -1)
+		return 0;
+
+	return fcntl(fd, F_GETFD, 0) >= 0;
+}
+
+/* Close timeline fd; descriptors that fail the validity check are ignored. */
+void sw_sync_timeline_destroy(int fd)
+{
+	if (!sw_sync_timeline_is_valid(fd))
+		return;
+
+	close(fd);
+}
+
+/*
+ * Create a fence on timeline fd at sync point value.
+ *
+ * name is truncated to fit the 32-byte name field and is always
+ * NUL-terminated.  Returns the new fence fd, or the negative ioctl()
+ * result on failure.
+ */
+int sw_sync_fence_create(int fd, const char *name, unsigned int value)
+{
+	struct sw_sync_create_fence_data req = {};
+	const size_t last = sizeof(req.name) - 1;
+	int rc;
+
+	strncpy(req.name, name, last);
+	req.name[last] = '\0';
+	req.value = value;
+
+	rc = ioctl(fd, SW_SYNC_IOC_CREATE_FENCE, &req);
+
+	return rc < 0 ? rc : req.fence;
+}
+
+/*
+ * A fence fd is checked exactly like a timeline fd, so this defers to
+ * sw_sync_timeline_is_valid().
+ */
+int sw_sync_fence_is_valid(int fd)
+{
+	int valid = sw_sync_timeline_is_valid(fd);
+
+	return valid;
+}
+
+/* Close fence fd; descriptors that fail the validity check are ignored. */
+void sw_sync_fence_destroy(int fd)
+{
+	if (!sw_sync_fence_is_valid(fd))
+		return;
+
+	close(fd);
+}
diff --git a/tools/testing/selftests/sync/sync.h b/tools/testing/selftests/sync/sync.h
new file mode 100644
index 000000000..fb7156148
--- /dev/null
+++ b/tools/testing/selftests/sync/sync.h
@@ -0,0 +1,40 @@
+/*
+ * sync abstraction
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SELFTESTS_SYNC_H
+#define SELFTESTS_SYNC_H
+
+/*
+ * Values compared against sync_fence_info.status by
+ * sync_fence_count_with_status().
+ */
+#define FENCE_STATUS_ERROR (-1)
+#define FENCE_STATUS_ACTIVE (0)
+#define FENCE_STATUS_SIGNALED (1)
+
+/* poll() fd for POLLIN/POLLERR with the given timeout; returns poll()'s result. */
+int sync_wait(int fd, int timeout);
+/* Merge fd1 and fd2 into a new fence fd; negative ioctl() result on failure. */
+int sync_merge(const char *name, int fd1, int fd2);
+/* Number of fences reported for fd; 0 if the info cannot be retrieved. */
+int sync_fence_size(int fd);
+/* Number of fences behind fd with the given status; -1 on failure. */
+int sync_fence_count_with_status(int fd, int status);
+
+#endif
diff --git a/tools/testing/selftests/sync/sync_alloc.c b/tools/testing/selftests/sync/sync_alloc.c
new file mode 100644
index 000000000..66a28afc0
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_alloc.c
@@ -0,0 +1,74 @@
+/*
+ * sync allocation tests
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/* A freshly created timeline must be a valid descriptor. */
+int test_alloc_timeline(void)
+{
+	int tl = sw_sync_timeline_create();
+	int ok = sw_sync_timeline_is_valid(tl);
+
+	ASSERT(ok, "Failure allocating timeline\n");
+
+	sw_sync_timeline_destroy(tl);
+
+	return 0;
+}
+
+/* A fence created on a valid timeline must itself be a valid descriptor. */
+int test_alloc_fence(void)
+{
+	int tl, fd;
+
+	tl = sw_sync_timeline_create();
+	ASSERT(sw_sync_timeline_is_valid(tl), "Failure allocating timeline\n");
+
+	fd = sw_sync_fence_create(tl, "allocFence", 1);
+	ASSERT(sw_sync_fence_is_valid(fd), "Failure allocating fence\n");
+
+	sw_sync_fence_destroy(fd);
+	sw_sync_timeline_destroy(tl);
+
+	return 0;
+}
+
+/*
+ * Creating a fence on an invalid timeline descriptor (-1) must fail.
+ *
+ * The timeline is validated with sw_sync_timeline_is_valid(), like every
+ * other test here, rather than with "timeline > 0": descriptor 0 is a
+ * legal open() result and must not be reported as an allocation failure.
+ */
+int test_alloc_fence_negative(void)
+{
+	int fence, timeline, valid;
+
+	timeline = sw_sync_timeline_create();
+	valid = sw_sync_timeline_is_valid(timeline);
+	ASSERT(valid, "Failure allocating timeline\n");
+
+	fence = sw_sync_fence_create(-1, "fence", 1);
+	ASSERT(fence < 0, "Success allocating negative fence\n");
+
+	/* no-op for the expected negative fd; kept for symmetry */
+	sw_sync_fence_destroy(fence);
+	sw_sync_timeline_destroy(timeline);
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/sync_fence.c b/tools/testing/selftests/sync/sync_fence.c
new file mode 100644
index 000000000..13f175287
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_fence.c
@@ -0,0 +1,132 @@
+/*
+ * sync fence tests with one timeline
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/*
+ * One timeline, one fence at sync point 5: waiting must time out until the
+ * timeline has been advanced to 5, and must keep succeeding afterwards.
+ */
+int test_fence_one_timeline_wait(void)
+{
+	int tl, fd, rc;
+
+	tl = sw_sync_timeline_create();
+	ASSERT(sw_sync_timeline_is_valid(tl), "Failure allocating timeline\n");
+
+	fd = sw_sync_fence_create(tl, "allocFence", 5);
+	ASSERT(sw_sync_fence_is_valid(fd), "Failure allocating fence\n");
+
+	/* Timeline at 0: a zero-timeout wait must report a timeout */
+	rc = sync_wait(fd, 0);
+	ASSERT(rc == 0, "Failure waiting on fence until timeout\n");
+
+	/* Timeline 0 -> 1: still short of the fence */
+	rc = sw_sync_timeline_inc(tl, 1);
+	ASSERT(rc == 0, "Failure advancing timeline\n");
+
+	rc = sync_wait(fd, 0);
+	ASSERT(rc == 0, "Failure waiting on fence until timeout\n");
+
+	/* Timeline 1 -> 5: this signals the fence */
+	rc = sw_sync_timeline_inc(tl, 4);
+	ASSERT(rc == 0, "Failure signaling the fence\n");
+
+	rc = sync_wait(fd, 0);
+	ASSERT(rc > 0, "Failure waiting on fence\n");
+
+	/* Overshoot the sync point; the wait must still succeed */
+	rc = sw_sync_timeline_inc(tl, 10);
+	ASSERT(rc == 0, "Failure going further\n");
+
+	rc = sync_wait(fd, 0);
+	ASSERT(rc > 0, "Failure waiting ahead\n");
+
+	sw_sync_fence_destroy(fd);
+	sw_sync_timeline_destroy(tl);
+
+	return 0;
+}
+
+/*
+ * Create fences a, b, c at sync points 1, 2, 3 of one timeline, merge them
+ * into d, and check that d only signals once the timeline reaches the
+ * highest of the three points.
+ *
+ * Fixes relative to the previous version:
+ *  - the "one active point" checks queried fence a four times; they now
+ *    query a, b, c and d as their messages say;
+ *  - the intermediate (b, a) merge fd is kept in its own variable and
+ *    closed instead of being overwritten and leaked.
+ */
+int test_fence_one_timeline_merge(void)
+{
+	int a, b, c, d, tmp, valid;
+	int timeline = sw_sync_timeline_create();
+
+	/* create fence a,b,c and then merge them all into fence d */
+	a = sw_sync_fence_create(timeline, "allocFence", 1);
+	b = sw_sync_fence_create(timeline, "allocFence", 2);
+	c = sw_sync_fence_create(timeline, "allocFence", 3);
+
+	valid = sw_sync_fence_is_valid(a) &&
+		sw_sync_fence_is_valid(b) &&
+		sw_sync_fence_is_valid(c);
+	ASSERT(valid, "Failure allocating fences\n");
+
+	tmp = sync_merge("mergeFence", b, a);
+	d = sync_merge("mergeFence", c, tmp);
+	/* d is a separate fd; the intermediate one is no longer needed */
+	sw_sync_fence_destroy(tmp);
+	valid = sw_sync_fence_is_valid(d);
+	ASSERT(valid, "Failure merging fences\n");
+
+	/* confirm all fences have one active point (even d) */
+	ASSERT(sync_fence_count_with_status(a, FENCE_STATUS_ACTIVE) == 1,
+	       "a has too many active fences!\n");
+	ASSERT(sync_fence_count_with_status(b, FENCE_STATUS_ACTIVE) == 1,
+	       "b has too many active fences!\n");
+	ASSERT(sync_fence_count_with_status(c, FENCE_STATUS_ACTIVE) == 1,
+	       "c has too many active fences!\n");
+	ASSERT(sync_fence_count_with_status(d, FENCE_STATUS_ACTIVE) == 1,
+	       "d has too many active fences!\n");
+
+	/* confirm that d is not signaled until the max of a,b,c */
+	sw_sync_timeline_inc(timeline, 1);
+	ASSERT(sync_fence_count_with_status(a, FENCE_STATUS_SIGNALED) == 1,
+	       "a did not signal!\n");
+	ASSERT(sync_fence_count_with_status(d, FENCE_STATUS_ACTIVE) == 1,
+	       "d signaled too early!\n");
+
+	sw_sync_timeline_inc(timeline, 1);
+	ASSERT(sync_fence_count_with_status(b, FENCE_STATUS_SIGNALED) == 1,
+	       "b did not signal!\n");
+	ASSERT(sync_fence_count_with_status(d, FENCE_STATUS_ACTIVE) == 1,
+	       "d signaled too early!\n");
+
+	sw_sync_timeline_inc(timeline, 1);
+	ASSERT(sync_fence_count_with_status(c, FENCE_STATUS_SIGNALED) == 1,
+	       "c did not signal!\n");
+	ASSERT(sync_fence_count_with_status(d, FENCE_STATUS_ACTIVE) == 0 &&
+	       sync_fence_count_with_status(d, FENCE_STATUS_SIGNALED) == 1,
+	       "d did not signal!\n");
+
+	sw_sync_fence_destroy(d);
+	sw_sync_fence_destroy(c);
+	sw_sync_fence_destroy(b);
+	sw_sync_fence_destroy(a);
+	sw_sync_timeline_destroy(timeline);
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/sync_merge.c b/tools/testing/selftests/sync/sync_merge.c
new file mode 100644
index 000000000..8914d4339
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_merge.c
@@ -0,0 +1,60 @@
+/*
+ * sync fence merge tests
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/*
+ * Merge a fence with itself and check that the merged fence signals
+ * exactly when the timeline reaches the original sync point (5).
+ *
+ * The validity check after sync_merge() now inspects the merged fd; it
+ * used to re-check the source fence, so a failed merge went unnoticed at
+ * that point.
+ */
+int test_fence_merge_same_fence(void)
+{
+	int fence, valid, merged;
+	int timeline = sw_sync_timeline_create();
+
+	valid = sw_sync_timeline_is_valid(timeline);
+	ASSERT(valid, "Failure allocating timeline\n");
+
+	fence = sw_sync_fence_create(timeline, "allocFence", 5);
+	valid = sw_sync_fence_is_valid(fence);
+	ASSERT(valid, "Failure allocating fence\n");
+
+	merged = sync_merge("mergeFence", fence, fence);
+	valid = sw_sync_fence_is_valid(merged);
+	ASSERT(valid, "Failure merging fence\n");
+
+	ASSERT(sync_fence_count_with_status(merged, FENCE_STATUS_SIGNALED) == 0,
+	       "fence signaled too early!\n");
+
+	sw_sync_timeline_inc(timeline, 5);
+	ASSERT(sync_fence_count_with_status(merged, FENCE_STATUS_SIGNALED) == 1,
+	       "fence did not signal!\n");
+
+	sw_sync_fence_destroy(merged);
+	sw_sync_fence_destroy(fence);
+	sw_sync_timeline_destroy(timeline);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/sync_stress_consumer.c b/tools/testing/selftests/sync/sync_stress_consumer.c
new file mode 100644
index 000000000..d9eff8d52
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_stress_consumer.c
@@ -0,0 +1,185 @@
+/*
+ * sync stress test: producer/consumer
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/* IMPORTANT NOTE: if you see this test failing on your system, it may be
+ * due to a shortage of file descriptors. Please ensure your system has
+ * a sensible limit for this test to finish correctly.
+ */
+
+/*
+ * Spin until no fence behind the given fd is reported as active.
+ * Returns 1 on error, 0 on success.
+ */
+static int busy_wait_on_fence(int fence)
+{
+	int nerr, nactive;
+
+	for (;;) {
+		nerr = sync_fence_count_with_status(fence, FENCE_STATUS_ERROR);
+		ASSERT(nerr == 0, "Error occurred on fence\n");
+
+		nactive = sync_fence_count_with_status(fence,
+						       FENCE_STATUS_ACTIVE);
+		if (!nactive)
+			break;
+	}
+
+	return 0;
+}
+
+/*
+ * State shared between the consumer and the producer threads of the
+ * multi-producer/single-consumer stress test.
+ */
+static struct {
+ /* number of rounds each producer and the consumer run */
+ int iterations;
+ /* number of producer threads */
+ int threads;
+ /* bumped once per round by every producer, under lock */
+ int counter;
+ /* timeline the consumer advances to release the producers */
+ int consumer_timeline;
+ /* one timeline per producer, indexed by producer id */
+ int *producer_timelines;
+ /* held by producers while incrementing counter */
+ pthread_mutex_t lock;
+} test_data_mpsc;
+
+/*
+ * Producer body; d carries the producer id (an integer cast to a pointer).
+ *
+ * In round i the producer waits on a fence at point i of the consumer
+ * timeline, bumps the shared counter under the lock, and then advances
+ * its own timeline by one.
+ */
+static int mpsc_producer_thread(void *d)
+{
+	int producer_id = (long)d;
+	int *timelines = test_data_mpsc.producer_timelines;
+	int consumer_tl = test_data_mpsc.consumer_timeline;
+	int rounds = test_data_mpsc.iterations;
+	int round, fd, ok;
+
+	for (round = 0; round < rounds; round++) {
+		fd = sw_sync_fence_create(consumer_tl, "fence", round);
+		ok = sw_sync_fence_is_valid(fd);
+		ASSERT(ok, "Failure creating fence\n");
+
+		/*
+		 * Wait for the consumer to finish; the waiting method is
+		 * selected from the iteration count and the producer id.
+		 */
+		if ((rounds + producer_id) % 8 == 0) {
+			ASSERT(busy_wait_on_fence(fd) == 0,
+			       "Failure waiting on fence\n");
+		} else {
+			ASSERT(sync_wait(fd, -1) > 0,
+			       "Failure waiting on fence\n");
+		}
+
+		/* Each producer contributes one increment per round */
+		pthread_mutex_lock(&test_data_mpsc.lock);
+		test_data_mpsc.counter += 1;
+		pthread_mutex_unlock(&test_data_mpsc.lock);
+
+		ASSERT(sw_sync_timeline_inc(timelines[producer_id], 1) == 0,
+		       "Error advancing producer timeline\n");
+
+		sw_sync_fence_destroy(fd);
+	}
+
+	return 0;
+}
+
+/*
+ * Consumer body, run on the calling thread.
+ *
+ * In round `round` it merges one fence per producer timeline at point
+ * `round`, waits for the merged fence, checks that the shared counter
+ * equals producers * round, and then advances the consumer timeline by
+ * one to release the producers.
+ */
+static int mpcs_consumer_thread(void)
+{
+	int *timelines = test_data_mpsc.producer_timelines;
+	int consumer_tl = test_data_mpsc.consumer_timeline;
+	int rounds = test_data_mpsc.iterations;
+	int nproducers = test_data_mpsc.threads;
+	int round, p, acc, extra, joined, ok;
+
+	for (round = 1; round <= rounds; round++) {
+		acc = sw_sync_fence_create(timelines[0], "name", round);
+
+		/* fold the remaining producers' fences into acc */
+		for (p = 1; p < nproducers; p++) {
+			extra = sw_sync_fence_create(timelines[p],
+						     "name", round);
+			joined = sync_merge("name", extra, acc);
+			sw_sync_fence_destroy(extra);
+			sw_sync_fence_destroy(acc);
+			acc = joined;
+		}
+
+		ok = sw_sync_fence_is_valid(acc);
+		ASSERT(ok, "Failure merging fences\n");
+
+		/*
+		 * Make sure we see an increment from every producer thread.
+		 * NOTE(review): the selector uses the total iteration count,
+		 * which is constant across rounds, so the same wait method
+		 * is taken every round - confirm whether the round number
+		 * was intended.
+		 */
+		if (rounds % 8 == 0) {
+			ASSERT(busy_wait_on_fence(acc) == 0,
+			       "Producers did not increment as expected\n");
+		} else {
+			ASSERT(sync_wait(acc, -1) > 0,
+			       "Producers did not increment as expected\n");
+		}
+
+		ASSERT(test_data_mpsc.counter == nproducers * round,
+		       "Counter value mismatch!\n");
+
+		/* Let the producers start their next round */
+		ASSERT(sw_sync_timeline_inc(consumer_tl, 1) == 0,
+		       "Failure releasing producer threads\n");
+
+		sw_sync_fence_destroy(acc);
+	}
+
+	return 0;
+}
+
+/*
+ * Stress test: n producer threads and one consumer (this thread) hand
+ * control back and forth through sw_sync timelines for `iterations`
+ * rounds.
+ *
+ * Returns the consumer's result (0 on success).  The timelines and the
+ * mutex created here are now released once every producer has been
+ * joined; they used to be left open.
+ */
+int test_consumer_stress_multi_producer_single_consumer(void)
+{
+	int iterations = 1 << 12;
+	int n = 5;
+	long i, ret;
+	int producer_timelines[n];
+	int consumer_timeline;
+	pthread_t threads[n];
+
+	consumer_timeline = sw_sync_timeline_create();
+	for (i = 0; i < n; i++)
+		producer_timelines[i] = sw_sync_timeline_create();
+
+	/* publish the shared state before any producer starts */
+	test_data_mpsc.producer_timelines = producer_timelines;
+	test_data_mpsc.consumer_timeline = consumer_timeline;
+	test_data_mpsc.iterations = iterations;
+	test_data_mpsc.threads = n;
+	test_data_mpsc.counter = 0;
+	pthread_mutex_init(&test_data_mpsc.lock, NULL);
+
+	for (i = 0; i < n; i++) {
+		/* the producer id travels in the thread argument */
+		pthread_create(&threads[i], NULL, (void * (*)(void *))
+			       mpsc_producer_thread, (void *)i);
+	}
+
+	/* Consumer thread runs here */
+	ret = mpcs_consumer_thread();
+
+	for (i = 0; i < n; i++)
+		pthread_join(threads[i], NULL);
+
+	/* nothing uses the shared state any more: release it */
+	pthread_mutex_destroy(&test_data_mpsc.lock);
+	for (i = 0; i < n; i++)
+		sw_sync_timeline_destroy(producer_timelines[i]);
+	sw_sync_timeline_destroy(consumer_timeline);
+
+	return ret;
+}
diff --git a/tools/testing/selftests/sync/sync_stress_merge.c b/tools/testing/selftests/sync/sync_stress_merge.c
new file mode 100644
index 000000000..99e83ef45
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_stress_merge.c
@@ -0,0 +1,115 @@
+/*
+ * sync stress test: merging
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/*
+ * Stress test: merge 32 * 1024 randomly placed sync points, spread over
+ * 32 timelines, into a single fence.  Afterwards the fence must report
+ * one entry per timeline that was used and must only signal once each of
+ * those timelines has been advanced to its highest sync point.
+ */
+int test_merge_stress_random_merge(void)
+{
+	int timeline_count = 32;
+	int merge_count = 1024 * 32;
+	int timelines[timeline_count];
+	int fence_map[timeline_count];
+	int idx, slot, point, expected, rc, ok;
+	int acc, fresh, combined;
+
+	srand(time(NULL));
+
+	for (idx = 0; idx < timeline_count; idx++)
+		timelines[idx] = sw_sync_timeline_create();
+
+	acc = sw_sync_fence_create(timelines[0], "fence", 0);
+	ok = sw_sync_fence_is_valid(acc);
+	ASSERT(ok, "Failure creating fence\n");
+
+	/* -1 marks "timeline not used yet"; timeline 0 starts at point 0 */
+	for (idx = 0; idx < timeline_count; idx++)
+		fence_map[idx] = -1;
+	fence_map[0] = 0;
+
+	/*
+	 * Pick a random timeline and a random sync point on it, then fold
+	 * the resulting fence into the accumulated one.
+	 */
+	for (idx = 0; idx < merge_count; idx++) {
+		slot = rand() % timeline_count;
+		point = rand();
+
+		/* remember the highest sync point seen per timeline */
+		if (fence_map[slot] == -1 || fence_map[slot] < point)
+			fence_map[slot] = point;
+
+		fresh = sw_sync_fence_create(timelines[slot], "fence", point);
+		combined = sync_merge("merge", fresh, acc);
+		sw_sync_fence_destroy(fresh);
+		sw_sync_fence_destroy(acc);
+		acc = combined;
+
+		ok = sw_sync_fence_is_valid(combined);
+		ASSERT(ok, "Failure creating fence i\n");
+	}
+
+	/* count the timelines that contributed at least one sync point */
+	expected = 0;
+	for (idx = 0; idx < timeline_count; idx++) {
+		if (fence_map[idx] != -1)
+			expected++;
+	}
+
+	ASSERT(sync_fence_size(acc) == expected,
+	       "Quantity of elements not matching\n");
+
+	/*
+	 * Advance every used timeline to its last sync point; before each
+	 * step the merged fence must still time out.
+	 */
+	for (idx = 0; idx < timeline_count; idx++) {
+		if (fence_map[idx] == -1)
+			continue;
+
+		rc = sync_wait(acc, 0);
+		ASSERT(rc == 0,
+		       "Failure waiting on fence until timeout\n");
+		sw_sync_timeline_inc(timelines[idx], fence_map[idx]);
+	}
+
+	/* with every timeline advanced, the merged fence must be signaled */
+	rc = sync_wait(acc, 0);
+	ASSERT(rc > 0, "Failure triggering fence\n");
+
+	sw_sync_fence_destroy(acc);
+
+	for (idx = 0; idx < timeline_count; idx++)
+		sw_sync_timeline_destroy(timelines[idx]);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/sync_stress_parallelism.c b/tools/testing/selftests/sync/sync_stress_parallelism.c
new file mode 100644
index 000000000..e6c9be671
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_stress_parallelism.c
@@ -0,0 +1,111 @@
+/*
+ * sync stress test: parallelism
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <pthread.h>
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/* State shared by the two threads of the shared-timeline stress test. */
+static struct {
+ /* number of rounds each thread runs */
+ int iterations;
+ /* the single timeline both threads create fences on and advance */
+ int timeline;
+ /* incremented once per round by each thread, in alternation */
+ int counter;
+} test_data_two_threads;
+
+/*
+ * Thread body; d carries the thread id (0 or 1) cast to a pointer.
+ *
+ * In round i the thread waits on sync point i * 2 + id of the shared
+ * timeline, checks that the counter holds exactly that value, increments
+ * it, and advances the timeline by one to hand over to the other thread.
+ */
+static int test_stress_two_threads_shared_timeline_thread(void *d)
+{
+	int id = (long)d;
+	int tl = test_data_two_threads.timeline;
+	int rounds = test_data_two_threads.iterations;
+	int round, turn, fd, rc;
+
+	for (round = 0; round < rounds; round++) {
+		turn = round * 2 + id;
+
+		fd = sw_sync_fence_create(tl, "fence", turn);
+		ASSERT(sw_sync_fence_is_valid(fd),
+		       "Failure allocating fence\n");
+
+		/* Block until the other thread has advanced the timeline */
+		rc = sync_wait(fd, -1);
+		ASSERT(rc > 0, "Problem occurred on prior thread\n");
+
+		/* The counter must match our turn before we bump it */
+		ASSERT(test_data_two_threads.counter == turn,
+		       "Counter got damaged!\n");
+		test_data_two_threads.counter++;
+
+		/* Hand over to the other thread */
+		rc = sw_sync_timeline_inc(tl, 1);
+		ASSERT(rc == 0, "Advancing timeline failed\n");
+
+		sw_sync_fence_destroy(fd);
+	}
+
+	return 0;
+}
+
+/*
+ * Stress test: two threads take turns incrementing one counter, ordered
+ * solely through fences on a single shared timeline.  After both threads
+ * finish, the counter must equal twice the iteration count.
+ */
+int test_stress_two_threads_shared_timeline(void)
+{
+	void *(*entry)(void *) = (void *(*)(void *))
+		test_stress_two_threads_shared_timeline_thread;
+	pthread_t first, second;
+	int tl;
+
+	tl = sw_sync_timeline_create();
+	ASSERT(sw_sync_timeline_is_valid(tl), "Failure allocating timeline\n");
+
+	test_data_two_threads.iterations = 1 << 16;
+	test_data_two_threads.counter = 0;
+	test_data_two_threads.timeline = tl;
+
+	/*
+	 * Use a single timeline to synchronize two threads
+	 * hammering on the same counter.
+	 */
+	pthread_create(&first, NULL, entry, (void *)0);
+	pthread_create(&second, NULL, entry, (void *)1);
+
+	pthread_join(first, NULL);
+	pthread_join(second, NULL);
+
+	/* every round of both threads must be accounted for */
+	ASSERT(test_data_two_threads.counter ==
+	       test_data_two_threads.iterations * 2,
+	       "Counter has unexpected value\n");
+
+	sw_sync_timeline_destroy(tl);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/sync_test.c b/tools/testing/selftests/sync/sync_test.c
new file mode 100644
index 000000000..414a617db
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_test.c
@@ -0,0 +1,114 @@
+/*
+ * sync test runner
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <string.h>
+
+#include "../kselftest.h"
+#include "synctest.h"
+
+/*
+ * Run one test in a forked child and record the outcome with kselftest.
+ *
+ * @test: test entry point; its return value becomes the child's exit status
+ * @name: name printed on the result line
+ *
+ * Every call records exactly one pass or fail result, including when the
+ * child could not be forked or did not exit normally (e.g. it was killed by
+ * a signal).
+ *
+ * Returns 0 if the test passed, non-zero otherwise.
+ */
+static int run_test(int (*test)(void), char *name)
+{
+	int result;
+	pid_t childpid;
+	int ret;
+
+	/* Flush first so buffered output is not emitted twice after fork(). */
+	fflush(stdout);
+	childpid = fork();
+
+	if (childpid < 0) {
+		/* No child exists, so there is nothing to wait for. */
+		ksft_test_result_fail("[RUN]\t%s\n", name);
+		return 1;
+	}
+
+	if (childpid == 0)
+		exit(test());
+
+	if (waitpid(childpid, &result, 0) != childpid || !WIFEXITED(result)) {
+		/* Abnormal termination must still count as a failure. */
+		ksft_test_result_fail("[RUN]\t%s\n", name);
+		return 1;
+	}
+
+	ret = WEXITSTATUS(result);
+	if (!ret)
+		ksft_test_result_pass("[RUN]\t%s\n", name);
+	else
+		ksft_test_result_fail("[RUN]\t%s\n", name);
+
+	return ret;
+}
+
+/*
+ * Check that the sw_sync debugfs node can be stat()ed.
+ *
+ * Returns normally when it can. Otherwise the run is reported as skipped
+ * (node missing, or permission denied) or as failed (any other errno).
+ */
+static void sync_api_supported(void)
+{
+	struct stat sbuf;
+
+	if (stat("/sys/kernel/debug/sync/sw_sync", &sbuf) == 0)
+		return;
+
+	switch (errno) {
+	case ENOENT:
+		ksft_exit_skip("Sync framework not supported by kernel\n");
+		break;
+	case EACCES:
+		ksft_exit_skip("Run Sync test as root.\n");
+		break;
+	default:
+		break;
+	}
+
+	ksft_exit_fail_msg("stat failed on /sys/kernel/debug/sync/sw_sync: %s",
+			   strerror(errno));
+}
+
+int main(void)
+{
+ int err;
+
+ ksft_print_header();
+
+ sync_api_supported();
+ ksft_set_plan(3 + 7);
+
+ ksft_print_msg("[RUN]\tTesting sync framework\n");
+
+ RUN_TEST(test_alloc_timeline);
+ RUN_TEST(test_alloc_fence);
+ RUN_TEST(test_alloc_fence_negative);
+
+ RUN_TEST(test_fence_one_timeline_wait);
+ RUN_TEST(test_fence_one_timeline_merge);
+ RUN_TEST(test_fence_merge_same_fence);
+ RUN_TEST(test_fence_multi_timeline_wait);
+ RUN_TEST(test_stress_two_threads_shared_timeline);
+ RUN_TEST(test_consumer_stress_multi_producer_single_consumer);
+ RUN_TEST(test_merge_stress_random_merge);
+
+ err = ksft_get_fail_cnt();
+ if (err)
+ ksft_exit_fail_msg("%d out of %d sync tests failed\n",
+ err, ksft_test_num());
+
+ /* need this return to keep gcc happy */
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/sync/sync_wait.c b/tools/testing/selftests/sync/sync_wait.c
new file mode 100644
index 000000000..d69b752f6
--- /dev/null
+++ b/tools/testing/selftests/sync/sync_wait.c
@@ -0,0 +1,91 @@
+/*
+ * sync fence wait tests
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sync.h"
+#include "sw_sync.h"
+#include "synctest.h"
+
+/*
+ * Merge three fences that live on three different timelines and check that
+ * the merged fence reports each component as signaled only after that
+ * component's timeline has been advanced.
+ *
+ * Returns 0 on success; any failed check returns 1 through ASSERT().
+ */
+int test_fence_multi_timeline_wait(void)
+{
+	int timelineA, timelineB, timelineC;
+	int fenceA, fenceB, fenceC, mergedAB, merged;
+	int valid, active, signaled, ret;
+
+	timelineA = sw_sync_timeline_create();
+	timelineB = sw_sync_timeline_create();
+	timelineC = sw_sync_timeline_create();
+
+	fenceA = sw_sync_fence_create(timelineA, "fenceA", 5);
+	fenceB = sw_sync_fence_create(timelineB, "fenceB", 5);
+	fenceC = sw_sync_fence_create(timelineC, "fenceC", 5);
+
+	/*
+	 * Keep the intermediate merge result in its own variable so that it
+	 * can be destroyed below instead of being overwritten and leaked.
+	 */
+	mergedAB = sync_merge("mergeFence", fenceB, fenceA);
+	merged = sync_merge("mergeFence", fenceC, mergedAB);
+
+	valid = sw_sync_fence_is_valid(merged);
+	ASSERT(valid, "Failure merging fence from various timelines\n");
+
+	/* Confirm fence isn't signaled */
+	active = sync_fence_count_with_status(merged, FENCE_STATUS_ACTIVE);
+	ASSERT(active == 3, "Fence signaled too early!\n");
+
+	ret = sync_wait(merged, 0);
+	ASSERT(ret == 0,
+	       "Failure waiting on fence until timeout\n");
+
+	ret = sw_sync_timeline_inc(timelineA, 5);
+	ASSERT(ret == 0, "Advancing timeline failed\n");
+	active = sync_fence_count_with_status(merged, FENCE_STATUS_ACTIVE);
+	signaled = sync_fence_count_with_status(merged, FENCE_STATUS_SIGNALED);
+	ASSERT(active == 2 && signaled == 1,
+	       "Fence did not signal properly!\n");
+
+	ret = sw_sync_timeline_inc(timelineB, 5);
+	ASSERT(ret == 0, "Advancing timeline failed\n");
+	active = sync_fence_count_with_status(merged, FENCE_STATUS_ACTIVE);
+	signaled = sync_fence_count_with_status(merged, FENCE_STATUS_SIGNALED);
+	ASSERT(active == 1 && signaled == 2,
+	       "Fence did not signal properly!\n");
+
+	ret = sw_sync_timeline_inc(timelineC, 5);
+	ASSERT(ret == 0, "Advancing timeline failed\n");
+	active = sync_fence_count_with_status(merged, FENCE_STATUS_ACTIVE);
+	signaled = sync_fence_count_with_status(merged, FENCE_STATUS_SIGNALED);
+	ASSERT(active == 0 && signaled == 3,
+	       "Fence did not signal properly!\n");
+
+	/* confirm you can successfully wait */
+	ret = sync_wait(merged, 100);
+	ASSERT(ret > 0, "Failure waiting on signaled fence\n");
+
+	sw_sync_fence_destroy(merged);
+	sw_sync_fence_destroy(mergedAB);
+	sw_sync_fence_destroy(fenceC);
+	sw_sync_fence_destroy(fenceB);
+	sw_sync_fence_destroy(fenceA);
+	sw_sync_timeline_destroy(timelineC);
+	sw_sync_timeline_destroy(timelineB);
+	sw_sync_timeline_destroy(timelineA);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/sync/synctest.h b/tools/testing/selftests/sync/synctest.h
new file mode 100644
index 000000000..90a8e5369
--- /dev/null
+++ b/tools/testing/selftests/sync/synctest.h
@@ -0,0 +1,67 @@
+/*
+ * sync tests
+ * Copyright 2015-2016 Collabora Ltd.
+ *
+ * Based on the implementation from the Android Open Source Project,
+ *
+ * Copyright 2012 Google, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SELFTESTS_SYNCTEST_H
+#define SELFTESTS_SYNCTEST_H
+
+#include <stdio.h>
+#include "../kselftest.h"
+
+/*
+ * Check a test condition. When cond is false, print msg with an "[ERROR]"
+ * prefix and return 1 from the enclosing function, so this may only be used
+ * inside functions that return int.
+ */
+#define ASSERT(cond, msg) do { \
+ if (!(cond)) { \
+ ksft_print_msg("[ERROR]\t%s", (msg)); \
+ return 1; \
+ } \
+} while (0)
+
+/* Call run_test() with test function x and its name as a string literal. */
+#define RUN_TEST(x) run_test((x), #x)
+
+/* Allocation tests */
+int test_alloc_timeline(void);
+int test_alloc_fence(void);
+int test_alloc_fence_negative(void);
+
+/* Fence tests with one timeline */
+int test_fence_one_timeline_wait(void);
+int test_fence_one_timeline_merge(void);
+
+/* Fence merge tests */
+int test_fence_merge_same_fence(void);
+
+/* Fence wait tests */
+int test_fence_multi_timeline_wait(void);
+
+/* Stress test - parallelism */
+int test_stress_two_threads_shared_timeline(void);
+
+/* Stress test - consumer */
+int test_consumer_stress_multi_producer_single_consumer(void);
+
+/* Stress test - merging */
+int test_merge_stress_random_merge(void);
+
+#endif
diff --git a/tools/testing/selftests/sysctl/Makefile b/tools/testing/selftests/sysctl/Makefile
new file mode 100644
index 000000000..110301f9f
--- /dev/null
+++ b/tools/testing/selftests/sysctl/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for sysctl selftests.
+# Expects kernel.sysctl_writes_strict=1.
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests".
+all:
+
+TEST_PROGS := sysctl.sh
+
+include ../lib.mk
+
+# Nothing to clean up.
+clean:
diff --git a/tools/testing/selftests/sysctl/config b/tools/testing/selftests/sysctl/config
new file mode 100644
index 000000000..fc263efd1
--- /dev/null
+++ b/tools/testing/selftests/sysctl/config
@@ -0,0 +1 @@
+CONFIG_TEST_SYSCTL=m
diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh
new file mode 100755
index 000000000..19515dcb7
--- /dev/null
+++ b/tools/testing/selftests/sysctl/sysctl.sh
@@ -0,0 +1,971 @@
+#!/bin/bash
+# Copyright (C) 2017 Luis R. Rodriguez <mcgrof@kernel.org>
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or at your option any
+# later version; or, when distributed separately from the Linux kernel or
+# when incorporated into other software packages, subject to the following
+# license:
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of copyleft-next (version 0.3.1 or later) as published
+# at http://copyleft-next.org/.
+
+# This performs a series of tests against the proc sysctl interface.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+TEST_NAME="sysctl"
+TEST_DRIVER="test_${TEST_NAME}"
+TEST_DIR=$(dirname $0)
+TEST_FILE=$(mktemp)
+
+# This represents
+#
+# TEST_ID:TEST_COUNT:ENABLED:TARGET
+#
+# TEST_ID: is the test id number
+# TEST_COUNT: number of times we should run the test
+# ENABLED: 1 if enabled, 0 otherwise
+# TARGET: test target file required on the test_sysctl module
+#
+# Once these are enabled please leave them as-is. Write your own test,
+# we have tons of space.
+ALL_TESTS="0001:1:1:int_0001"
+ALL_TESTS="$ALL_TESTS 0002:1:1:string_0001"
+ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002"
+ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001"
+ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003"
+ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001"
+ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int"
+
+# Give every tunable a default unless the caller already set it (for example
+# through the environment). Values are never passed through an unquoted
+# test, so paths containing whitespace are handled as well.
+#
+# Globals written: DIR, DEFAULT_NUM_TESTS, SYSCTL, PROD_SYSCTL, WRITES_STRICT
+function allow_user_defaults()
+{
+	: "${DIR:=/sys/module/test_sysctl/}"
+	: "${DEFAULT_NUM_TESTS:=50}"
+	: "${SYSCTL:=/proc/sys/debug/test_sysctl}"
+	: "${PROD_SYSCTL:=/proc/sys}"
+	# Derived from PROD_SYSCTL, so it has to come after it.
+	: "${WRITES_STRICT:=${PROD_SYSCTL}/kernel/sysctl_writes_strict}"
+}
+
+# Make sure the sysctl_writes_strict knob is 1 for the duration of the run and
+# fill in the numeric limits used by the digit tests.
+#
+# Globals read:    WRITES_STRICT
+# Globals written: old_strict (previous knob value, only when the knob
+#                  exists), PAGE_SIZE, MAX_DIGITS, INT_MAX, UINT_MAX (each
+#                  only when not already set)
+function check_production_sysctl_writes_strict()
+{
+	echo -n "Checking production write strict setting ... "
+	if [ ! -e "${WRITES_STRICT}" ]; then
+		echo "FAIL, but skip in case of old kernel" >&2
+	else
+		old_strict=$(cat "${WRITES_STRICT}")
+		if [ "$old_strict" = "1" ]; then
+			echo "ok"
+		else
+			echo "FAIL, strict value is 0 but force to 1 to continue" >&2
+			echo "1" > "${WRITES_STRICT}"
+		fi
+	fi
+
+	# The command substitutions only run when the variable is unset/empty.
+	: "${PAGE_SIZE:=$(getconf PAGESIZE)}"
+	: "${MAX_DIGITS:=$((PAGE_SIZE / 8))}"
+	: "${INT_MAX:=$(getconf INT_MAX)}"
+	: "${UINT_MAX:=$(getconf UINT_MAX)}"
+}
+
+test_reqs()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo $msg must be run as root >&2
+ exit $ksft_skip
+ fi
+
+ if ! which perl 2> /dev/null > /dev/null; then
+ echo "$0: You need perl installed"
+ exit $ksft_skip
+ fi
+ if ! which getconf 2> /dev/null > /dev/null; then
+ echo "$0: You need getconf installed"
+ exit $ksft_skip
+ fi
+ if ! which diff 2> /dev/null > /dev/null; then
+ echo "$0: You need diff installed"
+ exit $ksft_skip
+ fi
+}
+
+# Load the test driver module unless its sysctl directory already exists.
+#
+# Exits with $ksft_skip when the module is not available and with status 1
+# when loading it fails.
+function load_req_mod()
+{
+	if [ -d "$SYSCTL" ]; then
+		return 0
+	fi
+
+	# Dry run first: tells "module not built" apart from a load error.
+	if ! modprobe -q -n "$TEST_DRIVER"; then
+		echo "$0: module $TEST_DRIVER not found [SKIP]"
+		echo "You must set CONFIG_TEST_SYSCTL=m in your kernel" >&2
+		exit $ksft_skip
+	fi
+
+	if ! modprobe "$TEST_DRIVER"; then
+		echo "$0: modprobe $TEST_DRIVER failed."
+		# A bare "exit" would return the status of the echo above (0)
+		# and make the failed run look successful.
+		exit 1
+	fi
+}
+
+reset_vals()
+{
+ VAL=""
+ TRIGGER=$(basename ${TARGET})
+ case "$TRIGGER" in
+ int_0001)
+ VAL="60"
+ ;;
+ int_0002)
+ VAL="1"
+ ;;
+ uint_0001)
+ VAL="314"
+ ;;
+ string_0001)
+ VAL="(none)"
+ ;;
+ bitmap_0001)
+ VAL=""
+ ;;
+ *)
+ ;;
+ esac
+ echo -n $VAL > $TARGET
+}
+
+set_orig()
+{
+ if [ ! -z $TARGET ] && [ ! -z $ORIG ]; then
+ if [ -f ${TARGET} ]; then
+ echo "${ORIG}" > "${TARGET}"
+ fi
+ fi
+}
+
+# Write the value under test (TEST_STR), followed by a newline, to TARGET.
+set_test()
+{
+ echo "${TEST_STR}" > "${TARGET}"
+}
+
+# Compare the content of file $1 with TEST_STR.
+# Returns 0 when they are equal and 1 otherwise.
+verify()
+{
+	local current
+
+	current=$(cat "$1")
+	[ "${current}" = "${TEST_STR}" ]
+}
+
+# proc files get read a page at a time, which can confuse diff,
+# and get you incorrect results on proc files with long data. To use
+# diff against them you must first extract the output to a file, and
+# then compare against that file.
+verify_diff_proc_file()
+{
+ TMP_DUMP_FILE=$(mktemp)
+ cat $1 > $TMP_DUMP_FILE
+
+ if ! diff -w -q $TMP_DUMP_FILE $2; then
+ return 1
+ else
+ return 0
+ fi
+}
+
+verify_diff_w()
+{
+ echo "$TEST_STR" | diff -q -w -u - $1 > /dev/null
+ return $?
+}
+
+# Abort the script with status $rc if a preceding check set rc to anything
+# other than 0; otherwise return 0.
+test_rc()
+{
+	[[ $rc != 0 ]] || return 0
+
+	echo "Failed test, return value: $rc" >&2
+	exit $rc
+}
+
+test_finish()
+{
+ set_orig
+ rm -f "${TEST_FILE}"
+
+ if [ ! -z ${old_strict} ]; then
+ echo ${old_strict} > ${WRITES_STRICT}
+ fi
+ exit $rc
+}
+
+# Generic write tests against TARGET using TEST_STR as the value under test
+# and TEST_FILE as a scratch copy of it.
+#
+# The first four checks validate set_test/set_orig themselves and exit with
+# status 1 on failure. The remaining checks record a failure in rc and the
+# function finishes through test_rc, which exits when rc is not 0.
+run_numerictests()
+{
+ echo "== Testing sysctl behavior against ${TARGET} =="
+
+ rc=0
+
+ echo -n "Writing test file ... "
+ echo "${TEST_STR}" > "${TEST_FILE}"
+ if ! verify "${TEST_FILE}"; then
+ echo "FAIL" >&2
+ exit 1
+ else
+ echo "ok"
+ fi
+
+ echo -n "Checking sysctl is not set to test value ... "
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ exit 1
+ else
+ echo "ok"
+ fi
+
+ echo -n "Writing sysctl from shell ... "
+ set_test
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ exit 1
+ else
+ echo "ok"
+ fi
+
+ echo -n "Resetting sysctl to original value ... "
+ set_orig
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ exit 1
+ else
+ echo "ok"
+ fi
+
+ # Now that we've validated the sanity of "set_test" and "set_orig",
+ # we can use those functions to set starting states before running
+ # specific behavioral tests.
+
+ echo -n "Writing entire sysctl in single write ... "
+ set_orig
+ dd if="${TEST_FILE}" of="${TARGET}" bs=4096 2>/dev/null
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # TARGET already holds the test value here, so skipping one byte of
+ # input and seeking one byte into the output must leave it unchanged.
+ echo -n "Writing middle of sysctl after synchronized seek ... "
+ set_test
+ dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 skip=1 2>/dev/null
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # Expected outcome: TARGET must NOT end up holding the test value.
+ echo -n "Writing beyond end of sysctl ... "
+ set_orig
+ dd if="${TEST_FILE}" of="${TARGET}" bs=20 seek=2 2>/dev/null
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # 50 filler bytes followed by the test value, written in 50-byte
+ # blocks; again TARGET must NOT end up holding the test value.
+ echo -n "Writing sysctl with multiple long writes ... "
+ set_orig
+ (perl -e 'print "A" x 50;'; echo "${TEST_STR}") | \
+ dd of="${TARGET}" bs=50 2>/dev/null
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# Check that writing the value $1 to TARGET is rejected.
+#
+# TARGET is first reset via reset_vals. The check passes only when the write
+# reports an error AND TARGET still reads back the value it held before the
+# write. Sets TEST_STR to $1; on failure sets rc=1 and test_rc exits.
+check_failure()
+{
+ echo -n "Testing that $1 fails as expected..."
+ reset_vals
+ TEST_STR="$1"
+ orig="$(cat $TARGET)"
+ echo -n "$TEST_STR" > $TARGET 2> /dev/null
+
+ # write should fail and $TARGET should retain its original value
+ # ($? below is still the status of the echo above, so no command may
+ # be inserted between the two)
+ if [ $? = 0 ] || [ "$(cat $TARGET)" != "$orig" ]; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# Feed values wider than 32 bits to check_failure, once as written and once
+# with a leading '-'.
+run_wideint_tests()
+{
+	# sysctl conversion functions receive a boolean sign and ulong
+	# magnitude, so only magnitudes are listed and every one of them is
+	# tried in both positive and negative form. None of the values fits
+	# in 32 bits, so writing them to an int- or uint-typed sysctl should
+	# fail.
+
+	# Truncated to the lower 32 bits these would be zero, +1, -1, INT_MIN
+	# and INT_MAX respectively (potential for being falsely deemed in
+	# range).
+	local -a low_word_boundaries=(
+		0x0000000100000000
+		0x0000000100000001
+		0x00000001ffffffff
+		0x0000000180000000
+		0x000000017fffffff
+	)
+
+	# These look like negatives, but without a leading '-' they are large
+	# positives (to be rejected as above despite being
+	# zero/+1/-1/INT_MIN/INT_MAX in the lower 32 bits).
+	local -a negative_lookalikes=(
+		0xffffffff00000000
+		0xffffffff00000001
+		0xffffffffffffffff
+		0xffffffff80000000
+		0xffffffff7fffffff
+	)
+
+	local sign mag
+
+	for sign in '' '-'; do
+		for mag in "${low_word_boundaries[@]}" "${negative_lookalikes[@]}"; do
+			check_failure "${sign}${mag}"
+		done
+	done
+}
+
+# Your test must accept digits 3 and 4 to use this
+#
+# Checks how many leading spaces a write to TARGET may carry: a value
+# preceded by MAX_DIGITS-1 spaces must be accepted, a value preceded by
+# MAX_DIGITS spaces must not take effect. Each half ends in test_rc, which
+# exits the script when rc was set to 1.
+run_limit_digit()
+{
+ echo -n "Checking ignoring spaces up to PAGE_SIZE works on write ..."
+ reset_vals
+
+ LIMIT=$((MAX_DIGITS -1))
+ TEST_STR="3"
+ (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \
+ dd of="${TARGET}" 2>/dev/null
+
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Checking passing PAGE_SIZE of spaces fails on write ..."
+ reset_vals
+
+ LIMIT=$((MAX_DIGITS))
+ TEST_STR="4"
+ (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \
+ dd of="${TARGET}" 2>/dev/null
+
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# You are using an int
+#
+# Range checks for a signed int TARGET: INT_MAX must be accepted, INT_MAX+1
+# must not take effect, and the negative value -3 must be accepted. Each
+# step resets TARGET first and ends in test_rc.
+run_limit_digit_int()
+{
+ echo -n "Testing INT_MAX works ..."
+ reset_vals
+ TEST_STR="$INT_MAX"
+ echo -n $TEST_STR > $TARGET
+
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing INT_MAX + 1 will fail as expected..."
+ reset_vals
+ let TEST_STR=$INT_MAX+1
+ echo -n $TEST_STR > $TARGET 2> /dev/null
+
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing negative values will work as expected..."
+ reset_vals
+ TEST_STR="-3"
+ echo -n $TEST_STR > $TARGET 2> /dev/null
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# You used an int array
+#
+# Array write checks for TARGET. The four steps build on each other: apart
+# from the first write, the array content is deliberately carried over from
+# one step to the next instead of being reset. Each step ends in test_rc.
+run_limit_digit_int_array()
+{
+ echo -n "Testing array works as expected ... "
+ TEST_STR="4 3 2 1"
+ echo -n $TEST_STR > $TARGET
+
+ if ! verify_diff_w "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing skipping trailing array elements works ... "
+ # Do not reset_vals, carry on the values from the last test.
+ # If we only echo in two digits the last two are left intact
+ TEST_STR="100 101"
+ echo -n $TEST_STR > $TARGET
+ # After we echo in, to help diff we need to set on TEST_STR what
+ # we expect the result to be.
+ TEST_STR="100 101 2 1"
+
+ if ! verify_diff_w "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing PAGE_SIZE limit on array works ... "
+ # Do not reset_vals, carry on the values from the last test.
+ # Even if you use an int array, you are still restricted to
+ # MAX_DIGITS, this is a known limitation. Test limit works.
+ LIMIT=$((MAX_DIGITS -1))
+ TEST_STR="9"
+ (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \
+ dd of="${TARGET}" 2>/dev/null
+
+ TEST_STR="9 101 2 1"
+ if ! verify_diff_w "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing exceeding PAGE_SIZE limit fails as expected ... "
+ # Do not reset_vals, carry on the values from the last test.
+ # Now go over limit.
+ LIMIT=$((MAX_DIGITS))
+ TEST_STR="7"
+ (perl -e 'print " " x '$LIMIT';'; echo "${TEST_STR}") | \
+ dd of="${TARGET}" 2>/dev/null
+
+ # The first element must NOT have become 7.
+ TEST_STR="7 101 2 1"
+ if verify_diff_w "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# You are using an unsigned int
+#
+# Range checks for an unsigned int TARGET: UINT_MAX must be accepted, while
+# UINT_MAX+1 and the negative value -3 must not take effect. Each step
+# resets TARGET first and ends in test_rc.
+run_limit_digit_uint()
+{
+ echo -n "Testing UINT_MAX works ..."
+ reset_vals
+ TEST_STR="$UINT_MAX"
+ echo -n $TEST_STR > $TARGET
+
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing UINT_MAX + 1 will fail as expected..."
+ reset_vals
+ TEST_STR=$(($UINT_MAX+1))
+ echo -n $TEST_STR > $TARGET 2> /dev/null
+
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+
+ echo -n "Testing negative values will not work as expected ..."
+ reset_vals
+ TEST_STR="-3"
+ echo -n $TEST_STR > $TARGET 2> /dev/null
+
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+ test_rc
+}
+
+# String-specific write tests against TARGET: byte-wise writes, unaligned
+# seeks and writes around the MAXLEN boundary. Uses TEST_FILE/TEST_STR as
+# prepared by the caller and MAXLEN as the expected buffer size. Failures
+# are collected in rc and acted on by the single test_rc at the end.
+run_stringtests()
+{
+ echo -n "Writing entire sysctl in short writes ... "
+ set_orig
+ dd if="${TEST_FILE}" of="${TARGET}" bs=1 2>/dev/null
+ if ! verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # Output is shifted by one byte relative to the input, so the result
+ # must differ from the test value.
+ echo -n "Writing middle of sysctl after unsynchronized seek ... "
+ set_test
+ dd if="${TEST_FILE}" of="${TARGET}" bs=1 seek=1 2>/dev/null
+ if verify "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # MAXLEN-2 'A's followed by 'B': the 'B' must be stored.
+ echo -n "Checking sysctl maxlen is at least $MAXLEN ... "
+ set_orig
+ perl -e 'print "A" x ('"${MAXLEN}"'-2), "B";' | \
+ dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null
+ if ! grep -q B "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ # In the three checks below the 'B' sits at or past position MAXLEN
+ # and must never show up when TARGET is read back.
+ echo -n "Checking sysctl keeps original string on overflow append ... "
+ set_orig
+ perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \
+ dd of="${TARGET}" bs=$(( MAXLEN - 1 )) 2>/dev/null
+ if grep -q B "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ echo -n "Checking sysctl stays NULL terminated on write ... "
+ set_orig
+ perl -e 'print "A" x ('"${MAXLEN}"'-1), "B";' | \
+ dd of="${TARGET}" bs="${MAXLEN}" 2>/dev/null
+ if grep -q B "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ echo -n "Checking sysctl stays NULL terminated on overwrite ... "
+ set_orig
+ perl -e 'print "A" x ('"${MAXLEN}"'-1), "BB";' | \
+ dd of="${TARGET}" bs=$(( $MAXLEN + 1 )) 2>/dev/null
+ if grep -q B "${TARGET}"; then
+ echo "FAIL" >&2
+ rc=1
+ else
+ echo "ok"
+ fi
+
+ test_rc
+}
+
+target_exists()
+{
+ TARGET="${SYSCTL}/$1"
+ TEST_ID="$2"
+
+ if [ ! -f ${TARGET} ] ; then
+ echo "Target for test $TEST_ID: $TARGET not exist, skipping test ..."
+ return 0
+ fi
+ return 1
+}
+
+# Build a random bit list such as "5,9-12,20", write it to TARGET and check
+# that reading TARGET back yields the same list.
+#
+# Globals read:    TARGET
+# Globals written: LENGTH, BIT, RANGE_END, TEST_STR, TEST_FILE, rc
+# Exits through test_rc when the write or the comparison fails.
+run_bitmaptest() {
+	# Total length of bitmaps string to use, a bit under
+	# the maximum input size of the test node
+	LENGTH=$((RANDOM % 65000))
+
+	# First bit to set
+	BIT=$((RANDOM % 1024))
+
+	# String containing our list of bits to set
+	TEST_STR=$BIT
+
+	# build up the string
+	while [ "${#TEST_STR}" -le "$LENGTH" ]; do
+		# Make sure next entry is discontiguous,
+		# skip ahead at least 2
+		BIT=$((BIT + $((2 + RANDOM % 10))))
+
+		# Add new bit to the list
+		TEST_STR="${TEST_STR},${BIT}"
+
+		# Randomly make it a range
+		if [ "$((RANDOM % 2))" -eq "1" ]; then
+			RANGE_END=$((BIT + $((1 + RANDOM % 10))))
+			TEST_STR="${TEST_STR}-${RANGE_END}"
+			BIT=$RANGE_END
+		fi
+	done
+
+	echo -n "Checking bitmap handler... "
+	# Reuse the existing scratch file. Creating a fresh one on every call
+	# would orphan the previous file, and this test is run many times.
+	if [ -z "$TEST_FILE" ] || [ ! -f "$TEST_FILE" ]; then
+		TEST_FILE=$(mktemp)
+	fi
+	echo -n "$TEST_STR" > "$TEST_FILE"
+
+	if ! cat "$TEST_FILE" > "$TARGET" 2> /dev/null; then
+		echo "FAIL" >&2
+		rc=1
+		test_rc
+	fi
+
+	if ! verify_diff_proc_file "$TARGET" "$TEST_FILE"; then
+		echo "FAIL" >&2
+		rc=1
+	else
+		echo "ok"
+		rc=0
+	fi
+	test_rc
+}
+
+# Test 0001: numeric, wide-integer and digit-limit checks against the target
+# registered for test id 0001. The value under test is the reset value + 1.
+sysctl_test_0001()
+{
+	local step
+
+	TARGET="${SYSCTL}/$(get_test_target 0001)"
+	reset_vals
+	ORIG=$(cat "${TARGET}")
+	TEST_STR=$((ORIG + 1))
+
+	for step in run_numerictests run_wideint_tests run_limit_digit; do
+		$step
+	done
+}
+
+# Test 0002: generic write checks plus the string-only checks against the
+# target registered for test id 0002.
+sysctl_test_0002()
+{
+	local step
+
+	TARGET="${SYSCTL}/$(get_test_target 0002)"
+	reset_vals
+	ORIG=$(cat "${TARGET}")
+	TEST_STR="Testing sysctl"
+	# Only string sysctls support seeking/appending.
+	MAXLEN=65
+
+	for step in run_numerictests run_stringtests; do
+		$step
+	done
+}
+
+# Test 0003: numeric, wide-integer, digit-limit and signed-int range checks
+# against the target registered for test id 0003.
+sysctl_test_0003()
+{
+	local step
+
+	TARGET="${SYSCTL}/$(get_test_target 0003)"
+	reset_vals
+	ORIG=$(cat "${TARGET}")
+	TEST_STR=$((ORIG + 1))
+
+	for step in run_numerictests run_wideint_tests run_limit_digit \
+		    run_limit_digit_int; do
+		$step
+	done
+}
+
+# Test 0004: numeric, wide-integer, digit-limit and unsigned-int range checks
+# against the target registered for test id 0004.
+sysctl_test_0004()
+{
+	local step
+
+	TARGET="${SYSCTL}/$(get_test_target 0004)"
+	reset_vals
+	ORIG=$(cat "${TARGET}")
+	TEST_STR=$((ORIG + 1))
+
+	for step in run_numerictests run_wideint_tests run_limit_digit \
+		    run_limit_digit_uint; do
+		$step
+	done
+}
+
+# Test 0005: int-array checks against the target registered for test id
+# 0005. ORIG keeps the value read after the reset so it can be restored.
+sysctl_test_0005()
+{
+	TARGET="${SYSCTL}/$(get_test_target 0005)"
+	reset_vals
+	ORIG=$(cat "${TARGET}")
+
+	run_limit_digit_int_array
+}
+
+# Test 0006: random bit-list round trip against ${SYSCTL}/bitmap_0001.
+# ORIG is left empty, so set_orig has nothing to restore for this target.
+sysctl_test_0006()
+{
+	TARGET="${SYSCTL}/bitmap_0001"
+	ORIG=""
+	reset_vals
+	run_bitmaptest
+}
+
+# Test 0007: check that ${SYSCTL}/boot_int reads 1, i.e. that a sysctl set on
+# the kernel command line was applied.
+#
+# Returns $ksft_skip when the target is missing, when the $DIR directory
+# exists (the message says the test needs a built-in, not modular, driver) or
+# when the expected boot parameter is absent from /proc/cmdline. Returns 0
+# when the value is 1 or /proc/cmdline is missing. Fails through test_rc when
+# the parameter was given but the value is not 1.
+sysctl_test_0007()
+{
+ TARGET="${SYSCTL}/boot_int"
+ if [ ! -f $TARGET ]; then
+ echo "Skipping test for $TARGET as it is not present ..."
+ return $ksft_skip
+ fi
+
+ if [ -d $DIR ]; then
+ echo "Boot param test only possible sysctl_test is built-in, not module:"
+ cat $TEST_DIR/config >&2
+ return $ksft_skip
+ fi
+
+ echo -n "Testing if $TARGET is set to 1 ..."
+ ORIG=$(cat "${TARGET}")
+
+ if [ x$ORIG = "x1" ]; then
+ echo "ok"
+ return 0
+ fi
+ echo "FAIL"
+ echo "Checking if /proc/cmdline contains setting of the expected parameter ..."
+ if [ ! -f /proc/cmdline ]; then
+ echo "/proc/cmdline does not exist, test inconclusive"
+ return 0
+ fi
+
+ # grep -c counts matching lines; the pattern accepts both '.' and '/'
+ # as separators in the parameter name.
+ FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline)
+ if [ $FOUND = "1" ]; then
+ echo "Kernel param found but $TARGET is not 1, TEST FAILED"
+ rc=1
+ test_rc
+ fi
+
+ echo "Skipping test, expected kernel parameter missing."
+ echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1"
+ return $ksft_skip
+}
+
+list_tests()
+{
+ echo "Test ID list:"
+ echo
+ echo "TEST_ID x NUM_TEST"
+ echo "TEST_ID: Test ID"
+ echo "NUM_TESTS: Number of recommended times to run the test"
+ echo
+ echo "0001 x $(get_test_count 0001) - tests proc_dointvec_minmax()"
+ echo "0002 x $(get_test_count 0002) - tests proc_dostring()"
+ echo "0003 x $(get_test_count 0003) - tests proc_dointvec()"
+ echo "0004 x $(get_test_count 0004) - tests proc_douintvec()"
+ echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array"
+ echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()"
+ echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param"
+}
+
+# Print the command line help followed by the test list, then exit with
+# status 1.
+#
+# The option descriptions follow what parse_args implements: -s runs a test
+# once, -c takes a test id plus a run count.
+usage()
+{
+	local -a specs=($ALL_TESTS)
+	local max_test
+
+	# Highest valid id == number of entries in ALL_TESTS, zero padded.
+	max_test=$(printf "%04d\n" "${#specs[@]}")
+	echo "Usage: $0 [ -t <4-number-digit> ] | [ -w <4-number-digit> ] |"
+	echo " [ -s <4-number-digit> ] | [ -c <4-number-digit> <test-count> ] |"
+	echo " [ all ] [ -h | --help ] [ -l ]"
+	echo ""
+	echo "Valid tests: 0001-$max_test"
+	echo ""
+	echo " all Runs all tests (default)"
+	echo " -t Run test ID the number amount of times is recommended"
+	echo " -w Watch test ID run until it runs into an error"
+	echo " -c Run test ID x test-count number of times"
+	echo " -s Run test ID once"
+	echo " -l List all test ID list"
+	echo " -h|--help Help"
+	echo
+	echo "If an error every occurs execution will immediately terminate."
+	echo "If you are adding a new test try using -w <test-ID> first to"
+	echo "make sure the test passes a series of tests."
+	echo
+	echo Example uses:
+	echo
+	echo "$TEST_NAME.sh -- executes all tests"
+	echo "$TEST_NAME.sh -t 0002 -- Executes test ID 0002 number of times is recomended"
+	echo "$TEST_NAME.sh -w 0002 -- Watch test ID 0002 run until an error occurs"
+	echo "$TEST_NAME.sh -s 0002 -- Run test ID 0002 once"
+	echo "$TEST_NAME.sh -c 0002 3 -- Run test ID 0002 three times"
+	echo
+	list_tests
+	exit 1
+}
+
+# Validate that $1 consists only of decimal digits; otherwise call usage.
+function test_num()
+{
+	re='^[0-9]+$'
+	[[ $1 =~ $re ]] || usage
+}
+
+# Print the TEST_COUNT field (2nd ':'-separated field) of the ALL_TESTS entry
+# selected by test id $1. The id is used as a 1-based position in ALL_TESTS.
+#
+# The id is handed to awk as a variable rather than pasted into the program
+# text, where a zero-padded id such as 0010 can be read as an octal constant.
+function get_test_count()
+{
+	test_num "$1"
+	TEST_DATA=$(awk -v idx="$1" '{ print $idx }' <<<"$ALL_TESTS")
+	awk -F":" '{ print $2 }' <<<"${TEST_DATA}"
+}
+
+# Print the ENABLED field (3rd ':'-separated field) of the ALL_TESTS entry
+# selected by test id $1. The id is used as a 1-based position in ALL_TESTS.
+#
+# The id is handed to awk as a variable rather than pasted into the program
+# text, where a zero-padded id such as 0010 can be read as an octal constant.
+function get_test_enabled()
+{
+	test_num "$1"
+	TEST_DATA=$(awk -v idx="$1" '{ print $idx }' <<<"$ALL_TESTS")
+	awk -F":" '{ print $3 }' <<<"${TEST_DATA}"
+}
+
+# Print the TARGET field (4th ':'-separated field) of the ALL_TESTS entry
+# selected by test id $1. The id is used as a 1-based position in ALL_TESTS.
+#
+# The id is handed to awk as a variable rather than pasted into the program
+# text, where a zero-padded id such as 0010 can be read as an octal constant.
+function get_test_target()
+{
+	test_num "$1"
+	TEST_DATA=$(awk -v idx="$1" '{ print $idx }' <<<"$ALL_TESTS")
+	awk -F":" '{ print $4 }' <<<"${TEST_DATA}"
+}
+
+# Run every enabled test listed in ALL_TESTS whose target file exists, each
+# for its recommended number of iterations.
+function run_all_tests()
+{
+	# Function-local loop variable: the global "i" is the run counter of
+	# watch_case and must not be overwritten from here.
+	local spec
+
+	for spec in $ALL_TESTS ; do
+		TEST_ID=${spec%:*:*:*}
+		ENABLED=$(get_test_enabled $TEST_ID)
+		TEST_COUNT=$(get_test_count $TEST_ID)
+		TEST_TARGET=$(get_test_target $TEST_ID)
+		# target_exists returns 0 when the target is MISSING.
+		if target_exists $TEST_TARGET $TEST_ID; then
+			continue
+		fi
+		if [[ $ENABLED -eq "1" ]]; then
+			test_case $TEST_ID $TEST_COUNT $TEST_TARGET
+		fi
+	done
+}
+
+# Print a progress header for run number $1 of the test named $2.
+# The screen is cleared first unless exactly three arguments are given (a
+# third argument such as "noclear" suppresses the clear).
+function watch_log()
+{
+	[ $# -eq 3 ] || clear
+	date
+	echo "Running test: $2 - run #$1"
+}
+
+# Run one test ($1) or, without exactly one argument, all tests over and
+# over. The loop never ends by itself; it stops when a failing test exits
+# the script.
+function watch_case()
+{
+	# Local and distinctly named so that functions called from the loop,
+	# which assign to "i", cannot corrupt the run counter.
+	local watch_run=0
+
+	while true; do
+
+		if [ $# -eq 1 ]; then
+			test_num $1
+			watch_log $watch_run ${TEST_NAME}_test_$1
+			${TEST_NAME}_test_$1
+		else
+			watch_log $watch_run all
+			run_all_tests
+		fi
+		watch_run=$((watch_run + 1))
+	done
+}
+
+function test_case()
+{
+ NUM_TESTS=$2
+
+ i=0
+
+ if target_exists $3 $1; then
+ continue
+ fi
+
+ while [ $i -lt $NUM_TESTS ]; do
+ test_num $1
+ watch_log $i ${TEST_NAME}_test_$1 noclear
+ RUN_TEST=${TEST_NAME}_test_$1
+ $RUN_TEST
+ let i=$i+1
+ done
+}
+
+function parse_args()
+{
+ if [ $# -eq 0 ]; then
+ run_all_tests
+ else
+ if [[ "$1" = "all" ]]; then
+ run_all_tests
+ elif [[ "$1" = "-w" ]]; then
+ shift
+ watch_case $@
+ elif [[ "$1" = "-t" ]]; then
+ shift
+ test_num $1
+ test_case $1 $(get_test_count $1) $(get_test_target $1)
+ elif [[ "$1" = "-c" ]]; then
+ shift
+ test_num $1
+ test_num $2
+ test_case $1 $2 $(get_test_target $1)
+ elif [[ "$1" = "-s" ]]; then
+ shift
+ test_case $1 1 $(get_test_target $1)
+ elif [[ "$1" = "-l" ]]; then
+ list_tests
+ elif [[ "$1" = "-h" || "$1" = "--help" ]]; then
+ usage
+ else
+ usage
+ fi
+ fi
+}
+
+# Script entry: check prerequisites, apply defaults, make sure the test
+# module is loaded, then dispatch on the command line.
+test_reqs
+allow_user_defaults
+check_production_sysctl_writes_strict
+load_req_mod
+
+# From here on every exit path goes through test_finish, which restores the
+# modified settings.
+trap "test_finish" EXIT
+
+# Quoted so each command line argument is passed on exactly as given.
+parse_args "$@"
+
+exit 0
diff --git a/tools/testing/selftests/tc-testing/.gitignore b/tools/testing/selftests/tc-testing/.gitignore
new file mode 100644
index 000000000..d52f65de2
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/.gitignore
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+__pycache__/
+*.pyc
+plugins/
+*.xml
+*.tap
+tdc_config_local.py
diff --git a/tools/testing/selftests/tc-testing/Makefile b/tools/testing/selftests/tc-testing/Makefile
new file mode 100644
index 000000000..91fee5c43
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/Makefile
@@ -0,0 +1,33 @@
+# SPDX-License-Identifier: GPL-2.0
+
+top_srcdir = $(abspath ../../../..)
+# UAPI headers of this source tree, passed to clang with -I. The variable
+# name must match top_srcdir above exactly; a misspelled reference expands to
+# the empty string and yields "/include/uapi".
+APIDIR := $(top_srcdir)/include/uapi
+TEST_GEN_FILES = action.o
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+CLANG ?= clang
+LLC ?= llc
+PROBE := $(shell $(LLC) -march=bpf -mcpu=probe -filetype=null /dev/null 2>&1)
+
+ifeq ($(PROBE),)
+ CPU ?= probe
+else
+ CPU ?= generic
+endif
+
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+CLANG_FLAGS = -I. -I$(APIDIR) \
+ $(CLANG_SYS_INCLUDES) \
+ -Wno-compare-distinct-pointer-types
+
+$(OUTPUT)/%.o: %.c
+ $(CLANG) $(CLANG_FLAGS) \
+ -O2 -target bpf -emit-llvm -c $< -o - | \
+ $(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+
+TEST_PROGS += ./tdc.sh
+TEST_FILES := tdc*.py Tdc*.py plugins plugin-lib tc-tests
diff --git a/tools/testing/selftests/tc-testing/README b/tools/testing/selftests/tc-testing/README
new file mode 100644
index 000000000..b0954c873
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/README
@@ -0,0 +1,257 @@
+tdc - Linux Traffic Control (tc) unit testing suite
+
+Author: Lucas Bates - lucasb@mojatatu.com
+
+tdc is a Python script to load tc unit tests from a separate JSON file and
+execute them inside a network namespace dedicated to the task.
+
+
+REQUIREMENTS
+------------
+
+* Minimum Python version of 3.4. Earlier 3.X versions may work but are not
+ guaranteed.
+
+* The kernel must have network namespace support if using nsPlugin
+
+* The kernel must have veth support available, as a veth pair is created
+ prior to running the tests when using nsPlugin.
+
+* The kernel must have the appropriate infrastructure enabled to run all tdc
+ unit tests. See the config file in this directory for minimum required
+ features. As new tests are added, the list of config options will be updated.
+
+* All tc-related features being tested must be built in or available as
+ modules. To check what is required in current setup run:
+ ./tdc.py -c
+
+ Note:
+ In the current release, tdc run will abort due to a failure in setup or
+ teardown commands - which includes not being able to run a test simply
+ because the kernel did not support a specific feature. (This will be
+ handled in a future version - the current workaround is to run the tests
+ on specific test categories that your kernel supports)
+
+
+BEFORE YOU RUN
+--------------
+
+The path to the tc executable that will be most commonly tested can be defined
+in the tdc_config.py file. Find the 'TC' entry in the NAMES dictionary and
+define the path.
+
+If you need to test a different tc executable on the fly, you can do so by
+using the -p option when running tdc:
+ ./tdc.py -p /path/to/tc
+
+
+RUNNING TDC
+-----------
+
+To use tdc, root privileges are required. This is because the
+commands being tested must be run as root. The code that enforces
+execution by root uid has been moved into a plugin (see PLUGIN
+ARCHITECTURE, below).
+
+Tests that use a network device should have nsPlugin.py listed as a
+requirement for that test. nsPlugin executes all commands within a
+network namespace and creates a veth pair which may be used in those test
+cases. To disable execution within the namespace, pass the -N option
+to tdc when starting a test run; the veth pair will still be created
+by the plugin.
+
+Running tdc without any arguments will run all tests. Refer to the section
+on command line arguments for more information, or run:
+ ./tdc.py -h
+
+tdc will list the test names as they are being run, and print a summary in
+TAP (Test Anything Protocol) format when they are done. If tests fail,
+output captured from the failing test will be printed immediately following
+the failed test in the TAP output.
+
+
+OVERVIEW OF TDC EXECUTION
+-------------------------
+
+One run of tests is considered a "test suite" (this will be refined in the
+future). A test suite has one or more test cases in it.
+
+A test case has four stages:
+
+ - setup
+ - execute
+ - verify
+ - teardown
+
+The setup and teardown stages can run zero or more commands. The setup
+stage does some setup if the test needs it. The teardown stage undoes
+the setup and returns the system to a "neutral" state so any other test
+can be run next. These two stages require any commands run to return
+success, but do not otherwise verify the results.
+
+The execute and verify stages each run one command. The execute stage
+tests the return code against one or more acceptable values. The
+verify stage checks the return code for success, and also compares
+the stdout with a regular expression.
+
+Each of the commands in any stage will run in a shell instance.
+
+
+USER-DEFINED CONSTANTS
+----------------------
+
+The tdc_config.py file contains multiple values that can be altered to suit
+your needs. Any value in the NAMES dictionary can be altered without affecting
+the tests to be run. These values are used in the tc commands that will be
+executed as part of the test. More will be added as test cases require.
+
+Example:
+ $TC qdisc add dev $DEV1 ingress
+
+The NAMES values are used to substitute into the commands in the test cases.
+
+
+COMMAND LINE ARGUMENTS
+----------------------
+
+Run tdc.py -h to see the full list of available arguments.
+
+usage: tdc.py [-h] [-p PATH] [-D DIR [DIR ...]] [-f FILE [FILE ...]]
+ [-c [CATG [CATG ...]]] [-e ID [ID ...]] [-l] [-s] [-i] [-v] [-N]
+ [-d DEVICE] [-P] [-n] [-V]
+
+Linux TC unit tests
+
+optional arguments:
+ -h, --help show this help message and exit
+ -p PATH, --path PATH The full path to the tc executable to use
+ -v, --verbose Show the commands that are being run
+ -N, --notap Suppress tap results for command under test
+ -d DEVICE, --device DEVICE
+ Execute test cases that use a physical device, where
+ DEVICE is its name. (If not defined, tests that require
+ a physical device will be skipped)
+ -P, --pause Pause execution just before post-suite stage
+
+selection:
+ select which test cases: files plus directories; filtered by categories
+ plus testids
+
+ -D DIR [DIR ...], --directory DIR [DIR ...]
+ Collect tests from the specified directory(ies)
+ (default [tc-tests])
+ -f FILE [FILE ...], --file FILE [FILE ...]
+ Run tests from the specified file(s)
+ -c [CATG [CATG ...]], --category [CATG [CATG ...]]
+ Run tests only from the specified category/ies, or if
+ no category/ies is/are specified, list known
+ categories.
+ -e ID [ID ...], --execute ID [ID ...]
+ Execute the specified test cases with specified IDs
+
+action:
+ select action to perform on selected test cases
+
+ -l, --list List all test cases, or those only within the
+ specified category
+ -s, --show Display the selected test cases
+ -i, --id Generate ID numbers for new test cases
+
+netns:
+ options for nsPlugin (run commands in net namespace)
+
+ -N, --no-namespace
+ Do not run commands in a network namespace.
+
+valgrind:
+ options for valgrindPlugin (run command under test under Valgrind)
+
+ -V, --valgrind Run commands under valgrind
+
+
+PLUGIN ARCHITECTURE
+-------------------
+
+There is now a plugin architecture, and some of the functionality that
+was in the tdc.py script has been moved into the plugins.
+
+The plugins are in the directory plugin-lib. They are executed from
+directory plugins. Put symbolic links from plugins to plugin-lib,
+and name them according to the order you want them to run. This is not
+necessary if a test case being run requires a specific plugin to work.
+
+Example:
+
+bjb@bee:~/work/tc-testing$ ls -l plugins
+total 4
+lrwxrwxrwx 1 bjb bjb 27 Oct 4 16:12 10-rootPlugin.py -> ../plugin-lib/rootPlugin.py
+lrwxrwxrwx 1 bjb bjb 25 Oct 12 17:55 20-nsPlugin.py -> ../plugin-lib/nsPlugin.py
+-rwxr-xr-x 1 bjb bjb 0 Sep 29 15:56 __init__.py
+
+The plugins are a subclass of TdcPlugin, defined in TdcPlugin.py and
+must be called "SubPlugin" so tdc can find them. They are
+distinguished from each other in the python program by their module
+name.
+
+This base class supplies "hooks" to run extra functions. These hooks are as follows:
+
+pre- and post-suite
+pre- and post-case
+pre- and post-execute stage
+adjust-command (runs in all stages and receives the stage name)
+
+The pre-suite hook receives the number of tests and an array of test ids.
+This allows you to dump out the list of skipped tests in the event of a
+failure during setup or teardown stage.
+
+The pre-case hook receives the ordinal number and test id of the current test.
+
+The adjust-command hook receives the stage id (see list below) and the
+full command to be executed. This allows for last-minute adjustment
+of the command.
+
+The stages are identified by the following strings:
+
+ - pre (pre-suite)
+ - setup
+ - command
+ - verify
+ - teardown
+ - post (post-suite)
+
+
+To write a plugin, you need to inherit from TdcPlugin in
+TdcPlugin.py. To use the plugin, you have to put the
+implementation file in plugin-lib, and add a symbolic link to it from
+plugins. It will be detected at run time and invoked at the
+appropriate times. There are a few examples in the plugin-lib
+directory:
+
+ - rootPlugin.py:
+ implements the enforcement of running as root
+ - nsPlugin.py:
+ sets up a network namespace and runs all commands in that namespace,
+ while also setting up dummy devices to be used in testing.
+ - valgrindPlugin.py
+ runs each command in the execute stage under valgrind,
+ and checks for leaks.
+ This plugin will output an extra test for each test in the test file,
+ one is the existing output as to whether the test passed or failed,
+ and the other is a test whether the command leaked memory or not.
+ (This one is a preliminary version, it may not work quite right yet,
+ but the overall template is there and it should only need tweaks.)
+ - buildebpfPlugin.py:
+ builds all programs in $EBPFDIR.
+
+
+ACKNOWLEDGEMENTS
+----------------
+
+Thanks to:
+
+Jamal Hadi Salim, for providing valuable test cases
+Keara Leibovitz, who wrote the CLI test driver that I used as a base for the
+ first version of the tc testing suite. This work was presented at
+ Netdev 1.2 Tokyo in October 2016.
+Samir Hussain, for providing help while I dove into Python for the first time
+ and being a second eye for this code.
diff --git a/tools/testing/selftests/tc-testing/TODO.txt b/tools/testing/selftests/tc-testing/TODO.txt
new file mode 100644
index 000000000..c40698557
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/TODO.txt
@@ -0,0 +1,31 @@
+tc Testing Suite To-Do list:
+
+- Determine what tc features are supported in the kernel. If features are not
+ present, prevent the related categories from running.
+
+- Add support for multiple versions of tc to run successively
+
+- Improve error messages when tdc aborts its run. Partially done - still
+ need to better handle problems in pre- and post-suite.
+
+- Use python logger module for debug/verbose output
+
+- Allow tdc to write its results to file.
+ Maybe use python logger module for this too.
+
+- A better implementation of the "hooks". Currently, every plugin
+ will attempt to run a function at every hook point. Could be
+ changed so that plugin __init__ methods will register functions to
+ be run in the various predefined times. Then if a plugin does not
+ require action at a specific point, no penalty will be paid for
+ trying to run a function that will do nothing.
+
+- Proper exception handling - make an exception class and use it
+
+- a TestCase class, for easier testcase handling, searching, comparison
+
+- a TestSuite class
+ and a way to configure a test suite,
+ to automate running multiple "test suites" with different requirements
+
+- super simple test case example using ls, touch, etc
diff --git a/tools/testing/selftests/tc-testing/TdcPlugin.py b/tools/testing/selftests/tc-testing/TdcPlugin.py
new file mode 100644
index 000000000..79f3ca861
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/TdcPlugin.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python3
+
+class TdcPlugin:
+ def __init__(self):
+ super().__init__()
+ print(' -- {}.__init__'.format(self.sub_class))
+
+ def pre_suite(self, testcount, testidlist):
+ '''run commands before test_runner goes into a test loop'''
+ self.testcount = testcount
+ self.testidlist = testidlist
+ if self.args.verbose > 1:
+ print(' -- {}.pre_suite'.format(self.sub_class))
+
+ def post_suite(self, index):
+ '''run commands after test_runner completes the test loop
+ index is the last ordinal number of test that was attempted'''
+ if self.args.verbose > 1:
+ print(' -- {}.post_suite'.format(self.sub_class))
+
+ def pre_case(self, caseinfo, test_skip):
+ '''run commands before test_runner does one test'''
+ if self.args.verbose > 1:
+ print(' -- {}.pre_case'.format(self.sub_class))
+ self.args.caseinfo = caseinfo
+ self.args.test_skip = test_skip
+
+ def post_case(self):
+ '''run commands after test_runner does one test'''
+ if self.args.verbose > 1:
+ print(' -- {}.post_case'.format(self.sub_class))
+
+ def pre_execute(self):
+ '''run command before test-runner does the execute step'''
+ if self.args.verbose > 1:
+ print(' -- {}.pre_execute'.format(self.sub_class))
+
+ def post_execute(self):
+ '''run command after test-runner does the execute step'''
+ if self.args.verbose > 1:
+ print(' -- {}.post_execute'.format(self.sub_class))
+
+ def adjust_command(self, stage, command):
+ '''adjust the command'''
+ if self.args.verbose > 1:
+ print(' -- {}.adjust_command {}'.format(self.sub_class, stage))
+
+ # if stage == 'pre':
+ # pass
+ # elif stage == 'setup':
+ # pass
+ # elif stage == 'execute':
+ # pass
+ # elif stage == 'verify':
+ # pass
+ # elif stage == 'teardown':
+ # pass
+ # elif stage == 'post':
+ # pass
+ # else:
+ # pass
+
+ return command
+
+ def add_args(self, parser):
+ '''Get the plugin args from the command line'''
+ self.argparser = parser
+ return self.argparser
+
+ def check_args(self, args, remaining):
+ '''Check that the args are set correctly'''
+ self.args = args
+ if self.args.verbose > 1:
+ print(' -- {}.check_args'.format(self.sub_class))
diff --git a/tools/testing/selftests/tc-testing/TdcResults.py b/tools/testing/selftests/tc-testing/TdcResults.py
new file mode 100644
index 000000000..1e4d95fdf
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/TdcResults.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+from enum import Enum
+
+class ResultState(Enum):
+ noresult = -1
+ skip = 0
+ success = 1
+ fail = 2
+
+class TestResult:
+ def __init__(self, test_id="", test_name=""):
+ self.test_id = test_id
+ self.test_name = test_name
+ self.result = ResultState.noresult
+ self.failmsg = ""
+ self.errormsg = ""
+ self.steps = []
+
+ def set_result(self, result):
+ if (isinstance(result, ResultState)):
+ self.result = result
+ return True
+ else:
+ raise TypeError('Unknown result type, must be type ResultState')
+
+ def get_result(self):
+ return self.result
+
+ def set_errormsg(self, errormsg):
+ self.errormsg = errormsg
+ return True
+
+ def append_errormsg(self, errormsg):
+ self.errormsg = '{}\n{}'.format(self.errormsg, errormsg)
+
+ def get_errormsg(self):
+ return self.errormsg
+
+ def set_failmsg(self, failmsg):
+ self.failmsg = failmsg
+ return True
+
+ def append_failmsg(self, failmsg):
+ self.failmsg = '{}\n{}'.format(self.failmsg, failmsg)
+
+ def get_failmsg(self):
+ return self.failmsg
+
+ def add_steps(self, newstep): # record executed step(s): a list extends self.steps, a str is appended
+ if type(newstep) == list:
+ self.steps.extend(newstep)
+ elif type(newstep) == str:
+ self.steps.append(newstep) # append the argument itself; the name 'step' is not defined in this method
+ else:
+ raise TypeError('TdcResults.add_steps() requires a list or str')
+
+ def get_executed_steps(self):
+ return self.steps
+
+class TestSuiteReport():
+ _testsuite = []
+
+ def add_resultdata(self, result_data):
+ if isinstance(result_data, TestResult):
+ self._testsuite.append(result_data)
+ return True
+
+ def count_tests(self):
+ return len(self._testsuite)
+
+ def count_failures(self):
+ return sum(1 for t in self._testsuite if t.result == ResultState.fail)
+
+ def count_skips(self):
+ return sum(1 for t in self._testsuite if t.result == ResultState.skip)
+
+ def find_result(self, test_id):
+ return next((tr for tr in self._testsuite if tr.test_id == test_id), None)
+
+ def update_result(self, result_data):
+ orig = self.find_result(result_data.test_id)
+ if orig != None:
+ idx = self._testsuite.index(orig)
+ self._testsuite[idx] = result_data
+ else:
+ self.add_resultdata(result_data)
+
+ def format_tap(self):
+ ftap = ""
+ ftap += '1..{}\n'.format(self.count_tests())
+ index = 1
+ for t in self._testsuite:
+ if t.result == ResultState.fail:
+ ftap += 'not '
+ ftap += 'ok {} {} - {}'.format(str(index), t.test_id, t.test_name)
+ if t.result == ResultState.skip or t.result == ResultState.noresult:
+ ftap += ' # skipped - {}\n'.format(t.errormsg)
+ elif t.result == ResultState.fail:
+ if len(t.steps) > 0:
+ ftap += '\tCommands executed in this test case:'
+ for step in t.steps:
+ ftap += '\n\t\t{}'.format(step)
+ ftap += '\n\t{}'.format(t.failmsg)
+ ftap += '\n'
+ index += 1
+ return ftap
+
+ def format_xunit(self):
+ from xml.sax.saxutils import escape
+ xunit = "<testsuites>\n"
+ xunit += '\t<testsuite tests=\"{}\" skips=\"{}\">\n'.format(self.count_tests(), self.count_skips())
+ for t in self._testsuite:
+ xunit += '\t\t<testcase classname=\"{}\" '.format(escape(t.test_id))
+ xunit += 'name=\"{}\">\n'.format(escape(t.test_name))
+ if t.failmsg:
+ xunit += '\t\t\t<failure>\n'
+ if len(t.steps) > 0:
+ xunit += 'Commands executed in this test case:\n'
+ for step in t.steps:
+ xunit += '\t{}\n'.format(escape(step))
+ xunit += 'FAILURE: {}\n'.format(escape(t.failmsg))
+ xunit += '\t\t\t</failure>\n'
+ if t.errormsg:
+ xunit += '\t\t\t<error>\n{}\n'.format(escape(t.errormsg))
+ xunit += '\t\t\t</error>\n'
+ if t.result == ResultState.skip:
+ xunit += '\t\t\t<skipped/>\n'
+ xunit += '\t\t</testcase>\n'
+ xunit += '\t</testsuite>\n'
+ xunit += '</testsuites>\n'
+ return xunit
diff --git a/tools/testing/selftests/tc-testing/action.c b/tools/testing/selftests/tc-testing/action.c
new file mode 100644
index 000000000..c32b99b80
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/action.c
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright (c) 2018 Davide Caratti, Red Hat inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+
+__attribute__((section("action-ok"),used)) int action_ok(struct __sk_buff *s)
+{
+ return TC_ACT_OK;
+}
+
+__attribute__((section("action-ko"),used)) int action_ko(struct __sk_buff *s)
+{
+ s->data = 0x0;
+ return TC_ACT_OK;
+}
+
+char _license[] __attribute__((section("license"),used)) = "GPL";
diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config
new file mode 100644
index 000000000..5f581c3a1
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/config
@@ -0,0 +1,68 @@
+#
+# Core Netfilter Configuration
+#
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CONNTRACK_MARK=y
+CONFIG_NF_CONNTRACK_ZONES=y
+CONFIG_NF_CONNTRACK_LABELS=y
+CONFIG_NF_FLOW_TABLE=m
+CONFIG_NF_NAT=m
+
+CONFIG_NET_SCHED=y
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_PRIO=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_EMATCH_IPSET=m
+CONFIG_NET_EMATCH_CANID=m
+CONFIG_NET_EMATCH_IPT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_SAMPLE=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_NAT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+CONFIG_NET_ACT_SKBEDIT=m
+CONFIG_NET_ACT_CSUM=m
+CONFIG_NET_ACT_VLAN=m
+CONFIG_NET_ACT_BPF=m
+CONFIG_NET_ACT_CONNMARK=m
+CONFIG_NET_ACT_CTINFO=m
+CONFIG_NET_ACT_SKBMOD=m
+CONFIG_NET_ACT_IFE=m
+CONFIG_NET_ACT_TUNNEL_KEY=m
+CONFIG_NET_ACT_CT=m
+CONFIG_NET_ACT_MPLS=m
+CONFIG_NET_IFE_SKBMARK=m
+CONFIG_NET_IFE_SKBPRIO=m
+CONFIG_NET_IFE_SKBTCINDEX=m
+CONFIG_NET_SCH_FIFO=y
+CONFIG_NET_SCH_ETS=m
+CONFIG_NET_SCH_RED=m
+
+#
+## Network testing
+#
+CONFIG_CAN=m
diff --git a/tools/testing/selftests/tc-testing/creating-plugins/AddingPlugins.txt b/tools/testing/selftests/tc-testing/creating-plugins/AddingPlugins.txt
new file mode 100644
index 000000000..c18f88d09
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/creating-plugins/AddingPlugins.txt
@@ -0,0 +1,104 @@
+tdc - Adding plugins for tdc
+
+Author: Brenda J. Butler - bjb@mojatatu.com
+
+ADDING PLUGINS
+--------------
+
+A new plugin should be written in python as a class that inherits from TdcPlugin.
+There are some examples in plugin-lib.
+
+The plugin can be used to add functionality to the test framework,
+such as:
+
+- adding commands to be run before and/or after the test suite
+- adding commands to be run before and/or after the test cases
+- adding commands to be run before and/or after the execute phase of the test cases
+- ability to alter the command to be run in any phase:
+ pre (the pre-suite stage)
+ prepare
+ execute
+ verify
+ teardown
+ post (the post-suite stage)
+- ability to add to the command line args, and use them at run time
+
+
+The functions in the class should follow the following interfaces:
+
+ def __init__(self)
+ def pre_suite(self, testcount, testidlist) # see "PRE_SUITE" below
+ def post_suite(self, ordinal) # see "SKIPPING" below
+ def pre_case(self, test_ordinal, testid) # see "PRE_CASE" below
+ def post_case(self)
+ def pre_execute(self)
+ def post_execute(self)
+ def adjust_command(self, stage, command) # see "ADJUST" below
+ def add_args(self, parser) # see "ADD_ARGS" below
+ def check_args(self, args, remaining) # see "CHECK_ARGS" below
+
+
+PRE_SUITE
+
+This method takes a testcount (number of tests to be run) and
+testidlist (array of test ids for tests that will be run). This is
+useful for various things, including when an exception occurs and the
+rest of the tests must be skipped. The info is stored in the object,
+and the post_suite method can refer to it when dumping the "skipped"
+TAP output. The tdc.py script will do that for the test suite as
+defined in the test case, but if the plugin is being used to run extra
+tests on each test (eg, check for memory leaks on associated
+co-processes) then that other tap output can be generated in the
+post-suite method using this info passed in to the pre_suite method.
+
+
+SKIPPING
+
+The post_suite method will receive the ordinal number of the last
+test to be attempted. It can use this info when outputting
+the TAP output for the extra test cases.
+
+
+PRE_CASE
+
+The pre_case method will receive the ordinal number of the test
+and the test id. Useful for outputting the extra test results.
+
+
+ADJUST
+
+The adjust_command method receives a string representing
+the execution stage and a string which is the actual command to be
+executed. The plugin can adjust the command, based on the stage of
+execution.
+
+The stages are represented by the following strings:
+
+ 'pre'
+ 'setup'
+ 'command'
+ 'verify'
+ 'teardown'
+ 'post'
+
+The adjust_command method must return the adjusted command so tdc
+can use it.
+
+
+ADD_ARGS
+
+The add_args method receives the argparser object and can add
+arguments to it. Care should be taken that the new arguments do not
+conflict with any from tdc.py or from other plugins that will be used
+concurrently.
+
+The add_args method should return the argparser object.
+
+
+CHECK_ARGS
+
+The check_args method is so that the plugin can do validation on
+the args, if needed. If there is a problem, an Exception should
+be raised, with a string that explains the problem.
+
+eg: raise Exception('plugin xxx, arg -y is wrong, fix it')
diff --git a/tools/testing/selftests/tc-testing/creating-testcases/AddingTestCases.txt b/tools/testing/selftests/tc-testing/creating-testcases/AddingTestCases.txt
new file mode 100644
index 000000000..a28571aff
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/creating-testcases/AddingTestCases.txt
@@ -0,0 +1,105 @@
+tdc - Adding test cases for tdc
+
+Author: Lucas Bates - lucasb@mojatatu.com
+
+ADDING TEST CASES
+-----------------
+
+User-defined tests should be added by defining a separate JSON file. This
+will help prevent conflicts when updating the repository. Refer to
+template.json for the required JSON format for test cases.
+
+Include the 'id' field, but do not assign a value. Running tdc with the -i
+option will generate a unique ID for that test case.
+
+tdc will recursively search the 'tc-tests' subdirectory (or the
+directories named with the -D option) for .json files. Any test case
+files you create in these directories will automatically be included.
+If you wish to store your custom test cases elsewhere, be sure to run
+tdc with the -f argument and the path to your file, or the -D argument
+and the path to your directory(ies).
+
+Be aware of required escape characters in the JSON data - particularly
+when defining the match pattern. Refer to the supplied json test files
+for examples when in doubt. The match pattern is written in json, and
+will be used by python. So the match pattern will be a python regular
+expression, but should be written using json syntax.
+
+
+TEST CASE STRUCTURE
+-------------------
+
+Each test case has required data:
+
+id: A unique alphanumeric value to identify a particular test case
+name: Descriptive name that explains the command under test
+skip: A completely optional key, if the corresponding value is "yes"
+ then tdc will not execute the test case in question. However,
+ this test case will still appear in the results output but
+ marked as skipped. This key can be placed anywhere inside the
+ test case at the top level.
+category: A list of single-word descriptions covering what the command
+ under test is testing. Example: filter, actions, u32, gact, etc.
+setup: The list of commands required to ensure the command under test
+ succeeds. For example: if testing a filter, the command to create
+ the qdisc would appear here.
+ This list can be empty.
+ Each command can be a string to be executed, or a list consisting
+ of a string which is a command to be executed, followed by 1 or
+ more acceptable exit codes for this command.
+ If only a string is given for the command, then an exit code of 0
+ will be expected.
+cmdUnderTest: The tc command being tested itself.
+expExitCode: The code returned by the command under test upon its termination.
+ tdc will compare this value against the actual returned value.
+verifyCmd: The tc command to be run to verify successful execution.
+ For example: if the command under test creates a gact action,
+ verifyCmd should be "$TC actions show action gact"
+matchPattern: A regular expression to be applied against the output of the
+ verifyCmd to prove the command under test succeeded. This pattern
+ should be as specific as possible so that a false positive is not
+ matched.
+matchCount: How many times the regex in matchPattern should match. A value
+ of 0 is acceptable.
+teardown: The list of commands to clean up after the test is completed.
+ The environment should be returned to the same state as when
+ this test was started: qdiscs deleted, actions flushed, etc.
+ This list can be empty.
+ Each command can be a string to be executed, or a list consisting
+ of a string which is a command to be executed, followed by 1 or
+ more acceptable exit codes for this command.
+ If only a string is given for the command, then an exit code of 0
+ will be expected.
+
+
+SETUP/TEARDOWN ERRORS
+---------------------
+
+If an error is detected during the setup/teardown process, execution of the
+tests will immediately stop with an error message and the namespace in which
+the tests are run will be destroyed. This is to prevent inaccurate results
+in the test cases. tdc will output a series of TAP results for the skipped
+tests.
+
+Repeated failures of the setup/teardown may indicate a problem with the test
+case, or possibly even a bug in one of the commands that are not being tested.
+
+It's possible to include acceptable exit codes with the setup/teardown command
+so that it doesn't halt the script for an error that doesn't matter. Turn the
+individual command into a list, with the command being first, followed by all
+acceptable exit codes for the command.
+
+Example:
+
+A pair of setup commands. The first can have exit code 0, 1 or 255, the
+second must have exit code 0.
+
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action reclassify index 65536"
+ ],
diff --git a/tools/testing/selftests/tc-testing/creating-testcases/example.json b/tools/testing/selftests/tc-testing/creating-testcases/example.json
new file mode 100644
index 000000000..5ec501200
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/creating-testcases/example.json
@@ -0,0 +1,55 @@
+[
+ {
+ "id": "1f",
+ "name": "simple test to test framework",
+ "category": [
+ "example"
+ ],
+ "setup": [
+ "mkdir mytest"
+ ],
+ "cmdUnderTest": "touch mytest/blorfl",
+ "expExitCode": "0",
+ "verifyCmd": "ls mytest/* | grep '[b]lorfl'",
+ "matchPattern": "orfl",
+ "matchCount": "1",
+ "teardown": [
+ "rm -rf mytest"
+ ]
+ },
+ {
+ "id": "2f",
+ "name": "simple test, no need for verify",
+ "category": [
+ "example"
+ ],
+ "setup": [
+ "mkdir mytest",
+ "touch mytest/blorfl"
+ ],
+ "cmdUnderTest": "ls mytest/blorfl",
+ "expExitCode": "0",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "rm -rf mytest"
+ ]
+ },
+ {
+ "id": "3f",
+ "name": "simple test, no need for setup or teardown (or verify)",
+ "category": [
+ "example"
+ ],
+ "setup": [
+ ],
+ "cmdUnderTest": "ip l l lo",
+ "expExitCode": "0",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/creating-testcases/scapy-example.json b/tools/testing/selftests/tc-testing/creating-testcases/scapy-example.json
new file mode 100644
index 000000000..5a9377b72
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/creating-testcases/scapy-example.json
@@ -0,0 +1,98 @@
+[
+ {
+ "id": "b1e9",
+ "name": "Test matching of source IP",
+ "category": [
+ "actions",
+ "scapy"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin",
+ "scapyPlugin"
+ ]
+ },
+ "setup": [
+ [
+ "$TC qdisc del dev $DEV1 ingress",
+ 0,
+ 1,
+ 2,
+ 255
+ ],
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 3 protocol ip flower src_ip 16.61.16.61 flowid 1:1 action ok",
+ "scapy": {
+ "iface": "$DEV0",
+ "count": 1,
+ "packet": "Ether(type=0x800)/IP(src='16.61.16.61')/ICMP()"
+ },
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s -j filter ls dev $DEV1 ingress prio 3",
+ "matchJSON": [
+ {
+ "path": [
+ 1,
+ "options",
+ "actions",
+ 0,
+ "stats",
+ "packets"
+ ],
+ "value": 1
+ }
+ ],
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "e9c4",
+ "name": "Test matching of source IP with wrong count",
+ "category": [
+ "actions",
+ "scapy"
+ ],
+ "plugins": {
+ "requires": [
+ "nsPlugin",
+ "scapyPlugin"
+ ]
+ },
+ "setup": [
+ [
+ "$TC qdisc del dev $DEV1 ingress",
+ 0,
+ 1,
+ 2,
+ 255
+ ],
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 3 protocol ip flower src_ip 16.61.16.61 flowid 1:1 action ok",
+ "scapy": {
+ "iface": "$DEV0",
+ "count": 3,
+ "packet": "Ether(type=0x800)/IP(src='16.61.16.61')/ICMP()"
+ },
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s -j filter ls dev $DEV1 parent ffff:",
+ "matchJSON": [
+ {
+ "path": [
+ 1,
+ "options",
+ "actions",
+ 0,
+ "stats",
+ "packets"
+ ],
+ "value": 1
+ }
+ ],
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/creating-testcases/template.json b/tools/testing/selftests/tc-testing/creating-testcases/template.json
new file mode 100644
index 000000000..8b99b86d6
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/creating-testcases/template.json
@@ -0,0 +1,51 @@
+[
+ {
+ "id": "",
+ "name": "",
+ "category": [
+ "",
+ ""
+ ],
+ "setup": [
+ ""
+ ],
+ "cmdUnderTest": "",
+ "expExitCode": "",
+ "verifyCmd": "",
+ "matchPattern": "",
+ "matchCount": "",
+ "teardown": [
+ ""
+ ]
+ },
+ {
+ "id": "",
+ "name": "",
+ "category": [
+ "",
+ ""
+ ],
+ "setup": [
+ "",
+ [
+ "",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "",
+ "expExitCode": "",
+ "verifyCmd": "",
+ "matchPattern": "",
+ "matchCount": "",
+ "teardown": [
+ "",
+ [
+ "",
+ 0,
+ 255
+ ]
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/README-PLUGINS b/tools/testing/selftests/tc-testing/plugin-lib/README-PLUGINS
new file mode 100644
index 000000000..aa8a26697
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/README-PLUGINS
@@ -0,0 +1,27 @@
+tdc.py will look for plugins in a directory named "plugins" under the cwd.
+Make a set of numbered symbolic links from there to the actual plugins.
+E.g.:
+
+tdc.py
+plugin-lib/
+plugins/
+ __init__.py
+ 10-rootPlugin.py -> ../plugin-lib/rootPlugin.py
+ 20-valgrindPlugin.py -> ../plugin-lib/valgrindPlugin.py
+ 30-nsPlugin.py -> ../plugin-lib/nsPlugin.py
+
+
+tdc.py will find them and use them.
+
+
+rootPlugin
+ Check if the uid is root. If not, bail out.
+
+valgrindPlugin
+ Run the command under test with valgrind, and produce an extra set of TAP results for the memory tests.
+ This plugin will write files to the cwd, called vgnd-xxx.log. These will contain
+ the valgrind output for test xxx. Any file matching the glob 'vgnd-*.log' will be
+ deleted at the end of the run.
+
+nsPlugin
+ Run all the commands in a network namespace.
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/buildebpfPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/buildebpfPlugin.py
new file mode 100644
index 000000000..d34fe0626
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/buildebpfPlugin.py
@@ -0,0 +1,67 @@
+'''
+build ebpf program
+'''
+
+import os
+import signal
+from string import Template
+import subprocess
+import time
+from TdcPlugin import TdcPlugin
+from tdc_config import *
+
+class SubPlugin(TdcPlugin):
+ def __init__(self):
+ self.sub_class = 'buildebpf/SubPlugin'
+ self.tap = ''
+ super().__init__()
+
+ def pre_suite(self, testcount, testidlist):
+ super().pre_suite(testcount, testidlist)
+
+ if self.args.buildebpf:
+ self._ebpf_makeall()
+
+ def post_suite(self, index):
+ super().post_suite(index)
+
+ self._ebpf_makeclean()
+
+ def add_args(self, parser):
+ super().add_args(parser)
+
+ self.argparser_group = self.argparser.add_argument_group(
+ 'buildebpf',
+ 'options for buildebpfPlugin')
+ self.argparser_group.add_argument(
+ '--nobuildebpf', action='store_false', default=True,
+ dest='buildebpf',
+ help='Don\'t build eBPF programs')
+
+ return self.argparser
+
+ def _ebpf_makeall(self):
+ if self.args.buildebpf:
+ self._make('all')
+
+ def _ebpf_makeclean(self):
+ if self.args.buildebpf:
+ self._make('clean')
+
+ def _make(self, target):
+ command = 'make -C {} {}'.format(self.args.NAMES['EBPFDIR'], target)
+ proc = subprocess.Popen(command,
+ shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=os.environ.copy())
+ (rawout, serr) = proc.communicate()
+
+ if proc.returncode != 0 and len(serr) > 0:
+ foutput = serr.decode("utf-8")
+ else:
+ foutput = rawout.decode("utf-8")
+
+ proc.stdout.close()
+ proc.stderr.close()
+ return proc, foutput
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py
new file mode 100644
index 000000000..9539cffa9
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/nsPlugin.py
@@ -0,0 +1,155 @@
+import os
+import signal
+from string import Template
+import subprocess
+import time
+from TdcPlugin import TdcPlugin
+
+from tdc_config import *
+
+class SubPlugin(TdcPlugin):
+ def __init__(self):
+ self.sub_class = 'ns/SubPlugin'
+ super().__init__()
+
+ def pre_suite(self, testcount, testidlist):
+ '''run commands before test_runner goes into a test loop'''
+ super().pre_suite(testcount, testidlist)
+
+ if self.args.namespace:
+ self._ns_create()
+ else:
+ self._ports_create()
+
+ def post_suite(self, index):
+ '''run commands after test_runner goes into a test loop'''
+ super().post_suite(index)
+ if self.args.verbose:
+ print('{}.post_suite'.format(self.sub_class))
+
+ if self.args.namespace:
+ self._ns_destroy()
+ else:
+ self._ports_destroy()
+
+ def add_args(self, parser):
+ super().add_args(parser)
+ self.argparser_group = self.argparser.add_argument_group(
+ 'netns',
+ 'options for nsPlugin(run commands in net namespace)')
+ self.argparser_group.add_argument(
+ '-N', '--no-namespace', action='store_false', default=True,
+ dest='namespace', help='Don\'t run commands in namespace')
+ return self.argparser
+
+ def adjust_command(self, stage, command):
+ super().adjust_command(stage, command)
+ cmdform = 'list'
+ cmdlist = list()
+
+ if not self.args.namespace:
+ return command
+
+ if self.args.verbose:
+ print('{}.adjust_command'.format(self.sub_class))
+
+ if not isinstance(command, list):
+ cmdform = 'str'
+ cmdlist = command.split()
+ else:
+ cmdlist = command
+ if stage == 'setup' or stage == 'execute' or stage == 'verify' or stage == 'teardown':
+ if self.args.verbose:
+ print('adjust_command: stage is {}; inserting netns stuff in command [{}] list [{}]'.format(stage, command, cmdlist))
+ cmdlist.insert(0, self.args.NAMES['NS'])
+ cmdlist.insert(0, 'exec')
+ cmdlist.insert(0, 'netns')
+ cmdlist.insert(0, self.args.NAMES['IP'])
+ else:
+ pass
+
+ if cmdform == 'str':
+ command = ' '.join(cmdlist)
+ else:
+ command = cmdlist
+
+ if self.args.verbose:
+ print('adjust_command: return command [{}]'.format(command))
+ return command
+
+ def _ports_create(self):
+ cmd = '$IP link add $DEV0 type veth peer name $DEV1'
+ self._exec_cmd('pre', cmd)
+ cmd = '$IP link set $DEV0 up'
+ self._exec_cmd('pre', cmd)
+ if not self.args.namespace:
+ cmd = '$IP link set $DEV1 up'
+ self._exec_cmd('pre', cmd)
+
+ def _ports_destroy(self):
+ cmd = '$IP link del $DEV0'
+ self._exec_cmd('post', cmd)
+
+ def _ns_create(self):
+ '''
+ Create the network namespace in which the tests will be run and set up
+ the required network devices for it.
+ '''
+ self._ports_create()
+ if self.args.namespace:
+ cmd = '$IP netns add {}'.format(self.args.NAMES['NS'])
+ self._exec_cmd('pre', cmd)
+ cmd = '$IP link set $DEV1 netns {}'.format(self.args.NAMES['NS'])
+ self._exec_cmd('pre', cmd)
+ cmd = '$IP -n {} link set $DEV1 up'.format(self.args.NAMES['NS'])
+ self._exec_cmd('pre', cmd)
+ if self.args.device:
+ cmd = '$IP link set $DEV2 netns {}'.format(self.args.NAMES['NS'])
+ self._exec_cmd('pre', cmd)
+ cmd = '$IP -n {} link set $DEV2 up'.format(self.args.NAMES['NS'])
+ self._exec_cmd('pre', cmd)
+
+ def _ns_destroy(self):
+ '''
+ Destroy the network namespace for testing (and any associated network
+ devices as well)
+ '''
+ if self.args.namespace:
+ cmd = '$IP netns delete {}'.format(self.args.NAMES['NS'])
+ self._exec_cmd('post', cmd)
+
+ def _exec_cmd(self, stage, command):
+ '''
+ Perform any required modifications on an executable command, then run
+ it in a subprocess and return the results.
+ '''
+ if '$' in command:
+ command = self._replace_keywords(command)
+
+ self.adjust_command(stage, command)
+ if self.args.verbose:
+ print('_exec_cmd: command "{}"'.format(command))
+ proc = subprocess.Popen(command,
+ shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ env=ENVIR)
+ (rawout, serr) = proc.communicate()
+
+ if proc.returncode != 0 and len(serr) > 0:
+ foutput = serr.decode("utf-8")
+ else:
+ foutput = rawout.decode("utf-8")
+
+ proc.stdout.close()
+ proc.stderr.close()
+ return proc, foutput
+
+ def _replace_keywords(self, cmd):
+ """
+ For a given executable command, substitute any known
+ variables contained within NAMES with the correct values
+ """
+ tcmd = Template(cmd)
+ subcmd = tcmd.safe_substitute(self.args.NAMES)
+ return subcmd
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/rootPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/rootPlugin.py
new file mode 100644
index 000000000..e36775bd4
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/rootPlugin.py
@@ -0,0 +1,19 @@
+import os
+import sys
+from TdcPlugin import TdcPlugin
+
+from tdc_config import *
+
+
+class SubPlugin(TdcPlugin):
+ def __init__(self):
+ self.sub_class = 'root/SubPlugin'
+ super().__init__()
+
+ def pre_suite(self, testcount, testidlist):
+ # run commands before test_runner goes into a test loop
+ super().pre_suite(testcount, testidlist)
+
+ if os.geteuid():
+ print('This script must be run with root privileges', file=sys.stderr)
+ exit(1)
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py
new file mode 100644
index 000000000..a7b21658a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/scapyPlugin.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+
+import os
+import signal
+from string import Template
+import subprocess
+import time
+from TdcPlugin import TdcPlugin
+
+from tdc_config import *
+
+try:
+ from scapy.all import *
+except ImportError:
+ print("Unable to import the scapy python module.")
+ print("\nIf not already installed, you may do so with:")
+ print("\t\tpip3 install scapy==2.4.2")
+ exit(1)
+
+class SubPlugin(TdcPlugin):
+ def __init__(self):
+ self.sub_class = 'scapy/SubPlugin'
+ super().__init__()
+
+ def post_execute(self):
+ if 'scapy' not in self.args.caseinfo:
+ if self.args.verbose:
+ print('{}.post_execute: no scapy info in test case'.format(self.sub_class))
+ return
+
+ # Check for required fields
+ scapyinfo = self.args.caseinfo['scapy']
+ scapy_keys = ['iface', 'count', 'packet']
+ missing_keys = []
+ keyfail = False
+ for k in scapy_keys:
+ if k not in scapyinfo:
+ keyfail = True
+ missing_keys.append(k)
+ if keyfail:
+ print('{}: Scapy block present in the test, but is missing info:'
+ .format(self.sub_class))
+ print('{}'.format(missing_keys))
+
+ pkt = eval(scapyinfo['packet'])
+ if '$' in scapyinfo['iface']:
+ tpl = Template(scapyinfo['iface'])
+ scapyinfo['iface'] = tpl.safe_substitute(NAMES)
+ for count in range(scapyinfo['count']):
+ sendp(pkt, iface=scapyinfo['iface'])
diff --git a/tools/testing/selftests/tc-testing/plugin-lib/valgrindPlugin.py b/tools/testing/selftests/tc-testing/plugin-lib/valgrindPlugin.py
new file mode 100644
index 000000000..4bb866575
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugin-lib/valgrindPlugin.py
@@ -0,0 +1,160 @@
+'''
+run the command under test, under valgrind and collect memory leak info
+as a separate test.
+'''
+
+
+import os
+import re
+import signal
+from string import Template
+import subprocess
+import time
+from TdcPlugin import TdcPlugin
+from TdcResults import *
+
+from tdc_config import *
+
+def vp_extract_num_from_string(num_as_string_maybe_with_commas):
+ return int(num_as_string_maybe_with_commas.replace(',',''))
+
+class SubPlugin(TdcPlugin):
+ def __init__(self):
+ self.sub_class = 'valgrind/SubPlugin'
+ self.tap = ''
+ self._tsr = TestSuiteReport()
+ super().__init__()
+
+ def pre_suite(self, testcount, testidlist):
+ '''run commands before test_runner goes into a test loop'''
+ super().pre_suite(testcount, testidlist)
+ if self.args.verbose > 1:
+ print('{}.pre_suite'.format(self.sub_class))
+ if self.args.valgrind:
+ self._add_to_tap('1..{}\n'.format(self.testcount))
+
+ def post_suite(self, index):
+ '''run commands after test_runner goes into a test loop'''
+ super().post_suite(index)
+ if self.args.verbose > 1:
+ print('{}.post_suite'.format(self.sub_class))
+ #print('{}'.format(self.tap))
+ for xx in range(index - 1, self.testcount):
+ res = TestResult('{}-mem'.format(self.testidlist[xx]), 'Test skipped')
+ res.set_result(ResultState.skip)
+ res.set_errormsg('Skipped because of prior setup/teardown failure')
+ self._add_results(res)
+ if self.args.verbose < 4:
+ subprocess.check_output('rm -f vgnd-*.log', shell=True)
+
+ def add_args(self, parser):
+ super().add_args(parser)
+ self.argparser_group = self.argparser.add_argument_group(
+ 'valgrind',
+ 'options for valgrindPlugin (run command under test under Valgrind)')
+
+ self.argparser_group.add_argument(
+ '-V', '--valgrind', action='store_true',
+ help='Run commands under valgrind')
+
+ return self.argparser
+
+ def adjust_command(self, stage, command):
+ super().adjust_command(stage, command)
+ cmdform = 'list'
+ cmdlist = list()
+
+ if not self.args.valgrind:
+ return command
+
+ if self.args.verbose > 1:
+ print('{}.adjust_command'.format(self.sub_class))
+
+ if not isinstance(command, list):
+ cmdform = 'str'
+ cmdlist = command.split()
+ else:
+ cmdlist = command
+
+ if stage == 'execute':
+ if self.args.verbose > 1:
+ print('adjust_command: stage is {}; inserting valgrind stuff in command [{}] list [{}]'.
+ format(stage, command, cmdlist))
+ cmdlist.insert(0, '--track-origins=yes')
+ cmdlist.insert(0, '--show-leak-kinds=definite,indirect')
+ cmdlist.insert(0, '--leak-check=full')
+ cmdlist.insert(0, '--log-file=vgnd-{}.log'.format(self.args.testid))
+ cmdlist.insert(0, '-v') # ask for summary of non-leak errors
+ cmdlist.insert(0, ENVIR['VALGRIND_BIN'])
+ else:
+ pass
+
+ if cmdform == 'str':
+ command = ' '.join(cmdlist)
+ else:
+ command = cmdlist
+
+ if self.args.verbose > 1:
+ print('adjust_command: return command [{}]'.format(command))
+ return command
+
+ def post_execute(self):
+ if not self.args.valgrind:
+ return
+
+ res = TestResult('{}-mem'.format(self.args.testid),
+ '{} memory leak check'.format(self.args.test_name))
+ if self.args.test_skip:
+ res.set_result(ResultState.skip)
+ res.set_errormsg('Test case designated as skipped.')
+ self._add_results(res)
+ return
+
+ self.definitely_lost_re = re.compile(
+ r'definitely lost:\s+([,0-9]+)\s+bytes in\s+([,0-9]+)\sblocks', re.MULTILINE | re.DOTALL)
+ self.indirectly_lost_re = re.compile(
+ r'indirectly lost:\s+([,0-9]+)\s+bytes in\s+([,0-9]+)\s+blocks', re.MULTILINE | re.DOTALL)
+ self.possibly_lost_re = re.compile(
+ r'possibly lost:\s+([,0-9]+)bytes in\s+([,0-9]+)\s+blocks', re.MULTILINE | re.DOTALL)
+ self.non_leak_error_re = re.compile(
+ r'ERROR SUMMARY:\s+([,0-9]+) errors from\s+([,0-9]+)\s+contexts', re.MULTILINE | re.DOTALL)
+
+ def_num = 0
+ ind_num = 0
+ pos_num = 0
+ nle_num = 0
+
+ # what about concurrent test runs? Maybe force them to be in different directories?
+ with open('vgnd-{}.log'.format(self.args.testid)) as vfd:
+ content = vfd.read()
+ def_mo = self.definitely_lost_re.search(content)
+ ind_mo = self.indirectly_lost_re.search(content)
+ pos_mo = self.possibly_lost_re.search(content)
+ nle_mo = self.non_leak_error_re.search(content)
+
+ if def_mo:
+ def_num = int(def_mo.group(2))
+ if ind_mo:
+ ind_num = int(ind_mo.group(2))
+ if pos_mo:
+ pos_num = int(pos_mo.group(2))
+ if nle_mo:
+ nle_num = int(nle_mo.group(1))
+
+ mem_results = ''
+ if (def_num > 0) or (ind_num > 0) or (pos_num > 0) or (nle_num > 0):
+ mem_results += 'not '
+ res.set_result(ResultState.fail)
+ res.set_failmsg('Memory leak detected')
+ res.append_failmsg(content)
+ else:
+ res.set_result(ResultState.success)
+
+ self._add_results(res)
+
+
+ def _add_results(self, res):
+ self._tsr.add_resultdata(res)
+
+ def _add_to_tap(self, more_tap_output):
+ self.tap += more_tap_output
diff --git a/tools/testing/selftests/tc-testing/plugins/__init__.py b/tools/testing/selftests/tc-testing/plugins/__init__.py
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/plugins/__init__.py
diff --git a/tools/testing/selftests/tc-testing/settings b/tools/testing/selftests/tc-testing/settings
new file mode 100644
index 000000000..e2206265f
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/settings
@@ -0,0 +1 @@
+timeout=900
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
new file mode 100644
index 000000000..503982b8f
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/bpf.json
@@ -0,0 +1,321 @@
+[
+ {
+ "id": "d959",
+ "name": "Add cBPF action with valid bytecode",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC action flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' index 100",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action get action bpf index 100",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*index 100 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "f84a",
+ "name": "Add cBPF action with invalid bytecode",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,31 0 1 2048,6 0 0 262144,6 0 0 0' index 100",
+ "expExitCode": "255",
+ "verifyCmd": "$TC action get action bpf index 100",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,31 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*index 100 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action bpf"
+ ]
+ },
+ {
+ "id": "e939",
+ "name": "Add eBPF action with valid object-file",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "plugins": {
+ "requires": "buildebpfPlugin"
+ },
+ "setup": [
+ [
+ "$TC action flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf object-file $EBPFDIR/action.o section action-ok index 667",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action get action bpf index 667",
+ "matchPattern": "action order [0-9]*: bpf action.o:\\[action-ok\\] id [0-9]* tag [0-9a-f]{16}( jited)? default-action pipe.*index 667 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "282d",
+ "name": "Add eBPF action with invalid object-file",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "plugins": {
+ "requires": "buildebpfPlugin"
+ },
+ "setup": [
+ [
+ "$TC action flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf object-file $EBPFDIR/action.o section action-ko index 667",
+ "expExitCode": "255",
+ "verifyCmd": "$TC action get action bpf index 667",
+ "matchPattern": "action order [0-9]*: bpf action.o:\\[action-ko\\] id [0-9].*index 667 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC action flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "d819",
+ "name": "Replace cBPF bytecode and action control",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ [
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' index 555",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action replace action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' drop index 555",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action get action bpf index 555",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' default-action drop.*index 555 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "6ae3",
+ "name": "Delete cBPF action ",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ [
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' index 444",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action delete action bpf index 444",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action get action bpf index 444",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*index 444 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "3e0d",
+ "name": "List cBPF actions",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC action flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' ok index 101",
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' drop index 102",
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 33024,6 0 0 262144,6 0 0 0' continue index 103"
+ ],
+ "cmdUnderTest": "$TC action list action bpf",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action list action bpf",
+ "matchPattern": "action order [0-9]*: bpf bytecode",
+ "matchCount": "3",
+ "teardown": [
+ "$TC actions flush action bpf"
+ ]
+ },
+ {
+ "id": "55ce",
+ "name": "Flush BPF actions",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' ok index 101",
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' drop index 102",
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 33024,6 0 0 262144,6 0 0 0' continue index 103"
+ ],
+ "cmdUnderTest": "$TC action flush action bpf",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action list action bpf",
+ "matchPattern": "action order [0-9]*: bpf bytecode",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action bpf"
+ ]
+ },
+ {
+ "id": "ccc3",
+ "name": "Add cBPF action with duplicate index",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' index 4294967295"
+ ],
+ "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' index 4294967295",
+ "expExitCode": "255",
+ "verifyCmd": "$TC action get action bpf index 4294967295",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*index 4294967295",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "89c7",
+ "name": "Add cBPF action with invalid index",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' index 4294967296 cookie 123456",
+ "expExitCode": "255",
+ "verifyCmd": "$TC action ls action bpf",
+ "matchPattern": "action order [0-9]*: bpf bytecode '4,40 0 0 12,21 0 1 2048,6 0 0 262144,6 0 0 0' default-action pipe.*cookie 123456",
+ "matchCount": "0",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "7ab9",
+ "name": "Add cBPF action with cookie",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC action add action bpf bytecode '4,40 0 0 12,21 0 1 2054,6 0 0 262144,6 0 0 0' cookie d0d0d0d0d0d0d0d0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC action list action bpf",
+ "matchPattern": "action order [0-9]*: bpf.*cookie d0d0d0d0d0d0d0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ },
+ {
+ "id": "b8a1",
+ "name": "Replace bpf action with invalid goto_chain control",
+ "category": [
+ "actions",
+ "bpf"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action bpf",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC action add action bpf bytecode '1,6 0 0 4294967295' pass index 90"
+ ],
+ "cmdUnderTest": "$TC action replace action bpf bytecode '1,6 0 0 4294967295' goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC action list action bpf",
+ "matchPattern": "action order [0-9]*: bpf.* default-action pass.*index 90",
+ "matchCount": "1",
+ "teardown": [
+ "$TC action flush action bpf"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json
new file mode 100644
index 000000000..cadde8f41
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/connmark.json
@@ -0,0 +1,316 @@
+[
+ {
+ "id": "2002",
+ "name": "Add valid connmark action with defaults",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action connmark",
+ "matchPattern": "action order [0-9]+: connmark zone 0 pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "56a5",
+ "name": "Add valid connmark action with control pass",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 1",
+ "matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "7c66",
+ "name": "Add valid connmark action with control drop",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark drop index 100",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 100",
+ "matchPattern": "action order [0-9]+: connmark zone 0 drop.*index 100 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "a913",
+ "name": "Add valid connmark action with control pipe",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark pipe index 455",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 455",
+ "matchPattern": "action order [0-9]+: connmark zone 0 pipe.*index 455 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "bdd8",
+ "name": "Add valid connmark action with control reclassify",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark reclassify index 7",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action connmark",
+ "matchPattern": "action order [0-9]+: connmark zone 0 reclassify.*index 7 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "b8be",
+ "name": "Add valid connmark action with control continue",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark continue index 17",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action connmark",
+ "matchPattern": "action order [0-9]+: connmark zone 0 continue.*index 17 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "d8a6",
+ "name": "Add valid connmark action with control jump",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark jump 10 index 17",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action connmark",
+ "matchPattern": "action order [0-9]+: connmark zone 0 jump 10.*index 17 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "aae8",
+ "name": "Add valid connmark action with zone argument",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark zone 100 pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 1",
+ "matchPattern": "action order [0-9]+: connmark zone 100 pipe.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "2f0b",
+ "name": "Add valid connmark action with invalid zone argument",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark zone 65536 reclassify index 21",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action connmark index 1",
+ "matchPattern": "action order [0-9]+: connmark zone 65536 reclassify.*index 21 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "9305",
+ "name": "Add connmark action with unsupported argument",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark zone 655 unsupp_arg pass index 2",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action connmark index 2",
+ "matchPattern": "action order [0-9]+: connmark zone 655 unsupp_arg pass.*index 2 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "71ca",
+ "name": "Add valid connmark action and replace it",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action connmark zone 777 pass index 555"
+ ],
+ "cmdUnderTest": "$TC actions replace action connmark zone 555 reclassify index 555",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 555",
+ "matchPattern": "action order [0-9]+: connmark zone 555 reclassify.*index 555 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "5f8f",
+ "name": "Add valid connmark action with cookie",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action connmark zone 555 pipe index 5 cookie aabbccddeeff112233445566778800a1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action connmark index 5",
+ "matchPattern": "action order [0-9]+: connmark zone 555 pipe.*index 5 ref.*cookie aabbccddeeff112233445566778800a1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ },
+ {
+ "id": "c506",
+ "name": "Replace connmark with invalid goto chain control",
+ "category": [
+ "actions",
+ "connmark"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action connmark",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action connmark pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action connmark goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action connmark index 90",
+ "matchPattern": "action order [0-9]+: connmark zone 0 pass.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action connmark"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json
new file mode 100644
index 000000000..072febf25
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/csum.json
@@ -0,0 +1,553 @@
+[
+ {
+ "id": "6d84",
+ "name": "Add csum iph action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum iph index 800",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 800",
+ "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*index 800 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "1862",
+ "name": "Add csum ip4h action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum ip4h index 7",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 7",
+ "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*index 7 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "15c6",
+ "name": "Add csum ipv4h action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum ipv4h index 1122",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 1122",
+ "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*index 1122 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "bf47",
+ "name": "Add csum icmp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum icmp index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 1",
+ "matchPattern": "action order [0-9]*: csum \\(icmp\\) action pass.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "cc1d",
+ "name": "Add csum igmp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum igmp index 999",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 999",
+ "matchPattern": "action order [0-9]*: csum \\(igmp\\) action pass.*index 999 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "bccc",
+ "name": "Add csum foobar action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum foobar index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action csum",
+ "matchPattern": "action order [0-9]*: csum \\(foobar\\) action pass.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "3bb4",
+ "name": "Add csum tcp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum tcp index 9999",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 9999",
+ "matchPattern": "action order [0-9]*: csum \\(tcp\\) action pass.*index 9999 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "759c",
+ "name": "Add csum udp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum udp index 334455",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 334455",
+ "matchPattern": "action order [0-9]*: csum \\(udp\\) action pass.*index 334455 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "bdb6",
+ "name": "Add csum udp xor iph action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum udp xor iph index 3",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action csum",
+ "matchPattern": "action order [0-9]*: csum \\(udp xor iph\\) action pass.*index 3 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "c220",
+ "name": "Add csum udplite action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum udplite continue index 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 3",
+ "matchPattern": "action order [0-9]*: csum \\(udplite\\) action continue.*index 3 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "8993",
+ "name": "Add csum sctp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum sctp index 777",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 777",
+ "matchPattern": "action order [0-9]*: csum \\(sctp\\) action pass.*index 777 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "b138",
+ "name": "Add csum ip & icmp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum ip and icmp pipe index 123",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 123",
+ "matchPattern": "action order [0-9]*: csum \\(iph, icmp\\) action pipe.*index 123 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "eeda",
+ "name": "Add csum ip & sctp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum ipv4h sctp continue index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 2",
+ "matchPattern": "action order [0-9]*: csum \\(iph, sctp\\) action continue.*index 2 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "0017",
+ "name": "Add csum udp or tcp action",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum udp or tcp continue index 27",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 27",
+ "matchPattern": "action order [0-9]*: csum \\(tcp, udp\\) action continue.*index 27 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "b10b",
+ "name": "Add all 7 csum actions",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum icmp ip4h sctp igmp udplite udp tcp index 7",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 7",
+ "matchPattern": "action order [0-9]*: csum \\(iph, icmp, igmp, tcp, udp, udplite, sctp\\).*index 7 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "ce92",
+ "name": "Add csum udp action with cookie",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum udp pipe index 7 cookie 12345678",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 7",
+ "matchPattern": "action order [0-9]*: csum \\(udp\\) action pipe.*index 7.*cookie 12345678",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "912f",
+ "name": "Add csum icmp action with large cookie",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum icmp pipe index 17 cookie aabbccddeeff1122",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action csum index 17",
+ "matchPattern": "action order [0-9]*: csum \\(icmp\\) action pipe.*index 17.*cookie aabbccddeeff1122",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "879b",
+ "name": "Add batch of 32 csum tcp actions",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action csum",
+ "matchPattern": "^[ \t]+index [0-9]* ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "b4e9",
+ "name": "Delete batch of 32 csum actions",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action csum",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "0015",
+ "name": "Add batch of 32 csum tcp actions with large cookies",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie 123456789abcde \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action csum",
+ "matchPattern": "^[ \t]+index [0-9]* ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "989e",
+ "name": "Delete batch of 32 csum actions with large cookies",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum tcp continue index \\$i cookie 123456789abcde \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action csum index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action csum",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d128",
+ "name": "Replace csum action with invalid goto chain control",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action csum iph index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action csum iph goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action csum index 90",
+ "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ },
+ {
+ "id": "eaf0",
+ "name": "Add csum iph action with no_percpu flag",
+ "category": [
+ "actions",
+ "csum"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action csum",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action csum iph no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action csum",
+ "matchPattern": "action order [0-9]*: csum \\(iph\\) action pass.*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action csum"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json
new file mode 100644
index 000000000..4202e95e2
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ct.json
@@ -0,0 +1,410 @@
+[
+ {
+ "id": "696a",
+ "name": "Add simple ct action",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 0 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "e38c",
+ "name": "Add simple ct action with cookie",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct index 42 cookie deadbeef",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 0 pipe.*index 42 ref.*cookie deadbeef",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "9f20",
+ "name": "Add ct clear action",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct clear index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct clear pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "0bc1",
+ "name": "Add ct clear action with cookie of max length",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct clear index 42 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct clear pipe.*index 42 ref.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "5bea",
+ "name": "Try ct with zone",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct zone 404 index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 404 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "d5d6",
+ "name": "Try ct with zone, commit",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct zone 404 commit index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit zone 404 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "029f",
+ "name": "Try ct with zone, commit, mark",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct zone 404 commit mark 0x42 index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit mark 66 zone 404 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "a58d",
+ "name": "Try ct with zone, commit, mark, nat",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct zone 404 commit mark 0x42 nat src addr 5.5.5.7 index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit mark 66 zone 404 nat src addr 5.5.5.7 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "901b",
+ "name": "Try ct with full nat ipv4 range syntax",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct commit nat src addr 5.5.5.7-5.5.6.0 port 1000-2000 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit zone 0 nat src addr 5.5.5.7-5.5.6.0 port 1000-2000 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "072b",
+ "name": "Try ct with full nat ipv6 syntax",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct commit nat src addr 2001::1 port 1000-2000 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit zone 0 nat src addr 2001::1 port 1000-2000 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "3420",
+ "name": "Try ct with full nat ipv6 range syntax",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct commit nat src addr 2001::1-2001::10 port 1000-2000 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit zone 0 nat src addr 2001::1-2001::10 port 1000-2000 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "4470",
+ "name": "Try ct with full nat ipv6 range syntax + force",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct commit force nat src addr 2001::1-2001::10 port 1000-2000 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct commit force zone 0 nat src addr 2001::1-2001::10 port 1000-2000 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "5d88",
+ "name": "Try ct with label",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct label 123123 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 0 label 12312300000000000000000000000000 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "04d4",
+ "name": "Try ct with label with mask",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct label 12312300000000000000000000000001/ffffffff000000000000000000000001 index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 0 label 12312300000000000000000000000001/ffffffff000000000000000000000001 pipe.*index 44 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "9751",
+ "name": "Try ct with mark + mask",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct mark 0x42/0xf0 index 42",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct mark 66/0xf0 zone 0 pipe.*index 42 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "2faa",
+ "name": "Try ct with mark + mask and cookie",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct mark 0x42/0xf0 index 42 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct mark 66/0xf0 zone 0 pipe.*index 42 ref.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ },
+ {
+ "id": "3991",
+ "name": "Add simple ct action with no_percpu flag",
+ "category": [
+ "actions",
+ "ct"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ct",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ct no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action ct",
+ "matchPattern": "action order [0-9]*: ct zone 0 pipe.*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ct"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json
new file mode 100644
index 000000000..b24494c6f
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/gact.json
@@ -0,0 +1,613 @@
+[
+ {
+ "id": "e89a",
+ "name": "Add valid pass action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pass index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pass.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "a02c",
+ "name": "Add valid pipe action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pipe index 6",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pipe.*index 6 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "feef",
+ "name": "Add valid reclassify action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action reclassify index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action reclassify.*index 5 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "8a7a",
+ "name": "Add valid drop action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action drop index 30",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action drop.*index 30 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "9a52",
+ "name": "Add valid continue action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action continue index 432",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action continue.*index 432 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "d700",
+ "name": "Add invalid action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pump index 386",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action.*index 386 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "9215",
+ "name": "Add action with duplicate index",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action pipe index 15"
+ ],
+ "cmdUnderTest": "$TC actions add action drop index 15",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action drop.*index 15 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "798e",
+ "name": "Add action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action drop index 4294967296",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action drop.*index 4294967296 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "22be",
+ "name": "Add action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action drop index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action drop.*index 4294967295 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "ac2a",
+ "name": "List actions",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action reclassify index 101",
+ "$TC actions add action reclassify index 102",
+ "$TC actions add action reclassify index 103",
+ "$TC actions add action reclassify index 104",
+ "$TC actions add action reclassify index 105"
+ ],
+ "cmdUnderTest": "$TC actions list action gact",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action reclassify",
+ "matchCount": "5",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "3edf",
+ "name": "Flush gact actions",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ "$TC actions add action reclassify index 101",
+ "$TC actions add action reclassify index 102",
+ "$TC actions add action reclassify index 103",
+ "$TC actions add action reclassify index 104",
+ "$TC actions add action reclassify index 105"
+ ],
+ "cmdUnderTest": "$TC actions flush action gact",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action reclassify",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "63ec",
+ "name": "Delete pass action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action pass index 1"
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pass.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "46be",
+ "name": "Delete pipe action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action pipe index 9"
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pipe.*index 9 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "2e08",
+ "name": "Delete reclassify action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action reclassify index 65536"
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 65536",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action reclassify.*index 65536 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "99c4",
+ "name": "Delete drop action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action drop index 16"
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action drop.*index 16 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "fb6b",
+ "name": "Delete continue action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action continue index 32"
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 32",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action continue.*index 32 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "0eb3",
+ "name": "Delete non-existent action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions del action gact index 2",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "f02c",
+ "name": "Replace gact action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action drop index 10",
+ "$TC actions add action drop index 12"
+ ],
+ "cmdUnderTest": "$TC actions replace action ok index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action gact",
+ "matchPattern": "action order [0-9]*: gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "525f",
+ "name": "Get gact action by index",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action drop index 3900800700"
+ ],
+ "cmdUnderTest": "$TC actions get action gact index 3900800700",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action gact index 3900800700",
+ "matchPattern": "index 3900800700",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "1021",
+ "name": "Add batch of 32 gact pass actions",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action pass index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "da7a",
+ "name": "Add batch of 32 gact continue actions with cookie",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action continue index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "8aa3",
+ "name": "Delete batch of 32 gact continue actions",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action continue index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action gact index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "8e47",
+ "name": "Add gact action with random determ goto chain control action",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pass random determ goto chain 1 2 index 90",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pass random type determ goto chain 1 val 2.*index 90 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "ca89",
+ "name": "Replace gact action with invalid goto chain control",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action pass random determ drop 2 index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action goto chain 42 random determ drop 5 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pass.*random type determ drop val 2.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "95ad",
+ "name": "Add gact pass action with no_percpu flag",
+ "category": [
+ "actions",
+ "gact"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action gact",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pass no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order [0-9]*: gact action pass.*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action gact"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
new file mode 100644
index 000000000..c13a68b98
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/ife.json
@@ -0,0 +1,1089 @@
+[
+ {
+ "id": "7682",
+ "name": "Create valid ife encode action with mark and pass control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow mark.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "ef47",
+ "name": "Create valid ife encode action with mark and pipe control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 10 pipe index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use mark.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "df43",
+ "name": "Create valid ife encode action with mark and continue control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark continue index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action continue.*type 0[xX]ED3E.*allow mark.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "e4cf",
+ "name": "Create valid ife encode action with mark and drop control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 789 drop index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action drop.*type 0[xX]ED3E.*use mark 789.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "ccba",
+ "name": "Create valid ife encode action with mark and reclassify control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 656768 reclassify index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*use mark 656768.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "a1cf",
+ "name": "Create valid ife encode action with mark and jump control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 65 jump 1 index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 2",
+ "matchPattern": "action order [0-9]*: ife encode action jump 1.*type 0[xX]ED3E.*use mark 65.*index 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "cb3d",
+ "name": "Create valid ife encode action with mark value at 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 4294967295 reclassify index 90",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 90",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*use mark 4294967295.*index 90",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "1efb",
+ "name": "Create ife encode action with mark value exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 4294967295999 pipe index 90",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 90",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use mark 4294967295999.*index 90",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "95ed",
+ "name": "Create valid ife encode action with prio and pass control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio pass index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow prio.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "aa17",
+ "name": "Create valid ife encode action with prio and pipe control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 7 pipe index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use prio 7.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "74c7",
+ "name": "Create valid ife encode action with prio and continue control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 3 continue index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action continue.*type 0[xX]ED3E.*use prio 3.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "7a97",
+ "name": "Create valid ife encode action with prio and drop control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio drop index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action drop.*type 0[xX]ED3E.*allow prio.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "f66b",
+ "name": "Create valid ife encode action with prio and reclassify control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 998877 reclassify index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*use prio 998877.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "3056",
+ "name": "Create valid ife encode action with prio and jump control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 998877 jump 10 index 9",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 9",
+ "matchPattern": "action order [0-9]*: ife encode action jump 10.*type 0[xX]ED3E.*use prio 998877.*index 9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "7dd3",
+ "name": "Create valid ife encode action with prio value at 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 4294967295 reclassify index 99",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 99",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*use prio 4294967295.*index 99",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "2ca1",
+ "name": "Create ife encode action with prio value exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 4294967298 pipe index 99",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 99",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use prio 4294967298.*index 99",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "05bb",
+ "name": "Create valid ife encode action with tcindex and pass control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow tcindex pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow tcindex.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "ce65",
+ "name": "Create valid ife encode action with tcindex and pipe control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 111 pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use tcindex 111.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "09cd",
+ "name": "Create valid ife encode action with tcindex and continue control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 1 continue index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action continue.*type 0[xX]ED3E.*use tcindex 1.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "8eb5",
+ "name": "Create valid ife encode action with tcindex and continue control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 1 continue index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action continue.*type 0[xX]ED3E.*use tcindex 1.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "451a",
+ "name": "Create valid ife encode action with tcindex and drop control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow tcindex drop index 77",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 77",
+ "matchPattern": "action order [0-9]*: ife encode action drop.*type 0[xX]ED3E.*allow tcindex.*index 77",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "d76c",
+ "name": "Create valid ife encode action with tcindex and reclassify control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow tcindex reclassify index 77",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 77",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*allow tcindex.*index 77",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "e731",
+ "name": "Create valid ife encode action with tcindex and jump control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow tcindex jump 999 index 77",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 77",
+ "matchPattern": "action order [0-9]*: ife encode action jump 999.*type 0[xX]ED3E.*allow tcindex.*index 77",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "b7b8",
+ "name": "Create valid ife encode action with tcindex value at 16-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 65535 pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*use tcindex 65535.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "d0d8",
+ "name": "Create ife encode action with tcindex value exceeding 16-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 65539 pipe index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*use tcindex 65539.*index 1",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "2a9c",
+ "name": "Create valid ife encode action with mac src parameter",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark src 00:11:22:33:44:55 pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*allow mark src 00:11:22:33:44:55.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "cf5c",
+ "name": "Create valid ife encode action with mac dst parameter",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 9876 dst 00:11:22:33:44:55 reclassify index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ED3E.*use prio 9876 dst 00:11:22:33:44:55.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "2353",
+ "name": "Create valid ife encode action with mac src and mac dst parameters",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow tcindex src 00:aa:bb:cc:dd:ee dst 00:11:22:33:44:55 pass index 11",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 11",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow tcindex dst 00:11:22:33:44:55 src 00:aa:bb:cc:dd:ee .*index 11",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "552c",
+ "name": "Create valid ife encode action with mark and type parameters",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use mark 7 type 0xfefe pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]FEFE.*use mark 7.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "0421",
+ "name": "Create valid ife encode action with prio and type parameters",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use prio 444 type 0xabba pipe index 21",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 21",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ABBA.*use prio 444.*index 21",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "4017",
+ "name": "Create valid ife encode action with tcindex and type parameters",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode use tcindex 5000 type 0xabcd reclassify index 21",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 21",
+ "matchPattern": "action order [0-9]*: ife encode action reclassify.*type 0[xX]ABCD.*use tcindex 5000.*index 21",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "fac3",
+ "name": "Create valid ife encode action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 4294967295",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow mark.*index 4294967295",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "7c25",
+ "name": "Create valid ife decode action with pass control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action pass.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "dccb",
+ "name": "Create valid ife decode action with pipe control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action pipe.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "7bb9",
+ "name": "Create valid ife decode action with continue control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode continue index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action continue.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "d9ad",
+ "name": "Create valid ife decode action with drop control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode drop index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action drop.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "219f",
+ "name": "Create valid ife decode action with reclassify control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode reclassify index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action reclassify.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "8f44",
+ "name": "Create valid ife decode action with jump control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife decode jump 10 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 1",
+ "matchPattern": "action order [0-9]*: ife decode action jump 10.*type 0(x0)?.*allow mark allow tcindex allow prio.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "56cf",
+ "name": "Create ife encode action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark pass index 4294967295999",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4294967295999",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E.*allow mark.*index 4294967295999",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ee94",
+ "name": "Create ife encode action with invalid control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow mark kuka index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action kuka.*type 0[xX]ED3E.*allow mark.*index 4",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "b330",
+ "name": "Create ife encode action with cookie",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio pipe index 4 cookie aabbccddeeff112233445566778800a1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*allow prio.*index 4.*cookie aabbccddeeff112233445566778800a1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ },
+ {
+ "id": "bbc0",
+ "name": "Create ife encode action with invalid argument",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow foo pipe index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]ED3E.*allow foo.*index 4",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d54a",
+ "name": "Create ife encode action with invalid type argument",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio type 70000 pipe index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*type 0[xX]11170.*allow prio.*index 4",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "7ee0",
+ "name": "Create ife encode action with invalid mac src argument",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio src 00:11:22:33:44:pp pipe index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*allow prio.*index 4",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "0a7d",
+ "name": "Create ife encode action with invalid mac dst argument",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action ife encode allow prio dst 00.111-22:33:44:aa pipe index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 4",
+ "matchPattern": "action order [0-9]*: ife encode action pipe.*allow prio.*index 4",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "a0e2",
+ "name": "Replace ife encode action with invalid goto chain control",
+ "category": [
+ "actions",
+ "ife"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action ife",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action ife encode allow mark pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action ife encode allow mark goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action ife index 90",
+ "matchPattern": "action order [0-9]*: ife encode action pass.*type 0[xX]ED3E .*allow mark.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action ife"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
new file mode 100644
index 000000000..12a2fe0e1
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mirred.json
@@ -0,0 +1,581 @@
+[
+ {
+ "id": "5124",
+ "name": "Add mirred mirror to egress action",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress mirror index 1 dev lo",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress Mirror to device lo\\).*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "6fb4",
+ "name": "Add mirred redirect to egress action",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress redirect index 2 dev lo action pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress Redirect to device lo\\).*index 2 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred",
+ "$TC actions flush action gact"
+ ]
+ },
+ {
+ "id": "ba38",
+ "name": "Get mirred actions",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mirred egress mirror index 1 dev lo",
+ "$TC actions add action mirred egress redirect index 2 dev lo"
+ ],
+ "cmdUnderTest": "$TC actions show action mirred",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "(Mirror|Redirect) to device lo",
+ "matchCount": "2",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "d7c0",
+ "name": "Add invalid mirred direction",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred inbound mirror index 20 dev lo",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(.*to device lo\\).*index 20 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "e213",
+ "name": "Add invalid mirred action",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress remirror index 20 dev lo",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress.*to device lo\\).*index 20 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "2d89",
+ "name": "Add mirred action with invalid device",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress mirror index 20 dev eltoh",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(.*to device eltoh\\).*index 20 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "300b",
+ "name": "Add mirred action with duplicate index",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mirred egress redirect index 15 dev lo"
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress mirror index 15 dev lo",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(.*to device lo\\).*index 15 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "8917",
+ "name": "Add mirred mirror action with control pass",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pass index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 1",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pass.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "1054",
+ "name": "Add mirred mirror action with control pipe",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 15",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 15",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 15 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "9887",
+ "name": "Add mirred mirror action with control continue",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo continue index 15",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 15",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) continue.*index 15 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "e4aa",
+ "name": "Add mirred mirror action with control reclassify",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo reclassify index 150",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 150",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) reclassify.*index 150 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "ece9",
+ "name": "Add mirred mirror action with control drop",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo drop index 99",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 99",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) drop.*index 99 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "0031",
+ "name": "Add mirred mirror action with control jump",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo jump 10 index 99",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 99",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) jump 10.*index 99 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "407c",
+ "name": "Add mirred mirror action with cookie",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo reclassify cookie aa11bb22cc33dd44ee55",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) reclassify.*cookie aa11bb22cc33dd44ee55",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "8b69",
+ "name": "Add mirred mirror action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mirred index 4294967295",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 4294967295",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "3f66",
+ "name": "Add mirred mirror action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred ingress mirror dev lo pipe index 429496729555",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action mirred index 429496729555",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) pipe.*index 429496729555",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "a70e",
+ "name": "Delete mirred mirror action",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mirred egress mirror index 5 dev lo"
+ ],
+ "cmdUnderTest": "$TC actions del action mirred index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress Mirror to device lo\\).*index 5 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "3fb3",
+ "name": "Delete mirred redirect action",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mirred egress redirect index 5 dev lo"
+ ],
+ "cmdUnderTest": "$TC actions del action mirred index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress Redirect to device lo\\).*index 5 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "2a9a",
+ "name": "Replace mirred action with invalid goto chain control",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mirred ingress mirror dev lo drop index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action mirred ingress mirror dev lo goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action mirred index 90",
+ "matchPattern": "action order [0-9]*: mirred \\(Ingress Mirror to device lo\\) drop.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "4749",
+ "name": "Add batch of 32 mirred redirect egress actions with cookie",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred egress redirect dev lo index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "5c69",
+ "name": "Delete batch of 32 mirred redirect egress actions",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred egress redirect dev lo index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d3c0",
+ "name": "Add batch of 32 mirred mirror ingress actions with cookie",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred ingress mirror dev lo index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ },
+ {
+ "id": "e684",
+ "name": "Delete batch of 32 mirred mirror ingress actions",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred ingress mirror dev lo index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action mirred index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "31e3",
+ "name": "Add mirred mirror to egress action with no_percpu flag",
+ "category": [
+ "actions",
+ "mirred"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mirred",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mirred egress mirror dev lo no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mirred",
+ "matchPattern": "action order [0-9]*: mirred \\(Egress Mirror to device lo\\).*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mirred"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/mpls.json b/tools/testing/selftests/tc-testing/tc-tests/actions/mpls.json
new file mode 100644
index 000000000..866f0efd0
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/mpls.json
@@ -0,0 +1,1233 @@
+[
+ {
+ "id": "a933",
+ "name": "Add mpls dec_ttl action with pipe opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl pipe index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*pipe.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "08d1",
+ "name": "Add mpls dec_ttl action with pass opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl pass index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 8",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*pass.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "d786",
+ "name": "Add mpls dec_ttl action with drop opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl drop index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 8",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*drop.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "f334",
+ "name": "Add mpls dec_ttl action with reclassify opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl reclassify index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 8",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*reclassify.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "29bd",
+ "name": "Add mpls dec_ttl action with continue opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl continue index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 8",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*continue.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "48df",
+ "name": "Add mpls dec_ttl action with jump opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl jump 10 index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*jump 10.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "62eb",
+ "name": "Add mpls dec_ttl action with trap opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl trap index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl trap.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "09d2",
+ "name": "Add mpls dec_ttl action with opcode and cookie",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl pipe index 8 cookie aabbccddeeff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl pipe.*index 8 ref.*cookie aabbccddeeff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "c170",
+ "name": "Add mpls dec_ttl action with opcode and cookie of max length",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl continue index 8 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl continue.*index 8 ref.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "9118",
+ "name": "Add mpls dec_ttl action with invalid opcode",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl foo index 8",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*foo.*index 8 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6ce1",
+ "name": "Add mpls dec_ttl action with label (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl label 20",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*label.*20.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "352f",
+ "name": "Add mpls dec_ttl action with tc (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl tc 3",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*tc.*3.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "fa1c",
+ "name": "Add mpls dec_ttl action with ttl (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl ttl 20",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*ttl.*20.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6b79",
+ "name": "Add mpls dec_ttl action with bos (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls dec_ttl bos 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*dec_ttl.*bos.*1.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d4c4",
+ "name": "Add mpls pop action with ip proto",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*protocol.*ip.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "91fb",
+ "name": "Add mpls pop action with ip proto and cookie",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4 cookie 12345678",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*protocol.*ip.*pipe.*ref 1.*cookie 12345678",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "92fe",
+ "name": "Add mpls pop action with mpls proto",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol mpls_mc",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*protocol.*mpls_mc.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "7e23",
+ "name": "Add mpls pop action with no protocol (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6182",
+ "name": "Add mpls pop action with label (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4 label 20",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*label.*20.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6475",
+ "name": "Add mpls pop action with tc (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4 tc 3",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*tc.*3.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "067b",
+ "name": "Add mpls pop action with ttl (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4 ttl 20",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*ttl.*20.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "7316",
+ "name": "Add mpls pop action with bos (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls pop protocol ipv4 bos 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*bos.*1.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "38cc",
+ "name": "Add mpls push action with label",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*ttl.*[0-9]+.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "c281",
+ "name": "Add mpls push action with mpls_mc protocol",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push protocol mpls_mc label 20",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_mc.*label.*20.*ttl.*[0-9]+.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "5db4",
+ "name": "Add mpls push action with label, tc and ttl",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20 tc 3 ttl 128",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*tc.*3.*ttl.*128.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "7c34",
+ "name": "Add mpls push action with label, tc ttl and cookie of max length",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20 tc 3 ttl 128 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*tc.*3.*ttl.*128.*pipe.*ref 1.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "16eb",
+ "name": "Add mpls push action with label and bos",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20 bos 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*bos.*1.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "d69d",
+ "name": "Add mpls push action with no label (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "e8e4",
+ "name": "Add mpls push action with ipv4 protocol (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push protocol ipv4 label 20",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*ttl.*[0-9]+.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ecd0",
+ "name": "Add mpls push action with out of range label (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 1048576",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*1048576.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d303",
+ "name": "Add mpls push action with out of range tc (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20 tc 8",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*tc.*8.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "fd6e",
+ "name": "Add mpls push action with ttl of 0 (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls push label 20 ttl 0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*20.*ttl.*0.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "19e9",
+ "name": "Add mpls mod action with mpls label",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod label 20",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*label.*20.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "1fde",
+ "name": "Add mpls mod action with max mpls label",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod label 0xfffff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*label.*1048575.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "0c50",
+ "name": "Add mpls mod action with mpls label exceeding max (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod label 0x100000",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*label.*1048576.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "10b6",
+ "name": "Add mpls mod action with mpls label of MPLS_LABEL_IMPLNULL (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod label 3",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*label.*3.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "57c9",
+ "name": "Add mpls mod action with mpls min tc",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod tc 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*tc.*0.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "6872",
+ "name": "Add mpls mod action with mpls max tc",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod tc 7",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*tc.*7.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "a70a",
+ "name": "Add mpls mod action with mpls tc exceeding max (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod tc 8",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*tc.*8.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6ed5",
+ "name": "Add mpls mod action with mpls ttl",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 128",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*128.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "77c1",
+ "name": "Add mpls mod action with mpls ttl and cookie",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 128 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*128.*pipe.*ref 1.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "b80f",
+ "name": "Add mpls mod action with mpls max ttl",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 255",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*255.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "8864",
+ "name": "Add mpls mod action with mpls min ttl",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*1.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "6c06",
+ "name": "Add mpls mod action with mpls ttl of 0 (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*0.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "b5d8",
+ "name": "Add mpls mod action with mpls ttl exceeding max (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod ttl 256",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*ttl.*256.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "451f",
+ "name": "Add mpls mod action with mpls max bos",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod bos 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*bos.*1.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "a1ed",
+ "name": "Add mpls mod action with mpls min bos",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod bos 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*bos.*0.*pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "3dcf",
+ "name": "Add mpls mod action with mpls bos exceeding max (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod bos 2",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*bos.*2.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "db7c",
+ "name": "Add mpls mod action with protocol (invalid)",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action mpls mod protocol ipv4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*modify.*protocol.*ip.*pipe",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "b070",
+ "name": "Replace existing mpls push action with new label",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mpls push label 20 pipe index 12"
+ ],
+ "cmdUnderTest": "$TC actions replace action mpls push label 30 pipe index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 12",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*30.*pipe.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "95a9",
+ "name": "Replace existing mpls push action with new label, tc, ttl and cookie",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mpls push label 20 tc 3 ttl 128 index 1 cookie aa11bb22cc33dd44ee55ff66aa11b1b2"
+ ],
+ "cmdUnderTest": "$TC actions replace action mpls push label 30 tc 2 ttl 125 pipe index 1 cookie aa11bb22cc33",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action mpls index 1",
+ "matchPattern": "action order [0-9]+: mpls.*push.*protocol.*mpls_uc.*label.*30 tc 2 ttl 125 pipe.*index 1.*cookie aa11bb22cc33",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action mpls"
+ ]
+ },
+ {
+ "id": "6cce",
+ "name": "Delete mpls pop action",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mpls pop protocol ipv4 index 44"
+ ],
+ "cmdUnderTest": "$TC actions del action mpls index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*pop.*index 44 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d138",
+ "name": "Flush mpls actions",
+ "category": [
+ "actions",
+ "mpls"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action mpls",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action mpls push label 10 index 10",
+ "$TC actions add action mpls push label 20 index 20",
+ "$TC actions add action mpls push label 30 index 30",
+ "$TC actions add action mpls push label 40 index 40"
+ ],
+ "cmdUnderTest": "$TC actions flush action mpls",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action mpls",
+ "matchPattern": "action order [0-9]+: mpls.*push.*",
+ "matchCount": "0",
+ "teardown": []
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json
new file mode 100644
index 000000000..bc12c1cca
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/nat.json
@@ -0,0 +1,618 @@
+[
+ {
+ "id": "7565",
+ "name": "Add nat action on ingress with default control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 192.168.1.1 200.200.200.1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat ingress 192.168.1.1/32 200.200.200.1 pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "fd79",
+ "name": "Add nat action on ingress with pipe control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.1.1.1 2.2.2.1 pipe index 77",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 77",
+ "matchPattern": "action order [0-9]+: nat ingress 1.1.1.1/32 2.2.2.1 pipe.*index 77 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "eab9",
+ "name": "Add nat action on ingress with continue control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 192.168.10.10 192.168.20.20 continue index 1000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 1000",
+ "matchPattern": "action order [0-9]+: nat ingress 192.168.10.10/32 192.168.20.20 continue.*index 1000 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "c53a",
+ "name": "Add nat action on ingress with reclassify control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 192.168.10.10 192.168.20.20 reclassify index 1000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 1000",
+ "matchPattern": "action order [0-9]+: nat ingress 192.168.10.10/32 192.168.20.20 reclassify.*index 1000 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "76c9",
+ "name": "Add nat action on ingress with jump control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 12.18.10.10 12.18.20.20 jump 10 index 22",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 22",
+ "matchPattern": "action order [0-9]+: nat ingress 12.18.10.10/32 12.18.20.20 jump 10.*index 22 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "24c6",
+ "name": "Add nat action on ingress with drop control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.18.1.1 1.18.2.2 drop index 722",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 722",
+ "matchPattern": "action order [0-9]+: nat ingress 1.18.1.1/32 1.18.2.2 drop.*index 722 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "2120",
+ "name": "Add nat action on ingress with maximum index value",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.18.1.1 1.18.2.2 index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 4294967295",
+ "matchPattern": "action order [0-9]+: nat ingress 1.18.1.1/32 1.18.2.2 pass.*index 4294967295 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "3e9d",
+ "name": "Add nat action on ingress with invalid index value",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.18.1.1 1.18.2.2 index 4294967295555",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action nat index 4294967295555",
+ "matchPattern": "action order [0-9]+: nat ingress 1.18.1.1/32 1.18.2.2 pass.*index 4294967295555 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "f6c9",
+ "name": "Add nat action on ingress with invalid IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.1.1.1 1.1888.2.2 index 7",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action nat index 7",
+ "matchPattern": "action order [0-9]+: nat ingress 1.1.1.1/32 1.1888.2.2 pass.*index 7 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "be25",
+ "name": "Add nat action on ingress with invalid argument",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 1.1.1.1 1.18.2.2 another_arg index 12",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action nat index 12",
+ "matchPattern": "action order [0-9]+: nat ingress 1.1.1.1/32 1.18.2.2 pass.*another_arg.*index 12 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "a7bd",
+ "name": "Add nat action on ingress with DEFAULT IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress default 10.10.10.1 index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 12",
+ "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "ee1e",
+ "name": "Add nat action on ingress with ANY IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress any 10.10.10.1 index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 12",
+ "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "1de8",
+ "name": "Add nat action on ingress with ALL IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress all 10.10.10.1 index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 12",
+ "matchPattern": "action order [0-9]+: nat ingress 0.0.0.0/32 10.10.10.1 pass.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "8dba",
+ "name": "Add nat action on egress with default control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "19a7",
+ "name": "Add nat action on egress with pipe control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1 pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "f1d9",
+ "name": "Add nat action on egress with continue control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1 continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 continue",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "6d4a",
+ "name": "Add nat action on egress with reclassify control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1 reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 reclassify",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "b313",
+ "name": "Add nat action on egress with jump control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1 jump 777",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 jump 777",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "d9fc",
+ "name": "Add nat action on egress with drop control action",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress 10.10.10.1 20.20.20.1 drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action nat",
+ "matchPattern": "action order [0-9]+: nat egress 10.10.10.1/32 20.20.20.1 drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "a895",
+ "name": "Add nat action on egress with DEFAULT IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress default 20.20.20.1 pipe index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 10",
+ "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "2572",
+ "name": "Add nat action on egress with ANY IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress any 20.20.20.1 pipe index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 10",
+ "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "37f3",
+ "name": "Add nat action on egress with ALL IP address",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 10",
+ "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "6054",
+ "name": "Add nat action on egress with cookie",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat egress all 20.20.20.1 pipe index 10 cookie aa1bc2d3eeff112233445566778800a1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 10",
+ "matchPattern": "action order [0-9]+: nat egress 0.0.0.0/32 20.20.20.1 pipe.*index 10 ref.*cookie aa1bc2d3eeff112233445566778800a1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "79d6",
+ "name": "Add nat action on ingress with cookie",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action nat ingress 192.168.1.1 10.10.10.1 reclassify index 1 cookie 112233445566778899aabbccddeeff11",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action nat index 1",
+ "matchPattern": "action order [0-9]+: nat ingress 192.168.1.1/32 10.10.10.1 reclassify.*index 1 ref.*cookie 112233445566778899aabbccddeeff11",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ },
+ {
+ "id": "4b12",
+ "name": "Replace nat action with invalid goto chain control",
+ "category": [
+ "actions",
+ "nat"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action nat",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action nat ingress 1.18.1.1 1.18.2.2 drop index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action nat ingress 1.18.1.1 1.18.2.2 goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action nat index 90",
+ "matchPattern": "action order [0-9]+: nat ingress 1.18.1.1/32 1.18.2.2 drop.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action nat"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json
new file mode 100644
index 000000000..72cdc3c80
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/pedit.json
@@ -0,0 +1,1726 @@
+[
+ {
+ "id": "319a",
+ "name": "Add pedit action that mangles IP TTL",
+ "category": [
+ "actions",
+ "pedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip ttl set 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*index 1 ref.*key #0 at ipv4\\+8: val 0a000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "7e67",
+ "name": "Replace pedit action with invalid goto chain",
+ "category": [
+ "actions",
+ "pedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action pedit ex munge ip ttl set 10 pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action pedit ex munge ip ttl set 10 goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*index 90 ref.*key #0 at ipv4\\+8: val 0a000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "377e",
+ "name": "Add pedit action with RAW_OP offset u32",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u32 set 0x90abcdef",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "12: val 90abcdef mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "a0ca",
+ "name": "Add pedit action with RAW_OP offset u32 (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 2 u32 set 0x12345678",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "dd8a",
+ "name": "Add pedit action with RAW_OP offset u16 u16",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u16 set 0x1234 munge offset 14 u16 set 0x5678",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "val 12340000 mask 0000ffff.*val 00005678 mask ffff0000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "53db",
+ "name": "Add pedit action with RAW_OP offset u16 (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 15 u16 set 0x1234",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "5c7e",
+ "name": "Add pedit action with RAW_OP offset u8 add value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge offset 16 u8 add 0xf",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 16: add 0f000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "2893",
+ "name": "Add pedit action with RAW_OP offset u8 quad",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u8 set 0x12 munge offset 13 u8 set 0x34 munge offset 14 u8 set 0x56 munge offset 15 u8 set 0x78",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "val 12000000 mask 00ffffff.*val 00340000 mask ff00ffff.*val 00005600 mask ffff00ff.*val 00000078 mask ffffff00",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "3a07",
+ "name": "Add pedit action with RAW_OP offset u8-u16-u8",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u8 set 0x12 munge offset 1 u16 set 0x3456 munge offset 3 u8 set 0x78",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "val 12000000 mask 00ffffff.*val 00345600 mask ff0000ff.*val 00000078 mask ffffff00",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "ab0f",
+ "name": "Add pedit action with RAW_OP offset u16-u8-u8",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u16 set 0x1234 munge offset 2 u8 set 0x56 munge offset 3 u8 set 0x78",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "val 12340000 mask 0000ffff.*val 00005600 mask ffff00ff.*val 00000078 mask ffffff00",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "9d12",
+ "name": "Add pedit action with RAW_OP offset u32 set u16 clear u8 invert",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u32 set 0x12345678 munge offset 1 u16 clear munge offset 2 u8 invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "val 12345678 mask 00000000.*val 00000000 mask ff0000ff.*val 0000ff00 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "ebfa",
+ "name": "Add pedit action with RAW_OP offset overflow u32 (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0xffffffffffffffffffffffffffffffffffffffffff u32 set 0x1",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "f512",
+ "name": "Add pedit action with RAW_OP offset u16 at offmask shift set",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u16 at 12 ffff 1 set 0xaaaa",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 12: val aaaa0000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "c2cb",
+ "name": "Add pedit action with RAW_OP offset u32 retain value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u32 set 0x12345678 retain 0xff00",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 12: val 00005600 mask ffff00ff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "1762",
+ "name": "Add pedit action with RAW_OP offset u8 clear value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u8 clear",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 00000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "bcee",
+ "name": "Add pedit action with RAW_OP offset u8 retain value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u8 set 0x11 retain 0x0f",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 01000000 mask f0ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "e89f",
+ "name": "Add pedit action with RAW_OP offset u16 retain value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u16 set 0x1122 retain 0xff00",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 11000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "c282",
+ "name": "Add pedit action with RAW_OP offset u32 clear value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u32 clear",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 00000000 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "c422",
+ "name": "Add pedit action with RAW_OP offset u16 invert value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u16 invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 12: val ffff0000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "d3d3",
+ "name": "Add pedit action with RAW_OP offset u32 invert value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 12 u32 invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 12: val ffffffff mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "57e5",
+ "name": "Add pedit action with RAW_OP offset u8 preserve value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u8 preserve",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 00000000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "99e0",
+ "name": "Add pedit action with RAW_OP offset u16 preserve value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u16 preserve",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 00000000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "1892",
+ "name": "Add pedit action with RAW_OP offset u32 preserve value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset 0 u32 preserve",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 1.*key #0.*at 0: val 00000000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "4b60",
+ "name": "Add pedit action with RAW_OP negative offset u16/u32 set value",
+ "category": [
+ "actions",
+ "pedit",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge offset -14 u16 set 0x0000 munge offset -12 u32 set 0x00000100 munge offset -8 u32 set 0x0aaf0100 munge offset -4 u32 set 0x0008eb06 pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+:.*pedit.*keys 4.*key #0.*at -16: val 00000000 mask ffff0000.*key #1.*at -12: val 00000100 mask 00000000.*key #2.*at -8: val 0aaf0100 mask 00000000.*key #3.*at -4: val 0008eb06 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "a5a7",
+ "name": "Add pedit action with LAYERED_OP eth set src",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src set 11:22:33:44:55:66",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 2.*key #0 at eth\\+4: val 00001122 mask ffff0000.*key #1 at eth\\+8: val 33445566 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "86d4",
+ "name": "Add pedit action with LAYERED_OP eth set src & dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src set 11:22:33:44:55:66 munge eth dst set ff:ee:dd:cc:bb:aa",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "eth\\+4: val 00001122 mask ffff0000.*eth\\+8: val 33445566 mask 00000000.*eth\\+0: val ffeeddcc mask 00000000.*eth\\+4: val bbaa0000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "f8a9",
+ "name": "Add pedit action with LAYERED_OP eth set dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth dst set 11:22:33:44:55:66",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 2.*key #0 at eth\\+0: val 11223344 mask 00000000.*key #1 at eth\\+4: val 55660000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "c715",
+ "name": "Add pedit action with LAYERED_OP eth set src (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src set %e:11:m2:33:x4:-5",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "8131",
+ "name": "Add pedit action with LAYERED_OP eth set dst (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth dst set %e:11:m2:33:x4:-5",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "ba22",
+ "name": "Add pedit action with LAYERED_OP eth type set/clear sequence",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth type set 0x1 munge eth type clear munge eth type set 0x1 munge eth type clear munge eth type set 0x1 munge eth type clear munge eth type set 0x1 munge eth type clear munge eth type set 0x1 munge eth type clear munge eth type set 0x1 munge eth type clear",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff.*eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff.*eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff.*eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff.*eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff.*eth\\+12: val 00010000 mask 0000ffff.*eth\\+12: val 00000000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "dec4",
+ "name": "Add pedit action with LAYERED_OP eth set type (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth type set 0xabcdef",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at eth\\+12: val ",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ab06",
+ "name": "Add pedit action with LAYERED_OP eth add type",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth type add 0x1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at eth\\+12: add 00010000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "918d",
+ "name": "Add pedit action with LAYERED_OP eth invert src",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 2.*key #0 at eth\\+4: val 0000ff00 mask ffff0000.*key #1 at eth\\+8: val 00000000 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "a8d4",
+ "name": "Add pedit action with LAYERED_OP eth invert dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth dst invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 2.*key #0 at eth\\+0: val ff000000 mask 00000000.*key #1 at eth\\+4: val 00000000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "ee13",
+ "name": "Add pedit action with LAYERED_OP eth invert type",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth type invert",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at eth\\+12: val ffff0000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "7588",
+ "name": "Add pedit action with LAYERED_OP ip set src",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip src set 1.1.1.1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at 12: val 01010101 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "0fa7",
+ "name": "Add pedit action with LAYERED_OP ip set dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip dst set 2.2.2.2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at 16: val 02020202 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "5810",
+ "name": "Add pedit action with LAYERED_OP ip set src & dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip src set 18.52.86.120 munge ip dst set 18.52.86.120",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 12: val 12345678 mask 00000000.* 16: val 12345678 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "1092",
+ "name": "Add pedit action with LAYERED_OP ip set ihl & dsfield",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ihl set 0xff munge ip dsfield set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 0: val 0f000000 mask f0ffffff.* 0: val 00ff0000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "02d8",
+ "name": "Add pedit action with LAYERED_OP ip set ttl & protocol",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ttl set 0x1 munge ip protocol set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 8: val 01000000 mask 00ffffff.* 8: val 00ff0000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "3e2d",
+ "name": "Add pedit action with LAYERED_OP ip set ttl (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ttl set 300",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "31ae",
+ "name": "Add pedit action with LAYERED_OP ip ttl clear/set",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ttl clear munge ip ttl set 0x1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 8: val 00000000 mask 00ffffff.* 8: val 01000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "486f",
+ "name": "Add pedit action with LAYERED_OP ip set duplicate fields",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ttl set 0x1 munge ip ttl set 0x1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 8: val 01000000 mask 00ffffff.* 8: val 01000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "e790",
+ "name": "Add pedit action with LAYERED_OP ip set ce, df, mf, firstfrag, nofrag fields",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip ce set 0xff munge ip df set 0xff munge ip mf set 0xff munge ip firstfrag set 0xff munge ip nofrag set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 4: val 00008000 mask ffff7fff.* 4: val 00004000 mask ffffbfff.* 4: val 00002000 mask ffffdfff.* 4: val 00001f00 mask ffffe0ff.* 4: val 00003f00 mask ffffc0ff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "cc8a",
+ "name": "Add pedit action with LAYERED_OP ip set tos",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip tos set 0x4 continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action continue keys 1.*key #0 at 0: val 00040000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "7a17",
+ "name": "Add pedit action with LAYERED_OP ip set precedence",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip precedence set 3 jump 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action jump 2 keys 1.*key #0 at 0: val 00030000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "c3b6",
+ "name": "Add pedit action with LAYERED_OP ip add tos",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip tos add 0x1 pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 1.*key #0 at ipv4\\+0: add 00010000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "43d3",
+ "name": "Add pedit action with LAYERED_OP ip add precedence",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip precedence add 0x1 pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pipe keys 1.*key #0 at ipv4\\+0: add 00010000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "438e",
+ "name": "Add pedit action with LAYERED_OP ip clear tos",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip tos clear continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action continue keys 1.*key #0 at 0: val 00000000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "6b1b",
+ "name": "Add pedit action with LAYERED_OP ip clear precedence",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip precedence clear jump 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action jump 2 keys 1.*key #0 at 0: val 00000000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "824a",
+ "name": "Add pedit action with LAYERED_OP ip invert tos",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip tos invert pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pipe keys 1.*key #0 at 0: val 00ff0000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "106f",
+ "name": "Add pedit action with LAYERED_OP ip invert precedence",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip precedence invert reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action reclassify keys 1.*key #0 at 0: val 00ff0000 mask ffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "6829",
+ "name": "Add pedit action with LAYERED_OP beyond ip set dport & sport",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip dport set 0x1234 munge ip sport set 0x5678",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 20: val 00001234 mask ffff0000.* 20: val 56780000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "afd8",
+ "name": "Add pedit action with LAYERED_OP beyond ip set icmp_type & icmp_code",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit munge ip icmp_type set 0xff munge ip icmp_code set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": " 20: val ff000000 mask 00ffffff.* 20: val ff000000 mask 00ffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "3143",
+ "name": "Add pedit action with LAYERED_OP beyond ip set dport (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip dport set 0x1234",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "815c",
+ "name": "Add pedit action with LAYERED_OP ip6 set src",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 src set 2001:0db8:0:f101::1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 4.*key #0 at ipv6\\+8: val 20010db8 mask 00000000.*key #1 at ipv6\\+12: val 0000f101 mask 00000000.*key #2 at ipv6\\+16: val 00000000 mask 00000000.*key #3 at ipv6\\+20: val 00000001 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "4dae",
+ "name": "Add pedit action with LAYERED_OP ip6 set dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 dst set 2001:0db8:0:f101::1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "action order [0-9]+: pedit action pass keys 4.*key #0 at ipv6\\+24: val 20010db8 mask 00000000.*key #1 at ipv6\\+28: val 0000f101 mask 00000000.*key #2 at ipv6\\+32: val 00000000 mask 00000000.*key #3 at ipv6\\+36: val 00000001 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "fc1f",
+ "name": "Add pedit action with LAYERED_OP ip6 set src & dst",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 src set 2001:0db8:0:f101::1 munge ip6 dst set 2001:0db8:0:f101::1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "ipv6\\+8: val 20010db8 mask 00000000.*ipv6\\+12: val 0000f101 mask 00000000.*ipv6\\+16: val 00000000 mask 00000000.*ipv6\\+20: val 00000001 mask 00000000.*ipv6\\+24: val 20010db8 mask 00000000.*ipv6\\+28: val 0000f101 mask 00000000.*ipv6\\+32: val 00000000 mask 00000000.*ipv6\\+36: val 00000001 mask 00000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "6d34",
+ "name": "Add pedit action with LAYERED_OP ip6 dst retain value (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 dst set 2001:0db8:0:f101::1 retain 0xff0000",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "94bb",
+ "name": "Add pedit action with LAYERED_OP ip6 traffic_class",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 traffic_class set 0x40 continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit",
+ "matchPattern": "ipv6\\+0: val 04000000 mask f00fffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "6f5e",
+ "name": "Add pedit action with LAYERED_OP ip6 flow_lbl",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 flow_lbl set 0xfffff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "ipv6\\+0: val 0007ffff mask fff80000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "6795",
+ "name": "Add pedit action with LAYERED_OP ip6 set payload_len, nexthdr, hoplimit",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge ip6 payload_len set 0xffff munge ip6 nexthdr set 0xff munge ip6 hoplimit set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "ipv6\\+4: val ffff0000 mask 0000ffff.*ipv6\\+4: val 0000ff00 mask ffff00ff.*ipv6\\+4: val 000000ff mask ffffff00",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "1442",
+ "name": "Add pedit action with LAYERED_OP tcp set dport & sport",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge tcp dport set 4789 munge tcp sport set 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "tcp\\+0: val 000012b5 mask ffff0000.*tcp\\+0: val 00010000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "b7ac",
+ "name": "Add pedit action with LAYERED_OP tcp sport set (INVALID)",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge tcp sport set -200",
+ "expExitCode": "255",
+ "verifyCmd": "/bin/true",
+ "matchPattern": " ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "cfcc",
+ "name": "Add pedit action with LAYERED_OP tcp flags set",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge tcp flags set 0x16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "tcp\\+12: val 00160000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "3bc4",
+ "name": "Add pedit action with LAYERED_OP tcp set dport, sport & flags fields",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge tcp dport set 4789 munge tcp sport set 1 munge tcp flags set 0x1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "tcp\\+0: val 000012b5 mask ffff0000.*tcp\\+0: val 00010000 mask 0000ffff.*tcp\\+12: val 00010000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "f1c8",
+ "name": "Add pedit action with LAYERED_OP udp set dport & sport",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge udp dport set 4789 munge udp sport set 4789",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "udp\\+0: val 000012b5 mask ffff0000.*udp\\+0: val 12b50000 mask 0000ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "d784",
+ "name": "Add pedit action with mixed RAW/LAYERED_OP #1",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src set 11:22:33:44:55:66 munge ip ttl set 0xff munge tcp flags clear munge offset 15 u8 add 40 retain 0xf0 munge udp dport add 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "eth\\+4: val 00001122 mask ffff0000.*eth\\+8: val 33445566 mask 00000000.*ipv4\\+8: val ff000000 mask 00ffffff.*tcp\\+12: val 00000000 mask ff00ffff.* 12: add 00000020 mask ffffff0f.*udp\\+0: add 00000001 mask ffff0000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ },
+ {
+ "id": "70ca",
+ "name": "Add pedit action with mixed RAW/LAYERED_OP #2",
+ "category": [
+ "actions",
+ "pedit",
+ "layered_op",
+ "raw_op"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action pedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action pedit ex munge eth src set 11:22:33:44:55:66 munge eth dst set ff:ee:dd:cc:bb:aa munge ip6 payload_len set 0xffff munge ip6 nexthdr set 0xff munge ip6 hoplimit preserve munge offset 0 u8 set 0x12 munge offset 1 u16 set 0x3456 munge offset 3 u8 set 0x78 munge ip ttl set 0xaa munge ip protocol set 0xff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action pedit | grep 'key '",
+ "matchPattern": "eth\\+4: val 00001122 mask ffff0000.*eth\\+8: val 33445566 mask 00000000.*eth\\+0: val ffeeddcc mask 00000000.*eth\\+4: val bbaa0000 mask 0000ffff.*ipv6\\+4: val ffff0000 mask 0000ffff.*ipv6\\+4: val 0000ff00 mask ffff00ff.*ipv6\\+4: val 00000000 mask ffffffff.* 0: val 12000000 mask 00ffffff.* 0: val 00345600 mask ff0000ff.* 0: val 00000078 mask ffffff00.*ipv4\\+8: val aa000000 mask 00ffffff.*ipv4\\+8: val 00ff0000 mask ff00ffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action pedit"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/police.json b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
new file mode 100644
index 000000000..b8268da5a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/police.json
@@ -0,0 +1,768 @@
+[
+ {
+ "id": "49aa",
+ "name": "Add valid basic police action",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 1kbit burst 10k index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 1Kbit burst 10Kb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "3abe",
+ "name": "Add police action with duplicate index",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 4Mbit burst 120k index 9"
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 8kbit burst 24k index 9",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x9",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "49fa",
+ "name": "Add valid police action with mtu",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 90kbit burst 10k mtu 1k index 98",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 98",
+ "matchPattern": "action order [0-9]*: police 0x62 rate 90Kbit burst 10Kb mtu 1Kb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "7943",
+ "name": "Add valid police action with peakrate",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 90kbit burst 10k mtu 2kb peakrate 100kbit index 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x3 rate 90Kbit burst 10Kb mtu 2Kb peakrate 100Kbit",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "055e",
+ "name": "Add police action with peakrate and no mtu",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 5kbit burst 6kb peakrate 10kbit index 9",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x9 rate 5Kbit burst 6Kb",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "f057",
+ "name": "Add police action with valid overhead",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 1mbit burst 100k overhead 64 index 64",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 64",
+ "matchPattern": "action order [0-9]*: police 0x40 rate 1Mbit burst 100Kb mtu 2Kb action reclassify overhead 64b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "7ffb",
+ "name": "Add police action with ethernet linklayer type",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 2mbit burst 200k linklayer ethernet index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions show action police",
+ "matchPattern": "action order [0-9]*: police 0x8 rate 2Mbit burst 200Kb mtu 2Kb action reclassify overhead 0b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "3dda",
+ "name": "Add police action with atm linklayer type",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 2mbit burst 200k linklayer atm index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions show action police",
+ "matchPattern": "action order [0-9]*: police 0x8 rate 2Mbit burst 200Kb mtu 2Kb action reclassify overhead 0b linklayer atm",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "551b",
+ "name": "Add police actions with conform-exceed control continue/drop",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed continue/drop index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action continue/drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "0c70",
+ "name": "Add police actions with conform-exceed control pass/reclassify",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed pass/reclassify index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x4 rate 3Mbit burst 250Kb mtu 2Kb action pass/reclassify",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "d946",
+ "name": "Add police actions with conform-exceed control pass/pipe",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed pass/pipe index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x5 rate 3Mbit burst 250Kb mtu 2Kb action pass/pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "ddd6",
+ "name": "Add police action with invalid rate value",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3tb burst 250k conform-exceed pass/pipe index 5",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x5 rate 3Tb burst 250Kb mtu 2Kb action pass/pipe",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "f61c",
+ "name": "Add police action with invalid burst value",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3kbit burst 250P conform-exceed pass/pipe index 5",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x5 rate 3Kbit burst 250Pb mtu 2Kb action pass/pipe",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "6aaf",
+ "name": "Add police actions with conform-exceed control pass/pipe [with numeric values]",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 0/3 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action pass/pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "29b1",
+ "name": "Add police actions with conform-exceed control <invalid>/drop",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 3mbit burst 250k conform-exceed 10/drop index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 3Mbit burst 250Kb mtu 2Kb action ",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "c26f",
+ "name": "Add police action with invalid peakrate value",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 90kbit burst 10k mtu 2kb peakrate 100T index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 90Kbit burst 10Kb mtu 2Kb peakrate 100Tbit",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "db04",
+ "name": "Add police action with invalid mtu value",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 10kbit burst 10k mtu 2Pbit index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 10Kbit burst 10Kb mtu 2Pb",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "f3c9",
+ "name": "Add police action with cookie",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 10mbit burst 10k index 1 cookie a1b1c1d1e1f12233bb",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 10Mbit burst 10Kb mtu 2Kb.*cookie a1b1c1d1e1f12233bb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "d190",
+ "name": "Add police action with maximum index",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 10mbit burst 10k index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 4294967295",
+ "matchPattern": "action order [0-9]*: police 0xffffffff rate 10Mbit burst 10Kb mtu 2Kb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "336e",
+ "name": "Delete police action",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 5mbit burst 2m index 12"
+ ],
+ "cmdUnderTest": "$TC actions delete action police index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0xc rate 5Mbit burst 2Mb",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "77fa",
+ "name": "Get single police action from many actions",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 1mbit burst 100k index 1",
+ "$TC actions add action police rate 2mbit burst 200k index 2",
+ "$TC actions add action police rate 3mbit burst 300k index 3",
+ "$TC actions add action police rate 4mbit burst 400k index 4",
+ "$TC actions add action police rate 5mbit burst 500k index 5",
+ "$TC actions add action police rate 6mbit burst 600k index 6",
+ "$TC actions add action police rate 7mbit burst 700k index 7",
+ "$TC actions add action police rate 8mbit burst 800k index 8"
+ ],
+ "cmdUnderTest": "$TC actions get action police index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 4",
+ "matchPattern": "action order [0-9]*: police 0x4 rate 4Mbit burst 400Kb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "aa43",
+ "name": "Get single police action without specifying index",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 1mbit burst 100k index 1"
+ ],
+ "cmdUnderTest": "$TC actions get action police",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action police",
+ "matchPattern": "action order [0-9]*: police",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "858b",
+ "name": "List police actions",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 1mbit burst 100k index 1",
+ "$TC actions add action police rate 2mbit burst 200k index 2",
+ "$TC actions add action police rate 3mbit burst 300k index 3",
+ "$TC actions add action police rate 4mbit burst 400k index 4",
+ "$TC actions add action police rate 5mbit burst 500k index 5",
+ "$TC actions add action police rate 6mbit burst 600k index 6",
+ "$TC actions add action police rate 7mbit burst 700k index 7",
+ "$TC actions add action police rate 8mbit burst 800k index 8"
+ ],
+ "cmdUnderTest": "$TC actions list action police",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x[1-8] rate [1-8]Mbit burst [1-8]00Kb",
+ "matchCount": "8",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "1c3a",
+ "name": "Flush police actions",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ "$TC actions add action police rate 1mbit burst 100k index 1",
+ "$TC actions add action police rate 2mbit burst 200k index 2",
+ "$TC actions add action police rate 3mbit burst 300k index 3",
+ "$TC actions add action police rate 4mbit burst 400k index 4",
+ "$TC actions add action police rate 5mbit burst 500k index 5",
+ "$TC actions add action police rate 6mbit burst 600k index 6",
+ "$TC actions add action police rate 7mbit burst 700k index 7",
+ "$TC actions add action police rate 8mbit burst 800k index 8"
+ ],
+ "cmdUnderTest": "$TC actions flush action police",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police",
+ "matchCount": "0",
+ "teardown": [
+ ""
+ ]
+ },
+ {
+ "id": "7326",
+ "name": "Add police action with control continue",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m continue index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action continue",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "34fa",
+ "name": "Add police action with control drop",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m drop index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "8dd5",
+ "name": "Add police action with control ok",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m ok index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "b9d1",
+ "name": "Add police action with control reclassify",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m reclassify index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action police index 1",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action reclassify",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "c534",
+ "name": "Add police action with control pipe",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 7mbit burst 1m pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 7Mbit burst 1024Kb mtu 2Kb action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "b48b",
+ "name": "Add police action with exceed goto chain control action",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action police rate 1mbit burst 1k conform-exceed pass / goto chain 42",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action police",
+ "matchPattern": "action order [0-9]*: police 0x1 rate 1Mbit burst 1Kb mtu 2Kb action pass/goto chain 42",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ },
+ {
+ "id": "689e",
+ "name": "Replace police action with invalid goto chain control",
+ "category": [
+ "actions",
+ "police"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action police",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action police rate 3mbit burst 250k drop index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action police rate 3mbit burst 250k goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action police index 90",
+ "matchPattern": "action order [0-9]*: police 0x5a rate 3Mbit burst 250Kb mtu 2Kb action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action police"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json
new file mode 100644
index 000000000..ddabb160a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/sample.json
@@ -0,0 +1,637 @@
+[
+ {
+ "id": "9784",
+ "name": "Add valid sample action with mandatory arguments",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 10 group 1 index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 2",
+ "matchPattern": "action order [0-9]+: sample rate 1/10 group 1.*index 2 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "5c91",
+ "name": "Add valid sample action with mandatory arguments and continue control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 700 group 2 continue index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 2",
+ "matchPattern": "action order [0-9]+: sample rate 1/700 group 2 continue.*index 2 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "334b",
+ "name": "Add valid sample action with mandatory arguments and drop control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 10000 group 11 drop index 22",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/10000 group 11 drop.*index 22 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "da69",
+ "name": "Add valid sample action with mandatory arguments and reclassify control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 20000 group 72 reclassify index 100",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/20000 group 72 reclassify.*index 100 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "13ce",
+ "name": "Add valid sample action with mandatory arguments and pipe control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 20 group 2 pipe index 100",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/20 group 2 pipe.*index 100 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "1886",
+ "name": "Add valid sample action with mandatory arguments and jump control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 700 group 25 jump 4 index 200",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 200",
+ "matchPattern": "action order [0-9]+: sample rate 1/700 group 25 jump 4.*index 200 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "7571",
+ "name": "Add sample action with invalid rate",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 0 group 1 index 2",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 2",
+ "matchPattern": "action order [0-9]+: sample rate 1/0 group 1.*index 2 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "b6d4",
+ "name": "Add sample action with mandatory arguments and invalid control action",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 200000 group 52 foo index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/200000 group 52 foo.*index 1 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "a874",
+ "name": "Add invalid sample action without mandatory arguments",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample.*index 1 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ac01",
+ "name": "Add invalid sample action without mandatory argument rate",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample group 10 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample.*group 10.*index 1 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "4203",
+ "name": "Add invalid sample action without mandatory argument group",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 100 index 10",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/100.*index 10 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "14a7",
+ "name": "Add invalid sample action without mandatory argument group",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 100 index 10",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/100.*index 10 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "8f2e",
+ "name": "Add valid sample action with trunc argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 trunc 1024 index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 trunc_size 1024 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "45f8",
+ "name": "Add sample action with maximum rate argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 4294967295 group 4 index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/4294967295 group 4 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "ad0c",
+ "name": "Add sample action with maximum trunc argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 16000 group 4 trunc 4294967295 index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/16000 group 4 trunc_size 4294967295 pipe.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "83a9",
+ "name": "Add sample action with maximum group argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 4294 group 4294967295 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 1",
+ "matchPattern": "action order [0-9]+: sample rate 1/4294 group 4294967295 pipe.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "ed27",
+ "name": "Add sample action with invalid rate argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 4294967296 group 4 index 10",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 10",
+ "matchPattern": "action order [0-9]+: sample rate 1/4294967296 group 4 pipe.*index 10 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "2eae",
+ "name": "Add sample action with invalid group argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 4098 group 5294967299 continue index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 1",
+ "matchPattern": "action order [0-9]+: sample rate 1/4098 group 5294967299 continue.*index 1 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6ff3",
+ "name": "Add sample action with invalid trunc size",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 trunc 112233445566 index 11",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 11",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 trunc_size 112233445566.*index 11 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "2b2a",
+ "name": "Add sample action with invalid index",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 5294967299",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action sample index 5294967299",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 5294967299 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "dee2",
+ "name": "Add sample action with maximum allowed index",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 4294967295",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 4294967295 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "560e",
+ "name": "Add sample action with cookie",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action sample rate 1024 group 4 index 45 cookie aabbccdd",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action sample index 45",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pipe.*index 45.*cookie aabbccdd",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "704a",
+ "name": "Replace existing sample action with new rate argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action sample rate 1024 group 4 index 4"
+ ],
+ "cmdUnderTest": "$TC actions replace action sample rate 2048 group 4 index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/2048 group 4 pipe.*index 4",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "60eb",
+ "name": "Replace existing sample action with new group argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action sample rate 1024 group 4 index 4"
+ ],
+ "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 pipe.*index 4",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "2cce",
+ "name": "Replace existing sample action with new trunc argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action sample rate 1024 group 4 trunc 48 index 4"
+ ],
+ "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 trunc 64 index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 trunc_size 64 pipe.*index 4",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "59d1",
+ "name": "Replace existing sample action with new control argument",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action sample rate 1024 group 4 reclassify index 4"
+ ],
+ "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 pipe index 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 7 pipe.*index 4",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ },
+ {
+ "id": "0a6e",
+ "name": "Replace sample action with invalid goto chain control",
+ "category": [
+ "actions",
+ "sample"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action sample",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action sample rate 1024 group 4 pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action sample rate 1024 group 7 goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action sample",
+ "matchPattern": "action order [0-9]+: sample rate 1/1024 group 4 pass.*index 90",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action sample"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json
new file mode 100644
index 000000000..8e8c1ae12
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/simple.json
@@ -0,0 +1,155 @@
+[
+ {
+ "id": "b078",
+ "name": "Add simple action",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action simple sdata \"A triumph\" index 60",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <A triumph>.*index 60 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action simple"
+ ]
+ },
+ {
+ "id": "6d4c",
+ "name": "Add simple action with duplicate index",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action simple sdata \"Aruba\" index 4"
+ ],
+ "cmdUnderTest": "$TC actions add action simple sdata \"Jamaica\" index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <Jamaica>.*ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action simple"
+ ]
+ },
+ {
+ "id": "2542",
+ "name": "List simple actions",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action simple sdata \"Rock\"",
+ "$TC actions add action simple sdata \"Paper\"",
+ "$TC actions add action simple sdata \"Scissors\" index 98"
+ ],
+ "cmdUnderTest": "$TC actions list action simple",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <[A-Z][a-z]*>",
+ "matchCount": "3",
+ "teardown": [
+ "$TC actions flush action simple"
+ ]
+ },
+ {
+ "id": "ea67",
+ "name": "Delete simple action",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action simple sdata \"Blinkenlights\" index 1"
+ ],
+ "cmdUnderTest": "$TC actions delete action simple index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <Blinkenlights>.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action simple"
+ ]
+ },
+ {
+ "id": "8ff1",
+ "name": "Flush simple actions",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action simple sdata \"Kirk\"",
+ "$TC actions add action simple sdata \"Spock\" index 50",
+ "$TC actions add action simple sdata \"McCoy\" index 9"
+ ],
+ "cmdUnderTest": "$TC actions flush action simple",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <[A-Z][a-z]*>",
+ "matchCount": "0",
+ "teardown": [
+ ""
+ ]
+ },
+ {
+ "id": "b776",
+ "name": "Replace simple action with invalid goto chain control",
+ "category": [
+ "actions",
+ "simple"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action simple",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action simple sdata \"hello\" pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action simple sdata \"world\" goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action simple",
+ "matchPattern": "action order [0-9]*: Simple <hello>.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action simple"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
new file mode 100644
index 000000000..9cdd2e31a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbedit.json
@@ -0,0 +1,721 @@
+[
+ {
+ "id": "6236",
+ "name": "Add skbedit action with valid mark",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "c8cf",
+ "name": "Add skbedit action with 32-bit maximum mark",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 4294967295 pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 1",
+ "matchPattern": "action order [0-9]*: skbedit mark 4294967295.*pipe.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "407b",
+ "name": "Add skbedit action with mark exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 666777888999",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "d4cd",
+ "name": "Add skbedit action with valid mark and mask",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabb",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "baa7",
+ "name": "Add skbedit action with valid mark and 32-bit maximum mask",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 1/0xffffffff",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1/0xffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "62a5",
+ "name": "Add skbedit action with valid mark and mask exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 1/0xaabbccddeeff112233",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabbccddeeff112233",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "bc15",
+ "name": "Add skbedit action with valid mark and mask with invalid format",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 1/-1234",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1/-1234",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "57c2",
+ "name": "Replace skbedit action with new mask",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbedit mark 1/0x11223344 index 1"
+ ],
+ "cmdUnderTest": "$TC actions replace action skbedit mark 1/0xaabb index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 1/0xaabb",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "081d",
+ "name": "Add skbedit action with priority",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit prio 99",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit priority :99",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "cc37",
+ "name": "Add skbedit action with invalid priority",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit prio foo",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit priority",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "3c95",
+ "name": "Add skbedit action with queue_mapping",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 909",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit queue_mapping 909",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "985c",
+ "name": "Add skbedit action with queue_mapping exceeding 16-bit maximum",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 67000",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit queue_mapping",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "224f",
+ "name": "Add skbedit action with ptype host",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit ptype host",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit ptype host",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "d1a3",
+ "name": "Add skbedit action with ptype otherhost",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit ptype otherhost",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit ptype otherhost",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "b9c6",
+ "name": "Add skbedit action with invalid ptype",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit ptype openair",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit ptype openair",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "464a",
+ "name": "Add skbedit action with control pipe",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit ptype host pipe index 11",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 11",
+ "matchPattern": "action order [0-9]*: skbedit ptype host pipe.*index 11 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "212f",
+ "name": "Add skbedit action with control reclassify",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 56789 reclassify index 90",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 90",
+ "matchPattern": "action order [0-9]*: skbedit mark 56789 reclassify.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "0651",
+ "name": "Add skbedit action with control pass",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 pass index 271",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 271",
+ "matchPattern": "action order [0-9]*: skbedit queue_mapping 3 pass.*index 271 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "cc53",
+ "name": "Add skbedit action with control drop",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit queue_mapping 3 drop index 271",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 271",
+ "matchPattern": "action order [0-9]*: skbedit queue_mapping 3 drop.*index 271 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "ec16",
+ "name": "Add skbedit action with control jump",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit priority 8 jump 9 index 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 2",
+ "matchPattern": "action order [0-9]*: skbedit priority :8 jump 9.*index 2 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "db54",
+ "name": "Add skbedit action with control continue",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit priority 16 continue index 32",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 32",
+ "matchPattern": "action order [0-9]*: skbedit priority :16 continue.*index 32 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "1055",
+ "name": "Add skbedit action with cookie",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit priority 16 continue index 32 cookie deadbeef",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 32",
+ "matchPattern": "action order [0-9]*: skbedit priority :16 continue.*index 32 ref.*cookie deadbeef",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "5172",
+ "name": "List skbedit actions",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbedit ptype otherhost",
+ "$TC actions add action skbedit ptype broadcast",
+ "$TC actions add action skbedit mark 59",
+ "$TC actions add action skbedit mark 409"
+ ],
+ "cmdUnderTest": "$TC actions list action skbedit",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit",
+ "matchCount": "4",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "a6d6",
+ "name": "Add skbedit action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 808 index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbedit index 4294967295",
+ "matchPattern": "action order [0-9]*: skbedit mark 808.*index 4294967295",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "f0f4",
+ "name": "Add skbedit action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbedit mark 808 pass index 4294967297",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action skbedit index 4294967297",
+ "matchPattern": "action order [0-9]*:.*skbedit.*mark 808.*pass.*index 4294967297",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "38f3",
+ "name": "Delete skbedit action",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbedit mark 42 index 9009"
+ ],
+ "cmdUnderTest": "$TC actions del action skbedit index 9009",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit mark 42",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "ce97",
+ "name": "Flush skbedit actions",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ "$TC actions add action skbedit mark 500",
+ "$TC actions add action skbedit mark 501",
+ "$TC actions add action skbedit mark 502",
+ "$TC actions add action skbedit mark 503",
+ "$TC actions add action skbedit mark 504",
+ "$TC actions add action skbedit mark 505",
+ "$TC actions add action skbedit mark 506"
+ ],
+ "cmdUnderTest": "$TC actions flush action skbedit",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "1b2b",
+ "name": "Replace skbedit action with invalid goto_chain control",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbedit ptype host pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action skbedit ptype host goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "action order [0-9]*: skbedit ptype host pass.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "630c",
+ "name": "Add batch of 32 skbedit actions with all parameters and cookie",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action skbedit queue_mapping 2 priority 10 mark 7/0xaabbccdd ptype host inheritdsfield index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action skbedit"
+ ]
+ },
+ {
+ "id": "706d",
+ "name": "Delete batch of 32 skbedit actions with all parameters",
+ "category": [
+ "actions",
+ "skbedit"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbedit",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action skbedit queue_mapping 2 priority 10 mark 7/0xaabbccdd ptype host inheritdsfield index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action skbedit index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action skbedit",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json
new file mode 100644
index 000000000..6eb4c4f97
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/skbmod.json
@@ -0,0 +1,421 @@
+[
+ {
+ "id": "7d50",
+ "name": "Add skbmod action to set destination mac",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set dmac 11:22:33:44:55:66 index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe set dmac 11:22:33:44:55:66\\s+index 5",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "9b29",
+ "name": "Add skbmod action to set source mac",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set smac 77:88:99:AA:BB:CC index 7",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbmod index 7",
+ "matchPattern": "action order [0-9]*: skbmod pipe set smac 77:88:99:aa:bb:cc\\s+index 7",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "1724",
+ "name": "Add skbmod action with invalid mac",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set smac 00:44:55:44:55",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe set smac 00:44:55:44:55",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "3cf1",
+ "name": "Add skbmod action with valid etype",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set etype 0xfefe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe set etype 0xFEFE",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "a749",
+ "name": "Add skbmod action with invalid etype",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set etype 0xfefef",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe set etype 0xFEFEF",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "bfe6",
+ "name": "Add skbmod action to swap mac",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod swap mac",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbmod index 1",
+ "matchPattern": "action order [0-9]*: skbmod pipe swap mac",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "839b",
+ "name": "Add skbmod action with control pipe",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod swap mac pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe swap mac",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "c167",
+ "name": "Add skbmod action with control reclassify",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set etype 0xbeef reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod reclassify set etype 0xBEEF",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "0c2f",
+ "name": "Add skbmod action with control drop",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set etype 0x0001 drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbmod index 1",
+ "matchPattern": "action order [0-9]*: skbmod drop set etype 0x1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "d113",
+ "name": "Add skbmod action with control continue",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set etype 0x1 continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod continue set etype 0x1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "7242",
+ "name": "Add skbmod action with control pass",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set smac 00:00:00:00:00:01 pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pass set smac 00:00:00:00:00:01",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "6046",
+ "name": "Add skbmod action with control reclassify and cookie",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action skbmod set smac 00:01:02:03:04:01 reclassify index 1 cookie ddeeffaabb11cc22",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbmod index 1",
+ "matchPattern": "action order [0-9]*: skbmod reclassify set smac 00:01:02:03:04:01.*index 1 ref.*cookie ddeeffaabb11cc22",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "58cb",
+ "name": "List skbmod actions",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbmod set etype 0x0001",
+ "$TC actions add action skbmod set etype 0x0011",
+ "$TC actions add action skbmod set etype 0x0021",
+ "$TC actions add action skbmod set etype 0x0031",
+ "$TC actions add action skbmod set etype 0x0041"
+ ],
+ "cmdUnderTest": "$TC actions ls action skbmod",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod",
+ "matchCount": "5",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "9aa8",
+ "name": "Get a single skbmod action from a list",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbmod set etype 0x0001",
+ "$TC actions add action skbmod set etype 0x0011",
+ "$TC actions add action skbmod set etype 0x0021",
+ "$TC actions add action skbmod set etype 0x0031",
+ "$TC actions add action skbmod set etype 0x0041"
+ ],
+ "cmdUnderTest": "$TC actions ls action skbmod",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action skbmod index 4",
+ "matchPattern": "action order [0-9]*: skbmod pipe set etype 0x31",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "e93a",
+ "name": "Delete an skbmod action",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbmod set etype 0x1111 index 909"
+ ],
+ "cmdUnderTest": "$TC actions del action skbmod index 909",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pipe set etype 0x1111\\s+index 909",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "40c2",
+ "name": "Flush skbmod actions",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ "$TC actions add action skbmod set etype 0x0001",
+ "$TC actions add action skbmod set etype 0x0011",
+ "$TC actions add action skbmod set etype 0x0021",
+ "$TC actions add action skbmod set etype 0x0031",
+ "$TC actions add action skbmod set etype 0x0041"
+ ],
+ "cmdUnderTest": "$TC actions flush action skbmod",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ },
+ {
+ "id": "b651",
+ "name": "Replace skbmod action with invalid goto_chain control",
+ "category": [
+ "actions",
+ "skbmod"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action skbmod",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action skbmod set etype 0x1111 pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action skbmod set etype 0x1111 goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions ls action skbmod",
+ "matchPattern": "action order [0-9]*: skbmod pass set etype 0x1111\\s+index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action skbmod"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
new file mode 100644
index 000000000..d06346968
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/tunnel_key.json
@@ -0,0 +1,937 @@
+[
+ {
+ "id": "2b11",
+ "name": "Add tunnel_key set action with mandatory parameters",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "dc6b",
+ "name": "Add tunnel_key set action with missing mandatory src_ip parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set dst_ip 20.20.20.2 id 100",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*dst_ip 20.20.20.2.*key_id 100",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "7f25",
+ "name": "Add tunnel_key set action with missing mandatory dst_ip parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 id 100",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 10.10.10.1.*key_id 100",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "a5e0",
+ "name": "Add tunnel_key set action with invalid src_ip parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 300.168.100.1 dst_ip 192.168.200.1 id 7 index 1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 300.168.100.1.*dst_ip 192.168.200.1.*key_id 7.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "eaa8",
+ "name": "Add tunnel_key set action with invalid dst_ip parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 192.168.100.1 dst_ip 192.168.800.1 id 10 index 11",
+ "expExitCode": "1",
+ "verifyCmd": "$TC actions get action tunnel_key index 11",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 192.168.100.1.*dst_ip 192.168.800.1.*key_id 10.*index 11 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "3b09",
+ "name": "Add tunnel_key set action with invalid id parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 112233445566778899 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 112233445566778899.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "9625",
+ "name": "Add tunnel_key set action with invalid dst_port parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 11 dst_port 998877 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 11.*dst_port 998877.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "05af",
+ "name": "Add tunnel_key set action with optional dst_port parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 192.168.100.1 dst_ip 192.168.200.1 id 789 dst_port 4000 index 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 10",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 192.168.100.1.*dst_ip 192.168.200.1.*key_id 789.*dst_port 4000.*index 10 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "da80",
+ "name": "Add tunnel_key set action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 11 index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 4294967295",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 11.*index 4294967295 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "d407",
+ "name": "Add tunnel_key set action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 11 index 4294967295678",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 4294967295678",
+ "matchPattern": "action order [0-9]+: tunnel_key set.*index 4294967295678 ref",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "5cba",
+ "name": "Add tunnel_key set action with id value at 32-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 4294967295 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 4294967295.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "e84a",
+ "name": "Add tunnel_key set action with id value exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42949672955 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42949672955.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "9c19",
+ "name": "Add tunnel_key set action with dst_port value at 16-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 429 dst_port 65535 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 429.*dst_port 65535.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "3bd9",
+ "name": "Add tunnel_key set action with dst_port value exceeding 16-bit maximum",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 429 dst_port 65535789 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 429.*dst_port 65535789.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "68e2",
+ "name": "Add tunnel_key unset action",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key unset index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*unset.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "6192",
+ "name": "Add tunnel_key unset continue action",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key unset continue index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*unset continue.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "061d",
+ "name": "Add tunnel_key set continue action with cookie",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 192.168.10.1 dst_ip 192.168.20.2 id 123 continue index 1 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 192.168.10.1.*dst_ip 192.168.20.2.*key_id 123.*csum continue.*index 1.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "8acb",
+ "name": "Add tunnel_key set continue action with invalid cookie",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 192.168.10.1 dst_ip 192.168.20.2 id 123 continue index 1 cookie aa11bb22cc33dd44ee55ff66aa11b1b2777888",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 192.168.10.1.*dst_ip 192.168.20.2.*key_id 123.*csum continue.*index 1.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2777888",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "a07e",
+ "name": "Add tunnel_key action with no set/unset command specified",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "b227",
+ "name": "Add tunnel_key action with csum option",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1 csum index 99",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 99",
+ "matchPattern": "action order [0-9]+: tunnel_key.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1.*csum pipe.*index 99",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "58a7",
+ "name": "Add tunnel_key action with nocsum option",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 10.10.10.2 id 7823 nocsum index 234",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 234",
+ "matchPattern": "action order [0-9]+: tunnel_key.*src_ip 10.10.10.1.*dst_ip 10.10.10.2.*key_id 7823.*nocsum pipe.*index 234",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "2575",
+ "name": "Add tunnel_key action with not-supported parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 10.10.10.2 id 7 foobar 999 index 4",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 4",
+ "matchPattern": "action order [0-9]+: tunnel_key.*src_ip 10.10.10.1.*dst_ip 10.10.10.2.*key_id 7.*foobar 999.*index 4",
+ "matchCount": "0",
+ "teardown": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ]
+ },
+ {
+ "id": "7a88",
+ "name": "Add tunnel_key action with cookie parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 10.10.10.2 id 7 index 4 cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 4",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 10.10.10.2.*key_id 7.*csum pipe.*index 4 ref.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "4f20",
+ "name": "Add tunnel_key action with a single geneve option parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "e33d",
+ "name": "Add tunnel_key action with multiple geneve options parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022,0408:42:0040007611223344,0111:02:1020304011223344 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022,0408:42:0040007611223344,0111:02:1020304011223344.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "0778",
+ "name": "Add tunnel_key action with invalid class geneve option parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 824212:80:00880022 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 824212:80:00880022.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "4ae8",
+ "name": "Add tunnel_key action with invalid type geneve option parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:4224:00880022 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:4224:00880022.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "4039",
+ "name": "Add tunnel_key action with short data length geneve option parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:4288 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:4288.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "26a6",
+ "name": "Add tunnel_key action with non-multiple of 4 data length geneve option parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:4288428822 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:4288428822.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "f44d",
+ "name": "Add tunnel_key action with incomplete geneve options parameter",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 1.1.1.1 dst_ip 2.2.2.2 id 42 dst_port 6081 geneve_opts 0102:80:00880022,0408:42: index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 1.1.1.1.*dst_ip 2.2.2.2.*key_id 42.*dst_port 6081.*geneve_opt[s]? 0102:80:00880022,0408:42:.*index 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "7afc",
+ "name": "Replace tunnel_key set action with all parameters",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 csum id 1 index 1"
+ ],
+ "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 11.11.11.1 dst_ip 21.21.21.2 dst_port 3129 nocsum id 11 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 11.11.11.1.*dst_ip 21.21.21.2.*key_id 11.*dst_port 3129.*nocsum pipe.*index 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "364d",
+ "name": "Replace tunnel_key set action with all parameters and cookie",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 index 1 cookie 123456"
+ ],
+ "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 11.11.11.1 dst_ip 21.21.21.2 dst_port 3129 id 11 csum reclassify index 1 cookie 123456",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action tunnel_key index 1",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 11.11.11.1.*dst_ip 21.21.21.2.*key_id 11.*dst_port 3129.*csum reclassify.*index 1.*cookie 123456",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "937c",
+ "name": "Fetch all existing tunnel_key actions",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 pipe index 1",
+ "$TC actions add action tunnel_key set src_ip 11.10.10.1 dst_ip 21.20.20.2 dst_port 3129 csum id 2 jump 10 index 2",
+ "$TC actions add action tunnel_key set src_ip 12.10.10.1 dst_ip 22.20.20.2 dst_port 3130 csum id 3 pass index 3",
+ "$TC actions add action tunnel_key set src_ip 13.10.10.1 dst_ip 23.20.20.2 dst_port 3131 nocsum id 4 continue index 4"
+ ],
+ "cmdUnderTest": "$TC actions list action tunnel_key",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1.*dst_port 3128.*nocsum pipe.*index 1.*set.*src_ip 11.10.10.1.*dst_ip 21.20.20.2.*key_id 2.*dst_port 3129.*csum jump 10.*index 2.*set.*src_ip 12.10.10.1.*dst_ip 22.20.20.2.*key_id 3.*dst_port 3130.*csum pass.*index 3.*set.*src_ip 13.10.10.1.*dst_ip 23.20.20.2.*key_id 4.*dst_port 3131.*nocsum continue.*index 4",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "6783",
+ "name": "Flush all existing tunnel_key actions",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 pipe index 1",
+ "$TC actions add action tunnel_key set src_ip 11.10.10.1 dst_ip 21.20.20.2 dst_port 3129 csum id 2 reclassify index 2",
+ "$TC actions add action tunnel_key set src_ip 12.10.10.1 dst_ip 22.20.20.2 dst_port 3130 csum id 3 pass index 3",
+ "$TC actions add action tunnel_key set src_ip 13.10.10.1 dst_ip 23.20.20.2 dst_port 3131 nocsum id 4 continue index 4"
+ ],
+ "cmdUnderTest": "$TC actions flush action tunnel_key",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+:.*",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "8242",
+ "name": "Replace tunnel_key set action with invalid goto chain",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 dst_port 3128 nocsum id 1 pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action tunnel_key set src_ip 10.10.10.2 dst_ip 20.20.20.1 dst_port 3129 id 2 csum goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action tunnel_key index 90",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1.*dst_port 3128.*csum pass.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ },
+ {
+ "id": "0cd2",
+ "name": "Add tunnel_key set action with no_percpu flag",
+ "category": [
+ "actions",
+ "tunnel_key"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action tunnel_key",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action tunnel_key set src_ip 10.10.10.1 dst_ip 20.20.20.2 id 1 no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action tunnel_key",
+ "matchPattern": "action order [0-9]+: tunnel_key.*set.*src_ip 10.10.10.1.*dst_ip 20.20.20.2.*key_id 1.*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action tunnel_key"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json
new file mode 100644
index 000000000..41d783254
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/actions/vlan.json
@@ -0,0 +1,835 @@
+[
+ {
+ "id": "6f5a",
+ "name": "Add vlan pop action with pipe opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop pipe index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*pipe.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "df35",
+ "name": "Add vlan pop action with pass opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop pass index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 8",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*pass.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "b0d4",
+ "name": "Add vlan pop action with drop opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop drop index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 8",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*drop.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "95ee",
+ "name": "Add vlan pop action with reclassify opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop reclassify index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 8",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*reclassify.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "0283",
+ "name": "Add vlan pop action with continue opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop continue index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 8",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*continue.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "b6b9",
+ "name": "Add vlan pop action with jump opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop jump 10 index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*jump 10.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "87c3",
+ "name": "Add vlan pop action with trap opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop trap index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop trap.*index 8 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "a178",
+ "name": "Add vlan pop action with invalid opcode",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop foo index 8",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*foo.*index 8 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ee6f",
+ "name": "Add vlan pop action with index at 32-bit maximum",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop index 4294967295",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*index 4294967295 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "0dfa",
+ "name": "Add vlan pop action with index exceeding 32-bit maximum",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop reclassify index 429496729599",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action vlan index 429496729599",
+ "matchPattern": "action order [0-9]+: vlan.*pop.reclassify.*index 429496729599",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "2b91",
+ "name": "Add vlan invalid action",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan bad_mode",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*bad_mode",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "57fc",
+ "name": "Add vlan push action with invalid protocol type",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push protocol ABCD",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "3989",
+ "name": "Add vlan push action with default protocol and priority",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 123 index 18",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 18",
+ "matchPattern": "action order [0-9]+: vlan.*push id 123 protocol 802.1Q priority 0 pipe.*index 18 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "79dc",
+ "name": "Add vlan push action with protocol 802.1Q and priority 3",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 77 protocol 802.1Q priority 3 continue index 734",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 734",
+ "matchPattern": "action order [0-9]+: vlan.*push id 77 protocol 802.1Q priority 3 continue.*index 734 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "4d73",
+ "name": "Add vlan push action with protocol 802.1AD",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 1024 protocol 802.1AD pass index 10000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 10000",
+ "matchPattern": "action order [0-9]+: vlan.*push id 1024 protocol 802.1ad priority 0 pass.*index 10000 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "1f4b",
+ "name": "Add vlan push action with maximum 12-bit vlan ID",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 4094 index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 1",
+ "matchPattern": "action order [0-9]+: vlan.*push id 4094.*protocol 802.1Q.*priority 0.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "1f7b",
+ "name": "Add vlan push action with invalid vlan ID",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 5678 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push id 5678.*index 1 ref",
+ "matchCount": "0",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "fe40",
+ "name": "Add vlan push action with maximum 3-bit IEEE 802.1p priority",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 4 priority 7 reclassify index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 1",
+ "matchPattern": "action order [0-9]+: vlan.*push id 4.*protocol 802.1Q.*priority 7.*reclassify.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "5d02",
+ "name": "Add vlan push action with invalid IEEE 802.1p priority",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 5 priority 10 index 1",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push id 5.*index 1 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "6812",
+ "name": "Add vlan modify action for protocol 802.1Q",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1Q id 5 index 100",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 100",
+ "matchPattern": "action order [0-9]+: vlan.*modify id 5 protocol 802.1Q.*pipe.*index 100 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "5a31",
+ "name": "Add vlan modify action for protocol 802.1AD",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan modify protocol 802.1ad id 500 reclassify index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 12",
+ "matchPattern": "action order [0-9]+: vlan.*modify id 500 protocol 802.1ad priority 0 reclassify.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "3deb",
+ "name": "Replace existing vlan push action with new ID",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 500 pipe index 12"
+ ],
+ "cmdUnderTest": "$TC actions replace action vlan push id 700 pipe index 12",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 12",
+ "matchPattern": "action order [0-9]+: vlan.*push id 700 protocol 802.1Q priority 0 pipe.*index 12 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "9e76",
+ "name": "Replace existing vlan push action with new protocol",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 1 protocol 802.1Q pipe index 1"
+ ],
+ "cmdUnderTest": "$TC actions replace action vlan push id 1 protocol 802.1ad pipe index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 1",
+ "matchPattern": "action order [0-9]+: vlan.*push id 1 protocol 802.1ad priority 0 pipe.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "ede4",
+ "name": "Replace existing vlan push action with new priority",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 1 protocol 802.1Q priority 3 reclassify index 1"
+ ],
+ "cmdUnderTest": "$TC actions replace action vlan push id 1 priority 4 reclassify index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 1",
+ "matchPattern": "action order [0-9]+: vlan.*push id 1 protocol 802.1Q priority 4 reclassify.*index 1 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "d413",
+ "name": "Replace existing vlan pop action with new cookie",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan pop continue index 1 cookie 22334455"
+ ],
+ "cmdUnderTest": "$TC actions replace action vlan pop continue index 1 cookie a1b1c2d1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions get action vlan index 1",
+ "matchPattern": "action order [0-9]+: vlan.*pop continue.*index 1 ref.*cookie a1b1c2d1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "83a4",
+ "name": "Delete vlan pop action",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan pop index 44"
+ ],
+ "cmdUnderTest": "$TC actions del action vlan index 44",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*index 44 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "ed1e",
+ "name": "Delete vlan push action for protocol 802.1Q",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 4094 protocol 802.1Q index 999"
+ ],
+ "cmdUnderTest": "$TC actions del action vlan index 999",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push id 4094 protocol 802.1Q priority 0 pipe.*index 999 ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "a2a3",
+ "name": "Flush vlan actions",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 4 protocol 802.1ad index 10",
+ "$TC actions add action vlan push id 4 protocol 802.1ad index 11",
+ "$TC actions add action vlan push id 4 protocol 802.1ad index 12",
+ "$TC actions add action vlan push id 4 protocol 802.1ad index 13"
+ ],
+ "cmdUnderTest": "$TC actions flush action vlan",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push id 4 protocol 802.1ad",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "1d78",
+ "name": "Add vlan push action with cookie",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan push id 4 cookie a0a0a0a0a0a0a0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*push id 4.*cookie a0a0a0a0a0a0a0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "e394",
+ "name": "Replace vlan push action with invalid goto chain control",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "$TC actions add action vlan push id 500 pass index 90"
+ ],
+ "cmdUnderTest": "$TC actions replace action vlan push id 500 goto chain 42 index 90 cookie c1a0c1a0",
+ "expExitCode": "255",
+ "verifyCmd": "$TC actions get action vlan index 90",
+ "matchPattern": "action order [0-9]+: vlan.*push id 500 protocol 802.1Q priority 0 pass.*index 90 ref",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "294e",
+ "name": "Add batch of 32 vlan push actions with cookie",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan push protocol 802.1q id 4094 priority 7 pipe index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "56f7",
+ "name": "Delete batch of 32 vlan push actions",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan push protocol 802.1q id 4094 priority 7 pipe index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "759f",
+ "name": "Add batch of 32 vlan pop actions with cookie",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan pop continue index \\$i cookie aabbccddeeff112233445566778800a1 \\\"; args=\"\\$args\\$cmd\"; done && $TC actions add \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "32",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ },
+ {
+ "id": "c84a",
+ "name": "Delete batch of 32 vlan pop actions",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ],
+ "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan pop index \\$i \\\"; args=\\\"\\$args\\$cmd\\\"; done && $TC actions add \\$args\""
+ ],
+ "cmdUnderTest": "bash -c \"for i in \\`seq 1 32\\`; do cmd=\\\"action vlan index \\$i \\\"; args=\"\\$args\\$cmd\"; done && $TC actions del \\$args\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "^[ \t]+index [0-9]+ ref",
+ "matchCount": "0",
+ "teardown": []
+ },
+ {
+ "id": "1a3d",
+ "name": "Add vlan pop action with no_percpu flag",
+ "category": [
+ "actions",
+ "vlan"
+ ],
+ "setup": [
+ [
+ "$TC actions flush action vlan",
+ 0,
+ 1,
+ 255
+ ]
+ ],
+ "cmdUnderTest": "$TC actions add action vlan pop no_percpu",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action vlan",
+ "matchPattern": "action order [0-9]+: vlan.*pop.*no_percpu",
+ "matchCount": "1",
+ "teardown": [
+ "$TC actions flush action vlan"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json
new file mode 100644
index 000000000..e788c114a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/basic.json
@@ -0,0 +1,1278 @@
+[
+ {
+ "id": "7a92",
+ "name": "Add basic filter with cmp ematch u8/link layer and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*cmp\\(u8 at 0 layer 0 mask 0xff gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "2e8a",
+ "name": "Add basic filter with cmp ematch u8/link layer with trans flag and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff trans gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*cmp\\(u8 at 0 layer 0 mask 0xff trans gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4d9f",
+ "name": "Add basic filter with cmp ematch u16/link layer and a single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u16 at 0 layer 0 mask 0xff00 lt 3)' action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u16 at 0 layer 0 mask 0xff00 lt 3\\).*action.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4943",
+ "name": "Add basic filter with cmp ematch u32/link layer and multiple actions",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u32 at 4 layer link mask 0xff00ff00 eq 3)' action skbedit mark 7 pipe action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u32 at 4 layer 0 mask 0xff00ff00 eq 3\\).*action.*skbedit.*mark 7 pipe.*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7559",
+ "name": "Add basic filter with cmp ematch u8/network layer and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0xab protocol ip prio 11 basic match 'cmp(u8 at 0 layer 1 mask 0xff gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0xab prio 11 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 11 basic.*handle 0xab flowid 1:1.*cmp\\(u8 at 0 layer 1 mask 0xff gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "aff4",
+ "name": "Add basic filter with cmp ematch u8/network layer with trans flag and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0xab protocol ip prio 11 basic match 'cmp(u8 at 0 layer 1 mask 0xff trans gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0xab prio 11 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 11 basic.*handle 0xab flowid 1:1.*cmp\\(u8 at 0 layer 1 mask 0xff trans gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "c732",
+ "name": "Add basic filter with cmp ematch u16/network layer and a single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0x100 protocol ip prio 100 basic match 'cmp(u16 at 0 layer network mask 0xff00 lt 3)' action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0x100 prio 100 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 100 basic.*handle 0x100.*cmp\\(u16 at 0 layer 1 mask 0xff00 lt 3\\).*action.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "32d8",
+ "name": "Add basic filter with cmp ematch u32/network layer and multiple actions",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0x112233 protocol ip prio 7 basic match 'cmp(u32 at 4 layer network mask 0xff00ff00 eq 3)' action skbedit mark 7 pipe action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0x112233 prio 7 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 7 basic.*handle 0x112233.*cmp\\(u32 at 4 layer 1 mask 0xff00ff00 eq 3\\).*action.*skbedit.*mark 7 pipe.*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b99c",
+ "name": "Add basic filter with cmp ematch u8/transport layer and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer transport mask 0xff gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*cmp\\(u8 at 0 layer 2 mask 0xff gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "0752",
+ "name": "Add basic filter with cmp ematch u8/transport layer with trans flag and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer transport mask 0xff trans gt 10)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*cmp\\(u8 at 0 layer 2 mask 0xff trans gt 10\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7e07",
+ "name": "Add basic filter with cmp ematch u16/transport layer and a single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u16 at 0 layer 2 mask 0xff00 lt 3)' action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u16 at 0 layer 2 mask 0xff00 lt 3\\).*action.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "62d7",
+ "name": "Add basic filter with cmp ematch u32/transport layer and multiple actions",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u32 at 4 layer transport mask 0xff00ff00 eq 3)' action skbedit mark 7 pipe action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u32 at 4 layer 2 mask 0xff00ff00 eq 3\\).*action.*skbedit.*mark 7 pipe.*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "304b",
+ "name": "Add basic filter with NOT cmp ematch rule and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'not cmp(u8 at 0 layer link mask 0xff eq 3)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*NOT cmp\\(u8 at 0 layer 0 mask 0xff eq 3\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "8ecb",
+ "name": "Add basic filter with two ANDed cmp ematch rules and single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff eq 3) and cmp(u16 at 8 layer link mask 0x00ff gt 7)' action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u8 at 0 layer 0 mask 0xff eq 3\\).*AND cmp\\(u16 at 8 layer 0 mask 0xff gt 7\\).*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b1ad",
+ "name": "Add basic filter with two ORed cmp ematch rules and single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff eq 3) or cmp(u16 at 8 layer link mask 0x00ff gt 7)' action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u8 at 0 layer 0 mask 0xff eq 3\\).*OR cmp\\(u16 at 8 layer 0 mask 0xff gt 7\\).*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4600",
+ "name": "Add basic filter with two ANDed cmp ematch rules and one ORed ematch rule and single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff eq 3) and cmp(u16 at 8 layer link mask 0x00ff gt 7) or cmp(u32 at 4 layer network mask 0xa0a0 lt 3)' action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u8 at 0 layer 0 mask 0xff eq 3\\).*AND cmp\\(u16 at 8 layer 0 mask 0xff gt 7\\).*OR cmp\\(u32 at 4 layer 1 mask 0xa0a0 lt 3\\).*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "bc59",
+ "name": "Add basic filter with two ANDed cmp ematch rules and one NOT ORed ematch rule and single action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'cmp(u8 at 0 layer link mask 0xff eq 3) and cmp(u16 at 8 layer link mask 0x00ff gt 7) or not cmp(u32 at 4 layer network mask 0xa0a0 lt 3)' action gact drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1.*cmp\\(u8 at 0 layer 0 mask 0xff eq 3\\).*AND cmp\\(u16 at 8 layer 0 mask 0xff gt 7\\).*OR NOT cmp\\(u32 at 4 layer 1 mask 0xa0a0 lt 3\\).*action.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "bae4",
+ "name": "Add basic filter with u32 ematch u8/zero offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 0x0f at 0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(01000000/0f000000 at 0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "e6cb",
+ "name": "Add basic filter with u32 ematch u8/zero offset and invalid value >0xFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 0x0f at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/0f000000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7727",
+ "name": "Add basic filter with u32 ematch u8/positive offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0x1f at 12)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17000000/1f000000 at 12\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "a429",
+ "name": "Add basic filter with u32 ematch u8/invalid mask >0xFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff00 at 12)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000/ff000000 at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "8373",
+ "name": "Add basic filter with u32 ematch u8/missing offset",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff at)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "ab8e",
+ "name": "Add basic filter with u32 ematch u8/missing AT keyword",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x77 0xff 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77000000 at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "712d",
+ "name": "Add basic filter with u32 ematch u8/missing value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 at 12)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "350f",
+ "name": "Add basic filter with u32 ematch u8/non-numeric value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 zero 0xff at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ff000000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "e28f",
+ "name": "Add basic filter with u32 ematch u8/non-numeric mask",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x11 mask at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11000000/00000000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6d5f",
+ "name": "Add basic filter with u32 ematch u8/negative offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at -14)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(0000a000/0000f000 at -16\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "12dc",
+ "name": "Add basic filter with u32 ematch u8/nexthdr+ offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0xaa 0xf0 at nexthdr+0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0000000/f0000000 at nexthdr\\+0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "1d85",
+ "name": "Add basic filter with u32 ematch u16/zero offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x1122 0xffff at 0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/ffff0000 at 0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "3672",
+ "name": "Add basic filter with u32 ematch u16/zero offset and invalid value >0xFFFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x112233 0xffff at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223300/ffff0000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7fb0",
+ "name": "Add basic filter with u32 ematch u16/positive offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0x1fff at 12)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(17880000/1fff0000 at 12\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "19af",
+ "name": "Add basic filter with u32 ematch u16/invalid mask >0xFFFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffffffff at 12)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffffffff at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "446d",
+ "name": "Add basic filter with u32 ematch u16/missing offset",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff at)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000 at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "151b",
+ "name": "Add basic filter with u32 ematch u16/missing AT keyword",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0x7788 0xffff 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77880000/ffff0000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "bb23",
+ "name": "Add basic filter with u32 ematch u16/missing value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 at 12)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "decc",
+ "name": "Add basic filter with u32 ematch u16/non-numeric value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 zero 0xffff at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "e988",
+ "name": "Add basic filter with u32 ematch u16/non-numeric mask",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u8 0x1122 mask at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11220000/00000000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "07d8",
+ "name": "Add basic filter with u32 ematch u16/negative offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xffff at -12)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabb0000/ffff0000 at -12\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "f474",
+ "name": "Add basic filter with u32 ematch u16/nexthdr+ offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u16 0xaabb 0xf0f0 at nexthdr+0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(a0b00000/f0f00000 at nexthdr\\+0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "47a0",
+ "name": "Add basic filter with u32 ematch u32/zero offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at 0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at 0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "849f",
+ "name": "Add basic filter with u32 ematch u32/positive offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0x1ffff0f0 at 12)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227080/1ffff0f0 at 12\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "d288",
+ "name": "Add basic filter with u32 ematch u32/missing offset",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11227788 0xffffffff at)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11227788/ffffffff at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4998",
+ "name": "Add basic filter with u32 ematch u32/missing AT keyword",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x77889900 0xfffff0f0 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(77889900/fffff0f0 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "1f0a",
+ "name": "Add basic filter with u32 ematch u32/missing value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 at 12)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(at 12\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "848e",
+ "name": "Add basic filter with u32 ematch u32/non-numeric value",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 zero 0xffff at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(00000000/ffff0000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "f748",
+ "name": "Add basic filter with u32 ematch u32/non-numeric mask",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0x11223344 mask at 0)' classid 1:1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(11223344/00000000 at 0\\)",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "55a6",
+ "name": "Add basic filter with u32 ematch u32/negative offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xff00ff00 at -12)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aa00cc00/ff00ff00 at -12\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7282",
+ "name": "Add basic filter with u32 ematch u32/nexthdr+ offset and default action",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'u32(u32 0xaabbccdd 0xffffffff at nexthdr+0)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*u32\\(aabbccdd/ffffffff at nexthdr\\+0\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b2b6",
+ "name": "Add basic filter with canid ematch and single SFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "f67f",
+ "name": "Add basic filter with canid ematch and single SFF with mask",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaabb:0x00ff)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x2BB:0xFF\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "bd5c",
+ "name": "Add basic filter with canid ematch and multiple SFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 1 sff 2 sff 3)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0x1 sff 0x2 sff 0x3\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "83c7",
+ "name": "Add basic filter with canid ematch and multiple SFF with masks",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0xaa:0x01 sff 0xbb:0x02 sff 0xcc:0x03)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(sff 0xAA:0x1 sff 0xBB:0x2 sff 0xCC:0x3\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "a8f5",
+ "name": "Add basic filter with canid ematch and single EFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "98ae",
+ "name": "Add basic filter with canid ematch and single EFF with mask",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 0xaabb:0xf1f1)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAABB:0xF1F1\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6056",
+ "name": "Add basic filter with canid ematch and multiple EFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 1 eff 2 eff 3)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x1 eff 0x2 eff 0x3\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "d188",
+ "name": "Add basic filter with canid ematch and multiple EFF with masks",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(eff 0xaa:0x01 eff 0xbb:0x02 eff 0xcc:0x03)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0xAA:0x1 eff 0xBB:0x2 eff 0xCC:0x3\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "25d1",
+ "name": "Add basic filter with canid ematch and a combination of SFF/EFF",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01 eff 0x02)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2 sff 0x1\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b438",
+ "name": "Add basic filter with canid ematch and a combination of SFF/EFF with masks",
+ "category": [
+ "filter",
+ "basic"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 protocol ip prio 1 basic match 'canid(sff 0x01:0xf eff 0x02:0xf)' classid 1:1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol ip basic",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 basic.*handle 0x1 flowid 1:1.*canid\\(eff 0x2:0xF sff 0x1:0xF\\)",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/concurrency.json b/tools/testing/selftests/tc-testing/tc-tests/filters/concurrency.json
new file mode 100644
index 000000000..c2a433a47
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/concurrency.json
@@ -0,0 +1,177 @@
+[
+ {
+ "id": "e41d",
+ "name": "Add 1M flower filters with 10 parallel tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 100000 10 add"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/add* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "1000000",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "6f52",
+ "name": "Delete 1M flower filters with 10 parallel tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 1000000 1 add",
+ "$TC -b $BATCH_DIR/add_0",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 100000 10 del"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/del* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "c9da",
+ "name": "Replace 1M flower filters with 10 parallel tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 1000000 1 add",
+ "$TC -b $BATCH_DIR/add_0",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 100000 10 replace"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/replace* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "1000000",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "14be",
+ "name": "Concurrently replace same range of 100k flower filters from 10 tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 100000 1 add",
+ "$TC -b $BATCH_DIR/add_0",
+ "./tdc_multibatch.py -d $DEV2 $BATCH_DIR 100000 10 replace"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/replace* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "100000",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "0c44",
+ "name": "Concurrently delete same range of 100k flower filters from 10 tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py $DEV2 $BATCH_DIR 100000 1 add",
+ "$TC -b $BATCH_DIR/add_0",
+ "./tdc_multibatch.py -d $DEV2 $BATCH_DIR 100000 10 del"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/del* -print | xargs -n 1 -P 10 $TC -f -b\"",
+ "expExitCode": "123",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "ab62",
+ "name": "Add and delete from same tp with 10 tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py -x init_ $DEV2 $BATCH_DIR 100000 5 add",
+ "bash -c \"find $BATCH_DIR/init_* -print | xargs -n 1 -P 5 $TC -b\"",
+ "./tdc_multibatch.py -x par_ -a 500001 -m 5 $DEV2 $BATCH_DIR 100000 5 add",
+ "./tdc_multibatch.py -x par_ $DEV2 $BATCH_DIR 100000 5 del"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/par_* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "500000",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ },
+ {
+ "id": "6e8f",
+ "name": "Replace and delete from same tp with 10 tc instances",
+ "category": [
+ "filter",
+ "flower",
+ "concurrency"
+ ],
+ "setup": [
+ "/bin/mkdir $BATCH_DIR",
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_multibatch.py -x init_ $DEV2 $BATCH_DIR 100000 10 add",
+ "bash -c \"find $BATCH_DIR/init_* -print | xargs -n 1 -P 5 $TC -b\"",
+ "./tdc_multibatch.py -x par_ -a 500001 -m 5 $DEV2 $BATCH_DIR 100000 5 replace",
+ "./tdc_multibatch.py -x par_ $DEV2 $BATCH_DIR 100000 5 del"
+ ],
+ "cmdUnderTest": "bash -c \"find $BATCH_DIR/par_* -print | xargs -n 1 -P 10 $TC -b\"",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "500000",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm -rf $BATCH_DIR"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/fw.json b/tools/testing/selftests/tc-testing/tc-tests/filters/fw.json
new file mode 100644
index 000000000..527204956
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/fw.json
@@ -0,0 +1,1355 @@
+[
+ {
+ "id": "901f",
+ "name": "Add fw filter with prio at 16-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 65535 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 65535 protocol all fw",
+ "matchPattern": "pref 65535 fw.*handle 0x1.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "51e2",
+ "name": "Add fw filter with prio exceeding 16-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 65536 fw action ok",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 65536 protocol all fw",
+ "matchPattern": "pref 65536 fw.*handle 0x1.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "d987",
+ "name": "Add fw filter with action ok",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "c591",
+ "name": "Add fw filter with action ok by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet. Remove this when the behaviour is fixed.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact ok index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action pass.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "affe",
+ "name": "Add fw filter with action continue",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action continue",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "38b3",
+ "name": "Add fw filter with action continue by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet. Remove this when the behaviour is fixed.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact continue index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action continue.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "28bc",
+ "name": "Add fw filter with action pipe",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6753",
+ "name": "Add fw filter with action pipe by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact pipe index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action pipe.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "8da2",
+ "name": "Add fw filter with action drop",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 protocol all prio 1 fw",
+ "matchPattern": "handle 0x1.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6dc6",
+ "name": "Add fw filter with action drop by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact drop index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action drop.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "9436",
+ "name": "Add fw filter with action reclassify",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action reclassify",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "3bc2",
+ "name": "Add fw filter with action reclassify by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact reclassify index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action reclassify.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "95bb",
+ "name": "Add fw filter with action jump 10",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action jump 10",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action jump 10",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "36f7",
+ "name": "Add fw filter with action jump 10 by reference",
+ "__comment": "We sleep here because the action might not have been deleted by the workqueue yet.",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions add action gact jump 10 index 1"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action gact index 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action jump 10.*index 1 ref 2 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress",
+ "/bin/sleep 1",
+ "$TC actions del action gact index 1"
+ ]
+ },
+ {
+ "id": "3d74",
+ "name": "Add fw filter with action goto chain 5",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action goto chain 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action goto chain 5",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "eb8f",
+ "name": "Add fw filter with invalid action",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw action pump",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "handle 0x1.*gact action pump",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6a79",
+ "name": "Add fw filter with missing mandatory action",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "filter protocol all pref [0-9]+ fw.*handle 0x1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "8298",
+ "name": "Add fw filter with cookie",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 2 fw action pipe cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 2 protocol all fw",
+ "matchPattern": "pref 2 fw.*handle 0x1.*gact action pipe.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "a88c",
+ "name": "Add fw filter with invalid cookie",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 2 fw action continue cookie aa11bb22cc33dd44ee55ff66aa11b1b2777888",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 2 protocol all fw",
+ "matchPattern": "pref 2 fw.*handle 0x1.*gact action continue.*cookie aa11bb22cc33dd44ee55ff66aa11b1b2777888",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "10f6",
+ "name": "Add fw filter with handle in hex",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0xa1b2ff prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0xa1b2ff prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xa1b2ff.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "9d51",
+ "name": "Add fw filter with handle at 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 4294967295 prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4294967295 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xffffffff.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "d939",
+ "name": "Add fw filter with handle exceeding 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 4294967296 prio 1 fw action ok",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4294967296 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0x.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "658c",
+ "name": "Add fw filter with mask in hex",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 10/0xa1b2f prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xa/0xa1b2f",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "86be",
+ "name": "Add fw filter with mask at 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 10/4294967295 prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xa[^/]",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "e635",
+ "name": "Add fw filter with mask exceeding 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 10/4294967296 prio 1 fw action ok",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xa",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6cab",
+ "name": "Add fw filter with handle/mask in hex",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 0xa1b2cdff/0x1a2bffdc prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0xa1b2cdff prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xa1b2cdff/0x1a2bffdc",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "8700",
+ "name": "Add fw filter with handle/mask at 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 4294967295/4294967295 prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 0xffffffff prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0xffffffff[^/]",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7d62",
+ "name": "Add fw filter with handle/mask exceeding 32-bit maximum",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 4294967296/4294967296 prio 1 fw action ok",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7b69",
+ "name": "Add fw filter with missing mandatory handle",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 1 fw action ok",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "filter protocol all.*fw.*handle.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "d68b",
+ "name": "Add fw filter with invalid parent",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent aa11b1b2: handle 1 prio 1 fw action ok",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DEV1 parent aa11b1b2: handle 1 prio 1 protocol all fw",
+ "matchPattern": "filter protocol all pref 1 fw.*handle 0x1.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "66e0",
+ "name": "Add fw filter with missing mandatory parent id",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 handle 1 prio 1 fw action ok",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "pref [0-9]+ fw.*handle 0x1.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "0ff3",
+ "name": "Add fw filter with classid",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw classid 3 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0x1 classid :3.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "9849",
+ "name": "Add fw filter with classid at root",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw classid ffff:ffff action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "pref 1 fw.*handle 0x1 classid root.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b7ff",
+ "name": "Add fw filter with classid - keeps last 8 (hex) digits",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw classid 98765fedcb action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0x1 classid 765f:edcb.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "2b18",
+ "name": "Add fw filter with invalid classid",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 1 prio 1 fw classid 6789defg action ok",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol all fw",
+ "matchPattern": "fw.*handle 0x1 classid 6789:defg.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "fade",
+ "name": "Add fw filter with flowid",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 10 prio 1 fw flowid 1:10 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 1 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 1 fw.*handle 0xa classid 1:10.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "33af",
+ "name": "Add fw filter with flowid then classid (same arg, takes second)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 11 prio 1 fw flowid 10 classid 4 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 11 prio 1 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 1 fw.*handle 0xb classid :4.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "8a8c",
+ "name": "Add fw filter with classid then flowid (same arg, takes second)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: handle 11 prio 1 fw classid 4 flowid 10 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 11 prio 1 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 1 fw.*handle 0xb classid :10.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "b50d",
+ "name": "Add fw filter with handle val/mask and flowid 10:1000",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 3 handle 10/0xff fw flowid 10:1000 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 10 prio 3 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 3 fw.*handle 0xa/0xff classid 10:1000.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "7207",
+ "name": "Add fw filter with protocol ip",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 1 handle 3 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 3 prio 1 protocol ip fw",
+ "matchPattern": "filter parent ffff: protocol ip pref 1 fw.*handle 0x3.*gact action pass.*index [0-9]+ ref [0-9]+ bind [0-9]+",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "306d",
+ "name": "Add fw filter with protocol ipv6",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ipv6 prio 2 handle 4 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4 prio 2 protocol ipv6 fw",
+ "matchPattern": "filter parent ffff: protocol ipv6 pref 2 fw.*handle 0x4.*gact action pass.*index [0-9]+ ref [0-9]+ bind [0-9]+",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "9a78",
+ "name": "Add fw filter with protocol arp",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol arp prio 5 handle 7 fw action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 7 prio 5 protocol arp fw",
+ "matchPattern": "filter parent ffff: protocol arp pref 5 fw.*handle 0x7.*gact action drop.*index [0-9]+ ref [0-9]+ bind [0-9]+",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "1821",
+ "name": "Add fw filter with protocol 802_3",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol 802_3 handle 1 prio 1 fw action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol 802_3 fw",
+ "matchPattern": "filter parent ffff: protocol 802_3 pref 1 fw.*handle 0x1.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "2260",
+ "name": "Add fw filter with invalid protocol",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol igmp handle 1 prio 1 fw action ok",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 1 prio 1 protocol igmp fw",
+ "matchPattern": "filter parent ffff: protocol igmp pref 1 fw.*handle 0x1.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "09d7",
+ "name": "Add fw filters protocol 802_3 and ip with conflicting priorities",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: protocol 802_3 prio 3 handle 7 fw action ok"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: protocol ip prio 3 handle 8 fw action ok",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 8 prio 3 protocol ip fw",
+ "matchPattern": "filter parent ffff: protocol ip pref 3 fw.*handle 0x8",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6973",
+ "name": "Add fw filters with same index, same action",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: prio 6 handle 2 fw action continue index 5"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 8 handle 4 fw action continue index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4 prio 8 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 8 fw.*handle 0x4.*gact action continue.*index 5 ref 2 bind 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "fc06",
+ "name": "Add fw filters with action police",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 3 handle 4 fw action police rate 1kbit burst 10k index 5",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4 prio 3 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 3 fw.*handle 0x4.*police 0x5 rate 1Kbit burst 10Kb mtu 2Kb action reclassify overhead 0b.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "aac7",
+ "name": "Add fw filters with action police linklayer atm",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 parent ffff: prio 3 handle 4 fw action police rate 2mbit burst 200k linklayer atm index 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4 prio 3 protocol all fw",
+ "matchPattern": "filter parent ffff: protocol all pref 3 fw.*handle 0x4.*police 0x8 rate 2Mbit burst 200Kb mtu 2Kb action reclassify overhead 0b linklayer atm.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "5339",
+ "name": "Del entire fw filter",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 5 prio 7 fw action pass",
+ "$TC filter add dev $DEV1 parent ffff: handle 3 prio 9 fw action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff:",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "protocol all pref.*handle.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "0e99",
+ "name": "Del single fw filter x1",
+ "__comment__": "First of two tests to check that one filter is there and the other isn't",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 5 prio 7 fw action pass",
+ "$TC filter add dev $DEV1 parent ffff: handle 3 prio 9 fw action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: handle 3 prio 9 fw action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "protocol all pref 7.*handle 0x5.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "f54c",
+ "name": "Del single fw filter x2",
+ "__comment__": "Second of two tests to check that one filter is there and the other isn't",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 5 prio 7 fw action pass",
+ "$TC filter add dev $DEV1 parent ffff: handle 3 prio 9 fw action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: handle 3 prio 9 fw action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "protocol all pref 9.*handle 0x3.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "ba94",
+ "name": "Del fw filter by prio",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 1 prio 4 fw action ok",
+ "$TC filter add dev $DEV1 parent ffff: handle 2 prio 4 fw action ok"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: prio 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "pref 4 fw.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4acb",
+ "name": "Del fw filter by chain",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 4 prio 2 chain 13 fw action pipe",
+ "$TC filter add dev $DEV1 parent ffff: handle 3 prio 5 chain 13 fw action pipe"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: chain 13",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "fw chain 13 handle.*gact action pipe",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "3424",
+ "name": "Del fw filter by action (invalid)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 2 prio 4 fw action drop"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: fw action drop",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 2 prio 4 protocol all fw",
+ "matchPattern": "handle 0x2.*gact action drop",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "da89",
+ "name": "Del fw filter by handle (invalid)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 3 prio 4 fw action continue"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: handle 3 fw",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 3 prio 4 protocol all fw",
+ "matchPattern": "handle 0x3.*gact action continue",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4d95",
+ "name": "Del fw filter by protocol (invalid)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 4 prio 2 protocol arp fw action pipe"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: protocol arp fw",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter get dev $DEV1 parent ffff: handle 4 prio 2 protocol arp fw",
+ "matchPattern": "filter parent ffff: protocol arp.*handle 0x4.*gact action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "4736",
+ "name": "Del fw filter by flowid (invalid)",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 4 prio 2 fw action pipe flowid 45"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DEV1 parent ffff: fw flowid 45",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "handle 0x4.*gact action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "3dcb",
+ "name": "Replace fw filter action",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 1 prio 2 fw action ok"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 parent ffff: handle 1 prio 2 fw action pipe",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "pref 2 fw.*handle 0x1.*gact action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "eb4d",
+ "name": "Replace fw filter classid",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 1 prio 2 fw action ok"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 parent ffff: handle 1 prio 2 fw action pipe classid 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "pref 2 fw.*handle 0x1 classid :2.*gact action pipe",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "67ec",
+ "name": "Replace fw filter index",
+ "category": [
+ "filter",
+ "fw"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 parent ffff: handle 1 prio 2 fw action ok index 3"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 parent ffff: handle 1 prio 2 fw action ok index 16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 parent ffff:",
+ "matchPattern": "pref 2 fw.*handle 0x1.*gact action pass.*index 16",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json b/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json
new file mode 100644
index 000000000..51799874a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/matchall.json
@@ -0,0 +1,391 @@
+[
+ {
+ "id": "f62b",
+ "name": "Add ingress matchall filter for protocol ipv4 and action PASS",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ip matchall action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 1 protocol ip matchall",
+ "matchPattern": "^filter parent ffff: protocol ip pref 1 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7f09",
+ "name": "Add egress matchall filter for protocol ipv4 and action PASS",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0x1 prio 1 protocol ip matchall action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 1 prio 1 protocol ip matchall",
+ "matchPattern": "^filter parent 1: protocol ip pref 1 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "0596",
+ "name": "Add ingress matchall filter for protocol ipv6 and action DROP",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 1 protocol ipv6 matchall",
+ "matchPattern": "^filter parent ffff: protocol ipv6 pref 1 matchall.*handle 0x1.*gact action drop.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "41df",
+ "name": "Add egress matchall filter for protocol ipv6 and action DROP",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0x1 prio 1 protocol ipv6 matchall action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 1 prio 1 protocol ipv6 matchall",
+ "matchPattern": "^filter parent 1: protocol ipv6 pref 1 matchall.*handle 0x1.*gact action drop.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "e1da",
+ "name": "Add ingress matchall filter for protocol ipv4 and action PASS with priority at 16-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 65535 protocol ipv4 matchall action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 65535 protocol ipv4 matchall",
+ "matchPattern": "^filter parent ffff: protocol ip pref 65535 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "3de5",
+ "name": "Add egress matchall filter for protocol ipv4 and action PASS with priority at 16-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0x1 prio 65535 protocol ipv4 matchall action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 1 prio 65535 protocol ipv4 matchall",
+ "matchPattern": "^filter parent 1: protocol ip pref 65535 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "72d7",
+ "name": "Add ingress matchall filter for protocol ipv4 and action PASS with priority exceeding 16-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 655355 protocol ipv4 matchall action pass",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 1 prio 655355 protocol ipv4 matchall",
+ "matchPattern": "^filter parent ffff: protocol ip pref 655355 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "41d3",
+ "name": "Add egress matchall filter for protocol ipv4 and action PASS with priority exceeding 16-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0x1 prio 655355 protocol ipv4 matchall action pass",
+ "expExitCode": "255",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 1 prio 655355 protocol ipv4 matchall",
+ "matchPattern": "^filter parent 1: protocol ip pref 655355 matchall.*handle 0x1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "f755",
+ "name": "Add ingress matchall filter for all protocols and action CONTINUE with handle at 32-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0xffffffff prio 1 protocol all matchall action continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 0xffffffff prio 1 protocol all matchall",
+ "matchPattern": "^filter parent ffff: protocol all pref 1 matchall.*handle 0xffffffff.*gact action continue.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "2c33",
+ "name": "Add egress matchall filter for all protocols and action CONTINUE with handle at 32-bit maximum",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0xffffffff prio 1 protocol all matchall action continue",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 0xffffffff prio 1 protocol all matchall",
+ "matchPattern": "^filter parent 1: protocol all pref 1 matchall.*handle 0xffffffff.*gact action continue.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "0e4a",
+ "name": "Add ingress matchall filter for all protocols and action RECLASSIFY with skip_hw flag",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol all matchall skip_hw action reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 0x1 prio 1 protocol all matchall",
+ "matchPattern": "^filter parent ffff: protocol all pref 1 matchall.*handle 0x1.*skip_hw.*not_in_hw.*gact action reclassify.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7f60",
+ "name": "Add egress matchall filter for all protocols and action RECLASSIFY with skip_hw flag",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent 1: handle 0x1 prio 1 protocol all matchall skip_hw action reclassify",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent 1: handle 0x1 prio 1 protocol all matchall",
+ "matchPattern": "^filter parent 1: protocol all pref 1 matchall.*handle 0x1.*skip_hw.*not_in_hw.*gact action reclassify.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "8bd2",
+ "name": "Add ingress matchall filter for protocol ipv6 and action PASS with classid",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall classid 1:1 action pass",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall",
+ "matchPattern": "^filter parent ffff: protocol ipv6 pref 1 matchall.*handle 0x1.*flowid 1:1.*gact action pass.*ref 1 bind 1",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "2a4a",
+ "name": "Add ingress matchall filter for protocol ipv6 and action PASS with invalid classid",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall classid 6789defg action pass",
+ "expExitCode": "1",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall",
+ "matchPattern": "^filter protocol ipv6 pref 1 matchall.*handle 0x1.*flowid 6789defg.*gact action pass.*ref 1 bind 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "eaf8",
+ "name": "Delete single ingress matchall filter",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall classid 1:2 action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter get dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv6 matchall",
+ "matchPattern": "^filter protocol ipv6 pref 1 matchall.*handle 0x1.*flowid 1:2.*gact action pass.*ref 1 bind 1",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "76ad",
+ "name": "Delete all ingress matchall filters",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol all matchall classid 1:2 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x2 prio 2 protocol all matchall classid 1:3 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x3 prio 3 protocol all matchall classid 1:4 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x4 prio 4 protocol all matchall classid 1:5 action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DUMMY parent ffff:",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DUMMY parent ffff:",
+ "matchPattern": "^filter protocol all pref.*matchall.*handle.*flowid.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1eb9",
+ "name": "Delete single ingress matchall filter out of multiple",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol all matchall classid 1:2 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x2 prio 2 protocol all matchall classid 1:3 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x3 prio 3 protocol all matchall classid 1:4 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x4 prio 4 protocol all matchall classid 1:5 action pass"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DUMMY parent ffff: protocol all handle 0x2 prio 2 matchall",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DUMMY parent ffff:",
+ "matchPattern": "^filter protocol all pref 2 matchall.*handle 0x2 flowid 1:2.*gact action pass",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "6d63",
+ "name": "Delete ingress matchall filter by chain ID",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol all chain 1 matchall classid 1:1 action pass",
+ "$TC filter add dev $DUMMY parent ffff: handle 0x1 prio 1 protocol ipv4 chain 2 matchall classid 1:3 action continue"
+ ],
+ "cmdUnderTest": "$TC filter del dev $DUMMY parent ffff: chain 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DUMMY parent ffff:",
+ "matchPattern": "^filter protocol all pref 1 matchall chain 1 handle 0x1 flowid 1:1.*gact action pass",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
new file mode 100644
index 000000000..361235ad5
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json
@@ -0,0 +1,129 @@
+[
+ {
+ "id": "2638",
+ "name": "Add matchall and try to get it",
+ "category": [
+ "filter",
+ "matchall"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 clsact",
+ "$TC filter add dev $DEV1 protocol all pref 1 ingress handle 0x1234 matchall action ok"
+ ],
+ "cmdUnderTest": "$TC filter get dev $DEV1 protocol all pref 1 ingress handle 0x1234 matchall",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol all pref 1 matchall chain 0 handle 0x1234",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 clsact"
+ ]
+ },
+ {
+ "id": "2ff3",
+ "name": "Add flower with max handle and then dump it",
+ "category": [
+ "filter",
+ "flower"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV2 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress handle 0xffffffff flower action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower.*handle 0xffffffff",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress"
+ ]
+ },
+ {
+ "id": "d052",
+ "name": "Add 1M filters with the same action",
+ "category": [
+ "filter",
+ "flower"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV2 ingress",
+ "./tdc_batch.py $DEV2 $BATCH_FILE --share_action -n 1000000"
+ ],
+ "cmdUnderTest": "$TC -b $BATCH_FILE",
+ "expExitCode": "0",
+ "verifyCmd": "$TC actions list action gact",
+ "matchPattern": "action order 0: gact action drop.*index 1 ref 1000000 bind 1000000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress",
+ "/bin/rm $BATCH_FILE"
+ ]
+ },
+ {
+ "id": "4cbd",
+ "name": "Try to add filter with duplicate key",
+ "category": [
+ "filter",
+ "flower"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV2 ingress",
+ "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip prio 1 ingress flower dst_mac e4:11:22:11:4a:51 src_mac e4:11:22:11:4a:50 ip_proto tcp src_ip 1.1.1.1 dst_ip 2.2.2.2 action drop",
+ "expExitCode": "2",
+ "verifyCmd": "$TC -s filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower chain 0 handle",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress"
+ ]
+ },
+ {
+ "id": "7c65",
+ "name": "Add flower filter and then terse dump it",
+ "category": [
+ "filter",
+ "flower"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV2 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -br filter show dev $DEV2 ingress",
+ "matchPattern": "filter protocol ip pref 1 flower.*handle",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress"
+ ]
+ },
+ {
+ "id": "d45e",
+ "name": "Add flower filter and verify that terse dump doesn't output filter key",
+ "category": [
+ "filter",
+ "flower"
+ ],
+ "setup": [
+ "$TC qdisc add dev $DEV2 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop",
+ "expExitCode": "0",
+ "verifyCmd": "$TC -br filter show dev $DEV2 ingress",
+ "matchPattern": " dst_mac e4:11:22:11:4a:51",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV2 ingress"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
new file mode 100644
index 000000000..e09d3c0e3
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json
@@ -0,0 +1,205 @@
+[
+ {
+ "id": "afa9",
+ "name": "Add u32 with source match",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip src 127.0.0.1/32 flowid 1:1 action ok",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 1 u32 chain (0[ ]+$|0 fh 800: ht divisor 1|0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1.*match 7f000001/ffffffff at 12)",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6aa7",
+ "name": "Add/Replace u32 with source match and invalid indev",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress protocol ip prio 1 u32 match ip src 127.0.0.1/32 indev notexist20 flowid 1:1 action ok",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 1 u32 chain 0",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "bc4d",
+ "name": "Replace valid u32 with source match and invalid indev",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 ingress protocol ip prio 1 u32 match ip src 127.0.0.3/32 flowid 1:3 action ok"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress protocol ip prio 1 u32 match ip src 127.0.0.2/32 indev notexist20 flowid 1:2 action ok",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 1 u32 chain (0[ ]+$|0 fh 800: ht divisor 1|0 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:3.*match 7f000003/ffffffff at 12)",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "648b",
+ "name": "Add u32 with custom hash table",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 ingress prio 99 handle 42: u32 divisor 256",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "pref 99 u32 chain (0[ ]+$|0 fh 42: ht divisor 256|0 fh 800: ht divisor 1)",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "6658",
+ "name": "Add/Replace u32 with custom hash table and invalid handle",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress prio 99 handle 42:42 u32 divisor 256",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "pref 99 u32 chain 0",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "9d0a",
+ "name": "Replace valid u32 with custom hash table and invalid handle",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 ingress prio 99 handle 42: u32 divisor 256"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress prio 99 handle 42:42 u32 divisor 128",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "pref 99 u32 chain (0[ ]+$|0 fh 42: ht divisor 256|0 fh 800: ht divisor 1)",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "1644",
+ "name": "Add u32 filter that links to a custom hash table",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 ingress prio 99 handle 43: u32 divisor 256"
+ ],
+ "cmdUnderTest": "$TC filter add dev $DEV1 ingress protocol ip prio 98 u32 link 43: hashkey mask 0x0000ff00 at 12 match ip src 192.168.0.0/16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 98 u32 chain (0[ ]+$|0 fh 801: ht divisor 1|0 fh 801::800 order 2048 key ht 801 bkt 0 link 43:.*match c0a80000/ffff0000 at 12.*hash mask 0000ff00 at 12)",
+ "matchCount": "3",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "74c2",
+ "name": "Add/Replace u32 filter with invalid hash table id",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress protocol ip prio 20 u32 ht 47:47 action drop",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 20 u32 chain 0",
+ "matchCount": "0",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ },
+ {
+ "id": "1fe6",
+ "name": "Replace valid u32 filter with invalid hash table id",
+ "category": [
+ "filter",
+ "u32"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$TC qdisc add dev $DEV1 ingress",
+ "$TC filter add dev $DEV1 ingress protocol ip prio 99 handle 43: u32 divisor 1",
+ "$TC filter add dev $DEV1 ingress protocol ip prio 98 u32 ht 43: match tcp src 22 FFFF classid 1:3"
+ ],
+ "cmdUnderTest": "$TC filter replace dev $DEV1 ingress protocol ip prio 98 u32 ht 43:1 match tcp src 23 FFFF classid 1:4",
+ "expExitCode": "2",
+ "verifyCmd": "$TC filter show dev $DEV1 ingress",
+ "matchPattern": "filter protocol ip pref 99 u32 chain (0[ ]+$|0 fh (43|800): ht divisor 1|0 fh 43::800 order 2048 key ht 43 bkt 0 flowid 1:3.*match 00160000/ffff0000 at nexthdr\\+0)",
+ "matchCount": "4",
+ "teardown": [
+ "$TC qdisc del dev $DEV1 ingress"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ets.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ets.json
new file mode 100644
index 000000000..180593010
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ets.json
@@ -0,0 +1,940 @@
+[
+ {
+ "id": "e90e",
+ "name": "Add ETS qdisc using bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 2",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .* bands 2",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "b059",
+ "name": "Add ETS qdisc using quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 900 800 700",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 4 quanta 1000 900 800 700",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "e8e7",
+ "name": "Add ETS qdisc using strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 3 strict 3",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "233c",
+ "name": "Add ETS qdisc using bands + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4 quanta 1000 900 800 700",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 4 quanta 1000 900 800 700 priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "3d35",
+ "name": "Add ETS qdisc using bands + strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 3 strict 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 3 strict 3 priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7f3b",
+ "name": "Add ETS qdisc using strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 3 quanta 1500 750",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 5 strict 3 quanta 1500 750 priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "4593",
+ "name": "Add ETS qdisc using strict 0 + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 0 quanta 1500 750",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 2 quanta 1500 750 priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "8938",
+ "name": "Add ETS qdisc using bands + strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 5 strict 3 quanta 1500 750",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 5 .*strict 3 quanta 1500 750 priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "0782",
+ "name": "Add ETS qdisc with more bands than quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 2 quanta 1000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 2 .*quanta 1000 [1-9][0-9]* priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "501b",
+ "name": "Add ETS qdisc with more bands than strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 3 strict 1",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 3 strict 1 quanta ([1-9][0-9]* ){2}priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "671a",
+ "name": "Add ETS qdisc with more bands than strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 3 strict 1 quanta 1000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 3 strict 1 quanta 1000 [1-9][0-9]* priomap",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "2a23",
+ "name": "Add ETS qdisc with 16 bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .* bands 16",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "8daf",
+ "name": "Add ETS qdisc with 17 bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 17",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7f95",
+ "name": "Add ETS qdisc with 17 strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 17",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "837a",
+ "name": "Add ETS qdisc with 16 quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .* bands 16",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "65b6",
+ "name": "Add ETS qdisc with 17 quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "b9e9",
+ "name": "Add ETS qdisc with 16 strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 8 quanta 1 2 3 4 5 6 7 8",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .* bands 16",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "9877",
+ "name": "Add ETS qdisc with 17 strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 9 quanta 1 2 3 4 5 6 7 8",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "c696",
+ "name": "Add ETS qdisc with priomap",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 5 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "30c4",
+ "name": "Add ETS qdisc with quanta + priomap",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 2000 3000 4000 5000 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*quanta 1000 2000 3000 4000 5000 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "e8ac",
+ "name": "Add ETS qdisc with strict + priomap",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 5 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*bands 5 strict 5 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "5a7e",
+ "name": "Add ETS qdisc with quanta + strict + priomap",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 2 quanta 1000 2000 3000 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*strict 2 quanta 1000 2000 3000 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "cb8b",
+ "name": "Show ETS class :1",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 4000 3000 2000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC class show dev $DUMMY classid 1:1",
+ "matchPattern": "class ets 1:1 root quantum 4000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1b4e",
+ "name": "Show ETS class :2",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 4000 3000 2000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC class show dev $DUMMY classid 1:2",
+ "matchPattern": "class ets 1:2 root quantum 3000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "f642",
+ "name": "Show ETS class :3",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 4000 3000 2000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC class show dev $DUMMY classid 1:3",
+ "matchPattern": "class ets 1:3 root quantum 2000",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "0a5f",
+ "name": "Show ETS strict class",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC class show dev $DUMMY classid 1:1",
+ "matchPattern": "class ets 1:1 root $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "f7c8",
+ "name": "Add ETS qdisc with too many quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 2 quanta 1000 2000 3000",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "2389",
+ "name": "Add ETS qdisc with too many strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 2 strict 3",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "fe3c",
+ "name": "Add ETS qdisc with too many strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4 strict 2 quanta 1000 2000 3000",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "cb04",
+ "name": "Add ETS qdisc with excess priomap elements",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 5 priomap 0 0 1 0 1 2 0 1 2 3 0 1 2 3 4 0 1 2",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "c32e",
+ "name": "Add ETS qdisc with priomap above bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 2 priomap 0 1 2",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "744c",
+ "name": "Add ETS qdisc with priomap above quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 500 priomap 0 1 2",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7b33",
+ "name": "Add ETS qdisc with priomap above strict",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 2 priomap 0 1 2",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "dbe6",
+ "name": "Add ETS qdisc with priomap above strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets strict 1 quanta 1000 500 priomap 0 1 2 3",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "bdb2",
+ "name": "Add ETS qdisc with priomap within bands with strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4 strict 1 quanta 1000 500 priomap 0 1 2 3",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "39a3",
+ "name": "Add ETS qdisc with priomap above bands with strict + quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4 strict 1 quanta 1000 500 priomap 0 1 2 3 4",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "557c",
+ "name": "Unset priorities default to the last band",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4 priomap 0 0 0 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets .*priomap 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "a347",
+ "name": "Unset priorities default to the last band -- no priomap",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 4",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets .*priomap 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "39c4",
+ "name": "Add ETS qdisc with too few bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 0",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "930b",
+ "name": "Add ETS qdisc with too many bands",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets bands 17",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "406a",
+ "name": "Add ETS qdisc without parameters",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "e51a",
+ "name": "Zero element in quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 0 800 700",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "e7f2",
+ "name": "Sole zero element in quanta",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta 0",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "d6e6",
+ "name": "No values after the quanta keyword",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root ets quanta",
+ "expExitCode": "255",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "28c6",
+ "name": "Change ETS band quantum",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 2000 3000"
+ ],
+ "cmdUnderTest": "$TC class change dev $DUMMY classid 1:1 ets quantum 1500",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*quanta 1500 2000 3000 priomap ",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "4714",
+ "name": "Change ETS band without quantum",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root ets quanta 1000 2000 3000"
+ ],
+ "cmdUnderTest": "$TC class change dev $DUMMY classid 1:1 ets",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets 1: root .*quanta 1000 2000 3000 priomap ",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "6979",
+ "name": "Change quantum of a strict ETS band",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root ets strict 5"
+ ],
+ "cmdUnderTest": "$TC class change dev $DUMMY classid 1:2 ets quantum 1500",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets .*bands 5 .*strict 5",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "9a7d",
+ "name": "Change ETS strict band without quantum",
+ "category": [
+ "qdisc",
+ "ets"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root ets strict 5"
+ ],
+ "cmdUnderTest": "$TC class change dev $DUMMY classid 1:2 ets",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ets .*bands 5 .*strict 5",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json
new file mode 100644
index 000000000..5ecd93b4c
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fifo.json
@@ -0,0 +1,304 @@
+[
+ {
+ "id": "a519",
+ "name": "Add bfifo qdisc with system default parameters on egress",
+ "__comment": "When omitted, queue size in bfifo is calculated as: txqueuelen * (MTU + LinkLayerHdrSize), where LinkLayerHdrSize=14 for Ethernet",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root bfifo",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root.*limit [0-9]+b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root bfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "585c",
+ "name": "Add pfifo qdisc with system default parameters on egress",
+ "__comment": "When omitted, queue size in pfifo is defaulted to the interface's txqueuelen value.",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root pfifo",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc pfifo 1: root.*limit [0-9]+p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root pfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "a86e",
+ "name": "Add bfifo qdisc with system default parameters on egress with handle of maximum value",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle ffff: bfifo",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo ffff: root.*limit [0-9]+b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle ffff: root bfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "9ac8",
+ "name": "Add bfifo qdisc on egress with queue size of 3000 bytes",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root bfifo limit 3000b",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root.*limit 3000b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root bfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "f4e6",
+ "name": "Add pfifo qdisc on egress with queue size of 3000 packets",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY txqueuelen 3000 type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root pfifo limit 3000",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc pfifo 1: root.*limit 3000p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root pfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "b1b1",
+ "name": "Add bfifo qdisc with system default parameters on egress with invalid handle exceeding maximum value",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle 10000: bfifo",
+ "expExitCode": "255",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 10000: root.*limit [0-9]+b",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "8d5e",
+ "name": "Add bfifo qdisc on egress with unsupported argument",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root bfifo foorbar",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7787",
+ "name": "Add pfifo qdisc on egress with unsupported argument",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root pfifo foorbar",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc pfifo 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "c4b6",
+ "name": "Replace bfifo qdisc on egress with new queue size",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link del dev $DUMMY type dummy || /bin/true",
+ "$IP link add dev $DUMMY txqueuelen 1000 type dummy",
+ "$TC qdisc add dev $DUMMY handle 1: root bfifo"
+ ],
+ "cmdUnderTest": "$TC qdisc replace dev $DUMMY handle 1: root bfifo limit 3000b",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root.*limit 3000b",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root bfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "3df6",
+ "name": "Replace pfifo qdisc on egress with new queue size",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link del dev $DUMMY type dummy || /bin/true",
+ "$IP link add dev $DUMMY txqueuelen 1000 type dummy",
+ "$TC qdisc add dev $DUMMY handle 1: root pfifo"
+ ],
+ "cmdUnderTest": "$TC qdisc replace dev $DUMMY handle 1: root pfifo limit 30",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc pfifo 1: root.*limit 30p",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root pfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "7a67",
+ "name": "Add bfifo qdisc on egress with queue size in invalid format",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root bfifo limit foo-bar",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root.*limit foo-bar",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1298",
+ "name": "Add duplicate bfifo qdisc on egress",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root bfifo"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root bfifo",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root bfifo",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "45a0",
+ "name": "Delete nonexistent bfifo qdisc",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY root handle 1: bfifo",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "972b",
+ "name": "Add bfifo qdisc on egress with invalid format for handles",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle 123^ bfifo limit 100b",
+ "expExitCode": "255",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 123 root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "4d39",
+ "name": "Delete bfifo qdisc twice",
+ "category": [
+ "qdisc",
+ "fifo"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: bfifo",
+ "$TC qdisc del dev $DUMMY root handle 1: bfifo"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 1: root bfifo",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc bfifo 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
new file mode 100644
index 000000000..773c50275
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/fq_pie.json
@@ -0,0 +1,21 @@
+[
+ {
+ "id": "83be",
+ "name": "Create FQ-PIE with 65536 flows",
+ "category": [
+ "qdisc",
+ "fq_pie"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root fq_pie flows 65536",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc fq_pie 1: root refcnt 2 limit 10240p flows 65536",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json
new file mode 100644
index 000000000..d99dba6e2
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/ingress.json
@@ -0,0 +1,102 @@
+[
+ {
+ "id": "9872",
+ "name": "Add ingress qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY ingress",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "5c5e",
+ "name": "Add ingress qdisc with unsupported argument",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY ingress foorbar",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "74f6",
+ "name": "Add duplicate ingress qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY ingress",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY ingress",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "f769",
+ "name": "Delete nonexistent ingress qdisc",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY ingress",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "3b88",
+ "name": "Delete ingress qdisc twice",
+ "category": [
+ "qdisc",
+ "ingress"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY ingress",
+ "$TC qdisc del dev $DUMMY ingress"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY ingress",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc ingress ffff:",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/prio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/prio.json
new file mode 100644
index 000000000..3076c02d0
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/prio.json
@@ -0,0 +1,276 @@
+[
+ {
+ "id": "ddd9",
+ "name": "Add prio qdisc on egress",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "aa71",
+ "name": "Add prio qdisc on egress with handle of maximum value",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle ffff: prio",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio ffff: root",
+ "matchCount": "1",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "db37",
+ "name": "Add prio qdisc on egress with invalid handle exceeding maximum value",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle 10000: prio",
+ "expExitCode": "255",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 10000: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "39d8",
+ "name": "Add prio qdisc on egress with unsupported argument",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio foorbar",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "5769",
+ "name": "Add prio qdisc on egress with 4 bands and new priomap",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio bands 4 priomap 1 1 2 2 3 3 0 0 1 2 3 0 0 0 0 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 4 priomap.*1 1 2 2 3 3 0 0 1 2 3 0 0 0 0 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "fe0f",
+ "name": "Add prio qdisc on egress with 4 bands and priomap exceeding TC_PRIO_MAX entries",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio bands 4 priomap 1 1 2 2 3 3 0 0 1 2 3 0 0 0 0 0 1 1",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 4 priomap.*1 1 2 2 3 3 0 0 1 2 3 0 0 0 0 0 1 1",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1f91",
+ "name": "Add prio qdisc on egress with 4 bands and priomap's values exceeding bands number",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio bands 4 priomap 1 1 2 2 7 5 0 0 1 2 3 0 0 0 0 0",
+ "expExitCode": "1",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 4 priomap.*1 1 2 2 7 5 0 0 1 2 3 0 0 0 0 0",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "d248",
+ "name": "Add prio qdisc on egress with invalid bands value (< 2)",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio bands 1 priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 1 priomap.*0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1d0e",
+ "name": "Add prio qdisc on egress with invalid bands value exceeding TCQ_PRIO_BANDS",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio bands 1024 priomap 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 1024 priomap.*1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "1971",
+ "name": "Replace default prio qdisc on egress with 8 bands and new priomap",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root prio"
+ ],
+ "cmdUnderTest": "$TC qdisc replace dev $DUMMY handle 1: root prio bands 8 priomap 1 1 2 2 3 3 4 4 5 5 6 6 7 7 0 0",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root.*bands 8 priomap.*1 1 2 2 3 3 4 4 5 5 6 6 7 7 0 0",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "d88a",
+ "name": "Add duplicate prio qdisc on egress",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY handle 1: root prio"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root prio",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root prio",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "5948",
+ "name": "Delete nonexistent prio qdisc",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY root handle 1: prio",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "6c0a",
+ "name": "Add prio qdisc on egress with invalid format for handles",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY root handle 123^ prio",
+ "expExitCode": "255",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 123 root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "0175",
+ "name": "Delete prio qdisc twice",
+ "category": [
+ "qdisc",
+ "prio"
+ ],
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true",
+ "$TC qdisc add dev $DUMMY root handle 1: prio",
+ "$TC qdisc del dev $DUMMY root handle 1: prio"
+ ],
+ "cmdUnderTest": "$TC qdisc del dev $DUMMY handle 1: root prio",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc prio 1: root",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json
new file mode 100644
index 000000000..0703a2a25
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/red.json
@@ -0,0 +1,185 @@
+[
+ {
+ "id": "8b6e",
+ "name": "Create RED with no flags",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "342e",
+ "name": "Create RED with adaptive flag",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red adaptive limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb adaptive $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "2d4b",
+ "name": "Create RED with ECN flag",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "650f",
+ "name": "Create RED with flags ECN, adaptive",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn adaptive limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn adaptive $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "5f15",
+ "name": "Create RED with flags ECN, harddrop",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "53e8",
+ "name": "Create RED with flags ECN, nodrop",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn nodrop limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn nodrop $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "d091",
+ "name": "Fail to create RED with only nodrop flag",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red nodrop limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "2",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red",
+ "matchCount": "0",
+ "teardown": [
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ },
+ {
+ "id": "af8e",
+ "name": "Create RED with flags ECN, nodrop, harddrop",
+ "category": [
+ "qdisc",
+ "red"
+ ],
+ "plugins": {
+ "requires": "nsPlugin"
+ },
+ "setup": [
+ "$IP link add dev $DUMMY type dummy || /bin/true"
+ ],
+ "cmdUnderTest": "$TC qdisc add dev $DUMMY handle 1: root red ecn harddrop nodrop limit 1M avpkt 1500 min 100K max 300K",
+ "expExitCode": "0",
+ "verifyCmd": "$TC qdisc show dev $DUMMY",
+ "matchPattern": "qdisc red 1: root .* limit 1Mb min 100Kb max 300Kb ecn harddrop nodrop $",
+ "matchCount": "1",
+ "teardown": [
+ "$TC qdisc del dev $DUMMY handle 1: root",
+ "$IP link del dev $DUMMY type dummy"
+ ]
+ }
+]
diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py
new file mode 100755
index 000000000..a3e43189d
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc.py
@@ -0,0 +1,774 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+
+"""
+tdc.py - Linux tc (Traffic Control) unit test driver
+
+Copyright (C) 2017 Lucas Bates <lucasb@mojatatu.com>
+"""
+
+import re
+import os
+import sys
+import argparse
+import importlib
+import json
+import subprocess
+import time
+import traceback
+from collections import OrderedDict
+from string import Template
+
+from tdc_config import *
+from tdc_helper import *
+
+import TdcPlugin
+from TdcResults import *
+
class PluginDependencyException(Exception):
    """
    Signals that plugins required by the selected test cases are missing.

    missing_pg carries whatever the raiser supplied to identify them.
    """

    def __init__(self, missing_pg):
        # Keep the payload on the instance so handlers can report it.
        self.missing_pg = missing_pg
+
class PluginMgrTestFail(Exception):
    """
    Signals that a stage of a test case failed and the run must stop.

    stage   -- name of the stage that failed
    output  -- output gathered so far for the test (may be None)
    message -- human-readable description of the failure
    """

    def __init__(self, stage, output, message):
        self.stage, self.output, self.message = stage, output, message
+
class PluginMgr:
    """
    Loads tdc plugins and fans each test-lifecycle hook out to them.

    Every module loaded as a plugin must provide a SubPlugin class; one
    instance per module is kept in plugin_instances, in load order.
    pre_* hooks are invoked in load order, post_* hooks in reverse order.
    """

    def __init__(self, argparser):
        """
        Import every plugin module found under the plugin directory.

        The directory is taken from the TDC_PLUGIN_DIR environment variable
        and defaults to './plugins'.  Modules are always imported under the
        'plugins.' package name, whichever directory was walked.
        """
        super().__init__()
        self.plugins = {}
        self.plugin_instances = []
        self.failed_plugins = {}
        self.argparser = argparser

        # TODO, put plugins in order
        plugindir = os.getenv('TDC_PLUGIN_DIR', './plugins')
        for dirpath, dirnames, filenames in os.walk(plugindir):
            for fn in filenames:
                # Skip package markers and editor backup/lock files.
                if (fn.endswith('.py') and
                        not fn == '__init__.py' and
                        not fn.startswith('#') and
                        not fn.startswith('.#')):
                    mn = fn[0:-3]
                    foo = importlib.import_module('plugins.' + mn)
                    self.plugins[mn] = foo
                    self.plugin_instances.append(foo.SubPlugin())

    def load_plugin(self, pgdir, pgname):
        """
        Import plugin file pgname (with its '.py' suffix) from package pgdir,
        instantiate it and run its check_args() against the stored args.

        Relies on set_args() having been called beforehand.
        """
        pgname = pgname[0:-3]
        foo = importlib.import_module('{}.{}'.format(pgdir, pgname))
        self.plugins[pgname] = foo
        self.plugin_instances.append(foo.SubPlugin())
        self.plugin_instances[-1].check_args(self.args, None)

    def get_required_plugins(self, testlist):
        '''
        Get all required plugins from the list of test cases and return
        all unique items.
        '''
        reqs = []
        for t in testlist:
            try:
                if 'requires' in t['plugins']:
                    # 'requires' may be a single name or a list of names.
                    if isinstance(t['plugins']['requires'], list):
                        reqs.extend(t['plugins']['requires'])
                    else:
                        reqs.append(t['plugins']['requires'])
            except KeyError:
                # Test case has no 'plugins' section.
                continue
        reqs = get_unique_item(reqs)
        return reqs

    def load_required_plugins(self, reqs, parser, args, remaining):
        '''
        Get all required plugins from the list of test cases and load any plugin
        that is not already enabled.

        Plugins are searched for in 'plugin-lib' and then 'plugin-lib-custom'.
        Raises PluginDependencyException listing the file names that could
        not be found.  Returns args re-parsed with the new plugins' options.
        '''
        pgd = ['plugin-lib', 'plugin-lib-custom']
        pnf = []

        for r in reqs:
            if r not in self.plugins:
                fname = '{}.py'.format(r)
                source_path = []
                for d in pgd:
                    pgpath = '{}/{}'.format(d, fname)
                    if os.path.isfile(pgpath):
                        source_path.append(pgpath)
                if len(source_path) == 0:
                    print('ERROR: unable to find required plugin {}'.format(r))
                    pnf.append(fname)
                    continue
                elif len(source_path) > 1:
                    # Name the plugin in the warning instead of printing
                    # a bare '{}' placeholder.
                    print('WARNING: multiple copies of plugin {} found, using version found'.format(r))
                    print('at {}'.format(source_path[0]))
                pgdir = source_path[0]
                pgdir = pgdir.split('/')[0]
                self.load_plugin(pgdir, fname)
        if len(pnf) > 0:
            raise PluginDependencyException(pnf)

        parser = self.call_add_args(parser)
        (args, remaining) = parser.parse_known_args(args=remaining, namespace=args)
        return args

    def call_pre_suite(self, testcount, testidlist):
        """Invoke pre_suite(testcount, testidlist) on every plugin."""
        for pgn_inst in self.plugin_instances:
            pgn_inst.pre_suite(testcount, testidlist)

    def call_post_suite(self, index):
        """Invoke post_suite(index) on every plugin, last loaded first."""
        for pgn_inst in reversed(self.plugin_instances):
            pgn_inst.post_suite(index)

    def call_pre_case(self, caseinfo, *, test_skip=False):
        """
        Invoke pre_case(caseinfo, test_skip) on every plugin.

        If a plugin raises, the failing plugin and test id are printed and
        the original exception is re-raised unchanged.
        """
        for pgn_inst in self.plugin_instances:
            try:
                pgn_inst.pre_case(caseinfo, test_skip)
            except Exception as ee:
                print('exception {} in call to pre_case for {} plugin'.
                      format(ee, pgn_inst.__class__))
                # No test ordinal is available in this scope; reporting it
                # here would raise NameError and hide the plugin's error.
                print('testid is {}'.format(caseinfo['id']))
                raise

    def call_post_case(self):
        """Invoke post_case() on every plugin, last loaded first."""
        for pgn_inst in reversed(self.plugin_instances):
            pgn_inst.post_case()

    def call_pre_execute(self):
        """Invoke pre_execute() on every plugin."""
        for pgn_inst in self.plugin_instances:
            pgn_inst.pre_execute()

    def call_post_execute(self):
        """Invoke post_execute() on every plugin, last loaded first."""
        for pgn_inst in reversed(self.plugin_instances):
            pgn_inst.post_execute()

    def call_add_args(self, parser):
        """Let each plugin extend the argument parser; return the result."""
        for pgn_inst in self.plugin_instances:
            parser = pgn_inst.add_args(parser)
        return parser

    def call_check_args(self, args, remaining):
        """Invoke check_args(args, remaining) on every plugin."""
        for pgn_inst in self.plugin_instances:
            pgn_inst.check_args(args, remaining)

    def call_adjust_command(self, stage, command):
        """Pass command through each plugin's adjust_command() in turn."""
        for pgn_inst in self.plugin_instances:
            command = pgn_inst.adjust_command(stage, command)
        return command

    def set_args(self, args):
        """Store the parsed arguments for later use by load_plugin()."""
        self.args = args

    @staticmethod
    def _make_argparser(args):
        """
        Build and return a fresh tdc argument parser.

        A static method has no instance to store the parser on, so it is
        handed back to the caller.  The args parameter is unused and kept
        only for signature compatibility.
        """
        return argparse.ArgumentParser(
            description='Linux TC unit tests')
+
def replace_keywords(cmd):
    """
    Expand the $VARIABLE placeholders of a command line using NAMES.

    Placeholders without an entry in NAMES are left in place untouched.
    """
    return Template(cmd).safe_substitute(NAMES)
+
+
def exec_cmd(args, pm, stage, command):
    """
    Perform any required modifications on an executable command, then run
    it in a subprocess and return the results.

    Returns (None, None) for a blank command.  Otherwise returns
    (proc, output), where output is the decoded stderr when the command
    failed and wrote to stderr, and the decoded stdout in every other case.
    If the command exceeds NAMES['TIMEOUT'], output is a "timed out"
    message and proc.returncode is forced to 255.
    """
    if len(command.strip()) == 0:
        return None, None
    if '$' in command:
        command = replace_keywords(command)

    command = pm.call_adjust_command(stage, command)
    if args.verbose > 0:
        print('command "{}"'.format(command))
    proc = subprocess.Popen(command,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            env=ENVIR)

    try:
        (rawout, serr) = proc.communicate(timeout=NAMES['TIMEOUT'])
        if proc.returncode != 0 and len(serr) > 0:
            foutput = serr.decode("utf-8", errors="ignore")
        else:
            foutput = rawout.decode("utf-8", errors="ignore")
    except subprocess.TimeoutExpired:
        foutput = "Command \"{}\" timed out\n".format(command)
        # communicate() does not stop the child on timeout; kill and reap
        # it so it neither keeps running nor lingers as a zombie.
        # wait() is used instead of a second communicate() so that a
        # surviving grandchild holding the pipes cannot block us.
        proc.kill()
        proc.wait()
        # Callers compare against 255 for a timed-out command.
        proc.returncode = 255

    proc.stdout.close()
    proc.stderr.close()
    return proc, foutput
+
+
def prepare_env(args, pm, stage, prefix, cmdlist, output = None):
    """
    Run the setup or teardown commands of a test case, in order.

    Each entry of cmdlist is either a command string, which must exit
    with 0, or a list whose first item is the command and whose remaining
    items are the exit codes to accept.  Empty commands are skipped.
    The first command that ends with an unaccepted exit code is reported
    and PluginMgrTestFail(stage, output, ...) is raised.
    """
    if args.verbose > 0:
        print('{}'.format(prefix))

    for entry in cmdlist:
        if isinstance(entry, list):
            allowed = entry[1:]
            cmd = entry[0]
        else:
            allowed = [0]
            cmd = entry

        if not cmd:
            continue

        (proc, foutput) = exec_cmd(args, pm, stage, cmd)

        # Nothing was run, or the command ended with an accepted code.
        if not proc or proc.returncode in allowed:
            continue

        print('', file=sys.stderr)
        print("{} *** Could not execute: \"{}\"".format(prefix, cmd),
              file=sys.stderr)
        print("\n{} *** Error message: \"{}\"".format(prefix, foutput),
              file=sys.stderr)
        print("returncode {}; expected {}".format(proc.returncode,
                                                  allowed))
        print("\n{} *** Aborting test run.".format(prefix), file=sys.stderr)
        print("\n\n{} *** stdout ***".format(proc.stdout), file=sys.stderr)
        print("\n\n{} *** stderr ***".format(proc.stderr), file=sys.stderr)
        raise PluginMgrTestFail(
            stage, output,
            '"{}" did not complete successfully'.format(prefix))
+
def run_one_test(pm, args, index, tidx):
    """
    Run one test case through its setup, execute, verify and teardown
    stages and return a TestResult describing the outcome.

    pm    -- plugin manager whose hooks are called around each stage
    args  -- parsed command-line arguments (verbose is read here)
    index -- ordinal of the test; accepted for the caller's benefit and
             not used to compute the result
    tidx  -- the test case dictionary

    A test whose 'skip' field is 'yes' is reported as skipped without
    running any command.  PluginMgrTestFail from a failing setup or
    teardown command propagates to the caller.
    """
    res = TestResult(tidx['id'], tidx['name'])
    if args.verbose > 0:
        print("\t====================\n=====> ", end="")
        print("Test " + tidx["id"] + ": " + tidx["name"])

    if 'skip' in tidx:
        if tidx['skip'] == 'yes':
            res.set_result(ResultState.skip)
            res.set_errormsg('Test case designated as skipped.')
            pm.call_pre_case(tidx, test_skip=True)
            pm.call_post_execute()
            return res

    # populate NAMES with TESTID for this test
    NAMES['TESTID'] = tidx['id']

    pm.call_pre_case(tidx)
    prepare_env(args, pm, 'setup', "-----> prepare stage", tidx["setup"])

    if (args.verbose > 0):
        print('-----> execute stage')
    pm.call_pre_execute()
    (p, procout) = exec_cmd(args, pm, 'execute', tidx["cmdUnderTest"])
    if p:
        exit_code = p.returncode
    else:
        # exec_cmd ran nothing (blank command under test).
        exit_code = None

    pm.call_post_execute()

    if (exit_code is None or exit_code != int(tidx["expExitCode"])):
        print("exit: {!r}".format(exit_code))
        print("exit: {}".format(int(tidx["expExitCode"])))
        res.set_result(ResultState.fail)
        res.set_failmsg('Command exited with {}, expected {}\n{}'.format(exit_code, tidx["expExitCode"], procout))
        print(procout)
    else:
        if args.verbose > 0:
            print('-----> verify stage')
        match_pattern = re.compile(
            str(tidx["matchPattern"]), re.DOTALL | re.MULTILINE)
        (p, procout) = exec_cmd(args, pm, 'verify', tidx["verifyCmd"])
        if procout:
            match_index = re.findall(match_pattern, procout)
            if len(match_index) != int(tidx["matchCount"]):
                res.set_result(ResultState.fail)
                res.set_failmsg('Could not match regex pattern. Verify command output:\n{}'.format(procout))
            else:
                res.set_result(ResultState.success)
        elif int(tidx["matchCount"]) != 0:
            # No output at all, yet at least one match was expected.
            res.set_result(ResultState.fail)
            res.set_failmsg('No output generated by verify command.')
        else:
            res.set_result(ResultState.success)

    prepare_env(args, pm, 'teardown', '-----> teardown stage', tidx['teardown'], procout)
    pm.call_post_case()

    # remove TESTID from NAMES
    del(NAMES['TESTID'])
    return res
+
def test_runner(pm, args, filtered_tests):
    """
    Driver function for the unit tests.

    Prints information about the tests being run, executes the setup and
    teardown commands and the command under test itself. Also determines
    success/failure based on the information in the test case and collects
    the results in a TestSuiteReport.

    Returns the TestSuiteReport.  NOTE: if a plugin's pre_suite hook
    raises, a plain error-message string is returned instead, so callers
    must be prepared for both.

    When a setup or teardown command fails, the run stops and every
    remaining test is recorded as skipped.
    """
    testlist = filtered_tests
    tcount = len(testlist)
    index = 1
    badtest = None
    stage = None
    emergency_exit = False
    emergency_exit_message = ''

    tsr = TestSuiteReport()

    try:
        pm.call_pre_suite(tcount, [tidx['id'] for tidx in testlist])
    except Exception as ee:
        ex_type, ex, ex_tb = sys.exc_info()
        print('Exception {} {} (caught in pre_suite).'.
              format(ex_type, ex))
        traceback.print_tb(ex_tb)
        emergency_exit_message = 'EMERGENCY EXIT, call_pre_suite failed with exception {} {}\n'.format(ex_type, ex)
        emergency_exit = True
        stage = 'pre-SUITE'

    if emergency_exit:
        pm.call_post_suite(index)
        return emergency_exit_message
    if args.verbose > 1:
        print('give test rig 2 seconds to stabilize')
    time.sleep(2)
    for tidx in testlist:
        if "flower" in tidx["category"] and args.device is None:
            errmsg = "Tests using the DEV2 variable must define the name of a "
            errmsg += "physical NIC with the -d option when running tdc.\n"
            errmsg += "Test has been skipped."
            if args.verbose > 1:
                print(errmsg)
            res = TestResult(tidx['id'], tidx['name'])
            res.set_result(ResultState.skip)
            res.set_errormsg(errmsg)
            tsr.add_resultdata(res)
            # This test is accounted for; keep index in step with the
            # position in testlist, otherwise the fill-in loop below
            # would record trailing tests a second time.
            index += 1
            continue
        try:
            badtest = tidx  # in case it goes bad
            res = run_one_test(pm, args, index, tidx)
            tsr.add_resultdata(res)
        except PluginMgrTestFail as pmtf:
            ex_type, ex, ex_tb = sys.exc_info()
            stage = pmtf.stage
            message = pmtf.message
            res = TestResult(tidx['id'], tidx['name'])
            res.set_result(ResultState.skip)
            res.set_errormsg(pmtf.message)
            res.set_failmsg(pmtf.output)
            tsr.add_resultdata(res)
            index += 1
            print(message)
            print('Exception {} {} (caught in test_runner, running test {} {} {} stage {})'.
                  format(ex_type, ex, index, tidx['id'], tidx['name'], stage))
            print('---------------')
            print('traceback')
            traceback.print_tb(ex_tb)
            print('---------------')
            if stage == 'teardown':
                print('accumulated output for this test:')
                if pmtf.output:
                    print(pmtf.output)
            print('---------------')
            break
        index += 1

    # if we failed in setup or teardown,
    # fill in the remaining tests with ok-skipped
    count = index

    if tcount + 1 != count:
        for tidx in testlist[count - 1:]:
            res = TestResult(tidx['id'], tidx['name'])
            res.set_result(ResultState.skip)
            msg = 'skipped - previous {} failed {} {}'.format(stage,
                index, badtest.get('id', '--Unknown--'))
            res.set_errormsg(msg)
            tsr.add_resultdata(res)
            count += 1

    if args.pause:
        print('Want to pause\nPress enter to continue ...')
        # input() takes a prompt string, not a stream; it already reads
        # from standard input.
        if input():
            print('got something on stdin')

    pm.call_post_suite(index)

    return tsr
+
def has_blank_ids(idlist):
    """
    Report whether at least one entry of idlist is empty (falsy).

    Returns False for an empty list.
    """
    return any(not entry for entry in idlist)
+
+
def load_from_file(filename):
    """
    Read the test cases stored in a JSON file.

    Returns them as a list of ordered dictionaries.  A file that is not
    valid JSON is reported and yields an empty list.  When any test case
    in the file has a blank ID, every test case from that file is tagged
    with a 'filename' key naming the file it came from.
    """
    try:
        with open(filename) as test_data:
            cases = json.load(test_data, object_pairs_hook=OrderedDict)
    except json.JSONDecodeError as jde:
        print('IGNORING test case file {}\n\tBECAUSE: {}'.format(filename, jde))
        return list()

    ids = [case["id"] for case in cases]
    if not all(ids):
        for case in cases:
            case['filename'] = filename
    return cases
+
+
def args_parse():
    """
    Build the bare argument parser for tdc; options are added elsewhere.
    """
    return argparse.ArgumentParser(description='Linux TC unit tests')
+
+
def set_args(parser):
    """
    Register tdc's own command-line options on parser and return it.

    Options that choose which test cases to use go into the 'selection'
    group, options that choose what to do with them into the 'action'
    group, and the rest onto the parser itself.
    """
    parser.add_argument(
        '--outfile', type=str,
        help=('Path to the file in which results should be saved. '
              'Default target is the current directory.'))
    parser.add_argument(
        '-p', '--path', type=str,
        help='The full path to the tc executable to use')

    selection = parser.add_argument_group(
        'selection',
        ('select which test cases: '
         'files plus directories; filtered by categories plus testids'))
    action = parser.add_argument_group(
        'action', 'select action to perform on selected test cases')

    selection.add_argument(
        '-D', '--directory', nargs='+', metavar='DIR',
        help=('Collect tests from the specified directory(ies) '
              '(default [tc-tests])'))
    selection.add_argument(
        '-f', '--file', nargs='+', metavar='FILE',
        help='Run tests from the specified file(s)')
    # The '+c' default marks "no -c given"; a bare -c yields an empty list.
    selection.add_argument(
        '-c', '--category', nargs='*', metavar='CATG', default=['+c'],
        help=('Run tests only from the specified category/ies, '
              'or if no category/ies is/are specified, list known categories.'))
    selection.add_argument(
        '-e', '--execute', nargs='+', metavar='ID',
        help='Execute the specified test cases with specified IDs')

    action.add_argument(
        '-l', '--list', action='store_true',
        help='List all test cases, or those only within the specified category')
    action.add_argument(
        '-s', '--show', action='store_true', dest='showID',
        help='Display the selected test cases')
    action.add_argument(
        '-i', '--id', action='store_true', dest='gen_id',
        help='Generate ID numbers for new test cases')

    parser.add_argument(
        '-v', '--verbose', action='count', default=0,
        help='Show the commands that are being run')
    parser.add_argument(
        '--format', default='tap', const='tap', nargs='?',
        choices=['none', 'xunit', 'tap'],
        help='Specify the format for test results. (Default: TAP)')
    parser.add_argument(
        '-d', '--device',
        help=('Execute test cases that use a physical device, '
              'where DEVICE is its name. (If not defined, tests '
              'that require a physical device will be skipped)'))
    parser.add_argument(
        '-P', '--pause', action='store_true',
        help='Pause execution just before post-suite stage')
    return parser
+
+
def check_default_settings(args, remaining, pm):
    """
    Apply command-line overrides to NAMES and validate the result.

    --path replaces NAMES['TC'] and --device replaces NAMES['DEV2'].
    NAMES['TIMEOUT'] is set to None when the configuration did not define
    it.  Exits with status 1 if the tc executable does not exist, then
    gives every plugin the chance to check the arguments.
    """
    # Allow for overriding specific settings
    global NAMES

    if args.path is not None:
        NAMES['TC'] = args.path
    if args.device is not None:
        NAMES['DEV2'] = args.device
    NAMES.setdefault('TIMEOUT', None)

    tc_path = NAMES['TC']
    if not os.path.isfile(tc_path):
        print("The specified tc path " + tc_path + " does not exist.")
        exit(1)

    pm.call_check_args(args, remaining)
+
+
def get_id_list(alltests):
    """
    Collect the "id" field of every test case, in order.
    """
    ids = []
    for case in alltests:
        ids.append(case["id"])
    return ids
+
+
def check_case_id(alltests):
    """
    Find test case IDs that are used more than once.

    Returns every occurrence of each repeated ID, in test order, so an ID
    used twice appears twice in the result.
    """
    ids = [case["id"] for case in alltests]
    repeated = []
    for tid in ids:
        if ids.count(tid) > 1:
            repeated.append(tid)
    return repeated
+
+
def does_id_exist(alltests, newid):
    """
    Tell whether newid is already the ID of one of the test cases.
    """
    for known in [case["id"] for case in alltests]:
        if newid == known:
            return True
    return False
+
+
def generate_case_ids(alltests):
    """
    If a test case has a blank ID field, generate a random hex ID for it
    and then write the test cases back to disk.

    New IDs are four lowercase hex digits and never collide with an ID
    already present in alltests.  Only test cases carrying a 'filename'
    key are written back; each such file is rewritten once with all of
    its test cases, and the 'filename' key is removed before writing.

    Returns alltests (updated in place) so callers can keep using the
    value they assign from this call.
    """
    import random

    known_ids = [c["id"] for c in alltests]
    for c in alltests:
        if (c["id"] == ""):
            while True:
                newid = str('{:04x}'.format(random.randrange(16**4)))
                if newid not in known_ids:
                    break
            c['id'] = newid
            known_ids.append(newid)

    # Each source file once, in first-seen order.
    ufilename = []
    for c in alltests:
        if 'filename' in c and c['filename'] not in ufilename:
            ufilename.append(c['filename'])

    for f in ufilename:
        testlist = []
        for t in alltests:
            if 'filename' in t and t['filename'] == f:
                del t['filename']
                testlist.append(t)
        # The context manager closes the file even if json.dump() raises.
        with open(f, "w") as outfile:
            json.dump(testlist, outfile, indent=4)
            outfile.write("\n")

    return alltests
+
def filter_tests_by_id(args, testlist):
    '''
    Keep only the tests whose id was named with --execute.

    The result is empty when no ids were named, when testlist is empty,
    or when args.execute is not a list.  Test order is preserved.
    '''
    if not testlist or not args.execute:
        return list()

    wanted = args.execute
    if not isinstance(wanted, list) or len(wanted) == 0:
        return list()

    return [case for case in testlist if case['id'] in wanted]
+
def filter_tests_by_category(args, testlist):
    '''
    Remove tests from testlist that are not in a named category.

    Categories are considered in the order they were given, each one only
    once, so the order of the returned tests is the same on every run.
    The '+c' placeholder category is ignored.  A test matching several
    categories is returned once, under the first category that selects it.
    '''
    answer = list()
    if args.category and testlist:
        test_ids = list()
        # dict.fromkeys() de-duplicates like set() but keeps the
        # command-line order, which set() iteration does not.
        for catg in dict.fromkeys(args.category):
            if catg == '+c':
                continue
            print('considering category {}'.format(catg))
            for tc in testlist:
                if catg in tc['category'] and tc['id'] not in test_ids:
                    answer.append(tc)
                    test_ids.append(tc['id'])

    return answer
+
+
def get_test_cases(args):
    """
    If a test case file is specified, retrieve tests from that file.
    Otherwise, glob for all json files in subdirectories and load from
    each one.
    Also, if requested, filter by category, and add tests matching
    certain ids.

    Returns a 4-tuple: the categories found, the ids found, the tests
    grouped by category, and the list of tests selected for this run.
    The first three describe everything that was loaded, before filtering.
    """
    import fnmatch

    flist = []
    testdirs = ['tc-tests']

    if args.file:
        # at least one file was specified - remove the default directory
        testdirs = []

        for ff in args.file:
            if not os.path.isfile(ff):
                print("IGNORING file " + ff + "\n\tBECAUSE does not exist.")
            else:
                flist.append(os.path.abspath(ff))

    if args.directory:
        testdirs = args.directory

    for testdir in testdirs:
        for root, dirnames, filenames in os.walk(testdir):
            for filename in fnmatch.filter(filenames, '*.json'):
                candidate = os.path.abspath(os.path.join(root, filename))
                # Skip files already collected (for example one named
                # with -f that also lives under a -D directory), so the
                # same test cases are not loaded twice.
                if candidate not in flist:
                    flist.append(candidate)

    alltestcases = list()
    for casefile in flist:
        alltestcases = alltestcases + (load_from_file(casefile))

    allcatlist = get_test_categories(alltestcases)
    allidlist = get_id_list(alltestcases)

    testcases_by_cats = get_categorized_testlist(alltestcases, allcatlist)
    idtestcases = filter_tests_by_id(args, alltestcases)
    cattestcases = filter_tests_by_category(args, alltestcases)

    cat_ids = [x['id'] for x in cattestcases]
    if args.execute:
        if args.category:
            # Category matches first, then explicitly named ids that the
            # category filter did not already pick up.
            alltestcases = cattestcases + [x for x in idtestcases if x['id'] not in cat_ids]
        else:
            alltestcases = idtestcases
    else:
        if cat_ids:
            alltestcases = cattestcases
        else:
            # just accept the existing value of alltestcases,
            # which has been filtered by file/directory
            pass

    return allcatlist, allidlist, testcases_by_cats, alltestcases
+
+
def set_operation_mode(pm, parser, args, remaining):
    """
    Load the test case data and process remaining arguments to determine
    what the script should do for this run, and call the appropriate
    function.

    The ID-generation, show, category-listing and list modes each exit
    the process when done.  Otherwise the selected tests are run and the
    results are printed and saved, unless --format none was given.
    Exits with status 1 on duplicate test IDs or when the test run could
    not start.
    """
    ucat, idlist, testcases, alltests = get_test_cases(args)

    if args.gen_id:
        if (has_blank_ids(idlist)):
            # Called for its effect on alltests and on the files on disk;
            # the process exits right after, so the result is not needed.
            generate_case_ids(alltests)
        else:
            print("No empty ID fields found in test files.")
        exit(0)

    duplicate_ids = check_case_id(alltests)
    if (len(duplicate_ids) > 0):
        print("The following test case IDs are not unique:")
        print(str(set(duplicate_ids)))
        print("Please correct them before continuing.")
        exit(1)

    if args.showID:
        for atest in alltests:
            print_test_case(atest)
        exit(0)

    # An empty list means -c was given without any category name.
    if isinstance(args.category, list) and (len(args.category) == 0):
        print("Available categories:")
        print_sll(ucat)
        exit(0)

    if args.list:
        list_test_cases(alltests)
        exit(0)

    if len(alltests):
        req_plugins = pm.get_required_plugins(alltests)
        try:
            args = pm.load_required_plugins(req_plugins, parser, args, remaining)
        except PluginDependencyException as pde:
            print('The following plugins were not found:')
            print('{}'.format(pde.missing_pg))
        catresults = test_runner(pm, args, alltests)
        if isinstance(catresults, str):
            # test_runner returns its emergency message as a plain string
            # when the pre-suite stage fails; there is no report to format.
            print(catresults)
            exit(1)
        if args.format == 'none':
            print('Test results output suppression requested\n')
        else:
            print('\nAll test results: \n')
            if args.format == 'xunit':
                suffix = 'xml'
                res = catresults.format_xunit()
            elif args.format == 'tap':
                suffix = 'tap'
                res = catresults.format_tap()
            print(res)
            print('\n\n')
            if not args.outfile:
                fname = 'test-results.{}'.format(suffix)
            else:
                fname = args.outfile
            with open(fname, 'w') as fh:
                fh.write(res)
            # When run through sudo, hand the results file back to the
            # invoking user.
            if os.getenv('SUDO_UID') is not None:
                os.chown(fname, uid=int(os.getenv('SUDO_UID')),
                         gid=int(os.getenv('SUDO_GID')))
    else:
        print('No tests found\n')
+
def main():
    """
    Program entry point.

    Builds the argument parser (tdc's own options plus those of every
    plugin), parses the command line, applies the settings and hands
    over to set_operation_mode().  Always ends by exiting with status 0
    unless an earlier step exited first.
    """
    parser = set_args(args_parse())
    pm = PluginMgr(parser)
    parser = pm.call_add_args(parser)

    args, remaining = parser.parse_known_args()
    # Make the configuration names reachable through args as well.
    args.NAMES = NAMES
    pm.set_args(args)

    check_default_settings(args, remaining, pm)
    if args.verbose > 2:
        print('args is {}'.format(args))

    set_operation_mode(pm, parser, args, remaining)

    exit(0)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tools/testing/selftests/tc-testing/tdc.sh b/tools/testing/selftests/tc-testing/tdc.sh
new file mode 100755
index 000000000..7fe38c76d
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run the tdc "actions" and "qdisc" test categories.
+#
+# Both categories are always run.  The script exits non-zero if either run
+# failed, so a failure in the first run is not hidden by the exit status of
+# the second one.
+
+ret=0
+./tdc.py -c actions --nobuildebpf || ret=$?
+./tdc.py -c qdisc || ret=$?
+exit "$ret"
diff --git a/tools/testing/selftests/tc-testing/tdc_batch.py b/tools/testing/selftests/tc-testing/tdc_batch.py
new file mode 100755
index 000000000..35d5d9493
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_batch.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+
+"""
+tdc_batch.py - a script to generate TC batch file
+
+Copyright (C) 2017 Chris Mi <chrism@mellanox.com>
+"""
+
+import argparse
+
+# Command-line interface.  'device' and 'file' are positional; everything
+# else is optional.
+parser = argparse.ArgumentParser(description='TC batch file generator')
+parser.add_argument("device", help="device name")
+parser.add_argument("file", help="batch file name")
+parser.add_argument("-n", "--number", type=int,
+                    help="how many lines in batch file")
+parser.add_argument(
+    "-a",
+    "--handle_start",
+    type=int,
+    default=1,
+    help="start handle range from (default: 1)")
+parser.add_argument("-o", "--skip_sw",
+                    help="skip_sw (offload), by default skip_hw",
+                    action="store_true")
+parser.add_argument("-s", "--share_action",
+                    help="all filters share the same action",
+                    action="store_true")
+parser.add_argument("-p", "--prio",
+                    help="all filters have different prio",
+                    action="store_true")
+# The two adjacent string literals below are concatenated by Python, so the
+# first one must end with a space to keep the help text readable.
+parser.add_argument(
+    "-e",
+    "--operation",
+    choices=['add', 'del', 'replace'],
+    default='add',
+    help="operation to perform on filters "
+         "(default: add filter)")
+parser.add_argument(
+    "-m",
+    "--mac_prefix",
+    type=int,
+    default=0,
+    choices=range(0, 256),
+    help="third byte of source MAC address of flower filter "
+         "(default: 0)")
+args = parser.parse_args()
+
+device = args.device
+# Output batch file; it is closed by the generator loop once the requested
+# number of lines has been written.
+file = open(args.file, 'w')
+
+# Number of lines to generate; defaults to 1 when -n is absent (or 0).
+number = 1
+if args.number:
+    number = args.number
+
+handle_start = args.handle_start
+
+skip = "skip_hw"
+if args.skip_sw:
+    skip = "skip_sw"
+
+# With -s every filter line carries "index 1" after its action.
+share_action = ""
+if args.share_action:
+    share_action = "index 1"
+
+# With -p no explicit prio is written into the filter line and the number
+# of generated lines is capped at 0x4000.
+prio = "prio 1"
+if args.prio:
+    prio = ""
+    if number > 0x4000:
+        number = 0x4000
+
+mac_prefix = args.mac_prefix
+
+def format_add_filter(device, prio, handle, skip, src_mac, dst_mac,
+                      share_action):
+    """Return the batch line that adds one flower filter with a drop action."""
+    template = ("filter add dev {} {} protocol ip ingress handle {} "
+                " flower {} src_mac {} dst_mac {} action drop {}")
+    return template.format(device, prio, handle, skip, src_mac, dst_mac,
+                           share_action)
+
+
+def format_rep_filter(device, prio, handle, skip, src_mac, dst_mac,
+                      share_action):
+    """Return the batch line that replaces one flower filter (drop action)."""
+    template = ("filter replace dev {} {} protocol ip ingress handle {} "
+                " flower {} src_mac {} dst_mac {} action drop {}")
+    return template.format(device, prio, handle, skip, src_mac, dst_mac,
+                           share_action)
+
+
+def format_del_filter(device, prio, handle, skip, src_mac, dst_mac,
+                      share_action):
+    """
+    Return the batch line that deletes one flower filter.
+
+    Takes the same arguments as the add/replace formatters so the three are
+    interchangeable; only device, prio and handle appear in the output.
+    """
+    template = ("filter del dev {} {} protocol ip ingress handle {} "
+                "flower")
+    return template.format(device, prio, handle)
+
+
+# Select the line formatter for the requested operation; anything other
+# than "del" or "replace" uses the "add" formatter.
+if args.operation == "replace":
+    formatter = format_rep_filter
+elif args.operation == "del":
+    formatter = format_del_filter
+else:
+    formatter = format_add_filter
+
+# Enumerate the low three MAC bytes, writing one filter line per
+# combination, and stop as soon as 'number' lines have been written.
+index = 0
+for hi in range(0x100):
+    for mid in range(0x100):
+        for lo in range(0x100):
+            mac = "{:02x}:{:02x}:{:02x}".format(hi, mid, lo)
+            src_mac = "e4:11:{:02x}:{}".format(mac_prefix, mac)
+            dst_mac = "e4:12:00:" + mac
+            line = formatter(device, prio, handle_start + index, skip,
+                             src_mac, dst_mac, share_action)
+            file.write(line + "\n")
+            index += 1
+            if index < number:
+                continue
+            file.close()
+            exit(0)
diff --git a/tools/testing/selftests/tc-testing/tdc_config.py b/tools/testing/selftests/tc-testing/tdc_config.py
new file mode 100644
index 000000000..cd4a27ee1
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_config.py
@@ -0,0 +1,42 @@
+"""
+# SPDX-License-Identifier: GPL-2.0
+tdc_config.py - tdc user-specified values
+
+Copyright (C) 2017 Lucas Bates <lucasb@mojatatu.com>
+"""
+
+# Values that may be substituted into the commands tdc executes.  Edit the
+# paths and names below to match the local system.
+NAMES = {
+    # Path of the tc binary
+    'TC': '/sbin/tc',
+    # Path of the ip binary
+    'IP': '/sbin/ip',
+    # Names of the veth devices created for the namespace
+    'DEV0': 'v0p0',
+    'DEV1': 'v0p1',
+    'DEV2': '',
+    'DUMMY': 'dummy1',
+    'BATCH_FILE': './batch.txt',
+    'BATCH_DIR': 'tmp',
+    # Seconds to wait before a command is terminated
+    'TIMEOUT': 12,
+    # Namespace to use
+    'NS': 'tcut',
+    # Directory holding the eBPF test programs
+    'EBPFDIR': './'
+}
+
+
+ENVIR = {}
+
+# Local customizations live in tdc_config_local.py; the file is optional.
+try:
+    from tdc_config_local import *
+except ImportError:
+    pass
+
+# Merge EXTRA_NAMES into NAMES when the local file defined it.
+try:
+    NAMES.update(EXTRA_NAMES)
+except NameError:
+    pass
diff --git a/tools/testing/selftests/tc-testing/tdc_config_local_template.py b/tools/testing/selftests/tc-testing/tdc_config_local_template.py
new file mode 100644
index 000000000..d48fc732a
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_config_local_template.py
@@ -0,0 +1,23 @@
+"""
+tdc_config_local.py - tdc plugin-writer-specified values
+
+Copyright (C) 2017 bjb@mojatatu.com
+"""
+
+import os
+
+# Start from a copy of the current process environment.
+ENVIR = os.environ.copy()
+
+ENV_LD_LIBRARY_PATH = os.getenv('LD_LIBRARY_PATH', '')
+ENV_OTHER_LIB = os.getenv('OTHER_LIB', '')
+
+
+# Example: add a value to NAMES without editing tdc_config.py
+EXTRA_NAMES = {
+    'SOME_BIN': os.path.join(os.getenv('OTHER_BIN', ''), 'some_bin'),
+}
+
+
+# Example: add values to ENVIR without editing tdc_config.py
+ENVIR.update({
+    'VALGRIND_LIB': '/usr/lib/valgrind',
+    'VALGRIND_BIN': '/usr/bin/valgrind',
+    'VGDB_BIN': '/usr/bin/vgdb',
+})
diff --git a/tools/testing/selftests/tc-testing/tdc_helper.py b/tools/testing/selftests/tc-testing/tdc_helper.py
new file mode 100644
index 000000000..0440d252c
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_helper.py
@@ -0,0 +1,70 @@
+"""
+# SPDX-License-Identifier: GPL-2.0
+tdc_helper.py - tdc helper functions
+
+Copyright (C) 2017 Lucas Bates <lucasb@mojatatu.com>
+"""
+
+def get_categorized_testlist(alltests, ucat):
+    """
+    Sort the master test list into categories.
+
+    Returns a dict that maps every category in ucat to the list of tests
+    whose 'category' field contains that category.
+    """
+    return {category: [test for test in alltests
+                       if category in test['category']]
+            for category in ucat}
+
+
+def get_unique_item(lst):
+    """
+    Return the unique items of lst as a list.
+
+    Inputs with fewer than two items are returned unchanged (same object);
+    otherwise the result order is whatever set() yields.
+    """
+    if len(lst) <= 1:
+        return lst
+    return list(set(lst))
+
+
+def get_test_categories(alltests):
+    """ Collect every distinct category named by the tests in alltests. """
+    collected = []
+    for test in alltests:
+        collected += get_unique_item(test['category'])
+    return get_unique_item(collected)
+
+def list_test_cases(testlist):
+    """ Print one line per test case: its ID, its categories and its name. """
+    for case in testlist:
+        categories = ', '.join(case['category'])
+        print(case['id'] + ': (' + categories + ") " + case['name'])
+
+
+def list_categories(testlist):
+    """
+    Show all categories that are present in a test case file.
+
+    Each test's 'category' field is a list of category names (see
+    list_test_cases, which joins it), so the names are gathered one by one.
+    Building a set out of the lists themselves raises TypeError because
+    lists are unhashable.  A bare string is accepted as a single category.
+    The names are printed sorted so the output is stable between runs.
+    """
+    categories = set()
+    for test in testlist:
+        names = test['category']
+        if isinstance(names, str):
+            names = [names]
+        categories.update(names)
+    print("Available categories:")
+    print(", ".join(sorted(str(s) for s in categories)))
+    print("")
+
+
+def print_list(cmdlist):
+    """
+    Print each entry of cmdlist on its own line, prefixed with a tab.
+
+    For an entry that is itself a list only its first element is printed.
+    """
+    for entry in cmdlist:
+        shown = entry[0] if type(entry) is list else entry
+        print("\t" + str(shown))
+
+
+def print_sll(items):
+    """ Print the items one per line, each converted with str(). """
+    text = "\n".join(map(str, items))
+    print(text)
+
+
+def print_test_case(tcase):
+    """
+    Pretty-print a single test case.
+
+    A header with the ID and name comes first.  List-valued fields are then
+    printed through print_list(); every other field except 'id' and 'name'
+    is printed as "key: value".
+    """
+    print('\n==============\nTest {}\t{}\n'.format(tcase['id'], tcase['name']))
+    for key, value in tcase.items():
+        if isinstance(value, list):
+            print(key + ":")
+            print_list(value)
+        elif key not in ('id', 'name'):
+            print(key + ": " + str(value))
diff --git a/tools/testing/selftests/tc-testing/tdc_multibatch.py b/tools/testing/selftests/tc-testing/tdc_multibatch.py
new file mode 100755
index 000000000..48e1f17ff
--- /dev/null
+++ b/tools/testing/selftests/tc-testing/tdc_multibatch.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+"""
+tdc_multibatch.py - a thin wrapper over tdc_batch.py to generate multiple batch
+files
+
+Copyright (C) 2019 Vlad Buslov <vladbu@mellanox.com>
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+
+parser = argparse.ArgumentParser(
+    description='TC multiple batch file generator')
+parser.add_argument("device", help="device name")
+parser.add_argument("dir", help="where to put batch files")
+parser.add_argument(
+    "num_filters", type=int, help="how many lines per batch file")
+parser.add_argument("num_files", type=int, help="how many batch files")
+parser.add_argument(
+    "operation",
+    choices=['add', 'del', 'replace'],
+    help="operation to perform on filters")
+parser.add_argument(
+    "-x",
+    "--file_prefix",
+    default="",
+    help="prefix for generated batch file names")
+parser.add_argument(
+    "-d",
+    "--duplicate_handles",
+    action="store_true",
+    help="duplicate filter handle range in all files")
+parser.add_argument(
+    "-a",
+    "--handle_start",
+    type=int,
+    default=1,
+    help="start handle range from (default: 1)")
+parser.add_argument(
+    "-m",
+    "--mac_prefix",
+    type=int,
+    default=0,
+    choices=range(0, 256),
+    help="add this value to third byte of source MAC address of flower filter "
+         "(default: 0)")
+args = parser.parse_args()
+
+device = args.device
+dir = args.dir
+file_prefix = args.file_prefix + args.operation + "_"
+num_filters = args.num_filters
+num_files = args.num_files
+operation = args.operation
+duplicate_handles = args.duplicate_handles
+handle = args.handle_start
+mac_prefix = args.mac_prefix
+
+# File number i is generated with MAC byte (mac_prefix + i).  tdc_batch.py
+# only accepts 0..255 for that byte, so reject combinations that would
+# overflow instead of letting the later invocations fail.
+if num_files > 0 and mac_prefix + num_files - 1 > 255:
+    parser.error("mac_prefix + num_files must not exceed 256 "
+                 "(got {} + {})".format(mac_prefix, num_files))
+
+for i in range(num_files):
+    file = dir + '/' + file_prefix + str(i)
+    # Pass the arguments as a list: no shell is involved, so device and
+    # file names containing spaces or shell metacharacters are safe.
+    cmd = ["./tdc_batch.py",
+           "-n", str(num_filters),
+           "-a", str(handle),
+           "-e", operation,
+           "-m", str(i + mac_prefix),
+           device, file]
+    status = subprocess.call(cmd)
+    if status != 0:
+        sys.exit("tdc_batch.py failed for {} (status {})".format(file, status))
+    # Unless -d was given, each file continues the handle range where the
+    # previous one stopped.
+    if not duplicate_handles:
+        handle += num_filters
diff --git a/tools/testing/selftests/timens/.gitignore b/tools/testing/selftests/timens/.gitignore
new file mode 100644
index 000000000..2e43851b4
--- /dev/null
+++ b/tools/testing/selftests/timens/.gitignore
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+clock_nanosleep
+exec
+gettime_perf
+gettime_perf_cold
+procfs
+timens
+timer
+timerfd
diff --git a/tools/testing/selftests/timens/Makefile b/tools/testing/selftests/timens/Makefile
new file mode 100644
index 000000000..3a5936cc1
--- /dev/null
+++ b/tools/testing/selftests/timens/Makefile
@@ -0,0 +1,7 @@
+TEST_GEN_PROGS := timens timerfd timer clock_nanosleep procfs exec futex
+TEST_GEN_PROGS_EXTENDED := gettime_perf
+
+CFLAGS := -Wall -Werror -pthread
+LDLIBS := -lrt -ldl
+
+include ../lib.mk
diff --git a/tools/testing/selftests/timens/clock_nanosleep.c b/tools/testing/selftests/timens/clock_nanosleep.c
new file mode 100644
index 000000000..72d41b955
--- /dev/null
+++ b/tools/testing/selftests/timens/clock_nanosleep.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <sys/timerfd.h>
+#include <sys/syscall.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+
+#include "log.h"
+#include "timens.h"
+
+/*
+ * Signal handler installed for SIGUSR1 and SIGUSR2 by run_test().
+ * SIGUSR2 terminates the calling thread; any other signal is ignored
+ * here and merely interrupts whatever the thread was doing.
+ */
+void test_sig(int sig)
+{
+	if (sig != SIGUSR2)
+		return;
+
+	pthread_exit(NULL);
+}
+
+/* Arguments handed to the call_nanosleep() thread. */
+struct thread_args {
+	/* Request and remaining-time buffers passed to clock_nanosleep(). */
+	struct timespec *now, *rem;
+	/* Unlocked by the thread once clock_nanosleep() has returned. */
+	pthread_mutex_t *lock;
+	/* Clock to sleep on. */
+	int clockid;
+	/* Non-zero: sleep with TIMER_ABSTIME. */
+	int abs;
+};
+
+/*
+ * Thread body: sleep on the requested clock (absolute or relative, as
+ * selected by args->abs), then release args->lock so the creator can see
+ * that the sleep has ended.
+ */
+void *call_nanosleep(void *_args)
+{
+	struct thread_args *ta = _args;
+	int flags = ta->abs ? TIMER_ABSTIME : 0;
+
+	clock_nanosleep(ta->clockid, flags, ta->now, ta->rem);
+	pthread_mutex_unlock(ta->lock);
+
+	return NULL;
+}
+
+/*
+ * Start a thread that sleeps for one hour on @clockid (relative sleep when
+ * @abs is 0, TIMER_ABSTIME when it is non-zero), interrupt it with SIGUSR1
+ * and check that the sleep really was in progress.
+ *
+ * Returns 0 on success or when the clock is skipped, non-zero on failure.
+ */
+int run_test(int clockid, int abs)
+{
+	/*
+	 * rem is zero-initialised: clock_nanosleep() only fills it in for
+	 * relative sleeps, so it must never be read as indeterminate.
+	 */
+	struct timespec now = {}, rem = {};
+	struct thread_args args = { .now = &now, .rem = &rem, .clockid = clockid};
+	struct timespec start;
+	pthread_mutex_t lock;
+	pthread_t thread;
+	int j, ok, ret;
+
+	signal(SIGUSR1, test_sig);
+	signal(SIGUSR2, test_sig);
+
+	/* The thread unlocks this mutex when clock_nanosleep() returns. */
+	pthread_mutex_init(&lock, NULL);
+	pthread_mutex_lock(&lock);
+
+	if (clock_gettime(clockid, &start) == -1) {
+		if (errno == EINVAL && check_skip(clockid))
+			return 0;
+		return pr_perror("clock_gettime");
+	}
+
+	/* An absolute sleep is expressed relative to the current time. */
+	if (abs) {
+		now.tv_sec = start.tv_sec;
+		now.tv_nsec = start.tv_nsec;
+	}
+
+	now.tv_sec += 3600;
+	args.abs = abs;
+	args.lock = &lock;
+	ret = pthread_create(&thread, NULL, call_nanosleep, &args);
+	if (ret != 0) {
+		pr_err("Unable to create a thread: %s", strerror(ret));
+		return 1;
+	}
+
+	/* Wait when the thread will call clock_nanosleep(). */
+	ok = 0;
+	for (j = 0; j < 8; j++) {
+		/* The maximum timeout is about 5 seconds. */
+		usleep(10000 << j);
+
+		/* Try to interrupt clock_nanosleep(). */
+		pthread_kill(thread, SIGUSR1);
+
+		usleep(10000 << j);
+		/* Check whether clock_nanosleep() has been interrupted or not. */
+		if (pthread_mutex_trylock(&lock) == 0) {
+			ok = 1;
+			break;
+		}
+	}
+	/* The sleep could not be interrupted: make the thread exit. */
+	if (!ok)
+		pthread_kill(thread, SIGUSR2);
+	pthread_join(thread, NULL);
+	pthread_mutex_destroy(&lock);
+
+	if (!ok) {
+		/* A timeout is a failure and must be reported as one. */
+		ksft_test_result_fail("clockid: %d abs:%d timeout\n", clockid, abs);
+		return 1;
+	}
+
+	/*
+	 * The remaining time is only reported for relative sleeps; with
+	 * TIMER_ABSTIME the kernel leaves rem untouched, so there is nothing
+	 * to validate in that case.
+	 */
+	if (!abs && (rem.tv_sec < 3300 || rem.tv_sec > 3900)) {
+		pr_fail("clockid: %d abs: %d remain: %ld\n",
+			clockid, abs, rem.tv_sec);
+		return 1;
+	}
+	ksft_test_result_pass("clockid: %d abs:%d\n", clockid, abs);
+
+	return 0;
+}
+
+/*
+ * Create a time namespace with CLOCK_MONOTONIC and CLOCK_BOOTTIME offsets,
+ * enter it, and run the clock_nanosleep() test for relative and absolute
+ * sleeps on CLOCK_MONOTONIC and CLOCK_BOOTTIME_ALARM.
+ */
+int main(int argc, char *argv[])
+{
+	int ret, nsfd;
+
+	nscheck();
+
+	/* Four run_test() calls below, one reported result each. */
+	ksft_set_plan(4);
+
+	check_supported_timers();
+
+	if (unshare_timens())
+		return 1;
+
+	/* Offsets of 7 and 9 days, written before the namespace is entered. */
+	if (_settime(CLOCK_MONOTONIC, 7 * 24 * 3600))
+		return 1;
+	if (_settime(CLOCK_BOOTTIME, 9 * 24 * 3600))
+		return 1;
+
+	/* Move this process into the namespace created by unshare_timens(). */
+	nsfd = open("/proc/self/ns/time_for_children", O_RDONLY);
+	if (nsfd < 0)
+		return pr_perror("Unable to open timens_for_children");
+
+	if (setns(nsfd, CLONE_NEWTIME))
+		return pr_perror("Unable to set timens");
+
+	/* Second argument: 0 = relative sleep, 1 = TIMER_ABSTIME. */
+	ret = 0;
+	ret |= run_test(CLOCK_MONOTONIC, 0);
+	ret |= run_test(CLOCK_MONOTONIC, 1);
+	ret |= run_test(CLOCK_BOOTTIME_ALARM, 0);
+	ret |= run_test(CLOCK_BOOTTIME_ALARM, 1);
+
+	if (ret)
+		ksft_exit_fail();
+	ksft_exit_pass();
+	return ret;
+}
diff --git a/tools/testing/selftests/timens/config b/tools/testing/selftests/timens/config
new file mode 100644
index 000000000..4480620f6
--- /dev/null
+++ b/tools/testing/selftests/timens/config
@@ -0,0 +1 @@
+CONFIG_TIME_NS=y
diff --git a/tools/testing/selftests/timens/exec.c b/tools/testing/selftests/timens/exec.c
new file mode 100644
index 000000000..e40dc5be2
--- /dev/null
+++ b/tools/testing/selftests/timens/exec.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "log.h"
+#include "timens.h"
+
+#define OFFSET (36000)
+
+/*
+ * Check that a time namespace offset survives execve().
+ *
+ * Without arguments: unshare a time namespace, set a CLOCK_MONOTONIC offset
+ * of OFFSET seconds, fork a child and let it re-execute this binary with the
+ * expected monotonic time as argv[1].
+ *
+ * With an argument (the re-executed child): compare CLOCK_MONOTONIC against
+ * the expected value from argv[1] and exit with the result.
+ */
+int main(int argc, char *argv[])
+{
+	struct timespec now, tst;
+	int status, i;
+	pid_t pid;
+
+	if (argc > 1) {
+		if (sscanf(argv[1], "%ld", &now.tv_sec) != 1)
+			return pr_perror("sscanf");
+
+		/* i == 0 reads the clock through libc, i == 1 via the raw syscall. */
+		for (i = 0; i < 2; i++) {
+			_gettime(CLOCK_MONOTONIC, &tst, i);
+			/* tv_sec is a long: use labs(), abs() would truncate to int. */
+			if (labs(tst.tv_sec - now.tv_sec) > 5)
+				return pr_fail("%ld %ld\n", now.tv_sec, tst.tv_sec);
+		}
+		return 0;
+	}
+
+	nscheck();
+
+	ksft_set_plan(1);
+
+	clock_gettime(CLOCK_MONOTONIC, &now);
+
+	if (unshare_timens())
+		return 1;
+
+	if (_settime(CLOCK_MONOTONIC, OFFSET))
+		return 1;
+
+	/* This process must still see the original, un-offset clock. */
+	for (i = 0; i < 2; i++) {
+		_gettime(CLOCK_MONOTONIC, &tst, i);
+		if (labs(tst.tv_sec - now.tv_sec) > 5)
+			return pr_fail("%ld %ld\n",
+					now.tv_sec, tst.tv_sec);
+	}
+
+	pid = fork();
+	if (pid < 0)
+		return pr_perror("fork");
+
+	if (pid == 0) {
+		char now_str[64];
+		char *cargv[] = {"exec", now_str, NULL};
+		char *cenv[] = {NULL};
+
+		/* Check that a child process is in the new timens. */
+		for (i = 0; i < 2; i++) {
+			_gettime(CLOCK_MONOTONIC, &tst, i);
+			if (labs(tst.tv_sec - now.tv_sec - OFFSET) > 5)
+				return pr_fail("%ld %ld\n",
+						now.tv_sec + OFFSET, tst.tv_sec);
+		}
+
+		/* Check for proper vvar offsets after execve. */
+		snprintf(now_str, sizeof(now_str), "%ld", now.tv_sec + OFFSET);
+		execve("/proc/self/exe", cargv, cenv);
+		return pr_perror("execve");
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return pr_perror("waitpid");
+
+	if (status)
+		ksft_exit_fail();
+
+	ksft_test_result_pass("exec\n");
+	ksft_exit_pass();
+	return 0;
+}
diff --git a/tools/testing/selftests/timens/futex.c b/tools/testing/selftests/timens/futex.c
new file mode 100644
index 000000000..6b2b9264e
--- /dev/null
+++ b/tools/testing/selftests/timens/futex.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <linux/unistd.h>
+#include <linux/futex.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "log.h"
+#include "timens.h"
+
+#define NSEC_PER_SEC 1000000000ULL
+
+/*
+ * Wait on a futex with an absolute timeout 100ms in the future, measured on
+ * @clockid, and check that the wait ends with ETIMEDOUT no earlier than the
+ * requested time.
+ *
+ * Returns 0 on success, 1 on failure (the result is reported via ksft).
+ */
+static int run_test(int clockid)
+{
+	int futex_op = FUTEX_WAIT_BITSET;
+	struct timespec timeout, end;
+	int val = 0;
+
+	if (clockid == CLOCK_REALTIME)
+		futex_op |= FUTEX_CLOCK_REALTIME;
+
+	clock_gettime(clockid, &timeout);
+	timeout.tv_nsec += NSEC_PER_SEC / 10; // 100ms
+	/* tv_nsec must stay below one second, so carry on equality as well. */
+	if (timeout.tv_nsec >= NSEC_PER_SEC) {
+		timeout.tv_sec++;
+		timeout.tv_nsec -= NSEC_PER_SEC;
+	}
+
+	/* val is 0 and nobody wakes the futex, so only a timeout can end this. */
+	if (syscall(__NR_futex, &val, futex_op, 0,
+		    &timeout, 0, FUTEX_BITSET_MATCH_ANY) >= 0) {
+		ksft_test_result_fail("futex didn't return ETIMEDOUT\n");
+		return 1;
+	}
+
+	if (errno != ETIMEDOUT) {
+		ksft_test_result_fail("futex didn't return ETIMEDOUT: %s\n",
+			strerror(errno));
+		return 1;
+	}
+
+	clock_gettime(clockid, &end);
+
+	/* The wait must not have returned before the absolute deadline. */
+	if (end.tv_sec < timeout.tv_sec ||
+	    (end.tv_sec == timeout.tv_sec && end.tv_nsec < timeout.tv_nsec)) {
+		ksft_test_result_fail("futex slept less than 100ms\n");
+		return 1;
+	}
+
+
+	ksft_test_result_pass("futex with the %d clockid\n", clockid);
+
+	return 0;
+}
+
+/*
+ * Create a time namespace with a CLOCK_MONOTONIC offset and run the futex
+ * timeout test in a forked child for CLOCK_REALTIME and CLOCK_MONOTONIC.
+ * The parent returns the child's exit status.
+ */
+int main(int argc, char *argv[])
+{
+	int status, len, fd;
+	char buf[4096];
+	pid_t pid;
+	struct timespec mtime_now;
+
+	nscheck();
+
+	/* Two run_test() calls in the child, one reported result each. */
+	ksft_set_plan(2);
+
+	/* NOTE(review): mtime_now is filled in here but not read afterwards. */
+	clock_gettime(CLOCK_MONOTONIC, &mtime_now);
+
+	if (unshare_timens())
+		return 1;
+
+	/* Offset line "<clockid> <seconds> <nanoseconds>": 70 days for CLOCK_MONOTONIC. */
+	len = snprintf(buf, sizeof(buf), "%d %d 0",
+			CLOCK_MONOTONIC, 70 * 24 * 3600);
+	fd = open("/proc/self/timens_offsets", O_WRONLY);
+	if (fd < 0)
+		return pr_perror("/proc/self/timens_offsets");
+
+	if (write(fd, buf, len) != len)
+		return pr_perror("/proc/self/timens_offsets");
+
+	close(fd);
+
+	/* The tests run in a forked child; the parent only collects its status. */
+	pid = fork();
+	if (pid < 0)
+		return pr_perror("Unable to fork");
+	if (pid == 0) {
+		int ret = 0;
+
+		ret |= run_test(CLOCK_REALTIME);
+		ret |= run_test(CLOCK_MONOTONIC);
+		if (ret)
+			ksft_exit_fail();
+		ksft_exit_pass();
+		return 0;
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return pr_perror("Unable to wait the child process");
+
+	if (WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	/* The child did not exit normally. */
+	return 1;
+}
diff --git a/tools/testing/selftests/timens/gettime_perf.c b/tools/testing/selftests/timens/gettime_perf.c
new file mode 100644
index 000000000..7bf841a39
--- /dev/null
+++ b/tools/testing/selftests/timens/gettime_perf.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <time.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <dlfcn.h>
+
+#include "log.h"
+#include "timens.h"
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+
+vgettime_t vdso_clock_gettime;
+
+/*
+ * Look up __vdso_clock_gettime in the already-loaded vDSO and store it in
+ * vdso_clock_gettime.  The pointer is left untouched when the vDSO or the
+ * symbol cannot be found; a message is printed in either case.
+ */
+static void fill_function_pointers(void)
+{
+	void *handle;
+
+	handle = dlopen("linux-vdso.so.1",
+			RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (handle == NULL) {
+		/* Fall back to the alternative vDSO name. */
+		handle = dlopen("linux-gate.so.1",
+				RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	}
+	if (handle == NULL) {
+		pr_err("[WARN]\tfailed to find vDSO\n");
+		return;
+	}
+
+	vdso_clock_gettime = (vgettime_t)dlsym(handle, "__vdso_clock_gettime");
+	if (vdso_clock_gettime)
+		return;
+
+	pr_err("Warning: failed to find clock_gettime in vDSO\n");
+}
+
+/*
+ * Count how many vDSO clock_gettime() calls for @clockid complete within
+ * about three seconds and report the count.
+ *
+ * @clockid:  clock to read
+ * @clockstr: human-readable clock name used in the report
+ * @in_ns:    true when running inside the time namespace ("ns" vs "host")
+ *
+ * The test is reported as skipped when the vDSO entry point was not found,
+ * instead of calling through a NULL function pointer.
+ */
+static void test(clockid_t clockid, char *clockstr, bool in_ns)
+{
+	struct timespec tp, start;
+	long i = 0;
+	const int timeout = 3;
+
+	if (!vdso_clock_gettime) {
+		ksft_test_result_skip("%s:\tclock: %10s\tvDSO clock_gettime is not available\n",
+				      in_ns ? "ns" : "host", clockstr);
+		return;
+	}
+
+	vdso_clock_gettime(clockid, &start);
+	/* Spin until 'timeout' seconds have elapsed on the measured clock. */
+	for (tp = start; start.tv_sec + timeout > tp.tv_sec ||
+			 (start.tv_sec + timeout == tp.tv_sec &&
+			  start.tv_nsec > tp.tv_nsec); i++) {
+		vdso_clock_gettime(clockid, &tp);
+	}
+
+	ksft_test_result_pass("%s:\tclock: %10s\tcycles:\t%10ld\n",
+			      in_ns ? "ns" : "host", clockstr, i);
+}
+
+/*
+ * Measure vDSO clock_gettime() throughput for four clocks, first on the
+ * host and then again after entering a freshly created time namespace.
+ */
+int main(int argc, char *argv[])
+{
+	time_t offset = 10;
+	int nsfd;
+
+	/* Four clocks, each measured once outside and once inside the namespace. */
+	ksft_set_plan(8);
+
+	fill_function_pointers();
+
+	test(CLOCK_MONOTONIC, "monotonic", false);
+	test(CLOCK_MONOTONIC_COARSE, "monotonic-coarse", false);
+	test(CLOCK_MONOTONIC_RAW, "monotonic-raw", false);
+	test(CLOCK_BOOTTIME, "boottime", false);
+
+	nscheck();
+
+	if (unshare_timens())
+		return 1;
+
+	nsfd = open("/proc/self/ns/time_for_children", O_RDONLY);
+	if (nsfd < 0)
+		return pr_perror("Can't open a time namespace");
+
+	/* The offsets are written before this process enters the namespace. */
+	if (_settime(CLOCK_MONOTONIC, offset))
+		return 1;
+	if (_settime(CLOCK_BOOTTIME, offset))
+		return 1;
+
+	if (setns(nsfd, CLONE_NEWTIME))
+		return pr_perror("setns");
+
+	test(CLOCK_MONOTONIC, "monotonic", true);
+	test(CLOCK_MONOTONIC_COARSE, "monotonic-coarse", true);
+	test(CLOCK_MONOTONIC_RAW, "monotonic-raw", true);
+	test(CLOCK_BOOTTIME, "boottime", true);
+
+	ksft_exit_pass();
+	return 0;
+}
diff --git a/tools/testing/selftests/timens/log.h b/tools/testing/selftests/timens/log.h
new file mode 100644
index 000000000..db64df2a8
--- /dev/null
+++ b/tools/testing/selftests/timens/log.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __SELFTEST_TIMENS_LOG_H__
+#define __SELFTEST_TIMENS_LOG_H__
+
+/*
+ * Print a message through ksft_print_msg(), prefixed with the level string
+ * @lvl and the file/line of the call site; a newline is appended.
+ */
+#define pr_msg(fmt, lvl, ...) \
+	ksft_print_msg("[%s] (%s:%d)\t" fmt "\n", \
+			lvl, __FILE__, __LINE__, ##__VA_ARGS__)
+
+/* Call @func with ": %m" appended to the format string. */
+#define pr_p(func, fmt, ...) func(fmt ": %m", ##__VA_ARGS__)
+
+/*
+ * Report a test error through ksft_test_result_error(), appending a
+ * newline.  The expression evaluates to -1 so it can be used directly in
+ * a return statement.
+ */
+#define pr_err(fmt, ...) \
+	({ \
+		ksft_test_result_error(fmt "\n", ##__VA_ARGS__); \
+		-1; \
+	})
+
+/*
+ * Report a test failure through ksft_test_result_fail().  Unlike pr_err()
+ * no newline is appended; the caller's format string has to supply it.
+ * The expression evaluates to -1.
+ */
+#define pr_fail(fmt, ...) \
+	({ \
+		ksft_test_result_fail(fmt, ##__VA_ARGS__); \
+		-1; \
+	})
+
+/* pr_err() with ": %m" appended to the message; evaluates to -1. */
+#define pr_perror(fmt, ...) pr_p(pr_err, fmt, ##__VA_ARGS__)
+
+#endif
diff --git a/tools/testing/selftests/timens/procfs.c b/tools/testing/selftests/timens/procfs.c
new file mode 100644
index 000000000..7f14f0fda
--- /dev/null
+++ b/tools/testing/selftests/timens/procfs.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <math.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "log.h"
+#include "timens.h"
+
+/*
+ * Test shouldn't be run for a day, so add 10 days to child
+ * time and check parent's time to be in the same day.
+ */
+#define MAX_TEST_TIME_SEC (60*5)
+#define DAY_IN_SEC (60*60*24)
+#define TEN_DAYS_IN_SEC (10*DAY_IN_SEC)
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+static int child_ns, parent_ns;
+
+/*
+ * Enter the time namespace referred to by @fd.
+ * Returns 0 on success; on failure reports the error and returns -1.
+ */
+static int switch_ns(int fd)
+{
+	if (setns(fd, CLONE_NEWTIME) == 0)
+		return 0;
+
+	return pr_perror("setns()");
+}
+
+/*
+ * Open the current (parent) time namespace, unshare a new one for children,
+ * open that as well, verify the two differ and set a CLOCK_BOOTTIME offset
+ * of ten days in the new namespace.
+ *
+ * On success parent_ns and child_ns hold the two namespace descriptors and
+ * 0 is returned; otherwise a negative value is returned.
+ */
+static int init_namespaces(void)
+{
+	char path[] = "/proc/self/ns/time_for_children";
+	struct stat parent_st, child_st;
+
+	parent_ns = open(path, O_RDONLY);
+	if (parent_ns <= 0)
+		return pr_perror("Unable to open %s", path);
+
+	if (fstat(parent_ns, &parent_st) != 0)
+		return pr_perror("Unable to stat the parent timens");
+
+	if (unshare_timens() != 0)
+		return -1;
+
+	/* The same path now refers to the newly created namespace. */
+	child_ns = open(path, O_RDONLY);
+	if (child_ns <= 0)
+		return pr_perror("Unable to open %s", path);
+
+	if (fstat(child_ns, &child_st) != 0)
+		return pr_perror("Unable to stat the timens");
+
+	if (parent_st.st_ino == child_st.st_ino)
+		return pr_err("The same child_ns after CLONE_NEWTIME");
+
+	if (_settime(CLOCK_BOOTTIME, TEN_DAYS_IN_SEC) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Read the first field of /proc/uptime (the uptime, printed as seconds with
+ * two decimals) into @uptime.
+ *
+ * Returns 0 on success and a negative value on failure.  The file is closed
+ * on every path, and the fractional part, which is read as hundredths of a
+ * second, is converted to nanoseconds before it is stored in tv_nsec.
+ */
+static int read_proc_uptime(struct timespec *uptime)
+{
+	unsigned long up_sec, up_csec;
+	FILE *proc;
+	int nread, err;
+
+	proc = fopen("/proc/uptime", "r");
+	if (proc == NULL) {
+		pr_perror("Unable to open /proc/uptime");
+		return -1;
+	}
+
+	/* Reset errno so a stale value is not mistaken for an fscanf() error. */
+	errno = 0;
+	nread = fscanf(proc, "%lu.%02lu", &up_sec, &up_csec);
+	err = errno;
+	fclose(proc);
+
+	if (nread != 2) {
+		if (err) {
+			/* Restore errno for pr_perror(): fclose() may have changed it. */
+			errno = err;
+			pr_perror("fscanf");
+			return -err;
+		}
+		pr_err("failed to parse /proc/uptime");
+		return -1;
+	}
+
+	uptime->tv_sec = up_sec;
+	/* Hundredths of a second -> nanoseconds. */
+	uptime->tv_nsec = up_csec * 10000000UL;
+	return 0;
+}
+
+/*
+ * Compare /proc/uptime as seen from the parent and from the child time
+ * namespace: the child's value is expected to be ten days ahead, within
+ * MAX_TEST_TIME_SEC seconds.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int check_uptime(void)
+{
+	struct timespec uptime_new, uptime_old;
+	time_t uptime_expected;
+	double prec = MAX_TEST_TIME_SEC;
+
+	if (switch_ns(parent_ns))
+		return pr_err("switch_ns(%d)", parent_ns);
+
+	if (read_proc_uptime(&uptime_old))
+		return 1;
+
+	if (switch_ns(child_ns))
+		return pr_err("switch_ns(%d)", child_ns);
+
+	if (read_proc_uptime(&uptime_new))
+		return 1;
+
+	uptime_expected = uptime_old.tv_sec + TEN_DAYS_IN_SEC;
+	if (fabs(difftime(uptime_new.tv_sec, uptime_expected)) > prec) {
+		/* pr_fail() does not append a newline, so supply it here. */
+		pr_fail("uptime in /proc/uptime: old %ld, new %ld [%ld]\n",
+			uptime_old.tv_sec, uptime_new.tv_sec,
+			uptime_old.tv_sec + TEN_DAYS_IN_SEC);
+		return 1;
+	}
+
+	ksft_test_result_pass("Passed for /proc/uptime\n");
+	return 0;
+}
+
+/* Set up the parent/child time namespaces and run the /proc/uptime check. */
+int main(int argc, char *argv[])
+{
+	int ret;
+
+	nscheck();
+
+	ksft_set_plan(1);
+
+	if (init_namespaces() != 0)
+		return 1;
+
+	ret = check_uptime();
+	if (ret != 0)
+		ksft_exit_fail();
+
+	ksft_exit_pass();
+	return ret;
+}
diff --git a/tools/testing/selftests/timens/timens.c b/tools/testing/selftests/timens/timens.c
new file mode 100644
index 000000000..52b6a1185
--- /dev/null
+++ b/tools/testing/selftests/timens/timens.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "log.h"
+#include "timens.h"
+
+/*
+ * Test shouldn't be run for a day, so add 10 days to child
+ * time and check parent's time to be in the same day.
+ */
+#define DAY_IN_SEC (60*60*24)
+#define TEN_DAYS_IN_SEC (10*DAY_IN_SEC)
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+/* One clock under test. */
+struct test_clock {
+	clockid_t id;
+	char *name;
+	/*
+	 * off_id is -1 if the clock has its own namespace offset.  Otherwise
+	 * it is the index in clocks[] of the clock whose offset this clock
+	 * shares.
+	 */
+	int off_id;
+	/* Offset written for this clock; only set when off_id is -1. */
+	time_t offset;
+};
+
+#define ct(clock, off_id)	{ clock, #clock, off_id }
+/*
+ * CLOCK_BOOTTIME_ALARM follows the CLOCK_BOOTTIME offset (index 0);
+ * CLOCK_MONOTONIC_COARSE and CLOCK_MONOTONIC_RAW follow the
+ * CLOCK_MONOTONIC offset (index 2).  Pointing them at an entry that has no
+ * offset of its own would test them against an offset of 0.
+ */
+static struct test_clock clocks[] = {
+	ct(CLOCK_BOOTTIME, -1),
+	ct(CLOCK_BOOTTIME_ALARM, 0),
+	ct(CLOCK_MONOTONIC, -1),
+	ct(CLOCK_MONOTONIC_COARSE, 2),
+	ct(CLOCK_MONOTONIC_RAW, 2),
+};
+#undef ct
+
+static int child_ns, parent_ns = -1;
+
+/*
+ * Enter the time namespace referred to by @fd.
+ * Returns 0 on success; on failure reports the error and returns -1.
+ */
+static int switch_ns(int fd)
+{
+	if (setns(fd, CLONE_NEWTIME) == 0)
+		return 0;
+
+	pr_perror("setns()");
+	return -1;
+}
+
+/*
+ * Open the current (parent) time namespace unless parent_ns is already set,
+ * unshare a new time namespace for children, open it as child_ns and verify
+ * that the two namespaces differ.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+static int init_namespaces(void)
+{
+	char path[] = "/proc/self/ns/time_for_children";
+	struct stat st1, st2;
+
+	if (parent_ns == -1) {
+		parent_ns = open(path, O_RDONLY);
+		if (parent_ns <= 0)
+			return pr_perror("Unable to open %s", path);
+	}
+
+	if (fstat(parent_ns, &st1))
+		return pr_perror("Unable to stat the parent timens");
+
+	if (unshare_timens())
+		return -1;
+
+	/* The same path now refers to the newly created namespace. */
+	child_ns = open(path, O_RDONLY);
+	if (child_ns <= 0)
+		return pr_perror("Unable to open %s", path);
+
+	if (fstat(child_ns, &st2))
+		return pr_perror("Unable to stat the timens");
+
+	/*
+	 * No system call failed here, so use pr_err(): pr_perror() would
+	 * append an unrelated errno description to the message.
+	 */
+	if (st1.st_ino == st2.st_ino)
+		return pr_err("The same child_ns after CLONE_NEWTIME");
+
+	return 0;
+}
+
+/*
+ * Check one clock in the parent and in the child time namespace.
+ *
+ * @clock_index: index into clocks[] of the clock to test
+ * @raw_syscall: read the clock with the raw syscall (true) or via libc (false)
+ * @offset:      namespace offset, in seconds, expected for this clock
+ *
+ * The clock is read in the parent namespace, then in the child, where it
+ * must not be behind the parent reading plus @offset (a small tolerance is
+ * allowed for the coarse and raw monotonic clocks), and then in the parent
+ * again, where it must not have moved by more than a day.
+ *
+ * Returns 0 on success or when the clock is skipped, -1 on failure.
+ */
+static int test_gettime(clockid_t clock_index, bool raw_syscall, time_t offset)
+{
+	struct timespec child_ts_new, parent_ts_old, cur_ts;
+	char *entry = raw_syscall ? "syscall" : "vdso";
+	double precision = 0.0;
+
+	if (check_skip(clocks[clock_index].id))
+		return 0;
+
+	switch (clocks[clock_index].id) {
+	case CLOCK_MONOTONIC_COARSE:
+	case CLOCK_MONOTONIC_RAW:
+		precision = -2.0;
+		break;
+	}
+
+	/* Report the descriptor that was actually used: parent_ns. */
+	if (switch_ns(parent_ns))
+		return pr_err("switch_ns(%d)", parent_ns);
+
+	if (_gettime(clocks[clock_index].id, &parent_ts_old, raw_syscall))
+		return -1;
+
+	/* Earliest time the child namespace is expected to show. */
+	child_ts_new.tv_nsec = parent_ts_old.tv_nsec;
+	child_ts_new.tv_sec = parent_ts_old.tv_sec + offset;
+
+	if (switch_ns(child_ns))
+		return pr_err("switch_ns(%d)", child_ns);
+
+	if (_gettime(clocks[clock_index].id, &cur_ts, raw_syscall))
+		return -1;
+
+	if (difftime(cur_ts.tv_sec, child_ts_new.tv_sec) < precision) {
+		ksft_test_result_fail(
+			"Child's %s (%s) time has not changed: %lu -> %lu [%lu]\n",
+			clocks[clock_index].name, entry, parent_ts_old.tv_sec,
+			child_ts_new.tv_sec, cur_ts.tv_sec);
+		return -1;
+	}
+
+	if (switch_ns(parent_ns))
+		return pr_err("switch_ns(%d)", parent_ns);
+
+	if (_gettime(clocks[clock_index].id, &cur_ts, raw_syscall))
+		return -1;
+
+	if (difftime(cur_ts.tv_sec, parent_ts_old.tv_sec) > DAY_IN_SEC) {
+		ksft_test_result_fail(
+			"Parent's %s (%s) time has changed: %lu -> %lu [%lu]\n",
+			clocks[clock_index].name, entry, parent_ts_old.tv_sec,
+			child_ts_new.tv_sec, cur_ts.tv_sec);
+		/* Let's play nice and put it closer to original */
+		clock_settime(clocks[clock_index].id, &cur_ts);
+		return -1;
+	}
+
+	ksft_test_result_pass("Passed for %s (%s)\n",
+				clocks[clock_index].name, entry);
+	return 0;
+}
+
+/*
+ * Create the child time namespace, write an offset for every clock that has
+ * its own offset, then test each clock twice: once through the raw syscall
+ * and once through libc.
+ */
+int main(int argc, char *argv[])
+{
+	unsigned int idx;
+	time_t offset;
+	int ret = 0;
+
+	nscheck();
+
+	check_supported_timers();
+
+	ksft_set_plan(ARRAY_SIZE(clocks) * 2);
+
+	if (init_namespaces())
+		return 1;
+
+	/* Offsets have to be set before tasks enter the namespace. */
+	for (idx = 0; idx < ARRAY_SIZE(clocks); idx++) {
+		if (clocks[idx].off_id == -1) {
+			offset = TEN_DAYS_IN_SEC + idx * 1000;
+			clocks[idx].offset = offset;
+			if (_settime(clocks[idx].id, offset))
+				return 1;
+		}
+	}
+
+	for (idx = 0; idx < ARRAY_SIZE(clocks); idx++) {
+		struct test_clock *src = &clocks[idx];
+
+		/* A clock without its own offset uses the one off_id refers to. */
+		if (src->off_id != -1)
+			src = &clocks[src->off_id];
+		offset = src->offset;
+
+		ret |= test_gettime(idx, true, offset);
+		ret |= test_gettime(idx, false, offset);
+	}
+
+	if (ret)
+		ksft_exit_fail();
+
+	ksft_exit_pass();
+	return !!ret;
+}
diff --git a/tools/testing/selftests/timens/timens.h b/tools/testing/selftests/timens/timens.h
new file mode 100644
index 000000000..d4fc52d47
--- /dev/null
+++ b/tools/testing/selftests/timens/timens.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TIMENS_H__
+#define __TIMENS_H__
+
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include "../kselftest.h"
+
+#ifndef CLONE_NEWTIME
+# define CLONE_NEWTIME 0x00000080
+#endif
+
+static int config_posix_timers = true;
+static int config_alarm_timers = true;
+
+/*
+ * Probe the running kernel: clear config_posix_timers when timer_create()
+ * fails with ENOSYS, and clear config_alarm_timers when reading
+ * CLOCK_BOOTTIME_ALARM fails with EINVAL.
+ */
+static inline void check_supported_timers(void)
+{
+	struct timespec ts;
+	int rc;
+
+	rc = timer_create(-1, 0, 0);
+	if (rc == -1 && errno == ENOSYS)
+		config_posix_timers = false;
+
+	rc = clock_gettime(CLOCK_BOOTTIME_ALARM, &ts);
+	if (rc == -1 && errno == EINVAL)
+		config_alarm_timers = false;
+}
+
+/*
+ * Decide whether the test for @clockid has to be skipped, based on the
+ * flags set by check_supported_timers().  When it does, a skip result is
+ * reported and true is returned; otherwise false is returned.
+ */
+static inline bool check_skip(int clockid)
+{
+	if (clockid == CLOCK_BOOTTIME_ALARM && !config_alarm_timers) {
+		ksft_test_result_skip("CLOCK_BOOTTIME_ALARM isn't supported\n");
+		return true;
+	}
+
+	if (config_posix_timers)
+		return false;
+
+	/* Only these clocks are supported without CONFIG_POSIX_TIMERS. */
+	if (clockid == CLOCK_BOOTTIME ||
+	    clockid == CLOCK_MONOTONIC ||
+	    clockid == CLOCK_REALTIME)
+		return false;
+
+	ksft_test_result_skip("Posix Clocks & timers are not supported\n");
+	return true;
+}
+
+/*
+ * Create a new time namespace for the children of this process.
+ * Returns 0 on success.  On EPERM the whole test is skipped through
+ * ksft_exit_skip(); any other error is reported and -1 is returned.
+ */
+static inline int unshare_timens(void)
+{
+	if (unshare(CLONE_NEWTIME) == 0)
+		return 0;
+
+	if (errno == EPERM)
+		ksft_exit_skip("need to run as root\n");
+
+	return pr_perror("Can't unshare() timens");
+}
+
+/*
+ * Write a namespace offset of @offset seconds for @clk_id to
+ * /proc/self/timens_offsets.  CLOCK_MONOTONIC_COARSE and
+ * CLOCK_MONOTONIC_RAW are written as CLOCK_MONOTONIC.
+ *
+ * Returns 0 on success, -1 on failure.  The descriptor is closed on every
+ * path.
+ */
+static inline int _settime(clockid_t clk_id, time_t offset)
+{
+	int fd, len;
+	char buf[4096];
+
+	if (clk_id == CLOCK_MONOTONIC_COARSE || clk_id == CLOCK_MONOTONIC_RAW)
+		clk_id = CLOCK_MONOTONIC;
+
+	/* Line format: "<clockid> <seconds> <nanoseconds>". */
+	len = snprintf(buf, sizeof(buf), "%d %ld 0", clk_id, (long)offset);
+
+	fd = open("/proc/self/timens_offsets", O_WRONLY);
+	if (fd < 0)
+		return pr_perror("/proc/self/timens_offsets");
+
+	if (write(fd, buf, len) != len) {
+		/* Report first: close() may overwrite errno. */
+		pr_perror("/proc/self/timens_offsets");
+		close(fd);
+		return -1;
+	}
+
+	close(fd);
+
+	return 0;
+}
+
+/*
+ * Read @clk_id into @res, either through the raw clock_gettime system call
+ * (@raw_syscall true) or through the libc clock_gettime() function.
+ * Returns 0 on success and non-zero on failure, after reporting the error.
+ */
+static inline int _gettime(clockid_t clk_id, struct timespec *res, bool raw_syscall)
+{
+	int err;
+
+	if (raw_syscall) {
+		err = syscall(SYS_clock_gettime, clk_id, res);
+		if (err)
+			pr_perror("syscall(SYS_clock_gettime(%d))", (int)clk_id);
+		return err;
+	}
+
+	if (clock_gettime(clk_id, res) == 0)
+		return 0;
+
+	pr_perror("clock_gettime(%d)", (int)clk_id);
+	return -1;
+}
+
+/* Skip the whole test when /proc/self/ns/time does not exist. */
+static inline void nscheck(void)
+{
+	if (access("/proc/self/ns/time", F_OK) >= 0)
+		return;
+
+	ksft_exit_skip("Time namespaces are not supported\n");
+}
+
+#endif
diff --git a/tools/testing/selftests/timens/timer.c b/tools/testing/selftests/timens/timer.c
new file mode 100644
index 000000000..5e7f0051b
--- /dev/null
+++ b/tools/testing/selftests/timens/timer.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+
+#include "log.h"
+#include "timens.h"
+
+/*
+ * Arm a POSIX timer on @clockid one hour ahead, first with a relative expiry
+ * and then with an absolute one (TIMER_ABSTIME, computed from @now), and
+ * check that timer_gettime() reports about 3600 seconds remaining each time.
+ *
+ * Returns 0 on pass or skip, 1 when the remaining time is off by more than
+ * 60 seconds, and the value of pr_perror() when a timer call fails.
+ */
+int run_test(int clockid, struct timespec now)
+{
+	struct itimerspec new_value;
+	long long elapsed;
+	timer_t timerid;
+	int i;
+
+	if (check_skip(clockid))
+		return 0;
+
+	for (i = 0; i < 2; i++) {
+		struct sigevent sevp = {.sigev_notify = SIGEV_NONE};
+		int flags = 0;
+		int ret;
+
+		new_value.it_value.tv_sec = 3600;
+		new_value.it_value.tv_nsec = 0;
+		new_value.it_interval.tv_sec = 1;
+		new_value.it_interval.tv_nsec = 0;
+
+		/* Second pass: absolute expiry relative to the caller's "now" */
+		if (i == 1) {
+			new_value.it_value.tv_sec += now.tv_sec;
+			new_value.it_value.tv_nsec += now.tv_nsec;
+		}
+
+		if (timer_create(clockid, &sevp, &timerid) == -1) {
+			if (errno == ENOSYS) {
+				ksft_test_result_skip("Posix Clocks & timers are not supported\n");
+				return 0;
+			}
+			return pr_perror("timer_create");
+		}
+
+		if (i == 1)
+			flags |= TIMER_ABSTIME;
+		if (timer_settime(timerid, flags, &new_value, NULL) == -1) {
+			ret = pr_perror("timer_settime");
+			timer_delete(timerid);
+			return ret;
+		}
+
+		if (timer_gettime(timerid, &new_value) == -1) {
+			ret = pr_perror("timer_gettime");
+			timer_delete(timerid);
+			return ret;
+		}
+
+		/* The timer is not needed once its remaining time is read. */
+		timer_delete(timerid);
+
+		/* llabs(): the operand is long long, abs() would truncate it */
+		elapsed = new_value.it_value.tv_sec;
+		if (llabs(elapsed - 3600) > 60) {
+			ksft_test_result_fail("clockid: %d elapsed: %lld\n",
+					      clockid, elapsed);
+			return 1;
+		}
+	}
+
+	ksft_test_result_pass("clockid=%d\n", clockid);
+
+	return 0;
+}
+
+/*
+ * Enter a new time namespace with fixed CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ * offsets and run the POSIX timer checks in a forked child.
+ *
+ * The parent exits with the child's exit status, or 1 when the child did
+ * not exit normally.
+ */
+int main(int argc, char *argv[])
+{
+	int ret, status, len, fd;
+	char buf[4096];
+	pid_t pid;
+	struct timespec btime_now, mtime_now;
+
+	nscheck();
+
+	check_supported_timers();
+
+	ksft_set_plan(3);
+
+	/* Sample both clocks before the offsets below are applied */
+	if (clock_gettime(CLOCK_MONOTONIC, &mtime_now))
+		return pr_perror("clock_gettime(CLOCK_MONOTONIC)");
+	if (clock_gettime(CLOCK_BOOTTIME, &btime_now))
+		return pr_perror("clock_gettime(CLOCK_BOOTTIME)");
+
+	if (unshare_timens())
+		return 1;
+
+	/* 70 days for CLOCK_MONOTONIC, 9 days for CLOCK_BOOTTIME */
+	len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0",
+		       CLOCK_MONOTONIC, 70 * 24 * 3600,
+		       CLOCK_BOOTTIME, 9 * 24 * 3600);
+	fd = open("/proc/self/timens_offsets", O_WRONLY);
+	if (fd < 0)
+		return pr_perror("/proc/self/timens_offsets");
+
+	if (write(fd, buf, len) != len) {
+		ret = pr_perror("/proc/self/timens_offsets");
+		close(fd);
+		return ret;
+	}
+
+	close(fd);
+
+	/* Expected readings of the two clocks inside the new namespace */
+	mtime_now.tv_sec += 70 * 24 * 3600;
+	btime_now.tv_sec += 9 * 24 * 3600;
+
+	/*
+	 * NOTE(review): the checks run in a child, presumably because the
+	 * unshared time namespace only takes effect for processes created
+	 * afterwards - confirm against time_namespaces(7).
+	 */
+	pid = fork();
+	if (pid < 0)
+		return pr_perror("Unable to fork");
+	if (pid == 0) {
+		ret = 0;
+		ret |= run_test(CLOCK_BOOTTIME, btime_now);
+		ret |= run_test(CLOCK_MONOTONIC, mtime_now);
+		ret |= run_test(CLOCK_BOOTTIME_ALARM, btime_now);
+
+		if (ret)
+			ksft_exit_fail();
+		ksft_exit_pass();
+		return ret;
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return pr_perror("Unable to wait the child process");
+
+	if (WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/timens/timerfd.c b/tools/testing/selftests/timens/timerfd.c
new file mode 100644
index 000000000..9edd43d6b
--- /dev/null
+++ b/tools/testing/selftests/timens/timerfd.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sched.h>
+
+#include <sys/timerfd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "log.h"
+#include "timens.h"
+
+/*
+ * clock_gettime() wrapper that reads CLOCK_BOOTTIME when asked for
+ * CLOCK_BOOTTIME_ALARM.
+ *
+ * @clockid is a clock identifier, so it is typed clockid_t (clock_t is the
+ * unrelated CPU-time counter type).
+ *
+ * Returns the result of clock_gettime().
+ */
+static int tclock_gettime(clockid_t clockid, struct timespec *now)
+{
+	if (clockid == CLOCK_BOOTTIME_ALARM)
+		clockid = CLOCK_BOOTTIME;
+	return clock_gettime(clockid, now);
+}
+
+/*
+ * Arm a timerfd on @clockid one hour ahead, first with a relative expiry and
+ * then with an absolute one (TFD_TIMER_ABSTIME), and check that
+ * timerfd_gettime() reports about 3600 seconds remaining each time.
+ *
+ * The @now argument is overwritten with a fresh tclock_gettime() reading
+ * before it is used.
+ *
+ * Returns 0 on pass or skip, 1 when the remaining time is off by more than
+ * 60 seconds, and the value of pr_perror() when a call fails.
+ */
+int run_test(int clockid, struct timespec now)
+{
+	struct itimerspec new_value;
+	long long elapsed;
+	int fd, i;
+
+	if (check_skip(clockid))
+		return 0;
+
+	if (tclock_gettime(clockid, &now))
+		return pr_perror("clock_gettime(%d)", clockid);
+
+	for (i = 0; i < 2; i++) {
+		int flags = 0;
+		int ret;
+
+		new_value.it_value.tv_sec = 3600;
+		new_value.it_value.tv_nsec = 0;
+		new_value.it_interval.tv_sec = 1;
+		new_value.it_interval.tv_nsec = 0;
+
+		/* Second pass: absolute expiry relative to "now" */
+		if (i == 1) {
+			new_value.it_value.tv_sec += now.tv_sec;
+			new_value.it_value.tv_nsec += now.tv_nsec;
+		}
+
+		fd = timerfd_create(clockid, 0);
+		if (fd == -1)
+			return pr_perror("timerfd_create(%d)", clockid);
+
+		if (i == 1)
+			flags |= TFD_TIMER_ABSTIME;
+
+		/* Every exit path below releases the descriptor */
+		if (timerfd_settime(fd, flags, &new_value, NULL)) {
+			ret = pr_perror("timerfd_settime(%d)", clockid);
+			close(fd);
+			return ret;
+		}
+
+		if (timerfd_gettime(fd, &new_value)) {
+			ret = pr_perror("timerfd_gettime(%d)", clockid);
+			close(fd);
+			return ret;
+		}
+
+		close(fd);
+
+		/* llabs(): the operand is long long, abs() would truncate it */
+		elapsed = new_value.it_value.tv_sec;
+		if (llabs(elapsed - 3600) > 60) {
+			ksft_test_result_fail("clockid: %d elapsed: %lld\n",
+					      clockid, elapsed);
+			return 1;
+		}
+	}
+
+	ksft_test_result_pass("clockid=%d\n", clockid);
+
+	return 0;
+}
+
+/*
+ * Enter a new time namespace with fixed CLOCK_MONOTONIC and CLOCK_BOOTTIME
+ * offsets and run the timerfd checks in a forked child.
+ *
+ * The parent exits with the child's exit status, or 1 when the child did
+ * not exit normally.
+ */
+int main(int argc, char *argv[])
+{
+	int ret, status, len, fd;
+	char buf[4096];
+	pid_t pid;
+	struct timespec btime_now, mtime_now;
+
+	nscheck();
+
+	check_supported_timers();
+
+	ksft_set_plan(3);
+
+	/* Sample both clocks before the offsets below are applied */
+	if (clock_gettime(CLOCK_MONOTONIC, &mtime_now))
+		return pr_perror("clock_gettime(CLOCK_MONOTONIC)");
+	if (clock_gettime(CLOCK_BOOTTIME, &btime_now))
+		return pr_perror("clock_gettime(CLOCK_BOOTTIME)");
+
+	if (unshare_timens())
+		return 1;
+
+	/* 70 days for CLOCK_MONOTONIC, 9 days for CLOCK_BOOTTIME */
+	len = snprintf(buf, sizeof(buf), "%d %d 0\n%d %d 0",
+		       CLOCK_MONOTONIC, 70 * 24 * 3600,
+		       CLOCK_BOOTTIME, 9 * 24 * 3600);
+	fd = open("/proc/self/timens_offsets", O_WRONLY);
+	if (fd < 0)
+		return pr_perror("/proc/self/timens_offsets");
+
+	if (write(fd, buf, len) != len) {
+		ret = pr_perror("/proc/self/timens_offsets");
+		close(fd);
+		return ret;
+	}
+
+	close(fd);
+	mtime_now.tv_sec += 70 * 24 * 3600;
+	btime_now.tv_sec += 9 * 24 * 3600;
+
+	/*
+	 * NOTE(review): the checks run in a child, presumably because the
+	 * unshared time namespace only takes effect for processes created
+	 * afterwards - confirm against time_namespaces(7).
+	 */
+	pid = fork();
+	if (pid < 0)
+		return pr_perror("Unable to fork");
+	if (pid == 0) {
+		ret = 0;
+		ret |= run_test(CLOCK_BOOTTIME, btime_now);
+		ret |= run_test(CLOCK_MONOTONIC, mtime_now);
+		ret |= run_test(CLOCK_BOOTTIME_ALARM, btime_now);
+
+		if (ret)
+			ksft_exit_fail();
+		ksft_exit_pass();
+		return ret;
+	}
+
+	if (waitpid(pid, &status, 0) != pid)
+		return pr_perror("Unable to wait the child process");
+
+	if (WIFEXITED(status))
+		return WEXITSTATUS(status);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/timers/.gitignore b/tools/testing/selftests/timers/.gitignore
new file mode 100644
index 000000000..bb5326ff9
--- /dev/null
+++ b/tools/testing/selftests/timers/.gitignore
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+alarmtimer-suspend
+change_skew
+clocksource-switch
+inconsistency-check
+leap-a-day
+leapcrash
+mqueue-lat
+nanosleep
+nsleep-lat
+posix_timers
+raw_skew
+rtcpie
+set-2038
+set-tai
+set-timer-lat
+skew_consistency
+threadtest
+valid-adjtimex
+adjtick
+set-tz
+freq-step
diff --git a/tools/testing/selftests/timers/Makefile b/tools/testing/selftests/timers/Makefile
new file mode 100644
index 000000000..0e73a1687
--- /dev/null
+++ b/tools/testing/selftests/timers/Makefile
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+# Compiler and linker flags added for every program built from this directory.
+CFLAGS += -O3 -Wl,-no-as-needed -Wall
+LDLIBS += -lrt -lpthread -lm
+
+# these are all "safe" tests that don't modify
+# system time or require escalated privileges
+TEST_GEN_PROGS = posix_timers nanosleep nsleep-lat set-timer-lat mqueue-lat \
+ inconsistency-check raw_skew threadtest rtcpie
+
+DESTRUCTIVE_TESTS = alarmtimer-suspend valid-adjtimex adjtick change_skew \
+ skew_consistency clocksource-switch freq-step leap-a-day \
+ leapcrash set-tai set-2038 set-tz
+
+# NOTE(review): presumably lib.mk builds TEST_GEN_PROGS_EXTENDED without
+# running them as part of run_tests - confirm against ../lib.mk.
+TEST_GEN_PROGS_EXTENDED = $(DESTRUCTIVE_TESTS)
+
+TEST_FILES := settings
+
+include ../lib.mk
+
+# these tests require escalated privileges
+# and may modify the system time or trigger
+# other behavior like suspend
+# (run_tests is a prerequisite, so the regular tests run first)
+run_destructive_tests: run_tests
+ $(call RUN_TESTS, $(DESTRUCTIVE_TESTS))
diff --git a/tools/testing/selftests/timers/adjtick.c b/tools/testing/selftests/timers/adjtick.c
new file mode 100644
index 000000000..54d8d87f3
--- /dev/null
+++ b/tools/testing/selftests/timers/adjtick.c
@@ -0,0 +1,211 @@
+/* adjtimex() tick adjustment test
+ * by: John Stultz <john.stultz@linaro.org>
+ * (C) Copyright Linaro Limited 2015
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc adjtick.c -o adjtick -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+
+#include "../kselftest.h"
+
+#define CLOCK_MONOTONIC_RAW 4
+
+#define NSEC_PER_SEC 1000000000LL
+#define USEC_PER_SEC 1000000
+
+#define MILLION 1000000
+
+long systick;
+
+/* Return the magnitude of @val. */
+long long llabs(long long val)
+{
+	return (val < 0) ? -val : val;
+}
+
+/* Flatten @ts into a single nanosecond count. */
+unsigned long long ts_to_nsec(struct timespec ts)
+{
+	long long whole_sec_ns = ts.tv_sec * NSEC_PER_SEC;
+
+	return whole_sec_ns + ts.tv_nsec;
+}
+
+/* Split a nanosecond count @ns into seconds and leftover nanoseconds. */
+struct timespec nsec_to_ts(long long ns)
+{
+	struct timespec ts = {
+		.tv_sec = ns / NSEC_PER_SEC,
+		.tv_nsec = ns % NSEC_PER_SEC,
+	};
+
+	return ts;
+}
+
+/* Return @end - @start in nanoseconds (negative when @end is earlier). */
+long long diff_timespec(struct timespec start, struct timespec end)
+{
+	long long from = ts_to_nsec(start);
+	long long to = ts_to_nsec(end);
+
+	return to - from;
+}
+
+/*
+ * Read CLOCK_MONOTONIC into @mon and CLOCK_MONOTONIC_RAW into @raw as a
+ * closely matched pair.
+ *
+ * Three bracketed readings (MONOTONIC, RAW, MONOTONIC) are taken; the one
+ * with the narrowest bracket wins, and @mon is set to the midpoint of that
+ * bracket.
+ */
+void get_monotonic_and_raw(struct timespec *mon, struct timespec *raw)
+{
+	struct timespec before, between, after;
+	long long best = 0;
+	int attempt = 0;
+
+	clock_gettime(CLOCK_MONOTONIC, mon);
+	clock_gettime(CLOCK_MONOTONIC_RAW, raw);
+
+	/* Try to get a more tightly bound pairing */
+	while (attempt++ < 3) {
+		long long width, midpoint;
+
+		clock_gettime(CLOCK_MONOTONIC, &before);
+		clock_gettime(CLOCK_MONOTONIC_RAW, &between);
+		clock_gettime(CLOCK_MONOTONIC, &after);
+
+		width = diff_timespec(before, after);
+
+		/* Keep the current pair unless this bracket is narrower */
+		if (best != 0 && width >= best)
+			continue;
+
+		best = width;
+		*raw = between;
+		midpoint = (ts_to_nsec(before) + ts_to_nsec(after))/2;
+		*mon = nsec_to_ts(midpoint);
+	}
+}
+
+/*
+ * Measure how far CLOCK_MONOTONIC ran ahead of (or behind)
+ * CLOCK_MONOTONIC_RAW over a 15 second sleep, in parts per million.
+ */
+long long get_ppm_drift(void)
+{
+	struct timespec mon_before, raw_before, mon_after, raw_after;
+	long long mon_ns, raw_ns;
+
+	get_monotonic_and_raw(&mon_before, &raw_before);
+	sleep(15);
+	get_monotonic_and_raw(&mon_after, &raw_after);
+
+	mon_ns = diff_timespec(mon_before, mon_after);
+	raw_ns = diff_timespec(raw_before, raw_after);
+
+	return (mon_ns*MILLION)/raw_ns - MILLION;
+}
+
+/*
+ * Set the kernel tick length to @tickval with adjtimex(), measure the
+ * resulting drift of CLOCK_MONOTONIC against CLOCK_MONOTONIC_RAW, and
+ * compare it with the drift expected for that tick value.
+ *
+ * Returns 0 when the measured and expected ppm agree within 100 ppm,
+ * -1 otherwise or when adjtimex() reports unexpected values afterwards.
+ */
+int check_tick_adj(long tickval)
+{
+	long long measured_ppm, expected_ppm, measured_tick;
+	struct timex tx1;
+
+	tx1.modes = ADJ_TICK | ADJ_OFFSET | ADJ_FREQUENCY | ADJ_STATUS;
+	tx1.status = STA_PLL;
+	tx1.offset = 0;
+	tx1.freq = 0;
+	tx1.tick = tickval;
+
+	adjtimex(&tx1);
+
+	sleep(1);
+
+	expected_ppm = ((long long)tickval * MILLION)/systick - MILLION;
+	printf("Estimating tick (act: %ld usec, %lld ppm): ", tickval, expected_ppm);
+
+	measured_ppm = get_ppm_drift();
+	measured_tick = systick + (systick * measured_ppm / MILLION);
+	printf("%lld usec, %lld ppm", measured_tick, measured_ppm);
+	fflush(stdout);
+
+	/* Query again with no mode bits set and check what comes back */
+	tx1.modes = 0;
+	adjtimex(&tx1);
+
+	if (tx1.offset || tx1.freq || tx1.tick != tickval) {
+		printf(" [ERROR]\n");
+		printf("\tUnexpected adjtimex return values, make sure ntpd is not running.\n");
+		return -1;
+	}
+
+	/*
+	 * Here we use a 100ppm difference as the error bound.
+	 * We likely should see better, but some coarse clocksources
+	 * cannot match the HZ tick size accurately, so we have an
+	 * internal correction factor that doesn't scale exactly
+	 * with the adjustment, resulting in > 10ppm error during
+	 * a 10% adjustment. 100ppm also gives us more breathing
+	 * room for interruptions during the measurement.
+	 */
+	if (llabs(measured_ppm - expected_ppm) > 100) {
+		printf(" [FAILED]\n");
+		return -1;
+	}
+
+	printf(" [OK]\n");
+	return 0;
+}
+
+/*
+ * Step the tick length from 10% below to 10% above the nominal value and
+ * run check_tick_adj() at each step, then restore the nominal tick.
+ *
+ * Exits through ksft_exit_fail() if any step failed, ksft_exit_pass()
+ * otherwise; returns -1 early when a prerequisite is missing.
+ */
+int main(int argc, char **argv)
+{
+	struct timespec raw;
+	long tick, max, interval, err, hz;
+	struct timex tx1;
+
+	err = 0;
+	setbuf(stdout, NULL);
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &raw)) {
+		printf("ERR: NO CLOCK_MONOTONIC_RAW\n");
+		return -1;
+	}
+
+	printf("Each iteration takes about 15 seconds\n");
+
+	/* sysconf() can fail; never divide by a non-positive tick rate */
+	hz = sysconf(_SC_CLK_TCK);
+	if (hz <= 0) {
+		printf("ERR: sysconf(_SC_CLK_TCK) failed\n");
+		return -1;
+	}
+
+	/* Nominal tick length in microseconds */
+	systick = USEC_PER_SEC/hz;
+	max = systick/10; /* +/- 10% */
+	interval = max/4; /* in 4 steps each side */
+
+	for (tick = (systick - max); tick < (systick + max); tick += interval) {
+		if (check_tick_adj(tick)) {
+			err = 1;
+			break;
+		}
+	}
+
+	/* Reset things to zero */
+	tx1.modes = ADJ_TICK;
+	tx1.modes |= ADJ_OFFSET;
+	tx1.modes |= ADJ_FREQUENCY;
+
+	tx1.offset = 0;
+	tx1.freq = 0;
+	tx1.tick = systick;
+
+	adjtimex(&tx1);
+
+	if (err)
+		return ksft_exit_fail();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/alarmtimer-suspend.c b/tools/testing/selftests/timers/alarmtimer-suspend.c
new file mode 100644
index 000000000..4da09dbf8
--- /dev/null
+++ b/tools/testing/selftests/timers/alarmtimer-suspend.c
@@ -0,0 +1,178 @@
+/* alarmtimer suspend test
+ * John Stultz (john.stultz@linaro.org)
+ * (C) Copyright Linaro 2013
+ * Licensed under the GPLv2
+ *
+ * This test makes sure the alarmtimer & RTC wakeup code is
+ * functioning.
+ *
+ * To build:
+ * $ gcc alarmtimer-suspend.c -o alarmtimer-suspend -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include "../kselftest.h"
+
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+#define CLOCK_HWSPECIFIC 10
+#define CLOCK_TAI 11
+#define NR_CLOCKIDS 12
+
+
+#define NSEC_PER_SEC 1000000000ULL
+#define UNREASONABLE_LAT (NSEC_PER_SEC * 5) /* hopefully we resume in 5 secs */
+
+#define SUSPEND_SECS 15
+int alarmcount;
+int alarm_clock_id;
+struct timespec start_time;
+
+
+/*
+ * Map @clockid to its symbolic name.
+ *
+ * Returns "UNKNOWN_CLOCKID" for ids that have no entry in the table.
+ */
+char *clockstring(int clockid)
+{
+	static char * const names[NR_CLOCKIDS] = {
+		[CLOCK_REALTIME]		= "CLOCK_REALTIME",
+		[CLOCK_MONOTONIC]		= "CLOCK_MONOTONIC",
+		[CLOCK_PROCESS_CPUTIME_ID]	= "CLOCK_PROCESS_CPUTIME_ID",
+		[CLOCK_THREAD_CPUTIME_ID]	= "CLOCK_THREAD_CPUTIME_ID",
+		[CLOCK_MONOTONIC_RAW]		= "CLOCK_MONOTONIC_RAW",
+		[CLOCK_REALTIME_COARSE]		= "CLOCK_REALTIME_COARSE",
+		[CLOCK_MONOTONIC_COARSE]	= "CLOCK_MONOTONIC_COARSE",
+		[CLOCK_BOOTTIME]		= "CLOCK_BOOTTIME",
+		[CLOCK_REALTIME_ALARM]		= "CLOCK_REALTIME_ALARM",
+		[CLOCK_BOOTTIME_ALARM]		= "CLOCK_BOOTTIME_ALARM",
+		[CLOCK_TAI]			= "CLOCK_TAI",
+	};
+
+	/* Out-of-range ids and unnamed slots share the fallback */
+	if (clockid < 0 || clockid >= NR_CLOCKIDS || !names[clockid])
+		return "UNKNOWN_CLOCKID";
+
+	return names[clockid];
+}
+
+
+/* Return @b - @a in nanoseconds (note the operand order). */
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+	long long delta;
+
+	delta = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+	delta = delta - (NSEC_PER_SEC * a.tv_sec + a.tv_nsec);
+
+	return delta;
+}
+
+int final_ret = 0;
+
+/*
+ * Signal handler for the alarm timer.
+ *
+ * Counts the alarm, computes how late it fired relative to the expected
+ * start_time + alarmcount * SUSPEND_SECS, prints the result and records a
+ * failure in final_ret when the latency exceeds UNREASONABLE_LAT.
+ */
+void sigalarm(int signo)
+{
+ long long delta_ns;
+ struct timespec ts;
+
+ clock_gettime(alarm_clock_id, &ts);
+ alarmcount++;
+
+ /* Time since start_time, minus the time the alarms should have taken */
+ delta_ns = timespec_sub(start_time, ts);
+ delta_ns -= NSEC_PER_SEC * SUSPEND_SECS * alarmcount;
+
+ printf("ALARM(%i): %ld:%ld latency: %lld ns ", alarmcount, ts.tv_sec,
+ ts.tv_nsec, delta_ns);
+
+ /*
+ * UNREASONABLE_LAT is built from an unsigned long long constant, so
+ * delta_ns is converted to unsigned for this comparison: a negative
+ * latency (alarm seen early) also lands in the [FAIL] branch.
+ */
+ if (delta_ns > UNREASONABLE_LAT) {
+ printf("[FAIL]\n");
+ final_ret = -1;
+ } else
+ printf("[OK]\n");
+
+}
+
+/*
+ * For CLOCK_REALTIME_ALARM and CLOCK_BOOTTIME_ALARM: arm a periodic timer
+ * firing every SUSPEND_SECS seconds, let five alarms pass, then repeatedly
+ * suspend the machine until ten alarms have been seen.
+ *
+ * Exits through ksft_exit_fail() when sigalarm() flagged an unreasonable
+ * latency or the timer could not be armed, ksft_exit_pass() otherwise.
+ */
+int main(void)
+{
+	timer_t tm1;
+	struct itimerspec its1, its2;
+	struct sigevent se;
+	struct sigaction act;
+	int signum = SIGRTMAX;
+
+	/* Set up signal handler: */
+	sigfillset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = sigalarm;
+	sigaction(signum, &act, NULL);
+
+	/* Set up timer: */
+	memset(&se, 0, sizeof(se));
+	se.sigev_notify = SIGEV_SIGNAL;
+	se.sigev_signo = signum;
+	se.sigev_value.sival_int = 0;
+
+	for (alarm_clock_id = CLOCK_REALTIME_ALARM;
+	     alarm_clock_id <= CLOCK_BOOTTIME_ALARM;
+	     alarm_clock_id++) {
+
+		alarmcount = 0;
+		if (timer_create(alarm_clock_id, &se, &tm1) == -1) {
+			printf("timer_create failed, %s unsupported?\n",
+			       clockstring(alarm_clock_id));
+			break;
+		}
+
+		clock_gettime(alarm_clock_id, &start_time);
+		printf("Start time (%s): %ld:%ld\n", clockstring(alarm_clock_id),
+		       start_time.tv_sec, start_time.tv_nsec);
+		printf("Setting alarm for every %i seconds\n", SUSPEND_SECS);
+
+		/* First expiry is absolute: start_time + SUSPEND_SECS */
+		its1.it_value = start_time;
+		its1.it_value.tv_sec += SUSPEND_SECS;
+		its1.it_interval.tv_sec = SUSPEND_SECS;
+		its1.it_interval.tv_nsec = 0;
+
+		/*
+		 * If the timer cannot be armed no alarm will ever arrive and
+		 * the wait loops below would never terminate.
+		 */
+		if (timer_settime(tm1, TIMER_ABSTIME, &its1, &its2) == -1) {
+			printf("timer_settime failed for %s\n",
+			       clockstring(alarm_clock_id));
+			timer_delete(tm1);
+			final_ret = -1;
+			break;
+		}
+
+		while (alarmcount < 5)
+			sleep(1); /* First 5 alarms, do nothing */
+
+		printf("Starting suspend loops\n");
+		while (alarmcount < 10) {
+			int ret;
+
+			sleep(3);
+			ret = system("echo mem > /sys/power/state");
+			if (ret)
+				break;
+		}
+		timer_delete(tm1);
+	}
+	if (final_ret)
+		return ksft_exit_fail();
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/change_skew.c b/tools/testing/selftests/timers/change_skew.c
new file mode 100644
index 000000000..c4eab7124
--- /dev/null
+++ b/tools/testing/selftests/timers/change_skew.c
@@ -0,0 +1,96 @@
+/* ADJ_FREQ Skew change test
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2012
+ * Licensed under the GPLv2
+ *
+ * NOTE: This is a meta-test which cranks the ADJ_FREQ knob and
+ * then uses other tests to detect problems. Thus this test requires
+ * that the raw_skew, inconsistency-check and nanosleep tests be
+ * present in the same directory it is run from.
+ *
+ * To build:
+ * $ gcc change_skew.c -o change_skew -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000LL
+
+
+/*
+ * Apply a frequency adjustment of @ppm with adjtimex() and then run the
+ * raw_skew, inconsistency-check and nanosleep programs from the current
+ * directory.
+ *
+ * Returns the negative adjtimex() result when the adjustment fails,
+ * otherwise the OR of the three system() results (0 when all succeeded).
+ */
+int change_skew_test(int ppm)
+{
+	struct timex tx;
+	int ret;
+
+	tx.modes = ADJ_FREQUENCY;
+	/*
+	 * Scale by 2^16 with a multiplication: @ppm may be negative and
+	 * left-shifting a negative value is undefined behaviour in C.
+	 */
+	tx.freq = (long)ppm * (1 << 16);
+
+	ret = adjtimex(&tx);
+	if (ret < 0) {
+		printf("Error adjusting freq\n");
+		return ret;
+	}
+
+	ret = system("./raw_skew");
+	ret |= system("./inconsistency-check");
+	ret |= system("./nanosleep");
+
+	return ret;
+}
+
+
+/*
+ * Run change_skew_test() for each entry of the ppm table, stopping at the
+ * first failure, then issue a final ADJ_FREQUENCY call and report the
+ * result through ksft_exit_fail()/ksft_exit_pass().
+ *
+ * Note: the parameter names are swapped relative to the usual
+ * (argc, argv); neither is used.
+ */
+int main(int argv, char **argc)
+{
+ struct timex tx;
+ int i, ret;
+
+ int ppm[5] = {0, 250, 500, -250, -500};
+
+ /* Kill ntpd */
+ /* (the result is overwritten below, so a failing killall is ignored) */
+ ret = system("killall -9 ntpd");
+
+ /* Make sure there's no offset adjustment going on */
+ tx.modes = ADJ_OFFSET;
+ tx.offset = 0;
+ ret = adjtimex(&tx);
+
+ if (ret < 0) {
+ printf("Maybe you're not running as root?\n");
+ return -1;
+ }
+
+ for (i = 0; i < 5; i++) {
+ printf("Using %i ppm adjustment\n", ppm[i]);
+ ret = change_skew_test(ppm[i]);
+ if (ret)
+ break;
+ }
+
+ /* Set things back */
+ /*
+ * NOTE(review): ADJ_FREQUENCY applies tx.freq, which this function never
+ * assigns; it holds whatever the adjtimex() call above left in tx.
+ * Confirm that this is the intended restore value - assigning tx.offset
+ * has no visible effect with this mode.
+ */
+ tx.modes = ADJ_FREQUENCY;
+ tx.offset = 0;
+ adjtimex(&tx);
+
+ if (ret) {
+ printf("[FAIL]");
+ return ksft_exit_fail();
+ }
+ printf("[OK]");
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/clocksource-switch.c b/tools/testing/selftests/timers/clocksource-switch.c
new file mode 100644
index 000000000..c18313a5f
--- /dev/null
+++ b/tools/testing/selftests/timers/clocksource-switch.c
@@ -0,0 +1,168 @@
+/* Clocksource change test
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2012
+ * Licensed under the GPLv2
+ *
+ * NOTE: This is a meta-test which quickly changes the clocksource and
+ * then uses other tests to detect problems. Thus this test requires
+ * that the inconsistency-check and nanosleep tests be present in the
+ * same directory it is run from.
+ *
+ * To build:
+ * $ gcc clocksource-switch.c -o clocksource-switch -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/wait.h>
+#include "../kselftest.h"
+
+
+/*
+ * Read the list of available clocksources from sysfs into @list.
+ *
+ * @list must provide at least 10 entries of 30 bytes each; all 10 entries
+ * are reset to empty strings first. Names are separated by spaces and the
+ * list ends with a newline. Empty tokens are ignored, and a name that does
+ * not fit into an entry is skipped rather than truncated.
+ *
+ * Returns the number of names stored (0 when the file cannot be read).
+ */
+int get_clocksources(char list[][30])
+{
+	int fd, i, count = 0;
+	ssize_t size;
+	char buf[512];
+	char *head, *end, *tmp;
+
+	for (i = 0; i < 10; i++)
+		list[i][0] = '\0';
+
+	fd = open("/sys/devices/system/clocksource/clocksource0/available_clocksource", O_RDONLY);
+	if (fd < 0)
+		return 0;
+
+	/* Leave room for the terminator; read() does not add one */
+	size = read(fd, buf, sizeof(buf) - 1);
+	close(fd);
+	if (size <= 0)
+		return 0;
+	buf[size] = '\0';
+
+	end = buf + size;
+	head = buf;
+	while (head < end && count < 10) {
+		size_t len;
+
+		/* Find the end of the current name */
+		for (tmp = head; tmp < end; tmp++) {
+			if (*tmp == ' ' || *tmp == '\n' || *tmp == '\0')
+				break;
+		}
+
+		len = tmp - head;
+		if (len > 0 && len < sizeof(list[0])) {
+			memcpy(list[count], head, len);
+			list[count][len] = '\0';
+			count++;
+		}
+		head = tmp + 1;
+	}
+
+	return count;
+}
+
+/*
+ * Read the name of the current clocksource from sysfs into @buf.
+ *
+ * At most @size - 1 bytes are stored and the result is always
+ * NUL-terminated, so it can be handed to string functions. Whatever the
+ * file contains is kept as read, including any trailing newline.
+ *
+ * Returns 0 on success, -1 when @buf is unusable or the file cannot be
+ * opened or read (in which case @buf holds an empty string).
+ */
+int get_cur_clocksource(char *buf, size_t size)
+{
+	ssize_t len;
+	int fd;
+
+	if (!buf || size == 0)
+		return -1;
+	buf[0] = '\0';
+
+	fd = open("/sys/devices/system/clocksource/clocksource0/current_clocksource", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	len = read(fd, buf, size - 1);
+	close(fd);
+	if (len < 0) {
+		buf[0] = '\0';
+		return -1;
+	}
+	buf[len] = '\0';
+
+	return 0;
+}
+
+/*
+ * Select @clocksource by writing its name to the sysfs
+ * current_clocksource file.
+ *
+ * Returns 0 on success, -1 when the file cannot be opened or the write
+ * fails. The descriptor is closed on every path.
+ */
+int change_clocksource(char *clocksource)
+{
+	int fd;
+	ssize_t size;
+
+	fd = open("/sys/devices/system/clocksource/clocksource0/current_clocksource", O_WRONLY);
+
+	if (fd < 0)
+		return -1;
+
+	size = write(fd, clocksource, strlen(clocksource));
+
+	close(fd);
+
+	return (size < 0) ? -1 : 0;
+}
+
+
+/*
+ * Run ./inconsistency-check for @secs seconds followed by ./nanosleep.
+ *
+ * Returns the exit code of inconsistency-check when it exited with a
+ * non-zero code; otherwise the exit code of nanosleep, or 0 when nanosleep
+ * did not exit normally.
+ */
+int run_tests(int secs)
+{
+	char cmd[255];
+	int status;
+
+	sprintf(cmd, "./inconsistency-check -t %i", secs);
+	status = system(cmd);
+	if (WIFEXITED(status)) {
+		int code = WEXITSTATUS(status);
+
+		if (code)
+			return code;
+	}
+
+	status = system("./nanosleep");
+	if (!WIFEXITED(status))
+		return 0;
+
+	return WEXITSTATUS(status);
+}
+
+
+char clocksource_list[10][30];
+
+/*
+ * Validate every available clocksource in turn, then keep switching
+ * between them while a forked child runs the consistency tests, and
+ * finally restore the original clocksource.
+ *
+ * Exits through ksft_exit_fail() on any failure, ksft_exit_pass() otherwise.
+ */
+int main(int argc, char **argv)
+{
+	char orig_clk[512];
+	int count, i, status = 0;
+	pid_t pid;
+
+	/*
+	 * Zero the buffer and hold back one byte so the saved name is a
+	 * valid string when it is written back at the end.
+	 */
+	memset(orig_clk, 0, sizeof(orig_clk));
+	get_cur_clocksource(orig_clk, sizeof(orig_clk) - 1);
+
+	count = get_clocksources(clocksource_list);
+
+	if (change_clocksource(clocksource_list[0])) {
+		printf("Error: You probably need to run this as root\n");
+		return -1;
+	}
+
+	/* Check everything is sane before we start switching asynchronously */
+	for (i = 0; i < count; i++) {
+		printf("Validating clocksource %s\n", clocksource_list[i]);
+		if (change_clocksource(clocksource_list[i])) {
+			status = -1;
+			goto out;
+		}
+		if (run_tests(5)) {
+			status = -1;
+			goto out;
+		}
+	}
+
+
+	printf("Running Asynchronous Switching Tests...\n");
+	pid = fork();
+	if (pid < 0) {
+		printf("Error: fork failed\n");
+		status = -1;
+		goto out;
+	}
+	if (!pid)
+		return run_tests(60);
+
+	/* Keep switching clocksources until the child has finished */
+	for (;;) {
+		pid_t done = waitpid(pid, &status, WNOHANG);
+
+		if (done == pid)
+			break;
+		if (done < 0) {
+			/* Nothing left to wait for; do not spin forever */
+			status = -1;
+			break;
+		}
+
+		for (i = 0; i < count; i++)
+			if (change_clocksource(clocksource_list[i])) {
+				status = -1;
+				goto out;
+			}
+	}
+out:
+	change_clocksource(orig_clk);
+
+	if (status)
+		return ksft_exit_fail();
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/freq-step.c b/tools/testing/selftests/timers/freq-step.c
new file mode 100644
index 000000000..4b76450d7
--- /dev/null
+++ b/tools/testing/selftests/timers/freq-step.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This test checks the response of the system clock to frequency
+ * steps made with adjtimex(). The frequency error and stability of
+ * the CLOCK_MONOTONIC clock relative to the CLOCK_MONOTONIC_RAW clock
+ * is measured in two intervals following the step. The test fails if
+ * values from the second interval exceed specified limits.
+ *
+ * Copyright (C) Miroslav Lichvar <mlichvar@redhat.com> 2017
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <sys/timex.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#define SAMPLES 100
+#define SAMPLE_READINGS 10
+#define MEAN_SAMPLE_INTERVAL 0.1
+#define STEP_INTERVAL 1.0
+#define MAX_PRECISION 500e-9
+#define MAX_FREQ_ERROR 0.02e-6
+#define MAX_STDDEV 50e-9
+
+#ifndef ADJ_SETOFFSET
+ #define ADJ_SETOFFSET 0x0100
+#endif
+
+/* One paired clock reading, as filled in by get_sample(). */
+struct sample {
+ /* CLOCK_MONOTONIC minus CLOCK_MONOTONIC_RAW, less half the read delay (seconds) */
+ double offset;
+ /* CLOCK_MONOTONIC_RAW reading relative to mono_raw_base (seconds) */
+ double time;
+};
+
+static time_t mono_raw_base;
+static time_t mono_base;
+static long user_hz;
+static double precision;
+static double mono_freq_offset;
+
+/* Return @ts1 - @ts2 in seconds. */
+static double diff_timespec(struct timespec *ts1, struct timespec *ts2)
+{
+	time_t whole = ts1->tv_sec - ts2->tv_sec;
+	long frac = ts1->tv_nsec - ts2->tv_nsec;
+
+	return whole + frac / 1e9;
+}
+
+/*
+ * Take SAMPLE_READINGS bracketed readings (RAW, MONOTONIC, RAW) and store in
+ * @sample the one with the smallest delay between its two RAW reads.
+ *
+ * Returns that smallest delay in seconds.
+ */
+static double get_sample(struct sample *sample)
+{
+ double delay, mindelay = 0.0;
+ struct timespec ts1, ts2, ts3;
+ int i;
+
+ for (i = 0; i < SAMPLE_READINGS; i++) {
+ clock_gettime(CLOCK_MONOTONIC_RAW, &ts1);
+ clock_gettime(CLOCK_MONOTONIC, &ts2);
+ clock_gettime(CLOCK_MONOTONIC_RAW, &ts3);
+
+ /* Work relative to the base seconds kept in mono_raw_base/mono_base */
+ ts1.tv_sec -= mono_raw_base;
+ ts2.tv_sec -= mono_base;
+ ts3.tv_sec -= mono_raw_base;
+
+ /* A delay of at most 1 ns is rejected and the reading is repeated */
+ delay = diff_timespec(&ts3, &ts1);
+ if (delay <= 1e-9) {
+ i--;
+ continue;
+ }
+
+ /* Keep the first accepted reading, then any with a smaller delay */
+ if (!i || delay < mindelay) {
+ sample->offset = diff_timespec(&ts2, &ts1);
+ sample->offset -= delay / 2.0;
+ sample->time = ts1.tv_sec + ts1.tv_nsec / 1e9;
+ mindelay = delay;
+ }
+ }
+
+ return mindelay;
+}
+
+/*
+ * Issue an adjtimex() ADJ_SETOFFSET call with a zero time value.
+ * The test is failed through ksft_exit_fail() when the call is rejected.
+ */
+static void reset_ntp_error(void)
+{
+	struct timex txc;
+
+	txc.modes = ADJ_SETOFFSET;
+	txc.time.tv_sec = 0;
+	txc.time.tv_usec = 0;
+
+	if (adjtimex(&txc) >= 0)
+		return;
+
+	perror("[FAIL] adjtimex");
+	ksft_exit_fail();
+}
+
+/*
+ * Program a frequency offset of @freq (a ratio, e.g. 1e-6 for 1 ppm) with
+ * adjtimex(): the part that fits whole tick units goes into the tick
+ * length, the remainder into the frequency field scaled by 2^16.
+ * The test is failed through ksft_exit_fail() when the call is rejected.
+ */
+static void set_frequency(double freq)
+{
+	struct timex txc;
+	int tick_offset = 1e6 * freq / user_hz;
+
+	txc.modes = ADJ_TICK | ADJ_FREQUENCY;
+	txc.tick = 1000000 / user_hz + tick_offset;
+	txc.freq = (1e6 * freq - user_hz * tick_offset) * (1 << 16);
+
+	if (adjtimex(&txc) >= 0)
+		return;
+
+	perror("[FAIL] adjtimex");
+	ksft_exit_fail();
+}
+
+/*
+ * Least-squares straight-line fit of offset against time over the first @n
+ * entries of @samples.
+ *
+ * Outputs: @intercept and @slope of the fitted line, @r_stddev the
+ * root-mean-square of the residuals, and @r_max the largest absolute
+ * residual.
+ */
+static void regress(struct sample *samples, int n, double *intercept,
+ double *slope, double *r_stddev, double *r_max)
+{
+ double x, y, r, x_sum, y_sum, xy_sum, x2_sum, r2_sum;
+ int i;
+
+ x_sum = 0.0, y_sum = 0.0, xy_sum = 0.0, x2_sum = 0.0;
+
+ /* First pass: accumulate the sums needed for the fit */
+ for (i = 0; i < n; i++) {
+ x = samples[i].time;
+ y = samples[i].offset;
+
+ x_sum += x;
+ y_sum += y;
+ xy_sum += x * y;
+ x2_sum += x * x;
+ }
+
+ *slope = (xy_sum - x_sum * y_sum / n) / (x2_sum - x_sum * x_sum / n);
+ *intercept = (y_sum - *slope * x_sum) / n;
+
+ *r_max = 0.0, r2_sum = 0.0;
+
+ /* Second pass: residuals of every sample against the fitted line */
+ for (i = 0; i < n; i++) {
+ x = samples[i].time;
+ y = samples[i].offset;
+ r = fabs(x * *slope + *intercept - y);
+ if (*r_max < r)
+ *r_max = r;
+ r2_sum += r * r;
+ }
+
+ *r_stddev = sqrt(r2_sum / n);
+}
+
+/*
+ * Apply a temporary frequency step of @freq_step on top of @freq_base,
+ * return to @freq_base, collect SAMPLES readings and fit them.
+ *
+ * With @calibration set, the fitted slope over all samples is stored in
+ * mono_freq_offset and 0 is returned. Otherwise the first and second half
+ * of the samples are fitted separately and the second half is checked
+ * against MAX_FREQ_ERROR and MAX_STDDEV.
+ *
+ * Returns 0 on success (or calibration), 1 when a limit is exceeded.
+ */
+static int run_test(int calibration, double freq_base, double freq_step)
+{
+ struct sample samples[SAMPLES];
+ double intercept, slope, stddev1, max1, stddev2, max2;
+ double freq_error1, freq_error2;
+ int i;
+
+ set_frequency(freq_base);
+
+ /* Wait MEAN_SAMPLE_INTERVAL seconds in total, in ten slices */
+ for (i = 0; i < 10; i++)
+ usleep(1e6 * MEAN_SAMPLE_INTERVAL / 10);
+
+ reset_ntp_error();
+
+ set_frequency(freq_base + freq_step);
+
+ /* Hold the stepped frequency for ten randomised sleeps */
+ for (i = 0; i < 10; i++)
+ usleep(rand() % 2000000 * STEP_INTERVAL / 10);
+
+ set_frequency(freq_base);
+
+ /* Sample at random intervals after returning to the base frequency */
+ for (i = 0; i < SAMPLES; i++) {
+ usleep(rand() % 2000000 * MEAN_SAMPLE_INTERVAL);
+ get_sample(&samples[i]);
+ }
+
+ if (calibration) {
+ regress(samples, SAMPLES, &intercept, &slope, &stddev1, &max1);
+ mono_freq_offset = slope;
+ printf("CLOCK_MONOTONIC_RAW frequency offset: %11.3f ppm\n",
+ 1e6 * mono_freq_offset);
+ return 0;
+ }
+
+ /* First interval: samples [0, SAMPLES / 2) */
+ regress(samples, SAMPLES / 2, &intercept, &slope, &stddev1, &max1);
+ freq_error1 = slope * (1.0 - mono_freq_offset) - mono_freq_offset -
+ freq_base;
+
+ /* Second interval: samples [SAMPLES / 2, SAMPLES) */
+ regress(samples + SAMPLES / 2, SAMPLES / 2, &intercept, &slope,
+ &stddev2, &max2);
+ freq_error2 = slope * (1.0 - mono_freq_offset) - mono_freq_offset -
+ freq_base;
+
+ printf("%6.0f %+10.3f %6.0f %7.0f %+10.3f %6.0f %7.0f\t",
+ 1e6 * freq_step,
+ 1e6 * freq_error1, 1e9 * stddev1, 1e9 * max1,
+ 1e6 * freq_error2, 1e9 * stddev2, 1e9 * max2);
+
+ /* Only the second interval decides pass/fail */
+ if (fabs(freq_error2) > MAX_FREQ_ERROR || stddev2 > MAX_STDDEV) {
+ printf("[FAIL]\n");
+ return 1;
+ }
+
+ printf("[OK]\n");
+ return 0;
+}
+
+/*
+ * One-time setup: record the base seconds of both clocks, read the tick
+ * rate, measure the reading precision (skipping the test when it is worse
+ * than MAX_PRECISION), seed rand() and run the calibration pass.
+ */
+static void init_test(void)
+{
+ struct timespec ts;
+ struct sample sample;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts)) {
+ perror("[FAIL] clock_gettime(CLOCK_MONOTONIC_RAW)");
+ ksft_exit_fail();
+ }
+
+ mono_raw_base = ts.tv_sec;
+
+ if (clock_gettime(CLOCK_MONOTONIC, &ts)) {
+ perror("[FAIL] clock_gettime(CLOCK_MONOTONIC)");
+ ksft_exit_fail();
+ }
+
+ mono_base = ts.tv_sec;
+
+ user_hz = sysconf(_SC_CLK_TCK);
+
+ /* Precision is taken as half of the smallest bracketing delay */
+ precision = get_sample(&sample) / 2.0;
+ printf("CLOCK_MONOTONIC_RAW+CLOCK_MONOTONIC precision: %.0f ns\t\t",
+ 1e9 * precision);
+
+ if (precision > MAX_PRECISION)
+ ksft_exit_skip("precision: %.0f ns > MAX_PRECISION: %.0f ns\n",
+ 1e9 * precision, 1e9 * MAX_PRECISION);
+
+ printf("[OK]\n");
+ /* Seed from the CLOCK_MONOTONIC reading taken above */
+ srand(ts.tv_sec ^ ts.tv_nsec);
+
+ /* Calibration pass: no base offset, no step */
+ run_test(1, 0.0, 0.0);
+}
+
+/*
+ * Run five frequency-step tests for each of three step sizes, largest
+ * first, each on top of a random base frequency, then reset the frequency
+ * and report the overall result.
+ */
+int main(int argc, char **argv)
+{
+	int shift, round, fails = 0;
+
+	init_test();
+
+	printf("Checking response to frequency step:\n");
+	printf(" Step 1st interval 2nd interval\n");
+	printf(" Freq Dev Max Freq Dev Max\n");
+
+	/* Step sizes are 10e-6 * 2^shift for shift = 12, 6, 0 */
+	for (shift = 12; shift >= 0; shift -= 6) {
+		for (round = 0; round < 5; round++) {
+			double freq_base = (rand() % (1 << 24) - (1 << 23)) / 65536e6;
+			double freq_step = 10e-6 * (1 << shift);
+
+			fails += run_test(0, freq_base, freq_step);
+		}
+	}
+
+	set_frequency(0.0);
+
+	return fails ? ksft_exit_fail() : ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/inconsistency-check.c b/tools/testing/selftests/timers/inconsistency-check.c
new file mode 100644
index 000000000..022d3ffe3
--- /dev/null
+++ b/tools/testing/selftests/timers/inconsistency-check.c
@@ -0,0 +1,193 @@
+/* Time inconsistency check test
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2003, 2004, 2005, 2012
+ * (C) Copyright Linaro Limited 2015
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc inconsistency-check.c -o inconsistency-check -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include "../kselftest.h"
+
+#define CALLS_PER_LOOP 64
+#define NSEC_PER_SEC 1000000000ULL
+
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+#define CLOCK_HWSPECIFIC 10
+#define CLOCK_TAI 11
+#define NR_CLOCKIDS 12
+
+/*
+ * Translate a numeric clockid into its symbolic name.
+ *
+ * Ids that have no table entry (CLOCK_HWSPECIFIC, negative values and
+ * anything past CLOCK_TAI) are reported as "UNKNOWN_CLOCKID".
+ */
+char *clockstring(int clockid)
+{
+	static char *const names[] = {
+		[CLOCK_REALTIME]		= "CLOCK_REALTIME",
+		[CLOCK_MONOTONIC]		= "CLOCK_MONOTONIC",
+		[CLOCK_PROCESS_CPUTIME_ID]	= "CLOCK_PROCESS_CPUTIME_ID",
+		[CLOCK_THREAD_CPUTIME_ID]	= "CLOCK_THREAD_CPUTIME_ID",
+		[CLOCK_MONOTONIC_RAW]		= "CLOCK_MONOTONIC_RAW",
+		[CLOCK_REALTIME_COARSE]		= "CLOCK_REALTIME_COARSE",
+		[CLOCK_MONOTONIC_COARSE]	= "CLOCK_MONOTONIC_COARSE",
+		[CLOCK_BOOTTIME]		= "CLOCK_BOOTTIME",
+		[CLOCK_REALTIME_ALARM]		= "CLOCK_REALTIME_ALARM",
+		[CLOCK_BOOTTIME_ALARM]		= "CLOCK_BOOTTIME_ALARM",
+		[CLOCK_TAI]			= "CLOCK_TAI",
+	};
+	const int count = sizeof(names) / sizeof(names[0]);
+
+	/* gaps in the designated initializers are NULL and count as unknown */
+	if (clockid >= 0 && clockid < count && names[clockid])
+		return names[clockid];
+	return "UNKNOWN_CLOCKID";
+}
+
+/*
+ * Ordering check for two timestamps: returns 1 when a <= b, 0 otherwise.
+ *
+ * Seconds are compared as unsigned long to avoid false positives on the
+ * 2038 rollover.
+ */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+	unsigned long a_sec = (unsigned long)a.tv_sec;
+	unsigned long b_sec = (unsigned long)b.tv_sec;
+
+	if (a_sec != b_sec)
+		return a_sec < b_sec;
+	return a.tv_nsec <= b.tv_nsec;
+}
+
+
+
+/*
+ * Sample clock_type in bursts of CALLS_PER_LOOP back-to-back readings and
+ * verify that every burst is non-decreasing.
+ *
+ * clock_type: clockid handed to clock_gettime()
+ * seconds:    how long to keep sampling, measured on clock_type itself;
+ *             (unsigned long)-1 means keep going until an inconsistency
+ *             is found
+ *
+ * Returns 0 after printing "[OK]" when no inconsistency was seen.  On the
+ * first inconsistent burst it dumps the burst (the offending pair is
+ * bracketed by dashed lines), prints the backwards delta and "[FAILED]",
+ * and returns -1.
+ */
+int consistency_test(int clock_type, unsigned long seconds)
+{
+	struct timespec list[CALLS_PER_LOOP];
+	int i, inconsistent;
+	long now, then;
+	time_t t;
+	char *start_str;
+
+	clock_gettime(clock_type, &list[0]);
+	now = then = list[0].tv_sec;
+
+	/* timestamp start of test */
+	t = time(0);
+	start_str = ctime(&t);
+
+	while (seconds == (unsigned long)-1 ||
+	       (unsigned long)(now - then) < seconds) {
+		inconsistent = -1;
+
+		/* Fill list */
+		for (i = 0; i < CALLS_PER_LOOP; i++)
+			clock_gettime(clock_type, &list[i]);
+
+		/* Check for inconsistencies; the last one found is reported */
+		for (i = 0; i < CALLS_PER_LOOP - 1; i++)
+			if (!in_order(list[i], list[i+1]))
+				inconsistent = i;
+
+		/* display inconsistency */
+		if (inconsistent >= 0) {
+			unsigned long long delta;
+
+			/* plain "%s": "\%" is not a valid C escape sequence */
+			printf("%s\n", start_str);
+			for (i = 0; i < CALLS_PER_LOOP; i++) {
+				if (i == inconsistent)
+					printf("--------------------\n");
+				/* tv_sec/tv_nsec are signed; print them as such */
+				printf("%lld:%ld\n", (long long)list[i].tv_sec,
+					(long)list[i].tv_nsec);
+				if (i == inconsistent + 1)
+					printf("--------------------\n");
+			}
+			/* earlier sample minus later sample: how far time went back */
+			delta = list[inconsistent].tv_sec * NSEC_PER_SEC;
+			delta += list[inconsistent].tv_nsec;
+			delta -= list[inconsistent+1].tv_sec * NSEC_PER_SEC;
+			delta -= list[inconsistent+1].tv_nsec;
+			printf("Delta: %llu ns\n", delta);
+			fflush(0);
+			/* timestamp inconsistency */
+			t = time(0);
+			printf("%s\n", ctime(&t));
+			printf("[FAILED]\n");
+			return -1;
+		}
+		now = list[0].tv_sec;
+	}
+	printf("[OK]\n");
+	return 0;
+}
+
+
+/*
+ * Run consistency_test() on every clockid the kernel accepts, or on the
+ * single clockid given with -c.  -t sets the per-clock run time in seconds
+ * (default 10).  Clockids rejected by clock_gettime() are skipped silently.
+ */
+int main(int argc, char *argv[])
+{
+	struct timespec probe;
+	int first = CLOCK_REALTIME;
+	int limit = NR_CLOCKIDS;
+	int secs = 10;
+	int id, c;
+
+	/* Process arguments */
+	while ((c = getopt(argc, argv, "t:c:")) != -1) {
+		if (c == 't') {
+			secs = atoi(optarg);
+		} else if (c == 'c') {
+			/* restrict the loop below to exactly this clockid */
+			first = atoi(optarg);
+			limit = first + 1;
+		} else {
+			printf("Usage: %s [-t <secs>] [-c <clockid>]\n", argv[0]);
+			printf(" -t: Number of seconds to run\n");
+			printf(" -c: clockid to use (default, all clockids)\n");
+			exit(-1);
+		}
+	}
+
+	setbuf(stdout, NULL);
+
+	for (id = first; id < limit; id++) {
+		if (id == CLOCK_HWSPECIFIC)
+			continue;
+		if (clock_gettime(id, &probe))
+			continue;
+
+		printf("Consistent %-30s ", clockstring(id));
+		if (consistency_test(id, secs))
+			return ksft_exit_fail();
+	}
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c
new file mode 100644
index 000000000..19e46ed5d
--- /dev/null
+++ b/tools/testing/selftests/timers/leap-a-day.c
@@ -0,0 +1,378 @@
+/* Leap second stress test
+ * by: John Stultz (john.stultz@linaro.org)
+ * (C) Copyright IBM 2012
+ * (C) Copyright 2013, 2015 Linaro Limited
+ * Licensed under the GPLv2
+ *
+ * This test signals the kernel to insert a leap second
+ * every day at midnight GMT. This allows for stressing the
+ * kernel's leap-second behavior, as well as how well applications
+ * handle the leap-second discontinuity.
+ *
+ * Usage: leap-a-day [-s] [-i <num>]
+ *
+ * Options:
+ * -s: Each iteration, set the date to 10 seconds before midnight GMT.
+ * This speeds up the number of leapsecond transitions tested,
+ * but because it calls settimeofday frequently, advancing the
+ * time by 24 hours every ~16 seconds, it may cause application
+ * disruption.
+ *
+ * -i: Number of iterations to run (default: infinite)
+ *
+ * Other notes: Disabling NTP prior to running this is advised, as the two
+ * may conflict in their commands to the kernel.
+ *
+ * To build:
+ * $ gcc leap-a-day.c -o leap-a-day -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <sys/errno.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000ULL
+#define CLOCK_TAI 11
+
+time_t next_leap;
+int error_found;
+
+/*
+ * Compare two timestamps: 1 if a is not later than b, 0 if a is later.
+ * Seconds decide first; nanoseconds only break a tie.
+ */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+	if (a.tv_sec != b.tv_sec)
+		return a.tv_sec < b.tv_sec;
+	return a.tv_nsec <= b.tv_nsec;
+}
+
+/*
+ * Return ts advanced by ns nanoseconds, with tv_nsec normalized to
+ * [0, NSEC_PER_SEC).  ts is expected to be normalized on entry.
+ *
+ * Whole seconds are carried by division so that only a sub-second remainder
+ * is ever added to tv_nsec: tv_nsec is a long and cannot hold an arbitrary
+ * unsigned long long, and the cost no longer grows with the size of ns.
+ */
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+	ts.tv_sec += ns / NSEC_PER_SEC;
+	ts.tv_nsec += ns % NSEC_PER_SEC;
+
+	/* both addends were below one second, so one carry is enough */
+	if (ts.tv_nsec >= (long)NSEC_PER_SEC) {
+		ts.tv_nsec -= (long)NSEC_PER_SEC;
+		ts.tv_sec++;
+	}
+	return ts;
+}
+
+/*
+ * Name of an adjtimex() clock state (TIME_OK ... TIME_BAD); any other value
+ * is reported as "ERROR".
+ */
+char *time_state_str(int state)
+{
+	static char *const names[] = {
+		[TIME_OK]	= "TIME_OK",
+		[TIME_INS]	= "TIME_INS",
+		[TIME_DEL]	= "TIME_DEL",
+		[TIME_OOP]	= "TIME_OOP",
+		[TIME_WAIT]	= "TIME_WAIT",
+		[TIME_BAD]	= "TIME_BAD",
+	};
+	const int count = sizeof(names) / sizeof(names[0]);
+
+	if (state >= 0 && state < count && names[state])
+		return names[state];
+	return "ERROR";
+}
+
+/*
+ * Clear NTP time_status & time_state.
+ *
+ * Issues three adjtimex() calls in order: set the status word to STA_PLL,
+ * zero maxerror, then zero the status word.  Returns the result of the last
+ * call only; the results of the first two are overwritten.
+ */
+int clear_time_state(void)
+{
+ struct timex tx;
+ int ret;
+
+ /*
+ * We have to call adjtime twice here, as kernels
+ * prior to 6b1859dba01c7 (included in 3.5 and
+ * -stable), had an issue with the state machine
+ * and wouldn't clear the STA_INS/DEL flag directly.
+ */
+ tx.modes = ADJ_STATUS;
+ tx.status = STA_PLL;
+ ret = adjtimex(&tx);
+
+ /* Clear maxerror, as it can cause UNSYNC to be set */
+ tx.modes = ADJ_MAXERROR;
+ tx.maxerror = 0;
+ ret = adjtimex(&tx);
+
+ /* Clear the status */
+ tx.modes = ADJ_STATUS;
+ tx.status = 0;
+ ret = adjtimex(&tx);
+
+ return ret;
+}
+
+/*
+ * Make sure we cleanup on ctrl-c: signal handler installed by main() that
+ * clears the NTP state and then exits with status 0.
+ */
+void handler(int unused)
+{
+ clear_time_state();
+ exit(0);
+}
+
+/* Print the adjtimex() snapshot that accompanies a sigalarm() error. */
+static void print_adjtimex(struct timex *tx, int state)
+{
+	printf("adjtimex: %10ld sec + %6ld us (%i)\t%s\n",
+		tx->time.tv_sec,
+		tx->time.tv_usec,
+		tx->tai,
+		time_state_str(state));
+}
+
+/*
+ * Handler for the timer signal armed at next_leap.
+ *
+ * Reads the clock via adjtimex() and sets error_found if the timer fired
+ * before next_leap or if the reported clock state is not TIME_WAIT.  Both
+ * checks are independent, so both errors can be reported for one expiry.
+ */
+void sigalarm(int signo)
+{
+	struct timex tx;
+	int state;
+
+	tx.modes = 0;
+	state = adjtimex(&tx);
+
+	if (tx.time.tv_sec < next_leap) {
+		error_found = 1;
+		printf("Error: Early timer expiration! (Should be %ld)\n", next_leap);
+		print_adjtimex(&tx, state);
+	}
+	if (state != TIME_WAIT) {
+		error_found = 1;
+		printf("Error: Timer seeing incorrect NTP state? (Should be TIME_WAIT)\n");
+		print_adjtimex(&tx, state);
+	}
+}
+
+
+/*
+ * Test for known hrtimer failure.
+ *
+ * Sleeps on CLOCK_REALTIME until an absolute target half a second from now,
+ * then re-reads the clock.  If the clock reads earlier than the target the
+ * sleep returned early: an error is printed and error_found is set.
+ */
+void test_hrtimer_failure(void)
+{
+ struct timespec now, target;
+
+ clock_gettime(CLOCK_REALTIME, &now);
+ target = timespec_add(now, NSEC_PER_SEC/2);
+ clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &target, NULL);
+ clock_gettime(CLOCK_REALTIME, &now);
+
+ /* in_order(target, now) is false only when now < target */
+ if (!in_order(target, now)) {
+ printf("ERROR: hrtimer early expiration failure observed.\n");
+ error_found = 1;
+ }
+}
+
+/*
+ * Repeatedly arm a leap second (alternating insert and delete) for the next
+ * midnight GMT and watch the kernel pass through it.
+ *
+ * Options:
+ *   -w        only set the leap flag and wait for midnight, instead of
+ *             setting the clock to 10 seconds before it
+ *   -i <num>  number of iterations (-1 = infinite, default 10)
+ *   -t        print CLOCK_TAI time while monitoring
+ *
+ * Each iteration: optionally set the time, reset the NTP state, set
+ * STA_INS or STA_DEL, arm an absolute CLOCK_REALTIME timer for the leap
+ * second (checked in sigalarm()), sleep until 3 seconds before it, then
+ * print adjtimex() or TAI state every half second until 2 seconds after it.
+ *
+ * Exits through ksft_exit_pass() after the last iteration, or through
+ * ksft_exit_fail() as soon as a setup step fails or an error was observed.
+ */
+int main(int argc, char **argv)
+{
+	timer_t tm1;
+	struct itimerspec its1;
+	struct sigevent se;
+	struct sigaction act;
+	int signum = SIGRTMAX;
+	int settime = 1;
+	int tai_time = 0;
+	int insert = 1;
+	int iterations = 10;
+	int opt;
+
+	/*
+	 * Process arguments.  The option string has to name every flag that
+	 * has a case below, otherwise getopt() rejects it as unknown.
+	 */
+	while ((opt = getopt(argc, argv, "wti:")) != -1) {
+		switch (opt) {
+		case 'w':
+			printf("Only setting leap-flag, not changing time. It could take up to a day for leap to trigger.\n");
+			settime = 0;
+			break;
+		case 'i':
+			iterations = atoi(optarg);
+			break;
+		case 't':
+			tai_time = 1;
+			break;
+		default:
+			printf("Usage: %s [-w] [-i <iterations>]\n", argv[0]);
+			printf(" -w: Set flag and wait for leap second each iteration");
+			printf(" (default sets time to right before leapsecond)\n");
+			printf(" -i: Number of iterations (-1 = infinite, default is 10)\n");
+			printf(" -t: Print TAI time\n");
+			exit(-1);
+		}
+	}
+
+	/* Make sure TAI support is present if -t was used */
+	if (tai_time) {
+		struct timespec ts;
+
+		if (clock_gettime(CLOCK_TAI, &ts)) {
+			printf("System doesn't support CLOCK_TAI\n");
+			ksft_exit_fail();
+		}
+	}
+
+	/*
+	 * Clean up the NTP state on interrupt or termination.  SIGKILL
+	 * cannot be caught, so SIGTERM is the signal to hook here.
+	 */
+	signal(SIGINT, handler);
+	signal(SIGTERM, handler);
+
+	/* Set up timer signal handler: */
+	sigfillset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = sigalarm;
+	sigaction(signum, &act, NULL);
+
+	if (iterations < 0)
+		printf("This runs continuously. Press ctrl-c to stop\n");
+	else
+		printf("Running for %i iterations. Press ctrl-c to stop\n", iterations);
+
+	printf("\n");
+	while (1) {
+		int ret;
+		struct timespec ts;
+		struct timex tx;
+		time_t now;
+
+		/* Get the current time */
+		clock_gettime(CLOCK_REALTIME, &ts);
+
+		/* Calculate the next possible leap second 23:59:60 GMT */
+		next_leap = ts.tv_sec;
+		next_leap += 86400 - (next_leap % 86400);
+
+		if (settime) {
+			struct timeval tv;
+
+			tv.tv_sec = next_leap - 10;
+			tv.tv_usec = 0;
+			/* without the jump the wait below could last a day */
+			if (settimeofday(&tv, NULL)) {
+				perror("settimeofday");
+				return ksft_exit_fail();
+			}
+			printf("Setting time to %s", ctime(&tv.tv_sec));
+		}
+
+		/* Reset NTP time state */
+		clear_time_state();
+
+		/* Set the leap second insert flag */
+		tx.modes = ADJ_STATUS;
+		if (insert)
+			tx.status = STA_INS;
+		else
+			tx.status = STA_DEL;
+		ret = adjtimex(&tx);
+		if (ret < 0) {
+			printf("Error: Problem setting STA_INS/STA_DEL!: %s\n",
+				time_state_str(ret));
+			return ksft_exit_fail();
+		}
+
+		/* Validate STA_INS was set */
+		tx.modes = 0;
+		ret = adjtimex(&tx);
+		if (tx.status != STA_INS && tx.status != STA_DEL) {
+			printf("Error: STA_INS/STA_DEL not set!: %s\n",
+				time_state_str(ret));
+			return ksft_exit_fail();
+		}
+
+		if (tai_time) {
+			printf("Using TAI time,"
+				" no inconsistencies should be seen!\n");
+		}
+
+		printf("Scheduling leap second for %s", ctime(&next_leap));
+
+		/* Set up timer */
+		printf("Setting timer for %ld - %s", next_leap, ctime(&next_leap));
+		memset(&se, 0, sizeof(se));
+		se.sigev_notify = SIGEV_SIGNAL;
+		se.sigev_signo = signum;
+		se.sigev_value.sival_int = 0;
+		if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) {
+			printf("Error: timer_create failed\n");
+			return ksft_exit_fail();
+		}
+		its1.it_value.tv_sec = next_leap;
+		its1.it_value.tv_nsec = 0;
+		its1.it_interval.tv_sec = 0;
+		its1.it_interval.tv_nsec = 0;
+		timer_settime(tm1, TIMER_ABSTIME, &its1, NULL);
+
+		/* Wake up 3 seconds before leap */
+		ts.tv_sec = next_leap - 3;
+		ts.tv_nsec = 0;
+
+		while (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL))
+			printf("Something woke us up, returning to sleep\n");
+
+		/* Validate STA_INS is still set */
+		tx.modes = 0;
+		ret = adjtimex(&tx);
+		if (tx.status != STA_INS && tx.status != STA_DEL) {
+			printf("Something cleared STA_INS/STA_DEL, setting it again.\n");
+			tx.modes = ADJ_STATUS;
+			if (insert)
+				tx.status = STA_INS;
+			else
+				tx.status = STA_DEL;
+			ret = adjtimex(&tx);
+		}
+
+		/* Check adjtimex output every half second */
+		now = tx.time.tv_sec;
+		while (now < next_leap + 2) {
+			char buf[26];
+			struct timespec tai;
+
+			tx.modes = 0;
+			ret = adjtimex(&tx);
+
+			if (tai_time) {
+				clock_gettime(CLOCK_TAI, &tai);
+				printf("%ld sec, %9ld ns\t%s\n",
+					tai.tv_sec,
+					tai.tv_nsec,
+					time_state_str(ret));
+			} else {
+				ctime_r(&tx.time.tv_sec, buf);
+				buf[strlen(buf)-1] = 0; /* remove trailing \n */
+
+				printf("%s + %6ld us (%i)\t%s\n",
+					buf,
+					tx.time.tv_usec,
+					tx.tai,
+					time_state_str(ret));
+			}
+			now = tx.time.tv_sec;
+			/* Sleep for another half second */
+			ts.tv_sec = 0;
+			ts.tv_nsec = NSEC_PER_SEC / 2;
+			clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, NULL);
+		}
+
+		/* A new timer is created every iteration; release this one */
+		timer_delete(tm1);
+
+		/* Switch to using other mode */
+		insert = !insert;
+
+		/* Note if kernel has known hrtimer failure */
+		test_hrtimer_failure();
+
+		printf("Leap complete\n");
+		if (error_found) {
+			printf("Errors observed\n");
+			clear_time_state();
+			return ksft_exit_fail();
+		}
+		printf("\n");
+		if ((iterations != -1) && !(--iterations))
+			break;
+	}
+
+	clear_time_state();
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/leapcrash.c b/tools/testing/selftests/timers/leapcrash.c
new file mode 100644
index 000000000..dc80728ed
--- /dev/null
+++ b/tools/testing/selftests/timers/leapcrash.c
@@ -0,0 +1,108 @@
+/* Demo leapsecond deadlock
+ * by: John Stultz (john.stultz@linaro.org)
+ * (C) Copyright IBM 2012
+ * (C) Copyright 2013, 2015 Linaro Limited
+ * Licensed under the GPL
+ *
+ * This test demonstrates leapsecond deadlock that is possible
+ * on kernels from 2.6.26 to 3.3.
+ *
+ * WARNING: THIS WILL LIKELY HARDHANG SYSTEMS AND MAY LOSE DATA
+ * RUN AT YOUR OWN RISK!
+ * To build:
+ * $ gcc leapcrash.c -o leapcrash -lrt
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include "../kselftest.h"
+
+/*
+ * Clear NTP time_status & time_state.
+ *
+ * Sets the status word to STA_PLL and then to 0 with two adjtimex() calls.
+ * Returns the result of the second call; the first result is overwritten.
+ */
+int clear_time_state(void)
+{
+ struct timex tx;
+ int ret;
+
+ /*
+ * We have to call adjtime twice here, as kernels
+ * prior to 6b1859dba01c7 (included in 3.5 and
+ * -stable), had an issue with the state machine
+ * and wouldn't clear the STA_INS/DEL flag directly.
+ */
+ tx.modes = ADJ_STATUS;
+ tx.status = STA_PLL;
+ ret = adjtimex(&tx);
+
+ tx.modes = ADJ_STATUS;
+ tx.status = 0;
+ ret = adjtimex(&tx);
+
+ return ret;
+}
+
+/*
+ * Make sure we cleanup on ctrl-c: signal handler installed by main() that
+ * clears the NTP state and then exits with status 0.
+ */
+void handler(int unused)
+{
+ clear_time_state();
+ exit(0);
+}
+
+
+/*
+ * Leap-second stress loop: 20 times, set the clock to 2 seconds before the
+ * next midnight GMT and keep re-setting STA_INS through adjtimex() until the
+ * clock has passed the leap second, then clear the NTP state.
+ *
+ * Needs permission to set the clock.  Exits through ksft_exit_fail() if
+ * settimeofday() or adjtimex() fails, otherwise through ksft_exit_pass().
+ */
+int main(void)
+{
+	struct timex tx;
+	struct timespec ts;
+	time_t next_leap;
+	int count;
+
+	setbuf(stdout, NULL);
+
+	/*
+	 * Clean up the NTP state on interrupt or termination.  SIGKILL
+	 * cannot be caught, so SIGTERM is the signal to hook here.
+	 */
+	signal(SIGINT, handler);
+	signal(SIGTERM, handler);
+	printf("This runs for a few minutes. Press ctrl-c to stop\n");
+
+	clear_time_state();
+
+	/* Get the current time */
+	clock_gettime(CLOCK_REALTIME, &ts);
+
+	/* Calculate the next possible leap second 23:59:60 GMT */
+	next_leap = ts.tv_sec;
+	next_leap += 86400 - (next_leap % 86400);
+
+	for (count = 0; count < 20; count++) {
+		struct timeval tv;
+
+		/* set the time to 2 seconds before the leap */
+		tv.tv_sec = next_leap - 2;
+		tv.tv_usec = 0;
+		if (settimeofday(&tv, NULL)) {
+			printf("Error: You're likely not running with proper (ie: root) permissions\n");
+			return ksft_exit_fail();
+		}
+
+		/* tx.time drives the loop below, so the read must succeed */
+		tx.modes = 0;
+		if (adjtimex(&tx) < 0) {
+			perror("adjtimex");
+			return ksft_exit_fail();
+		}
+
+		/* hammer on adjtime w/ STA_INS */
+		while (tx.time.tv_sec < next_leap + 1) {
+			/* Set the leap second insert flag */
+			tx.modes = ADJ_STATUS;
+			tx.status = STA_INS;
+			/* a failed call leaves tx.time stale: bail out, don't spin */
+			if (adjtimex(&tx) < 0) {
+				perror("adjtimex");
+				clear_time_state();
+				return ksft_exit_fail();
+			}
+		}
+		clear_time_state();
+		printf(".");
+		fflush(stdout);
+	}
+	printf("[OK]\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/mqueue-lat.c b/tools/testing/selftests/timers/mqueue-lat.c
new file mode 100644
index 000000000..7916cf5cc
--- /dev/null
+++ b/tools/testing/selftests/timers/mqueue-lat.c
@@ -0,0 +1,114 @@
+/* Measure mqueue timeout latency
+ * by: john stultz (john.stultz@linaro.org)
+ * (C) Copyright Linaro 2013
+ *
+ * Inspired with permission from example test by:
+ * Romain Francoise <romain@orebokech.com>
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc mqueue-lat.c -o mqueue-lat -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <errno.h>
+#include <mqueue.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define TARGET_TIMEOUT 100000000 /* 100ms in nanoseconds */
+#define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
+
+
+/*
+ * Signed distance in nanoseconds from a to b, i.e. b - a (positive when b
+ * is the later timestamp).
+ */
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+	unsigned long long from = NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+	unsigned long long to = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+	return to - from;
+}
+
+/*
+ * Return ts advanced by ns nanoseconds, with tv_nsec normalized to
+ * [0, NSEC_PER_SEC).  ts is expected to be normalized on entry.
+ *
+ * Whole seconds are carried by division so that only a sub-second remainder
+ * is ever added to tv_nsec: tv_nsec is a long and cannot hold an arbitrary
+ * unsigned long long, and the cost no longer grows with the size of ns.
+ */
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+	ts.tv_sec += ns / NSEC_PER_SEC;
+	ts.tv_nsec += ns % NSEC_PER_SEC;
+
+	/* both addends were below one second, so one carry is enough */
+	if (ts.tv_nsec >= (long)NSEC_PER_SEC) {
+		ts.tv_nsec -= (long)NSEC_PER_SEC;
+		ts.tv_sec++;
+	}
+	return ts;
+}
+
+/*
+ * Measure how long mq_timedreceive() takes to time out on a message queue
+ * that is only read from here.
+ *
+ * Opens (creating if needed) the queue "/foo" and performs 100 receives,
+ * each with an absolute CLOCK_REALTIME deadline TARGET_TIMEOUT ns ahead.
+ * The whole run is timed on CLOCK_MONOTONIC.
+ *
+ * Returns 0 if the average time per receive is at most
+ * TARGET_TIMEOUT + UNRESONABLE_LATENCY ns, and -1 if it is larger or if a
+ * queue operation fails with anything other than ETIMEDOUT.
+ *
+ * NOTE(review): the queue name is never unlinked, so "/foo" stays behind
+ * after the test; left alone here because the queue may pre-exist.
+ */
+int mqueue_lat_test(void)
+{
+	mqd_t q;
+	struct mq_attr attr;
+	struct timespec start, end, now, target;
+	int i, count, ret;
+
+	q = mq_open("/foo", O_CREAT | O_RDONLY, 0666, NULL);
+	if (q < 0) {
+		perror("mq_open");
+		return -1;
+	}
+	/* mq_msgsize sizes the receive buffer below, so it must be valid */
+	if (mq_getattr(q, &attr) < 0) {
+		perror("mq_getattr");
+		mq_close(q);
+		return -1;
+	}
+
+	count = 100;
+	clock_gettime(CLOCK_MONOTONIC, &start);
+
+	for (i = 0; i < count; i++) {
+		char buf[attr.mq_msgsize];
+
+		/* mq_timedreceive() takes an absolute CLOCK_REALTIME deadline */
+		clock_gettime(CLOCK_REALTIME, &now);
+		target = timespec_add(now, TARGET_TIMEOUT); /* 100ms */
+
+		ret = mq_timedreceive(q, buf, sizeof(buf), NULL, &target);
+		if (ret < 0 && errno != ETIMEDOUT) {
+			perror("mq_timedreceive");
+			/* don't leak the descriptor on the error path */
+			mq_close(q);
+			return -1;
+		}
+	}
+	clock_gettime(CLOCK_MONOTONIC, &end);
+
+	mq_close(q);
+
+	if ((timespec_sub(start, end)/count) > TARGET_TIMEOUT + UNRESONABLE_LATENCY)
+		return -1;
+
+	return 0;
+}
+
+/* Run the mqueue timeout latency test and report it as a kselftest result. */
+int main(int argc, char **argv)
+{
+	printf("Mqueue latency : ");
+	fflush(stdout);
+
+	if (mqueue_lat_test() < 0) {
+		printf("[FAILED]\n");
+		return ksft_exit_fail();
+	}
+
+	printf("[OK]\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/nanosleep.c b/tools/testing/selftests/timers/nanosleep.c
new file mode 100644
index 000000000..71b5441c2
--- /dev/null
+++ b/tools/testing/selftests/timers/nanosleep.c
@@ -0,0 +1,165 @@
+/* Make sure timers don't return early
+ * by: john stultz (johnstul@us.ibm.com)
+ * John Stultz (john.stultz@linaro.org)
+ * (C) Copyright IBM 2012
+ * (C) Copyright Linaro 2013 2015
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc nanosleep.c -o nanosleep -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+#define CLOCK_HWSPECIFIC 10
+#define CLOCK_TAI 11
+#define NR_CLOCKIDS 12
+
+#define UNSUPPORTED 0xf00f
+
+/*
+ * Symbolic name for a clockid.  Anything without a table entry -- including
+ * CLOCK_HWSPECIFIC and out-of-range values -- maps to "UNKNOWN_CLOCKID".
+ */
+char *clockstring(int clockid)
+{
+	static char *const names[] = {
+		[CLOCK_REALTIME]		= "CLOCK_REALTIME",
+		[CLOCK_MONOTONIC]		= "CLOCK_MONOTONIC",
+		[CLOCK_PROCESS_CPUTIME_ID]	= "CLOCK_PROCESS_CPUTIME_ID",
+		[CLOCK_THREAD_CPUTIME_ID]	= "CLOCK_THREAD_CPUTIME_ID",
+		[CLOCK_MONOTONIC_RAW]		= "CLOCK_MONOTONIC_RAW",
+		[CLOCK_REALTIME_COARSE]		= "CLOCK_REALTIME_COARSE",
+		[CLOCK_MONOTONIC_COARSE]	= "CLOCK_MONOTONIC_COARSE",
+		[CLOCK_BOOTTIME]		= "CLOCK_BOOTTIME",
+		[CLOCK_REALTIME_ALARM]		= "CLOCK_REALTIME_ALARM",
+		[CLOCK_BOOTTIME_ALARM]		= "CLOCK_BOOTTIME_ALARM",
+		[CLOCK_TAI]			= "CLOCK_TAI",
+	};
+	const int count = sizeof(names) / sizeof(names[0]);
+
+	/* unnamed slots in the table are NULL and count as unknown */
+	if (clockid < 0 || clockid >= count || !names[clockid])
+		return "UNKNOWN_CLOCKID";
+	return names[clockid];
+}
+
+/*
+ * Returns 1 if timestamp a is at or before b, 0 if a is after b.
+ * Nanoseconds are only consulted when the seconds are equal.
+ */
+static inline int in_order(struct timespec a, struct timespec b)
+{
+	if (a.tv_sec == b.tv_sec)
+		return a.tv_nsec <= b.tv_nsec;
+	return a.tv_sec < b.tv_sec;
+}
+
+/*
+ * Return ts advanced by ns nanoseconds, with tv_nsec normalized to
+ * [0, NSEC_PER_SEC).  ts is expected to be normalized on entry.
+ *
+ * Whole seconds are carried by division so that only a sub-second remainder
+ * is ever added to tv_nsec: tv_nsec is a long and cannot hold an arbitrary
+ * unsigned long long, and the cost no longer grows with the size of ns.
+ */
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+	ts.tv_sec += ns / NSEC_PER_SEC;
+	ts.tv_nsec += ns % NSEC_PER_SEC;
+
+	/* both addends were below one second, so one carry is enough */
+	if (ts.tv_nsec >= (long)NSEC_PER_SEC) {
+		ts.tv_nsec -= (long)NSEC_PER_SEC;
+		ts.tv_sec++;
+	}
+	return ts;
+}
+
+/*
+ * Check that clock_nanosleep() on clockid never returns before its target.
+ *
+ * clockid: clock to sleep on
+ * ns:      sleep length in nanoseconds
+ *
+ * Returns UNSUPPORTED if the initial clock_gettime() or the absolute
+ * clock_nanosleep() fails, -1 if either the absolute or the relative sleep
+ * woke up before the computed target time, and 0 otherwise.
+ */
+int nanosleep_test(int clockid, long long ns)
+{
+ struct timespec now, target, rel;
+
+ /* First check abs time */
+ if (clock_gettime(clockid, &now))
+ return UNSUPPORTED;
+ target = timespec_add(now, ns);
+
+ if (clock_nanosleep(clockid, TIMER_ABSTIME, &target, NULL))
+ return UNSUPPORTED;
+ clock_gettime(clockid, &now);
+
+ /* waking before the absolute target is an early return */
+ if (!in_order(target, now))
+ return -1;
+
+ /* Second check reltime */
+ clock_gettime(clockid, &now);
+ /* rel is the relative duration, target the absolute time it should reach */
+ rel.tv_sec = 0;
+ rel.tv_nsec = 0;
+ rel = timespec_add(rel, ns);
+ target = timespec_add(now, ns);
+ clock_nanosleep(clockid, 0, &rel, NULL);
+ clock_gettime(clockid, &now);
+
+ if (!in_order(target, now))
+ return -1;
+ return 0;
+}
+
+/*
+ * For every clockid that can be slept on, run nanosleep_test() with sleep
+ * lengths starting at 10 ns and growing by a factor of 100 while they stay
+ * within 10 seconds.  An unsupported clock is reported and skipped; the
+ * first early wakeup fails the whole selftest.
+ */
+int main(int argc, char **argv)
+{
+	long long length;
+	int clockid, ret;
+
+	for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+		int unsupported = 0;
+
+		/* Skip cputime clockids since nanosleep won't increment cputime */
+		if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
+				clockid == CLOCK_THREAD_CPUTIME_ID ||
+				clockid == CLOCK_HWSPECIFIC)
+			continue;
+
+		printf("Nanosleep %-31s ", clockstring(clockid));
+		fflush(stdout);
+
+		for (length = 10; length <= (NSEC_PER_SEC * 10); length *= 100) {
+			ret = nanosleep_test(clockid, length);
+			if (ret == UNSUPPORTED) {
+				printf("[UNSUPPORTED]\n");
+				unsupported = 1;
+				break;
+			}
+			if (ret < 0) {
+				printf("[FAILED]\n");
+				return ksft_exit_fail();
+			}
+		}
+
+		if (!unsupported)
+			printf("[OK]\n");
+	}
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/nsleep-lat.c b/tools/testing/selftests/timers/nsleep-lat.c
new file mode 100644
index 000000000..eb3e79ed7
--- /dev/null
+++ b/tools/testing/selftests/timers/nsleep-lat.c
@@ -0,0 +1,180 @@
+/* Measure nanosleep timer latency
+ * by: john stultz (john.stultz@linaro.org)
+ * (C) Copyright Linaro 2013
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc nsleep-lat.c -o nsleep-lat -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000ULL
+
+#define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
+
+
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+#define CLOCK_HWSPECIFIC 10
+#define CLOCK_TAI 11
+#define NR_CLOCKIDS 12
+
+#define UNSUPPORTED 0xf00f
+
+/*
+ * Human-readable name of a clockid, or "UNKNOWN_CLOCKID" for ids that have
+ * no entry (CLOCK_HWSPECIFIC, negative ids, ids beyond CLOCK_TAI).
+ */
+char *clockstring(int clockid)
+{
+	static char *const names[] = {
+		[CLOCK_REALTIME]		= "CLOCK_REALTIME",
+		[CLOCK_MONOTONIC]		= "CLOCK_MONOTONIC",
+		[CLOCK_PROCESS_CPUTIME_ID]	= "CLOCK_PROCESS_CPUTIME_ID",
+		[CLOCK_THREAD_CPUTIME_ID]	= "CLOCK_THREAD_CPUTIME_ID",
+		[CLOCK_MONOTONIC_RAW]		= "CLOCK_MONOTONIC_RAW",
+		[CLOCK_REALTIME_COARSE]		= "CLOCK_REALTIME_COARSE",
+		[CLOCK_MONOTONIC_COARSE]	= "CLOCK_MONOTONIC_COARSE",
+		[CLOCK_BOOTTIME]		= "CLOCK_BOOTTIME",
+		[CLOCK_REALTIME_ALARM]		= "CLOCK_REALTIME_ALARM",
+		[CLOCK_BOOTTIME_ALARM]		= "CLOCK_BOOTTIME_ALARM",
+		[CLOCK_TAI]			= "CLOCK_TAI",
+	};
+	char *name = NULL;
+
+	if (clockid >= 0 && clockid < (int)(sizeof(names) / sizeof(names[0])))
+		name = names[clockid];
+
+	/* table gaps are NULL and fall through to the unknown name */
+	return name ? name : "UNKNOWN_CLOCKID";
+}
+
+/*
+ * Return ts advanced by ns nanoseconds, with tv_nsec normalized to
+ * [0, NSEC_PER_SEC).  ts is expected to be normalized on entry.
+ *
+ * Whole seconds are carried by division so that only a sub-second remainder
+ * is ever added to tv_nsec: tv_nsec is a long and cannot hold an arbitrary
+ * unsigned long long, and the cost no longer grows with the size of ns.
+ */
+struct timespec timespec_add(struct timespec ts, unsigned long long ns)
+{
+	ts.tv_sec += ns / NSEC_PER_SEC;
+	ts.tv_nsec += ns % NSEC_PER_SEC;
+
+	/* both addends were below one second, so one carry is enough */
+	if (ts.tv_nsec >= (long)NSEC_PER_SEC) {
+		ts.tv_nsec -= (long)NSEC_PER_SEC;
+		ts.tv_sec++;
+	}
+	return ts;
+}
+
+
+/*
+ * Nanoseconds elapsed from a to b (b - a); negative when b precedes a.
+ */
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+	unsigned long long a_ns = NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+	unsigned long long b_ns = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+
+	return b_ns - a_ns;
+}
+
+/*
+ * Measure the wakeup latency of clock_nanosleep() on clockid for sleeps of
+ * ns nanoseconds, using 10 relative and then 10 absolute sleeps.
+ *
+ * Returns UNSUPPORTED if the initial clock_gettime() or clock_nanosleep()
+ * probe fails, -1 (after printing the measured value) if the average
+ * relative or absolute latency exceeds UNRESONABLE_LATENCY, and 0 otherwise.
+ */
+int nanosleep_lat_test(int clockid, long long ns)
+{
+ struct timespec start, end, target;
+ long long latency = 0;
+ int i, count;
+
+ target.tv_sec = ns/NSEC_PER_SEC;
+ target.tv_nsec = ns%NSEC_PER_SEC;
+
+ /* Probe once: a clock that cannot be read or slept on is unsupported */
+ if (clock_gettime(clockid, &start))
+ return UNSUPPORTED;
+ if (clock_nanosleep(clockid, 0, &target, NULL))
+ return UNSUPPORTED;
+
+ count = 10;
+
+ /* First check relative latency */
+ clock_gettime(clockid, &start);
+ for (i = 0; i < count; i++)
+ clock_nanosleep(clockid, 0, &target, NULL);
+ clock_gettime(clockid, &end);
+
+ /* average time per sleep minus the requested length is the latency */
+ if (((timespec_sub(start, end)/count)-ns) > UNRESONABLE_LATENCY) {
+ printf("Large rel latency: %lld ns :", (timespec_sub(start, end)/count)-ns);
+ return -1;
+ }
+
+ /* Next check absolute latency */
+ for (i = 0; i < count; i++) {
+ clock_gettime(clockid, &start);
+ target = timespec_add(start, ns);
+ clock_nanosleep(clockid, TIMER_ABSTIME, &target, NULL);
+ clock_gettime(clockid, &end);
+ /* how far past the absolute target the wakeup was observed */
+ latency += timespec_sub(target, end);
+ }
+
+ if (latency/count > UNRESONABLE_LATENCY) {
+ printf("Large abs latency: %lld ns :", latency/count);
+ return -1;
+ }
+
+ return 0;
+}
+
+
+
+/*
+ * Run nanosleep_lat_test() on every clockid that can be slept on, with
+ * sleep lengths starting at 10 ns and growing by a factor of 100 while they
+ * stay within 10 seconds.  The first non-zero result ends the run for that
+ * clock: UNSUPPORTED is reported and skipped, a failure ends the selftest.
+ */
+int main(int argc, char **argv)
+{
+	long long length;
+	int clockid, ret;
+
+	for (clockid = CLOCK_REALTIME; clockid < NR_CLOCKIDS; clockid++) {
+
+		/* Skip cputime clockids since nanosleep won't increment cputime */
+		if (clockid == CLOCK_PROCESS_CPUTIME_ID ||
+				clockid == CLOCK_THREAD_CPUTIME_ID ||
+				clockid == CLOCK_HWSPECIFIC)
+			continue;
+
+		printf("nsleep latency %-26s ", clockstring(clockid));
+		fflush(stdout);
+
+		for (length = 10; length <= (NSEC_PER_SEC * 10); length *= 100) {
+			ret = nanosleep_lat_test(clockid, length);
+			if (ret)
+				break;
+		}
+
+		if (ret == UNSUPPORTED) {
+			printf("[UNSUPPORTED]\n");
+		} else if (ret < 0) {
+			printf("[FAILED]\n");
+			return ksft_exit_fail();
+		} else {
+			printf("[OK]\n");
+		}
+	}
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c
new file mode 100644
index 000000000..0ba500056
--- /dev/null
+++ b/tools/testing/selftests/timers/posix_timers.c
@@ -0,0 +1,221 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2013 Red Hat, Inc., Frederic Weisbecker <fweisbec@redhat.com>
+ *
+ * Selftests for a few posix timers interface.
+ *
+ * Kernel loop code stolen from Steven Rostedt <srostedt@redhat.com>
+ */
+
+#include <sys/time.h>
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <time.h>
+#include <pthread.h>
+
+#include "../kselftest.h"
+
+#define DELAY 2
+#define USECS_PER_SEC 1000000
+
+static volatile int done;
+
+/*
+ * Busy loop in userspace to elapse ITIMER_VIRTUAL.
+ * Spins until the signal handler sets the volatile flag `done`.
+ */
+static void user_loop(void)
+{
+ while (!done);
+}
+
+/*
+ * Try to spend as much time as possible in kernelspace
+ * to elapse ITIMER_PROF.
+ *
+ * Repeatedly moves the program break 4096 bytes up and back down with
+ * brk() until `done` is set or one of the brk() calls reports an error.
+ */
+static void kernel_loop(void)
+{
+ /* current program break, used as the base for every brk() call */
+ void *addr = sbrk(0);
+ int err = 0;
+
+ while (!done && !err) {
+ err = brk(addr + 4096);
+ err |= brk(addr);
+ }
+}
+
+/*
+ * Sleep until ITIMER_REAL expiration.
+ * pause() returns once a signal handler has run.
+ */
+static void idle_loop(void)
+{
+ pause();
+}
+
+/* Timer signal handler: set `done` so the waiting loops stop. */
+static void sig_handler(int nr)
+{
+ done = 1;
+}
+
+/*
+ * Check the expected timer expiration matches the GTOD elapsed delta since
+ * we armed the timer. Keep a 0.5 sec error margin due to various jitter.
+ *
+ * start, end: gettimeofday() readings taken before arming the timer and
+ *             after it expired
+ *
+ * Returns 0 if the elapsed time is within 0.5 sec of DELAY seconds,
+ * otherwise prints the elapsed microseconds and returns -1.
+ */
+static int check_diff(struct timeval start, struct timeval end)
+{
+	long long diff, error;
+
+	diff = end.tv_usec - start.tv_usec;
+	/* widen before multiplying so the product is computed in long long */
+	diff += (long long)(end.tv_sec - start.tv_sec) * USECS_PER_SEC;
+
+	/*
+	 * Take the magnitude by hand: abs() operates on int and would
+	 * truncate a long long difference.
+	 */
+	error = diff - DELAY * USECS_PER_SEC;
+	if (error < 0)
+		error = -error;
+
+	if (error > USECS_PER_SEC / 2) {
+		printf("Diff too high: %lld..", diff);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Arm the interval timer `which` for DELAY seconds, wait for it in the way
+ * that makes that timer tick (user-space spin, kernel-space spin or
+ * sleeping) and compare the wall-clock time spent against DELAY.
+ *
+ * Returns -1 only if gettimeofday() or setitimer() fails.  A wrong elapsed
+ * time is printed as "[FAIL]" but the function still returns 0.
+ */
+static int check_itimer(int which)
+{
+	struct itimerval val = {
+		.it_value.tv_sec = DELAY,
+	};
+	struct timeval start, end;
+	void (*wait_for_expiry)(void) = NULL;
+	const char *label = NULL;
+	int signo = 0;
+
+	/* pick the label, expiry signal and waiting strategy for this timer */
+	switch (which) {
+	case ITIMER_VIRTUAL:
+		label = "virtual... ";
+		signo = SIGVTALRM;
+		wait_for_expiry = user_loop;
+		break;
+	case ITIMER_PROF:
+		label = "prof... ";
+		signo = SIGPROF;
+		wait_for_expiry = kernel_loop;
+		break;
+	case ITIMER_REAL:
+		label = "real... ";
+		signo = SIGALRM;
+		wait_for_expiry = idle_loop;
+		break;
+	}
+
+	printf("Check itimer ");
+	if (label)
+		printf("%s", label);
+	fflush(stdout);
+
+	done = 0;
+	if (signo)
+		signal(signo, sig_handler);
+
+	if (gettimeofday(&start, NULL) < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	if (setitimer(which, &val, NULL) < 0) {
+		perror("Can't set timer\n");
+		return -1;
+	}
+
+	if (wait_for_expiry)
+		wait_for_expiry();
+
+	if (gettimeofday(&end, NULL) < 0) {
+		perror("Can't call gettimeofday()\n");
+		return -1;
+	}
+
+	if (check_diff(start, end))
+		printf("[FAIL]\n");
+	else
+		printf("[OK]\n");
+
+	return 0;
+}
+
+/*
+ * Create a POSIX timer on clock `which`, arm it for DELAY seconds, spin in
+ * user space until SIGALRM arrives and compare the wall-clock time spent
+ * against DELAY.
+ *
+ * Returns -1 only if timer_create(), timer_settime() or gettimeofday()
+ * fails.  A wrong elapsed time is printed as "[FAIL]" but still returns 0.
+ * NOTE(review): that makes a timing failure invisible to the caller --
+ * confirm this is intended.
+ * NOTE(review): the timer is never deleted in this function.
+ */
+static int check_timer_create(int which)
+{
+ int err;
+ timer_t id;
+ struct timeval start, end;
+ struct itimerspec val = {
+ .it_value.tv_sec = DELAY,
+ };
+
+ printf("Check timer_create() ");
+ if (which == CLOCK_THREAD_CPUTIME_ID) {
+ printf("per thread... ");
+ } else if (which == CLOCK_PROCESS_CPUTIME_ID) {
+ printf("per process... ");
+ }
+ fflush(stdout);
+
+ done = 0;
+ /* NULL sigevent: rely on the default notification, handled as SIGALRM below */
+ err = timer_create(which, NULL, &id);
+ if (err < 0) {
+ perror("Can't create timer\n");
+ return -1;
+ }
+ signal(SIGALRM, sig_handler);
+
+ err = gettimeofday(&start, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ err = timer_settime(id, 0, &val, NULL);
+ if (err < 0) {
+ perror("Can't set timer\n");
+ return -1;
+ }
+
+ /* burn CPU in user space until sig_handler() sets `done` */
+ user_loop();
+
+ err = gettimeofday(&end, NULL);
+ if (err < 0) {
+ perror("Can't call gettimeofday()\n");
+ return -1;
+ }
+
+ if (!check_diff(start, end))
+ printf("[OK]\n");
+ else
+ printf("[FAIL]\n");
+
+ return 0;
+}
+
+/*
+ * Run the three itimer checks and the two timer_create() checks in order.
+ * The first check that returns a negative value ends the run with a
+ * kselftest failure; otherwise the selftest passes.
+ */
+int main(int argc, char **argv)
+{
+	printf("Testing posix timers. False negative may happen on CPU execution \n");
+	printf("based timers if other threads run on the CPU...\n");
+
+	/*
+	 * About the final CLOCK_PROCESS_CPUTIME_ID check:
+	 * It's unfortunately hard to reliably test a timer expiration
+	 * on parallel multithread cputime. We could arm it to expire
+	 * on DELAY * nr_threads, with nr_threads busy looping, then wait
+	 * the normal DELAY since the time is elapsing nr_threads faster.
+	 * But for that we need to ensure we have real physical free CPUs
+	 * to ensure true parallelism. So test only one thread until we
+	 * find a better solution.
+	 */
+	if (check_itimer(ITIMER_VIRTUAL) < 0 ||
+	    check_itimer(ITIMER_PROF) < 0 ||
+	    check_itimer(ITIMER_REAL) < 0 ||
+	    check_timer_create(CLOCK_THREAD_CPUTIME_ID) < 0 ||
+	    check_timer_create(CLOCK_PROCESS_CPUTIME_ID) < 0)
+		return ksft_exit_fail();
+
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c
new file mode 100644
index 000000000..b41d8dd0c
--- /dev/null
+++ b/tools/testing/selftests/timers/raw_skew.c
@@ -0,0 +1,148 @@
+/* CLOCK_MONOTONIC vs CLOCK_MONOTONIC_RAW skew test
+ * by: john stultz (johnstul@us.ibm.com)
+ * John Stultz <john.stultz@linaro.org>
+ * (C) Copyright IBM 2012
+ * (C) Copyright Linaro Limited 2015
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc raw_skew.c -o raw_skew -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include "../kselftest.h"
+
+#define CLOCK_MONOTONIC_RAW 4
+#define NSEC_PER_SEC 1000000000LL
+
+/*
+ * Right-shift x by s bits.  A negative x is negated, shifted and negated
+ * again, so the shift operator itself is only applied to a non-negative
+ * value.  Both arguments are evaluated exactly once.
+ */
+#define shift_right(x, s) ({ \
+ __typeof__(x) __x = (x); \
+ __typeof__(s) __s = (s); \
+ __x < 0 ? -(-__x >> __s) : __x >> __s; \
+})
+
+/* Absolute value of a long long. */
+long long llabs(long long val)
+{
+	return val < 0 ? -val : val;
+}
+
+/* Flatten a timespec into a single nanosecond count. */
+unsigned long long ts_to_nsec(struct timespec ts)
+{
+	unsigned long long total_ns;
+
+	total_ns = ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
+	return total_ns;
+}
+
+/* Split a nanosecond count into whole seconds and leftover nanoseconds. */
+struct timespec nsec_to_ts(long long ns)
+{
+	struct timespec out = {
+		.tv_sec = ns / NSEC_PER_SEC,
+		.tv_nsec = ns % NSEC_PER_SEC,
+	};
+
+	return out;
+}
+
+/* Signed difference end - start, in nanoseconds. */
+long long diff_timespec(struct timespec start, struct timespec end)
+{
+	long long from = ts_to_nsec(start);
+	long long to = ts_to_nsec(end);
+
+	return to - from;
+}
+
+/*
+ * Take a closely spaced CLOCK_MONOTONIC / CLOCK_MONOTONIC_RAW sample pair.
+ *
+ * Three MONOTONIC, RAW, MONOTONIC read sequences are made.  The sequence
+ * whose two MONOTONIC reads are closest together is kept: *raw receives its
+ * RAW reading and *mon the midpoint of its two MONOTONIC readings.
+ */
+void get_monotonic_and_raw(struct timespec *mon, struct timespec *raw)
+{
+ struct timespec start, mid, end;
+ long long diff = 0, tmp;
+ int i;
+
+ for (i = 0; i < 3; i++) {
+ long long newdiff;
+
+ /* the RAW read is bracketed by the two MONOTONIC reads */
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ clock_gettime(CLOCK_MONOTONIC_RAW, &mid);
+ clock_gettime(CLOCK_MONOTONIC, &end);
+
+ newdiff = diff_timespec(start, end);
+ /* diff == 0 means no sample has been stored yet */
+ if (diff == 0 || newdiff < diff) {
+ diff = newdiff;
+ *raw = mid;
+ tmp = (ts_to_nsec(start) + ts_to_nsec(end))/2;
+ *mon = nsec_to_ts(tmp);
+ }
+ }
+}
+
+/*
+ * Estimate the drift between CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW over a
+ * 120 second window and compare it with the frequency correction that
+ * adjtimex() reports.
+ *
+ * Returns -1 if CLOCK_MONOTONIC_RAW cannot be read.  Otherwise the test
+ * passes when the estimated and reported values differ by at most 1000
+ * (in the ppm * 1000 units used below).  On a larger difference it is
+ * skipped if the adjtimex() offset/freq/tick values show an adjustment was
+ * in progress or changed during the run, and fails otherwise.
+ */
+int main(int argv, char **argc)
+{
+	struct timespec mon, raw, start, end;
+	long long delta1, delta2, interval, eppm, ppm;
+	struct timex tx1, tx2;
+
+	setbuf(stdout, NULL);
+
+	if (clock_gettime(CLOCK_MONOTONIC_RAW, &raw)) {
+		printf("ERR: NO CLOCK_MONOTONIC_RAW\n");
+		return -1;
+	}
+
+	/* modes == 0 makes adjtimex() a pure read of the current state */
+	tx1.modes = 0;
+	adjtimex(&tx1);
+	get_monotonic_and_raw(&mon, &raw);
+	start = mon;
+	delta1 = diff_timespec(mon, raw);
+
+	if (tx1.offset)
+		printf("WARNING: ADJ_OFFSET in progress, this will cause inaccurate results\n");
+
+	printf("Estimating clock drift: ");
+	fflush(stdout);
+	sleep(120);
+
+	get_monotonic_and_raw(&mon, &raw);
+	end = mon;
+	tx2.modes = 0;
+	adjtimex(&tx2);
+	delta2 = diff_timespec(mon, raw);
+
+	interval = diff_timespec(start, end);
+
+	/* calculate measured ppm between MONOTONIC and MONOTONIC_RAW */
+	eppm = ((delta2-delta1)*NSEC_PER_SEC)/interval;
+	eppm = -eppm;
+	printf("%lld.%i(est)", eppm/1000, abs((int)(eppm%1000)));
+
+	/*
+	 * Avg the two actual freq samples adjtimex gave us.  The sum is
+	 * widened to long long before scaling so it cannot overflow a
+	 * 32-bit long, and the average is no longer overwritten by
+	 * tx1.freq alone.
+	 */
+	ppm = ((long long)tx1.freq + tx2.freq) * 1000 / 2;
+	/* adjtimex() reports freq with a 16-bit binary fraction */
+	ppm = shift_right(ppm, 16);
+	printf(" %lld.%i(act)", ppm/1000, abs((int)(ppm%1000)));
+
+	if (llabs(eppm - ppm) > 1000) {
+		if (tx1.offset || tx2.offset ||
+		    tx1.freq != tx2.freq || tx1.tick != tx2.tick) {
+			printf(" [SKIP]\n");
+			return ksft_exit_skip("The clock was adjusted externally. Shutdown NTPd or other time sync daemons\n");
+		}
+		printf(" [FAILED]\n");
+		return ksft_exit_fail();
+	}
+	printf(" [OK]\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/rtcpie.c b/tools/testing/selftests/timers/rtcpie.c
new file mode 100644
index 000000000..4ef2184f1
--- /dev/null
+++ b/tools/testing/selftests/timers/rtcpie.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Real Time Clock Periodic Interrupt test program
+ *
+ * Since commit 6610e0893b8bc ("RTC: Rework RTC code to use timerqueue for
+ * events"), PIE are completely handled using hrtimers, without actually using
+ * any underlying hardware RTC.
+ *
+ */
+
+#include <stdio.h>
+#include <linux/rtc.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include "../kselftest.h"
+
+/*
+ * This expects the new RTC class driver framework, working with
+ * clocks that will often not be clones of what the PC-AT had.
+ * Use the command line to specify another RTC if you need one.
+ */
+static const char default_rtc[] = "/dev/rtc0";
+
+/*
+ * Exercise RTC periodic interrupts (PIE) on an RTC character device.
+ *
+ * With no argument the default device is used and the test is skipped when
+ * it cannot be opened; a single argument names another device.  For each
+ * rate from 2Hz to 64Hz (doubling each step) twenty interrupts are read,
+ * and each read must complete within 110% of the programmed period.
+ *
+ * The periodic rate read at startup is written back before exit, but only
+ * if that initial read succeeded.
+ *
+ * Returns 0 on completion (including when the device reports no PIE support
+ * or a fixed rate), 1 on bad usage, and exits with errno or -1 on errors.
+ */
+int main(int argc, char **argv)
+{
+	int i, fd, retval;
+	/* set once old_pie_rate holds a value really read from the device */
+	int have_old_rate = 0;
+	unsigned long tmp, data, old_pie_rate;
+	const char *rtc = default_rtc;
+	struct timeval start, end, diff;
+
+	switch (argc) {
+	case 2:
+		rtc = argv[1];
+		break;
+	case 1:
+		/* probe the default device so a missing RTC is a skip, not a failure */
+		fd = open(default_rtc, O_RDONLY);
+		if (fd == -1) {
+			printf("Default RTC %s does not exist. Test Skipped!\n", default_rtc);
+			exit(KSFT_SKIP);
+		}
+		close(fd);
+		break;
+	default:
+		fprintf(stderr, "usage: rtctest [rtcdev] [d]\n");
+		return 1;
+	}
+
+	fd = open(rtc, O_RDONLY);
+
+	if (fd == -1) {
+		perror(rtc);
+		exit(errno);
+	}
+
+	/* Read periodic IRQ rate */
+	retval = ioctl(fd, RTC_IRQP_READ, &old_pie_rate);
+	if (retval == -1) {
+		/* not all RTCs support periodic IRQs */
+		if (errno == EINVAL) {
+			fprintf(stderr, "\nNo periodic IRQ support\n");
+			goto done;
+		}
+		perror("RTC_IRQP_READ ioctl");
+		exit(errno);
+	}
+	have_old_rate = 1;
+	fprintf(stderr, "\nPeriodic IRQ rate is %luHz.\n", old_pie_rate);
+
+	fprintf(stderr, "Counting 20 interrupts at:");
+	fflush(stderr);
+
+	/* The frequencies 128Hz, 256Hz, ... 8192Hz are only allowed for root. */
+	for (tmp = 2; tmp <= 64; tmp *= 2) {
+
+		retval = ioctl(fd, RTC_IRQP_SET, tmp);
+		if (retval == -1) {
+			/* not all RTCs can change their periodic IRQ rate */
+			if (errno == EINVAL) {
+				fprintf(stderr,
+					"\n...Periodic IRQ rate is fixed\n");
+				goto done;
+			}
+			perror("RTC_IRQP_SET ioctl");
+			exit(errno);
+		}
+
+		fprintf(stderr, "\n%luHz:\t", tmp);
+		fflush(stderr);
+
+		/* Enable periodic interrupts */
+		retval = ioctl(fd, RTC_PIE_ON, 0);
+		if (retval == -1) {
+			perror("RTC_PIE_ON ioctl");
+			exit(errno);
+		}
+
+		for (i = 1; i < 21; i++) {
+			gettimeofday(&start, NULL);
+			/* This blocks */
+			retval = read(fd, &data, sizeof(unsigned long));
+			if (retval == -1) {
+				perror("read");
+				exit(errno);
+			}
+			gettimeofday(&end, NULL);
+			timersub(&end, &start, &diff);
+			/* allow 10% slack over the nominal period of 1/tmp seconds */
+			if (diff.tv_sec > 0 ||
+			    diff.tv_usec > ((1000000L / tmp) * 1.10)) {
+				fprintf(stderr, "\nPIE delta error: %ld.%06ld should be close to 0.%06lu\n",
+					diff.tv_sec, diff.tv_usec,
+					(1000000L / tmp));
+				fflush(stderr);
+				exit(-1);
+			}
+
+			fprintf(stderr, " %d", i);
+			fflush(stderr);
+		}
+
+		/* Disable periodic interrupts */
+		retval = ioctl(fd, RTC_PIE_OFF, 0);
+		if (retval == -1) {
+			perror("RTC_PIE_OFF ioctl");
+			exit(errno);
+		}
+	}
+
+done:
+	/* never write back a rate that was not successfully read */
+	if (have_old_rate)
+		ioctl(fd, RTC_IRQP_SET, old_pie_rate);
+
+	fprintf(stderr, "\n\n\t\t\t *** Test complete ***\n");
+
+	close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/timers/set-2038.c b/tools/testing/selftests/timers/set-2038.c
new file mode 100644
index 000000000..688cfd81b
--- /dev/null
+++ b/tools/testing/selftests/timers/set-2038.c
@@ -0,0 +1,133 @@
+/* Time bounds setting test
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2012
+ * Licensed under the GPLv2
+ *
+ * NOTE: This is a meta-test which sets the time to edge cases then
+ * uses other tests to detect problems. Thus this test requires that
+ * the inconsistency-check and nanosleep tests be present in the same
+ * directory it is run from.
+ *
+ * To build:
+ * $ gcc set-2038.c -o set-2038 -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000LL
+
+#define KTIME_MAX ((long long)~((unsigned long long)1 << 63))
+#define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC)
+
+#define YEAR_1901 (-0x7fffffffL)
+#define YEAR_1970 1
+#define YEAR_2038 0x7fffffffL /*overflows 32bit time_t */
+#define YEAR_2262 KTIME_SEC_MAX /*overflows 64bit ktime_t */
+#define YEAR_MAX ((long long)((1ULL<<63)-1)) /*overflows 64bit time_t */
+
+/* Report whether `long` is four bytes wide on this build. */
+int is32bits(void)
+{
+	if (sizeof(long) == 4)
+		return 1;
+	return 0;
+}
+
+/*
+ * Set the system clock to `time` seconds (microseconds zeroed), log the
+ * attempt, and return whatever settimeofday() returned.
+ */
+int settime(long long time)
+{
+	struct timeval tv;
+	int rc;
+
+	tv.tv_usec = 0;
+	tv.tv_sec = (time_t)time;
+	rc = settimeofday(&tv, NULL);
+
+	printf("Setting time to 0x%lx: %d\n", (long)time, rc);
+	return rc;
+}
+
+/*
+ * Run the helper tests from the current directory and OR their system()
+ * results together; a non-zero return means at least one reported failure.
+ */
+int do_tests(void)
+{
+	static const char * const checks[] = {
+		"./inconsistency-check -c 0 -t 20",
+		"./nanosleep",
+		"./nsleep-lat",
+	};
+	unsigned int n;
+	int status;
+
+	/* "date" only logs the current clock setting; its status is dropped */
+	status = system("date");
+	status = 0;
+	for (n = 0; n < sizeof(checks) / sizeof(checks[0]); n++)
+		status |= system(checks[n]);
+	return status;
+}
+
+/*
+ * Move the system clock to a series of edge values and run the helper
+ * tests at each one.
+ *
+ * First checks that settime() rejects YEAR_1901, YEAR_MAX and (when long is
+ * not 4 bytes) YEAR_2262, then runs do_tests() near YEAR_1970 and near
+ * YEAR_2038.  The later steps are skipped when long is 4 bytes unless -d
+ * was given.  The clock is set back to the value sampled at startup before
+ * the result is reported.
+ */
+int main(int argc, char *argv[])
+{
+ int ret = 0;
+ int opt, dangerous = 0;
+ time_t start;
+
+ /* Process arguments */
+ while ((opt = getopt(argc, argv, "d")) != -1) {
+ switch (opt) {
+ case 'd':
+ dangerous = 1;
+ }
+ }
+
+ /* remembered so the clock can be put back at "out" */
+ start = time(0);
+
+ /* First test that crazy values don't work */
+ /* settime() returning 0 means the kernel accepted the value: a failure here */
+ if (!settime(YEAR_1901)) {
+ ret = -1;
+ goto out;
+ }
+ if (!settime(YEAR_MAX)) {
+ ret = -1;
+ goto out;
+ }
+ if (!is32bits() && !settime(YEAR_2262)) {
+ ret = -1;
+ goto out;
+ }
+
+ /* Now test behavior near edges */
+ settime(YEAR_1970);
+ ret = do_tests();
+ if (ret)
+ goto out;
+
+ settime(YEAR_2038 - 600);
+ ret = do_tests();
+ if (ret)
+ goto out;
+
+ /* The rest of the tests can blowup on 32bit systems */
+ if (is32bits() && !dangerous)
+ goto out;
+ /* Test rollover behavior 32bit edge */
+ settime(YEAR_2038 - 10);
+ ret = do_tests();
+ if (ret)
+ goto out;
+
+ settime(YEAR_2262 - 600);
+ ret = do_tests();
+
+out:
+ /* restore clock */
+ /* NOTE(review): this is the startup time; time spent in the tests is not added back */
+ settime(start);
+ if (ret)
+ return ksft_exit_fail();
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/set-tai.c b/tools/testing/selftests/timers/set-tai.c
new file mode 100644
index 000000000..8c4179ee2
--- /dev/null
+++ b/tools/testing/selftests/timers/set-tai.c
@@ -0,0 +1,69 @@
+/* Set tai offset
+ * by: John Stultz <john.stultz@linaro.org>
+ * (C) Copyright Linaro 2013
+ * Licensed under the GPLv2
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+/*
+ * Ask adjtimex() to set the TAI offset to `offset` and return the
+ * adjtimex() result unchanged.
+ */
+int set_tai(int offset)
+{
+	struct timex request;
+	int rc;
+
+	memset(&request, 0, sizeof(request));
+	request.constant = offset;
+	request.modes = ADJ_TAI;
+
+	rc = adjtimex(&request);
+	return rc;
+}
+
+/*
+ * Query adjtimex() with no modes set and return the `tai` field it fills
+ * in.  The adjtimex() return value itself is not checked.
+ */
+int get_tai(void)
+{
+	struct timex query;
+	int current;
+
+	memset(&query, 0, sizeof(query));
+	adjtimex(&query);
+
+	current = query.tai;
+	return current;
+}
+
+/*
+ * Check that every TAI offset from 1 to 60 can be set and read back.
+ *
+ * The offset found at startup is written back before exiting, on both the
+ * pass and the fail path, so the test does not leave the system at the last
+ * value it tried.  A failing adjtimex() call is reported directly rather
+ * than only through the read-back mismatch.
+ */
+int main(int argc, char **argv)
+{
+	int i, ret, orig;
+
+	orig = get_tai();
+	printf("tai offset started at %i\n", orig);
+
+	printf("Checking tai offsets can be properly set: ");
+	fflush(stdout);
+	for (i = 1; i <= 60; i++) {
+		/* set_tai() passes the adjtimex() result through; negative means error */
+		if (set_tai(i) < 0) {
+			printf("[FAILED] could not set tai offset %i\n", i);
+			goto fail;
+		}
+		ret = get_tai();
+		if (ret != i) {
+			printf("[FAILED] expected: %i got %i\n", i, ret);
+			goto fail;
+		}
+	}
+	printf("[OK]\n");
+	set_tai(orig);
+	return ksft_exit_pass();
+
+fail:
+	/* best effort: put the original offset back before reporting failure */
+	set_tai(orig);
+	return ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/timers/set-timer-lat.c b/tools/testing/selftests/timers/set-timer-lat.c
new file mode 100644
index 000000000..50da45437
--- /dev/null
+++ b/tools/testing/selftests/timers/set-timer-lat.c
@@ -0,0 +1,283 @@
+/* set_timer latency test
+ * John Stultz (john.stultz@linaro.org)
+ * (C) Copyright Linaro 2014
+ * Licensed under the GPLv2
+ *
+ * This test makes sure the set_timer api is correct
+ *
+ * To build:
+ * $ gcc set-timer-lat.c -o set-timer-lat -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include "../kselftest.h"
+
+#define CLOCK_REALTIME 0
+#define CLOCK_MONOTONIC 1
+#define CLOCK_PROCESS_CPUTIME_ID 2
+#define CLOCK_THREAD_CPUTIME_ID 3
+#define CLOCK_MONOTONIC_RAW 4
+#define CLOCK_REALTIME_COARSE 5
+#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_BOOTTIME 7
+#define CLOCK_REALTIME_ALARM 8
+#define CLOCK_BOOTTIME_ALARM 9
+#define CLOCK_HWSPECIFIC 10
+#define CLOCK_TAI 11
+#define NR_CLOCKIDS 12
+
+
+#define NSEC_PER_SEC 1000000000ULL
+#define UNRESONABLE_LATENCY 40000000 /* 40ms in nanosecs */
+
+#define TIMER_SECS 1
+int alarmcount;
+int clock_id;
+struct timespec start_time;
+long long max_latency_ns;
+int timer_fired_early;
+
+/*
+ * Map a clock id to its symbolic name.  Ids without an entry in the table
+ * (including negative and out-of-range values) yield "UNKNOWN_CLOCKID".
+ */
+char *clockstring(int clockid)
+{
+	static char * const names[] = {
+		[CLOCK_REALTIME]		= "CLOCK_REALTIME",
+		[CLOCK_MONOTONIC]		= "CLOCK_MONOTONIC",
+		[CLOCK_PROCESS_CPUTIME_ID]	= "CLOCK_PROCESS_CPUTIME_ID",
+		[CLOCK_THREAD_CPUTIME_ID]	= "CLOCK_THREAD_CPUTIME_ID",
+		[CLOCK_MONOTONIC_RAW]		= "CLOCK_MONOTONIC_RAW",
+		[CLOCK_REALTIME_COARSE]		= "CLOCK_REALTIME_COARSE",
+		[CLOCK_MONOTONIC_COARSE]	= "CLOCK_MONOTONIC_COARSE",
+		[CLOCK_BOOTTIME]		= "CLOCK_BOOTTIME",
+		[CLOCK_REALTIME_ALARM]		= "CLOCK_REALTIME_ALARM",
+		[CLOCK_BOOTTIME_ALARM]		= "CLOCK_BOOTTIME_ALARM",
+		[CLOCK_TAI]			= "CLOCK_TAI",
+	};
+	const int count = sizeof(names) / sizeof(names[0]);
+
+	/* gaps in the table are NULL and fall through to the default */
+	if (clockid >= 0 && clockid < count && names[clockid])
+		return names[clockid];
+	return "UNKNOWN_CLOCKID";
+}
+
+
+/* Nanoseconds from a to b (b - a); negative when b precedes a. */
+long long timespec_sub(struct timespec a, struct timespec b)
+{
+	long long delta;
+
+	delta = NSEC_PER_SEC * b.tv_sec + b.tv_nsec;
+	delta -= NSEC_PER_SEC * a.tv_sec + a.tv_nsec;
+
+	return delta;
+}
+
+
+/*
+ * Timer signal handler: count the expiry and record how late (or early) it
+ * arrived relative to start_time + TIMER_SECS * alarmcount, measured on the
+ * clock selected by the global clock_id.
+ */
+void sigalarm(int signo)
+{
+ long long delta_ns;
+ struct timespec ts;
+
+ clock_gettime(clock_id, &ts);
+ alarmcount++;
+
+ /* elapsed time since the timer was armed, minus the expected elapsed time */
+ delta_ns = timespec_sub(start_time, ts);
+ delta_ns -= NSEC_PER_SEC * TIMER_SECS * alarmcount;
+
+ if (delta_ns < 0)
+ timer_fired_early = 1;
+
+ /* keep the worst latency seen so far */
+ if (delta_ns > max_latency_ns)
+ max_latency_ns = delta_ns;
+}
+
+/*
+ * Print the line prefix shared by all result lines: the name of the global
+ * clock_id, whether the timer is absolute or relative, and whether it is
+ * periodic or one-shot.
+ */
+void describe_timer(int flags, int interval)
+{
+	const char *mode = flags ? "ABSTIME" : "RELTIME";
+	const char *kind = interval ? "PERIODIC" : "ONE-SHOT";
+
+	printf("%-22s %s %s ", clockstring(clock_id), mode, kind);
+}
+
+/*
+ * Create a SIGRTMAX-signalling timer on clock_id and arm it to expire
+ * TIMER_SECS from now, either relative (flags == 0) or absolute
+ * (flags != 0), repeating every `interval` seconds when interval is
+ * non-zero.  The global latency/early/count statistics are reset and
+ * start_time is sampled from the same clock.
+ *
+ * Returns 0 with *tm1 armed, 1 when an alarm clock could not be created
+ * (reported as unsupported; the caller must not wait for it), or -1 on any
+ * other failure.  On -1 no timer is left allocated.
+ */
+int setup_timer(int clock_id, int flags, int interval, timer_t *tm1)
+{
+	struct sigevent se;
+	struct itimerspec its1;
+	int err;
+
+	/* Set up timer: */
+	memset(&se, 0, sizeof(se));
+	se.sigev_notify = SIGEV_SIGNAL;
+	se.sigev_signo = SIGRTMAX;
+	se.sigev_value.sival_int = 0;
+
+	max_latency_ns = 0;
+	alarmcount = 0;
+	timer_fired_early = 0;
+
+	err = timer_create(clock_id, &se, tm1);
+	if (err) {
+		if ((clock_id == CLOCK_REALTIME_ALARM) ||
+		    (clock_id == CLOCK_BOOTTIME_ALARM)) {
+			printf("%-22s %s missing CAP_WAKE_ALARM? : [UNSUPPORTED]\n",
+			       clockstring(clock_id),
+			       flags ? "ABSTIME":"RELTIME");
+			/* Indicate timer isn't set, so caller doesn't wait */
+			return 1;
+		}
+		printf("%s - timer_create() failed\n", clockstring(clock_id));
+		return -1;
+	}
+
+	clock_gettime(clock_id, &start_time);
+	if (flags) {
+		/* absolute expiry: now + TIMER_SECS on the timer's own clock */
+		its1.it_value = start_time;
+		its1.it_value.tv_sec += TIMER_SECS;
+	} else {
+		its1.it_value.tv_sec = TIMER_SECS;
+		its1.it_value.tv_nsec = 0;
+	}
+	its1.it_interval.tv_sec = interval;
+	its1.it_interval.tv_nsec = 0;
+
+	/* the previous setting is not needed, so no old-value buffer is passed */
+	err = timer_settime(*tm1, flags, &its1, NULL);
+	if (err) {
+		printf("%s - timer_settime() failed\n", clockstring(clock_id));
+		/* do not leak the timer that was just created */
+		timer_delete(*tm1);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Print two result lines for the timer just run: whether it ever fired
+ * early and whether its worst latency stayed below UNRESONABLE_LATENCY.
+ * Returns 0 when both checks pass and -1 otherwise.
+ */
+int check_timer_latency(int flags, int interval)
+{
+	int early, slow;
+
+	describe_timer(flags, interval);
+	printf("timer fired early: %7d : ", timer_fired_early);
+	early = timer_fired_early != 0;
+	printf("%s\n", early ? "[FAILED]" : "[OK]");
+
+	describe_timer(flags, interval);
+	printf("max latency: %10lld ns : ", max_latency_ns);
+	slow = !(max_latency_ns < UNRESONABLE_LATENCY);
+	printf("%s\n", slow ? "[FAILED]" : "[OK]");
+
+	return (early || slow) ? -1 : 0;
+}
+
+/*
+ * Print the number of expiries seen and check that it is exactly one.
+ * Returns 0 on success and -1 otherwise.
+ */
+int check_alarmcount(int flags, int interval)
+{
+	int verdict;
+
+	describe_timer(flags, interval);
+	printf("count: %19d : ", alarmcount);
+
+	verdict = (alarmcount == 1) ? 0 : -1;
+	printf("%s\n", verdict ? "[FAILED]" : "[OK]");
+	return verdict;
+}
+
+/*
+ * Run a periodic timer on clock_id until five expiries have been counted,
+ * then delete it and report its latency results.
+ */
+int do_timer(int clock_id, int flags)
+{
+	const int interval = TIMER_SECS;
+	timer_t timer;
+	int rc;
+
+	rc = setup_timer(clock_id, flags, interval, &timer);
+	/* Unsupported case - return 0 to not fail the test */
+	if (rc == 1)
+		return 0;
+	if (rc)
+		return rc;
+
+	for (;;) {
+		if (alarmcount >= 5)
+			break;
+		sleep(1);
+	}
+
+	timer_delete(timer);
+	return check_timer_latency(flags, interval);
+}
+
+/*
+ * Run a one-shot timer on clock_id, wait with a 5 second select() timeout,
+ * then delete the timer and check both its latency and that it fired
+ * exactly once.  Returns 0 on success (or when setup_timer() reports the
+ * clock as unsupported), non-zero otherwise.
+ */
+int do_timer_oneshot(int clock_id, int flags)
+{
+ timer_t tm1;
+ const int interval = 0;
+ struct timeval timeout;
+ int err;
+
+ err = setup_timer(clock_id, flags, interval, &tm1);
+ /* Unsupported case - return 0 to not fail the test */
+ if (err)
+ return err == 1 ? 0 : err;
+
+ memset(&timeout, 0, sizeof(timeout));
+ timeout.tv_sec = 5;
+ /* retry when select() is interrupted by a signal (EINTR) */
+ do {
+ err = select(0, NULL, NULL, NULL, &timeout);
+ } while (err == -1 && errno == EINTR);
+
+ timer_delete(tm1);
+ err = check_timer_latency(flags, interval);
+ err |= check_alarmcount(flags, interval);
+ return err;
+}
+
+/*
+ * Install the SIGRTMAX handler, then for every clock id that is not
+ * explicitly excluded below run the periodic and one-shot timer tests in
+ * both absolute and relative mode.  Any failing sub-test fails the run.
+ */
+int main(void)
+{
+	struct sigaction act;
+	int signum = SIGRTMAX;
+	int ret = 0;
+
+	/* Set up signal handler: */
+	sigfillset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = sigalarm;
+	sigaction(signum, &act, NULL);
+
+	printf("Setting timers for every %i seconds\n", TIMER_SECS);
+	for (clock_id = 0; clock_id < NR_CLOCKIDS; clock_id++) {
+
+		/* clock ids this test does not exercise */
+		switch (clock_id) {
+		case CLOCK_PROCESS_CPUTIME_ID:
+		case CLOCK_THREAD_CPUTIME_ID:
+		case CLOCK_MONOTONIC_RAW:
+		case CLOCK_REALTIME_COARSE:
+		case CLOCK_MONOTONIC_COARSE:
+		case CLOCK_HWSPECIFIC:
+			continue;
+		default:
+			break;
+		}
+
+		ret |= do_timer(clock_id, TIMER_ABSTIME);
+		ret |= do_timer(clock_id, 0);
+		ret |= do_timer_oneshot(clock_id, TIMER_ABSTIME);
+		ret |= do_timer_oneshot(clock_id, 0);
+	}
+
+	if (!ret)
+		return ksft_exit_pass();
+	return ksft_exit_fail();
+}
diff --git a/tools/testing/selftests/timers/set-tz.c b/tools/testing/selftests/timers/set-tz.c
new file mode 100644
index 000000000..62bd33eb1
--- /dev/null
+++ b/tools/testing/selftests/timers/set-tz.c
@@ -0,0 +1,110 @@
+/* Set tz value
+ * by: John Stultz <john.stultz@linaro.org>
+ * (C) Copyright Linaro 2016
+ * Licensed under the GPLv2
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+/*
+ * Set the kernel timezone fields through settimeofday() without touching
+ * the time itself; returns the settimeofday() result.
+ */
+int set_tz(int min, int dst)
+{
+	struct timezone zone = {
+		.tz_minuteswest = min,
+		.tz_dsttime = dst,
+	};
+
+	return settimeofday(0, &zone);
+}
+
+/* Read back tz_minuteswest as reported by gettimeofday(). */
+int get_tz_min(void)
+{
+	struct timeval ignored;
+	struct timezone zone;
+
+	memset(&zone, 0, sizeof(zone));
+	gettimeofday(&ignored, &zone);
+
+	return zone.tz_minuteswest;
+}
+
+/* Read back tz_dsttime as reported by gettimeofday(). */
+int get_tz_dst(void)
+{
+	struct timeval ignored;
+	struct timezone zone;
+
+	memset(&zone, 0, sizeof(zone));
+	gettimeofday(&ignored, &zone);
+
+	return zone.tz_dsttime;
+}
+
+/*
+ * Check that tz_minuteswest values from -15h up to (but not including)
+ * +15h in 30 minute steps can be set and read back, and that four
+ * out-of-range values are rejected.  The timezone found at startup is
+ * restored on both the pass and the fail path.
+ */
+int main(int argc, char **argv)
+{
+	static const int bad_offsets[] = { -15*60-1, 15*60+1, -24*60, 24*60 };
+	int saved_min, saved_dst;
+	int west, got;
+	unsigned int k;
+	int failed = 1;
+
+	saved_min = get_tz_min();
+	saved_dst = get_tz_dst();
+	printf("tz_minuteswest started at %i, dst at %i\n", saved_min, saved_dst);
+
+	printf("Checking tz_minuteswest can be properly set: ");
+	fflush(stdout);
+	for (west = -15*60; west < 15*60; west += 30) {
+		set_tz(west, saved_dst);
+		got = get_tz_min();
+		if (got != west) {
+			printf("[FAILED] expected: %i got %i\n", west, got);
+			goto restore;
+		}
+	}
+	printf("[OK]\n");
+
+	printf("Checking invalid tz_minuteswest values are caught: ");
+	fflush(stdout);
+
+	/* set_tz() returning 0 means an invalid value was accepted */
+	for (k = 0; k < sizeof(bad_offsets) / sizeof(bad_offsets[0]); k++) {
+		if (!set_tz(bad_offsets[k], saved_dst)) {
+			printf("[FAILED] %i didn't return failure!\n", bad_offsets[k]);
+			goto restore;
+		}
+	}
+
+	printf("[OK]\n");
+	failed = 0;
+
+restore:
+	set_tz(saved_min, saved_dst);
+	if (failed)
+		return ksft_exit_fail();
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/settings b/tools/testing/selftests/timers/settings
new file mode 100644
index 000000000..e7b941753
--- /dev/null
+++ b/tools/testing/selftests/timers/settings
@@ -0,0 +1 @@
+timeout=0
diff --git a/tools/testing/selftests/timers/skew_consistency.c b/tools/testing/selftests/timers/skew_consistency.c
new file mode 100644
index 000000000..8066be9af
--- /dev/null
+++ b/tools/testing/selftests/timers/skew_consistency.c
@@ -0,0 +1,77 @@
+/* ADJ_FREQ Skew consistency test
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2012
+ * Licensed under the GPLv2
+ *
+ * NOTE: This is a meta-test which cranks the ADJ_FREQ knob back
+ * and forth and watches for consistency problems. Thus this test requires
+ * that the inconsistency-check tests be present in the same directory it
+ * is run from.
+ *
+ * To build:
+ * $ gcc skew_consistency.c -o skew_consistency -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/wait.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000LL
+
+/*
+ * Run inconsistency-check in a child process while the parent flips the
+ * clock frequency adjustment between +500ppm and -500ppm every half second.
+ *
+ * The child turns the system() wait status into a real exit code, so a
+ * failing inconsistency-check is seen by the parent (a raw wait status
+ * such as 0x100 would be truncated to exit code 0).  Once the child has
+ * finished, the frequency adjustment is reset to 0.
+ */
+int main(int argv, char **argc)
+{
+	struct timex tx;
+	int ret, ppm;
+	pid_t pid;
+
+
+	printf("Running Asynchronous Frequency Changing Tests...\n");
+	/* keep the child from flushing a second copy of buffered output */
+	fflush(stdout);
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork");
+		return ksft_exit_fail();
+	}
+	if (!pid) {
+		ret = system("./inconsistency-check -c 1 -t 600");
+		/* decode the wait status; anything but a normal exit is a failure */
+		if (ret == -1 || !WIFEXITED(ret))
+			return 1;
+		return WEXITSTATUS(ret);
+	}
+
+	ppm = 500;
+	ret = 0;
+	memset(&tx, 0, sizeof(tx));
+
+	while (pid != waitpid(pid, &ret, WNOHANG)) {
+		ppm = -ppm;
+		tx.modes = ADJ_FREQUENCY;
+		/* freq has a 16-bit binary fraction; multiply, as ppm may be negative */
+		tx.freq = (long)ppm * 65536;
+		adjtimex(&tx);
+		usleep(500000);
+	}
+
+	/* Set things back */
+	tx.modes = ADJ_FREQUENCY;
+	tx.freq = 0;
+	adjtimex(&tx);
+
+
+	if (ret) {
+		printf("[FAILED]\n");
+		return ksft_exit_fail();
+	}
+	printf("[OK]\n");
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/threadtest.c b/tools/testing/selftests/timers/threadtest.c
new file mode 100644
index 000000000..cf3e48919
--- /dev/null
+++ b/tools/testing/selftests/timers/threadtest.c
@@ -0,0 +1,193 @@
+/* threadtest.c
+ * by: john stultz (johnstul@us.ibm.com)
+ * (C) Copyright IBM 2004, 2005, 2006, 2012
+ * Licensed under the GPLv2
+ *
+ * To build:
+ * $ gcc threadtest.c -o threadtest -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "../kselftest.h"
+
+/* serializes shared list access */
+pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
+/* serializes console output */
+pthread_mutex_t print_lock = PTHREAD_MUTEX_INITIALIZER;
+
+
+#define MAX_THREADS 128
+#define LISTSIZE 128
+
+int done = 0;
+
+struct timespec global_list[LISTSIZE];
+int listcount = 0;
+
+
+/*
+ * Verify that the `size` timestamps in `list` never go backwards.
+ *
+ * When an entry is earlier than its predecessor the global `done` flag is
+ * set, and the whole list is printed under print_lock with the offending
+ * pair bracketed by separator lines, followed by "[FAILED]".
+ */
+void checklist(struct timespec *list, int size)
+{
+	int i, j;
+	struct timespec *a, *b;
+
+	/* scan the list */
+	for (i = 0; i < size-1; i++) {
+		a = &list[i];
+		b = &list[i+1];
+
+		/*
+		 * look for any time inconsistencies: b is earlier than a if
+		 * its seconds are smaller, or the seconds tie and its
+		 * nanoseconds are smaller.
+		 */
+		if ((b->tv_sec < a->tv_sec) ||
+		    ((b->tv_sec == a->tv_sec) &&
+		     (b->tv_nsec < a->tv_nsec))) {
+
+			/* flag other threads */
+			done = 1;
+
+			/*serialize printing to avoid junky output*/
+			pthread_mutex_lock(&print_lock);
+
+			/* dump the list */
+			printf("\n");
+			for (j = 0; j < size; j++) {
+				if (j == i)
+					printf("---------------\n");
+				printf("%lld:%ld\n", (long long)list[j].tv_sec,
+				       (long)list[j].tv_nsec);
+				if (j == i+1)
+					printf("---------------\n");
+			}
+			printf("[FAILED]\n");
+
+			pthread_mutex_unlock(&print_lock);
+		}
+	}
+}
+
+/* The shared thread shares a global list
+ * that each thread fills while holding the lock.
+ * This stresses clock synchronization across cpus.
+ *
+ * Runs until the global `done` flag is set; always returns NULL.
+ */
+void *shared_thread(void *arg)
+{
+ while (!done) {
+ /* protect the list */
+ pthread_mutex_lock(&list_lock);
+
+ /* see if we're ready to check the list */
+ if (listcount >= LISTSIZE) {
+ checklist(global_list, LISTSIZE);
+ listcount = 0;
+ }
+ /* append one more timestamp while still holding the lock */
+ clock_gettime(CLOCK_MONOTONIC, &global_list[listcount++]);
+
+ pthread_mutex_unlock(&list_lock);
+ }
+ return NULL;
+}
+
+
+/* Thread body for -i mode: every thread repeatedly fills a private list
+ * with CLOCK_MONOTONIC readings and checks it, so no list lock is taken.
+ * This stresses clock_gettime() lock contention.
+ */
+void *independent_thread(void *arg)
+{
+	struct timespec samples[LISTSIZE];
+	struct timespec *slot;
+
+	for (;;) {
+		if (done)
+			break;
+
+		/* fill the list */
+		for (slot = samples; slot < samples + LISTSIZE; slot++)
+			clock_gettime(CLOCK_MONOTONIC, slot);
+
+		checklist(samples, LISTSIZE);
+	}
+	return NULL;
+}
+
+#define DEFAULT_THREAD_COUNT 8
+#define DEFAULT_RUNTIME 30
+
+/*
+ * Spawn worker threads that hammer clock_gettime(CLOCK_MONOTONIC) and fail
+ * if any of them sees time go backwards within the run time.
+ *
+ * Options: -t <secs> run time (default DEFAULT_RUNTIME), -n <numthreads>
+ * (default DEFAULT_THREAD_COUNT, capped at MAX_THREADS), -i use independent
+ * per-thread lists instead of the shared list.
+ *
+ * Only threads that were actually created are joined; a pthread_create()
+ * failure stops the run and fails the test.
+ */
+int main(int argc, char **argv)
+{
+	int thread_count, i;
+	int created = 0;
+	time_t start, now, runtime;
+	char buf[255];
+	pthread_t pth[MAX_THREADS];
+	int opt;
+	void *tret;
+	int ret = 0;
+	void *(*thread)(void *) = shared_thread;
+
+	thread_count = DEFAULT_THREAD_COUNT;
+	runtime = DEFAULT_RUNTIME;
+
+	/* Process arguments */
+	while ((opt = getopt(argc, argv, "t:n:i")) != -1) {
+		switch (opt) {
+		case 't':
+			runtime = atoi(optarg);
+			break;
+		case 'n':
+			thread_count = atoi(optarg);
+			break;
+		case 'i':
+			thread = independent_thread;
+			printf("using independent threads\n");
+			break;
+		default:
+			printf("Usage: %s [-t <secs>] [-n <numthreads>] [-i]\n", argv[0]);
+			printf("	-t: time to run\n");
+			printf("	-n: number of threads\n");
+			printf("	-i: use independent threads\n");
+			return -1;
+		}
+	}
+
+	if (thread_count > MAX_THREADS)
+		thread_count = MAX_THREADS;
+
+
+	setbuf(stdout, NULL);
+
+	start = time(0);
+	strftime(buf, 255, "%a, %d %b %Y %T %z", localtime(&start));
+	printf("%s\n", buf);
+	printf("Testing consistency with %i threads for %ld seconds: ", thread_count, (long)runtime);
+	fflush(stdout);
+
+	/* spawn */
+	for (i = 0; i < thread_count; i++) {
+		if (pthread_create(&pth[i], 0, thread, 0)) {
+			printf("[FAILED] could not create thread %i\n", i);
+			/* stop the threads already running and bail out */
+			done = 1;
+			ret = 1;
+			goto out;
+		}
+		created++;
+	}
+
+	while (time(&now) < start + runtime) {
+		sleep(1);
+		/* a worker sets done when it finds an inconsistency */
+		if (done) {
+			ret = 1;
+			strftime(buf, 255, "%a, %d %b %Y %T %z", localtime(&now));
+			printf("%s\n", buf);
+			goto out;
+		}
+	}
+	printf("[OK]\n");
+	done = 1;
+
+out:
+	/* wait, but only for threads that were really started */
+	for (i = 0; i < created; i++)
+		pthread_join(pth[i], &tret);
+
+	/* die */
+	if (ret)
+		return ksft_exit_fail();
+	return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/timers/valid-adjtimex.c b/tools/testing/selftests/timers/valid-adjtimex.c
new file mode 100644
index 000000000..48b9a8032
--- /dev/null
+++ b/tools/testing/selftests/timers/valid-adjtimex.c
@@ -0,0 +1,330 @@
+/* valid adjtimex test
+ * by: John Stultz <john.stultz@linaro.org>
+ * (C) Copyright Linaro 2015
+ * Licensed under the GPLv2
+ *
+ * This test validates adjtimex interface with valid
+ * and invalid test data.
+ *
+ * Usage: valid-adjtimex
+ *
+ * To build:
+ * $ gcc valid-adjtimex.c -o valid-adjtimex -lrt
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/timex.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include "../kselftest.h"
+
+#define NSEC_PER_SEC 1000000000LL
+#define USEC_PER_SEC 1000000LL
+
+#define ADJ_SETOFFSET 0x0100
+
+#include <sys/syscall.h>
+int clock_adjtime(clockid_t id, struct timex *tx)
+{
+ return syscall(__NR_clock_adjtime, id, tx);
+}
+
+
+/* clear NTP time_status & time_state */
+int clear_time_state(void)
+{
+ struct timex tx;
+ int ret;
+
+ tx.modes = ADJ_STATUS;
+ tx.status = 0;
+ ret = adjtimex(&tx);
+ return ret;
+}
+
+#define NUM_FREQ_VALID 32
+#define NUM_FREQ_OUTOFRANGE 4
+#define NUM_FREQ_INVALID 2
+
+long valid_freq[NUM_FREQ_VALID] = {	/* frequencies validate_freq() requires adjtimex() to accept */
+	-(499<<16),	/* negate after shifting: left-shifting a negative int is undefined */
+	-(450<<16),
+	-(400<<16),
+	-(350<<16),
+	-(300<<16),
+	-(250<<16),
+	-(200<<16),
+	-(150<<16),
+	-(100<<16),
+	-(75<<16),
+	-(50<<16),
+	-(25<<16),
+	-(10<<16),
+	-(5<<16),
+	-(1<<16),
+	-1000,
+	1<<16,
+	5<<16,
+	10<<16,
+	25<<16,
+	50<<16,
+	75<<16,
+	100<<16,
+	150<<16,
+	200<<16,
+	250<<16,
+	300<<16,
+	350<<16,
+	400<<16,
+	450<<16,
+	499<<16,
+};	/* slots beyond the listed initializers are implicitly 0 */
+
+long outofrange_freq[NUM_FREQ_OUTOFRANGE] = {	/* validate_freq() fails if one of these is read back unchanged */
+	-(1000<<16),	/* shift first, then negate: shifting a negative int is undefined */
+	-(550<<16),
+	550<<16,
+	1000<<16,
+};
+
+#define LONG_MAX (~0UL>>1)
+#define LONG_MIN (-LONG_MAX - 1)
+
+long invalid_freq[NUM_FREQ_INVALID] = {
+ LONG_MAX,
+ LONG_MIN,
+};
+
+int validate_freq(void)
+{
+ struct timex tx;
+ int ret, pass = 0;
+ int i;
+
+ clear_time_state();
+
+ memset(&tx, 0, sizeof(struct timex));
+ /* tx starts zeroed; each iteration below sets modes and freq itself */
+
+ printf("Testing ADJ_FREQ... ");
+ fflush(stdout);
+ for (i = 0; i < NUM_FREQ_VALID; i++) {
+ tx.modes = ADJ_FREQUENCY;
+ tx.freq = valid_freq[i];
+
+ ret = adjtimex(&tx);
+ if (ret < 0) {
+ printf("[FAIL]\n");
+ printf("Error: adjtimex(ADJ_FREQ, %ld - %ld ppm\n",
+ valid_freq[i], valid_freq[i]>>16);
+ pass = -1;
+ goto out;
+ }
+ tx.modes = 0;
+ ret = adjtimex(&tx);
+ if (tx.freq != valid_freq[i]) {
+ printf("Warning: freq value %ld not what we set it (%ld)!\n",
+ tx.freq, valid_freq[i]);
+ }
+ }
+ for (i = 0; i < NUM_FREQ_OUTOFRANGE; i++) {
+ tx.modes = ADJ_FREQUENCY;
+ tx.freq = outofrange_freq[i];
+
+ ret = adjtimex(&tx);
+ if (ret < 0) {
+ printf("[FAIL]\n");
+ printf("Error: adjtimex(ADJ_FREQ, %ld - %ld ppm\n",
+ outofrange_freq[i], outofrange_freq[i]>>16);
+ pass = -1;
+ goto out;
+ }
+ tx.modes = 0;
+ ret = adjtimex(&tx);
+ if (tx.freq == outofrange_freq[i]) {
+ printf("[FAIL]\n");
+ printf("ERROR: out of range value %ld actually set!\n",
+ tx.freq);
+ pass = -1;
+ goto out;
+ }
+ }
+
+
+ if (sizeof(long) == 8) { /* this case only applies to 64bit systems */
+ for (i = 0; i < NUM_FREQ_INVALID; i++) {
+ tx.modes = ADJ_FREQUENCY;
+ tx.freq = invalid_freq[i];
+ ret = adjtimex(&tx);
+ if (ret >= 0) {
+ printf("[FAIL]\n");
+ printf("Error: No failure on invalid ADJ_FREQUENCY %ld\n",
+ invalid_freq[i]);
+ pass = -1;
+ goto out;
+ }
+ }
+ }
+
+ printf("[OK]\n");
+out:
+ /* reset freq to zero */
+ tx.modes = ADJ_FREQUENCY;
+ tx.freq = 0;
+ ret = adjtimex(&tx);
+
+ return pass;
+}
+
+
+int set_offset(long long offset, int use_nano)
+{
+ struct timex tmx = {};
+ int ret;
+
+ tmx.modes = ADJ_SETOFFSET;
+ if (use_nano) {
+ tmx.modes |= ADJ_NANO;
+
+ tmx.time.tv_sec = offset / NSEC_PER_SEC;
+ tmx.time.tv_usec = offset % NSEC_PER_SEC;
+
+ if (offset < 0 && tmx.time.tv_usec) {
+ tmx.time.tv_sec -= 1;
+ tmx.time.tv_usec += NSEC_PER_SEC;
+ }
+ } else {
+ tmx.time.tv_sec = offset / USEC_PER_SEC;
+ tmx.time.tv_usec = offset % USEC_PER_SEC;
+
+ if (offset < 0 && tmx.time.tv_usec) {
+ tmx.time.tv_sec -= 1;
+ tmx.time.tv_usec += USEC_PER_SEC;
+ }
+ }
+
+ ret = clock_adjtime(CLOCK_REALTIME, &tmx);
+ if (ret < 0) {
+ printf("(sec: %ld usec: %ld) ", tmx.time.tv_sec, tmx.time.tv_usec);
+ printf("[FAIL]\n");
+ return -1;
+ }
+ return 0;
+}
+
+int set_bad_offset(long sec, long usec, int use_nano)
+{
+ struct timex tmx = {};
+ int ret;
+
+ tmx.modes = ADJ_SETOFFSET;
+ if (use_nano)
+ tmx.modes |= ADJ_NANO;
+
+ tmx.time.tv_sec = sec;
+ tmx.time.tv_usec = usec;
+ ret = clock_adjtime(CLOCK_REALTIME, &tmx);
+ if (ret >= 0) {
+ printf("Invalid (sec: %ld usec: %ld) did not fail! ", tmx.time.tv_sec, tmx.time.tv_usec);
+ printf("[FAIL]\n");
+ return -1;
+ }
+ return 0;
+}
+
+int validate_set_offset(void)
+{
+ printf("Testing ADJ_SETOFFSET... ");
+ fflush(stdout);
+
+ /* Test valid values */
+ if (set_offset(NSEC_PER_SEC - 1, 1))
+ return -1;
+
+ if (set_offset(-NSEC_PER_SEC + 1, 1))
+ return -1;
+
+ if (set_offset(-NSEC_PER_SEC - 1, 1))
+ return -1;
+
+ if (set_offset(5 * NSEC_PER_SEC, 1))
+ return -1;
+
+ if (set_offset(-5 * NSEC_PER_SEC, 1))
+ return -1;
+
+ if (set_offset(5 * NSEC_PER_SEC + NSEC_PER_SEC / 2, 1))
+ return -1;
+
+ if (set_offset(-5 * NSEC_PER_SEC - NSEC_PER_SEC / 2, 1))
+ return -1;
+
+ if (set_offset(USEC_PER_SEC - 1, 0))
+ return -1;
+
+ if (set_offset(-USEC_PER_SEC + 1, 0))
+ return -1;
+
+ if (set_offset(-USEC_PER_SEC - 1, 0))
+ return -1;
+
+ if (set_offset(5 * USEC_PER_SEC, 0))
+ return -1;
+
+ if (set_offset(-5 * USEC_PER_SEC, 0))
+ return -1;
+
+ if (set_offset(5 * USEC_PER_SEC + USEC_PER_SEC / 2, 0))
+ return -1;
+
+ if (set_offset(-5 * USEC_PER_SEC - USEC_PER_SEC / 2, 0))
+ return -1;
+
+ /* Test invalid values */
+ if (set_bad_offset(0, -1, 1))
+ return -1;
+ if (set_bad_offset(0, -1, 0))
+ return -1;
+ if (set_bad_offset(0, 2 * NSEC_PER_SEC, 1))
+ return -1;
+ if (set_bad_offset(0, 2 * USEC_PER_SEC, 0))
+ return -1;
+ if (set_bad_offset(0, NSEC_PER_SEC, 1))
+ return -1;
+ if (set_bad_offset(0, USEC_PER_SEC, 0))
+ return -1;
+ if (set_bad_offset(0, -NSEC_PER_SEC, 1))
+ return -1;
+ if (set_bad_offset(0, -USEC_PER_SEC, 0))
+ return -1;
+
+ printf("[OK]\n");
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (validate_freq())
+ return ksft_exit_fail();
+
+ if (validate_set_offset())
+ return ksft_exit_fail();
+
+ return ksft_exit_pass();
+}
diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore
new file mode 100644
index 000000000..b1afaa925
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+/bug-link-o-tmpfile
diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile
new file mode 100644
index 000000000..aa11ccc92
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+CFLAGS += -Wall -O2
+CFLAGS += -D_GNU_SOURCE
+
+TEST_GEN_PROGS :=
+TEST_GEN_PROGS += bug-link-o-tmpfile
+
+include ../lib.mk
diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c
new file mode 100644
index 000000000..b5c3ddb90
--- /dev/null
+++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+/* Test that open(O_TMPFILE), linkat() doesn't screw accounting. */
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/mount.h>
+#include <unistd.h>
+
+int main(void)
+{
+ int fd;
+
+ if (unshare(CLONE_NEWNS) == -1) {
+ if (errno == ENOSYS || errno == EPERM) {
+ fprintf(stderr, "error: unshare, errno %d\n", errno);
+ return 4;
+ }
+ fprintf(stderr, "error: unshare, errno %d\n", errno);
+ return 1;
+ }
+ if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) {
+ fprintf(stderr, "error: mount '/', errno %d\n", errno);
+ return 1;
+ }
+
+ /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */
+ if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) {
+ fprintf(stderr, "error: mount tmpfs, errno %d\n", errno);
+ return 1;
+ }
+
+ fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600);
+ if (fd == -1) {
+ fprintf(stderr, "error: open 1, errno %d\n", errno);
+ return 1;
+ }
+ if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) {
+ fprintf(stderr, "error: linkat, errno %d\n", errno);
+ return 1;
+ }
+ close(fd);
+
+ fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600);
+ if (fd == -1) {
+ fprintf(stderr, "error: open 2, errno %d\n", errno);
+ return 1;
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile
new file mode 100644
index 000000000..1a5db1eb8
--- /dev/null
+++ b/tools/testing/selftests/tpm2/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+include ../lib.mk
+
+TEST_PROGS := test_smoke.sh test_space.sh
+TEST_PROGS_EXTENDED := tpm2.py tpm2_tests.py
diff --git a/tools/testing/selftests/tpm2/test_smoke.sh b/tools/testing/selftests/tpm2/test_smoke.sh
new file mode 100755
index 000000000..3e5ff29ee
--- /dev/null
+++ b/tools/testing/selftests/tpm2/test_smoke.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+# Runs both unittest suites and exits non-zero if either of them failed.
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+rc=0
+[ -e /dev/tpm0 ] || exit "$ksft_skip"
+python3 -m unittest -v tpm2_tests.SmokeTest || rc=$?
+python3 -m unittest -v tpm2_tests.AsyncTest || rc=$?
+exit "$rc"
diff --git a/tools/testing/selftests/tpm2/test_space.sh b/tools/testing/selftests/tpm2/test_space.sh
new file mode 100755
index 000000000..04c47b13f
--- /dev/null
+++ b/tools/testing/selftests/tpm2/test_space.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+[ -e /dev/tpmrm0 ] || exit $ksft_skip
+
+python3 -m unittest -v tpm2_tests.SpaceTest
diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py
new file mode 100644
index 000000000..3e67fdb51
--- /dev/null
+++ b/tools/testing/selftests/tpm2/tpm2.py
@@ -0,0 +1,718 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+import hashlib
+import os
+import socket
+import struct
+import sys
+import unittest
+import fcntl
+import select
+
+TPM2_ST_NO_SESSIONS = 0x8001
+TPM2_ST_SESSIONS = 0x8002
+
+TPM2_CC_FIRST = 0x01FF
+
+TPM2_CC_CREATE_PRIMARY = 0x0131
+TPM2_CC_DICTIONARY_ATTACK_LOCK_RESET = 0x0139
+TPM2_CC_CREATE = 0x0153
+TPM2_CC_LOAD = 0x0157
+TPM2_CC_UNSEAL = 0x015E
+TPM2_CC_FLUSH_CONTEXT = 0x0165
+TPM2_CC_START_AUTH_SESSION = 0x0176
+TPM2_CC_GET_CAPABILITY = 0x017A
+TPM2_CC_GET_RANDOM = 0x017B
+TPM2_CC_PCR_READ = 0x017E
+TPM2_CC_POLICY_PCR = 0x017F
+TPM2_CC_PCR_EXTEND = 0x0182
+TPM2_CC_POLICY_PASSWORD = 0x018C
+TPM2_CC_POLICY_GET_DIGEST = 0x0189
+
+TPM2_SE_POLICY = 0x01
+TPM2_SE_TRIAL = 0x03
+
+TPM2_ALG_RSA = 0x0001
+TPM2_ALG_SHA1 = 0x0004
+TPM2_ALG_AES = 0x0006
+TPM2_ALG_KEYEDHASH = 0x0008
+TPM2_ALG_SHA256 = 0x000B
+TPM2_ALG_NULL = 0x0010
+TPM2_ALG_CBC = 0x0042
+TPM2_ALG_CFB = 0x0043
+
+TPM2_RH_OWNER = 0x40000001
+TPM2_RH_NULL = 0x40000007
+TPM2_RH_LOCKOUT = 0x4000000A
+TPM2_RS_PW = 0x40000009
+
+TPM2_RC_SIZE = 0x01D5
+TPM2_RC_AUTH_FAIL = 0x098E
+TPM2_RC_POLICY_FAIL = 0x099D
+TPM2_RC_COMMAND_CODE = 0x0143
+
+TSS2_RC_LAYER_SHIFT = 16
+TSS2_RESMGR_TPM_RC_LAYER = (11 << TSS2_RC_LAYER_SHIFT)
+
+TPM2_CAP_HANDLES = 0x00000001
+TPM2_CAP_COMMANDS = 0x00000002
+TPM2_CAP_TPM_PROPERTIES = 0x00000006
+
+TPM2_PT_FIXED = 0x100
+TPM2_PT_TOTAL_COMMANDS = TPM2_PT_FIXED + 41
+
+HR_SHIFT = 24
+HR_LOADED_SESSION = 0x02000000
+HR_TRANSIENT = 0x80000000
+
+SHA1_DIGEST_SIZE = 20
+SHA256_DIGEST_SIZE = 32
+
+TPM2_VER0_ERRORS = {
+ 0x000: "TPM_RC_SUCCESS",
+ 0x030: "TPM_RC_BAD_TAG",
+}
+
+TPM2_VER1_ERRORS = {
+ 0x000: "TPM_RC_FAILURE",
+ 0x001: "TPM_RC_FAILURE",
+ 0x003: "TPM_RC_SEQUENCE",
+ 0x00B: "TPM_RC_PRIVATE",
+ 0x019: "TPM_RC_HMAC",
+ 0x020: "TPM_RC_DISABLED",
+ 0x021: "TPM_RC_EXCLUSIVE",
+ 0x024: "TPM_RC_AUTH_TYPE",
+ 0x025: "TPM_RC_AUTH_MISSING",
+ 0x026: "TPM_RC_POLICY",
+ 0x027: "TPM_RC_PCR",
+ 0x028: "TPM_RC_PCR_CHANGED",
+ 0x02D: "TPM_RC_UPGRADE",
+ 0x02E: "TPM_RC_TOO_MANY_CONTEXTS",
+ 0x02F: "TPM_RC_AUTH_UNAVAILABLE",
+ 0x030: "TPM_RC_REBOOT",
+ 0x031: "TPM_RC_UNBALANCED",
+ 0x042: "TPM_RC_COMMAND_SIZE",
+ 0x043: "TPM_RC_COMMAND_CODE",
+ 0x044: "TPM_RC_AUTHSIZE",
+ 0x045: "TPM_RC_AUTH_CONTEXT",
+ 0x046: "TPM_RC_NV_RANGE",
+ 0x047: "TPM_RC_NV_SIZE",
+ 0x048: "TPM_RC_NV_LOCKED",
+ 0x049: "TPM_RC_NV_AUTHORIZATION",
+ 0x04A: "TPM_RC_NV_UNINITIALIZED",
+ 0x04B: "TPM_RC_NV_SPACE",
+ 0x04C: "TPM_RC_NV_DEFINED",
+ 0x050: "TPM_RC_BAD_CONTEXT",
+ 0x051: "TPM_RC_CPHASH",
+ 0x052: "TPM_RC_PARENT",
+ 0x053: "TPM_RC_NEEDS_TEST",
+ 0x054: "TPM_RC_NO_RESULT",
+ 0x055: "TPM_RC_SENSITIVE",
+ 0x07F: "RC_MAX_FM0",
+}
+
+TPM2_FMT1_ERRORS = {
+ 0x001: "TPM_RC_ASYMMETRIC",
+ 0x002: "TPM_RC_ATTRIBUTES",
+ 0x003: "TPM_RC_HASH",
+ 0x004: "TPM_RC_VALUE",
+ 0x005: "TPM_RC_HIERARCHY",
+ 0x007: "TPM_RC_KEY_SIZE",
+ 0x008: "TPM_RC_MGF",
+ 0x009: "TPM_RC_MODE",
+ 0x00A: "TPM_RC_TYPE",
+ 0x00B: "TPM_RC_HANDLE",
+ 0x00C: "TPM_RC_KDF",
+ 0x00D: "TPM_RC_RANGE",
+ 0x00E: "TPM_RC_AUTH_FAIL",
+ 0x00F: "TPM_RC_NONCE",
+ 0x010: "TPM_RC_PP",
+ 0x012: "TPM_RC_SCHEME",
+ 0x015: "TPM_RC_SIZE",
+ 0x016: "TPM_RC_SYMMETRIC",
+ 0x017: "TPM_RC_TAG",
+ 0x018: "TPM_RC_SELECTOR",
+ 0x01A: "TPM_RC_INSUFFICIENT",
+ 0x01B: "TPM_RC_SIGNATURE",
+ 0x01C: "TPM_RC_KEY",
+ 0x01D: "TPM_RC_POLICY_FAIL",
+ 0x01F: "TPM_RC_INTEGRITY",
+ 0x020: "TPM_RC_TICKET",
+ 0x021: "TPM_RC_RESERVED_BITS",
+ 0x022: "TPM_RC_BAD_AUTH",
+ 0x023: "TPM_RC_EXPIRED",
+ 0x024: "TPM_RC_POLICY_CC",
+ 0x025: "TPM_RC_BINDING",
+ 0x026: "TPM_RC_CURVE",
+ 0x027: "TPM_RC_ECC_POINT",
+}
+
+TPM2_WARN_ERRORS = {
+ 0x001: "TPM_RC_CONTEXT_GAP",
+ 0x002: "TPM_RC_OBJECT_MEMORY",
+ 0x003: "TPM_RC_SESSION_MEMORY",
+ 0x004: "TPM_RC_MEMORY",
+ 0x005: "TPM_RC_SESSION_HANDLES",
+ 0x006: "TPM_RC_OBJECT_HANDLES",
+ 0x007: "TPM_RC_LOCALITY",
+ 0x008: "TPM_RC_YIELDED",
+ 0x009: "TPM_RC_CANCELED",
+ 0x00A: "TPM_RC_TESTING",
+ 0x010: "TPM_RC_REFERENCE_H0",
+ 0x011: "TPM_RC_REFERENCE_H1",
+ 0x012: "TPM_RC_REFERENCE_H2",
+ 0x013: "TPM_RC_REFERENCE_H3",
+ 0x014: "TPM_RC_REFERENCE_H4",
+ 0x015: "TPM_RC_REFERENCE_H5",
+ 0x016: "TPM_RC_REFERENCE_H6",
+ 0x018: "TPM_RC_REFERENCE_S0",
+ 0x019: "TPM_RC_REFERENCE_S1",
+ 0x01A: "TPM_RC_REFERENCE_S2",
+ 0x01B: "TPM_RC_REFERENCE_S3",
+ 0x01C: "TPM_RC_REFERENCE_S4",
+ 0x01D: "TPM_RC_REFERENCE_S5",
+ 0x01E: "TPM_RC_REFERENCE_S6",
+ 0x020: "TPM_RC_NV_RATE",
+ 0x021: "TPM_RC_LOCKOUT",
+ 0x022: "TPM_RC_RETRY",
+ 0x023: "TPM_RC_NV_UNAVAILABLE",
+ 0x7F: "TPM_RC_NOT_USED",
+}
+
+RC_VER1 = 0x100
+RC_FMT1 = 0x080
+RC_WARN = 0x900
+
+ALG_DIGEST_SIZE_MAP = {
+ TPM2_ALG_SHA1: SHA1_DIGEST_SIZE,
+ TPM2_ALG_SHA256: SHA256_DIGEST_SIZE,
+}
+
+ALG_HASH_FUNCTION_MAP = {
+ TPM2_ALG_SHA1: hashlib.sha1,
+ TPM2_ALG_SHA256: hashlib.sha256
+}
+
+NAME_ALG_MAP = {
+ "sha1": TPM2_ALG_SHA1,
+ "sha256": TPM2_ALG_SHA256,
+}
+
+
+class UnknownAlgorithmIdError(Exception):
+    """Raised when a numeric algorithm id has no entry in the lookup maps."""
+    def __init__(self, alg):
+        self.alg = alg  # the id that could not be resolved
+    def __str__(self):
+        return '0x%0x' % (self.alg)  # read the attribute; no local 'alg' exists here
+
+
+class UnknownAlgorithmNameError(Exception):
+    """Raised when an algorithm name has no entry in NAME_ALG_MAP."""
+    def __init__(self, name):
+        self.name = name  # the name that could not be resolved
+    def __str__(self):
+        return self.name  # read the attribute; no local 'name' exists here
+
+
+class UnknownPCRBankError(Exception):
+    """Raised by Client.policy_pcr() when no PCR digest could be computed for a bank."""
+    def __init__(self, alg):
+        self.alg = alg  # algorithm id of the bank that was requested
+    def __str__(self):
+        return '0x%0x' % (self.alg)  # read the attribute; no local 'alg' exists here
+
+
+class ProtocolError(Exception):
+ def __init__(self, cc, rc):
+ self.cc = cc
+ self.rc = rc
+
+ if (rc & RC_FMT1) == RC_FMT1:
+ self.name = TPM2_FMT1_ERRORS.get(rc & 0x3f, "TPM_RC_UNKNOWN")
+ elif (rc & RC_WARN) == RC_WARN:
+ self.name = TPM2_WARN_ERRORS.get(rc & 0x7f, "TPM_RC_UNKNOWN")
+ elif (rc & RC_VER1) == RC_VER1:
+ self.name = TPM2_VER1_ERRORS.get(rc & 0x7f, "TPM_RC_UNKNOWN")
+ else:
+ self.name = TPM2_VER0_ERRORS.get(rc & 0x7f, "TPM_RC_UNKNOWN")
+
+ def __str__(self):
+ if self.cc:
+ return '%s: cc=0x%08x, rc=0x%08x' % (self.name, self.cc, self.rc)
+ else:
+ return '%s: rc=0x%08x' % (self.name, self.rc)
+
+
+class AuthCommand(object):
+ """TPMS_AUTH_COMMAND"""
+
+ def __init__(self, session_handle=TPM2_RS_PW, nonce=bytes(),
+ session_attributes=0, hmac=bytes()):
+ self.session_handle = session_handle
+ self.nonce = nonce
+ self.session_attributes = session_attributes
+ self.hmac = hmac
+
+ def __bytes__(self):
+ fmt = '>I H%us B H%us' % (len(self.nonce), len(self.hmac))
+ return struct.pack(fmt, self.session_handle, len(self.nonce),
+ self.nonce, self.session_attributes, len(self.hmac),
+ self.hmac)
+
+ def __len__(self):
+ fmt = '>I H%us B H%us' % (len(self.nonce), len(self.hmac))
+ return struct.calcsize(fmt)
+
+
+class SensitiveCreate(object):
+ """TPMS_SENSITIVE_CREATE"""
+
+ def __init__(self, user_auth=bytes(), data=bytes()):
+ self.user_auth = user_auth
+ self.data = data
+
+ def __bytes__(self):
+ fmt = '>H%us H%us' % (len(self.user_auth), len(self.data))
+ return struct.pack(fmt, len(self.user_auth), self.user_auth,
+ len(self.data), self.data)
+
+ def __len__(self):
+ fmt = '>H%us H%us' % (len(self.user_auth), len(self.data))
+ return struct.calcsize(fmt)
+
+
+class Public(object):
+ """TPMT_PUBLIC"""
+
+ FIXED_TPM = (1 << 1)
+ FIXED_PARENT = (1 << 4)
+ SENSITIVE_DATA_ORIGIN = (1 << 5)
+ USER_WITH_AUTH = (1 << 6)
+ RESTRICTED = (1 << 16)
+ DECRYPT = (1 << 17)
+
+ def __fmt(self):
+ return '>HHIH%us%usH%us' % \
+ (len(self.auth_policy), len(self.parameters), len(self.unique))
+
+ def __init__(self, object_type, name_alg, object_attributes,
+ auth_policy=bytes(), parameters=bytes(),
+ unique=bytes()):
+ self.object_type = object_type
+ self.name_alg = name_alg
+ self.object_attributes = object_attributes
+ self.auth_policy = auth_policy
+ self.parameters = parameters
+ self.unique = unique
+
+ def __bytes__(self):
+ return struct.pack(self.__fmt(),
+ self.object_type,
+ self.name_alg,
+ self.object_attributes,
+ len(self.auth_policy),
+ self.auth_policy,
+ self.parameters,
+ len(self.unique),
+ self.unique)
+
+ def __len__(self):
+ return struct.calcsize(self.__fmt())
+
+
+def get_digest_size(alg):
+ ds = ALG_DIGEST_SIZE_MAP.get(alg)
+ if not ds:
+ raise UnknownAlgorithmIdError(alg)
+ return ds
+
+
+def get_hash_function(alg):
+ f = ALG_HASH_FUNCTION_MAP.get(alg)
+ if not f:
+ raise UnknownAlgorithmIdError(alg)
+ return f
+
+
+def get_algorithm(name):
+ alg = NAME_ALG_MAP.get(name)
+ if not alg:
+ raise UnknownAlgorithmNameError(name)
+ return alg
+
+
+def hex_dump(d):
+    """Return d (bytes, or a str of single-byte characters) as hex pairs, 16 per line."""
+    d = [format(x if isinstance(x, int) else ord(x), '02x') for x in d]  # bytes iterate as ints on Python 3
+    d = [d[i: i + 16] for i in range(0, len(d), 16)]
+    d = [' '.join(x) for x in d]
+    d = os.linesep.join(d)
+    return d
+
+class Client:
+ FLAG_DEBUG = 0x01
+ FLAG_SPACE = 0x02
+ FLAG_NONBLOCK = 0x04
+ TPM_IOC_NEW_SPACE = 0xa200
+
+ def __init__(self, flags = 0):
+ self.flags = flags
+
+ if (self.flags & Client.FLAG_SPACE) == 0:
+ self.tpm = open('/dev/tpm0', 'r+b', buffering=0)
+ else:
+ self.tpm = open('/dev/tpmrm0', 'r+b', buffering=0)
+
+ if (self.flags & Client.FLAG_NONBLOCK):
+ flags = fcntl.fcntl(self.tpm, fcntl.F_GETFL)
+ flags |= os.O_NONBLOCK
+ fcntl.fcntl(self.tpm, fcntl.F_SETFL, flags)
+ self.tpm_poll = select.poll()
+
+    def __del__(self):  # best-effort close of the device file on finalization
+        if getattr(self, 'tpm', None):  # 'tpm' is missing if open() raised in __init__
+            self.tpm.close()
+
+ def close(self):
+ self.tpm.close()
+
+ def send_cmd(self, cmd):
+ self.tpm.write(cmd)
+
+ if (self.flags & Client.FLAG_NONBLOCK):
+ self.tpm_poll.register(self.tpm, select.POLLIN)
+ self.tpm_poll.poll(10000)
+
+ rsp = self.tpm.read()
+
+ if (self.flags & Client.FLAG_NONBLOCK):
+ self.tpm_poll.unregister(self.tpm)
+
+ if (self.flags & Client.FLAG_DEBUG) != 0:
+ sys.stderr.write('cmd' + os.linesep)
+ sys.stderr.write(hex_dump(cmd) + os.linesep)
+ sys.stderr.write('rsp' + os.linesep)
+ sys.stderr.write(hex_dump(rsp) + os.linesep)
+
+ rc = struct.unpack('>I', rsp[6:10])[0]
+ if rc != 0:
+ cc = struct.unpack('>I', cmd[6:10])[0]
+ raise ProtocolError(cc, rc)
+
+ return rsp
+
+    def read_pcr(self, i, bank_alg = TPM2_ALG_SHA1):
+        """Read PCR i of bank_alg; return the response bytes after the digest header, or None if no digest."""
+        pcrsel_len = max((i >> 3) + 1, 3)
+        pcrsel = [0] * pcrsel_len
+        pcrsel[i >> 3] = 1 << (i & 7)
+        # One byte per entry: chr().encode() would emit two bytes for 0x80.
+        pcrsel = bytes(bytearray(pcrsel))
+
+        fmt = '>HII IHB%us' % (pcrsel_len)
+        cmd = struct.pack(fmt,
+                          TPM2_ST_NO_SESSIONS,
+                          struct.calcsize(fmt),
+                          TPM2_CC_PCR_READ,
+                          1,
+                          bank_alg,
+                          pcrsel_len, pcrsel)
+
+        rsp = self.send_cmd(cmd)
+        pcr_update_cnt, pcr_select_cnt = struct.unpack('>II', rsp[10:18])
+        assert pcr_select_cnt == 1
+        rsp = rsp[18:]
+
+        alg2, pcrsel_len2 = struct.unpack('>HB', rsp[:3])
+        assert bank_alg == alg2 and pcrsel_len == pcrsel_len2
+        rsp = rsp[3 + pcrsel_len:]
+
+        digest_cnt = struct.unpack('>I', rsp[:4])[0]
+        if digest_cnt == 0:
+            return None
+        rsp = rsp[6:]  # skip the 4-byte count and 2 more bytes (presumably the digest size)
+        return rsp
+
+ def extend_pcr(self, i, dig, bank_alg = TPM2_ALG_SHA1):
+ ds = get_digest_size(bank_alg)
+ assert(ds == len(dig))
+
+ auth_cmd = AuthCommand()
+
+ fmt = '>HII I I%us IH%us' % (len(auth_cmd), ds)
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_PCR_EXTEND,
+ i,
+ len(auth_cmd),
+ bytes(auth_cmd),
+ 1, bank_alg, dig)
+
+ self.send_cmd(cmd)
+
+ def start_auth_session(self, session_type, name_alg = TPM2_ALG_SHA1):
+ fmt = '>HII IIH16sHBHH'
+ cmd = struct.pack(fmt,
+ TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_START_AUTH_SESSION,
+ TPM2_RH_NULL,
+ TPM2_RH_NULL,
+ 16,
+ ('\0' * 16).encode(),
+ 0,
+ session_type,
+ TPM2_ALG_NULL,
+ name_alg)
+
+ return struct.unpack('>I', self.send_cmd(cmd)[10:14])[0]
+
+ def __calc_pcr_digest(self, pcrs, bank_alg = TPM2_ALG_SHA1,
+ digest_alg = TPM2_ALG_SHA1):
+ x = []
+ f = get_hash_function(digest_alg)
+
+ for i in pcrs:
+ pcr = self.read_pcr(i, bank_alg)
+ if pcr is None:
+ return None
+ x += pcr
+
+ return f(bytearray(x)).digest()
+
+    def policy_pcr(self, handle, pcrs, bank_alg = TPM2_ALG_SHA1,
+                   name_alg = TPM2_ALG_SHA1):
+        """Send TPM2_CC_POLICY_PCR on session handle with a digest computed over pcrs.
+        Raises UnknownPCRBankError when that digest cannot be computed for bank_alg."""
+        ds = get_digest_size(name_alg)
+        dig = self.__calc_pcr_digest(pcrs, bank_alg, name_alg)
+        if not dig:
+            raise UnknownPCRBankError(bank_alg)
+        pcrsel_len = max((max(pcrs) >> 3) + 1, 3)
+        pcrsel = [0] * pcrsel_len
+        for i in pcrs:
+            pcrsel[i >> 3] |= 1 << (i & 7)
+        pcrsel = bytes(bytearray(pcrsel))  # one byte per entry, even for 0x80
+
+        fmt = '>HII IH%usIHB%us' % (ds, pcrsel_len)  # bitmap field sized by pcrsel_len, not fixed at 3
+        cmd = struct.pack(fmt,
+                          TPM2_ST_NO_SESSIONS,
+                          struct.calcsize(fmt),
+                          TPM2_CC_POLICY_PCR,
+                          handle,
+                          len(dig),
+                          bytes(dig),
+                          1,
+                          bank_alg,
+                          pcrsel_len, pcrsel)
+        self.send_cmd(cmd)
+
+ def policy_password(self, handle):
+ fmt = '>HII I'
+ cmd = struct.pack(fmt,
+ TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_POLICY_PASSWORD,
+ handle)
+
+ self.send_cmd(cmd)
+
+ def get_policy_digest(self, handle):
+ fmt = '>HII I'
+ cmd = struct.pack(fmt,
+ TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_POLICY_GET_DIGEST,
+ handle)
+
+ return self.send_cmd(cmd)[12:]
+
+ def flush_context(self, handle):
+ fmt = '>HIII'
+ cmd = struct.pack(fmt,
+ TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_FLUSH_CONTEXT,
+ handle)
+
+ self.send_cmd(cmd)
+
+ def create_root_key(self, auth_value = bytes()):
+ attributes = \
+ Public.FIXED_TPM | \
+ Public.FIXED_PARENT | \
+ Public.SENSITIVE_DATA_ORIGIN | \
+ Public.USER_WITH_AUTH | \
+ Public.RESTRICTED | \
+ Public.DECRYPT
+
+ auth_cmd = AuthCommand()
+ sensitive = SensitiveCreate(user_auth=auth_value)
+
+ public_parms = struct.pack(
+ '>HHHHHI',
+ TPM2_ALG_AES,
+ 128,
+ TPM2_ALG_CFB,
+ TPM2_ALG_NULL,
+ 2048,
+ 0)
+
+ public = Public(
+ object_type=TPM2_ALG_RSA,
+ name_alg=TPM2_ALG_SHA1,
+ object_attributes=attributes,
+ parameters=public_parms)
+
+ fmt = '>HIII I%us H%us H%us HI' % \
+ (len(auth_cmd), len(sensitive), len(public))
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_CREATE_PRIMARY,
+ TPM2_RH_OWNER,
+ len(auth_cmd),
+ bytes(auth_cmd),
+ len(sensitive),
+ bytes(sensitive),
+ len(public),
+ bytes(public),
+ 0, 0)
+
+ return struct.unpack('>I', self.send_cmd(cmd)[10:14])[0]
+
+ def seal(self, parent_key, data, auth_value, policy_dig,
+ name_alg = TPM2_ALG_SHA1):
+ ds = get_digest_size(name_alg)
+ assert(not policy_dig or ds == len(policy_dig))
+
+ attributes = 0
+ if not policy_dig:
+ attributes |= Public.USER_WITH_AUTH
+ policy_dig = bytes()
+
+ auth_cmd = AuthCommand()
+ sensitive = SensitiveCreate(user_auth=auth_value, data=data)
+
+ public = Public(
+ object_type=TPM2_ALG_KEYEDHASH,
+ name_alg=name_alg,
+ object_attributes=attributes,
+ auth_policy=policy_dig,
+ parameters=struct.pack('>H', TPM2_ALG_NULL))
+
+ fmt = '>HIII I%us H%us H%us HI' % \
+ (len(auth_cmd), len(sensitive), len(public))
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_CREATE,
+ parent_key,
+ len(auth_cmd),
+ bytes(auth_cmd),
+ len(sensitive),
+ bytes(sensitive),
+ len(public),
+ bytes(public),
+ 0, 0)
+
+ rsp = self.send_cmd(cmd)
+
+ return rsp[14:]
+
+ def unseal(self, parent_key, blob, auth_value, policy_handle):
+ private_len = struct.unpack('>H', blob[0:2])[0]
+ public_start = private_len + 2
+ public_len = struct.unpack('>H', blob[public_start:public_start + 2])[0]
+ blob = blob[:private_len + public_len + 4]
+
+ auth_cmd = AuthCommand()
+
+ fmt = '>HII I I%us %us' % (len(auth_cmd), len(blob))
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_LOAD,
+ parent_key,
+ len(auth_cmd),
+ bytes(auth_cmd),
+ blob)
+
+ data_handle = struct.unpack('>I', self.send_cmd(cmd)[10:14])[0]
+
+ if policy_handle:
+ auth_cmd = AuthCommand(session_handle=policy_handle, hmac=auth_value)
+ else:
+ auth_cmd = AuthCommand(hmac=auth_value)
+
+ fmt = '>HII I I%us' % (len(auth_cmd))
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_UNSEAL,
+ data_handle,
+ len(auth_cmd),
+ bytes(auth_cmd))
+
+ try:
+ rsp = self.send_cmd(cmd)
+ finally:
+ self.flush_context(data_handle)
+
+ data_len = struct.unpack('>I', rsp[10:14])[0] - 2
+
+ return rsp[16:16 + data_len]
+
+ def reset_da_lock(self):
+ auth_cmd = AuthCommand()
+
+ fmt = '>HII I I%us' % (len(auth_cmd))
+ cmd = struct.pack(
+ fmt,
+ TPM2_ST_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_DICTIONARY_ATTACK_LOCK_RESET,
+ TPM2_RH_LOCKOUT,
+ len(auth_cmd),
+ bytes(auth_cmd))
+
+ self.send_cmd(cmd)
+
+ def __get_cap_cnt(self, cap, pt, cnt):
+ handles = []
+ fmt = '>HII III'
+
+ cmd = struct.pack(fmt,
+ TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ TPM2_CC_GET_CAPABILITY,
+ cap, pt, cnt)
+
+ rsp = self.send_cmd(cmd)[10:]
+ more_data, cap, cnt = struct.unpack('>BII', rsp[:9])
+ rsp = rsp[9:]
+
+ for i in range(0, cnt):
+ handle = struct.unpack('>I', rsp[:4])[0]
+ handles.append(handle)
+ rsp = rsp[4:]
+
+ return handles, more_data
+
+ def get_cap(self, cap, pt):
+ handles = []
+
+ more_data = True
+ while more_data:
+ next_handles, more_data = self.__get_cap_cnt(cap, pt, 1)
+ handles += next_handles
+ pt += 1
+
+ return handles
diff --git a/tools/testing/selftests/tpm2/tpm2_tests.py b/tools/testing/selftests/tpm2/tpm2_tests.py
new file mode 100644
index 000000000..9d7643068
--- /dev/null
+++ b/tools/testing/selftests/tpm2/tpm2_tests.py
@@ -0,0 +1,304 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+
+from argparse import ArgumentParser
+from argparse import FileType
+import os
+import sys
+import tpm2
+from tpm2 import ProtocolError
+import unittest
+import logging
+import struct
+
+class SmokeTest(unittest.TestCase):
+ def setUp(self):
+ self.client = tpm2.Client()
+ self.root_key = self.client.create_root_key()
+
+ def tearDown(self):
+ self.client.flush_context(self.root_key)
+ self.client.close()
+
+ def test_seal_with_auth(self):
+ data = ('X' * 64).encode()
+ auth = ('A' * 15).encode()
+
+ blob = self.client.seal(self.root_key, data, auth, None)
+ result = self.client.unseal(self.root_key, blob, auth, None)
+ self.assertEqual(data, result)
+
+ def test_seal_with_policy(self):
+ handle = self.client.start_auth_session(tpm2.TPM2_SE_TRIAL)
+
+ data = ('X' * 64).encode()
+ auth = ('A' * 15).encode()
+ pcrs = [16]
+
+ try:
+ self.client.policy_pcr(handle, pcrs)
+ self.client.policy_password(handle)
+
+ policy_dig = self.client.get_policy_digest(handle)
+ finally:
+ self.client.flush_context(handle)
+
+ blob = self.client.seal(self.root_key, data, auth, policy_dig)
+
+ handle = self.client.start_auth_session(tpm2.TPM2_SE_POLICY)
+
+ try:
+ self.client.policy_pcr(handle, pcrs)
+ self.client.policy_password(handle)
+
+ result = self.client.unseal(self.root_key, blob, auth, handle)
+ except:
+ self.client.flush_context(handle)
+ raise
+
+ self.assertEqual(data, result)
+
+ def test_unseal_with_wrong_auth(self):
+ data = ('X' * 64).encode()
+ auth = ('A' * 20).encode()
+ rc = 0
+
+ blob = self.client.seal(self.root_key, data, auth, None)
+ try:
+ result = self.client.unseal(self.root_key, blob,
+ auth[:-1] + 'B'.encode(), None)
+ except ProtocolError as e:
+ rc = e.rc
+
+ self.assertEqual(rc, tpm2.TPM2_RC_AUTH_FAIL)
+
+ def test_unseal_with_wrong_policy(self):
+ handle = self.client.start_auth_session(tpm2.TPM2_SE_TRIAL)
+
+ data = ('X' * 64).encode()
+ auth = ('A' * 17).encode()
+ pcrs = [16]
+
+ try:
+ self.client.policy_pcr(handle, pcrs)
+ self.client.policy_password(handle)
+
+ policy_dig = self.client.get_policy_digest(handle)
+ finally:
+ self.client.flush_context(handle)
+
+ blob = self.client.seal(self.root_key, data, auth, policy_dig)
+
+ # Extend first a PCR that is not part of the policy and try to unseal.
+ # This should succeed.
+
+ ds = tpm2.get_digest_size(tpm2.TPM2_ALG_SHA1)
+ self.client.extend_pcr(1, ('X' * ds).encode())
+
+ handle = self.client.start_auth_session(tpm2.TPM2_SE_POLICY)
+
+ try:
+ self.client.policy_pcr(handle, pcrs)
+ self.client.policy_password(handle)
+
+ result = self.client.unseal(self.root_key, blob, auth, handle)
+ except:
+ self.client.flush_context(handle)
+ raise
+
+ self.assertEqual(data, result)
+
+ # Then, extend a PCR that is part of the policy and try to unseal.
+ # This should fail.
+ self.client.extend_pcr(16, ('X' * ds).encode())
+
+ handle = self.client.start_auth_session(tpm2.TPM2_SE_POLICY)
+
+ rc = 0
+
+ try:
+ self.client.policy_pcr(handle, pcrs)
+ self.client.policy_password(handle)
+
+ result = self.client.unseal(self.root_key, blob, auth, handle)
+ except ProtocolError as e:
+ rc = e.rc
+ self.client.flush_context(handle)
+ except:
+ self.client.flush_context(handle)
+ raise
+
+ self.assertEqual(rc, tpm2.TPM2_RC_POLICY_FAIL)
+
+ def test_seal_with_too_long_auth(self):
+ ds = tpm2.get_digest_size(tpm2.TPM2_ALG_SHA1)
+ data = ('X' * 64).encode()
+ auth = ('A' * (ds + 1)).encode()
+
+ rc = 0
+ try:
+ blob = self.client.seal(self.root_key, data, auth, None)
+ except ProtocolError as e:
+ rc = e.rc
+
+ self.assertEqual(rc, tpm2.TPM2_RC_SIZE)
+
+ def test_too_short_cmd(self):
+ rejected = False
+ try:
+ fmt = '>HIII'
+ cmd = struct.pack(fmt,
+ tpm2.TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt) + 1,
+ tpm2.TPM2_CC_FLUSH_CONTEXT,
+ 0xDEADBEEF)
+
+ self.client.send_cmd(cmd)
+ except IOError as e:
+ rejected = True
+ except:
+ pass
+ self.assertEqual(rejected, True)
+
+ def test_read_partial_resp(self):
+ try:
+ fmt = '>HIIH'
+ cmd = struct.pack(fmt,
+ tpm2.TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ tpm2.TPM2_CC_GET_RANDOM,
+ 0x20)
+ self.client.tpm.write(cmd)
+ hdr = self.client.tpm.read(10)
+ sz = struct.unpack('>I', hdr[2:6])[0]
+ rsp = self.client.tpm.read()
+ except:
+ pass
+ self.assertEqual(sz, 10 + 2 + 32)
+ self.assertEqual(len(rsp), 2 + 32)
+
+ def test_read_partial_overwrite(self):
+ try:
+ fmt = '>HIIH'
+ cmd = struct.pack(fmt,
+ tpm2.TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ tpm2.TPM2_CC_GET_RANDOM,
+ 0x20)
+ self.client.tpm.write(cmd)
+ # Read part of the response
+ rsp1 = self.client.tpm.read(15)
+
+ # Send a new cmd
+ self.client.tpm.write(cmd)
+
+ # Read the whole response
+ rsp2 = self.client.tpm.read()
+ except:
+ pass
+ self.assertEqual(len(rsp1), 15)
+ self.assertEqual(len(rsp2), 10 + 2 + 32)
+
+ def test_send_two_cmds(self):
+ rejected = False
+ try:
+ fmt = '>HIIH'
+ cmd = struct.pack(fmt,
+ tpm2.TPM2_ST_NO_SESSIONS,
+ struct.calcsize(fmt),
+ tpm2.TPM2_CC_GET_RANDOM,
+ 0x20)
+ self.client.tpm.write(cmd)
+
+ # expect the second one to raise -EBUSY error
+ self.client.tpm.write(cmd)
+ rsp = self.client.tpm.read()
+
+ except IOError as e:
+ # read the response
+ rsp = self.client.tpm.read()
+ rejected = True
+ pass
+ except:
+ pass
+ self.assertEqual(rejected, True)
+
+class SpaceTest(unittest.TestCase):
+ def setUp(self):
+ logging.basicConfig(filename='SpaceTest.log', level=logging.DEBUG)
+
+ def test_make_two_spaces(self):
+ log = logging.getLogger(__name__)
+ log.debug("test_make_two_spaces")
+
+ space1 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ root1 = space1.create_root_key()
+ space2 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ root2 = space2.create_root_key()
+ root3 = space2.create_root_key()
+
+ log.debug("%08x" % (root1))
+ log.debug("%08x" % (root2))
+ log.debug("%08x" % (root3))
+
+ def test_flush_context(self):
+ log = logging.getLogger(__name__)
+ log.debug("test_flush_context")
+
+ space1 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ root1 = space1.create_root_key()
+ log.debug("%08x" % (root1))
+
+ space1.flush_context(root1)
+
+ def test_get_handles(self):
+ log = logging.getLogger(__name__)
+ log.debug("test_get_handles")
+
+ space1 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ space1.create_root_key()
+ space2 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ space2.create_root_key()
+ space2.create_root_key()
+
+ handles = space2.get_cap(tpm2.TPM2_CAP_HANDLES, tpm2.HR_TRANSIENT)
+
+ self.assertEqual(len(handles), 2)
+
+ log.debug("%08x" % (handles[0]))
+ log.debug("%08x" % (handles[1]))
+
+ def test_invalid_cc(self):
+ log = logging.getLogger(__name__)
+ log.debug(sys._getframe().f_code.co_name)
+
+ TPM2_CC_INVALID = tpm2.TPM2_CC_FIRST - 1
+
+ space1 = tpm2.Client(tpm2.Client.FLAG_SPACE)
+ root1 = space1.create_root_key()
+ log.debug("%08x" % (root1))
+
+ fmt = '>HII'
+ cmd = struct.pack(fmt, tpm2.TPM2_ST_NO_SESSIONS, struct.calcsize(fmt),
+ TPM2_CC_INVALID)
+
+ rc = 0
+ try:
+ space1.send_cmd(cmd)
+ except ProtocolError as e:
+ rc = e.rc
+
+ self.assertEqual(rc, tpm2.TPM2_RC_COMMAND_CODE |
+ tpm2.TSS2_RESMGR_TPM_RC_LAYER)
+
+class AsyncTest(unittest.TestCase):
+ def setUp(self):
+ logging.basicConfig(filename='AsyncTest.log', level=logging.DEBUG)
+
+ def test_async(self):
+ log = logging.getLogger(__name__)
+ log.debug(sys._getframe().f_code.co_name)
+
+ async_client = tpm2.Client(tpm2.Client.FLAG_NONBLOCK)
+ log.debug("Calling get_cap in a NON_BLOCKING mode")
+ async_client.get_cap(tpm2.TPM2_CAP_HANDLES, tpm2.HR_LOADED_SESSION)
+ async_client.close()
diff --git a/tools/testing/selftests/uevent/Makefile b/tools/testing/selftests/uevent/Makefile
new file mode 100644
index 000000000..f7baa9aa2
--- /dev/null
+++ b/tools/testing/selftests/uevent/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+include ../lib.mk
+
+.PHONY: all clean
+
+BINARIES := uevent_filtering
+CFLAGS += -Wl,-no-as-needed -Wall
+
+uevent_filtering: uevent_filtering.c ../kselftest.h ../kselftest_harness.h
+ $(CC) $(CFLAGS) $< -o $@
+
+TEST_PROGS += $(BINARIES)
+EXTRA_CLEAN := $(BINARIES)
+
+all: $(BINARIES)
diff --git a/tools/testing/selftests/uevent/config b/tools/testing/selftests/uevent/config
new file mode 100644
index 000000000..1038f4515
--- /dev/null
+++ b/tools/testing/selftests/uevent/config
@@ -0,0 +1,2 @@
+CONFIG_USER_NS=y
+CONFIG_NET=y
diff --git a/tools/testing/selftests/uevent/uevent_filtering.c b/tools/testing/selftests/uevent/uevent_filtering.c
new file mode 100644
index 000000000..5cebfb356
--- /dev/null
+++ b/tools/testing/selftests/uevent/uevent_filtering.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/netlink.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sched.h>
+#include <sys/eventfd.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+#define __DEV_FULL "/sys/devices/virtual/mem/full/uevent"
+#define __UEVENT_BUFFER_SIZE (2048 * 2)
+#define __UEVENT_HEADER "add@/devices/virtual/mem/full"
+#define __UEVENT_HEADER_LEN sizeof("add@/devices/virtual/mem/full")
+#define __UEVENT_LISTEN_ALL -1
+
+/* read(2) wrapper that restarts the call whenever it fails with EINTR. */
+ssize_t read_nointr(int fd, void *buf, size_t count)
+{
+	ssize_t nread;
+
+	do {
+		nread = read(fd, buf, count);
+	} while (nread < 0 && errno == EINTR);
+
+	return nread;
+}
+
+/* write(2) wrapper that restarts the call whenever it fails with EINTR. */
+ssize_t write_nointr(int fd, const void *buf, size_t count)
+{
+	ssize_t nwritten;
+
+	do {
+		nwritten = write(fd, buf, count);
+	} while (nwritten < 0 && errno == EINTR);
+
+	return nwritten;
+}
+
+/*
+ * Reap @pid, retrying on EINTR.
+ *
+ * Returns 0 only when the child exited normally with status 0; any other
+ * outcome (waitpid() error, death by signal, non-zero exit code) yields -1.
+ */
+int wait_for_pid(pid_t pid)
+{
+	int wstatus;
+	pid_t reaped;
+
+	for (;;) {
+		reaped = waitpid(pid, &wstatus, 0);
+		if (reaped == pid)
+			break;
+
+		/* Only a real waitpid() failure ends the loop early. */
+		if (reaped == -1 && errno != EINTR)
+			return -1;
+	}
+
+	if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0)
+		return 0;
+
+	return -1;
+}
+
+/*
+ * Child-side listener.
+ *
+ * Binds a NETLINK_KOBJECT_UEVENT socket, unshares the namespaces requested in
+ * @post_flags once the socket exists, reports readiness by writing to
+ * @sync_fd and then waits for the "add" uevent of /devices/virtual/mem/full.
+ *
+ * Returns 0 when that uevent is received and @expect_uevent is true.
+ * Returns -1 on any setup or receive failure, and also when the uevent shows
+ * up although @expect_uevent is false. If no matching uevent ever arrives
+ * this function keeps blocking in recvmsg(); the parent ends the child with a
+ * signal in that case.
+ */
+static int uevent_listener(unsigned long post_flags, bool expect_uevent,
+			   int sync_fd)
+{
+	int sk_fd, ret;
+	socklen_t sk_addr_len;
+	int fret = -1, rcv_buf_sz = __UEVENT_BUFFER_SIZE;
+	uint64_t sync_add = 1;
+	struct sockaddr_nl sk_addr = { 0 }, rcv_addr = { 0 };
+	char buf[__UEVENT_BUFFER_SIZE] = { 0 };
+	struct iovec iov = { buf, __UEVENT_BUFFER_SIZE };
+	char control[CMSG_SPACE(sizeof(struct ucred))];
+	struct msghdr hdr = {
+		&rcv_addr, sizeof(rcv_addr), &iov, 1,
+		control, sizeof(control), 0,
+	};
+
+	sk_fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+		       NETLINK_KOBJECT_UEVENT);
+	if (sk_fd < 0) {
+		fprintf(stderr, "%s - Failed to open uevent socket\n", strerror(errno));
+		return -1;
+	}
+
+	ret = setsockopt(sk_fd, SOL_SOCKET, SO_RCVBUF, &rcv_buf_sz,
+			 sizeof(rcv_buf_sz));
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to set socket options\n", strerror(errno));
+		goto on_error;
+	}
+
+	sk_addr.nl_family = AF_NETLINK;
+	sk_addr.nl_groups = __UEVENT_LISTEN_ALL;
+
+	sk_addr_len = sizeof(sk_addr);
+	ret = bind(sk_fd, (struct sockaddr *)&sk_addr, sk_addr_len);
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to bind socket\n", strerror(errno));
+		goto on_error;
+	}
+
+	ret = getsockname(sk_fd, (struct sockaddr *)&sk_addr, &sk_addr_len);
+	if (ret < 0) {
+		fprintf(stderr, "%s - Failed to retrieve socket name\n", strerror(errno));
+		goto on_error;
+	}
+
+	if ((size_t)sk_addr_len != sizeof(sk_addr)) {
+		fprintf(stderr, "Invalid socket address size\n");
+		goto on_error;
+	}
+
+	/* The "post" namespaces are entered only after the socket is bound. */
+	if (post_flags & CLONE_NEWUSER) {
+		ret = unshare(CLONE_NEWUSER);
+		if (ret < 0) {
+			fprintf(stderr,
+				"%s - Failed to unshare user namespace\n",
+				strerror(errno));
+			goto on_error;
+		}
+	}
+
+	if (post_flags & CLONE_NEWNET) {
+		ret = unshare(CLONE_NEWNET);
+		if (ret < 0) {
+			fprintf(stderr,
+				"%s - Failed to unshare network namespace\n",
+				strerror(errno));
+			goto on_error;
+		}
+	}
+
+	/* Tell the parent that the listener is ready. */
+	ret = write_nointr(sync_fd, &sync_add, sizeof(sync_add));
+	close(sync_fd);
+	if (ret != sizeof(sync_add)) {
+		fprintf(stderr, "Failed to synchronize with parent process\n");
+		goto on_error;
+	}
+
+	for (;;) {
+		ssize_t r;
+
+		r = recvmsg(sk_fd, &hdr, 0);
+		if (r <= 0) {
+			/* fret is still -1, so the failure reaches the caller. */
+			fprintf(stderr, "%s - Failed to receive uevent\n", strerror(errno));
+			break;
+		}
+
+		/* ignore libudev messages */
+		if (memcmp(buf, "libudev", 8) == 0)
+			continue;
+
+		/* ignore uevents we didn't trigger */
+		if (memcmp(buf, __UEVENT_HEADER, __UEVENT_HEADER_LEN) != 0)
+			continue;
+
+		/*
+		 * Our uevent arrived: that is success only when the caller
+		 * expected it. Previously the failure was stored in a variable
+		 * that was never returned, so an unexpected uevent was
+		 * reported as success.
+		 */
+		if (expect_uevent)
+			fret = 0;
+		else
+			fprintf(stderr, "Received unexpected uevent:\n");
+
+		if (TH_LOG_ENABLED) {
+			/* If logging is enabled dump the received uevent. */
+			(void)write_nointr(STDERR_FILENO, buf, r);
+			(void)write_nointr(STDERR_FILENO, "\n", 1);
+		}
+
+		break;
+	}
+
+on_error:
+	close(sk_fd);
+
+	return fret;
+}
+
+/*
+ * Write "add\n" to the uevent file of the "full" memory device @times times.
+ *
+ * Returns a negative value on failure: -1 when the file does not exist,
+ * -EINVAL for any other open() error, or the negative result of the failing
+ * write. On success it returns the result of the last write, or 0 when
+ * @times is 0.
+ */
+int trigger_uevent(unsigned int times)
+{
+	/* Initialised so that times == 0 returns a defined value. */
+	int fd, ret = 0;
+	unsigned int i;
+
+	fd = open(__DEV_FULL, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno != ENOENT)
+			return -EINVAL;
+
+		return -1;
+	}
+
+	for (i = 0; i < times; i++) {
+		ret = write_nointr(fd, "add\n", sizeof("add\n") - 1);
+		if (ret < 0) {
+			fprintf(stderr, "Failed to trigger uevent\n");
+			break;
+		}
+	}
+	close(fd);
+
+	return ret;
+}
+
+/*
+ * Ask the kernel to send SIGKILL to this process when its parent dies.
+ *
+ * Returns 0 on success and -1 when the last call made (prctl(), or kill()
+ * in the orphaned case) failed.
+ */
+int set_death_signal(void)
+{
+	int rc = prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
+
+	/*
+	 * A parent pid of 1 is taken to mean that the parent is already gone,
+	 * in which case the process kills itself.
+	 */
+	if (getppid() == 1)
+		rc = kill(getpid(), SIGKILL);
+
+	return rc < 0 ? -1 : 0;
+}
+
+/*
+ * Run one listener scenario.
+ *
+ * A child is forked which unshares the namespaces in @pre_flags, then runs
+ * uevent_listener() with @post_flags. Once the child has signalled readiness
+ * through @sync_fd the parent triggers uevents and waits up to two seconds
+ * for the child to exit (observed as SIGCHLD).
+ *
+ * On timeout the child is terminated: with SIGTERM when no uevent was
+ * expected (the child's SIGTERM handler exits successfully), with SIGUSR1
+ * otherwise.
+ *
+ * Returns 0 when the child exited with status 0, -1 in every other case.
+ */
+static int do_test(unsigned long pre_flags, unsigned long post_flags,
+		   bool expect_uevent, int sync_fd)
+{
+	int ret, fret = -1;
+	uint64_t wait_val;
+	pid_t pid;
+	sigset_t mask;
+	sigset_t orig_mask;
+	struct timespec timeout;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGCHLD);
+
+	/* SIGCHLD must be blocked so that sigtimedwait() can collect it. */
+	ret = sigprocmask(SIG_BLOCK, &mask, &orig_mask);
+	if (ret < 0) {
+		fprintf(stderr, "%s- Failed to block SIGCHLD\n", strerror(errno));
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		fprintf(stderr, "%s - Failed to fork() new process\n", strerror(errno));
+		goto out_restore_mask;
+	}
+
+	if (pid == 0) {
+		/* Make sure that we go away when our parent dies. */
+		ret = set_death_signal();
+		if (ret < 0) {
+			fprintf(stderr, "Failed to set PR_SET_PDEATHSIG to SIGKILL\n");
+			_exit(EXIT_FAILURE);
+		}
+
+		if (pre_flags & CLONE_NEWUSER) {
+			ret = unshare(CLONE_NEWUSER);
+			if (ret < 0) {
+				fprintf(stderr,
+					"%s - Failed to unshare user namespace\n",
+					strerror(errno));
+				_exit(EXIT_FAILURE);
+			}
+		}
+
+		if (pre_flags & CLONE_NEWNET) {
+			ret = unshare(CLONE_NEWNET);
+			if (ret < 0) {
+				fprintf(stderr,
+					"%s - Failed to unshare network namespace\n",
+					strerror(errno));
+				_exit(EXIT_FAILURE);
+			}
+		}
+
+		if (uevent_listener(post_flags, expect_uevent, sync_fd) < 0)
+			_exit(EXIT_FAILURE);
+
+		_exit(EXIT_SUCCESS);
+	}
+
+	/* Wait until the child has its socket bound and namespaces set up. */
+	ret = read_nointr(sync_fd, &wait_val, sizeof(wait_val));
+	if (ret != sizeof(wait_val)) {
+		fprintf(stderr, "Failed to synchronize with child process\n");
+		_exit(EXIT_FAILURE);
+	}
+
+	/* Trigger 10 uevents to account for the case where the kernel might
+	 * drop some.
+	 */
+	ret = trigger_uevent(10);
+	if (ret < 0)
+		fprintf(stderr, "Failed triggering uevents\n");
+
+	/* Wait for 2 seconds before considering this failed. This should be
+	 * plenty of time for the kernel to deliver the uevent even under heavy
+	 * load.
+	 */
+	timeout.tv_sec = 2;
+	timeout.tv_nsec = 0;
+
+again:
+	ret = sigtimedwait(&mask, NULL, &timeout);
+	if (ret < 0) {
+		if (errno == EINTR)
+			goto again;
+
+		if (!expect_uevent)
+			ret = kill(pid, SIGTERM); /* success */
+		else
+			ret = kill(pid, SIGUSR1); /* error */
+		if (ret < 0)
+			goto out_restore_mask;
+	}
+
+	if (wait_for_pid(pid) == 0)
+		fret = 0;
+
+out_restore_mask:
+	/*
+	 * Put the caller's signal mask back. After a timeout the SIGCHLD of the
+	 * child killed above is still pending; if it stayed blocked, the next
+	 * call's sigtimedwait() would return at once and lose its timeout.
+	 */
+	(void)sigprocmask(SIG_SETMASK, &orig_mask, NULL);
+
+	return fret;
+}
+
+/* Exit immediately: successfully for SIGTERM, with failure for anything else. */
+static void signal_handler(int sig)
+{
+	_exit(sig == SIGTERM ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+/*
+ * Runs do_test() for seven combinations of namespaces unshared before
+ * (pre_flags) and after (post_flags) the listening socket is opened. The
+ * test is skipped when not running as root or when __DEV_FULL is missing,
+ * and stops at the first scenario that fails.
+ */
+TEST(uevent_filtering)
+{
+ int ret, sync_fd;
+ struct sigaction act;
+
+ if (geteuid()) {
+ TH_LOG("Uevent filtering tests require root privileges. Skipping test");
+ _exit(KSFT_SKIP);
+ }
+
+ ret = access(__DEV_FULL, F_OK);
+ EXPECT_EQ(0, ret) {
+ if (errno == ENOENT) {
+ TH_LOG(__DEV_FULL " does not exist. Skipping test");
+ _exit(KSFT_SKIP);
+ }
+
+ _exit(KSFT_FAIL);
+ }
+
+ /* Installed before do_test() forks, so the listener children inherit it. */
+ act.sa_handler = signal_handler;
+ act.sa_flags = 0;
+ sigemptyset(&act.sa_mask);
+
+ ret = sigaction(SIGTERM, &act, NULL);
+ ASSERT_EQ(0, ret);
+
+ /* One eventfd is shared by all scenarios for child-to-parent readiness. */
+ sync_fd = eventfd(0, EFD_CLOEXEC);
+ ASSERT_GE(sync_fd, 0);
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in initial network namespace owned by
+ * initial user namespace.
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(0, 0, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in non-initial network namespace
+ * owned by initial user namespace.
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(CLONE_NEWNET, 0, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - unshare user namespace
+ * - Open uevent listening socket in initial network namespace
+ * owned by initial user namespace.
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(CLONE_NEWUSER, 0, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in non-initial network namespace
+ * owned by non-initial user namespace.
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives no uevent
+ */
+ ret = do_test(CLONE_NEWUSER | CLONE_NEWNET, 0, false, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in initial network namespace
+ * owned by initial user namespace.
+ * - unshare network namespace
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(0, CLONE_NEWNET, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in initial network namespace
+ * owned by initial user namespace.
+ * - unshare user namespace
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(0, CLONE_NEWUSER, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+ /*
+ * Setup:
+ * - Open uevent listening socket in initial network namespace
+ * owned by initial user namespace.
+ * - unshare user namespace
+ * - unshare network namespace
+ * - Trigger uevent in initial network namespace owned by initial user
+ * namespace.
+ * Expected Result:
+ * - uevent listening socket receives uevent
+ */
+ ret = do_test(0, CLONE_NEWUSER | CLONE_NEWNET, true, sync_fd);
+ ASSERT_EQ(0, ret) {
+ goto do_cleanup;
+ }
+
+do_cleanup:
+ close(sync_fd);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/user/Makefile b/tools/testing/selftests/user/Makefile
new file mode 100644
index 000000000..640a40f9b
--- /dev/null
+++ b/tools/testing/selftests/user/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Makefile for user memory selftests
+
+# No binaries, but make sure arg-less "make" doesn't trigger "run_tests"
+all:
+
+TEST_PROGS := test_user_copy.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/user/config b/tools/testing/selftests/user/config
new file mode 100644
index 000000000..784ed8416
--- /dev/null
+++ b/tools/testing/selftests/user/config
@@ -0,0 +1 @@
+CONFIG_TEST_USER_COPY=m
diff --git a/tools/testing/selftests/user/test_user_copy.sh b/tools/testing/selftests/user/test_user_copy.sh
new file mode 100755
index 000000000..f9b31a574
--- /dev/null
+++ b/tools/testing/selftests/user/test_user_copy.sh
@@ -0,0 +1,18 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# Loads the test_user_copy kernel module to exercise the copy_to/from_user
+# infrastructure. A successful load counts as a pass; the module is removed
+# again afterwards.
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Dry run first: skip when the module cannot be found at all.
+/sbin/modprobe -q -n test_user_copy || {
+	echo "user: module test_user_copy is not found [SKIP]"
+	exit $ksft_skip
+}
+
+if ! /sbin/modprobe -q test_user_copy; then
+	echo "user_copy: [FAIL]"
+	exit 1
+fi
+
+/sbin/modprobe -q -r test_user_copy
+echo "user_copy: ok"
diff --git a/tools/testing/selftests/vDSO/.gitignore b/tools/testing/selftests/vDSO/.gitignore
new file mode 100644
index 000000000..5eb64d41e
--- /dev/null
+++ b/tools/testing/selftests/vDSO/.gitignore
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+vdso_test
+vdso_test_gettimeofday
+vdso_test_getcpu
+vdso_standalone_test_x86
diff --git a/tools/testing/selftests/vDSO/Makefile b/tools/testing/selftests/vDSO/Makefile
new file mode 100644
index 000000000..0069f2f83
--- /dev/null
+++ b/tools/testing/selftests/vDSO/Makefile
@@ -0,0 +1,27 @@
+# SPDX-License-Identifier: GPL-2.0
+include ../lib.mk
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/x86/ -e s/x86_64/x86/)
+
+TEST_GEN_PROGS := $(OUTPUT)/vdso_test_gettimeofday $(OUTPUT)/vdso_test_getcpu
+ifeq ($(ARCH),x86)
+TEST_GEN_PROGS += $(OUTPUT)/vdso_standalone_test_x86
+endif
+
+ifndef CROSS_COMPILE
+CFLAGS := -std=gnu99
+CFLAGS_vdso_standalone_test_x86 := -nostdlib -fno-asynchronous-unwind-tables -fno-stack-protector
+ifeq ($(CONFIG_X86_32),y)
+LDLIBS += -lgcc_s
+endif
+
+all: $(TEST_GEN_PROGS)
+$(OUTPUT)/vdso_test_gettimeofday: parse_vdso.c vdso_test_gettimeofday.c
+$(OUTPUT)/vdso_test_getcpu: parse_vdso.c vdso_test_getcpu.c
+$(OUTPUT)/vdso_standalone_test_x86: vdso_standalone_test_x86.c parse_vdso.c
+ $(CC) $(CFLAGS) $(CFLAGS_vdso_standalone_test_x86) \
+ vdso_standalone_test_x86.c parse_vdso.c \
+ -o $@
+
+endif
diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c
new file mode 100644
index 000000000..413f75620
--- /dev/null
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -0,0 +1,247 @@
+/*
+ * parse_vdso.c: Linux reference vDSO parser
+ * Written by Andrew Lutomirski, 2011-2014.
+ *
+ * This code is meant to be linked in to various programs that run on Linux.
+ * As such, it is available with as few restrictions as possible. This file
+ * is licensed under the Creative Commons Zero License, version 1.0,
+ * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
+ *
+ * The vDSO is a regular ELF DSO that the kernel maps into user space when
+ * it starts a program. It works equally well in statically and dynamically
+ * linked binaries.
+ *
+ * This code is tested on x86. In principle it should work on any
+ * architecture that has a vDSO.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <limits.h>
+#include <elf.h>
+
+#include "parse_vdso.h"
+
+/* And here's the code. */
+#ifndef ELF_BITS
+# if ULONG_MAX > 0xffffffffUL
+# define ELF_BITS 64
+# else
+# define ELF_BITS 32
+# endif
+#endif
+
+#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x
+#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x)
+#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x)
+
+/*
+ * Parser state: filled in by vdso_init_from_sysinfo_ehdr() and consulted by
+ * vdso_sym() and vdso_match_version().
+ */
+static struct vdso_info
+{
+ /* Set to true only after vdso_init_from_sysinfo_ehdr() found everything. */
+ bool valid;
+
+ /* Load information */
+ uintptr_t load_addr;
+ uintptr_t load_offset; /* load_addr - recorded vaddr */
+
+ /* Symbol table */
+ ELF(Sym) *symtab;
+ const char *symstrings;
+ /* bucket/chain point into the DT_HASH table, just after its two-word header. */
+ ELF(Word) *bucket, *chain;
+ ELF(Word) nbucket, nchain;
+
+ /* Version table */
+ /* versym is cleared when no DT_VERDEF entry was found. */
+ ELF(Versym) *versym;
+ ELF(Verdef) *verdef;
+} vdso_info;
+
+/*
+ * Symbol-name hash from the ELF specification.
+ *
+ * The state is kept in 32 bits: with a 64-bit unsigned long the addition
+ * could carry into bit 32, which the 0xf0000000 mask never clears, and the
+ * result would no longer equal the 32-bit hash. The name is taken as
+ * "const char *" because that is what the callers pass; the bytes are hashed
+ * as unsigned values.
+ */
+static unsigned long elf_hash(const char *name)
+{
+	const unsigned char *p = (const unsigned char *)name;
+	uint32_t h = 0, g;
+
+	while (*p) {
+		h = (h << 4) + *p++;
+		g = h & 0xf0000000;
+		if (g)
+			h ^= g >> 24;
+		h &= ~g;
+	}
+	return h;
+}
+
+/*
+ * Parse the ELF image located at @base and fill in the global vdso_info.
+ *
+ * vdso_info.valid is set to true only at the very end. Every early return
+ * (wrong ELF class, no PT_LOAD or PT_DYNAMIC segment, missing DT_STRTAB,
+ * DT_SYMTAB or DT_HASH) leaves it false.
+ */
+void vdso_init_from_sysinfo_ehdr(uintptr_t base)
+{
+ size_t i;
+ bool found_vaddr = false;
+
+ vdso_info.valid = false;
+
+ vdso_info.load_addr = base;
+
+ ELF(Ehdr) *hdr = (ELF(Ehdr)*)base;
+ if (hdr->e_ident[EI_CLASS] !=
+ (ELF_BITS == 32 ? ELFCLASS32 : ELFCLASS64)) {
+ return; /* Wrong ELF class -- check ELF_BITS */
+ }
+
+ ELF(Phdr) *pt = (ELF(Phdr)*)(vdso_info.load_addr + hdr->e_phoff);
+ ELF(Dyn) *dyn = 0;
+
+ /*
+ * We need two things from the segment table: the load offset
+ * and the dynamic table.
+ */
+ for (i = 0; i < hdr->e_phnum; i++)
+ {
+ /* Only the first PT_LOAD segment determines the load offset. */
+ if (pt[i].p_type == PT_LOAD && !found_vaddr) {
+ found_vaddr = true;
+ vdso_info.load_offset = base
+ + (uintptr_t)pt[i].p_offset
+ - (uintptr_t)pt[i].p_vaddr;
+ } else if (pt[i].p_type == PT_DYNAMIC) {
+ dyn = (ELF(Dyn)*)(base + pt[i].p_offset);
+ }
+ }
+
+ if (!found_vaddr || !dyn)
+ return; /* Failed */
+
+ /*
+ * Fish out the useful bits of the dynamic table.
+ */
+ ELF(Word) *hash = 0;
+ vdso_info.symstrings = 0;
+ vdso_info.symtab = 0;
+ vdso_info.versym = 0;
+ vdso_info.verdef = 0;
+ /* Each d_ptr is turned into a usable pointer by adding load_offset. */
+ for (i = 0; dyn[i].d_tag != DT_NULL; i++) {
+ switch (dyn[i].d_tag) {
+ case DT_STRTAB:
+ vdso_info.symstrings = (const char *)
+ ((uintptr_t)dyn[i].d_un.d_ptr
+ + vdso_info.load_offset);
+ break;
+ case DT_SYMTAB:
+ vdso_info.symtab = (ELF(Sym) *)
+ ((uintptr_t)dyn[i].d_un.d_ptr
+ + vdso_info.load_offset);
+ break;
+ case DT_HASH:
+ hash = (ELF(Word) *)
+ ((uintptr_t)dyn[i].d_un.d_ptr
+ + vdso_info.load_offset);
+ break;
+ case DT_VERSYM:
+ vdso_info.versym = (ELF(Versym) *)
+ ((uintptr_t)dyn[i].d_un.d_ptr
+ + vdso_info.load_offset);
+ break;
+ case DT_VERDEF:
+ vdso_info.verdef = (ELF(Verdef) *)
+ ((uintptr_t)dyn[i].d_un.d_ptr
+ + vdso_info.load_offset);
+ break;
+ }
+ }
+ if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
+ return; /* Failed */
+
+ /* Without version definitions the versym table is not used. */
+ if (!vdso_info.verdef)
+ vdso_info.versym = 0;
+
+ /* Parse the hash table header. */
+ vdso_info.nbucket = hash[0];
+ vdso_info.nchain = hash[1];
+ vdso_info.bucket = &hash[2];
+ vdso_info.chain = &hash[vdso_info.nbucket + 2];
+
+ /* That's all we need. */
+ vdso_info.valid = true;
+}
+
+/*
+ * Returns true when version index @ver refers to a non-base version
+ * definition whose vd_hash equals @hash and whose first auxiliary entry
+ * names @name; returns false when no definition with that index exists or
+ * when it does not match.
+ */
+static bool vdso_match_version(ELF(Versym) ver,
+ const char *name, ELF(Word) hash)
+{
+ /*
+ * This is a helper function to check if the version indexed by
+ * ver matches name (which hashes to hash).
+ *
+ * The version definition table is a mess, and I don't know how
+ * to do this in better than linear time without allocating memory
+ * to build an index. I also don't know why the table has
+ * variable size entries in the first place.
+ *
+ * For added fun, I can't find a comprehensible specification of how
+ * to parse all the weird flags in the table.
+ *
+ * So I just parse the whole table every time.
+ */
+
+ /* First step: find the version definition */
+ ver &= 0x7fff; /* Apparently bit 15 means "hidden" */
+ ELF(Verdef) *def = vdso_info.verdef;
+ while(true) {
+ if ((def->vd_flags & VER_FLG_BASE) == 0
+ && (def->vd_ndx & 0x7fff) == ver)
+ break;
+
+ if (def->vd_next == 0)
+ return false; /* No definition. */
+
+ /* vd_next is a byte offset from the current entry. */
+ def = (ELF(Verdef) *)((char *)def + def->vd_next);
+ }
+
+ /* Now figure out whether it matches. */
+ ELF(Verdaux) *aux = (ELF(Verdaux)*)((char *)def + def->vd_aux);
+ return def->vd_hash == hash
+ && !strcmp(name, vdso_info.symstrings + aux->vda_name);
+}
+
+/*
+ * Look up the function symbol @name with version @version in the parsed vDSO.
+ *
+ * Returns load_offset + st_value of the first defined global or weak
+ * function symbol that matches, or 0 when the parser state is not valid or
+ * no such symbol exists. The version is checked only when a versym table is
+ * present.
+ */
+void *vdso_sym(const char *version, const char *name)
+{
+ unsigned long ver_hash;
+ if (!vdso_info.valid)
+ return 0;
+
+ ver_hash = elf_hash(version);
+ ELF(Word) chain = vdso_info.bucket[elf_hash(name) % vdso_info.nbucket];
+
+ /* Walk the hash chain for this bucket; STN_UNDEF terminates it. */
+ for (; chain != STN_UNDEF; chain = vdso_info.chain[chain]) {
+ ELF(Sym) *sym = &vdso_info.symtab[chain];
+
+ /* Check for a defined global or weak function w/ right name. */
+ if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
+ continue;
+ if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL &&
+ ELF64_ST_BIND(sym->st_info) != STB_WEAK)
+ continue;
+ if (sym->st_shndx == SHN_UNDEF)
+ continue;
+ if (strcmp(name, vdso_info.symstrings + sym->st_name))
+ continue;
+
+ /* Check symbol version. */
+ if (vdso_info.versym
+ && !vdso_match_version(vdso_info.versym[chain],
+ version, ver_hash))
+ continue;
+
+ return (void *)(vdso_info.load_offset + sym->st_value);
+ }
+
+ return 0;
+}
+
+/*
+ * Scan the auxiliary vector at @auxv for AT_SYSINFO_EHDR and initialise the
+ * parser from it. When the vector has no such entry the parser state is
+ * marked invalid.
+ */
+void vdso_init_from_auxv(void *auxv)
+{
+	ELF(auxv_t) *entry;
+
+	for (entry = auxv; entry->a_type != AT_NULL; entry++) {
+		if (entry->a_type != AT_SYSINFO_EHDR)
+			continue;
+
+		vdso_init_from_sysinfo_ehdr(entry->a_un.a_val);
+		return;
+	}
+
+	vdso_info.valid = false;
+}
diff --git a/tools/testing/selftests/vDSO/parse_vdso.h b/tools/testing/selftests/vDSO/parse_vdso.h
new file mode 100644
index 000000000..de0453067
--- /dev/null
+++ b/tools/testing/selftests/vDSO/parse_vdso.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef PARSE_VDSO_H
+#define PARSE_VDSO_H
+
+#include <stdint.h>
+
+/*
+ * To use this vDSO parser, first call one of the vdso_init_* functions.
+ * If you've already parsed auxv, then pass the value of AT_SYSINFO_EHDR
+ * to vdso_init_from_sysinfo_ehdr. Otherwise pass auxv to vdso_init_from_auxv.
+ * Then call vdso_sym for each symbol you want. For example, to look up
+ * gettimeofday on x86_64, use:
+ *
+ * <some pointer> = vdso_sym("LINUX_2.6", "gettimeofday");
+ * or
+ * <some pointer> = vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
+ *
+ * vdso_sym will return 0 if the symbol doesn't exist or if the init function
+ * failed or was not called. vdso_sym is a little slow, so its return value
+ * should be cached.
+ *
+ * vdso_sym is threadsafe; the init functions are not.
+ *
+ * These are the prototypes:
+ */
+void *vdso_sym(const char *version, const char *name);
+void vdso_init_from_sysinfo_ehdr(uintptr_t base);
+void vdso_init_from_auxv(void *auxv);
+
+#endif
diff --git a/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
new file mode 100644
index 000000000..8a44ff973
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_standalone_test_x86.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso_standalone_test_x86.c: Sample code to test parse_vdso.c on x86
+ * Copyright (c) 2011-2014 Andy Lutomirski
+ *
+ * You can amuse yourself by compiling with:
+ * gcc -std=gnu99 -nostdlib
+ * -Os -fno-asynchronous-unwind-tables -flto -lgcc_s
+ * vdso_standalone_test_x86.c parse_vdso.c
+ * to generate a small binary. On x86_64, you can omit -lgcc_s
+ * if you want the binary to be completely standalone.
+ */
+
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+
+#include "parse_vdso.h"
+
+/* We need a libc function... */
+/*
+ * Minimal strcmp() replacement: returns 0 for equal strings and 1 otherwise.
+ */
+int strcmp(const char *a, const char *b)
+{
+	/* This implementation is buggy: it never returns -1. */
+	for (; *a || *b; a++, b++) {
+		/* Also covers one string ending before the other. */
+		if (*a != *b)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* ...and two syscalls. This is x86-specific. */
+/*
+ * Issue system call @nr with up to three arguments and return the value the
+ * kernel leaves in the accumulator register.
+ */
+static inline long x86_syscall3(long nr, long a0, long a1, long a2)
+{
+ long ret;
+#ifdef __x86_64__
+ /* 64-bit: number in rax, arguments in rdi, rsi, rdx ("a", "D", "S", "d"). */
+ asm volatile ("syscall" : "=a" (ret) : "a" (nr),
+ "D" (a0), "S" (a1), "d" (a2) :
+ "cc", "memory", "rcx",
+ "r8", "r9", "r10", "r11" );
+#else
+ /* 32-bit: number in eax, arguments in ebx, ecx, edx ("a", "b", "c", "d"). */
+ asm volatile ("int $0x80" : "=a" (ret) : "a" (nr),
+ "b" (a0), "c" (a1), "d" (a2) :
+ "cc", "memory" );
+#endif
+ return ret;
+}
+
+/* write(2) through the raw syscall helper; returns the raw syscall result. */
+static inline long linux_write(int fd, const void *data, size_t len)
+{
+ return x86_syscall3(__NR_write, fd, (long)data, (long)len);
+}
+
+/* exit(2) through the raw syscall helper, with @code as the exit status. */
+static inline void linux_exit(int code)
+{
+ x86_syscall3(__NR_exit, code, 0, 0);
+}
+
+/*
+ * Write the decimal digits of @n right-aligned so that the least significant
+ * digit lands on @lastdig; more significant digits go to lower addresses.
+ * Nothing is written when @n is 0, so the caller's template stays as it is.
+ */
+void to_base10(char *lastdig, time_t n)
+{
+	for (; n; n /= 10)
+		*lastdig-- = (n % 10) + '0';
+}
+
+/*
+ * C entry point, called from _start with a pointer to the initial stack.
+ *
+ * Walks past argc/argv and the environment to reach the auxiliary vector,
+ * initialises the vDSO parser from it, calls the vDSO gettimeofday and
+ * prints the result. Exits with 1 when the symbol cannot be found, with the
+ * call's return value when it fails, and with 0 otherwise.
+ */
+__attribute__((externally_visible)) void c_main(void **stack)
+{
+	/* Parse the stack */
+	long argc = (long)*stack;
+	stack += argc + 2;
+
+	/* Now we're pointing at the environment. Skip it. */
+	while(*stack)
+		stack++;
+	stack++;
+
+	/* Now we're pointing at auxv. Initialize the vDSO parser. */
+	vdso_init_from_auxv((void *)stack);
+
+	/* Find gettimeofday. */
+	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+	gtod_t gtod = (gtod_t)vdso_sym("LINUX_2.6", "__vdso_gettimeofday");
+
+	if (!gtod)
+		linux_exit(1);
+
+	struct timeval tv;
+	long ret = gtod(&tv, 0);
+
+	if (ret == 0) {
+		/*
+		 * Template: "The time is", 21 blanks reserved for the
+		 * seconds, then ".000000\n". The digit positions are derived
+		 * from the end of the buffer rather than hard-coded, so both
+		 * writes always start inside buf.
+		 */
+		char buf[] = "The time is                     .000000\n";
+		char *usec_last = buf + sizeof(buf) - 3;	/* last '0' */
+		char *sec_last = buf + sizeof(buf) - 10;	/* before '.' */
+
+		to_base10(sec_last, tv.tv_sec);
+		to_base10(usec_last, tv.tv_usec);
+		linux_write(1, buf, sizeof(buf) - 1);
+	} else {
+		linux_exit(ret);
+	}
+
+	linux_exit(0);
+}
+
+/*
+ * This is the real entry point. It passes the initial stack into
+ * the C entry point.
+ */
+asm (
+ ".text\n"
+ ".global _start\n"
+ ".type _start,@function\n"
+ "_start:\n\t"
+#ifdef __x86_64__
+ "mov %rsp,%rdi\n\t"
+ "jmp c_main"
+#else
+ "push %esp\n\t"
+ "call c_main\n\t"
+ "int $3"
+#endif
+ );
diff --git a/tools/testing/selftests/vDSO/vdso_test_getcpu.c b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
new file mode 100644
index 000000000..fc25ede13
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_test_getcpu.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso_test_getcpu.c: Sample code to test parse_vdso.c and vDSO getcpu()
+ *
+ * Copyright (c) 2020 Arm Ltd
+ */
+
+#include <stdint.h>
+#include <elf.h>
+#include <stdio.h>
+#include <sys/auxv.h>
+#include <sys/time.h>
+
+#include "../kselftest.h"
+#include "parse_vdso.h"
+
+const char *version = "LINUX_2.6";
+const char *name = "__vdso_getcpu";
+
+struct getcpu_cache;
+typedef long (*getcpu_t)(unsigned int *, unsigned int *,
+ struct getcpu_cache *);
+
+/*
+ * Locate the vDSO getcpu entry point and call it once.
+ *
+ * Returns KSFT_SKIP when there is no AT_SYSINFO_EHDR entry or the symbol is
+ * not found, KSFT_FAIL when the call fails and 0 on success.
+ */
+int main(int argc, char **argv)
+{
+	unsigned long vdso_base = getauxval(AT_SYSINFO_EHDR);
+	unsigned int cpu, node;
+	getcpu_t vdso_getcpu;
+
+	if (!vdso_base) {
+		printf("AT_SYSINFO_EHDR is not present!\n");
+		return KSFT_SKIP;
+	}
+
+	vdso_init_from_sysinfo_ehdr(vdso_base);
+
+	vdso_getcpu = (getcpu_t)vdso_sym(version, name);
+	if (!vdso_getcpu) {
+		printf("Could not find %s\n", name);
+		return KSFT_SKIP;
+	}
+
+	if (vdso_getcpu(&cpu, &node, 0) != 0) {
+		printf("%s failed\n", name);
+		return KSFT_FAIL;
+	}
+
+	printf("Running on CPU %u node %u\n", cpu, node);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
new file mode 100644
index 000000000..8ccc73ed8
--- /dev/null
+++ b/tools/testing/selftests/vDSO/vdso_test_gettimeofday.c
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso_test_gettimeofday.c: Sample code to test parse_vdso.c and
+ * vDSO gettimeofday()
+ * Copyright (c) 2014 Andy Lutomirski
+ *
+ * Compile with:
+ * gcc -std=gnu99 vdso_test_gettimeofday.c parse_vdso.c
+ *
+ * Tested on x86, 32-bit and 64-bit. It may work on other architectures, too.
+ */
+
+#include <stdint.h>
+#include <elf.h>
+#include <stdio.h>
+#include <sys/auxv.h>
+#include <sys/time.h>
+
+#include "../kselftest.h"
+#include "parse_vdso.h"
+
+/*
+ * ARM64's vDSO exports its gettimeofday() implementation with a different
+ * name and version from other architectures, so we need to handle it as
+ * a special case.
+ */
+#if defined(__aarch64__)
+const char *version = "LINUX_2.6.39";
+const char *name = "__kernel_gettimeofday";
+#else
+const char *version = "LINUX_2.6";
+const char *name = "__vdso_gettimeofday";
+#endif
+
+/*
+ * Locate the vDSO gettimeofday entry point and print the time it reports.
+ *
+ * Returns KSFT_SKIP when there is no AT_SYSINFO_EHDR entry or the symbol is
+ * not found, KSFT_FAIL when the call fails and 0 on success.
+ */
+int main(int argc, char **argv)
+{
+	typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+	unsigned long vdso_base = getauxval(AT_SYSINFO_EHDR);
+	struct timeval now;
+	gtod_t vdso_gtod;
+
+	if (!vdso_base) {
+		printf("AT_SYSINFO_EHDR is not present!\n");
+		return KSFT_SKIP;
+	}
+
+	vdso_init_from_sysinfo_ehdr(vdso_base);
+
+	/* Find gettimeofday. */
+	vdso_gtod = (gtod_t)vdso_sym(version, name);
+	if (!vdso_gtod) {
+		printf("Could not find %s\n", name);
+		return KSFT_SKIP;
+	}
+
+	if (vdso_gtod(&now, 0) != 0) {
+		printf("%s failed\n", name);
+		return KSFT_FAIL;
+	}
+
+	printf("The time is %lld.%06lld\n",
+	       (long long)now.tv_sec, (long long)now.tv_usec);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/.gitignore b/tools/testing/selftests/vm/.gitignore
new file mode 100644
index 000000000..849e82263
--- /dev/null
+++ b/tools/testing/selftests/vm/.gitignore
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: GPL-2.0-only
+hugepage-mmap
+hugepage-shm
+khugepaged
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+mlock2-tests
+mremap_dontunmap
+on-fault-limit
+transhuge-stress
+protection_keys
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_benchmark
+va_128TBswitch
+map_fixed_noreplace
+write_to_hugetlbfs
+hmm-tests
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
new file mode 100644
index 000000000..d8479552e
--- /dev/null
+++ b/tools/testing/selftests/vm/Makefile
@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: GPL-2.0
+# Makefile for vm selftests
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/')
+
+# Without this, failed build products remain, with up-to-date timestamps,
+# thus tricking Make (and you!) into believing that All Is Well, in subsequent
+# make invocations:
+.DELETE_ON_ERROR:
+
+# Avoid accidental wrong builds, due to built-in rules working just a little
+# bit too well--but not quite as well as required for our situation here.
+#
+# In other words, "make userfaultfd" is supposed to fail to build at all,
+# because this Makefile only supports either "make" (all), or "make /full/path".
+# However, the built-in rules, if not suppressed, will pick up CFLAGS and the
+# initial LDLIBS (but not the target-specific LDLIBS, because those are only
+# set for the full path target!). This causes it to get pretty far into building
+# things despite using incorrect values such as an *occasionally* incomplete
+# LDLIBS.
+MAKEFLAGS += --no-builtin-rules
+
+CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
+LDLIBS = -lrt
+TEST_GEN_FILES = compaction_test
+TEST_GEN_FILES += gup_benchmark
+TEST_GEN_FILES += hmm-tests
+TEST_GEN_FILES += hugepage-mmap
+TEST_GEN_FILES += hugepage-shm
+TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_populate
+TEST_GEN_FILES += mlock-random-test
+TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mremap_dontunmap
+TEST_GEN_FILES += on-fault-limit
+TEST_GEN_FILES += thuge-gen
+TEST_GEN_FILES += transhuge-stress
+TEST_GEN_FILES += userfaultfd
+TEST_GEN_FILES += khugepaged
+
+ifeq ($(MACHINE),x86_64)
+CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie)
+
+VMTARGETS := protection_keys
+BINARIES_32 := $(VMTARGETS:%=%_32)
+BINARIES_64 := $(VMTARGETS:%=%_64)
+
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+ifeq ($(CAN_BUILD_I386),1)
+TEST_GEN_FILES += $(BINARIES_32)
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+TEST_GEN_FILES += $(BINARIES_64)
+endif
+else
+
+ifneq (,$(findstring $(MACHINE),ppc64))
+TEST_GEN_FILES += protection_keys
+endif
+
+endif
+
+ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64))
+TEST_GEN_FILES += va_128TBswitch
+TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += write_to_hugetlbfs
+endif
+
+TEST_PROGS := run_vmtests
+
+TEST_FILES := test_vmalloc.sh
+
+KSFT_KHDR_INSTALL := 1
+include ../lib.mk
+
+$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread
+
+ifeq ($(MACHINE),x86_64)
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+$(BINARIES_32): CFLAGS += -m32
+$(BINARIES_32): LDLIBS += -lrt -ldl -lm
+$(BINARIES_32): $(OUTPUT)/%_32: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+$(BINARIES_64): CFLAGS += -m64
+$(BINARIES_64): LDLIBS += -lrt -ldl
+$(BINARIES_64): $(OUTPUT)/%_64: %.c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@
+$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+warn_32bit_failure:
+ @echo "Warning: you seem to have a broken 32-bit build" 2>&1; \
+ echo "environment. This will reduce test coverage of 64-bit" 2>&1; \
+ echo "kernels. If you are using a Debian-like distribution," 2>&1; \
+ echo "try:"; 2>&1; \
+ echo ""; \
+ echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \
+ echo ""; \
+ echo "If you are using a Fedora-like distribution, try:"; \
+ echo ""; \
+ echo " yum install glibc-devel.*i686"; \
+ exit 0;
+endif
+endif
+
+$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
+
+$(OUTPUT)/mlock-random-test: LDLIBS += -lcap
diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
new file mode 100644
index 000000000..7536ff2f8
--- /dev/null
+++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh
@@ -0,0 +1,581 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+ echo "This test must be run as root. Skipping..."
+ exit 0
+fi
+
+fault_limit_file=limit_in_bytes
+reservation_limit_file=rsvd.limit_in_bytes
+fault_usage_file=usage_in_bytes
+reservation_usage_file=rsvd.usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+ cgroup2=1
+ fault_limit_file=max
+ reservation_limit_file=rsvd.max
+ fault_usage_file=current
+ reservation_usage_file=rsvd.current
+fi
+
+if [[ $cgroup2 ]]; then
+ cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}')
+ if [[ -z "$cgroup_path" ]]; then
+ cgroup_path=/dev/cgroup/memory
+ mount -t cgroup2 none $cgroup_path
+ do_umount=1
+ fi
+ echo "+hugetlb" >$cgroup_path/cgroup.subtree_control
+else
+ cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
+ if [[ -z "$cgroup_path" ]]; then
+ cgroup_path=/dev/cgroup/memory
+ mount -t cgroup memory,hugetlb $cgroup_path
+ do_umount=1
+ fi
+fi
+export cgroup_path
+
+# Return the system to a clean state: move this shell back to the cgroup
+# root, tear down the /mnt/huge mount, remove the test cgroups and release
+# all huge pages.
+function cleanup() {
+  local procs_file=tasks
+  local cg
+
+  # cgroup v2 lists member processes in cgroup.procs; v1 uses tasks.
+  if [[ $cgroup2 ]]; then
+    procs_file=cgroup.procs
+  fi
+  echo $$ >$cgroup_path/$procs_file
+
+  if [[ -e /mnt/huge ]]; then
+    rm -rf /mnt/huge/*
+    umount /mnt/huge || echo error
+    rmdir /mnt/huge
+  fi
+
+  for cg in hugetlb_cgroup_test hugetlb_cgroup_test1 hugetlb_cgroup_test2; do
+    if [[ -e $cgroup_path/$cg ]]; then
+      rmdir $cgroup_path/$cg
+    fi
+  done
+
+  echo 0 >/proc/sys/vm/nr_hugepages
+  echo CLEANUP DONE
+}
+
+# Abort the whole test run unless two values match.
+#
+# Arguments:
+#   $1 expected  value the test requires
+#   $2 actual    value that was observed
+#   $3 error     message appended to the mismatch report
+#
+# On mismatch the report is printed, cleanup is run and the script exits
+# with status 1.
+function expect_equal() {
+  local expected="$1"
+  local actual="$2"
+  local error="$3"
+
+  if [[ "$expected" != "$actual" ]]; then
+    echo "expected ($expected) != actual ($actual): $error"
+    cleanup
+    exit 1
+  fi
+}
+
+# Print the Hugepagesize value from /proc/meminfo divided by 1024.
+#
+# The value is taken as the second whitespace-separated field of the
+# Hugepagesize line, so it no longer depends on the exact column layout of
+# /proc/meminfo.
+function get_machine_hugepage_size() {
+  local kb
+
+  kb=$(awk 'tolower($1) == "hugepagesize:" { print $2; exit }' /proc/meminfo)
+  echo $((kb / 1024))
+}
+
+MB=$(get_machine_hugepage_size)
+
+# Create a test cgroup and program its hugetlb limits.
+#
+# Arguments:
+#   $1 name               cgroup directory to create under $cgroup_path
+#   $2 cgroup_limit       value written to the fault limit file
+#   $3 reservation_limit  value written to the reservation limit file
+#
+# If the new cgroup exposes cpuset.cpus / cpuset.mems, both are set to 0.
+function setup_cgroup() {
+  local name="$1"
+  local cgroup_limit="$2"
+  local reservation_limit="$3"
+  local dir="$cgroup_path/$name"
+
+  mkdir "$dir"
+
+  echo writing cgroup limit: "$cgroup_limit"
+  echo "$cgroup_limit" >"$dir/hugetlb.${MB}MB.$fault_limit_file"
+
+  echo writing reservation limit: "$reservation_limit"
+  echo "$reservation_limit" >"$dir/hugetlb.${MB}MB.$reservation_limit_file"
+
+  if [ -e "$dir/cpuset.cpus" ]; then
+    echo 0 >"$dir/cpuset.cpus"
+  fi
+  if [ -e "$dir/cpuset.mems" ]; then
+    echo 0 >"$dir/cpuset.mems"
+  fi
+}
+
+# Poll the reservation usage counter of cgroup $1 every 0.5s until it
+# reads 0.
+#
+# The loop also ends if the counter file cannot be read, instead of tripping
+# over a malformed "[" expression.
+function wait_for_hugetlb_memory_to_get_depleted() {
+  local cgroup="$1"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local usage
+
+  # Wait for hugetlbfs memory to get depleted.
+  while usage=$(cat "$path") && [ "$usage" != 0 ]; do
+    echo Waiting for hugetlb memory to get depleted.
+    cat "$path"
+    sleep 0.5
+  done
+}
+
+# Poll the reservation usage counter of cgroup $1 every 0.5s until it
+# equals $2.
+#
+# The loop also ends if the counter file cannot be read, instead of tripping
+# over a malformed "[" expression.
+function wait_for_hugetlb_memory_to_get_reserved() {
+  local cgroup="$1"
+  local size="$2"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
+  local usage
+
+  # Wait for hugetlbfs memory to get reserved.
+  while usage=$(cat "$path") && [ "$usage" != "$size" ]; do
+    echo Waiting for hugetlb memory reservation to reach size $size.
+    cat "$path"
+    sleep 0.5
+  done
+}
+
+# Poll the fault usage counter of cgroup $1 every 0.5s until it equals $2.
+#
+# The loop also ends if the counter file cannot be read, instead of tripping
+# over a malformed "[" expression.
+function wait_for_hugetlb_memory_to_get_written() {
+  local cgroup="$1"
+  local size="$2"
+  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
+  local usage
+
+  # Wait for hugetlbfs memory to get written.
+  while usage=$(cat "$path") && [ "$usage" != "$size" ]; do
+    echo Waiting for hugetlb memory to reach size $size.
+    cat "$path"
+    sleep 0.5
+  done
+}
+
+function write_hugetlbfs_and_get_usage() {
+ local cgroup="$1"
+ local size="$2"
+ local populate="$3"
+ local write="$4"
+ local path="$5"
+ local method="$6"
+ local private="$7"
+ local expect_failure="$8"
+ local reserve="$9"
+
+ # Function return values.
+ reservation_failed=0
+ oom_killed=0
+ hugetlb_difference=0
+ reserved_difference=0
+
+ local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
+ local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file
+
+ local hugetlb_before=$(cat $hugetlb_usage)
+ local reserved_before=$(cat $reserved_usage)
+
+ echo
+ echo Starting:
+ echo hugetlb_usage="$hugetlb_before"
+ echo reserved_usage="$reserved_before"
+ echo expect_failure is "$expect_failure"
+
+ output=$(mktemp)
+ set +e
+ if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] ||
+ [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then
+
+ bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+ "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output &
+
+ local write_result=$?
+ local write_pid=$!
+
+ until grep -q -i "DONE" $output; do
+ echo waiting for DONE signal.
+ if ! ps $write_pid > /dev/null
+ then
+ echo "FAIL: The write died"
+ cleanup
+ exit 1
+ fi
+ sleep 0.5
+ done
+
+ echo ================= write_hugetlb_memory.sh output is:
+ cat $output
+ echo ================= end output.
+
+ if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
+ wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
+ elif [[ "$reserve" != "-n" ]]; then
+ wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+ else
+ # This case doesn't produce visible effects, but we still have
+ # to wait for the async process to start and execute...
+ sleep 0.5
+ fi
+
+ echo write_result is $write_result
+ else
+ bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
+ "$cgroup" "$path" "$method" "$private" "$reserve"
+ local write_result=$?
+
+ if [[ "$reserve" != "-n" ]]; then
+ wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
+ fi
+ fi
+ set -e
+
+ if [[ "$write_result" == 1 ]]; then
+ reservation_failed=1
+ fi
+
+ # On linus/master, the above process gets SIGBUS'd on oomkill, with
+ # return code 135. On earlier kernels, it gets actual oomkill, with return
+ # code 137, so just check for both conditions in case we're testing
+ # against an earlier kernel.
+ if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
+ oom_killed=1
+ fi
+
+ local hugetlb_after=$(cat $hugetlb_usage)
+ local reserved_after=$(cat $reserved_usage)
+
+ echo After write:
+ echo hugetlb_usage="$hugetlb_after"
+ echo reserved_usage="$reserved_after"
+
+ hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
+ reserved_difference=$(($reserved_after - $reserved_before))
+}
+
+# Stop any running write_to_hugetlbfs process, wait for cgroup $1 to drop
+# its reservations, then tear down the /mnt/huge mount.
+#
+# Errors are tolerated while signalling and waiting (set +e); the unmount
+# sequence runs with errexit back on.
+function cleanup_hugetlb_memory() {
+  set +e
+  local cgroup="$1"
+  local writers
+
+  writers=$(pgrep -f write_to_hugetlbfs)
+  if [[ -n "$writers" ]]; then
+    echo killing write_to_hugetlbfs
+    killall -2 write_to_hugetlbfs
+    wait_for_hugetlb_memory_to_get_depleted $cgroup
+  fi
+  set -e
+
+  [[ -e /mnt/huge ]] || return 0
+
+  rm -rf /mnt/huge/*
+  umount /mnt/huge
+  rmdir /mnt/huge
+}
+
+# Run one single-cgroup scenario and verify that all usage drops back to 0.
+#
+# Arguments ($1, $4 and $5 are counts of huge pages, converted to bytes here):
+#   $1  size               $2  populate      $3  write
+#   $4  cgroup_limit       $5  reservation_limit
+#   $6  nr_hugepages       $7  method        $8  private
+#   $9  expect_failure     $10 reserve
+#
+# The globals hugetlb_difference, reserved_difference, reservation_failed
+# and oom_killed are reset here and filled in by
+# write_hugetlbfs_and_get_usage.
+function run_test() {
+  local page_bytes=$((${MB} * 1024 * 1024))
+  local size=$(($1 * page_bytes))
+  local populate="$2"
+  local write="$3"
+  local cgroup_limit=$(($4 * page_bytes))
+  local reservation_limit=$(($5 * page_bytes))
+  local nr_hugepages="$6"
+  local method="$7"
+  local private="$8"
+  local expect_failure="$9"
+  local reserve="${10}"
+  local usage_dir=$cgroup_path/hugetlb_cgroup_test
+
+  # Function return values.
+  hugetlb_difference=0
+  reserved_difference=0
+  reservation_failed=0
+  oom_killed=0
+
+  echo nr hugepages = "$nr_hugepages"
+  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+  setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit"
+
+  mkdir -p /mnt/huge
+  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \
+    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
+    "$reserve"
+
+  cleanup_hugetlb_memory "hugetlb_cgroup_test"
+
+  # After cleanup both counters of the test cgroup must read zero.
+  local final_hugetlb=$(cat $usage_dir/hugetlb.${MB}MB.$fault_usage_file)
+  local final_reservation=$(cat $usage_dir/hugetlb.${MB}MB.$reservation_usage_file)
+
+  echo $hugetlb_difference
+  echo $reserved_difference
+  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
+  expect_equal "0" "$final_reservation" "final reservation is not zero"
+}
+
+function run_multiple_cgroup_test() {
+ local size1="$1"
+ local populate1="$2"
+ local write1="$3"
+ local cgroup_limit1="$4"
+ local reservation_limit1="$5"
+
+ local size2="$6"
+ local populate2="$7"
+ local write2="$8"
+ local cgroup_limit2="$9"
+ local reservation_limit2="${10}"
+
+ local nr_hugepages="${11}"
+ local method="${12}"
+ local private="${13}"
+ local expect_failure="${14}"
+ local reserve="${15}"
+
+ # Function return values.
+ hugetlb_difference1=0
+ reserved_difference1=0
+ reservation_failed1=0
+ oom_killed1=0
+
+ hugetlb_difference2=0
+ reserved_difference2=0
+ reservation_failed2=0
+ oom_killed2=0
+
+ echo nr hugepages = "$nr_hugepages"
+ echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages
+
+ setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
+ setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"
+
+ mkdir -p /mnt/huge
+ mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge
+
+ write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
+ "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
+ "$expect_failure" "$reserve"
+
+ hugetlb_difference1=$hugetlb_difference
+ reserved_difference1=$reserved_difference
+ reservation_failed1=$reservation_failed
+ oom_killed1=$oom_killed
+
+ local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
+ local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
+ local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
+ local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file
+
+ local usage_before_second_write=$(cat $cgroup1_hugetlb_usage)
+ local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage)
+
+ write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
+ "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
+ "$expect_failure" "$reserve"
+
+ hugetlb_difference2=$hugetlb_difference
+ reserved_difference2=$reserved_difference
+ reservation_failed2=$reservation_failed
+ oom_killed2=$oom_killed
+
+ expect_equal "$usage_before_second_write" \
+ "$(cat $cgroup1_hugetlb_usage)" "Usage changed."
+ expect_equal "$reservation_usage_before_second_write" \
+ "$(cat $cgroup1_reservation_usage)" "Reservation usage changed."
+
+ cleanup_hugetlb_memory
+
+ local final_hugetlb=$(cat $cgroup1_hugetlb_usage)
+ local final_reservation=$(cat $cgroup1_reservation_usage)
+
+ expect_equal "0" "$final_hugetlb" \
+ "hugetlbt_cgroup_test1 final hugetlb is not zero"
+ expect_equal "0" "$final_reservation" \
+ "hugetlbt_cgroup_test1 final reservation is not zero"
+
+ local final_hugetlb=$(cat $cgroup2_hugetlb_usage)
+ local final_reservation=$(cat $cgroup2_reservation_usage)
+
+ expect_equal "0" "$final_hugetlb" \
+ "hugetlb_cgroup_test2 final hugetlb is not zero"
+ expect_equal "0" "$final_reservation" \
+ "hugetlb_cgroup_test2 final reservation is not zero"
+}
+
+# Start from a known-clean state before the first scenario.
+cleanup
+
+# Test matrix: every combination of populate x method x private x reserve,
+# minus the unsupported combinations skipped at the top of the loop body.
+for populate in "" "-o"; do
+ for method in 0 1 2; do
+ for private in "" "-r"; do
+ for reserve in "" "-n"; do
+
+ # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
+ if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
+ continue
+ fi
+
+ # Skip populated shmem tests. Doesn't seem to be supported.
+ if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
+ continue
+ fi
+
+ # Method 2 combined with "-n" (no reservation) is skipped as well.
+ if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
+ continue
+ fi
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb=$hugetlb_difference
+ echo Memory charged to reservation=$reserved_difference
+
+ # Without a write, fault usage is only expected when populating ("-o").
+ if [[ "$populate" == "-o" ]]; then
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+ else
+ expect_equal "0" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+ fi
+
+ if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+ else
+ expect_equal "0" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+ fi
+
+ echo 'PASS'
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case with write.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb=$hugetlb_difference
+ echo Memory charged to reservation=$reserved_difference
+
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
+ "Reserved memory charged to hugetlb cgroup."
+
+ expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
+ "Reserved memory not charged to reservation usage."
+
+ echo 'PASS'
+
+ cleanup
+ # NOTE(review): this unconditional "continue" starts the next iteration of
+ # the innermost loop, so every scenario below it (up to "done # reserve")
+ # is never executed. Confirm whether those scenarios are disabled on
+ # purpose.
+ continue
+ echo
+ echo
+ echo
+ echo Test more than reservation case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+ if [ "$reserve" != "-n" ]; then
+ run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
+ "$reserve"
+
+ expect_equal "1" "$reservation_failed" "Reservation succeeded."
+ fi
+
+ echo 'PASS'
+
+ cleanup
+
+ echo
+ echo
+ echo
+ echo Test more than cgroup limit case.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+
+ # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
+ if [[ "$method" != 2 ]]; then
+ run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"
+
+ expect_equal "1" "$oom_killed" "Not oom killed."
+ fi
+ echo 'PASS'
+
+ cleanup
+
+ echo
+ echo
+ echo
+ echo Test normal case, multiple cgroups.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
+ "$populate" "" "10" "10" "10" \
+ "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb1=$hugetlb_difference1
+ echo Memory charged to reservation1=$reserved_difference1
+ echo Memory charged to hugtlb2=$hugetlb_difference2
+ echo Memory charged to reservation2=$reserved_difference2
+
+ if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
+ expect_equal "3" "$reserved_difference1" \
+ "Incorrect reservations charged to cgroup 1."
+
+ expect_equal "5" "$reserved_difference2" \
+ "Incorrect reservation charged to cgroup 2."
+
+ else
+ expect_equal "0" "$reserved_difference1" \
+ "Incorrect reservations charged to cgroup 1."
+
+ expect_equal "0" "$reserved_difference2" \
+ "Incorrect reservation charged to cgroup 2."
+ fi
+
+ if [[ "$populate" == "-o" ]]; then
+ expect_equal "3" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "5" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+
+ else
+ expect_equal "0" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "0" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+ fi
+ echo 'PASS'
+
+ cleanup
+ echo
+ echo
+ echo
+ echo Test normal case with write, multiple cgroups.
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve
+ run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
+ "$populate" "-w" "10" "10" "10" \
+ "$method" "$private" "0" "$reserve"
+
+ echo Memory charged to hugtlb1=$hugetlb_difference1
+ echo Memory charged to reservation1=$reserved_difference1
+ echo Memory charged to hugtlb2=$hugetlb_difference2
+ echo Memory charged to reservation2=$reserved_difference2
+
+ expect_equal "3" "$hugetlb_difference1" \
+ "Incorrect hugetlb charged to cgroup 1."
+
+ expect_equal "3" "$reserved_difference1" \
+ "Incorrect reservation charged to cgroup 1."
+
+ expect_equal "5" "$hugetlb_difference2" \
+ "Incorrect hugetlb charged to cgroup 2."
+
+ expect_equal "5" "$reserved_difference2" \
+ "Incorrected reservation charged to cgroup 2."
+ echo 'PASS'
+
+ cleanup
+
+ done # reserve
+ done # private
+ done # method
+done # populate
+
+# Undo the cgroup mount only if this script created it.
+if [[ $do_umount ]]; then
+ umount $cgroup_path
+ rmdir $cgroup_path
+fi
diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c
new file mode 100644
index 000000000..9b420140b
--- /dev/null
+++ b/tools/testing/selftests/vm/compaction_test.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * A test for the patch "Allow compaction of unevictable pages".
+ * With this patch we should be able to allocate at least 1/4
+ * of RAM in huge pages. Without the patch much less is
+ * allocated.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "../kselftest.h"
+
+#define MAP_SIZE_MB 100
+#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024)
+
+struct map_list {
+ void *map;
+ struct map_list *next;
+};
+
+/*
+ * Run a shell pipeline and parse the first line of its output as a number.
+ *
+ * Returns 0 and stores the parsed value in *value on success. Returns -1 if
+ * the pipeline cannot be started or produces no output. The pipe is always
+ * closed before returning.
+ */
+static int read_meminfo_value(const char *cmd, unsigned long *value)
+{
+	char buffer[256] = {0};
+	FILE *cmdfile = popen(cmd, "r");
+
+	if (!cmdfile) {
+		perror("Failed to run meminfo command");
+		return -1;
+	}
+
+	if (!(fgets(buffer, sizeof(buffer), cmdfile))) {
+		perror("Failed to read meminfo\n");
+		pclose(cmdfile);
+		return -1;
+	}
+
+	pclose(cmdfile);
+	*value = atoll(buffer);
+
+	return 0;
+}
+
+/*
+ * Fetch the MemFree and Hugepagesize figures from /proc/meminfo.
+ *
+ * @memfree:      receives the number on the MemFree line
+ * @hugepagesize: receives the number on the Hugepagesize line
+ *
+ * Returns 0 on success, -1 if either value could not be read.
+ */
+int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize)
+{
+	if (read_meminfo_value(
+		"cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'",
+		memfree) != 0)
+		return -1;
+
+	if (read_meminfo_value(
+		"cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'",
+		hugepagesize) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Check the test precondition: the first byte of
+ * /proc/sys/vm/compact_unevictable_allowed must be '1'.
+ *
+ * Returns 0 when it is, -1 when it is not or when the file cannot be
+ * opened or read.
+ */
+int prereq(void)
+{
+	int result = -1;
+	char flag;
+	int fd = open("/proc/sys/vm/compact_unevictable_allowed",
+		      O_RDONLY | O_NONBLOCK);
+
+	if (fd < 0) {
+		perror("Failed to open\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+		return -1;
+	}
+
+	if (read(fd, &flag, sizeof(char)) != sizeof(char))
+		perror("Failed to read from\n"
+		       "/proc/sys/vm/compact_unevictable_allowed\n");
+	else if (flag == '1')
+		result = 0;
+
+	close(fd);
+	return result;
+}
+
+/*
+ * Ask the kernel for a large number of huge pages and check how much of the
+ * free memory it managed to turn into huge pages.
+ *
+ * @mem_free:      free memory figure from read_memory_info()
+ * @hugepage_size: huge page size figure from read_memory_info()
+ *
+ * Returns 0 when mem_free * 0.8 divided by the memory now held in huge
+ * pages is at most 3, and the original nr_hugepages value was written back.
+ * Returns -1 on any I/O error, when no huge page could be allocated at all,
+ * or when the ratio is larger than 3.
+ */
+int check_compaction(unsigned long mem_free, unsigned int hugepage_size)
+{
+	int fd;
+	int compaction_index = 0;
+	int nr_allocated;
+	char initial_nr_hugepages[10] = {0};
+	char nr_hugepages[10] = {0};
+
+	/* We want to test with 80% of available memory. Else, OOM killer comes
+	   in to play */
+	mem_free = mem_free * 0.8;
+
+	fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK);
+	if (fd < 0) {
+		perror("Failed to open /proc/sys/vm/nr_hugepages");
+		return -1;
+	}
+
+	/* Read one byte less than the buffer so it stays NUL-terminated. */
+	if (read(fd, initial_nr_hugepages,
+		 sizeof(initial_nr_hugepages) - 1) <= 0) {
+		perror("Failed to read from /proc/sys/vm/nr_hugepages");
+		goto close_fd;
+	}
+
+	/* Start with the initial condition of 0 huge pages*/
+	if (write(fd, "0", sizeof(char)) != sizeof(char)) {
+		perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	/* Request a large number of huge pages. The Kernel will allocate
+	   as much as it can */
+	if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) {
+		perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	lseek(fd, 0, SEEK_SET);
+
+	if (read(fd, nr_hugepages, sizeof(nr_hugepages) - 1) <= 0) {
+		perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	nr_allocated = atoi(nr_hugepages);
+
+	/*
+	 * Nothing allocated (or a zero page size) would make the ratio below
+	 * divide by zero; treat it as a test failure instead of crashing.
+	 */
+	if (nr_allocated <= 0 || hugepage_size == 0) {
+		fprintf(stderr, "ERROR: No huge pages could be allocated\n");
+		goto close_fd;
+	}
+
+	/* We should have been able to request at least 1/3 rd of the memory in
+	   huge pages. The product is widened so it cannot wrap in 32 bits. */
+	compaction_index = mem_free /
+		((unsigned long)nr_allocated * hugepage_size);
+
+	if (compaction_index > 3) {
+		printf("No of huge pages allocated = %d\n", nr_allocated);
+		fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n"
+			"as huge pages\n", compaction_index);
+		goto close_fd;
+	}
+
+	printf("No of huge pages allocated = %d\n", nr_allocated);
+
+	lseek(fd, 0, SEEK_SET);
+
+	/* Put back whatever nr_hugepages held before the test. */
+	if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages))
+	    != strlen(initial_nr_hugepages)) {
+		perror("Failed to write value to /proc/sys/vm/nr_hugepages\n");
+		goto close_fd;
+	}
+
+	close(fd);
+	return 0;
+
+ close_fd:
+	close(fd);
+	printf("Not OK. Compaction test failed.");
+	return -1;
+}
+
+
+/*
+ * Test driver: fill memory with locked mappings, unmap every other one,
+ * then run check_compaction().
+ *
+ * Returns KSFT_SKIP when prereq() fails, 0 when check_compaction()
+ * succeeds and -1 on any other failure.
+ */
+int main(int argc, char **argv)
+{
+ struct rlimit lim;
+ struct map_list *list, *entry;
+ size_t page_size, i;
+ void *map = NULL;
+ unsigned long mem_free = 0;
+ unsigned long hugepage_size = 0;
+ long mem_fragmentable_MB = 0;
+
+ if (prereq() != 0) {
+ printf("Either the sysctl compact_unevictable_allowed is not\n"
+ "set to 1 or couldn't read the proc file.\n"
+ "Skipping the test\n");
+ return KSFT_SKIP;
+ }
+
+ /* Lift the locked-memory limit; the mappings below use MAP_LOCKED. */
+ lim.rlim_cur = RLIM_INFINITY;
+ lim.rlim_max = RLIM_INFINITY;
+ if (setrlimit(RLIMIT_MEMLOCK, &lim)) {
+ perror("Failed to set rlimit:\n");
+ return -1;
+ }
+
+ page_size = getpagesize();
+
+ list = NULL;
+
+ if (read_memory_info(&mem_free, &hugepage_size) != 0) {
+ printf("ERROR: Cannot read meminfo\n");
+ return -1;
+ }
+
+ /* NOTE(review): assumes mem_free is in kB so that /1024 yields MB. */
+ mem_fragmentable_MB = mem_free * 0.8 / 1024;
+
+ /*
+ * Map MAP_SIZE_MB-sized locked chunks until the budget is used up or
+ * an allocation fails; each chunk is pushed onto the front of "list".
+ */
+ while (mem_fragmentable_MB > 0) {
+ map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0);
+ if (map == MAP_FAILED)
+ break;
+
+ entry = malloc(sizeof(struct map_list));
+ if (!entry) {
+ munmap(map, MAP_SIZE);
+ break;
+ }
+ entry->map = map;
+ entry->next = list;
+ list = entry;
+
+ /* Write something (in this case the address of the map) to
+ * ensure that KSM can't merge the mapped pages
+ */
+ for (i = 0; i < MAP_SIZE; i += page_size)
+ *(unsigned long *)(map + i) = (unsigned long)map + i;
+
+ mem_fragmentable_MB -= MAP_SIZE_MB;
+ }
+
+ /*
+ * "entry" advances twice per iteration (once in the body, once in the
+ * for-increment), so only every other mapping is unmapped and the rest
+ * stay mapped - presumably to leave locked pages scattered between the
+ * freed ranges. The list nodes themselves are not freed.
+ */
+ for (entry = list; entry != NULL; entry = entry->next) {
+ munmap(entry->map, MAP_SIZE);
+ if (!entry->next)
+ break;
+ entry = entry->next;
+ }
+
+ if (check_compaction(mem_free, hugepage_size) == 0)
+ return 0;
+
+ return -1;
+}
diff --git a/tools/testing/selftests/vm/config b/tools/testing/selftests/vm/config
new file mode 100644
index 000000000..69dd0d1aa
--- /dev/null
+++ b/tools/testing/selftests/vm/config
@@ -0,0 +1,6 @@
+CONFIG_SYSVIPC=y
+CONFIG_USERFAULTFD=y
+CONFIG_TEST_VMALLOC=m
+CONFIG_DEVICE_PRIVATE=y
+CONFIG_TEST_HMM=m
+CONFIG_GUP_BENCHMARK=y
diff --git a/tools/testing/selftests/vm/gup_benchmark.c b/tools/testing/selftests/vm/gup_benchmark.c
new file mode 100644
index 000000000..1d4359341
--- /dev/null
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -0,0 +1,143 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <linux/types.h>
+
+#define MB (1UL << 20)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+
+/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE 0x01 /* check pte is writable */
+
+struct gup_benchmark {
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+ __u64 expansion[10]; /* For future use */
+};
+
+/*
+ * Drive the gup_benchmark debugfs interface.
+ *
+ * Options:
+ *   -a / -b / -L  select PIN_FAST_BENCHMARK / PIN_BENCHMARK /
+ *                 PIN_LONGTERM_BENCHMARK
+ *   -u / -U       select GUP_FAST_BENCHMARK (default) / GUP_BENCHMARK
+ *   -m <MB>       size of the mapping in MB (default 128)
+ *   -r <n>        number of times the ioctl is issued (default 1)
+ *   -n <n>        value for nr_pages_per_call (default 1)
+ *   -t / -T       madvise the mapping with MADV_HUGEPAGE / MADV_NOHUGEPAGE
+ *   -w            set FOLL_WRITE in the request flags
+ *   -f <file>     file to map (default /dev/zero)
+ *   -S            map MAP_SHARED instead of MAP_PRIVATE
+ *   -H            add MAP_HUGETLB | MAP_ANONYMOUS to the mmap flags
+ *
+ * Returns 0 on success, -1 on an unknown option; exits non-zero when
+ * open, mmap or ioctl fails.
+ */
+int main(int argc, char **argv)
+{
+	/*
+	 * Zero the whole request: flags is OR-ed into below and the
+	 * expansion area is handed to the kernel as-is.
+	 */
+	struct gup_benchmark gup = { 0 };
+	unsigned long size = 128 * MB;
+	int i, fd, filed, opt, nr_pages = 1, thp = -1, repeats = 1, write = 0;
+	int cmd = GUP_FAST_BENCHMARK, flags = MAP_PRIVATE;
+	char *file = "/dev/zero";
+	char *p;
+
+	while ((opt = getopt(argc, argv, "m:r:n:f:abtTLUuwSH")) != -1) {
+		switch (opt) {
+		case 'a':
+			cmd = PIN_FAST_BENCHMARK;
+			break;
+		case 'b':
+			cmd = PIN_BENCHMARK;
+			break;
+		case 'L':
+			cmd = PIN_LONGTERM_BENCHMARK;
+			break;
+		case 'm':
+			size = atoi(optarg) * MB;
+			break;
+		case 'r':
+			repeats = atoi(optarg);
+			break;
+		case 'n':
+			nr_pages = atoi(optarg);
+			break;
+		case 't':
+			thp = 1;
+			break;
+		case 'T':
+			thp = 0;
+			break;
+		case 'U':
+			cmd = GUP_BENCHMARK;
+			break;
+		case 'u':
+			cmd = GUP_FAST_BENCHMARK;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case 'S':
+			flags &= ~MAP_PRIVATE;
+			flags |= MAP_SHARED;
+			break;
+		case 'H':
+			flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	/* O_CREAT requires an explicit mode argument. */
+	filed = open(file, O_RDWR|O_CREAT, 0664);
+	if (filed < 0) {
+		perror("open");
+		exit(filed);
+	}
+
+	gup.nr_pages_per_call = nr_pages;
+	if (write)
+		gup.flags |= FOLL_WRITE;
+
+	fd = open("/sys/kernel/debug/gup_benchmark", O_RDWR);
+	if (fd == -1) {
+		perror("open");
+		exit(1);
+	}
+
+	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+	if (p == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	gup.addr = (unsigned long)p;
+
+	if (thp == 1)
+		madvise(p, size, MADV_HUGEPAGE);
+	else if (thp == 0)
+		madvise(p, size, MADV_NOHUGEPAGE);
+
+	/* Touch every page so the whole range is faulted in up front. */
+	for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+		p[0] = 0;
+
+	for (i = 0; i < repeats; i++) {
+		/* Reset each round; the value is compared with size below. */
+		gup.size = size;
+		if (ioctl(fd, cmd, &gup)) {
+			perror("ioctl");
+			exit(1);
+		}
+
+		printf("Time: get:%lld put:%lld us", gup.get_delta_usec,
+			gup.put_delta_usec);
+		if (gup.size != size)
+			printf(", truncated (size: %lld)", gup.size);
+		printf("\n");
+	}
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
new file mode 100644
index 000000000..426dccc08
--- /dev/null
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -0,0 +1,1522 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management, it is a helper layer inside
+ * the linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space which
+ * makes communication and data exchange a lot easier.
+ *
+ * This framework's sole purpose is to exercise various code paths inside
+ * the kernel to make sure that HMM performs as expected and to flush out any
+ * bugs.
+ */
+
+#include "../kselftest_harness.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <strings.h>
+#include <time.h>
+#include <pthread.h>
+#include <hugetlbfs.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+
+/*
+ * This is a private UAPI to the kernel test module so it isn't exported
+ * in the usual include/uapi/... directory.
+ */
+#include "../../../../lib/test_hmm_uapi.h"
+
+struct hmm_buffer {
+ void *ptr;
+ void *mirror;
+ unsigned long size;
+ int fd;
+ uint64_t cpages;
+ uint64_t faults;
+};
+
+#define TWOMEG (1 << 21)
+#define HMM_BUFFER_SIZE (1024 << 12)
+#define HMM_PATH_MAX 64
+#define NTIMES 10
+
+#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1)))
+
+FIXTURE(hmm)
+{
+ int fd;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+FIXTURE(hmm2)
+{
+ int fd0;
+ int fd1;
+ unsigned int page_size;
+ unsigned int page_shift;
+};
+
+/*
+ * Open /dev/hmm_dmirror<unit> read-write.
+ *
+ * Returns the file descriptor from open(); when it is negative a
+ * diagnostic naming the device path is printed to stderr first.
+ */
+static int hmm_open(int unit)
+{
+	char dev_path[HMM_PATH_MAX];
+	int fd;
+
+	snprintf(dev_path, sizeof(dev_path), "/dev/hmm_dmirror%d", unit);
+
+	fd = open(dev_path, O_RDWR, 0);
+	if (fd >= 0)
+		return fd;
+
+	fprintf(stderr, "could not open hmm dmirror driver (%s)\n",
+		dev_path);
+	return fd;
+}
+
+/*
+ * Per-test setup for the "hmm" fixture: record the system page size and
+ * its log2, then open dmirror device 0.
+ */
+FIXTURE_SETUP(hmm)
+{
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+ /* ffs() is 1-based, so this is the bit index of the lowest set bit. */
+ self->page_shift = ffs(self->page_size) - 1;
+
+ self->fd = hmm_open(0);
+ ASSERT_GE(self->fd, 0);
+}
+
+/*
+ * Per-test setup for the "hmm2" fixture: record the system page size and
+ * its log2, then open dmirror devices 0 and 1.
+ */
+FIXTURE_SETUP(hmm2)
+{
+ self->page_size = sysconf(_SC_PAGE_SIZE);
+ /* ffs() is 1-based, so this is the bit index of the lowest set bit. */
+ self->page_shift = ffs(self->page_size) - 1;
+
+ self->fd0 = hmm_open(0);
+ ASSERT_GE(self->fd0, 0);
+ self->fd1 = hmm_open(1);
+ ASSERT_GE(self->fd1, 0);
+}
+
+/*
+ * Per-test teardown for the "hmm" fixture: close the device and require
+ * that close() succeeded.
+ */
+FIXTURE_TEARDOWN(hmm)
+{
+ int ret = close(self->fd);
+
+ ASSERT_EQ(ret, 0);
+ /* Mark the descriptor as no longer valid. */
+ self->fd = -1;
+}
+
+/*
+ * Per-test teardown for the "hmm2" fixture: close both devices in turn
+ * and require that each close() succeeded.
+ */
+FIXTURE_TEARDOWN(hmm2)
+{
+ int ret = close(self->fd0);
+
+ ASSERT_EQ(ret, 0);
+ /* Mark the descriptor as no longer valid. */
+ self->fd0 = -1;
+
+ ret = close(self->fd1);
+ ASSERT_EQ(ret, 0);
+ self->fd1 = -1;
+}
+
+/*
+ * Issue one dmirror ioctl for a buffer, retrying while it is interrupted.
+ *
+ * @fd:      open dmirror device
+ * @request: ioctl request code
+ * @buffer:  ptr and mirror are passed to the driver; cpages and faults are
+ *           filled in from the driver's reply on success
+ * @npages:  page count passed to the driver
+ *
+ * Returns 0 on success or -errno of the failing ioctl. EINTR is never
+ * returned; the call is simply repeated.
+ */
+static int hmm_dmirror_cmd(int fd,
+			   unsigned long request,
+			   struct hmm_buffer *buffer,
+			   unsigned long npages)
+{
+	struct hmm_dmirror_cmd cmd;
+	int ret;
+
+	cmd.addr = (__u64)buffer->ptr;
+	cmd.ptr = (__u64)buffer->mirror;
+	cmd.npages = npages;
+
+	do {
+		ret = ioctl(fd, request, &cmd);
+		if (ret != 0 && errno != EINTR)
+			return -errno;
+	} while (ret != 0);
+
+	buffer->cpages = cmd.cpages;
+	buffer->faults = cmd.faults;
+
+	return 0;
+}
+
+/*
+ * Release an hmm_buffer: unmap ptr when it is set, free the mirror copy
+ * and then the descriptor itself. A NULL buffer is ignored.
+ */
+static void hmm_buffer_free(struct hmm_buffer *buffer)
+{
+	if (buffer != NULL) {
+		if (buffer->ptr)
+			munmap(buffer->ptr, buffer->size);
+		free(buffer->mirror);
+		free(buffer);
+	}
+}
+
+/*
+ * Create an unnamed temporary file under /tmp (O_TMPFILE) and size it to
+ * "size" bytes. Returns the descriptor, or -1 if the file could not be
+ * created or resized.
+ */
+static int hmm_create_file(unsigned long size)
+{
+	char path[HMM_PATH_MAX] = "/tmp";
+	int fd;
+	int r;
+
+	fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600);
+	if (fd < 0)
+		return -1;
+
+	/* Retry the resize for as long as it is interrupted by a signal. */
+	for (;;) {
+		r = ftruncate(fd, size);
+		if (!(r == -1 && errno == EINTR))
+			break;
+	}
+
+	if (r) {
+		close(fd);
+		return -1;
+	}
+	return fd;
+}
+
+/*
+ * Return a random unsigned number read from /dev/urandom.
+ *
+ * The device is opened on first use and the descriptor is kept for later
+ * calls. Returns ~0U (after printing a diagnostic) when the device cannot
+ * be opened or does not deliver a full value.
+ */
+static unsigned int hmm_random(void)
+{
+	static int fd = -1;
+	unsigned int r;
+
+	if (fd < 0) {
+		fd = open("/dev/urandom", O_RDONLY);
+		if (fd < 0) {
+			fprintf(stderr, "%s:%d failed to open /dev/urandom\n",
+					__FILE__, __LINE__);
+			return ~0U;
+		}
+	}
+	/* Never hand back an uninitialised value on a failed/short read. */
+	if (read(fd, &r, sizeof(r)) != sizeof(r)) {
+		fprintf(stderr, "%s:%d failed to read /dev/urandom\n",
+				__FILE__, __LINE__);
+		return ~0U;
+	}
+	return r;
+}
+
+/*
+ * Sleep for n nanoseconds.
+ *
+ * n is split into whole seconds and a remainder so that tv_nsec always
+ * stays below one second, which nanosleep() requires.
+ */
+static void hmm_nanosleep(unsigned int n)
+{
+	struct timespec t;
+
+	t.tv_sec = n / 1000000000U;
+	t.tv_nsec = n % 1000000000U;
+	nanosleep(&t, NULL);
+}
+
+/*
+ * Simple NULL test of device open/close.
+ */
+TEST_F(hmm, open_close)
+{
+}
+
+/*
+ * Read private anonymous memory.
+ *
+ * Maps an anonymous buffer, fills all but its first two pages, makes it
+ * read-only, asks the device to read it into the mirror and checks that
+ * the mirror matches what was written.
+ */
+TEST_F(hmm, anon_read)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+ int val;
+
+ /* HMM_BUFFER_SIZE rounded up to whole pages, as a page count. */
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ /* The mirror is the ordinary heap copy the driver reads into. */
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /*
+ * Initialize buffer in system memory but leave the first two pages
+ * zero (pte_none and pfn_zero).
+ */
+ i = 2 * self->page_size / sizeof(*ptr);
+ for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Set buffer permission to read-only. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Populate the CPU page table with a special zero page. */
+ val = *(int *)(buffer->ptr + self->page_size);
+ ASSERT_EQ(val, 0);
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ /* The driver must report all pages and exactly one fault. */
+ ASSERT_EQ(buffer->cpages, npages);
+ ASSERT_EQ(buffer->faults, 1);
+
+ /* Check what the device read. */
+ ptr = buffer->mirror;
+ /* The first two pages were never written and must read as zero. */
+ for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], 0);
+ /* The rest must carry the index pattern written above. */
+ for (; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Read private anonymous memory which has been protected with
+ * mprotect() PROT_NONE.
+ *
+ * The device read is expected to fail with -EFAULT and to leave both the
+ * source buffer and the mirror untouched.
+ */
+TEST_F(hmm, anon_read_prot)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ unsigned long i;
+ int *ptr;
+ int ret;
+
+ /* HMM_BUFFER_SIZE rounded up to whole pages, as a page count. */
+ npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+ ASSERT_NE(npages, 0);
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ buffer->ptr = mmap(NULL, size,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+ /* Initialize buffer in system memory. */
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ptr[i] = i;
+
+ /* Initialize mirror buffer so we can verify it isn't written. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ptr[i] = -i;
+
+ /* Protect buffer from reading. */
+ ret = mprotect(buffer->ptr, size, PROT_NONE);
+ ASSERT_EQ(ret, 0);
+
+ /* Simulate a device reading system memory. */
+ ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+ ASSERT_EQ(ret, -EFAULT);
+
+ /* Allow CPU to read the buffer so we can check it. */
+ ret = mprotect(buffer->ptr, size, PROT_READ);
+ ASSERT_EQ(ret, 0);
+ for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], i);
+
+ /* Check what the device read. */
+ /* The mirror must still hold the -i pattern, i.e. nothing was copied. */
+ for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+ ASSERT_EQ(ptr[i], -i);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Have the simulated device write a whole buffer of private anonymous
+ * memory, then verify that the CPU sees the written data.
+ */
+TEST_F(hmm, anon_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Stage in the mirror the data the device will write to buffer->ptr. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The buffer must now contain what was staged in the mirror. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * A device write to private anonymous memory mapped PROT_READ must fail
+ * with -EPERM and leave the memory zero; once the range is made writable
+ * the same write must succeed.
+ */
+TEST_F(hmm, anon_write_prot)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* The mapping starts out read-only. */
+	buffer->ptr = mmap(NULL, size, PROT_READ,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading a zero page of memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 1);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Stage in the mirror the data the device will try to write. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* The device write must be refused while the range is read-only. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, -EPERM);
+
+	/* Nothing may have been written: the buffer still reads as zero. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], 0);
+
+	/* Now allow writing and see that the zero page is replaced. */
+	ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* This time the device write is expected to succeed. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The buffer must now contain what was staged in the mirror. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that a device writing an anonymous private mapping
+ * will copy-on-write if a child process inherits the mapping.
+ *
+ * The parent fills the buffer and the mirror, forks, and waits. The child
+ * opens its own mirror device, lets the device overwrite the buffer with the
+ * mirror contents, checks the result, and exits. The parent then verifies
+ * that its own view of the buffer still holds the original pattern.
+ */
+TEST_F(hmm, anon_write_child)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	/* A failed fork() is turned into an assertion failure. */
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		/* Parent: wait for the child to finish its device write. */
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did not change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		/* Release the buffer instead of leaking it on this path. */
+		hmm_buffer_free(buffer);
+		return;
+	}
+
+	/* Child: check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Check that a device writing an anonymous shared mapping
+ * will not copy-on-write if a child process inherits the mapping.
+ *
+ * Same flow as the private-mapping variant, but the buffer is MAP_SHARED,
+ * so the parent expects to see the data the device wrote in the child.
+ */
+TEST_F(hmm, anon_write_child_shared)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	pid_t pid;
+	int child_fd;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer->ptr so we can tell if it is written. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Initialize data that the device will write to buffer->ptr. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ptr[i] = -i;
+
+	pid = fork();
+	/* A failed fork() is turned into an assertion failure. */
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (pid != 0) {
+		/* Parent: wait for the child to finish its device write. */
+		waitpid(pid, &ret, 0);
+		ASSERT_EQ(WIFEXITED(ret), 1);
+
+		/* Check that the parent's buffer did change. */
+		for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+			ASSERT_EQ(ptr[i], -i);
+
+		/* Release the buffer instead of leaking it on this path. */
+		hmm_buffer_free(buffer);
+		return;
+	}
+
+	/* Child: check that we see the parent's values. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	/* The child process needs its own mirror to its own mm. */
+	child_fd = hmm_open(0);
+	ASSERT_GE(child_fd, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device wrote. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], -i);
+
+	close(child_fd);
+	exit(0);
+}
+
+/*
+ * Device write to a private anonymous range advised with MADV_HUGEPAGE.
+ *
+ * Twice TWOMEG is mapped so that a TWOMEG-aligned window of TWOMEG bytes
+ * lies inside the mapping; only that window is advised and written.
+ */
+TEST_F(hmm, anon_write_huge)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	void *mapping_start;
+	void *aligned;
+	int *ptr;
+	int ret;
+
+	size = 2 * TWOMEG;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* From here on work with the aligned TWOMEG window only. */
+	size = TWOMEG;
+	nwords = size / sizeof(*ptr);
+	npages = size >> self->page_shift;
+	aligned = (void *)ALIGN((uintptr_t)buffer->ptr, size);
+	ret = madvise(aligned, size, MADV_HUGEPAGE);
+	ASSERT_EQ(ret, 0);
+
+	/* Remember the real mapping start; it is restored before freeing. */
+	mapping_start = buffer->ptr;
+	buffer->ptr = aligned;
+
+	/* Stage in the mirror the data the device will write to buffer->ptr. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The aligned window must now contain the staged data. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	buffer->ptr = mapping_start;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Device write to a region obtained from get_hugepage_region().
+ * The test is skipped when no huge page size is reported or when the
+ * region cannot be allocated.
+ */
+TEST_F(hmm, anon_write_hugetlbfs)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	long pagesizes[4];
+	int n, idx;
+
+	n = gethugepagesizes(pagesizes, 4);
+	if (n <= 0)
+		SKIP(return, "Huge page size could not be determined");
+
+	/* Pick the index of the smallest reported huge page size. */
+	idx = 0;
+	while (--n > 0) {
+		if (pagesizes[n] < pagesizes[idx])
+			idx = n;
+	}
+	size = ALIGN(TWOMEG, pagesizes[idx]);
+	npages = size >> self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->ptr = get_hugepage_region(size, GHR_STRICT);
+	if (buffer->ptr == NULL) {
+		free(buffer);
+		SKIP(return, "Huge page could not be allocated");
+	}
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Stage in the mirror the data the device will write to buffer->ptr. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The huge page region must now contain the staged data. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* The region has its own release call; clear ptr before the free. */
+	free_hugepage_region(buffer->ptr);
+	buffer->ptr = NULL;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Device read of a read-only shared file mapping: the file is filled with
+ * a known pattern through pwrite() and the device must read it back.
+ */
+TEST_F(hmm, file_read)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = fd;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Use the mirror as scratch space to write the file's contents. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+	len = pwrite(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+
+	/* Clear the mirror so the device read result can be recognised. */
+	memset(buffer->mirror, 0, size);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The mirror must now hold the pattern stored in the file. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Device write to a shared file mapping: the data must be visible both
+ * through the mapping and when the file is read back with pread().
+ */
+TEST_F(hmm, file_write)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	int fd;
+	ssize_t len;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	fd = hmm_create_file(size);
+	ASSERT_GE(fd, 0);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = fd;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Stage in the mirror the data the device will write to buffer->ptr. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* The mapping must show the staged data. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Read the file back into the mirror: it must hold the same data. */
+	len = pread(fd, buffer->mirror, size, 0);
+	ASSERT_EQ(len, size);
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate a buffer of private anonymous memory with HMM_DMIRROR_MIGRATE
+ * and check that the mirror then holds the buffer's contents.
+ */
+TEST_F(hmm, migrate)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Fill the buffer in system memory with a known pattern. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* The mirror must hold the pattern that was in the buffer. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to the device, read the first half of it from
+ * the CPU to fault it back to system memory, then migrate the whole range,
+ * now a mix of system and device private memory, to the device again.
+ */
+TEST_F(hmm, migrate_fault)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long nhalf;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+	nhalf = size / (2 * sizeof(*ptr));
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Fill the buffer in system memory with a known pattern. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nwords; ++i)
+		ptr[i] = i;
+
+	/* First migration to the device. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* The mirror must hold the pattern that was in the buffer. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Fault half the pages back to system memory and check them. */
+	ptr = buffer->ptr;
+	for (i = 0; i < nhalf; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Second migration, over the now mixed range. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* The mirror must again hold the full pattern. */
+	ptr = buffer->mirror;
+	for (i = 0; i < nwords; ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Attempt to migrate anonymous shared memory to device private memory;
+ * the migrate command is expected to fail with -ENOENT.
+ */
+TEST_F(hmm, migrate_shared)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   MAP_SHARED | MAP_ANONYMOUS, buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* The migration request must be rejected for a shared mapping. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+	ASSERT_EQ(ret, -ENOENT);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ *
+ * A six page PROT_NONE reservation is carved up as follows before the
+ * final migrations: page 0 stays PROT_NONE, page 1 is unmapped, page 2 is
+ * read-only and only ever read, page 3 is written once and then made
+ * read-only, pages 4-5 are read-write and written.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ int *ptr;
+ unsigned char *p;
+ int ret;
+ int val;
+
+ npages = 6;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ buffer->mirror = malloc(size);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+ /* Keep the start of the reservation; buffer->ptr is re-pointed below. */
+ p = buffer->ptr;
+
+ /* Migrating a protected area should be an error. */
+ ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, npages);
+ ASSERT_EQ(ret, -EINVAL);
+
+ /* Punch a hole after the first page address. */
+ ret = munmap(buffer->ptr + self->page_size, self->page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* We expect an error if the vma doesn't cover the range. */
+ ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 3);
+ ASSERT_EQ(ret, -EINVAL);
+
+ /* Page 2 will be a read-only zero page. */
+ ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 2 * self->page_size);
+ /* Reading the page touches it; val is reused as the data written below. */
+ val = *ptr + 3;
+ ASSERT_EQ(val, 3);
+
+ /* Page 3 will be read-only: make it writable, write it, then protect it. */
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 3 * self->page_size);
+ *ptr = val;
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 4-5 will be read-write. */
+ ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 4 * self->page_size);
+ *ptr = val;
+ ptr = (int *)(buffer->ptr + 5 * self->page_size);
+ *ptr = val;
+
+ /* Now try to migrate pages 2-5 to device 1. */
+ buffer->ptr = p + 2 * self->page_size;
+ ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 4);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 4);
+
+ /* Page 5 won't be migrated to device 0 because it's on device 1. */
+ buffer->ptr = p + 5 * self->page_size;
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ASSERT_EQ(ret, -ENOENT);
+ buffer->ptr = p;
+
+ /*
+ * Restore the start of the reservation before freeing the buffer.
+ * NOTE(review): buffer->ptr was already reset to p just above, so this
+ * second assignment is redundant.
+ */
+ buffer->ptr = p;
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Repeat NTIMES: allocate a fresh anonymous buffer, migrate it to the
+ * device, then read it from the CPU to fault it back to system memory.
+ */
+TEST_F(hmm, migrate_multiple)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	unsigned long c;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	for (c = 0; c < NTIMES; c++) {
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->size = size;
+		buffer->fd = -1;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* Fill the buffer in system memory with a known pattern. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nwords; ++i)
+			ptr[i] = i;
+
+		/* Migrate memory to device. */
+		ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer,
+				      npages);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(buffer->cpages, npages);
+
+		/* The mirror must hold the pattern that was in the buffer. */
+		ptr = buffer->mirror;
+		for (i = 0; i < nwords; ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		/* Fault pages back to system memory and check them. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nwords; ++i)
+			ASSERT_EQ(ptr[i], i);
+
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Repeat NTIMES: allocate a fresh anonymous buffer, fill it with a
+ * pattern that depends on the iteration, and have the device read it.
+ */
+TEST_F(hmm, anon_read_multiple)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long i;
+	unsigned long c;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(*ptr);
+
+	for (c = 0; c < NTIMES; c++) {
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->size = size;
+		buffer->fd = -1;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* The pattern is offset by c so each iteration differs. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nwords; ++i)
+			ptr[i] = i + c;
+
+		/* Simulate a device reading system memory. */
+		ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+				      npages);
+		ASSERT_EQ(ret, 0);
+		ASSERT_EQ(buffer->cpages, npages);
+		ASSERT_EQ(buffer->faults, 1);
+
+		/* The mirror must hold this iteration's pattern. */
+		ptr = buffer->mirror;
+		for (i = 0; i < nwords; ++i)
+			ASSERT_EQ(ptr[i], i + c);
+
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Thread start routine: p points to a struct hmm_buffer. After a random
+ * delay, unmap the second half of the buffer's mapping and clear
+ * buffer->ptr. Always returns NULL.
+ */
+void *unmap_buffer(void *p)
+{
+	struct hmm_buffer *target = p;
+
+	/* Sleep for a random time so the unmap lands while the creator runs. */
+	hmm_nanosleep(hmm_random() % 32000);
+
+	/* Drop the upper half of the mapping, then forget the pointer. */
+	munmap(target->ptr + target->size / 2, target->size / 2);
+	target->ptr = NULL;
+
+	return NULL;
+}
+
+/*
+ * Repeat NTIMES: start a thread running unmap_buffer() on a fresh
+ * anonymous buffer and issue a device read of that buffer at the same
+ * time. The read may fail; when it succeeds its result is verified.
+ */
+TEST_F(hmm, anon_teardown)
+{
+	unsigned long npages;
+	unsigned long nwords;
+	unsigned long size;
+	unsigned long c;
+	void *ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+	nwords = size / sizeof(int);
+
+	for (c = 0; c < NTIMES; ++c) {
+		pthread_t thread;
+		struct hmm_buffer *buffer;
+		unsigned long i;
+		int *ptr;
+		int rc;
+
+		buffer = malloc(sizeof(*buffer));
+		ASSERT_NE(buffer, NULL);
+
+		buffer->size = size;
+		buffer->fd = -1;
+		buffer->mirror = malloc(size);
+		ASSERT_NE(buffer->mirror, NULL);
+
+		buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				   MAP_PRIVATE | MAP_ANONYMOUS,
+				   buffer->fd, 0);
+		ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+		/* The pattern is offset by c so each iteration differs. */
+		ptr = buffer->ptr;
+		for (i = 0; i < nwords; ++i)
+			ptr[i] = i + c;
+
+		/* Start the thread that unmaps part of the buffer. */
+		rc = pthread_create(&thread, NULL, unmap_buffer, buffer);
+		ASSERT_EQ(rc, 0);
+
+		/* Simulate a device reading system memory. */
+		rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer,
+				     npages);
+		if (rc == 0) {
+			ASSERT_EQ(buffer->cpages, npages);
+			ASSERT_EQ(buffer->faults, 1);
+
+			/* A successful read must have returned the pattern. */
+			ptr = buffer->mirror;
+			for (i = 0; i < nwords; ++i)
+				ASSERT_EQ(ptr[i], i + c);
+		}
+
+		pthread_join(thread, &ret);
+		hmm_buffer_free(buffer);
+	}
+}
+
+/*
+ * Snapshot a single page that is a private mapping of the test device
+ * file itself (self->fd) and check that it is reported as
+ * HMM_DMIRROR_PROT_READ.
+ */
+TEST_F(hmm, mixedmap)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned char *prot;
+	int ret;
+
+	npages = 1;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->size = size;
+	buffer->fd = -1;
+	/* The snapshot result is read back as one byte per page. */
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Map one page of the device file. */
+	buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE,
+			   self->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	prot = buffer->mirror;
+	ASSERT_EQ(prot[0], HMM_DMIRROR_PROT_READ);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test memory snapshot without faulting in pages accessed by the device.
+ *
+ * A seven page PROT_NONE reservation is set up with a different state per
+ * page (see the comments below), then device 0 snapshots the whole range
+ * and the per-page result bytes in the mirror are checked.
+ */
+TEST_F(hmm2, snapshot)
+{
+ struct hmm_buffer *buffer;
+ unsigned long npages;
+ unsigned long size;
+ int *ptr;
+ unsigned char *p;
+ unsigned char *m;
+ int ret;
+ int val;
+
+ npages = 7;
+ size = npages << self->page_shift;
+
+ buffer = malloc(sizeof(*buffer));
+ ASSERT_NE(buffer, NULL);
+
+ buffer->fd = -1;
+ buffer->size = size;
+ /* The snapshot result is checked below as one byte per page. */
+ buffer->mirror = malloc(npages);
+ ASSERT_NE(buffer->mirror, NULL);
+
+ /* Reserve a range of addresses. */
+ buffer->ptr = mmap(NULL, size,
+ PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS,
+ buffer->fd, 0);
+ ASSERT_NE(buffer->ptr, MAP_FAILED);
+ /* Keep the start of the reservation; buffer->ptr is re-pointed below. */
+ p = buffer->ptr;
+
+ /* Punch a hole after the first page address. */
+ ret = munmap(buffer->ptr + self->page_size, self->page_size);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 2 will be read-only zero page. */
+ ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 2 * self->page_size);
+ /* Reading the page touches it; val is reused as the data written below. */
+ val = *ptr + 3;
+ ASSERT_EQ(val, 3);
+
+ /* Page 3 will be read-only: make it writable, write it, then protect it. */
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 3 * self->page_size);
+ *ptr = val;
+ ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+ PROT_READ);
+ ASSERT_EQ(ret, 0);
+
+ /* Page 4-6 will be read-write; only page 4 is written from the CPU. */
+ ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size,
+ PROT_READ | PROT_WRITE);
+ ASSERT_EQ(ret, 0);
+ ptr = (int *)(buffer->ptr + 4 * self->page_size);
+ *ptr = val;
+
+ /* Page 5 will be migrated to device 0. */
+ buffer->ptr = p + 5 * self->page_size;
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 1);
+
+ /* Page 6 will be migrated to device 1. */
+ buffer->ptr = p + 6 * self->page_size;
+ ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_MIGRATE, buffer, 1);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, 1);
+
+ /* Simulate a device snapshotting CPU pagetables. */
+ buffer->ptr = p;
+ ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+ ASSERT_EQ(ret, 0);
+ ASSERT_EQ(buffer->cpages, npages);
+
+ /*
+ * Check what the device saw. Expected, per page:
+ * 0 (still PROT_NONE) and 1 (unmapped): error;
+ * 2: zero page, readable; 3: readable; 4: writable;
+ * 5 (migrated to device 0, the snapshotting device): local device
+ * private, writable;
+ * 6 (migrated to device 1): none.
+ */
+ m = buffer->mirror;
+ ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR);
+ ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR);
+ ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ);
+ ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ);
+ ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL |
+ HMM_DMIRROR_PROT_WRITE);
+ ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE);
+
+ hmm_buffer_free(buffer);
+}
+
+/*
+ * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that
+ * should be mapped by a large page table entry.
+ *
+ * A region from get_hugepage_region() is snapshotted twice: once while
+ * writable and once after mprotect(PROT_READ). Every page is expected to
+ * be reported with HMM_DMIRROR_PROT_PMD. The test is skipped, not passed,
+ * when no huge page size is reported or the region cannot be allocated.
+ */
+TEST_F(hmm, compound)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int *ptr;
+	unsigned char *m;
+	int ret;
+	long pagesizes[4];
+	int n, idx;
+	unsigned long i;
+
+	/* Skip test if we can't allocate a hugetlbfs page. */
+
+	n = gethugepagesizes(pagesizes, 4);
+	if (n <= 0)
+		SKIP(return, "Huge page size could not be determined");
+	/* Pick the index of the smallest reported huge page size. */
+	for (idx = 0; --n > 0; ) {
+		if (pagesizes[n] < pagesizes[idx])
+			idx = n;
+	}
+	size = ALIGN(TWOMEG, pagesizes[idx]);
+	npages = size >> self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->ptr = get_hugepage_region(size, GHR_STRICT);
+	if (buffer->ptr == NULL) {
+		free(buffer);
+		SKIP(return, "Huge page could not be allocated");
+	}
+
+	/* Initialise fd like every other test; no file backs this buffer. */
+	buffer->fd = -1;
+	buffer->size = size;
+	/* The snapshot result is checked below as one byte per page. */
+	buffer->mirror = malloc(npages);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Initialize the pages the device will snapshot in buffer->ptr. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	for (i = 0; i < npages; ++i)
+		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE |
+				HMM_DMIRROR_PROT_PMD);
+
+	/* Make the region read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device snapshotting CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device saw. */
+	m = buffer->mirror;
+	for (i = 0; i < npages; ++i)
+		ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ |
+				HMM_DMIRROR_PROT_PMD);
+
+	/* The region has its own release call; clear ptr before the free. */
+	free_hugepage_region(buffer->ptr);
+	buffer->ptr = NULL;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test two devices reading the same memory (double mapped).
+ *
+ * A six page read-only anonymous buffer is read first through device 0
+ * and then through device 1; both reads must return the buffer contents.
+ */
+TEST_F(hmm2, double_map)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = 6;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	/*
+	 * The mirror receives the data read by the device and is verified
+	 * below over size bytes, so it must be size bytes long, not npages.
+	 */
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Map the buffer read-write so it can be initialized. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Make region read-only. */
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate device 0 reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Simulate device 1 reading system memory. */
+	ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	ASSERT_EQ(buffer->faults, 1);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Punch a hole after the first page address. */
+	ret = munmap(buffer->ptr + self->page_size, self->page_size);
+	ASSERT_EQ(ret, 0);
+
+	hmm_buffer_free(buffer);
+}
+
+TEST_HARNESS_MAIN
diff --git a/tools/testing/selftests/vm/hugepage-mmap.c b/tools/testing/selftests/vm/hugepage-mmap.c
new file mode 100644
index 000000000..93f9e7b81
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-mmap.c
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application using the mmap
+ * system call. Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define FILE_NAME "huge/hugepagefile"
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+/* Print the first unsigned int stored at @addr in hex. */
+static void check_bytes(char *addr)
+{
+	unsigned int *first_word = (unsigned int *)addr;
+
+	printf("First hex is %x\n", *first_word);
+}
+
+/* Fill all LENGTH bytes at @addr with the low byte of each byte's offset. */
+static void write_bytes(char *addr)
+{
+	unsigned long off = 0;
+
+	while (off < LENGTH) {
+		addr[off] = (char)off;
+		off++;
+	}
+}
+
+/*
+ * Verify the pattern stored by write_bytes().
+ *
+ * Returns 0 when every byte matches, or 1 after reporting the offset of the
+ * first mismatch.
+ */
+static int read_bytes(char *addr)
+{
+	unsigned long off;
+
+	check_bytes(addr);
+	for (off = 0; off < LENGTH; off++) {
+		if (addr[off] == (char)off)
+			continue;
+		printf("Mismatch at %lu\n", off);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Map LENGTH bytes of FILE_NAME, write a byte pattern over the mapping and
+ * read it back.  Returns the result of read_bytes(), or exits with 1 when
+ * the file cannot be opened or mapped.
+ */
+int main(void)
+{
+	void *map;
+	int hfd, status;
+
+	hfd = open(FILE_NAME, O_CREAT | O_RDWR, 0755);
+	if (hfd < 0) {
+		perror("Open failed");
+		exit(1);
+	}
+
+	map = mmap(ADDR, LENGTH, PROTECTION, FLAGS, hfd, 0);
+	if (map == MAP_FAILED) {
+		perror("mmap");
+		/* Do not leave the backing file behind. */
+		unlink(FILE_NAME);
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", map);
+	check_bytes(map);
+	write_bytes(map);
+	status = read_bytes(map);
+
+	munmap(map, LENGTH);
+	close(hfd);
+	unlink(FILE_NAME);
+
+	return status;
+}
diff --git a/tools/testing/selftests/vm/hugepage-shm.c b/tools/testing/selftests/vm/hugepage-shm.c
new file mode 100644
index 000000000..e2527f320
--- /dev/null
+++ b/tools/testing/selftests/vm/hugepage-shm.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls. In this example the app is requesting 256MB of
+ * memory that is backed by huge pages. The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
+ *
+ * Note: The default shared memory limit is quite low on many kernels,
+ * you may need to increase it via:
+ *
+ * echo 268435456 > /proc/sys/kernel/shmmax
+ *
+ * This will increase the maximum size per shared memory segment to 256MB.
+ * The other limit that you will hit eventually is shmall which is the
+ * total amount of shared memory in pages. To set it to 16GB on a system
+ * with a 4kB pagesize do:
+ *
+ * echo 4194304 > /proc/sys/kernel/shmall
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/mman.h>
+
+#ifndef SHM_HUGETLB
+#define SHM_HUGETLB 04000
+#endif
+
+#define LENGTH (256UL*1024*1024)
+
+#define dprintf(x) printf(x)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define SHMAT_FLAGS (SHM_RND)
+#else
+#define ADDR (void *)(0x0UL)
+#define SHMAT_FLAGS (0)
+#endif
+
+/*
+ * Create a LENGTH-byte SysV shared memory segment with SHM_HUGETLB, attach
+ * it, write a byte pattern over the whole segment, verify it, then detach
+ * and remove the segment.  Each failure point has its own exit code (1-4).
+ */
+int main(void)
+{
+	char *seg;
+	unsigned long off;
+	int id;
+
+	id = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+	if (id < 0) {
+		perror("shmget");
+		exit(1);
+	}
+	printf("shmid: 0x%x\n", id);
+
+	seg = shmat(id, ADDR, SHMAT_FLAGS);
+	if (seg == (char *)-1) {
+		perror("Shared memory attach failure");
+		shmctl(id, IPC_RMID, NULL);
+		exit(2);
+	}
+	printf("shmaddr: %p\n", seg);
+
+	dprintf("Starting the writes:\n");
+	off = 0;
+	while (off < LENGTH) {
+		seg[off] = (char)(off);
+		/* One progress dot per megabyte written. */
+		if (off % (1024 * 1024) == 0)
+			dprintf(".");
+		off++;
+	}
+	dprintf("\n");
+
+	dprintf("Starting the Check...");
+	for (off = 0; off < LENGTH; off++) {
+		if (seg[off] == (char)off)
+			continue;
+		printf("\nIndex %lu mismatched\n", off);
+		exit(3);
+	}
+	dprintf("Done.\n");
+
+	if (shmdt((const void *)seg) != 0) {
+		perror("Detach failure");
+		shmctl(id, IPC_RMID, NULL);
+		exit(4);
+	}
+
+	shmctl(id, IPC_RMID, NULL);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
new file mode 100644
index 000000000..c665b16f1
--- /dev/null
+++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh
@@ -0,0 +1,249 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+set -e
+
+if [[ $(id -u) -ne 0 ]]; then
+  echo "This test must be run as root. Skipping..."
+  exit 0
+fi
+
+# Name of the per-cgroup usage file; cgroup v2 calls it "current".
+usage_file=usage_in_bytes
+
+if [[ "$1" == "-cgroup-v2" ]]; then
+  cgroup2=1
+  usage_file=current
+fi
+
+
+# Locate an existing cgroup mount. If there is none, mount one ourselves and
+# remember that (do_umount) so it can be torn down at the end of the run.
+if [[ $cgroup2 ]]; then
+  CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    # The mount point does not exist on a fresh system.
+    mkdir -p "$CGROUP_ROOT"
+    mount -t cgroup2 none "$CGROUP_ROOT"
+    do_umount=1
+  fi
+  echo "+hugetlb +memory" >"$CGROUP_ROOT"/cgroup.subtree_control
+else
+  CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
+  if [[ -z "$CGROUP_ROOT" ]]; then
+    CGROUP_ROOT=/dev/cgroup/memory
+    # The mount point does not exist on a fresh system.
+    mkdir -p "$CGROUP_ROOT"
+    mount -t cgroup memory,hugetlb "$CGROUP_ROOT"
+    do_umount=1
+  fi
+fi
+MNT='/mnt/huge/'
+
+# Print the huge page size in MB, taken from the "Hugepagesize:" line of
+# /proc/meminfo (whose value is in kB).
+# Returns non-zero, printing nothing on stdout, if that line is missing.
+function get_machine_hugepage_size() {
+  local kb
+
+  # Pick the numeric field by name instead of by fixed character offsets.
+  kb=$(awk 'tolower($1) == "hugepagesize:" {print $2; exit}' /proc/meminfo)
+  if [[ -z "$kb" ]]; then
+    echo "Hugepagesize not found in /proc/meminfo" >&2
+    return 1
+  fi
+  echo $((kb / 1024))
+}
+
+MB=$(get_machine_hugepage_size)
+
+# Best-effort teardown: empty and unmount $MNT, remove the test cgroups and
+# release the reserved huge pages. Errors are ignored while this runs.
+function cleanup() {
+  local cg
+
+  echo cleanup
+  set +e
+  rm -rf "$MNT"/* 2>/dev/null
+  umount "$MNT" 2>/dev/null
+  rmdir "$MNT" 2>/dev/null
+  # Children first, so the parent directory is empty when it is removed.
+  for cg in a/b a test1; do
+    rmdir "$CGROUP_ROOT"/$cg 2>/dev/null
+  done
+  echo 0 >/proc/sys/vm/nr_hugepages
+  set -e
+}
+
+# Compare one cgroup usage counter against its expected value.
+#   $1 - label used in the failure report (e.g. "a hugetlb")
+#   $2 - path of the usage file to read
+#   $3 - expected usage in bytes
+# Values within 5MB of the expectation pass. Otherwise the mismatch is
+# printed, cleanup runs and the script exits with status 1.
+function assert_usage_near() {
+  local label="$1"
+  local usage_path="$2"
+  local expected="$3"
+  local tolerance=$((5 * 1024 * 1024))
+
+  local actual
+  actual="$(cat "$usage_path")"
+  if [[ $actual -lt $((expected - tolerance)) ]] ||
+    [[ $actual -gt $((expected + tolerance)) ]]; then
+    echo "actual $label = $((${actual%% *} / 1024 / 1024)) MB"
+    echo "expected $label = $((${expected%% *} / 1024 / 1024)) MB"
+    echo fail
+
+    cleanup
+    exit 1
+  fi
+}
+
+# Assert the memory and hugetlb usage of cgroup a and, optionally, a/b.
+#   $1 - expected memory usage of a, in bytes
+#   $2 - expected hugetlb usage of a, in bytes
+#   $3 - expected memory usage of a/b, in bytes (optional)
+#   $4 - expected hugetlb usage of a/b, in bytes (optional)
+# a/b is only checked when both $3 and $4 are given.
+function assert_state() {
+  local expected_a="$1"
+  local expected_a_hugetlb="$2"
+  local expected_b="${3:-}"
+  local expected_b_hugetlb="${4:-}"
+
+  assert_usage_near "a" \
+    "$CGROUP_ROOT/a/memory.$usage_file" "$expected_a"
+  assert_usage_near "a hugetlb" \
+    "$CGROUP_ROOT/a/hugetlb.${MB}MB.$usage_file" "$expected_a_hugetlb"
+
+  if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then
+    return
+  fi
+
+  assert_usage_near "b" \
+    "$CGROUP_ROOT/a/b/memory.$usage_file" "$expected_b"
+  assert_usage_near "b hugetlb" \
+    "$CGROUP_ROOT/a/b/hugetlb.${MB}MB.$usage_file" "$expected_b_hugetlb"
+}
+
+function setup() {
+ echo 100 >/proc/sys/vm/nr_hugepages
+ mkdir "$CGROUP_ROOT"/a
+ sleep 1
+ if [[ $cgroup2 ]]; then
+ echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control
+ else
+ echo 0 >$CGROUP_ROOT/a/cpuset.mems
+ echo 0 >$CGROUP_ROOT/a/cpuset.cpus
+ fi
+
+ mkdir "$CGROUP_ROOT"/a/b
+
+ if [[ ! $cgroup2 ]]; then
+ echo 0 >$CGROUP_ROOT/a/b/cpuset.mems
+ echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus
+ fi
+
+ mkdir -p "$MNT"
+ mount -t hugetlbfs none "$MNT"
+}
+
+# Run ./write_to_hugetlbfs while this shell is a member of a given cgroup.
+#   $1 - cgroup path relative to $CGROUP_ROOT
+#   $2 - path handed to write_to_hugetlbfs -p
+#   $3 - size handed to write_to_hugetlbfs -s
+# The shell moves itself into the cgroup before the write and back to the
+# root cgroup afterwards.
+write_hugetlbfs() {
+  local cgroup="$1"
+  local path="$2"
+  local size="$3"
+  local members
+
+  # Membership is written to cgroup.procs on v2 and to tasks on v1.
+  if [[ $cgroup2 ]]; then
+    members=cgroup.procs
+  else
+    members=tasks
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems
+    echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus
+  fi
+  echo $$ >"$CGROUP_ROOT/$cgroup/$members"
+  ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o
+  echo $$ >"$CGROUP_ROOT/$members"
+  echo
+}
+
+set -e
+
+size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages.
+
+cleanup
+
+echo
+echo
+echo Test charge, rmdir, uncharge
+setup
+echo mkdir
+mkdir $CGROUP_ROOT/test1
+
+echo write
+write_hugetlbfs test1 "$MNT"/test $size
+
+echo rmdir
+rmdir $CGROUP_ROOT/test1
+mkdir $CGROUP_ROOT/test1
+
+echo uncharge
+rm -rf /mnt/huge/*
+
+cleanup
+
+echo done
+echo
+echo
+if [[ ! $cgroup2 ]]; then
+ echo "Test parent and child hugetlb usage"
+ setup
+
+ echo write
+ write_hugetlbfs a "$MNT"/test $size
+
+ echo Assert memory charged correctly for parent use.
+ assert_state 0 $size 0 0
+
+ write_hugetlbfs a/b "$MNT"/test2 $size
+
+ echo Assert memory charged correctly for child use.
+ assert_state 0 $(($size * 2)) 0 $size
+
+ rmdir "$CGROUP_ROOT"/a/b
+ sleep 5
+ echo Assert memory reparent correctly.
+ assert_state 0 $(($size * 2))
+
+ rm -rf "$MNT"/*
+ umount "$MNT"
+ echo Assert memory uncharged correctly.
+ assert_state 0 0
+
+ cleanup
+fi
+
+echo
+echo
+echo "Test child only hugetlb usage"
+echo setup
+setup
+
+echo write
+write_hugetlbfs a/b "$MNT"/test2 $size
+
+echo Assert memory charged correctly for child only use.
+assert_state 0 $(($size)) 0 $size
+
+rmdir "$CGROUP_ROOT"/a/b
+echo Assert memory reparent correctly.
+assert_state 0 $size
+
+rm -rf "$MNT"/*
+umount "$MNT"
+echo Assert memory uncharged correctly.
+assert_state 0 0
+
+cleanup
+
+echo ALL PASS
+
+umount $CGROUP_ROOT
+rm -rf $CGROUP_ROOT
diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c
new file mode 100644
index 000000000..8b7582130
--- /dev/null
+++ b/tools/testing/selftests/vm/khugepaged.c
@@ -0,0 +1,1035 @@
+#define _GNU_SOURCE
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/mman.h>
+#include <sys/wait.h>
+
+#ifndef MADV_PAGEOUT
+#define MADV_PAGEOUT 21
+#endif
+
+#define BASE_ADDR ((void *)(1UL << 30))
+static unsigned long hpage_pmd_size;
+static unsigned long page_size;
+static int hpage_pmd_nr;
+
+#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
+#define PID_SMAPS "/proc/self/smaps"
+
+enum thp_enabled { /* values index thp_enabled_strings[], so both lists must stay in the same order */
+ THP_ALWAYS,
+ THP_MADVISE,
+ THP_NEVER,
+};
+
+static const char *thp_enabled_strings[] = { /* values of the "enabled" file under THP_SYSFS */
+ "always",
+ "madvise",
+ "never",
+ NULL /* terminator: read_string() scans until it hits NULL */
+};
+
+enum thp_defrag { /* values index thp_defrag_strings[], so both lists must stay in the same order */
+ THP_DEFRAG_ALWAYS,
+ THP_DEFRAG_DEFER,
+ THP_DEFRAG_DEFER_MADVISE,
+ THP_DEFRAG_MADVISE,
+ THP_DEFRAG_NEVER,
+};
+
+static const char *thp_defrag_strings[] = { /* values of the "defrag" file under THP_SYSFS */
+ "always",
+ "defer",
+ "defer+madvise",
+ "madvise",
+ "never",
+ NULL /* terminator: read_string() scans until it hits NULL */
+};
+
+enum shmem_enabled { /* values index shmem_enabled_strings[], so both lists must stay in the same order */
+ SHMEM_ALWAYS,
+ SHMEM_WITHIN_SIZE,
+ SHMEM_ADVISE,
+ SHMEM_NEVER,
+ SHMEM_DENY,
+ SHMEM_FORCE,
+};
+
+static const char *shmem_enabled_strings[] = { /* values of the "shmem_enabled" file under THP_SYSFS */
+ "always",
+ "within_size",
+ "advise",
+ "never",
+ "deny",
+ "force",
+ NULL /* terminator: read_string() scans until it hits NULL */
+};
+
+struct khugepaged_settings { /* one field per file under THP_SYSFS "khugepaged/"; write_settings() writes each under its field name */
+ bool defrag;
+ unsigned int alloc_sleep_millisecs;
+ unsigned int scan_sleep_millisecs;
+ unsigned int max_ptes_none;
+ unsigned int max_ptes_swap;
+ unsigned int max_ptes_shared;
+ unsigned long pages_to_scan;
+};
+
+struct settings { /* every THP knob this test saves, overrides and restores */
+ enum thp_enabled thp_enabled; /* "enabled" */
+ enum thp_defrag thp_defrag; /* "defrag" */
+ enum shmem_enabled shmem_enabled; /* "shmem_enabled" */
+ bool debug_cow; /* "debug_cow" */
+ bool use_zero_page; /* "use_zero_page" */
+ struct khugepaged_settings khugepaged; /* files under "khugepaged/" */
+};
+
+static struct settings default_settings = { /* baseline applied by adjust_settings() */
+ .thp_enabled = THP_MADVISE,
+ .thp_defrag = THP_DEFRAG_ALWAYS,
+ .shmem_enabled = SHMEM_NEVER,
+ .debug_cow = 0,
+ .use_zero_page = 0,
+ .khugepaged = { /* max_ptes_* and pages_to_scan are filled in by main() from hpage_pmd_nr */
+ .defrag = 1,
+ .alloc_sleep_millisecs = 10,
+ .scan_sleep_millisecs = 10,
+ },
+};
+
+static struct settings saved_settings;
+static bool skip_settings_restore;
+
+static int exit_status;
+
+/* Report a passed check: print @msg wrapped in the \e[32m colour escape. */
+static void success(const char *msg)
+{
+	fprintf(stdout, " \e[32m%s\e[0m\n", msg);
+}
+
+/*
+ * Report a failed check: print @msg wrapped in the \e[31m colour escape and
+ * bump exit_status, which becomes the exit code of the program.
+ */
+static void fail(const char *msg)
+{
+	fprintf(stdout, " \e[31m%s\e[0m\n", msg);
+	exit_status += 1;
+}
+
+/*
+ * Read up to @buflen - 1 bytes of @path into @buf and NUL-terminate them.
+ *
+ * Returns the number of bytes read, or 0 if the file could not be opened or
+ * nothing could be read.
+ */
+static int read_file(const char *path, char *buf, size_t buflen)
+{
+	ssize_t nread;
+	int fd = open(path, O_RDONLY);
+
+	if (fd == -1)
+		return 0;
+
+	nread = read(fd, buf, buflen - 1);
+	close(fd);
+	if (nread < 1)
+		return 0;
+
+	buf[nread] = '\0';
+	return (unsigned int) nread;
+}
+
+/*
+ * Write the first @buflen - 1 bytes of @buf to the existing file @path.
+ *
+ * Returns the number of bytes written, or 0 if the file could not be opened
+ * or nothing was written.
+ */
+static int write_file(const char *path, const char *buf, size_t buflen)
+{
+	ssize_t nwritten;
+	int fd = open(path, O_WRONLY);
+
+	if (fd == -1)
+		return 0;
+
+	nwritten = write(fd, buf, buflen - 1);
+	close(fd);
+
+	return nwritten < 1 ? 0 : (unsigned int) nwritten;
+}
+
+/*
+ * Read the THP sysfs file @name and return the index in the NULL-terminated
+ * table @strings of the text found between '[' and ']'.  Any read or parse
+ * failure terminates the program.
+ */
+static int read_string(const char *name, const char *strings[])
+{
+	char path[PATH_MAX];
+	char buf[256];
+	char *start, *end;
+	int idx;
+
+	if (snprintf(path, PATH_MAX, THP_SYSFS "%s", name) >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!read_file(path, buf, sizeof(buf))) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+
+	start = strchr(buf, '[');
+	if (!start) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	start++;
+
+	end = strchr(start, ']');
+	if (!end) {
+		printf("%s: Parse failure\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	*end = '\0';
+
+	for (idx = 0; strings[idx]; idx++) {
+		if (strcmp(strings[idx], start) == 0)
+			return idx;
+	}
+
+	printf("Failed to parse %s\n", name);
+	exit(EXIT_FAILURE);
+}
+
+/* Write @val to the THP sysfs file @name; exit on any failure. */
+static void write_string(const char *name, const char *val)
+{
+	char path[PATH_MAX];
+	size_t len = strlen(val) + 1;
+
+	if (snprintf(path, PATH_MAX, THP_SYSFS "%s", name) >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	/* write_file() drops the last byte of the length it is given. */
+	if (write_file(path, val, len) == 0) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+/*
+ * Read the THP sysfs file @name and return its contents parsed as a decimal
+ * unsigned long.  Exits if the path is too long or the file cannot be read.
+ */
+static const unsigned long read_num(const char *name)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * read_file() reports failure as 0, never as a negative value.
+	 * Without this check an uninitialised buf would reach strtoul().
+	 */
+	if (!read_file(path, buf, sizeof(buf))) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+
+	return strtoul(buf, NULL, 10);
+}
+
+/* Write @num in decimal to the THP sysfs file @name; exit on any failure. */
+static void write_num(const char *name, unsigned long num)
+{
+	char path[PATH_MAX];
+	char buf[21];
+	int ret;
+
+	ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
+	if (ret >= PATH_MAX) {
+		printf("%s: Pathname is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+	/* num is unsigned: format it as such, bounded by the buffer size. */
+	snprintf(buf, sizeof(buf), "%lu", num);
+	if (!write_file(path, buf, strlen(buf) + 1)) {
+		perror(path);
+		exit(EXIT_FAILURE);
+	}
+}
+
+/*
+ * Push every field of @settings to its file under THP_SYSFS.  The helpers
+ * called here exit the program if a write fails.
+ */
+static void write_settings(struct settings *settings)
+{
+	const struct khugepaged_settings *kh = &settings->khugepaged;
+	const char *enabled = thp_enabled_strings[settings->thp_enabled];
+	const char *defrag = thp_defrag_strings[settings->thp_defrag];
+	const char *shmem = shmem_enabled_strings[settings->shmem_enabled];
+
+	write_string("enabled", enabled);
+	write_string("defrag", defrag);
+	write_string("shmem_enabled", shmem);
+	write_num("debug_cow", settings->debug_cow);
+	write_num("use_zero_page", settings->use_zero_page);
+
+	write_num("khugepaged/defrag", kh->defrag);
+	write_num("khugepaged/alloc_sleep_millisecs",
+		  kh->alloc_sleep_millisecs);
+	write_num("khugepaged/scan_sleep_millisecs",
+		  kh->scan_sleep_millisecs);
+	write_num("khugepaged/max_ptes_none", kh->max_ptes_none);
+	write_num("khugepaged/max_ptes_swap", kh->max_ptes_swap);
+	write_num("khugepaged/max_ptes_shared", kh->max_ptes_shared);
+	write_num("khugepaged/pages_to_scan", kh->pages_to_scan);
+}
+
+/*
+ * Write saved_settings back and terminate the program.  Installed as a
+ * signal handler by save_settings() and called with @sig == 0 at the end of
+ * main().
+ *
+ * Exits with EXIT_FAILURE when invoked for a signal, otherwise with
+ * exit_status.  When skip_settings_restore is set nothing is written and the
+ * program exits with exit_status.
+ */
+static void restore_settings(int sig)
+{
+	if (!skip_settings_restore) {
+		printf("Restore THP and khugepaged settings...");
+		write_settings(&saved_settings);
+		success("OK");
+		if (sig != 0)
+			exit(EXIT_FAILURE);
+	}
+
+	exit(exit_status);
+}
+
+/*
+ * Snapshot the current THP and khugepaged sysfs values into saved_settings,
+ * then install restore_settings() as the handler for SIGTERM, SIGINT, SIGHUP
+ * and SIGQUIT.
+ */
+static void save_settings(void)
+{
+	static const int signals[] = { SIGTERM, SIGINT, SIGHUP, SIGQUIT };
+	struct khugepaged_settings *kh = &saved_settings.khugepaged;
+	unsigned int i;
+
+	printf("Save THP and khugepaged settings...");
+
+	saved_settings.thp_enabled =
+		read_string("enabled", thp_enabled_strings);
+	saved_settings.thp_defrag = read_string("defrag", thp_defrag_strings);
+	saved_settings.shmem_enabled =
+		read_string("shmem_enabled", shmem_enabled_strings);
+	saved_settings.debug_cow = read_num("debug_cow");
+	saved_settings.use_zero_page = read_num("use_zero_page");
+
+	kh->defrag = read_num("khugepaged/defrag");
+	kh->alloc_sleep_millisecs =
+		read_num("khugepaged/alloc_sleep_millisecs");
+	kh->scan_sleep_millisecs = read_num("khugepaged/scan_sleep_millisecs");
+	kh->max_ptes_none = read_num("khugepaged/max_ptes_none");
+	kh->max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+	kh->max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+	kh->pages_to_scan = read_num("khugepaged/pages_to_scan");
+
+	success("OK");
+
+	for (i = 0; i < sizeof(signals) / sizeof(signals[0]); i++)
+		signal(signals[i], restore_settings);
+}
+
+/* Apply default_settings, the baseline the tests run with. */
+static void adjust_settings(void)
+{
+	struct settings *baseline = &default_settings;
+
+	printf("Adjust settings...");
+	write_settings(baseline);
+	success("OK");
+}
+
+#define MAX_LINE_LENGTH 500
+
+/*
+ * Advance @fp line by line until a line starts with @pattern.
+ *
+ * Returns true with the matching line left in @buf (which must hold
+ * MAX_LINE_LENGTH bytes), or false once the end of the file is reached.
+ */
+static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
+{
+	for (;;) {
+		if (!fgets(buf, MAX_LINE_LENGTH, fp))
+			return false;
+		if (strncmp(buf, pattern, strlen(pattern)) == 0)
+			return true;
+	}
+}
+
+/*
+ * Return true if the smaps entry for the mapping that starts at @addr
+ * reports exactly hpage_pmd_size worth of AnonHugePages.
+ */
+static bool check_huge(void *addr)
+{
+	bool thp = false;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+
+	fp = fopen(PID_SMAPS, "r");
+	if (!fp) {
+		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+		exit(EXIT_FAILURE);
+	}
+	if (!check_for_pattern(fp, addr_pattern, buffer))
+		goto err_out;
+
+	/* hpage_pmd_size is unsigned long, hence %lu. */
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10lu kB",
+		       hpage_pmd_size >> 10);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	/*
+	 * Fetch the AnonHugePages: in the same block and check whether it got
+	 * the expected number of hugepages next.
+	 */
+	if (!check_for_pattern(fp, "AnonHugePages:", buffer))
+		goto err_out;
+
+	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+		goto err_out;
+
+	thp = true;
+err_out:
+	fclose(fp);
+	return thp;
+}
+
+
+/*
+ * Return true if the smaps entry for the mapping that starts at @addr
+ * reports exactly @size bytes (printed in kB) on its Swap: line.
+ */
+static bool check_swap(void *addr, unsigned long size)
+{
+	bool swap = false;
+	int ret;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+	char addr_pattern[MAX_LINE_LENGTH];
+
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
+		       (unsigned long) addr);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+
+
+	fp = fopen(PID_SMAPS, "r");
+	if (!fp) {
+		printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
+		exit(EXIT_FAILURE);
+	}
+	if (!check_for_pattern(fp, addr_pattern, buffer))
+		goto err_out;
+
+	/* size is unsigned long, hence %lu. */
+	ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19lu kB",
+		       size >> 10);
+	if (ret >= MAX_LINE_LENGTH) {
+		printf("%s: Pattern is too long\n", __func__);
+		exit(EXIT_FAILURE);
+	}
+	/*
+	 * Fetch the Swap: in the same block and check whether it got
+	 * the expected amount of swap next.
+	 */
+	if (!check_for_pattern(fp, "Swap:", buffer))
+		goto err_out;
+
+	if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
+		goto err_out;
+
+	swap = true;
+err_out:
+	fclose(fp);
+	return swap;
+}
+
+/*
+ * Map hpage_pmd_size bytes of private anonymous memory at BASE_ADDR.
+ * Exits unless the kernel places the mapping exactly there.
+ */
+static void *alloc_mapping(void)
+{
+	void *map = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
+			 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == BASE_ADDR)
+		return map;
+
+	printf("Failed to allocate VMA at %p\n", BASE_ADDR);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Touch every page in the byte range [@start, @end) of @p by storing a
+ * per-page marker (page index + 0xdead0000) in the page's first int.
+ */
+static void fill_memory(int *p, unsigned long start, unsigned long end)
+{
+	int pg = start / page_size;
+
+	while (pg < end / page_size) {
+		int *slot = &p[pg * page_size / sizeof(*p)];
+
+		*slot = pg + 0xdead0000;
+		pg++;
+	}
+}
+
+/*
+ * Check that every page in the byte range [@start, @end) of @p still holds
+ * the marker written by fill_memory(); report the first bad page and exit.
+ */
+static void validate_memory(int *p, unsigned long start, unsigned long end)
+{
+	int pg;
+
+	for (pg = start / page_size; pg < end / page_size; pg++) {
+		int *slot = &p[pg * page_size / sizeof(*p)];
+
+		if (*slot == pg + 0xdead0000)
+			continue;
+
+		printf("Page %d is corrupted: %#x\n", pg, *slot);
+		exit(EXIT_FAILURE);
+	}
+}
+
+#define TICK 500000
+/*
+ * Mark the huge-page-sized range at @p MADV_HUGEPAGE and poll, at most six
+ * times TICK microseconds apart, until it becomes huge or khugepaged has
+ * completed two more full scans.  The range is set back to MADV_NOHUGEPAGE
+ * before returning.
+ *
+ * Returns true if neither condition was met in time.  Exits if the range is
+ * already huge on entry.
+ */
+static bool wait_for_scan(const char *msg, char *p)
+{
+	bool timed_out = true;
+	int target_scans;
+	int ticks;
+
+	/* Sanity check */
+	if (check_huge(p)) {
+		printf("Unexpected huge page\n");
+		exit(EXIT_FAILURE);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+
+	/* Wait until the second full_scan completed */
+	target_scans = read_num("khugepaged/full_scans") + 2;
+
+	printf("%s...", msg);
+	for (ticks = 6; ticks > 0; ticks--) {	/* 3 seconds */
+		if (check_huge(p) ||
+		    read_num("khugepaged/full_scans") >= target_scans) {
+			timed_out = false;
+			break;
+		}
+		printf(".");
+		usleep(TICK);
+	}
+
+	madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	return timed_out;
+}
+
+/*
+ * With THP set to "always", check that the first write to a fresh mapping
+ * is backed by a huge page, then check that MADV_DONTNEED on one page of it
+ * leaves the mapping no longer huge.
+ */
+static void alloc_at_fault(void)
+{
+	struct settings always = default_settings;
+	char *map;
+
+	always.thp_enabled = THP_ALWAYS;
+	write_settings(&always);
+
+	map = alloc_mapping();
+	*map = 1;
+	printf("Allocate huge page on fault...");
+	if (!check_huge(map))
+		fail("Fail");
+	else
+		success("OK");
+
+	write_settings(&default_settings);
+
+	madvise(map, page_size, MADV_DONTNEED);
+	printf("Split huge PMD on MADV_DONTNEED...");
+	if (check_huge(map))
+		fail("Fail");
+	else
+		success("OK");
+	munmap(map, hpage_pmd_size);
+}
+
+/* A mapping with every page populated is expected to be collapsed. */
+static void collapse_full(void)
+{
+	void *region = alloc_mapping();
+
+	fill_memory(region, 0, hpage_pmd_size);
+	if (wait_for_scan("Collapse fully populated PTE table", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, hpage_pmd_size);
+	munmap(region, hpage_pmd_size);
+}
+
+/* A mapping that was never touched is expected NOT to be collapsed. */
+static void collapse_empty(void)
+{
+	void *region = alloc_mapping();
+
+	if (wait_for_scan("Do not collapse empty PTE table", region)) {
+		fail("Timeout");
+	} else {
+		if (!check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	munmap(region, hpage_pmd_size);
+}
+
+/* A mapping with only its first page populated is expected to be collapsed. */
+static void collapse_single_pte_entry(void)
+{
+	void *region = alloc_mapping();
+
+	fill_memory(region, 0, page_size);
+	if (wait_for_scan("Collapse PTE table with single PTE entry present", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, page_size);
+	munmap(region, hpage_pmd_size);
+}
+
+/*
+ * Set max_ptes_none to half a huge page.  Populating one page fewer than
+ * hpage_pmd_nr - max_ptes_none must not collapse; populating exactly that
+ * many must.  The default settings are written back afterwards.
+ */
+static void collapse_max_ptes_none(void)
+{
+	struct settings tuned = default_settings;
+	int max_ptes_none = hpage_pmd_nr / 2;
+	unsigned long too_few, enough;
+	void *region;
+
+	tuned.khugepaged.max_ptes_none = max_ptes_none;
+	write_settings(&tuned);
+
+	too_few = (hpage_pmd_nr - max_ptes_none - 1) * page_size;
+	enough = (hpage_pmd_nr - max_ptes_none) * page_size;
+
+	region = alloc_mapping();
+
+	fill_memory(region, 0, too_few);
+	if (wait_for_scan("Do not collapse with max_ptes_none exceeded", region)) {
+		fail("Timeout");
+	} else {
+		if (!check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, too_few);
+
+	fill_memory(region, 0, enough);
+	if (wait_for_scan("Collapse with max_ptes_none PTEs empty", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, enough);
+
+	munmap(region, hpage_pmd_size);
+	write_settings(&default_settings);
+}
+
+/*
+ * Page out the first page of a fully populated mapping and expect the
+ * mapping to be collapsed anyway.  The collapse check is skipped if the
+ * page did not show up as swapped.
+ */
+static void collapse_swapin_single_pte(void)
+{
+	void *region = alloc_mapping();
+
+	fill_memory(region, 0, hpage_pmd_size);
+
+	printf("Swapout one page...");
+	if (madvise(region, page_size, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (!check_swap(region, page_size)) {
+		fail("Fail");
+		goto out;
+	}
+	success("OK");
+
+	if (wait_for_scan("Collapse with swapping in single PTE entry", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, hpage_pmd_size);
+out:
+	munmap(region, hpage_pmd_size);
+}
+
+/*
+ * Page out max_ptes_swap + 1 pages and expect no collapse, then page out
+ * exactly max_ptes_swap pages and expect a collapse.  Either half is
+ * abandoned if the swap-out itself cannot be confirmed.
+ */
+static void collapse_max_ptes_swap(void)
+{
+	int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
+	unsigned long over_limit = (max_ptes_swap + 1) * page_size;
+	unsigned long at_limit = max_ptes_swap * page_size;
+	void *region;
+
+	region = alloc_mapping();
+
+	fill_memory(region, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
+	if (madvise(region, over_limit, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (!check_swap(region, over_limit)) {
+		fail("Fail");
+		goto out;
+	}
+	success("OK");
+
+	if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", region)) {
+		fail("Timeout");
+	} else {
+		if (!check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, hpage_pmd_size);
+
+	fill_memory(region, 0, hpage_pmd_size);
+	printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
+	if (madvise(region, at_limit, MADV_PAGEOUT)) {
+		perror("madvise(MADV_PAGEOUT)");
+		exit(EXIT_FAILURE);
+	}
+	if (!check_swap(region, at_limit)) {
+		fail("Fail");
+		goto out;
+	}
+	success("OK");
+
+	if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, hpage_pmd_size);
+out:
+	munmap(region, hpage_pmd_size);
+}
+
+/*
+ * Fault in a huge page, drop everything but its first page with
+ * MADV_DONTNEED, and expect the remaining single page to be collapsed.
+ */
+static void collapse_single_pte_entry_compound(void)
+{
+	void *region = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(region, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(region, 0, hpage_pmd_size);
+	if (!check_huge(region))
+		fail("Fail");
+	else
+		success("OK");
+	madvise(region, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+	printf("Split huge page leaving single PTE mapping compound page...");
+	madvise(region + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
+	if (check_huge(region))
+		fail("Fail");
+	else
+		success("OK");
+
+	if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, page_size);
+	munmap(region, hpage_pmd_size);
+}
+
+/*
+ * Fault in a huge page, then apply MADV_NOHUGEPAGE first to its first page
+ * and then to the whole range.  The mapping is expected to stop being huge
+ * and then to be collapsed again.
+ */
+static void collapse_full_of_compound(void)
+{
+	void *region = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(region, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(region, 0, hpage_pmd_size);
+	if (!check_huge(region))
+		fail("Fail");
+	else
+		success("OK");
+
+	printf("Split huge page leaving single PTE page table full of compound pages...");
+	madvise(region, page_size, MADV_NOHUGEPAGE);
+	madvise(region, hpage_pmd_size, MADV_NOHUGEPAGE);
+	if (check_huge(region))
+		fail("Fail");
+	else
+		success("OK");
+
+	if (wait_for_scan("Collapse PTE table full of compound pages", region)) {
+		fail("Timeout");
+	} else {
+		if (check_huge(region))
+			success("OK");
+		else
+			fail("Fail");
+	}
+	validate_memory(region, 0, hpage_pmd_size);
+	munmap(region, hpage_pmd_size);
+}
+
+static void collapse_compound_extreme(void) /* build a range whose pages each come from a different huge page, then expect khugepaged to collapse it */
+{
+ void *p;
+ int i;
+
+ p = alloc_mapping();
+ for (i = 0; i < hpage_pmd_nr; i++) { /* each pass faults a fresh huge page at BASE_ADDR and keeps one page of it */
+ printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
+ i + 1, hpage_pmd_nr);
+
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
+ fill_memory(BASE_ADDR, 0, hpage_pmd_size);
+ if (!check_huge(BASE_ADDR)) {
+ printf("Failed to allocate huge page\n");
+ exit(EXIT_FAILURE);
+ }
+ madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
+
+ p = mremap(BASE_ADDR - i * page_size, /* shrink the i kept pages plus the new huge page to i + 1 pages, parked above BASE_ADDR */
+ i * page_size + hpage_pmd_size,
+ (i + 1) * page_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR + 2 * hpage_pmd_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+unmap");
+ exit(EXIT_FAILURE);
+ }
+
+ p = mremap(BASE_ADDR + 2 * hpage_pmd_size, /* move them back to end at BASE_ADDR, growing by hpage_pmd_size so BASE_ADDR is mapped again */
+ (i + 1) * page_size,
+ (i + 1) * page_size + hpage_pmd_size,
+ MREMAP_MAYMOVE | MREMAP_FIXED,
+ BASE_ADDR - (i + 1) * page_size);
+ if (p == MAP_FAILED) {
+ perror("mremap+alloc");
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ munmap(BASE_ADDR, hpage_pmd_size); /* drop the extra tail; p now starts hpage_pmd_size below BASE_ADDR */
+ fill_memory(p, 0, hpage_pmd_size);
+ if (!check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ if (wait_for_scan("Collapse PTE table full of different compound pages", p))
+ fail("Timeout");
+ else if (check_huge(p))
+ success("OK");
+ else
+ fail("Fail");
+
+ validate_memory(p, 0, hpage_pmd_size);
+ munmap(p, hpage_pmd_size);
+}
+
+/*
+ * Share a single small page with a child over fork().  The child populates
+ * a second page and expects its copy of the mapping to be collapsed; the
+ * parent then checks that its own mapping is still not huge.  The child's
+ * failure count is added to exit_status.
+ */
+static void collapse_fork(void)
+{
+	int wstatus;
+	pid_t pid;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate small page...");
+	fill_memory(p, 0, page_size);
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share small page over fork()...");
+	pid = fork();
+	if (pid < 0) {
+		/* No child to wait for: wstatus would be read uninitialised. */
+		perror("fork");
+		exit(EXIT_FAILURE);
+	}
+	if (!pid) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		fill_memory(p, page_size, 2 * page_size);
+
+		if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, page_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has small page...");
+	if (!check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, page_size);
+	munmap(p, hpage_pmd_size);
+}
+
+/*
+ * Share a huge page with a child over fork().  The child applies
+ * MADV_NOHUGEPAGE, raises max_ptes_shared to hpage_pmd_nr - 1 and expects
+ * its mapping to be collapsed; the parent then checks that it still has its
+ * huge page.  The child's failure count is added to exit_status.
+ */
+static void collapse_fork_compound(void)
+{
+	int wstatus;
+	pid_t pid;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	pid = fork();
+	if (pid < 0) {
+		/* No child to wait for: wstatus would be read uninitialised. */
+		perror("fork");
+		exit(EXIT_FAILURE);
+	}
+	if (!pid) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Split huge page PMD in child process...");
+		madvise(p, page_size, MADV_NOHUGEPAGE);
+		madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+		fill_memory(p, 0, page_size);
+
+		write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
+		if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+		/* Undo the temporary limit change made above. */
+		write_num("khugepaged/max_ptes_shared",
+			  default_settings.khugepaged.max_ptes_shared);
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+/*
+ * Share a huge page with a child over fork().  In the child, rewriting one
+ * page fewer than hpage_pmd_nr - max_ptes_shared must not lead to a
+ * collapse, while rewriting exactly that many must.  The parent then checks
+ * that it still has its huge page.  The child's failure count is added to
+ * exit_status.
+ */
+static void collapse_max_ptes_shared(void)
+{
+	int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
+	int wstatus;
+	pid_t pid;
+	void *p;
+
+	p = alloc_mapping();
+
+	printf("Allocate huge page...");
+	madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
+	fill_memory(p, 0, hpage_pmd_size);
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+
+	printf("Share huge page over fork()...");
+	pid = fork();
+	if (pid < 0) {
+		/* No child to wait for: wstatus would be read uninitialised. */
+		perror("fork");
+		exit(EXIT_FAILURE);
+	}
+	if (!pid) {
+		/* Do not touch settings on child exit */
+		skip_settings_restore = true;
+		exit_status = 0;
+
+		if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW on page %d of %d...",
+		       hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
+			fail("Timeout");
+		else if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		printf("Trigger CoW on page %d of %d...",
+		       hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
+		fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
+		if (!check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+
+		if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
+			fail("Timeout");
+		else if (check_huge(p))
+			success("OK");
+		else
+			fail("Fail");
+
+		validate_memory(p, 0, hpage_pmd_size);
+		munmap(p, hpage_pmd_size);
+		exit(exit_status);
+	}
+
+	wait(&wstatus);
+	exit_status += WEXITSTATUS(wstatus);
+
+	printf("Check if parent still has huge page...");
+	if (check_huge(p))
+		success("OK");
+	else
+		fail("Fail");
+	validate_memory(p, 0, hpage_pmd_size);
+	munmap(p, hpage_pmd_size);
+}
+
+/*
+ * Derive the page geometry, finish default_settings from it, save and
+ * override the system THP settings, run every test in order, and leave
+ * through restore_settings(), which exits with the failure count.
+ */
+int main(void)
+{
+	static void (* const tests[])(void) = {
+		alloc_at_fault,
+		collapse_full,
+		collapse_empty,
+		collapse_single_pte_entry,
+		collapse_max_ptes_none,
+		collapse_swapin_single_pte,
+		collapse_max_ptes_swap,
+		collapse_single_pte_entry_compound,
+		collapse_full_of_compound,
+		collapse_compound_extreme,
+		collapse_fork,
+		collapse_fork_compound,
+		collapse_max_ptes_shared,
+	};
+	struct khugepaged_settings *kh = &default_settings.khugepaged;
+	unsigned int t;
+
+	setbuf(stdout, NULL);
+
+	page_size = getpagesize();
+	hpage_pmd_size = read_num("hpage_pmd_size");
+	hpage_pmd_nr = hpage_pmd_size / page_size;
+
+	kh->max_ptes_none = hpage_pmd_nr - 1;
+	kh->max_ptes_swap = hpage_pmd_nr / 8;
+	kh->max_ptes_shared = hpage_pmd_nr / 2;
+	kh->pages_to_scan = hpage_pmd_nr * 8;
+
+	save_settings();
+	adjust_settings();
+
+	for (t = 0; t < sizeof(tests) / sizeof(tests[0]); t++)
+		tests[t]();
+
+	restore_settings(0);
+}
diff --git a/tools/testing/selftests/vm/map_fixed_noreplace.c b/tools/testing/selftests/vm/map_fixed_noreplace.c
new file mode 100644
index 000000000..eed44322d
--- /dev/null
+++ b/tools/testing/selftests/vm/map_fixed_noreplace.c
@@ -0,0 +1,231 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Test that MAP_FIXED_NOREPLACE works.
+ *
+ * Copyright 2018, Jann Horn <jannh@google.com>
+ * Copyright 2018, Michael Ellerman, IBM Corporation.
+ */
+
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#ifndef MAP_FIXED_NOREPLACE
+#define MAP_FIXED_NOREPLACE 0x100000
+#endif
+
+/* Debug aid: print this process's /proc/<pid>/maps by running cat(1). */
+static void dump_maps(void)
+{
+	char maps_cmd[32];
+	int self = getpid();
+
+	snprintf(maps_cmd, sizeof(maps_cmd), "cat /proc/%d/maps", self);
+	system(maps_cmd);
+}
+
+/*
+ * Find an address range of @size bytes that is currently unused.
+ *
+ * A PROT_NONE anonymous mapping is created at a kernel-chosen address and
+ * unmapped again right away; the address it occupied is handed back so the
+ * caller can place its own fixed mappings there.
+ *
+ * Returns the base address, or 0 if the probe mapping could not be created
+ * or could not be removed.
+ */
+static unsigned long find_base_addr(unsigned long size)
+{
+	void *addr;
+	unsigned long flags;
+
+	flags = MAP_PRIVATE | MAP_ANONYMOUS;
+	addr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+	if (addr == MAP_FAILED) {
+		printf("Error: couldn't map the space we need for the test\n");
+		return 0;
+	}
+
+	if (munmap(addr, size) != 0) {
+		/* Name the operation that actually failed (this is munmap). */
+		printf("Error: couldn't unmap the space we need for the test\n");
+		return 0;
+	}
+	return (unsigned long)addr;
+}
+
+/*
+ * Exercise MAP_FIXED_NOREPLACE against a 5-page window at base_addr.
+ *
+ * After checking that the whole window can be mapped and unmapped, pages
+ * +1..+3 are mapped. Every later request that overlaps those pages must
+ * fail, and requests that are merely adjacent to them must succeed.
+ *
+ * Returns 0 when all checks pass, 1 on the first unexpected result.
+ */
+int main(void)
+{
+ unsigned long base_addr;
+ unsigned long flags, addr, size, page_size;
+ char *p;
+
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ //let's find a base addr that is free before we start the tests
+ size = 5 * page_size;
+ base_addr = find_base_addr(size);
+ if (!base_addr) {
+ printf("Error: couldn't map the space we need for the test\n");
+ return 1;
+ }
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE;
+
+ // Check we can map all the areas we need below
+ /* errno is cleared before each call because the printf below uses %m. */
+ errno = 0;
+ addr = base_addr;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error: couldn't map the space we need for the test\n");
+ return 1;
+ }
+
+ errno = 0;
+ if (munmap((void *)addr, 5 * page_size) != 0) {
+ dump_maps();
+ printf("Error: munmap failed!?\n");
+ return 1;
+ }
+ printf("unmap() successful\n");
+
+ /* Establish the reference mapping: pages +1, +2 and +3. */
+ errno = 0;
+ addr = base_addr + page_size;
+ size = 3 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error: first mmap() failed unexpectedly\n");
+ return 1;
+ }
+
+ /*
+ * Exact same mapping again:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped | new
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = base_addr;
+ size = 5 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:1: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Second mapping contained within first:
+ *
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped | new
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = base_addr + (2 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:2: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Overlap end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped | new
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = base_addr + (3 * page_size);
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:3: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Overlap start of existing mapping:
+ * base | free | new
+ * +1 | mapped | new
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = base_addr;
+ size = 2 * page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p != MAP_FAILED) {
+ dump_maps();
+ printf("Error:4: mmap() succeeded when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Adjacent to start of existing mapping:
+ * base | free | new
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free |
+ */
+ errno = 0;
+ addr = base_addr;
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error:5: mmap() failed when it shouldn't have\n");
+ return 1;
+ }
+
+ /*
+ * Adjacent to end of existing mapping:
+ * base | free |
+ * +1 | mapped |
+ * +2 | mapped |
+ * +3 | mapped |
+ * +4 | free | new
+ */
+ errno = 0;
+ addr = base_addr + (4 * page_size);
+ size = page_size;
+ p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0);
+ printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p);
+
+ if (p == MAP_FAILED) {
+ dump_maps();
+ printf("Error:6: mmap() failed when it shouldn't have\n");
+ return 1;
+ }
+
+ /* Tear down the whole 5-page window in one call. */
+ addr = base_addr;
+ size = 5 * page_size;
+ if (munmap((void *)addr, size) != 0) {
+ dump_maps();
+ printf("Error: munmap failed!?\n");
+ return 1;
+ }
+ printf("unmap() successful\n");
+
+ printf("OK\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/map_hugetlb.c b/tools/testing/selftests/vm/map_hugetlb.c
new file mode 100644
index 000000000..312889edb
--- /dev/null
+++ b/tools/testing/selftests/vm/map_hugetlb.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Example of using hugepage memory in a user application using the mmap
+ * system call with MAP_HUGETLB flag. Before running this program make
+ * sure the administrator has allocated enough default sized huge pages
+ * to cover the 256 MB allocation.
+ *
+ * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages.
+ * That means the addresses starting with 0x800000... will need to be
+ * specified. Specifying a fixed address is not required on ppc64, i386
+ * or x86_64.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+#ifndef MAP_HUGETLB
+#define MAP_HUGETLB 0x40000 /* arch specific */
+#endif
+
+#ifndef MAP_HUGE_SHIFT
+#define MAP_HUGE_SHIFT 26
+#endif
+
+#ifndef MAP_HUGE_MASK
+#define MAP_HUGE_MASK 0x3f
+#endif
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+/* Print the first unsigned int stored at @addr, in hex. */
+static void check_bytes(char *addr)
+{
+	unsigned int *first_word = (unsigned int *)addr;
+
+	printf("First hex is %x\n", *first_word);
+}
+
+/* Fill @length bytes at @addr so that byte N holds (char)N. */
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long off = 0;
+
+	while (off < length) {
+		addr[off] = (char)off;
+		off++;
+	}
+}
+
+/*
+ * Verify the pattern in which byte N holds (char)N.
+ * Returns 0 if all @length bytes match, 1 after reporting the first mismatch.
+ */
+static int read_bytes(char *addr, size_t length)
+{
+	unsigned long off;
+
+	check_bytes(addr);
+	for (off = 0; off < length; off++) {
+		if (addr[off] == (char)off)
+			continue;
+
+		printf("Mismatch at %lu\n", off);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Map a huge-page backed anonymous region, fill it with a byte pattern,
+ * read the pattern back and unmap the region.
+ *
+ * argv[1] (optional): mapping length in megabytes (default LENGTH).
+ * argv[2] (optional): log2 of the huge page size in bytes; when non-zero it
+ *                     is encoded into the mmap flags via MAP_HUGE_SHIFT.
+ *
+ * Returns the result of read_bytes() (0 on match, 1 on mismatch); exits
+ * with status 1 if mmap() or munmap() fails.
+ */
+int main(int argc, char **argv)
+{
+ void *addr;
+ int ret;
+ size_t length = LENGTH;
+ int flags = FLAGS;
+ int shift = 0;
+
+ if (argc > 1)
+ length = atol(argv[1]) << 20;
+ if (argc > 2) {
+ shift = atoi(argv[2]);
+ if (shift)
+ flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+ }
+
+ /* shift is log2(bytes), so shift - 10 is log2(kB). */
+ if (shift)
+ printf("%u kB hugepages\n", 1 << (shift - 10));
+ else
+ printf("Default size hugepages\n");
+ printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20);
+
+ addr = mmap(ADDR, length, PROTECTION, flags, -1, 0);
+ if (addr == MAP_FAILED) {
+ perror("mmap");
+ exit(1);
+ }
+
+ printf("Returned address is %p\n", addr);
+ check_bytes(addr);
+ write_bytes(addr, length);
+ ret = read_bytes(addr, length);
+
+ /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+ if (munmap(addr, length)) {
+ perror("munmap");
+ exit(1);
+ }
+
+ return ret;
+}
diff --git a/tools/testing/selftests/vm/map_populate.c b/tools/testing/selftests/vm/map_populate.c
new file mode 100644
index 000000000..6b8aeaa0b
--- /dev/null
+++ b/tools/testing/selftests/vm/map_populate.c
@@ -0,0 +1,113 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2018 Dmitry Safonov, Arista Networks
+ *
+ * MAP_POPULATE | MAP_PRIVATE should COW VMA pages.
+ */
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifndef MMAP_SZ
+#define MMAP_SZ 4096
+#endif
+
+/*
+ * If @condition is true, print a "[FAIL]" line to stderr with the calling
+ * function, the line number, @description and strerror(errno), then exit
+ * with status 1. Wrapped in do { } while (0) so it acts as one statement.
+ */
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ exit(1); \
+ } \
+ } while (0)
+
+/*
+ * Parent half of the test.
+ *
+ * Waits for a message from the child on @sock, then stores 0x22222BAD
+ * through @smap and msync()s it, sends a message back on @sock and finally
+ * waits for @child to terminate.
+ *
+ * Returns the child's exit status. Any failing step aborts via BUG_ON().
+ */
+static int parent_f(int sock, unsigned long *smap, int child)
+{
+ int status, ret;
+
+ /* Block until the child has sent its message. */
+ ret = read(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ *smap = 0x22222BAD;
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ /* Let the child continue now that the write has been synced. */
+ ret = write(sock, &status, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ waitpid(child, &status, 0);
+ BUG_ON(!WIFEXITED(status), "child in unexpected state");
+
+ return WEXITSTATUS(status);
+}
+
+/*
+ * Child half of the test.
+ *
+ * Maps @fd with MAP_PRIVATE | MAP_POPULATE (the incoming @smap value is
+ * discarded and replaced by this new mapping), checks that it reads
+ * 0xdeadbabe, exchanges one message with the peer over @sock and then
+ * checks that the private mapping still reads 0xdeadbabe.
+ *
+ * Returns 0 on success. Any failing check aborts via BUG_ON().
+ */
+static int child_f(int sock, unsigned long *smap, int fd)
+{
+ int ret, buf = 0;
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_POPULATE, fd, 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file");
+
+ /* Signal the peer, then wait for its reply before re-checking. */
+ ret = write(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "write(sock)");
+
+ ret = read(sock, &buf, sizeof(int));
+ BUG_ON(ret <= 0, "read(sock)");
+
+ BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page");
+ BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted");
+
+ return 0;
+}
+
+/*
+ * Create a temporary file of MMAP_SZ bytes, map it MAP_SHARED and store
+ * 0xdeadbabe in it, then fork. The two processes talk over a socketpair:
+ * the parent runs parent_f() on sock[1], the child runs child_f() on
+ * sock[0].
+ *
+ * Returns whatever parent_f() / child_f() returns in the respective process.
+ */
+int main(int argc, char **argv)
+{
+ int sock[2], child, ret;
+ FILE *ftmp;
+ unsigned long *smap;
+
+ ftmp = tmpfile();
+ BUG_ON(ftmp == 0, "tmpfile()");
+
+ ret = ftruncate(fileno(ftmp), MMAP_SZ);
+ BUG_ON(ret, "ftruncate()");
+
+ smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fileno(ftmp), 0);
+ BUG_ON(smap == MAP_FAILED, "mmap()");
+
+ *smap = 0xdeadbabe;
+ /* Probably unnecessary, but let it be. */
+ ret = msync(smap, MMAP_SZ, MS_SYNC);
+ BUG_ON(ret, "msync()");
+
+ ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock);
+ BUG_ON(ret, "socketpair()");
+
+ child = fork();
+ BUG_ON(child == -1, "fork()");
+
+ /* Each side closes the socket end it does not use. */
+ if (child) {
+ ret = close(sock[0]);
+ BUG_ON(ret, "close()");
+
+ return parent_f(sock[1], smap, child);
+ }
+
+ ret = close(sock[1]);
+ BUG_ON(ret, "close()");
+
+ return child_f(sock[0], smap, fileno(ftmp));
+}
diff --git a/tools/testing/selftests/vm/mlock-random-test.c b/tools/testing/selftests/vm/mlock-random-test.c
new file mode 100644
index 000000000..ff4d72eb7
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock-random-test.c
@@ -0,0 +1,294 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests mlock()/mlock2() when they are invoked on randomly chosen
+ * ranges within a memory region.
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+/*
+ * Set both the soft and the hard RLIMIT_MEMLOCK to @max, then replace the
+ * process capabilities with the set returned by cap_init().
+ *
+ * return value: 0 - success
+ *              -1 - setrlimit() failed
+ *              -2 - the capability set could not be created or applied
+ */
+int set_cap_limits(rlim_t max)
+{
+	struct rlimit new;
+	cap_t cap;
+	int ret = 0;
+
+	new.rlim_cur = max;
+	new.rlim_max = max;
+	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+		perror("setrlimit() returns error\n");
+		return -1;
+	}
+
+	/* Allocated only once it is needed, so the early return cannot leak it. */
+	cap = cap_init();
+	if (!cap) {
+		perror("cap_init() returns error\n");
+		return -2;
+	}
+
+	/* drop capabilities including CAP_IPC_LOCK */
+	if (cap_set_proc(cap)) {
+		perror("cap_set_proc() returns error\n");
+		ret = -2;
+	}
+
+	/* The capability state from cap_init() must be released with cap_free(). */
+	cap_free(cap);
+	return ret;
+}
+
+/*
+ * Read the "VmLck" line of /proc/self/status.
+ *
+ * return value: the locked size converted from kB to bytes,
+ *               or -1 if the file cannot be opened or parsed.
+ */
+int get_proc_locked_vm_size(void)
+{
+	FILE *f;
+	int ret = -1;
+	char line[1024] = {0};
+	unsigned long lock_size = 0;
+
+	f = fopen("/proc/self/status", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(line, sizeof(line), f)) {
+		if (!strstr(line, "VmLck"))
+			continue;
+
+		ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+		if (ret <= 0) {
+			printf("sscanf() on VmLck error: %s: %d\n",
+			       line, ret);
+			fclose(f);
+			return -1;
+		}
+		fclose(f);
+		/* the value is reported in kB */
+		return (int)(lock_size << 10);
+	}
+
+	/*
+	 * Reaching here is not an errno failure, so perror() would append an
+	 * unrelated reason; report the plain message on stderr instead.
+	 */
+	fprintf(stderr, "can't parse VmLck in /proc/self/status\n");
+	fclose(f);
+	return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including input
+ * address from proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size(in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+	FILE *smaps;
+	/* getline() needs NULL/0 (or a malloc'ed buffer and its size) on entry */
+	char *line = NULL;
+	unsigned long mmupage_size = 0;
+	size_t size = 0;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		return 0;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		/* getline() reuses and grows the buffer, so just keep reading */
+		if (!strstr(line, "MMUPageSize"))
+			continue;
+
+		/* found the MMUPageSize of this section */
+		if (sscanf(line, "MMUPageSize: %8lu kB",
+			   &mmupage_size) < 1)
+			printf("Unable to parse smaps entry for Size:%s\n",
+			       line);
+
+		/*
+		 * Stop at the first match: any later MMUPageSize line belongs
+		 * to a following mapping and must not overwrite this one.
+		 */
+		break;
+	}
+	free(line);
+	fclose(smaps);
+	/* kB -> bytes; still 0 if nothing was found or parsed */
+	return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on provided memory chunk.
+ * It expects the mlock/mlock2() to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly to perform mlock/mlock2
+ * [start, start + len] memory range. The range is within range
+ * of the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_within_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0;
+	struct rlimit cur;
+	int page_size = 0;
+
+	/* without a valid limit the precondition below is meaningless */
+	if (getrlimit(RLIMIT_MEMLOCK, &cur)) {
+		perror("getrlimit");
+		return -1;
+	}
+	if (cur.rlim_cur < alloc_size) {
+		/* this branch means alloc_size exceeds the limit */
+		printf("alloc_size[%d] > %u rlimit,lead to mlock failure\n",
+		       alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		/*
+		 * - choose mlock/mlock2 randomly
+		 * - choose lock_size randomly but lock_size < alloc_size
+		 * - choose start_offset randomly but p+start_offset+lock_size
+		 *   < p+alloc_size
+		 */
+		int is_mlock = !!(rand() % 2);
+		int lock_size = rand() % alloc_size;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+				      MLOCK_ONFAULT);
+
+		if (ret) {
+			printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n",
+			       is_mlock ? "mlock" : "mlock2",
+			       p, alloc_size,
+			       p + start_offset, lock_size);
+			return ret;
+		}
+	}
+
+	/*
+	 * Check VmLck left by the tests.
+	 */
+	locked_vm_size = get_proc_locked_vm_size();
+	page_size = get_proc_page_size((unsigned long)p);
+	if (page_size == 0) {
+		printf("cannot get proc MMUPageSize\n");
+		return -1;
+	}
+
+	/* allow one extra page on top of the page-aligned chunk size */
+	if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) {
+		printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n",
+		       locked_vm_size, alloc_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+
+/*
+ * We expect mlock/mlock2() to fail here (request exceeds the limit).
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will randomly choose start/len and perform mlock/mlock2
+ * on [start, start+len] range.
+ *
+ * The memory region size alloc_size is above the rlimit.
+ * And the len to be locked is higher than rlimit.
+ * So we always expect a failure of mlock/mlock2.
+ * No locked page number should be increased as a side effect.
+ *
+ * return value: 0 - success
+ * else: failure
+ */
+int test_mlock_outof_limit(char *p, int alloc_size)
+{
+	int i;
+	int ret = 0;
+	int locked_vm_size = 0, old_locked_vm_size = 0;
+	struct rlimit cur;
+
+	/* without a valid limit the precondition below is meaningless */
+	if (getrlimit(RLIMIT_MEMLOCK, &cur)) {
+		perror("getrlimit");
+		return -1;
+	}
+	if (cur.rlim_cur >= alloc_size) {
+		/* this branch means alloc_size does not exceed the limit */
+		printf("alloc_size[%d] <= %u rlimit, violates test condition\n",
+		       alloc_size, (unsigned int)cur.rlim_cur);
+		return -1;
+	}
+
+	old_locked_vm_size = get_proc_locked_vm_size();
+	srand(time(NULL));
+	for (i = 0; i < TEST_LOOP; i++) {
+		int is_mlock = !!(rand() % 2);
+		/* lock_size falls in [rlim_cur, alloc_size) */
+		int lock_size = (rand() % (alloc_size - cur.rlim_cur))
+			+ cur.rlim_cur;
+		int start_offset = rand() % (alloc_size - lock_size);
+
+		if (is_mlock)
+			ret = mlock(p + start_offset, lock_size);
+		else
+			ret = mlock2_(p + start_offset, lock_size,
+				      MLOCK_ONFAULT);
+		if (ret == 0) {
+			printf("%s() succeeds? on %p(%d) mlock%p(%d)\n",
+			       is_mlock ? "mlock" : "mlock2",
+			       p, alloc_size,
+			       p + start_offset, lock_size);
+			return -1;
+		}
+	}
+
+	/* the failed calls must not have changed VmLck */
+	locked_vm_size = get_proc_locked_vm_size();
+	if (locked_vm_size != old_locked_vm_size) {
+		printf("tests leads to new mlocked page: old[%d], new[%d]\n",
+		       old_locked_vm_size,
+		       locked_vm_size);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Limit RLIMIT_MEMLOCK to MLOCK_RLIMIT_SIZE and drop capabilities, then run
+ * the within-limit test on a MLOCK_WITHIN_LIMIT_SIZE buffer followed by the
+ * out-of-limit test on a MLOCK_OUTOF_LIMIT_SIZE buffer.
+ *
+ * Returns 0 if both tests pass, otherwise -1 or the failing test's result.
+ */
+int main(int argc, char **argv)
+{
+ char *p = NULL;
+ int ret = 0;
+
+ if (set_cap_limits(MLOCK_RLIMIT_SIZE))
+ return -1;
+
+ p = malloc(MLOCK_WITHIN_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ /* Release the locks taken by the first test before freeing the buffer. */
+ munlock(p, MLOCK_WITHIN_LIMIT_SIZE);
+ free(p);
+
+
+ p = malloc(MLOCK_OUTOF_LIMIT_SIZE);
+ if (p == NULL) {
+ perror("malloc() failure\n");
+ return -1;
+ }
+ ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE);
+ if (ret)
+ return ret;
+ munlock(p, MLOCK_OUTOF_LIMIT_SIZE);
+ free(p);
+
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/mlock2-tests.c b/tools/testing/selftests/vm/mlock2-tests.c
new file mode 100644
index 000000000..11b2301f3
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2-tests.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+/* Address range of one mapping as parsed from /proc/self/maps. */
+struct vm_boundaries {
+ unsigned long start; /* first field of the "<start>-<end>" pair */
+ unsigned long end; /* second field of the "<start>-<end>" pair */
+};
+
+/*
+ * Look up the mapping in /proc/self/maps that contains @addr.
+ *
+ * On success the mapping's start and end addresses are stored in @area and
+ * 0 is returned. Returns 1 if @area is NULL, if the file cannot be opened
+ * or parsed, or if no mapping contains @addr; once the file has been
+ * opened, @area is zeroed before the search.
+ */
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+	FILE *file;
+	int ret = 1;
+	char line[1024] = {0};
+	char *end_addr;
+	char *stop;
+	unsigned long start;
+	unsigned long end;
+
+	if (!area)
+		return ret;
+
+	file = fopen("/proc/self/maps", "r");
+	if (!file) {
+		perror("fopen");
+		return ret;
+	}
+
+	memset(area, 0, sizeof(struct vm_boundaries));
+
+	while (fgets(line, sizeof(line), file)) {
+		/* split "<start>-<end> ..." at the '-' */
+		end_addr = strchr(line, '-');
+		if (!end_addr) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		*end_addr = '\0';
+		end_addr++;
+		stop = strchr(end_addr, ' ');
+		if (!stop) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		/*
+		 * Terminate the end-address string. The previous
+		 * "stop = '\0'" only set the local pointer to NULL.
+		 */
+		*stop = '\0';
+
+		if (sscanf(line, "%lx", &start) != 1 ||
+		    sscanf(end_addr, "%lx", &end) != 1) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+
+		if (start <= addr && end > addr) {
+			area->start = start;
+			area->end = end;
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	fclose(file);
+	return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+/*
+ * Report whether the "VmFlags:" line of the smaps entry containing @addr
+ * contains the string @vmflag.
+ *
+ * Returns false if the entry or its VmFlags line cannot be found.
+ */
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+	char *line = NULL;
+	char *flags;
+	size_t size = 0;
+	bool ret = false;
+	FILE *smaps;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		/* nothing was opened; fclose(NULL) would be undefined */
+		return false;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, VMFLAGS)) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		/* skip the "VmFlags:" label and search the flag list */
+		flags = line + strlen(VMFLAGS);
+		ret = (strstr(flags, vmflag) != NULL);
+		break;
+	}
+
+	free(line);
+	fclose(smaps);
+	return ret;
+}
+
+#define SIZE "Size:"
+#define RSS "Rss:"
+#define LOCKED "lo"
+
+/*
+ * Return the numeric value that follows @name on the first matching line
+ * after the header of the smaps entry containing @addr.
+ * Returns -1UL if the entry or the value cannot be found or parsed.
+ */
+static unsigned long get_value_for_name(unsigned long addr, const char *name)
+{
+	unsigned long result = -1UL;
+	size_t cap = 0;
+	char *buf = NULL;
+	FILE *fp;
+
+	fp = seek_to_smaps_entry(addr);
+	if (fp == NULL) {
+		printf("Unable to parse /proc/self/smaps\n");
+		return result;
+	}
+
+	for (;;) {
+		if (getline(&buf, &cap, fp) <= 0)
+			break;
+
+		if (strstr(buf, name) != NULL) {
+			/* the number is expected right after the label */
+			if (sscanf(buf + strlen(name), "%lu kB", &result) < 1)
+				printf("Unable to parse smaps entry for Size\n");
+			break;
+		}
+
+		/* not the wanted line: drop it and read the next one */
+		free(buf);
+		buf = NULL;
+		cap = 0;
+	}
+
+	fclose(fp);
+	free(buf);
+	return result;
+}
+
+/*
+ * True when the VMA containing @addr carries the LOCKED flag and its Rss
+ * is smaller than its Size, i.e. not every page is resident.
+ */
+static bool is_vma_lock_on_fault(unsigned long addr)
+{
+	unsigned long size_kb, rss_kb;
+
+	if (!is_vmflag_set(addr, LOCKED))
+		return false;
+
+	size_kb = get_value_for_name(addr, SIZE);
+	rss_kb = get_value_for_name(addr, RSS);
+
+	/* only part of the VMA is faulted in */
+	return rss_kb < size_kb;
+}
+
+#define PRESENT_BIT 0x8000000000000000ULL
+#define PFN_MASK 0x007FFFFFFFFFFFFFULL
+#define UNEVICTABLE_BIT (1UL << 18)
+
+/*
+ * Non-zero when the VMA containing @addr carries the LOCKED flag and its
+ * Rss equals its Size, i.e. the whole VMA is resident.
+ */
+static int lock_check(unsigned long addr)
+{
+	unsigned long size_kb, rss_kb;
+
+	if (!is_vmflag_set(addr, LOCKED))
+		return false;
+
+	size_kb = get_value_for_name(addr, SIZE);
+	rss_kb = get_value_for_name(addr, RSS);
+
+	return rss_kb == size_kb;
+}
+
+/* Returns 1 (and reports it) if the VMA at @map still has the LOCKED flag, else 0. */
+static int unlock_lock_check(char *map)
+{
+	const unsigned long addr = (unsigned long)map;
+
+	if (!is_vmflag_set(addr, LOCKED))
+		return 0;
+
+	printf("VMA flag %s is present on page 1 after unlock\n", LOCKED);
+	return 1;
+}
+
+/*
+ * Lock a two-page anonymous mapping with mlock2_(flags = 0), verify it
+ * with lock_check(), munlock() it and verify that the lock flag is gone.
+ *
+ * Returns 0 on success, 1 on failure. Exits with KSFT_SKIP if mlock2_()
+ * fails with ENOSYS.
+ */
+static int test_mlock_lock()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, 0)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(0)");
+ goto unmap;
+ }
+
+ if (!lock_check((unsigned long)map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ perror("munlock()");
+ goto unmap;
+ }
+
+ /* The final check decides the overall result. */
+ ret = unlock_lock_check(map);
+
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+/*
+ * Write one byte at @map, then check that its VMA is reported as
+ * lock-on-fault. Returns 0 if so, 1 (with a message) otherwise.
+ */
+static int onfault_check(char *map)
+{
+	map[0] = 'a';
+
+	if (is_vma_lock_on_fault((unsigned long)map))
+		return 0;
+
+	printf("VMA is not marked for lock on fault\n");
+	return 1;
+}
+
+/*
+ * Check that neither of the two pages starting at @map is still reported
+ * as lock-on-fault. Returns 0 if both are clear, 1 (with a message) otherwise.
+ */
+static int unlock_onfault_check(char *map)
+{
+	const unsigned long first_page = (unsigned long)map;
+	const unsigned long second_page = first_page + getpagesize();
+
+	if (!is_vma_lock_on_fault(first_page) &&
+	    !is_vma_lock_on_fault(second_page))
+		return 0;
+
+	printf("VMA is still lock on fault after unlock\n");
+	return 1;
+}
+
+/*
+ * Lock a two-page anonymous mapping with mlock2_(MLOCK_ONFAULT), verify it
+ * with onfault_check(), munlock() it and verify with
+ * unlock_onfault_check().
+ *
+ * Returns 0 on success, 1 on failure. Exits with KSFT_SKIP if mlock2_() or
+ * munlock() fails with ENOSYS.
+ */
+static int test_mlock_onfault()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (onfault_check(map))
+ goto unmap;
+
+ /* Now unlock and recheck attributes */
+ if (munlock(map, 2 * page_size)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("munlock()");
+ goto unmap;
+ }
+
+ /* The final check decides the overall result. */
+ ret = unlock_onfault_check(map);
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+/*
+ * Touch the first page of a two-page anonymous mapping before locking it
+ * with mlock2_(MLOCK_ONFAULT), then check that both pages are reported as
+ * lock-on-fault.
+ *
+ * Returns 0 on success, 1 on failure. Exits with KSFT_SKIP if mlock2_()
+ * fails with ENOSYS.
+ */
+static int test_lock_onfault_of_present()
+{
+ char *map;
+ int ret = 1;
+ unsigned long page_size = getpagesize();
+
+ map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("test_mlock_locked mmap");
+ goto out;
+ }
+
+ /* Write to the first page before the lock is applied. */
+ *map = 'a';
+
+ if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock2(MLOCK_ONFAULT)");
+ goto unmap;
+ }
+
+ if (!is_vma_lock_on_fault((unsigned long)map) ||
+ !is_vma_lock_on_fault((unsigned long)map + page_size)) {
+ printf("VMA with present pages is not marked lock on fault\n");
+ goto unmap;
+ }
+ ret = 0;
+unmap:
+ munmap(map, 2 * page_size);
+out:
+ return ret;
+}
+
+/*
+ * Check that munlockall() undoes mlockall() for three flag combinations:
+ * MCL_CURRENT, MCL_CURRENT | MCL_ONFAULT and MCL_CURRENT | MCL_FUTURE.
+ * A fresh two-page mapping is used for the first phase; a second one is
+ * shared by the remaining two.
+ *
+ * Returns 0 on success, 1 on failure. munlockall() is always called before
+ * returning so that no lock state leaks into later tests.
+ */
+static int test_munlockall()
+{
+	char *map;
+	int ret = 1;
+	unsigned long page_size = getpagesize();
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall mmap");
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT)) {
+		perror("mlockall(MCL_CURRENT)");
+		/* the mapping exists here, so release it on the way out */
+		goto unmap;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_lock_check(map))
+		goto unmap;
+
+	munmap(map, 2 * page_size);
+
+	map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE,
+		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+
+	if (map == MAP_FAILED) {
+		perror("test_munlockall second mmap");
+		/* nothing is mapped at this point, skip the munmap() */
+		goto out;
+	}
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT)) {
+		perror("mlockall(MCL_CURRENT | MCL_ONFAULT)");
+		goto unmap;
+	}
+
+	if (onfault_check(map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	if (unlock_onfault_check(map))
+		goto unmap;
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
+		perror("mlockall(MCL_CURRENT | MCL_FUTURE)");
+		/* the second mapping is still live, release it as well */
+		goto unmap;
+	}
+
+	if (!lock_check((unsigned long)map))
+		goto unmap;
+
+	if (munlockall()) {
+		perror("munlockall()");
+		goto unmap;
+	}
+
+	ret = unlock_lock_check(map);
+
+unmap:
+	munmap(map, 2 * page_size);
+out:
+	munlockall();
+	return ret;
+}
+
+/*
+ * Check VMA splitting and merging around munlock().
+ *
+ * Maps three pages (locking them with mlock2_(MLOCK_ONFAULT) when
+ * @call_mlock is true), confirms via /proc/self/maps that they share one
+ * VMA, unlocks the middle page and expects three distinct VMAs, then
+ * unlocks the whole range and expects a single VMA again.
+ *
+ * Returns 0 on success, or when the pages do not start out in one VMA;
+ * returns 1 on failure.
+ */
+static int test_vma_management(bool call_mlock)
+{
+ int ret = 1;
+ void *map;
+ unsigned long page_size = getpagesize();
+ struct vm_boundaries page1;
+ struct vm_boundaries page2;
+ struct vm_boundaries page3;
+
+ map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (map == MAP_FAILED) {
+ perror("mmap()");
+ return ret;
+ }
+
+ if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) {
+ if (errno == ENOSYS) {
+ printf("Cannot call new mlock family, skipping test\n");
+ _exit(KSFT_SKIP);
+ }
+ perror("mlock(ONFAULT)\n");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /*
+ * Before we unlock a portion, we need to check that all three pages
+ * are in the same VMA. If they are not we abort this test (Note that
+ * this is not a failure)
+ */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("VMAs are not merged to start, aborting test\n");
+ ret = 0;
+ goto out;
+ }
+
+ /* Unlock only the middle page. */
+ if (munlock(map + page_size, page_size)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* All three VMAs should be different */
+ if (page1.start == page2.start || page2.start == page3.start) {
+ printf("failed to split VMA for munlock\n");
+ goto out;
+ }
+
+ /* Now unlock the first and third page and check the VMAs again */
+ if (munlock(map, page_size * 3)) {
+ perror("munlock()");
+ goto out;
+ }
+
+ if (get_vm_area((unsigned long)map, &page1) ||
+ get_vm_area((unsigned long)map + page_size, &page2) ||
+ get_vm_area((unsigned long)map + page_size * 2, &page3)) {
+ printf("couldn't find mapping in /proc/self/maps\n");
+ goto out;
+ }
+
+ /* Now all three VMAs should be the same */
+ if (page1.start != page2.start || page2.start != page3.start) {
+ printf("failed to merge VMAs after munlock\n");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ munmap(map, 3 * page_size);
+ return ret;
+}
+
+/*
+ * Run @test_function(false) while mlockall(MCL_CURRENT | MCL_ONFAULT |
+ * MCL_FUTURE) is in effect, then munlockall().
+ * Returns 1 if mlockall() fails, otherwise the result of @test_function.
+ */
+static int test_mlockall(int (test_function)(bool call_mlock))
+{
+	int result;
+
+	if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE) != 0) {
+		perror("mlockall");
+		return 1;
+	}
+
+	result = test_function(false);
+	munlockall();
+	return result;
+}
+
+/* Run every test in order; the exit status is the sum of their results. */
+int main(int argc, char **argv)
+{
+	int failures = test_mlock_lock();
+
+	failures += test_mlock_onfault();
+	failures += test_munlockall();
+	failures += test_lock_onfault_of_present();
+	failures += test_vma_management(true);
+	failures += test_mlockall(test_vma_management);
+
+	return failures;
+}
diff --git a/tools/testing/selftests/vm/mlock2.h b/tools/testing/selftests/vm/mlock2.h
new file mode 100644
index 000000000..2a6e76c22
--- /dev/null
+++ b/tools/testing/selftests/vm/mlock2.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+/*
+ * Invoke the mlock2 system call directly through syscall().
+ * When __NR_mlock2 is not defined at build time, fail with errno = ENOSYS
+ * and return -1 instead.
+ */
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+ return syscall(__NR_mlock2, start, len, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+/*
+ * Open /proc/self/smaps and advance to the entry whose address range
+ * contains @addr.
+ *
+ * Returns the open stream positioned just after that entry's header line
+ * (the caller must fclose() it), or NULL if no entry contains @addr.
+ * Calls _exit(1) if the file cannot be opened.
+ */
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+ FILE *file;
+ char *line = NULL;
+ size_t size = 0;
+ unsigned long start, end;
+ char perms[5];
+ unsigned long offset;
+ char dev[32];
+ unsigned long inode;
+ char path[BUFSIZ];
+
+ file = fopen("/proc/self/smaps", "r");
+ if (!file) {
+ perror("fopen smaps");
+ _exit(1);
+ }
+
+ while (getline(&line, &size, file) > 0) {
+ /* Lines that yield fewer than six fields are not entry headers. */
+ if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+ &start, &end, perms, &offset, dev, &inode, path) < 6)
+ goto next;
+
+ if (start <= addr && addr < end)
+ goto out;
+
+next:
+ free(line);
+ line = NULL;
+ size = 0;
+ }
+
+ /* No entry matched: close the stream and report failure with NULL. */
+ fclose(file);
+ file = NULL;
+
+out:
+ free(line);
+ return file;
+}
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
new file mode 100644
index 000000000..3a7b5ef0b
--- /dev/null
+++ b/tools/testing/selftests/vm/mremap_dontunmap.c
@@ -0,0 +1,312 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@google.com>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+/*
+ * Print this process's memory map by running "cat /proc/<pid>/maps" through
+ * system().  Used by BUG_ON() as a debugging aid just before exiting.
+ */
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+/*
+ * If @condition is true, report the failing function and line together with
+ * @description and strerror(errno) on stderr, dump the process memory map and
+ * exit with status 1.
+ */
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ dump_maps(); \
+ exit(1); \
+ } \
+ } while (0)
+
+// Try a simple operation to "test" for kernel support; this prevents
+// reporting tests as failed when run on an older kernel.
+//
+// Returns 0 when the MREMAP_DONTUNMAP remap succeeds, otherwise the errno
+// left by the failed mremap() call.
+static int kernel_support_for_mremap_dontunmap(void)
+{
+	int ret = 0;
+	unsigned long num_pages = 1;
+	void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+				    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+	// This simple remap should only fail if MREMAP_DONTUNMAP isn't
+	// supported.  new_address is a variadic pointer argument, so pass
+	// NULL (as the other tests do) rather than a bare int 0.
+	void *dest_mapping =
+	    mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	if (dest_mapping == MAP_FAILED) {
+		ret = errno;
+	} else {
+		BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+		       "unable to unmap destination mapping");
+	}
+
+	BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+	       "unable to unmap source mapping");
+	return ret;
+}
+
+// Verify that every byte of a page-aligned, page-multiple region equals
+// 'byte'.  Returns 0 when the whole region matches, otherwise the memcmp()
+// result for the first page that differs.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+	const char *cursor = addr;
+	unsigned long pages_left;
+	int diff;
+
+	BUG_ON(size & (page_size - 1),
+	       "check_region_contains_byte expects page multiples");
+	BUG_ON((unsigned long)addr & (page_size - 1),
+	       "check_region_contains_byte expects page alignment");
+
+	// Build one reference page and compare the region against it page by
+	// page, lowest address first.
+	memset(page_buffer, byte, page_size);
+
+	for (pages_left = size / page_size; pages_left; pages_left--) {
+		diff = memcmp(cursor, page_buffer, page_size);
+		if (diff)
+			return diff;
+		cursor += page_size;
+	}
+
+	return 0;
+}
+
+// Validate that MREMAP_DONTUNMAP moves the page tables to a new location
+// while leaving the source range mapped.
+static void mremap_dontunmap_simple()
+{
+	const unsigned long len = 5 * page_size;
+	void *src, *dst;
+
+	src = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(src == MAP_FAILED, "mmap");
+
+	memset(src, 'a', len);
+
+	// Let the kernel pick the destination (no MREMAP_FIXED).
+	dst = mremap(src, len, len, MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dst == MAP_FAILED, "mremap");
+
+	// The pages moved if the destination now reads back as a's ...
+	BUG_ON(check_region_contains_byte(dst, len, 'a') != 0,
+	       "pages did not migrate");
+	// ... and the source reads back as zeros.
+	BUG_ON(check_region_contains_byte(src, len, 0) != 0,
+	       "source should have no ptes");
+
+	BUG_ON(munmap(dst, len) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(src, len) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that MREMAP_DONTUNMAP will move page tables to a
+// specific destination using MREMAP_FIXED, while also validating that the
+// source remains mapped.
+static void mremap_dontunmap_simple_fixed()
+{
+ unsigned long num_pages = 5;
+
+ // Since we want to guarantee that we can remap to a point, we will
+ // create a mapping up front.
+ void *dest_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', num_pages * page_size);
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // Remap the source on top of the pre-created destination.
+ void *remapped_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+ dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(remapped_mapping != dest_mapping,
+ "mremap should have placed the remapped mapping at dest_mapping");
+
+ // The dest mapping will have been unmapped by mremap so we expect the
+ // Xs to be gone and replaced with a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // And the source mapping will have had its ptes dropped.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// Validate that MREMAP_DONTUNMAP works on just a portion of an existing
+// mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+	/*
+	 * source mapping:
+	 *   --------------
+	 *   | aaaaaaaaaa |
+	 *   --------------
+	 * to become:
+	 *   --------------
+	 *   | aaaaa00000 |
+	 *   --------------
+	 * With the destination mapping containing 5 pages of As.
+	 *   ---------
+	 *   | aaaaa |
+	 *   ---------
+	 */
+	const unsigned long total_len = 10 * page_size;
+	const unsigned long half_len = 5 * page_size;
+	void *src, *src_tail, *dst;
+
+	src = mmap(NULL, total_len, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(src == MAP_FAILED, "mmap");
+	memset(src, 'a', total_len);
+
+	// Move only the last 5 pages of the source.
+	src_tail = (char *)src + half_len;
+	dst = mremap(src_tail, half_len, half_len,
+		     MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+	BUG_ON(dst == MAP_FAILED, "mremap");
+
+	// The first half of the source keeps its a's; the second half must
+	// now read back as zeros.
+	BUG_ON(check_region_contains_byte(src, half_len, 'a') != 0,
+	       "first 5 pages of source should have original pages");
+	BUG_ON(check_region_contains_byte(src_tail, half_len, 0) != 0,
+	       "final 5 pages of source should have no ptes");
+
+	// The destination holds the 5 pages of a's that were moved.
+	BUG_ON(check_region_contains_byte(dst, half_len, 'a') != 0,
+	       "dest mapping should contain ptes from the source");
+
+	BUG_ON(munmap(dst, half_len) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(src, total_len) == -1,
+	       "unable to unmap source mapping");
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite(void)
+{
+	/*
+	 * source mapping:
+	 *   -------
+	 *   |aaaaa|
+	 *   -------
+	 * dest mapping initially:
+	 *   ------------
+	 *   |XXXXXXXXXX|
+	 *   ------------
+	 * Source to become:
+	 *   -------
+	 *   |00000|
+	 *   -------
+	 * With the destination mapping containing 5 pages of As.
+	 *   ------------
+	 *   |aaaaaXXXXX|
+	 *   ------------
+	 */
+	void *source_mapping =
+	    mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(source_mapping == MAP_FAILED, "mmap");
+	memset(source_mapping, 'a', 5 * page_size);
+
+	void *dest_mapping =
+	    mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+	memset(dest_mapping, 'X', 10 * page_size);
+
+	// Move the whole 5-page source on top of the first 5 pages of the
+	// destination.
+	void *remapped_mapping =
+	    mremap(source_mapping, 5 * page_size,
+		   5 * page_size,
+		   MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+	// Check the value mremap() returned; dest_mapping was already
+	// validated above and can never be MAP_FAILED here.
+	BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+	BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+	BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+	       0, "first 5 pages of source should have no ptes");
+
+	// We expect the start of the destination to have 5 pages worth of a's.
+	BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+	       "dest mapping should contain ptes from the source");
+
+	// Finally the last 5 pages shouldn't have been touched.
+	BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+					  5 * page_size, 'X') != 0,
+	       "dest mapping should have retained the last 5 pages");
+
+	BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+	       "unable to unmap destination mapping");
+	BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+	       "unable to unmap source mapping");
+}
+
+/*
+ * Entry point: probe for MREMAP_DONTUNMAP support (returning KSFT_SKIP when
+ * the probe fails), allocate the shared one-page comparison buffer, then run
+ * each test.  Any test failure exits through BUG_ON(); reaching the end
+ * prints "OK" and returns 0.
+ */
+int main(void)
+{
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ // test for kernel support for MREMAP_DONTUNMAP skipping the test if
+ // not.
+ if (kernel_support_for_mremap_dontunmap() != 0) {
+ printf("No kernel support for MREMAP_DONTUNMAP\n");
+ return KSFT_SKIP;
+ }
+
+ // Keep a page sized buffer around for when we need it.
+ page_buffer =
+ mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+ mremap_dontunmap_simple();
+ mremap_dontunmap_simple_fixed();
+ mremap_dontunmap_partial_mapping();
+ mremap_dontunmap_partial_mapping_overwrite();
+
+ BUG_ON(munmap(page_buffer, page_size) == -1,
+ "unable to unmap page buffer");
+
+ printf("OK\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/on-fault-limit.c b/tools/testing/selftests/vm/on-fault-limit.c
new file mode 100644
index 000000000..634d87dfb
--- /dev/null
+++ b/tools/testing/selftests/vm/on-fault-limit.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <sys/mman.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+/*
+ * With mlockall(MCL_ONFAULT | MCL_FUTURE) in effect, try to map twice the
+ * hard RLIMIT_MEMLOCK limit with MAP_POPULATE.  The mmap() is expected to
+ * fail.
+ *
+ * Returns 0 when mmap() fails as expected, 1 otherwise (including when
+ * getrlimit() or mlockall() themselves fail).
+ */
+static int test_limit(void)
+{
+	int ret = 1;
+	struct rlimit lims;
+	void *map;
+
+	if (getrlimit(RLIMIT_MEMLOCK, &lims)) {
+		perror("getrlimit");
+		return ret;
+	}
+
+	if (mlockall(MCL_ONFAULT | MCL_FUTURE)) {
+		perror("mlockall");
+		return ret;
+	}
+
+	map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
+	if (map != MAP_FAILED) {
+		printf("mmap should have failed, but didn't\n");
+		/* Only a successful mapping can (and must) be unmapped. */
+		munmap(map, 2 * lims.rlim_max);
+	} else {
+		ret = 0;
+	}
+
+	munlockall();
+	return ret;
+}
+
+/* Run the single on-fault limit test; its result is the exit status. */
+int main(int argc, char **argv)
+{
+	return test_limit();
+}
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000000000..622a85848
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,225 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include <string.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+
+/* Define some kernel-like types */
+#define u8 __u8
+#define u16 __u16
+#define u32 __u32
+#define u64 __u64
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern int test_nr;
+extern int iteration_nr;
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
+/*
+ * printf()-style output that degrades to a raw write() of the unexpanded
+ * format string while dprint_in_signal is set.
+ */
+static inline void sigsafe_printf(const char *format, ...)
+{
+	va_list args;
+	int written;
+
+	if (dprint_in_signal) {
+		/*
+		 * No printf() functions are signal-safe.
+		 * They deadlock easily. Write the format
+		 * string to get some output, even if
+		 * incomplete.
+		 */
+		written = write(1, format, strlen(format));
+		if (written < 0)
+			exit(1);
+		return;
+	}
+
+	va_start(args, format);
+	vprintf(format, args);
+	va_end(args);
+}
+#define dprintf_level(level, args...) do { \
+ if (level <= DEBUG_LEVEL) \
+ sigsafe_printf(args); \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern void abort_hooks(void);
+/*
+ * Test assertion: when @condition is false, print the location, the current
+ * test/iteration numbers and errno, run abort_hooks() and exit with the
+ * source line number as the status.
+ */
+#define pkey_assert(condition) do {		\
+	if (!(condition)) {			\
+		dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+				__FILE__, __LINE__,	\
+				test_nr, iteration_nr);	\
+		dprintf0("errno at assert: %d\n", errno);	\
+		abort_hooks();			\
+		exit(__LINE__);			\
+	}					\
+} while (0)
+
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val);
+int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
+#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)
+
+/*
+ * Return @reg with the permission field belonging to @pkey replaced by the
+ * PKEY_MASK bits of @flags; all other fields are left untouched.
+ */
+static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags)
+{
+	const u32 shift = pkey_bit_position(pkey);
+	const u64 field = (u64)PKEY_MASK << shift;
+	const u64 bits = (flags & PKEY_MASK) << shift;
+
+	/* clear the key's old bits, then OR in the new ones */
+	return (reg & ~field) | bits;
+}
+
+/* Extract the PKEY_MASK permission bits that @reg holds for @pkey. */
+static inline u64 get_pkey_bits(u64 reg, int pkey)
+{
+	const u32 shift = pkey_bit_position(pkey);
+	u64 field = reg >> shift;	/* key's bits now sit at bit 0 */
+
+	/* drop everything above the key's own field */
+	field &= PKEY_MASK;
+	return field;
+}
+
+extern u64 shadow_pkey_reg;
+
+/*
+ * Read the hardware pkey register and assert that it still matches
+ * shadow_pkey_reg.  @line is the caller's source line, used only in the
+ * debug output; callers normally go through the read_pkey_reg() macro.
+ */
+static inline u64 _read_pkey_reg(int line)
+{
+ u64 pkey_reg = __read_pkey_reg();
+
+ dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx"
+ " shadow: %016llx\n",
+ line, pkey_reg, shadow_pkey_reg);
+ assert(pkey_reg == shadow_pkey_reg);
+
+ return pkey_reg;
+}
+
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
+
+/*
+ * Write @pkey_reg to the hardware pkey register and update shadow_pkey_reg
+ * to match.  The register is verified against the shadow copy before the
+ * write.
+ */
+static inline void write_pkey_reg(u64 pkey_reg)
+{
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
+ /* will do the shadow check for us: */
+ read_pkey_reg();
+ __write_pkey_reg(pkey_reg);
+ shadow_pkey_reg = pkey_reg;
+ dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__,
+ pkey_reg, __read_pkey_reg());
+}
+
+/*
+ * These are technically racy, since something could
+ * change the PKEY register between the read and the write.
+ */
+/*
+ * Clear (do_allow != 0) or set (do_allow == 0) the access-disable bit of
+ * @pkey in the pkey register.
+ *
+ * NOTE(review): the bit index is computed as pkey * 2, i.e. it does not go
+ * through pkey_bit_position() - confirm before using on an architecture
+ * whose register layout differs.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+	u64 pkey_reg = read_pkey_reg();
+	int bit = pkey * 2;
+
+	/*
+	 * Allowing access means clearing only this key's disable bit; use a
+	 * 64-bit constant so the shift is done at the register's width.
+	 */
+	if (do_allow)
+		pkey_reg &= ~((u64)1 << bit);
+	else
+		pkey_reg |= ((u64)1 << bit);
+
+	write_pkey_reg(pkey_reg);
+	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+/*
+ * Clear (do_allow_write != 0) or set (do_allow_write == 0) the write-disable
+ * bit of @pkey in the pkey register.  Racy: the register could change
+ * between the read and the write.
+ *
+ * NOTE(review): the bit index is computed as pkey * 2 + 1, i.e. it does not
+ * go through pkey_bit_position() - confirm before using on an architecture
+ * whose register layout differs.
+ */
+static inline void __pkey_write_allow(int pkey, int do_allow_write)
+{
+	u64 pkey_reg = read_pkey_reg();
+	int bit = pkey * 2 + 1;
+
+	/*
+	 * Allowing writes means clearing only this key's disable bit; use a
+	 * 64-bit constant so the shift is done at the register's width.
+	 */
+	if (do_allow_write)
+		pkey_reg &= ~((u64)1 << bit);
+	else
+		pkey_reg |= ((u64)1 << bit);
+
+	write_pkey_reg(pkey_reg);
+	dprintf4("pkey_reg now: %016llx\n", read_pkey_reg());
+}
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to) \
+ ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to) \
+ ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+/*
+ * Return a pointer to the pkey field inside @si.  When the C library's
+ * siginfo_t exposes si_pkey it is used directly; otherwise the field is
+ * located with the per-architecture si_pkey_offset byte offset.
+ */
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+ return &si->si_pkey;
+#else
+ return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
+}
+
+/*
+ * Probe for kernel pkey support by allocating a key.  Returns 1 (after
+ * freeing the key again) when the allocation yields a positive key number,
+ * 0 otherwise.
+ */
+static inline int kernel_has_pkeys(void)
+{
+	int pkey = sys_pkey_alloc(0, 0);
+
+	if (pkey > 0) {
+		sys_pkey_free(pkey);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Returns 1 only when both the CPU and the kernel support protection keys.
+ * The kernel is probed only if the CPU check passes.
+ */
+static inline int is_pkeys_supported(void)
+{
+	int supported = 0;
+
+	if (!cpu_has_pkeys()) {
+		/* check if the cpu supports pkeys */
+		dprintf1("SKIP: %s: no CPU support\n", __func__);
+	} else if (!kernel_has_pkeys()) {
+		/* check if the kernel supports pkeys */
+		dprintf1("SKIP: %s: no kernel support\n", __func__);
+	} else {
+		supported = 1;
+	}
+
+	return supported;
+}
+
+#endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h b/tools/testing/selftests/vm/pkey-powerpc.h
new file mode 100644
index 000000000..1ebb586b2
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -0,0 +1,133 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 384
+# define SYS_pkey_free 385
+#endif
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define gregs gp_regs
+#define fpregs fp_regs
+#define si_pkey_offset 0x20
+
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#define NR_PKEYS 32
+#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey
+ and 24 other keys that cannot be
+ represented in the PTE */
+#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0,
+ pkey-1 and exec-only key */
+#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1,
+ pkey-31 and exec-only key */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL << 24)
+#define PAGE_SIZE sysconf(_SC_PAGESIZE)
+
+/*
+ * Bit offset of @pkey's PKEY_BITS_PER_PKEY-wide field in the pkey register.
+ * Fields are laid out in reverse key order: pkey 0 has the highest offset
+ * and pkey NR_PKEYS - 1 sits at bit 0.
+ */
+static inline u32 pkey_bit_position(int pkey)
+{
+ return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+/*
+ * Read the raw pkey register with mfspr from SPR 0xd.
+ * NOTE(review): presumably the AMR, going by the "amr" local in
+ * __write_pkey_reg() - confirm.
+ */
+static inline u64 __read_pkey_reg(void)
+{
+ u64 pkey_reg;
+
+ asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+ return pkey_reg;
+}
+
+/*
+ * Write @pkey_reg to SPR 0xd with mtspr, bracketed by isync instructions,
+ * logging the register value before and after at debug level 4.  Does not
+ * touch shadow_pkey_reg.
+ */
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ u64 amr = pkey_reg;
+
+ dprintf4("%s() changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+
+ asm volatile("isync; mtspr 0xd, %0; isync"
+ : : "r" ((unsigned long)(amr)) : "memory");
+
+ dprintf4("%s() pkey register after changing %016llx to %016llx\n",
+ __func__, __read_pkey_reg(), pkey_reg);
+}
+
+/*
+ * CPU-side pkey detection.  Always reports support (returns 1) on powerpc.
+ */
+static inline int cpu_has_pkeys(void)
+{
+ /* No simple way to determine this */
+ return 1;
+}
+
+/*
+ * Guess whether we are running under PowerVM from the device tree: both
+ * "ibm,partition-name" and "hmc-managed?" must exist, and the qemu
+ * "graphic-width" property must not.  The checks are made in that order and
+ * stop at the first one that fails.
+ */
+static inline bool arch_is_powervm()
+{
+	struct stat sb;
+
+	return stat("/sys/firmware/devicetree/base/ibm,partition-name", &sb) == 0 &&
+	       stat("/sys/firmware/devicetree/base/hmc-managed?", &sb) == 0 &&
+	       stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &sb) == -1;
+}
+
+/*
+ * Number of pkeys reserved on this system: a fixed count with 4K pages,
+ * otherwise 4 keys under PowerVM and 3 elsewhere.
+ */
+static inline int get_arch_reserved_keys(void)
+{
+	if (sysconf(_SC_PAGESIZE) == 4096)
+		return NR_RESERVED_PKEYS_4K;
+
+	return arch_is_powervm() ? NR_RESERVED_PKEYS_64K_4KEYS
+				 : NR_RESERVED_PKEYS_64K_3KEYS;
+}
+
+/* Intentionally a no-op on powerpc; see the explanation below. */
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ /*
+ * powerpc does not allow userspace to change permissions of exec-only
+ * keys since those keys are not allocated by userspace. The signal
+ * handler won't be able to reset the permissions, which means the code
+ * will infinitely continue to segfault here.
+ */
+ return;
+}
+
+/* 4-byte instructions * 16384 = 64K page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+/*
+ * Allocate an anonymous private mapping of @size bytes, issue the
+ * subpage_prot syscall on it, then assign @pkey to its first PAGE_SIZE bytes
+ * and record the allocation.
+ *
+ * Returns the mapping, or PTR_ERR_ENOTSUP when the subpage_prot syscall
+ * fails; in that case the mapping is released before returning.
+ */
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int ret;
+
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	pkey_assert(pkey < NR_PKEYS);
+	ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+
+	ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+	if (ret) {
+		perror("subpage_perm");
+		/*
+		 * The caller only sees the error pointer, so drop the
+		 * mapping here rather than leaking it.
+		 */
+		munmap(ptr, size);
+		return PTR_ERR_ENOTSUP;
+	}
+
+	ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+	pkey_assert(!ret);
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+	return ptr;
+}
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h b/tools/testing/selftests/vm/pkey-x86.h
new file mode 100644
index 000000000..3be20f5d5
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_X86_H
+#define _PKEYS_X86_H
+
+#ifdef __i386__
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 380
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 381
+# define SYS_pkey_free 382
+#endif
+
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#else
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key 329
+#endif
+
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc 330
+# define SYS_pkey_free 331
+#endif
+
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+
+#endif
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS 0x1
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE 0x2
+#endif
+
+#define NR_PKEYS 16
+#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL<<21)
+#define PAGE_SIZE 4096
+#define MB (1<<20)
+
+/* Emit 512 copies of a long-form nop as padding. */
+static inline void __page_o_noops(void)
+{
+ /* 8 bytes per instruction * 512 instructions = 1 page */
+ asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
+}
+
+/*
+ * Read the raw pkey register using the hand-encoded instruction bytes
+ * 0f 01 ee (ecx must be 0).  Only the eax result is returned.
+ */
+static inline u64 __read_pkey_reg(void)
+{
+	unsigned int lo, hi;
+	unsigned int selector = 0;
+
+	asm volatile(".byte 0x0f,0x01,0xee\n\t"
+		     : "=a" (lo), "=d" (hi)
+		     : "c" (selector));
+	return lo;
+}
+
+/*
+ * Write the low 32 bits of @pkey_reg to the pkey register using the
+ * hand-encoded instruction bytes 0f 01 ef (ecx and edx are passed as 0),
+ * then assert that reading the register back yields @pkey_reg.  Does not
+ * touch shadow_pkey_reg.
+ */
+static inline void __write_pkey_reg(u64 pkey_reg)
+{
+ unsigned int eax = pkey_reg;
+ unsigned int ecx = 0;
+ unsigned int edx = 0;
+
+ dprintf4("%s() changing %016llx to %016llx\n", __func__,
+ __read_pkey_reg(), pkey_reg);
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (eax), "c" (ecx), "d" (edx));
+ assert(pkey_reg == __read_pkey_reg());
+}
+
+/*
+ * Execute CPUID.  *eax and *ecx are read as inputs (leaf and sub-leaf); all
+ * four pointers receive the resulting register values.
+ */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "cpuid;"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
+
+/*
+ * Query CPUID leaf 7, sub-leaf 0 and return 1 only when both the PKU and
+ * OSPKE feature bits are set in ecx.
+ */
+static inline int cpu_has_pkeys(void)
+{
+	unsigned int eax = 0x7;
+	unsigned int ecx = 0x0;
+	unsigned int ebx, edx;
+
+	__cpuid(&eax, &ebx, &ecx, &edx);
+
+	if ((ecx & X86_FEATURE_PKU) == 0) {
+		dprintf2("cpu does not have PKU\n");
+		return 0;
+	}
+	if ((ecx & X86_FEATURE_OSPKE) == 0) {
+		dprintf2("cpu does not have OSPKE\n");
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Bit offset of @pkey's PKEY_BITS_PER_PKEY-wide field in the pkey register:
+ * fields are packed in key order starting at bit 0.
+ */
+static inline u32 pkey_bit_position(int pkey)
+{
+ return pkey * PKEY_BITS_PER_PKEY;
+}
+
+#define XSTATE_PKEY_BIT (9)
+#define XSTATE_PKEY 0x200
+
+/*
+ * Look up where the pkey register lives inside the XSAVE area by querying
+ * CPUID leaf 0xd, sub-leaf XSTATE_PKEY_BIT.
+ *
+ * Returns the offset reported in ebx, or 0 (after printing a message) when
+ * the size reported in eax is zero.
+ */
+int pkey_reg_xstate_offset(void)
+{
+	unsigned int eax = 0xd;			/* XSTATE CPUID leaf */
+	unsigned int ecx = XSTATE_PKEY_BIT;	/* sub-leaf for the pkey state */
+	unsigned int ebx, edx;
+	int xstate_offset;
+	int xstate_size;
+
+	/* assume that XSTATE_PKEY is set in XCR0 */
+	__cpuid(&eax, &ebx, &ecx, &edx);
+	xstate_offset = ebx;
+	xstate_size = eax;
+
+	if (xstate_size == 0) {
+		printf("could not find size/offset of PKEY in xsave state\n");
+		return 0;
+	}
+
+	return xstate_offset;
+}
+
+/* Number of pkeys reserved on x86: the fixed NR_RESERVED_PKEYS count. */
+static inline int get_arch_reserved_keys(void)
+{
+ return NR_RESERVED_PKEYS;
+}
+
+/*
+ * Read through @p1 and then call expected_pkey_fault() for @pkey, i.e. the
+ * read is expected to have raised a pkey fault.
+ */
+void expect_fault_on_read_execonly_key(void *p1, int pkey)
+{
+ int ptr_contents;
+
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+/* Not implemented on x86: always returns PTR_ERR_ENOTSUP. */
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+ return PTR_ERR_ENOTSUP;
+}
+
+#endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c b/tools/testing/selftests/vm/protection_keys.c
new file mode 100644
index 000000000..87eecd5ba
--- /dev/null
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -0,0 +1,1588 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
+ *
+ * There are examples in here of:
+ * * how to set protection keys on memory
+ * * how to set/clear bits in pkey registers (the rights register)
+ * * how to handle SEGV_PKUERR signals and extract pkey-relevant
+ * information from the siginfo
+ *
+ * Things to add:
+ * make sure KSM and KSM COW breaking works
+ * prefault pages in at malloc, or not
+ * protect MPX bounds tables with protection keys?
+ * make sure VMA splitting/merging is working correctly
+ * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
+ *
+ * Compile like this:
+ * gcc -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
+ */
+#define _GNU_SOURCE
+#define __SANE_USERSPACE_TYPES__
+#include <errno.h>
+#include <linux/futex.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ptrace.h>
+#include <setjmp.h>
+
+#include "pkey-helpers.h"
+
+int iteration_nr = 1;
+int test_nr;
+
+u64 shadow_pkey_reg;
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+/*
+ * Write the string @str into the existing file @file (opened O_RDWR).
+ * Any failure to open the file or to write the whole string is reported on
+ * stderr and terminates the process.
+ */
+void cat_into_file(char *str, char *file)
+{
+	int fd;
+	int ret;
+
+	dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
+
+	/* Open after the debug print so nothing sits between open() and perror(). */
+	fd = open(file, O_RDWR);
+	/*
+	 * these need to be raw because they are called under
+	 * pkey_assert()
+	 */
+	if (fd < 0) {
+		/* Report the path that failed to open, not the payload. */
+		fprintf(stderr, "error opening '%s'\n", file);
+		perror("error: ");
+		exit(__LINE__);
+	}
+
+	ret = write(fd, str, strlen(str));
+	if (ret != strlen(str)) {
+		perror("write to file failed");
+		fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
+		exit(__LINE__);
+	}
+	close(fd);
+}
+
+#if CONTROL_TRACING > 0
+static int warned_tracing;
+/*
+ * Returns 1 when running with an effective uid of 0, otherwise 0.  The
+ * warning is printed only the first time the check fails.
+ */
+int tracing_root_ok(void)
+{
+	if (geteuid() == 0)
+		return 1;
+
+	if (!warned_tracing)
+		fprintf(stderr, "WARNING: not run as root, "
+			"can not do tracing control\n");
+	warned_tracing = 1;
+	return 0;
+}
+#endif
+
+/*
+ * When built with CONTROL_TRACING > 0 and run as root, write the tracing
+ * control files under TRACEDIR: switch tracing off, write a newline to the
+ * trace file, select the function_graph tracer, restrict it to this pid and
+ * switch tracing back on.  Compiles to an empty function otherwise.
+ */
+void tracing_on(void)
+{
+#if CONTROL_TRACING > 0
+#define TRACEDIR "/sys/kernel/debug/tracing"
+ char pidstr[32];
+
+ if (!tracing_root_ok())
+ return;
+
+ sprintf(pidstr, "%d", getpid());
+ cat_into_file("0", TRACEDIR "/tracing_on");
+ cat_into_file("\n", TRACEDIR "/trace");
+ /* The "nop" tracer branch below is compiled in but never taken. */
+ if (1) {
+ cat_into_file("function_graph", TRACEDIR "/current_tracer");
+ cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
+ } else {
+ cat_into_file("nop", TRACEDIR "/current_tracer");
+ }
+ cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
+ cat_into_file("1", TRACEDIR "/tracing_on");
+ dprintf1("enabled tracing\n");
+#endif
+}
+
+/*
+ * When built with CONTROL_TRACING > 0 and run as root, write "0" to the
+ * tracing_on control file.  Compiles to an empty function otherwise.
+ */
+void tracing_off(void)
+{
+#if CONTROL_TRACING > 0
+ if (!tracing_root_ok())
+ return;
+ cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
+#endif
+}
+
+/*
+ * Called from pkey_assert() just before exiting: announce itself on stderr,
+ * turn tracing off and, if SLEEP_ON_ABORT is defined, sleep for that many
+ * seconds.
+ */
+void abort_hooks(void)
+{
+ fprintf(stderr, "running %s()...\n", __func__);
+ tracing_off();
+#ifdef SLEEP_ON_ABORT
+ sleep(SLEEP_ON_ABORT);
+#endif
+}
+
+/*
+ * This attempts to have roughly a page of instructions followed by a few
+ * instructions that do a write, and another page of instructions. That
+ * way, we are pretty sure that the write is in the second page of
+ * instructions and has at least a page of padding behind it.
+ *
+ * *That* lets us be sure to madvise() away the write instruction, which
+ * will then fault, which makes sure that the fault code handles
+ * execute-only memory properly.
+ */
+#ifdef __powerpc64__
+/* This way, both 4K and 64K alignment are maintained */
+__attribute__((__aligned__(65536)))
+#else
+__attribute__((__aligned__(PAGE_SIZE)))
+#endif
+/*
+ * Store through @write_to_me (the stored value is the current source line
+ * number) with a __page_o_noops() block before and after the store.
+ */
+void lots_o_noops_around_write(int *write_to_me)
+{
+ dprintf3("running %s()\n", __func__);
+ __page_o_noops();
+ /* Assume this happens in the second page of instructions: */
+ *write_to_me = __LINE__;
+ /* pad out by another page: */
+ __page_o_noops();
+ dprintf3("%s() done\n", __func__);
+}
+
+/*
+ * Debug helper: print @len_bytes of memory starting at @dumpme as a series
+ * of 64-bit hex words (debug level 1), one word per line with its byte
+ * offset and address.
+ */
+void dump_mem(void *dumpme, int len_bytes)
+{
+ char *c = (void *)dumpme;
+ int i;
+
+ for (i = 0; i < len_bytes; i += sizeof(u64)) {
+ u64 *ptr = (u64 *)(c + i);
+ dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr);
+ }
+}
+
+/*
+ * Return the permission bits currently set for @pkey, read straight from
+ * the hardware register (no shadow check).  @flags is only echoed in the
+ * debug output.
+ */
+static u32 hw_pkey_get(int pkey, unsigned long flags)
+{
+ u64 pkey_reg = __read_pkey_reg();
+
+ /* The trailing "%x / %d" values are hard-coded zeros. */
+ dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
+ __func__, pkey, flags, 0, 0);
+ dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg);
+
+ return (u32) get_pkey_bits(pkey_reg, pkey);
+}
+
+/*
+ * Replace the permission bits of @pkey in the hardware register with
+ * @rights, which may only contain PKEY_DISABLE_ACCESS / PKEY_DISABLE_WRITE
+ * bits (asserted).  shadow_pkey_reg is not updated here; @flags is only
+ * echoed in the debug output.  Always returns 0.
+ */
+static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
+{
+ u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
+ u64 old_pkey_reg = __read_pkey_reg();
+ u64 new_pkey_reg;
+
+ /* make sure that 'rights' only contains the bits we expect: */
+ assert(!(rights & ~mask));
+
+ /* modify bits accordingly in old pkey_reg and assign it */
+ new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights);
+
+ __write_pkey_reg(new_pkey_reg);
+
+ dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x"
+ " pkey_reg now: %016llx old_pkey_reg: %016llx\n",
+ __func__, pkey, rights, flags, 0, __read_pkey_reg(),
+ old_pkey_reg);
+ return 0;
+}
+
+/*
+ * Add the disable bits in @flags (PKEY_DISABLE_ACCESS and/or
+ * PKEY_DISABLE_WRITE) to @pkey's current rights, updating both the hardware
+ * register and shadow_pkey_reg.
+ */
+void pkey_disable_set(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights;
+ /* Snapshot (with shadow check) for the sanity check at the end. */
+ u64 orig_pkey_reg = read_pkey_reg();
+
+ dprintf1("START->%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights |= flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
+ assert(!ret);
+ /* pkey_reg and flags have the same format */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ dprintf1("%s(%d) shadow: 0x%016llx\n",
+ __func__, pkey, shadow_pkey_reg);
+
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n",
+ __func__, pkey, read_pkey_reg());
+ /* Setting disable bits must not make the register value smaller. */
+ if (flags)
+ pkey_assert(read_pkey_reg() >= orig_pkey_reg);
+ dprintf1("END<---%s(%d, 0x%x)\n", __func__,
+ pkey, flags);
+}
+
+/*
+ * Remove the disable bits in @flags (PKEY_DISABLE_ACCESS and/or
+ * PKEY_DISABLE_WRITE) from @pkey's current rights, updating both the
+ * hardware register and shadow_pkey_reg.
+ */
+void pkey_disable_clear(int pkey, int flags)
+{
+ unsigned long syscall_flags = 0;
+ int ret;
+ int pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ /* Snapshot (with shadow check) for the sanity check at the end. */
+ u64 orig_pkey_reg = read_pkey_reg();
+
+ pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+ pkey_assert(pkey_rights >= 0);
+
+ pkey_rights &= ~flags;
+
+ ret = hw_pkey_set(pkey, pkey_rights, 0);
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights);
+ pkey_assert(ret >= 0);
+
+ pkey_rights = hw_pkey_get(pkey, syscall_flags);
+ dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
+ pkey, pkey, pkey_rights);
+
+ dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__,
+ pkey, read_pkey_reg());
+ /* Clearing disable bits must not make the register value larger. */
+ if (flags)
+ assert(read_pkey_reg() <= orig_pkey_reg);
+}
+
+/* Clear PKEY_DISABLE_WRITE for 'pkey'. */
+void pkey_write_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
+}
+/* Set PKEY_DISABLE_WRITE for 'pkey'. */
+void pkey_write_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
+}
+/* Clear PKEY_DISABLE_ACCESS for 'pkey'. */
+void pkey_access_allow(int pkey)
+{
+ pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
+}
+/* Set PKEY_DISABLE_ACCESS for 'pkey'. */
+void pkey_access_deny(int pkey)
+{
+ pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
+}
+
+/* Failed address bound checks: */
+#ifndef SEGV_BNDERR
+# define SEGV_BNDERR 3
+#endif
+
+#ifndef SEGV_PKUERR
+# define SEGV_PKUERR 4
+#endif
+
+/*
+ * Map a SIGSEGV si_code to its symbolic name for debug output.
+ * Codes other than the four SEGV_* values handled here yield "UNKNOWN".
+ */
+static char *si_code_str(int si_code)
+{
+	switch (si_code) {
+	case SEGV_MAPERR:
+		return "SEGV_MAPERR";
+	case SEGV_ACCERR:
+		return "SEGV_ACCERR";
+	case SEGV_BNDERR:
+		return "SEGV_BNDERR";
+	case SEGV_PKUERR:
+		return "SEGV_PKUERR";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+int pkey_faults;
+int last_si_pkey = -1;
+void signal_handler(int signum, siginfo_t *si, void *vucontext)
+{
+ ucontext_t *uctxt = vucontext;
+ int trapno;
+ unsigned long ip;
+ char *fpregs;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ u32 *pkey_reg_ptr;
+ int pkey_reg_offset;
+#endif /* arch */
+ u64 siginfo_pkey;
+ u32 *si_pkey_ptr;
+
+ dprint_in_signal = 1;
+ dprintf1(">>>>===============SIGSEGV============================\n");
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__,
+ __read_pkey_reg(), shadow_pkey_reg);
+
+ trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
+ ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
+ fpregs = (char *) uctxt->uc_mcontext.fpregs;
+
+ dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+ __func__, trapno, ip, si_code_str(si->si_code),
+ si->si_code);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#ifdef __i386__
+ /*
+ * 32-bit has some extra padding so that userspace can tell whether
+ * the XSTATE header is present in addition to the "legacy" FPU
+ * state. We just assume that it is here.
+ */
+ fpregs += 0x70;
+#endif /* i386 */
+ pkey_reg_offset = pkey_reg_xstate_offset();
+ pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]);
+
+ /*
+ * If we got a PKEY fault, we *HAVE* to have at least one bit set in
+ * here.
+ */
+ dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset());
+ if (DEBUG_LEVEL > 4)
+ dump_mem(pkey_reg_ptr - 128, 256);
+ pkey_assert(*pkey_reg_ptr);
+#endif /* arch */
+
+ dprintf1("siginfo: %p\n", si);
+ dprintf1(" fpregs: %p\n", fpregs);
+
+ if ((si->si_code == SEGV_MAPERR) ||
+ (si->si_code == SEGV_ACCERR) ||
+ (si->si_code == SEGV_BNDERR)) {
+ printf("non-PK si_code, exiting...\n");
+ exit(4);
+ }
+
+ si_pkey_ptr = siginfo_get_pkey_ptr(si);
+ dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
+ dump_mem((u8 *)si_pkey_ptr - 8, 24);
+ siginfo_pkey = *si_pkey_ptr;
+ pkey_assert(siginfo_pkey < NR_PKEYS);
+ last_si_pkey = siginfo_pkey;
+
+ /*
+ * need __read_pkey_reg() version so we do not do shadow_pkey_reg
+ * checking
+ */
+ dprintf1("signal pkey_reg from pkey_reg: %016llx\n",
+ __read_pkey_reg());
+ dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey);
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+ *(u64 *)pkey_reg_ptr = 0x00000000;
+ dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n");
+#elif defined(__powerpc64__) /* arch */
+ /* restore access and let the faulting instruction continue */
+ pkey_access_allow(siginfo_pkey);
+#endif /* arch */
+ pkey_faults++;
+ dprintf1("<<<<==================================================\n");
+ dprint_in_signal = 0;
+}
+
+/*
+ * Wait for a child with waitpid(-1, ...) and return waitpid()'s result.
+ * Despite the name this is a single waitpid() call, and the exit status
+ * it collects is discarded.
+ */
+int wait_all_children(void)
+{
+ int status;
+ return waitpid(-1, &status, 0);
+}
+
+/*
+ * SIGCHLD handler: only logs the signal number 'x'.
+ * dprint_in_signal is raised around the debug print and cleared after it.
+ */
+void sig_chld(int x)
+{
+ dprint_in_signal = 1;
+ dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
+ dprint_in_signal = 0;
+}
+
+/*
+ * Install signal_handler() as the SA_SIGINFO-style handler for both
+ * SIGSEGV and SIGALRM.  The set of signals blocked while the handler
+ * runs is the process's current signal mask.  Any failure trips
+ * pkey_assert().
+ */
+void setup_sigsegv_handler(void)
+{
+	int r, rs;
+	struct sigaction newact;
+	struct sigaction oldact;
+
+	/* #PF is mapped to sigsegv */
+	int signum = SIGSEGV;
+
+	newact.sa_handler = 0;
+	newact.sa_sigaction = signal_handler;
+
+	/*sigset_t - signals to block while in the handler */
+	/* get the old signal mask. */
+	rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
+	pkey_assert(rs == 0);
+
+	/* call sa_sigaction, not sa_handler*/
+	newact.sa_flags = SA_SIGINFO;
+
+	newact.sa_restorer = 0;  /* void(*)(), obsolete */
+
+	/*
+	 * Check each registration on its own: without the first check a
+	 * failed SIGSEGV registration would be masked by a successful
+	 * SIGALRM one.
+	 */
+	r = sigaction(signum, &newact, &oldact);
+	pkey_assert(r == 0);
+	r = sigaction(SIGALRM, &newact, &oldact);
+	pkey_assert(r == 0);
+}
+
+/* Install sig_chld() for SIGCHLD, then the SIGSEGV/SIGALRM fault handler. */
+void setup_handlers(void)
+{
+ signal(SIGCHLD, &sig_chld);
+ setup_sigsegv_handler();
+}
+
+/*
+ * Fork a child that does nothing but sleep in a loop.
+ *
+ * Returns the child's pid in the parent.  The child never returns from
+ * this function.  A failed fork() trips pkey_assert().
+ */
+pid_t fork_lazy_child(void)
+{
+	pid_t forkret = fork();
+
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	/* parent: hand the child's pid back to the caller */
+	if (forkret)
+		return forkret;
+
+	/* in the child: idle forever */
+	for (;;) {
+		dprintf1("child sleeping...\n");
+		sleep(30);
+	}
+}
+
+/*
+ * Thin wrapper around the SYS_mprotect_key syscall.
+ *
+ * Returns the raw syscall() result.  Nothing is asserted here; on failure
+ * the details are only logged, so callers decide whether an error is
+ * expected.
+ */
+int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int sret;
+
+ dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
+ ptr, size, orig_prot, pkey);
+
+ /* cleared first so a non-zero errno below belongs to this syscall */
+ errno = 0;
+ sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
+ if (errno) {
+ dprintf2("SYS_mprotect_key sret: %d\n", sret);
+ dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
+ dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
+ if (DEBUG_LEVEL >= 2)
+ perror("SYS_mprotect_pkey");
+ }
+ return sret;
+}
+
+/*
+ * Thin wrapper around the SYS_pkey_alloc syscall; logs and returns the
+ * raw syscall() result.  errno is not reset before the call, so the value
+ * logged may be stale when the syscall succeeds.
+ */
+int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+{
+ int ret = syscall(SYS_pkey_alloc, flags, init_val);
+ dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
+ __func__, flags, init_val, ret, errno);
+ return ret;
+}
+
+int alloc_pkey(void)
+{
+ int ret;
+ unsigned long init_val = 0x0;
+
+ dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n",
+ __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
+ ret = sys_pkey_alloc(0, init_val);
+ /*
+ * pkey_alloc() sets PKEY register, so we need to reflect it in
+ * shadow_pkey_reg:
+ */
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ if (ret > 0) {
+ /* clear both the bits: */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ ~PKEY_MASK);
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__,
+ __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ /*
+ * move the new state in from init_val
+ * (remember, we cheated and init_val == pkey_reg format)
+ */
+ shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret,
+ init_val);
+ }
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno);
+ /* for shadow checking: */
+ read_pkey_reg();
+ dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ return ret;
+}
+
+/*
+ * Thin wrapper around the SYS_pkey_free syscall; logs and returns the raw
+ * syscall() result.  shadow_pkey_reg is not touched here.
+ */
+int sys_pkey_free(unsigned long pkey)
+{
+ int ret = syscall(SYS_pkey_free, pkey);
+ dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
+ return ret;
+}
+
+/*
+ * I had a bug where pkey bits could be set by mprotect() but
+ * not cleared. This ensures we get lots of random bit sets
+ * and clears on the vma and pte pkey bits.
+ */
+int alloc_random_pkey(void)
+{
+ int max_nr_pkey_allocs;
+ int ret;
+ int i;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ int random_index;
+ memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+
+ /* allocate every possible key and make a note of which ones we got */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+
+ pkey_assert(nr_alloced > 0);
+ /* select a random one out of the allocated ones */
+ random_index = rand() % nr_alloced;
+ ret = alloced_pkeys[random_index];
+ /* now zero it out so we don't free it next */
+ alloced_pkeys[random_index] = 0;
+
+ /* go through the allocated ones that we did not want and free them */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+ return ret;
+}
+
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+ unsigned long pkey)
+{
+ int nr_iterations = random() % 100;
+ int ret;
+
+ while (0) {
+ int rpkey = alloc_random_pkey();
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ if (nr_iterations-- < 0)
+ break;
+
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ sys_pkey_free(rpkey);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n",
+ __func__, __LINE__, ret, __read_pkey_reg(),
+ shadow_pkey_reg);
+ }
+ pkey_assert(pkey < NR_PKEYS);
+
+ ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
+ dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
+ ptr, size, orig_prot, pkey, ret);
+ pkey_assert(!ret);
+ dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx"
+ " shadow: 0x%016llx\n", __func__,
+ __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg);
+ return ret;
+}
+
+struct pkey_malloc_record {
+ void *ptr;
+ long size;
+ int prot;
+};
+struct pkey_malloc_record *pkey_malloc_records;
+struct pkey_malloc_record *pkey_last_malloc_record;
+long nr_pkey_malloc_records;
+/*
+ * Remember a pkey-protected mapping {ptr, size, prot} so that
+ * free_pkey_malloc() can later find and unmap it.
+ *
+ * A record is considered free when its ->ptr is NULL, which is how
+ * free_pkey_malloc() marks a released record.  If none of the first
+ * nr_pkey_malloc_records records is free, the array is grown with
+ * realloc() and the new tail is zeroed.
+ *
+ * The filled record also becomes pkey_last_malloc_record.
+ */
+void record_pkey_malloc(void *ptr, long size, int prot)
+{
+	long i;
+	struct pkey_malloc_record *rec = NULL;
+
+	for (i = 0; i < nr_pkey_malloc_records; i++) {
+		/*
+		 * find a free record: test the record's contents, not its
+		 * address -- the address of an array element is never NULL,
+		 * so testing it would always pick (and overwrite) record 0.
+		 */
+		if (!pkey_malloc_records[i].ptr) {
+			rec = &pkey_malloc_records[i];
+			break;
+		}
+	}
+	if (!rec) {
+		/* every record is full */
+		size_t old_nr_records = nr_pkey_malloc_records;
+		size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
+		size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
+		dprintf2("new_nr_records: %zd\n", new_nr_records);
+		dprintf2("new_size: %zd\n", new_size);
+		pkey_malloc_records = realloc(pkey_malloc_records, new_size);
+		pkey_assert(pkey_malloc_records != NULL);
+		rec = &pkey_malloc_records[nr_pkey_malloc_records];
+		/*
+		 * realloc() does not initialize memory, so zero it from
+		 * the first new record all the way to the end.
+		 */
+		for (i = 0; i < new_nr_records - old_nr_records; i++)
+			memset(rec + i, 0, sizeof(*rec));
+	}
+	dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
+		(int)(rec - pkey_malloc_records), rec, ptr, size);
+	rec->ptr = ptr;
+	rec->size = size;
+	rec->prot = prot;
+	pkey_last_malloc_record = rec;
+	nr_pkey_malloc_records++;
+}
+
+/*
+ * Unmap the recorded mapping that contains 'ptr' and release its record.
+ *
+ * 'ptr' may point anywhere inside [rec->ptr, rec->ptr + rec->size).  The
+ * whole recorded range is munmap()ed and the record is marked free by
+ * setting ->ptr to NULL.  Trips pkey_assert() if no record among the first
+ * nr_pkey_malloc_records matches.
+ */
+void free_pkey_malloc(void *ptr)
+{
+ long i;
+ int ret;
+ dprintf3("%s(%p)\n", __func__, ptr);
+ for (i = 0; i < nr_pkey_malloc_records; i++) {
+ struct pkey_malloc_record *rec = &pkey_malloc_records[i];
+ dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ /* skip records whose range does not contain 'ptr' */
+ if ((ptr < rec->ptr) ||
+ (ptr >= rec->ptr + rec->size))
+ continue;
+
+ dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
+ ptr, i, rec, rec->ptr, rec->size);
+ nr_pkey_malloc_records--;
+ ret = munmap(rec->ptr, rec->size);
+ dprintf3("munmap ret: %d\n", ret);
+ pkey_assert(!ret);
+ dprintf3("clearing rec->ptr, rec: %p\n", rec);
+ rec->ptr = NULL;
+ dprintf3("done clearing rec->ptr, rec: %p\n", rec);
+ return;
+ }
+ /* no record matched: the caller passed a pointer we never recorded */
+ pkey_assert(false);
+}
+
+
+/*
+ * Allocate 'size' bytes of anonymous private memory with protection
+ * 'prot', attach 'pkey' to it via mprotect_pkey() and record the mapping
+ * for free_pkey_malloc().  Returns the mapping's address.
+ */
+void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int ret;
+
+ read_pkey_reg();
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ /*
+ * NOTE(review): the mapping is 'size' bytes but the pkey is applied to
+ * PAGE_SIZE bytes only -- fine while callers pass PAGE_SIZE; confirm
+ * whether 'size' was intended.
+ */
+ ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+ pkey_assert(!ret);
+ record_pkey_malloc(ptr, size, prot);
+ read_pkey_reg();
+
+ dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+ return ptr;
+}
+
+void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
+{
+ int ret;
+ void *ptr;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ /*
+ * Guarantee we can fit at least one huge page in the resulting
+ * allocation by allocating space for 2:
+ */
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ record_pkey_malloc(ptr, size, prot);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ dprintf1("unaligned ptr: %p\n", ptr);
+ ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
+ dprintf1(" aligned ptr: %p\n", ptr);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
+ dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
+ ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
+ dprintf1("MADV_WILLNEED ret: %d\n", ret);
+ memset(ptr, 0, HPAGE_SIZE);
+
+ dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+int hugetlb_setup_ok;
+#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages"
+#define GET_NR_HUGE_PAGES 10
+/*
+ * Try to reserve GET_NR_HUGE_PAGES huge pages and verify through sysfs
+ * that the pool for HPAGE_SIZE-sized pages really has that many.
+ *
+ * Sets hugetlb_setup_ok to 1 on success.  On any failure (not root, sysfs
+ * file missing or unreadable, unexpected page count) a message is printed
+ * to stderr and hugetlb_setup_ok is left untouched.
+ */
+void setup_hugetlbfs(void)
+{
+	int err;
+	int fd;
+	char buf[256];
+	long hpagesz_kb;
+	long hpagesz_mb;
+
+	if (geteuid() != 0) {
+		fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
+		return;
+	}
+
+	cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
+
+	/*
+	 * Now go make sure that we got the pages and that they
+	 * are PMD-level pages. Someone might have made PUD-level
+	 * pages the default.
+	 */
+	hpagesz_kb = HPAGE_SIZE / 1024;
+	hpagesz_mb = hpagesz_kb / 1024;
+	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+
+	/* -1 to leave room for the trailing \0 */
+	err = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	if (err <= 0) {
+		fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+	/*
+	 * read() does not NUL-terminate, and buf still holds the tail of
+	 * the sysfs path written by sprintf() above.  Terminate right after
+	 * the bytes actually read so atoi() and the "%s" below only see the
+	 * file's contents.
+	 */
+	buf[err] = '\0';
+
+	if (atoi(buf) != GET_NR_HUGE_PAGES) {
+		fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+			hpagesz_mb, buf, GET_NR_HUGE_PAGES);
+		return;
+	}
+
+	hugetlb_setup_ok = 1;
+}
+
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+ if (!hugetlb_setup_ok)
+ return PTR_ERR_ENOTSUP;
+
+ dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+ size = ALIGN_UP(size, HPAGE_SIZE * 2);
+ pkey_assert(pkey < NR_PKEYS);
+ ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+ pkey_assert(ptr != (void *)-1);
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+ return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+ void *ptr;
+ int fd;
+
+ dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+ size, prot, pkey);
+ pkey_assert(pkey < NR_PKEYS);
+ fd = open("/dax/foo", O_RDWR);
+ pkey_assert(fd >= 0);
+
+ ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+ pkey_assert(ptr != (void *)-1);
+
+ mprotect_pkey(ptr, size, prot, pkey);
+
+ record_pkey_malloc(ptr, size, prot);
+
+ dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+ close(fd);
+ return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+ malloc_pkey_with_mprotect,
+ malloc_pkey_with_mprotect_subpage,
+ malloc_pkey_anon_huge,
+ malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+ malloc_pkey_mmap_direct,
+ malloc_pkey_mmap_dax,
+*/
+};
+
+/*
+ * Allocate pkey-protected memory using one of the allocators listed in
+ * pkey_malloc[].
+ *
+ * The allocator index persists across calls: it advances by one after
+ * every attempt and, once it runs off the end of the table, is re-seeded
+ * with a random index.  An allocator that answers PTR_ERR_ENOTSUP is
+ * skipped and the next one is tried.
+ */
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+	static int malloc_type;
+	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+	void *ret;
+
+	pkey_assert(pkey < NR_PKEYS);
+
+	do {
+		pkey_assert(malloc_type < nr_malloc_types);
+
+		ret = pkey_malloc[malloc_type](size, prot, pkey);
+		pkey_assert(ret != (void *)-1);
+
+		/* pick the allocator for the next attempt */
+		if (++malloc_type >= nr_malloc_types)
+			malloc_type = (random()%nr_malloc_types);
+
+		/* try again if the malloc_type we tried is unsupported */
+	} while (ret == PTR_ERR_ENOTSUP);
+
+	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+			size, prot, pkey, ret);
+	return ret;
+}
+
+int last_pkey_faults;
+#define UNKNOWN_PKEY -2
+/*
+ * Assert that exactly one pkey fault was taken since the last check and,
+ * unless 'pkey' is UNKNOWN_PKEY, that the faulting key reported by the
+ * signal handler equals 'pkey'.  Afterwards the pkey register is restored
+ * from shadow_pkey_reg and the fault bookkeeping is reset.
+ */
+void expected_pkey_fault(int pkey)
+{
+ dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+ __func__, last_pkey_faults, pkey_faults);
+ dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+ pkey_assert(last_pkey_faults + 1 == pkey_faults);
+
+ /*
+ * For exec-only memory, we do not know the pkey in
+ * advance, so skip this check.
+ */
+ if (pkey != UNKNOWN_PKEY)
+ pkey_assert(last_si_pkey == pkey);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+ /*
+ * The signal handler should have cleared out PKEY register to let the
+ * test program continue. We now have to restore it.
+ */
+ if (__read_pkey_reg() != 0)
+#else /* arch */
+ if (__read_pkey_reg() != shadow_pkey_reg)
+#endif /* arch */
+ pkey_assert(0);
+
+ __write_pkey_reg(shadow_pkey_reg);
+ dprintf1("%s() set pkey_reg=%016llx to restore state after signal "
+ "nuked it\n", __func__, shadow_pkey_reg);
+ last_pkey_faults = pkey_faults;
+ last_si_pkey = -1;
+}
+
+#define do_not_expect_pkey_fault(msg) do { \
+ if (last_pkey_faults != pkey_faults) \
+ dprintf0("unexpected PKey fault: %s\n", msg); \
+ pkey_assert(last_pkey_faults == pkey_faults); \
+} while (0)
+
+int test_fds[10] = { -1 };
+int nr_test_fds;
+/*
+ * Append 'fd' to test_fds[] so close_test_fds() can close it later.
+ * Trips pkey_assert() on a negative fd or when the table is full.
+ */
+void __save_test_fd(int fd)
+{
+ pkey_assert(fd >= 0);
+ pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
+ test_fds[nr_test_fds] = fd;
+ nr_test_fds++;
+}
+
+/*
+ * Open /etc/passwd read-only as a source of readable data, register the
+ * descriptor for cleanup and return it.  A failed open() trips the
+ * fd >= 0 assertion inside __save_test_fd().
+ */
+int get_test_read_fd(void)
+{
+ int test_fd = open("/etc/passwd", O_RDONLY);
+ __save_test_fd(test_fd);
+ return test_fd;
+}
+
+/*
+ * Close every descriptor registered through __save_test_fd() and empty
+ * the table.  Slots holding a negative value are left alone.
+ */
+void close_test_fds(void)
+{
+	int i;
+
+	for (i = 0; i < nr_test_fds; i++) {
+		if (test_fds[i] >= 0) {
+			close(test_fds[i]);
+			test_fds[i] = -1;
+		}
+	}
+	nr_test_fds = 0;
+}
+
+#define barrier() __asm__ __volatile__("": : :"memory")
+/*
+ * Perform a real load from 'ptr' and return the value read.
+ * The function is marked noinline and issues a compiler memory barrier
+ * before the load, so the access is not elided by the compiler.
+ */
+__attribute__((noinline)) int read_ptr(int *ptr)
+{
+ /*
+ * Keep GCC from optimizing this away somehow
+ */
+ barrier();
+ return *ptr;
+}
+
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+ int i, err;
+ int max_nr_pkey_allocs;
+ int alloced_pkeys[NR_PKEYS];
+ int nr_alloced = 0;
+ long size;
+
+ pkey_assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+
+ /* allocate every possible key and make sure key-0 never got allocated */
+ max_nr_pkey_allocs = NR_PKEYS;
+ for (i = 0; i < max_nr_pkey_allocs; i++) {
+ int new_pkey = alloc_pkey();
+ pkey_assert(new_pkey != 0);
+
+ if (new_pkey < 0)
+ break;
+ alloced_pkeys[nr_alloced++] = new_pkey;
+ }
+ /* free all the allocated keys */
+ for (i = 0; i < nr_alloced; i++) {
+ int free_ret;
+
+ if (!alloced_pkeys[i])
+ continue;
+ free_ret = sys_pkey_free(alloced_pkeys[i]);
+ pkey_assert(!free_ret);
+ }
+
+ /* attach key-0 in various modes */
+ err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+ pkey_assert(!err);
+ err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+ pkey_assert(!err);
+}
+
+void test_read_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling write access to PKEY[1], doing read\n");
+ pkey_write_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ dprintf1("\n");
+}
+void test_read_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
+ read_pkey_reg();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ int ptr_contents;
+
+ dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+ pkey, ptr);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("reading ptr before disabling the read : %d\n",
+ ptr_contents);
+ read_pkey_reg();
+ pkey_access_deny(pkey);
+ ptr_contents = read_ptr(ptr);
+ dprintf1("*ptr: %d\n", ptr_contents);
+ expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling write access; after accessing the page, "
+ "to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+void test_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
+ pkey_write_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+void test_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+ u16 pkey)
+{
+ *ptr = __LINE__;
+ dprintf1("disabling access; after accessing the page, "
+ " to PKEY[%02d], doing write\n", pkey);
+ pkey_access_deny(pkey);
+ *ptr = __LINE__;
+ expected_pkey_fault(pkey);
+}
+
+void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel read() to buffer\n", pkey);
+ pkey_access_deny(pkey);
+ ret = read(test_fd, ptr, 1);
+ dprintf1("read ret: %d\n", ret);
+ pkey_assert(ret);
+}
+void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ret;
+ int test_fd = get_test_read_fd();
+
+ pkey_write_deny(pkey);
+ ret = read(test_fd, ptr, 100);
+ dprintf1("read ret: %d\n", ret);
+ if (ret < 0 && (DEBUG_LEVEL > 0))
+ perror("verbose read result (OK for this to be bad)");
+ pkey_assert(ret);
+}
+
+void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
+{
+ int pipe_ret, vmsplice_ret;
+ struct iovec iov;
+ int pipe_fds[2];
+
+ pipe_ret = pipe(pipe_fds);
+
+ pkey_assert(pipe_ret == 0);
+ dprintf1("disabling access to PKEY[%02d], "
+ "having kernel vmsplice from buffer\n", pkey);
+ pkey_access_deny(pkey);
+ iov.iov_base = ptr;
+ iov.iov_len = PAGE_SIZE;
+ vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
+ dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
+ pkey_assert(vmsplice_ret == -1);
+
+ close(pipe_fds[0]);
+ close(pipe_fds[1]);
+}
+
+void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
+{
+ int ignored = 0xdada;
+ int futex_ret;
+ int some_int = __LINE__;
+
+ dprintf1("disabling write to PKEY[%02d], "
+ "doing futex gunk in buffer\n", pkey);
+ *ptr = some_int;
+ pkey_write_deny(pkey);
+ futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
+ &ignored, ignored);
+ if (DEBUG_LEVEL > 0)
+ perror("futex");
+ dprintf1("futex() ret: %d\n", futex_ret);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
+{
+ int err;
+ int i;
+
+ /* Note: 0 is the default pkey, so don't mess with it */
+ for (i = 1; i < NR_PKEYS; i++) {
+ if (pkey == i)
+ continue;
+
+ dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_pkey_free(i);
+ pkey_assert(err);
+
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
+ pkey_assert(err);
+ }
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
+{
+ int err;
+ int bad_pkey = NR_PKEYS+99;
+
+ /* pass a known-invalid pkey in: */
+ err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+ pkey_assert(err);
+}
+
+/*
+ * Continue execution in a freshly forked child: the parent exits with
+ * status 0 right after the fork and only the child returns to the caller.
+ * A failed fork() trips pkey_assert().
+ */
+void become_child(void)
+{
+	pid_t forkret = fork();
+
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	/* the parent is done; in the child we simply fall through */
+	if (forkret)
+		exit(0);
+}
+
+/*
+ * Allocate pkeys until the kernel runs out, check that the number of
+ * successful allocations never exceeds NR_PKEYS, keep going across a
+ * fork() to verify allocation state is inherited, then free every key
+ * that was obtained.
+ *
+ * Assumes that all pkeys other than 'pkey' are unallocated
+ */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+	/*
+	 * Initialized because it is printed by the debug statement in the
+	 * allocation loop before sys_pkey_free() ever assigns it.
+	 */
+	int err = 0;
+	int allocated_pkeys[NR_PKEYS] = {0};
+	int nr_allocated_pkeys = 0;
+	int i;
+
+	for (i = 0; i < NR_PKEYS*3; i++) {
+		int new_pkey;
+		dprintf1("%s() alloc loop: %d\n", __func__, i);
+		new_pkey = alloc_pkey();
+		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+				" shadow: 0x%016llx\n",
+				__func__, __LINE__, err, __read_pkey_reg(),
+				shadow_pkey_reg);
+		read_pkey_reg(); /* for shadow checking */
+		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+		if ((new_pkey == -1) && (errno == ENOSPC)) {
+			dprintf2("%s() failed to allocate pkey after %d tries\n",
+				__func__, nr_allocated_pkeys);
+		} else {
+			/*
+			 * Ensure the number of successes never
+			 * exceeds the number of keys supported
+			 * in the hardware.
+			 */
+			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+		}
+
+		/*
+		 * Make sure that allocation state is properly
+		 * preserved across fork().
+		 */
+		if (i == NR_PKEYS*2)
+			become_child();
+	}
+
+	dprintf3("%s()::%d\n", __func__, __LINE__);
+
+	/*
+	 * On x86:
+	 * There are 16 pkeys supported in hardware.  Three are
+	 * allocated by the time we get here:
+	 *   1. The default key (0)
+	 *   2. One possibly consumed by an execute-only mapping.
+	 *   3. One allocated by the test code and passed in via
+	 *      'pkey' to this function.
+	 * Ensure that we can allocate at least another 13 (16-3).
+	 *
+	 * On powerpc:
+	 * There are either 5, 28, 29 or 32 pkeys supported in
+	 * hardware depending on the page size (4K or 64K) and
+	 * platform (powernv or powervm). Four are allocated by
+	 * the time we get here. These include pkey-0, pkey-1,
+	 * exec-only pkey and the one allocated by the test code.
+	 * Ensure that we can allocate the remaining.
+	 */
+	pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
+
+	for (i = 0; i < nr_allocated_pkeys; i++) {
+		err = sys_pkey_free(allocated_pkeys[i]);
+		pkey_assert(!err);
+		read_pkey_reg(); /* for shadow checking */
+	}
+}
+
+/*
+ * pkey 0 is special. It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first. Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+ long size;
+ int prot;
+
+ assert(pkey_last_malloc_record);
+ size = pkey_last_malloc_record->size;
+ /*
+ * This is a bit of a hack. But mprotect() requires
+ * huge-page-aligned sizes when operating on hugetlbfs.
+ * So, make sure that we use something that's a multiple
+ * of a huge page when we can.
+ */
+ if (size >= HPAGE_SIZE)
+ size = HPAGE_SIZE;
+ prot = pkey_last_malloc_record->prot;
+
+ /* Use pkey 0 */
+ mprotect_pkey(ptr, size, prot, 0);
+
+ /* Make sure that we can set it back to the original pkey. */
+ mprotect_pkey(ptr, size, prot, pkey);
+}
+
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+ __attribute__((__unused__)) int peek_result;
+ pid_t child_pid;
+ void *ignored = 0;
+ long ret;
+ int status;
+ /*
+ * This is the "control" for our little experiment. Make sure
+ * we can always access it when ptracing.
+ */
+ int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+ int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+ /*
+ * Fork a child which is an exact copy of this process, of course.
+ * That means we can do all of our tests via ptrace() and then plain
+ * memory access and ensure they work differently.
+ */
+ child_pid = fork_lazy_child();
+ dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
+
+ ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
+ if (ret)
+ perror("attach");
+ dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
+ pkey_assert(ret != -1);
+ ret = waitpid(child_pid, &status, WUNTRACED);
+ if ((ret != child_pid) || !(WIFSTOPPED(status))) {
+ fprintf(stderr, "weird waitpid result %ld stat %x\n",
+ ret, status);
+ pkey_assert(0);
+ }
+ dprintf2("waitpid ret: %ld\n", ret);
+ dprintf2("waitpid status: %d\n", status);
+
+ pkey_access_deny(pkey);
+ pkey_write_deny(pkey);
+
+ /* Write access, untested for now:
+ ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
+ pkey_assert(ret != -1);
+ dprintf1("poke at %p: %ld\n", peek_at, ret);
+ */
+
+ /*
+ * Try to access the pkey-protected "ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect an exception: */
+ peek_result = read_ptr(ptr);
+ expected_pkey_fault(pkey);
+
+ /*
+ * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
+ */
+ ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
+ /* expect it to work, without an error: */
+ pkey_assert(ret != -1);
+ /* Now access from the current task, and expect NO exception: */
+ peek_result = read_ptr(plain_ptr);
+ do_not_expect_pkey_fault("read plain pointer after ptrace");
+
+ ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
+ pkey_assert(ret != -1);
+
+ ret = kill(child_pid, SIGKILL);
+ pkey_assert(ret != -1);
+
+ wait(&status);
+
+ free(plain_ptr_unaligned);
+}
+
+void *get_pointer_to_instructions(void)
+{
+ void *p1;
+
+ p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
+ dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
+ /* lots_o_noops_around_write should be page-aligned already */
+ assert(p1 == &lots_o_noops_around_write);
+
+ /* Point 'p1' at the *second* page of the function: */
+ p1 += PAGE_SIZE;
+
+ /*
+ * Try to ensure we fault this in on next touch to ensure
+ * we get an instruction fault as opposed to a data one
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+
+ return p1;
+}
+
+void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
+ pkey_assert(!ret);
+ pkey_access_deny(pkey);
+
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+ /*
+ * Make sure this is an *instruction* fault
+ */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, pkey);
+}
+
+void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
+{
+ void *p1;
+ int scratch;
+ int ptr_contents;
+ int ret;
+
+ dprintf1("%s() start\n", __func__);
+
+ p1 = get_pointer_to_instructions();
+ lots_o_noops_around_write(&scratch);
+ ptr_contents = read_ptr(p1);
+ dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
+
+ /* Use a *normal* mprotect(), not mprotect_pkey(): */
+ ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
+ pkey_assert(!ret);
+
+ /*
+ * Reset the shadow, assuming that the above mprotect()
+ * correctly changed PKRU, but to an unknown value since
+ * the actual allocated pkey is unknown.
+ */
+ shadow_pkey_reg = __read_pkey_reg();
+
+ dprintf2("pkey_reg: %016llx\n", read_pkey_reg());
+
+ /* Make sure this is an *instruction* fault */
+ madvise(p1, PAGE_SIZE, MADV_DONTNEED);
+ lots_o_noops_around_write(&scratch);
+ do_not_expect_pkey_fault("executing on PROT_EXEC memory");
+ expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY);
+
+ /*
+ * Put the memory back to non-PROT_EXEC. Should clear the
+ * exec-only pkey off the VMA and allow it to be readable
+ * again. Go to PROT_NONE first to check for a kernel bug
+ * that did not clear the pkey when doing PROT_NONE.
+ */
+ ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
+ pkey_assert(!ret);
+
+ ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
+ pkey_assert(!ret);
+ ptr_contents = read_ptr(p1);
+ do_not_expect_pkey_fault("plain read on recently PROT_EXEC area");
+}
+
+/*
+ * On a system without pkey support, a direct SYS_mprotect_key call on
+ * one page must fail (negative return).  The test is skipped when
+ * cpu_has_pkeys() returns non-zero.
+ */
+void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
+{
+ int size = PAGE_SIZE;
+ int sret;
+
+ /*
+ * NOTE(review): this branch is taken when cpu_has_pkeys() is true, yet
+ * the message says "no CPU support" -- the wording looks inverted;
+ * confirm against cpu_has_pkeys().
+ */
+ if (cpu_has_pkeys()) {
+ dprintf1("SKIP: %s: no CPU support\n", __func__);
+ return;
+ }
+
+ sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
+ pkey_assert(sret < 0);
+}
+
+void (*pkey_tests[])(int *ptr, u16 pkey) = {
+ test_read_of_write_disabled_region,
+ test_read_of_access_disabled_region,
+ test_read_of_access_disabled_region_with_page_already_mapped,
+ test_write_of_write_disabled_region,
+ test_write_of_write_disabled_region_with_page_already_mapped,
+ test_write_of_access_disabled_region,
+ test_write_of_access_disabled_region_with_page_already_mapped,
+ test_kernel_write_of_access_disabled_region,
+ test_kernel_write_of_write_disabled_region,
+ test_kernel_gup_of_access_disabled_region,
+ test_kernel_gup_write_to_write_disabled_region,
+ test_executing_on_unreadable_memory,
+ test_implicit_mprotect_exec_only_memory,
+ test_mprotect_with_pkey_0,
+ test_ptrace_of_child,
+ test_pkey_syscalls_on_non_allocated_pkey,
+ test_pkey_syscalls_bad_args,
+ test_pkey_alloc_exhaust,
+ test_pkey_alloc_free_attach_pkey0,
+};
+
+void run_tests_once(void)
+{
+ int *ptr;
+ int prot = PROT_READ|PROT_WRITE;
+
+ for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
+ int pkey;
+ int orig_pkey_faults = pkey_faults;
+
+ dprintf1("======================\n");
+ dprintf1("test %d preparing...\n", test_nr);
+
+ tracing_on();
+ pkey = alloc_random_pkey();
+ dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
+ ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
+ dprintf1("test %d starting...\n", test_nr);
+ pkey_tests[test_nr](ptr, pkey);
+ dprintf1("freeing test memory: %p\n", ptr);
+ free_pkey_malloc(ptr);
+ sys_pkey_free(pkey);
+
+ dprintf1("pkey_faults: %d\n", pkey_faults);
+ dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults);
+
+ tracing_off();
+ close_test_fds();
+
+ printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
+ dprintf1("======================\n\n");
+ }
+ iteration_nr++;
+}
+
+/* Seed shadow_pkey_reg with the value currently in the pkey register. */
+void pkey_setup_shadow(void)
+{
+ shadow_pkey_reg = __read_pkey_reg();
+}
+
+/*
+ * Test driver.
+ *
+ * Installs the signal handlers, then either
+ *  - runs only test_mprotect_pkey_on_unsupported_cpu() and exits with
+ *    status 0 when is_pkeys_supported() reports no support, or
+ *  - initializes the shadow register and hugetlbfs and runs the full
+ *    test list nr_iterations (22) times.
+ */
+int main(void)
+{
+ int nr_iterations = 22;
+ int pkeys_supported = is_pkeys_supported();
+
+ /* seeds rand(), which alloc_random_pkey() uses to pick a key */
+ srand((unsigned int)time(NULL));
+
+ setup_handlers();
+
+ printf("has pkeys: %d\n", pkeys_supported);
+
+ if (!pkeys_supported) {
+ int size = PAGE_SIZE;
+ int *ptr;
+
+ printf("running PKEY tests for unsupported CPU/OS\n");
+
+ ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+ assert(ptr != (void *)-1);
+ test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
+ exit(0);
+ }
+
+ pkey_setup_shadow();
+ printf("startup pkey_reg: %016llx\n", read_pkey_reg());
+ setup_hugetlbfs();
+
+ while (nr_iterations-- > 0)
+ run_tests_once();
+
+ printf("done (all tests OK)\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
new file mode 100755
index 000000000..a3f4f30f0
--- /dev/null
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -0,0 +1,326 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#please run as root
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Mount point used for the temporary hugetlbfs instance.
+mnt=./huge
+# Overall result of the script; set to non-zero by the individual tests below.
+exitcode=0
+
+#get huge pagesize and freepages from /proc/meminfo
+while read name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ if [ "$name" = "Hugepagesize:" ]; then
+ hpgsize_KB=$size
+ fi
+done < /proc/meminfo
+
+# Simple hugetlbfs tests have a hardcoded minimum requirement of
+# huge pages totaling 256MB (262144KB) in size. The userfaultfd
+# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take
+# both of these requirements into account and attempt to increase
+# number of huge pages available.
+nr_cpus=$(nproc)
+hpgsize_MB=$((hpgsize_KB / 1024))
+# nr_cpus huge pages, rounded up to the next multiple of 128 MB.
+half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128))
+needmem_KB=$((half_ufd_size_MB * 2 * 1024))
+
+#set proper nr_hugepages
+if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then
+ # Remember the original pool size; it is written back during cleanup.
+ nr_hugepgs=`cat /proc/sys/vm/nr_hugepages`
+ needpgs=$((needmem_KB / hpgsize_KB))
+ # Try at most twice to grow the pool by the number of missing pages.
+ tries=2
+ while [ $tries -gt 0 ] && [ $freepgs -lt $needpgs ]; do
+ lackpgs=$(( $needpgs - $freepgs ))
+ echo 3 > /proc/sys/vm/drop_caches
+ echo $(( $lackpgs + $nr_hugepgs )) > /proc/sys/vm/nr_hugepages
+ if [ $? -ne 0 ]; then
+ echo "Please run this test as root"
+ exit $ksft_skip
+ fi
+ # Re-read the number of free huge pages after resizing the pool.
+ while read name size unit; do
+ if [ "$name" = "HugePages_Free:" ]; then
+ freepgs=$size
+ fi
+ done < /proc/meminfo
+ tries=$((tries - 1))
+ done
+ if [ $freepgs -lt $needpgs ]; then
+ printf "Not enough huge pages available (%d < %d)\n" \
+ $freepgs $needpgs
+ exit 1
+ fi
+else
+ echo "no hugetlbfs support in kernel?"
+ exit 1
+fi
+
+#filter 64bit architectures
+ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64"
+if [ -z $ARCH ]; then
+ ARCH=`uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/'`
+fi
+VADDR64=0
+echo "$ARCH64STR" | grep $ARCH && VADDR64=1
+
+# Mount a private hugetlbfs instance; it stays mounted until the "#cleanup"
+# step further down, which also removes the directory again.
+mkdir $mnt
+mount -t hugetlbfs none $mnt
+
+echo "---------------------"
+echo "running hugepage-mmap"
+echo "---------------------"
+./hugepage-mmap
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+# Save the SysV shm limits, raise them for hugepage-shm and restore them
+# right after that test.
+shmmax=`cat /proc/sys/kernel/shmmax`
+shmall=`cat /proc/sys/kernel/shmall`
+echo 268435456 > /proc/sys/kernel/shmmax
+echo 4194304 > /proc/sys/kernel/shmall
+echo "--------------------"
+echo "running hugepage-shm"
+echo "--------------------"
+./hugepage-shm
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+echo $shmmax > /proc/sys/kernel/shmmax
+echo $shmall > /proc/sys/kernel/shmall
+
+echo "-------------------"
+echo "running map_hugetlb"
+echo "-------------------"
+./map_hugetlb
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "NOTE: The above hugetlb tests provide minimal coverage. Use"
+echo " https://github.com/libhugetlbfs/libhugetlbfs.git for"
+echo " hugetlb regression testing."
+
+echo "---------------------------"
+echo "running map_fixed_noreplace"
+echo "---------------------------"
+./map_fixed_noreplace
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------------------------------"
+echo "running 'gup_benchmark -U' (normal/slow gup)"
+echo "--------------------------------------------"
+./gup_benchmark -U
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "------------------------------------------"
+echo "running gup_benchmark -b (pin_user_pages)"
+echo "------------------------------------------"
+./gup_benchmark -b
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-------------------"
+echo "running userfaultfd"
+echo "-------------------"
+./userfaultfd anon 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "---------------------------"
+echo "running userfaultfd_hugetlb"
+echo "---------------------------"
+# Test requires source and destination huge pages. Size of source
+# (half_ufd_size_MB) is passed as argument to test.
+./userfaultfd hugetlb $half_ufd_size_MB 32 $mnt/ufd_test_file
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+rm -f $mnt/ufd_test_file
+
+echo "-------------------------"
+echo "running userfaultfd_shmem"
+echo "-------------------------"
+./userfaultfd shmem 128 32
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+#cleanup
+# Drop the hugetlbfs mount and write back the nr_hugepages value that was
+# saved before the pool was resized.
+umount $mnt
+rm -rf $mnt
+echo $nr_hugepgs > /proc/sys/vm/nr_hugepages
+
+# The tests below run after the hugetlbfs mount has been removed. Each one
+# prints a banner, runs the binary and records any non-zero exit status as
+# a failure in $exitcode.
+echo "-----------------------"
+echo "running compaction_test"
+echo "-----------------------"
+./compaction_test
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "----------------------"
+echo "running on-fault-limit"
+echo "----------------------"
+# Executed as user "nobody" via sudo rather than as the invoking user.
+sudo -u nobody ./on-fault-limit
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running map_populate"
+echo "--------------------"
+./map_populate
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-------------------------"
+echo "running mlock-random-test"
+echo "-------------------------"
+./mlock-random-test
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "--------------------"
+echo "running mlock2-tests"
+echo "--------------------"
+./mlock2-tests
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-----------------"
+echo "running thuge-gen"
+echo "-----------------"
+./thuge-gen
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+# The two virtual address tests only run when VADDR64 was set to a non-zero
+# value by the architecture filter earlier in this script.
+if [ $VADDR64 -ne 0 ]; then
+echo "-----------------------------"
+echo "running virtual_address_range"
+echo "-----------------------------"
+./virtual_address_range
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+
+echo "-----------------------------"
+echo "running virtual address 128TB switch test"
+echo "-----------------------------"
+./va_128TBswitch
+if [ $? -ne 0 ]; then
+ echo "[FAIL]"
+ exitcode=1
+else
+ echo "[PASS]"
+fi
+fi # VADDR64
+
+echo "------------------------------------"
+echo "running vmalloc stability smoke test"
+echo "------------------------------------"
+./test_vmalloc.sh smoke
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "------------------------------------"
+echo "running MREMAP_DONTUNMAP smoke test"
+echo "------------------------------------"
+./mremap_dontunmap
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+echo "running HMM smoke test"
+echo "------------------------------------"
+./test_hmm.sh smoke
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
+
+exit $exitcode
diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
new file mode 100755
index 000000000..0647b525a
--- /dev/null
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the test_hmm kernel test driver. It is a
+# kernel module loader plus test runner:
+# a) it loads the driver and creates the /dev/hmm_dmirror* device nodes;
+# b) it runs the hmm-tests binary located next to this script;
+# c) it unloads the driver and removes the device nodes again.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+# Exit with $ksft_skip unless the script runs as root, modprobe is
+# available and modinfo knows the $DRIVER module.
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	which modprobe > /dev/null 2>&1 || {
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	}
+
+	modinfo $DRIVER > /dev/null 2>&1 || {
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_HMM=m"
+		exit $ksft_skip
+	}
+}
+
+# Load $DRIVER and, when that succeeds, create the two character device
+# nodes using the major number listed for HMM_DMIRROR in /proc/devices.
+load_driver()
+{
+	if modprobe $DRIVER > /dev/null 2>&1; then
+		major=$(awk '$2=="HMM_DMIRROR" {print $1}' /proc/devices)
+		mknod /dev/hmm_dmirror0 c $major 0
+		mknod /dev/hmm_dmirror1 c $major 1
+	fi
+}
+
+# Counterpart of load_driver: remove the module (errors are silenced) and
+# delete any /dev/hmm_dmirror? device nodes.
+unload_driver()
+{
+ modprobe -r $DRIVER > /dev/null 2>&1
+ rm -f /dev/hmm_dmirror?
+}
+
+# Load the driver, run the hmm-tests binary that lives next to this script
+# and unload the driver again.
+#
+# The script ends with an unconditional "exit 0", so a failing hmm-tests run
+# used to be reported as success. The status of hmm-tests is now captured
+# before the driver is unloaded, and a non-zero status terminates the
+# script with that status.
+run_smoke()
+{
+	local ret
+
+	echo "Running smoke test. Note, this test provides basic coverage."
+
+	load_driver
+	"$(dirname "${BASH_SOURCE[0]}")/hmm-tests"
+	ret=$?
+	# Always clean up, even when the tests failed.
+	unload_driver
+
+	if [ $ret -ne 0 ]; then
+		exit $ret
+	fi
+}
+
+# Print the help text to stdout and terminate the script with status 0.
+usage()
+{
+	printf '%s\n' \
+		"Usage: $0" \
+		"Example usage:" \
+		"" \
+		"# Shows help message" \
+		"./${TEST_NAME}.sh" \
+		"" \
+		"# Smoke testing" \
+		"./${TEST_NAME}.sh smoke" \
+		""
+	exit 0
+}
+
+# Dispatch on the first argument: "smoke" runs the smoke test, anything
+# else (including no argument at all) prints the usage text.
+function run_test()
+{
+	case "$1" in
+	smoke)
+		run_smoke
+		;;
+	*)
+		usage
+		;;
+	esac
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh
new file mode 100755
index 000000000..06d2bb109
--- /dev/null
+++ b/tools/testing/selftests/vm/test_vmalloc.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+# a) analyse performance of vmalloc allocations;
+# b) stressing and stability check of vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# It is also possible to pass any supported parameters manually.
+#
+PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="test_repeat_count=20"
+
+# Exit with $ksft_skip (so the run is reported as skipped rather than
+# failed) unless all prerequisites are met: root privileges, a modprobe
+# binary and a $DRIVER module that modinfo can find.
+check_test_requirements()
+{
+ uid=$(id -u)
+ if [ $uid -ne 0 ]; then
+ echo "$0: Must be run as root"
+ exit $ksft_skip
+ fi
+
+ if ! which modprobe > /dev/null 2>&1; then
+ echo "$0: You need modprobe installed"
+ exit $ksft_skip
+ fi
+
+ if ! modinfo $DRIVER > /dev/null 2>&1; then
+ echo "$0: You must have the following enabled in your kernel:"
+ echo "CONFIG_TEST_VMALLOC=m"
+ exit $ksft_skip
+ fi
+}
+
+run_perfformance_check()
+{
+ echo "Run performance tests to evaluate how fast vmalloc allocation is."
+ echo "It runs all test cases on one single CPU with sequential order."
+
+ modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+ echo "Done."
+ echo "Ccheck the kernel message buffer to see the summary."
+}
+
+# Load $DRIVER with the $STRESS_PARAM template. The modprobe output and
+# exit status are discarded.
+run_stability_check()
+{
+	printf '%s\n' \
+		"Run stability tests. In order to stress vmalloc subsystem we run" \
+		"all available test cases on all available CPUs simultaneously." \
+		"It will take time, so be patient."
+
+	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+	printf '%s\n' \
+		"Done." \
+		"Check the kernel ring buffer to see the summary."
+}
+
+# Load $DRIVER with the $SMOKE_PARAM template. The modprobe output and exit
+# status are discarded.
+run_smoke_check()
+{
+	printf '%s\n' \
+		"Run smoke test. Note, this test provides basic coverage." \
+		"Please check $0 output how it can be used" \
+		"for deep performance analysis as well as stress testing."
+
+	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+	printf '%s\n' \
+		"Done." \
+		"Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+ echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | "
+ echo "manual parameters"
+ echo
+ echo "Valid tests and parameters:"
+ echo
+ modinfo $DRIVER
+ echo
+ echo "Example usage:"
+ echo
+ echo "# Shows help message"
+ echo "./${DRIVER}.sh"
+ echo
+ echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs"
+ echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5"
+ echo
+ echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
+ echo "sequential order"
+ echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 "
+ echo "run_test_mask=23"
+ echo
+ echo -n "# Runs all tests on all online CPUs, shuffled order, repeats "
+ echo "20 times"
+ echo "./${DRIVER}.sh test_repeat_count=20"
+ echo
+ echo "# Performance analysis"
+ echo "./${DRIVER}.sh performance"
+ echo
+ echo "# Stress testing"
+ echo "./${DRIVER}.sh stress"
+ echo
+ exit 0
+}
+
+# Validate "key=value" module parameters given on the command line.
+#
+# Arguments: the parameters to check, one "key=value" pair per argument.
+# A pair is accepted when the key is one of the "parm:" names reported by
+# `modinfo $DRIVER` and the value is a number greater than zero.
+# On the first invalid pair an error is printed and the script exits with
+# $exitcode.
+function validate_passed_args()
+{
+	local passed_arg key val valid valid_arg
+
+	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+	#
+	# Something has been passed, check it.
+	#
+	for passed_arg in "$@"; do
+		# Everything before the first '=' is the key, the rest the value.
+		key=${passed_arg%%=*}
+		val="${passed_arg:$((${#key}+1))}"
+		valid=0
+
+		# Only accept plain decimal or 0x-prefixed hex literals. "-gt"
+		# inside [[ ]] evaluates its operands as arithmetic expressions,
+		# so an unchecked value could reference variables or run command
+		# substitutions.
+		if [[ $val =~ ^(0[xX][0-9a-fA-F]+|[0-9]+)$ ]] && [[ $val -gt 0 ]]; then
+			for valid_arg in $VALID_ARGS; do
+				if [[ $key == "$valid_arg" ]]; then
+					valid=1
+					break
+				fi
+			done
+		fi
+
+		if [[ $valid -ne 1 ]]; then
+			echo "Error: key or value is not correct: ${key} $val"
+			exit $exitcode
+		fi
+	done
+}
+
+# Run the driver with user supplied "key=value" module parameters.
+# Arguments: the parameters, handed unchanged to validation and modprobe.
+function run_manual_check()
+{
+	#
+	# Validate passed parameters. If there is a wrong one,
+	# the script exits and does not execute further.
+	#
+	validate_passed_args "$@"
+
+	echo "Run the test with following parameters: $*"
+	# "$@" is quoted so each parameter reaches modprobe as typed.
+	modprobe $DRIVER "$@" > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+# Dispatch on the first argument: no argument prints the usage text, the
+# keywords performance/stress/smoke select a predefined parameter template,
+# anything else is treated as a list of manual module parameters.
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	fi
+
+	case "$1" in
+	performance)
+		run_perfformance_check
+		;;
+	stress)
+		run_stability_check
+		;;
+	smoke)
+		run_smoke_check
+		;;
+	*)
+		run_manual_check $@
+		;;
+	esac
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
diff --git a/tools/testing/selftests/vm/thuge-gen.c b/tools/testing/selftests/vm/thuge-gen.c
new file mode 100644
index 000000000..361ef7192
--- /dev/null
+++ b/tools/testing/selftests/vm/thuge-gen.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+ Before running this huge pages for each huge page size must have been
+ reserved.
+ For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
+ Also shmmax must be increased.
+ And you need to run as root to work around some weird permissions in shm.
+ And nothing using huge pages should run in parallel.
+ When the program aborts you may need to clean up the shm segments with
+ ipcrm -m by hand, like this
+ sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m
+ (warning this will remove all if someone else uses them) */
+
+#define _GNU_SOURCE 1
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <glob.h>
+#include <assert.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <string.h>
+
+#define err(x) perror(x), exit(1)
+
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
+#define MAP_HUGE_SHIFT 26
+#define MAP_HUGE_MASK 0x3f
+#if !defined(MAP_HUGETLB)
+#define MAP_HUGETLB 0x40000
+#endif
+
+#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */
+#define SHM_HUGE_SHIFT 26
+#define SHM_HUGE_MASK 0x3f
+#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
+#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
+
+#define NUM_PAGESIZES 5
+
+#define NUM_PAGES 4
+
+#define Dprintf(fmt...) // printf(fmt)
+
+unsigned long page_sizes[NUM_PAGESIZES];
+int num_page_sizes;
+
+/* Return the smallest shift count l for which (1UL << l) >= v. */
+int ilog2(unsigned long v)
+{
+	int shift;
+
+	for (shift = 0; (1UL << shift) < v; shift++)
+		;
+	return shift;
+}
+
+/*
+ * Fill page_sizes[] / num_page_sizes from the hugepages-<N>kB directories
+ * found under /sys/kernel/mm/hugepages. Sizes are stored in bytes. The
+ * program asserts if more than NUM_PAGESIZES directories are found.
+ */
+void find_pagesizes(void)
+{
+ glob_t g;
+ int i;
+ glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g);
+ assert(g.gl_pathc <= NUM_PAGESIZES);
+ for (i = 0; i < g.gl_pathc; i++) {
+ sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB",
+ &page_sizes[i]);
+ /* the directory name gives the size in kB; convert to bytes */
+ page_sizes[i] <<= 10;
+ printf("Found %luMB\n", page_sizes[i] >> 20);
+ }
+ num_page_sizes = g.gl_pathc;
+ globfree(&g);
+}
+
+/*
+ * Return the default huge page size in bytes, taken from the
+ * "Hugepagesize:" line of /proc/meminfo.
+ *
+ * Returns 0 when /proc/meminfo cannot be opened or has no such line.
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+			/* value is reported in kB; convert to bytes */
+			hps <<= 10;
+			break;
+		}
+	}
+	free(line);
+	/* the stream was previously leaked on every call */
+	fclose(f);
+	return hps;
+}
+
+/*
+ * Print "<size>MB: " followed by the content of the free_hugepages sysfs
+ * file for page size ps (bytes). Nothing is printed for the base page size.
+ */
+void show(unsigned long ps)
+{
+	char cmd[100];
+
+	if (ps != getpagesize()) {
+		printf("%luMB: ", ps >> 20);
+		/* flush first so the output of the child process follows the label */
+		fflush(stdout);
+		snprintf(cmd, sizeof cmd,
+			 "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			 ps >> 10);
+		system(cmd);
+	}
+}
+
+/*
+ * Read one unsigned long from the first line of a file.
+ *
+ * @warn: when non-zero, print "missing <path>" if the file cannot be opened
+ * @fmt:  printf-style format (plus arguments) that builds the file path
+ *
+ * Returns the parsed value, or 0 when the file cannot be opened or no
+ * number could be read.
+ */
+unsigned long read_sysfs(int warn, char *fmt, ...)
+{
+	unsigned long result = 0;
+	size_t cap = 0;
+	char *text = NULL;
+	char path[100];
+	va_list args;
+	FILE *fp;
+
+	va_start(args, fmt);
+	vsnprintf(path, sizeof path, fmt, args);
+	va_end(args);
+
+	fp = fopen(path, "r");
+	if (fp == NULL) {
+		if (warn)
+			printf("missing %s\n", path);
+		return 0;
+	}
+
+	if (getline(&text, &cap, fp) > 0)
+		sscanf(text, "%lu", &result);
+
+	fclose(fp);
+	free(text);
+	return result;
+}
+
+/*
+ * Return the free_hugepages count for page size ps (bytes). A missing
+ * sysfs file is only reported when ps is not the base page size.
+ */
+unsigned long read_free(unsigned long ps)
+{
+	int warn = ps != getpagesize();
+	unsigned long kb = ps >> 10;
+
+	return read_sysfs(warn,
+			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			  kb);
+}
+
+/*
+ * Map NUM_PAGES huge pages of the given size anonymously, touch them and
+ * check that the free huge page count for that size dropped by NUM_PAGES.
+ *
+ * @size:  huge page size in bytes
+ * @flags: extra mmap flags, OR-ed with MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB
+ *
+ * Exits with status 1 if mmap fails; asserts on an accounting mismatch.
+ */
+void test_mmap(unsigned long size, unsigned flags)
+{
+	char *map;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+		   MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+	if (map == MAP_FAILED) err("mmap");
+	/* fault in every page so the huge pages are really allocated */
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	/*
+	 * Unmap the whole mapping. Passing only "size" released a single
+	 * page and left the other NUM_PAGES - 1 huge pages mapped, which
+	 * skews the free page accounting of the tests that follow.
+	 */
+	err = munmap(map, size*NUM_PAGES);
+	assert(!err);
+}
+
+/*
+ * Create a private SysV shared memory segment of NUM_PAGES pages of the
+ * given size, attach and touch it, and check that the free huge page
+ * count for that size dropped by NUM_PAGES (skipped for the base page size).
+ *
+ * @size:  page size in bytes
+ * @flags: extra shmget flags, OR-ed with IPC_CREAT|0600
+ *
+ * Exits with status 1 if shmget, shmctl(SHM_INFO) or shmat fails.
+ */
+void test_shmget(unsigned long size, unsigned flags)
+{
+ int id;
+ unsigned long before, after;
+ int err;
+
+ before = read_free(size);
+ id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+ if (id < 0) err("shmget");
+
+ struct shm_info i;
+ if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
+ Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+
+
+ Dprintf("id %d\n", id);
+ char *map = shmat(id, NULL, 0600);
+ if (map == (char*)-1) err("shmat");
+
+ /* mark the segment for removal right after attaching it */
+ shmctl(id, IPC_RMID, NULL);
+
+ /* touch every page so the pages are really allocated */
+ memset(map, 0xff, size*NUM_PAGES);
+ after = read_free(size);
+
+ Dprintf("before %lu after %lu diff %ld size %lu\n",
+ before, after, before - after, size);
+ assert(size == getpagesize() || (before - after) == NUM_PAGES);
+ show(size);
+ err = shmdt(map);
+ assert(!err);
+}
+
+/*
+ * Check the preconditions of the test and exit(0) with a hint when one is
+ * not met:
+ *  - at least NUM_PAGES free huge pages for every detected page size,
+ *  - shmmax large enough for NUM_PAGES pages of the largest size,
+ *  - on x86-64, 1GB pages being the largest available size.
+ */
+void sanity_checks(void)
+{
+	int i;
+	unsigned long largest = getpagesize();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		if (page_sizes[i] > largest)
+			largest = page_sizes[i];
+
+		if (read_free(page_sizes[i]) < NUM_PAGES) {
+			printf("Not enough huge pages for page size %lu MB, need %u\n",
+			       page_sizes[i] >> 20,
+			       NUM_PAGES);
+			exit(0);
+		}
+	}
+
+	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
+		/* terminate the hint with a newline like the other messages */
+		printf("Please do echo %lu > /proc/sys/kernel/shmmax\n",
+		       largest * NUM_PAGES);
+		exit(0);
+	}
+
+#if defined(__x86_64__)
+	if (largest != 1U<<30) {
+		printf("No GB pages available on x86-64\n"
+		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+		exit(0);
+	}
+#endif
+}
+
+/*
+ * Run the mmap and shmget tests for every detected huge page size, for
+ * the default huge page size and (shmget only) for the base page size.
+ */
+int main(void)
+{
+	int i;
+	/*
+	 * Keep the full unsigned long: default_huge_page_size() returns
+	 * bytes, and a plain "unsigned" truncates sizes of 4GB and above.
+	 */
+	unsigned long default_hps = default_huge_page_size();
+
+	find_pagesizes();
+
+	sanity_checks();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		/* encode log2(page size) in the MAP_HUGE_* flag bits */
+		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+		test_mmap(ps, MAP_HUGETLB | arg);
+	}
+	printf("Testing default huge mmap\n");
+	/* mmap takes MAP_* flags; SHM_HUGETLB is a shmget flag */
+	test_mmap(default_hps, MAP_HUGETLB);
+
+	puts("Testing non-huge shmget");
+	test_shmget(getpagesize(), 0);
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		/* encode log2(page size) in the SHM_HUGE_* flag bits */
+		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+		test_shmget(ps, SHM_HUGETLB | arg);
+	}
+	puts("default huge shmget");
+	test_shmget(default_hps, SHM_HUGETLB);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c
new file mode 100644
index 000000000..fd7f1b4a9
--- /dev/null
+++ b/tools/testing/selftests/vm/transhuge-stress.c
@@ -0,0 +1,144 @@
+/*
+ * Stress test for transparent huge pages, memory compaction and migration.
+ *
+ * Authors: Konstantin Khlebnikov <koct9i@gmail.com>
+ *
+ * This is free and unencumbered software released into the public domain.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+
+#define PAGE_SHIFT 12
+#define HPAGE_SHIFT 21
+
+#define PAGE_SIZE (1 << PAGE_SHIFT)
+#define HPAGE_SIZE (1 << HPAGE_SHIFT)
+
+#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0)
+#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1))
+
+int pagemap_fd;
+
+/*
+ * Try to back the HPAGE_SIZE region at ptr with one transparent huge page.
+ *
+ * The region is remapped with MAP_FIXED, marked MADV_HUGEPAGE and touched
+ * once. The first two pagemap entries of the region are then read through
+ * pagemap_fd.
+ *
+ * Returns the PFN of the first page when both pages are present, their PFNs
+ * are consecutive and the first PFN is aligned to the huge page size;
+ * returns -1 otherwise. Exits with status 2 if mmap, madvise or the
+ * pagemap read fails.
+ */
+int64_t allocate_transhuge(void *ptr)
+{
+ uint64_t ent[2];
+
+ /* drop pmd */
+ if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_ANONYMOUS |
+ MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr)
+ errx(2, "mmap transhuge");
+
+ if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ /* allocate transparent huge page */
+ *(volatile void **)ptr = ptr;
+
+ /* file offset = (ptr / PAGE_SIZE) * 8, i.e. one 8-byte entry per page */
+ if (pread(pagemap_fd, ent, sizeof(ent),
+ (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent))
+ err(2, "read pagemap");
+
+ if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) &&
+ PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) &&
+ !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1)))
+ return PAGEMAP_PFN(ent[0]);
+
+ return -1;
+}
+
+/*
+ * Usage: transhuge-stress [size in MiB]
+ *
+ * Without an argument the working set size is derived from the amount of
+ * physical memory. The program then loops forever: in every pass it tries
+ * to allocate a transparent huge page for each HPAGE_SIZE slot of the
+ * region, splits it again and prints timing and success statistics.
+ */
+int main(int argc, char **argv)
+{
+ size_t ram, len;
+ void *ptr, *p;
+ struct timespec a, b;
+ double s;
+ uint8_t *map;
+ size_t map_len;
+
+ /* physical memory in bytes, clamped to SIZE_MAX / 4 to avoid overflow */
+ ram = sysconf(_SC_PHYS_PAGES);
+ if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4)
+ ram = SIZE_MAX / 4;
+ else
+ ram *= sysconf(_SC_PAGESIZE);
+
+ if (argc == 1)
+ len = ram;
+ else if (!strcmp(argv[1], "-h"))
+ errx(1, "usage: %s [size in MiB]", argv[0]);
+ else
+ len = atoll(argv[1]) << 20;
+
+ warnx("allocate %zd transhuge pages, using %zd MiB virtual memory"
+ " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20,
+ len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1));
+
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err(2, "open pagemap");
+
+ /* round len down to whole huge pages; map one extra for alignment */
+ len -= len % HPAGE_SIZE;
+ ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0);
+ if (ptr == MAP_FAILED)
+ err(2, "initial mmap");
+ /* advance ptr to the next HPAGE_SIZE boundary */
+ ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE;
+
+ if (madvise(ptr, len, MADV_HUGEPAGE))
+ err(2, "MADV_HUGEPAGE");
+
+ /* one byte per huge-page-sized PFN group; grown on demand below */
+ map_len = ram >> (HPAGE_SHIFT - 1);
+ map = malloc(map_len);
+ if (!map)
+ errx(2, "map malloc");
+
+ while (1) {
+ int nr_succeed = 0, nr_failed = 0, nr_pages = 0;
+
+ memset(map, 0, map_len);
+
+ clock_gettime(CLOCK_MONOTONIC, &a);
+ for (p = ptr; p < ptr + len; p += HPAGE_SIZE) {
+ int64_t pfn;
+
+ pfn = allocate_transhuge(p);
+
+ if (pfn < 0) {
+ nr_failed++;
+ } else {
+ /* index of the physical huge page this PFN belongs to */
+ size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT);
+
+ nr_succeed++;
+ if (idx >= map_len) {
+ map = realloc(map, idx + 1);
+ if (!map)
+ errx(2, "map realloc");
+ memset(map + map_len, 0, idx + 1 - map_len);
+ map_len = idx + 1;
+ }
+ /* count each physical huge page only once per pass */
+ if (!map[idx])
+ nr_pages++;
+ map[idx] = 1;
+ }
+
+ /* split transhuge page, keep last page */
+ if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED))
+ err(2, "MADV_DONTNEED");
+ }
+ clock_gettime(CLOCK_MONOTONIC, &b);
+ /* elapsed time of this pass in seconds */
+ s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.;
+
+ warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t"
+ "%4d succeed, %4d failed, %4d different pages",
+ s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20),
+ nr_succeed, nr_failed, nr_pages);
+ }
+}
diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
new file mode 100644
index 000000000..034245ea3
--- /dev/null
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -0,0 +1,1559 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Stress userfaultfd syscall.
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ *
+ * This test allocates two virtual areas and bounces the physical
+ * memory across the two virtual areas (from area_src to area_dst)
+ * using userfaultfd.
+ *
+ * There are three threads running per CPU:
+ *
+ * 1) one per-CPU thread takes a per-page pthread_mutex in a random
+ * page of the area_dst (while the physical page may still be in
+ * area_src), and increments a per-page counter in the same page,
+ * and checks its value against a verification region.
+ *
+ * 2) another per-CPU thread handles the userfaults generated by
+ * thread 1 above. userfaultfd blocking reads or poll() modes are
+ * exercised interleaved.
+ *
+ * 3) one last per-CPU thread transfers the memory in the background
+ * at maximum bandwidth (if not already transferred by thread
+ * 2). Each cpu thread takes cares of transferring a portion of the
+ * area.
+ *
+ * When all threads of type 3 completed the transfer, one bounce is
+ * complete. area_src and area_dst are then swapped. All threads are
+ * respawned and so the bounce is immediately restarted in the
+ * opposite direction.
+ *
+ * per-CPU threads 1 by triggering userfaults inside
+ * pthread_mutex_lock will also verify the atomicity of the memory
+ * transfer (UFFDIO_COPY).
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <signal.h>
+#include <poll.h>
+#include <string.h>
+#include <linux/mman.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <pthread.h>
+#include <linux/userfaultfd.h>
+#include <setjmp.h>
+#include <stdbool.h>
+#include <assert.h>
+
+#include "../kselftest.h"
+
+#ifdef __NR_userfaultfd
+
+static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
+
+#define BOUNCE_RANDOM (1<<0)
+#define BOUNCE_RACINGFAULTS (1<<1)
+#define BOUNCE_VERIFY (1<<2)
+#define BOUNCE_POLL (1<<3)
+static int bounces;
+
+#define TEST_ANON 1
+#define TEST_HUGETLB 2
+#define TEST_SHMEM 3
+static int test_type;
+
+/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
+#define ALARM_INTERVAL_SECS 10
+static volatile bool test_uffdio_copy_eexist = true;
+static volatile bool test_uffdio_zeropage_eexist = true;
+/* Whether to test uffd write-protection */
+static bool test_uffdio_wp = false;
+
+static bool map_shared;
+static int huge_fd;
+static char *huge_fd_off0;
+static unsigned long long *count_verify;
+static int uffd, uffd_flags, finished, *pipefd;
+static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
+static char *zeropage;
+pthread_attr_t attr;
+
+/* Userfaultfd test statistics */
+struct uffd_stats {
+ int cpu;
+ unsigned long missing_faults;
+ unsigned long wp_faults;
+};
+
+/* pthread_mutex_t starts at page offset 0 */
+#define area_mutex(___area, ___nr) \
+ ((pthread_mutex_t *) ((___area) + (___nr)*page_size))
+/*
+ * count is placed in the page after pthread_mutex_t naturally aligned
+ * to avoid non alignment faults on non-x86 archs.
+ */
+#define area_count(___area, ___nr) \
+ ((volatile unsigned long long *) ((unsigned long) \
+ ((___area) + (___nr)*page_size + \
+ sizeof(pthread_mutex_t) + \
+ sizeof(unsigned long long) - 1) & \
+ ~(unsigned long)(sizeof(unsigned long long) \
+ - 1)))
+
+const char *examples =
+ "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
+ "./userfaultfd anon 100 99999\n\n"
+ "# Run share memory test on 1GiB region with 99 bounces:\n"
+ "./userfaultfd shmem 1000 99\n\n"
+ "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
+ "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
+ "# Run the same hugetlb test but using shmem:\n"
+ "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
+ "# 10MiB-~6GiB 999 bounces anonymous test, "
+ "continue forever unless an error triggers\n"
+ "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
+
+/* Print the command line help and the examples to stderr, then exit(1). */
+static void usage(void)
+{
+	fputs("\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
+	      "[hugetlbfs_file]\n\n"
+	      "Supported <test type>: anon, hugetlb, "
+	      "hugetlb_shared, shmem\n\n"
+	      "Examples:\n\n", stderr);
+	fputs(examples, stderr);
+	exit(1);
+}
+
+/*
+ * Reset the first n_cpus entries of uffd_stats: each entry gets its own
+ * index as cpu number and both fault counters cleared.
+ */
+static void uffd_stats_reset(struct uffd_stats *uffd_stats,
+			     unsigned long n_cpus)
+{
+	int cpu;
+
+	for (cpu = 0; cpu < n_cpus; cpu++) {
+		struct uffd_stats *entry = &uffd_stats[cpu];
+
+		entry->cpu = cpu;
+		entry->missing_faults = 0;
+		entry->wp_faults = 0;
+	}
+}
+
+/*
+ * Print the missing and write-protect fault totals over the first n_cpus
+ * entries of stats, each followed by the per-cpu values joined with '+'.
+ */
+static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
+{
+ int i;
+ unsigned long long miss_total = 0, wp_total = 0;
+
+ for (i = 0; i < n_cpus; i++) {
+ miss_total += stats[i].missing_faults;
+ wp_total += stats[i].wp_faults;
+ }
+
+ printf("userfaults: %llu missing (", miss_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].missing_faults);
+ /* "\b" backs up over the trailing '+' of the last per-cpu value */
+ printf("\b), %llu wp (", wp_total);
+ for (i = 0; i < n_cpus; i++)
+ printf("%lu+", stats[i].wp_faults);
+ printf("\b)\n");
+}
+
+/*
+ * Drop the pages of an anonymous area of nr_pages * page_size bytes with
+ * MADV_DONTNEED. Returns 0 on success, 1 (after perror) on failure.
+ */
+static int anon_release_pages(char *rel_area)
+{
+	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED) == 0)
+		return 0;
+
+	perror("madvise");
+	return 1;
+}
+
+/*
+ * Map nr_pages * page_size bytes of private anonymous memory and store the
+ * address in *alloc_area; *alloc_area is set to NULL when the mapping
+ * fails.
+ */
+static void anon_allocate_area(void **alloc_area)
+{
+	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+	if (*alloc_area == MAP_FAILED) {
+		/* newline added to match the message in shmem_allocate_area() */
+		fprintf(stderr, "mmap of anonymous memory failed\n");
+		*alloc_area = NULL;
+	}
+}
+
+/*
+ * alias_mapping callback for memory types without an alias mapping:
+ * leaves *start untouched.
+ */
+static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+}
+
+/* HugeTLB memory */
+/*
+ * Punch a hole of nr_pages * page_size bytes into the hugetlbfs file. The
+ * hole starts at offset 0 when rel_area is the mapping recorded in
+ * huge_fd_off0, otherwise at nr_pages * page_size.
+ * Returns 0 on success, 1 (after perror) on failure.
+ */
+static int hugetlb_release_pages(char *rel_area)
+{
+	off_t length = nr_pages * page_size;
+	off_t offset = (rel_area == huge_fd_off0) ? 0 : nr_pages * page_size;
+
+	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		      offset, length) == 0)
+		return 0;
+
+	perror("fallocate");
+	return 1;
+}
+
+/*
+ * Map nr_pages * page_size bytes of the hugetlbfs file huge_fd and store
+ * the address in *alloc_area (NULL on failure).
+ *
+ * The file offset is 0 when *alloc_area compares equal to area_src and
+ * nr_pages * page_size otherwise.
+ * NOTE(review): presumably callers pass &area_src or &area_dst, which is
+ * what makes the "*alloc_area == area_src" comparison meaningful - confirm.
+ *
+ * With map_shared set, a second MAP_SHARED mapping of the same file range
+ * is created and stored in area_src_alias / area_dst_alias.
+ */
+static void hugetlb_allocate_area(void **alloc_area)
+{
+ void *area_alias = NULL;
+ char **alloc_area_alias;
+
+ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ (map_shared ? MAP_SHARED : MAP_PRIVATE) |
+ MAP_HUGETLB,
+ huge_fd, *alloc_area == area_src ? 0 :
+ nr_pages * page_size);
+ if (*alloc_area == MAP_FAILED) {
+ perror("mmap of hugetlbfs file failed");
+ goto fail;
+ }
+
+ if (map_shared) {
+ area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HUGETLB,
+ huge_fd, *alloc_area == area_src ? 0 :
+ nr_pages * page_size);
+ if (area_alias == MAP_FAILED) {
+ perror("mmap of hugetlb file alias failed");
+ goto fail_munmap;
+ }
+ }
+
+ if (*alloc_area == area_src) {
+ /* remember which mapping covers file offset 0 (used when punching holes) */
+ huge_fd_off0 = *alloc_area;
+ alloc_area_alias = &area_src_alias;
+ } else {
+ alloc_area_alias = &area_dst_alias;
+ }
+ if (area_alias)
+ *alloc_area_alias = area_alias;
+
+ return;
+
+fail_munmap:
+ /* the alias mapping failed: undo the primary mapping as well */
+ if (munmap(*alloc_area, nr_pages * page_size) < 0) {
+ perror("hugetlb munmap");
+ exit(1);
+ }
+fail:
+ *alloc_area = NULL;
+}
+
+/*
+ * alias_mapping callback for hugetlb: with map_shared set, redirect *start
+ * to the same offset inside the area_dst_alias mapping. Without map_shared
+ * *start is left untouched. len is unused.
+ */
+static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ if (!map_shared)
+ return;
+ /*
+ * We can't zap just the pagetable with hugetlbfs because
+ * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
+ * mapping where the pagetables are not established initially,
+ * this way we'll exercise the -EEXIST at the fs level.
+ */
+ *start = (unsigned long) area_dst_alias + offset;
+}
+
+/* Shared memory */
+/*
+ * Release the pages of a shared memory area of nr_pages * page_size bytes
+ * with MADV_REMOVE. Returns 0 on success, 1 (after perror) on failure.
+ */
+static int shmem_release_pages(char *rel_area)
+{
+	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE) == 0)
+		return 0;
+
+	perror("madvise");
+	return 1;
+}
+
+/*
+ * Map nr_pages * page_size bytes of shared anonymous memory and store the
+ * address in *alloc_area; NULL is stored when the mapping fails.
+ */
+static void shmem_allocate_area(void **alloc_area)
+{
+	void *area;
+
+	area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+	if (area == MAP_FAILED) {
+		fprintf(stderr, "shared memory mmap failed\n");
+		area = NULL;
+	}
+	*alloc_area = area;
+}
+
+/* Per-memory-type hooks and expectations used by the tests below. */
+struct uffd_test_ops {
+ /* ioctl bits that UFFDIO_REGISTER must report for this memory type */
+ unsigned long expected_ioctls;
+ /* map a test area; leaves *alloc_area NULL on failure */
+ void (*allocate_area)(void **alloc_area);
+ /* drop the pages backing a test area; non-zero return means failure */
+ int (*release_pages)(char *rel_area);
+ /* optionally redirect *start to an alias mapping at the given offset */
+ void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
+};
+
+/* ioctl bits expected from UFFDIO_REGISTER on shared memory */
+#define SHMEM_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE))
+
+/* Anonymous memory: same as shmem plus UFFDIO_WRITEPROTECT */
+#define ANON_EXPECTED_IOCTLS ((1 << _UFFDIO_WAKE) | \
+ (1 << _UFFDIO_COPY) | \
+ (1 << _UFFDIO_ZEROPAGE) | \
+ (1 << _UFFDIO_WRITEPROTECT))
+
+/* Hooks for anonymous memory; no alias mapping is used */
+static struct uffd_test_ops anon_uffd_test_ops = {
+ .expected_ioctls = ANON_EXPECTED_IOCTLS,
+ .allocate_area = anon_allocate_area,
+ .release_pages = anon_release_pages,
+ .alias_mapping = noop_alias_mapping,
+};
+
+/* Hooks for shared memory; no alias mapping is used */
+static struct uffd_test_ops shmem_uffd_test_ops = {
+ .expected_ioctls = SHMEM_EXPECTED_IOCTLS,
+ .allocate_area = shmem_allocate_area,
+ .release_pages = shmem_release_pages,
+ .alias_mapping = noop_alias_mapping,
+};
+
+/* Hooks for hugetlbfs; only the basic range ioctls are expected */
+static struct uffd_test_ops hugetlb_uffd_test_ops = {
+ .expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC,
+ .allocate_area = hugetlb_allocate_area,
+ .release_pages = hugetlb_release_pages,
+ .alias_mapping = hugetlb_alias_mapping,
+};
+
+/* Hooks for the memory type under test; assigned in set_test_type() */
+static struct uffd_test_ops *uffd_test_ops;
+
+/*
+ * Byte-by-byte comparison of the first n bytes of str1 and str2.
+ * Returns 1 at the first differing byte, 0 if all n bytes are equal.
+ */
+static int my_bcmp(char *str1, char *str2, size_t n)
+{
+	size_t pos = 0;
+
+	while (pos < n) {
+		if (str1[pos] != str2[pos])
+			return 1;
+		pos++;
+	}
+	return 0;
+}
+
+/*
+ * Set (wp == true) or clear (wp == false) userfaultfd write protection on
+ * the range [start, start + len) through UFFDIO_WRITEPROTECT on ufd.
+ * Exits the process if the ioctl fails.
+ */
+static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
+{
+	struct uffdio_writeprotect prms = { 0 };
+
+	prms.range.start = start;
+	prms.range.len = len;
+	/* Mode 0 undoes the write-protect and does the wakeup after that */
+	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
+
+	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
+		/* Name the operation that actually failed */
+		fprintf(stderr, "%s WP failed for address 0x%llx\n",
+			wp ? "set" : "clear", (unsigned long long)start);
+		exit(1);
+	}
+}
+
+/*
+ * Worker thread that touches pages of area_dst until `finished` is set.
+ * arg carries the cpu index. Each iteration picks a page (random with
+ * BOUNCE_RANDOM, sequential otherwise), optionally checks it with
+ * BOUNCE_VERIFY, then under the page's mutex checks the page counter
+ * against count_verify[] and increments both. Exits the process on any
+ * inconsistency. Always returns NULL.
+ */
+static void *locking_thread(void *arg)
+{
+ unsigned long cpu = (unsigned long) arg;
+ struct random_data rand;
+ unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
+ int32_t rand_nr;
+ unsigned long long count;
+ char randstate[64];
+ unsigned int seed;
+ time_t start;
+
+ if (bounces & BOUNCE_RANDOM) {
+ /* Without BOUNCE_RACINGFAULTS each thread gets its own seed */
+ seed = (unsigned int) time(NULL) - bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ seed += cpu;
+ bzero(&rand, sizeof(rand));
+ bzero(&randstate, sizeof(randstate));
+ if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
+ fprintf(stderr, "srandom_r error\n");
+ exit(1);
+ }
+ } else {
+ /* Sequential mode: per-cpu starting page unless racing */
+ page_nr = -bounces;
+ if (!(bounces & BOUNCE_RACINGFAULTS))
+ page_nr += cpu * nr_pages_per_cpu;
+ }
+
+ while (!finished) {
+ if (bounces & BOUNCE_RANDOM) {
+ if (random_r(&rand, &rand_nr)) {
+ fprintf(stderr, "random_r 1 error\n");
+ exit(1);
+ }
+ page_nr = rand_nr;
+ /* Fill the upper half when page_nr is wider than rand_nr */
+ if (sizeof(page_nr) > sizeof(rand_nr)) {
+ if (random_r(&rand, &rand_nr)) {
+ fprintf(stderr, "random_r 2 error\n");
+ exit(1);
+ }
+ page_nr |= (((unsigned long) rand_nr) << 16) <<
+ 16;
+ }
+ } else
+ page_nr += 1;
+ page_nr %= nr_pages;
+
+ start = time(NULL);
+ if (bounces & BOUNCE_VERIFY) {
+ /* Unlocked read: the counter must never be seen as zero */
+ count = *area_count(area_dst, page_nr);
+ if (!count) {
+ fprintf(stderr,
+ "page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]);
+ exit(1);
+ }
+
+
+ /*
+ * We can't use bcmp (or memcmp) because that
+ * returns 0 erroneously if the memory is
+ * changing under it (even if the end of the
+ * page is never changing and always
+ * different).
+ */
+#if 1
+ if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size)) {
+ fprintf(stderr,
+ "my_bcmp page_nr %lu wrong count %Lu %Lu\n",
+ page_nr, count, count_verify[page_nr]);
+ exit(1);
+ }
+#else
+ unsigned long loops;
+
+ loops = 0;
+ /* uncomment the below line to test with mutex */
+ /* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
+ while (!bcmp(area_dst + page_nr * page_size, zeropage,
+ page_size)) {
+ loops += 1;
+ if (loops > 10)
+ break;
+ }
+ /* uncomment below line to test with mutex */
+ /* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
+ if (loops) {
+ fprintf(stderr,
+ "page_nr %lu all zero thread %lu %p %lu\n",
+ page_nr, cpu, area_dst + page_nr * page_size,
+ loops);
+ if (loops > 10)
+ exit(1);
+ }
+#endif
+ }
+
+ /* Locked check and update of the per-page counter */
+ pthread_mutex_lock(area_mutex(area_dst, page_nr));
+ count = *area_count(area_dst, page_nr);
+ if (count != count_verify[page_nr]) {
+ fprintf(stderr,
+ "page_nr %lu memory corruption %Lu %Lu\n",
+ page_nr, count,
+ count_verify[page_nr]); exit(1);
+ }
+ count++;
+ *area_count(area_dst, page_nr) = count_verify[page_nr] = count;
+ pthread_mutex_unlock(area_mutex(area_dst, page_nr));
+
+ /* Only a warning: the iteration took more than one second */
+ if (time(NULL) - start > 1)
+ fprintf(stderr,
+ "userfault too slow %ld "
+ "possible false positive with overcommit\n",
+ time(NULL) - start);
+ }
+
+ return NULL;
+}
+
+/*
+ * Repeat a UFFDIO_COPY that already succeeded, after giving the ops a
+ * chance to redirect the destination to an alias mapping. The retry must
+ * fail with -EEXIST; any other outcome exits the process.
+ */
+static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
+			    unsigned long offset)
+{
+	int failed;
+
+	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
+				     uffdio_copy->len,
+				     offset);
+	failed = ioctl(ufd, UFFDIO_COPY, uffdio_copy);
+	if (!failed) {
+		fprintf(stderr, "UFFDIO_COPY retry unexpected %Ld\n",
+			uffdio_copy->copy);
+		exit(1);
+	}
+
+	/* real retval in ufdio_copy.copy */
+	if (uffdio_copy->copy == -EEXIST)
+		return;
+
+	fprintf(stderr, "UFFDIO_COPY retry error %Ld\n",
+		uffdio_copy->copy);
+	exit(1);
+}
+
+/*
+ * UFFDIO_COPY one page from area_src + offset to area_dst + offset.
+ * Returns 1 if this call copied the page, 0 if the ioctl failed with
+ * -EEXIST. Any other result exits the process. When retry is set and
+ * test_uffdio_copy_eexist is armed, the copy is repeated once to
+ * exercise the -EEXIST path.
+ */
+static int __copy_page(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_copy uffdio_copy;
+
+	if (offset >= nr_pages * page_size) {
+		fprintf(stderr, "unexpected offset %lu\n", offset);
+		exit(1);
+	}
+
+	uffdio_copy.dst = (unsigned long) area_dst + offset;
+	uffdio_copy.src = (unsigned long) area_src + offset;
+	uffdio_copy.len = page_size;
+	uffdio_copy.mode = 0;
+	if (test_uffdio_wp)
+		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
+	uffdio_copy.copy = 0;
+
+	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
+		/* real retval in ufdio_copy.copy */
+		if (uffdio_copy.copy == -EEXIST)
+			return 0;
+		fprintf(stderr, "UFFDIO_COPY error %Ld\n",
+			uffdio_copy.copy);
+		exit(1);
+	}
+
+	if (uffdio_copy.copy != page_size) {
+		fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
+			uffdio_copy.copy);
+		exit(1);
+	}
+
+	if (test_uffdio_copy_eexist && retry) {
+		test_uffdio_copy_eexist = false;
+		retry_copy_page(ufd, &uffdio_copy, offset);
+	}
+	return 1;
+}
+
+/* Copy one page, allowing the one-shot -EEXIST retry exercise. */
+static int copy_page_retry(int ufd, unsigned long offset)
+{
+	const bool allow_retry = true;
+
+	return __copy_page(ufd, offset, allow_retry);
+}
+
+/* Copy one page without the -EEXIST retry exercise. */
+static int copy_page(int ufd, unsigned long offset)
+{
+	const bool allow_retry = false;
+
+	return __copy_page(ufd, offset, allow_retry);
+}
+
+/*
+ * Read one struct uffd_msg from ufd into *msg.
+ * Returns 0 when a full message was read and 1 when the read failed with
+ * EAGAIN (nothing to read yet). Any other error or a short read exits
+ * the process.
+ */
+static int uffd_read_msg(int ufd, struct uffd_msg *msg)
+{
+	/* Read from the descriptor the caller passed, not the global one */
+	ssize_t ret = read(ufd, msg, sizeof(*msg));
+
+	if (ret != (ssize_t)sizeof(*msg)) {
+		if (ret < 0) {
+			if (errno == EAGAIN)
+				return 1;
+			perror("blocking read error");
+		} else {
+			fprintf(stderr, "short read\n");
+		}
+		exit(1);
+	}
+
+	return 0;
+}
+
+/*
+ * Resolve one page-fault message. Write-protect faults are resolved by
+ * clearing the protection on the faulting page; missing faults are
+ * resolved by copying the page from area_src. The matching counter in
+ * *stats is incremented. Exits the process on any other event, or on a
+ * write fault while BOUNCE_VERIFY is active.
+ */
+static void uffd_handle_page_fault(struct uffd_msg *msg,
+				   struct uffd_stats *stats)
+{
+	unsigned long offset;
+
+	if (msg->event != UFFD_EVENT_PAGEFAULT) {
+		fprintf(stderr, "unexpected msg event %u\n", msg->event);
+		exit(1);
+	}
+
+	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
+		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
+		stats->wp_faults++;
+		return;
+	}
+
+	/* Missing page faults */
+	if ((bounces & BOUNCE_VERIFY) &&
+	    (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)) {
+		fprintf(stderr, "unexpected write fault\n");
+		exit(1);
+	}
+
+	/* Page-aligned offset of the faulting address inside area_dst */
+	offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
+	offset &= ~(page_size-1);
+
+	if (copy_page(uffd, offset))
+		stats->missing_faults++;
+}
+
+/*
+ * Monitor thread that uses poll(). arg is the struct uffd_stats for this
+ * thread; stats->cpu selects the pipe (pipefd[cpu*2]) polled next to the
+ * userfaultfd. The loop ends once a byte has been read from that pipe.
+ * Page faults are resolved through uffd_handle_page_fault(); fork, remove
+ * and remap events are handled inline. Always returns NULL.
+ */
+static void *uffd_poll_thread(void *arg)
+{
+ struct uffd_stats *stats = (struct uffd_stats *)arg;
+ unsigned long cpu = stats->cpu;
+ struct pollfd pollfd[2];
+ struct uffd_msg msg;
+ struct uffdio_register uffd_reg;
+ int ret;
+ char tmp_chr;
+
+ pollfd[0].fd = uffd;
+ pollfd[0].events = POLLIN;
+ pollfd[1].fd = pipefd[cpu*2];
+ pollfd[1].events = POLLIN;
+
+ for (;;) {
+ /* Infinite timeout, so a return of 0 is treated as an error */
+ ret = poll(pollfd, 2, -1);
+ if (!ret) {
+ fprintf(stderr, "poll error %d\n", ret);
+ exit(1);
+ }
+ if (ret < 0) {
+ perror("poll");
+ exit(1);
+ }
+ /* A byte on the pipe is the request to stop */
+ if (pollfd[1].revents & POLLIN) {
+ if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
+ fprintf(stderr, "read pipefd error\n");
+ exit(1);
+ }
+ break;
+ }
+ if (!(pollfd[0].revents & POLLIN)) {
+ fprintf(stderr, "pollfd[0].revents %d\n",
+ pollfd[0].revents);
+ exit(1);
+ }
+ /* Non-zero means EAGAIN: nothing to read, poll again */
+ if (uffd_read_msg(uffd, &msg))
+ continue;
+ switch (msg.event) {
+ default:
+ fprintf(stderr, "unexpected msg event %u\n",
+ msg.event); exit(1);
+ break;
+ case UFFD_EVENT_PAGEFAULT:
+ uffd_handle_page_fault(&msg, stats);
+ break;
+ case UFFD_EVENT_FORK:
+ /* Switch monitoring to the descriptor delivered with the event */
+ close(uffd);
+ uffd = msg.arg.fork.ufd;
+ pollfd[0].fd = uffd;
+ break;
+ case UFFD_EVENT_REMOVE:
+ /* Unregister the range reported as removed */
+ uffd_reg.range.start = msg.arg.remove.start;
+ uffd_reg.range.len = msg.arg.remove.end -
+ msg.arg.remove.start;
+ if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
+ fprintf(stderr, "remove failure\n");
+ exit(1);
+ }
+ break;
+ case UFFD_EVENT_REMAP:
+ /* Follow the monitored area to its new address */
+ area_dst = (char *)(unsigned long)msg.arg.remap.to;
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/*
+ * Monitor thread that uses read() on the userfaultfd. It releases
+ * uffd_read_mutex once started, then resolves page faults forever. The
+ * loop has no exit of its own; stress() ends this thread with
+ * pthread_cancel().
+ */
+static void *uffd_read_thread(void *arg)
+{
+	struct uffd_stats *stats = (struct uffd_stats *)arg;
+	struct uffd_msg msg;
+
+	pthread_mutex_unlock(&uffd_read_mutex);
+	/* from here cancellation is ok */
+
+	while (1) {
+		if (!uffd_read_msg(uffd, &msg))
+			uffd_handle_page_fault(&msg, stats);
+	}
+
+	return NULL;
+}
+
+/*
+ * Copy this cpu's slice of pages (nr_pages_per_cpu pages starting at
+ * cpu * nr_pages_per_cpu) into area_dst in two halves. arg carries the
+ * cpu index. Always returns NULL.
+ */
+static void *background_thread(void *arg)
+{
+	unsigned long cpu = (unsigned long) arg;
+	unsigned long first = cpu * nr_pages_per_cpu;
+	unsigned long last = (cpu+1) * nr_pages_per_cpu;
+	unsigned long half = (first + last) / 2;
+	unsigned long pg;
+
+	/* Copy the first half of the pages */
+	pg = first;
+	while (pg < half) {
+		copy_page_retry(uffd, pg * page_size);
+		pg++;
+	}
+
+	/*
+	 * If we need to test uffd-wp, set it up now. Then we'll have
+	 * at least the first half of the pages mapped already which
+	 * can be write-protected for testing
+	 */
+	if (test_uffdio_wp)
+		wp_range(uffd, (unsigned long)area_dst + first * page_size,
+			 nr_pages_per_cpu * page_size, true);
+
+	/*
+	 * Continue the 2nd half of the page copying, handling write
+	 * protection faults if any
+	 */
+	pg = half;
+	while (pg < last) {
+		copy_page_retry(uffd, pg * page_size);
+		pg++;
+	}
+
+	return NULL;
+}
+
+/*
+ * Run one bounce pass. For every cpu start a locking thread, a uffd
+ * monitor thread (poll or read flavour, chosen by BOUNCE_POLL) and a
+ * background copy thread. Wait for the background threads, release
+ * area_src, stop the locking threads and finally stop the monitor
+ * threads. uffd_stats[cpu] is handed to the monitor thread of that cpu.
+ * Returns 0 on success, 1 on any failure.
+ */
+static int stress(struct uffd_stats *uffd_stats)
+{
+	unsigned long cpu;
+	pthread_t locking_threads[nr_cpus];
+	pthread_t uffd_threads[nr_cpus];
+	pthread_t background_threads[nr_cpus];
+
+	finished = 0;
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pthread_create(&locking_threads[cpu], &attr,
+				   locking_thread, (void *)cpu))
+			return 1;
+		if (bounces & BOUNCE_POLL) {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_poll_thread,
+					   (void *)&uffd_stats[cpu]))
+				return 1;
+		} else {
+			if (pthread_create(&uffd_threads[cpu], &attr,
+					   uffd_read_thread,
+					   (void *)&uffd_stats[cpu]))
+				return 1;
+			/* Wait until the read thread has started running */
+			pthread_mutex_lock(&uffd_read_mutex);
+		}
+		if (pthread_create(&background_threads[cpu], &attr,
+				   background_thread, (void *)cpu))
+			return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(background_threads[cpu], NULL))
+			return 1;
+
+	/*
+	 * Be strict and immediately zap area_src, the whole area has
+	 * been transferred already by the background threads. The
+	 * area_src could then be faulted in in a racy way by still
+	 * running uffdio_threads reading zeropages after we zapped
+	 * area_src (but they're guaranteed to get -EEXIST from
+	 * UFFDIO_COPY without writing zero pages into area_dst
+	 * because the background threads already completed).
+	 */
+	if (uffd_test_ops->release_pages(area_src))
+		return 1;
+
+
+	finished = 1;
+	for (cpu = 0; cpu < nr_cpus; cpu++)
+		if (pthread_join(locking_threads[cpu], NULL))
+			return 1;
+
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		/* Only the arrival of the byte matters, but don't send garbage */
+		char c = 0;
+
+		if (bounces & BOUNCE_POLL) {
+			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
+				fprintf(stderr, "pipefd write error\n");
+				return 1;
+			}
+			/*
+			 * The poll thread updates uffd_stats[cpu] through
+			 * the pointer it was given and returns NULL. Asking
+			 * pthread_join() to store that return pointer into
+			 * uffd_stats[cpu] would overwrite the start of the
+			 * struct, so no return value is requested.
+			 */
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		} else {
+			if (pthread_cancel(uffd_threads[cpu]))
+				return 1;
+			if (pthread_join(uffd_threads[cpu], NULL))
+				return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Create the global userfaultfd `uffd`, record its file status flags in
+ * uffd_flags and negotiate the API with the requested feature bits.
+ * Returns 0 on success and a negative value on failure. Callers in this
+ * file test the result with "< 0", so the failure value must be negative
+ * for the error to be noticed.
+ */
+static int userfaultfd_open(int features)
+{
+	struct uffdio_api uffdio_api;
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd < 0) {
+		fprintf(stderr,
+			"userfaultfd syscall not available in this kernel\n");
+		return -1;
+	}
+	/*
+	 * uffd_flags is later passed to fcntl(F_SETFL), so it has to hold
+	 * the file status flags (F_GETFL), not the descriptor flags.
+	 */
+	uffd_flags = fcntl(uffd, F_GETFL, NULL);
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = features;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+		fprintf(stderr, "UFFDIO_API\n");
+		return -1;
+	}
+	if (uffdio_api.api != UFFD_API) {
+		fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Jump buffer for the SIGBUS tests; sigbuf is NULL until one is armed */
+sigjmp_buf jbuf, *sigbuf;
+
+/*
+ * SIGBUS handler: jump back to the armed sigsetjmp() point, or abort if
+ * none is armed. Other signals are ignored.
+ */
+static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
+{
+	if (sig != SIGBUS)
+		return;
+	if (!sigbuf)
+		abort();
+	siglongjmp(*sigbuf, 1);
+}
+
+/*
+ * For non-cooperative userfaultfd test we fork() a process that will
+ * generate pagefaults, will mremap the area monitored by the
+ * userfaultfd and at last this process will release the monitored
+ * area.
+ * For the anonymous and shared memory the area is divided into two
+ * parts, the first part is accessed before mremap, and the second
+ * part is accessed after mremap. Since hugetlbfs does not support
+ * mremap, the entire monitored area is accessed in a single pass for
+ * HUGETLB_TEST.
+ * The release of the pages currently generates event for shmem and
+ * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
+ * for hugetlb.
+ * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register
+ * monitored area, generate pagefaults and test that signal is delivered.
+ * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2
+ * test robustness use case - we release monitored area, fork a process
+ * that will generate pagefaults and verify signal is generated.
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal
+ * feature. Using monitor thread, verify no userfault events are generated.
+ */
+/*
+ * Touch the pages of area_dst to generate userfaults.
+ *
+ * signal_test == 0: check the counters of the first part of the area,
+ *   mremap() area_dst onto area_src, check the rest, release the pages
+ *   and check they read back as zero (for hugetlb the whole area is
+ *   checked in one pass and the function returns before mremap).
+ * signal_test == 1: every fault arrives as SIGBUS; resolve it with
+ *   UFFDIO_COPY (or by clearing write protection) and retry.
+ * signal_test == 2: every fault arrives as SIGBUS and is only counted.
+ *
+ * Returns 0 on success and non-zero on failure; memory corruption exits
+ * the process.
+ */
+static int faulting_process(int signal_test)
+{
+	unsigned long nr;
+	unsigned long long count;
+	unsigned long split_nr_pages;
+	struct sigaction act;
+	/*
+	 * These are modified after sigsetjmp() returned and read again
+	 * after a later siglongjmp(); without volatile their values would
+	 * be indeterminate at that point.
+	 */
+	volatile unsigned long lastnr = (unsigned long)-1;
+	volatile unsigned long signalled = 0;
+
+	if (test_type != TEST_HUGETLB)
+		split_nr_pages = (nr_pages + 1) / 2;
+	else
+		split_nr_pages = nr_pages;
+
+	if (signal_test) {
+		sigbuf = &jbuf;
+		memset(&act, 0, sizeof(act));
+		act.sa_sigaction = sighndl;
+		act.sa_flags = SA_SIGINFO;
+		if (sigaction(SIGBUS, &act, 0)) {
+			perror("sigaction");
+			return 1;
+		}
+	}
+
+	for (nr = 0; nr < split_nr_pages; nr++) {
+		/* volatile for the same reason as lastnr and signalled */
+		volatile int steps = 1;
+		unsigned long offset = nr * page_size;
+
+		if (signal_test) {
+			if (sigsetjmp(*sigbuf, 1) != 0) {
+				if (steps == 1 && nr == lastnr) {
+					fprintf(stderr, "Signal repeated\n");
+					return 1;
+				}
+
+				lastnr = nr;
+				if (signal_test == 1) {
+					if (steps == 1) {
+						/* This is a MISSING request */
+						steps++;
+						if (copy_page(uffd, offset))
+							signalled++;
+					} else {
+						/* This is a WP request */
+						assert(steps == 2);
+						wp_range(uffd,
+							 (__u64)area_dst +
+							 offset,
+							 page_size, false);
+					}
+				} else {
+					signalled++;
+					continue;
+				}
+			}
+		}
+
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr]) {
+			fprintf(stderr,
+				"nr %lu memory corruption %Lu %Lu\n",
+				nr, count,
+				count_verify[nr]);
+			/* Corruption is fatal, as in the second pass below */
+			exit(1);
+		}
+		/*
+		 * Trigger write protection if there is by writing
+		 * the same value back.
+		 */
+		*area_count(area_dst, nr) = count;
+	}
+
+	if (signal_test)
+		return signalled != split_nr_pages;
+
+	if (test_type == TEST_HUGETLB)
+		return 0;
+
+	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
+			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
+	if (area_dst == MAP_FAILED) {
+		perror("mremap");
+		exit(1);
+	}
+
+	for (; nr < nr_pages; nr++) {
+		count = *area_count(area_dst, nr);
+		if (count != count_verify[nr]) {
+			fprintf(stderr,
+				"nr %lu memory corruption %Lu %Lu\n",
+				nr, count,
+				count_verify[nr]);
+			exit(1);
+		}
+		/*
+		 * Trigger write protection if there is by writing
+		 * the same value back.
+		 */
+		*area_count(area_dst, nr) = count;
+	}
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	/* After the release every page must read back as zeroes */
+	for (nr = 0; nr < nr_pages; nr++) {
+		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
+			fprintf(stderr, "nr %lu is not zero\n", nr);
+			exit(1);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Repeat a UFFDIO_ZEROPAGE that already succeeded, after giving the ops
+ * a chance to redirect the range to an alias mapping. The retry must
+ * fail with -EEXIST; any other outcome exits the process.
+ */
+static void retry_uffdio_zeropage(int ufd,
+				  struct uffdio_zeropage *uffdio_zeropage,
+				  unsigned long offset)
+{
+	int failed;
+
+	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
+				     uffdio_zeropage->range.len,
+				     offset);
+	failed = ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage);
+	if (!failed) {
+		fprintf(stderr, "UFFDIO_ZEROPAGE retry unexpected %Ld\n",
+			uffdio_zeropage->zeropage);
+		exit(1);
+	}
+
+	if (uffdio_zeropage->zeropage == -EEXIST)
+		return;
+
+	fprintf(stderr, "UFFDIO_ZEROPAGE retry error %Ld\n",
+		uffdio_zeropage->zeropage);
+	exit(1);
+}
+
+/*
+ * Issue UFFDIO_ZEROPAGE for the page at area_dst + offset and check the
+ * result against what the memory type under test supports.
+ * Returns 1 when a zero page was installed. Returns 0 when the ioctl is
+ * not among the expected ioctls and failed with -EINVAL as it should.
+ * Every other outcome exits the process.
+ */
+static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
+{
+	struct uffdio_zeropage uffdio_zeropage;
+	int ret;
+	unsigned long has_zeropage;
+
+	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
+
+	if (offset >= nr_pages * page_size) {
+		fprintf(stderr, "unexpected offset %lu\n", offset);
+		exit(1);
+	}
+	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
+	uffdio_zeropage.range.len = page_size;
+	uffdio_zeropage.mode = 0;
+	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
+
+	if (ret) {
+		/* real retval in ufdio_zeropage.zeropage */
+		if (!has_zeropage) {
+			if (uffdio_zeropage.zeropage == -EINVAL)
+				return 0;
+			fprintf(stderr,
+				"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
+				uffdio_zeropage.zeropage);
+			exit(1);
+		}
+		if (uffdio_zeropage.zeropage == -EEXIST)
+			fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n");
+		else
+			fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
+				uffdio_zeropage.zeropage);
+		exit(1);
+	}
+
+	if (!has_zeropage) {
+		fprintf(stderr,
+			"UFFDIO_ZEROPAGE succeeded %Ld\n",
+			uffdio_zeropage.zeropage);
+		exit(1);
+	}
+
+	if (uffdio_zeropage.zeropage != page_size) {
+		fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
+			uffdio_zeropage.zeropage);
+		exit(1);
+	}
+
+	if (test_uffdio_zeropage_eexist && retry) {
+		test_uffdio_zeropage_eexist = false;
+		retry_uffdio_zeropage(ufd, &uffdio_zeropage,
+				      offset);
+	}
+	return 1;
+}
+
+/* UFFDIO_ZEROPAGE one page without the -EEXIST retry exercise. */
+static int uffdio_zeropage(int ufd, unsigned long offset)
+{
+	const bool allow_retry = false;
+
+	return __uffdio_zeropage(ufd, offset, allow_retry);
+}
+
+/* exercise UFFDIO_ZEROPAGE */
+/*
+ * Register area_dst on a fresh userfaultfd and issue UFFDIO_ZEROPAGE on
+ * its first page; if a page was installed, check that it reads as zero.
+ * Returns 0 on success, 1 if the setup failed. Register or verification
+ * failures exit the process.
+ */
+static int userfaultfd_zeropage_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+
+	printf("testing UFFDIO_ZEROPAGE: ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	/* Any non-zero return is a failure, whatever its sign */
+	if (userfaultfd_open(0))
+		return 1;
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+		fprintf(stderr, "register failure\n");
+		exit(1);
+	}
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) !=
+	    expected_ioctls) {
+		fprintf(stderr,
+			"unexpected missing ioctl for anon memory\n");
+		exit(1);
+	}
+
+	if (uffdio_zeropage(uffd, 0)) {
+		if (my_bcmp(area_dst, zeropage, page_size)) {
+			fprintf(stderr, "zeropage is not zero\n");
+			exit(1);
+		}
+	}
+
+	close(uffd);
+	printf("done.\n");
+	return 0;
+}
+
+/*
+ * Non-cooperative events test. Register area_dst with the fork, remap
+ * and remove event features, start a poll monitor thread, then fork a
+ * child that runs faulting_process(0). The test passes when the monitor
+ * resolved exactly nr_pages missing faults.
+ * Returns 0 on success, non-zero on failure. In the child, returns the
+ * result of faulting_process(0).
+ */
+static int userfaultfd_events_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	/* Only the arrival of the byte matters, but don't send garbage */
+	char c = 0;
+	struct uffd_stats stats = { 0 };
+
+	printf("testing events (fork, remap, remove): ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
+		UFFD_FEATURE_EVENT_REMOVE;
+	/* Any non-zero return is a failure, whatever its sign */
+	if (userfaultfd_open(features))
+		return 1;
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+		fprintf(stderr, "register failure\n");
+		exit(1);
+	}
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
+		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
+		exit(1);
+	}
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
+		perror("uffd_poll_thread create");
+		exit(1);
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork");
+		exit(1);
+	}
+
+	if (!pid)
+		return faulting_process(0);
+
+	waitpid(pid, &err, 0);
+	if (err) {
+		fprintf(stderr, "faulting process failed\n");
+		exit(1);
+	}
+
+	/* Ask the monitor thread to stop, then wait for it */
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
+		perror("pipe write");
+		exit(1);
+	}
+	if (pthread_join(uffd_mon, NULL))
+		return 1;
+
+	close(uffd);
+
+	uffd_stats_report(&stats, 1);
+
+	return stats.missing_faults != nr_pages;
+}
+
+/*
+ * UFFD_FEATURE_SIGBUS test. First run faulting_process(1) in this
+ * process, where every fault must arrive as SIGBUS and is resolved by
+ * hand. Then release the area, start a poll monitor thread and fork a
+ * child running faulting_process(2); the monitor must not resolve any
+ * userfault while the child runs.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int userfaultfd_sig_test(void)
+{
+	struct uffdio_register uffdio_register;
+	unsigned long expected_ioctls;
+	unsigned long userfaults;
+	pthread_t uffd_mon;
+	int err, features;
+	pid_t pid;
+	/* Only the arrival of the byte matters, but don't send garbage */
+	char c = 0;
+	struct uffd_stats stats = { 0 };
+
+	printf("testing signal delivery: ");
+	fflush(stdout);
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
+	/* Any non-zero return is a failure, whatever its sign */
+	if (userfaultfd_open(features))
+		return 1;
+	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+
+	uffdio_register.range.start = (unsigned long) area_dst;
+	uffdio_register.range.len = nr_pages * page_size;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (test_uffdio_wp)
+		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+		fprintf(stderr, "register failure\n");
+		exit(1);
+	}
+
+	expected_ioctls = uffd_test_ops->expected_ioctls;
+	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
+		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
+		exit(1);
+	}
+
+	if (faulting_process(1)) {
+		fprintf(stderr, "faulting process failed\n");
+		exit(1);
+	}
+
+	if (uffd_test_ops->release_pages(area_dst))
+		return 1;
+
+	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
+		perror("uffd_poll_thread create");
+		exit(1);
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		perror("fork");
+		exit(1);
+	}
+
+	if (!pid)
+		exit(faulting_process(2));
+
+	waitpid(pid, &err, 0);
+	if (err) {
+		fprintf(stderr, "faulting process failed\n");
+		exit(1);
+	}
+
+	/* Ask the monitor thread to stop, then wait for it */
+	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
+		perror("pipe write");
+		exit(1);
+	}
+	if (pthread_join(uffd_mon, NULL))
+		return 1;
+
+	/*
+	 * uffd_poll_thread() always returns NULL, so its return value says
+	 * nothing about userfaults. The faults it resolved are counted in
+	 * the stats structure it was given.
+	 */
+	userfaults = stats.missing_faults + stats.wp_faults;
+
+	printf("done.\n");
+	if (userfaults)
+		fprintf(stderr, "Signal test failed, userfaults: %ld\n",
+			userfaults);
+	close(uffd);
+	return userfaults != 0;
+}
+
+/*
+ * Main test driver. Allocate area_src and area_dst, initialise the
+ * per-page mutex and counter in area_src, create one pipe per cpu, then
+ * run `bounces` passes. Each pass registers area_dst, runs stress(),
+ * unregisters, optionally verifies the counters and swaps the two areas.
+ * After the last pass the zeropage, signal and events tests are run.
+ * Returns 0 on success, non-zero on failure.
+ */
+static int userfaultfd_stress(void)
+{
+	void *area;
+	char *tmp_area;
+	unsigned long nr;
+	struct uffdio_register uffdio_register;
+	unsigned long cpu;
+	int err;
+	struct uffd_stats uffd_stats[nr_cpus];
+
+	uffd_test_ops->allocate_area((void **)&area_src);
+	if (!area_src)
+		return 1;
+	uffd_test_ops->allocate_area((void **)&area_dst);
+	if (!area_dst)
+		return 1;
+
+	/* Any non-zero return is a failure, whatever its sign */
+	if (userfaultfd_open(0))
+		return 1;
+
+	count_verify = malloc(nr_pages * sizeof(unsigned long long));
+	if (!count_verify) {
+		perror("count_verify");
+		return 1;
+	}
+
+	for (nr = 0; nr < nr_pages; nr++) {
+		*area_mutex(area_src, nr) = (pthread_mutex_t)
+			PTHREAD_MUTEX_INITIALIZER;
+		count_verify[nr] = *area_count(area_src, nr) = 1;
+		/*
+		 * In the transition between 255 to 256, powerpc will
+		 * read out of order in my_bcmp and see both bytes as
+		 * zero, so leave a placeholder below always non-zero
+		 * after the count, to avoid my_bcmp to trigger false
+		 * positives.
+		 */
+		*(area_count(area_src, nr) + 1) = 1;
+	}
+
+	pipefd = malloc(sizeof(int) * nr_cpus * 2);
+	if (!pipefd) {
+		perror("pipefd");
+		return 1;
+	}
+	for (cpu = 0; cpu < nr_cpus; cpu++) {
+		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
+			perror("pipe");
+			return 1;
+		}
+	}
+
+	if (posix_memalign(&area, page_size, page_size)) {
+		fprintf(stderr, "out of memory\n");
+		return 1;
+	}
+	zeropage = area;
+	bzero(zeropage, page_size);
+
+	/* Each uffd_read_thread unlocks this once it has started */
+	pthread_mutex_lock(&uffd_read_mutex);
+
+	pthread_attr_init(&attr);
+	pthread_attr_setstacksize(&attr, 16*1024*1024);
+
+	err = 0;
+	while (bounces--) {
+		unsigned long expected_ioctls;
+
+		printf("bounces: %d, mode:", bounces);
+		if (bounces & BOUNCE_RANDOM)
+			printf(" rnd");
+		if (bounces & BOUNCE_RACINGFAULTS)
+			printf(" racing");
+		if (bounces & BOUNCE_VERIFY)
+			printf(" ver");
+		if (bounces & BOUNCE_POLL)
+			printf(" poll");
+		printf(", ");
+		fflush(stdout);
+
+		if (bounces & BOUNCE_POLL)
+			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
+		else
+			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
+
+		/* register */
+		uffdio_register.range.start = (unsigned long) area_dst;
+		uffdio_register.range.len = nr_pages * page_size;
+		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+		if (test_uffdio_wp)
+			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
+		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+			fprintf(stderr, "register failure\n");
+			return 1;
+		}
+		expected_ioctls = uffd_test_ops->expected_ioctls;
+		if ((uffdio_register.ioctls & expected_ioctls) !=
+		    expected_ioctls) {
+			fprintf(stderr,
+				"unexpected missing ioctl for anon memory\n");
+			return 1;
+		}
+
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long)
+				area_dst_alias;
+			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
+				fprintf(stderr, "register failure alias\n");
+				return 1;
+			}
+		}
+
+		/*
+		 * The madvise done previously isn't enough: some
+		 * uffd_thread could have read userfaults (one of
+		 * those already resolved by the background thread)
+		 * and it may be in the process of calling
+		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
+		 * area_src and it would map a zero page in it (of
+		 * course such a UFFDIO_COPY is perfectly safe as it'd
+		 * return -EEXIST). The problem comes at the next
+		 * bounce though: that racing UFFDIO_COPY would
+		 * generate zeropages in the area_src, so invalidating
+		 * the previous MADV_DONTNEED. Without this additional
+		 * MADV_DONTNEED those zeropages leftovers in the
+		 * area_src would lead to -EEXIST failure during the
+		 * next bounce, effectively leaving a zeropage in the
+		 * area_dst.
+		 *
+		 * Try to comment this out madvise to see the memory
+		 * corruption being caught pretty quick.
+		 *
+		 * khugepaged is also inhibited to collapse THP after
+		 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
+		 * required to MADV_DONTNEED here.
+		 */
+		if (uffd_test_ops->release_pages(area_dst))
+			return 1;
+
+		uffd_stats_reset(uffd_stats, nr_cpus);
+
+		/* bounce pass */
+		if (stress(uffd_stats))
+			return 1;
+
+		/* Clear all the write protections if there is any */
+		if (test_uffdio_wp)
+			wp_range(uffd, (unsigned long)area_dst,
+				 nr_pages * page_size, false);
+
+		/*
+		 * unregister: range.start still holds the alias address
+		 * when an alias was registered above, so with an alias this
+		 * first call covers the alias and the second one area_dst.
+		 */
+		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
+			fprintf(stderr, "unregister failure\n");
+			return 1;
+		}
+		if (area_dst_alias) {
+			uffdio_register.range.start = (unsigned long) area_dst;
+			if (ioctl(uffd, UFFDIO_UNREGISTER,
+				  &uffdio_register.range)) {
+				fprintf(stderr, "unregister failure alias\n");
+				return 1;
+			}
+		}
+
+		/* verification */
+		if (bounces & BOUNCE_VERIFY) {
+			for (nr = 0; nr < nr_pages; nr++) {
+				if (*area_count(area_dst, nr) != count_verify[nr]) {
+					/* Report the value that was compared */
+					fprintf(stderr,
+						"error area_count %Lu %Lu %lu\n",
+						*area_count(area_dst, nr),
+						count_verify[nr],
+						nr);
+					err = 1;
+					bounces = 0;
+				}
+			}
+		}
+
+		/* prepare next bounce */
+		tmp_area = area_src;
+		area_src = area_dst;
+		area_dst = tmp_area;
+
+		tmp_area = area_src_alias;
+		area_src_alias = area_dst_alias;
+		area_dst_alias = tmp_area;
+
+		uffd_stats_report(uffd_stats, nr_cpus);
+	}
+
+	if (err)
+		return err;
+
+	close(uffd);
+	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
+		|| userfaultfd_events_test();
+}
+
+/*
+ * Copied from mlock2-tests.c
+ */
+/*
+ * Return the "Hugepagesize:" value from /proc/meminfo converted from kB
+ * to bytes, or 0 if the file cannot be opened or has no such line.
+ */
+unsigned long default_huge_page_size(void)
+{
+	FILE *meminfo;
+	char *buf = NULL;
+	size_t buflen = 0;
+	unsigned long size = 0;
+
+	meminfo = fopen("/proc/meminfo", "r");
+	if (meminfo == NULL)
+		return 0;
+
+	for (;;) {
+		if (getline(&buf, &buflen, meminfo) <= 0)
+			break;
+		if (sscanf(buf, "Hugepagesize: %lu kB", &size) != 1)
+			continue;
+		/* kB -> bytes */
+		size <<= 10;
+		break;
+	}
+
+	free(buf);
+	fclose(meminfo);
+	return size;
+}
+
+/*
+ * Select the memory type under test from its command-line name ("anon",
+ * "hugetlb", "hugetlb_shared" or "shmem") and set test_type,
+ * uffd_test_ops, map_shared, test_uffdio_wp and page_size accordingly.
+ * Exits with status 1 on an unknown name and with status 2 if the page
+ * size is unusable.
+ */
+static void set_test_type(const char *type)
+{
+	const bool hugetlb_shared = !strcmp(type, "hugetlb_shared");
+
+	if (!strcmp(type, "anon")) {
+		test_type = TEST_ANON;
+		uffd_test_ops = &anon_uffd_test_ops;
+		/* Only enable write-protect test for anonymous test */
+		test_uffdio_wp = true;
+	} else if (hugetlb_shared || !strcmp(type, "hugetlb")) {
+		if (hugetlb_shared)
+			map_shared = true;
+		test_type = TEST_HUGETLB;
+		uffd_test_ops = &hugetlb_uffd_test_ops;
+	} else if (!strcmp(type, "shmem")) {
+		map_shared = true;
+		test_type = TEST_SHMEM;
+		uffd_test_ops = &shmem_uffd_test_ops;
+	} else {
+		fprintf(stderr, "Unknown test type: %s\n", type);
+		exit(1);
+	}
+
+	if (test_type != TEST_HUGETLB)
+		page_size = sysconf(_SC_PAGE_SIZE);
+	else
+		page_size = default_huge_page_size();
+
+	if (!page_size) {
+		fprintf(stderr, "Unable to determine page size\n");
+		exit(2);
+	}
+	/* The per-page counter and its placeholder must fit in one page */
+	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
+	    > page_size) {
+		fprintf(stderr, "Impossible to run this test\n");
+		exit(2);
+	}
+}
+
+/*
+ * SIGALRM handler: arm the one-shot -EEXIST retry exercises and schedule
+ * the next alarm. Aborts on any other signal.
+ */
+static void sigalrm(int sig)
+{
+	if (sig == SIGALRM) {
+		test_uffdio_copy_eexist = true;
+		test_uffdio_zeropage_eexist = true;
+		alarm(ALARM_INTERVAL_SECS);
+		return;
+	}
+	abort();
+}
+
+/*
+ * Entry point. argv[1] is the test type, argv[2] the amount of memory in
+ * MiB, argv[3] the number of bounces and, for the hugetlb types, argv[4]
+ * the path of a file to create or open and map (expected to be on
+ * hugetlbfs). Returns the result of userfaultfd_stress().
+ */
+int main(int argc, char **argv)
+{
+	if (argc < 4)
+		usage();
+
+	if (signal(SIGALRM, sigalrm) == SIG_ERR) {
+		fprintf(stderr, "failed to arm SIGALRM");
+		exit(1);
+	}
+	alarm(ALARM_INTERVAL_SECS);
+
+	set_test_type(argv[1]);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
+		nr_cpus;
+	if (!nr_pages_per_cpu) {
+		fprintf(stderr, "invalid MiB\n");
+		usage();
+	}
+
+	bounces = atoi(argv[3]);
+	if (bounces <= 0) {
+		fprintf(stderr, "invalid bounces\n");
+		usage();
+	}
+	nr_pages = nr_pages_per_cpu * nr_cpus;
+
+	if (test_type == TEST_HUGETLB) {
+		if (argc < 5)
+			usage();
+		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
+		if (huge_fd < 0) {
+			/* Report the file name (argv[4]), not the bounce count */
+			fprintf(stderr, "Open of %s failed", argv[4]);
+			perror("open");
+			exit(1);
+		}
+		if (ftruncate(huge_fd, 0)) {
+			fprintf(stderr, "ftruncate %s to size 0 failed", argv[4]);
+			perror("ftruncate");
+			exit(1);
+		}
+	}
+	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
+	       nr_pages, nr_pages_per_cpu);
+	return userfaultfd_stress();
+}
+
+#else /* __NR_userfaultfd */
+
+#warning "missing __NR_userfaultfd definition"
+
+/* Fallback when __NR_userfaultfd is not defined: report a skip. */
+int main(void)
+{
+	fputs("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n",
+	      stdout);
+	return KSFT_SKIP;
+}
+
+#endif /* __NR_userfaultfd */
diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c
new file mode 100644
index 000000000..83acdff26
--- /dev/null
+++ b/tools/testing/selftests/vm/va_128TBswitch.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * Authors: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+ * Authors: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ */
+
+#include <stdio.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+#ifdef __powerpc64__
+#define PAGE_SIZE (64 << 10)
+/*
+ * This will work with 16M and 2M hugepage size
+ */
+#define HUGETLB_SIZE (16 << 20)
+#else
+#define PAGE_SIZE (4 << 10)
+#define HUGETLB_SIZE (2 << 20)
+#endif
+
+/*
+ * A hint address >= 128TB is what we pass to mmap() to select
+ * the large address space.
+ */
+#define ADDR_SWITCH_HINT (1UL << 47)
+#define LOW_ADDR ((void *) (1UL << 30))
+#define HIGH_ADDR ((void *) (1UL << 48))
+
+/*
+ * One mmap() scenario executed by run_test().
+ */
+struct testcase {
+ /* Address handed to mmap() as its first argument (hint or MAP_FIXED target). */
+ void *addr;
+ /* Length of the mapping in bytes; the whole range is memset() on success. */
+ unsigned long size;
+ /* mmap() flags. */
+ unsigned long flags;
+ /* Label printed in front of the result. */
+ const char *msg;
+ /* If set, a result at or above ADDR_SWITCH_HINT counts as a failure. */
+ unsigned int low_addr_required:1;
+ /* If set, the mapping is not munmap()ed after the check. */
+ unsigned int keep_mapped:1;
+};
+
+/*
+ * Anonymous-memory scenarios, executed in array order by run_test().
+ * Entries with .keep_mapped set stay mapped, so the entries that follow
+ * them run with those ranges still occupied.
+ */
+static struct testcase testcases[] = {
+ {
+ /*
+ * If stack is moved, we could possibly allocate
+ * this at the requested address.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * We should never allocate at the requested address or above it:
+ * the length crosses the 128TB boundary and, without MAP_FIXED,
+ * we always search in the lower address space.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))",
+ .low_addr_required = 1,
+ },
+ {
+ /*
+ * Exact mapping at 128TB: the area is free, so we should get it
+ * even without MAP_FIXED.
+ */
+ .addr = ((void *)(ADDR_SWITCH_HINT)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+ },
+ {
+ .addr = NULL,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = LOW_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(LOW_ADDR)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(HIGH_ADDR, MAP_FIXED)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1) again",
+ },
+ {
+ .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = ((void *)(ADDR_SWITCH_HINT)),
+ .size = PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)",
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * PAGE_SIZE,
+ .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)",
+ },
+};
+
+/*
+ * MAP_HUGETLB scenarios, executed in array order by run_test(). main()
+ * only runs them when the program is invoked with "--run-hugetlb".
+ * Entries with .keep_mapped set stay mapped for the entries that follow.
+ */
+static struct testcase hugetlb_testcases[] = {
+ {
+ .addr = NULL,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(NULL, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = LOW_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(LOW_ADDR, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = HIGH_ADDR,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)",
+ },
+ {
+ .addr = (void *) -1,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB)",
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *) -1,
+ .size = HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(-1, MAP_HUGETLB) again",
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE),
+ .size = 2 * HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS,
+ .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)",
+ .low_addr_required = 1,
+ .keep_mapped = 1,
+ },
+ {
+ .addr = (void *)(ADDR_SWITCH_HINT),
+ .size = 2 * HUGETLB_SIZE,
+ .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
+ .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)",
+ },
+};
+
+/*
+ * Execute @count test cases starting at @test, in order.
+ *
+ * Each case is mmap()ed read/write and its label and resulting address are
+ * printed. A case fails when mmap() fails, or when it requires a low address
+ * but the mapping lands at or above ADDR_SWITCH_HINT. Passing mappings are
+ * zeroed in full so that page-fault handling is exercised too. Mappings are
+ * unmapped afterwards unless the case asks to keep them.
+ *
+ * Returns 0 if every case passed, 1 otherwise.
+ */
+static int run_test(struct testcase *test, int count)
+{
+	struct testcase *cur, *end = test + count;
+	int failed = 0;
+
+	for (cur = test; cur < end; cur++) {
+		void *map = mmap(cur->addr, cur->size, PROT_READ | PROT_WRITE,
+				 cur->flags, -1, 0);
+		int too_high;
+
+		printf("%s: %p - ", cur->msg, map);
+
+		if (map == MAP_FAILED) {
+			printf("FAILED\n");
+			failed = 1;
+			continue;
+		}
+
+		too_high = cur->low_addr_required &&
+			   map >= (void *)(ADDR_SWITCH_HINT);
+		if (!too_high) {
+			/* Touch every byte to catch page fault handling bugs. */
+			memset(map, 0, cur->size);
+			printf("OK\n");
+		} else {
+			printf("FAILED\n");
+			failed = 1;
+		}
+
+		if (cur->keep_mapped)
+			continue;
+		munmap(map, cur->size);
+	}
+
+	return failed;
+}
+
+/* Returns 1 when built for powerpc64 or x86_64, 0 for any other target. */
+static int supported_arch(void)
+{
+#if defined(__powerpc64__) || defined(__x86_64__)
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+/*
+ * Run the anonymous-memory test cases and, when invoked with the single
+ * argument "--run-hugetlb", the hugetlb test cases as well.
+ *
+ * Returns 0 on unsupported architectures or when every executed case
+ * passed, and 1 when any case in either table failed.
+ */
+int main(int argc, char **argv)
+{
+	int ret;
+
+	if (!supported_arch())
+		return 0;
+
+	ret = run_test(testcases, ARRAY_SIZE(testcases));
+	/* Accumulate: a hugetlb pass must not hide an earlier failure. */
+	if (argc == 2 && !strcmp(argv[1], "--run-hugetlb"))
+		ret |= run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases));
+	return ret;
+}
diff --git a/tools/testing/selftests/vm/virtual_address_range.c b/tools/testing/selftests/vm/virtual_address_range.c
new file mode 100644
index 000000000..c0592646e
--- /dev/null
+++ b/tools/testing/selftests/vm/virtual_address_range.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2017, Anshuman Khandual, IBM Corp.
+ *
+ * Works on architectures which support 128TB virtual
+ * address range and beyond.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+/*
+ * The maximum address range that can be mapped with a single
+ * mmap() call is a little more than 16GB. Hence 16GB is
+ * chosen as the chunk size for mapping the address space.
+ */
+#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */
+
+/*
+ * Address space till 128TB is mapped without any hint
+ * and is enabled by default. Address space beyond 128TB
+ * till 512TB is obtained by passing hint address as the
+ * first argument into mmap() system call.
+ *
+ * The process heap address space is divided into two
+ * different areas one below 128TB and one above 128TB
+ * till it reaches 512TB. One with size 128TB and the
+ * other being 384TB.
+ *
+ * On Arm64 the address space is 256TB and no high mappings
+ * are supported so far.
+ */
+
+#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */
+#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL)
+#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL)
+
+#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */
+#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */
+
+#ifdef __aarch64__
+#define HIGH_ADDR_MARK ADDR_MARK_256TB
+#define HIGH_ADDR_SHIFT 49
+#define NR_CHUNKS_LOW NR_CHUNKS_256TB
+#define NR_CHUNKS_HIGH 0
+#else
+#define HIGH_ADDR_MARK ADDR_MARK_128TB
+#define HIGH_ADDR_SHIFT 48
+#define NR_CHUNKS_LOW NR_CHUNKS_128TB
+#define NR_CHUNKS_HIGH NR_CHUNKS_384TB
+#endif
+
+/*
+ * Return a random hint address of the form 1UL << n, with n chosen from
+ * HIGH_ADDR_SHIFT up to and including 62.
+ */
+static char *hind_addr(void)
+{
+	int span = 63 - HIGH_ADDR_SHIFT;
+	int shift = HIGH_ADDR_SHIFT + rand() % span;
+
+	return (char *) (1UL << shift);
+}
+
+/*
+ * Check that @ptr lies on the expected side of HIGH_ADDR_MARK.
+ *
+ * @high_addr != 0: the address must be at or above HIGH_ADDR_MARK.
+ * @high_addr == 0: the address must be below HIGH_ADDR_MARK. The mark is
+ * the first address beyond the low range, so it is itself rejected.
+ *
+ * Prints the offending address and returns 1 on a violation, 0 otherwise.
+ */
+static int validate_addr(char *ptr, int high_addr)
+{
+	unsigned long addr = (unsigned long) ptr;
+	int bad;
+
+	if (high_addr)
+		bad = addr < HIGH_ADDR_MARK;
+	else
+		bad = addr >= HIGH_ADDR_MARK;
+
+	if (bad)
+		printf("Bad address %lx\n", addr);
+	return bad;
+}
+
+/*
+ * Try to map one more chunk using 1UL << 45 as the hint address.
+ * Returns 1 if that mapping succeeds and 0 if mmap() fails. The mapping
+ * is not released here.
+ */
+static int validate_lower_address_hint(void)
+{
+	void *hint = (void *) (1UL << 45);
+	char *map = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+	return map != MAP_FAILED;
+}
+
+/*
+ * Map the address space in MAP_CHUNK_SIZE pieces: first up to NR_CHUNKS_LOW
+ * chunks without a hint, then up to NR_CHUNKS_HIGH chunks with a random high
+ * hint, validating every returned address against HIGH_ADDR_MARK.
+ *
+ * Returns 1 as soon as an address is on the wrong side of the mark, or when
+ * the unhinted mappings run out but a mapping with a low hint still
+ * succeeds. Otherwise everything is unmapped and 0 is returned.
+ */
+int main(int argc, char *argv[])
+{
+	char *ptr[NR_CHUNKS_LOW];
+	char *hptr[NR_CHUNKS_HIGH];
+	unsigned long lchunks = 0, hchunks = 0, n;
+
+	while (lchunks < NR_CHUNKS_LOW) {
+		char *map = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE,
+				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		ptr[lchunks] = map;
+		if (map == MAP_FAILED) {
+			if (validate_lower_address_hint())
+				return 1;
+			break;
+		}
+
+		if (validate_addr(map, 0))
+			return 1;
+		lchunks++;
+	}
+
+	while (hchunks < NR_CHUNKS_HIGH) {
+		char *map = mmap(hind_addr(), MAP_CHUNK_SIZE,
+				 PROT_READ | PROT_WRITE,
+				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+		hptr[hchunks] = map;
+		if (map == MAP_FAILED)
+			break;
+
+		if (validate_addr(map, 1))
+			return 1;
+		hchunks++;
+	}
+
+	for (n = 0; n < lchunks; n++)
+		munmap(ptr[n], MAP_CHUNK_SIZE);
+
+	for (n = 0; n < hchunks; n++)
+		munmap(hptr[n], MAP_CHUNK_SIZE);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh
new file mode 100644
index 000000000..70a02301f
--- /dev/null
+++ b/tools/testing/selftests/vm/write_hugetlb_memory.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Move this shell into a memory cgroup and run ./write_to_hugetlbfs from it.
+#
+# Usage: write_hugetlb_memory.sh <size> <populate> <write> <cgroup> <path> \
+#        <method> <private> <want_sleep> <reserve>
+#
+# <size>, <path> and <method> become the -s, -p and -m option values.
+# <populate>, <write>, <private>, <want_sleep> and <reserve> are passed
+# through verbatim as option flags; each of them may be empty.
+# The cgroup hierarchy root defaults to /dev/cgroup/memory and can be
+# overridden through $cgroup_path.
+#
+# The exit status is that of write_to_hugetlbfs.
+
+set -e
+
+size=$1
+populate=$2
+write=$3
+cgroup=$4
+path=$5
+method=$6
+private=$7
+want_sleep=$8
+reserve=$9
+
+echo "Putting task in cgroup '$cgroup'"
+echo $$ > "${cgroup_path:-/dev/cgroup/memory}/$cgroup/cgroup.procs"
+
+echo "Method is $method"
+
+set +e
+# Leave out optional flags that are empty: an empty "" argument is a
+# non-option, and a getopt() that does not permute its arguments stops
+# parsing there, silently ignoring every option that follows.
+./write_to_hugetlbfs -p "$path" -s "$size" \
+	${write:+"$write"} ${populate:+"$populate"} -m "$method" \
+	${private:+"$private"} ${want_sleep:+"$want_sleep"} \
+	${reserve:+"$reserve"}
diff --git a/tools/testing/selftests/vm/write_to_hugetlbfs.c b/tools/testing/selftests/vm/write_to_hugetlbfs.c
new file mode 100644
index 000000000..6a2caba19
--- /dev/null
+++ b/tools/testing/selftests/vm/write_to_hugetlbfs.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This program reserves and uses hugetlb memory, supporting a bunch of
+ * scenarios needed by the charged_reserved_hugetlb.sh test.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+/* Global definitions. */
+/* Allocation method, selected numerically with the -m option. */
+enum method {
+ /* 0: open() the file given with -p and mmap() it. */
+ HUGETLBFS,
+ /* 1: mmap() with MAP_HUGETLB and no backing file. */
+ MMAP_MAP_HUGETLB,
+ /* 2: shmget() with SHM_HUGETLB followed by shmat(). */
+ SHM,
+ /* Number of methods; also the initial "no -m given" value in main(). */
+ MAX_METHOD
+};
+
+
+/* Global variables. */
+static const char *self;
+static char *shmaddr;
+static int shmid;
+
+/*
+ * Print the command-line synopsis to stdout and exit with EXIT_FAILURE.
+ * The -m values listed here mirror enum method.
+ */
+static void exit_usage(void)
+{
+	printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> "
+	       "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB) | 2=shm(SHM_HUGETLB)>] "
+	       "[-l] [-r] [-o] [-w] [-n]\n",
+	       self);
+	exit(EXIT_FAILURE);
+}
+
+/*
+ * Signal handler installed for SIGINT by main().
+ *
+ * Reports the signal number. For SIGINT it detaches the segment at shmaddr
+ * and removes the segment identified by shmid, exiting with status 4 if the
+ * detach fails and 2 otherwise. Any other signal exits with status 2
+ * without touching the segment.
+ */
+void sig_handler(int signo)
+{
+	int detach_failed;
+
+	printf("Received %d.\n", signo);
+	if (signo != SIGINT)
+		exit(2);
+
+	printf("Deleting the memory\n");
+	detach_failed = shmdt((const void *)shmaddr) != 0;
+	if (detach_failed)
+		perror("Detach failure");
+
+	/* The segment is removed whether or not the detach worked. */
+	shmctl(shmid, IPC_RMID, NULL);
+	if (detach_failed)
+		exit(4);
+
+	printf("Done deleting the memory\n");
+	exit(2);
+}
+
+/*
+ * Map <size> bytes of hugetlb memory with the method selected by -m and
+ * optionally write to it.
+ *
+ * Options:
+ *   -s <size>    number of bytes to map (required, non-zero)
+ *   -p <path>    file to open for method 0 (must always be given)
+ *   -m <method>  0 = hugetlbfs file, 1 = mmap(MAP_HUGETLB), 2 = SysV shm
+ *   -o           add MAP_POPULATE (methods 0 and 1)
+ *   -w           memset() the whole mapping once it exists
+ *   -l           print "DONE" and sleep until killed
+ *   -r           private instead of shared mapping (methods 0 and 1)
+ *   -n           add MAP_NORESERVE (methods 0 and 1)
+ *
+ * Returns 0 on success. Setup failures exit with status 1 via err(), a
+ * failed shmat() exits with status 2, and invalid arguments end in
+ * exit_usage().
+ */
+int main(int argc, char **argv)
+{
+	int fd = 0;
+	int key = 0;
+	int *ptr = NULL;
+	int c = 0;
+	int size = 0;
+	char path[256] = "";
+	enum method method = MAX_METHOD;
+	int method_arg;
+	int want_sleep = 0, private = 0;
+	int populate = 0;
+	int write = 0;
+	int reserve = 1;
+
+	if (signal(SIGINT, sig_handler) == SIG_ERR)
+		err(1, "\ncan't catch SIGINT\n");
+
+	/* Parse command-line arguments. */
+	setvbuf(stdout, NULL, _IONBF, 0);
+	self = argv[0];
+
+	while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) {
+		switch (c) {
+		case 's':
+			size = atoi(optarg);
+			break;
+		case 'p':
+			/*
+			 * strncpy() does not terminate the destination when
+			 * the source is too long, so reserve the last byte.
+			 */
+			strncpy(path, optarg, sizeof(path) - 1);
+			path[sizeof(path) - 1] = '\0';
+			break;
+		case 'm':
+			method_arg = atoi(optarg);
+			/* Negative values are as invalid as too large ones. */
+			if (method_arg < 0 || method_arg >= MAX_METHOD) {
+				errno = EINVAL;
+				perror("Invalid -m.");
+				exit_usage();
+			}
+			method = method_arg;
+			break;
+		case 'o':
+			populate = 1;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		case 'l':
+			want_sleep = 1;
+			break;
+		case 'r':
+			private = 1;
+			break;
+		case 'n':
+			reserve = 0;
+			break;
+		default:
+			errno = EINVAL;
+			perror("Invalid arg");
+			exit_usage();
+		}
+	}
+
+	if (strncmp(path, "", sizeof(path)) != 0) {
+		printf("Writing to this path: %s\n", path);
+	} else {
+		errno = EINVAL;
+		perror("path not found");
+		exit_usage();
+	}
+
+	if (size != 0) {
+		printf("Writing this size: %d\n", size);
+	} else {
+		errno = EINVAL;
+		perror("size not found");
+		exit_usage();
+	}
+
+	if (!populate)
+		printf("Not populating.\n");
+	else
+		printf("Populating.\n");
+
+	if (!write)
+		printf("Not writing to memory.\n");
+
+	if (method == MAX_METHOD) {
+		errno = EINVAL;
+		perror("-m Invalid");
+		exit_usage();
+	} else
+		printf("Using method=%d\n", method);
+
+	if (!private)
+		printf("Shared mapping.\n");
+	else
+		printf("Private mapping.\n");
+
+	if (!reserve)
+		printf("NO_RESERVE mapping.\n");
+	else
+		printf("RESERVE mapping.\n");
+
+	switch (method) {
+	case HUGETLBFS:
+		printf("Allocating using HUGETLBFS.\n");
+		fd = open(path, O_CREAT | O_RDWR, 0777);
+		if (fd == -1)
+			err(1, "Failed to open file.");
+
+		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   (private ? MAP_PRIVATE : MAP_SHARED) |
+			   (populate ? MAP_POPULATE : 0) |
+			   (reserve ? 0 : MAP_NORESERVE),
+			   fd, 0);
+
+		if (ptr == MAP_FAILED) {
+			close(fd);
+			err(1, "Error mapping the file");
+		}
+		break;
+	case MMAP_MAP_HUGETLB:
+		printf("Allocating using MAP_HUGETLB.\n");
+		/*
+		 * There is no backing file (fd is -1), so MAP_ANONYMOUS is
+		 * needed for shared mappings as well as private ones;
+		 * without it mmap() fails with EBADF.
+		 */
+		ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+			   (private ? MAP_PRIVATE : MAP_SHARED) |
+			   MAP_ANONYMOUS | MAP_HUGETLB |
+			   (populate ? MAP_POPULATE : 0) |
+			   (reserve ? 0 : MAP_NORESERVE),
+			   -1, 0);
+
+		if (ptr == MAP_FAILED)
+			err(1, "mmap");
+
+		printf("Returned address is %p\n", ptr);
+		break;
+	case SHM:
+		printf("Allocating using SHM.\n");
+		shmid = shmget(key, size,
+			       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+		if (shmid < 0) {
+			/* Retry once with the next key. */
+			shmid = shmget(++key, size,
+				       SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W);
+			if (shmid < 0)
+				err(1, "shmget");
+		}
+		printf("shmid: 0x%x, shmget key:%d\n", shmid, key);
+
+		ptr = shmat(shmid, NULL, 0);
+		if (ptr == (int *)-1) {
+			perror("Shared memory attach failure");
+			shmctl(shmid, IPC_RMID, NULL);
+			exit(2);
+		}
+		/* Publish the attach address for sig_handler()'s shmdt(). */
+		shmaddr = (char *)ptr;
+		printf("shmaddr: %p\n", ptr);
+
+		break;
+	default:
+		errno = EINVAL;
+		err(1, "Invalid method.");
+	}
+
+	if (write) {
+		printf("Writing to memory.\n");
+		memset(ptr, 1, size);
+	}
+
+	if (want_sleep) {
+		/* Signal to caller that we're done. */
+		printf("DONE\n");
+
+		/* Hold memory until external kill signal is delivered. */
+		while (1)
+			sleep(100);
+	}
+
+	if (method == HUGETLBFS)
+		close(fd);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/watchdog/.gitignore b/tools/testing/selftests/watchdog/.gitignore
new file mode 100644
index 000000000..61d7b89cd
--- /dev/null
+++ b/tools/testing/selftests/watchdog/.gitignore
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+watchdog-test
diff --git a/tools/testing/selftests/watchdog/Makefile b/tools/testing/selftests/watchdog/Makefile
new file mode 100644
index 000000000..6b5598b55
--- /dev/null
+++ b/tools/testing/selftests/watchdog/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+TEST_GEN_PROGS := watchdog-test
+
+include ../lib.mk
diff --git a/tools/testing/selftests/watchdog/watchdog-test.c b/tools/testing/selftests/watchdog/watchdog-test.c
new file mode 100644
index 000000000..f45e51050
--- /dev/null
+++ b/tools/testing/selftests/watchdog/watchdog-test.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Watchdog Driver Test Program
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <getopt.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+#define DEFAULT_PING_RATE 1
+
+/* File descriptor of the watchdog device; opened in main(). */
+int fd;
+/* Byte written to the device just before it is closed (see term() and main()). */
+const char v = 'V';
+/* Short options for getopt_long(); a ':' after a letter means it takes an argument. */
+static const char sopts[] = "bdehp:t:Tn:NLf:i";
+/* Long spellings of the same options; each entry returns its short option letter. */
+static const struct option lopts[] = {
+ {"bootstatus", no_argument, NULL, 'b'},
+ {"disable", no_argument, NULL, 'd'},
+ {"enable", no_argument, NULL, 'e'},
+ {"help", no_argument, NULL, 'h'},
+ {"pingrate", required_argument, NULL, 'p'},
+ {"timeout", required_argument, NULL, 't'},
+ {"gettimeout", no_argument, NULL, 'T'},
+ {"pretimeout", required_argument, NULL, 'n'},
+ {"getpretimeout", no_argument, NULL, 'N'},
+ {"gettimeleft", no_argument, NULL, 'L'},
+ {"file", required_argument, NULL, 'f'},
+ {"info", no_argument, NULL, 'i'},
+ {NULL, no_argument, NULL, 0x0}
+};
+
+/*
+ * Ping the watchdog: send WDIOC_KEEPALIVE to the driver, which in turn ticks
+ * the PC Watchdog card to reset its internal timer so it doesn't trigger
+ * a computer reset. A "." is printed for every successful ping; a failed
+ * ioctl is ignored.
+ */
+static void keep_alive(void)
+{
+	int dummy;
+
+	if (ioctl(fd, WDIOC_KEEPALIVE, &dummy) == 0)
+		printf(".");
+}
+
+/*
+ * The main program. Run the program with "-d" to disable the card,
+ * or "-e" to enable the card.
+ */
+
+/*
+ * SIGINT handler installed by main(): write the 'V' byte to the watchdog
+ * device, close it, report whether the write worked and exit with status 0.
+ */
+static void term(int sig)
+{
+	int written = write(fd, &v, 1);
+
+	close(fd);
+	if (written >= 0)
+		printf("\nStopping watchdog ticks...\n");
+	else
+		printf("\nStopping watchdog ticks failed (%d)...\n", errno);
+	exit(0);
+}
+
+/*
+ * Print the option summary and two example invocations to stdout.
+ * @progname is the name shown in the "Usage:" and "Example:" lines.
+ */
+static void usage(char *progname)
+{
+	printf("Usage: %s [options]\n", progname);
+	printf(" -f, --file\t\tOpen watchdog device file\n"
+	       "\t\t\tDefault is /dev/watchdog\n"
+	       " -i, --info\t\tShow watchdog_info\n"
+	       " -b, --bootstatus\tGet last boot status (Watchdog/POR)\n"
+	       " -d, --disable\t\tTurn off the watchdog timer\n"
+	       " -e, --enable\t\tTurn on the watchdog timer\n"
+	       " -h, --help\t\tPrint the help message\n");
+	printf(" -p, --pingrate=P\tSet ping rate to P seconds (default %d)\n",
+	       DEFAULT_PING_RATE);
+	printf(" -t, --timeout=T\tSet timeout to T seconds\n"
+	       " -T, --gettimeout\tGet the timeout\n"
+	       " -n, --pretimeout=T\tSet the pretimeout to T seconds\n"
+	       " -N, --getpretimeout\tGet the pretimeout\n"
+	       " -L, --gettimeleft\tGet the time left until timer expires\n"
+	       "\n"
+	       "Parameters are parsed left-to-right in real-time.\n");
+	printf("Example: %s -d -t 10 -p 5 -e\n", progname);
+	printf("Example: %s -t 12 -T -n 7 -N\n", progname);
+}
+
+/*
+ * Open the watchdog device, apply the command-line options in the order
+ * given, and then either stop (when a query-style option was used or a
+ * set-style ioctl failed) or keep pinging the watchdog every ping_rate
+ * seconds until SIGINT arrives.
+ *
+ * The options are scanned twice: a first pass only picks up -f/--file so
+ * that the right device is opened, the second pass acts on everything else.
+ *
+ * Returns 0 from the normal exit path; exits early with a non-zero status
+ * when the device cannot be opened or WDIOC_GETSUPPORT fails.
+ */
+int main(int argc, char *argv[])
+{
+ int flags;
+ unsigned int ping_rate = DEFAULT_PING_RATE;
+ int ret;
+ int c;
+ /* Set when the program should exit instead of entering the ping loop. */
+ int oneshot = 0;
+ char *file = "/dev/watchdog";
+ struct watchdog_info info;
+
+ /* Unbuffered stdout. */
+ setbuf(stdout, NULL);
+
+ /* First pass: only look for the device file name. */
+ while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
+ if (c == 'f')
+ file = optarg;
+ }
+
+ fd = open(file, O_WRONLY);
+
+ if (fd == -1) {
+ if (errno == ENOENT)
+ printf("Watchdog device (%s) not found.\n", file);
+ else if (errno == EACCES)
+ printf("Run watchdog as root.\n");
+ else
+ printf("Watchdog device open failed %s\n",
+ strerror(errno));
+ exit(-1);
+ }
+
+ /*
+ * Validate that `file` is a watchdog device
+ */
+ ret = ioctl(fd, WDIOC_GETSUPPORT, &info);
+ if (ret) {
+ printf("WDIOC_GETSUPPORT error '%s'\n", strerror(errno));
+ close(fd);
+ exit(ret);
+ }
+
+ /* Restart option scanning for the second pass. */
+ optind = 0;
+
+ /* Second pass: act on each option as it is encountered. */
+ while ((c = getopt_long(argc, argv, sopts, lopts, NULL)) != -1) {
+ switch (c) {
+ case 'b':
+ flags = 0;
+ oneshot = 1;
+ ret = ioctl(fd, WDIOC_GETBOOTSTATUS, &flags);
+ if (!ret)
+ printf("Last boot is caused by: %s.\n", (flags != 0) ?
+ "Watchdog" : "Power-On-Reset");
+ else
+ printf("WDIOC_GETBOOTSTATUS error '%s'\n", strerror(errno));
+ break;
+ case 'd':
+ flags = WDIOS_DISABLECARD;
+ ret = ioctl(fd, WDIOC_SETOPTIONS, &flags);
+ if (!ret)
+ printf("Watchdog card disabled.\n");
+ else {
+ printf("WDIOS_DISABLECARD error '%s'\n", strerror(errno));
+ oneshot = 1;
+ }
+ break;
+ case 'e':
+ flags = WDIOS_ENABLECARD;
+ ret = ioctl(fd, WDIOC_SETOPTIONS, &flags);
+ if (!ret)
+ printf("Watchdog card enabled.\n");
+ else {
+ printf("WDIOS_ENABLECARD error '%s'\n", strerror(errno));
+ oneshot = 1;
+ }
+ break;
+ case 'p':
+ ping_rate = strtoul(optarg, NULL, 0);
+ /* A value that parses to 0 falls back to the default rate. */
+ if (!ping_rate)
+ ping_rate = DEFAULT_PING_RATE;
+ printf("Watchdog ping rate set to %u seconds.\n", ping_rate);
+ break;
+ case 't':
+ flags = strtoul(optarg, NULL, 0);
+ ret = ioctl(fd, WDIOC_SETTIMEOUT, &flags);
+ if (!ret)
+ printf("Watchdog timeout set to %u seconds.\n", flags);
+ else {
+ printf("WDIOC_SETTIMEOUT error '%s'\n", strerror(errno));
+ oneshot = 1;
+ }
+ break;
+ case 'T':
+ oneshot = 1;
+ ret = ioctl(fd, WDIOC_GETTIMEOUT, &flags);
+ if (!ret)
+ printf("WDIOC_GETTIMEOUT returns %u seconds.\n", flags);
+ else
+ printf("WDIOC_GETTIMEOUT error '%s'\n", strerror(errno));
+ break;
+ case 'n':
+ flags = strtoul(optarg, NULL, 0);
+ ret = ioctl(fd, WDIOC_SETPRETIMEOUT, &flags);
+ if (!ret)
+ printf("Watchdog pretimeout set to %u seconds.\n", flags);
+ else {
+ printf("WDIOC_SETPRETIMEOUT error '%s'\n", strerror(errno));
+ oneshot = 1;
+ }
+ break;
+ case 'N':
+ oneshot = 1;
+ ret = ioctl(fd, WDIOC_GETPRETIMEOUT, &flags);
+ if (!ret)
+ printf("WDIOC_GETPRETIMEOUT returns %u seconds.\n", flags);
+ else
+ printf("WDIOC_GETPRETIMEOUT error '%s'\n", strerror(errno));
+ break;
+ case 'L':
+ oneshot = 1;
+ ret = ioctl(fd, WDIOC_GETTIMELEFT, &flags);
+ if (!ret)
+ printf("WDIOC_GETTIMELEFT returns %u seconds.\n", flags);
+ else
+ printf("WDIOC_GETTIMELEFT error '%s'\n", strerror(errno));
+ break;
+ case 'f':
+ /* Handled above */
+ break;
+ case 'i':
+ /*
+ * watchdog_info was obtained as part of file open
+ * validation. So we just show it here.
+ */
+ oneshot = 1;
+ printf("watchdog_info:\n");
+ printf(" identity:\t\t%s\n", info.identity);
+ printf(" firmware_version:\t%u\n",
+ info.firmware_version);
+ printf(" options:\t\t%08x\n", info.options);
+ break;
+
+ default:
+ /* Covers -h/--help as well as unrecognized options. */
+ usage(argv[0]);
+ goto end;
+ }
+ }
+
+ if (oneshot)
+ goto end;
+
+ printf("Watchdog Ticking Away!\n");
+
+ /* From here on the only way out is SIGINT, handled by term(). */
+ signal(SIGINT, term);
+
+ while (1) {
+ keep_alive();
+ sleep(ping_rate);
+ }
+end:
+ /* Write the 'V' byte before closing, as term() does. */
+ ret = write(fd, &v, 1);
+ if (ret < 0)
+ printf("Stopping watchdog ticks failed (%d)...\n", errno);
+ close(fd);
+ return 0;
+}
diff --git a/tools/testing/selftests/wireguard/netns.sh b/tools/testing/selftests/wireguard/netns.sh
new file mode 100755
index 000000000..93e44410f
--- /dev/null
+++ b/tools/testing/selftests/wireguard/netns.sh
@@ -0,0 +1,684 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+#
+# This script tests the below topology:
+#
+# ┌─────────────────────┐ ┌──────────────────────────────────┐ ┌─────────────────────┐
+# │ $ns1 namespace │ │ $ns0 namespace │ │ $ns2 namespace │
+# │ │ │ │ │ │
+# │┌────────┐ │ │ ┌────────┐ │ │ ┌────────┐│
+# ││ wg0 │───────────┼───┼────────────│ lo │────────────┼───┼───────────│ wg0 ││
+# │├────────┴──────────┐│ │ ┌───────┴────────┴────────┐ │ │┌──────────┴────────┤│
+# ││192.168.241.1/24 ││ │ │(ns1) (ns2) │ │ ││192.168.241.2/24 ││
+# ││fd00::1/24 ││ │ │127.0.0.1:1 127.0.0.1:2│ │ ││fd00::2/24 ││
+# │└───────────────────┘│ │ │[::]:1 [::]:2 │ │ │└───────────────────┘│
+# └─────────────────────┘ │ └─────────────────────────┘ │ └─────────────────────┘
+# └──────────────────────────────────┘
+#
+# After the topology is prepared we run a series of TCP/UDP iperf3 tests between the
+# wireguard peers in $ns1 and $ns2. Note that $ns0 is the endpoint for the wg0
+# interfaces in $ns1 and $ns2. See https://www.wireguard.com/netns/ for further
+# details on how this is accomplished.
+set -e
+
+exec 3>&1
+export LANG=C
+export WG_HIDE_KEYS=never
+netns0="wg-test-$$-0"
+netns1="wg-test-$$-1"
+netns2="wg-test-$$-2"
+pretty() { echo -e "\x1b[32m\x1b[1m[+] ${1:+NS$1: }${2}\x1b[0m" >&3; }
+pp() { pretty "" "$*"; "$@"; }
+maybe_exec() { if [[ $BASHPID -eq $$ ]]; then "$@"; else exec "$@"; fi; }
+n0() { pretty 0 "$*"; maybe_exec ip netns exec $netns0 "$@"; }
+n1() { pretty 1 "$*"; maybe_exec ip netns exec $netns1 "$@"; }
+n2() { pretty 2 "$*"; maybe_exec ip netns exec $netns2 "$@"; }
+ip0() { pretty 0 "ip $*"; ip -n $netns0 "$@"; }
+ip1() { pretty 1 "ip $*"; ip -n $netns1 "$@"; }
+ip2() { pretty 2 "ip $*"; ip -n $netns2 "$@"; }
+sleep() { read -t "$1" -N 1 || true; }
+waitiperf() { pretty "${1//*-}" "wait for iperf:${3:-5201} pid $2"; while [[ $(ss -N "$1" -tlpH "sport = ${3:-5201}") != *\"iperf3\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+waitncatudp() { pretty "${1//*-}" "wait for udp:1111 pid $2"; while [[ $(ss -N "$1" -ulpH 'sport = 1111') != *\"ncat\",pid=$2,fd=* ]]; do sleep 0.1; done; }
+waitiface() { pretty "${1//*-}" "wait for $2 to come up"; ip netns exec "$1" bash -c "while [[ \$(< \"/sys/class/net/$2/operstate\") != up ]]; do read -t .1 -N 0 || true; done;"; }
+
+cleanup() {
+ set +e
+ exec 2>/dev/null
+ printf "$orig_message_cost" > /proc/sys/net/core/message_cost
+ ip0 link del dev wg0
+ ip0 link del dev wg1
+ ip1 link del dev wg0
+ ip1 link del dev wg1
+ ip2 link del dev wg0
+ ip2 link del dev wg1
+ local to_kill="$(ip netns pids $netns0) $(ip netns pids $netns1) $(ip netns pids $netns2)"
+ [[ -n $to_kill ]] && kill $to_kill
+ pp ip netns del $netns1
+ pp ip netns del $netns2
+ pp ip netns del $netns0
+ exit
+}
+
+orig_message_cost="$(< /proc/sys/net/core/message_cost)"
+trap cleanup EXIT
+printf 0 > /proc/sys/net/core/message_cost
+
+ip netns del $netns0 2>/dev/null || true
+ip netns del $netns1 2>/dev/null || true
+ip netns del $netns2 2>/dev/null || true
+pp ip netns add $netns0
+pp ip netns add $netns1
+pp ip netns add $netns2
+ip0 link set up dev lo
+
+ip0 link add dev wg0 type wireguard
+ip0 link set wg0 netns $netns1
+ip0 link add dev wg0 type wireguard
+ip0 link set wg0 netns $netns2
+key1="$(pp wg genkey)"
+key2="$(pp wg genkey)"
+key3="$(pp wg genkey)"
+key4="$(pp wg genkey)"
+pub1="$(pp wg pubkey <<<"$key1")"
+pub2="$(pp wg pubkey <<<"$key2")"
+pub3="$(pp wg pubkey <<<"$key3")"
+pub4="$(pp wg pubkey <<<"$key4")"
+psk="$(pp wg genpsk)"
+[[ -n $key1 && -n $key2 && -n $psk ]]
+
+# Configure the wg0 pair in $netns1/$netns2 as mutual peers:
+# - ns1 gets 192.168.241.1/24 and fd00::1/112, ns2 gets .2 and fd00::2
+# - ns1 uses key1 and listens on port 1, ns2 uses key2 and listens on port 2
+# - each side allows only the other side's /32 and /128 and shares $psk
+# No endpoints are set here; callers set them per test. Both wg0 devices
+# must already exist. Takes no arguments; reads key1, key2, pub1, pub2, psk.
+configure_peers() {
+ ip1 addr add 192.168.241.1/24 dev wg0
+ ip1 addr add fd00::1/112 dev wg0
+
+ ip2 addr add 192.168.241.2/24 dev wg0
+ ip2 addr add fd00::2/112 dev wg0
+
+ # Keys are handed to wg as file arguments, hence the process substitutions.
+ n1 wg set wg0 \
+ private-key <(echo "$key1") \
+ listen-port 1 \
+ peer "$pub2" \
+ preshared-key <(echo "$psk") \
+ allowed-ips 192.168.241.2/32,fd00::2/128
+ n2 wg set wg0 \
+ private-key <(echo "$key2") \
+ listen-port 2 \
+ peer "$pub1" \
+ preshared-key <(echo "$psk") \
+ allowed-ips 192.168.241.1/32,fd00::1/128
+
+ ip1 link set up dev wg0
+ ip2 link set up dev wg0
+}
+configure_peers
+
+# Standard traffic suite, run against whatever tunnel configuration is
+# currently active between $netns1 and $netns2: flood pings, single-stream
+# iperf3 over TCP and UDP for both address families, then batches of parallel
+# TCP streams. Takes no arguments; any failing command aborts the script
+# through set -e.
+tests() {
+ # Keep loop state local so repeated calls do not leak variables into the
+ # top-level script.
+ local max i
+ local -a pids
+
+ # Ping over IPv4
+ n2 ping -c 10 -f -W 1 192.168.241.1
+ n1 ping -c 10 -f -W 1 192.168.241.2
+
+ # Ping over IPv6
+ n2 ping6 -c 10 -f -W 1 fd00::1
+ n1 ping6 -c 10 -f -W 1 fd00::2
+
+ # Each iperf3 server below runs in the background with -1 (serve a single
+ # client, then exit). Because n1/n2 exec the command when run in a
+ # subshell, $! is the server's own PID, which waitiperf looks for.
+
+ # TCP over IPv4
+ n2 iperf3 -s -1 -B 192.168.241.2 &
+ waitiperf $netns2 $!
+ n1 iperf3 -Z -t 3 -c 192.168.241.2
+
+ # TCP over IPv6
+ n1 iperf3 -s -1 -B fd00::1 &
+ waitiperf $netns1 $!
+ n2 iperf3 -Z -t 3 -c fd00::1
+
+ # UDP over IPv4
+ n1 iperf3 -s -1 -B 192.168.241.1 &
+ waitiperf $netns1 $!
+ n2 iperf3 -Z -t 3 -b 0 -u -c 192.168.241.1
+
+ # UDP over IPv6
+ n2 iperf3 -s -1 -B fd00::2 &
+ waitiperf $netns2 $!
+ n1 iperf3 -Z -t 3 -b 0 -u -c fd00::2
+
+ # TCP over IPv4, in parallel
+ for max in 4 5 50; do
+ pids=( )
+ # One server per port 5200..5200+max-1, each confirmed listening.
+ for ((i=0; i < max; ++i)) do
+ n2 iperf3 -p $(( 5200 + i )) -s -1 -B 192.168.241.2 &
+ pids+=( $! ); waitiperf $netns2 $! $(( 5200 + i ))
+ done
+ for ((i=0; i < max; ++i)) do
+ n1 iperf3 -Z -t 3 -p $(( 5200 + i )) -c 192.168.241.2 &
+ done
+ # Only the server PIDs are waited on.
+ wait "${pids[@]}"
+ done
+}
+
+# Remember wg0's initial MTU and derive a much larger one (orig + 33316) that
+# the suite is re-run with later.
+[[ $(ip1 link show dev wg0) =~ mtu\ ([0-9]+) ]] && orig_mtu="${BASH_REMATCH[1]}"
+big_mtu=$(( 34816 - 1500 + $orig_mtu ))
+
+# Test using IPv4 as outer transport
+n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
+n2 wg set wg0 peer "$pub1" endpoint 127.0.0.1:1
+# Before calling tests, we first make sure that the stats counters and timestamper are working
+n2 ping -c 10 -f -W 1 192.168.241.1
+# Take the first field of output lines 4 and 6 as the RX and TX byte counts.
+# ip2 logs to fd 3, so only the ip output reaches the reads.
+# NOTE(review): assumes the usual "ip -stats link show" layout -- confirm for
+# the iproute2 version in use. The expected byte totals are fixed constants
+# for the 10-packet flood above; two TX/RX variants are accepted.
+{ read _; read _; read _; read rx_bytes _; read _; read tx_bytes _; } < <(ip2 -stats link show dev wg0)
+(( rx_bytes == 1372 && (tx_bytes == 1428 || tx_bytes == 1460) ))
+{ read _; read _; read _; read rx_bytes _; read _; read tx_bytes _; } < <(ip1 -stats link show dev wg0)
+(( tx_bytes == 1372 && (rx_bytes == 1428 || rx_bytes == 1460) ))
+# The same counters as reported by wg itself: "<peer> <rx> <tx>".
+read _ rx_bytes tx_bytes < <(n2 wg show wg0 transfer)
+(( rx_bytes == 1372 && (tx_bytes == 1428 || tx_bytes == 1460) ))
+read _ rx_bytes tx_bytes < <(n1 wg show wg0 transfer)
+(( tx_bytes == 1372 && (rx_bytes == 1428 || rx_bytes == 1460) ))
+# A handshake must have been recorded for the peer.
+read _ timestamp < <(n1 wg show wg0 latest-handshakes)
+(( timestamp != 0 ))
+
+tests
+ip1 link set wg0 mtu $big_mtu
+ip2 link set wg0 mtu $big_mtu
+tests
+
+ip1 link set wg0 mtu $orig_mtu
+ip2 link set wg0 mtu $orig_mtu
+
+# Test using IPv6 as outer transport
+n1 wg set wg0 peer "$pub2" endpoint [::1]:2
+n2 wg set wg0 peer "$pub1" endpoint [::1]:1
+tests
+ip1 link set wg0 mtu $big_mtu
+ip2 link set wg0 mtu $big_mtu
+tests
+
+# Test that route MTUs work with the padding
+ip1 link set wg0 mtu 1300
+ip2 link set wg0 mtu 1300
+n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
+n2 wg set wg0 peer "$pub1" endpoint 127.0.0.1:1
+n0 iptables -A INPUT -m length --length 1360 -j DROP
+n1 ip route add 192.168.241.2/32 dev wg0 mtu 1299
+n2 ip route add 192.168.241.1/32 dev wg0 mtu 1299
+n2 ping -c 1 -W 1 -s 1269 192.168.241.1
+n2 ip route delete 192.168.241.1/32 dev wg0 mtu 1299
+n1 ip route delete 192.168.241.2/32 dev wg0 mtu 1299
+n0 iptables -F INPUT
+
+ip1 link set wg0 mtu $orig_mtu
+ip2 link set wg0 mtu $orig_mtu
+
+# Test using IPv4 that roaming works
+ip0 -4 addr del 127.0.0.1/8 dev lo
+ip0 -4 addr add 127.212.121.99/8 dev lo
+n1 wg set wg0 listen-port 9999
+n1 wg set wg0 peer "$pub2" endpoint 127.0.0.1:2
+n1 ping6 -W 1 -c 1 fd00::2
+[[ $(n2 wg show wg0 endpoints) == "$pub1 127.212.121.99:9999" ]]
+
+# Test using IPv6 that roaming works
+n1 wg set wg0 listen-port 9998
+n1 wg set wg0 peer "$pub2" endpoint [::1]:2
+n1 ping -W 1 -c 1 192.168.241.2
+[[ $(n2 wg show wg0 endpoints) == "$pub1 [::1]:9998" ]]
+
+# Test that crypto-RP filter works
+n1 wg set wg0 peer "$pub2" allowed-ips 192.168.241.0/24
+exec 4< <(n1 ncat -l -u -p 1111)
+ncat_pid=$!
+waitncatudp $netns1 $ncat_pid
+n2 ncat -u 192.168.241.1 1111 <<<"X"
+read -r -N 1 -t 1 out <&4 && [[ $out == "X" ]]
+kill $ncat_pid
+more_specific_key="$(pp wg genkey | pp wg pubkey)"
+n1 wg set wg0 peer "$more_specific_key" allowed-ips 192.168.241.2/32
+n2 wg set wg0 listen-port 9997
+exec 4< <(n1 ncat -l -u -p 1111)
+ncat_pid=$!
+waitncatudp $netns1 $ncat_pid
+n2 ncat -u 192.168.241.1 1111 <<<"X"
+! read -r -N 1 -t 1 out <&4 || false
+kill $ncat_pid
+n1 wg set wg0 peer "$more_specific_key" remove
+[[ $(n1 wg show wg0 endpoints) == "$pub2 [::1]:9997" ]]
+
+# Test that we can change private keys and immediately handshake
+n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips 192.168.241.2/32 endpoint 127.0.0.1:2
+n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32
+n1 ping -W 1 -c 1 192.168.241.2
+n1 wg set wg0 private-key <(echo "$key3")
+n2 wg set wg0 peer "$pub3" preshared-key <(echo "$psk") allowed-ips 192.168.241.1/32 peer "$pub1" remove
+n1 ping -W 1 -c 1 192.168.241.2
+n2 wg set wg0 peer "$pub3" remove
+
+# Test that we can route wg through wg
+ip1 addr flush dev wg0
+ip2 addr flush dev wg0
+ip1 addr add fd00::5:1/112 dev wg0
+ip2 addr add fd00::5:2/112 dev wg0
+n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk") allowed-ips fd00::5:2/128 endpoint 127.0.0.1:2
+n2 wg set wg0 private-key <(echo "$key2") listen-port 2 peer "$pub1" preshared-key <(echo "$psk") allowed-ips fd00::5:1/128 endpoint 127.212.121.99:9998
+ip1 link add wg1 type wireguard
+ip2 link add wg1 type wireguard
+ip1 addr add 192.168.241.1/24 dev wg1
+ip1 addr add fd00::1/112 dev wg1
+ip2 addr add 192.168.241.2/24 dev wg1
+ip2 addr add fd00::2/112 dev wg1
+ip1 link set mtu 1340 up dev wg1
+ip2 link set mtu 1340 up dev wg1
+n1 wg set wg1 listen-port 5 private-key <(echo "$key3") peer "$pub4" allowed-ips 192.168.241.2/32,fd00::2/128 endpoint [fd00::5:2]:5
+n2 wg set wg1 listen-port 5 private-key <(echo "$key4") peer "$pub3" allowed-ips 192.168.241.1/32,fd00::1/128 endpoint [fd00::5:1]:5
+tests
+# Try to set up a routing loop between the two namespaces
+ip1 link set netns $netns0 dev wg1
+ip0 addr add 192.168.241.1/24 dev wg1
+ip0 link set up dev wg1
+n0 ping -W 1 -c 1 192.168.241.2
+n1 wg set wg0 peer "$pub2" endpoint 192.168.241.2:7
+ip2 link del wg0
+ip2 link del wg1
+read _ _ tx_bytes_before < <(n0 wg show wg1 transfer)
+! n0 ping -W 1 -c 10 -f 192.168.241.2 || false
+sleep 1
+read _ _ tx_bytes_after < <(n0 wg show wg1 transfer)
+(( tx_bytes_after - tx_bytes_before < 70000 ))
+
+ip0 link del wg1
+ip1 link del wg0
+
+# Test using NAT. We now change the topology to this:
+# ┌────────────────────────────────────────┐ ┌────────────────────────────────────────────────┐ ┌────────────────────────────────────────┐
+# │ $ns1 namespace │ │ $ns0 namespace │ │ $ns2 namespace │
+# │ │ │ │ │ │
+# │ ┌─────┐ ┌─────┐ │ │ ┌──────┐ ┌──────┐ │ │ ┌─────┐ ┌─────┐ │
+# │ │ wg0 │─────────────│vethc│───────────┼────┼────│vethrc│ │vethrs│──────────────┼─────┼──│veths│────────────│ wg0 │ │
+# │ ├─────┴──────────┐ ├─────┴──────────┐│ │ ├──────┴─────────┐ ├──────┴────────────┐ │ │ ├─────┴──────────┐ ├─────┴──────────┐ │
+# │ │192.168.241.1/24│ │192.168.1.100/24││ │ │192.168.1.1/24 │ │10.0.0.1/24 │ │ │ │10.0.0.100/24 │ │192.168.241.2/24│ │
+# │ │fd00::1/24 │ │ ││ │ │ │ │SNAT:192.168.1.0/24│ │ │ │ │ │fd00::2/24 │ │
+# │ └────────────────┘ └────────────────┘│ │ └────────────────┘ └───────────────────┘ │ │ └────────────────┘ └────────────────┘ │
+# └────────────────────────────────────────┘ └────────────────────────────────────────────────┘ └────────────────────────────────────────┘
+
+ip1 link add dev wg0 type wireguard
+ip2 link add dev wg0 type wireguard
+configure_peers
+
+ip0 link add vethrc type veth peer name vethc
+ip0 link add vethrs type veth peer name veths
+ip0 link set vethc netns $netns1
+ip0 link set veths netns $netns2
+ip0 link set vethrc up
+ip0 link set vethrs up
+ip0 addr add 192.168.1.1/24 dev vethrc
+ip0 addr add 10.0.0.1/24 dev vethrs
+ip1 addr add 192.168.1.100/24 dev vethc
+ip1 link set vethc up
+ip1 route add default via 192.168.1.1
+ip2 addr add 10.0.0.100/24 dev veths
+ip2 link set veths up
+waitiface $netns0 vethrc
+waitiface $netns0 vethrs
+waitiface $netns1 vethc
+waitiface $netns2 veths
+
+n0 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward'
+n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout'
+n0 bash -c 'printf 2 > /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream'
+n0 iptables -t nat -A POSTROUTING -s 192.168.1.0/24 -d 10.0.0.0/24 -j SNAT --to 10.0.0.1
+
+n1 wg set wg0 peer "$pub2" endpoint 10.0.0.100:2 persistent-keepalive 1
+n1 ping -W 1 -c 1 192.168.241.2
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.0.0.1:1" ]]
+# Demonstrate n2 can still send packets to n1, since persistent-keepalive will prevent connection tracking entry from expiring (to see entries: `n0 conntrack -L`).
+pp sleep 3
+n2 ping -W 1 -c 1 192.168.241.1
+n1 wg set wg0 peer "$pub2" persistent-keepalive 0
+
+# Test that sk_bound_dev_if works
+n1 ping -I wg0 -c 1 -W 1 192.168.241.2
+# What about when the mark changes and the packet must be rerouted?
+n1 iptables -t mangle -I OUTPUT -j MARK --set-xmark 1
+n1 ping -c 1 -W 1 192.168.241.2 # First the boring case
+n1 ping -I wg0 -c 1 -W 1 192.168.241.2 # Then the sk_bound_dev_if case
+n1 iptables -t mangle -D OUTPUT -j MARK --set-xmark 1
+
+# Test that onion routing works, even when it loops
+n1 wg set wg0 peer "$pub3" allowed-ips 192.168.242.2/32 endpoint 192.168.241.2:5
+ip1 addr add 192.168.242.1/24 dev wg0
+ip2 link add wg1 type wireguard
+ip2 addr add 192.168.242.2/24 dev wg1
+n2 wg set wg1 private-key <(echo "$key3") listen-port 5 peer "$pub1" allowed-ips 192.168.242.1/32
+ip2 link set wg1 up
+n1 ping -W 1 -c 1 192.168.242.2
+ip2 link del wg1
+n1 wg set wg0 peer "$pub3" endpoint 192.168.242.2:5
+! n1 ping -W 1 -c 1 192.168.242.2 || false # Should not crash kernel
+n1 wg set wg0 peer "$pub3" remove
+ip1 addr del 192.168.242.1/24 dev wg0
+
+# Do a wg-quick(8)-style policy routing for the default route, making sure vethc has a v6 address to tease out bugs.
+ip1 -6 addr add fc00::9/96 dev vethc
+ip1 -6 route add default via fc00::1
+ip2 -4 addr add 192.168.99.7/32 dev wg0
+ip2 -6 addr add abab::1111/128 dev wg0
+n1 wg set wg0 fwmark 51820 peer "$pub2" allowed-ips 192.168.99.7,abab::1111
+ip1 -6 route add default dev wg0 table 51820
+ip1 -6 rule add not fwmark 51820 table 51820
+ip1 -6 rule add table main suppress_prefixlength 0
+ip1 -4 route add default dev wg0 table 51820
+ip1 -4 rule add not fwmark 51820 table 51820
+ip1 -4 rule add table main suppress_prefixlength 0
+n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/vethc/rp_filter'
+# Flood the pings instead of sending just one, to trigger routing table reference counting bugs.
+n1 ping -W 1 -c 100 -f 192.168.99.7
+n1 ping -W 1 -c 100 -f abab::1111
+
+# Have ns2 NAT into wg0 packets from ns0, but return an icmp error along the right route.
+n2 iptables -t nat -A POSTROUTING -s 10.0.0.0/24 -d 192.168.241.0/24 -j SNAT --to 192.168.241.2
+n0 iptables -t filter -A INPUT \! -s 10.0.0.0/24 -i vethrs -j DROP # Manual rpfilter just to be explicit.
+n2 bash -c 'printf 1 > /proc/sys/net/ipv4/ip_forward'
+ip0 -4 route add 192.168.241.1 via 10.0.0.100
+n2 wg set wg0 peer "$pub1" remove
+[[ $(! n0 ping -W 1 -c 1 192.168.241.1 || false) == *"From 10.0.0.100 icmp_seq=1 Destination Host Unreachable"* ]]
+
+n0 iptables -t nat -F
+n0 iptables -t filter -F
+n2 iptables -t nat -F
+ip0 link del vethrc
+ip0 link del vethrs
+ip1 link del wg0
+ip2 link del wg0
+
+# Test that saddr routing is sticky but not too sticky, changing to this topology:
+# ┌────────────────────────────────────────┐ ┌────────────────────────────────────────┐
+# │ $ns1 namespace │ │ $ns2 namespace │
+# │ │ │ │
+# │ ┌─────┐ ┌─────┐ │ │ ┌─────┐ ┌─────┐ │
+# │ │ wg0 │─────────────│veth1│───────────┼────┼──│veth2│────────────│ wg0 │ │
+# │ ├─────┴──────────┐ ├─────┴──────────┐│ │ ├─────┴──────────┐ ├─────┴──────────┐ │
+# │ │192.168.241.1/24│ │10.0.0.1/24 ││ │ │10.0.0.2/24 │ │192.168.241.2/24│ │
+# │ │fd00::1/24 │ │fd00:aa::1/96 ││ │ │fd00:aa::2/96 │ │fd00::2/24 │ │
+# │ └────────────────┘ └────────────────┘│ │ └────────────────┘ └────────────────┘ │
+# └────────────────────────────────────────┘ └────────────────────────────────────────┘
+
+ip1 link add dev wg0 type wireguard
+ip2 link add dev wg0 type wireguard
+configure_peers
+ip1 link add veth1 type veth peer name veth2
+ip1 link set veth2 netns $netns2
+n1 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/all/accept_dad'
+n2 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/all/accept_dad'
+n1 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/veth1/accept_dad'
+n2 bash -c 'printf 0 > /proc/sys/net/ipv6/conf/veth2/accept_dad'
+n1 bash -c 'printf 1 > /proc/sys/net/ipv4/conf/veth1/promote_secondaries'
+
+# First we check that we aren't overly sticky and can fall over to new IPs when old ones are removed
+ip1 addr add 10.0.0.1/24 dev veth1
+ip1 addr add fd00:aa::1/96 dev veth1
+ip2 addr add 10.0.0.2/24 dev veth2
+ip2 addr add fd00:aa::2/96 dev veth2
+ip1 link set veth1 up
+ip2 link set veth2 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+n1 wg set wg0 peer "$pub2" endpoint 10.0.0.2:2
+n1 ping -W 1 -c 1 192.168.241.2
+ip1 addr add 10.0.0.10/24 dev veth1
+ip1 addr del 10.0.0.1/24 dev veth1
+n1 ping -W 1 -c 1 192.168.241.2
+n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2
+n1 ping -W 1 -c 1 192.168.241.2
+ip1 addr add fd00:aa::10/96 dev veth1
+ip1 addr del fd00:aa::1/96 dev veth1
+n1 ping -W 1 -c 1 192.168.241.2
+
+# Now we show that we can successfully do reply to sender routing
+ip1 link set veth1 down
+ip2 link set veth2 down
+ip1 addr flush dev veth1
+ip2 addr flush dev veth2
+ip1 addr add 10.0.0.1/24 dev veth1
+ip1 addr add 10.0.0.2/24 dev veth1
+ip1 addr add fd00:aa::1/96 dev veth1
+ip1 addr add fd00:aa::2/96 dev veth1
+ip2 addr add 10.0.0.3/24 dev veth2
+ip2 addr add fd00:aa::3/96 dev veth2
+ip1 link set veth1 up
+ip2 link set veth2 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+n2 wg set wg0 peer "$pub1" endpoint 10.0.0.1:1
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.0.0.1:1" ]]
+n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 [fd00:aa::1]:1" ]]
+n2 wg set wg0 peer "$pub1" endpoint 10.0.0.2:1
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.0.0.2:1" ]]
+n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::2]:1
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 [fd00:aa::2]:1" ]]
+
+# What happens if the inbound destination address belongs to a different interface than the default route?
+ip1 link add dummy0 type dummy
+ip1 addr add 10.50.0.1/24 dev dummy0
+ip1 link set dummy0 up
+ip2 route add 10.50.0.0/24 dev veth2
+n2 wg set wg0 peer "$pub1" endpoint 10.50.0.1:1
+n2 ping -W 1 -c 1 192.168.241.1
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.50.0.1:1" ]]
+
+ip1 link del dummy0
+ip1 addr flush dev veth1
+ip2 addr flush dev veth2
+ip1 route flush dev veth1
+ip2 route flush dev veth2
+
+# Now we see what happens if another interface route takes precedence over an ongoing one
+ip1 link add veth3 type veth peer name veth4
+ip1 link set veth4 netns $netns2
+ip1 addr add 10.0.0.1/24 dev veth1
+ip2 addr add 10.0.0.2/24 dev veth2
+ip1 addr add 10.0.0.3/24 dev veth3
+ip1 link set veth1 up
+ip2 link set veth2 up
+ip1 link set veth3 up
+ip2 link set veth4 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+waitiface $netns1 veth3
+waitiface $netns2 veth4
+ip1 route flush dev veth1
+ip1 route flush dev veth3
+ip1 route add 10.0.0.0/24 dev veth1 src 10.0.0.1 metric 2
+n1 wg set wg0 peer "$pub2" endpoint 10.0.0.2:2
+n1 ping -W 1 -c 1 192.168.241.2
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.0.0.1:1" ]]
+ip1 route add 10.0.0.0/24 dev veth3 src 10.0.0.3 metric 1
+n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/veth1/rp_filter'
+n2 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/veth4/rp_filter'
+n1 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/all/rp_filter'
+n2 bash -c 'printf 0 > /proc/sys/net/ipv4/conf/all/rp_filter'
+n1 ping -W 1 -c 1 192.168.241.2
+[[ $(n2 wg show wg0 endpoints) == "$pub1 10.0.0.3:1" ]]
+
+ip1 link del dev veth3
+ip1 link del dev wg0
+ip2 link del dev wg0
+
+# Make sure persistent keep alives are sent when an adapter comes up
+ip1 link add dev wg0 type wireguard
+n1 wg set wg0 private-key <(echo "$key1") peer "$pub2" endpoint 10.0.0.1:1 persistent-keepalive 1
+read _ _ tx_bytes < <(n1 wg show wg0 transfer)
+[[ $tx_bytes -eq 0 ]]
+ip1 link set dev wg0 up
+read _ _ tx_bytes < <(n1 wg show wg0 transfer)
+[[ $tx_bytes -gt 0 ]]
+ip1 link del dev wg0
+# This should also happen even if the private key is set later
+ip1 link add dev wg0 type wireguard
+n1 wg set wg0 peer "$pub2" endpoint 10.0.0.1:1 persistent-keepalive 1
+read _ _ tx_bytes < <(n1 wg show wg0 transfer)
+[[ $tx_bytes -eq 0 ]]
+ip1 link set dev wg0 up
+read _ _ tx_bytes < <(n1 wg show wg0 transfer)
+[[ $tx_bytes -eq 0 ]]
+n1 wg set wg0 private-key <(echo "$key1")
+read _ _ tx_bytes < <(n1 wg show wg0 transfer)
+[[ $tx_bytes -gt 0 ]]
+ip1 link del dev veth1
+ip1 link del dev wg0
+
+# We test that Netlink/IPC is working properly by doing things that usually cause split responses
+# Case 1: a single peer with 255*256 AllowedIPs lines, each carrying one IPv4
+# /16 and one IPv6 /128.
+# NOTE(review): the PublicKey values here come from "wg genkey"; apparently
+# any fresh key-sized value is acceptable as a public key for this test.
+ip0 link add dev wg0 type wireguard
+config=( "[Interface]" "PrivateKey=$(wg genkey)" "[Peer]" "PublicKey=$(wg genkey)" )
+for a in {1..255}; do
+ for b in {0..255}; do
+ config+=( "AllowedIPs=$a.$b.0.0/16,$a::$b/128" )
+ done
+done
+n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
+# Count whitespace-separated words of the listing: two per AllowedIPs line
+# plus one for the peer's public key. Pre-increment is used so the
+# arithmetic command never evaluates to 0, which would trip set -e.
+i=0
+for ip in $(n0 wg show wg0 allowed-ips); do
+ ((++i))
+done
+((i == 255*256*2+1))
+ip0 link del wg0
+# Case 2: 40 peers with 52 allowed IPs each.
+ip0 link add dev wg0 type wireguard
+config=( "[Interface]" "PrivateKey=$(wg genkey)" )
+for a in {1..40}; do
+ config+=( "[Peer]" "PublicKey=$(wg genkey)" )
+ for b in {1..52}; do
+ config+=( "AllowedIPs=$a.$b.0.0/16" )
+ done
+done
+n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
+# Every output line must hold 53 words (public key + 52 IPs), and there must
+# be exactly 40 such lines.
+i=0
+while read -r line; do
+ j=0
+ for ip in $line; do
+ ((++j))
+ done
+ ((j == 53))
+ ((++i))
+done < <(n0 wg show wg0 allowed-ips)
+((i == 40))
+ip0 link del wg0
+# Case 3: 29 peers without allowed IPs followed by one peer that has some;
+# showconf only has to succeed, its output is discarded.
+ip0 link add wg0 type wireguard
+config=( )
+for i in {1..29}; do
+ config+=( "[Peer]" "PublicKey=$(wg genkey)" )
+done
+config+=( "[Peer]" "PublicKey=$(wg genkey)" "AllowedIPs=255.2.3.4/32,abcd::255/128" )
+n0 wg setconf wg0 <(printf '%s\n' "${config[@]}")
+n0 wg showconf wg0 > /dev/null
+ip0 link del wg0
+
+allowedips=( )
+for i in {1..197}; do
+ allowedips+=( abcd::$i )
+done
+saved_ifs="$IFS"
+IFS=,
+allowedips="${allowedips[*]}"
+IFS="$saved_ifs"
+ip0 link add wg0 type wireguard
+n0 wg set wg0 peer "$pub1"
+n0 wg set wg0 peer "$pub2" allowed-ips "$allowedips"
+{
+ read -r pub allowedips
+ [[ $pub == "$pub1" && $allowedips == "(none)" ]]
+ read -r pub allowedips
+ [[ $pub == "$pub2" ]]
+ i=0
+ for _ in $allowedips; do
+ ((++i))
+ done
+ ((i == 197))
+} < <(n0 wg show wg0 allowed-ips)
+ip0 link del wg0
+
+! n0 wg show doesnotexist || false
+
+ip0 link add wg0 type wireguard
+n0 wg set wg0 private-key <(echo "$key1") peer "$pub2" preshared-key <(echo "$psk")
+[[ $(n0 wg show wg0 private-key) == "$key1" ]]
+[[ $(n0 wg show wg0 preshared-keys) == "$pub2 $psk" ]]
+n0 wg set wg0 private-key /dev/null peer "$pub2" preshared-key /dev/null
+[[ $(n0 wg show wg0 private-key) == "(none)" ]]
+[[ $(n0 wg show wg0 preshared-keys) == "$pub2 (none)" ]]
+n0 wg set wg0 peer "$pub2"
+n0 wg set wg0 private-key <(echo "$key2")
+[[ $(n0 wg show wg0 public-key) == "$pub2" ]]
+[[ -z $(n0 wg show wg0 peers) ]]
+n0 wg set wg0 peer "$pub2"
+[[ -z $(n0 wg show wg0 peers) ]]
+n0 wg set wg0 private-key <(echo "$key1")
+n0 wg set wg0 peer "$pub2"
+[[ $(n0 wg show wg0 peers) == "$pub2" ]]
+n0 wg set wg0 private-key <(echo "/${key1:1}")
+[[ $(n0 wg show wg0 private-key) == "+${key1:1}" ]]
+n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0,10.0.0.0/8,100.0.0.0/10,172.16.0.0/12,192.168.0.0/16
+n0 wg set wg0 peer "$pub2" allowed-ips 0.0.0.0/0
+n0 wg set wg0 peer "$pub2" allowed-ips ::/0,1700::/111,5000::/4,e000::/37,9000::/75
+n0 wg set wg0 peer "$pub2" allowed-ips ::/0
+n0 wg set wg0 peer "$pub2" remove
+for low_order_point in AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= AQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA= 4Ot6fDtBuK4WVuP68Z/EatoJjeucMrH9hmIFFl9JuAA= X5yVvKNQjCSx0LFVnIPvWwREXMRYHI6G2CJO3dCfEVc= 7P///////////////////////////////////////38= 7f///////////////////////////////////////38= 7v///////////////////////////////////////38=; do
+ n0 wg set wg0 peer "$low_order_point" persistent-keepalive 1 endpoint 127.0.0.1:1111
+done
+[[ -n $(n0 wg show wg0 peers) ]]
+exec 4< <(n0 ncat -l -u -p 1111)
+ncat_pid=$!
+waitncatudp $netns0 $ncat_pid
+ip0 link set wg0 up
+! read -r -n 1 -t 2 <&4 || false
+kill $ncat_pid
+ip0 link del wg0
+
+# Ensure that dst_cache references don't outlive netns lifetime
+ip1 link add dev wg0 type wireguard
+ip2 link add dev wg0 type wireguard
+configure_peers
+ip1 link add veth1 type veth peer name veth2
+ip1 link set veth2 netns $netns2
+ip1 addr add fd00:aa::1/64 dev veth1
+ip2 addr add fd00:aa::2/64 dev veth2
+ip1 link set veth1 up
+ip2 link set veth2 up
+waitiface $netns1 veth1
+waitiface $netns2 veth2
+ip1 -6 route add default dev veth1 via fd00:aa::2
+ip2 -6 route add default dev veth2 via fd00:aa::1
+n1 wg set wg0 peer "$pub2" endpoint [fd00:aa::2]:2
+n2 wg set wg0 peer "$pub1" endpoint [fd00:aa::1]:1
+n1 ping6 -c 1 fd00::2
+pp ip netns delete $netns1
+pp ip netns delete $netns2
+pp ip netns add $netns1
+pp ip netns add $netns2
+
+# Ensure there aren't circular reference loops
+ip1 link add wg1 type wireguard
+ip2 link add wg2 type wireguard
+ip1 link set wg1 netns $netns2
+ip2 link set wg2 netns $netns1
+pp ip netns delete $netns1
+pp ip netns delete $netns2
+pp ip netns add $netns1
+pp ip netns add $netns2
+
+# Final check: scan the kernel log for "created"/"destroyed" messages about
+# wireguard objects and verify that each created object was also destroyed.
+# Note that sleep here is the script's own read-based function.
+sleep 2 # Wait for cleanup and grace periods
+declare -A objects
+# Read /dev/kmsg line by line. Exit status 142 (128 + SIGALRM) is what read
+# returns when the 0.1 s timeout expires, i.e. when no more messages are
+# pending; any other failure status keeps the loop going.
+while read -t 0.1 -r line 2>/dev/null || [[ $? -ne 142 ]]; do
+ # Key: "wg<N>: <Word>" with an optional number (the regex would match e.g.
+ # "wg0: Peer 12"); value: the events seen for that key, concatenated in
+ # order of appearance.
+ [[ $line =~ .*(wg[0-9]+:\ [A-Z][a-z]+\ ?[0-9]*)\ .*(created|destroyed).* ]] || continue
+ objects["${BASH_REMATCH[1]}"]+="${BASH_REMATCH[2]}"
+done < /dev/kmsg
+alldeleted=1
+for object in "${!objects[@]}"; do
+ # Accept histories ending in created->destroyed, or in two creations
+ # followed by two destructions; anything else is reported on fd 3.
+ if [[ ${objects["$object"]} != *createddestroyed && ${objects["$object"]} != *createdcreateddestroyeddestroyed ]]; then
+ echo "Error: $object: merely ${objects["$object"]}" >&3
+ alldeleted=0
+ fi
+done
+[[ $alldeleted -eq 1 ]]
+pretty "" "Objects that were created were also destroyed."
diff --git a/tools/testing/selftests/wireguard/qemu/.gitignore b/tools/testing/selftests/wireguard/qemu/.gitignore
new file mode 100644
index 000000000..bfa15e6fe
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/.gitignore
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
+build/
+distfiles/
diff --git a/tools/testing/selftests/wireguard/qemu/Makefile b/tools/testing/selftests/wireguard/qemu/Makefile
new file mode 100644
index 000000000..4bdd6c1a1
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/Makefile
@@ -0,0 +1,377 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+
+# Toolchain detection.
+#   CHOST     - machine triplet reported by the plain `gcc` found in PATH.
+#   HOST_ARCH - first dash-separated field of CHOST.
+#   CBUILD    - toolchain prefix used to build the guest userspace and kernel.
+# When ARCH is given, CBUILD is derived from the first `$(ARCH)-*-gcc` found
+# while scanning the directories of PATH (directory part and "-gcc" removed);
+# it is an error if none is found. When ARCH is not given, the native
+# toolchain is used and ARCH is taken from its triplet.
+PWD := $(shell pwd)
+
+CHOST := $(shell gcc -dumpmachine)
+HOST_ARCH := $(firstword $(subst -, ,$(CHOST)))
+ifneq (,$(ARCH))
+CBUILD := $(subst -gcc,,$(lastword $(subst /, ,$(firstword $(wildcard $(foreach bindir,$(subst :, ,$(PATH)),$(bindir)/$(ARCH)-*-gcc))))))
+ifeq (,$(CBUILD))
+$(error The toolchain for $(ARCH) is not installed)
+endif
+else
+CBUILD := $(CHOST)
+ARCH := $(firstword $(subst -, ,$(CBUILD)))
+endif
+
+# Set these from the environment to override
+KERNEL_PATH ?= $(PWD)/../../../../..
+BUILD_PATH ?= $(PWD)/build/$(ARCH)
+DISTFILES_PATH ?= $(PWD)/distfiles
+NR_CPUS ?= 4
+
+MIRROR := https://download.wireguard.com/qemu-test/distfiles/
+
+default: qemu
+
+# tar_download: declare one downloadable source tarball.
+# Arguments, in order: variable name prefix, tarball project name, version,
+# tarball extension, default URI base, expected SHA-256 of the tarball.
+# Defines <prefix>_VERSION, <prefix>_NAME (project-version), <prefix>_TAR
+# (path under DISTFILES_PATH) and <prefix>_PATH (extracted directory under
+# BUILD_PATH), and instantiates file_download for the tarball, forwarding the
+# URI base and the checksum.
+define tar_download =
+$(1)_VERSION := $(3)
+$(1)_NAME := $(2)-$$($(1)_VERSION)
+$(1)_TAR := $(DISTFILES_PATH)/$$($(1)_NAME)$(4)
+$(1)_PATH := $(BUILD_PATH)/$$($(1)_NAME)
+$(call file_download,$$($(1)_NAME)$(4),$(5),$(6))
+endef
+
+# file_download: rule template that fetches one file into DISTFILES_PATH.
+# Arguments, in order: file name, fallback URI base, expected SHA-256.
+# The recipe runs under an exclusive flock on "<target>.lock":
+#   - if the target already exists, exit successfully;
+#   - download to "<target>.tmp" from MIRROR first, then from the fallback
+#     base; if both fail the temporary file is removed and the recipe exits 1;
+#   - the temporary file is moved into place only if `sha256sum -c` accepts
+#     it, otherwise it is deleted and the recipe exits with status 71.
+define file_download =
+$(DISTFILES_PATH)/$(1):
+ mkdir -p $(DISTFILES_PATH)
+ flock -x $$@.lock -c '[ -f $$@ ] && exit 0; wget -O $$@.tmp $(MIRROR)$(1) || wget -O $$@.tmp $(2)$(1) || rm -f $$@.tmp; [ -f $$@.tmp ] || exit 1; if echo "$(3) $$@.tmp" | sha256sum -c -; then mv $$@.tmp $$@; else rm -f $$@.tmp; exit 71; fi'
+endef
+
+$(eval $(call tar_download,MUSL,musl,1.2.0,.tar.gz,https://musl.libc.org/releases/,c6de7b191139142d3f9a7b5b702c9cae1b5ee6e7f57e582da9328629408fd4e8))
+$(eval $(call tar_download,IPERF,iperf,3.7,.tar.gz,https://downloads.es.net/pub/iperf/,d846040224317caf2f75c843d309a950a7db23f9b44b94688ccbe557d6d1710c))
+$(eval $(call tar_download,BASH,bash,5.0,.tar.gz,https://ftp.gnu.org/gnu/bash/,b4a80f2ac66170b2913efbfb9f2594f1f76c7b1afd11f799e22035d63077fb4d))
+$(eval $(call tar_download,IPROUTE2,iproute2,5.6.0,.tar.xz,https://www.kernel.org/pub/linux/utils/net/iproute2/,1b5b0e25ce6e23da7526ea1da044e814ad85ba761b10dd29c2b027c056b04692))
+$(eval $(call tar_download,IPTABLES,iptables,1.8.4,.tar.bz2,https://www.netfilter.org/projects/iptables/files/,993a3a5490a544c2cbf2ef15cf7e7ed21af1845baf228318d5c36ef8827e157c))
+$(eval $(call tar_download,NMAP,nmap,7.80,.tar.bz2,https://nmap.org/dist/,fcfa5a0e42099e12e4bf7a68ebe6fde05553383a682e816a7ec9256ab4773faa))
+$(eval $(call tar_download,IPUTILS,iputils,s20190709,.tar.gz,https://github.com/iputils/iputils/archive/s20190709.tar.gz/#,a15720dd741d7538dd2645f9f516d193636ae4300ff7dbc8bfca757bf166490a))
+$(eval $(call tar_download,WIREGUARD_TOOLS,wireguard-tools,1.0.20200206,.tar.xz,https://git.zx2c4.com/wireguard-tools/snapshot/,f5207248c6a3c3e3bfc9ab30b91c1897b00802ed861e1f9faaed873366078c64))
+
+KERNEL_BUILD_PATH := $(BUILD_PATH)/kernel$(if $(findstring yes,$(DEBUG_KERNEL)),-debug)
+rwildcard=$(foreach d,$(wildcard $1*),$(call rwildcard,$d/,$2) $(filter $(subst *,%,$2),$d))
+WIREGUARD_SOURCES := $(call rwildcard,$(KERNEL_PATH)/drivers/net/wireguard/,*)
+
+export CFLAGS ?= -O3 -pipe
+export LDFLAGS ?=
+export CPPFLAGS := -I$(BUILD_PATH)/include
+
+# Native build (the requested ARCH equals the architecture of the local gcc):
+# tune for the local CPU and use the plain `strip`.
+# Cross build: announce it, export CROSS_COMPILE with the detected toolchain
+# prefix, and use that toolchain's strip.
+# CROSS_COMPILE_FLAG is passed to the ./configure scripts of the userspace
+# packages built further down.
+# NOTE(review): here --build receives the cross toolchain triplet and --host
+# the local gcc triplet, which looks like the reverse of autoconf's usual
+# meaning of --build/--host -- confirm this is intended.
+ifeq ($(HOST_ARCH),$(ARCH))
+CROSS_COMPILE_FLAG := --host=$(CHOST)
+CFLAGS += -march=native
+STRIP := strip
+else
+$(info Cross compilation: building for $(CBUILD) using $(CHOST))
+CROSS_COMPILE_FLAG := --build=$(CBUILD) --host=$(CHOST)
+export CROSS_COMPILE=$(CBUILD)-
+STRIP := $(CBUILD)-strip
+endif
+ifeq ($(ARCH),aarch64)
+QEMU_ARCH := aarch64
+KERNEL_ARCH := arm64
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+else
+QEMU_MACHINE := -cpu cortex-a53 -machine virt
+CFLAGS += -march=armv8-a -mtune=cortex-a53
+endif
+else ifeq ($(ARCH),aarch64_be)
+QEMU_ARCH := aarch64
+KERNEL_ARCH := arm64
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm64/boot/Image
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+else
+QEMU_MACHINE := -cpu cortex-a53 -machine virt
+CFLAGS += -march=armv8-a -mtune=cortex-a53
+endif
+else ifeq ($(ARCH),arm)
+QEMU_ARCH := arm
+KERNEL_ARCH := arm
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+else
+QEMU_MACHINE := -cpu cortex-a15 -machine virt
+CFLAGS += -march=armv7-a -mtune=cortex-a15 -mabi=aapcs-linux
+endif
+else ifeq ($(ARCH),armeb)
+QEMU_ARCH := arm
+KERNEL_ARCH := arm
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/arm/boot/zImage
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine virt,gic_version=host,accel=kvm
+else
+QEMU_MACHINE := -cpu cortex-a15 -machine virt
+CFLAGS += -march=armv7-a -mabi=aapcs-linux # We don't pass -mtune=cortex-a15 due to a compiler bug on big endian.
+LDFLAGS += -Wl,--be8
+endif
+else ifeq ($(ARCH),x86_64)
+QEMU_ARCH := x86_64
+KERNEL_ARCH := x86_64
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine q35,accel=kvm
+else
+QEMU_MACHINE := -cpu Skylake-Server -machine q35
+CFLAGS += -march=skylake-avx512
+endif
+else ifeq ($(ARCH),i686)
+QEMU_ARCH := i386
+KERNEL_ARCH := x86
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage
+ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH))
+QEMU_MACHINE := -cpu host -machine q35,accel=kvm
+else
+QEMU_MACHINE := -cpu coreduo -machine q35
+CFLAGS += -march=prescott
+endif
+else ifeq ($(ARCH),mips64)
+QEMU_ARCH := mips64
+KERNEL_ARCH := mips
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+CFLAGS += -EB
+else
+QEMU_MACHINE := -cpu MIPS64R2-generic -machine malta -smp 1
+CFLAGS += -march=mips64r2 -EB
+endif
+else ifeq ($(ARCH),mips64el)
+QEMU_ARCH := mips64el
+KERNEL_ARCH := mips
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+CFLAGS += -EL
+else
+QEMU_MACHINE := -cpu MIPS64R2-generic -machine malta -smp 1
+CFLAGS += -march=mips64r2 -EL
+endif
+else ifeq ($(ARCH),mips)
+QEMU_ARCH := mips
+KERNEL_ARCH := mips
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+CFLAGS += -EB
+else
+QEMU_MACHINE := -cpu 24Kf -machine malta -smp 1
+CFLAGS += -march=mips32r2 -EB
+endif
+else ifeq ($(ARCH),mipsel)
+QEMU_ARCH := mipsel
+KERNEL_ARCH := mips
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host -machine malta,accel=kvm
+CFLAGS += -EL
+else
+QEMU_MACHINE := -cpu 24Kf -machine malta -smp 1
+CFLAGS += -march=mips32r2 -EL
+endif
+else ifeq ($(ARCH),powerpc64le)
+QEMU_ARCH := ppc64
+KERNEL_ARCH := powerpc
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host,accel=kvm -machine pseries
+else
+QEMU_MACHINE := -machine pseries
+endif
+CFLAGS += -mcpu=powerpc64le -mlong-double-64
+else ifeq ($(ARCH),powerpc)
+QEMU_ARCH := ppc
+KERNEL_ARCH := powerpc
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/powerpc/boot/uImage
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host,accel=kvm -machine ppce500
+else
+QEMU_MACHINE := -machine ppce500
+endif
+CFLAGS += -mcpu=powerpc -mlong-double-64 -msecure-plt
+else ifeq ($(ARCH),m68k)
+# m68k is the only branch that passes the kernel command line to QEMU with
+# -append: KERNEL_CMDLINE is whatever follows "CONFIG_CMDLINE=" in
+# arch/m68k.config, taken verbatim (surrounding quotes included).
+# NOTE(review): in the native branch accel=kvm is written as a suboption of
+# -cpu, whereas the x86, arm and mips branches attach it to -machine --
+# confirm QEMU accepts this form.
+QEMU_ARCH := m68k
+KERNEL_ARCH := m68k
+KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/vmlinux
+KERNEL_CMDLINE := $(shell sed -n 's/CONFIG_CMDLINE=\(.*\)/\1/p' arch/m68k.config)
+ifeq ($(HOST_ARCH),$(ARCH))
+QEMU_MACHINE := -cpu host,accel=kvm -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
+else
+QEMU_MACHINE := -machine q800 -smp 1 -append $(KERNEL_CMDLINE)
+endif
+else
+$(error I only build: x86_64, i686, arm, armeb, aarch64, aarch64_be, mips, mipsel, mips64, mips64el, powerpc64le, powerpc, m68k)
+endif
+
+REAL_CC := $(CBUILD)-gcc
+MUSL_CC := $(BUILD_PATH)/musl-gcc
+export CC := $(MUSL_CC)
+USERSPACE_DEPS := $(MUSL_CC) $(BUILD_PATH)/include/.installed $(BUILD_PATH)/include/linux/.installed
+
+# build: only produce the kernel image.
+# qemu:  boot that image under QEMU, bounded by a 20 minute timeout.
+#   - the first serial port is connected to stdio, the second is captured in
+#     $(BUILD_PATH)/result (removed before each run);
+#   - the guest gets 1G of RAM when the kernel .config enables
+#     CONFIG_DEBUG_KMEMLEAK and 256M otherwise;
+#   - the target succeeds only if the result file contains "success".
+build: $(KERNEL_BZIMAGE)
+qemu: $(KERNEL_BZIMAGE)
+ rm -f $(BUILD_PATH)/result
+ timeout --foreground 20m qemu-system-$(QEMU_ARCH) \
+ -nodefaults \
+ -nographic \
+ -smp $(NR_CPUS) \
+ $(QEMU_MACHINE) \
+ -m $$(grep -q CONFIG_DEBUG_KMEMLEAK=y $(KERNEL_BUILD_PATH)/.config && echo 1G || echo 256M) \
+ -serial stdio \
+ -serial file:$(BUILD_PATH)/result \
+ -no-reboot \
+ -monitor none \
+ -kernel $<
+ grep -Fq success $(BUILD_PATH)/result
+
+# Write the initramfs description that the kernel build consumes (the .config
+# rule points CONFIG_INITRAMFS_SOURCE at this file). It places the compiled
+# init program at /init, the test script ../netns.sh at /init.sh, the tools
+# the script uses under /bin, and musl's libc.so under /lib, with
+# /lib/ld-linux.so.1 as a symlink to it. The first echo truncates the file;
+# the rest append.
+$(BUILD_PATH)/init-cpio-spec.txt:
+ mkdir -p $(BUILD_PATH)
+ echo "file /init $(BUILD_PATH)/init 755 0 0" > $@
+ echo "file /init.sh $(PWD)/../netns.sh 755 0 0" >> $@
+ echo "dir /dev 755 0 0" >> $@
+ echo "nod /dev/console 644 0 0 c 5 1" >> $@
+ echo "dir /bin 755 0 0" >> $@
+ echo "file /bin/iperf3 $(IPERF_PATH)/src/iperf3 755 0 0" >> $@
+ echo "file /bin/wg $(WIREGUARD_TOOLS_PATH)/src/wg 755 0 0" >> $@
+ echo "file /bin/bash $(BASH_PATH)/bash 755 0 0" >> $@
+ echo "file /bin/ip $(IPROUTE2_PATH)/ip/ip 755 0 0" >> $@
+ echo "file /bin/ss $(IPROUTE2_PATH)/misc/ss 755 0 0" >> $@
+ echo "file /bin/ping $(IPUTILS_PATH)/ping 755 0 0" >> $@
+ echo "file /bin/ncat $(NMAP_PATH)/ncat/ncat 755 0 0" >> $@
+ echo "file /bin/xtables-legacy-multi $(IPTABLES_PATH)/iptables/xtables-legacy-multi 755 0 0" >> $@
+ echo "slink /bin/iptables xtables-legacy-multi 777 0 0" >> $@
+ echo "slink /bin/ping6 ping 777 0 0" >> $@
+ echo "dir /lib 755 0 0" >> $@
+ echo "file /lib/libc.so $(MUSL_PATH)/lib/libc.so 755 0 0" >> $@
+ echo "slink /lib/ld-linux.so.1 libc.so 777 0 0" >> $@
+
+# Generate the kernel .config:
+#   1. minimal.config = kernel.config + CONFIG_NR_CPUS + CONFIG_INITRAMFS_SOURCE
+#      (pointing at init-cpio-spec.txt) + the per-architecture fragment;
+#   2. run allnoconfig in the out-of-tree build directory;
+#   3. merge minimal.config on top of it with merge_config.sh;
+#   4. when DEBUG_KERNEL contains "yes", copy debug.config next to it and
+#      merge that as well.
+$(KERNEL_BUILD_PATH)/.config: kernel.config arch/$(ARCH).config
+ mkdir -p $(KERNEL_BUILD_PATH)
+ cp kernel.config $(KERNEL_BUILD_PATH)/minimal.config
+ printf 'CONFIG_NR_CPUS=$(NR_CPUS)\nCONFIG_INITRAMFS_SOURCE="$(BUILD_PATH)/init-cpio-spec.txt"\n' >> $(KERNEL_BUILD_PATH)/minimal.config
+ cat arch/$(ARCH).config >> $(KERNEL_BUILD_PATH)/minimal.config
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) allnoconfig
+ cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config $(KERNEL_BUILD_PATH)/minimal.config
+ $(if $(findstring yes,$(DEBUG_KERNEL)),cp debug.config $(KERNEL_BUILD_PATH) && cd $(KERNEL_BUILD_PATH) && ARCH=$(KERNEL_ARCH) $(KERNEL_PATH)/scripts/kconfig/merge_config.sh -n $(KERNEL_BUILD_PATH)/.config debug.config,)
+
+$(KERNEL_BZIMAGE): $(KERNEL_BUILD_PATH)/.config $(BUILD_PATH)/init-cpio-spec.txt $(MUSL_PATH)/lib/libc.so $(IPERF_PATH)/src/iperf3 $(IPUTILS_PATH)/ping $(BASH_PATH)/bash $(IPROUTE2_PATH)/misc/ss $(IPROUTE2_PATH)/ip/ip $(IPTABLES_PATH)/iptables/xtables-legacy-multi $(NMAP_PATH)/ncat/ncat $(WIREGUARD_TOOLS_PATH)/src/wg $(BUILD_PATH)/init ../netns.sh $(WIREGUARD_SOURCES)
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE)
+
+$(BUILD_PATH)/include/linux/.installed: | $(KERNEL_BUILD_PATH)/.config
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) INSTALL_HDR_PATH=$(BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) headers_install
+ touch $@
+
+# musl libc: unpack the tarball (holding a shared lock on the same .lock file
+# the download recipe locks exclusively), configure it with the real
+# (non-wrapper) compiler as a shared-only build, build it, and strip the
+# resulting libc.so.
+$(MUSL_PATH)/lib/libc.so: $(MUSL_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ cd $(MUSL_PATH) && CC=$(REAL_CC) ./configure --prefix=/ --disable-static --build=$(CBUILD)
+ $(MAKE) -C $(MUSL_PATH)
+ $(STRIP) -s $@
+
+# Install musl's headers under $(BUILD_PATH); the .installed file is the stamp.
+$(BUILD_PATH)/include/.installed: $(MUSL_PATH)/lib/libc.so
+ $(MAKE) -C $(MUSL_PATH) DESTDIR=$(BUILD_PATH) install-headers
+ touch $@
+
+# musl-gcc: generate a specs file from musl's own script (include directory,
+# library directory, and /lib/ld-linux.so.1 as the dynamic loader path), then
+# write a small executable shell wrapper that runs the real compiler with
+# those specs.
+$(MUSL_CC): $(MUSL_PATH)/lib/libc.so
+ sh $(MUSL_PATH)/tools/musl-gcc.specs.sh $(BUILD_PATH)/include $(MUSL_PATH)/lib /lib/ld-linux.so.1 > $(BUILD_PATH)/musl-gcc.specs
+ printf '#!/bin/sh\nexec "$(REAL_CC)" --specs="$(BUILD_PATH)/musl-gcc.specs" "$$@"\n' > $(BUILD_PATH)/musl-gcc
+ chmod +x $(BUILD_PATH)/musl-gcc
+
+$(IPERF_PATH)/.installed: $(IPERF_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ sed -i '1s/^/#include <stdint.h>/' $(IPERF_PATH)/src/cjson.h $(IPERF_PATH)/src/timer.h
+ sed -i -r 's/-p?g//g' $(IPERF_PATH)/src/Makefile*
+ touch $@
+
+$(IPERF_PATH)/src/iperf3: | $(IPERF_PATH)/.installed $(USERSPACE_DEPS)
+ cd $(IPERF_PATH) && CFLAGS="$(CFLAGS) -D_GNU_SOURCE" ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --with-openssl=no
+ $(MAKE) -C $(IPERF_PATH)
+ $(STRIP) -s $@
+
+$(WIREGUARD_TOOLS_PATH)/.installed: $(WIREGUARD_TOOLS_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ touch $@
+
+$(WIREGUARD_TOOLS_PATH)/src/wg: | $(WIREGUARD_TOOLS_PATH)/.installed $(USERSPACE_DEPS)
+ $(MAKE) -C $(WIREGUARD_TOOLS_PATH)/src wg
+ $(STRIP) -s $@
+
+$(BUILD_PATH)/init: init.c | $(USERSPACE_DEPS)
+ mkdir -p $(BUILD_PATH)
+ $(MUSL_CC) -o $@ $(CFLAGS) $(LDFLAGS) -std=gnu11 $<
+ $(STRIP) -s $@
+
+$(IPUTILS_PATH)/.installed: $(IPUTILS_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ touch $@
+
+$(IPUTILS_PATH)/ping: | $(IPUTILS_PATH)/.installed $(USERSPACE_DEPS)
+ sed -i /atexit/d $(IPUTILS_PATH)/ping.c
+ cd $(IPUTILS_PATH) && $(CC) $(CFLAGS) -std=c99 -o $@ ping.c ping_common.c ping6_common.c iputils_common.c -D_GNU_SOURCE -D'IPUTILS_VERSION(f)=f' -lresolv $(LDFLAGS)
+ $(STRIP) -s $@
+
+$(BASH_PATH)/.installed: $(BASH_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ touch $@
+
+$(BASH_PATH)/bash: | $(BASH_PATH)/.installed $(USERSPACE_DEPS)
+ cd $(BASH_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --without-bash-malloc --disable-debugger --disable-help-builtin --disable-history --disable-multibyte --disable-progcomp --disable-readline --disable-mem-scramble
+ $(MAKE) -C $(BASH_PATH)
+ $(STRIP) -s $@
+
+$(IPROUTE2_PATH)/.installed: $(IPROUTE2_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ printf 'CC:=$(CC)\nPKG_CONFIG:=pkg-config\nTC_CONFIG_XT:=n\nTC_CONFIG_ATM:=n\nTC_CONFIG_IPSET:=n\nIP_CONFIG_SETNS:=y\nHAVE_ELF:=n\nHAVE_MNL:=n\nHAVE_BERKELEY_DB:=n\nHAVE_LATEX:=n\nHAVE_PDFLATEX:=n\nCFLAGS+=-DHAVE_SETNS\n' > $(IPROUTE2_PATH)/config.mk
+ printf 'lib: snapshot\n\t$$(MAKE) -C lib\nip/ip: lib\n\t$$(MAKE) -C ip ip\nmisc/ss: lib\n\t$$(MAKE) -C misc ss\n' >> $(IPROUTE2_PATH)/Makefile
+ touch $@
+
+$(IPROUTE2_PATH)/ip/ip: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS)
+ $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ ip/ip
+ $(STRIP) -s $@
+
+$(IPROUTE2_PATH)/misc/ss: | $(IPROUTE2_PATH)/.installed $(USERSPACE_DEPS)
+ $(MAKE) -C $(IPROUTE2_PATH) PREFIX=/ misc/ss
+ $(STRIP) -s $@
+
+$(IPTABLES_PATH)/.installed: $(IPTABLES_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ sed -i -e "/nfnetlink=[01]/s:=[01]:=0:" -e "/nfconntrack=[01]/s:=[01]:=0:" $(IPTABLES_PATH)/configure
+ touch $@
+
+$(IPTABLES_PATH)/iptables/xtables-legacy-multi: | $(IPTABLES_PATH)/.installed $(USERSPACE_DEPS)
+ cd $(IPTABLES_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --disable-nftables --disable-bpf-compiler --disable-nfsynproxy --disable-libipq --disable-connlabel --with-kernel=$(BUILD_PATH)/include
+ $(MAKE) -C $(IPTABLES_PATH)
+ $(STRIP) -s $@
+
+$(NMAP_PATH)/.installed: $(NMAP_TAR)
+ mkdir -p $(BUILD_PATH)
+ flock -s $<.lock tar -C $(BUILD_PATH) -xf $<
+ touch $@
+
+$(NMAP_PATH)/ncat/ncat: | $(NMAP_PATH)/.installed $(USERSPACE_DEPS)
+ cd $(NMAP_PATH) && ./configure --prefix=/ $(CROSS_COMPILE_FLAG) --enable-static --disable-shared --without-ndiff --without-zenmap --without-nping --with-libpcap=included --with-libpcre=included --with-libdnet=included --without-liblua --with-liblinear=included --without-nmap-update --without-openssl --with-pcap=linux --without-libssh
+ $(MAKE) -C $(NMAP_PATH)/libpcap
+ $(MAKE) -C $(NMAP_PATH)/ncat
+ $(STRIP) -s $@
+
+clean:
+ rm -rf $(BUILD_PATH)
+
+distclean: clean
+ rm -rf $(DISTFILES_PATH)
+
+menuconfig: $(KERNEL_BUILD_PATH)/.config
+ $(MAKE) -C $(KERNEL_PATH) O=$(KERNEL_BUILD_PATH) ARCH=$(KERNEL_ARCH) CROSS_COMPILE=$(CROSS_COMPILE) menuconfig
+
+.PHONY: qemu build clean distclean menuconfig
+.DELETE_ON_ERROR:
diff --git a/tools/testing/selftests/wireguard/qemu/arch/aarch64.config b/tools/testing/selftests/wireguard/qemu/arch/aarch64.config
new file mode 100644
index 000000000..3d063bb24
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/aarch64.config
@@ -0,0 +1,5 @@
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
+CONFIG_FRAME_WARN=1280
diff --git a/tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config b/tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config
new file mode 100644
index 000000000..dbdc7e406
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/aarch64_be.config
@@ -0,0 +1,6 @@
+CONFIG_CPU_BIG_ENDIAN=y
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
+CONFIG_FRAME_WARN=1280
diff --git a/tools/testing/selftests/wireguard/qemu/arch/arm.config b/tools/testing/selftests/wireguard/qemu/arch/arm.config
new file mode 100644
index 000000000..148f49905
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/arm.config
@@ -0,0 +1,9 @@
+CONFIG_MMU=y
+CONFIG_ARCH_MULTI_V7=y
+CONFIG_ARCH_VIRT=y
+CONFIG_THUMB2_KERNEL=n
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/armeb.config b/tools/testing/selftests/wireguard/qemu/arch/armeb.config
new file mode 100644
index 000000000..bd76b07d0
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/armeb.config
@@ -0,0 +1,10 @@
+CONFIG_MMU=y
+CONFIG_ARCH_MULTI_V7=y
+CONFIG_ARCH_VIRT=y
+CONFIG_THUMB2_KERNEL=n
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyAMA0 wg.success=ttyAMA1"
+CONFIG_CPU_BIG_ENDIAN=y
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/i686.config b/tools/testing/selftests/wireguard/qemu/arch/i686.config
new file mode 100644
index 000000000..a85025d72
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/i686.config
@@ -0,0 +1,5 @@
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/m68k.config b/tools/testing/selftests/wireguard/qemu/arch/m68k.config
new file mode 100644
index 000000000..62a15bdb8
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/m68k.config
@@ -0,0 +1,9 @@
+CONFIG_MMU=y
+CONFIG_M68KCLASSIC=y
+CONFIG_M68040=y
+CONFIG_MAC=y
+CONFIG_SERIAL_PMACZILOG=y
+CONFIG_SERIAL_PMACZILOG_TTYS=y
+CONFIG_SERIAL_PMACZILOG_CONSOLE=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/mips.config b/tools/testing/selftests/wireguard/qemu/arch/mips.config
new file mode 100644
index 000000000..df71d6b95
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/mips.config
@@ -0,0 +1,11 @@
+CONFIG_CPU_MIPS32_R2=y
+CONFIG_MIPS_MALTA=y
+CONFIG_MIPS_CPS=y
+CONFIG_MIPS_FP_SUPPORT=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_SYSCON=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/mips64.config b/tools/testing/selftests/wireguard/qemu/arch/mips64.config
new file mode 100644
index 000000000..90c783f72
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/mips64.config
@@ -0,0 +1,14 @@
+CONFIG_64BIT=y
+CONFIG_CPU_MIPS64_R2=y
+CONFIG_MIPS32_N32=y
+CONFIG_CPU_HAS_MSA=y
+CONFIG_MIPS_MALTA=y
+CONFIG_MIPS_CPS=y
+CONFIG_MIPS_FP_SUPPORT=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_SYSCON=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1280
diff --git a/tools/testing/selftests/wireguard/qemu/arch/mips64el.config b/tools/testing/selftests/wireguard/qemu/arch/mips64el.config
new file mode 100644
index 000000000..435b0b43e
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/mips64el.config
@@ -0,0 +1,15 @@
+CONFIG_64BIT=y
+CONFIG_CPU_MIPS64_R2=y
+CONFIG_MIPS32_N32=y
+CONFIG_CPU_HAS_MSA=y
+CONFIG_MIPS_MALTA=y
+CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_MIPS_CPS=y
+CONFIG_MIPS_FP_SUPPORT=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_SYSCON=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1280
diff --git a/tools/testing/selftests/wireguard/qemu/arch/mipsel.config b/tools/testing/selftests/wireguard/qemu/arch/mipsel.config
new file mode 100644
index 000000000..62bb50c4a
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/mipsel.config
@@ -0,0 +1,12 @@
+CONFIG_CPU_MIPS32_R2=y
+CONFIG_MIPS_MALTA=y
+CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_MIPS_CPS=y
+CONFIG_MIPS_FP_SUPPORT=y
+CONFIG_POWER_RESET=y
+CONFIG_POWER_RESET_SYSCON=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/powerpc.config b/tools/testing/selftests/wireguard/qemu/arch/powerpc.config
new file mode 100644
index 000000000..57957093b
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc.config
@@ -0,0 +1,10 @@
+CONFIG_PPC_QEMU_E500=y
+CONFIG_FSL_SOC_BOOKE=y
+CONFIG_PPC_85xx=y
+CONFIG_PHYS_64BIT=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_MATH_EMULATION=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1024
diff --git a/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
new file mode 100644
index 000000000..f52f1e2bc
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/powerpc64le.config
@@ -0,0 +1,13 @@
+CONFIG_PPC64=y
+CONFIG_PPC_PSERIES=y
+CONFIG_ALTIVEC=y
+CONFIG_VSX=y
+CONFIG_PPC_OF_BOOT_TRAMPOLINE=y
+CONFIG_PPC_RADIX_MMU=y
+CONFIG_HVC_CONSOLE=y
+CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=hvc0 wg.success=hvc1"
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+CONFIG_FRAME_WARN=1280
+CONFIG_THREAD_SHIFT=14
diff --git a/tools/testing/selftests/wireguard/qemu/arch/x86_64.config b/tools/testing/selftests/wireguard/qemu/arch/x86_64.config
new file mode 100644
index 000000000..00a1ef486
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/arch/x86_64.config
@@ -0,0 +1,5 @@
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=ttyS0 wg.success=ttyS1"
+CONFIG_FRAME_WARN=1280
diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config
new file mode 100644
index 000000000..a92c5590e
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/debug.config
@@ -0,0 +1,64 @@
+CONFIG_LOCALVERSION="-debug"
+CONFIG_ENABLE_MUST_CHECK=y
+CONFIG_FRAME_POINTER=y
+CONFIG_STACK_VALIDATION=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_INFO_DWARF4=y
+CONFIG_PAGE_EXTENSION=y
+CONFIG_PAGE_POISONING=y
+CONFIG_DEBUG_OBJECTS=y
+CONFIG_DEBUG_OBJECTS_FREE=y
+CONFIG_DEBUG_OBJECTS_TIMERS=y
+CONFIG_DEBUG_OBJECTS_WORK=y
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
+CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1
+CONFIG_SLUB_DEBUG_ON=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_HAVE_DEBUG_STACKOVERFLOW=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_HAVE_ARCH_KMEMCHECK=y
+CONFIG_HAVE_ARCH_KASAN=y
+CONFIG_KASAN=y
+CONFIG_KASAN_INLINE=y
+CONFIG_UBSAN=y
+CONFIG_UBSAN_SANITIZE_ALL=y
+CONFIG_UBSAN_NULL=y
+CONFIG_DEBUG_KMEMLEAK=y
+CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE=8192
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_SHIRQ=y
+CONFIG_WQ_WATCHDOG=y
+CONFIG_SCHED_DEBUG=y
+CONFIG_SCHED_INFO=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SCHED_STACK_END_CHECK=y
+CONFIG_DEBUG_TIMEKEEPING=y
+CONFIG_TIMER_STATS=y
+CONFIG_DEBUG_PREEMPT=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUG_MUTEXES=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_LOCKDEP=y
+CONFIG_DEBUG_ATOMIC_SLEEP=y
+CONFIG_TRACE_IRQFLAGS=y
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DEBUG_LIST=y
+CONFIG_DEBUG_PLIST=y
+CONFIG_PROVE_RCU=y
+CONFIG_SPARSE_RCU_POINTER=y
+CONFIG_RCU_CPU_STALL_TIMEOUT=21
+CONFIG_RCU_TRACE=y
+CONFIG_RCU_EQS_DEBUG=y
+CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_DEBUG_SG=y
+CONFIG_DEBUG_NOTIFIERS=y
+CONFIG_X86_DEBUG_FPU=y
+CONFIG_DEBUG_SECTION_MISMATCH=y
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT=y
+CONFIG_DEBUG_WW_MUTEX_SLOWPATH=y
diff --git a/tools/testing/selftests/wireguard/qemu/init.c b/tools/testing/selftests/wireguard/qemu/init.c
new file mode 100644
index 000000000..c9698120a
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/init.c
@@ -0,0 +1,284 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/reboot.h>
+#include <sys/utsname.h>
+#include <sys/sendfile.h>
+#include <sys/sysmacros.h>
+#include <linux/random.h>
+#include <linux/version.h>
+
+/*
+ * Flush stdio and ask the kernel to restart via reboot(RB_AUTOBOOT). Never
+ * returns: if the process is still running 30 seconds later, a failure
+ * banner is printed and the process exits with status 1.
+ * NOTE(review): the accompanying Makefile starts QEMU with -no-reboot, so
+ * the restart request presumably terminates the VM -- confirm.
+ */
+__attribute__((noreturn)) static void poweroff(void)
+{
+ fflush(stdout);
+ fflush(stderr);
+ reboot(RB_AUTOBOOT);
+ sleep(30);
+ fprintf(stderr, "\x1b[37m\x1b[41m\x1b[1mFailed to power off!!!\x1b[0m\n");
+ exit(1);
+}
+
+/*
+ * Report a fatal error and power off. Prints a highlighted banner to stderr
+ * containing "<what>: <strerror(errno)>", so errno should still hold the
+ * value of the failing call when this is invoked.
+ */
+static void panic(const char *what)
+{
+ fprintf(stderr, "\n\n\x1b[37m\x1b[41m\x1b[1mSOMETHING WENT HORRIBLY WRONG\x1b[0m\n\n \x1b[31m\x1b[1m%s: %s\x1b[0m\n\n\x1b[37m\x1b[44m\x1b[1mPower off...\x1b[0m\n\n", what, strerror(errno));
+ poweroff();
+}
+
+#define pretty_message(msg) puts("\x1b[32m\x1b[1m" msg "\x1b[0m")
+
+/*
+ * Print a three-line coloured banner naming the test suite and the running
+ * kernel (sysname, release, machine from uname()). The first and third lines
+ * are padding: "%*.s" with an empty string prints `len` spaces.
+ */
+static void print_banner(void)
+{
+ struct utsname utsname;
+ int len;
+
+ if (uname(&utsname) < 0)
+ panic("uname");
+
+ /* Width of the padding lines, derived from the text of the middle line. */
+ len = strlen(" WireGuard Test Suite on ") + strlen(utsname.sysname) + strlen(utsname.release) + strlen(utsname.machine);
+ printf("\x1b[45m\x1b[33m\x1b[1m%*.s\x1b[0m\n\x1b[45m\x1b[33m\x1b[1m WireGuard Test Suite on %s %s %s \x1b[0m\n\x1b[45m\x1b[33m\x1b[1m%*.s\x1b[0m\n\n", len, "", utsname.sysname, utsname.release, utsname.machine, len, "");
+}
+
+/*
+ * Credit the kernel RNG with entropy so the tests do not wait on it. Creates
+ * the /dev/urandom character device node (major 1, minor 9), then issues the
+ * RNDADDENTROPY ioctl 256 times, each time submitting the same fixed 256-byte
+ * buffer and claiming 8 bits of entropy per byte. The data is a constant
+ * string: this is only acceptable inside the throwaway test VM.
+ */
+static void seed_rng(void)
+{
+ int fd;
+ /* Presumably mirrors struct rand_pool_info from <linux/random.h> -- confirm. */
+ struct {
+ int entropy_count;
+ int buffer_size;
+ unsigned char buffer[256];
+ } entropy = {
+ .entropy_count = sizeof(entropy.buffer) * 8,
+ .buffer_size = sizeof(entropy.buffer),
+ .buffer = "Adding real entropy is not actually important for these tests. Don't try this at home, kids!"
+ };
+
+ if (mknod("/dev/urandom", S_IFCHR | 0644, makedev(1, 9)))
+ panic("mknod(/dev/urandom)");
+ fd = open("/dev/urandom", O_WRONLY);
+ if (fd < 0)
+ panic("open(urandom)");
+ for (int i = 0; i < 256; ++i) {
+ if (ioctl(fd, RNDADDENTROPY, &entropy) < 0)
+ panic("ioctl(urandom)");
+ }
+ close(fd);
+}
+
+/*
+ * Create the standard mount points and mount the pseudo filesystems the
+ * test script needs. Directory creation is best-effort (results ignored);
+ * failure of any required mount or symlink is fatal. debugfs is optional.
+ */
+static void mount_filesystems(void)
+{
+	static const char *const mount_points[] = {
+		"/dev", "/proc", "/sys", "/tmp", "/run", "/var"
+	};
+	static const struct {
+		const char *target;
+		const char *fstype;
+		const char *fail_msg;
+	} required[] = {
+		{ "/dev", "devtmpfs", "devtmpfs mount" },
+		{ "/proc", "proc", "procfs mount" },
+		{ "/sys", "sysfs", "sysfs mount" },
+		{ "/tmp", "tmpfs", "tmpfs mount" },
+		{ "/run", "tmpfs", "tmpfs mount" },
+	};
+
+	pretty_message("[+] Mounting filesystems...");
+
+	/* All directories are created before anything is mounted. */
+	for (size_t i = 0; i < sizeof(mount_points) / sizeof(mount_points[0]); ++i)
+		mkdir(mount_points[i], 0755);
+
+	for (size_t i = 0; i < sizeof(required) / sizeof(required[0]); ++i) {
+		if (mount("none", required[i].target, required[i].fstype, 0, NULL))
+			panic(required[i].fail_msg);
+	}
+
+	/* Not a problem if debugfs cannot be mounted. */
+	(void)mount("none", "/sys/kernel/debug", "debugfs", 0, NULL);
+
+	if (symlink("/run", "/var/run"))
+		panic("run symlink");
+	if (symlink("/proc/self/fd", "/dev/fd"))
+		panic("fd symlink");
+}
+
+/*
+ * Write a two-byte value to a /proc/sys file. A file that cannot be opened
+ * is silently skipped; a write that does not transfer both bytes is fatal
+ * and reported with fail_msg.
+ */
+static void write_knob_if_present(const char *path, const char *value, const char *fail_msg)
+{
+	int knob = open(path, O_WRONLY);
+
+	if (knob < 0)
+		return;
+	if (write(knob, value, 2) != 2)
+		panic(fail_msg);
+	close(knob);
+}
+
+/*
+ * Turn up kernel diagnostics for the test run: printk level 9, exception
+ * tracing on, and panic_on_warn on. Each setting is applied only if its
+ * /proc/sys file exists.
+ */
+static void enable_logging(void)
+{
+	pretty_message("[+] Enabling logging...");
+	write_knob_if_present("/proc/sys/kernel/printk", "9\n", "write(printk)");
+	write_knob_if_present("/proc/sys/debug/exception-trace", "1\n", "write(exception-trace)");
+	write_knob_if_present("/proc/sys/kernel/panic_on_warn", "1\n", "write(panic_on_warn)");
+}
+
+/*
+ * Report the results of the WireGuard module's built-in self-tests by
+ * reading the kernel log from /proc/kmsg in non-blocking mode. Every line
+ * containing "wireguard: " is printed; a line counts as a pass only when the
+ * text after that prefix ends in ": pass". Reading stops at the first such
+ * line mentioning "www.wireguard.com" (presumably the module's banner, which
+ * follows the self-test output -- confirm) or when no more data is available.
+ * Powers off if any line was not a pass.
+ */
+static void kmod_selftests(void)
+{
+ FILE *file;
+ char line[2048], *start, *pass;
+ bool success = true;
+ pretty_message("[+] Module self-tests:");
+ file = fopen("/proc/kmsg", "r");
+ if (!file)
+ panic("fopen(kmsg)");
+ /* Non-blocking so that fgets() fails instead of waiting once the log is drained. */
+ if (fcntl(fileno(file), F_SETFL, O_NONBLOCK) < 0)
+ panic("fcntl(kmsg, nonblock)");
+ while (fgets(line, sizeof(line), file)) {
+ start = strstr(line, "wireguard: ");
+ if (!start)
+ continue;
+ /* Skip past the 11-character "wireguard: " prefix. */
+ start += 11;
+ /* Cut the line at the newline, if any. */
+ *strchrnul(start, '\n') = '\0';
+ if (strstr(start, "www.wireguard.com"))
+ break;
+ pass = strstr(start, ": pass");
+ /* ": pass" is 6 characters; it must be the very end of the line. */
+ if (!pass || pass[6] != '\0') {
+ success = false;
+ printf(" \x1b[31m* %s\x1b[0m\n", start);
+ } else
+ printf(" \x1b[32m* %s\x1b[0m\n", start);
+ }
+ fclose(file);
+ if (!success) {
+ puts("\x1b[31m\x1b[1m[-] Tests failed! \u2639\x1b[0m");
+ poweroff();
+ }
+}
+
+/*
+ * Run the test script and report its outcome.
+ *
+ * Forks and executes /init.sh, then waits for it. If the script exits with
+ * status 0, the kernel command line is searched for a "wg.success=<dev>"
+ * parameter and the line "success\n" is written to /dev/<dev>; any failure
+ * along that path is fatal. If the script fails, the exit code or the
+ * terminating signal is printed and nothing is written to the success device.
+ */
+static void launch_tests(void)
+{
+	char cmdline[4096], *success_dev;
+	int status, fd;
+	ssize_t len;
+	pid_t pid;
+
+	pretty_message("[+] Launching tests...");
+	pid = fork();
+	if (pid == -1)
+		panic("fork");
+	else if (pid == 0) {
+		execl("/init.sh", "init", NULL);
+		/* Only reached when execl() itself failed. */
+		panic("exec");
+	}
+	if (waitpid(pid, &status, 0) < 0)
+		panic("waitpid");
+	if (WIFEXITED(status) && WEXITSTATUS(status) == 0) {
+		pretty_message("[+] Tests successful! :-)");
+		fd = open("/proc/cmdline", O_RDONLY);
+		if (fd < 0)
+			panic("open(/proc/cmdline)");
+		len = read(fd, cmdline, sizeof(cmdline) - 1);
+		if (len <= 0)
+			panic("read(/proc/cmdline)");
+		close(fd);
+		/*
+		 * Terminate right after the bytes actually read, so that the
+		 * tokenizer below never looks at uninitialised buffer contents.
+		 */
+		cmdline[len] = '\0';
+		for (success_dev = strtok(cmdline, " \n"); success_dev; success_dev = strtok(NULL, " \n")) {
+			if (strncmp(success_dev, "wg.success=", 11))
+				continue;
+			/*
+			 * Overwrite the last five characters of "wg.success="
+			 * with "/dev/" so the token becomes a device path in place.
+			 */
+			memcpy(success_dev + 11 - 5, "/dev/", 5);
+			success_dev += 11 - 5;
+			break;
+		}
+		if (!success_dev || !strlen(success_dev))
+			panic("Unable to find success device");
+
+		fd = open(success_dev, O_WRONLY);
+		if (fd < 0)
+			panic("open(success_dev)");
+		if (write(fd, "success\n", 8) != 8)
+			panic("write(success_dev)");
+		close(fd);
+	} else {
+		const char *why = "unknown cause";
+		int what = -1;
+
+		if (WIFEXITED(status)) {
+			why = "exit code";
+			what = WEXITSTATUS(status);
+		} else if (WIFSIGNALED(status)) {
+			why = "signal";
+			what = WTERMSIG(status);
+		}
+		printf("\x1b[31m\x1b[1m[-] Tests failed with %s %d! \u2639\x1b[0m\n", why, what);
+	}
+}
+
+/*
+ * Attach stdin, stdout and stderr to /dev/console. Makes up to 1000
+ * attempts, sleeping 50 ms after each failed open. An attempt only counts as
+ * successful once a 5-byte probe write to the new stdout goes through
+ * completely; otherwise the next attempt follows immediately. Panics if
+ * every attempt fails.
+ */
+static void ensure_console(void)
+{
+ for (unsigned int i = 0; i < 1000; ++i) {
+ int fd = open("/dev/console", O_RDWR);
+ if (fd < 0) {
+ usleep(50000);
+ continue;
+ }
+ dup2(fd, 0);
+ dup2(fd, 1);
+ dup2(fd, 2);
+ close(fd);
+ /* Probe: four NUL bytes and a newline. */
+ if (write(1, "\0\0\0\0\n", 5) == 5)
+ return;
+ }
+ panic("Unable to open console device");
+}
+
+/*
+ * Reset the kernel memory leak detector before the tests start, so that only
+ * objects allocated from now on are reported later. Does nothing when the
+ * kmemleak debugfs file cannot be opened. The write is best-effort: its
+ * result is not checked.
+ */
+static void clear_leaks(void)
+{
+	int fd;
+
+	fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
+	if (fd < 0)
+		return;
+	pretty_message("[+] Starting memory leak detection...");
+	/* Send the whole 6-byte command, trailing newline included. */
+	write(fd, "clear\n", 6);
+	close(fd);
+}
+
+/*
+ * After the tests, trigger a kmemleak scan and fail the run if it reports
+ * anything. Does nothing when the kmemleak debugfs file cannot be opened.
+ * The report is copied straight to stdout with sendfile(); any non-empty
+ * report (a positive byte count) is treated as a fatal error.
+ */
+static void check_leaks(void)
+{
+ int fd;
+
+ fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
+ if (fd < 0)
+ return;
+ pretty_message("[+] Scanning for memory leaks...");
+ sleep(2); /* Wait for any grace periods. */
+ /* Best-effort: the result of the write is not checked. */
+ write(fd, "scan\n", 5);
+ close(fd);
+
+ /* Reopen for reading to fetch the scan report. */
+ fd = open("/sys/kernel/debug/kmemleak", O_RDONLY);
+ if (fd < 0)
+ return;
+ /* 0x7ffff000 is presumably the largest count one sendfile() call accepts -- confirm. */
+ if (sendfile(1, fd, NULL, 0x7ffff000) > 0)
+ panic("Memory leaks encountered");
+ close(fd);
+}
+
+/*
+ * Entry point of the test VM's init process. Runs the setup steps in a fixed
+ * order (RNG seeding, console, banner, filesystems, module self-test report,
+ * logging, leak-detector reset), launches the test script, checks for memory
+ * leaks, and powers off. poweroff() does not return, so the final return
+ * statement is not expected to be reached.
+ */
+int main(int argc, char *argv[])
+{
+ seed_rng();
+ ensure_console();
+ print_banner();
+ mount_filesystems();
+ kmod_selftests();
+ enable_logging();
+ clear_leaks();
+ launch_tests();
+ check_leaks();
+ poweroff();
+ return 1;
+}
diff --git a/tools/testing/selftests/wireguard/qemu/kernel.config b/tools/testing/selftests/wireguard/qemu/kernel.config
new file mode 100644
index 000000000..a9b5a520a
--- /dev/null
+++ b/tools/testing/selftests/wireguard/qemu/kernel.config
@@ -0,0 +1,89 @@
+CONFIG_LOCALVERSION=""
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NET_CORE=y
+CONFIG_NET_IPIP=y
+CONFIG_DUMMY=y
+CONFIG_VETH=y
+CONFIG_MULTIUSER=y
+CONFIG_NAMESPACES=y
+CONFIG_NET_NS=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IPV6=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_NAT=y
+CONFIG_NETFILTER_XTABLES=y
+CONFIG_NETFILTER_XT_NAT=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MARK=y
+CONFIG_NF_NAT_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_TTY=y
+CONFIG_BINFMT_ELF=y
+CONFIG_BINFMT_SCRIPT=y
+CONFIG_VDSO=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_HYPERVISOR_GUEST=y
+CONFIG_PARAVIRT=y
+CONFIG_KVM_GUEST=y
+CONFIG_PARAVIRT_SPINLOCKS=y
+CONFIG_PRINTK=y
+CONFIG_KALLSYMS=y
+CONFIG_BUG=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
+CONFIG_JUMP_LABEL=y
+CONFIG_EMBEDDED=n
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_SHMEM=y
+CONFIG_SLUB=y
+CONFIG_SPARSEMEM_VMEMMAP=y
+CONFIG_SMP=y
+CONFIG_SCHED_SMT=y
+CONFIG_SCHED_MC=y
+CONFIG_NUMA=y
+CONFIG_PREEMPT=y
+CONFIG_NO_HZ=y
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_ARCH_RANDOM=y
+CONFIG_FILE_LOCKING=y
+CONFIG_POSIX_TIMERS=y
+CONFIG_DEVTMPFS=y
+CONFIG_PROC_FS=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_CONSOLE_LOGLEVEL_DEFAULT=15
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_PRINTK_TIME=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_LEGACY_VSYSCALL_NONE=y
+CONFIG_KERNEL_GZIP=y
+CONFIG_PANIC_ON_OOPS=y
+CONFIG_BUG_ON_DATA_CORRUPTION=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
+CONFIG_WQ_WATCHDOG=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_BOOTPARAM_HARDLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC=y
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC=y
+CONFIG_PANIC_TIMEOUT=-1
+CONFIG_STACKTRACE=y
+CONFIG_EARLY_PRINTK=y
+CONFIG_GDB_SCRIPTS=y
+CONFIG_WIREGUARD=y
+CONFIG_WIREGUARD_DEBUG=y
diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore
new file mode 100644
index 000000000..1aaef5bf1
--- /dev/null
+++ b/tools/testing/selftests/x86/.gitignore
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+*_32
+*_64
+single_step_syscall
+sysret_ss_attrs
+syscall_nt
+ptrace_syscall
+test_mremap_vdso
+check_initial_reg_state
+sigreturn
+ldt_gdt
+iopl
+mpx-mini-test
+ioperm
+test_vdso
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
new file mode 100644
index 000000000..f1b675a40
--- /dev/null
+++ b/tools/testing/selftests/x86/Makefile
@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+include ../lib.mk
+
+.PHONY: all all_32 all_64 warn_32bit_failure clean
+
+UNAME_M := $(shell uname -m)
+CAN_BUILD_I386 := $(shell ./check_cc.sh "$(CC)" trivial_32bit_program.c -m32)
+CAN_BUILD_X86_64 := $(shell ./check_cc.sh "$(CC)" trivial_64bit_program.c)
+CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh "$(CC)" trivial_program.c -no-pie)
+
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
+ check_initial_reg_state sigreturn iopl ioperm \
+ test_vdso test_vsyscall mov_ss_trap \
+ syscall_arg_fault fsgsbase_restore
+TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
+ test_FCMOV test_FCOMI test_FISTTP \
+ vdso_restorer
+TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip syscall_numbering
+# Some selftests require 32bit support enabled also on 64bit systems
+TARGETS_C_32BIT_NEEDED := ldt_gdt ptrace_syscall
+
+TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) $(TARGETS_C_32BIT_NEEDED)
+TARGETS_C_64BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_64BIT_ONLY)
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),11)
+TARGETS_C_64BIT_ALL += $(TARGETS_C_32BIT_NEEDED)
+endif
+
+BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
+BINARIES_64 := $(TARGETS_C_64BIT_ALL:%=%_64)
+
+BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
+BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
+
+CFLAGS := -O2 -g -std=gnu99 -pthread -Wall
+
+# call32_from_64 in thunks.S uses absolute addresses.
+ifeq ($(CAN_BUILD_WITH_NOPIE),1)
+CFLAGS += -no-pie
+endif
+
+define gen-target-rule-32
+$(1) $(1)_32: $(OUTPUT)/$(1)_32
+.PHONY: $(1) $(1)_32
+endef
+
+define gen-target-rule-64
+$(1) $(1)_64: $(OUTPUT)/$(1)_64
+.PHONY: $(1) $(1)_64
+endef
+
+ifeq ($(CAN_BUILD_I386),1)
+all: all_32
+TEST_PROGS += $(BINARIES_32)
+EXTRA_CFLAGS += -DCAN_BUILD_32
+$(foreach t,$(TARGETS_C_32BIT_ALL),$(eval $(call gen-target-rule-32,$(t))))
+endif
+
+ifeq ($(CAN_BUILD_X86_64),1)
+all: all_64
+TEST_PROGS += $(BINARIES_64)
+EXTRA_CFLAGS += -DCAN_BUILD_64
+$(foreach t,$(TARGETS_C_64BIT_ALL),$(eval $(call gen-target-rule-64,$(t))))
+endif
+
+all_32: $(BINARIES_32)
+
+all_64: $(BINARIES_64)
+
+EXTRA_CLEAN := $(BINARIES_32) $(BINARIES_64)
+
+$(BINARIES_32): $(OUTPUT)/%_32: %.c helpers.h
+ $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl -lm
+
+$(BINARIES_64): $(OUTPUT)/%_64: %.c helpers.h
+ $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl
+
+# x86_64 users should be encouraged to install 32-bit libraries
+ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01)
+all: warn_32bit_failure
+
+# Tell the user how to install 32-bit build support. The text goes to stderr
+# and the recipe always exits 0, so the warning never fails the build.
+warn_32bit_failure:
+	@echo "Warning: you seem to have a broken 32-bit build" >&2; \
+	echo "environment. This will reduce test coverage of 64-bit" >&2; \
+	echo "kernels. If you are using a Debian-like distribution," >&2; \
+	echo "try:" >&2; \
+	echo "" >&2; \
+	echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386" >&2; \
+	echo "" >&2; \
+	echo "If you are using a Fedora-like distribution, try:" >&2; \
+	echo "" >&2; \
+	echo " yum install glibc-devel.*i686" >&2; \
+	exit 0;
+endif
+
+# Some tests have additional dependencies.
+$(OUTPUT)/sysret_ss_attrs_64: thunks.S
+$(OUTPUT)/ptrace_syscall_32: raw_syscall_helper_32.S
+$(OUTPUT)/test_syscall_vdso_32: thunks_32.S
+
+# check_initial_reg_state is special: it needs a custom entry, and it
+# needs to be static so that its interpreter doesn't destroy its initial
+# state.
+$(OUTPUT)/check_initial_reg_state_32: CFLAGS += -Wl,-ereal_start -static
+$(OUTPUT)/check_initial_reg_state_64: CFLAGS += -Wl,-ereal_start -static
diff --git a/tools/testing/selftests/x86/check_cc.sh b/tools/testing/selftests/x86/check_cc.sh
new file mode 100755
index 000000000..8c669c0d6
--- /dev/null
+++ b/tools/testing/selftests/x86/check_cc.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+# check_cc.sh - Helper to test userspace compilation support
+# Copyright (c) 2015 Andrew Lutomirski
+#
+# Usage: check_cc.sh COMPILER TESTPROG [EXTRA_FLAGS...]
+#
+# Prints 1 if COMPILER builds TESTPROG with -O0 plus EXTRA_FLAGS, and 0
+# otherwise (including when COMPILER is empty). The compiler output is
+# discarded and the script always exits 0.
+
+CC="$1"
+TESTPROG="$2"
+shift 2
+
+# $CC is left unquoted, so a compiler command made of several words is split
+# into separate arguments.
+if [ -n "$CC" ] && $CC -o /dev/null "$TESTPROG" -O0 "$@" 2>/dev/null; then
+ echo 1
+else
+ echo 0
+fi
+
+exit 0
diff --git a/tools/testing/selftests/x86/check_initial_reg_state.c b/tools/testing/selftests/x86/check_initial_reg_state.c
new file mode 100644
index 000000000..3bc95f3ed
--- /dev/null
+++ b/tools/testing/selftests/x86/check_initial_reg_state.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * check_initial_reg_state.c - check that execve sets the correct state
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+
+unsigned long ax, bx, cx, dx, si, di, bp, sp, flags;
+unsigned long r8, r9, r10, r11, r12, r13, r14, r15;
+
+/*
+ * real_start: stores every general-purpose register and the flags register
+ * into the global variables above, then jumps to _start. The stores come
+ * first so that the values are captured before any other code of this
+ * program has run.
+ */
+asm (
+ ".pushsection .text\n\t"
+ ".type real_start, @function\n\t"
+ ".global real_start\n\t"
+ "real_start:\n\t"
+#ifdef __x86_64__
+ "mov %rax, ax\n\t"
+ "mov %rbx, bx\n\t"
+ "mov %rcx, cx\n\t"
+ "mov %rdx, dx\n\t"
+ "mov %rsi, si\n\t"
+ "mov %rdi, di\n\t"
+ "mov %rbp, bp\n\t"
+ "mov %rsp, sp\n\t"
+ "mov %r8, r8\n\t"
+ "mov %r9, r9\n\t"
+ "mov %r10, r10\n\t"
+ "mov %r11, r11\n\t"
+ "mov %r12, r12\n\t"
+ "mov %r13, r13\n\t"
+ "mov %r14, r14\n\t"
+ "mov %r15, r15\n\t"
+ /* Flags are read through the stack: push them, pop into the variable. */
+ "pushfq\n\t"
+ "popq flags\n\t"
+#else
+ "mov %eax, ax\n\t"
+ "mov %ebx, bx\n\t"
+ "mov %ecx, cx\n\t"
+ "mov %edx, dx\n\t"
+ "mov %esi, si\n\t"
+ "mov %edi, di\n\t"
+ "mov %ebp, bp\n\t"
+ "mov %esp, sp\n\t"
+ /* Flags are read through the stack: push them, pop into the variable. */
+ "pushfl\n\t"
+ "popl flags\n\t"
+#endif
+ /* Hand over to the regular startup code. */
+ "jmp _start\n\t"
+ ".size real_start, . - real_start\n\t"
+ ".popsection");
+
+/* Print one saved register as "name = 0xvalue" on its own indented line. */
+#define SHOW(x) printf("\t" #x " = 0x%lx\n", x);
+
+/*
+ * Check the register values saved by real_start: every general-purpose
+ * register except SP must be 0 and the flags must be 0x202.
+ *
+ * Returns 1 when a check fails or when SP was recorded as 0 (real_start
+ * did not run), and 0 otherwise.
+ */
+int main()
+{
+	unsigned long nonzero_gprs;
+	int nerrs = 0;
+
+	if (!sp) {
+		printf("[FAIL]\tTest was built incorrectly\n");
+		return 1;
+	}
+
+	/* OR everything together: non-zero iff some register is non-zero. */
+	nonzero_gprs = ax | bx | cx | dx | si | di | bp;
+#ifdef __x86_64__
+	nonzero_gprs |= r8 | r9 | r10 | r11 | r12 | r13 | r14 | r15;
+#endif
+
+	if (!nonzero_gprs) {
+		printf("[OK]\tAll GPRs except SP are 0\n");
+	} else {
+		printf("[FAIL]\tAll GPRs except SP should be 0\n");
+		SHOW(ax);
+		SHOW(bx);
+		SHOW(cx);
+		SHOW(dx);
+		SHOW(si);
+		SHOW(di);
+		SHOW(bp);
+		SHOW(sp);
+#ifdef __x86_64__
+		SHOW(r8);
+		SHOW(r9);
+		SHOW(r10);
+		SHOW(r11);
+		SHOW(r12);
+		SHOW(r13);
+		SHOW(r14);
+		SHOW(r15);
+#endif
+		nerrs++;
+	}
+
+	if (flags == 0x202) {
+		printf("[OK]\tFLAGS is 0x202\n");
+	} else {
+		printf("[FAIL]\tFLAGS is 0x%lx, but it should be 0x202\n", flags);
+		nerrs++;
+	}
+
+	return nerrs != 0;
+}
diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
new file mode 100644
index 000000000..d1e919b0c
--- /dev/null
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -0,0 +1,348 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * entry_from_vm86.c - tests kernel entries from vm86 mode
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This exercises a few paths that need to special-case vm86 mode.
+ */
+
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <sys/vm86.h>
+
+static unsigned long load_addr = 0x10000;
+static int nerrs = 0;
+
+/*
+ * Install @handler as the SA_SIGINFO-style handler for @sig, with @flags
+ * added to sa_flags and an empty signal mask. Exits through err() if
+ * sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act = {
+		.sa_sigaction = handler,
+		.sa_flags = SA_SIGINFO | flags,
+	};
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Put @sig back to its default disposition. Exits through err() if
+ * sigaction() fails.
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction act = { .sa_handler = SIG_DFL };
+
+	sigemptyset(&act.sa_mask);
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static sig_atomic_t got_signal;
+
+/*
+ * Signal handler used while running vm86 code.
+ *
+ * Counts an error if the saved context still shows the VM flag or a CS
+ * whose low two bits are not 3, prints the saved FLAGS and CS, and sets
+ * got_signal.
+ */
+static void sighandler(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+	const greg_t *regs = uc->uc_mcontext.gregs;
+	const char *signame;
+
+	if (regs[REG_EFL] & X86_EFLAGS_VM || (regs[REG_CS] & 3) != 3) {
+		printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
+		nerrs++;
+	}
+
+	switch (sig) {
+	case SIGSEGV:
+		signame = "SIGSEGV";
+		break;
+	case SIGILL:
+		signame = "SIGILL";
+		break;
+	default:
+		signame = "unexpected signal";
+		break;
+	}
+
+	printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
+	       (unsigned long)regs[REG_EFL],
+	       (unsigned short)regs[REG_CS]);
+
+	got_signal = 1;
+}
+
+/*
+ * 16-bit code snippets executed in vm86 mode. main() copies the bytes from
+ * vmcode to end_vmcode to load_addr, and each vmcode_* label is the entry
+ * point of one test, passed to do_test() as an offset from vmcode.
+ * Addresses 2048 and up in the same segment are used as data by the code.
+ */
+asm (
+ ".pushsection .rodata\n\t"
+ ".type vmcode_bound, @object\n\t"
+ "vmcode:\n\t"
+ "vmcode_bound:\n\t"
+ ".code16\n\t"
+ "bound %ax, (2048)\n\t"
+ "int3\n\t"
+ "vmcode_sysenter:\n\t"
+ "sysenter\n\t"
+ "vmcode_syscall:\n\t"
+ "syscall\n\t"
+ "vmcode_sti:\n\t"
+ "sti\n\t"
+ "vmcode_int3:\n\t"
+ "int3\n\t"
+ "vmcode_int80:\n\t"
+ "int $0x80\n\t"
+ "vmcode_popf_hlt:\n\t"
+ "push %ax\n\t"
+ "popf\n\t"
+ "hlt\n\t"
+ "vmcode_umip:\n\t"
+ /* addressing via displacements */
+ "smsw (2052)\n\t"
+ "sidt (2054)\n\t"
+ "sgdt (2060)\n\t"
+ /* addressing via registers */
+ "mov $2066, %bx\n\t"
+ "smsw (%bx)\n\t"
+ "mov $2068, %bx\n\t"
+ "sidt (%bx)\n\t"
+ "mov $2074, %bx\n\t"
+ "sgdt (%bx)\n\t"
+ /* register operands, only for smsw */
+ "smsw %ax\n\t"
+ "mov %ax, (2080)\n\t"
+ "int3\n\t"
+ "vmcode_umip_str:\n\t"
+ "str %eax\n\t"
+ "vmcode_umip_sldt:\n\t"
+ "sldt %eax\n\t"
+ "int3\n\t"
+ ".size vmcode, . - vmcode\n\t"
+ "end_vmcode:\n\t"
+ ".code32\n\t"
+ ".popsection"
+ );
+
+extern unsigned char vmcode[], end_vmcode[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
+ vmcode_sti[], vmcode_int3[], vmcode_int80[], vmcode_popf_hlt[],
+ vmcode_umip[], vmcode_umip_str[], vmcode_umip_sldt[];
+
+/* Returns false if the test was skipped. */
+/*
+ * Enter vm86 mode at @eip and check why the kernel came back.
+ *
+ * @v86: vm86 state; regs.eip is overwritten with @eip
+ * @eip: entry offset within the vm86 code segment
+ * @rettype: expected VM86_TYPE() of the return value, or -1 to accept any
+ * @retarg: expected VM86_ARG() of the return value
+ * @text: test name used in the output
+ *
+ * An unexpected return reason is reported and counted in nerrs.
+ *
+ * Returns false if the test was skipped because vm86() failed with ENOSYS
+ * or EPERM, true otherwise.
+ */
+static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
+		    unsigned int rettype, unsigned int retarg,
+		    const char *text)
+{
+	long ret;
+
+	printf("[RUN]\t%s from vm86 mode\n", text);
+	v86->regs.eip = eip;
+	ret = vm86(VM86_ENTER, v86);
+
+	if (ret == -1 && (errno == ENOSYS || errno == EPERM)) {
+		const char *reason =
+			errno == ENOSYS ? "not supported" : "not allowed";
+
+		printf("[SKIP]\tvm86 %s\n", reason);
+		return false;
+	}
+
+	switch (VM86_TYPE(ret)) {
+	case VM86_INTx: {
+		char numbuf[32];
+		const char *trapname = numbuf;
+		int trapno = VM86_ARG(ret);
+
+		/* Known exception vectors get a mnemonic, others a number. */
+		switch (trapno) {
+		case 13:
+			trapname = "GP";
+			break;
+		case 5:
+			trapname = "BR";
+			break;
+		case 14:
+			trapname = "PF";
+			break;
+		default:
+			sprintf(numbuf, "%d", trapno);
+			break;
+		}
+
+		printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
+		break;
+	}
+	case VM86_UNKNOWN:
+		printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
+		break;
+	case VM86_TRAP:
+		printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
+		       VM86_ARG(ret));
+		break;
+	case VM86_SIGNAL:
+		printf("[INFO]\tExited vm86 mode due to a signal\n");
+		break;
+	case VM86_STI:
+		printf("[INFO]\tExited vm86 mode due to STI\n");
+		break;
+	default:
+		printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
+		       VM86_TYPE(ret), VM86_ARG(ret));
+		break;
+	}
+
+	if (rettype != -1 &&
+	    (VM86_TYPE(ret) != rettype || VM86_ARG(ret) != retarg)) {
+		printf("[FAIL]\tIncorrect return reason (started at eip = 0x%lx, ended at eip = 0x%lx)\n", eip, v86->regs.eip);
+		nerrs++;
+	} else {
+		printf("[OK]\tReturned correctly\n");
+	}
+
+	return true;
+}
+
+/*
+ * Run SMSW, SIDT, SGDT, STR and SLDT from vm86 mode and check the results.
+ *
+ * @vm86: vm86 state handed to do_test() for every run
+ * @test_mem: start of the page holding vmcode; the vmcode_umip snippet
+ * stores its results at offsets 2052 to 2081 of that page
+ *
+ * SMSW is executed three ways (displacement, register-indirect and register
+ * operand) and SIDT/SGDT two ways each; every variant of one instruction
+ * must produce the same value. Each mismatch is reported and counted in
+ * nerrs so that it is reflected in the exit status. STR and SLDT are
+ * expected to leave vm86 mode through a signal.
+ */
+void do_umip_tests(struct vm86plus_struct *vm86, unsigned char *test_mem)
+{
+	struct table_desc {
+		unsigned short limit;
+		unsigned long base;
+	} __attribute__((packed));
+
+	/* Initialize variables with arbitrary values */
+	struct table_desc gdt1 = { .base = 0x3c3c3c3c, .limit = 0x9999 };
+	struct table_desc gdt2 = { .base = 0x1a1a1a1a, .limit = 0xaeae };
+	struct table_desc idt1 = { .base = 0x7b7b7b7b, .limit = 0xf1f1 };
+	struct table_desc idt2 = { .base = 0x89898989, .limit = 0x1313 };
+	unsigned short msw1 = 0x1414, msw2 = 0x2525, msw3 = 0x3737;
+
+	/* UMIP -- exit with INT3 unless kernel emulation did not trap #GP */
+	do_test(vm86, vmcode_umip - vmcode, VM86_TRAP, 3, "UMIP tests");
+
+	/* Results from displacement-only addressing */
+	msw1 = *(unsigned short *)(test_mem + 2052);
+	memcpy(&idt1, test_mem + 2054, sizeof(idt1));
+	memcpy(&gdt1, test_mem + 2060, sizeof(gdt1));
+
+	/* Results from register-indirect addressing */
+	msw2 = *(unsigned short *)(test_mem + 2066);
+	memcpy(&idt2, test_mem + 2068, sizeof(idt2));
+	memcpy(&gdt2, test_mem + 2074, sizeof(gdt2));
+
+	/* Results when using register operands */
+	msw3 = *(unsigned short *)(test_mem + 2080);
+
+	printf("[INFO]\tResult from SMSW:[0x%04x]\n", msw1);
+	printf("[INFO]\tResult from SIDT: limit[0x%04x]base[0x%08lx]\n",
+	       idt1.limit, idt1.base);
+	printf("[INFO]\tResult from SGDT: limit[0x%04x]base[0x%08lx]\n",
+	       gdt1.limit, gdt1.base);
+
+	if (msw1 != msw2 || msw1 != msw3) {
+		printf("[FAIL]\tAll the results of SMSW should be the same.\n");
+		nerrs++;
+	} else {
+		printf("[PASS]\tAll the results from SMSW are identical.\n");
+	}
+
+	if (memcmp(&gdt1, &gdt2, sizeof(gdt1))) {
+		printf("[FAIL]\tAll the results of SGDT should be the same.\n");
+		nerrs++;
+	} else {
+		printf("[PASS]\tAll the results from SGDT are identical.\n");
+	}
+
+	if (memcmp(&idt1, &idt2, sizeof(idt1))) {
+		printf("[FAIL]\tAll the results of SIDT should be the same.\n");
+		nerrs++;
+	} else {
+		printf("[PASS]\tAll the results from SIDT are identical.\n");
+	}
+
+	sethandler(SIGILL, sighandler, 0);
+	do_test(vm86, vmcode_umip_str - vmcode, VM86_SIGNAL, 0,
+		"STR instruction");
+	clearhandler(SIGILL);
+
+	sethandler(SIGILL, sighandler, 0);
+	do_test(vm86, vmcode_umip_sldt - vmcode, VM86_SIGNAL, 0,
+		"SLDT instruction");
+	clearhandler(SIGILL);
+}
+
+/*
+ * Map one page at load_addr, copy vmcode into it and run every vm86 entry
+ * test in turn.
+ *
+ * Returns 0 if no test recorded an error in nerrs, 1 otherwise. The child
+ * created by the final fork() returns 0 immediately.
+ */
+int main(void)
+{
+ struct vm86plus_struct v86;
+ unsigned char *addr = mmap((void *)load_addr, 4096,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_PRIVATE, -1,0);
+ /* No MAP_FIXED: any other address, including MAP_FAILED, is an error. */
+ if (addr != (unsigned char *)load_addr)
+ err(1, "mmap");
+
+ memcpy(addr, vmcode, end_vmcode - vmcode);
+ /* Data at 2048/2050: the memory operand of "bound %ax, (2048)". */
+ addr[2048] = 2;
+ addr[2050] = 3;
+
+ memset(&v86, 0, sizeof(v86));
+
+ /* All segments point at the mapped page (real-mode segment = addr / 16). */
+ v86.regs.cs = load_addr / 16;
+ v86.regs.ss = load_addr / 16;
+ v86.regs.ds = load_addr / 16;
+ v86.regs.es = load_addr / 16;
+
+ /* Use the end of the page as our stack. */
+ v86.regs.esp = 4096;
+
+ assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */
+
+ /* #BR -- expected to leave vm86 mode as INTx 5 */
+ do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
+
+ /*
+ * SYSENTER -- should cause #GP or #UD depending on CPU.
+ * Expected return type -1 means that we shouldn't validate
+ * the vm86 return value. This will avoid problems on non-SEP
+ * CPUs.
+ */
+ sethandler(SIGILL, sighandler, 0);
+ do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
+ clearhandler(SIGILL);
+
+ /*
+ * SYSCALL would be a disaster in VM86 mode. Fortunately,
+ * there is no kernel that both enables SYSCALL and sets
+ * EFER.SCE, so it's #UD on all systems. But vm86 is
+ * buggy (or has a "feature"), so the SIGILL will actually
+ * be delivered.
+ */
+ sethandler(SIGILL, sighandler, 0);
+ do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
+ clearhandler(SIGILL);
+
+ /* STI with VIP set */
+ v86.regs.eflags |= X86_EFLAGS_VIP;
+ v86.regs.eflags &= ~X86_EFLAGS_IF;
+ do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
+
+ /* POPF with VIP set but IF clear: should not trap */
+ v86.regs.eflags = X86_EFLAGS_VIP;
+ v86.regs.eax = 0;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_UNKNOWN, 0, "POPF with VIP set and IF clear");
+
+ /* POPF with VIP set and IF set: should trap */
+ v86.regs.eflags = X86_EFLAGS_VIP;
+ v86.regs.eax = X86_EFLAGS_IF;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_STI, 0, "POPF with VIP and IF set");
+
+ /* POPF with VIP clear and IF set: should not trap */
+ v86.regs.eflags = 0;
+ v86.regs.eax = X86_EFLAGS_IF;
+ do_test(&v86, vmcode_popf_hlt - vmcode, VM86_UNKNOWN, 0, "POPF with VIP clear and IF set");
+
+ v86.regs.eflags = 0;
+
+ /* INT3 -- should cause #BP */
+ do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
+
+ /* INT80 -- should exit with "INTx 0x80" */
+ v86.regs.eax = (unsigned int)-1;
+ do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
+
+ /* UMIP -- the expected exits are checked inside do_umip_tests() */
+ do_umip_tests(&v86, addr);
+
+ /* Execute a null pointer */
+ v86.regs.cs = 0;
+ v86.regs.ss = 0;
+ sethandler(SIGSEGV, sighandler, 0);
+ got_signal = 0;
+ /* Only a test that actually ran (was not skipped) must see the signal. */
+ if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
+ !got_signal) {
+ printf("[FAIL]\tDid not receive SIGSEGV\n");
+ nerrs++;
+ }
+ clearhandler(SIGSEGV);
+
+ /* Make sure nothing explodes if we fork. */
+ if (fork() == 0)
+ return 0;
+
+ return (nerrs == 0 ? 0 : 1);
+}
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
new file mode 100644
index 000000000..7161cfc2e
--- /dev/null
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -0,0 +1,678 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fsgsbase.c, an fsgsbase test
+ * Copyright (c) 2014-2016 Andy Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <err.h>
+#include <sys/user.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+#include <signal.h>
+#include <limits.h>
+#include <sys/ucontext.h>
+#include <sched.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <asm/ldt.h>
+#include <sys/mman.h>
+#include <stddef.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <setjmp.h>
+
+#ifndef __x86_64__
+# error This test is 64-bit only
+#endif
+
+static volatile sig_atomic_t want_segv;
+static volatile unsigned long segv_addr;
+
+static unsigned short *shared_scratch;
+
+static int nerrs;
+
+/*
+ * Register @handler for @sig as a three-argument (SA_SIGINFO) handler.
+ * @flags is OR-ed into sa_flags and no extra signals are blocked while the
+ * handler runs. A sigaction() failure terminates the program via err().
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction action = {
+		.sa_flags = SA_SIGINFO | flags,
+		.sa_sigaction = handler,
+	};
+
+	sigemptyset(&action.sa_mask);
+
+	if (sigaction(sig, &action, NULL) == 0)
+		return;
+
+	err(1, "sigaction");
+}
+
+/*
+ * Restore the default action for @sig. A sigaction() failure terminates
+ * the program via err().
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction action = { .sa_handler = SIG_DFL };
+
+	sigemptyset(&action.sa_mask);
+
+	if (sigaction(sig, &action, NULL) == 0)
+		return;
+
+	err(1, "sigaction");
+}
+
+/*
+ * SIGSEGV handler used by read_base().
+ *
+ * When a fault is expected (want_segv set), clears want_segv, records the
+ * faulting address in segv_addr and advances the saved RIP by 4 bytes.
+ * Otherwise it removes itself and returns, leaving the default action in
+ * place.
+ */
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+
+	if (want_segv) {
+		want_segv = false;
+		segv_addr = (unsigned long)si->si_addr;
+
+		/* Skip the faulting mov */
+		uc->uc_mcontext.gregs[REG_RIP] += 4;
+		return;
+	}
+
+	/* Crash cleanly. */
+	clearhandler(SIGSEGV);
+}
+
+static jmp_buf jmpbuf;
+
+/*
+ * SIGILL handler: jumps back to the point saved in jmpbuf, delivering the
+ * value 1 there. All three arguments are unused.
+ * NOTE(review): presumably paired with a sigsetjmp(jmpbuf, ...) that is
+ * outside this view -- confirm.
+ */
+static void sigill(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+static bool have_fsgsbase;
+
+/* Return the value read by the RDGSBASE instruction. */
+static inline unsigned long rdgsbase(void)
+{
+ unsigned long gsbase;
+
+ asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+
+ return gsbase;
+}
+
+/* Return the value read by the RDFSBASE instruction. */
+static inline unsigned long rdfsbase(void)
+{
+ unsigned long fsbase;
+
+ asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+
+ return fsbase;
+}
+
+/* Execute the WRGSBASE instruction with @gsbase as its operand. */
+static inline void wrgsbase(unsigned long gsbase)
+{
+ asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
+}
+
+/* Execute the WRFSBASE instruction with @fsbase as its operand. */
+static inline void wrfsbase(unsigned long fsbase)
+{
+ asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
+}
+
+enum which_base { FS, GS };
+
+/*
+ * Work out the FS or GS base (selected by @which) by provoking a fault.
+ *
+ * A load through the segment is attempted at offset 0 and, if that does
+ * not fault, at offset 2^63. The sigsegv() handler records the faulting
+ * address in segv_addr and clears want_segv, so a cleared want_segv after
+ * a load means that load faulted.
+ *
+ * Returns segv_addr + offset for the load that faulted; calls abort() if
+ * neither load faulted.
+ */
+static unsigned long read_base(enum which_base which)
+{
+ unsigned long offset;
+ /*
+ * Unless we have FSGSBASE, there's no direct way to do this from
+ * user mode. We can get at it indirectly using signals, though.
+ */
+
+ want_segv = true;
+
+ offset = 0;
+ if (which == FS) {
+ /* Use a constant-length instruction here. */
+ asm volatile ("mov %%fs:(%%rcx), %%rax" : : "c" (offset) : "rax");
+ } else {
+ asm volatile ("mov %%gs:(%%rcx), %%rax" : : "c" (offset) : "rax");
+ }
+ if (!want_segv)
+ return segv_addr + offset;
+
+ /*
+ * If that didn't segfault, try the other end of the address space.
+ * Unless we get really unlucky and run into the vsyscall page, this
+ * is guaranteed to segfault.
+ */
+
+ offset = (ULONG_MAX >> 1) + 1;
+ if (which == FS) {
+ asm volatile ("mov %%fs:(%%rcx), %%rax"
+ : : "c" (offset) : "rax");
+ } else {
+ asm volatile ("mov %%gs:(%%rcx), %%rax"
+ : : "c" (offset) : "rax");
+ }
+ if (!want_segv)
+ return segv_addr + offset;
+
+ abort();
+}
+
+/*
+ * Set the GS base to @value with ARCH_SET_GS and verify it twice: once
+ * through read_base(GS) and once through ARCH_GET_GS. Each mismatch is
+ * reported and counted in nerrs; a failing arch_prctl call exits via err().
+ */
+static void check_gs_value(unsigned long value)
+{
+ unsigned long base;
+ unsigned short sel;
+
+ printf("[RUN]\tARCH_SET_GS to 0x%lx\n", value);
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, value) != 0)
+ err(1, "ARCH_SET_GS");
+
+ /* Capture the selector for the messages below. */
+ asm volatile ("mov %%gs, %0" : "=rm" (sel));
+ base = read_base(GS);
+ if (base == value) {
+ printf("[OK]\tGSBASE was set as expected (selector 0x%hx)\n",
+ sel);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE was not as expected: got 0x%lx (selector 0x%hx)\n",
+ base, sel);
+ }
+
+ if (syscall(SYS_arch_prctl, ARCH_GET_GS, &base) != 0)
+ err(1, "ARCH_GET_GS");
+ if (base == value) {
+ printf("[OK]\tARCH_GET_GS worked as expected (selector 0x%hx)\n",
+ sel);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tARCH_GET_GS was not as expected: got 0x%lx (selector 0x%hx)\n",
+ base, sel);
+ }
+}
+
+/*
+ * Set the GS base to @initial_base, optionally sleep briefly (@schedule),
+ * then load 0 into %gs and check that the base observed by read_base(GS)
+ * equals the one reported by ARCH_GET_GS. A mismatch is reported and
+ * counted in nerrs.
+ */
+static void mov_0_gs(unsigned long initial_base, bool schedule)
+{
+ unsigned long base, arch_base;
+
+ printf("[RUN]\tARCH_SET_GS to 0x%lx then mov 0 to %%gs%s\n", initial_base, schedule ? " and schedule " : "");
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, initial_base) != 0)
+ err(1, "ARCH_SET_GS");
+
+ if (schedule)
+ usleep(10);
+
+ asm volatile ("mov %0, %%gs" : : "rm" (0));
+ base = read_base(GS);
+ if (syscall(SYS_arch_prctl, ARCH_GET_GS, &arch_base) != 0)
+ err(1, "ARCH_GET_GS");
+ if (base == arch_base) {
+ printf("[OK]\tGSBASE is 0x%lx\n", base);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to 0x%lx but kernel reports 0x%lx\n", base, arch_base);
+ }
+}
+
+static volatile unsigned long remote_base;
+static volatile bool remote_hard_zero;
+static volatile unsigned int ftx;
+
+/*
+ * ARCH_SET_FS/GS(0) may or may not program a selector of zero. HARD_ZERO
+ * means to force the selector to zero to improve test coverage.
+ */
+#define HARD_ZERO 0xa1fa5f343cb85fa4
+
+/*
+ * Apply the GS base requested in remote_base to the calling thread.
+ *
+ * The special value HARD_ZERO means: set the base to 0 and additionally
+ * load 0 into %gs. The resulting selector is printed.
+ */
+static void do_remote_base()
+{
+ unsigned long to_set = remote_base;
+ bool hard_zero = false;
+ if (to_set == HARD_ZERO) {
+ to_set = 0;
+ hard_zero = true;
+ }
+
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, to_set) != 0)
+ err(1, "ARCH_SET_GS");
+
+ if (hard_zero)
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ unsigned short sel;
+ asm volatile ("mov %%gs, %0" : "=rm" (sel));
+ printf("\tother thread: ARCH_SET_GS(0x%lx)%s -- sel is 0x%hx\n",
+ to_set, hard_zero ? " and clear gs" : "", sel);
+}
+
+static __thread int set_thread_area_entry_number = -1;
+
+/*
+ * Load %gs with a selector for a freshly created segment whose base is
+ * 0xBAADF00D, after telling the kernel that GSBASE is 0.
+ *
+ * The segment is created in LDT slot 0 through modify_ldt(); if that
+ * fails, the 32-bit set_thread_area syscall is used instead and the GDT
+ * slot it picks is remembered in set_thread_area_entry_number.
+ *
+ * Returns the selector loaded into %gs, or 0 if no segment could be
+ * created. Exits via err() if ARCH_SET_GS or the scratch mmap() fails.
+ */
+static unsigned short load_gs(void)
+{
+	/*
+	 * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think
+	 * that GSBASE == 0 (i.e. thread.gsbase == 0).
+	 */
+
+	/* Step 1: tell the kernel that we have GSBASE == 0. */
+	if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+		err(1, "ARCH_SET_GS");
+
+	/* Step 2: change GSBASE without telling the kernel. */
+	struct user_desc desc = {
+		.entry_number = 0,
+		.base_addr = 0xBAADF00D,
+		.limit = 0xfffff,
+		.seg_32bit = 1,
+		.contents = 0, /* Data, grow-up */
+		.read_exec_only = 0,
+		.limit_in_pages = 1,
+		.seg_not_present = 0,
+		.useable = 0
+	};
+	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
+		printf("\tusing LDT slot 0\n");
+		asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+		return 0x7;
+	} else {
+		/* No modify_ldt for us (configured out, perhaps) */
+
+		/*
+		 * MAP_32BIT: the descriptor pointer is passed in %ebx to a
+		 * 32-bit syscall below.
+		 */
+		struct user_desc *low_desc = mmap(
+			NULL, sizeof(desc),
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+		/* Without this check a failed mmap() is dereferenced below. */
+		if (low_desc == MAP_FAILED)
+			err(1, "mmap");
+		memcpy(low_desc, &desc, sizeof(desc));
+
+		low_desc->entry_number = set_thread_area_entry_number;
+
+		/* 32-bit set_thread_area */
+		long ret;
+		asm volatile ("int $0x80"
+			      : "=a" (ret), "+m" (*low_desc)
+			      : "a" (243), "b" (low_desc)
+			      : "r8", "r9", "r10", "r11");
+		memcpy(&desc, low_desc, sizeof(desc));
+		munmap(low_desc, sizeof(desc));
+
+		if (ret != 0) {
+			printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
+			return 0;
+		}
+		printf("\tusing GDT slot %d\n", desc.entry_number);
+		set_thread_area_entry_number = desc.entry_number;
+
+		unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3);
+		asm volatile ("mov %0, %%gs" : : "rm" (gs));
+		return gs;
+	}
+}
+
+/*
+ * Load @index into %gs and @base into GSBASE (via WRGSBASE), hand control
+ * to the helper thread with command 1 and remote_base 0, and after it
+ * finishes check that both the selector and the base are unchanged. A
+ * change is reported and counted in nerrs.
+ */
+void test_wrbase(unsigned short index, unsigned long base)
+{
+ unsigned short newindex;
+ unsigned long newbase;
+
+ printf("[RUN]\tGS = 0x%hx, GSBASE = 0x%lx\n", index, base);
+
+ asm volatile ("mov %0, %%gs" : : "rm" (index));
+ wrgsbase(base);
+
+ remote_base = 0;
+ /* ftx = 1 is the helper's "do_remote_base" command; it resets ftx to 0. */
+ ftx = 1;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ while (ftx != 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+ asm volatile ("mov %%gs, %0" : "=rm" (newindex));
+ newbase = rdgsbase();
+
+ if (newindex == index && newbase == base) {
+ printf("[OK]\tIndex and base were preserved\n");
+ } else {
+ printf("[FAIL]\tAfter switch, GS = 0x%hx and GSBASE = 0x%lx\n",
+ newindex, newbase);
+ nerrs++;
+ }
+}
+
+/*
+ * Helper thread body. Waits until ftx becomes non-zero and treats its value
+ * as a command:
+ *   1 - run do_remote_base()
+ *   2 - run load_gs(), then load 0 into %gs
+ *   3 - exit the thread (returns NULL)
+ * Any other value terminates the program via errx(). After commands 1 and 2
+ * ftx is reset to 0 and a futex wake is issued. @ctx is unused.
+ */
+static void *threadproc(void *ctx)
+{
+ while (1) {
+ while (ftx == 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+ if (ftx == 3)
+ return NULL;
+
+ if (ftx == 1) {
+ do_remote_base();
+ } else if (ftx == 2) {
+ /*
+ * On AMD chips, this causes GSBASE != 0, GS == 0, and
+ * thread.gsbase == 0.
+ */
+
+ load_gs();
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+ } else {
+ errx(1, "helper thread got bad command");
+ }
+
+ ftx = 0;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ }
+}
+
+/*
+ * Set the local GS base, let the helper thread set its own GS base, and
+ * check that the local selector and base are the same afterwards.
+ *
+ * @local: base to set with ARCH_SET_GS; HARD_ZERO means base 0 plus an
+ * explicit load of 0 into %gs
+ * @force_sel: if non-zero, selector loaded into %gs before handing over;
+ * the base to compare against is then re-read after that load
+ * @remote: value the helper thread receives through remote_base
+ *
+ * Mismatches are reported and counted in nerrs.
+ */
+static void set_gs_and_switch_to(unsigned long local,
+ unsigned short force_sel,
+ unsigned long remote)
+{
+ unsigned long base;
+ unsigned short sel_pre_sched, sel_post_sched;
+
+ bool hard_zero = false;
+ if (local == HARD_ZERO) {
+ hard_zero = true;
+ local = 0;
+ }
+
+ printf("[RUN]\tARCH_SET_GS(0x%lx)%s, then schedule to 0x%lx\n",
+ local, hard_zero ? " and clear gs" : "", remote);
+ if (force_sel)
+ printf("\tBefore schedule, set selector to 0x%hx\n", force_sel);
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, local) != 0)
+ err(1, "ARCH_SET_GS");
+ if (hard_zero)
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ if (read_base(GS) != local) {
+ nerrs++;
+ printf("[FAIL]\tGSBASE wasn't set as expected\n");
+ }
+
+ if (force_sel) {
+ asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+ sel_pre_sched = force_sel;
+ local = read_base(GS);
+
+ /*
+ * Signal delivery seems to mess up weird selectors. Put it
+ * back.
+ */
+ asm volatile ("mov %0, %%gs" : : "rm" (force_sel));
+ } else {
+ asm volatile ("mov %%gs, %0" : "=rm" (sel_pre_sched));
+ }
+
+ remote_base = remote;
+ /* ftx = 1 is the helper's "do_remote_base" command; it resets ftx to 0. */
+ ftx = 1;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ while (ftx != 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+ asm volatile ("mov %%gs, %0" : "=rm" (sel_post_sched));
+ base = read_base(GS);
+ if (base == local && sel_pre_sched == sel_post_sched) {
+ printf("[OK]\tGS/BASE remained 0x%hx/0x%lx\n",
+ sel_pre_sched, local);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGS/BASE changed from 0x%hx/0x%lx to 0x%hx/0x%lx\n",
+ sel_pre_sched, local, sel_post_sched, base);
+ }
+}
+
+/*
+ * Set the GS base to 0 and load 0 into %gs, have the helper thread run
+ * command 2 (load_gs() followed by clearing %gs), and check that the local
+ * GS base still reads as 0. A non-zero base is reported and counted in
+ * nerrs.
+ */
+static void test_unexpected_base(void)
+{
+ unsigned long base;
+
+ printf("[RUN]\tARCH_SET_GS(0), clear gs, then manipulate GSBASE in a different thread\n");
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, 0) != 0)
+ err(1, "ARCH_SET_GS");
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+
+ ftx = 2;
+ syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+ while (ftx != 0)
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 1, NULL, NULL, 0);
+
+ base = read_base(GS);
+ if (base == 0) {
+ printf("[OK]\tGSBASE remained 0\n");
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to 0x%lx\n", base);
+ }
+}
+
+#define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r)
+
+/*
+ * Fork a child that sets its GS base to 1, requests tracing and stops with
+ * SIGTRAP. The parent then reads the child's gs_base (expects 1), pokes
+ * selector 0x7 into the child's gs and reads gs_base again (expects 0 or
+ * 1). Unexpected values are reported and counted in nerrs. Finally the
+ * child is resumed and reaped.
+ */
+static void test_ptrace_write_gs_read_base(void)
+{
+ int status;
+ pid_t child = fork();
+
+ if (child < 0)
+ err(1, "fork");
+
+ if (child == 0) {
+ printf("[RUN]\tPTRACE_POKE GS, read GSBASE back\n");
+
+ printf("[RUN]\tARCH_SET_GS to 1\n");
+ if (syscall(SYS_arch_prctl, ARCH_SET_GS, 1) != 0)
+ err(1, "ARCH_SET_GS");
+
+ if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ /* Stop so that the parent can inspect and modify the registers. */
+ raise(SIGTRAP);
+ _exit(0);
+ }
+
+ wait(&status);
+
+ if (WSTOPSIG(status) == SIGTRAP) {
+ unsigned long base;
+ unsigned long gs_offset = USER_REGS_OFFSET(gs);
+ unsigned long base_offset = USER_REGS_OFFSET(gs_base);
+
+ /* Read the initial base. It should be 1. */
+ base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+ if (base == 1) {
+ printf("[OK]\tGSBASE started at 1\n");
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE started at 0x%lx\n", base);
+ }
+
+ printf("[RUN]\tSet GS = 0x7, read GSBASE\n");
+
+ /* Poke an LDT selector into GS. */
+ if (ptrace(PTRACE_POKEUSER, child, gs_offset, 0x7) != 0)
+ err(1, "PTRACE_POKEUSER");
+
+ /* And read the base. */
+ base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+
+ if (base == 0 || base == 1) {
+ printf("[OK]\tGSBASE reads as 0x%lx with invalid GS\n", base);
+ } else {
+ nerrs++;
+ printf("[FAIL]\tGSBASE=0x%lx (should be 0 or 1)\n", base);
+ }
+ }
+
+ ptrace(PTRACE_CONT, child, NULL, NULL);
+
+ wait(&status);
+ if (!WIFEXITED(status))
+ printf("[WARN]\tChild didn't exit cleanly.\n");
+}
+
+/*
+ * Fork a child that loads a non-zero GS selector with load_gs(), publishes
+ * it through shared_scratch, requests tracing and stops with SIGTRAP. The
+ * parent pokes 0xFF into the child's gs_base and checks that the child's
+ * gs selector did not change; when have_fsgsbase is set it also checks
+ * that gs_base reads back as 0xFF. Failures are reported and counted in
+ * nerrs. Finally the child is resumed and reaped.
+ */
+static void test_ptrace_write_gsbase(void)
+{
+ int status;
+ pid_t child = fork();
+
+ if (child < 0)
+ err(1, "fork");
+
+ if (child == 0) {
+ printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
+
+ *shared_scratch = load_gs();
+
+ if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ /* Stop so that the parent can inspect and modify the registers. */
+ raise(SIGTRAP);
+ _exit(0);
+ }
+
+ wait(&status);
+
+ if (WSTOPSIG(status) == SIGTRAP) {
+ unsigned long gs, base;
+ unsigned long gs_offset = USER_REGS_OFFSET(gs);
+ unsigned long base_offset = USER_REGS_OFFSET(gs_base);
+
+ gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+
+ /* The child's selector must match what load_gs() returned. */
+ if (gs != *shared_scratch) {
+ nerrs++;
+ printf("[FAIL]\tGS is not prepared with nonzero\n");
+ goto END;
+ }
+
+ if (ptrace(PTRACE_POKEUSER, child, base_offset, 0xFF) != 0)
+ err(1, "PTRACE_POKEUSER");
+
+ gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+ base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
+
+ /*
+ * In a non-FSGSBASE system, the nonzero selector will load
+ * GSBASE (again). But what is tested here is whether the
+ * selector value is changed or not by the GSBASE write in
+ * a ptracer.
+ */
+ if (gs != *shared_scratch) {
+ nerrs++;
+ printf("[FAIL]\tGS changed to %lx\n", gs);
+
+ /*
+ * On older kernels, poking a nonzero value into the
+ * base would zero the selector. On newer kernels,
+ * this behavior has changed -- poking the base
+ * changes only the base and, if FSGSBASE is not
+ * available, this may have no effect once the tracee
+ * is resumed.
+ */
+ if (gs == 0)
+ printf("\tNote: this is expected behavior on older kernels.\n");
+ } else if (have_fsgsbase && (base != 0xFF)) {
+ nerrs++;
+ printf("[FAIL]\tGSBASE changed to %lx\n", base);
+ } else {
+ printf("[OK]\tGS remained 0x%hx", *shared_scratch);
+ if (have_fsgsbase)
+ printf(" and GSBASE changed to 0xFF");
+ printf("\n");
+ }
+ }
+
+END:
+ /* Resume and reap the child on every path. */
+ ptrace(PTRACE_CONT, child, NULL, NULL);
+ wait(&status);
+ if (!WIFEXITED(status))
+ printf("[WARN]\tChild didn't exit cleanly.\n");
+}
+
+/*
+ * Test driver.  Runs the ptrace GS/GSBASE test that must precede LDT
+ * setup, probes for the FSGSBASE instructions, exercises the GS base
+ * checks single-threaded, then pins itself to CPU 0, starts the helper
+ * thread and runs the context-switch matrix.  Returns 0 when no test
+ * bumped nerrs and 1 otherwise.
+ */
+int main()
+{
+	pthread_t thread;
+
+	shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+			      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+	/* The ptrace test dereferences this mapping; fail loudly if absent. */
+	if (shared_scratch == MAP_FAILED)
+		err(1, "mmap");
+
+	/* Do these tests before we have an LDT. */
+	test_ptrace_write_gs_read_base();
+
+	/* Probe FSGSBASE */
+	sethandler(SIGILL, sigill, 0);
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		rdfsbase();
+		have_fsgsbase = true;
+		printf("\tFSGSBASE instructions are enabled\n");
+	} else {
+		printf("\tFSGSBASE instructions are disabled\n");
+	}
+	clearhandler(SIGILL);
+
+	sethandler(SIGSEGV, sigsegv, 0);
+
+	check_gs_value(0);
+	check_gs_value(1);
+	check_gs_value(0x200000000);
+	check_gs_value(0);
+	check_gs_value(0x200000000);
+	check_gs_value(1);
+
+	for (int sched = 0; sched < 2; sched++) {
+		mov_0_gs(0, !!sched);
+		mov_0_gs(1, !!sched);
+		mov_0_gs(0x200000000, !!sched);
+	}
+
+	/* Set up for multithreading. */
+
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0");	/* should never fail */
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	static unsigned long bases_with_hard_zero[] = {
+		0, HARD_ZERO, 1, 0x200000000,
+	};
+
+	/* Every (local base, selector, remote base) combination. */
+	for (int local = 0; local < 4; local++) {
+		for (int remote = 0; remote < 4; remote++) {
+			for (unsigned short s = 0; s < 5; s++) {
+				unsigned short sel = s;
+
+				/* The fifth iteration uses the current SS selector. */
+				if (s == 4)
+					asm ("mov %%ss, %0" : "=rm" (sel));
+				set_gs_and_switch_to(
+					bases_with_hard_zero[local],
+					sel,
+					bases_with_hard_zero[remote]);
+			}
+		}
+	}
+
+	test_unexpected_base();
+
+	if (have_fsgsbase) {
+		unsigned short ss;
+
+		asm volatile ("mov %%ss, %0" : "=rm" (ss));
+
+		test_wrbase(0, 0);
+		test_wrbase(0, 1);
+		test_wrbase(0, 0x200000000);
+		test_wrbase(0, 0xffffffffffffffff);
+		test_wrbase(ss, 0);
+		test_wrbase(ss, 1);
+		test_wrbase(ss, 0x200000000);
+		test_wrbase(ss, 0xffffffffffffffff);
+	}
+
+	ftx = 3;  /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	test_ptrace_write_gsbase();
+
+	return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/fsgsbase_restore.c b/tools/testing/selftests/x86/fsgsbase_restore.c
new file mode 100644
index 000000000..6fffadc51
--- /dev/null
+++ b/tools/testing/selftests/x86/fsgsbase_restore.c
@@ -0,0 +1,245 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * fsgsbase_restore.c, test ptrace vs fsgsbase
+ * Copyright (c) 2020 Andy Lutomirski
+ *
+ * This test case simulates a tracer redirecting tracee execution to
+ * a function and then restoring tracee state using PTRACE_GETREGS and
+ * PTRACE_SETREGS. This is similar to what gdb does when doing
+ * 'p func()'. The catch is that this test has the called function
+ * modify a segment register. This makes sure that ptrace correctly
+ * restores segment state when using PTRACE_SETREGS.
+ *
+ * This is not part of fsgsbase.c, because that test is 64-bit only.
+ */
+
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <err.h>
+#include <sys/user.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+#include <asm/ldt.h>
+#include <sys/mman.h>
+#include <stddef.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <stdint.h>
+
+#define EXPECTED_VALUE 0x1337f00d
+
+#ifdef __x86_64__
+# define SEG "%gs"
+#else
+# define SEG "%fs"
+#endif
+
+/*
+ * Return the 32-bit value stored at offset 0 of the test segment
+ * (%gs on 64-bit builds, %fs on 32-bit builds, see SEG).
+ */
+static unsigned int dereference_seg_base(void)
+{
+	unsigned int ret;
+
+	/*
+	 * The source operand is already a memory reference, so the
+	 * destination must be a register: x86 has no memory-to-memory mov.
+	 */
+	asm volatile ("mov %" SEG ":(0), %0" : "=r" (ret));
+	return ret;
+}
+
+/*
+ * Map a small low-address page holding EXPECTED_VALUE and point the
+ * test segment register (SEG) at it.  modify_ldt() slot 0 is tried
+ * first; if that fails, a GDT slot is requested through the 32-bit
+ * set_thread_area() system call.  Exits with status 0 when no segment
+ * can be created, since nothing can be tested then.
+ */
+static void init_seg(void)
+{
+	unsigned int *target = mmap(
+		NULL, sizeof(unsigned int),
+		PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+	if (target == MAP_FAILED)
+		err(1, "mmap");
+
+	*target = EXPECTED_VALUE;
+
+	printf("\tsegment base address = 0x%lx\n", (unsigned long)target);
+
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = (unsigned int)(uintptr_t)target,
+		.limit           = sizeof(unsigned int) - 1,
+		.seg_32bit       = 1,
+		.contents        = 0, /* Data, grow-up */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
+		printf("\tusing LDT slot 0\n");
+		asm volatile ("mov %0, %" SEG :: "rm" ((unsigned short)0x7));
+	} else {
+		/* No modify_ldt for us (configured out, perhaps) */
+
+		struct user_desc *low_desc = mmap(
+			NULL, sizeof(desc),
+			PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
+		/* Do not memcpy() into MAP_FAILED if the mapping failed. */
+		if (low_desc == MAP_FAILED)
+			err(1, "mmap");
+		memcpy(low_desc, &desc, sizeof(desc));
+
+		/* -1 asks the kernel to pick the slot; it is read back below. */
+		low_desc->entry_number = -1;
+
+		/* 32-bit set_thread_area */
+		long ret;
+		asm volatile ("int $0x80"
+			      : "=a" (ret), "+m" (*low_desc)
+			      : "a" (243), "b" (low_desc)
+#ifdef __x86_64__
+			      : "r8", "r9", "r10", "r11"
+#endif
+			);
+		memcpy(&desc, low_desc, sizeof(desc));
+		munmap(low_desc, sizeof(desc));
+
+		if (ret != 0) {
+			printf("[NOTE]\tcould not create a segment -- can't test anything\n");
+			exit(0);
+		}
+		printf("\tusing GDT slot %d\n", desc.entry_number);
+
+		unsigned short sel = (unsigned short)((desc.entry_number << 3) | 0x3);
+		asm volatile ("mov %0, %" SEG :: "rm" (sel));
+	}
+}
+
+/*
+ * Entry point the tracer redirects the tracee to.  It overwrites the
+ * test segment register with the current SS selector and then stops
+ * itself with SIGSTOP.  It is not expected to continue past the stop;
+ * if it does, it reports failure and pauses forever.
+ */
+static void tracee_zap_segment(void)
+{
+ /*
+ * The tracer will redirect execution here. This is meant to
+ * work like gdb's 'p func()' feature. The tricky bit is that
+ * we modify a segment register in order to make sure that ptrace
+ * can correctly restore segment registers.
+ */
+ printf("\tTracee: in tracee_zap_segment()\n");
+
+ /*
+ * Write a nonzero selector with base zero to the segment register.
+ * Using a null selector would defeat the test on AMD pre-Zen2
+ * CPUs, as such CPUs don't clear the base when loading a null
+ * selector.
+ */
+ unsigned short sel;
+ asm volatile ("mov %%ss, %0\n\t"
+ "mov %0, %" SEG
+ : "=rm" (sel));
+
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+ printf("\tTracee is going back to sleep\n");
+ /* Signal this specific thread so the tracer's waitpid() sees a stop. */
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+ /* Should not get here. */
+ while (true) {
+ printf("[FAIL]\tTracee hit unreachable code\n");
+ pause();
+ }
+}
+
+/*
+ * Tracer side of the test.  Sets up the segment, forks a tracee that
+ * stops itself, saves the tracee's registers, redirects it to
+ * tracee_zap_segment(), and after the second stop restores the saved
+ * registers and detaches.  The tracee then re-checks the segment and
+ * reports through its exit status.  Returns 0 on success, 1 on failure.
+ */
+int main()
+{
+	printf("\tSetting up a segment\n");
+	init_seg();
+
+	unsigned int val = dereference_seg_base();
+	if (val != EXPECTED_VALUE) {
+		printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE);
+		return 1;
+	}
+	printf("[OK]\tThe segment points to the right place.\n");
+
+	pid_t chld = fork();
+	if (chld < 0)
+		err(1, "fork");
+
+	if (chld == 0) {
+		prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0, 0);
+
+		if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+			err(1, "PTRACE_TRACEME");
+
+		pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+		printf("\tTracee will take a nap until signaled\n");
+		syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+		printf("\tTracee was resumed.  Will re-check segment.\n");
+
+		val = dereference_seg_base();
+		if (val != EXPECTED_VALUE) {
+			printf("[FAIL]\tseg[0] == %x; should be %x\n", val, EXPECTED_VALUE);
+			exit(1);
+		}
+
+		printf("[OK]\tThe segment points to the right place.\n");
+		exit(0);
+	}
+
+	int status;
+
+	/* Wait for SIGSTOP. */
+	if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+		err(1, "waitpid");
+
+	struct user_regs_struct regs;
+
+	if (ptrace(PTRACE_GETREGS, chld, NULL, &regs) != 0)
+		err(1, "PTRACE_GETREGS");
+
+#ifdef __x86_64__
+	printf("\tChild GS=0x%lx, GSBASE=0x%lx\n", (unsigned long)regs.gs, (unsigned long)regs.gs_base);
+#else
+	printf("\tChild FS=0x%lx\n", (unsigned long)regs.xfs);
+#endif
+
+	/* regs stays untouched so it can be restored verbatim later. */
+	struct user_regs_struct regs2 = regs;
+#ifdef __x86_64__
+	regs2.rip = (unsigned long)tracee_zap_segment;
+	regs2.rsp -= 128;	/* Don't clobber the redzone. */
+#else
+	regs2.eip = (unsigned long)tracee_zap_segment;
+#endif
+
+	printf("\tTracer: redirecting tracee to tracee_zap_segment()\n");
+	if (ptrace(PTRACE_SETREGS, chld, NULL, &regs2) != 0)
+		err(1, "PTRACE_SETREGS");
+	if (ptrace(PTRACE_CONT, chld, NULL, NULL) != 0)
+		err(1, "PTRACE_CONT");
+
+	/* Wait for SIGSTOP. */
+	if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+		err(1, "waitpid");
+
+	printf("\tTracer: restoring tracee state\n");
+	if (ptrace(PTRACE_SETREGS, chld, NULL, &regs) != 0)
+		err(1, "PTRACE_SETREGS");
+	if (ptrace(PTRACE_DETACH, chld, NULL, NULL) != 0)
+		err(1, "PTRACE_DETACH");
+
+	/* Wait for the detached tracee to terminate. */
+	if (waitpid(chld, &status, 0) != chld)
+		err(1, "waitpid");
+
+	if (WIFSIGNALED(status)) {
+		printf("[FAIL]\tTracee crashed\n");
+		return 1;
+	}
+
+	if (!WIFEXITED(status)) {
+		printf("[FAIL]\tTracee stopped for an unexpected reason: %d\n", status);
+		return 1;
+	}
+
+	int exitcode = WEXITSTATUS(status);
+	if (exitcode != 0) {
+		printf("[FAIL]\tTracee reported failure\n");
+		return 1;
+	}
+
+	printf("[OK]\tAll is well.\n");
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/helpers.h b/tools/testing/selftests/x86/helpers.h
new file mode 100644
index 000000000..f5ff2a261
--- /dev/null
+++ b/tools/testing/selftests/x86/helpers.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#ifndef __SELFTESTS_X86_HELPERS_H
+#define __SELFTESTS_X86_HELPERS_H
+
+#include <asm/processor-flags.h>
+
+/*
+ * Return the current flags register by pushing it and popping it into
+ * a general-purpose register.  On x86-64 the stack pointer is lowered
+ * by 128 bytes around the push/pop -- presumably to step over the
+ * red zone below %rsp; confirm against the ABI.
+ */
+static inline unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+
+ asm volatile (
+#ifdef __x86_64__
+ "subq $128, %%rsp\n\t"
+ "pushfq\n\t"
+ "popq %0\n\t"
+ "addq $128, %%rsp"
+#else
+ "pushfl\n\t"
+ "popl %0"
+#endif
+ : "=r" (eflags) :: "memory");
+
+ return eflags;
+}
+
+/*
+ * Load @eflags into the flags register by pushing the value and
+ * popping it with popf.  On x86-64 the stack pointer is lowered by 128
+ * bytes around the push/pop -- presumably to step over the red zone
+ * below %rsp; confirm against the ABI.
+ */
+static inline void set_eflags(unsigned long eflags)
+{
+ asm volatile (
+#ifdef __x86_64__
+ "subq $128, %%rsp\n\t"
+ "pushq %0\n\t"
+ "popfq\n\t"
+ "addq $128, %%rsp"
+#else
+ "pushl %0\n\t"
+ "popfl"
+#endif
+ :: "r" (eflags) : "flags", "memory");
+}
+
+#endif /* __SELFTESTS_X86_HELPERS_H */
diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c
new file mode 100644
index 000000000..57ec5e99e
--- /dev/null
+++ b/tools/testing/selftests/x86/ioperm.c
@@ -0,0 +1,185 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ioperm.c - Test case for ioperm(2)
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <sys/io.h>
+
+static int nerrs = 0;
+
+/*
+ * Install @handler as the SA_SIGINFO-style handler for @sig, with
+ * @flags OR-ed into sa_flags and an empty signal mask.  Exits through
+ * err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags | SA_SIGINFO;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Reset the disposition of @sig to SIG_DFL.  Exits through err() if
+ * sigaction() fails.
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction dfl;
+
+	memset(&dfl, 0, sizeof(dfl));
+	sigemptyset(&dfl.sa_mask);
+	dfl.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &dfl, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static jmp_buf jmpbuf;
+
+/* SIGSEGV handler: jump back to the sigsetjmp() point saved in jmpbuf. */
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Attempt a one-byte write of 0 to I/O port @port.
+ *
+ * Returns true if the outb completed and false if it raised SIGSEGV.
+ * The temporary SIGSEGV handler is removed on both paths; previously
+ * the clearhandler() call was unreachable, so a successful outb left
+ * the handler armed with a stale jump buffer.
+ */
+static bool try_outb(unsigned short port)
+{
+	bool ok;
+
+	sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+	if (sigsetjmp(jmpbuf, 1) != 0) {
+		/* Reached through siglongjmp() from the SIGSEGV handler. */
+		ok = false;
+	} else {
+		asm volatile ("outb %%al, %w[port]"
+			      : : [port] "Nd" (port), "a" (0));
+		ok = true;
+	}
+	clearhandler(SIGSEGV);
+
+	return ok;
+}
+
+/*
+ * Require that an outb to @port succeeds; otherwise report the failure
+ * and exit with status 1.
+ */
+static void expect_ok(unsigned short port)
+{
+	if (try_outb(port)) {
+		printf("[OK]\toutb to 0x%02hx worked\n", port);
+		return;
+	}
+
+	printf("[FAIL]\toutb to 0x%02hx failed\n", port);
+	exit(1);
+}
+
+/*
+ * Require that an outb to @port faults; if it goes through, report the
+ * failure and exit with status 1.
+ */
+static void expect_gp(unsigned short port)
+{
+	if (!try_outb(port)) {
+		printf("[OK]\toutb to 0x%02hx failed\n", port);
+		return;
+	}
+
+	printf("[FAIL]\toutb to 0x%02hx worked\n", port);
+	exit(1);
+}
+
+/*
+ * ioperm(2) test driver.  Checks that port access follows the I/O
+ * permission bitmap, that fork() gives the child its own copy of the
+ * bitmap, and that re-enabling a port after dropping privileges is
+ * refused.  Returns 0 on success (or when ioperm cannot be enabled at
+ * all) and 1 if any check failed.
+ */
+int main(void)
+{
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0");
+
+	expect_gp(0x80);
+	expect_gp(0xed);
+
+	/*
+	 * Probe for ioperm support.  Note that clearing ioperm bits
+	 * works even as nonroot.
+	 */
+	printf("[RUN]\tenable 0x80\n");
+	if (ioperm(0x80, 1, 1) != 0) {
+		printf("[OK]\tioperm(0x80, 1, 1) failed (%d) -- try running as root\n",
+		       errno);
+		return 0;
+	}
+	expect_ok(0x80);
+	expect_gp(0xed);
+
+	printf("[RUN]\tdisable 0x80\n");
+	if (ioperm(0x80, 1, 0) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)\n", errno);
+		return 1;
+	}
+	expect_gp(0x80);
+	expect_gp(0xed);
+
+	/* Make sure that fork() preserves ioperm. */
+	if (ioperm(0x80, 1, 1) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 1) failed (%d)\n", errno);
+		return 1;
+	}
+
+	pid_t child = fork();
+	if (child == -1)
+		err(1, "fork");
+
+	if (child == 0) {
+		printf("[RUN]\tchild: check that we inherited permissions\n");
+		expect_ok(0x80);
+		expect_gp(0xed);
+		printf("[RUN]\tchild: Extend permissions to 0x81\n");
+		if (ioperm(0x81, 1, 1) != 0) {
+			printf("[FAIL]\tioperm(0x81, 1, 1) failed (%d)\n", errno);
+			return 1;
+		}
+		printf("[RUN]\tchild: Drop permissions to 0x80\n");
+		if (ioperm(0x80, 1, 0) != 0) {
+			printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)\n", errno);
+			return 1;
+		}
+		expect_gp(0x80);
+		return 0;
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+
+	/* Verify that the child dropping 0x80 did not affect the parent */
+	printf("\tVerify that unsharing the bitmap worked\n");
+	expect_ok(0x80);
+
+	/* Test the capability checks. */
+	printf("\tDrop privileges\n");
+	if (setresuid(1, 1, 1) != 0) {
+		printf("[WARN]\tDropping privileges failed\n");
+		/* Still report failures recorded by the fork test above. */
+		return nerrs ? 1 : 0;
+	}
+
+	printf("[RUN]\tdisable 0x80\n");
+	if (ioperm(0x80, 1, 0) != 0) {
+		printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)\n", errno);
+		return 1;
+	}
+	printf("[OK]\tit worked\n");
+
+	printf("[RUN]\tenable 0x80 again\n");
+	if (ioperm(0x80, 1, 1) == 0) {
+		printf("[FAIL]\tit succeeded but should have failed.\n");
+		return 1;
+	}
+	printf("[OK]\tit failed\n");
+
+	/* A child failure counted in nerrs must fail the whole test. */
+	return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/iopl.c b/tools/testing/selftests/x86/iopl.c
new file mode 100644
index 000000000..7e3e09c1a
--- /dev/null
+++ b/tools/testing/selftests/x86/iopl.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iopl.c - Test case for a Linux on Xen 64-bit bug
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <sched.h>
+#include <sys/io.h>
+
+static int nerrs = 0;
+
+/*
+ * Register @handler for @sig as an SA_SIGINFO handler; @flags is OR-ed
+ * into sa_flags and the signal mask is left empty.  A sigaction()
+ * failure terminates the program through err().
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags | SA_SIGINFO;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Put @sig back to its default disposition.  A sigaction() failure
+ * terminates the program through err().
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction dfl;
+
+	memset(&dfl, 0, sizeof(dfl));
+	sigemptyset(&dfl.sa_mask);
+	dfl.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &dfl, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static jmp_buf jmpbuf;
+
+/* SIGSEGV handler: jump back to the sigsetjmp() point saved in jmpbuf. */
+static void sigsegv(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Attempt a one-byte write of 0 to I/O port @port.
+ *
+ * Returns true if the outb completed and false if it raised SIGSEGV.
+ * The temporary SIGSEGV handler is removed on both paths; previously
+ * the clearhandler() call was unreachable, so a successful outb left
+ * the handler armed with a stale jump buffer.
+ */
+static bool try_outb(unsigned short port)
+{
+	bool ok;
+
+	sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+	if (sigsetjmp(jmpbuf, 1) != 0) {
+		/* Reached through siglongjmp() from the SIGSEGV handler. */
+		ok = false;
+	} else {
+		asm volatile ("outb %%al, %w[port]"
+			      : : [port] "Nd" (port), "a" (0));
+		ok = true;
+	}
+	clearhandler(SIGSEGV);
+
+	return ok;
+}
+
+/*
+ * Require that an outb to @port succeeds; otherwise report the failure
+ * and exit with status 1.
+ */
+static void expect_ok_outb(unsigned short port)
+{
+	if (try_outb(port)) {
+		printf("[OK]\toutb to 0x%02hx worked\n", port);
+		return;
+	}
+
+	printf("[FAIL]\toutb to 0x%02hx failed\n", port);
+	exit(1);
+}
+
+/*
+ * Check that an outb to @port faults.  An outb that goes through is
+ * reported as a failure and counted in nerrs.  The "[OK]" line is now
+ * printed only when the outb really faulted; previously it followed
+ * the "[FAIL]" line as well.
+ */
+static void expect_gp_outb(unsigned short port)
+{
+	if (try_outb(port)) {
+		printf("[FAIL]\toutb to 0x%02hx worked\n", port);
+		nerrs++;
+	} else {
+		printf("[OK]\toutb to 0x%02hx failed\n", port);
+	}
+}
+
+#define RET_FAULTED 0
+#define RET_FAIL 1
+#define RET_EMUL 2
+
+/*
+ * Execute CLI and classify the outcome:
+ *   RET_FAULTED - the instruction raised SIGSEGV;
+ *   RET_FAIL    - it ran and the interrupt flag is now clear;
+ *   RET_EMUL    - it ran but the interrupt flag is still set.
+ * The temporary SIGSEGV handler is removed on every path; previously
+ * the clearhandler() call was unreachable.
+ */
+static int try_cli(void)
+{
+	unsigned long flags;
+	int ret;
+
+	sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+	if (sigsetjmp(jmpbuf, 1) != 0) {
+		ret = RET_FAULTED;
+	} else {
+		asm volatile("cli; pushf; pop %[flags]"
+			     : [flags] "=rm" (flags));
+
+		/* X86_FLAGS_IF */
+		if (!(flags & (1 << 9)))
+			ret = RET_FAIL;
+		else
+			ret = RET_EMUL;
+	}
+	clearhandler(SIGSEGV);
+
+	return ret;
+}
+
+/*
+ * Execute STI and classify the outcome:
+ *   RET_FAULTED - the instruction raised SIGSEGV;
+ *   RET_FAIL    - @irqs_off was true and the interrupt flag is now set;
+ *   RET_EMUL    - otherwise.
+ * The temporary SIGSEGV handler is removed on every path; previously
+ * the clearhandler() call was unreachable.
+ */
+static int try_sti(bool irqs_off)
+{
+	unsigned long flags;
+	int ret;
+
+	sethandler(SIGSEGV, sigsegv, SA_RESETHAND);
+	if (sigsetjmp(jmpbuf, 1) != 0) {
+		ret = RET_FAULTED;
+	} else {
+		asm volatile("sti; pushf; pop %[flags]"
+			     : [flags] "=rm" (flags));
+
+		/* X86_FLAGS_IF */
+		if (irqs_off && (flags & (1 << 9)))
+			ret = RET_FAIL;
+		else
+			ret = RET_EMUL;
+	}
+	clearhandler(SIGSEGV);
+
+	return ret;
+}
+
+/*
+ * Run try_sti() and report the result.  A fault or a no-op counts as
+ * OK; anything else is a failure and bumps nerrs.
+ */
+static void expect_gp_sti(bool irqs_off)
+{
+	int outcome = try_sti(irqs_off);
+
+	if (outcome == RET_FAULTED) {
+		printf("[OK]\tSTI faulted\n");
+	} else if (outcome == RET_EMUL) {
+		printf("[OK]\tSTI NOPped\n");
+	} else {
+		printf("[FAIL]\tSTI worked\n");
+		nerrs++;
+	}
+}
+
+/*
+ * Returns whether it managed to disable interrupts.
+ */
+static bool test_cli(void)
+{
+	int outcome = try_cli();
+
+	/* A fault or a no-op both mean interrupts were left enabled. */
+	if (outcome == RET_FAULTED) {
+		printf("[OK]\tCLI faulted\n");
+		return false;
+	}
+	if (outcome == RET_EMUL) {
+		printf("[OK]\tCLI NOPped\n");
+		return false;
+	}
+
+	printf("[FAIL]\tCLI worked\n");
+	nerrs++;
+	return true;
+}
+
+/*
+ * iopl(2) test driver.  Verifies that CLI/STI stay blocked at IOPL 3,
+ * that the I/O bitmap survives dropping back to IOPL 0, that a child's
+ * iopl(3) does not leak into the parent, and that the privilege checks
+ * hold after setresuid().  Returns 0 on success (or when iopl cannot be
+ * used at all) and 1 if any check failed.
+ */
+int main(void)
+{
+	cpu_set_t cpuset;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 0");
+
+	/*
+	 * Probe for iopl support.  Note that iopl(0) works even as nonroot.
+	 * iopl() reports errors as -1 with errno set, so ENOSYS has to be
+	 * detected through errno rather than the return value.
+	 */
+	if (iopl(3) != 0) {
+		if (errno == ENOSYS)
+			printf("[OK]\tiopl() not supported\n");
+		else
+			printf("[OK]\tiopl(3) failed (%d) -- try running as root\n",
+			       errno);
+		return 0;
+	}
+
+	/* Make sure that CLI/STI are blocked even with IOPL level 3 */
+	expect_gp_sti(test_cli());
+	expect_ok_outb(0x80);
+
+	/* Establish an I/O bitmap to test the restore */
+	if (ioperm(0x80, 1, 1) != 0)
+		err(1, "ioperm(0x80, 1, 1) failed");
+
+	/* Restore our original state prior to starting the fork test. */
+	if (iopl(0) != 0)
+		err(1, "iopl(0)");
+
+	/*
+	 * Verify that IOPL emulation is disabled and the I/O bitmap still
+	 * works.
+	 */
+	expect_ok_outb(0x80);
+	expect_gp_outb(0xed);
+	/* Drop the I/O bitmap */
+	if (ioperm(0x80, 1, 0) != 0)
+		err(1, "ioperm(0x80, 1, 0) failed");
+
+	pid_t child = fork();
+	if (child == -1)
+		err(1, "fork");
+
+	if (child == 0) {
+		printf("\tchild: set IOPL to 3\n");
+		if (iopl(3) != 0)
+			err(1, "iopl");
+
+		printf("[RUN]\tchild: write to 0x80\n");
+		asm volatile ("outb %%al, $0x80" : : "a" (0));
+
+		return 0;
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+
+	printf("[RUN]\tparent: write to 0x80 (should fail)\n");
+
+	expect_gp_outb(0x80);
+	expect_gp_sti(test_cli());
+
+	/* Test the capability checks. */
+	printf("\tiopl(3)\n");
+	if (iopl(3) != 0)
+		err(1, "iopl(3)");
+
+	printf("\tDrop privileges\n");
+	if (setresuid(1, 1, 1) != 0) {
+		printf("[WARN]\tDropping privileges failed\n");
+		goto done;
+	}
+
+	printf("[RUN]\tiopl(3) unprivileged but with IOPL==3\n");
+	if (iopl(3) != 0) {
+		printf("[FAIL]\tiopl(3) should work if iopl is already 3 even if unprivileged\n");
+		nerrs++;
+	}
+
+	printf("[RUN]\tiopl(0) unprivileged\n");
+	if (iopl(0) != 0) {
+		printf("[FAIL]\tiopl(0) should work if iopl is already 3 even if unprivileged\n");
+		nerrs++;
+	}
+
+	printf("[RUN]\tiopl(3) unprivileged\n");
+	if (iopl(3) == 0) {
+		printf("[FAIL]\tiopl(3) should fail when unprivileged if iopl==0\n");
+		nerrs++;
+	} else {
+		printf("[OK]\tFailed as expected\n");
+	}
+
+done:
+	return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
new file mode 100644
index 000000000..1aef72df2
--- /dev/null
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -0,0 +1,927 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/ldt.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+#include <linux/futex.h>
+#include <sys/mman.h>
+#include <asm/prctl.h>
+#include <sys/prctl.h>
+
+#define AR_ACCESSED (1<<8)
+
+#define AR_TYPE_RODATA (0 * (1<<9))
+#define AR_TYPE_RWDATA (1 * (1<<9))
+#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9))
+#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9))
+#define AR_TYPE_XOCODE (4 * (1<<9))
+#define AR_TYPE_XRCODE (5 * (1<<9))
+#define AR_TYPE_XOCODE_CONF (6 * (1<<9))
+#define AR_TYPE_XRCODE_CONF (7 * (1<<9))
+
+#define AR_DPL3 (3 * (1<<13))
+
+#define AR_S (1 << 12)
+#define AR_P (1 << 15)
+#define AR_AVL (1 << 20)
+#define AR_L (1 << 21)
+#define AR_DB (1 << 22)
+#define AR_G (1 << 23)
+
+#ifdef __x86_64__
+# define INT80_CLOBBERS "r8", "r9", "r10", "r11"
+#else
+# define INT80_CLOBBERS
+#endif
+
+static int nerrs;
+
+/* Points to an array of 1024 ints, each holding its own index. */
+static const unsigned int *counter_page;
+static struct user_desc *low_user_desc;
+static struct user_desc *low_user_desc_clear; /* Use to delete GDT entry */
+static int gdt_entry_num;
+
+/*
+ * Verify that descriptor @index in the LDT (@ldt != 0) or GDT (@ldt == 0)
+ * is not usable: both LSL and LAR on the corresponding RPL-3 selector
+ * must leave ZF clear.  Prints the verdict and bumps nerrs if either
+ * instruction reports a valid descriptor.
+ */
+static void check_invalid_segment(uint16_t index, int ldt)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ /* Selector layout: index in bits 15:3, table indicator in bit 2, RPL 3. */
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ /* has_limit becomes 1 only when lsl sets ZF (jnz skips the store). */
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ /* Same pattern for the access rights via lar. */
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (has_limit || has_ar) {
+ printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ } else {
+ printf("[OK]\t%s entry %hu is invalid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ }
+}
+
+/*
+ * Verify that descriptor @index in the LDT (@ldt != 0) or GDT (@ldt == 0)
+ * is usable and that LAR/LSL report @expected_ar (with or without the
+ * accessed bit) and @expected_limit.  Mismatches are printed and counted
+ * in nerrs; a match is printed only when @verbose is true.
+ */
+static void check_valid_segment(uint16_t index, int ldt,
+ uint32_t expected_ar, uint32_t expected_limit,
+ bool verbose)
+{
+ uint32_t has_limit = 0, has_ar = 0, limit, ar;
+ /* Selector layout: index in bits 15:3, table indicator in bit 2, RPL 3. */
+ uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+ /* has_limit becomes 1 only when lsl sets ZF (jnz skips the store). */
+ asm ("lsl %[selector], %[limit]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_limit]\n\t"
+ "1:"
+ : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+ : [selector] "r" (selector));
+ /* Same pattern for the access rights via lar. */
+ asm ("larl %[selector], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "movl $1, %[has_ar]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+ : [selector] "r" (selector));
+
+ if (!has_limit || !has_ar) {
+ printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
+ (ldt ? "LDT" : "GDT"), index);
+ nerrs++;
+ return;
+ }
+
+ /* The SDM says "bits 19:16 are undefined". Thanks. */
+ ar &= ~0xF0000;
+
+ /*
+ * NB: Different Linux versions do different things with the
+ * accessed bit in set_thread_area().
+ */
+ if (ar != expected_ar && ar != (expected_ar | AR_ACCESSED)) {
+ printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+ nerrs++;
+ } else if (limit != expected_limit) {
+ printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
+ nerrs++;
+ } else if (verbose) {
+ printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
+ (ldt ? "LDT" : "GDT"), index, ar, limit);
+ }
+}
+
+/*
+ * Install a copy of descriptor @d and verify the result.
+ *
+ * @ldt selects modify_ldt() (function 1 when @oldmode, else 0x11);
+ * otherwise set_thread_area() is used with gdt_entry_num, which is only
+ * attempted on __i386__ builds and only when gdt_entry_num is nonzero.
+ * On success the installed entry is checked against @ar and the limit
+ * derived from the descriptor.
+ *
+ * Returns true only if the descriptor was installed.  A rejected 32-bit
+ * descriptor is counted in nerrs; a rejected 16-bit one is accepted.
+ */
+static bool install_valid_mode(const struct user_desc *d, uint32_t ar,
+ bool oldmode, bool ldt)
+{
+ struct user_desc desc = *d;
+ int ret;
+
+ if (!ldt) {
+#ifndef __i386__
+ /* No point testing set_thread_area in a 64-bit build */
+ return false;
+#endif
+ if (!gdt_entry_num)
+ return false;
+ desc.entry_number = gdt_entry_num;
+
+ ret = syscall(SYS_set_thread_area, &desc);
+ } else {
+ ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+ &desc, sizeof(desc));
+
+ /*
+ * NOTE(review): presumably covers a raw negative error code
+ * coming back instead of -1/errno -- confirm.
+ */
+ if (ret < -1)
+ errno = -ret;
+
+ if (ret != 0 && errno == ENOSYS) {
+ printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+ return false;
+ }
+ }
+
+ if (ret == 0) {
+ uint32_t limit = desc.limit;
+ /* Page-granular limits are reported by LSL in bytes. */
+ if (desc.limit_in_pages)
+ limit = (limit << 12) + 4095;
+ check_valid_segment(desc.entry_number, ldt, ar, limit, true);
+ return true;
+ } else {
+ if (desc.seg_32bit) {
+ printf("[FAIL]\tUnexpected %s failure %d\n",
+ ldt ? "modify_ldt" : "set_thread_area",
+ errno);
+ nerrs++;
+ return false;
+ } else {
+ printf("[OK]\t%s rejected 16 bit segment\n",
+ ldt ? "modify_ldt" : "set_thread_area");
+ return false;
+ }
+ }
+}
+
+/*
+ * Install @desc in the LDT using the new-mode interface and, for
+ * present 32-bit data descriptors (contents <= 1), try it in the GDT
+ * as well.  Returns whether the LDT installation succeeded.
+ */
+static bool install_valid(const struct user_desc *desc, uint32_t ar)
+{
+	bool gdt_eligible = desc->contents <= 1 && desc->seg_32bit &&
+			    !desc->seg_not_present;
+	bool ldt_ok = install_valid_mode(desc, ar, false, true);
+
+	/* Should work in the GDT, too. */
+	if (gdt_eligible)
+		install_valid_mode(desc, ar, false, false);
+
+	return ldt_ok;
+}
+
+/*
+ * Hand @desc to modify_ldt() (function 1 when @oldmode, else 0x11) and
+ * expect the slot to end up unusable.  ENOSYS and a rejected 16-bit
+ * descriptor are accepted; any other failure is counted in nerrs.
+ */
+static void install_invalid(const struct user_desc *desc, bool oldmode)
+{
+	int func = oldmode ? 1 : 0x11;
+	int rc = syscall(SYS_modify_ldt, func, desc, sizeof(*desc));
+
+	if (rc < -1)
+		errno = -rc;
+
+	if (rc == 0) {
+		check_invalid_segment(desc->entry_number, 1);
+		return;
+	}
+
+	if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+		return;
+	}
+
+	if (!desc->seg_32bit) {
+		printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+		return;
+	}
+
+	printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+	       errno);
+	nerrs++;
+}
+
+/*
+ * Thin wrapper around the modify_ldt() system call.
+ *
+ * @func:      modify_ldt function code, passed through unchanged
+ *             (previously ignored in favour of a hard-coded 0x11)
+ * @ptr:       descriptor to install
+ * @bytecount: size of *@ptr
+ *
+ * Returns the syscall() result.  A result below -1 is additionally
+ * negated into errno so callers can always test errno.
+ */
+static int safe_modify_ldt(int func, struct user_desc *ptr,
+			   unsigned long bytecount)
+{
+	int ret = syscall(SYS_modify_ldt, func, ptr, bytecount);
+
+	if (ret < -1)
+		errno = -ret;
+	return ret;
+}
+
+/*
+ * Expect modify_ldt() to reject @desc.  Acceptance is reported as a
+ * failure and counted in nerrs; any error, including ENOSYS, is OK.
+ */
+static void fail_install(struct user_desc *desc)
+{
+	int rc = safe_modify_ldt(0x11, desc, sizeof(*desc));
+
+	if (rc == 0) {
+		printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
+		nerrs++;
+		return;
+	}
+
+	if (errno == ENOSYS)
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+	else
+		printf("[OK]\tmodify_ldt failure %d\n", errno);
+}
+
+/*
+ * Single-threaded descriptor tests.  One user_desc is mutated step by
+ * step and installed after each change, with the expected access-rights
+ * value passed alongside.  If the last installation succeeds, the fork
+ * and LDT-size tests run as well.  The function then checks an
+ * out-of-range entry number and a series of descriptors that resemble
+ * "delete this entry" requests.  Each step depends on the descriptor
+ * state left by the previous one, so the order matters.
+ */
+static void do_simple_tests(void)
+{
+ struct user_desc desc = {
+ .entry_number = 0,
+ .base_addr = 0,
+ .limit = 10,
+ .seg_32bit = 1,
+ .contents = 2, /* Code, not conforming */
+ .read_exec_only = 0,
+ .limit_in_pages = 0,
+ .seg_not_present = 0,
+ .useable = 0
+ };
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ /* Entry 1 has not been installed so far. */
+ check_invalid_segment(1, 1);
+
+ desc.entry_number = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ check_invalid_segment(1, 1);
+
+ desc.base_addr = 0xf0000000;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G);
+
+ desc.useable = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_P | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.seg_32bit = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_G | AR_AVL);
+
+ desc.seg_32bit = 1;
+ desc.contents = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.contents = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
+ AR_S | AR_DB | AR_G | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.limit_in_pages = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.contents = 3;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 0;
+ desc.contents = 2;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ desc.read_exec_only = 1;
+
+#ifdef __x86_64__
+ desc.lm = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+ desc.lm = 0;
+#endif
+
+ /* The result of this install decides whether an LDT is available. */
+ bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL);
+
+ if (entry1_okay) {
+ printf("[RUN]\tTest fork\n");
+ pid_t child = fork();
+ if (child == 0) {
+ /* The child reports only its own errors via its exit code. */
+ nerrs = 0;
+ check_valid_segment(desc.entry_number, 1,
+ AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL, desc.limit,
+ true);
+ check_invalid_segment(1, 1);
+ exit(nerrs ? 1 : 0);
+ } else {
+ int status;
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tChild died\n");
+ nerrs++;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild succeeded\n");
+ }
+ }
+
+ printf("[RUN]\tTest size\n");
+ int i;
+ /* Install entries 0..8191, each with a limit equal to its index. */
+ for (i = 0; i < 8192; i++) {
+ desc.entry_number = i;
+ desc.limit = i;
+ if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+ printf("[FAIL]\tFailed to install entry %d\n", i);
+ nerrs++;
+ break;
+ }
+ }
+ /* Verify every entry that was installed before any failure. */
+ for (int j = 0; j < i; j++) {
+ check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
+ AR_S | AR_DB | AR_AVL, j, false);
+ }
+ printf("[DONE]\tSize test\n");
+ } else {
+ printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
+ }
+
+ /* Test entry_number too high. */
+ desc.entry_number = 8192;
+ fail_install(&desc);
+
+ /* Test deletion and actions mistakeable for deletion. */
+ memset(&desc, 0, sizeof(desc));
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
+
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.seg_not_present = 0;
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
+
+ desc.read_exec_only = 0;
+ desc.seg_not_present = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+ desc.read_exec_only = 1;
+ desc.limit = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.limit = 0;
+ desc.base_addr = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+ desc.base_addr = 0;
+ install_invalid(&desc, false);
+
+ desc.seg_not_present = 0;
+ desc.seg_32bit = 1;
+ desc.read_exec_only = 0;
+ desc.limit = 0xfffff;
+
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+
+ desc.limit_in_pages = 1;
+
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P | AR_DB | AR_G);
+ desc.contents = 1;
+ desc.read_exec_only = 0;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+ desc.read_exec_only = 1;
+ install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | AR_S | AR_P | AR_DB | AR_G);
+
+ desc.limit = 0;
+ install_invalid(&desc, true);
+}
+
+/*
+ * 0: thread is idle
+ * 1: thread armed
+ * 2: thread should clear LDT entry 0
+ * 3 or more: thread should exit (do_multicpu_tests() stores 100)
+ */
+static volatile unsigned int ftx;
+
+/*
+ * Helper thread for do_multicpu_tests().  It pins itself to CPU 1 and then
+ * loops: sleep on the ftx futex while it is 0, spin until the controller
+ * sets it to 2, clear LDT entry 0, and atomically subtract 2 from ftx to
+ * report completion.  Returns NULL once ftx is 3 or more.  @ctx is unused.
+ */
+static void *threadproc(void *ctx)
+{
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(1, &cpuset);
+ if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+ err(1, "sched_setaffinity to CPU 1"); /* should never fail */
+
+ while (1) {
+ /* Block while ftx is still 0; the controller wakes us after arming. */
+ syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+ /* Busy-wait for the "go" value (2); any value >= 3 means quit. */
+ while (ftx != 2) {
+ if (ftx >= 3)
+ return NULL;
+ }
+
+ /* clear LDT entry 0 */
+ const struct user_desc desc = {};
+ if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
+ err(1, "modify_ldt");
+
+ /* If ftx == 2, set it to zero. If ftx == 100, quit. */
+ /* xadd adds -2 to ftx and leaves the previous ftx value in x. */
+ unsigned int x = -2;
+ asm volatile ("lock xaddl %[x], %[ftx]" :
+ [x] "+r" (x), [ftx] "+m" (ftx));
+ if (x != 2)
+ return NULL;
+ }
+}
+
+#ifdef __i386__
+
+/*
+ * Provide SA_RESTORER only if the libc headers did not.  The guard must
+ * test the same macro that is defined below.
+ */
+#ifndef SA_RESTORER
+#define SA_RESTORER 0x04000000
+#endif
+
+/*
+ * The UAPI header calls this 'struct sigaction', which conflicts with
+ * glibc.  Sigh.
+ */
+struct fake_ksigaction {
+	void *handler;  /* the real type is nasty */
+	unsigned long sa_flags;
+	void (*sa_restorer)(void);
+	unsigned char sigset[8];
+};
+
+/*
+ * Read back the action registered for @sig with a raw rt_sigaction call
+ * and, if it carries a non-NULL sa_restorer without SA_RESTORER being set,
+ * reinstall it with sa_restorer cleared.  A failure to read the action is
+ * ignored; a failure to write it back is fatal.
+ */
+static void fix_sa_restorer(int sig)
+{
+	struct fake_ksigaction ksa;
+
+	if (syscall(SYS_rt_sigaction, sig, NULL, &ksa,
+		    sizeof(ksa.sigset)) == 0) {
+		/*
+		 * glibc has a nasty bug: it sometimes writes garbage to
+		 * sa_restorer.  This interacts quite badly with anything
+		 * that fiddles with SS because it can trigger legacy
+		 * stack switching.  Patch it up.  See:
+		 *
+		 * https://sourceware.org/bugzilla/show_bug.cgi?id=21269
+		 */
+		if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) {
+			ksa.sa_restorer = NULL;
+			if (syscall(SYS_rt_sigaction, sig, &ksa, NULL,
+				    sizeof(ksa.sigset)) != 0)
+				err(1, "rt_sigaction");
+		}
+	}
+}
+#else
+/* Nothing to repair on 64-bit builds. */
+static void fix_sa_restorer(int sig)
+{
+	/* 64-bit glibc works fine. */
+}
+#endif
+
+/*
+ * Register @handler as the three-argument (SA_SIGINFO) handler for @sig,
+ * with the caller's extra @flags OR-ed into sa_flags and an empty signal
+ * mask.  Exits through err() if sigaction() fails, then hands the signal
+ * to fix_sa_restorer().
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags | SA_SIGINFO;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+
+	fix_sa_restorer(sig);
+}
+
+/*
+ * Jump target shared by do_multicpu_tests() and the fault handler below.
+ * It is filled by sigsetjmp() and consumed by siglongjmp(), so it has to
+ * be a sigjmp_buf rather than a plain jmp_buf.
+ */
+static sigjmp_buf jmpbuf;
+
+/*
+ * Fault handler: abandon the faulting code and resume at the most recent
+ * sigsetjmp(jmpbuf, ...), which then returns 1.  All three arguments are
+ * unused.
+ */
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Cross-CPU LDT invalidation test.
+ *
+ * The calling thread pins itself to CPU 0 and starts threadproc(), which
+ * pins itself to CPU 1.  Each iteration installs a flat data segment in
+ * LDT entry 0, loads it into SS (selector 0x7), and then tells the helper
+ * thread to clear that LDT entry.  The expected outcome is a SIGSEGV (or
+ * SIGILL on 32-bit builds) that the handler turns into a siglongjmp() back
+ * to the top of the loop.  Reaching the end of the loop body counts as a
+ * failed iteration.
+ *
+ * The test is skipped if the process cannot run on both CPU 0 and CPU 1
+ * or if modify_ldt() reports ENOSYS.  A failure is recorded in nerrs.
+ */
+static void do_multicpu_tests(void)
+{
+	cpu_set_t cpuset;
+	pthread_t thread;
+	int failures = 0, iters = 5, i;
+	unsigned short orig_ss;
+
+	/* Probe CPU 1 first: the helper thread will need it. */
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 1\n");
+		return;
+	}
+
+	/* The controlling thread itself runs on CPU 0. */
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 0\n");
+		return;
+	}
+
+	sethandler(SIGSEGV, sigsegv, 0);
+#ifdef __i386__
+	/* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
+	sethandler(SIGILL, sigsegv, 0);
+#endif
+
+	printf("[RUN]\tCross-CPU LDT invalidation\n");
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	/* Remember the normal SS so a failed iteration can restore it. */
+	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
+
+	/* Run exactly as many iterations as the summary below reports. */
+	for (i = 0; i < iters; i++) {
+		/* A fault in the previous pass lands here: go to the next one. */
+		if (sigsetjmp(jmpbuf, 1) != 0)
+			continue;
+
+		/* Make sure the thread is ready after the last test. */
+		while (ftx != 0)
+			;
+
+		struct user_desc desc = {
+			.entry_number    = 0,
+			.base_addr       = 0,
+			.limit           = 0xfffff,
+			.seg_32bit       = 1,
+			.contents        = 0, /* Data */
+			.read_exec_only  = 0,
+			.limit_in_pages  = 1,
+			.seg_not_present = 0,
+			.useable         = 0
+		};
+
+		if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+			if (errno != ENOSYS)
+				err(1, "modify_ldt");
+			printf("[SKIP]\tmodify_ldt unavailable\n");
+			break;
+		}
+
+		/* Arm the thread. */
+		ftx = 1;
+		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+		/* Selector 0x7: LDT entry 0, RPL 3. */
+		asm volatile ("mov %0, %%ss" : : "r" (0x7));
+
+		/* Go! */
+		ftx = 2;
+
+		/* The helper subtracts 2 from ftx once the entry is cleared. */
+		while (ftx != 0)
+			;
+
+		/*
+		 * On success, modify_ldt will segfault us synchronously,
+		 * and we'll escape via siglongjmp.
+		 */
+
+		failures++;
+		asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
+	}
+
+	ftx = 100;  /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	if (failures) {
+		printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
+		nerrs++;
+	} else {
+		printf("[OK]\tAll %d iterations succeeded\n", iters);
+	}
+}
+
+/*
+ * Body of the re-exec'ed child started by do_exec_test().  Returns the
+ * process exit status: 1 if any check recorded an error, 0 otherwise.
+ */
+static int finish_exec_test(void)
+{
+	/*
+	 * exec() starts from a clean state, so LDT entry 0 must be gone
+	 * here.  Older kernel versions wrongly let the LDT survive exec().
+	 */
+	check_invalid_segment(0, 1);
+
+	if (nerrs)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Check that an LDT entry does not survive exec().
+ *
+ * Installs a code segment in LDT entry 0, forks, and has the child
+ * re-exec this binary with argv[0] set to "ldt_gdt_test_exec"; main()
+ * recognizes that name and runs finish_exec_test().  The child's exit
+ * status decides pass or fail, and failures are recorded in nerrs.
+ * A failing fork() is fatal rather than being reported as a dead child.
+ */
+static void do_exec_test(void)
+{
+	printf("[RUN]\tTest exec\n");
+
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = 0,
+		.limit           = 42,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+	pid_t child = fork();
+	if (child < 0)
+		err(1, "fork");
+
+	if (child == 0) {
+		execl("/proc/self/exe", "ldt_gdt_test_exec", NULL);
+		/* execl() only returns on failure. */
+		printf("[FAIL]\tCould not exec self\n");
+		exit(1);	/* exec failed */
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+}
+
+/*
+ * Map one anonymous, private 4096-byte page with MAP_32BIT, fill it so
+ * that word n holds the value n, and publish it as counter_page.
+ * Exits through err() if the mapping fails.
+ */
+static void setup_counter_page(void)
+{
+	unsigned int *words;
+
+	words = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+		     MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	if (words == MAP_FAILED)
+		err(1, "mmap");
+
+	for (int idx = 0; idx != 1024; idx++)
+		words[idx] = idx;
+
+	counter_page = words;
+}
+
+/*
+ * Issue syscall number 243 through "int $0x80" with low_user_desc as its
+ * only argument (in ebx) and return the raw syscall result from eax.
+ * Callers in this file treat 243 as set_thread_area and 0 as success.
+ */
+static int invoke_set_thread_area(void)
+{
+ int ret;
+ asm volatile ("int $0x80"
+ : "=a" (ret), "+m" (low_user_desc) :
+ "a" (243), "b" (low_user_desc)
+ : INT80_CLOBBERS);
+ return ret;
+}
+
+/*
+ * Allocate two struct user_desc slots with MAP_32BIT.
+ *
+ * Slot 0 (low_user_desc) describes a page-granular read/write data
+ * segment based at &counter_page[1] and asks for a free entry
+ * (entry_number = -1).  If set_thread_area accepts it, the entry chosen
+ * by the kernel is stored in gdt_entry_num.
+ *
+ * Slot 1 (low_user_desc_clear) names the same entry with read_exec_only
+ * and seg_not_present set; every other field stays zero from the fresh
+ * mapping.
+ */
+static void setup_low_user_desc(void)
+{
+	struct user_desc *slots;
+
+	slots = mmap(NULL, 2 * sizeof(struct user_desc),
+		     PROT_READ | PROT_WRITE,
+		     MAP_ANONYMOUS | MAP_PRIVATE | MAP_32BIT, -1, 0);
+	low_user_desc = slots;
+	if (low_user_desc == MAP_FAILED)
+		err(1, "mmap");
+
+	low_user_desc->entry_number    = -1;
+	low_user_desc->base_addr       = (unsigned long)&counter_page[1];
+	low_user_desc->limit           = 0xfffff;
+	low_user_desc->limit_in_pages  = 1;
+	low_user_desc->seg_32bit       = 1;
+	low_user_desc->contents        = 0; /* Data, grow-up*/
+	low_user_desc->read_exec_only  = 0;
+	low_user_desc->seg_not_present = 0;
+	low_user_desc->useable         = 0;
+
+	if (invoke_set_thread_area() != 0) {
+		printf("[NOTE]\tset_thread_area is unavailable\n");
+	} else {
+		gdt_entry_num = low_user_desc->entry_number;
+		printf("[NOTE]\tset_thread_area is available; will use GDT index %d\n", gdt_entry_num);
+	}
+
+	low_user_desc_clear = &low_user_desc[1];
+	low_user_desc_clear->seg_not_present = 1;
+	low_user_desc_clear->read_exec_only = 1;
+	low_user_desc_clear->entry_number = gdt_entry_num;
+}
+
+/*
+ * For each of DS, ES, FS and GS: install the GDT entry described by
+ * low_user_desc, load its selector into the segment register, then issue
+ * syscall 243 (set_thread_area) on low_user_desc_clear while the register
+ * still holds that selector.  The selector read back afterwards must be 0,
+ * otherwise a failure is recorded in nerrs.  On x86_64 the FS and GS
+ * passes additionally read the segment base through arch_prctl() and
+ * expect it to be 0.  Does nothing when gdt_entry_num is 0.
+ */
+static void test_gdt_invalidation(void)
+{
+ if (!gdt_entry_num)
+ return; /* 64-bit only system -- we can't use set_thread_area */
+
+ unsigned short prev_sel;
+ unsigned short sel;
+ unsigned int eax;
+ const char *result;
+#ifdef __x86_64__
+ unsigned long saved_base;
+ unsigned long new_base;
+#endif
+
+ /* Test DS */
+ invoke_set_thread_area();
+ eax = 243;
+ /* Selector for GDT entry gdt_entry_num with RPL 3. */
+ sel = (gdt_entry_num << 3) | 3;
+ /* 32-bit builds save and restore ebx around the syscall. */
+ asm volatile ("movw %%ds, %[prev_sel]\n\t"
+ "movw %[sel], %%ds\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate ds */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%ds, %[sel]\n\t"
+ "movw %[prev_sel], %%ds"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate DS with set_thread_area: new DS = 0x%hx\n",
+ result, sel);
+
+ /* Test ES */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+ asm volatile ("movw %%es, %[prev_sel]\n\t"
+ "movw %[sel], %%es\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate es */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%es, %[sel]\n\t"
+ "movw %[prev_sel], %%es"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate ES with set_thread_area: new ES = 0x%hx\n",
+ result, sel);
+
+ /* Test FS */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_FS, &saved_base);
+#endif
+ /* Unlike DS/ES, FS is restored in a separate asm after the base is read. */
+ asm volatile ("movw %%fs, %[prev_sel]\n\t"
+ "movw %[sel], %%fs\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate fs */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%fs, %[sel]\n\t"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_FS, &new_base);
+#endif
+
+ /* Restore FS/BASE for glibc */
+ asm volatile ("movw %[prev_sel], %%fs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+ if (saved_base)
+ syscall(SYS_arch_prctl, ARCH_SET_FS, saved_base);
+#endif
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate FS with set_thread_area: new FS = 0x%hx\n",
+ result, sel);
+
+#ifdef __x86_64__
+ /* NOTE(review): the else branch also runs when sel != 0, whatever new_base is. */
+ if (sel == 0 && new_base != 0) {
+ nerrs++;
+ printf("[FAIL]\tNew FSBASE was 0x%lx\n", new_base);
+ } else {
+ printf("[OK]\tNew FSBASE was zero\n");
+ }
+#endif
+
+ /* Test GS */
+ invoke_set_thread_area();
+ eax = 243;
+ sel = (gdt_entry_num << 3) | 3;
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_GS, &saved_base);
+#endif
+ asm volatile ("movw %%gs, %[prev_sel]\n\t"
+ "movw %[sel], %%gs\n\t"
+#ifdef __i386__
+ "pushl %%ebx\n\t"
+#endif
+ "movl %[arg1], %%ebx\n\t"
+ "int $0x80\n\t" /* Should invalidate gs */
+#ifdef __i386__
+ "popl %%ebx\n\t"
+#endif
+ "movw %%gs, %[sel]\n\t"
+ : [prev_sel] "=&r" (prev_sel), [sel] "+r" (sel),
+ "+a" (eax)
+ : "m" (low_user_desc_clear),
+ [arg1] "r" ((unsigned int)(unsigned long)low_user_desc_clear)
+ : INT80_CLOBBERS);
+
+#ifdef __x86_64__
+ syscall(SYS_arch_prctl, ARCH_GET_GS, &new_base);
+#endif
+
+ /* Restore GS/BASE for glibc */
+ asm volatile ("movw %[prev_sel], %%gs" : : [prev_sel] "rm" (prev_sel));
+#ifdef __x86_64__
+ if (saved_base)
+ syscall(SYS_arch_prctl, ARCH_SET_GS, saved_base);
+#endif
+
+ if (sel != 0) {
+ result = "FAIL";
+ nerrs++;
+ } else {
+ result = "OK";
+ }
+ printf("[%s]\tInvalidate GS with set_thread_area: new GS = 0x%hx\n",
+ result, sel);
+
+#ifdef __x86_64__
+ /* NOTE(review): the else branch also runs when sel != 0, whatever new_base is. */
+ if (sel == 0 && new_base != 0) {
+ nerrs++;
+ printf("[FAIL]\tNew GSBASE was 0x%lx\n", new_base);
+ } else {
+ printf("[OK]\tNew GSBASE was zero\n");
+ }
+#endif
+}
+
+/*
+ * Entry point.  When invoked with argv[0] == "ldt_gdt_test_exec" and no
+ * further arguments (as do_exec_test() does), only the post-exec check is
+ * run.  Otherwise the shared pages are set up and every test group runs in
+ * turn.  The exit status is 1 if any test recorded an error, 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+ if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
+ return finish_exec_test();
+
+ /* setup_low_user_desc() takes the address of counter_page[1], so the page comes first. */
+ setup_counter_page();
+ setup_low_user_desc();
+
+ do_simple_tests();
+
+ do_multicpu_tests();
+
+ do_exec_test();
+
+ test_gdt_invalidation();
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/mov_ss_trap.c b/tools/testing/selftests/x86/mov_ss_trap.c
new file mode 100644
index 000000000..6da0ac3f0
--- /dev/null
+++ b/tools/testing/selftests/x86/mov_ss_trap.c
@@ -0,0 +1,286 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mov_ss_trap.c: Exercise the bizarre side effects of a watchpoint on MOV SS
+ *
+ * This does MOV SS from a watchpointed address followed by various
+ * types of kernel entries. A MOV SS that hits a watchpoint will queue
+ * up a #DB trap but will not actually deliver that trap. The trap
+ * will be delivered after the next instruction instead. The CPU's logic
+ * seems to be:
+ *
+ * - Any fault: drop the pending #DB trap.
+ * - INT $N, INT3, INTO, SYSCALL, SYSENTER: enter the kernel and then
+ * deliver #DB.
+ * - ICEBP: enter the kernel but do not deliver the watchpoint trap
+ * - breakpoint: only one #DB is delivered (phew!)
+ *
+ * There are plenty of ways for a kernel to handle this incorrectly. This
+ * test tries to exercise all the cases.
+ *
+ * This should mostly cover CVE-2018-1087 and CVE-2018-8897.
+ */
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/user.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <setjmp.h>
+#include <sys/prctl.h>
+
+#define X86_EFLAGS_RF (1UL << 16)
+
+#if __x86_64__
+# define REG_IP REG_RIP
+#else
+# define REG_IP REG_EIP
+#endif
+
+unsigned short ss;
+extern unsigned char breakpoint_insn[];
+sigjmp_buf jmpbuf;
+static unsigned char altstack_data[SIGSTKSZ];
+
+/*
+ * Program the calling process's debug registers from a short-lived child.
+ *
+ * The child attaches to its parent with ptrace, writes DR0 (a 2-byte
+ * read/write watchpoint on the global 'ss'), DR1 (an instruction
+ * breakpoint on breakpoint_insn) and DR7 (enabling both), detaches and
+ * exits.  The parent simply waits for the child to finish.  Any failing
+ * call in the child is fatal to the child via err().
+ */
+static void enable_watchpoint(void)
+{
+	pid_t parent = getpid();
+	int status;
+
+	pid_t child = fork();
+	if (child < 0)
+		err(1, "fork");
+
+	if (child) {
+		if (waitpid(child, &status, 0) != child)
+			err(1, "waitpid for child");
+	} else {
+		unsigned long dr0, dr1, dr7;
+
+		dr0 = (unsigned long)&ss;
+		dr1 = (unsigned long)breakpoint_insn;
+		dr7 = ((1UL << 1) |	/* G0 */
+		       (3UL << 16) |	/* RW0 = read or write */
+		       (1UL << 18) |	/* LEN0 = 2 bytes */
+		       (1UL << 3));	/* G1, RW1 = insn */
+
+		if (ptrace(PTRACE_ATTACH, parent, NULL, NULL) != 0)
+			err(1, "PTRACE_ATTACH");
+
+		/* Here the child is the tracer and waits for its parent to stop. */
+		if (waitpid(parent, &status, 0) != parent)
+			err(1, "waitpid for parent");
+
+		if (ptrace(PTRACE_POKEUSER, parent,
+			   (void *)offsetof(struct user, u_debugreg[0]), dr0) != 0)
+			err(1, "PTRACE_POKEUSER DR0");
+
+		if (ptrace(PTRACE_POKEUSER, parent,
+			   (void *)offsetof(struct user, u_debugreg[1]), dr1) != 0)
+			err(1, "PTRACE_POKEUSER DR1");
+
+		if (ptrace(PTRACE_POKEUSER, parent,
+			   (void *)offsetof(struct user, u_debugreg[7]), dr7) != 0)
+			err(1, "PTRACE_POKEUSER DR7");
+
+		printf("\tDR0 = %lx, DR1 = %lx, DR7 = %lx\n", dr0, dr1, dr7);
+
+		if (ptrace(PTRACE_DETACH, parent, NULL, NULL) != 0)
+			err(1, "PTRACE_DETACH");
+
+		exit(0);
+	}
+}
+
+/*
+ * Register @handler as the three-argument (SA_SIGINFO) handler for @sig,
+ * with the caller's extra @flags OR-ed into sa_flags and an empty signal
+ * mask.  Exits through err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags | SA_SIGINFO;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/* Printable names, indexed by signal number, for the handlers' messages. */
+static char const * const signames[] = {
+	[SIGSEGV] = "SIGSEGV",
+	[SIGBUS] = "SIGBUS",
+	[SIGTRAP] = "SIGTRAP",
+	[SIGILL] = "SIGILL",
+};
+
+/*
+ * SIGTRAP handler: print the interrupted instruction pointer and whether
+ * EFLAGS.RF was set in the saved context, then return normally.
+ */
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+	unsigned long ip = (unsigned long)uc->uc_mcontext.gregs[REG_IP];
+	int rf_set = !!(uc->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_RF);
+
+	printf("\tGot SIGTRAP with RIP=%lx, EFLAGS.RF=%d\n", ip, rf_set);
+}
+
+/*
+ * Handler that reports which signal arrived and the interrupted
+ * instruction pointer, then returns to the interrupted code.
+ */
+static void handle_and_return(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+	unsigned long ip = (unsigned long)uc->uc_mcontext.gregs[REG_IP];
+
+	printf("\tGot %s with RIP=%lx\n", signames[sig], ip);
+}
+
+/*
+ * Handler that reports which signal arrived and the interrupted
+ * instruction pointer, then abandons the interrupted code by jumping
+ * back to the most recent sigsetjmp(jmpbuf, ...), which returns 1.
+ */
+static void handle_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+	unsigned long ip = (unsigned long)uc->uc_mcontext.gregs[REG_IP];
+
+	printf("\tGot %s with RIP=%lx\n", signames[sig], ip);
+	siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Entry point.  Saves the current SS selector in the global 'ss', installs
+ * a watchpoint on it (and a breakpoint on breakpoint_insn) through
+ * enable_watchpoint(), then runs a series of "MOV SS from the watched
+ * address; <kernel entry>" instruction pairs.  Cases that are expected to
+ * raise a signal are wrapped in sigsetjmp() and use one-shot
+ * (SA_RESETHAND) handlers.  Always returns 0: getting to the end without
+ * being killed is what this test reports.
+ */
+int main()
+{
+ unsigned long nr;
+
+ asm volatile ("mov %%ss, %[ss]" : [ss] "=m" (ss));
+ printf("\tSS = 0x%hx, &SS = 0x%p\n", ss, &ss);
+
+ /* Failure here is tolerated; only success is reported. */
+ if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, 0, 0, 0) == 0)
+ printf("\tPR_SET_PTRACER_ANY succeeded\n");
+
+ printf("\tSet up a watchpoint\n");
+ sethandler(SIGTRAP, sigtrap, 0);
+ enable_watchpoint();
+
+ printf("[RUN]\tRead from watched memory (should get SIGTRAP)\n");
+ asm volatile ("mov %[ss], %[tmp]" : [tmp] "=r" (nr) : [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; INT3\n");
+ asm volatile ("mov %[ss], %%ss; int3" :: [ss] "m" (ss));
+
+ /* Same interrupt, encoded as the two-byte "int $3" (0xcd 0x03). */
+ printf("[RUN]\tMOV SS; INT 3\n");
+ asm volatile ("mov %[ss], %%ss; .byte 0xcd, 0x3" :: [ss] "m" (ss));
+
+ /* int3 preceded by two 0x2e (CS override) prefix bytes. */
+ printf("[RUN]\tMOV SS; CS CS INT3\n");
+ asm volatile ("mov %[ss], %%ss; .byte 0x2e, 0x2e; int3" :: [ss] "m" (ss));
+
+ /* int3 preceded by fourteen 0x2e prefix bytes. */
+ printf("[RUN]\tMOV SS; CSx14 INT3\n");
+ asm volatile ("mov %[ss], %%ss; .fill 14,1,0x2e; int3" :: [ss] "m" (ss));
+
+ printf("[RUN]\tMOV SS; INT 4\n");
+ sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; int $4" :: [ss] "m" (ss));
+
+#ifdef __i386__
+ printf("[RUN]\tMOV SS; INTO\n");
+ sethandler(SIGSEGV, handle_and_return, SA_RESETHAND);
+ nr = -1;
+ asm volatile ("add $1, %[tmp]; mov %[ss], %%ss; into"
+ : [tmp] "+r" (nr) : [ss] "m" (ss));
+#endif
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; ICEBP\n");
+
+ /* Some emulators (e.g. QEMU TCG) don't emulate ICEBP. */
+ sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+
+ asm volatile ("mov %[ss], %%ss; .byte 0xf1" :: [ss] "m" (ss));
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; CLI\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; cli" :: [ss] "m" (ss));
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; #PF\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; mov (-1), %[tmp]"
+ : [tmp] "=r" (nr) : [ss] "m" (ss));
+ }
+
+ /*
+ * INT $1: if #DB has DPL=3 and there isn't special handling,
+ * then the kernel will die.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; INT 1\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ asm volatile ("mov %[ss], %%ss; int $1" :: [ss] "m" (ss));
+ }
+
+#ifdef __x86_64__
+ /*
+ * In principle, we should test 32-bit SYSCALL as well, but
+ * the calling convention is so unpredictable that it's
+ * not obviously worth the effort.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; SYSCALL\n");
+ sethandler(SIGILL, handle_and_longjmp, SA_RESETHAND);
+ nr = SYS_getpid;
+ /*
+ * Toggle the high bit of RSP to make it noncanonical to
+ * strengthen this test on non-SMAP systems.
+ */
+ asm volatile ("btc $63, %%rsp\n\t"
+ "mov %[ss], %%ss; syscall\n\t"
+ "btc $63, %%rsp"
+ : "+a" (nr) : [ss] "m" (ss)
+ : "rcx"
+#ifdef __x86_64__
+ , "r11"
+#endif
+ );
+ }
+#endif
+
+ /* breakpoint_insn is the label DR1 was pointed at in enable_watchpoint(). */
+ printf("[RUN]\tMOV SS; breakpointed NOP\n");
+ asm volatile ("mov %[ss], %%ss; breakpoint_insn: nop" :: [ss] "m" (ss));
+
+ /*
+ * Invoking SYSENTER directly breaks all the rules. Just handle
+ * the SIGSEGV.
+ */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; SYSENTER\n");
+ /* The SIGSEGV handler runs on this alternate stack (SA_ONSTACK). */
+ stack_t stack = {
+ .ss_sp = altstack_data,
+ .ss_size = SIGSTKSZ,
+ };
+ if (sigaltstack(&stack, NULL) != 0)
+ err(1, "sigaltstack");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND | SA_ONSTACK);
+ nr = SYS_getpid;
+ /* Clear EBP first to make sure we segfault cleanly. */
+ asm volatile ("xorl %%ebp, %%ebp; mov %[ss], %%ss; SYSENTER" : "+a" (nr)
+ : [ss] "m" (ss) : "flags", "rcx"
+#ifdef __x86_64__
+ , "r11"
+#endif
+ );
+
+ /* We're unreachable here. SYSENTER forgets RIP. */
+ }
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ printf("[RUN]\tMOV SS; INT $0x80\n");
+ sethandler(SIGSEGV, handle_and_longjmp, SA_RESETHAND);
+ nr = 20; /* compat getpid */
+ asm volatile ("mov %[ss], %%ss; int $0x80"
+ : "+a" (nr) : [ss] "m" (ss)
+ : "flags"
+#ifdef __x86_64__
+ , "r8", "r9", "r10", "r11"
+#endif
+ );
+ }
+
+ printf("[OK]\tI aten't dead\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c
new file mode 100644
index 000000000..12aaa0631
--- /dev/null
+++ b/tools/testing/selftests/x86/ptrace_syscall.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/syscall.h>
+#include <sys/user.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <err.h>
+#include <string.h>
+#include <asm/ptrace-abi.h>
+#include <sys/auxv.h>
+
+/* Bitness-agnostic defines for user_regs_struct fields. */
+#ifdef __x86_64__
+# define user_syscall_nr orig_rax
+# define user_arg0 rdi
+# define user_arg1 rsi
+# define user_arg2 rdx
+# define user_arg3 r10
+# define user_arg4 r8
+# define user_arg5 r9
+# define user_ip rip
+# define user_ax rax
+#else
+# define user_syscall_nr orig_eax
+# define user_arg0 ebx
+# define user_arg1 ecx
+# define user_arg2 edx
+# define user_arg3 esi
+# define user_arg4 edi
+# define user_arg5 ebp
+# define user_ip eip
+# define user_ax eax
+#endif
+
+static int nerrs = 0;
+
+/*
+ * Syscall number and six argument values for one 32-bit syscall.  The
+ * helpers load the registers from these fields and write the register
+ * values seen after the syscall back into them.  sys32_helper (assembly)
+ * addresses the fields as consecutive 4-byte slots, nr first.
+ */
+struct syscall_args32 {
+ uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5;
+};
+
+#ifdef __i386__
+extern void sys32_helper(struct syscall_args32 *, void *);
+extern void int80_and_ret(void);
+#endif
+
+/*
+ * Helper to invoke int80 with controlled regs and capture the final regs.
+ *
+ * On x86_64 the inline asm binds nr and arg0..arg4 to ax, bx, cx, dx, si
+ * and di as read-write operands and passes arg5 through a variable pinned
+ * to bp; r8-r11 are declared clobbered.  On 32-bit builds the work is
+ * delegated to sys32_helper() with int80_and_ret as the entry stub.
+ */
+static void do_full_int80(struct syscall_args32 *args)
+{
+#ifdef __x86_64__
+ register unsigned long bp asm("bp") = args->arg5;
+ asm volatile ("int $0x80"
+ : "+a" (args->nr),
+ "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2),
+ "+S" (args->arg3), "+D" (args->arg4), "+r" (bp)
+ : : "r8", "r9", "r10", "r11");
+ /* Copy the post-syscall bp value back into the struct. */
+ args->arg5 = bp;
+#else
+ sys32_helper(args, int80_and_ret);
+#endif
+}
+
+#ifdef __i386__
+static void (*vsyscall32)(void);
+
+/*
+ * Nasty helper to invoke AT_SYSINFO (i.e. __kernel_vsyscall) with
+ * controlled regs and capture the final regs. This is so nasty that it
+ * crashes my copy of gdb :)
+ *
+ * The entry point is taken from the vsyscall32 pointer, which main() fills
+ * in from getauxval(AT_SYSINFO) before this function is used.
+ */
+static void do_full_vsyscall32(struct syscall_args32 *args)
+{
+ sys32_helper(args, vsyscall32);
+}
+#endif
+
+/*
+ * Wait for the next stop or exit event of @chld and return its siginfo.
+ * Anything other than a CLD_TRAPPED event for exactly that pid terminates
+ * the test through err()/errx().
+ */
+static siginfo_t wait_trap(pid_t chld)
+{
+	siginfo_t si;
+
+	if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0)
+		err(1, "waitid");
+	/* errx() appends the newline itself, so the messages carry none. */
+	if (si.si_pid != chld)
+		errx(1, "got unexpected pid in event");
+	if (si.si_code != CLD_TRAPPED)
+		errx(1, "got unexpected event type %d", si.si_code);
+	return si;
+}
+
+/*
+ * Register @handler as the three-argument (SA_SIGINFO) handler for @sig,
+ * with the caller's extra @flags OR-ed into sa_flags and an empty signal
+ * mask.  Exits through err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags | SA_SIGINFO;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Set the disposition of @sig to SIG_IGN, using exactly the sa_flags the
+ * caller passes in @flags and an empty signal mask.  Exits through err()
+ * if sigaction() fails.
+ */
+static void setsigign(int sig, int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = flags;
+	act.sa_sigaction = (void *)SIG_IGN;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Put @sig back to its default disposition (SIG_DFL) with no flags and an
+ * empty signal mask.  Exits through err() if sigaction() fails.
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+#ifdef __x86_64__
+# define REG_BP REG_RBP
+#else
+# define REG_BP REG_EBP
+#endif
+
+/* Signal handler that does nothing; test_sys32_regs() installs it for SIGUSR1. */
+static void empty_handler(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+/*
+ * Check that a 32-bit syscall entered through @do_syscall leaves the
+ * argument registers untouched.
+ *
+ * First issues syscall 224 with arguments 10..15 and expects the result
+ * to equal getpid() and every argument to come back unchanged.  Then,
+ * with an empty SIGUSR1 handler installed, issues syscall 37
+ * (kill(getpid(), SIGUSR1)) and expects a result of 0 with all argument
+ * registers preserved.  Each mismatch is recorded in nerrs.
+ */
+static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *))
+{
+	struct syscall_args32 args = {
+		.nr = 224,	/* gettid */
+		.arg0 = 10, .arg1 = 11, .arg2 = 12,
+		.arg3 = 13, .arg4 = 14, .arg5 = 15,
+	};
+	int preserved;
+
+	do_syscall(&args);
+
+	preserved = args.nr == getpid() &&
+		    args.arg0 == 10 && args.arg1 == 11 && args.arg2 == 12 &&
+		    args.arg3 == 13 && args.arg4 == 14 && args.arg5 == 15;
+	if (preserved) {
+		printf("[OK]\tgetpid() preserves regs\n");
+	} else {
+		printf("[FAIL]\tgetpid() failed to preserve regs\n");
+		nerrs++;
+	}
+
+	sethandler(SIGUSR1, empty_handler, 0);
+
+	/* Only nr, arg0 and arg1 are reloaded; arg2..arg5 keep 12..15. */
+	args.nr = 37;	/* kill */
+	args.arg0 = getpid();
+	args.arg1 = SIGUSR1;
+	do_syscall(&args);
+
+	preserved = args.nr == 0 &&
+		    args.arg0 == getpid() && args.arg1 == SIGUSR1 &&
+		    args.arg2 == 12 && args.arg3 == 13 &&
+		    args.arg4 == 14 && args.arg5 == 15;
+	if (preserved) {
+		printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n");
+	} else {
+		printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n");
+		nerrs++;
+	}
+
+	clearhandler(SIGUSR1);
+}
+
+/*
+ * Check that a tracer can restart a syscall by rewinding the tracee.
+ *
+ * The child asks to be traced, stops itself with SIGSTOP and then makes a
+ * single gettid syscall with arguments 10..15.  The parent resumes it with
+ * PTRACE_SYSEMU and, at each resulting trap, verifies the syscall number
+ * and arguments.  It then rewinds the tracee (ax = syscall nr, ip -= 2),
+ * once unchanged and once with a different nr and arguments, and verifies
+ * that the next trap shows the expected values.  Finally the child is
+ * continued and must exit with status 0.  Mismatches are recorded in
+ * nerrs; failing ptrace/wait calls are fatal.
+ */
+static void test_ptrace_syscall_restart(void)
+{
+ printf("[RUN]\tptrace-induced syscall restart\n");
+ pid_t chld = fork();
+ if (chld < 0)
+ err(1, "fork");
+
+ if (chld == 0) {
+ if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+ err(1, "PTRACE_TRACEME");
+
+ pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+ printf("\tChild will make one syscall\n");
+ /* Stop ourselves so the parent can take control. */
+ syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+ syscall(SYS_gettid, 10, 11, 12, 13, 14, 15);
+ _exit(0);
+ }
+
+ int status;
+
+ /* Wait for SIGSTOP. */
+ if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+ err(1, "waitpid");
+
+ struct user_regs_struct regs;
+
+ printf("[RUN]\tSYSEMU\n");
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_gettid ||
+ regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
+ regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
+ regs.user_arg4 != 14 || regs.user_arg5 != 15) {
+ printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tInitial nr and args are correct\n");
+ }
+
+ printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ /*
+ * This does exactly what it appears to do if syscall is int80 or
+ * SYSCALL64. For SYSCALL32 or SYSENTER, though, this is highly
+ * magical. It needs to work so that ptrace and syscall restart
+ * work as expected.
+ */
+ regs.user_ax = regs.user_syscall_nr;
+ regs.user_ip -= 2;
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_gettid ||
+ regs.user_arg0 != 10 || regs.user_arg1 != 11 ||
+ regs.user_arg2 != 12 || regs.user_arg3 != 13 ||
+ regs.user_arg4 != 14 || regs.user_arg5 != 15) {
+ printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tRestarted nr and args are correct\n");
+ }
+
+ printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n",
+ (unsigned long)regs.user_ip);
+
+ /* Second rewind: this time substitute a different syscall and arguments. */
+ regs.user_ax = SYS_getpid;
+ regs.user_arg0 = 20;
+ regs.user_arg1 = 21;
+ regs.user_arg2 = 22;
+ regs.user_arg3 = 23;
+ regs.user_arg4 = 24;
+ regs.user_arg5 = 25;
+ regs.user_ip -= 2;
+
+ if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_SETREGS");
+
+ if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0)
+ err(1, "PTRACE_SYSEMU");
+ wait_trap(chld);
+
+ if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+ err(1, "PTRACE_GETREGS");
+
+ if (regs.user_syscall_nr != SYS_getpid ||
+ regs.user_arg0 != 20 || regs.user_arg1 != 21 || regs.user_arg2 != 22 ||
+ regs.user_arg3 != 23 || regs.user_arg4 != 24 || regs.user_arg5 != 25) {
+ printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5);
+ nerrs++;
+ } else {
+ printf("[OK]\tReplacement nr and args are correct\n");
+ }
+
+ /* Let the child run to completion and check how it exited. */
+ if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+ err(1, "PTRACE_CONT");
+ if (waitpid(chld, &status, 0) != chld)
+ err(1, "waitpid");
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed\n");
+ nerrs++;
+ } else {
+ printf("[OK]\tChild exited cleanly\n");
+ }
+}
+
+/*
+ * Check that a kernel-initiated syscall restart looks right to a tracer.
+ *
+ * The child asks to be traced, ignores SIGUSR1 with SA_RESTART, stops
+ * itself and then calls pause() with six zero arguments.  The parent
+ * steps it with PTRACE_SYSCALL and checks the syscall number and
+ * arguments at pause() entry, again after interrupting it with SIGUSR1,
+ * and once more after the signal stop, when pause() must have been
+ * restarted.  The child is killed at the end.  Mismatches are recorded in
+ * nerrs; failing ptrace/wait calls are fatal.
+ */
+static void test_restart_under_ptrace(void)
+{
+	printf("[RUN]\tkernel syscall restart under ptrace\n");
+	pid_t chld = fork();
+	if (chld < 0)
+		err(1, "fork");
+
+	if (chld == 0) {
+		if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0)
+			err(1, "PTRACE_TRACEME");
+
+		pid_t pid = getpid(), tid = syscall(SYS_gettid);
+
+		printf("\tChild will take a nap until signaled\n");
+		setsigign(SIGUSR1, SA_RESTART);
+		/* Stop ourselves so the parent can take control. */
+		syscall(SYS_tgkill, pid, tid, SIGSTOP);
+
+		syscall(SYS_pause, 0, 0, 0, 0, 0, 0);
+		_exit(0);
+	}
+
+	int status;
+
+	/* Wait for SIGSTOP. */
+	if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status))
+		err(1, "waitpid");
+
+	struct user_regs_struct regs;
+
+	printf("[RUN]\tSYSCALL\n");
+	if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+		err(1, "PTRACE_SYSCALL");
+	wait_trap(chld);
+
+	/* We should be stopped at pause(2) entry. */
+
+	if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+		err(1, "PTRACE_GETREGS");
+
+	if (regs.user_syscall_nr != SYS_pause ||
+	    regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+	    regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+	    regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+		printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n",
+		       (unsigned long)regs.user_syscall_nr,
+		       (unsigned long)regs.user_arg0,
+		       (unsigned long)regs.user_arg1,
+		       (unsigned long)regs.user_arg2,
+		       (unsigned long)regs.user_arg3,
+		       (unsigned long)regs.user_arg4,
+		       (unsigned long)regs.user_arg5);
+		nerrs++;
+	} else {
+		printf("[OK]\tInitial nr and args are correct\n");
+	}
+
+	/* Interrupt it. */
+	kill(chld, SIGUSR1);
+
+	/* Advance.  We should be stopped at exit. */
+	printf("[RUN]\tSYSCALL\n");
+	if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+		err(1, "PTRACE_SYSCALL");
+	wait_trap(chld);
+
+	if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+		err(1, "PTRACE_GETREGS");
+
+	if (regs.user_syscall_nr != SYS_pause ||
+	    regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+	    regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+	    regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+		printf("[FAIL]\tArgs after SIGUSR1 are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n",
+		       (unsigned long)regs.user_syscall_nr,
+		       (unsigned long)regs.user_arg0,
+		       (unsigned long)regs.user_arg1,
+		       (unsigned long)regs.user_arg2,
+		       (unsigned long)regs.user_arg3,
+		       (unsigned long)regs.user_arg4,
+		       (unsigned long)regs.user_arg5);
+		nerrs++;
+	} else {
+		printf("[OK]\tArgs after SIGUSR1 are correct (ax = %ld)\n",
+		       (long)regs.user_ax);
+	}
+
+	/* Poke the regs back in.  This must not break anything. */
+	if (ptrace(PTRACE_SETREGS, chld, 0, &regs) != 0)
+		err(1, "PTRACE_SETREGS");
+
+	/* Catch the (ignored) SIGUSR1. */
+	if (ptrace(PTRACE_CONT, chld, 0, 0) != 0)
+		err(1, "PTRACE_CONT");
+	if (waitpid(chld, &status, 0) != chld)
+		err(1, "waitpid");
+	if (!WIFSTOPPED(status)) {
+		/* This branch is the failure: the expected stop did not happen. */
+		printf("[FAIL]\tChild was not stopped for SIGUSR1 (status = 0x%x)\n", status);
+		nerrs++;
+	} else {
+		printf("[OK]\tChild got SIGUSR1\n");
+	}
+
+	/* The next event should be pause(2) again. */
+	printf("[RUN]\tStep again\n");
+	if (ptrace(PTRACE_SYSCALL, chld, 0, 0) != 0)
+		err(1, "PTRACE_SYSCALL");
+	wait_trap(chld);
+
+	/* We should be stopped at pause(2) entry. */
+
+	if (ptrace(PTRACE_GETREGS, chld, 0, &regs) != 0)
+		err(1, "PTRACE_GETREGS");
+
+	if (regs.user_syscall_nr != SYS_pause ||
+	    regs.user_arg0 != 0 || regs.user_arg1 != 0 ||
+	    regs.user_arg2 != 0 || regs.user_arg3 != 0 ||
+	    regs.user_arg4 != 0 || regs.user_arg5 != 0) {
+		printf("[FAIL]\tpause did not restart (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n",
+		       (unsigned long)regs.user_syscall_nr,
+		       (unsigned long)regs.user_arg0,
+		       (unsigned long)regs.user_arg1,
+		       (unsigned long)regs.user_arg2,
+		       (unsigned long)regs.user_arg3,
+		       (unsigned long)regs.user_arg4,
+		       (unsigned long)regs.user_arg5);
+		nerrs++;
+	} else {
+		printf("[OK]\tpause(2) restarted correctly\n");
+	}
+
+	/* Kill it. */
+	kill(chld, SIGKILL);
+	if (waitpid(chld, &status, 0) != chld)
+		err(1, "waitpid");
+}
+
+/*
+ * Entry point: run the int80 register-preservation check, the same check
+ * through AT_SYSINFO on 32-bit builds where getauxval() is usable, and
+ * the two ptrace restart tests.  The exit status is 1 if any test
+ * recorded an error in nerrs, 0 otherwise.
+ */
+int main()
+{
+	printf("[RUN]\tCheck int80 return regs\n");
+	test_sys32_regs(do_full_int80);
+
+#if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16)
+	vsyscall32 = (void *)getauxval(AT_SYSINFO);
+	if (vsyscall32) {
+		printf("[RUN]\tCheck AT_SYSINFO return regs\n");
+		test_sys32_regs(do_full_vsyscall32);
+	} else {
+		printf("[SKIP]\tAT_SYSINFO is not available\n");
+	}
+#endif
+
+	test_ptrace_syscall_restart();
+
+	test_restart_under_ptrace();
+
+	/* Propagate recorded failures to the exit status. */
+	return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S b/tools/testing/selftests/x86/raw_syscall_helper_32.S
new file mode 100644
index 000000000..94410fa2b
--- /dev/null
+++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * sys32_helper(args, fn): loads eax, ebx, ecx, edx, esi, edi and ebp from
+ * 32-bit slots 0-6 of the structure at args, calls fn, and then stores the
+ * same seven registers back into slots 0-6 so the caller can inspect what
+ * fn left in them.
+ */
+.global sys32_helper
+sys32_helper:
+ /* Args: syscall_args_32*, function pointer */
+ /* Save the registers overwritten below; they are popped again before ret. */
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ /*
+ * Four registers were pushed, so the return address is at 4*4(%esp),
+ * the first argument at 5*4(%esp) and the second at 6*4(%esp).
+ */
+ movl 5*4(%esp), %eax /* pointer to args struct */
+
+ movl 1*4(%eax), %ebx
+ movl 2*4(%eax), %ecx
+ movl 3*4(%eax), %edx
+ movl 4*4(%eax), %esi
+ movl 5*4(%eax), %edi
+ movl 6*4(%eax), %ebp
+ /* eax is loaded last because it holds the struct pointer until here. */
+ movl 0*4(%eax), %eax
+
+ call *(6*4)(%esp) /* Do the syscall */
+
+ /* Now we need to recover without losing any reg values */
+ /*
+ * Park the returned eax on the stack so that eax can hold the struct
+ * pointer; the extra push moves the first argument to 6*4(%esp).
+ */
+ pushl %eax
+ movl 6*4(%esp), %eax
+ /* Pop the parked eax value straight into slot 0. */
+ popl 0*4(%eax)
+ movl %ebx, 1*4(%eax)
+ movl %ecx, 2*4(%eax)
+ movl %edx, 3*4(%eax)
+ movl %esi, 4*4(%eax)
+ movl %edi, 5*4(%eax)
+ movl %ebp, 6*4(%eax)
+
+ /* Restore the saved registers in reverse push order. */
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
+ ret
+
+ .type sys32_helper, @function
+ .size sys32_helper, .-sys32_helper
+
+/*
+ * int80_and_ret: executes "int $0x80" with whatever register values the
+ * caller set up and returns immediately, without touching any register
+ * itself.
+ */
+.global int80_and_ret
+int80_and_ret:
+ int $0x80
+ ret
+
+ .type int80_and_ret, @function
+ .size int80_and_ret, .-int80_and_ret
diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c
new file mode 100644
index 000000000..57c4f67f1
--- /dev/null
+++ b/tools/testing/selftests/x86/sigreturn.c
@@ -0,0 +1,876 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This is a series of tests that exercises the sigreturn(2) syscall and
+ * the IRET / SYSRET paths in the kernel.
+ *
+ * For now, this focuses on the effects of unusual CS and SS values,
+ * and it has a bunch of tests to make sure that ESP/RSP is restored
+ * properly.
+ *
+ * The basic idea behind these tests is to raise(SIGUSR1) to create a
+ * sigcontext frame, plug in the values to be tested, and then return,
+ * which implicitly invokes sigreturn(2) and programs the user context
+ * as desired.
+ *
+ * For tests for which we expect sigreturn and the subsequent return to
+ * user mode to succeed, we return to a short trampoline that generates
+ * SIGTRAP so that the meat of the tests can be ordinary C code in a
+ * SIGTRAP handler.
+ *
+ * The inner workings of each test are documented below.
+ *
+ * Do not run on outdated, unpatched kernels at risk of nasty crashes.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <asm/ldt.h>
+#include <err.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+/* Pull in AR_xyz defines. */
+typedef unsigned int u32;
+typedef unsigned short u16;
+#include "../../../../arch/x86/include/asm/desc_defs.h"
+
+/*
+ * Copied from asm/ucontext.h, as asm/ucontext.h conflicts badly with the glibc
+ * headers.
+ */
+#ifdef __x86_64__
+/*
+ * UC_SIGCONTEXT_SS will be set when delivering 64-bit or x32 signals on
+ * kernels that save SS in the sigcontext. All kernels that set
+ * UC_SIGCONTEXT_SS will correctly restore at least the low 32 bits of esp
+ * regardless of SS (i.e. they implement espfix).
+ *
+ * Kernels that set UC_SIGCONTEXT_SS will also set UC_STRICT_RESTORE_SS
+ * when delivering a signal that came from 64-bit code.
+ *
+ * Sigreturn restores SS as follows:
+ *
+ * if (saved SS is valid || UC_STRICT_RESTORE_SS is set ||
+ * saved CS is not 64-bit)
+ * new SS = saved SS (will fail IRET and signal if invalid)
+ * else
+ * new SS = a flat 32-bit data segment
+ */
+#define UC_SIGCONTEXT_SS 0x2
+#define UC_STRICT_RESTORE_SS 0x4
+#endif
+
+/*
+ * In principle, this test can run on Linux emulation layers (e.g.
+ * Illumos "LX branded zones"). Solaris-based kernels reserve LDT
+ * entries 0-5 for their own internal purposes, so start our LDT
+ * allocations above that reservation. (The tests don't pass on LX
+ * branded zones, but at least this lets them run.)
+ */
+#define LDT_OFFSET 6
+
+/* An aligned stack accessible through some of our segments. */
+static unsigned char stack16[65536] __attribute__((aligned(4096)));
+
+/*
+ * An aligned int3 instruction used as a trampoline. Some of the tests
+ * want to fish out their ss values, so this trampoline copies ss to ecx
+ * before the int3.
+ *
+ * The code starts on a 4096-byte boundary, and the trailing .align pads
+ * with 0xcc bytes up to the next 4096-byte boundary. It is declared to C
+ * as a 4096-byte char array so that its address and size can be used
+ * when building segment descriptors.
+ */
+asm (".pushsection .text\n\t"
+ ".type int3, @function\n\t"
+ ".align 4096\n\t"
+ "int3:\n\t"
+ "mov %ss,%ecx\n\t"
+ "int3\n\t"
+ ".size int3, . - int3\n\t"
+ ".align 4096, 0xcc\n\t"
+ ".popsection");
+extern char int3[4096];
+
+/*
+ * At startup, we prepare:
+ *
+ * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero
+ * descriptor or out of bounds).
+ * - code16_sel: A 16-bit LDT code segment pointing to int3.
+ * - data16_sel: A 16-bit LDT data segment pointing to stack16.
+ * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3.
+ * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16.
+ * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16.
+ * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to
+ * stack16.
+ *
+ * For no particularly good reason, xyz_sel is a selector value with the
+ * RPL and LDT bits filled in, whereas xyz_idx is just an index into the
+ * descriptor table. These variables will be zero if their respective
+ * segments could not be allocated.
+ */
+static unsigned short ldt_nonexistent_sel;
+static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel;
+
+static unsigned short gdt_data16_idx, gdt_npdata32_idx;
+
+/*
+ * Turns a descriptor-table index into a selector value: the index goes in
+ * bits 3 and up, and the two lowest bits are set (low three bits 0b011).
+ */
+static unsigned short GDT3(int idx)
+{
+	unsigned short sel = (idx << 3) | 3;
+
+	return sel;
+}
+
+/*
+ * Turns a descriptor-table index into a selector value: the index goes in
+ * bits 3 and up, and the three lowest bits are all set (0b111).
+ */
+static unsigned short LDT3(int idx)
+{
+	unsigned short sel = (idx << 3) | 7;
+
+	return sel;
+}
+
+/* Our sigaltstack scratch space. */
+static char altstack_data[SIGSTKSZ];
+
+/*
+ * Installs @handler as the three-argument (SA_SIGINFO) handler for @sig.
+ * @flags is OR-ed into sa_flags and the handler runs with an empty signal
+ * mask.  Terminates the program through err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO | flags;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Resets the disposition of @sig to SIG_DFL with an empty mask and no
+ * flags.  Terminates the program through err() if sigaction() fails.
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Asks the kernel to install @desc through modify_ldt.  On success *@var
+ * receives the selector built by LDT3() from the descriptor's entry
+ * number; on failure a [NOTE] line naming @name is printed and *@var is
+ * set to 0.
+ */
+static void add_ldt(const struct user_desc *desc, unsigned short *var,
+		    const char *name)
+{
+	if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) != 0) {
+		printf("[NOTE]\tFailed to create %s segment\n", name);
+		*var = 0;
+		return;
+	}
+
+	*var = LDT3(desc->entry_number);
+}
+
+/*
+ * Creates the LDT and GDT segments used by the tests and records them in
+ * the file-level selector/index variables:
+ *
+ *  - LDT entries LDT_OFFSET+0, +1, +3 and +4 are installed through
+ *    add_ldt(); entry LDT_OFFSET+2 is never installed, and its selector
+ *    is stored in ldt_nonexistent_sel.
+ *  - Two set_thread_area() calls try to allocate a 16-bit data segment
+ *    and a not-present 32-bit data segment in the GDT.  The matching
+ *    gdt_*_idx variable is only written when the kernel accepts the
+ *    request.
+ *
+ * Exits through errx() if stack16 or int3 does not lie entirely below
+ * 4 GiB, since the descriptors take their base from those addresses.
+ */
+static void setup_ldt(void)
+{
+	/* errx() appends its own newline, so the messages carry none. */
+	if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16))
+		errx(1, "stack16 is too high");
+	if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3))
+		errx(1, "int3 is too high");
+
+	ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2);
+
+	const struct user_desc code16_desc = {
+		.entry_number    = LDT_OFFSET + 0,
+		.base_addr       = (unsigned long)int3,
+		.limit           = 4095,
+		.seg_32bit       = 0,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	add_ldt(&code16_desc, &code16_sel, "code16");
+
+	const struct user_desc data16_desc = {
+		.entry_number    = LDT_OFFSET + 1,
+		.base_addr       = (unsigned long)stack16,
+		.limit           = 0xffff,
+		.seg_32bit       = 0,
+		.contents        = 0, /* Data, grow-up */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	add_ldt(&data16_desc, &data16_sel, "data16");
+
+	const struct user_desc npcode32_desc = {
+		.entry_number    = LDT_OFFSET + 3,
+		.base_addr       = (unsigned long)int3,
+		.limit           = 4095,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 1,
+		.useable         = 0
+	};
+	add_ldt(&npcode32_desc, &npcode32_sel, "npcode32");
+
+	const struct user_desc npdata32_desc = {
+		.entry_number    = LDT_OFFSET + 4,
+		.base_addr       = (unsigned long)stack16,
+		.limit           = 0xffff,
+		.seg_32bit       = 1,
+		.contents        = 0, /* Data, grow-up */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 1,
+		.useable         = 0
+	};
+	add_ldt(&npdata32_desc, &npdata32_sel, "npdata32");
+
+	/* entry_number is -1 here; the index is read back after the call. */
+	struct user_desc gdt_data16_desc = {
+		.entry_number    = -1,
+		.base_addr       = (unsigned long)stack16,
+		.limit           = 0xffff,
+		.seg_32bit       = 0,
+		.contents        = 0, /* Data, grow-up */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+
+	if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) {
+		/*
+		 * This probably indicates vulnerability to CVE-2014-8133.
+		 * Merely getting here isn't definitive, though, and we'll
+		 * diagnose the problem for real later on.
+		 */
+		printf("[WARN]\tset_thread_area allocated data16 at index %d\n",
+		       gdt_data16_desc.entry_number);
+		gdt_data16_idx = gdt_data16_desc.entry_number;
+	} else {
+		printf("[OK]\tset_thread_area refused 16-bit data\n");
+	}
+
+	struct user_desc gdt_npdata32_desc = {
+		.entry_number    = -1,
+		.base_addr       = (unsigned long)stack16,
+		.limit           = 0xffff,
+		.seg_32bit       = 1,
+		.contents        = 0, /* Data, grow-up */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 1,
+		.useable         = 0
+	};
+
+	if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) {
+		/*
+		 * As a hardening measure, newer kernels don't allow this.
+		 */
+		printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n",
+		       gdt_npdata32_desc.entry_number);
+		gdt_npdata32_idx = gdt_npdata32_desc.entry_number;
+	} else {
+		/* This request was for a not-present 32-bit segment. */
+		printf("[OK]\tset_thread_area refused not-present 32-bit data\n");
+	}
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs, requested_regs, resulting_regs;
+
+/* Instructions for the SIGUSR1 handler. */
+static volatile unsigned short sig_cs, sig_ss;
+static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno;
+#ifdef __x86_64__
+static volatile sig_atomic_t sig_corrupt_final_ss;
+#endif
+
+/*
+ * Abstractions for some 32-bit vs 64-bit differences.
+ *
+ * REG_IP, REG_SP and REG_CX name the gregs[] slots for the instruction
+ * pointer, stack pointer and CX register on either architecture, and
+ * ssptr()/csptr() return a pointer to the saved SS/CS selector inside a
+ * signal context. Note that the pointee type differs: unsigned short on
+ * 64-bit builds, greg_t on 32-bit builds.
+ */
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+# define REG_SP REG_RSP
+# define REG_CX REG_RCX
+
+/* Overlay used to view the single gregs[REG_CSGSFS] slot as four 16-bit selectors. */
+struct selectors {
+ unsigned short cs, gs, fs, ss;
+};
+
+/* Returns a pointer to the ss field of the selectors overlay in @ctx. */
+static unsigned short *ssptr(ucontext_t *ctx)
+{
+ struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+ return &sels->ss;
+}
+
+/* Returns a pointer to the cs field of the selectors overlay in @ctx. */
+static unsigned short *csptr(ucontext_t *ctx)
+{
+ struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS];
+ return &sels->cs;
+}
+#else
+# define REG_IP REG_EIP
+# define REG_SP REG_ESP
+# define REG_CX REG_ECX
+
+/* Returns a pointer to the gregs[REG_SS] slot of @ctx. */
+static greg_t *ssptr(ucontext_t *ctx)
+{
+ return &ctx->uc_mcontext.gregs[REG_SS];
+}
+
+/* Returns a pointer to the gregs[REG_CS] slot of @ctx. */
+static greg_t *csptr(ucontext_t *ctx)
+{
+ return &ctx->uc_mcontext.gregs[REG_CS];
+}
+#endif
+
+/*
+ * Checks a given selector for its code bitness or returns -1 if it's not
+ * a usable code segment selector.
+ *
+ * Returns 64, 32 or 16 depending on bits 21 (l) and 22 (db) of the access
+ * rights that lar reports for @cs. Returns -1 when lar leaves ZF clear,
+ * when bit 11 of the access rights is clear (not a code segment), or when
+ * both l and db are set.
+ */
+int cs_bitness(unsigned short cs)
+{
+ uint32_t valid = 0, ar;
+ /* valid only becomes 1 when the jnz is not taken, i.e. ZF is set after lar. */
+ asm ("lar %[cs], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "mov $1, %[valid]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [valid] "+rm" (valid)
+ : [cs] "r" (cs));
+
+ /* ar is only meaningful when lar succeeded. */
+ if (!valid)
+ return -1;
+
+ bool db = (ar & (1 << 22));
+ bool l = (ar & (1 << 21));
+
+ if (!(ar & (1<<11)))
+ return -1; /* Not code. */
+
+ if (l && !db)
+ return 64;
+ else if (!l && db)
+ return 32;
+ else if (!l && !db)
+ return 16;
+ else
+ return -1; /* Unknown bitness. */
+}
+
+/*
+ * Reports whether a selector could be loaded as a stack segment: lar must
+ * accept it, its type must be AR_TYPE_RWDATA or AR_TYPE_RWDATA_EXPDOWN,
+ * and the AR_P bit must be set in its access rights.
+ *
+ * Despite its name, the cs parameter holds the candidate SS selector.
+ */
+bool is_valid_ss(unsigned short cs)
+{
+ uint32_t valid = 0, ar;
+ /* valid only becomes 1 when the jnz is not taken, i.e. ZF is set after lar. */
+ asm ("lar %[cs], %[ar]\n\t"
+ "jnz 1f\n\t"
+ "mov $1, %[valid]\n\t"
+ "1:"
+ : [ar] "=r" (ar), [valid] "+rm" (valid)
+ : [cs] "r" (cs));
+
+ if (!valid)
+ return false;
+
+ if ((ar & AR_TYPE_MASK) != AR_TYPE_RWDATA &&
+ (ar & AR_TYPE_MASK) != AR_TYPE_RWDATA_EXPDOWN)
+ return false;
+
+ /* Converted to bool on return: true iff the AR_P bit is set. */
+ return (ar & AR_P);
+}
+
+/* Number of errors in the current test case. */
+static volatile sig_atomic_t nerrs;
+
+/*
+ * On x86_64 builds, sanity-checks the uc_flags and saved SS of the signal
+ * frame @ctx delivered for signal @sig. Each problem found prints a
+ * [FAIL] line and increments nerrs. On other builds the body is empty.
+ */
+static void validate_signal_ss(int sig, ucontext_t *ctx)
+{
+#ifdef __x86_64__
+ /* Whether the interrupted code was running with a 64-bit CS. */
+ bool was_64bit = (cs_bitness(*csptr(ctx)) == 64);
+
+ if (!(ctx->uc_flags & UC_SIGCONTEXT_SS)) {
+ printf("[FAIL]\tUC_SIGCONTEXT_SS was not set\n");
+ nerrs++;
+
+ /*
+ * This happens on Linux 4.1. The rest will fail, too, so
+ * return now to reduce the noise.
+ */
+ return;
+ }
+
+ /* UC_STRICT_RESTORE_SS is set iff we came from 64-bit mode. */
+ if (!!(ctx->uc_flags & UC_STRICT_RESTORE_SS) != was_64bit) {
+ printf("[FAIL]\tUC_STRICT_RESTORE_SS was wrong in signal %d\n",
+ sig);
+ nerrs++;
+ }
+
+ /* The hardware comparison is only made when the saved SS is a valid stack segment. */
+ if (is_valid_ss(*ssptr(ctx))) {
+ /*
+ * DOSEMU was written before 64-bit sigcontext had SS, and
+ * it tries to figure out the signal source SS by looking at
+ * the physical register. Make sure that keeps working.
+ */
+ unsigned short hw_ss;
+ asm ("mov %%ss, %0" : "=rm" (hw_ss));
+ if (hw_ss != *ssptr(ctx)) {
+ printf("[FAIL]\tHW SS didn't match saved SS\n");
+ nerrs++;
+ }
+ }
+#endif
+}
+
+/*
+ * SIGUSR1 handler. Sets CS and SS as requested and points IP to the
+ * int3 trampoline. Sets SP to a large known value so that we can see
+ * whether the value round-trips back to user mode correctly.
+ *
+ * The requested selectors come from sig_cs and sig_ss. The unmodified
+ * frame is saved in initial_regs and the modified one in requested_regs.
+ */
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ validate_signal_ss(sig, ctx);
+
+ /* Snapshot the untouched frame before editing it. */
+ memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+ *csptr(ctx) = sig_cs;
+ *ssptr(ctx) = sig_ss;
+
+ /*
+ * The code16 segment is based at int3 (see setup_ldt), so the
+ * trampoline is at offset 0 within it; other segments use the
+ * trampoline's plain address.
+ */
+ ctx->uc_mcontext.gregs[REG_IP] =
+ sig_cs == code16_sel ? 0 : (unsigned long)&int3;
+ /* The cast truncates this constant where unsigned long is 32 bits wide. */
+ ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL;
+ ctx->uc_mcontext.gregs[REG_CX] = 0;
+
+#ifdef __i386__
+ /*
+ * Make sure the kernel doesn't inadvertently use DS or ES-relative
+ * accesses in a region where user DS or ES is loaded.
+ *
+ * Skip this for 64-bit builds because long mode doesn't care about
+ * DS and ES and skipping it increases test coverage a little bit,
+ * since 64-bit kernels can still run the 32-bit build.
+ */
+ ctx->uc_mcontext.gregs[REG_DS] = 0;
+ ctx->uc_mcontext.gregs[REG_ES] = 0;
+#endif
+
+ memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+ /* The trampoline copies ss into cx, so that is the value expected at the int3. */
+ requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. */
+
+ return;
+}
+
+/*
+ * Called after a successful sigreturn (via int3) or from a failed
+ * sigreturn (directly by kernel).  Restores our state so that the
+ * original raise(SIGUSR1) returns.
+ *
+ * Records the error code and trap number of the frame in sig_err and
+ * sig_trapno, copies the frame to resulting_regs, rewrites the frame
+ * from initial_regs, and finally stores the signal number in
+ * sig_trapped.  For SIGTRAP it also checks that the value the trampoline
+ * left in CX matches the requested SS.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	validate_signal_ss(sig, ctx);
+
+	sig_err = ctx->uc_mcontext.gregs[REG_ERR];
+	sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO];
+
+	/* The handler's own SS, reported only in the diagnostic below. */
+	unsigned short ss;
+	asm ("mov %%ss,%0" : "=r" (ss));
+
+	/* The int3 trampoline copies ss into cx before trapping. */
+	greg_t asm_ss = ctx->uc_mcontext.gregs[REG_CX];
+	if (asm_ss != sig_ss && sig == SIGTRAP) {
+		/* Sanity check failure. */
+		printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, cx = %llx\n",
+		       ss, *ssptr(ctx), (unsigned long long)asm_ss);
+		nerrs++;
+	}
+
+	/* Save what we got, then make the handler return to the raise(). */
+	memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+	memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+#ifdef __x86_64__
+	if (sig_corrupt_final_ss) {
+		if (ctx->uc_flags & UC_STRICT_RESTORE_SS) {
+			printf("[FAIL]\tUC_STRICT_RESTORE_SS was set inappropriately\n");
+			nerrs++;
+		} else {
+			/*
+			 * DOSEMU transitions from 32-bit to 64-bit mode by
+			 * adjusting sigcontext, and it requires that this work
+			 * even if the saved SS is bogus.
+			 */
+			printf("\tCorrupting SS on return to 64-bit mode\n");
+			*ssptr(ctx) = 0;
+		}
+	}
+#endif
+
+	sig_trapped = sig;
+}
+
+#ifdef __x86_64__
+/*
+ * SIGUSR2 handler that exercises recovery when UC_STRICT_RESTORE_SS is
+ * clear: it drops the flag from the frame and zeroes the saved SS.  If
+ * the flag was not set in the first place, it records a failure instead
+ * and leaves the frame alone.
+ */
+static void sigusr2(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *uc = (ucontext_t*)ctx_void;
+
+	if (uc->uc_flags & UC_STRICT_RESTORE_SS) {
+		uc->uc_flags &= ~UC_STRICT_RESTORE_SS;
+		*ssptr(uc) = 0;
+
+		/*
+		 * Return.  The kernel should recover without sending
+		 * another signal.
+		 */
+		return;
+	}
+
+	/* We can't do the rest. */
+	printf("[FAIL]\traise(2) didn't set UC_STRICT_RESTORE_SS\n");
+	nerrs++;
+}
+
+/*
+ * Resets the SIGUSR1, SIGTRAP, SIGSEGV and SIGILL handlers to their
+ * defaults, installs sigusr2() for SIGUSR2 and raises SIGUSR2 once.
+ * Returns the number of errors recorded in nerrs during the run.
+ */
+static int test_nonstrict_ss(void)
+{
+	static const int to_reset[] = { SIGUSR1, SIGTRAP, SIGSEGV, SIGILL };
+	size_t i;
+
+	for (i = 0; i < sizeof(to_reset) / sizeof(to_reset[0]); i++)
+		clearhandler(to_reset[i]);
+	sethandler(SIGUSR2, sigusr2, 0);
+
+	nerrs = 0;
+
+	printf("[RUN]\tClear UC_STRICT_RESTORE_SS and corrupt SS\n");
+	raise(SIGUSR2);
+	if (nerrs == 0)
+		printf("[OK]\tIt worked\n");
+
+	return nerrs;
+}
+#endif
+
+/*
+ * Finds a usable code segment of the requested bitness.
+ *
+ * Candidates are tried in this order: the current CS, the selector two
+ * descriptor slots above it, the selector two slots below it (only when
+ * the current CS is greater than 2 << 3), and code16_sel.  Returns the
+ * first candidate for which cs_bitness() reports @bitness, or -1 after
+ * printing a warning if none matches.
+ */
+int find_cs(int bitness)
+{
+	unsigned short cur_cs;
+	int candidates[4];
+	int i;
+
+	asm ("mov %%cs,%0" : "=r" (cur_cs));
+
+	candidates[0] = cur_cs;
+	candidates[1] = cur_cs + (2 << 3);
+	/* -1 marks "no candidate"; none of the real candidates is negative. */
+	candidates[2] = cur_cs > (2<<3) ? cur_cs - (2 << 3) : -1;
+	candidates[3] = code16_sel;
+
+	for (i = 0; i < 4; i++) {
+		if (candidates[i] == -1)
+			continue;
+		if (cs_bitness(candidates[i]) == bitness)
+			return candidates[i];
+	}
+
+	printf("[WARN]\tCould not find %d-bit CS\n", bitness);
+	return -1;
+}
+
+/*
+ * Runs one "valid sigreturn" round trip and checks the registers seen at
+ * the int3 trampoline against the ones requested by the SIGUSR1 handler.
+ *
+ * @cs_bits:      bitness of the code segment to return to (see find_cs()).
+ * @use_16bit_ss: when @force_ss is -1, selects data16_sel (true) or the
+ *                current SS (false) as the stack segment.  It is also
+ *                used to label the SS bitness in the output.
+ * @force_ss:     stack selector to use, or -1 to pick one as above.
+ *
+ * Returns the number of errors recorded for this case, including those
+ * recorded by the signal handlers while the signal was being handled.
+ * Returns 0 after printing [SKIP] when a needed segment is unavailable.
+ */
+static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss)
+{
+	int cs = find_cs(cs_bits);
+	if (cs == -1) {
+		printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n",
+		       cs_bits, use_16bit_ss ? 16 : 32);
+		return 0;
+	}
+
+	if (force_ss != -1) {
+		sig_ss = force_ss;
+	} else {
+		if (use_16bit_ss) {
+			if (!data16_sel) {
+				printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n",
+				       cs_bits);
+				return 0;
+			}
+			sig_ss = data16_sel;
+		} else {
+			asm volatile ("mov %%ss,%0" : "=r" (sig_ss));
+		}
+	}
+
+	sig_cs = cs;
+
+	printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n",
+	       cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss,
+	       (sig_ss & 4) ? "" : ", GDT");
+
+	/*
+	 * Reset the error count before raising the signal: the SIGUSR1 and
+	 * SIGTRAP handlers record their own failures in nerrs, and those
+	 * must survive into the result of this test case.
+	 */
+	nerrs = 0;
+
+	raise(SIGUSR1);
+
+	/*
+	 * Check that each register had an acceptable value when the
+	 * int3 trampoline was invoked.
+	 */
+	for (int i = 0; i < NGREG; i++) {
+		greg_t req = requested_regs[i], res = resulting_regs[i];
+
+		if (i == REG_TRAPNO || i == REG_IP)
+			continue; /* don't care */
+
+		if (i == REG_SP) {
+			/*
+			 * If we were using a 16-bit stack segment, then
+			 * the kernel is a bit stuck: IRET only restores
+			 * the low 16 bits of ESP/RSP if SS is 16-bit.
+			 * The kernel uses a hack to restore bits 31:16,
+			 * but that hack doesn't help with bits 63:32.
+			 * On Intel CPUs, bits 63:32 end up zeroed, and, on
+			 * AMD CPUs, they leak the high bits of the kernel
+			 * espfix64 stack pointer.  There's very little that
+			 * the kernel can do about it.
+			 *
+			 * Similarly, if we are returning to a 32-bit context,
+			 * the CPU will often lose the high 32 bits of RSP.
+			 */
+
+			if (res == req)
+				continue;
+
+			if (cs_bits != 64 && ((res ^ req) & 0xFFFFFFFF) == 0) {
+				printf("[NOTE]\tSP: %llx -> %llx\n",
+				       (unsigned long long)req,
+				       (unsigned long long)res);
+				continue;
+			}
+
+			printf("[FAIL]\tSP mismatch: requested 0x%llx; got 0x%llx\n",
+			       (unsigned long long)requested_regs[i],
+			       (unsigned long long)resulting_regs[i]);
+			nerrs++;
+			continue;
+		}
+
+		bool ignore_reg = false;
+#ifdef __i386__
+		if (i == REG_UESP)
+			ignore_reg = true;
+#else
+		if (i == REG_CSGSFS) {
+			/* Only CS and SS are compared within this slot. */
+			struct selectors *req_sels =
+				(void *)&requested_regs[REG_CSGSFS];
+			struct selectors *res_sels =
+				(void *)&resulting_regs[REG_CSGSFS];
+			if (req_sels->cs != res_sels->cs) {
+				printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n",
+				       req_sels->cs, res_sels->cs);
+				nerrs++;
+			}
+
+			if (req_sels->ss != res_sels->ss) {
+				printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n",
+				       req_sels->ss, res_sels->ss);
+				nerrs++;
+			}
+
+			continue;
+		}
+#endif
+
+		/*
+		 * Sanity check on the kernel: CX carries the SS value that
+		 * the trampoline copied just before the int3.
+		 */
+		if (i == REG_CX && req != res) {
+			printf("[FAIL]\tCX (saved SS) mismatch: requested 0x%llx; got 0x%llx\n",
+			       (unsigned long long)req,
+			       (unsigned long long)res);
+			nerrs++;
+			continue;
+		}
+
+		if (req != res && !ignore_reg) {
+			printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n",
+			       i, (unsigned long long)req,
+			       (unsigned long long)res);
+			nerrs++;
+		}
+	}
+
+	if (nerrs == 0)
+		printf("[OK]\tall registers okay\n");
+
+	return nerrs;
+}
+
+/*
+ * Requests a return to user mode with a stack selector that is expected
+ * to make the return fail, and checks that a signal was delivered.
+ *
+ * @cs_bits:  bitness of the code segment to look up when @force_cs is -1;
+ *            also printed in the [RUN] line.
+ * @ss:       stack selector to request.
+ * @force_cs: code selector to use, or -1 to pick one with find_cs().
+ *
+ * Returns 0 when a signal was caught (or when no code segment could be
+ * found) and 1 when no signal arrived.
+ */
+static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs)
+{
+	int cs;
+
+	if (force_cs == -1)
+		cs = find_cs(cs_bits);
+	else
+		cs = force_cs;
+	if (cs == -1)
+		return 0;
+
+	sig_cs = cs;
+	sig_ss = ss;
+
+	printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n",
+	       cs_bits, sig_cs, sig_ss);
+
+	sig_trapped = 0;
+	raise(SIGUSR1);
+
+	if (!sig_trapped) {
+		/*
+		 * This also implicitly tests UC_STRICT_RESTORE_SS:
+		 * We check that these signals set UC_STRICT_RESTORE_SS and,
+		 * if UC_STRICT_RESTORE_SS doesn't cause strict behavior,
+		 * then we won't get SIGSEGV.
+		 */
+		printf("[FAIL]\tDid not get SIGSEGV\n");
+		return 1;
+	}
+
+	/* Decode a non-zero error code into "<table>[ EXT] index <n>, ". */
+	char errdesc[32] = "";
+	if (sig_err) {
+		const char *ext = (sig_err & 1) ? " EXT" : "";
+		const char *table;
+
+		switch (sig_err & 0x6) {
+		case 0x0:
+			table = "GDT";
+			break;
+		case 0x4:
+			table = "LDT";
+			break;
+		case 0x2:
+			table = "IDT";
+			break;
+		default:
+			table = "???";
+			break;
+		}
+
+		sprintf(errdesc, "%s%s index %d, ",
+			table, ext, sig_err >> 3);
+	}
+
+	/* Known trap numbers get a mnemonic; anything else is printed in decimal. */
+	char trapnum[32];
+	const char *trapname;
+
+	switch (sig_trapno) {
+	case 13:
+		trapname = "GP";
+		break;
+	case 11:
+		trapname = "NP";
+		break;
+	case 12:
+		trapname = "SS";
+		break;
+	case 32:
+		trapname = "IRET"; /* X86_TRAP_IRET */
+		break;
+	default:
+		sprintf(trapnum, "%d", sig_trapno);
+		trapname = trapnum;
+		break;
+	}
+
+	printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n",
+	       trapname, (unsigned long)sig_err,
+	       errdesc, strsignal(sig_trapped));
+	return 0;
+}
+
+/*
+ * Entry point.  Sets up the test segments and an alternate signal stack,
+ * runs the valid-sigreturn cases, then the cases where the return to
+ * user mode is expected to fail, and (on x86_64 builds) the non-strict SS
+ * test.
+ *
+ * Returns 1 if any test case reported an error and 0 otherwise.  The
+ * failing-IRET cases are included in that count, so a printed [FAIL] is
+ * always reflected in the exit status.
+ */
+int main()
+{
+	int total_nerrs = 0;
+	unsigned short my_cs, my_ss;
+
+	asm volatile ("mov %%cs,%0" : "=r" (my_cs));
+	asm volatile ("mov %%ss,%0" : "=r" (my_ss));
+	setup_ldt();
+
+	stack_t stack = {
+		.ss_sp = altstack_data,
+		.ss_size = SIGSTKSZ,
+	};
+	if (sigaltstack(&stack, NULL) != 0)
+		err(1, "sigaltstack");
+
+	sethandler(SIGUSR1, sigusr1, 0);
+	sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+	/* Easy cases: return to a 32-bit SS in each possible CS bitness. */
+	total_nerrs += test_valid_sigreturn(64, false, -1);
+	total_nerrs += test_valid_sigreturn(32, false, -1);
+	total_nerrs += test_valid_sigreturn(16, false, -1);
+
+	/*
+	 * Test easy espfix cases: return to a 16-bit LDT SS in each possible
+	 * CS bitness.  NB: with a long mode CS, the SS bitness is irrelevant.
+	 *
+	 * This catches the original missing-espfix-on-64-bit-kernels issue
+	 * as well as CVE-2014-8134.
+	 */
+	total_nerrs += test_valid_sigreturn(64, true, -1);
+	total_nerrs += test_valid_sigreturn(32, true, -1);
+	total_nerrs += test_valid_sigreturn(16, true, -1);
+
+	if (gdt_data16_idx) {
+		/*
+		 * For performance reasons, Linux skips espfix if SS points
+		 * to the GDT.  If we were able to allocate a 16-bit SS in
+		 * the GDT, see if it leaks parts of the kernel stack pointer.
+		 *
+		 * This tests for CVE-2014-8133.
+		 */
+		total_nerrs += test_valid_sigreturn(64, true,
+						    GDT3(gdt_data16_idx));
+		total_nerrs += test_valid_sigreturn(32, true,
+						    GDT3(gdt_data16_idx));
+		total_nerrs += test_valid_sigreturn(16, true,
+						    GDT3(gdt_data16_idx));
+	}
+
+#ifdef __x86_64__
+	/* Nasty ABI case: check SS corruption handling. */
+	sig_corrupt_final_ss = 1;
+	total_nerrs += test_valid_sigreturn(32, false, -1);
+	total_nerrs += test_valid_sigreturn(32, true, -1);
+	sig_corrupt_final_ss = 0;
+#endif
+
+	/*
+	 * We're done testing valid sigreturn cases.  Now we test states
+	 * for which sigreturn itself will succeed but the subsequent
+	 * entry to user mode will fail.
+	 *
+	 * Depending on the failure mode and the kernel bitness, these
+	 * entry failures can generate SIGSEGV, SIGBUS, or SIGILL.
+	 */
+	clearhandler(SIGTRAP);
+	sethandler(SIGSEGV, sigtrap, SA_ONSTACK);
+	sethandler(SIGBUS, sigtrap, SA_ONSTACK);
+	sethandler(SIGILL, sigtrap, SA_ONSTACK);  /* 32-bit kernels do this */
+
+	/* Easy failures: invalid SS, resulting in #GP(0) */
+	total_nerrs += test_bad_iret(64, ldt_nonexistent_sel, -1);
+	total_nerrs += test_bad_iret(32, ldt_nonexistent_sel, -1);
+	total_nerrs += test_bad_iret(16, ldt_nonexistent_sel, -1);
+
+	/* These fail because SS isn't a data segment, resulting in #GP(SS) */
+	total_nerrs += test_bad_iret(64, my_cs, -1);
+	total_nerrs += test_bad_iret(32, my_cs, -1);
+	total_nerrs += test_bad_iret(16, my_cs, -1);
+
+	/* Try to return to a not-present code segment, triggering #NP(SS). */
+	total_nerrs += test_bad_iret(32, my_ss, npcode32_sel);
+
+	/*
+	 * Try to return to a not-present but otherwise valid data segment.
+	 * This will cause IRET to fail with #SS on the espfix stack.  This
+	 * exercises CVE-2014-9322.
+	 *
+	 * Note that, if espfix is enabled, 64-bit Linux will lose track
+	 * of the actual cause of failure and report #GP(0) instead.
+	 * This would be very difficult for Linux to avoid, because
+	 * espfix64 causes IRET failures to be promoted to #DF, so the
+	 * original exception frame is never pushed onto the stack.
+	 */
+	total_nerrs += test_bad_iret(32, npdata32_sel, -1);
+
+	/*
+	 * Try to return to a not-present but otherwise valid data
+	 * segment without invoking espfix.  Newer kernels don't allow
+	 * this to happen in the first place.  On older kernels, though,
+	 * this can trigger CVE-2014-9322.
+	 */
+	if (gdt_npdata32_idx)
+		total_nerrs += test_bad_iret(32, GDT3(gdt_npdata32_idx), -1);
+
+#ifdef __x86_64__
+	total_nerrs += test_nonstrict_ss();
+#endif
+
+	return total_nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/single_step_syscall.c b/tools/testing/selftests/x86/single_step_syscall.c
new file mode 100644
index 000000000..120ac741f
--- /dev/null
+++ b/tools/testing/selftests/x86/single_step_syscall.c
@@ -0,0 +1,242 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * single_step_syscall.c - single-steps various x86 syscalls
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This is a very simple series of tests that makes system calls with
+ * the TF flag set. This exercises some nasty kernel code in the
+ * SYSENTER case: SYSENTER does not clear TF, so SYSENTER with TF set
+ * immediately issues #DB from CPL 0. This requires special handling in
+ * the kernel.
+ */
+
+#define _GNU_SOURCE
+
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <asm/ldt.h>
+#include <err.h>
+#include <setjmp.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+
+#include "helpers.h"
+
+/*
+ * Installs @handler as the three-argument (SA_SIGINFO) handler for @sig.
+ * @flags is OR-ed into sa_flags and the handler runs with an empty signal
+ * mask.  Terminates the program through err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO | flags;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/*
+ * Resets the disposition of @sig to SIG_DFL with an empty mask and no
+ * flags.  Terminates the program through err() if sigaction() fails.
+ */
+static void clearhandler(int sig)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps, sig_eflags;
+sigjmp_buf jmpbuf;
+static unsigned char altstack_data[SIGSTKSZ];
+
+#ifdef __x86_64__
+# define REG_IP REG_RIP
+# define WIDTH "q"
+# define INT80_CLOBBERS "r8", "r9", "r10", "r11"
+#else
+# define REG_IP REG_EIP
+# define WIDTH "l"
+# define INT80_CLOBBERS
+#endif
+
+/*
+ * SIGTRAP handler: counts delivered traps in sig_traps.
+ *
+ * If the handler itself is entered with TF set in EFLAGS, it clears TF,
+ * prints a warning and exits with status 1. A warning with the faulting
+ * address and instruction pointer is printed for the 10000th and 10001st
+ * trap.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (get_eflags() & X86_EFLAGS_TF) {
+ /* Clear TF before doing anything else in the handler. */
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+ printf("[WARN]\tSIGTRAP handler had TF set\n");
+ _exit(1);
+ }
+
+ sig_traps++;
+
+ if (sig_traps == 10000 || sig_traps == 10001) {
+ printf("[WARN]\tHit %d SIGTRAPs with si_addr 0x%lx, ip 0x%lx\n",
+ (int)sig_traps,
+ (unsigned long)info->si_addr,
+ (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+ }
+}
+
+/*
+ * Printable names for the signals that print_and_longjmp() may report,
+ * indexed by signal number.  Entries for other signals are NULL.
+ */
+static char const * const signames[] = {
+	[SIGSEGV] = "SIGSEGV",
+	[SIGBUS] = "SIGBUS",
+	[SIGTRAP] = "SIGTRAP",
+	[SIGILL] = "SIGILL",
+};
+
+/*
+ * Signal handler that reports the signal name, the saved instruction
+ * pointer and the saved TF bit, stores the saved flags register in
+ * sig_eflags, and then jumps back to the sigsetjmp() point in jmpbuf
+ * with value 1.
+ */
+static void print_and_longjmp(int sig, siginfo_t *si, void *ctx_void)
+{
+	ucontext_t *uc = ctx_void;
+	unsigned long ip = (unsigned long)uc->uc_mcontext.gregs[REG_IP];
+	unsigned long eflags = (unsigned long)uc->uc_mcontext.gregs[REG_EFL];
+
+	printf("\tGot %s with RIP=%lx, TF=%ld\n", signames[sig],
+	       ip, eflags & X86_EFLAGS_TF);
+
+	sig_eflags = eflags;
+	siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Finishes one TF test: reads EFLAGS, clears TF, and then verifies that
+ * at least one SIGTRAP was counted and that TF was still set when the
+ * flags were read.  Exits with status 1 on either failure; otherwise
+ * prints the trap count and resets sig_traps to 0.
+ */
+static void check_result(void)
+{
+	unsigned long flags_at_entry = get_eflags();
+	bool tf_was_set = (flags_at_entry & X86_EFLAGS_TF) != 0;
+
+	/* Stop single-stepping before doing any further work. */
+	set_eflags(flags_at_entry & ~X86_EFLAGS_TF);
+
+	if (sig_traps == 0) {
+		printf("[FAIL]\tNo SIGTRAP\n");
+		exit(1);
+	}
+
+	if (!tf_was_set) {
+		printf("[FAIL]\tTF was cleared\n");
+		exit(1);
+	}
+
+	printf("[OK]\tSurvived with TF set and %d traps\n", (int)sig_traps);
+	sig_traps = 0;
+}
+
+/*
+ * Checks that a syscall made with TF clear neither sets TF nor produces a
+ * SIGTRAP.  Resets sig_traps, prints the [RUN] line, flushes stdout to
+ * force a syscall, and exits with status 1 if TF is set afterwards or a
+ * trap was counted.
+ */
+static void fast_syscall_no_tf(void)
+{
+	sig_traps = 0;
+
+	printf("[RUN]\tFast syscall with TF cleared\n");
+	fflush(stdout);  /* Force a syscall */
+
+	if ((get_eflags() & X86_EFLAGS_TF) != 0) {
+		printf("[FAIL]\tTF is now set\n");
+		exit(1);
+	}
+
+	if (sig_traps != 0) {
+		printf("[FAIL]\tGot SIGTRAP\n");
+		exit(1);
+	}
+
+	printf("[OK]\tNothing unexpected happened\n");
+}
+
+/*
+ * Entry point. Installs the SIGTRAP counter and then, with TF set, runs
+ * in turn: a nop, (x86_64 only) a nop with r11 loaded from the flags,
+ * (if CAN_BUILD_32) an int $0x80 getpid, a libc syscall(SYS_getpid) and a
+ * raw SYSENTER. After each step it checks that traps were delivered and
+ * that TF survived. Returns 0 if every check passes; the checks
+ * themselves exit with status 1 on failure.
+ */
+int main()
+{
+#ifdef CAN_BUILD_32
+ int tmp;
+#endif
+
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ printf("[RUN]\tSet TF and check nop\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile ("nop");
+ check_result();
+
+#ifdef __x86_64__
+ printf("[RUN]\tSet TF and check syscall-less opportunistic sysret\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ extern unsigned char post_nop[];
+ /* rcx is preloaded with the address of post_nop and r11 with the flags. */
+ asm volatile ("pushf" WIDTH "\n\t"
+ "pop" WIDTH " %%r11\n\t"
+ "nop\n\t"
+ "post_nop:"
+ : : "c" (post_nop) : "r11");
+ check_result();
+#endif
+#ifdef CAN_BUILD_32
+ printf("[RUN]\tSet TF and check int80\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile ("int $0x80" : "=a" (tmp) : "a" (SYS_getpid)
+ : INT80_CLOBBERS);
+ check_result();
+#endif
+
+ /*
+ * This test is particularly interesting if fast syscalls use
+ * SYSENTER: it triggers a nasty design flaw in SYSENTER.
+ * Specifically, SYSENTER does not clear TF, so either SYSENTER
+ * or the next instruction traps at CPL0. (Of course, Intel
+ * mostly forgot to document exactly what happens here.) So we
+ * get a CPL0 fault with usergs (on 64-bit kernels) and possibly
+ * no stack. The only sane way the kernel can possibly handle
+ * it is to clear TF on return from the #DB handler, but this
+ * happens way too early to set TF in the saved pt_regs, so the
+ * kernel has to do something clever to avoid losing track of
+ * the TF bit.
+ *
+ * Needless to say, we've had bugs in this area.
+ */
+ syscall(SYS_getpid); /* Force symbol binding without TF set. */
+ printf("[RUN]\tSet TF and check a fast syscall\n");
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ syscall(SYS_getpid);
+ check_result();
+
+ /* Now make sure that another fast syscall doesn't set TF again. */
+ fast_syscall_no_tf();
+
+ /*
+ * And do a forced SYSENTER to make sure that this works even if
+ * fast syscalls don't use SYSENTER.
+ *
+ * Invoking SYSENTER directly breaks all the rules. Just handle
+ * the SIGSEGV.
+ */
+ /* print_and_longjmp() resumes here with a non-zero return value. */
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ unsigned long nr = SYS_getpid;
+ printf("[RUN]\tSet TF and check SYSENTER\n");
+ stack_t stack = {
+ .ss_sp = altstack_data,
+ .ss_size = SIGSTKSZ,
+ };
+ if (sigaltstack(&stack, NULL) != 0)
+ err(1, "sigaltstack");
+ /* Only the SIGSEGV handler is asked to run on the alternate stack. */
+ sethandler(SIGSEGV, print_and_longjmp,
+ SA_RESETHAND | SA_ONSTACK);
+ sethandler(SIGILL, print_and_longjmp, SA_RESETHAND);
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ /* Clear EBP first to make sure we segfault cleanly. */
+ asm volatile ("xorl %%ebp, %%ebp; SYSENTER" : "+a" (nr) :: "flags", "rcx"
+#ifdef __x86_64__
+ , "r11"
+#endif
+ );
+
+ /* We're unreachable here. SYSENTER forgets RIP. */
+ }
+ clearhandler(SIGSEGV);
+ clearhandler(SIGILL);
+ /* sig_eflags holds the flags saved by print_and_longjmp(). */
+ if (!(sig_eflags & X86_EFLAGS_TF)) {
+ printf("[FAIL]\tTF was cleared\n");
+ exit(1);
+ }
+
+ /* Now make sure that another fast syscall doesn't set TF again. */
+ fast_syscall_no_tf();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
new file mode 100644
index 000000000..bff474b5e
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -0,0 +1,237 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * syscall_arg_fault.c - tests faults on 32-bit fast syscall stack args
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <setjmp.h>
+#include <errno.h>
+
+#include "helpers.h"
+
+/* Our sigaltstack scratch space. */
+static unsigned char altstack_data[SIGSTKSZ];
+
+/*
+ * Install @handler as the SA_SIGINFO-style handler for @sig, OR-ing @flags
+ * into sa_flags and leaving the handler's signal mask empty.
+ * Exits through err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO | flags;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps;
+static sigjmp_buf jmpbuf;
+
+static volatile sig_atomic_t n_errs;
+
+#ifdef __x86_64__
+#define REG_AX REG_RAX
+#define REG_IP REG_RIP
+#else
+#define REG_AX REG_EAX
+#define REG_IP REG_EIP
+#endif
+
+/*
+ * Handler shared by SIGSEGV and SIGBUS.  Reads the AX register saved in the
+ * signal context and accepts -EFAULT or -ENOSYS; any other value is reported
+ * (together with the saved IP) and counted in n_errs.  Always leaves via
+ * siglongjmp(jmpbuf, 1) instead of returning to the faulting context.
+ */
+static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+ long ax = (long)ctx->uc_mcontext.gregs[REG_AX];
+
+ if (ax != -EFAULT && ax != -ENOSYS) {
+ printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
+ (unsigned long)ax);
+ printf("\tIP = 0x%lx\n", (unsigned long)ctx->uc_mcontext.gregs[REG_IP]);
+ n_errs++;
+ } else {
+ printf("[OK]\tSeems okay\n");
+ }
+
+ /* The interrupted register state is deliberately garbage; never return to it. */
+ siglongjmp(jmpbuf, 1);
+}
+
+static volatile sig_atomic_t sigtrap_consecutive_syscalls;
+
+/*
+ * SIGTRAP handler used while single-stepping (TF set).  Counts consecutive
+ * traps whose saved IP points at a SYSENTER or SYSCALL instruction; after
+ * more than three in a row it prints a warning and bails out through
+ * siglongjmp(jmpbuf, 1).  A trap at any other instruction resets the counter.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ /*
+ * KVM has some bugs that can cause us to stop making progress.
+ * detect them and complain, but don't infinite loop or fail the
+ * test.
+ */
+
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+ unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+ /* 16-bit little-endian reads of the opcodes 0f 34 (SYSENTER) and 0f 05 (SYSCALL). */
+ if (*ip == 0x340f || *ip == 0x050f) {
+ /* The trap was on SYSCALL or SYSENTER */
+ sigtrap_consecutive_syscalls++;
+ if (sigtrap_consecutive_syscalls > 3) {
+ printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n");
+ siglongjmp(jmpbuf, 1);
+ }
+ } else {
+ sigtrap_consecutive_syscalls = 0;
+ }
+}
+
+/*
+ * SIGILL handler.  If the faulting instruction is ud2 (opcode 0f 0b, read
+ * here as the 16-bit value 0x0b0f), the preceding SYSCALL returned normally
+ * and that is reported as [OK]; any other illegal instruction is reported as
+ * [SKIP].  Either way control goes back to the sigsetjmp() point in jmpbuf.
+ */
+static void sigill(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+ unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+ if (*ip == 0x0b0f) {
+ /* one of the ud2 instructions faulted */
+ printf("[OK]\tSYSCALL returned normally\n");
+ } else {
+ printf("[SKIP]\tIllegal instruction\n");
+ }
+ siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Drive the SYSENTER and SYSCALL entry instructions with every
+ * general-purpose register (including ESP) set to -1, first without and then
+ * with TF set, and let the signal handlers above judge the outcome.  Each
+ * attempt is wrapped in sigsetjmp() because the handlers recover with
+ * siglongjmp(); the handlers run on an alternate stack because ESP is
+ * deliberately invalid when the signal arrives.
+ *
+ * Returns 0 if no handler recorded an error in n_errs, 1 otherwise.
+ */
+int main()
+{
+	stack_t stack = {
+		.ss_sp = altstack_data,
+		.ss_size = SIGSTKSZ,
+	};
+	if (sigaltstack(&stack, NULL) != 0)
+		err(1, "sigaltstack");
+
+	sethandler(SIGSEGV, sigsegv_or_sigbus, SA_ONSTACK);
+	/*
+	 * The actual exception can vary.  On Atom CPUs, we get #SS
+	 * instead of #PF when the vDSO fails to access the stack when
+	 * ESP is too close to 2^32, and #SS causes SIGBUS.
+	 */
+	sethandler(SIGBUS, sigsegv_or_sigbus, SA_ONSTACK);
+	sethandler(SIGILL, sigill, SA_ONSTACK);
+
+	/*
+	 * Exercise another nasty special case.  The 32-bit SYSCALL
+	 * and SYSENTER instructions (even in compat mode) each
+	 * clobber one register.  A Linux system call has a syscall
+	 * number and six arguments, and the user stack pointer
+	 * needs to live in some register on return.  That means
+	 * that we need eight registers, but SYSCALL and SYSENTER
+	 * only preserve seven registers.  As a result, one argument
+	 * ends up on the stack.  The stack is user memory, which
+	 * means that the kernel can fail to read it.
+	 *
+	 * The 32-bit fast system calls don't have a defined ABI:
+	 * we're supposed to invoke them through the vDSO.  So we'll
+	 * fudge it: we set all regs to invalid pointer values and
+	 * invoke the entry instruction.  The return will fail no
+	 * matter what, and we completely lose our program state,
+	 * but we can fix it up with a signal handler.
+	 */
+
+	printf("[RUN]\tSYSENTER with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+
+	printf("[RUN]\tSYSCALL with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"ud2"		/* make sure we recover cleanly */
+			: : : "memory", "flags");
+	}
+
+	printf("[RUN]\tSYSENTER with TF and invalid state\n");
+	sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		sigtrap_consecutive_syscalls = 0;
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+	/* Stop single-stepping before running ordinary code again. */
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+	printf("[RUN]\tSYSCALL with TF and invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		sigtrap_consecutive_syscalls = 0;
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"ud2"		/* make sure we recover cleanly */
+			: : : "memory", "flags");
+	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+#ifdef __x86_64__
+	printf("[RUN]\tSYSENTER with TF, invalid state, and GSBASE < 0\n");
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		sigtrap_consecutive_syscalls = 0;
+
+		/* If WRGSBASE is unavailable this raises SIGILL, handled by sigill(). */
+		asm volatile ("wrgsbase %%rax\n\t"
+			      :: "a" (0xffffffffffff0000UL));
+
+		set_eflags(get_eflags() | X86_EFLAGS_TF);
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+	set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+#endif
+
+	/*
+	 * sigsegv_or_sigbus() counts wrong AX values in n_errs; propagate that
+	 * to the exit status so a [FAIL] line is not reported as success.
+	 */
+	return n_errs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c
new file mode 100644
index 000000000..a108b80dd
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * syscall_nt.c - checks syscalls with NT set
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * Some obscure user-space code requires the ability to make system calls
+ * with FLAGS.NT set. Make sure it works.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <signal.h>
+#include <err.h>
+#include <sys/syscall.h>
+
+#include "helpers.h"
+
+static unsigned int nerrs;
+
+/*
+ * Install @handler for @sig using the three-argument (SA_SIGINFO) handler
+ * form.  @flags is OR-ed into sa_flags; the handler's signal mask is empty.
+ * Terminates the program via err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/*
+ * Intentionally empty SIGTRAP handler.  main() installs it before the tests
+ * that set TF, so the resulting traps are delivered here and simply return.
+ */
+static void sigtrap(int sig, siginfo_t *si, void *ctx_void)
+{
+}
+
+/*
+ * Set the EFLAGS bits in @extraflags, issue getpid() through syscall(), and
+ * check that all of those bits are still set afterwards.  Prints [OK] or
+ * [FAIL]; a failure increments nerrs.
+ */
+static void do_it(unsigned long extraflags)
+{
+ unsigned long flags;
+
+ set_eflags(get_eflags() | extraflags);
+ syscall(SYS_getpid);
+ /* Sample the flags immediately after the syscall, before anything else runs. */
+ flags = get_eflags();
+ /* Reset EFLAGS to IF|FIXED before calling printf() below. */
+ set_eflags(X86_EFLAGS_IF | X86_EFLAGS_FIXED);
+ if ((flags & extraflags) == extraflags) {
+ printf("[OK]\tThe syscall worked and flags are still set\n");
+ } else {
+ printf("[FAIL]\tThe syscall worked but flags were cleared (flags = 0x%lx but expected 0x%lx set)\n",
+ flags, extraflags);
+ nerrs++;
+ }
+}
+
+/*
+ * Run do_it() for combinations of the NT, AC, TF and DF flags.  A SIGTRAP
+ * handler is installed before the first combination that includes TF.
+ * Returns 0 if every combination passed, 1 if nerrs is nonzero.
+ */
+int main(void)
+{
+ printf("[RUN]\tSet NT and issue a syscall\n");
+ do_it(X86_EFLAGS_NT);
+
+ printf("[RUN]\tSet AC and issue a syscall\n");
+ do_it(X86_EFLAGS_AC);
+
+ printf("[RUN]\tSet NT|AC and issue a syscall\n");
+ do_it(X86_EFLAGS_NT | X86_EFLAGS_AC);
+
+ /*
+ * Now try it again with TF set -- TF forces returns via IRET in all
+ * cases except non-ptregs-using 64-bit full fast path syscalls.
+ */
+
+ sethandler(SIGTRAP, sigtrap, 0);
+
+ printf("[RUN]\tSet TF and issue a syscall\n");
+ do_it(X86_EFLAGS_TF);
+
+ printf("[RUN]\tSet NT|TF and issue a syscall\n");
+ do_it(X86_EFLAGS_NT | X86_EFLAGS_TF);
+
+ printf("[RUN]\tSet AC|TF and issue a syscall\n");
+ do_it(X86_EFLAGS_AC | X86_EFLAGS_TF);
+
+ printf("[RUN]\tSet NT|AC|TF and issue a syscall\n");
+ do_it(X86_EFLAGS_NT | X86_EFLAGS_AC | X86_EFLAGS_TF);
+
+ /*
+ * Now try DF. This is evil and it's plausible that we will crash
+ * glibc, but glibc would have to do something rather surprising
+ * for this to happen.
+ */
+ printf("[RUN]\tSet DF and issue a syscall\n");
+ do_it(X86_EFLAGS_DF);
+
+ printf("[RUN]\tSet TF|DF and issue a syscall\n");
+ do_it(X86_EFLAGS_TF | X86_EFLAGS_DF);
+
+ return nerrs == 0 ? 0 : 1;
+}
diff --git a/tools/testing/selftests/x86/syscall_numbering.c b/tools/testing/selftests/x86/syscall_numbering.c
new file mode 100644
index 000000000..d6b09cb1a
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_numbering.c
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * syscall_numbering.c - tests that invalid syscall numbers (x32 misuse, high bits set) fail with ENOSYS
+ * Copyright (c) 2018 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <unistd.h>
+#include <syscall.h>
+
+static int nerrs;
+
+#define X32_BIT 0x40000000UL
+
+/*
+ * Issue raw syscall number @nr with all-zero arguments and require that it
+ * fails with ENOSYS.  On any other outcome a [FAIL] line is printed and *@ok
+ * is set to false; *@ok is never set back to true, so one flag can accumulate
+ * the result of many calls.
+ */
+static void check_enosys(unsigned long nr, bool *ok)
+{
+	long ret;
+
+	/* If this fails, a segfault is reasonably likely. */
+	fflush(stdout);
+
+	/*
+	 * syscall() returns -1 and sets errno on failure; every other return
+	 * value, not just 0, means the call succeeded.  Clear errno first so
+	 * that a stale ENOSYS left by an earlier call cannot hide a success.
+	 */
+	errno = 0;
+	ret = syscall(nr, 0, 0, 0, 0, 0, 0);
+	if (ret != -1) {
+		printf("[FAIL]\tsyscall %lu succeeded, but it should have failed\n", nr);
+		*ok = false;
+	} else if (errno != ENOSYS) {
+		printf("[FAIL]\tsyscall %lu had error code %d, but it should have reported ENOSYS\n", nr, errno);
+		*ok = false;
+	}
+}
+
+/*
+ * Check three groups of syscall numbers that must all fail with ENOSYS:
+ * 512-547 without the x32 bit, a few numbers (16, 19, 20) with the x32 bit
+ * set, and numbers with bit 32 set.  Despite the name, the last two groups
+ * do use X32_BIT.  Increments nerrs once if any check failed.
+ */
+static void test_x32_without_x32_bit(void)
+{
+ bool ok = true;
+
+ /*
+ * Syscalls 512-547 are "x32" syscalls. They are intended to be
+ * called with the x32 (0x40000000) bit set. Calling them without
+ * the x32 bit set is nonsense and should not work.
+ */
+ printf("[RUN]\tChecking syscalls 512-547\n");
+ for (int i = 512; i <= 547; i++)
+ check_enosys(i, &ok);
+
+ /*
+ * Check that a handful of 64-bit-only syscalls are rejected if the x32
+ * bit is set.
+ */
+ printf("[RUN]\tChecking some 64-bit syscalls in x32 range\n");
+ check_enosys(16 | X32_BIT, &ok); /* ioctl */
+ check_enosys(19 | X32_BIT, &ok); /* readv */
+ check_enosys(20 | X32_BIT, &ok); /* writev */
+
+ /*
+ * Check some syscalls with high bits set.
+ */
+ printf("[RUN]\tChecking numbers above 2^32-1\n");
+ check_enosys((1UL << 32), &ok);
+ check_enosys(X32_BIT | (1UL << 32), &ok);
+
+ if (!ok)
+ nerrs++;
+ else
+ printf("[OK]\tThey all returned -ENOSYS\n");
+}
+
+/*
+ * Print whether the kernel accepts an x32-numbered syscall (informational
+ * only; it does not affect the result), then run the ENOSYS checks.
+ * Returns 1 if nerrs is nonzero, 0 otherwise.
+ */
+int main()
+{
+ /*
+ * Anyone diagnosing a failure will want to know whether the kernel
+ * supports x32. Tell them.
+ */
+ printf("\tChecking for x32...");
+ fflush(stdout);
+ /* Probe with syscall number 39 plus the x32 bit (39 is getpid on x86-64 -- TODO confirm). */
+ if (syscall(39 | X32_BIT, 0, 0, 0, 0, 0, 0) >= 0) {
+ printf(" supported\n");
+ } else if (errno == ENOSYS) {
+ printf(" not supported\n");
+ } else {
+ printf(" confused\n");
+ }
+
+ test_x32_without_x32_bit();
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/sysret_rip.c b/tools/testing/selftests/x86/sysret_rip.c
new file mode 100644
index 000000000..84d74be1d
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_rip.c
@@ -0,0 +1,187 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysret_rip.c - tests that x86 avoids Intel SYSRET pitfalls
+ * Copyright (c) 2014-2016 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <sys/syscall.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <setjmp.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+#include <assert.h>
+
+
+/*
+ * Emit a 4096-byte-aligned region of .text named test_page: 4094 bytes of
+ * 0xcc filler followed by a SYSCALL instruction at test_syscall_insn, so the
+ * SYSCALL is the last thing on the page.  The .ifne/.error check fails the
+ * build if the region is not exactly 4096 bytes long.
+ */
+asm (
+ ".pushsection \".text\", \"ax\"\n\t"
+ ".balign 4096\n\t"
+ "test_page: .globl test_page\n\t"
+ ".fill 4094,1,0xcc\n\t"
+ "test_syscall_insn:\n\t"
+ "syscall\n\t"
+ ".ifne . - test_page - 4096\n\t"
+ ".error \"test page is not one page long\"\n\t"
+ ".endif\n\t"
+ ".popsection"
+ );
+
+extern const char test_page[];
+static void const *current_test_page_addr = test_page;
+
+/*
+ * Install @handler for @sig using the three-argument (SA_SIGINFO) handler
+ * form.  @flags is OR-ed into sa_flags; the handler's signal mask is empty.
+ * Terminates the program via err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+ int flags)
+{
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = handler;
+ sa.sa_flags = SA_SIGINFO | flags;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(sig, &sa, 0))
+ err(1, "sigaction");
+}
+
+/* Put @sig back to its default disposition (SIG_DFL); err() out on failure. */
+static void clearhandler(int sig)
+{
+	struct sigaction dfl;
+
+	memset(&dfl, 0, sizeof(dfl));
+	sigemptyset(&dfl.sa_mask);
+	dfl.sa_handler = SIG_DFL;
+
+	if (sigaction(sig, &dfl, NULL) != 0)
+		err(1, "sigaction");
+}
+
+/* State used by our signal handlers. */
+static gregset_t initial_regs;
+
+static volatile unsigned long rip;
+
+/*
+ * SIGSEGV handler for the sigreturn tests.  Verifies that the fault was
+ * taken with RIP equal to the requested address in the global 'rip'; on a
+ * mismatch it prints [FAIL] and _exit(1)s.  Otherwise it overwrites the
+ * saved general registers with initial_regs (captured by sigusr1()), so
+ * returning from this handler resumes the context that sigusr1() saved.
+ */
+static void sigsegv_for_sigreturn_test(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+ printf("[FAIL]\tRequested RIP=0x%lx but got RIP=0x%lx\n",
+ rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+ fflush(stdout);
+ _exit(1);
+ }
+
+ memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t));
+
+ printf("[OK]\tGot SIGSEGV at RIP=0x%lx\n", rip);
+}
+
+/*
+ * SIGUSR1 handler that redirects the return from the signal.  Saves the
+ * interrupted registers in initial_regs, then rewrites the saved RIP and RCX
+ * to the global 'rip', and installs sigsegv_for_sigreturn_test() as a
+ * one-shot (SA_RESETHAND) SIGSEGV handler to catch the resulting fault.
+ */
+static void sigusr1(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+ memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t));
+
+ /* Set IP and CX to match so that SYSRET can happen. */
+ ctx->uc_mcontext.gregs[REG_RIP] = rip;
+ ctx->uc_mcontext.gregs[REG_RCX] = rip;
+
+ /* R11 and EFLAGS should already match. */
+ assert(ctx->uc_mcontext.gregs[REG_EFL] ==
+ ctx->uc_mcontext.gregs[REG_R11]);
+
+ sethandler(SIGSEGV, sigsegv_for_sigreturn_test, SA_RESETHAND);
+
+ return;
+}
+
+/*
+ * Record @ip in the global 'rip' and raise SIGUSR1; the SIGUSR1 handler
+ * installed by main() then makes the signal return target @ip.
+ */
+static void test_sigreturn_to(unsigned long ip)
+{
+ rip = ip;
+ printf("[RUN]\tsigreturn to 0x%lx\n", ip);
+ raise(SIGUSR1);
+}
+
+/*
+ * Resume point for the fallthrough tests.  It is filled by sigsetjmp() and
+ * consumed by siglongjmp(), both of which take a sigjmp_buf (not a jmp_buf).
+ */
+static sigjmp_buf jmpbuf;
+
+/*
+ * SIGSEGV handler for the fallthrough tests.  Verifies that the fault was
+ * taken with RIP equal to the expected address in the global 'rip' and then
+ * jumps back to the sigsetjmp() point in jmpbuf.  On a mismatch it prints
+ * [FAIL] and terminates with _exit(1).
+ */
+static void sigsegv_for_fallthrough(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	if (rip != ctx->uc_mcontext.gregs[REG_RIP]) {
+		printf("[FAIL]\tExpected SIGSEGV at 0x%lx but got RIP=0x%lx\n",
+		       rip, (unsigned long)ctx->uc_mcontext.gregs[REG_RIP]);
+		fflush(stdout);
+		_exit(1);
+	}
+
+	siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Move the test page so that it ends exactly at @ip, then execute the SYSCALL
+ * instruction in its last two bytes.  When the syscall returns, execution
+ * continues at @ip, and the SIGSEGV handler checks that the fault is reported
+ * there.
+ *
+ * If mremap() fails, that is fatal for @ip <= (1UL << 47) - PAGE_SIZE and is
+ * accepted with an [OK] line for any higher @ip.
+ */
+static void test_syscall_fallthrough_to(unsigned long ip)
+{
+	void *new_address = (void *)(ip - 4096);
+	void *ret;
+
+	printf("[RUN]\tTrying a SYSCALL that falls through to 0x%lx\n", ip);
+
+	ret = mremap((void *)current_test_page_addr, 4096, 4096,
+		     MREMAP_MAYMOVE | MREMAP_FIXED, new_address);
+	if (ret == MAP_FAILED) {
+		if (ip <= (1UL << 47) - PAGE_SIZE) {
+			err(1, "mremap to %p", new_address);
+		} else {
+			printf("[OK]\tmremap to %p failed\n", new_address);
+			return;
+		}
+	}
+
+	/* errx() appends its own newline, so the format must not end in one. */
+	if (ret != new_address)
+		errx(1, "mremap malfunctioned: asked for %p but got %p",
+		     new_address, ret);
+
+	current_test_page_addr = new_address;
+	rip = ip;
+
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		/*
+		 * ip - 2 is the SYSCALL instruction at the end of the moved
+		 * page; RAX carries the syscall number (getpid).
+		 */
+		asm volatile ("call *%[syscall_insn]" :: "a" (SYS_getpid),
+			      [syscall_insn] "rm" (ip - 2));
+		errx(1, "[FAIL]\tSyscall trampoline returned");
+	}
+
+	printf("[OK]\tWe survived\n");
+}
+
+/*
+ * Phase 1: for each i in 47..63, sigreturn to address 1UL << i.
+ * Phase 2: run the SYSCALL-fallthrough test once at (1UL << 47) - 2*PAGE_SIZE
+ * as a check of the mremap logic, then for each i in 47..63 at
+ * (1UL << i) - PAGE_SIZE and at 1UL << i.
+ * Failures terminate the process inside the helpers; reaching the end
+ * returns 0.
+ */
+int main()
+{
+ /*
+ * When the kernel returns from a slow-path syscall, it will
+ * detect whether SYSRET is appropriate. If it incorrectly
+ * thinks that SYSRET is appropriate when RIP is noncanonical,
+ * it'll crash on Intel CPUs.
+ */
+ sethandler(SIGUSR1, sigusr1, 0);
+ for (int i = 47; i < 64; i++)
+ test_sigreturn_to(1UL<<i);
+
+ clearhandler(SIGUSR1);
+
+ sethandler(SIGSEGV, sigsegv_for_fallthrough, 0);
+
+ /* One extra test to check that we didn't screw up the mremap logic. */
+ test_syscall_fallthrough_to((1UL << 47) - 2*PAGE_SIZE);
+
+ /* These are the interesting cases. */
+ for (int i = 47; i < 64; i++) {
+ test_syscall_fallthrough_to((1UL<<i) - PAGE_SIZE);
+ test_syscall_fallthrough_to(1UL<<i);
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/sysret_ss_attrs.c b/tools/testing/selftests/x86/sysret_ss_attrs.c
new file mode 100644
index 000000000..5f3d4fca4
--- /dev/null
+++ b/tools/testing/selftests/x86/sysret_ss_attrs.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * sysret_ss_attrs.c - test that syscalls return valid hidden SS attributes
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * On AMD CPUs, SYSRET can return with a valid SS descriptor with
+ * the hidden attributes set to an unusable state. Make sure the kernel
+ * doesn't let this happen.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <pthread.h>
+
+/*
+ * Thread body started by main(): spins forever and ignores @ctx.  The
+ * trailing return is never reached.
+ */
+static void *threadproc(void *ctx)
+{
+ /*
+ * Do our best to cause sleeps on this CPU to exit the kernel and
+ * re-enter with SS = 0.
+ */
+ while (true)
+ ;
+
+ return NULL;
+}
+
+#ifdef __x86_64__
+/* Defined outside this file; main() passes it a stack pointer and test_ss. */
+extern unsigned long call32_from_64(void *stack, void (*function)(void));
+
+/*
+ * test_ss: a tiny routine assembled as 32-bit code that pushes and pops one
+ * word on the stack and returns.  The assembler is switched back to 64-bit
+ * mode afterwards, and the section stack is restored with .popsection so the
+ * .pushsection above is balanced.
+ */
+asm (".pushsection .text\n\t"
+     ".code32\n\t"
+     "test_ss:\n\t"
+     "pushl $0\n\t"
+     "popl %eax\n\t"
+     "ret\n\t"
+     ".code64\n\t"
+     ".popsection");
+extern void test_ss(void);
+#endif
+
+/*
+ * Pin the process to CPU 0, start a busy-looping thread, and then make 1000
+ * short sleeps.  On 64-bit builds each sleep is followed by a call to the
+ * 32-bit test_ss routine on a stack mapped with MAP_32BIT.
+ * Prints [OK] and returns 0 if the loop completes.
+ */
+int main()
+{
+	/*
+	 * Start a busy-looping thread on the same CPU we're on.
+	 * For simplicity, just stick everything to CPU 0.  This will
+	 * fail in some containers, but that's probably okay.
+	 */
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		printf("[WARN]\tsched_setaffinity failed\n");
+
+	/*
+	 * pthread_create() returns the error number and does not set errno,
+	 * so format the returned code instead of using err().
+	 */
+	pthread_t thread;
+	int rc = pthread_create(&thread, 0, threadproc, 0);
+	if (rc != 0)
+		errx(1, "pthread_create: %s", strerror(rc));
+
+#ifdef __x86_64__
+	unsigned char *stack32 = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+				      MAP_32BIT | MAP_ANONYMOUS | MAP_PRIVATE,
+				      -1, 0);
+	if (stack32 == MAP_FAILED)
+		err(1, "mmap");
+#endif
+
+	printf("[RUN]\tSyscalls followed by SS validation\n");
+
+	for (int i = 0; i < 1000; i++) {
+		/*
+		 * Go to sleep and return using sysret (if we're 64-bit
+		 * or we're 32-bit on AMD on a 64-bit kernel).  On AMD CPUs,
+		 * SYSRET doesn't fix up the cached SS descriptor, so the
+		 * kernel needs some kind of workaround to make sure that we
+		 * end the system call with a valid stack segment.  This
+		 * can be a confusing failure because the SS *selector*
+		 * is the same regardless.
+		 */
+		usleep(2);
+
+#ifdef __x86_64__
+		/*
+		 * On 32-bit, just doing a syscall through glibc is enough
+		 * to cause a crash if our cached SS descriptor is invalid.
+		 * On 64-bit, it's not, so try extra hard.
+		 */
+		call32_from_64(stack32 + 4088, test_ss);
+#endif
+	}
+
+	printf("[OK]\tWe survived\n");
+
+#ifdef __x86_64__
+	munmap(stack32, 4096);
+#endif
+
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/test_FCMOV.c b/tools/testing/selftests/x86/test_FCMOV.c
new file mode 100644
index 000000000..6b5036fbb
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCMOV.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+
+/*
+ * TEST(insn) defines `long double insn(long flags)`.  The function loads
+ * @flags into EFLAGS (push/popf), pushes pi and then 1.0 onto the x87 stack
+ * so that st(0) = 1.0 and st(1) = pi, executes `insn %st(1), %st`, and
+ * returns st(0).  The result is therefore 1.0 when the conditional move was
+ * not performed and pi when it was.
+ */
+#define TEST(insn) \
+long double __attribute__((noinline)) insn(long flags) \
+{ \
+ long double out; \
+ asm ("\n" \
+ " push %1""\n" \
+ " popf""\n" \
+ " fldpi""\n" \
+ " fld1""\n" \
+ " " #insn " %%st(1), %%st" "\n" \
+ " ffree %%st(1)" "\n" \
+ : "=t" (out) \
+ : "r" (flags) \
+ ); \
+ return out; \
+}
+
+/* One wrapper per fcmovCC variant exercised by main(). */
+TEST(fcmovb)
+TEST(fcmove)
+TEST(fcmovbe)
+TEST(fcmovu)
+TEST(fcmovnb)
+TEST(fcmovne)
+TEST(fcmovnbe)
+TEST(fcmovnu)
+
+/* EFLAGS bit masks passed to the fcmovCC wrappers: bits 0, 2 and 6. */
+enum {
+ CF = 1 << 0,
+ PF = 1 << 2,
+ ZF = 1 << 6,
+};
+
+/*
+ * Handler installed by main() for SIGILL, SIGFPE and SIGSEGV: report the
+ * signal number as a failure and exit with status 1.
+ */
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+/*
+ * Call every fcmovCC wrapper with EFLAGS values 0, CF, ZF and PF and compare
+ * the result against 1.0 (move not done) or not 1.0 (move done).  Each
+ * mismatch ORs 1 into err, so the return value is 0 on success and 1 on any
+ * failure.
+ */
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting fcmovCC instructions\n");
+ /* If fcmovCC() returns 1.0, the move wasn't done */
+ err |= !(fcmovb(0) == 1.0); err |= !(fcmovnb(0) != 1.0);
+ err |= !(fcmove(0) == 1.0); err |= !(fcmovne(0) != 1.0);
+ err |= !(fcmovbe(0) == 1.0); err |= !(fcmovnbe(0) != 1.0);
+ err |= !(fcmovu(0) == 1.0); err |= !(fcmovnu(0) != 1.0);
+
+ err |= !(fcmovb(CF) != 1.0); err |= !(fcmovnb(CF) == 1.0);
+ err |= !(fcmove(CF) == 1.0); err |= !(fcmovne(CF) != 1.0);
+ err |= !(fcmovbe(CF) != 1.0); err |= !(fcmovnbe(CF) == 1.0);
+ err |= !(fcmovu(CF) == 1.0); err |= !(fcmovnu(CF) != 1.0);
+
+ err |= !(fcmovb(ZF) == 1.0); err |= !(fcmovnb(ZF) != 1.0);
+ err |= !(fcmove(ZF) != 1.0); err |= !(fcmovne(ZF) == 1.0);
+ err |= !(fcmovbe(ZF) != 1.0); err |= !(fcmovnbe(ZF) == 1.0);
+ err |= !(fcmovu(ZF) == 1.0); err |= !(fcmovnu(ZF) != 1.0);
+
+ err |= !(fcmovb(PF) == 1.0); err |= !(fcmovnb(PF) != 1.0);
+ err |= !(fcmove(PF) == 1.0); err |= !(fcmovne(PF) != 1.0);
+ err |= !(fcmovbe(PF) == 1.0); err |= !(fcmovnbe(PF) != 1.0);
+ err |= !(fcmovu(PF) != 1.0); err |= !(fcmovnu(PF) == 1.0);
+
+ if (!err)
+ printf("[OK]\tfcmovCC\n");
+ else
+ printf("[FAIL]\tfcmovCC errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_FCOMI.c b/tools/testing/selftests/x86/test_FCOMI.c
new file mode 100644
index 000000000..aec6692c6
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FCOMI.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+/*
+ * EFLAGS bit masks (bits 0, 2 and 6).  ARITH is the mask of all three and
+ * is used to isolate them in the flags captured after each comparison.
+ */
+enum {
+ CF = 1 << 0,
+ PF = 1 << 2,
+ ZF = 1 << 6,
+ ARITH = CF | PF | ZF,
+};
+
+long res_fcomi_pi_1;
+long res_fcomi_1_pi;
+long res_fcomi_1_1;
+long res_fcomi_nan_1;
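+/* IEEE 754 single: qNaN is s|111 1111 1|1xx xxxx xxxx xxxx xxxx xxxx, sNaN is s|111 1111 1|0xx xxxx xxxx xxxx xxxx xxxx (some x must be nonzero) */
+/* NOTE: the two names below are inverted: 'snan' (0x7fc11111) has the quiet bit set, 'qnan' (0x7f811111) does not */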
+int snan = 0x7fc11111;
+int qnan = 0x7f811111;
+unsigned short snan1[5];
+/* sNaN80 is s|111 1111 1111 1111 |10xx xx...xx (some x must be nonzero) */
+unsigned short snan80[5] = { 0x1111, 0x1111, 0x1111, 0x8111, 0x7fff };
+
+/*
+ * Run fcomi three times, each time with EFLAGS preloaded from @flags:
+ * st(0) = pi vs st(1) = 1 must leave CF, PF and ZF clear; st(0) = 1 vs
+ * st(1) = pi must set only CF; 1 vs 1 must set only ZF.  FE_INVALID must not
+ * be raised.  Returns 0 on success, 1 (after printing [BAD]) on the first
+ * mismatch.
+ */
+int test(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fldpi""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_pi""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fldpi""\n"
+ " fld1""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_pi_1""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fld1""\n"
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_1_pi & ARITH) != (0)) {
+ printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+ return 1;
+ }
+ if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+ printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+ return 1;
+ }
+ if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+ printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * fcomi of 1.0 in st(0) against the value loaded from the global 'qnan' in
+ * st(1), with EFLAGS preloaded from @flags.  Expects ZF, CF and PF all set
+ * and FE_INVALID raised by the comparison.  Returns 0 on success, 1 on
+ * mismatch.
+ */
+int test_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fcomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Same setup as test_qnan() but using fucomi.  Expects ZF, CF and PF all
+ * set and FE_INVALID left clear after the comparison.  Returns 0 on success,
+ * 1 on mismatch.
+ */
+int testu_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fucomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * fucomi of 1.0 in st(0) against the 80-bit value loaded from 'snan80' in
+ * st(1), with EFLAGS preloaded from @flags.  Expects ZF, CF and PF all set
+ * and FE_INVALID raised.  Returns 0 on success, 1 on mismatch.
+ */
+int testu_snan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+// " flds snan""\n" // WRONG, this will convert 32-bit fp snan to a *qnan* in 80-bit fp register!
+// " fstpt snan1""\n" // if uncommented, it prints "snan1:7fff c111 1100 0000 0000" - c111, not 8111!
+// " fnclex""\n" // flds of a snan raised FE_INVALID, clear it
+ " fldt snan80""\n" // fldt never raise FE_INVALID
+ " fld1""\n"
+ " fucomi %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " ffree %%st(1)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+// printf("snan:%x snan1:%04x %04x %04x %04x %04x\n", snan, snan1[4], snan1[3], snan1[2], snan1[1], snan1[0]);
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Same three comparisons and expectations as test(), but using fcomip, so
+ * only one ffree is needed after each comparison.  Returns 0 on success,
+ * 1 on the first mismatch.
+ */
+int testp(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fldpi""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_pi""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fldpi""\n"
+ " fld1""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_pi_1""\n"
+
+ " push %0""\n"
+ " popf""\n"
+ " fld1""\n"
+ " fld1""\n"
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_1_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_1_pi & ARITH) != (0)) {
+ printf("[BAD]\tfcomi_1_pi with flags:%lx\n", flags);
+ return 1;
+ }
+ if ((res_fcomi_pi_1 & ARITH) != (CF)) {
+ printf("[BAD]\tfcomi_pi_1 with flags:%lx->%lx\n", flags, res_fcomi_pi_1 & ARITH);
+ return 1;
+ }
+ if ((res_fcomi_1_1 & ARITH) != (ZF)) {
+ printf("[BAD]\tfcomi_1_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * fcomip variant of test_qnan(): compares 1.0 against the value loaded from
+ * the global 'qnan'.  Expects ZF, CF and PF all set and FE_INVALID raised.
+ * Returns 0 on success, 1 on mismatch.
+ */
+int testp_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fcomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != FE_INVALID) {
+ printf("[BAD]\tFE_INVALID is not set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * fucomip variant of testu_qnan(): compares 1.0 against the value loaded
+ * from the global 'qnan'.  Expects ZF, CF and PF all set and FE_INVALID left
+ * clear.  Returns 0 on success, 1 on mismatch.
+ */
+int testup_qnan(long flags)
+{
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+
+ asm ("\n"
+ " push %0""\n"
+ " popf""\n"
+ " flds qnan""\n"
+ " fld1""\n"
+ " fnclex""\n" // fld of a qnan raised FE_INVALID, clear it
+ " fucomip %%st(1), %%st" "\n"
+ " ffree %%st(0)" "\n"
+ " pushf""\n"
+ " pop res_fcomi_nan_1""\n"
+ :
+ : "r" (flags)
+ );
+ if ((res_fcomi_nan_1 & ARITH) != (ZF|CF|PF)) {
+ printf("[BAD]\tfcomi_qnan_1 with flags:%lx\n", flags);
+ return 1;
+ }
+ if (fetestexcept(FE_INVALID) != 0) {
+ printf("[BAD]\tFE_INVALID is set in %s\n", __func__);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Handler installed by main() for SIGILL, SIGFPE and SIGSEGV: report the
+ * signal number as a failure and exit with status 1.
+ */
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+/*
+ * Run every f[u]comi[p] check twice: once with the arithmetic flags
+ * initially clear (0) and once with all of them initially set (CF|ZF|PF).
+ * Each helper returns 0 or 1, so err, and the exit status, is 0 on success
+ * and 1 on any failure.
+ */
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fcomi emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting f[u]comi[p] instructions\n");
+ err |= test(0);
+ err |= test_qnan(0);
+ err |= testu_qnan(0);
+ err |= testu_snan(0);
+ err |= test(CF|ZF|PF);
+ err |= test_qnan(CF|ZF|PF);
+ err |= testu_qnan(CF|ZF|PF);
+ err |= testu_snan(CF|ZF|PF);
+ err |= testp(0);
+ err |= testp_qnan(0);
+ err |= testup_qnan(0);
+ err |= testp(CF|ZF|PF);
+ err |= testp_qnan(CF|ZF|PF);
+ err |= testup_qnan(CF|ZF|PF);
+ if (!err)
+ printf("[OK]\tf[u]comi[p]\n");
+ else
+ printf("[FAIL]\tf[u]comi[p] errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_FISTTP.c b/tools/testing/selftests/x86/test_FISTTP.c
new file mode 100644
index 000000000..09789c0ce
--- /dev/null
+++ b/tools/testing/selftests/x86/test_FISTTP.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <fenv.h>
+
+unsigned long long res64 = -1;
+unsigned int res32 = -1;
+unsigned short res16 = -1;
+
+/*
+ * Exercise fisttp at all three operand sizes (res16, res32, res64) with four
+ * inputs and check both the stored integer and the FP exception state:
+ * 1.0 -> 1 with no exception; pi -> 3 with FE_INEXACT; -pi -> -3 with
+ * FE_INEXACT; ln2 -> 0 with FE_INEXACT.
+ * Returns 0 on success, 1 (after printing [BAD]) on the first mismatch.
+ */
+int test(void)
+{
+ int ex;
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fld1""\n"
+ " fisttp res16""\n"
+ " fld1""\n"
+ " fisttpl res32""\n"
+ " fld1""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ if (res16 != 1 || res32 != 1 || res64 != 1) {
+ printf("[BAD]\tfisttp 1\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != 0) {
+ printf("[BAD]\tfisttp 1: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldpi""\n"
+ " fisttp res16""\n"
+ " fldpi""\n"
+ " fisttpl res32""\n"
+ " fldpi""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ if (res16 != 3 || res32 != 3 || res64 != 3) {
+ printf("[BAD]\tfisttp pi\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp pi: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttp res16""\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttpl res32""\n"
+ " fldpi""\n"
+ " fchs""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ /* The result variables are unsigned, so -3 appears as 0x...fffd. */
+ if (res16 != 0xfffd || res32 != 0xfffffffd || res64 != 0xfffffffffffffffdULL) {
+ printf("[BAD]\tfisttp -pi\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp -pi: wrong exception state\n");
+ return 1;
+ }
+
+ feclearexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ asm volatile ("\n"
+ " fldln2""\n"
+ " fisttp res16""\n"
+ " fldln2""\n"
+ " fisttpl res32""\n"
+ " fldln2""\n"
+ " fisttpll res64""\n"
+ : : : "memory"
+ );
+ /* Test truncation to zero (round-to-nearest would give 1 here) */
+ if (res16 != 0 || res32 != 0 || res64 != 0) {
+ printf("[BAD]\tfisttp ln2\n");
+ return 1;
+ }
+ ex = fetestexcept(FE_DIVBYZERO|FE_INEXACT|FE_INVALID|FE_OVERFLOW|FE_UNDERFLOW);
+ if (ex != FE_INEXACT) {
+ printf("[BAD]\tfisttp ln2: wrong exception state\n");
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Handler installed by main() for SIGILL, SIGFPE and SIGSEGV: report the
+ * signal number as a failure and exit with status 1.
+ */
+void sighandler(int sig)
+{
+ printf("[FAIL]\tGot signal %d, exiting\n", sig);
+ exit(1);
+}
+
+/*
+ * Install the failure-reporting signal handlers, run test() once, and print
+ * the overall [OK]/[FAIL] line.  Returns test()'s result (0 or 1).
+ */
+int main(int argc, char **argv, char **envp)
+{
+ int err = 0;
+
+ /* SIGILL triggers on 32-bit kernels w/o fisttp emulation
+ * when run with "no387 nofxsr". Other signals are caught
+ * just in case.
+ */
+ signal(SIGILL, sighandler);
+ signal(SIGFPE, sighandler);
+ signal(SIGSEGV, sighandler);
+
+ printf("[RUN]\tTesting fisttp instructions\n");
+ err |= test();
+ if (!err)
+ printf("[OK]\tfisttp\n");
+ else
+ printf("[FAIL]\tfisttp errors: %d\n", err);
+
+ return err;
+}
diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c
new file mode 100644
index 000000000..f0d876d48
--- /dev/null
+++ b/tools/testing/selftests/x86/test_mremap_vdso.c
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 32-bit test to check vDSO mremap.
+ *
+ * Copyright (c) 2016 Dmitry Safonov
+ * Suggested-by: Andrew Lutomirski
+ */
+/*
+ * Can be built statically:
+ * gcc -Os -Wall -static -m32 test_mremap_vdso.c
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <sys/mman.h>
+#include <sys/auxv.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+
+#define PAGE_SIZE 4096
+
+/*
+ * Try to move the first @size bytes of the mapping at @vdso_addr to a
+ * freshly reserved address range.
+ *
+ * Returns:
+ *   0  the mapping was moved, or no destination could be reserved
+ *      (reported as a warning only);
+ *  -1  mremap() failed with EINVAL; the caller should retry with a
+ *      larger @size;
+ *   1  mremap() failed for any other reason.
+ *
+ * The caller leaves the process through a raw exit syscall, which does
+ * not flush stdio buffers, so every message printed on a path that
+ * returns to that exit is flushed here. On those paths the mapping has
+ * not been moved, so calling into libc is still safe.
+ */
+static int try_to_remap(void *vdso_addr, unsigned long size)
+{
+	void *dest_addr, *new_addr;
+	int saved_errno;
+
+	/* Searching for memory location where to remap */
+	dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (dest_addr == MAP_FAILED) {
+		printf("[WARN]\tmmap failed (%d): %m\n", errno);
+		fflush(stdout);
+		return 0;
+	}
+
+	printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n",
+		vdso_addr, (unsigned long)vdso_addr + size,
+		dest_addr, (unsigned long)dest_addr + size);
+	fflush(stdout);
+
+	new_addr = mremap(vdso_addr, size, size,
+			MREMAP_FIXED|MREMAP_MAYMOVE, dest_addr);
+	if (new_addr == MAP_FAILED) {
+		/* Capture errno before munmap() gets a chance to change it */
+		saved_errno = errno;
+		munmap(dest_addr, size);
+		if (saved_errno == EINVAL) {
+			printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n");
+			return -1; /* Retry with larger */
+		}
+		errno = saved_errno; /* %m formats the current errno */
+		printf("[FAIL]\tmremap failed (%d): %m\n", saved_errno);
+		fflush(stdout);
+		return 1;
+	}
+
+	return 0;
+
+}
+
+/*
+ * Fork a child that tries to mremap() the vDSO, then report from the
+ * parent whether the child survived.
+ *
+ * Returns 0 when the child exited with status 0 (or in the child when
+ * getauxval() gave no usable address), and 1 when fork() failed, the
+ * child did not exit normally, or it exited with a non-zero status.
+ */
+int main(int argc, char **argv, char **envp)
+{
+ pid_t child;
+
+ child = fork();
+ if (child == -1) {
+ printf("[WARN]\tfailed to fork (%d): %m\n", errno);
+ return 1;
+ }
+
+ if (child == 0) {
+ unsigned long vdso_size = PAGE_SIZE;
+ unsigned long auxval;
+ int ret = -1;
+
+ auxval = getauxval(AT_SYSINFO_EHDR);
+ printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval);
+ if (!auxval || auxval == -ENOENT) {
+ /* Nothing to move: warn and let the child exit with 0 */
+ printf("[WARN]\tgetauxval failed\n");
+ return 0;
+ }
+
+ /* Simpler than parsing ELF header */
+ /* try_to_remap() returns -1 to ask for a retry one page larger */
+ while (ret < 0) {
+ ret = try_to_remap((void *)auxval, vdso_size);
+ vdso_size += PAGE_SIZE;
+ }
+
+#ifdef __i386__
+ /* Glibc is likely to explode now - exit with raw syscall */
+ asm volatile ("int $0x80" : : "a" (__NR_exit), "b" (!!ret));
+#else /* __x86_64__ */
+ syscall(SYS_exit, ret);
+#endif
+ } else {
+ int status;
+
+ /* Anything other than a normal exit of the child is a failure */
+ if (waitpid(child, &status, 0) != child ||
+ !WIFEXITED(status)) {
+ printf("[FAIL]\tmremap() of the vDSO does not work on this kernel!\n");
+ return 1;
+ } else if (WEXITSTATUS(status) != 0) {
+ printf("[FAIL]\tChild failed with %d\n",
+ WEXITSTATUS(status));
+ return 1;
+ }
+ printf("[OK]\n");
+ }
+
+ return 0;
+}
diff --git a/tools/testing/selftests/x86/test_syscall_vdso.c b/tools/testing/selftests/x86/test_syscall_vdso.c
new file mode 100644
index 000000000..8965c311b
--- /dev/null
+++ b/tools/testing/selftests/x86/test_syscall_vdso.c
@@ -0,0 +1,400 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * 32-bit syscall ABI conformance test.
+ *
+ * Copyright (c) 2015 Denys Vlasenko
+ */
+/*
+ * Can be built statically:
+ * gcc -Os -Wall -static -m32 test_syscall_vdso.c thunks_32.S
+ */
+#undef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#undef __USE_GNU
+#define __USE_GNU 1
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/select.h>
+#include <sys/time.h>
+#include <elf.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+
+#if !defined(__i386__)
+/*
+ * Stub used when the file is not compiled for __i386__: print a [SKIP]
+ * line and return 0 without running any test.
+ */
+int main(int argc, char **argv, char **envp)
+{
+ printf("[SKIP]\tNot a 32-bit x86 userspace\n");
+ return 0;
+}
+#else
+
+long syscall_addr;
+/*
+ * Look up the AT_SYSINFO value in the auxiliary vector, which is reached
+ * by walking past the NULL that terminates @envp.
+ *
+ * Returns the a_val of the AT_SYSINFO entry, or 0 (after printing a
+ * warning) when the vector ends without one.
+ */
+long get_syscall(char **envp)
+{
+	Elf32_auxv_t *entry;
+
+	/* Skip every environment string, then the terminating NULL itself */
+	while (*envp != NULL)
+		envp++;
+	envp++;
+
+	entry = (void *)envp;
+	while (entry->a_type != AT_NULL) {
+		if (entry->a_type == AT_SYSINFO)
+			return entry->a_un.a_val;
+		entry++;
+	}
+
+	printf("[WARN]\tAT_SYSINFO not supplied\n");
+	return 0;
+}
+
+/*
+ * int80: a minimal entry point consisting of "int $0x80; ret", placed in
+ * .text. Its address is stored in syscall_addr so the same
+ * "call *syscall_addr" sequence can issue the syscall via INT 80.
+ */
+asm (
+ " .pushsection .text\n"
+ " .global int80\n"
+ "int80:\n"
+ " int $0x80\n"
+ " ret\n"
+ " .popsection\n"
+);
+extern char int80;
+
+/*
+ * Snapshot of the sixteen 64-bit general purpose registers, in the order
+ * in which get_regs64 stores them (slot 0 = rax ... slot 15 = r15).
+ */
+struct regs64 {
+ uint64_t rax, rbx, rcx, rdx;
+ uint64_t rsi, rdi, rbp, rsp;
+ uint64_t r8, r9, r10, r11;
+ uint64_t r12, r13, r14, r15;
+};
+/* Filled by get_regs64, read by print_regs64() and check_regs64() */
+struct regs64 regs64;
+/* Set in main() when %cs is 0x23; gates all R8..R15 checks */
+int kernel_is_64bit;
+
+/*
+ * 64-bit helpers, assembled with .code64 and reached through
+ * call64_from_32():
+ *
+ * get_regs64    - stores rax..r15 into the global regs64, one 8-byte slot
+ *                 per register. rax is pushed first because it is then
+ *                 reused to hold the address of regs64.
+ * poison_regs64 - loads r8 with 0x7f7f7f7f7f7f7f7f and r9..r15 with the
+ *                 seven following values (each one more than the previous
+ *                 register), the pattern check_regs64() expects.
+ */
+asm (
+ " .pushsection .text\n"
+ " .code64\n"
+ "get_regs64:\n"
+ " push %rax\n"
+ " mov $regs64, %eax\n"
+ " pop 0*8(%rax)\n"
+ " movq %rbx, 1*8(%rax)\n"
+ " movq %rcx, 2*8(%rax)\n"
+ " movq %rdx, 3*8(%rax)\n"
+ " movq %rsi, 4*8(%rax)\n"
+ " movq %rdi, 5*8(%rax)\n"
+ " movq %rbp, 6*8(%rax)\n"
+ " movq %rsp, 7*8(%rax)\n"
+ " movq %r8, 8*8(%rax)\n"
+ " movq %r9, 9*8(%rax)\n"
+ " movq %r10, 10*8(%rax)\n"
+ " movq %r11, 11*8(%rax)\n"
+ " movq %r12, 12*8(%rax)\n"
+ " movq %r13, 13*8(%rax)\n"
+ " movq %r14, 14*8(%rax)\n"
+ " movq %r15, 15*8(%rax)\n"
+ " ret\n"
+ "poison_regs64:\n"
+ " movq $0x7f7f7f7f, %r8\n"
+ " shl $32, %r8\n"
+ " orq $0x7f7f7f7f, %r8\n"
+ " movq %r8, %r9\n"
+ " incq %r9\n"
+ " movq %r9, %r10\n"
+ " incq %r10\n"
+ " movq %r10, %r11\n"
+ " incq %r11\n"
+ " movq %r11, %r12\n"
+ " incq %r12\n"
+ " movq %r12, %r13\n"
+ " incq %r13\n"
+ " movq %r13, %r14\n"
+ " incq %r14\n"
+ " movq %r14, %r15\n"
+ " incq %r15\n"
+ " ret\n"
+ " .code32\n"
+ " .popsection\n"
+);
+extern void get_regs64(void);
+extern void poison_regs64(void);
+/* Defined outside this file; used here to run the 64-bit helpers above */
+extern unsigned long call64_from_32(void (*function)(void));
+/*
+ * Debug helper: dump the register snapshot held in regs64, four registers
+ * per line. Prints nothing unless kernel_is_64bit is set.
+ */
+void print_regs64(void)
+{
+	if (kernel_is_64bit) {
+		printf("ax:%016llx bx:%016llx cx:%016llx dx:%016llx\n", regs64.rax, regs64.rbx, regs64.rcx, regs64.rdx);
+		printf("si:%016llx di:%016llx bp:%016llx sp:%016llx\n", regs64.rsi, regs64.rdi, regs64.rbp, regs64.rsp);
+		printf(" 8:%016llx 9:%016llx 10:%016llx 11:%016llx\n", regs64.r8 , regs64.r9 , regs64.r10, regs64.r11);
+		printf("12:%016llx 13:%016llx 14:%016llx 15:%016llx\n", regs64.r12, regs64.r13, regs64.r14, regs64.r15);
+	}
+}
+
+/*
+ * Compare the R8..R15 values captured in regs64 against the pattern that
+ * poison_regs64 loaded (0x7f7f7f7f7f7f7f7f for R8, one more for each
+ * following register).
+ *
+ * Returns the number of registers that changed in a way that is not
+ * tolerated for the entry point currently selected by syscall_addr;
+ * always 0 when kernel_is_64bit is not set.
+ */
+int check_regs64(void)
+{
+	uint64_t expected = 0x7f7f7f7f7f7f7f7fULL;
+	uint64_t *r64 = &regs64.r8;
+	int err = 0;
+	int num;
+
+	if (!kernel_is_64bit)
+		return 0;
+
+	for (num = 8; num < 16; num++, r64++, expected++) {
+		if (*r64 == expected)
+			continue; /* register did not change */
+
+		if (syscall_addr != (long)&int80) {
+			/*
+			 * Non-INT80 syscall entrypoints are allowed to clobber R8+ regs:
+			 * either clear them to 0, or for R11, load EFLAGS.
+			 */
+			if (*r64 == 0)
+				continue;
+			if (num == 11) {
+				printf("[NOTE]\tR11 has changed:%016llx - assuming clobbered by SYSRET insn\n", *r64);
+				continue;
+			}
+		}
+		/*
+		 * Otherwise (INT80): the INT80 syscall entrypoint can be
+		 * used by 64-bit programs too, unlike SYSCALL/SYSENTER.
+		 * Therefore it must preserve R12+
+		 * (they are callee-saved registers in 64-bit C ABI).
+		 *
+		 * Starting in Linux 4.17 (and any kernel that
+		 * backports the change), R8..11 are preserved.
+		 * Historically (and probably unintentionally), they
+		 * were clobbered or zeroed.
+		 */
+		printf("[FAIL]\tR%d has changed:%016llx\n", num, *r64);
+		err++;
+	}
+
+	if (!err)
+		printf("[OK]\tR8..R15 did not leak kernel data\n");
+	return err;
+}
+
+/*
+ * Arguments of the pselect call issued by run_syscall(). They are globals
+ * because the inline assembly there refers to them by symbol name.
+ * prep_args() fills them in.
+ */
+int nfds;
+fd_set rfds;
+fd_set wfds;
+fd_set efds;
+struct timespec timeout;
+sigset_t sigmask;
+/* Sixth argument: pointer to the signal mask plus its size in bytes */
+struct {
+ sigset_t *sp;
+ int sz;
+} sigmask_desc;
+
+/*
+ * Initialise the global pselect arguments used by run_syscall(): one
+ * descriptor in each fd set, a 123 ns timeout, and a signal mask
+ * containing SIGINT, SIGUSR2 and SIGRTMAX.
+ */
+void prep_args()
+{
+	/* Signal mask and its (pointer, size) descriptor */
+	sigemptyset(&sigmask);
+	sigaddset(&sigmask, SIGINT);
+	sigaddset(&sigmask, SIGUSR2);
+	sigaddset(&sigmask, SIGRTMAX);
+	sigmask_desc.sz = 8; /* bytes */
+	sigmask_desc.sp = &sigmask;
+
+	/* Timeout */
+	timeout.tv_nsec = 123;
+	timeout.tv_sec = 0;
+
+	/* Descriptor sets: fd 0 readable, fd 1 writable, fd 2 exceptional */
+	FD_ZERO(&rfds);
+	FD_SET(0, &rfds);
+	FD_ZERO(&wfds);
+	FD_SET(1, &wfds);
+	FD_ZERO(&efds);
+	FD_SET(2, &efds);
+	nfds = 42;
+}
+
+/*
+ * Print "@name=<value>" followed by a textual decoding of bits 21..0 of
+ * @r, highest bit first. The table holds two strings per bit (bit clear,
+ * bit set); an empty string means nothing is printed for that state.
+ * The bit 0 strings end the line.
+ */
+static void print_flags(const char *name, unsigned long r)
+{
+	static const char *bitarray[] = {
+		"\n" ,"c\n" ,/* Carry Flag */
+		"0 " ,"1 " ,/* Bit 1 - always on */
+		"" ,"p " ,/* Parity Flag */
+		"0 " ,"3? " ,
+		"" ,"a " ,/* Auxiliary carry Flag */
+		"0 " ,"5? " ,
+		"" ,"z " ,/* Zero Flag */
+		"" ,"s " ,/* Sign Flag */
+		"" ,"t " ,/* Trap Flag */
+		"" ,"i " ,/* Interrupt Flag */
+		"" ,"d " ,/* Direction Flag */
+		"" ,"o " ,/* Overflow Flag */
+		"0 " ,"1 " ,/* I/O Privilege Level (2 bits) */
+		"0" ,"1" ,/* I/O Privilege Level (2 bits) */
+		"" ,"n " ,/* Nested Task */
+		"0 " ,"15? ",
+		"" ,"r " ,/* Resume Flag */
+		"" ,"v " ,/* Virtual Mode */
+		"" ,"ac " ,/* Alignment Check/Access Control */
+		"" ,"vif ",/* Virtual Interrupt Flag */
+		"" ,"vip ",/* Virtual Interrupt Pending */
+		"" ,"id " ,/* CPUID detection */
+		NULL
+	};
+	int bit;
+
+	printf("%s=%016lx ", name, r);
+	if ((r >> 22) != 0)
+		printf("(extra bits are set) ");
+
+	for (bit = 21; bit >= 0; bit--) {
+		/* Entry 2*bit is the "clear" text, 2*bit + 1 the "set" text */
+		const char *text = bitarray[2 * bit + ((r >> bit) & 1)];
+
+		if (text[0])
+			fputs(text, stdout);
+	}
+}
+
+/*
+ * Issue one six-argument syscall (number 308, pselect per the inline
+ * comment) through the entry point stored in syscall_addr, with known
+ * values in the argument registers and in EFLAGS, then verify that the
+ * argument registers still hold those values afterwards.
+ *
+ * Flag differences are only reported as warnings. On a 64-bit kernel
+ * R8..R15 are poisoned before the call and captured after it.
+ *
+ * Returns 1 when an argument register was clobbered, otherwise the result
+ * of check_regs64().
+ */
+int run_syscall(void)
+{
+ long flags, bad_arg;
+
+ prep_args();
+
+ if (kernel_is_64bit)
+ call64_from_32(poison_regs64);
+ /*print_regs64();*/
+
+ /*
+ * On exit from the asm, eax holds the EFLAGS value read right after
+ * the syscall and ebx the number (1..6) of the first argument register
+ * that changed, or 0 when none did.
+ */
+ asm("\n"
+ /* Try 6-arg syscall: pselect. It should return quickly */
+ " push %%ebp\n"
+ " mov $308, %%eax\n" /* PSELECT */
+ " mov nfds, %%ebx\n" /* ebx arg1 */
+ " mov $rfds, %%ecx\n" /* ecx arg2 */
+ " mov $wfds, %%edx\n" /* edx arg3 */
+ " mov $efds, %%esi\n" /* esi arg4 */
+ " mov $timeout, %%edi\n" /* edi arg5 */
+ " mov $sigmask_desc, %%ebp\n" /* %ebp arg6 */
+ " push $0x200ed7\n" /* set almost all flags */
+ " popf\n" /* except TF, IOPL, NT, RF, VM, AC, VIF, VIP */
+ " call *syscall_addr\n"
+ /* Check that registers are not clobbered */
+ " pushf\n"
+ " pop %%eax\n"
+ " cld\n"
+ " cmp nfds, %%ebx\n" /* ebx arg1 */
+ " mov $1, %%ebx\n"
+ " jne 1f\n"
+ " cmp $rfds, %%ecx\n" /* ecx arg2 */
+ " mov $2, %%ebx\n"
+ " jne 1f\n"
+ " cmp $wfds, %%edx\n" /* edx arg3 */
+ " mov $3, %%ebx\n"
+ " jne 1f\n"
+ " cmp $efds, %%esi\n" /* esi arg4 */
+ " mov $4, %%ebx\n"
+ " jne 1f\n"
+ " cmp $timeout, %%edi\n" /* edi arg5 */
+ " mov $5, %%ebx\n"
+ " jne 1f\n"
+ " cmpl $sigmask_desc, %%ebp\n" /* %ebp arg6 */
+ " mov $6, %%ebx\n"
+ " jne 1f\n"
+ " mov $0, %%ebx\n"
+ "1:\n"
+ " pop %%ebp\n"
+ : "=a" (flags), "=b" (bad_arg)
+ :
+ : "cx", "dx", "si", "di"
+ );
+
+ if (kernel_is_64bit) {
+ /* Pre-fill with 0x77 bytes, then capture the registers */
+ memset(&regs64, 0x77, sizeof(regs64));
+ call64_from_32(get_regs64);
+ /*print_regs64();*/
+ }
+
+ /*
+ * On paravirt kernels, flags are not preserved across syscalls.
+ * Thus, we do not consider it a bug if some are changed.
+ * We just show ones which do.
+ */
+ if ((0x200ed7 ^ flags) != 0) {
+ print_flags("[WARN]\tFlags before", 0x200ed7);
+ print_flags("[WARN]\tFlags after", flags);
+ print_flags("[WARN]\tFlags change", (0x200ed7 ^ flags));
+ }
+
+ if (bad_arg) {
+ printf("[FAIL]\targ#%ld clobbered\n", bad_arg);
+ return 1;
+ }
+ printf("[OK]\tArguments are preserved across syscall\n");
+
+ return check_regs64();
+}
+
+/*
+ * Run run_syscall() through the entry point currently stored in
+ * syscall_addr (skipped when that is 0) and then through int80, restoring
+ * syscall_addr afterwards.
+ *
+ * Returns the sum of the run_syscall() results.
+ */
+int run_syscall_twice()
+{
+	long saved_addr = syscall_addr;
+	int failures = 0;
+
+	if (saved_addr) {
+		printf("[RUN]\tExecuting 6-argument 32-bit syscall via VDSO\n");
+		failures = run_syscall();
+	}
+
+	syscall_addr = (long)&int80;
+	printf("[RUN]\tExecuting 6-argument 32-bit syscall via INT 80\n");
+	failures += run_syscall();
+	syscall_addr = saved_addr;
+
+	return failures;
+}
+
+/*
+ * Fork and continue running the remaining tests in the child under
+ * ptrace, with the parent acting as a tracer that resumes the child
+ * with PTRACE_SYSCALL at every stop.
+ *
+ * Returns only in the child. The parent never returns: it exits with the
+ * child's exit status, with the number of the signal that terminated the
+ * child, or with 255 when waitpid() fails or reports an unexpected state.
+ * If fork() fails the process exits with 1; if PTRACE_TRACEME fails the
+ * child exits with 0.
+ */
+void ptrace_me()
+{
+	pid_t pid;
+
+	/* Flush stdio so buffered output is not duplicated by fork() */
+	fflush(NULL);
+	pid = fork();
+	if (pid < 0)
+		exit(1);
+	if (pid == 0) {
+		/* child */
+		if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) != 0)
+			exit(0);
+		raise(SIGSTOP);
+		return;
+	}
+	/* parent */
+	printf("[RUN]\tRunning tests under ptrace\n");
+	while (1) {
+		int status;
+		pid = waitpid(-1, &status, __WALL);
+		/*
+		 * Check the return value first: status is only written when
+		 * waitpid() succeeds.
+		 */
+		if (pid <= 0)
+			exit(255);
+		if (WIFEXITED(status))
+			exit(WEXITSTATUS(status));
+		if (WIFSIGNALED(status))
+			exit(WTERMSIG(status));
+		if (!WIFSTOPPED(status)) /* paranoia */
+			exit(255);
+		/*
+		 * Note: we do not inject sig = WSTOPSIG(status).
+		 * We probably should, but careful: do not inject SIGTRAP
+		 * generated by syscall entry/exit stops.
+		 * That kills the child.
+		 */
+		ptrace(PTRACE_SYSCALL, pid, 0L, 0L /*sig*/);
+	}
+}
+
+/*
+ * Entry point of the 32-bit syscall ABI test: detect a 64-bit kernel
+ * from the %cs selector, find the AT_SYSINFO entry point, then run the
+ * syscall checks once directly and once more after ptrace_me().
+ *
+ * Returns the accumulated failure count of both rounds.
+ */
+int main(int argc, char **argv, char **envp)
+{
+	int failures;
+	int cs;
+
+	asm("\n"
+	" movl %%cs, %%eax\n"
+	: "=a" (cs)
+	);
+	kernel_is_64bit = (cs == 0x23);
+	if (!kernel_is_64bit)
+		printf("[NOTE]\tNot a 64-bit kernel, won't test R8..R15 leaks\n");
+
+	/* This only works for non-static builds:
+	 * syscall_addr = dlsym(dlopen("linux-gate.so.1", RTLD_NOW), "__kernel_vsyscall");
+	 */
+	syscall_addr = get_syscall(envp);
+
+	failures = run_syscall_twice();
+	ptrace_me();
+	failures += run_syscall_twice();
+
+	return failures;
+}
+#endif
diff --git a/tools/testing/selftests/x86/test_vdso.c b/tools/testing/selftests/x86/test_vdso.c
new file mode 100644
index 000000000..42052db0f
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vdso.c
@@ -0,0 +1,342 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * test_vdso.c - Test cases for the vDSO (clock_gettime, gettimeofday, getcpu)
+ * Copyright (c) 2011-2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <limits.h>
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+# define SYS_getcpu 309
+# else
+# define SYS_getcpu 318
+# endif
+#endif
+
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
+/* Number of failed checks; main() exits with 1 when it is non-zero */
+int nerrs = 0;
+
+/* Pointers below stay NULL until fill_function_pointers() resolves them */
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+
+vgettime_t vdso_clock_gettime;
+
+typedef long (*vgtod_t)(struct timeval *tv, struct timezone *tz);
+
+vgtod_t vdso_gettimeofday;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+
+/* vgetcpu: vsyscall-page getcpu; vdso_getcpu: __vdso_getcpu */
+getcpu_t vgetcpu;
+getcpu_t vdso_getcpu;
+
+/*
+ * Return the fixed address of the vsyscall getcpu entry when
+ * /proc/self/maps lists a "[vsyscall]" mapping, NULL otherwise.
+ *
+ * Always returns NULL when not built for x86_64 or when the maps file
+ * cannot be opened. Lines that do not fit in MAPS_LINE_LEN bytes do not
+ * parse as a complete entry and are skipped.
+ */
+static void *vsyscall_getcpu(void)
+{
+#ifdef __x86_64__
+	char line[MAPS_LINE_LEN];
+	void *entry = NULL;
+	FILE *maps;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) /* might still be present, but ignore it here, as we test vDSO not vsyscall */
+		return NULL;
+
+	while (!entry && fgets(line, MAPS_LINE_LEN, maps)) {
+		char name[MAPS_LINE_LEN];
+		void *start, *end;
+		char r, x;
+
+		/* name[] is as large as line[], so the unbounded %s cannot overflow it */
+		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+			   &start, &end, &r, &x, name) == 5 &&
+		    strcmp(name, "[vsyscall]") == 0) {
+			/* assume entries are OK, as we test vDSO here not vsyscall */
+			entry = (void *) (0xffffffffff600800);
+		}
+	}
+
+	fclose(maps);
+
+	if (!entry)
+		printf("Warning: failed to find vsyscall getcpu\n");
+	return entry;
+#else
+	return NULL;
+#endif
+}
+
+
+/*
+ * Resolve the vDSO entry points under test and the vsyscall getcpu
+ * address into the global function pointers. A missing symbol only
+ * produces a warning and leaves its pointer NULL; if no vDSO is loaded at
+ * all, nothing is resolved (including vgetcpu).
+ */
+static void fill_function_pointers()
+{
+	static const char * const vdso_names[] = {
+		"linux-vdso.so.1",
+		"linux-gate.so.1",
+	};
+	void *vdso = NULL;
+	unsigned int i;
+
+	/* RTLD_NOLOAD: only obtain a handle to an already mapped object */
+	for (i = 0; !vdso && i < sizeof(vdso_names) / sizeof(vdso_names[0]); i++)
+		vdso = dlopen(vdso_names[i],
+			      RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso) {
+		printf("[WARN]\tfailed to find vDSO\n");
+		return;
+	}
+
+	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+	if (vdso_getcpu == NULL)
+		printf("Warning: failed to find getcpu in vDSO\n");
+
+	vgetcpu = (getcpu_t) vsyscall_getcpu();
+
+	vdso_clock_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+	if (vdso_clock_gettime == NULL)
+		printf("Warning: failed to find clock_gettime in vDSO\n");
+
+	vdso_gettimeofday = (vgtod_t)dlsym(vdso, "__vdso_gettimeofday");
+	if (vdso_gettimeofday == NULL)
+		printf("Warning: failed to find gettimeofday in vDSO\n");
+
+}
+
+/*
+ * Invoke the getcpu system call directly, bypassing vDSO and vsyscall.
+ *
+ * Uses SYS_getcpu rather than __NR_getcpu so that the fallback definition
+ * near the top of this file takes effect when the installed headers do
+ * not provide the syscall number.
+ *
+ * Returns the value returned by syscall().
+ */
+static long sys_getcpu(unsigned * cpu, unsigned * node,
+		       void* cache)
+{
+	return syscall(SYS_getcpu, cpu, node, cache);
+}
+
+/* Invoke the clock_gettime system call directly via syscall(). */
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ return syscall(__NR_clock_gettime, id, ts);
+}
+
+/* Invoke the gettimeofday system call directly via syscall(). */
+static inline int sys_gettimeofday(struct timeval *tv, struct timezone *tz)
+{
+ return syscall(__NR_gettimeofday, tv, tz);
+}
+
+/*
+ * Pin the process to CPU 0, 1, 2, ... in turn and compare what the
+ * getcpu syscall, the vDSO and the vsyscall page (where available)
+ * report. The loop ends at the first CPU number for which
+ * sched_setaffinity() fails.
+ *
+ * The reference node is taken from the first source that succeeded;
+ * every successful source must report the pinned CPU and that node.
+ * Each mismatching CPU increments nerrs once.
+ */
+static void test_getcpu(void)
+{
+	int cpu;
+
+	printf("[RUN]\tTesting getcpu...\n");
+
+	for (cpu = 0; ; cpu++) {
+		unsigned cpu_sys, cpu_vdso, cpu_vsys;
+		unsigned node_sys, node_vdso, node_vsys;
+		long ret_sys, ret_vdso = 1, ret_vsys = 1;
+		unsigned node;
+		cpu_set_t cpuset;
+		bool ok;
+
+		CPU_ZERO(&cpuset);
+		CPU_SET(cpu, &cpuset);
+		if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+			break;
+
+		ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+		if (vdso_getcpu)
+			ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+		if (vgetcpu)
+			ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+		/* First successful source defines the expected node */
+		if (ret_sys == 0)
+			node = node_sys;
+		else if (ret_vdso == 0)
+			node = node_vdso;
+		else if (ret_vsys == 0)
+			node = node_vsys;
+
+		/* A source passes if it failed or it agrees on CPU and node */
+		ok = (ret_sys || (cpu_sys == cpu && node_sys == node)) &&
+		     (ret_vdso || (cpu_vdso == cpu && node_vdso == node)) &&
+		     (ret_vsys || (cpu_vsys == cpu && node_vsys == node));
+
+		printf("[%s]\tCPU %u:", ok ? "OK" : "FAIL", cpu);
+		if (ret_sys == 0)
+			printf(" syscall: cpu %u, node %u", cpu_sys, node_sys);
+		if (ret_vdso == 0)
+			printf(" vdso: cpu %u, node %u", cpu_vdso, node_vdso);
+		if (ret_vsys == 0)
+			printf(" vsyscall: cpu %u, node %u", cpu_vsys,
+			       node_vsys);
+		printf("\n");
+
+		if (!ok)
+			nerrs++;
+	}
+}
+
+/*
+ * Return true when timespec @a is not later than @b: seconds are
+ * compared first, nanoseconds only break a tie.
+ */
+static bool ts_leq(const struct timespec *a, const struct timespec *b)
+{
+	return (a->tv_sec == b->tv_sec) ? (a->tv_nsec <= b->tv_nsec)
+					: (a->tv_sec < b->tv_sec);
+}
+
+/*
+ * Return true when timeval @a is not later than @b: seconds are compared
+ * first, microseconds only break a tie.
+ */
+static bool tv_leq(const struct timeval *a, const struct timeval *b)
+{
+	return (a->tv_sec == b->tv_sec) ? (a->tv_usec <= b->tv_usec)
+					: (a->tv_sec < b->tv_sec);
+}
+
+/*
+ * Printable names indexed by clock id; test_clock_gettime() tests every
+ * id that has an entry here.
+ */
+static char const * const clocknames[] = {
+ [0] = "CLOCK_REALTIME",
+ [1] = "CLOCK_MONOTONIC",
+ [2] = "CLOCK_PROCESS_CPUTIME_ID",
+ [3] = "CLOCK_THREAD_CPUTIME_ID",
+ [4] = "CLOCK_MONOTONIC_RAW",
+ [5] = "CLOCK_REALTIME_COARSE",
+ [6] = "CLOCK_MONOTONIC_COARSE",
+ [7] = "CLOCK_BOOTTIME",
+ [8] = "CLOCK_REALTIME_ALARM",
+ [9] = "CLOCK_BOOTTIME_ALARM",
+ [10] = "CLOCK_SGI_CYCLE",
+ [11] = "CLOCK_TAI",
+};
+
+/*
+ * Check the vDSO clock_gettime for one clock id.
+ *
+ * If the syscall rejects the id with EINVAL, the vDSO must return
+ * -EINVAL as well. Any other syscall error is only warned about.
+ * Otherwise a vDSO reading is taken between two syscall readings and
+ * must not lie outside them. Failures increment nerrs.
+ */
+static void test_one_clock_gettime(int clock, const char *name)
+{
+	struct timespec start, vdso, end;
+	int vdso_ret, end_ret;
+
+	printf("[RUN]\tTesting clock_gettime for clock %s (%d)...\n", name, clock);
+
+	if (sys_clock_gettime(clock, &start) < 0) {
+		if (errno != EINVAL) {
+			printf("[WARN]\t clock_gettime(%d) syscall returned error %d\n", clock, errno);
+			return;
+		}
+
+		vdso_ret = vdso_clock_gettime(clock, &vdso);
+		if (vdso_ret != -EINVAL) {
+			printf("[FAIL]\tNo such clock, but __vdso_clock_gettime returned %d\n", vdso_ret);
+			nerrs++;
+		} else {
+			printf("[OK]\tNo such clock.\n");
+		}
+		return;
+	}
+
+	vdso_ret = vdso_clock_gettime(clock, &vdso);
+	end_ret = sys_clock_gettime(clock, &end);
+
+	if (!(vdso_ret == 0 && end_ret == 0)) {
+		printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n",
+		       vdso_ret, errno);
+		nerrs++;
+		return;
+	}
+
+	printf("\t%llu.%09ld %llu.%09ld %llu.%09ld\n",
+	       (unsigned long long)start.tv_sec, start.tv_nsec,
+	       (unsigned long long)vdso.tv_sec, vdso.tv_nsec,
+	       (unsigned long long)end.tv_sec, end.tv_nsec);
+
+	/* Expect start <= vdso <= end */
+	if (!(ts_leq(&start, &vdso) && ts_leq(&vdso, &end))) {
+		printf("[FAIL]\tTimes are out of sequence\n");
+		nerrs++;
+	}
+}
+
+/*
+ * Run test_one_clock_gettime() for every clock id listed in clocknames
+ * and for three invalid ids. Skipped entirely when the vDSO
+ * clock_gettime was not found.
+ */
+static void test_clock_gettime(void)
+{
+	static const int invalid_ids[] = { -1, INT_MIN, INT_MAX };
+	const int nclocks = sizeof(clocknames) / sizeof(clocknames[0]);
+	unsigned int i;
+	int clock;
+
+	if (!vdso_clock_gettime) {
+		printf("[SKIP]\tNo vDSO, so skipping clock_gettime() tests\n");
+		return;
+	}
+
+	for (clock = 0; clock < nclocks; clock++)
+		test_one_clock_gettime(clock, clocknames[clock]);
+
+	/* Also test some invalid clock ids */
+	for (i = 0; i < sizeof(invalid_ids) / sizeof(invalid_ids[0]); i++)
+		test_one_clock_gettime(invalid_ids[i], "invalid");
+}
+
+/*
+ * Check the vDSO gettimeofday: its timeval must lie between two syscall
+ * readings and its timezone must match the one the syscall reported.
+ * Finally call it with a NULL timezone to make sure that does not crash.
+ * Does nothing when the vDSO symbol was not found; failures increment
+ * nerrs.
+ */
+static void test_gettimeofday(void)
+{
+	struct timeval start, vdso, end;
+	struct timezone sys_tz, vdso_tz;
+	int vdso_ret, end_ret;
+	bool tz_same;
+
+	if (!vdso_gettimeofday)
+		return;
+
+	printf("[RUN]\tTesting gettimeofday...\n");
+
+	if (sys_gettimeofday(&start, &sys_tz) < 0) {
+		printf("[FAIL]\tsys_gettimeofday failed (%d)\n", errno);
+		nerrs++;
+		return;
+	}
+
+	vdso_ret = vdso_gettimeofday(&vdso, &vdso_tz);
+	end_ret = sys_gettimeofday(&end, NULL);
+
+	if (!(vdso_ret == 0 && end_ret == 0)) {
+		printf("[FAIL]\tvDSO returned %d, syscall errno=%d\n",
+		       vdso_ret, errno);
+		nerrs++;
+		return;
+	}
+
+	printf("\t%llu.%06ld %llu.%06ld %llu.%06ld\n",
+	       (unsigned long long)start.tv_sec, start.tv_usec,
+	       (unsigned long long)vdso.tv_sec, vdso.tv_usec,
+	       (unsigned long long)end.tv_sec, end.tv_usec);
+
+	/* Expect start <= vdso <= end */
+	if (!(tv_leq(&start, &vdso) && tv_leq(&vdso, &end))) {
+		printf("[FAIL]\tTimes are out of sequence\n");
+		nerrs++;
+	}
+
+	tz_same = sys_tz.tz_minuteswest == vdso_tz.tz_minuteswest &&
+		  sys_tz.tz_dsttime == vdso_tz.tz_dsttime;
+	if (!tz_same) {
+		printf("[FAIL]\ttimezones do not match\n");
+		nerrs++;
+	} else {
+		printf("[OK]\ttimezones match: minuteswest=%d, dsttime=%d\n",
+		       sys_tz.tz_minuteswest, sys_tz.tz_dsttime);
+	}
+
+	/* And make sure that passing NULL for tz doesn't crash. */
+	vdso_gettimeofday(&vdso, NULL);
+}
+
+/*
+ * Entry point of the vDSO test: resolve the entry points, then run the
+ * clock_gettime, gettimeofday and getcpu tests.
+ *
+ * Returns 1 when any test incremented nerrs, 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+ fill_function_pointers();
+
+ test_clock_gettime();
+ test_gettimeofday();
+
+ /*
+ * Test getcpu() last so that, if something goes wrong setting affinity,
+ * we still run the other tests.
+ */
+ test_getcpu();
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/test_vsyscall.c b/tools/testing/selftests/x86/test_vsyscall.c
new file mode 100644
index 000000000..5b45e6986
--- /dev/null
+++ b/tools/testing/selftests/x86/test_vsyscall.c
@@ -0,0 +1,583 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <errno.h>
+#include <err.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <setjmp.h>
+#include <sys/uio.h>
+
+#include "helpers.h"
+
+#ifdef __x86_64__
+# define VSYS(x) (x)
+#else
+# define VSYS(x) 0
+#endif
+
+#ifndef SYS_getcpu
+# ifdef __x86_64__
+# define SYS_getcpu 309
+# else
+# define SYS_getcpu 318
+# endif
+#endif
+
+/* max length of lines in /proc/self/maps - anything longer is skipped here */
+#define MAPS_LINE_LEN 128
+
+/*
+ * Install @handler as the SA_SIGINFO-style handler for @sig, with @flags
+ * OR-ed into sa_flags and an empty signal mask. Terminates the program
+ * through err() when sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction action;
+
+	memset(&action, 0, sizeof(action));
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = SA_SIGINFO | flags;
+	action.sa_sigaction = handler;
+
+	if (sigaction(sig, &action, 0) != 0)
+		err(1, "sigaction");
+}
+
+/* vsyscalls and vDSO */
+/* Permissions of the [vsyscall] mapping, as determined by init_vsys() */
+bool vsyscall_map_r = false, vsyscall_map_x = false;
+
+/*
+ * For each function there is a fixed vsyscall-page address (VSYS()
+ * turns it into 0 when not building for x86_64) and a vDSO pointer that
+ * init_vdso() resolves with dlsym().
+ */
+typedef long (*gtod_t)(struct timeval *tv, struct timezone *tz);
+const gtod_t vgtod = (gtod_t)VSYS(0xffffffffff600000);
+gtod_t vdso_gtod;
+
+typedef int (*vgettime_t)(clockid_t, struct timespec *);
+vgettime_t vdso_gettime;
+
+typedef long (*time_func_t)(time_t *t);
+const time_func_t vtime = (time_func_t)VSYS(0xffffffffff600400);
+time_func_t vdso_time;
+
+typedef long (*getcpu_t)(unsigned *, unsigned *, void *);
+const getcpu_t vgetcpu = (getcpu_t)VSYS(0xffffffffff600800);
+getcpu_t vdso_getcpu;
+
+/*
+ * Resolve the vDSO entry points used by the tests into the vdso_*
+ * globals. A missing symbol only produces a message and leaves its
+ * pointer NULL; when no vDSO is loaded nothing is resolved.
+ */
+static void init_vdso(void)
+{
+	static const char * const vdso_names[] = {
+		"linux-vdso.so.1",
+		"linux-gate.so.1",
+	};
+	void *vdso = NULL;
+	unsigned int i;
+
+	/* RTLD_NOLOAD: only obtain a handle to an already mapped object */
+	for (i = 0; !vdso && i < sizeof(vdso_names) / sizeof(vdso_names[0]); i++)
+		vdso = dlopen(vdso_names[i], RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso) {
+		printf("[WARN]\tfailed to find vDSO\n");
+		return;
+	}
+
+	vdso_gtod = (gtod_t)dlsym(vdso, "__vdso_gettimeofday");
+	if (vdso_gtod == NULL)
+		printf("[WARN]\tfailed to find gettimeofday in vDSO\n");
+
+	vdso_gettime = (vgettime_t)dlsym(vdso, "__vdso_clock_gettime");
+	if (vdso_gettime == NULL)
+		printf("[WARN]\tfailed to find clock_gettime in vDSO\n");
+
+	vdso_time = (time_func_t)dlsym(vdso, "__vdso_time");
+	if (vdso_time == NULL)
+		printf("[WARN]\tfailed to find time in vDSO\n");
+
+	vdso_getcpu = (getcpu_t)dlsym(vdso, "__vdso_getcpu");
+	if (vdso_getcpu == NULL) {
+		/* getcpu() was never wired up in the 32-bit vDSO. */
+		const char *tag = (sizeof(long) == 8) ? "WARN" : "NOTE";
+
+		printf("[%s]\tfailed to find getcpu in vDSO\n", tag);
+	}
+}
+
+/*
+ * Determine from /proc/self/maps whether a "[vsyscall]" mapping exists
+ * and with which permissions, recording the result in vsyscall_map_r and
+ * vsyscall_map_x.
+ *
+ * If the maps file cannot be opened, the page is assumed readable. Lines
+ * that do not fit in MAPS_LINE_LEN bytes do not parse as a complete
+ * entry and are skipped.
+ *
+ * Returns the number of errors found (1 when the mapping's address range
+ * is not the expected one, else 0); always 0 when not built for x86_64.
+ */
+static int init_vsys(void)
+{
+#ifdef __x86_64__
+	char line[MAPS_LINE_LEN];
+	bool found = false;
+	int nerrs = 0;
+	FILE *maps;
+
+	maps = fopen("/proc/self/maps", "r");
+	if (maps == NULL) {
+		printf("[WARN]\tCould not open /proc/self/maps -- assuming vsyscall is r-x\n");
+		vsyscall_map_r = true;
+		return 0;
+	}
+
+	while (!found && fgets(line, MAPS_LINE_LEN, maps)) {
+		char name[MAPS_LINE_LEN];
+		void *start, *end;
+		char r, x;
+
+		/* name[] is as large as line[], so the unbounded %s cannot overflow it */
+		if (sscanf(line, "%p-%p %c-%cp %*x %*x:%*x %*u %s",
+			   &start, &end, &r, &x, name) != 5 ||
+		    strcmp(name, "[vsyscall]") != 0)
+			continue;
+
+		printf("\tvsyscall map: %s", line);
+
+		if (!(start == (void *)0xffffffffff600000 &&
+		      end == (void *)0xffffffffff601000)) {
+			printf("[FAIL]\taddress range is nonsense\n");
+			nerrs++;
+		}
+
+		printf("\tvsyscall permissions are %c-%c\n", r, x);
+		vsyscall_map_r = (r == 'r');
+		vsyscall_map_x = (x == 'x');
+
+		found = true;
+	}
+
+	fclose(maps);
+
+	if (!found) {
+		printf("\tno vsyscall map in /proc/self/maps\n");
+		vsyscall_map_r = false;
+		vsyscall_map_x = false;
+	}
+
+	return nerrs;
+#else
+	return 0;
+#endif
+}
+
+/* syscalls */
+/* Invoke the gettimeofday system call directly via syscall(). */
+static inline long sys_gtod(struct timeval *tv, struct timezone *tz)
+{
+ return syscall(SYS_gettimeofday, tv, tz);
+}
+
+/* Invoke the clock_gettime system call directly via syscall(). */
+static inline int sys_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ return syscall(SYS_clock_gettime, id, ts);
+}
+
+/* Invoke the time system call directly via syscall(). */
+static inline long sys_time(time_t *t)
+{
+ return syscall(SYS_time, t);
+}
+
+/*
+ * Invoke the getcpu system call directly via syscall(). SYS_getcpu has a
+ * fallback definition near the top of this file.
+ */
+static inline long sys_getcpu(unsigned * cpu, unsigned * node,
+ void* cache)
+{
+ return syscall(SYS_getcpu, cpu, node, cache);
+}
+
+static jmp_buf jmpbuf;
+static volatile unsigned long segv_err;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+ ucontext_t *ctx = (ucontext_t *)ctx_void;
+
+ segv_err = ctx->uc_mcontext.gregs[REG_ERR];
+ siglongjmp(jmpbuf, 1);
+}
+
+/*
+ * Return @a - @b in seconds as a double. The microsecond fields are
+ * converted to int before being subtracted.
+ */
+static double tv_diff(const struct timeval *a, const struct timeval *b)
+{
+	double secs = (double)(a->tv_sec - b->tv_sec);
+	double usecs = (double)((int)a->tv_usec - (int)b->tv_usec);
+
+	return secs + usecs * 1e-6;
+}
+
+/*
+ * Validate one gettimeofday() result obtained from @which ("vDSO" or
+ * "vsyscall") against two syscall results taken before (@tv_sys1) and
+ * after (@tv_sys2) it.
+ *
+ * @tz_other may be NULL, in which case the timezone is not compared.
+ *
+ * Returns the number of failed checks: a timezone mismatch and a time
+ * outside [@tv_sys1, @tv_sys2] each count once.
+ */
+static int check_gtod(const struct timeval *tv_sys1,
+		      const struct timeval *tv_sys2,
+		      const struct timezone *tz_sys,
+		      const char *which,
+		      const struct timeval *tv_other,
+		      const struct timezone *tz_other)
+{
+	int nerrs = 0;
+	double d1, d2;
+
+	if (tz_other && (tz_sys->tz_minuteswest != tz_other->tz_minuteswest || tz_sys->tz_dsttime != tz_other->tz_dsttime)) {
+		/* Tab after the tag, like every other result line in this file */
+		printf("[FAIL]\t%s tz mismatch\n", which);
+		nerrs++;
+	}
+
+	/* Both offsets are non-negative when tv_sys1 <= tv_other <= tv_sys2 */
+	d1 = tv_diff(tv_other, tv_sys1);
+	d2 = tv_diff(tv_sys2, tv_other);
+	printf("\t%s time offsets: %lf %lf\n", which, d1, d2);
+
+	if (d1 < 0 || d2 < 0) {
+		printf("[FAIL]\t%s time was inconsistent with the syscall\n", which);
+		nerrs++;
+	} else {
+		printf("[OK]\t%s gettimeofday()'s timeval was okay\n", which);
+	}
+
+	return nerrs;
+}
+
+/*
+ * Take a vDSO and (when the vsyscall page is executable) a vsyscall
+ * gettimeofday() reading between two syscall readings and validate each
+ * with check_gtod(). A failing syscall terminates the program via err().
+ *
+ * Returns the number of failed checks.
+ */
+static int test_gtod(void)
+{
+	struct timeval tv_before, tv_after, tv_vdso, tv_vsys;
+	struct timezone tz_sys, tz_vdso, tz_vsys;
+	long ret_vdso = -1, ret_vsys = -1;
+	int failures = 0;
+
+	printf("[RUN]\ttest gettimeofday()\n");
+
+	if (sys_gtod(&tv_before, &tz_sys) != 0)
+		err(1, "syscall gettimeofday");
+	if (vdso_gtod)
+		ret_vdso = vdso_gtod(&tv_vdso, &tz_vdso);
+	if (vsyscall_map_x)
+		ret_vsys = vgtod(&tv_vsys, &tz_vsys);
+	if (sys_gtod(&tv_after, &tz_sys) != 0)
+		err(1, "syscall gettimeofday");
+
+	if (vdso_gtod) {
+		if (ret_vdso != 0) {
+			printf("[FAIL]\tvDSO gettimeofday() failed: %ld\n", ret_vdso);
+			failures++;
+		} else {
+			failures += check_gtod(&tv_before, &tv_after, &tz_sys, "vDSO", &tv_vdso, &tz_vdso);
+		}
+	}
+
+	if (vsyscall_map_x) {
+		if (ret_vsys != 0) {
+			printf("[FAIL]\tvsys gettimeofday() failed: %ld\n", ret_vsys);
+			failures++;
+		} else {
+			failures += check_gtod(&tv_before, &tv_after, &tz_sys, "vsyscall", &tv_vsys, &tz_vsys);
+		}
+	}
+
+	return failures;
+}
+
+/*
+ * Take a vDSO and (when the vsyscall page is executable) a vsyscall
+ * time() reading between two syscall readings. Each implementation must
+ * return a non-negative value, store the same value through its pointer
+ * argument, and lie within the two syscall readings.
+ *
+ * Returns the number of failed checks; returns 1 immediately when the
+ * syscall readings themselves are inconsistent.
+ */
+static int test_time(void) {
+	long t_sys1, t_sys2, t_vdso = 0, t_vsys = 0;
+	long t2_sys1 = -1, t2_sys2 = -1, t2_vdso = -1, t2_vsys = -1;
+	int failures = 0;
+
+	printf("[RUN]\ttest time()\n");
+
+	t_sys1 = sys_time(&t2_sys1);
+	if (vdso_time)
+		t_vdso = vdso_time(&t2_vdso);
+	if (vsyscall_map_x)
+		t_vsys = vtime(&t2_vsys);
+	t_sys2 = sys_time(&t2_sys2);
+
+	if (t_sys1 < 0 || t_sys1 != t2_sys1 || t_sys2 < 0 || t_sys2 != t2_sys2) {
+		printf("[FAIL]\tsyscall failed (ret1:%ld output1:%ld ret2:%ld output2:%ld)\n", t_sys1, t2_sys1, t_sys2, t2_sys2);
+		return failures + 1;
+	}
+
+	if (vdso_time) {
+		if (t_vdso < 0 || t_vdso != t2_vdso) {
+			printf("[FAIL]\tvDSO failed (ret:%ld output:%ld)\n", t_vdso, t2_vdso);
+			failures++;
+		} else if (t_vdso >= t_sys1 && t_vdso <= t_sys2) {
+			printf("[OK]\tvDSO time() is okay\n");
+		} else {
+			printf("[FAIL]\tvDSO returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vdso, t_sys2);
+			failures++;
+		}
+	}
+
+	if (vsyscall_map_x) {
+		if (t_vsys < 0 || t_vsys != t2_vsys) {
+			printf("[FAIL]\tvsyscall failed (ret:%ld output:%ld)\n", t_vsys, t2_vsys);
+			failures++;
+		} else if (t_vsys >= t_sys1 && t_vsys <= t_sys2) {
+			printf("[OK]\tvsyscall time() is okay\n");
+		} else {
+			printf("[FAIL]\tvsyscall returned the wrong time (%ld %ld %ld)\n", t_sys1, t_vsys, t_sys2);
+			failures++;
+		}
+	}
+
+	return failures;
+}
+
+/*
+ * Pin the process to @cpu and check that the vDSO and (when the vsyscall
+ * page is executable) the vsyscall getcpu report that CPU and the same
+ * node. The reference node comes from the first source that succeeded:
+ * the syscall if it worked, else the vDSO, else the vsyscall.
+ *
+ * The CPU and node values are of type unsigned and are therefore printed
+ * with %u.
+ *
+ * Returns the number of failed checks; 0 (with a [SKIP] line) when the
+ * process cannot be pinned to @cpu.
+ */
+static int test_getcpu(int cpu)
+{
+	int nerrs = 0;
+	long ret_sys, ret_vdso = -1, ret_vsys = -1;
+
+	printf("[RUN]\tgetcpu() on CPU %d\n", cpu);
+
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(cpu, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tfailed to force CPU %d\n", cpu);
+		return nerrs;
+	}
+
+	unsigned cpu_sys, cpu_vdso, cpu_vsys, node_sys, node_vdso, node_vsys;
+	unsigned node = 0;
+	bool have_node = false;
+	ret_sys = sys_getcpu(&cpu_sys, &node_sys, 0);
+	if (vdso_getcpu)
+		ret_vdso = vdso_getcpu(&cpu_vdso, &node_vdso, 0);
+	if (vsyscall_map_x)
+		ret_vsys = vgetcpu(&cpu_vsys, &node_vsys, 0);
+
+	if (ret_sys == 0) {
+		if (cpu_sys != cpu) {
+			printf("[FAIL]\tsyscall reported CPU %u but should be %d\n", cpu_sys, cpu);
+			nerrs++;
+		}
+
+		have_node = true;
+		node = node_sys;
+	}
+
+	if (vdso_getcpu) {
+		if (ret_vdso) {
+			printf("[FAIL]\tvDSO getcpu() failed\n");
+			nerrs++;
+		} else {
+			if (!have_node) {
+				have_node = true;
+				node = node_vdso;
+			}
+
+			if (cpu_vdso != cpu) {
+				printf("[FAIL]\tvDSO reported CPU %u but should be %d\n", cpu_vdso, cpu);
+				nerrs++;
+			} else {
+				printf("[OK]\tvDSO reported correct CPU\n");
+			}
+
+			if (node_vdso != node) {
+				printf("[FAIL]\tvDSO reported node %u but should be %u\n", node_vdso, node);
+				nerrs++;
+			} else {
+				printf("[OK]\tvDSO reported correct node\n");
+			}
+		}
+	}
+
+	if (vsyscall_map_x) {
+		if (ret_vsys) {
+			printf("[FAIL]\tvsyscall getcpu() failed\n");
+			nerrs++;
+		} else {
+			if (!have_node) {
+				have_node = true;
+				node = node_vsys;
+			}
+
+			if (cpu_vsys != cpu) {
+				printf("[FAIL]\tvsyscall reported CPU %u but should be %d\n", cpu_vsys, cpu);
+				nerrs++;
+			} else {
+				printf("[OK]\tvsyscall reported correct CPU\n");
+			}
+
+			if (node_vsys != node) {
+				printf("[FAIL]\tvsyscall reported node %u but should be %u\n", node_vsys, node);
+				nerrs++;
+			} else {
+				printf("[OK]\tvsyscall reported correct node\n");
+			}
+		}
+	}
+
+	return nerrs;
+}
+
+/*
+ * Check that a direct read of the vsyscall page succeeds exactly when
+ * vsyscall_map_r says the page is readable. A faulting read is caught
+ * through the SIGSEGV handler, which jumps back to the sigsetjmp() below.
+ *
+ * Returns 1 on a mismatch, 0 otherwise (always 0 when not built for
+ * x86_64).
+ */
+static int test_vsys_r(void)
+{
+#ifdef __x86_64__
+ printf("[RUN]\tChecking read access to the vsyscall page\n");
+ bool can_read;
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ /* volatile forces the load to be performed although the value is unused */
+ *(volatile int *)0xffffffffff600000;
+ can_read = true;
+ } else {
+ /* Reached via siglongjmp() from sigsegv() */
+ can_read = false;
+ }
+
+ if (can_read && !vsyscall_map_r) {
+ printf("[FAIL]\tWe have read access, but we shouldn't\n");
+ return 1;
+ } else if (!can_read && vsyscall_map_r) {
+ printf("[FAIL]\tWe don't have read access, but we should\n");
+ return 1;
+ } else if (can_read) {
+ printf("[OK]\tWe have read access\n");
+ } else {
+ printf("[OK]\tWe do not have read access: #PF(0x%lx)\n",
+ segv_err);
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * When the vsyscall page is not executable, check that calling into it
+ * faults and that the recorded page-fault error code has bit 4 (the
+ * instruction-fetch bit, per the inline comment) set.
+ *
+ * Returns 1 when the call did not fault or faulted with a different
+ * error code, 0 otherwise (always 0 when the page is executable or when
+ * not built for x86_64).
+ */
+static int test_vsys_x(void)
+{
+#ifdef __x86_64__
+ if (vsyscall_map_x) {
+ /* We already tested this adequately. */
+ return 0;
+ }
+
+ printf("[RUN]\tMake sure that vsyscalls really page fault\n");
+
+ bool can_exec;
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ vgtod(NULL, NULL);
+ can_exec = true;
+ } else {
+ /* Reached via siglongjmp() from sigsegv() */
+ can_exec = false;
+ }
+
+ if (can_exec) {
+ printf("[FAIL]\tExecuting the vsyscall did not page fault\n");
+ return 1;
+ } else if (segv_err & (1 << 4)) { /* INSTR */
+ printf("[OK]\tExecuting the vsyscall page failed: #PF(0x%lx)\n",
+ segv_err);
+ } else {
+ printf("[FAIL]\tExecution failed with the wrong error: #PF(0x%lx)\n",
+ segv_err);
+ return 1;
+ }
+#endif
+
+ return 0;
+}
+
+/*
+ * Debuggers expect ptrace() to be able to peek at the vsyscall page.
+ * Use process_vm_readv() as a proxy for ptrace() to test this. We
+ * want it to work in the vsyscall=emulate case and to fail in the
+ * vsyscall=xonly case.
+ *
+ * It's worth noting that this ABI is a bit nutty. write(2) can't
+ * read from the vsyscall page on any kernel version or mode. The
+ * fact that ptrace() ever worked was a nice courtesy of old kernels,
+ * but the code to support it is fairly gross.
+ */
+/*
+ * Read the 4096-byte vsyscall page of the current process with
+ * process_vm_readv() and check the outcome against vsyscall_map_r: the
+ * call must succeed and return the page contents when the page is
+ * readable, and must fail when it is not.
+ *
+ * Returns 1 on a mismatch, 0 otherwise (always 0 when not built for
+ * x86_64).
+ */
+static int test_process_vm_readv(void)
+{
+#ifdef __x86_64__
+	char buf[4096];
+	struct iovec local, remote;
+	int ret;
+
+	printf("[RUN]\tprocess_vm_readv() from vsyscall page\n");
+
+	local.iov_base = buf;
+	local.iov_len = 4096;
+	remote.iov_base = (void *)0xffffffffff600000;
+	remote.iov_len = 4096;
+	ret = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
+	if (ret != 4096) {
+		/*
+		 * We expect process_vm_readv() to work if and only if the
+		 * vsyscall page is readable.
+		 */
+		printf("[%s]\tprocess_vm_readv() failed (ret = %d, errno = %d)\n", vsyscall_map_r ? "FAIL" : "OK", ret, errno);
+		return vsyscall_map_r ? 1 : 0;
+	}
+
+	if (vsyscall_map_r) {
+		/* The page is readable, so it can be compared directly */
+		if (!memcmp(buf, remote.iov_base, sizeof(buf))) {
+			printf("[OK]\tIt worked and read correct data\n");
+		} else {
+			printf("[FAIL]\tIt worked but returned incorrect data\n");
+			return 1;
+		}
+	} else {
+		printf("[FAIL]\tprocess_vm_readv() succeeded, but it should have failed in this configuration\n");
+		return 1;
+	}
+#endif
+
+	return 0;
+}
+
+#ifdef __x86_64__
+/* Number of SIGTRAPs taken with RIP inside the vsyscall page */
+static volatile sig_atomic_t num_vsyscall_traps;
+
+/*
+ * SIGTRAP handler: count the trap when the interrupted instruction
+ * pointer lies in the 4 KiB page starting at 0xffffffffff600000.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t *)ctx_void;
+	unsigned long page = ctx->uc_mcontext.gregs[REG_RIP] & ~0xfffUL;
+
+	if (page == 0xffffffffff600000UL)
+		num_vsyscall_traps++;
+}
+
+/*
+ * Single-step through one vtime() call with the TF flag set and count
+ * how many traps are taken inside the vsyscall page (see sigtrap()).
+ *
+ * Returns 1 when more than one such trap was seen (vsyscalls are
+ * native), 0 otherwise or when the vsyscall page is not executable.
+ */
+static int test_emulation(void)
+{
+ time_t tmp;
+ bool is_native;
+
+ if (!vsyscall_map_x)
+ return 0;
+
+ printf("[RUN]\tchecking that vsyscalls are emulated\n");
+ sethandler(SIGTRAP, sigtrap, 0);
+ /* Keep TF set only around the vtime() call */
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ vtime(&tmp);
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+ /*
+ * If vsyscalls are emulated, we expect a single trap in the
+ * vsyscall page -- the call instruction will trap with RIP
+ * pointing to the entry point before emulation takes over.
+ * In native mode, we expect two traps, since whatever code
+ * the vsyscall page contains will be more than just a ret
+ * instruction.
+ */
+ is_native = (num_vsyscall_traps > 1);
+
+ printf("[%s]\tvsyscalls are %s (%d instructions in vsyscall page)\n",
+ (is_native ? "FAIL" : "OK"),
+ (is_native ? "native" : "emulated"),
+ (int)num_vsyscall_traps);
+
+ return is_native;
+}
+#endif
+
+/*
+ * Entry point of the vsyscall test: initialise the vDSO pointers and the
+ * vsyscall mapping state, then run every test and sum up the failures.
+ *
+ * Returns 1 when any check failed, 0 otherwise.
+ */
+int main(int argc, char **argv)
+{
+ int nerrs = 0;
+
+ init_vdso();
+ nerrs += init_vsys();
+
+ nerrs += test_gtod();
+ nerrs += test_time();
+ nerrs += test_getcpu(0);
+ nerrs += test_getcpu(1);
+
+ /* The tests below rely on sigsegv() to recover from expected faults */
+ sethandler(SIGSEGV, sigsegv, 0);
+ nerrs += test_vsys_r();
+ nerrs += test_vsys_x();
+
+ nerrs += test_process_vm_readv();
+
+#ifdef __x86_64__
+ nerrs += test_emulation();
+#endif
+
+ return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/x86/thunks.S b/tools/testing/selftests/x86/thunks.S
new file mode 100644
index 000000000..1bb5d62c1
--- /dev/null
+++ b/tools/testing/selftests/x86/thunks.S
@@ -0,0 +1,59 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * thunks.S - assembly helpers for mixed-bitness code
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * These are little helpers that make it easier to switch bitness on
+ * the fly.
+ */
+
+ .text
+
+ .global call32_from_64
+ .type call32_from_64, @function
+call32_from_64:
+ // Call a 32-bit function from 64-bit code on a caller-supplied stack,
+ // then return to the 64-bit caller.
+ // rdi: stack to use
+ // esi: function to call
+
+ // Save registers
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ pushfq
+
+ // Switch stacks
+ mov %rsp,(%rdi) // stash the old stack pointer at the base of the new stack
+ mov %rdi,%rsp
+
+ // Switch to compatibility mode
+ pushq $0x23 /* USER32_CS */
+ pushq $1f
+ lretq // far return: pops the 1f address and the 0x23 selector pushed above
+
+1:
+ .code32
+ // Call the function
+ call *%esi
+ // Switch back to long mode
+ jmp $0x33,$1f
+ .code64
+
+1:
+ // Restore the stack
+ mov (%rsp),%rsp // assumes the callee left the stack pointer at the stashed slot
+
+ // Restore registers
+ popfq
+ popq %r15
+ popq %r14
+ popq %r13
+ popq %r12
+ popq %rbp
+ popq %rbx
+
+ ret
+
+.size call32_from_64, .-call32_from_64
diff --git a/tools/testing/selftests/x86/thunks_32.S b/tools/testing/selftests/x86/thunks_32.S
new file mode 100644
index 000000000..a71d92da8
--- /dev/null
+++ b/tools/testing/selftests/x86/thunks_32.S
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * thunks_32.S - assembly helpers for mixed-bitness code
+ * Copyright (c) 2015 Denys Vlasenko
+ *
+ * These are little helpers that make it easier to switch bitness on
+ * the fly.
+ */
+
+ .text
+ .code32
+
+	.global call64_from_32
+	/* The symbol type must be set on this function's own name. */
+	.type call64_from_32, @function
+
+	/*
+	 * call64_from_32: call a 64-bit function from 32-bit code and
+	 * return to the 32-bit caller.
+	 * 4(%esp): function to call
+	 */
+call64_from_32:
+	// Fetch function address
+	mov	4(%esp), %eax
+
+	// Save registers which are callee-clobbered by 64-bit ABI
+	push	%ecx
+	push	%edx
+	push	%esi
+	push	%edi
+
+	// Switch to long mode
+	jmp	$0x33,$1f
+1:	.code64
+
+	// Call the function
+	call	*%rax
+
+	// Switch to compatibility mode
+	push	$0x23  /* USER32_CS */
+	.code32; push $1f; .code64 /* hack: can't have X86_64_32S relocation in 32-bit ELF */
+	lretq
+1:	.code32
+
+	// Restore the registers saved above, in reverse order
+	pop	%edi
+	pop	%esi
+	pop	%edx
+	pop	%ecx
+
+	ret
+
+.size call64_from_32, .-call64_from_32
diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c
new file mode 100644
index 000000000..aa1f58c2f
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_32bit_program.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trivial program to check that we have a valid 32-bit build environment.
+ * Copyright (c) 2015 Andy Lutomirski
+ */
+
+#ifndef __i386__
+# error wrong architecture
+#endif
+
+#include <stdio.h>
+
+/* Print one newline and exit successfully. */
+int main()
+{
+	puts("");
+
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/trivial_64bit_program.c b/tools/testing/selftests/x86/trivial_64bit_program.c
new file mode 100644
index 000000000..39f4b84fb
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_64bit_program.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Trivial program to check that we have a valid 64-bit build environment.
+ * Copyright (c) 2015 Andy Lutomirski
+ */
+
+#ifndef __x86_64__
+# error wrong architecture
+#endif
+
+#include <stdio.h>
+
+/* Print one newline and exit successfully. */
+int main()
+{
+	puts("");
+
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/trivial_program.c b/tools/testing/selftests/x86/trivial_program.c
new file mode 100644
index 000000000..46a447163
--- /dev/null
+++ b/tools/testing/selftests/x86/trivial_program.c
@@ -0,0 +1,10 @@
+/* Trivial program to check that compilation with certain flags is working. */
+
+#include <stdio.h>
+
+/* Print an empty line and report success. */
+int
+main(void)
+{
+	fputs("\n", stdout);
+	return 0;
+}
diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c
new file mode 100644
index 000000000..4c311e1af
--- /dev/null
+++ b/tools/testing/selftests/x86/unwind_vdso.c
@@ -0,0 +1,183 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * unwind_vdso.c - tests unwind info for AT_SYSINFO in the vDSO
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This tests __kernel_vsyscall's unwind info.
+ */
+
+#define _GNU_SOURCE
+
+#include <features.h>
+#include <stdio.h>
+
+#include "helpers.h"
+
+#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16
+
+/* Stub used with glibc older than 2.16: report a skip and succeed. */
+int main()
+{
+	/* We need getauxval(). */
+	fputs("[SKIP]\tGLIBC before 2.16 cannot compile this test\n", stdout);
+	return 0;
+}
+
+#else
+
+#include <sys/time.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <string.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <sys/ptrace.h>
+#include <sys/user.h>
+#include <link.h>
+#include <sys/auxv.h>
+#include <dlfcn.h>
+#include <unwind.h>
+
+/*
+ * Install 'handler' as the SA_SIGINFO handler for 'sig', with an empty
+ * signal mask and any extra sa_flags given in 'flags'.  Exits through
+ * err() if sigaction() fails.
+ */
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = SA_SIGINFO | flags;
+	act.sa_sigaction = handler;
+
+	if (sigaction(sig, &act, NULL) != 0)
+		err(1, "sigaction");
+}
+
+static volatile sig_atomic_t nerrs; /* error count; main() fails when non-zero */
+static unsigned long sysinfo; /* AT_SYSINFO value from getauxval(), set in main() */
+static bool got_sysinfo = false; /* set by sigtrap() once a trap is taken at sysinfo */
+static unsigned long return_address; /* word read from the top of the stack at that trap */
+
+struct unwind_state {
+ unsigned long ip; /* trap source */
+ int depth; /* -1 until we hit the trap source */
+};
+
+/*
+ * _Unwind_Backtrace() callback.  Frames are ignored until the one whose IP
+ * equals state->ip (the trap source).  From there each IP is printed until
+ * return_address is reached, where the unwound registers are checked: eax
+ * must be SYS_getpid or the pid itself, and ebx, ecx, edx, esi, edi, ebp
+ * must hold 1..6.  A mismatch bumps nerrs.
+ */
+_Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque)
+{
+	struct unwind_state *st = opaque;
+	unsigned long pc = _Unwind_GetIP(ctx);
+	unsigned long eax, ecx, edx, ebx, ebp, esi, edi;
+	bool ok;
+
+	if (st->depth == -1) {
+		if (pc != st->ip)
+			return _URC_NO_REASON;	/* Not there yet */
+		st->depth = 0;
+	}
+	printf("\t  0x%lx\n", pc);
+
+	if (pc != return_address) {
+		st->depth++;
+		return _URC_NO_REASON;
+	}
+
+	/* Here we are. */
+	eax = _Unwind_GetGR(ctx, 0);
+	ecx = _Unwind_GetGR(ctx, 1);
+	edx = _Unwind_GetGR(ctx, 2);
+	ebx = _Unwind_GetGR(ctx, 3);
+	ebp = _Unwind_GetGR(ctx, 5);
+	esi = _Unwind_GetGR(ctx, 6);
+	edi = _Unwind_GetGR(ctx, 7);
+
+	ok = (eax == SYS_getpid || eax == getpid()) &&
+	     ebx == 1 && ecx == 2 && edx == 3 &&
+	     esi == 4 && edi == 5 && ebp == 6;
+	if (!ok)
+		nerrs++;
+
+	printf("[%s]\t  NR = %ld, args = %ld, %ld, %ld, %ld, %ld, %ld\n",
+	       (ok ? "OK" : "FAIL"),
+	       eax, ebx, ecx, edx, esi, edi, ebp);
+
+	return _URC_NORMAL_STOP;
+}
+
+/*
+ * SIGTRAP handler for the single-stepped syscall.  Traps are ignored until
+ * one is taken at the AT_SYSINFO address; that trap records the return
+ * address found on the stack.  Each later trap triggers a backtrace, until
+ * the return address itself is hit, at which point TF is cleared in the
+ * saved flags.
+ */
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *uc = (ucontext_t *)ctx_void;
+	unsigned long pc = uc->uc_mcontext.gregs[REG_EIP];
+	struct unwind_state state = { .ip = pc, .depth = -1 };
+
+	if (!got_sysinfo) {
+		if (pc != sysinfo)
+			return;	/* Not there yet */
+
+		got_sysinfo = true;
+
+		/* Find the return address. */
+		return_address = *(unsigned long *)(unsigned long)uc->uc_mcontext.gregs[REG_ESP];
+
+		printf("\tIn vsyscall at 0x%lx, returning to 0x%lx\n",
+		       pc, return_address);
+	}
+
+	if (pc == return_address) {
+		uc->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF;
+		printf("\tVsyscall is done\n");
+		return;
+	}
+
+	printf("\tSIGTRAP at 0x%lx\n", pc);
+
+	_Unwind_Backtrace(trace_fn, &state);
+}
+
+/*
+ * Look up AT_SYSINFO, then issue one getpid syscall with TF set so that
+ * sigtrap() can check the unwind info at every single-stepped instruction.
+ * Returns 1 if any check failed or TF is still set afterwards, else 0.
+ */
+int main()
+{
+	sysinfo = getauxval(AT_SYSINFO);
+	printf("\tAT_SYSINFO is 0x%lx\n", sysinfo);
+
+	Dl_info info;
+	if (!dladdr((void *)sysinfo, &info)) {
+		printf("[WARN]\tdladdr failed on AT_SYSINFO\n");
+	} else {
+		/* %p already prints the 0x prefix on glibc; do not add another. */
+		printf("[OK]\tAT_SYSINFO maps to %s, loaded at %p\n",
+		       info.dli_fname, info.dli_fbase);
+	}
+
+	sethandler(SIGTRAP, sigtrap, 0);
+
+	syscall(SYS_getpid);  /* Force symbol binding without TF set. */
+	printf("[RUN]\tSet TF and check a fast syscall\n");
+	set_eflags(get_eflags() | X86_EFLAGS_TF);
+	syscall(SYS_getpid, 1, 2, 3, 4, 5, 6);
+	if (!got_sysinfo) {
+		/* sigtrap() never saw AT_SYSINFO, so TF must be cleared here. */
+		set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+		/*
+		 * The most likely cause of this is that you're on Debian or
+		 * a Debian-based distro, you're missing libc6-i686, and you're
+		 * affected by libc/19006 (https://sourceware.org/PR19006).
+		 */
+		printf("[WARN]\tsyscall(2) didn't enter AT_SYSINFO\n");
+	}
+
+	if (get_eflags() & X86_EFLAGS_TF) {
+		printf("[FAIL]\tTF is still set\n");
+		nerrs++;
+	}
+
+	if (nerrs) {
+		printf("[FAIL]\tThere were errors\n");
+		return 1;
+	} else {
+		printf("[OK]\tAll is well\n");
+		return 0;
+	}
+}
+
+#endif /* New enough libc */
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 000000000..fe99f2434
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations. Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+ void *handler; /* handler function pointer */
+ unsigned long flags; /* SA_* flags; main() uses SA_SIGINFO or 0 */
+ void *restorer; /* left NULL so the kernel supplies the restorer */
+ unsigned int mask[2]; /* 8 bytes, matching the size passed to rt_sigaction */
+};
+
+static volatile sig_atomic_t handler_called;
+
+/* Three-argument (SA_SIGINFO) handler: only records that it ran. */
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+	(void)sig;
+	(void)info;
+	(void)ctx_void;
+
+	handler_called = 1;
+}
+
+/* One-argument handler: only records that it ran. */
+static void handler_without_siginfo(int sig)
+{
+	(void)sig;
+
+	handler_called = 1;
+}
+
+/*
+ * Install handlers through the raw rt_sigaction and sigaction syscalls
+ * with sa.restorer == NULL, raise SIGUSR1 each time and check that the
+ * handler ran.  Returns 0 on success or skip, 1 if any handler was not
+ * called.
+ */
+int main()
+{
+	int nerrs = 0;
+	struct real_sigaction sa;
+
+	void *vdso = dlopen("linux-vdso.so.1",
+			    RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso)
+		vdso = dlopen("linux-gate.so.1",
+			      RTLD_LAZY | RTLD_LOCAL | RTLD_NOLOAD);
+	if (!vdso) {
+		printf("[SKIP]\tFailed to find vDSO.  Tests are not expected to work.\n");
+		return 0;
+	}
+
+	memset(&sa, 0, sizeof(sa));
+	sa.handler = handler_with_siginfo;
+	sa.flags = SA_SIGINFO;
+	sa.restorer = NULL;	/* request kernel-provided restorer */
+
+	printf("[RUN]\tRaise a signal, SA_SIGINFO, sa.restorer == NULL\n");
+
+	if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+		err(1, "raw rt_sigaction syscall");
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+
+	printf("[RUN]\tRaise a signal, !SA_SIGINFO, sa.restorer == NULL\n");
+
+	sa.flags = 0;
+	sa.handler = handler_without_siginfo;
+	if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+		err(1, "raw sigaction syscall");
+	handler_called = 0;
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+
+	/* Report failures through the exit status, not only in the log. */
+	return nerrs ? 1 : 0;
+}
diff --git a/tools/testing/selftests/zram/Makefile b/tools/testing/selftests/zram/Makefile
new file mode 100644
index 000000000..7f78eb1b5
--- /dev/null
+++ b/tools/testing/selftests/zram/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+# Entry-point script; it runs zram01.sh and zram02.sh.
+TEST_PROGS := zram.sh
+# Sub-tests run by zram.sh, plus the helper library they all source.
+TEST_FILES := zram01.sh zram02.sh zram_lib.sh
+# err.log is written by the zram_lib.sh helpers (mkswap/swapon/mkfs output).
+EXTRA_CLEAN := err.log
+
+include ../lib.mk
+
diff --git a/tools/testing/selftests/zram/README b/tools/testing/selftests/zram/README
new file mode 100644
index 000000000..110b34834
--- /dev/null
+++ b/tools/testing/selftests/zram/README
@@ -0,0 +1,40 @@
+zram: Compressed RAM based block devices
+----------------------------------------
+* Introduction
+
+The zram module creates RAM based block devices named /dev/zram<id>
+(<id> = 0, 1, ...). Pages written to these disks are compressed and stored
+in memory itself. These disks allow very fast I/O and compression provides
+good amounts of memory savings. Some of the use cases include /tmp storage,
+use as swap disks, various caches under /var, and possibly many more :)
+
+Statistics for individual zram devices are exported through sysfs nodes at
+/sys/block/zram<id>/
+
+Kconfig required:
+CONFIG_ZRAM=y
+CONFIG_CRYPTO_LZ4=y
+CONFIG_ZPOOL=y
+CONFIG_ZSMALLOC=y
+
+ZRAM Testcases
+--------------
+zram_lib.sh: library with initialization/cleanup functions
+zram.sh: For sanity check of CONFIG_ZRAM and to run zram01 and zram02
+
+Two functional tests: zram01 and zram02:
+zram01.sh: creates general purpose ram disks with ext4 filesystems
+zram02.sh: creates block device for swap
+
+Commands required for testing:
+ - bc
+ - dd
+ - free
+ - awk
+ - mkswap
+ - swapon
+ - swapoff
+ - mkfs/ mkfs.ext4
+
+For more information, please refer to:
+kernel-source-tree/Documentation/admin-guide/blockdev/zram.rst
diff --git a/tools/testing/selftests/zram/config b/tools/testing/selftests/zram/config
new file mode 100644
index 000000000..e0cc47e2c
--- /dev/null
+++ b/tools/testing/selftests/zram/config
@@ -0,0 +1,2 @@
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=m
diff --git a/tools/testing/selftests/zram/zram.sh b/tools/testing/selftests/zram/zram.sh
new file mode 100755
index 000000000..b0b91d9b0
--- /dev/null
+++ b/tools/testing/selftests/zram/zram.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+TCID="zram.sh"
+
+. ./zram_lib.sh
+
+# Run both functional tests in order.
+# Returns 0 when both scripts succeed, otherwise the exit status of the
+# last script that failed, so a zram01.sh failure is no longer discarded.
+run_zram () {
+	local rc=0
+
+	echo "--------------------"
+	echo "running zram tests"
+	echo "--------------------"
+	./zram01.sh || rc=$?
+	echo ""
+	./zram02.sh || rc=$?
+
+	return $rc
+}
+
+check_prereqs # exits with the skip code unless run as root
+
+run_zram # last command: its status becomes the script's exit status
diff --git a/tools/testing/selftests/zram/zram01.sh b/tools/testing/selftests/zram/zram01.sh
new file mode 100755
index 000000000..8f4affe34
--- /dev/null
+++ b/tools/testing/selftests/zram/zram01.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# Test creates several zram devices with different filesystems on them.
+# It fills each device with zeros and checks that compression works.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+TCID="zram01"
+ERR_CODE=0
+
+. ./zram_lib.sh
+
+# Number of zram devices the test creates:
+dev_num=1
+# The variables below are space-separated, per-device parameter lists; the
+# zram_lib.sh helpers consume one item per device.  Each list must hold
+# 'dev_num' items.
+zram_max_streams="2"
+
+# The 'disksize' sysfs node accepts either a plain byte count or a value
+# with a memory suffix.  Some old kernels (for example RHEL6.6GA's) parse
+# disksize with strict_strtoull(), which does not support suffixes, while
+# newer kernels use memparse(), which does.  A plain byte count works with
+# both, so that is what is used for the disk size here.
+zram_sizes="2097152" # 2MB
+zram_mem_limits="2M"
+zram_filesystems="ext4"
+zram_algs="lzo"
+
+zram_fill_fs()
+{
+ for i in $(seq $dev_start $dev_end); do
+ echo "fill zram$i..."
+ local b=0
+ while [ true ]; do
+ dd conv=notrunc if=/dev/zero of=zram${i}/file \
+ oflag=append count=1 bs=1024 status=none \
+ > /dev/null 2>&1 || break
+ b=$(($b + 1))
+ done
+ echo "zram$i can be filled with '$b' KB"
+
+ local mem_used_total=`awk '{print $3}' "/sys/block/zram$i/mm_stat"`
+ local v=$((100 * 1024 * $b / $mem_used_total))
+ if [ "$v" -lt 100 ]; then
+ echo "FAIL compression ratio: 0.$v:1"
+ ERR_CODE=-1
+ return
+ fi
+
+ echo "zram compression ratio: $(echo "scale=2; $v / 100 " | bc):1: OK"
+ done
+}
+
+check_prereqs
+zram_load
+zram_max_streams
+zram_compress_alg
+zram_set_disksizes
+zram_set_memlimit
+zram_makefs
+zram_mount
+
+zram_fill_fs
+zram_cleanup
+
+if [ $ERR_CODE -ne 0 ]; then
+ echo "$TCID : [FAIL]"
+else
+ echo "$TCID : [PASS]"
+fi
diff --git a/tools/testing/selftests/zram/zram02.sh b/tools/testing/selftests/zram/zram02.sh
new file mode 100755
index 000000000..2418b0c4e
--- /dev/null
+++ b/tools/testing/selftests/zram/zram02.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# Test checks that we can create swap zram device.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+TCID="zram02"
+ERR_CODE=0
+
+. ./zram_lib.sh
+
+# Number of zram devices the test creates:
+dev_num=1
+# The variables below are space-separated, per-device parameter lists; the
+# zram_lib.sh helpers consume one item per device.  Each list must hold
+# 'dev_num' items.
+zram_max_streams="2"
+
+# The 'disksize' sysfs node accepts either a plain byte count or a value
+# with a memory suffix.  Some old kernels (for example RHEL6.6GA's) parse
+# disksize with strict_strtoull(), which does not support suffixes, while
+# newer kernels use memparse(), which does.  A plain byte count works with
+# both, so that is what is used for the disk size here.
+zram_sizes="1048576" # 1M
+zram_mem_limits="1M"
+
+check_prereqs
+zram_load
+zram_max_streams
+zram_set_disksizes
+zram_set_memlimit
+zram_makeswap
+zram_swapoff
+zram_cleanup
+
+if [ $ERR_CODE -ne 0 ]; then
+ echo "$TCID : [FAIL]"
+else
+ echo "$TCID : [PASS]"
+fi
diff --git a/tools/testing/selftests/zram/zram_lib.sh b/tools/testing/selftests/zram/zram_lib.sh
new file mode 100755
index 000000000..21ec1966d
--- /dev/null
+++ b/tools/testing/selftests/zram/zram_lib.sh
@@ -0,0 +1,269 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (c) 2015 Oracle and/or its affiliates. All Rights Reserved.
+#
+# Author: Alexey Kodanev <alexey.kodanev@oracle.com>
+# Modified: Naresh Kamboju <naresh.kamboju@linaro.org>
+
+dev_makeswap=-1
+dev_mounted=-1
+dev_start=0
+dev_end=-1
+module_load=-1
+sys_control=-1
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+kernel_version=`uname -r | cut -d'.' -f1,2`
+kernel_major=${kernel_version%.*}
+kernel_minor=${kernel_version#*.}
+
+trap INT
+
+check_prereqs()
+{
+ local msg="skip all tests:"
+ local uid=$(id -u)
+
+ if [ $uid -ne 0 ]; then
+ echo $msg must be run as root >&2
+ exit $ksft_skip
+ fi
+}
+
+kernel_gte()
+{
+ major=${1%.*}
+ minor=${1#*.}
+
+ if [ $kernel_major -gt $major ]; then
+ return 0
+ elif [[ $kernel_major -eq $major && $kernel_minor -ge $minor ]]; then
+ return 0
+ fi
+
+ return 1
+}
+
+# Undo what the setup helpers did: swapoff, umount, reset each device and
+# remove its mount directory, hot-remove devices created through
+# zram-control, and unload the module if zram_load loaded it.
+zram_cleanup()
+{
+	echo "zram cleanup"
+	local n
+
+	n=$dev_start
+	while [ $n -le $dev_makeswap ]; do
+		swapoff /dev/zram$n
+		n=$(($n + 1))
+	done
+
+	n=$dev_start
+	while [ $n -le $dev_mounted ]; do
+		umount /dev/zram$n
+		n=$(($n + 1))
+	done
+
+	n=$dev_start
+	while [ $n -le $dev_end ]; do
+		echo 1 > /sys/block/zram${n}/reset
+		rm -rf zram$n
+		n=$(($n + 1))
+	done
+
+	if [ $sys_control -eq 1 ]; then
+		n=$dev_start
+		while [ $n -le $dev_end ]; do
+			echo $n > /sys/class/zram-control/hot_remove
+			n=$(($n + 1))
+		done
+	fi
+
+	if [ $module_load -eq 1 ]; then
+		rmmod zram > /dev/null 2>&1
+	fi
+}
+
+zram_load()
+{
+ echo "create '$dev_num' zram device(s)"
+
+ # zram module loaded, new kernel
+ if [ -d "/sys/class/zram-control" ]; then
+ echo "zram modules already loaded, kernel supports" \
+ "zram-control interface"
+ dev_start=$(ls /dev/zram* | wc -w)
+ dev_end=$(($dev_start + $dev_num - 1))
+ sys_control=1
+
+ for i in $(seq $dev_start $dev_end); do
+ cat /sys/class/zram-control/hot_add > /dev/null
+ done
+
+ echo "all zram devices (/dev/zram$dev_start~$dev_end" \
+ "successfully created"
+ return 0
+ fi
+
+ # detect old kernel or built-in
+ modprobe zram num_devices=$dev_num
+ if [ ! -d "/sys/class/zram-control" ]; then
+ if grep -q '^zram' /proc/modules; then
+ rmmod zram > /dev/null 2>&1
+ if [ $? -ne 0 ]; then
+ echo "zram module is being used on old kernel" \
+ "without zram-control interface"
+ exit $ksft_skip
+ fi
+ else
+ echo "test needs CONFIG_ZRAM=m on old kernel without" \
+ "zram-control interface"
+ exit $ksft_skip
+ fi
+ modprobe zram num_devices=$dev_num
+ fi
+
+ module_load=1
+ dev_end=$(($dev_num - 1))
+ echo "all zram devices (/dev/zram0~$dev_end) successfully created"
+}
+
+# Write each item of $zram_max_streams to max_comp_streams of consecutive
+# devices starting at $dev_start, and read it back to verify.
+# Does nothing on kernels >= 4.7.  Always returns 0; problems are only
+# reported as "FAIL ..." lines.
+zram_max_streams()
+{
+	echo "set max_comp_streams to zram device(s)"
+
+	kernel_gte 4.7
+	if [ $? -eq 0 ]; then
+		echo "The device attribute max_comp_streams was"\
+			"deprecated in 4.7"
+		return 0
+	fi
+
+	local i=$dev_start
+	for max_s in $zram_max_streams; do
+		local sys_path="/sys/block/zram${i}/max_comp_streams"
+		echo $max_s > $sys_path || \
+			echo "FAIL failed to set '$max_s' to $sys_path"
+		sleep 1
+		local max_streams=$(cat $sys_path)
+
+		# Report the value that was actually read back.
+		[ "$max_s" -ne "$max_streams" ] && \
+			echo "FAIL can't set max_streams '$max_s', get $max_streams"
+
+		i=$(($i + 1))
+		echo "$sys_path = '$max_streams'"
+	done
+
+	echo "zram max streams: OK"
+}
+
+# Print the algorithms offered by the first test device, then write each
+# item of $zram_algs to comp_algorithm of consecutive devices.
+zram_compress_alg()
+{
+	echo "test that we can set compression algorithm"
+
+	local idx=$dev_start
+	local node
+
+	echo "supported algs: $(cat /sys/block/zram${idx}/comp_algorithm)"
+
+	for alg in $zram_algs; do
+		node="/sys/block/zram${idx}/comp_algorithm"
+		if ! echo "$alg" > $node; then
+			echo "FAIL can't set '$alg' to $node"
+		fi
+		echo "$node = '$alg'"
+		idx=$(($idx + 1))
+	done
+
+	echo "zram set compression algorithm: OK"
+}
+
+# Write each item of $zram_sizes to disksize of consecutive devices,
+# starting at $dev_start.
+zram_set_disksizes()
+{
+	echo "set disk size to zram device(s)"
+
+	local idx=$dev_start
+	local node
+
+	for ds in $zram_sizes; do
+		node="/sys/block/zram${idx}/disksize"
+		if ! echo "$ds" > $node; then
+			echo "FAIL can't set '$ds' to $node"
+		fi
+		echo "$node = '$ds'"
+		idx=$(($idx + 1))
+	done
+
+	echo "zram set disksizes: OK"
+}
+
+# Write each item of $zram_mem_limits to mem_limit of consecutive devices,
+# starting at $dev_start.
+zram_set_memlimit()
+{
+	echo "set memory limit to zram device(s)"
+
+	local idx=$dev_start
+	local node
+
+	for ds in $zram_mem_limits; do
+		node="/sys/block/zram${idx}/mem_limit"
+		if ! echo "$ds" > $node; then
+			echo "FAIL can't set '$ds' to $node"
+		fi
+		echo "$node = '$ds'"
+		idx=$(($idx + 1))
+	done
+
+	echo "zram set memory limit: OK"
+}
+
+zram_makeswap()
+{
+ echo "make swap with zram device(s)"
+ local i=$dev_start
+ for i in $(seq $dev_start $dev_end); do
+ mkswap /dev/zram$i > err.log 2>&1
+ if [ $? -ne 0 ]; then
+ cat err.log
+ echo "FAIL mkswap /dev/zram$1 failed"
+ fi
+
+ swapon /dev/zram$i > err.log 2>&1
+ if [ $? -ne 0 ]; then
+ cat err.log
+ echo "FAIL swapon /dev/zram$1 failed"
+ fi
+
+ echo "done with /dev/zram$i"
+ dev_makeswap=$i
+ done
+
+ echo "zram making zram mkswap and swapon: OK"
+}
+
+# Run swapoff on every test device and reset dev_makeswap to -1.
+# Command output goes to err.log and is printed on failure.
+zram_swapoff()
+{
+	local idx=$dev_start
+
+	while [ $idx -le $dev_end ]; do
+		if ! swapoff /dev/zram$idx > err.log 2>&1; then
+			cat err.log
+			echo "FAIL swapoff /dev/zram$idx failed"
+		fi
+		idx=$(($idx + 1))
+	done
+	dev_makeswap=-1
+
+	echo "zram swapoff: OK"
+}
+
+# Create a filesystem on consecutive devices, one per item of
+# $zram_filesystems.  Falls back to ext2 when mkfs.<fs> is not found.
+# Command output goes to err.log and is printed on failure.
+zram_makefs()
+{
+	local idx=$dev_start
+
+	for fs in $zram_filesystems; do
+		# if requested fs not supported default it to ext2
+		if ! which mkfs.$fs > /dev/null 2>&1; then
+			fs=ext2
+		fi
+
+		echo "make $fs filesystem on /dev/zram$idx"
+		if ! mkfs.$fs /dev/zram$idx > err.log 2>&1; then
+			cat err.log
+			echo "FAIL failed to make $fs on /dev/zram$idx"
+		fi
+		idx=$(($idx + 1))
+		echo "zram mkfs.$fs: OK"
+	done
+}
+
+# Mount every test device on a freshly created ./zram<i> directory.
+# dev_mounted records the last device handled so that zram_cleanup can
+# umount it.
+zram_mount()
+{
+	local idx=$dev_start
+
+	while [ $idx -le $dev_end ]; do
+		echo "mount /dev/zram$idx"
+		mkdir zram$idx
+		if ! mount /dev/zram$idx zram$idx > /dev/null; then
+			echo "FAIL mount /dev/zram$idx failed"
+		fi
+		dev_mounted=$idx
+		idx=$(($idx + 1))
+	done
+
+	echo "zram mount of zram device(s): OK"
+}